From 632ecbc1c49bdc29f39e2ca2887fa3434eb49dc7 Mon Sep 17 00:00:00 2001 From: Luke Chen Date: Mon, 5 Dec 2022 15:46:38 +1100 Subject: Import wiredtiger: 043606c2f83e919f0f9de5c9c9cac7d771dc0af1 from branch mongodb-6.0 ref: 8e14a40cb6..043606c2f8 for: 6.0.4 WT-9805 Save the updates need to be deleted from history store and delete them later --- src/third_party/wiredtiger/import.data | 2 +- src/third_party/wiredtiger/src/btree/bt_cursor.c | 7 + src/third_party/wiredtiger/src/history/hs_rec.c | 289 +++++++++++---------- src/third_party/wiredtiger/src/include/extern.h | 2 + src/third_party/wiredtiger/src/include/reconcile.h | 23 +- .../wiredtiger/src/include/wt_internal.h | 2 + .../wiredtiger/src/reconcile/rec_visibility.c | 96 +++++-- .../wiredtiger/src/reconcile/rec_write.c | 10 + .../test/format/failure_configs/CONFIG.WT-9805 | 196 ++++++++++++++ .../wiredtiger/test/suite/test_prepare25.py | 2 +- .../wiredtiger/test/suite/test_prepare26.py | 134 ++++++++++ 11 files changed, 602 insertions(+), 161 deletions(-) create mode 100644 src/third_party/wiredtiger/test/format/failure_configs/CONFIG.WT-9805 create mode 100644 src/third_party/wiredtiger/test/suite/test_prepare26.py diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 66a7adfbc66..bf0a900a7fd 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-6.0", - "commit": "8e14a40cb6db5cd5d021c111b8e1abcaff7b79bc" + "commit": "043606c2f83e919f0f9de5c9c9cac7d771dc0af1" } diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index ec82ae02241..1ec2e8c1aa8 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -453,6 +453,9 @@ __wt_btcur_reset(WT_CURSOR_BTREE *cbt) WT_STAT_CONN_DATA_INCR(session, cursor_reset); F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + /* Initialize the update value as we are not pointing to any value. */ + cbt->upd_value->type = WT_UPDATE_INVALID; + WT_TIME_WINDOW_INIT(&cbt->upd_value->tw); return (__cursor_reset(cbt)); } @@ -1919,6 +1922,10 @@ __wt_btcur_open(WT_CURSOR_BTREE *cbt) cbt->modify_update = &cbt->_modify_update; cbt->upd_value = &cbt->_upd_value; + /* Initialize the value. */ + cbt->upd_value->type = WT_UPDATE_INVALID; + WT_TIME_WINDOW_INIT(&cbt->upd_value->tw); + #ifdef HAVE_DIAGNOSTIC cbt->lastkey = &cbt->_lastkey; cbt->lastrecno = WT_RECNO_OOB; diff --git a/src/third_party/wiredtiger/src/history/hs_rec.c b/src/third_party/wiredtiger/src/history/hs_rec.c index c50dd509d5a..c3b8f7aa81a 100644 --- a/src/third_party/wiredtiger/src/history/hs_rec.c +++ b/src/third_party/wiredtiger/src/history/hs_rec.c @@ -60,65 +60,6 @@ __hs_verbose_cache_stats(WT_SESSION_IMPL *session, WT_BTREE *btree) cache->hs_verb_gen_write = ckpt_gen_current; } -/* - * __hs_delete_record -- - * Delete the update left in the history store - */ -static int -__hs_delete_record(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_ITEM *key, - WT_UPDATE *delete_upd, WT_UPDATE *delete_tombstone) -{ - WT_DECL_RET; - bool hs_read_committed; -#ifdef HAVE_DIAGNOSTIC - WT_TIME_WINDOW *hs_tw; -#endif - - hs_read_committed = F_ISSET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED); - F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED); - - /* No need to delete from the history store if it is already obsolete. */ - if (delete_tombstone != NULL && __wt_txn_upd_visible_all(session, delete_tombstone)) { - ret = 0; - goto done; - } - - hs_cursor->set_key(hs_cursor, 4, S2BT(session)->id, key, WT_TS_MAX, UINT64_MAX); - WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_before(session, hs_cursor), true); - /* It's possible the value in the history store becomes obsolete concurrently. */ - if (ret == WT_NOTFOUND) { - WT_ASSERT( - session, delete_tombstone != NULL && __wt_txn_upd_visible_all(session, delete_tombstone)); - ret = 0; - goto done; - } - -#ifdef HAVE_DIAGNOSTIC - __wt_hs_upd_time_window(hs_cursor, &hs_tw); - WT_ASSERT(session, hs_tw->start_txn == WT_TXN_NONE || hs_tw->start_txn == delete_upd->txnid); - WT_ASSERT(session, hs_tw->start_ts == WT_TS_NONE || hs_tw->start_ts == delete_upd->start_ts); - WT_ASSERT(session, - hs_tw->durable_start_ts == WT_TS_NONE || hs_tw->durable_start_ts == delete_upd->durable_ts); - if (delete_tombstone != NULL) { - WT_ASSERT(session, hs_tw->stop_txn == delete_tombstone->txnid); - WT_ASSERT(session, hs_tw->stop_ts == delete_tombstone->start_ts); - WT_ASSERT(session, hs_tw->durable_stop_ts == delete_tombstone->durable_ts); - } else - WT_ASSERT(session, !WT_TIME_WINDOW_HAS_STOP(hs_tw)); -#endif - - WT_ERR(hs_cursor->remove(hs_cursor)); -done: - if (delete_tombstone != NULL) - F_CLR(delete_tombstone, WT_UPDATE_TO_DELETE_FROM_HS | WT_UPDATE_HS); - F_CLR(delete_upd, WT_UPDATE_TO_DELETE_FROM_HS | WT_UPDATE_HS); - -err: - if (!hs_read_committed) - F_CLR(hs_cursor, WT_CURSTD_HS_READ_COMMITTED); - return (ret); -} - /* * __hs_insert_record -- * A helper function to insert the record into the history store including stop time point. @@ -347,10 +288,46 @@ __hs_next_upd_full_value(WT_SESSION_IMPL *session, WT_UPDATE_VECTOR *updates, return (0); } +/* + * __hs_pack_key -- + * Pack the history store key + */ +static inline int +__hs_pack_key(WT_SESSION_IMPL *session, WT_BTREE *btree, WT_RECONCILE *r, WT_INSERT *ins, + WT_ROW *rip, WT_ITEM *key) +{ + WT_DECL_RET; + uint8_t *p; + + switch (r->page->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + p = key->mem; + WT_RET(__wt_vpack_uint(&p, 0, WT_INSERT_RECNO(ins))); + key->size = WT_PTRDIFF(p, key->data); + break; + case WT_PAGE_ROW_LEAF: + if (ins == NULL) { + WT_WITH_BTREE( + session, btree, ret = __wt_row_leaf_key(session, r->page, rip, key, false)); + WT_RET(ret); + } else { + key->data = WT_INSERT_KEY(ins); + key->size = WT_INSERT_KEY_SIZE(ins); + } + break; + default: + WT_RET(__wt_illegal_value(session, r->page->type)); + } + + return (ret); +} + /* * __wt_hs_insert_updates -- - * Copy one set of saved updates into the database's history store table. Whether the function - * fails or succeeds, if there is a successful write to history, cache_write_hs is set to true. + * Copy one set of saved updates into the database's history store table if they haven't been + * moved there. Whether the function fails or succeeds, if there is a successful write to + * history, cache_write_hs is set to true. */ int __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_MULTI *multi) @@ -372,14 +349,12 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_MULTI *mult WT_UPDATE_VECTOR out_of_order_ts_updates; WT_SAVE_UPD *list; WT_UPDATE *first_globally_visible_upd, *fix_ts_upd, *min_ts_upd, *out_of_order_ts_upd; - WT_UPDATE *delete_tombstone, *delete_upd, *newest_hs, *non_aborted_upd, *oldest_upd, *prev_upd, - *ref_upd, *tombstone, *upd; + WT_UPDATE *newest_hs, *non_aborted_upd, *oldest_upd, *prev_upd, *ref_upd, *tombstone, *upd; WT_TIME_WINDOW tw; wt_off_t hs_size; uint64_t insert_cnt, max_hs_size, modify_cnt; uint64_t cache_hs_insert_full_update, cache_hs_insert_reverse_modify, cache_hs_write_squash; uint32_t i; - uint8_t *p; int nentries; bool enable_reverse_modify, error_on_ooo_ts, hs_inserted, squashed; @@ -422,68 +397,6 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_MULTI *mult if (list->onpage_upd == NULL) continue; - /* History store table key component: source key. */ - switch (r->page->type) { - case WT_PAGE_COL_FIX: - case WT_PAGE_COL_VAR: - p = key->mem; - WT_ERR(__wt_vpack_uint(&p, 0, WT_INSERT_RECNO(list->ins))); - key->size = WT_PTRDIFF(p, key->data); - break; - case WT_PAGE_ROW_LEAF: - if (list->ins == NULL) { - WT_WITH_BTREE( - session, btree, ret = __wt_row_leaf_key(session, r->page, list->rip, key, false)); - WT_ERR(ret); - } else { - key->data = WT_INSERT_KEY(list->ins); - key->size = WT_INSERT_KEY_SIZE(list->ins); - } - break; - default: - WT_ERR(__wt_illegal_value(session, r->page->type)); - } - - newest_hs = first_globally_visible_upd = min_ts_upd = out_of_order_ts_upd = NULL; - ref_upd = list->onpage_upd; - delete_tombstone = delete_upd = NULL; - - __wt_update_vector_clear(&out_of_order_ts_updates); - __wt_update_vector_clear(&updates); - - /* - * Reverse deltas are only supported on 'S' and 'u' value formats. - */ - enable_reverse_modify = - (WT_STREQ(btree->value_format, "S") || WT_STREQ(btree->value_format, "u")); - - /* - * Delete the update that is both on the update chain and the history store from the history - * store. Otherwise, we will trigger out of order fix when the update is inserted to the - * history store again. - */ - for (upd = list->onpage_tombstone != NULL ? list->onpage_tombstone : list->onpage_upd; - upd != NULL; upd = upd->next) { - if (upd->txnid == WT_TXN_ABORTED) - continue; - - if (F_ISSET(upd, WT_UPDATE_TO_DELETE_FROM_HS)) { - /* - * If we want to remove an update from the history store in WiredTiger, it must be - * in history store. - */ - WT_ASSERT(session, F_ISSET(upd, WT_UPDATE_HS | WT_UPDATE_RESTORED_FROM_HS)); - if (upd->type == WT_UPDATE_TOMBSTONE) - delete_tombstone = upd; - else { - delete_upd = upd; - WT_ERR( - __hs_delete_record(session, hs_cursor, key, delete_upd, delete_tombstone)); - break; - } - } - } - /* Skip aborted updates. */ for (upd = list->onpage_upd->next; upd != NULL && upd->txnid == WT_TXN_ABORTED; upd = upd->next) @@ -497,6 +410,21 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_MULTI *mult if (F_ISSET(upd, WT_UPDATE_HS)) continue; + /* History store table key component: source key. */ + WT_ERR(__hs_pack_key(session, btree, r, list->ins, list->rip, key)); + + newest_hs = first_globally_visible_upd = min_ts_upd = out_of_order_ts_upd = NULL; + ref_upd = list->onpage_upd; + + __wt_update_vector_clear(&updates); + __wt_update_vector_clear(&out_of_order_ts_updates); + + /* + * Reverse deltas are only supported on 'S' and 'u' value formats. + */ + enable_reverse_modify = + (WT_STREQ(btree->value_format, "S") || WT_STREQ(btree->value_format, "u")); + /* * The algorithm assumes the oldest update on the update chain in memory is either a full * update or a tombstone. @@ -880,7 +808,7 @@ __wt_hs_delete_key_from_ts(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint3 { WT_DECL_RET; WT_ITEM hs_key; - wt_timestamp_t hs_ts; + wt_timestamp_t hs_start_ts; uint64_t hs_counter; uint32_t hs_btree_id; bool hs_read_all_flag; @@ -905,7 +833,7 @@ __wt_hs_delete_key_from_ts(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint3 ret = 0; goto done; } else { - WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_ts, &hs_counter)); + WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter)); ++hs_counter; } @@ -935,8 +863,8 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui WT_CURSOR_BTREE *hs_cbt; WT_DECL_RET; WT_ITEM hs_key, hs_value; - WT_TIME_WINDOW hs_insert_tw, tw, *twp; - wt_timestamp_t hs_ts; + WT_TIME_WINDOW hs_insert_tw, *twp; + wt_timestamp_t hs_durable_start_ts, hs_durable_stop_ts, hs_start_ts; uint64_t cache_hs_order_lose_durable_timestamp, cache_hs_order_reinsert, cache_hs_order_remove; uint64_t hs_counter, hs_upd_type; uint32_t hs_btree_id; @@ -1013,7 +941,7 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui continue; /* We shouldn't have crossed the btree and user key search space. */ - WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_ts, &hs_counter)); + WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter)); WT_ASSERT(session, hs_btree_id == btree_id); #ifdef HAVE_DIAGNOSTIC WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp)); @@ -1025,7 +953,7 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui * the cell. The cell's start timestamp can be cleared during reconciliation if it is * globally visible. */ - if (hs_ts >= ts || twp->stop_ts >= ts) + if (hs_start_ts >= ts || twp->stop_ts >= ts) break; } if (ret == WT_NOTFOUND) @@ -1092,7 +1020,7 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui */ for (; ret == 0; ret = hs_cursor->next(hs_cursor)) { /* We shouldn't have crossed the btree and user key search space. */ - WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_ts, &hs_counter)); + WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter)); WT_ASSERT(session, hs_btree_id == btree_id); #ifdef HAVE_DIAGNOSTIC WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp)); @@ -1110,7 +1038,7 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui * ignoring them. */ __wt_hs_upd_time_window(hs_cursor, &twp); - if (hs_ts < ts && twp->stop_ts < ts) + if (hs_start_ts < ts && twp->stop_ts < ts) continue; if (reinsert) { @@ -1164,10 +1092,10 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui /* Extract the underlying value for reinsertion. */ WT_ERR(hs_cursor->get_value( - hs_cursor, &tw.durable_stop_ts, &tw.durable_start_ts, &hs_upd_type, &hs_value)); + hs_cursor, &hs_durable_stop_ts, &hs_durable_start_ts, &hs_upd_type, &hs_value)); /* Reinsert the update with corrected timestamps. */ - if (no_ts_tombstone && hs_ts == ts) + if (no_ts_tombstone && hs_start_ts == ts) *counter = hs_counter; /* Insert the value back with different timestamps. */ @@ -1198,3 +1126,92 @@ err: return (ret); } + +/* + * __hs_delete_record -- + * Delete an update from the history store if it is not obsolete + */ +static int +__hs_delete_record( + WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_ITEM *key, WT_UPDATE *upd, WT_UPDATE *tombstone) +{ + WT_DECL_RET; + bool hs_read_committed; +#ifdef HAVE_DIAGNOSTIC + WT_TIME_WINDOW *hs_tw; +#endif + + if (r->hs_cursor == NULL) + WT_RET(__wt_curhs_open(session, NULL, &r->hs_cursor)); + hs_read_committed = F_ISSET(r->hs_cursor, WT_CURSTD_HS_READ_COMMITTED); + /* Ensure we can see all the content in the history store. */ + F_SET(r->hs_cursor, WT_CURSTD_HS_READ_COMMITTED); + + /* No need to delete from the history store if it is already obsolete. */ + if (tombstone != NULL && __wt_txn_upd_visible_all(session, tombstone)) + goto done; + + r->hs_cursor->set_key(r->hs_cursor, 4, S2BT(session)->id, key, WT_TS_MAX, UINT64_MAX); + WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_before(session, r->hs_cursor), true); + /* It's possible the value in the history store becomes obsolete concurrently. */ + if (ret == WT_NOTFOUND) { + WT_ASSERT(session, tombstone != NULL && __wt_txn_upd_visible_all(session, tombstone)); + ret = 0; + } else { +#ifdef HAVE_DIAGNOSTIC + __wt_hs_upd_time_window(r->hs_cursor, &hs_tw); + WT_ASSERT(session, hs_tw->start_txn == WT_TXN_NONE || hs_tw->start_txn == upd->txnid); + WT_ASSERT(session, hs_tw->start_ts == WT_TS_NONE || hs_tw->start_ts == upd->start_ts); + WT_ASSERT(session, + hs_tw->durable_start_ts == WT_TS_NONE || hs_tw->durable_start_ts == upd->durable_ts); + if (tombstone != NULL) { + WT_ASSERT(session, hs_tw->stop_txn == tombstone->txnid); + WT_ASSERT(session, hs_tw->stop_ts == tombstone->start_ts); + WT_ASSERT(session, hs_tw->durable_stop_ts == tombstone->durable_ts); + } else + WT_ASSERT(session, !WT_TIME_WINDOW_HAS_STOP(hs_tw)); +#endif + + WT_ERR(r->hs_cursor->remove(r->hs_cursor)); + } +done: + if (tombstone != NULL) + F_CLR(tombstone, WT_UPDATE_TO_DELETE_FROM_HS | WT_UPDATE_HS); + F_CLR(upd, WT_UPDATE_TO_DELETE_FROM_HS | WT_UPDATE_HS); + +err: + if (!hs_read_committed) + F_CLR(r->hs_cursor, WT_CURSTD_HS_READ_COMMITTED); + return (ret); +} + +/* + * __wt_hs_delete_updates -- + * Delete the updates from the history store + */ +int +__wt_hs_delete_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r) +{ + WT_BTREE *btree; + WT_DECL_ITEM(key); + WT_DECL_RET; + WT_DELETE_HS_UPD *delete_hs_upd; + uint32_t i; + + /* Nothing to delete from the history store. */ + if (r->delete_hs_upd == NULL) + return (0); + + btree = S2BT(session); + + WT_RET(__wt_scr_alloc(session, WT_INTPACK64_MAXSIZE, &key)); + + for (delete_hs_upd = r->delete_hs_upd, i = 0; i < r->delete_hs_upd_next; ++delete_hs_upd, ++i) { + WT_ERR(__hs_pack_key(session, btree, r, delete_hs_upd->ins, delete_hs_upd->rip, key)); + WT_ERR(__hs_delete_record(session, r, key, delete_hs_upd->upd, delete_hs_upd->tombstone)); + } + +err: + __wt_scr_free(session, &key); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 7ebf7ba83e1..cdb6288ef17 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -776,6 +776,8 @@ extern int __wt_hs_config(WT_SESSION_IMPL *session, const char **cfg) extern int __wt_hs_delete_key_from_ts(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint32_t btree_id, const WT_ITEM *key, wt_timestamp_t ts, bool reinsert, bool ooo_tombstone, bool error_on_ooo_ts) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_hs_delete_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_hs_find_upd(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key, const char *value_format, uint64_t recno, WT_UPDATE_VALUE *upd_value, WT_ITEM *base_value_buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); diff --git a/src/third_party/wiredtiger/src/include/reconcile.h b/src/third_party/wiredtiger/src/include/reconcile.h index 024afebde5e..84cac92f426 100644 --- a/src/third_party/wiredtiger/src/include/reconcile.h +++ b/src/third_party/wiredtiger/src/include/reconcile.h @@ -64,6 +64,17 @@ struct __wt_rec_chunk { uint32_t auxentries; }; +/* + * WT_DELETE_HS_UPD -- + * Update that needs to be deleted from the history store. + */ +struct __wt_delete_hs_upd { + WT_INSERT *ins; /* Insert list reference */ + WT_ROW *rip; /* Original on-page reference */ + WT_UPDATE *upd; + WT_UPDATE *tombstone; +}; + /* * Reconciliation is the process of taking an in-memory page, walking each entry * in the page, building a backing disk image in a temporary buffer representing @@ -227,6 +238,15 @@ struct __wt_reconcile { size_t supd_allocated; size_t supd_memsize; /* Size of saved update structures */ + /* + * List of updates to be deleted from the history store. While reviewing updates for each page, + * we save the updates that needs to be deleted from history store here, and then delete them + * after we have built the disk image. + */ + WT_DELETE_HS_UPD *delete_hs_upd; /* Updates to delete from history store */ + uint32_t delete_hs_upd_next; + size_t delete_hs_upd_allocated; + /* List of pages we've written so far. */ WT_MULTI *multi; uint32_t multi_next; @@ -305,7 +325,8 @@ struct __wt_reconcile { }; typedef struct { - WT_UPDATE *upd; /* Update to write (or NULL) */ + WT_UPDATE *upd; /* Update to write (or NULL) */ + WT_UPDATE *tombstone; /* The tombstone to write (or NULL) */ WT_TIME_WINDOW tw; diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h index 1fa544ab841..000962fe8ab 100644 --- a/src/third_party/wiredtiger/src/include/wt_internal.h +++ b/src/third_party/wiredtiger/src/include/wt_internal.h @@ -185,6 +185,8 @@ struct __wt_data_handle; typedef struct __wt_data_handle WT_DATA_HANDLE; struct __wt_data_handle_cache; typedef struct __wt_data_handle_cache WT_DATA_HANDLE_CACHE; +struct __wt_delete_hs_upd; +typedef struct __wt_delete_hs_upd WT_DELETE_HS_UPD; struct __wt_dlh; typedef struct __wt_dlh WT_DLH; struct __wt_dsrc_stats; diff --git a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c index db9fa4f9798..dd76dd4cd6e 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c @@ -39,6 +39,27 @@ __rec_update_save(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ return (0); } +/* + * __rec_delete_hs_upd_save -- + * Save an update into a WT_DELETE_HS_UPD list to delete it from the history store later. + */ +static inline int +__rec_delete_hs_upd_save(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip, + WT_UPDATE *upd, WT_UPDATE *tombstone) +{ + WT_DELETE_HS_UPD *delete_hs_upd; + + WT_RET(__wt_realloc_def( + session, &r->delete_hs_upd_allocated, r->delete_hs_upd_next + 1, &r->delete_hs_upd)); + delete_hs_upd = &r->delete_hs_upd[r->delete_hs_upd_next]; + delete_hs_upd->ins = ins; + delete_hs_upd->rip = rip; + delete_hs_upd->upd = upd; + delete_hs_upd->tombstone = tombstone; + ++r->delete_hs_upd_next; + return (0); +} + /* * __rec_append_orig_value -- * Append the key's original value to its update list. It assumes that we have an onpage value, @@ -181,6 +202,45 @@ err: return (ret); } +/* + * __rec_find_and_save_delete_hs_upd -- + * Find and save the update that needs to be deleted from the history store later + */ +static int +__rec_find_and_save_delete_hs_upd(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, + WT_ROW *rip, WT_UPDATE_SELECT *upd_select) +{ + WT_UPDATE *delete_tombstone, *delete_upd; + + delete_tombstone = NULL; + + for (delete_upd = upd_select->tombstone != NULL ? upd_select->tombstone : upd_select->upd; + delete_upd != NULL; delete_upd = delete_upd->next) { + if (delete_upd->txnid == WT_TXN_ABORTED) + continue; + + if (F_ISSET(delete_upd, WT_UPDATE_TO_DELETE_FROM_HS)) { + /* + * If we want to remove an update from the history store in WiredTiger, it must be in + * history store. + */ + WT_ASSERT(session, F_ISSET(delete_upd, WT_UPDATE_HS | WT_UPDATE_RESTORED_FROM_HS)); + if (delete_upd->type == WT_UPDATE_TOMBSTONE) + delete_tombstone = delete_upd; + else { + WT_RET( + __rec_delete_hs_upd_save(session, r, ins, rip, delete_upd, delete_tombstone)); + break; + } + } + } + + /* If we delete a tombstone from the history store, we must also delete the update. */ + WT_ASSERT(session, delete_tombstone == NULL || delete_upd != NULL); + + return (0); +} + /* * __rec_need_save_upd -- * Return if we need to save the update chain @@ -189,8 +249,6 @@ static inline bool __rec_need_save_upd( WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_UPDATE_SELECT *upd_select, bool has_newer_updates) { - WT_UPDATE *upd; - if (upd_select->tw.prepare) return (true); @@ -201,15 +259,6 @@ __rec_need_save_upd( if (upd_select->upd != NULL && upd_select->upd->type == WT_UPDATE_TOMBSTONE) return (false); - /* Save the update chain to delete the update from the history store later. */ - for (upd = upd_select->upd; upd != NULL; upd = upd->next) { - if (upd->txnid == WT_TXN_ABORTED) - continue; - - if (F_ISSET(upd, WT_UPDATE_TO_DELETE_FROM_HS)) - return (true); - } - /* * Don't save updates for any reconciliation that doesn't involve history store (in-memory * database, metadata, and history store reconciliation itself), except when the selected stop @@ -294,15 +343,10 @@ __rec_validate_upd_chain(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_UPDATE *s if (!F_ISSET(r, WT_REC_CHECKPOINT_RUNNING)) return (0); - for (upd = select_upd; upd != NULL; upd = upd->next) { - if (upd->txnid == WT_TXN_ABORTED) - continue; - - /* Cannot delete the update from history store when checkpoint is running. */ - if (F_ISSET(upd, WT_UPDATE_TO_DELETE_FROM_HS)) { - WT_STAT_CONN_DATA_INCR(session, cache_eviction_blocked_remove_hs_race_with_checkpoint); - return (EBUSY); - } + /* Cannot delete the update from history store when checkpoint is running. */ + if (r->delete_hs_upd_next > 0) { + WT_STAT_CONN_DATA_INCR(session, cache_eviction_blocked_remove_hs_race_with_checkpoint); + return (EBUSY); } /* @@ -415,6 +459,7 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, W * both must be initialized. */ upd_select->upd = NULL; + upd_select->tombstone = NULL; upd_select->upd_saved = false; upd_select->ooo_tombstone = false; select_tw = &upd_select->tw; @@ -608,7 +653,7 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, W */ if (upd->type == WT_UPDATE_TOMBSTONE) { WT_TIME_WINDOW_SET_STOP(select_tw, upd); - tombstone = upd; + tombstone = upd_select->tombstone = upd; /* Find the update this tombstone applies to. */ if (!__wt_txn_upd_visible_all(session, upd)) { @@ -711,6 +756,13 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, W NULL : upd_select->upd; + /* + * If we have done a prepared rollback, we may have restored a history store value to the update + * chain but the same value is left in the history store. Save it to delete it from the history + * store later. + */ + WT_RET(__rec_find_and_save_delete_hs_upd(session, r, ins, rip, upd_select)); + /* Check the update chain for conditions that could prevent it's eviction. */ WT_RET(__rec_validate_upd_chain(session, r, onpage_upd, select_tw, vpack)); @@ -752,7 +804,7 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, W /* Catch this case in diagnostic builds. */ WT_STAT_CONN_DATA_INCR(session, cache_eviction_blocked_ooo_checkpoint_race_3); WT_ASSERT(session, false); - WT_RET(EBUSY); + return (EBUSY); } /* diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 88a156d3b1f..35bbfb4843b 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -619,6 +619,9 @@ __rec_init(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags, WT_SALVAGE_COO r->supd_next = 0; r->supd_memsize = 0; + /* The list of updates to be deleted from the history store. */ + r->delete_hs_upd_next = 0; + /* The list of pages we've written. */ r->multi = NULL; r->multi_next = 0; @@ -771,6 +774,7 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep) __wt_buf_free(session, &r->chunk_B.image); __wt_free(session, r->supd); + __wt_free(session, r->delete_hs_upd); __wt_rec_dictionary_free(session, r); @@ -2620,6 +2624,12 @@ __rec_hs_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r) /* Flag as unused for non diagnostic builds. */ WT_UNUSED(btree); + /* + * Delete the updates left in the history store by prepared rollback first before moving updates + * to the history store. + */ + WT_ERR(__wt_hs_delete_updates(session, r)); + /* Check if there's work to do. */ for (multi = r->multi, i = 0; i < r->multi_next; ++multi, ++i) if (multi->supd != NULL) diff --git a/src/third_party/wiredtiger/test/format/failure_configs/CONFIG.WT-9805 b/src/third_party/wiredtiger/test/format/failure_configs/CONFIG.WT-9805 new file mode 100644 index 00000000000..34fd526585b --- /dev/null +++ b/src/third_party/wiredtiger/test/format/failure_configs/CONFIG.WT-9805 @@ -0,0 +1,196 @@ +############################################ +# RUN PARAMETERS: V3 +############################################ +assert.read_timestamp=0 +backup=0 +backup.incremental=off +backup.incr_granularity=11691 +block_cache=0 +block_cache.cache_on_checkpoint=0 +block_cache.cache_on_writes=1 +block_cache.size=62 +btree.huffman_value=0 +cache=2178 +cache.evict_max=2 +cache.minimum=20 +checkpoint=on +checkpoint.log_size=200 +checkpoint.wait=31 +disk.data_extend=0 +disk.direct_io=0 +disk.encryption=rotn-7 +disk.mmap=1 +disk.mmap_all=0 +format.abort=0 +format.independent_thread_rng=1 +format.major_timeout=0 +import=0 +logging=0 +logging.compression=none +logging.file_max=64891 +logging.prealloc=0 +logging.remove=1 +ops.alter=0 +ops.compaction=0 +ops.hs_cursor=1 +ops.prepare=1 +ops.random_cursor=0 +ops.salvage=0 +ops.verify=1 +quiet=1 +runs.in_memory=0 +runs.ops=0 +runs.rows=1000000 +runs.tables=5 +runs.threads=32 +runs.timer=26 +runs.verify_failure_dump=0 +statistics=0 +statistics.server=0 +stress.aggressive_sweep=0 +stress.checkpoint=0 +stress.checkpoint_evict_page=0 +stress.checkpoint_reserved_txnid_delay=0 +stress.checkpoint_prepare=0 +stress.evict_reposition=0 +stress.failpoint_eviction_fail_after_reconciliation=1 +stress.failpoint_hs_delete_key_from_ts=0 +stress.hs_checkpoint_delay=0 +stress.hs_search=0 +stress.hs_sweep=0 +stress.split_1=0 +stress.split_2=0 +stress.split_3=0 +stress.split_4=0 +stress.split_5=0 +stress.split_6=0 +stress.split_7=0 +transaction.implicit=0 +transaction.timestamps=1 +wiredtiger.config=off +wiredtiger.rwlock=1 +wiredtiger.leak_memory=0 +############################################ +# TABLE PARAMETERS: table 1 +############################################ +table1.btree.compression=snappy +table1.btree.dictionary=0 +table1.btree.internal_key_truncation=1 +table1.btree.internal_page_max=9 +table1.btree.leaf_page_max=10 +table1.btree.memory_page_max=1 +table1.btree.repeat_data_pct=47 +table1.btree.split_pct=77 +table1.btree.value_max=3212 +table1.btree.value_min=0 +table1.disk.checksum=unencrypted +table1.disk.firstfit=0 +table1.ops.pct.delete=55 +table1.ops.pct.insert=42 +table1.ops.pct.modify=3 +table1.ops.pct.read=0 +table1.ops.pct.write=0 +table1.ops.truncate=1 +table1.runs.mirror=0 +table1.runs.source=table +table1.runs.type=variable-length column-store +############################################ +# TABLE PARAMETERS: table 2 +############################################ +table2.btree.compression=zstd +table2.btree.dictionary=0 +table2.btree.internal_key_truncation=1 +table2.btree.internal_page_max=13 +table2.btree.key_max=46 +table2.btree.key_min=25 +table2.btree.leaf_page_max=14 +table2.btree.memory_page_max=6 +table2.btree.prefix_len=0 +table2.btree.prefix_compression=1 +table2.btree.prefix_compression_min=8 +table2.btree.reverse=0 +table2.btree.split_pct=69 +table2.btree.value_max=3706 +table2.btree.value_min=13 +table2.disk.checksum=off +table2.disk.firstfit=0 +table2.ops.pct.delete=72 +table2.ops.pct.insert=0 +table2.ops.pct.modify=5 +table2.ops.pct.read=1 +table2.ops.pct.write=22 +table2.ops.truncate=1 +table2.runs.mirror=0 +table2.runs.source=table +table2.runs.type=row-store +############################################ +# TABLE PARAMETERS: table 3 +############################################ +table3.btree.bitcnt=3 +table3.btree.compression=zlib +table3.btree.internal_key_truncation=1 +table3.btree.internal_page_max=17 +table3.btree.leaf_page_max=11 +table3.btree.memory_page_max=10 +table3.btree.split_pct=81 +table3.disk.checksum=on +table3.disk.firstfit=0 +table3.ops.pct.delete=9 +table3.ops.pct.insert=44 +table3.ops.pct.modify=14 +table3.ops.pct.read=32 +table3.ops.pct.write=1 +table3.ops.truncate=1 +table3.runs.mirror=0 +table3.runs.source=table +table3.runs.type=fixed-length column-store +############################################ +# TABLE PARAMETERS: table 4 +############################################ +table4.btree.compression=zlib +table4.btree.dictionary=1 +table4.btree.internal_key_truncation=1 +table4.btree.internal_page_max=14 +table4.btree.key_max=30 +table4.btree.key_min=27 +table4.btree.leaf_page_max=14 +table4.btree.memory_page_max=9 +table4.btree.prefix_len=67 +table4.btree.prefix_compression=1 +table4.btree.prefix_compression_min=5 +table4.btree.reverse=0 +table4.btree.split_pct=88 +table4.btree.value_max=120 +table4.btree.value_min=20 +table4.disk.checksum=on +table4.disk.firstfit=0 +table4.ops.pct.delete=4 +table4.ops.pct.insert=41 +table4.ops.pct.modify=0 +table4.ops.pct.read=2 +table4.ops.pct.write=53 +table4.ops.truncate=1 +table4.runs.mirror=0 +table4.runs.source=file +table4.runs.type=row-store +############################################ +# TABLE PARAMETERS: table 5 +############################################ +table5.btree.bitcnt=2 +table5.btree.compression=snappy +table5.btree.internal_key_truncation=1 +table5.btree.internal_page_max=11 +table5.btree.leaf_page_max=14 +table5.btree.memory_page_max=7 +table5.btree.split_pct=97 +table5.disk.checksum=on +table5.disk.firstfit=0 +table5.ops.pct.delete=19 +table5.ops.pct.insert=2 +table5.ops.pct.modify=71 +table5.ops.pct.read=7 +table5.ops.pct.write=1 +table5.ops.truncate=1 +table5.runs.mirror=0 +table5.runs.source=table +table5.runs.type=fixed-length column-store diff --git a/src/third_party/wiredtiger/test/suite/test_prepare25.py b/src/third_party/wiredtiger/test/suite/test_prepare25.py index 95fce47181d..e512c3c7fef 100644 --- a/src/third_party/wiredtiger/test/suite/test_prepare25.py +++ b/src/third_party/wiredtiger/test/suite/test_prepare25.py @@ -31,7 +31,7 @@ from wtscenario import make_scenarios # test_prepare25.py # Test prepare rollback and then prepare commit with failed eviction. -class test_prepare23(wttest.WiredTigerTestCase): +class test_prepare25(wttest.WiredTigerTestCase): conn_config = 'timing_stress_for_test=[failpoint_eviction_fail_after_reconciliation]' format_values = [ diff --git a/src/third_party/wiredtiger/test/suite/test_prepare26.py b/src/third_party/wiredtiger/test/suite/test_prepare26.py new file mode 100644 index 00000000000..52e08cedff6 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_prepare26.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python +# +# Public Domain 2014-present MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wiredtiger, wttest +from wtscenario import make_scenarios + +# test_prepare26.py +# Test prepare rollback and then delete the key. +class test_prepare26(wttest.WiredTigerTestCase): + format_values = [ + ('column', dict(key_format='r', value_format='S')), + ('column_fix', dict(key_format='r', value_format='8t')), + ('row_integer', dict(key_format='i', value_format='S')), + ] + + scenarios = make_scenarios(format_values) + + def test_prepare26(self): + uri = "table:test_prepare26" + self.session.create(uri, 'key_format=' + self.key_format + ',value_format=' + self.value_format) + + if self.value_format == '8t': + value_a = 97 + value_b = 98 + value_c = 99 + else: + value_a = "a" + value_b = "b" + value_c = "c" + + # Pin oldest timestamp to 1 + self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(1)) + + # Insert a value + cursor = self.session.open_cursor(uri) + self.session.begin_transaction() + cursor[1] = value_a + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(10)) + + # Do a prepared update + self.session.begin_transaction() + cursor[1] = value_c + self.session.prepare_transaction('prepare_timestamp=' + self.timestamp_str(20)) + + # Evict the page + session2 = self.conn.open_session() + evict_cursor = session2.open_cursor(uri, None, 'debug=(release_evict)') + session2.begin_transaction('ignore_prepare=true,read_timestamp=' + self.timestamp_str(10)) + self.assertEquals(evict_cursor[1], value_a) + evict_cursor.reset() + evict_cursor.close() + session2.rollback_transaction() + + # Rollback the prepared transaction + self.session.rollback_transaction() + + # Delete the key + self.session.begin_transaction() + cursor.set_key(1) + cursor.remove() + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(30)) + + # Set oldest timestamp to 30 + self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(30)) + + # Evict the page again + evict_cursor = session2.open_cursor(uri, None, 'debug=(release_evict)') + session2.begin_transaction() + evict_cursor.set_key(1) + if self.value_format == '8t': + self.assertEquals(evict_cursor[1], 0) + else: + evict_cursor.set_key(1) + self.assertEquals(evict_cursor.search(), wiredtiger.WT_NOTFOUND) + evict_cursor.reset() + evict_cursor.close() + session2.rollback_transaction() + + # Do another update + cursor = self.session.open_cursor(uri) + self.session.begin_transaction() + cursor[1] = value_b + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(40)) + + # Do another update + cursor = self.session.open_cursor(uri) + self.session.begin_transaction() + cursor[1] = value_c + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(50)) + + # Evict the page again + evict_cursor = session2.open_cursor(uri, None, 'debug=(release_evict)') + session2.begin_transaction('read_timestamp=' + self.timestamp_str(50)) + self.assertEquals(evict_cursor[1], value_c) + evict_cursor.reset() + evict_cursor.close() + session2.rollback_transaction() + + # Verify we read nothing at the oldest + self.session.begin_transaction('read_timestamp=' + self.timestamp_str(30)) + if self.value_format == '8t': + self.assertEquals(cursor[1], 0) + else: + cursor.set_key(1) + self.assertEquals(cursor.search(), wiredtiger.WT_NOTFOUND) + self.session.rollback_transaction() + +if __name__ == '__main__': + wttest.run() -- cgit v1.2.1