From a98ea6e3be4a0c0aa93f691bbc5b4ae26f3dfafa Mon Sep 17 00:00:00 2001 From: Luke Chen Date: Mon, 5 Dec 2022 16:26:09 +1100 Subject: Import wiredtiger: adba8ce2917cd4d131237b7ccf4fc89199e26480 from branch mongodb-5.0 ref: f1fa61904b..adba8ce291 for: 5.0.15 WT-9805 Save the updates need to be deleted from history store and delete them later --- src/third_party/wiredtiger/import.data | 2 +- src/third_party/wiredtiger/src/btree/bt_cursor.c | 7 + src/third_party/wiredtiger/src/history/hs_rec.c | 289 +++++++++++---------- src/third_party/wiredtiger/src/include/extern.h | 2 + src/third_party/wiredtiger/src/include/reconcile.h | 23 +- .../wiredtiger/src/include/wt_internal.h | 2 + .../wiredtiger/src/reconcile/rec_visibility.c | 94 +++++-- .../wiredtiger/src/reconcile/rec_write.c | 10 + .../wiredtiger/test/suite/test_prepare25.py | 2 +- .../wiredtiger/test/suite/test_prepare26.py | 134 ++++++++++ 10 files changed, 405 insertions(+), 160 deletions(-) create mode 100644 src/third_party/wiredtiger/test/suite/test_prepare26.py diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 8f4375f0d27..7f18ae7d4ff 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-5.0", - "commit": "f1fa61904bdec4a931daa49452cf3b92a3bca37a" + "commit": "adba8ce2917cd4d131237b7ccf4fc89199e26480" } diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index 013ccb16845..ecaf3bc74a7 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -454,6 +454,9 @@ __wt_btcur_reset(WT_CURSOR_BTREE *cbt) WT_STAT_CONN_DATA_INCR(session, cursor_reset); F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET); + /* Initialize the update value as we are not pointing to any value. */ + cbt->upd_value->type = WT_UPDATE_INVALID; + WT_TIME_WINDOW_INIT(&cbt->upd_value->tw); return (__cursor_reset(cbt)); } @@ -1886,6 +1889,10 @@ __wt_btcur_open(WT_CURSOR_BTREE *cbt) cbt->modify_update = &cbt->_modify_update; cbt->upd_value = &cbt->_upd_value; + /* Initialize the value. */ + cbt->upd_value->type = WT_UPDATE_INVALID; + WT_TIME_WINDOW_INIT(&cbt->upd_value->tw); + #ifdef HAVE_DIAGNOSTIC cbt->lastkey = &cbt->_lastkey; cbt->lastrecno = WT_RECNO_OOB; diff --git a/src/third_party/wiredtiger/src/history/hs_rec.c b/src/third_party/wiredtiger/src/history/hs_rec.c index 01d3d40a7c1..a1fbd7c3014 100644 --- a/src/third_party/wiredtiger/src/history/hs_rec.c +++ b/src/third_party/wiredtiger/src/history/hs_rec.c @@ -60,65 +60,6 @@ __hs_verbose_cache_stats(WT_SESSION_IMPL *session, WT_BTREE *btree) cache->hs_verb_gen_write = ckpt_gen_current; } -/* - * __hs_delete_record -- - * Delete the update left in the history store - */ -static int -__hs_delete_record(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, WT_ITEM *key, - WT_UPDATE *delete_upd, WT_UPDATE *delete_tombstone) -{ - WT_DECL_RET; - bool hs_read_committed; -#ifdef HAVE_DIAGNOSTIC - WT_TIME_WINDOW *hs_tw; -#endif - - hs_read_committed = F_ISSET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED); - F_SET(hs_cursor, WT_CURSTD_HS_READ_COMMITTED); - - /* No need to delete from the history store if it is already obsolete. */ - if (delete_tombstone != NULL && __wt_txn_upd_visible_all(session, delete_tombstone)) { - ret = 0; - goto done; - } - - hs_cursor->set_key(hs_cursor, 4, S2BT(session)->id, key, WT_TS_MAX, UINT64_MAX); - WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_before(session, hs_cursor), true); - /* It's possible the value in the history store becomes obsolete concurrently. */ - if (ret == WT_NOTFOUND) { - WT_ASSERT( - session, delete_tombstone != NULL && __wt_txn_upd_visible_all(session, delete_tombstone)); - ret = 0; - goto done; - } - -#ifdef HAVE_DIAGNOSTIC - __wt_hs_upd_time_window(hs_cursor, &hs_tw); - WT_ASSERT(session, hs_tw->start_txn == WT_TXN_NONE || hs_tw->start_txn == delete_upd->txnid); - WT_ASSERT(session, hs_tw->start_ts == WT_TS_NONE || hs_tw->start_ts == delete_upd->start_ts); - WT_ASSERT(session, - hs_tw->durable_start_ts == WT_TS_NONE || hs_tw->durable_start_ts == delete_upd->durable_ts); - if (delete_tombstone != NULL) { - WT_ASSERT(session, hs_tw->stop_txn == delete_tombstone->txnid); - WT_ASSERT(session, hs_tw->stop_ts == delete_tombstone->start_ts); - WT_ASSERT(session, hs_tw->durable_stop_ts == delete_tombstone->durable_ts); - } else - WT_ASSERT(session, !WT_TIME_WINDOW_HAS_STOP(hs_tw)); -#endif - - WT_ERR(hs_cursor->remove(hs_cursor)); -done: - if (delete_tombstone != NULL) - F_CLR(delete_tombstone, WT_UPDATE_TO_DELETE_FROM_HS | WT_UPDATE_HS); - F_CLR(delete_upd, WT_UPDATE_TO_DELETE_FROM_HS | WT_UPDATE_HS); - -err: - if (!hs_read_committed) - F_CLR(hs_cursor, WT_CURSTD_HS_READ_COMMITTED); - return (ret); -} - /* * __hs_insert_record -- * A helper function to insert the record into the history store including stop time point. @@ -348,10 +289,46 @@ __hs_next_upd_full_value(WT_SESSION_IMPL *session, WT_UPDATE_VECTOR *updates, return (0); } +/* + * __hs_pack_key -- + * Pack the history store key + */ +static inline int +__hs_pack_key(WT_SESSION_IMPL *session, WT_BTREE *btree, WT_RECONCILE *r, WT_INSERT *ins, + WT_ROW *rip, WT_ITEM *key) +{ + WT_DECL_RET; + uint8_t *p; + + switch (r->page->type) { + case WT_PAGE_COL_FIX: + case WT_PAGE_COL_VAR: + p = key->mem; + WT_RET(__wt_vpack_uint(&p, 0, WT_INSERT_RECNO(ins))); + key->size = WT_PTRDIFF(p, key->data); + break; + case WT_PAGE_ROW_LEAF: + if (ins == NULL) { + WT_WITH_BTREE( + session, btree, ret = __wt_row_leaf_key(session, r->page, rip, key, false)); + WT_RET(ret); + } else { + key->data = WT_INSERT_KEY(ins); + key->size = WT_INSERT_KEY_SIZE(ins); + } + break; + default: + WT_RET(__wt_illegal_value(session, r->page->type)); + } + + return (ret); +} + /* * __wt_hs_insert_updates -- - * Copy one set of saved updates into the database's history store table. Whether the function - * fails or succeeds, if there is a successful write to history, cache_write_hs is set to true. + * Copy one set of saved updates into the database's history store table if they haven't been + * moved there. Whether the function fails or succeeds, if there is a successful write to + * history, cache_write_hs is set to true. */ int __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_MULTI *multi) @@ -373,13 +350,11 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_MULTI *mult WT_UPDATE_VECTOR out_of_order_ts_updates; WT_SAVE_UPD *list; WT_UPDATE *first_globally_visible_upd, *fix_ts_upd, *min_ts_upd, *out_of_order_ts_upd; - WT_UPDATE *delete_tombstone, *delete_upd, *newest_hs, *non_aborted_upd, *oldest_upd, *prev_upd, - *ref_upd, *tombstone, *upd; + WT_UPDATE *newest_hs, *non_aborted_upd, *oldest_upd, *prev_upd, *ref_upd, *tombstone, *upd; WT_TIME_WINDOW tw; wt_off_t hs_size; uint64_t insert_cnt, max_hs_size, modify_cnt; uint32_t i; - uint8_t *p; int nentries; bool enable_reverse_modify, error_on_ooo_ts, hs_inserted, squashed; @@ -421,68 +396,6 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_MULTI *mult if (list->onpage_upd == NULL) continue; - /* History store table key component: source key. */ - switch (r->page->type) { - case WT_PAGE_COL_FIX: - case WT_PAGE_COL_VAR: - p = key->mem; - WT_ERR(__wt_vpack_uint(&p, 0, WT_INSERT_RECNO(list->ins))); - key->size = WT_PTRDIFF(p, key->data); - break; - case WT_PAGE_ROW_LEAF: - if (list->ins == NULL) { - WT_WITH_BTREE( - session, btree, ret = __wt_row_leaf_key(session, r->page, list->rip, key, false)); - WT_ERR(ret); - } else { - key->data = WT_INSERT_KEY(list->ins); - key->size = WT_INSERT_KEY_SIZE(list->ins); - } - break; - default: - WT_ERR(__wt_illegal_value(session, r->page->type)); - } - - newest_hs = first_globally_visible_upd = min_ts_upd = out_of_order_ts_upd = NULL; - ref_upd = list->onpage_upd; - delete_tombstone = delete_upd = NULL; - - __wt_update_vector_clear(&out_of_order_ts_updates); - __wt_update_vector_clear(&updates); - - /* - * Reverse deltas are only supported on 'S' and 'u' value formats. - */ - enable_reverse_modify = - (WT_STREQ(btree->value_format, "S") || WT_STREQ(btree->value_format, "u")); - - /* - * Delete the update that is both on the update chain and the history store from the history - * store. Otherwise, we will trigger out of order fix when the update is inserted to the - * history store again. - */ - for (upd = list->onpage_tombstone != NULL ? list->onpage_tombstone : list->onpage_upd; - upd != NULL; upd = upd->next) { - if (upd->txnid == WT_TXN_ABORTED) - continue; - - if (F_ISSET(upd, WT_UPDATE_TO_DELETE_FROM_HS)) { - /* - * If we want to remove an update from the history store in WiredTiger, it must be - * in history store. - */ - WT_ASSERT(session, F_ISSET(upd, WT_UPDATE_HS | WT_UPDATE_RESTORED_FROM_HS)); - if (upd->type == WT_UPDATE_TOMBSTONE) - delete_tombstone = upd; - else { - delete_upd = upd; - WT_ERR( - __hs_delete_record(session, hs_cursor, key, delete_upd, delete_tombstone)); - break; - } - } - } - /* Skip aborted updates. */ for (upd = list->onpage_upd->next; upd != NULL && upd->txnid == WT_TXN_ABORTED; upd = upd->next) @@ -496,6 +409,21 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_MULTI *mult if (F_ISSET(upd, WT_UPDATE_HS)) continue; + /* History store table key component: source key. */ + WT_ERR(__hs_pack_key(session, btree, r, list->ins, list->rip, key)); + + newest_hs = first_globally_visible_upd = min_ts_upd = out_of_order_ts_upd = NULL; + ref_upd = list->onpage_upd; + + __wt_update_vector_clear(&updates); + __wt_update_vector_clear(&out_of_order_ts_updates); + + /* + * Reverse deltas are only supported on 'S' and 'u' value formats. + */ + enable_reverse_modify = + (WT_STREQ(btree->value_format, "S") || WT_STREQ(btree->value_format, "u")); + /* * The algorithm assumes the oldest update on the update chain in memory is either a full * update or a tombstone. @@ -885,7 +813,7 @@ __wt_hs_delete_key_from_ts(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint3 { WT_DECL_RET; WT_ITEM hs_key; - wt_timestamp_t hs_ts; + wt_timestamp_t hs_start_ts; uint64_t hs_counter; uint32_t hs_btree_id; bool hs_read_all_flag; @@ -910,7 +838,7 @@ __wt_hs_delete_key_from_ts(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint3 ret = 0; goto done; } else { - WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_ts, &hs_counter)); + WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter)); ++hs_counter; } @@ -940,8 +868,8 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui WT_CURSOR_BTREE *hs_cbt; WT_DECL_RET; WT_ITEM hs_key, hs_value; - WT_TIME_WINDOW hs_insert_tw, tw, *twp; - wt_timestamp_t hs_ts; + WT_TIME_WINDOW hs_insert_tw, *twp; + wt_timestamp_t hs_durable_start_ts, hs_durable_stop_ts, hs_start_ts; uint64_t hs_counter, hs_upd_type; uint32_t hs_btree_id; #ifdef HAVE_DIAGNOSTIC @@ -1016,7 +944,7 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui continue; /* We shouldn't have crossed the btree and user key search space. */ - WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_ts, &hs_counter)); + WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter)); WT_ASSERT(session, hs_btree_id == btree_id); #ifdef HAVE_DIAGNOSTIC WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp)); @@ -1028,7 +956,7 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui * the cell. The cell's start timestamp can be cleared during reconciliation if it is * globally visible. */ - if (hs_ts >= ts || twp->stop_ts >= ts) + if (hs_start_ts >= ts || twp->stop_ts >= ts) break; } if (ret == WT_NOTFOUND) @@ -1095,7 +1023,7 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui */ for (; ret == 0; ret = hs_cursor->next(hs_cursor)) { /* We shouldn't have crossed the btree and user key search space. */ - WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_ts, &hs_counter)); + WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter)); WT_ASSERT(session, hs_btree_id == btree_id); #ifdef HAVE_DIAGNOSTIC WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp)); @@ -1113,7 +1041,7 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui * ignoring them. */ __wt_hs_upd_time_window(hs_cursor, &twp); - if (hs_ts < ts && twp->stop_ts < ts) + if (hs_start_ts < ts && twp->stop_ts < ts) continue; if (reinsert) { @@ -1167,10 +1095,10 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui /* Extract the underlying value for reinsertion. */ WT_ERR(hs_cursor->get_value( - hs_cursor, &tw.durable_stop_ts, &tw.durable_start_ts, &hs_upd_type, &hs_value)); + hs_cursor, &hs_durable_stop_ts, &hs_durable_start_ts, &hs_upd_type, &hs_value)); /* Reinsert the update with corrected timestamps. */ - if (no_ts_tombstone && hs_ts == ts) + if (no_ts_tombstone && hs_start_ts == ts) *counter = hs_counter; /* Insert the value back with different timestamps. */ @@ -1197,3 +1125,92 @@ err: hs_insert_cursor->close(hs_insert_cursor); return (ret); } + +/* + * __hs_delete_record -- + * Delete an update from the history store if it is not obsolete + */ +static int +__hs_delete_record( + WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_ITEM *key, WT_UPDATE *upd, WT_UPDATE *tombstone) +{ + WT_DECL_RET; + bool hs_read_committed; +#ifdef HAVE_DIAGNOSTIC + WT_TIME_WINDOW *hs_tw; +#endif + + if (r->hs_cursor == NULL) + WT_RET(__wt_curhs_open(session, NULL, &r->hs_cursor)); + hs_read_committed = F_ISSET(r->hs_cursor, WT_CURSTD_HS_READ_COMMITTED); + /* Ensure we can see all the content in the history store. */ + F_SET(r->hs_cursor, WT_CURSTD_HS_READ_COMMITTED); + + /* No need to delete from the history store if it is already obsolete. */ + if (tombstone != NULL && __wt_txn_upd_visible_all(session, tombstone)) + goto done; + + r->hs_cursor->set_key(r->hs_cursor, 4, S2BT(session)->id, key, WT_TS_MAX, UINT64_MAX); + WT_ERR_NOTFOUND_OK(__wt_curhs_search_near_before(session, r->hs_cursor), true); + /* It's possible the value in the history store becomes obsolete concurrently. */ + if (ret == WT_NOTFOUND) { + WT_ASSERT(session, tombstone != NULL && __wt_txn_upd_visible_all(session, tombstone)); + ret = 0; + } else { +#ifdef HAVE_DIAGNOSTIC + __wt_hs_upd_time_window(r->hs_cursor, &hs_tw); + WT_ASSERT(session, hs_tw->start_txn == WT_TXN_NONE || hs_tw->start_txn == upd->txnid); + WT_ASSERT(session, hs_tw->start_ts == WT_TS_NONE || hs_tw->start_ts == upd->start_ts); + WT_ASSERT(session, + hs_tw->durable_start_ts == WT_TS_NONE || hs_tw->durable_start_ts == upd->durable_ts); + if (tombstone != NULL) { + WT_ASSERT(session, hs_tw->stop_txn == tombstone->txnid); + WT_ASSERT(session, hs_tw->stop_ts == tombstone->start_ts); + WT_ASSERT(session, hs_tw->durable_stop_ts == tombstone->durable_ts); + } else + WT_ASSERT(session, !WT_TIME_WINDOW_HAS_STOP(hs_tw)); +#endif + + WT_ERR(r->hs_cursor->remove(r->hs_cursor)); + } +done: + if (tombstone != NULL) + F_CLR(tombstone, WT_UPDATE_TO_DELETE_FROM_HS | WT_UPDATE_HS); + F_CLR(upd, WT_UPDATE_TO_DELETE_FROM_HS | WT_UPDATE_HS); + +err: + if (!hs_read_committed) + F_CLR(r->hs_cursor, WT_CURSTD_HS_READ_COMMITTED); + return (ret); +} + +/* + * __wt_hs_delete_updates -- + * Delete the updates from the history store + */ +int +__wt_hs_delete_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r) +{ + WT_BTREE *btree; + WT_DECL_ITEM(key); + WT_DECL_RET; + WT_DELETE_HS_UPD *delete_hs_upd; + uint32_t i; + + /* Nothing to delete from the history store. */ + if (r->delete_hs_upd == NULL) + return (0); + + btree = S2BT(session); + + WT_RET(__wt_scr_alloc(session, WT_INTPACK64_MAXSIZE, &key)); + + for (delete_hs_upd = r->delete_hs_upd, i = 0; i < r->delete_hs_upd_next; ++delete_hs_upd, ++i) { + WT_ERR(__hs_pack_key(session, btree, r, delete_hs_upd->ins, delete_hs_upd->rip, key)); + WT_ERR(__hs_delete_record(session, r, key, delete_hs_upd->upd, delete_hs_upd->tombstone)); + } + +err: + __wt_scr_free(session, &key); + return (ret); +} diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 48532c375df..57f30316c26 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -787,6 +787,8 @@ extern int __wt_hs_config(WT_SESSION_IMPL *session, const char **cfg) extern int __wt_hs_delete_key_from_ts(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint32_t btree_id, const WT_ITEM *key, wt_timestamp_t ts, bool reinsert, bool ooo_tombstone, bool error_on_ooo_ts) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_hs_delete_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_hs_find_upd(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key, const char *value_format, uint64_t recno, WT_UPDATE_VALUE *upd_value, WT_ITEM *base_value_buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); diff --git a/src/third_party/wiredtiger/src/include/reconcile.h b/src/third_party/wiredtiger/src/include/reconcile.h index 70664653e5d..9d648f62d6a 100644 --- a/src/third_party/wiredtiger/src/include/reconcile.h +++ b/src/third_party/wiredtiger/src/include/reconcile.h @@ -60,6 +60,17 @@ struct __wt_rec_chunk { WT_ITEM image; /* disk-image */ }; +/* + * WT_DELETE_HS_UPD -- + * Update that needs to be deleted from the history store. + */ +struct __wt_delete_hs_upd { + WT_INSERT *ins; /* Insert list reference */ + WT_ROW *rip; /* Original on-page reference */ + WT_UPDATE *upd; + WT_UPDATE *tombstone; +}; + /* * Reconciliation is the process of taking an in-memory page, walking each entry * in the page, building a backing disk image in a temporary buffer representing @@ -217,6 +228,15 @@ struct __wt_reconcile { size_t supd_allocated; size_t supd_memsize; /* Size of saved update structures */ + /* + * List of updates to be deleted from the history store. While reviewing updates for each page, + * we save the updates that needs to be deleted from history store here, and then delete them + * after we have built the disk image. + */ + WT_DELETE_HS_UPD *delete_hs_upd; /* Updates to delete from history store */ + uint32_t delete_hs_upd_next; + size_t delete_hs_upd_allocated; + /* List of pages we've written so far. */ WT_MULTI *multi; uint32_t multi_next; @@ -295,7 +315,8 @@ struct __wt_reconcile { }; typedef struct { - WT_UPDATE *upd; /* Update to write (or NULL) */ + WT_UPDATE *upd; /* Update to write (or NULL) */ + WT_UPDATE *tombstone; /* The tombstone to write (or NULL) */ WT_TIME_WINDOW tw; diff --git a/src/third_party/wiredtiger/src/include/wt_internal.h b/src/third_party/wiredtiger/src/include/wt_internal.h index d1666d35a9c..22724bc2be3 100644 --- a/src/third_party/wiredtiger/src/include/wt_internal.h +++ b/src/third_party/wiredtiger/src/include/wt_internal.h @@ -181,6 +181,8 @@ struct __wt_data_handle; typedef struct __wt_data_handle WT_DATA_HANDLE; struct __wt_data_handle_cache; typedef struct __wt_data_handle_cache WT_DATA_HANDLE_CACHE; +struct __wt_delete_hs_upd; +typedef struct __wt_delete_hs_upd WT_DELETE_HS_UPD; struct __wt_dlh; typedef struct __wt_dlh WT_DLH; struct __wt_dsrc_stats; diff --git a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c index 855fa327439..5d476a00c75 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c @@ -51,6 +51,27 @@ __rec_update_save(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ return (0); } +/* + * __rec_delete_hs_upd_save -- + * Save an update into a WT_DELETE_HS_UPD list to delete it from the history store later. + */ +static inline int +__rec_delete_hs_upd_save(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, WT_ROW *rip, + WT_UPDATE *upd, WT_UPDATE *tombstone) +{ + WT_DELETE_HS_UPD *delete_hs_upd; + + WT_RET(__wt_realloc_def( + session, &r->delete_hs_upd_allocated, r->delete_hs_upd_next + 1, &r->delete_hs_upd)); + delete_hs_upd = &r->delete_hs_upd[r->delete_hs_upd_next]; + delete_hs_upd->ins = ins; + delete_hs_upd->rip = rip; + delete_hs_upd->upd = upd; + delete_hs_upd->tombstone = tombstone; + ++r->delete_hs_upd_next; + return (0); +} + /* * __rec_append_orig_value -- * Append the key's original value to its update list. It assumes that we have an onpage value, @@ -193,6 +214,45 @@ err: return (ret); } +/* + * __rec_find_and_save_delete_hs_upd -- + * Find and save the update that needs to be deleted from the history store later + */ +static int +__rec_find_and_save_delete_hs_upd(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, + WT_ROW *rip, WT_UPDATE_SELECT *upd_select) +{ + WT_UPDATE *delete_tombstone, *delete_upd; + + delete_tombstone = NULL; + + for (delete_upd = upd_select->tombstone != NULL ? upd_select->tombstone : upd_select->upd; + delete_upd != NULL; delete_upd = delete_upd->next) { + if (delete_upd->txnid == WT_TXN_ABORTED) + continue; + + if (F_ISSET(delete_upd, WT_UPDATE_TO_DELETE_FROM_HS)) { + /* + * If we want to remove an update from the history store in WiredTiger, it must be in + * history store. + */ + WT_ASSERT(session, F_ISSET(delete_upd, WT_UPDATE_HS | WT_UPDATE_RESTORED_FROM_HS)); + if (delete_upd->type == WT_UPDATE_TOMBSTONE) + delete_tombstone = delete_upd; + else { + WT_RET( + __rec_delete_hs_upd_save(session, r, ins, rip, delete_upd, delete_tombstone)); + break; + } + } + } + + /* If we delete a tombstone from the history store, we must also delete the update. */ + WT_ASSERT(session, delete_tombstone == NULL || delete_upd != NULL); + + return (0); +} + /* * __rec_need_save_upd -- * Return if we need to save the update chain @@ -201,8 +261,6 @@ static inline bool __rec_need_save_upd( WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_UPDATE_SELECT *upd_select, bool has_newer_updates) { - WT_UPDATE *upd; - if (upd_select->tw.prepare) return (true); @@ -213,15 +271,6 @@ __rec_need_save_upd( if (upd_select->upd != NULL && upd_select->upd->type == WT_UPDATE_TOMBSTONE) return (false); - /* Save the update chain to delete the update from the history store later. */ - for (upd = upd_select->upd; upd != NULL; upd = upd->next) { - if (upd->txnid == WT_TXN_ABORTED) - continue; - - if (F_ISSET(upd, WT_UPDATE_TO_DELETE_FROM_HS)) - return (true); - } - /* * Don't save updates for any reconciliation that doesn't involve history store (in-memory * database, fixed length column store, metadata, and history store reconciliation itself), @@ -308,15 +357,10 @@ __rec_validate_upd_chain(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_UPDATE *s if (!F_ISSET(r, WT_REC_CHECKPOINT_RUNNING)) return (0); - for (upd = select_upd; upd != NULL; upd = upd->next) { - if (upd->txnid == WT_TXN_ABORTED) - continue; - - /* Cannot delete the update from history store when checkpoint is running. */ - if (F_ISSET(upd, WT_UPDATE_TO_DELETE_FROM_HS)) { - WT_STAT_CONN_DATA_INCR(session, cache_eviction_blocked_remove_hs_race_with_checkpoint); - return (EBUSY); - } + /* Cannot delete the update from history store when checkpoint is running. */ + if (r->delete_hs_upd_next > 0) { + WT_STAT_CONN_DATA_INCR(session, cache_eviction_blocked_remove_hs_race_with_checkpoint); + return (EBUSY); } /* @@ -431,6 +475,7 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, W * both must be initialized. */ upd_select->upd = NULL; + upd_select->tombstone = NULL; upd_select->upd_saved = false; upd_select->ooo_tombstone = false; select_tw = &upd_select->tw; @@ -634,7 +679,7 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, W */ if (upd->type == WT_UPDATE_TOMBSTONE) { WT_TIME_WINDOW_SET_STOP(select_tw, upd); - tombstone = upd; + tombstone = upd_select->tombstone = upd; /* Find the update this tombstone applies to. */ if (!__wt_txn_upd_visible_all(session, upd)) { @@ -733,6 +778,13 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, W NULL : upd_select->upd; + /* + * If we have done a prepared rollback, we may have restored a history store value to the update + * chain but the same value is left in the history store. Save it to delete it from the history + * store later. + */ + WT_ERR(__rec_find_and_save_delete_hs_upd(session, r, ins, rip, upd_select)); + /* Check the update chain for conditions that could prevent it's eviction. */ WT_ERR(__rec_validate_upd_chain(session, r, onpage_upd, select_tw, vpack)); diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 74ec3714443..c1c25b192fd 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -623,6 +623,9 @@ __rec_init(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags, WT_SALVAGE_COO r->supd_next = 0; r->supd_memsize = 0; + /* The list of updates to be deleted from the history store. */ + r->delete_hs_upd_next = 0; + /* The list of pages we've written. */ r->multi = NULL; r->multi_next = 0; @@ -775,6 +778,7 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep) __wt_buf_free(session, &r->chunk_B.image); __wt_free(session, r->supd); + __wt_free(session, r->delete_hs_upd); __wt_rec_dictionary_free(session, r); @@ -2385,6 +2389,12 @@ __rec_hs_wrapup(WT_SESSION_IMPL *session, WT_RECONCILE *r) /* Flag as unused for non diagnostic builds. */ WT_UNUSED(btree); + /* + * Delete the updates left in the history store by prepared rollback first before moving updates + * to the history store. + */ + WT_ERR(__wt_hs_delete_updates(session, r)); + /* Check if there's work to do. */ for (multi = r->multi, i = 0; i < r->multi_next; ++multi, ++i) if (multi->supd != NULL) diff --git a/src/third_party/wiredtiger/test/suite/test_prepare25.py b/src/third_party/wiredtiger/test/suite/test_prepare25.py index 3059abb3b25..fd6100a54f7 100644 --- a/src/third_party/wiredtiger/test/suite/test_prepare25.py +++ b/src/third_party/wiredtiger/test/suite/test_prepare25.py @@ -31,7 +31,7 @@ from wtscenario import make_scenarios # test_prepare25.py # Test prepare rollback and then prepare commit with failed eviction. -class test_prepare23(wttest.WiredTigerTestCase): +class test_prepare25(wttest.WiredTigerTestCase): conn_config = 'timing_stress_for_test=[failpoint_eviction_fail_after_reconciliation]' format_values = [ diff --git a/src/third_party/wiredtiger/test/suite/test_prepare26.py b/src/third_party/wiredtiger/test/suite/test_prepare26.py new file mode 100644 index 00000000000..3a9fa353a1b --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_prepare26.py @@ -0,0 +1,134 @@ +#!/usr/bin/env python +# +# Public Domain 2014-present MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wiredtiger, wttest +from wtscenario import make_scenarios + +# test_prepare26.py +# Test prepare rollback and then delete the key. +class test_prepare26(wttest.WiredTigerTestCase): + format_values = [ + ('column', dict(key_format='r', value_format='S')), + #('column_fix', dict(key_format='r', value_format='8t')), + ('row_integer', dict(key_format='i', value_format='S')), + ] + + scenarios = make_scenarios(format_values) + + def test_prepare26(self): + uri = "table:test_prepare26" + self.session.create(uri, 'key_format=' + self.key_format + ',value_format=' + self.value_format) + + if self.value_format == '8t': + value_a = 97 + value_b = 98 + value_c = 99 + else: + value_a = "a" + value_b = "b" + value_c = "c" + + # Pin oldest timestamp to 1 + self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(1)) + + # Insert a value + cursor = self.session.open_cursor(uri) + self.session.begin_transaction() + cursor[1] = value_a + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(10)) + + # Do a prepared update + self.session.begin_transaction() + cursor[1] = value_c + self.session.prepare_transaction('prepare_timestamp=' + self.timestamp_str(20)) + + # Evict the page + session2 = self.conn.open_session() + evict_cursor = session2.open_cursor(uri, None, 'debug=(release_evict)') + session2.begin_transaction('ignore_prepare=true,read_timestamp=' + self.timestamp_str(10)) + self.assertEquals(evict_cursor[1], value_a) + evict_cursor.reset() + evict_cursor.close() + session2.rollback_transaction() + + # Rollback the prepared transaction + self.session.rollback_transaction() + + # Delete the key + self.session.begin_transaction() + cursor.set_key(1) + cursor.remove() + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(30)) + + # Set oldest timestamp to 30 + self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(30)) + + # Evict the page again + evict_cursor = session2.open_cursor(uri, None, 'debug=(release_evict)') + session2.begin_transaction() + evict_cursor.set_key(1) + if self.value_format == '8t': + self.assertEquals(evict_cursor[1], 0) + else: + evict_cursor.set_key(1) + self.assertEquals(evict_cursor.search(), wiredtiger.WT_NOTFOUND) + evict_cursor.reset() + evict_cursor.close() + session2.rollback_transaction() + + # Do another update + cursor = self.session.open_cursor(uri) + self.session.begin_transaction() + cursor[1] = value_b + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(40)) + + # Do another update + cursor = self.session.open_cursor(uri) + self.session.begin_transaction() + cursor[1] = value_c + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(50)) + + # Evict the page again + evict_cursor = session2.open_cursor(uri, None, 'debug=(release_evict)') + session2.begin_transaction('read_timestamp=' + self.timestamp_str(50)) + self.assertEquals(evict_cursor[1], value_c) + evict_cursor.reset() + evict_cursor.close() + session2.rollback_transaction() + + # Verify we read nothing at the oldest + self.session.begin_transaction('read_timestamp=' + self.timestamp_str(30)) + if self.value_format == '8t': + self.assertEquals(cursor[1], 0) + else: + cursor.set_key(1) + self.assertEquals(cursor.search(), wiredtiger.WT_NOTFOUND) + self.session.rollback_transaction() + +if __name__ == '__main__': + wttest.run() -- cgit v1.2.1