diff options
| | | |
|---|---|---|
| author | Etienne Petrel <etienne.petrel@mongodb.com> | 2022-04-18 03:34:41 +0000 |
| committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2022-04-18 06:08:31 +0000 |
| commit | 456505ebd2581ea8c0bdf1451cc0260d685dab2d (patch) | |
| tree | 857140df431e0b2809e1bde84c67a9ff6de6a1c4 | |
| parent | e7b7713e9757cf07c20125c78d948663c10c6985 (diff) | |
| download | mongo-456505ebd2581ea8c0bdf1451cc0260d685dab2d.tar.gz | |
Import wiredtiger: a0eaa7957bb8ac7eca814b89fa406c17f857095b from branch mongodb-4.4
ref: 638aa27c3f..a0eaa7957b
for: 4.4.14
WT-8362 Remove or rewrite HS entries of a key when OOO tombstone is written to datastore
WT-8450 Report stats in hs_cleanup_stress, don't validate them
WT-8708 Fix timestamp usage error in test/checkpoint
WT-8824 Disable code coverage measurement on mongodb-4.4
WT-8879 Set the OOO flag when the selected tombstone is globally visible
WT-8909 Disable cpp test search_near_01 on 4.4
WT-8924 Don't check against on disk time window if there is an insert list when checking for conflicts in row-store
23 files changed, 507 insertions, 165 deletions
diff --git a/src/third_party/wiredtiger/dist/test_data.py b/src/third_party/wiredtiger/dist/test_data.py index f35d400be4e..5ce241fb6e4 100644 --- a/src/third_party/wiredtiger/dist/test_data.py +++ b/src/third_party/wiredtiger/dist/test_data.py @@ -220,6 +220,5 @@ methods = { How long the insertions will occur for.''')]), 'example_test' : Method(test_config), 'hs_cleanup' : Method(test_config), - 'search_near_01' : Method(test_config), 'search_near_02' : Method(test_config), } diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index 4f3cb88e859..a653e281545 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-4.4", - "commit": "638aa27c3f2893cfb422791b264a71459539e036" + "commit": "a0eaa7957bb8ac7eca814b89fa406c17f857095b" } diff --git a/src/third_party/wiredtiger/src/config/test_config.c b/src/third_party/wiredtiger/src/config/test_config.c index a9f954ace43..b0c6245bb10 100644 --- a/src/third_party/wiredtiger/src/config/test_config.c +++ b/src/third_party/wiredtiger/src/config/test_config.c @@ -126,19 +126,6 @@ static const WT_CONFIG_CHECK confchk_hs_cleanup[] = { {"workload_tracking", "category", NULL, NULL, confchk_workload_tracking_subconfigs, 2}, {NULL, NULL, NULL, NULL, NULL, 0}}; -static const WT_CONFIG_CHECK confchk_search_near_01[] = { - {"cache_size_mb", "int", NULL, "min=0,max=100000000000", NULL, 0}, - {"checkpoint_manager", "category", NULL, NULL, confchk_checkpoint_manager_subconfigs, 2}, - {"compression_enabled", "boolean", NULL, NULL, NULL, 0}, - {"duration_seconds", "int", NULL, "min=0,max=1000000", NULL, 0}, - {"enable_logging", "boolean", NULL, NULL, NULL, 0}, - {"runtime_monitor", "category", NULL, NULL, confchk_runtime_monitor_subconfigs, 5}, - {"statistics_config", "category", NULL, NULL, confchk_statistics_config_subconfigs, 2}, - {"timestamp_manager", 
"category", NULL, NULL, confchk_timestamp_manager_subconfigs, 4}, - {"workload_generator", "category", NULL, NULL, confchk_workload_generator_subconfigs, 6}, - {"workload_tracking", "category", NULL, NULL, confchk_workload_tracking_subconfigs, 2}, - {NULL, NULL, NULL, NULL, NULL, 0}}; - static const WT_CONFIG_CHECK confchk_search_near_02[] = { {"cache_size_mb", "int", NULL, "min=0,max=100000000000", NULL, 0}, {"checkpoint_manager", "category", NULL, NULL, confchk_checkpoint_manager_subconfigs, 2}, @@ -226,24 +213,6 @@ static const WT_CONFIG_ENTRY config_entries[] = { "ops_per_transaction=(max=1,min=0),thread_count=0,value_size=5))," "workload_tracking=(enabled=true,op_rate=1s)", confchk_hs_cleanup, 10}, - {"search_near_01", - "cache_size_mb=0,checkpoint_manager=(enabled=false,op_rate=1s)," - "compression_enabled=false,duration_seconds=0," - "enable_logging=false,runtime_monitor=(enabled=true,op_rate=1s," - "postrun_statistics=[],stat_cache_size=(enabled=false,limit=0)," - "stat_db_size=(enabled=false,limit=0))," - "statistics_config=(enable_logging=true,type=all)," - "timestamp_manager=(enabled=true,oldest_lag=1,op_rate=1s," - "stable_lag=1),workload_generator=(enabled=true," - "insert_config=(key_size=5,op_rate=1s,ops_per_transaction=(max=1," - "min=0),thread_count=0,value_size=5),op_rate=1s," - "populate_config=(collection_count=1,key_count_per_collection=0," - "key_size=5,thread_count=1,value_size=5),read_config=(key_size=5," - "op_rate=1s,ops_per_transaction=(max=1,min=0),thread_count=0," - "value_size=5),update_config=(key_size=5,op_rate=1s," - "ops_per_transaction=(max=1,min=0),thread_count=0,value_size=5))," - "workload_tracking=(enabled=true,op_rate=1s)", - confchk_search_near_01, 10}, {"search_near_02", "cache_size_mb=0,checkpoint_manager=(enabled=false,op_rate=1s)," "compression_enabled=false,duration_seconds=0," diff --git a/src/third_party/wiredtiger/src/history/hs_rec.c b/src/third_party/wiredtiger/src/history/hs_rec.c index 9ac013ef617..a98bbc7c7a2 
100644 --- a/src/third_party/wiredtiger/src/history/hs_rec.c +++ b/src/third_party/wiredtiger/src/history/hs_rec.c @@ -9,8 +9,8 @@ #include "wt_internal.h" static int __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, - uint32_t btree_id, const WT_ITEM *key, wt_timestamp_t ts, bool reinsert, bool error_on_ooo_ts, - uint64_t *hs_counter); + uint32_t btree_id, const WT_ITEM *key, wt_timestamp_t ts, bool reinsert, bool ooo_tombstone, + bool error_on_ooo_ts, uint64_t *hs_counter); /* * __hs_verbose_cache_stats -- @@ -208,8 +208,8 @@ __hs_insert_record(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BTREE *btree, } if (ret == 0) - WT_ERR(__hs_delete_reinsert_from_pos( - session, cursor, btree->id, key, tw->start_ts + 1, true, error_on_ooo_ts, &counter)); + WT_ERR(__hs_delete_reinsert_from_pos(session, cursor, btree->id, key, tw->start_ts + 1, + true, false, error_on_ooo_ts, &counter)); #ifdef HAVE_DIAGNOSTIC /* @@ -533,7 +533,7 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_MULTI *mult if (!F_ISSET(fix_ts_upd, WT_UPDATE_FIXED_HS)) { /* Delete and reinsert any update of the key with a higher timestamp. */ WT_ERR(__wt_hs_delete_key_from_ts(session, hs_cursor, btree->id, key, - fix_ts_upd->start_ts + 1, true, error_on_ooo_ts)); + fix_ts_upd->start_ts + 1, true, false, error_on_ooo_ts)); F_SET(fix_ts_upd, WT_UPDATE_FIXED_HS); } } @@ -782,7 +782,7 @@ err: */ int __wt_hs_delete_key_from_ts(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint32_t btree_id, - const WT_ITEM *key, wt_timestamp_t ts, bool reinsert, bool error_on_ooo_ts) + const WT_ITEM *key, wt_timestamp_t ts, bool reinsert, bool ooo_tombstone, bool error_on_ooo_ts) { WT_DECL_RET; WT_ITEM hs_key; @@ -792,10 +792,10 @@ __wt_hs_delete_key_from_ts(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint3 bool hs_read_all_flag; /* - * If we will delete all the updates of the key from the history store, we should not reinsert - * any update. 
+ * If we delete all the updates of the key from the history store, we should not reinsert any + * update except when an out-of-order tombstone is not globally visible yet. */ - WT_ASSERT(session, ts > WT_TS_NONE || !reinsert); + WT_ASSERT(session, ooo_tombstone || ts > WT_TS_NONE || !reinsert); hs_read_all_flag = F_ISSET(hs_cursor, WT_CURSTD_HS_READ_ALL); @@ -815,8 +815,8 @@ __wt_hs_delete_key_from_ts(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint3 ++hs_counter; } - WT_ERR(__hs_delete_reinsert_from_pos( - session, hs_cursor, btree_id, key, ts, reinsert, error_on_ooo_ts, &hs_counter)); + WT_ERR(__hs_delete_reinsert_from_pos(session, hs_cursor, btree_id, key, ts, reinsert, + ooo_tombstone, error_on_ooo_ts, &hs_counter)); done: err: @@ -834,7 +834,8 @@ err: */ static int __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, uint32_t btree_id, - const WT_ITEM *key, wt_timestamp_t ts, bool reinsert, bool error_on_ooo_ts, uint64_t *counter) + const WT_ITEM *key, wt_timestamp_t ts, bool reinsert, bool ooo_tombstone, bool error_on_ooo_ts, + uint64_t *counter) { WT_CURSOR *hs_insert_cursor; WT_CURSOR_BTREE *hs_cbt; @@ -858,9 +859,11 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui WT_UNUSED(key); #endif - /* If we will delete all the updates of the key from the history store, we should not reinsert - * any update. */ - WT_ASSERT(session, ts > WT_TS_NONE || !reinsert); + /* + * If we delete all the updates of the key from the history store, we should not reinsert any + * update except when an out-of-order tombstone is not globally visible yet. + */ + WT_ASSERT(session, ooo_tombstone || ts > WT_TS_NONE || !reinsert); for (; ret == 0; ret = hs_cursor->next(hs_cursor)) { /* Ignore records that are obsolete. 
*/ @@ -971,7 +974,16 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui __wt_timestamp_to_string(hs_cbt->upd_value->tw.durable_stop_ts, ts_string[3]), __wt_timestamp_to_string(ts, ts_string[4])); - hs_insert_tw.start_ts = hs_insert_tw.durable_start_ts = ts - 1; + /* + * Use the original start time window's timestamps if it isn't out of order with respect + * to the new update. + */ + if (hs_cbt->upd_value->tw.start_ts >= ts) + hs_insert_tw.start_ts = hs_insert_tw.durable_start_ts = ooo_tombstone ? ts : ts - 1; + else { + hs_insert_tw.start_ts = hs_cbt->upd_value->tw.start_ts; + hs_insert_tw.durable_start_ts = hs_cbt->upd_value->tw.durable_start_ts; + } hs_insert_tw.start_txn = hs_cbt->upd_value->tw.start_txn; /* @@ -979,13 +991,17 @@ __hs_delete_reinsert_from_pos(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, ui * another moved update OR the update itself triggered the correction. In either case, * we should preserve the stop transaction id. */ - hs_insert_tw.stop_ts = hs_insert_tw.durable_stop_ts = ts - 1; + hs_insert_tw.stop_ts = hs_insert_tw.durable_stop_ts = ooo_tombstone ? ts : ts - 1; hs_insert_tw.stop_txn = hs_cbt->upd_value->tw.stop_txn; /* Extract the underlying value for reinsertion. */ WT_ERR(hs_cursor->get_value( hs_cursor, &tw.durable_stop_ts, &tw.durable_start_ts, &hs_upd_type, &hs_value)); + /* Reinsert the update with corrected timestamps. */ + if (ooo_tombstone && hs_ts == ts) + *counter = hs_counter; + /* Insert the value back with different timestamps. */ hs_insert_cursor->set_key( hs_insert_cursor, 4, btree_id, &hs_key, hs_insert_tw.start_ts, *counter); diff --git a/src/third_party/wiredtiger/src/include/api.h b/src/third_party/wiredtiger/src/include/api.h index 6f9c350afd6..7f18ccabcb1 100644 --- a/src/third_party/wiredtiger/src/include/api.h +++ b/src/third_party/wiredtiger/src/include/api.h @@ -89,7 +89,7 @@ * We should not leave any history store cursor open when return from an api call. 
\ * However, we cannot do a stricter check before WT-7247 is resolved. \ */ \ - WT_ASSERT(s, (s)->api_call_counter > 1 || (s)->hs_cursor_counter <= 2); \ + WT_ASSERT(s, (s)->api_call_counter > 1 || (s)->hs_cursor_counter <= 3); \ /* \ * No code after this line, otherwise error handling \ * won't be correct. \ diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index c3cd6385b1a..ffd9a61a4e1 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -780,8 +780,8 @@ extern int __wt_hex_to_raw(WT_SESSION_IMPL *session, const char *from, WT_ITEM * extern int __wt_hs_config(WT_SESSION_IMPL *session, const char **cfg) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_hs_delete_key_from_ts(WT_SESSION_IMPL *session, WT_CURSOR *hs_cursor, - uint32_t btree_id, const WT_ITEM *key, wt_timestamp_t ts, bool reinsert, bool error_on_ooo_ts) - WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); + uint32_t btree_id, const WT_ITEM *key, wt_timestamp_t ts, bool reinsert, bool ooo_tombstone, + bool error_on_ooo_ts) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_hs_find_upd(WT_SESSION_IMPL *session, uint32_t btree_id, WT_ITEM *key, const char *value_format, uint64_t recno, WT_UPDATE_VALUE *upd_value, WT_ITEM *base_value_buf) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); @@ -1232,6 +1232,9 @@ extern int __wt_rec_dictionary_init(WT_SESSION_IMPL *session, WT_RECONCILE *r, u WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_rec_dictionary_lookup(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REC_KV *val, WT_REC_DICTIONARY **dpp) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); +extern int __wt_rec_hs_clear_on_tombstone(WT_SESSION_IMPL *session, WT_RECONCILE *r, + wt_timestamp_t ts, uint64_t recno, WT_ITEM *rowkey, bool reinsert) + WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int 
__wt_rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) WT_GCC_FUNC_DECL_ATTRIBUTE((warn_unused_result)); extern int __wt_rec_row_leaf(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref, diff --git a/src/third_party/wiredtiger/src/include/reconcile.h b/src/third_party/wiredtiger/src/include/reconcile.h index ca94b8a1478..70664653e5d 100644 --- a/src/third_party/wiredtiger/src/include/reconcile.h +++ b/src/third_party/wiredtiger/src/include/reconcile.h @@ -284,6 +284,14 @@ struct __wt_reconcile { bool rec_page_cell_with_ts; bool rec_page_cell_with_txn_id; bool rec_page_cell_with_prepared_txn; + + /* + * When removing a key due to a tombstone with a durable timestamp of "none", we also remove the + * history store contents associated with that key. Keep the pertinent state here: a flag to say + * whether this is appropriate, and a cached history store cursor for doing it. + */ + bool hs_clear_on_tombstone; + WT_CURSOR *hs_cursor; }; typedef struct { @@ -291,7 +299,8 @@ typedef struct { WT_TIME_WINDOW tw; - bool upd_saved; /* An element on the row's update chain was saved */ + bool upd_saved; /* An element on the row's update chain was saved */ + bool ooo_tombstone; /* Out-of-order tombstone */ } WT_UPDATE_SELECT; /* diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h index 9868964597e..5206910f139 100644 --- a/src/third_party/wiredtiger/src/include/session.h +++ b/src/third_party/wiredtiger/src/include/session.h @@ -159,7 +159,7 @@ struct __wt_session_impl { size_t op_handle_allocated; /* Bytes allocated */ void *reconcile; /* Reconciliation support */ - void (*reconcile_cleanup)(WT_SESSION_IMPL *); + int (*reconcile_cleanup)(WT_SESSION_IMPL *); /* Salvage support. 
*/ void *salvage_track; diff --git a/src/third_party/wiredtiger/src/include/txn_inline.h b/src/third_party/wiredtiger/src/include/txn_inline.h index a6b13708057..c9247515bb8 100644 --- a/src/third_party/wiredtiger/src/include/txn_inline.h +++ b/src/third_party/wiredtiger/src/include/txn_inline.h @@ -1353,8 +1353,13 @@ __wt_txn_modify_check( * Check conflict against any on-page value if there is no update on the update chain except * aborted updates. Otherwise, we would have either already detected a conflict if we saw an * uncommitted update or determined that it would be safe to write if we saw a committed update. + * + * In the case of row-store we also need to check that the insert list is empty as the existence + * of it implies there is no on disk value for the given key. However we can still get a + * time-window from an unrelated on-disk value if we are not careful as the slot can still be + * set on the cursor b-tree. */ - if (!rollback && upd == NULL) { + if (!rollback && upd == NULL && (CUR2BT(cbt)->type != BTREE_ROW || cbt->ins == NULL)) { __wt_read_cell_time_window(cbt, &tw, &tw_found); if (tw_found) { if (WT_TIME_WINDOW_HAS_STOP(&tw)) diff --git a/src/third_party/wiredtiger/src/reconcile/rec_col.c b/src/third_party/wiredtiger/src/reconcile/rec_col.c index 7b716c4808a..dfca67b3ea6 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_col.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_col.c @@ -580,20 +580,17 @@ __wt_rec_col_var( WT_CELL *cell; WT_CELL_UNPACK_KV *vpack, _vpack; WT_COL *cip; - WT_CURSOR *hs_cursor; WT_CURSOR_BTREE *cbt; WT_DECL_ITEM(orig); WT_DECL_RET; WT_INSERT *ins; - WT_ITEM hs_recno_key; WT_PAGE *page; WT_TIME_WINDOW clear_tw, *twp; WT_UPDATE *upd; WT_UPDATE_SELECT upd_select; uint64_t n, nrepeat, repeat_count, rle, skip, src_recno; uint32_t i, size; - uint8_t *p, hs_recno_key_buf[WT_INTPACK64_MAXSIZE]; - bool deleted, hs_clear, orig_deleted, update_no_copy; + bool deleted, orig_deleted, update_no_copy; const void *data; 
btree = S2BT(session); @@ -625,11 +622,6 @@ __wt_rec_col_var( * they shouldn't open new dhandles. In those cases we won't ever need to blow away history * store content, so we can skip this. */ - hs_cursor = NULL; - hs_clear = F_ISSET(S2C(session), WT_CONN_HS_OPEN) && - !F_ISSET(session, WT_SESSION_NO_DATA_HANDLES) && !WT_IS_HS(btree->dhandle) && - !WT_IS_METADATA(btree->dhandle); - WT_RET(__wt_rec_split_init(session, r, page, pageref->ref_recno, btree->maxleafpage_precomp)); WT_RET(__wt_scr_alloc(session, 0, &orig)); @@ -816,41 +808,25 @@ record_loop: case WT_UPDATE_STANDARD: data = upd->data; size = upd->size; + /* + * When an out-of-order or mixed-mode tombstone is getting written to disk, + * remove any historical versions that are greater in the history store for this + * key. + */ + if (upd_select.ooo_tombstone && r->hs_clear_on_tombstone) + WT_ERR(__wt_rec_hs_clear_on_tombstone( + session, r, twp->durable_stop_ts, src_recno, NULL, true)); + break; case WT_UPDATE_TOMBSTONE: /* - * When removing a key due to a tombstone with a durable timestamp of "none", - * also remove the history store contents associated with that key. + * When an out-of-order or mixed-mode tombstone is getting written to disk, + * remove any historical versions that are greater in the history store for this + * key. */ - if (twp->durable_stop_ts == WT_TS_NONE && hs_clear) { - p = hs_recno_key_buf; - WT_ERR(__wt_vpack_uint(&p, 0, src_recno)); - hs_recno_key.data = hs_recno_key_buf; - hs_recno_key.size = WT_PTRDIFF(p, hs_recno_key_buf); - - /* Open a history store cursor if we don't yet have one. */ - if (hs_cursor == NULL) - WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor)); - - /* - * From WT_TS_NONE delete all the history store content of the key. This - * path will never be taken for a mixed-mode deletion being evicted and with - * a checkpoint that started prior to the eviction starting its - * reconciliation as previous checks done while selecting an update will - * detect that. 
- */ - WT_ERR(__wt_hs_delete_key_from_ts( - session, hs_cursor, btree->id, &hs_recno_key, WT_TS_NONE, false, false)); - - /* Fail 0.01% of the time. */ - if (F_ISSET(r, WT_REC_EVICT) && - __wt_failpoint(session, - WT_TIMING_STRESS_FAILPOINT_HISTORY_STORE_DELETE_KEY_FROM_TS, 0.01)) - WT_ERR(EBUSY); - - WT_STAT_CONN_INCR(session, cache_hs_key_truncate_onpage_removal); - WT_STAT_DATA_INCR(session, cache_hs_key_truncate_onpage_removal); - } + if (upd_select.ooo_tombstone && r->hs_clear_on_tombstone) + WT_ERR(__wt_rec_hs_clear_on_tombstone( + session, r, twp->durable_stop_ts, src_recno, NULL, false)); deleted = true; twp = &clear_tw; @@ -1077,8 +1053,6 @@ next: ret = __wt_rec_split_finish(session, r); err: - if (hs_cursor != NULL) - WT_TRET(hs_cursor->close(hs_cursor)); __wt_scr_free(session, &orig); return (ret); } diff --git a/src/third_party/wiredtiger/src/reconcile/rec_row.c b/src/third_party/wiredtiger/src/reconcile/rec_row.c index a977b7d088c..60c5722398e 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_row.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_row.c @@ -710,7 +710,6 @@ __wt_rec_row_leaf( WT_BTREE *btree; WT_CELL *cell; WT_CELL_UNPACK_KV *kpack, _kpack, *vpack, _vpack; - WT_CURSOR *hs_cursor; WT_CURSOR_BTREE *cbt; WT_DECL_ITEM(tmpkey); WT_DECL_RET; @@ -726,7 +725,7 @@ __wt_rec_row_leaf( uint64_t slvg_skip; uint32_t i; uint8_t key_prefix; - bool dictionary, hs_clear, key_onpage_ovfl, ovfl_key; + bool dictionary, key_onpage_ovfl, ovfl_key; void *copy; const void *key_data; @@ -755,11 +754,6 @@ __wt_rec_row_leaf( * they shouldn't open new dhandles. In those cases we won't ever need to blow away history * store content, so we can skip this. 
*/ - hs_cursor = NULL; - hs_clear = F_ISSET(S2C(session), WT_CONN_HS_OPEN) && - !F_ISSET(session, WT_SESSION_NO_DATA_HANDLES) && !WT_IS_HS(btree->dhandle) && - !WT_IS_METADATA(btree->dhandle); - WT_RET(__wt_rec_split_init(session, r, page, 0, btree->maxleafpage_precomp)); /* @@ -896,6 +890,15 @@ __wt_rec_row_leaf( case WT_UPDATE_STANDARD: /* Take the value from the update. */ WT_ERR(__wt_rec_cell_build_val(session, r, upd->data, upd->size, twp, 0)); + /* + * When an out-of-order or mixed-mode tombstone is getting written to disk, remove + * any historical versions that are greater in the history store for that key. + */ + if (upd_select.ooo_tombstone && r->hs_clear_on_tombstone) { + WT_ERR(__wt_row_leaf_key(session, page, rip, tmpkey, true)); + WT_ERR(__wt_rec_hs_clear_on_tombstone( + session, r, twp->durable_stop_ts, WT_RECNO_OOB, tmpkey, true)); + } dictionary = true; break; case WT_UPDATE_TOMBSTONE: @@ -920,32 +923,13 @@ __wt_rec_row_leaf( } /* - * When removing a key due to a tombstone with a durable timestamp of "none", also - * remove the history store contents associated with that key. + * When an out-of-order or mixed-mode tombstone is getting written to disk, remove + * any historical versions that are greater in the history store for this key. */ - if (twp->durable_stop_ts == WT_TS_NONE && hs_clear) { + if (upd_select.ooo_tombstone && r->hs_clear_on_tombstone) { WT_ERR(__wt_row_leaf_key(session, page, rip, tmpkey, true)); - - /* Open a history store cursor if we don't yet have one. */ - if (hs_cursor == NULL) - WT_ERR(__wt_curhs_open(session, NULL, &hs_cursor)); - - /* - * From WT_TS_NONE delete all the history store content of the key. This path - * will never be taken for a mixed-mode deletion being evicted and with a - * checkpoint that started prior to the eviction starting its reconciliation as - * previous checks done while selecting an update will detect that. 
- */ - WT_ERR(__wt_hs_delete_key_from_ts( - session, hs_cursor, btree->id, tmpkey, WT_TS_NONE, false, false)); - - /* Fail 0.01% of the time. */ - if (F_ISSET(r, WT_REC_EVICT) && - __wt_failpoint( - session, WT_TIMING_STRESS_FAILPOINT_HISTORY_STORE_DELETE_KEY_FROM_TS, 0.01)) - WT_ERR(EBUSY); - WT_STAT_CONN_INCR(session, cache_hs_key_truncate_onpage_removal); - WT_STAT_DATA_INCR(session, cache_hs_key_truncate_onpage_removal); + WT_ERR(__wt_rec_hs_clear_on_tombstone( + session, r, twp->durable_stop_ts, WT_RECNO_OOB, tmpkey, false)); } /* @@ -1077,8 +1061,6 @@ leaf_insert: ret = __wt_rec_split_finish(session, r); err: - if (hs_cursor != NULL) - WT_TRET(hs_cursor->close(hs_cursor)); __wt_scr_free(session, &tmpkey); return (ret); } diff --git a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c index aed99b9f906..ba947c13ac4 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c @@ -402,6 +402,7 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, W */ upd_select->upd = NULL; upd_select->upd_saved = false; + upd_select->ooo_tombstone = false; select_tw = &upd_select->tw; WT_TIME_WINDOW_INIT(select_tw); @@ -699,6 +700,32 @@ __wt_rec_upd_select(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins, W WT_ERR(__rec_validate_upd_chain(session, r, onpage_upd, select_tw, vpack)); /* + * Set the flag if the selected tombstone is an out-of-order or mixed mode to an update. Based + * on this flag, the caller functions perform the history store truncation for this key. + */ + if (!is_hs_page && tombstone != NULL && + !F_ISSET(tombstone, WT_UPDATE_RESTORED_FROM_DS | WT_UPDATE_RESTORED_FROM_HS)) { + upd = upd_select->upd; + + /* + * The selected update can be the tombstone itself when the tombstone is globally visible. 
+ * Compare the tombstone's timestamp with either the next update in the update list or the + * on-disk cell timestamp to determine if the tombstone is an out-of-order or mixed mode. + */ + if (tombstone == upd) { + upd = upd->next; + + /* Loop until a valid update is found. */ + while (upd != NULL && upd->txnid == WT_TXN_ABORTED) + upd = upd->next; + } + + if ((upd != NULL && upd->start_ts > tombstone->start_ts) || + (vpack != NULL && vpack->tw.start_ts > tombstone->start_ts)) + upd_select->ooo_tombstone = true; + } + + /* * Fixup any out of order timestamps, assert that checkpoint wasn't running when this round of * reconciliation started. * diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 1cd1a3be5d0..60d2e51da83 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -8,9 +8,9 @@ #include "wt_internal.h" -static void __rec_cleanup(WT_SESSION_IMPL *, WT_RECONCILE *); -static void __rec_destroy(WT_SESSION_IMPL *, void *); -static void __rec_destroy_session(WT_SESSION_IMPL *); +static int __rec_cleanup(WT_SESSION_IMPL *, WT_RECONCILE *); +static int __rec_destroy(WT_SESSION_IMPL *, void *); +static int __rec_destroy_session(WT_SESSION_IMPL *); static int __rec_init(WT_SESSION_IMPL *, WT_REF *, uint32_t, WT_SALVAGE_COOKIE *, void *); static int __rec_hs_wrapup(WT_SESSION_IMPL *, WT_RECONCILE *); static int __rec_root_write(WT_SESSION_IMPL *, WT_PAGE *, uint32_t); @@ -265,7 +265,7 @@ __reconcile(WT_SESSION_IMPL *session, WT_REF *ref, WT_SALVAGE_COOKIE *salvage, u btree->rec_multiblock_max = r->multi_next; /* Clean up the reconciliation structure. 
*/ - __rec_cleanup(session, r); + WT_RET(__rec_cleanup(session, r)); /* * When threads perform eviction, don't cache block manager structures (even across calls), we @@ -675,6 +675,22 @@ __rec_init(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags, WT_SALVAGE_COO r->rec_page_cell_with_txn_id = false; r->rec_page_cell_with_prepared_txn = false; + /* + * When removing a key due to a tombstone with a durable timestamp of "none", also remove the + * history store contents associated with that key. It's safe to do even if we fail + * reconciliation after the removal, the history store content must be obsolete in order for us + * to consider removing the key. + * + * Ignore if this is metadata, as metadata doesn't have any history. + * + * Some code paths, such as schema removal, involve deleting keys in metadata and assert that + * they shouldn't open new dhandles. In those cases we won't ever need to blow away history + * store content, so we can skip this. + */ + r->hs_clear_on_tombstone = F_ISSET(S2C(session), WT_CONN_HS_OPEN) && + !F_ISSET(session, WT_SESSION_NO_DATA_HANDLES) && !WT_IS_HS(btree->dhandle) && + !WT_IS_METADATA(btree->dhandle); + /* * If we allocated the reconciliation structure and there was an error, clean up. If our caller * passed in a structure, they own it. @@ -684,8 +700,8 @@ err: if (ret == 0) *(WT_RECONCILE **)reconcilep = r; else { - __rec_cleanup(session, r); - __rec_destroy(session, &r); + WT_TRET(__rec_cleanup(session, r)); + WT_TRET(__rec_destroy(session, &r)); } } @@ -696,7 +712,7 @@ err: * __rec_cleanup -- * Clean up after a reconciliation run, except for structures cached across runs. 
*/ -static void +static int __rec_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r) { WT_BTREE *btree; @@ -705,6 +721,9 @@ __rec_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r) btree = S2BT(session); + if (r->hs_cursor != NULL) + WT_RET(r->hs_cursor->reset(r->hs_cursor)); + if (btree->type == BTREE_ROW) for (multi = r->multi, i = 0; i < r->multi_next; ++multi, ++i) __wt_free(session, multi->key.ikey); @@ -717,19 +736,25 @@ __rec_cleanup(WT_SESSION_IMPL *session, WT_RECONCILE *r) /* Reconciliation is not re-entrant, make sure that doesn't happen. */ r->ref = NULL; + + return (0); } /* * __rec_destroy -- * Clean up the reconciliation structure. */ -static void +static int __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep) { WT_RECONCILE *r; if ((r = *(WT_RECONCILE **)reconcilep) == NULL) - return; + return (0); + + if (r->hs_cursor != NULL) + WT_RET(r->hs_cursor->close(r->hs_cursor)); + *(WT_RECONCILE **)reconcilep = NULL; __wt_buf_free(session, &r->chunk_A.key); @@ -752,16 +777,18 @@ __rec_destroy(WT_SESSION_IMPL *session, void *reconcilep) __wt_buf_free(session, &r->update_modify_cbt._upd_value.buf); __wt_free(session, r); + + return (0); } /* * __rec_destroy_session -- * Clean up the reconciliation structure, session version. 
*/ -static void +static int __rec_destroy_session(WT_SESSION_IMPL *session) { - __rec_destroy(session, &session->reconcile); + return (__rec_destroy(session, &session->reconcile)); } /* @@ -2005,8 +2032,8 @@ __wt_bulk_wrapup(WT_SESSION_IMPL *session, WT_CURSOR_BULK *cbulk) __wt_page_modify_set(session, parent); err: - __rec_cleanup(session, r); - __rec_destroy(session, &cbulk->reconcile); + WT_TRET(__rec_cleanup(session, r)); + WT_TRET(__rec_destroy(session, &cbulk->reconcile)); return (ret); } @@ -2428,3 +2455,55 @@ err: __wt_scr_free(session, &tmp); return (ret); } + +/* + * __wt_rec_hs_clear_on_tombstone -- + * When removing a key due to a tombstone with a durable timestamp of "none", also remove the + * history store contents associated with that key. + */ +int +__wt_rec_hs_clear_on_tombstone(WT_SESSION_IMPL *session, WT_RECONCILE *r, wt_timestamp_t ts, + uint64_t recno, WT_ITEM *rowkey, bool reinsert) +{ + WT_BTREE *btree; + WT_ITEM hs_recno_key, *key; + uint8_t hs_recno_key_buf[WT_INTPACK64_MAXSIZE], *p; + + btree = S2BT(session); + + /* We should be passed a recno or a row-store key, but not both. */ + WT_ASSERT(session, (recno == WT_RECNO_OOB) != (rowkey == NULL)); + + if (rowkey != NULL) + key = rowkey; + else { + p = hs_recno_key_buf; + WT_RET(__wt_vpack_uint(&p, 0, recno)); + hs_recno_key.data = hs_recno_key_buf; + hs_recno_key.size = WT_PTRDIFF(p, hs_recno_key_buf); + key = &hs_recno_key; + } + + /* Open a history store cursor if we don't yet have one. */ + if (r->hs_cursor == NULL) + WT_RET(__wt_curhs_open(session, NULL, &r->hs_cursor)); + + /* + * From WT_TS_NONE delete all the history store content of the key. This path will never be + * taken for a mixed-mode deletion being evicted and with a checkpoint that started prior to the + * eviction starting its reconciliation as previous checks done while selecting an update will + * detect that. 
+ */ + WT_RET(__wt_hs_delete_key_from_ts(session, r->hs_cursor, btree->id, key, ts, reinsert, true, + F_ISSET(r, WT_REC_CHECKPOINT_RUNNING))); + + /* Fail 0.01% of the time. */ + if (F_ISSET(r, WT_REC_EVICT) && + __wt_failpoint(session, WT_TIMING_STRESS_FAILPOINT_HISTORY_STORE_DELETE_KEY_FROM_TS, 1)) + return (EBUSY); + + WT_STAT_CONN_INCR(session, cache_hs_key_truncate_onpage_removal); + WT_STAT_DATA_INCR(session, cache_hs_key_truncate_onpage_removal); + + return (0); +} diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index b6058b868c9..03827e3e76d 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c +++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -178,7 +178,7 @@ __wt_session_release_resources(WT_SESSION_IMPL *session) /* Reconciliation cleanup */ if (session->reconcile_cleanup != NULL) - session->reconcile_cleanup(session); + WT_TRET(session->reconcile_cleanup(session)); /* Stashed memory. 
*/ __wt_stash_discard(session); diff --git a/src/third_party/wiredtiger/test/checkpoint/checkpointer.c b/src/third_party/wiredtiger/test/checkpoint/checkpointer.c index 1c8b5135d49..a07604a6520 100644 --- a/src/third_party/wiredtiger/test/checkpoint/checkpointer.c +++ b/src/third_party/wiredtiger/test/checkpoint/checkpointer.c @@ -44,10 +44,10 @@ set_stable(void) char buf[128]; if (g.race_timetamps) - testutil_check(__wt_snprintf( - buf, sizeof(buf), "stable_timestamp=%x,oldest_timestamp=%x", g.ts_stable, g.ts_stable)); + testutil_check(__wt_snprintf(buf, sizeof(buf), + "stable_timestamp=%" PRIx64 ",oldest_timestamp=%" PRIx64, g.ts_stable, g.ts_stable)); else - testutil_check(__wt_snprintf(buf, sizeof(buf), "stable_timestamp=%x", g.ts_stable)); + testutil_check(__wt_snprintf(buf, sizeof(buf), "stable_timestamp=%" PRIx64, g.ts_stable)); testutil_check(g.conn->set_timestamp(g.conn, buf)); } @@ -202,7 +202,9 @@ real_checkpointer(void) verify_ts = stable_ts; else verify_ts = __wt_random(&rnd) % (stable_ts - oldest_ts + 1) + oldest_ts; - WT_ORDERED_READ(g.ts_oldest, g.ts_stable); + __wt_writelock((WT_SESSION_IMPL *)session, &g.clock_lock); + g.ts_oldest = g.ts_stable; + __wt_writeunlock((WT_SESSION_IMPL *)session, &g.clock_lock); } /* Execute a checkpoint */ @@ -225,7 +227,7 @@ real_checkpointer(void) /* Advance the oldest timestamp to the most recently set stable timestamp. */ if (g.use_timestamps && g.ts_oldest != 0) { testutil_check(__wt_snprintf( - timestamp_buf, sizeof(timestamp_buf), "oldest_timestamp=%x", g.ts_oldest)); + timestamp_buf, sizeof(timestamp_buf), "oldest_timestamp=%" PRIx64, g.ts_oldest)); testutil_check(g.conn->set_timestamp(g.conn, timestamp_buf)); } /* Random value between 4 and 8 seconds. 
*/ diff --git a/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.h b/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.h index b3b65c5d828..93f76ec5e1c 100644 --- a/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.h +++ b/src/third_party/wiredtiger/test/checkpoint/test_checkpoint.h @@ -71,8 +71,8 @@ typedef struct { bool hs_checkpoint_timing_stress; /* History store checkpoint timing stress */ bool reserved_txnid_timing_stress; /* Reserved transaction id timing stress */ bool checkpoint_slow_timing_stress; /* Checkpoint slow timing stress */ - u_int ts_oldest; /* Current oldest timestamp */ - u_int ts_stable; /* Current stable timestamp */ + uint64_t ts_oldest; /* Current oldest timestamp */ + uint64_t ts_stable; /* Current stable timestamp */ bool mixed_mode_deletes; /* Run with mixed mode deletes */ bool use_timestamps; /* Use txn timestamps */ bool race_timetamps; /* Async update to oldest timestamp */ diff --git a/src/third_party/wiredtiger/test/checkpoint/workers.c b/src/third_party/wiredtiger/test/checkpoint/workers.c index 3c9313c4c99..5ed22005fe5 100644 --- a/src/third_party/wiredtiger/test/checkpoint/workers.c +++ b/src/third_party/wiredtiger/test/checkpoint/workers.c @@ -369,18 +369,18 @@ real_worker(void) next_rnd = __wt_random(&rnd); if (g.prepare && next_rnd % 2 == 0) { testutil_check(__wt_snprintf( - buf, sizeof(buf), "prepare_timestamp=%x", g.ts_stable + 1)); + buf, sizeof(buf), "prepare_timestamp=%" PRIx64, g.ts_stable + 1)); if ((ret = session->prepare_transaction(session, buf)) != 0) { __wt_readunlock((WT_SESSION_IMPL *)session, &g.clock_lock); (void)log_print_err("real_worker:prepare_transaction", ret, 1); goto err; } testutil_check(__wt_snprintf(buf, sizeof(buf), - "durable_timestamp=%x,commit_timestamp=%x", g.ts_stable + 3, - g.ts_stable + 1)); + "durable_timestamp=%" PRIx64 ",commit_timestamp=%" PRIx64, + g.ts_stable + 3, g.ts_stable + 1)); } else testutil_check(__wt_snprintf( - buf, sizeof(buf), 
"commit_timestamp=%x", g.ts_stable + 1)); + buf, sizeof(buf), "commit_timestamp=%" PRIx64, g.ts_stable + 1)); // Commit majority of times if (next_rnd % 49 != 0) { diff --git a/src/third_party/wiredtiger/test/cppsuite/configs/hs_cleanup_stress.txt b/src/third_party/wiredtiger/test/cppsuite/configs/hs_cleanup_stress.txt index 97de15635c5..ee2e6795df7 100644 --- a/src/third_party/wiredtiger/test/cppsuite/configs/hs_cleanup_stress.txt +++ b/src/third_party/wiredtiger/test/cppsuite/configs/hs_cleanup_stress.txt @@ -18,19 +18,19 @@ runtime_monitor= ( stat_cache_size= ( - enabled=true, + enabled=false, limit=110 ), # The data files compress to around 25MB per table at the end of a run so 250MB total. # +1.4GB for the history store. With an additional 150MB margin. stat_db_size= ( - enabled=true, + enabled=false, limit=1900000000, ), # Seems to insert around 477K records. Give it +-20K margin. # Seems to remove 180K records. Give it a similar margin. - postrun_statistics=[cache_hs_insert:457000:497000, cc_pages_removed:170000:200000] + #postrun_statistics=[cache_hs_insert:457000:497000, cc_pages_removed:170000:200000] ), timestamp_manager= ( diff --git a/src/third_party/wiredtiger/test/cppsuite/tests/run.cxx b/src/third_party/wiredtiger/test/cppsuite/tests/run.cxx index 627921a9aa4..caa0153333c 100755 --- a/src/third_party/wiredtiger/test/cppsuite/tests/run.cxx +++ b/src/third_party/wiredtiger/test/cppsuite/tests/run.cxx @@ -37,7 +37,6 @@ #include "burst_inserts.cxx" #include "example_test.cxx" #include "hs_cleanup.cxx" -#include "search_near_01.cxx" #include "search_near_02.cxx" std::string @@ -121,8 +120,6 @@ run_test(const std::string &test_name, const std::string &config, const std::str hs_cleanup(test_harness::test_args{config, test_name, wt_open_config}).run(); else if (test_name == "burst_inserts") burst_inserts(test_harness::test_args{config, test_name, wt_open_config}).run(); - else if (test_name == "search_near_01") - 
search_near_01(test_harness::test_args{config, test_name, wt_open_config}).run(); else { test_harness::logger::log_msg(LOG_ERROR, "Test not found: " + test_name); error_code = -1; @@ -145,8 +142,8 @@ main(int argc, char *argv[]) { std::string cfg, config_filename, current_cfg, current_test_name, test_name, wt_open_config; int64_t error_code = 0; - const std::vector<std::string> all_tests = {"base_test", "burst_inserts", "example_test", - "hs_cleanup", "search_near_01", "search_near_02"}; + const std::vector<std::string> all_tests = { + "base_test", "burst_inserts", "example_test", "hs_cleanup", "search_near_02"}; /* Set the program name for error messages. */ (void)testutil_set_progname(argv); diff --git a/src/third_party/wiredtiger/test/cppsuite/tests/search_near_01.cxx b/src/third_party/wiredtiger/test/cppsuite/tests/search_near_01.cxx index 20bcebff4a0..8ddc57a57d0 100644 --- a/src/third_party/wiredtiger/test/cppsuite/tests/search_near_01.cxx +++ b/src/third_party/wiredtiger/test/cppsuite/tests/search_near_01.cxx @@ -34,6 +34,8 @@ using namespace test_harness; /* + * Disabled as part of WT-8909. + * * In this test, we want to verify that search_near with prefix enabled only traverses the portion * of the tree that follows the prefix portion of the search key. The test is composed of a populate * phase followed by a read phase. 
The populate phase will insert a set of random generated keys diff --git a/src/third_party/wiredtiger/test/evergreen.yml b/src/third_party/wiredtiger/test/evergreen.yml index 208b1a2d3f7..f870a1f96e7 100755 --- a/src/third_party/wiredtiger/test/evergreen.yml +++ b/src/third_party/wiredtiger/test/evergreen.yml @@ -1050,7 +1050,6 @@ tasks: set -o errexit set -o verbose - ${test_env_vars|} $(pwd)/test/cppsuite/run -t search_near_01 -f test/cppsuite/configs/search_near_01_default.txt -l 2 ${test_env_vars|} $(pwd)/test/cppsuite/run -t search_near_02 -f test/cppsuite/configs/search_near_02_default.txt -l 2 - name: cppsuite-base-test-stress @@ -3456,6 +3455,9 @@ buildvariants: - name: code-statistics display_name: "Code statistics" + # Code coverage and complexity metrics are not required on the mongodb-4.4 branch as + # they are only required on the 'develop' branch. + activate: false batchtime: 10080 # 7 days run_on: - ubuntu2004-test diff --git a/src/third_party/wiredtiger/test/suite/test_hs29.py b/src/third_party/wiredtiger/test/suite/test_hs29.py new file mode 100644 index 00000000000..da81c6a94d3 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_hs29.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python +# +# Public Domain 2014-present MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. 
We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wttest + +# test_hs29.py +# It is possible to end up with 3 opened history store cursors at the same time when the following +# occurs: +# - The reconciliation process opens one history store cursor. +# - The function hs_delete_reinsert_from_pos creates a history store cursor too. This means we need +# an update with an OOO timestamp to trigger that function. +# - The function wt_rec_hs_clear_on_tombstone creates a history store cursor as well. This means we +# need a tombstone to trigger the function, i.e a deleted key. +class test_hs29(wttest.WiredTigerTestCase): + + def test_3_hs_cursors(self): + + # Create a table. + uri = "table:test_hs_cursor" + self.session.create(uri, 'key_format=S,value_format=S') + + # Open one cursor to operate on the table and another one to perform eviction. + cursor = self.session.open_cursor(uri) + cursor2 = self.session.open_cursor(uri, None, "debug=(release_evict=true)") + + # Create two keys and perform an update on each. 
+ self.session.begin_transaction() + cursor['1'] = '1' + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(2)) + + self.session.begin_transaction() + cursor['1'] = '11' + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(3)) + + self.session.begin_transaction() + cursor['2'] = '2' + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(10)) + + self.session.begin_transaction() + cursor['2'] = '22' + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(20)) + + # Perform eviction. + cursor2.set_key('1') + self.assertEqual(cursor2.search(), 0) + self.assertEqual(cursor2.get_value(), '11') + self.assertEqual(cursor2.reset(), 0) + + cursor2.set_key('2') + self.assertEqual(cursor2.search(), 0) + self.assertEqual(cursor2.get_value(), '22') + self.assertEqual(cursor2.reset(), 0) + + # Remove the first key without giving a ts. + self.session.begin_transaction() + cursor.set_key('1') + cursor.remove() + self.session.commit_transaction() + + # Update the second key with out of order timestamp. + self.session.begin_transaction() + cursor['2'] = '222' + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(5)) + + # Close the connection to trigger a final checkpoint and reconciliation. + self.conn.close() + +if __name__ == '__main__': + wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_hs31.py b/src/third_party/wiredtiger/test/suite/test_hs31.py new file mode 100644 index 00000000000..7780a187dc9 --- /dev/null +++ b/src/third_party/wiredtiger/test/suite/test_hs31.py @@ -0,0 +1,182 @@ +#!/usr/bin/env python +# +# Public Domain 2014-present MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. 
+# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wiredtiger, wttest +from wtscenario import make_scenarios +from wiredtiger import stat + +# test_hs31.py +# Ensure that tombstone with out of order timestamp clear the history store records. 
+class test_hs31(wttest.WiredTigerTestCase): + conn_config = 'cache_size=5MB,statistics=(all)' + format_values = [ + ('column', dict(key_format='r', value_format='S')), + # ('column-fix', dict(key_format='r', value_format='8t')), + ('integer-row', dict(key_format='i', value_format='S')), + ('string-row', dict(key_format='S', value_format='S')), + ] + + ooo_values = [ + ('out-of-order', dict(ooo_value=True)), + ('mixed-mode', dict(ooo_value=False)), + ] + + globally_visible_before_ckpt_values = [ + ('globally_visible_before_ckpt', dict(globally_visible_before_ckpt=True)), + ('no_globally_visible_before_ckpt', dict(globally_visible_before_ckpt=False)), + ] + + scenarios = make_scenarios(format_values, ooo_values, globally_visible_before_ckpt_values) + nrows = 1000 + + def create_key(self, i): + if self.key_format == 'S': + return str(i) + return i + + def get_stat(self, stat): + stat_cursor = self.session.open_cursor('statistics:') + val = stat_cursor[stat][2] + stat_cursor.close() + return val + + def test_ooo_tombstone_clear_hs(self): + uri = 'file:test_hs31' + create_params = 'key_format={},value_format={}'.format(self.key_format, self.value_format) + self.session.create(uri, create_params) + + if self.value_format == '8t': + value1 = 97 + value2 = 98 + else: + value1 = 'a' * 500 + value2 = 'b' * 500 + + # Pin oldest and stable to timestamp 1. + self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(1) + + ',stable_timestamp=' + self.timestamp_str(1)) + + # Apply a series of updates from timestamps 10-14. + cursor = self.session.open_cursor(uri) + for ts in range(10, 15): + for i in range(1, self.nrows): + self.session.begin_transaction() + cursor[self.create_key(i)] = value1 + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(ts)) + + # Reconcile and flush versions 10-13 to the history store. + self.session.checkpoint() + + # Evict the data from the cache. 
+ self.session.begin_transaction() + cursor2 = self.session.open_cursor(uri, None, "debug=(release_evict=true)") + for i in range(1, self.nrows): + cursor2.set_key(self.create_key(i)) + cursor2.search() + cursor2.reset() + self.session.rollback_transaction() + + if not self.ooo_value: + self.session.breakpoint() + # Start a long running transaction to stop the oldest id being advanced. + session2 = self.conn.open_session() + session2.begin_transaction() + long_cursor = session2.open_cursor(uri, None) + long_cursor[self.create_key(self.nrows + 10)] = value1 + long_cursor.reset() + long_cursor.close() + + # Remove the key with an ooo or mm timestamp. + for i in range(1, self.nrows): + self.session.begin_transaction() + cursor.set_key(self.create_key(i)) + cursor.remove() + if self.ooo_value: + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(5)) + else: + self.session.commit_transaction() + + if not self.globally_visible_before_ckpt: + # Reconcile to write the stop time window. + self.session.checkpoint() + + if not self.ooo_value: + self.session.breakpoint() + # Ensure that old reader can read the history content. + long_cursor = session2.open_cursor(uri, None) + for i in range(1, self.nrows): + long_cursor.set_key(self.create_key(i)) + self.assertEqual(long_cursor.search(), 0) + self.assertEqual(long_cursor.get_value(), value1) + long_cursor.reset() + long_cursor.close() + + # Rollback the long running transaction. + session2.rollback_transaction() + session2.close() + + # Pin oldest and stable to timestamp 5 so that the ooo tombstone is globally visible. + self.conn.set_timestamp('oldest_timestamp=' + self.timestamp_str(10) + + ',stable_timestamp=' + self.timestamp_str(10)) + + # Reconcile and remove the obsolete entries. + self.session.checkpoint() + + # Evict the data from the cache. 
+ self.session.begin_transaction() + cursor2 = self.session.open_cursor(uri, None, "debug=(release_evict=true)") + for i in range(1, self.nrows): + cursor2.set_key(self.create_key(i)) + if self.value_format == '8t': + self.assertEqual(cursor2.search(), 0) + else: + self.assertEqual(cursor2.search(), wiredtiger.WT_NOTFOUND) + cursor2.reset() + self.session.rollback_transaction() + + # Now apply an insert at timestamp 20. + for i in range(1, self.nrows): + self.session.begin_transaction() + cursor[self.create_key(i)] = value2 + self.session.commit_transaction('commit_timestamp=' + self.timestamp_str(20)) + + # Ensure that we blew away history store content. + for ts in range(10, 15): + self.session.begin_transaction('read_timestamp=' + self.timestamp_str(ts)) + for i in range(1, self.nrows): + cursor.set_key(self.create_key(i)) + if self.value_format == '8t': + self.assertEqual(cursor.search(), 0) + self.assertEqual(cursor.get_value(), 0) + else: + self.assertEqual(cursor.search(), wiredtiger.WT_NOTFOUND) + self.session.rollback_transaction() + + hs_truncate = self.get_stat(stat.conn.cache_hs_key_truncate_onpage_removal) + self.assertGreater(hs_truncate, 0) + |