diff options
author | Luke Chen <luke.chen@mongodb.com> | 2020-06-12 16:47:32 +1000 |
---|---|---|
committer | Evergreen Agent <no-reply@evergreen.mongodb.com> | 2020-06-12 07:00:55 +0000 |
commit | 4bd561c80d376130cf851fed23d1261ac36179fc (patch) | |
tree | 06114c2cd9687df8b66fde29528f3651f47be7b6 /src/third_party | |
parent | 4c86696d09d6d53af02452f7a557a9c40eddebfe (diff) | |
download | mongo-4bd561c80d376130cf851fed23d1261ac36179fc.tar.gz |
Import wiredtiger: f650b1124b18cb4bccd61ca822ed19157206cc7e from branch mongodb-4.4
ref: cd0eca9e68..f650b1124b
for: 4.5.1
WT-6347 Clear history store contents for mixed mode updates
Diffstat (limited to 'src/third_party')
-rw-r--r-- | src/third_party/wiredtiger/import.data | 2 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/history/hs.c | 156 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/include/btmem.h | 11 | ||||
-rw-r--r-- | src/third_party/wiredtiger/src/reconcile/rec_visibility.c | 3 | ||||
-rw-r--r-- | src/third_party/wiredtiger/test/suite/test_hs11.py | 29 |
5 files changed, 135 insertions, 66 deletions
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index e62088151eb..c657e8107df 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -2,5 +2,5 @@ "vendor": "wiredtiger", "github": "wiredtiger/wiredtiger.git", "branch": "mongodb-4.4", - "commit": "cd0eca9e685e9ac2446e8e6d5b398b366a7ba420" + "commit": "f650b1124b18cb4bccd61ca822ed19157206cc7e" } diff --git a/src/third_party/wiredtiger/src/history/hs.c b/src/third_party/wiredtiger/src/history/hs.c index b295ac730b4..8dd46f6889e 100644 --- a/src/third_party/wiredtiger/src/history/hs.c +++ b/src/third_party/wiredtiger/src/history/hs.c @@ -475,7 +475,7 @@ err: static int __hs_insert_record_with_btree(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BTREE *btree, const WT_ITEM *key, const WT_UPDATE *upd, const uint8_t type, const WT_ITEM *hs_value, - WT_HS_TIME_POINT *stop_time_point) + WT_HS_TIME_POINT *stop_time_point, bool clear_hs) { WT_DECL_RET; @@ -500,38 +500,27 @@ __hs_insert_record_with_btree(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BT */ WT_ASSERT(session, type == WT_UPDATE_STANDARD || type == WT_UPDATE_MODIFY); - /* - * If the time points are out of order (which can happen if the application performs updates - * with out-of-order timestamps), so this value can never be seen, don't bother inserting it. - */ - if (stop_time_point->ts < upd->start_ts || - (stop_time_point->ts == upd->start_ts && stop_time_point->txnid <= upd->txnid)) { - char ts_string[2][WT_TS_INT_STRING_SIZE]; - __wt_verbose(session, WT_VERB_TIMESTAMP, - "Warning: fixing out-of-order timestamps %s earlier than previous update %s", - __wt_timestamp_to_string(stop_time_point->ts, ts_string[0]), - __wt_timestamp_to_string(upd->start_ts, ts_string[1])); - return (0); - } - /* The tree structure can change while we try to insert the mod list, retry if that happens. */ while ((ret = __hs_insert_record_with_btree_int( session, cursor, btree, key, upd, type, hs_value, stop_time_point)) == WT_RESTART) WT_STAT_CONN_INCR(session, cache_hs_insert_restart); WT_ERR(ret); - /* If we inserted a timestamped update, we don't need to delete any history store records. */ - if (upd->start_ts != WT_TS_NONE) + /* Done if we don't need to clear the history store content. */ + if (!clear_hs) goto done; /* - * If we inserted an update with no timestamp, we need to delete all history records for that + * We can only insert update without timestamp into the history store if we need to clear the + * history store record. + */ + WT_ASSERT(session, upd->start_ts == WT_TS_NONE); + + /* + * If we need to clear the history store content, we need to delete all history records for that * key that are further in the history table than us (the key is lexicographically greater). For * timestamped tables that are occasionally getting a non-timestamped update, that means that - * all timestamped updates should get removed. In the case of non-timestamped tables, that means - * that all updates with higher transaction ids will get removed (which could happen at some - * more relaxed isolation levels). We're pointing at the newly inserted update, iterate once - * more to avoid deleting it. + * all timestamped updates should get removed. */ WT_ERR_NOTFOUND_OK(cursor->next(cursor), true); @@ -540,7 +529,6 @@ __hs_insert_record_with_btree(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BT ret = 0; goto done; } - while ((ret = __hs_delete_key_from_pos(session, cursor, btree->id, key)) == WT_RESTART) WT_STAT_CONN_INCR(session, cache_hs_key_truncate_mix_ts_restart); WT_ERR(ret); @@ -561,14 +549,14 @@ err: static int __hs_insert_record(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BTREE *btree, const WT_ITEM *key, const WT_UPDATE *upd, const uint8_t type, const WT_ITEM *hs_value, - WT_HS_TIME_POINT *stop_time_point) + WT_HS_TIME_POINT *stop_time_point, bool clear_hs) { WT_CURSOR_BTREE *cbt; WT_DECL_RET; cbt = (WT_CURSOR_BTREE *)cursor; WT_WITH_BTREE(session, CUR2BT(cbt), ret = __hs_insert_record_with_btree(session, cursor, btree, - key, upd, type, hs_value, stop_time_point)); + key, upd, type, hs_value, stop_time_point, clear_hs)); return (ret); } @@ -578,19 +566,12 @@ __hs_insert_record(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BTREE *btree, */ static inline int __hs_next_upd_full_value(WT_SESSION_IMPL *session, WT_MODIFY_VECTOR *modifies, - WT_ITEM *older_full_value, uint32_t btree_id, const WT_ITEM *key, WT_ITEM *full_value, - WT_UPDATE **updp) + WT_ITEM *older_full_value, WT_ITEM *full_value, WT_UPDATE **updp) { WT_UPDATE *upd; *updp = NULL; __wt_modify_vector_pop(modifies, &upd); if (upd->type == WT_UPDATE_TOMBSTONE) { - if (upd->start_ts == WT_TS_NONE) { - /* We can only delete history store entries that have timestamps. */ - WT_RET(__wt_hs_delete_key_from_ts(session, btree_id, key, 1)); - WT_STAT_CONN_INCR(session, cache_hs_key_truncate_mix_ts); - } - if (modifies->size == 0) { WT_ASSERT(session, older_full_value == NULL); *updp = upd; @@ -634,14 +615,14 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi) WT_MODIFY entries[MAX_REVERSE_MODIFY_NUM]; WT_MODIFY_VECTOR modifies; WT_SAVE_UPD *list; - WT_UPDATE *prev_upd, *upd; + WT_UPDATE *first_non_ts_upd, *oldest_upd, *prev_upd, *upd; WT_HS_TIME_POINT stop_time_point; wt_off_t hs_size; uint64_t insert_cnt, max_hs_size; uint32_t i; uint8_t *p; int nentries; - bool enable_reverse_modify, squashed; + bool clear_hs, enable_reverse_modify, squashed, ts_updates_in_hs; btree = S2BT(session); cursor = session->hs_cursor; @@ -694,6 +675,9 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi) session, btree, upd = __wt_update_obsolete_check(session, page, list->onpage_upd, true)); __wt_free_update_list(session, &upd); upd = list->onpage_upd; + + first_non_ts_upd = NULL; + ts_updates_in_hs = false; enable_reverse_modify = true; /* @@ -745,6 +729,19 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi) if (prev_upd != NULL && prev_upd->start_ts < upd->start_ts) enable_reverse_modify = false; + /* Find the first update without timestamp. */ + if (first_non_ts_upd == NULL && upd->start_ts == WT_TS_NONE) { + first_non_ts_upd = upd; + } else if (first_non_ts_upd != NULL && upd->start_ts != WT_TS_NONE) { + /* + * Don't insert updates with timestamps after updates without timestamps to the + * history store. + */ + F_SET(upd, WT_UPDATE_MASKED_BY_NON_TS_UPDATE); + if (F_ISSET(upd, WT_UPDATE_HS)) + ts_updates_in_hs = true; + } + /* * If we've reached a full update and it's in the history store we don't need to * continue as anything beyond this point won't help with calculating deltas. @@ -758,14 +755,35 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi) /* Construct the oldest full update. */ WT_ASSERT(session, modifies.size > 0); -#ifdef HAVE_DIAGNOSTIC - __wt_modify_vector_peek(&modifies, &upd); + __wt_modify_vector_peek(&modifies, &oldest_upd); - WT_ASSERT(session, upd->type == WT_UPDATE_STANDARD || upd->type == WT_UPDATE_TOMBSTONE); -#endif + WT_ASSERT(session, + oldest_upd->type == WT_UPDATE_STANDARD || oldest_upd->type == WT_UPDATE_TOMBSTONE); - WT_ERR( - __hs_next_upd_full_value(session, &modifies, NULL, btree->id, key, full_value, &upd)); + /* + * Clear the history store here if the oldest update is a tombstone and it is the first + * update without timestamp on the update chain because we don't have the cursor placed at + * the correct place to delete the history store records when inserting the first update and + * it may be skipped if there is nothing to insert to the history store. + */ + if (oldest_upd->type == WT_UPDATE_TOMBSTONE && oldest_upd == first_non_ts_upd) { + /* We can only delete history store entries that have timestamps. */ + WT_ERR(__wt_hs_delete_key_from_ts(session, btree->id, key, 1)); + WT_STAT_CONN_INCR(session, cache_hs_key_truncate_mix_ts); + clear_hs = false; + } else + /* + * Clear the content with timestamps in the history store if we see updates without + * timestamps on the update chain. + * + * We don't need to clear the history store records if everything is still on the insert + * list and there are no updates moved to the history store by checkpoint or a failed + * eviction. + */ + clear_hs = first_non_ts_upd != NULL && !F_ISSET(first_non_ts_upd, WT_UPDATE_HS) && + (list->ins == NULL || ts_updates_in_hs); + + WT_ERR(__hs_next_upd_full_value(session, &modifies, NULL, full_value, &upd)); squashed = false; @@ -804,8 +822,8 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi) stop_time_point.txnid = prev_upd->txnid; } - WT_ERR(__hs_next_upd_full_value( - session, &modifies, full_value, btree->id, key, prev_full_value, &prev_upd)); + WT_ERR( + __hs_next_upd_full_value(session, &modifies, full_value, prev_full_value, &prev_upd)); /* Squash the updates from the same transaction. */ if (upd->start_ts == prev_upd->start_ts && upd->txnid == prev_upd->txnid) { @@ -813,22 +831,42 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi) continue; } - if (F_ISSET(upd, WT_UPDATE_HS)) + /* Skip updates already in the history store or masked by updates without timestamps. */ + if (F_ISSET(upd, WT_UPDATE_HS | WT_UPDATE_MASKED_BY_NON_TS_UPDATE)) continue; - /* Calculate reverse modify. */ + /* + * If the time points are out of order (which can happen if the application performs + * updates with out-of-order timestamps), so this value can never be seen, don't bother + * inserting it. + */ + if (stop_time_point.ts < upd->start_ts || + (stop_time_point.ts == upd->start_ts && stop_time_point.txnid <= upd->txnid)) { + char ts_string[2][WT_TS_INT_STRING_SIZE]; + __wt_verbose(session, WT_VERB_TIMESTAMP, + "Warning: fixing out-of-order timestamps %s earlier than previous update %s", + __wt_timestamp_to_string(stop_time_point.ts, ts_string[0]), + __wt_timestamp_to_string(upd->start_ts, ts_string[1])); + continue; + } + + /* + * Calculate reverse modify and clear the history store records with timestamps when + * inserting the first update. + */ nentries = MAX_REVERSE_MODIFY_NUM; if (upd->type == WT_UPDATE_MODIFY && enable_reverse_modify && __wt_calc_modify(session, prev_full_value, full_value, prev_full_value->size / 10, entries, &nentries) == 0) { WT_ERR(__wt_modify_pack(cursor, entries, nentries, &modify_value)); WT_ERR(__hs_insert_record(session, cursor, btree, key, upd, WT_UPDATE_MODIFY, - modify_value, &stop_time_point)); + modify_value, &stop_time_point, clear_hs)); __wt_scr_free(session, &modify_value); } else WT_ERR(__hs_insert_record(session, cursor, btree, key, upd, WT_UPDATE_STANDARD, - full_value, &stop_time_point)); + full_value, &stop_time_point, clear_hs)); + clear_hs = false; /* Flag the update as now in the history store. */ F_SET(upd, WT_UPDATE_HS); ++insert_cnt; @@ -840,6 +878,28 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi) if (modifies.size > 0) WT_STAT_CONN_INCR(session, cache_hs_write_squash); + + /* + * We need to clear the history store if we haven't inserted anything into the history store + * and there are updates without timestamps in the middle of the update chain. + * + * e.g., U@10 -> T@0 -> U@5. + * + * But we don't need to clear the history store if we write an update without timestamp to + * the data store because we don't insert any update with timestamp to the history store and + * we will clear the history store again once that update is moved to the history store. + * + * e.g., U@0 -> U@10 -> U@5 and U@1 in the history store. U@10 and U@5 are not inserted to + * the history store as they are flagged as WT_UPDATE_MASKED_BY_NON_TS_UPDATE and U@1 is not + * removed from the history store. U@1 will be removed from the history store once U@0 is + * moved to the history store. + */ + if (clear_hs && (first_non_ts_upd->txnid != list->onpage_upd->txnid || + first_non_ts_upd->start_ts != list->onpage_upd->start_ts)) { + /* We can only delete history store entries that have timestamps. */ + WT_ERR(__wt_hs_delete_key_from_ts(session, btree->id, key, 1)); + WT_STAT_CONN_INCR(session, cache_hs_key_truncate_mix_ts); + } } WT_ERR(__wt_block_manager_named_size(session, WT_HS_FILE, &hs_size)); @@ -1202,6 +1262,8 @@ __hs_delete_key_from_ts_int( WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp)); if (cmp != 0) goto done; + + WT_ASSERT(session, ts == WT_TS_NONE || hs_start_ts != WT_TS_NONE); WT_ERR(__hs_delete_key_from_pos(session, hs_cursor, btree_id, key)); done: ret = 0; diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index c9711020d89..c79316217dc 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -1073,11 +1073,12 @@ struct __wt_update { volatile uint8_t prepare_state; /* prepare state */ /* AUTOMATIC FLAG VALUE GENERATION START */ -#define WT_UPDATE_HS 0x1u /* Update has been written to history store. */ -#define WT_UPDATE_PREPARE_RESTORED_FROM_DS 0x2u /* Prepared update restored from data store. */ -#define WT_UPDATE_RESTORED_FROM_DS 0x4u /* Update restored from data store. */ -#define WT_UPDATE_RESTORED_FROM_HS 0x8u /* Update restored from history store. */ - /* AUTOMATIC FLAG VALUE GENERATION STOP */ +#define WT_UPDATE_HS 0x01u /* Update has been written to history store. */ +#define WT_UPDATE_MASKED_BY_NON_TS_UPDATE 0x02u /* Update masked by updates without timestamp. */ +#define WT_UPDATE_PREPARE_RESTORED_FROM_DS 0x04u /* Prepared update restored from data store. */ +#define WT_UPDATE_RESTORED_FROM_DS 0x08u /* Update restored from data store. */ +#define WT_UPDATE_RESTORED_FROM_HS 0x10u /* Update restored from history store. */ + /* AUTOMATIC FLAG VALUE GENERATION STOP */ uint8_t flags; /* diff --git a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c index 56d3f711f10..197a3a920cb 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_visibility.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_visibility.c @@ -140,9 +140,6 @@ __rec_append_orig_value( * timestamped globally visible tombstone because even if its timestamp is smaller than * the entries in the history store, we can't change the history store entries. This is * not correct but we hope we can get away with it. - * - * FIXME-WT-6171: remove this once we get rid of out of order timestamps and mixed mode - * transactions. */ if (unpack->tw.durable_stop_ts != WT_TS_NONE && tombstone_globally_visible) return (0); diff --git a/src/third_party/wiredtiger/test/suite/test_hs11.py b/src/third_party/wiredtiger/test/suite/test_hs11.py index efc9d02401c..eaa991557ee 100644 --- a/src/third_party/wiredtiger/test/suite/test_hs11.py +++ b/src/third_party/wiredtiger/test/suite/test_hs11.py @@ -27,18 +27,22 @@ # OTHER DEALINGS IN THE SOFTWARE. import wiredtiger, wttest +from wtscenario import make_scenarios def timestamp_str(t): return '%x' % t # test_hs11.py -# Ensure that when we delete a key due to a tombstone being globally visible, we delete its -# associated history store content. +# Ensure that updates without timestamps clear the history store records. class test_hs11(wttest.WiredTigerTestCase): conn_config = 'cache_size=50MB' session_config = 'isolation=snapshot' + scenarios = make_scenarios([ + ('deletion', dict(update_type='deletion')), + ('update', dict(update_type='update')), + ]) - def test_key_deletion_clears_hs(self): + def test_non_ts_updates_clears_hs(self): uri = 'table:test_hs11' create_params = 'key_format=S,value_format=S' self.session.create(uri, create_params) @@ -58,14 +62,16 @@ class test_hs11(wttest.WiredTigerTestCase): # Reconcile and flush versions 1-3 to the history store. self.session.checkpoint() - # Apply a non-timestamped tombstone. When the pages get evicted, the keys will get deleted - # since the tombstone is globally visible. + # Apply an update without timestamp. for i in range(1, 10000): if i % 2 == 0: - cursor.set_key(str(i)) - cursor.remove() + if self.update_type == 'deletion': + cursor.set_key(str(i)) + cursor.remove() + else: + cursor[str(i)] = value2 - # Now apply an update at timestamp 10 to recreate each key. + # Now apply an update at timestamp 10. for i in range(1, 10000): self.session.begin_transaction() cursor[str(i)] = value2 @@ -76,8 +82,11 @@ class test_hs11(wttest.WiredTigerTestCase): self.session.begin_transaction('read_timestamp=' + timestamp_str(ts)) for i in range(1, 10000): if i % 2 == 0: - cursor.set_key(str(i)) - self.assertEqual(cursor.search(), wiredtiger.WT_NOTFOUND) + if self.update_type == 'deletion': + cursor.set_key(str(i)) + self.assertEqual(cursor.search(), wiredtiger.WT_NOTFOUND) + else: + self.assertEqual(cursor[str(i)], value2) else: self.assertEqual(cursor[str(i)], value1) self.session.rollback_transaction() |