diff options
Diffstat (limited to 'src/third_party/wiredtiger/src/history/hs.c')
-rw-r--r-- | src/third_party/wiredtiger/src/history/hs.c | 148 |
1 files changed, 49 insertions, 99 deletions
diff --git a/src/third_party/wiredtiger/src/history/hs.c b/src/third_party/wiredtiger/src/history/hs.c index 6d1c63c5fdf..398b871ea57 100644 --- a/src/third_party/wiredtiger/src/history/hs.c +++ b/src/third_party/wiredtiger/src/history/hs.c @@ -202,6 +202,9 @@ __wt_hs_cursor_open(WT_SESSION_IMPL *session) session, ret = __wt_open_cursor(session, WT_HS_URI, NULL, open_cursor_cfg, &cursor)); WT_RET(ret); + /* History store cursors should always ignore tombstones. */ + F_SET(cursor, WT_CURSTD_IGNORE_TOMBSTONE); + session->hs_cursor = cursor; F_SET(session, WT_SESSION_HS_CURSOR); @@ -215,7 +218,6 @@ __wt_hs_cursor_open(WT_SESSION_IMPL *session) int __wt_hs_cursor(WT_SESSION_IMPL *session, uint32_t *session_flags, bool *is_owner) { - /* * We don't want to get tapped for eviction after we start using the history store cursor; save * a copy of the current eviction state, we'll turn eviction off before we return. @@ -288,27 +290,12 @@ int __wt_hs_modify(WT_CURSOR_BTREE *hs_cbt, WT_UPDATE *hs_upd) { WT_DECL_RET; - WT_PAGE_MODIFY *mod; - WT_SESSION_IMPL *session; - WT_UPDATE *last_upd; - - session = CUR2S(hs_cbt); - - /* If there are existing updates, append them after the new updates. */ - if (hs_cbt->compare == 0) { - for (last_upd = hs_upd; last_upd->next != NULL; last_upd = last_upd->next) - ; - if (hs_cbt->ins != NULL) - last_upd->next = hs_cbt->ins->upd; - else if ((mod = hs_cbt->ref->page->modify) != NULL && mod->mod_row_update != NULL) - last_upd->next = mod->mod_row_update[hs_cbt->slot]; - } /* * We don't have exclusive access to the history store page so we need to pass "false" here to * ensure that we're locking when inserting new keys to an insert list. */ - WT_WITH_BTREE(session, CUR2BT(hs_cbt), + WT_WITH_BTREE(CUR2S(hs_cbt), CUR2BT(hs_cbt), ret = __wt_row_modify(hs_cbt, &hs_cbt->iface.key, NULL, hs_upd, WT_UPDATE_INVALID, false)); return (ret); } @@ -518,8 +505,6 @@ __hs_insert_record_with_btree(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BT */ WT_ERR(__wt_cursor_key_order_init((WT_CURSOR_BTREE *)cursor)); #endif - F_SET(cursor, WT_CURSTD_IGNORE_TOMBSTONE); - /* We're pointing at the newly inserted update. Iterate once more to avoid deleting it. */ WT_ERR_NOTFOUND_OK(cursor->next(cursor), true); @@ -536,8 +521,6 @@ __hs_insert_record_with_btree(WT_SESSION_IMPL *session, WT_CURSOR *cursor, WT_BT done: err: - F_CLR(cursor, WT_CURSTD_IGNORE_TOMBSTONE); - /* We did a row search, release the cursor so that the page doesn't continue being held. */ cursor->reset(cursor); @@ -609,7 +592,7 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi) uint32_t i; uint8_t *p; int nentries; - bool squashed, track_prepare, updates_in_hs, updates_older_than_onpage; + bool squashed, track_prepare; uint8_t upd_count; btree = S2BT(session); @@ -664,7 +647,7 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi) __wt_free_update_list(session, &upd); upd = list->onpage_upd; second_older_than_prepare = NULL; - track_prepare = updates_in_hs = updates_older_than_onpage = false; + track_prepare = false; upd_count = 0; /* @@ -724,15 +707,12 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi) } } - if (F_ISSET(upd, WT_UPDATE_HS)) { - updates_in_hs = true; - /* - * If we've reached a full update and its in the history store we don't need to - * continue as anything beyond this point won't help with calculating deltas. - */ - if (upd->type == WT_UPDATE_STANDARD) - break; - } + /* + * If we've reached a full update and its in the history store we don't need to continue + * as anything beyond this point won't help with calculating deltas. + */ + if (upd->type == WT_UPDATE_STANDARD && F_ISSET(upd, WT_UPDATE_HS)) + break; } upd = NULL; @@ -742,16 +722,10 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi) __wt_modify_vector_pop(&modifies, &upd); WT_ASSERT(session, upd->type == WT_UPDATE_STANDARD || upd->type == WT_UPDATE_TOMBSTONE); - /* Skip TOMBSTONE at the end of the update chain. */ if (upd->type == WT_UPDATE_TOMBSTONE) { if (modifies.size > 0) { - /* - * We don't need to delete the history store records if everything is still on the - * insert list and there are no updates moved to the history store by checkpoint or - * a failed eviction. - */ - if ((list->ins == NULL || updates_in_hs) && upd->start_ts == WT_TS_NONE) { + if (upd->start_ts == WT_TS_NONE) { /* We can only delete history store entries that have timestamps. */ WT_ERR(__wt_hs_delete_key_from_ts(session, btree->id, key, 1)); WT_STAT_CONN_INCR(session, cache_hs_key_truncate_mix_ts); @@ -777,7 +751,6 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi) tmp = full_value, full_value = prev_full_value, prev_full_value = tmp, upd = prev_upd) { WT_ASSERT(session, upd->type == WT_UPDATE_STANDARD || upd->type == WT_UPDATE_MODIFY); - updates_older_than_onpage = true; __wt_modify_vector_pop(&modifies, &prev_upd); @@ -803,24 +776,13 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi) stop_time_point.txnid = prev_upd->txnid; } - /* - * Delete the history store records if we detect a mixed mode update. We don't need to - * do that if everything is still on the insert list and there are no updates moved to - * the history store by checkpoint or a failed eviction. - * - * Note that if the update is restored from data store or history store, we may have - * cleared its timestamp, remove the history store contents anyway in this case. - */ - if ((list->ins == NULL || updates_in_hs) && prev_upd->start_ts == WT_TS_NONE && - (upd->start_ts != WT_TS_NONE || - F_ISSET(upd, WT_UPDATE_RESTORED_FROM_DS | WT_UPDATE_RESTORED_FROM_HS))) { - /* We can only delete history store entries that have timestamps. */ - WT_ERR(__wt_hs_delete_key_from_ts(session, btree->id, key, 1)); - WT_STAT_CONN_INCR(session, cache_hs_key_truncate_mix_ts); - } - if (prev_upd->type == WT_UPDATE_TOMBSTONE) { WT_ASSERT(session, modifies.size > 0); + if (prev_upd->start_ts == WT_TS_NONE) { + /* We can only delete history store entries that have timestamps. */ + WT_ERR(__wt_hs_delete_key_from_ts(session, btree->id, key, 1)); + WT_STAT_CONN_INCR(session, cache_hs_key_truncate_mix_ts); + } __wt_modify_vector_pop(&modifies, &prev_upd); WT_ASSERT(session, prev_upd->type == WT_UPDATE_STANDARD); prev_full_value->data = prev_upd->data; @@ -874,24 +836,8 @@ __wt_hs_insert_updates(WT_SESSION_IMPL *session, WT_PAGE *page, WT_MULTI *multi) squashed = true; } - WT_ASSERT(session, - upd->txnid == list->onpage_upd->txnid && upd->start_ts == list->onpage_upd->start_ts); - if (modifies.size > 0) WT_STAT_CONN_INCR(session, cache_hs_write_squash); - - /* - * Delete the history store records if the onpage update's timestamp is WT_TS_NONE and we - * don't see any update older than it. We don't need to do that if everything is still on - * the insert list and there are no updates moved to the history store by checkpoint or a - * failed eviction. - */ - if (!updates_older_than_onpage && (list->ins == NULL || updates_in_hs) && - upd->start_ts == WT_TS_NONE) { - /* We can only delete history store entries that have timestamps. */ - WT_ERR(__wt_hs_delete_key_from_ts(session, btree->id, key, 1)); - WT_STAT_CONN_INCR(session, cache_hs_key_truncate_mix_ts); - } } WT_ERR(__wt_block_manager_named_size(session, WT_HS_FILE, &hs_size)); @@ -984,6 +930,7 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_forma WT_UPDATE_VALUE *upd_value, bool allow_prepare, WT_ITEM *on_disk_buf) { WT_CURSOR *hs_cursor; + WT_CURSOR_BTREE *hs_cbt; WT_DECL_ITEM(hs_value); WT_DECL_ITEM(orig_hs_value_buf); WT_DECL_RET; @@ -1030,6 +977,7 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_forma /* Open a history store table cursor. */ WT_ERR(__wt_hs_cursor(session, &session_flags, &is_owner)); hs_cursor = session->hs_cursor; + hs_cbt = (WT_CURSOR_BTREE *)hs_cursor; /* * After positioning our cursor, we're stepping backwards to find the correct update. Since the @@ -1043,19 +991,37 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_forma ret = 0; goto done; } - WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter)); + for (;; ret = hs_cursor->prev(hs_cursor)) { + WT_ERR_NOTFOUND_OK(ret, true); + /* If we hit the end of the table, let's get out of here. */ + if (ret == WT_NOTFOUND) { + ret = 0; + goto done; + } + WT_ERR(hs_cursor->get_key(hs_cursor, &hs_btree_id, &hs_key, &hs_start_ts, &hs_counter)); - /* Stop before crossing over to the next btree */ - if (hs_btree_id != S2BT(session)->id) - goto done; + /* Stop before crossing over to the next btree */ + if (hs_btree_id != S2BT(session)->id) + goto done; - /* - * Keys are sorted in an order, skip the ones before the desired key, and bail out if we have - * crossed over the desired key and not found the record we are looking for. - */ - WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp)); - if (cmp != 0) - goto done; + /* + * Keys are sorted in an order, skip the ones before the desired key, and bail out if we + * have crossed over the desired key and not found the record we are looking for. + */ + WT_ERR(__wt_compare(session, NULL, &hs_key, key, &cmp)); + if (cmp != 0) + goto done; + + /* + * If the stop time point of a record is visible to us, we won't be able to see anything for + * this entire key. Just jump straight to the end. + */ + if (__wt_txn_tw_stop_visible(session, &hs_cbt->upd_value->tw)) + goto done; + /* If the start time point is visible to us, let's return that record. */ + if (__wt_txn_tw_start_visible(session, &hs_cbt->upd_value->tw)) + break; + } WT_ERR(hs_cursor->get_value( hs_cursor, &hs_stop_durable_ts, &durable_timestamp, &upd_type_full, hs_value)); @@ -1086,7 +1052,6 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_forma * visibility checks when reading in order to construct the modify chain, so we can create * the value we expect. */ - F_SET(session, WT_SESSION_HS_IGNORE_VISIBILITY); while (upd_type == WT_UPDATE_MODIFY) { WT_ERR(__wt_upd_alloc(session, hs_value, upd_type, &mod_upd, NULL)); WT_ERR(__wt_modify_vector_push(&modifies, mod_upd)); @@ -1129,7 +1094,6 @@ __wt_find_hs_upd(WT_SESSION_IMPL *session, WT_ITEM *key, const char *value_forma &upd_type_full, hs_value)); upd_type = (uint8_t)upd_type_full; } - F_CLR(session, WT_SESSION_HS_IGNORE_VISIBILITY); WT_ASSERT(session, upd_type == WT_UPDATE_STANDARD); while (modifies.size > 0) { __wt_modify_vector_pop(&modifies, &mod_upd); @@ -1153,8 +1117,6 @@ skip_buf: done: err: - F_CLR(session, WT_SESSION_HS_IGNORE_VISIBILITY); - if (orig_hs_value_buf != NULL) __wt_scr_free(session, &orig_hs_value_buf); else @@ -1270,20 +1232,10 @@ __wt_hs_delete_key_from_ts( WT_RET(__wt_hs_cursor(session, &session_flags, &is_owner)); - /* - * In order to delete a key range, we need to be able to inspect all history store records - * regardless of their stop time points and the visibility of their values. - */ - F_SET(session->hs_cursor, WT_CURSTD_IGNORE_TOMBSTONE); - F_SET(session, WT_SESSION_HS_IGNORE_VISIBILITY); - /* The tree structure can change while we try to insert the mod list, retry if that happens. */ while ((ret = __hs_delete_key_from_ts_int(session, btree_id, key, ts)) == WT_RESTART) WT_STAT_CONN_INCR(session, cache_hs_insert_restart); - F_CLR(session, WT_SESSION_HS_IGNORE_VISIBILITY); - F_CLR(session->hs_cursor, WT_CURSTD_IGNORE_TOMBSTONE); - WT_TRET(__wt_hs_cursor_close(session, session_flags, is_owner)); return (ret); } @@ -1375,7 +1327,6 @@ __verify_history_store_id(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint32 * visible tombstones in the data table to verify the corresponding entries in the history store * are too present in the data store. */ - F_SET(hs_cursor, WT_CURSTD_IGNORE_TOMBSTONE); F_SET(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE); /* @@ -1426,7 +1377,6 @@ __verify_history_store_id(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, uint32 WT_ERR_NOTFOUND_OK(ret, true); err: F_CLR(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE); - F_CLR(hs_cursor, WT_CURSTD_IGNORE_TOMBSTONE); WT_ASSERT(session, hs_key.mem == NULL && hs_key.memsize == 0); __wt_scr_free(session, &prev_hs_key); return (ret); |