summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/cursor/cur_hs.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/third_party/wiredtiger/src/cursor/cur_hs.c')
-rw-r--r--src/third_party/wiredtiger/src/cursor/cur_hs.c331
1 files changed, 328 insertions, 3 deletions
diff --git a/src/third_party/wiredtiger/src/cursor/cur_hs.c b/src/third_party/wiredtiger/src/cursor/cur_hs.c
index 500b9208b98..923b9941d0e 100644
--- a/src/third_party/wiredtiger/src/cursor/cur_hs.c
+++ b/src/third_party/wiredtiger/src/cursor/cur_hs.c
@@ -152,6 +152,7 @@ __curhs_close(WT_CURSOR *cursor)
WT_CURSOR *file_cursor;
WT_CURSOR_HS *hs_cursor;
WT_DECL_RET;
+ WT_ITEM *datastore_key;
WT_SESSION_IMPL *session;
hs_cursor = (WT_CURSOR_HS *)cursor;
@@ -161,6 +162,8 @@ __curhs_close(WT_CURSOR *cursor)
err:
if (file_cursor != NULL)
WT_TRET(file_cursor->close(file_cursor));
+ datastore_key = &hs_cursor->datastore_key;
+ __wt_scr_free(session, &datastore_key);
__wt_cursor_close(cursor);
API_END_RET(session, ret);
@@ -185,6 +188,10 @@ __curhs_reset(WT_CURSOR *cursor)
ret = file_cursor->reset(file_cursor);
F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
WT_TIME_WINDOW_INIT(&hs_cursor->time_window);
+ hs_cursor->btree_id = 0;
+ hs_cursor->datastore_key.data = NULL;
+ hs_cursor->datastore_key.size = 0;
+ hs_cursor->flags = 0;
err:
API_END_RET(session, ret);
@@ -199,15 +206,327 @@ __curhs_set_key(WT_CURSOR *cursor, ...)
{
WT_CURSOR *file_cursor;
WT_CURSOR_HS *hs_cursor;
+ WT_ITEM *datastore_key;
+ WT_SESSION_IMPL *session;
+ wt_timestamp_t start_ts;
+ uint64_t counter;
+ uint32_t arg_count;
va_list ap;
hs_cursor = (WT_CURSOR_HS *)cursor;
file_cursor = hs_cursor->file_cursor;
+ session = CUR2S(cursor);
+ start_ts = WT_TS_NONE;
+ counter = 0;
va_start(ap, cursor);
- file_cursor->set_key(file_cursor, va_arg(ap, uint32_t), va_arg(ap, WT_ITEM *),
- va_arg(ap, wt_timestamp_t), va_arg(ap, uint64_t));
+ arg_count = va_arg(ap, uint32_t);
+
+ WT_ASSERT(session, arg_count >= 1 && arg_count <= 4);
+
+ hs_cursor->btree_id = va_arg(ap, uint32_t);
+ F_SET(hs_cursor, WT_HS_CUR_BTREE_ID_SET);
+ if (arg_count > 1) {
+ datastore_key = va_arg(ap, WT_ITEM *);
+ WT_IGNORE_RET(__wt_buf_set(
+ session, &hs_cursor->datastore_key, datastore_key->data, datastore_key->size));
+ F_SET(hs_cursor, WT_HS_CUR_KEY_SET);
+ } else {
+ hs_cursor->datastore_key.data = NULL;
+ hs_cursor->datastore_key.size = 0;
+ F_CLR(hs_cursor, WT_HS_CUR_KEY_SET);
+ }
+
+ if (arg_count > 2) {
+ start_ts = va_arg(ap, wt_timestamp_t);
+ F_SET(hs_cursor, WT_HS_CUR_TS_SET);
+ } else
+ F_CLR(hs_cursor, WT_HS_CUR_TS_SET);
+
+ if (arg_count > 3) {
+ counter = va_arg(ap, uint64_t);
+ F_SET(hs_cursor, WT_HS_CUR_COUNTER_SET);
+ } else
+ F_CLR(hs_cursor, WT_HS_CUR_COUNTER_SET);
+
va_end(ap);
+
+ file_cursor->set_key(
+ file_cursor, hs_cursor->btree_id, &hs_cursor->datastore_key, start_ts, counter);
+}
+
+/*
+ * __curhs_prev_visible --
+ * Check the visibility of the current history store record. If it is not visible, find the
+ * previous visible history store record.
+ */
+static int
+__curhs_prev_visible(WT_SESSION_IMPL *session, WT_CURSOR_HS *hs_cursor)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR *std_cursor;
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_ITEM(datastore_key);
+ WT_DECL_RET;
+ wt_timestamp_t start_ts;
+ uint64_t counter;
+ uint32_t btree_id;
+ int cmp;
+
+ file_cursor = hs_cursor->file_cursor;
+ std_cursor = (WT_CURSOR *)hs_cursor;
+ cbt = (WT_CURSOR_BTREE *)file_cursor;
+
+ WT_ERR(__wt_scr_alloc(session, 0, &datastore_key));
+
+ for (; ret == 0; ret = __wt_hs_cursor_prev(session, file_cursor)) {
+ WT_ERR(file_cursor->get_key(file_cursor, &btree_id, &datastore_key, &start_ts, &counter));
+
+ /* Stop before crossing over to the next btree. */
+ if (F_ISSET(hs_cursor, WT_HS_CUR_BTREE_ID_SET) && btree_id != hs_cursor->btree_id) {
+ ret = WT_NOTFOUND;
+ goto done;
+ }
+
+ /*
+ * Keys are sorted in an order, skip the ones before the desired key, and bail out if we
+ * have crossed over the desired key and not found the record we are looking for.
+ */
+ if (F_ISSET(hs_cursor, WT_HS_CUR_KEY_SET)) {
+ WT_ERR(__wt_compare(session, NULL, datastore_key, &hs_cursor->datastore_key, &cmp));
+ if (cmp != 0) {
+ ret = WT_NOTFOUND;
+ goto done;
+ }
+ }
+
+ /*
+ * If the stop time pair on the tombstone in the history store is already globally visible
+ * we can skip it.
+ */
+ if (__wt_txn_tw_stop_visible_all(session, &cbt->upd_value->tw)) {
+ WT_STAT_CONN_INCR(session, cursor_prev_hs_tombstone);
+ WT_STAT_DATA_INCR(session, cursor_prev_hs_tombstone);
+ continue;
+ }
+
+ /*
+ * Don't check the visibility of the record if we want to read any history store record that
+ * is not obsolete.
+ */
+ if (F_ISSET(std_cursor, WT_CURSTD_HS_READ_COMMITTED))
+ break;
+
+ if (__wt_txn_tw_stop_visible(session, &cbt->upd_value->tw)) {
+ /*
+ * If the stop time point of a record is visible to us, we won't be able to see anything
+ * for this entire key.
+ */
+ if (F_ISSET(hs_cursor, WT_HS_CUR_KEY_SET)) {
+ ret = WT_NOTFOUND;
+ goto done;
+ } else
+ continue;
+ }
+
+ /* If the start time point is visible to us, let's return that record. */
+ if (__wt_txn_tw_start_visible(session, &cbt->upd_value->tw))
+ break;
+ }
+
+done:
+err:
+ __wt_scr_free(session, &datastore_key);
+ return (ret);
+}
+
+/*
+ * __curhs_next_visible --
+ * Check the visibility of the current history store record. If it is not visible, find the next
+ * visible history store record.
+ */
+static int
+__curhs_next_visible(WT_SESSION_IMPL *session, WT_CURSOR_HS *hs_cursor)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR *std_cursor;
+ WT_CURSOR_BTREE *cbt;
+ WT_DECL_ITEM(datastore_key);
+ WT_DECL_RET;
+ wt_timestamp_t start_ts;
+ uint64_t counter;
+ uint32_t btree_id;
+ int cmp;
+
+ file_cursor = hs_cursor->file_cursor;
+ std_cursor = (WT_CURSOR *)hs_cursor;
+ cbt = (WT_CURSOR_BTREE *)file_cursor;
+
+ WT_ERR(__wt_scr_alloc(session, 0, &datastore_key));
+
+ for (; ret == 0; ret = __wt_hs_cursor_next(session, file_cursor)) {
+ WT_ERR(file_cursor->get_key(file_cursor, &btree_id, &datastore_key, &start_ts, &counter));
+
+ /* Stop before crossing over to the next btree. */
+ if (F_ISSET(hs_cursor, WT_HS_CUR_BTREE_ID_SET) && btree_id != hs_cursor->btree_id) {
+ ret = WT_NOTFOUND;
+ goto done;
+ }
+
+ /*
+ * Keys are sorted in an order, skip the ones before the desired key, and bail out if we
+ * have crossed over the desired key and not found the record we are looking for.
+ */
+ if (F_ISSET(hs_cursor, WT_HS_CUR_KEY_SET)) {
+ WT_ERR(__wt_compare(session, NULL, datastore_key, &hs_cursor->datastore_key, &cmp));
+ if (cmp != 0) {
+ ret = WT_NOTFOUND;
+ goto done;
+ }
+ }
+
+ /*
+ * If the stop time pair on the tombstone in the history store is already globally visible
+ * we can skip it.
+ */
+ if (__wt_txn_tw_stop_visible_all(session, &cbt->upd_value->tw)) {
+ WT_STAT_CONN_INCR(session, cursor_next_hs_tombstone);
+ WT_STAT_DATA_INCR(session, cursor_next_hs_tombstone);
+ continue;
+ }
+
+ /*
+ * Don't check the visibility of the record if we want to read any history store record that
+ * is not obsolete.
+ */
+ if (F_ISSET(std_cursor, WT_CURSTD_HS_READ_COMMITTED))
+ break;
+
+ /*
+ * If the stop time point of a record is visible to us, check the next one.
+ */
+ if (__wt_txn_tw_stop_visible(session, &cbt->upd_value->tw))
+ continue;
+
+ /* If the start time point is visible to us, let's return that record. */
+ if (__wt_txn_tw_start_visible(session, &cbt->upd_value->tw))
+ break;
+ }
+
+done:
+err:
+ __wt_scr_free(session, &datastore_key);
+ return (ret);
+}
+
+/*
+ * __curhs_search_near --
+ * WT_CURSOR->search_near method for the hs cursor type.
+ */
+static int
+__curhs_search_near(WT_CURSOR *cursor, int *exactp)
+{
+ WT_CURSOR *file_cursor;
+ WT_CURSOR_HS *hs_cursor;
+ WT_DECL_ITEM(srch_key);
+ WT_DECL_RET;
+ WT_SESSION_IMPL *session;
+ int cmp;
+ int exact;
+
+ hs_cursor = (WT_CURSOR_HS *)cursor;
+ file_cursor = hs_cursor->file_cursor;
+ *exactp = 0;
+ cmp = 0;
+
+ CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, search_near, CUR2BT(file_cursor));
+
+ WT_ERR(__wt_scr_alloc(session, 0, &srch_key));
+ /* At least we have the btree id set. */
+ WT_ASSERT(session, F_ISSET(hs_cursor, WT_HS_CUR_BTREE_ID_SET));
+ WT_ERR(__wt_buf_set(session, srch_key, file_cursor->key.data, file_cursor->key.size));
+ WT_ERR_NOTFOUND_OK(__wt_hs_cursor_search_near(session, file_cursor, &exact), true);
+
+ /* Empty history store is fine. */
+ if (ret == WT_NOTFOUND)
+ goto done;
+
+ /*
+ * There are some key fields missing so we are searching a range of keys. Place the cursor at
+ * the start of the range.
+ */
+ if (!F_ISSET(hs_cursor, WT_HS_CUR_COUNTER_SET)) {
+ /*
+ * If we raced with a history store insert, we may be two or more records away from our
+ * target. Keep iterating forwards until we are on or past our target key.
+ *
+ * We can't use the cursor positioning helper that we use for regular reads since that will
+ * place us at the end of a particular key/timestamp range whereas we want to be placed at
+ * the beginning.
+ */
+ if (exact < 0) {
+ while ((ret = __wt_hs_cursor_next(session, file_cursor)) == 0) {
+ WT_ERR(__wt_compare(session, NULL, &file_cursor->key, srch_key, &cmp));
+ if (cmp >= 0)
+ break;
+ }
+ /* No entries greater than or equal to the key we searched for. */
+ WT_ERR_NOTFOUND_OK(ret, true);
+ if (ret == WT_NOTFOUND)
+ goto done;
+
+ *exactp = cmp;
+ } else
+ *exactp = 1;
+
+ WT_ERR(__curhs_next_visible(session, hs_cursor));
+ }
+ /* Search the closest match that is smaller or equal to the search key. */
+ else {
+ /*
+ * Because of the special visibility rules for the history store, a new key can appear in
+ * between our search and the set of updates that we're interested in. Keep trying until we
+ * find it.
+ *
+ * There may be no history store entries for the given btree id and record key if they have
+ * been removed by rollback to stable.
+ *
+ * Note that we need to compare the raw key off the cursor to determine where we are in the
+ * history store as opposed to comparing the embedded data store key since the ordering is
+ * not guaranteed to be the same.
+ */
+ if (exact > 0) {
+ /*
+ * It's possible that we may race with a history store insert for another key. So we may
+ * be more than one record away the end of our target key/timestamp range. Keep
+ * iterating backwards until we land on our key.
+ */
+ while ((ret = file_cursor->prev(cursor)) == 0) {
+ WT_STAT_CONN_INCR(session, cursor_skip_hs_cur_position);
+ WT_STAT_DATA_INCR(session, cursor_skip_hs_cur_position);
+
+ WT_ERR(__wt_compare(session, NULL, &file_cursor->key, srch_key, &cmp));
+ if (cmp <= 0)
+ break;
+ }
+
+ *exactp = cmp;
+ } else
+ *exactp = -1;
+#ifdef HAVE_DIAGNOSTIC
+ if (ret == 0) {
+ WT_ERR(__wt_compare(session, NULL, &file_cursor->key, srch_key, &cmp));
+ WT_ASSERT(session, cmp <= 0);
+ }
+#endif
+
+ WT_ERR(__curhs_prev_visible(session, hs_cursor));
+ }
+
+done:
+err:
+ __wt_scr_free(session, &srch_key);
+ API_END_RET(session, ret);
}
/*
@@ -356,7 +675,7 @@ __wt_curhs_open(WT_SESSION_IMPL *session, WT_CURSOR *owner, WT_CURSOR **cursorp)
__wt_cursor_notsup, /* prev */
__curhs_reset, /* reset */
__wt_cursor_notsup, /* search */
- __wt_cursor_search_near_notsup, /* search-near */
+ __curhs_search_near, /* search-near */
__curhs_insert, /* insert */
__wt_cursor_modify_value_format_notsup, /* modify */
__wt_cursor_notsup, /* update */
@@ -369,6 +688,7 @@ __wt_curhs_open(WT_SESSION_IMPL *session, WT_CURSOR *owner, WT_CURSOR **cursorp)
WT_CURSOR *cursor;
WT_CURSOR_HS *hs_cursor;
WT_DECL_RET;
+ WT_ITEM *datastore_key;
WT_RET(__wt_calloc_one(session, &hs_cursor));
cursor = (WT_CURSOR *)hs_cursor;
@@ -381,6 +701,11 @@ __wt_curhs_open(WT_SESSION_IMPL *session, WT_CURSOR *owner, WT_CURSOR **cursorp)
WT_ERR(__hs_cursor_open_int(session, &hs_cursor->file_cursor));
WT_ERR(__wt_cursor_init(cursor, WT_HS_URI, owner, NULL, cursorp));
+ WT_TIME_WINDOW_INIT(&hs_cursor->time_window);
+ hs_cursor->btree_id = 0;
+ datastore_key = &hs_cursor->datastore_key;
+ WT_ERR(__wt_scr_alloc(session, 0, &datastore_key));
+ hs_cursor->flags = 0;
WT_TIME_WINDOW_INIT(&hs_cursor->time_window);