/*-
 * Copyright (c) 2014-present MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * Walking backwards through skip lists.
 *
 * The skip list stack is an array of pointers set up by a search. It points to the position a node
 * should go in the skip list. In other words, the skip list search stack always points *after* the
 * search item (that is, into the search item's next array).
 *
 * Helper macros to go from a stack pointer at level i, pointing into a next array, back to the
 * insert node containing that next array.
 */
#undef PREV_ITEM
#define PREV_ITEM(ins_head, insp, i)                      \
    (((insp) == &(ins_head)->head[i] || (insp) == NULL) ? \
        NULL :                                            \
        (WT_INSERT *)((char *)((insp) - (i)) - offsetof(WT_INSERT, next)))

#undef PREV_INS
#define PREV_INS(cbt, i) PREV_ITEM((cbt)->ins_head, (cbt)->ins_stack[(i)], (i))

/*
 * __cursor_skip_prev --
 *     Move back one position in a skip list stack (aka "finger").
 */
static inline int
__cursor_skip_prev(WT_CURSOR_BTREE *cbt)
{
    WT_INSERT *current, *ins, *next_ins;
    WT_ITEM key;
    WT_SESSION_IMPL *session;
    uint64_t recno;
    int i;

    session = CUR2S(cbt);

restart:
    /*
     * If the search stack does not point at the current item, fill it in with a search.
     */
    recno = WT_INSERT_RECNO(cbt->ins);
    while ((current = cbt->ins) != PREV_INS(cbt, 0)) {
        if (CUR2BT(cbt)->type == BTREE_ROW) {
            key.data = WT_INSERT_KEY(current);
            key.size = WT_INSERT_KEY_SIZE(current);
            WT_RET(__wt_search_insert(session, cbt, cbt->ins_head, &key));
        } else
            cbt->ins = __col_insert_search(cbt->ins_head, cbt->ins_stack, cbt->next_stack, recno);
    }

    /*
     * Find the first node up the search stack that does not move.
     *
     * The depth of the current item must be at least this level, since we see it in that many
     * levels of the stack.
     *
     * !!! Watch these loops carefully: they all rely on the value of i, and the exit conditions to
     * end up with the right values are non-trivial.
     */
    ins = NULL; /* -Wconditional-uninitialized */
    for (i = 0; i < WT_SKIP_MAXDEPTH - 1; i++)
        if ((ins = PREV_INS(cbt, i + 1)) != current)
            break;

    /*
     * Find a starting point for the new search. That is either at the non-moving node if we found
     * a valid node, or the beginning of the next list down that is not the current node.
     *
     * Since it is the beginning of a list, and we know the current node has a skip depth at least
     * this high, any node we find must sort before the current node.
     */
    if (ins == NULL || ins == current)
        for (; i >= 0; i--) {
            cbt->ins_stack[i] = NULL;
            cbt->next_stack[i] = NULL;
            /*
             * The compiler may replace the usage of the variable with another read in the
             * following code. Here we don't need to worry about CPU reordering as we are reading a
             * thread-local value.
             *
             * Place a read barrier to avoid this issue.
             */
            WT_ORDERED_READ_WEAK_MEMORDER(ins, cbt->ins_head->head[i]);
            if (ins != NULL && ins != current)
                break;
        }

    /* Walk any remaining levels until just before the current node. */
    while (i >= 0) {
        /*
         * If we get to the end of a list without finding the current item, we must have raced with
         * an insert. Restart the search.
         */
        if (ins == NULL) {
            cbt->ins_stack[0] = NULL;
            cbt->next_stack[0] = NULL;
            goto restart;
        }

        /*
         * CPUs with weak memory ordering may reorder the read and return a stale value. This can
         * lead us to wrongly skip a value in the lower levels of the skip list.
         *
         * For example, suppose we initially have A -> C on both level 0 and level 1, and B is
         * concurrently inserted into both levels. If B is visible on level 1 to this thread, it
         * must also be visible on level 0. Otherwise, we would record an inconsistent stack.
         *
         * Place a read barrier to avoid this issue.
         */
        WT_ORDERED_READ_WEAK_MEMORDER(next_ins, ins->next[i]);
        if (next_ins != current)
            /* Stay at this level */
            ins = next_ins;
        else {
            /* Drop down a level */
            cbt->next_stack[i] = next_ins;
            cbt->ins_stack[i] = &ins->next[i];
            --i;
        }
    }

    /* If we found a previous node, the next one must be current. */
    if (cbt->ins_stack[0] != NULL && *cbt->ins_stack[0] != current)
        goto restart;

    cbt->ins = PREV_INS(cbt, 0);
    return (0);
}
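/*
 * A minimal sketch of what a single backward step computes, assuming a two-level insert list with
 * the cursor positioned on C:
 *
 *     level 1: head[1] -> C
 *     level 0: head[0] -> A -> B -> C
 *
 * Level 1 starts with the current node, so its stack entries are cleared (ins_stack[1] and
 * next_stack[1] become NULL). At level 0 the walk stops just before C, leaving ins_stack[0] ==
 * &B->next[0] and next_stack[0] == C. PREV_INS(cbt, 0) then uses the PREV_ITEM offsetof()
 * arithmetic to recover B from that next-array slot, and B becomes the new cbt->ins.
 */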
/*
 * __cursor_fix_append_prev --
 *     Return the previous fixed-length entry on the append list.
 */
static inline int
__cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage, bool restart)
{
    WT_SESSION_IMPL *session;

    session = CUR2S(cbt);

    /* If restarting after a prepare conflict, jump to the right spot. */
    if (restart)
        goto restart_read;

    if (newpage) {
        if ((cbt->ins = WT_SKIP_LAST(cbt->ins_head)) == NULL)
            return (WT_NOTFOUND);
    } else {
        /* Move to the previous record in the append list, if any. */
        if (cbt->ins != NULL && cbt->recno <= WT_INSERT_RECNO(cbt->ins))
            WT_RET(__cursor_skip_prev(cbt));

        /*
         * Handle the special case of leading implicit records, that is, there aren't any records
         * in the page not on the append list, and the append list's first record isn't the first
         * record on the page. (Although implemented as a test of the page values, this is really a
         * test for a tree where the first inserted record wasn't record 1, any other page with
         * only an append list will have a first page record number matching the first record in
         * the append list.)
         *
         * The "right" place to handle this is probably in our caller. The high-level
         * cursor-previous routine would:
         *     -- call this routine to walk the append list
         *     -- call the routine to walk the standard page items
         *     -- call the tree walk routine looking for a previous page
         * Each of them returns WT_NOTFOUND, at which point our caller checks the cursor record
         * number, and if it's larger than 1, returns the implicit records. Instead, I'm trying to
         * detect the case here, mostly because I don't want to put that code into our caller.
         * Anyway, if this code breaks for any reason, that's the way I'd go.
         *
         * If we're not pointing to a WT_INSERT entry (we didn't find a WT_INSERT record preceding
         * our record name-space), check if we've reached the beginning of this page, a possibility
         * if a page had a large number of items appended, and then split. If not, check if there
         * are any records on the page. If there aren't, then we're in the magic zone, keep going
         * until we get to a record number matching the first record on the page.
         */
        if (cbt->ins == NULL &&
          (cbt->recno == cbt->ref->ref_recno || __col_fix_last_recno(cbt->ref) != 0))
            return (WT_NOTFOUND);
    }

    /*
     * This code looks different from the cursor-next code. The append list may be preceded by
     * other rows. If we're iterating through the tree, starting at the last record in the tree, by
     * definition we're starting a new iteration and we set the record number to the last record
     * found on the page. Otherwise, decrement the record.
     */
    if (newpage)
        __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));
    else
        __cursor_set_recno(cbt, cbt->recno - 1);

    if (F_ISSET(&cbt->iface, WT_CURSTD_KEY_ONLY))
        return (0);

    /*
     * Fixed-width column store appends are inherently non-transactional. Even a non-visible update
     * by a concurrent or aborted transaction changes the effective end of the data. The effect is
     * subtle because of the blurring between deleted and empty values, but ideally we would skip
     * all uncommitted changes at the end of the data. This doesn't apply to variable-width column
     * stores because the implicitly created records written by reconciliation are deleted and so
     * can never be seen by a read.
     */
    if (cbt->ins == NULL || cbt->recno > WT_INSERT_RECNO(cbt->ins)) {
        cbt->v = 0;
        cbt->iface.value.data = &cbt->v;
        cbt->iface.value.size = 1;
    } else {
restart_read:
        WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd));
        if (cbt->upd_value->type == WT_UPDATE_INVALID ||
          cbt->upd_value->type == WT_UPDATE_TOMBSTONE) {
            cbt->v = 0;
            cbt->iface.value.data = &cbt->v;
            cbt->iface.value.size = 1;
        } else
            __wt_value_return(cbt, cbt->upd_value);
    }
    return (0);
}
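/*
 * A small example of the leading implicit records __cursor_fix_append_prev handles, assuming a new
 * tree whose only insert so far is record 100: the page itself holds no records and the append
 * list starts at 100, so a backward walk keeps decrementing the record number and returning
 * zero-byte values for records 99 down to the first record on the page, rather than stopping at
 * the append list entry.
 */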
/*
 * __cursor_fix_prev --
 *     Move to the previous, fixed-length column-store item.
 */
static inline int
__cursor_fix_prev(WT_CURSOR_BTREE *cbt, bool newpage, bool restart)
{
    WT_PAGE *page;
    WT_SESSION_IMPL *session;

    session = CUR2S(cbt);
    page = cbt->ref->page;

    /* If restarting after a prepare conflict, jump to the right spot. */
    if (restart)
        goto restart_read;

    /* Initialize for each new page. */
    if (newpage) {
        /*
         * Be paranoid and set the slot out of bounds when moving to a new page.
         */
        cbt->slot = UINT32_MAX;
        cbt->last_standard_recno = __col_fix_last_recno(cbt->ref);
        if (cbt->last_standard_recno == 0)
            return (WT_NOTFOUND);
        __cursor_set_recno(cbt, cbt->last_standard_recno);
        goto new_page;
    }

    /* Move to the previous entry and return the item. */
    if (cbt->recno == cbt->ref->ref_recno)
        return (WT_NOTFOUND);
    __cursor_set_recno(cbt, cbt->recno - 1);

new_page:
restart_read:
    /* We only have one slot. */
    cbt->slot = 0;

    /* Check any insert list for a matching record. */
    cbt->ins_head = WT_COL_UPDATE_SINGLE(page);
    cbt->ins = __col_insert_search(cbt->ins_head, cbt->ins_stack, cbt->next_stack, cbt->recno);
    if (cbt->ins != NULL && cbt->recno != WT_INSERT_RECNO(cbt->ins))
        cbt->ins = NULL;

    if (F_ISSET(&cbt->iface, WT_CURSTD_KEY_ONLY))
        return (0);

    __wt_upd_value_clear(cbt->upd_value);
    if (cbt->ins != NULL)
        /* Check the update list. */
        WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd));
    if (cbt->upd_value->type == WT_UPDATE_INVALID)
        /*
         * Read the on-disk value and/or history. Pass an update list: the update list may contain
         * the base update for a modify chain after rollback-to-stable, required for correctness.
         */
        WT_RET(__wt_txn_read(session, cbt, NULL, cbt->recno, cbt->ins ? cbt->ins->upd : NULL));
    if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE || cbt->upd_value->type == WT_UPDATE_INVALID) {
        /*
         * Deleted values read as 0.
         *
         * Getting an invalid update back means that there was no update, the on-disk value isn't
         * visible, and there isn't anything in history either. This means this chunk of the tree
         * didn't exist yet for us (given our read timestamp), so we can either return NOTFOUND or
         * produce a zero value depending on the desired end-of-tree semantics. For now, we produce
         * zero so as not to change the preexisting end-of-tree behavior.
         */
        cbt->v = 0;
        cbt->iface.value.data = &cbt->v;
        cbt->iface.value.size = 1;
    } else
        __wt_value_return(cbt, cbt->upd_value);

    return (0);
}
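/*
 * To illustrate the zero-value semantics above with a hypothetical read: a fixed-length cursor
 * reading a record at a timestamp before any update to it became visible gets WT_UPDATE_INVALID
 * back from the visibility checks, and __cursor_fix_prev returns a single zero byte rather than
 * WT_NOTFOUND, matching the behavior for explicitly deleted records.
 */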
/*
 * __cursor_var_append_prev --
 *     Return the previous variable-length entry on the append list.
 */
static inline int
__cursor_var_append_prev(
  WT_CURSOR_BTREE *cbt, bool newpage, bool restart, size_t *skippedp, bool *key_out_of_boundsp)
{
    WT_DECL_RET;
    WT_SESSION_IMPL *session;

    session = CUR2S(cbt);
    *skippedp = 0;

    /* If restarting after a prepare conflict, jump to the right spot. */
    if (restart)
        goto restart_read;

    if (newpage) {
        cbt->ins = WT_SKIP_LAST(cbt->ins_head);
        goto new_page;
    }

    for (;;) {
        WT_RET(__cursor_skip_prev(cbt));
new_page:
        if (cbt->ins == NULL)
            return (WT_NOTFOUND);
        __cursor_set_recno(cbt, WT_INSERT_RECNO(cbt->ins));

        if (F_ISSET(&cbt->iface, WT_CURSTD_KEY_ONLY))
            return (0);

restart_read:
        /*
         * If a lower bound has been set ensure that the key is within the range, otherwise early
         * exit.
         */
        if ((ret = __wt_btcur_bounds_early_exit(session, cbt, false, key_out_of_boundsp)) ==
          WT_NOTFOUND)
            WT_STAT_CONN_DATA_INCR(session, cursor_bounds_prev_early_exit);
        WT_RET(ret);

        WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd));
        if (cbt->upd_value->type == WT_UPDATE_INVALID) {
            ++*skippedp;
            continue;
        }
        if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) {
            if (cbt->upd_value->tw.stop_txn != WT_TXN_NONE &&
              __wt_txn_upd_value_visible_all(session, cbt->upd_value))
                ++cbt->page_deleted_count;
            ++*skippedp;
            continue;
        }
        __wt_value_return(cbt, cbt->upd_value);
        return (0);
    }
    /* NOTREACHED */
}
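/*
 * A brief sketch of the bounds early exit used by the previous-walk routines, assuming a cursor
 * with a lower bound set and walking backwards: once the positioned key or record number falls
 * outside that bound, __wt_btcur_bounds_early_exit returns WT_NOTFOUND, the caller bumps the
 * cursor_bounds_prev_early_exit statistic, and the walk stops rather than scanning the remainder
 * of the page and the pages before it.
 */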
/*
 * __cursor_var_prev --
 *     Move to the previous, variable-length column-store item.
 */
static inline int
__cursor_var_prev(
  WT_CURSOR_BTREE *cbt, bool newpage, bool restart, size_t *skippedp, bool *key_out_of_boundsp)
{
    WT_CELL *cell;
    WT_CELL_UNPACK_KV unpack;
    WT_COL *cip;
    WT_DECL_RET;
    WT_INSERT *ins;
    WT_PAGE *page;
    WT_SESSION_IMPL *session;
    uint64_t rle, rle_start;

    session = CUR2S(cbt);
    page = cbt->ref->page;

    rle_start = 0; /* -Werror=maybe-uninitialized */
    *skippedp = 0;

    /* If restarting after a prepare conflict, jump to the right spot. */
    if (restart)
        goto restart_read;

    /* Initialize for each new page. */
    if (newpage) {
        /*
         * Be paranoid and set the slot out of bounds when moving to a new page.
         */
        cbt->slot = UINT32_MAX;
        cbt->last_standard_recno = __col_var_last_recno(cbt->ref);
        if (cbt->last_standard_recno == 0)
            return (WT_NOTFOUND);
        __cursor_set_recno(cbt, cbt->last_standard_recno);
        cbt->cip_saved = NULL;
        F_CLR(cbt, WT_CBT_CACHEABLE_RLE_CELL);
        goto new_page;
    }

    /* Move to the previous entry and return the item. */
    for (;;) {
        __cursor_set_recno(cbt, cbt->recno - 1);
new_page:
        if (cbt->recno < cbt->ref->ref_recno)
            return (WT_NOTFOUND);

restart_read:
        /*
         * If a lower bound has been set ensure that the key is within the range, otherwise early
         * exit. In the case where there is a large set of RLE deleted records it is possible that
         * the calculated recno will be off the end of the page. We don't need to add an additional
         * check for this case as the prev iteration, either on a page or an append list, will
         * check the recno and early exit. It does present a potential optimization but to keep the
         * bounded cursor logic simple we will forego it for now.
         */
        if ((ret = __wt_btcur_bounds_early_exit(session, cbt, false, key_out_of_boundsp)) ==
          WT_NOTFOUND)
            WT_STAT_CONN_DATA_INCR(session, cursor_bounds_prev_early_exit);
        WT_RET(ret);

        /* Find the matching WT_COL slot. */
        if ((cip = __col_var_search(cbt->ref, cbt->recno, &rle_start)) == NULL)
            return (WT_NOTFOUND);
        cbt->slot = WT_COL_SLOT(page, cip);

        /* Check any insert list for a matching record. */
        cbt->ins_head = WT_COL_UPDATE_SLOT(page, cbt->slot);
        cbt->ins = __col_insert_search_match(cbt->ins_head, cbt->recno);
        __wt_upd_value_clear(cbt->upd_value);
        if (cbt->ins != NULL) {
            if (F_ISSET(&cbt->iface, WT_CURSTD_KEY_ONLY))
                return (0);
            WT_RET(__wt_txn_read_upd_list(session, cbt, cbt->ins->upd));
        }
        if (cbt->upd_value->type != WT_UPDATE_INVALID) {
            if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) {
                if (cbt->upd_value->tw.stop_txn != WT_TXN_NONE &&
                  __wt_txn_upd_value_visible_all(session, cbt->upd_value))
                    ++cbt->page_deleted_count;
                ++*skippedp;
                continue;
            }
            __wt_value_return(cbt, cbt->upd_value);
            return (0);
        }

        /*
         * There's no matching insert list item. If we're on the same slot as the last reference,
         * and the cell is cacheable (it might not be if it's not globally visible), reuse the
         * previous return data to avoid repeatedly decoding items.
         */
        if (cbt->cip_saved == cip && F_ISSET(cbt, WT_CBT_CACHEABLE_RLE_CELL)) {
            F_CLR(&cbt->iface, WT_CURSTD_VALUE_EXT);
            F_SET(&cbt->iface, WT_CURSTD_VALUE_INT);
            cbt->iface.value.data = cbt->tmp->data;
            cbt->iface.value.size = cbt->tmp->size;
            return (0);
        }

        /* Unpack the cell and build the return information. */
        cell = WT_COL_PTR(page, cip);
        __wt_cell_unpack_kv(session, page->dsk, cell, &unpack);
        rle = __wt_cell_rle(&unpack);
        if (unpack.type == WT_CELL_DEL) {
            if (rle == 1) {
                ++*skippedp;
                continue;
            }

            /*
             * There can be huge gaps in the variable-length column-store name space appearing as
             * deleted records. If more than one deleted record, do the work of finding the next
             * record to return instead of looping through the records.
             *
             * First, find the largest record in the update list that's smaller than the current
             * record.
             */
            ins = __col_insert_search_lt(cbt->ins_head, cbt->recno);

            /*
             * Second, for records with RLEs greater than 1, the above call to __col_var_search
             * located this record in the page's list of repeating records, and returned the
             * starting record. The starting record - 1 is the record to which we could skip, if
             * there was no larger record in the update list.
             */
            cbt->recno = rle_start - 1;
            if (ins != NULL && WT_INSERT_RECNO(ins) > cbt->recno)
                cbt->recno = WT_INSERT_RECNO(ins);

            /* Adjust for the outer loop decrement. */
            ++cbt->recno;
            ++*skippedp;
            continue;
        }

        if (F_ISSET(&cbt->iface, WT_CURSTD_KEY_ONLY))
            return (0);

        /*
         * Read the on-disk value and/or history. Pass an update list: the update list may contain
         * the base update for a modify chain after rollback-to-stable, required for correctness.
         */
        WT_RET(__wt_txn_read(session, cbt, NULL, cbt->recno, cbt->ins ? cbt->ins->upd : NULL));
        if (cbt->upd_value->type == WT_UPDATE_INVALID ||
          cbt->upd_value->type == WT_UPDATE_TOMBSTONE) {
            ++*skippedp;
            continue;
        }
        __wt_value_return(cbt, cbt->upd_value);

        /*
         * It is only safe to cache the value for other keys in the same RLE cell if it is globally
         * visible. Otherwise, there might be some older timestamp where the value isn't uniform
         * across the cell. Always set cip_saved so it's easy to tell when we change cells.
         *
         * Note: it's important that we're checking the on-disk value for global visibility, and
         * not whatever __wt_txn_read returned, which might be something else. (If it's something
         * else, we can't cache it; but in that case the on-disk value cannot be globally visible.)
         */
        cbt->cip_saved = cip;
        if (rle > 1 &&
          __wt_txn_visible_all(session, unpack.tw.start_txn, unpack.tw.durable_start_ts)) {
            /*
             * Copy the value into cbt->tmp to cache it. This is perhaps unfortunate, because
             * copying isn't free, but it's currently necessary. The memory we're copying might be
             * on the disk page (which is safe because the page is pinned as long as the cursor is
             * sitting on it) but if not it belongs to cbt->upd_value, and that (though it belongs
             * to the cursor and won't disappear arbitrarily) might be invalidated or changed by
             * other paths through this function on a subsequent call but before we are done with
             * this RLE cell. In principle those paths could clear WT_CBT_CACHEABLE_RLE_CELL, but
             * the code is currently structured in a way that makes that difficult.
             */
            WT_RET(__wt_buf_set(session, cbt->tmp, cbt->iface.value.data, cbt->iface.value.size));
            F_SET(cbt, WT_CBT_CACHEABLE_RLE_CELL);
        } else
            F_CLR(cbt, WT_CBT_CACHEABLE_RLE_CELL);
        return (0);
    }
    /* NOTREACHED */
}
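/*
 * A worked example of the deleted-gap skip in __cursor_var_prev, assuming a single WT_CELL_DEL
 * cell covering records 1000-1999 (rle_start == 1000) and an insert list entry for record 1500:
 * when the walk reaches record 1999, __col_insert_search_lt finds 1500, which is larger than
 * rle_start - 1 == 999, so cbt->recno is set to 1500 and then incremented to 1501 to compensate
 * for the loop's decrement; the next iteration lands on record 1500 instead of visiting the
 * intervening deleted records one at a time.
 */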
/*
 * __cursor_row_prev --
 *     Move to the previous row-store item.
 */
static inline int
__cursor_row_prev(
  WT_CURSOR_BTREE *cbt, bool newpage, bool restart, size_t *skippedp, bool *key_out_of_boundsp)
{
    WT_CELL_UNPACK_KV kpack;
    WT_DECL_RET;
    WT_INSERT *ins;
    WT_ITEM *key;
    WT_PAGE *page;
    WT_ROW *rip;
    WT_SESSION_IMPL *session;

    key = &cbt->iface.key;
    page = cbt->ref->page;
    session = CUR2S(cbt);
    *skippedp = 0;

    /* If restarting after a prepare conflict, jump to the right spot. */
    if (restart) {
        if (cbt->iter_retry == WT_CBT_RETRY_INSERT)
            goto restart_read_insert;
        if (cbt->iter_retry == WT_CBT_RETRY_PAGE)
            goto restart_read_page;
    }
    cbt->iter_retry = WT_CBT_RETRY_NOTSET;

    /*
     * For row-store pages, we need a single item that tells us the part of the page we're walking
     * (otherwise switching from next to prev and vice-versa is just too complicated), so we map
     * the WT_ROW and WT_INSERT_HEAD insert array slots into a single name space: slot 1 is the
     * "smallest key insert list", slot 2 is WT_ROW[0], slot 3 is WT_INSERT_HEAD[0], and so on.
     * This means WT_INSERT lists are odd-numbered slots, and WT_ROW array slots are even-numbered
     * slots.
     *
     * Initialize for each new page.
     */
    if (newpage) {
        /* Check if keys need to be instantiated before we walk the page. */
        WT_RET(__wt_row_leaf_key_instantiate(session, page));

        /*
         * Be paranoid and set the slot out of bounds when moving to a new page.
         */
        cbt->slot = UINT32_MAX;
        if (page->entries == 0)
            cbt->ins_head = WT_ROW_INSERT_SMALLEST(page);
        else
            cbt->ins_head = WT_ROW_INSERT_SLOT(page, page->entries - 1);
        cbt->ins = WT_SKIP_LAST(cbt->ins_head);
        cbt->row_iteration_slot = page->entries * 2 + 1;
        cbt->rip_saved = NULL;
        goto new_insert;
    }

    /* Move to the previous entry and return the item. */
    for (;;) {
        /*
         * Continue traversing any insert list. Maintain the reference to the current insert
         * element in case we switch to a cursor next movement.
         */
        if (cbt->ins != NULL)
            WT_RET(__cursor_skip_prev(cbt));

new_insert:
        cbt->iter_retry = WT_CBT_RETRY_INSERT;
restart_read_insert:
        if ((ins = cbt->ins) != NULL) {
            key->data = WT_INSERT_KEY(ins);
            key->size = WT_INSERT_KEY_SIZE(ins);

            if (F_ISSET(&cbt->iface, WT_CURSTD_KEY_ONLY))
                return (0);

            /*
             * If a lower bound has been set ensure that the key is within the range, otherwise
             * early exit.
             */
            if ((ret = __wt_btcur_bounds_early_exit(session, cbt, false, key_out_of_boundsp)) ==
              WT_NOTFOUND)
                WT_STAT_CONN_DATA_INCR(session, cursor_bounds_prev_early_exit);
            WT_RET(ret);

            WT_RET(__wt_txn_read_upd_list(session, cbt, ins->upd));
            if (cbt->upd_value->type == WT_UPDATE_INVALID) {
                ++*skippedp;
                continue;
            }
            if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) {
                if (cbt->upd_value->tw.stop_txn != WT_TXN_NONE &&
                  __wt_txn_upd_value_visible_all(session, cbt->upd_value))
                    ++cbt->page_deleted_count;
                ++*skippedp;
                continue;
            }
            __wt_value_return(cbt, cbt->upd_value);
            return (0);
        }

        /* Check for the beginning of the page. */
        if (cbt->row_iteration_slot == 1)
            return (WT_NOTFOUND);
        --cbt->row_iteration_slot;

        /*
         * Odd-numbered slots configure as WT_INSERT_HEAD entries, even-numbered slots configure
         * as WT_ROW entries.
         */
        if (cbt->row_iteration_slot & 0x01) {
            cbt->ins_head = cbt->row_iteration_slot == 1 ?
              WT_ROW_INSERT_SMALLEST(page) :
              WT_ROW_INSERT_SLOT(page, cbt->row_iteration_slot / 2 - 1);
            cbt->ins = WT_SKIP_LAST(cbt->ins_head);
            goto new_insert;
        }
        cbt->ins_head = NULL;
        cbt->ins = NULL;

        cbt->iter_retry = WT_CBT_RETRY_PAGE;
        cbt->slot = cbt->row_iteration_slot / 2 - 1;
restart_read_page:
        rip = &page->pg_row[cbt->slot];
        /*
         * The saved cursor key from the slot is used later to get the value from the history
         * store if the on-disk data is not visible.
         */
        WT_RET(__cursor_row_slot_key_return(cbt, rip, &kpack));

        if (F_ISSET(&cbt->iface, WT_CURSTD_KEY_ONLY))
            return (0);

        /*
         * If a lower bound has been set ensure that the key is within the range, otherwise early
         * exit.
         */
        if ((ret = __wt_btcur_bounds_early_exit(session, cbt, false, key_out_of_boundsp)) ==
          WT_NOTFOUND)
            WT_STAT_CONN_DATA_INCR(session, cursor_bounds_prev_early_exit);
        WT_RET(ret);

        /*
         * Read the on-disk value and/or history. Pass an update list: the update list may contain
         * the base update for a modify chain after rollback-to-stable, required for correctness.
         */
        WT_RET(
          __wt_txn_read(session, cbt, &cbt->iface.key, WT_RECNO_OOB, WT_ROW_UPDATE(page, rip)));
        if (cbt->upd_value->type == WT_UPDATE_INVALID) {
            ++*skippedp;
            continue;
        }
        if (cbt->upd_value->type == WT_UPDATE_TOMBSTONE) {
            if (cbt->upd_value->tw.stop_txn != WT_TXN_NONE &&
              __wt_txn_upd_value_visible_all(session, cbt->upd_value))
                ++cbt->page_deleted_count;
            ++*skippedp;
            continue;
        }
        __wt_value_return(cbt, cbt->upd_value);
        return (0);
    }
    /* NOTREACHED */
}
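/*
 * A small example of the single name space __cursor_row_prev iterates, assuming a row-store leaf
 * page with two WT_ROW entries: the iteration slots are 1 (smallest-key insert list), 2
 * (WT_ROW[0]), 3 (WT_INSERT_HEAD[0]), 4 (WT_ROW[1]) and 5 (WT_INSERT_HEAD[1]). A new page starts
 * at slot page->entries * 2 + 1 == 5 and the walk decrements from there, so odd slots map to
 * insert lists (slot / 2 - 1, or the smallest-key list for slot 1) and even slots map to WT_ROW
 * entries (slot / 2 - 1).
 */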
/*
 * __wt_btcur_prev --
 *     Move to the previous record in the tree.
 */
int
__wt_btcur_prev(WT_CURSOR_BTREE *cbt, bool truncating)
{
    WT_CURSOR *cursor;
    WT_DECL_RET;
    WT_PAGE *page;
    WT_SESSION_IMPL *session;
    size_t total_skipped, skipped;
    uint32_t flags;
    bool key_out_of_bounds, newpage, restart, need_walk;
#ifdef HAVE_DIAGNOSTIC
    bool inclusive_set;

    inclusive_set = false;
#endif

    cursor = &cbt->iface;
    key_out_of_bounds = false;
    need_walk = false;
    session = CUR2S(cbt);
    total_skipped = 0;

    WT_STAT_CONN_DATA_INCR(session, cursor_prev);

    /* tree walk flags */
    flags = WT_READ_NO_SPLIT | WT_READ_PREV | WT_READ_SKIP_INTL;
    if (truncating)
        LF_SET(WT_READ_TRUNCATE);

    F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);

    WT_ERR(__wt_cursor_func_init(cbt, false));

    /*
     * If we have a bound set we should position our cursor appropriately if it isn't already
     * positioned. It is possible that the positioning function can directly return the record.
     * For that to happen, the cursor must be placed on a valid record and must be positioned on
     * the first record within the bounds. If the record is not valid or is not positioned within
     * the bounds, continue the prev traversal logic.
     */
    if (F_ISSET(cursor, WT_CURSTD_BOUND_UPPER) && !WT_CURSOR_IS_POSITIONED(cbt)) {
        WT_ERR(__wt_btcur_bounds_position(session, cbt, false, &need_walk));
        if (!need_walk) {
            __wt_value_return(cbt, cbt->upd_value);
            goto done;
        }
    }

    /*
     * If we aren't already iterating in the right direction, there's some setup to do.
     */
    if (!F_ISSET(cbt, WT_CBT_ITERATE_PREV))
        __wt_btcur_iterate_setup(cbt);

    /*
     * Walk any page we're holding until the underlying call returns not-found. Then, move to the
     * previous page, until we reach the start of the file.
     */
    restart = F_ISSET(cbt, WT_CBT_ITERATE_RETRY_PREV);
    F_CLR(cbt, WT_CBT_ITERATE_RETRY_PREV);
    for (newpage = false;; newpage = true, restart = false) {
        /* Calls with key only flag should never restart. */
        WT_ASSERT(session, !F_ISSET(&cbt->iface, WT_CURSTD_KEY_ONLY) || !restart);
        page = cbt->ref == NULL ? NULL : cbt->ref->page;

        /*
         * Column-store pages may have appended entries. Handle it separately from the usual
         * cursor code, it's in a simple format.
         */
        if (newpage && page != NULL && page->type != WT_PAGE_ROW_LEAF &&
          (cbt->ins_head = WT_COL_APPEND(page)) != NULL)
            F_SET(cbt, WT_CBT_ITERATE_APPEND);

        if (F_ISSET(cbt, WT_CBT_ITERATE_APPEND)) {
            /* The page cannot be NULL if the above flag is set. */
            WT_ASSERT(session, page != NULL);
            switch (page->type) {
            case WT_PAGE_COL_FIX:
                ret = __cursor_fix_append_prev(cbt, newpage, restart);
                break;
            case WT_PAGE_COL_VAR:
                ret =
                  __cursor_var_append_prev(cbt, newpage, restart, &skipped, &key_out_of_bounds);
                total_skipped += skipped;
                break;
            default:
                WT_ERR(__wt_illegal_value(session, page->type));
            }
            if (ret == 0 || ret == WT_PREPARE_CONFLICT)
                break;
            F_CLR(cbt, WT_CBT_ITERATE_APPEND);
            if (ret != WT_NOTFOUND)
                break;
            newpage = true;
        }
        if (page != NULL) {
            switch (page->type) {
            case WT_PAGE_COL_FIX:
                ret = __cursor_fix_prev(cbt, newpage, restart);
                break;
            case WT_PAGE_COL_VAR:
                ret = __cursor_var_prev(cbt, newpage, restart, &skipped, &key_out_of_bounds);
                total_skipped += skipped;
                break;
            case WT_PAGE_ROW_LEAF:
                ret = __cursor_row_prev(cbt, newpage, restart, &skipped, &key_out_of_bounds);
                total_skipped += skipped;
                break;
            default:
                WT_ERR(__wt_illegal_value(session, page->type));
            }
            if (ret != WT_NOTFOUND)
                break;
        }

        /*
         * If we are doing an operation with bounds set, we need to check if we have exited the
         * prev function due to the key being out of bounds. If so, we break instead of walking
         * onto the prev page. We're not directly returning here to allow the cursor to be reset
         * first before we return WT_NOTFOUND.
         */
        if (key_out_of_bounds)
            break;

        /*
         * If we saw a lot of deleted records on this page, or we went all the way through a page
         * and only saw deleted records, try to evict the page when we release it. Otherwise
         * repeatedly deleting from the beginning of a tree can have quadratic performance. Take
         * care not to force eviction of pages that are genuinely empty, in new trees.
         *
         * A visible stop timestamp could have been treated as a tombstone and accounted in the
         * deleted count. Such a page might not have any new updates and be clean, but could
         * benefit from reconciliation getting rid of the obsolete content. Hence mark the page
         * dirty to force it through reconciliation.
         */
        if (page != NULL &&
          (cbt->page_deleted_count > WT_BTREE_DELETE_THRESHOLD ||
            (newpage && cbt->page_deleted_count > 0))) {
            WT_ERR(__wt_page_dirty_and_evict_soon(session, cbt->ref));
            WT_STAT_CONN_INCR(session, cache_eviction_force_delete);
        }
        cbt->page_deleted_count = 0;

        if (F_ISSET(cbt, WT_CBT_READ_ONCE))
            LF_SET(WT_READ_WONT_NEED);

        if (!F_ISSET(session->txn, WT_TXN_HAS_SNAPSHOT))
            LF_SET(WT_READ_VISIBLE_ALL);

        /*
         * If we are running with snapshot isolation, and not interested in returning tombstones,
         * we could potentially skip pages. The skip function looks at the aggregated timestamp
         * information to determine if something is visible on the page. If nothing is, the page
         * is skipped.
         */
        if (!F_ISSET(&cbt->iface, WT_CURSTD_KEY_ONLY) &&
          session->txn->isolation == WT_ISO_SNAPSHOT &&
          !F_ISSET(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE))
            WT_ERR(
              __wt_tree_walk_custom_skip(session, &cbt->ref, __wt_btcur_skip_page, NULL, flags));
        else
            WT_ERR(__wt_tree_walk(session, &cbt->ref, flags));
        WT_ERR_TEST(cbt->ref == NULL, WT_NOTFOUND, false);
    }
done:
err:
    if (total_skipped != 0) {
        if (total_skipped < 100)
            WT_STAT_CONN_DATA_INCR(session, cursor_prev_skip_lt_100);
        else
            WT_STAT_CONN_DATA_INCR(session, cursor_prev_skip_ge_100);
    }
    WT_STAT_CONN_DATA_INCRV(session, cursor_prev_skip_total, total_skipped);

    switch (ret) {
    case 0:
        if (F_ISSET(&cbt->iface, WT_CURSTD_KEY_ONLY))
            F_SET(cursor, WT_CURSTD_KEY_INT);
        else
            F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
#ifdef HAVE_DIAGNOSTIC
        /*
         * Skip the key order check if next is called after a prev that returned a prepare
         * conflict error, i.e. the cursor has changed direction at a prepared update; the current
         * key returned could be the same as the earlier returned key.
         *
         * For example, with the initial data set (2, 3, ..., 10), insert key 1 in a prepared
         * transaction. A loop on prev will return 10, ..., 3, 2 and a subsequent call to prev will
         * return a prepare conflict. If we then call next, key 2 will be returned, which is the
         * same as the earlier returned key.
         */
        if (!F_ISSET(cbt, WT_CBT_ITERATE_RETRY_NEXT))
            ret = __wt_cursor_key_order_check(session, cbt, false);

        if (need_walk) {
            /*
             * The bounds positioning code relies on the assumption that if we had to walk then we
             * can't possibly have walked to the upper bound. We check that assumption here by
             * comparing the upper bound with our current key or recno. Force inclusive to be
             * false so we don't consider the bound itself.
             */
            inclusive_set = F_ISSET(cursor, WT_CURSTD_BOUND_UPPER_INCLUSIVE);
            F_CLR(cursor, WT_CURSTD_BOUND_UPPER_INCLUSIVE);
            ret = __wt_compare_bounds(
              session, cursor, &cbt->iface.key, cbt->recno, true, &key_out_of_bounds);
            WT_ASSERT(session, ret == 0 && !key_out_of_bounds);
            if (inclusive_set)
                F_SET(cursor, WT_CURSTD_BOUND_UPPER_INCLUSIVE);
        }
#endif
        break;
    case WT_PREPARE_CONFLICT:
        /*
         * If a prepare conflict occurs, the cursor should not be reset, as the current cursor
         * position will be reused in case of a retry from the user.
         */
        F_SET(cbt, WT_CBT_ITERATE_RETRY_PREV);
        break;
    default:
        WT_TRET(__cursor_reset(cbt));
    }
    F_CLR(cbt, WT_CBT_ITERATE_RETRY_NEXT);

    if (ret == 0)
        WT_RET(__wt_btcur_evict_reposition(cbt));

    return (ret);
}