/*- * Copyright (c) 2014-present MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * * See the file LICENSE for redistribution information. */ /* * __wt_txn_context_prepare_check -- * Return an error if the current transaction is in the prepare state. */ static inline int __wt_txn_context_prepare_check(WT_SESSION_IMPL *session) { if (F_ISSET(session->txn, WT_TXN_PREPARE_IGNORE_API_CHECK)) return (0); if (F_ISSET(session->txn, WT_TXN_PREPARE)) WT_RET_MSG(session, EINVAL, "not permitted in a prepared transaction"); return (0); } /* * __wt_txn_context_check -- * Complain if a transaction is/isn't running. */ static inline int __wt_txn_context_check(WT_SESSION_IMPL *session, bool requires_txn) { if (requires_txn && !F_ISSET(session->txn, WT_TXN_RUNNING)) WT_RET_MSG(session, EINVAL, "only permitted in a running transaction"); if (!requires_txn && F_ISSET(session->txn, WT_TXN_RUNNING)) WT_RET_MSG(session, EINVAL, "not permitted in a running transaction"); return (0); } /* * __wt_txn_err_set -- * Set an error in the current transaction. */ static inline void __wt_txn_err_set(WT_SESSION_IMPL *session, int ret) { WT_TXN *txn; txn = session->txn; /* Ignore standard errors that don't fail the transaction. */ if (ret == WT_NOTFOUND || ret == WT_DUPLICATE_KEY || ret == WT_PREPARE_CONFLICT) return; /* Less commonly, it's not a running transaction. */ if (!F_ISSET(txn, WT_TXN_RUNNING)) return; /* The transaction has to be rolled back. */ F_SET(txn, WT_TXN_ERROR); /* * Check for a prepared transaction, and quit: we can't ignore the error and we can't roll back * a prepared transaction. */ if (F_ISSET(txn, WT_TXN_PREPARE)) WT_IGNORE_RET(__wt_panic(session, ret, "transactional error logged after transaction was prepared, failing the system")); } /* * __wt_txn_op_set_recno -- * Set the latest transaction operation with the given recno. */ static inline void __wt_txn_op_set_recno(WT_SESSION_IMPL *session, uint64_t recno) { WT_TXN *txn; WT_TXN_OP *op; txn = session->txn; WT_ASSERT(session, txn->mod_count > 0 && recno != WT_RECNO_OOB); op = txn->mod + txn->mod_count - 1; if (WT_SESSION_IS_CHECKPOINT(session) || WT_IS_HS(op->btree->dhandle) || WT_IS_METADATA(op->btree->dhandle)) return; WT_ASSERT(session, op->type == WT_TXN_OP_BASIC_COL || op->type == WT_TXN_OP_INMEM_COL); /* * Copy the recno into the transaction operation structure, so when update is evicted to the * history store, we have a chance of finding it again. Even though only prepared updates can be * evicted, at this stage we don't know whether this transaction will be prepared or not, hence * we are copying the key for all operations, so that we can use this key to fetch the update in * case this transaction is prepared. */ op->u.op_col.recno = recno; } /* * __wt_txn_op_set_key -- * Set the latest transaction operation with the given key. */ static inline int __wt_txn_op_set_key(WT_SESSION_IMPL *session, const WT_ITEM *key) { WT_TXN *txn; WT_TXN_OP *op; txn = session->txn; WT_ASSERT(session, txn->mod_count > 0 && key->data != NULL); op = txn->mod + txn->mod_count - 1; if (WT_SESSION_IS_CHECKPOINT(session) || WT_IS_HS(op->btree->dhandle) || WT_IS_METADATA(op->btree->dhandle)) return (0); WT_ASSERT(session, op->type == WT_TXN_OP_BASIC_ROW || op->type == WT_TXN_OP_INMEM_ROW); /* * Copy the key into the transaction operation structure, so when update is evicted to the * history store, we have a chance of finding it again. 
 * Even though only prepared updates can be evicted, at this stage we don't know whether this
 * transaction will be prepared or not, hence we are copying the key for all operations, so that
 * we can use this key to fetch the update in case this transaction is prepared.
 */
    return (__wt_buf_set(session, &op->u.op_row.key, key->data, key->size));
}

/*
 * __txn_resolve_prepared_update --
 *     Resolve a prepared update as a committed update.
 */
static inline void
__txn_resolve_prepared_update(WT_SESSION_IMPL *session, WT_UPDATE *upd)
{
    WT_TXN *txn;

    txn = session->txn;
    /*
     * For a prepared transaction, the order in which the prepare timestamp is replaced by the
     * commit timestamp in the update chain does not affect data visibility: a reader encountering
     * a prepared update gets a prepare conflict either way.
     *
     * Because updating the timestamps might not be an atomic operation, manage the transition
     * with the prepare state.
     */
    upd->prepare_state = WT_PREPARE_LOCKED;
    WT_WRITE_BARRIER();
    upd->start_ts = txn->commit_timestamp;
    upd->durable_ts = txn->durable_timestamp;
    WT_PUBLISH(upd->prepare_state, WT_PREPARE_RESOLVED);
}

/*
 * __txn_next_op --
 *     Mark a WT_UPDATE object modified by the current transaction.
 */
static inline int
__txn_next_op(WT_SESSION_IMPL *session, WT_TXN_OP **opp)
{
    WT_TXN *txn;
    WT_TXN_OP *op;

    *opp = NULL;

    txn = session->txn;

    /*
     * We're about to perform an update. Make sure we have allocated a transaction ID.
     */
    WT_RET(__wt_txn_id_check(session));
    WT_ASSERT(session, F_ISSET(txn, WT_TXN_HAS_ID));

    WT_RET(__wt_realloc_def(session, &txn->mod_alloc, txn->mod_count + 1, &txn->mod));

    op = &txn->mod[txn->mod_count++];
    WT_CLEAR(*op);

    op->btree = S2BT(session);
    (void)__wt_atomic_addi32(&session->dhandle->session_inuse, 1);

    *opp = op;
    return (0);
}

/*
 * __wt_txn_unmodify --
 *     If threads race making updates, they may discard the last referenced WT_UPDATE item while
 *     the transaction is still active. This function removes the last update item from the
 *     transaction's operation array (the "log").
 */
static inline void
__wt_txn_unmodify(WT_SESSION_IMPL *session)
{
    WT_TXN *txn;
    WT_TXN_OP *op;

    txn = session->txn;
    if (F_ISSET(txn, WT_TXN_HAS_ID)) {
        WT_ASSERT(session, txn->mod_count > 0);
        --txn->mod_count;
        op = txn->mod + txn->mod_count;
        __wt_txn_op_free(session, op);
    }
}

/*
 * __wt_txn_op_delete_apply_prepare_state --
 *     Apply the correct prepare state and the timestamp to the ref and to any updates in the page
 *     del update list.
 */
static inline void
__wt_txn_op_delete_apply_prepare_state(WT_SESSION_IMPL *session, WT_REF *ref, bool commit)
{
    WT_PAGE_DELETED *page_del;
    WT_TXN *txn;
    WT_UPDATE **updp;
    wt_timestamp_t ts;
    uint8_t prepare_state, previous_state;

    txn = session->txn;

    /* Lock the ref to ensure we don't race with page instantiation. */
    WT_REF_LOCK(session, ref, &previous_state);

    if (commit) {
        ts = txn->commit_timestamp;
        prepare_state = WT_PREPARE_RESOLVED;
    } else {
        ts = txn->prepare_timestamp;
        prepare_state = WT_PREPARE_INPROGRESS;
    }

    /*
     * Timestamps and prepare state are in the page deleted structure for truncates, or in the
     * updates list in the case of instantiated pages. We also need to update any page deleted
     * structure in the ref.
     *
     * Only two cases are possible. First: the state is WT_REF_DELETED. In this case page_del
     * cannot be NULL yet because an uncommitted operation cannot have reached global visibility.
     * (Or at least, global visibility in the sense we need to use it for truncations, in which
     * prepared and uncommitted transactions are not visible.)
* * Otherwise: there is an uncommitted delete operation we're handling, so the page must have * been deleted at some point, and the tree can't be readonly. Therefore the page must have been * instantiated, the state must be WT_REF_MEM, and there should be an update list in * mod->inst_updates. (But just in case, allow the update list to be null.) There might be a * non-null page_del structure to update, depending on whether the page has been reconciled * since it was deleted and then instantiated. */ if (previous_state != WT_REF_DELETED) { WT_ASSERT(session, previous_state == WT_REF_MEM); WT_ASSERT(session, ref->page != NULL && ref->page->modify != NULL); if ((updp = ref->page->modify->inst_updates) != NULL) for (; *updp != NULL; ++updp) { (*updp)->start_ts = ts; /* * Holding the ref locked means we have exclusive access, so if we are committing we * don't need to use the prepare locked transition state. */ (*updp)->prepare_state = prepare_state; if (commit) (*updp)->durable_ts = txn->durable_timestamp; } } page_del = ref->page_del; if (page_del != NULL) { page_del->timestamp = ts; if (commit) page_del->durable_timestamp = txn->durable_timestamp; WT_PUBLISH(page_del->prepare_state, prepare_state); } WT_REF_UNLOCK(ref, previous_state); } /* * __wt_txn_op_delete_commit_apply_timestamps -- * Apply the correct start and durable timestamps to any updates in the page del update list. */ static inline void __wt_txn_op_delete_commit_apply_timestamps(WT_SESSION_IMPL *session, WT_REF *ref) { WT_PAGE_DELETED *page_del; WT_TXN *txn; WT_UPDATE **updp; uint8_t previous_state; txn = session->txn; /* Lock the ref to ensure we don't race with page instantiation. */ WT_REF_LOCK(session, ref, &previous_state); /* * Timestamps are in the page deleted structure for truncates, or in the updates in the case of * instantiated pages. We also need to update any page deleted structure in the ref. Both commit * and durable timestamps need to be updated. * * Only two cases are possible. First: the state is WT_REF_DELETED. In this case page_del cannot * be NULL yet because an uncommitted operation cannot have reached global visibility. (Or at * least, global visibility in the sense we need to use it for truncations, in which prepared * and uncommitted transactions are not visible.) * * Otherwise: there is an uncommitted delete operation we're handling, so the page must have * been deleted at some point, and the tree can't be readonly. Therefore the page must have been * instantiated, the state must be WT_REF_MEM, and there should be an update list in * mod->inst_updates. (But just in case, allow the update list to be null.) There might be a * non-null page_del structure to update, depending on whether the page has been reconciled * since it was deleted and then instantiated. */ if (previous_state != WT_REF_DELETED) { WT_ASSERT(session, previous_state == WT_REF_MEM); WT_ASSERT(session, ref->page != NULL && ref->page->modify != NULL); if ((updp = ref->page->modify->inst_updates) != NULL) for (; *updp != NULL; ++updp) { (*updp)->start_ts = txn->commit_timestamp; (*updp)->durable_ts = txn->durable_timestamp; } } page_del = ref->page_del; if (page_del != NULL && page_del->timestamp == WT_TS_NONE) { page_del->timestamp = txn->commit_timestamp; page_del->durable_timestamp = txn->durable_timestamp; } WT_REF_UNLOCK(ref, previous_state); } /* * __wt_txn_op_set_timestamp -- * Decide whether to copy a commit timestamp into an update. 
If the op structure doesn't have a * populated update or ref field or is in prepared state there won't be any check for an * existing timestamp. */ static inline void __wt_txn_op_set_timestamp(WT_SESSION_IMPL *session, WT_TXN_OP *op) { WT_BTREE *btree; WT_TXN *txn; WT_UPDATE *upd; btree = op->btree; txn = session->txn; /* * Updates without a commit time and logged objects don't have timestamps, and only the most * recently committed data matches files on disk. */ if (!F_ISSET(txn, WT_TXN_HAS_TS_COMMIT)) return; if (F_ISSET(btree, WT_BTREE_LOGGED)) return; if (F_ISSET(txn, WT_TXN_PREPARE)) { /* * We have a commit timestamp for a prepare transaction, this is only possible as part of a * transaction commit call. */ if (op->type == WT_TXN_OP_REF_DELETE) __wt_txn_op_delete_apply_prepare_state(session, op->u.ref, true); else { upd = op->u.op_upd; /* Resolve prepared update to be committed update. */ __txn_resolve_prepared_update(session, upd); } } else { if (op->type == WT_TXN_OP_REF_DELETE) __wt_txn_op_delete_commit_apply_timestamps(session, op->u.ref); else { /* * The timestamp is in the update for operations other than truncate. Both commit and * durable timestamps need to be updated. */ upd = op->u.op_upd; if (upd->start_ts == WT_TS_NONE) { upd->start_ts = txn->commit_timestamp; upd->durable_ts = txn->durable_timestamp; } } } } /* * __wt_txn_modify -- * Mark a WT_UPDATE object modified by the current transaction. */ static inline int __wt_txn_modify(WT_SESSION_IMPL *session, WT_UPDATE *upd) { WT_TXN *txn; WT_TXN_OP *op; txn = session->txn; if (F_ISSET(txn, WT_TXN_READONLY)) { if (F_ISSET(txn, WT_TXN_IGNORE_PREPARE)) WT_RET_MSG( session, ENOTSUP, "Transactions with ignore_prepare=true cannot perform updates"); WT_RET_MSG(session, WT_ROLLBACK, "Attempt to update in a read-only transaction"); } WT_RET(__txn_next_op(session, &op)); if (F_ISSET(session, WT_SESSION_LOGGING_INMEM)) { if (op->btree->type == BTREE_ROW) op->type = WT_TXN_OP_INMEM_ROW; else op->type = WT_TXN_OP_INMEM_COL; } else { if (op->btree->type == BTREE_ROW) op->type = WT_TXN_OP_BASIC_ROW; else op->type = WT_TXN_OP_BASIC_COL; } op->u.op_upd = upd; /* History store bypasses transactions, transaction modify should never be called on it. */ WT_ASSERT(session, !WT_IS_HS((S2BT(session))->dhandle)); upd->txnid = session->txn->id; __wt_txn_op_set_timestamp(session, op); return (0); } /* * __wt_txn_modify_page_delete -- * Remember a page truncated by the current transaction. */ static inline int __wt_txn_modify_page_delete(WT_SESSION_IMPL *session, WT_REF *ref) { WT_DECL_RET; WT_TXN *txn; WT_TXN_OP *op; txn = session->txn; WT_RET(__txn_next_op(session, &op)); op->type = WT_TXN_OP_REF_DELETE; op->u.ref = ref; /* * This access to the WT_PAGE_DELETED structure is safe; caller has the WT_REF locked, and in * fact just allocated the structure to fill in. */ ref->page_del->txnid = txn->id; __wt_txn_op_set_timestamp(session, op); if (__wt_log_op(session)) WT_ERR(__wt_txn_log_op(session, NULL)); return (0); err: __wt_txn_unmodify(session); return (ret); } /* * __wt_txn_oldest_id -- * Return the oldest transaction ID that has to be kept for the current tree. */ static inline uint64_t __wt_txn_oldest_id(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_TXN_GLOBAL *txn_global; uint64_t checkpoint_pinned, oldest_id, recovery_ckpt_snap_min; conn = S2C(session); txn_global = &conn->txn_global; /* * The metadata is tracked specially because of optimizations for checkpoints. 
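     *
     * As an illustration of the pinning rule below (hypothetical IDs): with an oldest ID of 100
     * and an active checkpoint whose pinned ID is 90, we return 90 so changes the checkpoint
     * still needs are kept; with no active checkpoint (pinned ID WT_TXN_NONE), we return 100.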
     */
    if (session->dhandle != NULL && WT_IS_METADATA(session->dhandle))
        return (txn_global->metadata_pinned);

    /*
     * Take a local copy of these IDs in case they are updated while we are checking visibility.
     * The read of the transaction ID pinned by a checkpoint needs to be carefully ordered: if a
     * checkpoint is starting and we have to start checking the pinned ID, we take the minimum of
     * it with the oldest ID, which is what we want. Logged tables are excluded as part of RTS, so
     * there is no need to hold their oldest ID.
     */
    WT_ORDERED_READ(oldest_id, txn_global->oldest_id);

    if (!F_ISSET(conn, WT_CONN_RECOVERING) || session->dhandle == NULL ||
      F_ISSET(S2BT(session), WT_BTREE_LOGGED)) {
        /*
         * Checkpoint transactions often fall behind ordinary application threads. If there is an
         * active checkpoint, keep changes until the checkpoint is finished.
         */
        checkpoint_pinned = txn_global->checkpoint_txn_shared.pinned_id;
        if (checkpoint_pinned == WT_TXN_NONE || WT_TXNID_LT(oldest_id, checkpoint_pinned))
            return (oldest_id);
        return (checkpoint_pinned);
    } else {
        /*
         * The recovered checkpoint snapshot rarely falls behind ordinary application threads.
         * Keep the changes until the recovery is finished.
         */
        recovery_ckpt_snap_min = conn->recovery_ckpt_snap_min;
        if (recovery_ckpt_snap_min == WT_TXN_NONE || WT_TXNID_LT(oldest_id, recovery_ckpt_snap_min))
            return (oldest_id);
        return (recovery_ckpt_snap_min);
    }
}

/*
 * __wt_txn_pinned_timestamp --
 *     Get the first timestamp that has to be kept for the current tree.
 */
static inline void
__wt_txn_pinned_timestamp(WT_SESSION_IMPL *session, wt_timestamp_t *pinned_tsp)
{
    WT_TXN_GLOBAL *txn_global;
    wt_timestamp_t checkpoint_ts, pinned_ts;

    *pinned_tsp = WT_TS_NONE;

    txn_global = &S2C(session)->txn_global;

    /*
     * There is no need to go further if no pinned timestamp has been set yet.
     */
    if (!txn_global->has_pinned_timestamp)
        return;

    /* If we have a version cursor open, use the pinned timestamp from when it was opened. */
    if (S2C(session)->version_cursor_count > 0) {
        *pinned_tsp = txn_global->version_cursor_pinned_timestamp;
        return;
    }

    *pinned_tsp = pinned_ts = txn_global->pinned_timestamp;

    /*
     * The read of the checkpoint timestamp needs to be carefully ordered: it must happen after we
     * have read the pinned timestamp and the checkpoint generation; otherwise, we may read an
     * earlier checkpoint timestamp than the one corresponding to the checkpoint generation we
     * read, resulting in more data being pinned. If a checkpoint is starting and we have to use
     * the checkpoint timestamp, we take the minimum of it with the oldest timestamp, which is
     * what we want.
     */
    WT_READ_BARRIER();
    checkpoint_ts = txn_global->checkpoint_timestamp;

    if (checkpoint_ts != 0 && checkpoint_ts < pinned_ts)
        *pinned_tsp = checkpoint_ts;
}

/*
 * __txn_visible_all_id --
 *     Check if a given transaction ID is "globally visible", that is, whether all sessions in the
 *     system will see the transaction ID, including the ID that belongs to a running checkpoint.
 */
static inline bool
__txn_visible_all_id(WT_SESSION_IMPL *session, uint64_t id)
{
    WT_TXN *txn;
    uint64_t oldest_id;

    txn = session->txn;

    /* Make sure that checkpoint cursor transactions only read checkpoints, except for metadata. */
    WT_ASSERT(session,
      (session->dhandle != NULL && WT_IS_METADATA(session->dhandle)) ||
        WT_READING_CHECKPOINT(session) == F_ISSET(session->txn, WT_TXN_IS_CHECKPOINT));

    /*
     * When reading from a checkpoint, all readers use the same snapshot, so a transaction is
     * globally visible if it is visible in that snapshot.
Note that this can cause things that were * not globally visible yet when the checkpoint is taken to become globally visible in the * checkpoint. This is expected (it is like all the old running transactions exited) -- but note * that it's important that the inverse change (something globally visible when the checkpoint * was taken becomes not globally visible in the checkpoint) never happen as this violates basic * assumptions about visibility. (And, concretely, it can cause stale history store entries to * come back to life and produce wrong answers.) * * Note: we use the transaction to check this rather than testing WT_READING_CHECKPOINT because * reading the metadata while working with a checkpoint cursor will borrow the transaction; it * then ends up using it to read a non-checkpoint tree. This is believed to be ok because the * metadata is always read-uncommitted, but we want to still use the checkpoint-cursor * visibility logic. Using the regular visibility logic with a checkpoint cursor transaction can * be logically invalid (it is possible that way for something to be globally visible but * specifically invisible) and also can end up comparing transaction ids from different database * opens. */ if (F_ISSET(session->txn, WT_TXN_IS_CHECKPOINT)) return (__wt_txn_visible_id_snapshot( id, txn->snap_min, txn->snap_max, txn->snapshot, txn->snapshot_count)); oldest_id = __wt_txn_oldest_id(session); return (WT_TXNID_LT(id, oldest_id)); } /* * __wt_txn_visible_all -- * Check whether a given time window is either globally visible or obsolete. For global * visibility checks, the commit times are checked against the oldest possible readers in the * system. If all possible readers could always see the time window - it is globally visible. * For obsolete checks callers should generally pass in the durable timestamp, since it is * guaranteed to be newer than or equal to the commit time, and content needs to be retained * (not become obsolete) until both the commit and durable times are obsolete. If the commit * time is used for this check, it's possible that a transaction is committed with a durable * time and made obsolete before it can be included in a checkpoint - which leads to bugs in * checkpoint correctness. */ static inline bool __wt_txn_visible_all(WT_SESSION_IMPL *session, uint64_t id, wt_timestamp_t timestamp) { wt_timestamp_t pinned_ts; /* * When shutting down, the transactional system has finished running and all we care about is * eviction, make everything visible. */ if (F_ISSET(S2C(session), WT_CONN_CLOSING)) return (true); if (!__txn_visible_all_id(session, id)) return (false); /* Timestamp check. */ if (timestamp == WT_TS_NONE) return (true); /* Make sure that checkpoint cursor transactions only read checkpoints, except for metadata. */ WT_ASSERT(session, (session->dhandle != NULL && WT_IS_METADATA(session->dhandle)) || WT_READING_CHECKPOINT(session) == F_ISSET(session->txn, WT_TXN_IS_CHECKPOINT)); /* When reading a checkpoint, use the checkpoint state instead of the current state. */ if (F_ISSET(session->txn, WT_TXN_IS_CHECKPOINT)) return (session->txn->checkpoint_oldest_timestamp != WT_TS_NONE && timestamp <= session->txn->checkpoint_oldest_timestamp); /* If no oldest timestamp has been supplied, updates have to stay in cache. */ __wt_txn_pinned_timestamp(session, &pinned_ts); return (pinned_ts != WT_TS_NONE && timestamp <= pinned_ts); } /* * __wt_txn_upd_visible_all -- * Is the given update visible to all (possible) readers? 
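 *
 *     Note that updates whose transaction is still in the prepare state are never treated as
 *     globally visible here, regardless of their timestamps; see the check at the top of the
 *     function body.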
*/ static inline bool __wt_txn_upd_visible_all(WT_SESSION_IMPL *session, WT_UPDATE *upd) { if (upd->prepare_state == WT_PREPARE_LOCKED || upd->prepare_state == WT_PREPARE_INPROGRESS) return (false); /* * This function is used to determine when an update is obsolete: that should take into account * the durable timestamp which is greater than or equal to the start timestamp. */ return (__wt_txn_visible_all(session, upd->txnid, upd->durable_ts)); } /* * __wt_txn_upd_value_visible_all -- * Is the given update value visible to all (possible) readers? */ static inline bool __wt_txn_upd_value_visible_all(WT_SESSION_IMPL *session, WT_UPDATE_VALUE *upd_value) { WT_ASSERT(session, upd_value->tw.prepare == 0); return (upd_value->type == WT_UPDATE_TOMBSTONE ? __wt_txn_visible_all(session, upd_value->tw.stop_txn, upd_value->tw.durable_stop_ts) : __wt_txn_visible_all(session, upd_value->tw.start_txn, upd_value->tw.durable_start_ts)); } /* * __wt_txn_tw_stop_visible -- * Is the given stop time window visible? */ static inline bool __wt_txn_tw_stop_visible(WT_SESSION_IMPL *session, WT_TIME_WINDOW *tw) { return (WT_TIME_WINDOW_HAS_STOP(tw) && !tw->prepare && __wt_txn_visible(session, tw->stop_txn, tw->stop_ts, tw->durable_stop_ts)); } /* * __wt_txn_tw_start_visible -- * Is the given start time window visible? */ static inline bool __wt_txn_tw_start_visible(WT_SESSION_IMPL *session, WT_TIME_WINDOW *tw) { /* * Check the prepared flag if there is no stop time point or the start and stop time points are * from the same transaction. */ return (((WT_TIME_WINDOW_HAS_STOP(tw) && (tw->start_txn != tw->stop_txn || tw->start_ts != tw->stop_ts || tw->durable_start_ts != tw->durable_stop_ts)) || !tw->prepare) && __wt_txn_visible(session, tw->start_txn, tw->start_ts, tw->durable_start_ts)); } /* * __wt_txn_tw_start_visible_all -- * Is the given start time window visible to all (possible) readers? */ static inline bool __wt_txn_tw_start_visible_all(WT_SESSION_IMPL *session, WT_TIME_WINDOW *tw) { /* * Check the prepared flag if there is no stop time point or the start and stop time points are * from the same transaction. */ return (((WT_TIME_WINDOW_HAS_STOP(tw) && (tw->start_txn != tw->stop_txn || tw->start_ts != tw->stop_ts || tw->durable_start_ts != tw->durable_stop_ts)) || !tw->prepare) && __wt_txn_visible_all(session, tw->start_txn, tw->durable_start_ts)); } /* * __wt_txn_tw_stop_visible_all -- * Is the given stop time window visible to all (possible) readers? */ static inline bool __wt_txn_tw_stop_visible_all(WT_SESSION_IMPL *session, WT_TIME_WINDOW *tw) { return (WT_TIME_WINDOW_HAS_STOP(tw) && !tw->prepare && __wt_txn_visible_all(session, tw->stop_txn, tw->durable_stop_ts)); } /* * __wt_txn_visible_id_snapshot -- * Is the id visible in terms of the given snapshot? */ static inline bool __wt_txn_visible_id_snapshot( uint64_t id, uint64_t snap_min, uint64_t snap_max, uint64_t *snapshot, uint32_t snapshot_count) { bool found; /* * WT_ISO_SNAPSHOT, WT_ISO_READ_COMMITTED: the ID is visible if it is not the result of a * concurrent transaction, that is, if was committed before the snapshot was taken. * * The order here is important: anything newer than or equal to the maximum ID we saw when * taking the snapshot should be invisible, even if the snapshot is empty. * * Snapshot data: * ids >= snap_max not visible, * ids < snap_min are visible, * everything else is visible unless it is found in the snapshot. 
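     *
     * For example (hypothetical values), with snap_min 10, snap_max 20 and a snapshot of [12, 15]:
     *	id 8 is visible (it committed before the snapshot was taken),
     *	id 12 is not visible (it was running when the snapshot was taken),
     *	id 13 is visible (it is not in the snapshot),
     *	id 20 is not visible (it is at or beyond snap_max).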
*/ if (WT_TXNID_LE(snap_max, id)) return (false); if (snapshot_count == 0 || WT_TXNID_LT(id, snap_min)) return (true); WT_BINARY_SEARCH(id, snapshot, snapshot_count, found); return (!found); } /* * __txn_visible_id -- * Can the current transaction see the given ID? */ static inline bool __txn_visible_id(WT_SESSION_IMPL *session, uint64_t id) { WT_TXN *txn; txn = session->txn; /* Changes with no associated transaction are always visible. */ if (id == WT_TXN_NONE) return (true); /* Nobody sees the results of aborted transactions. */ if (id == WT_TXN_ABORTED) return (false); /* Transactions see their own changes. */ if (id == txn->id) return (true); /* Read-uncommitted transactions see all other changes. */ if (txn->isolation == WT_ISO_READ_UNCOMMITTED) return (true); /* Otherwise, we should be called with a snapshot. */ WT_ASSERT(session, F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)); return (__wt_txn_visible_id_snapshot( id, txn->snap_min, txn->snap_max, txn->snapshot, txn->snapshot_count)); } /* * __wt_txn_visible -- * Can the current transaction see the given ID / timestamp? */ static inline bool __wt_txn_visible( WT_SESSION_IMPL *session, uint64_t id, wt_timestamp_t timestamp, wt_timestamp_t durable_timestamp) { WT_TXN *txn; WT_TXN_SHARED *txn_shared; txn = session->txn; txn_shared = WT_SESSION_TXN_SHARED(session); if (!__txn_visible_id(session, id)) return (false); /* Transactions read their writes, regardless of timestamps. */ if (F_ISSET(session->txn, WT_TXN_HAS_ID) && id == session->txn->id) return (true); /* Timestamp check. */ if (!F_ISSET(txn, WT_TXN_SHARED_TS_READ) || timestamp == WT_TS_NONE) return (true); /* * For checkpoint cursors, just using the commit timestamp visibility check can go wrong when a * prepared transaction gets committed in parallel to a running checkpoint. * * To avoid this problem, along with the visibility check of a commit timestamp, comparing the * durable timestamp against the stable timestamp of a checkpoint can avoid the problems of * returning inconsistent data. */ if (WT_READING_CHECKPOINT(session)) return ((timestamp <= txn->checkpoint_read_timestamp) && (durable_timestamp <= txn->checkpoint_stable_timestamp)); return (timestamp <= txn_shared->read_timestamp); } /* * __wt_txn_upd_visible_type -- * Visible type of given update for the current transaction. */ static inline WT_VISIBLE_TYPE __wt_txn_upd_visible_type(WT_SESSION_IMPL *session, WT_UPDATE *upd) { uint8_t prepare_state, previous_state; bool upd_visible; for (;; __wt_yield()) { /* Prepare state change is in progress, yield and try again. */ WT_ORDERED_READ(prepare_state, upd->prepare_state); if (prepare_state == WT_PREPARE_LOCKED) continue; /* Entries in the history store are always visible. */ if ((WT_IS_HS(session->dhandle) && upd->txnid != WT_TXN_ABORTED && upd->type == WT_UPDATE_STANDARD)) return (WT_VISIBLE_TRUE); upd_visible = __wt_txn_visible(session, upd->txnid, upd->start_ts, upd->durable_ts); /* * The visibility check is only valid if the update does not change state. If the state does * change, recheck visibility. */ previous_state = prepare_state; WT_ORDERED_READ(prepare_state, upd->prepare_state); if (previous_state == prepare_state) break; WT_STAT_CONN_INCR(session, prepared_transition_blocked_page); } if (!upd_visible) return (WT_VISIBLE_FALSE); if (prepare_state == WT_PREPARE_INPROGRESS) return (WT_VISIBLE_PREPARE); return (WT_VISIBLE_TRUE); } /* * __wt_txn_upd_visible -- * Can the current transaction see the given update. 
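 *
 *     This is a thin wrapper: an update is considered visible only when __wt_txn_upd_visible_type
 *     returns WT_VISIBLE_TRUE, so updates from prepared-but-unresolved transactions (which report
 *     WT_VISIBLE_PREPARE) are not visible here.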
*/ static inline bool __wt_txn_upd_visible(WT_SESSION_IMPL *session, WT_UPDATE *upd) { return (__wt_txn_upd_visible_type(session, upd) == WT_VISIBLE_TRUE); } /* * __wt_upd_alloc -- * Allocate a WT_UPDATE structure and associated value and fill it in. */ static inline int __wt_upd_alloc(WT_SESSION_IMPL *session, const WT_ITEM *value, u_int modify_type, WT_UPDATE **updp, size_t *sizep) { WT_UPDATE *upd; *updp = NULL; /* * The code paths leading here are convoluted: assert we never attempt to allocate an update * structure if only intending to insert one we already have, or pass in a value with a type * that doesn't support values. */ WT_ASSERT(session, modify_type != WT_UPDATE_INVALID); WT_ASSERT(session, (value == NULL && (modify_type == WT_UPDATE_RESERVE || modify_type == WT_UPDATE_TOMBSTONE)) || (value != NULL && !(modify_type == WT_UPDATE_RESERVE || modify_type == WT_UPDATE_TOMBSTONE))); /* * Allocate the WT_UPDATE structure and room for the value, then copy any value into place. * Memory is cleared, which is the equivalent of setting: * WT_UPDATE.txnid = WT_TXN_NONE; * WT_UPDATE.durable_ts = WT_TS_NONE; * WT_UPDATE.start_ts = WT_TS_NONE; * WT_UPDATE.prepare_state = WT_PREPARE_INIT; * WT_UPDATE.flags = 0; */ WT_RET(__wt_calloc(session, 1, WT_UPDATE_SIZE + (value == NULL ? 0 : value->size), &upd)); if (value != NULL && value->size != 0) { upd->size = WT_STORE_SIZE(value->size); memcpy(upd->data, value->data, value->size); } upd->type = (uint8_t)modify_type; *updp = upd; if (sizep != NULL) *sizep = WT_UPDATE_MEMSIZE(upd); return (0); } /* * __wt_upd_alloc_tombstone -- * Allocate a tombstone update. */ static inline int __wt_upd_alloc_tombstone(WT_SESSION_IMPL *session, WT_UPDATE **updp, size_t *sizep) { return (__wt_upd_alloc(session, NULL, WT_UPDATE_TOMBSTONE, updp, sizep)); } /* * __wt_txn_read_upd_list_internal -- * Internal helper function to get the first visible update in a list (or NULL if none are * visible). */ static inline int __wt_txn_read_upd_list_internal(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd, WT_UPDATE **prepare_updp, WT_UPDATE **restored_updp) { WT_VISIBLE_TYPE upd_visible; uint8_t prepare_state, type; if (prepare_updp != NULL) *prepare_updp = NULL; if (restored_updp != NULL) *restored_updp = NULL; __wt_upd_value_clear(cbt->upd_value); for (; upd != NULL; upd = upd->next) { WT_ORDERED_READ(type, upd->type); /* Skip reserved place-holders, they're never visible. */ if (type == WT_UPDATE_RESERVE) continue; WT_ORDERED_READ(prepare_state, upd->prepare_state); /* * If the cursor is configured to ignore tombstones, copy the timestamps from the tombstones * to the stop time window of the update value being returned to the caller. Caller can * process the stop time window to decide if there was a tombstone on the update chain. If * the time window already has a stop time set then we must have seen a tombstone prior to * ours in the update list, and therefore don't need to do this again. 
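     *
     * For example (hypothetical values), a tombstone committed by transaction 5 at timestamp 20
     * with durable timestamp 20 is recorded below as stop_txn 5, stop_ts 20 and durable_stop_ts 20
     * in the returned value's time window.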
*/ if (type == WT_UPDATE_TOMBSTONE && F_ISSET(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE) && !WT_TIME_WINDOW_HAS_STOP(&cbt->upd_value->tw)) { cbt->upd_value->tw.durable_stop_ts = upd->durable_ts; cbt->upd_value->tw.stop_ts = upd->start_ts; cbt->upd_value->tw.stop_txn = upd->txnid; cbt->upd_value->tw.prepare = prepare_state == WT_PREPARE_INPROGRESS || prepare_state == WT_PREPARE_LOCKED; continue; } upd_visible = __wt_txn_upd_visible_type(session, upd); if (upd_visible == WT_VISIBLE_TRUE) break; /* * Save the prepared update to help us detect if we race with prepared commit or rollback * irrespective of update visibility. */ if ((prepare_state == WT_PREPARE_INPROGRESS || prepare_state == WT_PREPARE_LOCKED) && prepare_updp != NULL && *prepare_updp == NULL && F_ISSET(upd, WT_UPDATE_PREPARE_RESTORED_FROM_DS)) *prepare_updp = upd; /* * Save the restored update to use it as base value update in case if we need to reach * history store instead of on-disk value. */ if (upd->txnid != WT_TXN_ABORTED && restored_updp != NULL && F_ISSET(upd, WT_UPDATE_RESTORED_FROM_HS) && type == WT_UPDATE_STANDARD) { WT_ASSERT(session, *restored_updp == NULL); *restored_updp = upd; } if (upd_visible == WT_VISIBLE_PREPARE) { /* Ignore the prepared update, if transaction configuration says so. */ if (F_ISSET(session->txn, WT_TXN_IGNORE_PREPARE)) continue; return (WT_PREPARE_CONFLICT); } } if (upd == NULL) return (0); /* * Now assign to the update value. If it's not a modify, we're free to simply point the value at * the update's memory without owning it. If it is a modify, we need to reconstruct the full * update now and make the value own the buffer. * * If the caller has specifically asked us to skip assigning the buffer, we shouldn't bother * reconstructing the modify. */ if (upd->type != WT_UPDATE_MODIFY || cbt->upd_value->skip_buf) __wt_upd_value_assign(cbt->upd_value, upd); else WT_RET(__wt_modify_reconstruct_from_upd_list(session, cbt, upd, cbt->upd_value)); return (0); } /* * __wt_txn_read_upd_list -- * Get the first visible update in a list (or NULL if none are visible). */ static inline int __wt_txn_read_upd_list(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) { return (__wt_txn_read_upd_list_internal(session, cbt, upd, NULL, NULL)); } /* * __wt_txn_read -- * Get the first visible update in a chain. This function will first check the update list * supplied as a function argument. If there is no visible update, it will check the onpage * value for the given key. Finally, if the onpage value is not visible to the reader, the * function will search the history store for a visible update. */ static inline int __wt_txn_read( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *key, uint64_t recno, WT_UPDATE *upd) { WT_DECL_RET; WT_TIME_WINDOW tw; WT_UPDATE *prepare_upd, *restored_upd; bool have_stop_tw, prepare_retry, read_onpage; prepare_upd = restored_upd = NULL; read_onpage = prepare_retry = true; retry: WT_RET(__wt_txn_read_upd_list_internal(session, cbt, upd, &prepare_upd, &restored_upd)); if (WT_UPDATE_DATA_VALUE(cbt->upd_value) || (cbt->upd_value->type == WT_UPDATE_MODIFY && cbt->upd_value->skip_buf)) return (0); WT_ASSERT(session, cbt->upd_value->type == WT_UPDATE_INVALID); /* If there is no ondisk value, there can't be anything in the history store either. */ if (cbt->ref->page->dsk == NULL) { cbt->upd_value->type = WT_UPDATE_TOMBSTONE; return (0); } /* * Skip retrieving the on-disk value when there exists a restored update from history store in * the update list. 
Having a restored update as part of the update list indicates that the * existing on-disk value is unstable. */ if (restored_upd != NULL) { WT_ASSERT(session, !WT_IS_HS(session->dhandle)); cbt->upd_value->buf.data = restored_upd->data; cbt->upd_value->buf.size = restored_upd->size; } else { /* * When we inspected the update list we may have seen a tombstone leaving us with a valid * stop time window, we don't want to overwrite this stop time window. */ have_stop_tw = WT_TIME_WINDOW_HAS_STOP(&cbt->upd_value->tw); if (read_onpage) { /* * We may have raced with checkpoint freeing the overflow blocks. Retry from start and * ignore the onpage value the next time. For pages that have remained in memory after a * checkpoint, this will lead us to read every key with an overflow removed onpage value * twice. However, it simplifies the logic and doesn't depend on the assumption that the * cell unpacking code will always return a correct time window even it returns a * WT_RESTART error. */ ret = __wt_value_return_buf(cbt, cbt->ref, &cbt->upd_value->buf, &tw); if (ret == WT_RESTART) { read_onpage = false; goto retry; } else WT_RET(ret); /* * If the stop time point is set, that means that there is a tombstone at that time. If * it is not prepared and it is visible to our txn it means we've just spotted a * tombstone and should return "not found", except scanning the history store during * rollback to stable and when we are told to ignore non-globally visible tombstones. */ if (!have_stop_tw && __wt_txn_tw_stop_visible(session, &tw) && !F_ISSET(&cbt->iface, WT_CURSTD_IGNORE_TOMBSTONE)) { cbt->upd_value->buf.data = NULL; cbt->upd_value->buf.size = 0; cbt->upd_value->type = WT_UPDATE_TOMBSTONE; WT_TIME_WINDOW_COPY_STOP(&cbt->upd_value->tw, &tw); return (0); } /* Store the stop time pair of the history store record that is returning. */ if (!have_stop_tw && WT_TIME_WINDOW_HAS_STOP(&tw) && WT_IS_HS(session->dhandle)) WT_TIME_WINDOW_COPY_STOP(&cbt->upd_value->tw, &tw); /* * We return the onpage value in the following cases: * 1. The record is from the history store. * 2. It is visible to the reader. */ if (WT_IS_HS(session->dhandle) || __wt_txn_tw_start_visible(session, &tw)) { if (cbt->upd_value->skip_buf) { cbt->upd_value->buf.data = NULL; cbt->upd_value->buf.size = 0; } cbt->upd_value->type = WT_UPDATE_STANDARD; WT_TIME_WINDOW_COPY_START(&cbt->upd_value->tw, &tw); return (0); } } } /* If there's no visible update in the update chain or ondisk, check the history store file. */ if (F_ISSET(S2C(session), WT_CONN_HS_OPEN) && !F_ISSET(session->dhandle, WT_DHANDLE_HS)) { __wt_timing_stress(session, WT_TIMING_STRESS_HS_SEARCH, NULL); WT_RET(__wt_hs_find_upd(session, S2BT(session)->id, key, cbt->iface.value_format, recno, cbt->upd_value, &cbt->upd_value->buf)); } /* * Retry if we race with prepared commit or rollback. If we race with prepared rollback, the * value the reader should read may have been removed from the history store and appended to the * data store. If we race with prepared commit, imagine a case we read with timestamp 50 and we * have a prepared update with timestamp 30 and a history store record with timestamp 20, * committing the prepared update will cause the stop timestamp of the history store record * being updated to 30 and the reader not seeing it. 
     */
    if (prepare_upd != NULL) {
        WT_ASSERT(session, F_ISSET(prepare_upd, WT_UPDATE_PREPARE_RESTORED_FROM_DS));
        if (prepare_retry &&
          (prepare_upd->txnid == WT_TXN_ABORTED ||
            prepare_upd->prepare_state == WT_PREPARE_RESOLVED)) {
            prepare_retry = false;
            /* Clean out any stale value before performing the retry. */
            __wt_upd_value_clear(cbt->upd_value);
            WT_STAT_CONN_DATA_INCR(session, txn_read_race_prepare_update);

            /*
             * When a prepared update/insert is rolled back or committed, retrying the read fixes
             * the concurrent modification of the prepared update. Except for a prepared insert
             * that is rolled back, in all other cases the history store update is either appended
             * to the end of the update chain or modified to set the proper stop timestamp. In all
             * of these scenarios, retrying will return a proper update.
             */
            goto retry;
        }
    }

    /* Return WT_UPDATE_INVALID, not a tombstone, if nothing is found in the history store. */
    WT_ASSERT(session, cbt->upd_value->type != WT_UPDATE_TOMBSTONE);
    return (0);
}

/*
 * __wt_txn_begin --
 *     Begin a transaction.
 */
static inline int
__wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
{
    WT_TXN *txn;

    txn = session->txn;
    txn->isolation = session->isolation;
    txn->txn_logsync = S2C(session)->txn_logsync;
    txn->commit_timestamp = WT_TS_NONE;
    txn->first_commit_timestamp = WT_TS_NONE;

    WT_ASSERT(session, !F_ISSET(txn, WT_TXN_RUNNING));

    WT_RET(__wt_txn_config(session, cfg));

    /*
     * Allocate a snapshot if required or update the existing snapshot. Do not update the existing
     * snapshot of autocommit transactions because they are committed at the end of the operation.
     */
    if (txn->isolation == WT_ISO_SNAPSHOT &&
      !(F_ISSET(txn, WT_TXN_AUTOCOMMIT) && F_ISSET(txn, WT_TXN_HAS_SNAPSHOT))) {
        if (session->ncursors > 0)
            WT_RET(__wt_session_copy_values(session));

        /*
         * Stall here if the cache is completely full. The eviction check can return WT_ROLLBACK,
         * but the WT_SESSION.begin_transaction API can't, so continue on.
         */
        WT_RET_ERROR_OK(__wt_cache_eviction_check(session, false, true, NULL), WT_ROLLBACK);

        __wt_txn_get_snapshot(session);
    }

    F_SET(txn, WT_TXN_RUNNING);
    if (F_ISSET(S2C(session), WT_CONN_READONLY))
        F_SET(txn, WT_TXN_READONLY);

    return (0);
}

/*
 * __wt_txn_autocommit_check --
 *     If an auto-commit transaction is required, start one.
 */
static inline int
__wt_txn_autocommit_check(WT_SESSION_IMPL *session)
{
    WT_DECL_RET;
    WT_TXN *txn;

    txn = session->txn;
    if (F_ISSET(txn, WT_TXN_AUTOCOMMIT)) {
        ret = __wt_txn_begin(session, NULL);
        F_CLR(txn, WT_TXN_AUTOCOMMIT);
    }
    return (ret);
}

/*
 * __wt_txn_idle_cache_check --
 *     If there is no transaction active in this thread and we haven't checked if the cache is
 *     full, do it now. If we have to block for eviction, this is the best time to do it.
 */
static inline int
__wt_txn_idle_cache_check(WT_SESSION_IMPL *session)
{
    WT_TXN *txn;
    WT_TXN_SHARED *txn_shared;

    txn = session->txn;
    txn_shared = WT_SESSION_TXN_SHARED(session);

    /*
     * Check the published snap_min because read-uncommitted never sets WT_TXN_HAS_SNAPSHOT. We
     * don't have any transaction information at this point, so assume the transaction will be
     * read-only. The dirty cache check will be performed when the transaction completes, if
     * necessary.
     */
    if (F_ISSET(txn, WT_TXN_RUNNING) && !F_ISSET(txn, WT_TXN_HAS_ID) &&
      txn_shared->pinned_id == WT_TXN_NONE)
        WT_RET(__wt_cache_eviction_check(session, false, true, NULL));

    return (0);
}

/*
 * __wt_txn_id_alloc --
 *     Allocate a new transaction ID.
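 *
 *     A worked example of the post-increment adjustment in the body (hypothetical IDs): if the
 *     global current ID is 10, __wt_atomic_addv64(&txn_global->current, 1) returns 11, so the
 *     allocated ID is 11 - 1 = 10 and the global current ID now leads it at 11.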
 */
static inline uint64_t
__wt_txn_id_alloc(WT_SESSION_IMPL *session, bool publish)
{
    WT_TXN_GLOBAL *txn_global;
    WT_TXN_SHARED *txn_shared;
    uint64_t id;

    txn_global = &S2C(session)->txn_global;
    txn_shared = WT_SESSION_TXN_SHARED(session);

    /*
     * Allocating transaction IDs involves several steps.
     *
     * First, publish that this transaction is allocating its ID, then publish the current global
     * ID as this transaction's ID. Note that this ID might not be unique among threads and hence
     * not valid at this moment. The flag tells other transactions that are building a snapshot to
     * retry when they encounter this transaction's ID.
     *
     * Then we do an atomic increment to allocate a unique ID. This gives this transaction its
     * valid ID, which we publish to the global transaction table.
     *
     * We want the global value to lead the allocated values, so that any allocated transaction ID
     * eventually becomes globally visible. When there are no transactions running, the oldest_id
     * will reach the global current ID, so we want post-increment semantics. Our atomic add
     * primitive does pre-increment, so adjust the result here.
     *
     * We rely on atomic reads of the current ID to create snapshots, so for unlocked reads to be
     * well defined, we must use an atomic increment here.
     */
    if (publish) {
        WT_PUBLISH(txn_shared->is_allocating, true);
        WT_PUBLISH(txn_shared->id, txn_global->current);
        id = __wt_atomic_addv64(&txn_global->current, 1) - 1;
        session->txn->id = id;
        WT_PUBLISH(txn_shared->id, id);
        WT_PUBLISH(txn_shared->is_allocating, false);
    } else
        id = __wt_atomic_addv64(&txn_global->current, 1) - 1;
    return (id);
}

/*
 * __wt_txn_id_check --
 *     A transaction is going to do an update, allocate a transaction ID.
 */
static inline int
__wt_txn_id_check(WT_SESSION_IMPL *session)
{
    WT_TXN *txn;

    txn = session->txn;

    WT_ASSERT(session, F_ISSET(txn, WT_TXN_RUNNING));

    if (F_ISSET(txn, WT_TXN_HAS_ID))
        return (0);

    /*
     * Return an error when a transaction with read-committed or read-uncommitted isolation tries
     * to perform a write operation. Don't return an error for updates on the metadata because it
     * uses special transaction visibility rules: searches and updates on the metadata happen in
     * read-uncommitted and read-committed isolation.
     */
    if (session->dhandle != NULL && !WT_IS_METADATA(session->dhandle) &&
      (txn->isolation == WT_ISO_READ_COMMITTED || txn->isolation == WT_ISO_READ_UNCOMMITTED)) {
        WT_ASSERT(session, !F_ISSET(session, WT_SESSION_INTERNAL));
        WT_RET_MSG(session, ENOTSUP,
          "write operations are not supported in read-committed or read-uncommitted transactions.");
    }

    /* If the transaction is idle, check that the cache isn't full. */
    WT_RET(__wt_txn_idle_cache_check(session));

    WT_IGNORE_RET(__wt_txn_id_alloc(session, true));

    /*
     * If we have exhausted the 64-bit space of transaction IDs, there is nothing more we can do.
     */
    if (txn->id == WT_TXN_ABORTED)
        WT_RET_MSG(session, WT_ERROR, "out of transaction IDs");
    F_SET(txn, WT_TXN_HAS_ID);

    return (0);
}

/*
 * __wt_txn_search_check --
 *     Check if a search by the current transaction violates timestamp rules.
 */
static inline int
__wt_txn_search_check(WT_SESSION_IMPL *session)
{
    WT_TXN *txn;
    uint16_t flags;
    const char *name;

    txn = session->txn;
    flags = session->dhandle->ts_flags;
    name = session->dhandle->name;

    /* Timestamps are ignored on logged files. */
    if (F_ISSET(S2BT(session), WT_BTREE_LOGGED))
        return (0);

    /* Skip checks during recovery. */
    if (F_ISSET(S2C(session), WT_CONN_RECOVERING))
        return (0);

    /*
     * Verify whether the table requires that a read timestamp always or never be used.
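     *
     * These flags are assumed to come from the table's timestamp-usage configuration and are
     * cached in the data handle's ts_flags; the exact configuration syntax isn't shown here.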
*/ if (LF_ISSET(WT_DHANDLE_TS_ASSERT_READ_ALWAYS) && !F_ISSET(txn, WT_TXN_SHARED_TS_READ)) { __wt_err(session, EINVAL, "%s: " WT_TS_VERBOSE_PREFIX "read timestamps required and none set", name); #ifdef HAVE_DIAGNOSTIC __wt_abort(session); #endif return (EINVAL); } if (LF_ISSET(WT_DHANDLE_TS_ASSERT_READ_NEVER) && F_ISSET(txn, WT_TXN_SHARED_TS_READ)) { __wt_err(session, EINVAL, "%s: " WT_TS_VERBOSE_PREFIX "read timestamps disallowed and one set", name); #ifdef HAVE_DIAGNOSTIC __wt_abort(session); #endif return (EINVAL); } return (0); } /* * __wt_txn_modify_block -- * Check if the current transaction can modify an item. */ static inline int __wt_txn_modify_block( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd, wt_timestamp_t *prev_tsp) { WT_DECL_ITEM(buf); WT_DECL_RET; WT_TIME_WINDOW tw; WT_TXN *txn; uint32_t snap_count; char ts_string[WT_TS_INT_STRING_SIZE]; bool ignore_prepare_set, rollback, tw_found; rollback = tw_found = false; txn = session->txn; /* * Always include prepared transactions in this check: they are not supposed to affect * visibility for update operations. */ ignore_prepare_set = F_ISSET(txn, WT_TXN_IGNORE_PREPARE); F_CLR(txn, WT_TXN_IGNORE_PREPARE); for (; upd != NULL && !__wt_txn_upd_visible(session, upd); upd = upd->next) { if (upd->txnid != WT_TXN_ABORTED) { __wt_verbose_debug1(session, WT_VERB_TRANSACTION, "Conflict with update with txn id %" PRIu64 " at timestamp: %s", upd->txnid, __wt_timestamp_to_string(upd->start_ts, ts_string)); rollback = true; break; } } WT_ASSERT(session, upd != NULL || !rollback); /* * Check conflict against any on-page value if there is no update on the update chain except * aborted updates. Otherwise, we would have either already detected a conflict if we saw an * uncommitted update or determined that it would be safe to write if we saw a committed update. * * In the case of row-store we also need to check that the insert list is empty as the existence * of it implies there is no on disk value for the given key. However we can still get a * time-window from an unrelated on-disk value if we are not careful as the slot can still be * set on the cursor b-tree. */ if (!rollback && upd == NULL && (CUR2BT(cbt)->type != BTREE_ROW || cbt->ins == NULL)) { tw_found = __wt_read_cell_time_window(cbt, &tw); if (tw_found) { if (WT_TIME_WINDOW_HAS_STOP(&tw)) { rollback = !__wt_txn_tw_stop_visible(session, &tw); if (rollback) __wt_verbose_debug1(session, WT_VERB_TRANSACTION, "Conflict with update %" PRIu64 " at stop timestamp: %s", tw.stop_txn, __wt_timestamp_to_string(tw.stop_ts, ts_string)); } else { rollback = !__wt_txn_tw_start_visible(session, &tw); if (rollback) __wt_verbose_debug1(session, WT_VERB_TRANSACTION, "Conflict with update %" PRIu64 " at start timestamp: %s", tw.start_txn, __wt_timestamp_to_string(tw.start_ts, ts_string)); } } } if (rollback) { /* Dump information about the txn snapshot. 
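         *
         * The message built below looks like this (illustrative values):
         *	snapshot_min=100, snapshot_max=120, snapshot_count=2, snapshots=[103,111]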
*/ if (WT_VERBOSE_LEVEL_ISSET(session, WT_VERB_TRANSACTION, WT_VERBOSE_DEBUG_1)) { WT_ERR(__wt_scr_alloc(session, 1024, &buf)); WT_ERR(__wt_buf_fmt(session, buf, "snapshot_min=%" PRIu64 ", snapshot_max=%" PRIu64 ", snapshot_count=%" PRIu32, txn->snap_min, txn->snap_max, txn->snapshot_count)); if (txn->snapshot_count > 0) { WT_ERR(__wt_buf_catfmt(session, buf, ", snapshots=[")); for (snap_count = 0; snap_count < txn->snapshot_count - 1; ++snap_count) WT_ERR( __wt_buf_catfmt(session, buf, "%" PRIu64 ",", txn->snapshot[snap_count])); WT_ERR(__wt_buf_catfmt(session, buf, "%" PRIu64 "]", txn->snapshot[snap_count])); } __wt_verbose_debug1(session, WT_VERB_TRANSACTION, "%s", (const char *)buf->data); } WT_STAT_CONN_DATA_INCR(session, txn_update_conflict); ret = __wt_txn_rollback_required(session, WT_TXN_ROLLBACK_REASON_CONFLICT); } /* * Don't access the update from an uncommitted transaction as it can produce wrong timestamp * results. */ if (!rollback && prev_tsp != NULL) { if (upd != NULL) { /* * The durable timestamp must be greater than or equal to the commit timestamp unless it * is an in-progress prepared update. */ WT_ASSERT(session, upd->durable_ts >= upd->start_ts || upd->prepare_state == WT_PREPARE_INPROGRESS); *prev_tsp = upd->durable_ts; } else if (tw_found) *prev_tsp = WT_TIME_WINDOW_HAS_STOP(&tw) ? tw.durable_stop_ts : tw.durable_start_ts; } if (ignore_prepare_set) F_SET(txn, WT_TXN_IGNORE_PREPARE); err: __wt_scr_free(session, &buf); return (ret); } /* * __wt_txn_modify_check -- * Check if the current transaction can modify an item. */ static inline int __wt_txn_modify_check(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd, wt_timestamp_t *prev_tsp, u_int modify_type) { WT_TXN *txn; WT_TXN_GLOBAL *txn_global; txn = session->txn; /* * Check if this operation is permitted, skipping if transaction isolation is not snapshot or * operating on the metadata table. */ if (txn->isolation == WT_ISO_SNAPSHOT && !WT_IS_METADATA(cbt->dhandle)) WT_RET(__wt_txn_modify_block(session, cbt, upd, prev_tsp)); /* * Prepending a tombstone to another tombstone indicates remove of a non-existent key and that * isn't permitted, return a WT_NOTFOUND error. */ if (modify_type == WT_UPDATE_TOMBSTONE) { /* Loop until a valid update is found. */ while (upd != NULL && upd->txnid == WT_TXN_ABORTED) upd = upd->next; if (upd != NULL && upd->type == WT_UPDATE_TOMBSTONE) return (WT_NOTFOUND); } /* Everything is OK, optionally rollback for testing (skipping metadata operations). */ if (!WT_IS_METADATA(cbt->dhandle)) { txn_global = &S2C(session)->txn_global; if (txn_global->debug_rollback != 0 && ++txn_global->debug_ops % txn_global->debug_rollback == 0) return (__wt_txn_rollback_required(session, "debug mode simulated conflict")); } return (0); } /* * __wt_txn_read_last -- * Called when the last page for a session is released. */ static inline void __wt_txn_read_last(WT_SESSION_IMPL *session) { WT_TXN *txn; txn = session->txn; /* * Release the snap_min ID we put in the global table. * * If the isolation has been temporarily forced, don't touch the snapshot here: it will be * restored by WT_WITH_TXN_ISOLATION. */ if ((!F_ISSET(txn, WT_TXN_RUNNING) || txn->isolation != WT_ISO_SNAPSHOT) && txn->forced_iso == 0) __wt_txn_release_snapshot(session); } /* * __wt_txn_cursor_op -- * Called for each cursor operation. 
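 *
 *     For read-uncommitted transactions this pins the last-running transaction ID rather than
 *     taking a snapshot; all other isolation levels obtain a snapshot here if they don't already
 *     have one.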
*/ static inline void __wt_txn_cursor_op(WT_SESSION_IMPL *session) { WT_TXN *txn; WT_TXN_GLOBAL *txn_global; WT_TXN_SHARED *txn_shared; txn = session->txn; txn_global = &S2C(session)->txn_global; txn_shared = WT_SESSION_TXN_SHARED(session); /* * We are about to read data, which means we need to protect against * updates being freed from underneath this cursor. Read-uncommitted * isolation protects values by putting a transaction ID in the global * table to prevent any update that we are reading from being freed. * Other isolation levels get a snapshot to protect their reads. * * !!! * Note: We are updating the global table unprotected, so the global * oldest_id may move past our snap_min if a scan races with this value * being published. That said, read-uncommitted operations always see * the most recent update for each record that has not been aborted * regardless of the snap_min value published here. Even if there is a * race while publishing this ID, it prevents the oldest ID from moving * further forward, so that once a read-uncommitted cursor is * positioned on a value, it can't be freed. */ if (txn->isolation == WT_ISO_READ_UNCOMMITTED) { if (txn_shared->pinned_id == WT_TXN_NONE) txn_shared->pinned_id = txn_global->last_running; if (txn_shared->metadata_pinned == WT_TXN_NONE) txn_shared->metadata_pinned = txn_shared->pinned_id; } else if (!F_ISSET(txn, WT_TXN_HAS_SNAPSHOT)) __wt_txn_get_snapshot(session); } /* * __wt_txn_activity_check -- * Check whether there are any running transactions. */ static inline int __wt_txn_activity_check(WT_SESSION_IMPL *session, bool *txn_active) { WT_TXN_GLOBAL *txn_global; txn_global = &S2C(session)->txn_global; /* * Default to true - callers shouldn't rely on this if an error is returned, but let's give them * deterministic behavior if they do. */ *txn_active = true; /* * Ensure the oldest ID is as up to date as possible so we can use a simple check to find if * there are any running transactions. */ WT_RET(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT)); *txn_active = (txn_global->oldest_id != txn_global->current || txn_global->metadata_pinned != txn_global->current); return (0); } /* * __wt_upd_value_assign -- * Point an update value at a given update. We're specifically not getting the value to own the * memory since this exists in an update list somewhere. */ static inline void __wt_upd_value_assign(WT_UPDATE_VALUE *upd_value, WT_UPDATE *upd) { if (!upd_value->skip_buf) { upd_value->buf.data = upd->data; upd_value->buf.size = upd->size; } if (upd->type == WT_UPDATE_TOMBSTONE) { upd_value->tw.durable_stop_ts = upd->durable_ts; upd_value->tw.stop_ts = upd->start_ts; upd_value->tw.stop_txn = upd->txnid; upd_value->tw.prepare = upd->prepare_state == WT_PREPARE_INPROGRESS || upd->prepare_state == WT_PREPARE_LOCKED; } else { upd_value->tw.durable_start_ts = upd->durable_ts; upd_value->tw.start_ts = upd->start_ts; upd_value->tw.start_txn = upd->txnid; upd_value->tw.prepare = upd->prepare_state == WT_PREPARE_INPROGRESS || upd->prepare_state == WT_PREPARE_LOCKED; } upd_value->type = upd->type; } /* * __wt_upd_value_clear -- * Clear an update value to its defaults. */ static inline void __wt_upd_value_clear(WT_UPDATE_VALUE *upd_value) { /* * Make sure we don't touch the memory pointers here. If we have some allocated memory, that * could come in handy next time we need to write to the buffer. */ upd_value->buf.data = NULL; upd_value->buf.size = 0; WT_TIME_WINDOW_INIT(&upd_value->tw); upd_value->type = WT_UPDATE_INVALID; }
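
/*
 * __txn_example_first_visible --
 *     Example only, not part of the original file: a minimal sketch of how the visibility helpers
 *     above are typically combined when scanning an update chain. "upd" is the head of a
 *     hypothetical update list; error handling, prepare-conflict handling and history store
 *     lookups (see __wt_txn_read) are deliberately omitted.
 */
static inline WT_UPDATE *
__txn_example_first_visible(WT_SESSION_IMPL *session, WT_UPDATE *upd)
{
    /* Walk the chain newest-to-oldest until we find the first update this transaction can see. */
    for (; upd != NULL; upd = upd->next) {
        /* Aborted updates and reserved placeholders are never visible to anyone. */
        if (upd->txnid == WT_TXN_ABORTED || upd->type == WT_UPDATE_RESERVE)
            continue;

        /* The wrapper treats prepared-but-unresolved updates as not visible. */
        if (__wt_txn_upd_visible(session, upd))
            return (upd);
    }
    return (NULL);
}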