/*-
 * Public Domain 2014-present MongoDB, Inc.
 * Public Domain 2008-2014 WiredTiger, Inc.
 *
 * This is free and unencumbered software released into the public domain.
 *
 * Anyone is free to copy, modify, publish, use, compile, sell, or
 * distribute this software, either in source code form or as a compiled
 * binary, for any purpose, commercial or non-commercial, and by any
 * means.
 *
 * In jurisdictions that recognize copyright laws, the author or authors
 * of this software dedicate any and all copyright interest in the
 * software to the public domain. We make this dedication for the benefit
 * of the public at large and to the detriment of our heirs and
 * successors. We intend this dedication to be an overt act of
 * relinquishment in perpetuity of all present and future rights to this
 * software under copyright law.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "format.h"

/*
 * snap_init --
 *     Initialize the repeatable operation tracking.
 */
void
snap_init(TINFO *tinfo)
{
    /*
     * We maintain two snap lists, where the current one is indicated by tinfo->s, and keeps the
     * most recent operations.
     *
     * The other one is used when we are running timestamp transactions with rollback_to_stable.
     * When each thread notices that the stable timestamp has changed, it stashes the current snap
     * list and starts fresh with the other snap list. After we've completed a rollback_to_stable,
     * we can use the secondary snap list to see the state of keys/values seen and updated at the
     * time of the rollback.
     */
    if (g.c_txn_timestamps) {
        tinfo->s = &tinfo->snap_states[1];
        tinfo->snap_list = dcalloc(SNAP_LIST_SIZE, sizeof(SNAP_OPS));
        tinfo->snap_end = &tinfo->snap_list[SNAP_LIST_SIZE];
    }
    tinfo->s = &tinfo->snap_states[0];
    tinfo->snap_list = dcalloc(SNAP_LIST_SIZE, sizeof(SNAP_OPS));
    tinfo->snap_end = &tinfo->snap_list[SNAP_LIST_SIZE];
    tinfo->snap_current = tinfo->snap_list;
}

/*
 * snap_teardown --
 *     Tear down the repeatable operation tracking structures.
 */
void
snap_teardown(TINFO *tinfo)
{
    SNAP_OPS *snaplist;
    u_int i, snap_index;

    for (snap_index = 0; snap_index < WT_ELEMENTS(tinfo->snap_states); snap_index++)
        if ((snaplist = tinfo->snap_states[snap_index].snap_state_list) != NULL) {
            for (i = 0; i < SNAP_LIST_SIZE; ++i) {
                free(snaplist[i].kdata);
                free(snaplist[i].vdata);
            }
            free(snaplist);
        }
}

/*
 * snap_clear_one --
 *     Clear a single snap entry.
 */
static void
snap_clear_one(SNAP_OPS *snap)
{
    snap->repeatable = false;
}

/*
 * snap_clear --
 *     Clear the snap list.
 */
static void
snap_clear(TINFO *tinfo)
{
    SNAP_OPS *snap;

    for (snap = tinfo->snap_list; snap < tinfo->snap_end; ++snap)
        snap_clear_one(snap);
}

/*
 * snap_op_init --
 *     Initialize the repeatable operation tracking for each new operation.
 */
void
snap_op_init(TINFO *tinfo, uint64_t read_ts, bool repeatable_reads)
{
    uint64_t stable_ts;

    ++tinfo->opid;

    if (g.c_txn_timestamps) {
        /*
         * If the stable timestamp has changed and we've advanced beyond it, preserve the current
         * snapshot history up to this point; we'll use it to verify rollback_to_stable. Switch
         * our tracking to the other snap list.
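         * (snap_repeat_rollback later walks both lists, replaying any stashed entries still at
         * or before the stable timestamp.)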
         */
        stable_ts = __wt_atomic_addv64(&g.stable_timestamp, 0);
        if (stable_ts != tinfo->stable_ts && read_ts > stable_ts) {
            tinfo->stable_ts = stable_ts;
            if (tinfo->s == &tinfo->snap_states[0])
                tinfo->s = &tinfo->snap_states[1];
            else
                tinfo->s = &tinfo->snap_states[0];
            tinfo->snap_current = tinfo->snap_list;

            /* Clear out older info from the snap list. */
            snap_clear(tinfo);
        }
    }

    tinfo->snap_first = tinfo->snap_current;

    tinfo->read_ts = read_ts;
    tinfo->repeatable_reads = repeatable_reads;
    tinfo->repeatable_wrap = false;
}

/*
 * snap_track --
 *     Add a single snapshot isolation returned value to the list.
 */
void
snap_track(TINFO *tinfo, thread_op op)
{
    WT_ITEM *ip;
    SNAP_OPS *snap;

    snap = tinfo->snap_current;
    snap->op = op;
    snap->opid = tinfo->opid;
    snap->keyno = tinfo->keyno;
    snap->ts = WT_TS_NONE;
    snap->repeatable = false;
    snap->last = op == TRUNCATE ? tinfo->last : 0;
    snap->ksize = snap->vsize = 0;

    if (op == INSERT && g.type == ROW) {
        ip = tinfo->key;
        if (snap->kmemsize < ip->size) {
            snap->kdata = drealloc(snap->kdata, ip->size);
            snap->kmemsize = ip->size;
        }
        snap->ksize = ip->size;
        if (ip->size > 0)
            memcpy(snap->kdata, ip->data, ip->size);
    }

    if (op != REMOVE && op != TRUNCATE) {
        ip = tinfo->value;
        if (snap->vmemsize < ip->size) {
            snap->vdata = drealloc(snap->vdata, ip->size);
            snap->vmemsize = ip->size;
        }
        snap->vsize = ip->size;
        if (ip->size > 0)
            memcpy(snap->vdata, ip->data, ip->size);
    }

    /* Move to the next slot, wrap at the end of the circular buffer. */
    if (++tinfo->snap_current >= tinfo->snap_end)
        tinfo->snap_current = tinfo->snap_list;

    /*
     * It's possible to pass this transaction's buffer starting point and start replacing our own
     * entries. If that happens, we can't repeat operations because we don't know which ones were
     * previously modified.
     */
    if (tinfo->snap_current->opid == tinfo->opid)
        tinfo->repeatable_wrap = true;
}

/*
 * print_item_data --
 *     Display a single data/size pair, with a tag.
 */
static void
print_item_data(const char *tag, const uint8_t *data, size_t size)
{
    WT_ITEM tmp;

    if (g.type == FIX) {
        fprintf(stderr, "%s {0x%02x}\n", tag, data[0]);
        return;
    }

    memset(&tmp, 0, sizeof(tmp));
    testutil_check(__wt_raw_to_esc_hex(NULL, data, size, &tmp));
    fprintf(stderr, "%s {%s}\n", tag, (char *)tmp.mem);
    __wt_buf_free(NULL, &tmp);
}

/*
 * snap_verify --
 *     Repeat a read and verify the contents.
 */
static int
snap_verify(WT_CURSOR *cursor, TINFO *tinfo, SNAP_OPS *snap)
{
    WT_DECL_RET;
    WT_ITEM *key, *value;
    uint64_t keyno;
    uint8_t bitfield;

    testutil_assert(snap->op != TRUNCATE);

    key = tinfo->key;
    value = tinfo->value;
    keyno = snap->keyno;

    /*
     * Retrieve the key/value pair by key. Row-store inserts have a unique generated key we saved,
     * else generate the key from the key number.
     */
    if (snap->op == INSERT && g.type == ROW) {
        key->data = snap->kdata;
        key->size = snap->ksize;
        cursor->set_key(cursor, key);
    } else {
        switch (g.type) {
        case FIX:
        case VAR:
            cursor->set_key(cursor, keyno);
            break;
        case ROW:
            key_gen(key, keyno);
            cursor->set_key(cursor, key);
            break;
        }
    }

    switch (ret = read_op(cursor, SEARCH, NULL)) {
    case 0:
        if (g.type == FIX) {
            testutil_check(cursor->get_value(cursor, &bitfield));
            *(uint8_t *)(value->data) = bitfield;
            value->size = 1;
        } else
            testutil_check(cursor->get_value(cursor, value));
        break;
    case WT_NOTFOUND:
        break;
    default:
        return (ret);
    }

    /*
     * Check for simple matches.
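     * A simple match is an identical value for anything other than a remove, or a not-found
     * return for a remove; anything else falls through to the fixed-length special cases and the
     * mismatch report below.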
     */
    if (ret == 0 && snap->op != REMOVE && value->size == snap->vsize) {
        if (value->size == 0)
            return (0);
        if ((value->size > 0) && (memcmp(value->data, snap->vdata, value->size) == 0))
            return (0);
    }
    if (ret == WT_NOTFOUND && snap->op == REMOVE)
        return (0);

    /*
     * In fixed length stores, zero values at the end of the key space are returned as not-found,
     * and not-found row reads are saved as zero values. Map back-and-forth for simplicity.
     */
    if (g.type == FIX) {
        if (ret == WT_NOTFOUND && snap->vsize == 1 && *(uint8_t *)snap->vdata == 0)
            return (0);
        if (snap->op == REMOVE && value->size == 1 && *(uint8_t *)value->data == 0)
            return (0);
    }

    /* Things went pear-shaped. */
    switch (g.type) {
    case FIX:
        fprintf(stderr,
          "snapshot-isolation: %" PRIu64 " search: expected {0x%02x}, found {0x%02x}\n", keyno,
          snap->op == REMOVE ? 0U : *(uint8_t *)snap->vdata,
          ret == WT_NOTFOUND ? 0U : *(uint8_t *)value->data);
        break;
    case ROW:
        fprintf(
          stderr, "snapshot-isolation %.*s search mismatch\n", (int)key->size, (char *)key->data);

        if (snap->op == REMOVE)
            fprintf(stderr, "expected {deleted}\n");
        else
            print_item_data("expected", snap->vdata, snap->vsize);
        if (ret == WT_NOTFOUND)
            fprintf(stderr, " found {deleted}\n");
        else
            print_item_data(" found", value->data, value->size);
        break;
    case VAR:
        fprintf(stderr, "snapshot-isolation %" PRIu64 " search mismatch\n", keyno);

        if (snap->op == REMOVE)
            fprintf(stderr, "expected {deleted}\n");
        else
            print_item_data("expected", snap->vdata, snap->vsize);
        if (ret == WT_NOTFOUND)
            fprintf(stderr, " found {deleted}\n");
        else
            print_item_data(" found", value->data, value->size);
        break;
    }

    g.page_dump_cursor = cursor;
    testutil_assert(0);

    /* NOTREACHED */
    return (1);
}

/*
 * snap_ts_clear --
 *     Clear snapshots at or before a specified timestamp.
 */
static void
snap_ts_clear(TINFO *tinfo, uint64_t ts)
{
    SNAP_OPS *snap;

    /* Check from the first slot to the last. */
    for (snap = tinfo->snap_list; snap < tinfo->snap_end; ++snap)
        if (snap->repeatable && snap->ts <= ts)
            snap->repeatable = false;
}

/*
 * snap_repeat_ok_match --
 *     Compare two operations and see if they modified the same record.
 */
static bool
snap_repeat_ok_match(SNAP_OPS *current, SNAP_OPS *a)
{
    /* Reads are never a problem, there's no modification. */
    if (a->op == READ)
        return (true);

    /* Check for a matching single record modification. */
    if (a->keyno == current->keyno)
        return (false);

    /* Truncates are slightly harder, make sure the ranges don't overlap. */
    if (a->op == TRUNCATE) {
        if (g.c_reverse && (a->keyno == 0 || a->keyno >= current->keyno) &&
          (a->last == 0 || a->last <= current->keyno))
            return (false);
        if (!g.c_reverse && (a->keyno == 0 || a->keyno <= current->keyno) &&
          (a->last == 0 || a->last >= current->keyno))
            return (false);
    }

    return (true);
}

/*
 * snap_repeat_ok_commit --
 *     Return if an operation in the transaction can be repeated, where the transaction isn't yet
 *     committed (so all locks are in place), or has already committed successfully.
 */
static bool
snap_repeat_ok_commit(TINFO *tinfo, SNAP_OPS *current)
{
    SNAP_OPS *p;

    /*
     * Truncates can't be repeated, we don't know the exact range of records that were removed (if
     * any).
     */
    if (current->op == TRUNCATE)
        return (false);

    /*
     * For updates, check for subsequent changes to the record and don't repeat the read. For
     * reads, check for either subsequent or previous changes to the record and don't repeat the
     * read. (The reads are repeatable, but only at the commit timestamp, and the update will do
     * the repeatable read in that case.)
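     *
     * The first loop below scans forward through the rest of this transaction's operations; the
     * second scans backward from the current entry and only runs for reads.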
     */
    for (p = current;;) {
        /* Wrap at the end of the circular buffer. */
        if (++p >= tinfo->snap_end)
            p = tinfo->snap_list;
        if (p->opid != tinfo->opid)
            break;

        if (!snap_repeat_ok_match(current, p))
            return (false);
    }

    if (current->op != READ)
        return (true);
    for (p = current;;) {
        /* Wrap at the beginning of the circular buffer. */
        if (--p < tinfo->snap_list)
            p = &tinfo->snap_list[SNAP_LIST_SIZE - 1];
        if (p->opid != tinfo->opid)
            break;

        if (!snap_repeat_ok_match(current, p))
            return (false);
    }
    return (true);
}

/*
 * snap_repeat_ok_rollback --
 *     Return if an operation in the transaction can be repeated, after a transaction has rolled
 *     back.
 */
static bool
snap_repeat_ok_rollback(TINFO *tinfo, SNAP_OPS *current)
{
    SNAP_OPS *p;

    /* Ignore update operations, they can't be repeated after rollback. */
    if (current->op != READ)
        return (false);

    /*
     * Check for previous changes to the record and don't attempt to repeat the read in that case.
     */
    for (p = current;;) {
        /* Wrap at the beginning of the circular buffer. */
        if (--p < tinfo->snap_list)
            p = &tinfo->snap_list[SNAP_LIST_SIZE - 1];
        if (p->opid != tinfo->opid)
            break;

        if (!snap_repeat_ok_match(current, p))
            return (false);
    }

    return (true);
}

/*
 * snap_repeat_txn --
 *     Repeat each operation done within a snapshot isolation transaction.
 */
int
snap_repeat_txn(WT_CURSOR *cursor, TINFO *tinfo)
{
    SNAP_OPS *current;

    /* If we wrapped the buffer, we can't repeat operations. */
    if (tinfo->repeatable_wrap)
        return (0);

    /* Check from the first operation we saved to the last. */
    for (current = tinfo->snap_first;; ++current) {
        /* Wrap at the end of the circular buffer. */
        if (current >= tinfo->snap_end)
            current = tinfo->snap_list;
        if (current->opid != tinfo->opid)
            break;

        /*
         * The transaction is not yet resolved, so the rules are as if the transaction has
         * committed. Note we are NOT checking if reads are repeatable based on the chosen
         * timestamp. This is because we expect snapshot isolation to work even in the presence of
         * other threads of control committing in our past, until the transaction resolves.
         */
        if (snap_repeat_ok_commit(tinfo, current))
            WT_RET(snap_verify(cursor, tinfo, current));
    }

    return (0);
}

/*
 * snap_repeat_update --
 *     Update the list of snapshot operations based on final transaction resolution.
 */
void
snap_repeat_update(TINFO *tinfo, bool committed)
{
    SNAP_OPS *current;

    /* If we wrapped the buffer, we can't repeat operations. */
    if (tinfo->repeatable_wrap)
        return;

    /* Check from the first operation we saved to the last. */
    for (current = tinfo->snap_first;; ++current) {
        /* Wrap at the end of the circular buffer. */
        if (current >= tinfo->snap_end)
            current = tinfo->snap_list;
        if (current->opid != tinfo->opid)
            break;

        /*
         * First, reads may simply not be repeatable because the read timestamp chosen wasn't
         * older than all concurrently running uncommitted updates.
         */
        if (!tinfo->repeatable_reads && current->op == READ)
            continue;

        /*
         * Second, check based on the transaction resolution (the rules are different if the
         * transaction committed or rolled back).
         */
        current->repeatable = committed ? snap_repeat_ok_commit(tinfo, current) :
                                          snap_repeat_ok_rollback(tinfo, current);

        /*
         * Repeat reads at the transaction's read timestamp and updates at the commit timestamp.
         */
        if (current->repeatable)
            current->ts = current->op == READ ? tinfo->read_ts : tinfo->commit_ts;
    }
}

/*
 * snap_repeat --
 *     Repeat one operation.
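 *     The operation is re-run inside a fresh snapshot transaction pinned at the operation's
 *     saved timestamp, retrying on WT_ROLLBACK.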
 */
static void
snap_repeat(WT_CURSOR *cursor, TINFO *tinfo, SNAP_OPS *snap)
{
    WT_DECL_RET;
    WT_SESSION *session;
#define MAX_RETRY_ON_ROLLBACK 1000
    u_int max_retry;
    char buf[64];

    session = cursor->session;

    trace_op(tinfo, "repeat %" PRIu64 " ts=%" PRIu64 " {%s}", snap->keyno, snap->ts,
      trace_bytes(tinfo, snap->vdata, snap->vsize));

    /* Start a transaction with a read-timestamp and verify the record. */
    testutil_check(__wt_snprintf(buf, sizeof(buf), "read_timestamp=%" PRIx64, snap->ts));

    for (max_retry = 0; max_retry < MAX_RETRY_ON_ROLLBACK; ++max_retry, __wt_yield()) {
        wiredtiger_begin_transaction(session, "isolation=snapshot");

        /* EINVAL means the timestamp has aged out of the system. */
        if ((ret = session->timestamp_transaction(session, buf)) == EINVAL) {
            snap_ts_clear(tinfo, snap->ts);
            break;
        }
        testutil_check(ret);

        /*
         * The only expected error is rollback (as a read-only transaction, cache-full shouldn't
         * matter to us). Persist after rollback: as a repeatable read we should eventually
         * succeed, so yield to let eviction catch up and retry.
         */
        if ((ret = snap_verify(cursor, tinfo, snap)) == 0)
            break;
        testutil_assert(ret == WT_ROLLBACK);

        testutil_check(session->rollback_transaction(session, NULL));
    }
    testutil_assert(max_retry < MAX_RETRY_ON_ROLLBACK);

    testutil_check(session->rollback_transaction(session, NULL));
}

/*
 * snap_repeat_single --
 *     Repeat an historic operation.
 */
void
snap_repeat_single(WT_CURSOR *cursor, TINFO *tinfo)
{
    SNAP_OPS *snap;
    u_int v;
    int count;

    /*
     * Start at a random spot in the list of operations and look for a read to retry. Stop when
     * we've walked the entire list or found one.
     */
    v = mmrand(&tinfo->rnd, 1, SNAP_LIST_SIZE) - 1;
    for (snap = &tinfo->snap_list[v], count = SNAP_LIST_SIZE; count > 0; --count, ++snap) {
        /* Wrap at the end of the circular buffer. */
        if (snap >= tinfo->snap_end)
            snap = tinfo->snap_list;

        if (snap->repeatable)
            break;
    }
    if (count == 0)
        return;

    snap_repeat(cursor, tinfo, snap);
}

/*
 * snap_repeat_rollback --
 *     Repeat all known operations after a rollback.
 */
void
snap_repeat_rollback(WT_CURSOR *cursor, TINFO **tinfo_array, size_t tinfo_count)
{
    SNAP_OPS *snap;
    SNAP_STATE *state;
    TINFO *tinfo, **tinfop;
    uint32_t count;
    size_t i, statenum;
    char buf[100];

    count = 0;

    track("rollback_to_stable: checking", 0ULL, NULL);
    for (i = 0, tinfop = tinfo_array; i < tinfo_count; ++i, ++tinfop) {
        tinfo = *tinfop;

        /*
         * For this thread, walk through both sets of snaps ("states"), looking for entries that
         * are repeatable and have relevant timestamps. One set will have the most current
         * operations, meaning they will likely be newer than the stable timestamp, and thus
         * cannot be checked. The other set typically has operations that are just before the
         * stable timestamp, so they are candidates for checking.
         */
        for (statenum = 0; statenum < WT_ELEMENTS(tinfo->snap_states); statenum++) {
            state = &tinfo->snap_states[statenum];
            for (snap = state->snap_state_list; snap < state->snap_state_end; ++snap) {
                if (snap->repeatable && snap->ts <= g.stable_timestamp) {
                    snap_repeat(cursor, tinfo, snap);
                    ++count;
                    if (count % 100 == 0) {
                        testutil_check(__wt_snprintf(
                          buf, sizeof(buf), "rollback_to_stable: %" PRIu32 " ops repeated", count));
                        track(buf, 0ULL, NULL);
                    }
                }
                snap_clear_one(snap);
            }
        }
    }

    /*
     * Show the final result and check that we're accomplishing some checking.
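     * If nothing was repeated, no thread had a repeatable operation at or before the stable
     * timestamp; warn if that happens several runs in a row.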
     */
    testutil_check(
      __wt_snprintf(buf, sizeof(buf), "rollback_to_stable: %" PRIu32 " ops repeated", count));
    track(buf, 0ULL, NULL);

    if (count == 0) {
#define WARN_RTS_NO_CHECK 5
        if (++g.rts_no_check >= WARN_RTS_NO_CHECK)
            fprintf(stderr,
              "Warning: %" PRIu32 " consecutive runs with no rollback_to_stable checking\n",
              g.rts_no_check);
    } else
        g.rts_no_check = 0;
}