summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/rollback_to_stable/rts_api.c
blob: 60d9556ab4071810e6349364b105c60e1b141ac4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
/*-
 * Copyright (c) 2014-present MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __rts_assert_timestamps_unchanged --
 *     Diagnostic helper: check that the global pinned and stable timestamps still hold the values
 *     the caller sampled earlier.
 */
static void
__rts_assert_timestamps_unchanged(
  WT_SESSION_IMPL *session, wt_timestamp_t old_pinned, wt_timestamp_t old_stable)
{
#ifndef HAVE_DIAGNOSTIC
    /* Nothing is checked in this build; reference the arguments so they are not flagged unused. */
    WT_UNUSED(old_stable);
    WT_UNUSED(old_pinned);
    WT_UNUSED(session);
#else
    /* The pinned timestamp is compared first, then the stable timestamp. */
    WT_ASSERT(session, S2C(session)->txn_global.pinned_timestamp == old_pinned);
    WT_ASSERT(session, S2C(session)->txn_global.stable_timestamp == old_stable);
#endif
}

/*
 * __rollback_to_stable_int --
 *     Rollback all modifications with timestamps more recent than the global stable timestamp.
 *
 *     No timestamp is passed in: the stable timestamp is sampled once from the global transaction
 *     state and used as the rollback point. The caller must already hold the checkpoint and schema
 *     locks (both are asserted). The dry-run setting is taken from the connection's RTS structure.
 *     When no_ckpt is false, the connection is not in-memory and this is not a dry run, a forced
 *     checkpoint is taken after the rollback. Returns 0 on success or the first error hit.
 */
static int
__rollback_to_stable_int(WT_SESSION_IMPL *session, bool no_ckpt)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_TXN_GLOBAL *txn_global;
    wt_timestamp_t pinned_timestamp, rollback_timestamp;
    char ts_string[2][WT_TS_INT_STRING_SIZE];
    bool dryrun;

    conn = S2C(session);
    txn_global = &conn->txn_global;
    /* The dry-run setting is stored on the connection rather than passed as an argument. */
    dryrun = conn->rts->dryrun;

    WT_ASSERT_SPINLOCK_OWNED(session, &conn->checkpoint_lock);
    WT_ASSERT_SPINLOCK_OWNED(session, &conn->schema_lock);

    /*
     * Rollback to stable should ignore tombstones in the history store since it needs to scan the
     * entire table sequentially. The flag is cleared again at the error label, so every exit path
     * after this point must go through WT_ERR rather than returning directly.
     */
    F_SET(session, WT_SESSION_ROLLBACK_TO_STABLE);

    /* NOTE(review): presumably validates that rollback to stable may run now -- confirm. */
    WT_ERR(__wt_rts_check(session));

    /*
     * Update the global time window state to have consistent view from global visibility rules for
     * the rollback to stable to bring back the database into a consistent state.
     *
     * As part of the below function call, the oldest transaction id and pinned timestamps are
     * updated.
     */
    WT_ERR(__wt_txn_update_oldest(session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT));

    WT_ASSERT_ALWAYS(session,
      (txn_global->has_pinned_timestamp || !txn_global->has_oldest_timestamp),
      "Database has no pinned timestamp but an oldest timestamp. Pinned timestamp is required to "
      "find out the global visibility/obsolete of an update.");

    /*
     * Copy the stable timestamp, otherwise we'd need to lock it each time it's accessed. Even
     * though the stable timestamp isn't supposed to be updated while rolling back, accessing it
     * without a lock would violate protocol.
     *
     * The pinned timestamp is copied as well; within this function that copy is only used by the
     * diagnostic check after the rollback completes.
     */
    WT_ORDERED_READ(rollback_timestamp, txn_global->stable_timestamp);
    WT_ORDERED_READ(pinned_timestamp, txn_global->pinned_timestamp);
    __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
      WT_RTS_VERB_TAG_INIT
      "start rollback to stable with stable_timestamp=%s and oldest_timestamp=%s",
      __wt_timestamp_to_string(rollback_timestamp, ts_string[0]),
      __wt_timestamp_to_string(txn_global->oldest_timestamp, ts_string[1]));

    /* When running as part of recovery, also report the recovered checkpoint's snapshot range. */
    if (F_ISSET(conn, WT_CONN_RECOVERING))
        __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
          WT_RTS_VERB_TAG_RECOVER_CKPT "recovered checkpoint snapshot_min=%" PRIu64
                                       ", snapshot_max=%" PRIu64 ", snapshot_count=%" PRIu32,
          conn->recovery_ckpt_snap_min, conn->recovery_ckpt_snap_max,
          conn->recovery_ckpt_snapshot_count);

    /* Do the actual rollback work, using the sampled stable timestamp as the rollback point. */
    WT_ERR(__wt_rts_btree_apply_all(session, rollback_timestamp));

    /* Rollback the global durable timestamp to the stable timestamp. Skipped for a dry run. */
    if (!dryrun) {
        txn_global->has_durable_timestamp = txn_global->has_stable_timestamp;
        txn_global->durable_timestamp = txn_global->stable_timestamp;
    }
    /* Diagnostic check that neither sampled timestamp moved while the rollback was running. */
    __rts_assert_timestamps_unchanged(session, pinned_timestamp, rollback_timestamp);

    /*
     * If the configuration is not in-memory, forcibly log a checkpoint after rollback to stable to
     * ensure that both in-memory and on-disk versions are the same unless caller requested for no
     * checkpoint. A dry run never takes the checkpoint.
     */
    if (!F_ISSET(conn, WT_CONN_IN_MEMORY) && !no_ckpt && !dryrun)
        WT_ERR(session->iface.checkpoint(&session->iface, "force=1"));

err:
    /* Reached on success and on error: always drop the session flag set at the top. */
    F_CLR(session, WT_SESSION_ROLLBACK_TO_STABLE);
    return (ret);
}

/*
 * __rollback_to_stable_one --
 *     Apply rollback to stable to one object named by URI. Objects that do not match the btree
 *     prefix are left alone so the caller can descend into their underlying objects.
 */
static int
__rollback_to_stable_one(WT_SESSION_IMPL *session, const char *uri, bool *skipp)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    wt_timestamp_t pinned_ts, stable_ts;
    char *metadata_cfg;

    conn = S2C(session);

    /*
     * The caller's "skip" flag tells the schema-worker loop whether to stop at this object (true)
     * or to keep walking into the objects underneath it (false), for example the files backing a
     * table. Rollback to stable is applied to every file object, so report "skip" for those and
     * handle them here; for anything else report "don't skip" and return without doing any work.
     */
    if (WT_BTREE_PREFIX(uri))
        *skipp = true;
    else {
        *skipp = false;
        return (0);
    }

    /* Fetch the object's metadata configuration; the string is released before returning. */
    WT_RET(__wt_metadata_search(session, uri, &metadata_cfg));

    /* Sample the global stable and pinned timestamps a single time, before the walk starts. */
    WT_ORDERED_READ(stable_ts, conn->txn_global.stable_timestamp);
    WT_ORDERED_READ(pinned_ts, conn->txn_global.pinned_timestamp);

    /* The quiet-corrupt-file session flag is held only for the duration of the btree walk. */
    F_SET(session, WT_SESSION_QUIET_CORRUPT_FILE);
    ret = __wt_rts_btree_walk_btree_apply(session, uri, metadata_cfg, stable_ts);
    F_CLR(session, WT_SESSION_QUIET_CORRUPT_FILE);

    /* Runs whether or not the walk succeeded. */
    __rts_assert_timestamps_unchanged(session, pinned_ts, stable_ts);

    __wt_free(session, metadata_cfg);
    return (ret);
}

/*
 * __rollback_to_stable_finalize --
 *     Reset a connection's RTS structure in preparation for the next call. The only state reset
 *     here is the dry-run flag, which is returned to false.
 */
static void
__rollback_to_stable_finalize(WT_ROLLBACK_TO_STABLE *rts)
{
    rts->dryrun = false;
}

/*
 * __rollback_to_stable --
 *     Rollback the database to the stable timestamp. Reads the optional "dryrun" setting, runs the
 *     rollback in a dedicated internal session while holding the checkpoint and schema locks, then
 *     resets the connection's RTS state and closes that session.
 */
static int
__rollback_to_stable(WT_SESSION_IMPL *session, const char *cfg[], bool no_ckpt)
{
    WT_CONFIG_ITEM dryrun_cfg;
    WT_DECL_RET;
    bool dryrun;

    /*
     * Internal callers (startup/shutdown) bypass the API layer, so they may hand us no
     * configuration at all and never get defaults installed: treat a missing configuration, or a
     * configuration without a "dryrun" key, as "not a dry run". Any other lookup failure is
     * returned to the caller.
     */
    dryrun = false;
    if (cfg != NULL) {
        ret = __wt_config_gets(session, cfg, "dryrun", &dryrun_cfg);
        WT_RET_NOTFOUND_OK(ret);
        if (ret == 0 && dryrun_cfg.val != 0)
            dryrun = true;
    }

    /*
     * Work in a private internal session instead of the connection's default session: we touch
     * many data handles and (a) don't want them cached forever, and (b) can't rule out another
     * method being called concurrently. From here on "session" refers to the internal session.
     *
     * NOTE(review): this comment used to say the parent session's no-logging option is copied to
     * the internal session, but the two numeric arguments passed below are both 0 -- confirm
     * whether that is still the intent.
     */
    WT_RET(
      __wt_open_internal_session(S2C(session), "txn rollback_to_stable", true, 0, 0, &session));

    /* Publish the dry-run setting on the connection for the duration of this run. */
    S2C(session)->rts->dryrun = dryrun;

    /* The "running" statistic brackets the locked section and the completion message. */
    WT_STAT_CONN_SET(session, txn_rollback_to_stable_running, 1);
    WT_WITH_CHECKPOINT_LOCK(
      session, WT_WITH_SCHEMA_LOCK(session, ret = __rollback_to_stable_int(session, no_ckpt)));

    __wt_verbose_multi(session, WT_VERB_RECOVERY_RTS(session),
      WT_RTS_VERB_TAG_END "finished rollback to stable%s", dryrun ? " dryrun" : "");
    WT_STAT_CONN_SET(session, txn_rollback_to_stable_running, 0);

    /* Return the connection's RTS state to its defaults before giving up the session. */
    __rollback_to_stable_finalize(S2C(session)->rts);

    /* Closing can fail too; WT_TRET folds that result into the value returned. */
    WT_TRET(__wt_session_close_internal(session));

    return (ret);
}

/*
 * __wt_rollback_to_stable_init --
 *     Initialize the connection's rollback to stable structure: install the entry points, clear the
 *     dry-run flag and point the connection at the structure.
 */
void
__wt_rollback_to_stable_init(WT_CONNECTION_IMPL *conn)
{
    /* Entry points used to start a rollback to stable, database-wide or for a single object. */
    conn->_rts.rollback_to_stable = __rollback_to_stable;
    conn->_rts.rollback_to_stable_one = __rollback_to_stable_one;

    /* A run is not a dry run unless one is explicitly requested. */
    conn->_rts.dryrun = false;

    /*
     * The structure is embedded in the connection, so there is no memory to allocate or free; the
     * pointer is simply a convenient way to reach it.
     */
    conn->rts = &conn->_rts;
}