summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/btree/bt_delete.c
blob: 9749cef3706b98bbb1b5e5b9917fc9bf275af655 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
/*-
 * Copyright (c) 2014-2019 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * Fast-delete support.
 *
 * This file contains most of the code that allows WiredTiger to delete pages
 * of data without reading them into the cache.  (This feature is currently
 * only available for row-store objects.)
 *
 * The way cursor truncate works in a row-store object is it explicitly reads
 * the first and last pages of the truncate range, then walks the tree with a
 * flag so the tree walk code skips reading eligible pages within the range
 * and instead just marks them as deleted, by changing their WT_REF state to
 * WT_REF_DELETED. Pages ineligible for this fast path include pages already
 * in the cache, having overflow items, or requiring lookaside records.
 * Ineligible pages are read and have their rows updated/deleted individually.
 * The transaction for the delete operation is stored in memory referenced by
 * the WT_REF.page_del field.
 *
 * Future cursor walks of the tree will skip the deleted page based on the
 * transaction stored for the delete, but it gets more complicated if a read is
 * done using a random key, or a cursor walk is done with a transaction where
 * the delete is not visible.  In those cases, we read the original contents of
 * the page.  The page-read code notices a deleted page is being read, and as
 * part of the read instantiates the contents of the page, creating a WT_UPDATE
 * with a deleted operation, in the same transaction as deleted the page.  In
 * other words, the read process makes it appear as if the page was read and
 * each individual row deleted, exactly as would have happened if the page had
 * been in the cache all along.
 *
 * There's an additional complication to support rollback of the page delete.
 * When the page was marked deleted, a pointer to the WT_REF was saved in the
 * deleting session's transaction list and the delete is unrolled by resetting
 * the WT_REF_DELETED state back to WT_REF_DISK.  However, if the page has been
 * instantiated by some reading thread, that's not enough, each individual row
 * on the page must have the delete operation reset.  If the page split, the
 * WT_UPDATE lists might have been saved/restored during reconciliation and
 * appear on multiple pages, and the WT_REF stored in the deleting session's
 * transaction list is no longer useful.  For this reason, when the page is
 * instantiated by a read, a list of the WT_UPDATE structures on the page is
 * stored in the WT_REF.page_del field, with the transaction ID, that way the
 * session committing/unrolling the delete can find all WT_UPDATE structures
 * that require update.
 *
 * One final note: pages can also be marked deleted if emptied and evicted.  In
 * that case, the WT_REF state will be set to WT_REF_DELETED but there will not
 * be any associated WT_REF.page_del field.  These pages are always skipped
 * during cursor traversal (the page could not have been evicted if there were
 * updates that weren't globally visible), and if read is forced to instantiate
 * such a page, it simply creates an empty page from scratch.
 */

/*
 * __wt_delete_page --
 *     If deleting a range, try to delete the page without instantiating it.
 */
int
__wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp)
{
    WT_ADDR *ref_addr;
    WT_DECL_RET;
    uint32_t previous_state;

    *skipp = false;

    /* If we have a clean page in memory, attempt to evict it. */
    previous_state = ref->state;
    if ((previous_state == WT_REF_MEM || previous_state == WT_REF_LIMBO) &&
      WT_REF_CAS_STATE(session, ref, previous_state, WT_REF_LOCKED)) {
        if (__wt_page_is_modified(ref->page)) {
            WT_REF_SET_STATE(ref, previous_state);
            return (0);
        }

        (void)__wt_atomic_addv32(&S2BT(session)->evict_busy, 1);
        ret = __wt_evict(session, ref, previous_state, 0);
        (void)__wt_atomic_subv32(&S2BT(session)->evict_busy, 1);
        WT_RET_BUSY_OK(ret);
        ret = 0;
    }

    /*
     * Fast check to see if it's worth locking, then atomically switch the page's state to lock it.
     */
    previous_state = ref->state;
    switch (previous_state) {
    case WT_REF_DISK:
    case WT_REF_LOOKASIDE:
        break;
    default:
        return (0);
    }
    if (!WT_REF_CAS_STATE(session, ref, previous_state, WT_REF_LOCKED))
        return (0);

    /*
     * If this WT_REF was previously part of a truncate operation, there
     * may be existing page-delete information. The structure is only read
     * while the state is locked, free the previous version.
     *
     * Note: changes have been made, we must publish any state change from
     * this point on.
     */
    if (ref->page_del != NULL) {
        WT_ASSERT(session, ref->page_del->txnid == WT_TXN_ABORTED);
        __wt_free(session, ref->page_del->update_list);
        __wt_free(session, ref->page_del);
    }

    /*
     * We cannot truncate pages that have overflow key/value items as the
     * overflow blocks have to be discarded.  The way we figure that out is
     * to check the page's cell type, cells for leaf pages without overflow
     * items are special.
     *
     * To look at an on-page cell, we need to look at the parent page, and
     * that's dangerous, our parent page could change without warning if
     * the parent page were to split, deepening the tree. We can look at
     * the parent page itself because the page can't change underneath us.
     * However, if the parent page splits, our reference address can change;
     * we don't care what version of it we read, as long as we don't read
     * it twice.
     */
    WT_ORDERED_READ(ref_addr, ref->addr);
    if (ref_addr != NULL && (__wt_off_page(ref->home, ref_addr) ?
                                ref_addr->type != WT_ADDR_LEAF_NO :
                                __wt_cell_type_raw((WT_CELL *)ref_addr) != WT_CELL_ADDR_LEAF_NO))
        goto err;

    /*
     * This action dirties the parent page: mark it dirty now, there's no future reconciliation of
     * the child leaf page that will dirty it as we write the tree.
     */
    WT_ERR(__wt_page_parent_modify_set(session, ref, false));

    /* Allocate and initialize the page-deleted structure. */
    WT_ERR(__wt_calloc_one(session, &ref->page_del));
    ref->page_del->previous_state = previous_state;

    WT_ERR(__wt_txn_modify_page_delete(session, ref));

    *skipp = true;
    WT_STAT_CONN_INCR(session, rec_page_delete_fast);
    WT_STAT_DATA_INCR(session, rec_page_delete_fast);

    /* Publish the page to its new state, ensuring visibility. */
    WT_REF_SET_STATE(ref, WT_REF_DELETED);
    return (0);

err:
    __wt_free(session, ref->page_del);

    /* Publish the page to its previous state, ensuring visibility. */
    WT_REF_SET_STATE(ref, previous_state);
    return (ret);
}

/*
 * __wt_delete_page_rollback --
 *     Abort pages that were deleted without being instantiated.
 */
int
__wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
{
    WT_UPDATE **updp;
    uint64_t sleep_usecs, yield_count;
    uint32_t current_state;
    bool locked;

    /*
     * If the page is still "deleted", it's as we left it, reset the state to on-disk and we're
     * done. Otherwise, we expect the page is either instantiated or being instantiated. Loop
     * because it's possible for the page to return to the deleted state if instantiation fails.
     */
    for (locked = false, sleep_usecs = yield_count = 0;;) {
        switch (current_state = ref->state) {
        case WT_REF_DELETED:
            /*
             * If the page is still "deleted", it's as we left it, reset the state.
             */
            if (WT_REF_CAS_STATE(session, ref, WT_REF_DELETED, ref->page_del->previous_state))
                goto done;
            break;
        case WT_REF_LOCKED:
            /*
             * A possible state, the page is being instantiated.
             */
            break;
        case WT_REF_MEM:
        case WT_REF_SPLIT:
            if (WT_REF_CAS_STATE(session, ref, current_state, WT_REF_LOCKED))
                locked = true;
            break;
        case WT_REF_DISK:
        case WT_REF_LIMBO:
        case WT_REF_LOOKASIDE:
        case WT_REF_READING:
        default:
            return (__wt_illegal_value(session, current_state));
        }

        if (locked)
            break;

        /*
         * We wait for the change in page state, yield before retrying, and if we've yielded enough
         * times, start sleeping so we don't burn CPU to no purpose.
         */
        __wt_spin_backoff(&yield_count, &sleep_usecs);
        WT_STAT_CONN_INCRV(session, page_del_rollback_blocked, sleep_usecs);
    }

    /*
     * We can't use the normal read path to get a copy of the page
     * because the session may have closed the cursor, we no longer
     * have the reference to the tree required for a hazard
     * pointer.  We're safe because with unresolved transactions,
     * the page isn't going anywhere.
     *
     * The page is in an in-memory state, which means it
     * was instantiated at some point. Walk any list of
     * update structures and abort them.
     */
    WT_ASSERT(session, locked);
    if ((updp = ref->page_del->update_list) != NULL)
        for (; *updp != NULL; ++updp)
            (*updp)->txnid = WT_TXN_ABORTED;

    WT_REF_SET_STATE(ref, current_state);

done:
    /*
     * Now mark the truncate aborted: this must come last because after this point there is nothing
     * preventing the page from being evicted.
     */
    WT_PUBLISH(ref->page_del->txnid, WT_TXN_ABORTED);
    return (0);
}

/*
 * __wt_delete_page_skip --
 *     If iterating a cursor, skip deleted pages that are either visible to us or globally visible.
 */
bool
__wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref, bool visible_all)
{
    bool skip;

    /*
     * Deleted pages come from two sources: either it's a truncate as
     * described above, or the page has been emptied by other operations
     * and eviction deleted it.
     *
     * In both cases, the WT_REF state will be WT_REF_DELETED.  In the case
     * of a truncated page, there will be a WT_PAGE_DELETED structure with
     * the transaction ID of the transaction that deleted the page, and the
     * page is visible if that transaction ID is visible.  In the case of an
     * empty page, there will be no WT_PAGE_DELETED structure and the delete
     * is by definition visible, eviction could not have deleted the page if
     * there were changes on it that were not globally visible.
     *
     * We're here because we found a WT_REF state set to WT_REF_DELETED.  It
     * is possible the page is being read into memory right now, though, and
     * the page could switch to an in-memory state at any time.  Lock down
     * the structure, just to be safe.
     */
    if (ref->page_del == NULL && ref->page_las == NULL)
        return (true);

    if (!WT_REF_CAS_STATE(session, ref, WT_REF_DELETED, WT_REF_LOCKED))
        return (false);

    skip = !__wt_page_del_active(session, ref, visible_all) && !__wt_page_las_active(session, ref);

    /*
     * The page_del structure can be freed as soon as the delete is stable: it is only read when the
     * ref state is locked. It is worth checking every time we come through because once this is
     * freed, we no longer need synchronization to check the ref.
     */
    if (skip && ref->page_del != NULL &&
      (visible_all ||
          __wt_txn_visible_all(session, ref->page_del->txnid, ref->page_del->timestamp))) {
        __wt_free(session, ref->page_del->update_list);
        __wt_free(session, ref->page_del);
    }

    WT_REF_SET_STATE(ref, WT_REF_DELETED);
    return (skip);
}

/*
 * __tombstone_update_alloc --
 *     Allocate and initialize a page-deleted tombstone update structure.
 */
static int
__tombstone_update_alloc(
  WT_SESSION_IMPL *session, WT_PAGE_DELETED *page_del, WT_UPDATE **updp, size_t *sizep)
{
    WT_UPDATE *upd;

    WT_RET(__wt_update_alloc(session, NULL, &upd, sizep, WT_UPDATE_TOMBSTONE));

    /*
     * Cleared memory matches the lowest possible transaction ID and timestamp, do nothing.
     */
    if (page_del != NULL) {
        upd->txnid = page_del->txnid;
        upd->start_ts = page_del->timestamp;
        upd->durable_ts = page_del->durable_timestamp;
        upd->prepare_state = page_del->prepare_state;
    }
    *updp = upd;
    return (0);
}

/*
 * __wt_delete_page_instantiate --
 *     Instantiate an entirely deleted row-store leaf page.
 */
int
__wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
{
    WT_BTREE *btree;
    WT_DECL_RET;
    WT_INSERT *ins;
    WT_INSERT_HEAD *insert;
    WT_PAGE *page;
    WT_PAGE_DELETED *page_del;
    WT_ROW *rip;
    WT_UPDATE **upd_array, *upd;
    size_t size;
    uint32_t count, i;

    btree = S2BT(session);
    page = ref->page;

    WT_STAT_CONN_INCR(session, cache_read_deleted);
    WT_STAT_DATA_INCR(session, cache_read_deleted);

    /*
     * Give the page a modify structure.
     *
     * Mark tree dirty, unless the handle is read-only.
     * (We'd like to free the deleted pages, but if the handle is read-only,
     * we're not able to do so.)
     */
    WT_RET(__wt_page_modify_init(session, page));
    if (!F_ISSET(btree, WT_BTREE_READONLY))
        __wt_page_modify_set(session, page);

    if (ref->page_del != NULL && ref->page_del->prepare_state != WT_PREPARE_INIT) {
        WT_STAT_CONN_INCR(session, cache_read_deleted_prepared);
        WT_STAT_DATA_INCR(session, cache_read_deleted_prepared);
    }

    /*
     * An operation is accessing a "deleted" page, and we're building an
     * in-memory version of the page (making it look like all entries in
     * the page were individually updated by a remove operation).  There
     * are two cases where we end up here:
     *
     * First, a running transaction used a truncate call to delete the page
     * without reading it, in which case the page reference includes a
     * structure with a transaction ID; the page we're building might split
     * in the future, so we update that structure to include references to
     * all of the update structures we create, so the transaction can abort.
     *
     * Second, a truncate call deleted a page and the truncate committed,
     * but an older transaction in the system forced us to keep the old
     * version of the page around, then we crashed and recovered or we're
     * running inside a checkpoint, and now we're being forced to read that
     * page.
     *
     * Expect a page-deleted structure if there's a running transaction that
     * needs to be resolved, otherwise, there may not be one (and, if the
     * transaction has resolved, we can ignore the page-deleted structure).
     */
    page_del = __wt_page_del_active(session, ref, true) ? ref->page_del : NULL;

    /*
     * Allocate the per-page update array if one doesn't already exist. (It might already exist
     * because deletes are instantiated after lookaside table updates.)
     */
    if (page->entries != 0 && page->modify->mod_row_update == NULL)
        WT_RET(__wt_calloc_def(session, page->entries, &page->modify->mod_row_update));

    /*
     * Allocate the per-reference update array; in the case of instantiating a page deleted in a
     * running transaction, we need a list of the update structures for the eventual commit or
     * abort.
     */
    if (page_del != NULL) {
        count = 0;
        if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL)
            WT_SKIP_FOREACH (ins, insert)
                ++count;
        WT_ROW_FOREACH (page, rip, i) {
            ++count;
            if ((insert = WT_ROW_INSERT(page, rip)) != NULL)
                WT_SKIP_FOREACH (ins, insert)
                    ++count;
        }
        WT_RET(__wt_calloc_def(session, count + 1, &page_del->update_list));
    }

    /* Walk the page entries, giving each one a tombstone. */
    size = 0;
    count = 0;
    upd_array = page->modify->mod_row_update;
    if ((insert = WT_ROW_INSERT_SMALLEST(page)) != NULL)
        WT_SKIP_FOREACH (ins, insert) {
            WT_ERR(__tombstone_update_alloc(session, page_del, &upd, &size));
            upd->next = ins->upd;
            ins->upd = upd;

            if (page_del != NULL)
                page_del->update_list[count++] = upd;
        }
    WT_ROW_FOREACH (page, rip, i) {
        WT_ERR(__tombstone_update_alloc(session, page_del, &upd, &size));
        upd->next = upd_array[WT_ROW_SLOT(page, rip)];
        upd_array[WT_ROW_SLOT(page, rip)] = upd;

        if (page_del != NULL)
            page_del->update_list[count++] = upd;

        if ((insert = WT_ROW_INSERT(page, rip)) != NULL)
            WT_SKIP_FOREACH (ins, insert) {
                WT_ERR(__tombstone_update_alloc(session, page_del, &upd, &size));
                upd->next = ins->upd;
                ins->upd = upd;

                if (page_del != NULL)
                    page_del->update_list[count++] = upd;
            }
    }

    __wt_cache_page_inmem_incr(session, page, size);

    return (0);

err:
    /*
     * The page-delete update structure may have existed before we were called, and presumably might
     * be in use by a running transaction. The list of update structures cannot have been created
     * before we were called, and should not exist if we exit with an error.
     */
    if (page_del != NULL)
        __wt_free(session, page_del->update_list);
    return (ret);
}