summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/btree/bt_ovfl.c
blob: 0ea80819048774650bc89a17d2c8433230858323 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
/*-
 * Copyright (c) 2014-2020 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __ovfl_read --
 *     Read an overflow item's backing blocks and point the caller's item at the payload.
 */
static int
__ovfl_read(WT_SESSION_IMPL *session, const uint8_t *addr, size_t addr_size, WT_ITEM *store)
{
    const WT_PAGE_HEADER *hdr;

    /*
     * The read is synchronous: we wait for the block manager to fill the caller's buffer before
     * going any further. That could become a bottleneck some day, but WiredTiger supports large
     * page sizes, so overflow items are expected to be rare.
     */
    WT_RET(__wt_bt_read(session, store, addr, addr_size));

    /*
     * What came back starts with a page header. Keep a reference to the header, then adjust the
     * item so it describes only the bytes following the header, using the length recorded in the
     * header itself.
     */
    hdr = store->data;
    store->size = hdr->u.datalen;
    store->data = WT_PAGE_HEADER_BYTE(S2BT(session), hdr);

    /* Account for the overflow read in both the connection and data-source statistics. */
    WT_STAT_CONN_INCR(session, cache_read_overflow);
    WT_STAT_DATA_INCR(session, cache_read_overflow);

    return (0);
}

/*
 * __wt_ovfl_read --
 *     Bring an overflow item into memory, either from disk or from the page's cache of removed
 *     overflow values. On return, *decoded is true only if the value came from that cache.
 */
int
__wt_ovfl_read(
  WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store, bool *decoded)
{
    WT_BTREE *btree;
    WT_DECL_RET;
    WT_OVFL_TRACK *track;
    size_t slot;

    *decoded = false;

    /*
     * Without a page there is no cache of removed values to consult and nothing to lock against:
     * WT_CELL_VALUE_OVFL_RM cells are not a concern, read straight from disk.
     */
    if (page == NULL)
        return (__ovfl_read(session, unpack->data, unpack->size, store));

    btree = S2BT(session);

    /*
     * If reconciliation deleted an overflow value while a reader might still need it, the on-page
     * cell type will have been reset to WT_CELL_VALUE_OVFL_RM; we are handed the page so the
     * on-page cell can be checked. The check has to be made while holding the overflow lock.
     */
    __wt_readlock(session, &btree->ovfl_lock);
    if (__wt_cell_type_raw(unpack->cell) != WT_CELL_VALUE_OVFL_RM)
        /* The value hasn't been removed, the backing blocks can be read. */
        ret = __ovfl_read(session, unpack->data, unpack->size, store);
    else {
        /*
         * The value was removed: find the copy saved for this cell and reference it, nothing is
         * copied.
         *
         * NOTE(review): page->modify and its ovfl_track are dereferenced without a NULL check, and
         * a missing entry is only caught by the assertion below. If WT_ASSERT is compiled out in
         * some builds, a miss would leave "store" untouched while still returning success --
         * confirm.
         */
        track = page->modify->ovfl_track;
        for (slot = 0; slot < track->remove_next; ++slot)
            if (track->remove[slot].cell == unpack->cell) {
                store->data = track->remove[slot].data;
                store->size = track->remove[slot].size;
                break;
            }
        WT_ASSERT(session, slot < track->remove_next);
        *decoded = true;
    }
    __wt_readunlock(session, &btree->ovfl_lock);

    return (ret);
}

/*
 * __wt_ovfl_discard_remove --
 *     Release every cached overflow value tracked for the page, along with the array holding them.
 */
void
__wt_ovfl_discard_remove(WT_SESSION_IMPL *session, WT_PAGE *page)
{
    WT_OVFL_TRACK *track;
    uint32_t slot;

    /* Nothing to do unless the page has been modified and has overflow tracking information. */
    if (page->modify == NULL)
        return;
    if ((track = page->modify->ovfl_track) == NULL)
        return;

    /* Free each cached value, then the array itself. */
    for (slot = 0; slot < track->remove_next; ++slot)
        __wt_free(session, track->remove[slot].data);
    __wt_free(session, track->remove);

    /* Reset the bookkeeping so the tracking structure describes an empty cache. */
    track->remove_next = 0;
    track->remove_allocated = 0;
}

/*
 * __ovfl_cache --
 *     Save a private copy of an overflow value in the page's list of removed overflow values.
 */
static int
__ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack)
{
    WT_DECL_ITEM(value);
    WT_DECL_RET;
    WT_OVFL_TRACK *track;

    /* Get a scratch buffer and reference the overflow value's data through it. */
    WT_RET(__wt_scr_alloc(session, 1024, &value));
    WT_ERR(__wt_dsk_cell_data_ref(session, page->type, unpack, value));

    /* Create the page's overflow tracking structure the first time it's needed. */
    if ((track = page->modify->ovfl_track) == NULL) {
        WT_ERR(__wt_ovfl_track_init(session, page));
        track = page->modify->ovfl_track;
    }

    /*
     * Grow the array to hold one more entry, then fill in the next free slot. The entry only
     * becomes visible once the count is incremented, which happens last: if duplicating the value
     * fails, the count is left unchanged.
     */
    WT_ERR(
      __wt_realloc_def(session, &track->remove_allocated, track->remove_next + 1, &track->remove));
    track->remove[track->remove_next].cell = unpack->cell;
    WT_ERR(__wt_memdup(session, value->data, value->size, &track->remove[track->remove_next].data));
    track->remove[track->remove_next].size = value->size;
    ++track->remove_next;

err:
    /* The scratch buffer is released on success and failure alike. */
    __wt_scr_free(session, &value);
    return (ret);
}

/*
 * __wt_ovfl_remove --
 *     Remove an overflow value: cache it for readers unless evicting, then queue it for discard.
 */
int
__wt_ovfl_remove(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack, bool evicting)
{
    WT_DECL_RET;

    /*
     * Reconciliation has two problems to solve here.
     *
     * Problem one: a snapshot reader may need an on-page overflow value after it has been removed.
     * The sequence of events is:
     *
     *     - a leaf page referencing an overflow item is being reconciled
     *     - the item is updated and the update is committed
     *     - a checkpoint runs and frees the overflow item's backing blocks
     *     - a snapshot transaction asks for the original version of the item
     *
     * Put another way, the original version of an overflow item may be wanted by a snapshot
     * transaction after the item was deleted from a page that has since been checkpointed, and the
     * checkpoint has to know about the freed blocks. There is no way to postpone freeing the
     * underlying blocks until some set of transactions has exited (nor should this be a common
     * case), so the overflow value is cached in memory instead.
     *
     * What makes this hard is that the snapshot reader might:
     *     - search the WT_UPDATE list without finding a useful entry
     *     - read the overflow value's address from the on-page cell
     *     - go to sleep
     *     - meanwhile a checkpoint runs, caches the overflow value and frees the blocks
     *     - meanwhile another thread allocates and overwrites those blocks
     *     - wake up and read the wrong value
     *
     * The fix is a read/write lock combined with the on-page cell: the write lock is held while the
     * cell type changes from WT_CELL_VALUE_OVFL to WT_CELL_VALUE_OVFL_RM, and the read lock is held
     * while an overflow item is read.
     *
     * The lock is per btree. It could be per page, or even per overflow item, but overflow values
     * are supposed to be rare and contention for the lock isn't expected.
     *
     * All of this is only needed for checkpoints: in any eviction mode there can't be threads
     * sitting in our update lists, so the caching step is skipped.
     */
    if (!evicting)
        ret = __ovfl_cache(session, page, unpack);

    /*
     * Problem two: the underlying blocks must be removed exactly once, which the
     * WT_CELL_VALUE_OVFL_RM flag takes care of.
     *
     * Queue the on-page cell to be set to WT_CELL_VALUE_OVFL_RM, and the overflow value's
     * underlying blocks to be freed, when reconciliation completes. If caching the value failed,
     * nothing is queued and the caching error is returned.
     */
    if (ret == 0)
        ret = __wt_ovfl_discard_add(session, page, unpack->cell);

    return (ret);
}

/*
 * __wt_ovfl_discard --
 *     Discard an on-page overflow value, and reset the page's cell. Returns the result of freeing
 *     the backing blocks, or an error if the cell is not an overflow key or value.
 */
int
__wt_ovfl_discard(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell)
{
    WT_BM *bm;
    WT_BTREE *btree;
    WT_CELL_UNPACK *unpack, _unpack;
    WT_DECL_RET;

    btree = S2BT(session);
    bm = btree->bm;
    unpack = &_unpack;

    __wt_cell_unpack(session, page, cell, unpack);

    /*
     * Finally remove overflow key/value objects, called when reconciliation finishes after
     * successfully writing a page.
     *
     * Keys must have already been instantiated and value objects must have already been cached (if
     * they might potentially still be read by any running transaction).
     *
     * Acquire the overflow lock to avoid racing with a thread reading the backing overflow blocks.
     */
    __wt_writelock(session, &btree->ovfl_lock);

    switch (unpack->raw) {
    case WT_CELL_KEY_OVFL:
        __wt_cell_type_reset(session, unpack->cell, WT_CELL_KEY_OVFL, WT_CELL_KEY_OVFL_RM);
        break;
    case WT_CELL_VALUE_OVFL:
        __wt_cell_type_reset(session, unpack->cell, WT_CELL_VALUE_OVFL, WT_CELL_VALUE_OVFL_RM);
        break;
    default:
        /*
         * Record the error rather than returning directly: returning from inside the switch would
         * leave the overflow lock held for writing.
         */
        ret = __wt_illegal_value(session, unpack->raw);
        break;
    }

    __wt_writeunlock(session, &btree->ovfl_lock);

    /* The cell wasn't an overflow item we know how to discard, don't touch any blocks. */
    WT_RET(ret);

    /* Free the backing disk blocks. */
    return (bm->free(bm, session, unpack->data, unpack->size));
}