summaryrefslogtreecommitdiff
path: root/src/btree/bt_ovfl.c
blob: feb78df75e22ad8980702b3170d41b9d27159aea (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
/*-
 * Copyright (c) 2014-2016 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __ovfl_read --
 *	Read an overflow item from the disk.
 */
static int
__ovfl_read(WT_SESSION_IMPL *session,
    const uint8_t *addr, size_t addr_size, WT_ITEM *store)
{
	WT_BTREE *btree;
	const WT_PAGE_HEADER *dsk;

	btree = S2BT(session);

	/*
	 * Read the overflow item from the block manager, then reference the
	 * start of the data and set the data's length.
	 *
	 * Overflow reads are synchronous. That may bite me at some point, but
	 * WiredTiger supports large page sizes, overflow items should be rare.
	 */
	WT_RET(__wt_bt_read(session, store, addr, addr_size));
	dsk = store->data;
	store->data = WT_PAGE_HEADER_BYTE(btree, dsk);
	store->size = dsk->u.datalen;

	WT_STAT_FAST_CONN_INCR(session, cache_read_overflow);
	WT_STAT_FAST_DATA_INCR(session, cache_read_overflow);

	return (0);
}

/*
 * __wt_ovfl_read --
 *	Bring an overflow item into memory.
 */
int
__wt_ovfl_read(WT_SESSION_IMPL *session,
    WT_PAGE *page, WT_CELL_UNPACK *unpack, WT_ITEM *store)
{
	WT_DECL_RET;

	/*
	 * If no page specified, there's no need to lock and there's no cache
	 * to search, we don't care about WT_CELL_VALUE_OVFL_RM cells.
	 */
	if (page == NULL)
		return (
		    __ovfl_read(session, unpack->data, unpack->size, store));

	/*
	 * WT_CELL_VALUE_OVFL_RM cells: If reconciliation deleted an overflow
	 * value, but there was still a reader in the system that might need it,
	 * the on-page cell type will have been reset to WT_CELL_VALUE_OVFL_RM
	 * and we will be passed a page so we can look-aside into the cache of
	 * such values.
	 *
	 * Acquire the overflow lock, and retest the on-page cell's value inside
	 * the lock.
	 */
	__wt_readlock(session, S2BT(session)->ovfl_lock);
	ret = __wt_cell_type_raw(unpack->cell) == WT_CELL_VALUE_OVFL_RM ?
	    __wt_ovfl_txnc_search(page, unpack->data, unpack->size, store) :
	    __ovfl_read(session, unpack->data, unpack->size, store);
	__wt_readunlock(session, S2BT(session)->ovfl_lock);

	return (ret);
}

/*
 * __ovfl_cache_col_visible --
 *	column-store: check for a globally visible update.
 */
static bool
__ovfl_cache_col_visible(
    WT_SESSION_IMPL *session, WT_UPDATE *upd, WT_CELL_UNPACK *unpack)
{
	/*
	 * Column-store is harder than row_store: we're here because there's a
	 * reader in the system that might read the original version of an
	 * overflow record, which might match a number of records.  For example,
	 * the original overflow value was for records 100-200, we've replaced
	 * each of those records individually, but there exists a reader that
	 * might read any one of those records, and all of those records have
	 * different update entries with different transaction IDs.  Since it's
	 * infeasible to determine if there's a globally visible update for each
	 * reader for each record, we test the simple case where a single record
	 * has a single, globally visible update.  If that's not the case, cache
	 * the value.
	 */
	if (__wt_cell_rle(unpack) == 1 &&
	    upd != NULL &&		/* Sanity: upd should always be set. */
	    __wt_txn_visible_all(session, upd->txnid))
		return (true);
	return (false);
}

/*
 * __ovfl_cache_row_visible --
 *	row-store: check for a globally visible update.
 */
static bool
__ovfl_cache_row_visible(WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip)
{
	WT_UPDATE *upd;

	/* Check to see if there's a globally visible update. */
	for (upd = WT_ROW_UPDATE(page, rip); upd != NULL; upd = upd->next)
		if (__wt_txn_visible_all(session, upd->txnid))
			return (true);

	return (false);
}

/*
 * __ovfl_cache --
 *	Cache a deleted overflow value.
 */
static int
__ovfl_cache(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL_UNPACK *unpack)
{
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	size_t addr_size;
	const uint8_t *addr;

	addr = unpack->data;
	addr_size = unpack->size;

	WT_RET(__wt_scr_alloc(session, 1024, &tmp));

	/* Enter the value into the overflow cache. */
	WT_ERR(__ovfl_read(session, addr, addr_size, tmp));
	WT_ERR(__wt_ovfl_txnc_add(
	    session, page, addr, addr_size, tmp->data, tmp->size));

err:	__wt_scr_free(session, &tmp);
	return (ret);
}

/*
 * __wt_ovfl_cache --
 *	Handle deletion of an overflow value.
 */
int
__wt_ovfl_cache(WT_SESSION_IMPL *session,
    WT_PAGE *page, void *cookie, WT_CELL_UNPACK *vpack)
{
	bool visible;

	/*
	 * This function solves a problem in reconciliation. The scenario is:
	 *     - reconciling a leaf page that references an overflow item
	 *     - the item is updated and the update committed
	 *     - a checkpoint runs, freeing the backing overflow blocks
	 *     - a snapshot transaction wants the original version of the item
	 *
	 * In summary, we may need the original version of an overflow item for
	 * a snapshot transaction after the item was deleted from a page that's
	 * subsequently been checkpointed, where the checkpoint must know about
	 * the freed blocks.  We don't have any way to delay a free of the
	 * underlying blocks until a particular set of transactions exit (and
	 * this shouldn't be a common scenario), so cache the overflow value in
	 * memory.
	 *
	 * This gets hard because the snapshot transaction reader might:
	 *     - search the WT_UPDATE list and not find an useful entry
	 *     - read the overflow value's address from the on-page cell
	 *     - go to sleep
	 *     - checkpoint runs, caches the overflow value, frees the blocks
	 *     - another thread allocates and overwrites the blocks
	 *     - the reader wakes up and reads the wrong value
	 *
	 * Use a read/write lock and the on-page cell to fix the problem: hold
	 * a write lock when changing the cell type from WT_CELL_VALUE_OVFL to
	 * WT_CELL_VALUE_OVFL_RM and hold a read lock when reading an overflow
	 * item.
	 *
	 * The read/write lock is per btree, but it could be per page or even
	 * per overflow item.  We don't do any of that because overflow values
	 * are supposed to be rare and we shouldn't see contention for the lock.
	 *
	 * Check for a globally visible update.  If there is a globally visible
	 * update, we don't need to cache the item because it's not possible for
	 * a running thread to have moved past it.
	 */
	switch (page->type) {
	case WT_PAGE_COL_VAR:
		visible = __ovfl_cache_col_visible(session, cookie, vpack);
		break;
	case WT_PAGE_ROW_LEAF:
		visible = __ovfl_cache_row_visible(session, page, cookie);
		break;
	WT_ILLEGAL_VALUE(session);
	}

	/*
	 * If there's no globally visible update, there's a reader in the system
	 * that might try and read the old value, cache it.
	 */
	if (!visible) {
		WT_RET(__ovfl_cache(session, page, vpack));
		WT_STAT_FAST_CONN_INCR(session, cache_overflow_value);
		WT_STAT_FAST_DATA_INCR(session, cache_overflow_value);
	}

	/*
	 * Queue the on-page cell to be set to WT_CELL_VALUE_OVFL_RM and the
	 * underlying overflow value's blocks to be freed when reconciliation
	 * completes.
	 */
	return (__wt_ovfl_discard_add(session, page, vpack->cell));
}

/*
 * __wt_ovfl_discard --
 *	Discard an on-page overflow value, and reset the page's cell.
 */
int
__wt_ovfl_discard(WT_SESSION_IMPL *session, WT_CELL *cell)
{
	WT_BM *bm;
	WT_BTREE *btree;
	WT_CELL_UNPACK *unpack, _unpack;

	btree = S2BT(session);
	bm = btree->bm;
	unpack = &_unpack;

	__wt_cell_unpack(cell, unpack);

	/*
	 * Finally remove overflow key/value objects, called when reconciliation
	 * finishes after successfully writing a page.
	 *
	 * Keys must have already been instantiated and value objects must have
	 * already been cached (if they might potentially still be read by any
	 * running transaction).
	 *
	 * Acquire the overflow lock to avoid racing with a thread reading the
	 * backing overflow blocks.
	 */
	__wt_writelock(session, btree->ovfl_lock);

	switch (unpack->raw) {
	case WT_CELL_KEY_OVFL:
		__wt_cell_type_reset(session,
		    unpack->cell, WT_CELL_KEY_OVFL, WT_CELL_KEY_OVFL_RM);
		break;
	case WT_CELL_VALUE_OVFL:
		__wt_cell_type_reset(session,
		    unpack->cell, WT_CELL_VALUE_OVFL, WT_CELL_VALUE_OVFL_RM);
		break;
	WT_ILLEGAL_VALUE(session);
	}

	__wt_writeunlock(session, btree->ovfl_lock);

	/* Free the backing disk blocks. */
	return (bm->free(bm, session, unpack->data, unpack->size));
}