summaryrefslogtreecommitdiff
path: root/src/btree/bt_delete.c
blob: 570b7f807428602080f3d2d8fc2f0ccf5cbe894f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
/*-
 * Copyright (c) 2014-2015 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * Fast-delete support.
 *
 * This file contains most of the code that allows WiredTiger to delete pages
 * of data without reading them into the cache.  (This feature is currently
 * only available for row-store objects.)
 *
 * The way cursor truncate works in a row-store object is it explicitly reads
 * the first and last pages of the truncate range, then walks the tree with a
 * flag so the cursor walk code marks any page within the range, that hasn't
 * yet been read and which has no overflow items, as deleted, by changing the
 * WT_REF state to WT_REF_DELETED.  Pages already in the cache or with overflow
 * items, have their rows updated/deleted individually. The transaction for the
 * delete operation is stored in memory referenced by the WT_REF.page_del field.
 *
 * Future cursor walks of the tree will skip the deleted page based on the
 * transaction stored for the delete, but it gets more complicated if a read is
 * done using a random key, or a cursor walk is done with a transaction where
 * the delete is not visible.  In those cases, we read the original contents of
 * the page.  The page-read code notices a deleted page is being read, and as
 * part of the read instantiates the contents of the page, creating a WT_UPDATE
 * with a deleted operation, in the same transaction as deleted the page.  In
 * other words, the read process makes it appear as if the page was read and
 * each individual row deleted, exactly as would have happened if the page had
 * been in the cache all along.
 *
 * There's an additional complication to support rollback of the page delete.
 * When the page was marked deleted, a pointer to the WT_REF was saved in the
 * deleting session's transaction list and the delete is unrolled by resetting
 * the WT_REF_DELETED state back to WT_REF_DISK.  However, if the page has been
 * instantiated by some reading thread, that's not enough, each individual row
 * on the page must have the delete operation reset.  If the page split, the
 * WT_UPDATE lists might have been saved/restored during reconciliation and
 * appear on multiple pages, and the WT_REF stored in the deleting session's
 * transaction list is no longer useful.  For this reason, when the page is
 * instantiated by a read, a list of the WT_UPDATE structures on the page is
 * stored in the WT_REF.page_del field, with the transaction ID, that way the
 * session unrolling the delete can find all of the WT_UPDATE structures that
 * require update.
 *
 * One final note: pages can also be marked deleted if emptied and evicted.  In
 * that case, the WT_REF state will be set to WT_REF_DELETED but there will not
 * be any associated WT_REF.page_del field.  These pages are always skipped
 * during cursor traversal (the page could not have been evicted if there were
 * updates that weren't globally visible), and if read is forced to instantiate
 * such a page, it simply creates an empty page from scratch.
 */

/*
 * __wt_delete_page --
 *	If deleting a range, try to delete the page without instantiating it.
 */
int
__wt_delete_page(WT_SESSION_IMPL *session, WT_REF *ref, int *skipp)
{
	WT_DECL_RET;
	WT_PAGE *parent;

	*skipp = 0;

	/*
	 * Atomically switch the page's state to lock it.  If the page is not
	 * on-disk, other threads may be using it, no fast delete.
	 *
	 * Possible optimization: if the page is already deleted and the delete
	 * is visible to us (the delete has been committed), we could skip the
	 * page instead of instantiating it and figuring out there are no rows
	 * in the page.  While that's a huge amount of work to no purpose, it's
	 * unclear optimizing for overlapping range deletes is worth the effort.
	 */
	if (ref->state != WT_REF_DISK ||
	    !WT_ATOMIC_CAS4(ref->state, WT_REF_DISK, WT_REF_LOCKED))
		return (0);

	/*
	 * We cannot fast-delete pages that have overflow key/value items as
	 * the overflow blocks have to be discarded.  The way we figure that
	 * out is to check the on-page cell type for the page, cells for leaf
	 * pages that have no overflow items are special.
	 *
	 * In some cases, the reference address may not reference an on-page
	 * cell (for example, some combination of page splits), in which case
	 * we can't check the original cell value and we fail.
	 *
	 * To look at an on-page cell, we need to look at the parent page, and
	 * that's dangerous, our parent page could change without warning if
	 * the parent page were to split, deepening the tree.  It's safe: the
	 * page's reference will always point to some valid page, and if we find
	 * any problems we simply fail the fast-delete optimization.
	 *
	 * !!!
	 * I doubt it's worth the effort, but we could copy the cell's type into
	 * the reference structure, and then we wouldn't need an on-page cell.
	 */
	parent = ref->home;
	if (__wt_off_page(parent, ref->addr) ||
	    __wt_cell_type_raw(ref->addr) != WT_CELL_ADDR_LEAF_NO)
		goto err;

	/*
	 * This action dirties the parent page: mark it dirty now, there's no
	 * future reconciliation of the child leaf page that will dirty it as
	 * we write the tree.
	 */
	WT_ERR(__wt_page_parent_modify_set(session, ref, 0));

	/*
	 * Record the change in the transaction structure and set the change's
	 * transaction ID.
	 */
	WT_ERR(__wt_calloc_one(session, &ref->page_del));
	ref->page_del->txnid = session->txn.id;

	WT_ERR(__wt_txn_modify_ref(session, ref));

	*skipp = 1;
	WT_PUBLISH(ref->state, WT_REF_DELETED);
	return (0);

err:	__wt_free(session, ref->page_del);

	/*
	 * Restore the page to on-disk status, we'll have to instantiate it.
	 */
	WT_PUBLISH(ref->state, WT_REF_DISK);
	return (ret);
}

/*
 * __wt_delete_page_rollback --
 *	Abort pages that were deleted without being instantiated.
 */
void
__wt_delete_page_rollback(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_UPDATE **upd;

	/*
	 * If the page is still "deleted", it's as we left it, reset the state
	 * to on-disk and we're done.  Otherwise, we expect the page is either
	 * instantiated or being instantiated.  Loop because it's possible for
	 * the page to return to the deleted state if instantiation fails.
	 */
	for (;; __wt_yield())
		switch (ref->state) {
		case WT_REF_DISK:
		case WT_REF_READING:
			WT_ASSERT(session, 0);		/* Impossible, assert */
			break;
		case WT_REF_DELETED:
			/*
			 * If the page is still "deleted", it's as we left it,
			 * reset the state.
			 */
			if (WT_ATOMIC_CAS4(
			    ref->state, WT_REF_DELETED, WT_REF_DISK))
				return;
			break;
		case WT_REF_LOCKED:
			/*
			 * A possible state, the page is being instantiated.
			 */
			break;
		case WT_REF_MEM:
		case WT_REF_SPLIT:
			/*
			 * We can't use the normal read path to get a copy of
			 * the page because the session may have closed the
			 * cursor, we no longer have the reference to the tree
			 * required for a hazard pointer.  We're safe because
			 * with unresolved transactions, the page isn't going
			 * anywhere.
			 *
			 * The page is in an in-memory state, walk the list of
			 * update structures and abort them.
			 */
			for (upd =
			    ref->page_del->update_list; *upd != NULL; ++upd)
				(*upd)->txnid = WT_TXN_ABORTED;

			/*
			 * Discard the memory, the transaction can't abort
			 * twice.
			 */
			__wt_free(session, ref->page_del->update_list);
			__wt_free(session, ref->page_del);
			return;
		}
}

/*
 * __wt_delete_page_skip --
 *	If iterating a cursor, skip deleted pages that are visible to us.
 */
int
__wt_delete_page_skip(WT_SESSION_IMPL *session, WT_REF *ref)
{
	int skip;

	if (ref->state != WT_REF_DELETED)
		return (0);

	/*
	 * Deleted pages come from two sources: either it's a fast-delete as
	 * described above, or the page has been emptied by other operations
	 * and eviction deleted it.
	 *
	 * In both cases, the WT_REF state will be WT_REF_DELETED.  In the case
	 * of a fast-delete page, there will be a WT_PAGE_DELETED structure with
	 * the transaction ID of the transaction that deleted the page, and the
	 * page is visible if that transaction ID is visible.  In the case of an
	 * empty page, there will be no WT_PAGE_DELETED structure and the delete
	 * is by definition visible, eviction could not have deleted the page if
	 * there were changes on it that were not globally visible.
	 *
	 * We're here because we found a WT_REF state set to WT_REF_DELETED.  It
	 * is possible the page is being read into memory right now, though, and
	 * the page could switch to an in-memory state at any time.  Lock down
	 * the structure, just to be safe.
	 */
	if (ref->page_del == NULL)
		return (1);

	if (!WT_ATOMIC_CAS4(ref->state, WT_REF_DELETED, WT_REF_LOCKED))
		return (0);

	skip = __wt_txn_visible(session, ref->page_del->txnid) ? 1 : 0;

	WT_PUBLISH(ref->state, WT_REF_DELETED);
	return (skip);
}

/*
 * __wt_delete_page_instantiate --
 *	Instantiate an entirely deleted row-store leaf page.
 */
int
__wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref)
{
	WT_BTREE *btree;
	WT_DECL_RET;
	WT_PAGE *page;
	WT_PAGE_DELETED *page_del;
	WT_UPDATE **upd_array, *upd;
	uint32_t i;

	btree = S2BT(session);
	page = ref->page;
	page_del = ref->page_del;

	/*
	 * Give the page a modify structure.
	 *
	 * If the tree is already dirty and so will be written, mark the page
	 * dirty.  (We'd like to free the deleted pages, but if the handle is
	 * read-only or if the application never modifies the tree, we're not
	 * able to do so.)
	 */
	if (btree->modified) {
		WT_RET(__wt_page_modify_init(session, page));
		__wt_page_modify_set(session, page);
	}

	/*
	 * An operation is accessing a "deleted" page, and we're building an
	 * in-memory version of the page (making it look like all entries in
	 * the page were individually updated by a remove operation).  There
	 * are two cases where we end up here:
	 *
	 * First, a running transaction used a truncate call to delete the page
	 * without reading it, in which case the page reference includes a
	 * structure with a transaction ID; the page we're building might split
	 * in the future, so we update that structure to include references to
	 * all of the update structures we create, so the transaction can abort.
	 *
	 * Second, a truncate call deleted a page and the truncate committed,
	 * but an older transaction in the system forced us to keep the old
	 * version of the page around, then we crashed and recovered, and now
	 * we're being forced to read that page.
	 *
	 * In the first case, we have a page reference structure, in the second
	 * second, we don't.
	 *
	 * Allocate the per-reference update array; in the case of instantiating
	 * a page, deleted by a running transaction that might eventually abort,
	 * we need a list of the update structures so we can do that abort.  The
	 * hard case is if a page splits: the update structures might be moved
	 * to different pages, and we still have to find them all for an abort.
	 */

	if (page_del != NULL)
		WT_RET(__wt_calloc_def(
		    session, page->pg_row_entries + 1, &page_del->update_list));

	/* Allocate the per-page update array. */
	WT_ERR(__wt_calloc_def(session, page->pg_row_entries, &upd_array));
	page->pg_row_upd = upd_array;

	/*
	 * Fill in the per-reference update array with references to update
	 * structures, fill in the per-page update array with references to
	 * deleted items.
	 */
	for (i = 0; i < page->pg_row_entries; ++i) {
		WT_ERR(__wt_calloc_one(session, &upd));
		WT_UPDATE_DELETED_SET(upd);

		if (page_del == NULL)
			upd->txnid = WT_TXN_NONE;	/* Globally visible */
		else {
			upd->txnid = page_del->txnid;
			page_del->update_list[i] = upd;
		}

		upd->next = upd_array[i];
		upd_array[i] = upd;
	}

	__wt_cache_page_inmem_incr(session, page,
	    page->pg_row_entries * (sizeof(WT_UPDATE *) + sizeof(WT_UPDATE)));

	return (0);

err:	/*
	 * There's no need to free the page update structures on error, our
	 * caller will discard the page and do that work for us.  We could
	 * similarly leave the per-reference update array alone because it
	 * won't ever be used by any page that's not in-memory, but cleaning
	 * it up makes sense, especially if we come back in to this function
	 * attempting to instantiate this page again.
	 */
	if (page_del != NULL)
		__wt_free(session, page_del->update_list);
	return (ret);
}