summaryrefslogtreecommitdiff
path: root/src/btree/row_key.c
blob: 9cff462ed226acd866a1d06ea5214da6285e5e9e (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
/*-
 * Copyright (c) 2008-2012 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

static void __inmem_row_leaf_slots(uint8_t *, uint32_t, uint32_t, uint32_t);

/*
 * __wt_row_leaf_keys --
 *	Instantiate the interesting keys for random search of a page.
 */
int
__wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_BTREE *btree;
	WT_ITEM *tmp;
	WT_ROW *rip;
	uint32_t i;
	int ret;

	btree = session->btree;
	ret = 0;

	if (page->entries == 0) {			/* Just checking... */
		F_SET(page, WT_PAGE_BUILD_KEYS);
		return (0);
	}

	/*
	 * Row-store leaf pages are written as one big prefix-compressed chunk,
	 * that is, only the first key on the page is not prefix-compressed, and
	 * to instantiate the last key on the page, you have to take the first
	 * key on the page and roll it forward to the end of the page.  We don't
	 * want to do that on every page access, of course, so we instantiate a
	 * set of keys, essentially creating prefix chunks on the page, where we
	 * can roll forward from the closest, previous, instantiated key.  The
	 * complication is that not all keys on a page are equal: we're doing a
	 * binary search on the  page, which means there are keys we look at a
	 * lot (every time we search the page), and keys we never look at unless
	 * they are actually being searched for.  This function figures out the
	 * "interesting" keys on a page, and then we sequentially walk that list
	 * instantiating those keys.
	 *
	 * Allocate a bit array and figure out the set of "interesting" keys,
	 * marking up the array.
	 */
	WT_RET(__wt_scr_alloc(session, __bitstr_size(page->entries), &tmp));

	__inmem_row_leaf_slots(tmp->mem, 0, page->entries, btree->key_gap);

	/* Instantiate the keys. */
	for (rip = page->u.row.d, i = 0; i < page->entries; ++rip, ++i)
		if (__bit_test(tmp->mem, i))
			WT_ERR(__wt_row_key(session, page, rip, NULL));

	F_SET(page, WT_PAGE_BUILD_KEYS);

err:	__wt_scr_free(&tmp);
	return (ret);
}

/*
 * __inmem_row_leaf_slots --
 *	Figure out the interesting slots of a page for random search, up to
 * the specified depth.
 */
static void
__inmem_row_leaf_slots(
    uint8_t *list, uint32_t base, uint32_t entries, uint32_t gap)
{
	uint32_t indx, limit;

	if (entries < gap)
		return;

	/*
	 * !!!
	 * Don't clean this code up -- it deliberately looks like the binary
	 * search code.
	 *
	 * !!!
	 * There's got to be a function that would give me this information, but
	 * I don't see any performance reason we can't just do this recursively.
	 */
	limit = entries;
	indx = base + (limit >> 1);
	__bit_set(list, indx);

	__inmem_row_leaf_slots(list, base, limit >> 1, gap);

	base = indx + 1;
	--limit;
	__inmem_row_leaf_slots(list, base, limit >> 1, gap);
}

/*
 * __wt_row_key --
 *	Copy an on-page key into a return buffer, or, if no return buffer
 * is specified, instantiate the key into the in-memory page.
 */
int
__wt_row_key(
    WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip_arg, WT_ITEM *retb)
{
	enum { FORWARD, BACKWARD } direction;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_IKEY *ikey;
	WT_ITEM *tmp;
	WT_ROW *rip;
	int is_local, ret, slot_offset;
	void *key;

	rip = rip_arg;
	tmp = NULL;
	unpack = &_unpack;

	/*
	 * If the caller didn't pass us a buffer, create one.  We don't use
	 * an existing buffer because the memory will be attached to a page
	 * for semi-permanent use, and using an existing buffer might waste
	 * memory if the one allocated from the pool was larger than needed.
	 */
	is_local = 0;
	if (retb == NULL) {
		is_local = 1;
		WT_ERR(__wt_scr_alloc(session, 0, &retb));
	}

	direction = BACKWARD;
	for (slot_offset = 0;;) {
		/*
		 * Multiple threads of control may be searching this page, which
		 * means the key may change underfoot, and here's where it gets
		 * tricky: first, copy the key.  We don't need any barriers, the
		 * key is updated atomically, and we just need a valid copy.
		 */
		key = rip->key;

		/*
		 * Key copied.
		 *
		 * If another thread instantiated the key while we were doing
		 * that, we don't have any work to do.  Figure this out using
		 * the key's value:
		 *
		 * If the key points off-page, another thread updated the key,
		 * we can just use it.
		 *
		 * If the key points on-page, we have a copy of a WT_CELL value
		 * that can be processed, regardless of what any other thread is
		 * doing.
		 *
		 * Overflow keys are not prefix-compressed, we don't want to
		 * read/write them during reconciliation simply because their
		 * prefix might change.  That means we can't use instantiated
		 * overflow keys to figure out the prefix for other keys,
		 * specifically, in this code when we're looking for a key we
		 * can roll-forward to figure out the target key's prefix,
		 * instantiated overflow keys aren't useful.
		 *
		 * 1: the test for an on/off page reference.
		 */
		if (__wt_off_page(page, key)) {
			ikey = key;

			/*
			 * If this is the key we originally wanted, we don't
			 * care if we're rolling forward or backward, or if
			 * it's an overflow key or not, it's what we wanted.
			 * Take a copy and wrap up.
			 */
			if (slot_offset == 0) {
				WT_ERR(__wt_buf_set(session,
				    retb, WT_IKEY_DATA(ikey), ikey->size));
				break;
			}

			__wt_cell_unpack(WT_PAGE_REF_OFFSET(
			    page, ikey->cell_offset), unpack);

			/*
			 * If we wanted a different key and this key is an
			 * overflow key:
			 *	If we're rolling backward, this key is useless
			 * to us because it doesn't have a valid prefix: keep
			 * rolling backward.
			 *	If we're rolling forward, there's no work to be
			 * done because prefixes skip overflow keys: keep
			 * rolling forward.
			 */
			if (unpack->ovfl)
				goto next;

			/*
			 * If we wanted a different key and this key is not an
			 * overflow key, it has a valid prefix, we can use it.
			 *	If rolling backward, take a copy of the key and
			 * switch directions, we can roll forward from this key.
			 *	If rolling forward, replace the key we've been
			 * building with this key, it's what we would have built
			 * anyway.
			 * In short: if it's not an overflow key, take a copy
			 * and roll forward.
			 */
			WT_ERR(__wt_buf_set(
			    session, retb, WT_IKEY_DATA(ikey), ikey->size));
			direction = FORWARD;
			goto next;
		}

		/* Unpack the key's cell. */
		__wt_cell_unpack(key, unpack);

		/* 2: the test for an on-page reference to an overflow key. */
		if (unpack->type == WT_CELL_KEY_OVFL) {
			/*
			 * If this is the key we wanted from the start, we don't
			 * care if it's an overflow key, get a copy and wrap up.
			 */
			if (slot_offset == 0) {
				WT_ERR(__wt_cell_unpack_copy(
				    session, unpack, retb));
				break;
			}

			/*
			 * If we wanted a different key and this key is an
			 * overflow key:
			 *	If we're rolling backward, this key is useless
			 * to us because it doesn't have a valid prefix: keep
			 * rolling backward.
			 *	If we're rolling forward, there's no work to be
			 * done because prefixes skip overflow keys: keep
			 * rolling forward.
			 */
			goto next;
		}

		/*
		 * 3: the test for an on-page reference to a key that isn't
		 * prefix compressed.
		 */
		if (unpack->prefix == 0) {
			/*
			 * If this is the key we originally wanted, we don't
			 * care if we're rolling forward or backward, it's
			 * what we want.  Take a copy and wrap up.
			 *
			 * If we wanted a different key, this key has a valid
			 * prefix we can use it.
			 *	If rolling backward, take a copy of the key and
			 * switch directions, we can roll forward from this key.
			 *	If rolling forward there's a bug, we should have
			 * found this key while rolling backwards and switched
			 * directions then.
			 */
			WT_ERR(__wt_cell_unpack_copy(session, unpack, retb));
			if (slot_offset == 0)
				break;

			WT_ASSERT(session, direction == BACKWARD);
			direction = FORWARD;
			goto next;
		}

		/*
		 * 4: an on-page reference to a key that's prefix compressed.
		 *	If rolling backward, keep looking for something we can
		 * use.
		 *	If rolling forward, build the full key and keep rolling
		 * forward.
		 */
		if (direction == FORWARD) {
			/*
			 * Get a copy of the current key;
			 * Ensure the buffer can hold the key plus the prefix;
			 * Append the key to the prefix (already in the buffer);
			 * Set the final size of the key.
			 */
			if (tmp == NULL)
				WT_ERR(__wt_scr_alloc(session, 0, &tmp));
			WT_ERR(__wt_cell_unpack_copy(session, unpack, tmp));
			WT_ERR(__wt_buf_initsize(
			    session, retb, tmp->size + unpack->prefix));
			memcpy((uint8_t *)
			    retb->data + unpack->prefix, tmp->data, tmp->size);

			if (slot_offset == 0)
				break;
		}

next:		switch (direction) {
		case  BACKWARD:
			--rip;
			++slot_offset;
			break;
		case FORWARD:
			++rip;
			--slot_offset;
			break;
		}
	}

	__wt_scr_free(&tmp);

	/*
	 * If a return buffer was specified, the caller just wants a copy and
	 * no further work is needed.
	 */
	if (!is_local)
		return (0);

	/*
	 * Allocate and initialize a WT_IKEY structure, we're instantiating
	 * this key.
	 */
	WT_ERR(__wt_row_ikey_alloc(session,
	    WT_PAGE_DISK_OFFSET(page, rip_arg->key),
	    retb->data, retb->size, &ikey));

	/* Serialize the swap of the key into place. */
	ret = __wt_row_key_serial(session, page, rip_arg, ikey);

	/*
	 * Free the WT_IKEY structure if the serialized call didn't use it for
	 * the key.
	 */
	if (rip_arg->key != ikey)
		__wt_free(session, ikey);

	__wt_scr_free(&retb);

	return (ret);

err:	if (is_local && retb != NULL)
		__wt_scr_free(&retb);
	__wt_scr_free(&tmp);
	return (ret);
}

/*
 * __wt_row_value --
 *	Return a pointer to the value cell for a row-store leaf page key, or
 * NULL if there isn't one.
 */
WT_CELL *
__wt_row_value(WT_PAGE *page, WT_ROW *rip)
{
	WT_CELL *cell;
	WT_CELL_UNPACK *unpack, _unpack;

	unpack = &_unpack;

	/*
	 * Multiple threads of control may be searching this page, which means
	 * the key may change underfoot, and here's where it gets tricky: first,
	 * copy the key.
	 */
	cell = rip->key;

	/*
	 * Key copied.
	 *
	 * Now, cell either references a WT_IKEY structure that has a value-cell
	 * offset, or references the on-page key WT_CELL, and we can walk past
	 * that to find the value WT_CELL.  Both can be processed regardless of
	 * what other threads are doing.
	 */
	if (__wt_off_page(page, cell))
		cell = WT_PAGE_REF_OFFSET(page, ((WT_IKEY *)cell)->cell_offset);

	/*
	 * Row-store leaf pages may have a single data cell between each key, or
	 * keys may be adjacent (when the data cell is empty).  Move to the next
	 * key.  The page reconciliation code guarantees there is always a key
	 * cell after an empty data cell, so this is safe.
	 */
	__wt_cell_unpack(cell, unpack);
	cell = (WT_CELL *)((uint8_t *)cell + unpack->len);
	if (__wt_cell_type(cell) == WT_CELL_KEY ||
	    __wt_cell_type(cell) == WT_CELL_KEY_OVFL)
		return (NULL);
	return (cell);
}

/*
 * __wt_row_ikey_alloc --
 *	Instantiate a key in a WT_IKEY structure.
 */
int
__wt_row_ikey_alloc(WT_SESSION_IMPL *session,
    uint32_t cell_offset, const void *key, uint32_t size, WT_IKEY **ikeyp)
{
	WT_IKEY *ikey;

	/*
	 * Allocate the WT_IKEY structure and room for the value, then copy
	 * the value into place.
	 */
	WT_RET(__wt_calloc(session, 1, sizeof(WT_IKEY) + size, &ikey));
	ikey->size = size;
	ikey->cell_offset = cell_offset;
	memcpy(WT_IKEY_DATA(ikey), key, size);

	*ikeyp = ikey;
	return (0);
}

/*
 * __wt_row_key_serial_func --
 *	Server function to instantiate a key during a row-store search.
 */
void
__wt_row_key_serial_func(WT_SESSION_IMPL *session)
{
	WT_IKEY *ikey;
	WT_PAGE *page;
	WT_ROW *rip;

	__wt_row_key_unpack(session, &page, &rip, &ikey);

	/*
	 * We don't care about the page's write generation -- there's a simpler
	 * test, if the key we're interested in still needs to be instantiated,
	 * because it can only be in one of two states.
	 */
	if (!__wt_off_page(page, rip->key)) {
		rip->key = ikey;
		__wt_cache_page_inmem_incr(
		    session, page, sizeof(WT_IKEY) + ikey->size);
	}

	__wt_session_serialize_wrapup(session, NULL, 0);
}