summaryrefslogtreecommitdiff
path: root/src/btree/row_key.c
blob: 99ee34a6c5d19a64c96c624dd62952f89678b98d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
/*-
 * Copyright (c) 2014-2016 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

static void __inmem_row_leaf_slots(uint8_t *, uint32_t, uint32_t, uint32_t);

/*
 * __wt_row_leaf_keys --
 *	Instantiate the interesting keys for random search of a page.
 */
int
__wt_row_leaf_keys(WT_SESSION_IMPL *session, WT_PAGE *page)
{
	WT_BTREE *btree;
	WT_DECL_ITEM(key);
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_ROW *rip;
	uint32_t gap, i;

	btree = S2BT(session);

	if (page->pg_row_entries == 0) {		/* Just checking... */
		F_SET_ATOMIC(page, WT_PAGE_BUILD_KEYS);
		return (0);
	}

	/*
	 * Row-store leaf pages are written as one big prefix-compressed chunk,
	 * that is, only the first key on the page is not prefix-compressed, and
	 * to instantiate the last key on the page, you have to take the first
	 * key on the page and roll it forward to the end of the page.  We don't
	 * want to do that on every page access, of course, so we instantiate a
	 * set of keys, essentially creating prefix chunks on the page, where we
	 * can roll forward from the closest, previous, instantiated key.  The
	 * complication is that not all keys on a page are equal: we're doing a
	 * binary search on the  page, which means there are keys we look at a
	 * lot (every time we search the page), and keys we never look at unless
	 * they are actually being searched for.  This function figures out the
	 * "interesting" keys on a page, and then we sequentially walk that list
	 * instantiating those keys.
	 *
	 * Allocate a bit array and figure out the set of "interesting" keys,
	 * marking up the array.
	 */
	WT_RET(__wt_scr_alloc(session, 0, &key));
	WT_RET(__wt_scr_alloc(session,
	    (uint32_t)__bitstr_size(page->pg_row_entries), &tmp));
	memset(tmp->mem, 0, tmp->memsize);

	if ((gap = btree->key_gap) == 0)
		gap = 1;
	__inmem_row_leaf_slots(tmp->mem, 0, page->pg_row_entries, gap);

	/* Instantiate the keys. */
	for (rip = page->pg_row_d, i = 0; i < page->pg_row_entries; ++rip, ++i)
		if (__bit_test(tmp->mem, i))
			WT_ERR(__wt_row_leaf_key_work(
			    session, page, rip, key, true));

	F_SET_ATOMIC(page, WT_PAGE_BUILD_KEYS);

err:	__wt_scr_free(session, &key);
	__wt_scr_free(session, &tmp);
	return (ret);
}

/*
 * __inmem_row_leaf_slots --
 *	Figure out the interesting slots of a page for random search, up to
 * the specified depth.
 */
static void
__inmem_row_leaf_slots(
    uint8_t *list, uint32_t base, uint32_t entries, uint32_t gap)
{
	uint32_t indx, limit;

	if (entries < gap)
		return;

	/*
	 * !!!
	 * Don't clean this code up -- it deliberately looks like the binary
	 * search code.
	 *
	 * !!!
	 * There's got to be a function that would give me this information, but
	 * I don't see any performance reason we can't just do this recursively.
	 */
	limit = entries;
	indx = base + (limit >> 1);
	__bit_set(list, indx);

	__inmem_row_leaf_slots(list, base, limit >> 1, gap);

	base = indx + 1;
	--limit;
	__inmem_row_leaf_slots(list, base, limit >> 1, gap);
}

/*
 * __wt_row_leaf_key_copy --
 *	Get a copy of a row-store leaf-page key.
 */
int
__wt_row_leaf_key_copy(
    WT_SESSION_IMPL *session, WT_PAGE *page, WT_ROW *rip, WT_ITEM *key)
{
	WT_RET(__wt_row_leaf_key(session, page, rip, key, false));

	/* The return buffer may only hold a reference to a key, copy it. */
	if (!WT_DATA_IN_ITEM(key))
		WT_RET(__wt_buf_set(session, key, key->data, key->size));

	return (0);
}

/*
 * __wt_row_leaf_key_work --
 *	Return a reference to, a row-store leaf-page key, optionally instantiate
 * the key into the in-memory page.
 */
int
__wt_row_leaf_key_work(WT_SESSION_IMPL *session,
    WT_PAGE *page, WT_ROW *rip_arg, WT_ITEM *keyb, bool instantiate)
{
	enum { FORWARD, BACKWARD } direction;
	WT_BTREE *btree;
	WT_CELL *cell;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_DECL_ITEM(tmp);
	WT_DECL_RET;
	WT_IKEY *ikey;
	WT_ROW *rip, *jump_rip;
	size_t size;
	u_int last_prefix;
	int jump_slot_offset, slot_offset;
	void *copy;
	const void *p;

	/*
	 * !!!
	 * It is unusual to call this function: most code should be calling the
	 * front-end, __wt_row_leaf_key, be careful if you're calling this code
	 * directly.
	 */

	btree = S2BT(session);
	unpack = &_unpack;
	rip = rip_arg;

	jump_rip = NULL;
	jump_slot_offset = 0;
	last_prefix = 0;

	p = NULL;			/* -Werror=maybe-uninitialized */
	size = 0;			/* -Werror=maybe-uninitialized */

	direction = BACKWARD;
	for (slot_offset = 0;;) {
		if (0) {
switch_and_jump:	/* Switching to a forward roll. */
			WT_ASSERT(session, direction == BACKWARD);
			direction = FORWARD;

			/* Skip list of keys with compatible prefixes. */
			rip = jump_rip;
			slot_offset = jump_slot_offset;
		}
		copy = WT_ROW_KEY_COPY(rip);

		/*
		 * Figure out what the key looks like.
		 */
		(void)__wt_row_leaf_key_info(
		    page, copy, &ikey, &cell, &p, &size);

		/* 1: the test for a directly referenced on-page key. */
		if (cell == NULL) {
			keyb->data = p;
			keyb->size = size;

			/*
			 * If this is the key we originally wanted, we don't
			 * care if we're rolling forward or backward, or if
			 * it's an overflow key or not, it's what we wanted.
			 * This shouldn't normally happen, the fast-path code
			 * that front-ends this function will have figured it
			 * out before we were called.
			 *
			 * The key doesn't need to be instantiated, skip past
			 * that test.
			 */
			if (slot_offset == 0)
				goto done;

			/*
			 * This key is not an overflow key by definition and
			 * isn't compressed in any way, we can use it to roll
			 * forward.
			 *	If rolling backward, switch directions.
			 *	If rolling forward: there's a bug somewhere,
			 * we should have hit this key when rolling backward.
			 */
			goto switch_and_jump;
		}

		/* 2: the test for an instantiated off-page key. */
		if (ikey != NULL) {
			/*
			 * If this is the key we originally wanted, we don't
			 * care if we're rolling forward or backward, or if
			 * it's an overflow key or not, it's what we wanted.
			 * Take a copy and wrap up.
			 *
			 * The key doesn't need to be instantiated, skip past
			 * that test.
			 */
			if (slot_offset == 0) {
				keyb->data = p;
				keyb->size = size;
				goto done;
			}

			/*
			 * If we wanted a different key and this key is an
			 * overflow key:
			 *	If we're rolling backward, this key is useless
			 * to us because it doesn't have a valid prefix: keep
			 * rolling backward.
			 *	If we're rolling forward, there's no work to be
			 * done because prefixes skip overflow keys: keep
			 * rolling forward.
			 */
			if (__wt_cell_type(cell) == WT_CELL_KEY_OVFL)
				goto next;

			/*
			 * If we wanted a different key and this key is not an
			 * overflow key, it has a valid prefix, we can use it.
			 *	If rolling backward, take a copy of the key and
			 * switch directions, we can roll forward from this key.
			 *	If rolling forward, replace the key we've been
			 * building with this key, it's what we would have built
			 * anyway.
			 * In short: if it's not an overflow key, take a copy
			 * and roll forward.
			 */
			keyb->data = p;
			keyb->size = size;
			direction = FORWARD;
			goto next;
		}

		/*
		 * It must be an on-page cell, unpack it.
		 */
		__wt_cell_unpack(cell, unpack);

		/* 3: the test for an on-page reference to an overflow key. */
		if (unpack->type == WT_CELL_KEY_OVFL) {
			/*
			 * If this is the key we wanted from the start, we don't
			 * care if it's an overflow key, get a copy and wrap up.
			 *
			 * Avoid racing with reconciliation deleting overflow
			 * keys.  Deleted overflow keys must be instantiated
			 * first, acquire the overflow lock and check.  Read
			 * the key if we still need to do so, but holding the
			 * overflow lock.  Note we are not using the version of
			 * the cell-data-ref calls that acquire the overflow
			 * lock and do a look-aside into the tracking cache:
			 * this is an overflow key, not a value, meaning it's
			 * instantiated before being deleted, not copied into
			 * the tracking cache.
			 */
			if (slot_offset == 0) {
				__wt_readlock(session, btree->ovfl_lock);
				copy = WT_ROW_KEY_COPY(rip);
				if (!__wt_row_leaf_key_info(page, copy,
				    NULL, &cell, &keyb->data, &keyb->size)) {
					__wt_cell_unpack(cell, unpack);
					ret = __wt_dsk_cell_data_ref(session,
					    WT_PAGE_ROW_LEAF, unpack, keyb);
				}
				__wt_readunlock(session, btree->ovfl_lock);
				WT_ERR(ret);
				break;
			}

			/*
			 * If we wanted a different key:
			 *	If we're rolling backward, this key is useless
			 * to us because it doesn't have a valid prefix: keep
			 * rolling backward.
			 *	If we're rolling forward, there's no work to be
			 * done because prefixes skip overflow keys: keep
			 * rolling forward.
			 */
			goto next;
		}

		/*
		 * 4: the test for an on-page reference to a key that isn't
		 * prefix compressed.
		 */
		if (unpack->prefix == 0) {
			/*
			 * The only reason to be here is a Huffman encoded key,
			 * a non-encoded key with no prefix compression should
			 * have been directly referenced, and we should not have
			 * needed to unpack its cell.
			 */
			WT_ASSERT(session, btree->huffman_key != NULL);

			/*
			 * If this is the key we originally wanted, we don't
			 * care if we're rolling forward or backward, it's
			 * what we want.  Take a copy and wrap up.
			 *
			 * If we wanted a different key, this key has a valid
			 * prefix, we can use it.
			 *	If rolling backward, take a copy of the key and
			 * switch directions, we can roll forward from this key.
			 *	If rolling forward there's a bug, we should have
			 * found this key while rolling backwards and switched
			 * directions then.
			 *
			 * The key doesn't need to be instantiated, skip past
			 * that test.
			 */
			WT_ERR(__wt_dsk_cell_data_ref(
			    session, WT_PAGE_ROW_LEAF, unpack, keyb));
			if (slot_offset == 0)
				goto done;
			goto switch_and_jump;
		}

		/*
		 * 5: an on-page reference to a key that's prefix compressed.
		 *	If rolling backward, keep looking for something we can
		 * use.
		 *	If rolling forward, build the full key and keep rolling
		 * forward.
		 */
		if (direction == BACKWARD) {
			/*
			 * If there's a set of keys with identical prefixes, we
			 * don't want to instantiate each one, the prefixes are
			 * all the same.
			 *
			 * As we roll backward through the page, track the last
			 * time the prefix decreased in size, so we can start
			 * with that key during our roll-forward.  For a page
			 * populated with a single key prefix, we'll be able to
			 * instantiate the key we want as soon as we find a key
			 * without a prefix.
			 */
			if (slot_offset == 0)
				last_prefix = unpack->prefix;
			if (slot_offset == 0 || last_prefix > unpack->prefix) {
				jump_rip = rip;
				jump_slot_offset = slot_offset;
				last_prefix = unpack->prefix;
			}
		}
		if (direction == FORWARD) {
			/*
			 * Get a reference to the current key's bytes.  Usually
			 * we want bytes from the page, fast-path that case.
			 */
			if (btree->huffman_key == NULL) {
				p = unpack->data;
				size = unpack->size;
			} else {
				if (tmp == NULL)
					WT_ERR(
					__wt_scr_alloc(session, 0, &tmp));
				WT_ERR(__wt_dsk_cell_data_ref(
				    session, WT_PAGE_ROW_LEAF, unpack, tmp));
				p = tmp->data;
				size = tmp->size;
			}

			/*
			 * Grow the buffer as necessary as well as ensure data
			 * has been copied into local buffer space, then append
			 * the suffix to the prefix already in the buffer.
			 *
			 * Don't grow the buffer unnecessarily or copy data we
			 * don't need, truncate the item's data length to the
			 * prefix bytes.
			 */
			keyb->size = unpack->prefix;
			WT_ERR(__wt_buf_grow(session, keyb, keyb->size + size));
			memcpy((uint8_t *)keyb->data + keyb->size, p, size);
			keyb->size += size;

			if (slot_offset == 0)
				break;
		}

next:		switch (direction) {
		case  BACKWARD:
			--rip;
			++slot_offset;
			break;
		case FORWARD:
			++rip;
			--slot_offset;
			break;
		}
	}

	/*
	 * Optionally instantiate the key: there's a cost to figuring out a key
	 * value in a leaf page with prefix-compressed or Huffman encoded keys,
	 * amortize the cost by instantiating a copy of the calculated key in
	 * allocated memory.  We don't instantiate keys when pages are first
	 * brought into memory because it's wasted effort if the page is only
	 * read by a cursor in sorted order.  If, instead, the page is read by a
	 * cursor in reverse order, we immediately instantiate periodic keys for
	 * the page (otherwise the reverse walk would be insanely slow).  If,
	 * instead, the page is randomly searched, we instantiate keys as they
	 * are accessed (meaning, for example, as long as the binary search only
	 * touches one-half of the page, the only keys we instantiate will be in
	 * that half of the page).
	 */
	if (instantiate) {
		copy = WT_ROW_KEY_COPY(rip_arg);
		(void)__wt_row_leaf_key_info(
		    page, copy, &ikey, &cell, NULL, NULL);
		if (ikey == NULL) {
			WT_ERR(__wt_row_ikey_alloc(session,
			    WT_PAGE_DISK_OFFSET(page, cell),
			    keyb->data, keyb->size, &ikey));

			/*
			 * Serialize the swap of the key into place: on success,
			 * update the page's memory footprint, on failure, free
			 * the allocated memory.
			 */
			if (__wt_atomic_cas_ptr(
			    (void *)&WT_ROW_KEY_COPY(rip), copy, ikey))
				__wt_cache_page_inmem_incr(session,
				    page, sizeof(WT_IKEY) + ikey->size);
			else
				__wt_free(session, ikey);
		}
	}

done:
err:	__wt_scr_free(session, &tmp);
	return (ret);
}

/*
 * __wt_row_ikey_alloc --
 *	Instantiate a key in a WT_IKEY structure.
 */
int
__wt_row_ikey_alloc(WT_SESSION_IMPL *session,
    uint32_t cell_offset, const void *key, size_t size, WT_IKEY **ikeyp)
{
	WT_IKEY *ikey;

	/*
	 * Allocate memory for the WT_IKEY structure and the key, then copy
	 * the key into place.
	 */
	WT_RET(__wt_calloc(session, 1, sizeof(WT_IKEY) + size, &ikey));
	ikey->size = WT_STORE_SIZE(size);
	ikey->cell_offset = cell_offset;
	memcpy(WT_IKEY_DATA(ikey), key, size);
	*ikeyp = ikey;
	return (0);
}

/*
 * __wt_row_ikey_incr --
 *	Instantiate a key in a WT_IKEY structure and increment the page's
 * memory footprint.
 */
int
__wt_row_ikey_incr(WT_SESSION_IMPL *session, WT_PAGE *page,
    uint32_t cell_offset, const void *key, size_t size, WT_REF *ref)
{
	WT_RET(__wt_row_ikey(session, cell_offset, key, size, ref));

	__wt_cache_page_inmem_incr(session, page, sizeof(WT_IKEY) + size);

	return (0);
}

/*
 * __wt_row_ikey --
 *	Instantiate a key in a WT_IKEY structure.
 */
int
__wt_row_ikey(WT_SESSION_IMPL *session,
    uint32_t cell_offset, const void *key, size_t size, WT_REF *ref)
{
	WT_IKEY *ikey;

	WT_RET(__wt_row_ikey_alloc(session, cell_offset, key, size, &ikey));

#ifdef HAVE_DIAGNOSTIC
	{
	uintptr_t oldv;

	oldv = (uintptr_t)ref->ref_ikey;
	WT_DIAGNOSTIC_YIELD;

	/*
	 * We should never overwrite an instantiated key, and we should
	 * never instantiate a key after a split.
	 */
	WT_ASSERT(session, oldv == 0 || (oldv & WT_IK_FLAG) != 0);
	WT_ASSERT(session, ref->state != WT_REF_SPLIT);
	WT_ASSERT(session,
	    __wt_atomic_cas_ptr(&ref->ref_ikey, (WT_IKEY *)oldv, ikey));
	}
#else
	ref->ref_ikey = ikey;
#endif
	return (0);
}