summaryrefslogtreecommitdiff
path: root/src/btree/bt_walk.c
blob: 38aa8eb3eb841dc29ab484796ca9ea9d00076a7b (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
/*-
 * Copyright (c) 2008-2012 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __wt_tree_np --
 *	Move to the next/previous page in the tree.
 */
int
__wt_tree_np(WT_SESSION_IMPL *session, WT_PAGE **pagep, int eviction, int next)
{
	WT_BTREE *btree;
	WT_PAGE *page, *t;
	WT_REF *ref;
	uint32_t slot;
	int ret;

	btree = session->btree;
	ret = 0;

	/*
	 * Take a copy of any returned page; we have a hazard reference on the
	 * page, by definition.
	 */
	page = *pagep;
	*pagep = NULL;

	/* If no page is active, begin a walk from the start of the tree. */
	if (page == NULL) {
		if ((page = btree->root_page) == NULL)
			return (0);
		slot = next ? 0 : page->entries - 1;
		goto descend;
	}

	/* If the active page was the root, we've reached the walk's end. */
	if (WT_PAGE_IS_ROOT(page))
		return (0);

	/* Figure out the current slot in the parent page's WT_REF array. */
	t = page->parent;
	slot = (uint32_t)(page->ref - t->u.intl.t);

	/*
	 * Swap our hazard reference for the hazard reference of our parent,
	 * if it's not the root page (we could access it directly because we
	 * know it's in memory, but we need a hazard reference).  Don't leave
	 * a hazard reference dangling on error.
	 *
	 * We're hazard-reference coupling up the tree and that's OK: first,
	 * hazard references can't deadlock, so there's none of the usual
	 * problems found when logically locking up a Btree; second, we don't
	 * release our current hazard reference until we have our parent's
	 * hazard reference.  If the eviction thread tries to evict the active
	 * page, that fails because of our hazard reference.  If eviction tries
	 * to evict our parent, that fails because the parent has a child page
	 * that can't be discarded.
	 */
	if (eviction) {
		if (!WT_PAGE_IS_ROOT(t)) {
			while (!WT_ATOMIC_CAS(t->ref->state,
			    WT_REF_MEM, WT_REF_EVICT_WALK))
				__wt_yield();
		}
		WT_ASSERT(session, page->ref->state == WT_REF_EVICT_WALK);
		page->ref->state = WT_REF_MEM;
	} else {
		if (!WT_PAGE_IS_ROOT(t))
			ret = __wt_page_in(session, t, t->ref);
		__wt_page_release(session, page);
		WT_RET(ret);
	}
	page = t;

	/*
	 * If we're at the last/first slot on the page, return this page in
	 * post-order traversal.  Otherwise we move to the next/prev slot
	 * and left/right-most element in its subtree.
	 */
	for (;;) {
		if ((!next && slot == 0) ||
		    (next && slot == page->entries - 1)) {
			*pagep = page;
			return (0);
		}
		if (next)
			++slot;
		else
			--slot;

descend:	for (;;) {
			if (page->type == WT_PAGE_ROW_INT ||
			    page->type == WT_PAGE_COL_INT)
				ref = &page->u.intl.t[slot];
			else {
				*pagep = page;
				return (0);
			}

			/* We may only care about in-memory pages. */
			if (eviction) {
				if (!WT_ATOMIC_CAS(ref->state,
				    WT_REF_MEM, WT_REF_EVICT_WALK))
					break;
				if (!WT_PAGE_IS_ROOT(page)) {
					WT_ASSERT(session, page->ref->state ==
					    WT_REF_EVICT_WALK);
					page->ref->state = WT_REF_MEM;
				}
			} else {
				/*
				 * Swap hazard references at each level (but
				 * don't leave a hazard reference dangling on
				 * error).
				 */
				ret = __wt_page_in(session, page, ref);
				__wt_page_release(session, page);
				WT_RET(ret);
			}

			page = ref->page;
			WT_ASSERT(session, page != NULL);
			slot = next ? 0 : page->entries - 1;
		}
	}
	/* NOTREACHED */
}