summaryrefslogtreecommitdiff
path: root/src/include/cursor.i
blob: c9026c2ee8fef8a4964345dc43bd2ea4b42c63c4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
/*-
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

/*
 * __cursor_set_recno --
 *	Store a record number in the btree cursor and mirror the same value
 * into the cursor interface, so the two copies are always updated together.
 */
static inline void
__cursor_set_recno(WT_CURSOR_BTREE *cbt, uint64_t v)
{
	/* Underlying cursor first, then the interface's copy of it. */
	cbt->recno = v;
	cbt->iface.recno = cbt->recno;
}

/*
 * __cursor_search_clear --
 *	Put the cursor's search-related fields back into their "no position"
 * state before a new search.
 */
static inline void
__cursor_search_clear(WT_CURSOR_BTREE *cbt)
{
	/* Clear every flag other than WT_CBT_ACTIVE. */
	F_CLR(cbt, ~WT_CBT_ACTIVE);

	/*
	 * Page position: the caller is expected to have already released any
	 * page the cursor held, so the reference is simply forgotten here.
	 * The slot, record number and comparison result are set to values
	 * that are not legal for a positioned cursor.
	 */
	cbt->ref = NULL;
	cbt->slot = UINT32_MAX;			/* Fail big */
	cbt->recno = 0;				/* Illegal value */
	cbt->compare = 2;			/* Illegal value */

	/*
	 * Insert list position: only the first insert stack entry is cleared,
	 * clearing the whole stack is more expensive and isn't done.
	 */
	cbt->ins_head = NULL;
	cbt->ins = NULL;
	cbt->ins_stack[0] = NULL;

	/* Saved column and row slot references. */
	cbt->cip_saved = NULL;
	cbt->rip_saved = NULL;
}

/*
 * __cursor_enter --
 *	Count one more active cursor in the session.
 *
 *	Returns 0 on success, or the error from the cache-full check, in which
 * case the cursor count is left unchanged.
 */
static inline int
__cursor_enter(WT_SESSION_IMPL *session)
{
	/* Other cursors are already positioned: just count this one. */
	if (session->ncursors != 0) {
		++session->ncursors;
		return (0);
	}

	/*
	 * This is the first positioned cursor in the session: check whether
	 * the cache is full, then get a snapshot if necessary, and only then
	 * count the cursor.
	 */
	WT_RET(__wt_cache_full_check(session));
	__wt_txn_read_first(session);
	++session->ncursors;
	return (0);
}

/*
 * __cursor_leave --
 *	Count one fewer active cursor in the session.
 *
 *	Always returns 0.
 */
static inline int
__cursor_leave(WT_SESSION_IMPL *session)
{
	/* There must be an active cursor to deactivate. */
	WT_ASSERT(session, session->ncursors > 0);

	session->ncursors -= 1;
	if (session->ncursors != 0)
		return (0);

	/*
	 * That was the last active cursor in the session, so any snapshot
	 * being held for read committed isolation can be released.
	 */
	__wt_txn_read_last(session);
	return (0);
}

/*
 * __curfile_enter --
 *	Activate a file cursor: enter the cursor's session and, if that
 * succeeds, mark the cursor active.
 *
 *	Returns 0 on success, or the error from __cursor_enter, in which case
 * the active flag is not set.
 */
static inline int
__curfile_enter(WT_CURSOR_BTREE *cbt)
{
	WT_RET(__cursor_enter((WT_SESSION_IMPL *)cbt->iface.session));

	F_SET(cbt, WT_CBT_ACTIVE);
	return (0);
}

/*
 * __curfile_leave --
 *	Clear a file cursor's position: deactivate the cursor if it was
 * active, then release the page reference it holds.
 *
 *	Returns 0 on success, or the error from deactivating the cursor or
 * from releasing the page.  The cursor's page reference is cleared whether
 * or not the release succeeds.
 */
static inline int
__curfile_leave(WT_CURSOR_BTREE *cbt)
{
	WT_SESSION_IMPL *session;
	int ret;

	session = (WT_SESSION_IMPL *)cbt->iface.session;

	/* If the cursor was active, deactivate it. */
	if (F_ISSET(cbt, WT_CBT_ACTIVE)) {
		WT_RET(__cursor_leave(session));
		F_CLR(cbt, WT_CBT_ACTIVE);
	}

	/*
	 * Release any page references we're holding.  This can trigger
	 * eviction (e.g., forced eviction of big pages), so it is important to
	 * do it after releasing our snapshot above.
	 *
	 * Clear the reference regardless of the result: if the release fails
	 * and the reference is left in place, the next call would attempt to
	 * release the same reference a second time.
	 */
	ret = __wt_page_release(session, cbt->ref);
	cbt->ref = NULL;

	return (ret);
}

/*
 * __cursor_func_init --
 *	Common setup at the start of a cursor call.
 *
 *	If reenter is non-zero, the cursor first gives up its current position.
 * A cursor that is not active afterwards is activated, and the transaction
 * layer is then told a cursor operation is starting.
 *
 *	Returns 0 on success, or the error from leaving or entering the cursor.
 */
static inline int
__cursor_func_init(WT_CURSOR_BTREE *cbt, int reenter)
{
	/* Optionally drop the existing position before starting over. */
	if (reenter)
		WT_RET(__curfile_leave(cbt));

	/* Activate the cursor unless it is already active. */
	if (!F_ISSET(cbt, WT_CBT_ACTIVE))
		WT_RET(__curfile_enter(cbt));

	__wt_txn_cursor_op((WT_SESSION_IMPL *)cbt->iface.session);
	return (0);
}

/*
 * __cursor_error_resolve --
 *	Resolve the cursor's state for return on error.
 *
 *	Returns 0 on success, or the error from clearing the cursor's position.
 * The cursor's search state is cleared in either case.
 */
static inline int
__cursor_error_resolve(WT_CURSOR_BTREE *cbt)
{
	int ret;

	/*
	 * On error, we can't iterate, so clear the cursor's position and
	 * release any page references we're holding.
	 */
	ret = __curfile_leave(cbt);

	/*
	 * Clear the cursor's search state even if leaving failed: this
	 * function runs on an error path, and returning early would leave the
	 * old slot, insert-list and comparison state behind in a cursor that
	 * no longer has a usable position.  The failure is still reported.
	 */
	__cursor_search_clear(cbt);

	return (ret);
}

/*
 * __cursor_row_slot_return --
 *	Return a row-store leaf page slot's K/V pair.
 *
 *	cbt is the btree cursor; its iface.key and iface.value items are set to
 * reference the returned key and value.  rip is the row slot on the page
 * referenced by cbt->ref.  upd, if not NULL, is the update whose data is
 * returned as the value in place of anything on the page.
 *
 *	Returns 0 on success.  A failure from growing the key buffer, building
 * the key, or referencing the value cell is passed back through WT_RET.
 */
static inline int
__cursor_row_slot_return(WT_CURSOR_BTREE *cbt, WT_ROW *rip, WT_UPDATE *upd)
{
	WT_BTREE *btree;
	WT_ITEM *kb, *vb;
	WT_CELL *cell;
	WT_CELL_UNPACK *unpack, _unpack;
	WT_IKEY *ikey;
	WT_PAGE *page;
	WT_SESSION_IMPL *session;
	int key_unpacked;
	void *copy;

	session = (WT_SESSION_IMPL *)cbt->iface.session;
	btree = S2BT(session);
	page = cbt->ref->page;

	/*
	 * key_unpacked records whether the unpack structure was filled in
	 * while handling the key; if so, the value cell is taken from it below
	 * instead of being looked up separately.
	 */
	unpack = &_unpack;
	key_unpacked = 0;

	/* The key and value items handed back through the cursor interface. */
	kb = &cbt->iface.key;
	vb = &cbt->iface.value;

	/*
	 * The row-store key can change underfoot; explicitly take a copy.
	 */
	copy = WT_ROW_KEY_COPY(rip);

	/*
	 * Get a reference to the key, ideally without doing a copy: we could
	 * call __wt_row_leaf_key, but if a cursor is running through the tree,
	 * we actually have more information here than that function has, we
	 * may have the prefix-compressed key that comes immediately before the
	 * one we want.
	 *
	 * If the key can be accessed directly, or has been instantiated (the
	 * key points off-page), we don't have any work to do.
	 *
	 * If the key points on-page, we have a copy of a WT_CELL value that can
	 * be processed, regardless of what any other thread is doing.
	 */
	if (F_ISSET_ATOMIC(page, WT_PAGE_DIRECT_KEY))
		__wt_row_leaf_direct(page, copy, kb);
	else if (__wt_off_page(page, copy)) {
		/* Instantiated key: reference the WT_IKEY's data in place. */
		ikey = copy;
		kb->data = WT_IKEY_DATA(ikey);
		kb->size = ikey->size;
	} else {
		/*
		 * If the key is simple and on-page and not prefix-compressed,
		 * or we have the previous expanded key in the cursor buffer,
		 * reference or build it.  Else, call __wt_row_leaf_key_work to
		 * do it the hard way.
		 */
		/*
		 * When the btree has huffman_key set, skip the inline fast
		 * paths entirely; key_unpacked stays 0 on this path, so the
		 * value cell is looked up separately below.
		 */
		if (btree->huffman_key != NULL)
			goto slow;
		__wt_cell_unpack_with_value(page, copy, unpack);
		key_unpacked = 1;
		if (unpack->type == WT_CELL_KEY && unpack->prefix == 0) {
			/* No prefix: reference the unpacked key bytes as-is. */
			cbt->tmp.data = unpack->data;
			cbt->tmp.size = unpack->size;
		} else if (unpack->type == WT_CELL_KEY &&
		    cbt->rip_saved != NULL && cbt->rip_saved == rip - 1) {
			/*
			 * The key saved in the cursor buffer is for the slot
			 * immediately before this one, so its leading bytes
			 * supply this key's prefix.
			 */
			WT_ASSERT(session, cbt->tmp.size >= unpack->prefix);

			/*
			 * Grow the buffer as necessary as well as ensure data
			 * has been copied into local buffer space, then append
			 * the suffix to the prefix already in the buffer.
			 *
			 * Don't grow the buffer unnecessarily or copy data we
			 * don't need, truncate the item's data length to the
			 * prefix bytes.
			 */
			cbt->tmp.size = unpack->prefix;
			WT_RET(__wt_buf_grow(
			    session, &cbt->tmp, cbt->tmp.size + unpack->size));
			memcpy((uint8_t *)cbt->tmp.data + cbt->tmp.size,
			    unpack->data, unpack->size);
			cbt->tmp.size += unpack->size;
		} else {
			/*
			 * __wt_row_leaf_key_work instead of __wt_row_leaf_key:
			 * we do __wt_row_leaf_key's fast-path checks inline.
			 */
slow:			WT_RET(__wt_row_leaf_key_work(
			    session, page, rip, &cbt->tmp, 0));
		}
		/*
		 * The key is in (or referenced by) the cursor's temporary
		 * buffer: return it, and remember which slot it belongs to for
		 * the "rip - 1" check above on a later call.
		 */
		kb->data = cbt->tmp.data;
		kb->size = cbt->tmp.size;
		cbt->rip_saved = rip;
	}

	/*
	 * If the item was ever modified, use the WT_UPDATE data.  Note that
	 * the caller passes us the update: it has already resolved which one
	 * (if any) is visible.
	 * Else, check for empty data.
	 * Else, use the value from the original disk image.
	 */
	if (upd != NULL) {
		vb->data = WT_UPDATE_DATA(upd);
		vb->size = upd->size;
		return (0);
	}
	/*
	 * Use the value cell found while unpacking the key if we have it,
	 * otherwise look the value up from the row.
	 */
	cell = key_unpacked ? unpack->value : __wt_row_leaf_value(page, rip);
	if (cell == NULL) {
		/* No value cell: return a zero-length value. */
		vb->data = "";
		vb->size = 0;
	} else {
		/*
		 * The unpack structure is reused for the value cell; the key
		 * has already been returned, so its unpacked form is no longer
		 * needed.
		 */
		__wt_cell_unpack(cell, unpack);
		WT_RET(__wt_page_cell_data_ref(
		    session, cbt->ref->page, unpack, vb));
	}

	return (0);
}