1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
|
/*-
* Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
* See the file LICENSE for redistribution information.
*/
/*
* WT_LSM_WORKER_COOKIE --
* State for an LSM worker thread.
*/
struct __wt_lsm_worker_cookie {
WT_LSM_CHUNK **chunk_array;
size_t chunk_alloc;
u_int nchunks;
};
/*
* WT_LSM_WORKER_ARGS --
* State for an LSM worker thread.
*/
struct __wt_lsm_worker_args {
WT_SESSION_IMPL *session; /* Session */
WT_CONDVAR *work_cond; /* Owned by the manager */
wt_thread_t tid; /* Thread id */
bool tid_set; /* Thread id set */
u_int id; /* My manager slot id */
uint32_t type; /* Types of operations handled */
volatile bool running; /* Worker is running */
};
/*
* WT_LSM_CURSOR_CHUNK --
* Iterator struct containing all the LSM cursor access points for a chunk.
*/
struct __wt_lsm_cursor_chunk {
WT_BLOOM *bloom; /* Bloom filter handle for each chunk.*/
WT_CURSOR *cursor; /* Cursor handle for each chunk. */
uint64_t count; /* Number of items in chunk */
uint64_t switch_txn; /* Switch txn for each chunk */
};
/*
* WT_CURSOR_LSM --
* An LSM cursor.
*/
struct __wt_cursor_lsm {
WT_CURSOR iface;
WT_LSM_TREE *lsm_tree;
uint64_t dsk_gen;
u_int nchunks; /* Number of chunks in the cursor */
u_int nupdates; /* Updates needed (including
snapshot isolation checks). */
WT_CURSOR *current; /* The current cursor for iteration */
WT_LSM_CHUNK *primary_chunk; /* The current primary chunk */
WT_LSM_CURSOR_CHUNK **chunks; /* Array of LSM cursor units */
size_t chunks_alloc; /* Current size iterators array */
size_t chunks_count; /* Current number of iterators */
u_int update_count; /* Updates performed. */
#define WT_CLSM_ACTIVE 0x001 /* Incremented the session count */
#define WT_CLSM_BULK 0x002 /* Open for snapshot isolation */
#define WT_CLSM_ITERATE_NEXT 0x004 /* Forward iteration */
#define WT_CLSM_ITERATE_PREV 0x008 /* Backward iteration */
#define WT_CLSM_MERGE 0x010 /* Merge cursor, don't update */
#define WT_CLSM_MINOR_MERGE 0x020 /* Minor merge, include tombstones */
#define WT_CLSM_MULTIPLE 0x040 /* Multiple cursors have values for the
current key */
#define WT_CLSM_OPEN_READ 0x080 /* Open for reads */
#define WT_CLSM_OPEN_SNAPSHOT 0x100 /* Open for snapshot isolation */
uint32_t flags;
};
/*
* WT_LSM_CHUNK --
* A single chunk (file) in an LSM tree.
*/
struct __wt_lsm_chunk {
const char *uri; /* Data source for this chunk */
const char *bloom_uri; /* URI of Bloom filter, if any */
struct timespec create_ts; /* Creation time (for rate limiting) */
uint64_t count; /* Approximate count of records */
uint64_t size; /* Final chunk size */
uint64_t switch_txn; /*
* Largest transaction that can write
* to this chunk, set by a worker
* thread when the chunk is switched
* out, or by compact to get the most
* recent chunk flushed.
*/
uint32_t id; /* ID used to generate URIs */
uint32_t generation; /* Merge generation */
uint32_t refcnt; /* Number of worker thread references */
uint32_t bloom_busy; /* Number of worker thread references */
int8_t empty; /* 1/0: checkpoint missing */
int8_t evicted; /* 1/0: in-memory chunk was evicted */
uint8_t flushing; /* 1/0: chunk flush in progress */
#define WT_LSM_CHUNK_BLOOM 0x01
#define WT_LSM_CHUNK_MERGING 0x02
#define WT_LSM_CHUNK_ONDISK 0x04
#define WT_LSM_CHUNK_STABLE 0x08
uint32_t flags;
};
/*
* Different types of work units. Used by LSM worker threads to choose which
* type of work they will execute, and by work units to define which action
* is required.
*/
#define WT_LSM_WORK_BLOOM 0x01 /* Create a bloom filter */
#define WT_LSM_WORK_DROP 0x02 /* Drop unused chunks */
#define WT_LSM_WORK_FLUSH 0x04 /* Flush a chunk to disk */
#define WT_LSM_WORK_MERGE 0x08 /* Look for a tree merge */
#define WT_LSM_WORK_SWITCH 0x10 /* Switch to new in-memory chunk */
/*
* WT_LSM_WORK_UNIT --
* A definition of maintenance that an LSM tree needs done.
*/
struct __wt_lsm_work_unit {
TAILQ_ENTRY(__wt_lsm_work_unit) q; /* Worker unit queue */
uint32_t type; /* Type of operation */
#define WT_LSM_WORK_FORCE 0x0001 /* Force operation */
uint32_t flags; /* Flags for operation */
WT_LSM_TREE *lsm_tree;
};
/*
* WT_LSM_MANAGER --
* A structure that holds resources used to manage any LSM trees in a
* database.
*/
struct __wt_lsm_manager {
/*
* Queues of work units for LSM worker threads. We maintain three
* queues, to allow us to keep each queue FIFO, rather than needing
* to manage the order of work by shuffling the queue order.
* One queue for switches - since switches should never wait for other
* work to be done.
* One queue for application requested work. For example flushing
* and creating bloom filters.
* One queue that is for longer running operations such as merges.
*/
TAILQ_HEAD(__wt_lsm_work_switch_qh, __wt_lsm_work_unit) switchqh;
TAILQ_HEAD(__wt_lsm_work_app_qh, __wt_lsm_work_unit) appqh;
TAILQ_HEAD(__wt_lsm_work_manager_qh, __wt_lsm_work_unit) managerqh;
WT_SPINLOCK switch_lock; /* Lock for switch queue */
WT_SPINLOCK app_lock; /* Lock for application queue */
WT_SPINLOCK manager_lock; /* Lock for manager queue */
WT_CONDVAR *work_cond; /* Used to notify worker of activity */
uint32_t lsm_workers; /* Current number of LSM workers */
uint32_t lsm_workers_max;
#define WT_LSM_MAX_WORKERS 20
#define WT_LSM_MIN_WORKERS 3
WT_LSM_WORKER_ARGS lsm_worker_cookies[WT_LSM_MAX_WORKERS];
#define WT_LSM_MANAGER_SHUTDOWN 0x01 /* Manager has shut down */
uint32_t flags;
};
/*
* The value aggressive needs to get to before it influences how merges
* are chosen. The default value translates to enough level 0 chunks being
* generated to create a second level merge.
*/
#define WT_LSM_AGGRESSIVE_THRESHOLD 2
/*
* WT_LSM_TREE --
* An LSM tree.
*/
struct __wt_lsm_tree {
const char *name, *config, *filename;
const char *key_format, *value_format;
const char *bloom_config, *file_config;
WT_COLLATOR *collator;
const char *collator_name;
int collator_owned;
uint32_t refcnt; /* Number of users of the tree */
WT_SESSION_IMPL *excl_session; /* Session has exclusive lock */
#define LSM_TREE_MAX_QUEUE 100
uint32_t queue_ref;
WT_RWLOCK rwlock;
TAILQ_ENTRY(__wt_lsm_tree) q;
uint64_t dsk_gen;
uint64_t ckpt_throttle; /* Rate limiting due to checkpoints */
uint64_t merge_throttle; /* Rate limiting due to merges */
uint64_t chunk_fill_ms; /* Estimate of time to fill a chunk */
struct timespec last_flush_ts; /* Timestamp last flush finished */
uint64_t chunks_flushed; /* Count of chunks flushed since open */
struct timespec merge_aggressive_ts;/* Timestamp for merge aggression */
struct timespec work_push_ts; /* Timestamp last work unit added */
uint64_t merge_progressing; /* Bumped when merges are active */
uint32_t merge_syncing; /* Bumped when merges are syncing */
/* Configuration parameters */
uint32_t bloom_bit_count;
uint32_t bloom_hash_count;
uint32_t chunk_count_limit; /* Limit number of chunks */
uint64_t chunk_size;
uint64_t chunk_max; /* Maximum chunk a merge creates */
u_int merge_min, merge_max;
#define WT_LSM_BLOOM_MERGED 0x00000001
#define WT_LSM_BLOOM_OFF 0x00000002
#define WT_LSM_BLOOM_OLDEST 0x00000004
uint32_t bloom; /* Bloom creation policy */
WT_LSM_CHUNK **chunk; /* Array of active LSM chunks */
size_t chunk_alloc; /* Space allocated for chunks */
uint32_t nchunks; /* Number of active chunks */
uint32_t last; /* Last allocated ID */
bool modified; /* Have there been updates? */
WT_LSM_CHUNK **old_chunks; /* Array of old LSM chunks */
size_t old_alloc; /* Space allocated for old chunks */
u_int nold_chunks; /* Number of old chunks */
uint32_t freeing_old_chunks; /* Whether chunks are being freed */
uint32_t merge_aggressiveness; /* Increase amount of work per merge */
/*
* We maintain a set of statistics outside of the normal statistics
* area, copying them into place when a statistics cursor is created.
*/
#define WT_LSM_TREE_STAT_INCR(session, fld) do { \
if (WT_STAT_ENABLED(session)) \
++(fld); \
} while (0)
#define WT_LSM_TREE_STAT_INCRV(session, fld, v) do { \
if (WT_STAT_ENABLED(session)) \
(fld) += (int64_t)(v); \
} while (0)
int64_t bloom_false_positive;
int64_t bloom_hit;
int64_t bloom_miss;
int64_t lsm_checkpoint_throttle;
int64_t lsm_lookup_no_bloom;
int64_t lsm_merge_throttle;
/*
* Following fields used to be flags but are susceptible to races.
* Don't merge them with flags.
*/
bool active; /* The tree is open for business */
bool aggressive_timer_enabled; /* Timer for merge aggression enabled */
bool need_switch; /* New chunk needs creating */
/*
* flags here are not protected for concurrent access, don't put
* anything here that is susceptible to races.
*/
#define WT_LSM_TREE_COMPACTING 0x01 /* Tree being compacted */
#define WT_LSM_TREE_MERGES 0x02 /* Tree should run merges */
#define WT_LSM_TREE_OPEN 0x04 /* The tree is open */
#define WT_LSM_TREE_THROTTLE 0x08 /* Throttle updates */
uint32_t flags;
};
/*
* WT_LSM_DATA_SOURCE --
* Implementation of the WT_DATA_SOURCE interface for LSM.
*/
struct __wt_lsm_data_source {
WT_DATA_SOURCE iface;
WT_RWLOCK *rwlock;
};
|