1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
|
/*-
* Copyright (c) 2014-2015 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
* See the file LICENSE for redistribution information.
*/
/*
* WiredTiger's block manager interface.
*/
/*
* The file's description is written into the first block of the file, which
* means we can use an offset of 0 as an invalid offset.
*/
#define WT_BLOCK_INVALID_OFFSET 0
/*
* The block manager maintains three per-checkpoint extent lists:
* alloc: the extents allocated in this checkpoint
* avail: the extents available for allocation
* discard: the extents freed in this checkpoint
*
* An extent list is based on two skiplists: first, a by-offset list linking
* WT_EXT elements and sorted by file offset (low-to-high), second, a by-size
* list linking WT_SIZE elements and sorted by chunk size (low-to-high).
*
* Additionally, each WT_SIZE element on the by-size has a skiplist of its own,
* linking WT_EXT elements and sorted by file offset (low-to-high). This list
* has an entry for extents of a particular size.
*
* The trickiness is each individual WT_EXT element appears on two skiplists.
* In order to minimize allocation calls, we allocate a single array of WT_EXT
* pointers at the end of the WT_EXT structure, for both skiplists, and store
* the depth of the skiplist in the WT_EXT structure. The skiplist entries for
* the offset skiplist start at WT_EXT.next[0] and the entries for the size
* skiplist start at WT_EXT.next[WT_EXT.depth].
*
* One final complication: we only maintain the per-size skiplist for the avail
* list, the alloc and discard extent lists are not searched based on size.
*/
/*
* WT_EXTLIST --
* An extent list.
*/
struct __wt_extlist {
char *name; /* Name */
uint64_t bytes; /* Byte count */
uint32_t entries; /* Entry count */
wt_off_t offset; /* Written extent offset */
uint32_t cksum, size; /* Written extent cksum, size */
bool track_size; /* Maintain per-size skiplist */
WT_EXT *last; /* Cached last element */
WT_EXT *off[WT_SKIP_MAXDEPTH]; /* Size/offset skiplists */
WT_SIZE *sz[WT_SKIP_MAXDEPTH];
};
/*
* WT_EXT --
* Encapsulation of an extent, either allocated or freed within the
* checkpoint.
*/
struct __wt_ext {
wt_off_t off; /* Extent's file offset */
wt_off_t size; /* Extent's Size */
uint8_t depth; /* Skip list depth */
/*
* Variable-length array, sized by the number of skiplist elements.
* The first depth array entries are the address skiplist elements,
* the second depth array entries are the size skiplist.
*/
WT_EXT *next[0]; /* Offset, size skiplists */
};
/*
* WT_SIZE --
* Encapsulation of a block size skiplist entry.
*/
struct __wt_size {
wt_off_t size; /* Size */
uint8_t depth; /* Skip list depth */
WT_EXT *off[WT_SKIP_MAXDEPTH]; /* Per-size offset skiplist */
/*
* We don't use a variable-length array for the size skiplist, we want
* to be able to use any cached WT_SIZE structure as the head of a list,
* and we don't know the related WT_EXT structure's depth.
*/
WT_SIZE *next[WT_SKIP_MAXDEPTH]; /* Size skiplist */
};
/*
* WT_EXT_FOREACH --
* Walk a block manager skiplist.
* WT_EXT_FOREACH_OFF --
* Walk a block manager skiplist where the WT_EXT.next entries are offset
* by the depth.
*/
#define WT_EXT_FOREACH(skip, head) \
for ((skip) = (head)[0]; \
(skip) != NULL; (skip) = (skip)->next[0])
#define WT_EXT_FOREACH_OFF(skip, head) \
for ((skip) = (head)[0]; \
(skip) != NULL; (skip) = (skip)->next[(skip)->depth])
/*
* Checkpoint cookie: carries a version number as I don't want to rev the schema
* file version should the default block manager checkpoint format change.
*
* Version #1 checkpoint cookie format:
* [1] [root addr] [alloc addr] [avail addr] [discard addr]
* [file size] [checkpoint size] [write generation]
*/
#define WT_BM_CHECKPOINT_VERSION 1 /* Checkpoint format version */
#define WT_BLOCK_EXTLIST_MAGIC 71002 /* Identify a list */
struct __wt_block_ckpt {
uint8_t version; /* Version */
wt_off_t root_offset; /* The root */
uint32_t root_cksum, root_size;
WT_EXTLIST alloc; /* Extents allocated */
WT_EXTLIST avail; /* Extents available */
WT_EXTLIST discard; /* Extents discarded */
wt_off_t file_size; /* Checkpoint file size */
uint64_t ckpt_size; /* Checkpoint byte count */
WT_EXTLIST ckpt_avail; /* Checkpoint free'd extents */
/*
* Checkpoint archive: the block manager may potentially free a lot of
* memory from the allocation and discard extent lists when checkpoint
* completes. Put it off until the checkpoint resolves, that lets the
* upper btree layer continue eviction sooner.
*/
WT_EXTLIST ckpt_alloc; /* Checkpoint archive */
WT_EXTLIST ckpt_discard; /* Checkpoint archive */
};
/*
* WT_BM --
* Block manager handle, references a single checkpoint in a file.
*/
struct __wt_bm {
/* Methods */
int (*addr_string)
(WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, const uint8_t *, size_t);
int (*addr_valid)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
u_int (*block_header)(WT_BM *);
int (*checkpoint)
(WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, WT_CKPT *, bool);
int (*checkpoint_load)(WT_BM *, WT_SESSION_IMPL *,
const uint8_t *, size_t, uint8_t *, size_t *, bool);
int (*checkpoint_resolve)(WT_BM *, WT_SESSION_IMPL *);
int (*checkpoint_unload)(WT_BM *, WT_SESSION_IMPL *);
int (*close)(WT_BM *, WT_SESSION_IMPL *);
int (*compact_end)(WT_BM *, WT_SESSION_IMPL *);
int (*compact_page_skip)
(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t, int *);
int (*compact_skip)(WT_BM *, WT_SESSION_IMPL *, int *);
int (*compact_start)(WT_BM *, WT_SESSION_IMPL *);
int (*free)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
int (*preload)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
int (*read)
(WT_BM *, WT_SESSION_IMPL *, WT_ITEM *, const uint8_t *, size_t);
int (*salvage_end)(WT_BM *, WT_SESSION_IMPL *);
int (*salvage_next)
(WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t *, bool *);
int (*salvage_start)(WT_BM *, WT_SESSION_IMPL *);
int (*salvage_valid)
(WT_BM *, WT_SESSION_IMPL *, uint8_t *, size_t, bool);
int (*stat)(WT_BM *, WT_SESSION_IMPL *, WT_DSRC_STATS *stats);
int (*sync)(WT_BM *, WT_SESSION_IMPL *, bool);
int (*verify_addr)(WT_BM *, WT_SESSION_IMPL *, const uint8_t *, size_t);
int (*verify_end)(WT_BM *, WT_SESSION_IMPL *);
int (*verify_start)
(WT_BM *, WT_SESSION_IMPL *, WT_CKPT *, const char *[]);
int (*write) (WT_BM *,
WT_SESSION_IMPL *, WT_ITEM *, uint8_t *, size_t *, bool);
int (*write_size)(WT_BM *, WT_SESSION_IMPL *, size_t *);
WT_BLOCK *block; /* Underlying file */
void *map; /* Mapped region */
size_t maplen;
void *mappingcookie;
/*
* There's only a single block manager handle that can be written, all
* others are checkpoints.
*/
bool is_live; /* The live system */
};
/*
* WT_BLOCK --
* Block manager handle, references a single file.
*/
struct __wt_block {
const char *name; /* Name */
uint64_t name_hash; /* Hash of name */
/* A list of block manager handles, sharing a file descriptor. */
uint32_t ref; /* References */
WT_FH *fh; /* Backing file handle */
TAILQ_ENTRY(__wt_block) q; /* Linked list of handles */
TAILQ_ENTRY(__wt_block) hashq; /* Hashed list of handles */
/* Configuration information, set when the file is opened. */
uint32_t allocfirst; /* Allocation is first-fit */
uint32_t allocsize; /* Allocation size */
size_t os_cache; /* System buffer cache flush max */
size_t os_cache_max;
size_t os_cache_dirty; /* System buffer cache write max */
size_t os_cache_dirty_max;
u_int block_header; /* Header length */
/*
* There is only a single checkpoint in a file that can be written. The
* information could logically live in the WT_BM structure, but then we
* would be re-creating it every time we opened a new checkpoint and I'd
* rather not do that. So, it's stored here, only accessed by one WT_BM
* handle.
*/
WT_SPINLOCK live_lock; /* Live checkpoint lock */
WT_BLOCK_CKPT live; /* Live checkpoint */
int ckpt_inprogress;/* Live checkpoint in progress */
/* Compaction support */
int compact_pct_tenths; /* Percent to compact */
/* Salvage support */
wt_off_t slvg_off; /* Salvage file offset */
/* Verification support */
int verify; /* If performing verification */
int verify_strict; /* Fail hard on any error */
wt_off_t verify_size; /* Checkpoint's file size */
WT_EXTLIST verify_alloc; /* Verification allocation list */
uint64_t frags; /* Maximum frags in the file */
uint8_t *fragfile; /* Per-file frag tracking list */
uint8_t *fragckpt; /* Per-checkpoint frag tracking list */
};
/*
* WT_BLOCK_DESC --
* The file's description.
*/
struct __wt_block_desc {
#define WT_BLOCK_MAGIC 120897
uint32_t magic; /* 00-03: Magic number */
#define WT_BLOCK_MAJOR_VERSION 1
uint16_t majorv; /* 04-05: Major version */
#define WT_BLOCK_MINOR_VERSION 0
uint16_t minorv; /* 06-07: Minor version */
uint32_t cksum; /* 08-11: Description block checksum */
uint32_t unused; /* 12-15: Padding */
};
/*
* WT_BLOCK_DESC_SIZE is the expected structure size -- we verify the build to
* ensure the compiler hasn't inserted padding (padding won't cause failure,
* we reserve the first allocation-size block of the file for this information,
* but it would be worth investigation, regardless).
*/
#define WT_BLOCK_DESC_SIZE 16
/*
* WT_BLOCK_HEADER --
* Blocks have a common header, a WT_PAGE_HEADER structure followed by a
* block-manager specific structure: WT_BLOCK_HEADER is WiredTiger's default.
*/
struct __wt_block_header {
/*
* We write the page size in the on-disk page header because it makes
* salvage easier. (If we don't know the expected page length, we'd
* have to read increasingly larger chunks from the file until we find
* one that checksums, and that's going to be harsh given WiredTiger's
* potentially large page sizes.)
*/
uint32_t disk_size; /* 00-03: on-disk page size */
/*
* Page checksums are stored in two places. First, the page checksum
* is written within the internal page that references it as part of
* the address cookie. This is done to improve the chances of detecting
* not only disk corruption but other bugs (for example, overwriting a
* page with another valid page image). Second, a page's checksum is
* stored in the disk header. This is for salvage, so salvage knows it
* has found a page that may be useful.
*/
uint32_t cksum; /* 04-07: checksum */
#define WT_BLOCK_DATA_CKSUM 0x01 /* Block data is part of the checksum */
uint8_t flags; /* 08: flags */
/*
* End the structure with 3 bytes of padding: it wastes space, but it
* leaves the structure 32-bit aligned and having a few bytes to play
* with in the future can't hurt.
*/
uint8_t unused[3]; /* 09-11: unused padding */
};
/*
* WT_BLOCK_HEADER_SIZE is the number of bytes we allocate for the structure: if
* the compiler inserts padding it will break the world.
*/
#define WT_BLOCK_HEADER_SIZE 12
/*
* WT_BLOCK_HEADER_BYTE
* WT_BLOCK_HEADER_BYTE_SIZE --
* The first usable data byte on the block (past the combined headers).
*/
#define WT_BLOCK_HEADER_BYTE_SIZE \
(WT_PAGE_HEADER_SIZE + WT_BLOCK_HEADER_SIZE)
#define WT_BLOCK_HEADER_BYTE(dsk) \
((void *)((uint8_t *)(dsk) + WT_BLOCK_HEADER_BYTE_SIZE))
/*
* Don't compress the block's WT_PAGE_HEADER and WT_BLOCK_HEADER structures.
* We need the WT_PAGE_HEADER in-memory size, and the WT_BLOCK_HEADER checksum
* and on-disk size to be immediately available without decompression. We use
* the on-disk size and checksum during salvage to figure out where the blocks
* are, and the in-memory size tells us how large a buffer we need to decompress
* the block. We could skip less than 64B, but a 64B boundary may offer better
* alignment for the underlying compression engine, and skipping 64B won't make
* a difference in terms of compression efficiency.
*/
#define WT_BLOCK_COMPRESS_SKIP 64
|