1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
|
/*-
* Copyright (c) 2014-2016 MongoDB, Inc.
* Copyright (c) 2008-2014 WiredTiger, Inc.
* All rights reserved.
*
* See the file LICENSE for redistribution information.
*/
/*
 * File names used by the logging subsystem, as string literals.
 */
#define WT_LOG_FILENAME "WiredTigerLog"     /* Name for log files */
#define WT_LOG_PREPNAME "WiredTigerPreplog" /* Name for pre-allocated log files */
#define WT_LOG_TMPNAME "WiredTigerTmplog"   /* Name for temporary log files */
/*
 * Logging subsystem declarations.
 *
 * NOTE(review): WT_LOG_ALIGN is presumably an alignment in bytes for log
 * data; its users are not visible here -- confirm before relying on that.
 */
#define WT_LOG_ALIGN 128
/*
 * WT_INIT_LSN --
 *     Set the LSN pointed to by l to file 1, offset 0 (the value that
 * WT_IS_INIT_LSN tests for). The argument is expanded twice, so it must not
 * have side effects.
 */
#define WT_INIT_LSN(l)   \
    do {                 \
        (l)->file = 1;   \
        (l)->offset = 0; \
    } while (0)
/*
 * WT_MAX_LSN --
 *     Set the LSN pointed to by l to the largest representable value: file
 * UINT32_MAX, offset INT64_MAX. The argument is expanded twice, so it must
 * not have side effects.
 */
#define WT_MAX_LSN(l)            \
    do {                         \
        (l)->file = UINT32_MAX;  \
        (l)->offset = INT64_MAX; \
    } while (0)
/*
 * WT_ZERO_LSN --
 *     Clear the LSN pointed to by l: file 0, offset 0. The argument is
 * expanded twice, so it must not have side effects.
 */
#define WT_ZERO_LSN(l)   \
    do {                 \
        (l)->file = 0;   \
        (l)->offset = 0; \
    } while (0)
/*
 * WT_IS_INIT_LSN --
 *     Evaluate to non-zero when the LSN pointed to by l is file 1, offset 0,
 * the value WT_INIT_LSN stores.
 */
#define WT_IS_INIT_LSN(l) ((l)->file == 1 && (l)->offset == 0)
/*
 * WT_IS_MAX_LSN --
 *     Evaluate to non-zero when the LSN pointed to by l is file UINT32_MAX,
 * offset INT64_MAX, the value WT_MAX_LSN stores.
 */
#define WT_IS_MAX_LSN(l) ((l)->file == UINT32_MAX && (l)->offset == INT64_MAX)
/*
 * Key and value format strings. Both macros must be updated together if the
 * content of __wt_lsn ever changes.
 *
 * The value carries, in order:
 *     txnid, record type, operation type, file id, operation key,
 *     operation value
 */
#define WT_LOGC_KEY_FORMAT WT_UNCHECKED_STRING(IqI)
#define WT_LOGC_VALUE_FORMAT WT_UNCHECKED_STRING(qIIIuu)
/*
 * WT_LOG_SKIP_HEADER --
 *     Given a pointer to the start of a WT_LOG_RECORD, evaluate to a const
 * byte pointer to its record member, that is, the first byte after the
 * header fields.
 */
#define WT_LOG_SKIP_HEADER(data) \
    ((const uint8_t *)(data) + offsetof(WT_LOG_RECORD, record))
/*
 * WT_LOG_REC_SIZE --
 *     Given a size that includes the WT_LOG_RECORD header, evaluate to the
 * size with the header (everything before the record member) subtracted.
 * No check is made that size is at least as large as the header.
 */
#define WT_LOG_REC_SIZE(size) \
    ((size) - offsetof(WT_LOG_RECORD, record))
/*
* Possible values for the consolidation array slot states:
*
* WT_LOG_SLOT_CLOSE - slot is in use but closed to new joins.
* WT_LOG_SLOT_FREE - slot is available for allocation.
* WT_LOG_SLOT_WRITTEN - slot is written and should be processed by worker.
*
* The slot state must be volatile: threads loop checking the state and can't
* cache the first value they see.
*
* The slot state is divided into two 32 bit sizes. One half is the
* amount joined and the other is the amount released. Since we use
* a few special states, reserve the top few bits for state. That makes
* the maximum size less than 32 bits for both joined and released.
*/
/*
 * The high bit is reserved for the special states. If the high bit is
 * set (WT_LOG_SLOT_RESERVED) then we are guaranteed to be in a special state.
 *
 * Both values below are negative, so as 64-bit two's-complement slot states
 * they have the high bit set. They are parenthesized so the unary minus stays
 * bound to its operand wherever the macro is expanded.
 */
#define WT_LOG_SLOT_FREE (-1)    /* Not in use */
#define WT_LOG_SLOT_WRITTEN (-2) /* Slot data written, not processed */
/*
 * Slot buffer sizing.
 *
 * The full buffer size is allocated, but a slot switch is triggered once the
 * maximum size, half of the buffer, is crossed. A record larger than the
 * buffer maximum triggers a slot switch and is written unbuffered. The larger
 * buffer provides overflow space so the switch can happen after the threshold
 * has been crossed.
 *
 * WT_LOG_SLOT_BUF_MAX reads slot_buf_size through a variable named "log" that
 * must be in scope wherever the macro is expanded.
 */
#define WT_LOG_SLOT_BUF_SIZE (256 * 1024) /* Must be power of 2 */
#define WT_LOG_SLOT_BUF_MAX ((uint32_t)log->slot_buf_size / 2)
/* Twice the default buffer size, so also a power of 2 (a single bit). */
#define WT_LOG_SLOT_UNBUFFERED (WT_LOG_SLOT_BUF_SIZE << 1)
/*
 * Special slot state bits.
 *
 * If new slot states are added, adjust WT_LOG_SLOT_BITS and
 * WT_LOG_SLOT_MASK_OFF accordingly for how much of the top 32 bits is in
 * use. Each additional state bit halves the maximum size a slot can hold
 * unbuffered. If a record is larger than the maximum that can be accounted
 * for in the slot state, the fallback is a direct write.
 */
#define WT_LOG_SLOT_BITS 2                          /* Number of state bits */
#define WT_LOG_SLOT_MAXBITS (32 - WT_LOG_SLOT_BITS) /* Bits left for a size */
/*
 * Bit 62 and bit 63 of the 64-bit slot state. Note 0x8000000000000000LL does
 * not fit in a signed long long, so that constant has an unsigned type.
 */
#define WT_LOG_SLOT_CLOSE 0x4000000000000000LL    /* Force slot close */
#define WT_LOG_SLOT_RESERVED 0x8000000000000000LL /* Reserved states */
/*
 * WT_LOG_SLOT_UNBUFFERED_ISSET --
 *     Check if the unbuffered flag is set in the joined portion of the slot
 * state. The joined portion is the upper 32 bits, so the flag is shifted up
 * by 32 before it is tested. The result is the masked bit, not 0 or 1.
 */
#define WT_LOG_SLOT_UNBUFFERED_ISSET(state) \
    ((state) & ((int64_t)WT_LOG_SLOT_UNBUFFERED << 32))
/* Every bit of the slot state except the top two. */
#define WT_LOG_SLOT_MASK_OFF 0x3fffffffffffffffLL
/* The complement: only the top two bits. */
#define WT_LOG_SLOT_MASK_ON ~(WT_LOG_SLOT_MASK_OFF)
/* WT_LOG_SLOT_MASK_OFF moved down to the low 32 bits: 0x3fffffff. */
#define WT_LOG_SLOT_JOIN_MASK (WT_LOG_SLOT_MASK_OFF >> 32)
/*
 * These macros manipulate the slot state and its component parts. The joined
 * amount lives in the upper 32 bits (below the two flag bits) and the
 * released amount in the lower 32 bits.
 */
/* The top two (flag) bits of the state. */
#define WT_LOG_SLOT_FLAGS(state) ((state) & WT_LOG_SLOT_MASK_ON)
/* The joined amount: flag bits cleared, then the upper half shifted down. */
#define WT_LOG_SLOT_JOINED(state) (((state) & WT_LOG_SLOT_MASK_OFF) >> 32)
/* The joined amount with the unbuffered bit and anything above it removed. */
#define WT_LOG_SLOT_JOINED_BUFFERED(state) \
    (WT_LOG_SLOT_JOINED(state) & (WT_LOG_SLOT_UNBUFFERED - 1))
/* Compose a state from a joined amount j, a released amount r and flags s. */
#define WT_LOG_SLOT_JOIN_REL(j, r, s) (((j) << 32) + (r) + (s))
/* The released amount: the low 32 bits, sign-extended to 64 bits. */
#define WT_LOG_SLOT_RELEASED(state) ((int64_t)(int32_t)(state))
/* The released amount with the unbuffered bit and anything above it removed. */
#define WT_LOG_SLOT_RELEASED_BUFFERED(state)           \
    ((int64_t)((int32_t)WT_LOG_SLOT_RELEASED(state) & \
      (WT_LOG_SLOT_UNBUFFERED - 1)))
/*
 * Slot state predicates. Each takes a 64-bit slot state; the argument is
 * expanded more than once, so it must not have side effects.
 */
/*
 * Slot is in use: the joined portion is not all ones. The special states -1
 * and -2 both have an all-ones joined portion, so they are not active.
 */
#define WT_LOG_SLOT_ACTIVE(state) \
    (WT_LOG_SLOT_JOINED(state) != WT_LOG_SLOT_JOIN_MASK)
/*
 * Slot is in use, but closed to new joins: the close bit is set and the
 * reserved bit is not. The argument is parenthesized before the cast so the
 * cast applies to the whole expression passed in.
 */
#define WT_LOG_SLOT_CLOSED(state)                             \
    (WT_LOG_SLOT_ACTIVE(state) &&                             \
      (FLD64_ISSET((uint64_t)(state), WT_LOG_SLOT_CLOSE) &&   \
        !FLD64_ISSET((uint64_t)(state), WT_LOG_SLOT_RESERVED)))
/* The released amount does not (yet) equal the joined amount. */
#define WT_LOG_SLOT_INPROGRESS(state) \
    (WT_LOG_SLOT_RELEASED(state) != WT_LOG_SLOT_JOINED(state))
/*
 * Slot is in use, all data copied into buffer: the slot is closed and the
 * released amount equals the joined amount.
 */
#define WT_LOG_SLOT_DONE(state) \
    (WT_LOG_SLOT_CLOSED(state) && !WT_LOG_SLOT_INPROGRESS(state))
/*
 * Slot is in use, more threads may join this slot: active, no unbuffered
 * write joined, not closed, and the joined amount is below the buffer
 * maximum. WT_LOG_SLOT_BUF_MAX needs a variable named "log" in scope.
 */
#define WT_LOG_SLOT_OPEN(state)                             \
    (WT_LOG_SLOT_ACTIVE(state) &&                           \
      !WT_LOG_SLOT_UNBUFFERED_ISSET(state) &&               \
      !FLD64_ISSET((uint64_t)(state), WT_LOG_SLOT_CLOSE) && \
      WT_LOG_SLOT_JOINED(state) < WT_LOG_SLOT_BUF_MAX)
/*
 * __wt_logslot --
 *     A log slot: the slot state together with the offsets, LSNs, file handle
 * and buffer kept for it. The structure is declared with
 * WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT).
 */
struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_logslot {
    volatile int64_t slot_state; /* Slot state; volatile, threads poll it */
    int64_t slot_unbuffered;     /* Unbuffered data in this slot */
    int32_t slot_error;          /* Error value */
    wt_off_t slot_start_offset;  /* Starting file offset */
    wt_off_t slot_last_offset;   /* Last record offset */
    WT_LSN slot_release_lsn;     /* Slot release LSN */
    WT_LSN slot_start_lsn;       /* Slot starting LSN */
    WT_LSN slot_end_lsn;         /* Slot ending LSN */
    WT_FH *slot_fh;              /* File handle for this group */
    WT_ITEM slot_buf;            /* Buffer for grouped writes */

/* Bit values for the flags field. */
#define WT_SLOT_CLOSEFH 0x01  /* Close old fh on release */
#define WT_SLOT_FLUSH 0x02    /* Wait for write */
#define WT_SLOT_SYNC 0x04     /* Needs sync on release */
#define WT_SLOT_SYNC_DIR 0x08 /* Directory sync on release */
    uint32_t flags;           /* Flags */
};
/*
 * A flags value with no bits set.
 * NOTE(review): presumably the initial value for a slot's flags field --
 * confirm against the code that initializes slots.
 */
#define WT_SLOT_INIT_FLAGS 0
/*
 * WT_WITH_SLOT_LOCK --
 *     Assert that the session does not already have WT_SESSION_LOCKED_SLOT
 * set, then hand op to WT_WITH_LOCK together with the log's log_slot_lock and
 * the WT_SESSION_LOCKED_SLOT flag; ret is passed through to WT_WITH_LOCK.
 * NOTE(review): WT_WITH_LOCK is defined elsewhere; presumably it runs op with
 * the lock held -- confirm there.
 *
 * The log argument is parenthesized so that any pointer expression can be
 * passed, not only a plain identifier.
 */
#define WT_WITH_SLOT_LOCK(session, log, ret, op)                       \
    do {                                                               \
        WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SLOT)); \
        WT_WITH_LOCK(session, ret,                                     \
          &(log)->log_slot_lock, WT_SESSION_LOCKED_SLOT, op);          \
    } while (0)
/*
 * __wt_myslot --
 *     One thread's view of the slot it is using: the slot itself, the
 * thread's offsets and per-thread flags.
 */
struct __wt_myslot {
    WT_LOGSLOT *slot;    /* Slot I'm using */
    wt_off_t end_offset; /* My end offset in buffer */
    wt_off_t offset;     /* Slot buffer offset */

/* Bit values for the flags field. */
#define WT_MYSLOT_CLOSE 0x01      /* This thread is closing the slot */
#define WT_MYSLOT_UNBUFFERED 0x02 /* Write directly */
    uint32_t flags;               /* Flags */
};
/*
 * WT_LOG_FIRST_RECORD --
 *     Expands to the allocsize field of a variable named "log", which must be
 * in scope at the point of use. The expansion is parenthesized so it behaves
 * as a single operand in any surrounding expression.
 * NOTE(review): by its name this is presumably the offset of the first record
 * in a log file -- confirm against its users.
 */
#define WT_LOG_FIRST_RECORD (log->allocsize)
/*
 * __wt_log --
 *     State of the logging subsystem: log file information, system LSNs,
 * synchronization resources and the slot consolidation pool.
 */
struct __wt_log {
    uint32_t allocsize;   /* Allocation alignment size */
    wt_off_t log_written; /* Amount of log written this period */

    /*
     * Log file information
     */
    uint32_t fileid;      /* Current log file number */
    uint32_t prep_fileid; /* Pre-allocated file number */
    uint32_t tmp_fileid;  /* Temporary file number */
    uint32_t prep_missed; /* Pre-allocated file misses */
    WT_FH *log_fh;        /* Logging file handle */
    WT_FH *log_dir_fh;    /* Log directory file handle */
    WT_FH *log_close_fh;  /* Logging file handle to close */
    WT_LSN log_close_lsn; /* LSN needed to close */

    /*
     * System LSNs
     */
    WT_LSN alloc_lsn;       /* Next LSN for allocation */
    WT_LSN bg_sync_lsn;     /* Latest background sync LSN */
    WT_LSN ckpt_lsn;        /* Last checkpoint LSN */
    WT_LSN first_lsn;       /* First LSN */
    WT_LSN sync_dir_lsn;    /* LSN of the last directory sync */
    WT_LSN sync_lsn;        /* LSN of the last sync */
    WT_LSN trunc_lsn;       /* End LSN for recovery truncation */
    WT_LSN write_lsn;       /* End of last LSN written */
    WT_LSN write_start_lsn; /* Beginning of last LSN written */

    /*
     * Synchronization resources
     */
    WT_SPINLOCK log_lock;          /* Locked: Logging fields */
    WT_SPINLOCK log_slot_lock;     /* Locked: Consolidation array */
    WT_SPINLOCK log_sync_lock;     /* Locked: Single-thread fsync */
    WT_SPINLOCK log_writelsn_lock; /* Locked: write LSN */

    WT_RWLOCK *log_archive_lock; /* Archive and log cursors */

    /* Notify any waiting threads when sync_lsn is updated. */
    WT_CONDVAR *log_sync_cond;
    /* Notify any waiting threads when write_lsn is updated. */
    WT_CONDVAR *log_write_cond;

    /*
     * Consolidation array information
     * Our testing shows that the more consolidation we generate the
     * better the performance we see, which equates to an active slot
     * count of one.
     *
     * Note: this can't be an array, we impose cache-line alignment and
     * gcc doesn't support that for arrays.
     * NOTE(review): slot_pool below is declared as an array, so the note
     * above looks stale -- confirm what it was meant to refer to.
     */
#define WT_SLOT_POOL 128
    WT_LOGSLOT *active_slot;            /* Active slot */
    WT_LOGSLOT slot_pool[WT_SLOT_POOL]; /* Pool of all slots */
    size_t slot_buf_size;               /* Buffer size for slots */
#ifdef HAVE_DIAGNOSTIC
    uint64_t write_calls; /* Calls to log_write */
#endif

    uint32_t flags;
};
/*
 * __wt_log_record --
 *     The header that starts every log record, followed directly by the
 * record's data. The byte ranges in the field comments give each field's
 * position in the header; the data begins at byte 16.
 *
 * The trailing member is a C99 flexible array member rather than a
 * zero-length array (a compiler extension). The structure's size and the
 * offset of record are the same either way.
 */
struct __wt_log_record {
    uint32_t len;      /* 00-03: Record length including hdr */
    uint32_t checksum; /* 04-07: Checksum of the record */

/* Bit values for the flags field. */
#define WT_LOG_RECORD_COMPRESSED 0x01 /* Compressed except hdr */
#define WT_LOG_RECORD_ENCRYPTED 0x02  /* Encrypted except hdr */
    uint16_t flags;                   /* 08-09: Flags */
    uint8_t unused[2];                /* 10-11: Padding */
    uint32_t mem_len;                 /* 12-15: Uncompressed len if needed */
    uint8_t record[];                 /* Beginning of actual data */
};
/*
 * WT_LOG_DESC --
 *     The log file's description: magic number, major and minor version, and
 * the log file size. The byte ranges in the field comments give each field's
 * position in the structure.
 */
struct __wt_log_desc {
#define WT_LOG_MAGIC 0x101064
    uint32_t log_magic; /* 00-03: Magic number */
#define WT_LOG_MAJOR_VERSION 1
    uint16_t majorv; /* 04-05: Major version */
#define WT_LOG_MINOR_VERSION 0
    uint16_t minorv;   /* 06-07: Minor version */
    uint64_t log_size; /* 08-15: Log file size */
};
/*
 * Flag bits accepted by __wt_txn_op_printlog.
 */
#define WT_TXN_PRINTLOG_HEX 0x0001 /* Add hex output */
/*
 * WT_LOG_REC_DESC --
 *     A descriptor for a log record type: a format string and a print
 * callback.
 *
 * The callback takes the session, the address of a byte pointer (pp) and an
 * end pointer, and returns an int.
 * NOTE(review): presumably pp is advanced past the bytes consumed and the
 * return value is an error code -- confirm against an implementation.
 */
struct __wt_log_rec_desc {
    const char *fmt;
    int (*print)(WT_SESSION_IMPL *session, uint8_t **pp, uint8_t *end);
};
/*
 * WT_LOG_OP_DESC --
 *     A descriptor for a log operation type: a format string and a print
 * callback.
 *
 * The callback takes the session, the address of a byte pointer (pp) and an
 * end pointer, and returns an int.
 * NOTE(review): presumably pp is advanced past the bytes consumed and the
 * return value is an error code -- confirm against an implementation.
 */
struct __wt_log_op_desc {
    const char *fmt;
    int (*print)(WT_SESSION_IMPL *session, uint8_t **pp, uint8_t *end);
};
|