summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/include/log.h
blob: a41d0f667986ceed728ccf167ecd8d0b542b6685 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
/*-
 * Copyright (c) 2014-2019 MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

/*
 * Flags with the WT_LOGSCAN_ prefix.  Each value is a distinct single bit, so
 * any combination of them can be carried in one mask.
 */
/* AUTOMATIC FLAG VALUE GENERATION START */
#define	WT_LOGSCAN_FIRST		0x01u
#define	WT_LOGSCAN_FROM_CKP		0x02u
#define	WT_LOGSCAN_ONE			0x04u
#define	WT_LOGSCAN_RECOVER		0x08u
#define	WT_LOGSCAN_RECOVER_METADATA	0x10u
/* AUTOMATIC FLAG VALUE GENERATION STOP */

/*
 * Flags with the WT_LOG_ prefix.  Each value is a distinct single bit, so
 * any combination of them can be carried in one mask.
 */
/* AUTOMATIC FLAG VALUE GENERATION START */
#define	WT_LOG_BACKGROUND	0x01u
#define	WT_LOG_DSYNC		0x02u
#define	WT_LOG_FLUSH		0x04u
#define	WT_LOG_FSYNC		0x08u
#define	WT_LOG_SYNC_ENABLED	0x10u
/* AUTOMATIC FLAG VALUE GENERATION STOP */

/*
 * WT_LOGOP_IGNORE is the high bit (bit 31) of a 32-bit value.
 * WT_LOGOP_IS_IGNORED evaluates to the masked bit itself: non-zero when the
 * bit is set in val and zero otherwise (the result is not normalized to 0/1).
 */
#define	WT_LOGOP_IGNORE	0x80000000
#define	WT_LOGOP_IS_IGNORED(val)	((val) & WT_LOGOP_IGNORE)

/*
 * WT_LSN --
 *	A log sequence number, representing a position in the transaction log.
 *
 *	The union overlays a (file, offset) pair of 32-bit values on a single
 *	64-bit value.  The member order of the pair is swapped on big-endian
 *	builds so that, with either byte order, "file" occupies the high-order
 *	32 bits of file_offset and "offset" the low-order 32 bits.  That
 *	matches WT_SET_LSN, which builds file_offset as (file << 32) + offset.
 */
union __wt_lsn {
	struct {
#ifdef	WORDS_BIGENDIAN
		uint32_t file;
		uint32_t offset;
#else
		uint32_t offset;
		uint32_t file;
#endif
	} l;
	uint64_t file_offset;
};

#define	WT_LOG_FILENAME	"WiredTigerLog"		/* Log file name */
#define	WT_LOG_PREPNAME	"WiredTigerPreplog"	/* Log pre-allocated name */
#define	WT_LOG_TMPNAME	"WiredTigerTmplog"	/* Log temporary name */

/* Logging subsystem declarations. */
/* NOTE(review): presumably an alignment in bytes for log records -- confirm. */
#define	WT_LOG_ALIGN			128

/*
 * WT_SET_LSN --
 *	Set the two components of the LSN with a single assignment to the
 *	64-bit file_offset member: the file number f goes into the high-order
 *	32 bits and the offset o is added into the low-order 32 bits.
 *
 *	The offset is added, not masked, so callers must pass a non-negative
 *	value that fits in 32 bits; anything else spills into the file half.
 *
 *	The expansion is fully parenthesized so the macro can be used both as
 *	a statement and as an operand of a larger expression; its value is the
 *	64-bit value that was stored.
 */
#define	WT_SET_LSN(l, f, o)						\
	((l)->file_offset = (((uint64_t)(f) << 32) + (o)))

/* Set the LSN to the initial value: file 1, offset 0. */
#define	WT_INIT_LSN(l)	WT_SET_LSN((l), 1, 0)

/*
 * Set the LSN to the maximum value: file UINT32_MAX, offset INT32_MAX (note
 * the offset is the signed maximum, not UINT32_MAX).
 */
#define	WT_MAX_LSN(l)	WT_SET_LSN((l), UINT32_MAX, INT32_MAX)

/* Set the LSN to zero: file 0, offset 0. */
#define	WT_ZERO_LSN(l)	WT_SET_LSN((l), 0, 0)

/*
 * Test for initial LSN.  We only need to shift the 1 for comparison: the
 * initial LSN is file 1, offset 0, so the whole 64-bit value is 1 << 32.
 */
#define	WT_IS_INIT_LSN(l)	((l)->file_offset == ((uint64_t)1 << 32))
/*
 * Test for maximum LSN.  The original tested only for an offset of INT32_MAX,
 * but if we read an LSN written by an older release we may see UINT32_MAX,
 * so either offset is accepted when the file number is UINT32_MAX.
 */
#define	WT_IS_MAX_LSN(lsn)						\
	((lsn)->l.file == UINT32_MAX &&					\
	 ((lsn)->l.offset == INT32_MAX || (lsn)->l.offset == UINT32_MAX))
/*
 * Test for zero LSN.
 */
#define	WT_IS_ZERO_LSN(l)	((l)->file_offset == 0)

/*
 * Macro to print an LSN as "<msg> LSN: [file][offset]".  The expansion calls
 * __wt_msg with a variable named "session", so it can only be used where
 * such a variable is in scope.
 */
#define	WT_LSN_MSG(lsn, msg)						\
	__wt_msg(session, "%s LSN: [%" PRIu32 "][%" PRIu32 "]",		\
	    (msg), (lsn)->l.file, (lsn)->l.offset)

/*
 * Both of the macros below need to change if the content of __wt_lsn
 * ever changes.  The value format describes the following fields, in order:
 * txnid, record type, operation type, file id, operation key, operation value
 */
#define	WT_LOGC_KEY_FORMAT	WT_UNCHECKED_STRING(III)
#define	WT_LOGC_VALUE_FORMAT	WT_UNCHECKED_STRING(qIIIuu)

/*
 * Size range for the log files.
 */
#define	WT_LOG_FILE_MAX ((int64_t)2 * WT_GIGABYTE)
#define	WT_LOG_FILE_MIN (100 * WT_KILOBYTE)

/*
 * WT_LOG_SKIP_HEADER --
 *	Given a pointer to the start of a log record, yield a const uint8_t
 *	pointer to the byte at the offset of the "record" member of
 *	WT_LOG_RECORD, that is, just past the fixed header fields.
 * WT_LOG_REC_SIZE --
 *	Given a size that includes the header, yield the size without it by
 *	subtracting the offset of the "record" member of WT_LOG_RECORD.
 */
#define	WT_LOG_SKIP_HEADER(data)					\
    ((const uint8_t *)(data) + offsetof(WT_LOG_RECORD, record))
#define	WT_LOG_REC_SIZE(size)						\
    ((size) - offsetof(WT_LOG_RECORD, record))

/*
 * We allocate the buffer size, but trigger a slot switch when we cross
 * the maximum size of half the buffer.  If a record is more than the buffer
 * maximum then we trigger a slot switch and write that record unbuffered.
 * We use a larger buffer to provide overflow space so that we can switch
 * once we cross the threshold.
 *
 * WT_LOG_SLOT_BUF_MAX is half of log->slot_buf_size, so it can only be used
 * where a variable named "log" is in scope.  WT_LOG_SLOT_UNBUFFERED is twice
 * WT_LOG_SLOT_BUF_SIZE (a single bit, since the buffer size is a power of 2).
 */
#define	WT_LOG_SLOT_BUF_SIZE		(256 * 1024)	/* Must be power of 2 */
#define	WT_LOG_SLOT_BUF_MAX		((uint32_t)log->slot_buf_size / 2)
#define	WT_LOG_SLOT_UNBUFFERED		(WT_LOG_SLOT_BUF_SIZE << 1)

/*
 * Possible values for the consolidation array slot states:
 *
 * WT_LOG_SLOT_CLOSE - slot is in use but closed to new joins.
 * WT_LOG_SLOT_FREE - slot is available for allocation.
 * WT_LOG_SLOT_WRITTEN - slot is written and should be processed by worker.
 *
 * The slot state must be volatile: threads loop checking the state and can't
 * cache the first value they see.
 *
 * The slot state is divided into two 32 bit sizes.  One half is the
 * amount joined and the other is the amount released.  Since we use
 * a few special states, reserve the top few bits for state.  That makes
 * the maximum size less than 32 bits for both joined and released.
 */
/*
 * XXX
 * The log slot bits are signed and should be rewritten as unsigned. For now,
 * give the logging subsystem its own flags macro.
 *
 * FLD_LOG_SLOT_ISSET casts the mask to uint64_t before the AND and evaluates
 * to 1 if any masked bit is set in field, 0 otherwise.
 */
#define	FLD_LOG_SLOT_ISSET(field, mask)	(((field) & (uint64_t)(mask)) != 0)

/*
 * The high bit is reserved for the special states.  If the high bit is
 * set (WT_LOG_SLOT_RESERVED) then we are guaranteed to be in a special state.
 * Both special values below are negative, so as 64-bit values they have the
 * high bit set.
 */
#define	WT_LOG_SLOT_FREE	(-1)	/* Not in use */
#define	WT_LOG_SLOT_WRITTEN	(-2)	/* Slot data written, not processed */

/*
 * If new slot states are added, adjust WT_LOG_SLOT_BITS and
 * WT_LOG_SLOT_MASK_OFF accordingly for how much of the top 32
 * bits we are using.  More slot states here will reduce the maximum
 * size that a slot can hold unbuffered by half.  If a record is
 * larger than the maximum we can account for in the slot state we fall
 * back to direct writes.
 *
 * With two state bits, WT_LOG_SLOT_CLOSE is bit 62 and WT_LOG_SLOT_RESERVED
 * is bit 63 of the 64-bit slot state.
 */
#define	WT_LOG_SLOT_BITS	2
#define	WT_LOG_SLOT_MAXBITS	(32 - WT_LOG_SLOT_BITS)
#define	WT_LOG_SLOT_CLOSE	0x4000000000000000LL	/* Force slot close */
#define	WT_LOG_SLOT_RESERVED	0x8000000000000000LL	/* Reserved states */

/*
 * Check if the unbuffered flag is set in the joined portion of
 * the slot state.  The flag is the WT_LOG_SLOT_UNBUFFERED bit shifted into
 * the high-order 32 bits; the result is the masked bit, not a 0/1 value.
 */
#define	WT_LOG_SLOT_UNBUFFERED_ISSET(state)				\
    ((state) & ((int64_t)WT_LOG_SLOT_UNBUFFERED << 32))

/*
 * WT_LOG_SLOT_MASK_OFF selects everything except the top two state bits,
 * WT_LOG_SLOT_MASK_ON selects only those two bits, and WT_LOG_SLOT_JOIN_MASK
 * is the largest value the joined portion can hold once the state bits are
 * masked off (0x3fffffff).
 */
#define	WT_LOG_SLOT_MASK_OFF	0x3fffffffffffffffLL
#define	WT_LOG_SLOT_MASK_ON	~(WT_LOG_SLOT_MASK_OFF)
#define	WT_LOG_SLOT_JOIN_MASK	(WT_LOG_SLOT_MASK_OFF >> 32)

/*
 * These macros manipulate the slot state and its component parts.
 *
 * WT_LOG_SLOT_FLAGS --
 *	The top two state bits of the slot state.
 * WT_LOG_SLOT_JOINED --
 *	The joined amount: the high-order 32 bits with the state bits removed.
 * WT_LOG_SLOT_JOINED_BUFFERED --
 *	The joined amount with the unbuffered bit and anything above it
 *	masked off.
 * WT_LOG_SLOT_JOIN_REL --
 *	Build a state value from a joined amount j (shifted into the
 *	high-order 32 bits), a released amount r and an additional value s,
 *	all added together.  NOTE(review): s is presumably the state flag
 *	bits -- confirm against callers.
 * WT_LOG_SLOT_RELEASED --
 *	The released amount: the low-order 32 bits, sign-extended to 64 bits.
 * WT_LOG_SLOT_RELEASED_BUFFERED --
 *	The released amount with the unbuffered bit and anything above it
 *	masked off.
 */
#define	WT_LOG_SLOT_FLAGS(state)	((state) & WT_LOG_SLOT_MASK_ON)
#define	WT_LOG_SLOT_JOINED(state)	(((state) & WT_LOG_SLOT_MASK_OFF) >> 32)
#define	WT_LOG_SLOT_JOINED_BUFFERED(state)				\
    (WT_LOG_SLOT_JOINED(state) &					\
    (WT_LOG_SLOT_UNBUFFERED - 1))
#define	WT_LOG_SLOT_JOIN_REL(j, r, s)	(((j) << 32) + (r) + (s))
#define	WT_LOG_SLOT_RELEASED(state)	((int64_t)(int32_t)(state))
#define	WT_LOG_SLOT_RELEASED_BUFFERED(state)				\
    ((int64_t)((int32_t)WT_LOG_SLOT_RELEASED(state) &			\
    (WT_LOG_SLOT_UNBUFFERED - 1)))

/*
 * Slot is in use: the joined portion is not all ones.  Both special values
 * (WT_LOG_SLOT_FREE and WT_LOG_SLOT_WRITTEN) have an all-ones joined portion
 * and so are not active.
 */
#define	WT_LOG_SLOT_ACTIVE(state)					\
    (WT_LOG_SLOT_JOINED(state) != WT_LOG_SLOT_JOIN_MASK)
/* Slot is in use, but closed to new joins */
#define	WT_LOG_SLOT_CLOSED(state)					\
    (WT_LOG_SLOT_ACTIVE(state) &&					\
    (FLD_LOG_SLOT_ISSET((uint64_t)(state), WT_LOG_SLOT_CLOSE) &&	\
    !FLD_LOG_SLOT_ISSET((uint64_t)(state), WT_LOG_SLOT_RESERVED)))
/*
 * Slot still has work in progress: the released amount does not yet equal
 * the joined amount.
 */
#define	WT_LOG_SLOT_INPROGRESS(state)					\
    (WT_LOG_SLOT_RELEASED(state) != WT_LOG_SLOT_JOINED(state))
/*
 * Slot is closed and nothing is in progress: the released amount equals the
 * joined amount.
 */
#define	WT_LOG_SLOT_DONE(state)						\
    (WT_LOG_SLOT_CLOSED(state) &&					\
    !WT_LOG_SLOT_INPROGRESS(state))
/*
 * Slot is in use, more threads may join this slot: it is active, has neither
 * the unbuffered nor the close bit set, and the joined amount is below
 * WT_LOG_SLOT_BUF_MAX (which requires a variable named "log" in scope).
 */
#define	WT_LOG_SLOT_OPEN(state)						\
    (WT_LOG_SLOT_ACTIVE(state) &&					\
    !WT_LOG_SLOT_UNBUFFERED_ISSET(state) &&				\
    !FLD_LOG_SLOT_ISSET((uint64_t)(state), WT_LOG_SLOT_CLOSE) &&	\
    WT_LOG_SLOT_JOINED(state) < WT_LOG_SLOT_BUF_MAX)

/*
 * WT_LOGSLOT --
 *	A single slot of the log consolidation array.  The members are
 *	bracketed by WT_CACHE_LINE_PAD_BEGIN/WT_CACHE_LINE_PAD_END.
 *	NOTE(review): presumably this pads each slot to a cache line so
 *	neighbouring slots do not share one -- confirm against the macro
 *	definitions.
 */
struct __wt_logslot {
	WT_CACHE_LINE_PAD_BEGIN
	/*
	 * The state is declared volatile because threads loop checking it and
	 * must not cache the first value they see.
	 */
	volatile int64_t slot_state;	/* Slot state */
	int64_t	 slot_unbuffered;	/* Unbuffered data in this slot */
	int	 slot_error;		/* Error value */
	wt_off_t slot_start_offset;	/* Starting file offset */
	wt_off_t slot_last_offset;	/* Last record offset */
	WT_LSN	 slot_release_lsn;	/* Slot release LSN */
	WT_LSN	 slot_start_lsn;	/* Slot starting LSN */
	WT_LSN	 slot_end_lsn;		/* Slot ending LSN */
	WT_FH	*slot_fh;		/* File handle for this group */
	WT_ITEM  slot_buf;		/* Buffer for grouped writes */

/* Values for the flags member below; each is a distinct single bit. */
/* AUTOMATIC FLAG VALUE GENERATION START */
#define	WT_SLOT_CLOSEFH		0x01u	/* Close old fh on release */
#define	WT_SLOT_FLUSH		0x02u	/* Wait for write */
#define	WT_SLOT_SYNC		0x04u	/* Needs sync on release */
#define	WT_SLOT_SYNC_DIR	0x08u	/* Directory sync on release */
#define	WT_SLOT_SYNC_DIRTY	0x10u	/* Sync system buffers on release */
/* AUTOMATIC FLAG VALUE GENERATION STOP */
	uint32_t flags;
	WT_CACHE_LINE_PAD_END
};

/* Initial value for a slot's flags: no flags set. */
#define	WT_SLOT_INIT_FLAGS	0

/* Mask of all the slot flags that request some form of sync. */
#define	WT_SLOT_SYNC_FLAGS						\
	(WT_SLOT_SYNC |							\
	 WT_SLOT_SYNC_DIR |						\
	 WT_SLOT_SYNC_DIRTY)

/*
 * WT_WITH_SLOT_LOCK --
 *	Run op under the log's log_slot_lock via WT_WITH_LOCK_WAIT, using
 *	WT_SESSION_LOCKED_SLOT as the session flag.  Asserts first that the
 *	session does not already have WT_SESSION_LOCKED_SLOT set.
 */
#define	WT_WITH_SLOT_LOCK(session, log, op) do {			\
	WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SLOT));	\
	WT_WITH_LOCK_WAIT(session,					\
	    &(log)->log_slot_lock, WT_SESSION_LOCKED_SLOT, op);		\
} while (0)

/*
 * WT_MYSLOT --
 *	One thread's view of the slot it is using: which slot, the range it
 *	was given in that slot's buffer, and per-thread flags.
 */
struct __wt_myslot {
	WT_LOGSLOT	*slot;		/* Slot I'm using */
	wt_off_t	 end_offset;	/* My end offset in buffer */
	wt_off_t	 offset;	/* Slot buffer offset */

/* Values for the flags member below; each is a distinct single bit. */
/* AUTOMATIC FLAG VALUE GENERATION START */
#define	WT_MYSLOT_CLOSE		0x1u	/* This thread is closing the slot */
#define	WT_MYSLOT_NEEDS_RELEASE	0x2u	/* This thread is releasing the slot */
#define	WT_MYSLOT_UNBUFFERED	0x4u	/* Write directly */
/* AUTOMATIC FLAG VALUE GENERATION STOP */
	uint32_t flags;
};

/*
 * WT_LOG_END_HEADER expands to log->allocsize, so it can only be used where
 * a variable named "log" is in scope.
 */
#define	WT_LOG_END_HEADER	log->allocsize

/*
 * WT_LOG --
 *	State for the logging subsystem: the current log files and handles,
 *	the system LSNs, the locks and condition variables that protect them,
 *	and the slot consolidation array.
 */
struct __wt_log {
	uint32_t	allocsize;	/* Allocation alignment size */
	uint32_t	first_record;	/* Offset of first record in file */
	wt_off_t	log_written;	/* Amount of log written this period */
	/*
	 * Log file information
	 */
	uint32_t	 fileid;	/* Current log file number */
	uint32_t	 prep_fileid;	/* Pre-allocated file number */
	uint32_t	 tmp_fileid;	/* Temporary file number */
	uint32_t	 prep_missed;	/* Pre-allocated file misses */
	WT_FH           *log_fh;	/* Logging file handle */
	WT_FH           *log_dir_fh;	/* Log directory file handle */
	WT_FH           *log_close_fh;	/* Logging file handle to close */
	WT_LSN		 log_close_lsn;	/* LSN needed to close */

	uint16_t	 log_version;	/* Version of log file */

	/*
	 * System LSNs
	 */
	WT_LSN		alloc_lsn;	/* Next LSN for allocation */
	WT_LSN		bg_sync_lsn;	/* Latest background sync LSN */
	WT_LSN		ckpt_lsn;	/* Last checkpoint LSN */
	WT_LSN		dirty_lsn;	/* LSN of last non-synced write */
	WT_LSN		first_lsn;	/* First LSN */
	WT_LSN		sync_dir_lsn;	/* LSN of the last directory sync */
	WT_LSN		sync_lsn;	/* LSN of the last sync */
	WT_LSN		trunc_lsn;	/* End LSN for recovery truncation */
	WT_LSN		write_lsn;	/* End of last LSN written */
	WT_LSN		write_start_lsn;/* Beginning of last LSN written */

	/*
	 * Synchronization resources
	 */
	WT_SPINLOCK      log_lock;      /* Locked: Logging fields */
	WT_SPINLOCK      log_fs_lock;   /* Locked: tmp, prep and log files */
	WT_SPINLOCK      log_slot_lock; /* Locked: Consolidation array */
	WT_SPINLOCK      log_sync_lock; /* Locked: Single-thread fsync */
	WT_SPINLOCK      log_writelsn_lock; /* Locked: write LSN */

	WT_RWLOCK	 log_archive_lock;/* Archive and log cursors */

	/* Notify any waiting threads when sync_lsn is updated. */
	WT_CONDVAR	*log_sync_cond;
	/* Notify any waiting threads when write_lsn is updated. */
	WT_CONDVAR	*log_write_cond;

	/*
	 * Consolidation array information
	 * Our testing shows that the more consolidation we generate the
	 * better the performance we see, which equates to an active slot
	 * count of one.
	 *
	 * Note: this can't be an array, we impose cache-line alignment and
	 * gcc doesn't support that for arrays.
	 *
	 * NOTE(review): slot_pool below is declared as an array, so the note
	 * above appears to be out of date -- confirm and update or remove.
	 */
#define	WT_SLOT_POOL	128
	WT_LOGSLOT	*active_slot;			/* Active slot */
	WT_LOGSLOT	 slot_pool[WT_SLOT_POOL];	/* Pool of all slots */
	int32_t		 pool_index;		/* Index into slot pool */
	size_t		 slot_buf_size;		/* Buffer size for slots */
#ifdef HAVE_DIAGNOSTIC
	uint64_t	 write_calls;		/* Calls to log_write */
#endif

/* Values for the flags member below; each is a distinct single bit. */
/* AUTOMATIC FLAG VALUE GENERATION START */
#define	WT_LOG_FORCE_NEWFILE	0x1u	/* Force switch to new log file */
#define	WT_LOG_OPENED		0x2u	/* Log subsystem successfully open */
#define	WT_LOG_TRUNCATE_NOTSUP	0x4u	/* File system truncate not supported */
/* AUTOMATIC FLAG VALUE GENERATION STOP */
	uint32_t	flags;
};

/*
 * WT_LOG_RECORD --
 *	The header of a log record as written to disk, followed directly by
 *	the record's data.  The per-field comments give each field's byte
 *	range within the 16-byte header.
 */
struct __wt_log_record {
	uint32_t	len;		/* 00-03: Record length including hdr */
	uint32_t	checksum;	/* 04-07: Checksum of the record */

	/*
	 * No automatic generation: flag values cannot change, they're written
	 * to disk.
	 *
	 * Unused bits in the flags, as well as the 'unused' padding,
	 * are expected to be zeroed; we check that to help detect file
	 * corruption.
	 */
#define	WT_LOG_RECORD_COMPRESSED	0x01u	/* Compressed except hdr */
#define	WT_LOG_RECORD_ENCRYPTED		0x02u	/* Encrypted except hdr */
#define	WT_LOG_RECORD_ALL_FLAGS					\
	(WT_LOG_RECORD_COMPRESSED | WT_LOG_RECORD_ENCRYPTED)
	uint16_t	flags;		/* 08-09: Flags */
	uint8_t		unused[2];	/* 10-11: Padding */
	uint32_t	mem_len;	/* 12-15: Uncompressed len if needed */
	/*
	 * Zero-length trailing array marking where the data starts; it adds
	 * nothing to the header size.  NOTE(review): "[0]" is a compiler
	 * extension rather than a C99 flexible array member ("[]") -- confirm
	 * whether that is deliberate before changing it.
	 */
	uint8_t		record[0];	/* Beginning of actual data */
};

/*
 * __wt_log_record_byteswap --
 *	Byte-swap, in place, every multi-byte field of a log record header
 *	(len, checksum, flags and mem_len).  The swap is only compiled in when
 *	WORDS_BIGENDIAN is defined; otherwise the record is left untouched.
 *	The single-byte padding and the record data are never modified.
 */
static inline void
__wt_log_record_byteswap(WT_LOG_RECORD *record)
{
#ifndef	WORDS_BIGENDIAN
	/* Nothing to convert in this configuration. */
	WT_UNUSED(record);
#else
	/* The fields are independent, so the order of the swaps is free. */
	record->mem_len = __wt_bswap32(record->mem_len);
	record->flags = __wt_bswap16(record->flags);
	record->checksum = __wt_bswap32(record->checksum);
	record->len = __wt_bswap32(record->len);
#endif
}

/*
 * WT_LOG_DESC --
 *	The log file's description.  The per-field comments give each field's
 *	byte range within the 16-byte block.
 */
struct __wt_log_desc {
#define	WT_LOG_MAGIC		0x101064u
	uint32_t	log_magic;	/* 00-03: Magic number */
/*
 * NOTE: We bumped the log version from 2 to 3 to make it convenient for
 * MongoDB to detect users accidentally running old binaries on a newer
 * release. There are no actual log file format changes between versions 2
 * and 3.
 */
#define	WT_LOG_VERSION	3
	uint16_t	version;	/* 04-05: Log version */
	uint16_t	unused;		/* 06-07: Unused */
	uint64_t	log_size;	/* 08-15: Log file size */
};
/*
 * This is the log version that introduced the system record.
 */
#define	WT_LOG_VERSION_SYSTEM	2

/*
 * WiredTiger release versions, as major and minor numbers, where the log
 * format version changed: one pair for log version 2 and one for log
 * version 3.
 */
#define	WT_LOG_V2_MAJOR	3
#define	WT_LOG_V2_MINOR	0
#define	WT_LOG_V3_MAJOR	3
#define	WT_LOG_V3_MINOR	1

/*
 * __wt_log_desc_byteswap --
 *	Byte-swap, in place, every field of a log file description block
 *	(log_magic, version, unused and log_size).  The swap is only compiled
 *	in when WORDS_BIGENDIAN is defined; otherwise the block is left
 *	untouched.
 */
static inline void
__wt_log_desc_byteswap(WT_LOG_DESC *desc)
{
#ifndef	WORDS_BIGENDIAN
	/* Nothing to convert in this configuration. */
	WT_UNUSED(desc);
#else
	/* The fields are independent, so the order of the swaps is free. */
	desc->log_size = __wt_bswap64(desc->log_size);
	desc->unused = __wt_bswap16(desc->unused);
	desc->version = __wt_bswap16(desc->version);
	desc->log_magic = __wt_bswap32(desc->log_magic);
#endif
}

/*
 * WT_TXN_PRINTLOG_ARGS --
 *	Cookie passed through the transaction printlog routines: the stream
 *	handle and the output flags.
 */
struct __wt_txn_printlog_args {
	WT_FSTREAM *fs;			/* Stream handle */

/* Values for the flags member below. */
/* AUTOMATIC FLAG VALUE GENERATION START */
#define	WT_TXN_PRINTLOG_HEX	0x1u	/* Add hex output */
/* AUTOMATIC FLAG VALUE GENERATION STOP */
	uint32_t flags;
};

/*
 * WT_LOG_REC_DESC --
 *	A descriptor for a log record type: a format string and a print
 *	callback.  The callback takes the session, a pointer to a byte pointer
 *	(pp) and an end pointer, and returns an int.  NOTE(review): presumably
 *	pp is advanced past what was printed and the return is 0 or an error
 *	code -- confirm against the implementations.
 */
struct __wt_log_rec_desc {
	const char *fmt;
	int (*print)(WT_SESSION_IMPL *session, uint8_t **pp, uint8_t *end);
};

/*
 * WT_LOG_OP_DESC --
 *	A descriptor for a log operation type: a format string and a print
 *	callback.  The callback takes the session, a pointer to a byte pointer
 *	(pp) and an end pointer, and returns an int.  NOTE(review): presumably
 *	pp is advanced past what was printed and the return is 0 or an error
 *	code -- confirm against the implementations.
 */
struct __wt_log_op_desc {
	const char *fmt;
	int (*print)(WT_SESSION_IMPL *session, uint8_t **pp, uint8_t *end);
};