summaryrefslogtreecommitdiff
path: root/src/dbinc/log.h
blob: c4dea6fc32bcfb9feb0cbd7fa89957da90755872 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996, 2012 Oracle and/or its affiliates.  All rights reserved.
 *
 * $Id$
 */

#ifndef _DB_LOG_H_
#define	_DB_LOG_H_

#include "dbinc/db_swap.h"

#if defined(__cplusplus)
extern "C" {
#endif

/*******************************************************
 * DBREG:
 *	The DB file register code keeps track of open files.  It's stored
 *	in the log subsystem's shared region, and so appears in the log.h
 *	header file, but is logically separate.
 *	The dbp may not be open if we are recovering the abort of a create.
 *******************************************************/
/*
 * The per-process table that maps log file-id's to DB structures.
 */
typedef	struct __db_entry {
	DB	*dbp;			/* Open dbp for this file id. */
	int	deleted;		/* File was not found during open. */
} DB_ENTRY;

/*
 * FNAME --
 *	File name and id.
 */
struct __fname {
	SH_TAILQ_ENTRY q;		/* File name queue. */

	pid_t	  pid;			/* Process that owns this. */
	int32_t   id;			/* Logging file id. */
	int32_t   old_id;		/* Saved logging file id. */
	DBTYPE	  s_type;		/* Saved DB type. */

	roff_t	  fname_off;		/* File name offset. */
	roff_t	  dname_off;		/* Database name offset. */
	db_pgno_t meta_pgno;		/* Page number of the meta page. */
	u_int8_t  ufid[DB_FILE_ID_LEN];	/* Unique file id. */

	u_int32_t create_txnid;		/*
					 * Txn ID of the DB create, stored so
					 * we can log it at register time.
					 */
	db_mutex_t mutex;		/* mutex from db handle. */
			/* number of txn referencing + 1 for the db handle. */
	u_int32_t txn_ref;

#define	DB_FNAME_CLOSED		0x01	/* DBP was closed. */
#define	DB_FNAME_DURABLE	0x02	/* File is durable. */
#define	DB_FNAME_INMEM		0x04	/* File is in memory. */
#define	DB_FNAME_NOTLOGGED	0x08	/* Log of close failed. */
#define	DB_FNAME_RECOVER	0x10	/* File was opened by recovery code. */
#define	DB_FNAME_RESTORED	0x20	/* File may be in restored txn. */
#define	DB_FNAME_DBREG_MASK	0xf000	/* These bits come from DBREG below. */
	u_int32_t flags;
};

/* File open/close register log record opcodes. */
#define	DBREG_CHKPNT	1		/* Checkpoint: file name/id dump. */
#define	DBREG_CLOSE	2		/* File close. */
#define	DBREG_OPEN	3		/* File open. */
#define	DBREG_PREOPEN	4		/* Open in mpool only. */
#define	DBREG_RCLOSE	5		/* File close after recovery. */
#define	DBREG_REOPEN	6		/* Open for in-memory database. */
#define	DBREG_XCHKPNT	7		/* Checkpoint of exclusive file. */
#define	DBREG_XOPEN	8		/* File exclusive open. */
#define	DBREG_XREOPEN	9		/* File exclusive open in-memory. */

/* These bits are logged so db_printlog can handle page data. */
#define	DBREG_OP_MASK	0xf		/* Opcode mask */
#define	DBREG_BIGEND	0x1000		/* Db Big endian. */
#define	DBREG_CHKSUM	0x2000		/* Db is checksummed. */
#define	DBREG_ENCRYPT	0x4000		/* Db is encrypted. */
#define	DBREG_EXCL	0x8000		/* Db is exclusive. */

/*******************************************************
 * LOG:
 *	The log subsystem information.
 *******************************************************/
struct __hdr;		typedef struct __hdr HDR;
struct __log;		typedef struct __log LOG;
struct __log_persist;	typedef struct __log_persist LOGP;

#define	LFPREFIX	"log."		/* Log file name prefix. */
#define	LFNAME		"log.%010d"	/* Log file name template. */
#define	LFNAME_V1	"log.%05d"	/* Log file name template, rev 1. */
#define IS_LOG_FILE(name)  (strncmp(name, LFPREFIX, sizeof(LFPREFIX) - 1) == 0)

#define	LG_MAX_DEFAULT		(10 * MEGABYTE)	/* 10 MB. */
#define	LG_MAX_INMEM		(256 * 1024)	/* 256 KB. */
#define	LG_BSIZE_INMEM		(1 * MEGABYTE)	/* 1 MB. */

/*
 * Allocate a few bytes under a power-of-two value.  BDB doesn't care if it's
 * a power-of-two or not, and requesting slightly under a power-of-two allows
 * stupid allocators to avoid wasting space.
 */
#define	LG_BASE_REGION_SIZE	(130000)	/* 128KB - 1072B */
#define	LG_BSIZE_DEFAULT	(32000)		/* 32 KB - 768B */
#define	LG_CURSOR_BUF_SIZE	(32000)		/* 32 KB - 768B */

/*
 * DB_LOG
 *	Per-process log structure.
 */
struct __db_log {
	/*
	 * These fields need to be protected for multi-threaded support.
	 */
	db_mutex_t mtx_dbreg;		/* Mutex for thread protection. */

	DB_ENTRY *dbentry;		/* Recovery file-id mapping. */
#define	DB_GROW_SIZE	64
	int32_t	dbentry_cnt;		/* Entries.  Grows by DB_GROW_SIZE. */

	/*
	 * These fields are only accessed when the region lock is held, so
	 * they do not have to be protected by the thread lock as well.
	 */
	u_int32_t lfname;		/* Log file "name". */
	DB_FH	 *lfhp;			/* Log file handle. */
	time_t	  lf_timestamp;		/* Log file timestamp. */

	u_int8_t *bufp;			/* Region buffer. */

	/* These fields are not thread protected. */
	ENV	 *env;			/* Environment */
	REGINFO	  reginfo;		/* Region information. */

#define	DBLOG_AUTOREMOVE	0x01	/* Autoremove log files. */
#define	DBLOG_DIRECT		0x02	/* Do direct I/O on the log. */
#define	DBLOG_DSYNC		0x04	/* Set OS_DSYNC on the log. */
#define	DBLOG_FORCE_OPEN	0x08	/* Force the DB open even if it appears
					 * to be deleted. */
#define	DBLOG_INMEMORY		0x10	/* Logging is in memory. */
#define	DBLOG_OPENFILES		0x20	/* Prepared files need to be open. */
#define	DBLOG_RECOVER		0x40	/* We are in recovery. */
#define	DBLOG_ZERO		0x80	/* Zero fill the log. */
#define	DBLOG_VERIFYING		0x100	/* The log is being verified. */
	u_int32_t flags;
};

/*
 * HDR --
 *	Log record header.
 */
struct __hdr {
	u_int32_t prev;			/* Previous offset. */
	u_int32_t len;			/* Current length. */
	u_int8_t  chksum[DB_MAC_KEY];	/* Current checksum. */
	u_int8_t  iv[DB_IV_BYTES];	/* IV */
	u_int32_t orig_size;		/* Original size of log record */
	/* !!! - 'size' is not written to log, must be last in hdr */
	size_t	  size;			/* Size of header to use */
};

/*
 * LOG_HDR_SUM -- XOR in prev and len
 *	This helps avoids the race misreading the log while it
 * it is being updated.
 */
#define	LOG_HDR_SUM(crypto, hdr, sum) do {				\
	if (crypto) {							\
		((u_int32_t *)sum)[0] ^= ((HDR *)hdr)->prev;		\
		((u_int32_t *)sum)[1] ^= ((HDR *)hdr)->len;		\
	} else {							\
		((u_int32_t *)sum)[0] ^=				\
		     ((HDR *)hdr)->prev ^ ((HDR *)hdr)->len;		\
	}								\
} while (0)

/*
 * We use HDR internally, and then when we write out, we write out
 * prev, len, and then a 4-byte checksum if normal operation or
 * a crypto-checksum and IV and original size if running in crypto
 * mode.  We must store the original size in case we pad.  Set the
 * size when we set up the header.  We compute a DB_MAC_KEY size
 * checksum regardless, but we can safely just use the first 4 bytes.
 */
#define	HDR_NORMAL_SZ	12
#define	HDR_CRYPTO_SZ	12 + DB_MAC_KEY + DB_IV_BYTES

struct __log_persist {
	u_int32_t magic;		/* DB_LOGMAGIC */
	u_int32_t version;		/* DB_LOGVERSION */

	u_int32_t log_size;		/* Log file size. */
	u_int32_t notused;		/* Historically the log file mode. */
};

/* Macros to lock/unlock the log region as a whole. */
#define	LOG_SYSTEM_LOCK(env)						\
	MUTEX_LOCK(env, ((LOG *)					\
	    (env)->lg_handle->reginfo.primary)->mtx_region)
#define	LOG_SYSTEM_UNLOCK(env)						\
	MUTEX_UNLOCK(env, ((LOG *)					\
	    (env)->lg_handle->reginfo.primary)->mtx_region)

/*
 * LOG --
 *	Shared log region.  One of these is allocated in shared memory,
 *	and describes the log.
 */
struct __log { /* SHARED */
	db_mutex_t mtx_region;		/* Region mutex. */

	db_mutex_t mtx_filelist;	/* Mutex guarding file name list. */

	LOGP	persist;		/* Persistent information. */

	SH_TAILQ_HEAD(__fq1) fq;	/* List of file names. */
	int32_t	fid_max;		/* Max fid allocated. */
	roff_t	free_fid_stack;		/* Stack of free file ids. */
	u_int32_t  free_fids;		/* Height of free fid stack. */
	u_int32_t  free_fids_alloced;	/* N free fid slots allocated. */

	/*
	 * The lsn LSN is the file offset that we're about to write and which
	 * we will return to the user.
	 */
	DB_LSN	  lsn;			/* LSN at current file offset. */

	/*
	 * The f_lsn LSN is the LSN (returned to the user) that "owns" the
	 * first byte of the buffer.  If the record associated with the LSN
	 * spans buffers, it may not reflect the physical file location of
	 * the first byte of the buffer.
	 */
	DB_LSN	  f_lsn;		/* LSN of first byte in the buffer. */
	db_size_t b_off;		/* Current offset in the buffer. */
	u_int32_t w_off;		/* Current write offset in the file. */
	u_int32_t len;			/* Length of the last record. */

	DB_LSN	  active_lsn;		/* Oldest active LSN in the buffer. */
	db_size_t a_off;		/* Offset in the buffer of first active
					   file. */

	/*
	 * The s_lsn LSN is the last LSN that we know is on disk, not just
	 * written, but synced.  This field is protected by the flush mutex
	 * rather than by the region mutex.
	 */
	db_mutex_t mtx_flush;		/* Mutex guarding flushing. */
	int32_t	   in_flush;	/* Log flush in progress. */
	DB_LSN	   s_lsn;		/* LSN of the last sync. */

	DB_LOG_STAT stat;		/* Log statistics. */

	/*
	 * This timestamp is updated anytime someone unlinks log
	 * files.  This can happen when calling __log_vtruncate
	 * or replication internal init when it unlinks log files.
	 *
	 * The timestamp is used so that other processes that might
	 * have file handles to log files know to close/reopen them
	 * so they're not potentially writing to now-removed files.
	 */
	time_t	   timestamp;		/* Log trunc timestamp. */

	/*
	 * !!!
	 * NOTE: the next group of fields are NOT protected by the log
	 * region lock.  They are protected by REP->mtx_clientdb.  If you
	 * need access to both, you must acquire REP->mtx_clientdb
	 * before acquiring the log region lock.
	 *
	 * The waiting_lsn is used by the replication system.  It is the
	 * first LSN that we are holding without putting in the log, because
	 * we received one or more log records out of order.  Associated with
	 * the waiting_lsn is the number of log records that we still have to
	 * receive before we decide that we should request it again.
	 *
	 * The max_wait_lsn is used to control retransmission in the face
	 * of dropped messages.  If we are requesting all records from the
	 * current gap (i.e., chunk of the log that we are missing), then
	 * the max_wait_lsn contains the first LSN that we are known to have
	 * in the __db.rep.db.  If we requested only a single record, then
	 * the max_wait_lsn has the LSN of that record we requested.
	 */
	/* BEGIN fields protected by rep->mtx_clientdb. */
	DB_LSN	  waiting_lsn;		/* First log record after a gap. */
	DB_LSN	  verify_lsn;		/* LSN we are waiting to verify. */
	DB_LSN	  prev_ckp;		/* LSN of ckp preceding verify_lsn. */
	DB_LSN	  max_wait_lsn;		/* Maximum LSN requested. */
	DB_LSN	  max_perm_lsn;		/* Maximum PERMANENT LSN processed. */
	db_timespec max_lease_ts;	/* Maximum Lease timestamp seen. */
	db_timespec wait_ts;		/* Time to wait before requesting. */
	db_timespec rcvd_ts;		/* Initial received time to wait. */
	db_timespec last_ts;		/* Last time of insert in temp db. */
	/*
	 * The ready_lsn is also used by the replication system.  It is the
	 * next LSN we expect to receive.  It's normally equal to "lsn",
	 * except at the beginning of a log file, at which point it's set
	 * to the LSN of the first record of the new file (after the
	 * header), rather than to 0.
	 */
	DB_LSN	  ready_lsn;
	/*
	 * The bulk_buf is used by replication for bulk transfer.  While this
	 * is protected by REP->mtx_clientdb, this doesn't contend with the
	 * above fields because the above are used by clients and the bulk
	 * fields below are used by a master.
	 */
	roff_t	  bulk_buf;		/* Bulk transfer buffer in region. */
	roff_t	  bulk_off;		/* Current offset into bulk buffer. */
	u_int32_t bulk_len;		/* Length of buffer. */
	u_int32_t bulk_flags;		/* Bulk buffer flags. */
	/* END fields protected by rep->mtx_clientdb. */

	/*
	 * During initialization, the log system walks forward through the
	 * last log file to find its end.  If it runs into a checkpoint
	 * while it's doing so, it caches it here so that the transaction
	 * system doesn't need to walk through the file again on its
	 * initialization.
	 */
	DB_LSN	cached_ckp_lsn;

	u_int32_t regionmax;		/* Configured size of the region. */

	roff_t	  buffer_off;		/* Log buffer offset in the region. */
	u_int32_t buffer_size;		/* Log buffer size. */

	u_int32_t log_size;		/* Log file's size. */
	u_int32_t log_nsize;		/* Next log file's size. */

	int	  filemode;		/* Log file permissions mode. */

	/*
	 * DB_LOG_AUTOREMOVE and DB_LOG_INMEMORY: not protected by a mutex,
	 * all we care about is if they're zero or non-zero.
	 */
	int32_t	  db_log_autoremove;
	int32_t	  db_log_inmemory;

	u_int32_t ncommit;		/* Number of txns waiting to commit. */
	DB_LSN	  t_lsn;		/* LSN of first commit */
	SH_TAILQ_HEAD(__commit) commits;/* list of txns waiting to commit. */
	SH_TAILQ_HEAD(__free) free_commits;/* free list of commit structs. */

	/*
	 * In-memory logs maintain a list of the start positions of all log
	 * files currently active in the in-memory buffer.  This is to make the
	 * lookup from LSN to log buffer offset efficient.
	 */
	SH_TAILQ_HEAD(__logfile) logfiles;
	SH_TAILQ_HEAD(__free_logfile) free_logfiles;
};

/*
 * __db_commit structure --
 *	One of these is allocated for each transaction waiting to commit.
 */
struct __db_commit {
	db_mutex_t	mtx_txnwait;	/* Mutex for txn to wait on. */
	DB_LSN		lsn;		/* LSN of commit record. */
	SH_TAILQ_ENTRY	links;		/* Either on free or waiting list. */

#define	DB_COMMIT_FLUSH		0x0001	/* Flush the log when you wake up. */
	u_int32_t	flags;
};

/*
 * Check for the proper progression of Log Sequence Numbers.
 * If we are rolling forward the LSN on the page must be greater
 * than or equal to the previous LSN in log record.
 * We ignore NOT LOGGED LSNs.  The user did an unlogged update.
 * We should eventually see a log record that matches and continue
 * forward.
 * A ZERO LSN implies a page that was allocated prior to the recovery
 * start point and then truncated later in the log.  An allocation of a
 * page after this page will extend the file, leaving a hole.  We want to
 * ignore this page until it is truncated again.
 *
 */

#define	CHECK_LSN(e, redo, cmp, lsn, prev)				\
	if (DB_REDO(redo) && (cmp) < 0 &&				\
	    ((!IS_NOT_LOGGED_LSN(*(lsn)) && !IS_ZERO_LSN(*(lsn))) ||	\
	    IS_REP_CLIENT(e))) {					\
		ret = __db_check_lsn(e, lsn, prev);			\
		goto out;						\
	}
#define	CHECK_ABORT(e, redo, cmp, lsn, prev)				\
	if (redo == DB_TXN_ABORT && (cmp) != 0 &&			\
	    ((!IS_NOT_LOGGED_LSN(*(lsn)) && !IS_ZERO_LSN(*(lsn))) ||	\
	    IS_REP_CLIENT(e))) {					\
		ret = __db_check_lsn(e, lsn, prev);			\
		goto out;						\
	}

/*
 * Helper for in-memory logs -- check whether an offset is in range
 * in a ring buffer (inclusive of start, exclusive of end).
 */
struct __db_filestart {
	u_int32_t	file;
	size_t		b_off;

	SH_TAILQ_ENTRY	links;		/* Either on free or waiting list. */
};

#define	RINGBUF_LEN(lp, start, end)					\
	((start) < (end) ?						\
	    (end) - (start) : (lp)->buffer_size - ((start) - (end)))

/*
 * Internal macro to set pointer to the begin_lsn for generated
 * logging routines.  If begin_lsn is already set then do nothing.
 * Return a pointer to the last lsn too.
 */
#undef DB_SET_TXN_LSNP
#define	DB_SET_TXN_LSNP(txn, blsnp, llsnp) do {				\
	DB_LSN *__lsnp;							\
	TXN_DETAIL *__td;						\
	__td = (txn)->td;						\
	*(llsnp) = &__td->last_lsn;					\
	while (__td->parent != INVALID_ROFF)				\
		__td = R_ADDR(&(txn)->mgrp->reginfo, __td->parent);	\
	__lsnp = &__td->begin_lsn;					\
	if (IS_ZERO_LSN(*__lsnp))					\
		*(blsnp) = __lsnp;					\
} while (0)

/*
 * Status codes indicating the validity of a log file examined by
 * __log_valid().
 */
typedef enum {
	DB_LV_INCOMPLETE,
	DB_LV_NONEXISTENT,
	DB_LV_NORMAL,
	DB_LV_OLD_READABLE,
	DB_LV_OLD_UNREADABLE
} logfile_validity;

/*
 * All log records have these fields.
 */
typedef struct __log_rec_hdr {
	u_int32_t type;
	DB_TXN *txnp;
	DB_LSN prev_lsn;
} LOG_REC_HEADER;

#if defined(__cplusplus)
}
#endif

#include "dbinc_auto/log_ext.h"
#include "dbinc_auto/dbreg_auto.h"
#include "dbinc_auto/dbreg_ext.h"
#endif /* !_DB_LOG_H_ */