1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
|
/*-
* See the file LICENSE for redistribution information.
*
* Copyright (c) 2001-2005
* Sleepycat Software. All rights reserved.
*
* $Id: rep.h,v 12.22 2005/10/27 13:27:01 bostic Exp $
*/
#ifndef _REP_H_
#define _REP_H_
#include "dbinc_auto/rep_auto.h"
/*
* Message types
*/
#define REP_ALIVE 1 /* I am alive message. */
#define REP_ALIVE_REQ 2 /* Request for alive messages. */
#define REP_ALL_REQ 3 /* Request all log records greater than LSN. */
#define REP_BULK_LOG 4 /* Bulk transfer of log records. */
#define REP_BULK_PAGE 5 /* Bulk transfer of pages. */
#define REP_DUPMASTER 6 /* Duplicate master detected; propagate. */
#define REP_FILE 7 /* Page of a database file. NOTUSED */
#define REP_FILE_FAIL 8 /* File requested does not exist. */
#define REP_FILE_REQ 9 /* Request for a database file. NOTUSED */
#define REP_LOG 10 /* Log record. */
#define REP_LOG_MORE 11 /* There are more log records to request. */
#define REP_LOG_REQ 12 /* Request for a log record. */
#define REP_MASTER_REQ 13 /* Who is the master */
#define REP_NEWCLIENT 14 /* Announces the presence of a new client. */
#define REP_NEWFILE 15 /* Announce a log file change. */
#define REP_NEWMASTER 16 /* Announces who the master is. */
#define REP_NEWSITE 17 /* Announces that a site has heard from a new
* site; like NEWCLIENT, but indirect. A
* NEWCLIENT message comes directly from the new
* client while a NEWSITE comes indirectly from
* someone who heard about a NEWSITE.
*/
#define REP_PAGE 18 /* Database page. */
#define REP_PAGE_FAIL 19 /* Requested page does not exist. */
#define REP_PAGE_MORE 20 /* There are more pages to request. */
#define REP_PAGE_REQ 21 /* Request for a database page. */
#define REP_REREQUEST 22 /* Force rerequest. */
#define REP_UPDATE 23 /* Environment hotcopy information. */
#define REP_UPDATE_REQ 24 /* Request for hotcopy information. */
#define REP_VERIFY 25 /* A log record for verification. */
#define REP_VERIFY_FAIL 26 /* The client is outdated. */
#define REP_VERIFY_REQ 27 /* Request for a log record to verify. */
#define REP_VOTE1 28 /* Send out your information for an election. */
#define REP_VOTE2 29 /* Send a "you are master" vote. */
/*
* REP_PRINT_MESSAGE
* A function to print a debugging message.
*
* RPRINT
* A macro for debug printing. Takes as an arg the arg set for __db_msg.
*
* !!! This function assumes a local DB_MSGBUF variable called 'mb'.
*/
#ifdef DIAGNOSTIC
#define REP_PRINT_MESSAGE(dbenv, eid, rp, str) \
__rep_print_message(dbenv, eid, rp, str)
#define RPRINT(e, r, x) do { \
if (FLD_ISSET((e)->verbose, DB_VERB_REPLICATION)) { \
DB_MSGBUF_INIT(&mb); \
if ((e)->db_errpfx == NULL) { \
if (F_ISSET((r), REP_F_CLIENT)) \
__db_msgadd((e), &mb, "CLIENT: "); \
else if (F_ISSET((r), REP_F_MASTER)) \
__db_msgadd((e), &mb, "MASTER: "); \
else \
__db_msgadd((e), &mb, "REP_UNDEF: "); \
} else \
__db_msgadd((e), &mb, "%s: ",(e)->db_errpfx); \
__db_msgadd x; \
DB_MSGBUF_FLUSH((e), &mb); \
} \
} while (0)
#else
#define REP_PRINT_MESSAGE(dbenv, eid, rp, str)
#define RPRINT(e, r, x)
#endif
/*
* Election gen file name
* The file contains an egen number for an election this client has NOT
* participated in. I.e. it is the number of a future election. We
* create it when we create the rep region, if it doesn't already exist
* and initialize egen to 1. If it does exist, we read it when we create
* the rep region. We write it immediately before sending our VOTE1 in
* an election. That way, if a client has ever sent a vote for any
* election, the file is already going to be updated to reflect a future
* election, should it crash.
*/
#define REP_EGENNAME "__db.rep.egen"
/*
* Database types for __rep_client_dbinit
*/
typedef enum {
REP_DB, /* Log record database. */
REP_PG /* Pg database. */
} repdb_t;
/* Macros to lock/unlock the replication region as a whole. */
#define REP_SYSTEM_LOCK(dbenv) \
MUTEX_LOCK(dbenv, ((DB_REP *) \
(dbenv)->rep_handle)->region->mtx_region)
#define REP_SYSTEM_UNLOCK(dbenv) \
MUTEX_UNLOCK(dbenv, ((DB_REP *) \
(dbenv)->rep_handle)->region->mtx_region)
/*
* REP --
* Shared replication structure.
*/
typedef struct __rep {
db_mutex_t mtx_region; /* Region mutex. */
db_mutex_t mtx_clientdb; /* Client database mutex. */
roff_t tally_off; /* Offset of the tally region. */
roff_t v2tally_off; /* Offset of the vote2 tally region. */
int eid; /* Environment id. */
int master_id; /* ID of the master site. */
u_int32_t egen; /* Replication election generation. */
u_int32_t gen; /* Replication generation number. */
u_int32_t recover_gen; /* Last generation number in log. */
int asites; /* Space allocated for sites. */
int nsites; /* Number of sites in group. */
int nvotes; /* Number of votes needed. */
int priority; /* My priority in an election. */
u_int32_t gbytes; /* Limit on data sent in single... */
u_int32_t bytes; /* __rep_process_message call. */
#define DB_REP_REQUEST_GAP 4
#define DB_REP_MAX_GAP 128
u_int32_t request_gap; /* # of records to receive before we
* request a missing log record. */
u_int32_t max_gap; /* Maximum number of records before
* requesting a missing log record. */
/* Status change information */
int elect_th; /* A thread is in rep_elect. */
u_int32_t msg_th; /* Number of callers in rep_proc_msg. */
int start_th; /* A thread is in rep_start. */
u_int32_t handle_cnt; /* Count of handles in library. */
u_int32_t op_cnt; /* Multi-step operation count.*/
int in_recovery; /* Running recovery now. */
/* Backup information. */
u_int32_t nfiles; /* Number of files we have info on. */
u_int32_t curfile; /* Current file we're getting. */
__rep_fileinfo_args *curinfo; /* Current file info ptr. */
void *finfo; /* Current file info buffer. */
void *nextinfo; /* Next file info buffer. */
void *originfo; /* Original file info buffer. */
DB_LSN first_lsn; /* Earliest LSN we need. */
DB_LSN last_lsn; /* Latest LSN we need. */
db_pgno_t ready_pg; /* Next pg expected. */
db_pgno_t waiting_pg; /* First pg after gap. */
db_pgno_t max_wait_pg; /* Maximum pg requested. */
u_int32_t npages; /* Num of pages rcvd for this file. */
DB_MPOOLFILE *file_mpf; /* Mpoolfile for in-mem database. */
DB *file_dbp; /* This file's page info. */
DB *queue_dbp; /* Dbp for a queue file. */
/* Vote tallying information. */
int sites; /* Sites heard from. */
int winner; /* Current winner. */
int w_priority; /* Winner priority. */
u_int32_t w_gen; /* Winner generation. */
DB_LSN w_lsn; /* Winner LSN. */
u_int32_t w_tiebreaker; /* Winner tiebreaking value. */
int votes; /* Number of votes for this site. */
u_int32_t esec; /* Election start seconds. */
u_int32_t eusec; /* Election start useconds. */
/* Statistics. */
DB_REP_STAT stat;
/* Configuration. */
#define REP_C_BULK 0x00001 /* Bulk transfer. */
#define REP_C_DELAYCLIENT 0x00002 /* Delay client sync-up. */
#define REP_C_NOAUTOINIT 0x00004 /* No auto initialization. */
#define REP_C_NOWAIT 0x00008 /* Immediate error return. */
u_int32_t config; /* Configuration flags. */
#define REP_F_CLIENT 0x00001 /* Client replica. */
#define REP_F_DELAY 0x00002 /* Delaying client sync-up. */
#define REP_F_EPHASE1 0x00004 /* In phase 1 of election. */
#define REP_F_EPHASE2 0x00008 /* In phase 2 of election. */
#define REP_F_MASTER 0x00010 /* Master replica. */
#define REP_F_MASTERELECT 0x00020 /* Master elect */
#define REP_F_NOARCHIVE 0x00040 /* Rep blocks log_archive */
#define REP_F_READY 0x00080 /* Wait for txn_cnt to be 0. */
#define REP_F_RECOVER_LOG 0x00100 /* In recovery - log. */
#define REP_F_RECOVER_PAGE 0x00200 /* In recovery - pages. */
#define REP_F_RECOVER_UPDATE 0x00400 /* In recovery - files. */
#define REP_F_RECOVER_VERIFY 0x00800 /* In recovery - verify. */
#define REP_F_TALLY 0x01000 /* Tallied vote before elect. */
u_int32_t flags;
} REP;
/*
* Recovery flag mask to easily check any/all recovery bits. That is
* REP_F_READY and all REP_F_RECOVER*. This must change if the values
* of the flags change.
*/
#define REP_F_RECOVER_MASK \
(REP_F_READY | REP_F_RECOVER_LOG | REP_F_RECOVER_PAGE | \
REP_F_RECOVER_UPDATE | REP_F_RECOVER_VERIFY)
#define IN_ELECTION(R) F_ISSET((R), REP_F_EPHASE1 | REP_F_EPHASE2)
#define IN_ELECTION_TALLY(R) \
F_ISSET((R), REP_F_EPHASE1 | REP_F_EPHASE2 | REP_F_TALLY)
#define IS_REP_MASTER(dbenv) \
(REP_ON(dbenv) && ((DB_REP *)(dbenv)->rep_handle)->region && \
F_ISSET(((REP *)((DB_REP *)(dbenv)->rep_handle)->region), \
REP_F_MASTER))
#define IS_REP_CLIENT(dbenv) \
(REP_ON(dbenv) && ((DB_REP *)(dbenv)->rep_handle)->region && \
F_ISSET(((REP *)((DB_REP *)(dbenv)->rep_handle)->region), \
REP_F_CLIENT))
#define IS_CLIENT_PGRECOVER(dbenv) \
(IS_REP_CLIENT(dbenv) && \
F_ISSET(((REP *)((DB_REP *)(dbenv)->rep_handle)->region), \
REP_F_RECOVER_PAGE))
/*
* Macros to figure out if we need to do replication pre/post-amble processing.
* Skip for specific DB handles owned by the replication layer, either because
* replication is running recovery or because it's a handle entirely owned by
* the replication code (replication opens its own databases to track state).
*/
#define IS_ENV_REPLICATED(E) (REP_ON(E) && \
((DB_REP *)((E)->rep_handle))->region != NULL && \
((DB_REP *)((E)->rep_handle))->region->flags != 0)
/*
* Gap processing flags. These provide control over the basic
* gap processing algorithm for some special cases.
*/
#define REP_GAP_FORCE 0x001 /* Force a request for a gap. */
#define REP_GAP_REREQUEST 0x002 /* Gap request is a forced rerequest. */
/* REREQUEST is a superset of FORCE. */
/*
* Basic pre/post-amble processing.
*/
#define REPLICATION_WRAP(dbenv, func_call, ret) do { \
int __rep_check, __t_ret; \
__rep_check = IS_ENV_REPLICATED(dbenv) ? 1 : 0; \
if (__rep_check && ((ret) = __env_rep_enter(dbenv, 0)) != 0) \
return ((ret)); \
(ret) = func_call; \
if (__rep_check && \
(__t_ret = __env_db_rep_exit(dbenv)) != 0 && (ret) == 0) \
(ret) = __t_ret; \
} while (0)
/*
* Per-process replication structure.
*
* There are 2 mutexes used in replication.
* 1. mtx_region - This protects the fields of the rep region above.
* 2. mtx_clientdb - This protects the per-process flags, and bookkeeping
* database and all of the components that maintain it. Those
* components include the following fields in the log region (see log.h):
* a. ready_lsn
* b. waiting_lsn
* c. verify_lsn
* d. wait_recs
* e. rcvd_recs
* f. max_wait_lsn
* These fields in the log region are NOT protected by the log region lock at
* all.
*
* Note that the per-process flags should truly be protected by a special
* per-process thread mutex, but it is currently set in so isolated a manner
* that it didn't make sense to do so and in most case we're already holding
* the mtx_clientdb anyway.
*
* The lock ordering protocol is that mtx_clientdb must be acquired first and
* then either REP->mtx_region, or the LOG->mtx_region mutex may be acquired if
* necessary.
*/
struct __db_rep {
DB *rep_db; /* Bookkeeping database. */
REP *region; /* In memory structure. */
u_int8_t *bulk; /* Shared memory bulk area. */
#define DBREP_OPENFILES 0x0001 /* This handle has opened files. */
u_int32_t flags; /* per-process flags. */
};
/*
* Control structure for replication communication infrastructure.
*
* Note that the version information should be at the beginning of the
* structure, so that we can rearrange the rest of it while letting the
* version checks continue to work. DB_REPVERSION should be revved any time
* the rest of the structure changes or when the message numbers change.
*/
typedef struct __rep_control {
#define DB_REPVERSION 3
u_int32_t rep_version; /* Replication version number. */
u_int32_t log_version; /* Log version number. */
DB_LSN lsn; /* Log sequence number. */
u_int32_t rectype; /* Message type. */
u_int32_t gen; /* Generation number. */
u_int32_t flags; /* log_put flag value. */
} REP_CONTROL;
/* Election vote information. */
typedef struct __rep_vote {
u_int32_t egen; /* Election generation. */
int nsites; /* Number of sites I've been in
* communication with. */
int nvotes; /* Number of votes needed to win. */
int priority; /* My site's priority. */
u_int32_t tiebreaker; /* Tie-breaking quasi-random value. */
} REP_VOTE_INFO;
typedef struct __rep_vtally {
u_int32_t egen; /* Voter's election generation. */
int eid; /* Voter's ID. */
} REP_VTALLY;
/*
* The REP_THROTTLE_ONLY flag is used to do throttle processing only.
* If set, it will only allow sending the REP_*_MORE message, but not
* the normal, non-throttled message. It is used to support throttling
* with bulk transfer.
*/
/* Flags for __rep_send_throttle. */
#define REP_THROTTLE_ONLY 0x0001 /* Send _MORE message only. */
/* Throttled message processing information. */
typedef struct __rep_throttle {
DB_LSN lsn; /* LSN of this record. */
DBT *data_dbt; /* DBT of this record. */
u_int32_t gbytes; /* This call's max gbytes sent. */
u_int32_t bytes; /* This call's max bytes sent. */
u_int32_t type; /* Record type. */
} REP_THROTTLE;
/* Bulk processing information. */
/*
* !!!
* We use a uintptr_t for the offset. We'd really like to use a ptrdiff_t
* since that really is what it is. But ptrdiff_t is not portable and
* doesn't exist everywhere.
*/
typedef struct __rep_bulk {
u_int8_t *addr; /* Address of bulk buffer. */
uintptr_t *offp; /* Ptr to current offset into buffer. */
u_int32_t len; /* Bulk buffer length. */
u_int32_t type; /* Item type in buffer (log, page). */
DB_LSN lsn; /* First LSN in buffer. */
int eid; /* ID of potential recipients. */
#define BULK_FORCE 0x001 /* Force buffer after this record. */
#define BULK_XMIT 0x002 /* Buffer in transit. */
u_int32_t *flagsp; /* Buffer flags. */
} REP_BULK;
/*
* This structure takes care of representing a transaction.
* It holds all the records, sorted by page number so that
* we can obtain locks and apply updates in a deadlock free
* order.
*/
typedef struct __lsn_collection {
u_int nlsns;
u_int nalloc;
DB_LSN *array;
} LSN_COLLECTION;
/*
* This is used by the page-prep routines to do the lock_vec call to
* apply the updates for a single transaction or a collection of
* transactions.
*/
typedef struct _linfo {
int n;
DB_LOCKREQ *reqs;
DBT *objs;
} linfo_t;
#include "dbinc_auto/rep_ext.h"
#endif /* !_REP_H_ */
|