diff options
Diffstat (limited to 'storage/bdb/dbinc/rep.h')
-rw-r--r-- | storage/bdb/dbinc/rep.h | 247 |
1 files changed, 190 insertions, 57 deletions
diff --git a/storage/bdb/dbinc/rep.h b/storage/bdb/dbinc/rep.h index 1e315494c87..ec1f290f45a 100644 --- a/storage/bdb/dbinc/rep.h +++ b/storage/bdb/dbinc/rep.h @@ -1,20 +1,22 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2001-2002 + * Copyright (c) 2001-2004 * Sleepycat Software. All rights reserved. */ #ifndef _REP_H_ #define _REP_H_ +#include "dbinc_auto/rep_auto.h" + #define REP_ALIVE 1 /* I am alive message. */ #define REP_ALIVE_REQ 2 /* Request for alive messages. */ #define REP_ALL_REQ 3 /* Request all log records greater than LSN. */ -#define REP_ELECT 4 /* Indicates that all listeners should */ - /* begin master election */ -#define REP_FILE 6 /* Page of a database file. */ -#define REP_FILE_REQ 7 /* Request for a database file. */ +#define REP_DUPMASTER 4 /* Duplicate master detected; propagate. */ +#define REP_FILE 5 /* Page of a database file. NOTUSED */ +#define REP_FILE_FAIL 6 /* File requested does not exist. */ +#define REP_FILE_REQ 7 /* Request for a database file. NOTUSED */ #define REP_LOG 8 /* Log record. */ #define REP_LOG_MORE 9 /* There are more log records to request. */ #define REP_LOG_REQ 10 /* Request for a log record. */ @@ -29,23 +31,71 @@ * someone who heard about a NEWSITE. */ #define REP_PAGE 16 /* Database page. */ -#define REP_PAGE_REQ 17 /* Request for a database page. */ -#define REP_PLIST 18 /* Database page list. */ -#define REP_PLIST_REQ 19 /* Request for a page list. */ -#define REP_VERIFY 20 /* A log record for verification. */ -#define REP_VERIFY_FAIL 21 /* The client is outdated. */ -#define REP_VERIFY_REQ 22 /* Request for a log record to verify. */ -#define REP_VOTE1 23 /* Send out your information for an election. */ -#define REP_VOTE2 24 /* Send a "you are master" vote. */ +#define REP_PAGE_FAIL 17 /* Requested page does not exist. */ +#define REP_PAGE_MORE 18 /* There are more pages to request. */ +#define REP_PAGE_REQ 19 /* Request for a database page. */ +#define REP_UPDATE 20 /* Environment hotcopy information. */ +#define REP_UPDATE_REQ 21 /* Request for hotcopy information. */ +#define REP_VERIFY 22 /* A log record for verification. */ +#define REP_VERIFY_FAIL 23 /* The client is outdated. */ +#define REP_VERIFY_REQ 24 /* Request for a log record to verify. */ +#define REP_VOTE1 25 /* Send out your information for an election. */ +#define REP_VOTE2 26 /* Send a "you are master" vote. */ -/* Used to consistently designate which messages ought to be received where. */ -#define MASTER_ONLY(dbenv) \ - if (!F_ISSET(dbenv, DB_ENV_REP_MASTER)) return (EINVAL) +/* + * REP_PRINT_MESSAGE + * A function to print a debugging message. + * + * RPRINT + * A macro for debug printing. Takes as an arg the arg set for __db_msg. + * + * !!! This function assumes a local DB_MSGBUF variable called 'mb'. + */ +#ifdef DIAGNOSTIC +#define REP_PRINT_MESSAGE(dbenv, eid, rp, str) \ + __rep_print_message(dbenv, eid, rp, str) +#define RPRINT(e, r, x) do { \ + if (FLD_ISSET((e)->verbose, DB_VERB_REPLICATION)) { \ + DB_MSGBUF_INIT(&mb); \ + if ((e)->db_errpfx == NULL) { \ + if (F_ISSET((r), REP_F_CLIENT)) \ + __db_msgadd((e), &mb, "CLIENT: "); \ + else if (F_ISSET((r), REP_F_MASTER)) \ + __db_msgadd((e), &mb, "MASTER: "); \ + else \ + __db_msgadd((e), &mb, "REP_UNDEF: "); \ + } else \ + __db_msgadd((e), &mb, "%s: ",(e)->db_errpfx); \ + __db_msgadd x; \ + DB_MSGBUF_FLUSH((e), &mb); \ + } \ +} while (0) +#else +#define REP_PRINT_MESSAGE(dbenv, eid, rp, str) +#define RPRINT(e, r, x) +#endif -#define CLIENT_ONLY(dbenv) \ - if (!F_ISSET(dbenv, DB_ENV_REP_CLIENT)) return (EINVAL) +/* + * Election gen file name + * The file contains an egen number for an election this client + * has NOT participated in. I.e. it is the number of a future + * election. We create it when we create the rep region, if it + * doesn't already exist and initialize egen to 1. If it does + * exist, we read it when we create the rep region. We write it + * immediately before sending our VOTE1 in an election. That way, + * if a client has ever sent a vote for any election, the file is + * already going to be updated to reflect a future election, + * should it crash. + */ +#define REP_EGENNAME "__db.rep.egen" -#define ANYSITE(dbenv) +/* + * Database types for __rep_client_dbinit + */ +typedef enum { + REP_DB, /* Log record database. */ + REP_PG /* Pg database. */ +} repdb_t; /* Shared replication structure. */ @@ -58,12 +108,16 @@ typedef struct __rep { */ DB_MUTEX mutex; /* Region lock. */ roff_t db_mutex_off; /* Client database mutex. */ - u_int32_t tally_off; /* Offset of the tally region. */ + roff_t tally_off; /* Offset of the tally region. */ + roff_t v2tally_off; /* Offset of the vote2 tally region. */ int eid; /* Environment id. */ int master_id; /* ID of the master site. */ - u_int32_t gen; /* Replication generation number */ + u_int32_t egen; /* Replication election generation. */ + u_int32_t gen; /* Replication generation number. */ + u_int32_t recover_gen; /* Last generation number in log. */ int asites; /* Space allocated for sites. */ int nsites; /* Number of sites in group. */ + int nvotes; /* Number of votes needed. */ int priority; /* My priority in an election. */ u_int32_t gbytes; /* Limit on data sent in single... */ u_int32_t bytes; /* __rep_process_message call. */ @@ -73,6 +127,30 @@ typedef struct __rep { * request a missing log record. */ u_int32_t max_gap; /* Maximum number of records before * requesting a missing log record. */ + /* Status change information */ + int elect_th; /* A thread is in rep_elect. */ + u_int32_t msg_th; /* Number of callers in rep_proc_msg. */ + int start_th; /* A thread is in rep_start. */ + u_int32_t handle_cnt; /* Count of handles in library. */ + u_int32_t op_cnt; /* Multi-step operation count.*/ + int in_recovery; /* Running recovery now. */ + + /* Backup information. */ + int nfiles; /* Number of files we have info on. */ + int curfile; /* Current file we're getting. */ + __rep_fileinfo_args *curinfo; /* Current file info ptr. */ + void *finfo; /* Current file info buffer. */ + void *nextinfo; /* Next file info buffer. */ + void *originfo; /* Original file info buffer. */ + DB_LSN first_lsn; /* Earliest LSN we need. */ + DB_LSN last_lsn; /* Latest LSN we need. */ + db_pgno_t ready_pg; /* Next pg expected. */ + db_pgno_t waiting_pg; /* First pg after gap. */ + db_pgno_t max_wait_pg; /* Maximum pg requested. */ + u_int32_t npages; /* Num of pages rcvd for this file. */ + DB_MPOOLFILE *file_mpf; /* Mpoolfile for in-mem database. */ + DB *file_dbp; /* This file's page info. */ + DB *queue_dbp; /* Dbp for a queue file. */ /* Vote tallying information. */ int sites; /* Sites heard from. */ @@ -80,38 +158,102 @@ typedef struct __rep { int w_priority; /* Winner priority. */ u_int32_t w_gen; /* Winner generation. */ DB_LSN w_lsn; /* Winner LSN. */ - int w_tiebreaker; /* Winner tiebreaking value. */ + u_int32_t w_tiebreaker; /* Winner tiebreaking value. */ int votes; /* Number of votes for this site. */ /* Statistics. */ DB_REP_STAT stat; -#define REP_F_EPHASE1 0x01 /* In phase 1 of election. */ -#define REP_F_EPHASE2 0x02 /* In phase 2 of election. */ -#define REP_F_LOGSONLY 0x04 /* Log-site only; cannot be upgraded. */ -#define REP_F_MASTER 0x08 /* Master replica. */ -#define REP_F_RECOVER 0x10 -#define REP_F_UPGRADE 0x20 /* Upgradeable replica. */ -#define REP_ISCLIENT (REP_F_UPGRADE | REP_F_LOGSONLY) +#define REP_F_CLIENT 0x00001 /* Client replica. */ +#define REP_F_EPHASE1 0x00002 /* In phase 1 of election. */ +#define REP_F_EPHASE2 0x00004 /* In phase 2 of election. */ +#define REP_F_MASTER 0x00008 /* Master replica. */ +#define REP_F_MASTERELECT 0x00010 /* Master elect */ +#define REP_F_NOARCHIVE 0x00020 /* Rep blocks log_archive */ +#define REP_F_READY 0x00040 /* Wait for txn_cnt to be 0. */ +#define REP_F_RECOVER_LOG 0x00080 /* In recovery - log. */ +#define REP_F_RECOVER_PAGE 0x00100 /* In recovery - pages. */ +#define REP_F_RECOVER_UPDATE 0x00200 /* In recovery - files. */ +#define REP_F_RECOVER_VERIFY 0x00400 /* In recovery - verify. */ +#define REP_F_TALLY 0x00800 /* Tallied vote before elect. */ u_int32_t flags; } REP; +/* + * Recovery flag mask to easily check any/all recovery bits. That is + * REP_F_READY and all REP_F_RECOVER*. This must change if the values + * of the flags change. + */ +#define REP_F_RECOVER_MASK \ + (REP_F_READY | REP_F_RECOVER_LOG | REP_F_RECOVER_PAGE | \ + REP_F_RECOVER_UPDATE | REP_F_RECOVER_VERIFY) + #define IN_ELECTION(R) F_ISSET((R), REP_F_EPHASE1 | REP_F_EPHASE2) -#define ELECTION_DONE(R) F_CLR((R), REP_F_EPHASE1 | REP_F_EPHASE2) +#define IN_ELECTION_TALLY(R) \ + F_ISSET((R), REP_F_EPHASE1 | REP_F_EPHASE2 | REP_F_TALLY) +#define IS_REP_MASTER(dbenv) \ + (REP_ON(dbenv) && ((DB_REP *)(dbenv)->rep_handle)->region && \ + F_ISSET(((REP *)((DB_REP *)(dbenv)->rep_handle)->region), \ + REP_F_MASTER)) + +#define IS_REP_CLIENT(dbenv) \ + (REP_ON(dbenv) && ((DB_REP *)(dbenv)->rep_handle)->region && \ + F_ISSET(((REP *)((DB_REP *)(dbenv)->rep_handle)->region), \ + REP_F_CLIENT)) + +#define IS_CLIENT_PGRECOVER(dbenv) \ + (IS_REP_CLIENT(dbenv) && \ + F_ISSET(((REP *)((DB_REP *)(dbenv)->rep_handle)->region), \ + REP_F_RECOVER_PAGE)) + +/* + * Macros to figure out if we need to do replication pre/post-amble + * processing. + */ +#define IS_REPLICATED(E, D) \ + (!F_ISSET((D), DB_AM_RECOVER | DB_AM_REPLICATION) && \ + REP_ON(E) && ((DB_REP *)((E)->rep_handle))->region != NULL && \ + ((DB_REP *)((E)->rep_handle))->region->flags != 0) + +#define IS_ENV_REPLICATED(E) (REP_ON(E) && \ + ((DB_REP *)((E)->rep_handle))->region != NULL && \ + ((DB_REP *)((E)->rep_handle))->region->flags != 0) /* * Per-process replication structure. + * + * There are 2 mutexes used in replication. + * 1. rep_mutexp - This protects the fields of the rep region above. + * 2. db_mutexp - This protects the per-process flags, and bookkeeping + * database and all of the components that maintain it. Those + * components include the following fields in the log region (see log.h): + * a. ready_lsn + * b. waiting_lsn + * c. verify_lsn + * d. wait_recs + * e. rcvd_recs + * f. max_wait_lsn + * These fields in the log region are NOT protected by the log + * region lock at all. + * + * Note that the per-process flags should truly be protected by a + * special per-process thread mutex, but it is currently set in so + * isolated a manner that it didn't make sense to do so and in most + * case we're already holding the db_mutexp anyway. + * + * The lock ordering protocol is that db_mutexp must be acquired + * first and then either rep_mutexp, or the log region mutex may + * be acquired if necessary. */ struct __db_rep { - DB_MUTEX *mutexp; + DB_MUTEX *rep_mutexp; /* Mutex for rep region */ DB_MUTEX *db_mutexp; /* Mutex for bookkeeping database. */ DB *rep_db; /* Bookkeeping database. */ REP *region; /* In memory structure. */ - int (*rep_send) /* Send function. */ - __P((DB_ENV *, - const DBT *, const DBT *, int, u_int32_t)); +#define DBREP_OPENFILES 0x0001 /* This handle has opened files. */ + u_int32_t flags; /* per-process flags. */ }; /* @@ -120,10 +262,10 @@ struct __db_rep { * Note that the version information should be at the beginning of the * structure, so that we can rearrange the rest of it while letting the * version checks continue to work. DB_REPVERSION should be revved any time - * the rest of the structure changes. + * the rest of the structure changes or when the message numbers change. */ typedef struct __rep_control { -#define DB_REPVERSION 1 +#define DB_REPVERSION 2 u_int32_t rep_version; /* Replication version number. */ u_int32_t log_version; /* Log version number. */ @@ -135,37 +277,28 @@ typedef struct __rep_control { /* Election vote information. */ typedef struct __rep_vote { - int priority; /* My site's priority. */ - int nsites; /* Number of sites I've been in + u_int32_t egen; /* Election generation. */ + int nsites; /* Number of sites I've been in * communication with. */ - int tiebreaker; /* Tie-breaking quasi-random int. */ + int nvotes; /* Number of votes needed to win. */ + int priority; /* My site's priority. */ + u_int32_t tiebreaker; /* Tie-breaking quasi-random value. */ } REP_VOTE_INFO; +typedef struct __rep_vtally { + u_int32_t egen; /* Voter's election generation. */ + int eid; /* Voter's ID. */ +} REP_VTALLY; + /* * This structure takes care of representing a transaction. * It holds all the records, sorted by page number so that * we can obtain locks and apply updates in a deadlock free * order. */ -typedef struct __lsn_page { - DB_LSN lsn; - u_int32_t fid; - DB_LOCK_ILOCK pgdesc; -#define LSN_PAGE_NOLOCK 0x0001 /* No lock necessary for log rec. */ - u_int32_t flags; -} LSN_PAGE; - -typedef struct __txn_recs { - int npages; - int nalloc; - LSN_PAGE *array; - u_int32_t txnid; - u_int32_t lockid; -} TXN_RECS; - typedef struct __lsn_collection { - int nlsns; - int nalloc; + u_int nlsns; + u_int nalloc; DB_LSN *array; } LSN_COLLECTION; |