diff options
Diffstat (limited to 'src/dbinc/repmgr.h')
-rw-r--r-- | src/dbinc/repmgr.h | 157 |
1 files changed, 123 insertions, 34 deletions
diff --git a/src/dbinc/repmgr.h b/src/dbinc/repmgr.h index d8fd199c..a38defa2 100644 --- a/src/dbinc/repmgr.h +++ b/src/dbinc/repmgr.h @@ -1,7 +1,7 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 2006, 2012 Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2006, 2015 Oracle and/or its affiliates. All rights reserved. * * $Id$ */ @@ -47,20 +47,29 @@ extern "C" { * In protocol version one there were only three message types: 1, 2, and 3; so * 3 was the max. In protocol version 2 we introduced heartbeats, type 4. * (Protocol version 3 did not introduce any new message types.) In version 4 - * we introduced a few more new message types, the largest of which had value 7. + * we introduced a few more new message types, the largest of which had value 8. + * Protocol version 5 did not introduce any new message types, but changed + * the format of site info and membership data to support views. + * + * Protocol version 6 introduced preferred master mode, which added several + * new REPMGR_OWN messages. */ #define REPMGR_MAX_V1_MSG_TYPE 3 #define REPMGR_MAX_V2_MSG_TYPE 4 #define REPMGR_MAX_V3_MSG_TYPE 4 #define REPMGR_MAX_V4_MSG_TYPE 8 +#define REPMGR_MAX_V5_MSG_TYPE 8 +#define REPMGR_MAX_V6_MSG_TYPE 8 #define HEARTBEAT_MIN_VERSION 2 #define CHANNEL_MIN_VERSION 4 #define CONN_COLLISION_VERSION 4 #define GM_MIN_VERSION 4 #define OWN_MIN_VERSION 4 +#define VIEW_MIN_VERSION 5 +#define PREFMAS_MIN_VERSION 6 /* The range of protocol versions we're willing to support. */ -#define DB_REPMGR_VERSION 4 +#define DB_REPMGR_VERSION 6 #define DB_REPMGR_MIN_VERSION 1 /* @@ -73,18 +82,30 @@ extern "C" { * Like the message format types, these message type values should be * permanently frozen. */ -#define REPMGR_CONNECT_REJECT 1 -#define REPMGR_GM_FAILURE 2 -#define REPMGR_GM_FORWARD 3 -#define REPMGR_JOIN_REQUEST 4 -#define REPMGR_JOIN_SUCCESS 5 -#define REPMGR_PARM_REFRESH 6 -#define REPMGR_REJOIN 7 -#define REPMGR_REMOVE_REQUEST 8 -#define REPMGR_REMOVE_SUCCESS 9 -#define REPMGR_RESOLVE_LIMBO 10 -#define REPMGR_SHARING 11 - +#define REPMGR_CONNECT_REJECT 1 +#define REPMGR_GM_FAILURE 2 +#define REPMGR_GM_FORWARD 3 +#define REPMGR_JOIN_REQUEST 4 +#define REPMGR_JOIN_SUCCESS 5 +#define REPMGR_PARM_REFRESH 6 +#define REPMGR_REJOIN 7 +#define REPMGR_REMOVE_REQUEST 8 +#define REPMGR_REMOVE_SUCCESS 9 +#define REPMGR_RESOLVE_LIMBO 10 +#define REPMGR_SHARING 11 +#define REPMGR_LSNHIST_REQUEST 12 +#define REPMGR_LSNHIST_RESPONSE 13 +#define REPMGR_PREFMAS_FAILURE 14 +#define REPMGR_PREFMAS_SUCCESS 15 +#define REPMGR_READONLY_MASTER 16 +#define REPMGR_READONLY_RESPONSE 17 +#define REPMGR_RESTART_CLIENT 18 + +/* Detect inconsistencies between view callback and site's gmdb. */ +#define PARTICIPANT_TO_VIEW(db_rep, site) \ + ((db_rep)->partial && !FLD_ISSET((site)->gmdb_flags, SITE_VIEW)) +#define VIEW_TO_PARTICIPANT(db_rep, site) \ + (!(db_rep)->partial && FLD_ISSET((site)->gmdb_flags, SITE_VIEW)) struct __repmgr_connection; typedef struct __repmgr_connection REPMGR_CONNECTION; @@ -98,7 +119,8 @@ struct __cond_waiters_table; typedef struct __cond_waiters_table COND_WAITERS_TABLE; /* Current Group Membership DB format ID. */ -#define REPMGR_GMDB_FMT_VERSION 1 +#define REPMGR_GMDB_FMT_VERSION 2 +#define REPMGR_GMDB_FMT_MIN_VERSION 1 #ifdef DB_WIN32 typedef SOCKET socket_t; @@ -151,6 +173,17 @@ typedef char SITE_STRING_BUFFER[MAX_SITE_LOC_STRING+1]; #define DB_REPMGR_DEFAULT_ELECTION_RETRY (10 * US_PER_SEC) #define DB_REPMGR_DEFAULT_CHANNEL_TIMEOUT (5 * US_PER_SEC) +/* Default preferred master automatic configuration values. */ +#define DB_REPMGR_PREFMAS_ELECTION_RETRY (1 * US_PER_SEC) +#define DB_REPMGR_PREFMAS_HEARTBEAT_MONITOR (2 * US_PER_SEC) +#define DB_REPMGR_PREFMAS_HEARTBEAT_SEND (75 * (US_PER_SEC / 100)) +#define DB_REPMGR_PREFMAS_PRIORITY_CLIENT 75 +#define DB_REPMGR_PREFMAS_PRIORITY_MASTER 200 + +/* Defaults for undocumented incoming queue maximum messages. */ +#define DB_REPMGR_DEFAULT_INQUEUE_MAX (100 * MEGABYTE) +#define DB_REPMGR_INQUEUE_REDZONE_PERCENT 85 + typedef TAILQ_HEAD(__repmgr_conn_list, __repmgr_connection) CONNECTION_LIST; typedef STAILQ_HEAD(__repmgr_out_q_head, __queued_output) OUT_Q_HEADER; typedef TAILQ_HEAD(__repmgr_retry_q, __repmgr_retry) RETRY_Q_HEADER; @@ -170,14 +203,20 @@ struct __repmgr_runnable { /* * Options governing requested behavior of election thread. */ -#define ELECT_F_EVENT_NOTIFY 0x01 /* Notify application of master failure. */ -#define ELECT_F_FAST 0x02 /* First election "fast" (n-1 trick). */ -#define ELECT_F_IMMED 0x04 /* Start with immediate election. */ -#define ELECT_F_INVITEE 0x08 /* Honor (remote) inviter's nsites. */ -#define ELECT_F_STARTUP 0x10 /* Observe repmgr_start() policy. */ +#define ELECT_F_CLIENT_RESTART 0x01 /* Do client restarts but no elections. */ +#define ELECT_F_EVENT_NOTIFY 0x02 /* Notify application of master failure. */ +#define ELECT_F_FAST 0x04 /* First election "fast" (n-1 trick). */ +#define ELECT_F_IMMED 0x08 /* Start with immediate election. */ +#define ELECT_F_INVITEE 0x10 /* Honor (remote) inviter's nsites. */ +#define ELECT_F_STARTUP 0x20 /* Observe repmgr_start() policy. */ u_int32_t flags; - int eid; /* For Connector thread. */ + /* For connector thread. */ + struct { + int eid; +#define CONNECT_F_REFRESH 0x01 /* New connection to replace old one. */ + u_int32_t flags; + } conn_th; /* * Args for other thread types can be added here in the future @@ -265,6 +304,7 @@ struct __queued_output { */ typedef struct __repmgr_message { STAILQ_ENTRY(__repmgr_message) entries; + size_t size; __repmgr_msg_hdr_args msg_hdr; union { struct { @@ -343,6 +383,7 @@ struct __repmgr_connection { #define CONN_PARAMETERS 5 /* Awaiting parameters handshake. */ #define CONN_READY 6 /* Everything's fine. */ int state; + u_int32_t auto_takeover;/* Connection to remote listener candidate. */ /* * Input: while we're reading a message, we keep track of what phase @@ -464,6 +505,8 @@ typedef struct { SITEADDR addr; /* Unprocessed network address of site. */ u_int32_t config; /* Configuration flags: peer, helper, etc. */ u_int32_t status; /* Group membership status. */ + u_int32_t flags; /* Group membership flags. */ + u_int32_t listener_cand;/* Number of listener candidates of site. */ } SITEINFO; /* @@ -489,6 +532,42 @@ typedef struct { ((u_int)i) < db_rep->site_cnt; \ (int)(++(i)) == db_rep->self_eid ? ++(i) : i) +/* + * Enable replication manager auto listener takeover. + */ +#define HAVE_REPLICATION_LISTENER_TAKEOVER 1 + +/* Listener candidate, that is subordinate rep-aware process. */ +#define IS_LISTENER_CAND(db_rep) \ + (FLD_ISSET((db_rep)->region->config, REP_C_AUTOTAKEOVER) && \ + IS_SUBORDINATE(db_rep) && (db_rep)->repmgr_status == running) + +/* + * The number of listener candidates for each remote site is maintained in + * the listener process and used in subordinate rep-aware processes. + */ +#define SET_LISTENER_CAND(cond, op) \ + do { \ + if (FLD_ISSET(rep->config, REP_C_AUTOTAKEOVER) && \ + !IS_SUBORDINATE(db_rep) && (cond)) { \ + MUTEX_LOCK(env, rep->mtx_repmgr); \ + sites = R_ADDR(env->reginfo, rep->siteinfo_off);\ + (sites[eid].listener_cand)op; \ + MUTEX_UNLOCK(env, rep->mtx_repmgr); \ + } \ + } while (0) + +#define CHECK_LISTENER_CAND(val, op, tval, fval) \ + do { \ + if (IS_LISTENER_CAND(db_rep)) { \ + MUTEX_LOCK(env, rep->mtx_repmgr); \ + sites = R_ADDR(env->reginfo, rep->siteinfo_off);\ + val = ((sites[eid].listener_cand)op) ? \ + (tval) : (fval); \ + MUTEX_UNLOCK(env, rep->mtx_repmgr); \ + } \ + } while (0) + struct __repmgr_site { repmgr_netaddr_t net_addr; @@ -499,12 +578,14 @@ struct __repmgr_site { * host/port network address is promised to be associated with the * locally known EID for the life of the environment. */ - u_int32_t membership; /* Status flags from GMDB. */ + u_int32_t membership; /* Status value from GMDB. */ + u_int32_t gmdb_flags; /* Flags from GMDB. */ u_int32_t config; /* Flags from site->set_config() */ /* * Everything below here is applicable only to remote sites. */ + u_int32_t max_ack_gen; /* Master generation for max_ack. */ DB_LSN max_ack; /* Best ack we've heard from this site. */ int ack_policy; /* Or 0 if unknown. */ u_int16_t alignment; /* Requirements for app channel msgs. */ @@ -604,11 +685,11 @@ struct __channel { * connections may be found: (1) SITE->ref.conn, (2) SITE->sub_conns, and * (3) db_rep->connections. * - * 1. SITE->ref.conn points to our connection with the main process running - * at the given site, if such a connection exists. We may have initiated - * the connection to the site ourselves, or we may have received it as an - * incoming connection. Once it is established there is very little - * difference between those two cases. + * 1. SITE->ref.conn points to our connection with the listener process + * running at the given site, if such a connection exists. We may have + * initiated the connection to the site ourselves, or we may have received + * it as an incoming connection. Once it is established there is very + * little difference between those two cases. * * 2. SITE->sub_conns is a list of connections we have with subordinate * processes running at the given site. There can be any number of these @@ -694,6 +775,7 @@ struct __channel { */ #define APP_CHANNEL_CONNECTION 0x02 /* Connection used for app channel. */ #define ELECTABLE_SITE 0x04 +#define REPMGR_AUTOTAKEOVER 0x08 /* Could become main connection. */ #define REPMGR_SUBORDINATE 0x01 /* This is a subordinate connection. */ /* @@ -719,13 +801,20 @@ typedef struct { * As with message formats, stored formats are defined in repmgr.msg. */ /* - * Flags for the Group Membership data portion of a record. Like message type - * codes, these values are frozen across releases, in order to avoid pointless - * churn. + * Status values for the Group Membership data portion of a record. Like + * message type codes, these values are frozen across releases, in order to + * avoid pointless churn. These values are mutually exclusive. */ #define SITE_ADDING 0x01 #define SITE_DELETING 0x02 #define SITE_PRESENT 0x04 +/* + * Flags for the Group Membership data portion of a record. These values are + * also frozen across releases. These values are bit fields and may be OR'ed + * together. + */ +#define SITE_VIEW 0x01 +#define SITE_JOIN_ELECTABLE 0x02 /* * Message types whose processing could take a long time. We're careful to @@ -755,9 +844,9 @@ typedef struct { * fraction of the code, it's a tiny fraction of the time: repmgr spends most of * its time in a call to select(), and as well a bit in calls into the Base * replication API. All of those release the mutex. - * Access to repmgr's shared list of site addresses is protected by - * another mutex: mtx_repmgr. And, when changing space allocation for that site - * list we conform to the convention of acquiring renv->mtx_regenv. These are + * Access to repmgr's shared values is protected by another mutex: + * mtx_repmgr. And, when changing space allocation for that site list + * we conform to the convention of acquiring renv->mtx_regenv. These are * less frequent of course. * When it's necessary to acquire more than one of these mutexes, the * ordering priority (or "lock ordering protocol") is: |