summaryrefslogtreecommitdiff
path: root/src/rep/rep_method.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/rep/rep_method.c')
-rw-r--r--src/rep/rep_method.c615
1 files changed, 552 insertions, 63 deletions
diff --git a/src/rep/rep_method.c b/src/rep/rep_method.c
index f9f1924c..e0e7dd19 100644
--- a/src/rep/rep_method.c
+++ b/src/rep/rep_method.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 2001, 2012 Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2001, 2015 Oracle and/or its affiliates. All rights reserved.
*
* $Id$
*/
@@ -10,6 +10,7 @@
#include "db_int.h"
#include "dbinc/db_page.h"
+#include "dbinc/blob.h"
#include "dbinc/btree.h"
#include "dbinc/mp.h"
#include "dbinc/txn.h"
@@ -17,14 +18,12 @@
static int __rep_abort_prepared __P((ENV *));
static int __rep_await_condition __P((ENV *,
struct rep_waitgoal *, db_timeout_t));
-static int __rep_bt_cmp __P((DB *, const DBT *, const DBT *));
+static int __rep_bt_cmp __P((DB *, const DBT *, const DBT *, size_t *));
static int __rep_check_applied __P((ENV *,
DB_THREAD_INFO *, DB_COMMIT_INFO *, struct rep_waitgoal *));
static void __rep_config_map __P((ENV *, u_int32_t *, u_int32_t *));
static u_int32_t __rep_conv_vers __P((ENV *, u_int32_t));
-static int __rep_read_lsn_history __P((ENV *,
- DB_THREAD_INFO *, DB_TXN **, DBC **, u_int32_t,
- __rep_lsn_hist_data_args *, struct rep_waitgoal *, u_int32_t));
+static int __rep_defview __P((DB_ENV *, const char *, int *, u_int32_t));
static int __rep_restore_prepared __P((ENV *));
static int __rep_save_lsn_hist __P((ENV *, DB_THREAD_INFO *, DB_LSN *));
/*
@@ -123,9 +122,11 @@ __rep_get_config(dbenv, which, onp)
#undef OK_FLAGS
#define OK_FLAGS \
(DB_REP_CONF_AUTOINIT | DB_REP_CONF_AUTOROLLBACK | \
- DB_REP_CONF_BULK | DB_REP_CONF_DELAYCLIENT | DB_REP_CONF_INMEM | \
+ DB_REP_CONF_BULK | DB_REP_CONF_DELAYCLIENT | \
+ DB_REP_CONF_ELECT_LOGLENGTH | DB_REP_CONF_INMEM | \
DB_REP_CONF_LEASE | DB_REP_CONF_NOWAIT | \
- DB_REPMGR_CONF_2SITE_STRICT | DB_REPMGR_CONF_ELECTIONS)
+ DB_REPMGR_CONF_2SITE_STRICT | DB_REPMGR_CONF_ELECTIONS | \
+ DB_REPMGR_CONF_PREFMAS_CLIENT | DB_REPMGR_CONF_PREFMAS_MASTER)
if (FLD_ISSET(which, ~OK_FLAGS))
return (__db_ferr(env, "DB_ENV->rep_get_config", 0));
@@ -171,19 +172,30 @@ __rep_set_config(dbenv, which, on)
REP *rep;
REP_BULK bulk;
u_int32_t mapped, orig;
- int ret, t_ret;
+ int inmemlog, pm_ret, ret, t_ret;
env = dbenv->env;
db_rep = env->rep_handle;
ret = 0;
+ pm_ret = 0;
+ inmemlog = 0;
#undef OK_FLAGS
#define OK_FLAGS \
(DB_REP_CONF_AUTOINIT | DB_REP_CONF_AUTOROLLBACK | \
- DB_REP_CONF_BULK | DB_REP_CONF_DELAYCLIENT | DB_REP_CONF_INMEM | \
+ DB_REP_CONF_BULK | DB_REP_CONF_DELAYCLIENT | \
+ DB_REP_CONF_ELECT_LOGLENGTH | DB_REP_CONF_INMEM | \
DB_REP_CONF_LEASE | DB_REP_CONF_NOWAIT | \
- DB_REPMGR_CONF_2SITE_STRICT | DB_REPMGR_CONF_ELECTIONS)
-#define REPMGR_FLAGS (REP_C_2SITE_STRICT | REP_C_ELECTIONS)
+ DB_REPMGR_CONF_2SITE_STRICT | DB_REPMGR_CONF_ELECTIONS | \
+ DB_REPMGR_CONF_PREFMAS_CLIENT | DB_REPMGR_CONF_PREFMAS_MASTER)
+#define REPMGR_FLAGS (REP_C_2SITE_STRICT | REP_C_ELECTIONS | \
+ REP_C_PREFMAS_CLIENT | REP_C_PREFMAS_MASTER)
+
+#define TURNING_ON_PREFMAS(orig, curr) \
+ ((FLD_ISSET(curr, REP_C_PREFMAS_MASTER) && \
+ !FLD_ISSET(orig, REP_C_PREFMAS_MASTER)) || \
+ (FLD_ISSET(curr, REP_C_PREFMAS_CLIENT) && \
+ !FLD_ISSET(orig, REP_C_PREFMAS_CLIENT)))
ENV_NOT_CONFIGURED(
env, db_rep->region, "DB_ENV->rep_set_config", DB_INIT_REP);
@@ -224,6 +236,62 @@ __rep_set_config(dbenv, which, on)
return (EINVAL);
}
/*
+ * The undocumented ELECT_LOGLENGTH option and the preferred
+ * master options cannot be changed after calling repmgr_start.
+ */
+ if (FLD_ISSET(mapped, (REP_C_ELECT_LOGLENGTH |
+ REP_C_PREFMAS_MASTER | REP_C_PREFMAS_CLIENT)) &&
+ F_ISSET(rep, REP_F_START_CALLED)) {
+ __db_errx(env, DB_STR("3706",
+ "DB_ENV->rep_set_config: %s "
+ "must be configured before DB_ENV->repmgr_start"),
+ FLD_ISSET(mapped, REP_C_ELECT_LOGLENGTH) ?
+ "ELECT_LOGLENGTH" : "preferred master");
+ ENV_LEAVE(env, ip);
+ return (EINVAL);
+ }
+ /*
+ * Do not allow users to turn on preferred master if
+ * leases or in-memory replication files are in effect,
+ * or with a private environment or in-memory log files.
+ */
+ if (FLD_ISSET(mapped,
+ (REP_C_PREFMAS_MASTER | REP_C_PREFMAS_CLIENT)) &&
+ (REP_CONFIG_IS_SET(env, (REP_C_LEASE | REP_C_INMEM)) ||
+ (__log_get_config(dbenv,
+ DB_LOG_IN_MEMORY, &inmemlog) == 0 &&
+ (inmemlog > 0 || F_ISSET(env, ENV_PRIVATE))))) {
+ __db_errx(env, DB_STR("3707",
+ "DB_ENV->rep_set_config: preferred master mode "
+ "cannot be used with %s"),
+ REP_CONFIG_IS_SET(env, REP_C_LEASE) ?
+ "master leases" :
+ REP_CONFIG_IS_SET(env, REP_C_INMEM) ?
+ "in-memory replication files" :
+ inmemlog > 0 ? "in-memory log files" :
+ "a private environment");
+ ENV_LEAVE(env, ip);
+ return (EINVAL);
+ }
+ /*
+ * If we are already in preferred master mode, we can't
+ * turn off elections or 2site_strict and we can't turn on
+ * leases.
+ */
+ if (PREFMAS_IS_SET(env) && ((FLD_ISSET(mapped,
+ (REP_C_ELECTIONS | REP_C_2SITE_STRICT)) && on == 0) ||
+ (FLD_ISSET(mapped, REP_C_LEASE) && on > 0))) {
+ __db_errx(env, DB_STR("3708",
+ "DB_ENV->rep_set_config: cannot %s %s "
+ "in preferred master mode"),
+ on == 0 ? "disable" : "enable",
+ FLD_ISSET(mapped, REP_C_ELECTIONS) ? "elections" :
+ FLD_ISSET(mapped, REP_C_LEASE) ? "leases" :
+ "2SITE_STRICT");
+ ENV_LEAVE(env, ip);
+ return (EINVAL);
+ }
+ /*
* Leases must be turned on before calling rep_start.
* Leases can never be turned off once they're turned on.
*/
@@ -252,6 +320,17 @@ __rep_set_config(dbenv, which, on)
else
FLD_CLR(rep->config, mapped);
+#ifdef HAVE_REPLICATION_THREADS
+ /* Do automatic preferred master configuration. */
+ if (TURNING_ON_PREFMAS(orig, rep->config) &&
+ (pm_ret = __repmgr_prefmas_auto_config(dbenv,
+ &rep->config)) != 0) {
+ REP_SYSTEM_UNLOCK(env);
+ MUTEX_UNLOCK(env, rep->mtx_clientdb);
+ ENV_LEAVE(env, ip);
+ goto prefmas_err;
+ }
+#endif
/*
* Bulk transfer requires special processing if it is getting
* toggled.
@@ -297,10 +376,25 @@ __rep_set_config(dbenv, which, on)
ret = t_ret;
#endif
} else {
+ orig = db_rep->config;
if (on)
FLD_SET(db_rep->config, mapped);
else
FLD_CLR(db_rep->config, mapped);
+#ifdef HAVE_REPLICATION_THREADS
+ /* Do automatic preferred master configuration. */
+ if (TURNING_ON_PREFMAS(orig, db_rep->config))
+ pm_ret =
+ __repmgr_prefmas_auto_config(dbenv,
+ &db_rep->config);
+#endif
+ }
+prefmas_err:
+ if (pm_ret != 0) {
+ __db_errx(env, DB_STR("3709",
+ "DB_ENV->rep_set_config: could not complete automatic "
+ "preferred master configuration"));
+ ret = EINVAL;
}
/* Configuring 2SITE_STRICT, etc. makes this a repmgr application */
if (ret == 0 && FLD_ISSET(mapped, REPMGR_FLAGS))
@@ -331,6 +425,10 @@ __rep_config_map(env, inflagsp, outflagsp)
FLD_SET(*outflagsp, REP_C_DELAYCLIENT);
FLD_CLR(*inflagsp, DB_REP_CONF_DELAYCLIENT);
}
+ if (FLD_ISSET(*inflagsp, DB_REP_CONF_ELECT_LOGLENGTH)) {
+ FLD_SET(*outflagsp, REP_C_ELECT_LOGLENGTH);
+ FLD_CLR(*inflagsp, DB_REP_CONF_ELECT_LOGLENGTH);
+ }
if (FLD_ISSET(*inflagsp, DB_REP_CONF_INMEM)) {
FLD_SET(*outflagsp, REP_C_INMEM);
FLD_CLR(*inflagsp, DB_REP_CONF_INMEM);
@@ -351,6 +449,14 @@ __rep_config_map(env, inflagsp, outflagsp)
FLD_SET(*outflagsp, REP_C_ELECTIONS);
FLD_CLR(*inflagsp, DB_REPMGR_CONF_ELECTIONS);
}
+ if (FLD_ISSET(*inflagsp, DB_REPMGR_CONF_PREFMAS_CLIENT)) {
+ FLD_SET(*outflagsp, REP_C_PREFMAS_CLIENT);
+ FLD_CLR(*inflagsp, DB_REPMGR_CONF_PREFMAS_CLIENT);
+ }
+ if (FLD_ISSET(*inflagsp, DB_REPMGR_CONF_PREFMAS_MASTER)) {
+ FLD_SET(*outflagsp, REP_C_PREFMAS_MASTER);
+ FLD_CLR(*inflagsp, DB_REPMGR_CONF_PREFMAS_MASTER);
+ }
DB_ASSERT(env, *inflagsp == 0);
}
@@ -368,8 +474,10 @@ __rep_start_pp(dbenv, dbt, flags)
DBT *dbt;
u_int32_t flags;
{
- DB_REP *db_rep;
ENV *env;
+ DB_REP *db_rep;
+ DB_THREAD_INFO *ip;
+ int ret;
env = dbenv->env;
db_rep = env->rep_handle;
@@ -400,7 +508,11 @@ __rep_start_pp(dbenv, dbt, flags)
return (EINVAL);
}
- return (__rep_start_int(env, dbt, flags));
+ ENV_ENTER(env, ip);
+ ret = __rep_start_int(env, dbt, flags, 0);
+ ENV_LEAVE(env, ip);
+
+ return (ret);
}
/*
@@ -432,13 +544,14 @@ __rep_start_pp(dbenv, dbt, flags)
* clients that reference non-existent files whose creation was backed out
* during a synchronizing recovery.
*
- * PUBLIC: int __rep_start_int __P((ENV *, DBT *, u_int32_t));
+ * PUBLIC: int __rep_start_int __P((ENV *, DBT *, u_int32_t, u_int32_t));
*/
int
-__rep_start_int(env, dbt, flags)
+__rep_start_int(env, dbt, flags, startopts)
ENV *env;
DBT *dbt;
u_int32_t flags;
+ u_int32_t startopts;
{
DB *dbp;
DB_LOG *dblp;
@@ -474,9 +587,31 @@ __rep_start_int(env, dbt, flags)
return (EINVAL);
}
- ENV_ENTER(env, ip);
+ /*
+ * If we are a view, we can never become master.
+ */
+ if (IS_VIEW_SITE(env) && role == DB_REP_MASTER) {
+ __db_errx(env, DB_STR("3685",
+ "View site cannot become master"));
+ return (EINVAL);
+ }
+
+ /*
+ * Check for consistent view usage. We need to check here rather
+ * than in __rep_open because non-rep-aware processes such as
+ * db_stat may open/join the environment. Rep-aware handles must
+ * consistently set the view.
+ */
+ if ((ret = __rep_check_view(env)) != 0) {
+ RPRINT(env, (env, DB_VERB_REP_MISC,
+ "Application env/view mismatch."));
+ __db_errx(env, DB_STR("3686",
+ "Application environment and view callback mismatch"));
+ return (ret);
+ }
/* Serialize rep_start() calls. */
+ ENV_GET_THREAD_INFO(env, ip);
MUTEX_LOCK(env, rep->mtx_repstart);
start_th = 1;
@@ -492,8 +627,14 @@ __rep_start_int(env, dbt, flags)
goto out;
REP_SYSTEM_LOCK(env);
+ /*
+ * The FORCE_ROLECHG option is used when a side-effect of the role
+ * change such as incrementing the master gen is needed regardless
+ * of the previous role.
+ */
role_chg = (!F_ISSET(rep, REP_F_MASTER) && role == DB_REP_MASTER) ||
- (!F_ISSET(rep, REP_F_CLIENT) && role == DB_REP_CLIENT);
+ (!F_ISSET(rep, REP_F_CLIENT) && role == DB_REP_CLIENT) ||
+ FLD_ISSET(startopts, REP_START_FORCE_ROLECHG);
/*
* There is no need for lockout if all we're doing is sending a message.
@@ -511,9 +652,11 @@ __rep_start_int(env, dbt, flags)
goto out;
}
- if (FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_MSG)) {
+ if (!FLD_ISSET(startopts, REP_START_WAIT_LOCKMSG) &&
+ FLD_ISSET(rep->lockout_flags, REP_LOCKOUT_MSG)) {
/*
- * There is already someone in msg lockout. Return.
+ * There is already someone in msg lockout and we are not
+ * waiting. Return.
*/
RPRINT(env, (env, DB_VERB_REP_MISC,
"Thread already in msg lockout"));
@@ -702,10 +845,15 @@ __rep_start_int(env, dbt, flags)
* now defunct on master.
* NEWFILE: Used to delay client apply during newfile
* operation, not applicable to master.
+ * READONLY_MASTER: Used to coordinate preferred master
+ * takeover, should not remain in effect after restart.
+ * HOLD_GEN: Freeze gen for preferred master, should not
+ * remain in effect after restart.
*/
F_CLR(rep, REP_F_CLIENT | REP_F_ABBREVIATED |
REP_F_MASTERELECT | REP_F_SKIPPED_APPLY | REP_F_DELAY |
- REP_F_LEASE_EXPIRED | REP_F_NEWFILE);
+ REP_F_LEASE_EXPIRED | REP_F_NEWFILE |
+ REP_F_READONLY_MASTER | REP_F_HOLD_GEN);
/*
* When becoming a master, set the following flags:
* MASTER: Indicate that this site is master.
@@ -842,11 +990,16 @@ __rep_start_int(env, dbt, flags)
}
/*
* When becoming a client, clear the following flags:
+ * HOLD_GEN: Freeze gen for preferred master, should not
+ * remain in effect after restart.
* MASTER: Site is no longer a master.
* MASTERELECT: Indicates that a master is elected
* rather than appointed, not applicable on client.
+ * READONLY_MASTER: Used to coordinate preferred master
+ * takeover, should not remain in effect after restart.
*/
- F_CLR(rep, REP_F_MASTER | REP_F_MASTERELECT);
+ F_CLR(rep, REP_F_HOLD_GEN | REP_F_MASTER | REP_F_MASTERELECT |
+ REP_F_READONLY_MASTER);
F_SET(rep, REP_F_CLIENT);
/*
@@ -928,6 +1081,15 @@ __rep_start_int(env, dbt, flags)
* sync with the master.
*/
SET_GEN(0);
+ /*
+ * If we are changing role to client, reset our min log file
+ * until we hear from a master or another client. In
+ * particular, in a dupmaster situation, if this site loses
+ * an election a stale min_log_file would prevent archiving.
+ */
+#ifdef HAVE_REPLICATION_THREADS
+ rep->min_log_file = 0;
+#endif
REP_SYSTEM_UNLOCK(env);
/*
@@ -935,6 +1097,15 @@ __rep_start_int(env, dbt, flags)
*/
if ((ret = __dbt_usercopy(env, dbt)) != 0)
goto out;
+ /*
+ * The HOLD_CLIGEN option does not allow this client's
+ * gen to change until the REP_F_HOLD_GEN flag is cleared.
+ * It prevents this site from responding to NEWMASTER messages
+ * and disables updating the gen from other incoming messages.
+ */
+ if (FLD_ISSET(startopts, REP_START_HOLD_CLIGEN))
+ F_SET(rep, REP_F_HOLD_GEN);
+
(void)__rep_send_message(env,
DB_EID_BROADCAST, REP_NEWCLIENT, NULL, dbt, 0, 0);
}
@@ -967,7 +1138,6 @@ out:
if (start_th)
MUTEX_UNLOCK(env, rep->mtx_repstart);
__dbt_userfree(env, dbt, NULL, NULL);
- ENV_LEAVE(env, ip);
return (ret);
}
@@ -1170,6 +1340,9 @@ __rep_client_dbinit(env, startup, which)
if (which == REP_DB) {
name = REPDBNAME;
rdbpp = &db_rep->rep_db;
+ } else if (which == REP_BLOB) {
+ name = REPBLOBNAME;
+ rdbpp = &db_rep->blob_dbp;
} else {
name = REPPAGENAME;
rdbpp = &db_rep->file_dbp;
@@ -1209,16 +1382,28 @@ __rep_client_dbinit(env, startup, which)
if (which == REP_DB &&
(ret = __bam_set_bt_compare(dbp, __rep_bt_cmp)) != 0)
goto err;
+ if (which == REP_BLOB &&
+ (ret = __bam_set_bt_compare(dbp, __rep_blob_cmp)) != 0 &&
+ (ret = __db_set_dup_compare(dbp, __rep_offset_cmp)) != 0)
+ goto err;
/* Don't write log records on the client. */
if ((ret = __db_set_flags(dbp, DB_TXN_NOT_DURABLE)) != 0)
goto err;
+ /* Blob gap processing requires sorted duplicates. */
+ if (which == REP_BLOB) {
+ if ((ret = __db_set_blob_threshold(dbp, 0, 0)) != 0)
+ goto err;
+ if ((ret = __db_set_flags(dbp, DB_DUPSORT)) != 0)
+ goto err;
+ }
+
flags = DB_NO_AUTO_COMMIT | DB_CREATE | DB_INTERNAL_TEMPORARY_DB |
(F_ISSET(env, ENV_THREAD) ? DB_THREAD : 0);
if ((ret = __db_open(dbp, ip, NULL, fname, subdb,
- (which == REP_DB ? DB_BTREE : DB_RECNO),
+ (which == REP_PG ? DB_RECNO : DB_BTREE),
flags, 0, PGNO_BASE_MD)) != 0)
goto err;
@@ -1243,14 +1428,16 @@ err: if (dbp != NULL &&
* care about the LSNs.
*/
static int
-__rep_bt_cmp(dbp, dbt1, dbt2)
+__rep_bt_cmp(dbp, dbt1, dbt2, locp)
DB *dbp;
const DBT *dbt1, *dbt2;
+ size_t *locp;
{
DB_LSN lsn1, lsn2;
__rep_control_args *rp1, *rp2;
COMPQUIET(dbp, NULL);
+ COMPQUIET(locp, NULL);
rp1 = dbt1->data;
rp2 = dbt2->data;
@@ -1274,6 +1461,82 @@ __rep_bt_cmp(dbp, dbt1, dbt2)
}
/*
+ * __rep_blob_cmp --
+ *
+ * Comparison function for the blob gap database. The key is the blob_sid
+ * appended with the blob_id.
+ *
+ * PUBLIC: int __rep_blob_cmp __P((DB *, const DBT *, const DBT *, size_t *));
+ */
+int
+__rep_blob_cmp(dbp, dbt1, dbt2, locp)
+ DB *dbp;
+ const DBT *dbt1, *dbt2;
+ size_t *locp;
+{
+ db_seq_t blob_id1, blob_id2, blob_sid1, blob_sid2;
+ u_int8_t *p;
+
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(locp, NULL);
+
+ /* Use memcpy here to prevent alignment issues. */
+ p = dbt1->data;
+ memcpy(&blob_sid1, p, sizeof(db_seq_t));
+ p += sizeof(db_seq_t);
+ memcpy(&blob_id1, p, sizeof(db_seq_t));
+ p = dbt2->data;
+ memcpy(&blob_sid2, p, sizeof(db_seq_t));
+ p += sizeof(db_seq_t);
+ memcpy(&blob_id2, p, sizeof(db_seq_t));
+
+ if (blob_sid1 > blob_sid2)
+ return (1);
+
+ if (blob_sid1 < blob_sid2)
+ return (-1);
+
+ if (blob_id1 > blob_id2)
+ return (1);
+
+ if (blob_id1 < blob_id2)
+ return (-1);
+
+ return (0);
+}
+
+/*
+ * __rep_offset_cmp --
+ *
+ * Comparison function for duplicates in the the blob gap database.
+ *
+ * PUBLIC: int __rep_offset_cmp
+ * PUBLIC: __P((DB *, const DBT *, const DBT *, size_t *));
+ */
+int
+__rep_offset_cmp(dbp, dbt1, dbt2, locp)
+ DB *dbp;
+ const DBT *dbt1, *dbt2;
+ size_t *locp;
+{
+ off_t offset1, offset2;
+
+ COMPQUIET(dbp, NULL);
+ COMPQUIET(locp, NULL);
+
+ /* Use memcpy here to prevent alignment issues. */
+ memcpy(&offset1, dbt1->data, sizeof(off_t));
+ memcpy(&offset2, dbt2->data, sizeof(off_t));
+
+ if (offset1 == offset2)
+ return (0);
+ else if (offset1 > offset2)
+ return (1);
+
+ return (-1);
+}
+
+/*
* __rep_abort_prepared --
* Abort any prepared transactions that recovery restored.
*
@@ -1684,7 +1947,10 @@ __rep_set_nsites_pp(dbenv, n)
"DB_ENV->rep_set_nsites: cannot call from Replication Manager application"));
return (EINVAL);
}
- if ((ret = __rep_set_nsites_int(env, n)) == 0)
+ ENV_ENTER(env, ip);
+ ret = __rep_set_nsites_int(env, n);
+ ENV_LEAVE(env, ip);
+ if (ret == 0)
APP_SET_BASEAPI(env);
return (ret);
}
@@ -1748,18 +2014,15 @@ __rep_get_nsites(dbenv, n)
}
/*
- * PUBLIC: int __rep_set_priority __P((DB_ENV *, u_int32_t));
+ * PUBLIC: int __rep_set_priority_pp __P((DB_ENV *, u_int32_t));
*/
int
-__rep_set_priority(dbenv, priority)
+__rep_set_priority_pp(dbenv, priority)
DB_ENV *dbenv;
u_int32_t priority;
{
DB_REP *db_rep;
ENV *env;
- REP *rep;
- u_int32_t prev;
- int ret;
env = dbenv->env;
db_rep = env->rep_handle;
@@ -1767,6 +2030,30 @@ __rep_set_priority(dbenv, priority)
ENV_NOT_CONFIGURED(
env, db_rep->region, "DB_ENV->rep_set_priority", DB_INIT_REP);
+ if (PREFMAS_IS_SET(env)) {
+ __db_errx(env, DB_STR_A("3710",
+"%s: cannot change priority in preferred master mode.",
+ "%s"), "DB_ENV->rep_set_priority");
+ return (EINVAL);
+ }
+
+ return (__rep_set_priority_int(env, priority));
+}
+
+/*
+ * PUBLIC: int __rep_set_priority_int __P((ENV *, u_int32_t));
+ */
+int
+__rep_set_priority_int(env, priority)
+ ENV *env;
+ u_int32_t priority;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ u_int32_t prev;
+ int ret;
+
+ db_rep = env->rep_handle;
ret = 0;
if (REP_ON(env)) {
rep = db_rep->region;
@@ -1807,10 +2094,10 @@ __rep_get_priority(dbenv, priority)
}
/*
- * PUBLIC: int __rep_set_timeout __P((DB_ENV *, int, db_timeout_t));
+ * PUBLIC: int __rep_set_timeout_pp __P((DB_ENV *, int, db_timeout_t));
*/
int
-__rep_set_timeout(dbenv, which, timeout)
+__rep_set_timeout_pp(dbenv, which, timeout)
DB_ENV *dbenv;
int which;
db_timeout_t timeout;
@@ -1818,13 +2105,10 @@ __rep_set_timeout(dbenv, which, timeout)
DB_REP *db_rep;
DB_THREAD_INFO *ip;
ENV *env;
- REP *rep;
int repmgr_timeout, ret;
env = dbenv->env;
db_rep = env->rep_handle;
- rep = db_rep->region;
- ret = 0;
repmgr_timeout = 0;
if (timeout == 0 && (which == DB_REP_CONNECTION_RETRY ||
@@ -1850,12 +2134,46 @@ __rep_set_timeout(dbenv, which, timeout)
return (EINVAL);
}
if (which == DB_REP_LEASE_TIMEOUT && IS_REP_STARTED(env)) {
- ret = EINVAL;
__db_errx(env, DB_STR_A("3568",
"%s: lease timeout must be set before DB_ENV->rep_start.",
"%s"), "DB_ENV->rep_set_timeout");
return (EINVAL);
}
+ if (PREFMAS_IS_SET(env) &&
+ (which == DB_REP_HEARTBEAT_MONITOR ||
+ which == DB_REP_HEARTBEAT_SEND) &&
+ timeout == 0) {
+ __db_errx(env, DB_STR_A("3711",
+"%s: cannot turn off heartbeat timeout in preferred master mode.",
+ "%s"), "DB_ENV->rep_set_timeout");
+ return (EINVAL);
+ }
+
+ ret = __rep_set_timeout_int(env, which, timeout);
+
+ /* Setting a repmgr timeout makes this a repmgr application */
+ if (ret == 0 && repmgr_timeout)
+ APP_SET_REPMGR(env);
+ return (ret);
+
+}
+
+/*
+ * PUBLIC: int __rep_set_timeout_int __P((ENV *, int, db_timeout_t));
+ */
+int
+__rep_set_timeout_int(env, which, timeout)
+ ENV *env;
+ int which;
+ db_timeout_t timeout;
+{
+ DB_REP *db_rep;
+ REP *rep;
+ int ret;
+
+ db_rep = env->rep_handle;
+ rep = db_rep->region;
+ ret = 0;
switch (which) {
case DB_REP_CHECKPOINT_DELAY:
@@ -1888,6 +2206,7 @@ __rep_set_timeout(dbenv, which, timeout)
rep->ack_timeout = timeout;
else
db_rep->ack_timeout = timeout;
+ ADJUST_AUTOTAKEOVER_WAITS(db_rep, timeout);
break;
case DB_REP_CONNECTION_RETRY:
if (REP_ON(env))
@@ -1919,10 +2238,6 @@ __rep_set_timeout(dbenv, which, timeout)
"Unknown timeout type argument to DB_ENV->rep_set_timeout"));
ret = EINVAL;
}
-
- /* Setting a repmgr timeout makes this a repmgr application */
- if (ret == 0 && repmgr_timeout)
- APP_SET_REPMGR(env);
return (ret);
}
@@ -2099,6 +2414,144 @@ __rep_set_request(dbenv, min, max)
}
/*
+ * __rep_set_view --
+ * Set the view/partial replication function.
+ *
+ * PUBLIC: int __rep_set_view __P((DB_ENV *,
+ * PUBLIC: int (*)(DB_ENV *, const char *, int *, u_int32_t)));
+ */
+int
+__rep_set_view(dbenv, f_partial)
+ DB_ENV *dbenv;
+ int (*f_partial) __P((DB_ENV *,
+ const char *, int *, u_int32_t));
+{
+ DB_REP *db_rep;
+ ENV *env;
+
+ env = dbenv->env;
+ db_rep = env->rep_handle;
+
+ ENV_NOT_CONFIGURED(
+ env, db_rep->region, "DB_ENV->rep_set_view", DB_INIT_REP);
+
+ ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->rep_set_view");
+
+ if (f_partial == NULL)
+ db_rep->partial = __rep_defview;
+ else
+ db_rep->partial = f_partial;
+ return (0);
+}
+
+/*
+ * __rep_defview --
+ * Default view function. Always replicate.
+ */
+static int
+__rep_defview(dbenv, name, result, flags)
+ DB_ENV *dbenv;
+ const char *name;
+ int *result;
+ u_int32_t flags;
+{
+ COMPQUIET(dbenv, NULL);
+ COMPQUIET(name, NULL);
+ COMPQUIET(flags, 0);
+ *result = 1;
+ return (0);
+}
+
+/*
+ * __rep_call_partial --
+ * Calls the partial function, after doing some checks required for
+ * handling blobs.
+ *
+ * PUBLIC: int __rep_call_partial
+ * PUBLIC: __P((ENV *, const char *, int *, u_int32_t, DELAYED_BLOB_LIST **));
+ */
+int
+__rep_call_partial(env, name, result, flags, lsp)
+ ENV *env;
+ const char *name;
+ int *result;
+ u_int32_t flags;
+ DELAYED_BLOB_LIST **lsp;
+{
+ DB_LOG *dblp;
+ DB_REP *db_rep;
+ DELAYED_BLOB_LIST *dbl;
+ FNAME *fname;
+ db_seq_t blob_file_id;
+ char *file_name;
+ int ret;
+
+ ret = 0;
+ blob_file_id = 0;
+ db_rep = env->rep_handle;
+ dblp = env->lg_handle;
+ fname = NULL;
+
+ /*
+ * If the database being sent is a blob meta database or file, then the
+ * name of its associated database needs to be passed to the partial
+ * function. To do this, use the blob file id in the path to the
+ * file to look up the blob_file_id of the associated database. That
+ * can be used to look up the name of the associated database through
+ * dbreg.
+ */
+ if (db_rep->partial == __rep_defview ||
+ (!IS_BLOB_META(name) && !IS_BLOB_FILE(name))) {
+ ret = db_rep->partial(env->dbenv, name, result, flags);
+ } else {
+ /*
+ * The top level blob meta database must always be replicated.
+ */
+ if (strcmp(name, BLOB_META_FILE_NAME) == 0) {
+ *result = 1;
+ return (ret);
+ }
+ if ((ret = __blob_path_to_dir_ids(
+ env, name, &blob_file_id, NULL)) != 0)
+ return (ret);
+ DB_ASSERT(env, blob_file_id > 0);
+
+ /*
+ * It is possible that the database that owns this blob meta
+ * database has not yet been processed on the client when
+ * processing the transaction, so assume it is not replicated.
+ * Return its information and process it later when its
+ * owning database is processed (which must happen in the
+ * same transaction).
+ */
+ if (__dbreg_blob_file_to_fname(
+ dblp, blob_file_id, 0, &fname) != 0) {
+ if ((ret = __os_malloc(
+ env, sizeof(DELAYED_BLOB_LIST), &dbl)) != 0)
+ return (ret);
+ memset(dbl, 0, sizeof(DELAYED_BLOB_LIST));
+ dbl->blob_file_id = blob_file_id;
+ if (*lsp == NULL)
+ *lsp = dbl;
+ else {
+ dbl->next = *lsp;
+ (*lsp)->prev = dbl;
+ *lsp = dbl;
+ }
+ *result = 0;
+ return (0);
+ }
+
+ file_name = fname->fname_off == INVALID_ROFF ?
+ NULL : R_ADDR(&dblp->reginfo, fname->fname_off);
+ DB_ASSERT(env, file_name != NULL);
+ ret = db_rep->partial(env->dbenv, file_name, result, flags);
+ }
+
+ return (ret);
+}
+
+/*
* __rep_set_transport_pp --
* Set the transport function for replication.
*
@@ -2288,25 +2741,46 @@ __rep_set_clockskew(dbenv, fast_clock, slow_clock)
}
/*
- * __rep_flush --
+ * __rep_flush_pp --
* Re-push the last log record to all clients, in case they've lost
* messages and don't know it.
*
- * PUBLIC: int __rep_flush __P((DB_ENV *));
+ * PUBLIC: int __rep_flush_pp __P((DB_ENV *));
*/
int
-__rep_flush(dbenv)
+__rep_flush_pp (dbenv)
DB_ENV *dbenv;
{
+ ENV *env;
+ DB_THREAD_INFO *ip;
+ int ret;
+
+ env = dbenv->env;
+
+ ENV_ENTER(env, ip);
+ ret = __rep_flush_int(env);
+ ENV_LEAVE(env, ip);
+
+ return (ret);
+}
+
+/*
+ * __rep_flush_int --
+ * Re-push the last log record to all clients, in case they've lost
+ * messages and don't know it.
+ *
+ * PUBLIC: int __rep_flush_int __P((ENV *));
+ */
+int
+__rep_flush_int(env)
+ ENV *env;
+{
DBT rec;
DB_LOGC *logc;
DB_LSN lsn;
DB_REP *db_rep;
- DB_THREAD_INFO *ip;
- ENV *env;
int ret, t_ret;
- env = dbenv->env;
db_rep = env->rep_handle;
ENV_REQUIRES_CONFIG_XX(
@@ -2322,8 +2796,6 @@ __rep_flush(dbenv)
return (EINVAL);
}
- ENV_ENTER(env, ip);
-
if ((ret = __log_cursor(env, &logc)) != 0)
return (ret);
@@ -2338,7 +2810,6 @@ __rep_flush(dbenv)
err: if ((t_ret = __logc_close(logc)) != 0 && ret == 0)
ret = t_ret;
- ENV_LEAVE(env, ip);
return (ret);
}
@@ -2693,7 +3164,7 @@ __rep_check_applied(env, ip, commit_info, reasonp)
*/
if (commit_info->gen == gen) {
ret = __rep_read_lsn_history(env,
- ip, &txn, &dbc, gen, &hist, reasonp, DB_SET);
+ ip, &txn, &dbc, gen, &hist, reasonp, DB_SET, 1);
if (ret == DB_NOTFOUND) {
/*
* We haven't yet received the LSN history of the
@@ -2720,7 +3191,7 @@ __rep_check_applied(env, ip, commit_info, reasonp)
* masters at the same gen, and the txn of interest was
* rolled back.
*/
- ret = DB_NOTFOUND;
+ ret = USR_ERR(env, DB_NOTFOUND);
goto out;
}
@@ -2750,7 +3221,7 @@ __rep_check_applied(env, ip, commit_info, reasonp)
* description of the txn of interest doesn't match what we see
* in the history available to us now.
*/
- ret = DB_NOTFOUND;
+ ret = USR_ERR(env, DB_NOTFOUND);
} else if (commit_info->gen < gen || gen == 0) {
/*
@@ -2759,10 +3230,10 @@ __rep_check_applied(env, ip, commit_info, reasonp)
* the token LSN is within the close/open range defined by
* [base,next).
*/
- ret = __rep_read_lsn_history(env,
- ip, &txn, &dbc, commit_info->gen, &hist, reasonp, DB_SET);
- t_ret = __rep_read_lsn_history(env,
- ip, &txn, &dbc, commit_info->gen, &hist2, reasonp, DB_NEXT);
+ ret = __rep_read_lsn_history(env, ip,
+ &txn, &dbc, commit_info->gen, &hist, reasonp, DB_SET, 1);
+ t_ret = __rep_read_lsn_history(env, ip,
+ &txn, &dbc, commit_info->gen, &hist2, reasonp, DB_NEXT, 1);
if (ret == DB_NOTFOUND) {
/*
* If the desired gen is not in our database, it could
@@ -2812,7 +3283,7 @@ __rep_check_applied(env, ip, commit_info, reasonp)
* don't match, meaning the txn was written at a dup
* master and that gen instance was rolled back.
*/
- ret = DB_NOTFOUND;
+ ret = USR_ERR(env, DB_NOTFOUND);
goto out;
}
@@ -2837,7 +3308,7 @@ __rep_check_applied(env, ip, commit_info, reasonp)
LOG_COMPARE(&commit_info->lsn, &hist2.lsn) < 0)
ret = 0;
else
- ret = DB_NOTFOUND;
+ ret = USR_ERR(env, DB_NOTFOUND);
} else {
/*
* Token names a future gen. If we're a client and the LSN also
@@ -2851,7 +3322,7 @@ __rep_check_applied(env, ip, commit_info, reasonp)
reasonp->u.gen = commit_info->gen;
return (DB_TIMEOUT);
}
- return (DB_NOTFOUND);
+ return (USR_ERR(env, DB_NOTFOUND));
}
out:
@@ -2867,9 +3338,19 @@ out:
/*
* The txn and dbc handles are owned by caller, though we create them if
* necessary. Caller is responsible for closing them.
+ *
+ * The use_cache option is enabled for the read-your-writes feature, which
+ * makes frequent requests for the cached information (envid and lsn) when it
+ * is in use. Callers that require information that is not cached (e.g.
+ * timestamp) should not set use_cache.
+ *
+ * PUBLIC: int __rep_read_lsn_history __P((ENV *, DB_THREAD_INFO *, DB_TXN **,
+ * PUBLIC: DBC **, u_int32_t, __rep_lsn_hist_data_args *,
+ * PUBLIC: struct rep_waitgoal *, u_int32_t, int));
*/
-static int
-__rep_read_lsn_history(env, ip, txn, dbc, gen, gen_infop, reasonp, flags)
+int
+__rep_read_lsn_history(env, ip, txn, dbc, gen, gen_infop, reasonp, flags,
+ use_cache)
ENV *env;
DB_THREAD_INFO *ip;
DB_TXN **txn;
@@ -2878,6 +3359,7 @@ __rep_read_lsn_history(env, ip, txn, dbc, gen, gen_infop, reasonp, flags)
__rep_lsn_hist_data_args *gen_infop;
struct rep_waitgoal *reasonp;
u_int32_t flags;
+ int use_cache;
{
DB_REP *db_rep;
REP *rep;
@@ -2898,7 +3380,8 @@ __rep_read_lsn_history(env, ip, txn, dbc, gen, gen_infop, reasonp, flags)
/* Simply return cached info, if we already have it. */
desired_gen = flags == DB_SET ? gen : gen + 1;
REP_SYSTEM_LOCK(env);
- if (rep->gen == desired_gen && !IS_ZERO_LSN(rep->gen_base_lsn)) {
+ if (use_cache && rep->gen == desired_gen &&
+ !IS_ZERO_LSN(rep->gen_base_lsn)) {
gen_infop->lsn = rep->gen_base_lsn;
gen_infop->envid = rep->master_envid;
goto unlock;
@@ -3005,8 +3488,14 @@ __rep_conv_vers(env, log_ver)
/*
* We can't use a switch statement, some of the DB_LOGVERSION_XX
- * constants are the same
+ * constants are the same.
*/
+ if (log_ver == DB_LOGVERSION_61)
+ return (DB_REPVERSION_61);
+ if (log_ver == DB_LOGVERSION_60p1)
+ return (DB_REPVERSION_60);
+ if (log_ver == DB_LOGVERSION_60)
+ return (DB_REPVERSION_60);
if (log_ver == DB_LOGVERSION_53)
return (DB_REPVERSION_53);
if (log_ver == DB_LOGVERSION_52)