summaryrefslogtreecommitdiff
path: root/bdb/log
diff options
context:
space:
mode:
Diffstat (limited to 'bdb/log')
-rw-r--r--bdb/log/log.c881
-rw-r--r--bdb/log/log.src46
-rw-r--r--bdb/log/log_archive.c263
-rw-r--r--bdb/log/log_compare.c6
-rw-r--r--bdb/log/log_findckp.c135
-rw-r--r--bdb/log/log_get.c1185
-rw-r--r--bdb/log/log_method.c113
-rw-r--r--bdb/log/log_put.c1038
-rw-r--r--bdb/log/log_rec.c647
-rw-r--r--bdb/log/log_register.c433
10 files changed, 2579 insertions, 2168 deletions
diff --git a/bdb/log/log.c b/bdb/log/log.c
index 8ddb7bcaf7d..f57caeccb95 100644
--- a/bdb/log/log.c
+++ b/bdb/log/log.c
@@ -1,40 +1,34 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: log.c,v 11.42 2001/01/15 16:42:37 bostic Exp $";
+static const char revid[] = "$Id: log.c,v 11.111 2002/08/16 00:27:44 ubell Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>
+#include <ctype.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#endif
-#ifdef HAVE_RPC
-#include "db_server.h"
-#endif
-
#include "db_int.h"
-#include "log.h"
-#include "db_dispatch.h"
-#include "txn.h"
-#include "txn_auto.h"
-
-#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
-#endif
+#include "dbinc/crypto.h"
+#include "dbinc/hmac.h"
+#include "dbinc/log.h"
+#include "dbinc/txn.h"
static int __log_init __P((DB_ENV *, DB_LOG *));
static int __log_recover __P((DB_LOG *));
+static size_t __log_region_size __P((DB_ENV *));
+static int __log_zero __P((DB_ENV *, DB_LSN *, DB_LSN *));
/*
* __log_open --
@@ -49,16 +43,10 @@ __log_open(dbenv)
DB_LOG *dblp;
LOG *lp;
int ret;
- u_int8_t *readbufp;
-
- readbufp = NULL;
/* Create/initialize the DB_LOG structure. */
if ((ret = __os_calloc(dbenv, 1, sizeof(DB_LOG), &dblp)) != 0)
return (ret);
- if ((ret = __os_calloc(dbenv, 1, dbenv->lg_bsize, &readbufp)) != 0)
- goto err;
- ZERO_LSN(dblp->c_lsn);
dblp->dbenv = dbenv;
/* Join/create the log region. */
@@ -69,40 +57,66 @@ __log_open(dbenv)
if (F_ISSET(dbenv, DB_ENV_CREATE))
F_SET(&dblp->reginfo, REGION_CREATE_OK);
if ((ret = __db_r_attach(
- dbenv, &dblp->reginfo, LG_BASE_REGION_SIZE + dbenv->lg_bsize)) != 0)
+ dbenv, &dblp->reginfo, __log_region_size(dbenv))) != 0)
goto err;
- dblp->readbufp = readbufp;
-
/* If we created the region, initialize it. */
- if (F_ISSET(&dblp->reginfo, REGION_CREATE) &&
- (ret = __log_init(dbenv, dblp)) != 0)
- goto err;
+ if (F_ISSET(&dblp->reginfo, REGION_CREATE))
+ if ((ret = __log_init(dbenv, dblp)) != 0)
+ goto err;
/* Set the local addresses. */
lp = dblp->reginfo.primary =
R_ADDR(&dblp->reginfo, dblp->reginfo.rp->primary);
- dblp->bufp = R_ADDR(&dblp->reginfo, lp->buffer_off);
/*
* If the region is threaded, then we have to lock both the handles
* and the region, and we need to allocate a mutex for that purpose.
*/
- if (F_ISSET(dbenv, DB_ENV_THREAD)) {
- if ((ret = __db_mutex_alloc(
- dbenv, &dblp->reginfo, &dblp->mutexp)) != 0)
- goto err;
- if ((ret = __db_mutex_init(
- dbenv, dblp->mutexp, 0, MUTEX_THREAD)) != 0)
+ if (F_ISSET(dbenv, DB_ENV_THREAD) &&
+ (ret = __db_mutex_setup(dbenv, &dblp->reginfo, &dblp->mutexp,
+ MUTEX_ALLOC | MUTEX_NO_RLOCK)) != 0)
+ goto err;
+
+ /* Initialize the rest of the structure. */
+ dblp->bufp = R_ADDR(&dblp->reginfo, lp->buffer_off);
+
+ /*
+ * Set the handle -- we may be about to run recovery, which allocates
+ * log cursors. Log cursors require logging be already configured,
+ * and the handle being set is what demonstrates that.
+ *
+ * If we created the region, run recovery. If that fails, make sure
+ * we reset the log handle before cleaning up, otherwise we will try
+ * and clean up again in the mainline DB_ENV initialization code.
+ */
+ dbenv->lg_handle = dblp;
+
+ if (F_ISSET(&dblp->reginfo, REGION_CREATE)) {
+ if ((ret = __log_recover(dblp)) != 0) {
+ dbenv->lg_handle = NULL;
goto err;
+ }
+
+ /*
+ * We first take the log file size from the environment, if
+ * specified. If that wasn't set, recovery may have set it
+ * from the persistent information in a log file header. If
+ * that didn't set it either, we default.
+ */
+ if (lp->log_size == 0)
+ lp->log_size = lp->log_nsize = LG_MAX_DEFAULT;
+ } else {
+ /*
+ * A process joining the region may have reset the log file
+ * size, too. If so, it only affects the next log file we
+ * create.
+ */
+ if (dbenv->lg_size != 0)
+ lp->log_nsize = dbenv->lg_size;
}
R_UNLOCK(dbenv, &dblp->reginfo);
-
- dblp->r_file = 0;
- dblp->r_off = 0;
- dblp->r_size = 0;
- dbenv->lg_handle = dblp;
return (0);
err: if (dblp->reginfo.addr != NULL) {
@@ -112,11 +126,11 @@ err: if (dblp->reginfo.addr != NULL) {
(void)__db_r_detach(dbenv, &dblp->reginfo, 0);
}
- if (readbufp != NULL)
- __os_free(readbufp, dbenv->lg_bsize);
if (dblp->mutexp != NULL)
__db_mutex_free(dbenv, &dblp->reginfo, dblp->mutexp);
- __os_free(dblp, sizeof(*dblp));
+
+ __os_free(dbenv, dblp);
+
return (ret);
}
@@ -129,9 +143,13 @@ __log_init(dbenv, dblp)
DB_ENV *dbenv;
DB_LOG *dblp;
{
+ DB_MUTEX *flush_mutexp;
LOG *region;
int ret;
void *p;
+#ifdef HAVE_MUTEX_SYSTEM_RESOURCES
+ u_int8_t *addr;
+#endif
if ((ret = __db_shalloc(dblp->reginfo.addr,
sizeof(*region), 0, &dblp->reginfo.primary)) != 0)
@@ -141,15 +159,55 @@ __log_init(dbenv, dblp)
region = dblp->reginfo.primary;
memset(region, 0, sizeof(*region));
- region->persist.lg_max = dbenv->lg_max;
- region->persist.magic = DB_LOGMAGIC;
- region->persist.version = DB_LOGVERSION;
- region->persist.mode = dbenv->db_mode;
+ region->fid_max = 0;
SH_TAILQ_INIT(&region->fq);
+ region->free_fid_stack = INVALID_ROFF;
+ region->free_fids = region->free_fids_alloced = 0;
/* Initialize LOG LSNs. */
- region->lsn.file = 1;
- region->lsn.offset = 0;
+ INIT_LSN(region->lsn);
+ INIT_LSN(region->ready_lsn);
+ INIT_LSN(region->t_lsn);
+
+ /*
+ * It's possible to be waiting for an LSN of [1][0], if a replication
+ * client gets the first log record out of order. An LSN of [0][0]
+ * signifies that we're not waiting.
+ */
+ ZERO_LSN(region->waiting_lsn);
+
+ /*
+ * Log makes note of the fact that it ran into a checkpoint on
+ * startup if it did so, as a recovery optimization. A zero
+ * LSN signifies that it hasn't found one [yet].
+ */
+ ZERO_LSN(region->cached_ckp_lsn);
+
+#ifdef HAVE_MUTEX_SYSTEM_RESOURCES
+ /* Allocate room for the log maintenance info and initialize it. */
+ if ((ret = __db_shalloc(dblp->reginfo.addr,
+ sizeof(REGMAINT) + LG_MAINT_SIZE, 0, &addr)) != 0)
+ goto mem_err;
+ __db_maintinit(&dblp->reginfo, addr, LG_MAINT_SIZE);
+ region->maint_off = R_OFFSET(&dblp->reginfo, addr);
+#endif
+
+ if ((ret = __db_mutex_setup(dbenv, &dblp->reginfo, &region->fq_mutex,
+ MUTEX_NO_RLOCK)) != 0)
+ return (ret);
+
+ /*
+ * We must create a place for the flush mutex separately; mutexes have
+ * to be aligned to MUTEX_ALIGN, and the only way to guarantee that is
+ * to make sure they're at the beginning of a shalloc'ed chunk.
+ */
+ if ((ret = __db_shalloc(dblp->reginfo.addr,
+ sizeof(DB_MUTEX), MUTEX_ALIGN, &flush_mutexp)) != 0)
+ goto mem_err;
+ if ((ret = __db_mutex_setup(dbenv, &dblp->reginfo, flush_mutexp,
+ MUTEX_NO_RLOCK)) != 0)
+ return (ret);
+ region->flush_mutex_off = R_OFFSET(&dblp->reginfo, flush_mutexp);
/* Initialize the buffer. */
if ((ret =
@@ -159,9 +217,23 @@ mem_err: __db_err(dbenv, "Unable to allocate memory for the log buffer");
}
region->buffer_size = dbenv->lg_bsize;
region->buffer_off = R_OFFSET(&dblp->reginfo, p);
+ region->log_size = region->log_nsize = dbenv->lg_size;
- /* Try and recover any previous log files before releasing the lock. */
- return (__log_recover(dblp));
+ /* Initialize the commit Queue. */
+ SH_TAILQ_INIT(&region->free_commits);
+ SH_TAILQ_INIT(&region->commits);
+ region->ncommit = 0;
+
+ /*
+ * Fill in the log's persistent header. Don't fill in the log file
+ * sizes, as they may change at any time and so have to be filled in
+ * as each log file is created.
+ */
+ region->persist.magic = DB_LOGMAGIC;
+ region->persist.version = DB_LOGVERSION;
+ region->persist.mode = (u_int32_t)dbenv->db_mode;
+
+ return (0);
}
/*
@@ -173,12 +245,16 @@ __log_recover(dblp)
DB_LOG *dblp;
{
DBT dbt;
+ DB_ENV *dbenv;
+ DB_LOGC *logc;
DB_LSN lsn;
LOG *lp;
- int cnt, found_checkpoint, ret;
- u_int32_t chk;
+ u_int32_t cnt, rectype;
+ int ret;
logfile_validity status;
+ logc = NULL;
+ dbenv = dblp->dbenv;
lp = dblp->reginfo.primary;
/*
@@ -192,8 +268,9 @@ __log_recover(dblp)
/*
* If the last file is an old version, readable or no, start a new
- * file. Don't bother finding checkpoints; if we didn't take a
- * checkpoint right before upgrading, the user screwed up anyway.
+ * file. Don't bother finding the end of the last log file;
+ * we assume that it's valid in its entirety, since the user
+ * should have shut down cleanly or run recovery before upgrading.
*/
if (status == DB_LV_OLD_READABLE || status == DB_LV_OLD_UNREADABLE) {
lp->lsn.file = lp->s_lsn.file = cnt + 1;
@@ -213,25 +290,35 @@ __log_recover(dblp)
lsn.file = cnt;
lsn.offset = 0;
- /* Set the cursor. Shouldn't fail; leave error messages on. */
- memset(&dbt, 0, sizeof(dbt));
- if ((ret = __log_get(dblp, &lsn, &dbt, DB_SET, 0)) != 0)
+ /*
+ * Allocate a cursor and set it to the first record. This shouldn't
+ * fail, leave error messages on.
+ */
+ if ((ret = dbenv->log_cursor(dbenv, &logc, 0)) != 0)
return (ret);
+ F_SET(logc, DB_LOG_LOCKED);
+ memset(&dbt, 0, sizeof(dbt));
+ if ((ret = logc->get(logc, &lsn, &dbt, DB_SET)) != 0)
+ goto err;
/*
- * Read to the end of the file, saving checkpoints. This will fail
- * at some point, so turn off error messages.
+ * Read to the end of the file. This may fail at some point, so
+ * turn off error messages.
*/
- found_checkpoint = 0;
- while (__log_get(dblp, &lsn, &dbt, DB_NEXT, 1) == 0) {
+ F_SET(logc, DB_LOG_SILENT_ERR);
+ while (logc->get(logc, &lsn, &dbt, DB_NEXT) == 0) {
if (dbt.size < sizeof(u_int32_t))
continue;
- memcpy(&chk, dbt.data, sizeof(u_int32_t));
- if (chk == DB_txn_ckp) {
- lp->chkpt_lsn = lsn;
- found_checkpoint = 1;
- }
+ memcpy(&rectype, dbt.data, sizeof(u_int32_t));
+ if (rectype == DB___txn_ckp)
+ /*
+ * If we happen to run into a checkpoint, cache its
+ * LSN so that the transaction system doesn't have
+ * to walk this log file again looking for it.
+ */
+ lp->cached_ckp_lsn = lsn;
}
+ F_CLR(logc, DB_LOG_SILENT_ERR);
/*
* We now know where the end of the log is. Set the first LSN that
@@ -240,59 +327,24 @@ __log_recover(dblp)
*/
lp->lsn = lsn;
lp->s_lsn = lsn;
- lp->lsn.offset += dblp->c_len;
- lp->s_lsn.offset += dblp->c_len;
+ lp->lsn.offset += logc->c_len;
+ lp->s_lsn.offset += logc->c_len;
/* Set up the current buffer information, too. */
- lp->len = dblp->c_len;
+ lp->len = logc->c_len;
lp->b_off = 0;
lp->w_off = lp->lsn.offset;
- /*
- * It's possible that we didn't find a checkpoint because there wasn't
- * one in the last log file. Start searching.
- */
- if (!found_checkpoint && cnt > 1) {
- lsn.file = cnt;
- lsn.offset = 0;
-
- /* Set the cursor. Shouldn't fail, leave error messages on. */
- if ((ret = __log_get(dblp, &lsn, &dbt, DB_SET, 0)) != 0)
- return (ret);
-
- /*
- * Read to the end of the file, saving checkpoints. Again,
- * this can fail if there are no checkpoints in any log file,
- * so turn error messages off.
- */
- while (__log_get(dblp, &lsn, &dbt, DB_PREV, 1) == 0) {
- if (dbt.size < sizeof(u_int32_t))
- continue;
- memcpy(&chk, dbt.data, sizeof(u_int32_t));
- if (chk == DB_txn_ckp) {
- lp->chkpt_lsn = lsn;
- found_checkpoint = 1;
- break;
- }
- }
- }
-
- /* If we never find a checkpoint, that's okay, just 0 it out. */
- if (!found_checkpoint)
-skipsearch: ZERO_LSN(lp->chkpt_lsn);
-
- /*
- * Reset the cursor lsn to the beginning of the log, so that an
- * initial call to DB_NEXT does the right thing.
- */
- ZERO_LSN(dblp->c_lsn);
-
- if (FLD_ISSET(dblp->dbenv->verbose, DB_VERB_RECOVERY))
- __db_err(dblp->dbenv,
+skipsearch:
+ if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY))
+ __db_err(dbenv,
"Finding last valid log LSN: file: %lu offset %lu",
(u_long)lp->lsn.file, (u_long)lp->lsn.offset);
- return (0);
+err: if (logc != NULL)
+ (void)logc->close(logc, 0);
+
+ return (ret);
}
/*
@@ -301,20 +353,23 @@ skipsearch: ZERO_LSN(lp->chkpt_lsn);
* the number of the first readable log file, else it will contain the number
* of the last log file (which may be too old to read).
*
- * PUBLIC: int __log_find __P((DB_LOG *, int, int *, logfile_validity *));
+ * PUBLIC: int __log_find __P((DB_LOG *, int, u_int32_t *, logfile_validity *));
*/
int
__log_find(dblp, find_first, valp, statusp)
DB_LOG *dblp;
- int find_first, *valp;
+ int find_first;
+ u_int32_t *valp;
logfile_validity *statusp;
{
+ DB_ENV *dbenv;
logfile_validity logval_status, status;
u_int32_t clv, logval;
int cnt, fcnt, ret;
const char *dir;
- char **names, *p, *q, savech;
+ char *c, **names, *p, *q, savech;
+ dbenv = dblp->dbenv;
logval_status = status = DB_LV_NONEXISTENT;
/* Return a value of 0 as the log file number on failure. */
@@ -333,7 +388,7 @@ __log_find(dblp, find_first, valp, statusp)
}
/* Get the list of file names. */
- ret = __os_dirlist(dblp->dbenv, dir, &names, &fcnt);
+ ret = __os_dirlist(dbenv, dir, &names, &fcnt);
/*
* !!!
@@ -345,8 +400,8 @@ __log_find(dblp, find_first, valp, statusp)
*q = savech;
if (ret != 0) {
- __db_err(dblp->dbenv, "%s: %s", dir, db_strerror(ret));
- __os_freestr(p);
+ __db_err(dbenv, "%s: %s", dir, db_strerror(ret));
+ __os_free(dbenv, p);
return (ret);
}
@@ -356,74 +411,92 @@ __log_find(dblp, find_first, valp, statusp)
continue;
/*
+ * Names of the form log\.[0-9]* are reserved for DB. Other
+ * names sharing LFPREFIX, such as "log.db", are legal.
+ */
+ for (c = names[cnt] + sizeof(LFPREFIX) - 1; *c != '\0'; c++)
+ if (!isdigit((int)*c))
+ break;
+ if (*c != '\0')
+ continue;
+
+ /*
* Use atol, not atoi; if an "int" is 16-bits, the largest
* log file name won't fit.
*/
clv = atol(names[cnt] + (sizeof(LFPREFIX) - 1));
+
+ /*
+ * If searching for the first log file, we want to return the
+ * oldest log file we can read, or, if no readable log files
+ * exist, the newest log file we can't read (the crossover
+ * point between the old and new versions of the log file).
+ *
+ * If we're searching for the last log file, we want to return
+ * the newest log file, period.
+ *
+ * Readable log files should never precede unreadable log
+ * files, that would mean the admin seriously screwed up.
+ */
if (find_first) {
- if (logval != 0 && clv > logval)
+ if (logval != 0 &&
+ status != DB_LV_OLD_UNREADABLE && clv > logval)
continue;
} else
if (logval != 0 && clv < logval)
continue;
- /*
- * Take note of whether the log file logval is
- * an old version or incompletely initialized.
- */
- if ((ret = __log_valid(dblp, clv, 1, &status)) != 0)
+ if ((ret = __log_valid(dblp, clv, 1, &status)) != 0) {
+ __db_err(dbenv, "Invalid log file: %s: %s",
+ names[cnt], db_strerror(ret));
goto err;
+ }
switch (status) {
+ case DB_LV_NONEXISTENT:
+ /* __log_valid never returns DB_LV_NONEXISTENT. */
+ DB_ASSERT(0);
+ break;
case DB_LV_INCOMPLETE:
/*
- * It's acceptable for the last log file to
- * have been incompletely initialized--it's possible
- * to create a log file but not write anything to it,
- * and recovery needs to gracefully handle this.
- *
- * Just ignore it; we don't want to return this
- * as a valid log file.
+ * The last log file may not have been initialized --
+ * it's possible to create a log file but not write
+ * anything to it. If performing recovery (that is,
+ * if find_first isn't set), ignore the file, it's
+ * not interesting. If we're searching for the first
+ * log record, return the file (assuming we don't find
+ * something better), as the "real" first log record
+ * is likely to be in the log buffer, and we want to
+ * set the file LSN for our return.
*/
+ if (find_first)
+ goto found;
break;
- case DB_LV_NONEXISTENT:
- /* Should never happen. */
- DB_ASSERT(0);
+ case DB_LV_OLD_UNREADABLE:
+ /*
+ * If we're searching for the first log file, then we
+ * only want this file if we don't yet have a file or
+ * already have an unreadable file and this one is
+ * newer than that one. If we're searching for the
+ * last log file, we always want this file because we
+ * wouldn't be here if it wasn't newer than our current
+ * choice.
+ */
+ if (!find_first || logval == 0 ||
+ (status == DB_LV_OLD_UNREADABLE && clv > logval))
+ goto found;
break;
case DB_LV_NORMAL:
case DB_LV_OLD_READABLE:
- logval = clv;
+found: logval = clv;
logval_status = status;
break;
- case DB_LV_OLD_UNREADABLE:
- /*
- * Continue; we want the oldest valid log,
- * and clv is too old to be useful. We don't
- * want it to supplant logval if we're looking for
- * the oldest valid log, but we do want to return
- * it if it's the last log file--we want the very
- * last file number, so that our caller can
- * start a new file after it.
- *
- * The code here assumes that there will never
- * be a too-old log that's preceded by a log
- * of the current version, but in order to
- * attain that state of affairs the user
- * would have had to really seriously screw
- * up; I think we can safely assume this won't
- * happen.
- */
- if (!find_first) {
- logval = clv;
- logval_status = status;
- }
- break;
}
}
*valp = logval;
-err: __os_dirfree(names, fcnt);
- __os_freestr(p);
+err: __os_dirfree(dbenv, names, fcnt);
+ __os_free(dbenv, p);
*statusp = logval_status;
return (ret);
@@ -446,30 +519,48 @@ __log_valid(dblp, number, set_persist, statusp)
int set_persist;
logfile_validity *statusp;
{
+ DB_CIPHER *db_cipher;
+ DB_ENV *dbenv;
DB_FH fh;
+ HDR *hdr;
LOG *region;
- LOGP persist;
- char *fname;
- int ret;
+ LOGP *persist;
logfile_validity status;
- size_t nw;
+ size_t hdrsize, nw, recsize;
+ int is_hmac, need_free, ret;
+ u_int8_t *tmp;
+ char *fname;
+ dbenv = dblp->dbenv;
+ db_cipher = dbenv->crypto_handle;
+ persist = NULL;
status = DB_LV_NORMAL;
/* Try to open the log file. */
if ((ret = __log_name(dblp,
number, &fname, &fh, DB_OSO_RDONLY | DB_OSO_SEQ)) != 0) {
- __os_freestr(fname);
+ __os_free(dbenv, fname);
return (ret);
}
+ need_free = 0;
+ hdrsize = HDR_NORMAL_SZ;
+ is_hmac = 0;
+ recsize = sizeof(LOGP);
+ if (CRYPTO_ON(dbenv)) {
+ hdrsize = HDR_CRYPTO_SZ;
+ recsize = sizeof(LOGP);
+ recsize += db_cipher->adj_size(recsize);
+ is_hmac = 1;
+ }
+ if ((ret = __os_calloc(dbenv, 1, recsize + hdrsize, &tmp)) != 0)
+ return (ret);
+ need_free = 1;
+ hdr = (HDR *)tmp;
+ persist = (LOGP *)(tmp + hdrsize);
/* Try to read the header. */
- if ((ret =
- __os_seek(dblp->dbenv,
- &fh, 0, 0, sizeof(HDR), 0, DB_OS_SEEK_SET)) != 0 ||
- (ret =
- __os_read(dblp->dbenv, &fh, &persist, sizeof(LOGP), &nw)) != 0 ||
- nw != sizeof(LOGP)) {
+ if ((ret = __os_read(dbenv, &fh, tmp, recsize + hdrsize, &nw)) != 0 ||
+ nw != recsize + hdrsize) {
if (ret == 0)
status = DB_LV_INCOMPLETE;
else
@@ -477,19 +568,63 @@ __log_valid(dblp, number, set_persist, statusp)
* The error was a fatal read error, not just an
* incompletely initialized log file.
*/
- __db_err(dblp->dbenv, "Ignoring log file: %s: %s",
+ __db_err(dbenv, "Ignoring log file: %s: %s",
fname, db_strerror(ret));
- (void)__os_closehandle(&fh);
+ (void)__os_closehandle(dbenv, &fh);
goto err;
}
- (void)__os_closehandle(&fh);
+ (void)__os_closehandle(dbenv, &fh);
+
+ /*
+ * Now we have to validate the persistent record. We have
+ * several scenarios we have to deal with:
+ *
+ * 1. User has crypto turned on:
+ * - They're reading an old, unencrypted log file
+ * . We will fail the record size match check below.
+ * - They're reading a current, unencrypted log file
+ * . We will fail the record size match check below.
+ * - They're reading an old, encrypted log file [NOT YET]
+ * . After decryption we'll fail the version check. [NOT YET]
+ * - They're reading a current, encrypted log file
+ * . We should proceed as usual.
+ * 2. User has crypto turned off:
+ * - They're reading an old, unencrypted log file
+ * . We will fail the version check.
+ * - They're reading a current, unencrypted log file
+ * . We should proceed as usual.
+ * - They're reading an old, encrypted log file [NOT YET]
+ * . We'll fail the magic number check (it is encrypted).
+ * - They're reading a current, encrypted log file
+ * . We'll fail the magic number check (it is encrypted).
+ */
+ if (CRYPTO_ON(dbenv)) {
+ /*
+ * If we are trying to decrypt an unencrypted log
+ * we can only detect that by having an unreasonable
+ * data length for our persistent data.
+ */
+ if ((hdr->len - hdrsize) != sizeof(LOGP)) {
+ __db_err(dbenv, "log record size mismatch");
+ goto err;
+ }
+ /* Check the checksum and decrypt. */
+ if ((ret = __db_check_chksum(dbenv, db_cipher, &hdr->chksum[0],
+ (u_int8_t *)persist, hdr->len - hdrsize, is_hmac)) != 0) {
+ __db_err(dbenv, "log record checksum mismatch");
+ goto err;
+ }
+ if ((ret = db_cipher->decrypt(dbenv, db_cipher->data,
+ &hdr->iv[0], (u_int8_t *)persist, hdr->len - hdrsize)) != 0)
+ goto err;
+ }
/* Validate the header. */
- if (persist.magic != DB_LOGMAGIC) {
- __db_err(dblp->dbenv,
+ if (persist->magic != DB_LOGMAGIC) {
+ __db_err(dbenv,
"Ignoring log file: %s: magic number %lx, not %lx",
- fname, (u_long)persist.magic, (u_long)DB_LOGMAGIC);
+ fname, (u_long)persist->magic, (u_long)DB_LOGMAGIC);
ret = EINVAL;
goto err;
}
@@ -499,135 +634,162 @@ __log_valid(dblp, number, set_persist, statusp)
* belongs to an unreadable or readable old version; leave it
* alone if and only if the log file version is the current one.
*/
- if (persist.version > DB_LOGVERSION) {
+ if (persist->version > DB_LOGVERSION) {
/* This is a fatal error--the log file is newer than DB. */
- __db_err(dblp->dbenv,
+ __db_err(dbenv,
"Ignoring log file: %s: unsupported log version %lu",
- fname, (u_long)persist.version);
+ fname, (u_long)persist->version);
ret = EINVAL;
goto err;
- } else if (persist.version < DB_LOGOLDVER) {
+ } else if (persist->version < DB_LOGOLDVER) {
status = DB_LV_OLD_UNREADABLE;
/*
* We don't want to set persistent info based on an
* unreadable region, so jump to "err".
*/
goto err;
- } else if (persist.version < DB_LOGVERSION)
+ } else if (persist->version < DB_LOGVERSION)
status = DB_LV_OLD_READABLE;
/*
- * If the log is thus far readable and we're doing system
- * initialization, set the region's persistent information
- * based on the headers.
+ * Only if we have a current log do we verify the checksum.
+ * We could not check the checksum before checking the magic
+ * and version because old log hdrs have the length and checksum
+ * in a different location.
+ */
+ if (!CRYPTO_ON(dbenv) && ((ret = __db_check_chksum(dbenv,
+ db_cipher, &hdr->chksum[0], (u_int8_t *)persist,
+ hdr->len - hdrsize, is_hmac)) != 0)) {
+ __db_err(dbenv, "log record checksum mismatch");
+ goto err;
+ }
+
+ /*
+ * If the log is readable so far and we're doing system initialization,
+ * set the region's persistent information based on the headers.
+ *
+ * Always set the current log file size. Only set the next log file's
+ * size if the application hasn't set it already.
+ *
+ * XXX
+ * Always use the persistent header's mode, regardless of what was set
+ * in the current environment. We've always done it this way, but it's
+ * probably a bug -- I can't think of a way not-changing the mode would
+ * be a problem, though.
*/
if (set_persist) {
region = dblp->reginfo.primary;
- region->persist.lg_max = persist.lg_max;
- region->persist.mode = persist.mode;
+ region->log_size = persist->log_size;
+ if (region->log_nsize == 0)
+ region->log_nsize = persist->log_size;
+ region->persist.mode = persist->mode;
}
-err: __os_freestr(fname);
+err: __os_free(dbenv, fname);
+ if (need_free)
+ __os_free(dbenv, tmp);
*statusp = status;
return (ret);
}
/*
- * __log_close --
- * Internal version of log_close: only called from dbenv_refresh.
+ * __log_dbenv_refresh --
+ * Clean up after the log system on a close or failed open. Called only
+ * from __dbenv_refresh. (Formerly called __log_close.)
*
- * PUBLIC: int __log_close __P((DB_ENV *));
+ * PUBLIC: int __log_dbenv_refresh __P((DB_ENV *));
*/
int
-__log_close(dbenv)
+__log_dbenv_refresh(dbenv)
DB_ENV *dbenv;
{
DB_LOG *dblp;
int ret, t_ret;
- ret = 0;
dblp = dbenv->lg_handle;
/* We may have opened files as part of XA; if so, close them. */
F_SET(dblp, DBLOG_RECOVER);
- __log_close_files(dbenv);
+ /* The result of this call seeds ret; later steps only set ret if it is still 0. */
+ ret = __dbreg_close_files(dbenv);
/* Discard the per-thread lock. */
if (dblp->mutexp != NULL)
__db_mutex_free(dbenv, &dblp->reginfo, dblp->mutexp);
/* Detach from the region. */
- ret = __db_r_detach(dbenv, &dblp->reginfo, 0);
+ /* Keep the first error seen: a detach failure is reported only if ret is 0. */
+ if ((t_ret =
+ __db_r_detach(dbenv, &dblp->reginfo, 0)) != 0 && ret == 0)
+ ret = t_ret;
/* Close open files, release allocated memory. */
if (F_ISSET(&dblp->lfh, DB_FH_VALID) &&
- (t_ret = __os_closehandle(&dblp->lfh)) != 0 && ret == 0)
- ret = t_ret;
- if (dblp->c_dbt.data != NULL)
- __os_free(dblp->c_dbt.data, dblp->c_dbt.ulen);
- if (F_ISSET(&dblp->c_fh, DB_FH_VALID) &&
- (t_ret = __os_closehandle(&dblp->c_fh)) != 0 && ret == 0)
+ (t_ret = __os_closehandle(dbenv, &dblp->lfh)) != 0 && ret == 0)
ret = t_ret;
if (dblp->dbentry != NULL)
- __os_free(dblp->dbentry,
- (dblp->dbentry_cnt * sizeof(DB_ENTRY)));
- if (dblp->readbufp != NULL)
- __os_free(dblp->readbufp, dbenv->lg_bsize);
+ __os_free(dbenv, dblp->dbentry);
- __os_free(dblp, sizeof(*dblp));
+ __os_free(dbenv, dblp);
+ /* The handle was freed just above; clear the environment's pointer to it. */
dbenv->lg_handle = NULL;
return (ret);
}
/*
- * log_stat --
- * Return LOG statistics.
+ * __log_stat --
+ * Return log statistics.
+ *
+ * PUBLIC: int __log_stat __P((DB_ENV *, DB_LOG_STAT **, u_int32_t));
*/
int
-log_stat(dbenv, statp, db_malloc)
+__log_stat(dbenv, statp, flags)
DB_ENV *dbenv;
DB_LOG_STAT **statp;
- void *(*db_malloc) __P((size_t));
+ u_int32_t flags;
{
DB_LOG *dblp;
DB_LOG_STAT *stats;
LOG *region;
int ret;
-#ifdef HAVE_RPC
- if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
- return (__dbcl_log_stat(dbenv, statp, db_malloc));
-#endif
-
PANIC_CHECK(dbenv);
- ENV_REQUIRES_CONFIG(dbenv, dbenv->lg_handle, DB_INIT_LOG);
+ ENV_REQUIRES_CONFIG(dbenv,
+ dbenv->lg_handle, "DB_ENV->log_stat", DB_INIT_LOG);
*statp = NULL;
+ if ((ret = __db_fchk(dbenv,
+ "DB_ENV->log_stat", flags, DB_STAT_CLEAR)) != 0)
+ return (ret);
dblp = dbenv->lg_handle;
region = dblp->reginfo.primary;
- if ((ret = __os_malloc(dbenv,
- sizeof(DB_LOG_STAT), db_malloc, &stats)) != 0)
+ if ((ret = __os_umalloc(dbenv, sizeof(DB_LOG_STAT), &stats)) != 0)
return (ret);
/* Copy out the global statistics. */
R_LOCK(dbenv, &dblp->reginfo);
*stats = region->stat;
+ if (LF_ISSET(DB_STAT_CLEAR))
+ memset(&region->stat, 0, sizeof(region->stat));
stats->st_magic = region->persist.magic;
stats->st_version = region->persist.version;
stats->st_mode = region->persist.mode;
stats->st_lg_bsize = region->buffer_size;
- stats->st_lg_max = region->persist.lg_max;
+ stats->st_lg_size = region->log_nsize;
stats->st_region_wait = dblp->reginfo.rp->mutex.mutex_set_wait;
stats->st_region_nowait = dblp->reginfo.rp->mutex.mutex_set_nowait;
+ if (LF_ISSET(DB_STAT_CLEAR)) {
+ dblp->reginfo.rp->mutex.mutex_set_wait = 0;
+ dblp->reginfo.rp->mutex.mutex_set_nowait = 0;
+ }
stats->st_regsize = dblp->reginfo.rp->size;
stats->st_cur_file = region->lsn.file;
stats->st_cur_offset = region->lsn.offset;
+ stats->st_disk_file = region->s_lsn.file;
+ stats->st_disk_offset = region->s_lsn.offset;
R_UNLOCK(dbenv, &dblp->reginfo);
@@ -636,22 +798,287 @@ log_stat(dbenv, statp, db_malloc)
}
/*
- * __log_lastckp --
- * Return the current chkpt_lsn, so that we can store it in
- * the transaction region and keep the chain of checkpoints
- * unbroken across environment recreates.
+ * __log_get_cached_ckp_lsn --
+ * Retrieve any last checkpoint LSN that we may have found on startup.
+ *
+ * PUBLIC: void __log_get_cached_ckp_lsn __P((DB_ENV *, DB_LSN *));
+ */
+void
+__log_get_cached_ckp_lsn(dbenv, ckp_lsnp)
+	DB_ENV *dbenv;
+	DB_LSN *ckp_lsnp;
+{
+	LOG *region;
+	REGINFO *infop;
+
+	/* Resolve the log region descriptor and its primary LOG structure. */
+	infop = &((DB_LOG *)dbenv->lg_handle)->reginfo;
+	region = (LOG *)infop->primary;
+
+	/* Copy the cached checkpoint LSN out while holding the region lock. */
+	R_LOCK(dbenv, infop);
+	*ckp_lsnp = region->cached_ckp_lsn;
+	R_UNLOCK(dbenv, infop);
+}
+
+/*
+ * __log_region_size --
+ *	Return the number of bytes to request for the log region.
+ *
+ *	The total is the configured lg_regionmax plus the log buffer size
+ *	(lg_bsize).  When the build defines HAVE_MUTEX_SYSTEM_RESOURCES and
+ *	the environment is threaded (DB_ENV_THREAD), room for a REGMAINT
+ *	header and LG_MAINT_SIZE bytes of mutex maintenance data is added.
+ */
+static size_t
+__log_region_size(dbenv)
+	DB_ENV *dbenv;
+{
+	size_t need;
+
+	/* Base requirement: general region space plus the log buffer. */
+	need = dbenv->lg_regionmax + dbenv->lg_bsize;
+#ifdef HAVE_MUTEX_SYSTEM_RESOURCES
+	/* Threaded environments also carry the mutex maintenance area. */
+	if (F_ISSET(dbenv, DB_ENV_THREAD))
+		need += sizeof(REGMAINT) + LG_MAINT_SIZE;
+#endif
+	return (need);
+}
+
+/*
+ * __log_region_destroy
+ * Destroy any region maintenance info.
+ *
+ * The REGMAINT area is located in two steps: the region's primary offset
+ * (infop->rp->primary) yields the LOG structure, and that structure's
+ * maint_off yields the maintenance area handed to __db_shlocks_destroy.
+ *
+ * NOTE(review): the COMPQUIET calls presumably silence unused-argument
+ * warnings in builds where __db_shlocks_destroy compiles away -- confirm
+ * against that macro's definition.
+ *
+ * PUBLIC: void __log_region_destroy __P((DB_ENV *, REGINFO *));
+ */
+void
+__log_region_destroy(dbenv, infop)
+ DB_ENV *dbenv;
+ REGINFO *infop;
+{
+ __db_shlocks_destroy(infop, (REGMAINT *)R_ADDR(infop,
+ ((LOG *)R_ADDR(infop, infop->rp->primary))->maint_off));
+
+ COMPQUIET(dbenv, NULL);
+ COMPQUIET(infop, NULL);
+}
+
+/*
+ * __log_vtruncate
+ * This is a virtual truncate. We set up the log indicators to
+ * make everyone believe that the given record is the last one in the
+ * log. Returns with the next valid LSN (i.e., the LSN of the next
+ * record to be written). This is used in replication to discard records
+ * in the log file that do not agree with the master.
+ *
+ * dbenv: environment whose log region is adjusted.
+ * lsn: LSN of the record that becomes the last record in the log.
+ * ckplsn: checkpoint LSN; used here only to compute the bytes written
+ * since that checkpoint for st_wc_mbytes/st_wc_bytes.
+ *
+ * Returns 0 on success, otherwise the error from the log cursor,
+ * __os_unlink or __log_zero.
+ *
+ * PUBLIC: int __log_vtruncate __P((DB_ENV *, DB_LSN *, DB_LSN *));
+ */
+int
+__log_vtruncate(dbenv, lsn, ckplsn)
+ DB_ENV *dbenv;
+ DB_LSN *lsn, *ckplsn;
+{
+ DBT log_dbt;
+ DB_FH fh;
+ DB_LOG *dblp;
+ DB_LOGC *logc;
+ DB_LSN end_lsn;
+ LOG *lp;
+ u_int32_t bytes, c_len;
+ int fn, ret, t_ret;
+ char *fname;
+
+ /* Need to find out the length of this soon-to-be-last record. */
+ if ((ret = dbenv->log_cursor(dbenv, &logc, 0)) != 0)
+ return (ret);
+ memset(&log_dbt, 0, sizeof(log_dbt));
+ ret = logc->get(logc, lsn, &log_dbt, DB_SET);
+ /* Read the length before the cursor is closed; it is used only if ret ends up 0. */
+ c_len = logc->c_len;
+ if ((t_ret = logc->close(logc, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ return (ret);
+
+ /* Now do the truncate. */
+ dblp = (DB_LOG *)dbenv->lg_handle;
+ lp = (LOG *)dblp->reginfo.primary;
+
+ R_LOCK(dbenv, &dblp->reginfo);
+ /* Remember the old end of log; it is passed to __log_zero below. */
+ end_lsn = lp->lsn;
+ lp->lsn = *lsn;
+ lp->len = c_len;
+ lp->lsn.offset += lp->len;
+
+ /*
+ * I am going to assume that the number of bytes written since
+ * the last checkpoint doesn't exceed a 32-bit number.
+ */
+ DB_ASSERT(lp->lsn.file >= ckplsn->file);
+ bytes = 0;
+ if (ckplsn->file != lp->lsn.file) {
+ /* Remainder of the checkpoint's file, whole files in between, then the new tail. */
+ bytes = lp->log_size - ckplsn->offset;
+ if (lp->lsn.file > ckplsn->file + 1)
+ bytes += lp->log_size *
+ (lp->lsn.file - ckplsn->file - 1);
+ bytes += lp->lsn.offset;
+ } else
+ bytes = lp->lsn.offset - ckplsn->offset;
+
+ lp->stat.st_wc_mbytes += bytes / MEGABYTE;
+ lp->stat.st_wc_bytes += bytes % MEGABYTE;
+
+ /*
+ * If the saved lsn is greater than our new end of log, reset it
+ * to our current end of log.
+ */
+ if (log_compare(&lp->s_lsn, lsn) > 0)
+ lp->s_lsn = lp->lsn;
+
+ /*
+ * If the new end of log is in the middle of the buffer,
+ * don't change the w_off or f_lsn. If the new end is
+ * before the w_off then reset w_off and f_lsn to the new
+ * end of log.
+ */
+ if (lp->w_off >= lp->lsn.offset) {
+ lp->f_lsn = lp->lsn;
+ lp->w_off = lp->lsn.offset;
+ lp->b_off = 0;
+ } else
+ lp->b_off = lp->lsn.offset - lp->w_off;
+
+ ZERO_LSN(lp->waiting_lsn);
+ lp->ready_lsn = lp->lsn;
+ lp->wait_recs = 0;
+ lp->rcvd_recs = 0;
+
+ /* Now throw away any extra log files that we have around. */
+ for (fn = lp->lsn.file + 1;; fn++) {
+ /* The loop ends at the first file number __log_name cannot open. */
+ if (__log_name(dblp, fn, &fname, &fh, DB_OSO_RDONLY) != 0) {
+ __os_free(dbenv, fname);
+ break;
+ }
+ (void)__os_closehandle(dbenv, &fh);
+ ret = __os_unlink(dbenv, fname);
+ __os_free(dbenv, fname);
+ if (ret != 0)
+ goto err;
+ }
+
+ /* Truncate the log to the new point. */
+ if ((ret = __log_zero(dbenv, &lp->lsn, &end_lsn)) != 0)
+ goto err;
+
+ /* Reached on success as well as on error: the region lock is always released. */
+err: R_UNLOCK(dbenv, &dblp->reginfo);
+ return (ret);
+}
+
+/*
+ * __log_is_outdated --
+ * Used by the replication system to identify if a client's logs
+ * are too old. The log represented by dbenv is compared to the file
+ * number passed in fnum. If the log file fnum does not exist and is
+ * lower-numbered than the current logs, then we return *outdatedp non
+ * zero, else we return it 0.
*
- * PUBLIC: int __log_lastckp __P((DB_ENV *, DB_LSN *));
+ * PUBLIC: int __log_is_outdated __P((DB_ENV *dbenv,
+ * PUBLIC: u_int32_t fnum, int *outdatedp));
*/
int
-__log_lastckp(dbenv, lsnp)
+__log_is_outdated(dbenv, fnum, outdatedp)
DB_ENV *dbenv;
- DB_LSN *lsnp;
+ u_int32_t fnum;
+ int *outdatedp;
{
+ DB_LOG *dblp;
LOG *lp;
+ char *name;
+ int ret;
+ u_int32_t cfile;
- lp = (LOG *)(((DB_LOG *)dbenv->lg_handle)->reginfo.primary);
+ dblp = dbenv->lg_handle;
+ /* Default answer: not outdated. */
+ *outdatedp = 0;
+
+ /*
+ * Build the file name only (no file handle is requested).
+ * NOTE(review): other callers in this file free the name buffer when
+ * __log_name fails; confirm whether name can be allocated on this path.
+ */
+ if ((ret = __log_name(dblp, fnum, &name, NULL, 0)) != 0)
+ return (ret);
+
+ /* If the file exists, we're just fine. */
+ if (__os_exists(name, NULL) == 0)
+ goto out;
+
+ /*
+ * It didn't exist, decide if the file number is too big or
+ * too little. If it's too little, then we need to indicate
+ * that the LSN is outdated.
+ */
+ /* Snapshot the current log file number under the region lock. */
+ R_LOCK(dbenv, &dblp->reginfo);
+ lp = (LOG *)dblp->reginfo.primary;
+ cfile = lp->lsn.file;
+ R_UNLOCK(dbenv, &dblp->reginfo);
+
+ if (cfile > fnum)
+ *outdatedp = 1;
+ /* ret is 0 here: the only non-zero value returned is from __log_name above. */
+out: __os_free(dbenv, name);
+ return (ret);
+}
+
+/*
+ * __log_zero --
+ *	Zero out the tail of a log after a truncate.
+ *
+ *	from_lsn is the new end of the log and to_lsn the previous end.  The
+ *	file written is the current log file (lp->lsn.file).  If both LSNs
+ *	are in the same file, the bytes between the two offsets are
+ *	overwritten with zeroes; otherwise everything from from_lsn->offset
+ *	to the current end of the file is.  Neither LSN is modified.
+ *
+ *	Returns 0 on success (including when there is nothing to clear), or
+ *	the error from __log_name, __os_ioinfo, __os_seek or __os_write.
+ */
+static int
+__log_zero(dbenv, from_lsn, to_lsn)
+	DB_ENV *dbenv;
+	DB_LSN *from_lsn, *to_lsn;
+{
+	char *lname;
+	DB_LOG *dblp;
+	LOG *lp;
+	int ret;
+	size_t fsize, nbytes, len, nw;
+	u_int8_t buf[4096];
+	u_int32_t mbytes, bytes;
+
+	dblp = dbenv->lg_handle;
+	lp = (LOG *)dblp->reginfo.primary;
+	lname = NULL;
+	ret = 0;
+
+	/* Drop a cached handle that refers to some other log file. */
+	if (dblp->lfname != lp->lsn.file) {
+		if (F_ISSET(&dblp->lfh, DB_FH_VALID))
+			(void)__os_closehandle(dbenv, &dblp->lfh);
+		dblp->lfname = lp->lsn.file;
+	}
+
+	if (from_lsn->file != to_lsn->file) {
+		/* We removed some log files; have to 0 to end of file. */
+		if (!F_ISSET(&dblp->lfh, DB_FH_VALID) && (ret =
+		    __log_name(dblp, dblp->lfname, &lname, &dblp->lfh, 0)) != 0)
+			goto err;
+		if ((ret = __os_ioinfo(dbenv,
+		    NULL, &dblp->lfh, &mbytes, &bytes, NULL)) != 0)
+			goto err;
+		fsize = (size_t)mbytes * MEGABYTE + bytes;
+		/*
+		 * A file that ends at or before the truncation point has no
+		 * tail to clear; subtracting would wrap around.
+		 */
+		if (fsize <= from_lsn->offset)
+			goto err;
+		len = fsize - from_lsn->offset;
+	} else if (to_lsn->offset <= from_lsn->offset)
+		goto err;
+	else
+		/* Distance between the two ends; must not assign to to_lsn. */
+		len = to_lsn->offset - from_lsn->offset;
+
+	memset(buf, 0, sizeof(buf));
+
+	/* Initialize the write position. */
+	if (!F_ISSET(&dblp->lfh, DB_FH_VALID) &&
+	    (ret = __log_name(dblp, dblp->lfname, &lname, &dblp->lfh, 0)) != 0)
+		goto err;
+
+	if ((ret = __os_seek(dbenv,
+	    &dblp->lfh, 0, 0, from_lsn->offset, 0, DB_OS_SEEK_SET)) != 0)
+		goto err;
+
+	/* Write zeroes one buffer at a time until the tail is covered. */
+	while (len > 0) {
+		nbytes = len > sizeof(buf) ? sizeof(buf) : len;
+		if ((ret =
+		    __os_write(dbenv, &dblp->lfh, buf, nbytes, &nw)) != 0)
+			goto err;
+		len -= nbytes;
+	}
+
+	/* Common exit for success and failure: release the name, report ret. */
+err:	if (lname != NULL)
+		__os_free(dbenv, lname);
- *lsnp = lp->chkpt_lsn;
-	return (0);
+	return (ret);
}
diff --git a/bdb/log/log.src b/bdb/log/log.src
deleted file mode 100644
index a92fae8de26..00000000000
--- a/bdb/log/log.src
+++ /dev/null
@@ -1,46 +0,0 @@
-/*-
- * See the file LICENSE for redistribution information.
- *
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
- * Sleepycat Software. All rights reserved.
- *
- * $Id: log.src,v 10.12 2000/02/17 20:24:10 bostic Exp $
- */
-
-PREFIX log
-
-INCLUDE #include "db_config.h"
-INCLUDE
-INCLUDE #ifndef NO_SYSTEM_INCLUDES
-INCLUDE #include <sys/types.h>
-INCLUDE
-INCLUDE #include <ctype.h>
-INCLUDE #include <errno.h>
-INCLUDE #include <string.h>
-INCLUDE #endif
-INCLUDE
-INCLUDE #include "db_int.h"
-INCLUDE #include "db_page.h"
-INCLUDE #include "db_dispatch.h"
-INCLUDE #include "db_am.h"
-INCLUDE #include "log.h"
-INCLUDE #include "txn.h"
-INCLUDE
-
-/* Used for registering name/id translations at open or close. */
-DEPRECATED register1 1
-ARG opcode u_int32_t lu
-DBT name DBT s
-DBT uid DBT s
-ARG fileid int32_t ld
-ARG ftype DBTYPE lx
-END
-
-BEGIN register 2
-ARG opcode u_int32_t lu
-DBT name DBT s
-DBT uid DBT s
-ARG fileid int32_t ld
-ARG ftype DBTYPE lx
-ARG meta_pgno db_pgno_t lu
-END
diff --git a/bdb/log/log_archive.c b/bdb/log/log_archive.c
index 83728c79e55..19e1af5a93e 100644
--- a/bdb/log/log_archive.c
+++ b/bdb/log/log_archive.c
@@ -1,14 +1,14 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1997, 1998, 1999, 2000
+ * Copyright (c) 1997-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: log_archive.c,v 11.13 2000/11/30 00:58:40 ubell Exp $";
+static const char revid[] = "$Id: log_archive.c,v 11.39 2002/08/06 05:00:31 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -19,50 +19,41 @@ static const char revid[] = "$Id: log_archive.c,v 11.13 2000/11/30 00:58:40 ubel
#include <unistd.h>
#endif
-#ifdef HAVE_RPC
-#include "db_server.h"
-#endif
-
#include "db_int.h"
-#include "db_dispatch.h"
-#include "log.h"
-#include "clib_ext.h" /* XXX: needed for getcwd. */
-
-#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
-#endif
+#include "dbinc/db_page.h"
+#include "dbinc/log.h"
+#include "dbinc/qam.h"
+#include "dbinc/txn.h"
static int __absname __P((DB_ENV *, char *, char *, char **));
-static int __build_data __P((DB_ENV *, char *, char ***, void *(*)(size_t)));
+static int __build_data __P((DB_ENV *, char *, char ***));
static int __cmpfunc __P((const void *, const void *));
-static int __usermem __P((DB_ENV *, char ***, void *(*)(size_t)));
+static int __usermem __P((DB_ENV *, char ***));
/*
- * log_archive --
+ * __log_archive --
* Supporting function for db_archive(1).
+ *
+ * PUBLIC: int __log_archive __P((DB_ENV *, char **[], u_int32_t));
*/
int
-log_archive(dbenv, listp, flags, db_malloc)
+__log_archive(dbenv, listp, flags)
DB_ENV *dbenv;
char ***listp;
u_int32_t flags;
- void *(*db_malloc) __P((size_t));
{
DBT rec;
DB_LOG *dblp;
+ DB_LOGC *logc;
DB_LSN stable_lsn;
- u_int32_t fnum;
- int array_size, n, ret;
+ __txn_ckp_args *ckp_args;
char **array, **arrayp, *name, *p, *pref, buf[MAXPATHLEN];
-
-#ifdef HAVE_RPC
- if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
- return (__dbcl_log_archive(dbenv, listp, flags, db_malloc));
-#endif
+ int array_size, db_arch_abs, n, ret;
+ u_int32_t fnum;
PANIC_CHECK(dbenv);
- ENV_REQUIRES_CONFIG(dbenv, dbenv->lg_handle, DB_INIT_LOG);
+ ENV_REQUIRES_CONFIG(dbenv,
+ dbenv->lg_handle, "DB_ENV->log_archive", DB_INIT_LOG);
name = NULL;
dblp = dbenv->lg_handle;
@@ -70,15 +61,24 @@ log_archive(dbenv, listp, flags, db_malloc)
#define OKFLAGS (DB_ARCH_ABS | DB_ARCH_DATA | DB_ARCH_LOG)
if (flags != 0) {
- if ((ret =
- __db_fchk(dbenv, "log_archive", flags, OKFLAGS)) != 0)
+ if ((ret = __db_fchk(
+ dbenv, "DB_ENV->log_archive", flags, OKFLAGS)) != 0)
return (ret);
- if ((ret =
- __db_fcchk(dbenv,
- "log_archive", flags, DB_ARCH_DATA, DB_ARCH_LOG)) != 0)
+ if ((ret = __db_fcchk(dbenv, "DB_ENV->log_archive",
+ flags, DB_ARCH_DATA, DB_ARCH_LOG)) != 0)
return (ret);
}
+ if (LF_ISSET(DB_ARCH_ABS)) {
+ db_arch_abs = 1;
+ LF_CLR(DB_ARCH_ABS);
+ } else
+ db_arch_abs = 0;
+
+ if (flags == 0 || flags == DB_ARCH_DATA)
+ ENV_REQUIRES_CONFIG(dbenv,
+ dbenv->tx_handle, "DB_ENV->log_archive", DB_INIT_TXN);
+
/*
* Get the absolute pathname of the current directory. It would
* be nice to get the shortest pathname of the database directory,
@@ -88,7 +88,7 @@ log_archive(dbenv, listp, flags, db_malloc)
* Can't trust getcwd(3) to set a valid errno. If it doesn't, just
* guess that we ran out of memory.
*/
- if (LF_ISSET(DB_ARCH_ABS)) {
+ if (db_arch_abs) {
__os_set_errno(0);
if ((pref = getcwd(buf, sizeof(buf))) == NULL) {
if (__os_get_errno() == 0)
@@ -98,31 +98,55 @@ log_archive(dbenv, listp, flags, db_malloc)
} else
pref = NULL;
- switch (LF_ISSET(~DB_ARCH_ABS)) {
+ switch (flags) {
case DB_ARCH_DATA:
- return (__build_data(dbenv, pref, listp, db_malloc));
+ return (__build_data(dbenv, pref, listp));
case DB_ARCH_LOG:
memset(&rec, 0, sizeof(rec));
- if (F_ISSET(dbenv, DB_ENV_THREAD))
- F_SET(&rec, DB_DBT_MALLOC);
- if ((ret = log_get(dbenv, &stable_lsn, &rec, DB_LAST)) != 0)
+ if ((ret = dbenv->log_cursor(dbenv, &logc, 0)) != 0)
+ return (ret);
+#ifdef UMRW
+ ZERO_LSN(stable_lsn);
+#endif
+ ret = logc->get(logc, &stable_lsn, &rec, DB_LAST);
+ (void)logc->close(logc, 0);
+ if (ret != 0)
return (ret);
- if (F_ISSET(dbenv, DB_ENV_THREAD))
- __os_free(rec.data, rec.size);
fnum = stable_lsn.file;
break;
case 0:
- if ((ret = __log_findckp(dbenv, &stable_lsn)) != 0) {
+ memset(&rec, 0, sizeof(rec));
+ if (__txn_getckp(dbenv, &stable_lsn) != 0) {
/*
- * A return of DB_NOTFOUND means that we didn't find
- * any records in the log (so we are not going to be
- * deleting any log files).
+ * A failure return means that there's no checkpoint
+ * in the log (so we are not going to be deleting
+ * any log files).
*/
- if (ret != DB_NOTFOUND)
- return (ret);
*listp = NULL;
return (0);
}
+ if ((ret = dbenv->log_cursor(dbenv, &logc, 0)) != 0)
+ return (ret);
+ if ((ret = logc->get(logc, &stable_lsn, &rec, DB_SET)) != 0 ||
+ (ret = __txn_ckp_read(dbenv, rec.data, &ckp_args)) != 0) {
+ /*
+ * A return of DB_NOTFOUND may only mean that the
+ * checkpoint LSN is before the beginning of the
+ * log files that we still have. This is not
+ * an error; it just means our work is done.
+ */
+ if (ret == DB_NOTFOUND) {
+ *listp = NULL;
+ ret = 0;
+ }
+ (void)logc->close(logc, 0);
+ return (ret);
+ }
+ if ((ret = logc->close(logc, 0)) != 0)
+ return (ret);
+ stable_lsn = ckp_args->ckp_lsn;
+ __os_free(dbenv, ckp_args);
+
/* Remove any log files before the last stable LSN. */
fnum = stable_lsn.file - 1;
break;
@@ -130,9 +154,9 @@ log_archive(dbenv, listp, flags, db_malloc)
#define LIST_INCREMENT 64
/* Get some initial space. */
- array_size = 10;
+ array_size = 64;
if ((ret = __os_malloc(dbenv,
- sizeof(char *) * array_size, NULL, &array)) != 0)
+ sizeof(char *) * array_size, &array)) != 0)
return (ret);
array[0] = NULL;
@@ -143,27 +167,27 @@ log_archive(dbenv, listp, flags, db_malloc)
if (__os_exists(name, NULL) != 0) {
if (LF_ISSET(DB_ARCH_LOG) && fnum == stable_lsn.file)
continue;
- __os_freestr(name);
+ __os_free(dbenv, name);
name = NULL;
break;
}
- if (n >= array_size - 1) {
+ if (n >= array_size - 2) {
array_size += LIST_INCREMENT;
if ((ret = __os_realloc(dbenv,
- sizeof(char *) * array_size, NULL, &array)) != 0)
+ sizeof(char *) * array_size, &array)) != 0)
goto err;
}
- if (LF_ISSET(DB_ARCH_ABS)) {
+ if (db_arch_abs) {
if ((ret = __absname(dbenv,
pref, name, &array[n])) != 0)
goto err;
- __os_freestr(name);
+ __os_free(dbenv, name);
} else if ((p = __db_rpath(name)) != NULL) {
if ((ret = __os_strdup(dbenv, p + 1, &array[n])) != 0)
goto err;
- __os_freestr(name);
+ __os_free(dbenv, name);
} else
array[n] = name;
@@ -182,7 +206,7 @@ log_archive(dbenv, listp, flags, db_malloc)
qsort(array, (size_t)n, sizeof(char *), __cmpfunc);
/* Rework the memory. */
- if ((ret = __usermem(dbenv, &array, db_malloc)) != 0)
+ if ((ret = __usermem(dbenv, &array)) != 0)
goto err;
*listp = array;
@@ -190,11 +214,11 @@ log_archive(dbenv, listp, flags, db_malloc)
err: if (array != NULL) {
for (arrayp = array; *arrayp != NULL; ++arrayp)
- __os_freestr(*arrayp);
- __os_free(array, sizeof(char *) * array_size);
+ __os_free(dbenv, *arrayp);
+ __os_free(dbenv, array);
}
if (name != NULL)
- __os_freestr(name);
+ __os_free(dbenv, name);
return (ret);
}
@@ -203,73 +227,89 @@ err: if (array != NULL) {
* Build a list of datafiles for return.
*/
static int
-__build_data(dbenv, pref, listp, db_malloc)
+__build_data(dbenv, pref, listp)
DB_ENV *dbenv;
char *pref, ***listp;
- void *(*db_malloc) __P((size_t));
{
DBT rec;
+ DB_LOGC *logc;
DB_LSN lsn;
- __log_register_args *argp;
+ __dbreg_register_args *argp;
u_int32_t rectype;
- int array_size, last, n, nxt, ret;
- char **array, **arrayp, *p, *real_name;
+ int array_size, last, n, nxt, ret, t_ret;
+ char **array, **arrayp, **list, **lp, *p, *real_name;
/* Get some initial space. */
- array_size = 10;
+ array_size = 64;
if ((ret = __os_malloc(dbenv,
- sizeof(char *) * array_size, NULL, &array)) != 0)
+ sizeof(char *) * array_size, &array)) != 0)
return (ret);
array[0] = NULL;
memset(&rec, 0, sizeof(rec));
- if (F_ISSET(dbenv, DB_ENV_THREAD))
- F_SET(&rec, DB_DBT_MALLOC);
- for (n = 0, ret = log_get(dbenv, &lsn, &rec, DB_FIRST);
- ret == 0; ret = log_get(dbenv, &lsn, &rec, DB_NEXT)) {
+ if ((ret = dbenv->log_cursor(dbenv, &logc, 0)) != 0)
+ return (ret);
+ for (n = 0; (ret = logc->get(logc, &lsn, &rec, DB_PREV)) == 0;) {
if (rec.size < sizeof(rectype)) {
ret = EINVAL;
- __db_err(dbenv, "log_archive: bad log record");
- goto lg_free;
+ __db_err(dbenv, "DB_ENV->log_archive: bad log record");
+ goto free_continue;
}
memcpy(&rectype, rec.data, sizeof(rectype));
- if (rectype != DB_log_register) {
- if (F_ISSET(dbenv, DB_ENV_THREAD)) {
- __os_free(rec.data, rec.size);
- rec.data = NULL;
- }
+ if (rectype != DB___dbreg_register)
continue;
- }
- if ((ret = __log_register_read(dbenv, rec.data, &argp)) != 0) {
+ if ((ret =
+ __dbreg_register_read(dbenv, rec.data, &argp)) != 0) {
ret = EINVAL;
__db_err(dbenv,
- "log_archive: unable to read log record");
- goto lg_free;
+ "DB_ENV->log_archive: unable to read log record");
+ goto free_continue;
}
- if (n >= array_size - 1) {
+ if (n >= array_size - 2) {
array_size += LIST_INCREMENT;
if ((ret = __os_realloc(dbenv,
- sizeof(char *) * array_size, NULL, &array)) != 0)
- goto lg_free;
+ sizeof(char *) * array_size, &array)) != 0)
+ goto free_continue;
}
if ((ret = __os_strdup(dbenv,
- argp->name.data, &array[n])) != 0) {
-lg_free: if (F_ISSET(&rec, DB_DBT_MALLOC) && rec.data != NULL)
- __os_free(rec.data, rec.size);
- goto err1;
- }
-
- array[++n] = NULL;
- __os_free(argp, 0);
-
- if (F_ISSET(dbenv, DB_ENV_THREAD)) {
- __os_free(rec.data, rec.size);
- rec.data = NULL;
+ argp->name.data, &array[n++])) != 0)
+ goto free_continue;
+ array[n] = NULL;
+
+ if (argp->ftype == DB_QUEUE) {
+ if ((ret = __qam_extent_names(dbenv,
+ argp->name.data, &list)) != 0)
+ goto q_err;
+ for (lp = list;
+ lp != NULL && *lp != NULL; lp++) {
+ if (n >= array_size - 2) {
+ array_size += LIST_INCREMENT;
+ if ((ret = __os_realloc(dbenv,
+ sizeof(char *) *
+ array_size, &array)) != 0)
+ goto q_err;
+ }
+ if ((ret =
+ __os_strdup(dbenv, *lp, &array[n++])) != 0)
+ goto q_err;
+ array[n] = NULL;
+ }
+q_err: if (list != NULL)
+ __os_free(dbenv, list);
}
+free_continue: __os_free(dbenv, argp);
+ if (ret != 0)
+ break;
}
+ if (ret == DB_NOTFOUND)
+ ret = 0;
+ if ((t_ret = logc->close(logc, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ if (ret != 0)
+ goto err1;
/* If there's nothing to return, we're done. */
if (n == 0) {
@@ -297,34 +337,34 @@ lg_free: if (F_ISSET(&rec, DB_DBT_MALLOC) && rec.data != NULL)
}
for (++nxt; nxt < n &&
strcmp(array[last], array[nxt]) == 0; ++nxt) {
- __os_freestr(array[nxt]);
+ __os_free(dbenv, array[nxt]);
array[nxt] = NULL;
}
/* Get the real name. */
if ((ret = __db_appname(dbenv,
- DB_APP_DATA, NULL, array[last], 0, NULL, &real_name)) != 0)
+ DB_APP_DATA, array[last], 0, NULL, &real_name)) != 0)
goto err2;
/* If the file doesn't exist, ignore it. */
if (__os_exists(real_name, NULL) != 0) {
- __os_freestr(real_name);
- __os_freestr(array[last]);
+ __os_free(dbenv, real_name);
+ __os_free(dbenv, array[last]);
array[last] = NULL;
continue;
}
/* Rework the name as requested by the user. */
- __os_freestr(array[last]);
+ __os_free(dbenv, array[last]);
array[last] = NULL;
if (pref != NULL) {
ret = __absname(dbenv, pref, real_name, &array[last]);
- __os_freestr(real_name);
+ __os_free(dbenv, real_name);
if (ret != 0)
goto err2;
} else if ((p = __db_rpath(real_name)) != NULL) {
ret = __os_strdup(dbenv, p + 1, &array[last]);
- __os_freestr(real_name);
+ __os_free(dbenv, real_name);
if (ret != 0)
goto err2;
} else
@@ -336,7 +376,7 @@ lg_free: if (F_ISSET(&rec, DB_DBT_MALLOC) && rec.data != NULL)
array[last] = NULL;
/* Rework the memory. */
- if ((ret = __usermem(dbenv, &array, db_malloc)) != 0)
+ if ((ret = __usermem(dbenv, &array)) != 0)
goto err1;
*listp = array;
@@ -349,13 +389,13 @@ err2: /*
*/
if (array != NULL)
for (; nxt < n; ++nxt)
- __os_freestr(array[nxt]);
+ __os_free(dbenv, array[nxt]);
/* FALLTHROUGH */
err1: if (array != NULL) {
for (arrayp = array; *arrayp != NULL; ++arrayp)
- __os_freestr(*arrayp);
- __os_free(array, array_size * sizeof(char *));
+ __os_free(dbenv, *arrayp);
+ __os_free(dbenv, array);
}
return (ret);
}
@@ -379,7 +419,7 @@ __absname(dbenv, pref, name, newnamep)
/* Malloc space for concatenating the two. */
if ((ret = __os_malloc(dbenv,
- l_pref + l_name + 2, NULL, &newname)) != 0)
+ l_pref + l_name + 2, &newname)) != 0)
return (ret);
*newnamep = newname;
@@ -400,10 +440,9 @@ __absname(dbenv, pref, name, newnamep)
* If the user has their own malloc routine, use it.
*/
static int
-__usermem(dbenv, listp, db_malloc)
+__usermem(dbenv, listp)
DB_ENV *dbenv;
char ***listp;
- void *(*db_malloc) __P((size_t));
{
size_t len;
int ret;
@@ -415,7 +454,7 @@ __usermem(dbenv, listp, db_malloc)
len += sizeof(char *);
/* Allocate it and set up the pointers. */
- if ((ret = __os_malloc(dbenv, len, db_malloc, &array)) != 0)
+ if ((ret = __os_umalloc(dbenv, len, &array)) != 0)
return (ret);
strp = (char *)(array + (orig - *listp) + 1);
@@ -427,13 +466,13 @@ __usermem(dbenv, listp, db_malloc)
*arrayp = strp;
strp += len + 1;
- __os_freestr(*orig);
+ __os_free(dbenv, *orig);
}
/* NULL-terminate the list. */
*arrayp = NULL;
- __os_free(*listp, 0);
+ __os_free(dbenv, *listp);
*listp = array;
return (0);
diff --git a/bdb/log/log_compare.c b/bdb/log/log_compare.c
index 9bc3c028a5f..115f9c21b76 100644
--- a/bdb/log/log_compare.c
+++ b/bdb/log/log_compare.c
@@ -1,13 +1,13 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: log_compare.c,v 11.3 2000/02/14 02:59:59 bostic Exp $";
+static const char revid[] = "$Id: log_compare.c,v 11.6 2002/01/11 15:52:50 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -19,6 +19,8 @@ static const char revid[] = "$Id: log_compare.c,v 11.3 2000/02/14 02:59:59 bosti
/*
* log_compare --
* Compare two LSN's; return 1, 0, -1 if first is >, == or < second.
+ *
+ * EXTERN: int log_compare __P((const DB_LSN *, const DB_LSN *));
*/
int
log_compare(lsn0, lsn1)
diff --git a/bdb/log/log_findckp.c b/bdb/log/log_findckp.c
deleted file mode 100644
index b1e8fddbdb7..00000000000
--- a/bdb/log/log_findckp.c
+++ /dev/null
@@ -1,135 +0,0 @@
-/*-
- * See the file LICENSE for redistribution information.
- *
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
- * Sleepycat Software. All rights reserved.
- */
-
-#include "db_config.h"
-
-#ifndef lint
-static const char revid[] = "$Id: log_findckp.c,v 11.5 2000/11/30 00:58:40 ubell Exp $";
-#endif /* not lint */
-
-#ifndef NO_SYSTEM_INCLUDES
-#include <sys/types.h>
-
-#include <string.h>
-#endif
-
-#include "db_int.h"
-#include "log.h"
-#include "txn.h"
-
-/*
- * __log_findckp --
- *
- * Looks for the most recent checkpoint that occurs before the most recent
- * checkpoint LSN, subject to the constraint that there must be at least two
- * checkpoints. The reason you need two checkpoints is that you might have
- * crashed during the most recent one and may not have a copy of all the
- * open files. This is the point from which recovery can start and the
- * point up to which archival/truncation can take place. Checkpoints in
- * the log look like:
- *
- * -------------------------------------------------------------------
- * | ckp A, ckplsn 100 | .... record .... | ckp B, ckplsn 600 | ...
- * -------------------------------------------------------------------
- * LSN 500 LSN 1000
- *
- * If we read what log returns from using the DB_CKP parameter to logput,
- * we'll get the record at LSN 1000. The checkpoint LSN there is 600.
- * Now we have to scan backwards looking for a checkpoint before LSN 600.
- * We find one at 500. This means that we can truncate the log before
- * 500 or run recovery beginning at 500.
- *
- * Returns 0 if we find a suitable checkpoint or we retrieved the first
- * record in the log from which to start. Returns DB_NOTFOUND if there
- * are no log records, errno on error.
- *
- * PUBLIC: int __log_findckp __P((DB_ENV *, DB_LSN *));
- */
-int
-__log_findckp(dbenv, lsnp)
- DB_ENV *dbenv;
- DB_LSN *lsnp;
-{
- DBT data;
- DB_LSN ckp_lsn, final_ckp, last_ckp, next_lsn;
- __txn_ckp_args *ckp_args;
- int ret;
-
- /*
- * Need to find the appropriate point from which to begin
- * recovery.
- */
- memset(&data, 0, sizeof(data));
- if (F_ISSET(dbenv, DB_ENV_THREAD))
- F_SET(&data, DB_DBT_MALLOC);
- ZERO_LSN(ckp_lsn);
- if ((ret = log_get(dbenv, &last_ckp, &data, DB_CHECKPOINT)) != 0) {
- if (ret == ENOENT)
- goto get_first;
- else
- return (ret);
- }
- final_ckp = last_ckp;
-
- next_lsn = last_ckp;
- do {
- if (F_ISSET(dbenv, DB_ENV_THREAD))
- __os_free(data.data, data.size);
-
- if ((ret = log_get(dbenv, &next_lsn, &data, DB_SET)) != 0)
- return (ret);
- if ((ret = __txn_ckp_read(dbenv, data.data, &ckp_args)) != 0) {
- if (F_ISSET(dbenv, DB_ENV_THREAD))
- __os_free(data.data, data.size);
- return (ret);
- }
- if (IS_ZERO_LSN(ckp_lsn))
- ckp_lsn = ckp_args->ckp_lsn;
- if (FLD_ISSET(dbenv->verbose, DB_VERB_CHKPOINT)) {
- __db_err(dbenv, "Checkpoint at: [%lu][%lu]",
- (u_long)last_ckp.file, (u_long)last_ckp.offset);
- __db_err(dbenv, "Checkpoint LSN: [%lu][%lu]",
- (u_long)ckp_args->ckp_lsn.file,
- (u_long)ckp_args->ckp_lsn.offset);
- __db_err(dbenv, "Previous checkpoint: [%lu][%lu]",
- (u_long)ckp_args->last_ckp.file,
- (u_long)ckp_args->last_ckp.offset);
- }
- last_ckp = next_lsn;
- next_lsn = ckp_args->last_ckp;
- __os_free(ckp_args, sizeof(*ckp_args));
-
- /*
- * Keep looping until either you 1) run out of checkpoints,
- * 2) you've found a checkpoint before the most recent
- * checkpoint's LSN and you have at least 2 checkpoints.
- */
- } while (!IS_ZERO_LSN(next_lsn) &&
- (log_compare(&last_ckp, &ckp_lsn) > 0 ||
- log_compare(&final_ckp, &last_ckp) == 0));
-
- if (F_ISSET(dbenv, DB_ENV_THREAD))
- __os_free(data.data, data.size);
-
- /*
- * At this point, either, next_lsn is ZERO or ckp_lsn is the
- * checkpoint lsn and last_ckp is the LSN of the last checkpoint
- * before ckp_lsn. If the compare in the loop is still true, then
- * next_lsn must be 0 and we need to roll forward from the
- * beginning of the log.
- */
- if (log_compare(&last_ckp, &ckp_lsn) >= 0 ||
- log_compare(&final_ckp, &last_ckp) == 0) {
-get_first: if ((ret = log_get(dbenv, &last_ckp, &data, DB_FIRST)) != 0)
- return (ret);
- if (F_ISSET(dbenv, DB_ENV_THREAD))
- __os_free(data.data, data.size);
- }
- *lsnp = last_ckp;
-
- return (IS_ZERO_LSN(last_ckp) ? DB_NOTFOUND : 0);
-}
diff --git a/bdb/log/log_get.c b/bdb/log/log_get.c
index b75d50a62fd..c8b028da0fb 100644
--- a/bdb/log/log_get.c
+++ b/bdb/log/log_get.c
@@ -1,13 +1,13 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: log_get.c,v 11.32 2001/01/11 18:19:53 bostic Exp $";
+static const char revid[] = "$Id: log_get.c,v 11.81 2002/08/14 20:09:27 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -17,81 +17,175 @@ static const char revid[] = "$Id: log_get.c,v 11.32 2001/01/11 18:19:53 bostic E
#include <unistd.h>
#endif
-#ifdef HAVE_RPC
-#include "db_server.h"
-#endif
-
#include "db_int.h"
-#include "db_page.h"
-#include "log.h"
-#include "hash.h"
+#include "dbinc/crypto.h"
+#include "dbinc/db_page.h"
+#include "dbinc/hmac.h"
+#include "dbinc/log.h"
+#include "dbinc/hash.h"
-#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
-#endif
+typedef enum { L_ALREADY, L_ACQUIRED, L_NONE } RLOCK;
+
+static int __log_c_close __P((DB_LOGC *, u_int32_t));
+static int __log_c_get __P((DB_LOGC *, DB_LSN *, DBT *, u_int32_t));
+static int __log_c_get_int __P((DB_LOGC *, DB_LSN *, DBT *, u_int32_t));
+static int __log_c_hdrchk __P((DB_LOGC *, HDR *, int *));
+static int __log_c_incursor __P((DB_LOGC *, DB_LSN *, HDR *, u_int8_t **));
+static int __log_c_inregion __P((DB_LOGC *,
+ DB_LSN *, RLOCK *, DB_LSN *, HDR *, u_int8_t **));
+static int __log_c_io __P((DB_LOGC *,
+ u_int32_t, u_int32_t, void *, size_t *, int *));
+static int __log_c_ondisk __P((DB_LOGC *,
+ DB_LSN *, DB_LSN *, int, HDR *, u_int8_t **, int *));
+static int __log_c_set_maxrec __P((DB_LOGC *, char *));
+static int __log_c_shortread __P((DB_LOGC *, int));
/*
- * log_get --
- * Get a log record.
+ * __log_cursor --
+ * Create a log cursor.
+ *
+ * Allocates a DB_LOGC, a DB_FH for its file handle (c_fh) and a read
+ * buffer (bp) of DB_LOGC_BUF_SIZE bytes, installs __log_c_close and
+ * __log_c_get as the cursor's close and get methods, and hands the
+ * cursor back through logcp.
+ *
+ * flags is checked with __db_fchk against an empty set of allowed bits.
+ * Once the environment checks have passed, *logcp is set to NULL and is
+ * only replaced on success.  Returns 0 on success; otherwise the error
+ * from the failing check or allocation, after freeing whatever part of
+ * the cursor had been allocated.
+ *
+ * PUBLIC: int __log_cursor __P((DB_ENV *, DB_LOGC **, u_int32_t));
*/
int
-log_get(dbenv, alsn, dbt, flags)
+__log_cursor(dbenv, logcp, flags)
+ DB_ENV *dbenv;
+ DB_LOGC **logcp;
+ u_int32_t flags;
+{
+ DB_LOGC *logc; /* NOTE(review): not initialized here; the err path appears to rely on a failed __os_calloc storing NULL -- confirm. */
+ int ret;
+
+ PANIC_CHECK(dbenv);
+ ENV_REQUIRES_CONFIG(dbenv,
+ dbenv->lg_handle, "DB_ENV->log_cursor", DB_INIT_LOG);
+
+ *logcp = NULL;
+
+ /* Validate arguments. */
+ if ((ret = __db_fchk(dbenv, "DB_ENV->log_cursor", flags, 0)) != 0)
+ return (ret);
+
+ /* Allocate memory for the cursor. */
+ if ((ret = __os_calloc(dbenv, 1, sizeof(DB_LOGC), &logc)) != 0)
+ goto err;
+ if ((ret = __os_calloc(dbenv, 1, sizeof(DB_FH), &logc->c_fh)) != 0)
+ goto err;
+
+ logc->bp_size = DB_LOGC_BUF_SIZE;
+ if ((ret = __os_malloc(dbenv, logc->bp_size, &logc->bp)) != 0) /* Last allocation: if it fails there is no bp to free at err. */
+ goto err;
+
+ logc->dbenv = dbenv;
+ logc->close = __log_c_close;
+ logc->get = __log_c_get;
+
+ *logcp = logc;
+ return (0);
+
+err: if (logc != NULL) { /* Undo the partial allocations. */
+ if (logc->c_fh != NULL)
+ __os_free(dbenv, logc->c_fh);
+ __os_free(dbenv, logc);
+ }
+
+ return (ret);
+}
+
+/*
+ * __log_c_close --
+ * Close a log cursor.
+ *
+ * flags is checked with __db_fchk against an empty set of allowed bits;
+ * if that check fails its error is returned and nothing is released.
+ * Otherwise the cursor's file handle is closed if it is marked
+ * DB_FH_VALID, c_dbt.data is freed if non-NULL, and the read buffer
+ * (bp), the DB_FH structure (c_fh) and the cursor itself are freed.
+ * The cursor must not be used after a successful return.
+ *
+ * Returns 0 once the cursor has been released.
+ */
+static int
+__log_c_close(logc, flags)
+ DB_LOGC *logc;
+ u_int32_t flags;
+{
DB_ENV *dbenv;
+ int ret;
+
+ dbenv = logc->dbenv;
+
+ PANIC_CHECK(dbenv);
+ if ((ret = __db_fchk(dbenv, "DB_LOGC->close", flags, 0)) != 0)
+ return (ret);
+
+ if (F_ISSET(logc->c_fh, DB_FH_VALID))
+ (void)__os_closehandle(dbenv, logc->c_fh); /* Close error is deliberately ignored. */
+
+ if (logc->c_dbt.data != NULL)
+ __os_free(dbenv, logc->c_dbt.data);
+
+ __os_free(dbenv, logc->bp);
+ __os_free(dbenv, logc->c_fh);
+ __os_free(dbenv, logc); /* Freed last: the lines above still read through logc. */
+
+ return (0);
+}
+
+/*
+ * __log_c_get --
+ * Get a log record.
+ */
+static int
+__log_c_get(logc, alsn, dbt, flags)
+ DB_LOGC *logc;
DB_LSN *alsn;
DBT *dbt;
u_int32_t flags;
{
- DB_LOG *dblp;
+ DB_ENV *dbenv;
DB_LSN saved_lsn;
int ret;
-#ifdef HAVE_RPC
- if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
- return (__dbcl_log_get(dbenv, alsn, dbt, flags));
-#endif
+ dbenv = logc->dbenv;
PANIC_CHECK(dbenv);
- ENV_REQUIRES_CONFIG(dbenv, dbenv->lg_handle, DB_INIT_LOG);
/* Validate arguments. */
- if (flags != DB_CHECKPOINT && flags != DB_CURRENT &&
- flags != DB_FIRST && flags != DB_LAST &&
- flags != DB_NEXT && flags != DB_PREV && flags != DB_SET)
- return (__db_ferr(dbenv, "log_get", 1));
-
- if (F_ISSET(dbenv, DB_ENV_THREAD)) {
- if (flags == DB_NEXT || flags == DB_PREV || flags == DB_CURRENT)
- return (__db_ferr(dbenv, "log_get", 1));
- if (!F_ISSET(dbt,
- DB_DBT_MALLOC | DB_DBT_REALLOC | DB_DBT_USERMEM))
- return (__db_ferr(dbenv, "threaded data", 1));
+ switch (flags) {
+ case DB_CURRENT:
+ case DB_FIRST:
+ case DB_LAST:
+ case DB_NEXT:
+ case DB_PREV:
+ break;
+ case DB_SET:
+ if (IS_ZERO_LSN(*alsn)) {
+ __db_err(dbenv, "DB_LOGC->get: invalid LSN");
+ return (EINVAL);
+ }
+ break;
+ default:
+ return (__db_ferr(dbenv, "DB_LOGC->get", 1));
}
- dblp = dbenv->lg_handle;
- R_LOCK(dbenv, &dblp->reginfo);
-
/*
- * The alsn field is only initialized if DB_SET is the flag, so this
- * assignment causes uninitialized memory complaints for other flag
- * values.
+ * On error, we take care not to overwrite the caller's LSN. This
+ * is because callers looking for the end of the log loop using the
+ * DB_NEXT flag, and expect to take the last successful lsn out of
+ * the passed-in structure after DB_LOGC->get fails with DB_NOTFOUND.
+ *
+ * !!!
+ * This line is often flagged an uninitialized memory read during a
+ * Purify or similar tool run, as the application didn't initialize
+ * *alsn. If the application isn't setting the DB_SET flag, there is
+ * no reason it should have initialized *alsn, but we can't know that
+ * and we want to make sure we never overwrite whatever the application
+ * put in there.
*/
-#ifdef UMRW
- if (flags == DB_SET)
- saved_lsn = *alsn;
- else
- ZERO_LSN(saved_lsn);
-#else
saved_lsn = *alsn;
-#endif
/*
- * If we get one of the log's header records, repeat the operation.
- * This assumes that applications don't ever request the log header
- * records by LSN, but that seems reasonable to me.
+ * If we get one of the log's header records as a result of doing a
+ * DB_FIRST, DB_NEXT, DB_LAST or DB_PREV, repeat the operation, log
+ * file header records aren't useful to applications.
*/
- if ((ret = __log_get(dblp,
- alsn, dbt, flags, 0)) == 0 && alsn->offset == 0) {
+ if ((ret = __log_c_get_int(logc, alsn, dbt, flags)) != 0) {
+ *alsn = saved_lsn;
+ return (ret);
+ }
+ if (alsn->offset == 0 && (flags == DB_FIRST ||
+ flags == DB_NEXT || flags == DB_LAST || flags == DB_PREV)) {
switch (flags) {
case DB_FIRST:
flags = DB_NEXT;
@@ -101,92 +195,100 @@ log_get(dbenv, alsn, dbt, flags)
break;
}
if (F_ISSET(dbt, DB_DBT_MALLOC)) {
- __os_free(dbt->data, dbt->size);
+ __os_free(dbenv, dbt->data);
dbt->data = NULL;
}
- ret = __log_get(dblp, alsn, dbt, flags, 0);
+ if ((ret = __log_c_get_int(logc, alsn, dbt, flags)) != 0) {
+ *alsn = saved_lsn;
+ return (ret);
+ }
}
- if (ret != 0)
- *alsn = saved_lsn;
- R_UNLOCK(dbenv, &dblp->reginfo);
-
- return (ret);
+ return (0);
}
/*
- * __log_get --
+ * __log_c_get_int --
* Get a log record; internal version.
- *
- * PUBLIC: int __log_get __P((DB_LOG *, DB_LSN *, DBT *, u_int32_t, int));
*/
-int
-__log_get(dblp, alsn, dbt, flags, silent)
- DB_LOG *dblp;
+static int
+__log_c_get_int(logc, alsn, dbt, flags)
+ DB_LOGC *logc;
DB_LSN *alsn;
DBT *dbt;
u_int32_t flags;
- int silent;
{
+ DB_CIPHER *db_cipher;
DB_ENV *dbenv;
- DB_LSN nlsn;
+ DB_LOG *dblp;
+ DB_LSN last_lsn, nlsn;
HDR hdr;
LOG *lp;
- const char *fail;
- char *np, *tbuf;
- int cnt, ret;
+ RLOCK rlock;
logfile_validity status;
- size_t len, nr;
- u_int32_t offset;
- u_int8_t *p;
- void *shortp, *readp;
+ u_int32_t cnt;
+ u_int8_t *rp;
+ int eof, is_hmac, ret;
+ dbenv = logc->dbenv;
+ dblp = dbenv->lg_handle;
lp = dblp->reginfo.primary;
- fail = np = tbuf = NULL;
- dbenv = dblp->dbenv;
+ is_hmac = 0;
- nlsn = dblp->c_lsn;
+ /*
+ * We don't acquire the log region lock until we need it, and we
+ * release it as soon as we're done.
+ */
+ rlock = F_ISSET(logc, DB_LOG_LOCKED) ? L_ALREADY : L_NONE;
+
+ nlsn = logc->c_lsn;
switch (flags) {
- case DB_CHECKPOINT:
- nlsn = lp->chkpt_lsn;
- if (IS_ZERO_LSN(nlsn)) {
- /* No db_err. The caller may expect this. */
- ret = ENOENT;
- goto err2;
- }
- break;
case DB_NEXT: /* Next log record. */
if (!IS_ZERO_LSN(nlsn)) {
/* Increment the cursor by the cursor record size. */
- nlsn.offset += dblp->c_len;
+ nlsn.offset += logc->c_len;
break;
}
+ flags = DB_FIRST;
/* FALLTHROUGH */
- case DB_FIRST: /* Find the first log record. */
+ case DB_FIRST: /* First log record. */
/* Find the first log file. */
if ((ret = __log_find(dblp, 1, &cnt, &status)) != 0)
- goto err2;
+ goto err;
/*
- * We want any readable version, so either DB_LV_NORMAL
- * or DB_LV_OLD_READABLE is acceptable here. If it's
- * not one of those two, there is no first log record that
- * we can read.
+ * DB_LV_INCOMPLETE:
+ * Theoretically, the log file we want could be created
+ * but not yet written, the "first" log record must be
+ * in the log buffer.
+ * DB_LV_NORMAL:
+ * DB_LV_OLD_READABLE:
+ * We found a log file we can read.
+ * DB_LV_NONEXISTENT:
+ * No log files exist, the "first" log record must be in
+ * the log buffer.
+ * DB_LV_OLD_UNREADABLE:
+ * No readable log files exist, we're at the cross-over
+ * point between two versions. The "first" log record
+ * must be in the log buffer.
*/
- if (status != DB_LV_NORMAL && status != DB_LV_OLD_READABLE) {
- ret = DB_NOTFOUND;
- goto err2;
+ switch (status) {
+ case DB_LV_INCOMPLETE:
+ DB_ASSERT(lp->lsn.file == cnt);
+ /* FALLTHROUGH */
+ case DB_LV_NORMAL:
+ case DB_LV_OLD_READABLE:
+ nlsn.file = cnt;
+ break;
+ case DB_LV_NONEXISTENT:
+ nlsn.file = 1;
+ DB_ASSERT(lp->lsn.file == nlsn.file);
+ break;
+ case DB_LV_OLD_UNREADABLE:
+ nlsn.file = cnt + 1;
+ DB_ASSERT(lp->lsn.file == nlsn.file);
+ break;
}
-
- /*
- * We may have only entered records in the buffer, and not
- * yet written a log file. If no log files were found and
- * there's anything in the buffer, it belongs to file 1.
- */
- if (cnt == 0)
- cnt = 1;
-
- nlsn.file = cnt;
nlsn.offset = 0;
break;
case DB_CURRENT: /* Current log record. */
@@ -197,21 +299,28 @@ __log_get(dblp, alsn, dbt, flags, silent)
if (nlsn.offset == 0) {
if (nlsn.file == 1 ||
__log_valid(dblp,
- nlsn.file - 1, 0, &status) != 0)
- return (DB_NOTFOUND);
+ nlsn.file - 1, 0, &status) != 0) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
if (status != DB_LV_NORMAL &&
- status != DB_LV_OLD_READABLE)
- return (DB_NOTFOUND);
+ status != DB_LV_OLD_READABLE) {
+ ret = DB_NOTFOUND;
+ goto err;
+ }
--nlsn.file;
- nlsn.offset = dblp->c_off;
- } else
- nlsn.offset = dblp->c_off;
+ }
+ nlsn.offset = logc->c_prev;
break;
}
/* FALLTHROUGH */
case DB_LAST: /* Last log record. */
+ if (rlock == L_NONE) {
+ rlock = L_ACQUIRED;
+ R_LOCK(dbenv, &dblp->reginfo);
+ }
nlsn.file = lp->lsn.file;
nlsn.offset = lp->lsn.offset - lp->len;
break;
@@ -225,241 +334,725 @@ next_file: ++nlsn.file;
nlsn.offset = 0;
}
- /* Return 1 if the request is past the end of the log. */
- if (nlsn.file > lp->lsn.file ||
- (nlsn.file == lp->lsn.file && nlsn.offset >= lp->lsn.offset))
- return (DB_NOTFOUND);
+ /*
+ * The above switch statement should have set nlsn to the lsn of
+ * the requested record.
+ */
- /* If we've switched files, discard the current file handle. */
- if (dblp->c_lsn.file != nlsn.file &&
- F_ISSET(&dblp->c_fh, DB_FH_VALID)) {
- (void)__os_closehandle(&dblp->c_fh);
+ if (CRYPTO_ON(dbenv)) {
+ hdr.size = HDR_CRYPTO_SZ;
+ is_hmac = 1;
+ } else {
+ hdr.size = HDR_NORMAL_SZ;
+ is_hmac = 0;
}
-
- /* If the entire record is in the in-memory buffer, copy it out. */
- if (nlsn.file == lp->lsn.file && nlsn.offset >= lp->w_off) {
- /* Copy the header. */
- p = dblp->bufp + (nlsn.offset - lp->w_off);
- memcpy(&hdr, p, sizeof(HDR));
-
- /* Copy the record. */
- len = hdr.len - sizeof(HDR);
- if ((ret = __db_retcopy(NULL, dbt, p + sizeof(HDR),
- len, &dblp->c_dbt.data, &dblp->c_dbt.ulen)) != 0)
- goto err2;
+ /* Check to see if the record is in the cursor's buffer. */
+ if ((ret = __log_c_incursor(logc, &nlsn, &hdr, &rp)) != 0)
+ goto err;
+ if (rp != NULL)
goto cksum;
- }
- shortp = NULL;
+ /*
+ * Look to see if we're moving backward in the log with the last record
+ * coming from the disk -- it means the record can't be in the region's
+ * buffer. Else, check the region's buffer.
+ *
+ * If the record isn't in the region's buffer, we're going to have to
+ * read the record from disk. We want to make a point of not reading
+ * past the end of the logical log (after recovery, there may be data
+ * after the end of the logical log, not to mention the log file may
+ * have been pre-allocated). So, zero out last_lsn, and initialize it
+ * inside __log_c_inregion -- if it's still zero when we check it in
+ * __log_c_ondisk, that's OK, it just means the logical end of the log
+ * isn't an issue for this request.
+ */
+ ZERO_LSN(last_lsn);
+ if (!F_ISSET(logc, DB_LOG_DISK) ||
+ log_compare(&nlsn, &logc->c_lsn) > 0) {
+ F_CLR(logc, DB_LOG_DISK);
- /* Acquire a file descriptor. */
- if (!F_ISSET(&dblp->c_fh, DB_FH_VALID)) {
- if ((ret = __log_name(dblp, nlsn.file,
- &np, &dblp->c_fh, DB_OSO_RDONLY | DB_OSO_SEQ)) != 0) {
- fail = np;
- goto err1;
- }
- __os_freestr(np);
- np = NULL;
+ if ((ret = __log_c_inregion(logc,
+ &nlsn, &rlock, &last_lsn, &hdr, &rp)) != 0)
+ goto err;
+ if (rp != NULL)
+ goto cksum;
}
- /* See if we've already read this */
- if (nlsn.file == dblp->r_file && nlsn.offset > dblp->r_off
- && nlsn.offset + sizeof(HDR) < dblp->r_off + dblp->r_size)
- goto got_header;
-
/*
- * Seek to the header offset and read the header. Because the file
- * may be pre-allocated, we have to make sure that we're not reading
- * past the information in the start of the in-memory buffer.
+ * We have to read from an on-disk file to retrieve the record.
+ * If we ever can't retrieve the record at offset 0, we're done,
+ * return EOF/DB_NOTFOUND.
+ *
+ * Discard the region lock if we're still holding it, the on-disk
+ * reading routines don't need it.
*/
-
- readp = &hdr;
- offset = nlsn.offset;
- if (nlsn.file == lp->lsn.file && offset + sizeof(HDR) > lp->w_off)
- nr = lp->w_off - offset;
- else if (dblp->readbufp == NULL)
- nr = sizeof(HDR);
- else {
- nr = lp->buffer_size;
- readp = dblp->readbufp;
- dblp->r_file = nlsn.file;
- /* Going backwards. Put the current in the middle. */
- if (flags == DB_PREV || flags == DB_LAST) {
- if (offset <= lp->buffer_size/2)
- offset = 0;
- else
- offset = offset - lp->buffer_size/2;
- }
- if (nlsn.file == lp->lsn.file && offset + nr > lp->lsn.offset)
- nr = lp->lsn.offset - offset;
- dblp->r_off = offset;
+ if (rlock == L_ACQUIRED) {
+ rlock = L_NONE;
+ R_UNLOCK(dbenv, &dblp->reginfo);
+ }
+ if ((ret = __log_c_ondisk(
+ logc, &nlsn, &last_lsn, flags, &hdr, &rp, &eof)) != 0)
+ goto err;
+ if (eof == 1) {
+ /*
+ * Only DB_NEXT automatically moves to the next file, and
+ * it only happens once.
+ */
+ if (flags != DB_NEXT || nlsn.offset == 0)
+ return (DB_NOTFOUND);
+ goto next_file;
}
+ F_SET(logc, DB_LOG_DISK);
- if ((ret = __os_seek(dblp->dbenv,
- &dblp->c_fh, 0, 0, offset, 0, DB_OS_SEEK_SET)) != 0) {
- fail = "seek";
- goto err1;
+cksum: /*
+ * Discard the region lock if we're still holding it. (The path to
+ * get here is that we acquired the lock because of the caller's
+ * flag argument, but we found the record in the cursor's buffer.
+ * Improbable, but it's easy to avoid.)
+ */
+ if (rlock == L_ACQUIRED) {
+ rlock = L_NONE;
+ R_UNLOCK(dbenv, &dblp->reginfo);
}
- if ((ret = __os_read(dblp->dbenv, &dblp->c_fh, readp, nr, &nr)) != 0) {
- fail = "read";
- goto err1;
+
+ /*
+ * Checksum: there are two types of errors -- a configuration error
+ * or a checksum mismatch. The former is always bad. The latter is
+ * OK if we're searching for the end of the log, and very, very bad
+ * if we're reading random log records.
+ */
+ db_cipher = dbenv->crypto_handle;
+ if ((ret = __db_check_chksum(dbenv, db_cipher,
+ hdr.chksum, rp + hdr.size, hdr.len - hdr.size, is_hmac)) != 0) {
+ if (F_ISSET(logc, DB_LOG_SILENT_ERR)) {
+ if (ret == 0 || ret == -1)
+ ret = EIO;
+ } else if (ret == -1) {
+ __db_err(dbenv,
+ "DB_LOGC->get: log record checksum mismatch");
+ __db_err(dbenv,
+ "DB_LOGC->get: catastrophic recovery may be required");
+ ret = __db_panic(dbenv, DB_RUNRECOVERY);
+ }
+ goto err;
}
- if (nr < sizeof(HDR)) {
- /* If read returns EOF, try the next file. */
- if (nr == 0) {
- if (flags != DB_NEXT || nlsn.file == lp->lsn.file)
- goto corrupt;
+
+ /*
+ * If we got a 0-length record, that means we're in the midst of
+ * some bytes that got 0'd as the result of a vtruncate. We're
+ * going to have to retry.
+ */
+ if (hdr.len == 0) {
+ switch (flags) {
+ case DB_FIRST:
+ case DB_NEXT:
+ /* Zero'd records always indicate the end of a file. */
goto next_file;
+
+ case DB_LAST:
+ case DB_PREV:
+ /*
+ * We should never get here. If we recover a log
+ * file with 0's at the end, we'll treat the 0'd
+ * headers as the end of log and ignore them. If
+ * we're reading backwards from another file, then
+ * the first record in that new file should have its
+ * prev field set correctly.
+ */
+ __db_err(dbenv,
+ "Encountered zero length records while traversing backwards");
+ DB_ASSERT(0);
+ case DB_SET:
+ default:
+ /* Return the 0-length record. */
+ break;
}
+ }
- if (dblp->readbufp != NULL)
- memcpy((u_int8_t *) &hdr, readp, nr);
+ /* Copy the record into the user's DBT. */
+ if ((ret = __db_retcopy(dbenv, dbt, rp + hdr.size,
+ (u_int32_t)(hdr.len - hdr.size),
+ &logc->c_dbt.data, &logc->c_dbt.ulen)) != 0)
+ goto err;
+ if (CRYPTO_ON(dbenv)) {
+ if ((ret = db_cipher->decrypt(dbenv, db_cipher->data,
+ hdr.iv, dbt->data, hdr.len - hdr.size)) != 0) {
+ ret = EAGAIN;
+ goto err;
+ }
/*
- * If read returns a short count the rest of the record has
- * to be in the in-memory buffer.
+ * Return the original log record size to the user,
+ * even though we've allocated more than that, possibly.
+ * The log record is decrypted in the user dbt, not in
+ * the buffer, so we must do this here after decryption,
+ * not adjust the len passed to the __db_retcopy call.
*/
- if (lp->b_off < sizeof(HDR) - nr)
- goto corrupt;
+ dbt->size = hdr.orig_size;
+ }
- /* Get the rest of the header from the in-memory buffer. */
- memcpy((u_int8_t *)&hdr + nr, dblp->bufp, sizeof(HDR) - nr);
+ /* Update the cursor and the returned LSN. */
+ *alsn = nlsn;
+ logc->c_lsn = nlsn;
+ logc->c_len = hdr.len;
+ logc->c_prev = hdr.prev;
- if (hdr.len == 0)
- goto next_file;
+err: if (rlock == L_ACQUIRED)
+ R_UNLOCK(dbenv, &dblp->reginfo);
- shortp = dblp->bufp + (sizeof(HDR) - nr);
- }
+ return (ret);
+}
- else if (dblp->readbufp != NULL) {
- dblp->r_size = nr;
-got_header: memcpy((u_int8_t *)&hdr,
- dblp->readbufp + (nlsn.offset - dblp->r_off), sizeof(HDR));
- }
+/*
+ * __log_c_incursor --
+ * Check to see if the requested record is in the cursor's buffer.
+ */
+static int
+__log_c_incursor(logc, lsn, hdr, pp)
+ DB_LOGC *logc;
+ DB_LSN *lsn;
+ HDR *hdr;
+ u_int8_t **pp;
+{
+ u_int8_t *p;
+
+ *pp = NULL;
/*
- * Check for buffers of 0's, that's what we usually see during recovery,
- * although it's certainly not something on which we can depend. Check
- * for impossibly large records. The malloc should fail later, but we
- * have customers that run mallocs that handle allocation failure as a
- * fatal error.
+ * Test to see if the requested LSN could be part of the cursor's
+ * buffer.
+ *
+ * The record must be part of the same file as the cursor's buffer.
+ * The record must start at a byte offset equal to or greater than
+ * the cursor buffer.
+ * The record must not start at a byte offset after the cursor
+ * buffer's end.
*/
- if (hdr.len == 0)
- goto next_file;
- if (hdr.len <= sizeof(HDR) || hdr.len > lp->persist.lg_max)
- goto corrupt;
- len = hdr.len - sizeof(HDR);
-
- /* If we've already moved to the in-memory buffer, fill from there. */
- if (shortp != NULL) {
- if (lp->b_off < ((u_int8_t *)shortp - dblp->bufp) + len)
- goto corrupt;
- if ((ret = __db_retcopy(NULL, dbt, shortp, len,
- &dblp->c_dbt.data, &dblp->c_dbt.ulen)) != 0)
- goto err2;
- goto cksum;
- }
+ if (logc->bp_lsn.file != lsn->file)
+ return (0);
+ if (logc->bp_lsn.offset > lsn->offset)
+ return (0);
+ if (logc->bp_lsn.offset + logc->bp_rlen <= lsn->offset + hdr->size)
+ return (0);
- if (dblp->readbufp != NULL) {
- if (nlsn.offset + hdr.len < dblp->r_off + dblp->r_size) {
- if ((ret = __db_retcopy(NULL, dbt, dblp->readbufp +
- (nlsn.offset - dblp->r_off) + sizeof(HDR),
- len, &dblp->c_dbt.data, &dblp->c_dbt.ulen)) != 0)
- goto err2;
- goto cksum;
- } else if ((ret = __os_seek(dblp->dbenv, &dblp->c_fh, 0,
- 0, nlsn.offset + sizeof(HDR), 0, DB_OS_SEEK_SET)) != 0) {
- fail = "seek";
- goto err1;
- }
+ /*
+ * Read the record's header and check if the record is entirely held
+ * in the buffer. If the record is not entirely held, get it again.
+ * (The only advantage in having part of the record locally is that
+ * we might avoid a system call because we already have the HDR in
+ * memory.)
+ *
+ * If the header check fails for any reason, it must be because the
+ * LSN is bogus. Fail hard.
+ */
+ p = logc->bp + (lsn->offset - logc->bp_lsn.offset);
+ memcpy(hdr, p, hdr->size);
+ if (__log_c_hdrchk(logc, hdr, NULL))
+ return (DB_NOTFOUND);
+ if (logc->bp_lsn.offset + logc->bp_rlen <= lsn->offset + hdr->len)
+ return (0);
+
+ *pp = p; /* Success. */
+
+ return (0);
+}
+
+/*
+ * __log_c_inregion --
+ * Check to see if the requested record is in the region's buffer.
+ */
+static int
+__log_c_inregion(logc, lsn, rlockp, last_lsn, hdr, pp)
+ DB_LOGC *logc;
+ DB_LSN *lsn, *last_lsn;
+ RLOCK *rlockp;
+ HDR *hdr;
+ u_int8_t **pp;
+{
+ DB_ENV *dbenv;
+ DB_LOG *dblp;
+ LOG *lp;
+ size_t len, nr;
+ u_int32_t b_disk, b_region;
+ int ret;
+ u_int8_t *p;
+
+ dbenv = logc->dbenv;
+ dblp = dbenv->lg_handle;
+ lp = ((DB_LOG *)logc->dbenv->lg_handle)->reginfo.primary;
+
+ ret = 0;
+ *pp = NULL;
+
+ /* If we haven't yet acquired the log region lock, do so. */
+ if (*rlockp == L_NONE) {
+ *rlockp = L_ACQUIRED;
+ R_LOCK(dbenv, &dblp->reginfo);
}
/*
- * Allocate temporary memory to hold the record.
+ * The routines to read from disk must avoid reading past the logical
+ * end of the log, so pass that information back to it.
*
- * XXX
- * We're calling malloc(3) with a region locked. This isn't
- * a good idea.
+ * Since they're reading directly from the disk, they must also avoid
+ * reading past the offset we've written out. If the log was
+ * truncated, it's possible that there are zeroes or garbage on
+ * disk after this offset, and the logical end of the log can
+ * come later than this point if the log buffer isn't empty.
*/
- if ((ret = __os_malloc(dbenv, len, NULL, &tbuf)) != 0)
- goto err1;
+ *last_lsn = lp->lsn;
+ if (last_lsn->offset > lp->w_off)
+ last_lsn->offset = lp->w_off;
/*
- * Read the record into the buffer. If read returns a short count,
- * there was an error or the rest of the record is in the in-memory
- * buffer. Note, the information may be garbage if we're in recovery,
- * so don't read past the end of the buffer's memory.
- *
- * Because the file may be pre-allocated, we have to make sure that
- * we're not reading past the information in the start of the in-memory
+ * Test to see if the requested LSN could be part of the region's
* buffer.
+ *
+ * During recovery, we read the log files getting the information to
+ * initialize the region. In that case, the region's lsn field will
+ * not yet have been filled in, use only the disk.
+ *
+ * The record must not start at a byte offset after the region buffer's
+ * end, since that means the request is for a record after the end of
+ * the log. Do this test even if the region's buffer is empty -- after
+ * recovery, the log files may continue past the declared end-of-log,
+ * and the disk reading routine will incorrectly attempt to read the
+ * remainder of the log.
+ *
+ * Otherwise, test to see if the region's buffer actually has what we
+ * want:
+ *
+ * The buffer must have some useful content.
+ * The record must be in the same file as the region's buffer and must
+ * start at a byte offset equal to or greater than the region's buffer.
+ */
+ if (IS_ZERO_LSN(lp->lsn))
+ return (0);
+ if (lsn->file > lp->lsn.file ||
+ (lsn->file == lp->lsn.file && lsn->offset >= lp->lsn.offset))
+ return (DB_NOTFOUND);
+ if (lp->b_off == 0)
+ return (0);
+ if (lsn->file < lp->f_lsn.file || lsn->offset < lp->f_lsn.offset)
+ return (0);
+
+ /*
+ * The current contents of the cursor's buffer will be useless for a
+ * future call -- trash it rather than try and make it look correct.
+ */
+ ZERO_LSN(logc->bp_lsn);
+
+ /*
+ * If the requested LSN is greater than the region buffer's first
+ * byte, we know the entire record is in the buffer.
+ *
+ * If the header check fails for any reason, it must be because the
+ * LSN is bogus. Fail hard.
*/
- if (nlsn.file == lp->lsn.file &&
- nlsn.offset + sizeof(HDR) + len > lp->w_off)
- nr = lp->w_off - (nlsn.offset + sizeof(HDR));
+ if (lsn->offset > lp->f_lsn.offset) {
+ p = dblp->bufp + (lsn->offset - lp->w_off);
+ memcpy(hdr, p, hdr->size);
+ if (__log_c_hdrchk(logc, hdr, NULL))
+ return (DB_NOTFOUND);
+ if (logc->bp_size <= hdr->len) {
+ len = ALIGN(hdr->len * 2, 128);
+ if ((ret =
+ __os_realloc(logc->dbenv, len, &logc->bp)) != 0)
+ return (ret);
+ logc->bp_size = (u_int32_t)len;
+ }
+ memcpy(logc->bp, p, hdr->len);
+ *pp = logc->bp;
+ return (0);
+ }
+
+ /*
+ * There's a partial record, that is, the requested record starts
+ * in a log file and finishes in the region buffer. We have to
+ * find out how many bytes of the record are in the region buffer
+ * so we can copy them out into the cursor buffer. First, check
+ * to see if the requested record is the only record in the region
+ * buffer, in which case we should copy the entire region buffer.
+ *
+ * Else, walk back through the region's buffer to find the first LSN
+ * after the record that crosses the buffer boundary -- we can detect
+ * that LSN, because its "prev" field will reference the record we
+ * want. The bytes we need to copy from the region buffer are the
+ * bytes up to the record we find. The bytes we'll need to allocate
+ * to hold the log record are the bytes between the two offsets.
+ */
+ b_disk = lp->w_off - lsn->offset;
+ if (lp->b_off <= lp->len)
+ b_region = (u_int32_t)lp->b_off;
else
- nr = len;
- if ((ret = __os_read(dblp->dbenv, &dblp->c_fh, tbuf, nr, &nr)) != 0) {
- fail = "read";
- goto err1;
+ for (p = dblp->bufp + (lp->b_off - lp->len);;) {
+ memcpy(hdr, p, hdr->size);
+ if (hdr->prev == lsn->offset) {
+ b_region = (u_int32_t)(p - dblp->bufp);
+ break;
+ }
+ p = dblp->bufp + (hdr->prev - lp->w_off);
+ }
+
+ /*
+ * If we don't have enough room for the record, we have to allocate
+ * space. We have to do it while holding the region lock, which is
+ * truly annoying, but there's no way around it. This call is why
+ * we allocate cursor buffer space when allocating the cursor instead
+ * of waiting.
+ */
+ if (logc->bp_size <= b_region + b_disk) {
+ len = ALIGN((b_region + b_disk) * 2, 128);
+ if ((ret = __os_realloc(logc->dbenv, len, &logc->bp)) != 0)
+ return (ret);
+ logc->bp_size = (u_int32_t)len;
}
- if (len - nr > lp->buffer_size)
- goto corrupt;
- if (nr != len) {
- if (lp->b_off < len - nr)
- goto corrupt;
-
- /* Get the rest of the record from the in-memory buffer. */
- memcpy((u_int8_t *)tbuf + nr, dblp->bufp, len - nr);
+
+ /* Copy the region's bytes to the end of the cursor's buffer. */
+ p = (logc->bp + logc->bp_size) - b_region;
+ memcpy(p, dblp->bufp, b_region);
+
+ /* Release the region lock. */
+ if (*rlockp == L_ACQUIRED) {
+ *rlockp = L_NONE;
+ R_UNLOCK(dbenv, &dblp->reginfo);
}
- /* Copy the record into the user's DBT. */
- if ((ret = __db_retcopy(NULL, dbt, tbuf, len,
- &dblp->c_dbt.data, &dblp->c_dbt.ulen)) != 0)
- goto err2;
- __os_free(tbuf, 0);
- tbuf = NULL;
+ /*
+ * Read the rest of the information from disk. Neither short reads
+ * nor EOF are acceptable, the bytes we want had better be there.
+ */
+ if (b_disk != 0) {
+ p -= b_disk;
+ nr = b_disk;
+ if ((ret = __log_c_io(
+ logc, lsn->file, lsn->offset, p, &nr, NULL)) != 0)
+ return (ret);
+ if (nr < b_disk)
+ return (__log_c_shortread(logc, 0));
+ }
-cksum: /*
- * If the user specified a partial record read, the checksum can't
- * match. It's not an obvious thing to do, but a user testing for
- * the length of a record might do it.
+ /* Copy the header information into the caller's structure. */
+ memcpy(hdr, p, hdr->size);
+
+ *pp = p;
+ return (0);
+}
+
+/*
+ * __log_c_ondisk --
+ * Read a record off disk.
+ */
+static int
+__log_c_ondisk(logc, lsn, last_lsn, flags, hdr, pp, eofp)
+ DB_LOGC *logc;
+ DB_LSN *lsn, *last_lsn;
+ int flags, *eofp;
+ HDR *hdr;
+ u_int8_t **pp;
+{
+ DB_ENV *dbenv;
+ size_t len, nr;
+ u_int32_t offset;
+ int ret;
+
+ dbenv = logc->dbenv;
+ *eofp = 0;
+
+ nr = hdr->size;
+ if ((ret =
+ __log_c_io(logc, lsn->file, lsn->offset, hdr, &nr, eofp)) != 0)
+ return (ret);
+ if (*eofp)
+ return (0);
+
+ /* If we read 0 bytes, assume we've hit EOF. */
+ if (nr == 0) {
+ *eofp = 1;
+ return (0);
+ }
+
+ /* Check the HDR. */
+ if ((ret = __log_c_hdrchk(logc, hdr, eofp)) != 0)
+ return (ret);
+ if (*eofp)
+ return (0);
+
+ /* Otherwise, we should have gotten the bytes we wanted. */
+ if (nr < hdr->size)
+ return (__log_c_shortread(logc, 0));
+
+ /*
+ * Regardless of how we return, the previous contents of the cursor's
+ * buffer are useless -- trash it.
*/
- if (!F_ISSET(dbt, DB_DBT_PARTIAL) &&
- hdr.cksum != __ham_func4(NULL, dbt->data, dbt->size)) {
- if (!silent)
- __db_err(dbenv, "log_get: checksum mismatch");
- goto corrupt;
+ ZERO_LSN(logc->bp_lsn);
+
+ /*
+ * Otherwise, we now (finally!) know how big the record is. (Maybe
+ * we should have just stuck the length of the record into the LSN!?)
+ * Make sure we have enough space.
+ */
+ if (logc->bp_size <= hdr->len) {
+ len = ALIGN(hdr->len * 2, 128);
+ if ((ret = __os_realloc(dbenv, len, &logc->bp)) != 0)
+ return (ret);
+ logc->bp_size = (u_int32_t)len;
}
- /* Update the cursor and the return lsn. */
- dblp->c_off = hdr.prev;
- dblp->c_len = hdr.len;
- dblp->c_lsn = nlsn;
- *alsn = nlsn;
+ /*
+ * If we're moving forward in the log file, read this record in at the
+ * beginning of the buffer. Otherwise, read this record in at the end
+ * of the buffer, making sure we don't try and read before the start
+ * of the file. (We prefer positioning at the end because transaction
+ * aborts use DB_SET to move backward through the log and we might get
+ * lucky.)
+ *
+ * Read a buffer's worth, without reading past the logical EOF. The
+ * last_lsn may be a zero LSN, but that's OK, the test works anyway.
+ */
+ if (flags == DB_FIRST || flags == DB_NEXT)
+ offset = lsn->offset;
+ else if (lsn->offset + hdr->len < logc->bp_size)
+ offset = 0;
+ else
+ offset = (lsn->offset + hdr->len) - logc->bp_size;
+
+ nr = logc->bp_size;
+ if (lsn->file == last_lsn->file && offset + nr >= last_lsn->offset)
+ nr = last_lsn->offset - offset;
+
+ if ((ret =
+ __log_c_io(logc, lsn->file, offset, logc->bp, &nr, eofp)) != 0)
+ return (ret);
+
+ /*
+ * We should have at least gotten the bytes up-to-and-including the
+ * record we're reading.
+ */
+ if (nr < (lsn->offset + hdr->len) - offset)
+ return (__log_c_shortread(logc, 1));
+
+ /* Set up the return information. */
+ logc->bp_rlen = (u_int32_t)nr;
+ logc->bp_lsn.file = lsn->file;
+ logc->bp_lsn.offset = offset;
+ *pp = logc->bp + (lsn->offset - offset);
+
+ return (0);
+}
+
+/*
+ * __log_c_hdrchk --
+ *
+ * Check for corrupted HDRs before we use them to allocate memory or find
+ * records.
+ *
+ * If the log files were pre-allocated, a zero-filled HDR structure is the
+ * logical file end. However, we can see buffers filled with 0's during
+ * recovery, too (because multiple log buffers were written asynchronously,
+ * and one made it to disk before a different one that logically precedes
+ * it in the log file).
+ *
+ * XXX
+ * I think there's a potential pre-allocation recovery flaw here -- if we
+ * fail to write a buffer at the end of a log file (by scheduling its
+ * write asynchronously, and it never making it to disk), then succeed in
+ * writing a log file block to a subsequent log file, I don't think we will
+ * detect that the buffer of 0's should have marked the end of the log files
+ * during recovery. I think we may need to always write some garbage after
+ * each block write if we pre-allocate log files. (At the moment, we do not
+ * pre-allocate, so this isn't currently an issue.)
+ *
+ * Check for impossibly large records. The malloc should fail later, but we
+ * have customers that run mallocs that treat all allocation failures as fatal
+ * errors.
+ *
+ * Note that none of this is necessarily something awful happening. We let
+ * the application hand us any LSN they want, and it could be a pointer into
+ * the middle of a log record, there's no way to tell.
+ */
+static int
+__log_c_hdrchk(logc, hdr, eofp)
+ DB_LOGC *logc;
+ HDR *hdr;
+ int *eofp;
+{
+ DB_ENV *dbenv;
+ int ret;
+
+ dbenv = logc->dbenv;
+
+ /* Sanity check the log record's size. */
+ if (hdr->len <= hdr->size)
+ goto err;
+ /*
+ * If the cursor's max-record value isn't yet set, it means we aren't
+ * reading these records from a log file and no check is necessary.
+ */
+ if (logc->bp_maxrec != 0 && hdr->len > logc->bp_maxrec) {
+ /*
+ * If we fail the check, there's the pathological case that
+ * we're reading the last file, it's growing, and our initial
+ * check information was wrong. Get it again, to be sure.
+ */
+ if ((ret = __log_c_set_maxrec(logc, NULL)) != 0) {
+ __db_err(dbenv, "DB_LOGC->get: %s", db_strerror(ret));
+ return (ret);
+ }
+ if (logc->bp_maxrec != 0 && hdr->len > logc->bp_maxrec)
+ goto err;
+ }
+
+ if (eofp != NULL) {
+ if (hdr->prev == 0 && hdr->chksum[0] == 0 && hdr->len == 0) {
+ *eofp = 1;
+ return (0);
+ }
+ *eofp = 0;
+ }
return (0);
-corrupt:/*
- * This is the catchall -- for some reason we didn't find enough
- * information or it wasn't reasonable information, and it wasn't
- * because a system call failed.
+err: if (!F_ISSET(logc, DB_LOG_SILENT_ERR))
+ __db_err(dbenv, "DB_LOGC->get: invalid log record header");
+ return (EIO);
+}
+
+/*
+ * __log_c_io --
+ * Read records from a log file.
+ */
+static int
+__log_c_io(logc, fnum, offset, p, nrp, eofp)
+ DB_LOGC *logc;
+ u_int32_t fnum, offset;
+ void *p;
+ size_t *nrp;
+ int *eofp;
+{
+ DB_ENV *dbenv;
+ DB_LOG *dblp;
+ int ret;
+ char *np;
+
+ dbenv = logc->dbenv;
+ dblp = dbenv->lg_handle;
+
+ /*
+ * If we've switched files, discard the current file handle and acquire
+ * a new one.
*/
- ret = EIO;
- fail = "read";
+ if (F_ISSET(logc->c_fh, DB_FH_VALID) && logc->bp_lsn.file != fnum)
+ if ((ret = __os_closehandle(dbenv, logc->c_fh)) != 0)
+ return (ret);
+ if (!F_ISSET(logc->c_fh, DB_FH_VALID)) {
+ if ((ret = __log_name(dblp, fnum,
+ &np, logc->c_fh, DB_OSO_RDONLY | DB_OSO_SEQ)) != 0) {
+ /*
+ * If we're allowed to return EOF, assume that's the
+ * problem, set the EOF status flag and return 0.
+ */
+ if (eofp != NULL) {
+ *eofp = 1;
+ ret = 0;
+ } else if (!F_ISSET(logc, DB_LOG_SILENT_ERR))
+ __db_err(dbenv, "DB_LOGC->get: %s: %s",
+ np, db_strerror(ret));
+ __os_free(dbenv, np);
+ return (ret);
+ }
-err1: if (!silent) {
- if (fail == NULL)
- __db_err(dbenv, "log_get: %s", db_strerror(ret));
- else
+ if ((ret = __log_c_set_maxrec(logc, np)) != 0) {
__db_err(dbenv,
- "log_get: %s: %s", fail, db_strerror(ret));
+ "DB_LOGC->get: %s: %s", np, db_strerror(ret));
+ __os_free(dbenv, np);
+ return (ret);
+ }
+ __os_free(dbenv, np);
}
-err2: if (np != NULL)
- __os_freestr(np);
- if (tbuf != NULL)
- __os_free(tbuf, 0);
- return (ret);
+ /* Seek to the record's offset. */
+ if ((ret = __os_seek(dbenv,
+ logc->c_fh, 0, 0, offset, 0, DB_OS_SEEK_SET)) != 0) {
+ if (!F_ISSET(logc, DB_LOG_SILENT_ERR))
+ __db_err(dbenv,
+ "DB_LOGC->get: seek: %s", db_strerror(ret));
+ return (ret);
+ }
+
+ /* Read the data. */
+ if ((ret = __os_read(dbenv, logc->c_fh, p, *nrp, nrp)) != 0) {
+ if (!F_ISSET(logc, DB_LOG_SILENT_ERR))
+ __db_err(dbenv,
+ "DB_LOGC->get: read: %s", db_strerror(ret));
+ return (ret);
+ }
+
+ return (0);
+}
+
+/*
+ * __log_c_shortread --
+ * Read was short -- return a consistent error message and error.
+ */
+static int
+__log_c_shortread(logc, silent)
+ DB_LOGC *logc;
+ int silent;
+{
+ if (!silent || !F_ISSET(logc, DB_LOG_SILENT_ERR))
+ __db_err(logc->dbenv, "DB_LOGC->get: short read");
+ return (EIO);
+}
+
+/*
+ * __log_c_set_maxrec --
+ * Bound the maximum log record size in a log file.
+ */
+static int
+__log_c_set_maxrec(logc, np)
+ DB_LOGC *logc;
+ char *np;
+{
+ DB_ENV *dbenv;
+ DB_LOG *dblp;
+ LOG *lp;
+ u_int32_t mbytes, bytes;
+ int ret;
+
+ dbenv = logc->dbenv;
+ dblp = dbenv->lg_handle;
+
+ /*
+ * We don't want to try and allocate huge chunks of memory because
+ * applications with error-checking malloc's often consider that a
+ * hard failure. If we're about to look at a corrupted record with
+ * a bizarre size, we need to know before trying to allocate space
+ * to hold it. We could read the persistent data at the beginning
+ * of the file but that's hard -- we may have to decrypt it, checksum
+ * it and so on. Stat the file instead.
+ */
+ if ((ret =
+ __os_ioinfo(dbenv, np, logc->c_fh, &mbytes, &bytes, NULL)) != 0)
+ return (ret);
+
+ logc->bp_maxrec = mbytes * MEGABYTE + bytes;
+
+ /*
+ * If reading from the log file currently being written, we could get
+ * an incorrect size, that is, if the cursor was opened on the file
+ * when it had only a few hundred bytes, and then the cursor used to
+ * move forward in the file, after more log records were written, the
+ * original stat value would be wrong. Use the maximum of the current
+ * log file size and the size of the buffer -- that should represent
+ * the max of any log record currently in the file.
+ *
+ * The log buffer size is set when the environment is opened and never
+ * changed, we don't need a lock on it.
+ */
+ lp = dblp->reginfo.primary;
+ logc->bp_maxrec += lp->buffer_size;
+
+ return (0);
}
diff --git a/bdb/log/log_method.c b/bdb/log/log_method.c
index 883f485d891..42adaf11c6c 100644
--- a/bdb/log/log_method.c
+++ b/bdb/log/log_method.c
@@ -1,38 +1,39 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1999, 2000
+ * Copyright (c) 1999-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: log_method.c,v 11.14 2000/11/30 00:58:40 ubell Exp $";
+static const char revid[] = "$Id: log_method.c,v 11.32 2002/05/30 22:16:47 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>
+#ifdef HAVE_RPC
+#include <rpc/rpc.h>
+#endif
+
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#endif
-#ifdef HAVE_RPC
-#include "db_server.h"
-#endif
-
#include "db_int.h"
-#include "log.h"
+#include "dbinc/log.h"
#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
+#include "dbinc_auto/db_server.h"
+#include "dbinc_auto/rpc_client_ext.h"
#endif
-static int __log_set_lg_max __P((DB_ENV *, u_int32_t));
static int __log_set_lg_bsize __P((DB_ENV *, u_int32_t));
static int __log_set_lg_dir __P((DB_ENV *, const char *));
+static int __log_set_lg_max __P((DB_ENV *, u_int32_t));
+static int __log_set_lg_regionmax __P((DB_ENV *, u_int32_t));
/*
* __log_dbenv_create --
@@ -44,13 +45,16 @@ void
__log_dbenv_create(dbenv)
DB_ENV *dbenv;
{
- dbenv->lg_bsize = LG_BSIZE_DEFAULT;
- dbenv->set_lg_bsize = __log_set_lg_bsize;
+ /*
+ * !!!
+ * Our caller has not yet had the opportunity to reset the panic
+ * state or turn off mutex locking, and so we can neither check
+ * the panic state nor acquire a mutex in the DB_ENV create path.
+ */
- dbenv->lg_max = LG_MAX_DEFAULT;
- dbenv->set_lg_max = __log_set_lg_max;
+ dbenv->lg_bsize = LG_BSIZE_DEFAULT;
+ dbenv->lg_regionmax = LG_BASE_REGION_SIZE;
- dbenv->set_lg_dir = __log_set_lg_dir;
#ifdef HAVE_RPC
/*
* If we have a client, overwrite what we just setup to
@@ -58,10 +62,29 @@ __log_dbenv_create(dbenv)
*/
if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) {
dbenv->set_lg_bsize = __dbcl_set_lg_bsize;
- dbenv->set_lg_max = __dbcl_set_lg_max;
dbenv->set_lg_dir = __dbcl_set_lg_dir;
- }
+ dbenv->set_lg_max = __dbcl_set_lg_max;
+ dbenv->set_lg_regionmax = __dbcl_set_lg_regionmax;
+ dbenv->log_archive = __dbcl_log_archive;
+ dbenv->log_cursor = __dbcl_log_cursor;
+ dbenv->log_file = __dbcl_log_file;
+ dbenv->log_flush = __dbcl_log_flush;
+ dbenv->log_put = __dbcl_log_put;
+ dbenv->log_stat = __dbcl_log_stat;
+ } else
#endif
+ {
+ dbenv->set_lg_bsize = __log_set_lg_bsize;
+ dbenv->set_lg_dir = __log_set_lg_dir;
+ dbenv->set_lg_max = __log_set_lg_max;
+ dbenv->set_lg_regionmax = __log_set_lg_regionmax;
+ dbenv->log_archive = __log_archive;
+ dbenv->log_cursor = __log_cursor;
+ dbenv->log_file = __log_file;
+ dbenv->log_flush = __log_flush;
+ dbenv->log_put = __log_put;
+ dbenv->log_stat = __log_stat;
+ }
}
/*
@@ -73,10 +96,16 @@ __log_set_lg_bsize(dbenv, lg_bsize)
DB_ENV *dbenv;
u_int32_t lg_bsize;
{
+ u_int32_t lg_max;
+
ENV_ILLEGAL_AFTER_OPEN(dbenv, "set_lg_bsize");
+ if (lg_bsize == 0)
+ lg_bsize = LG_BSIZE_DEFAULT;
+
/* Let's not be silly. */
- if (lg_bsize > dbenv->lg_max / 4) {
+ lg_max = dbenv->lg_size == 0 ? LG_MAX_DEFAULT : dbenv->lg_size;
+ if (lg_bsize > lg_max / 4) {
__db_err(dbenv, "log buffer size must be <= log file size / 4");
return (EINVAL);
}
@@ -94,15 +123,53 @@ __log_set_lg_max(dbenv, lg_max)
DB_ENV *dbenv;
u_int32_t lg_max;
{
- ENV_ILLEGAL_AFTER_OPEN(dbenv, "set_lg_max");
+ LOG *region;
+
+ if (lg_max == 0)
+ lg_max = LG_MAX_DEFAULT;
+
+ if (F_ISSET(dbenv, DB_ENV_OPEN_CALLED)) {
+ if (!LOGGING_ON(dbenv))
+ return (__db_env_config(
+ dbenv, "set_lg_max", DB_INIT_LOG));
+ region = ((DB_LOG *)dbenv->lg_handle)->reginfo.primary;
+
+ /* Let's not be silly. */
+ if (lg_max < region->buffer_size * 4)
+ goto err;
+ region->log_nsize = lg_max;
+ } else {
+ /* Let's not be silly. */
+ if (lg_max < dbenv->lg_bsize * 4)
+ goto err;
+ dbenv->lg_size = lg_max;
+ }
+
+ return (0);
+
+err: __db_err(dbenv, "log file size must be >= log buffer size * 4");
+ return (EINVAL);
+}
+
+/*
+ * __log_set_lg_regionmax --
+ * Set the region size.
+ */
+static int
+__log_set_lg_regionmax(dbenv, lg_regionmax)
+ DB_ENV *dbenv;
+ u_int32_t lg_regionmax;
+{
+ ENV_ILLEGAL_AFTER_OPEN(dbenv, "set_lg_regionmax");
/* Let's not be silly. */
- if (lg_max < dbenv->lg_bsize * 4) {
- __db_err(dbenv, "log file size must be >= log buffer size * 4");
+ if (lg_regionmax != 0 && lg_regionmax < LG_BASE_REGION_SIZE) {
+ __db_err(dbenv,
+ "log file size must be >= %d", LG_BASE_REGION_SIZE);
return (EINVAL);
}
- dbenv->lg_max = lg_max;
+ dbenv->lg_regionmax = lg_regionmax;
return (0);
}
@@ -116,6 +183,6 @@ __log_set_lg_dir(dbenv, dir)
const char *dir;
{
if (dbenv->db_log_dir != NULL)
- __os_freestr(dbenv->db_log_dir);
+ __os_free(dbenv, dbenv->db_log_dir);
return (__os_strdup(dbenv, dir, &dbenv->db_log_dir));
}
diff --git a/bdb/log/log_put.c b/bdb/log/log_put.c
index c61f53e6c3d..bf6de2b0f7b 100644
--- a/bdb/log/log_put.c
+++ b/bdb/log/log_put.c
@@ -1,13 +1,13 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: log_put.c,v 11.26 2000/11/30 00:58:40 ubell Exp $";
+static const char revid[] = "$Id: log_put.c,v 11.112 2002/09/10 02:39:26 bostic Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -29,109 +29,424 @@ static const char revid[] = "$Id: log_put.c,v 11.26 2000/11/30 00:58:40 ubell Ex
#include <unistd.h>
#endif
-#ifdef HAVE_RPC
-#include "db_server.h"
-#endif
-
#include "db_int.h"
-#include "db_page.h"
-#include "log.h"
-#include "hash.h"
-#include "clib_ext.h"
-
-#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
-#endif
+#include "dbinc/crypto.h"
+#include "dbinc/hmac.h"
+#include "dbinc/log.h"
+#include "dbinc/rep.h"
+#include "dbinc/txn.h"
+static int __log_encrypt_record __P((DB_ENV *, DBT *, HDR *, u_int32_t));
static int __log_fill __P((DB_LOG *, DB_LSN *, void *, u_int32_t));
-static int __log_flush __P((DB_LOG *, const DB_LSN *));
+static int __log_flush_commit __P((DB_ENV *, const DB_LSN *, u_int32_t));
+static int __log_flush_int __P((DB_LOG *, const DB_LSN *, int));
static int __log_newfh __P((DB_LOG *));
-static int __log_putr __P((DB_LOG *, DB_LSN *, const DBT *, u_int32_t));
-static int __log_open_files __P((DB_ENV *));
+static int __log_put_next __P((DB_ENV *,
+ DB_LSN *, const DBT *, HDR *, DB_LSN *));
+static int __log_putr __P((DB_LOG *,
+ DB_LSN *, const DBT *, u_int32_t, HDR *));
static int __log_write __P((DB_LOG *, void *, u_int32_t));
/*
- * log_put --
- * Write a log record.
+ * __log_put --
+ * Write a log record. This is the public interface, DB_ENV->log_put.
+ *
+ * PUBLIC: int __log_put __P((DB_ENV *, DB_LSN *, const DBT *, u_int32_t));
*/
int
-log_put(dbenv, lsn, dbt, flags)
+__log_put(dbenv, lsnp, udbt, flags)
DB_ENV *dbenv;
- DB_LSN *lsn;
- const DBT *dbt;
+ DB_LSN *lsnp;
+ const DBT *udbt;
u_int32_t flags;
{
+ DB_CIPHER *db_cipher;
+ DBT *dbt, t;
DB_LOG *dblp;
- int ret;
-
-#ifdef HAVE_RPC
- if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
- return (__dbcl_log_put(dbenv, lsn, dbt, flags));
-#endif
+ DB_LSN lsn, old_lsn;
+ HDR hdr;
+ LOG *lp;
+ u_int32_t do_flush, op, writeonly;
+ int lock_held, need_free, ret;
+ u_int8_t *key;
PANIC_CHECK(dbenv);
- ENV_REQUIRES_CONFIG(dbenv, dbenv->lg_handle, DB_INIT_LOG);
+ ENV_REQUIRES_CONFIG(dbenv,
+ dbenv->lg_handle, "DB_ENV->log_put", DB_INIT_LOG);
/* Validate arguments. */
- if (flags != 0 && flags != DB_CHECKPOINT &&
- flags != DB_CURLSN && flags != DB_FLUSH)
- return (__db_ferr(dbenv, "log_put", 0));
+ op = DB_OPFLAGS_MASK & flags;
+ if (op != 0 && op != DB_COMMIT)
+ return (__db_ferr(dbenv, "DB_ENV->log_put", 0));
+
+ /* Check for allowed bit-flags. */
+ if (LF_ISSET(~(DB_OPFLAGS_MASK |
+ DB_FLUSH | DB_NOCOPY | DB_PERMANENT | DB_WRNOSYNC)))
+ return (__db_ferr(dbenv, "DB_ENV->log_put", 0));
+
+ /* DB_WRNOSYNC and DB_FLUSH are mutually exclusive. */
+ if (LF_ISSET(DB_WRNOSYNC) && LF_ISSET(DB_FLUSH))
+ return (__db_ferr(dbenv, "DB_ENV->log_put", 1));
+
+ /* Replication clients should never write log records. */
+ if (F_ISSET(dbenv, DB_ENV_REP_CLIENT) ||
+ F_ISSET(dbenv, DB_ENV_REP_LOGSONLY)) {
+ __db_err(dbenv,
+ "DB_ENV->log_put is illegal on replication clients");
+ return (EINVAL);
+ }
dblp = dbenv->lg_handle;
+ lp = dblp->reginfo.primary;
+ db_cipher = dbenv->crypto_handle;
+ dbt = &t;
+ t = *udbt;
+ lock_held = need_free = 0;
+ do_flush = LF_ISSET(DB_FLUSH);
+ writeonly = LF_ISSET(DB_WRNOSYNC);
+
+ /*
+ * If we are coming from the logging code, we use an internal
+ * flag, DB_NOCOPY, because we know we can overwrite/encrypt
+ * the log record in place. Otherwise, if a user called log_put
+ * then we must copy it to new memory so that we know we can
+ * write it.
+ *
+ * We also must copy it to new memory if we are a replication
+ * master so that we retain an unencrypted copy of the log
+ * record to send to clients.
+ */
+ if (!LF_ISSET(DB_NOCOPY) || F_ISSET(dbenv, DB_ENV_REP_MASTER)) {
+ if (CRYPTO_ON(dbenv))
+ t.size += db_cipher->adj_size(udbt->size);
+ if ((ret = __os_calloc(dbenv, 1, t.size, &t.data)) != 0)
+ goto err;
+ need_free = 1;
+ memcpy(t.data, udbt->data, udbt->size);
+ }
+ if ((ret = __log_encrypt_record(dbenv, dbt, &hdr, udbt->size)) != 0)
+ goto err;
+ if (CRYPTO_ON(dbenv))
+ key = db_cipher->mac_key;
+ else
+ key = NULL;
+ /* Otherwise, we actually have a record to put. Put it. */
+
+ /* Before we grab the region lock, calculate the record's checksum. */
+ __db_chksum(dbt->data, dbt->size, key, hdr.chksum);
+
R_LOCK(dbenv, &dblp->reginfo);
- ret = __log_put(dbenv, lsn, dbt, flags);
- R_UNLOCK(dbenv, &dblp->reginfo);
+ lock_held = 1;
+
+ ZERO_LSN(old_lsn);
+ if ((ret = __log_put_next(dbenv, &lsn, dbt, &hdr, &old_lsn)) != 0)
+ goto err;
+
+ if (F_ISSET(dbenv, DB_ENV_REP_MASTER)) {
+ /*
+ * Replication masters need to drop the lock to send
+ * messages, but we want to drop and reacquire it a minimal
+ * number of times.
+ */
+ R_UNLOCK(dbenv, &dblp->reginfo);
+ lock_held = 0;
+
+ /*
+ * If we changed files and we're in a replicated
+ * environment, we need to inform our clients now that
+ * we've dropped the region lock.
+ *
+ * Note that a failed NEWFILE send is a dropped message
+ * that our client can handle, so we can ignore it. It's
+ * possible that the record we already put is a commit, so
+ * we don't just want to return failure.
+ */
+ if (!IS_ZERO_LSN(old_lsn))
+ (void)__rep_send_message(dbenv,
+ DB_EID_BROADCAST, REP_NEWFILE, &old_lsn, NULL, 0);
+
+ /*
+ * Then send the log record itself on to our clients.
+ *
+ * If the send fails and we're a commit or checkpoint,
+ * there's nothing we can do; the record's in the log.
+ * Flush it, even if we're running with TXN_NOSYNC, on the
+ * grounds that it should be in durable form somewhere.
+ */
+ /*
+ * !!!
+ * In the crypto case, we MUST send the udbt, not the
+ * now-encrypted dbt. Clients have no way to decrypt
+ * without the header.
+ */
+ if ((__rep_send_message(dbenv,
+ DB_EID_BROADCAST, REP_LOG, &lsn, udbt, flags) != 0) &&
+ LF_ISSET(DB_PERMANENT))
+ do_flush |= DB_FLUSH;
+ }
+
+ /*
+ * If needed, do a flush. Note that failures at this point
+ * are only permissible if we know we haven't written a commit
+ * record; __log_flush_commit is responsible for enforcing this.
+ *
+ * If a flush is not needed, see if WRITE_NOSYNC was set and we
+ * need to write out the log buffer.
+ */
+ if (do_flush || writeonly) {
+ if (!lock_held) {
+ R_LOCK(dbenv, &dblp->reginfo);
+ lock_held = 1;
+ }
+ if (do_flush)
+ ret = __log_flush_commit(dbenv, &lsn, flags);
+ else if (lp->b_off != 0)
+ /*
+ * writeonly: if there's anything in the current
+ * log buffer, we need to write it out.
+ */
+ if ((ret = __log_write(dblp,
+ dblp->bufp, (u_int32_t)lp->b_off)) == 0)
+ lp->b_off = 0;
+ }
+
+err: if (lock_held)
+ R_UNLOCK(dbenv, &dblp->reginfo);
+ if (need_free)
+ __os_free(dbenv, dbt->data);
+
+ if (ret == 0)
+ *lsnp = lsn;
+
return (ret);
}
/*
- * __log_put --
- * Write a log record; internal version.
+ * __log_txn_lsn --
*
- * PUBLIC: int __log_put __P((DB_ENV *, DB_LSN *, const DBT *, u_int32_t));
+ * PUBLIC: void __log_txn_lsn
+ * PUBLIC: __P((DB_ENV *, DB_LSN *, u_int32_t *, u_int32_t *));
*/
-int
-__log_put(dbenv, lsn, dbt, flags)
+void
+__log_txn_lsn(dbenv, lsnp, mbytesp, bytesp)
+ DB_ENV *dbenv;
+ DB_LSN *lsnp;
+ u_int32_t *mbytesp, *bytesp;
+{
+ DB_LOG *dblp;
+ LOG *lp;
+
+ dblp = dbenv->lg_handle;
+ lp = dblp->reginfo.primary;
+
+ R_LOCK(dbenv, &dblp->reginfo);
+
+ /*
+ * We are trying to get the LSN of the last entry in the log. We use
+ * this in two places: 1) DB_ENV->txn_checkpoint uses it as a first
+ * value when trying to compute an LSN such that all transactions begun
+ * before it are complete. 2) DB_ENV->txn_begin uses it as the
+ * begin_lsn.
+ *
+ * Typically, it's easy to get the last written LSN, you simply look
+ * at the current log pointer and back up the number of bytes of the
+ * last log record. However, if the last thing we did was write the
+ * log header of a new log file, then, this doesn't work, so we return
+ * the first log record that will be written in this new file.
+ */
+ *lsnp = lp->lsn;
+ if (lp->lsn.offset > lp->len)
+ lsnp->offset -= lp->len;
+
+ /*
+ * Since we're holding the log region lock, return the bytes put into
+ * the log since the last checkpoint, transaction checkpoint needs it.
+ *
+ * We add the current buffer offset so as to count bytes that have not
+ * yet been written, but are sitting in the log buffer.
+ */
+ if (mbytesp != NULL) {
+ *mbytesp = lp->stat.st_wc_mbytes;
+ *bytesp = (u_int32_t)(lp->stat.st_wc_bytes + lp->b_off);
+ }
+
+ R_UNLOCK(dbenv, &dblp->reginfo);
+}
+
+/*
+ * __log_put_next --
+ * Put the given record as the next in the log, wherever that may
+ * turn out to be.
+ */
+static int
+__log_put_next(dbenv, lsn, dbt, hdr, old_lsnp)
DB_ENV *dbenv;
DB_LSN *lsn;
const DBT *dbt;
- u_int32_t flags;
+ HDR *hdr;
+ DB_LSN *old_lsnp;
{
- DBT t;
DB_LOG *dblp;
+ DB_LSN old_lsn;
LOG *lp;
- u_int32_t lastoff;
- int ret;
+ int newfile, ret;
dblp = dbenv->lg_handle;
lp = dblp->reginfo.primary;
/*
- * If the application just wants to know where we are, fill in
- * the information. Currently used by the transaction manager
- * to avoid writing TXN_begin records.
+ * Save a copy of lp->lsn before we might decide to switch log
+ * files and change it. If we do switch log files, and we're
+ * doing replication, we'll need to tell our clients about the
+ * switch, and they need to receive a NEWFILE message
+ * with this "would-be" LSN in order to know they're not
+ * missing any log records.
*/
- if (flags == DB_CURLSN) {
- lsn->file = lp->lsn.file;
- lsn->offset = lp->lsn.offset;
- return (0);
- }
+ old_lsn = lp->lsn;
+ newfile = 0;
- /* If this information won't fit in the file, swap files. */
- if (lp->lsn.offset + sizeof(HDR) + dbt->size > lp->persist.lg_max) {
- if (sizeof(HDR) +
- sizeof(LOGP) + dbt->size > lp->persist.lg_max) {
+ /*
+ * If this information won't fit in the file, or if we're a
+ * replication client environment and have been told to do so,
+ * swap files.
+ */
+ if (lp->lsn.offset == 0 ||
+ lp->lsn.offset + hdr->size + dbt->size > lp->log_size) {
+ if (hdr->size + sizeof(LOGP) + dbt->size > lp->log_size) {
__db_err(dbenv,
- "log_put: record larger than maximum file size");
+ "DB_ENV->log_put: record larger than maximum file size");
return (EINVAL);
}
- /* Flush the log. */
- if ((ret = __log_flush(dblp, NULL)) != 0)
+ if ((ret = __log_newfile(dblp, NULL)) != 0)
return (ret);
/*
+ * Flag that we switched files, in case we're a master
+ * and need to send this information to our clients.
+ * We postpone doing the actual send until we can
+ * safely release the log region lock and are doing so
+ * anyway.
+ */
+ newfile = 1;
+
+ if (dbenv->db_noticecall != NULL)
+ dbenv->db_noticecall(dbenv, DB_NOTICE_LOGFILE_CHANGED);
+ }
+
+ /*
+ * The offset into the log file at this point is the LSN where
+ * we're about to put this record, and is the LSN the caller wants.
+ */
+ *lsn = lp->lsn;
+
+ /* If we switched log files, let our caller know where. */
+ if (newfile)
+ *old_lsnp = old_lsn;
+
+ /* Actually put the record. */
+ return (__log_putr(dblp, lsn, dbt, lp->lsn.offset - lp->len, hdr));
+}
+
+/*
+ * __log_flush_commit --
+ * Flush a record for which the DB_FLUSH flag to log_put has been set.
+ */
+static int
+__log_flush_commit(dbenv, lsnp, flags)
+ DB_ENV *dbenv;
+ const DB_LSN *lsnp;
+ u_int32_t flags;
+{
+ DB_LOG *dblp;
+ DB_LSN flush_lsn;
+ LOG *lp;
+ int ret;
+ u_int32_t op;
+
+ dblp = dbenv->lg_handle;
+ lp = dblp->reginfo.primary;
+ flush_lsn = *lsnp;
+ op = DB_OPFLAGS_MASK & flags;
+
+ if ((ret = __log_flush_int(dblp, &flush_lsn, 1)) == 0)
+ return (0);
+
+ /*
+ * If a flush supporting a transaction commit fails, we must abort the
+ * transaction. (If we aren't doing a commit, return the failure; if
+ * the commit we care about made it to disk successfully, we just
+ * ignore the failure, because there's no way to undo the commit.)
+ */
+ if (op != DB_COMMIT)
+ return (ret);
+
+ if (flush_lsn.file != lp->lsn.file || flush_lsn.offset < lp->w_off)
+ return (0);
+
+ /*
+ * Else, make sure that the commit record does not get out after we
+ * abort the transaction. Do this by overwriting the commit record
+ * in the buffer. (Note that other commits in this buffer will wait
+ * until a successful write happens, we do not wake them.) We
+ * point at the right part of the buffer and write an abort record
+ * over the commit. We must then try and flush the buffer again,
+ * since the interesting part of the buffer may have actually made
+ * it out to disk before there was a failure, we can't know for sure.
+ */
+ if (__txn_force_abort(dbenv,
+ dblp->bufp + flush_lsn.offset - lp->w_off) == 0)
+ (void)__log_flush_int(dblp, &flush_lsn, 0);
+
+ return (ret);
+}
+
+/*
+ * __log_newfile --
+ * Initialize and switch to a new log file. (Note that this is
+ * called both when no log yet exists and when we fill a log file.)
+ *
+ * PUBLIC: int __log_newfile __P((DB_LOG *, DB_LSN *));
+ */
+int
+__log_newfile(dblp, lsnp)
+ DB_LOG *dblp;
+ DB_LSN *lsnp;
+{
+ DB_CIPHER *db_cipher;
+ DB_ENV *dbenv;
+ DB_LSN lsn;
+ DBT t;
+ HDR hdr;
+ LOG *lp;
+ int need_free, ret;
+ u_int32_t lastoff;
+ size_t tsize;
+ u_int8_t *tmp;
+
+ dbenv = dblp->dbenv;
+ lp = dblp->reginfo.primary;
+
+ /* If we're not at the beginning of a file already, start a new one. */
+ if (lp->lsn.offset != 0) {
+ /*
+ * Flush the log so this file is out and can be closed. We
+ * cannot release the region lock here because we need to
+ * protect the end of the file while we switch. In
+ * particular, a thread with a smaller record than ours
+ * could detect that there is space in the log. Even
+ * blocking that event by declaring the file full would
+ * require all threads to wait here so that the lsn.file
+ * can be moved ahead after the flush completes. This
+ * probably can be changed if we had an lsn for the
+ * previous file and one for the current, but it does not
+ * seem like this would get much more throughput, if any.
+ */
+ if ((ret = __log_flush_int(dblp, NULL, 0)) != 0)
+ return (ret);
+
+ DB_ASSERT(lp->b_off == 0);
+ /*
* Save the last known offset from the previous file, we'll
* need it to initialize the persistent header information.
*/
@@ -143,78 +458,50 @@ __log_put(dbenv, lsn, dbt, flags)
/* Reset the file write offset. */
lp->w_off = 0;
-
- if (dbenv->db_noticecall != NULL)
- dbenv->db_noticecall(dbenv, DB_NOTICE_LOGFILE_CHANGED);
} else
lastoff = 0;
- /* Initialize the LSN information returned to the user. */
- lsn->file = lp->lsn.file;
- lsn->offset = lp->lsn.offset;
-
/*
* Insert persistent information as the first record in every file.
* Note that the previous length is wrong for the very first record
* of the log, but that's okay, we check for it during retrieval.
*/
- if (lp->lsn.offset == 0) {
- t.data = &lp->persist;
- t.size = sizeof(LOGP);
- if ((ret = __log_putr(dblp, lsn,
- &t, lastoff == 0 ? 0 : lastoff - lp->len)) != 0)
- return (ret);
+ DB_ASSERT(lp->b_off == 0);
- /*
- * Record files open in this log.
- * If we are recovering then we are in the
- * process of outputting the files, don't do
- * it again.
- */
- if (!F_ISSET(dblp, DBLOG_RECOVER) &&
- (ret = __log_open_files(dbenv)) != 0)
- return (ret);
-
- /* Update the LSN information returned to the user. */
- lsn->file = lp->lsn.file;
- lsn->offset = lp->lsn.offset;
- }
+ memset(&t, 0, sizeof(t));
+ memset(&hdr, 0, sizeof(HDR));
- /* Write the application's log record. */
- if ((ret = __log_putr(dblp, lsn, dbt, lp->lsn.offset - lp->len)) != 0)
+ need_free = 0;
+ tsize = sizeof(LOGP);
+ db_cipher = dbenv->crypto_handle;
+ if (CRYPTO_ON(dbenv))
+ tsize += db_cipher->adj_size(tsize);
+ if ((ret = __os_calloc(dbenv, 1, tsize, &tmp)) != 0)
return (ret);
+ lp->persist.log_size = lp->log_size = lp->log_nsize;
+ memcpy(tmp, &lp->persist, sizeof(LOGP));
+ t.data = tmp;
+ t.size = (u_int32_t)tsize;
+ need_free = 1;
- /*
- * On a checkpoint, we:
- * Put out the checkpoint record (above).
- * Save the LSN of the checkpoint in the shared region.
- * Append the set of file name information into the log.
- */
- if (flags == DB_CHECKPOINT) {
- lp->chkpt_lsn = *lsn;
- if ((ret = __log_open_files(dbenv)) != 0)
- return (ret);
- }
+ if ((ret =
+ __log_encrypt_record(dbenv, &t, &hdr, (u_int32_t)tsize)) != 0)
+ goto err;
+ __db_chksum(t.data, t.size,
+ (CRYPTO_ON(dbenv)) ? db_cipher->mac_key : NULL, hdr.chksum);
+ lsn = lp->lsn;
+ if ((ret = __log_putr(dblp, &lsn,
+ &t, lastoff == 0 ? 0 : lastoff - lp->len, &hdr)) != 0)
+ goto err;
- /*
- * On a checkpoint or when flush is requested, we:
- * Flush the current buffer contents to disk.
- * Sync the log to disk.
- */
- if (flags == DB_FLUSH || flags == DB_CHECKPOINT)
- if ((ret = __log_flush(dblp, NULL)) != 0)
- return (ret);
+ /* Update the LSN information returned to the caller. */
+ if (lsnp != NULL)
+ *lsnp = lp->lsn;
- /*
- * On a checkpoint, we:
- * Save the time the checkpoint was written.
- * Reset the bytes written since the last checkpoint.
- */
- if (flags == DB_CHECKPOINT) {
- (void)time(&lp->chkpt);
- lp->stat.st_wc_bytes = lp->stat.st_wc_mbytes = 0;
- }
- return (0);
+err:
+ if (need_free)
+ __os_free(dbenv, tmp);
+ return (ret);
}
/*
@@ -222,100 +509,253 @@ __log_put(dbenv, lsn, dbt, flags)
* Actually put a record into the log.
*/
static int
-__log_putr(dblp, lsn, dbt, prev)
+__log_putr(dblp, lsn, dbt, prev, h)
DB_LOG *dblp;
DB_LSN *lsn;
const DBT *dbt;
u_int32_t prev;
+ HDR *h;
{
- HDR hdr;
+ DB_CIPHER *db_cipher;
+ DB_ENV *dbenv;
+ DB_LSN f_lsn;
LOG *lp;
- int ret;
+ HDR tmp, *hdr;
+ int ret, t_ret;
+ size_t b_off, nr;
+ u_int32_t w_off;
+ dbenv = dblp->dbenv;
lp = dblp->reginfo.primary;
/*
+ * If we weren't given a header, use a local one.
+ */
+ db_cipher = dbenv->crypto_handle;
+ if (h == NULL) {
+ hdr = &tmp;
+ memset(hdr, 0, sizeof(HDR));
+ if (CRYPTO_ON(dbenv))
+ hdr->size = HDR_CRYPTO_SZ;
+ else
+ hdr->size = HDR_NORMAL_SZ;
+ } else
+ hdr = h;
+
+ /* Save our position in case we fail. */
+ b_off = lp->b_off;
+ w_off = lp->w_off;
+ f_lsn = lp->f_lsn;
+
+ /*
* Initialize the header. If we just switched files, lsn.offset will
* be 0, and what we really want is the offset of the previous record
* in the previous file. Fortunately, prev holds the value we want.
*/
- hdr.prev = prev;
- hdr.len = sizeof(HDR) + dbt->size;
- hdr.cksum = __ham_func4(NULL, dbt->data, dbt->size);
+ hdr->prev = prev;
+ hdr->len = (u_int32_t)hdr->size + dbt->size;
- if ((ret = __log_fill(dblp, lsn, &hdr, sizeof(HDR))) != 0)
- return (ret);
- lp->len = sizeof(HDR);
- lp->lsn.offset += sizeof(HDR);
+ /*
+ * If we were passed in a nonzero checksum, our caller calculated
+ * the checksum before acquiring the log mutex, as an optimization.
+ *
+ * If our caller calculated a real checksum of 0, we'll needlessly
+ * recalculate it. C'est la vie; there's no out-of-bounds value
+ * here.
+ */
+ if (hdr->chksum[0] == 0)
+ __db_chksum(dbt->data, dbt->size,
+ (CRYPTO_ON(dbenv)) ? db_cipher->mac_key : NULL,
+ hdr->chksum);
+
+ if ((ret = __log_fill(dblp, lsn, hdr, (u_int32_t)hdr->size)) != 0)
+ goto err;
if ((ret = __log_fill(dblp, lsn, dbt->data, dbt->size)) != 0)
- return (ret);
- lp->len += dbt->size;
- lp->lsn.offset += dbt->size;
+ goto err;
+
+ lp->len = (u_int32_t)(hdr->size + dbt->size);
+ lp->lsn.offset += (u_int32_t)(hdr->size + dbt->size);
return (0);
+err:
+ /*
+ * If we wrote more than one buffer before failing, get the
+ * first one back. The extra buffers will fail the checksums
+ * and be ignored.
+ */
+ if (w_off + lp->buffer_size < lp->w_off) {
+ if ((t_ret =
+ __os_seek(dbenv,
+ &dblp->lfh, 0, 0, w_off, 0, DB_OS_SEEK_SET)) != 0 ||
+ (t_ret = __os_read(dbenv, &dblp->lfh, dblp->bufp,
+ b_off, &nr)) != 0)
+ return (__db_panic(dbenv, t_ret));
+ if (nr != b_off) {
+ __db_err(dbenv, "Short read while restoring log");
+ return (__db_panic(dbenv, EIO));
+ }
+ }
+
+ /* Reset to where we started. */
+ lp->w_off = w_off;
+ lp->b_off = b_off;
+ lp->f_lsn = f_lsn;
+
+ return (ret);
}
/*
- * log_flush --
+ * __log_flush --
* Write all records less than or equal to the specified LSN.
+ *
+ * PUBLIC: int __log_flush __P((DB_ENV *, const DB_LSN *));
*/
int
-log_flush(dbenv, lsn)
+__log_flush(dbenv, lsn)
DB_ENV *dbenv;
const DB_LSN *lsn;
{
DB_LOG *dblp;
int ret;
-#ifdef HAVE_RPC
- if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
- return (__dbcl_log_flush(dbenv, lsn));
-#endif
-
PANIC_CHECK(dbenv);
- ENV_REQUIRES_CONFIG(dbenv, dbenv->lg_handle, DB_INIT_LOG);
+ ENV_REQUIRES_CONFIG(dbenv,
+ dbenv->lg_handle, "DB_ENV->log_flush", DB_INIT_LOG);
dblp = dbenv->lg_handle;
R_LOCK(dbenv, &dblp->reginfo);
- ret = __log_flush(dblp, lsn);
+ ret = __log_flush_int(dblp, lsn, 1);
R_UNLOCK(dbenv, &dblp->reginfo);
return (ret);
}
/*
- * __log_flush --
+ * __log_flush_int --
* Write all records less than or equal to the specified LSN; internal
* version.
*/
static int
-__log_flush(dblp, lsn)
+__log_flush_int(dblp, lsnp, release)
DB_LOG *dblp;
- const DB_LSN *lsn;
+ const DB_LSN *lsnp;
+ int release;
{
- DB_LSN t_lsn;
+ DB_ENV *dbenv;
+ DB_LSN flush_lsn, f_lsn;
+ DB_MUTEX *flush_mutexp;
LOG *lp;
- int current, ret;
+ int current, do_flush, first, ret;
+ size_t b_off;
+ struct __db_commit *commit;
+ u_int32_t ncommit, w_off;
ret = 0;
+ ncommit = 0;
+ dbenv = dblp->dbenv;
lp = dblp->reginfo.primary;
+ flush_mutexp = R_ADDR(&dblp->reginfo, lp->flush_mutex_off);
/*
* If no LSN specified, flush the entire log by setting the flush LSN
* to the last LSN written in the log. Otherwise, check that the LSN
* isn't a non-existent record for the log.
*/
- if (lsn == NULL) {
- t_lsn.file = lp->lsn.file;
- t_lsn.offset = lp->lsn.offset - lp->len;
- lsn = &t_lsn;
- } else
- if (lsn->file > lp->lsn.file ||
- (lsn->file == lp->lsn.file &&
- lsn->offset > lp->lsn.offset - lp->len)) {
- __db_err(dblp->dbenv,
- "log_flush: LSN past current end-of-log");
- return (EINVAL);
- }
+ if (lsnp == NULL) {
+ flush_lsn.file = lp->lsn.file;
+ flush_lsn.offset = lp->lsn.offset - lp->len;
+ } else if (lsnp->file > lp->lsn.file ||
+ (lsnp->file == lp->lsn.file &&
+ lsnp->offset > lp->lsn.offset - lp->len)) {
+ __db_err(dbenv,
+ "DB_ENV->log_flush: LSN past current end-of-log");
+ return (EINVAL);
+ } else {
+ /*
+ * See if we need to wait. s_lsn is not locked so some
+ * care is needed. The sync point can only move forward.
+ * If the file we want is in the past we are done.
+ * If the file numbers are the same check the offset.
+ * If this fails check the file numbers again since the
+ * offset might have changed while we were looking.
+ * This all assumes we can read an integer in one
+ * state or the other, not in transition.
+ */
+ if (lp->s_lsn.file > lsnp->file)
+ return (0);
+
+ if (lp->s_lsn.file == lsnp->file &&
+ lp->s_lsn.offset > lsnp->offset)
+ return (0);
+
+ if (lp->s_lsn.file > lsnp->file)
+ return (0);
+
+ flush_lsn = *lsnp;
+ }
+
+ /*
+ * If a flush is in progress and we're allowed to do so, drop
+ * the region lock and block waiting for the next flush.
+ */
+ if (release && lp->in_flush != 0) {
+ if ((commit = SH_TAILQ_FIRST(
+ &lp->free_commits, __db_commit)) == NULL) {
+ if ((ret =
+ __db_shalloc(dblp->reginfo.addr,
+ sizeof(struct __db_commit),
+ MUTEX_ALIGN, &commit)) != 0)
+ goto flush;
+ memset(commit, 0, sizeof(*commit));
+ if ((ret = __db_mutex_setup(dbenv, &dblp->reginfo,
+ &commit->mutex, MUTEX_SELF_BLOCK |
+ MUTEX_NO_RLOCK)) != 0) {
+ __db_shalloc_free(dblp->reginfo.addr, commit);
+ return (ret);
+ }
+ MUTEX_LOCK(dbenv, &commit->mutex);
+ } else
+ SH_TAILQ_REMOVE(
+ &lp->free_commits, commit, links, __db_commit);
+
+ lp->ncommit++;
+
+ /*
+ * Flushes may be requested out of LSN order; be
+ * sure we only move lp->t_lsn forward.
+ */
+ if (log_compare(&lp->t_lsn, &flush_lsn) < 0)
+ lp->t_lsn = flush_lsn;
+
+ commit->lsn = flush_lsn;
+ SH_TAILQ_INSERT_HEAD(
+ &lp->commits, commit, links, __db_commit);
+ R_UNLOCK(dbenv, &dblp->reginfo);
+ /* Wait here for the in-progress flush to finish. */
+ MUTEX_LOCK(dbenv, &commit->mutex);
+ R_LOCK(dbenv, &dblp->reginfo);
+
+ lp->ncommit--;
+ /*
+ * Grab the flag before freeing the struct to see if
+ * we need to flush the log to commit. If so,
+ * use the maximal lsn for any committing thread.
+ */
+ do_flush = F_ISSET(commit, DB_COMMIT_FLUSH);
+ F_CLR(commit, DB_COMMIT_FLUSH);
+ SH_TAILQ_INSERT_HEAD(
+ &lp->free_commits, commit, links, __db_commit);
+ if (do_flush) {
+ lp->in_flush--;
+ flush_lsn = lp->t_lsn;
+ } else
+ return (0);
+ }
+
+ /*
+ * Protect flushing with its own mutex so we can release
+ * the region lock except during file switches.
+ */
+flush: MUTEX_LOCK(dbenv, flush_mutexp);
/*
* If the LSN is less than or equal to the last-sync'd LSN, we're done.
@@ -323,9 +763,12 @@ __log_flush(dblp, lsn)
* after the byte we absolutely know was written to disk, so the test
* is <, not <=.
*/
- if (lsn->file < lp->s_lsn.file ||
- (lsn->file == lp->s_lsn.file && lsn->offset < lp->s_lsn.offset))
- return (0);
+ if (flush_lsn.file < lp->s_lsn.file ||
+ (flush_lsn.file == lp->s_lsn.file &&
+ flush_lsn.offset < lp->s_lsn.offset)) {
+ MUTEX_UNLOCK(dbenv, flush_mutexp);
+ goto done;
+ }
/*
* We may need to write the current buffer. We have to write the
@@ -333,9 +776,12 @@ __log_flush(dblp, lsn)
* buffer's starting LSN.
*/
current = 0;
- if (lp->b_off != 0 && log_compare(lsn, &lp->f_lsn) >= 0) {
- if ((ret = __log_write(dblp, dblp->bufp, lp->b_off)) != 0)
- return (ret);
+ if (lp->b_off != 0 && log_compare(&flush_lsn, &lp->f_lsn) >= 0) {
+ if ((ret = __log_write(dblp,
+ dblp->bufp, (u_int32_t)lp->b_off)) != 0) {
+ MUTEX_UNLOCK(dbenv, flush_mutexp);
+ goto done;
+ }
lp->b_off = 0;
current = 1;
@@ -348,23 +794,90 @@ __log_flush(dblp, lsn)
* buffer, don't bother. We have nothing to write and nothing to
* sync.
*/
- if (dblp->lfname != lp->lsn.file) {
- if (!current)
- return (0);
- if ((ret = __log_newfh(dblp)) != 0)
- return (ret);
- }
+ if (!F_ISSET(&dblp->lfh, DB_FH_VALID) || dblp->lfname != lp->lsn.file)
+ if (!current || (ret = __log_newfh(dblp)) != 0) {
+ MUTEX_UNLOCK(dbenv, flush_mutexp);
+ goto done;
+ }
+
+ /*
+ * We are going to flush, release the region.
+ * First get the current state of the buffer since
+ * another write may come in, but we may not flush it.
+ */
+ b_off = lp->b_off;
+ w_off = lp->w_off;
+ f_lsn = lp->f_lsn;
+ lp->in_flush++;
+ if (release)
+ R_UNLOCK(dbenv, &dblp->reginfo);
/* Sync all writes to disk. */
- if ((ret = __os_fsync(dblp->dbenv, &dblp->lfh)) != 0)
- return (__db_panic(dblp->dbenv, ret));
+ if ((ret = __os_fsync(dbenv, &dblp->lfh)) != 0) {
+ MUTEX_UNLOCK(dbenv, flush_mutexp);
+ if (release)
+ R_LOCK(dbenv, &dblp->reginfo);
+ ret = __db_panic(dbenv, ret);
+ return (ret);
+ }
+
+ /*
+ * Set the last-synced LSN.
+ * This value must be set to the LSN past the last complete
+ * record that has been flushed. This is at least the first
+ * lsn, f_lsn. If the buffer is empty, b_off == 0, then
+ * we can move up to write point since the first lsn is not
+ * set for the new buffer.
+ */
+ lp->s_lsn = f_lsn;
+ if (b_off == 0)
+ lp->s_lsn.offset = w_off;
+
+ MUTEX_UNLOCK(dbenv, flush_mutexp);
+ if (release)
+ R_LOCK(dbenv, &dblp->reginfo);
+
+ lp->in_flush--;
++lp->stat.st_scount;
- /* Set the last-synced LSN, using the on-disk write offset. */
- lp->s_lsn.file = lp->f_lsn.file;
- lp->s_lsn.offset = lp->w_off;
+ /*
+ * How many flush calls (usually commits) did this call actually sync?
+ * At least one, if it got here.
+ */
+ ncommit = 1;
+done:
+ if (lp->ncommit != 0) {
+ first = 1;
+ for (commit = SH_TAILQ_FIRST(&lp->commits, __db_commit);
+ commit != NULL;
+ commit = SH_TAILQ_NEXT(commit, links, __db_commit))
+ if (log_compare(&lp->s_lsn, &commit->lsn) > 0) {
+ MUTEX_UNLOCK(dbenv, &commit->mutex);
+ SH_TAILQ_REMOVE(
+ &lp->commits, commit, links, __db_commit);
+ ncommit++;
+ } else if (first == 1) {
+ F_SET(commit, DB_COMMIT_FLUSH);
+ MUTEX_UNLOCK(dbenv, &commit->mutex);
+ SH_TAILQ_REMOVE(
+ &lp->commits, commit, links, __db_commit);
+ /*
+ * This thread will wake and flush.
+ * If another thread commits and flushes
+ * first we will waste a trip through the
+ * mutex.
+ */
+ lp->in_flush++;
+ first = 0;
+ }
+ }
+ if (lp->stat.st_maxcommitperflush < ncommit)
+ lp->stat.st_maxcommitperflush = ncommit;
+ if (lp->stat.st_mincommitperflush > ncommit ||
+ lp->stat.st_mincommitperflush == 0)
+ lp->stat.st_mincommitperflush = ncommit;
- return (0);
+ return (ret);
}
/*
@@ -415,7 +928,7 @@ __log_fill(dblp, lsn, addr, len)
nw = remain > len ? len : remain;
memcpy(dblp->bufp + lp->b_off, addr, nw);
addr = (u_int8_t *)addr + nw;
- len -= nw;
+ len -= (u_int32_t)nw;
lp->b_off += nw;
/* If we fill the buffer, flush it. */
@@ -439,15 +952,18 @@ __log_write(dblp, addr, len)
void *addr;
u_int32_t len;
{
+ DB_ENV *dbenv;
LOG *lp;
size_t nw;
int ret;
+ dbenv = dblp->dbenv;
+ lp = dblp->reginfo.primary;
+
/*
* If we haven't opened the log file yet or the current one
* has changed, acquire a new log file.
*/
- lp = dblp->reginfo.primary;
if (!F_ISSET(&dblp->lfh, DB_FH_VALID) || dblp->lfname != lp->lsn.file)
if ((ret = __log_newfh(dblp)) != 0)
return (ret);
@@ -457,14 +973,10 @@ __log_write(dblp, addr, len)
* since we last did).
*/
if ((ret =
- __os_seek(dblp->dbenv,
+ __os_seek(dbenv,
&dblp->lfh, 0, 0, lp->w_off, 0, DB_OS_SEEK_SET)) != 0 ||
- (ret = __os_write(dblp->dbenv, &dblp->lfh, addr, len, &nw)) != 0)
- return (__db_panic(dblp->dbenv, ret));
- if (nw != len) {
- __db_err(dblp->dbenv, "Short write while writing log");
- return (EIO);
- }
+ (ret = __os_write(dbenv, &dblp->lfh, addr, len, &nw)) != 0)
+ return (ret);
/* Reset the buffer offset and update the seek offset. */
lp->w_off += len;
@@ -484,11 +996,13 @@ __log_write(dblp, addr, len)
}
/*
- * log_file --
+ * __log_file --
* Map a DB_LSN to a file name.
+ *
+ * PUBLIC: int __log_file __P((DB_ENV *, const DB_LSN *, char *, size_t));
*/
int
-log_file(dbenv, lsn, namep, len)
+__log_file(dbenv, lsn, namep, len)
DB_ENV *dbenv;
const DB_LSN *lsn;
char *namep;
@@ -498,13 +1012,9 @@ log_file(dbenv, lsn, namep, len)
int ret;
char *name;
-#ifdef HAVE_RPC
- if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
- return (__dbcl_log_file(dbenv, lsn, namep, len));
-#endif
-
PANIC_CHECK(dbenv);
- ENV_REQUIRES_CONFIG(dbenv, dbenv->lg_handle, DB_INIT_LOG);
+ ENV_REQUIRES_CONFIG(dbenv,
+ dbenv->lg_handle, "DB_ENV->log_file", DB_INIT_LOG);
dblp = dbenv->lg_handle;
R_LOCK(dbenv, &dblp->reginfo);
@@ -516,11 +1026,11 @@ log_file(dbenv, lsn, namep, len)
/* Check to make sure there's enough room and copy the name. */
if (len < strlen(name) + 1) {
*namep = '\0';
- __db_err(dbenv, "log_file: name buffer is too short");
+ __db_err(dbenv, "DB_ENV->log_file: name buffer is too short");
return (EINVAL);
}
(void)strcpy(namep, name);
- __os_freestr(name);
+ __os_free(dbenv, name);
return (0);
}
@@ -533,19 +1043,21 @@ static int
__log_newfh(dblp)
DB_LOG *dblp;
{
+ DB_ENV *dbenv;
LOG *lp;
int ret;
char *name;
+ dbenv = dblp->dbenv;
+ lp = dblp->reginfo.primary;
+
/* Close any previous file descriptor. */
if (F_ISSET(&dblp->lfh, DB_FH_VALID))
- (void)__os_closehandle(&dblp->lfh);
-
- /* Get the path of the new file and open it. */
- lp = dblp->reginfo.primary;
- dblp->lfname = lp->lsn.file;
+ (void)__os_closehandle(dbenv, &dblp->lfh);
/*
+ * Get the path of the new file and open it.
+ *
* Adding DB_OSO_LOG to the flags may add additional platform-specific
* optimizations. On WinNT, the logfile is preallocated, which may
* have a time penalty at startup, but have better overall throughput.
@@ -557,14 +1069,16 @@ __log_newfh(dblp)
* maximum size down into the Windows __os_open routine, because it
* wants to pre-allocate it.
*/
- dblp->lfh.log_size = dblp->dbenv->lg_max;
+ dblp->lfname = lp->lsn.file;
+ dblp->lfh.log_size = lp->log_size;
if ((ret = __log_name(dblp, dblp->lfname,
&name, &dblp->lfh,
- DB_OSO_CREATE |/* DB_OSO_LOG |*/ DB_OSO_SEQ)) != 0)
- __db_err(dblp->dbenv,
- "log_put: %s: %s", name, db_strerror(ret));
+ DB_OSO_CREATE |/* DB_OSO_LOG |*/ DB_OSO_SEQ |
+ (F_ISSET(dbenv, DB_ENV_DIRECT_LOG) ? DB_OSO_DIRECT : 0))) != 0)
+ __db_err(dbenv,
+ "DB_ENV->log_put: %s: %s", name, db_strerror(ret));
- __os_freestr(name);
+ __os_free(dbenv, name);
return (ret);
}
@@ -582,11 +1096,13 @@ __log_name(dblp, filenumber, namep, fhp, flags)
char **namep;
DB_FH *fhp;
{
+ DB_ENV *dbenv;
LOG *lp;
int ret;
char *oname;
char old[sizeof(LFPREFIX) + 5 + 20], new[sizeof(LFPREFIX) + 10 + 20];
+ dbenv = dblp->dbenv;
lp = dblp->reginfo.primary;
/*
@@ -608,13 +1124,12 @@ __log_name(dblp, filenumber, namep, fhp, flags)
* file, return regardless.
*/
(void)snprintf(new, sizeof(new), LFNAME, filenumber);
- if ((ret = __db_appname(dblp->dbenv,
- DB_APP_LOG, NULL, new, 0, NULL, namep)) != 0 || fhp == NULL)
+ if ((ret = __db_appname(dbenv,
+ DB_APP_LOG, new, 0, NULL, namep)) != 0 || fhp == NULL)
return (ret);
/* Open the new-style file -- if we succeed, we're done. */
- if ((ret = __os_open(dblp->dbenv,
- *namep, flags, lp->persist.mode, fhp)) == 0)
+ if ((ret = __os_open(dbenv, *namep, flags, lp->persist.mode, fhp)) == 0)
return (0);
/*
@@ -622,15 +1137,14 @@ __log_name(dblp, filenumber, namep, fhp, flags)
* the caller isn't interested in old-style files.
*/
if (!LF_ISSET(DB_OSO_RDONLY)) {
- __db_err(dblp->dbenv,
+ __db_err(dbenv,
"%s: log file open failed: %s", *namep, db_strerror(ret));
- return (__db_panic(dblp->dbenv, ret));
+ return (__db_panic(dbenv, ret));
}
/* Create an old-style file name. */
(void)snprintf(old, sizeof(old), LFNAME_V1, filenumber);
- if ((ret = __db_appname(dblp->dbenv,
- DB_APP_LOG, NULL, old, 0, NULL, &oname)) != 0)
+ if ((ret = __db_appname(dbenv, DB_APP_LOG, old, 0, NULL, &oname)) != 0)
goto err;
/*
@@ -638,9 +1152,9 @@ __log_name(dblp, filenumber, namep, fhp, flags)
* space allocated for the new-style name and return the old-style
* name to the caller.
*/
- if ((ret = __os_open(dblp->dbenv,
+ if ((ret = __os_open(dbenv,
oname, flags, lp->persist.mode, fhp)) == 0) {
- __os_freestr(*namep);
+ __os_free(dbenv, *namep);
*namep = oname;
return (0);
}
@@ -653,52 +1167,82 @@ __log_name(dblp, filenumber, namep, fhp, flags)
* old-style name, but we expected it to exist and we weren't just
* looking for any log file. That's not a likely error.
*/
-err: __os_freestr(oname);
+err: __os_free(dbenv, oname);
return (ret);
}
-static int
-__log_open_files(dbenv)
+/*
+ * __log_rep_put --
+ * Short-circuit way for replication clients to put records into the
+ * log. Replication clients' logs need to be laid out exactly as their
+ * masters' are, so we let replication take responsibility for when the
+ * log gets flushed, when the log switches files, etc. This is just a thin
+ * PUBLIC wrapper for __log_putr with a slightly prettier interface.
+ *
+ * Note that the log region mutex should be held when this is called.
+ *
+ * PUBLIC: int __log_rep_put __P((DB_ENV *, DB_LSN *, const DBT *));
+ */
+int
+__log_rep_put(dbenv, lsnp, rec)
DB_ENV *dbenv;
+ DB_LSN *lsnp;
+ const DBT *rec;
{
+ DB_CIPHER *db_cipher;
DB_LOG *dblp;
- DB_LSN r_unused;
- DBT fid_dbt, t;
- FNAME *fnp;
+ HDR hdr;
+ DBT *dbt, t;
LOG *lp;
- int ret;
+ int need_free, ret;
dblp = dbenv->lg_handle;
lp = dblp->reginfo.primary;
- for (fnp = SH_TAILQ_FIRST(&lp->fq, __fname);
- fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname)) {
- if (fnp->ref == 0) /* Entry not in use. */
- continue;
- if (fnp->name_off != INVALID_ROFF) {
- memset(&t, 0, sizeof(t));
- t.data = R_ADDR(&dblp->reginfo, fnp->name_off);
- t.size = strlen(t.data) + 1;
- }
- memset(&fid_dbt, 0, sizeof(fid_dbt));
- fid_dbt.data = fnp->ufid;
- fid_dbt.size = DB_FILE_ID_LEN;
- /*
- * Output LOG_CHECKPOINT records which will be
- * processed during the OPENFILES pass of recovery.
- * At the end of recovery we want to output the
- * files that were open so that a future recovery
- * run will have the correct files open during
- * a backward pass. For this we output LOG_CLOSE
- * records so that the files will be closed on
- * the forward pass.
- */
- if ((ret = __log_register_log(dbenv,
- NULL, &r_unused, 0,
- F_ISSET(dblp, DBLOG_RECOVER) ? LOG_CLOSE : LOG_CHECKPOINT,
- fnp->name_off == INVALID_ROFF ? NULL : &t,
- &fid_dbt, fnp->id, fnp->s_type, fnp->meta_pgno)) != 0)
+ memset(&hdr, 0, sizeof(HDR));
+ t = *rec;
+ dbt = &t;
+ need_free = 0;
+ db_cipher = (DB_CIPHER *)dbenv->crypto_handle;
+ if (CRYPTO_ON(dbenv))
+ t.size += db_cipher->adj_size(rec->size);
+ if ((ret = __os_calloc(dbenv, 1, t.size, &t.data)) != 0)
+ goto err;
+ need_free = 1;
+ memcpy(t.data, rec->data, rec->size);
+
+ if ((ret = __log_encrypt_record(dbenv, dbt, &hdr, rec->size)) != 0)
+ goto err;
+ __db_chksum(t.data, t.size,
+ (CRYPTO_ON(dbenv)) ? db_cipher->mac_key : NULL, hdr.chksum);
+
+ DB_ASSERT(log_compare(lsnp, &lp->lsn) == 0);
+ ret = __log_putr(dblp, lsnp, dbt, lp->lsn.offset - lp->len, &hdr);
+err:
+ if (need_free)
+ __os_free(dbenv, t.data);
+ return (ret);
+}
+
+static int
+__log_encrypt_record(dbenv, dbt, hdr, orig)
+ DB_ENV *dbenv;
+ DBT *dbt;
+ HDR *hdr;
+ u_int32_t orig;
+{
+ DB_CIPHER *db_cipher;
+ int ret;
+
+ if (CRYPTO_ON(dbenv)) {
+ db_cipher = (DB_CIPHER *)dbenv->crypto_handle;
+ hdr->size = HDR_CRYPTO_SZ;
+ hdr->orig_size = orig;
+ if ((ret = db_cipher->encrypt(dbenv, db_cipher->data,
+ hdr->iv, dbt->data, dbt->size)) != 0)
return (ret);
+ } else {
+ hdr->size = HDR_NORMAL_SZ;
}
return (0);
}
diff --git a/bdb/log/log_rec.c b/bdb/log/log_rec.c
deleted file mode 100644
index 493dd06d4c6..00000000000
--- a/bdb/log/log_rec.c
+++ /dev/null
@@ -1,647 +0,0 @@
-/*-
- * See the file LICENSE for redistribution information.
- *
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
- * Sleepycat Software. All rights reserved.
- */
-/*
- * Copyright (c) 1995, 1996
- * The President and Fellows of Harvard University. All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- */
-
-#include "db_config.h"
-
-#ifndef lint
-static const char revid[] = "$Id: log_rec.c,v 11.48 2001/01/11 18:19:53 bostic Exp $";
-#endif /* not lint */
-
-#ifndef NO_SYSTEM_INCLUDES
-#include <sys/types.h>
-
-#include <string.h>
-#endif
-
-#include "db_int.h"
-#include "db_page.h"
-#include "db_am.h"
-#include "log.h"
-
-static int __log_check_master __P((DB_ENV *, u_int8_t *, char *));
-static int __log_do_open __P((DB_ENV *, DB_LOG *,
- u_int8_t *, char *, DBTYPE, int32_t, db_pgno_t));
-static int __log_open_file __P((DB_ENV *, DB_LOG *, __log_register_args *));
-
-/*
- * PUBLIC: int __log_register_recover
- * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
- */
-int
-__log_register_recover(dbenv, dbtp, lsnp, op, info)
- DB_ENV *dbenv;
- DBT *dbtp;
- DB_LSN *lsnp;
- db_recops op;
- void *info;
-{
- DB_ENTRY *dbe;
- DB_LOG *logp;
- DB *dbp;
- __log_register_args *argp;
- int do_rem, ret, t_ret;
-
- logp = dbenv->lg_handle;
- dbp = NULL;
-
-#ifdef DEBUG_RECOVER
- REC_PRINT(__log_register_print);
-#endif
- COMPQUIET(lsnp, NULL);
-
- if ((ret = __log_register_read(dbenv, dbtp->data, &argp)) != 0)
- goto out;
-
- if ((argp->opcode == LOG_OPEN &&
- (DB_REDO(op) || op == DB_TXN_OPENFILES)) ||
- (argp->opcode == LOG_CLOSE && DB_UNDO(op))) {
- /*
- * If we are redoing an open or undoing a close, then we need
- * to open a file. We must open the file even if
- * the meta page is not yet written as we may be creating it.
- */
- if (op == DB_TXN_OPENFILES)
- F_SET(logp, DBLOG_FORCE_OPEN);
- ret = __log_open_file(dbenv, logp, argp);
- F_CLR(logp, DBLOG_FORCE_OPEN);
- if (ret == ENOENT || ret == EINVAL) {
- if (op == DB_TXN_OPENFILES && argp->name.size != 0 &&
- (ret = __db_txnlist_delete(dbenv, info,
- argp->name.data, argp->fileid, 0)) != 0)
- goto out;
- ret = 0;
- }
- } else if (argp->opcode != LOG_CHECKPOINT) {
- /*
- * If we are undoing an open, then we need to close the file.
- *
- * If the file is deleted, then we can just ignore this close.
- * Otherwise, we should usually have a valid dbp we should
- * close or whose reference count should be decremented.
- * However, if we shut down without closing a file, we may, in
- * fact, not have the file open, and that's OK.
- */
- do_rem = 0;
- MUTEX_THREAD_LOCK(dbenv, logp->mutexp);
- if (argp->fileid < logp->dbentry_cnt) {
- dbe = &logp->dbentry[argp->fileid];
-
- DB_ASSERT(dbe->refcount == 1);
-
- ret = __db_txnlist_close(info,
- argp->fileid, dbe->count);
- if ((dbp = TAILQ_FIRST(&dbe->dblist)) != NULL)
- (void)log_unregister(dbenv, dbp);
- do_rem = 1;
- }
- MUTEX_THREAD_UNLOCK(dbenv, logp->mutexp);
- if (do_rem) {
- (void)__log_rem_logid(logp, dbp, argp->fileid);
- /*
- * If remove or rename has closed the file, don't
- * sync.
- */
- if (dbp != NULL &&
- (t_ret = dbp->close(dbp,
- dbp->mpf == NULL ? DB_NOSYNC : 0)) != 0 && ret == 0)
- ret = t_ret;
- }
- } else if (DB_UNDO(op) || op == DB_TXN_OPENFILES) {
- /*
- * It's a checkpoint and we are rolling backward. It
- * is possible that the system was shut down and thus
- * ended with a stable checkpoint; this file was never
- * closed and has therefore not been reopened yet. If
- * so, we need to try to open it.
- */
- ret = __log_open_file(dbenv, logp, argp);
- if (ret == ENOENT || ret == EINVAL) {
- if (argp->name.size != 0 && (ret =
- __db_txnlist_delete(dbenv, info,
- argp->name.data, argp->fileid, 0)) != 0)
- goto out;
- ret = 0;
- }
- }
-
-out: if (argp != NULL)
- __os_free(argp, 0);
- return (ret);
-}
-
-/*
- * __log_open_file --
- * Called during log_register recovery. Make sure that we have an
- * entry in the dbentry table for this ndx. Returns 0 on success,
- * non-zero on error.
- */
-static int
-__log_open_file(dbenv, lp, argp)
- DB_ENV *dbenv;
- DB_LOG *lp;
- __log_register_args *argp;
-{
- DB_ENTRY *dbe;
- DB *dbp;
-
- /*
- * We never re-open temporary files. Temp files are only
- * useful during aborts in which case the dbp was entered
- * when the file was registered. During recovery, we treat
- * temp files as properly deleted files, allowing the open to
- * fail and not reporting any errors when recovery fails to
- * get a valid dbp from db_fileid_to_db.
- */
- if (argp->name.size == 0) {
- (void)__log_add_logid(dbenv, lp, NULL, argp->fileid);
- return (ENOENT);
- }
-
- /*
- * Because of reference counting, we cannot automatically close files
- * during recovery, so when we're opening, we have to check that the
- * name we are opening is what we expect. If it's not, then we close
- * the old file and open the new one.
- */
- MUTEX_THREAD_LOCK(dbenv, lp->mutexp);
- if (argp->fileid < lp->dbentry_cnt)
- dbe = &lp->dbentry[argp->fileid];
- else
- dbe = NULL;
-
- if (dbe != NULL) {
- dbe->deleted = 0;
- if ((dbp = TAILQ_FIRST(&dbe->dblist)) != NULL) {
- if (dbp->meta_pgno != argp->meta_pgno ||
- memcmp(dbp->fileid,
- argp->uid.data, DB_FILE_ID_LEN) != 0) {
- MUTEX_THREAD_UNLOCK(dbenv, lp->mutexp);
- goto reopen;
- }
- if (!F_ISSET(lp, DBLOG_RECOVER))
- dbe->refcount++;
- MUTEX_THREAD_UNLOCK(dbenv, lp->mutexp);
- return (0);
- }
- }
-
- MUTEX_THREAD_UNLOCK(dbenv, lp->mutexp);
- if (0) {
-reopen: (void)log_unregister(dbp->dbenv, dbp);
- (void)__log_rem_logid(lp, dbp, argp->fileid);
- dbp->close(dbp, 0);
- }
-
- return (__log_do_open(dbenv, lp,
- argp->uid.data, argp->name.data,
- argp->ftype, argp->fileid, argp->meta_pgno));
-}
-
-/*
- * log_reopen_file -- close and reopen a db file.
- * Must be called when a metadata page changes.
- *
- * PUBLIC: int __log_reopen_file __P((DB_ENV *,
- * PUBLIC: char *, int32_t, u_int8_t *, db_pgno_t));
- *
- */
-int
-__log_reopen_file(dbenv, name, ndx, fileid, meta_pgno)
- DB_ENV *dbenv;
- char *name;
- int32_t ndx;
- u_int8_t *fileid;
- db_pgno_t meta_pgno;
-{
- DB *dbp;
- DB_LOG *logp;
- DBTYPE ftype;
- FNAME *fnp;
- LOG *lp;
- char *tmp_name;
- int ret;
-
- logp = dbenv->lg_handle;
-
- if (name == NULL) {
- R_LOCK(dbenv, &logp->reginfo);
-
- lp = logp->reginfo.primary;
-
- for (fnp = SH_TAILQ_FIRST(&lp->fq, __fname);
- fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname)) {
- if (fnp->ref == 0) /* Entry not in use. */
- continue;
- if (memcmp(fnp->ufid, fileid, DB_FILE_ID_LEN) == 0)
- break;
- }
-
- if (fnp == 0 || fnp->name_off == INVALID_ROFF) {
- __db_err(dbenv,
- "metasub recover: non-existent file id");
- return (EINVAL);
- }
-
- name = R_ADDR(&logp->reginfo, fnp->name_off);
- ret = __os_strdup(dbenv, name, &tmp_name);
- R_UNLOCK(dbenv, &logp->reginfo);
- if (ret != 0)
- goto out;
- name = tmp_name;
- } else
- tmp_name = NULL;
-
- if ((ret = __db_fileid_to_db(dbenv, &dbp, ndx, 0)) != 0)
- goto out;
- ftype = dbp->type;
- (void)log_unregister(dbenv, dbp);
- (void)__log_rem_logid(logp, dbp, ndx);
- (void)dbp->close(dbp, 0);
-
- ret = __log_do_open(dbenv, logp, fileid, name, ftype, ndx, meta_pgno);
-
- if (tmp_name != NULL)
- __os_free(tmp_name, 0);
-
-out: return (ret);
-}
-
-/*
- * __log_do_open --
- * Open files referenced in the log. This is the part of the open that
- * is not protected by the thread mutex.
- */
-static int
-__log_do_open(dbenv, lp, uid, name, ftype, ndx, meta_pgno)
- DB_ENV *dbenv;
- DB_LOG *lp;
- u_int8_t *uid;
- char *name;
- DBTYPE ftype;
- int32_t ndx;
- db_pgno_t meta_pgno;
-{
- DB *dbp;
- int ret;
- u_int8_t zeroid[DB_FILE_ID_LEN];
-
- if ((ret = db_create(&dbp, lp->dbenv, 0)) != 0)
- return (ret);
-
- dbp->log_fileid = ndx;
-
- /*
- * This is needed to signal to the locking routines called while
- * opening databases that we are potentially undoing a transaction
- * from an XA process. Since the XA process does not share
- * locks with the aborting transaction this prevents us from
- * deadlocking during the open during rollback.
- * Because this routine is called either during recovery or during an
- * XA_ABORT, we can safely set DB_AM_RECOVER in the dbp since it
- * will not be shared with other threads.
- */
- F_SET(dbp, DB_AM_RECOVER);
- if (meta_pgno != PGNO_BASE_MD)
- memcpy(dbp->fileid, uid, DB_FILE_ID_LEN);
- dbp->type = ftype;
- if ((ret =
- __db_dbopen(dbp, name, 0, __db_omode("rw----"), meta_pgno)) == 0) {
- /*
- * Verify that we are opening the same file that we were
- * referring to when we wrote this log record.
- */
- if (meta_pgno != PGNO_BASE_MD &&
- __log_check_master(dbenv, uid, name) != 0)
- goto not_right;
- if (memcmp(uid, dbp->fileid, DB_FILE_ID_LEN) != 0) {
- memset(zeroid, 0, DB_FILE_ID_LEN);
- if (memcmp(dbp->fileid, zeroid, DB_FILE_ID_LEN) != 0)
- goto not_right;
- memcpy(dbp->fileid, uid, DB_FILE_ID_LEN);
- }
- if (IS_RECOVERING(dbenv)) {
- (void)log_register(dbp->dbenv, dbp, name);
- (void)__log_add_logid(dbenv, lp, dbp, ndx);
- }
- return (0);
- }
-
-not_right:
- (void)dbp->close(dbp, 0);
- (void)__log_add_logid(dbenv, lp, NULL, ndx);
-
- return (ENOENT);
-}
-
-static int
-__log_check_master(dbenv, uid, name)
- DB_ENV *dbenv;
- u_int8_t *uid;
- char *name;
-{
- DB *dbp;
- int ret;
-
- ret = 0;
- if ((ret = db_create(&dbp, dbenv, 0)) != 0)
- return (ret);
- dbp->type = DB_BTREE;
- ret = __db_dbopen(dbp, name, 0, __db_omode("rw----"), PGNO_BASE_MD);
-
- if (ret == 0 && memcmp(uid, dbp->fileid, DB_FILE_ID_LEN) != 0)
- ret = EINVAL;
-
- (void) dbp->close(dbp, 0);
- return (ret);
-}
-
-/*
- * __log_add_logid --
- * Adds a DB entry to the log's DB entry table.
- *
- * PUBLIC: int __log_add_logid __P((DB_ENV *, DB_LOG *, DB *, int32_t));
- */
-int
-__log_add_logid(dbenv, logp, dbp, ndx)
- DB_ENV *dbenv;
- DB_LOG *logp;
- DB *dbp;
- int32_t ndx;
-{
- DB *dbtmp;
- int32_t i;
- int ret;
-
- ret = 0;
-
- MUTEX_THREAD_LOCK(dbenv, logp->mutexp);
-
- /*
- * Check if we need to grow the table. Note, ndx is 0-based (the
- * index into the DB entry table) an dbentry_cnt is 1-based, the
- * number of available slots.
- */
- if (logp->dbentry_cnt <= ndx) {
- if ((ret = __os_realloc(dbenv,
- (ndx + DB_GROW_SIZE) * sizeof(DB_ENTRY),
- NULL, &logp->dbentry)) != 0)
- goto err;
-
- /*
- * We have moved the head of the queue.
- * Fix up the queue header of an empty queue or the previous
- * pointer of the first element.
- */
- for (i = 0; i < logp->dbentry_cnt; i++) {
- if ((dbtmp =
- TAILQ_FIRST(&logp->dbentry[i].dblist)) == NULL)
- TAILQ_INIT(&logp->dbentry[i].dblist);
- else
- TAILQ_REINSERT_HEAD(
- &logp->dbentry[i].dblist, dbtmp, links);
- }
-
- /* Initialize the new entries. */
- for (i = logp->dbentry_cnt; i < ndx + DB_GROW_SIZE; i++) {
- logp->dbentry[i].count = 0;
- TAILQ_INIT(&logp->dbentry[i].dblist);
- logp->dbentry[i].deleted = 0;
- logp->dbentry[i].refcount = 0;
- }
-
- logp->dbentry_cnt = i;
- }
-
- if (logp->dbentry[ndx].deleted == 0 &&
- TAILQ_FIRST(&logp->dbentry[ndx].dblist) == NULL) {
- logp->dbentry[ndx].count = 0;
- if (dbp != NULL)
- TAILQ_INSERT_HEAD(&logp->dbentry[ndx].dblist,
- dbp, links);
- logp->dbentry[ndx].deleted = dbp == NULL;
- logp->dbentry[ndx].refcount = 1;
- } else if (!F_ISSET(logp, DBLOG_RECOVER)) {
- if (dbp != NULL)
- TAILQ_INSERT_HEAD(&logp->dbentry[ndx].dblist,
- dbp, links);
- logp->dbentry[ndx].refcount++;
- }
-
-err: MUTEX_THREAD_UNLOCK(dbenv, logp->mutexp);
- return (ret);
-}
-
-/*
- * __db_fileid_to_db --
- * Return the DB corresponding to the specified fileid.
- *
- * PUBLIC: int __db_fileid_to_db __P((DB_ENV *, DB **, int32_t, int));
- */
-int
-__db_fileid_to_db(dbenv, dbpp, ndx, inc)
- DB_ENV *dbenv;
- DB **dbpp;
- int32_t ndx;
- int inc;
-{
- DB_LOG *logp;
- DB *dbp;
- FNAME *fname;
- int ret;
- char *name;
-
- ret = 0;
- logp = dbenv->lg_handle;
-
- MUTEX_THREAD_LOCK(dbenv, logp->mutexp);
-
- /*
- * Under XA, a process different than the one issuing DB operations
- * may abort a transaction. In this case, recovery routines are run
- * by a process that does not necessarily have the file open, so we
- * we must open the file explicitly.
- */
- if (ndx >= logp->dbentry_cnt ||
- (!logp->dbentry[ndx].deleted &&
- (dbp = TAILQ_FIRST(&logp->dbentry[ndx].dblist)) == NULL)) {
- if (F_ISSET(logp, DBLOG_RECOVER)) {
- ret = ENOENT;
- goto err;
- }
- if (__log_lid_to_fname(logp, ndx, &fname) != 0) {
- /* Couldn't find entry; this is a fatal error. */
- __db_err(dbenv, "Missing log fileid entry");
- ret = EINVAL;
- goto err;
- }
- name = R_ADDR(&logp->reginfo, fname->name_off);
-
- /*
- * __log_do_open is called without protection of the
- * log thread lock.
- */
- MUTEX_THREAD_UNLOCK(dbenv, logp->mutexp);
-
- /*
- * At this point, we are not holding the thread lock, so exit
- * directly instead of going through the exit code at the
- * bottom. If the __log_do_open succeeded, then we don't need
- * to do any of the remaining error checking at the end of this
- * routine.
- */
- if ((ret = __log_do_open(dbenv, logp,
- fname->ufid, name, fname->s_type,
- ndx, fname->meta_pgno)) != 0)
- return (ret);
-
- *dbpp = TAILQ_FIRST(&logp->dbentry[ndx].dblist);
- return (0);
- }
-
- /*
- * Return DB_DELETED if the file has been deleted (it's not an error).
- */
- if (logp->dbentry[ndx].deleted) {
- ret = DB_DELETED;
- if (inc)
- logp->dbentry[ndx].count++;
- goto err;
- }
-
- /*
- * Otherwise return 0, but if we don't have a corresponding DB, it's
- * an error.
- */
- if ((*dbpp = TAILQ_FIRST(&logp->dbentry[ndx].dblist)) == NULL)
- ret = ENOENT;
-
-err: MUTEX_THREAD_UNLOCK(dbenv, logp->mutexp);
- return (ret);
-}
-
-/*
- * __log_close_files --
- * Close files that were opened by the recovery daemon. We sync the
- * file, unless its mpf pointer has been NULLed by a db_remove or
- * db_rename. We may not have flushed the log_register record that
- * closes the file.
- *
- * PUBLIC: void __log_close_files __P((DB_ENV *));
- */
-void
-__log_close_files(dbenv)
- DB_ENV *dbenv;
-{
- DB_ENTRY *dbe;
- DB_LOG *logp;
- DB *dbp;
- int32_t i;
-
- logp = dbenv->lg_handle;
- MUTEX_THREAD_LOCK(dbenv, logp->mutexp);
- for (i = 0; i < logp->dbentry_cnt; i++) {
- dbe = &logp->dbentry[i];
- while ((dbp = TAILQ_FIRST(&dbe->dblist)) != NULL) {
- (void)log_unregister(dbenv, dbp);
- TAILQ_REMOVE(&dbe->dblist, dbp, links);
- (void)dbp->close(dbp, dbp->mpf == NULL ? DB_NOSYNC : 0);
- }
- dbe->deleted = 0;
- dbe->refcount = 0;
- }
- MUTEX_THREAD_UNLOCK(dbenv, logp->mutexp);
-}
-
-/*
- * __log_rem_logid
- * Remove an entry from the log table. Find the appropriate DB and
- * unlink it from the linked list off the table. If the DB is NULL, treat
- * this as a simple refcount decrement.
- *
- * PUBLIC: void __log_rem_logid __P((DB_LOG *, DB *, int32_t));
- */
-void
-__log_rem_logid(logp, dbp, ndx)
- DB_LOG *logp;
- DB *dbp;
- int32_t ndx;
-{
- DB *xdbp;
-
- MUTEX_THREAD_LOCK(logp->dbenv, logp->mutexp);
- if (--logp->dbentry[ndx].refcount == 0) {
- TAILQ_INIT(&logp->dbentry[ndx].dblist);
- logp->dbentry[ndx].deleted = 0;
- } else if (dbp != NULL)
- for (xdbp = TAILQ_FIRST(&logp->dbentry[ndx].dblist);
- xdbp != NULL;
- xdbp = TAILQ_NEXT(xdbp, links))
- if (xdbp == dbp) {
- TAILQ_REMOVE(&logp->dbentry[ndx].dblist,
- xdbp, links);
- break;
- }
-
- MUTEX_THREAD_UNLOCK(logp->dbenv, logp->mutexp);
-}
-
-/*
- * __log_lid_to_fname --
- * Traverse the shared-memory region looking for the entry that
- * matches the passed log fileid. Returns 0 on success; -1 on error.
- * PUBLIC: int __log_lid_to_fname __P((DB_LOG *, int32_t, FNAME **));
- */
-int
-__log_lid_to_fname(dblp, lid, fnamep)
- DB_LOG *dblp;
- int32_t lid;
- FNAME **fnamep;
-{
- FNAME *fnp;
- LOG *lp;
-
- lp = dblp->reginfo.primary;
-
- for (fnp = SH_TAILQ_FIRST(&lp->fq, __fname);
- fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname)) {
- if (fnp->ref == 0) /* Entry not in use. */
- continue;
- if (fnp->id == lid) {
- *fnamep = fnp;
- return (0);
- }
- }
- return (-1);
-}
diff --git a/bdb/log/log_register.c b/bdb/log/log_register.c
deleted file mode 100644
index 1e0e523d8b9..00000000000
--- a/bdb/log/log_register.c
+++ /dev/null
@@ -1,433 +0,0 @@
-/*-
- * See the file LICENSE for redistribution information.
- *
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
- * Sleepycat Software. All rights reserved.
- */
-#include "db_config.h"
-
-#ifndef lint
-static const char revid[] = "$Id: log_register.c,v 11.35 2001/01/10 16:04:19 bostic Exp $";
-#endif /* not lint */
-
-#ifndef NO_SYSTEM_INCLUDES
-#include <sys/types.h>
-
-#include <string.h>
-#endif
-
-#ifdef HAVE_RPC
-#include "db_server.h"
-#endif
-
-#include "db_int.h"
-#include "log.h"
-
-#ifdef HAVE_RPC
-#include "gen_client_ext.h"
-#include "rpc_client_ext.h"
-#endif
-
-/*
- * log_register --
- * Register a file name.
- */
-int
-log_register(dbenv, dbp, name)
- DB_ENV *dbenv;
- DB *dbp;
- const char *name;
-{
- DBT fid_dbt, r_name;
- DB_LOG *dblp;
- DB_LSN r_unused;
- FNAME *found_fnp, *fnp, *recover_fnp, *reuse_fnp;
- LOG *lp;
- size_t len;
- int32_t maxid;
- int inserted, ok, ret;
- void *namep;
-
-#ifdef HAVE_RPC
- if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
- return (__dbcl_log_register(dbenv, dbp, name));
-#endif
-
- PANIC_CHECK(dbenv);
- ENV_REQUIRES_CONFIG(dbenv, dbenv->lg_handle, DB_INIT_LOG);
-
- dblp = dbenv->lg_handle;
- lp = dblp->reginfo.primary;
- fnp = reuse_fnp = NULL;
- inserted = ret = 0;
- namep = NULL;
-
- /* Check the arguments. */
- if (dbp->type != DB_BTREE && dbp->type != DB_QUEUE &&
- dbp->type != DB_HASH && dbp->type != DB_RECNO) {
- __db_err(dbenv, "log_register: unknown DB file type");
- return (EINVAL);
- }
-
- R_LOCK(dbenv, &dblp->reginfo);
-
- /*
- * See if we've already got this file in the log, finding the
- * (maximum+1) in-use file id and some available file id (if we
- * find an available fid, we'll use it, else we'll have to allocate
- * one after the maximum that we found).
- */
- ok = 0;
- found_fnp = recover_fnp = NULL;
- for (maxid = 0, fnp = SH_TAILQ_FIRST(&lp->fq, __fname);
- fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname)) {
- if (F_ISSET(dblp, DBLOG_RECOVER) && fnp->id == dbp->log_fileid)
- recover_fnp = fnp;
- if (fnp->ref == 0) { /* Entry is not in use. */
- if (reuse_fnp == NULL)
- reuse_fnp = fnp;
- continue;
- }
- if (memcmp(dbp->fileid, fnp->ufid, DB_FILE_ID_LEN) == 0) {
- if (fnp->meta_pgno == 0) {
- if (fnp->locked == 1) {
- __db_err(dbenv, "File is locked");
- return (EINVAL);
- }
- if (found_fnp != NULL) {
- fnp = found_fnp;
- goto found;
- }
- ok = 1;
- }
- if (dbp->meta_pgno == fnp->meta_pgno) {
- if (F_ISSET(dblp, DBLOG_RECOVER)) {
- if (fnp->id != dbp->log_fileid) {
- /*
- * If we are in recovery, there
- * is only one dbp on the list.
- * If the refcount goes to 0,
- * we will clear the list. If
- * it doesn't, we want to leave
- * the dbp where it is, so
- * passing a NULL to rem_logid
- * is correct.
- */
- __log_rem_logid(dblp,
- NULL, fnp->id);
- if (recover_fnp != NULL)
- break;
- continue;
- }
- fnp->ref = 1;
- goto found;
- }
- ++fnp->ref;
- if (ok)
- goto found;
- found_fnp = fnp;
- }
- }
- if (maxid <= fnp->id)
- maxid = fnp->id + 1;
- }
- if ((fnp = found_fnp) != NULL)
- goto found;
-
- /* Fill in fnp structure. */
- if (recover_fnp != NULL) /* This has the right number */
- fnp = recover_fnp;
- else if (reuse_fnp != NULL) /* Reuse existing one. */
- fnp = reuse_fnp;
- else { /* Allocate a new one. */
- if ((ret = __db_shalloc(dblp->reginfo.addr,
- sizeof(FNAME), 0, &fnp)) != 0)
- goto mem_err;
- fnp->id = maxid;
- }
-
- if (F_ISSET(dblp, DBLOG_RECOVER))
- fnp->id = dbp->log_fileid;
-
- fnp->ref = 1;
- fnp->locked = 0;
- fnp->s_type = dbp->type;
- memcpy(fnp->ufid, dbp->fileid, DB_FILE_ID_LEN);
- fnp->meta_pgno = dbp->meta_pgno;
-
- if (name != NULL) {
- len = strlen(name) + 1;
- if ((ret =
- __db_shalloc(dblp->reginfo.addr, len, 0, &namep)) != 0) {
-mem_err: __db_err(dbenv,
- "Unable to allocate memory to register %s", name);
- goto err;
- }
- fnp->name_off = R_OFFSET(&dblp->reginfo, namep);
- memcpy(namep, name, len);
- } else
- fnp->name_off = INVALID_ROFF;
-
- /* Only do the insert if we allocated a new fnp. */
- if (reuse_fnp == NULL && recover_fnp == NULL)
- SH_TAILQ_INSERT_HEAD(&lp->fq, fnp, q, __fname);
- inserted = 1;
-
- /* Log the registry. */
- if (!F_ISSET(dblp, DBLOG_RECOVER)) {
- /*
- * We allow logging on in-memory databases, so the name here
- * could be NULL.
- */
- if (name != NULL) {
- r_name.data = (void *)name;
- r_name.size = strlen(name) + 1;
- }
- memset(&fid_dbt, 0, sizeof(fid_dbt));
- fid_dbt.data = dbp->fileid;
- fid_dbt.size = DB_FILE_ID_LEN;
- if ((ret = __log_register_log(dbenv, NULL, &r_unused,
- 0, LOG_OPEN, name == NULL ? NULL : &r_name,
- &fid_dbt, fnp->id, dbp->type, dbp->meta_pgno)) != 0)
- goto err;
- }
-
-found: /*
- * If we found the entry in the shared area, then the file is
- * already open, so there is no need to log the open. We only
- * log the open and closes on the first open and last close.
- */
- if (!F_ISSET(dblp, DBLOG_RECOVER) &&
- (ret = __log_add_logid(dbenv, dblp, dbp, fnp->id)) != 0)
- goto err;
-
- if (!F_ISSET(dblp, DBLOG_RECOVER))
- dbp->log_fileid = fnp->id;
-
- if (0) {
-err: if (inserted)
- SH_TAILQ_REMOVE(&lp->fq, fnp, q, __fname);
- if (namep != NULL)
- __db_shalloc_free(dblp->reginfo.addr, namep);
- if (fnp != NULL)
- __db_shalloc_free(dblp->reginfo.addr, fnp);
- }
-
- R_UNLOCK(dbenv, &dblp->reginfo);
-
- return (ret);
-}
-
-/*
- * log_unregister --
- * Discard a registered file name.
- */
-int
-log_unregister(dbenv, dbp)
- DB_ENV *dbenv;
- DB *dbp;
-{
- int ret;
-
-#ifdef HAVE_RPC
- if (F_ISSET(dbenv, DB_ENV_RPCCLIENT))
- return (__dbcl_log_unregister(dbenv, dbp));
-#endif
-
- PANIC_CHECK(dbenv);
- ENV_REQUIRES_CONFIG(dbenv, dbenv->lg_handle, DB_INIT_LOG);
-
- ret = __log_filelist_update(dbenv, dbp, dbp->log_fileid, NULL, NULL);
- dbp->log_fileid = DB_LOGFILEID_INVALID;
- return (ret);
-}
-
-/*
- * PUBLIC: int __log_filelist_update
- * PUBLIC: __P((DB_ENV *, DB *, int32_t, const char *, int *));
- *
- * Utility player for updating and logging the file list. Called
- * for 3 reasons:
- * 1) mark file closed: newname == NULL.
- * 2) change filename: newname != NULL.
- * 3) from recovery to verify & change filename if necessary, set != NULL.
- */
-int
-__log_filelist_update(dbenv, dbp, fid, newname, set)
- DB_ENV *dbenv;
- DB *dbp;
- int32_t fid;
- const char *newname;
- int *set;
-{
- DBT fid_dbt, r_name;
- DB_LOG *dblp;
- DB_LSN r_unused;
- FNAME *fnp;
- LOG *lp;
- u_int32_t len, newlen;
- int ret;
- void *namep;
-
- ret = 0;
- dblp = dbenv->lg_handle;
- lp = dblp->reginfo.primary;
-
- R_LOCK(dbenv, &dblp->reginfo);
-
- /* Find the entry in the log. */
- for (fnp = SH_TAILQ_FIRST(&lp->fq, __fname);
- fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname))
- if (fid == fnp->id)
- break;
- if (fnp == NULL) {
- __db_err(dbenv, "log_unregister: non-existent file id");
- ret = EINVAL;
- goto ret1;
- }
-
- /*
- * Log the unregistry only if this is the last one and we are
- * really closing the file or if this is an abort of a created
- * file and we need to make sure there is a record in the log.
- */
- namep = NULL;
- len = 0;
- if (fnp->name_off != INVALID_ROFF) {
- namep = R_ADDR(&dblp->reginfo, fnp->name_off);
- len = strlen(namep) + 1;
- }
- if (!F_ISSET(dblp, DBLOG_RECOVER) && fnp->ref == 1) {
- if (namep != NULL) {
- memset(&r_name, 0, sizeof(r_name));
- r_name.data = namep;
- r_name.size = len;
- }
- memset(&fid_dbt, 0, sizeof(fid_dbt));
- fid_dbt.data = fnp->ufid;
- fid_dbt.size = DB_FILE_ID_LEN;
- if ((ret = __log_register_log(dbenv, NULL, &r_unused,
- 0, LOG_CLOSE,
- fnp->name_off == INVALID_ROFF ? NULL : &r_name,
- &fid_dbt, fid, fnp->s_type, fnp->meta_pgno))
- != 0)
- goto ret1;
- }
-
- /*
- * If we are changing the name we must log this fact.
- */
- if (newname != NULL) {
- DB_ASSERT(fnp->ref == 1);
- newlen = strlen(newname) + 1;
- if (!F_ISSET(dblp, DBLOG_RECOVER)) {
- r_name.data = (void *) newname;
- r_name.size = newlen;
- if ((ret = __log_register_log(dbenv,
- NULL, &r_unused, 0, LOG_OPEN, &r_name, &fid_dbt,
- fnp->id, fnp->s_type, fnp->meta_pgno)) != 0)
- goto ret1;
- }
-
- /*
- * Check to see if the name is already correct.
- */
- if (set != NULL) {
- if (len != newlen || memcmp(namep, newname, len) != 0)
- *set = 1;
- else {
- *set = 0;
- goto ret1;
- }
- }
-
- /*
- * Change the name, realloc memory if necessary
- */
- if (len < newlen) {
- __db_shalloc_free(dblp->reginfo.addr,
- R_ADDR(&dblp->reginfo, fnp->name_off));
- if ((ret = __db_shalloc(
- dblp->reginfo.addr, newlen, 0, &namep)) != 0) {
- __db_err(dbenv,
- "Unable to allocate memory to register %s",
- newname);
- goto ret1;
- }
- fnp->name_off = R_OFFSET(&dblp->reginfo, namep);
- } else
- namep = R_ADDR(&dblp->reginfo, fnp->name_off);
- memcpy(namep, newname, newlen);
- } else {
-
- /*
- * If more than 1 reference, just decrement the reference
- * and return. Otherwise, free the name if one exists.
- */
- DB_ASSERT(fnp->ref >= 1);
- --fnp->ref;
- if (fnp->ref == 0) {
- if (fnp->name_off != INVALID_ROFF)
- __db_shalloc_free(dblp->reginfo.addr,
- R_ADDR(&dblp->reginfo, fnp->name_off));
- fnp->name_off = INVALID_ROFF;
- }
-
- /*
- * Remove from the process local table. If this
- * operation is taking place during recovery, then
- * the logid was never added to the table, so do not remove it.
- */
- if (!F_ISSET(dblp, DBLOG_RECOVER))
- __log_rem_logid(dblp, dbp, fid);
- }
-
-ret1: R_UNLOCK(dbenv, &dblp->reginfo);
- return (ret);
-}
-
-/*
- * __log_file_lock -- lock a file for single access
- * This only works if logging is on.
- *
- * PUBLIC: int __log_file_lock __P((DB *));
- */
-int
-__log_file_lock(dbp)
- DB *dbp;
-{
- DB_ENV *dbenv;
- DB_LOG *dblp;
- FNAME *fnp;
- LOG *lp;
- int ret;
-
- dbenv = dbp->dbenv;
- dblp = dbenv->lg_handle;
- lp = dblp->reginfo.primary;
-
- ret = 0;
- R_LOCK(dbenv, &dblp->reginfo);
-
- for (fnp = SH_TAILQ_FIRST(&lp->fq, __fname);
- fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname)) {
- if (fnp->ref == 0)
- continue;
-
- if (!memcmp(dbp->fileid, fnp->ufid, DB_FILE_ID_LEN)) {
- if (fnp->meta_pgno == 0) {
- if (fnp->ref != 1)
- goto err;
-
- fnp->locked = 1;
- } else {
-err: __db_err(dbp->dbenv, "File is open");
- ret = EINVAL;
- goto done;
- }
-
- }
- }
-done: R_UNLOCK(dbenv, &dblp->reginfo);
- return (ret);
-}