summaryrefslogtreecommitdiff
path: root/bdb/db/db_dispatch.c
diff options
context:
space:
mode:
Diffstat (limited to 'bdb/db/db_dispatch.c')
-rw-r--r--bdb/db/db_dispatch.c1305
1 files changed, 863 insertions, 442 deletions
diff --git a/bdb/db/db_dispatch.c b/bdb/db/db_dispatch.c
index c9beac401a7..2cf29ec2f33 100644
--- a/bdb/db/db_dispatch.c
+++ b/bdb/db/db_dispatch.c
@@ -1,7 +1,7 @@
/*-
* See the file LICENSE for redistribution information.
*
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Copyright (c) 1996-2002
* Sleepycat Software. All rights reserved.
*/
/*
@@ -39,7 +39,7 @@
#include "db_config.h"
#ifndef lint
-static const char revid[] = "$Id: db_dispatch.c,v 11.41 2001/01/11 18:19:50 bostic Exp $";
+static const char revid[] = "$Id: db_dispatch.c,v 11.121 2002/09/07 17:36:31 ubell Exp $";
#endif /* not lint */
#ifndef NO_SYSTEM_INCLUDES
@@ -51,16 +51,24 @@ static const char revid[] = "$Id: db_dispatch.c,v 11.41 2001/01/11 18:19:50 bost
#endif
#include "db_int.h"
-#include "db_page.h"
-#include "db_dispatch.h"
-#include "db_am.h"
-#include "log_auto.h"
-#include "txn.h"
-#include "txn_auto.h"
-#include "log.h"
-
-static int __db_txnlist_find_internal __P((void *, db_txnlist_type,
- u_int32_t, u_int8_t [DB_FILE_ID_LEN], DB_TXNLIST **, int));
+#include "dbinc/db_page.h"
+#include "dbinc/hash.h"
+#include "dbinc/log.h"
+#include "dbinc/fop.h"
+#include "dbinc/rep.h"
+#include "dbinc/txn.h"
+
+static int __db_limbo_fix __P((DB *,
+ DB_TXN *, DB_TXNLIST *, db_pgno_t *, DBMETA *));
+static int __db_limbo_bucket __P((DB_ENV *, DB_TXN *, DB_TXNLIST *));
+static int __db_limbo_move __P((DB_ENV *, DB_TXN *, DB_TXN *, DB_TXNLIST *));
+static int __db_lock_move __P((DB_ENV *,
+ u_int8_t *, db_pgno_t, db_lockmode_t, DB_TXN *, DB_TXN *));
+static int __db_default_getpgnos __P((DB_ENV *, DB_LSN *lsnp, void *));
+static int __db_txnlist_find_internal __P((DB_ENV *, void *, db_txnlist_type,
+ u_int32_t, u_int8_t [DB_FILE_ID_LEN], DB_TXNLIST **, int));
+static int __db_txnlist_pgnoadd __P((DB_ENV *, DB_TXNHEAD *,
+ int32_t, u_int8_t [DB_FILE_ID_LEN], char *, db_pgno_t));
/*
* __db_dispatch --
@@ -71,16 +79,21 @@ static int __db_txnlist_find_internal __P((void *, db_txnlist_type,
* scripts in the tools directory). An application using a different
* recovery paradigm will supply a different dispatch function to txn_open.
*
- * PUBLIC: int __db_dispatch __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
+ * PUBLIC: int __db_dispatch __P((DB_ENV *,
+ * PUBLIC: int (**)__P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)),
+ * PUBLIC: size_t, DBT *, DB_LSN *, db_recops, void *));
*/
int
-__db_dispatch(dbenv, db, lsnp, redo, info)
+__db_dispatch(dbenv, dtab, dtabsize, db, lsnp, redo, info)
DB_ENV *dbenv; /* The environment. */
+ int (**dtab)__P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
+ size_t dtabsize; /* Size of the dtab. */
DBT *db; /* The log record upon which to dispatch. */
DB_LSN *lsnp; /* The lsn of the record being dispatched. */
db_recops redo; /* Redo this op (or undo it). */
void *info;
{
+ DB_LSN prev_lsn;
u_int32_t rectype, txnid;
int make_call, ret;
@@ -88,6 +101,9 @@ __db_dispatch(dbenv, db, lsnp, redo, info)
memcpy(&txnid, (u_int8_t *)db->data + sizeof(rectype), sizeof(txnid));
make_call = ret = 0;
+ /* If we don't have a dispatch table, it's hard to dispatch. */
+ DB_ASSERT(dtab != NULL);
+
/*
* If we find a record that is in the user's number space and they
* have specified a recovery routine, let them handle it. If they
@@ -96,17 +112,29 @@ __db_dispatch(dbenv, db, lsnp, redo, info)
*/
switch (redo) {
case DB_TXN_ABORT:
- /*
- * XXX
- * db_printlog depends on DB_TXN_ABORT not examining the TXN
- * list. If that ever changes, fix db_printlog too.
- */
+ case DB_TXN_APPLY:
+ case DB_TXN_PRINT:
make_call = 1;
break;
case DB_TXN_OPENFILES:
- if (rectype == DB_log_register)
- return (dbenv->dtab[rectype](dbenv,
- db, lsnp, redo, info));
+ /*
+ * We collect all the transactions that have
+ * "begin" records, those with no previous LSN,
+ * so that we do not abort partial transactions.
+ * These are known to be undone, otherwise the
+ * log would not have been freeable.
+ */
+ memcpy(&prev_lsn, (u_int8_t *)db->data +
+ sizeof(rectype) + sizeof(txnid), sizeof(prev_lsn));
+ if (txnid != 0 && prev_lsn.file == 0 && (ret =
+ __db_txnlist_add(dbenv, info, txnid, TXN_OK, NULL)) != 0)
+ return (ret);
+
+ /* FALLTHROUGH */
+ case DB_TXN_POPENFILES:
+ if (rectype == DB___dbreg_register ||
+ rectype == DB___txn_ckp || rectype == DB___txn_recycle)
+ return (dtab[rectype](dbenv, db, lsnp, redo, info));
break;
case DB_TXN_BACKWARD_ROLL:
/*
@@ -117,43 +145,146 @@ __db_dispatch(dbenv, db, lsnp, redo, info)
* we've never seen it, then we call the appropriate recovery
* routine.
*
- * We need to always undo DB_db_noop records, so that we
+ * We need to always undo DB___db_noop records, so that we
* properly handle any aborts before the file was closed.
*/
- if (rectype == DB_log_register ||
- rectype == DB_txn_ckp || rectype == DB_db_noop
- || rectype == DB_txn_child || (txnid != 0 &&
- (ret = __db_txnlist_find(info, txnid)) != 0)) {
+ switch(rectype) {
+ case DB___txn_regop:
+ case DB___txn_recycle:
+ case DB___txn_ckp:
+ case DB___db_noop:
+ case DB___fop_file_remove:
+ case DB___txn_child:
make_call = 1;
- if (ret == DB_NOTFOUND && rectype != DB_txn_regop &&
- rectype != DB_txn_xa_regop && (ret =
- __db_txnlist_add(dbenv, info, txnid, 1)) != 0)
- return (ret);
+ break;
+
+ case DB___dbreg_register:
+ if (txnid == 0) {
+ make_call = 1;
+ break;
+ }
+ /* FALLTHROUGH */
+ default:
+ if (txnid != 0 && (ret =
+ __db_txnlist_find(dbenv,
+ info, txnid)) != TXN_COMMIT && ret != TXN_IGNORE) {
+ /*
+ * If not found then, this is an incomplete
+ * abort.
+ */
+ if (ret == TXN_NOTFOUND)
+ return (__db_txnlist_add(dbenv,
+ info, txnid, TXN_IGNORE, lsnp));
+ make_call = 1;
+ if (ret == TXN_OK &&
+ (ret = __db_txnlist_update(dbenv,
+ info, txnid,
+ rectype == DB___txn_xa_regop ?
+ TXN_PREPARE : TXN_ABORT, NULL)) != 0)
+ return (ret);
+ }
}
break;
case DB_TXN_FORWARD_ROLL:
/*
* In the forward pass, if we haven't seen the transaction,
- * do nothing, else recovery it.
+ * do nothing, else recover it.
*
- * We need to always redo DB_db_noop records, so that we
+ * We need to always redo DB___db_noop records, so that we
* properly handle any commits after the file was closed.
*/
- if (rectype == DB_log_register ||
- rectype == DB_txn_ckp ||
- rectype == DB_db_noop ||
- __db_txnlist_find(info, txnid) == 0)
+ switch(rectype) {
+ case DB___txn_recycle:
+ case DB___txn_ckp:
+ case DB___db_noop:
make_call = 1;
+ break;
+
+ default:
+ if (txnid != 0 && (ret = __db_txnlist_find(dbenv,
+ info, txnid)) == TXN_COMMIT)
+ make_call = 1;
+ else if (ret != TXN_IGNORE &&
+ (rectype == DB___ham_metagroup ||
+ rectype == DB___ham_groupalloc ||
+ rectype == DB___db_pg_alloc)) {
+ /*
+ * Because we cannot undo file extensions
+ * all allocation records must be reprocessed
+ * during rollforward in case the file was
+ * just created. It may not have been
+ * present during the backward pass.
+ */
+ make_call = 1;
+ redo = DB_TXN_BACKWARD_ALLOC;
+ } else if (rectype == DB___dbreg_register) {
+ /*
+ * This may be a transaction dbreg_register.
+ * If it is, we only make the call on a COMMIT,
+ * which we checked above. If it's not, then we
+ * should always make the call, because we need
+ * the file open information.
+ */
+ if (txnid == 0)
+ make_call = 1;
+ }
+ }
break;
+ case DB_TXN_GETPGNOS:
+ /*
+ * If this is one of DB's own log records, we simply
+ * dispatch.
+ */
+ if (rectype < DB_user_BEGIN) {
+ make_call = 1;
+ break;
+ }
+
+ /*
+ * If we're still here, this is a custom record in an
+ * application that's doing app-specific logging. Such a
+ * record doesn't have a getpgno function for the user
+ * dispatch function to call--the getpgnos functions return
+ * which pages replication needs to lock using the TXN_RECS
+ * structure, which is private and not something we want to
+ * document.
+ *
+ * Thus, we leave any necessary locking for the app's
+ * recovery function to do during the upcoming
+ * DB_TXN_APPLY. Fill in default getpgnos info (we need
+ * a stub entry for every log record that will get
+ * DB_TXN_APPLY'd) and return success.
+ */
+ return (__db_default_getpgnos(dbenv, lsnp, info));
default:
return (__db_unknown_flag(dbenv, "__db_dispatch", redo));
}
+ /*
+ * The switch statement uses ret to receive the return value of
+ * __db_txnlist_find, which returns a large number of different
+ * statuses, none of which we will be returning. For safety,
+ * let's reset this here in case we ever do a "return(ret)"
+ * below in the future.
+ */
+ ret = 0;
if (make_call) {
- if (rectype >= DB_user_BEGIN && dbenv->tx_recover != NULL)
- return (dbenv->tx_recover(dbenv, db, lsnp, redo));
- else
- return (dbenv->dtab[rectype](dbenv, db, lsnp, redo, info));
+ if (rectype >= DB_user_BEGIN && dbenv->app_dispatch != NULL)
+ return (dbenv->app_dispatch(dbenv, db, lsnp, redo));
+ else {
+ /*
+ * The size of the dtab table argument is the same as
+ * the standard table, use the standard table's size
+ * as our sanity check.
+ */
+ if (rectype > dtabsize || dtab[rectype] == NULL) {
+ __db_err(dbenv,
+ "Illegal record type %lu in log",
+ (u_long)rectype);
+ return (EINVAL);
+ }
+ return (dtab[rectype](dbenv, db, lsnp, redo, info));
+ }
}
return (0);
@@ -163,75 +294,100 @@ __db_dispatch(dbenv, db, lsnp, redo, info)
* __db_add_recovery --
*
* PUBLIC: int __db_add_recovery __P((DB_ENV *,
- * PUBLIC: int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops, void *), u_int32_t));
+ * PUBLIC: int (***)(DB_ENV *, DBT *, DB_LSN *, db_recops, void *), size_t *,
+ * PUBLIC: int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops, void *), u_int32_t));
*/
int
-__db_add_recovery(dbenv, func, ndx)
+__db_add_recovery(dbenv, dtab, dtabsize, func, ndx)
DB_ENV *dbenv;
+ int (***dtab) __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
+ size_t *dtabsize;
int (*func) __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
u_int32_t ndx;
{
- u_int32_t i, nsize;
+ size_t i, nsize;
int ret;
/* Check if we have to grow the table. */
- if (ndx >= dbenv->dtab_size) {
+ if (ndx >= *dtabsize) {
nsize = ndx + 40;
- if ((ret = __os_realloc(dbenv,
- nsize * sizeof(dbenv->dtab[0]), NULL, &dbenv->dtab)) != 0)
+ if ((ret =
+ __os_realloc(dbenv, nsize * sizeof((*dtab)[0]), dtab)) != 0)
return (ret);
- for (i = dbenv->dtab_size; i < nsize; ++i)
- dbenv->dtab[i] = NULL;
- dbenv->dtab_size = nsize;
+ for (i = *dtabsize; i < nsize; ++i)
+ (*dtab)[i] = NULL;
+ *dtabsize = nsize;
}
- dbenv->dtab[ndx] = func;
+ (*dtab)[ndx] = func;
return (0);
}
/*
- * __deprecated_recover --
- * Stub routine for deprecated recovery functions.
- *
- * PUBLIC: int __deprecated_recover
- * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *));
- */
-int
-__deprecated_recover(dbenv, dbtp, lsnp, op, info)
- DB_ENV *dbenv;
- DBT *dbtp;
- DB_LSN *lsnp;
- db_recops op;
- void *info;
-{
- COMPQUIET(dbenv, NULL);
- COMPQUIET(dbtp, NULL);
- COMPQUIET(lsnp, NULL);
- COMPQUIET(op, 0);
- COMPQUIET(info, NULL);
- return (EINVAL);
-}
-
-/*
* __db_txnlist_init --
* Initialize transaction linked list.
*
- * PUBLIC: int __db_txnlist_init __P((DB_ENV *, void *));
+ * PUBLIC: int __db_txnlist_init __P((DB_ENV *,
+ * PUBLIC: u_int32_t, u_int32_t, DB_LSN *, void *));
*/
int
-__db_txnlist_init(dbenv, retp)
+__db_txnlist_init(dbenv, low_txn, hi_txn, trunc_lsn, retp)
DB_ENV *dbenv;
+ u_int32_t low_txn, hi_txn;
+ DB_LSN *trunc_lsn;
void *retp;
{
DB_TXNHEAD *headp;
- int ret;
+ u_int32_t tmp;
+ int ret, size;
- if ((ret = __os_malloc(dbenv, sizeof(DB_TXNHEAD), NULL, &headp)) != 0)
+ /*
+ * Size a hash table.
+ * If low is zero then we are being called during rollback
+ * and we need only one slot.
+ * Hi maybe lower than low if we have recycled txnid's.
+ * The numbers here are guesses about txn density, we can afford
+ * to look at a few entries in each slot.
+ */
+ if (low_txn == 0)
+ size = 1;
+ else {
+ if (hi_txn < low_txn) {
+ tmp = hi_txn;
+ hi_txn = low_txn;
+ low_txn = tmp;
+ }
+ tmp = hi_txn - low_txn;
+ /* See if we wrapped around. */
+ if (tmp > (TXN_MAXIMUM - TXN_MINIMUM) / 2)
+ tmp = (low_txn - TXN_MINIMUM) + (TXN_MAXIMUM - hi_txn);
+ size = tmp / 5;
+ if (size < 100)
+ size = 100;
+ }
+ if ((ret = __os_malloc(dbenv,
+ sizeof(DB_TXNHEAD) + size * sizeof(headp->head), &headp)) != 0)
return (ret);
- LIST_INIT(&headp->head);
- headp->maxid = 0;
- headp->generation = 1;
+ memset(headp, 0, sizeof(DB_TXNHEAD) + size * sizeof(headp->head));
+ headp->maxid = hi_txn;
+ headp->generation = 0;
+ headp->nslots = size;
+ headp->gen_alloc = 8;
+ if ((ret = __os_malloc(dbenv, headp->gen_alloc *
+ sizeof(headp->gen_array[0]), &headp->gen_array)) != 0) {
+ __os_free(dbenv, headp);
+ return (ret);
+ }
+ headp->gen_array[0].generation = 0;
+ headp->gen_array[0].txn_min = TXN_MINIMUM;
+ headp->gen_array[0].txn_max = TXN_MAXIMUM;
+ if (trunc_lsn != NULL)
+ headp->trunc_lsn = *trunc_lsn;
+ else
+ ZERO_LSN(headp->trunc_lsn);
+ ZERO_LSN(headp->maxlsn);
+ ZERO_LSN(headp->ckplsn);
*(void **)retp = headp;
return (0);
@@ -241,132 +397,86 @@ __db_txnlist_init(dbenv, retp)
* __db_txnlist_add --
* Add an element to our transaction linked list.
*
- * PUBLIC: int __db_txnlist_add __P((DB_ENV *, void *, u_int32_t, int32_t));
+ * PUBLIC: int __db_txnlist_add __P((DB_ENV *,
+ * PUBLIC: void *, u_int32_t, int32_t, DB_LSN *));
*/
int
-__db_txnlist_add(dbenv, listp, txnid, aborted)
+__db_txnlist_add(dbenv, listp, txnid, status, lsn)
DB_ENV *dbenv;
void *listp;
u_int32_t txnid;
- int32_t aborted;
+ int32_t status;
+ DB_LSN *lsn;
{
DB_TXNHEAD *hp;
DB_TXNLIST *elp;
int ret;
- if ((ret = __os_malloc(dbenv, sizeof(DB_TXNLIST), NULL, &elp)) != 0)
+ if ((ret = __os_malloc(dbenv, sizeof(DB_TXNLIST), &elp)) != 0)
return (ret);
hp = (DB_TXNHEAD *)listp;
- LIST_INSERT_HEAD(&hp->head, elp, links);
+ LIST_INSERT_HEAD(&hp->head[DB_TXNLIST_MASK(hp, txnid)], elp, links);
elp->type = TXNLIST_TXNID;
elp->u.t.txnid = txnid;
- elp->u.t.aborted = aborted;
+ elp->u.t.status = status;
+ elp->u.t.generation = hp->generation;
if (txnid > hp->maxid)
hp->maxid = txnid;
- elp->u.t.generation = hp->generation;
+ if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT)
+ hp->maxlsn = *lsn;
+
+ DB_ASSERT(lsn == NULL ||
+ status != TXN_COMMIT || log_compare(&hp->maxlsn, lsn) >= 0);
return (0);
}
+
/*
* __db_txnlist_remove --
* Remove an element from our transaction linked list.
*
- * PUBLIC: int __db_txnlist_remove __P((void *, u_int32_t));
+ * PUBLIC: int __db_txnlist_remove __P((DB_ENV *, void *, u_int32_t));
*/
int
-__db_txnlist_remove(listp, txnid)
+__db_txnlist_remove(dbenv, listp, txnid)
+ DB_ENV *dbenv;
void *listp;
u_int32_t txnid;
{
DB_TXNLIST *entry;
- return (__db_txnlist_find_internal(listp,
- TXNLIST_TXNID, txnid, NULL, &entry, 1));
-}
-
-/* __db_txnlist_close --
- *
- * Call this when we close a file. It allows us to reconcile whether
- * we have done any operations on this file with whether the file appears
- * to have been deleted. If you never do any operations on a file, then
- * we assume it's OK to appear deleted.
- *
- * PUBLIC: int __db_txnlist_close __P((void *, int32_t, u_int32_t));
- */
-
-int
-__db_txnlist_close(listp, lid, count)
- void *listp;
- int32_t lid;
- u_int32_t count;
-{
- DB_TXNHEAD *hp;
- DB_TXNLIST *p;
-
- hp = (DB_TXNHEAD *)listp;
- for (p = LIST_FIRST(&hp->head); p != NULL; p = LIST_NEXT(p, links)) {
- if (p->type == TXNLIST_DELETE)
- if (lid == p->u.d.fileid &&
- !F_ISSET(&p->u.d, TXNLIST_FLAG_CLOSED)) {
- p->u.d.count += count;
- return (0);
- }
- }
-
- return (0);
+ return (__db_txnlist_find_internal(dbenv,
+ listp, TXNLIST_TXNID, txnid,
+ NULL, &entry, 1) == TXN_NOTFOUND ? TXN_NOTFOUND : TXN_OK);
}
/*
- * __db_txnlist_delete --
- *
- * Record that a file was missing or deleted. If the deleted
- * flag is set, then we've encountered a delete of a file, else we've
- * just encountered a file that is missing. The lid is the log fileid
- * and is only meaningful if deleted is not equal to 0.
+ * __db_txnlist_ckp --
+ * Used to record the maximum checkpoint that will be retained
+ * after recovery. Typically this is simply the max checkpoint, but
+ * if we are doing client replication recovery or timestamp-based
+ * recovery, we are going to virtually truncate the log and we need
+ * to retain the last checkpoint before the truncation point.
*
- * PUBLIC: int __db_txnlist_delete __P((DB_ENV *,
- * PUBLIC: void *, char *, u_int32_t, int));
+ * PUBLIC: void __db_txnlist_ckp __P((DB_ENV *, void *, DB_LSN *));
*/
-int
-__db_txnlist_delete(dbenv, listp, name, lid, deleted)
+void
+__db_txnlist_ckp(dbenv, listp, ckp_lsn)
DB_ENV *dbenv;
void *listp;
- char *name;
- u_int32_t lid;
- int deleted;
+ DB_LSN *ckp_lsn;
{
DB_TXNHEAD *hp;
- DB_TXNLIST *p;
- int ret;
- hp = (DB_TXNHEAD *)listp;
- for (p = LIST_FIRST(&hp->head); p != NULL; p = LIST_NEXT(p, links)) {
- if (p->type == TXNLIST_DELETE)
- if (strcmp(name, p->u.d.fname) == 0) {
- if (deleted)
- F_SET(&p->u.d, TXNLIST_FLAG_DELETED);
- else
- F_CLR(&p->u.d, TXNLIST_FLAG_CLOSED);
- return (0);
- }
- }
-
- /* Need to add it. */
- if ((ret = __os_malloc(dbenv, sizeof(DB_TXNLIST), NULL, &p)) != 0)
- return (ret);
- LIST_INSERT_HEAD(&hp->head, p, links);
+ COMPQUIET(dbenv, NULL);
- p->type = TXNLIST_DELETE;
- p->u.d.flags = 0;
- if (deleted)
- F_SET(&p->u.d, TXNLIST_FLAG_DELETED);
- p->u.d.fileid = lid;
- p->u.d.count = 0;
- ret = __os_strdup(dbenv, name, &p->u.d.fname);
+ hp = (DB_TXNHEAD *)listp;
- return (ret);
+ if (IS_ZERO_LSN(hp->ckplsn) && !IS_ZERO_LSN(hp->maxlsn) &&
+ log_compare(&hp->maxlsn, ckp_lsn) >= 0)
+ hp->ckplsn = *ckp_lsn;
}
/*
@@ -383,99 +493,156 @@ __db_txnlist_end(dbenv, listp)
{
DB_TXNHEAD *hp;
DB_TXNLIST *p;
- DB_LOG *lp;
+ int i;
- hp = (DB_TXNHEAD *)listp;
- lp = (DB_LOG *)dbenv->lg_handle;
- while (hp != NULL && (p = LIST_FIRST(&hp->head)) != NULL) {
- LIST_REMOVE(p, links);
- switch (p->type) {
- case TXNLIST_DELETE:
- /*
- * If we have a file that is not deleted and has
- * some operations, we flag the warning. Since
- * the file could still be open, we need to check
- * the actual log table as well.
- */
- if ((!F_ISSET(&p->u.d, TXNLIST_FLAG_DELETED) &&
- p->u.d.count != 0) ||
- (!F_ISSET(&p->u.d, TXNLIST_FLAG_CLOSED) &&
- p->u.d.fileid != (int32_t) TXNLIST_INVALID_ID &&
- p->u.d.fileid < lp->dbentry_cnt &&
- lp->dbentry[p->u.d.fileid].count != 0))
- __db_err(dbenv, "warning: %s: %s",
- p->u.d.fname, db_strerror(ENOENT));
- __os_freestr(p->u.d.fname);
- break;
- case TXNLIST_LSN:
- __os_free(p->u.l.lsn_array,
- p->u.l.maxn * sizeof(DB_LSN));
- break;
- default:
- /* Possibly an incomplete DB_TXNLIST; just free it. */
- break;
+ if ((hp = (DB_TXNHEAD *)listp) == NULL)
+ return;
+
+ for (i = 0; i < hp->nslots; i++)
+ while (hp != NULL && (p = LIST_FIRST(&hp->head[i])) != NULL) {
+ LIST_REMOVE(p, links);
+ switch (p->type) {
+ case TXNLIST_LSN:
+ __os_free(dbenv, p->u.l.lsn_array);
+ break;
+ default:
+ /*
+ * Possibly an incomplete DB_TXNLIST; just
+ * free it.
+ */
+ break;
+ }
+ __os_free(dbenv, p);
}
- __os_free(p, sizeof(DB_TXNLIST));
- }
- __os_free(listp, sizeof(DB_TXNHEAD));
+
+ if (hp->gen_array != NULL)
+ __os_free(dbenv, hp->gen_array);
+ __os_free(dbenv, listp);
}
/*
* __db_txnlist_find --
* Checks to see if a txnid with the current generation is in the
- * txnid list. This returns DB_NOTFOUND if the item isn't in the
- * list otherwise it returns (like __db_txnlist_find_internal) a
- * 1 or 0 indicating if the transaction is aborted or not. A txnid
- * of 0 means the record was generated while not in a transaction.
+ * txnid list. This returns TXN_NOTFOUND if the item isn't in the
+ * list otherwise it returns (like __db_txnlist_find_internal)
+ * the status of the transaction. A txnid of 0 means the record
+ * was generated while not in a transaction.
*
- * PUBLIC: int __db_txnlist_find __P((void *, u_int32_t));
+ * PUBLIC: int __db_txnlist_find __P((DB_ENV *, void *, u_int32_t));
*/
int
-__db_txnlist_find(listp, txnid)
+__db_txnlist_find(dbenv, listp, txnid)
+ DB_ENV *dbenv;
void *listp;
u_int32_t txnid;
{
DB_TXNLIST *entry;
if (txnid == 0)
- return (DB_NOTFOUND);
- return (__db_txnlist_find_internal(listp,
- TXNLIST_TXNID, txnid, NULL, &entry, 0));
+ return (TXN_NOTFOUND);
+ return (__db_txnlist_find_internal(dbenv, listp,
+ TXNLIST_TXNID, txnid, NULL, &entry, 0));
+}
+
+/*
+ * __db_txnlist_update --
+ * Change the status of an existing transaction entry.
+ * Returns TXN_NOTFOUND if no such entry exists.
+ *
+ * PUBLIC: int __db_txnlist_update __P((DB_ENV *,
+ * PUBLIC: void *, u_int32_t, u_int32_t, DB_LSN *));
+ */
+int
+__db_txnlist_update(dbenv, listp, txnid, status, lsn)
+ DB_ENV *dbenv;
+ void *listp;
+ u_int32_t txnid;
+ u_int32_t status;
+ DB_LSN *lsn;
+{
+ DB_TXNHEAD *hp;
+ DB_TXNLIST *elp;
+ int ret;
+
+ if (txnid == 0)
+ return (TXN_NOTFOUND);
+ hp = (DB_TXNHEAD *)listp;
+ ret = __db_txnlist_find_internal(dbenv,
+ listp, TXNLIST_TXNID, txnid, NULL, &elp, 0);
+
+ if (ret == TXN_NOTFOUND)
+ return (ret);
+ elp->u.t.status = status;
+
+ if (lsn != NULL && IS_ZERO_LSN(hp->maxlsn) && status == TXN_COMMIT)
+ hp->maxlsn = *lsn;
+
+ return (ret);
}
/*
* __db_txnlist_find_internal --
- * Find an entry on the transaction list.
- * If the entry is not there or the list pointeris not initialized
- * we return DB_NOTFOUND. If the item is found, we return the aborted
- * status (1 for aborted, 0 for not aborted). Currently we always call
- * this with an initialized list pointer but checking for NULL keeps it general.
+ * Find an entry on the transaction list. If the entry is not there or
+ * the list pointer is not initialized we return TXN_NOTFOUND. If the
+ * item is found, we return the status. Currently we always call this
+ * with an initialized list pointer but checking for NULL keeps it general.
*/
static int
-__db_txnlist_find_internal(listp, type, txnid, uid, txnlistp, delete)
+__db_txnlist_find_internal(dbenv, listp, type, txnid, uid, txnlistp, delete)
+ DB_ENV *dbenv;
void *listp;
db_txnlist_type type;
- u_int32_t txnid;
+ u_int32_t txnid;
u_int8_t uid[DB_FILE_ID_LEN];
DB_TXNLIST **txnlistp;
int delete;
{
DB_TXNHEAD *hp;
DB_TXNLIST *p;
- int ret;
+ int32_t generation;
+ u_int32_t hash;
+ struct __db_headlink *head;
+ int i, ret;
if ((hp = (DB_TXNHEAD *)listp) == NULL)
- return (DB_NOTFOUND);
+ return (TXN_NOTFOUND);
+
+ switch (type) {
+ case TXNLIST_TXNID:
+ hash = txnid;
+ /* Find the most recent generation containing this ID */
+ for (i = 0; i <= hp->generation; i++)
+ /* The range may wrap around the end. */
+ if (hp->gen_array[i].txn_min <
+ hp->gen_array[i].txn_max ?
+ (txnid >= hp->gen_array[i].txn_min &&
+ txnid <= hp->gen_array[i].txn_max) :
+ (txnid >= hp->gen_array[i].txn_min ||
+ txnid <= hp->gen_array[i].txn_max))
+ break;
+ DB_ASSERT(i <= hp->generation);
+ generation = hp->gen_array[i].generation;
+ break;
+ case TXNLIST_PGNO:
+ memcpy(&hash, uid, sizeof(hash));
+ generation = 0;
+ break;
+ default:
+ DB_ASSERT(0);
+ return (EINVAL);
+ }
+
+ head = &hp->head[DB_TXNLIST_MASK(hp, hash)];
- for (p = LIST_FIRST(&hp->head); p != NULL; p = LIST_NEXT(p, links)) {
+ for (p = LIST_FIRST(head); p != NULL; p = LIST_NEXT(p, links)) {
if (p->type != type)
continue;
switch (type) {
case TXNLIST_TXNID:
if (p->u.t.txnid != txnid ||
- hp->generation != p->u.t.generation)
+ generation != p->u.t.generation)
continue;
- ret = p->u.t.aborted;
+ ret = p->u.t.status;
break;
case TXNLIST_PGNO:
@@ -490,42 +657,67 @@ __db_txnlist_find_internal(listp, type, txnid, uid, txnlistp, delete)
}
if (delete == 1) {
LIST_REMOVE(p, links);
- __os_free(p, sizeof(DB_TXNLIST));
- } else if (p != LIST_FIRST(&hp->head)) {
+ __os_free(dbenv, p);
+ } else if (p != LIST_FIRST(head)) {
/* Move it to head of list. */
LIST_REMOVE(p, links);
- LIST_INSERT_HEAD(&hp->head, p, links);
+ LIST_INSERT_HEAD(head, p, links);
}
*txnlistp = p;
return (ret);
}
- return (DB_NOTFOUND);
+ return (TXN_NOTFOUND);
}
/*
* __db_txnlist_gen --
* Change the current generation number.
*
- * PUBLIC: void __db_txnlist_gen __P((void *, int));
+ * PUBLIC: int __db_txnlist_gen __P((DB_ENV *,
+ * PUBLIC: void *, int, u_int32_t, u_int32_t));
*/
-void
-__db_txnlist_gen(listp, incr)
+int
+__db_txnlist_gen(dbenv, listp, incr, min, max)
+ DB_ENV *dbenv;
void *listp;
int incr;
+ u_int32_t min, max;
{
DB_TXNHEAD *hp;
+ int ret;
/*
- * During recovery generation numbers keep track of how many "restart"
- * checkpoints we've seen. Restart checkpoints occur whenever we take
- * a checkpoint and there are no outstanding transactions. When that
- * happens, we can reset transaction IDs back to 1. It always happens
- * at recovery and it prevents us from exhausting the transaction IDs
- * name space.
+ * During recovery generation numbers keep track of "restart"
+ * checkpoints and recycle records. Restart checkpoints occur
+ * whenever we take a checkpoint and there are no outstanding
+ * transactions. When that happens, we can reset transaction IDs
+ * back to TXNID_MINIMUM. Currently we only do the reset
+ * at then end of recovery. Recycle records occrur when txnids
+ * are exhausted during runtime. A free range of ids is identified
+ * and logged. This code maintains a stack of ranges. A txnid
+ * is given the generation number of the first range it falls into
+ * in the stack.
*/
hp = (DB_TXNHEAD *)listp;
hp->generation += incr;
+ if (incr < 0)
+ memmove(hp->gen_array, &hp->gen_array[1],
+ (hp->generation + 1) * sizeof(hp->gen_array[0]));
+ else {
+ if (hp->generation >= hp->gen_alloc) {
+ hp->gen_alloc *= 2;
+ if ((ret = __os_realloc(dbenv, hp->gen_alloc *
+ sizeof(hp->gen_array[0]), &hp->gen_array)) != 0)
+ return (ret);
+ }
+ memmove(&hp->gen_array[1], &hp->gen_array[0],
+ hp->generation * sizeof(hp->gen_array[0]));
+ hp->gen_array[0].generation = hp->generation;
+ hp->gen_array[0].txn_min = min;
+ hp->gen_array[0].txn_max = max;
+ }
+ return (0);
}
#define TXN_BUBBLE(AP, MAX) { \
@@ -542,10 +734,10 @@ __db_txnlist_gen(listp, incr)
/*
* __db_txnlist_lsnadd --
- * Add to or re-sort the transaction list lsn entry.
- * Note that since this is used during an abort, the __txn_undo
- * code calls into the "recovery" subsystem explicitly, and there
- * is only a single TXNLIST_LSN entry on the list.
+ * Add to or re-sort the transaction list lsn entry. Note that since this
+ * is used during an abort, the __txn_undo code calls into the "recovery"
+ * subsystem explicitly, and there is only a single TXNLIST_LSN entry on
+ * the list.
*
* PUBLIC: int __db_txnlist_lsnadd __P((DB_ENV *, void *, DB_LSN *, u_int32_t));
*/
@@ -562,19 +754,19 @@ __db_txnlist_lsnadd(dbenv, listp, lsnp, flags)
hp = (DB_TXNHEAD *)listp;
- for (elp = LIST_FIRST(&hp->head);
+ for (elp = LIST_FIRST(&hp->head[0]);
elp != NULL; elp = LIST_NEXT(elp, links))
if (elp->type == TXNLIST_LSN)
break;
if (elp == NULL)
- return (EINVAL);
+ return (DB_SURPRISE_KID);
if (LF_ISSET(TXNLIST_NEW)) {
if (elp->u.l.ntxns >= elp->u.l.maxn) {
if ((ret = __os_realloc(dbenv,
2 * elp->u.l.maxn * sizeof(DB_LSN),
- NULL, &elp->u.l.lsn_array)) != 0)
+ &elp->u.l.lsn_array)) != 0)
return (ret);
elp->u.l.maxn *= 2;
}
@@ -584,9 +776,9 @@ __db_txnlist_lsnadd(dbenv, listp, lsnp, flags)
elp->u.l.lsn_array[0] = *lsnp;
/*
- * If we just added a new entry and there may be NULL
- * entries, so we have to do a complete bubble sort,
- * not just trickle a changed entry around.
+ * If we just added a new entry and there may be NULL entries, so we
+ * have to do a complete bubble sort, not just trickle a changed entry
+ * around.
*/
for (i = 0; i < (!LF_ISSET(TXNLIST_NEW) ? 1 : elp->u.l.ntxns); i++)
TXN_BUBBLE(elp->u.l.lsn_array, elp->u.l.ntxns);
@@ -597,35 +789,6 @@ __db_txnlist_lsnadd(dbenv, listp, lsnp, flags)
}
/*
- * __db_txnlist_lsnhead --
- * Return a pointer to the beginning of the lsn_array.
- *
- * PUBLIC: int __db_txnlist_lsnhead __P((void *, DB_LSN **));
- */
-int
-__db_txnlist_lsnhead(listp, lsnpp)
- void *listp;
- DB_LSN **lsnpp;
-{
- DB_TXNHEAD *hp;
- DB_TXNLIST *elp;
-
- hp = (DB_TXNHEAD *)listp;
-
- for (elp = LIST_FIRST(&hp->head);
- elp != NULL; elp = LIST_NEXT(elp, links))
- if (elp->type == TXNLIST_LSN)
- break;
-
- if (elp == NULL)
- return (EINVAL);
-
- *lsnpp = &elp->u.l.lsn_array[0];
-
- return (0);
-}
-
-/*
* __db_txnlist_lsninit --
* Initialize a transaction list with an lsn array entry.
*
@@ -642,12 +805,12 @@ __db_txnlist_lsninit(dbenv, hp, lsnp)
elp = NULL;
- if ((ret = __os_malloc(dbenv, sizeof(DB_TXNLIST), NULL, &elp)) != 0)
+ if ((ret = __os_malloc(dbenv, sizeof(DB_TXNLIST), &elp)) != 0)
goto err;
- LIST_INSERT_HEAD(&hp->head, elp, links);
+ LIST_INSERT_HEAD(&hp->head[0], elp, links);
if ((ret = __os_malloc(dbenv,
- 12 * sizeof(DB_LSN), NULL, &elp->u.l.lsn_array)) != 0)
+ 12 * sizeof(DB_LSN), &elp->u.l.lsn_array)) != 0)
goto err;
elp->type = TXNLIST_LSN;
elp->u.l.maxn = 12;
@@ -662,8 +825,7 @@ err: __db_txnlist_end(dbenv, hp);
/*
* __db_add_limbo -- add pages to the limbo list.
- * Get the file information and call pgnoadd
- * for each page.
+ * Get the file information and call pgnoadd for each page.
*
* PUBLIC: int __db_add_limbo __P((DB_ENV *,
* PUBLIC: void *, int32_t, db_pgno_t, int32_t));
@@ -681,7 +843,7 @@ __db_add_limbo(dbenv, info, fileid, pgno, count)
int ret;
dblp = dbenv->lg_handle;
- if ((ret = __log_lid_to_fname(dblp, fileid, &fnp)) != 0)
+ if ((ret = __dbreg_id_to_fname(dblp, fileid, 0, &fnp)) != 0)
return (ret);
do {
@@ -698,201 +860,429 @@ __db_add_limbo(dbenv, info, fileid, pgno, count)
/*
* __db_do_the_limbo -- move pages from limbo to free.
*
- * If we are in recovery we add things to the free list without
- * logging becasue we want to incrementaly apply logs that
- * may be generated on another copy of this environment.
- * Otherwise we just call __db_free to put the pages on
- * the free list and log the activity.
+ * Limbo processing is what ensures that we correctly handle and
+ * recover from page allocations. During recovery, for each database,
+ * we process each in-question allocation, link them into the free list
+ * and then write out the new meta-data page that contains the pointer
+ * to the new beginning of the free list. On an abort, we use our
+ * standard __db_free mechanism in a compensating transaction which logs
+ * the specific modifications to the free list.
+ *
+ * If we run out of log space during an abort, then we can't write the
+ * compensating transaction, so we abandon the idea of a compenating
+ * transaction, and go back to processing how we do during recovery.
+ * The reason that this is not the norm is that it's expensive: it requires
+ * that we flush any database with an in-question allocation. Thus if
+ * a compensating transaction fails, we never try to restart it.
+ *
+ * Since files may be open and closed within transactions (in particular,
+ * the master database for subdatabases), we must be prepared to open
+ * files during this process. If there is a compensating transaction, we
+ * can open the files in that transaction. If this was an abort and there
+ * is no compensating transaction, then we've got to perform these opens
+ * in the context of the aborting transaction so that we do not deadlock.
+ * During recovery, there's no locking, so this isn't an issue.
*
- * PUBLIC: int __db_do_the_limbo __P((DB_ENV *, DB_TXNHEAD *));
+ * What you want to keep in mind when reading this is that there are two
+ * algorithms going on here: ctxn == NULL, then we're either in recovery
+ * or our compensating transaction has failed and we're doing the
+ * "create list and write meta-data page" algorithm. Otherwise, we're in
+ * an abort and doing the "use compensating transaction" algorithm.
+ *
+ * PUBLIC: int __db_do_the_limbo __P((DB_ENV *,
+ * PUBLIC: DB_TXN *, DB_TXN *, DB_TXNHEAD *));
*/
int
-__db_do_the_limbo(dbenv, hp)
+__db_do_the_limbo(dbenv, ptxn, txn, hp)
DB_ENV *dbenv;
+ DB_TXN *ptxn, *txn;
DB_TXNHEAD *hp;
{
- DB *dbp;
- DBC *dbc;
- DBMETA *meta;
- DB_TXN *txn;
DB_TXNLIST *elp;
- PAGE *pagep;
- db_pgno_t last_pgno, pgno;
- int i, in_recover, put_page, ret, t_ret;
+ int h, ret;
- dbp = NULL;
- dbc = NULL;
- txn = NULL;
ret = 0;
+ /*
+ * The slots correspond to hash buckets. We've hashed the
+ * fileids into hash buckets and need to pick up all affected
+ * files. (There will only be a single slot for an abort.)
+ */
+ for (h = 0; h < hp->nslots; h++) {
+ if ((elp = LIST_FIRST(&hp->head[h])) == NULL)
+ continue;
+ if (ptxn != NULL) {
+ if ((ret =
+ __db_limbo_move(dbenv, ptxn, txn, elp)) != 0)
+ goto err;
+ } else if ((ret = __db_limbo_bucket(dbenv, txn, elp)) != 0)
+ goto err;
+ }
+
+err: if (ret != 0) {
+ __db_err(dbenv, "Fatal error in abort of an allocation");
+ ret = __db_panic(dbenv, ret);
+ }
- /* Are we in recovery? */
- in_recover = F_ISSET((DB_LOG *)dbenv->lg_handle, DBLOG_RECOVER);
+ return (ret);
+}
- for (elp = LIST_FIRST(&hp->head);
- elp != NULL; elp = LIST_NEXT(elp, links)) {
+/* Limbo support routines. */
+
+/*
+ * __db_lock_move --
+ * Move a lock from child to parent.
+ */
+static int
+__db_lock_move(dbenv, fileid, pgno, mode, ptxn, txn)
+ DB_ENV *dbenv;
+ u_int8_t *fileid;
+ db_pgno_t pgno;
+ db_lockmode_t mode;
+ DB_TXN *ptxn, *txn;
+{
+ DBT lock_dbt;
+ DB_LOCK lock;
+ DB_LOCK_ILOCK lock_obj;
+ DB_LOCKREQ req;
+ int ret;
+
+ lock_obj.pgno = pgno;
+ memcpy(lock_obj.fileid, fileid, DB_FILE_ID_LEN);
+ lock_obj.type = DB_PAGE_LOCK;
+
+ memset(&lock_dbt, 0, sizeof(lock_dbt));
+ lock_dbt.data = &lock_obj;
+ lock_dbt.size = sizeof(lock_obj);
+
+ if ((ret = dbenv->lock_get(dbenv,
+ txn->txnid, 0, &lock_dbt, mode, &lock)) == 0) {
+ memset(&req, 0, sizeof(req));
+ req.lock = lock;
+ req.op = DB_LOCK_TRADE;
+
+ ret = dbenv->lock_vec(dbenv, ptxn->txnid, 0, &req, 1, NULL);
+ }
+ return (ret);
+}
+
+/*
+ * __db_limbo_move
+ * Move just the metapage lock to the parent.
+ */
+static int
+__db_limbo_move(dbenv, ptxn, txn, elp)
+ DB_ENV *dbenv;
+ DB_TXN *ptxn, *txn;
+ DB_TXNLIST *elp;
+{
+ int ret;
+
+ for (; elp != NULL; elp = LIST_NEXT(elp, links)) {
+ if (elp->type != TXNLIST_PGNO || elp->u.p.locked == 1)
+ continue;
+ if ((ret = __db_lock_move(dbenv, elp->u.p.uid,
+ PGNO_BASE_MD, DB_LOCK_WRITE, ptxn, txn)) != 0)
+ return (ret);
+ elp->u.p.locked = 1;
+ }
+
+ return (0);
+}
+/*
+ * __db_limbo_bucket
+ * Perform limbo processing for a single hash bucket in the txnlist.
+ * txn is the transaction aborting in the case of an abort and ctxn is the
+ * compensating transaction.
+ */
+
+#define T_RESTORED(txn) ((txn) != NULL && F_ISSET(txn, TXN_RESTORED))
+static int
+__db_limbo_bucket(dbenv, txn, elp)
+ DB_ENV *dbenv;
+ DB_TXN *txn;
+ DB_TXNLIST *elp;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ DBMETA *meta;
+ DB_TXN *ctxn, *t;
+ db_pgno_t last_pgno, pgno;
+ int dbp_created, in_retry, ret, t_ret;
+
+ ctxn = NULL;
+ in_retry = 0;
+ meta = NULL;
+ mpf = NULL;
+ ret = 0;
+ for (; elp != NULL; elp = LIST_NEXT(elp, links)) {
if (elp->type != TXNLIST_PGNO)
continue;
+retry: dbp_created = 0;
+
+ /*
+ * Pick the transaction in which to potentially
+ * log compensations.
+ */
+ if (!in_retry && !IS_RECOVERING(dbenv) && !T_RESTORED(txn)
+ && (ret = __txn_compensate_begin(dbenv, &ctxn)) != 0)
+ return (ret);
+
+ /*
+ * Either use the compensating transaction or
+ * the one passed in, which will be null if recovering.
+ */
+ t = ctxn == NULL ? txn : ctxn;
+
+ /* First try to get a dbp by fileid. */
+ ret = __dbreg_id_to_db(dbenv, t, &dbp, elp->u.p.fileid, 0);
+
+ /*
+ * File is being destroyed. No need to worry about
+ * dealing with recovery of allocations.
+ */
+ if (ret == DB_DELETED ||
+ (ret == 0 && F_ISSET(dbp, DB_AM_DISCARD)))
+ goto next;
- if (in_recover) {
+ if (ret != 0) {
if ((ret = db_create(&dbp, dbenv, 0)) != 0)
goto err;
/*
- * It is ok if the file is nolonger there.
+ * This tells the system not to lock, which is always
+ * OK, whether this is an abort or recovery.
*/
+ F_SET(dbp, DB_AM_COMPENSATE);
+ dbp_created = 1;
+
+ /* It is ok if the file is nolonger there. */
dbp->type = DB_UNKNOWN;
- ret = __db_dbopen(dbp,
- elp->u.p.fname, 0, __db_omode("rw----"), 0);
+ ret = __db_dbopen(dbp, t, elp->u.p.fname, NULL,
+ DB_ODDFILESIZE, __db_omode("rw----"), PGNO_BASE_MD);
+ if (ret == ENOENT)
+ goto next;
+ }
+
+ /*
+ * Verify that we are opening the same file that we were
+ * referring to when we wrote this log record.
+ */
+ if (memcmp(elp->u.p.uid, dbp->fileid, DB_FILE_ID_LEN) != 0)
+ goto next;
+
+ mpf = dbp->mpf;
+ last_pgno = PGNO_INVALID;
+
+ if (ctxn == NULL) {
+ pgno = PGNO_BASE_MD;
+ if ((ret =
+ mpf->get(mpf, &pgno, 0, (PAGE **)&meta)) != 0)
+ goto err;
+ last_pgno = meta->free;
+ }
+
+ ret = __db_limbo_fix(dbp, ctxn, elp, &last_pgno, meta);
+ /*
+ * If we were doing compensating transactions, then we are
+ * going to hope this error was due to running out of space.
+ * We'll change modes (into the sync the file mode) and keep
+ * trying. If we weren't doing compensating transactions,
+ * then this is a real error and we're sunk.
+ */
+ if (ret != 0) {
+ if (ret == DB_RUNRECOVERY || ctxn == NULL)
+ goto err;
+ in_retry = 1;
+ goto retry;
+ }
+
+ if (ctxn != NULL) {
+ ret = ctxn->commit(ctxn, DB_TXN_NOSYNC);
+ ctxn = NULL;
+ if (ret != 0)
+ goto retry;
+ goto next;
+ }
+
+ /*
+ * This is where we handle the case where we're explicitly
+ * putting together a free list. We need to decide whether
+ * we have to write the meta-data page, and if we do, then
+ * we need to sync it as well.
+ */
+ if (last_pgno == meta->free) {
+ /* No change to page; just put the page back. */
+ if ((ret = mpf->put(mpf, meta, 0)) != 0)
+ goto err;
+ meta = NULL;
} else {
/*
- * If we are in transaction undo, then we know
- * the fileid is still correct.
+ * These changes are unlogged so we cannot have the
+ * metapage pointing at pages that are not on disk.
+ * Therefore, we flush the new free list, then update
+ * the metapage. We have to put the meta-data page
+ * first so that it isn't pinned when we try to sync.
*/
+ if (!IS_RECOVERING(dbenv) && !T_RESTORED(txn))
+ __db_err(dbenv, "Flushing free list to disk");
+ if ((ret = mpf->put(mpf, meta, 0)) != 0)
+ goto err;
+ meta = NULL;
+ dbp->sync(dbp, 0);
+ pgno = PGNO_BASE_MD;
if ((ret =
- __db_fileid_to_db(dbenv, &dbp,
- elp->u.p.fileid, 0)) != 0 && ret != DB_DELETED)
+ mpf->get(mpf, &pgno, 0, (PAGE **)&meta)) != 0)
+ goto err;
+ meta->free = last_pgno;
+ if ((ret = mpf->put(mpf, meta, DB_MPOOL_DIRTY)) != 0)
goto err;
- /* File is being destroyed. */
- if (F_ISSET(dbp, DB_AM_DISCARD))
- ret = DB_DELETED;
+ meta = NULL;
}
+
+next:
/*
- * Verify that we are opening the same file that we were
- * referring to when we wrote this log record.
+ * If we get here, either we have processed the list
+ * or the db file has been deleted or could no be opened.
*/
- if (ret == 0 &&
- memcmp(elp->u.p.uid, dbp->fileid, DB_FILE_ID_LEN) == 0) {
- last_pgno = PGNO_INVALID;
- if (in_recover) {
- pgno = PGNO_BASE_MD;
- if ((ret = memp_fget(dbp->mpf,
- &pgno, 0, (PAGE **)&meta)) != 0)
- goto err;
- last_pgno = meta->free;
- /*
- * Check to see if the head of the free
- * list is any of the pages we are about
- * to link in. We could have crashed
- * after linking them in and before writing
- * a checkpoint.
- * It may not be the last one since
- * any page may get reallocated before here.
- */
- if (last_pgno != PGNO_INVALID)
- for (i = 0; i < elp->u.p.nentries; i++)
- if (last_pgno
- == elp->u.p.pgno_array[i])
- goto done_it;
- }
+ if (ctxn != NULL &&
+ (t_ret = ctxn->abort(ctxn)) != 0 && ret == 0)
+ ret = t_ret;
- for (i = 0; i < elp->u.p.nentries; i++) {
- pgno = elp->u.p.pgno_array[i];
- if ((ret = memp_fget(dbp->mpf,
- &pgno, DB_MPOOL_CREATE, &pagep)) != 0)
- goto err;
+ if (dbp_created &&
+ (t_ret = __db_close_i(dbp, txn, 0)) != 0 && ret == 0)
+ ret = t_ret;
+ dbp = NULL;
+ __os_free(dbenv, elp->u.p.fname);
+ __os_free(dbenv, elp->u.p.pgno_array);
+ if (ret == ENOENT)
+ ret = 0;
+ else if (ret != 0)
+ goto err;
+ }
- put_page = 1;
- if (IS_ZERO_LSN(LSN(pagep))) {
- P_INIT(pagep, dbp->pgsize,
- pgno, PGNO_INVALID,
- last_pgno, 0, P_INVALID);
-
- if (in_recover) {
- LSN(pagep) = LSN(meta);
- last_pgno = pgno;
- } else {
- /*
- * Starting the transaction
- * is postponed until we know
- * we have something to do.
- */
- if (txn == NULL &&
- (ret = txn_begin(dbenv,
- NULL, &txn, 0)) != 0)
- goto err;
-
- if (dbc == NULL &&
- (ret = dbp->cursor(dbp,
- txn, &dbc, 0)) != 0)
- goto err;
- /* Turn off locking. */
- F_SET(dbc, DBC_COMPENSATE);
-
- /* __db_free puts the page. */
- if ((ret =
- __db_free(dbc, pagep)) != 0)
- goto err;
- put_page = 0;
- }
- }
+err: if (meta != NULL)
+ (void)mpf->put(mpf, meta, 0);
+ return (ret);
+}
- if (put_page == 1 &&
- (ret = memp_fput(dbp->mpf,
- pagep, DB_MPOOL_DIRTY)) != 0)
- goto err;
- }
- if (in_recover) {
- if (last_pgno == meta->free) {
-done_it:
+/*
+ * __db_limbo_fix --
+ * Process a single limbo entry which describes all the page allocations
+ * for a single file.
+ */
+static int
+__db_limbo_fix(dbp, ctxn, elp, lastp, meta)
+ DB *dbp;
+ DB_TXN *ctxn;
+ DB_TXNLIST *elp;
+ db_pgno_t *lastp;
+ DBMETA *meta;
+{
+ DBC *dbc;
+ DB_MPOOLFILE *mpf;
+ PAGE *freep, *pagep;
+ db_pgno_t next, pgno;
+ int i, put_page, ret, t_ret;
+
+ /*
+ * Loop through the entries for this txnlist element and
+ * either link them into the free list or write a compensating
+ * record for each.
+ */
+ put_page = 0;
+ ret = 0;
+ mpf = dbp->mpf;
+ dbc = NULL;
+
+ for (i = 0; i < elp->u.p.nentries; i++) {
+ pgno = elp->u.p.pgno_array[i];
+
+ if ((ret = mpf->get(mpf, &pgno, DB_MPOOL_CREATE, &pagep)) != 0)
+ goto err;
+ put_page = 1;
+
+ if (IS_ZERO_LSN(LSN(pagep))) {
+ if (ctxn == NULL) {
+ /*
+ * If this is a fatal recovery which
+ * spans a previous crash this page may
+ * be on the free list already.
+ */
+ for (next = *lastp; next != 0; ) {
+ if (next == pgno)
+ break;
+ if ((ret = mpf->get(mpf,
+ &next, 0, &freep)) != 0)
+ goto err;
+ next = NEXT_PGNO(freep);
if ((ret =
- memp_fput(dbp->mpf, meta, 0)) != 0)
+ mpf->put(mpf, freep, 0)) != 0)
goto err;
- } else {
- /*
- * Flush the new free list then
- * update the metapage. This is
- * unlogged so we cannot have the
- * metapage pointing at pages that
- * are not on disk.
- */
- dbp->sync(dbp, 0);
- meta->free = last_pgno;
- if ((ret = memp_fput(dbp->mpf,
- meta, DB_MPOOL_DIRTY)) != 0)
+ }
+
+ if (next != pgno) {
+ P_INIT(pagep, dbp->pgsize, pgno,
+ PGNO_INVALID, *lastp, 0, P_INVALID);
+ LSN(pagep) = LSN(meta);
+ *lastp = pgno;
+ }
+ } else {
+ P_INIT(pagep, dbp->pgsize, pgno,
+ PGNO_INVALID, *lastp, 0, P_INVALID);
+ if (dbc == NULL && (ret =
+ dbp->cursor(dbp, ctxn, &dbc, 0)) != 0)
goto err;
+ /*
+ * If the dbp is compensating (because we
+ * opened it), the dbc will automatically be
+ * marked compensating, but in case we didn't
+ * do the open, we have to mark it explicitly.
+ */
+ F_SET(dbc, DBC_COMPENSATE);
+ ret = __db_free(dbc, pagep);
+ put_page = 0;
+ /*
+ * On any error, we hope that the error was
+ * caused due to running out of space, and we
+ * switch modes, doing the processing where we
+ * sync out files instead of doing compensating
+ * transactions. If this was a real error and
+ * not out of space, we assume that some other
+ * call will fail real soon.
+ */
+ if (ret != 0) {
+ /* Assume that this is out of space. */
+ (void)dbc->c_close(dbc);
+ dbc = NULL;
+ goto err;
}
}
- if (dbc != NULL && (ret = dbc->c_close(dbc)) != 0)
- goto err;
- dbc = NULL;
}
- if (in_recover && (t_ret = dbp->close(dbp, 0)) != 0 && ret == 0)
- ret = t_ret;
- dbp = NULL;
- __os_free(elp->u.p.fname, 0);
- __os_free(elp->u.p.pgno_array, 0);
- if (ret == ENOENT)
- ret = 0;
- else if (ret != 0)
+
+ if (put_page == 1) {
+ ret = mpf->put(mpf, pagep, DB_MPOOL_DIRTY);
+ put_page = 0;
+ }
+ if (ret != 0)
goto err;
}
- if (txn != NULL) {
- ret = txn_commit(txn, 0);
- txn = NULL;
- }
-err:
- if (dbc != NULL)
- (void)dbc->c_close(dbc);
- if (in_recover && dbp != NULL)
- (void)dbp->close(dbp, 0);
- if (txn != NULL)
- (void)txn_abort(txn);
+err: if (put_page &&
+ (t_ret = mpf->put(mpf, pagep, DB_MPOOL_DIRTY)) != 0 && ret == 0)
+ ret = t_ret;
+ if (dbc != NULL && (t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
return (ret);
-
}
-#define DB_TXNLIST_MAX_PGNO 8 /* A nice even number. */
+#define DB_TXNLIST_MAX_PGNO 8 /* A nice even number. */
/*
* __db_txnlist_pgnoadd --
- * Find the txnlist entry for a file and add this pgno,
- * or add the list entry for the file and then add the pgno.
- *
- * PUBLIC: int __db_txnlist_pgnoadd __P((DB_ENV *, DB_TXNHEAD *,
- * PUBLIC: int32_t, u_int8_t [DB_FILE_ID_LEN], char *, db_pgno_t));
+ * Find the txnlist entry for a file and add this pgno, or add the list
+ * entry for the file and then add the pgno.
*/
-int
+static int
__db_txnlist_pgnoadd(dbenv, hp, fileid, uid, fname, pgno)
DB_ENV *dbenv;
DB_TXNHEAD *hp;
@@ -902,34 +1292,39 @@ __db_txnlist_pgnoadd(dbenv, hp, fileid, uid, fname, pgno)
db_pgno_t pgno;
{
DB_TXNLIST *elp;
+ u_int32_t hash;
int len, ret;
elp = NULL;
- if (__db_txnlist_find_internal(hp, TXNLIST_PGNO, 0, uid, &elp, 0) != 0) {
+ if (__db_txnlist_find_internal(dbenv, hp,
+ TXNLIST_PGNO, 0, uid, &elp, 0) != 0) {
if ((ret =
- __os_malloc(dbenv, sizeof(DB_TXNLIST), NULL, &elp)) != 0)
+ __os_malloc(dbenv, sizeof(DB_TXNLIST), &elp)) != 0)
goto err;
- LIST_INSERT_HEAD(&hp->head, elp, links);
+ memcpy(&hash, uid, sizeof(hash));
+ LIST_INSERT_HEAD(
+ &hp->head[DB_TXNLIST_MASK(hp, hash)], elp, links);
elp->u.p.fileid = fileid;
memcpy(elp->u.p.uid, uid, DB_FILE_ID_LEN);
- len = strlen(fname) + 1;
- if ((ret = __os_malloc(dbenv, len, NULL, &elp->u.p.fname)) != 0)
+ len = (int)strlen(fname) + 1;
+ if ((ret = __os_malloc(dbenv, len, &elp->u.p.fname)) != 0)
goto err;
memcpy(elp->u.p.fname, fname, len);
elp->u.p.maxentry = 0;
+ elp->u.p.locked = 0;
elp->type = TXNLIST_PGNO;
if ((ret = __os_malloc(dbenv,
- 8 * sizeof(db_pgno_t), NULL, &elp->u.p.pgno_array)) != 0)
+ 8 * sizeof(db_pgno_t), &elp->u.p.pgno_array)) != 0)
goto err;
elp->u.p.maxentry = DB_TXNLIST_MAX_PGNO;
elp->u.p.nentries = 0;
} else if (elp->u.p.nentries == elp->u.p.maxentry) {
elp->u.p.maxentry <<= 1;
if ((ret = __os_realloc(dbenv, elp->u.p.maxentry *
- sizeof(db_pgno_t), NULL, &elp->u.p.pgno_array)) != 0)
+ sizeof(db_pgno_t), &elp->u.p.pgno_array)) != 0)
goto err;
}
@@ -941,6 +1336,36 @@ err: __db_txnlist_end(dbenv, hp);
return (ret);
}
+/*
+ * __db_default_getpgnos --
+ * Fill in default getpgnos information for an application-specific
+ * log record.
+ */
+static int
+__db_default_getpgnos(dbenv, lsnp, summary)
+ DB_ENV *dbenv;
+ DB_LSN *lsnp;
+ void *summary;
+{
+ TXN_RECS *t;
+ int ret;
+
+ t = (TXN_RECS *)summary;
+
+ if ((ret = __rep_check_alloc(dbenv, t, 1)) != 0)
+ return (ret);
+
+ t->array[t->npages].flags = LSN_PAGE_NOLOCK;
+ t->array[t->npages].lsn = *lsnp;
+ t->array[t->npages].fid = DB_LOGFILEID_INVALID;
+ memset(&t->array[t->npages].pgdesc, 0,
+ sizeof(t->array[t->npages].pgdesc));
+
+ t->npages++;
+
+ return (0);
+}
+
#ifdef DEBUG
/*
* __db_txnlist_print --
@@ -954,25 +1379,21 @@ __db_txnlist_print(listp)
{
DB_TXNHEAD *hp;
DB_TXNLIST *p;
+ int i;
+ char *stats[] = { "ok", "commit", "prepare", "abort", "notfound",
+ "ignore", "expected", "unexpected" };
hp = (DB_TXNHEAD *)listp;
printf("Maxid: %lu Generation: %lu\n",
(u_long)hp->maxid, (u_long)hp->generation);
- for (p = LIST_FIRST(&hp->head); p != NULL; p = LIST_NEXT(p, links)) {
+ for (i = 0; i < hp->nslots; i++)
+ for (p = LIST_FIRST(&hp->head[i]); p != NULL; p = LIST_NEXT(p, links)) {
switch (p->type) {
case TXNLIST_TXNID:
- printf("TXNID: %lu(%lu)\n",
- (u_long)p->u.t.txnid, (u_long)p->u.t.generation);
- break;
- case TXNLIST_DELETE:
- printf("FILE: %s id=%d ops=%d %s %s\n",
- p->u.d.fname, p->u.d.fileid, p->u.d.count,
- F_ISSET(&p->u.d, TXNLIST_FLAG_DELETED) ?
- "(deleted)" : "(missing)",
- F_ISSET(&p->u.d, TXNLIST_FLAG_CLOSED) ?
- "(closed)" : "(open)");
-
+ printf("TXNID: %lx(%lu): %s\n",
+ (u_long)p->u.t.txnid, (u_long)p->u.t.generation,
+ stats[p->u.t.status]);
break;
default:
printf("Unrecognized type: %d\n", p->type);