Diffstat (limited to 'src/db/db_meta.c')
-rw-r--r--	src/db/db_meta.c	| 1428
1 file changed, 1428 insertions(+), 0 deletions(-)
diff --git a/src/db/db_meta.c b/src/db/db_meta.c
new file mode 100644
index 00000000..8f97ebd8
--- /dev/null
+++ b/src/db/db_meta.c
@@ -0,0 +1,1428 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/txn.h"
+#include "dbinc/db_am.h"
+#include "dbinc/hash.h"
+
+static void __db_init_meta __P((DB *, void *, db_pgno_t, u_int32_t));
+#ifdef HAVE_FTRUNCATE
+static int __db_pglistcmp __P((const void *, const void *));
+static int __db_truncate_freelist __P((DBC *, DBMETA *,
+ PAGE *, db_pgno_t *, u_int32_t, u_int32_t));
+#endif
+
+/*
+ * __db_init_meta --
+ * Helper function for __db_new that initializes the important fields in
+ * a meta-data page (used instead of P_INIT). We need to make sure that we
+ * retain the page number and LSN of the existing page.
+ */
+static void
+__db_init_meta(dbp, p, pgno, pgtype)
+ DB *dbp;
+ void *p;
+ db_pgno_t pgno;
+ u_int32_t pgtype;
+{
+ DBMETA *meta;
+ DB_LSN save_lsn;
+
+ meta = (DBMETA *)p;
+ save_lsn = meta->lsn;
+ memset(meta, 0, sizeof(DBMETA));
+ meta->lsn = save_lsn;
+ meta->pagesize = dbp->pgsize;
+ if (F_ISSET(dbp, DB_AM_CHKSUM))
+ FLD_SET(meta->metaflags, DBMETA_CHKSUM);
+ meta->pgno = pgno;
+ meta->type = (u_int8_t)pgtype;
+}
+
+/*
+ * __db_new --
+ * Get a new page, preferably from the freelist.
+ *
+ * PUBLIC: int __db_new __P((DBC *, u_int32_t, DB_LOCK *, PAGE **));
+ */
+int
+__db_new(dbc, type, lockp, pagepp)
+ DBC *dbc;
+ u_int32_t type;
+ DB_LOCK *lockp;
+ PAGE **pagepp;
+{
+ DB *dbp;
+ DBMETA *meta;
+ DB_LOCK metalock;
+ DB_LSN lsn;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h;
+ db_pgno_t last, *list, pgno, newnext;
+ int extend, hash, ret;
+
+ meta = NULL;
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ h = NULL;
+ newnext = PGNO_INVALID;
+ if (lockp != NULL)
+ LOCK_INIT(*lockp);
+
+ hash = 0;
+ ret = 0;
+ LOCK_INIT(metalock);
+
+#ifdef HAVE_HASH
+ if (dbp->type == DB_HASH) {
+ if ((ret = __ham_return_meta(dbc, DB_MPOOL_DIRTY, &meta)) != 0)
+ goto err;
+ if (meta != NULL)
+ hash = 1;
+ }
+#endif
+ if (meta == NULL) {
+ pgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc,
+ LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_DIRTY, &meta)) != 0)
+ goto err;
+ }
+
+ last = meta->last_pgno;
+ if (meta->free == PGNO_INVALID) {
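+		/*
+		 * The free list is empty.  If the caller set P_DONTEXTEND,
+		 * fail rather than grow the file; otherwise allocate the
+		 * page just past the current last page.
+		 */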
+ if (FLD_ISSET(type, P_DONTEXTEND)) {
+ *pagepp = NULL;
+ goto err;
+ }
+ last = pgno = meta->last_pgno + 1;
+ ZERO_LSN(lsn);
+ extend = 1;
+ } else {
+ pgno = meta->free;
+ /*
+ * Lock the new page. Do this here because we must do it
+ * before getting the page and the caller may need the lock
+ * to keep readers from seeing the page before the transaction
+ * commits. We can do this because no one will hold a free
+ * page locked.
+ */
+ if (lockp != NULL && (ret =
+ __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, lockp)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_DIRTY, &h)) != 0)
+ goto err;
+
+ /*
+ * We want to take the first page off the free list and
+	 * then set meta->free to that page's next_pgno, but
+ * we need to log the change first.
+ */
+ newnext = h->next_pgno;
+ lsn = h->lsn;
+ extend = 0;
+ DB_ASSERT(env, TYPE(h) == P_INVALID);
+
+ if (TYPE(h) != P_INVALID) {
+ __db_errx(env, DB_STR_A("0689",
+ "%s page %lu is on free list with type %lu",
+ "%s %lu %lu"), dbp->fname, (u_long)PGNO(h),
+ (u_long)TYPE(h));
+ return (__env_panic(env, EINVAL));
+ }
+
+ }
+
+ FLD_CLR(type, P_DONTEXTEND);
+
+ /*
+ * Log the allocation before fetching the new page. If we
+ * don't have room in the log then we don't want to tell
+ * mpool to extend the file.
+ */
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __db_pg_alloc_log(dbp, dbc->txn, &LSN(meta), 0,
+ &LSN(meta), PGNO_BASE_MD, &lsn,
+ pgno, (u_int32_t)type, newnext, meta->last_pgno)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(meta));
+
+ meta->free = newnext;
+
+ if (extend == 1) {
+ if (lockp != NULL && (ret =
+ __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, lockp)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_NEW, &h)) != 0)
+ goto err;
+ DB_ASSERT(env, last == pgno);
+ meta->last_pgno = pgno;
+ ZERO_LSN(h->lsn);
+ h->pgno = pgno;
+
+ /*
+ * If the file was extended for the first time in this
+ * transaction, set the MPOOLFILE's file extension
+ * watermark.
+ */
+ __txn_add_fe_watermark(dbc->txn, dbp, h->pgno);
+
+ }
+ LSN(h) = LSN(meta);
+
+ if (hash == 0 && (ret = __memp_fput(mpf,
+ dbc->thread_info, meta, dbc->priority)) != 0)
+ goto err;
+ meta = NULL;
+
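+	/*
+	 * Initialize the new page.  Metadata pages keep their existing
+	 * page number and LSN via __db_init_meta; all other page types
+	 * get a fresh P_INIT.
+	 */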
+ switch (type) {
+ case P_BTREEMETA:
+ case P_HASHMETA:
+ case P_QAMMETA:
+ __db_init_meta(dbp, h, h->pgno, type);
+ break;
+ default:
+ P_INIT(h, dbp->pgsize,
+ h->pgno, PGNO_INVALID, PGNO_INVALID, 0, type);
+ break;
+ }
+
+ /* Fix up the sorted free list if necessary. */
+#ifdef HAVE_FTRUNCATE
+ if (extend == 0) {
+ u_int32_t nelems = 0;
+
+ if ((ret = __memp_get_freelist(dbp->mpf, &nelems, &list)) != 0)
+ goto err;
+ if (nelems != 0) {
+ DB_ASSERT(env, h->pgno == list[0]);
+ memmove(list, &list[1], (nelems - 1) * sizeof(*list));
+ if ((ret = __memp_extend_freelist(
+ dbp->mpf, nelems - 1, &list)) != 0)
+ goto err;
+ }
+ }
+#else
+ COMPQUIET(list, NULL);
+#endif
+
+ if ((ret = __TLPUT(dbc, metalock)) != 0)
+ return (ret);
+ *pagepp = h;
+ PERFMON6(env, alloc, new, dbp->fname, dbp->dname, pgno, type, h, 0);
+ return (0);
+
+err: if (h != NULL)
+ (void)__memp_fput(mpf, dbc->thread_info, h, dbc->priority);
+ if (meta != NULL && hash == 0)
+ (void)__memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
+ (void)__TLPUT(dbc, metalock);
+ if (lockp != NULL)
+ (void)__LPUT(dbc, *lockp);
+ /* Failure return - report 0 pgno, null page address. */
+ PERFMON6(env, alloc, new, dbp->fname, dbp->dname, 0, type, NULL, ret);
+ return (ret);
+}
+
+/*
+ * __db_free --
+ * Add a page to the head of the freelist.
+ *
+ * PUBLIC: int __db_free __P((DBC *, PAGE *, u_int32_t));
+ */
+int
+__db_free(dbc, h, flags)
+ DBC *dbc;
+ PAGE *h;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBMETA *meta;
+ DBT ddbt, ldbt;
+ DB_LOCK metalock;
+ DB_LSN *lsnp;
+ DB_MPOOLFILE *mpf;
+ PAGE *prev;
+ db_pgno_t last_pgno, next_pgno, pgno, prev_pgno;
+ u_int32_t lflag;
+ int hash, ret, t_ret;
+#ifdef HAVE_FTRUNCATE
+ db_pgno_t *list, *lp;
+ u_int32_t nelem, position, start;
+ int do_truncate;
+#endif
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ prev_pgno = PGNO_INVALID;
+ meta = NULL;
+ prev = NULL;
+ LOCK_INIT(metalock);
+#ifdef HAVE_FTRUNCATE
+ lp = NULL;
+ nelem = 0;
+ do_truncate = 0;
+#endif
+
+	/*
+	 * Retrieve the metadata page.  If we are not keeping a sorted
+	 * free list, put the page at the head of the free list.  If we
+	 * are keeping a sorted free list (for truncation), figure out
+	 * where this page belongs and either link it in or truncate the
+	 * file as much as possible.  If either the lock get or the page
+	 * get routine fails, we must put back the page we were called
+	 * with, because our caller assumes we take care of it.
+	 */
+ hash = 0;
+
+ pgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc,
+ LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+
+#ifdef HAVE_HASH
+ if (dbp->type == DB_HASH) {
+ if ((ret = __ham_return_meta(dbc,
+#ifdef HAVE_FTRUNCATE
+ 0,
+#else
+ DB_MPOOL_DIRTY,
+#endif
+ &meta)) != 0)
+ goto err;
+ if (meta != NULL)
+ hash = 1;
+ }
+#endif
+ if (meta == NULL) {
+ /* If we support truncate, we might not dirty the meta page. */
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+#ifdef HAVE_FTRUNCATE
+ 0,
+#else
+ DB_MPOOL_DIRTY,
+#endif
+ &meta)) != 0)
+ goto err1;
+ }
+
+ last_pgno = meta->last_pgno;
+ next_pgno = meta->free;
+ /*
+	 * Assign lsnp here so it is always initialized when
+ * HAVE_FTRUNCATE is not defined.
+ */
+ lsnp = &LSN(meta);
+
+ DB_ASSERT(dbp->env, h->pgno != next_pgno);
+
+#ifdef HAVE_FTRUNCATE
+ /*
+	 * If we are maintaining a sorted free list, see whether we have
+	 * a new truncation point or the page belongs somewhere in the
+	 * middle of the list.  If it goes in the middle of the list, we
+	 * will drop the meta page and get the previous page.
+ */
+ COMPQUIET(position, 0);
+ if ((ret = __memp_get_freelist(mpf, &nelem, &list)) != 0)
+ goto err1;
+ if (list == NULL)
+ goto no_sort;
+
+ if (h->pgno != last_pgno) {
+ /*
+ * Put the page number in the sorted list. Find its
+ * position and the previous page. After logging we
+ * will extend the list, make room and insert the page in
+ * the list.
+ */
+ position = 0;
+ if (nelem != 0) {
+ __db_freelist_pos(h->pgno, list, nelem, &position);
+
+ DB_ASSERT(dbp->env, h->pgno != list[position]);
+
+ /* Get the previous page if this is not the smallest. */
+ if (position != 0 || h->pgno > list[0])
+ prev_pgno = list[position];
+ }
+
+ } else if (nelem != 0) {
+ /* Find the truncation point. */
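+		/*
+		 * Scan backward from the end of the sorted list: each
+		 * entry matching the next-lower page number extends the
+		 * run of free pages contiguous with the last page, and
+		 * the scan stops at the first gap.
+		 */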
+ for (lp = &list[nelem - 1]; lp >= list; lp--)
+ if (--last_pgno != *lp)
+ break;
+ if (lp < list || last_pgno < h->pgno - 1)
+ do_truncate = 1;
+ last_pgno = meta->last_pgno;
+ }
+
+no_sort:
+ if (prev_pgno == PGNO_INVALID) {
+#ifdef HAVE_HASH
+ if (hash) {
+ if ((ret =
+ __ham_return_meta(dbc, DB_MPOOL_DIRTY, &meta)) != 0)
+ goto err1;
+ } else
+#endif
+ if ((ret = __memp_dirty(mpf,
+ &meta, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err1;
+ lsnp = &LSN(meta);
+ } else {
+ pgno = prev_pgno;
+ if ((ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &prev)) != 0)
+ goto err1;
+ next_pgno = NEXT_PGNO(prev);
+ lsnp = &LSN(prev);
+ }
+#endif
+
+ /*
+ * Log the change.
+ * We are either logging an update to the metapage or to the
+ * previous page in the sorted list.
+ */
+ if (DBC_LOGGING(dbc)) {
+ memset(&ldbt, 0, sizeof(ldbt));
+ ldbt.data = h;
+ ldbt.size = P_OVERHEAD(dbp);
+ /*
+ * If we are removing pages from the file, we need to make
+ * sure the logging happens before the truncation. If we
+ * are truncating multiple pages we don't need to flush the
+ * log here as it will be flushed by __db_truncate_freelist.
+ */
+ lflag = 0;
+
+#ifdef HAVE_FTRUNCATE
+ if (h->pgno == last_pgno && do_truncate == 0)
+ lflag = DB_FLUSH;
+#endif
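+		/*
+		 * For btree, hash and recno data pages we may also log the
+		 * page contents (__db_pg_freedata_log below) so recovery
+		 * can restore them; DB_LOG_NO_DATA lets us skip the data
+		 * unless this is the last page in the file.
+		 */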
+ switch (h->type) {
+ case P_HASH:
+ case P_IBTREE:
+ case P_IRECNO:
+ case P_LBTREE:
+ case P_LRECNO:
+ case P_LDUP:
+ if (h->entries > 0 && (h->pgno == last_pgno ||
+ !LF_ISSET(DB_LOG_NO_DATA))) {
+ ldbt.size += h->entries * sizeof(db_indx_t);
+ ddbt.data = (u_int8_t *)h + HOFFSET(h);
+ ddbt.size = dbp->pgsize - HOFFSET(h);
+ if ((ret = __db_pg_freedata_log(dbp, dbc->txn,
+ lsnp, lflag,
+ h->pgno, lsnp, pgno,
+ &ldbt, next_pgno, last_pgno, &ddbt)) != 0)
+ goto err1;
+ goto logged;
+ }
+ break;
+ case P_HASHMETA:
+ ldbt.size = sizeof(HMETA);
+ break;
+ case P_BTREEMETA:
+ ldbt.size = sizeof(BTMETA);
+ break;
+ case P_OVERFLOW:
+ ldbt.size += OV_LEN(h);
+ break;
+ default:
+ DB_ASSERT(dbp->env, h->type != P_QAMDATA);
+ }
+
+ if ((ret = __db_pg_free_log(dbp,
+ dbc->txn, lsnp, lflag, h->pgno,
+ lsnp, pgno, &ldbt, next_pgno, last_pgno)) != 0)
+ goto err1;
+ } else
+ LSN_NOT_LOGGED(*lsnp);
+
+logged:
+#ifdef HAVE_FTRUNCATE
+ if (do_truncate) {
+ start = (u_int32_t) (lp - list) + 1;
+ meta->last_pgno--;
+ ret = __db_truncate_freelist(
+ dbc, meta, h, list, start, nelem);
+ h = NULL;
+ } else if (h->pgno == last_pgno) {
+ /*
+ * We are going to throw this page away, but if we are
+ * using MVCC then this version may stick around and we
+ * might have to make a copy.
+ */
+ if (atomic_read(&mpf->mfp->multiversion) &&
+ (ret = __memp_dirty(mpf,
+ &h, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err1;
+ LSN(h) = *lsnp;
+ P_INIT(h, dbp->pgsize,
+ h->pgno, PGNO_INVALID, next_pgno, 0, P_INVALID);
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, DB_PRIORITY_VERY_LOW)) != 0)
+ goto err1;
+ h = NULL;
+ /* Give the page back to the OS. */
+ if ((ret = __memp_ftruncate(mpf, dbc->txn, dbc->thread_info,
+ last_pgno, 0)) != 0)
+ goto err1;
+ DB_ASSERT(dbp->env, meta->pgno == PGNO_BASE_MD);
+ meta->last_pgno--;
+ } else {
+ if (list != NULL) {
+ /* Put the page number into the list. */
+ if ((ret =
+ __memp_extend_freelist(mpf, nelem + 1, &list)) != 0)
+ goto err1;
+ if (prev_pgno != PGNO_INVALID)
+ lp = &list[position + 1];
+ else
+ lp = list;
+ if (nelem != 0 && position != nelem)
+ memmove(lp + 1, lp, (size_t)
+ ((u_int8_t*)&list[nelem] - (u_int8_t*)lp));
+ *lp = h->pgno;
+ }
+#else
+ {
+#endif
+ /*
+ * If we are not truncating the page then we
+ * reinitialize it and put it at the head of
+ * the free list.
+ */
+ if ((ret = __memp_dirty(mpf,
+ &h, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err1;
+ LSN(h) = *lsnp;
+ P_INIT(h, dbp->pgsize,
+ h->pgno, PGNO_INVALID, next_pgno, 0, P_INVALID);
+#ifdef DIAGNOSTIC
+ memset((u_int8_t *) h + P_OVERHEAD(dbp),
+ CLEAR_BYTE, dbp->pgsize - P_OVERHEAD(dbp));
+#endif
+ if (prev_pgno == PGNO_INVALID)
+ meta->free = h->pgno;
+ else
+ NEXT_PGNO(prev) = h->pgno;
+ }
+
+ /* Discard the metadata or previous page. */
+err1: if (hash == 0 && meta != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, (PAGE *)meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (prev != (PAGE*) meta && prev != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, prev, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Discard the caller's page reference. */
+err: if (h != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ PERFMON4(dbp->env, alloc, free, dbp->fname, dbp->dname, pgno, ret);
+ /*
+ * XXX
+ * We have to unlock the caller's page in the caller!
+ */
+ return (ret);
+}
+
+#ifdef HAVE_FTRUNCATE
+/*
+ * __db_freelist_pos -- find the position of a page in the freelist.
+ *	The list is sorted; we do a binary search.
+ *
+ * PUBLIC: #ifdef HAVE_FTRUNCATE
+ * PUBLIC: void __db_freelist_pos __P((db_pgno_t,
+ * PUBLIC: db_pgno_t *, u_int32_t, u_int32_t *));
+ * PUBLIC: #endif
+ */
+void
+__db_freelist_pos(pgno, list, nelem, posp)
+ db_pgno_t pgno;
+ db_pgno_t *list;
+ u_int32_t nelem;
+ u_int32_t *posp;
+{
+ u_int32_t base, indx, lim;
+
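+	/*
+	 * Standard binary search, except that when pgno is not in the
+	 * list we fall through with base at the insertion point; the
+	 * adjustment below backs up one slot so that *posp names the
+	 * largest entry less than pgno (or slot 0 when pgno sorts
+	 * first, a case the caller checks for).  For example
+	 * (hypothetical values): with list = {3, 7, 9}, pgno 8 yields
+	 * position 1 (page 7) and pgno 2 yields position 0.
+	 */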
+ indx = 0;
+ for (base = 0, lim = nelem; lim != 0; lim >>= 1) {
+ indx = base + (lim >> 1);
+ if (pgno == list[indx]) {
+ *posp = indx;
+ return;
+ }
+ if (pgno > list[indx]) {
+ base = indx + 1;
+ --lim;
+ }
+ }
+ if (base != 0)
+ base--;
+ *posp = base;
+ return;
+}
+
+static int
+__db_pglistcmp(a, b)
+ const void *a, *b;
+{
+ db_pglist_t *ap, *bp;
+
+ ap = (db_pglist_t *)a;
+ bp = (db_pglist_t *)b;
+
+ return ((ap->pgno > bp->pgno) ? 1 : (ap->pgno < bp->pgno) ? -1: 0);
+}
+
+/*
+ * __db_freelist_sort -- sort a list of free pages.
+ * PUBLIC: void __db_freelist_sort __P((db_pglist_t *, u_int32_t));
+ */
+void
+__db_freelist_sort(list, nelems)
+ db_pglist_t *list;
+ u_int32_t nelems;
+{
+ qsort(list, (size_t)nelems, sizeof(db_pglist_t), __db_pglistcmp);
+}
+
+/*
+ * __db_pg_truncate -- find the truncation point in a sorted freelist.
+ *
+ * PUBLIC: #ifdef HAVE_FTRUNCATE
+ * PUBLIC: int __db_pg_truncate __P((DBC *, DB_TXN *,
+ * PUBLIC: db_pglist_t *, DB_COMPACT *, u_int32_t *,
+ * PUBLIC: db_pgno_t , db_pgno_t *, DB_LSN *, int));
+ * PUBLIC: #endif
+ */
+int
+__db_pg_truncate(dbc, txn,
+ list, c_data, nelemp, free_pgno, last_pgno, lsnp, in_recovery)
+ DBC *dbc;
+ DB_TXN *txn;
+ db_pglist_t *list;
+ DB_COMPACT *c_data;
+ u_int32_t *nelemp;
+ db_pgno_t free_pgno, *last_pgno;
+ DB_LSN *lsnp;
+ int in_recovery;
+{
+ DB *dbp;
+ DBT ddbt;
+ DB_LSN null_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *h;
+ db_pglist_t *lp, *slp;
+ db_pgno_t lpgno, pgno;
+ u_int32_t elems, log_size, tpoint;
+ int last, ret;
+
+ ret = 0;
+ h = NULL;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ elems = tpoint = *nelemp;
+
+ /*
+ * Figure out what (if any) pages can be truncated immediately and
+ * record the place from which we can truncate, so we can do the
+ * memp_ftruncate below. We also use this to avoid ever putting
+ * these pages on the freelist, which we are about to relink.
+ */
+ pgno = *last_pgno;
+ lp = &list[elems - 1];
+ last = 1;
+ while (tpoint != 0) {
+ if (lp->pgno != pgno)
+ break;
+ pgno--;
+ tpoint--;
+ lp--;
+ }
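+	/*
+	 * On exit, list[tpoint..elems - 1] holds the free pages
+	 * contiguous with the end of the file and pgno is the last
+	 * page number the file will keep after truncation.
+	 */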
+
+ lp = list;
+ slp = &list[elems];
+ /*
+ * Log the sorted list. We log the whole list so it can be rebuilt.
+ * Don't overflow the log file.
+ */
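+	/*
+	 * Each pass below logs at most half a log file's worth of list
+	 * entries; when the list is split, "last" is cleared and we come
+	 * back around for the remaining segment.
+	 */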
+again: if (DBC_LOGGING(dbc)) {
+ last = 1;
+ lpgno = *last_pgno;
+ ddbt.size = elems * sizeof(*lp);
+ ddbt.data = lp;
+ log_size = ((LOG *)dbc->env->
+ lg_handle->reginfo.primary)->log_size;
+ if (ddbt.size > log_size / 2) {
+ elems = (log_size / 2) / sizeof(*lp);
+ ddbt.size = elems * sizeof(*lp);
+ last = 0;
+ /*
+ * If we stopped after the truncation point
+ * then we need to truncate from here.
+ */
+ if (lp + elems >= &list[tpoint])
+ lpgno = lp[elems - 1].pgno;
+ }
+ /*
+		 * If this is not the beginning of the list, fetch the end
+ * of the previous segment. This page becomes the last_free
+ * page and will link to this segment if it is not truncated.
+ */
+ if (lp != list) {
+ if ((ret = __memp_fget(mpf, &lp[-1].pgno,
+ dbc->thread_info, txn, 0, &h)) != 0)
+ goto err;
+ }
+
+ slp = &lp[elems];
+
+ ZERO_LSN(null_lsn);
+ if ((ret = __db_pg_trunc_log(dbp, dbc->txn,
+ lsnp, last == 1 ? DB_FLUSH : 0, PGNO_BASE_MD,
+ lsnp, h != NULL ? PGNO(h) : PGNO_INVALID,
+ h != NULL ? &LSN(h) : &null_lsn,
+ free_pgno, lpgno, &ddbt)) != 0)
+ goto err;
+ if (h != NULL) {
+ LSN(h) = *lsnp;
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0)
+ goto err;
+ }
+ h = NULL;
+ } else if (!in_recovery)
+ LSN_NOT_LOGGED(*lsnp);
+
+ for (; lp < slp && lp < &list[tpoint]; lp++) {
+ if ((ret = __memp_fget(mpf, &lp->pgno, dbc->thread_info,
+ txn, !in_recovery ? DB_MPOOL_DIRTY : 0, &h)) != 0) {
+ /* Page may have been truncated later. */
+ if (in_recovery && ret == DB_PAGE_NOTFOUND) {
+ ret = 0;
+ continue;
+ }
+ goto err;
+ }
+ if (in_recovery) {
+ if (LOG_COMPARE(&LSN(h), &lp->lsn) == 0) {
+ if ((ret = __memp_dirty(mpf, &h,
+ dbc->thread_info,
+ txn, dbp->priority, 0)) != 0) {
+ (void)__memp_fput(mpf,
+ dbc->thread_info, h, dbp->priority);
+ goto err;
+ }
+ } else
+ goto skip;
+ }
+
+ if (lp == &list[tpoint - 1])
+ NEXT_PGNO(h) = PGNO_INVALID;
+ else
+ NEXT_PGNO(h) = lp[1].pgno;
+ DB_ASSERT(mpf->env, NEXT_PGNO(h) < *last_pgno);
+
+ LSN(h) = *lsnp;
+skip: if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbp->priority)) != 0)
+ goto err;
+ h = NULL;
+ }
+
+ /*
+	 * If we did not log everything, try again.  We start from slp and
+ * try to go to the end of the list.
+ */
+ if (last == 0) {
+ elems = (u_int32_t)(&list[*nelemp] - slp);
+ lp = slp;
+ goto again;
+ }
+
+	/*
+	 * Truncate the file.  It's possible that the last page is the
+	 * only one that got truncated, and that is done in the caller.
+	 */
+ if (pgno != *last_pgno) {
+ if (tpoint != *nelemp &&
+ (ret = __memp_ftruncate(mpf, dbc->txn, dbc->thread_info,
+ pgno + 1, in_recovery ? MP_TRUNC_RECOVER : 0)) != 0)
+ goto err;
+ if (c_data)
+ c_data->compact_pages_truncated += *last_pgno - pgno;
+ *last_pgno = pgno;
+ }
+ *nelemp = tpoint;
+
+ if (0) {
+err: if (h != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority);
+ }
+ return (ret);
+}
+
+/*
+ * __db_free_truncate --
+ * Build a sorted free list and truncate free pages at the end
+ * of the file.
+ *
+ * PUBLIC: #ifdef HAVE_FTRUNCATE
+ * PUBLIC: int __db_free_truncate __P((DB *, DB_THREAD_INFO *, DB_TXN *,
+ * PUBLIC: u_int32_t, DB_COMPACT *, db_pglist_t **, u_int32_t *,
+ * PUBLIC: db_pgno_t *));
+ * PUBLIC: #endif
+ */
+int
+__db_free_truncate(dbp, ip, txn, flags, c_data, listp, nelemp, last_pgnop)
+ DB *dbp;
+ DB_THREAD_INFO *ip;
+ DB_TXN *txn;
+ u_int32_t flags;
+ DB_COMPACT *c_data;
+ db_pglist_t **listp;
+ u_int32_t *nelemp;
+ db_pgno_t *last_pgnop;
+{
+ DBC *dbc;
+ DBMETA *meta;
+ DB_LOCK metalock;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ PAGE *h;
+ db_pglist_t *list, *lp;
+ db_pgno_t pgno;
+ u_int32_t nelems;
+ int ret, t_ret;
+ size_t size;
+
+ COMPQUIET(flags, 0);
+ list = NULL;
+ meta = NULL;
+ env = dbp->env;
+ mpf = dbp->mpf;
+ h = NULL;
+ nelems = 0;
+ if (listp != NULL) {
+ *listp = NULL;
+ DB_ASSERT(env, nelemp != NULL);
+ *nelemp = 0;
+ }
+
+ if ((ret = __db_cursor(dbp, ip, txn, &dbc, DB_WRITELOCK)) != 0)
+ return (ret);
+
+ pgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc,
+ LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn, 0,
+ &meta)) != 0)
+ goto err;
+
+ if (last_pgnop != NULL)
+ *last_pgnop = meta->last_pgno;
+ if ((pgno = meta->free) == PGNO_INVALID)
+ goto done;
+
+ size = 128;
+ if ((ret = __os_malloc(env, size * sizeof(*list), &list)) != 0)
+ goto err;
+ lp = list;
+
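+	/*
+	 * Walk the free-list chain from meta->free, recording each
+	 * page's number, next pointer and LSN; the array is doubled
+	 * whenever it fills.
+	 */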
+ do {
+ if (lp == &list[size]) {
+ size *= 2;
+ if ((ret = __os_realloc(env,
+ size * sizeof(*list), &list)) != 0)
+ goto err;
+ lp = &list[size / 2];
+ }
+ if ((ret = __memp_fget(mpf, &pgno,
+ dbc->thread_info, dbc->txn, 0, &h)) != 0)
+ goto err;
+
+ lp->pgno = pgno;
+ lp->next_pgno = NEXT_PGNO(h);
+ lp->lsn = LSN(h);
+ pgno = NEXT_PGNO(h);
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0)
+ goto err;
+ lp++;
+ } while (pgno != PGNO_INVALID);
+ nelems = (u_int32_t)(lp - list);
+
+ if ((ret = __memp_dirty(mpf,
+ &meta, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto err;
+
+	/* Sort the list. */
+ __db_freelist_sort(list, nelems);
+
+ if ((ret = __db_pg_truncate(dbc, txn, list, c_data,
+ &nelems, meta->free, &meta->last_pgno, &LSN(meta), 0)) != 0)
+ goto err;
+
+ if (nelems == 0)
+ meta->free = PGNO_INVALID;
+ else
+ meta->free = list[0].pgno;
+
+done: if (last_pgnop != NULL)
+ *last_pgnop = meta->last_pgno;
+
+	/*
+	 * The truncate point is the number of pages in the free
+	 * list back from the last page.  The number of pages in
+	 * the free list is the number that we can swap in.  Adjust
+	 * it down slightly so that if we find higher-numbered pages
+	 * early and then free other pages later, we can still
+	 * truncate them.
+	 */
+ if (c_data) {
+ c_data->compact_truncate = (u_int32_t)meta->last_pgno - nelems;
+ if (c_data->compact_truncate > nelems >> 2)
+ c_data->compact_truncate -= nelems >> 2;
+ }
+
+ if (nelems != 0 && listp != NULL) {
+ *listp = list;
+ *nelemp = nelems;
+ list = NULL;
+ }
+
+err: if (list != NULL)
+ __os_free(env, list);
+ if (meta != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, (PAGE *)meta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+static int
+__db_truncate_freelist(dbc, meta, h, list, start, nelem)
+ DBC *dbc;
+ DBMETA *meta;
+ PAGE *h;
+ db_pgno_t *list;
+ u_int32_t start, nelem;
+{
+ DB *dbp;
+ DBT ddbt;
+ DB_LSN null_lsn;
+ DB_MPOOLFILE *mpf;
+ PAGE *last_free, *pg;
+ db_pgno_t *lp, free_pgno, lpgno;
+ db_pglist_t *plist, *pp, *spp;
+ u_int32_t elem, log_size;
+ int last, ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ plist = NULL;
+ last_free = NULL;
+ pg = NULL;
+
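+	/*
+	 * If the truncation does not start at the head of the free
+	 * list, fetch the page just before the truncation point: it
+	 * becomes the new tail of the list and its next pointer is
+	 * cleared below.
+	 */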
+ if (start != 0 &&
+ (ret = __memp_fget(mpf, &list[start - 1],
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &last_free)) != 0)
+ goto err;
+
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __os_malloc(dbp->env,
+ (nelem - start) * sizeof(*pp), &plist)) != 0)
+ goto err;
+
+ pp = plist;
+ for (lp = &list[start]; lp < &list[nelem]; lp++) {
+ pp->pgno = *lp;
+ if ((ret = __memp_fget(mpf, lp,
+ dbc->thread_info, dbc->txn, 0, &pg)) != 0)
+ goto err;
+ pp->lsn = LSN(pg);
+ pp->next_pgno = NEXT_PGNO(pg);
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, pg, DB_PRIORITY_VERY_LOW)) != 0)
+ goto err;
+ pg = NULL;
+ pp++;
+ }
+ ZERO_LSN(null_lsn);
+ pp = plist;
+ elem = nelem - start;
+ log_size = ((LOG *)dbc->env->
+ lg_handle->reginfo.primary)->log_size;
+again: ddbt.data = spp = pp;
+ free_pgno = pp->pgno;
+ lpgno = meta->last_pgno;
+ ddbt.size = elem * sizeof(*pp);
+ if (ddbt.size > log_size / 2) {
+ elem = (log_size / 2) / (u_int32_t)sizeof(*pp);
+ ddbt.size = elem * sizeof(*pp);
+ pp += elem;
+ elem = (nelem - start) - (u_int32_t)(pp - plist);
+ lpgno = pp[-1].pgno;
+ last = 0;
+ } else
+ last = 1;
+ /*
+ * Get the page which will link to this section if we abort.
+		 * If this is the first segment, it's last_free.
+ */
+ if (spp == plist)
+ pg = last_free;
+ else if ((ret = __memp_fget(mpf, &spp[-1].pgno,
+ dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &pg)) != 0)
+ goto err;
+
+ if ((ret = __db_pg_trunc_log(dbp, dbc->txn,
+ &LSN(meta), last == 1 ? DB_FLUSH : 0,
+ PGNO(meta), &LSN(meta),
+ pg != NULL ? PGNO(pg) : PGNO_INVALID,
+ pg != NULL ? &LSN(pg) : &null_lsn,
+ free_pgno, lpgno, &ddbt)) != 0)
+ goto err;
+ if (pg != NULL) {
+ LSN(pg) = LSN(meta);
+ if (pg != last_free && (ret = __memp_fput(mpf,
+ dbc->thread_info, pg, DB_PRIORITY_VERY_LOW)) != 0)
+ goto err;
+ pg = NULL;
+ }
+ if (last == 0)
+ goto again;
+ } else
+ LSN_NOT_LOGGED(LSN(meta));
+
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, h, DB_PRIORITY_VERY_LOW)) != 0)
+ goto err;
+ h = NULL;
+ if ((ret = __memp_ftruncate(mpf, dbc->txn, dbc->thread_info,
+ list[start], 0)) != 0)
+ goto err;
+ meta->last_pgno = list[start] - 1;
+
+ if (start == 0)
+ meta->free = PGNO_INVALID;
+ else {
+ NEXT_PGNO(last_free) = PGNO_INVALID;
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, last_free, dbc->priority)) != 0)
+ goto err;
+ last_free = NULL;
+ }
+
+ /* Shrink the number of elements in the list. */
+ ret = __memp_extend_freelist(mpf, start, &list);
+
+err: if (plist != NULL)
+ __os_free(dbp->env, plist);
+
+ /* We need to put the page on error. */
+ if (h != NULL)
+ (void)__memp_fput(mpf, dbc->thread_info, h, dbc->priority);
+ if (pg != NULL && pg != last_free)
+ (void)__memp_fput(mpf, dbc->thread_info, pg, dbc->priority);
+ if (last_free != NULL)
+ (void)__memp_fput(mpf,
+ dbc->thread_info, last_free, dbc->priority);
+
+ return (ret);
+}
+#endif
+
+#ifdef DEBUG
+/*
+ * __db_lprint --
+ * Print out the list of locks currently held by a cursor.
+ *
+ * PUBLIC: int __db_lprint __P((DBC *));
+ */
+int
+__db_lprint(dbc)
+ DBC *dbc;
+{
+ DB *dbp;
+ DB_LOCKREQ req;
+ ENV *env;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+
+ if (LOCKING_ON(env)) {
+ req.op = DB_LOCK_DUMP;
+ (void)__lock_vec(env, dbc->locker, 0, &req, 1, NULL);
+ }
+ return (0);
+}
+#endif
+
+/*
+ * __db_lget --
+ * The standard lock get call.
+ *
+ * PUBLIC: int __db_lget __P((DBC *,
+ * PUBLIC: int, db_pgno_t, db_lockmode_t, u_int32_t, DB_LOCK *));
+ */
+int
+__db_lget(dbc, action, pgno, mode, lkflags, lockp)
+ DBC *dbc;
+ int action;
+ db_pgno_t pgno;
+ db_lockmode_t mode;
+ u_int32_t lkflags;
+ DB_LOCK *lockp;
+{
+ DB *dbp;
+ DB_LOCKREQ couple[3], *reqp;
+ DB_TXN *txn;
+ ENV *env;
+ int has_timeout, i, ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ txn = dbc->txn;
+
+	/*
+	 * We do not always check whether locking is configured before
+	 * calling __db_lget, so check here and return a no-op lock when
+	 * locking does not apply: CDB locking, locking disabled, a
+	 * snapshot read under MVCC, a don't-lock or recovering cursor,
+	 * or an off-page duplicate cursor (unless LCK_ALWAYS).
+	 */
+ if (CDB_LOCKING(env) || !LOCKING_ON(env) ||
+ (MULTIVERSION(dbp) && mode == DB_LOCK_READ &&
+ dbc->txn != NULL && F_ISSET(dbc->txn, TXN_SNAPSHOT)) ||
+ F_ISSET(dbc, DBC_DONTLOCK) || (F_ISSET(dbc, DBC_RECOVER) &&
+ (action != LCK_ROLLBACK || IS_REP_CLIENT(env))) ||
+ (action != LCK_ALWAYS && F_ISSET(dbc, DBC_OPD))) {
+ LOCK_INIT(*lockp);
+ return (0);
+ }
+
+ /*
+ * If the transaction enclosing this cursor has DB_LOCK_NOWAIT set,
+ * pass that along to the lock call.
+ */
+ if (DB_NONBLOCK(dbc))
+ lkflags |= DB_LOCK_NOWAIT;
+
+ /*
+ * If we're trying to run in exclusive mode, attempt to get an
+ * exclusive database lock. If it is not available then wait
+ * for the lock on the database and clear the exclusive bit.
+ *
+ * If we get an exclusive lock on the database, mark the cursor
+ * with DBC_DONTLOCK to avoid any further locking.
+ */
+ if (F_ISSET(dbp->mpf->mfp, MP_DATABASE_LOCKING)) {
+ dbc->lock.type = DB_DATABASE_LOCK;
+ dbc->lock.pgno = PGNO_BASE_MD;
+ if ((ret = __lock_get(env, dbc->locker, DB_LOCK_NOWAIT,
+ &dbc->lock_dbt, F_ISSET(dbp, DB_AM_RDONLY) ?
+ DB_LOCK_READ : DB_LOCK_WRITE, lockp)) == 0) {
+ if (F_ISSET(dbp->mpf->mfp, MP_DATABASE_LOCKING)) {
+ F_SET(dbc, DBC_DONTLOCK);
+ if (!IS_REAL_TXN(txn))
+ dbc->mylock = *lockp;
+ LOCK_INIT(*lockp);
+ return (0);
+ }
+ } else if (ret == DB_LOCK_NOTGRANTED &&
+ (lkflags & DB_LOCK_NOWAIT) == 0) {
+ if ((ret = __lock_get(env, dbc->locker, 0,
+ &dbc->lock_dbt, DB_LOCK_WRITE, lockp)) != 0)
+ return (ret);
+ F_CLR(dbp->mpf->mfp, MP_DATABASE_LOCKING);
+ if ((ret = __lock_put(env, lockp)) != 0)
+ return (ret);
+ LOCK_INIT(*lockp);
+ } else if (ret != 0)
+ return (ret);
+ }
+
+ dbc->lock.pgno = pgno;
+ if (lkflags & DB_LOCK_RECORD)
+ dbc->lock.type = DB_RECORD_LOCK;
+ else
+ dbc->lock.type = DB_PAGE_LOCK;
+ lkflags &= ~DB_LOCK_RECORD;
+
+ if (F_ISSET(dbc, DBC_READ_UNCOMMITTED) && mode == DB_LOCK_READ)
+ mode = DB_LOCK_READ_UNCOMMITTED;
+
+ has_timeout = F_ISSET(dbc, DBC_RECOVER) ||
+ (txn != NULL && F_ISSET(txn, TXN_LOCKTIMEOUT));
+
+ /*
+ * Transactional locking.
+ * Hold on to the previous read lock only if we are in full isolation.
+ * COUPLE_ALWAYS indicates we are holding an interior node which need
+ * not be isolated.
+ * Downgrade write locks if we are supporting dirty readers and the
+ * update did not have an error.
+ */
+ if ((action != LCK_COUPLE && action != LCK_COUPLE_ALWAYS) ||
+ !LOCK_ISSET(*lockp))
+ action = 0;
+ else if (dbc->txn == NULL || action == LCK_COUPLE_ALWAYS)
+ action = LCK_COUPLE;
+ else if (F_ISSET(dbc, DBC_READ_COMMITTED | DBC_WAS_READ_COMMITTED) &&
+ lockp->mode == DB_LOCK_READ)
+ action = LCK_COUPLE;
+ else if (lockp->mode == DB_LOCK_READ_UNCOMMITTED)
+ action = LCK_COUPLE;
+ else if (F_ISSET(dbc->dbp, DB_AM_READ_UNCOMMITTED) &&
+ !F_ISSET(dbc, DBC_ERROR) && lockp->mode == DB_LOCK_WRITE)
+ action = LCK_DOWNGRADE;
+ else
+ action = 0;
+
+ i = 0;
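+	/*
+	 * Build couple[] as a request vector: for LCK_DOWNGRADE, first a
+	 * re-get of the held lock in was-written (WWRITE) mode, then the
+	 * get of the new lock, then a put of the old lock when coupling.
+	 * __lock_vec executes the requests in order in a single call.
+	 */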
+ switch (action) {
+ default:
+ if (has_timeout)
+ goto do_couple;
+ ret = __lock_get(env,
+ dbc->locker, lkflags, &dbc->lock_dbt, mode, lockp);
+ break;
+
+ case LCK_DOWNGRADE:
+ couple[0].op = DB_LOCK_GET;
+ couple[0].obj = NULL;
+ couple[0].lock = *lockp;
+ couple[0].mode = DB_LOCK_WWRITE;
+ UMRW_SET(couple[0].timeout);
+ i++;
+ /* FALLTHROUGH */
+ case LCK_COUPLE:
+do_couple: couple[i].op = has_timeout? DB_LOCK_GET_TIMEOUT : DB_LOCK_GET;
+ couple[i].obj = &dbc->lock_dbt;
+ couple[i].mode = mode;
+ UMRW_SET(couple[i].timeout);
+ i++;
+ if (has_timeout)
+ couple[0].timeout =
+ F_ISSET(dbc, DBC_RECOVER) ? 0 : txn->lock_timeout;
+ if (action == LCK_COUPLE || action == LCK_DOWNGRADE) {
+ couple[i].op = DB_LOCK_PUT;
+ couple[i].lock = *lockp;
+ i++;
+ }
+
+ ret = __lock_vec(env,
+ dbc->locker, lkflags, couple, i, &reqp);
+ if (ret == 0 || reqp == &couple[i - 1])
+ *lockp = i == 1 ? couple[0].lock : couple[i - 2].lock;
+ break;
+ }
+
+ if (txn != NULL && ret == DB_LOCK_DEADLOCK)
+ F_SET(txn, TXN_DEADLOCK);
+ return ((ret == DB_LOCK_NOTGRANTED && !F_ISSET(env->dbenv,
+ DB_ENV_TIME_NOTGRANTED)) ? DB_LOCK_DEADLOCK : ret);
+}
+
+#ifdef DIAGNOSTIC
+/*
+ * __db_haslock --
+ * Determine if this locker holds a particular lock.
+ * Returns 0 if lock is held, non-zero otherwise.
+ *
+ * PUBLIC: #ifdef DIAGNOSTIC
+ * PUBLIC: int __db_haslock __P((ENV *, DB_LOCKER *,
+ * PUBLIC: DB_MPOOLFILE *, db_pgno_t, db_lockmode_t, u_int32_t));
+ * PUBLIC: #endif
+ */
+int
+__db_haslock(env, locker, dbmfp, pgno, mode, type)
+ ENV *env;
+ DB_LOCKER *locker;
+ DB_MPOOLFILE *dbmfp;
+ db_pgno_t pgno;
+ db_lockmode_t mode;
+ u_int32_t type;
+{
+ DBT lkdata;
+ DB_LOCK lock;
+ DB_LOCK_ILOCK ilock;
+
+ memset(&lkdata, 0, sizeof(lkdata));
+ lkdata.data = &ilock;
+ lkdata.size = sizeof(ilock);
+
+ memcpy(ilock.fileid, dbmfp->fileid, DB_FILE_ID_LEN);
+ ilock.pgno = pgno;
+ ilock.type = type;
+
+ return (__lock_get(env, locker, DB_LOCK_CHECK, &lkdata, mode, &lock));
+}
+/*
+ * __db_has_pagelock --
+ * Determine if this locker holds a particular page lock.
+ * Returns 0 if lock is held, non-zero otherwise.
+ *
+ * PUBLIC: #ifdef DIAGNOSTIC
+ * PUBLIC: int __db_has_pagelock __P((ENV *, DB_LOCKER *,
+ * PUBLIC: DB_MPOOLFILE *, PAGE *, db_lockmode_t));
+ * PUBLIC: #endif
+ */
+int
+__db_has_pagelock(env, locker, dbmfp, pagep, mode)
+ ENV *env;
+ DB_LOCKER *locker;
+ DB_MPOOLFILE *dbmfp;
+ PAGE *pagep;
+ db_lockmode_t mode;
+{
+ int ret;
+
+ switch (pagep->type) {
+ case P_OVERFLOW:
+ case P_INVALID:
+ case P_QAMDATA:
+ case P_QAMMETA:
+ case P_IHEAP:
+ return (0);
+ case P_HASH:
+ if (PREV_PGNO(pagep) != PGNO_INVALID)
+ return (0);
+ break;
+ default:
+ break;
+ }
+ if ((ret = __db_haslock(env,
+ locker, dbmfp, pagep->pgno, mode, DB_PAGE_LOCK)) != 0)
+ ret = __db_haslock(env,
+ locker, dbmfp, PGNO_BASE_MD, mode, DB_DATABASE_LOCK);
+ return (ret);
+}
+#endif
+
+/*
+ * __db_lput --
+ * The standard lock put call.
+ *
+ * PUBLIC: int __db_lput __P((DBC *, DB_LOCK *));
+ */
+int
+__db_lput(dbc, lockp)
+ DBC *dbc;
+ DB_LOCK *lockp;
+{
+ DB_LOCKREQ couple[2], *reqp;
+ ENV *env;
+ int action, ret;
+
+ /*
+ * Transactional locking.
+ * Hold on to the read locks only if we are in full isolation.
+ * Downgrade write locks if we are supporting dirty readers unless
+ * there was an error.
+ */
+ if (F_ISSET(dbc->dbp, DB_AM_READ_UNCOMMITTED) &&
+ !F_ISSET(dbc, DBC_ERROR) && lockp->mode == DB_LOCK_WRITE)
+ action = LCK_DOWNGRADE;
+ else if (dbc->txn == NULL)
+ action = LCK_COUPLE;
+ else if (F_ISSET(dbc, DBC_READ_COMMITTED | DBC_WAS_READ_COMMITTED) &&
+ lockp->mode == DB_LOCK_READ)
+ action = LCK_COUPLE;
+ else if (lockp->mode == DB_LOCK_READ_UNCOMMITTED)
+ action = LCK_COUPLE;
+ else
+ action = 0;
+
+ env = dbc->env;
+ switch (action) {
+ case LCK_COUPLE:
+ ret = __lock_put(env, lockp);
+ break;
+ case LCK_DOWNGRADE:
+ couple[0].op = DB_LOCK_GET;
+ couple[0].obj = NULL;
+ couple[0].mode = DB_LOCK_WWRITE;
+ couple[0].lock = *lockp;
+ UMRW_SET(couple[0].timeout);
+ couple[1].op = DB_LOCK_PUT;
+ couple[1].lock = *lockp;
+ ret = __lock_vec(env, dbc->locker, 0, couple, 2, &reqp);
+ if (ret == 0 || reqp == &couple[1])
+ *lockp = couple[0].lock;
+ break;
+ default:
+ ret = 0;
+ break;
+ }
+
+ return (ret);
+}