1 files changed, 843 insertions, 83 deletions
diff --git a/bdb/db/db_am.c b/bdb/db/db_am.c
index 2d224566904..cf6ef18549b 100644
--- a/bdb/db/db_am.c
+++ b/bdb/db/db_am.c
@@ -1,14 +1,14 @@
 /*-
  * See the file LICENSE for redistribution information.
  *
- * Copyright (c) 1998, 1999, 2000
+ * Copyright (c) 1998-2002
  *	Sleepycat Software.  All rights reserved.
  */
 
 #include "db_config.h"
 
 #ifndef lint
-static const char revid[] = "$Id: db_am.c,v 11.42 2001/01/11 18:19:50 bostic Exp $";
+static const char revid[] = "$Id: db_am.c,v 11.96 2002/08/27 15:17:32 bostic Exp $";
 #endif /* not lint */
 
 #ifndef NO_SYSTEM_INCLUDES
@@ -18,16 +18,22 @@ static const char revid[] = "$Id: db_am.c,v 11.42 2001/01/11 18:19:50 bostic Exp
 #endif
 
 #include "db_int.h"
-#include "db_page.h"
-#include "db_shash.h"
-#include "btree.h"
-#include "hash.h"
-#include "qam.h"
-#include "lock.h"
-#include "mp.h"
-#include "txn.h"
-#include "db_am.h"
-#include "db_ext.h"
+#include "dbinc/db_page.h"
+#include "dbinc/db_shash.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/log.h"
+#include "dbinc/mp.h"
+#include "dbinc/qam.h"
+
+static int __db_append_primary __P((DBC *, DBT *, DBT *));
+static int __db_secondary_get __P((DB *, DB_TXN *, DBT *, DBT *, u_int32_t));
+static int __db_secondary_close __P((DB *, u_int32_t));
+
+#ifdef DEBUG
+static int __db_cprint_item __P((DBC *));
+#endif
 
 /*
  * __db_cursor --
@@ -53,12 +59,22 @@ __db_cursor(dbp, txn, dbcp, flags)
 	PANIC_CHECK(dbenv);
 	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->cursor");
 
-	/* Check for invalid flags. */
-	if ((ret = __db_cursorchk(dbp, flags, F_ISSET(dbp, DB_AM_RDONLY))) != 0)
+	/* Validate arguments. */
+	if ((ret = __db_cursorchk(dbp, flags)) != 0)
 		return (ret);
 
-	if ((ret =
-	    __db_icursor(dbp, txn, dbp->type, PGNO_INVALID, 0, dbcp)) != 0)
+	/*
+	 * Check for consistent transaction usage.  For now, assume that
+	 * this cursor might be used for read operations only (in which
+	 * case it may not require a txn).  We'll check more stringently
+	 * in c_del and c_put.  (Note that this all means that the
+	 * read-op txn tests have to be a subset of the write-op ones.)
+	 */
+	if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 1)) != 0)
+		return (ret);
+
+	if ((ret = __db_icursor(dbp,
+	    txn, dbp->type, PGNO_INVALID, 0, DB_LOCK_INVALIDID, dbcp)) != 0)
 		return (ret);
 	dbc = *dbcp;
 
@@ -70,7 +86,7 @@ __db_cursor(dbp, txn, dbcp, flags)
 		op = LF_ISSET(DB_OPFLAGS_MASK);
 		mode = (op == DB_WRITELOCK) ? DB_LOCK_WRITE :
 		    ((op == DB_WRITECURSOR) ? DB_LOCK_IWRITE : DB_LOCK_READ);
-		if ((ret = lock_get(dbenv, dbc->locker, 0,
+		if ((ret = dbenv->lock_get(dbenv, dbc->locker, 0,
 		    &dbc->lock_dbt, mode, &dbc->mylock)) != 0) {
 			(void)__db_c_close(dbc);
 			return (ret);
@@ -81,6 +97,9 @@ __db_cursor(dbp, txn, dbcp, flags)
 			F_SET(dbc, DBC_WRITER);
 	}
 
+	if (LF_ISSET(DB_DIRTY_READ) ||
+	    (txn != NULL && F_ISSET(txn, TXN_DIRTY_READ)))
+		F_SET(dbc, DBC_DIRTY_READ);
 	return (0);
 }
 
@@ -91,15 +110,16 @@ __db_cursor(dbp, txn, dbcp, flags)
  *	initialize as a cursor.
  *
  * PUBLIC: int __db_icursor
- * PUBLIC:     __P((DB *, DB_TXN *, DBTYPE, db_pgno_t, int, DBC **));
+ * PUBLIC:     __P((DB *, DB_TXN *, DBTYPE, db_pgno_t, int, u_int32_t, DBC **));
  */
 int
-__db_icursor(dbp, txn, dbtype, root, is_opd, dbcp)
+__db_icursor(dbp, txn, dbtype, root, is_opd, lockerid, dbcp)
 	DB *dbp;
 	DB_TXN *txn;
 	DBTYPE dbtype;
 	db_pgno_t root;
 	int is_opd;
+	u_int32_t lockerid;
 	DBC **dbcp;
 {
 	DBC *dbc, *adbc;
@@ -120,7 +140,7 @@ __db_icursor(dbp, txn, dbtype, root, is_opd, dbcp)
 	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links))
 		if (dbtype == dbc->dbtype) {
 			TAILQ_REMOVE(&dbp->free_queue, dbc, links);
-			dbc->flags = 0;
+			F_CLR(dbc, ~DBC_OWN_LID);
 			break;
 		}
 	MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
@@ -144,11 +164,35 @@ __db_icursor(dbp, txn, dbtype, root, is_opd, dbcp)
 			if (!DB_IS_THREADED(dbp) &&
 			    (adbc = TAILQ_FIRST(&dbp->active_queue)) != NULL)
 				dbc->lid = adbc->lid;
-			else
-				if ((ret = lock_id(dbenv, &dbc->lid)) != 0)
+			else {
+				if ((ret =
+				    dbenv->lock_id(dbenv, &dbc->lid)) != 0)
 					goto err;
+				F_SET(dbc, DBC_OWN_LID);
+			}
+
+			/*
+			 * In CDB, secondary indices should share a lock file
+			 * ID with the primary;  otherwise we're susceptible to
+			 * deadlocks.  We also use __db_icursor rather
+			 * than sdbp->cursor to create secondary update
+			 * cursors in c_put and c_del;  these won't
+			 * acquire a new lock.
+			 *
+			 * !!!
+			 * Since this is in the one-time cursor allocation
+			 * code, we need to be sure to destroy, not just
+			 * close, all cursors in the secondary when we
+			 * associate.
+			 */
+			if (CDB_LOCKING(dbp->dbenv) &&
+			    F_ISSET(dbp, DB_AM_SECONDARY))
+				memcpy(dbc->lock.fileid,
+				    dbp->s_primary->fileid, DB_FILE_ID_LEN);
+			else
+				memcpy(dbc->lock.fileid,
+				    dbp->fileid, DB_FILE_ID_LEN);
 
-			memcpy(dbc->lock.fileid, dbp->fileid, DB_FILE_ID_LEN);
 			if (CDB_LOCKING(dbenv)) {
 				if (F_ISSET(dbenv, DB_ENV_CDB_ALLDB)) {
 					/*
@@ -198,18 +242,55 @@ __db_icursor(dbp, txn, dbtype, root, is_opd, dbcp)
 
 	/* Refresh the DBC structure. */
 	dbc->dbtype = dbtype;
+	RESET_RET_MEM(dbc);
 
-	if ((dbc->txn = txn) == NULL)
-		dbc->locker = dbc->lid;
-	else {
+	if ((dbc->txn = txn) == NULL) {
+		/*
+		 * There are certain cases in which we want to create a
+		 * new cursor with a particular locker ID that is known
+		 * to be the same as (and thus not conflict with) an
+		 * open cursor.
+		 *
+		 * The most obvious case is cursor duplication;  when we
+		 * call DBC->c_dup or __db_c_idup, we want to use the original
+		 * cursor's locker ID.
+		 *
+		 * Another case is when updating secondary indices.  Standard
+		 * CDB locking would mean that we might block ourself:  we need
+		 * to open an update cursor in the secondary while an update
+		 * cursor in the primary is open, and when the secondary and
+		 * primary are subdatabases or we're using env-wide locking,
+		 * this is disastrous.
+		 *
+		 * In these cases, our caller will pass a nonzero locker ID
+		 * into this function.  Use this locker ID instead of dbc->lid
+		 * as the locker ID for our new cursor.
+		 */
+		if (lockerid != DB_LOCK_INVALIDID)
+			dbc->locker = lockerid;
+		else
+			dbc->locker = dbc->lid;
+	} else {
 		dbc->locker = txn->txnid;
 		txn->cursors++;
 	}
 
+	/*
+	 * These fields change when we are used as a secondary index, so
+	 * if the DB is a secondary, make sure they're set properly just
+	 * in case we opened some cursors before we were associated.
+	 *
+	 * __db_c_get is used by all access methods, so this should be safe.
+	 */
+	if (F_ISSET(dbp, DB_AM_SECONDARY))
+		dbc->c_get = __db_c_secondary_get;
+
 	if (is_opd)
 		F_SET(dbc, DBC_OPD);
 	if (F_ISSET(dbp, DB_AM_RECOVER))
 		F_SET(dbc, DBC_RECOVER);
+	if (F_ISSET(dbp, DB_AM_COMPENSATE))
+		F_SET(dbc, DBC_COMPENSATE);
 
 	/* Refresh the DBC internal structure. */
 	cp = dbc->internal;
@@ -243,14 +324,14 @@ __db_icursor(dbp, txn, dbtype, root, is_opd, dbcp)
 	return (0);
 
 err:	if (allocated)
-		__os_free(dbc, sizeof(*dbc));
+		__os_free(dbp->dbenv, dbc);
 	return (ret);
 }
 
 #ifdef DEBUG
 /*
  * __db_cprint --
- *	Display the current cursor list.
+ *	Display the cursor active and free queues.
  *
  * PUBLIC: int __db_cprint __P((DB *));
  */
@@ -258,60 +339,76 @@ int
 __db_cprint(dbp)
 	DB *dbp;
 {
+	DBC *dbc;
+	int ret, t_ret;
+
+	ret = 0;
+	MUTEX_THREAD_LOCK(dbp->dbenv, dbp->mutexp);
+	fprintf(stderr, "Active queue:\n");
+	for (dbc = TAILQ_FIRST(&dbp->active_queue);
+	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links))
+		if ((t_ret = __db_cprint_item(dbc)) != 0 && ret == 0)
+			ret = t_ret;
+	fprintf(stderr, "Free queue:\n");
+	for (dbc = TAILQ_FIRST(&dbp->free_queue);
+	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links))
+		if ((t_ret = __db_cprint_item(dbc)) != 0 && ret == 0)
+			ret = t_ret;
+	MUTEX_THREAD_UNLOCK(dbp->dbenv, dbp->mutexp);
+
+	return (ret);
+}
+
+/*
+ * __db_cprint_item --
+ *	Display one cursor.  The access-method detail is chosen by the
+ *	cursor's own type, which may differ from dbp->type (off-page dups).
+ */
+static int
+__db_cprint_item(dbc)
+	DBC *dbc;
+{
 	static const FN fn[] = {
 		{ DBC_ACTIVE,		"active" },
+		{ DBC_COMPENSATE,	"compensate" },
 		{ DBC_OPD,		"off-page-dup" },
 		{ DBC_RECOVER,		"recover" },
 		{ DBC_RMW,		"read-modify-write" },
+		{ DBC_TRANSIENT,	"transient" },
 		{ DBC_WRITECURSOR,	"write cursor" },
 		{ DBC_WRITEDUP,		"internally dup'ed write cursor" },
 		{ DBC_WRITER,		"short-term write cursor" },
 		{ 0,			NULL }
 	};
-	DBC *dbc;
 	DBC_INTERNAL *cp;
-	char *s;
+	const char *s;
 
-	MUTEX_THREAD_LOCK(dbp->dbenv, dbp->mutexp);
-	for (dbc = TAILQ_FIRST(&dbp->active_queue);
-	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) {
-		switch (dbc->dbtype) {
-		case DB_BTREE:
-			s = "btree";
-			break;
-		case DB_HASH:
-			s = "hash";
-			break;
-		case DB_RECNO:
-			s = "recno";
-			break;
-		case DB_QUEUE:
-			s = "queue";
-			break;
-		default:
-			DB_ASSERT(0);
-			return (1);
-		}
-		cp = dbc->internal;
-		fprintf(stderr, "%s/%#0lx: opd: %#0lx\n",
-		    s, P_TO_ULONG(dbc), P_TO_ULONG(cp->opd));
-		fprintf(stderr, "\ttxn: %#0lx lid: %lu locker: %lu\n",
-		    P_TO_ULONG(dbc->txn),
-		    (u_long)dbc->lid, (u_long)dbc->locker);
-		fprintf(stderr, "\troot: %lu page/index: %lu/%lu",
-		    (u_long)cp->root, (u_long)cp->pgno, (u_long)cp->indx);
-		__db_prflags(dbc->flags, fn, stderr);
-		fprintf(stderr, "\n");
-
-		if (dbp->type == DB_BTREE)
-			__bam_cprint(dbc);
+	cp = dbc->internal;
+	s = __db_dbtype_to_string(dbc->dbtype);
+	if (strcmp(s, "UNKNOWN TYPE") == 0) {
+		DB_ASSERT(0);
+		return (1);
 	}
-	for (dbc = TAILQ_FIRST(&dbp->free_queue);
-	    dbc != NULL; dbc = TAILQ_NEXT(dbc, links))
-		fprintf(stderr, "free: %#0lx ", P_TO_ULONG(dbc));
+	fprintf(stderr, "%s/%#0lx: opd: %#0lx\n",
+	    s, P_TO_ULONG(dbc), P_TO_ULONG(cp->opd));
+	fprintf(stderr, "\ttxn: %#0lx lid: %lu locker: %lu\n",
+	    P_TO_ULONG(dbc->txn), (u_long)dbc->lid, (u_long)dbc->locker);
+	fprintf(stderr, "\troot: %lu page/index: %lu/%lu",
+	    (u_long)cp->root, (u_long)cp->pgno, (u_long)cp->indx);
+	__db_prflags(dbc->flags, fn, stderr);
 	fprintf(stderr, "\n");
-	MUTEX_THREAD_UNLOCK(dbp->dbenv, dbp->mutexp);
 
+	switch (dbc->dbtype) {
+	case DB_BTREE:
+	case DB_RECNO:
+		__bam_cprint(dbc);
+		break;
+	case DB_HASH:
+		__ham_cprint(dbc);
+		break;
+	default:
+		break;
+	}
 	return (0);
 }
 #endif /* DEBUG */
@@ -345,7 +442,7 @@ __db_fd(dbp, fdp)
 		return (0);
 	} else {
 		*fdp = -1;
-		__db_err(dbp->dbenv, "DB does not have a valid file handle.");
+		__db_err(dbp->dbenv, "DB does not have a valid file handle");
 		return (ENOENT);
 	}
 }
@@ -372,8 +469,16 @@ __db_get(dbp, txn, key, data, flags)
 	if ((ret = __db_getchk(dbp, key, data, flags)) != 0)
 		return (ret);
 
+	/* Check for consistent transaction usage. */
+	if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 1)) != 0)
+		return (ret);
+
 	mode = 0;
-	if (flags == DB_CONSUME || flags == DB_CONSUME_WAIT)
+	if (LF_ISSET(DB_DIRTY_READ)) {
+		mode = DB_DIRTY_READ;
+		LF_CLR(DB_DIRTY_READ);
+	}
+	else if (flags == DB_CONSUME || flags == DB_CONSUME_WAIT)
 		mode = DB_WRITELOCK;
 	if ((ret = dbp->cursor(dbp, txn, &dbc, mode)) != 0)
 		return (ret);
@@ -387,11 +492,17 @@ __db_get(dbp, txn, key, data, flags)
 	 * going to close it right away.  Thus, we can perform the get
 	 * without duplicating the cursor, saving some cycles in this
 	 * common case.
+	 *
+	 * SET_RET_MEM indicates that if key and/or data have no DBT
+	 * flags set and DB manages the returned-data memory, that memory
+	 * will belong to this handle, not to the underlying cursor.
 	 */
 	F_SET(dbc, DBC_TRANSIENT);
+	SET_RET_MEM(dbc, dbp);
 
-	ret = dbc->c_get(dbc, key, data,
-	    flags == 0 || flags == DB_RMW ? flags | DB_SET : flags);
+	if (LF_ISSET(~(DB_RMW | DB_MULTIPLE)) == 0)
+		LF_SET(DB_SET);
+	ret = dbc->c_get(dbc, key, data, flags);
 
 	if ((t_ret = __db_c_close(dbc)) != 0 && ret == 0)
 		ret = t_ret;
@@ -414,20 +525,39 @@ __db_put(dbp, txn, key, data, flags)
 {
 	DBC *dbc;
 	DBT tdata;
-	int ret, t_ret;
+	DB_ENV *dbenv;
+	int ret, t_ret, txn_local;
 
-	PANIC_CHECK(dbp->dbenv);
+	dbc = NULL;
+	dbenv = dbp->dbenv;
+	txn_local = 0;
+
+	PANIC_CHECK(dbenv);
 	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->put");
 
+	/* Validate arguments. */
 	if ((ret = __db_putchk(dbp, key, data,
-	    flags, F_ISSET(dbp, DB_AM_RDONLY),
-	    F_ISSET(dbp, DB_AM_DUP) || F_ISSET(key, DB_DBT_DUPOK))) != 0)
+	    flags, F_ISSET(dbp, DB_AM_DUP) || F_ISSET(key, DB_DBT_DUPOK))) != 0)
 		return (ret);
 
-	DB_CHECK_TXN(dbp, txn);
+	/* Create local transaction as necessary. */
+	if (IS_AUTO_COMMIT(dbenv, txn, flags)) {
+		if ((ret = __db_txn_auto(dbp, &txn)) != 0)
+			return (ret);
+		txn_local = 1;
+		LF_CLR(DB_AUTO_COMMIT);
+	}
+
+	/* Check for consistent transaction usage. */
+	if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0)
+		goto err;
 
 	if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0)
-		return (ret);
+		goto err;
+
+	DEBUG_LWRITE(dbc, txn, "db_put", key, data, flags);
+
+	SET_RET_MEM(dbc, dbp);
 
 	/*
 	 * See the comment in __db_get().
@@ -441,9 +571,58 @@ __db_put(dbp, txn, key, data, flags)
 	 */
 	F_SET(dbc, DBC_TRANSIENT);
 
-	DEBUG_LWRITE(dbc, txn, "__db_put", key, data, flags);
+	switch (flags) {
+	case DB_APPEND:
+		/*
+		 * If there is an append callback, the value stored in
+		 * data->data may be replaced and then freed.  To avoid
+		 * passing a freed pointer back to the user, just operate
+		 * on a copy of the data DBT.
+		 */
+		tdata = *data;
 
-	if (flags == DB_NOOVERWRITE) {
+		/*
+		 * Append isn't a normal put operation;  call the appropriate
+		 * access method's append function.
+		 */
+		switch (dbp->type) {
+		case DB_QUEUE:
+			if ((ret = __qam_append(dbc, key, &tdata)) != 0)
+				goto err;
+			break;
+		case DB_RECNO:
+			if ((ret = __ram_append(dbc, key, &tdata)) != 0)
+				goto err;
+			break;
+		default:
+			/* The interface should prevent this. */
+			DB_ASSERT(0);
+			ret = __db_ferr(dbenv, "__db_put", flags);
+			goto err;
+		}
+
+		/*
+		 * Secondary indices:  since we've returned zero from
+		 * an append function, we've just put a record, and done
+		 * so outside __db_c_put.  We know we're not a secondary--
+		 * the interface prevents puts on them--but we may be a
+		 * primary.  If so, update our secondary indices
+		 * appropriately.
+		 */
+		DB_ASSERT(!F_ISSET(dbp, DB_AM_SECONDARY));
+
+		if (LIST_FIRST(&dbp->s_secondaries) != NULL)
+			ret = __db_append_primary(dbc, key, &tdata);
+
+		/*
+		 * The append callback, if one exists, may have allocated
+		 * a new tdata.data buffer.  If so, free it.
+		 */
+		FREE_IF_NEEDED(dbp, &tdata);
+
+		/* No need for a cursor put;  we're done. */
+		goto err;
+	case DB_NOOVERWRITE:
 		flags = 0;
 		/*
 		 * Set DB_DBT_USERMEM, this might be a threaded application and
@@ -460,16 +639,161 @@ __db_put(dbp, txn, key, data, flags)
 		if ((ret = dbc->c_get(dbc, key, &tdata,
 		    DB_SET | (STD_LOCKING(dbc) ? DB_RMW : 0))) == 0)
 			ret = DB_KEYEXIST;
-		else if (ret == DB_NOTFOUND)
+		else if (ret == DB_NOTFOUND || ret == DB_KEYEMPTY)
 			ret = 0;
+		break;
+	default:
+		/* Fall through to normal cursor put. */
+		break;
 	}
 	if (ret == 0)
 		ret = dbc->c_put(dbc,
-		     key, data, flags == 0 ? DB_KEYLAST : flags);
+		    key, data, flags == 0 ? DB_KEYLAST : flags);
 
-	if ((t_ret = __db_c_close(dbc)) != 0 && ret == 0)
+err:	/* Close the cursor. */
+	if (dbc != NULL && (t_ret = __db_c_close(dbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	/* Commit for DB_AUTO_COMMIT. */
+	if (txn_local) {
+		if (ret == 0)
+			ret = txn->commit(txn, 0);
+		else
+			if ((t_ret = txn->abort(txn)) != 0)
+				ret = __db_panic(dbenv, t_ret);
+	}
+
+	return (ret);
+}
+
+/*
+ * __db_delete --
+ *	Delete every key/data pair stored under a key, duplicates included.
+ *
+ * PUBLIC: int __db_delete __P((DB *, DB_TXN *, DBT *, u_int32_t));
+ */
+int
+__db_delete(dbp, txn, key, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	DBT *key;
+	u_int32_t flags;
+{
+	DBC *dbc;
+	DBT data, lkey;
+	DB_ENV *dbenv;
+	u_int32_t f_init, f_next;
+	int ret, t_ret, txn_local;
+
+	dbc = NULL;
+	dbenv = dbp->dbenv;
+	txn_local = 0;
+
+	PANIC_CHECK(dbenv);
+	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->del");
+
+	/* Validate the key and flags arguments. */
+	if ((ret = __db_delchk(dbp, key, flags)) != 0)
+		return (ret);
+
+	/* Create local transaction as necessary. */
+	if (IS_AUTO_COMMIT(dbenv, txn, flags)) {
+		if ((ret = __db_txn_auto(dbp, &txn)) != 0)
+			return (ret);
+		txn_local = 1;
+		LF_CLR(DB_AUTO_COMMIT);
+	}
+
+	/* Check for consistent transaction usage. */
+	if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0)
+		goto err;
+
+	/* Allocate a cursor. */
+	if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0)
+		goto err;
+
+	DEBUG_LWRITE(dbc, txn, "db_delete", key, NULL, flags);
+
+	/*
+	 * Walk a cursor through the key/data pairs, deleting as we go.  Set
+	 * the DB_DBT_USERMEM flag, as this might be a threaded application
+	 * and the flags checking will catch us.  We don't actually want the
+	 * keys or data, so request a partial of length 0.
+	 */
+	memset(&lkey, 0, sizeof(lkey));
+	F_SET(&lkey, DB_DBT_USERMEM | DB_DBT_PARTIAL);
+	memset(&data, 0, sizeof(data));
+	F_SET(&data, DB_DBT_USERMEM | DB_DBT_PARTIAL);
+
+	/*
+	 * If locking (and we haven't already acquired CDB locks), set the
+	 * read-modify-write flag.
+	 */
+	f_init = DB_SET;
+	f_next = DB_NEXT_DUP;
+	if (STD_LOCKING(dbc)) {
+		f_init |= DB_RMW;
+		f_next |= DB_RMW;
+	}
+
+	/* Position on the key's first item;  any failure ends the call. */
+	if ((ret = dbc->c_get(dbc, key, &data, f_init)) != 0)
+		goto err;
+
+	/*
+	 * Hash permits an optimization in DB->del:  since on-page
+	 * duplicates are stored in a single HKEYDATA structure, it's
+	 * possible to delete an entire set of them at once, and as
+	 * the HKEYDATA has to be rebuilt and re-put each time it
+	 * changes, this is much faster than deleting the duplicates
+	 * one by one.  Thus, if we're not pointing at an off-page
+	 * duplicate set, and we're not using secondary indices (in
+	 * which case we'd have to examine the items one by one anyway),
+	 * let hash do this "quick delete".
+	 *
+	 * !!!
+	 * Note that this is the only application-executed delete call in
+	 * Berkeley DB that does not go through the __db_c_del function.
+	 * If anything other than the delete itself (like a secondary index
+	 * update) has to happen there in a particular situation, the
+	 * conditions here should be modified not to call __ham_quick_delete.
+	 * The ordinary AM-independent alternative will work just fine with
+	 * a hash;  it'll just be slower.
+	 */
+	if (dbp->type == DB_HASH) {
+		if (LIST_FIRST(&dbp->s_secondaries) == NULL &&
+		    !F_ISSET(dbp, DB_AM_SECONDARY) &&
+		    dbc->internal->opd == NULL) {
+			ret = __ham_quick_delete(dbc);
+			goto err;
+		}
+	}
+	/* Delete item by item;  DB_NOTFOUND from DB_NEXT_DUP means done. */
+	for (;;) {
+		if ((ret = dbc->c_del(dbc, 0)) != 0)
+			goto err;
+		if ((ret = dbc->c_get(dbc, &lkey, &data, f_next)) != 0) {
+			if (ret == DB_NOTFOUND) {
+				ret = 0;
+				break;
+			}
+			goto err;
+		}
+	}
+
+err:	/* Discard the cursor. */
+	if (dbc != NULL && (t_ret = dbc->c_close(dbc)) != 0 && ret == 0)
 		ret = t_ret;
 
+	/* DB_AUTO_COMMIT:  commit on success, otherwise abort. */
+	if (txn_local) {
+		if (ret == 0)
+			ret = txn->commit(txn, 0);
+		else
+			if ((t_ret = txn->abort(txn)) != 0)
+				ret = __db_panic(dbenv, t_ret);
+	}
+
 	return (ret);
 }
 
@@ -505,7 +829,443 @@ __db_sync(dbp, flags)
 		return (0);
 
 	/* Flush any dirty pages from the cache to the backing file. */
-	if ((t_ret = memp_fsync(dbp->mpf)) != 0 && ret == 0)
+	if ((t_ret = dbp->mpf->sync(dbp->mpf)) != 0 && ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+
+/*
+ * __db_associate --
+ *	Associate another database as a secondary index to this one.
+ *	dbp is the primary, sdbp becomes its secondary, and callback
+ *	derives a secondary key from a primary key/data pair.  With
+ *	DB_CREATE, an empty secondary is populated from the primary.
+ *	Returns 0 or an error;  any local transaction is resolved.
+ *
+ * PUBLIC: int __db_associate __P((DB *, DB_TXN *, DB *,
+ * PUBLIC:     int (*)(DB *, const DBT *, const DBT *, DBT *), u_int32_t));
+ */
+int
+__db_associate(dbp, txn, sdbp, callback, flags)
+	DB *dbp, *sdbp;
+	DB_TXN *txn;
+	int (*callback) __P((DB *, const DBT *, const DBT *, DBT *));
+	u_int32_t flags;
+{
+	DB_ENV *dbenv;
+	DBC *pdbc, *sdbc;
+	DBT skey, key, data;
+	int build, ret, t_ret, txn_local;
+
+	dbenv = dbp->dbenv;
+
+	PANIC_CHECK(dbenv);
+
+	txn_local = 0;
+	pdbc = NULL;
+
+	if ((ret = __db_associatechk(dbp, sdbp, callback, flags)) != 0)
+		return (ret);
+
+	/*
+	 * Create a local transaction as necessary, and reject a transaction
+	 * handle passed in when the environment is not transactional.
+	 */
+	if (IS_AUTO_COMMIT(dbenv, txn, flags)) {
+		if ((ret = __db_txn_auto(dbp, &txn)) != 0)
+			return (ret);
+		txn_local = 1;
+	} else if (txn != NULL && !TXN_ON(dbenv))
+		return (__db_not_txn_env(dbenv));
+
+	/*
+	 * Check that if an open transaction is in progress, we're in it,
+	 * for other common transaction errors, and for concurrent associates.
+	 * On failure, leave associate_lid for its owner but resolve our txn.
+	 */
+	if ((ret = __db_check_txn(dbp, txn, DB_LOCK_INVALIDID, 0)) != 0)
+		goto done;
+
+	sdbp->s_callback = callback;
+	sdbp->s_primary = dbp;
+
+	sdbp->stored_get = sdbp->get;
+	sdbp->get = __db_secondary_get;
+
+	sdbp->stored_close = sdbp->close;
+	sdbp->close = __db_secondary_close;
+
+	/*
+	 * Secondary cursors may have the primary's lock file ID, so we
+	 * need to make sure that no older cursors are lying around
+	 * when we make the transition.
+	 */
+	if (TAILQ_FIRST(&sdbp->active_queue) != NULL ||
+	    TAILQ_FIRST(&sdbp->join_queue) != NULL) {
+		__db_err(dbenv,
+    "Databases may not become secondary indices while cursors are open");
+		ret = EINVAL;
+		goto err;
+	}
+	while ((sdbc = TAILQ_FIRST(&sdbp->free_queue)) != NULL)
+		if ((ret = __db_c_destroy(sdbc)) != 0)
+			goto err;
+
+	F_SET(sdbp, DB_AM_SECONDARY);
+
+	/*
+	 * Check to see if the secondary is empty--and thus if we should
+	 * build it--before we link it in and risk making it show up in
+	 * other threads.
+	 */
+	build = 0;
+	if (LF_ISSET(DB_CREATE)) {
+		if ((ret = sdbp->cursor(sdbp, txn, &sdbc, 0)) != 0)
+			goto err;
+
+		memset(&key, 0, sizeof(DBT));
+		memset(&data, 0, sizeof(DBT));
+
+		/*
+		 * We don't care about key or data;  we're just doing
+		 * an existence check.
+		 */
+		F_SET(&key, DB_DBT_PARTIAL | DB_DBT_USERMEM);
+		F_SET(&data, DB_DBT_PARTIAL | DB_DBT_USERMEM);
+		if ((ret = sdbc->c_real_get(sdbc, &key, &data,
+		    (STD_LOCKING(sdbc) ? DB_RMW : 0) |
+		    DB_FIRST)) == DB_NOTFOUND) {
+			build = 1;
+			ret = 0;
+		}
+
+		/*
+		 * Secondary cursors have special refcounting close
+		 * methods.  Be careful.
+		 */
+		if ((t_ret = __db_c_close(sdbc)) != 0 && ret == 0)
+			ret = t_ret;
+		if (ret != 0)
+			goto err;
+	}
+
+	/*
+	 * Add the secondary to the list on the primary.  Do it here
+	 * so that we see any updates that occur while we're walking
+	 * the primary.
+	 */
+	MUTEX_THREAD_LOCK(dbenv, dbp->mutexp);
+
+	/* See __db_s_next for an explanation of secondary refcounting. */
+	DB_ASSERT(sdbp->s_refcnt == 0);
+	sdbp->s_refcnt = 1;
+	LIST_INSERT_HEAD(&dbp->s_secondaries, sdbp, s_links);
+	MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp);
+
+	if (build) {
+		/*
+		 * We loop through the primary, putting each item we
+		 * find into the new secondary.
+		 *
+		 * If we're using CDB, opening these two cursors puts us
+		 * in a bit of a locking tangle:  CDB locks are done on the
+		 * primary, so that we stay deadlock-free, but that means
+		 * that updating the secondary while we have a read cursor
+		 * open on the primary will self-block.  To get around this,
+		 * we force the primary cursor to use the same locker ID
+		 * as the secondary, so they won't conflict.  This should
+		 * be harmless even if we're not using CDB.
+		 */
+		if ((ret = sdbp->cursor(sdbp, txn, &sdbc,
+		    CDB_LOCKING(sdbp->dbenv) ? DB_WRITECURSOR : 0)) != 0)
+			goto err;
+		if ((ret = __db_icursor(dbp,
+		    txn, dbp->type, PGNO_INVALID, 0, sdbc->locker, &pdbc)) != 0)
+			goto err1;
+
+		/* Lock out other threads, now that we have a locker ID. */
+		dbp->associate_lid = sdbc->locker;
+
+		memset(&key, 0, sizeof(DBT));
+		memset(&data, 0, sizeof(DBT));
+		while ((ret = pdbc->c_get(pdbc, &key, &data, DB_NEXT)) == 0) {
+			memset(&skey, 0, sizeof(DBT));
+			if ((ret = callback(sdbp, &key, &data, &skey)) != 0) {
+				if (ret == DB_DONOTINDEX)
+					continue;
+				goto err1;
+			}
+			if ((ret = sdbc->c_put(sdbc,
+			    &skey, &key, DB_UPDATE_SECONDARY)) != 0) {
+				FREE_IF_NEEDED(sdbp, &skey);
+				goto err1;
+			}
+
+			FREE_IF_NEEDED(sdbp, &skey);
+		}
+		if (ret == DB_NOTFOUND)
+			ret = 0;
+
+		/* Always close the secondary cursor; keep the first error. */
+err1:		if ((t_ret = sdbc->c_close(sdbc)) != 0 && ret == 0)
+			ret = t_ret;
+	}
+
+err:	if (pdbc != NULL && (t_ret = pdbc->c_close(pdbc)) != 0 && ret == 0)
+		ret = t_ret;
+
+	dbp->associate_lid = DB_LOCK_INVALIDID;
+
+done:	if (txn_local) {
+		if (ret == 0)
+			ret = txn->commit(txn, 0);
+		else
+			if ((t_ret = txn->abort(txn)) != 0)
+				ret = __db_panic(dbenv, t_ret);
+	}
+
+	return (ret);
+}
+
+/*
+ * __db_pget --
+ *	Look up a secondary key and return the matching primary key/data.
+ *
+ * PUBLIC: int __db_pget __P((DB *, DB_TXN *, DBT *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_pget(dbp, txn, skey, pkey, data, flags)
+	DB *dbp;
+	DB_TXN *txn;
+	DBT *skey, *pkey, *data;
+	u_int32_t flags;
+{
+	DBC *sdbc;
+	int close_ret, ret;
+
+	PANIC_CHECK(dbp->dbenv);
+	DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->pget");
+
+	ret = __db_pgetchk(dbp, skey, pkey, data, flags);
+	if (ret != 0)
+		return (ret);
+	ret = dbp->cursor(dbp, txn, &sdbc, 0);
+	if (ret != 0)
+		return (ret);
+	SET_RET_MEM(sdbc, dbp);
+
+	/*
+	 * SET_RET_MEM has just pointed the cursor's returned-key memory
+	 * at the DB handle's key memory, which is unsafe to use if the
+	 * handle is threaded.  That matters because the cursor's pget
+	 * fills in a default DBT for a NULL pkey and parks intermediate
+	 * primary keys in its returned-key memory.  So when the caller
+	 * passed no pkey, switch back to the returned-key memory owned
+	 * by the cursor itself.  That memory goes away when the cursor
+	 * is closed below, before we return, but that's fine:  no
+	 * primary key is being handed back in this case.
+	 */
+	if (pkey == NULL)
+		sdbc->rkey = &sdbc->my_rkey;
+
+	DEBUG_LREAD(sdbc, txn, "__db_pget", skey, NULL, flags);
+
+	/*
+	 * This is a perfectly ordinary secondary cursor, so its c_pget()
+	 * method does the real work;  a bare or RMW-only request is a set.
+	 */
+	if (flags == DB_RMW || flags == 0)
+		LF_SET(DB_SET);
+	ret = sdbc->c_pget(sdbc, skey, pkey, data, flags);
+
+	close_ret = __db_c_close(sdbc);
+	return (ret != 0 ? ret : close_ret);
+}
+
+/*
+ * __db_secondary_get --
+ *	DB->get() on a secondary index (installed by __db_associate):
+ *	calls DB->pget() with no primary-key DBT, returning its result.
+ */
+static int
+__db_secondary_get(sdbp, txn, skey, data, flags)
+	DB *sdbp;
+	DB_TXN *txn;
+	DBT *skey, *data;
+	u_int32_t flags;
+{
+
+	DB_ASSERT(F_ISSET(sdbp, DB_AM_SECONDARY));
+	return (sdbp->pget(sdbp, txn, skey, NULL, data, flags));
+}
+
+/*
+ * __db_secondary_close --
+ *	DB->close() replacement installed on secondaries.  It manages
+ *	the reference count so that a secondary is never really closed
+ *	underneath a primary that is still updating it.
+ */
+static int
+__db_secondary_close(sdbp, flags)
+	DB *sdbp;
+	u_int32_t flags;
+{
+	DB *pdbp;
+	int last_ref;
+
+	pdbp = sdbp->s_primary;
+	last_ref = 0;
+
+	MUTEX_THREAD_LOCK(pdbp->dbenv, pdbp->mutexp);
+	/*
+	 * A refcount of 1 on entry means no thread is updating this
+	 * secondary through the primary, so it can be closed for real.
+	 * Otherwise do nothing here:  the handle is closed when the
+	 * refcount is decremented in either __db_s_next or __db_s_done.
+	 */
+	DB_ASSERT(sdbp->s_refcnt != 0);
+	sdbp->s_refcnt--;
+	if (sdbp->s_refcnt == 0) {
+		/* The real close must wait until the mutex is dropped. */
+		LIST_REMOVE(sdbp, s_links);
+		last_ref = 1;
+	}
+	MUTEX_THREAD_UNLOCK(pdbp->dbenv, pdbp->mutexp);
+
+	/*
+	 * sdbp->close points back at this wrapper, so the genuine close
+	 * routine has to be invoked by name.
+	 */
+	if (!last_ref)
+		return (0);
+	return (__db_close(sdbp, flags));
+}
+
+/*
+ * __db_append_primary --
+ *	Update the secondary indices of a primary database after a
+ *	put(DB_APPEND) of key/data;  returns 0 or an error value.
+ */
+static int
+__db_append_primary(dbc, key, data)
+	DBC *dbc;
+	DBT *key, *data;
+{
+	DB *dbp, *sdbp;
+	DBC *sdbc, *pdbc;
+	DBT oldpkey, pkey, pdata, skey;
+	int cmp, ret, t_ret;
+
+	dbp = dbc->dbp;
+	sdbp = NULL;
+	ret = 0;
+
+	/*
+	 * A key or data DBT flagged DB_DBT_PARTIAL is not handed to the
+	 * secondary callbacks as is.  Partial appends are unusual, but
+	 * they are supported:  the item is re-read below and used instead.
+	 */
+	pdbc = NULL;
+	if (F_ISSET(data, DB_DBT_PARTIAL) || F_ISSET(key, DB_DBT_PARTIAL)) {
+		/*
+		 * The dbc we were passed is all set to pass things
+		 * back to the user;  we can't safely do a call on it.
+		 * Dup the cursor, grab the real data item (we don't
+		 * care what the key is--we've been passed it directly),
+		 * and use that instead of the data DBT we were passed.
+		 *
+		 * Note that we can get away with this simple get because
+		 * an appended item is by definition new, and the
+		 * correctly-constructed full data item from this partial
+		 * put is on the page waiting for us.
+		 */
+		if ((ret = __db_c_idup(dbc, &pdbc, DB_POSITIONI)) != 0)
+			return (ret);
+		memset(&pkey, 0, sizeof(DBT));
+		memset(&pdata, 0, sizeof(DBT));
+
+		if ((ret = pdbc->c_get(pdbc, &pkey, &pdata, DB_CURRENT)) != 0)
+			goto err;
+
+		key = &pkey;
+		data = &pdata;
+	}
+
+	/*
+	 * Loop through the secondary indices, putting a new item in
+	 * each that points to the appended item.
+	 *
+	 * This is much like the loop in "step 3" in __db_c_put, so
+	 * I'm not commenting heavily here;  it was unclean to excerpt
+	 * just that section into a common function, but the basic
+	 * overview is the same here.
+	 */
+	for (sdbp = __db_s_first(dbp);
+	    sdbp != NULL && ret == 0; ret = __db_s_next(&sdbp)) {
+		memset(&skey, 0, sizeof(DBT));
+		if ((ret = sdbp->s_callback(sdbp, key, data, &skey)) != 0) {
+			if (ret == DB_DONOTINDEX)
+				continue;
+			else
+				goto err;
+		}
+
+		if ((ret = __db_icursor(sdbp, dbc->txn, sdbp->type,
+		    PGNO_INVALID, 0, dbc->locker, &sdbc)) != 0) {
+			FREE_IF_NEEDED(sdbp, &skey);
+			goto err;
+		}
+		if (CDB_LOCKING(sdbp->dbenv)) {
+			DB_ASSERT(sdbc->mylock.off == LOCK_INVALID);
+			F_SET(sdbc, DBC_WRITER);
+		}
+
+		/*
+		 * Since we know we have a new primary key, it can't be a
+		 * duplicate duplicate in the secondary.  It can be a
+		 * duplicate in a secondary that doesn't support duplicates,
+		 * however, so we need to be careful to avoid an overwrite
+		 * (which would corrupt our index).
+		 */
+		if (!F_ISSET(sdbp, DB_AM_DUP)) {
+			memset(&oldpkey, 0, sizeof(DBT));
+			F_SET(&oldpkey, DB_DBT_MALLOC);
+			ret = sdbc->c_real_get(sdbc, &skey, &oldpkey,
+			    DB_SET | (STD_LOCKING(dbc) ? DB_RMW : 0));
+			if (ret == 0) {
+				cmp = __bam_defcmp(sdbp, &oldpkey, key);
+				/*
+				 * XXX
+				 * This needs to use the right free function
+				 * as soon as this is possible.
+				 */
+				__os_ufree(sdbp->dbenv,
+				    oldpkey.data);
+				if (cmp != 0) {
+					__db_err(sdbp->dbenv, "%s%s",
+			    "Append results in a non-unique secondary key in",
+			    " an index not configured to support duplicates");
+					ret = EINVAL;
+					goto err1;
+				}
+			} else if (ret != DB_NOTFOUND && ret != DB_KEYEMPTY)
+				goto err1;
+		}
+
+		ret = sdbc->c_put(sdbc, &skey, key, DB_UPDATE_SECONDARY);
+
+err1:		FREE_IF_NEEDED(sdbp, &skey);
+
+		if ((t_ret = sdbc->c_close(sdbc)) != 0 && ret == 0)
+			ret = t_ret;
+
+		if (ret != 0)
+			goto err;
+	}
+
+err:	if (pdbc != NULL && (t_ret = pdbc->c_close(pdbc)) != 0 && ret == 0)
+		ret = t_ret;
+	if (sdbp != NULL && (t_ret = __db_s_done(sdbp)) != 0 && ret == 0)
 		ret = t_ret;
 	return (ret);
 }