diff options
Diffstat (limited to 'storage/bdb/db/db_cam.c')
-rw-r--r-- | storage/bdb/db/db_cam.c | 2286 |
1 files changed, 2286 insertions, 0 deletions
diff --git a/storage/bdb/db/db_cam.c b/storage/bdb/db/db_cam.c new file mode 100644 index 00000000000..4de3467d4aa --- /dev/null +++ b/storage/bdb/db/db_cam.c @@ -0,0 +1,2286 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2000-2002 + * Sleepycat Software. All rights reserved. + */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: db_cam.c,v 11.114 2002/09/03 15:44:46 krinsky Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <string.h> +#endif + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/db_shash.h" +#include "dbinc/btree.h" +#include "dbinc/hash.h" +#include "dbinc/lock.h" +#include "dbinc/log.h" +#include "dbinc/qam.h" + +static int __db_buildpartial __P((DB *, DBT *, DBT *, DBT *)); +static int __db_c_cleanup __P((DBC *, DBC *, int)); +static int __db_c_del_secondary __P((DBC *)); +static int __db_c_pget_recno __P((DBC *, DBT *, DBT *, u_int32_t)); +static int __db_wrlock_err __P((DB_ENV *)); + +#define CDB_LOCKING_INIT(dbp, dbc) \ + /* \ + * If we are running CDB, this had better be either a write \ + * cursor or an immediate writer. If it's a regular writer, \ + * that means we have an IWRITE lock and we need to upgrade \ + * it to a write lock. \ + */ \ + if (CDB_LOCKING((dbp)->dbenv)) { \ + if (!F_ISSET(dbc, DBC_WRITECURSOR | DBC_WRITER)) \ + return (__db_wrlock_err(dbp->dbenv)); \ + \ + if (F_ISSET(dbc, DBC_WRITECURSOR) && \ + (ret = (dbp)->dbenv->lock_get((dbp)->dbenv, \ + (dbc)->locker, DB_LOCK_UPGRADE, &(dbc)->lock_dbt, \ + DB_LOCK_WRITE, &(dbc)->mylock)) != 0) \ + return (ret); \ + } +#define CDB_LOCKING_DONE(dbp, dbc) \ + /* Release the upgraded lock. 
*/ \ + if (F_ISSET(dbc, DBC_WRITECURSOR)) \ + (void)__lock_downgrade( \ + (dbp)->dbenv, &(dbc)->mylock, DB_LOCK_IWRITE, 0); +/* + * Copy the lock info from one cursor to another, so that locking + * in CDB can be done in the context of an internally-duplicated + * or off-page-duplicate cursor. + */ +#define CDB_LOCKING_COPY(dbp, dbc_o, dbc_n) \ + if (CDB_LOCKING((dbp)->dbenv) && \ + F_ISSET((dbc_o), DBC_WRITECURSOR | DBC_WRITEDUP)) { \ + memcpy(&(dbc_n)->mylock, &(dbc_o)->mylock, \ + sizeof((dbc_o)->mylock)); \ + /* This lock isn't ours to put--just discard it on close. */ \ + F_SET((dbc_n), DBC_WRITEDUP); \ + } + +/* + * __db_c_close -- + * Close the cursor. + * + * PUBLIC: int __db_c_close __P((DBC *)); + */ +int +__db_c_close(dbc) + DBC *dbc; +{ + DB *dbp; + DBC *opd; + DBC_INTERNAL *cp; + DB_ENV *dbenv; + int ret, t_ret; + + dbp = dbc->dbp; + dbenv = dbp->dbenv; + ret = 0; + + PANIC_CHECK(dbenv); + + /* + * If the cursor is already closed we have a serious problem, and we + * assume that the cursor isn't on the active queue. Don't do any of + * the remaining cursor close processing. + */ + if (!F_ISSET(dbc, DBC_ACTIVE)) { + if (dbp != NULL) + __db_err(dbenv, "Closing already-closed cursor"); + + DB_ASSERT(0); + return (EINVAL); + } + + cp = dbc->internal; + opd = cp->opd; + + /* + * Remove the cursor(s) from the active queue. We may be closing two + * cursors at once here, a top-level one and a lower-level, off-page + * duplicate one. The acess-method specific cursor close routine must + * close both of them in a single call. + * + * !!! + * Cursors must be removed from the active queue before calling the + * access specific cursor close routine, btree depends on having that + * order of operations. 
+ */ + MUTEX_THREAD_LOCK(dbenv, dbp->mutexp); + + if (opd != NULL) { + F_CLR(opd, DBC_ACTIVE); + TAILQ_REMOVE(&dbp->active_queue, opd, links); + } + F_CLR(dbc, DBC_ACTIVE); + TAILQ_REMOVE(&dbp->active_queue, dbc, links); + + MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp); + + /* Call the access specific cursor close routine. */ + if ((t_ret = + dbc->c_am_close(dbc, PGNO_INVALID, NULL)) != 0 && ret == 0) + ret = t_ret; + + /* + * Release the lock after calling the access method specific close + * routine, a Btree cursor may have had pending deletes. + */ + if (CDB_LOCKING(dbenv)) { + /* + * If DBC_WRITEDUP is set, the cursor is an internally + * duplicated write cursor and the lock isn't ours to put. + * + * Also, be sure not to free anything if mylock.off is + * INVALID; in some cases, such as idup'ed read cursors + * and secondary update cursors, a cursor in a CDB + * environment may not have a lock at all. + */ + if (!F_ISSET(dbc, DBC_WRITEDUP) && LOCK_ISSET(dbc->mylock)) { + if ((t_ret = dbenv->lock_put( + dbenv, &dbc->mylock)) != 0 && ret == 0) + ret = t_ret; + } + + /* For safety's sake, since this is going on the free queue. */ + memset(&dbc->mylock, 0, sizeof(dbc->mylock)); + F_CLR(dbc, DBC_WRITEDUP); + } + + if (dbc->txn != NULL) + dbc->txn->cursors--; + + /* Move the cursor(s) to the free queue. */ + MUTEX_THREAD_LOCK(dbenv, dbp->mutexp); + if (opd != NULL) { + if (dbc->txn != NULL) + dbc->txn->cursors--; + TAILQ_INSERT_TAIL(&dbp->free_queue, opd, links); + opd = NULL; + } + TAILQ_INSERT_TAIL(&dbp->free_queue, dbc, links); + MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp); + + return (ret); +} + +/* + * __db_c_destroy -- + * Destroy the cursor, called after DBC->c_close. + * + * PUBLIC: int __db_c_destroy __P((DBC *)); + */ +int +__db_c_destroy(dbc) + DBC *dbc; +{ + DB *dbp; + DB_ENV *dbenv; + int ret, t_ret; + + dbp = dbc->dbp; + dbenv = dbp->dbenv; + + /* Remove the cursor from the free queue. 
*/ + MUTEX_THREAD_LOCK(dbenv, dbp->mutexp); + TAILQ_REMOVE(&dbp->free_queue, dbc, links); + MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp); + + /* Free up allocated memory. */ + if (dbc->my_rskey.data != NULL) + __os_free(dbenv, dbc->my_rskey.data); + if (dbc->my_rkey.data != NULL) + __os_free(dbenv, dbc->my_rkey.data); + if (dbc->my_rdata.data != NULL) + __os_free(dbenv, dbc->my_rdata.data); + + /* Call the access specific cursor destroy routine. */ + ret = dbc->c_am_destroy == NULL ? 0 : dbc->c_am_destroy(dbc); + + /* + * Release the lock id for this cursor. + */ + if (LOCKING_ON(dbenv) && + F_ISSET(dbc, DBC_OWN_LID) && + (t_ret = dbenv->lock_id_free(dbenv, dbc->lid)) != 0 && ret == 0) + ret = t_ret; + + __os_free(dbenv, dbc); + + return (ret); +} + +/* + * __db_c_count -- + * Return a count of duplicate data items. + * + * PUBLIC: int __db_c_count __P((DBC *, db_recno_t *, u_int32_t)); + */ +int +__db_c_count(dbc, recnop, flags) + DBC *dbc; + db_recno_t *recnop; + u_int32_t flags; +{ + DB *dbp; + int ret; + + /* + * Cursor Cleanup Note: + * All of the cursors passed to the underlying access methods by this + * routine are not duplicated and will not be cleaned up on return. + * So, pages/locks that the cursor references must be resolved by the + * underlying functions. + */ + dbp = dbc->dbp; + + PANIC_CHECK(dbp->dbenv); + + /* Check for invalid flags. */ + if ((ret = __db_ccountchk(dbp, flags, IS_INITIALIZED(dbc))) != 0) + return (ret); + + switch (dbc->dbtype) { + case DB_QUEUE: + case DB_RECNO: + *recnop = 1; + break; + case DB_HASH: + if (dbc->internal->opd == NULL) { + if ((ret = __ham_c_count(dbc, recnop)) != 0) + return (ret); + break; + } + /* FALLTHROUGH */ + case DB_BTREE: + if ((ret = __bam_c_count(dbc, recnop)) != 0) + return (ret); + break; + default: + return (__db_unknown_type(dbp->dbenv, + "__db_c_count", dbp->type)); + } + return (0); +} + +/* + * __db_c_del -- + * Delete using a cursor. 
+ * + * PUBLIC: int __db_c_del __P((DBC *, u_int32_t)); + */ +int +__db_c_del(dbc, flags) + DBC *dbc; + u_int32_t flags; +{ + DB *dbp; + DBC *opd; + int ret; + + /* + * Cursor Cleanup Note: + * All of the cursors passed to the underlying access methods by this + * routine are not duplicated and will not be cleaned up on return. + * So, pages/locks that the cursor references must be resolved by the + * underlying functions. + */ + dbp = dbc->dbp; + + PANIC_CHECK(dbp->dbenv); + + /* Check for invalid flags. */ + if ((ret = __db_cdelchk(dbp, flags, IS_INITIALIZED(dbc))) != 0) + return (ret); + + /* Check for consistent transaction usage. */ + if ((ret = __db_check_txn(dbp, dbc->txn, dbc->locker, 0)) != 0) + return (ret); + + DEBUG_LWRITE(dbc, dbc->txn, "db_c_del", NULL, NULL, flags); + + CDB_LOCKING_INIT(dbp, dbc); + + /* + * If we're a secondary index, and DB_UPDATE_SECONDARY isn't set + * (which it only is if we're being called from a primary update), + * then we need to call through to the primary and delete the item. + * + * Note that this will delete the current item; we don't need to + * delete it ourselves as well, so we can just goto done. + */ + if (flags != DB_UPDATE_SECONDARY && F_ISSET(dbp, DB_AM_SECONDARY)) { + ret = __db_c_del_secondary(dbc); + goto done; + } + + /* + * If we are a primary and have secondary indices, go through + * and delete any secondary keys that point at the current record. + */ + if (LIST_FIRST(&dbp->s_secondaries) != NULL && + (ret = __db_c_del_primary(dbc)) != 0) + goto done; + + /* + * Off-page duplicate trees are locked in the primary tree, that is, + * we acquire a write lock in the primary tree and no locks in the + * off-page dup tree. If the del operation is done in an off-page + * duplicate tree, call the primary cursor's upgrade routine first. 
+ */ + opd = dbc->internal->opd; + if (opd == NULL) + ret = dbc->c_am_del(dbc); + else + if ((ret = dbc->c_am_writelock(dbc)) == 0) + ret = opd->c_am_del(opd); + +done: CDB_LOCKING_DONE(dbp, dbc); + + return (ret); +} + +/* + * __db_c_dup -- + * Duplicate a cursor + * + * PUBLIC: int __db_c_dup __P((DBC *, DBC **, u_int32_t)); + */ +int +__db_c_dup(dbc_orig, dbcp, flags) + DBC *dbc_orig; + DBC **dbcp; + u_int32_t flags; +{ + DB_ENV *dbenv; + DB *dbp; + DBC *dbc_n, *dbc_nopd; + int ret; + + dbp = dbc_orig->dbp; + dbenv = dbp->dbenv; + dbc_n = dbc_nopd = NULL; + + PANIC_CHECK(dbp->dbenv); + + /* + * We can never have two write cursors open in CDB, so do not + * allow duplication of a write cursor. + */ + if (flags != DB_POSITIONI && + F_ISSET(dbc_orig, DBC_WRITER | DBC_WRITECURSOR)) { + __db_err(dbenv, "Cannot duplicate writeable cursor"); + return (EINVAL); + } + + /* Allocate a new cursor and initialize it. */ + if ((ret = __db_c_idup(dbc_orig, &dbc_n, flags)) != 0) + goto err; + *dbcp = dbc_n; + + /* + * If we're in CDB, and this isn't an internal duplication (in which + * case we're explicitly overriding CDB locking), the duplicated + * cursor needs its own read lock. (We know it's not a write cursor + * because we wouldn't have made it this far; you can't dup them.) + */ + if (CDB_LOCKING(dbenv) && flags != DB_POSITIONI) { + DB_ASSERT(!F_ISSET(dbc_orig, DBC_WRITER | DBC_WRITECURSOR)); + + if ((ret = dbenv->lock_get(dbenv, dbc_n->locker, 0, + &dbc_n->lock_dbt, DB_LOCK_READ, &dbc_n->mylock)) != 0) { + (void)__db_c_close(dbc_n); + return (ret); + } + } + + /* + * If the cursor references an off-page duplicate tree, allocate a + * new cursor for that tree and initialize it. + */ + if (dbc_orig->internal->opd != NULL) { + if ((ret = + __db_c_idup(dbc_orig->internal->opd, &dbc_nopd, flags)) != 0) + goto err; + dbc_n->internal->opd = dbc_nopd; + } + + /* Copy the dirty read flag to the new cursor. 
*/ + F_SET(dbc_n, F_ISSET(dbc_orig, DBC_DIRTY_READ)); + return (0); + +err: if (dbc_n != NULL) + (void)dbc_n->c_close(dbc_n); + if (dbc_nopd != NULL) + (void)dbc_nopd->c_close(dbc_nopd); + + return (ret); +} + +/* + * __db_c_idup -- + * Internal version of __db_c_dup. + * + * PUBLIC: int __db_c_idup __P((DBC *, DBC **, u_int32_t)); + */ +int +__db_c_idup(dbc_orig, dbcp, flags) + DBC *dbc_orig, **dbcp; + u_int32_t flags; +{ + DB *dbp; + DBC *dbc_n; + DBC_INTERNAL *int_n, *int_orig; + int ret; + + dbp = dbc_orig->dbp; + dbc_n = *dbcp; + + if ((ret = __db_icursor(dbp, dbc_orig->txn, dbc_orig->dbtype, + dbc_orig->internal->root, F_ISSET(dbc_orig, DBC_OPD), + dbc_orig->locker, &dbc_n)) != 0) + return (ret); + + /* If the user wants the cursor positioned, do it here. */ + if (flags == DB_POSITION || flags == DB_POSITIONI) { + int_n = dbc_n->internal; + int_orig = dbc_orig->internal; + + dbc_n->flags |= dbc_orig->flags & ~DBC_OWN_LID; + + int_n->indx = int_orig->indx; + int_n->pgno = int_orig->pgno; + int_n->root = int_orig->root; + int_n->lock_mode = int_orig->lock_mode; + + switch (dbc_orig->dbtype) { + case DB_QUEUE: + if ((ret = __qam_c_dup(dbc_orig, dbc_n)) != 0) + goto err; + break; + case DB_BTREE: + case DB_RECNO: + if ((ret = __bam_c_dup(dbc_orig, dbc_n)) != 0) + goto err; + break; + case DB_HASH: + if ((ret = __ham_c_dup(dbc_orig, dbc_n)) != 0) + goto err; + break; + default: + ret = __db_unknown_type(dbp->dbenv, + "__db_c_idup", dbc_orig->dbtype); + goto err; + } + } + + /* Now take care of duping the CDB information. */ + CDB_LOCKING_COPY(dbp, dbc_orig, dbc_n); + + /* Copy the dirty read flag to the new cursor. */ + F_SET(dbc_n, F_ISSET(dbc_orig, DBC_DIRTY_READ)); + + *dbcp = dbc_n; + return (0); + +err: (void)dbc_n->c_close(dbc_n); + return (ret); +} + +/* + * __db_c_newopd -- + * Create a new off-page duplicate cursor. 
+ * + * PUBLIC: int __db_c_newopd __P((DBC *, db_pgno_t, DBC *, DBC **)); + */ +int +__db_c_newopd(dbc_parent, root, oldopd, dbcp) + DBC *dbc_parent; + db_pgno_t root; + DBC *oldopd; + DBC **dbcp; +{ + DB *dbp; + DBC *opd; + DBTYPE dbtype; + int ret; + + dbp = dbc_parent->dbp; + dbtype = (dbp->dup_compare == NULL) ? DB_RECNO : DB_BTREE; + + /* + * On failure, we want to default to returning the old off-page dup + * cursor, if any; our caller can't be left with a dangling pointer + * to a freed cursor. On error the only allowable behavior is to + * close the cursor (and the old OPD cursor it in turn points to), so + * this should be safe. + */ + *dbcp = oldopd; + + if ((ret = __db_icursor(dbp, + dbc_parent->txn, dbtype, root, 1, dbc_parent->locker, &opd)) != 0) + return (ret); + + /* !!! + * If the parent is a DBC_WRITER, this won't copy anything. That's + * not actually a problem--we only need lock information in an + * off-page dup cursor in order to upgrade at cursor close time + * if we've done a delete, but WRITERs don't need to upgrade. + */ + CDB_LOCKING_COPY(dbp, dbc_parent, opd); + + *dbcp = opd; + + /* + * Check to see if we already have an off-page dup cursor that we've + * passed in. If we do, close it. It'd be nice to use it again + * if it's a cursor belonging to the right tree, but if we're doing + * a cursor-relative operation this might not be safe, so for now + * we'll take the easy way out and always close and reopen. + * + * Note that under no circumstances do we want to close the old + * cursor without returning a valid new one; we don't want to + * leave the main cursor in our caller with a non-NULL pointer + * to a freed off-page dup cursor. + */ + if (oldopd != NULL && (ret = oldopd->c_close(oldopd)) != 0) + return (ret); + + return (0); +} + +/* + * __db_c_get -- + * Get using a cursor. 
+ *
+ *	DB_RMW, DB_DIRTY_READ and DB_MULTIPLE/DB_MULTIPLE_KEY are stripped
+ *	from flags up front and remembered separately, so the remaining
+ *	value can be compared for equality. DB_GET_RECNO is answered
+ *	directly by __bam_c_rget on dbc_arg. Every other operation is run
+ *	on a duplicate of the off-page duplicate cursor and/or of dbc_arg
+ *	(dbc_arg itself when DBC_TRANSIENT is set), and the result is
+ *	resolved against the caller's cursor through __db_c_cleanup.
+ *
+ *	Returns 0 on success or the first error encountered.
+ *
+ * PUBLIC: int __db_c_get __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_c_get(dbc_arg, key, data, flags)
+	DBC *dbc_arg;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	DB *dbp;
+	DBC *dbc, *dbc_n, *opd;
+	DBC_INTERNAL *cp, *cp_n;
+	DB_MPOOLFILE *mpf;
+	db_pgno_t pgno;
+	u_int32_t multi, tmp_dirty, tmp_flags, tmp_rmw;
+	u_int8_t type;
+	int ret, t_ret;
+
+	/*
+	 * Cursor Cleanup Note:
+	 * All of the cursors passed to the underlying access methods by this
+	 * routine are duplicated cursors. On return, any referenced pages
+	 * will be discarded, and, if the cursor is not intended to be used
+	 * again, the close function will be called. So, pages/locks that
+	 * the cursor references do not need to be resolved by the underlying
+	 * functions.
+	 */
+	dbp = dbc_arg->dbp;
+	mpf = dbp->mpf;
+	dbc_n = NULL;
+	opd = NULL;
+
+	PANIC_CHECK(dbp->dbenv);
+
+	/* Check for invalid flags. */
+	if ((ret =
+	    __db_cgetchk(dbp, key, data, flags, IS_INITIALIZED(dbc_arg))) != 0)
+		return (ret);
+
+	/* Clear OR'd in additional bits so we can check for flag equality. */
+	tmp_rmw = LF_ISSET(DB_RMW);
+	LF_CLR(DB_RMW);
+
+	tmp_dirty = LF_ISSET(DB_DIRTY_READ);
+	LF_CLR(DB_DIRTY_READ);
+
+	multi = LF_ISSET(DB_MULTIPLE|DB_MULTIPLE_KEY);
+	LF_CLR(DB_MULTIPLE|DB_MULTIPLE_KEY);
+
+	DEBUG_LREAD(dbc_arg, dbc_arg->txn, "db_c_get",
+	    flags == DB_SET || flags == DB_SET_RANGE ? key : NULL, NULL, flags);
+
+	/*
+	 * Return a cursor's record number. It has nothing to do with the
+	 * cursor get code except that it was put into the interface.
+	 */
+	if (flags == DB_GET_RECNO) {
+		/* The RMW/dirty-read bits are set only around this call. */
+		if (tmp_rmw)
+			F_SET(dbc_arg, DBC_RMW);
+		if (tmp_dirty)
+			F_SET(dbc_arg, DBC_DIRTY_READ);
+		ret = __bam_c_rget(dbc_arg, data);
+		if (tmp_rmw)
+			F_CLR(dbc_arg, DBC_RMW);
+		if (tmp_dirty)
+			F_CLR(dbc_arg, DBC_DIRTY_READ);
+		return (ret);
+	}
+
+	/* Matched by the CDB_LOCKING_DONE just before the final return. */
+	if (flags == DB_CONSUME || flags == DB_CONSUME_WAIT)
+		CDB_LOCKING_INIT(dbp, dbc_arg);
+
+	/*
+	 * If we have an off-page duplicates cursor, and the operation applies
+	 * to it, perform the operation. Duplicate the cursor and call the
+	 * underlying function.
+	 *
+	 * Off-page duplicate trees are locked in the primary tree, that is,
+	 * we acquire a write lock in the primary tree and no locks in the
+	 * off-page dup tree. If the DB_RMW flag was specified and the get
+	 * operation is done in an off-page duplicate tree, call the primary
+	 * cursor's upgrade routine first.
+	 */
+	cp = dbc_arg->internal;
+	if (cp->opd != NULL &&
+	    (flags == DB_CURRENT || flags == DB_GET_BOTHC ||
+	    flags == DB_NEXT || flags == DB_NEXT_DUP || flags == DB_PREV)) {
+		if (tmp_rmw && (ret = dbc_arg->c_am_writelock(dbc_arg)) != 0)
+			return (ret);
+		if ((ret = __db_c_idup(cp->opd, &opd, DB_POSITIONI)) != 0)
+			return (ret);
+
+		switch (ret =
+		    opd->c_am_get(opd, key, data, flags, NULL)) {
+		case 0:
+			goto done;
+		case DB_NOTFOUND:
+			/*
+			 * Translate DB_NOTFOUND failures for the DB_NEXT and
+			 * DB_PREV operations into a subsequent operation on
+			 * the parent cursor.
+			 */
+			if (flags == DB_NEXT || flags == DB_PREV) {
+				if ((ret = opd->c_close(opd)) != 0)
+					goto err;
+				opd = NULL;
+				break;
+			}
+			goto err;
+		default:
+			goto err;
+		}
+	}
+
+	/*
+	 * Perform an operation on the main cursor. Duplicate the cursor,
+	 * upgrade the lock as required, and call the underlying function.
+	 *
+	 * Cursor-relative operations need the duplicate positioned like the
+	 * original (DB_POSITIONI); the others start from a fresh cursor.
+	 */
+	switch (flags) {
+	case DB_CURRENT:
+	case DB_GET_BOTHC:
+	case DB_NEXT:
+	case DB_NEXT_DUP:
+	case DB_NEXT_NODUP:
+	case DB_PREV:
+	case DB_PREV_NODUP:
+		tmp_flags = DB_POSITIONI;
+		break;
+	default:
+		tmp_flags = 0;
+		break;
+	}
+
+	/* Set on dbc_arg first: __db_c_idup copies DBC_DIRTY_READ over. */
+	if (tmp_dirty)
+		F_SET(dbc_arg, DBC_DIRTY_READ);
+
+	/*
+	 * If this cursor is going to be closed immediately, we don't
+	 * need to take precautions to clean it up on error.
+	 */
+	if (F_ISSET(dbc_arg, DBC_TRANSIENT))
+		dbc_n = dbc_arg;
+	else {
+		ret = __db_c_idup(dbc_arg, &dbc_n, tmp_flags);
+		if (tmp_dirty)
+			F_CLR(dbc_arg, DBC_DIRTY_READ);
+
+		if (ret != 0)
+			goto err;
+		COPY_RET_MEM(dbc_arg, dbc_n);
+	}
+
+	if (tmp_rmw)
+		F_SET(dbc_n, DBC_RMW);
+
+	switch (multi) {
+	case DB_MULTIPLE:
+		F_SET(dbc_n, DBC_MULTIPLE);
+		break;
+	case DB_MULTIPLE_KEY:
+		F_SET(dbc_n, DBC_MULTIPLE_KEY);
+		break;
+	case DB_MULTIPLE | DB_MULTIPLE_KEY:
+		F_SET(dbc_n, DBC_MULTIPLE|DBC_MULTIPLE_KEY);
+		break;
+	case 0:
+		break;
+	}
+
+	/*
+	 * pgno is an out-parameter here: it is compared against
+	 * PGNO_INVALID below to decide whether a new off-page duplicate
+	 * cursor is needed.
+	 */
+	pgno = PGNO_INVALID;
+	ret = dbc_n->c_am_get(dbc_n, key, data, flags, &pgno);
+	if (tmp_rmw)
+		F_CLR(dbc_n, DBC_RMW);
+	if (tmp_dirty)
+		F_CLR(dbc_arg, DBC_DIRTY_READ);
+	F_CLR(dbc_n, DBC_MULTIPLE|DBC_MULTIPLE_KEY);
+	if (ret != 0)
+		goto err;
+
+	cp_n = dbc_n->internal;
+
+	/*
+	 * We may be referencing a new off-page duplicates tree. Acquire
+	 * a new cursor and call the underlying function.
+	 */
+	if (pgno != PGNO_INVALID) {
+		if ((ret = __db_c_newopd(dbc_arg,
+		    pgno, cp_n->opd, &cp_n->opd)) != 0)
+			goto err;
+
+		/* Map the caller's operation onto one for the new OPD cursor. */
+		switch (flags) {
+		case DB_FIRST:
+		case DB_NEXT:
+		case DB_NEXT_NODUP:
+		case DB_SET:
+		case DB_SET_RECNO:
+		case DB_SET_RANGE:
+			tmp_flags = DB_FIRST;
+			break;
+		case DB_LAST:
+		case DB_PREV:
+		case DB_PREV_NODUP:
+			tmp_flags = DB_LAST;
+			break;
+		case DB_GET_BOTH:
+		case DB_GET_BOTHC:
+		case DB_GET_BOTH_RANGE:
+			tmp_flags = flags;
+			break;
+		default:
+			ret =
+			    __db_unknown_flag(dbp->dbenv, "__db_c_get", flags);
+			goto err;
+		}
+		if ((ret = cp_n->opd->c_am_get(
+		    cp_n->opd, key, data, tmp_flags, NULL)) != 0)
+			goto err;
+	}
+
+done:	/*
+	 * Return a key/data item. The only exception is that we don't return
+	 * a key if the user already gave us one, that is, if the DB_SET flag
+	 * was set. The DB_SET flag is necessary. In a Btree, the user's key
+	 * doesn't have to be the same as the key stored the tree, depending on
+	 * the magic performed by the comparison function. As we may not have
+	 * done any key-oriented operation here, the page reference may not be
+	 * valid. Fill it in as necessary. We don't have to worry about any
+	 * locks, the cursor must already be holding appropriate locks.
+	 *
+	 * XXX
+	 * If not a Btree and DB_SET_RANGE is set, we shouldn't return a key
+	 * either, should we?
+	 */
+	/* dbc_n is still NULL when we jumped here from the OPD-only path. */
+	cp_n = dbc_n == NULL ? dbc_arg->internal : dbc_n->internal;
+	if (!F_ISSET(key, DB_DBT_ISSET)) {
+		if (cp_n->page == NULL && (ret =
+		    mpf->get(mpf, &cp_n->pgno, 0, &cp_n->page)) != 0)
+			goto err;
+
+		if ((ret = __db_ret(dbp, cp_n->page, cp_n->indx,
+		    key, &dbc_arg->rkey->data, &dbc_arg->rkey->ulen)) != 0)
+			goto err;
+	}
+	if (multi != 0) {
+		/*
+		 * Even if fetching from the OPD cursor we need a duplicate
+		 * primary cursor if we are going after multiple keys.
+		 */
+		if (dbc_n == NULL) {
+			/*
+			 * Non-"_KEY" DB_MULTIPLE doesn't move the main cursor,
+			 * so it's safe to just use dbc_arg, unless dbc_arg
+			 * has an open OPD cursor whose state might need to
+			 * be preserved.
+			 */
+			if ((!(multi & DB_MULTIPLE_KEY) &&
+			    dbc_arg->internal->opd == NULL) ||
+			    F_ISSET(dbc_arg, DBC_TRANSIENT))
+				dbc_n = dbc_arg;
+			else {
+				if ((ret = __db_c_idup(dbc_arg,
+				    &dbc_n, DB_POSITIONI)) != 0)
+					goto err;
+				if ((ret = dbc_n->c_am_get(dbc_n,
+				    key, data, DB_CURRENT, &pgno)) != 0)
+					goto err;
+			}
+			cp_n = dbc_n->internal;
+		}
+
+		/*
+		 * If opd is set then we dupped the opd that we came in with.
+		 * When we return we may have a new opd if we went to another
+		 * key.
+		 */
+		if (opd != NULL) {
+			DB_ASSERT(cp_n->opd == NULL);
+			cp_n->opd = opd;
+			opd = NULL;
+		}
+
+		/*
+		 * Bulk get doesn't use __db_retcopy, so data.size won't
+		 * get set up unless there is an error. Assume success
+		 * here. This is the only call to c_am_bulk, and it avoids
+		 * setting it exactly the same everywhere. If we have an
+		 * ENOMEM error, it'll get overwritten with the needed value.
+		 */
+		data->size = data->ulen;
+		ret = dbc_n->c_am_bulk(dbc_n, data, flags | multi);
+	} else if (!F_ISSET(data, DB_DBT_ISSET)) {
+		/*
+		 * Pick the cursor that holds the data item: the dupped OPD
+		 * cursor, else the new cursor's OPD cursor, else the new
+		 * cursor itself. On P_LBTREE and P_HASH pages the data
+		 * item is O_INDX past the cursor's index.
+		 */
+		dbc = opd != NULL ? opd : cp_n->opd != NULL ? cp_n->opd : dbc_n;
+		type = TYPE(dbc->internal->page);
+		ret = __db_ret(dbp, dbc->internal->page, dbc->internal->indx +
+		    (type == P_LBTREE || type == P_HASH ? O_INDX : 0),
+		    data, &dbc_arg->rdata->data, &dbc_arg->rdata->ulen);
+	}
+
+err:	/* Don't pass DB_DBT_ISSET back to application level, error or no. */
+	F_CLR(key, DB_DBT_ISSET);
+	F_CLR(data, DB_DBT_ISSET);
+
+	/* Cleanup and cursor resolution. */
+	if (opd != NULL) {
+		if ((t_ret = __db_c_cleanup(
+		    dbc_arg->internal->opd, opd, ret)) != 0 && ret == 0)
+			ret = t_ret;
+
+	}
+
+	if ((t_ret = __db_c_cleanup(dbc_arg, dbc_n, ret)) != 0 && ret == 0)
+		ret = t_ret;
+
+	if (flags == DB_CONSUME || flags == DB_CONSUME_WAIT)
+		CDB_LOCKING_DONE(dbp, dbc_arg);
+	return (ret);
+}
+
+/*
+ * __db_c_put --
+ *	Put using a cursor.
+ *
+ * PUBLIC: int __db_c_put __P((DBC *, DBT *, DBT *, u_int32_t));
+ */
+int
+__db_c_put(dbc_arg, key, data, flags)
+	DBC *dbc_arg;
+	DBT *key, *data;
+	u_int32_t flags;
+{
+	DB *dbp, *sdbp;
+	DBC *dbc_n, *oldopd, *opd, *sdbc, *pdbc;
+	DBT olddata, oldpkey, oldskey, newdata, pkey, save_skey, skey, temp;
+	db_pgno_t pgno;
+	int cmp, have_oldrec, ispartial, nodel, re_pad, ret, rmw, t_ret;
+	u_int32_t re_len, size, tmp_flags;
+
+	/*
+	 * Cursor Cleanup Note:
+	 * All of the cursors passed to the underlying access methods by this
+	 * routine are duplicated cursors. On return, any referenced pages
+	 * will be discarded, and, if the cursor is not intended to be used
+	 * again, the close function will be called. So, pages/locks that
+	 * the cursor references do not need to be resolved by the underlying
+	 * functions.
+	 */
+	dbp = dbc_arg->dbp;
+	sdbp = NULL;
+	pdbc = dbc_n = NULL;
+	memset(&newdata, 0, sizeof(DBT));
+
+	PANIC_CHECK(dbp->dbenv);
+
+	/* Check for invalid flags. */
+	if ((ret = __db_cputchk(dbp,
+	    key, data, flags, IS_INITIALIZED(dbc_arg))) != 0)
+		return (ret);
+
+	/* Check for consistent transaction usage. */
+	if ((ret = __db_check_txn(dbp, dbc_arg->txn, dbc_arg->locker, 0)) != 0)
+		return (ret);
+
+	/*
+	 * Putting to secondary indices is forbidden; when we need
+	 * to internally update one, we'll call this with a private
+	 * synonym for DB_KEYLAST, DB_UPDATE_SECONDARY, which does
+	 * the right thing but won't return an error from cputchk().
+ */ + if (flags == DB_UPDATE_SECONDARY) + flags = DB_KEYLAST; + + DEBUG_LWRITE(dbc_arg, dbc_arg->txn, "db_c_put", + flags == DB_KEYFIRST || flags == DB_KEYLAST || + flags == DB_NODUPDATA ? key : NULL, data, flags); + + CDB_LOCKING_INIT(dbp, dbc_arg); + + /* + * Check to see if we are a primary and have secondary indices. + * If we are not, we save ourselves a good bit of trouble and + * just skip to the "normal" put. + */ + if (LIST_FIRST(&dbp->s_secondaries) == NULL) + goto skip_s_update; + + /* + * We have at least one secondary which we may need to update. + * + * There is a rather vile locking issue here. Secondary gets + * will always involve acquiring a read lock in the secondary, + * then acquiring a read lock in the primary. Ideally, we + * would likewise perform puts by updating all the secondaries + * first, then doing the actual put in the primary, to avoid + * deadlock (since having multiple threads doing secondary + * gets and puts simultaneously is probably a common case). + * + * However, if this put is a put-overwrite--and we have no way to + * tell in advance whether it will be--we may need to delete + * an outdated secondary key. In order to find that old + * secondary key, we need to get the record we're overwriting, + * before we overwrite it. + * + * (XXX: It would be nice to avoid this extra get, and have the + * underlying put routines somehow pass us the old record + * since they need to traverse the tree anyway. I'm saving + * this optimization for later, as it's a lot of work, and it + * would be hard to fit into this locking paradigm anyway.) + * + * The simple thing to do would be to go get the old record before + * we do anything else. Unfortunately, though, doing so would + * violate our "secondary, then primary" lock acquisition + * ordering--even in the common case where no old primary record + * exists, we'll still acquire and keep a lock on the page where + * we're about to do the primary insert. 
+ * + * To get around this, we do the following gyrations, which + * hopefully solve this problem in the common case: + * + * 1) If this is a c_put(DB_CURRENT), go ahead and get the + * old record. We already hold the lock on this page in + * the primary, so no harm done, and we'll need the primary + * key (which we weren't passed in this case) to do any + * secondary puts anyway. + * + * 2) If we're doing a partial put, we need to perform the + * get on the primary key right away, since we don't have + * the whole datum that the secondary key is based on. + * We may also need to pad out the record if the primary + * has a fixed record length. + * + * 3) Loop through the secondary indices, putting into each a + * new secondary key that corresponds to the new record. + * + * 4) If we haven't done so in (1) or (2), get the old primary + * key/data pair. If one does not exist--the common case--we're + * done with secondary indices, and can go straight on to the + * primary put. + * + * 5) If we do have an old primary key/data pair, however, we need + * to loop through all the secondaries a second time and delete + * the old secondary in each. + */ + memset(&pkey, 0, sizeof(DBT)); + memset(&olddata, 0, sizeof(DBT)); + have_oldrec = nodel = 0; + + /* + * Primary indices can't have duplicates, so only DB_CURRENT, + * DB_KEYFIRST, and DB_KEYLAST make any sense. Other flags + * should have been caught by the checking routine, but + * add a sprinkling of paranoia. + */ + DB_ASSERT(flags == DB_CURRENT || + flags == DB_KEYFIRST || flags == DB_KEYLAST); + + /* + * We'll want to use DB_RMW in a few places, but it's only legal + * when locking is on. + */ + rmw = STD_LOCKING(dbc_arg) ? DB_RMW : 0; + + if (flags == DB_CURRENT) { /* Step 1. */ + /* + * This is safe to do on the cursor we already have; + * error or no, it won't move. + * + * We use DB_RMW for all of these gets because we'll be + * writing soon enough in the "normal" put code. 
In + * transactional databases we'll hold those write locks + * even if we close the cursor we're reading with. + */ + ret = dbc_arg->c_get(dbc_arg, + &pkey, &olddata, rmw | DB_CURRENT); + if (ret == DB_KEYEMPTY) { + nodel = 1; /* + * We know we don't need a delete + * in the secondary. + */ + have_oldrec = 1; /* We've looked for the old record. */ + } else if (ret != 0) + goto err; + else + have_oldrec = 1; + + } else { + /* So we can just use &pkey everywhere instead of key. */ + pkey.data = key->data; + pkey.size = key->size; + } + + /* + * Check for partial puts (step 2). + */ + if (F_ISSET(data, DB_DBT_PARTIAL)) { + if (!have_oldrec && !nodel) { + /* + * We're going to have to search the tree for the + * specified key. Dup a cursor (so we have the same + * locking info) and do a c_get. + */ + if ((ret = __db_c_idup(dbc_arg, &pdbc, 0)) != 0) + goto err; + + /* We should have gotten DB_CURRENT in step 1. */ + DB_ASSERT(flags != DB_CURRENT); + + ret = pdbc->c_get(pdbc, + &pkey, &olddata, rmw | DB_SET); + if (ret == DB_KEYEMPTY || ret == DB_NOTFOUND) { + nodel = 1; + ret = 0; + } + if ((t_ret = pdbc->c_close(pdbc)) != 0) + ret = t_ret; + if (ret != 0) + goto err; + + have_oldrec = 1; + } + + /* + * Now build the new datum from olddata and the partial + * data we were given. + */ + if ((ret = + __db_buildpartial(dbp, &olddata, data, &newdata)) != 0) + goto err; + ispartial = 1; + } else + ispartial = 0; + + /* + * Handle fixed-length records. If the primary database has + * fixed-length records, we need to pad out the datum before + * we pass it into the callback function; we always index the + * "real" record. + */ + if ((dbp->type == DB_RECNO && F_ISSET(dbp, DB_AM_FIXEDLEN)) || + (dbp->type == DB_QUEUE)) { + if (dbp->type == DB_QUEUE) { + re_len = ((QUEUE *)dbp->q_internal)->re_len; + re_pad = ((QUEUE *)dbp->q_internal)->re_pad; + } else { + re_len = ((BTREE *)dbp->bt_internal)->re_len; + re_pad = ((BTREE *)dbp->bt_internal)->re_pad; + } + + size = ispartial ? 
newdata.size : data->size; + if (size > re_len) { + __db_err(dbp->dbenv, + "Length improper for fixed length record %lu", + (u_long)size); + ret = EINVAL; + goto err; + } else if (size < re_len) { + /* + * If we're not doing a partial put, copy + * data->data into newdata.data, then pad out + * newdata.data. + * + * If we're doing a partial put, the data + * we want are already in newdata.data; we + * just need to pad. + * + * Either way, realloc is safe. + */ + if ((ret = __os_realloc(dbp->dbenv, re_len, + &newdata.data)) != 0) + goto err; + if (!ispartial) + memcpy(newdata.data, data->data, size); + memset((u_int8_t *)newdata.data + size, re_pad, + re_len - size); + newdata.size = re_len; + ispartial = 1; + } + } + + /* + * Loop through the secondaries. (Step 3.) + * + * Note that __db_s_first and __db_s_next will take care of + * thread-locking and refcounting issues. + */ + for (sdbp = __db_s_first(dbp); + sdbp != NULL && ret == 0; ret = __db_s_next(&sdbp)) { + /* + * Call the callback for this secondary, to get the + * appropriate secondary key. + */ + memset(&skey, 0, sizeof(DBT)); + if ((ret = sdbp->s_callback(sdbp, + &pkey, ispartial ? &newdata : data, &skey)) != 0) { + if (ret == DB_DONOTINDEX) + /* + * The callback returned a null value--don't + * put this key in the secondary. Just + * move on to the next one--we'll handle + * any necessary deletes in step 5. + */ + continue; + else + goto err; + } + + /* + * Save the DBT we just got back from the callback function + * off; we want to pass its value into c_get functions + * that may stomp on a buffer the callback function + * allocated. + */ + memset(&save_skey, 0, sizeof(DBT)); /* Paranoia. */ + save_skey = skey; + + /* + * Open a cursor in this secondary. + * + * Use the same locker ID as our primary cursor, so that + * we're guaranteed that the locks don't conflict (e.g. in CDB + * or if we're subdatabases that share and want to lock a + * metadata page). 
+ */ + if ((ret = __db_icursor(sdbp, dbc_arg->txn, sdbp->type, + PGNO_INVALID, 0, dbc_arg->locker, &sdbc)) != 0) + goto err; + + /* + * If we're in CDB, updates will fail since the new cursor + * isn't a writer. However, we hold the WRITE lock in the + * primary and will for as long as our new cursor lasts, + * and the primary and secondary share a lock file ID, + * so it's safe to consider this a WRITER. The close + * routine won't try to put anything because we don't + * really have a lock. + */ + if (CDB_LOCKING(sdbp->dbenv)) { + DB_ASSERT(sdbc->mylock.off == LOCK_INVALID); + F_SET(sdbc, DBC_WRITER); + } + + /* + * There are three cases here-- + * 1) The secondary supports sorted duplicates. + * If we attempt to put a secondary/primary pair + * that already exists, that's a duplicate duplicate, + * and c_put will return DB_KEYEXIST (see __db_duperr). + * This will leave us with exactly one copy of the + * secondary/primary pair, and this is just right--we'll + * avoid deleting it later, as the old and new secondaries + * will match (since the old secondary is the dup dup + * that's already there). + * 2) The secondary supports duplicates, but they're not + * sorted. We need to avoid putting a duplicate + * duplicate, because the matching old and new secondaries + * will prevent us from deleting anything and we'll + * wind up with two secondary records that point to the + * same primary key. Do a c_get(DB_GET_BOTH); if + * that returns 0, skip the put. + * 3) The secondary doesn't support duplicates at all. + * In this case, secondary keys must be unique; if + * another primary key already exists for this + * secondary key, we have to either overwrite it or + * not put this one, and in either case we've + * corrupted the secondary index. Do a c_get(DB_SET). + * If the secondary/primary pair already exists, do + * nothing; if the secondary exists with a different + * primary, return an error; and if the secondary + * does not exist, put it. 
+ */ + if (!F_ISSET(sdbp, DB_AM_DUP)) { + /* Case 3. */ + memset(&oldpkey, 0, sizeof(DBT)); + F_SET(&oldpkey, DB_DBT_MALLOC); + ret = sdbc->c_real_get(sdbc, + &skey, &oldpkey, rmw | DB_SET); + if (ret == 0) { + cmp = __bam_defcmp(sdbp, &oldpkey, &pkey); + __os_ufree(sdbp->dbenv, oldpkey.data); + if (cmp != 0) { + __db_err(sdbp->dbenv, "%s%s", + "Put results in a non-unique secondary key in an ", + "index not configured to support duplicates"); + ret = EINVAL; + goto skipput; + } + } else if (ret != DB_NOTFOUND && ret != DB_KEYEMPTY) + goto skipput; + } else if (!F_ISSET(sdbp, DB_AM_DUPSORT)) + /* Case 2. */ + if ((ret = sdbc->c_real_get(sdbc, + &skey, &pkey, rmw | DB_GET_BOTH)) == 0) + goto skipput; + + ret = sdbc->c_put(sdbc, &skey, &pkey, DB_UPDATE_SECONDARY); + + /* + * We don't know yet whether this was a put-overwrite that + * in fact changed nothing. If it was, we may get DB_KEYEXIST. + * This is not an error. + */ + if (ret == DB_KEYEXIST) + ret = 0; + +skipput: FREE_IF_NEEDED(sdbp, &save_skey) + + if ((t_ret = sdbc->c_close(sdbc)) != 0) + ret = t_ret; + + if (ret != 0) + goto err; + } + if (ret != 0) + goto err; + + /* If still necessary, go get the old primary key/data. (Step 4.) */ + if (!have_oldrec) { + /* See the comments in step 2. This is real familiar. */ + if ((ret = __db_c_idup(dbc_arg, &pdbc, 0)) != 0) + goto err; + DB_ASSERT(flags != DB_CURRENT); + pkey.data = key->data; + pkey.size = key->size; + ret = pdbc->c_get(pdbc, &pkey, &olddata, rmw | DB_SET); + if (ret == DB_KEYEMPTY || ret == DB_NOTFOUND) { + nodel = 1; + ret = 0; + } + if ((t_ret = pdbc->c_close(pdbc)) != 0) + ret = t_ret; + if (ret != 0) + goto err; + have_oldrec = 1; + } + + /* + * If we don't follow this goto, we do in fact have an old record + * we may need to go delete. (Step 5). 
+ */ + if (nodel) + goto skip_s_update; + + for (sdbp = __db_s_first(dbp); + sdbp != NULL && ret == 0; ret = __db_s_next(&sdbp)) { + /* + * Call the callback for this secondary to get the + * old secondary key. + */ + memset(&oldskey, 0, sizeof(DBT)); + if ((ret = sdbp->s_callback(sdbp, + &pkey, &olddata, &oldskey)) != 0) { + if (ret == DB_DONOTINDEX) + /* + * The callback returned a null value--there's + * nothing to delete. Go on to the next + * secondary. + */ + continue; + else + goto err; + } + if ((ret = sdbp->s_callback(sdbp, + &pkey, ispartial ? &newdata : data, &skey)) != 0 && + ret != DB_DONOTINDEX) + goto err; + + /* + * If there is no new secondary key, or if the old secondary + * key is different from the new secondary key, then + * we need to delete the old one. + * + * Note that bt_compare is (and must be) set no matter + * what access method we're in. + */ + sdbc = NULL; + if (ret == DB_DONOTINDEX || + ((BTREE *)sdbp->bt_internal)->bt_compare(sdbp, + &oldskey, &skey) != 0) { + if ((ret = __db_icursor(sdbp, dbc_arg->txn, sdbp->type, + PGNO_INVALID, 0, dbc_arg->locker, &sdbc)) != 0) + goto err; + if (CDB_LOCKING(sdbp->dbenv)) { + DB_ASSERT(sdbc->mylock.off == LOCK_INVALID); + F_SET(sdbc, DBC_WRITER); + } + + /* + * Don't let c_get(DB_GET_BOTH) stomp on + * any secondary key value that the callback + * function may have allocated. Use a temp + * DBT instead. + */ + memset(&temp, 0, sizeof(DBT)); + temp.data = oldskey.data; + temp.size = oldskey.size; + if ((ret = sdbc->c_real_get(sdbc, + &temp, &pkey, rmw | DB_GET_BOTH)) == 0) + ret = sdbc->c_del(sdbc, DB_UPDATE_SECONDARY); + } + + FREE_IF_NEEDED(sdbp, &skey); + FREE_IF_NEEDED(sdbp, &oldskey); + if (sdbc != NULL && (t_ret = sdbc->c_close(sdbc)) != 0) + ret = t_ret; + if (ret != 0) + goto err; + } + + /* Secondary index updates are now done. On to the "real" stuff. */ + +skip_s_update: + /* + * If we have an off-page duplicates cursor, and the operation applies + * to it, perform the operation. 
Duplicate the cursor and call the + * underlying function. + * + * Off-page duplicate trees are locked in the primary tree, that is, + * we acquire a write lock in the primary tree and no locks in the + * off-page dup tree. If the put operation is done in an off-page + * duplicate tree, call the primary cursor's upgrade routine first. + */ + if (dbc_arg->internal->opd != NULL && + (flags == DB_AFTER || flags == DB_BEFORE || flags == DB_CURRENT)) { + /* + * A special case for hash off-page duplicates. Hash doesn't + * support (and is documented not to support) put operations + * relative to a cursor which references an already deleted + * item. For consistency, apply the same criteria to off-page + * duplicates as well. + */ + if (dbc_arg->dbtype == DB_HASH && F_ISSET( + ((BTREE_CURSOR *)(dbc_arg->internal->opd->internal)), + C_DELETED)) { + ret = DB_NOTFOUND; + goto err; + } + + if ((ret = dbc_arg->c_am_writelock(dbc_arg)) != 0) + return (ret); + if ((ret = __db_c_dup(dbc_arg, &dbc_n, DB_POSITIONI)) != 0) + goto err; + opd = dbc_n->internal->opd; + if ((ret = opd->c_am_put( + opd, key, data, flags, NULL)) != 0) + goto err; + goto done; + } + + /* + * Perform an operation on the main cursor. Duplicate the cursor, + * and call the underlying function. + * + * XXX: MARGO + * + tmp_flags = flags == DB_AFTER || + flags == DB_BEFORE || flags == DB_CURRENT ? DB_POSITIONI : 0; + */ + tmp_flags = DB_POSITIONI; + + /* + * If this cursor is going to be closed immediately, we don't + * need to take precautions to clean it up on error. + */ + if (F_ISSET(dbc_arg, DBC_TRANSIENT)) + dbc_n = dbc_arg; + else if ((ret = __db_c_idup(dbc_arg, &dbc_n, tmp_flags)) != 0) + goto err; + + pgno = PGNO_INVALID; + if ((ret = dbc_n->c_am_put(dbc_n, key, data, flags, &pgno)) != 0) + goto err; + + /* + * We may be referencing a new off-page duplicates tree. Acquire + * a new cursor and call the underlying function. 
+ */ + if (pgno != PGNO_INVALID) { + oldopd = dbc_n->internal->opd; + if ((ret = __db_c_newopd(dbc_arg, pgno, oldopd, &opd)) != 0) { + dbc_n->internal->opd = opd; + goto err; + } + + dbc_n->internal->opd = opd; + + if ((ret = opd->c_am_put( + opd, key, data, flags, NULL)) != 0) + goto err; + } + +done: +err: /* Cleanup and cursor resolution. */ + if ((t_ret = __db_c_cleanup(dbc_arg, dbc_n, ret)) != 0 && ret == 0) + ret = t_ret; + + /* If newdata was used, free its buffer. */ + if (newdata.data != NULL) + __os_free(dbp->dbenv, newdata.data); + + CDB_LOCKING_DONE(dbp, dbc_arg); + + if (sdbp != NULL && (t_ret = __db_s_done(sdbp)) != 0) + return (t_ret); + + return (ret); +} + +/* + * __db_duperr() + * Error message: we don't currently support sorted duplicate duplicates. + * PUBLIC: int __db_duperr __P((DB *, u_int32_t)); + */ +int +__db_duperr(dbp, flags) + DB *dbp; + u_int32_t flags; +{ + + /* + * If we run into this error while updating a secondary index, + * don't yell--there's no clean way to pass DB_NODUPDATA in along + * with DB_UPDATE_SECONDARY, but we may run into this problem + * in a normal, non-error course of events. + * + * !!! + * If and when we ever permit duplicate duplicates in sorted-dup + * databases, we need to either change the secondary index code + * to check for dup dups, or we need to maintain the implicit + * "DB_NODUPDATA" behavior for databases with DB_AM_SECONDARY set. + */ + if (flags != DB_NODUPDATA && !F_ISSET(dbp, DB_AM_SECONDARY)) + __db_err(dbp->dbenv, + "Duplicate data items are not supported with sorted data"); + return (DB_KEYEXIST); +} + +/* + * __db_c_cleanup -- + * Clean up duplicate cursors. + */ +static int +__db_c_cleanup(dbc, dbc_n, failed) + DBC *dbc, *dbc_n; + int failed; +{ + DB *dbp; + DBC *opd; + DBC_INTERNAL *internal; + DB_MPOOLFILE *mpf; + int ret, t_ret; + + dbp = dbc->dbp; + mpf = dbp->mpf; + internal = dbc->internal; + ret = 0; + + /* Discard any pages we're holding. 
*/ + if (internal->page != NULL) { + if ((t_ret = mpf->put(mpf, internal->page, 0)) != 0 && ret == 0) + ret = t_ret; + internal->page = NULL; + } + opd = internal->opd; + if (opd != NULL && opd->internal->page != NULL) { + if ((t_ret = + mpf->put(mpf, opd->internal->page, 0)) != 0 && ret == 0) + ret = t_ret; + opd->internal->page = NULL; + } + + /* + * If dbc_n is NULL, there's no internal cursor swapping to be done + * and no dbc_n to close--we probably did the entire operation on an + * offpage duplicate cursor. Just return. + * + * If dbc and dbc_n are the same, we're either inside a DB->{put/get} + * operation, and as an optimization we performed the operation on + * the main cursor rather than on a duplicated one, or we're in a + * bulk get that can't have moved the cursor (DB_MULTIPLE with the + * initial c_get operation on an off-page dup cursor). Just + * return--either we know we didn't move the cursor, or we're going + * to close it before we return to application code, so we're sure + * not to visibly violate the "cursor stays put on error" rule. + */ + if (dbc_n == NULL || dbc == dbc_n) + return (ret); + + if (dbc_n->internal->page != NULL) { + if ((t_ret = + mpf->put(mpf, dbc_n->internal->page, 0)) != 0 && ret == 0) + ret = t_ret; + dbc_n->internal->page = NULL; + } + opd = dbc_n->internal->opd; + if (opd != NULL && opd->internal->page != NULL) { + if ((t_ret = + mpf->put(mpf, opd->internal->page, 0)) != 0 && ret == 0) + ret = t_ret; + opd->internal->page = NULL; + } + + /* + * If we didn't fail before entering this routine or just now when + * freeing pages, swap the interesting contents of the old and new + * cursors. + */ + if (!failed && ret == 0) { + dbc->internal = dbc_n->internal; + dbc_n->internal = internal; + } + + /* + * Close the cursor we don't care about anymore. The close can fail, + * but we only expect DB_LOCK_DEADLOCK failures. 
This violates our + * "the cursor is unchanged on error" semantics, but since all you can + * do with a DB_LOCK_DEADLOCK failure is close the cursor, I believe + * that's OK. + * + * XXX + * There's no way to recover from failure to close the old cursor. + * All we can do is move to the new position and return an error. + * + * XXX + * We might want to consider adding a flag to the cursor, so that any + * subsequent operations other than close just return an error? + */ + if ((t_ret = dbc_n->c_close(dbc_n)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __db_c_secondary_get -- + * This wrapper function for DBC->c_pget() is the DBC->c_get() function + * for a secondary index cursor. + * + * PUBLIC: int __db_c_secondary_get __P((DBC *, DBT *, DBT *, u_int32_t)); + */ +int +__db_c_secondary_get(dbc, skey, data, flags) + DBC *dbc; + DBT *skey, *data; + u_int32_t flags; +{ + + DB_ASSERT(F_ISSET(dbc->dbp, DB_AM_SECONDARY)); + return (dbc->c_pget(dbc, skey, NULL, data, flags)); +} + +/* + * __db_c_pget -- + * Get a primary key/data pair through a secondary index. + * + * PUBLIC: int __db_c_pget __P((DBC *, DBT *, DBT *, DBT *, u_int32_t)); + */ +int +__db_c_pget(dbc, skey, pkey, data, flags) + DBC *dbc; + DBT *skey, *pkey, *data; + u_int32_t flags; +{ + DB *pdbp, *sdbp; + DBC *pdbc; + DBT *save_rdata, nullpkey; + int pkeymalloc, ret, save_pkey_flags, t_ret; + + sdbp = dbc->dbp; + pdbp = sdbp->s_primary; + pkeymalloc = t_ret = 0; + + PANIC_CHECK(sdbp->dbenv); + if ((ret = __db_cpgetchk(sdbp, + skey, pkey, data, flags, IS_INITIALIZED(dbc))) != 0) + return (ret); + + /* + * The challenging part of this function is getting the behavior + * right for all the various permutations of DBT flags. The + * next several blocks handle the various cases we need to + * deal with specially. + */ + + /* + * We may be called with a NULL pkey argument, if we've been + * wrapped by a 2-DBT get call. If so, we need to use our + * own DBT. 
+ */ + if (pkey == NULL) { + memset(&nullpkey, 0, sizeof(DBT)); + pkey = &nullpkey; + } + + /* + * DB_GET_RECNO is a special case, because we're interested not in + * the primary key/data pair, but rather in the primary's record + * number. + */ + if ((flags & DB_OPFLAGS_MASK) == DB_GET_RECNO) + return (__db_c_pget_recno(dbc, pkey, data, flags)); + + /* + * If the DBTs we've been passed don't have any of the + * user-specified memory management flags set, we want to make sure + * we return values using the DBTs dbc->rskey, dbc->rkey, and + * dbc->rdata, respectively. + * + * There are two tricky aspects to this: first, we need to pass + * skey and pkey *in* to the initial c_get on the secondary key, + * since either or both may be looked at by it (depending on the + * get flag). Second, we must not use a normal DB->get call + * on the secondary, even though that's what we want to accomplish, + * because the DB handle may be free-threaded. Instead, + * we open a cursor, then take steps to ensure that we actually use + * the rkey/rdata from the *secondary* cursor. + * + * We accomplish all this by passing in the DBTs we started out + * with to the c_get, but having swapped the contents of rskey and + * rkey, respectively, into rkey and rdata; __db_ret will treat + * them like the normal key/data pair in a c_get call, and will + * realloc them as need be (this is "step 1"). Then, for "step 2", + * we swap back rskey/rkey/rdata to normal, and do a get on the primary + * with the secondary dbc appointed as the owner of the returned-data + * memory. + * + * Note that in step 2, we copy the flags field in case we need to + * pass down a DB_DBT_PARTIAL or other flag that is compatible with + * letting DB do the memory management. + */ + /* Step 1. */ + save_rdata = dbc->rdata; + dbc->rdata = dbc->rkey; + dbc->rkey = dbc->rskey; + + /* + * It is correct, though slightly sick, to attempt a partial get + * of a primary key. 
However, if we do so here, we'll never find the + * primary record; clear the DB_DBT_PARTIAL field of pkey just + * for the duration of the next call. + */ + save_pkey_flags = pkey->flags; + F_CLR(pkey, DB_DBT_PARTIAL); + + /* + * Now we can go ahead with the meat of this call. First, get the + * primary key from the secondary index. (What exactly we get depends + * on the flags, but the underlying cursor get will take care of the + * dirty work.) + */ + if ((ret = dbc->c_real_get(dbc, skey, pkey, flags)) != 0) { + /* Restore rskey/rkey/rdata and return. */ + pkey->flags = save_pkey_flags; + dbc->rskey = dbc->rkey; + dbc->rkey = dbc->rdata; + dbc->rdata = save_rdata; + goto err; + } + + /* Restore pkey's flags in case we stomped the PARTIAL flag. */ + pkey->flags = save_pkey_flags; + + /* + * Restore the cursor's rskey, rkey, and rdata DBTs. If DB + * is handling the memory management, we now have newly + * reallocated buffers and ulens in rkey and rdata which we want + * to put in rskey and rkey. save_rdata contains the old value + * of dbc->rdata. + */ + dbc->rskey = dbc->rkey; + dbc->rkey = dbc->rdata; + dbc->rdata = save_rdata; + + /* + * Now we're ready for "step 2". If either or both of pkey and + * data do not have memory management flags set--that is, if DB is + * managing their memory--we need to swap around the rkey/rdata + * structures so that we don't wind up trying to use memory managed + * by the primary database cursor, which we'll close before we return. + * + * !!! + * If you're carefully following the bouncing ball, you'll note + * that in the DB-managed case, the buffer hanging off of pkey is + * the same as dbc->rkey->data. This is just fine; we may well + * realloc and stomp on it when we return, if we're going a + * DB_GET_BOTH and need to return a different partial or key + * (depending on the comparison function), but this is safe. + * + * !!! 
+ * We need to use __db_icursor here rather than simply calling + * pdbp->cursor, because otherwise, if we're in CDB, we'll + * allocate a new locker ID and leave ourselves open to deadlocks. + * (Even though we're only acquiring read locks, we'll still block + * if there are any waiters.) + */ + if ((ret = __db_icursor(pdbp, + dbc->txn, pdbp->type, PGNO_INVALID, 0, dbc->locker, &pdbc)) != 0) + goto err; + + /* + * We're about to use pkey a second time. If DB_DBT_MALLOC + * is set on it, we'll leak the memory we allocated the first time. + * Thus, set DB_DBT_REALLOC instead so that we reuse that memory + * instead of leaking it. + * + * !!! + * This assumes that the user must always specify a compatible + * realloc function if a malloc function is specified. I think + * this is a reasonable requirement. + */ + if (F_ISSET(pkey, DB_DBT_MALLOC)) { + F_CLR(pkey, DB_DBT_MALLOC); + F_SET(pkey, DB_DBT_REALLOC); + pkeymalloc = 1; + } + + /* + * Do the actual get. Set DBC_TRANSIENT since we don't care + * about preserving the position on error, and it's faster. + * SET_RET_MEM so that the secondary DBC owns any returned-data + * memory. + */ + F_SET(pdbc, DBC_TRANSIENT); + SET_RET_MEM(pdbc, dbc); + ret = pdbc->c_get(pdbc, pkey, data, DB_SET); + + /* + * If the item wasn't found in the primary, this is a bug; + * our secondary has somehow gotten corrupted, and contains + * elements that don't correspond to anything in the primary. + * Complain. + */ + if (ret == DB_NOTFOUND) + ret = __db_secondary_corrupt(pdbp); + + /* Now close the primary cursor. */ + t_ret = pdbc->c_close(pdbc); + +err: if (pkeymalloc) { + /* + * If pkey had a MALLOC flag, we need to restore it; + * otherwise, if the user frees the buffer but reuses + * the DBT without NULL'ing its data field or changing + * the flags, we may drop core. + */ + F_CLR(pkey, DB_DBT_REALLOC); + F_SET(pkey, DB_DBT_MALLOC); + } + return (t_ret == 0 ? 
ret : t_ret); +} + +/* + * __db_c_pget_recno -- + * Perform a DB_GET_RECNO c_pget on a secondary index. Returns + * the secondary's record number in the pkey field and the primary's + * in the data field. + */ +static int +__db_c_pget_recno(sdbc, pkey, data, flags) + DBC *sdbc; + DBT *pkey, *data; + u_int32_t flags; +{ + DB *pdbp, *sdbp; + DB_ENV *dbenv; + DBC *pdbc; + DBT discardme, primary_key; + db_recno_t oob; + u_int32_t rmw; + int ret, t_ret; + + sdbp = sdbc->dbp; + pdbp = sdbp->s_primary; + dbenv = sdbp->dbenv; + pdbc = NULL; + ret = t_ret = 0; + + rmw = LF_ISSET(DB_RMW); + + memset(&discardme, 0, sizeof(DBT)); + F_SET(&discardme, DB_DBT_USERMEM | DB_DBT_PARTIAL); + + oob = RECNO_OOB; + + /* + * If the primary is an rbtree, we want its record number, whether + * or not the secondary is one too. Fetch the recno into "data". + * + * If it's not an rbtree, return RECNO_OOB in "data". + */ + if (F_ISSET(pdbp, DB_AM_RECNUM)) { + /* + * Get the primary key, so we can find the record number + * in the primary. (We're uninterested in the secondary key.) + */ + memset(&primary_key, 0, sizeof(DBT)); + F_SET(&primary_key, DB_DBT_MALLOC); + if ((ret = sdbc->c_real_get(sdbc, + &discardme, &primary_key, rmw | DB_CURRENT)) != 0) + return (ret); + + /* + * Open a cursor on the primary, set it to the right record, + * and fetch its recno into "data". + * + * (See __db_c_pget for a comment on the use of __db_icursor.) + * + * SET_RET_MEM so that the secondary DBC owns any returned-data + * memory. 
+ */ + if ((ret = __db_icursor(pdbp, sdbc->txn, + pdbp->type, PGNO_INVALID, 0, sdbc->locker, &pdbc)) != 0) + goto perr; + SET_RET_MEM(pdbc, sdbc); + if ((ret = pdbc->c_get(pdbc, + &primary_key, &discardme, rmw | DB_SET)) != 0) + goto perr; + + ret = pdbc->c_get(pdbc, &discardme, data, rmw | DB_GET_RECNO); + +perr: __os_ufree(sdbp->dbenv, primary_key.data); + if (pdbc != NULL && + (t_ret = pdbc->c_close(pdbc)) != 0 && ret == 0) + ret = t_ret; + if (ret != 0) + return (ret); + } else if ((ret = __db_retcopy(dbenv, data, &oob, + sizeof(oob), &sdbc->rkey->data, &sdbc->rkey->ulen)) != 0) + return (ret); + + /* + * If the secondary is an rbtree, we want its record number, whether + * or not the primary is one too. Fetch the recno into "pkey". + * + * If it's not an rbtree, return RECNO_OOB in "pkey". + */ + if (F_ISSET(sdbp, DB_AM_RECNUM)) + return (sdbc->c_real_get(sdbc, &discardme, pkey, flags)); + else + return (__db_retcopy(dbenv, pkey, &oob, + sizeof(oob), &sdbc->rdata->data, &sdbc->rdata->ulen)); +} + +/* + * __db_wrlock_err -- do not have a write lock. + */ +static int +__db_wrlock_err(dbenv) + DB_ENV *dbenv; +{ + __db_err(dbenv, "Write attempted on read-only cursor"); + return (EPERM); +} + +/* + * __db_c_del_secondary -- + * Perform a delete operation on a secondary index: call through + * to the primary and delete the primary record that this record + * points to. + * + * Note that deleting the primary record will call c_del on all + * the secondaries, including this one; thus, it is not necessary + * to execute both this function and an actual delete. + * + */ +static int +__db_c_del_secondary(dbc) + DBC *dbc; +{ + DB *pdbp; + DBC *pdbc; + DBT skey, pkey; + int ret, t_ret; + + memset(&skey, 0, sizeof(DBT)); + memset(&pkey, 0, sizeof(DBT)); + + /* + * Get the current item that we're pointing at. + * We don't actually care about the secondary key, just + * the primary. 
+ */ + F_SET(&skey, DB_DBT_PARTIAL | DB_DBT_USERMEM); + if ((ret = dbc->c_real_get(dbc, + &skey, &pkey, DB_CURRENT)) != 0) + return (ret); + + /* + * Create a cursor on the primary with our locker ID, + * so that when it calls back, we don't conflict. + * + * We create a cursor explicitly because there's no + * way to specify the same locker ID if we're using + * locking but not transactions if we use the DB->del + * interface. This shouldn't be any less efficient + * anyway. + */ + pdbp = dbc->dbp->s_primary; + if ((ret = __db_icursor(pdbp, dbc->txn, + pdbp->type, PGNO_INVALID, 0, dbc->locker, &pdbc)) != 0) + return (ret); + + /* + * See comment in __db_c_put--if we're in CDB, + * we already hold the locks we need, and we need to flag + * the cursor as a WRITER so we don't run into errors + * when we try to delete. + */ + if (CDB_LOCKING(pdbp->dbenv)) { + DB_ASSERT(pdbc->mylock.off == LOCK_INVALID); + F_SET(pdbc, DBC_WRITER); + } + + /* + * Set the new cursor to the correct primary key. Then + * delete it. We don't really care about the datum; + * just reuse our skey DBT. + * + * If the primary get returns DB_NOTFOUND, something is amiss-- + * every record in the secondary should correspond to some record + * in the primary. + */ + if ((ret = pdbc->c_get(pdbc, &pkey, &skey, + (STD_LOCKING(dbc) ? DB_RMW : 0) | DB_SET)) == 0) + ret = pdbc->c_del(pdbc, 0); + else if (ret == DB_NOTFOUND) + ret = __db_secondary_corrupt(pdbp); + + if ((t_ret = pdbc->c_close(pdbc)) != 0 && ret != 0) + ret = t_ret; + + return (ret); +} + +/* + * __db_c_del_primary -- + * Perform a delete operation on a primary index. Loop through + * all the secondary indices which correspond to this primary + * database, and delete any secondary keys that point at the current + * record. 
+ * + * PUBLIC: int __db_c_del_primary __P((DBC *)); + */ +int +__db_c_del_primary(dbc) + DBC *dbc; +{ + DB *dbp, *sdbp; + DBC *sdbc; + DBT data, pkey, skey, temp; + int ret, t_ret; + + dbp = dbc->dbp; + + /* + * If we're called at all, we have at least one secondary. + * (Unfortunately, we can't assert this without grabbing the mutex.) + * Get the current record so that we can construct appropriate + * secondary keys as needed. + */ + memset(&pkey, 0, sizeof(DBT)); + memset(&data, 0, sizeof(DBT)); + if ((ret = dbc->c_get(dbc, &pkey, &data, DB_CURRENT)) != 0) + return (ret); + + for (sdbp = __db_s_first(dbp); + sdbp != NULL && ret == 0; ret = __db_s_next(&sdbp)) { + /* + * Get the secondary key for this secondary and the current + * item. + */ + memset(&skey, 0, sizeof(DBT)); + if ((ret = sdbp->s_callback(sdbp, &pkey, &data, &skey)) != 0) { + /* + * If the current item isn't in this index, we + * have no work to do. Proceed. + */ + if (ret == DB_DONOTINDEX) + continue; + + /* We had a substantive error. Bail. */ + FREE_IF_NEEDED(sdbp, &skey); + goto done; + } + + /* Open a secondary cursor. */ + if ((ret = __db_icursor(sdbp, dbc->txn, sdbp->type, + PGNO_INVALID, 0, dbc->locker, &sdbc)) != 0) + goto done; + /* See comment above and in __db_c_put. */ + if (CDB_LOCKING(sdbp->dbenv)) { + DB_ASSERT(sdbc->mylock.off == LOCK_INVALID); + F_SET(sdbc, DBC_WRITER); + } + + /* + * Set the secondary cursor to the appropriate item. + * Delete it. + * + * We want to use DB_RMW if locking is on; it's only + * legal then, though. + * + * !!! + * Don't stomp on any callback-allocated buffer in skey + * when we do a c_get(DB_GET_BOTH); use a temp DBT instead. + */ + memset(&temp, 0, sizeof(DBT)); + temp.data = skey.data; + temp.size = skey.size; + if ((ret = sdbc->c_real_get(sdbc, &temp, &pkey, + (STD_LOCKING(dbc) ? 
DB_RMW : 0) | DB_GET_BOTH)) == 0) + ret = sdbc->c_del(sdbc, DB_UPDATE_SECONDARY); + + FREE_IF_NEEDED(sdbp, &skey); + + if ((t_ret = sdbc->c_close(sdbc)) != 0 || ret != 0) { + if (ret == 0) + ret = t_ret; + goto done; + } + } + +done: if (sdbp != NULL && (t_ret = __db_s_done(sdbp)) != 0 && ret == 0) + return (t_ret); + return (ret); +} + +/* + * __db_s_first -- + * Get the first secondary, if any are present, from the primary. + * + * PUBLIC: DB *__db_s_first __P((DB *)); + */ +DB * +__db_s_first(pdbp) + DB *pdbp; +{ + DB *sdbp; + + MUTEX_THREAD_LOCK(pdbp->dbenv, pdbp->mutexp); + sdbp = LIST_FIRST(&pdbp->s_secondaries); + + /* See __db_s_next. */ + if (sdbp != NULL) + sdbp->s_refcnt++; + MUTEX_THREAD_UNLOCK(pdbp->dbenv, pdbp->mutexp); + + return (sdbp); +} + +/* + * __db_s_next -- + * Get the next secondary in the list. + * + * PUBLIC: int __db_s_next __P((DB **)); + */ +int +__db_s_next(sdbpp) + DB **sdbpp; +{ + DB *sdbp, *pdbp, *closeme; + int ret; + + /* + * Secondary indices are kept in a linked list, s_secondaries, + * off each primary DB handle. If a primary is free-threaded, + * this list may only be traversed or modified while the primary's + * thread mutex is held. + * + * The tricky part is that we don't want to hold the thread mutex + * across the full set of secondary puts necessary for each primary + * put, or we'll wind up essentially single-threading all the puts + * to the handle; the secondary puts will each take about as + * long as the primary does, and may require I/O. So we instead + * hold the thread mutex only long enough to follow one link to the + * next secondary, and then we release it before performing the + * actual secondary put. + * + * The only danger here is that we might legitimately close a + * secondary index in one thread while another thread is performing + * a put and trying to update that same secondary index. To + * prevent this from happening, we refcount the secondary handles. 
+ * If close is called on a secondary index handle while we're putting + * to it, it won't really be closed--the refcount will simply drop, + * and we'll be responsible for closing it here. + */ + sdbp = *sdbpp; + pdbp = sdbp->s_primary; + closeme = NULL; + + MUTEX_THREAD_LOCK(pdbp->dbenv, pdbp->mutexp); + DB_ASSERT(sdbp->s_refcnt != 0); + if (--sdbp->s_refcnt == 0) { + LIST_REMOVE(sdbp, s_links); + closeme = sdbp; + } + sdbp = LIST_NEXT(sdbp, s_links); + if (sdbp != NULL) + sdbp->s_refcnt++; + MUTEX_THREAD_UNLOCK(pdbp->dbenv, pdbp->mutexp); + + *sdbpp = sdbp; + + /* + * closeme->close() is a wrapper; call __db_close explicitly. + */ + ret = closeme != NULL ? __db_close(closeme, 0) : 0; + return (ret); +} + +/* + * __db_s_done -- + * Properly decrement the refcount on a secondary database handle we're + * using, without calling __db_s_next. + * + * PUBLIC: int __db_s_done __P((DB *)); + */ +int +__db_s_done(sdbp) + DB *sdbp; +{ + DB *pdbp; + int doclose; + + pdbp = sdbp->s_primary; + doclose = 0; + + MUTEX_THREAD_LOCK(pdbp->dbenv, pdbp->mutexp); + DB_ASSERT(sdbp->s_refcnt != 0); + if (--sdbp->s_refcnt == 0) { + LIST_REMOVE(sdbp, s_links); + doclose = 1; + } + MUTEX_THREAD_UNLOCK(pdbp->dbenv, pdbp->mutexp); + + return (doclose ? __db_close(sdbp, 0) : 0); +} + +/* + * __db_buildpartial -- + * Build the record that will result after a partial put is applied to + * an existing record. + * + * This should probably be merged with __bam_build, but that requires + * a little trickery if we plan to keep the overflow-record optimization + * in that function. 
+ */ +static int +__db_buildpartial(dbp, oldrec, partial, newrec) + DB *dbp; + DBT *oldrec, *partial, *newrec; +{ + int ret; + u_int8_t *buf; + u_int32_t len, nbytes; + + DB_ASSERT(F_ISSET(partial, DB_DBT_PARTIAL)); + + memset(newrec, 0, sizeof(DBT)); + + nbytes = __db_partsize(oldrec->size, partial); + newrec->size = nbytes; + + if ((ret = __os_malloc(dbp->dbenv, nbytes, &buf)) != 0) + return (ret); + newrec->data = buf; + + /* Nul or pad out the buffer, for any part that isn't specified. */ + memset(buf, + F_ISSET(dbp, DB_AM_FIXEDLEN) ? ((BTREE *)dbp->bt_internal)->re_pad : + 0, nbytes); + + /* Copy in any leading data from the original record. */ + memcpy(buf, oldrec->data, + partial->doff > oldrec->size ? oldrec->size : partial->doff); + + /* Copy the data from partial. */ + memcpy(buf + partial->doff, partial->data, partial->size); + + /* Copy any trailing data from the original record. */ + len = partial->doff + partial->dlen; + if (oldrec->size > len) + memcpy(buf + partial->doff + partial->size, + (u_int8_t *)oldrec->data + len, oldrec->size - len); + + return (0); +} + +/* + * __db_partsize -- + * Given the number of bytes in an existing record and a DBT that + * is about to be partial-put, calculate the size of the record + * after the put. + * + * This code is called from __bam_partsize. + * + * PUBLIC: u_int32_t __db_partsize __P((u_int32_t, DBT *)); + */ +u_int32_t +__db_partsize(nbytes, data) + u_int32_t nbytes; + DBT *data; +{ + + /* + * There are really two cases here: + * + * Case 1: We are replacing some bytes that do not exist (i.e., they + * are past the end of the record). In this case the number of bytes + * we are replacing is irrelevant and all we care about is how many + * bytes we are going to add from offset. So, the new record length + * is going to be the size of the new bytes (size) plus wherever those + * new bytes begin (doff). + * + * Case 2: All the bytes we are replacing exist. 
Therefore, the new + * size is the oldsize (nbytes) minus the bytes we are replacing (dlen) + * plus the bytes we are adding (size). + */ + if (nbytes < data->doff + data->dlen) /* Case 1 */ + return (data->doff + data->size); + + return (nbytes + data->size - data->dlen); /* Case 2 */ +} |