Diffstat (limited to 'src/hash/hash.c')
-rw-r--r--	src/hash/hash.c	2340
1 file changed, 2340 insertions, 0 deletions
diff --git a/src/hash/hash.c b/src/hash/hash.c
new file mode 100644
index 00000000..ae5736e7
--- /dev/null
+++ b/src/hash/hash.c
@@ -0,0 +1,2340 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994
+ * Margo Seltzer. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Margo Seltzer.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/mp.h"
+#include "dbinc/partition.h"
+
+static int __ham_bulk __P((DBC *, DBT *, u_int32_t));
+static int __hamc_close __P((DBC *, db_pgno_t, int *));
+static int __hamc_del __P((DBC *, u_int32_t));
+static int __hamc_destroy __P((DBC *));
+static int __hamc_get __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+static int __hamc_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *));
+static int __hamc_writelock __P((DBC *));
+static int __ham_dup_return __P((DBC *, DBT *, u_int32_t));
+static int __ham_expand_table __P((DBC *));
+static int __hamc_update_getorder
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+static int __hamc_update_setorder
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+static int __ham_get_clist_func
+ __P((DBC *, DBC *, u_int32_t *, db_pgno_t, u_int32_t, void *));
+
+/*
+ * __ham_quick_delete --
+ * This function is called by __db_del when the appropriate conditions
+ * are met, and it performs the delete in the optimized way.
+ *
+ * PUBLIC: int __ham_quick_delete __P((DBC *));
+ */
+int
+__ham_quick_delete(dbc)
+ DBC *dbc;
+{
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ int ret, t_ret;
+
+ /*
+ * When performing a DB->del operation not involving secondary indices
+ * and not removing an off-page duplicate tree, we can speed things up
+ * substantially by removing the entire duplicate set, if any is
+ * present, in one operation, rather than by conjuring up and deleting
+ * each of the items individually. (All are stored in one big HKEYDATA
+ * structure.) We don't bother to distinguish on-page duplicate sets
+ * from single, non-dup items; they're deleted in exactly the same way.
+ *
+ * The cursor should be set to the first item in the duplicate set, or
+ * to the sole key/data pair when the key does not have a duplicate set,
+ * before the function is called.
+ *
+ * We do not need to call CDB_LOCKING_INIT; __db_del calls here with
+ * a write cursor.
+ *
+ * Assert we're initialized, but not to an off-page duplicate.
+ * Assert we're not using secondary indices.
+ */
+ DB_ASSERT(dbc->env, IS_INITIALIZED(dbc));
+ DB_ASSERT(dbc->env, dbc->internal->opd == NULL);
+ DB_ASSERT(dbc->env, !F_ISSET(dbc->dbp, DB_AM_SECONDARY));
+ DB_ASSERT(dbc->env, !DB_IS_PRIMARY(dbc->dbp));
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ mpf = dbc->dbp->mpf;
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ return (ret);
+
+ if ((ret = __hamc_writelock(dbc)) == 0) {
+ ret = __ham_del_pair(dbc, 0, NULL);
+ /*
+ * If a page was retrieved during the delete, put it now. We
+ * can't rely on the caller's cursor close to do that, since bulk
+ * delete operations keep the cursor open across deletes.
+ */
+ if (hcp->page != NULL) {
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+ hcp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ hcp->page = NULL;
+ }
+ }
+
+ if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ return (ret);
+}
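+
+/*
+ * Usage sketch (illustrative, with assumed handle names): the fast path
+ * above is reached through the public DB->del() API when no secondary
+ * indices or off-page duplicate trees are involved.  "dbp" is a DB
+ * handle opened with type DB_HASH; the key contents are made up.
+ *
+ *	DBT key;
+ *	int ret;
+ *
+ *	memset(&key, 0, sizeof(key));
+ *	key.data = "somekey";
+ *	key.size = sizeof("somekey") - 1;
+ *	if ((ret = dbp->del(dbp, NULL, &key, 0)) != 0)
+ *		dbp->err(dbp, ret, "DB->del");
+ */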
+
+/* ****************** CURSORS ********************************** */
+/*
+ * __hamc_init --
+ * Initialize the hash-specific portion of a cursor.
+ *
+ * PUBLIC: int __hamc_init __P((DBC *));
+ */
+int
+__hamc_init(dbc)
+ DBC *dbc;
+{
+ ENV *env;
+ HASH_CURSOR *new_curs;
+ int ret;
+
+ env = dbc->env;
+ if ((ret = __os_calloc(env,
+ 1, sizeof(struct cursor_t), &new_curs)) != 0)
+ return (ret);
+ if ((ret = __os_malloc(env,
+ dbc->dbp->pgsize, &new_curs->split_buf)) != 0) {
+ __os_free(env, new_curs);
+ return (ret);
+ }
+
+ dbc->internal = (DBC_INTERNAL *) new_curs;
+ dbc->close = dbc->c_close = __dbc_close_pp;
+ dbc->cmp = __dbc_cmp_pp;
+ dbc->count = dbc->c_count = __dbc_count_pp;
+ dbc->del = dbc->c_del = __dbc_del_pp;
+ dbc->dup = dbc->c_dup = __dbc_dup_pp;
+ dbc->get = dbc->c_get = __dbc_get_pp;
+ dbc->pget = dbc->c_pget = __dbc_pget_pp;
+ dbc->put = dbc->c_put = __dbc_put_pp;
+ dbc->am_bulk = __ham_bulk;
+ dbc->am_close = __hamc_close;
+ dbc->am_del = __hamc_del;
+ dbc->am_destroy = __hamc_destroy;
+ dbc->am_get = __hamc_get;
+ dbc->am_put = __hamc_put;
+ dbc->am_writelock = __hamc_writelock;
+
+ return (__ham_item_init(dbc));
+}
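+
+/*
+ * Sketch of how the method table above is reached (names assumed):
+ * DB->cursor() on a handle opened with type DB_HASH allocates a DBC and
+ * calls __hamc_init() to fill in the am_* slots, so later cursor calls
+ * dispatch into this file.
+ *
+ *	DBC *dbc;
+ *	int ret;
+ *
+ *	if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) == 0) {
+ *		... DBC->get() now ends up in __hamc_get() ...
+ *		ret = dbc->close(dbc);
+ *	}
+ */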
+
+/*
+ * __hamc_close --
+ * Close down the cursor from a single use.
+ */
+static int
+__hamc_close(dbc, root_pgno, rmroot)
+ DBC *dbc;
+ db_pgno_t root_pgno;
+ int *rmroot;
+{
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ HKEYDATA *dp;
+ db_lockmode_t lock_mode;
+ int doroot, gotmeta, ret, t_ret;
+
+ COMPQUIET(rmroot, 0);
+ mpf = dbc->dbp->mpf;
+ doroot = gotmeta = ret = 0;
+ hcp = (HASH_CURSOR *) dbc->internal;
+
+ /* Check for off page dups. */
+ if (dbc->internal->opd != NULL) {
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ goto done;
+ gotmeta = 1;
+ lock_mode = DB_LOCK_READ;
+
+ /* To support dirty reads we must reget the write lock. */
+ if (F_ISSET(dbc->dbp, DB_AM_READ_UNCOMMITTED) &&
+ F_ISSET((BTREE_CURSOR *)
+ dbc->internal->opd->internal, C_DELETED))
+ lock_mode = DB_LOCK_WRITE;
+
+ if ((ret = __ham_get_cpage(dbc, lock_mode)) != 0)
+ goto out;
+ dp = (HKEYDATA *)H_PAIRDATA(dbc->dbp, hcp->page, hcp->indx);
+
+ /* If it's not a dup we aborted before we changed it. */
+ if (HPAGE_PTYPE(dp) == H_OFFDUP)
+ memcpy(&root_pgno,
+ HOFFPAGE_PGNO(dp), sizeof(db_pgno_t));
+ else
+ root_pgno = PGNO_INVALID;
+
+ if ((ret =
+ hcp->opd->am_close(hcp->opd, root_pgno, &doroot)) != 0)
+ goto out;
+ if (doroot != 0) {
+ if ((ret = __memp_dirty(mpf, &hcp->page,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto out;
+ if ((ret = __ham_del_pair(dbc, 0, NULL)) != 0)
+ goto out;
+ }
+ }
+
+out: if (ret != 0)
+ F_SET(dbc, DBC_ERROR);
+ if (hcp->page != NULL && (t_ret = __memp_fput(mpf,
+ dbc->thread_info, hcp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (gotmeta != 0 && (t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+done: if ((t_ret = __ham_item_init(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __hamc_destroy --
+ * Cleanup the access method private part of a cursor.
+ */
+static int
+__hamc_destroy(dbc)
+ DBC *dbc;
+{
+ HASH_CURSOR *hcp;
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ if (hcp->split_buf != NULL)
+ __os_free(dbc->env, hcp->split_buf);
+ __os_free(dbc->env, hcp);
+
+ return (0);
+}
+
+/*
+ * __hamc_count --
+ * Return a count of on-page duplicates.
+ *
+ * PUBLIC: int __hamc_count __P((DBC *, db_recno_t *));
+ */
+int
+__hamc_count(dbc, recnop)
+ DBC *dbc;
+ db_recno_t *recnop;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ db_indx_t len;
+ db_recno_t recno;
+ int ret, t_ret;
+ u_int8_t *p, *pend;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ recno = 0;
+
+ if ((ret = __ham_get_cpage(dbc, DB_LOCK_READ)) != 0)
+ return (ret);
+ if (hcp->indx >= NUM_ENT(hcp->page)) {
+ *recnop = 0;
+ goto err;
+ }
+
+ switch (HPAGE_PTYPE(H_PAIRDATA(dbp, hcp->page, hcp->indx))) {
+ case H_KEYDATA:
+ case H_OFFPAGE:
+ recno = 1;
+ break;
+ case H_DUPLICATE:
+ p = HKEYDATA_DATA(H_PAIRDATA(dbp, hcp->page, hcp->indx));
+ pend = p +
+ LEN_HDATA(dbp, hcp->page, dbp->pgsize, hcp->indx);
+ for (; p < pend; recno++) {
+ /* p may be odd, so copy rather than just dereferencing it */
+ memcpy(&len, p, sizeof(db_indx_t));
+ p += 2 * sizeof(db_indx_t) + len;
+ }
+
+ break;
+ default:
+ ret = __db_pgfmt(dbp->env, hcp->pgno);
+ goto err;
+ }
+
+ *recnop = recno;
+
+err: if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, hcp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ hcp->page = NULL;
+ return (ret);
+}
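+
+/*
+ * Usage sketch: the function above backs the public DBC->count() method
+ * for hash databases.  With a positioned cursor "dbc" (an assumed name),
+ * the on-page duplicate count for the current key is retrieved as:
+ *
+ *	db_recno_t count;
+ *	int ret;
+ *
+ *	if ((ret = dbc->count(dbc, &count, 0)) != 0)
+ *		return (ret);
+ *	... count is 1 for a single item, N for N on-page duplicates ...
+ */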
+
+/*
+ * __hamc_cmp --
+ * Compare two hash cursors for equality.
+ *
+ * This function is only called with two cursors that point to the same item.
+ * It distinguishes two cases:
+ * * Cursors pointing to different items in the same on-page duplicate set.
+ * * Cursors pointing to the same item, with different DELETED flags.
+ *
+ * PUBLIC: int __hamc_cmp __P((DBC *, DBC *, int *));
+ */
+int
+__hamc_cmp(dbc, other_dbc, result)
+ DBC *dbc, *other_dbc;
+ int *result;
+{
+ ENV *env;
+ HASH_CURSOR *hcp, *ohcp;
+
+ env = dbc->env;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ ohcp = (HASH_CURSOR *)other_dbc->internal;
+
+ DB_ASSERT(env, hcp->pgno == ohcp->pgno);
+ DB_ASSERT(env, hcp->indx == ohcp->indx);
+
+ /* Only compare the duplicate offsets if this is a duplicate item. */
+ if ((F_ISSET(hcp, H_ISDUP) && hcp->dup_off != ohcp->dup_off) ||
+ F_ISSET(hcp, H_DELETED) != F_ISSET(ohcp, H_DELETED))
+ *result = 1;
+ else
+ *result = 0;
+ return (0);
+}
+
+static int
+__hamc_del(dbc, flags)
+ DBC *dbc;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBT repldbt;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ int ret, t_ret;
+
+ COMPQUIET(flags, 0);
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ if (F_ISSET(hcp, H_DELETED))
+ return (DB_NOTFOUND);
+
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ goto out;
+
+ if ((ret = __ham_get_cpage(dbc, DB_LOCK_WRITE)) != 0)
+ goto out;
+
+ /* Off-page duplicates. */
+ if (HPAGE_TYPE(dbp, hcp->page, H_DATAINDEX(hcp->indx)) == H_OFFDUP)
+ goto out;
+
+ DB_ASSERT(dbp->env, IS_DIRTY(hcp->page));
+
+ if (F_ISSET(hcp, H_ISDUP)) { /* On-page duplicate. */
+ if (hcp->dup_off == 0 &&
+ DUP_SIZE(hcp->dup_len) == LEN_HDATA(dbp, hcp->page,
+ hcp->hdr->dbmeta.pagesize, hcp->indx))
+ ret = __ham_del_pair(dbc, 0, NULL);
+ else {
+ repldbt.flags = 0;
+ F_SET(&repldbt, DB_DBT_PARTIAL);
+ repldbt.doff = hcp->dup_off;
+ repldbt.dlen = DUP_SIZE(hcp->dup_len);
+ repldbt.size = 0;
+ repldbt.data = HKEYDATA_DATA(H_PAIRDATA(dbp, hcp->page,
+ hcp->indx));
+ if ((ret =
+ __ham_replpair(dbc, &repldbt, H_DUPLICATE)) == 0) {
+ hcp->dup_tlen -= DUP_SIZE(hcp->dup_len);
+ F_SET(hcp, H_DELETED);
+ /*
+ * Clear any cached streaming information.
+ */
+ hcp->stream_start_pgno = PGNO_INVALID;
+ ret = __hamc_update(dbc, DUP_SIZE(hcp->dup_len),
+ DB_HAM_CURADJ_DEL, 1);
+ }
+ }
+ } else /* Not a duplicate */
+ ret = __ham_del_pair(dbc, 0, NULL);
+
+out: if (hcp->page != NULL) {
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+ hcp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ hcp->page = NULL;
+ }
+ if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+ return (ret);
+}
+
+/*
+ * __hamc_dup --
+ * Duplicate a hash cursor, such that the new one holds appropriate
+ * locks for the position of the original.
+ *
+ * PUBLIC: int __hamc_dup __P((DBC *, DBC *));
+ */
+int
+__hamc_dup(orig_dbc, new_dbc)
+ DBC *orig_dbc, *new_dbc;
+{
+ HASH_CURSOR *orig, *new;
+
+ orig = (HASH_CURSOR *)orig_dbc->internal;
+ new = (HASH_CURSOR *)new_dbc->internal;
+
+ new->bucket = orig->bucket;
+ new->lbucket = orig->lbucket;
+ new->dup_off = orig->dup_off;
+ new->dup_len = orig->dup_len;
+ new->dup_tlen = orig->dup_tlen;
+
+ if (F_ISSET(orig, H_DELETED))
+ F_SET(new, H_DELETED);
+ if (F_ISSET(orig, H_ISDUP))
+ F_SET(new, H_ISDUP);
+
+ return (0);
+}
+
+static int
+__hamc_get(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key;
+ DBT *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ ENV *env;
+ HASH_CURSOR *hcp;
+ db_lockmode_t lock_type;
+ int ret, t_ret;
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ dbp = dbc->dbp;
+ env = dbp->env;
+ mpf = dbp->mpf;
+
+ /* Clear OR'd in additional bits so we can check for flag equality. */
+ if (F_ISSET(dbc, DBC_RMW))
+ lock_type = DB_LOCK_WRITE;
+ else
+ lock_type = DB_LOCK_READ;
+
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ return (ret);
+ hcp->seek_size = 0;
+
+ ret = 0;
+ switch (flags) {
+ case DB_PREV_DUP:
+ F_SET(hcp, H_DUPONLY);
+ goto prev;
+ case DB_PREV_NODUP:
+ F_SET(hcp, H_NEXT_NODUP);
+ /* FALLTHROUGH */
+ case DB_PREV:
+ if (IS_INITIALIZED(dbc)) {
+prev: ret = __ham_item_prev(dbc, lock_type, pgnop);
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_LAST:
+ ret = __ham_item_last(dbc, lock_type, pgnop);
+ break;
+ case DB_NEXT_DUP:
+ case DB_GET_BOTHC:
+ /* cgetchk has already determined that the cursor is set. */
+ F_SET(hcp, H_DUPONLY);
+ goto next;
+ case DB_NEXT_NODUP:
+ F_SET(hcp, H_NEXT_NODUP);
+ /* FALLTHROUGH */
+ case DB_NEXT:
+ if (IS_INITIALIZED(dbc)) {
+next: ret = __ham_item_next(dbc, lock_type, pgnop);
+ break;
+ }
+ /* FALLTHROUGH */
+ case DB_FIRST:
+ ret = __ham_item_first(dbc, lock_type, pgnop);
+ break;
+ case DB_SET:
+ case DB_SET_RANGE:
+ case DB_GET_BOTH:
+ case DB_GET_BOTH_RANGE:
+ ret = __ham_lookup(dbc, key, 0, lock_type, pgnop);
+ break;
+ case DB_CURRENT:
+ /* cgetchk has already determined that the cursor is set. */
+ if (F_ISSET(hcp, H_DELETED)) {
+ ret = DB_KEYEMPTY;
+ goto err;
+ }
+
+ ret = __ham_item(dbc, lock_type, pgnop);
+ break;
+ default:
+ ret = __db_unknown_flag(env, "__hamc_get", flags);
+ break;
+ }
+
+ /*
+ * Must always enter this loop to do error handling and
+ * check for big key/data pair.
+ */
+ for (;;) {
+ if (ret != 0 && ret != DB_NOTFOUND)
+ goto err;
+ else if (F_ISSET(hcp, H_OK)) {
+ if (*pgnop == PGNO_INVALID)
+ ret = __ham_dup_return(dbc, data, flags);
+ break;
+ } else if (!F_ISSET(hcp, H_NOMORE)) {
+ __db_errx(env, DB_STR("1130",
+ "H_NOMORE returned to __hamc_get"));
+ ret = EINVAL;
+ break;
+ }
+
+ /*
+ * Ran out of entries in a bucket; change buckets.
+ */
+ switch (flags) {
+ case DB_LAST:
+ case DB_PREV:
+ case DB_PREV_DUP:
+ case DB_PREV_NODUP:
+ ret = __memp_fput(mpf,
+ dbc->thread_info, hcp->page, dbc->priority);
+ hcp->page = NULL;
+ if (hcp->bucket == 0) {
+ ret = DB_NOTFOUND;
+ hcp->pgno = PGNO_INVALID;
+ goto err;
+ }
+ F_CLR(hcp, H_ISDUP);
+ hcp->bucket--;
+ hcp->indx = NDX_INVALID;
+ hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
+ if (ret == 0)
+ ret = __ham_item_prev(dbc, lock_type, pgnop);
+ break;
+ case DB_FIRST:
+ case DB_NEXT:
+ case DB_NEXT_NODUP:
+ ret = __memp_fput(mpf,
+ dbc->thread_info, hcp->page, dbc->priority);
+ hcp->page = NULL;
+ hcp->indx = NDX_INVALID;
+ hcp->bucket++;
+ F_CLR(hcp, H_ISDUP);
+ hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
+ if (hcp->bucket > hcp->hdr->max_bucket) {
+ ret = DB_NOTFOUND;
+ hcp->pgno = PGNO_INVALID;
+ goto err;
+ }
+ if (ret == 0)
+ ret = __ham_item_next(dbc, lock_type, pgnop);
+ break;
+ case DB_GET_BOTH:
+ case DB_GET_BOTHC:
+ case DB_GET_BOTH_RANGE:
+ case DB_NEXT_DUP:
+ case DB_SET:
+ case DB_SET_RANGE:
+ /* Key not found. */
+ ret = DB_NOTFOUND;
+ goto err;
+ case DB_CURRENT:
+ /*
+ * This should only happen if you are doing deletes and
+ * reading with concurrent threads and not doing proper
+ * locking. We return the same error code as we would
+ * if the cursor were deleted.
+ */
+ ret = DB_KEYEMPTY;
+ goto err;
+ default:
+ DB_ASSERT(env, 0);
+ }
+ }
+
+err: if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+ F_CLR(hcp, H_DUPONLY);
+ F_CLR(hcp, H_NEXT_NODUP);
+
+ return (ret);
+}
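+
+/*
+ * Sketch of the flag-driven traversal above, assuming a database opened
+ * with DB_DUP and an open cursor "dbc" (assumed names): DB_SET positions
+ * on a key, then DB_NEXT_DUP walks only that key's duplicate set until
+ * DB_NOTFOUND.
+ *
+ *	DBT key, data;
+ *	int ret;
+ *
+ *	memset(&key, 0, sizeof(key));
+ *	memset(&data, 0, sizeof(data));
+ *	key.data = "somekey";
+ *	key.size = sizeof("somekey") - 1;
+ *	for (ret = dbc->get(dbc, &key, &data, DB_SET); ret == 0;
+ *	    ret = dbc->get(dbc, &key, &data, DB_NEXT_DUP))
+ *		... one duplicate per iteration ...
+ *	if (ret != DB_NOTFOUND)
+ *		... real error ...
+ */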
+
+/*
+ * __ham_bulk -- Return bulk data from a hash table.
+ */
+static int
+__ham_bulk(dbc, data, flags)
+ DBC *dbc;
+ DBT *data;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *cp;
+ PAGE *pg;
+ db_indx_t dup_len, dup_off, dup_tlen, indx, *inp;
+ db_lockmode_t lock_mode;
+ db_pgno_t pgno;
+ int32_t *endp, *offp, *saveoff;
+ u_int32_t key_off, key_size, pagesize, size, space;
+ u_int8_t *dbuf, *dp, *hk, *np, *tmp;
+ int is_dup, is_key;
+ int need_pg, next_key, no_dup, ret, t_ret;
+
+ ret = 0;
+ key_off = 0;
+ dup_len = dup_off = dup_tlen = 0;
+ size = 0;
+ dbp = dbc->dbp;
+ pagesize = dbp->pgsize;
+ mpf = dbp->mpf;
+ cp = (HASH_CURSOR *)dbc->internal;
+ is_key = LF_ISSET(DB_MULTIPLE_KEY) ? 1 : 0;
+ next_key = is_key && LF_ISSET(DB_OPFLAGS_MASK) != DB_NEXT_DUP;
+ no_dup = LF_ISSET(DB_OPFLAGS_MASK) == DB_NEXT_NODUP;
+ dbuf = data->data;
+ np = dp = dbuf;
+
+ /* Keep track of space that is left; there is a termination entry. */
+ space = data->ulen;
+ space -= sizeof(*offp);
+
+ /* Build the offset/size table from the end up. */
+ endp = (int32_t *) ((u_int8_t *)dbuf + data->ulen);
+ endp--;
+ offp = endp;
+
+ key_size = 0;
+ lock_mode = F_ISSET(dbc, DBC_RMW) ? DB_LOCK_WRITE: DB_LOCK_READ;
+
+next_pg:
+ need_pg = 1;
+ indx = cp->indx;
+ pg = cp->page;
+ inp = P_INP(dbp, pg);
+
+ do {
+ if (is_key) {
+ hk = H_PAIRKEY(dbp, pg, indx);
+ if (HPAGE_PTYPE(hk) == H_OFFPAGE) {
+ memcpy(&key_size,
+ HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
+ memcpy(&pgno,
+ HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
+ size = key_size;
+ if (key_size > space)
+ goto get_key_space;
+ if ((ret = __bam_bulk_overflow(
+ dbc, key_size, pgno, np)) != 0)
+ return (ret);
+ space -= key_size;
+ key_off = (u_int32_t)(np - dbuf);
+ np += key_size;
+ } else {
+ if (need_pg) {
+ dp = np;
+ size = pagesize - HOFFSET(pg);
+ if (space < size) {
+get_key_space:
+ if (offp == endp) {
+ data->size = (u_int32_t)
+ DB_ALIGN(size +
+ pagesize, 1024);
+ return
+ (DB_BUFFER_SMALL);
+ }
+ goto back_up;
+ }
+ memcpy(dp,
+ (u_int8_t *)pg + HOFFSET(pg), size);
+ need_pg = 0;
+ space -= size;
+ np += size;
+ }
+ key_size = LEN_HKEY(dbp, pg, pagesize, indx);
+ key_off = ((inp[indx] - HOFFSET(pg)) +
+ (u_int32_t)(dp - dbuf)) +
+ SSZA(HKEYDATA, data);
+ }
+ }
+
+ hk = H_PAIRDATA(dbp, pg, indx);
+ switch (HPAGE_PTYPE(hk)) {
+ case H_DUPLICATE:
+ case H_KEYDATA:
+ if (need_pg) {
+ dp = np;
+ size = pagesize - HOFFSET(pg);
+ if (space < size) {
+back_up:
+ if (indx != 0) {
+ indx -= 2;
+ /* XXX
+ * It's not clear that this is
+ * the right way to fix this,
+ * but here goes.
+ * If we are backing up onto a
+ * duplicate, then we need to
+ * position ourselves at the
+ * end of the duplicate set.
+ * We probably need to make
+ * this work for H_OFFDUP too.
+ * It might be worth making a
+ * dummy cursor and calling
+ * __ham_item_prev.
+ */
+ tmp = H_PAIRDATA(dbp, pg, indx);
+ if (HPAGE_PTYPE(tmp) ==
+ H_DUPLICATE) {
+ dup_off = dup_tlen =
+ LEN_HDATA(dbp, pg,
+ pagesize, indx + 1);
+ memcpy(&dup_len,
+ HKEYDATA_DATA(tmp),
+ sizeof(db_indx_t));
+ } else {
+ is_dup = 0;
+ dup_len = 0;
+ dup_off = 0;
+ dup_tlen = 0;
+ F_CLR(cp, H_ISDUP);
+ }
+ goto get_space;
+ }
+ /* indx == 0 */
+ cp->dup_len = dup_len;
+ cp->dup_off = dup_off;
+ cp->dup_tlen = dup_tlen;
+ if ((ret = __ham_item_prev(dbc,
+ lock_mode, &pgno)) != 0) {
+ if (ret != DB_NOTFOUND)
+ return (ret);
+ if ((ret = __memp_fput(mpf,
+ dbc->thread_info, cp->page,
+ dbc->priority)) != 0)
+ return (ret);
+ cp->page = NULL;
+ if (cp->bucket == 0) {
+ cp->indx = indx =
+ NDX_INVALID;
+ goto get_space;
+ }
+ if ((ret =
+ __ham_get_meta(dbc)) != 0)
+ return (ret);
+
+ cp->bucket--;
+ cp->pgno = BUCKET_TO_PAGE(cp,
+ cp->bucket);
+ cp->indx = NDX_INVALID;
+ if ((ret = __ham_release_meta(
+ dbc)) != 0)
+ return (ret);
+ /*
+ * Not an error to get
+ * DB_NOTFOUND, we're just at
+ * the beginning of the db.
+ */
+ if ((ret = __ham_item_prev(dbc,
+ lock_mode, &pgno)) != 0) {
+ if (ret != DB_NOTFOUND)
+ return (ret);
+ else
+ ret = 0;
+ }
+ }
+ indx = cp->indx;
+get_space:
+ /*
+ * See if we put any data in the buffer.
+ */
+ if (offp >= endp ||
+ F_ISSET(dbc, DBC_TRANSIENT)) {
+ data->size = (u_int32_t)
+ DB_ALIGN(size +
+ data->ulen - space, 1024);
+ return (DB_BUFFER_SMALL);
+ }
+ /*
+ * Don't continue; we're all out
+ * of space, even though we're
+ * returning success.
+ */
+ next_key = 0;
+ break;
+ }
+ memcpy(dp, (u_int8_t *)pg + HOFFSET(pg), size);
+ need_pg = 0;
+ space -= size;
+ np += size;
+ }
+
+ /*
+ * We're about to crack the offset(s) and length(s)
+ * out of an H_KEYDATA or H_DUPLICATE item.
+ * There are three cases:
+ * 1. We were moved into a duplicate set by
+ * the standard hash cursor code. Respect
+ * the dup_off and dup_tlen we were given.
+ * 2. We stumbled upon a duplicate set while
+ * walking the page on our own. We need to
+ * recognize it as a dup and set dup_off and
+ * dup_tlen.
+ * 3. The current item is not a dup.
+ */
+ if (F_ISSET(cp, H_ISDUP)) {
+ /* Case 1 */
+ is_dup = 1;
+ dup_len = cp->dup_len;
+ dup_off = cp->dup_off;
+ dup_tlen = cp->dup_tlen;
+ } else if (HPAGE_PTYPE(hk) == H_DUPLICATE) {
+ /* Case 2 */
+ is_dup = 1;
+ /*
+ * If we run out of memory and bail,
+ * make sure the fact we're in a dup set
+ * isn't ignored later.
+ */
+ F_SET(cp, H_ISDUP);
+ dup_off = 0;
+ memcpy(&dup_len,
+ HKEYDATA_DATA(hk), sizeof(db_indx_t));
+ dup_tlen = LEN_HDATA(dbp, pg, pagesize, indx);
+ } else {
+ /* Case 3 */
+ is_dup = 0;
+ dup_len = 0;
+ dup_off = 0;
+ dup_tlen = 0;
+ }
+
+ do {
+ space -= (is_key ? 4 : 2) * sizeof(*offp);
+ size += (is_key ? 4 : 2) * sizeof(*offp);
+ /*
+ * Since space is an unsigned, if we happen
+ * to wrap, then this comparison will turn out
+ * to be true. XXX Wouldn't it be better to
+ * simply check above that space is greater than
+ * the value we're about to subtract???
+ */
+ if (space > data->ulen) {
+ if (!is_dup || dup_off == 0)
+ goto back_up;
+ dup_off -= (db_indx_t)
+ DUP_SIZE((u_int32_t)offp[1]);
+ goto get_space;
+ }
+ if (is_key) {
+ *offp-- = (int32_t)key_off;
+ *offp-- = (int32_t)key_size;
+ }
+ if (is_dup) {
+ *offp-- = (int32_t)(
+ ((inp[indx + 1] - HOFFSET(pg)) +
+ dp - dbuf) + SSZA(HKEYDATA, data) +
+ dup_off + sizeof(db_indx_t));
+ memcpy(&dup_len,
+ HKEYDATA_DATA(hk) + dup_off,
+ sizeof(db_indx_t));
+ dup_off += DUP_SIZE(dup_len);
+ *offp-- = dup_len;
+ } else {
+ *offp-- = (int32_t)(
+ ((inp[indx + 1] - HOFFSET(pg)) +
+ dp - dbuf) + SSZA(HKEYDATA, data));
+ *offp-- = LEN_HDATA(dbp, pg,
+ pagesize, indx);
+ }
+ } while (is_dup && dup_off < dup_tlen && no_dup == 0);
+ F_CLR(cp, H_ISDUP);
+ break;
+ case H_OFFDUP:
+ memcpy(&pgno, HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
+ space -= 2 * sizeof(*offp);
+ if (space > data->ulen)
+ goto back_up;
+
+ if (is_key) {
+ space -= 2 * sizeof(*offp);
+ if (space > data->ulen)
+ goto back_up;
+ *offp-- = (int32_t)key_off;
+ *offp-- = (int32_t)key_size;
+ }
+ saveoff = offp;
+ if ((ret = __bam_bulk_duplicates(dbc,
+ pgno, dbuf, is_key ? offp + 2 : NULL,
+ &offp, &np, &space, no_dup)) != 0) {
+ if (ret == DB_BUFFER_SMALL) {
+ size = space;
+ space = 0;
+ if (is_key && saveoff == offp) {
+ offp += 2;
+ goto back_up;
+ }
+ goto get_space;
+ }
+ return (ret);
+ }
+ break;
+ case H_OFFPAGE:
+ space -= (is_key ? 4 : 2) * sizeof(*offp);
+ if (space > data->ulen)
+ goto back_up;
+
+ memcpy(&size, HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
+ memcpy(&pgno, HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
+ if (size > space)
+ goto back_up;
+
+ if ((ret =
+ __bam_bulk_overflow(dbc, size, pgno, np)) != 0)
+ return (ret);
+
+ if (is_key) {
+ *offp-- = (int32_t)key_off;
+ *offp-- = (int32_t)key_size;
+ }
+
+ *offp-- = (int32_t)(np - dbuf);
+ *offp-- = (int32_t)size;
+
+ np += size;
+ space -= size;
+ break;
+ default:
+ /* Do nothing. */
+ break;
+ }
+ } while (next_key && (indx += 2) < NUM_ENT(pg));
+
+ cp->indx = indx;
+ cp->dup_len = dup_len;
+ cp->dup_off = dup_off;
+ cp->dup_tlen = dup_tlen;
+
+ /* If we are off the page then try the next page. */
+ if (ret == 0 && next_key && indx >= NUM_ENT(pg)) {
+ if ((ret = __ham_item_next(dbc, lock_mode, &pgno)) == 0)
+ goto next_pg;
+ if (ret != DB_NOTFOUND)
+ return (ret);
+ if ((ret = __memp_fput(dbc->dbp->mpf,
+ dbc->thread_info, cp->page, dbc->priority)) != 0)
+ return (ret);
+ cp->page = NULL;
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ return (ret);
+
+ cp->bucket++;
+ if (cp->bucket > cp->hdr->max_bucket) {
+ /*
+ * Restore cursor to its previous state. We're past
+ * the last item in the last bucket, so the next
+ * DBC->get(DB_NEXT) will return DB_NOTFOUND.
+ */
+ cp->bucket--;
+ ret = DB_NOTFOUND;
+ } else {
+ /*
+ * Start on the next bucket.
+ *
+ * Note that if this new bucket happens to be empty,
+ * but there's another non-empty bucket after it,
+ * we'll return early. This is a rare case, and we
+ * don't guarantee any particular number of keys
+ * returned on each call, so just let the next call
+ * to bulk get move forward by yet another bucket.
+ */
+ cp->pgno = BUCKET_TO_PAGE(cp, cp->bucket);
+ cp->indx = NDX_INVALID;
+ F_CLR(cp, H_ISDUP);
+ ret = __ham_item_next(dbc, lock_mode, &pgno);
+ }
+
+ if ((t_ret = __ham_release_meta(dbc)) != 0)
+ return (t_ret);
+ if (ret == 0)
+ goto next_pg;
+ if (ret != DB_NOTFOUND)
+ return (ret);
+ }
+ *offp = -1;
+ return (0);
+}
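+
+/*
+ * Sketch of consuming the buffer built above, using the public
+ * DB_MULTIPLE_KEY macros.  The 1MB buffer size is an arbitrary choice
+ * for the example; the buffer must be at least as large as one page.
+ *
+ *	DBT key, data;
+ *	void *p, *kptr, *dptr;
+ *	size_t klen, dlen;
+ *	int ret;
+ *
+ *	memset(&key, 0, sizeof(key));
+ *	memset(&data, 0, sizeof(data));
+ *	data.ulen = 1024 * 1024;
+ *	data.data = malloc(data.ulen);
+ *	data.flags = DB_DBT_USERMEM;
+ *	while ((ret = dbc->get(dbc,
+ *	    &key, &data, DB_MULTIPLE_KEY | DB_NEXT)) == 0) {
+ *		DB_MULTIPLE_INIT(p, &data);
+ *		for (;;) {
+ *			DB_MULTIPLE_KEY_NEXT(p, &data,
+ *			    kptr, klen, dptr, dlen);
+ *			if (p == NULL)
+ *				break;
+ *			... kptr/klen, dptr/dlen describe one pair ...
+ *		}
+ *	}
+ */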
+
+static int
+__hamc_put(dbc, key, data, flags, pgnop)
+ DBC *dbc;
+ DBT *key;
+ DBT *data;
+ u_int32_t flags;
+ db_pgno_t *pgnop;
+{
+ DB *dbp;
+ DBT tmp_val, *myval;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ u_int32_t nbytes;
+ int ret, t_ret;
+
+ /*
+ * The compiler doesn't realize that we only use this when ret is
+ * equal to 0 and that if ret is equal to 0, that we must have set
+ * myval. So, we initialize it here to shut the compiler up.
+ */
+ COMPQUIET(myval, NULL);
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ if (F_ISSET(hcp, H_DELETED) && flags != DB_KEYFIRST &&
+ flags != DB_KEYLAST && flags != DB_OVERWRITE_DUP)
+ return (DB_NOTFOUND);
+
+ if ((ret = __ham_get_meta(dbc)) != 0)
+ goto err1;
+
+ switch (flags) {
+ case DB_KEYLAST:
+ case DB_KEYFIRST:
+ case DB_NODUPDATA:
+ case DB_NOOVERWRITE:
+ case DB_OVERWRITE_DUP:
+ nbytes = (ISBIG(hcp, key->size) ? HOFFPAGE_PSIZE :
+ HKEYDATA_PSIZE(key->size)) +
+ (ISBIG(hcp, data->size) ? HOFFPAGE_PSIZE :
+ HKEYDATA_PSIZE(data->size));
+ if ((ret = __ham_lookup(dbc,
+ key, nbytes, DB_LOCK_WRITE, pgnop)) == DB_NOTFOUND) {
+ if (hcp->seek_found_page != PGNO_INVALID &&
+ hcp->seek_found_page != hcp->pgno) {
+ if ((ret = __memp_fput(mpf, dbc->thread_info,
+ hcp->page, dbc->priority)) != 0)
+ goto err2;
+ hcp->page = NULL;
+ hcp->pgno = hcp->seek_found_page;
+ hcp->indx = NDX_INVALID;
+ }
+
+ if (F_ISSET(data, DB_DBT_PARTIAL) && data->doff != 0) {
+ /*
+ * A partial put, but the key does not exist
+ * and we are not beginning the write at 0.
+ * We must create a data item padded up to doff
+ * and then write the new bytes represented by
+ * val.
+ */
+ if ((ret = __ham_init_dbt(dbp->env, &tmp_val,
+ data->size + data->doff,
+ &dbc->my_rdata.data,
+ &dbc->my_rdata.ulen)) != 0)
+ goto err2;
+
+ memset(tmp_val.data, 0, data->doff);
+ memcpy((u_int8_t *)tmp_val.data +
+ data->doff, data->data, data->size);
+ myval = &tmp_val;
+ } else
+ myval = (DBT *)data;
+
+ ret = __ham_add_el(dbc, key, myval, H_KEYDATA);
+ goto done;
+ } else if (ret == 0 && flags == DB_NOOVERWRITE &&
+ !F_ISSET(hcp, H_DELETED)) {
+ if (*pgnop == PGNO_INVALID)
+ ret = DB_KEYEXIST;
+ else
+ ret = __bam_opd_exists(dbc, *pgnop);
+ if (ret != 0)
+ goto done;
+ }
+ break;
+ case DB_BEFORE:
+ case DB_AFTER:
+ case DB_CURRENT:
+ ret = __ham_item(dbc, DB_LOCK_WRITE, pgnop);
+ break;
+ default:
+ ret = __db_unknown_flag(dbp->env, "__hamc_put", flags);
+ break;
+ }
+
+ /*
+ * Invalidate any insert index found so it is not reused
+ * in future inserts.
+ */
+ hcp->seek_found_page = PGNO_INVALID;
+ hcp->seek_found_indx = NDX_INVALID;
+
+ if (*pgnop == PGNO_INVALID && ret == 0) {
+ if ((ret = __memp_dirty(mpf, &hcp->page,
+ dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+ goto done;
+ if (flags == DB_CURRENT ||
+ (!(F_ISSET(dbp, DB_AM_DUP) || F_ISSET(key, DB_DBT_DUPOK)) &&
+ (flags == DB_KEYFIRST || flags == DB_KEYLAST ||
+ flags == DB_NODUPDATA || flags == DB_OVERWRITE_DUP)))
+ ret = __ham_overwrite(dbc, data, flags);
+ else
+ ret = __ham_add_dup(dbc, data, flags, pgnop);
+ }
+
+done: if (hcp->page != NULL) {
+ if ((t_ret = __memp_fput(mpf, dbc->thread_info,
+ hcp->page, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if (t_ret == 0)
+ hcp->page = NULL;
+ }
+
+ if (ret == 0 && F_ISSET(hcp, H_EXPAND)) {
+ ret = __ham_expand_table(dbc);
+ F_CLR(hcp, H_EXPAND);
+ /* If we are out of space, ignore the error. */
+ if (ret == ENOSPC && dbc->txn == NULL)
+ ret = 0;
+ } else if (ret == 0 && F_ISSET(hcp, H_CONTRACT)) {
+ if (!F_ISSET(dbp, DB_AM_REVSPLITOFF))
+ ret = __ham_contract_table(dbc, NULL);
+ F_CLR(hcp, H_CONTRACT);
+ }
+
+err2: if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
+ ret = t_ret;
+
+err1: return (ret);
+}
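+
+/*
+ * Usage sketch for the write paths above (assumed handle names):
+ * DB_NOOVERWRITE maps onto the DB_KEYEXIST branch in __hamc_put, so a
+ * put of an existing key fails without modifying the database.
+ *
+ *	DBT key, data;
+ *	int ret;
+ *
+ *	... fill in key and data ...
+ *	ret = dbp->put(dbp, NULL, &key, &data, DB_NOOVERWRITE);
+ *	if (ret == DB_KEYEXIST)
+ *		... the key was already present; nothing was written ...
+ */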
+
+/********************************* UTILITIES ************************/
+
+/*
+ * __ham_contract_table -- remove the last bucket.
+ * PUBLIC: int __ham_contract_table __P((DBC *, DB_COMPACT *));
+ */
+int
+__ham_contract_table(dbc, c_data)
+ DBC *dbc;
+ DB_COMPACT *c_data;
+{
+ DB *dbp;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ HMETA *hdr;
+ PAGE *h;
+ db_pgno_t maxpgno, stoppgno;
+ int drop_segment, ret;
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ h = NULL;
+ if ((ret = __ham_dirty_meta(dbc, 0)) != 0)
+ return (ret);
+ hcp = (HASH_CURSOR *)dbc->internal;
+ hdr = hcp->hdr;
+
+ if ((ret = __ham_merge_pages(dbc,
+ hdr->max_bucket & hdr->low_mask, hdr->max_bucket, c_data)) != 0)
+ return (ret);
+
+ maxpgno = BUCKET_TO_PAGE(hcp, hdr->max_bucket);
+ drop_segment = hdr->max_bucket == (hdr->low_mask + 1);
+
+ if (DBC_LOGGING(dbc)) {
+ if ((ret = __ham_contract_log(dbp, dbc->txn, &LSN(hdr),
+ 0, PGNO(hdr), &LSN(hdr), hdr->max_bucket, maxpgno)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(LSN(hdr));
+
+ hdr->max_bucket--;
+ /*
+ * If we are dropping a segment then adjust the spares table and masks
+ * and free the pages in that segment.
+ */
+ if (drop_segment) {
+ LOCK_CHECK_OFF(dbc->thread_info);
+ hdr->spares[__db_log2(hdr->max_bucket + 1) + 1] = PGNO_INVALID;
+ hdr->high_mask = hdr->low_mask;
+ hdr->low_mask >>= 1;
+ stoppgno = maxpgno + hdr->max_bucket + 1;
+ do {
+ if ((ret = __memp_fget(mpf, &maxpgno,
+ dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &h)) != 0)
+ break;
+ if ((ret = __db_free(dbc, h, 0)) != 0)
+ break;
+ ret = 0;
+ } while (++maxpgno < stoppgno);
+ LOCK_CHECK_ON(dbc->thread_info);
+ }
+
+err: return (ret);
+}
+
+/*
+ * __ham_expand_table --
+ */
+static int
+__ham_expand_table(dbc)
+ DBC *dbc;
+{
+ DB *dbp;
+ DBMETA *mmeta;
+ DB_LOCK metalock;
+ DB_LSN lsn;
+ DB_MPOOLFILE *mpf;
+ HASH_CURSOR *hcp;
+ PAGE *h;
+ db_pgno_t pgno, mpgno;
+ u_int32_t logn, newalloc, new_bucket, old_bucket;
+ int got_meta, new_double, ret, t_ret;
+
+ LOCK_CHECK_OFF(dbc->thread_info);
+
+ dbp = dbc->dbp;
+ mpf = dbp->mpf;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ if ((ret = __ham_dirty_meta(dbc, 0)) != 0)
+ return (ret);
+
+ LOCK_INIT(metalock);
+ mmeta = (DBMETA *) hcp->hdr;
+ mpgno = mmeta->pgno;
+ h = NULL;
+ newalloc = 0;
+ got_meta = 0;
+
+ /*
+ * If the split point is about to increase, make sure that we
+ * have enough extra pages. The calculation here is weird.
+ * We'd like to do this after we've upped max_bucket, but it's
+ * too late then because we've logged the meta-data split. What
+ * we'll do between then and now is increment max bucket and then
+ * see what the log of one greater than that is; here we have to
+ * look at the log of max + 2. VERY NASTY STUFF.
+ *
+ * We figure out what we need to do, then we log it, then request
+ * the pages from mpool. We don't want to fail after extending
+ * the file.
+ *
+ * If the page we are about to split into has already been allocated,
+ * then we simply need to get it to get its LSN. If it hasn't yet
+ * been allocated, then we know its LSN is (0,0).
+ */
+
+ new_bucket = hcp->hdr->max_bucket + 1;
+ old_bucket = new_bucket & hcp->hdr->low_mask;
+
+ new_double = hcp->hdr->max_bucket == hcp->hdr->high_mask;
+ logn = __db_log2(new_bucket);
+
+ if (!new_double || hcp->hdr->spares[logn + 1] != PGNO_INVALID) {
+ /* Page exists; get it so we can get its LSN */
+ pgno = BUCKET_TO_PAGE(hcp, new_bucket);
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &h)) != 0)
+ goto err;
+ lsn = h->lsn;
+ } else {
+ /* Get the master meta-data page to do allocation. */
+ if (F_ISSET(dbp, DB_AM_SUBDB)) {
+ mpgno = PGNO_BASE_MD;
+ if ((ret = __db_lget(dbc,
+ 0, mpgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
+ goto err;
+ if ((ret = __memp_fget(mpf, &mpgno, dbc->thread_info,
+ dbc->txn, DB_MPOOL_DIRTY, &mmeta)) != 0)
+ goto err;
+ got_meta = 1;
+ }
+ pgno = mmeta->last_pgno + 1;
+ ZERO_LSN(lsn);
+ newalloc = 1;
+ }
+
+ /* Log the meta-data split first. */
+ if (DBC_LOGGING(dbc)) {
+ /*
+ * We always log the page number of the first page of
+ * the allocation group. However, the LSN that we log
+ * is either the LSN on the first page (if we did not
+ * do the actual allocation here) or the LSN on the last
+ * page of the unit (if we did do the allocation here).
+ */
+ if ((ret = __ham_metagroup_log(dbp, dbc->txn,
+ &lsn, 0, hcp->hdr->max_bucket, mpgno, &mmeta->lsn,
+ hcp->hdr->dbmeta.pgno, &hcp->hdr->dbmeta.lsn,
+ pgno, &lsn, newalloc, mmeta->last_pgno)) != 0)
+ goto err;
+ } else
+ LSN_NOT_LOGGED(lsn);
+
+ hcp->hdr->dbmeta.lsn = lsn;
+
+ if (new_double && hcp->hdr->spares[logn + 1] == PGNO_INVALID) {
+ /*
+ * We need to begin a new doubling and we have not allocated
+ * any pages yet. Read the last page in and initialize it to
+ * make the allocation contiguous. The pgno we calculated
+ * above is the first page allocated. The entry in spares is
+ * that page number minus any buckets already allocated (it
+ * simplifies bucket to page translation). After we've set
+ * that, we calculate the last pgno.
+ */
+
+ pgno += hcp->hdr->max_bucket;
+
+ if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
+ DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &h)) != 0)
+ goto err;
+
+ hcp->hdr->spares[logn + 1] =
+ (pgno - new_bucket) - hcp->hdr->max_bucket;
+ mmeta->last_pgno = pgno;
+ mmeta->lsn = lsn;
+
+ P_INIT(h, dbp->pgsize,
+ pgno, PGNO_INVALID, PGNO_INVALID, 0, P_HASH);
+ }
+
+ /* Write out whatever page we ended up modifying. */
+ h->lsn = lsn;
+ if ((ret = __memp_fput(mpf, dbc->thread_info, h, dbc->priority)) != 0)
+ goto err;
+ h = NULL;
+
+ /*
+ * Update the meta-data page of this hash database.
+ */
+ hcp->hdr->max_bucket = new_bucket;
+ if (new_double) {
+ hcp->hdr->low_mask = hcp->hdr->high_mask;
+ hcp->hdr->high_mask = new_bucket | hcp->hdr->low_mask;
+ }
+
+err: if (got_meta)
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, mmeta, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+ if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
+ ret = t_ret;
+ if (h != NULL)
+ if ((t_ret = __memp_fput(mpf,
+ dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
+ ret = t_ret;
+
+ /* Relocate records to the new bucket -- after releasing metapage. */
+ if (ret == 0)
+ ret = __ham_split_page(dbc, old_bucket, new_bucket);
+ LOCK_CHECK_ON(dbc->thread_info);
+
+ return (ret);
+}
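+
+/*
+ * Worked example of the doubling arithmetic above: suppose
+ * max_bucket == 3, low_mask == 1 and high_mask == 3.  The next split
+ * creates new_bucket == 4; since max_bucket == high_mask a new doubling
+ * begins, so low_mask becomes 3 and high_mask becomes (4 | 3) == 7.
+ * Buckets 0..4 now exist, and hash values that mask to 5..7 fall back
+ * to the low mask until those buckets are split into existence.
+ */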
+
+/*
+ * PUBLIC: u_int32_t __ham_call_hash __P((DBC *, u_int8_t *, u_int32_t));
+ */
+u_int32_t
+__ham_call_hash(dbc, k, len)
+ DBC *dbc;
+ u_int8_t *k;
+ u_int32_t len;
+{
+ DB *dbp;
+ HASH *hashp;
+ HASH_CURSOR *hcp;
+ u_int32_t n, bucket;
+
+ dbp = dbc->dbp;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ hashp = dbp->h_internal;
+
+ n = (u_int32_t)(hashp->h_hash(dbp, k, len));
+
+ bucket = n & hcp->hdr->high_mask;
+ if (bucket > hcp->hdr->max_bucket)
+ bucket = bucket & hcp->hdr->low_mask;
+ return (bucket);
+}
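+
+/*
+ * Worked example of the masking above: with max_bucket == 9,
+ * high_mask == 0xf and low_mask == 0x7, a hash value of 13 yields
+ * 13 & 0xf == 13, which exceeds max_bucket, so the bucket becomes
+ * 13 & 0x7 == 5.  A hash value of 8 yields 8 & 0xf == 8 <= 9, which is
+ * used directly.
+ */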
+
+/*
+ * Check for duplicates, and call __db_ret appropriately. Release
+ * everything held by the cursor.
+ */
+static int
+__ham_dup_return(dbc, val, flags)
+ DBC *dbc;
+ DBT *val;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBT *myval, tmp_val;
+ HASH_CURSOR *hcp;
+ PAGE *pp;
+ db_indx_t ndx;
+ db_pgno_t pgno;
+ u_int32_t off, tlen;
+ u_int8_t *hk, type;
+ int cmp, ret;
+ db_indx_t len;
+
+ /* Check for duplicate and return the first one. */
+ dbp = dbc->dbp;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ ndx = H_DATAINDEX(hcp->indx);
+ type = HPAGE_TYPE(dbp, hcp->page, ndx);
+ pp = hcp->page;
+ myval = val;
+
+ /*
+ * There are 4 cases:
+ * 1. We are not in a duplicate; simply return and the upper layer
+ * will do the right thing.
+ * 2. We are looking at keys and stumbled onto a duplicate.
+ * 3. We are in the middle of a duplicate set. (ISDUP set)
+ * 4. We need to check for particular data match.
+ */
+
+ /* We should never get here with off-page dups. */
+ DB_ASSERT(dbp->env, type != H_OFFDUP);
+
+ /* Case 1 */
+ if (type != H_DUPLICATE && flags != DB_GET_BOTH &&
+ flags != DB_GET_BOTHC && flags != DB_GET_BOTH_RANGE)
+ return (0);
+
+ /*
+ * Here we check for the case where we just stumbled onto a
+ * duplicate. In this case, we do initialization and then
+ * let the normal duplicate code handle it. (Case 2)
+ */
+ if (!F_ISSET(hcp, H_ISDUP) && type == H_DUPLICATE) {
+ F_SET(hcp, H_ISDUP);
+ hcp->dup_tlen = LEN_HDATA(dbp, hcp->page,
+ hcp->hdr->dbmeta.pagesize, hcp->indx);
+ hk = H_PAIRDATA(dbp, hcp->page, hcp->indx);
+ if (flags == DB_LAST ||
+ flags == DB_PREV || flags == DB_PREV_NODUP) {
+ hcp->dup_off = 0;
+ do {
+ memcpy(&len,
+ HKEYDATA_DATA(hk) + hcp->dup_off,
+ sizeof(db_indx_t));
+ hcp->dup_off += DUP_SIZE(len);
+ } while (hcp->dup_off < hcp->dup_tlen);
+ hcp->dup_off -= DUP_SIZE(len);
+ } else {
+ memcpy(&len,
+ HKEYDATA_DATA(hk), sizeof(db_indx_t));
+ hcp->dup_off = 0;
+ }
+ hcp->dup_len = len;
+ }
+
+ /*
+ * If we are retrieving a specific key/data pair, then we
+ * may need to adjust the cursor before returning data.
+ * Case 4
+ */
+ if (flags == DB_GET_BOTH ||
+ flags == DB_GET_BOTHC || flags == DB_GET_BOTH_RANGE) {
+ if (F_ISSET(hcp, H_ISDUP)) {
+ /*
+ * If we're doing a join, search forward from the
+ * current position, not the beginning of the dup set.
+ */
+ if (flags == DB_GET_BOTHC)
+ F_SET(hcp, H_CONTINUE);
+
+ __ham_dsearch(dbc, val, &off, &cmp, flags);
+
+ /*
+ * This flag is set nowhere else and is safe to
+ * clear unconditionally.
+ */
+ F_CLR(hcp, H_CONTINUE);
+ hcp->dup_off = off;
+ } else {
+ hk = H_PAIRDATA(dbp, hcp->page, hcp->indx);
+ if (((HKEYDATA *)hk)->type == H_OFFPAGE) {
+ memcpy(&tlen,
+ HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
+ memcpy(&pgno,
+ HOFFPAGE_PGNO(hk), sizeof(db_pgno_t));
+ if ((ret = __db_moff(dbc, val, pgno, tlen,
+ dbp->dup_compare, &cmp)) != 0)
+ return (ret);
+ cmp = -cmp;
+ } else {
+ /*
+ * We do not zero tmp_val since the comparison
+ * routines may only look at data and size.
+ */
+ tmp_val.data = HKEYDATA_DATA(hk);
+ tmp_val.size = LEN_HDATA(dbp, hcp->page,
+ dbp->pgsize, hcp->indx);
+ cmp = dbp->dup_compare == NULL ?
+ __bam_defcmp(dbp, &tmp_val, val) :
+ dbp->dup_compare(dbp, &tmp_val, val);
+ }
+
+ if (cmp > 0 && flags == DB_GET_BOTH_RANGE &&
+ F_ISSET(dbp, DB_AM_DUPSORT))
+ cmp = 0;
+ }
+
+ if (cmp != 0)
+ return (DB_NOTFOUND);
+ }
+
+ /*
+ * If we've already got the data for this value, or we're doing a bulk
+ * get, we don't want to return the data.
+ */
+ if (F_ISSET(dbc, DBC_MULTIPLE | DBC_MULTIPLE_KEY) ||
+ F_ISSET(val, DB_DBT_ISSET))
+ return (0);
+
+ /*
+ * Now, everything is initialized, grab a duplicate if
+ * necessary.
+ */
+ if (F_ISSET(hcp, H_ISDUP)) { /* Case 3 */
+ /*
+ * Copy the DBT in case we are retrieving into user
+ * memory and we need the parameters for it. If the
+ * user requested a partial, then we need to adjust
+ * the user's parameters to get the partial of the
+ * duplicate which is itself a partial.
+ */
+ memcpy(&tmp_val, val, sizeof(*val));
+
+ if (F_ISSET(&tmp_val, DB_DBT_PARTIAL)) {
+ /*
+ * Take the user's length unless it would go
+ * beyond the end of the duplicate.
+ */
+ if (tmp_val.doff > hcp->dup_len)
+ tmp_val.dlen = 0;
+ else if (tmp_val.dlen + tmp_val.doff > hcp->dup_len)
+ tmp_val.dlen = hcp->dup_len - tmp_val.doff;
+
+ } else {
+ F_SET(&tmp_val, DB_DBT_PARTIAL);
+ tmp_val.dlen = hcp->dup_len;
+ tmp_val.doff = 0;
+ }
+
+ /*
+ * Set offset to the appropriate place within the
+ * current duplicate -- need to take into account
+ * both the dup_off and the current duplicate's
+ * length.
+ */
+ tmp_val.doff += hcp->dup_off + sizeof(db_indx_t);
+
+ myval = &tmp_val;
+ }
+
+ /*
+ * Finally, if we had a duplicate, pp, ndx, and myval should be
+ * set appropriately.
+ */
+ if ((ret = __db_ret(dbc, pp, ndx, myval,
+ &dbc->rdata->data, &dbc->rdata->ulen)) != 0) {
+ if (ret == DB_BUFFER_SMALL)
+ val->size = myval->size;
+ return (ret);
+ }
+
+ /*
+ * In case we sent a temporary off to db_ret, set the real
+ * return values.
+ */
+ val->data = myval->data;
+ val->size = myval->size;
+
+ F_SET(val, DB_DBT_ISSET);
+
+ return (0);
+}
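+
+/*
+ * Sketch of the Case 4 lookup above (assumed names): DB_GET_BOTH asks
+ * the cursor for an exact key/data pair, which is what drives the
+ * comparison logic in __ham_dup_return.
+ *
+ *	DBT key, data;
+ *	int ret;
+ *
+ *	... fill in both key and data with the sought pair ...
+ *	ret = dbc->get(dbc, &key, &data, DB_GET_BOTH);
+ *	if (ret == DB_NOTFOUND)
+ *		... the exact pair is not present ...
+ */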
+
+/*
+ * Overwrite a record.
+ *
+ * PUBLIC: int __ham_overwrite __P((DBC *, DBT *, u_int32_t));
+ */
+int
+__ham_overwrite(dbc, nval, flags)
+ DBC *dbc;
+ DBT *nval;
+ u_int32_t flags;
+{
+ DB *dbp;
+ DBT *myval, tmp_val, tmp_val2;
+ ENV *env;
+ HASH_CURSOR *hcp;
+ void *newrec;
+ u_int8_t *hk, *p;
+ u_int32_t len, nondup_size;
+ db_indx_t newsize;
+ int ret;
+
+ dbp = dbc->dbp;
+ env = dbp->env;
+ hcp = (HASH_CURSOR *)dbc->internal;
+ if (F_ISSET(hcp, H_ISDUP)) {
+ /*
+ * This is an overwrite of a duplicate. We should never
+ * be off-page at this point.
+ */
+ DB_ASSERT(env, hcp->opd == NULL);
+ /* On page dups */
+ if (F_ISSET(nval, DB_DBT_PARTIAL)) {
+ /*
+ * We're going to have to get the current item, then
+ * construct the record, do any padding and do a
+ * replace.
+ */
+ memset(&tmp_val, 0, sizeof(tmp_val));
+ if ((ret =
+ __ham_dup_return(dbc, &tmp_val, DB_CURRENT)) != 0)
+ return (ret);
+
+ /* Figure out new size. */
+ nondup_size = tmp_val.size;
+ newsize = nondup_size;
+
+ /*
+ * Three cases:
+ * 1. strictly append (may need to allocate space
+ * for pad bytes; really gross).
+ * 2. overwrite some and append.
+ * 3. strictly overwrite.
+ */
+ if (nval->doff > nondup_size)
+ newsize +=
+ ((nval->doff - nondup_size) + nval->size);
+ else if (nval->doff + nval->dlen > nondup_size)
+ newsize += nval->size -
+ (nondup_size - nval->doff);
+ else
+ newsize += nval->size - nval->dlen;
+
+ /*
+ * Make sure that the new size doesn't put us over
+ * the onpage duplicate size in which case we need
+ * to convert to off-page duplicates.
+ */
+ if (ISBIG(hcp,
+ (hcp->dup_tlen - nondup_size) + newsize)) {
+ if ((ret = __ham_dup_convert(dbc)) != 0)
+ return (ret);
+ return (hcp->opd->am_put(hcp->opd,
+ NULL, nval, flags, NULL));
+ }
+
+ if ((ret = __os_malloc(dbp->env,
+ DUP_SIZE(newsize), &newrec)) != 0)
+ return (ret);
+ memset(&tmp_val2, 0, sizeof(tmp_val2));
+ F_SET(&tmp_val2, DB_DBT_PARTIAL);
+
+ /* Construct the record. */
+ p = newrec;
+ /* Initial size. */
+ memcpy(p, &newsize, sizeof(db_indx_t));
+ p += sizeof(db_indx_t);
+
+ /* First part of original record. */
+ len = nval->doff > tmp_val.size
+ ? tmp_val.size : nval->doff;
+ memcpy(p, tmp_val.data, len);
+ p += len;
+
+ if (nval->doff > tmp_val.size) {
+ /* Padding */
+ memset(p, 0, nval->doff - tmp_val.size);
+ p += nval->doff - tmp_val.size;
+ }
+
+ /* New bytes */
+ memcpy(p, nval->data, nval->size);
+ p += nval->size;
+
+ /* End of original record (if there is any) */
+ if (nval->doff + nval->dlen < tmp_val.size) {
+ len = (tmp_val.size - nval->doff) - nval->dlen;
+ memcpy(p, (u_int8_t *)tmp_val.data +
+ nval->doff + nval->dlen, len);
+ p += len;
+ }
+
+ /* Final size. */
+ memcpy(p, &newsize, sizeof(db_indx_t));
+
+ /*
+ * Make sure that the caller isn't corrupting
+ * the sort order.
+ */
+ if (dbp->dup_compare != NULL) {
+ tmp_val2.data =
+ (u_int8_t *)newrec + sizeof(db_indx_t);
+ tmp_val2.size = newsize;
+ if (dbp->dup_compare(
+ dbp, &tmp_val, &tmp_val2) != 0) {
+ __os_free(env, newrec);
+ return (__db_duperr(dbp, flags));
+ }
+ }
+
+ tmp_val2.data = newrec;
+ tmp_val2.size = DUP_SIZE(newsize);
+ tmp_val2.doff = hcp->dup_off;
+ tmp_val2.dlen = DUP_SIZE(hcp->dup_len);
+
+ ret = __ham_replpair(dbc, &tmp_val2, H_DUPLICATE);
+ __os_free(env, newrec);
+
+ /* Update cursor */
+ if (ret != 0)
+ return (ret);
+
+ if (newsize > nondup_size) {
+ if ((ret = __hamc_update(dbc,
+ (newsize - nondup_size),
+ DB_HAM_CURADJ_ADDMOD, 1)) != 0)
+ return (ret);
+ hcp->dup_tlen += (newsize - nondup_size);
+ } else {
+ if ((ret = __hamc_update(dbc,
+ (nondup_size - newsize),
+ DB_HAM_CURADJ_DELMOD, 1)) != 0)
+ return (ret);
+ hcp->dup_tlen -= (nondup_size - newsize);
+ }
+ hcp->dup_len = newsize;
+ return (0);
+ } else {
+ /* Check whether we need to convert to off page. */
+ if (ISBIG(hcp,
+ (hcp->dup_tlen - hcp->dup_len) + nval->size)) {
+ if ((ret = __ham_dup_convert(dbc)) != 0)
+ return (ret);
+ return (hcp->opd->am_put(hcp->opd,
+ NULL, nval, flags, NULL));
+ }
+
+ /* Make sure we maintain sort order. */
+ if (dbp->dup_compare != NULL) {
+ tmp_val2.data =
+ HKEYDATA_DATA(H_PAIRDATA(dbp, hcp->page,
+ hcp->indx)) + hcp->dup_off +
+ sizeof(db_indx_t);
+ tmp_val2.size = hcp->dup_len;
+ if (dbp->dup_compare(
+ dbp, nval, &tmp_val2) != 0) {
+ __db_errx(env, DB_STR("1131",
+ "Existing data sorts differently from put data"));
+ return (EINVAL);
+ }
+ }
+ /* Overwriting a complete duplicate. */
+ if ((ret =
+ __ham_make_dup(dbp->env, nval, &tmp_val,
+ &dbc->my_rdata.data, &dbc->my_rdata.ulen)) != 0)
+ return (ret);
+ /* Now fix what we are replacing. */
+ tmp_val.doff = hcp->dup_off;
+ tmp_val.dlen = DUP_SIZE(hcp->dup_len);
+
+ /* Update cursor */
+ if (nval->size > hcp->dup_len) {
+ if ((ret = __hamc_update(dbc,
+ (nval->size - hcp->dup_len),
+ DB_HAM_CURADJ_ADDMOD, 1)) != 0)
+ return (ret);
+ hcp->dup_tlen += (nval->size - hcp->dup_len);
+ } else {
+ if ((ret = __hamc_update(dbc,
+ (hcp->dup_len - nval->size),
+ DB_HAM_CURADJ_DELMOD, 1)) != 0)
+ return (ret);
+ hcp->dup_tlen -= (hcp->dup_len - nval->size);
+ }
+ hcp->dup_len = (db_indx_t)nval->size;
+ }
+ myval = &tmp_val;
+ } else if (!F_ISSET(nval, DB_DBT_PARTIAL)) {
+ /* Put/overwrite */
+ memcpy(&tmp_val, nval, sizeof(*nval));
+ F_SET(&tmp_val, DB_DBT_PARTIAL);
+ tmp_val.doff = 0;
+ hk = H_PAIRDATA(dbp, hcp->page, hcp->indx);
+ if (HPAGE_PTYPE(hk) == H_OFFPAGE)
+ memcpy(&tmp_val.dlen,
+ HOFFPAGE_TLEN(hk), sizeof(u_int32_t));
+ else
+ tmp_val.dlen = LEN_HDATA(dbp, hcp->page,
+ hcp->hdr->dbmeta.pagesize, hcp->indx);
+ myval = &tmp_val;
+ } else
+ /* Regular partial put */
+ myval = nval;
+
+ return (__ham_replpair(dbc, myval,
+ F_ISSET(hcp, H_ISDUP) ? H_DUPLICATE : H_KEYDATA));
+}
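+
+/*
+ * Usage sketch of the partial-put handling above (assumed names): a DBT
+ * flagged DB_DBT_PARTIAL replaces dlen bytes at offset doff of the
+ * stored item with the DBT's size bytes, zero-padding if doff lies past
+ * the end of the existing record.  Here 4 bytes at offset 10 are
+ * replaced with the 4 bytes "WXYZ".
+ *
+ *	DBT key, data;
+ *	int ret;
+ *
+ *	... fill in key ...
+ *	memset(&data, 0, sizeof(data));
+ *	data.flags = DB_DBT_PARTIAL;
+ *	data.doff = 10;
+ *	data.dlen = 4;
+ *	data.data = "WXYZ";
+ *	data.size = 4;
+ *	ret = dbp->put(dbp, NULL, &key, &data, 0);
+ */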
+
+/*
+ * Given a key and a cursor, sets the cursor to the page/ndx on which
+ * the key resides. If the key is found, the cursor H_OK flag is set
+ * and the pagep, bndx, pgno (dpagep, dndx, dpgno) fields are set.
+ * If the key is not found, the H_OK flag is not set. If the sought
+ * field is non-0, the pagep, bndx, pgno (dpagep, dndx, dpgno) fields
+ * are set indicating where an add might take place. If it is 0,
+ * none of the cursor pointer fields are valid.
+ * PUBLIC: int __ham_lookup __P((DBC *,
+ * PUBLIC: const DBT *, u_int32_t, db_lockmode_t, db_pgno_t *));
+ */
+int
+__ham_lookup(dbc, key, sought, mode, pgnop)
+ DBC *dbc;
+ const DBT *key;
+ u_int32_t sought;
+ db_lockmode_t mode;
+ db_pgno_t *pgnop;
+{
+ DB *dbp;
+ HASH_CURSOR *hcp;
+ db_pgno_t next_pgno;
+ int match, ret;
+ u_int8_t *dk;
+
+ dbp = dbc->dbp;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ /*
+ * Set up cursor so that we're looking for space to add an item
+ * as we cycle through the pages looking for the key.
+ */
+ if ((ret = __ham_item_reset(dbc)) != 0)
+ return (ret);
+ hcp->seek_size = sought;
+
+ hcp->bucket = __ham_call_hash(dbc, (u_int8_t *)key->data, key->size);
+ hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
+ /* Look through all pages in the bucket for the key. */
+ if ((ret = __ham_get_cpage(dbc, mode)) != 0)
+ return (ret);
+
+ *pgnop = PGNO_INVALID;
+ if (hcp->indx == NDX_INVALID) {
+ hcp->indx = 0;
+ F_CLR(hcp, H_ISDUP);
+ }
+ while (hcp->pgno != PGNO_INVALID) {
+ /* Are we looking for space to insert an item? */
+ if (hcp->seek_size != 0 &&
+ hcp->seek_found_page == PGNO_INVALID &&
+ hcp->seek_size < P_FREESPACE(dbp, hcp->page)) {
+ hcp->seek_found_page = hcp->pgno;
+ hcp->seek_found_indx = NDX_INVALID;
+ }
+
+ if ((ret = __ham_getindex(dbc, hcp->page, key,
+ H_KEYDATA, &match, &hcp->indx)) != 0)
+ return (ret);
+
+ /*
+ * If this is the first page in the bucket with space for
+ * inserting the requested item, store the insert index to
+ * save having to look it up again later.
+ */
+ if (hcp->seek_found_page == hcp->pgno)
+ hcp->seek_found_indx = hcp->indx;
+
+ if (match == 0) {
+ F_SET(hcp, H_OK);
+ dk = H_PAIRDATA(dbp, hcp->page, hcp->indx);
+ if (HPAGE_PTYPE(dk) == H_OFFDUP)
+ memcpy(pgnop, HOFFDUP_PGNO(dk),
+ sizeof(db_pgno_t));
+ return (0);
+ }
+
+ /* move the cursor to the next page. */
+ if (NEXT_PGNO(hcp->page) == PGNO_INVALID)
+ break;
+ next_pgno = NEXT_PGNO(hcp->page);
+ hcp->indx = 0;
+ if ((ret = __ham_next_cpage(dbc, next_pgno)) != 0)
+ return (ret);
+ }
+ F_SET(hcp, H_NOMORE);
+ return (DB_NOTFOUND);
+}
+
+/*
+ * __ham_init_dbt --
+ * Initialize a dbt using some possibly already allocated storage
+ * for items.
+ *
+ * PUBLIC: int __ham_init_dbt __P((ENV *,
+ * PUBLIC: DBT *, u_int32_t, void **, u_int32_t *));
+ */
+int
+__ham_init_dbt(env, dbt, size, bufp, sizep)
+ ENV *env;
+ DBT *dbt;
+ u_int32_t size;
+ void **bufp;
+ u_int32_t *sizep;
+{
+ int ret;
+
+ memset(dbt, 0, sizeof(*dbt));
+ if (*sizep < size) {
+ if ((ret = __os_realloc(env, size, bufp)) != 0) {
+ *sizep = 0;
+ return (ret);
+ }
+ *sizep = size;
+ }
+ dbt->data = *bufp;
+ dbt->size = size;
+ return (0);
+}
+
+/*
+ * Adjust the cursor after an insert or delete. The cursor passed is
+ * the one that was operated upon; we just need to check any of the
+ * others.
+ *
+ * len indicates the length of the item added/deleted
+ * add indicates if the item indicated by the cursor has just been
+ * added (add == 1) or deleted (add == 0).
+ * dup indicates if the addition occurred into a duplicate set.
+ *
+ * PUBLIC: int __hamc_update
+ * PUBLIC: __P((DBC *, u_int32_t, db_ham_curadj, int));
+ */
+static int
+__hamc_update_getorder(cp, dbc, orderp, pgno, is_dup, args)
+ DBC *dbc, *cp;
+ u_int32_t *orderp;
+ db_pgno_t pgno;
+ u_int32_t is_dup;
+ void *args;
+{
+ HASH_CURSOR *hcp, *lcp;
+
+ COMPQUIET(args, NULL);
+ COMPQUIET(pgno, 0);
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ if (cp == dbc || cp->dbtype != DB_HASH)
+ return (0);
+ lcp = (HASH_CURSOR *)cp->internal;
+ if (F_ISSET(lcp, H_DELETED) &&
+ hcp->pgno == lcp->pgno &&
+ hcp->indx == lcp->indx &&
+ *orderp < lcp->order &&
+ (!is_dup || hcp->dup_off == lcp->dup_off) &&
+ !MVCC_SKIP_CURADJ(cp, lcp->pgno))
+ *orderp = lcp->order;
+ return (0);
+}
+struct __hamc_update_setorder_args {
+ int was_mod, was_add;
+ u_int32_t len, order;
+ DB_TXN *my_txn;
+};
+
+static int
+__hamc_update_setorder(cp, dbc, foundp, pgno, is_dup, vargs)
+ DBC *dbc, *cp;
+ u_int32_t *foundp;
+ db_pgno_t pgno;
+ u_int32_t is_dup;
+ void *vargs;
+{
+ HASH_CURSOR *hcp, *lcp;
+ struct __hamc_update_setorder_args *args;
+
+ COMPQUIET(pgno, 0);
+
+ if (cp == dbc || cp->dbtype != DB_HASH)
+ return (0);
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ lcp = (HASH_CURSOR *)cp->internal;
+
+ if (lcp->pgno != hcp->pgno ||
+ lcp->indx == NDX_INVALID ||
+ MVCC_SKIP_CURADJ(cp, lcp->pgno))
+ return (0);
+
+ args = vargs;
+ /*
+ * We're about to move things out from under this
+ * cursor. Clear any cached streaming information.
+ */
+ lcp->stream_start_pgno = PGNO_INVALID;
+
+ if (args->my_txn != NULL && cp->txn != args->my_txn)
+ *foundp = 1;
+
+ if (!is_dup) {
+ if (args->was_add == 1) {
+ /*
+ * This routine is not called to add
+ * non-dup records which are always put
+ * at the end. It is only called from
+ * recovery in this case and the
+ * cursor will be marked deleted.
+ * We are "undeleting" so unmark all
+ * cursors with the same order.
+ */
+ if (lcp->indx == hcp->indx &&
+ F_ISSET(lcp, H_DELETED)) {
+ if (lcp->order == hcp->order)
+ F_CLR(lcp, H_DELETED);
+ else if (lcp->order >
+ hcp->order) {
+
+ /*
+ * If we've moved this cursor's
+ * index, split its order
+ * number--i.e., decrement it by
+ * enough so that the lowest
+ * cursor moved has order 1.
+ * hcp->order is the split
+ * point, so decrement by it.
+ */
+ lcp->order -=
+ hcp->order;
+ lcp->indx += 2;
+ }
+ } else if (lcp->indx >= hcp->indx)
+ lcp->indx += 2;
+ } else {
+ if (lcp->indx > hcp->indx) {
+ lcp->indx -= 2;
+ if (lcp->indx == hcp->indx &&
+ F_ISSET(lcp, H_DELETED))
+ lcp->order += args->order;
+ } else if (lcp->indx == hcp->indx &&
+ !F_ISSET(lcp, H_DELETED)) {
+ F_SET(lcp, H_DELETED);
+ F_CLR(lcp, H_ISDUP);
+ lcp->order = args->order;
+ }
+ }
+ } else if (lcp->indx == hcp->indx) {
+ /*
+ * Handle duplicates. This routine is only
+ * called for on page dups. Off page dups are
+ * handled by btree/rtree code.
+ */
+ if (args->was_add == 1) {
+ lcp->dup_tlen += args->len;
+ if (lcp->dup_off == hcp->dup_off &&
+ F_ISSET(hcp, H_DELETED) &&
+ F_ISSET(lcp, H_DELETED)) {
+ /* Abort of a delete. */
+ if (lcp->order == hcp->order)
+ F_CLR(lcp, H_DELETED);
+ else if (lcp->order >
+ hcp->order) {
+ lcp->order -=
+ (hcp->order - 1);
+ lcp->dup_off += args->len;
+ }
+ } else if (lcp->dup_off >
+ hcp->dup_off || (!args->was_mod &&
+ lcp->dup_off == hcp->dup_off))
+ lcp->dup_off += args->len;
+ } else {
+ lcp->dup_tlen -= args->len;
+ if (lcp->dup_off > hcp->dup_off) {
+ lcp->dup_off -= args->len;
+ if (lcp->dup_off ==
+ hcp->dup_off &&
+ F_ISSET(lcp, H_DELETED))
+ lcp->order += args->order;
+ } else if (!args->was_mod &&
+ lcp->dup_off == hcp->dup_off &&
+ !F_ISSET(lcp, H_DELETED)) {
+ F_SET(lcp, H_DELETED);
+ lcp->order = args->order;
+ }
+ }
+ }
+ return (0);
+}
+
+int
+__hamc_update(dbc, len, operation, is_dup)
+ DBC *dbc;
+ u_int32_t len;
+ db_ham_curadj operation;
+ int is_dup;
+{
+ DB *dbp;
+ DB_LSN lsn;
+ HASH_CURSOR *hcp;
+ int ret;
+ u_int32_t found;
+ struct __hamc_update_setorder_args args;
+
+ dbp = dbc->dbp;
+ hcp = (HASH_CURSOR *)dbc->internal;
+
+ /*
+ * Adjustment will only be logged if this is a subtransaction.
+ * Only subtransactions can abort and effect their parent
+ * transactions cursors.
+ */
+
+ args.my_txn = IS_SUBTRANSACTION(dbc->txn) ? dbc->txn : NULL;
+ args.len = len;
+
+ switch (operation) {
+ case DB_HAM_CURADJ_DEL:
+ args.was_mod = 0;
+ args.was_add = 0;
+ break;
+ case DB_HAM_CURADJ_ADD:
+ args.was_mod = 0;
+ args.was_add = 1;
+ break;
+ case DB_HAM_CURADJ_DELMOD:
+ args.was_mod = 1;
+ args.was_add = 0;
+ break;
+ case DB_HAM_CURADJ_ADDMOD:
+ args.was_mod = 1;
+ args.was_add = 1;
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ /*
+ * Calculate the order of this deleted record.
+ * This will be one greater than any cursor that is pointing
+ * at this record and already marked as deleted.
+ */
+ if (args.was_add == 0) {
+ if ((ret = __db_walk_cursors(dbp, dbc, __hamc_update_getorder,
+ &args.order, 0, (u_int32_t)is_dup, NULL)) != 0)
+ return (ret);
+ args.order++;
+ hcp->order = args.order;
+ }
+
+ if ((ret = __db_walk_cursors(dbp, dbc,
+ __hamc_update_setorder, &found, 0, (u_int32_t)is_dup, &args)) != 0)
+ return (ret);
+
+ if (found != 0 && DBC_LOGGING(dbc)) {
+ if ((ret = __ham_curadj_log(dbp, args.my_txn, &lsn, 0,
+ hcp->pgno, hcp->indx, len, hcp->dup_off,
+ (int)operation, is_dup, args.order)) != 0)
+ return (ret);
+ }
+
+ return (0);
+}
+
+struct __ham_get_clist_args {
+ u_int nalloc, nused;
+ DBC **listp;
+};
+
+static int
+__ham_get_clist_func(dbc, my_dbc, countp, pgno, indx, vargs)
+ DBC *dbc, *my_dbc;
+ u_int32_t *countp;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ void *vargs;
+{
+ int ret;
+ struct __ham_get_clist_args *args;
+
+ COMPQUIET(my_dbc, NULL);
+ COMPQUIET(countp, NULL);
+ args = vargs;
+ /*
+ * We match if dbc->pgno matches the specified
+ * pgno, and if either the dbc->indx matches
+ * or we weren't given an index.
+ */
+ if (dbc->internal->pgno == pgno &&
+ (indx == NDX_INVALID ||
+ dbc->internal->indx == indx) &&
+ !MVCC_SKIP_CURADJ(dbc, pgno)) {
+ if (args->nused >= args->nalloc) {
+ args->nalloc += 10;
+ if ((ret = __os_realloc(dbc->dbp->env,
+ args->nalloc * sizeof(HASH_CURSOR *),
+ &args->listp)) != 0)
+ return (ret);
+ }
+ args->listp[args->nused++] = dbc;
+ }
+ return (0);
+}
+/*
+ * __ham_get_clist --
+ *
+ * Get a list of cursors either on a particular bucket or on a particular
+ * page and index combination. The former is so that we can update
+ * cursors on a split. The latter is so we can update cursors when we
+ * move items off page.
+ *
+ * PUBLIC: int __ham_get_clist __P((DB *, db_pgno_t, u_int32_t, DBC ***));
+ */
+int
+__ham_get_clist(dbp, pgno, indx, listp)
+ DB *dbp;
+ db_pgno_t pgno;
+ u_int32_t indx;
+ DBC ***listp;
+{
+ ENV *env;
+ int ret;
+ u_int32_t count;
+ struct __ham_get_clist_args args;
+
+ env = dbp->env;
+ args.listp = NULL;
+ args.nalloc = args.nused = 0;
+
+ if ((ret = __db_walk_cursors(dbp, NULL,
+ __ham_get_clist_func, &count, pgno, indx, &args)) != 0)
+ return (ret);
+ if (args.listp != NULL) {
+ if (args.nused >= args.nalloc) {
+ args.nalloc++;
+ if ((ret = __os_realloc(env,
+ args.nalloc * sizeof(HASH_CURSOR *),
+ &args.listp)) != 0)
+ return (ret);
+ }
+ args.listp[args.nused] = NULL;
+ }
+ *listp = args.listp;
+ return (0);
+}
+
+static int
+__hamc_writelock(dbc)
+ DBC *dbc;
+{
+ DB_LOCK tmp_lock;
+ HASH_CURSOR *hcp;
+ int ret;
+
+ /*
+ * All we need do is acquire the lock and let the off-page
+ * dup tree do its thing.
+ */
+ if (!STD_LOCKING(dbc))
+ return (0);
+
+ hcp = (HASH_CURSOR *)dbc->internal;
+ ret = 0;
+ if (!LOCK_ISSET(hcp->lock) || hcp->lock_mode != DB_LOCK_WRITE) {
+ tmp_lock = hcp->lock;
+ if ((ret = __ham_lock_bucket(dbc, DB_LOCK_WRITE)) == 0 &&
+ tmp_lock.mode != DB_LOCK_WWRITE)
+ ret = __LPUT(dbc, tmp_lock);
+ }
+ return (ret);
+}