author | unknown <tim@threads.polyesthetic.msg> | 2001-03-04 19:42:05 -0500 |
---|---|---|
committer | unknown <tim@threads.polyesthetic.msg> | 2001-03-04 19:42:05 -0500 |
commit | 07dc15a5b0fafaf0a0bcde2768b34aad2f3825fa (patch) | |
tree | 9dd732e08dba156ee3d7635caedc0dc3107ecac6 /bdb/hash | |
parent | 542e1c18dc5bf80665df55ffa04a48d986945259 (diff) | |
download | mariadb-git-07dc15a5b0fafaf0a0bcde2768b34aad2f3825fa.tar.gz | |
Import changeset
Diffstat (limited to 'bdb/hash')
-rw-r--r-- | bdb/hash/hash.c | 2096 |
-rw-r--r-- | bdb/hash/hash.src | 361 |
-rw-r--r-- | bdb/hash/hash_auto.c | 2023 |
-rw-r--r-- | bdb/hash/hash_conv.c | 112 |
-rw-r--r-- | bdb/hash/hash_dup.c | 805 |
-rw-r--r-- | bdb/hash/hash_func.c | 242 |
-rw-r--r-- | bdb/hash/hash_meta.c | 121 |
-rw-r--r-- | bdb/hash/hash_method.c | 126 |
-rw-r--r-- | bdb/hash/hash_page.c | 1655 |
-rw-r--r-- | bdb/hash/hash_rec.c | 1078 |
-rw-r--r-- | bdb/hash/hash_reclaim.c | 68 |
-rw-r--r-- | bdb/hash/hash_stat.c | 329 |
-rw-r--r-- | bdb/hash/hash_upgrade.c | 271 |
-rw-r--r-- | bdb/hash/hash_verify.c | 1051 |
14 files changed, 10338 insertions, 0 deletions
diff --git a/bdb/hash/hash.c b/bdb/hash/hash.c new file mode 100644 index 00000000000..e96fd4898f0 --- /dev/null +++ b/bdb/hash/hash.c @@ -0,0 +1,2096 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994 + * Margo Seltzer. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: hash.c,v 11.94 2001/01/03 16:42:26 ubell Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_am.h" +#include "db_ext.h" +#include "db_shash.h" +#include "db_swap.h" +#include "hash.h" +#include "btree.h" +#include "log.h" +#include "lock.h" +#include "txn.h" + +static int __ham_c_close __P((DBC *, db_pgno_t, int *)); +static int __ham_c_del __P((DBC *)); +static int __ham_c_destroy __P((DBC *)); +static int __ham_c_get __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *)); +static int __ham_c_put __P((DBC *, DBT *, DBT *, u_int32_t, db_pgno_t *)); +static int __ham_c_writelock __P((DBC *)); +static int __ham_del_dups __P((DBC *, DBT *)); +static int __ham_delete __P((DB *, DB_TXN *, DBT *, u_int32_t)); +static int __ham_dup_return __P((DBC *, DBT *, u_int32_t)); +static int __ham_expand_table __P((DBC *)); +static int __ham_init_htab __P((DBC *, + const char *, db_pgno_t, u_int32_t, u_int32_t)); +static int __ham_lookup __P((DBC *, + const DBT *, u_int32_t, db_lockmode_t, db_pgno_t *)); +static int __ham_overwrite __P((DBC *, DBT *, u_int32_t)); + +/* + * __ham_metachk -- + * + * PUBLIC: int __ham_metachk __P((DB *, const char *, HMETA *)); + */ +int +__ham_metachk(dbp, name, hashm) + DB *dbp; + const char *name; + HMETA *hashm; +{ + DB_ENV *dbenv; + u_int32_t vers; + int ret; + + dbenv = dbp->dbenv; + + /* + * At this point, all we know is that the magic number is for a Hash. + * Check the version, the database may be out of date. + */ + vers = hashm->dbmeta.version; + if (F_ISSET(dbp, DB_AM_SWAP)) + M_32_SWAP(vers); + switch (vers) { + case 4: + case 5: + case 6: + __db_err(dbenv, + "%s: hash version %lu requires a version upgrade", + name, (u_long)vers); + return (DB_OLD_VERSION); + case 7: + break; + default: + __db_err(dbenv, + "%s: unsupported hash version: %lu", name, (u_long)vers); + return (EINVAL); + } + + /* Swap the page if we need to. */ + if (F_ISSET(dbp, DB_AM_SWAP) && (ret = __ham_mswap((PAGE *)hashm)) != 0) + return (ret); + + /* Check the type. */ + if (dbp->type != DB_HASH && dbp->type != DB_UNKNOWN) + return (EINVAL); + dbp->type = DB_HASH; + DB_ILLEGAL_METHOD(dbp, DB_OK_HASH); + + /* + * Check application info against metadata info, and set info, flags, + * and type based on metadata info. + */ + if ((ret = __db_fchk(dbenv, + "DB->open", hashm->dbmeta.flags, + DB_HASH_DUP | DB_HASH_SUBDB | DB_HASH_DUPSORT)) != 0) + return (ret); + + if (F_ISSET(&hashm->dbmeta, DB_HASH_DUP)) + F_SET(dbp, DB_AM_DUP); + else + if (F_ISSET(dbp, DB_AM_DUP)) { + __db_err(dbenv, + "%s: DB_DUP specified to open method but not set in database", + name); + return (EINVAL); + } + + if (F_ISSET(&hashm->dbmeta, DB_HASH_SUBDB)) + F_SET(dbp, DB_AM_SUBDB); + else + if (F_ISSET(dbp, DB_AM_SUBDB)) { + __db_err(dbenv, + "%s: multiple databases specified but not supported in file", + name); + return (EINVAL); + } + + if (F_ISSET(&hashm->dbmeta, DB_HASH_DUPSORT)) { + if (dbp->dup_compare == NULL) + dbp->dup_compare = __bam_defcmp; + } else + if (dbp->dup_compare != NULL) { + __db_err(dbenv, + "%s: duplicate sort function specified but not set in database", + name); + return (EINVAL); + } + + /* Set the page size. */ + dbp->pgsize = hashm->dbmeta.pagesize; + + /* Copy the file's ID. 
*/ + memcpy(dbp->fileid, hashm->dbmeta.uid, DB_FILE_ID_LEN); + + return (0); +} + +/* + * __ham_open -- + * + * PUBLIC: int __ham_open __P((DB *, const char *, db_pgno_t, u_int32_t)); + */ +int +__ham_open(dbp, name, base_pgno, flags) + DB *dbp; + const char *name; + db_pgno_t base_pgno; + u_int32_t flags; +{ + DB_ENV *dbenv; + DBC *dbc; + HASH_CURSOR *hcp; + HASH *hashp; + int need_sync, ret, t_ret; + + dbc = NULL; + dbenv = dbp->dbenv; + need_sync = 0; + + /* Initialize the remaining fields/methods of the DB. */ + dbp->del = __ham_delete; + dbp->stat = __ham_stat; + + /* + * Get a cursor. If DB_CREATE is specified, we may be creating + * pages, and to do that safely in CDB we need a write cursor. + * In STD_LOCKING mode, we'll synchronize using the meta page + * lock instead. + */ + if ((ret = dbp->cursor(dbp, + dbp->open_txn, &dbc, LF_ISSET(DB_CREATE) && CDB_LOCKING(dbenv) ? + DB_WRITECURSOR : 0)) != 0) + return (ret); + + hcp = (HASH_CURSOR *)dbc->internal; + hashp = dbp->h_internal; + hashp->meta_pgno = base_pgno; + if ((ret = __ham_get_meta(dbc)) != 0) + goto err1; + + /* + * If this is a new file, initialize it, and put it back dirty. + * + * Initialize the hdr structure. + */ + if (hcp->hdr->dbmeta.magic == DB_HASHMAGIC) { + /* File exists, verify the data in the header. */ + if (hashp->h_hash == NULL) + hashp->h_hash = hcp->hdr->dbmeta.version < 5 + ? __ham_func4 : __ham_func5; + if (!F_ISSET(dbp, DB_RDONLY) && + hashp->h_hash(dbp, + CHARKEY, sizeof(CHARKEY)) != hcp->hdr->h_charkey) { + __db_err(dbp->dbenv, + "hash: incompatible hash function"); + ret = EINVAL; + goto err2; + } + if (F_ISSET(&hcp->hdr->dbmeta, DB_HASH_DUP)) + F_SET(dbp, DB_AM_DUP); + if (F_ISSET(&hcp->hdr->dbmeta, DB_HASH_DUPSORT)) + F_SET(dbp, DB_AM_DUPSORT); + if (F_ISSET(&hcp->hdr->dbmeta, DB_HASH_SUBDB)) + F_SET(dbp, DB_AM_SUBDB); + } else if (!IS_RECOVERING(dbenv)) { + /* + * File does not exist, we must initialize the header. If + * locking is enabled that means getting a write lock first. + * During recovery the meta page will be in the log. + */ + dbc->lock.pgno = base_pgno; + + if (STD_LOCKING(dbc) && + ((ret = lock_put(dbenv, &hcp->hlock)) != 0 || + (ret = lock_get(dbenv, dbc->locker, + DB_NONBLOCK(dbc) ? DB_LOCK_NOWAIT : 0, + &dbc->lock_dbt, DB_LOCK_WRITE, &hcp->hlock)) != 0)) + goto err2; + else if (CDB_LOCKING(dbp->dbenv)) { + DB_ASSERT(LF_ISSET(DB_CREATE)); + if ((ret = lock_get(dbenv, dbc->locker, + DB_LOCK_UPGRADE, &dbc->lock_dbt, DB_LOCK_WRITE, + &dbc->mylock)) != 0) + goto err2; + } + if ((ret = __ham_init_htab(dbc, name, + base_pgno, hashp->h_nelem, hashp->h_ffactor)) != 0) + goto err2; + + need_sync = 1; + } + +err2: /* Release the meta data page */ + if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0) + ret = t_ret; +err1: if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) + ret = t_ret; + + /* Sync the file so that we know that the meta data goes to disk. 
*/ + if (ret == 0 && need_sync) + ret = dbp->sync(dbp, 0); +#if CONFIG_TEST + if (ret == 0) + DB_TEST_RECOVERY(dbp, DB_TEST_POSTSYNC, ret, name); + +DB_TEST_RECOVERY_LABEL +#endif + if (ret != 0) + (void)__ham_db_close(dbp); + + return (ret); +} + +/************************** LOCAL CREATION ROUTINES **********************/ +/* + * Returns 0 on No Error + */ +static int +__ham_init_htab(dbc, name, pgno, nelem, ffactor) + DBC *dbc; + const char *name; + db_pgno_t pgno; + u_int32_t nelem, ffactor; +{ + DB *dbp; + DB_LOCK metalock; + DB_LSN orig_lsn; + DBMETA *mmeta; + HASH_CURSOR *hcp; + HASH *hashp; + PAGE *h; + db_pgno_t mpgno; + int32_t l2, nbuckets; + int dirty_mmeta, i, ret, t_ret; + + hcp = (HASH_CURSOR *)dbc->internal; + dbp = dbc->dbp; + hashp = dbp->h_internal; + mmeta = NULL; + h = NULL; + ret = 0; + dirty_mmeta = 0; + metalock.off = LOCK_INVALID; + + if (hashp->h_hash == NULL) + hashp->h_hash = DB_HASHVERSION < 5 ? __ham_func4 : __ham_func5; + + if (nelem != 0 && ffactor != 0) { + nelem = (nelem - 1) / ffactor + 1; + l2 = __db_log2(nelem > 2 ? nelem : 2); + } else + l2 = 1; + nbuckets = 1 << l2; + + orig_lsn = hcp->hdr->dbmeta.lsn; + memset(hcp->hdr, 0, sizeof(HMETA)); + ZERO_LSN(hcp->hdr->dbmeta.lsn); + hcp->hdr->dbmeta.pgno = pgno; + hcp->hdr->dbmeta.magic = DB_HASHMAGIC; + hcp->hdr->dbmeta.version = DB_HASHVERSION; + hcp->hdr->dbmeta.pagesize = dbp->pgsize; + hcp->hdr->dbmeta.type = P_HASHMETA; + hcp->hdr->dbmeta.free = PGNO_INVALID; + hcp->hdr->max_bucket = hcp->hdr->high_mask = nbuckets - 1; + hcp->hdr->low_mask = (nbuckets >> 1) - 1; + hcp->hdr->ffactor = ffactor; + hcp->hdr->h_charkey = hashp->h_hash(dbp, CHARKEY, sizeof(CHARKEY)); + memcpy(hcp->hdr->dbmeta.uid, dbp->fileid, DB_FILE_ID_LEN); + + if (F_ISSET(dbp, DB_AM_DUP)) + F_SET(&hcp->hdr->dbmeta, DB_HASH_DUP); + if (F_ISSET(dbp, DB_AM_SUBDB)) + F_SET(&hcp->hdr->dbmeta, DB_HASH_SUBDB); + if (dbp->dup_compare != NULL) + F_SET(&hcp->hdr->dbmeta, DB_HASH_DUPSORT); + + if ((ret = memp_fset(dbp->mpf, hcp->hdr, DB_MPOOL_DIRTY)) != 0) + goto err; + + /* + * Create the first and second buckets pages so that we have the + * page numbers for them and we can store that page number + * in the meta-data header (spares[0]). + */ + hcp->hdr->spares[0] = nbuckets; + if ((ret = memp_fget(dbp->mpf, + &hcp->hdr->spares[0], DB_MPOOL_NEW_GROUP, &h)) != 0) + goto err; + + P_INIT(h, dbp->pgsize, hcp->hdr->spares[0], PGNO_INVALID, + PGNO_INVALID, 0, P_HASH); + + /* Fill in the last fields of the meta data page. */ + hcp->hdr->spares[0] -= (nbuckets - 1); + for (i = 1; i <= l2; i++) + hcp->hdr->spares[i] = hcp->hdr->spares[0]; + for (; i < NCACHED; i++) + hcp->hdr->spares[i] = PGNO_INVALID; + + /* + * Before we are about to put any dirty pages, we need to log + * the meta-data page create. + */ + ret = __db_log_page(dbp, name, &orig_lsn, pgno, (PAGE *)hcp->hdr); + + if (dbp->open_txn != NULL) { + mmeta = (DBMETA *) hcp->hdr; + if (F_ISSET(dbp, DB_AM_SUBDB)) { + + /* + * If this is a subdatabase, then we need to + * get the LSN off the master meta data page + * because that's where free pages are linked + * and during recovery we need to access + * that page and roll it backward/forward + * correctly with respect to LSN. 
+ */ + mpgno = PGNO_BASE_MD; + if ((ret = __db_lget(dbc, + 0, mpgno, DB_LOCK_WRITE, 0, &metalock)) != 0) + goto err; + if ((ret = memp_fget(dbp->mpf, + &mpgno, 0, (PAGE **)&mmeta)) != 0) + goto err; + } + if ((t_ret = __ham_groupalloc_log(dbp->dbenv, + dbp->open_txn, &LSN(mmeta), 0, dbp->log_fileid, + &LSN(mmeta), hcp->hdr->spares[0], + hcp->hdr->max_bucket + 1, mmeta->free)) != 0 && ret == 0) + ret = t_ret; + if (ret == 0) { + /* need to update real LSN for buffer manager */ + dirty_mmeta = 1; + } + + } + + DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOG, ret, name); + +DB_TEST_RECOVERY_LABEL +err: if (h != NULL && + (t_ret = memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY)) != 0 && ret == 0) + ret = t_ret; + + if (F_ISSET(dbp, DB_AM_SUBDB) && mmeta != NULL) + if ((t_ret = memp_fput(dbp->mpf, mmeta, + dirty_mmeta ? DB_MPOOL_DIRTY : 0)) != 0 && ret == 0) + ret = t_ret; + if (metalock.off != LOCK_INVALID) + (void)__TLPUT(dbc, metalock); + + return (ret); +} + +static int +__ham_delete(dbp, txn, key, flags) + DB *dbp; + DB_TXN *txn; + DBT *key; + u_int32_t flags; +{ + DBC *dbc; + HASH_CURSOR *hcp; + db_pgno_t pgno; + int ret, t_ret; + + /* + * This is the only access method routine called directly from + * the dbp, so we have to do error checking. + */ + + PANIC_CHECK(dbp->dbenv); + DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->del"); + DB_CHECK_TXN(dbp, txn); + + if ((ret = + __db_delchk(dbp, key, flags, F_ISSET(dbp, DB_AM_RDONLY))) != 0) + return (ret); + + if ((ret = dbp->cursor(dbp, txn, &dbc, DB_WRITELOCK)) != 0) + return (ret); + + DEBUG_LWRITE(dbc, txn, "ham_delete", key, NULL, flags); + + hcp = (HASH_CURSOR *)dbc->internal; + if ((ret = __ham_get_meta(dbc)) != 0) + goto out; + + pgno = PGNO_INVALID; + if ((ret = __ham_lookup(dbc, key, 0, DB_LOCK_WRITE, &pgno)) == 0) { + if (F_ISSET(hcp, H_OK)) { + if (pgno == PGNO_INVALID) + ret = __ham_del_pair(dbc, 1); + else { + /* When we close the cursor in __ham_del_dups, + * that will make the off-page dup tree go + * go away as well as our current entry. When + * it updates cursors, ours should get marked + * as H_DELETED. + */ + ret = __ham_del_dups(dbc, key); + } + } else + ret = DB_NOTFOUND; + } + + if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0) + ret = t_ret; + +out: if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} + +/* ****************** CURSORS ********************************** */ +/* + * __ham_c_init -- + * Initialize the hash-specific portion of a cursor. + * + * PUBLIC: int __ham_c_init __P((DBC *)); + */ +int +__ham_c_init(dbc) + DBC *dbc; +{ + DB_ENV *dbenv; + HASH_CURSOR *new_curs; + int ret; + + dbenv = dbc->dbp->dbenv; + if ((ret = __os_calloc(dbenv, + 1, sizeof(struct cursor_t), &new_curs)) != 0) + return (ret); + if ((ret = __os_malloc(dbenv, + dbc->dbp->pgsize, NULL, &new_curs->split_buf)) != 0) { + __os_free(new_curs, sizeof(*new_curs)); + return (ret); + } + + dbc->internal = (DBC_INTERNAL *) new_curs; + dbc->c_close = __db_c_close; + dbc->c_count = __db_c_count; + dbc->c_del = __db_c_del; + dbc->c_dup = __db_c_dup; + dbc->c_get = __db_c_get; + dbc->c_put = __db_c_put; + dbc->c_am_close = __ham_c_close; + dbc->c_am_del = __ham_c_del; + dbc->c_am_destroy = __ham_c_destroy; + dbc->c_am_get = __ham_c_get; + dbc->c_am_put = __ham_c_put; + dbc->c_am_writelock = __ham_c_writelock; + + __ham_item_init(dbc); + + return (0); +} + +/* + * __ham_c_close -- + * Close down the cursor from a single use. 
+ */ +static int +__ham_c_close(dbc, root_pgno, rmroot) + DBC *dbc; + db_pgno_t root_pgno; + int *rmroot; +{ + HASH_CURSOR *hcp; + HKEYDATA *dp; + int doroot, gotmeta, ret, t_ret; + u_int32_t dirty; + + COMPQUIET(rmroot, 0); + dirty = 0; + doroot = gotmeta = ret = 0; + hcp = (HASH_CURSOR *) dbc->internal; + + /* Check for off page dups. */ + if (dbc->internal->opd != NULL) { + if ((ret = __ham_get_meta(dbc)) != 0) + goto done; + gotmeta = 1; + if ((ret = __ham_get_cpage(dbc, DB_LOCK_READ)) != 0) + goto out; + dp = (HKEYDATA *)H_PAIRDATA(hcp->page, hcp->indx); + DB_ASSERT(HPAGE_PTYPE(dp) == H_OFFDUP); + memcpy(&root_pgno, HOFFPAGE_PGNO(dp), sizeof(db_pgno_t)); + + if ((ret = + hcp->opd->c_am_close(hcp->opd, root_pgno, &doroot)) != 0) + goto out; + if (doroot != 0) { + if ((ret = __ham_del_pair(dbc, 1)) != 0) + goto out; + dirty = DB_MPOOL_DIRTY; + } + } + +out: if (hcp->page != NULL && (t_ret = + memp_fput(dbc->dbp->mpf, hcp->page, dirty)) != 0 && ret == 0) + ret = t_ret; + if (gotmeta != 0 && (t_ret = __ham_release_meta(dbc)) != 0 && ret == 0) + ret = t_ret; + +done: + __ham_item_init(dbc); + return (ret); +} + +/* + * __ham_c_destroy -- + * Cleanup the access method private part of a cursor. + */ +static int +__ham_c_destroy(dbc) + DBC *dbc; +{ + HASH_CURSOR *hcp; + + hcp = (HASH_CURSOR *)dbc->internal; + if (hcp->split_buf != NULL) + __os_free(hcp->split_buf, dbc->dbp->pgsize); + __os_free(hcp, sizeof(HASH_CURSOR)); + + return (0); +} + +/* + * __ham_c_count -- + * Return a count of on-page duplicates. + * + * PUBLIC: int __ham_c_count __P((DBC *, db_recno_t *)); + */ +int +__ham_c_count(dbc, recnop) + DBC *dbc; + db_recno_t *recnop; +{ + DB *dbp; + HASH_CURSOR *hcp; + db_indx_t len; + db_recno_t recno; + int ret, t_ret; + u_int8_t *p, *pend; + + dbp = dbc->dbp; + hcp = (HASH_CURSOR *) dbc->internal; + + recno = 0; + + if ((ret = __ham_get_cpage(dbc, DB_LOCK_READ)) != 0) + return (ret); + + switch (HPAGE_PTYPE(H_PAIRDATA(hcp->page, hcp->indx))) { + case H_KEYDATA: + case H_OFFPAGE: + recno = 1; + break; + case H_DUPLICATE: + p = HKEYDATA_DATA(H_PAIRDATA(hcp->page, hcp->indx)); + pend = p + + LEN_HDATA(hcp->page, dbp->pgsize, hcp->indx); + for (; p < pend; recno++) { + /* p may be odd, so copy rather than just dereffing */ + memcpy(&len, p, sizeof(db_indx_t)); + p += 2 * sizeof(db_indx_t) + len; + } + + break; + default: + ret = __db_unknown_type(dbp->dbenv, "__ham_c_count", + HPAGE_PTYPE(H_PAIRDATA(hcp->page, hcp->indx))); + goto err; + } + + *recnop = recno; + +err: if ((t_ret = memp_fput(dbc->dbp->mpf, hcp->page, 0)) != 0 && ret == 0) + ret = t_ret; + hcp->page = NULL; + return (ret); +} + +static int +__ham_c_del(dbc) + DBC *dbc; +{ + DB *dbp; + DBT repldbt; + HASH_CURSOR *hcp; + int ret, t_ret; + + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; + + if (F_ISSET(hcp, H_DELETED)) + return (DB_NOTFOUND); + + if ((ret = __ham_get_meta(dbc)) != 0) + goto out; + + if ((ret = __ham_get_cpage(dbc, DB_LOCK_WRITE)) != 0) + goto out; + + /* Off-page duplicates. */ + if (HPAGE_TYPE(hcp->page, H_DATAINDEX(hcp->indx)) == H_OFFDUP) + goto out; + + if (F_ISSET(hcp, H_ISDUP)) { /* On-page duplicate. 
*/ + if (hcp->dup_off == 0 && + DUP_SIZE(hcp->dup_len) == LEN_HDATA(hcp->page, + hcp->hdr->dbmeta.pagesize, hcp->indx)) + ret = __ham_del_pair(dbc, 1); + else { + repldbt.flags = 0; + F_SET(&repldbt, DB_DBT_PARTIAL); + repldbt.doff = hcp->dup_off; + repldbt.dlen = DUP_SIZE(hcp->dup_len); + repldbt.size = 0; + repldbt.data = HKEYDATA_DATA(H_PAIRDATA(hcp->page, + hcp->indx)); + ret = __ham_replpair(dbc, &repldbt, 0); + hcp->dup_tlen -= DUP_SIZE(hcp->dup_len); + F_SET(hcp, H_DELETED); + ret = __ham_c_update(dbc, DUP_SIZE(hcp->dup_len), 0, 1); + } + + } else /* Not a duplicate */ + ret = __ham_del_pair(dbc, 1); + +out: if (ret == 0 && hcp->page != NULL && + (t_ret = memp_fput(dbp->mpf, hcp->page, DB_MPOOL_DIRTY)) != 0) + ret = t_ret; + hcp->page = NULL; + if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} + +/* + * __ham_c_dup -- + * Duplicate a hash cursor, such that the new one holds appropriate + * locks for the position of the original. + * + * PUBLIC: int __ham_c_dup __P((DBC *, DBC *)); + */ +int +__ham_c_dup(orig_dbc, new_dbc) + DBC *orig_dbc, *new_dbc; +{ + HASH_CURSOR *orig, *new; + + orig = (HASH_CURSOR *)orig_dbc->internal; + new = (HASH_CURSOR *)new_dbc->internal; + + new->bucket = orig->bucket; + new->lbucket = orig->lbucket; + new->dup_off = orig->dup_off; + new->dup_len = orig->dup_len; + new->dup_tlen = orig->dup_tlen; + + if (F_ISSET(orig, H_DELETED)) + F_SET(new, H_DELETED); + if (F_ISSET(orig, H_ISDUP)) + F_SET(new, H_ISDUP); + + /* + * If the old cursor held a lock and we're not in transactions, get one + * for the new one. The reason that we don't need a new lock if we're + * in a transaction is because we already hold a lock and will continue + * to do so until commit, so there is no point in reaquiring it. We + * don't know if the old lock was a read or write lock, but it doesn't + * matter. We'll get a read lock. We know that this locker already + * holds a lock of the correct type, so if we need a write lock and + * request it, we know that we'll get it. + */ + if (orig->lock.off == LOCK_INVALID || orig_dbc->txn != NULL) + return (0); + + return (__ham_lock_bucket(new_dbc, DB_LOCK_READ)); +} + +static int +__ham_c_get(dbc, key, data, flags, pgnop) + DBC *dbc; + DBT *key; + DBT *data; + u_int32_t flags; + db_pgno_t *pgnop; +{ + DB *dbp; + HASH_CURSOR *hcp; + db_lockmode_t lock_type; + int get_key, ret, t_ret; + + hcp = (HASH_CURSOR *)dbc->internal; + dbp = dbc->dbp; + + /* Clear OR'd in additional bits so we can check for flag equality. */ + if (F_ISSET(dbc, DBC_RMW)) + lock_type = DB_LOCK_WRITE; + else + lock_type = DB_LOCK_READ; + + if ((ret = __ham_get_meta(dbc)) != 0) + return (ret); + hcp->seek_size = 0; + + ret = 0; + get_key = 1; + switch (flags) { + case DB_PREV_NODUP: + F_SET(hcp, H_NEXT_NODUP); + /* FALLTHROUGH */ + case DB_PREV: + if (IS_INITIALIZED(dbc)) { + ret = __ham_item_prev(dbc, lock_type, pgnop); + break; + } + /* FALLTHROUGH */ + case DB_LAST: + ret = __ham_item_last(dbc, lock_type, pgnop); + break; + case DB_NEXT_NODUP: + F_SET(hcp, H_NEXT_NODUP); + /* FALLTHROUGH */ + case DB_NEXT: + if (IS_INITIALIZED(dbc)) { + ret = __ham_item_next(dbc, lock_type, pgnop); + break; + } + /* FALLTHROUGH */ + case DB_FIRST: + ret = __ham_item_first(dbc, lock_type, pgnop); + break; + case DB_NEXT_DUP: + /* cgetchk has already determined that the cursor is set. 
*/ + F_SET(hcp, H_DUPONLY); + ret = __ham_item_next(dbc, lock_type, pgnop); + break; + case DB_SET: + case DB_SET_RANGE: + case DB_GET_BOTH: + ret = __ham_lookup(dbc, key, 0, lock_type, pgnop); + get_key = 0; + break; + case DB_GET_BOTHC: + F_SET(hcp, H_DUPONLY); + + ret = __ham_item_next(dbc, lock_type, pgnop); + get_key = 0; + break; + case DB_CURRENT: + /* cgetchk has already determined that the cursor is set. */ + if (F_ISSET(hcp, H_DELETED)) { + ret = DB_KEYEMPTY; + goto err; + } + + ret = __ham_item(dbc, lock_type, pgnop); + break; + } + + /* + * Must always enter this loop to do error handling and + * check for big key/data pair. + */ + for (;;) { + if (ret != 0 && ret != DB_NOTFOUND) + goto err; + else if (F_ISSET(hcp, H_OK)) { + if (*pgnop == PGNO_INVALID) + ret = __ham_dup_return (dbc, data, flags); + break; + } else if (!F_ISSET(hcp, H_NOMORE)) { + __db_err(dbp->dbenv, + "H_NOMORE returned to __ham_c_get"); + ret = EINVAL; + break; + } + + /* + * Ran out of entries in a bucket; change buckets. + */ + switch (flags) { + case DB_LAST: + case DB_PREV: + case DB_PREV_NODUP: + ret = memp_fput(dbp->mpf, hcp->page, 0); + hcp->page = NULL; + if (hcp->bucket == 0) { + ret = DB_NOTFOUND; + hcp->pgno = PGNO_INVALID; + goto err; + } + F_CLR(hcp, H_ISDUP); + hcp->bucket--; + hcp->indx = NDX_INVALID; + hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket); + if (ret == 0) + ret = __ham_item_prev(dbc, + lock_type, pgnop); + break; + case DB_FIRST: + case DB_NEXT: + case DB_NEXT_NODUP: + ret = memp_fput(dbp->mpf, hcp->page, 0); + hcp->page = NULL; + hcp->indx = NDX_INVALID; + hcp->bucket++; + F_CLR(hcp, H_ISDUP); + hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket); + if (hcp->bucket > hcp->hdr->max_bucket) { + ret = DB_NOTFOUND; + hcp->pgno = PGNO_INVALID; + goto err; + } + if (ret == 0) + ret = __ham_item_next(dbc, + lock_type, pgnop); + break; + case DB_GET_BOTH: + case DB_GET_BOTHC: + case DB_NEXT_DUP: + case DB_SET: + case DB_SET_RANGE: + /* Key not found. */ + ret = DB_NOTFOUND; + goto err; + case DB_CURRENT: + /* + * This should only happen if you are doing + * deletes and reading with concurrent threads + * and not doing proper locking. We return + * the same error code as we would if the + * cursor were deleted. + */ + ret = DB_KEYEMPTY; + goto err; + default: + DB_ASSERT(0); + } + } + + if (get_key == 0) + F_SET(key, DB_DBT_ISSET); + +err: if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0) + ret = t_ret; + + F_CLR(hcp, H_DUPONLY); + F_CLR(hcp, H_NEXT_NODUP); + + return (ret); +} + +static int +__ham_c_put(dbc, key, data, flags, pgnop) + DBC *dbc; + DBT *key; + DBT *data; + u_int32_t flags; + db_pgno_t *pgnop; +{ + DB *dbp; + DBT tmp_val, *myval; + HASH_CURSOR *hcp; + u_int32_t nbytes; + int ret, t_ret; + + /* + * The compiler doesn't realize that we only use this when ret is + * equal to 0 and that if ret is equal to 0, that we must have set + * myval. So, we initialize it here to shut the compiler up. + */ + COMPQUIET(myval, NULL); + + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; + + if (F_ISSET(hcp, H_DELETED) && + flags != DB_KEYFIRST && flags != DB_KEYLAST) + return (DB_NOTFOUND); + + if ((ret = __ham_get_meta(dbc)) != 0) + goto err1; + + switch (flags) { + case DB_KEYLAST: + case DB_KEYFIRST: + case DB_NODUPDATA: + nbytes = (ISBIG(hcp, key->size) ? HOFFPAGE_PSIZE : + HKEYDATA_PSIZE(key->size)) + + (ISBIG(hcp, data->size) ? 
HOFFPAGE_PSIZE : + HKEYDATA_PSIZE(data->size)); + if ((ret = __ham_lookup(dbc, + key, nbytes, DB_LOCK_WRITE, pgnop)) == DB_NOTFOUND) { + ret = 0; + if (hcp->seek_found_page != PGNO_INVALID && + hcp->seek_found_page != hcp->pgno) { + if ((ret = memp_fput(dbp->mpf, hcp->page, 0)) + != 0) + goto err2; + hcp->page = NULL; + hcp->pgno = hcp->seek_found_page; + hcp->indx = NDX_INVALID; + } + + if (F_ISSET(data, DB_DBT_PARTIAL) && data->doff != 0) { + /* + * A partial put, but the key does not exist + * and we are not beginning the write at 0. + * We must create a data item padded up to doff + * and then write the new bytes represented by + * val. + */ + if ((ret = __ham_init_dbt(dbp->dbenv, + &tmp_val, data->size + data->doff, + &dbc->rdata.data, &dbc->rdata.ulen)) == 0) { + memset(tmp_val.data, 0, data->doff); + memcpy((u_int8_t *)tmp_val.data + + data->doff, data->data, data->size); + myval = &tmp_val; + } + } else + myval = (DBT *)data; + + if (ret == 0) + ret = __ham_add_el(dbc, key, myval, H_KEYDATA); + goto done; + } + break; + case DB_BEFORE: + case DB_AFTER: + case DB_CURRENT: + ret = __ham_item(dbc, DB_LOCK_WRITE, pgnop); + break; + } + + if (*pgnop == PGNO_INVALID && ret == 0) { + if (flags == DB_CURRENT || + ((flags == DB_KEYFIRST || + flags == DB_KEYLAST || flags == DB_NODUPDATA) && + !(F_ISSET(dbp, DB_AM_DUP) || F_ISSET(key, DB_DBT_DUPOK)))) + ret = __ham_overwrite(dbc, data, flags); + else + ret = __ham_add_dup(dbc, data, flags, pgnop); + } + +done: if (ret == 0 && F_ISSET(hcp, H_EXPAND)) { + ret = __ham_expand_table(dbc); + F_CLR(hcp, H_EXPAND); + } + + if (ret == 0 && + (t_ret = memp_fset(dbp->mpf, hcp->page, DB_MPOOL_DIRTY)) != 0) + ret = t_ret; + +err2: if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0) + ret = t_ret; + +err1: return (ret); +} + +/********************************* UTILITIES ************************/ + +/* + * __ham_expand_table -- + */ +static int +__ham_expand_table(dbc) + DBC *dbc; +{ + DB *dbp; + PAGE *h; + HASH_CURSOR *hcp; + db_pgno_t pgno; + u_int32_t old_bucket, new_bucket; + int ret; + + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; + if ((ret = __ham_dirty_meta(dbc)) != 0) + return (ret); + + /* + * If the split point is about to increase, make sure that we + * have enough extra pages. The calculation here is weird. + * We'd like to do this after we've upped max_bucket, but it's + * too late then because we've logged the meta-data split. What + * we'll do between then and now is increment max bucket and then + * see what the log of one greater than that is; here we have to + * look at the log of max + 2. VERY NASTY STUFF. + * + * It just got even nastier. With subdatabases, we have to request + * a chunk of contiguous pages, so we do that here using an + * undocumented feature of mpool (the MPOOL_NEW_GROUP flag) to + * give us a number of contiguous pages. Ouch. + */ + if (hcp->hdr->max_bucket == hcp->hdr->high_mask) { + /* + * Ask mpool to give us a set of contiguous page numbers + * large enough to contain the next doubling. + * + * Figure out how many new pages we need. This will return + * us the last page. We calculate its page number, initialize + * the page and then write it back to reserve all the pages + * in between. It is possible that the allocation of new pages + * has already been done, but the tranaction aborted. Since + * we don't undo the allocation, check for a valid pgno before + * doing the allocation. 
+ */ + pgno = hcp->hdr->max_bucket + 1; + if (hcp->hdr->spares[__db_log2(pgno) + 1] == PGNO_INVALID) + /* Allocate a group of pages. */ + ret = memp_fget(dbp->mpf, + &pgno, DB_MPOOL_NEW_GROUP, &h); + else { + /* Just read in the last page of the batch */ + pgno = hcp->hdr->spares[__db_log2(pgno) + 1] + + hcp->hdr->max_bucket + 1; + /* Move to the last page of the group. */ + pgno += hcp->hdr->max_bucket; + ret = memp_fget(dbp->mpf, + &pgno, DB_MPOOL_CREATE, &h); + } + if (ret != 0) + return (ret); + + P_INIT(h, dbp->pgsize, pgno, + PGNO_INVALID, PGNO_INVALID, 0, P_HASH); + pgno -= hcp->hdr->max_bucket; + } else { + pgno = BUCKET_TO_PAGE(hcp, hcp->hdr->max_bucket + 1); + if ((ret = + memp_fget(dbp->mpf, &pgno, DB_MPOOL_CREATE, &h)) != 0) + return (ret); + } + + /* Now we can log the meta-data split. */ + if (DB_LOGGING(dbc)) { + if ((ret = __ham_metagroup_log(dbp->dbenv, + dbc->txn, &h->lsn, 0, dbp->log_fileid, + hcp->hdr->max_bucket, pgno, &hcp->hdr->dbmeta.lsn, + &h->lsn)) != 0) { + (void)memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY); + return (ret); + } + + hcp->hdr->dbmeta.lsn = h->lsn; + } + + /* If we allocated some new pages, write out the last page. */ + if ((ret = memp_fput(dbp->mpf, h, DB_MPOOL_DIRTY)) != 0) + return (ret); + + new_bucket = ++hcp->hdr->max_bucket; + old_bucket = (hcp->hdr->max_bucket & hcp->hdr->low_mask); + + /* + * If we started a new doubling, fill in the spares array with + * the starting page number negatively offset by the bucket number. + */ + if (new_bucket > hcp->hdr->high_mask) { + /* Starting a new doubling */ + hcp->hdr->low_mask = hcp->hdr->high_mask; + hcp->hdr->high_mask = new_bucket | hcp->hdr->low_mask; + if (hcp->hdr->spares[__db_log2(new_bucket) + 1] == PGNO_INVALID) + hcp->hdr->spares[__db_log2(new_bucket) + 1] = + pgno - new_bucket; + } + + /* Relocate records to the new bucket */ + return (__ham_split_page(dbc, old_bucket, new_bucket)); +} + +/* + * PUBLIC: u_int32_t __ham_call_hash __P((DBC *, u_int8_t *, int32_t)); + */ +u_int32_t +__ham_call_hash(dbc, k, len) + DBC *dbc; + u_int8_t *k; + int32_t len; +{ + DB *dbp; + u_int32_t n, bucket; + HASH_CURSOR *hcp; + HASH *hashp; + + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; + hashp = dbp->h_internal; + + n = (u_int32_t)(hashp->h_hash(dbp, k, len)); + + bucket = n & hcp->hdr->high_mask; + if (bucket > hcp->hdr->max_bucket) + bucket = bucket & hcp->hdr->low_mask; + return (bucket); +} + +/* + * Check for duplicates, and call __db_ret appropriately. Release + * everything held by the cursor. + */ +static int +__ham_dup_return (dbc, val, flags) + DBC *dbc; + DBT *val; + u_int32_t flags; +{ + DB *dbp; + HASH_CURSOR *hcp; + PAGE *pp; + DBT *myval, tmp_val; + db_indx_t ndx; + db_pgno_t pgno; + u_int32_t off, tlen; + u_int8_t *hk, type; + int cmp, ret; + db_indx_t len; + + /* Check for duplicate and return the first one. */ + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; + ndx = H_DATAINDEX(hcp->indx); + type = HPAGE_TYPE(hcp->page, ndx); + pp = hcp->page; + myval = val; + + /* + * There are 4 cases: + * 1. We are not in duplicate, simply return; the upper layer + * will do the right thing. + * 2. We are looking at keys and stumbled onto a duplicate. + * 3. We are in the middle of a duplicate set. (ISDUP set) + * 4. We need to check for particular data match. + */ + + /* We should never get here with off-page dups. 
*/ + DB_ASSERT(type != H_OFFDUP); + + /* Case 1 */ + if (type != H_DUPLICATE && + flags != DB_GET_BOTH && flags != DB_GET_BOTHC) + return (0); + + /* + * Here we check for the case where we just stumbled onto a + * duplicate. In this case, we do initialization and then + * let the normal duplicate code handle it. (Case 2) + */ + if (!F_ISSET(hcp, H_ISDUP) && type == H_DUPLICATE) { + F_SET(hcp, H_ISDUP); + hcp->dup_tlen = LEN_HDATA(hcp->page, + hcp->hdr->dbmeta.pagesize, hcp->indx); + hk = H_PAIRDATA(hcp->page, hcp->indx); + if (flags == DB_LAST + || flags == DB_PREV || flags == DB_PREV_NODUP) { + hcp->dup_off = 0; + do { + memcpy(&len, + HKEYDATA_DATA(hk) + hcp->dup_off, + sizeof(db_indx_t)); + hcp->dup_off += DUP_SIZE(len); + } while (hcp->dup_off < hcp->dup_tlen); + hcp->dup_off -= DUP_SIZE(len); + } else { + memcpy(&len, + HKEYDATA_DATA(hk), sizeof(db_indx_t)); + hcp->dup_off = 0; + } + hcp->dup_len = len; + } + + /* + * If we are retrieving a specific key/data pair, then we + * may need to adjust the cursor before returning data. + * Case 4 + */ + if (flags == DB_GET_BOTH || flags == DB_GET_BOTHC) { + if (F_ISSET(hcp, H_ISDUP)) { + /* + * If we're doing a join, search forward from the + * current position, not the beginning of the dup set. + */ + if (flags == DB_GET_BOTHC) + F_SET(hcp, H_CONTINUE); + + __ham_dsearch(dbc, val, &off, &cmp); + + /* + * This flag is set nowhere else and is safe to + * clear unconditionally. + */ + F_CLR(hcp, H_CONTINUE); + hcp->dup_off = off; + } else { + hk = H_PAIRDATA(hcp->page, hcp->indx); + if (((HKEYDATA *)hk)->type == H_OFFPAGE) { + memcpy(&tlen, + HOFFPAGE_TLEN(hk), sizeof(u_int32_t)); + memcpy(&pgno, + HOFFPAGE_PGNO(hk), sizeof(db_pgno_t)); + if ((ret = __db_moff(dbp, val, + pgno, tlen, dbp->dup_compare, &cmp)) != 0) + return (ret); + } else { + /* + * We do not zero tmp_val since the comparison + * routines may only look at data and size. + */ + tmp_val.data = HKEYDATA_DATA(hk); + tmp_val.size = LEN_HDATA(hcp->page, + dbp->pgsize, hcp->indx); + cmp = dbp->dup_compare == NULL ? + __bam_defcmp(dbp, &tmp_val, val) : + dbp->dup_compare(dbp, &tmp_val, val); + } + } + + if (cmp != 0) + return (DB_NOTFOUND); + } + + /* + * Now, everything is initialized, grab a duplicate if + * necessary. + */ + if (F_ISSET(hcp, H_ISDUP)) { /* Case 3 */ + /* + * Copy the DBT in case we are retrieving into user + * memory and we need the parameters for it. If the + * user requested a partial, then we need to adjust + * the user's parameters to get the partial of the + * duplicate which is itself a partial. + */ + memcpy(&tmp_val, val, sizeof(*val)); + if (F_ISSET(&tmp_val, DB_DBT_PARTIAL)) { + /* + * Take the user's length unless it would go + * beyond the end of the duplicate. + */ + if (tmp_val.doff + hcp->dup_off > hcp->dup_len) + tmp_val.dlen = 0; + else if (tmp_val.dlen + tmp_val.doff > + hcp->dup_len) + tmp_val.dlen = + hcp->dup_len - tmp_val.doff; + + /* + * Calculate the new offset. + */ + tmp_val.doff += hcp->dup_off; + } else { + F_SET(&tmp_val, DB_DBT_PARTIAL); + tmp_val.dlen = hcp->dup_len; + tmp_val.doff = hcp->dup_off + sizeof(db_indx_t); + } + myval = &tmp_val; + } + + /* + * Finally, if we had a duplicate, pp, ndx, and myval should be + * set appropriately. + */ + if ((ret = __db_ret(dbp, pp, ndx, myval, &dbc->rdata.data, + &dbc->rdata.ulen)) != 0) + return (ret); + + /* + * In case we sent a temporary off to db_ret, set the real + * return values. 
+ */ + val->data = myval->data; + val->size = myval->size; + + F_SET(val, DB_DBT_ISSET); + + return (0); +} + +static int +__ham_overwrite(dbc, nval, flags) + DBC *dbc; + DBT *nval; + u_int32_t flags; +{ + DB *dbp; + HASH_CURSOR *hcp; + DBT *myval, tmp_val, tmp_val2; + void *newrec; + u_int8_t *hk, *p; + u_int32_t len, nondup_size; + db_indx_t newsize; + int ret; + + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; + if (F_ISSET(hcp, H_ISDUP)) { + /* + * This is an overwrite of a duplicate. We should never + * be off-page at this point. + */ + DB_ASSERT(hcp->opd == NULL); + /* On page dups */ + if (F_ISSET(nval, DB_DBT_PARTIAL)) { + /* + * We're going to have to get the current item, then + * construct the record, do any padding and do a + * replace. + */ + memset(&tmp_val, 0, sizeof(tmp_val)); + if ((ret = + __ham_dup_return (dbc, &tmp_val, DB_CURRENT)) != 0) + return (ret); + + /* Figure out new size. */ + nondup_size = tmp_val.size; + newsize = nondup_size; + + /* + * Three cases: + * 1. strictly append (may need to allocate space + * for pad bytes; really gross). + * 2. overwrite some and append. + * 3. strictly overwrite. + */ + if (nval->doff > nondup_size) + newsize += + (nval->doff - nondup_size + nval->size); + else if (nval->doff + nval->dlen > nondup_size) + newsize += nval->size - + (nondup_size - nval->doff); + else + newsize += nval->size - nval->dlen; + + /* + * Make sure that the new size doesn't put us over + * the onpage duplicate size in which case we need + * to convert to off-page duplicates. + */ + if (ISBIG(hcp, hcp->dup_tlen - nondup_size + newsize)) { + if ((ret = __ham_dup_convert(dbc)) != 0) + return (ret); + return (hcp->opd->c_am_put(hcp->opd, + NULL, nval, flags, NULL)); + } + + if ((ret = __os_malloc(dbp->dbenv, + DUP_SIZE(newsize), NULL, &newrec)) != 0) + return (ret); + memset(&tmp_val2, 0, sizeof(tmp_val2)); + F_SET(&tmp_val2, DB_DBT_PARTIAL); + + /* Construct the record. */ + p = newrec; + /* Initial size. */ + memcpy(p, &newsize, sizeof(db_indx_t)); + p += sizeof(db_indx_t); + + /* First part of original record. */ + len = nval->doff > tmp_val.size + ? tmp_val.size : nval->doff; + memcpy(p, tmp_val.data, len); + p += len; + + if (nval->doff > tmp_val.size) { + /* Padding */ + memset(p, 0, nval->doff - tmp_val.size); + p += nval->doff - tmp_val.size; + } + + /* New bytes */ + memcpy(p, nval->data, nval->size); + p += nval->size; + + /* End of original record (if there is any) */ + if (nval->doff + nval->dlen < tmp_val.size) { + len = tmp_val.size - nval->doff - nval->dlen; + memcpy(p, (u_int8_t *)tmp_val.data + + nval->doff + nval->dlen, len); + p += len; + } + + /* Final size. */ + memcpy(p, &newsize, sizeof(db_indx_t)); + + /* + * Make sure that the caller isn't corrupting + * the sort order. 
+ */ + if (dbp->dup_compare != NULL) { + tmp_val2.data = + (u_int8_t *)newrec + sizeof(db_indx_t); + tmp_val2.size = newsize; + if (dbp->dup_compare( + dbp, &tmp_val, &tmp_val2) != 0) { + (void)__os_free(newrec, + DUP_SIZE(newsize)); + return (__db_duperr(dbp, flags)); + } + } + + tmp_val2.data = newrec; + tmp_val2.size = DUP_SIZE(newsize); + tmp_val2.doff = hcp->dup_off; + tmp_val2.dlen = DUP_SIZE(hcp->dup_len); + + ret = __ham_replpair(dbc, &tmp_val2, 0); + (void)__os_free(newrec, DUP_SIZE(newsize)); + + /* Update cursor */ + if (ret != 0) + return (ret); + + if (newsize > nondup_size) + hcp->dup_tlen += (newsize - nondup_size); + else + hcp->dup_tlen -= (nondup_size - newsize); + hcp->dup_len = DUP_SIZE(newsize); + return (0); + } else { + /* Check whether we need to convert to off page. */ + if (ISBIG(hcp, + hcp->dup_tlen - hcp->dup_len + nval->size)) { + if ((ret = __ham_dup_convert(dbc)) != 0) + return (ret); + return (hcp->opd->c_am_put(hcp->opd, + NULL, nval, flags, NULL)); + } + + /* Make sure we maintain sort order. */ + if (dbp->dup_compare != NULL) { + tmp_val2.data = + HKEYDATA_DATA(H_PAIRDATA(hcp->page, + hcp->indx)) + hcp->dup_off + + sizeof(db_indx_t); + tmp_val2.size = hcp->dup_len; + if (dbp->dup_compare(dbp, nval, &tmp_val2) != 0) + return (EINVAL); + } + /* Overwriting a complete duplicate. */ + if ((ret = + __ham_make_dup(dbp->dbenv, nval, + &tmp_val, &dbc->rdata.data, &dbc->rdata.ulen)) != 0) + return (ret); + /* Now fix what we are replacing. */ + tmp_val.doff = hcp->dup_off; + tmp_val.dlen = DUP_SIZE(hcp->dup_len); + + /* Update cursor */ + if (nval->size > hcp->dup_len) + hcp->dup_tlen += (nval->size - hcp->dup_len); + else + hcp->dup_tlen -= (hcp->dup_len - nval->size); + hcp->dup_len = DUP_SIZE(nval->size); + } + myval = &tmp_val; + } else if (!F_ISSET(nval, DB_DBT_PARTIAL)) { + /* Put/overwrite */ + memcpy(&tmp_val, nval, sizeof(*nval)); + F_SET(&tmp_val, DB_DBT_PARTIAL); + tmp_val.doff = 0; + hk = H_PAIRDATA(hcp->page, hcp->indx); + if (HPAGE_PTYPE(hk) == H_OFFPAGE) + memcpy(&tmp_val.dlen, + HOFFPAGE_TLEN(hk), sizeof(u_int32_t)); + else + tmp_val.dlen = LEN_HDATA(hcp->page, + hcp->hdr->dbmeta.pagesize, hcp->indx); + myval = &tmp_val; + } else + /* Regular partial put */ + myval = nval; + + return (__ham_replpair(dbc, myval, 0)); +} + +/* + * Given a key and a cursor, sets the cursor to the page/ndx on which + * the key resides. If the key is found, the cursor H_OK flag is set + * and the pagep, bndx, pgno (dpagep, dndx, dpgno) fields are set. + * If the key is not found, the H_OK flag is not set. If the sought + * field is non-0, the pagep, bndx, pgno (dpagep, dndx, dpgno) fields + * are set indicating where an add might take place. If it is 0, + * non of the cursor pointer field are valid. + */ +static int +__ham_lookup(dbc, key, sought, mode, pgnop) + DBC *dbc; + const DBT *key; + u_int32_t sought; + db_lockmode_t mode; + db_pgno_t *pgnop; +{ + DB *dbp; + HASH_CURSOR *hcp; + db_pgno_t pgno; + u_int32_t tlen; + int match, ret; + u_int8_t *hk, *dk; + + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; + /* + * Set up cursor so that we're looking for space to add an item + * as we cycle through the pages looking for the key. 
+ */ + if ((ret = __ham_item_reset(dbc)) != 0) + return (ret); + hcp->seek_size = sought; + + hcp->bucket = __ham_call_hash(dbc, (u_int8_t *)key->data, key->size); + hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket); + + while (1) { + *pgnop = PGNO_INVALID; + if ((ret = __ham_item_next(dbc, mode, pgnop)) != 0) + return (ret); + + if (F_ISSET(hcp, H_NOMORE)) + break; + + hk = H_PAIRKEY(hcp->page, hcp->indx); + switch (HPAGE_PTYPE(hk)) { + case H_OFFPAGE: + memcpy(&tlen, HOFFPAGE_TLEN(hk), sizeof(u_int32_t)); + if (tlen == key->size) { + memcpy(&pgno, + HOFFPAGE_PGNO(hk), sizeof(db_pgno_t)); + if ((ret = __db_moff(dbp, + key, pgno, tlen, NULL, &match)) != 0) + return (ret); + if (match == 0) + goto found_key; + } + break; + case H_KEYDATA: + if (key->size == + LEN_HKEY(hcp->page, dbp->pgsize, hcp->indx) && + memcmp(key->data, + HKEYDATA_DATA(hk), key->size) == 0) { + /* Found the key, check for data type. */ +found_key: F_SET(hcp, H_OK); + dk = H_PAIRDATA(hcp->page, hcp->indx); + if (HPAGE_PTYPE(dk) == H_OFFDUP) + memcpy(pgnop, HOFFDUP_PGNO(dk), + sizeof(db_pgno_t)); + return (0); + } + break; + case H_DUPLICATE: + case H_OFFDUP: + /* + * These are errors because keys are never + * duplicated, only data items are. + */ + return (__db_pgfmt(dbp, PGNO(hcp->page))); + } + } + + /* + * Item was not found. + */ + + if (sought != 0) + return (ret); + + return (ret); +} + +/* + * __ham_init_dbt -- + * Initialize a dbt using some possibly already allocated storage + * for items. + * + * PUBLIC: int __ham_init_dbt __P((DB_ENV *, + * PUBLIC: DBT *, u_int32_t, void **, u_int32_t *)); + */ +int +__ham_init_dbt(dbenv, dbt, size, bufp, sizep) + DB_ENV *dbenv; + DBT *dbt; + u_int32_t size; + void **bufp; + u_int32_t *sizep; +{ + int ret; + + memset(dbt, 0, sizeof(*dbt)); + if (*sizep < size) { + if ((ret = __os_realloc(dbenv, size, NULL, bufp)) != 0) { + *sizep = 0; + return (ret); + } + *sizep = size; + } + dbt->data = *bufp; + dbt->size = size; + return (0); +} + +/* + * Adjust the cursor after an insert or delete. The cursor passed is + * the one that was operated upon; we just need to check any of the + * others. + * + * len indicates the length of the item added/deleted + * add indicates if the item indicated by the cursor has just been + * added (add == 1) or deleted (add == 0). + * dup indicates if the addition occurred into a duplicate set. + * + * PUBLIC: int __ham_c_update + * PUBLIC: __P((DBC *, u_int32_t, int, int)); + */ +int +__ham_c_update(dbc, len, add, is_dup) + DBC *dbc; + u_int32_t len; + int add, is_dup; +{ + DB *dbp, *ldbp; + DBC *cp; + DB_ENV *dbenv; + DB_LSN lsn; + DB_TXN *my_txn; + HASH_CURSOR *hcp, *lcp; + int found, ret; + u_int32_t order; + + dbp = dbc->dbp; + dbenv = dbp->dbenv; + hcp = (HASH_CURSOR *)dbc->internal; + + /* + * Adjustment will only be logged if this is a subtransaction. + * Only subtransactions can abort and effect their parent + * transactions cursors. + */ + + my_txn = IS_SUBTRANSACTION(dbc->txn) ? dbc->txn : NULL; + found = 0; + + MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp); + + /* + * Calcuate the order of this deleted record. + * This will be one grater than any cursor that is pointing + * at this record and already marked as deleted. 
+ */ + order = 0; + if (!add) { + order = 1; + for (ldbp = __dblist_get(dbenv, dbp->adj_fileid); + ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid; + ldbp = LIST_NEXT(ldbp, dblistlinks)) { + MUTEX_THREAD_LOCK(dbenv, dbp->mutexp); + for (cp = TAILQ_FIRST(&ldbp->active_queue); cp != NULL; + cp = TAILQ_NEXT(cp, links)) { + if (cp == dbc || cp->dbtype != DB_HASH) + continue; + lcp = (HASH_CURSOR *)cp->internal; + if (F_ISSET(lcp, H_DELETED) && + hcp->pgno == lcp->pgno && + hcp->indx == lcp->indx && + order <= lcp->order && + (!is_dup || hcp->dup_off == lcp->dup_off)) + order = lcp->order +1; + } + MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp); + } + hcp->order = order; + } + + for (ldbp = __dblist_get(dbenv, dbp->adj_fileid); + ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid; + ldbp = LIST_NEXT(ldbp, dblistlinks)) { + MUTEX_THREAD_LOCK(dbenv, dbp->mutexp); + for (cp = TAILQ_FIRST(&ldbp->active_queue); cp != NULL; + cp = TAILQ_NEXT(cp, links)) { + if (cp == dbc || cp->dbtype != DB_HASH) + continue; + + lcp = (HASH_CURSOR *)cp->internal; + + if (lcp->pgno != hcp->pgno || lcp->indx == NDX_INVALID) + continue; + + if (my_txn != NULL && cp->txn != my_txn) + found = 1; + + if (!is_dup) { + if (add) { + /* + * This routine is not called to add + * non-dup records which are always put + * at the end. It is only called from + * recovery in this case and the + * cursor will be marked deleted. + * We are "undeleting" so unmark all + * cursors with the same order. + */ + if (lcp->indx == hcp->indx + && F_ISSET(lcp, H_DELETED)) { + if (lcp->order == hcp->order) + F_CLR(lcp, H_DELETED); + else if (lcp->order > + hcp->order) { + + /* + * If we've moved this cursor's + * index, split its order + * number--i.e., decrement it by + * enough so that the lowest + * cursor moved has order 1. + * cp_arg->order is the split + * point, so decrement by one + * less than that. + */ + lcp->order -= + (hcp->order - 1); + lcp->indx += 2; + } + } else if (lcp->indx >= hcp->indx) + lcp->indx += 2; + + } else { + if (lcp->indx > hcp->indx) { + lcp->indx -= 2; + if (lcp->indx == hcp->indx + && F_ISSET(lcp, H_DELETED)) + lcp->order += order; + } else if (lcp->indx == hcp->indx + && !F_ISSET(lcp, H_DELETED)) { + F_SET(lcp, H_DELETED); + lcp->order = order; + } + } + } else if (lcp->indx == hcp->indx) { + /* + * Handle duplicates. This routine is + * only called for on page dups. + * Off page dups are handled by btree/rtree + * code. + */ + if (add) { + lcp->dup_tlen += len; + if (lcp->dup_off == hcp->dup_off + && F_ISSET(hcp, H_DELETED) + && F_ISSET(lcp, H_DELETED)) { + /* Abort of a delete. 
*/ + if (lcp->order == hcp->order) + F_CLR(lcp, H_DELETED); + else if (lcp->order > + hcp->order) { + lcp->order -= + (hcp->order -1); + lcp->dup_off += len; + } + } else if (lcp->dup_off >= hcp->dup_off) + lcp->dup_off += len; + } else { + lcp->dup_tlen -= len; + if (lcp->dup_off > hcp->dup_off) { + lcp->dup_off -= len; + if (lcp->dup_off == hcp->dup_off + && F_ISSET(lcp, H_DELETED)) + lcp->order += order; + } else if (lcp->dup_off == + hcp->dup_off && + !F_ISSET(lcp, H_DELETED)) { + F_SET(lcp, H_DELETED); + lcp->order = order; + } + } + } + } + MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp); + } + MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp); + + if (found != 0 && DB_LOGGING(dbc)) { + if ((ret = __ham_curadj_log(dbenv, + my_txn, &lsn, 0, dbp->log_fileid, hcp->pgno, + hcp->indx, len, hcp->dup_off, add, is_dup, order)) != 0) + return (ret); + } + + return (0); +} + +/* + * __ham_get_clist -- + * + * Get a list of cursors either on a particular bucket or on a particular + * page and index combination. The former is so that we can update + * cursors on a split. The latter is so we can update cursors when we + * move items off page. + * + * PUBLIC: int __ham_get_clist __P((DB *, + * PUBLIC: db_pgno_t, u_int32_t, DBC ***)); + */ +int +__ham_get_clist(dbp, bucket, indx, listp) + DB *dbp; + db_pgno_t bucket; + u_int32_t indx; + DBC ***listp; +{ + DB *ldbp; + DBC *cp; + DB_ENV *dbenv; + int nalloc, nused, ret; + + /* + * Assume that finding anything is the exception, so optimize for + * the case where there aren't any. + */ + nalloc = nused = 0; + *listp = NULL; + dbenv = dbp->dbenv; + + MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp); + for (ldbp = __dblist_get(dbenv, dbp->adj_fileid); + ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid; + ldbp = LIST_NEXT(ldbp, dblistlinks)) { + MUTEX_THREAD_LOCK(dbenv, dbp->mutexp); + for (cp = TAILQ_FIRST(&ldbp->active_queue); cp != NULL; + cp = TAILQ_NEXT(cp, links)) + if (cp->dbtype == DB_HASH && + ((indx == NDX_INVALID && + ((HASH_CURSOR *)(cp->internal))->bucket + == bucket) || (indx != NDX_INVALID && + cp->internal->pgno == bucket && + cp->internal->indx == indx))) { + if (nused >= nalloc) { + nalloc += 10; + if ((ret = __os_realloc(dbp->dbenv, + nalloc * sizeof(HASH_CURSOR *), + NULL, listp)) != 0) + return (ret); + } + (*listp)[nused++] = cp; + } + + MUTEX_THREAD_UNLOCK(dbp->dbenv, dbp->mutexp); + } + MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp); + + if (listp != NULL) { + if (nused >= nalloc) { + nalloc++; + if ((ret = __os_realloc(dbp->dbenv, + nalloc * sizeof(HASH_CURSOR *), NULL, listp)) != 0) + return (ret); + } + (*listp)[nused] = NULL; + } + return (0); +} + +static int +__ham_del_dups(orig_dbc, key) + DBC *orig_dbc; + DBT *key; +{ + DBC *dbc; + DBT data, lkey; + int ret, t_ret; + + /* Allocate a cursor. */ + if ((ret = orig_dbc->c_dup(orig_dbc, &dbc, 0)) != 0) + return (ret); + + /* + * Walk a cursor through the key/data pairs, deleting as we go. Set + * the DB_DBT_USERMEM flag, as this might be a threaded application + * and the flags checking will catch us. We don't actually want the + * keys or data, so request a partial of length 0. + */ + memset(&lkey, 0, sizeof(lkey)); + F_SET(&lkey, DB_DBT_USERMEM | DB_DBT_PARTIAL); + memset(&data, 0, sizeof(data)); + F_SET(&data, DB_DBT_USERMEM | DB_DBT_PARTIAL); + + /* Walk through the set of key/data pairs, deleting as we go. 
*/ + if ((ret = dbc->c_get(dbc, key, &data, DB_SET)) != 0) { + if (ret == DB_NOTFOUND) + ret = 0; + goto err; + } + + for (;;) { + if ((ret = dbc->c_del(dbc, 0)) != 0) + goto err; + if ((ret = dbc->c_get(dbc, &lkey, &data, DB_NEXT_DUP)) != 0) { + if (ret == DB_NOTFOUND) { + ret = 0; + break; + } + goto err; + } + } + +err: /* + * Discard the cursor. This will cause the underlying off-page dup + * tree to go away as well as the actual entry on the page. + */ + if ((t_ret = dbc->c_close(dbc)) != 0 && ret == 0) + ret = t_ret; + + return (ret); + +} + +static int +__ham_c_writelock(dbc) + DBC *dbc; +{ + HASH_CURSOR *hcp; + DB_LOCK tmp_lock; + int ret; + + /* + * All we need do is acquire the lock and let the off-page + * dup tree do its thing. + */ + if (!STD_LOCKING(dbc)) + return (0); + + hcp = (HASH_CURSOR *)dbc->internal; + if ((hcp->lock.off == LOCK_INVALID || hcp->lock_mode == DB_LOCK_READ)) { + tmp_lock = hcp->lock; + if ((ret = __ham_lock_bucket(dbc, DB_LOCK_WRITE)) != 0) + return (ret); + if (tmp_lock.off != LOCK_INVALID && + (ret = lock_put(dbc->dbp->dbenv, &tmp_lock)) != 0) + return (ret); + } + return (0); +} + +/* + * __ham_c_chgpg -- + * + * Adjust the cursors after moving an item from one page to another. + * If the old_index is NDX_INVALID, that means that we copied the + * page wholesale and we're leaving indices intact and just changing + * the page number. + * + * PUBLIC: int __ham_c_chgpg + * PUBLIC: __P((DBC *, db_pgno_t, u_int32_t, db_pgno_t, u_int32_t)); + */ +int +__ham_c_chgpg(dbc, old_pgno, old_index, new_pgno, new_index) + DBC *dbc; + db_pgno_t old_pgno, new_pgno; + u_int32_t old_index, new_index; +{ + DB *dbp, *ldbp; + DB_ENV *dbenv; + DB_LSN lsn; + DB_TXN *my_txn; + DBC *cp; + HASH_CURSOR *hcp; + int found, ret; + + dbp = dbc->dbp; + dbenv = dbp->dbenv; + + my_txn = IS_SUBTRANSACTION(dbc->txn) ? dbc->txn : NULL; + found = 0; + + MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp); + for (ldbp = __dblist_get(dbenv, dbp->adj_fileid); + ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid; + ldbp = LIST_NEXT(ldbp, dblistlinks)) { + MUTEX_THREAD_LOCK(dbenv, dbp->mutexp); + for (cp = TAILQ_FIRST(&ldbp->active_queue); cp != NULL; + cp = TAILQ_NEXT(cp, links)) { + if (cp == dbc || cp->dbtype != DB_HASH) + continue; + + hcp = (HASH_CURSOR *)cp->internal; + if (hcp->pgno == old_pgno) { + if (old_index == NDX_INVALID) { + hcp->pgno = new_pgno; + } else if (hcp->indx == old_index) { + hcp->pgno = new_pgno; + hcp->indx = new_index; + } else + continue; + if (my_txn != NULL && cp->txn != my_txn) + found = 1; + } + } + MUTEX_THREAD_UNLOCK(dbenv, dbp->mutexp); + } + MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp); + + if (found != 0 && DB_LOGGING(dbc)) { + if ((ret = __ham_chgpg_log(dbenv, + my_txn, &lsn, 0, dbp->log_fileid, DB_HAM_CHGPG, + old_pgno, new_pgno, old_index, new_index)) != 0) + return (ret); + } + return (0); +} diff --git a/bdb/hash/hash.src b/bdb/hash/hash.src new file mode 100644 index 00000000000..e6ecd11c907 --- /dev/null +++ b/bdb/hash/hash.src @@ -0,0 +1,361 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1995, 1996 + * Margo Seltzer. All rights reserved. + */ +/* + * Copyright (c) 1995, 1996 + * The President and Fellows of Harvard University. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: hash.src,v 10.24 2000/12/12 17:41:48 bostic Exp $ + */ + +/* + * This is the source file used to create the logging functions for the + * hash package. Each access method (or set of routines wishing to register + * record types with the transaction system) should have a file like this. + * Each type of log record and its parameters is defined. The basic + * format of a record definition is: + * + * BEGIN <RECORD_TYPE> + * ARG|STRING|POINTER <variable name> <variable type> <printf format> + * ... + * END + * ARG the argument is a simple parameter of the type * specified. + * DBT the argument is a DBT (db.h) containing a length and pointer. + * PTR the argument is a pointer to the data type specified; the entire + * type should be logged. + * + * There are a set of shell scripts of the form xxx.sh that generate c + * code and or h files to process these. (This is probably better done + * in a single PERL script, but for now, this works.) + * + * The DB recovery system requires the following three fields appear in + * every record, and will assign them to the per-record-type structures + * as well as making them the first parameters to the appropriate logging + * call. + * rectype: record-type, identifies the structure and log/read call + * txnid: transaction id, a DBT in this implementation + * prev: the last LSN for this transaction + */ + +/* + * Use the argument of PREFIX as the prefix for all record types, + * routines, id numbers, etc. + */ +PREFIX ham + +INCLUDE #include "db_config.h" +INCLUDE +INCLUDE #ifndef NO_SYSTEM_INCLUDES +INCLUDE #include <sys/types.h> +INCLUDE +INCLUDE #include <ctype.h> +INCLUDE #include <errno.h> +INCLUDE #include <string.h> +INCLUDE #endif +INCLUDE +INCLUDE #include "db_int.h" +INCLUDE #include "db_page.h" +INCLUDE #include "db_dispatch.h" +INCLUDE #include "db_am.h" +INCLUDE #include "hash.h" +INCLUDE #include "txn.h" +INCLUDE + +/* + * HASH-insdel: used for hash to insert/delete a pair of entries onto a master + * page. 
The pair might be regular key/data pairs or they might be the + * structures that refer to off page items, duplicates or offpage duplicates. + * opcode - PUTPAIR/DELPAIR + big masks + * fileid - identifies the file referenced + * pgno - page within file + * ndx - index on the page of the item being added (item index) + * pagelsn - lsn on the page before the update + * key - the key being inserted + * data - the data being inserted + */ +BEGIN insdel 21 +ARG opcode u_int32_t lu +ARG fileid int32_t ld +ARG pgno db_pgno_t lu +ARG ndx u_int32_t lu +POINTER pagelsn DB_LSN * lu +DBT key DBT s +DBT data DBT s +END + +/* + * Used to add and remove overflow pages. + * prev_pgno is the previous page that is going to get modified to + * point to this one. If this is the first page in a chain + * then prev_pgno should be PGNO_INVALID. + * new_pgno is the page being allocated. + * next_pgno is the page that follows this one. On allocation, + * this should be PGNO_INVALID. For deletes, it may exist. + * pagelsn is the old lsn on the page. + */ +BEGIN newpage 22 +ARG opcode u_int32_t lu +ARG fileid int32_t ld +ARG prev_pgno db_pgno_t lu +POINTER prevlsn DB_LSN * lu +ARG new_pgno db_pgno_t lu +POINTER pagelsn DB_LSN * lu +ARG next_pgno db_pgno_t lu +POINTER nextlsn DB_LSN * lu +END + +/* + * DEPRECATED in 3.0. + * Superceded by metagroup which allocates a group of new pages. + * + * Splitting requires two types of log messages. The first logs the + * meta-data of the split. + * + * For the meta-data split + * bucket: max_bucket in table before split + * ovflpoint: overflow point before split. + * spares: spares[ovflpoint] before split. + */ +DEPRECATED splitmeta 23 +ARG fileid int32_t ld +ARG bucket u_int32_t lu +ARG ovflpoint u_int32_t lu +ARG spares u_int32_t lu +POINTER metalsn DB_LSN * lu +END + +/* + * Splitting requires two types of log messages. The second logs the + * data on the original page. To redo the split, we have to visit the + * new page (pages) and add the items back on the page if they are not + * yet there. + */ +BEGIN splitdata 24 +ARG fileid int32_t ld +ARG opcode u_int32_t lu +ARG pgno db_pgno_t lu +DBT pageimage DBT s +POINTER pagelsn DB_LSN * lu +END + +/* + * HASH-replace: is used for hash to handle partial puts that only + * affect a single master page. + * fileid - identifies the file referenced + * pgno - page within file + * ndx - index on the page of the item being modified (item index) + * pagelsn - lsn on the page before the update + * off - offset in the old item where the new item is going. + * olditem - DBT that describes the part of the item being replaced. + * newitem - DBT of the new item. + * makedup - this was a replacement that made an item a duplicate. + */ +BEGIN replace 25 +ARG fileid int32_t ld +ARG pgno db_pgno_t lu +ARG ndx u_int32_t lu +POINTER pagelsn DB_LSN * lu +ARG off int32_t ld +DBT olditem DBT s +DBT newitem DBT s +ARG makedup u_int32_t lu +END + +/* + * DEPRECATED in 3.0. + * Hash now uses the btree allocation and deletion page routines. + * + * HASH-newpgno: is used to record getting/deleting a new page number. + * This doesn't require much data modification, just modifying the + * meta-data. + * pgno is the page being allocated/freed. + * free_pgno is the next_pgno on the free list. + * old_type was the type of a page being deallocated. + * old_pgno was the next page number before the deallocation. 
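As a rough sketch of how one of these records is written at run time, the helper below logs a replace (partial put) record before the page is touched. Everything prefixed ex_ is hypothetical and not part of the imported source; the sketch assumes only the __ham_replace_log signature shown later in hash_auto.c, the DB_LOGGING/log_fileid usage visible in hash.c above, and the PGNO()/LSN() page accessors from db_page.h.

    /*
     * ex_log_partial_put --
     *	Sketch: write a replace record describing the bytes at "off" in item
     *	"ndx" of "pagep" being overwritten, then stamp the page with the new
     *	LSN so recovery can compare against it.  Illustrative only.
     */
    static int
    ex_log_partial_put(dbc, pagep, ndx, off, olditem, newitem)
    	DBC *dbc;
    	PAGE *pagep;
    	u_int32_t ndx;
    	int32_t off;
    	DBT *olditem, *newitem;
    {
    	DB *dbp;
    	DB_LSN new_lsn;
    	int ret;

    	dbp = dbc->dbp;
    	if (!DB_LOGGING(dbc))
    		return (0);

    	if ((ret = __ham_replace_log(dbp->dbenv, dbc->txn, &new_lsn, 0,
    	    dbp->log_fileid, PGNO(pagep), ndx, &LSN(pagep),
    	    off, olditem, newitem, 0 /* makedup */)) != 0)
    		return (ret);

    	/* The page must carry the LSN of the record that describes it. */
    	LSN(pagep) = new_lsn;
    	return (0);
    }

The final assignment mirrors the write-ahead-logging convention used throughout these routines: the page is stamped with the LSN of the record that describes its update.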
+ */ +DEPRECATED newpgno 26 +ARG opcode u_int32_t lu +ARG fileid int32_t ld +ARG pgno db_pgno_t lu +ARG free_pgno db_pgno_t lu +ARG old_type u_int32_t lu +ARG old_pgno db_pgno_t lu +ARG new_type u_int32_t lu +POINTER pagelsn DB_LSN * lu +POINTER metalsn DB_LSN * lu +END + +/* + * DEPRECATED in 3.0. + * Since we now pre-allocate the contiguous chunk of pages for a doubling, + * there is no big benefit to pre-allocating a few extra pages. It used + * to be that the file was only physically as large as the current bucket, + * so if you were on a doubling of 16K, but were only on the first bucket + * of that 16K, the file was much shorter than it would be at the end of + * the doubling, so we didn't want to force overflow pages at the end of the + * 16K pages. Since we now must allocate the 16K pages (because of sub + * databases), it's not a big deal to tack extra pages on at the end. + * + * ovfl: initialize a set of overflow pages. + */ +DEPRECATED ovfl 27 +ARG fileid int32_t ld +ARG start_pgno db_pgno_t lu +ARG npages u_int32_t lu +ARG free_pgno db_pgno_t lu +ARG ovflpoint u_int32_t lu +POINTER metalsn DB_LSN * lu +END + +/* + * Used when we empty the first page in a bucket and there are pages after + * it. The page after it gets copied into the bucket page (since bucket + * pages have to be in fixed locations). + * pgno: the bucket page + * pagelsn: the old LSN on the bucket page + * next_pgno: the page number of the next page + * nnext_pgno: page after next_pgno (may need to change its prev) + * nnextlsn: the LSN of nnext_pgno. + */ +BEGIN copypage 28 +ARG fileid int32_t ld +ARG pgno db_pgno_t lu +POINTER pagelsn DB_LSN * lu +ARG next_pgno db_pgno_t lu +POINTER nextlsn DB_LSN * lu +ARG nnext_pgno db_pgno_t lu +POINTER nnextlsn DB_LSN * lu +DBT page DBT s +END + +/* + * This replaces the old splitmeta operation. It behaves largely the same + * way, but it has enough information so that we can record a group allocation + * which we do now because of sub databases. The number of pages allocated is + * always bucket + 1 pgno is the page number of the first newly allocated + * bucket. + * bucket: Old maximum bucket number. + * pgno: Page allocated to bucket + 1 (first newly allocated page) + * metalsn: Lsn of the meta-data page. + * pagelsn: Lsn of the maximum page allocated. + */ +BEGIN metagroup 29 +ARG fileid int32_t ld +ARG bucket u_int32_t lu +ARG pgno db_pgno_t lu +POINTER metalsn DB_LSN * lu +POINTER pagelsn DB_LSN * lu +END + +/* + * groupalloc + * + * This is used in conjunction with MPOOL_NEW_GROUP when we are creating + * a new database to make sure that we recreate or reclaim free pages + * when we allocate a chunk of contiguous ones during database creation. + * + * pgno: meta-data page number + * metalsn: meta-data lsn + * start_pgno: starting page number + * num: number of allocated pages + */ +DEPRECATED groupalloc1 30 +ARG fileid int32_t ld +ARG pgno db_pgno_t lu +POINTER metalsn DB_LSN * lu +POINTER mmetalsn DB_LSN * lu +ARG start_pgno db_pgno_t lu +ARG num u_int32_t lu +END + +DEPRECATED groupalloc2 31 +ARG fileid int32_t ld +POINTER meta_lsn DB_LSN * lu +POINTER alloc_lsn DB_LSN * lu +ARG start_pgno db_pgno_t lu +ARG num u_int32_t lu +ARG free db_pgno_t lu +END + +BEGIN groupalloc 32 +ARG fileid int32_t ld +POINTER meta_lsn DB_LSN * lu +ARG start_pgno db_pgno_t lu +ARG num u_int32_t lu +ARG free db_pgno_t lu +END + +/* + * Records for backing out cursor adjustment. + * curadj - added or deleted a record or a dup + * within a record. 
+ * pgno - page that was affected + * indx - index of the record affected. + * len - if a dup, its length. + * dup_off - if a dup, its offset. + * add - 1 if add, 0 if delete. + * is_dup - 1 if dup, 0 otherwise. + * order - order assigned to this deleted record or dup. + * + * chgpg - removed a page, moving the records to a new page + * mode - CHGPG: page was deleted or records moved to a new page. + * - SPLIT: we split a bucket. + * - DUP: we converted to off-page duplicates. + * old_pgno, new_pgno - old and new page numbers. + * old_index, new_index - old and new index numbers, NDX_INVALID if + * it affects all records on the page. + */ +BEGIN curadj 33 +ARG fileid int32_t ld +ARG pgno db_pgno_t lu +ARG indx u_int32_t lu +ARG len u_int32_t lu +ARG dup_off u_int32_t lu +ARG add int ld +ARG is_dup int ld +ARG order u_int32_t lu +END + +BEGIN chgpg 34 +ARG fileid int32_t ld +ARG mode db_ham_mode ld +ARG old_pgno db_pgno_t lu +ARG new_pgno db_pgno_t lu +ARG old_indx u_int32_t lu +ARG new_indx u_int32_t lu +END + diff --git a/bdb/hash/hash_auto.c b/bdb/hash/hash_auto.c new file mode 100644 index 00000000000..b6faf4f5645 --- /dev/null +++ b/bdb/hash/hash_auto.c @@ -0,0 +1,2023 @@ +/* Do not edit: automatically built by gen_rec.awk. */ +#include "db_config.h" + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <ctype.h> +#include <errno.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_dispatch.h" +#include "db_am.h" +#include "hash.h" +#include "txn.h" + +int +__ham_insdel_log(dbenv, txnid, ret_lsnp, flags, + opcode, fileid, pgno, ndx, pagelsn, key, + data) + DB_ENV *dbenv; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t opcode; + int32_t fileid; + db_pgno_t pgno; + u_int32_t ndx; + DB_LSN * pagelsn; + const DBT *key; + const DBT *data; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t zero; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_ham_insdel; + if (txnid != NULL && + TAILQ_FIRST(&txnid->kids) != NULL && + (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) + return (ret); + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + ZERO_LSN(null_lsn); + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(opcode) + + sizeof(fileid) + + sizeof(pgno) + + sizeof(ndx) + + sizeof(*pagelsn) + + sizeof(u_int32_t) + (key == NULL ? 0 : key->size) + + sizeof(u_int32_t) + (data == NULL ? 
0 : data->size); + if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) + return (ret); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &opcode, sizeof(opcode)); + bp += sizeof(opcode); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + memcpy(bp, &ndx, sizeof(ndx)); + bp += sizeof(ndx); + if (pagelsn != NULL) + memcpy(bp, pagelsn, sizeof(*pagelsn)); + else + memset(bp, 0, sizeof(*pagelsn)); + bp += sizeof(*pagelsn); + if (key == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &key->size, sizeof(key->size)); + bp += sizeof(key->size); + memcpy(bp, key->data, key->size); + bp += key->size; + } + if (data == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &data->size, sizeof(data->size)); + bp += sizeof(data->size); + memcpy(bp, data->data, data->size); + bp += data->size; + } + DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); + ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + __os_free(logrec.data, logrec.size); + return (ret); +} + +int +__ham_insdel_print(dbenv, dbtp, lsnp, notused2, notused3) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __ham_insdel_args *argp; + u_int32_t i; + u_int ch; + int ret; + + i = 0; + ch = 0; + notused2 = DB_TXN_ABORT; + notused3 = NULL; + + if ((ret = __ham_insdel_read(dbenv, dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]ham_insdel: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\topcode: %lu\n", (u_long)argp->opcode); + printf("\tfileid: %ld\n", (long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tndx: %lu\n", (u_long)argp->ndx); + printf("\tpagelsn: [%lu][%lu]\n", + (u_long)argp->pagelsn.file, (u_long)argp->pagelsn.offset); + printf("\tkey: "); + for (i = 0; i < argp->key.size; i++) { + ch = ((u_int8_t *)argp->key.data)[i]; + if (isprint(ch) || ch == 0xa) + putchar(ch); + else + printf("%#x ", ch); + } + printf("\n"); + printf("\tdata: "); + for (i = 0; i < argp->data.size; i++) { + ch = ((u_int8_t *)argp->data.data)[i]; + if (isprint(ch) || ch == 0xa) + putchar(ch); + else + printf("%#x ", ch); + } + printf("\n"); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +int +__ham_insdel_read(dbenv, recbuf, argpp) + DB_ENV *dbenv; + void *recbuf; + __ham_insdel_args **argpp; +{ + __ham_insdel_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(dbenv, sizeof(__ham_insdel_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->opcode, bp, sizeof(argp->opcode)); + bp += sizeof(argp->opcode); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, 
sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->ndx, bp, sizeof(argp->ndx)); + bp += sizeof(argp->ndx); + memcpy(&argp->pagelsn, bp, sizeof(argp->pagelsn)); + bp += sizeof(argp->pagelsn); + memset(&argp->key, 0, sizeof(argp->key)); + memcpy(&argp->key.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->key.data = bp; + bp += argp->key.size; + memset(&argp->data, 0, sizeof(argp->data)); + memcpy(&argp->data.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->data.data = bp; + bp += argp->data.size; + *argpp = argp; + return (0); +} + +int +__ham_newpage_log(dbenv, txnid, ret_lsnp, flags, + opcode, fileid, prev_pgno, prevlsn, new_pgno, pagelsn, + next_pgno, nextlsn) + DB_ENV *dbenv; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + u_int32_t opcode; + int32_t fileid; + db_pgno_t prev_pgno; + DB_LSN * prevlsn; + db_pgno_t new_pgno; + DB_LSN * pagelsn; + db_pgno_t next_pgno; + DB_LSN * nextlsn; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_ham_newpage; + if (txnid != NULL && + TAILQ_FIRST(&txnid->kids) != NULL && + (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) + return (ret); + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + ZERO_LSN(null_lsn); + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(opcode) + + sizeof(fileid) + + sizeof(prev_pgno) + + sizeof(*prevlsn) + + sizeof(new_pgno) + + sizeof(*pagelsn) + + sizeof(next_pgno) + + sizeof(*nextlsn); + if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) + return (ret); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &opcode, sizeof(opcode)); + bp += sizeof(opcode); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &prev_pgno, sizeof(prev_pgno)); + bp += sizeof(prev_pgno); + if (prevlsn != NULL) + memcpy(bp, prevlsn, sizeof(*prevlsn)); + else + memset(bp, 0, sizeof(*prevlsn)); + bp += sizeof(*prevlsn); + memcpy(bp, &new_pgno, sizeof(new_pgno)); + bp += sizeof(new_pgno); + if (pagelsn != NULL) + memcpy(bp, pagelsn, sizeof(*pagelsn)); + else + memset(bp, 0, sizeof(*pagelsn)); + bp += sizeof(*pagelsn); + memcpy(bp, &next_pgno, sizeof(next_pgno)); + bp += sizeof(next_pgno); + if (nextlsn != NULL) + memcpy(bp, nextlsn, sizeof(*nextlsn)); + else + memset(bp, 0, sizeof(*nextlsn)); + bp += sizeof(*nextlsn); + DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); + ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + __os_free(logrec.data, logrec.size); + return (ret); +} + +int +__ham_newpage_print(dbenv, dbtp, lsnp, notused2, notused3) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __ham_newpage_args *argp; + u_int32_t i; + u_int ch; + int ret; + + i = 0; + ch = 0; + notused2 = DB_TXN_ABORT; + notused3 = NULL; + + if ((ret = __ham_newpage_read(dbenv, dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]ham_newpage: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\topcode: %lu\n", (u_long)argp->opcode); + 
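	/*
	 * The remaining fields mirror the newpage record defined in hash.src:
	 * the file id, then the previous, new and next page numbers, each
	 * paired with the LSN that page carried before the operation.  These
	 * _print routines are the ones __ham_init_print registers at the end
	 * of this file so that a log dump can render every hash record type.
	 */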
printf("\tfileid: %ld\n", (long)argp->fileid); + printf("\tprev_pgno: %lu\n", (u_long)argp->prev_pgno); + printf("\tprevlsn: [%lu][%lu]\n", + (u_long)argp->prevlsn.file, (u_long)argp->prevlsn.offset); + printf("\tnew_pgno: %lu\n", (u_long)argp->new_pgno); + printf("\tpagelsn: [%lu][%lu]\n", + (u_long)argp->pagelsn.file, (u_long)argp->pagelsn.offset); + printf("\tnext_pgno: %lu\n", (u_long)argp->next_pgno); + printf("\tnextlsn: [%lu][%lu]\n", + (u_long)argp->nextlsn.file, (u_long)argp->nextlsn.offset); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +int +__ham_newpage_read(dbenv, recbuf, argpp) + DB_ENV *dbenv; + void *recbuf; + __ham_newpage_args **argpp; +{ + __ham_newpage_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(dbenv, sizeof(__ham_newpage_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->opcode, bp, sizeof(argp->opcode)); + bp += sizeof(argp->opcode); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->prev_pgno, bp, sizeof(argp->prev_pgno)); + bp += sizeof(argp->prev_pgno); + memcpy(&argp->prevlsn, bp, sizeof(argp->prevlsn)); + bp += sizeof(argp->prevlsn); + memcpy(&argp->new_pgno, bp, sizeof(argp->new_pgno)); + bp += sizeof(argp->new_pgno); + memcpy(&argp->pagelsn, bp, sizeof(argp->pagelsn)); + bp += sizeof(argp->pagelsn); + memcpy(&argp->next_pgno, bp, sizeof(argp->next_pgno)); + bp += sizeof(argp->next_pgno); + memcpy(&argp->nextlsn, bp, sizeof(argp->nextlsn)); + bp += sizeof(argp->nextlsn); + *argpp = argp; + return (0); +} + +int +__ham_splitmeta_print(dbenv, dbtp, lsnp, notused2, notused3) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __ham_splitmeta_args *argp; + u_int32_t i; + u_int ch; + int ret; + + i = 0; + ch = 0; + notused2 = DB_TXN_ABORT; + notused3 = NULL; + + if ((ret = __ham_splitmeta_read(dbenv, dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]ham_splitmeta: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %ld\n", (long)argp->fileid); + printf("\tbucket: %lu\n", (u_long)argp->bucket); + printf("\tovflpoint: %lu\n", (u_long)argp->ovflpoint); + printf("\tspares: %lu\n", (u_long)argp->spares); + printf("\tmetalsn: [%lu][%lu]\n", + (u_long)argp->metalsn.file, (u_long)argp->metalsn.offset); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +int +__ham_splitmeta_read(dbenv, recbuf, argpp) + DB_ENV *dbenv; + void *recbuf; + __ham_splitmeta_args **argpp; +{ + __ham_splitmeta_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(dbenv, sizeof(__ham_splitmeta_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->bucket, bp, 
sizeof(argp->bucket)); + bp += sizeof(argp->bucket); + memcpy(&argp->ovflpoint, bp, sizeof(argp->ovflpoint)); + bp += sizeof(argp->ovflpoint); + memcpy(&argp->spares, bp, sizeof(argp->spares)); + bp += sizeof(argp->spares); + memcpy(&argp->metalsn, bp, sizeof(argp->metalsn)); + bp += sizeof(argp->metalsn); + *argpp = argp; + return (0); +} + +int +__ham_splitdata_log(dbenv, txnid, ret_lsnp, flags, + fileid, opcode, pgno, pageimage, pagelsn) + DB_ENV *dbenv; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + int32_t fileid; + u_int32_t opcode; + db_pgno_t pgno; + const DBT *pageimage; + DB_LSN * pagelsn; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t zero; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_ham_splitdata; + if (txnid != NULL && + TAILQ_FIRST(&txnid->kids) != NULL && + (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) + return (ret); + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + ZERO_LSN(null_lsn); + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(opcode) + + sizeof(pgno) + + sizeof(u_int32_t) + (pageimage == NULL ? 0 : pageimage->size) + + sizeof(*pagelsn); + if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) + return (ret); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &opcode, sizeof(opcode)); + bp += sizeof(opcode); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + if (pageimage == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &pageimage->size, sizeof(pageimage->size)); + bp += sizeof(pageimage->size); + memcpy(bp, pageimage->data, pageimage->size); + bp += pageimage->size; + } + if (pagelsn != NULL) + memcpy(bp, pagelsn, sizeof(*pagelsn)); + else + memset(bp, 0, sizeof(*pagelsn)); + bp += sizeof(*pagelsn); + DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); + ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + __os_free(logrec.data, logrec.size); + return (ret); +} + +int +__ham_splitdata_print(dbenv, dbtp, lsnp, notused2, notused3) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __ham_splitdata_args *argp; + u_int32_t i; + u_int ch; + int ret; + + i = 0; + ch = 0; + notused2 = DB_TXN_ABORT; + notused3 = NULL; + + if ((ret = __ham_splitdata_read(dbenv, dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]ham_splitdata: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %ld\n", (long)argp->fileid); + printf("\topcode: %lu\n", (u_long)argp->opcode); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tpageimage: "); + for (i = 0; i < argp->pageimage.size; i++) { + ch = ((u_int8_t *)argp->pageimage.data)[i]; + if (isprint(ch) || ch == 0xa) + putchar(ch); + else + printf("%#x ", ch); + } + printf("\n"); + printf("\tpagelsn: [%lu][%lu]\n", + (u_long)argp->pagelsn.file, (u_long)argp->pagelsn.offset); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +int 
+__ham_splitdata_read(dbenv, recbuf, argpp) + DB_ENV *dbenv; + void *recbuf; + __ham_splitdata_args **argpp; +{ + __ham_splitdata_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(dbenv, sizeof(__ham_splitdata_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->opcode, bp, sizeof(argp->opcode)); + bp += sizeof(argp->opcode); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memset(&argp->pageimage, 0, sizeof(argp->pageimage)); + memcpy(&argp->pageimage.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->pageimage.data = bp; + bp += argp->pageimage.size; + memcpy(&argp->pagelsn, bp, sizeof(argp->pagelsn)); + bp += sizeof(argp->pagelsn); + *argpp = argp; + return (0); +} + +int +__ham_replace_log(dbenv, txnid, ret_lsnp, flags, + fileid, pgno, ndx, pagelsn, off, olditem, + newitem, makedup) + DB_ENV *dbenv; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + int32_t fileid; + db_pgno_t pgno; + u_int32_t ndx; + DB_LSN * pagelsn; + int32_t off; + const DBT *olditem; + const DBT *newitem; + u_int32_t makedup; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t zero; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_ham_replace; + if (txnid != NULL && + TAILQ_FIRST(&txnid->kids) != NULL && + (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) + return (ret); + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + ZERO_LSN(null_lsn); + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(pgno) + + sizeof(ndx) + + sizeof(*pagelsn) + + sizeof(off) + + sizeof(u_int32_t) + (olditem == NULL ? 0 : olditem->size) + + sizeof(u_int32_t) + (newitem == NULL ? 
0 : newitem->size) + + sizeof(makedup); + if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) + return (ret); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + memcpy(bp, &ndx, sizeof(ndx)); + bp += sizeof(ndx); + if (pagelsn != NULL) + memcpy(bp, pagelsn, sizeof(*pagelsn)); + else + memset(bp, 0, sizeof(*pagelsn)); + bp += sizeof(*pagelsn); + memcpy(bp, &off, sizeof(off)); + bp += sizeof(off); + if (olditem == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &olditem->size, sizeof(olditem->size)); + bp += sizeof(olditem->size); + memcpy(bp, olditem->data, olditem->size); + bp += olditem->size; + } + if (newitem == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &newitem->size, sizeof(newitem->size)); + bp += sizeof(newitem->size); + memcpy(bp, newitem->data, newitem->size); + bp += newitem->size; + } + memcpy(bp, &makedup, sizeof(makedup)); + bp += sizeof(makedup); + DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); + ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + __os_free(logrec.data, logrec.size); + return (ret); +} + +int +__ham_replace_print(dbenv, dbtp, lsnp, notused2, notused3) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __ham_replace_args *argp; + u_int32_t i; + u_int ch; + int ret; + + i = 0; + ch = 0; + notused2 = DB_TXN_ABORT; + notused3 = NULL; + + if ((ret = __ham_replace_read(dbenv, dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]ham_replace: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %ld\n", (long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tndx: %lu\n", (u_long)argp->ndx); + printf("\tpagelsn: [%lu][%lu]\n", + (u_long)argp->pagelsn.file, (u_long)argp->pagelsn.offset); + printf("\toff: %ld\n", (long)argp->off); + printf("\tolditem: "); + for (i = 0; i < argp->olditem.size; i++) { + ch = ((u_int8_t *)argp->olditem.data)[i]; + if (isprint(ch) || ch == 0xa) + putchar(ch); + else + printf("%#x ", ch); + } + printf("\n"); + printf("\tnewitem: "); + for (i = 0; i < argp->newitem.size; i++) { + ch = ((u_int8_t *)argp->newitem.data)[i]; + if (isprint(ch) || ch == 0xa) + putchar(ch); + else + printf("%#x ", ch); + } + printf("\n"); + printf("\tmakedup: %lu\n", (u_long)argp->makedup); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +int +__ham_replace_read(dbenv, recbuf, argpp) + DB_ENV *dbenv; + void *recbuf; + __ham_replace_args **argpp; +{ + __ham_replace_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(dbenv, sizeof(__ham_replace_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + 
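	/*
	 * Everything unmarshalled so far is the header shared by every log
	 * record type: the record type, the transaction id and the previous
	 * LSN for that transaction (see the notes at the top of hash.src).
	 * The fields below follow the order of the replace definition there.
	 */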
memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->ndx, bp, sizeof(argp->ndx)); + bp += sizeof(argp->ndx); + memcpy(&argp->pagelsn, bp, sizeof(argp->pagelsn)); + bp += sizeof(argp->pagelsn); + memcpy(&argp->off, bp, sizeof(argp->off)); + bp += sizeof(argp->off); + memset(&argp->olditem, 0, sizeof(argp->olditem)); + memcpy(&argp->olditem.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->olditem.data = bp; + bp += argp->olditem.size; + memset(&argp->newitem, 0, sizeof(argp->newitem)); + memcpy(&argp->newitem.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->newitem.data = bp; + bp += argp->newitem.size; + memcpy(&argp->makedup, bp, sizeof(argp->makedup)); + bp += sizeof(argp->makedup); + *argpp = argp; + return (0); +} + +int +__ham_newpgno_print(dbenv, dbtp, lsnp, notused2, notused3) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __ham_newpgno_args *argp; + u_int32_t i; + u_int ch; + int ret; + + i = 0; + ch = 0; + notused2 = DB_TXN_ABORT; + notused3 = NULL; + + if ((ret = __ham_newpgno_read(dbenv, dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]ham_newpgno: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\topcode: %lu\n", (u_long)argp->opcode); + printf("\tfileid: %ld\n", (long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tfree_pgno: %lu\n", (u_long)argp->free_pgno); + printf("\told_type: %lu\n", (u_long)argp->old_type); + printf("\told_pgno: %lu\n", (u_long)argp->old_pgno); + printf("\tnew_type: %lu\n", (u_long)argp->new_type); + printf("\tpagelsn: [%lu][%lu]\n", + (u_long)argp->pagelsn.file, (u_long)argp->pagelsn.offset); + printf("\tmetalsn: [%lu][%lu]\n", + (u_long)argp->metalsn.file, (u_long)argp->metalsn.offset); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +int +__ham_newpgno_read(dbenv, recbuf, argpp) + DB_ENV *dbenv; + void *recbuf; + __ham_newpgno_args **argpp; +{ + __ham_newpgno_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(dbenv, sizeof(__ham_newpgno_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->opcode, bp, sizeof(argp->opcode)); + bp += sizeof(argp->opcode); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->free_pgno, bp, sizeof(argp->free_pgno)); + bp += sizeof(argp->free_pgno); + memcpy(&argp->old_type, bp, sizeof(argp->old_type)); + bp += sizeof(argp->old_type); + memcpy(&argp->old_pgno, bp, sizeof(argp->old_pgno)); + bp += sizeof(argp->old_pgno); + memcpy(&argp->new_type, bp, sizeof(argp->new_type)); + bp += sizeof(argp->new_type); + memcpy(&argp->pagelsn, bp, sizeof(argp->pagelsn)); + bp += sizeof(argp->pagelsn); + memcpy(&argp->metalsn, bp, sizeof(argp->metalsn)); + bp += sizeof(argp->metalsn); + *argpp = argp; + return (0); +} + +int +__ham_ovfl_print(dbenv, dbtp, lsnp, notused2, notused3) + DB_ENV 
*dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __ham_ovfl_args *argp; + u_int32_t i; + u_int ch; + int ret; + + i = 0; + ch = 0; + notused2 = DB_TXN_ABORT; + notused3 = NULL; + + if ((ret = __ham_ovfl_read(dbenv, dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]ham_ovfl: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %ld\n", (long)argp->fileid); + printf("\tstart_pgno: %lu\n", (u_long)argp->start_pgno); + printf("\tnpages: %lu\n", (u_long)argp->npages); + printf("\tfree_pgno: %lu\n", (u_long)argp->free_pgno); + printf("\tovflpoint: %lu\n", (u_long)argp->ovflpoint); + printf("\tmetalsn: [%lu][%lu]\n", + (u_long)argp->metalsn.file, (u_long)argp->metalsn.offset); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +int +__ham_ovfl_read(dbenv, recbuf, argpp) + DB_ENV *dbenv; + void *recbuf; + __ham_ovfl_args **argpp; +{ + __ham_ovfl_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(dbenv, sizeof(__ham_ovfl_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->start_pgno, bp, sizeof(argp->start_pgno)); + bp += sizeof(argp->start_pgno); + memcpy(&argp->npages, bp, sizeof(argp->npages)); + bp += sizeof(argp->npages); + memcpy(&argp->free_pgno, bp, sizeof(argp->free_pgno)); + bp += sizeof(argp->free_pgno); + memcpy(&argp->ovflpoint, bp, sizeof(argp->ovflpoint)); + bp += sizeof(argp->ovflpoint); + memcpy(&argp->metalsn, bp, sizeof(argp->metalsn)); + bp += sizeof(argp->metalsn); + *argpp = argp; + return (0); +} + +int +__ham_copypage_log(dbenv, txnid, ret_lsnp, flags, + fileid, pgno, pagelsn, next_pgno, nextlsn, nnext_pgno, + nnextlsn, page) + DB_ENV *dbenv; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + int32_t fileid; + db_pgno_t pgno; + DB_LSN * pagelsn; + db_pgno_t next_pgno; + DB_LSN * nextlsn; + db_pgno_t nnext_pgno; + DB_LSN * nnextlsn; + const DBT *page; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t zero; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_ham_copypage; + if (txnid != NULL && + TAILQ_FIRST(&txnid->kids) != NULL && + (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) + return (ret); + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + ZERO_LSN(null_lsn); + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(pgno) + + sizeof(*pagelsn) + + sizeof(next_pgno) + + sizeof(*nextlsn) + + sizeof(nnext_pgno) + + sizeof(*nnextlsn) + + sizeof(u_int32_t) + (page == NULL ? 
0 : page->size); + if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) + return (ret); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + if (pagelsn != NULL) + memcpy(bp, pagelsn, sizeof(*pagelsn)); + else + memset(bp, 0, sizeof(*pagelsn)); + bp += sizeof(*pagelsn); + memcpy(bp, &next_pgno, sizeof(next_pgno)); + bp += sizeof(next_pgno); + if (nextlsn != NULL) + memcpy(bp, nextlsn, sizeof(*nextlsn)); + else + memset(bp, 0, sizeof(*nextlsn)); + bp += sizeof(*nextlsn); + memcpy(bp, &nnext_pgno, sizeof(nnext_pgno)); + bp += sizeof(nnext_pgno); + if (nnextlsn != NULL) + memcpy(bp, nnextlsn, sizeof(*nnextlsn)); + else + memset(bp, 0, sizeof(*nnextlsn)); + bp += sizeof(*nnextlsn); + if (page == NULL) { + zero = 0; + memcpy(bp, &zero, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + } else { + memcpy(bp, &page->size, sizeof(page->size)); + bp += sizeof(page->size); + memcpy(bp, page->data, page->size); + bp += page->size; + } + DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); + ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + __os_free(logrec.data, logrec.size); + return (ret); +} + +int +__ham_copypage_print(dbenv, dbtp, lsnp, notused2, notused3) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __ham_copypage_args *argp; + u_int32_t i; + u_int ch; + int ret; + + i = 0; + ch = 0; + notused2 = DB_TXN_ABORT; + notused3 = NULL; + + if ((ret = __ham_copypage_read(dbenv, dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]ham_copypage: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %ld\n", (long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tpagelsn: [%lu][%lu]\n", + (u_long)argp->pagelsn.file, (u_long)argp->pagelsn.offset); + printf("\tnext_pgno: %lu\n", (u_long)argp->next_pgno); + printf("\tnextlsn: [%lu][%lu]\n", + (u_long)argp->nextlsn.file, (u_long)argp->nextlsn.offset); + printf("\tnnext_pgno: %lu\n", (u_long)argp->nnext_pgno); + printf("\tnnextlsn: [%lu][%lu]\n", + (u_long)argp->nnextlsn.file, (u_long)argp->nnextlsn.offset); + printf("\tpage: "); + for (i = 0; i < argp->page.size; i++) { + ch = ((u_int8_t *)argp->page.data)[i]; + if (isprint(ch) || ch == 0xa) + putchar(ch); + else + printf("%#x ", ch); + } + printf("\n"); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +int +__ham_copypage_read(dbenv, recbuf, argpp) + DB_ENV *dbenv; + void *recbuf; + __ham_copypage_args **argpp; +{ + __ham_copypage_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(dbenv, sizeof(__ham_copypage_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, 
sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->pagelsn, bp, sizeof(argp->pagelsn)); + bp += sizeof(argp->pagelsn); + memcpy(&argp->next_pgno, bp, sizeof(argp->next_pgno)); + bp += sizeof(argp->next_pgno); + memcpy(&argp->nextlsn, bp, sizeof(argp->nextlsn)); + bp += sizeof(argp->nextlsn); + memcpy(&argp->nnext_pgno, bp, sizeof(argp->nnext_pgno)); + bp += sizeof(argp->nnext_pgno); + memcpy(&argp->nnextlsn, bp, sizeof(argp->nnextlsn)); + bp += sizeof(argp->nnextlsn); + memset(&argp->page, 0, sizeof(argp->page)); + memcpy(&argp->page.size, bp, sizeof(u_int32_t)); + bp += sizeof(u_int32_t); + argp->page.data = bp; + bp += argp->page.size; + *argpp = argp; + return (0); +} + +int +__ham_metagroup_log(dbenv, txnid, ret_lsnp, flags, + fileid, bucket, pgno, metalsn, pagelsn) + DB_ENV *dbenv; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + int32_t fileid; + u_int32_t bucket; + db_pgno_t pgno; + DB_LSN * metalsn; + DB_LSN * pagelsn; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_ham_metagroup; + if (txnid != NULL && + TAILQ_FIRST(&txnid->kids) != NULL && + (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) + return (ret); + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + ZERO_LSN(null_lsn); + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(bucket) + + sizeof(pgno) + + sizeof(*metalsn) + + sizeof(*pagelsn); + if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) + return (ret); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &bucket, sizeof(bucket)); + bp += sizeof(bucket); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + if (metalsn != NULL) + memcpy(bp, metalsn, sizeof(*metalsn)); + else + memset(bp, 0, sizeof(*metalsn)); + bp += sizeof(*metalsn); + if (pagelsn != NULL) + memcpy(bp, pagelsn, sizeof(*pagelsn)); + else + memset(bp, 0, sizeof(*pagelsn)); + bp += sizeof(*pagelsn); + DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); + ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + __os_free(logrec.data, logrec.size); + return (ret); +} + +int +__ham_metagroup_print(dbenv, dbtp, lsnp, notused2, notused3) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __ham_metagroup_args *argp; + u_int32_t i; + u_int ch; + int ret; + + i = 0; + ch = 0; + notused2 = DB_TXN_ABORT; + notused3 = NULL; + + if ((ret = __ham_metagroup_read(dbenv, dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]ham_metagroup: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %ld\n", (long)argp->fileid); + printf("\tbucket: %lu\n", (u_long)argp->bucket); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tmetalsn: [%lu][%lu]\n", + (u_long)argp->metalsn.file, (u_long)argp->metalsn.offset); + printf("\tpagelsn: [%lu][%lu]\n", + (u_long)argp->pagelsn.file, (u_long)argp->pagelsn.offset); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +int 
+__ham_metagroup_read(dbenv, recbuf, argpp) + DB_ENV *dbenv; + void *recbuf; + __ham_metagroup_args **argpp; +{ + __ham_metagroup_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(dbenv, sizeof(__ham_metagroup_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->bucket, bp, sizeof(argp->bucket)); + bp += sizeof(argp->bucket); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->metalsn, bp, sizeof(argp->metalsn)); + bp += sizeof(argp->metalsn); + memcpy(&argp->pagelsn, bp, sizeof(argp->pagelsn)); + bp += sizeof(argp->pagelsn); + *argpp = argp; + return (0); +} + +int +__ham_groupalloc1_print(dbenv, dbtp, lsnp, notused2, notused3) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __ham_groupalloc1_args *argp; + u_int32_t i; + u_int ch; + int ret; + + i = 0; + ch = 0; + notused2 = DB_TXN_ABORT; + notused3 = NULL; + + if ((ret = __ham_groupalloc1_read(dbenv, dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]ham_groupalloc1: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %ld\n", (long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tmetalsn: [%lu][%lu]\n", + (u_long)argp->metalsn.file, (u_long)argp->metalsn.offset); + printf("\tmmetalsn: [%lu][%lu]\n", + (u_long)argp->mmetalsn.file, (u_long)argp->mmetalsn.offset); + printf("\tstart_pgno: %lu\n", (u_long)argp->start_pgno); + printf("\tnum: %lu\n", (u_long)argp->num); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +int +__ham_groupalloc1_read(dbenv, recbuf, argpp) + DB_ENV *dbenv; + void *recbuf; + __ham_groupalloc1_args **argpp; +{ + __ham_groupalloc1_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(dbenv, sizeof(__ham_groupalloc1_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->metalsn, bp, sizeof(argp->metalsn)); + bp += sizeof(argp->metalsn); + memcpy(&argp->mmetalsn, bp, sizeof(argp->mmetalsn)); + bp += sizeof(argp->mmetalsn); + memcpy(&argp->start_pgno, bp, sizeof(argp->start_pgno)); + bp += sizeof(argp->start_pgno); + memcpy(&argp->num, bp, sizeof(argp->num)); + bp += sizeof(argp->num); + *argpp = argp; + return (0); +} + +int +__ham_groupalloc2_print(dbenv, dbtp, lsnp, notused2, notused3) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __ham_groupalloc2_args *argp; + u_int32_t i; + u_int ch; + int ret; + + i = 0; + ch = 0; + notused2 = DB_TXN_ABORT; + notused3 = NULL; + + if ((ret = 
__ham_groupalloc2_read(dbenv, dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]ham_groupalloc2: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %ld\n", (long)argp->fileid); + printf("\tmeta_lsn: [%lu][%lu]\n", + (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset); + printf("\talloc_lsn: [%lu][%lu]\n", + (u_long)argp->alloc_lsn.file, (u_long)argp->alloc_lsn.offset); + printf("\tstart_pgno: %lu\n", (u_long)argp->start_pgno); + printf("\tnum: %lu\n", (u_long)argp->num); + printf("\tfree: %lu\n", (u_long)argp->free); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +int +__ham_groupalloc2_read(dbenv, recbuf, argpp) + DB_ENV *dbenv; + void *recbuf; + __ham_groupalloc2_args **argpp; +{ + __ham_groupalloc2_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(dbenv, sizeof(__ham_groupalloc2_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->meta_lsn, bp, sizeof(argp->meta_lsn)); + bp += sizeof(argp->meta_lsn); + memcpy(&argp->alloc_lsn, bp, sizeof(argp->alloc_lsn)); + bp += sizeof(argp->alloc_lsn); + memcpy(&argp->start_pgno, bp, sizeof(argp->start_pgno)); + bp += sizeof(argp->start_pgno); + memcpy(&argp->num, bp, sizeof(argp->num)); + bp += sizeof(argp->num); + memcpy(&argp->free, bp, sizeof(argp->free)); + bp += sizeof(argp->free); + *argpp = argp; + return (0); +} + +int +__ham_groupalloc_log(dbenv, txnid, ret_lsnp, flags, + fileid, meta_lsn, start_pgno, num, free) + DB_ENV *dbenv; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + int32_t fileid; + DB_LSN * meta_lsn; + db_pgno_t start_pgno; + u_int32_t num; + db_pgno_t free; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_ham_groupalloc; + if (txnid != NULL && + TAILQ_FIRST(&txnid->kids) != NULL && + (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) + return (ret); + txn_num = txnid == NULL ? 
0 : txnid->txnid; + if (txnid == NULL) { + ZERO_LSN(null_lsn); + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(*meta_lsn) + + sizeof(start_pgno) + + sizeof(num) + + sizeof(free); + if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) + return (ret); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + if (meta_lsn != NULL) + memcpy(bp, meta_lsn, sizeof(*meta_lsn)); + else + memset(bp, 0, sizeof(*meta_lsn)); + bp += sizeof(*meta_lsn); + memcpy(bp, &start_pgno, sizeof(start_pgno)); + bp += sizeof(start_pgno); + memcpy(bp, &num, sizeof(num)); + bp += sizeof(num); + memcpy(bp, &free, sizeof(free)); + bp += sizeof(free); + DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); + ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + __os_free(logrec.data, logrec.size); + return (ret); +} + +int +__ham_groupalloc_print(dbenv, dbtp, lsnp, notused2, notused3) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __ham_groupalloc_args *argp; + u_int32_t i; + u_int ch; + int ret; + + i = 0; + ch = 0; + notused2 = DB_TXN_ABORT; + notused3 = NULL; + + if ((ret = __ham_groupalloc_read(dbenv, dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]ham_groupalloc: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %ld\n", (long)argp->fileid); + printf("\tmeta_lsn: [%lu][%lu]\n", + (u_long)argp->meta_lsn.file, (u_long)argp->meta_lsn.offset); + printf("\tstart_pgno: %lu\n", (u_long)argp->start_pgno); + printf("\tnum: %lu\n", (u_long)argp->num); + printf("\tfree: %lu\n", (u_long)argp->free); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +int +__ham_groupalloc_read(dbenv, recbuf, argpp) + DB_ENV *dbenv; + void *recbuf; + __ham_groupalloc_args **argpp; +{ + __ham_groupalloc_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(dbenv, sizeof(__ham_groupalloc_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->meta_lsn, bp, sizeof(argp->meta_lsn)); + bp += sizeof(argp->meta_lsn); + memcpy(&argp->start_pgno, bp, sizeof(argp->start_pgno)); + bp += sizeof(argp->start_pgno); + memcpy(&argp->num, bp, sizeof(argp->num)); + bp += sizeof(argp->num); + memcpy(&argp->free, bp, sizeof(argp->free)); + bp += sizeof(argp->free); + *argpp = argp; + return (0); +} + +int +__ham_curadj_log(dbenv, txnid, ret_lsnp, flags, + fileid, pgno, indx, len, dup_off, add, + is_dup, order) + DB_ENV *dbenv; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + int32_t fileid; + db_pgno_t pgno; + u_int32_t indx; + u_int32_t len; + u_int32_t dup_off; + int add; + int is_dup; + u_int32_t order; +{ 
+ DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_ham_curadj; + if (txnid != NULL && + TAILQ_FIRST(&txnid->kids) != NULL && + (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) + return (ret); + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + ZERO_LSN(null_lsn); + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(pgno) + + sizeof(indx) + + sizeof(len) + + sizeof(dup_off) + + sizeof(add) + + sizeof(is_dup) + + sizeof(order); + if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) + return (ret); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &pgno, sizeof(pgno)); + bp += sizeof(pgno); + memcpy(bp, &indx, sizeof(indx)); + bp += sizeof(indx); + memcpy(bp, &len, sizeof(len)); + bp += sizeof(len); + memcpy(bp, &dup_off, sizeof(dup_off)); + bp += sizeof(dup_off); + memcpy(bp, &add, sizeof(add)); + bp += sizeof(add); + memcpy(bp, &is_dup, sizeof(is_dup)); + bp += sizeof(is_dup); + memcpy(bp, &order, sizeof(order)); + bp += sizeof(order); + DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); + ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + __os_free(logrec.data, logrec.size); + return (ret); +} + +int +__ham_curadj_print(dbenv, dbtp, lsnp, notused2, notused3) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __ham_curadj_args *argp; + u_int32_t i; + u_int ch; + int ret; + + i = 0; + ch = 0; + notused2 = DB_TXN_ABORT; + notused3 = NULL; + + if ((ret = __ham_curadj_read(dbenv, dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]ham_curadj: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %ld\n", (long)argp->fileid); + printf("\tpgno: %lu\n", (u_long)argp->pgno); + printf("\tindx: %lu\n", (u_long)argp->indx); + printf("\tlen: %lu\n", (u_long)argp->len); + printf("\tdup_off: %lu\n", (u_long)argp->dup_off); + printf("\tadd: %ld\n", (long)argp->add); + printf("\tis_dup: %ld\n", (long)argp->is_dup); + printf("\torder: %lu\n", (u_long)argp->order); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +int +__ham_curadj_read(dbenv, recbuf, argpp) + DB_ENV *dbenv; + void *recbuf; + __ham_curadj_args **argpp; +{ + __ham_curadj_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(dbenv, sizeof(__ham_curadj_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->pgno, bp, sizeof(argp->pgno)); + bp += sizeof(argp->pgno); + memcpy(&argp->indx, bp, sizeof(argp->indx)); + bp += sizeof(argp->indx); + memcpy(&argp->len, bp, sizeof(argp->len)); + bp += 
sizeof(argp->len); + memcpy(&argp->dup_off, bp, sizeof(argp->dup_off)); + bp += sizeof(argp->dup_off); + memcpy(&argp->add, bp, sizeof(argp->add)); + bp += sizeof(argp->add); + memcpy(&argp->is_dup, bp, sizeof(argp->is_dup)); + bp += sizeof(argp->is_dup); + memcpy(&argp->order, bp, sizeof(argp->order)); + bp += sizeof(argp->order); + *argpp = argp; + return (0); +} + +int +__ham_chgpg_log(dbenv, txnid, ret_lsnp, flags, + fileid, mode, old_pgno, new_pgno, old_indx, new_indx) + DB_ENV *dbenv; + DB_TXN *txnid; + DB_LSN *ret_lsnp; + u_int32_t flags; + int32_t fileid; + db_ham_mode mode; + db_pgno_t old_pgno; + db_pgno_t new_pgno; + u_int32_t old_indx; + u_int32_t new_indx; +{ + DBT logrec; + DB_LSN *lsnp, null_lsn; + u_int32_t rectype, txn_num; + int ret; + u_int8_t *bp; + + rectype = DB_ham_chgpg; + if (txnid != NULL && + TAILQ_FIRST(&txnid->kids) != NULL && + (ret = __txn_activekids(dbenv, rectype, txnid)) != 0) + return (ret); + txn_num = txnid == NULL ? 0 : txnid->txnid; + if (txnid == NULL) { + ZERO_LSN(null_lsn); + lsnp = &null_lsn; + } else + lsnp = &txnid->last_lsn; + logrec.size = sizeof(rectype) + sizeof(txn_num) + sizeof(DB_LSN) + + sizeof(fileid) + + sizeof(mode) + + sizeof(old_pgno) + + sizeof(new_pgno) + + sizeof(old_indx) + + sizeof(new_indx); + if ((ret = __os_malloc(dbenv, logrec.size, NULL, &logrec.data)) != 0) + return (ret); + + bp = logrec.data; + memcpy(bp, &rectype, sizeof(rectype)); + bp += sizeof(rectype); + memcpy(bp, &txn_num, sizeof(txn_num)); + bp += sizeof(txn_num); + memcpy(bp, lsnp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(bp, &fileid, sizeof(fileid)); + bp += sizeof(fileid); + memcpy(bp, &mode, sizeof(mode)); + bp += sizeof(mode); + memcpy(bp, &old_pgno, sizeof(old_pgno)); + bp += sizeof(old_pgno); + memcpy(bp, &new_pgno, sizeof(new_pgno)); + bp += sizeof(new_pgno); + memcpy(bp, &old_indx, sizeof(old_indx)); + bp += sizeof(old_indx); + memcpy(bp, &new_indx, sizeof(new_indx)); + bp += sizeof(new_indx); + DB_ASSERT((u_int32_t)(bp - (u_int8_t *)logrec.data) == logrec.size); + ret = log_put(dbenv, ret_lsnp, (DBT *)&logrec, flags); + if (txnid != NULL) + txnid->last_lsn = *ret_lsnp; + __os_free(logrec.data, logrec.size); + return (ret); +} + +int +__ham_chgpg_print(dbenv, dbtp, lsnp, notused2, notused3) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops notused2; + void *notused3; +{ + __ham_chgpg_args *argp; + u_int32_t i; + u_int ch; + int ret; + + i = 0; + ch = 0; + notused2 = DB_TXN_ABORT; + notused3 = NULL; + + if ((ret = __ham_chgpg_read(dbenv, dbtp->data, &argp)) != 0) + return (ret); + printf("[%lu][%lu]ham_chgpg: rec: %lu txnid %lx prevlsn [%lu][%lu]\n", + (u_long)lsnp->file, + (u_long)lsnp->offset, + (u_long)argp->type, + (u_long)argp->txnid->txnid, + (u_long)argp->prev_lsn.file, + (u_long)argp->prev_lsn.offset); + printf("\tfileid: %ld\n", (long)argp->fileid); + printf("\tmode: %ld\n", (long)argp->mode); + printf("\told_pgno: %lu\n", (u_long)argp->old_pgno); + printf("\tnew_pgno: %lu\n", (u_long)argp->new_pgno); + printf("\told_indx: %lu\n", (u_long)argp->old_indx); + printf("\tnew_indx: %lu\n", (u_long)argp->new_indx); + printf("\n"); + __os_free(argp, 0); + return (0); +} + +int +__ham_chgpg_read(dbenv, recbuf, argpp) + DB_ENV *dbenv; + void *recbuf; + __ham_chgpg_args **argpp; +{ + __ham_chgpg_args *argp; + u_int8_t *bp; + int ret; + + ret = __os_malloc(dbenv, sizeof(__ham_chgpg_args) + + sizeof(DB_TXN), NULL, &argp); + if (ret != 0) + return (ret); + argp->txnid = (DB_TXN *)&argp[1]; + bp = recbuf; + memcpy(&argp->type, bp, 
sizeof(argp->type)); + bp += sizeof(argp->type); + memcpy(&argp->txnid->txnid, bp, sizeof(argp->txnid->txnid)); + bp += sizeof(argp->txnid->txnid); + memcpy(&argp->prev_lsn, bp, sizeof(DB_LSN)); + bp += sizeof(DB_LSN); + memcpy(&argp->fileid, bp, sizeof(argp->fileid)); + bp += sizeof(argp->fileid); + memcpy(&argp->mode, bp, sizeof(argp->mode)); + bp += sizeof(argp->mode); + memcpy(&argp->old_pgno, bp, sizeof(argp->old_pgno)); + bp += sizeof(argp->old_pgno); + memcpy(&argp->new_pgno, bp, sizeof(argp->new_pgno)); + bp += sizeof(argp->new_pgno); + memcpy(&argp->old_indx, bp, sizeof(argp->old_indx)); + bp += sizeof(argp->old_indx); + memcpy(&argp->new_indx, bp, sizeof(argp->new_indx)); + bp += sizeof(argp->new_indx); + *argpp = argp; + return (0); +} + +int +__ham_init_print(dbenv) + DB_ENV *dbenv; +{ + int ret; + + if ((ret = __db_add_recovery(dbenv, + __ham_insdel_print, DB_ham_insdel)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_newpage_print, DB_ham_newpage)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_splitmeta_print, DB_ham_splitmeta)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_splitdata_print, DB_ham_splitdata)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_replace_print, DB_ham_replace)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_newpgno_print, DB_ham_newpgno)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_ovfl_print, DB_ham_ovfl)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_copypage_print, DB_ham_copypage)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_metagroup_print, DB_ham_metagroup)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_groupalloc1_print, DB_ham_groupalloc1)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_groupalloc2_print, DB_ham_groupalloc2)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_groupalloc_print, DB_ham_groupalloc)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_curadj_print, DB_ham_curadj)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_chgpg_print, DB_ham_chgpg)) != 0) + return (ret); + return (0); +} + +int +__ham_init_recover(dbenv) + DB_ENV *dbenv; +{ + int ret; + + if ((ret = __db_add_recovery(dbenv, + __ham_insdel_recover, DB_ham_insdel)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_newpage_recover, DB_ham_newpage)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __deprecated_recover, DB_ham_splitmeta)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_splitdata_recover, DB_ham_splitdata)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_replace_recover, DB_ham_replace)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __deprecated_recover, DB_ham_newpgno)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __deprecated_recover, DB_ham_ovfl)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_copypage_recover, DB_ham_copypage)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_metagroup_recover, DB_ham_metagroup)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __deprecated_recover, DB_ham_groupalloc1)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __deprecated_recover, DB_ham_groupalloc2)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_groupalloc_recover, DB_ham_groupalloc)) != 0) + 
return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_curadj_recover, DB_ham_curadj)) != 0) + return (ret); + if ((ret = __db_add_recovery(dbenv, + __ham_chgpg_recover, DB_ham_chgpg)) != 0) + return (ret); + return (0); +} + diff --git a/bdb/hash/hash_conv.c b/bdb/hash/hash_conv.c new file mode 100644 index 00000000000..30d17a6164d --- /dev/null +++ b/bdb/hash/hash_conv.c @@ -0,0 +1,112 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: hash_conv.c,v 11.5 2000/03/31 00:30:32 ubell Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_swap.h" +#include "hash.h" + +/* + * __ham_pgin -- + * Convert host-specific page layout from the host-independent format + * stored on disk. + * + * PUBLIC: int __ham_pgin __P((DB_ENV *, db_pgno_t, void *, DBT *)); + */ +int +__ham_pgin(dbenv, pg, pp, cookie) + DB_ENV *dbenv; + db_pgno_t pg; + void *pp; + DBT *cookie; +{ + DB_PGINFO *pginfo; + PAGE *h; + + h = pp; + pginfo = (DB_PGINFO *)cookie->data; + + /* + * The hash access method does blind reads of pages, causing them + * to be created. If the type field isn't set it's one of them, + * initialize the rest of the page and return. + */ + if (h->type != P_HASHMETA && h->pgno == PGNO_INVALID) { + P_INIT(pp, pginfo->db_pagesize, + pg, PGNO_INVALID, PGNO_INVALID, 0, P_HASH); + return (0); + } + + if (!pginfo->needswap) + return (0); + + return (h->type == P_HASHMETA ? __ham_mswap(pp) : + __db_byteswap(dbenv, pg, pp, pginfo->db_pagesize, 1)); +} + +/* + * __ham_pgout -- + * Convert host-specific page layout to the host-independent format + * stored on disk. + * + * PUBLIC: int __ham_pgout __P((DB_ENV *, db_pgno_t, void *, DBT *)); + */ +int +__ham_pgout(dbenv, pg, pp, cookie) + DB_ENV *dbenv; + db_pgno_t pg; + void *pp; + DBT *cookie; +{ + DB_PGINFO *pginfo; + PAGE *h; + + pginfo = (DB_PGINFO *)cookie->data; + if (!pginfo->needswap) + return (0); + + h = pp; + return (h->type == P_HASHMETA ? __ham_mswap(pp) : + __db_byteswap(dbenv, pg, pp, pginfo->db_pagesize, 0)); +} + +/* + * __ham_mswap -- + * Swap the bytes on the hash metadata page. + * + * PUBLIC: int __ham_mswap __P((void *)); + */ +int +__ham_mswap(pg) + void *pg; +{ + u_int8_t *p; + int i; + + __db_metaswap(pg); + + p = (u_int8_t *)pg + sizeof(DBMETA); + + SWAP32(p); /* max_bucket */ + SWAP32(p); /* high_mask */ + SWAP32(p); /* low_mask */ + SWAP32(p); /* ffactor */ + SWAP32(p); /* nelem */ + SWAP32(p); /* h_charkey */ + for (i = 0; i < NCACHED; ++i) + SWAP32(p); /* spares */ + return (0); +} diff --git a/bdb/hash/hash_dup.c b/bdb/hash/hash_dup.c new file mode 100644 index 00000000000..f5fbf4f472f --- /dev/null +++ b/bdb/hash/hash_dup.c @@ -0,0 +1,805 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: hash_dup.c,v 11.49 2000/12/21 21:54:35 margo Exp $"; +#endif /* not lint */ + +/* + * PACKAGE: hashing + * + * DESCRIPTION: + * Manipulation of duplicates for the hash package. + * + * ROUTINES: + * + * External + * __add_dup + * Internal + */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "hash.h" +#include "btree.h" +#include "txn.h" + +static int __ham_check_move __P((DBC *, u_int32_t)); +static int __ham_dcursor __P((DBC *, db_pgno_t, u_int32_t)); + +/* + * Called from hash_access to add a duplicate key. nval is the new + * value that we want to add. The flags correspond to the flag values + * to cursor_put indicating where to add the new element. + * There are 4 cases. + * Case 1: The existing duplicate set already resides on a separate page. + * We return and let the common code handle this. + * Case 2: The element is small enough to just be added to the existing set. + * Case 3: The element is large enough to be a big item, so we're going to + * have to push the set onto a new page. + * Case 4: The element is large enough to push the duplicate set onto a + * separate page. + * + * PUBLIC: int __ham_add_dup __P((DBC *, DBT *, u_int32_t, db_pgno_t *)); + */ +int +__ham_add_dup(dbc, nval, flags, pgnop) + DBC *dbc; + DBT *nval; + u_int32_t flags; + db_pgno_t *pgnop; +{ + DB *dbp; + HASH_CURSOR *hcp; + DBT pval, tmp_val; + u_int32_t add_bytes, new_size; + int cmp, ret; + u_int8_t *hk; + + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; + + DB_ASSERT(flags != DB_CURRENT); + + add_bytes = nval->size + + (F_ISSET(nval, DB_DBT_PARTIAL) ? nval->doff : 0); + add_bytes = DUP_SIZE(add_bytes); + + if ((ret = __ham_check_move(dbc, add_bytes)) != 0) + return (ret); + + /* + * Check if resulting duplicate set is going to need to go + * onto a separate duplicate page. If so, convert the + * duplicate set and add the new one. After conversion, + * hcp->dndx is the first free ndx or the index of the + * current pointer into the duplicate set. 
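+ * + * For reference, an on-page duplicate set (H_DUPLICATE) is a byte string + * of the form + * + * <len1><data1><len1><len2><data2><len2>... + * + * where each length is a db_indx_t, so an element of len bytes consumes + * DUP_SIZE(len), i.e. len + 2 * sizeof(db_indx_t) bytes; keeping the + * length on both sides of the data lets the set be walked in either + * direction.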
+ */ + hk = H_PAIRDATA(hcp->page, hcp->indx); + /* Add the len bytes to the current singleton. */ + if (HPAGE_PTYPE(hk) != H_DUPLICATE) + add_bytes += DUP_SIZE(0); + new_size = + LEN_HKEYDATA(hcp->page, dbp->pgsize, H_DATAINDEX(hcp->indx)) + + add_bytes; + + /* + * We convert to off-page duplicates if the item is a big item, + * the addition of the new item will make the set large, or + * if there isn't enough room on this page to add the next item. + */ + if (HPAGE_PTYPE(hk) != H_OFFDUP && + (HPAGE_PTYPE(hk) == H_OFFPAGE || ISBIG(hcp, new_size) || + add_bytes > P_FREESPACE(hcp->page))) { + + if ((ret = __ham_dup_convert(dbc)) != 0) + return (ret); + return (hcp->opd->c_am_put(hcp->opd, + NULL, nval, flags, NULL)); + } + + /* There are two separate cases here: on page and off page. */ + if (HPAGE_PTYPE(hk) != H_OFFDUP) { + if (HPAGE_PTYPE(hk) != H_DUPLICATE) { + pval.flags = 0; + pval.data = HKEYDATA_DATA(hk); + pval.size = LEN_HDATA(hcp->page, dbp->pgsize, + hcp->indx); + if ((ret = __ham_make_dup(dbp->dbenv, + &pval, &tmp_val, &dbc->rdata.data, + &dbc->rdata.ulen)) != 0 || (ret = + __ham_replpair(dbc, &tmp_val, 1)) != 0) + return (ret); + hk = H_PAIRDATA(hcp->page, hcp->indx); + HPAGE_PTYPE(hk) = H_DUPLICATE; + + /* + * Update the cursor position since we now are in + * duplicates. + */ + F_SET(hcp, H_ISDUP); + hcp->dup_off = 0; + hcp->dup_len = pval.size; + hcp->dup_tlen = DUP_SIZE(hcp->dup_len); + } + + /* Now make the new entry a duplicate. */ + if ((ret = __ham_make_dup(dbp->dbenv, nval, + &tmp_val, &dbc->rdata.data, &dbc->rdata.ulen)) != 0) + return (ret); + + tmp_val.dlen = 0; + switch (flags) { /* On page. */ + case DB_KEYFIRST: + case DB_KEYLAST: + case DB_NODUPDATA: + if (dbp->dup_compare != NULL) { + __ham_dsearch(dbc, nval, &tmp_val.doff, &cmp); + + /* dup dups are not supported w/ sorted dups */ + if (cmp == 0) + return (__db_duperr(dbp, flags)); + } else { + hcp->dup_tlen = LEN_HDATA(hcp->page, + dbp->pgsize, hcp->indx); + hcp->dup_len = nval->size; + F_SET(hcp, H_ISDUP); + if (flags == DB_KEYFIRST) + hcp->dup_off = tmp_val.doff = 0; + else + hcp->dup_off = + tmp_val.doff = hcp->dup_tlen; + } + break; + case DB_BEFORE: + tmp_val.doff = hcp->dup_off; + break; + case DB_AFTER: + tmp_val.doff = hcp->dup_off + DUP_SIZE(hcp->dup_len); + break; + } + /* Add the duplicate. */ + ret = __ham_replpair(dbc, &tmp_val, 0); + if (ret == 0) + ret = memp_fset(dbp->mpf, hcp->page, DB_MPOOL_DIRTY); + + if (ret != 0) + return (ret); + + /* Now, update the cursor if necessary. */ + switch (flags) { + case DB_AFTER: + hcp->dup_off += DUP_SIZE(hcp->dup_len); + hcp->dup_len = nval->size; + hcp->dup_tlen += DUP_SIZE(nval->size); + break; + case DB_KEYFIRST: + case DB_KEYLAST: + case DB_BEFORE: + hcp->dup_tlen += DUP_SIZE(nval->size); + hcp->dup_len = nval->size; + break; + } + ret = __ham_c_update(dbc, tmp_val.size, 1, 1); + return (ret); + } + + /* + * If we get here, then we're on duplicate pages; set pgnop and + * return so the common code can handle it. + */ + memcpy(pgnop, + HOFFDUP_PGNO(H_PAIRDATA(hcp->page, hcp->indx)), sizeof(db_pgno_t)); + + return (ret); +} + +/* + * Convert an on-page set of duplicates to an offpage set of duplicates. 
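+ * The original entry may be a plain H_KEYDATA item, an H_OFFPAGE + * reference or an existing on-page H_DUPLICATE set; in each case its + * contents are reinserted on a newly allocated duplicate page (P_LDUP, + * or P_LRECNO when no duplicate comparison function is set) and the + * on-page entry is then replaced by an H_OFFDUP pointing at that page.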
+ * + * PUBLIC: int __ham_dup_convert __P((DBC *)); + */ +int +__ham_dup_convert(dbc) + DBC *dbc; +{ + DB *dbp; + DBC **hcs; + DB_LSN lsn; + PAGE *dp; + HASH_CURSOR *hcp; + BOVERFLOW bo; + DBT dbt; + HOFFPAGE ho; + db_indx_t i, len, off; + int c, ret, t_ret; + u_int8_t *p, *pend; + + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; + + /* + * Create a new page for the duplicates. + */ + if ((ret = __db_new(dbc, + dbp->dup_compare == NULL ? P_LRECNO : P_LDUP, &dp)) != 0) + return (ret); + P_INIT(dp, dbp->pgsize, + dp->pgno, PGNO_INVALID, PGNO_INVALID, LEAFLEVEL, TYPE(dp)); + + /* + * Get the list of cursors that may need to be updated. + */ + if ((ret = __ham_get_clist(dbp, + PGNO(hcp->page), (u_int32_t)hcp->indx, &hcs)) != 0) + return (ret); + + /* + * Now put the duplicates onto the new page. + */ + dbt.flags = 0; + switch (HPAGE_PTYPE(H_PAIRDATA(hcp->page, hcp->indx))) { + case H_KEYDATA: + /* Simple case, one key on page; move it to dup page. */ + dbt.size = LEN_HDATA(hcp->page, dbp->pgsize, hcp->indx); + dbt.data = HKEYDATA_DATA(H_PAIRDATA(hcp->page, hcp->indx)); + ret = __db_pitem(dbc, + dp, 0, BKEYDATA_SIZE(dbt.size), NULL, &dbt); + goto finish; + case H_OFFPAGE: + /* Simple case, one key on page; move it to dup page. */ + memcpy(&ho, + P_ENTRY(hcp->page, H_DATAINDEX(hcp->indx)), HOFFPAGE_SIZE); + UMRW_SET(bo.unused1); + B_TSET(bo.type, ho.type, 0); + UMRW_SET(bo.unused2); + bo.pgno = ho.pgno; + bo.tlen = ho.tlen; + dbt.size = BOVERFLOW_SIZE; + dbt.data = &bo; + + ret = __db_pitem(dbc, dp, 0, dbt.size, &dbt, NULL); + +finish: if (ret == 0) { + memp_fset(dbp->mpf, dp, DB_MPOOL_DIRTY); + /* + * Update any other cursors + */ + if (hcs != NULL && DB_LOGGING(dbc) + && IS_SUBTRANSACTION(dbc->txn)) { + if ((ret = __ham_chgpg_log(dbp->dbenv, + dbc->txn, &lsn, 0, dbp->log_fileid, + DB_HAM_DUP, PGNO(hcp->page), + PGNO(dp), hcp->indx, 0)) != 0) + break; + } + for (c = 0; hcs != NULL && hcs[c] != NULL; c++) + if ((ret = __ham_dcursor(hcs[c], + PGNO(dp), 0)) != 0) + break; + + } + break; + + case H_DUPLICATE: + p = HKEYDATA_DATA(H_PAIRDATA(hcp->page, hcp->indx)); + pend = p + + LEN_HDATA(hcp->page, dbp->pgsize, hcp->indx); + + /* + * We need to maintain the duplicate cursor position. + * Keep track of where we are in the duplicate set via + * the offset, and when it matches the one in the cursor, + * set the off-page duplicate cursor index to the current + * index. + */ + for (off = 0, i = 0; p < pend; i++) { + memcpy(&len, p, sizeof(db_indx_t)); + dbt.size = len; + p += sizeof(db_indx_t); + dbt.data = p; + p += len + sizeof(db_indx_t); + if ((ret = __db_pitem(dbc, dp, + i, BKEYDATA_SIZE(dbt.size), NULL, &dbt)) != 0) + break; + /* + * Update any other cursors + */ + for (c = 0; hcs != NULL && hcs[c] != NULL; c++) + if (((HASH_CURSOR *)(hcs[c]->internal))->dup_off + == off && (ret = __ham_dcursor(hcs[c], + PGNO(dp), i)) != 0) + goto out; + off += len + 2 * sizeof(db_indx_t); + } +out: break; + + default: + ret = __db_pgfmt(dbp, (u_long)hcp->pgno); + break; + } + if (ret == 0) { + /* + * Now attach this to the source page in place of + * the old duplicate item. 
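+ * __ham_move_offpage overwrites the on-page entry with an H_OFFDUP + * record referencing PGNO(dp), logging the replacement first when + * logging is enabled.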
+ */ + __ham_move_offpage(dbc, hcp->page, + (u_int32_t)H_DATAINDEX(hcp->indx), PGNO(dp)); + + ret = memp_fset(dbp->mpf, hcp->page, DB_MPOOL_DIRTY); + if ((t_ret = memp_fput(dbp->mpf, dp, DB_MPOOL_DIRTY)) != 0) + ret = t_ret; + hcp->dup_tlen = hcp->dup_off = hcp->dup_len = 0; + } else + (void)__db_free(dbc, dp); + + if (hcs != NULL) + __os_free(hcs, 0); + + return (ret); +} + +/* + * __ham_make_dup + * + * Take a regular dbt and make it into a duplicate item with all the partial + * information set appropriately. If the incoming dbt is a partial, assume + * we are creating a new entry and make sure that we do any initial padding. + * + * PUBLIC: int __ham_make_dup __P((DB_ENV *, + * PUBLIC: const DBT *, DBT *d, void **, u_int32_t *)); + */ +int +__ham_make_dup(dbenv, notdup, duplicate, bufp, sizep) + DB_ENV *dbenv; + const DBT *notdup; + DBT *duplicate; + void **bufp; + u_int32_t *sizep; +{ + db_indx_t tsize, item_size; + int ret; + u_int8_t *p; + + item_size = (db_indx_t)notdup->size; + if (F_ISSET(notdup, DB_DBT_PARTIAL)) + item_size += notdup->doff; + + tsize = DUP_SIZE(item_size); + if ((ret = __ham_init_dbt(dbenv, duplicate, tsize, bufp, sizep)) != 0) + return (ret); + + duplicate->dlen = 0; + duplicate->flags = notdup->flags; + F_SET(duplicate, DB_DBT_PARTIAL); + + p = duplicate->data; + memcpy(p, &item_size, sizeof(db_indx_t)); + p += sizeof(db_indx_t); + if (F_ISSET(notdup, DB_DBT_PARTIAL)) { + memset(p, 0, notdup->doff); + p += notdup->doff; + } + memcpy(p, notdup->data, notdup->size); + p += notdup->size; + memcpy(p, &item_size, sizeof(db_indx_t)); + + duplicate->doff = 0; + duplicate->dlen = notdup->size; + + return (0); +} + +/* + * __ham_check_move -- + * + * Check if we can do whatever we need to on this page. If not, + * then we'll have to move the current element to a new page. + */ +static int +__ham_check_move(dbc, add_len) + DBC *dbc; + u_int32_t add_len; +{ + DB *dbp; + HASH_CURSOR *hcp; + DBT k, d; + DB_LSN new_lsn; + PAGE *next_pagep; + db_pgno_t next_pgno; + u_int32_t new_datalen, old_len, rectype; + u_int8_t *hk; + int ret; + + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; + + hk = H_PAIRDATA(hcp->page, hcp->indx); + + /* + * If the item is already off page duplicates or an offpage item, + * then we know we can do whatever we need to do in-place + */ + if (HPAGE_PTYPE(hk) == H_OFFDUP || HPAGE_PTYPE(hk) == H_OFFPAGE) + return (0); + + old_len = LEN_HITEM(hcp->page, dbp->pgsize, H_DATAINDEX(hcp->indx)); + new_datalen = old_len - HKEYDATA_SIZE(0) + add_len; + if (HPAGE_PTYPE(hk) != H_DUPLICATE) + new_datalen += DUP_SIZE(0); + + /* + * We need to add a new page under two conditions: + * 1. The addition makes the total data length cross the BIG + * threshold and the OFFDUP structure won't fit on this page. + * 2. The addition does not make the total data cross the + * threshold, but the new data won't fit on the page. + * If neither of these is true, then we can return. + */ + if (ISBIG(hcp, new_datalen) && (old_len > HOFFDUP_SIZE || + HOFFDUP_SIZE - old_len <= P_FREESPACE(hcp->page))) + return (0); + + if (!ISBIG(hcp, new_datalen) && add_len <= P_FREESPACE(hcp->page)) + return (0); + + /* + * If we get here, then we need to move the item to a new page. + * Check if there are more pages in the chain. We now need to + * update new_datalen to include the size of both the key and + * the data that we need to move. + */ + + new_datalen = ISBIG(hcp, new_datalen) ? 
+ HOFFDUP_SIZE : HKEYDATA_SIZE(new_datalen); + new_datalen += LEN_HITEM(hcp->page, dbp->pgsize, H_KEYINDEX(hcp->indx)); + + next_pagep = NULL; + for (next_pgno = NEXT_PGNO(hcp->page); next_pgno != PGNO_INVALID; + next_pgno = NEXT_PGNO(next_pagep)) { + if (next_pagep != NULL && + (ret = memp_fput(dbp->mpf, next_pagep, 0)) != 0) + return (ret); + + if ((ret = memp_fget(dbp->mpf, + &next_pgno, DB_MPOOL_CREATE, &next_pagep)) != 0) + return (ret); + + if (P_FREESPACE(next_pagep) >= new_datalen) + break; + } + + /* No more pages, add one. */ + if (next_pagep == NULL && (ret = __ham_add_ovflpage(dbc, + hcp->page, 0, &next_pagep)) != 0) + return (ret); + + /* Add new page at the end of the chain. */ + if (P_FREESPACE(next_pagep) < new_datalen && (ret = + __ham_add_ovflpage(dbc, next_pagep, 1, &next_pagep)) != 0) { + (void)memp_fput(dbp->mpf, next_pagep, 0); + return (ret); + } + + /* Copy the item to the new page. */ + if (DB_LOGGING(dbc)) { + rectype = PUTPAIR; + k.flags = 0; + d.flags = 0; + if (HPAGE_PTYPE( + H_PAIRKEY(hcp->page, hcp->indx)) == H_OFFPAGE) { + rectype |= PAIR_KEYMASK; + k.data = H_PAIRKEY(hcp->page, hcp->indx); + k.size = HOFFPAGE_SIZE; + } else { + k.data = + HKEYDATA_DATA(H_PAIRKEY(hcp->page, hcp->indx)); + k.size = LEN_HKEY(hcp->page, dbp->pgsize, hcp->indx); + } + + if (HPAGE_PTYPE(hk) == H_OFFPAGE) { + rectype |= PAIR_DATAMASK; + d.data = H_PAIRDATA(hcp->page, hcp->indx); + d.size = HOFFPAGE_SIZE; + } else { + if (HPAGE_PTYPE(H_PAIRDATA(hcp->page, hcp->indx)) + == H_DUPLICATE) + rectype |= PAIR_DUPMASK; + d.data = + HKEYDATA_DATA(H_PAIRDATA(hcp->page, hcp->indx)); + d.size = LEN_HDATA(hcp->page, dbp->pgsize, hcp->indx); + } + + if ((ret = __ham_insdel_log(dbp->dbenv, + dbc->txn, &new_lsn, 0, rectype, + dbp->log_fileid, PGNO(next_pagep), + (u_int32_t)NUM_ENT(next_pagep), &LSN(next_pagep), + &k, &d)) != 0) { + (void)memp_fput(dbp->mpf, next_pagep, 0); + return (ret); + } + + /* Move lsn onto page. */ + LSN(next_pagep) = new_lsn; /* Structure assignment. */ + } + + __ham_copy_item(dbp->pgsize, + hcp->page, H_KEYINDEX(hcp->indx), next_pagep); + __ham_copy_item(dbp->pgsize, + hcp->page, H_DATAINDEX(hcp->indx), next_pagep); + + /* + * We've just manually inserted a key and set of data onto + * next_pagep; however, it's possible that our caller will + * return without further modifying the new page, for instance + * if DB_NODUPDATA is set and our new item is a duplicate duplicate. + * Thus, to be on the safe side, we need to mark the page dirty + * here. [#2996] + * + * Note that __ham_del_pair should dirty the page we're moving + * the items from, so we need only dirty the new page ourselves. + */ + if ((ret = memp_fset(dbp->mpf, next_pagep, DB_MPOOL_DIRTY)) != 0) + goto out; + + /* Update all cursors that used to point to this item. */ + if ((ret = __ham_c_chgpg(dbc, PGNO(hcp->page), H_KEYINDEX(hcp->indx), + PGNO(next_pagep), NUM_ENT(next_pagep) - 2)) != 0) + goto out; + + /* Now delete the pair from the current page. */ + ret = __ham_del_pair(dbc, 0); + + /* + * __ham_del_pair decremented nelem. This is incorrect; we + * manually copied the element elsewhere, so the total number + * of elements hasn't changed. Increment it again. 
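+ * (As with the decrement in __ham_del_pair, nelem is only maintained + * here when we are not using standard locking, since updating it would + * otherwise make the meta-data page a hot spot.)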
+ */ + if (!STD_LOCKING(dbc)) + hcp->hdr->nelem++; + +out: + (void)memp_fput(dbp->mpf, hcp->page, DB_MPOOL_DIRTY); + hcp->page = next_pagep; + hcp->pgno = PGNO(hcp->page); + hcp->indx = NUM_ENT(hcp->page) - 2; + F_SET(hcp, H_EXPAND); + F_CLR(hcp, H_DELETED); + + return (ret); +} + +/* + * __ham_move_offpage -- + * Replace an onpage set of duplicates with the OFFDUP structure + * that references the duplicate page. + * + * XXX + * This is really just a special case of __onpage_replace; we should + * probably combine them. + * + * PUBLIC: void __ham_move_offpage __P((DBC *, PAGE *, u_int32_t, db_pgno_t)); + */ +void +__ham_move_offpage(dbc, pagep, ndx, pgno) + DBC *dbc; + PAGE *pagep; + u_int32_t ndx; + db_pgno_t pgno; +{ + DB *dbp; + HASH_CURSOR *hcp; + DBT new_dbt; + DBT old_dbt; + HOFFDUP od; + db_indx_t i; + int32_t shrink; + u_int8_t *src; + + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; + od.type = H_OFFDUP; + UMRW_SET(od.unused[0]); + UMRW_SET(od.unused[1]); + UMRW_SET(od.unused[2]); + od.pgno = pgno; + + if (DB_LOGGING(dbc)) { + new_dbt.data = &od; + new_dbt.size = HOFFDUP_SIZE; + old_dbt.data = P_ENTRY(pagep, ndx); + old_dbt.size = LEN_HITEM(pagep, dbp->pgsize, ndx); + (void)__ham_replace_log(dbp->dbenv, + dbc->txn, &LSN(pagep), 0, dbp->log_fileid, + PGNO(pagep), (u_int32_t)ndx, &LSN(pagep), -1, + &old_dbt, &new_dbt, 0); + } + + shrink = LEN_HITEM(pagep, dbp->pgsize, ndx) - HOFFDUP_SIZE; + + if (shrink != 0) { + /* Copy data. */ + src = (u_int8_t *)(pagep) + HOFFSET(pagep); + memmove(src + shrink, src, pagep->inp[ndx] - HOFFSET(pagep)); + HOFFSET(pagep) += shrink; + + /* Update index table. */ + for (i = ndx; i < NUM_ENT(pagep); i++) + pagep->inp[i] += shrink; + } + + /* Now copy the offdup entry onto the page. */ + memcpy(P_ENTRY(pagep, ndx), &od, HOFFDUP_SIZE); +} + +/* + * __ham_dsearch: + * Locate a particular duplicate in a duplicate set. Make sure that + * we exit with the cursor set appropriately. + * + * PUBLIC: void __ham_dsearch __P((DBC *, DBT *, u_int32_t *, int *)); + */ +void +__ham_dsearch(dbc, dbt, offp, cmpp) + DBC *dbc; + DBT *dbt; + u_int32_t *offp; + int *cmpp; +{ + DB *dbp; + HASH_CURSOR *hcp; + DBT cur; + db_indx_t i, len; + int (*func) __P((DB *, const DBT *, const DBT *)); + u_int8_t *data; + + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; + if (dbp->dup_compare == NULL) + func = __bam_defcmp; + else + func = dbp->dup_compare; + + i = F_ISSET(hcp, H_CONTINUE) ? hcp->dup_off: 0; + data = HKEYDATA_DATA(H_PAIRDATA(hcp->page, hcp->indx)) + i; + hcp->dup_tlen = LEN_HDATA(hcp->page, dbp->pgsize, hcp->indx); + while (i < hcp->dup_tlen) { + memcpy(&len, data, sizeof(db_indx_t)); + data += sizeof(db_indx_t); + cur.data = data; + cur.size = (u_int32_t)len; + *cmpp = func(dbp, dbt, &cur); + if (*cmpp == 0 || (*cmpp < 0 && dbp->dup_compare != NULL)) + break; + i += len + 2 * sizeof(db_indx_t); + data += len + sizeof(db_indx_t); + } + *offp = i; + hcp->dup_off = i; + hcp->dup_len = len; + F_SET(hcp, H_ISDUP); +} + +#ifdef DEBUG +/* + * __ham_cprint -- + * Display the current cursor list. 
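+ * Each active cursor is written to stderr, one per line, as + * + * <dbc>-><internal>: page: <pgno> index: <indx> [ (deleted)]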
+ * + * PUBLIC: int __ham_cprint __P((DB *)); + */ +int +__ham_cprint(dbp) + DB *dbp; +{ + HASH_CURSOR *cp; + DBC *dbc; + + MUTEX_THREAD_LOCK(dbp->dbenv, dbp->mutexp); + for (dbc = TAILQ_FIRST(&dbp->active_queue); + dbc != NULL; dbc = TAILQ_NEXT(dbc, links)) { + cp = (HASH_CURSOR *)dbc->internal; + fprintf(stderr, "%#0lx->%#0lx: page: %lu index: %lu", + P_TO_ULONG(dbc), P_TO_ULONG(cp), (u_long)cp->pgno, + (u_long)cp->indx); + if (F_ISSET(cp, H_DELETED)) + fprintf(stderr, " (deleted)"); + fprintf(stderr, "\n"); + } + MUTEX_THREAD_UNLOCK(dbp->dbenv, dbp->mutexp); + + return (0); +} +#endif /* DEBUG */ + +/* + * __ham_dcursor -- + * + * Create an off page duplicate cursor for this cursor. + */ +static int +__ham_dcursor(dbc, pgno, indx) + DBC *dbc; + db_pgno_t pgno; + u_int32_t indx; +{ + DB *dbp; + DBC *dbc_nopd; + HASH_CURSOR *hcp; + BTREE_CURSOR *dcp; + int ret; + + dbp = dbc->dbp; + + if ((ret = __db_c_newopd(dbc, pgno, &dbc_nopd)) != 0) + return (ret); + + dcp = (BTREE_CURSOR *)dbc_nopd->internal; + dcp->pgno = pgno; + dcp->indx = indx; + + if (dbp->dup_compare == NULL) { + /* + * Converting to off-page Recno trees is tricky. The + * record number for the cursor is the index + 1 (to + * convert to 1-based record numbers). + */ + dcp->recno = indx + 1; + } + + /* + * Transfer the deleted flag from the top-level cursor to the + * created one. + */ + hcp = (HASH_CURSOR *)dbc->internal; + if (F_ISSET(hcp, H_DELETED)) { + F_SET(dcp, C_DELETED); + F_CLR(hcp, H_DELETED); + } + + /* Stack the cursors and reset the initial cursor's index. */ + hcp->opd = dbc_nopd; + + return (0); +} diff --git a/bdb/hash/hash_func.c b/bdb/hash/hash_func.c new file mode 100644 index 00000000000..22b4f08ee70 --- /dev/null +++ b/bdb/hash/hash_func.c @@ -0,0 +1,242 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993 + * Margo Seltzer. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: hash_func.c,v 11.7 2000/08/16 18:26:19 ubell Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "hash.h" + +/* + * __ham_func2 -- + * Phong Vo's linear congruential hash. + * + * PUBLIC: u_int32_t __ham_func2 __P((DB *, const void *, u_int32_t)); + */ +#define DCHARHASH(h, c) ((h) = 0x63c63cd9*(h) + 0x9c39c33d + (c)) + +u_int32_t +__ham_func2(dbp, key, len) + DB *dbp; + const void *key; + u_int32_t len; +{ + const u_int8_t *e, *k; + u_int32_t h; + u_int8_t c; + + if (dbp != NULL) + COMPQUIET(dbp, NULL); + + k = key; + e = k + len; + for (h = 0; k != e;) { + c = *k++; + if (!c && k > e) + break; + DCHARHASH(h, c); + } + return (h); +} + +/* + * __ham_func3 -- + * Ozan Yigit's original sdbm hash. + * + * Ugly, but fast. Break the string up into 8 byte units. On the first time + * through the loop get the "leftover bytes" (strlen % 8). On every other + * iteration, perform 8 HASHC's so we handle all 8 bytes. Essentially, this + * saves us 7 cmp & branch instructions. + * + * PUBLIC: u_int32_t __ham_func3 __P((DB *, const void *, u_int32_t)); + */ +u_int32_t +__ham_func3(dbp, key, len) + DB *dbp; + const void *key; + u_int32_t len; +{ + const u_int8_t *k; + u_int32_t n, loop; + + if (dbp != NULL) + COMPQUIET(dbp, NULL); + + if (len == 0) + return (0); + +#define HASHC n = *k++ + 65599 * n + n = 0; + k = key; + + loop = (len + 8 - 1) >> 3; + switch (len & (8 - 1)) { + case 0: + do { + HASHC; + case 7: + HASHC; + case 6: + HASHC; + case 5: + HASHC; + case 4: + HASHC; + case 3: + HASHC; + case 2: + HASHC; + case 1: + HASHC; + } while (--loop); + } + return (n); +} + +/* + * __ham_func4 -- + * Chris Torek's hash function. Although this function performs only + * slightly worse than __ham_func5 on strings, it performs horribly on + * numbers. + * + * PUBLIC: u_int32_t __ham_func4 __P((DB *, const void *, u_int32_t)); + */ +u_int32_t +__ham_func4(dbp, key, len) + DB *dbp; + const void *key; + u_int32_t len; +{ + const u_int8_t *k; + u_int32_t h, loop; + + if (dbp != NULL) + COMPQUIET(dbp, NULL); + + if (len == 0) + return (0); + +#define HASH4a h = (h << 5) - h + *k++; +#define HASH4b h = (h << 5) + h + *k++; +#define HASH4 HASH4b + h = 0; + k = key; + + loop = (len + 8 - 1) >> 3; + switch (len & (8 - 1)) { + case 0: + do { + HASH4; + case 7: + HASH4; + case 6: + HASH4; + case 5: + HASH4; + case 4: + HASH4; + case 3: + HASH4; + case 2: + HASH4; + case 1: + HASH4; + } while (--loop); + } + return (h); +} + +/* + * Fowler/Noll/Vo hash + * + * The basis of the hash algorithm was taken from an idea sent by email to the + * IEEE Posix P1003.2 mailing list from Phong Vo (kpv@research.att.com) and + * Glenn Fowler (gsf@research.att.com). Landon Curt Noll (chongo@toad.com) + * later improved on their algorithm. 
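+ * + * Concretely, the implementation below starts the hash at 0 and, for + * each byte of the key, multiplies the running value by the prime + * 16777619 and then XORs the byte into it.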
+ * + * The magic is in the interesting relationship between the special prime + * 16777619 (2^24 + 403) and 2^32 and 2^8. + * + * This hash produces the fewest collisions of any function that we've seen so + * far, and works well on both numbers and strings. + * + * PUBLIC: u_int32_t __ham_func5 __P((DB *, const void *, u_int32_t)); + */ +u_int32_t +__ham_func5(dbp, key, len) + DB *dbp; + const void *key; + u_int32_t len; +{ + const u_int8_t *k, *e; + u_int32_t h; + + if (dbp != NULL) + COMPQUIET(dbp, NULL); + + k = key; + e = k + len; + for (h = 0; k < e; ++k) { + h *= 16777619; + h ^= *k; + } + return (h); +} + +u_int32_t +__ham_test(dbp, key, len) + DB *dbp; + const void *key; + u_int32_t len; +{ + COMPQUIET(dbp, NULL); + COMPQUIET(len, 0); + return ((u_int32_t)*(char *)key); +} diff --git a/bdb/hash/hash_meta.c b/bdb/hash/hash_meta.c new file mode 100644 index 00000000000..d96a6db3207 --- /dev/null +++ b/bdb/hash/hash_meta.c @@ -0,0 +1,121 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1999, 2000 + * Sleepycat Software. All rights reserved. + */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: hash_meta.c,v 11.10 2000/12/21 21:54:35 margo Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "hash.h" +#include "db_shash.h" +#include "lock.h" +#include "txn.h" + +/* + * Acquire the meta-data page. + * + * PUBLIC: int __ham_get_meta __P((DBC *)); + */ +int +__ham_get_meta(dbc) + DBC *dbc; +{ + HASH_CURSOR *hcp; + HASH *hashp; + DB *dbp; + int ret; + + hcp = (HASH_CURSOR *)dbc->internal; + dbp = dbc->dbp; + hashp = dbp->h_internal; + + if (dbp->dbenv != NULL && + STD_LOCKING(dbc) && !F_ISSET(dbc, DBC_RECOVER)) { + dbc->lock.pgno = hashp->meta_pgno; + if ((ret = lock_get(dbp->dbenv, dbc->locker, + DB_NONBLOCK(dbc) ? DB_LOCK_NOWAIT : 0, + &dbc->lock_dbt, DB_LOCK_READ, &hcp->hlock)) != 0) + return (ret); + } + + if ((ret = memp_fget(dbc->dbp->mpf, + &hashp->meta_pgno, DB_MPOOL_CREATE, &(hcp->hdr))) != 0 && + hcp->hlock.off != LOCK_INVALID) { + (void)lock_put(dbc->dbp->dbenv, &hcp->hlock); + hcp->hlock.off = LOCK_INVALID; + } + + return (ret); +} + +/* + * Release the meta-data page. + * + * PUBLIC: int __ham_release_meta __P((DBC *)); + */ +int +__ham_release_meta(dbc) + DBC *dbc; +{ + HASH_CURSOR *hcp; + + hcp = (HASH_CURSOR *)dbc->internal; + + if (hcp->hdr) + (void)memp_fput(dbc->dbp->mpf, hcp->hdr, + F_ISSET(hcp, H_DIRTY) ? DB_MPOOL_DIRTY : 0); + hcp->hdr = NULL; + if (!F_ISSET(dbc, DBC_RECOVER) && + dbc->txn == NULL && hcp->hlock.off != LOCK_INVALID) + (void)lock_put(dbc->dbp->dbenv, &hcp->hlock); + hcp->hlock.off = LOCK_INVALID; + F_CLR(hcp, H_DIRTY); + + return (0); +} + +/* + * Mark the meta-data page dirty. + * + * PUBLIC: int __ham_dirty_meta __P((DBC *)); + */ +int +__ham_dirty_meta(dbc) + DBC *dbc; +{ + DB *dbp; + DB_LOCK _tmp; + HASH *hashp; + HASH_CURSOR *hcp; + int ret; + + dbp = dbc->dbp; + hashp = dbp->h_internal; + hcp = (HASH_CURSOR *)dbc->internal; + + ret = 0; + if (STD_LOCKING(dbc) && !F_ISSET(dbc, DBC_RECOVER)) { + dbc->lock.pgno = hashp->meta_pgno; + if ((ret = lock_get(dbp->dbenv, dbc->locker, + DB_NONBLOCK(dbc) ? 
DB_LOCK_NOWAIT : 0, + &dbc->lock_dbt, DB_LOCK_WRITE, &_tmp)) == 0) { + ret = lock_put(dbp->dbenv, &hcp->hlock); + hcp->hlock = _tmp; + } + } + + if (ret == 0) + F_SET(hcp, H_DIRTY); + return (ret); +} diff --git a/bdb/hash/hash_method.c b/bdb/hash/hash_method.c new file mode 100644 index 00000000000..f8239993dc5 --- /dev/null +++ b/bdb/hash/hash_method.c @@ -0,0 +1,126 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1999, 2000 + * Sleepycat Software. All rights reserved. + */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: hash_method.c,v 11.7 2000/07/04 18:28:23 bostic Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "hash.h" + +static int __ham_set_h_ffactor __P((DB *, u_int32_t)); +static int __ham_set_h_hash + __P((DB *, u_int32_t(*)(DB *, const void *, u_int32_t))); +static int __ham_set_h_nelem __P((DB *, u_int32_t)); + +/* + * __ham_db_create -- + * Hash specific initialization of the DB structure. + * + * PUBLIC: int __ham_db_create __P((DB *)); + */ +int +__ham_db_create(dbp) + DB *dbp; +{ + HASH *hashp; + int ret; + + if ((ret = __os_malloc(dbp->dbenv, + sizeof(HASH), NULL, &dbp->h_internal)) != 0) + return (ret); + + hashp = dbp->h_internal; + + hashp->h_nelem = 0; /* Defaults. */ + hashp->h_ffactor = 0; + hashp->h_hash = NULL; + + dbp->set_h_ffactor = __ham_set_h_ffactor; + dbp->set_h_hash = __ham_set_h_hash; + dbp->set_h_nelem = __ham_set_h_nelem; + + return (0); +} + +/* + * PUBLIC: int __ham_db_close __P((DB *)); + */ +int +__ham_db_close(dbp) + DB *dbp; +{ + if (dbp->h_internal == NULL) + return (0); + __os_free(dbp->h_internal, sizeof(HASH)); + dbp->h_internal = NULL; + return (0); +} + +/* + * __ham_set_h_ffactor -- + * Set the fill factor. + */ +static int +__ham_set_h_ffactor(dbp, h_ffactor) + DB *dbp; + u_int32_t h_ffactor; +{ + HASH *hashp; + + DB_ILLEGAL_AFTER_OPEN(dbp, "set_h_ffactor"); + DB_ILLEGAL_METHOD(dbp, DB_OK_HASH); + + hashp = dbp->h_internal; + hashp->h_ffactor = h_ffactor; + return (0); +} + +/* + * __ham_set_h_hash -- + * Set the hash function. + */ +static int +__ham_set_h_hash(dbp, func) + DB *dbp; + u_int32_t (*func) __P((DB *, const void *, u_int32_t)); +{ + HASH *hashp; + + DB_ILLEGAL_AFTER_OPEN(dbp, "set_h_hash"); + DB_ILLEGAL_METHOD(dbp, DB_OK_HASH); + + hashp = dbp->h_internal; + hashp->h_hash = func; + return (0); +} + +/* + * __ham_set_h_nelem -- + * Set the table size. + */ +static int +__ham_set_h_nelem(dbp, h_nelem) + DB *dbp; + u_int32_t h_nelem; +{ + HASH *hashp; + + DB_ILLEGAL_AFTER_OPEN(dbp, "set_h_nelem"); + DB_ILLEGAL_METHOD(dbp, DB_OK_HASH); + + hashp = dbp->h_internal; + hashp->h_nelem = h_nelem; + return (0); +} diff --git a/bdb/hash/hash_page.c b/bdb/hash/hash_page.c new file mode 100644 index 00000000000..64f38853284 --- /dev/null +++ b/bdb/hash/hash_page.c @@ -0,0 +1,1655 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994 + * Margo Seltzer. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: hash_page.c,v 11.46 2001/01/11 18:19:51 bostic Exp $"; +#endif /* not lint */ + +/* + * PACKAGE: hashing + * + * DESCRIPTION: + * Page manipulation for hashing package. + * + * ROUTINES: + * + * External + * __get_page + * __add_ovflpage + * __overflow_page + * Internal + * open_temp + */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_shash.h" +#include "hash.h" +#include "lock.h" +#include "txn.h" + +/* + * PUBLIC: int __ham_item __P((DBC *, db_lockmode_t, db_pgno_t *)); + */ +int +__ham_item(dbc, mode, pgnop) + DBC *dbc; + db_lockmode_t mode; + db_pgno_t *pgnop; +{ + DB *dbp; + HASH_CURSOR *hcp; + db_pgno_t next_pgno; + int ret; + + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; + + if (F_ISSET(hcp, H_DELETED)) { + __db_err(dbp->dbenv, "Attempt to return a deleted item"); + return (EINVAL); + } + F_CLR(hcp, H_OK | H_NOMORE); + + /* Check if we need to get a page for this cursor. */ + if ((ret = __ham_get_cpage(dbc, mode)) != 0) + return (ret); + +recheck: + /* Check if we are looking for space in which to insert an item. */ + if (hcp->seek_size && hcp->seek_found_page == PGNO_INVALID + && hcp->seek_size < P_FREESPACE(hcp->page)) + hcp->seek_found_page = hcp->pgno; + + /* Check for off-page duplicates. */ + if (hcp->indx < NUM_ENT(hcp->page) && + HPAGE_TYPE(hcp->page, H_DATAINDEX(hcp->indx)) == H_OFFDUP) { + memcpy(pgnop, + HOFFDUP_PGNO(H_PAIRDATA(hcp->page, hcp->indx)), + sizeof(db_pgno_t)); + F_SET(hcp, H_OK); + return (0); + } + + /* Check if we need to go on to the next page. */ + if (F_ISSET(hcp, H_ISDUP)) + /* + * ISDUP is set, and offset is at the beginning of the datum. + * We need to grab the length of the datum, then set the datum + * pointer to be the beginning of the datum. 
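+ * (dup_off is the byte offset of the current element within the + * on-page duplicate set, and the db_indx_t stored at that offset is + * the element's length.)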
+ */ + memcpy(&hcp->dup_len, + HKEYDATA_DATA(H_PAIRDATA(hcp->page, hcp->indx)) + + hcp->dup_off, sizeof(db_indx_t)); + + if (hcp->indx >= (db_indx_t)NUM_ENT(hcp->page)) { + /* Fetch next page. */ + if (NEXT_PGNO(hcp->page) == PGNO_INVALID) { + F_SET(hcp, H_NOMORE); + return (DB_NOTFOUND); + } + next_pgno = NEXT_PGNO(hcp->page); + hcp->indx = 0; + if ((ret = __ham_next_cpage(dbc, next_pgno, 0)) != 0) + return (ret); + goto recheck; + } + + F_SET(hcp, H_OK); + return (0); +} + +/* + * PUBLIC: int __ham_item_reset __P((DBC *)); + */ +int +__ham_item_reset(dbc) + DBC *dbc; +{ + HASH_CURSOR *hcp; + DB *dbp; + int ret; + + ret = 0; + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; + if (hcp->page != NULL) + ret = memp_fput(dbp->mpf, hcp->page, 0); + + __ham_item_init(dbc); + return (ret); +} + +/* + * PUBLIC: void __ham_item_init __P((DBC *)); + */ +void +__ham_item_init(dbc) + DBC *dbc; +{ + HASH_CURSOR *hcp; + + hcp = (HASH_CURSOR *)dbc->internal; + /* + * If this cursor still holds any locks, we must + * release them if we are not running with transactions. + */ + if (hcp->lock.off != LOCK_INVALID && dbc->txn == NULL) + (void)lock_put(dbc->dbp->dbenv, &hcp->lock); + + /* + * The following fields must *not* be initialized here + * because they may have meaning across inits. + * hlock, hdr, split_buf, stats + */ + hcp->bucket = BUCKET_INVALID; + hcp->lbucket = BUCKET_INVALID; + hcp->lock.off = LOCK_INVALID; + hcp->lock_mode = DB_LOCK_NG; + hcp->dup_off = 0; + hcp->dup_len = 0; + hcp->dup_tlen = 0; + hcp->seek_size = 0; + hcp->seek_found_page = PGNO_INVALID; + hcp->flags = 0; + + hcp->pgno = PGNO_INVALID; + hcp->indx = NDX_INVALID; + hcp->page = NULL; +} + +/* + * Returns the last item in a bucket. + * + * PUBLIC: int __ham_item_last __P((DBC *, db_lockmode_t, db_pgno_t *)); + */ +int +__ham_item_last(dbc, mode, pgnop) + DBC *dbc; + db_lockmode_t mode; + db_pgno_t *pgnop; +{ + HASH_CURSOR *hcp; + int ret; + + hcp = (HASH_CURSOR *)dbc->internal; + if ((ret = __ham_item_reset(dbc)) != 0) + return (ret); + + hcp->bucket = hcp->hdr->max_bucket; + hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket); + F_SET(hcp, H_OK); + return (__ham_item_prev(dbc, mode, pgnop)); +} + +/* + * PUBLIC: int __ham_item_first __P((DBC *, db_lockmode_t, db_pgno_t *)); + */ +int +__ham_item_first(dbc, mode, pgnop) + DBC *dbc; + db_lockmode_t mode; + db_pgno_t *pgnop; +{ + HASH_CURSOR *hcp; + int ret; + + hcp = (HASH_CURSOR *)dbc->internal; + if ((ret = __ham_item_reset(dbc)) != 0) + return (ret); + F_SET(hcp, H_OK); + hcp->bucket = 0; + hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket); + return (__ham_item_next(dbc, mode, pgnop)); +} + +/* + * __ham_item_prev -- + * Returns a pointer to key/data pair on a page. In the case of + * bigkeys, just returns the page number and index of the bigkey + * pointer pair. + * + * PUBLIC: int __ham_item_prev __P((DBC *, db_lockmode_t, db_pgno_t *)); + */ +int +__ham_item_prev(dbc, mode, pgnop) + DBC *dbc; + db_lockmode_t mode; + db_pgno_t *pgnop; +{ + DB *dbp; + HASH_CURSOR *hcp; + db_pgno_t next_pgno; + int ret; + + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; + /* + * There are 5 cases for backing up in a hash file. + * Case 1: In the middle of a page, no duplicates, just dec the index. + * Case 2: In the middle of a duplicate set, back up one. + * Case 3: At the beginning of a duplicate set, get out of set and + * back up to next key. + * Case 4: At the beginning of a page; go to previous page. + * Case 5: At the beginning of a bucket; go to prev bucket. 
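+ * + * Note that every key/data pair occupies two slots in the page's index + * table, which is why stepping from one pair to the previous (or next) + * one always adjusts hcp->indx by 2.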
+ */ + F_CLR(hcp, H_OK | H_NOMORE | H_DELETED); + + if ((ret = __ham_get_cpage(dbc, mode)) != 0) + return (ret); + + /* + * First handle the duplicates. Either you'll get the key here + * or you'll exit the duplicate set and drop into the code below + * to handle backing up through keys. + */ + if (!F_ISSET(hcp, H_NEXT_NODUP) && F_ISSET(hcp, H_ISDUP)) { + if (HPAGE_TYPE(hcp->page, H_DATAINDEX(hcp->indx)) == H_OFFDUP) { + memcpy(pgnop, + HOFFDUP_PGNO(H_PAIRDATA(hcp->page, hcp->indx)), + sizeof(db_pgno_t)); + F_SET(hcp, H_OK); + return (0); + } + + /* Duplicates are on-page. */ + if (hcp->dup_off != 0) { + memcpy(&hcp->dup_len, HKEYDATA_DATA( + H_PAIRDATA(hcp->page, hcp->indx)) + + hcp->dup_off - sizeof(db_indx_t), + sizeof(db_indx_t)); + hcp->dup_off -= + DUP_SIZE(hcp->dup_len); + return (__ham_item(dbc, mode, pgnop)); + } + } + + /* + * If we get here, we are not in a duplicate set, and just need + * to back up the cursor. There are still three cases: + * midpage, beginning of page, beginning of bucket. + */ + + if (F_ISSET(hcp, H_DUPONLY)) { + F_CLR(hcp, H_OK); + F_SET(hcp, H_NOMORE); + return (0); + } else + /* + * We are no longer in a dup set; flag this so the dup code + * will reinitialize should we stumble upon another one. + */ + F_CLR(hcp, H_ISDUP); + + if (hcp->indx == 0) { /* Beginning of page. */ + hcp->pgno = PREV_PGNO(hcp->page); + if (hcp->pgno == PGNO_INVALID) { + /* Beginning of bucket. */ + F_SET(hcp, H_NOMORE); + return (DB_NOTFOUND); + } else if ((ret = + __ham_next_cpage(dbc, hcp->pgno, 0)) != 0) + return (ret); + else + hcp->indx = NUM_ENT(hcp->page); + } + + /* + * Either we've got the cursor set up to be decremented, or we + * have to find the end of a bucket. + */ + if (hcp->indx == NDX_INVALID) { + DB_ASSERT(hcp->page != NULL); + + hcp->indx = NUM_ENT(hcp->page); + for (next_pgno = NEXT_PGNO(hcp->page); + next_pgno != PGNO_INVALID; + next_pgno = NEXT_PGNO(hcp->page)) { + if ((ret = __ham_next_cpage(dbc, next_pgno, 0)) != 0) + return (ret); + hcp->indx = NUM_ENT(hcp->page); + } + + if (hcp->indx == 0) { + /* Bucket was empty. */ + F_SET(hcp, H_NOMORE); + return (DB_NOTFOUND); + } + } + + hcp->indx -= 2; + + return (__ham_item(dbc, mode, pgnop)); +} + +/* + * Sets the cursor to the next key/data pair on a page. + * + * PUBLIC: int __ham_item_next __P((DBC *, db_lockmode_t, db_pgno_t *)); + */ +int +__ham_item_next(dbc, mode, pgnop) + DBC *dbc; + db_lockmode_t mode; + db_pgno_t *pgnop; +{ + HASH_CURSOR *hcp; + int ret; + + hcp = (HASH_CURSOR *)dbc->internal; + + if ((ret = __ham_get_cpage(dbc, mode)) != 0) + return (ret); + + /* + * Deleted on-page duplicates are a weird case. If we delete the last + * one, then our cursor is at the very end of a duplicate set and + * we actually need to go on to the next key. 
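+ * (The dup_tlen == dup_off comparison below detects that end-of-set + * case; the other H_DELETED branches can generally leave the index + * alone because the delete left the cursor positioned on the entry + * that now occupies the old slot.)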
+ */ + if (F_ISSET(hcp, H_DELETED)) { + if (hcp->indx != NDX_INVALID && + F_ISSET(hcp, H_ISDUP) && + HPAGE_TYPE(hcp->page, H_DATAINDEX(hcp->indx)) + == H_DUPLICATE && hcp->dup_tlen == hcp->dup_off) { + if (F_ISSET(hcp, H_DUPONLY)) { + F_CLR(hcp, H_OK); + F_SET(hcp, H_NOMORE); + return (0); + } else { + F_CLR(hcp, H_ISDUP); + hcp->indx += 2; + } + } else if (!F_ISSET(hcp, H_ISDUP) && F_ISSET(hcp, H_DUPONLY)) { + F_CLR(hcp, H_OK); + F_SET(hcp, H_NOMORE); + return (0); + } else if (F_ISSET(hcp, H_ISDUP) && + F_ISSET(hcp, H_NEXT_NODUP)) { + F_CLR(hcp, H_ISDUP); + hcp->indx += 2; + } + F_CLR(hcp, H_DELETED); + } else if (hcp->indx == NDX_INVALID) { + hcp->indx = 0; + F_CLR(hcp, H_ISDUP); + } else if (F_ISSET(hcp, H_NEXT_NODUP)) { + hcp->indx += 2; + F_CLR(hcp, H_ISDUP); + } else if (F_ISSET(hcp, H_ISDUP) && hcp->dup_tlen != 0) { + if (hcp->dup_off + DUP_SIZE(hcp->dup_len) >= + hcp->dup_tlen && F_ISSET(hcp, H_DUPONLY)) { + F_CLR(hcp, H_OK); + F_SET(hcp, H_NOMORE); + return (0); + } + hcp->dup_off += DUP_SIZE(hcp->dup_len); + if (hcp->dup_off >= hcp->dup_tlen) { + F_CLR(hcp, H_ISDUP); + hcp->indx += 2; + } + } else if (F_ISSET(hcp, H_DUPONLY)) { + F_CLR(hcp, H_OK); + F_SET(hcp, H_NOMORE); + return (0); + } else { + hcp->indx += 2; + F_CLR(hcp, H_ISDUP); + } + + return (__ham_item(dbc, mode, pgnop)); +} + +/* + * PUBLIC: void __ham_putitem __P((PAGE *p, const DBT *, int)); + * + * This is a little bit sleazy in that we're overloading the meaning + * of the H_OFFPAGE type here. When we recover deletes, we have the + * entire entry instead of having only the DBT, so we'll pass type + * H_OFFPAGE to mean, "copy the whole entry" as opposed to constructing + * an H_KEYDATA around it. + */ +void +__ham_putitem(p, dbt, type) + PAGE *p; + const DBT *dbt; + int type; +{ + u_int16_t n, off; + + n = NUM_ENT(p); + + /* Put the item element on the page. */ + if (type == H_OFFPAGE) { + off = HOFFSET(p) - dbt->size; + HOFFSET(p) = p->inp[n] = off; + memcpy(P_ENTRY(p, n), dbt->data, dbt->size); + } else { + off = HOFFSET(p) - HKEYDATA_SIZE(dbt->size); + HOFFSET(p) = p->inp[n] = off; + PUT_HKEYDATA(P_ENTRY(p, n), dbt->data, dbt->size, type); + } + + /* Adjust page info. */ + NUM_ENT(p) += 1; +} + +/* + * PUBLIC: void __ham_reputpair + * PUBLIC: __P((PAGE *p, u_int32_t, u_int32_t, const DBT *, const DBT *)); + * + * This is a special case to restore a key/data pair to its original + * location during recovery. We are guaranteed that the pair fits + * on the page and is not the last pair on the page (because if it's + * the last pair, the normal insert works). + */ +void +__ham_reputpair(p, psize, ndx, key, data) + PAGE *p; + u_int32_t psize, ndx; + const DBT *key, *data; +{ + db_indx_t i, movebytes, newbytes; + u_int8_t *from; + + /* First shuffle the existing items up on the page. */ + movebytes = + (ndx == 0 ? psize : p->inp[H_DATAINDEX(ndx - 2)]) - HOFFSET(p); + newbytes = key->size + data->size; + from = (u_int8_t *)p + HOFFSET(p); + memmove(from - newbytes, from, movebytes); + + /* + * Adjust the indices and move them up 2 spaces. Note that we + * have to check the exit condition inside the loop just in case + * we are dealing with index 0 (db_indx_t's are unsigned). + */ + for (i = NUM_ENT(p) - 1; ; i-- ) { + p->inp[i + 2] = p->inp[i] - newbytes; + if (i == H_KEYINDEX(ndx)) + break; + } + + /* Put the key and data on the page. */ + p->inp[H_KEYINDEX(ndx)] = + (ndx == 0 ? 
psize : p->inp[H_DATAINDEX(ndx - 2)]) - key->size; + p->inp[H_DATAINDEX(ndx)] = p->inp[H_KEYINDEX(ndx)] - data->size; + memcpy(P_ENTRY(p, H_KEYINDEX(ndx)), key->data, key->size); + memcpy(P_ENTRY(p, H_DATAINDEX(ndx)), data->data, data->size); + + /* Adjust page info. */ + HOFFSET(p) -= newbytes; + NUM_ENT(p) += 2; +} + +/* + * PUBLIC: int __ham_del_pair __P((DBC *, int)); + */ +int +__ham_del_pair(dbc, reclaim_page) + DBC *dbc; + int reclaim_page; +{ + DB *dbp; + HASH_CURSOR *hcp; + DBT data_dbt, key_dbt; + DB_ENV *dbenv; + DB_LSN new_lsn, *n_lsn, tmp_lsn; + PAGE *n_pagep, *nn_pagep, *p, *p_pagep; + db_indx_t ndx; + db_pgno_t chg_pgno, pgno, tmp_pgno; + int ret, t_ret; + + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; + + dbenv = dbp->dbenv; + ndx = hcp->indx; + + n_pagep = p_pagep = nn_pagep = NULL; + + if (hcp->page == NULL && (ret = memp_fget(dbp->mpf, + &hcp->pgno, DB_MPOOL_CREATE, &hcp->page)) != 0) + return (ret); + p = hcp->page; + + /* + * We optimize for the normal case which is when neither the key nor + * the data are large. In this case, we write a single log record + * and do the delete. If either is large, we'll call __big_delete + * to remove the big item and then update the page to remove the + * entry referring to the big item. + */ + ret = 0; + if (HPAGE_PTYPE(H_PAIRKEY(p, ndx)) == H_OFFPAGE) { + memcpy(&pgno, HOFFPAGE_PGNO(P_ENTRY(p, H_KEYINDEX(ndx))), + sizeof(db_pgno_t)); + ret = __db_doff(dbc, pgno); + } + + if (ret == 0) + switch (HPAGE_PTYPE(H_PAIRDATA(p, ndx))) { + case H_OFFPAGE: + memcpy(&pgno, + HOFFPAGE_PGNO(P_ENTRY(p, H_DATAINDEX(ndx))), + sizeof(db_pgno_t)); + ret = __db_doff(dbc, pgno); + break; + case H_OFFDUP: + case H_DUPLICATE: + /* + * If we delete a pair that is/was a duplicate, then + * we had better clear the flag so that we update the + * cursor appropriately. + */ + F_CLR(hcp, H_ISDUP); + break; + } + + if (ret) + return (ret); + + /* Now log the delete off this page. */ + if (DB_LOGGING(dbc)) { + key_dbt.data = P_ENTRY(p, H_KEYINDEX(ndx)); + key_dbt.size = LEN_HITEM(p, dbp->pgsize, H_KEYINDEX(ndx)); + data_dbt.data = P_ENTRY(p, H_DATAINDEX(ndx)); + data_dbt.size = LEN_HITEM(p, dbp->pgsize, H_DATAINDEX(ndx)); + + if ((ret = __ham_insdel_log(dbenv, + dbc->txn, &new_lsn, 0, DELPAIR, + dbp->log_fileid, PGNO(p), (u_int32_t)ndx, + &LSN(p), &key_dbt, &data_dbt)) != 0) + return (ret); + + /* Move lsn onto page. */ + LSN(p) = new_lsn; + } + + /* Do the delete. */ + __ham_dpair(dbp, p, ndx); + + /* + * Mark item deleted so that we don't try to return it, and + * so that we update the cursor correctly on the next call + * to next. + */ + F_SET(hcp, H_DELETED); + F_CLR(hcp, H_OK); + + /* + * Update cursors that are on the page where the delete happend. + */ + if ((ret = __ham_c_update(dbc, 0, 0, 0)) != 0) + return (ret); + + /* + * If we are locking, we will not maintain this, because it is + * a hot spot. + * + * XXX + * Perhaps we can retain incremental numbers and apply them later. + */ + if (!STD_LOCKING(dbc)) + --hcp->hdr->nelem; + + /* + * If we need to reclaim the page, then check if the page is empty. + * There are two cases. If it's empty and it's not the first page + * in the bucket (i.e., the bucket page) then we can simply remove + * it. If it is the first chain in the bucket, then we need to copy + * the second page into it and remove the second page. + * If its the only page in the bucket we leave it alone. 
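+ * (The head page itself is never released: bucket pages are addressed + * directly by bucket number -- see BUCKET_TO_PAGE -- so the head of the + * chain has to stay put, which is why the second page is copied over it + * instead.)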
+ */ + if (!reclaim_page || + NUM_ENT(p) != 0 || + (PREV_PGNO(p) == PGNO_INVALID && NEXT_PGNO(p) == PGNO_INVALID)) + return (memp_fset(dbp->mpf, p, DB_MPOOL_DIRTY)); + + if (PREV_PGNO(p) == PGNO_INVALID) { + /* + * First page in chain is empty and we know that there + * are more pages in the chain. + */ + if ((ret = + memp_fget(dbp->mpf, &NEXT_PGNO(p), 0, &n_pagep)) != 0) + return (ret); + + if (NEXT_PGNO(n_pagep) != PGNO_INVALID && + (ret = memp_fget(dbp->mpf, &NEXT_PGNO(n_pagep), 0, + &nn_pagep)) != 0) + goto err; + + if (DB_LOGGING(dbc)) { + key_dbt.data = n_pagep; + key_dbt.size = dbp->pgsize; + if ((ret = __ham_copypage_log(dbenv, + dbc->txn, &new_lsn, 0, dbp->log_fileid, PGNO(p), + &LSN(p), PGNO(n_pagep), &LSN(n_pagep), + NEXT_PGNO(n_pagep), + nn_pagep == NULL ? NULL : &LSN(nn_pagep), + &key_dbt)) != 0) + goto err; + + /* Move lsn onto page. */ + LSN(p) = new_lsn; /* Structure assignment. */ + LSN(n_pagep) = new_lsn; + if (NEXT_PGNO(n_pagep) != PGNO_INVALID) + LSN(nn_pagep) = new_lsn; + } + if (nn_pagep != NULL) { + PREV_PGNO(nn_pagep) = PGNO(p); + if ((ret = memp_fput(dbp->mpf, + nn_pagep, DB_MPOOL_DIRTY)) != 0) { + nn_pagep = NULL; + goto err; + } + } + + tmp_pgno = PGNO(p); + tmp_lsn = LSN(p); + memcpy(p, n_pagep, dbp->pgsize); + PGNO(p) = tmp_pgno; + LSN(p) = tmp_lsn; + PREV_PGNO(p) = PGNO_INVALID; + + /* + * Update cursors to reflect the fact that records + * on the second page have moved to the first page. + */ + if ((ret = __ham_c_chgpg(dbc, + PGNO(n_pagep), NDX_INVALID, PGNO(p), NDX_INVALID)) != 0) + return (ret); + + /* + * Update the cursor to reflect its new position. + */ + hcp->indx = 0; + hcp->pgno = PGNO(p); + if ((ret = memp_fset(dbp->mpf, p, DB_MPOOL_DIRTY)) != 0 || + (ret = __db_free(dbc, n_pagep)) != 0) + return (ret); + } else { + if ((ret = + memp_fget(dbp->mpf, &PREV_PGNO(p), 0, &p_pagep)) != 0) + goto err; + + if (NEXT_PGNO(p) != PGNO_INVALID) { + if ((ret = memp_fget(dbp->mpf, + &NEXT_PGNO(p), 0, &n_pagep)) != 0) + goto err; + n_lsn = &LSN(n_pagep); + } else { + n_pagep = NULL; + n_lsn = NULL; + } + + NEXT_PGNO(p_pagep) = NEXT_PGNO(p); + if (n_pagep != NULL) + PREV_PGNO(n_pagep) = PGNO(p_pagep); + + if (DB_LOGGING(dbc)) { + if ((ret = __ham_newpage_log(dbenv, + dbc->txn, &new_lsn, 0, DELOVFL, + dbp->log_fileid, PREV_PGNO(p), &LSN(p_pagep), + PGNO(p), &LSN(p), NEXT_PGNO(p), n_lsn)) != 0) + goto err; + + /* Move lsn onto page. */ + LSN(p_pagep) = new_lsn; /* Structure assignment. */ + if (n_pagep) + LSN(n_pagep) = new_lsn; + LSN(p) = new_lsn; + } + if (NEXT_PGNO(p) == PGNO_INVALID) { + /* + * There is no next page; put the cursor on the + * previous page as if we'd deleted the last item + * on that page; index greater than number of + * valid entries and H_DELETED set. + */ + hcp->pgno = PGNO(p_pagep); + hcp->indx = NUM_ENT(p_pagep); + F_SET(hcp, H_DELETED); + } else { + hcp->pgno = NEXT_PGNO(p); + hcp->indx = 0; + } + + /* + * Since we are about to delete the cursor page and we have + * just moved the cursor, we need to make sure that the + * old page pointer isn't left hanging around in the cursor. + */ + hcp->page = NULL; + chg_pgno = PGNO(p); + ret = __db_free(dbc, p); + if ((t_ret = memp_fput(dbp->mpf, p_pagep, DB_MPOOL_DIRTY)) != 0 + && ret == 0) + ret = t_ret; + if (n_pagep != NULL && (t_ret = memp_fput(dbp->mpf, + n_pagep, DB_MPOOL_DIRTY)) != 0 && ret == 0) + ret = t_ret; + if (ret != 0) + return (ret); + ret = __ham_c_chgpg(dbc, + chg_pgno, 0, hcp->pgno, hcp->indx); + } + return (ret); + +err: /* Clean up any pages. 
*/ + if (n_pagep != NULL) + (void)memp_fput(dbp->mpf, n_pagep, 0); + if (nn_pagep != NULL) + (void)memp_fput(dbp->mpf, nn_pagep, 0); + if (p_pagep != NULL) + (void)memp_fput(dbp->mpf, p_pagep, 0); + return (ret); +} + +/* + * __ham_replpair -- + * Given the key data indicated by the cursor, replace part/all of it + * according to the fields in the dbt. + * + * PUBLIC: int __ham_replpair __P((DBC *, DBT *, u_int32_t)); + */ +int +__ham_replpair(dbc, dbt, make_dup) + DBC *dbc; + DBT *dbt; + u_int32_t make_dup; +{ + DB *dbp; + HASH_CURSOR *hcp; + DBT old_dbt, tdata, tmp; + DB_LSN new_lsn; + int32_t change; /* XXX: Possible overflow. */ + u_int32_t dup, len, memsize; + int is_big, ret, type; + u_int8_t *beg, *dest, *end, *hk, *src; + void *memp; + + /* + * Big item replacements are handled in generic code. + * Items that fit on the current page fall into 4 classes. + * 1. On-page element, same size + * 2. On-page element, new is bigger (fits) + * 3. On-page element, new is bigger (does not fit) + * 4. On-page element, old is bigger + * Numbers 1, 2, and 4 are essentially the same (and should + * be the common case). We handle case 3 as a delete and + * add. + */ + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; + + /* + * We need to compute the number of bytes that we are adding or + * removing from the entry. Normally, we can simply substract + * the number of bytes we are replacing (dbt->dlen) from the + * number of bytes we are inserting (dbt->size). However, if + * we are doing a partial put off the end of a record, then this + * formula doesn't work, because we are essentially adding + * new bytes. + */ + change = dbt->size - dbt->dlen; + + hk = H_PAIRDATA(hcp->page, hcp->indx); + is_big = HPAGE_PTYPE(hk) == H_OFFPAGE; + + if (is_big) + memcpy(&len, HOFFPAGE_TLEN(hk), sizeof(u_int32_t)); + else + len = LEN_HKEYDATA(hcp->page, + dbp->pgsize, H_DATAINDEX(hcp->indx)); + + if (dbt->doff + dbt->dlen > len) + change += dbt->doff + dbt->dlen - len; + + if (change > (int32_t)P_FREESPACE(hcp->page) || is_big) { + /* + * Case 3 -- two subcases. + * A. This is not really a partial operation, but an overwrite. + * Simple del and add works. + * B. This is a partial and we need to construct the data that + * we are really inserting (yuck). + * In both cases, we need to grab the key off the page (in + * some cases we could do this outside of this routine; for + * cleanliness we do it here. If you happen to be on a big + * key, this could be a performance hit). + */ + memset(&tmp, 0, sizeof(tmp)); + if ((ret = + __db_ret(dbp, hcp->page, H_KEYINDEX(hcp->indx), + &tmp, &dbc->rkey.data, &dbc->rkey.ulen)) != 0) + return (ret); + + /* Preserve duplicate info. */ + dup = F_ISSET(hcp, H_ISDUP); + if (dbt->doff == 0 && dbt->dlen == len) { + ret = __ham_del_pair(dbc, 0); + if (ret == 0) + ret = __ham_add_el(dbc, + &tmp, dbt, dup ? H_DUPLICATE : H_KEYDATA); + } else { /* Case B */ + type = HPAGE_PTYPE(hk) != H_OFFPAGE ? + HPAGE_PTYPE(hk) : H_KEYDATA; + memset(&tdata, 0, sizeof(tdata)); + memp = NULL; + memsize = 0; + if ((ret = __db_ret(dbp, hcp->page, + H_DATAINDEX(hcp->indx), &tdata, &memp, &memsize)) + != 0) + goto err; + + /* Now we can delete the item. */ + if ((ret = __ham_del_pair(dbc, 0)) != 0) { + __os_free(memp, memsize); + goto err; + } + + /* Now shift old data around to make room for new. 
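+ *
+ * [Editorial sketch -- not part of the original source; the helper
+ * and its name are hypothetical.]  The realloc/memmove/memcpy
+ * sequence below amounts to applying a partial put to a plain byte
+ * buffer: replace dlen bytes at offset doff with size new bytes,
+ * growing the buffer (and zero-filling any gap) when the record
+ * gets longer.  In isolation, using <errno.h>, <stdlib.h> and
+ * <string.h> where the real code uses __os_realloc and DBTs:
+ *
+ *     static int
+ *     apply_partial(u_int8_t **bufp, u_int32_t *reclenp,
+ *         u_int32_t doff, u_int32_t dlen,
+ *         const u_int8_t *newd, u_int32_t size)
+ *     {
+ *             u_int8_t *buf = *bufp;
+ *             u_int32_t reclen = *reclenp, tail;
+ *             int32_t change = (int32_t)size - (int32_t)dlen;
+ *
+ *             if (doff + dlen > reclen)
+ *                     change += doff + dlen - reclen;
+ *             if (change > 0) {
+ *                     if ((buf = realloc(buf,
+ *                         reclen + change)) == NULL)
+ *                             return (ENOMEM);
+ *                     memset(buf + reclen, 0, change);
+ *             }
+ *             tail = doff + dlen < reclen ?
+ *                 reclen - doff - dlen : 0;
+ *             memmove(buf + doff + size, buf + doff + dlen, tail);
+ *             memcpy(buf + doff, newd, size);
+ *             *bufp = buf;
+ *             *reclenp = reclen + change;
+ *             return (0);
+ *     }
+ *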
*/ + if (change > 0) { + if ((ret = __os_realloc(dbp->dbenv, + tdata.size + change, + NULL, &tdata.data)) != 0) + return (ret); + memp = tdata.data; + memsize = tdata.size + change; + memset((u_int8_t *)tdata.data + tdata.size, + 0, change); + } + end = (u_int8_t *)tdata.data + tdata.size; + + src = (u_int8_t *)tdata.data + dbt->doff + dbt->dlen; + if (src < end && tdata.size > dbt->doff + dbt->dlen) { + len = tdata.size - dbt->doff - dbt->dlen; + dest = src + change; + memmove(dest, src, len); + } + memcpy((u_int8_t *)tdata.data + dbt->doff, + dbt->data, dbt->size); + tdata.size += change; + + /* Now add the pair. */ + ret = __ham_add_el(dbc, &tmp, &tdata, type); + __os_free(memp, memsize); + } + F_SET(hcp, dup); +err: return (ret); + } + + /* + * Set up pointer into existing data. Do it before the log + * message so we can use it inside of the log setup. + */ + beg = HKEYDATA_DATA(H_PAIRDATA(hcp->page, hcp->indx)); + beg += dbt->doff; + + /* + * If we are going to have to move bytes at all, figure out + * all the parameters here. Then log the call before moving + * anything around. + */ + if (DB_LOGGING(dbc)) { + old_dbt.data = beg; + old_dbt.size = dbt->dlen; + if ((ret = __ham_replace_log(dbp->dbenv, + dbc->txn, &new_lsn, 0, dbp->log_fileid, PGNO(hcp->page), + (u_int32_t)H_DATAINDEX(hcp->indx), &LSN(hcp->page), + (u_int32_t)dbt->doff, &old_dbt, dbt, make_dup)) != 0) + return (ret); + + LSN(hcp->page) = new_lsn; /* Structure assignment. */ + } + + __ham_onpage_replace(hcp->page, dbp->pgsize, + (u_int32_t)H_DATAINDEX(hcp->indx), (int32_t)dbt->doff, change, dbt); + + return (0); +} + +/* + * Replace data on a page with new data, possibly growing or shrinking what's + * there. This is called on two different occasions. On one (from replpair) + * we are interested in changing only the data. On the other (from recovery) + * we are replacing the entire data (header and all) with a new element. In + * the latter case, the off argument is negative. + * pagep: the page that we're changing + * ndx: page index of the element that is growing/shrinking. + * off: Offset at which we are beginning the replacement. + * change: the number of bytes (+ or -) that the element is growing/shrinking. + * dbt: the new data that gets written at beg. + * PUBLIC: void __ham_onpage_replace __P((PAGE *, size_t, u_int32_t, int32_t, + * PUBLIC: int32_t, DBT *)); + */ +void +__ham_onpage_replace(pagep, pgsize, ndx, off, change, dbt) + PAGE *pagep; + size_t pgsize; + u_int32_t ndx; + int32_t off; + int32_t change; + DBT *dbt; +{ + db_indx_t i; + int32_t len; + u_int8_t *src, *dest; + int zero_me; + + if (change != 0) { + zero_me = 0; + src = (u_int8_t *)(pagep) + HOFFSET(pagep); + if (off < 0) + len = pagep->inp[ndx] - HOFFSET(pagep); + else if ((u_int32_t)off >= LEN_HKEYDATA(pagep, pgsize, ndx)) { + len = HKEYDATA_DATA(P_ENTRY(pagep, ndx)) + + LEN_HKEYDATA(pagep, pgsize, ndx) - src; + zero_me = 1; + } else + len = (HKEYDATA_DATA(P_ENTRY(pagep, ndx)) + off) - src; + dest = src - change; + memmove(dest, src, len); + if (zero_me) + memset(dest + len, 0, change); + + /* Now update the indices. 
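+ *
+ * [Editorial illustration with hypothetical numbers.]  Hash items
+ * are stored from the end of the page toward the front, so item ndx
+ * and every later item start inside the block that was just moved.
+ * For example, on a 256-byte page with two items,
+ * inp[] = { 200, 150 } and HOFFSET = 150; growing item 1 by
+ * change = 16 bytes slides its data toward the front, giving
+ * inp[] = { 200, 134 } and HOFFSET = 134.  The loop below does
+ * exactly that: entries ndx and up, and HOFFSET itself, drop by
+ * `change' (and rise by the same amount when the item shrank).
+ *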
*/ + for (i = ndx; i < NUM_ENT(pagep); i++) + pagep->inp[i] -= change; + HOFFSET(pagep) -= change; + } + if (off >= 0) + memcpy(HKEYDATA_DATA(P_ENTRY(pagep, ndx)) + off, + dbt->data, dbt->size); + else + memcpy(P_ENTRY(pagep, ndx), dbt->data, dbt->size); +} + +/* + * PUBLIC: int __ham_split_page __P((DBC *, u_int32_t, u_int32_t)); + */ +int +__ham_split_page(dbc, obucket, nbucket) + DBC *dbc; + u_int32_t obucket, nbucket; +{ + DB *dbp; + DBC **carray; + HASH_CURSOR *hcp, *cp; + DBT key, page_dbt; + DB_ENV *dbenv; + DB_LSN new_lsn; + PAGE **pp, *old_pagep, *temp_pagep, *new_pagep; + db_indx_t n; + db_pgno_t bucket_pgno, npgno, next_pgno; + u_int32_t big_len, len; + int found, i, ret, t_ret; + void *big_buf; + + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; + dbenv = dbp->dbenv; + temp_pagep = old_pagep = new_pagep = NULL; + + if ((ret = __ham_get_clist(dbp, obucket, NDX_INVALID, &carray)) != 0) + return (ret); + + bucket_pgno = BUCKET_TO_PAGE(hcp, obucket); + if ((ret = memp_fget(dbp->mpf, + &bucket_pgno, DB_MPOOL_CREATE, &old_pagep)) != 0) + goto err; + + /* Properly initialize the new bucket page. */ + npgno = BUCKET_TO_PAGE(hcp, nbucket); + if ((ret = memp_fget(dbp->mpf, + &npgno, DB_MPOOL_CREATE, &new_pagep)) != 0) + goto err; + P_INIT(new_pagep, + dbp->pgsize, npgno, PGNO_INVALID, PGNO_INVALID, 0, P_HASH); + + temp_pagep = hcp->split_buf; + memcpy(temp_pagep, old_pagep, dbp->pgsize); + + if (DB_LOGGING(dbc)) { + page_dbt.size = dbp->pgsize; + page_dbt.data = old_pagep; + if ((ret = __ham_splitdata_log(dbenv, + dbc->txn, &new_lsn, 0, dbp->log_fileid, SPLITOLD, + PGNO(old_pagep), &page_dbt, &LSN(old_pagep))) != 0) + goto err; + } + + P_INIT(old_pagep, dbp->pgsize, PGNO(old_pagep), PGNO_INVALID, + PGNO_INVALID, 0, P_HASH); + + if (DB_LOGGING(dbc)) + LSN(old_pagep) = new_lsn; /* Structure assignment. */ + + big_len = 0; + big_buf = NULL; + key.flags = 0; + while (temp_pagep != NULL) { + for (n = 0; n < (db_indx_t)NUM_ENT(temp_pagep); n += 2) { + if ((ret = + __db_ret(dbp, temp_pagep, H_KEYINDEX(n), + &key, &big_buf, &big_len)) != 0) + goto err; + + if (__ham_call_hash(dbc, key.data, key.size) + == obucket) + pp = &old_pagep; + else + pp = &new_pagep; + + /* + * Figure out how many bytes we need on the new + * page to store the key/data pair. + */ + + len = LEN_HITEM(temp_pagep, dbp->pgsize, + H_DATAINDEX(n)) + + LEN_HITEM(temp_pagep, dbp->pgsize, + H_KEYINDEX(n)) + + 2 * sizeof(db_indx_t); + + if (P_FREESPACE(*pp) < len) { + if (DB_LOGGING(dbc)) { + page_dbt.size = dbp->pgsize; + page_dbt.data = *pp; + if ((ret = __ham_splitdata_log( + dbenv, dbc->txn, + &new_lsn, 0, dbp->log_fileid, + SPLITNEW, PGNO(*pp), &page_dbt, + &LSN(*pp))) != 0) + goto err; + LSN(*pp) = new_lsn; + } + if ((ret = + __ham_add_ovflpage(dbc, *pp, 1, pp)) != 0) + goto err; + } + + /* Check if we need to update a cursor. 
*/ + if (carray != NULL) { + found = 0; + for (i = 0; carray[i] != NULL; i++) { + cp = + (HASH_CURSOR *)carray[i]->internal; + if (cp->pgno == PGNO(temp_pagep) + && cp->indx == n) { + cp->pgno = PGNO(*pp); + cp->indx = NUM_ENT(*pp); + found = 1; + } + } + if (found && DB_LOGGING(dbc) + && IS_SUBTRANSACTION(dbc->txn)) { + if ((ret = + __ham_chgpg_log(dbp->dbenv, + dbc->txn, &new_lsn, 0, + dbp->log_fileid, + DB_HAM_SPLIT, PGNO(temp_pagep), + PGNO(*pp), n, NUM_ENT(*pp))) != 0) + goto err; + } + } + __ham_copy_item(dbp->pgsize, + temp_pagep, H_KEYINDEX(n), *pp); + __ham_copy_item(dbp->pgsize, + temp_pagep, H_DATAINDEX(n), *pp); + } + next_pgno = NEXT_PGNO(temp_pagep); + + /* Clear temp_page; if it's a link overflow page, free it. */ + if (PGNO(temp_pagep) != bucket_pgno && (ret = + __db_free(dbc, temp_pagep)) != 0) { + temp_pagep = NULL; + goto err; + } + + if (next_pgno == PGNO_INVALID) + temp_pagep = NULL; + else if ((ret = memp_fget(dbp->mpf, + &next_pgno, DB_MPOOL_CREATE, &temp_pagep)) != 0) + goto err; + + if (temp_pagep != NULL && DB_LOGGING(dbc)) { + page_dbt.size = dbp->pgsize; + page_dbt.data = temp_pagep; + if ((ret = __ham_splitdata_log(dbenv, + dbc->txn, &new_lsn, 0, dbp->log_fileid, + SPLITOLD, PGNO(temp_pagep), + &page_dbt, &LSN(temp_pagep))) != 0) + goto err; + LSN(temp_pagep) = new_lsn; + } + } + if (big_buf != NULL) + __os_free(big_buf, big_len); + + /* + * If the original bucket spanned multiple pages, then we've got + * a pointer to a page that used to be on the bucket chain. It + * should be deleted. + */ + if (temp_pagep != NULL && PGNO(temp_pagep) != bucket_pgno && + (ret = __db_free(dbc, temp_pagep)) != 0) { + temp_pagep = NULL; + goto err; + } + + /* + * Write new buckets out. + */ + if (DB_LOGGING(dbc)) { + page_dbt.size = dbp->pgsize; + page_dbt.data = old_pagep; + if ((ret = __ham_splitdata_log(dbenv, dbc->txn, &new_lsn, 0, + dbp->log_fileid, SPLITNEW, PGNO(old_pagep), &page_dbt, + &LSN(old_pagep))) != 0) + goto err; + LSN(old_pagep) = new_lsn; + + page_dbt.data = new_pagep; + if ((ret = __ham_splitdata_log(dbenv, dbc->txn, &new_lsn, 0, + dbp->log_fileid, SPLITNEW, PGNO(new_pagep), &page_dbt, + &LSN(new_pagep))) != 0) + goto err; + LSN(new_pagep) = new_lsn; + } + ret = memp_fput(dbp->mpf, old_pagep, DB_MPOOL_DIRTY); + if ((t_ret = memp_fput(dbp->mpf, new_pagep, DB_MPOOL_DIRTY)) != 0 + && ret == 0) + ret = t_ret; + + if (0) { +err: if (old_pagep != NULL) + (void)memp_fput(dbp->mpf, old_pagep, DB_MPOOL_DIRTY); + if (new_pagep != NULL) + (void)memp_fput(dbp->mpf, new_pagep, DB_MPOOL_DIRTY); + if (temp_pagep != NULL && PGNO(temp_pagep) != bucket_pgno) + (void)memp_fput(dbp->mpf, temp_pagep, DB_MPOOL_DIRTY); + } + if (carray != NULL) /* We never knew its size. */ + __os_free(carray, 0); + return (ret); +} + +/* + * Add the given pair to the page. The page in question may already be + * held (i.e. it was already gotten). If it is, then the page is passed + * in via the pagep parameter. On return, pagep will contain the page + * to which we just added something. This allows us to link overflow + * pages and return the new page having correctly put the last page. 
+ * + * PUBLIC: int __ham_add_el __P((DBC *, const DBT *, const DBT *, int)); + */ +int +__ham_add_el(dbc, key, val, type) + DBC *dbc; + const DBT *key, *val; + int type; +{ + DB *dbp; + HASH_CURSOR *hcp; + const DBT *pkey, *pdata; + DBT key_dbt, data_dbt; + DB_LSN new_lsn; + HOFFPAGE doff, koff; + db_pgno_t next_pgno, pgno; + u_int32_t data_size, key_size, pairsize, rectype; + int do_expand, is_keybig, is_databig, ret; + int key_type, data_type; + + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; + do_expand = 0; + + pgno = hcp->seek_found_page != PGNO_INVALID ? hcp->seek_found_page : + hcp->pgno; + if (hcp->page == NULL && (ret = memp_fget(dbp->mpf, &pgno, + DB_MPOOL_CREATE, &hcp->page)) != 0) + return (ret); + + key_size = HKEYDATA_PSIZE(key->size); + data_size = HKEYDATA_PSIZE(val->size); + is_keybig = ISBIG(hcp, key->size); + is_databig = ISBIG(hcp, val->size); + if (is_keybig) + key_size = HOFFPAGE_PSIZE; + if (is_databig) + data_size = HOFFPAGE_PSIZE; + + pairsize = key_size + data_size; + + /* Advance to first page in chain with room for item. */ + while (H_NUMPAIRS(hcp->page) && NEXT_PGNO(hcp->page) != PGNO_INVALID) { + /* + * This may not be the end of the chain, but the pair may fit + * anyway. Check if it's a bigpair that fits or a regular + * pair that fits. + */ + if (P_FREESPACE(hcp->page) >= pairsize) + break; + next_pgno = NEXT_PGNO(hcp->page); + if ((ret = + __ham_next_cpage(dbc, next_pgno, 0)) != 0) + return (ret); + } + + /* + * Check if we need to allocate a new page. + */ + if (P_FREESPACE(hcp->page) < pairsize) { + do_expand = 1; + if ((ret = __ham_add_ovflpage(dbc, + (PAGE *)hcp->page, 1, (PAGE **)&hcp->page)) != 0) + return (ret); + hcp->pgno = PGNO(hcp->page); + } + + /* + * Update cursor. + */ + hcp->indx = NUM_ENT(hcp->page); + F_CLR(hcp, H_DELETED); + if (is_keybig) { + koff.type = H_OFFPAGE; + UMRW_SET(koff.unused[0]); + UMRW_SET(koff.unused[1]); + UMRW_SET(koff.unused[2]); + if ((ret = __db_poff(dbc, key, &koff.pgno)) != 0) + return (ret); + koff.tlen = key->size; + key_dbt.data = &koff; + key_dbt.size = sizeof(koff); + pkey = &key_dbt; + key_type = H_OFFPAGE; + } else { + pkey = key; + key_type = H_KEYDATA; + } + + if (is_databig) { + doff.type = H_OFFPAGE; + UMRW_SET(doff.unused[0]); + UMRW_SET(doff.unused[1]); + UMRW_SET(doff.unused[2]); + if ((ret = __db_poff(dbc, val, &doff.pgno)) != 0) + return (ret); + doff.tlen = val->size; + data_dbt.data = &doff; + data_dbt.size = sizeof(doff); + pdata = &data_dbt; + data_type = H_OFFPAGE; + } else { + pdata = val; + data_type = type; + } + + if (DB_LOGGING(dbc)) { + rectype = PUTPAIR; + if (is_databig) + rectype |= PAIR_DATAMASK; + if (is_keybig) + rectype |= PAIR_KEYMASK; + if (type == H_DUPLICATE) + rectype |= PAIR_DUPMASK; + + if ((ret = __ham_insdel_log(dbp->dbenv, dbc->txn, &new_lsn, 0, + rectype, dbp->log_fileid, PGNO(hcp->page), + (u_int32_t)NUM_ENT(hcp->page), &LSN(hcp->page), pkey, + pdata)) != 0) + return (ret); + + /* Move lsn onto page. */ + LSN(hcp->page) = new_lsn; /* Structure assignment. */ + } + + __ham_putitem(hcp->page, pkey, key_type); + __ham_putitem(hcp->page, pdata, data_type); + + /* + * For splits, we are going to update item_info's page number + * field, so that we can easily return to the same page the + * next time we come in here. For other operations, this shouldn't + * matter, since odds are this is the last thing that happens before + * we return to the user program. + */ + hcp->pgno = PGNO(hcp->page); + + /* + * XXX + * Maybe keep incremental numbers here. 
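+ *
+ * [Editorial note; needs_expand() is a hypothetical name.]  The
+ * check just below decides whether this bucket should be split at
+ * the next opportunity: either an overflow page had to be chained
+ * to fit the pair, or the page now holds more pairs than the
+ * configured fill factor.  As a predicate:
+ *
+ *     static int
+ *     needs_expand(int added_ovfl,
+ *         u_int32_t npairs, u_int32_t ffactor)
+ *     {
+ *             return (added_ovfl ||
+ *                 (ffactor != 0 && npairs > ffactor));
+ *     }
+ *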
+ */ + if (!STD_LOCKING(dbc)) + hcp->hdr->nelem++; + + if (do_expand || (hcp->hdr->ffactor != 0 && + (u_int32_t)H_NUMPAIRS(hcp->page) > hcp->hdr->ffactor)) + F_SET(hcp, H_EXPAND); + return (0); +} + +/* + * Special __putitem call used in splitting -- copies one entry to + * another. Works for all types of hash entries (H_OFFPAGE, H_KEYDATA, + * H_DUPLICATE, H_OFFDUP). Since we log splits at a high level, we + * do not need to do any logging here. + * + * PUBLIC: void __ham_copy_item __P((size_t, PAGE *, u_int32_t, PAGE *)); + */ +void +__ham_copy_item(pgsize, src_page, src_ndx, dest_page) + size_t pgsize; + PAGE *src_page; + u_int32_t src_ndx; + PAGE *dest_page; +{ + u_int32_t len; + void *src, *dest; + + /* + * Copy the key and data entries onto this new page. + */ + src = P_ENTRY(src_page, src_ndx); + + /* Set up space on dest. */ + len = LEN_HITEM(src_page, pgsize, src_ndx); + HOFFSET(dest_page) -= len; + dest_page->inp[NUM_ENT(dest_page)] = HOFFSET(dest_page); + dest = P_ENTRY(dest_page, NUM_ENT(dest_page)); + NUM_ENT(dest_page)++; + + memcpy(dest, src, len); +} + +/* + * + * Returns: + * pointer on success + * NULL on error + * + * PUBLIC: int __ham_add_ovflpage __P((DBC *, PAGE *, int, PAGE **)); + */ +int +__ham_add_ovflpage(dbc, pagep, release, pp) + DBC *dbc; + PAGE *pagep; + int release; + PAGE **pp; +{ + DB *dbp; + HASH_CURSOR *hcp; + DB_LSN new_lsn; + PAGE *new_pagep; + int ret; + + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; + + if ((ret = __db_new(dbc, P_HASH, &new_pagep)) != 0) + return (ret); + + if (DB_LOGGING(dbc)) { + if ((ret = __ham_newpage_log(dbp->dbenv, dbc->txn, &new_lsn, 0, + PUTOVFL, dbp->log_fileid, PGNO(pagep), &LSN(pagep), + PGNO(new_pagep), &LSN(new_pagep), PGNO_INVALID, NULL)) != 0) + return (ret); + + /* Move lsn onto page. */ + LSN(pagep) = LSN(new_pagep) = new_lsn; + } + NEXT_PGNO(pagep) = PGNO(new_pagep); + PREV_PGNO(new_pagep) = PGNO(pagep); + + if (release) + ret = memp_fput(dbp->mpf, pagep, DB_MPOOL_DIRTY); + + *pp = new_pagep; + return (ret); +} + +/* + * PUBLIC: int __ham_get_cpage __P((DBC *, db_lockmode_t)); + */ +int +__ham_get_cpage(dbc, mode) + DBC *dbc; + db_lockmode_t mode; +{ + DB *dbp; + DB_LOCK tmp_lock; + HASH_CURSOR *hcp; + int ret; + + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; + ret = 0; + + /* + * There are four cases with respect to buckets and locks. + * 1. If there is no lock held, then if we are locking, we should + * get the lock. + * 2. If there is a lock held, it's for the current bucket, and it's + * for the right mode, we don't need to do anything. + * 3. If there is a lock held for the current bucket but it's not + * strong enough, we need to upgrade. + * 4. If there is a lock, but it's for a different bucket, then we need + * to release the existing lock and get a new lock. + */ + tmp_lock.off = LOCK_INVALID; + if (STD_LOCKING(dbc)) { + if (hcp->lock.off != LOCK_INVALID && + hcp->lbucket != hcp->bucket) { /* Case 4 */ + if (dbc->txn == NULL && + (ret = lock_put(dbp->dbenv, &hcp->lock)) != 0) + return (ret); + hcp->lock.off = LOCK_INVALID; + } + if ((hcp->lock.off != LOCK_INVALID && + (hcp->lock_mode == DB_LOCK_READ && + mode == DB_LOCK_WRITE))) { + /* Case 3. */ + tmp_lock = hcp->lock; + hcp->lock.off = LOCK_INVALID; + } + + /* Acquire the lock. */ + if (hcp->lock.off == LOCK_INVALID) + /* Cases 1, 3, and 4. 
*/ + if ((ret = __ham_lock_bucket(dbc, mode)) != 0) + return (ret); + + if (ret == 0) { + hcp->lock_mode = mode; + hcp->lbucket = hcp->bucket; + if (tmp_lock.off != LOCK_INVALID) + /* Case 3: release the original lock. */ + ret = lock_put(dbp->dbenv, &tmp_lock); + } else if (tmp_lock.off != LOCK_INVALID) + hcp->lock = tmp_lock; + } + + if (ret == 0 && hcp->page == NULL) { + if (hcp->pgno == PGNO_INVALID) + hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket); + if ((ret = memp_fget(dbp->mpf, + &hcp->pgno, DB_MPOOL_CREATE, &hcp->page)) != 0) + return (ret); + } + + return (0); +} + +/* + * Get a new page at the cursor, putting the last page if necessary. + * If the flag is set to H_ISDUP, then we are talking about the + * duplicate page, not the main page. + * + * PUBLIC: int __ham_next_cpage __P((DBC *, db_pgno_t, int)); + */ +int +__ham_next_cpage(dbc, pgno, dirty) + DBC *dbc; + db_pgno_t pgno; + int dirty; +{ + DB *dbp; + HASH_CURSOR *hcp; + PAGE *p; + int ret; + + dbp = dbc->dbp; + hcp = (HASH_CURSOR *)dbc->internal; + + if (hcp->page != NULL && (ret = memp_fput(dbp->mpf, + hcp->page, dirty ? DB_MPOOL_DIRTY : 0)) != 0) + return (ret); + + if ((ret = memp_fget(dbp->mpf, &pgno, DB_MPOOL_CREATE, &p)) != 0) + return (ret); + + hcp->page = p; + hcp->pgno = pgno; + hcp->indx = 0; + + return (0); +} + +/* + * __ham_lock_bucket -- + * Get the lock on a particular bucket. + * + * PUBLIC: int __ham_lock_bucket __P((DBC *, db_lockmode_t)); + */ +int +__ham_lock_bucket(dbc, mode) + DBC *dbc; + db_lockmode_t mode; +{ + HASH_CURSOR *hcp; + u_int32_t flags; + int gotmeta, ret; + + hcp = (HASH_CURSOR *)dbc->internal; + gotmeta = hcp->hdr == NULL ? 1 : 0; + if (gotmeta) + if ((ret = __ham_get_meta(dbc)) != 0) + return (ret); + dbc->lock.pgno = BUCKET_TO_PAGE(hcp, hcp->bucket); + if (gotmeta) + if ((ret = __ham_release_meta(dbc)) != 0) + return (ret); + + flags = 0; + if (DB_NONBLOCK(dbc)) + LF_SET(DB_LOCK_NOWAIT); + + ret = lock_get(dbc->dbp->dbenv, + dbc->locker, flags, &dbc->lock_dbt, mode, &hcp->lock); + + hcp->lock_mode = mode; + return (ret); +} + +/* + * __ham_dpair -- + * Delete a pair on a page, paying no attention to what the pair + * represents. The caller is responsible for freeing up duplicates + * or offpage entries that might be referenced by this pair. + * + * PUBLIC: void __ham_dpair __P((DB *, PAGE *, u_int32_t)); + */ +void +__ham_dpair(dbp, p, indx) + DB *dbp; + PAGE *p; + u_int32_t indx; +{ + db_indx_t delta, n; + u_int8_t *dest, *src; + + /* + * Compute "delta", the amount we have to shift all of the + * offsets. To find the delta, we just need to calculate + * the size of the pair of elements we are removing. + */ + delta = H_PAIRSIZE(p, dbp->pgsize, indx); + + /* + * The hard case: we want to remove something other than + * the last item on the page. We need to shift data and + * offsets down. + */ + if ((db_indx_t)indx != NUM_ENT(p) - 2) { + /* + * Move the data: src is the first occupied byte on + * the page. (Length is delta.) + */ + src = (u_int8_t *)p + HOFFSET(p); + + /* + * Destination is delta bytes beyond src. This might + * be an overlapping copy, so we have to use memmove. + */ + dest = src + delta; + memmove(dest, src, p->inp[H_DATAINDEX(indx)] - HOFFSET(p)); + } + + /* Adjust page metadata. */ + HOFFSET(p) = HOFFSET(p) + delta; + NUM_ENT(p) = NUM_ENT(p) - 2; + + /* Adjust the offsets. 
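+ *
+ * [Editorial illustration with hypothetical offsets.]  Suppose a
+ * 256-byte page holds three pairs, with
+ *
+ *     inp[] = { 240, 230, 200, 180, 150, 140 }, HOFFSET = 140,
+ *
+ * and pair 1 (entries 2 and 3) is deleted.  Its two items span
+ * delta = inp[1] - inp[3] = 230 - 180 = 50 bytes, so the 40 bytes
+ * belonging to entries 4 and 5 slide up by 50, HOFFSET becomes 190,
+ * NUM_ENT drops to 4, and the surviving entries take over slots 2
+ * and 3 in the loop below:
+ *
+ *     inp[] = { 240, 230, 200, 190 }
+ *
+ * i.e. inp[2] = old inp[4] + 50 and inp[3] = old inp[5] + 50.
+ *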
*/ + for (n = (db_indx_t)indx; n < (db_indx_t)(NUM_ENT(p)); n++) + p->inp[n] = p->inp[n + 2] + delta; + +} diff --git a/bdb/hash/hash_rec.c b/bdb/hash/hash_rec.c new file mode 100644 index 00000000000..ded58c281e9 --- /dev/null +++ b/bdb/hash/hash_rec.c @@ -0,0 +1,1078 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1995, 1996 + * Margo Seltzer. All rights reserved. + */ +/* + * Copyright (c) 1995, 1996 + * The President and Fellows of Harvard University. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Margo Seltzer. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: hash_rec.c,v 11.34 2001/01/11 18:19:52 bostic Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_shash.h" +#include "btree.h" +#include "hash.h" +#include "lock.h" +#include "log.h" +#include "mp.h" + +static int __ham_alloc_pages __P((DB *, __ham_groupalloc_args *)); + +/* + * __ham_insdel_recover -- + * + * PUBLIC: int __ham_insdel_recover + * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__ham_insdel_recover(dbenv, dbtp, lsnp, op, info) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __ham_insdel_args *argp; + DB *file_dbp; + DBC *dbc; + DB_MPOOLFILE *mpf; + PAGE *pagep; + u_int32_t opcode; + int cmp_n, cmp_p, flags, getmeta, ret, type; + + COMPQUIET(info, NULL); + + getmeta = 0; + REC_PRINT(__ham_insdel_print); + REC_INTRO(__ham_insdel_read, 1); + + if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + if (DB_UNDO(op)) { + /* + * We are undoing and the page doesn't exist. That + * is equivalent to having a pagelsn of 0, so we + * would not have to undo anything. In this case, + * don't bother creating a page. 
+ */ + goto done; + } else if ((ret = memp_fget(mpf, &argp->pgno, + DB_MPOOL_CREATE, &pagep)) != 0) + goto out; + } + + if ((ret = __ham_get_meta(dbc)) != 0) + goto out; + getmeta = 1; + + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->pagelsn); + CHECK_LSN(op, cmp_p, &LSN(pagep), &argp->pagelsn); + /* + * Two possible things going on: + * redo a delete/undo a put: delete the item from the page. + * redo a put/undo a delete: add the item to the page. + * If we are undoing a delete, then the information logged is the + * entire entry off the page, not just the data of a dbt. In + * this case, we want to copy it back onto the page verbatim. + * We do this by calling __putitem with the type H_OFFPAGE instead + * of H_KEYDATA. + */ + opcode = OPCODE_OF(argp->opcode); + + flags = 0; + if ((opcode == DELPAIR && cmp_n == 0 && DB_UNDO(op)) || + (opcode == PUTPAIR && cmp_p == 0 && DB_REDO(op))) { + /* + * Need to redo a PUT or undo a delete. If we are undoing a + * delete, we've got to restore the item back to its original + * position. That's a royal pain in the butt (because we do + * not store item lengths on the page), but there's no choice. + */ + if (opcode != DELPAIR || + argp->ndx == (u_int32_t)NUM_ENT(pagep)) { + __ham_putitem(pagep, &argp->key, + DB_UNDO(op) || PAIR_ISKEYBIG(argp->opcode) ? + H_OFFPAGE : H_KEYDATA); + + if (PAIR_ISDATADUP(argp->opcode)) + type = H_DUPLICATE; + else if (DB_UNDO(op) || PAIR_ISDATABIG(argp->opcode)) + type = H_OFFPAGE; + else + type = H_KEYDATA; + __ham_putitem(pagep, &argp->data, type); + } else + (void)__ham_reputpair(pagep, file_dbp->pgsize, + argp->ndx, &argp->key, &argp->data); + + LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn; + flags = DB_MPOOL_DIRTY; + + } else if ((opcode == DELPAIR && cmp_p == 0 && DB_REDO(op)) + || (opcode == PUTPAIR && cmp_n == 0 && DB_UNDO(op))) { + /* Need to undo a put or redo a delete. */ + __ham_dpair(file_dbp, pagep, argp->ndx); + LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn; + flags = DB_MPOOL_DIRTY; + } + + if ((ret = memp_fput(file_dbp->mpf, pagep, flags)) != 0) + goto out; + + /* Return the previous LSN. */ +done: *lsnp = argp->prev_lsn; + ret = 0; + +out: if (getmeta) + (void)__ham_release_meta(dbc); + REC_CLOSE; +} + +/* + * __ham_newpage_recover -- + * This log message is used when we add/remove overflow pages. This + * message takes care of the pointer chains, not the data on the pages. + * + * PUBLIC: int __ham_newpage_recover + * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__ham_newpage_recover(dbenv, dbtp, lsnp, op, info) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __ham_newpage_args *argp; + DB *file_dbp; + DBC *dbc; + DB_MPOOLFILE *mpf; + PAGE *pagep; + int cmp_n, cmp_p, flags, getmeta, ret; + + COMPQUIET(info, NULL); + + getmeta = 0; + REC_PRINT(__ham_newpage_print); + REC_INTRO(__ham_newpage_read, 1); + + if ((ret = memp_fget(mpf, &argp->new_pgno, 0, &pagep)) != 0) { + if (DB_UNDO(op)) { + /* + * We are undoing and the page doesn't exist. That + * is equivalent to having a pagelsn of 0, so we + * would not have to undo anything. In this case, + * don't bother creating a page. 
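+ *
+ * [Editorial sketch -- the helper and its callbacks are made up;
+ * only the types and macros are real.]  Every recovery function in
+ * this file follows the same compare-and-apply pattern around
+ * cmp_p and cmp_n:
+ *
+ *     static void
+ *     recover_page(PAGE *pagep, DB_LSN *lsnp, DB_LSN *pagelsn,
+ *         db_recops op, void (*redo)(PAGE *), void (*undo)(PAGE *))
+ *     {
+ *             if (log_compare(&LSN(pagep), pagelsn) == 0 &&
+ *                 DB_REDO(op)) {
+ *                     redo(pagep);
+ *                     LSN(pagep) = *lsnp;
+ *             } else if (log_compare(lsnp, &LSN(pagep)) == 0 &&
+ *                 DB_UNDO(op)) {
+ *                     undo(pagep);
+ *                     LSN(pagep) = *pagelsn;
+ *             }
+ *     }
+ *
+ * If the page LSN still equals the pre-change LSN from the log
+ * record, a redo reapplies the change; if it equals the record's
+ * own LSN, an undo rolls it back; otherwise the page is already in
+ * the right state and only the missing-page cases above apply.
+ *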
+ */ + ret = 0; + goto ppage; + } else if ((ret = memp_fget(mpf, &argp->new_pgno, + DB_MPOOL_CREATE, &pagep)) != 0) + goto out; + } + + if ((ret = __ham_get_meta(dbc)) != 0) + goto out; + getmeta = 1; + + /* + * There are potentially three pages we need to check: the one + * that we created/deleted, the one before it and the one after + * it. + */ + + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->pagelsn); + CHECK_LSN(op, cmp_p, &LSN(pagep), &argp->pagelsn); + + flags = 0; + if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == PUTOVFL) || + (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DELOVFL)) { + /* Redo a create new page or undo a delete new page. */ + P_INIT(pagep, file_dbp->pgsize, argp->new_pgno, + argp->prev_pgno, argp->next_pgno, 0, P_HASH); + flags = DB_MPOOL_DIRTY; + } else if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == DELOVFL) || + (cmp_n == 0 && DB_UNDO(op) && argp->opcode == PUTOVFL)) { + /* + * Redo a delete or undo a create new page. All we + * really need to do is change the LSN. + */ + flags = DB_MPOOL_DIRTY; + } + + if (flags) + LSN(pagep) = DB_REDO(op) ? *lsnp : argp->pagelsn; + + if ((ret = memp_fput(file_dbp->mpf, pagep, flags)) != 0) + goto out; + + /* Now do the prev page. */ +ppage: if (argp->prev_pgno != PGNO_INVALID) { + if ((ret = memp_fget(mpf, &argp->prev_pgno, 0, &pagep)) != 0) { + if (DB_UNDO(op)) { + /* + * We are undoing and the page doesn't exist. + * That is equivalent to having a pagelsn of 0, + * so we would not have to undo anything. In + * this case, don't bother creating a page. + */ + ret = 0; + goto npage; + } else if ((ret = + memp_fget(mpf, &argp->prev_pgno, + DB_MPOOL_CREATE, &pagep)) != 0) + goto out; + } + + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->prevlsn); + CHECK_LSN(op, cmp_p, &LSN(pagep), &argp->prevlsn); + flags = 0; + + if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == PUTOVFL) || + (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DELOVFL)) { + /* Redo a create new page or undo a delete new page. */ + pagep->next_pgno = argp->new_pgno; + flags = DB_MPOOL_DIRTY; + } else if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == DELOVFL) || + (cmp_n == 0 && DB_UNDO(op) && argp->opcode == PUTOVFL)) { + /* Redo a delete or undo a create new page. */ + pagep->next_pgno = argp->next_pgno; + flags = DB_MPOOL_DIRTY; + } + + if (flags) + LSN(pagep) = DB_REDO(op) ? *lsnp : argp->prevlsn; + + if ((ret = memp_fput(file_dbp->mpf, pagep, flags)) != 0) + goto out; + } + + /* Now time to do the next page */ +npage: if (argp->next_pgno != PGNO_INVALID) { + if ((ret = memp_fget(mpf, &argp->next_pgno, 0, &pagep)) != 0) { + if (DB_UNDO(op)) { + /* + * We are undoing and the page doesn't exist. + * That is equivalent to having a pagelsn of 0, + * so we would not have to undo anything. In + * this case, don't bother creating a page. + */ + goto done; + } else if ((ret = + memp_fget(mpf, &argp->next_pgno, + DB_MPOOL_CREATE, &pagep)) != 0) + goto out; + } + + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->nextlsn); + CHECK_LSN(op, cmp_p, &LSN(pagep), &argp->nextlsn); + flags = 0; + + if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == PUTOVFL) || + (cmp_n == 0 && DB_UNDO(op) && argp->opcode == DELOVFL)) { + /* Redo a create new page or undo a delete new page. 
*/ + pagep->prev_pgno = argp->new_pgno; + flags = DB_MPOOL_DIRTY; + } else if ((cmp_p == 0 && DB_REDO(op) && argp->opcode == DELOVFL) || + (cmp_n == 0 && DB_UNDO(op) && argp->opcode == PUTOVFL)) { + /* Redo a delete or undo a create new page. */ + pagep->prev_pgno = argp->prev_pgno; + flags = DB_MPOOL_DIRTY; + } + + if (flags) + LSN(pagep) = DB_REDO(op) ? *lsnp : argp->nextlsn; + + if ((ret = memp_fput(file_dbp->mpf, pagep, flags)) != 0) + goto out; + } +done: *lsnp = argp->prev_lsn; + ret = 0; + +out: if (getmeta) + (void)__ham_release_meta(dbc); + REC_CLOSE; +} + +/* + * __ham_replace_recover -- + * This log message refers to partial puts that are local to a single + * page. You can think of them as special cases of the more general + * insdel log message. + * + * PUBLIC: int __ham_replace_recover + * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__ham_replace_recover(dbenv, dbtp, lsnp, op, info) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __ham_replace_args *argp; + DB *file_dbp; + DBC *dbc; + DB_MPOOLFILE *mpf; + DBT dbt; + PAGE *pagep; + int32_t grow; + int cmp_n, cmp_p, flags, getmeta, ret; + u_int8_t *hk; + + COMPQUIET(info, NULL); + + getmeta = 0; + REC_PRINT(__ham_replace_print); + REC_INTRO(__ham_replace_read, 1); + + if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + if (DB_UNDO(op)) { + /* + * We are undoing and the page doesn't exist. That + * is equivalent to having a pagelsn of 0, so we + * would not have to undo anything. In this case, + * don't bother creating a page. + */ + goto done; + } else if ((ret = memp_fget(mpf, &argp->pgno, + DB_MPOOL_CREATE, &pagep)) != 0) + goto out; + } + + if ((ret = __ham_get_meta(dbc)) != 0) + goto out; + getmeta = 1; + + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->pagelsn); + CHECK_LSN(op, cmp_p, &LSN(pagep), &argp->pagelsn); + + memset(&dbt, 0, sizeof(dbt)); + flags = 0; + grow = 1; + + if (cmp_p == 0 && DB_REDO(op)) { + /* Reapply the change as specified. */ + dbt.data = argp->newitem.data; + dbt.size = argp->newitem.size; + grow = argp->newitem.size - argp->olditem.size; + LSN(pagep) = *lsnp; + flags = DB_MPOOL_DIRTY; + } else if (cmp_n == 0 && DB_UNDO(op)) { + /* Undo the already applied change. 
*/ + dbt.data = argp->olditem.data; + dbt.size = argp->olditem.size; + grow = argp->olditem.size - argp->newitem.size; + LSN(pagep) = argp->pagelsn; + flags = DB_MPOOL_DIRTY; + } + + if (flags) { + __ham_onpage_replace(pagep, + file_dbp->pgsize, argp->ndx, argp->off, grow, &dbt); + if (argp->makedup) { + hk = P_ENTRY(pagep, argp->ndx); + if (DB_REDO(op)) + HPAGE_PTYPE(hk) = H_DUPLICATE; + else + HPAGE_PTYPE(hk) = H_KEYDATA; + } + } + + if ((ret = memp_fput(file_dbp->mpf, pagep, flags)) != 0) + goto out; + +done: *lsnp = argp->prev_lsn; + ret = 0; + +out: if (getmeta) + (void)__ham_release_meta(dbc); + REC_CLOSE; +} + +/* + * __ham_splitdata_recover -- + * + * PUBLIC: int __ham_splitdata_recover + * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__ham_splitdata_recover(dbenv, dbtp, lsnp, op, info) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __ham_splitdata_args *argp; + DB *file_dbp; + DBC *dbc; + DB_MPOOLFILE *mpf; + PAGE *pagep; + int cmp_n, cmp_p, flags, getmeta, ret; + + COMPQUIET(info, NULL); + + getmeta = 0; + REC_PRINT(__ham_splitdata_print); + REC_INTRO(__ham_splitdata_read, 1); + + if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + if (DB_UNDO(op)) { + /* + * We are undoing and the page doesn't exist. That + * is equivalent to having a pagelsn of 0, so we + * would not have to undo anything. In this case, + * don't bother creating a page. + */ + goto done; + } else if ((ret = memp_fget(mpf, &argp->pgno, + DB_MPOOL_CREATE, &pagep)) != 0) + goto out; + } + + if ((ret = __ham_get_meta(dbc)) != 0) + goto out; + getmeta = 1; + + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->pagelsn); + CHECK_LSN(op, cmp_p, &LSN(pagep), &argp->pagelsn); + + /* + * There are two types of log messages here, one for the old page + * and one for the new pages created. The original image in the + * SPLITOLD record is used for undo. The image in the SPLITNEW + * is used for redo. We should never have a case where there is + * a redo operation and the SPLITOLD record is on disk, but not + * the SPLITNEW record. Therefore, we only have work to do when + * redo NEW messages and undo OLD messages, but we have to update + * LSNs in both cases. + */ + flags = 0; + if (cmp_p == 0 && DB_REDO(op)) { + if (argp->opcode == SPLITNEW) + /* Need to redo the split described. */ + memcpy(pagep, argp->pageimage.data, + argp->pageimage.size); + LSN(pagep) = *lsnp; + flags = DB_MPOOL_DIRTY; + } else if (cmp_n == 0 && DB_UNDO(op)) { + if (argp->opcode == SPLITOLD) { + /* Put back the old image. */ + memcpy(pagep, argp->pageimage.data, + argp->pageimage.size); + } else + P_INIT(pagep, file_dbp->pgsize, argp->pgno, + PGNO_INVALID, PGNO_INVALID, 0, P_HASH); + LSN(pagep) = argp->pagelsn; + flags = DB_MPOOL_DIRTY; + } + if ((ret = memp_fput(file_dbp->mpf, pagep, flags)) != 0) + goto out; + +done: *lsnp = argp->prev_lsn; + ret = 0; + +out: if (getmeta) + (void)__ham_release_meta(dbc); + REC_CLOSE; +} + +/* + * __ham_copypage_recover -- + * Recovery function for copypage. 
+ * + * PUBLIC: int __ham_copypage_recover + * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__ham_copypage_recover(dbenv, dbtp, lsnp, op, info) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __ham_copypage_args *argp; + DB *file_dbp; + DBC *dbc; + DB_MPOOLFILE *mpf; + PAGE *pagep; + int cmp_n, cmp_p, flags, getmeta, ret; + + COMPQUIET(info, NULL); + + getmeta = 0; + REC_PRINT(__ham_copypage_print); + REC_INTRO(__ham_copypage_read, 1); + + if ((ret = __ham_get_meta(dbc)) != 0) + goto out; + getmeta = 1; + flags = 0; + + /* This is the bucket page. */ + if ((ret = memp_fget(mpf, &argp->pgno, 0, &pagep)) != 0) { + if (DB_UNDO(op)) { + /* + * We are undoing and the page doesn't exist. That + * is equivalent to having a pagelsn of 0, so we + * would not have to undo anything. In this case, + * don't bother creating a page. + */ + ret = 0; + goto donext; + } else if ((ret = memp_fget(mpf, &argp->pgno, + DB_MPOOL_CREATE, &pagep)) != 0) + goto out; + } + + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->pagelsn); + CHECK_LSN(op, cmp_p, &LSN(pagep), &argp->pagelsn); + + if (cmp_p == 0 && DB_REDO(op)) { + /* Need to redo update described. */ + memcpy(pagep, argp->page.data, argp->page.size); + PGNO(pagep) = argp->pgno; + PREV_PGNO(pagep) = PGNO_INVALID; + LSN(pagep) = *lsnp; + flags = DB_MPOOL_DIRTY; + } else if (cmp_n == 0 && DB_UNDO(op)) { + /* Need to undo update described. */ + P_INIT(pagep, file_dbp->pgsize, argp->pgno, PGNO_INVALID, + argp->next_pgno, 0, P_HASH); + LSN(pagep) = argp->pagelsn; + flags = DB_MPOOL_DIRTY; + } + if ((ret = memp_fput(mpf, pagep, flags)) != 0) + goto out; + +donext: /* Now fix up the "next" page. */ + if ((ret = memp_fget(mpf, &argp->next_pgno, 0, &pagep)) != 0) { + if (DB_UNDO(op)) { + /* + * We are undoing and the page doesn't exist. That + * is equivalent to having a pagelsn of 0, so we + * would not have to undo anything. In this case, + * don't bother creating a page. + */ + ret = 0; + goto do_nn; + } else if ((ret = memp_fget(mpf, &argp->next_pgno, + DB_MPOOL_CREATE, &pagep)) != 0) + goto out; + } + + /* For REDO just update the LSN. For UNDO copy page back. */ + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->nextlsn); + CHECK_LSN(op, cmp_p, &LSN(pagep), &argp->nextlsn); + flags = 0; + if (cmp_p == 0 && DB_REDO(op)) { + LSN(pagep) = *lsnp; + flags = DB_MPOOL_DIRTY; + } else if (cmp_n == 0 && DB_UNDO(op)) { + /* Need to undo update described. */ + memcpy(pagep, argp->page.data, argp->page.size); + flags = DB_MPOOL_DIRTY; + } + if ((ret = memp_fput(mpf, pagep, flags)) != 0) + goto out; + + /* Now fix up the next's next page. */ +do_nn: if (argp->nnext_pgno == PGNO_INVALID) + goto done; + + if ((ret = memp_fget(mpf, &argp->nnext_pgno, 0, &pagep)) != 0) { + if (DB_UNDO(op)) { + /* + * We are undoing and the page doesn't exist. That + * is equivalent to having a pagelsn of 0, so we + * would not have to undo anything. In this case, + * don't bother creating a page. + */ + goto done; + } else if ((ret = memp_fget(mpf, &argp->nnext_pgno, + DB_MPOOL_CREATE, &pagep)) != 0) + goto out; + } + + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->nnextlsn); + CHECK_LSN(op, cmp_p, &LSN(pagep), &argp->nnextlsn); + + flags = 0; + if (cmp_p == 0 && DB_REDO(op)) { + /* Need to redo update described. 
*/ + PREV_PGNO(pagep) = argp->pgno; + LSN(pagep) = *lsnp; + flags = DB_MPOOL_DIRTY; + } else if (cmp_n == 0 && DB_UNDO(op)) { + /* Need to undo update described. */ + PREV_PGNO(pagep) = argp->next_pgno; + LSN(pagep) = argp->nnextlsn; + flags = DB_MPOOL_DIRTY; + } + if ((ret = memp_fput(mpf, pagep, flags)) != 0) + goto out; + +done: *lsnp = argp->prev_lsn; + ret = 0; + +out: if (getmeta) + (void)__ham_release_meta(dbc); + REC_CLOSE; +} + +/* + * __ham_metagroup_recover -- + * Recovery function for metagroup. + * + * PUBLIC: int __ham_metagroup_recover + * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__ham_metagroup_recover(dbenv, dbtp, lsnp, op, info) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __ham_metagroup_args *argp; + HASH_CURSOR *hcp; + DB *file_dbp; + DBC *dbc; + DB_MPOOLFILE *mpf; + PAGE *pagep; + db_pgno_t last_pgno; + int cmp_n, cmp_p, flags, groupgrow, ret; + + COMPQUIET(info, NULL); + REC_PRINT(__ham_metagroup_print); + REC_INTRO(__ham_metagroup_read, 1); + + /* + * This logs the virtual create of pages pgno to pgno + bucket + * Since the mpool page-allocation is not really able to be + * transaction protected, we can never undo it. Even in an abort, + * we have to allocate these pages to the hash table. + * The log record contains: + * bucket: new bucket being allocated. + * pgno: page number of the new bucket. + * if bucket is a power of 2, then we allocated a whole batch of + * pages; if it's not, then we simply allocated one new page. + */ + groupgrow = + (u_int32_t)(1 << __db_log2(argp->bucket + 1)) == argp->bucket + 1; + + last_pgno = argp->pgno; + if (groupgrow) + /* Read the last page. */ + last_pgno += argp->bucket; + + if ((ret = memp_fget(mpf, &last_pgno, DB_MPOOL_CREATE, &pagep)) != 0) + goto out; + + cmp_n = log_compare(lsnp, &LSN(pagep)); + cmp_p = log_compare(&LSN(pagep), &argp->pagelsn); + CHECK_LSN(op, cmp_p, &LSN(pagep), &argp->pagelsn); + + flags = 0; + if ((cmp_p == 0 && DB_REDO(op)) || (cmp_n == 0 && DB_UNDO(op))) { + /* + * We need to make sure that we redo the allocation of the + * pages. + */ + if (DB_REDO(op)) + pagep->lsn = *lsnp; + else + pagep->lsn = argp->pagelsn; + flags = DB_MPOOL_DIRTY; + } + if ((ret = memp_fput(mpf, pagep, flags)) != 0) + goto out; + + /* Now we have to update the meta-data page. */ + hcp = (HASH_CURSOR *)dbc->internal; + if ((ret = __ham_get_meta(dbc)) != 0) + goto out; + cmp_n = log_compare(lsnp, &hcp->hdr->dbmeta.lsn); + cmp_p = log_compare(&hcp->hdr->dbmeta.lsn, &argp->metalsn); + CHECK_LSN(op, cmp_p, &hcp->hdr->dbmeta.lsn, &argp->metalsn); + if ((cmp_p == 0 && DB_REDO(op)) || (cmp_n == 0 && DB_UNDO(op))) { + if (DB_REDO(op)) { + /* Redo the actual updating of bucket counts. */ + ++hcp->hdr->max_bucket; + if (groupgrow) { + hcp->hdr->low_mask = hcp->hdr->high_mask; + hcp->hdr->high_mask = + (argp->bucket + 1) | hcp->hdr->low_mask; + } + hcp->hdr->dbmeta.lsn = *lsnp; + } else { + /* Undo the actual updating of bucket counts. 
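+ *
+ * [Editorial illustration with hypothetical values, reading
+ * argp->bucket as the pre-split maximum bucket, so bucket + 1 is
+ * the first bucket of the new doubling.]  Growing a table whose
+ * buckets are 0..3 (low_mask 0x1, high_mask 0x3) into bucket 4
+ * starts a new doubling, so on redo:
+ *
+ *     max_bucket: 3 -> 4
+ *     low_mask:   0x1 -> old high_mask           = 0x3
+ *     high_mask:  0x3 -> (bucket + 1) | low_mask = 0x7
+ *
+ * and the undo below reverses it from the post-split state:
+ *
+ *     max_bucket: 4 -> 3
+ *     high_mask:  0x7 -> low_mask       = 0x3
+ *     low_mask:   0x3 -> high_mask >> 1 = 0x1
+ *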
*/ + --hcp->hdr->max_bucket; + if (groupgrow) { + hcp->hdr->high_mask = hcp->hdr->low_mask; + hcp->hdr->low_mask = hcp->hdr->high_mask >> 1; + } + hcp->hdr->dbmeta.lsn = argp->metalsn; + } + if (groupgrow && + hcp->hdr->spares[__db_log2(argp->bucket + 1) + 1] == + PGNO_INVALID) + hcp->hdr->spares[__db_log2(argp->bucket + 1) + 1] = + argp->pgno - argp->bucket - 1; + F_SET(hcp, H_DIRTY); + } + if ((ret = __ham_release_meta(dbc)) != 0) + goto out; + +done: *lsnp = argp->prev_lsn; + ret = 0; + +out: REC_CLOSE; +} + +/* + * __ham_groupalloc_recover -- + * Recover the batch creation of a set of pages for a new database. + * + * PUBLIC: int __ham_groupalloc_recover + * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ +int +__ham_groupalloc_recover(dbenv, dbtp, lsnp, op, info) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __ham_groupalloc_args *argp; + DBMETA *mmeta; + DB_MPOOLFILE *mpf; + DB *file_dbp; + DBC *dbc; + db_pgno_t pgno; + int cmp_n, cmp_p, flags, ret; + + REC_PRINT(__ham_groupalloc_print); + REC_INTRO(__ham_groupalloc_read, 0); + + pgno = PGNO_BASE_MD; + if ((ret = memp_fget(mpf, &pgno, 0, &mmeta)) != 0) { + if (DB_REDO(op)) { + /* Page should have existed. */ + (void)__db_pgerr(file_dbp, pgno); + goto out; + } else { + ret = 0; + goto done; + } + } + + cmp_n = log_compare(lsnp, &LSN(mmeta)); + cmp_p = log_compare(&LSN(mmeta), &argp->meta_lsn); + CHECK_LSN(op, cmp_p, &LSN(mmeta), &argp->meta_lsn); + + /* + * Basically, we used mpool to allocate a chunk of pages. + * We need to either add those to a free list (in the undo + * case) or initialize them (in the redo case). + * + * If we are redoing and this is a hash subdatabase, it's possible + * that the pages were never allocated, so we'd better check for + * that and handle it here. + */ + + flags = 0; + if (DB_REDO(op)) { + if ((ret = __ham_alloc_pages(file_dbp, argp)) != 0) + goto out1; + if (cmp_p == 0) { + LSN(mmeta) = *lsnp; + flags = DB_MPOOL_DIRTY; + } + } + + /* + * Always put the pages into the limbo list and free them later. + */ + else if (DB_UNDO(op)) { + if ((ret = __db_add_limbo(dbenv, + info, argp->fileid, argp->start_pgno, argp->num)) != 0) + goto out; + if (cmp_n == 0) { + LSN(mmeta) = argp->meta_lsn; + flags = DB_MPOOL_DIRTY; + } + } + +out1: if ((ret = memp_fput(mpf, mmeta, flags)) != 0) + goto out; + +done: if (ret == 0) + *lsnp = argp->prev_lsn; + +out: REC_CLOSE; +} + +/* + * __ham_alloc_pages -- + * + * Called during redo of a file create. We create new pages in the file + * using the MPOOL_NEW_GROUP flag. We then log the meta-data page with a + * __crdel_metasub message. If we manage to crash without the newly written + * pages getting to disk (I'm not sure this can happen anywhere except our + * test suite?!), then we need to go through a recreate the final pages. + * Hash normally has holes in its files and handles them appropriately. + */ +static int +__ham_alloc_pages(dbp, argp) + DB *dbp; + __ham_groupalloc_args *argp; +{ + DB_MPOOLFILE *mpf; + PAGE *pagep; + db_pgno_t pgno; + int ret; + + mpf = dbp->mpf; + + /* Read the last page of the allocation. */ + pgno = argp->start_pgno + argp->num - 1; + + /* If the page exists, and it has been initialized, then we're done. */ + if ((ret = memp_fget(mpf, &pgno, 0, &pagep)) == 0) { + if ((pagep->type == P_INVALID) && IS_ZERO_LSN(pagep->lsn)) + goto reinit_page; + if ((ret = memp_fput(mpf, pagep, 0)) != 0) + return (ret); + return (0); + } + + /* + * Had to create the page. 
On some systems (read "Windows"), + * you can find random garbage on pages to which you haven't + * yet written. So, we have an os layer that will do the + * right thing for group allocations. We call that directly + * to make sure all the pages are allocated and then continue + * merrily on our way with normal recovery. + */ + if ((ret = __os_fpinit(dbp->dbenv, &mpf->fh, + argp->start_pgno, argp->num, dbp->pgsize)) != 0) + return (ret); + + if ((ret = memp_fget(mpf, &pgno, DB_MPOOL_CREATE, &pagep)) != 0) { + (void)__db_pgerr(dbp, pgno); + return (ret); + } + +reinit_page: + /* Initialize the newly allocated page. */ + P_INIT(pagep, + dbp->pgsize, pgno, PGNO_INVALID, PGNO_INVALID, 0, P_HASH); + ZERO_LSN(pagep->lsn); + + if ((ret = memp_fput(mpf, pagep, DB_MPOOL_DIRTY)) != 0) + return (ret); + + return (0); +} + +/* + * __ham_curadj_recover -- + * Undo cursor adjustments if a subtransaction fails. + * + * PUBLIC: int __ham_curadj_recover + * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ + +int +__ham_curadj_recover(dbenv, dbtp, lsnp, op, info) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __ham_curadj_args *argp; + DB_MPOOLFILE *mpf; + DB *file_dbp; + DBC *dbc; + int ret; + HASH_CURSOR *hcp; + + REC_PRINT(__ham_groupalloc_print); + + ret = 0; + if (op != DB_TXN_ABORT) + goto done; + REC_INTRO(__ham_curadj_read, 0); + + COMPQUIET(info, NULL); + /* + * Undo the adjustment by reinitializing the the cursor + * to look like the one that was used to do the adustment, + * then we invert the add so that undo the adjustment. + */ + hcp = (HASH_CURSOR *)dbc->internal; + hcp->pgno = argp->pgno; + hcp->indx = argp->indx; + hcp->dup_off = argp->dup_off; + hcp->order = argp->order; + if (!argp->add) + F_SET(hcp, H_DELETED); + (void)__ham_c_update(dbc, argp->len, !argp->add, argp->is_dup); + +done: *lsnp = argp->prev_lsn; +out: REC_CLOSE; +} + +/* + * __ham_chgpg_recover -- + * Undo cursor adjustments if a subtransaction fails. 
+ * + * PUBLIC: int __ham_chgpg_recover + * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); + */ + +int +__ham_chgpg_recover(dbenv, dbtp, lsnp, op, info) + DB_ENV *dbenv; + DBT *dbtp; + DB_LSN *lsnp; + db_recops op; + void *info; +{ + __ham_chgpg_args *argp; + BTREE_CURSOR *opdcp; + DB_MPOOLFILE *mpf; + DB *file_dbp, *ldbp; + DBC *dbc; + int ret; + DBC *cp; + HASH_CURSOR *lcp; + + REC_PRINT(__ham_chgpg_print); + + ret = 0; + if (op != DB_TXN_ABORT) + goto out; + REC_INTRO(__ham_chgpg_read, 0); + + COMPQUIET(info, NULL); + + MUTEX_THREAD_LOCK(dbenv, dbenv->dblist_mutexp); + for (ldbp = __dblist_get(dbenv, file_dbp->adj_fileid); + ldbp != NULL && ldbp->adj_fileid == file_dbp->adj_fileid; + ldbp = LIST_NEXT(ldbp, dblistlinks)) { + MUTEX_THREAD_LOCK(dbenv, file_dbp->mutexp); + + for (cp = TAILQ_FIRST(&ldbp->active_queue); cp != NULL; + cp = TAILQ_NEXT(cp, links)) { + lcp = (HASH_CURSOR *)cp->internal; + + switch (argp->mode) { + case DB_HAM_CHGPG: + if (lcp->pgno != argp->new_pgno) + break; + + if (argp->old_indx == NDX_INVALID) + lcp->pgno = argp->old_pgno; + else if (lcp->indx == argp->new_indx) { + lcp->indx = argp->old_indx; + lcp->pgno = argp->old_pgno; + } + break; + + case DB_HAM_SPLIT: + if (lcp->pgno == argp->new_pgno + && lcp->indx == argp->new_indx) { + lcp->indx = argp->old_indx; + lcp->pgno = argp->old_pgno; + } + break; + + case DB_HAM_DUP: + if (lcp->opd != NULL) { + opdcp = + (BTREE_CURSOR *)lcp->opd->internal; + if (opdcp->pgno == argp->new_pgno && + opdcp->indx == argp->new_indx) { + if (F_ISSET(opdcp, C_DELETED)) + F_SET(lcp, H_DELETED); + if ((ret = + lcp->opd->c_close( + lcp->opd)) != 0) + goto out; + lcp->opd = NULL; + } + } + break; + } + } + + MUTEX_THREAD_UNLOCK(dbenv, file_dbp->mutexp); + } + MUTEX_THREAD_UNLOCK(dbenv, dbenv->dblist_mutexp); + +done: *lsnp = argp->prev_lsn; + ret = 0; +out: REC_CLOSE; +} diff --git a/bdb/hash/hash_reclaim.c b/bdb/hash/hash_reclaim.c new file mode 100644 index 00000000000..8857c5406a4 --- /dev/null +++ b/bdb/hash/hash_reclaim.c @@ -0,0 +1,68 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: hash_reclaim.c,v 11.4 2000/11/30 00:58:37 ubell Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_shash.h" +#include "hash.h" +#include "lock.h" + +/* + * __ham_reclaim -- + * Reclaim the pages from a subdatabase and return them to the + * parent free list. For now, we link each freed page on the list + * separately. If people really store hash databases in subdatabases + * and do a lot of creates and deletes, this is going to be a problem, + * because hash needs chunks of contiguous storage. We may eventually + * need to go to a model where we maintain the free list with chunks of + * contiguous pages as well. + * + * PUBLIC: int __ham_reclaim __P((DB *, DB_TXN *txn)); + */ +int +__ham_reclaim(dbp, txn) + DB *dbp; + DB_TXN *txn; +{ + DBC *dbc; + HASH_CURSOR *hcp; + int ret; + + /* Open up a cursor that we'll use for traversing. 
*/ + if ((ret = dbp->cursor(dbp, txn, &dbc, 0)) != 0) + return (ret); + hcp = (HASH_CURSOR *)dbc->internal; + + if ((ret = __ham_get_meta(dbc)) != 0) + goto err; + + if ((ret = __ham_traverse(dbp, + dbc, DB_LOCK_WRITE, __db_reclaim_callback, dbc)) != 0) + goto err; + if ((ret = dbc->c_close(dbc)) != 0) + goto err; + if ((ret = __ham_release_meta(dbc)) != 0) + goto err; + return (0); + +err: if (hcp->hdr != NULL) + (void)__ham_release_meta(dbc); + (void)dbc->c_close(dbc); + return (ret); +} diff --git a/bdb/hash/hash_stat.c b/bdb/hash/hash_stat.c new file mode 100644 index 00000000000..ed64bbc68bd --- /dev/null +++ b/bdb/hash/hash_stat.c @@ -0,0 +1,329 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: hash_stat.c,v 11.24 2000/12/21 21:54:35 margo Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_shash.h" +#include "btree.h" +#include "hash.h" +#include "lock.h" + +static int __ham_stat_callback __P((DB *, PAGE *, void *, int *)); + +/* + * __ham_stat -- + * Gather/print the hash statistics + * + * PUBLIC: int __ham_stat __P((DB *, void *, void *(*)(size_t), u_int32_t)); + */ +int +__ham_stat(dbp, spp, db_malloc, flags) + DB *dbp; + void *spp, *(*db_malloc) __P((size_t)); + u_int32_t flags; +{ + DB_HASH_STAT *sp; + HASH_CURSOR *hcp; + DBC *dbc; + PAGE *h; + db_pgno_t pgno; + int ret; + + PANIC_CHECK(dbp->dbenv); + DB_ILLEGAL_BEFORE_OPEN(dbp, "DB->stat"); + + sp = NULL; + + /* Check for invalid flags. */ + if ((ret = __db_statchk(dbp, flags)) != 0) + return (ret); + + if ((ret = dbp->cursor(dbp, NULL, &dbc, 0)) != 0) + return (ret); + hcp = (HASH_CURSOR *)dbc->internal; + + if ((ret = __ham_get_meta(dbc)) != 0) + goto err; + + /* Allocate and clear the structure. */ + if ((ret = __os_malloc(dbp->dbenv, sizeof(*sp), db_malloc, &sp)) != 0) + goto err; + memset(sp, 0, sizeof(*sp)); + if (flags == DB_CACHED_COUNTS) { + sp->hash_nkeys = hcp->hdr->dbmeta.key_count; + sp->hash_ndata = hcp->hdr->dbmeta.record_count; + goto done; + } + + /* Copy the fields that we have. */ + sp->hash_pagesize = dbp->pgsize; + sp->hash_buckets = hcp->hdr->max_bucket + 1; + sp->hash_magic = hcp->hdr->dbmeta.magic; + sp->hash_version = hcp->hdr->dbmeta.version; + sp->hash_metaflags = hcp->hdr->dbmeta.flags; + sp->hash_nelem = hcp->hdr->nelem; + sp->hash_ffactor = hcp->hdr->ffactor; + + /* Walk the free list, counting pages. */ + for (sp->hash_free = 0, pgno = hcp->hdr->dbmeta.free; + pgno != PGNO_INVALID;) { + ++sp->hash_free; + + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) + goto err; + + pgno = h->next_pgno; + (void)memp_fput(dbp->mpf, h, 0); + } + + /* Now traverse the rest of the table. 
*/ + if ((ret = __ham_traverse(dbp, + dbc, DB_LOCK_READ, __ham_stat_callback, sp)) != 0) + goto err; + + if (!F_ISSET(dbp, DB_AM_RDONLY)) { + if ((ret = __ham_dirty_meta(dbc)) != 0) + goto err; + hcp->hdr->dbmeta.key_count = sp->hash_nkeys; + hcp->hdr->dbmeta.record_count = sp->hash_ndata; + } + +done: + if ((ret = __ham_release_meta(dbc)) != 0) + goto err; + if ((ret = dbc->c_close(dbc)) != 0) + goto err; + + *(DB_HASH_STAT **)spp = sp; + return (0); + +err: if (sp != NULL) + __os_free(sp, sizeof(*sp)); + if (hcp->hdr != NULL) + (void)__ham_release_meta(dbc); + (void)dbc->c_close(dbc); + return (ret); + +} + +/* + * __ham_traverse + * Traverse an entire hash table. We use the callback so that we + * can use this both for stat collection and for deallocation. + * + * PUBLIC: int __ham_traverse __P((DB *, DBC *, db_lockmode_t, + * PUBLIC: int (*)(DB *, PAGE *, void *, int *), void *)); + */ +int +__ham_traverse(dbp, dbc, mode, callback, cookie) + DB *dbp; + DBC *dbc; + db_lockmode_t mode; + int (*callback) __P((DB *, PAGE *, void *, int *)); + void *cookie; +{ + HASH_CURSOR *hcp; + HKEYDATA *hk; + DBC *opd; + db_pgno_t pgno, opgno; + u_int32_t bucket; + int did_put, i, ret, t_ret; + + hcp = (HASH_CURSOR *)dbc->internal; + opd = NULL; + ret = 0; + + /* + * In a perfect world, we could simply read each page in the file + * and look at its page type to tally the information necessary. + * Unfortunately, the bucket locking that hash tables do to make + * locking easy, makes this a pain in the butt. We have to traverse + * duplicate, overflow and big pages from the bucket so that we + * don't access anything that isn't properly locked. + */ + for (bucket = 0; bucket <= hcp->hdr->max_bucket; bucket++) { + hcp->bucket = bucket; + hcp->pgno = pgno = BUCKET_TO_PAGE(hcp, bucket); + for (ret = __ham_get_cpage(dbc, mode); ret == 0; + ret = __ham_next_cpage(dbc, pgno, 0)) { + pgno = NEXT_PGNO(hcp->page); + + /* + * Go through each item on the page checking for + * duplicates (in which case we have to count the + * duplicate pages) or big key/data items (in which + * case we have to count those pages). + */ + for (i = 0; i < NUM_ENT(hcp->page); i++) { + hk = (HKEYDATA *)P_ENTRY(hcp->page, i); + switch (HPAGE_PTYPE(hk)) { + case H_OFFDUP: + memcpy(&opgno, HOFFDUP_PGNO(hk), + sizeof(db_pgno_t)); + if ((ret = __db_c_newopd(dbc, + opgno, &opd)) != 0) + return (ret); + if ((ret = __bam_traverse(opd, + DB_LOCK_READ, opgno, + __ham_stat_callback, cookie)) + != 0) + goto err; + if ((ret = opd->c_close(opd)) != 0) + return (ret); + opd = NULL; + break; + case H_OFFPAGE: + /* + * We are about to get a big page + * which will use the same spot that + * the current page uses, so we need + * to restore the current page before + * looking at it again. + */ + memcpy(&opgno, HOFFPAGE_PGNO(hk), + sizeof(db_pgno_t)); + if ((ret = __db_traverse_big(dbp, + opgno, callback, cookie)) != 0) + goto err; + break; + case H_KEYDATA: + break; + } + } + + /* Call the callback on main pages. 
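+ *
+ * [Editorial sketch -- count_pages() is a made-up example, not part
+ * of this file.]  Callbacks passed to __ham_traverse all have the
+ * shape used here and by __ham_stat_callback and
+ * __db_reclaim_callback: they receive each page plus a caller
+ * cookie, and set *putp if they disposed of the page themselves.
+ *
+ *     static int
+ *     count_pages(DB *dbp, PAGE *pagep, void *cookie, int *putp)
+ *     {
+ *             COMPQUIET(dbp, NULL);
+ *             COMPQUIET(pagep, NULL);
+ *             ++*(u_int32_t *)cookie;
+ *             *putp = 0;
+ *             return (0);
+ *     }
+ *
+ * wired up, for example, as
+ *
+ *     u_int32_t count = 0;
+ *     ret = __ham_traverse(dbp, dbc,
+ *         DB_LOCK_READ, count_pages, &count);
+ *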
*/ + if ((ret = callback(dbp, + hcp->page, cookie, &did_put)) != 0) + goto err; + + if (did_put) + hcp->page = NULL; + if (pgno == PGNO_INVALID) + break; + } + if (ret != 0) + goto err; + + if (STD_LOCKING(dbc)) + (void)lock_put(dbp->dbenv, &hcp->lock); + + if (hcp->page != NULL) { + if ((ret = memp_fput(dbc->dbp->mpf, hcp->page, 0)) != 0) + return (ret); + hcp->page = NULL; + } + + } +err: if (opd != NULL && + (t_ret = opd->c_close(opd)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} + +static int +__ham_stat_callback(dbp, pagep, cookie, putp) + DB *dbp; + PAGE *pagep; + void *cookie; + int *putp; +{ + DB_HASH_STAT *sp; + DB_BTREE_STAT bstat; + db_indx_t indx, len, off, tlen, top; + u_int8_t *hk; + + *putp = 0; + sp = cookie; + + switch (pagep->type) { + case P_INVALID: + /* + * Hash pages may be wholly zeroed; this is not a bug. + * Obviously such pages have no data, so we can just proceed. + */ + break; + case P_HASH: + /* + * We count the buckets and the overflow pages + * separately and tally their bytes separately + * as well. We need to figure out if this page + * is a bucket. + */ + if (PREV_PGNO(pagep) == PGNO_INVALID) + sp->hash_bfree += P_FREESPACE(pagep); + else { + sp->hash_overflows++; + sp->hash_ovfl_free += P_FREESPACE(pagep); + } + top = NUM_ENT(pagep); + /* Correct for on-page duplicates and deleted items. */ + for (indx = 0; indx < top; indx += P_INDX) { + switch (*H_PAIRDATA(pagep, indx)) { + case H_OFFDUP: + case H_OFFPAGE: + break; + case H_KEYDATA: + sp->hash_ndata++; + break; + case H_DUPLICATE: + tlen = LEN_HDATA(pagep, 0, indx); + hk = H_PAIRDATA(pagep, indx); + for (off = 0; off < tlen; + off += len + 2 * sizeof (db_indx_t)) { + sp->hash_ndata++; + memcpy(&len, + HKEYDATA_DATA(hk) + + off, sizeof(db_indx_t)); + } + } + } + sp->hash_nkeys += H_NUMPAIRS(pagep); + break; + case P_IBTREE: + case P_IRECNO: + case P_LBTREE: + case P_LRECNO: + case P_LDUP: + /* + * These are all btree pages; get a correct + * cookie and call them. Then add appropriate + * fields into our stat structure. + */ + memset(&bstat, 0, sizeof(bstat)); + bstat.bt_dup_pgfree = 0; + bstat.bt_int_pgfree = 0; + bstat.bt_leaf_pgfree = 0; + bstat.bt_ndata = 0; + __bam_stat_callback(dbp, pagep, &bstat, putp); + sp->hash_dup++; + sp->hash_dup_free += bstat.bt_leaf_pgfree + + bstat.bt_dup_pgfree + bstat.bt_int_pgfree; + sp->hash_ndata += bstat.bt_ndata; + break; + case P_OVERFLOW: + sp->hash_bigpages++; + sp->hash_big_bfree += P_OVFLSPACE(dbp->pgsize, pagep); + break; + default: + return (__db_unknown_type(dbp->dbenv, + "__ham_stat_callback", pagep->type)); + } + + return (0); +} diff --git a/bdb/hash/hash_upgrade.c b/bdb/hash/hash_upgrade.c new file mode 100644 index 00000000000..c34381276b4 --- /dev/null +++ b/bdb/hash/hash_upgrade.c @@ -0,0 +1,271 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: hash_upgrade.c,v 11.25 2000/12/14 19:18:32 bostic Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <limits.h> +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_swap.h" +#include "hash.h" +#include "db_upgrade.h" + +/* + * __ham_30_hashmeta -- + * Upgrade the database from version 4/5 to version 6. 
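+ *	In outline, as implemented below: bump the meta version to 6,
+ *	record an explicit page type (P_HASHMETA), carry the free list over
+ *	under its new name, clamp a possibly-corrupt nelem left behind by
+ *	the 2.X bug, rewrite the spares array to its new meaning, and
+ *	replace the unique file ID.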
+ * + * PUBLIC: int __ham_30_hashmeta __P((DB *, char *, u_int8_t *)); + */ +int +__ham_30_hashmeta(dbp, real_name, obuf) + DB *dbp; + char *real_name; + u_int8_t *obuf; +{ + DB_ENV *dbenv; + HASHHDR *oldmeta; + HMETA30 newmeta; + u_int32_t *o_spares, *n_spares; + u_int32_t fillf, maxb, nelem; + int i, max_entry, ret; + + dbenv = dbp->dbenv; + memset(&newmeta, 0, sizeof(newmeta)); + + oldmeta = (HASHHDR *)obuf; + + /* + * The first 32 bytes are similar. The only change is the version + * and that we removed the ovfl_point and have the page type now. + */ + + newmeta.dbmeta.lsn = oldmeta->lsn; + newmeta.dbmeta.pgno = oldmeta->pgno; + newmeta.dbmeta.magic = oldmeta->magic; + newmeta.dbmeta.version = 6; + newmeta.dbmeta.pagesize = oldmeta->pagesize; + newmeta.dbmeta.type = P_HASHMETA; + + /* Move flags */ + newmeta.dbmeta.flags = oldmeta->flags; + + /* Copy the free list, which has changed its name but works the same. */ + newmeta.dbmeta.free = oldmeta->last_freed; + + /* Copy: max_bucket, high_mask, low-mask, ffactor, nelem, h_charkey */ + newmeta.max_bucket = oldmeta->max_bucket; + newmeta.high_mask = oldmeta->high_mask; + newmeta.low_mask = oldmeta->low_mask; + newmeta.ffactor = oldmeta->ffactor; + newmeta.nelem = oldmeta->nelem; + newmeta.h_charkey = oldmeta->h_charkey; + + /* + * There was a bug in 2.X versions where the nelem could go negative. + * In general, this is considered "bad." If it does go negative + * (that is, very large and positive), we'll die trying to dump and + * load this database. So, let's see if we can fix it here. + */ + nelem = newmeta.nelem; + fillf = newmeta.ffactor; + maxb = newmeta.max_bucket; + + if ((fillf != 0 && fillf * maxb < 2 * nelem) || + (fillf == 0 && nelem > 0x8000000)) + newmeta.nelem = 0; + + /* + * We now have to convert the spares array. The old spares array + * contained the total number of extra pages allocated prior to + * the bucket that begins the next doubling. The new spares array + * contains the page number of the first bucket in the next doubling + * MINUS the bucket number of that bucket. + */ + o_spares = oldmeta->spares; + n_spares = newmeta.spares; + max_entry = __db_log2(maxb + 1); /* highest spares entry in use */ + n_spares[0] = 1; + for (i = 1; i < NCACHED && i <= max_entry; i++) + n_spares[i] = 1 + o_spares[i - 1]; + + /* Replace the unique ID. */ + if ((ret = __os_fileid(dbenv, real_name, 1, newmeta.dbmeta.uid)) != 0) + return (ret); + + /* Overwrite the original. */ + memcpy(oldmeta, &newmeta, sizeof(newmeta)); + + return (0); +} + +/* + * __ham_30_sizefix -- + * Make sure that all hash pages belonging to the current + * hash doubling are within the bounds of the file. + * + * PUBLIC: int __ham_30_sizefix __P((DB *, DB_FH *, char *, u_int8_t *)); + */ +int +__ham_30_sizefix(dbp, fhp, realname, metabuf) + DB *dbp; + DB_FH *fhp; + char *realname; + u_int8_t *metabuf; +{ + u_int8_t buf[DB_MAX_PGSIZE]; + DB_ENV *dbenv; + HMETA30 *meta; + db_pgno_t last_actual, last_desired; + int ret; + size_t nw; + u_int32_t pagesize; + + dbenv = dbp->dbenv; + memset(buf, 0, DB_MAX_PGSIZE); + + meta = (HMETA30 *)metabuf; + pagesize = meta->dbmeta.pagesize; + + /* + * Get the last page number. To do this, we'll need dbp->pgsize + * to be set right, so slam it into place. + */ + dbp->pgsize = pagesize; + if ((ret = __db_lastpgno(dbp, realname, fhp, &last_actual)) != 0) + return (ret); + + /* + * The last bucket in the doubling is equal to high_mask; calculate + * the page number that implies. 
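+	 * BS_TO_PAGE turns a bucket number into a page number by adding the
+	 * spares offset for that bucket's doubling (the convention set up in
+	 * __ham_30_hashmeta above).  Purely illustrative numbers: with
+	 * high_mask 0x7 and a spares entry of 2 for that doubling, the last
+	 * bucket (7) would live on page 9, so the file must reach at least
+	 * that far.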
+ */ + last_desired = BS_TO_PAGE(meta->high_mask, meta->spares); + + /* + * If last_desired > last_actual, we need to grow the file. Write + * a zeroed page where last_desired would go. + */ + if (last_desired > last_actual) { + if ((ret = __os_seek(dbenv, + fhp, pagesize, last_desired, 0, 0, DB_OS_SEEK_SET)) != 0) + return (ret); + if ((ret = __os_write(dbenv, fhp, buf, pagesize, &nw)) != 0) + return (ret); + if (nw != pagesize) { + __db_err(dbenv, "Short write during upgrade"); + return (EIO); + } + } + + return (0); +} + +/* + * __ham_31_hashmeta -- + * Upgrade the database from version 6 to version 7. + * + * PUBLIC: int __ham_31_hashmeta + * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)); + */ +int +__ham_31_hashmeta(dbp, real_name, flags, fhp, h, dirtyp) + DB *dbp; + char *real_name; + u_int32_t flags; + DB_FH *fhp; + PAGE *h; + int *dirtyp; +{ + HMETA31 *newmeta; + HMETA30 *oldmeta; + + COMPQUIET(dbp, NULL); + COMPQUIET(real_name, NULL); + COMPQUIET(fhp, NULL); + + newmeta = (HMETA31 *)h; + oldmeta = (HMETA30 *)h; + + /* + * Copy the fields down the page. + * The fields may overlap so start at the bottom and use memmove(). + */ + memmove(newmeta->spares, oldmeta->spares, sizeof(oldmeta->spares)); + newmeta->h_charkey = oldmeta->h_charkey; + newmeta->nelem = oldmeta->nelem; + newmeta->ffactor = oldmeta->ffactor; + newmeta->low_mask = oldmeta->low_mask; + newmeta->high_mask = oldmeta->high_mask; + newmeta->max_bucket = oldmeta->max_bucket; + memmove(newmeta->dbmeta.uid, + oldmeta->dbmeta.uid, sizeof(oldmeta->dbmeta.uid)); + newmeta->dbmeta.flags = oldmeta->dbmeta.flags; + newmeta->dbmeta.record_count = 0; + newmeta->dbmeta.key_count = 0; + ZERO_LSN(newmeta->dbmeta.unused3); + + /* Update the version. */ + newmeta->dbmeta.version = 7; + + /* Upgrade the flags. */ + if (LF_ISSET(DB_DUPSORT)) + F_SET(&newmeta->dbmeta, DB_HASH_DUPSORT); + + *dirtyp = 1; + return (0); +} + +/* + * __ham_31_hash -- + * Upgrade the database hash leaf pages. + * + * PUBLIC: int __ham_31_hash + * PUBLIC: __P((DB *, char *, u_int32_t, DB_FH *, PAGE *, int *)); + */ +int +__ham_31_hash(dbp, real_name, flags, fhp, h, dirtyp) + DB *dbp; + char *real_name; + u_int32_t flags; + DB_FH *fhp; + PAGE *h; + int *dirtyp; +{ + HKEYDATA *hk; + db_pgno_t pgno, tpgno; + db_indx_t indx; + int ret; + + COMPQUIET(flags, 0); + + ret = 0; + for (indx = 0; indx < NUM_ENT(h); indx += 2) { + hk = (HKEYDATA *)H_PAIRDATA(h, indx); + if (HPAGE_PTYPE(hk) == H_OFFDUP) { + memcpy(&pgno, HOFFDUP_PGNO(hk), sizeof(db_pgno_t)); + tpgno = pgno; + if ((ret = __db_31_offdup(dbp, real_name, fhp, + LF_ISSET(DB_DUPSORT) ? 1 : 0, &tpgno)) != 0) + break; + if (pgno != tpgno) { + *dirtyp = 1; + memcpy(HOFFDUP_PGNO(hk), + &tpgno, sizeof(db_pgno_t)); + } + } + } + + return (ret); +} diff --git a/bdb/hash/hash_verify.c b/bdb/hash/hash_verify.c new file mode 100644 index 00000000000..31dd7cc2299 --- /dev/null +++ b/bdb/hash/hash_verify.c @@ -0,0 +1,1051 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1999, 2000 + * Sleepycat Software. All rights reserved. 
+ * + * $Id: hash_verify.c,v 1.31 2000/11/30 00:58:37 ubell Exp $ + */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: hash_verify.c,v 1.31 2000/11/30 00:58:37 ubell Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_verify.h" +#include "btree.h" +#include "hash.h" + +static int __ham_dups_unsorted __P((DB *, u_int8_t *, u_int32_t)); +static int __ham_vrfy_bucket __P((DB *, VRFY_DBINFO *, HMETA *, u_int32_t, + u_int32_t)); +static int __ham_vrfy_item __P((DB *, + VRFY_DBINFO *, db_pgno_t, PAGE *, u_int32_t, u_int32_t)); + +/* + * __ham_vrfy_meta -- + * Verify the hash-specific part of a metadata page. + * + * Note that unlike btree, we don't save things off, because we + * will need most everything again to verify each page and the + * amount of state here is significant. + * + * PUBLIC: int __ham_vrfy_meta __P((DB *, VRFY_DBINFO *, HMETA *, + * PUBLIC: db_pgno_t, u_int32_t)); + */ +int +__ham_vrfy_meta(dbp, vdp, m, pgno, flags) + DB *dbp; + VRFY_DBINFO *vdp; + HMETA *m; + db_pgno_t pgno; + u_int32_t flags; +{ + HASH *hashp; + VRFY_PAGEINFO *pip; + int i, ret, t_ret, isbad; + u_int32_t pwr, mbucket; + u_int32_t (*hfunc) __P((DB *, const void *, u_int32_t)); + + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + return (ret); + isbad = 0; + + hashp = dbp->h_internal; + + if (hashp != NULL && hashp->h_hash != NULL) + hfunc = hashp->h_hash; + else + hfunc = __ham_func5; + + /* + * If we haven't already checked the common fields in pagezero, + * check them. + */ + if (!F_ISSET(pip, VRFY_INCOMPLETE) && + (ret = __db_vrfy_meta(dbp, vdp, &m->dbmeta, pgno, flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + } + + /* h_charkey */ + if (!LF_ISSET(DB_NOORDERCHK)) + if (m->h_charkey != hfunc(dbp, CHARKEY, sizeof(CHARKEY))) { + EPRINT((dbp->dbenv, +"Database has different custom hash function; reverify with DB_NOORDERCHK set" + )); + /* + * Return immediately; this is probably a sign + * of user error rather than database corruption, so + * we want to avoid extraneous errors. + */ + isbad = 1; + goto err; + } + + /* max_bucket must be less than the last pgno. */ + if (m->max_bucket > vdp->last_pgno) { + EPRINT((dbp->dbenv, + "Impossible max_bucket %lu on meta page %lu", + m->max_bucket, pgno)); + /* + * Most other fields depend somehow on max_bucket, so + * we just return--there will be lots of extraneous + * errors. + */ + isbad = 1; + goto err; + } + + /* + * max_bucket, high_mask and low_mask: high_mask must be one + * less than the next power of two above max_bucket, and + * low_mask must be one less than the power of two below it. + * + * + */ + pwr = (m->max_bucket == 0) ? 1 : 1 << __db_log2(m->max_bucket + 1); + if (m->high_mask != pwr - 1) { + EPRINT((dbp->dbenv, + "Incorrect high_mask %lu on page %lu, should be %lu", + m->high_mask, pgno, pwr - 1)); + isbad = 1; + } + pwr >>= 1; + if (m->low_mask != pwr - 1) { + EPRINT((dbp->dbenv, + "Incorrect low_mask %lu on page %lu, should be %lu", + m->low_mask, pgno, pwr - 1)); + isbad = 1; + } + + /* ffactor: no check possible. */ + pip->h_ffactor = m->ffactor; + + /* + * nelem: just make sure it's not astronomical for now. This is the + * same check that hash_upgrade does, since there was a bug in 2.X + * which could make nelem go "negative". 
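+	 * A "negative" count stored in the unsigned field simply shows up
+	 * as an enormous value, so anything above the cutoff below is
+	 * flagged and then treated as zero rather than trusted.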
+ */ + if (m->nelem > 0x80000000) { + EPRINT((dbp->dbenv, + "Suspiciously high nelem of %lu on page %lu", + m->nelem, pgno)); + isbad = 1; + pip->h_nelem = 0; + } else + pip->h_nelem = m->nelem; + + /* flags */ + if (F_ISSET(&m->dbmeta, DB_HASH_DUP)) + F_SET(pip, VRFY_HAS_DUPS); + if (F_ISSET(&m->dbmeta, DB_HASH_DUPSORT)) + F_SET(pip, VRFY_HAS_DUPSORT); + /* XXX: Why is the DB_HASH_SUBDB flag necessary? */ + + /* spares array */ + for (i = 0; m->spares[i] != 0 && i < NCACHED; i++) { + /* + * We set mbucket to the maximum bucket that would use a given + * spares entry; we want to ensure that it's always less + * than last_pgno. + */ + mbucket = (1 << i) - 1; + if (BS_TO_PAGE(mbucket, m->spares) > vdp->last_pgno) { + EPRINT((dbp->dbenv, + "Spares array entry %lu, page %lu is invalid", + i, pgno)); + isbad = 1; + } + } + +err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0) + ret = t_ret; + return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret); +} + +/* + * __ham_vrfy -- + * Verify hash page. + * + * PUBLIC: int __ham_vrfy __P((DB *, VRFY_DBINFO *, PAGE *, db_pgno_t, + * PUBLIC: u_int32_t)); + */ +int +__ham_vrfy(dbp, vdp, h, pgno, flags) + DB *dbp; + VRFY_DBINFO *vdp; + PAGE *h; + db_pgno_t pgno; + u_int32_t flags; +{ + VRFY_PAGEINFO *pip; + u_int32_t ent, himark, inpend; + int isbad, ret, t_ret; + + isbad = 0; + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + return (ret); + + /* Sanity check our flags and page type. */ + if ((ret = __db_fchk(dbp->dbenv, "__ham_vrfy", + flags, DB_AGGRESSIVE | DB_NOORDERCHK | DB_SALVAGE)) != 0) + goto err; + + if (TYPE(h) != P_HASH) { + TYPE_ERR_PRINT(dbp->dbenv, "__ham_vrfy", pgno, TYPE(h)); + DB_ASSERT(0); + ret = EINVAL; + goto err; + } + + /* Verify and save off fields common to all PAGEs. */ + if ((ret = __db_vrfy_datapage(dbp, vdp, h, pgno, flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + } + + /* + * Verify inp[]. Each offset from 0 to NUM_ENT(h) must be lower + * than the previous one, higher than the current end of the inp array, + * and lower than the page size. + * + * In any case, we return immediately if things are bad, as it would + * be unsafe to proceed. + */ + for (ent = 0, himark = dbp->pgsize, + inpend = (u_int8_t *)h->inp - (u_int8_t *)h; + ent < NUM_ENT(h); ent++) + if (h->inp[ent] >= himark) { + EPRINT((dbp->dbenv, + "Item %lu on page %lu out of order or nonsensical", + ent, pgno)); + isbad = 1; + goto err; + } else if (inpend >= himark) { + EPRINT((dbp->dbenv, + "inp array collided with data on page %lu", + pgno)); + isbad = 1; + goto err; + + } else { + himark = h->inp[ent]; + inpend += sizeof(db_indx_t); + if ((ret = __ham_vrfy_item( + dbp, vdp, pgno, h, ent, flags)) != 0) + goto err; + } + +err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0) + ret = t_ret; + return (ret == 0 && isbad == 1 ? DB_VERIFY_BAD : ret); +} + +/* + * __ham_vrfy_item -- + * Given a hash page and an offset, sanity-check the item itself, + * and save off any overflow items or off-page dup children as necessary. 
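+ *	On-page duplicate sets (H_DUPLICATE) are walked as a series of
+ *	[len][data][len] entries, each length stored on both sides of its
+ *	datum; offpage items and offpage duplicate roots are recorded as
+ *	children for the later structure checks.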
+ */ +static int +__ham_vrfy_item(dbp, vdp, pgno, h, i, flags) + DB *dbp; + VRFY_DBINFO *vdp; + db_pgno_t pgno; + PAGE *h; + u_int32_t i, flags; +{ + HOFFPAGE hop; + HOFFDUP hod; + VRFY_CHILDINFO child; + VRFY_PAGEINFO *pip; + db_indx_t offset, len, dlen, elen; + int ret, t_ret; + u_int8_t *databuf; + + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + return (ret); + + switch (HPAGE_TYPE(h, i)) { + case H_KEYDATA: + /* Nothing to do here--everything but the type field is data */ + break; + case H_DUPLICATE: + /* Are we a datum or a key? Better be the former. */ + if (i % 2 == 0) { + EPRINT((dbp->dbenv, + "Hash key stored as duplicate at page %lu item %lu", + pip->pgno, i)); + } + /* + * Dups are encoded as a series within a single HKEYDATA, + * in which each dup is surrounded by a copy of its length + * on either side (so that the series can be walked in either + * direction. We loop through this series and make sure + * each dup is reasonable. + * + * Note that at this point, we've verified item i-1, so + * it's safe to use LEN_HKEYDATA (which looks at inp[i-1]). + */ + len = LEN_HKEYDATA(h, dbp->pgsize, i); + databuf = HKEYDATA_DATA(P_ENTRY(h, i)); + for (offset = 0; offset < len; offset += DUP_SIZE(dlen)) { + memcpy(&dlen, databuf + offset, sizeof(db_indx_t)); + + /* Make sure the length is plausible. */ + if (offset + DUP_SIZE(dlen) > len) { + EPRINT((dbp->dbenv, + "Duplicate item %lu, page %lu has bad length", + i, pip->pgno)); + ret = DB_VERIFY_BAD; + goto err; + } + + /* + * Make sure the second copy of the length is the + * same as the first. + */ + memcpy(&elen, + databuf + offset + dlen + sizeof(db_indx_t), + sizeof(db_indx_t)); + if (elen != dlen) { + EPRINT((dbp->dbenv, + "Duplicate item %lu, page %lu has two different lengths", + i, pip->pgno)); + ret = DB_VERIFY_BAD; + goto err; + } + } + F_SET(pip, VRFY_HAS_DUPS); + if (!LF_ISSET(DB_NOORDERCHK) && + __ham_dups_unsorted(dbp, databuf, len)) + F_SET(pip, VRFY_DUPS_UNSORTED); + break; + case H_OFFPAGE: + /* Offpage item. Make sure pgno is sane, save off. */ + memcpy(&hop, P_ENTRY(h, i), HOFFPAGE_SIZE); + if (!IS_VALID_PGNO(hop.pgno) || hop.pgno == pip->pgno || + hop.pgno == PGNO_INVALID) { + EPRINT((dbp->dbenv, + "Offpage item %lu, page %lu has bad page number", + i, pip->pgno)); + ret = DB_VERIFY_BAD; + goto err; + } + memset(&child, 0, sizeof(VRFY_CHILDINFO)); + child.pgno = hop.pgno; + child.type = V_OVERFLOW; + child.tlen = hop.tlen; /* This will get checked later. */ + if ((ret = __db_vrfy_childput(vdp, pip->pgno, &child)) != 0) + goto err; + break; + case H_OFFDUP: + /* Offpage duplicate item. Same drill. */ + memcpy(&hod, P_ENTRY(h, i), HOFFDUP_SIZE); + if (!IS_VALID_PGNO(hod.pgno) || hod.pgno == pip->pgno || + hod.pgno == PGNO_INVALID) { + EPRINT((dbp->dbenv, + "Offpage item %lu, page %lu has bad page number", + i, pip->pgno)); + ret = DB_VERIFY_BAD; + goto err; + } + memset(&child, 0, sizeof(VRFY_CHILDINFO)); + child.pgno = hod.pgno; + child.type = V_DUPLICATE; + if ((ret = __db_vrfy_childput(vdp, pip->pgno, &child)) != 0) + goto err; + F_SET(pip, VRFY_HAS_DUPS); + break; + default: + EPRINT((dbp->dbenv, + "Item %i, page %lu has bad type", i, pip->pgno)); + ret = DB_VERIFY_BAD; + break; + } + +err: if ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0 && ret == 0) + ret = t_ret; + return (ret); +} + +/* + * __ham_vrfy_structure -- + * Verify the structure of a hash database. 
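+ *	Walks every bucket chain via __ham_vrfy_bucket, then confirms that
+ *	pages allocated beyond max_bucket (the rest of the current doubling,
+ *	or buckets created by since-aborted transactions) are hash pages
+ *	that are still empty.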
+ * + * PUBLIC: int __ham_vrfy_structure __P((DB *, VRFY_DBINFO *, db_pgno_t, + * PUBLIC: u_int32_t)); + */ +int +__ham_vrfy_structure(dbp, vdp, meta_pgno, flags) + DB *dbp; + VRFY_DBINFO *vdp; + db_pgno_t meta_pgno; + u_int32_t flags; +{ + DB *pgset; + HMETA *m; + PAGE *h; + VRFY_PAGEINFO *pip; + int isbad, p, ret, t_ret; + db_pgno_t pgno; + u_int32_t bucket; + + ret = isbad = 0; + h = NULL; + pgset = vdp->pgset; + + if ((ret = __db_vrfy_pgset_get(pgset, meta_pgno, &p)) != 0) + return (ret); + if (p != 0) { + EPRINT((dbp->dbenv, + "Hash meta page %lu referenced twice", meta_pgno)); + return (DB_VERIFY_BAD); + } + if ((ret = __db_vrfy_pgset_inc(pgset, meta_pgno)) != 0) + return (ret); + + /* Get the meta page; we'll need it frequently. */ + if ((ret = memp_fget(dbp->mpf, &meta_pgno, 0, &m)) != 0) + return (ret); + + /* Loop through bucket by bucket. */ + for (bucket = 0; bucket <= m->max_bucket; bucket++) + if ((ret = + __ham_vrfy_bucket(dbp, vdp, m, bucket, flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + } + + /* + * There may be unused hash pages corresponding to buckets + * that have been allocated but not yet used. These may be + * part of the current doubling above max_bucket, or they may + * correspond to buckets that were used in a transaction + * that then aborted. + * + * Loop through them, as far as the spares array defines them, + * and make sure they're all empty. + * + * Note that this should be safe, since we've already verified + * that the spares array is sane. + */ + for (bucket = m->max_bucket + 1; + m->spares[__db_log2(bucket + 1)] != 0; bucket++) { + pgno = BS_TO_PAGE(bucket, m->spares); + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + goto err; + + /* It's okay if these pages are totally zeroed; unmark it. */ + F_CLR(pip, VRFY_IS_ALLZEROES); + + if (pip->type != P_HASH) { + EPRINT((dbp->dbenv, + "Hash bucket %lu maps to non-hash page %lu", + bucket, pgno)); + isbad = 1; + } else if (pip->entries != 0) { + EPRINT((dbp->dbenv, + "Non-empty page %lu in unused hash bucket %lu", + pgno, bucket)); + isbad = 1; + } else { + if ((ret = __db_vrfy_pgset_get(pgset, pgno, &p)) != 0) + goto err; + if (p != 0) { + EPRINT((dbp->dbenv, + "Hash page %lu above max_bucket referenced", + pgno)); + isbad = 1; + } else { + if ((ret = + __db_vrfy_pgset_inc(pgset, pgno)) != 0) + goto err; + if ((ret = + __db_vrfy_putpageinfo(vdp, pip)) != 0) + goto err; + continue; + } + } + + /* If we got here, it's an error. */ + (void)__db_vrfy_putpageinfo(vdp, pip); + goto err; + } + +err: if ((t_ret = memp_fput(dbp->mpf, m, 0)) != 0) + return (t_ret); + if (h != NULL && (t_ret = memp_fput(dbp->mpf, h, 0)) != 0) + return (t_ret); + return ((isbad == 1 && ret == 0) ? DB_VERIFY_BAD: ret); +} + +/* + * __ham_vrfy_bucket -- + * Verify a given bucket. + */ +static int +__ham_vrfy_bucket(dbp, vdp, m, bucket, flags) + DB *dbp; + VRFY_DBINFO *vdp; + HMETA *m; + u_int32_t bucket, flags; +{ + HASH *hashp; + VRFY_CHILDINFO *child; + VRFY_PAGEINFO *mip, *pip; + int ret, t_ret, isbad, p; + db_pgno_t pgno, next_pgno; + DBC *cc; + u_int32_t (*hfunc) __P((DB *, const void *, u_int32_t)); + + isbad = 0; + pip = NULL; + cc = NULL; + + hashp = dbp->h_internal; + if (hashp != NULL && hashp->h_hash != NULL) + hfunc = hashp->h_hash; + else + hfunc = __ham_func5; + + if ((ret = __db_vrfy_getpageinfo(vdp, PGNO(m), &mip)) != 0) + return (ret); + + /* Calculate the first pgno for this bucket. 
*/ + pgno = BS_TO_PAGE(bucket, m->spares); + + if ((ret = __db_vrfy_getpageinfo(vdp, pgno, &pip)) != 0) + goto err; + + /* Make sure we got a plausible page number. */ + if (pgno > vdp->last_pgno || pip->type != P_HASH) { + EPRINT((dbp->dbenv, "Bucket %lu has impossible first page %lu", + bucket, pgno)); + /* Unsafe to continue. */ + isbad = 1; + goto err; + } + + if (pip->prev_pgno != PGNO_INVALID) { + EPRINT((dbp->dbenv, + "First hash page %lu in bucket %lu has a prev_pgno", pgno)); + isbad = 1; + } + + /* + * Set flags for dups and sorted dups. + */ + flags |= F_ISSET(mip, VRFY_HAS_DUPS) ? ST_DUPOK : 0; + flags |= F_ISSET(mip, VRFY_HAS_DUPSORT) ? ST_DUPSORT : 0; + + /* Loop until we find a fatal bug, or until we run out of pages. */ + for (;;) { + /* Provide feedback on our progress to the application. */ + if (!LF_ISSET(DB_SALVAGE)) + __db_vrfy_struct_feedback(dbp, vdp); + + if ((ret = __db_vrfy_pgset_get(vdp->pgset, pgno, &p)) != 0) + goto err; + if (p != 0) { + EPRINT((dbp->dbenv, + "Hash page %lu referenced twice", pgno)); + isbad = 1; + /* Unsafe to continue. */ + goto err; + } else if ((ret = __db_vrfy_pgset_inc(vdp->pgset, pgno)) != 0) + goto err; + + /* + * Hash pages that nothing has ever hashed to may never + * have actually come into existence, and may appear to be + * entirely zeroed. This is acceptable, and since there's + * no real way for us to know whether this has actually + * occurred, we clear the "wholly zeroed" flag on every + * hash page. A wholly zeroed page, by nature, will appear + * to have no flags set and zero entries, so should + * otherwise verify correctly. + */ + F_CLR(pip, VRFY_IS_ALLZEROES); + + /* If we have dups, our meta page had better know about it. */ + if (F_ISSET(pip, VRFY_HAS_DUPS) + && !F_ISSET(mip, VRFY_HAS_DUPS)) { + EPRINT((dbp->dbenv, + "Duplicates present in non-duplicate database, page %lu", + pgno)); + isbad = 1; + } + + /* + * If the database has sorted dups, this page had better + * not have unsorted ones. + */ + if (F_ISSET(mip, VRFY_HAS_DUPSORT) && + F_ISSET(pip, VRFY_DUPS_UNSORTED)) { + EPRINT((dbp->dbenv, + "Unsorted dups in sorted-dup database, page %lu", + pgno)); + isbad = 1; + } + + /* Walk overflow chains and offpage dup trees. */ + if ((ret = __db_vrfy_childcursor(vdp, &cc)) != 0) + goto err; + for (ret = __db_vrfy_ccset(cc, pip->pgno, &child); ret == 0; + ret = __db_vrfy_ccnext(cc, &child)) + if (child->type == V_OVERFLOW) { + if ((ret = __db_vrfy_ovfl_structure(dbp, vdp, + child->pgno, child->tlen, flags)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + } + } else if (child->type == V_DUPLICATE) { + if ((ret = __db_vrfy_duptype(dbp, + vdp, child->pgno, flags)) != 0) { + isbad = 1; + continue; + } + if ((ret = __bam_vrfy_subtree(dbp, vdp, + child->pgno, NULL, NULL, + flags | ST_RECNUM | ST_DUPSET, NULL, + NULL, NULL)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + } + } + if ((ret = __db_vrfy_ccclose(cc)) != 0) + goto err; + cc = NULL; + + /* If it's safe to check that things hash properly, do so. */ + if (isbad == 0 && !LF_ISSET(DB_NOORDERCHK) && + (ret = __ham_vrfy_hashing(dbp, pip->entries, + m, bucket, pgno, flags, hfunc)) != 0) { + if (ret == DB_VERIFY_BAD) + isbad = 1; + else + goto err; + } + + next_pgno = pip->next_pgno; + ret = __db_vrfy_putpageinfo(vdp, pip); + + pip = NULL; + if (ret != 0) + goto err; + + if (next_pgno == PGNO_INVALID) + break; /* End of the bucket. */ + + /* We already checked this, but just in case... 
*/ + if (!IS_VALID_PGNO(next_pgno)) { + DB_ASSERT(0); + EPRINT((dbp->dbenv, + "Hash page %lu has bad next_pgno", pgno)); + isbad = 1; + goto err; + } + + if ((ret = __db_vrfy_getpageinfo(vdp, next_pgno, &pip)) != 0) + goto err; + + if (pip->prev_pgno != pgno) { + EPRINT((dbp->dbenv, "Hash page %lu has bad prev_pgno", + next_pgno)); + isbad = 1; + } + pgno = next_pgno; + } + +err: if (cc != NULL && ((t_ret = __db_vrfy_ccclose(cc)) != 0) && ret == 0) + ret = t_ret; + if (mip != NULL && ((t_ret = __db_vrfy_putpageinfo(vdp, mip)) != 0) && + ret == 0) + ret = t_ret; + if (pip != NULL && ((t_ret = __db_vrfy_putpageinfo(vdp, pip)) != 0) && + ret == 0) + ret = t_ret; + return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret); +} + +/* + * __ham_vrfy_hashing -- + * Verify that all items on a given hash page hash correctly. + * + * PUBLIC: int __ham_vrfy_hashing __P((DB *, + * PUBLIC: u_int32_t, HMETA *, u_int32_t, db_pgno_t, u_int32_t, + * PUBLIC: u_int32_t (*) __P((DB *, const void *, u_int32_t)))); + */ +int +__ham_vrfy_hashing(dbp, nentries, m, thisbucket, pgno, flags, hfunc) + DB *dbp; + u_int32_t nentries; + HMETA *m; + u_int32_t thisbucket; + db_pgno_t pgno; + u_int32_t flags; + u_int32_t (*hfunc) __P((DB *, const void *, u_int32_t)); +{ + DBT dbt; + PAGE *h; + db_indx_t i; + int ret, t_ret, isbad; + u_int32_t hval, bucket; + + ret = isbad = 0; + memset(&dbt, 0, sizeof(DBT)); + F_SET(&dbt, DB_DBT_REALLOC); + + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) + return (ret); + + for (i = 0; i < nentries; i += 2) { + /* + * We've already verified the page integrity and that of any + * overflow chains linked off it; it is therefore safe to use + * __db_ret. It's also not all that much slower, since we have + * to copy every hash item to deal with alignment anyway; we + * can tweak this a bit if this proves to be a bottleneck, + * but for now, take the easy route. + */ + if ((ret = __db_ret(dbp, h, i, &dbt, NULL, NULL)) != 0) + goto err; + hval = hfunc(dbp, dbt.data, dbt.size); + + bucket = hval & m->high_mask; + if (bucket > m->max_bucket) + bucket = bucket & m->low_mask; + + if (bucket != thisbucket) { + EPRINT((dbp->dbenv, + "Item %lu on page %lu hashes incorrectly", + i, pgno)); + isbad = 1; + } + } + +err: if (dbt.data != NULL) + __os_free(dbt.data, 0); + if ((t_ret = memp_fput(dbp->mpf, h, 0)) != 0) + return (t_ret); + + return ((ret == 0 && isbad == 1) ? DB_VERIFY_BAD : ret); +} + +/* + * __ham_salvage -- + * Safely dump out anything that looks like a key on an alleged + * hash page. + * + * PUBLIC: int __ham_salvage __P((DB *, VRFY_DBINFO *, db_pgno_t, PAGE *, + * PUBLIC: void *, int (*)(void *, const void *), u_int32_t)); + */ +int +__ham_salvage(dbp, vdp, pgno, h, handle, callback, flags) + DB *dbp; + VRFY_DBINFO *vdp; + db_pgno_t pgno; + PAGE *h; + void *handle; + int (*callback) __P((void *, const void *)); + u_int32_t flags; +{ + DBT dbt, unkdbt; + db_pgno_t dpgno; + int ret, err_ret, t_ret; + u_int32_t himark, tlen; + u_int8_t *hk; + void *buf; + u_int32_t dlen, len, i; + + memset(&dbt, 0, sizeof(DBT)); + dbt.flags = DB_DBT_REALLOC; + + memset(&unkdbt, 0, sizeof(DBT)); + unkdbt.size = strlen("UNKNOWN") + 1; + unkdbt.data = "UNKNOWN"; + + err_ret = 0; + + /* + * Allocate a buffer for overflow items. Start at one page; + * __db_safe_goff will realloc as needed. + */ + if ((ret = __os_malloc(dbp->dbenv, dbp->pgsize, NULL, &buf)) != 0) + return (ret); + + himark = dbp->pgsize; + for (i = 0;; i++) { + /* If we're not aggressive, break when we hit NUM_ENT(h). 
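+		   Under DB_AGGRESSIVE we keep scanning past the item count,
+		   since NUM_ENT itself may not be trustworthy on a damaged
+		   page; __db_vrfy_inpitem returning DB_VERIFY_FATAL is what
+		   stops us then.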
*/ + if (!LF_ISSET(DB_AGGRESSIVE) && i >= NUM_ENT(h)) + break; + + /* Verify the current item. */ + ret = __db_vrfy_inpitem(dbp, + h, pgno, i, 0, flags, &himark, NULL); + /* If this returned a fatality, it's time to break. */ + if (ret == DB_VERIFY_FATAL) + break; + + if (ret == 0) { + hk = P_ENTRY(h, i); + len = LEN_HKEYDATA(h, dbp->pgsize, i); + if ((u_int32_t)(hk + len - (u_int8_t *)h) > + dbp->pgsize) { + /* + * Item is unsafely large; either continue + * or set it to the whole page, depending on + * aggressiveness. + */ + if (!LF_ISSET(DB_AGGRESSIVE)) + continue; + len = dbp->pgsize - + (u_int32_t)(hk - (u_int8_t *)h); + err_ret = DB_VERIFY_BAD; + } + switch (HPAGE_PTYPE(hk)) { + default: + if (!LF_ISSET(DB_AGGRESSIVE)) + break; + err_ret = DB_VERIFY_BAD; + /* FALLTHROUGH */ + case H_KEYDATA: +keydata: memcpy(buf, HKEYDATA_DATA(hk), len); + dbt.size = len; + dbt.data = buf; + if ((ret = __db_prdbt(&dbt, + 0, " ", handle, callback, 0, NULL)) != 0) + err_ret = ret; + break; + case H_OFFPAGE: + if (len < HOFFPAGE_SIZE) { + err_ret = DB_VERIFY_BAD; + continue; + } + memcpy(&dpgno, + HOFFPAGE_PGNO(hk), sizeof(dpgno)); + if ((ret = __db_safe_goff(dbp, vdp, + dpgno, &dbt, &buf, flags)) != 0) { + err_ret = ret; + (void)__db_prdbt(&unkdbt, 0, " ", + handle, callback, 0, NULL); + break; + } + if ((ret = __db_prdbt(&dbt, + 0, " ", handle, callback, 0, NULL)) != 0) + err_ret = ret; + break; + case H_OFFDUP: + if (len < HOFFPAGE_SIZE) { + err_ret = DB_VERIFY_BAD; + continue; + } + memcpy(&dpgno, + HOFFPAGE_PGNO(hk), sizeof(dpgno)); + /* UNKNOWN iff pgno is bad or we're a key. */ + if (!IS_VALID_PGNO(dpgno) || (i % 2 == 0)) { + if ((ret = __db_prdbt(&unkdbt, 0, " ", + handle, callback, 0, NULL)) != 0) + err_ret = ret; + } else if ((ret = __db_salvage_duptree(dbp, + vdp, dpgno, &dbt, handle, callback, + flags | SA_SKIPFIRSTKEY)) != 0) + err_ret = ret; + break; + case H_DUPLICATE: + /* + * We're a key; printing dups will seriously + * foul the output. If we're being aggressive, + * pretend this is a key and let the app. + * programmer sort out the mess. + */ + if (i % 2 == 0) { + err_ret = ret; + if (LF_ISSET(DB_AGGRESSIVE)) + goto keydata; + break; + } + + /* Too small to have any data. */ + if (len < + HKEYDATA_SIZE(2 * sizeof(db_indx_t))) { + err_ret = DB_VERIFY_BAD; + continue; + } + + /* Loop until we hit the total length. */ + for (tlen = 0; tlen + sizeof(db_indx_t) < len; + tlen += dlen) { + tlen += sizeof(db_indx_t); + memcpy(&dlen, hk, sizeof(db_indx_t)); + /* + * If dlen is too long, print all the + * rest of the dup set in a chunk. + */ + if (dlen + tlen > len) + dlen = len - tlen; + memcpy(buf, hk + tlen, dlen); + dbt.size = dlen; + dbt.data = buf; + if ((ret = __db_prdbt(&dbt, 0, " ", + handle, callback, 0, NULL)) != 0) + err_ret = ret; + tlen += sizeof(db_indx_t); + } + break; + } + } + } + + __os_free(buf, 0); + if ((t_ret = __db_salvage_markdone(vdp, pgno)) != 0) + return (t_ret); + return ((ret == 0 && err_ret != 0) ? err_ret : ret); +} + +/* + * __ham_meta2pgset -- + * Return the set of hash pages corresponding to the given + * known-good meta page. + * + * PUBLIC: int __ham_meta2pgset __P((DB *, VRFY_DBINFO *, HMETA *, u_int32_t, + * PUBLIC: DB *)); + */ +int __ham_meta2pgset(dbp, vdp, hmeta, flags, pgset) + DB *dbp; + VRFY_DBINFO *vdp; + HMETA *hmeta; + u_int32_t flags; + DB *pgset; +{ + PAGE *h; + db_pgno_t pgno; + u_int32_t bucket, totpgs; + int ret, val; + + /* + * We don't really need flags, but leave them for consistency with + * __bam_meta2pgset. 
+ */ + COMPQUIET(flags, 0); + + DB_ASSERT(pgset != NULL); + + totpgs = 0; + + /* + * Loop through all the buckets, pushing onto pgset the corresponding + * page(s) for each one. + */ + for (bucket = 0; bucket <= hmeta->max_bucket; bucket++) { + pgno = BS_TO_PAGE(bucket, hmeta->spares); + + /* + * We know the initial pgno is safe because the spares array has + * been verified. + * + * Safely walk the list of pages in this bucket. + */ + for (;;) { + if ((ret = memp_fget(dbp->mpf, &pgno, 0, &h)) != 0) + return (ret); + if (TYPE(h) == P_HASH) { + + /* + * Make sure we don't go past the end of + * pgset. + */ + if (++totpgs > vdp->last_pgno) { + (void)memp_fput(dbp->mpf, h, 0); + return (DB_VERIFY_BAD); + } + if ((ret = + __db_vrfy_pgset_inc(pgset, pgno)) != 0) + return (ret); + + pgno = NEXT_PGNO(h); + } else + pgno = PGNO_INVALID; + + if ((ret = memp_fput(dbp->mpf, h, 0)) != 0) + return (ret); + + /* If the new pgno is wonky, go onto the next bucket. */ + if (!IS_VALID_PGNO(pgno) || + pgno == PGNO_INVALID) + goto nextbucket; + + /* + * If we've touched this page before, we have a cycle; + * go on to the next bucket. + */ + if ((ret = __db_vrfy_pgset_get(pgset, pgno, &val)) != 0) + return (ret); + if (val != 0) + goto nextbucket; + } +nextbucket: ; + } + return (0); +} + +/* + * __ham_dups_unsorted -- + * Takes a known-safe hash duplicate set and its total length. + * Returns 1 if there are out-of-order duplicates in this set, + * 0 if there are not. + */ +static int +__ham_dups_unsorted(dbp, buf, len) + DB *dbp; + u_int8_t *buf; + u_int32_t len; +{ + DBT a, b; + db_indx_t offset, dlen; + int (*func) __P((DB *, const DBT *, const DBT *)); + + memset(&a, 0, sizeof(DBT)); + memset(&b, 0, sizeof(DBT)); + + func = (dbp->dup_compare == NULL) ? __bam_defcmp : dbp->dup_compare; + + /* + * Loop through the dup set until we hit the end or we find + * a pair of dups that's out of order. b is always the current + * dup, a the one before it. + */ + for (offset = 0; offset < len; offset += DUP_SIZE(dlen)) { + memcpy(&dlen, buf + offset, sizeof(db_indx_t)); + b.data = buf + offset + sizeof(db_indx_t); + b.size = dlen; + + if (a.data != NULL && func(dbp, &a, &b) > 0) + return (1); + + a.data = b.data; + a.size = b.size; + } + + return (0); +} |
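The verification and upgrade code above leans on two small pieces of arithmetic that are easy to lose in the detail: __ham_vrfy_hashing reduces a hash value to a bucket with high_mask, falling back to low_mask when the result exceeds max_bucket, and BS_TO_PAGE turns a bucket into a page number using the per-doubling offsets in the spares array. The sketch below restates that arithmetic as a small standalone C program. The helper names, mask values and spares offsets are illustrative only and are not part of the imported Berkeley DB sources; log2_ceil stands in for __db_log2, whose ceiling behaviour is implied by the spares-conversion loop in __ham_30_hashmeta.

#include <stdio.h>
#include <stdint.h>

/* Smallest e with 2^e >= n, i.e. a ceiling log2, as the spares index needs. */
static uint32_t
log2_ceil(uint32_t n)
{
	uint32_t e;

	for (e = 0; ((uint32_t)1 << e) < n; e++)
		;
	return (e);
}

/* Mirror of the masking done in __ham_vrfy_hashing. */
static uint32_t
hval_to_bucket(uint32_t hval,
    uint32_t high_mask, uint32_t low_mask, uint32_t max_bucket)
{
	uint32_t bucket;

	bucket = hval & high_mask;
	if (bucket > max_bucket)
		bucket &= low_mask;
	return (bucket);
}

/* Bucket-to-page mapping in the spirit of BS_TO_PAGE. */
static uint32_t
bucket_to_page(uint32_t bucket, const uint32_t *spares)
{
	return (bucket + spares[log2_ceil(bucket + 1)]);
}

int
main(void)
{
	/* Hypothetical metadata: buckets 0..5 in use, masks for the doubling. */
	uint32_t max_bucket = 5, high_mask = 0x7, low_mask = 0x3;
	/* Hypothetical per-doubling offsets; index 0 covers bucket 0 only. */
	uint32_t spares[] = { 1, 1, 1, 1 };
	uint32_t bucket, hval;

	for (hval = 0; hval < 16; hval++) {
		bucket = hval_to_bucket(hval, high_mask, low_mask, max_bucket);
		printf("hval %2u -> bucket %u -> page %u\n",
		    (unsigned)hval, (unsigned)bucket,
		    (unsigned)bucket_to_page(bucket, spares));
	}
	return (0);
}

An item whose hash value maps to a bucket other than the one whose chain it sits on is exactly what __ham_vrfy_hashing reports as hashing incorrectly.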