1 files changed, 549 insertions, 0 deletions
diff --git a/src/hash/hash_compact.c b/src/hash/hash_compact.c
new file mode 100644
index 00000000..83b5ffb1
--- /dev/null
+++ b/src/hash/hash_compact.c
@@ -0,0 +1,549 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 2012 Oracle and/or its affiliates.  All rights reserved.
+ * $Id$
+ */
+
+#include "db_config.h"
+
+#include "db_int.h"
+#include "dbinc/db_page.h"
+#include "dbinc/btree.h"
+#include "dbinc/hash.h"
+#include "dbinc/lock.h"
+#include "dbinc/txn.h"
+#include "dbinc/mp.h"
+
+static int __ham_copy_data __P((DBC *, PAGE *, DB_COMPACT *, int *));
+static int __ham_truncate_overflow __P((DBC *, u_int32_t, DB_COMPACT *, int *));
+
+/*
+ * __ham_compact_int -- internal HASH compaction routine.
+ *
+ * PUBLIC: int __ham_compact_int __P((DBC *,
+ * PUBLIC:      DBT *, DBT *, u_int32_t, DB_COMPACT *, int *, u_int32_t));
+ */
+int
+__ham_compact_int(dbc, start, stop, factor, c_data, donep, flags)
+	DBC *dbc;
+	DBT *start, *stop;
+	u_int32_t factor;
+	DB_COMPACT *c_data;
+	int *donep;
+	u_int32_t flags;
+{
+	DB *dbp;
+	DB_MPOOLFILE *mpf;
+	HASH_CURSOR *hcp;
+	db_pgno_t origpgno, pgno;
+	int check_trunc, pgs_done, ret, t_ret;
+	u_int32_t empty_buckets, i, stop_bucket;
+
+	dbp = dbc->dbp;
+	mpf = dbp->mpf;
+	hcp = (HASH_CURSOR *)dbc->internal;
+	pgs_done = 0;
+	empty_buckets = 0;
+	check_trunc = c_data->compact_truncate != PGNO_INVALID;
+	if ((ret = __ham_get_meta(dbc)) != 0)
+		return (ret);
+
+	if (stop != NULL && stop->size != 0)
+		stop_bucket = *(u_int32_t *)stop->data;
+	else
+		stop_bucket = hcp->hdr->max_bucket;
+
+	if (start != NULL && start->size != 0)
+		hcp->bucket = *(u_int32_t *)start->data;
+	else
+		hcp->bucket = 0;
+
+	for (; hcp->bucket <= stop_bucket && ret == 0; hcp->bucket++) {
+		/*
+		 * For each bucket first move records toward the head of
+		 * the bucket.
+		 */
+		hcp->indx = NDX_INVALID;
+		F_CLR(hcp, H_ISDUP);
+		hcp->pgno = BUCKET_TO_PAGE(hcp, hcp->bucket);
+		pgno = PGNO_INVALID;
+		ret = __ham_item_next(dbc, DB_LOCK_WRITE, &pgno);
+
+		/*
+		 * If the bucket is empty, just note it, otherwise process it.
+		 * If there are any records there must be some in the head
+		 * of the bucket.
+		 */
+		if (ret == DB_NOTFOUND ) {
+			empty_buckets++;
+			c_data->compact_pages_examine++;
+			DB_ASSERT(dbp->env,
+			    PREV_PGNO(hcp->page) == PGNO_INVALID &&
+			    NEXT_PGNO(hcp->page) == PGNO_INVALID);
+			goto err;
+		} else if (ret != 0)
+			break;
+		c_data->compact_pages_examine++;
+
+		if (NEXT_PGNO(hcp->page) != PGNO_INVALID) {
+			if ((ret =
+			    __ham_compact_bucket(dbc, c_data, &pgs_done)) != 0)
+				goto err;
+			pgno = PGNO_INVALID;
+			if ((ret = __ham_item(dbc, DB_LOCK_WRITE, &pgno)) != 0)
+				goto err;
+		}
+
+		/*
+		 * Loop through the items in this page in the bucket and process
+		 * overflow records and off page duplicate sets.
+		 */
+		while (ret == 0) {
+			/* Handle off page duplicate trees. */
+			if (pgno == PGNO_INVALID)
+				goto no_opd;
+			if (check_trunc &&
+			    pgno > c_data->compact_truncate) {
+				c_data->compact_pages_examine++;
+				/*
+				 * Truncate this page if possible.
+				 * We must update the parent here
+				 * because the page number is
+				 * not aligned.
+				 */
+				if ((ret = __memp_dirty(mpf, &hcp->page,
+				    dbc->thread_info,
+				    dbc->txn, dbc->priority, 0)) != 0)
+					break;
+				origpgno = pgno;
+				if ((ret = __db_truncate_root(dbc, hcp->page,
+				    H_DATAINDEX(hcp->indx), &pgno, 0)) != 0)
+					break;
+				if (pgno != origpgno) {
+					memcpy(HOFFDUP_PGNO(H_PAIRDATA(dbp,
+					    hcp->page, hcp->indx)),
+					    &pgno, sizeof(db_pgno_t));
+					pgs_done++;
+					c_data->compact_pages--;
+				}
+			}
+			/*
+			 * Compact the off page duplicate tree.
+			 */
+			if ((ret = __bam_compact_opd(dbc,
+			    pgno, NULL, factor, c_data, &pgs_done)) != 0)
+				break;
+
+no_opd:			if (check_trunc && HPAGE_PTYPE(H_PAIRDATA(
+			    dbp, hcp->page, hcp->indx)) == H_OFFPAGE) {
+				/* This is an overflow chain. */
+				if ((ret = __ham_truncate_overflow(dbc,
+				    H_DATAINDEX(hcp->indx),
+				    c_data, &pgs_done)) != 0)
+					break;
+			}
+
+			/* Check for an overflow key. */
+			if (check_trunc && HPAGE_PTYPE(H_PAIRKEY(
+			    dbp, hcp->page, hcp->indx)) == H_OFFPAGE) {
+				/* This is an overflow chain. */
+				if ((ret = __ham_truncate_overflow(dbc,
+				    H_KEYINDEX(hcp->indx),
+				    c_data, &pgs_done)) != 0)
+					break;
+			}
+
+			pgno = PGNO_INVALID;
+			ret = __ham_item_next(dbc, DB_LOCK_WRITE, &pgno);
+		}
+
+err:		if (hcp->page != NULL &&
+		    (t_ret = __memp_fput(mpf, dbc->thread_info,
+		    hcp->page, dbc->priority)) != 0 && ret == 0)
+			ret = t_ret;
+		if (ret == DB_NOTFOUND)
+			ret = 0;
+		hcp->page = NULL;
+		hcp->pgno = pgno = PGNO_INVALID;
+		/*
+		 * If we are in an auto-transaction and we updated something
+		 * return to the caller to commit this transaction to
+		 * avoid holding locks. Otherwise process the next bucket.
+		 * We can drop the lock if we did not do anything.
+		 * We always must commit the txn if we are in MVCC
+		 * as we have dirtied the hash buckets.
+		 */
+		if (ret == 0 &&
+		    atomic_read(&dbp->mpf->mfp->multiversion) == 0 &&
+		    (pgs_done == 0 || dbc->txn == NULL))
+			ret = __LPUT(dbc, hcp->lock);
+		else if (LF_ISSET(DB_AUTO_COMMIT)) {
+			if (ret == 0)
+				hcp->bucket++;
+			break;
+		}
+	}
+	/*
+	 * If we saw any empty buckets and we are freeing space we
+	 * want to contract the table before dropping the metadata
+	 * page. Wait till we are done with everything else as we
+	 * need to get an exclusive lock on the metadata page.
+	 */
+	if (ret == 0 && empty_buckets != 0 && LF_ISSET(DB_FREE_SPACE)) {
+		for (i = 0; i < empty_buckets && hcp->hdr->max_bucket > 2; i++)
+			if ((ret = __ham_contract_table(dbc, c_data)) != 0)
+				break;
+	}
+
+	if (ret == 0)
+		ret = __db_retcopy(dbp->env, start, &hcp->bucket,
+		    sizeof(hcp->bucket), &start->data, &start->ulen);
+	(void)__ham_release_meta(dbc);
+	c_data->compact_empty_buckets += empty_buckets;
+	if (hcp->bucket > stop_bucket)
+		*donep = 1;
+	return (ret);
+}
+
+/*
+ * __ham_compact_bucket -- move data to as few pages as possible.
+ *
+ * PUBLIC: int __ham_compact_bucket __P((DBC *, DB_COMPACT *, int *));
+ */
+int
+__ham_compact_bucket(dbc, c_data, pgs_donep)
+	DBC *dbc;
+	DB_COMPACT *c_data;
+	int *pgs_donep;
+{
+	DB *dbp;
+	DB_MPOOLFILE *mpf;
+	HASH_CURSOR *hcp;
+	PAGE *pg;
+	db_pgno_t pgno;
+	int check_trunc, ret, t_ret;
+
+	hcp = (HASH_CURSOR *)dbc->internal;
+	dbp = dbc->dbp;
+	mpf = dbp->mpf;
+	pg = hcp->page;
+	check_trunc = c_data->compact_truncate != PGNO_INVALID;
+	ret = 0;
+
+	pgno = hcp->pgno;
+	do {
+		if (pg == NULL && (ret = __memp_fget(mpf, &pgno,
+		    dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &pg)) != 0)
+			break;
+		/* Sort any unsorted pages before adding to the page. */
+		if (TYPE(pg) == P_HASH_UNSORTED) {
+			if ((ret = __ham_sort_page_cursor(dbc, pg)) != 0)
+				break;
+			(*pgs_donep)++;
+		}
+
+		/* If this is not the head try to move it to a lower page. */
+		if (check_trunc && PREV_PGNO(pg) != PGNO_INVALID  &&
+		    PGNO(pg) > c_data->compact_truncate &&
+		    (ret = __db_exchange_page(dbc, &pg,
+		    hcp->page, PGNO_INVALID, DB_EXCH_FREE)) != 0)
+			break;
+		if (pgno != PGNO(pg))
+			(*pgs_donep)++;
+
+		if (NEXT_PGNO(pg) == PGNO_INVALID)
+			break;
+		if ((ret = __ham_copy_data(dbc, pg, c_data, pgs_donep)) != 0)
+			break;
+		pgno = NEXT_PGNO(pg);
+		if (pg != hcp->page && (ret = __memp_fput(mpf,
+		     dbc->thread_info, pg, dbc->priority)) != 0)
+			break;
+		pg = NULL;
+	} while (pgno != PGNO_INVALID);
+
+	if (pg != NULL && pg != hcp->page &&
+	    (t_ret = __memp_fput(mpf, dbc->thread_info, pg, dbc->priority)) &&
+	    ret == 0)
+		ret = t_ret;
+	return (ret);
+}
+
+/*
+ * __ham_copy_data -- copy as many records as possible from next page
+ */
+static int
+__ham_copy_data(dbc, pg, c_data, pgs_donep)
+	DBC *dbc;
+	PAGE *pg;
+	DB_COMPACT *c_data;
+	int *pgs_donep;
+{
+	DB *dbp;
+	DBC *newdbc;
+	DBT data, key;
+	DB_MPOOLFILE *mpf;
+	HASH_CURSOR *hcp, *ncp;
+	PAGE *nextpage;
+	db_pgno_t origpgno;
+	int i, nument, records, ret, t_ret;
+	u_int32_t len;
+
+	dbp = dbc->dbp;
+	mpf = dbp->mpf;
+	hcp = (HASH_CURSOR *)dbc->internal;
+	records = 0;
+
+	if ((ret = __dbc_dup(dbc, &newdbc, 0)) != 0)
+		return (ret);
+	ncp = (HASH_CURSOR *)newdbc->internal;
+	ncp->hdr = hcp->hdr;
+
+	/*
+	 * Copy data to the front of the bucket. Loop until either we
+	 * have not replaced the next page or there is no next page.
+	 * If the next page was not removed then it still has data
+	 * on it.
+	 */
+	origpgno = PGNO_INVALID;
+	while (origpgno != NEXT_PGNO(pg) &&
+	    (origpgno = NEXT_PGNO(pg)) != PGNO_INVALID) {
+
+		if ((ret = __memp_fget(mpf, &NEXT_PGNO(pg), dbc->thread_info,
+		    dbc->txn, DB_MPOOL_DIRTY, &nextpage)) != 0)
+			break;
+
+		c_data->compact_pages_examine++;
+		ncp->page = nextpage;
+		ncp->pgno = PGNO(nextpage);
+		ncp->indx = 0;
+		memset(&key, 0, sizeof(key));
+		memset(&data, 0, sizeof(data));
+		nument = NUM_ENT(nextpage);
+		DB_ASSERT(dbp->env, nument != 0);
+		for (i = 0; i < nument; i += 2) {
+			len = LEN_HITEM(dbp, nextpage, dbp->pgsize, 0) +
+			    LEN_HITEM(dbp, nextpage, dbp->pgsize, 1) +
+			    2 * sizeof(db_indx_t);
+			if (P_FREESPACE(dbp, pg) < len)
+				continue;
+
+			if ((ret =
+			    __ham_copypair(dbc, nextpage, 0, pg, NULL, 1)) != 0)
+				break;
+
+			records++;
+			if ((ret = __ham_del_pair(newdbc,
+			    HAM_DEL_IGNORE_OFFPAGE, pg)) != 0)
+				break;
+			if (!STD_LOCKING(dbc)) {
+				if ((ret = __ham_dirty_meta(dbc, 0)) != 0)
+					return (ret);
+				++hcp->hdr->nelem;
+			}
+		}
+		/*
+		 * If we moved all the records then __ham_del_pair will
+		 * have deleted the nextpage.
+		 */
+		if (records >= nument/2) {
+			c_data->compact_pages_examine++;
+			c_data->compact_pages_free++;
+			COMPACT_TRUNCATE(c_data);
+		}
+		if (ncp->page != NULL &&
+		    (t_ret = __memp_fput(mpf, dbc->thread_info,
+		    ncp->page, dbc->priority)) != 0 && ret == 0)
+			ret = t_ret;
+		ncp->page = NULL;
+		ncp->pgno = PGNO_INVALID;
+	}
+
+	/*
+	 * If __ham_del_pair freed a page then we needed to dirty the metapage
+	 * and it could change so we need to copy it back to hcp.
+	 */
+	hcp->hdr = ncp->hdr;
+	ncp->hdr = NULL;
+	if ((t_ret = __ham_release_meta(newdbc)) != 0 && ret == 0)
+		ret = t_ret;
+	if ((t_ret = __dbc_close(newdbc)) != 0 && ret == 0)
+		ret = t_ret;
+	if (records != 0)
+		(*pgs_donep)++;
+	return (ret);
+}
+
+/*
+ * __ham_truncate_overflow -- try to truncate pages from an overflow chain.
+ */
+static int
+__ham_truncate_overflow(dbc, indx, c_data, pgs_done)
+	DBC *dbc;
+	u_int32_t indx;
+	DB_COMPACT *c_data;
+	int *pgs_done;
+{
+	DB *dbp;
+	HASH_CURSOR *hcp;
+	db_pgno_t origpgno, pgno;
+	int ret;
+
+	hcp = (HASH_CURSOR *)dbc->internal;
+	dbp = dbc->dbp;
+	memcpy(&pgno,
+	    HOFFPAGE_PGNO(P_ENTRY(dbp, hcp->page, indx)), sizeof(db_pgno_t));
+	if (pgno > c_data->compact_truncate) {
+		c_data->compact_pages_examine++;
+		origpgno = pgno;
+		if ((ret = __memp_dirty(dbp->mpf, &hcp->page,
+		    dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
+			return (ret);
+		if ((ret =
+		     __db_truncate_root(dbc, hcp->page, indx, &pgno, 0)) != 0)
+			return (ret);
+		if (pgno != origpgno) {
+			memcpy(HOFFPAGE_PGNO(P_ENTRY(dbp, hcp->page, indx)),
+			    &pgno, sizeof(db_pgno_t));
+			(*pgs_done)++;
+			c_data->compact_pages--;
+		}
+	}
+	if ((ret = __db_truncate_overflow(dbc, pgno, NULL, c_data)) != 0)
+		return (ret);
+	return (0);
+}
+
+#ifdef HAVE_FTRUNCATE
+/*
+ * __ham_compact_hash -- compact the hash table.
+ * PUBLIC: int __ham_compact_hash __P((DB *,
+ * PUBLIC:       DB_THREAD_INFO *, DB_TXN *, DB_COMPACT *));
+ */
+int
+__ham_compact_hash(dbp, ip, txn, c_data)
+	DB *dbp;
+	DB_THREAD_INFO *ip;
+	DB_TXN *txn;
+	DB_COMPACT *c_data;
+{
+	DBC *dbc;
+	DB_LOCK lock;
+	HASH_CURSOR *hcp;
+	HMETA *meta;
+	PAGE *oldpage;
+	db_pgno_t free_pgno, last_pgno, pgno, start_pgno;
+	int flags, local_txn, ret, t_ret;
+	u_int32_t bucket, i, size;
+
+	local_txn = IS_DB_AUTO_COMMIT(dbp, txn);
+	oldpage = NULL;
+	dbc = NULL;
+	LOCK_INIT(lock);
+
+	if (local_txn &&
+	    (ret = __txn_begin(dbp->env, ip, txn, &txn, 0)) != 0)
+		return (ret);
+
+	if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
+		goto err1;
+	hcp = (HASH_CURSOR *)dbc->internal;
+
+	if ((ret = __ham_get_meta(dbc)) != 0 ||
+	    (ret = __ham_dirty_meta(dbc, 0)) != 0)
+		goto err1;
+
+	meta = hcp->hdr;
+
+	LOCK_CHECK_OFF(ip);
+
+	/*
+	 * Find contiguous lower numbered pages for each hash table segment.
+	 */
+	for (i = 0; i < NCACHED && meta->spares[i] != PGNO_INVALID; i++) {
+		if (i == 0) {
+			bucket = 0;
+			size = 1;
+		} else {
+			bucket = 1 << (i - 1);
+			size = bucket;
+		}
+		start_pgno = meta->spares[i] + bucket;
+		if ((ret = __db_find_free(dbc, P_HASH,
+		    size, start_pgno, &free_pgno)) != 0) {
+			if (ret != DB_NOTFOUND)
+				break;
+			ret = 0;
+			continue;
+		}
+		if (DBC_LOGGING(dbc)) {
+			if ((ret = __ham_changeslot_log(dbp,
+			    dbc->txn, &LSN(meta),
+			    0, &LSN(meta), i, start_pgno, free_pgno)) != 0)
+				break;
+		} else
+			LSN_NOT_LOGGED(LSN(meta));
+		last_pgno = free_pgno + bucket;
+		/*
+		 * March through the list swapping pages.  If the page is
+		 * empty we just need to free it.  If we are just sliding
+		 * things down don't free the pages that will be reused.
+		 * Note that __db_exchange_page returns the new page so
+		 * we must put it.
+		 */
+		for (pgno = start_pgno;
+		    pgno < start_pgno + size; pgno++, free_pgno++) {
+			if ((ret = __db_lget(dbc,
+			    LCK_COUPLE, pgno, DB_LOCK_WRITE, 0, &lock)) != 0)
+				goto err;
+			if ((ret = __memp_fget(dbp->mpf, &pgno,
+			     dbc->thread_info, dbc->txn,
+			     DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &oldpage)) != 0)
+				goto err;
+			if (NUM_ENT(oldpage) != 0) {
+				if (pgno < last_pgno)
+					flags = 0;
+				else
+					flags = DB_EXCH_FREE;
+				if ((ret = __db_exchange_page(dbc,
+				    &oldpage, NULL, free_pgno, flags)) != 0)
+					goto err;
+			} else if (pgno >= last_pgno) {
+				if ((ret = __db_free(dbc, oldpage, 0)) != 0)
+					goto err;
+				COMPACT_TRUNCATE(c_data);
+				oldpage = NULL;
+			}
+			if (oldpage != NULL && (ret = __memp_fput(dbp->mpf,
+			    dbc->thread_info, oldpage, dbc->priority)) != 0)
+				goto err;
+			ret = 0;
+			oldpage = NULL;
+			c_data->compact_pages_examine++;
+		}
+		meta->spares[i] = free_pgno - (size + bucket);
+	}
+	if (ret == 0 && F_ISSET(dbp, DB_AM_SUBDB) &&
+	    PGNO(hcp->hdr) > c_data->compact_truncate)
+		ret = __db_move_metadata(dbc, (DBMETA**)&hcp->hdr, c_data);
+
+err:	if (oldpage != NULL && (t_ret = __memp_fput(dbp->mpf,
+	    dbc->thread_info, oldpage, dbc->priority)) != 0 && ret == 0)
+		ret = t_ret;
+	if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
+		ret = t_ret;
+	LOCK_CHECK_ON(ip);
+err1:	if (dbc != NULL) {
+		if ((t_ret = __ham_release_meta(dbc)) != 0 && ret == 0)
+			ret = t_ret;
+		if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
+			ret = t_ret;
+	}
+	if (local_txn && (t_ret = (ret == 0 ?
+	    __txn_commit(txn, 0) : __txn_abort(txn))) != 0 && ret == 0)
+		ret = t_ret;
+
+	return (ret);
+}
+#endif