summaryrefslogtreecommitdiff
path: root/bdb/dbinc/btree.h
diff options
context:
space:
mode:
Diffstat (limited to 'bdb/dbinc/btree.h')
-rw-r--r--bdb/dbinc/btree.h320
1 files changed, 320 insertions, 0 deletions
diff --git a/bdb/dbinc/btree.h b/bdb/dbinc/btree.h
new file mode 100644
index 00000000000..54da9c5b208
--- /dev/null
+++ b/bdb/dbinc/btree.h
@@ -0,0 +1,320 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996-2002
+ * Sleepycat Software. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995, 1996
+ * Keith Bostic. All rights reserved.
+ */
+/*
+ * Copyright (c) 1990, 1993, 1994, 1995
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Olson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id: btree.h,v 11.45 2002/08/06 06:11:21 bostic Exp $
+ */
+#ifndef _DB_BTREE_H_
+#define _DB_BTREE_H_
+
+/* Forward structure declarations. */
+struct __btree; typedef struct __btree BTREE;
+struct __cursor; typedef struct __cursor BTREE_CURSOR;
+struct __epg; typedef struct __epg EPG;
+struct __recno; typedef struct __recno RECNO;
+
+#define DEFMINKEYPAGE (2)
+
+/*
+ * A recno order of 0 indicates that we don't have an order, not that we've
+ * an order less than 1.
+ */
+#define INVALID_ORDER 0
+
+#define ISINTERNAL(p) (TYPE(p) == P_IBTREE || TYPE(p) == P_IRECNO)
+#define ISLEAF(p) (TYPE(p) == P_LBTREE || \
+ TYPE(p) == P_LRECNO || TYPE(p) == P_LDUP)
+
+/* Flags for __bam_cadjust_log(). */
+#define CAD_UPDATEROOT 0x01 /* Root page count was updated. */
+
+/* Flags for __bam_split_log(). */
+#define SPL_NRECS 0x01 /* Split tree has record count. */
+
+/* Flags for __bam_iitem(). */
+#define BI_DELETED 0x01 /* Key/data pair only placeholder. */
+
+/* Flags for __bam_stkrel(). */
+#define STK_CLRDBC 0x01 /* Clear dbc->page reference. */
+#define STK_NOLOCK 0x02 /* Don't retain locks. */
+
+/* Flags for __ram_ca(). These get logged, so make the values explicit. */
+typedef enum {
+ CA_DELETE = 0, /* Delete the current record. */
+ CA_IAFTER = 1, /* Insert before the current record. */
+ CA_IBEFORE = 2, /* Insert after the current record. */
+ CA_ICURRENT = 3 /* Overwrite the current record. */
+} ca_recno_arg;
+
+/*
+ * Flags for __bam_search() and __bam_rsearch().
+ *
+ * Note, internal page searches must find the largest record less than key in
+ * the tree so that descents work. Leaf page searches must find the smallest
+ * record greater than key so that the returned index is the record's correct
+ * position for insertion.
+ *
+ * The flags parameter to the search routines describes three aspects of the
+ * search: the type of locking required (including if we're locking a pair of
+ * pages), the item to return in the presence of duplicates and whether or not
+ * to return deleted entries. To simplify both the mnemonic representation
+ * and the code that checks for various cases, we construct a set of bitmasks.
+ */
+#define S_READ 0x00001 /* Read locks. */
+#define S_WRITE 0x00002 /* Write locks. */
+
+#define S_APPEND 0x00040 /* Append to the tree. */
+#define S_DELNO 0x00080 /* Don't return deleted items. */
+#define S_DUPFIRST 0x00100 /* Return first duplicate. */
+#define S_DUPLAST 0x00200 /* Return last duplicate. */
+#define S_EXACT 0x00400 /* Exact items only. */
+#define S_PARENT 0x00800 /* Lock page pair. */
+#define S_STACK 0x01000 /* Need a complete stack. */
+#define S_PAST_EOF 0x02000 /* If doing insert search (or keyfirst
+ * or keylast operations), or a split
+ * on behalf of an insert, it's okay to
+ * return an entry one past end-of-page.
+ */
+#define S_STK_ONLY 0x04000 /* Just return info in the stack */
+
+#define S_DELETE (S_WRITE | S_DUPFIRST | S_DELNO | S_EXACT | S_STACK)
+#define S_FIND (S_READ | S_DUPFIRST | S_DELNO)
+#define S_FIND_WR (S_WRITE | S_DUPFIRST | S_DELNO)
+#define S_INSERT (S_WRITE | S_DUPLAST | S_PAST_EOF | S_STACK)
+#define S_KEYFIRST (S_WRITE | S_DUPFIRST | S_PAST_EOF | S_STACK)
+#define S_KEYLAST (S_WRITE | S_DUPLAST | S_PAST_EOF | S_STACK)
+#define S_WRPAIR (S_WRITE | S_DUPLAST | S_PAST_EOF | S_PARENT)
+
+/*
+ * Various routines pass around page references. A page reference is
+ * a pointer to the page, and the indx indicates an item on the page.
+ * Each page reference may include a lock.
+ */
+struct __epg {
+ PAGE *page; /* The page. */
+ db_indx_t indx; /* The index on the page. */
+ db_indx_t entries; /* The number of entries on page */
+ DB_LOCK lock; /* The page's lock. */
+ db_lockmode_t lock_mode; /* The lock mode. */
+};
+
+/*
+ * We maintain a stack of the pages that we're locking in the tree. Grow
+ * the stack as necessary.
+ *
+ * XXX
+ * Temporary fix for #3243 -- clear the page and lock from the stack entry.
+ * The correct fix is to never release a stack that doesn't hold items.
+ */
+#define BT_STK_CLR(c) do { \
+ (c)->csp = (c)->sp; \
+ (c)->csp->page = NULL; \
+ LOCK_INIT((c)->csp->lock); \
+} while (0)
+
+#define BT_STK_ENTER(dbenv, c, pagep, page_indx, l, mode, ret) do { \
+ if ((ret = \
+ (c)->csp == (c)->esp ? __bam_stkgrow(dbenv, c) : 0) == 0) { \
+ (c)->csp->page = pagep; \
+ (c)->csp->indx = page_indx; \
+ (c)->csp->entries = NUM_ENT(pagep); \
+ (c)->csp->lock = l; \
+ (c)->csp->lock_mode = mode; \
+ } \
+} while (0)
+
+#define BT_STK_PUSH(dbenv, c, pagep, page_indx, lock, mode, ret) do { \
+ BT_STK_ENTER(dbenv, c, pagep, page_indx, lock, mode, ret); \
+ ++(c)->csp; \
+} while (0)
+
+#define BT_STK_NUM(dbenv, c, pagep, page_indx, ret) do { \
+ if ((ret = \
+ (c)->csp == (c)->esp ? __bam_stkgrow(dbenv, c) : 0) == 0) { \
+ (c)->csp->page = NULL; \
+ (c)->csp->indx = page_indx; \
+ (c)->csp->entries = NUM_ENT(pagep); \
+ LOCK_INIT((c)->csp->lock); \
+ (c)->csp->lock_mode = DB_LOCK_NG; \
+ } \
+} while (0)
+
+#define BT_STK_NUMPUSH(dbenv, c, pagep, page_indx, ret) do { \
+ BT_STK_NUM(dbenv, cp, pagep, page_indx, ret); \
+ ++(c)->csp; \
+} while (0)
+
+#define BT_STK_POP(c) \
+ ((c)->csp == (c)->sp ? NULL : --(c)->csp)
+
+/* Btree/Recno cursor. */
+struct __cursor {
+ /* struct __dbc_internal */
+ __DBC_INTERNAL
+
+ /* btree private part */
+ EPG *sp; /* Stack pointer. */
+ EPG *csp; /* Current stack entry. */
+ EPG *esp; /* End stack pointer. */
+ EPG stack[5];
+
+ db_indx_t ovflsize; /* Maximum key/data on-page size. */
+
+ db_recno_t recno; /* Current record number. */
+ u_int32_t order; /* Relative order among deleted curs. */
+
+ /*
+ * Btree:
+ * We set a flag in the cursor structure if the underlying object has
+ * been deleted. It's not strictly necessary, we could get the same
+ * information by looking at the page itself, but this method doesn't
+ * require us to retrieve the page on cursor delete.
+ *
+ * Recno:
+ * When renumbering recno databases during deletes, cursors referencing
+ * "deleted" records end up positioned between two records, and so must
+ * be specially adjusted on the next operation.
+ */
+#define C_DELETED 0x0001 /* Record was deleted. */
+ /*
+ * There are three tree types that require maintaining record numbers.
+ * Recno AM trees, Btree AM trees for which the DB_RECNUM flag was set,
+ * and Btree off-page duplicate trees.
+ */
+#define C_RECNUM 0x0002 /* Tree requires record counts. */
+ /*
+ * Recno trees have immutable record numbers by default, but optionally
+ * support mutable record numbers. Off-page duplicate Recno trees have
+ * mutable record numbers. All Btrees with record numbers (including
+ * off-page duplicate trees) are mutable by design, no flag is needed.
+ */
+#define C_RENUMBER 0x0004 /* Tree records are mutable. */
+ u_int32_t flags;
+};
+
+/*
+ * Threshhold value, as a function of bt_minkey, of the number of
+ * bytes a key/data pair can use before being placed on an overflow
+ * page. Assume every item requires the maximum alignment for
+ * padding, out of sheer paranoia.
+ */
+#define B_MINKEY_TO_OVFLSIZE(dbp, minkey, pgsize) \
+ ((u_int16_t)(((pgsize) - P_OVERHEAD(dbp)) / ((minkey) * P_INDX) -\
+ (BKEYDATA_PSIZE(0) + ALIGN(1, sizeof(int32_t)))))
+
+/*
+ * The maximum space that a single item can ever take up on one page.
+ * Used by __bam_split to determine whether a split is still necessary.
+ */
+#define B_MAX(a,b) (((a) > (b)) ? (a) : (b))
+#define B_MAXSIZEONPAGE(ovflsize) \
+ (B_MAX(BOVERFLOW_PSIZE, BKEYDATA_PSIZE(ovflsize)))
+
+/*
+ * The in-memory, per-tree btree/recno data structure.
+ */
+struct __btree { /* Btree access method. */
+ /*
+ * !!!
+ * These fields are write-once (when the structure is created) and
+ * so are ignored as far as multi-threading is concerned.
+ */
+ db_pgno_t bt_meta; /* Database meta-data page. */
+ db_pgno_t bt_root; /* Database root page. */
+
+ u_int32_t bt_maxkey; /* Maximum keys per page. */
+ u_int32_t bt_minkey; /* Minimum keys per page. */
+
+ /* Btree comparison function. */
+ int (*bt_compare) __P((DB *, const DBT *, const DBT *));
+ /* Btree prefix function. */
+ size_t (*bt_prefix) __P((DB *, const DBT *, const DBT *));
+
+ /* Recno access method. */
+ int re_pad; /* Fixed-length padding byte. */
+ int re_delim; /* Variable-length delimiting byte. */
+ u_int32_t re_len; /* Length for fixed-length records. */
+ char *re_source; /* Source file name. */
+
+ /*
+ * !!!
+ * The bt_lpgno field is NOT protected by any mutex, and for this
+ * reason must be advisory only, so, while it is read/written by
+ * multiple threads, DB is completely indifferent to the quality
+ * of its information.
+ */
+ db_pgno_t bt_lpgno; /* Last insert location. */
+
+ /*
+ * !!!
+ * The re_modified field is NOT protected by any mutex, and for this
+ * reason cannot be anything more complicated than a zero/non-zero
+ * value. The actual writing of the backing source file cannot be
+ * threaded, so clearing the flag isn't a problem.
+ */
+ int re_modified; /* If the tree was modified. */
+
+ /*
+ * !!!
+ * These fields are ignored as far as multi-threading is concerned.
+ * There are no transaction semantics associated with backing files,
+ * nor is there any thread protection.
+ */
+ FILE *re_fp; /* Source file handle. */
+ int re_eof; /* Backing source file EOF reached. */
+ db_recno_t re_last; /* Last record number read. */
+};
+
+/*
+ * Modes for the __bam_curadj recovery records (btree_curadj).
+ * These appear in log records, so we wire the values and
+ * do not leave it up to the compiler.
+ */
+typedef enum {
+ DB_CA_DI = 1,
+ DB_CA_DUP = 2,
+ DB_CA_RSPLIT = 3,
+ DB_CA_SPLIT = 4
+} db_ca_mode;
+
+#include "dbinc_auto/btree_auto.h"
+#include "dbinc_auto/btree_ext.h"
+#include "dbinc/db_am.h"
+#endif /* !_DB_BTREE_H_ */