diff options
Diffstat (limited to 'bdb/dbinc/btree.h')
-rw-r--r-- | bdb/dbinc/btree.h | 320 |
1 files changed, 320 insertions, 0 deletions
diff --git a/bdb/dbinc/btree.h b/bdb/dbinc/btree.h new file mode 100644 index 00000000000..54da9c5b208 --- /dev/null +++ b/bdb/dbinc/btree.h @@ -0,0 +1,320 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2002 + * Sleepycat Software. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995, 1996 + * Keith Bostic. All rights reserved. + */ +/* + * Copyright (c) 1990, 1993, 1994, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Olson. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: btree.h,v 11.45 2002/08/06 06:11:21 bostic Exp $ + */ +#ifndef _DB_BTREE_H_ +#define _DB_BTREE_H_ + +/* Forward structure declarations. */ +struct __btree; typedef struct __btree BTREE; +struct __cursor; typedef struct __cursor BTREE_CURSOR; +struct __epg; typedef struct __epg EPG; +struct __recno; typedef struct __recno RECNO; + +#define DEFMINKEYPAGE (2) + +/* + * A recno order of 0 indicates that we don't have an order, not that we've + * an order less than 1. + */ +#define INVALID_ORDER 0 + +#define ISINTERNAL(p) (TYPE(p) == P_IBTREE || TYPE(p) == P_IRECNO) +#define ISLEAF(p) (TYPE(p) == P_LBTREE || \ + TYPE(p) == P_LRECNO || TYPE(p) == P_LDUP) + +/* Flags for __bam_cadjust_log(). */ +#define CAD_UPDATEROOT 0x01 /* Root page count was updated. */ + +/* Flags for __bam_split_log(). */ +#define SPL_NRECS 0x01 /* Split tree has record count. */ + +/* Flags for __bam_iitem(). */ +#define BI_DELETED 0x01 /* Key/data pair only placeholder. */ + +/* Flags for __bam_stkrel(). */ +#define STK_CLRDBC 0x01 /* Clear dbc->page reference. */ +#define STK_NOLOCK 0x02 /* Don't retain locks. */ + +/* Flags for __ram_ca(). These get logged, so make the values explicit. */ +typedef enum { + CA_DELETE = 0, /* Delete the current record. */ + CA_IAFTER = 1, /* Insert before the current record. */ + CA_IBEFORE = 2, /* Insert after the current record. */ + CA_ICURRENT = 3 /* Overwrite the current record. */ +} ca_recno_arg; + +/* + * Flags for __bam_search() and __bam_rsearch(). + * + * Note, internal page searches must find the largest record less than key in + * the tree so that descents work. Leaf page searches must find the smallest + * record greater than key so that the returned index is the record's correct + * position for insertion. + * + * The flags parameter to the search routines describes three aspects of the + * search: the type of locking required (including if we're locking a pair of + * pages), the item to return in the presence of duplicates and whether or not + * to return deleted entries. To simplify both the mnemonic representation + * and the code that checks for various cases, we construct a set of bitmasks. + */ +#define S_READ 0x00001 /* Read locks. */ +#define S_WRITE 0x00002 /* Write locks. */ + +#define S_APPEND 0x00040 /* Append to the tree. */ +#define S_DELNO 0x00080 /* Don't return deleted items. */ +#define S_DUPFIRST 0x00100 /* Return first duplicate. */ +#define S_DUPLAST 0x00200 /* Return last duplicate. */ +#define S_EXACT 0x00400 /* Exact items only. */ +#define S_PARENT 0x00800 /* Lock page pair. */ +#define S_STACK 0x01000 /* Need a complete stack. */ +#define S_PAST_EOF 0x02000 /* If doing insert search (or keyfirst + * or keylast operations), or a split + * on behalf of an insert, it's okay to + * return an entry one past end-of-page. + */ +#define S_STK_ONLY 0x04000 /* Just return info in the stack */ + +#define S_DELETE (S_WRITE | S_DUPFIRST | S_DELNO | S_EXACT | S_STACK) +#define S_FIND (S_READ | S_DUPFIRST | S_DELNO) +#define S_FIND_WR (S_WRITE | S_DUPFIRST | S_DELNO) +#define S_INSERT (S_WRITE | S_DUPLAST | S_PAST_EOF | S_STACK) +#define S_KEYFIRST (S_WRITE | S_DUPFIRST | S_PAST_EOF | S_STACK) +#define S_KEYLAST (S_WRITE | S_DUPLAST | S_PAST_EOF | S_STACK) +#define S_WRPAIR (S_WRITE | S_DUPLAST | S_PAST_EOF | S_PARENT) + +/* + * Various routines pass around page references. A page reference is + * a pointer to the page, and the indx indicates an item on the page. + * Each page reference may include a lock. + */ +struct __epg { + PAGE *page; /* The page. */ + db_indx_t indx; /* The index on the page. */ + db_indx_t entries; /* The number of entries on page */ + DB_LOCK lock; /* The page's lock. */ + db_lockmode_t lock_mode; /* The lock mode. */ +}; + +/* + * We maintain a stack of the pages that we're locking in the tree. Grow + * the stack as necessary. + * + * XXX + * Temporary fix for #3243 -- clear the page and lock from the stack entry. + * The correct fix is to never release a stack that doesn't hold items. + */ +#define BT_STK_CLR(c) do { \ + (c)->csp = (c)->sp; \ + (c)->csp->page = NULL; \ + LOCK_INIT((c)->csp->lock); \ +} while (0) + +#define BT_STK_ENTER(dbenv, c, pagep, page_indx, l, mode, ret) do { \ + if ((ret = \ + (c)->csp == (c)->esp ? __bam_stkgrow(dbenv, c) : 0) == 0) { \ + (c)->csp->page = pagep; \ + (c)->csp->indx = page_indx; \ + (c)->csp->entries = NUM_ENT(pagep); \ + (c)->csp->lock = l; \ + (c)->csp->lock_mode = mode; \ + } \ +} while (0) + +#define BT_STK_PUSH(dbenv, c, pagep, page_indx, lock, mode, ret) do { \ + BT_STK_ENTER(dbenv, c, pagep, page_indx, lock, mode, ret); \ + ++(c)->csp; \ +} while (0) + +#define BT_STK_NUM(dbenv, c, pagep, page_indx, ret) do { \ + if ((ret = \ + (c)->csp == (c)->esp ? __bam_stkgrow(dbenv, c) : 0) == 0) { \ + (c)->csp->page = NULL; \ + (c)->csp->indx = page_indx; \ + (c)->csp->entries = NUM_ENT(pagep); \ + LOCK_INIT((c)->csp->lock); \ + (c)->csp->lock_mode = DB_LOCK_NG; \ + } \ +} while (0) + +#define BT_STK_NUMPUSH(dbenv, c, pagep, page_indx, ret) do { \ + BT_STK_NUM(dbenv, cp, pagep, page_indx, ret); \ + ++(c)->csp; \ +} while (0) + +#define BT_STK_POP(c) \ + ((c)->csp == (c)->sp ? NULL : --(c)->csp) + +/* Btree/Recno cursor. */ +struct __cursor { + /* struct __dbc_internal */ + __DBC_INTERNAL + + /* btree private part */ + EPG *sp; /* Stack pointer. */ + EPG *csp; /* Current stack entry. */ + EPG *esp; /* End stack pointer. */ + EPG stack[5]; + + db_indx_t ovflsize; /* Maximum key/data on-page size. */ + + db_recno_t recno; /* Current record number. */ + u_int32_t order; /* Relative order among deleted curs. */ + + /* + * Btree: + * We set a flag in the cursor structure if the underlying object has + * been deleted. It's not strictly necessary, we could get the same + * information by looking at the page itself, but this method doesn't + * require us to retrieve the page on cursor delete. + * + * Recno: + * When renumbering recno databases during deletes, cursors referencing + * "deleted" records end up positioned between two records, and so must + * be specially adjusted on the next operation. + */ +#define C_DELETED 0x0001 /* Record was deleted. */ + /* + * There are three tree types that require maintaining record numbers. + * Recno AM trees, Btree AM trees for which the DB_RECNUM flag was set, + * and Btree off-page duplicate trees. + */ +#define C_RECNUM 0x0002 /* Tree requires record counts. */ + /* + * Recno trees have immutable record numbers by default, but optionally + * support mutable record numbers. Off-page duplicate Recno trees have + * mutable record numbers. All Btrees with record numbers (including + * off-page duplicate trees) are mutable by design, no flag is needed. + */ +#define C_RENUMBER 0x0004 /* Tree records are mutable. */ + u_int32_t flags; +}; + +/* + * Threshhold value, as a function of bt_minkey, of the number of + * bytes a key/data pair can use before being placed on an overflow + * page. Assume every item requires the maximum alignment for + * padding, out of sheer paranoia. + */ +#define B_MINKEY_TO_OVFLSIZE(dbp, minkey, pgsize) \ + ((u_int16_t)(((pgsize) - P_OVERHEAD(dbp)) / ((minkey) * P_INDX) -\ + (BKEYDATA_PSIZE(0) + ALIGN(1, sizeof(int32_t))))) + +/* + * The maximum space that a single item can ever take up on one page. + * Used by __bam_split to determine whether a split is still necessary. + */ +#define B_MAX(a,b) (((a) > (b)) ? (a) : (b)) +#define B_MAXSIZEONPAGE(ovflsize) \ + (B_MAX(BOVERFLOW_PSIZE, BKEYDATA_PSIZE(ovflsize))) + +/* + * The in-memory, per-tree btree/recno data structure. + */ +struct __btree { /* Btree access method. */ + /* + * !!! + * These fields are write-once (when the structure is created) and + * so are ignored as far as multi-threading is concerned. + */ + db_pgno_t bt_meta; /* Database meta-data page. */ + db_pgno_t bt_root; /* Database root page. */ + + u_int32_t bt_maxkey; /* Maximum keys per page. */ + u_int32_t bt_minkey; /* Minimum keys per page. */ + + /* Btree comparison function. */ + int (*bt_compare) __P((DB *, const DBT *, const DBT *)); + /* Btree prefix function. */ + size_t (*bt_prefix) __P((DB *, const DBT *, const DBT *)); + + /* Recno access method. */ + int re_pad; /* Fixed-length padding byte. */ + int re_delim; /* Variable-length delimiting byte. */ + u_int32_t re_len; /* Length for fixed-length records. */ + char *re_source; /* Source file name. */ + + /* + * !!! + * The bt_lpgno field is NOT protected by any mutex, and for this + * reason must be advisory only, so, while it is read/written by + * multiple threads, DB is completely indifferent to the quality + * of its information. + */ + db_pgno_t bt_lpgno; /* Last insert location. */ + + /* + * !!! + * The re_modified field is NOT protected by any mutex, and for this + * reason cannot be anything more complicated than a zero/non-zero + * value. The actual writing of the backing source file cannot be + * threaded, so clearing the flag isn't a problem. + */ + int re_modified; /* If the tree was modified. */ + + /* + * !!! + * These fields are ignored as far as multi-threading is concerned. + * There are no transaction semantics associated with backing files, + * nor is there any thread protection. + */ + FILE *re_fp; /* Source file handle. */ + int re_eof; /* Backing source file EOF reached. */ + db_recno_t re_last; /* Last record number read. */ +}; + +/* + * Modes for the __bam_curadj recovery records (btree_curadj). + * These appear in log records, so we wire the values and + * do not leave it up to the compiler. + */ +typedef enum { + DB_CA_DI = 1, + DB_CA_DUP = 2, + DB_CA_RSPLIT = 3, + DB_CA_SPLIT = 4 +} db_ca_mode; + +#include "dbinc_auto/btree_auto.h" +#include "dbinc_auto/btree_ext.h" +#include "dbinc/db_am.h" +#endif /* !_DB_BTREE_H_ */ |