summaryrefslogtreecommitdiff
path: root/bdb/env
diff options
context:
space:
mode:
authorunknown <tim@threads.polyesthetic.msg>2001-03-04 19:42:05 -0500
committerunknown <tim@threads.polyesthetic.msg>2001-03-04 19:42:05 -0500
commitec6ae091617bdfdca9e65e8d3e65b950d234f676 (patch)
tree9dd732e08dba156ee3d7635caedc0dc3107ecac6 /bdb/env
parent87d70fb598105b64b538ff6b81eef9da626255b1 (diff)
downloadmariadb-git-ec6ae091617bdfdca9e65e8d3e65b950d234f676.tar.gz
Import changeset
Diffstat (limited to 'bdb/env')
-rw-r--r--bdb/env/db_salloc.c360
-rw-r--r--bdb/env/db_shash.c124
-rw-r--r--bdb/env/env_method.c461
-rw-r--r--bdb/env/env_open.c1064
-rw-r--r--bdb/env/env_recover.c449
-rw-r--r--bdb/env/env_region.c1205
6 files changed, 3663 insertions, 0 deletions
diff --git a/bdb/env/db_salloc.c b/bdb/env/db_salloc.c
new file mode 100644
index 00000000000..4780107c593
--- /dev/null
+++ b/bdb/env/db_salloc.c
@@ -0,0 +1,360 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: db_salloc.c,v 11.10 2000/12/06 19:55:44 ubell Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <stdlib.h>
+#include <string.h>
+#endif
+
+#include "db_int.h"
+
+/*
+ * Implement shared memory region allocation, using simple first-fit algorithm.
+ * The model is that we take a "chunk" of shared memory store and begin carving
+ * it up into areas, similarly to how malloc works. We do coalescing on free.
+ *
+ * The "len" field in the __data struct contains the length of the free region
+ * (less the size_t bytes that holds the length). We use the address provided
+ * by the caller to find this length, which allows us to free a chunk without
+ * requiring that the caller pass in the length of the chunk they're freeing.
+ */
+SH_LIST_HEAD(__head); /* Head of the list of free chunks. */
+struct __data {
+ size_t len; /* Chunk length, not counting this field. */
+ SH_LIST_ENTRY links; /* Free-list linkage (used while free). */
+};
+
+/*
+ * __db_shalloc_init --
+ *	Prepare an area for allocation: the free list head is placed at the
+ *	start of the area and the remainder becomes one large free chunk.
+ *
+ * PUBLIC: void __db_shalloc_init __P((void *, size_t));
+ */
+void
+__db_shalloc_init(area, size)
+	void *area;
+	size_t size;
+{
+	struct __head *listp;
+	struct __data *chunk;
+
+	/* The free list head occupies the first bytes of the area. */
+	listp = (struct __head *)area;
+	SH_LIST_INIT(listp);
+
+	/*
+	 * The single initial chunk starts right after the head; its length
+	 * excludes both the head and the chunk's own length field.
+	 */
+	chunk = (struct __data *)(listp + 1);
+	chunk->len = size - (sizeof(struct __head) + sizeof(chunk->len));
+	SH_LIST_INSERT_HEAD(listp, chunk, links, __data);
+}
+
+/*
+ * __db_shalloc_size --
+ *	Return the amount of space to budget for an allocation of len bytes
+ *	at the given alignment: the adjusted, aligned length plus the size
+ *	of a struct __data.
+ *
+ * PUBLIC: int __db_shalloc_size __P((size_t, size_t));
+ */
+int
+__db_shalloc_size(len, align)
+	size_t len, align;
+{
+	size_t need;
+
+	/* An allocation is never smaller than a struct __data. */
+	need = len < sizeof(struct __data) ? sizeof(struct __data) : len;
+
+#ifdef DIAGNOSTIC
+	/* One extra byte holds the guard byte. */
+	need += 1;
+#endif
+
+	/* Alignment is never less than a db_align_t boundary. */
+	if (align <= sizeof(db_align_t))
+		align = sizeof(db_align_t);
+
+	return (ALIGN(need, align) + sizeof (struct __data));
+}
+
+/*
+ * __db_shalloc --
+ * Allocate some space from the shared region.
+ *
+ * p is the start of the region (the free list head), len and align are
+ * the requested size and alignment, and the address of the allocated
+ * memory is stored through retp, which is treated as a void **.
+ * Returns 0 on success, or ENOMEM if no free chunk is large enough.
+ *
+ * NOTE(review): the address masking below presumes align is a power of
+ * two -- confirm against callers.
+ *
+ * PUBLIC: int __db_shalloc __P((void *, size_t, size_t, void *));
+ */
+int
+__db_shalloc(p, len, align, retp)
+ void *p, *retp;
+ size_t len, align;
+{
+ struct __data *elp;
+ size_t *sp;
+ void *rp;
+
+ /* Never allocate less than the size of a struct __data. */
+ if (len < sizeof(struct __data))
+ len = sizeof(struct __data);
+
+#ifdef DIAGNOSTIC
+ /* Add room for a guard byte. */
+ ++len;
+#endif
+
+ /* Never align to less than a db_align_t boundary. */
+ if (align <= sizeof(db_align_t))
+ align = sizeof(db_align_t);
+
+ /* Walk the list, looking for a slot. */
+ for (elp = SH_LIST_FIRST((struct __head *)p, __data);
+ elp != NULL;
+ elp = SH_LIST_NEXT(elp, links, __data)) {
+ /*
+ * Calculate the value of the returned pointer if we were to
+ * use this chunk.
+ * + Find the end of the chunk.
+ * + Subtract the memory the user wants.
+ * + Find the closest previous correctly-aligned address.
+ *
+ * The allocation is therefore carved from the tail of the
+ * free chunk, leaving the front of the chunk on the list.
+ */
+ rp = (u_int8_t *)elp + sizeof(size_t) + elp->len;
+ rp = (u_int8_t *)rp - len;
+ rp = (u_int8_t *)((db_alignp_t)rp & ~(align - 1));
+
+ /*
+ * Rp may now point before elp->links, in which case the chunk
+ * was too small, and we have to try again.
+ */
+ if ((u_int8_t *)rp < (u_int8_t *)&elp->links)
+ continue;
+
+ *(void **)retp = rp; /* Hand the address back to the caller. */
+#ifdef DIAGNOSTIC
+ /*
+ * At this point, whether or not we still need to split up a
+ * chunk, retp is the address of the region we are returning,
+ * and (u_int8_t *)elp + sizeof(size_t) + elp->len gives us
+ * the address of the first byte after the end of the chunk.
+ * Make the byte immediately before that the guard byte.
+ */
+ *((u_int8_t *)elp + sizeof(size_t) + elp->len - 1) = GUARD_BYTE;
+#endif
+
+#define SHALLOC_FRAGMENT 32
+ /*
+ * If there are at least SHALLOC_FRAGMENT additional bytes of
+ * memory, divide the chunk into two chunks.
+ *
+ * The length of the piece being returned is stored in the
+ * size_t immediately before rp, and the chunk left on the free
+ * list shrinks by that piece plus its length field.
+ */
+ if ((u_int8_t *)rp >=
+ (u_int8_t *)&elp->links + SHALLOC_FRAGMENT) {
+ sp = rp;
+ *--sp = elp->len -
+ ((u_int8_t *)rp - (u_int8_t *)&elp->links);
+ elp->len -= *sp + sizeof(size_t);
+ return (0);
+ }
+
+ /*
+ * Otherwise, we return the entire chunk, wasting some amount
+ * of space to keep the list compact. However, because the
+ * address we're returning to the user may not be the address
+ * of the start of the chunk for alignment reasons, fill every
+ * size_t slot between the "real" length field and the returned
+ * address with a flag value, so that free can step back over
+ * them to find the real length.
+ */
+#define ILLEGAL_SIZE 1
+ SH_LIST_REMOVE(elp, links, __data);
+ for (sp = rp; (u_int8_t *)--sp >= (u_int8_t *)&elp->links;)
+ *sp = ILLEGAL_SIZE;
+ return (0);
+ }
+
+ return (ENOMEM); /* No chunk on the list was large enough. */
+}
+
+/*
+ * __db_shalloc_free --
+ * Free a shared memory allocation.
+ *
+ * regionp is the start of the region (the free list head) and ptr is
+ * the address being returned to the free list. The chunk is placed on
+ * the list in address order and merged with a directly adjacent free
+ * chunk on either side.
+ *
+ * PUBLIC: void __db_shalloc_free __P((void *, void *));
+ */
+void
+__db_shalloc_free(regionp, ptr)
+ void *regionp, *ptr;
+{
+ struct __data *elp, *lastp, *newp;
+ struct __head *hp;
+ size_t free_size, *sp;
+ int merged;
+
+ /*
+ * Step back over flagged length fields to find the beginning of
+ * the object and its real size.
+ */
+ for (sp = (size_t *)ptr; sp[-1] == ILLEGAL_SIZE; --sp)
+ ;
+ ptr = sp; /* From here on ptr is the first byte after the length. */
+
+ newp = (struct __data *)((u_int8_t *)ptr - sizeof(size_t));
+ free_size = newp->len;
+
+#ifdef DIAGNOSTIC
+ /*
+ * The "real size" includes the guard byte; it's just the last
+ * byte in the chunk, and the caller never knew it existed.
+ *
+ * Check it to make sure it hasn't been stomped.
+ */
+ if (*((u_int8_t *)ptr + free_size - 1) != GUARD_BYTE) {
+ /*
+ * Eventually, once we push a DB_ENV handle down to these
+ * routines, we should use the standard output channels.
+ */
+ fprintf(stderr,
+ "Guard byte incorrect during shared memory free.\n");
+ abort();
+ /* NOTREACHED */
+ }
+
+ /* Trash the returned memory (including guard byte). */
+ memset(ptr, CLEAR_BYTE, free_size);
+#endif
+
+ /*
+ * Walk the list, looking for where this entry goes.
+ *
+ * We keep the free list sorted by address so that coalescing is
+ * trivial.
+ *
+ * XXX
+ * Probably worth profiling this to see how expensive it is.
+ */
+ hp = (struct __head *)regionp;
+ for (elp = SH_LIST_FIRST(hp, __data), lastp = NULL;
+ elp != NULL && (void *)elp < (void *)ptr;
+ lastp = elp, elp = SH_LIST_NEXT(elp, links, __data))
+ ;
+
+ /*
+ * Elp is either NULL (we reached the end of the list), or the slot
+ * after the one that's being returned. Lastp is either NULL (we're
+ * returning the first element of the list) or the element before the
+ * one being returned.
+ *
+ * Check for coalescing with the next element.
+ *
+ * When the freed chunk ends exactly where elp begins, elp is
+ * absorbed into the freed chunk and the freed chunk takes elp's
+ * place on the list.
+ */
+ merged = 0;
+ if ((u_int8_t *)ptr + free_size == (u_int8_t *)elp) {
+ newp->len += elp->len + sizeof(size_t);
+ SH_LIST_REMOVE(elp, links, __data);
+ if (lastp != NULL)
+ SH_LIST_INSERT_AFTER(lastp, newp, links, __data);
+ else
+ SH_LIST_INSERT_HEAD(hp, newp, links, __data);
+ merged = 1;
+ }
+
+ /* Check for coalescing with the previous element. */
+ if (lastp != NULL && (u_int8_t *)lastp +
+ lastp->len + sizeof(size_t) == (u_int8_t *)newp) {
+ lastp->len += newp->len + sizeof(size_t);
+
+ /*
+ * If we have already put the new element into the list take
+ * it back off again because it's just been merged with the
+ * previous element.
+ */
+ if (merged)
+ SH_LIST_REMOVE(newp, links, __data);
+ merged = 1;
+ }
+
+ if (!merged) { /* No neighbor to merge with: plain insertion. */
+ if (lastp == NULL)
+ SH_LIST_INSERT_HEAD(hp, newp, links, __data);
+ else
+ SH_LIST_INSERT_AFTER(lastp, newp, links, __data);
+ }
+}
+
+/*
+ * __db_shalloc_count --
+ *	Return the total of the len fields of every chunk on the free list.
+ *
+ * PUBLIC: size_t __db_shalloc_count __P((void *));
+ */
+size_t
+__db_shalloc_count(addr)
+	void *addr;
+{
+	struct __data *chunk;
+	size_t total;
+
+	total = 0;
+	chunk = SH_LIST_FIRST((struct __head *)addr, __data);
+	while (chunk != NULL) {
+		total += chunk->len;
+		chunk = SH_LIST_NEXT(chunk, links, __data);
+	}
+
+	return (total);
+}
+
+/*
+ * __db_shsizeof --
+ *	Return the size of a shalloc'd piece of memory.
+ *
+ * !!!
+ * The value is the allocator's internal view of the piece: besides the
+ * memory in use it counts the alignment bytes in front of it and, #ifdef
+ * DIAGNOSTIC, the guard byte at the end.
+ *
+ * PUBLIC: size_t __db_shsizeof __P((void *));
+ */
+size_t
+__db_shsizeof(ptr)
+	void *ptr;
+{
+	struct __data *chunk;
+	size_t *sp;
+
+	/*
+	 * Back up over any flagged length slots; the first slot that is
+	 * not flagged is the chunk's real length field.
+	 */
+	sp = (size_t *)ptr;
+	while (sp[-1] == ILLEGAL_SIZE)
+		--sp;
+
+	chunk = (struct __data *)(sp - 1);
+	return (chunk->len);
+}
+
+/*
+ * __db_shalloc_dump --
+ *	Print the address and length of every chunk on the free list.
+ *
+ * PUBLIC: void __db_shalloc_dump __P((void *, FILE *));
+ */
+void
+__db_shalloc_dump(addr, fp)
+	void *addr;
+	FILE *fp;
+{
+	struct __data *chunk;
+
+	/* A NULL stream means stderr, which is handy from a debugger. */
+	if (fp == NULL)
+		fp = stderr;
+
+	fprintf(fp, "%s\nMemory free list\n", DB_LINE);
+
+	chunk = SH_LIST_FIRST((struct __head *)addr, __data);
+	while (chunk != NULL) {
+		fprintf(fp, "%#lx: %lu\t", (u_long)chunk, (u_long)chunk->len);
+		chunk = SH_LIST_NEXT(chunk, links, __data);
+	}
+	fprintf(fp, "\n");
+}
diff --git a/bdb/env/db_shash.c b/bdb/env/db_shash.c
new file mode 100644
index 00000000000..1c33b383098
--- /dev/null
+++ b/bdb/env/db_shash.c
@@ -0,0 +1,124 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: db_shash.c,v 11.3 2000/02/14 02:59:49 bostic Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+#endif
+
+#include "db_int.h"
+
+/*
+ * Table of good hash values. Up to ~250,000 buckets, we use powers of 2.
+ * After that, we slow the rate of increase by half. For each choice, we
+ * then use a nearby prime number as the hash value.
+ *
+ * If a terabyte is the maximum cache we'll see, and we assume there are
+ * 10 1K buckets on each hash chain, then 107374182 is the maximum number
+ * of buckets we'll ever need.
+ */
+static const struct {
+ u_int32_t power; /* Bucket-count threshold for this entry. */
+ u_int32_t prime; /* Table size returned for that threshold. */
+} list[] = {
+ { 64, 67}, /* 2^6 */
+ { 128, 131}, /* 2^7 */
+ { 256, 257}, /* 2^8 */
+ { 512, 521}, /* 2^9 */
+ { 1024, 1031}, /* 2^10 */
+ { 2048, 2053}, /* 2^11 */
+ { 4096, 4099}, /* 2^12 */
+ { 8192, 8191}, /* 2^13 */
+ { 16384, 16381}, /* 2^14 */
+ { 32768, 32771}, /* 2^15 */
+ { 65536, 65537}, /* 2^16 */
+ { 131072, 131071}, /* 2^17 */
+ { 262144, 262147}, /* 2^18 */
+ { 393216, 393209}, /* 2^18 + 2^18/2 */
+ { 524288, 524287}, /* 2^19 */
+ { 786432, 786431}, /* 2^19 + 2^19/2 */
+ { 1048576, 1048573}, /* 2^20 */
+ { 1572864, 1572869}, /* 2^20 + 2^20/2 */
+ { 2097152, 2097169}, /* 2^21 */
+ { 3145728, 3145721}, /* 2^21 + 2^21/2 */
+ { 4194304, 4194301}, /* 2^22 */
+ { 6291456, 6291449}, /* 2^22 + 2^22/2 */
+ { 8388608, 8388617}, /* 2^23 */
+ { 12582912, 12582917}, /* 2^23 + 2^23/2 */
+ { 16777216, 16777213}, /* 2^24 */
+ { 25165824, 25165813}, /* 2^24 + 2^24/2 */
+ { 33554432, 33554393}, /* 2^25 */
+ { 50331648, 50331653}, /* 2^25 + 2^25/2 */
+ { 67108864, 67108859}, /* 2^26 */
+ { 100663296, 100663291}, /* 2^26 + 2^26/2 */
+ { 134217728, 134217757}, /* 2^27 */
+ { 201326592, 201326611}, /* 2^27 + 2^27/2 */
+ { 268435456, 268435459}, /* 2^28 */
+ { 402653184, 402653189}, /* 2^28 + 2^28/2 */
+ { 536870912, 536870909}, /* 2^29 */
+ { 805306368, 805306357}, /* 2^29 + 2^29/2 */
+ {1073741824, 1073741827}, /* 2^30 */
+ {0, 0} /* End-of-table sentinel (power == 0). */
+};
+
+/*
+ * __db_tablesize --
+ *	Pick the hash table size to use for a suggested number of buckets.
+ *
+ * PUBLIC: int __db_tablesize __P((u_int32_t));
+ */
+int
+__db_tablesize(n_buckets)
+	u_int32_t n_buckets;
+{
+	int i;
+
+	/*
+	 * Hash tables are sized to a prime close to the "suggested" number
+	 * of elements, and a suggestion below 64 is treated as 64.
+	 *
+	 * Ref: Sedgewick, Algorithms in C, "Hash Functions"
+	 */
+	if (n_buckets < 64)
+		n_buckets = 64;
+
+	/* Take the first entry whose threshold covers the request... */
+	for (i = 0; list[i].power != 0; ++i)
+		if (list[i].power >= n_buckets)
+			return (list[i].prime);
+
+	/* ...or the last real entry when the request exceeds them all. */
+	return (list[i - 1].prime);
+}
+
+/*
+ * __db_hashinit --
+ *	Initialize each of the nelements bucket heads of a hash table that
+ *	resides in shared memory, starting at begin.
+ *
+ * PUBLIC: void __db_hashinit __P((void *, u_int32_t));
+ */
+void
+__db_hashinit(begin, nelements)
+	void *begin;
+	u_int32_t nelements;
+{
+	SH_TAILQ_HEAD(hash_head) *bucket;
+	u_int32_t remaining;
+
+	bucket = (struct hash_head *)begin;
+	for (remaining = nelements; remaining != 0; --remaining) {
+		SH_TAILQ_INIT(bucket);
+		++bucket;
+	}
+}
diff --git a/bdb/env/env_method.c b/bdb/env/env_method.c
new file mode 100644
index 00000000000..c5f45df7124
--- /dev/null
+++ b/bdb/env/env_method.c
@@ -0,0 +1,461 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: env_method.c,v 11.31 2000/11/30 00:58:35 ubell Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <string.h>
+#endif
+
+#ifdef HAVE_RPC
+#include "db_server.h"
+#endif
+
+/*
+ * This is the file that initializes the global array. Do it this way because
+ * people keep changing one without changing the other. Having declaration and
+ * initialization in one file will hopefully fix that.
+ */
+#define DB_INITIALIZE_DB_GLOBALS 1
+
+#include "db_int.h"
+#include "db_shash.h"
+#include "db_page.h"
+#include "db_am.h"
+#include "lock.h"
+#include "log.h"
+#include "mp.h"
+#include "txn.h"
+
+#ifdef HAVE_RPC
+#include "gen_client_ext.h"
+#include "rpc_client_ext.h"
+#endif
+
+static void __dbenv_err __P((const DB_ENV *, int, const char *, ...));
+static void __dbenv_errx __P((const DB_ENV *, const char *, ...));
+static int __dbenv_set_data_dir __P((DB_ENV *, const char *));
+static void __dbenv_set_errcall __P((DB_ENV *, void (*)(const char *, char *)));
+static void __dbenv_set_errfile __P((DB_ENV *, FILE *));
+static void __dbenv_set_errpfx __P((DB_ENV *, const char *));
+static int __dbenv_set_feedback __P((DB_ENV *, void (*)(DB_ENV *, int, int)));
+static int __dbenv_set_flags __P((DB_ENV *, u_int32_t, int));
+static int __dbenv_set_mutexlocks __P((DB_ENV *, int));
+static int __dbenv_set_paniccall __P((DB_ENV *, void (*)(DB_ENV *, int)));
+static int __dbenv_set_recovery_init __P((DB_ENV *, int (*)(DB_ENV *)));
+static int __dbenv_set_server_noclnt
+ __P((DB_ENV *, char *, long, long, u_int32_t));
+static int __dbenv_set_shm_key __P((DB_ENV *, long));
+static int __dbenv_set_tmp_dir __P((DB_ENV *, const char *));
+static int __dbenv_set_verbose __P((DB_ENV *, u_int32_t, int));
+
+/*
+ * db_env_create --
+ *	DB_ENV constructor: allocate and initialize an environment handle
+ *	and return it through dbenvpp.
+ */
+int
+db_env_create(dbenvpp, flags)
+	DB_ENV **dbenvpp;
+	u_int32_t flags;
+{
+	DB_ENV *dbenv;
+	int ret;
+
+	/*
+	 * !!!
+	 * There is no environment yet, so the flags-checking routines
+	 * can't be called; validate by hand.
+	 */
+	if (!(flags == 0 || flags == DB_CLIENT))
+		return (EINVAL);
+
+	ret = __os_calloc(NULL, 1, sizeof(*dbenv), &dbenv);
+	if (ret != 0)
+		return (ret);
+
+#ifdef HAVE_RPC
+	if (LF_ISSET(DB_CLIENT))
+		F_SET(dbenv, DB_ENV_RPCCLIENT);
+#endif
+
+	/* On failure, give the handle back before reporting the error. */
+	if ((ret = __dbenv_init(dbenv)) != 0) {
+		__os_free(dbenv, sizeof(*dbenv));
+		return (ret);
+	}
+
+	*dbenvpp = dbenv;
+	return (0);
+}
+
+/*
+ * __dbenv_init --
+ *	Initialize a DB_ENV structure: install the method table, set the
+ *	default field values and run the per-subsystem create hooks.
+ *
+ * PUBLIC: int __dbenv_init __P((DB_ENV *));
+ */
+int
+__dbenv_init(dbenv)
+	DB_ENV *dbenv;
+{
+	/* Methods that are the same for normal and RPC handles. */
+	dbenv->err = __dbenv_err;
+	dbenv->errx = __dbenv_errx;
+	dbenv->set_errcall = __dbenv_set_errcall;
+	dbenv->set_errfile = __dbenv_set_errfile;
+	dbenv->set_errpfx = __dbenv_set_errpfx;
+
+#ifdef HAVE_RPC
+	if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) {
+		/* RPC client handles use the __dbcl_* implementations. */
+		dbenv->close = __dbcl_env_close;
+		dbenv->open = __dbcl_env_open;
+		dbenv->remove = __dbcl_env_remove;
+		dbenv->set_data_dir = __dbcl_set_data_dir;
+		dbenv->set_feedback = __dbcl_env_set_feedback;
+		dbenv->set_flags = __dbcl_env_flags;
+		dbenv->set_mutexlocks = __dbcl_set_mutex_locks;
+		dbenv->set_paniccall = __dbcl_env_paniccall;
+		dbenv->set_recovery_init = __dbcl_set_recovery_init;
+		dbenv->set_server = __dbcl_envserver;
+		dbenv->set_shm_key = __dbcl_set_shm_key;
+		dbenv->set_tmp_dir = __dbcl_set_tmp_dir;
+		dbenv->set_verbose = __dbcl_set_verbose;
+	} else
+#endif
+	{
+		/* Everything else uses the local implementations. */
+		dbenv->close = __dbenv_close;
+		dbenv->open = __dbenv_open;
+		dbenv->remove = __dbenv_remove;
+		dbenv->set_data_dir = __dbenv_set_data_dir;
+		dbenv->set_feedback = __dbenv_set_feedback;
+		dbenv->set_flags = __dbenv_set_flags;
+		dbenv->set_mutexlocks = __dbenv_set_mutexlocks;
+		dbenv->set_paniccall = __dbenv_set_paniccall;
+		dbenv->set_recovery_init = __dbenv_set_recovery_init;
+		dbenv->set_server = __dbenv_set_server_noclnt;
+		dbenv->set_shm_key = __dbenv_set_shm_key;
+		dbenv->set_tmp_dir = __dbenv_set_tmp_dir;
+		dbenv->set_verbose = __dbenv_set_verbose;
+	}
+
+	/* Field defaults. */
+	dbenv->shm_key = INVALID_REGION_SEGID;
+	dbenv->db_mutexlocks = 1;
+
+	/* Subsystem specific initialization. */
+	__log_dbenv_create(dbenv);
+	__lock_dbenv_create(dbenv);
+	__memp_dbenv_create(dbenv);
+	__txn_dbenv_create(dbenv);
+
+	return (0);
+}
+
+/*
+ * __dbenv_err --
+ * Error message, including the standard error string.
+ *
+ * Collects the variable arguments that follow fmt and passes them,
+ * together with dbenv and error, to __db_real_err.
+ *
+ * NOTE(review): the two literal 1 arguments are presumably switches
+ * interpreted by __db_real_err -- confirm their meaning there.
+ */
+static void
+#ifdef __STDC__
+__dbenv_err(const DB_ENV *dbenv, int error, const char *fmt, ...)
+#else
+__dbenv_err(dbenv, error, fmt, va_alist)
+ const DB_ENV *dbenv;
+ int error;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ va_list ap;
+
+#ifdef __STDC__
+ va_start(ap, fmt);
+#else
+ va_start(ap);
+#endif
+ __db_real_err(dbenv, error, 1, 1, fmt, ap);
+
+ va_end(ap);
+}
+
+/*
+ * __dbenv_errx --
+ * Error message.
+ *
+ * Collects the variable arguments that follow fmt and passes them,
+ * together with dbenv and an error value of 0, to __db_real_err.
+ *
+ * NOTE(review): the literal 0 and 1 arguments after the error value are
+ * presumably switches interpreted by __db_real_err -- confirm their
+ * meaning there.
+ */
+static void
+#ifdef __STDC__
+__dbenv_errx(const DB_ENV *dbenv, const char *fmt, ...)
+#else
+__dbenv_errx(dbenv, fmt, va_alist)
+ const DB_ENV *dbenv;
+ const char *fmt;
+ va_dcl
+#endif
+{
+ va_list ap;
+
+#ifdef __STDC__
+ va_start(ap, fmt);
+#else
+ va_start(ap);
+#endif
+ __db_real_err(dbenv, 0, 0, 1, fmt, ap);
+
+ va_end(ap);
+}
+
+/*
+ * __dbenv_set_flags --
+ *	Turn the DB_CDB_ALLDB, DB_NOMMAP and DB_TXN_NOSYNC settings on or
+ *	off in the handle, according to onoff.
+ */
+static int
+__dbenv_set_flags(dbenv, flags, onoff)
+	DB_ENV *dbenv;
+	u_int32_t flags;
+	int onoff;
+{
+#define	OK_FLAGS	(DB_CDB_ALLDB | DB_NOMMAP | DB_TXN_NOSYNC)
+
+	/* Anything outside the supported set is an error. */
+	if (LF_ISSET(~OK_FLAGS))
+		return (__db_ferr(dbenv, "DBENV->set_flags", 0));
+
+	/* DB_CDB_ALLDB is subject to the after-open check. */
+	if (LF_ISSET(DB_CDB_ALLDB)) {
+		ENV_ILLEGAL_AFTER_OPEN(dbenv, "set_flags: DB_CDB_ALLDB");
+	}
+
+	if (onoff) {
+		if (LF_ISSET(DB_CDB_ALLDB))
+			F_SET(dbenv, DB_ENV_CDB_ALLDB);
+		if (LF_ISSET(DB_NOMMAP))
+			F_SET(dbenv, DB_ENV_NOMMAP);
+		if (LF_ISSET(DB_TXN_NOSYNC))
+			F_SET(dbenv, DB_ENV_TXN_NOSYNC);
+	} else {
+		if (LF_ISSET(DB_CDB_ALLDB))
+			F_CLR(dbenv, DB_ENV_CDB_ALLDB);
+		if (LF_ISSET(DB_NOMMAP))
+			F_CLR(dbenv, DB_ENV_NOMMAP);
+		if (LF_ISSET(DB_TXN_NOSYNC))
+			F_CLR(dbenv, DB_ENV_TXN_NOSYNC);
+	}
+	return (0);
+}
+
+/*
+ * __dbenv_set_data_dir --
+ *	Append a copy of dir to the handle's list of data directories,
+ *	allocating or growing the list as needed.
+ *
+ *	Returns 0 on success, or the error returned by the failing
+ *	allocation routine.  On failure the slot counters are left
+ *	describing the list as it was before the call.
+ */
+static int
+__dbenv_set_data_dir(dbenv, dir)
+	DB_ENV *dbenv;
+	const char *dir;
+{
+	int ret;
+
+#define	DATA_INIT_CNT	20			/* Start with 20 data slots. */
+	if (dbenv->db_data_dir == NULL) {
+		if ((ret = __os_calloc(dbenv, DATA_INIT_CNT,
+		    sizeof(char **), &dbenv->db_data_dir)) != 0)
+			return (ret);
+		dbenv->data_cnt = DATA_INIT_CNT;
+	} else if (dbenv->data_next == dbenv->data_cnt - 1) {
+		/*
+		 * Double the list.  The new size is recorded only after the
+		 * reallocation has succeeded, so that a failure cannot leave
+		 * data_cnt claiming more slots than the array really has.
+		 */
+		if ((ret = __os_realloc(dbenv,
+		    dbenv->data_cnt * 2 * sizeof(char **),
+		    NULL, &dbenv->db_data_dir)) != 0)
+			return (ret);
+
+		/*
+		 * Clear the added slots so the unused tail of the list is
+		 * zeroed, just as it is in the calloc'd initial list.
+		 */
+		memset(&dbenv->db_data_dir[dbenv->data_cnt],
+		    0, dbenv->data_cnt * sizeof(char **));
+		dbenv->data_cnt *= 2;
+	}
+
+	/* Only consume the slot once the copy has actually been made. */
+	if ((ret = __os_strdup(dbenv,
+	    dir, &dbenv->db_data_dir[dbenv->data_next])) != 0)
+		return (ret);
+	++dbenv->data_next;
+	return (0);
+}
+
+/*
+ * __dbenv_set_errcall --
+ *	Store the errcall function pointer in the handle's db_errcall field.
+ */
+static void
+__dbenv_set_errcall(dbenv, errcall)
+	DB_ENV *dbenv;
+	void (*errcall) __P((const char *, char *));
+{
+	dbenv->db_errcall = errcall;
+}
+
+/*
+ * __dbenv_set_errfile --
+ *	Store the errfile stream in the handle's db_errfile field.
+ */
+static void
+__dbenv_set_errfile(dbenv, errfile)
+	DB_ENV *dbenv;
+	FILE *errfile;
+{
+	dbenv->db_errfile = errfile;
+}
+
+/*
+ * __dbenv_set_errpfx --
+ *	Store the errpfx pointer in the handle's db_errpfx field.  The
+ *	string is not copied; only the pointer is kept.
+ */
+static void
+__dbenv_set_errpfx(dbenv, errpfx)
+	DB_ENV *dbenv;
+	const char *errpfx;
+{
+	dbenv->db_errpfx = errpfx;
+}
+
+/*
+ * __dbenv_set_feedback --
+ *	Store the feedback function pointer in the handle's db_feedback
+ *	field.  Always returns 0.
+ */
+static int
+__dbenv_set_feedback(dbenv, feedback)
+	DB_ENV *dbenv;
+	void (*feedback) __P((DB_ENV *, int, int));
+{
+	dbenv->db_feedback = feedback;
+
+	return (0);
+}
+
+/*
+ * __dbenv_set_mutexlocks --
+ *	Store onoff in the handle's db_mutexlocks field.  Always returns 0.
+ */
+static int
+__dbenv_set_mutexlocks(dbenv, onoff)
+	DB_ENV *dbenv;
+	int onoff;
+{
+	dbenv->db_mutexlocks = onoff;
+
+	return (0);
+}
+
+/*
+ * __dbenv_set_paniccall --
+ *	Store the paniccall function pointer in the handle's db_paniccall
+ *	field.  Always returns 0.
+ */
+static int
+__dbenv_set_paniccall(dbenv, paniccall)
+	DB_ENV *dbenv;
+	void (*paniccall) __P((DB_ENV *, int));
+{
+	dbenv->db_paniccall = paniccall;
+
+	return (0);
+}
+
+/*
+ * __dbenv_set_recovery_init --
+ *	Store the recovery_init function pointer in the handle's
+ *	db_recovery_init field, subject to the after-open check.
+ */
+static int
+__dbenv_set_recovery_init(dbenv, recovery_init)
+	DB_ENV *dbenv;
+	int (*recovery_init) __P((DB_ENV *));
+{
+	ENV_ILLEGAL_AFTER_OPEN(dbenv, "set_recovery_init");
+
+	dbenv->db_recovery_init = recovery_init;
+	return (0);
+}
+
+/*
+ * __dbenv_set_shm_key --
+ *	Store shm_key in the handle's shm_key field, subject to the
+ *	after-open check.
+ */
+static int
+__dbenv_set_shm_key(dbenv, shm_key)
+	DB_ENV *dbenv;
+	long shm_key;			/* !!!: really a key_t. */
+{
+	ENV_ILLEGAL_AFTER_OPEN(dbenv, "set_shm_key");
+
+	dbenv->shm_key = shm_key;
+
+	return (0);
+}
+
+/*
+ * __dbenv_set_tmp_dir --
+ *	Replace the handle's db_tmp_dir with a copy of dir, releasing any
+ *	previously stored string first.
+ *
+ *	Returns the result of __os_strdup.
+ */
+static int
+__dbenv_set_tmp_dir(dbenv, dir)
+	DB_ENV *dbenv;
+	const char *dir;
+{
+	if (dbenv->db_tmp_dir != NULL) {
+		__os_freestr(dbenv->db_tmp_dir);
+		/*
+		 * Don't leave the field pointing at freed memory if the
+		 * copy below fails.
+		 */
+		dbenv->db_tmp_dir = NULL;
+	}
+	return (__os_strdup(dbenv, dir, &dbenv->db_tmp_dir));
+}
+
+/*
+ * __dbenv_set_verbose --
+ *	Set or clear one of the DB_VERB_* values in the handle's verbose
+ *	field, according to onoff.  Any other value of which yields EINVAL.
+ */
+static int
+__dbenv_set_verbose(dbenv, which, onoff)
+	DB_ENV *dbenv;
+	u_int32_t which;
+	int onoff;
+{
+	/* Only these four values are accepted. */
+	if (which != DB_VERB_CHKPOINT && which != DB_VERB_DEADLOCK &&
+	    which != DB_VERB_RECOVERY && which != DB_VERB_WAITSFOR)
+		return (EINVAL);
+
+	if (onoff)
+		FLD_SET(dbenv->verbose, which);
+	else
+		FLD_CLR(dbenv->verbose, which);
+	return (0);
+}
+
+/*
+ * __db_mi_env --
+ *	Report that the named method was illegally called with a public
+ *	environment, and return EINVAL.
+ *
+ * PUBLIC: int __db_mi_env __P((DB_ENV *, const char *));
+ */
+int
+__db_mi_env(dbenv, name)
+	DB_ENV *dbenv;
+	const char *name;
+{
+	__db_err(dbenv,
+	    "%s: method meaningless in shared environment", name);
+
+	return (EINVAL);
+}
+
+/*
+ * __db_mi_open --
+ *	Report that the named method was illegally called before or after
+ *	open (after selects which), and return EINVAL.
+ *
+ * PUBLIC: int __db_mi_open __P((DB_ENV *, const char *, int));
+ */
+int
+__db_mi_open(dbenv, name, after)
+	DB_ENV *dbenv;
+	const char *name;
+	int after;
+{
+	const char *when;
+
+	when = after ? "after" : "before";
+	__db_err(dbenv, "%s: method meaningless %s open", name, when);
+
+	return (EINVAL);
+}
+
+/*
+ * __db_env_config --
+ *	Report that a method or function was called without its subsystem
+ *	being configured, and return EINVAL.
+ *
+ * PUBLIC: int __db_env_config __P((DB_ENV *, int));
+ */
+int
+__db_env_config(dbenv, subsystem)
+	DB_ENV *dbenv;
+	int subsystem;
+{
+	const char *name;
+
+	/* Map the DB_INIT_* value to the name used in the message. */
+	if (subsystem == DB_INIT_LOCK)
+		name = "lock";
+	else if (subsystem == DB_INIT_LOG)
+		name = "log";
+	else if (subsystem == DB_INIT_MPOOL)
+		name = "mpool";
+	else if (subsystem == DB_INIT_TXN)
+		name = "txn";
+	else
+		name = "unknown";
+
+	__db_err(dbenv,
+    "%s interface called with environment not configured for that subsystem",
+	    name);
+	return (EINVAL);
+}
+
+/*
+ * __dbenv_set_server_noclnt --
+ *	The set_server method installed on handles that are not RPC
+ *	clients: it ignores its arguments, reports the misuse and returns
+ *	the result of __db_eopnotsup.
+ */
+static int
+__dbenv_set_server_noclnt(dbenv, host, tsec, ssec, flags)
+	DB_ENV *dbenv;
+	char *host;
+	long tsec, ssec;
+	u_int32_t flags;
+{
+	COMPQUIET(host, NULL);
+	COMPQUIET(tsec, 0);
+	COMPQUIET(ssec, 0);
+	COMPQUIET(flags, 0);
+
+	/* The message previously misspelled "environment". */
+	__db_err(dbenv, "set_server method meaningless in non-RPC environment");
+	return (__db_eopnotsup(dbenv));
+}
diff --git a/bdb/env/env_open.c b/bdb/env/env_open.c
new file mode 100644
index 00000000000..2007b4266c0
--- /dev/null
+++ b/bdb/env/env_open.c
@@ -0,0 +1,1064 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: env_open.c,v 11.34 2000/12/21 19:20:00 bostic Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "db_shash.h"
+#include "btree.h"
+#include "hash.h"
+#include "qam.h"
+#include "lock.h"
+#include "log.h"
+#include "mp.h"
+#include "txn.h"
+#include "clib_ext.h"
+
+static int __dbenv_config __P((DB_ENV *, const char *, u_int32_t));
+static int __dbenv_refresh __P((DB_ENV *));
+static int __db_home __P((DB_ENV *, const char *, u_int32_t));
+static int __db_parse __P((DB_ENV *, char *));
+static int __db_tmp_open __P((DB_ENV *, u_int32_t, char *, DB_FH *));
+
+/*
+ * db_version --
+ *	Return the version string; the major, minor and patch numbers are
+ *	also stored through whichever of the three pointers are non-NULL.
+ */
+char *
+db_version(majverp, minverp, patchp)
+	int *majverp, *minverp, *patchp;
+{
+	/* Each output pointer is optional. */
+	if (majverp != NULL) {
+		*majverp = DB_VERSION_MAJOR;
+	}
+	if (minverp != NULL) {
+		*minverp = DB_VERSION_MINOR;
+	}
+	if (patchp != NULL) {
+		*patchp = DB_VERSION_PATCH;
+	}
+
+	return ((char *)DB_VERSION_STRING);
+}
+
+/*
+ * __dbenv_open --
+ *	Initialize an environment.
+ *
+ *	Checks the flags, removes any existing environment first when
+ *	recovery was requested, configures the handle, creates or joins the
+ *	environment and then opens the requested subsystems.
+ *
+ *	Returns 0 on success or a non-zero error value.  Once the handle
+ *	has been configured, every failure goes through __dbenv_refresh
+ *	before returning.
+ *
+ * PUBLIC: int __dbenv_open __P((DB_ENV *, const char *, u_int32_t, int));
+ */
+int
+__dbenv_open(dbenv, db_home, flags, mode)
+	DB_ENV *dbenv;
+	const char *db_home;
+	u_int32_t flags;
+	int mode;
+{
+	DB_ENV *rm_dbenv;
+	int ret;
+	u_int32_t init_flags;
+
+#undef	OKFLAGS
+#define	OKFLAGS								\
+	DB_CREATE | DB_INIT_CDB | DB_INIT_LOCK | DB_INIT_LOG |		\
+	DB_INIT_MPOOL | DB_INIT_TXN | DB_JOINENV | DB_LOCKDOWN |	\
+	DB_PRIVATE | DB_RECOVER | DB_RECOVER_FATAL | DB_SYSTEM_MEM |	\
+	DB_THREAD | DB_USE_ENVIRON | DB_USE_ENVIRON_ROOT
+#undef	OKFLAGS_CDB
+#define	OKFLAGS_CDB							\
+	DB_CREATE | DB_INIT_CDB | DB_INIT_MPOOL | DB_LOCKDOWN |		\
+	DB_PRIVATE | DB_SYSTEM_MEM | DB_THREAD |			\
+	DB_USE_ENVIRON | DB_USE_ENVIRON_ROOT
+
+	/*
+	 * Flags saved in the init_flags field of the environment, representing
+	 * flags to DBENV->set_flags and DBENV->open that need to be set.
+	 */
+#define	DB_INITENV_CDB		0x0001	/* DB_INIT_CDB */
+#define	DB_INITENV_CDB_ALLDB	0x0002	/* DB_CDB_ALLDB */
+#define	DB_INITENV_LOCK		0x0004	/* DB_INIT_LOCK */
+#define	DB_INITENV_LOG		0x0008	/* DB_INIT_LOG */
+#define	DB_INITENV_MPOOL	0x0010	/* DB_INIT_MPOOL */
+#define	DB_INITENV_TXN		0x0020	/* DB_INIT_TXN */
+
+	if ((ret = __db_fchk(dbenv, "DBENV->open", flags, OKFLAGS)) != 0)
+		return (ret);
+	if (LF_ISSET(DB_INIT_CDB) &&
+	    (ret = __db_fchk(dbenv, "DBENV->open", flags, OKFLAGS_CDB)) != 0)
+		return (ret);
+	if ((ret = __db_fcchk(dbenv,
+	    "DBENV->open", flags, DB_PRIVATE, DB_SYSTEM_MEM)) != 0)
+		return (ret);
+	if ((ret = __db_fcchk(dbenv, "DBENV->open", flags, DB_JOINENV,
+	    DB_CREATE | DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL |
+	    DB_INIT_TXN | DB_PRIVATE)) != 0)
+		return (ret);
+
+	/*
+	 * If we're doing recovery, destroy the environment so that we create
+	 * all the regions from scratch.  I'd like to reuse already created
+	 * regions, but that's hard.  We would have to create the environment
+	 * region from scratch, at least, as we have no way of knowing if its
+	 * linked lists are corrupted.
+	 *
+	 * I suppose we could set flags while modifying those links, but that
+	 * is going to be difficult to get right.  The major concern I have
+	 * is if the application stomps the environment with a rogue pointer.
+	 * We have no way of detecting that, and we could be forced into a
+	 * situation where we start up and then crash, repeatedly.
+	 *
+	 * Note that we do not check any flags like DB_PRIVATE before calling
+	 * remove.  We don't care if the current environment was private or
+	 * not, we just want to nail any files that are left-over for whatever
+	 * reason, from whatever session.
+	 */
+	if (LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL)) {
+		if ((ret = db_env_create(&rm_dbenv, 0)) != 0)
+			return (ret);
+		if ((ret = dbenv->remove(rm_dbenv, db_home, DB_FORCE)) != 0)
+			return (ret);
+	}
+
+	/* Initialize the DB_ENV structure. */
+	if ((ret = __dbenv_config(dbenv, db_home, flags)) != 0)
+		goto err;
+
+	/* Convert the DBENV->open flags to internal flags. */
+	if (LF_ISSET(DB_CREATE))
+		F_SET(dbenv, DB_ENV_CREATE);
+	if (LF_ISSET(DB_LOCKDOWN))
+		F_SET(dbenv, DB_ENV_LOCKDOWN);
+	if (LF_ISSET(DB_PRIVATE))
+		F_SET(dbenv, DB_ENV_PRIVATE);
+	if (LF_ISSET(DB_SYSTEM_MEM))
+		F_SET(dbenv, DB_ENV_SYSTEM_MEM);
+	if (LF_ISSET(DB_THREAD))
+		F_SET(dbenv, DB_ENV_THREAD);
+
+	/* Default permissions are read-write for both owner and group. */
+	dbenv->db_mode = mode == 0 ? __db_omode("rwrw--") : mode;
+
+	/*
+	 * Create/join the environment.  We pass in the flags that will be of
+	 * interest to an environment joining later; if we're not the ones
+	 * to do the create, the flags stored by the creator are handed back
+	 * to us in init_flags instead.
+	 */
+	init_flags = 0;
+	init_flags |= (LF_ISSET(DB_INIT_CDB) ? DB_INITENV_CDB : 0);
+	init_flags |= (LF_ISSET(DB_INIT_LOCK) ? DB_INITENV_LOCK : 0);
+	init_flags |= (LF_ISSET(DB_INIT_LOG) ? DB_INITENV_LOG : 0);
+	init_flags |= (LF_ISSET(DB_INIT_MPOOL) ? DB_INITENV_MPOOL : 0);
+	init_flags |= (LF_ISSET(DB_INIT_TXN) ? DB_INITENV_TXN : 0);
+	init_flags |=
+	    (F_ISSET(dbenv, DB_ENV_CDB_ALLDB) ? DB_INITENV_CDB_ALLDB : 0);
+
+	if ((ret = __db_e_attach(dbenv, &init_flags)) != 0)
+		goto err;
+
+	/*
+	 * __db_e_attach will return the saved init_flags field, which
+	 * contains the DB_INIT_* flags used when we were created.
+	 */
+	if (LF_ISSET(DB_JOINENV)) {
+		LF_CLR(DB_JOINENV);
+
+		LF_SET((init_flags & DB_INITENV_CDB) ? DB_INIT_CDB : 0);
+		LF_SET((init_flags & DB_INITENV_LOCK) ? DB_INIT_LOCK : 0);
+		LF_SET((init_flags & DB_INITENV_LOG) ? DB_INIT_LOG : 0);
+		LF_SET((init_flags & DB_INITENV_MPOOL) ? DB_INIT_MPOOL : 0);
+		LF_SET((init_flags & DB_INITENV_TXN) ? DB_INIT_TXN : 0);
+
+		/*
+		 * DB_INITENV_CDB_ALLDB is a bit in the saved init_flags, not
+		 * a DBENV->open flag, so it must be tested there rather than
+		 * against the open flags.
+		 */
+		if ((init_flags & DB_INITENV_CDB_ALLDB) != 0 &&
+		    (ret = dbenv->set_flags(dbenv, DB_CDB_ALLDB, 1)) != 0)
+			goto err;
+	}
+
+	/* Initialize for CDB product. */
+	if (LF_ISSET(DB_INIT_CDB)) {
+		LF_SET(DB_INIT_LOCK);
+		F_SET(dbenv, DB_ENV_CDB);
+	}
+
+	/*
+	 * Initialize the DB list, and its mutex if appropriate.
+	 *
+	 * The environment has been attached by now, so failures here take
+	 * the common error path like every other failure below, rather than
+	 * returning with the environment still attached.
+	 */
+	LIST_INIT(&dbenv->dblist);
+	if (F_ISSET(dbenv, DB_ENV_THREAD)) {
+		if ((ret = __db_mutex_alloc(dbenv,
+		    dbenv->reginfo, (MUTEX **)&dbenv->dblist_mutexp)) != 0)
+			goto err;
+		if ((ret = __db_mutex_init(dbenv,
+		    dbenv->dblist_mutexp, 0, MUTEX_THREAD)) != 0) {
+			__db_mutex_free(dbenv, dbenv->reginfo,
+			    dbenv->dblist_mutexp);
+			/* Don't leave a stale pointer for the cleanup code. */
+			dbenv->dblist_mutexp = NULL;
+			goto err;
+		}
+	}
+
+	/*
+	 * Initialize the subsystems.  Transactions imply logging but do not
+	 * imply locking.  While almost all applications want both locking
+	 * and logging, it would not be unreasonable for a single threaded
+	 * process to want transactions for atomicity guarantees, but not
+	 * necessarily need concurrency.
+	 */
+	if (LF_ISSET(DB_INIT_MPOOL))
+		if ((ret = __memp_open(dbenv)) != 0)
+			goto err;
+	if (LF_ISSET(DB_INIT_LOG | DB_INIT_TXN))
+		if ((ret = __log_open(dbenv)) != 0)
+			goto err;
+	if (LF_ISSET(DB_INIT_LOCK))
+		if ((ret = __lock_open(dbenv)) != 0)
+			goto err;
+	if (LF_ISSET(DB_INIT_TXN)) {
+		if ((ret = __txn_open(dbenv)) != 0)
+			goto err;
+
+		/*
+		 * If the application is running with transactions, initialize
+		 * the function tables.
+		 */
+		if ((ret = __bam_init_recover(dbenv)) != 0)
+			goto err;
+		if ((ret = __crdel_init_recover(dbenv)) != 0)
+			goto err;
+		if ((ret = __db_init_recover(dbenv)) != 0)
+			goto err;
+		if ((ret = __ham_init_recover(dbenv)) != 0)
+			goto err;
+		if ((ret = __log_init_recover(dbenv)) != 0)
+			goto err;
+		if ((ret = __qam_init_recover(dbenv)) != 0)
+			goto err;
+		if ((ret = __txn_init_recover(dbenv)) != 0)
+			goto err;
+
+		/*
+		 * If the application specified their own recovery
+		 * initialization function, call it.
+		 */
+		if (dbenv->db_recovery_init != NULL &&
+		    (ret = dbenv->db_recovery_init(dbenv)) != 0)
+			goto err;
+
+		/* Perform recovery for any previous run. */
+		if (LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL) &&
+		    (ret = __db_apprec(dbenv,
+		    LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL))) != 0)
+			goto err;
+	}
+	return (0);
+
+err:	(void)__dbenv_refresh(dbenv);
+	return (ret);
+}
+
+/*
+ * __dbenv_remove --
+ *	Discard an environment.  The handle is consumed: it is refreshed,
+ *	overwritten and freed on every path except the "already opened"
+ *	rejection, which leaves the caller's handle untouched.
+ *
+ * PUBLIC: int __dbenv_remove __P((DB_ENV *, const char *, u_int32_t));
+ */
+int
+__dbenv_remove(dbenv, db_home, flags)
+	DB_ENV *dbenv;
+	const char *db_home;
+	u_int32_t flags;
+{
+	int force, rc, refresh_rc;
+
+#undef	OKFLAGS
+#define	OKFLAGS								\
+	DB_FORCE | DB_USE_ENVIRON | DB_USE_ENVIRON_ROOT
+
+	/* Only the flags listed in OKFLAGS are accepted. */
+	rc = __db_fchk(dbenv, "DBENV->remove", flags, OKFLAGS);
+	if (rc == 0) {
+		/*
+		 * Calling DBENV->remove on a handle that has already been
+		 * opened is a hard-to-debug application error.  The opened
+		 * handle has to be closed, and a fresh DBENV handle used
+		 * for the remove.
+		 */
+		if (F_ISSET(dbenv, DB_ENV_OPEN_CALLED)) {
+			__db_err(dbenv,
+		    "DBENV handle opened, not usable for remove method.");
+			return (EINVAL);
+		}
+
+		/* Fill in the DB_ENV structure, then remove the environment. */
+		rc = __dbenv_config(dbenv, db_home, flags);
+		if (rc == 0) {
+			force = LF_ISSET(DB_FORCE) ? 1 : 0;
+			rc = __db_e_remove(dbenv, force);
+		}
+	}
+
+	/* Release whatever was acquired; the first error seen wins. */
+	refresh_rc = __dbenv_refresh(dbenv);
+	if (rc == 0 && refresh_rc != 0)
+		rc = refresh_rc;
+
+	/* Overwrite the handle before freeing it. */
+	memset(dbenv, CLEAR_BYTE, sizeof(DB_ENV));
+	__os_free(dbenv, sizeof(DB_ENV));
+
+	return (rc);
+}
+
+/*
+ * __dbenv_config --
+ *	Initialize the DB_ENV structure: resolve the home directory, apply
+ *	any DB_CONFIG file found there, set the temporary directory and
+ *	allocate the lock file handle.
+ *
+ *	Returns 0 on success, otherwise the error from the failing step
+ *	(ENAMETOOLONG if the DB_CONFIG path does not fit, EINVAL for an
+ *	over-long configuration line).
+ */
+static int
+__dbenv_config(dbenv, db_home, flags)
+	DB_ENV *dbenv;
+	const char *db_home;
+	u_int32_t flags;
+{
+	FILE *fp;
+	int ret;
+	char *lp, buf[MAXPATHLEN * 2];
+
+	/* Set the database home. */
+	if ((ret = __db_home(dbenv, db_home, flags)) != 0)
+		return (ret);
+
+	/*
+	 * Parse the config file.
+	 *
+	 * !!!
+	 * Don't use sprintf(3)/snprintf(3) -- the former is dangerous, and
+	 * the latter isn't standard, and we're manipulating strings handed
+	 * us by the application.
+	 */
+	if (dbenv->db_home != NULL) {
+#define	CONFIG_NAME	"/DB_CONFIG"
+		if (strlen(dbenv->db_home) +
+		    strlen(CONFIG_NAME) + 1 > sizeof(buf)) {
+			ret = ENAMETOOLONG;
+			return (ret);
+		}
+		(void)strcpy(buf, dbenv->db_home);
+		(void)strcat(buf, CONFIG_NAME);
+
+		/* A missing or unreadable DB_CONFIG file is not an error. */
+		if ((fp = fopen(buf, "r")) != NULL) {
+			while (fgets(buf, sizeof(buf), fp) != NULL) {
+				/*
+				 * A line without a newline is only "too long"
+				 * if it filled the buffer; otherwise it is the
+				 * last line of a file that does not end in a
+				 * newline, which is legal.
+				 */
+				if ((lp = strchr(buf, '\n')) != NULL)
+					*lp = '\0';
+				else if (strlen(buf) + 1 == sizeof(buf)) {
+					__db_err(dbenv,
+					    "%s: line too long", CONFIG_NAME);
+					(void)fclose(fp);
+					ret = EINVAL;
+					return (ret);
+				}
+
+				/*
+				 * Skip empty lines, comments and lines that
+				 * start with white-space.  The argument to
+				 * isspace(3) must be representable as an
+				 * unsigned char.
+				 */
+				if (buf[0] == '\0' || buf[0] == '#' ||
+				    isspace((unsigned char)buf[0]))
+					continue;
+
+				if ((ret = __db_parse(dbenv, buf)) != 0) {
+					(void)fclose(fp);
+					return (ret);
+				}
+			}
+			(void)fclose(fp);
+		}
+	}
+
+	/* Set up the tmp directory path. */
+	if (dbenv->db_tmp_dir == NULL && (ret = __os_tmpdir(dbenv, flags)) != 0)
+		return (ret);
+
+	/*
+	 * The locking file descriptor is rarely on.  Set the fd to -1, not
+	 * because it's ever tested, but to make sure we catch mistakes.
+	 */
+	if ((ret =
+	    __os_calloc(dbenv,
+		1, sizeof(*dbenv->lockfhp), &dbenv->lockfhp)) != 0)
+		return (ret);
+	dbenv->lockfhp->fd = -1;
+
+	/*
+	 * Flag that the DB_ENV structure has been initialized.  Note, this
+	 * must be set before calling into the subsystems as it's used during
+	 * file naming.
+	 */
+	F_SET(dbenv, DB_ENV_OPEN_CALLED);
+
+	return (0);
+}
+
+/*
+ * __dbenv_close --
+ *	DB_ENV destructor: release everything the handle holds and, unless
+ *	the structure was supplied by the caller, free the handle itself.
+ *
+ * PUBLIC: int __dbenv_close __P((DB_ENV *, u_int32_t));
+ */
+int
+__dbenv_close(dbenv, flags)
+	DB_ENV *dbenv;
+	u_int32_t flags;
+{
+	int rc;
+
+	COMPQUIET(flags, 0);
+
+	PANIC_CHECK(dbenv);
+
+	rc = __dbenv_refresh(dbenv);
+
+	/* A caller-allocated structure is left for the caller to dispose of. */
+	if (F_ISSET(dbenv, DB_ENV_USER_ALLOC))
+		return (rc);
+
+	/* We allocated it: overwrite the contents, then free the memory. */
+	memset(dbenv, CLEAR_BYTE, sizeof(DB_ENV));
+	__os_free(dbenv, sizeof(DB_ENV));
+
+	return (rc);
+}
+
+/*
+ * __dbenv_refresh --
+ *	Refresh the DB_ENV structure, releasing any allocated resources.
+ *
+ *	Every subsystem is closed even if an earlier one fails; the first
+ *	error encountered is the one returned.  Every pointer released here
+ *	is reset so that a later refresh of the same handle is harmless.
+ */
+static int
+__dbenv_refresh(dbenv)
+	DB_ENV *dbenv;
+{
+	int ret, t_ret;
+	char **p;
+
+	ret = 0;
+
+	/*
+	 * Close subsystems, in the reverse order they were opened (txn
+	 * must be first, it may want to discard locks and flush the log).
+	 */
+	if (TXN_ON(dbenv)) {
+		if ((t_ret = __txn_close(dbenv)) != 0 && ret == 0)
+			ret = t_ret;
+	}
+
+	if (LOCKING_ON(dbenv)) {
+		if ((t_ret = __lock_close(dbenv)) != 0 && ret == 0)
+			ret = t_ret;
+	}
+	__lock_dbenv_close(dbenv);
+
+	if (LOGGING_ON(dbenv)) {
+		if ((t_ret = __log_close(dbenv)) != 0 && ret == 0)
+			ret = t_ret;
+	}
+
+	if (MPOOL_ON(dbenv)) {
+		if ((t_ret = __memp_close(dbenv)) != 0 && ret == 0)
+			ret = t_ret;
+	}
+
+	/*
+	 * Discard DB list and its mutex.  The mutex pointer is cleared after
+	 * the free: this function can run twice on one handle (once from a
+	 * failed open, again from close) and must not free the mutex twice.
+	 */
+	LIST_INIT(&dbenv->dblist);
+	if (dbenv->dblist_mutexp != NULL) {
+		__db_mutex_free(dbenv, dbenv->reginfo, dbenv->dblist_mutexp);
+		dbenv->dblist_mutexp = NULL;
+	}
+
+	/* Detach from the region. */
+	if (dbenv->reginfo != NULL) {
+		if ((t_ret = __db_e_detach(dbenv, 0)) != 0 && ret == 0)
+			ret = t_ret;
+		/*
+		 * !!!
+		 * Don't free dbenv->reginfo or set the reference to NULL,
+		 * that was done by __db_e_detach().
+		 */
+	}
+
+	/* Clean up the structure. */
+	dbenv->db_panic = 0;
+
+	if (dbenv->db_home != NULL) {
+		__os_freestr(dbenv->db_home);
+		dbenv->db_home = NULL;
+	}
+	if (dbenv->db_log_dir != NULL) {
+		__os_freestr(dbenv->db_log_dir);
+		dbenv->db_log_dir = NULL;
+	}
+	if (dbenv->db_tmp_dir != NULL) {
+		__os_freestr(dbenv->db_tmp_dir);
+		dbenv->db_tmp_dir = NULL;
+	}
+	if (dbenv->db_data_dir != NULL) {
+		/* The directory array is NULL-terminated. */
+		for (p = dbenv->db_data_dir; *p != NULL; ++p)
+			__os_freestr(*p);
+		__os_free(dbenv->db_data_dir,
+		    dbenv->data_cnt * sizeof(char **));
+		dbenv->db_data_dir = NULL;
+	}
+	dbenv->data_cnt = dbenv->data_next = 0;
+
+	dbenv->db_mode = 0;
+
+	if (dbenv->lockfhp != NULL) {
+		__os_free(dbenv->lockfhp, sizeof(*dbenv->lockfhp));
+		dbenv->lockfhp = NULL;
+	}
+
+	if (dbenv->dtab != NULL) {
+		__os_free(dbenv->dtab,
+		    dbenv->dtab_size * sizeof(dbenv->dtab[0]));
+		dbenv->dtab = NULL;
+		dbenv->dtab_size = 0;
+	}
+
+	dbenv->mp_mmapsize = 0;
+	dbenv->links.tqe_next = NULL;
+	dbenv->links.tqe_prev = NULL;
+	dbenv->xa_rmid = 0;
+	dbenv->xa_txn = 0;
+
+	/* Only the standalone and user-allocated flags survive a refresh. */
+	F_CLR(dbenv, ~(DB_ENV_STANDALONE | DB_ENV_USER_ALLOC));
+
+	return (ret);
+}
+
+/*
+ * DB_ADDSTR --
+ *	Append the path component "add" (ignored if NULL) to the string being
+ *	built.  The macro is not self-contained: it reads and updates the
+ *	enclosing function's locals "str" (start of the buffer), "p" (current
+ *	write position), "slash" (non-zero if a separator is needed before
+ *	the next component) and "len" (scratch).  An absolute component
+ *	discards everything built so far.  No terminating nul is written and
+ *	no bounds are checked; the caller sizes the buffer beforehand.
+ *
+ *	NOTE(review): "add" is evaluated several times, and an empty string
+ *	as the first component would make p[-1] read the byte before "str"
+ *	-- confirm callers never pass one.
+ */
+#define DB_ADDSTR(add) { \
+ if ((add) != NULL) { \
+ /* If leading slash, start over. */ \
+ if (__os_abspath(add)) { \
+ p = str; \
+ slash = 0; \
+ } \
+ /* Append to the current string. */ \
+ len = strlen(add); \
+ if (slash) \
+ *p++ = PATH_SEPARATOR[0]; \
+ memcpy(p, add, len); \
+ p += len; \
+ slash = strchr(PATH_SEPARATOR, p[-1]) == NULL; \
+ } \
+}
+
+/*
+ * __db_appname --
+ *	Given an optional DB environment, directory and file name and type
+ *	of call, build a path based on the DBENV->open rules, and return
+ *	it in allocated space.
+ *
+ *	If namep is non-NULL the allocated path is returned through it; if
+ *	a temporary file is requested it is created and returned through
+ *	fhp.  Returns 0 on success or an error value.
+ *
+ * PUBLIC: int __db_appname __P((DB_ENV *, APPNAME,
+ * PUBLIC:    const char *, const char *, u_int32_t, DB_FH *, char **));
+ */
+int
+__db_appname(dbenv, appname, dir, file, tmp_oflags, fhp, namep)
+	DB_ENV *dbenv;
+	APPNAME appname;
+	const char *dir, *file;
+	u_int32_t tmp_oflags;
+	DB_FH *fhp;
+	char **namep;
+{
+	DB_ENV etmp;
+	size_t len, str_len;
+	int data_entry, ret, slash, tmp_create, tmp_free;
+	const char *a, *b, *c;
+	char *p, *str;
+
+	/* a, b and c are the leading path components, in order. */
+	a = b = c = NULL;
+	data_entry = -1;
+	tmp_create = tmp_free = 0;
+
+	/*
+	 * We don't return a name when creating temporary files, just a
+	 * file handle.  Default to an error now.
+	 */
+	if (fhp != NULL)
+		F_CLR(fhp, DB_FH_VALID);
+	if (namep != NULL)
+		*namep = NULL;
+
+	/*
+	 * Absolute path names are never modified.  If the file is an absolute
+	 * path, we're done.  If the directory is, simply append the file and
+	 * return.
+	 */
+	if (file != NULL && __os_abspath(file))
+		return (__os_strdup(dbenv, file, namep));
+	if (dir != NULL && __os_abspath(dir)) {
+		a = dir;
+		goto done;
+	}
+
+	/*
+	 * DB_ENV  DIR	   APPNAME	   RESULT
+	 * -------------------------------------------
+	 * null	   null	   none		   <tmp>/file
+	 * null	   set	   none		   DIR/file
+	 * set	   null	   none		   DB_HOME/file
+	 * set	   set	   none		   DB_HOME/DIR/file
+	 *
+	 * DB_ENV  FILE	   APPNAME	   RESULT
+	 * -------------------------------------------
+	 * null	   null	   DB_APP_DATA	   <tmp>/<create>
+	 * null	   set	   DB_APP_DATA	   ./file
+	 * set	   null	   DB_APP_DATA	   <tmp>/<create>
+	 * set	   set	   DB_APP_DATA	   DB_HOME/DB_DATA_DIR/file
+	 *
+	 * DB_ENV  DIR	   APPNAME	   RESULT
+	 * -------------------------------------------
+	 * null	   null	   DB_APP_LOG	   <tmp>/file
+	 * null	   set	   DB_APP_LOG	   DIR/file
+	 * set	   null	   DB_APP_LOG	   DB_HOME/DB_LOG_DIR/file
+	 * set	   set	   DB_APP_LOG	   DB_HOME/DB_LOG_DIR/DIR/file
+	 *
+	 * DB_ENV	   APPNAME	   RESULT
+	 * -------------------------------------------
+	 * null		   DB_APP_TMP*	   <tmp>/<create>
+	 * set		   DB_APP_TMP*	   DB_HOME/DB_TMP_DIR/<create>
+	 */
+retry:	switch (appname) {
+	case DB_APP_NONE:
+		if (dbenv == NULL || !F_ISSET(dbenv, DB_ENV_OPEN_CALLED)) {
+			if (dir == NULL)
+				goto tmp;
+			a = dir;
+		} else {
+			a = dbenv->db_home;
+			b = dir;
+		}
+		break;
+	case DB_APP_DATA:
+		if (dir != NULL) {
+			__db_err(dbenv,
+			    "DB_APP_DATA: illegal directory specification");
+			return (EINVAL);
+		}
+
+		if (file == NULL) {
+			tmp_create = 1;
+			goto tmp;
+		}
+		if (dbenv != NULL && F_ISSET(dbenv, DB_ENV_OPEN_CALLED)) {
+			a = dbenv->db_home;
+			/*
+			 * Step to the next data directory on each pass.  Once
+			 * the list is exhausted fall back to the first entry
+			 * and reset data_entry so no further retry is made.
+			 */
+			if (dbenv->db_data_dir != NULL &&
+			    (b = dbenv->db_data_dir[++data_entry]) == NULL) {
+				data_entry = -1;
+				b = dbenv->db_data_dir[0];
+			}
+		}
+		break;
+	case DB_APP_LOG:
+		if (dbenv == NULL || !F_ISSET(dbenv, DB_ENV_OPEN_CALLED)) {
+			if (dir == NULL)
+				goto tmp;
+			a = dir;
+		} else {
+			a = dbenv->db_home;
+			b = dbenv->db_log_dir;
+			c = dir;
+		}
+		break;
+	case DB_APP_TMP:
+		if (dir != NULL || file != NULL) {
+			__db_err(dbenv,
+		    "DB_APP_TMP: illegal directory or file specification");
+			return (EINVAL);
+		}
+
+		tmp_create = 1;
+		if (dbenv == NULL || !F_ISSET(dbenv, DB_ENV_OPEN_CALLED))
+			goto tmp;
+		else {
+			a = dbenv->db_home;
+			b = dbenv->db_tmp_dir;
+		}
+		break;
+	}
+
+	/*
+	 * Reference a file from the appropriate temporary directory.  This
+	 * block is only ever entered through the "tmp" label.
+	 */
+	if (0) {
+tmp:		if (dbenv == NULL || !F_ISSET(dbenv, DB_ENV_OPEN_CALLED)) {
+			memset(&etmp, 0, sizeof(etmp));
+			if ((ret = __os_tmpdir(&etmp, DB_USE_ENVIRON)) != 0)
+				return (ret);
+			tmp_free = 1;
+			a = etmp.db_tmp_dir;
+		} else
+			a = dbenv->db_tmp_dir;
+	}
+
+	/* Each component needs room for itself plus one separator. */
+done:	len =
+	    (a == NULL ? 0 : strlen(a) + 1) +
+	    (b == NULL ? 0 : strlen(b) + 1) +
+	    (c == NULL ? 0 : strlen(c) + 1) +
+	    (file == NULL ? 0 : strlen(file) + 1);
+
+	/*
+	 * Allocate space to hold the current path information, as well as any
+	 * temporary space that we're going to need to create a temporary file
+	 * name.
+	 */
+#define	DB_TRAIL	"BDBXXXXXX"
+	str_len = len + sizeof(DB_TRAIL) + 10;
+	if ((ret = __os_malloc(dbenv, str_len, NULL, &str)) != 0) {
+		if (tmp_free)
+			__os_freestr(etmp.db_tmp_dir);
+		return (ret);
+	}
+
+	/*
+	 * Assemble the path.  All three leading components are appended, in
+	 * the order given by the table above; "c" is the caller's directory
+	 * below the log directory and was already included in the length.
+	 */
+	slash = 0;
+	p = str;
+	DB_ADDSTR(a);
+	DB_ADDSTR(b);
+	DB_ADDSTR(c);
+	DB_ADDSTR(file);
+	*p = '\0';
+
+	/* Discard any space allocated to find the temp directory. */
+	if (tmp_free) {
+		__os_freestr(etmp.db_tmp_dir);
+		tmp_free = 0;
+	}
+
+	/*
+	 * If we're opening a data file, see if it exists.  If it does,
+	 * return it, otherwise, try and find another one to open.
+	 */
+	if (data_entry != -1 && __os_exists(str, NULL) != 0) {
+		__os_free(str, str_len);
+		a = b = c = NULL;
+		goto retry;
+	}
+
+	/* Create the file if so requested. */
+	if (tmp_create &&
+	    (ret = __db_tmp_open(dbenv, tmp_oflags, str, fhp)) != 0) {
+		__os_free(str, str_len);
+		return (ret);
+	}
+
+	/* Hand the path to the caller, or discard it if it isn't wanted. */
+	if (namep == NULL)
+		__os_free(str, str_len);
+	else
+		*namep = str;
+	return (0);
+}
+
+/*
+ * __db_home --
+ *	Find the database home and store a copy of it in dbenv->db_home.
+ *	Returns 0 if no home is configured, EINVAL if the DB_HOME
+ *	environment variable is consulted and is empty, otherwise the
+ *	result of copying the chosen string.
+ */
+static int
+__db_home(dbenv, db_home, flags)
+	DB_ENV *dbenv;
+	const char *db_home;
+	u_int32_t flags;
+{
+	const char *home;
+
+	/*
+	 * An explicit db_home always wins, so that utilities can override
+	 * the environment, either directly or through a -h option.  The
+	 * DB_HOME environment variable is only consulted when no explicit
+	 * home was given and the flags permit it.
+	 */
+	home = db_home;
+	if (home == NULL) {
+		if (LF_ISSET(DB_USE_ENVIRON) ||
+		    (LF_ISSET(DB_USE_ENVIRON_ROOT) && __os_isroot())) {
+			home = getenv("DB_HOME");
+			if (home != NULL && home[0] == '\0') {
+				__db_err(dbenv,
+				    "illegal DB_HOME environment variable");
+				return (EINVAL);
+			}
+		}
+	}
+
+	if (home == NULL)
+		return (0);
+	return (__os_strdup(dbenv, home, &dbenv->db_home));
+}
+
+/*
+ * __db_parse --
+ *	Parse a single NAME VALUE pair from a DB_CONFIG file and apply it to
+ *	the environment.  The string is modified in place.
+ *
+ *	Returns the result of the matching configuration method, or EINVAL
+ *	if the line is malformed, the name is unknown or the arguments are
+ *	not acceptable.
+ */
+static int
+__db_parse(dbenv, s)
+	DB_ENV *dbenv;
+	char *s;
+{
+	u_long v1, v2, v3;
+	u_int32_t flags;
+	char *name, *p, *value, v4;
+
+	/*
+	 * !!!
+	 * The field width of 40 is hard-coded into the format arguments to
+	 * sscanf below.  A "%40s" conversion stores up to 40 characters plus
+	 * a terminating nul, so the buffer must be one byte larger than the
+	 * width; neither can be changed without changing the other.
+	 */
+	char arg[40 + 1];
+
+	/*
+	 * Name/value pairs are parsed as two white-space separated strings.
+	 * Leading and trailing white-space is trimmed from the value, but
+	 * it may contain embedded white-space.  Note: we use the isspace(3)
+	 * macro because it's more portable, but that means that you can use
+	 * characters like form-feed to separate the strings.  The argument
+	 * is converted to unsigned char, as isspace(3) requires.
+	 */
+	name = s;
+	for (p = name; *p != '\0' && !isspace((unsigned char)*p); ++p)
+		;
+	if (*p == '\0' || p == name)
+		goto illegal;
+	*p = '\0';
+	for (++p; isspace((unsigned char)*p); ++p)
+		;
+	if (*p == '\0')
+		goto illegal;
+	value = p;
+	for (++p; *p != '\0'; ++p)
+		;
+	/* value[0] is not white-space, so this scan stops inside value. */
+	for (--p; isspace((unsigned char)*p); --p)
+		;
+	++p;
+	if (p == value) {
+illegal:	__db_err(dbenv, "mis-formatted name-value pair: %s", s);
+		return (EINVAL);
+	}
+	*p = '\0';
+
+	/*
+	 * In every sscanf call below the trailing " %c" conversion only
+	 * succeeds if unexpected text follows the wanted arguments; checking
+	 * the conversion count therefore rejects trailing garbage.
+	 */
+	if (!strcasecmp(name, "set_cachesize")) {
+		if (sscanf(value, "%lu %lu %lu %c", &v1, &v2, &v3, &v4) != 3)
+			goto badarg;
+		return (dbenv->set_cachesize(dbenv, v1, v2, v3));
+	}
+
+	if (!strcasecmp(name, "set_data_dir") ||
+	    !strcasecmp(name, "db_data_dir"))		/* Compatibility. */
+		return (dbenv->set_data_dir(dbenv, value));
+
+	if (!strcasecmp(name, "set_flags")) {
+		if (sscanf(value, "%40s %c", arg, &v4) != 1)
+			goto badarg;
+
+		if (!strcasecmp(value, "db_cdb_alldb"))
+			return (dbenv->set_flags(dbenv, DB_CDB_ALLDB, 1));
+		if (!strcasecmp(value, "db_nommap"))
+			return (dbenv->set_flags(dbenv, DB_NOMMAP, 1));
+		if (!strcasecmp(value, "db_txn_nosync"))
+			return (dbenv->set_flags(dbenv, DB_TXN_NOSYNC, 1));
+		goto badarg;
+	}
+
+	if (!strcasecmp(name, "set_lg_bsize")) {
+		if (sscanf(value, "%lu %c", &v1, &v4) != 1)
+			goto badarg;
+		return (dbenv->set_lg_bsize(dbenv, v1));
+	}
+
+	if (!strcasecmp(name, "set_lg_max")) {
+		if (sscanf(value, "%lu %c", &v1, &v4) != 1)
+			goto badarg;
+		return (dbenv->set_lg_max(dbenv, v1));
+	}
+
+	if (!strcasecmp(name, "set_lg_dir") ||
+	    !strcasecmp(name, "db_log_dir"))		/* Compatibility. */
+		return (dbenv->set_lg_dir(dbenv, value));
+
+	if (!strcasecmp(name, "set_lk_detect")) {
+		if (sscanf(value, "%40s %c", arg, &v4) != 1)
+			goto badarg;
+		if (!strcasecmp(value, "db_lock_default"))
+			flags = DB_LOCK_DEFAULT;
+		else if (!strcasecmp(value, "db_lock_oldest"))
+			flags = DB_LOCK_OLDEST;
+		else if (!strcasecmp(value, "db_lock_random"))
+			flags = DB_LOCK_RANDOM;
+		else if (!strcasecmp(value, "db_lock_youngest"))
+			flags = DB_LOCK_YOUNGEST;
+		else
+			goto badarg;
+		return (dbenv->set_lk_detect(dbenv, flags));
+	}
+
+	if (!strcasecmp(name, "set_lk_max")) {
+		if (sscanf(value, "%lu %c", &v1, &v4) != 1)
+			goto badarg;
+		return (dbenv->set_lk_max(dbenv, v1));
+	}
+
+	if (!strcasecmp(name, "set_lk_max_locks")) {
+		if (sscanf(value, "%lu %c", &v1, &v4) != 1)
+			goto badarg;
+		return (dbenv->set_lk_max_locks(dbenv, v1));
+	}
+
+	if (!strcasecmp(name, "set_lk_max_lockers")) {
+		if (sscanf(value, "%lu %c", &v1, &v4) != 1)
+			goto badarg;
+		return (dbenv->set_lk_max_lockers(dbenv, v1));
+	}
+
+	if (!strcasecmp(name, "set_lk_max_objects")) {
+		if (sscanf(value, "%lu %c", &v1, &v4) != 1)
+			goto badarg;
+		return (dbenv->set_lk_max_objects(dbenv, v1));
+	}
+
+	if (!strcasecmp(name, "set_mp_mmapsize")) {
+		if (sscanf(value, "%lu %c", &v1, &v4) != 1)
+			goto badarg;
+		return (dbenv->set_mp_mmapsize(dbenv, v1));
+	}
+
+	/* The only value accepted for set_region_init is 1. */
+	if (!strcasecmp(name, "set_region_init")) {
+		if (sscanf(value, "%lu %c", &v1, &v4) != 1 || v1 != 1)
+			goto badarg;
+		return (db_env_set_region_init(v1));
+	}
+
+	if (!strcasecmp(name, "set_shm_key")) {
+		if (sscanf(value, "%lu %c", &v1, &v4) != 1)
+			goto badarg;
+		return (dbenv->set_shm_key(dbenv, (long)v1));
+	}
+
+	if (!strcasecmp(name, "set_tas_spins")) {
+		if (sscanf(value, "%lu %c", &v1, &v4) != 1)
+			goto badarg;
+		return (db_env_set_tas_spins(v1));
+	}
+
+	if (!strcasecmp(name, "set_tmp_dir") ||
+	    !strcasecmp(name, "db_tmp_dir"))		/* Compatibility.*/
+		return (dbenv->set_tmp_dir(dbenv, value));
+
+	if (!strcasecmp(name, "set_tx_max")) {
+		if (sscanf(value, "%lu %c", &v1, &v4) != 1)
+			goto badarg;
+		return (dbenv->set_tx_max(dbenv, v1));
+	}
+
+	if (!strcasecmp(name, "set_verbose")) {
+		if (sscanf(value, "%40s %c", arg, &v4) != 1)
+			goto badarg;
+
+		if (!strcasecmp(value, "db_verb_chkpoint"))
+			flags = DB_VERB_CHKPOINT;
+		else if (!strcasecmp(value, "db_verb_deadlock"))
+			flags = DB_VERB_DEADLOCK;
+		else if (!strcasecmp(value, "db_verb_recovery"))
+			flags = DB_VERB_RECOVERY;
+		else if (!strcasecmp(value, "db_verb_waitsfor"))
+			flags = DB_VERB_WAITSFOR;
+		else
+			goto badarg;
+		return (dbenv->set_verbose(dbenv, flags, 1));
+	}
+
+	__db_err(dbenv, "unrecognized name-value pair: %s", s);
+	return (EINVAL);
+
+badarg:	__db_err(dbenv, "incorrect arguments for name-value pair: %s", s);
+	return (EINVAL);
+}
+
+/*
+ * __db_tmp_open --
+ *	Create a temporary file.  "path" names an existing directory on
+ *	entry and is extended in place with a separator and a generated
+ *	file name; the caller must have left room for that.  The opened
+ *	file is returned through fhp.
+ */
+static int
+__db_tmp_open(dbenv, tmp_oflags, path, fhp)
+	DB_ENV *dbenv;
+	u_int32_t tmp_oflags;
+	char *path;
+	DB_FH *fhp;
+{
+	u_long pid;
+	int mode, isdir, ret;
+	const char *p;
+	char *trv;
+
+	/*
+	 * Check the target directory; if you have six X's and it doesn't
+	 * exist, this runs for a *very* long time.
+	 */
+	ret = __os_exists(path, &isdir);
+	if (ret != 0) {
+		__db_err(dbenv, "%s: %s", path, db_strerror(ret));
+		return (ret);
+	}
+	if (!isdir) {
+		__db_err(dbenv, "%s: %s", path, db_strerror(EINVAL));
+		return (EINVAL);
+	}
+
+	/*
+	 * Build the path: find the end of the directory name, add a
+	 * separator and copy the template, terminating nul included.
+	 * On exit trv references that nul.
+	 */
+	trv = path;
+	while (*trv != '\0')
+		++trv;
+	*trv = PATH_SEPARATOR[0];
+	p = DB_TRAIL;
+	do {
+		*++trv = *p;
+	} while (*p++ != '\0');
+
+	/*
+	 * Replace the X's with the process ID, least significant digit
+	 * last in the name.  Pid should be a pid_t, but we use unsigned
+	 * long for portability.
+	 */
+	pid = getpid();
+	while (*--trv == 'X') {
+		*trv = (char)('0' + pid % 10);
+		pid /= 10;
+	}
+	/* Step back onto the first character that was substituted. */
+	++trv;
+
+	/* Set up open flags and mode. */
+	mode = __db_omode("rw----");
+
+	/* Loop, trying to open a file. */
+	while ((ret = __os_open(dbenv, path,
+	    tmp_oflags | DB_OSO_CREATE | DB_OSO_EXCL, mode, fhp)) != 0) {
+		/*
+		 * !!!:
+		 * If we don't get an EEXIST error, then there's something
+		 * seriously wrong.  Unfortunately, if the implementation
+		 * doesn't return EEXIST for O_CREAT and O_EXCL regardless
+		 * of other possible errors, we've lost.
+		 */
+		if (ret != EEXIST) {
+			__db_err(dbenv,
+			    "tmp_open: %s: %s", path, db_strerror(ret));
+			return (ret);
+		}
+
+		/*
+		 * Tricky little algorithm for backward compatibility.
+		 * Assumes sequential ordering of lower-case characters.
+		 *
+		 * Treat the substituted characters as a counter: every
+		 * position that has reached 'z' wraps to 'a' and carries to
+		 * the next one.  trv keeps its position between attempts.
+		 */
+		while (*trv == 'z')
+			*trv++ = 'a';
+		if (*trv == '\0')
+			return (EINVAL);
+		if (isdigit((int)*trv))
+			*trv = 'a';
+		else
+			++*trv;
+	}
+
+	return (0);
+}
diff --git a/bdb/env/env_recover.c b/bdb/env/env_recover.c
new file mode 100644
index 00000000000..bc5e4760584
--- /dev/null
+++ b/bdb/env/env_recover.c
@@ -0,0 +1,449 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char copyright[] =
+ "Copyright (c) 1996-2000\nSleepycat Software Inc. All rights reserved.\n";
+static const char revid[] =
+ "$Id: env_recover.c,v 11.33 2001/01/04 22:38:42 ubell Exp $";
+#endif
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#if TIME_WITH_SYS_TIME
+#include <sys/time.h>
+#include <time.h>
+#else
+#if HAVE_SYS_TIME_H
+#include <sys/time.h>
+#else
+#include <time.h>
+#endif
+#endif
+
+#include <string.h>
+#endif
+
+#include "db_int.h"
+#include "db_page.h"
+#include "db_dispatch.h"
+#include "db_am.h"
+#include "log.h"
+#include "txn.h"
+
+static float __lsn_diff __P((DB_LSN *, DB_LSN *, DB_LSN *, u_int32_t, int));
+static int __log_earliest __P((DB_ENV *, int32_t *, DB_LSN *));
+
+/*
+ * __db_apprec --
+ *	Perform recovery.
+ *
+ *	"flags" carries DB_RECOVER and/or DB_RECOVER_FATAL.  While recovery
+ *	runs, the environment's thread flag is cleared and the log handle is
+ *	marked as recovering; both are put back on every exit path.
+ *	Returns 0 on success (including the case where there is no log to
+ *	recover) or an error value.
+ *
+ * PUBLIC: int __db_apprec __P((DB_ENV *, u_int32_t));
+ */
+int
+__db_apprec(dbenv, flags)
+	DB_ENV *dbenv;
+	u_int32_t flags;
+{
+	DBT data;
+	DB_LSN ckp_lsn, first_lsn, last_lsn, lowlsn, lsn, open_lsn;
+	DB_TXNREGION *region;
+	__txn_ckp_args *ckp_args;
+	time_t now, tlow;
+	float nfiles;
+	int32_t low;
+	int is_thread, progress, ret;
+	void *txninfo;
+
+	COMPQUIET(nfiles, (float)0);
+
+	/*
+	 * The cleanup code at "out" tests ckp_args, and can be reached
+	 * before pass #0 has had a chance to assign it.
+	 */
+	ckp_args = NULL;
+
+	/*
+	 * Save the state of the thread flag -- we don't need it on at the
+	 * moment because we're single-threaded until recovery is complete.
+	 */
+	is_thread = F_ISSET(dbenv, DB_ENV_THREAD) ? 1 : 0;
+	F_CLR(dbenv, DB_ENV_THREAD);
+	F_SET((DB_LOG *)dbenv->lg_handle, DBLOG_RECOVER);
+
+	/*
+	 * If the user is specifying recover to a particular point in time,
+	 * verify that the logs present are sufficient to do this.
+	 */
+	ZERO_LSN(lowlsn);
+	if (dbenv->tx_timestamp != 0) {
+		if ((ret = __log_earliest(dbenv, &low, &lowlsn)) != 0)
+			goto early;
+		if ((int32_t)dbenv->tx_timestamp < low) {
+			char t1[30], t2[30];
+
+			/* ctime(3) reuses a static buffer; copy each result. */
+			strcpy(t1, ctime(&dbenv->tx_timestamp));
+			tlow = (time_t)low;
+			strcpy(t2, ctime(&tlow));
+			__db_err(dbenv,
+		    "Invalid recovery timestamp %.*s; earliest time is %.*s",
+			    24, t1, 24, t2);
+			ret = EINVAL;
+			goto early;
+		}
+	}
+
+	/* Initialize the transaction list. */
+	if ((ret = __db_txnlist_init(dbenv, &txninfo)) != 0)
+		goto early;
+
+	/*
+	 * Recovery is done in three passes:
+	 * Pass #0:
+	 *	We need to find the position from which we will open files
+	 *	We need to open files beginning with the last to next
+	 *	checkpoint because we might have crashed after writing the
+	 *	last checkpoint record, but before having written out all
+	 *	the open file information.
+	 *
+	 * Pass #1:
+	 *	Read forward through the log from the second to last checkpoint
+	 *	opening and closing files so that at the end of the log we have
+	 *	the "current" set of files open.
+	 *
+	 * Pass #2:
+	 *	Read backward through the log undoing any uncompleted TXNs.
+	 *	There are three cases:
+	 *	    1.  If doing catastrophic recovery, we read to the beginning
+	 *		of the log
+	 *	    2.  If we are doing normal recovery, then we have to roll
+	 *		back to the most recent checkpoint that occurs
+	 *		before the most recent checkpoint LSN, which is
+	 *		returned by __log_findckp().
+	 *	    3.  If we are recovering to a point in time, then we have
+	 *		to roll back to the checkpoint whose ckp_lsn is earlier
+	 *		than the specified time.  __log_earliest will figure
+	 *		this out for us.
+	 *	In case 3, "uncompleted TXNs" include all those who committed
+	 *	after the user's specified timestamp.
+	 *
+	 * Pass #3:
+	 *	Read forward through the log from the LSN found in pass #2,
+	 *	redoing any committed TXNs (which committed after any user-
+	 *	specified rollback point).  During this pass, checkpoint
+	 *	file information is ignored, and file openings and closings
+	 *	are redone.
+	 */
+
+	/*
+	 * Find out the last lsn, so that we can estimate how far along we
+	 * are in recovery.  This will help us determine how much log there
+	 * is between the first LSN that we're going to be working with and
+	 * the last one.  We assume that each of the three phases takes the
+	 * same amount of time (a false assumption) and then use the %-age
+	 * of the amount of log traversed to figure out how much of the
+	 * pass we've accomplished.
+	 *
+	 * last_lsn and nfiles are only set, and only used, when a feedback
+	 * function is configured.
+	 */
+	memset(&data, 0, sizeof(data));
+	if (dbenv->db_feedback != NULL &&
+	    (ret = log_get(dbenv, &last_lsn, &data, DB_LAST)) != 0)
+		goto out;
+
+	/*
+	 * Pass #0
+	 * Find the second to last checkpoint in the log.  This is the point
+	 * from which we want to begin pass #1 (the open files pass).
+	 */
+	if (LF_ISSET(DB_RECOVER_FATAL)) {
+		if ((ret = log_get(dbenv, &ckp_lsn, &data, DB_FIRST)) != 0) {
+			if (ret == DB_NOTFOUND)
+				ret = 0;
+			else
+				__db_err(dbenv, "First log record not found");
+			goto out;
+		}
+		open_lsn = ckp_lsn;
+	} else if ((ret =
+	    log_get(dbenv, &ckp_lsn, &data, DB_CHECKPOINT)) != 0) {
+		/*
+		 * If we don't find a checkpoint, start from the beginning.
+		 * If that fails, we're done.  Note, we do not require that
+		 * there be log records if we're performing recovery.
+		 */
+first:		if ((ret = log_get(dbenv, &ckp_lsn, &data, DB_FIRST)) != 0) {
+			if (ret == DB_NOTFOUND)
+				ret = 0;
+			else
+				__db_err(dbenv, "First log record not found");
+			goto out;
+		}
+		open_lsn = ckp_lsn;
+	} else if ((ret = __txn_ckp_read(dbenv, data.data, &ckp_args)) != 0) {
+		__db_err(dbenv, "Invalid checkpoint record at [%lu][%lu]\n",
+		    (u_long)ckp_lsn.file, (u_long)ckp_lsn.offset);
+		goto out;
+	} else if (IS_ZERO_LSN(ckp_args->last_ckp) ||
+	    (ret = log_get(dbenv, &ckp_args->last_ckp, &data, DB_SET)) != 0)
+		goto first;
+	else
+		open_lsn = ckp_args->last_ckp;
+
+	if (dbenv->db_feedback != NULL) {
+		if (last_lsn.file == open_lsn.file)
+			nfiles = (float)(last_lsn.offset - open_lsn.offset) /
+			    dbenv->lg_max;
+		else
+			nfiles = (float)(last_lsn.file - open_lsn.file) +
+			    (float)(dbenv->lg_max - open_lsn.offset +
+			    last_lsn.offset) / dbenv->lg_max;
+		/* We are going to divide by nfiles; make sure it isn't 0. */
+		if (nfiles == 0)
+			nfiles = (float)0.001;
+	}
+
+	/*
+	 * Pass #1
+	 * Now, ckp_lsn is either the lsn of the last checkpoint
+	 * or the lsn of the first record in the log.  Open_lsn is
+	 * the second to last checkpoint or the beginning of the log;
+	 * begin the open files pass from that lsn, and proceed to
+	 * the end of the log.
+	 */
+	lsn = open_lsn;
+	for (;;) {
+		if (dbenv->db_feedback != NULL) {
+			progress = (int)(33 * (__lsn_diff(&open_lsn,
+			   &last_lsn, &lsn, dbenv->lg_max, 1) / nfiles));
+			dbenv->db_feedback(dbenv, DB_RECOVER, progress);
+		}
+		ret = __db_dispatch(dbenv,
+		    &data, &lsn, DB_TXN_OPENFILES, txninfo);
+		if (ret != 0 && ret != DB_TXN_CKP)
+			goto msgerr;
+		if ((ret = log_get(dbenv, &lsn, &data, DB_NEXT)) != 0) {
+			if (ret == DB_NOTFOUND)
+				break;
+			goto out;
+		}
+	}
+
+	/*
+	 * Pass #2.
+	 *
+	 * Before we can begin pass #2, backward roll phase, we determine how
+	 * far back in the log to recover.  If we are doing catastrophic
+	 * recovery, then we go as far back as we have files.  If we are
+	 * doing normal recovery, we go as back to the most recent checkpoint
+	 * that occurs before the most recent checkpoint LSN.  If we are
+	 * recovering to a point in time, then rollback to the checkpoint whose
+	 * ckp_lsn precedes the first log record (and then roll forward to
+	 * the appropriate timestamp in Pass #3).
+	 */
+	if (LF_ISSET(DB_RECOVER_FATAL)) {
+		ZERO_LSN(first_lsn);
+	} else if (dbenv->tx_timestamp != 0)
+		first_lsn = lowlsn;
+	else
+		if ((ret = __log_findckp(dbenv, &first_lsn)) == DB_NOTFOUND) {
+			/*
+			 * We don't require that log files exist if recovery
+			 * was specified.
+			 */
+			ret = 0;
+			goto out;
+		}
+
+	if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY))
+		__db_err(dbenv, "Recovery starting from [%lu][%lu]",
+		    (u_long)first_lsn.file, (u_long)first_lsn.offset);
+
+	for (ret = log_get(dbenv, &lsn, &data, DB_LAST);
+	    ret == 0 && log_compare(&lsn, &first_lsn) > 0;
+	    ret = log_get(dbenv, &lsn, &data, DB_PREV)) {
+		if (dbenv->db_feedback != NULL) {
+			progress = 34 + (int)(33 * (__lsn_diff(&open_lsn,
+			    &last_lsn, &lsn, dbenv->lg_max, 0) / nfiles));
+			dbenv->db_feedback(dbenv, DB_RECOVER, progress);
+		}
+		ret = __db_dispatch(dbenv,
+		    &data, &lsn, DB_TXN_BACKWARD_ROLL, txninfo);
+		/* DB_TXN_CKP from the dispatcher is not an error. */
+		if (ret != 0) {
+			if (ret != DB_TXN_CKP)
+				goto msgerr;
+			else
+				ret = 0;
+		}
+	}
+	if (ret != 0 && ret != DB_NOTFOUND)
+		goto out;
+
+	/*
+	 * Pass #3.
+	 */
+	for (ret = log_get(dbenv, &lsn, &data, DB_NEXT);
+	    ret == 0; ret = log_get(dbenv, &lsn, &data, DB_NEXT)) {
+		if (dbenv->db_feedback != NULL) {
+			progress = 67 + (int)(33 * (__lsn_diff(&open_lsn,
+			    &last_lsn, &lsn, dbenv->lg_max, 1) / nfiles));
+			dbenv->db_feedback(dbenv, DB_RECOVER, progress);
+		}
+		ret = __db_dispatch(dbenv,
+		    &data, &lsn, DB_TXN_FORWARD_ROLL, txninfo);
+		if (ret != 0) {
+			if (ret != DB_TXN_CKP)
+				goto msgerr;
+			else
+				ret = 0;
+		}
+	}
+	/* The forward pass must end by running off the end of the log. */
+	if (ret != DB_NOTFOUND)
+		goto out;
+
+	/*
+	 * Process any pages that were on the limbo list
+	 * and move them to the free list.  Do this
+	 * before checkpointing the database.
+	 */
+	if ((ret = __db_do_the_limbo(dbenv, txninfo)) != 0)
+		goto out;
+
+	/*
+	 * Now set the last checkpoint lsn and the current time,
+	 * take a checkpoint, and reset the txnid.
+	 */
+	(void)time(&now);
+	region = ((DB_TXNMGR *)dbenv->tx_handle)->reginfo.primary;
+	region->last_txnid = ((DB_TXNHEAD *)txninfo)->maxid;
+	region->last_ckp = ckp_lsn;
+	region->time_ckp = (u_int32_t)now;
+
+	/*
+	 * Take two checkpoints so that we don't re-recover any of the
+	 * work we've already done.
+	 */
+	if ((ret = txn_checkpoint(dbenv, 0, 0, DB_FORCE)) != 0)
+		goto out;
+
+	/* Now close all the db files that are open. */
+	__log_close_files(dbenv);
+
+	if ((ret = txn_checkpoint(dbenv, 0, 0, DB_FORCE)) != 0)
+		goto out;
+	region->last_txnid = TXN_MINIMUM;
+
+	if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY)) {
+		__db_err(dbenv, "Recovery complete at %.24s", ctime(&now));
+		__db_err(dbenv, "%s %lx %s [%lu][%lu]",
+		    "Maximum transaction ID",
+		    (u_long)((DB_TXNHEAD *)txninfo)->maxid,
+		    "Recovery checkpoint",
+		    (u_long)region->last_ckp.file,
+		    (u_long)region->last_ckp.offset);
+	}
+
+	/* Only reached through the "msgerr" label. */
+	if (0) {
+msgerr:		__db_err(dbenv, "Recovery function for LSN %lu %lu failed",
+		    (u_long)lsn.file, (u_long)lsn.offset);
+	}
+
+out:	if (is_thread)
+		F_SET(dbenv, DB_ENV_THREAD);
+	__db_txnlist_end(dbenv, txninfo);
+	if (ckp_args != NULL)
+		__os_free(ckp_args, sizeof(*ckp_args));
+	F_CLR((DB_LOG *)dbenv->lg_handle, DBLOG_RECOVER);
+
+	dbenv->tx_timestamp = 0;
+	return (ret);
+
+	/*
+	 * Failures before the transaction list exists: there is nothing to
+	 * discard, but the flags changed on entry still have to be restored.
+	 */
+early:	if (is_thread)
+		F_SET(dbenv, DB_ENV_THREAD);
+	F_CLR((DB_LOG *)dbenv->lg_handle, DBLOG_RECOVER);
+	return (ret);
+}
+
+/*
+ * __lsn_diff --
+ *	Figure out how many logfiles we have processed, as a possibly
+ *	fractional count.  Moving forward (is_forward != 0) the result is
+ *	current - low; moving backward it is high - current.  max is the
+ *	number of bytes per logfile.
+ */
+static float
+__lsn_diff(low, high, current, max, is_forward)
+	DB_LSN *low, *high, *current;
+	u_int32_t max;
+	int is_forward;
+{
+	DB_LSN *from, *to;
+	float nf;
+
+	/* Both directions measure the distance from one LSN up to another. */
+	if (is_forward) {
+		from = low;
+		to = current;
+	} else {
+		from = current;
+		to = high;
+	}
+
+	/*
+	 * There are three cases.  Within a single file only the offsets
+	 * matter.  Across files, the end offset is either behind the start
+	 * offset, so one of the whole files is only partly covered, or it
+	 * is not, and the offset difference is added to the file count.
+	 */
+	if (to->file == from->file)
+		nf = (float)(to->offset - from->offset) / max;
+	else if (to->offset < from->offset)
+		nf = (float)(to->file - from->file - 1) +
+		    (float)(max - from->offset + to->offset) / max;
+	else
+		nf = (float)(to->file - from->file) +
+		    (float)(to->offset - from->offset) / max;
+
+	return (nf);
+}
+
+/*
+ * __log_earliest --
+ *
+ * Return the earliest recovery point for the log files present.  The
+ * earliest recovery time is the time stamp of the first checkpoint record
+ * whose checkpoint LSN is greater than the first LSN we process.
+ *
+ * On success (0), *lowtime and *lowlsn hold that checkpoint's timestamp
+ * and checkpoint LSN.  Otherwise the error from reading the log or from
+ * decoding a checkpoint record is returned; running off the end of the
+ * log without finding a suitable checkpoint returns the log_get error.
+ */
+static int
+__log_earliest(dbenv, lowtime, lowlsn)
+	DB_ENV *dbenv;
+	int32_t *lowtime;
+	DB_LSN *lowlsn;
+{
+	DB_LSN first_lsn, lsn;
+	DBT data;
+	__txn_ckp_args *ckpargs;
+	u_int32_t rectype;
+	int cmp, ret;
+
+	memset(&data, 0, sizeof(data));
+
+	/*
+	 * Read forward through the log looking for the first checkpoint
+	 * record whose ckp_lsn is greater than first_lsn.
+	 */
+	for (ret = log_get(dbenv, &first_lsn, &data, DB_FIRST);
+	    ret == 0; ret = log_get(dbenv, &lsn, &data, DB_NEXT)) {
+		/* The record type is stored at the start of the record. */
+		memcpy(&rectype, data.data, sizeof(rectype));
+		if (rectype != DB_txn_ckp)
+			continue;
+
+		/*
+		 * Stop on a checkpoint record that cannot be decoded, so the
+		 * error is returned rather than being overwritten by the
+		 * next log_get call.
+		 */
+		if ((ret = __txn_ckp_read(dbenv, data.data, &ckpargs)) != 0)
+			break;
+
+		/* Remember the latest candidate seen so far. */
+		cmp = log_compare(&ckpargs->ckp_lsn, &first_lsn);
+		*lowlsn = ckpargs->ckp_lsn;
+		*lowtime = ckpargs->timestamp;
+
+		__os_free(ckpargs, 0);
+		if (cmp >= 0)
+			break;
+	}
+
+	return (ret);
+}
diff --git a/bdb/env/env_region.c b/bdb/env/env_region.c
new file mode 100644
index 00000000000..f3df4bac184
--- /dev/null
+++ b/bdb/env/env_region.c
@@ -0,0 +1,1205 @@
+/*-
+ * See the file LICENSE for redistribution information.
+ *
+ * Copyright (c) 1996, 1997, 1998, 1999, 2000
+ * Sleepycat Software. All rights reserved.
+ */
+
+#include "db_config.h"
+
+#ifndef lint
+static const char revid[] = "$Id: env_region.c,v 11.28 2000/12/12 17:36:10 bostic Exp $";
+#endif /* not lint */
+
+#ifndef NO_SYSTEM_INCLUDES
+#include <sys/types.h>
+
+#include <ctype.h>
+#include <string.h>
+#include <unistd.h>
+#endif
+
+#include "db_int.h"
+#include "db_shash.h"
+#include "lock.h"
+#include "lock_ext.h"
+#include "log.h"
+#include "log_ext.h"
+#include "mp.h"
+#include "mp_ext.h"
+#include "txn.h"
+#include "txn_ext.h"
+
+static int __db_des_destroy __P((DB_ENV *, REGION *));
+static int __db_des_get __P((DB_ENV *, REGINFO *, REGINFO *, REGION **));
+static int __db_e_remfile __P((DB_ENV *));
+static int __db_faultmem __P((void *, size_t, int));
+static void __db_region_destroy __P((DB_ENV *, REGINFO *));
+
+/*
+ * __db_e_attach
+ *	Join/create the environment
+ *
+ *	dbenv		Environment handle. Its DB_ENV_CREATE, DB_ENV_PRIVATE,
+ *			DB_ENV_THREAD and DB_ENV_SYSTEM_MEM flags are consulted,
+ *			and DB_ENV_SYSTEM_MEM is set when the environment file
+ *			turns out to hold only a REGENV_REF.
+ *	init_flagsp	May be NULL. When creating, *init_flagsp is stored in
+ *			the region as init_flags; when joining, the stored
+ *			init_flags are copied back into *init_flagsp.
+ *
+ *	Returns 0 with dbenv->reginfo set on success. A join that keeps
+ *	finding a half-built region is retried (sleeping in between) and
+ *	finally fails with EAGAIN; any other failure returns its error code
+ *	after detaching and freeing everything acquired here.
+ *
+ * PUBLIC: int __db_e_attach __P((DB_ENV *, u_int32_t *));
+ */
+int
+__db_e_attach(dbenv, init_flagsp)
+	DB_ENV *dbenv;
+	u_int32_t *init_flagsp;
+{
+	REGENV *renv;
+	REGENV_REF ref;
+	REGINFO *infop;
+	REGION *rp, tregion;
+	size_t size;
+	size_t nrw;
+	u_int32_t mbytes, bytes;
+	int retry_cnt, ret, segid;
+	char buf[sizeof(DB_REGION_FMT) + 20];
+
+#if !defined(HAVE_MUTEX_THREADS)
+	/*
+	 * !!!
+	 * If we don't have spinlocks, we need a file descriptor for fcntl(2)
+	 * locking. We use the file handle from the REGENV file for this
+	 * purpose.
+	 *
+	 * Since we may be using shared memory regions, e.g., shmget(2), and
+	 * not a mapped-in regular file, the backing file may be only a few
+	 * bytes in length. So, this depends on the ability to call fcntl to
+	 * lock file offsets much larger than the actual physical file. I
+	 * think that's safe -- besides, very few systems actually need this
+	 * kind of support, SunOS is the only one still in wide use of which
+	 * I'm aware.
+	 *
+	 * The error case is if an application lacks spinlocks and wants to be
+	 * threaded. That doesn't work because fcntl may lock the underlying
+	 * process, including all its threads.
+	 */
+	if (F_ISSET(dbenv, DB_ENV_THREAD)) {
+		__db_err(dbenv,
+"architecture lacks fast mutexes: applications cannot be threaded");
+		return (EINVAL);
+	}
+#endif
+
+	/* Initialization */
+	retry_cnt = 0;
+
+	/* Repeated initialization. */
+loop:	renv = NULL;
+
+	/* Set up the DB_ENV's REG_INFO structure. */
+	if ((ret = __os_calloc(dbenv, 1, sizeof(REGINFO), &infop)) != 0)
+		return (ret);
+	infop->type = REGION_TYPE_ENV;
+	infop->id = REGION_ID_ENV;
+	infop->mode = dbenv->db_mode;
+	infop->flags = REGION_JOIN_OK;
+	if (F_ISSET(dbenv, DB_ENV_CREATE))
+		F_SET(infop, REGION_CREATE_OK);
+
+	/*
+	 * We have to single-thread the creation of the REGENV region. Once
+	 * it exists, we can do locking using locks in the region, but until
+	 * then we have to be the only player in the game.
+	 *
+	 * If this is a private environment, we are only called once and there
+	 * are no possible race conditions.
+	 *
+	 * If this is a public environment, we use the filesystem to ensure
+	 * the creation of the environment file is single-threaded.
+	 */
+	if (F_ISSET(dbenv, DB_ENV_PRIVATE))
+		goto creation;
+
+	/* Build the region name. */
+	(void)snprintf(buf, sizeof(buf), "%s", DB_REGION_ENV);
+	if ((ret = __db_appname(dbenv,
+	    DB_APP_NONE, NULL, buf, 0, NULL, &infop->name)) != 0)
+		goto err;
+
+	/*
+	 * Try to create the file, if we have the authority. We have to ensure
+	 * that multiple threads/processes attempting to simultaneously create
+	 * the file are properly ordered. Open using the O_CREAT and O_EXCL
+	 * flags so that multiple attempts to create the region will return
+	 * failure in all but one. POSIX 1003.1 requires that EEXIST be the
+	 * errno return value -- I sure hope they're right.
+	 */
+	if (F_ISSET(dbenv, DB_ENV_CREATE)) {
+		if ((ret = __os_open(dbenv,
+		    infop->name, DB_OSO_REGION | DB_OSO_CREATE | DB_OSO_EXCL,
+		    dbenv->db_mode, dbenv->lockfhp)) == 0)
+			goto creation;
+		if (ret != EEXIST) {
+			__db_err(dbenv,
+			    "%s: %s", infop->name, db_strerror(ret));
+			goto err;
+		}
+	}
+
+	/*
+	 * If we couldn't create the file, try and open it. (If that fails,
+	 * we're done.)
+	 */
+	if ((ret = __os_open(dbenv, infop->name,
+	    DB_OSO_REGION, dbenv->db_mode, dbenv->lockfhp)) != 0)
+		goto err;
+
+	/*
+	 * !!!
+	 * The region may be in system memory not backed by the filesystem
+	 * (more specifically, not backed by this file), and we're joining
+	 * it. In that case, the process that created it will have written
+	 * out a REGENV_REF structure as its only contents. We read that
+	 * structure before we do anything further, e.g., we can't just map
+	 * that file in and then figure out what's going on.
+	 *
+	 * All of this noise is because some systems don't have a coherent VM
+	 * and buffer cache, and what's worse, when you mix operations on the
+	 * VM and buffer cache, half the time you hang the system.
+	 *
+	 * If the file is the size of an REGENV_REF structure, then we know
+	 * the real region is in some other memory. (The only way you get a
+	 * file that size is to deliberately write it, as it's smaller than
+	 * any possible disk sector created by writing a file or mapping the
+	 * file into memory.) In which case, retrieve the structure from the
+	 * file and use it to acquire the referenced memory.
+	 *
+	 * If the structure is larger than a REGENV_REF structure, then this
+	 * file is backing the shared memory region, and we just map it into
+	 * memory.
+	 *
+	 * And yes, this makes me want to take somebody and kill them. (I
+	 * digress -- but you have no freakin' idea. This is unbelievably
+	 * stupid and gross, and I've probably spent six months of my life,
+	 * now, trying to make different versions of it work.)
+	 */
+	if ((ret = __os_ioinfo(dbenv, infop->name,
+	    dbenv->lockfhp, &mbytes, &bytes, NULL)) != 0) {
+		__db_err(dbenv, "%s: %s", infop->name, db_strerror(ret));
+		goto err;
+	}
+
+	/*
+	 * !!!
+	 * A size_t is OK -- regions get mapped into memory, and so can't
+	 * be larger than a size_t.
+	 */
+	size = mbytes * MEGABYTE + bytes;
+
+	/*
+	 * If the size is less than the size of a REGENV_REF structure, the
+	 * region (or, possibly, the REGENV_REF structure) has not yet been
+	 * completely written. Wait awhile and try again.
+	 *
+	 * Otherwise, if the size is the size of a REGENV_REF structure,
+	 * read it into memory and use it as a reference to the real region.
+	 */
+	if (size <= sizeof(ref)) {
+		/* Here ret is 0, so "retry" means sleep and loop. */
+		if (size != sizeof(ref))
+			goto retry;
+
+		if ((ret = __os_read(dbenv, dbenv->lockfhp, &ref,
+		    sizeof(ref), &nrw)) != 0 || nrw < (size_t)sizeof(ref)) {
+			if (ret == 0)
+				ret = EIO;
+			__db_err(dbenv,
+		    "%s: unable to read system-memory information from: %s",
+			    infop->name, db_strerror(ret));
+			goto err;
+		}
+		size = ref.size;
+		segid = ref.segid;
+
+		F_SET(dbenv, DB_ENV_SYSTEM_MEM);
+	} else if (F_ISSET(dbenv, DB_ENV_SYSTEM_MEM)) {
+		ret = EINVAL;
+		__db_err(dbenv,
+		    "%s: existing environment not created in system memory: %s",
+		    infop->name, db_strerror(ret));
+		goto err;
+	} else
+		segid = INVALID_REGION_SEGID;
+
+	/*
+	 * If not doing thread locking, we need to save the file handle for
+	 * fcntl(2) locking. Otherwise, discard the handle, we no longer
+	 * need it, and the less contact between the buffer cache and the VM,
+	 * the better.
+	 */
+#ifdef HAVE_MUTEX_THREADS
+	(void)__os_closehandle(dbenv->lockfhp);
+#endif
+
+	/* Call the region join routine to acquire the region. */
+	memset(&tregion, 0, sizeof(tregion));
+	tregion.size = size;
+	tregion.segid = segid;
+	if ((ret = __os_r_attach(dbenv, infop, &tregion)) != 0)
+		goto err;
+
+	/*
+	 * The environment's REGENV structure has to live at offset 0 instead
+	 * of the usual shalloc information. Set the primary reference and
+	 * correct the "addr" value to reference the shalloc region. Note,
+	 * this means that all of our offsets (R_ADDR/R_OFFSET) get shifted
+	 * as well, but that should be fine.
+	 */
+	infop->primary = R_ADDR(infop, 0);
+	infop->addr = (u_int8_t *)infop->addr + sizeof(REGENV);
+
+	/*
+	 * Check if the environment has had a catastrophic failure.
+	 *
+	 * Check the magic number to ensure the region is initialized. If the
+	 * magic number isn't set, the lock may not have been initialized, and
+	 * an attempt to use it could lead to random behavior.
+	 *
+	 * The panic and magic values aren't protected by any lock, so we never
+	 * use them in any check that's more complex than set/not-set.
+	 *
+	 * !!!
+	 * I'd rather play permissions games using the underlying file, but I
+	 * can't because Windows/NT filesystems won't open files mode 0.
+	 */
+	renv = infop->primary;
+	if (renv->panic) {
+		ret = __db_panic_msg(dbenv);
+		goto err;
+	}
+	if (renv->magic != DB_REGION_MAGIC)
+		goto retry;
+
+	/* Make sure the region matches our build. */
+	if (renv->majver != DB_VERSION_MAJOR ||
+	    renv->minver != DB_VERSION_MINOR ||
+	    renv->patch != DB_VERSION_PATCH) {
+		__db_err(dbenv,
+	"Program version %d.%d.%d doesn't match environment version %d.%d.%d",
+		    DB_VERSION_MAJOR, DB_VERSION_MINOR, DB_VERSION_PATCH,
+		    renv->majver, renv->minver, renv->patch);
+#ifndef DIAGNOSTIC
+		ret = EINVAL;
+		goto err;
+#endif
+	}
+
+	/* Lock the environment. */
+	MUTEX_LOCK(dbenv, &renv->mutex, dbenv->lockfhp);
+
+	/*
+	 * Finally! We own the environment now. Repeat the panic check, it's
+	 * possible that it was set while we waited for the lock.
+	 */
+	if (renv->panic) {
+		/* ret is non-zero here, so the shared exit path won't loop. */
+		ret = __db_panic_msg(dbenv);
+		goto err_unlock;
+	}
+
+	/*
+	 * Get a reference to the underlying REGION information for this
+	 * environment.
+	 */
+	if ((ret = __db_des_get(dbenv, infop, infop, &rp)) != 0 || rp == NULL) {
+		MUTEX_UNLOCK(dbenv, &renv->mutex);
+		goto find_err;
+	}
+	infop->rp = rp;
+
+	/*
+	 * There's still a possibility for inconsistent data. When we acquired
+	 * the size of the region and attached to it, it might have still been
+	 * growing as part of its creation. We can detect this by checking the
+	 * size we originally found against the region's current size. (The
+	 * region's current size has to be final, the creator finished growing
+	 * it before releasing the environment for us to lock.)
+	 */
+	if (rp->size != size) {
+err_unlock:	MUTEX_UNLOCK(dbenv, &renv->mutex);
+		goto retry;
+	}
+
+	/* Increment the reference count. */
+	++renv->refcnt;
+
+	/*
+	 * If our caller wants them, return the flags this environment was
+	 * initialized with.
+	 */
+	if (init_flagsp != NULL)
+		*init_flagsp = renv->init_flags;
+
+	/* Discard our lock. */
+	MUTEX_UNLOCK(dbenv, &renv->mutex);
+
+	/*
+	 * Fault the pages into memory. Note, do this AFTER releasing the
+	 * lock, because we're only reading the pages, not writing them.
+	 */
+	(void)__db_faultmem(infop->primary, rp->size, 0);
+
+	/* Everything looks good, we're done. */
+	dbenv->reginfo = infop;
+	return (0);
+
+creation:
+	/* Create the environment region. */
+	F_SET(infop, REGION_CREATE);
+
+	/*
+	 * Allocate room for 50 REGION structures plus overhead (we're going
+	 * to use this space for last-ditch allocation requests), although we
+	 * should never need anything close to that.
+	 */
+	memset(&tregion, 0, sizeof(tregion));
+	tregion.size = 50 * sizeof(REGION) + 50 * sizeof(MUTEX) + 2048;
+	tregion.segid = INVALID_REGION_SEGID;
+	if ((ret = __os_r_attach(dbenv, infop, &tregion)) != 0)
+		goto err;
+
+	/*
+	 * Fault the pages into memory. Note, do this BEFORE we initialize
+	 * anything, because we're writing the pages, not just reading them.
+	 */
+	(void)__db_faultmem(infop->addr, tregion.size, 1);
+
+	/*
+	 * The first object in the region is the REGENV structure. This is
+	 * different from the other regions, and, from everything else in
+	 * this region, where all objects are allocated from the pool, i.e.,
+	 * there aren't any fixed locations. The remaining space is made
+	 * available for later allocation.
+	 *
+	 * The allocation space must be size_t aligned, because that's what
+	 * the initialization routine is going to store there. To make sure
+	 * that happens, the REGENV structure was padded with a final size_t.
+	 * No other region needs to worry about it because all of them treat
+	 * the entire region as allocation space.
+	 *
+	 * Set the primary reference and correct the "addr" value to reference
+	 * the shalloc region. Note, this requires that we "uncorrect" it at
+	 * region detach, and that all of our offsets (R_ADDR/R_OFFSET) will be
+	 * shifted as well, but that should be fine.
+	 */
+	infop->primary = R_ADDR(infop, 0);
+	infop->addr = (u_int8_t *)infop->addr + sizeof(REGENV);
+	__db_shalloc_init(infop->addr, tregion.size - sizeof(REGENV));
+
+	/*
+	 * Initialize the rest of the REGENV structure, except for the magic
+	 * number which validates the file/environment.
+	 */
+	renv = infop->primary;
+	renv->panic = 0;
+	db_version(&renv->majver, &renv->minver, &renv->patch);
+	SH_LIST_INIT(&renv->regionq);
+	renv->refcnt = 1;
+
+	/*
+	 * Initialize init_flags to store the flags that any other environment
+	 * handle that uses DB_JOINENV to join this environment will need.
+	 */
+	renv->init_flags = (init_flagsp == NULL) ? 0 : *init_flagsp;
+
+	/*
+	 * Lock the environment.
+	 *
+	 * Check the lock call return. This is the first lock we initialize
+	 * and acquire, and we have to know if it fails. (It CAN fail, e.g.,
+	 * SunOS, when using fcntl(2) for locking and using an in-memory
+	 * filesystem as the database home. But you knew that, I'm sure -- it
+	 * probably wasn't even worth mentioning.)
+	 */
+	if ((ret =
+	    __db_mutex_init(dbenv, &renv->mutex, DB_FCNTL_OFF_GEN, 0)) != 0) {
+		__db_err(dbenv, "%s: unable to initialize environment lock: %s",
+		    infop->name, db_strerror(ret));
+		goto err;
+	}
+
+	if (!F_ISSET(&renv->mutex, MUTEX_IGNORE) &&
+	    (ret = __db_mutex_lock(dbenv, &renv->mutex, dbenv->lockfhp)) != 0) {
+		__db_err(dbenv, "%s: unable to acquire environment lock: %s",
+		    infop->name, db_strerror(ret));
+		goto err;
+	}
+
+	/*
+	 * Get the underlying REGION structure for this environment. Note,
+	 * we created the underlying OS region before we acquired the REGION
+	 * structure, which is backwards from the normal procedure. Update
+	 * the REGION structure.
+	 */
+	if ((ret = __db_des_get(dbenv, infop, infop, &rp)) != 0) {
+find_err:	__db_err(dbenv,
+		    "%s: unable to find environment", infop->name);
+		if (ret == 0)
+			ret = EINVAL;
+		goto err;
+	}
+	infop->rp = rp;
+	rp->size = tregion.size;
+	rp->segid = tregion.segid;
+
+	/*
+	 * !!!
+	 * If we create an environment where regions are public and in system
+	 * memory, we have to inform processes joining the environment how to
+	 * attach to the shared memory segment. So, we write the shared memory
+	 * identifier into the file, to be read by those other processes.
+	 *
+	 * XXX
+	 * This is really OS-layer information, but I can't see any easy way
+	 * to move it down there without passing down information that it has
+	 * no right to know, e.g., that this is the one-and-only REGENV region
+	 * and not some other random region.
+	 */
+	if (tregion.segid != INVALID_REGION_SEGID) {
+		ref.size = tregion.size;
+		ref.segid = tregion.segid;
+		if ((ret = __os_write(dbenv, dbenv->lockfhp,
+		    &ref, sizeof(ref), &nrw)) != 0 || nrw != sizeof(ref)) {
+			/*
+			 * A short write reports success (ret == 0). Turn it
+			 * into a hard error: the shared exit path below treats
+			 * ret == 0 as "temporary, sleep and retry", which is
+			 * wrong for a creator that failed to publish the
+			 * segment ID.
+			 */
+			if (ret == 0)
+				ret = EIO;
+			__db_err(dbenv,
+			    "%s: unable to write out public environment ID: %s",
+			    infop->name, db_strerror(ret));
+			goto err;
+		}
+	}
+
+	/*
+	 * If not doing thread locking, we need to save the file handle for
+	 * fcntl(2) locking. Otherwise, discard the handle, we no longer
+	 * need it, and the less contact between the buffer cache and the VM,
+	 * the better.
+	 */
+#if defined(HAVE_MUTEX_THREADS)
+	if (F_ISSET(dbenv->lockfhp, DB_FH_VALID))
+		__os_closehandle(dbenv->lockfhp);
+#endif
+
+	/* Validate the file. */
+	renv->magic = DB_REGION_MAGIC;
+
+	/* Discard our lock. */
+	MUTEX_UNLOCK(dbenv, &renv->mutex);
+
+	/* Everything looks good, we're done. */
+	dbenv->reginfo = infop;
+	return (0);
+
+	/*
+	 * Shared exit path: "err" arrives with ret != 0 (hard failure),
+	 * "retry" arrives with ret == 0 (region not ready yet).
+	 */
+err:
+retry:	/* Close any open file handle. */
+	if (F_ISSET(dbenv->lockfhp, DB_FH_VALID))
+		(void)__os_closehandle(dbenv->lockfhp);
+
+	/*
+	 * If we joined or created the region, detach from it. If we created
+	 * it, destroy it. Note, there's a path in the above code where we're
+	 * using a temporary REGION structure because we haven't yet allocated
+	 * the real one. In that case the region address (addr) will be filled
+	 * in, but the REGION pointer (rp) won't. Fix it.
+	 */
+	if (infop->addr != NULL) {
+		if (infop->rp == NULL)
+			infop->rp = &tregion;
+
+		/* Reset the addr value that we "corrected" above. */
+		infop->addr = infop->primary;
+		(void)__os_r_detach(dbenv,
+		    infop, F_ISSET(infop, REGION_CREATE));
+	}
+
+	/* Free the allocated name and/or REGINFO structure. */
+	if (infop->name != NULL)
+		__os_freestr(infop->name);
+	__os_free(infop, sizeof(REGINFO));
+
+	/* If we had a temporary error, wait awhile and try again. */
+	if (ret == 0) {
+		if (++retry_cnt > 3) {
+			__db_err(dbenv, "unable to join the environment");
+			ret = EAGAIN;
+		} else {
+			__os_sleep(dbenv, retry_cnt * 3, 0);
+			goto loop;
+		}
+	}
+
+	return (ret);
+}
+
+/*
+ * __db_e_detach --
+ *	Detach from the environment.
+ *
+ *	dbenv	Environment handle; dbenv->reginfo must reference the
+ *		REGINFO installed by __db_e_attach.
+ *	destroy	If non-zero, the environment mutex's system resources are
+ *		destroyed and the destroy request is passed to __os_r_detach.
+ *
+ *	Always returns 0. On return dbenv->reginfo is NULL and its memory
+ *	(including the region name) has been freed.
+ *
+ * PUBLIC: int __db_e_detach __P((DB_ENV *, int));
+ */
+int
+__db_e_detach(dbenv, destroy)
+	DB_ENV *dbenv;
+	int destroy;
+{
+	REGENV *renv;
+	REGINFO *infop;
+
+	infop = dbenv->reginfo;
+	renv = infop->primary;
+
+	/* Lock the environment. */
+	MUTEX_LOCK(dbenv, &renv->mutex, dbenv->lockfhp);
+
+	/* Decrement the reference count. */
+	if (renv->refcnt == 0) {
+		/*
+		 * The format asks for an unsigned long; cast the region ID
+		 * explicitly so the argument matches the conversion whatever
+		 * integer type the ID is declared with.
+		 */
+		__db_err(dbenv,
+		    "region %lu (environment): reference count went negative",
+		    (unsigned long)infop->rp->id);
+	} else
+		--renv->refcnt;
+
+	/* Release the lock. */
+	MUTEX_UNLOCK(dbenv, &renv->mutex);
+
+	/* Close the locking file handle. */
+	if (F_ISSET(dbenv->lockfhp, DB_FH_VALID))
+		(void)__os_closehandle(dbenv->lockfhp);
+
+	/* Reset the addr value that __db_e_attach "corrected". */
+	infop->addr = infop->primary;
+
+	/*
+	 * If we are destroying the environment, we need to
+	 * destroy any system resources backing the mutex.
+	 * Do that now before we free the memory in __os_r_detach.
+	 */
+	if (destroy)
+		__db_mutex_destroy(&renv->mutex);
+
+	/*
+	 * Release the region, and kill our reference.
+	 *
+	 * We set the DBENV->reginfo field to NULL here and discard its memory.
+	 * DBENV->remove calls __dbenv_remove to do the region remove, and
+	 * __dbenv_remove attaches and then detaches from the region. We don't
+	 * want to return to DBENV->remove with a non-NULL DBENV->reginfo field
+	 * because it will attempt to detach again as part of its cleanup.
+	 */
+	(void)__os_r_detach(dbenv, infop, destroy);
+
+	/* The name is a string; free it the way the other region code does. */
+	if (infop->name != NULL)
+		__os_freestr(infop->name);
+	__os_free(dbenv->reginfo, sizeof(REGINFO));
+	dbenv->reginfo = NULL;
+
+	return (0);
+}
+
+/*
+ * __db_e_remove --
+ *	Discard an environment if it's not in use.
+ *
+ *	dbenv	Environment handle.
+ *	force	If non-zero, mutex locking is turned off for the handle
+ *		(db_mutexlocks = 0) and the environment is removed even if
+ *		other references exist or it cannot be joined.
+ *
+ *	Returns 0 if the environment was removed or could not be joined
+ *	(which is taken to mean it does not exist), EBUSY if it is in use
+ *	and force was not given.
+ *
+ * PUBLIC: int __db_e_remove __P((DB_ENV *, int));
+ */
+int
+__db_e_remove(dbenv, force)
+	DB_ENV *dbenv;
+	int force;
+{
+	REGENV *renv;
+	REGINFO *infop, reginfo;
+	REGION *rp;
+	int ret;
+
+	/*
+	 * This routine has to walk a nasty line between not looking into
+	 * the environment (which may be corrupted after an app or system
+	 * crash), and removing everything that needs removing. What we
+	 * do is:
+	 *	1. Connect to the environment (so it better be OK).
+	 *	2. If the environment is in use (reference count is non-zero),
+	 *	   return EBUSY.
+	 *	3. Overwrite the magic number so that any threads of control
+	 *	   attempting to connect will backoff and retry.
+	 *	4. Walk the list of regions. Connect to each region and then
+	 *	   disconnect with the destroy flag set. This shouldn't cause
+	 *	   any problems, even if the region is corrupted, because we
+	 *	   should never be looking inside the region.
+	 *	5. Walk the list of files in the directory, unlinking any
+	 *	   files that match a region name. Unlink the environment
+	 *	   file last.
+	 *
+	 * If the force flag is set, we do not acquire any locks during this
+	 * process.
+	 */
+	if (force)
+		dbenv->db_mutexlocks = 0;
+
+	/* Join the environment. */
+	if ((ret = __db_e_attach(dbenv, NULL)) != 0) {
+		/*
+		 * If we can't join it, we assume that's because it doesn't
+		 * exist. It would be better to know why we failed, but it
+		 * probably isn't important.
+		 */
+		ret = 0;
+		if (force)
+			goto remfiles;
+		goto err;
+	}
+
+	infop = dbenv->reginfo;
+	renv = infop->primary;
+
+	/* Lock the environment. */
+	MUTEX_LOCK(dbenv, &renv->mutex, dbenv->lockfhp);
+
+	/*
+	 * Our own attach accounts for one reference, so a count of 1 means
+	 * nobody else is using the environment.
+	 */
+	if (renv->refcnt == 1 || force) {
+		/*
+		 * Set the panic flag and overwrite the magic number.
+		 *
+		 * !!!
+		 * From this point on, there's no going back, we pretty
+		 * much ignore errors, and just whack on whatever we can.
+		 */
+		renv->panic = 1;
+		renv->magic = 0;
+
+		/*
+		 * Unlock the environment. We should no longer need the lock
+		 * because we've poisoned the pool, but we can't continue to
+		 * hold it either, because other routines may want it.
+		 */
+		MUTEX_UNLOCK(dbenv, &renv->mutex);
+
+		/*
+		 * Attach to each sub-region and destroy it.
+		 *
+		 * !!!
+		 * The REGION_CREATE_OK flag is set for Windows/95 -- regions
+		 * are zero'd out when the last reference to the region goes
+		 * away, in which case the underlying OS region code requires
+		 * callers be prepared to create the region in order to join it.
+		 */
+		memset(&reginfo, 0, sizeof(reginfo));
+restart:	for (rp = SH_LIST_FIRST(&renv->regionq, __db_region);
+		    rp != NULL; rp = SH_LIST_NEXT(rp, q, __db_region)) {
+			if (rp->type == REGION_TYPE_ENV)
+				continue;
+
+			reginfo.id = rp->id;
+			reginfo.flags = REGION_CREATE_OK;
+			if ((ret = __db_r_attach(dbenv, &reginfo, 0)) != 0) {
+				/*
+				 * One conversion, one argument: the message
+				 * previously had a stray "%s" with nothing
+				 * supplied for it.
+				 */
+				__db_err(dbenv,
+				    "region attach: %s", db_strerror(ret));
+				continue;
+			}
+			R_UNLOCK(dbenv, &reginfo);
+			if ((ret = __db_r_detach(dbenv, &reginfo, 1)) != 0) {
+				__db_err(dbenv,
+				    "region detach: %s", db_strerror(ret));
+				continue;
+			}
+			/*
+			 * If we have an error, we continue so we eventually
+			 * reach the end of the list. If we succeed, restart
+			 * the list because it was relinked when we destroyed
+			 * the entry.
+			 */
+			goto restart;
+		}
+
+		/* Destroy the environment's region. */
+		(void)__db_e_detach(dbenv, 1);
+
+		/* Discard the physical files. */
+remfiles:	(void)__db_e_remfile(dbenv);
+	} else {
+		/* Unlock the environment. */
+		MUTEX_UNLOCK(dbenv, &renv->mutex);
+
+		/* Discard the environment. */
+		(void)__db_e_detach(dbenv, 0);
+
+		ret = EBUSY;
+	}
+
+err:
+	return (ret);
+}
+
+/*
+ * __db_e_remfile --
+ *	Discard any region files in the filesystem.
+ *
+ *	Lists the directory that holds the environment file, unlinks every
+ *	entry whose name has the region-file form (the DB_REGION_FMT prefix
+ *	followed only by digits, DB_REGION_NAME_LENGTH characters long), and
+ *	unlinks the environment file itself (DB_REGION_ENV) last. Finally
+ *	tries to unlink the fixed list of pre-2.8 region file names.
+ *
+ *	Returns 0, or the error from __db_appname/__os_dirlist if the
+ *	directory could not be determined or listed. Individual unlink
+ *	failures are ignored.
+ */
+static int
+__db_e_remfile(dbenv)
+	DB_ENV *dbenv;
+{
+	static char *old_region_names[] = {
+		"__db_lock.share",
+		"__db_log.share",
+		"__db_mpool.share",
+		"__db_txn.share",
+		NULL,
+	};
+	int cnt, fcnt, lastrm, ret;
+	u_int8_t saved_byte;
+	const char *dir;
+	char *p, **names, *path, buf[sizeof(DB_REGION_FMT) + 20];
+
+	/* Get the full path of a file in the environment. */
+	(void)snprintf(buf, sizeof(buf), "%s", DB_REGION_ENV);
+	if ((ret =
+	    __db_appname(dbenv, DB_APP_NONE, NULL, buf, 0, NULL, &path)) != 0)
+		return (ret);
+
+	/*
+	 * Get the parent directory for the environment. When the path has
+	 * a separator we temporarily truncate it there, so "dir" aliases
+	 * "path" until the byte is restored below.
+	 */
+	if ((p = __db_rpath(path)) == NULL) {
+		p = path;
+		saved_byte = *p;
+
+		dir = PATH_DOT;
+	} else {
+		saved_byte = *p;
+		*p = '\0';
+
+		dir = path;
+	}
+
+	/*
+	 * Get the list of file names. Report a failure now, while "dir"
+	 * is still a valid, truncated string: once the path is restored
+	 * and freed below, "dir" may point into freed memory.
+	 */
+	if ((ret = __os_dirlist(dbenv, dir, &names, &fcnt)) != 0)
+		__db_err(dbenv, "%s: %s", dir, db_strerror(ret));
+
+	/* Restore the path, and free it. */
+	*p = saved_byte;
+	__os_freestr(path);
+
+	if (ret != 0)
+		return (ret);
+
+	/*
+	 * Search for valid region names, and remove them. We remove the
+	 * environment region last, because it's the key to this whole mess.
+	 */
+	for (lastrm = -1, cnt = fcnt; --cnt >= 0;) {
+		if (strlen(names[cnt]) != DB_REGION_NAME_LENGTH ||
+		    memcmp(names[cnt], DB_REGION_FMT, DB_REGION_NAME_NUM) != 0)
+			continue;
+		if (strcmp(names[cnt], DB_REGION_ENV) == 0) {
+			lastrm = cnt;
+			continue;
+		}
+		/*
+		 * Everything after the prefix must be a digit. The cast to
+		 * unsigned char keeps bytes >= 0x80 from reaching isdigit as
+		 * negative values where plain char is signed.
+		 */
+		for (p = names[cnt] + DB_REGION_NAME_NUM;
+		    *p != '\0' && isdigit((unsigned char)*p); ++p)
+			;
+		if (*p != '\0')
+			continue;
+
+		if (__db_appname(dbenv,
+		    DB_APP_NONE, NULL, names[cnt], 0, NULL, &path) == 0) {
+			(void)__os_unlink(dbenv, path);
+			__os_freestr(path);
+		}
+	}
+
+	if (lastrm != -1)
+		if (__db_appname(dbenv,
+		    DB_APP_NONE, NULL, names[lastrm], 0, NULL, &path) == 0) {
+			(void)__os_unlink(dbenv, path);
+			__os_freestr(path);
+		}
+	__os_dirfree(names, fcnt);
+
+	/*
+	 * !!!
+	 * Backward compatibility -- remove region files from releases
+	 * before 2.8.XX.
+	 */
+	for (names = (char **)old_region_names; *names != NULL; ++names)
+		if (__db_appname(dbenv,
+		    DB_APP_NONE, NULL, *names, 0, NULL, &path) == 0) {
+			(void)__os_unlink(dbenv, path);
+			__os_freestr(path);
+		}
+
+	return (0);
+}
+
+/*
+ * __db_e_stat
+ *	Statistics for the environment.
+ *
+ *	Copies the REGENV structure into *arg_renv and up to
+ *	*arg_regions_cnt REGION structures from the environment's region
+ *	list into arg_regions[], all while holding the mutex of the
+ *	environment's own REGION. Always returns 0.
+ *
+ *	NOTE(review): when at least one region is copied, the count stored
+ *	back in *arg_regions_cnt is one less than the number copied --
+ *	confirm against the callers that this is what they expect.
+ *
+ * PUBLIC: int __db_e_stat __P((DB_ENV *, REGENV *, REGION *, int *));
+ */
+int
+__db_e_stat(dbenv, arg_renv, arg_regions, arg_regions_cnt)
+	DB_ENV *dbenv;
+	REGENV *arg_renv;
+	REGION *arg_regions;
+	int *arg_regions_cnt;
+{
+	REGENV *renv;
+	REGINFO *infop;
+	REGION *cur, *env_rp;
+	int copied;
+
+	infop = dbenv->reginfo;
+	renv = infop->primary;
+	env_rp = infop->rp;
+
+	/* Lock the environment. */
+	MUTEX_LOCK(dbenv, &env_rp->mutex, dbenv->lockfhp);
+
+	/* Snapshot the environment header. */
+	*arg_renv = *renv;
+
+	/* Snapshot regions until the list or the caller's array runs out. */
+	copied = 0;
+	cur = SH_LIST_FIRST(&renv->regionq, __db_region);
+	while (copied < *arg_regions_cnt && cur != NULL) {
+		arg_regions[copied] = *cur;
+		++copied;
+		cur = SH_LIST_NEXT(cur, q, __db_region);
+	}
+
+	/* Release the lock. */
+	MUTEX_UNLOCK(dbenv, &env_rp->mutex);
+
+	if (copied == 0)
+		*arg_regions_cnt = copied;
+	else
+		*arg_regions_cnt = copied - 1;
+
+	return (0);
+}
+
+/*
+ * __db_r_attach
+ *	Join/create a region.
+ *
+ *	dbenv	Environment handle; the environment must already be attached
+ *		(dbenv->reginfo is dereferenced).
+ *	infop	Caller's REGINFO. Its id/type/flags select the region; on
+ *		success rp, type, id and name are filled in.
+ *	size	Size stored in the REGION when this call creates it.
+ *
+ *	On success returns 0 and, unless the region is the environment
+ *	itself, the region's mutex is held and the environment mutex has
+ *	been released. On failure returns the error with the environment
+ *	mutex released, infop->rp cleared and infop->id invalidated.
+ *
+ * PUBLIC: int __db_r_attach __P((DB_ENV *, REGINFO *, size_t));
+ */
+int
+__db_r_attach(dbenv, infop, size)
+	DB_ENV *dbenv;
+	REGINFO *infop;
+	size_t size;
+{
+	REGENV *renv;
+	REGION *region;
+	int ret;
+	char fname[sizeof(DB_REGION_FMT) + 20];
+
+	renv = ((REGINFO *)dbenv->reginfo)->primary;
+
+	/* Start with "not created"; lower layers set it if we create. */
+	F_CLR(infop, REGION_CREATE);
+
+	/* Lock the environment. */
+	MUTEX_LOCK(dbenv, &renv->mutex, dbenv->lockfhp);
+
+	/* Find or create a REGION structure for this region. */
+	ret = __db_des_get(dbenv, dbenv->reginfo, infop, &region);
+	if (ret != 0) {
+		MUTEX_UNLOCK(dbenv, &renv->mutex);
+		return (ret);
+	}
+	infop->rp = region;
+	infop->type = region->type;
+	infop->id = region->id;
+
+	/* If we're creating the region, set the desired size. */
+	if (F_ISSET(infop, REGION_CREATE))
+		region->size = size;
+
+	/* Build the backing file name, then join/create the OS region. */
+	(void)snprintf(fname, sizeof(fname), DB_REGION_FMT, infop->id);
+	ret = __db_appname(dbenv,
+	    DB_APP_NONE, NULL, fname, 0, NULL, &infop->name);
+	if (ret == 0)
+		ret = __os_r_attach(dbenv, infop, region);
+
+	if (ret != 0) {
+		/* Discard the underlying region. */
+		if (infop->addr != NULL)
+			(void)__os_r_detach(dbenv,
+			    infop, F_ISSET(infop, REGION_CREATE));
+		infop->rp = NULL;
+		infop->id = INVALID_REGION_ID;
+
+		/* Discard the REGION structure if we created it. */
+		if (F_ISSET(infop, REGION_CREATE))
+			(void)__db_des_destroy(dbenv, region);
+
+		/* Release the environment lock. */
+		MUTEX_UNLOCK(dbenv, &renv->mutex);
+
+		return (ret);
+	}
+
+	/*
+	 * Fault the pages into memory. Note, do this BEFORE we initialize
+	 * anything because we're writing pages in created regions, not just
+	 * reading them.
+	 */
+	(void)__db_faultmem(infop->addr,
+	    region->size, F_ISSET(infop, REGION_CREATE));
+
+	/*
+	 * !!!
+	 * The underlying layer may have just decided that we are going
+	 * to create the region. There are various system issues that
+	 * can result in a useless region that requires re-initialization.
+	 *
+	 * If we created the region, initialize it for allocation.
+	 */
+	if (F_ISSET(infop, REGION_CREATE)) {
+		((REGION *)(infop->addr))->magic = DB_REGION_MAGIC;
+
+		(void)__db_shalloc_init(infop->addr, region->size);
+	}
+
+	/*
+	 * If the underlying REGION isn't the environment, acquire a lock
+	 * for it and release our lock on the environment.
+	 */
+	if (infop->type != REGION_TYPE_ENV) {
+		MUTEX_LOCK(dbenv, &region->mutex, dbenv->lockfhp);
+		MUTEX_UNLOCK(dbenv, &renv->mutex);
+	}
+
+	return (0);
+}
+
+/*
+ * __db_r_detach --
+ *	Detach from a region.
+ *
+ *	dbenv	Environment handle (must still be attached).
+ *	infop	REGINFO of the region being left; its name string is freed.
+ *	destroy	If non-zero, per-subsystem state is destroyed first, the
+ *		destroy request is passed to __os_r_detach, and the shared
+ *		REGION descriptor is discarded.
+ *
+ *	Returns the __os_r_detach result, or, if that was 0, any error from
+ *	discarding the REGION descriptor.
+ *
+ * PUBLIC: int __db_r_detach __P((DB_ENV *, REGINFO *, int));
+ */
+int
+__db_r_detach(dbenv, infop, destroy)
+	DB_ENV *dbenv;
+	REGINFO *infop;
+	int destroy;
+{
+	REGENV *renv;
+	REGION *region;
+	int des_ret, ret;
+
+	renv = ((REGINFO *)dbenv->reginfo)->primary;
+	region = infop->rp;
+
+	/* Take the environment lock first, then the region's own lock. */
+	MUTEX_LOCK(dbenv, &renv->mutex, dbenv->lockfhp);
+	MUTEX_LOCK(dbenv, &region->mutex, dbenv->lockfhp);
+
+	/*
+	 * We need to call destroy on per-subsystem info before
+	 * we free the memory associated with the region.
+	 */
+	if (destroy)
+		__db_region_destroy(dbenv, infop);
+
+	/* Detach from the underlying OS region. */
+	ret = __os_r_detach(dbenv, infop, destroy);
+
+	/* Release the REGION lock. */
+	MUTEX_UNLOCK(dbenv, &region->mutex);
+
+	/* If we destroyed the region, discard the REGION structure. */
+	if (destroy) {
+		des_ret = __db_des_destroy(dbenv, region);
+		if (des_ret != 0 && ret == 0)
+			ret = des_ret;
+	}
+
+	/* Release the environment lock. */
+	MUTEX_UNLOCK(dbenv, &renv->mutex);
+
+	/* Free the region name string. */
+	if (infop->name != NULL)
+		__os_freestr(infop->name);
+
+	return (ret);
+}
+
+/*
+ * __db_des_get --
+ *	Return a reference to the shared information for a REGION,
+ *	optionally creating a new entry.
+ *
+ *	env_infop	REGINFO of the environment region; its region list
+ *			is searched and its allocator supplies new entries.
+ *	infop		Describes the wanted region: by id when id is not
+ *			INVALID_REGION_ID, otherwise by type. REGION_CREATE
+ *			is set in it when a new entry is made.
+ *	rpp		Receives the REGION pointer (NULL on failure).
+ *
+ *	Returns 0 on success, ENOENT if nothing matched and REGION_CREATE_OK
+ *	is not set, or the error from allocation / mutex initialization.
+ */
+static int
+__db_des_get(dbenv, env_infop, infop, rpp)
+	DB_ENV *dbenv;
+	REGINFO *env_infop, *infop;
+	REGION **rpp;
+{
+	REGENV *renv;
+	REGION *found, *rp;
+	u_int32_t maxid;
+	int ret;
+
+	/*
+	 * !!!
+	 * Called with the environment already locked.
+	 */
+	*rpp = NULL;
+	renv = env_infop->primary;
+
+	/*
+	 * Track the maximum region ID so we can allocate a new region,
+	 * note that we have to start at 1 because the primary environment
+	 * uses ID == 1.
+	 */
+	maxid = REGION_ID_ENV;
+	found = NULL;
+
+	if (infop->id != INVALID_REGION_ID) {
+		/* Lookup by ID: the first entry with that ID wins. */
+		for (rp = SH_LIST_FIRST(&renv->regionq, __db_region);
+		    rp != NULL; rp = SH_LIST_NEXT(rp, q, __db_region))
+			if (rp->id == infop->id) {
+				found = rp;
+				break;
+			}
+	} else {
+		/*
+		 * Lookup by type: if joining is allowed, pick the "primary"
+		 * region of that type, i.e., the one with the lowest ID.
+		 * The whole list is walked so maxid ends up correct.
+		 */
+		for (rp = SH_LIST_FIRST(&renv->regionq, __db_region);
+		    rp != NULL; rp = SH_LIST_NEXT(rp, q, __db_region)) {
+			if (infop->type == rp->type &&
+			    F_ISSET(infop, REGION_JOIN_OK) &&
+			    (found == NULL || found->id > rp->id))
+				found = rp;
+
+			if (rp->id > maxid)
+				maxid = rp->id;
+		}
+	}
+
+	if (found != NULL) {
+		*rpp = found;
+		return (0);
+	}
+
+	/*
+	 * If we didn't find a region and we can't create the region, fail.
+	 * The caller generates any error message.
+	 */
+	if (!F_ISSET(infop, REGION_CREATE_OK))
+		return (ENOENT);
+
+	/*
+	 * Create and initialize a REGION structure for the caller. If id
+	 * was set, use that value, otherwise we use the next available ID.
+	 */
+	if ((ret = __db_shalloc(env_infop->addr,
+	    sizeof(REGION), MUTEX_ALIGN, &found)) != 0)
+		return (ret);
+
+	/* Initialize the region. */
+	memset(found, 0, sizeof(*found));
+	if ((ret = __db_mutex_init(dbenv, &found->mutex,
+	    R_OFFSET(env_infop, &found->mutex) + DB_FCNTL_OFF_GEN,
+	    0)) != 0) {
+		__db_shalloc_free(env_infop->addr, found);
+		return (ret);
+	}
+	found->segid = INVALID_REGION_SEGID;
+
+	/*
+	 * Set the type and ID; if no region ID was specified,
+	 * allocate one.
+	 */
+	found->type = infop->type;
+	if (infop->id == INVALID_REGION_ID)
+		found->id = maxid + 1;
+	else
+		found->id = infop->id;
+
+	SH_LIST_INSERT_HEAD(&renv->regionq, found, q, __db_region);
+	F_SET(infop, REGION_CREATE);
+
+	*rpp = found;
+	return (0);
+}
+
+/*
+ * __db_des_destroy --
+ *	Destroy a reference to a REGION.
+ *
+ *	Unlinks rp from the region list it is on, destroys its mutex and
+ *	returns its memory to the environment region's allocator. Always
+ *	returns 0.
+ */
+static int
+__db_des_destroy(dbenv, rp)
+	DB_ENV *dbenv;
+	REGION *rp;
+{
+	/*
+	 * !!!
+	 * Called with the environment already locked.
+	 */
+
+	/* Take the descriptor off the list before tearing it down. */
+	SH_LIST_REMOVE(rp, q, __db_region);
+	__db_mutex_destroy(&rp->mutex);
+
+	/* The descriptor lives in the environment region's shalloc space. */
+	__db_shalloc_free(((REGINFO *)dbenv->reginfo)->addr, rp);
+
+	return (0);
+}
+
+/*
+ * __db_faultmem --
+ *	Fault the region into memory.
+ *
+ *	addr	Start of the region.
+ *	size	Length of the region in bytes.
+ *	created	Non-zero if the caller created the region (pages are
+ *		written), zero if it only joined (pages are read).
+ *
+ *	Does nothing unless the db_region_init global is set. Returns the
+ *	bitwise OR of the bytes read when joining, 0 otherwise; the callers
+ *	in this file discard the value.
+ */
+static int
+__db_faultmem(addr, size, created)
+	void *addr;
+	size_t size;
+	int created;
+{
+	u_int8_t *end, *page;
+	int seen;
+
+	/*
+	 * It's sometimes significantly faster to page-fault in all of the
+	 * region's pages before we run the application, as we see nasty
+	 * side-effects when we page-fault while holding various locks, i.e.,
+	 * the lock takes a long time to acquire because of the underlying
+	 * page fault, and the other threads convoy behind the lock holder.
+	 */
+	if (!DB_GLOBAL(db_region_init))
+		return (0);
+
+	seen = 0;
+	end = (u_int8_t *)addr + size;
+
+	if (created) {
+		/*
+		 * If we created the region, we write a non-zero value so
+		 * that the system can't cheat.
+		 */
+		page = addr;
+		while (page < end) {
+			page[0] = 0xdb;
+			page += OS_VMPAGESIZE;
+		}
+	} else {
+		/*
+		 * If we're just joining the region, we can only read the
+		 * value and try to confuse the compiler sufficiently that
+		 * it doesn't figure out that we're never really using it.
+		 */
+		page = addr;
+		while (page < end) {
+			seen |= page[0];
+			page += OS_VMPAGESIZE;
+		}
+	}
+
+	return (seen);
+}
+
+/*
+ * __db_region_destroy --
+ *	Destroy per-subsystem region information.
+ *	Called with the region already locked.
+ *
+ *	Dispatches on infop->type: lock and mpool regions have their own
+ *	destroy routines, the environment, log, mutex and txn region types
+ *	need nothing, and any other type trips DB_ASSERT.
+ */
+static void
+__db_region_destroy(dbenv, infop)
+	DB_ENV *dbenv;
+	REGINFO *infop;
+{
+	switch (infop->type) {
+	case REGION_TYPE_ENV:
+	case REGION_TYPE_LOG:
+	case REGION_TYPE_MUTEX:
+	case REGION_TYPE_TXN:
+		/* Nothing subsystem-specific to tear down. */
+		return;
+	case REGION_TYPE_LOCK:
+		__lock_region_destroy(dbenv, infop);
+		return;
+	case REGION_TYPE_MPOOL:
+		__mpool_region_destroy(dbenv, infop);
+		return;
+	default:
+		/* Unknown region type. */
+		DB_ASSERT(0);
+		return;
+	}
+}