diff options
author | unknown <tim@threads.polyesthetic.msg> | 2001-03-04 19:42:05 -0500 |
---|---|---|
committer | unknown <tim@threads.polyesthetic.msg> | 2001-03-04 19:42:05 -0500 |
commit | ec6ae091617bdfdca9e65e8d3e65b950d234f676 (patch) | |
tree | 9dd732e08dba156ee3d7635caedc0dc3107ecac6 /bdb/env | |
parent | 87d70fb598105b64b538ff6b81eef9da626255b1 (diff) | |
download | mariadb-git-ec6ae091617bdfdca9e65e8d3e65b950d234f676.tar.gz |
Import changeset
Diffstat (limited to 'bdb/env')
-rw-r--r-- | bdb/env/db_salloc.c | 360 | ||||
-rw-r--r-- | bdb/env/db_shash.c | 124 | ||||
-rw-r--r-- | bdb/env/env_method.c | 461 | ||||
-rw-r--r-- | bdb/env/env_open.c | 1064 | ||||
-rw-r--r-- | bdb/env/env_recover.c | 449 | ||||
-rw-r--r-- | bdb/env/env_region.c | 1205 |
6 files changed, 3663 insertions, 0 deletions
diff --git a/bdb/env/db_salloc.c b/bdb/env/db_salloc.c new file mode 100644 index 00000000000..4780107c593 --- /dev/null +++ b/bdb/env/db_salloc.c @@ -0,0 +1,360 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: db_salloc.c,v 11.10 2000/12/06 19:55:44 ubell Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <stdlib.h> +#include <string.h> +#endif + +#include "db_int.h" + +/* + * Implement shared memory region allocation, using simple first-fit algorithm. + * The model is that we take a "chunk" of shared memory store and begin carving + * it up into areas, similarly to how malloc works. We do coalescing on free. + * + * The "len" field in the __data struct contains the length of the free region + * (less the size_t bytes that holds the length). We use the address provided + * by the caller to find this length, which allows us to free a chunk without + * requiring that the caller pass in the length of the chunk they're freeing. + */ +SH_LIST_HEAD(__head); +struct __data { + size_t len; + SH_LIST_ENTRY links; +}; + +/* + * __db_shalloc_init -- + * Initialize the area as one large chunk. + * + * PUBLIC: void __db_shalloc_init __P((void *, size_t)); + */ +void +__db_shalloc_init(area, size) + void *area; + size_t size; +{ + struct __data *elp; + struct __head *hp; + + hp = area; + SH_LIST_INIT(hp); + + elp = (struct __data *)(hp + 1); + elp->len = size - sizeof(struct __head) - sizeof(elp->len); + SH_LIST_INSERT_HEAD(hp, elp, links, __data); +} + +/* + * __db_shalloc -- + * Allocate some space from the shared region. + * + * PUBLIC: int __db_shalloc_size __P((size_t, size_t)); + */ +int +__db_shalloc_size(len, align) + size_t len, align; +{ + /* Never allocate less than the size of a struct __data. */ + if (len < sizeof(struct __data)) + len = sizeof(struct __data); + +#ifdef DIAGNOSTIC + /* Add room for a guard byte. */ + ++len; +#endif + + /* Never align to less than a db_align_t boundary. */ + if (align <= sizeof(db_align_t)) + align = sizeof(db_align_t); + + return (ALIGN(len, align) + sizeof (struct __data)); +} + +/* + * __db_shalloc -- + * Allocate some space from the shared region. + * + * PUBLIC: int __db_shalloc __P((void *, size_t, size_t, void *)); + */ +int +__db_shalloc(p, len, align, retp) + void *p, *retp; + size_t len, align; +{ + struct __data *elp; + size_t *sp; + void *rp; + + /* Never allocate less than the size of a struct __data. */ + if (len < sizeof(struct __data)) + len = sizeof(struct __data); + +#ifdef DIAGNOSTIC + /* Add room for a guard byte. */ + ++len; +#endif + + /* Never align to less than a db_align_t boundary. */ + if (align <= sizeof(db_align_t)) + align = sizeof(db_align_t); + + /* Walk the list, looking for a slot. */ + for (elp = SH_LIST_FIRST((struct __head *)p, __data); + elp != NULL; + elp = SH_LIST_NEXT(elp, links, __data)) { + /* + * Calculate the value of the returned pointer if we were to + * use this chunk. + * + Find the end of the chunk. + * + Subtract the memory the user wants. + * + Find the closest previous correctly-aligned address. + */ + rp = (u_int8_t *)elp + sizeof(size_t) + elp->len; + rp = (u_int8_t *)rp - len; + rp = (u_int8_t *)((db_alignp_t)rp & ~(align - 1)); + + /* + * Rp may now point before elp->links, in which case the chunk + * was too small, and we have to try again. + */ + if ((u_int8_t *)rp < (u_int8_t *)&elp->links) + continue; + + *(void **)retp = rp; +#ifdef DIAGNOSTIC + /* + * At this point, whether or not we still need to split up a + * chunk, retp is the address of the region we are returning, + * and (u_int8_t *)elp + sizeof(size_t) + elp->len gives us + * the address of the first byte after the end of the chunk. + * Make the byte immediately before that the guard byte. + */ + *((u_int8_t *)elp + sizeof(size_t) + elp->len - 1) = GUARD_BYTE; +#endif + +#define SHALLOC_FRAGMENT 32 + /* + * If there are at least SHALLOC_FRAGMENT additional bytes of + * memory, divide the chunk into two chunks. + */ + if ((u_int8_t *)rp >= + (u_int8_t *)&elp->links + SHALLOC_FRAGMENT) { + sp = rp; + *--sp = elp->len - + ((u_int8_t *)rp - (u_int8_t *)&elp->links); + elp->len -= *sp + sizeof(size_t); + return (0); + } + + /* + * Otherwise, we return the entire chunk, wasting some amount + * of space to keep the list compact. However, because the + * address we're returning to the user may not be the address + * of the start of the region for alignment reasons, set the + * size_t length fields back to the "real" length field to a + * flag value, so that we can find the real length during free. + */ +#define ILLEGAL_SIZE 1 + SH_LIST_REMOVE(elp, links, __data); + for (sp = rp; (u_int8_t *)--sp >= (u_int8_t *)&elp->links;) + *sp = ILLEGAL_SIZE; + return (0); + } + + return (ENOMEM); +} + +/* + * __db_shalloc_free -- + * Free a shared memory allocation. + * + * PUBLIC: void __db_shalloc_free __P((void *, void *)); + */ +void +__db_shalloc_free(regionp, ptr) + void *regionp, *ptr; +{ + struct __data *elp, *lastp, *newp; + struct __head *hp; + size_t free_size, *sp; + int merged; + + /* + * Step back over flagged length fields to find the beginning of + * the object and its real size. + */ + for (sp = (size_t *)ptr; sp[-1] == ILLEGAL_SIZE; --sp) + ; + ptr = sp; + + newp = (struct __data *)((u_int8_t *)ptr - sizeof(size_t)); + free_size = newp->len; + +#ifdef DIAGNOSTIC + /* + * The "real size" includes the guard byte; it's just the last + * byte in the chunk, and the caller never knew it existed. + * + * Check it to make sure it hasn't been stomped. + */ + if (*((u_int8_t *)ptr + free_size - 1) != GUARD_BYTE) { + /* + * Eventually, once we push a DB_ENV handle down to these + * routines, we should use the standard output channels. + */ + fprintf(stderr, + "Guard byte incorrect during shared memory free.\n"); + abort(); + /* NOTREACHED */ + } + + /* Trash the returned memory (including guard byte). */ + memset(ptr, CLEAR_BYTE, free_size); +#endif + + /* + * Walk the list, looking for where this entry goes. + * + * We keep the free list sorted by address so that coalescing is + * trivial. + * + * XXX + * Probably worth profiling this to see how expensive it is. + */ + hp = (struct __head *)regionp; + for (elp = SH_LIST_FIRST(hp, __data), lastp = NULL; + elp != NULL && (void *)elp < (void *)ptr; + lastp = elp, elp = SH_LIST_NEXT(elp, links, __data)) + ; + + /* + * Elp is either NULL (we reached the end of the list), or the slot + * after the one that's being returned. Lastp is either NULL (we're + * returning the first element of the list) or the element before the + * one being returned. + * + * Check for coalescing with the next element. + */ + merged = 0; + if ((u_int8_t *)ptr + free_size == (u_int8_t *)elp) { + newp->len += elp->len + sizeof(size_t); + SH_LIST_REMOVE(elp, links, __data); + if (lastp != NULL) + SH_LIST_INSERT_AFTER(lastp, newp, links, __data); + else + SH_LIST_INSERT_HEAD(hp, newp, links, __data); + merged = 1; + } + + /* Check for coalescing with the previous element. */ + if (lastp != NULL && (u_int8_t *)lastp + + lastp->len + sizeof(size_t) == (u_int8_t *)newp) { + lastp->len += newp->len + sizeof(size_t); + + /* + * If we have already put the new element into the list take + * it back off again because it's just been merged with the + * previous element. + */ + if (merged) + SH_LIST_REMOVE(newp, links, __data); + merged = 1; + } + + if (!merged) { + if (lastp == NULL) + SH_LIST_INSERT_HEAD(hp, newp, links, __data); + else + SH_LIST_INSERT_AFTER(lastp, newp, links, __data); + } +} + +/* + * __db_shalloc_count -- + * Return the amount of memory on the free list. + * + * PUBLIC: size_t __db_shalloc_count __P((void *)); + */ +size_t +__db_shalloc_count(addr) + void *addr; +{ + struct __data *elp; + size_t count; + + count = 0; + for (elp = SH_LIST_FIRST((struct __head *)addr, __data); + elp != NULL; + elp = SH_LIST_NEXT(elp, links, __data)) + count += elp->len; + + return (count); +} + +/* + * __db_shsizeof -- + * Return the size of a shalloc'd piece of memory. + * + * !!! + * Note that this is from an internal standpoint -- it includes not only + * the size of the memory being used, but also the extra alignment bytes + * in front and, #ifdef DIAGNOSTIC, the guard byte at the end. + * + * PUBLIC: size_t __db_shsizeof __P((void *)); + */ +size_t +__db_shsizeof(ptr) + void *ptr; +{ + struct __data *elp; + size_t *sp; + + /* + * Step back over flagged length fields to find the beginning of + * the object and its real size. + */ + for (sp = (size_t *)ptr; sp[-1] == ILLEGAL_SIZE; --sp) + ; + + elp = (struct __data *)((u_int8_t *)sp - sizeof(size_t)); + return (elp->len); +} + +/* + * __db_shalloc_dump -- + * + * PUBLIC: void __db_shalloc_dump __P((void *, FILE *)); + */ +void +__db_shalloc_dump(addr, fp) + void *addr; + FILE *fp; +{ + struct __data *elp; + + /* Make it easy to call from the debugger. */ + if (fp == NULL) + fp = stderr; + + fprintf(fp, "%s\nMemory free list\n", DB_LINE); + + for (elp = SH_LIST_FIRST((struct __head *)addr, __data); + elp != NULL; + elp = SH_LIST_NEXT(elp, links, __data)) + fprintf(fp, "%#lx: %lu\t", (u_long)elp, (u_long)elp->len); + fprintf(fp, "\n"); +} diff --git a/bdb/env/db_shash.c b/bdb/env/db_shash.c new file mode 100644 index 00000000000..1c33b383098 --- /dev/null +++ b/bdb/env/db_shash.c @@ -0,0 +1,124 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: db_shash.c,v 11.3 2000/02/14 02:59:49 bostic Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> +#endif + +#include "db_int.h" + +/* + * Table of good hash values. Up to ~250,000 buckets, we use powers of 2. + * After that, we slow the rate of increase by half. For each choice, we + * then use a nearby prime number as the hash value. + * + * If a terabyte is the maximum cache we'll see, and we assume there are + * 10 1K buckets on each hash chain, then 107374182 is the maximum number + * of buckets we'll ever need. + */ +static const struct { + u_int32_t power; + u_int32_t prime; +} list[] = { + { 64, 67}, /* 2^6 */ + { 128, 131}, /* 2^7 */ + { 256, 257}, /* 2^8 */ + { 512, 521}, /* 2^9 */ + { 1024, 1031}, /* 2^10 */ + { 2048, 2053}, /* 2^11 */ + { 4096, 4099}, /* 2^12 */ + { 8192, 8191}, /* 2^13 */ + { 16384, 16381}, /* 2^14 */ + { 32768, 32771}, /* 2^15 */ + { 65536, 65537}, /* 2^16 */ + { 131072, 131071}, /* 2^17 */ + { 262144, 262147}, /* 2^18 */ + { 393216, 393209}, /* 2^18 + 2^18/2 */ + { 524288, 524287}, /* 2^19 */ + { 786432, 786431}, /* 2^19 + 2^19/2 */ + { 1048576, 1048573}, /* 2^20 */ + { 1572864, 1572869}, /* 2^20 + 2^20/2 */ + { 2097152, 2097169}, /* 2^21 */ + { 3145728, 3145721}, /* 2^21 + 2^21/2 */ + { 4194304, 4194301}, /* 2^22 */ + { 6291456, 6291449}, /* 2^22 + 2^22/2 */ + { 8388608, 8388617}, /* 2^23 */ + { 12582912, 12582917}, /* 2^23 + 2^23/2 */ + { 16777216, 16777213}, /* 2^24 */ + { 25165824, 25165813}, /* 2^24 + 2^24/2 */ + { 33554432, 33554393}, /* 2^25 */ + { 50331648, 50331653}, /* 2^25 + 2^25/2 */ + { 67108864, 67108859}, /* 2^26 */ + { 100663296, 100663291}, /* 2^26 + 2^26/2 */ + { 134217728, 134217757}, /* 2^27 */ + { 201326592, 201326611}, /* 2^27 + 2^27/2 */ + { 268435456, 268435459}, /* 2^28 */ + { 402653184, 402653189}, /* 2^28 + 2^28/2 */ + { 536870912, 536870909}, /* 2^29 */ + { 805306368, 805306357}, /* 2^29 + 2^29/2 */ + {1073741824, 1073741827}, /* 2^30 */ + {0, 0} +}; + +/* + * __db_tablesize -- + * Choose a size for the hash table. + * + * PUBLIC: int __db_tablesize __P((u_int32_t)); + */ +int +__db_tablesize(n_buckets) + u_int32_t n_buckets; +{ + int i; + + /* + * We try to be clever about how big we make the hash tables. Use a + * prime number close to the "suggested" number of elements that will + * be in the hash table. Use 64 as the minimum hash table size. + * + * Ref: Sedgewick, Algorithms in C, "Hash Functions" + */ + if (n_buckets < 64) + n_buckets = 64; + + for (i = 0;; ++i) { + if (list[i].power == 0) { + --i; + break; + } + if (list[i].power >= n_buckets) + break; + } + return (list[i].prime); +} + +/* + * __db_hashinit -- + * Initialize a hash table that resides in shared memory. + * + * PUBLIC: void __db_hashinit __P((void *, u_int32_t)); + */ +void +__db_hashinit(begin, nelements) + void *begin; + u_int32_t nelements; +{ + u_int32_t i; + SH_TAILQ_HEAD(hash_head) *headp; + + headp = (struct hash_head *)begin; + + for (i = 0; i < nelements; i++, headp++) + SH_TAILQ_INIT(headp); +} diff --git a/bdb/env/env_method.c b/bdb/env/env_method.c new file mode 100644 index 00000000000..c5f45df7124 --- /dev/null +++ b/bdb/env/env_method.c @@ -0,0 +1,461 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1999, 2000 + * Sleepycat Software. All rights reserved. + */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: env_method.c,v 11.31 2000/11/30 00:58:35 ubell Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <string.h> +#endif + +#ifdef HAVE_RPC +#include "db_server.h" +#endif + +/* + * This is the file that initializes the global array. Do it this way because + * people keep changing one without changing the other. Having declaration and + * initialization in one file will hopefully fix that. + */ +#define DB_INITIALIZE_DB_GLOBALS 1 + +#include "db_int.h" +#include "db_shash.h" +#include "db_page.h" +#include "db_am.h" +#include "lock.h" +#include "log.h" +#include "mp.h" +#include "txn.h" + +#ifdef HAVE_RPC +#include "gen_client_ext.h" +#include "rpc_client_ext.h" +#endif + +static void __dbenv_err __P((const DB_ENV *, int, const char *, ...)); +static void __dbenv_errx __P((const DB_ENV *, const char *, ...)); +static int __dbenv_set_data_dir __P((DB_ENV *, const char *)); +static void __dbenv_set_errcall __P((DB_ENV *, void (*)(const char *, char *))); +static void __dbenv_set_errfile __P((DB_ENV *, FILE *)); +static void __dbenv_set_errpfx __P((DB_ENV *, const char *)); +static int __dbenv_set_feedback __P((DB_ENV *, void (*)(DB_ENV *, int, int))); +static int __dbenv_set_flags __P((DB_ENV *, u_int32_t, int)); +static int __dbenv_set_mutexlocks __P((DB_ENV *, int)); +static int __dbenv_set_paniccall __P((DB_ENV *, void (*)(DB_ENV *, int))); +static int __dbenv_set_recovery_init __P((DB_ENV *, int (*)(DB_ENV *))); +static int __dbenv_set_server_noclnt + __P((DB_ENV *, char *, long, long, u_int32_t)); +static int __dbenv_set_shm_key __P((DB_ENV *, long)); +static int __dbenv_set_tmp_dir __P((DB_ENV *, const char *)); +static int __dbenv_set_verbose __P((DB_ENV *, u_int32_t, int)); + +/* + * db_env_create -- + * DB_ENV constructor. + */ +int +db_env_create(dbenvpp, flags) + DB_ENV **dbenvpp; + u_int32_t flags; +{ + DB_ENV *dbenv; + int ret; + + /* + * !!! + * We can't call the flags-checking routines, we don't have an + * environment yet. + */ + if (flags != 0 && flags != DB_CLIENT) + return (EINVAL); + + if ((ret = __os_calloc(NULL, 1, sizeof(*dbenv), &dbenv)) != 0) + return (ret); + +#ifdef HAVE_RPC + if (LF_ISSET(DB_CLIENT)) + F_SET(dbenv, DB_ENV_RPCCLIENT); +#endif + ret = __dbenv_init(dbenv); + + if (ret != 0) { + __os_free(dbenv, sizeof(*dbenv)); + return (ret); + } + + *dbenvpp = dbenv; + return (0); +} + +/* + * __dbenv_init -- + * Initialize a DB_ENV structure. + * + * PUBLIC: int __dbenv_init __P((DB_ENV *)); + */ +int +__dbenv_init(dbenv) + DB_ENV *dbenv; +{ + /* + * Set up methods that are the same in both normal and RPC + */ + dbenv->err = __dbenv_err; + dbenv->errx = __dbenv_errx; + dbenv->set_errcall = __dbenv_set_errcall; + dbenv->set_errfile = __dbenv_set_errfile; + dbenv->set_errpfx = __dbenv_set_errpfx; + +#ifdef HAVE_RPC + if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) { + dbenv->close = __dbcl_env_close; + dbenv->open = __dbcl_env_open; + dbenv->remove = __dbcl_env_remove; + dbenv->set_data_dir = __dbcl_set_data_dir; + dbenv->set_feedback = __dbcl_env_set_feedback; + dbenv->set_flags = __dbcl_env_flags; + dbenv->set_mutexlocks = __dbcl_set_mutex_locks; + dbenv->set_paniccall = __dbcl_env_paniccall; + dbenv->set_recovery_init = __dbcl_set_recovery_init; + dbenv->set_server = __dbcl_envserver; + dbenv->set_shm_key = __dbcl_set_shm_key; + dbenv->set_tmp_dir = __dbcl_set_tmp_dir; + dbenv->set_verbose = __dbcl_set_verbose; + } else { +#endif + dbenv->close = __dbenv_close; + dbenv->open = __dbenv_open; + dbenv->remove = __dbenv_remove; + dbenv->set_data_dir = __dbenv_set_data_dir; + dbenv->set_feedback = __dbenv_set_feedback; + dbenv->set_flags = __dbenv_set_flags; + dbenv->set_mutexlocks = __dbenv_set_mutexlocks; + dbenv->set_paniccall = __dbenv_set_paniccall; + dbenv->set_recovery_init = __dbenv_set_recovery_init; + dbenv->set_server = __dbenv_set_server_noclnt; + dbenv->set_shm_key = __dbenv_set_shm_key; + dbenv->set_tmp_dir = __dbenv_set_tmp_dir; + dbenv->set_verbose = __dbenv_set_verbose; +#ifdef HAVE_RPC + } +#endif + dbenv->shm_key = INVALID_REGION_SEGID; + dbenv->db_mutexlocks = 1; + + __log_dbenv_create(dbenv); /* Subsystem specific. */ + __lock_dbenv_create(dbenv); + __memp_dbenv_create(dbenv); + __txn_dbenv_create(dbenv); + + return (0); +} + +/* + * __dbenv_err -- + * Error message, including the standard error string. + */ +static void +#ifdef __STDC__ +__dbenv_err(const DB_ENV *dbenv, int error, const char *fmt, ...) +#else +__dbenv_err(dbenv, error, fmt, va_alist) + const DB_ENV *dbenv; + int error; + const char *fmt; + va_dcl +#endif +{ + va_list ap; + +#ifdef __STDC__ + va_start(ap, fmt); +#else + va_start(ap); +#endif + __db_real_err(dbenv, error, 1, 1, fmt, ap); + + va_end(ap); +} + +/* + * __dbenv_errx -- + * Error message. + */ +static void +#ifdef __STDC__ +__dbenv_errx(const DB_ENV *dbenv, const char *fmt, ...) +#else +__dbenv_errx(dbenv, fmt, va_alist) + const DB_ENV *dbenv; + const char *fmt; + va_dcl +#endif +{ + va_list ap; + +#ifdef __STDC__ + va_start(ap, fmt); +#else + va_start(ap); +#endif + __db_real_err(dbenv, 0, 0, 1, fmt, ap); + + va_end(ap); +} + +static int +__dbenv_set_flags(dbenv, flags, onoff) + DB_ENV *dbenv; + u_int32_t flags; + int onoff; +{ +#define OK_FLAGS (DB_CDB_ALLDB | DB_NOMMAP | DB_TXN_NOSYNC) + + if (LF_ISSET(~OK_FLAGS)) + return (__db_ferr(dbenv, "DBENV->set_flags", 0)); + + if (LF_ISSET(DB_CDB_ALLDB)) { + ENV_ILLEGAL_AFTER_OPEN(dbenv, "set_flags: DB_CDB_ALLDB"); + if (onoff) + F_SET(dbenv, DB_ENV_CDB_ALLDB); + else + F_CLR(dbenv, DB_ENV_CDB_ALLDB); + } + if (LF_ISSET(DB_NOMMAP)) { + if (onoff) + F_SET(dbenv, DB_ENV_NOMMAP); + else + F_CLR(dbenv, DB_ENV_NOMMAP); + } + if (LF_ISSET(DB_TXN_NOSYNC)) { + if (onoff) + F_SET(dbenv, DB_ENV_TXN_NOSYNC); + else + F_CLR(dbenv, DB_ENV_TXN_NOSYNC); + } + return (0); +} + +static int +__dbenv_set_data_dir(dbenv, dir) + DB_ENV *dbenv; + const char *dir; +{ + int ret; + +#define DATA_INIT_CNT 20 /* Start with 20 data slots. */ + if (dbenv->db_data_dir == NULL) { + if ((ret = __os_calloc(dbenv, DATA_INIT_CNT, + sizeof(char **), &dbenv->db_data_dir)) != 0) + return (ret); + dbenv->data_cnt = DATA_INIT_CNT; + } else if (dbenv->data_next == dbenv->data_cnt - 1) { + dbenv->data_cnt *= 2; + if ((ret = __os_realloc(dbenv, + dbenv->data_cnt * sizeof(char **), + NULL, &dbenv->db_data_dir)) != 0) + return (ret); + } + return (__os_strdup(dbenv, + dir, &dbenv->db_data_dir[dbenv->data_next++])); +} + +static void +__dbenv_set_errcall(dbenv, errcall) + DB_ENV *dbenv; + void (*errcall) __P((const char *, char *)); +{ + dbenv->db_errcall = errcall; +} + +static void +__dbenv_set_errfile(dbenv, errfile) + DB_ENV *dbenv; + FILE *errfile; +{ + dbenv->db_errfile = errfile; +} + +static void +__dbenv_set_errpfx(dbenv, errpfx) + DB_ENV *dbenv; + const char *errpfx; +{ + dbenv->db_errpfx = errpfx; +} + +static int +__dbenv_set_feedback(dbenv, feedback) + DB_ENV *dbenv; + void (*feedback) __P((DB_ENV *, int, int)); +{ + dbenv->db_feedback = feedback; + return (0); +} + +static int +__dbenv_set_mutexlocks(dbenv, onoff) + DB_ENV *dbenv; + int onoff; +{ + dbenv->db_mutexlocks = onoff; + return (0); +} + +static int +__dbenv_set_paniccall(dbenv, paniccall) + DB_ENV *dbenv; + void (*paniccall) __P((DB_ENV *, int)); +{ + dbenv->db_paniccall = paniccall; + return (0); +} + +static int +__dbenv_set_recovery_init(dbenv, recovery_init) + DB_ENV *dbenv; + int (*recovery_init) __P((DB_ENV *)); +{ + ENV_ILLEGAL_AFTER_OPEN(dbenv, "set_recovery_init"); + + dbenv->db_recovery_init = recovery_init; + + return (0); +} + +static int +__dbenv_set_shm_key(dbenv, shm_key) + DB_ENV *dbenv; + long shm_key; /* !!!: really a key_t. */ +{ + ENV_ILLEGAL_AFTER_OPEN(dbenv, "set_shm_key"); + + dbenv->shm_key = shm_key; + return (0); +} + +static int +__dbenv_set_tmp_dir(dbenv, dir) + DB_ENV *dbenv; + const char *dir; +{ + if (dbenv->db_tmp_dir != NULL) + __os_freestr(dbenv->db_tmp_dir); + return (__os_strdup(dbenv, dir, &dbenv->db_tmp_dir)); +} + +static int +__dbenv_set_verbose(dbenv, which, onoff) + DB_ENV *dbenv; + u_int32_t which; + int onoff; +{ + switch (which) { + case DB_VERB_CHKPOINT: + case DB_VERB_DEADLOCK: + case DB_VERB_RECOVERY: + case DB_VERB_WAITSFOR: + if (onoff) + FLD_SET(dbenv->verbose, which); + else + FLD_CLR(dbenv->verbose, which); + break; + default: + return (EINVAL); + } + return (0); +} + +/* + * __db_mi_env -- + * Method illegally called with public environment. + * + * PUBLIC: int __db_mi_env __P((DB_ENV *, const char *)); + */ +int +__db_mi_env(dbenv, name) + DB_ENV *dbenv; + const char *name; +{ + __db_err(dbenv, "%s: method meaningless in shared environment", name); + return (EINVAL); +} + +/* + * __db_mi_open -- + * Method illegally called after open. + * + * PUBLIC: int __db_mi_open __P((DB_ENV *, const char *, int)); + */ +int +__db_mi_open(dbenv, name, after) + DB_ENV *dbenv; + const char *name; + int after; +{ + __db_err(dbenv, + "%s: method meaningless %s open", name, after ? "after" : "before"); + return (EINVAL); +} + +/* + * __db_env_config -- + * Method or function called without subsystem being configured. + * + * PUBLIC: int __db_env_config __P((DB_ENV *, int)); + */ +int +__db_env_config(dbenv, subsystem) + DB_ENV *dbenv; + int subsystem; +{ + const char *name; + + switch (subsystem) { + case DB_INIT_LOCK: + name = "lock"; + break; + case DB_INIT_LOG: + name = "log"; + break; + case DB_INIT_MPOOL: + name = "mpool"; + break; + case DB_INIT_TXN: + name = "txn"; + break; + default: + name = "unknown"; + break; + } + __db_err(dbenv, + "%s interface called with environment not configured for that subsystem", + name); + return (EINVAL); +} + +static int +__dbenv_set_server_noclnt(dbenv, host, tsec, ssec, flags) + DB_ENV *dbenv; + char *host; + long tsec, ssec; + u_int32_t flags; +{ + COMPQUIET(host, NULL); + COMPQUIET(tsec, 0); + COMPQUIET(ssec, 0); + COMPQUIET(flags, 0); + + __db_err(dbenv, "set_server method meaningless in non-RPC enviroment"); + return (__db_eopnotsup(dbenv)); +} diff --git a/bdb/env/env_open.c b/bdb/env/env_open.c new file mode 100644 index 00000000000..2007b4266c0 --- /dev/null +++ b/bdb/env/env_open.c @@ -0,0 +1,1064 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: env_open.c,v 11.34 2000/12/21 19:20:00 bostic Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <ctype.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_shash.h" +#include "btree.h" +#include "hash.h" +#include "qam.h" +#include "lock.h" +#include "log.h" +#include "mp.h" +#include "txn.h" +#include "clib_ext.h" + +static int __dbenv_config __P((DB_ENV *, const char *, u_int32_t)); +static int __dbenv_refresh __P((DB_ENV *)); +static int __db_home __P((DB_ENV *, const char *, u_int32_t)); +static int __db_parse __P((DB_ENV *, char *)); +static int __db_tmp_open __P((DB_ENV *, u_int32_t, char *, DB_FH *)); + +/* + * db_version -- + * Return version information. + */ +char * +db_version(majverp, minverp, patchp) + int *majverp, *minverp, *patchp; +{ + if (majverp != NULL) + *majverp = DB_VERSION_MAJOR; + if (minverp != NULL) + *minverp = DB_VERSION_MINOR; + if (patchp != NULL) + *patchp = DB_VERSION_PATCH; + return ((char *)DB_VERSION_STRING); +} + +/* + * __dbenv_open -- + * Initialize an environment. + * + * PUBLIC: int __dbenv_open __P((DB_ENV *, const char *, u_int32_t, int)); + */ +int +__dbenv_open(dbenv, db_home, flags, mode) + DB_ENV *dbenv; + const char *db_home; + u_int32_t flags; + int mode; +{ + DB_ENV *rm_dbenv; + int ret; + u_int32_t init_flags; + +#undef OKFLAGS +#define OKFLAGS \ + DB_CREATE | DB_INIT_CDB | DB_INIT_LOCK | DB_INIT_LOG | \ + DB_INIT_MPOOL | DB_INIT_TXN | DB_JOINENV | DB_LOCKDOWN | \ + DB_PRIVATE | DB_RECOVER | DB_RECOVER_FATAL | DB_SYSTEM_MEM | \ + DB_THREAD | DB_USE_ENVIRON | DB_USE_ENVIRON_ROOT +#undef OKFLAGS_CDB +#define OKFLAGS_CDB \ + DB_CREATE | DB_INIT_CDB | DB_INIT_MPOOL | DB_LOCKDOWN | \ + DB_PRIVATE | DB_SYSTEM_MEM | DB_THREAD | \ + DB_USE_ENVIRON | DB_USE_ENVIRON_ROOT + + /* + * Flags saved in the init_flags field of the environment, representing + * flags to DBENV->set_flags and DBENV->open that need to be set. + */ +#define DB_INITENV_CDB 0x0001 /* DB_INIT_CDB */ +#define DB_INITENV_CDB_ALLDB 0x0002 /* DB_INIT_CDB_ALLDB */ +#define DB_INITENV_LOCK 0x0004 /* DB_INIT_LOCK */ +#define DB_INITENV_LOG 0x0008 /* DB_INIT_LOG */ +#define DB_INITENV_MPOOL 0x0010 /* DB_INIT_MPOOL */ +#define DB_INITENV_TXN 0x0020 /* DB_INIT_TXN */ + + if ((ret = __db_fchk(dbenv, "DBENV->open", flags, OKFLAGS)) != 0) + return (ret); + if (LF_ISSET(DB_INIT_CDB) && + (ret = __db_fchk(dbenv, "DBENV->open", flags, OKFLAGS_CDB)) != 0) + return (ret); + if ((ret = __db_fcchk(dbenv, + "DBENV->open", flags, DB_PRIVATE, DB_SYSTEM_MEM)) != 0) + return (ret); + if ((ret = __db_fcchk(dbenv, "DBENV->open", flags, DB_JOINENV, + DB_CREATE | DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | + DB_INIT_TXN | DB_PRIVATE)) != 0) + return (ret); + + /* + * If we're doing recovery, destroy the environment so that we create + * all the regions from scratch. I'd like to reuse already created + * regions, but that's hard. We would have to create the environment + * region from scratch, at least, as we have no way of knowing if its + * linked lists are corrupted. + * + * I suppose we could set flags while modifying those links, but that + * is going to be difficult to get right. The major concern I have + * is if the application stomps the environment with a rogue pointer. + * We have no way of detecting that, and we could be forced into a + * situation where we start up and then crash, repeatedly. + * + * Note that we do not check any flags like DB_PRIVATE before calling + * remove. We don't care if the current environment was private or + * not, we just want to nail any files that are left-over for whatever + * reason, from whatever session. + */ + if (LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL)) { + if ((ret = db_env_create(&rm_dbenv, 0)) != 0) + return (ret); + if ((ret = dbenv->remove(rm_dbenv, db_home, DB_FORCE)) != 0) + return (ret); + } + + /* Initialize the DB_ENV structure. */ + if ((ret = __dbenv_config(dbenv, db_home, flags)) != 0) + goto err; + + /* Convert the DBENV->open flags to internal flags. */ + if (LF_ISSET(DB_CREATE)) + F_SET(dbenv, DB_ENV_CREATE); + if (LF_ISSET(DB_LOCKDOWN)) + F_SET(dbenv, DB_ENV_LOCKDOWN); + if (LF_ISSET(DB_PRIVATE)) + F_SET(dbenv, DB_ENV_PRIVATE); + if (LF_ISSET(DB_SYSTEM_MEM)) + F_SET(dbenv, DB_ENV_SYSTEM_MEM); + if (LF_ISSET(DB_THREAD)) + F_SET(dbenv, DB_ENV_THREAD); + + /* Default permissions are read-write for both owner and group. */ + dbenv->db_mode = mode == 0 ? __db_omode("rwrw--") : mode; + + /* + * Create/join the environment. We pass in the flags that + * will be of interest to an environment joining later; if + * we're not the ones to do the create, we + * pull out whatever has been stored, if we don't do a create. + */ + init_flags = 0; + init_flags |= (LF_ISSET(DB_INIT_CDB) ? DB_INITENV_CDB : 0); + init_flags |= (LF_ISSET(DB_INIT_LOCK) ? DB_INITENV_LOCK : 0); + init_flags |= (LF_ISSET(DB_INIT_LOG) ? DB_INITENV_LOG : 0); + init_flags |= (LF_ISSET(DB_INIT_MPOOL) ? DB_INITENV_MPOOL : 0); + init_flags |= (LF_ISSET(DB_INIT_TXN) ? DB_INITENV_TXN : 0); + init_flags |= + (F_ISSET(dbenv, DB_ENV_CDB_ALLDB) ? DB_INITENV_CDB_ALLDB : 0); + + if ((ret = __db_e_attach(dbenv, &init_flags)) != 0) + goto err; + + /* + * __db_e_attach will return the saved init_flags field, which + * contains the DB_INIT_* flags used when we were created. + */ + if (LF_ISSET(DB_JOINENV)) { + LF_CLR(DB_JOINENV); + + LF_SET((init_flags & DB_INITENV_CDB) ? DB_INIT_CDB : 0); + LF_SET((init_flags & DB_INITENV_LOCK) ? DB_INIT_LOCK : 0); + LF_SET((init_flags & DB_INITENV_LOG) ? DB_INIT_LOG : 0); + LF_SET((init_flags & DB_INITENV_MPOOL) ? DB_INIT_MPOOL : 0); + LF_SET((init_flags & DB_INITENV_TXN) ? DB_INIT_TXN : 0); + + if (LF_ISSET(DB_INITENV_CDB_ALLDB) && + (ret = dbenv->set_flags(dbenv, DB_CDB_ALLDB, 1)) != 0) + goto err; + } + + /* Initialize for CDB product. */ + if (LF_ISSET(DB_INIT_CDB)) { + LF_SET(DB_INIT_LOCK); + F_SET(dbenv, DB_ENV_CDB); + } + + /* Initialize the DB list, and its mutex if appropriate. */ + LIST_INIT(&dbenv->dblist); + if (F_ISSET(dbenv, DB_ENV_THREAD)) { + if ((ret = __db_mutex_alloc(dbenv, + dbenv->reginfo, (MUTEX **)&dbenv->dblist_mutexp)) != 0) + return (ret); + if ((ret = __db_mutex_init(dbenv, + dbenv->dblist_mutexp, 0, MUTEX_THREAD)) != 0) { + __db_mutex_free(dbenv, dbenv->reginfo, + dbenv->dblist_mutexp); + return (ret); + } + } + + /* + * Initialize the subsystems. Transactions imply logging but do not + * imply locking. While almost all applications want both locking + * and logging, it would not be unreasonable for a single threaded + * process to want transactions for atomicity guarantees, but not + * necessarily need concurrency. + */ + if (LF_ISSET(DB_INIT_MPOOL)) + if ((ret = __memp_open(dbenv)) != 0) + goto err; + if (LF_ISSET(DB_INIT_LOG | DB_INIT_TXN)) + if ((ret = __log_open(dbenv)) != 0) + goto err; + if (LF_ISSET(DB_INIT_LOCK)) + if ((ret = __lock_open(dbenv)) != 0) + goto err; + if (LF_ISSET(DB_INIT_TXN)) { + if ((ret = __txn_open(dbenv)) != 0) + goto err; + + /* + * If the application is running with transactions, initialize + * the function tables. + */ + if ((ret = __bam_init_recover(dbenv)) != 0) + goto err; + if ((ret = __crdel_init_recover(dbenv)) != 0) + goto err; + if ((ret = __db_init_recover(dbenv)) != 0) + goto err; + if ((ret = __ham_init_recover(dbenv)) != 0) + goto err; + if ((ret = __log_init_recover(dbenv)) != 0) + goto err; + if ((ret = __qam_init_recover(dbenv)) != 0) + goto err; + if ((ret = __txn_init_recover(dbenv)) != 0) + goto err; + + /* + * If the application specified their own recovery + * initialization function, call it. + */ + if (dbenv->db_recovery_init != NULL && + (ret = dbenv->db_recovery_init(dbenv)) != 0) + goto err; + + /* Perform recovery for any previous run. */ + if (LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL) && + (ret = __db_apprec(dbenv, + LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL))) != 0) + goto err; + } + return (0); + +err: (void)__dbenv_refresh(dbenv); + return (ret); +} + +/* + * __dbenv_remove -- + * Discard an environment. + * + * PUBLIC: int __dbenv_remove __P((DB_ENV *, const char *, u_int32_t)); + */ +int +__dbenv_remove(dbenv, db_home, flags) + DB_ENV *dbenv; + const char *db_home; + u_int32_t flags; +{ + int ret, t_ret; + +#undef OKFLAGS +#define OKFLAGS \ + DB_FORCE | DB_USE_ENVIRON | DB_USE_ENVIRON_ROOT + + /* Validate arguments. */ + if ((ret = __db_fchk(dbenv, "DBENV->remove", flags, OKFLAGS)) != 0) + goto err; + + /* + * A hard-to-debug error is calling DBENV->remove after open. That's + * not legal. You have to close the original, already opened handle + * and then allocate a new DBENV handle to use for DBENV->remove. + */ + if (F_ISSET(dbenv, DB_ENV_OPEN_CALLED)) { + __db_err(dbenv, + "DBENV handle opened, not usable for remove method."); + return (EINVAL); + } + + /* Initialize the DB_ENV structure. */ + if ((ret = __dbenv_config(dbenv, db_home, flags)) != 0) + goto err; + + /* Remove the environment. */ + ret = __db_e_remove(dbenv, LF_ISSET(DB_FORCE) ? 1 : 0); + + /* Discard any resources we've acquired. */ +err: if ((t_ret = __dbenv_refresh(dbenv)) != 0 && ret == 0) + ret = t_ret; + + memset(dbenv, CLEAR_BYTE, sizeof(DB_ENV)); + __os_free(dbenv, sizeof(DB_ENV)); + + return (ret); +} + +/* + * __dbenv_config -- + * Initialize the DB_ENV structure. + */ +static int +__dbenv_config(dbenv, db_home, flags) + DB_ENV *dbenv; + const char *db_home; + u_int32_t flags; +{ + FILE *fp; + int ret; + char *lp, buf[MAXPATHLEN * 2]; + + /* Set the database home. */ + if ((ret = __db_home(dbenv, db_home, flags)) != 0) + return (ret); + + /* + * Parse the config file. + * + * !!! + * Don't use sprintf(3)/snprintf(3) -- the former is dangerous, and + * the latter isn't standard, and we're manipulating strings handed + * us by the application. + */ + if (dbenv->db_home != NULL) { +#define CONFIG_NAME "/DB_CONFIG" + if (strlen(dbenv->db_home) + + strlen(CONFIG_NAME) + 1 > sizeof(buf)) { + ret = ENAMETOOLONG; + return (ret); + } + (void)strcpy(buf, dbenv->db_home); + (void)strcat(buf, CONFIG_NAME); + if ((fp = fopen(buf, "r")) != NULL) { + while (fgets(buf, sizeof(buf), fp) != NULL) { + if ((lp = strchr(buf, '\n')) == NULL) { + __db_err(dbenv, + "%s: line too long", CONFIG_NAME); + (void)fclose(fp); + ret = EINVAL; + return (ret); + } + *lp = '\0'; + if (buf[0] == '\0' || + buf[0] == '#' || isspace((int)buf[0])) + continue; + + if ((ret = __db_parse(dbenv, buf)) != 0) { + (void)fclose(fp); + return (ret); + } + } + (void)fclose(fp); + } + } + + /* Set up the tmp directory path. */ + if (dbenv->db_tmp_dir == NULL && (ret = __os_tmpdir(dbenv, flags)) != 0) + return (ret); + + /* + * The locking file descriptor is rarely on. Set the fd to -1, not + * because it's ever tested, but to make sure we catch mistakes. + */ + if ((ret = + __os_calloc(dbenv, + 1, sizeof(*dbenv->lockfhp), &dbenv->lockfhp)) != 0) + return (ret); + dbenv->lockfhp->fd = -1; + + /* + * Flag that the DB_ENV structure has been initialized. Note, this + * must be set before calling into the subsystems as it's used during + * file naming. + */ + F_SET(dbenv, DB_ENV_OPEN_CALLED); + + return (0); +} + +/* + * __dbenv_close -- + * DB_ENV destructor. + * + * PUBLIC: int __dbenv_close __P((DB_ENV *, u_int32_t)); + */ +int +__dbenv_close(dbenv, flags) + DB_ENV *dbenv; + u_int32_t flags; +{ + int ret; + + COMPQUIET(flags, 0); + + PANIC_CHECK(dbenv); + + ret = __dbenv_refresh(dbenv); + + /* Discard the structure if we allocated it. */ + if (!F_ISSET(dbenv, DB_ENV_USER_ALLOC)) { + memset(dbenv, CLEAR_BYTE, sizeof(DB_ENV)); + __os_free(dbenv, sizeof(DB_ENV)); + } + + return (ret); +} + +/* + * __dbenv_refresh -- + * Refresh the DB_ENV structure, releasing any allocated resources. + */ +static int +__dbenv_refresh(dbenv) + DB_ENV *dbenv; +{ + int ret, t_ret; + char **p; + + ret = 0; + + /* + * Close subsystems, in the reverse order they were opened (txn + * must be first, it may want to discard locks and flush the log). + */ + if (TXN_ON(dbenv)) { + if ((t_ret = __txn_close(dbenv)) != 0 && ret == 0) + ret = t_ret; + } + + if (LOCKING_ON(dbenv)) { + if ((t_ret = __lock_close(dbenv)) != 0 && ret == 0) + ret = t_ret; + } + __lock_dbenv_close(dbenv); + + if (LOGGING_ON(dbenv)) { + if ((t_ret = __log_close(dbenv)) != 0 && ret == 0) + ret = t_ret; + } + + if (MPOOL_ON(dbenv)) { + if ((t_ret = __memp_close(dbenv)) != 0 && ret == 0) + ret = t_ret; + } + + /* Discard DB list and its mutex. */ + LIST_INIT(&dbenv->dblist); + if (dbenv->dblist_mutexp != NULL) + __db_mutex_free(dbenv, dbenv->reginfo, dbenv->dblist_mutexp); + + /* Detach from the region. */ + if (dbenv->reginfo != NULL) { + if ((t_ret = __db_e_detach(dbenv, 0)) != 0 && ret == 0) + ret = t_ret; + /* + * !!! + * Don't free dbenv->reginfo or set the reference to NULL, + * that was done by __db_e_detach(). + */ + } + + /* Clean up the structure. */ + dbenv->db_panic = 0; + + if (dbenv->db_home != NULL) { + __os_freestr(dbenv->db_home); + dbenv->db_home = NULL; + } + if (dbenv->db_log_dir != NULL) { + __os_freestr(dbenv->db_log_dir); + dbenv->db_log_dir = NULL; + } + if (dbenv->db_tmp_dir != NULL) { + __os_freestr(dbenv->db_tmp_dir); + dbenv->db_tmp_dir = NULL; + } + if (dbenv->db_data_dir != NULL) { + for (p = dbenv->db_data_dir; *p != NULL; ++p) + __os_freestr(*p); + __os_free(dbenv->db_data_dir, + dbenv->data_cnt * sizeof(char **)); + dbenv->db_data_dir = NULL; + } + dbenv->data_cnt = dbenv->data_next = 0; + + dbenv->db_mode = 0; + + if (dbenv->lockfhp != NULL) { + __os_free(dbenv->lockfhp, sizeof(*dbenv->lockfhp)); + dbenv->lockfhp = NULL; + } + + if (dbenv->dtab != NULL) { + __os_free(dbenv->dtab, + dbenv->dtab_size * sizeof(dbenv->dtab[0])); + dbenv->dtab = NULL; + dbenv->dtab_size = 0; + } + + dbenv->mp_mmapsize = 0; + dbenv->links.tqe_next = NULL; + dbenv->links.tqe_prev = NULL; + dbenv->xa_rmid = 0; + dbenv->xa_txn = 0; + + F_CLR(dbenv, ~(DB_ENV_STANDALONE | DB_ENV_USER_ALLOC)); + + return (ret); +} + +#define DB_ADDSTR(add) { \ + if ((add) != NULL) { \ + /* If leading slash, start over. */ \ + if (__os_abspath(add)) { \ + p = str; \ + slash = 0; \ + } \ + /* Append to the current string. */ \ + len = strlen(add); \ + if (slash) \ + *p++ = PATH_SEPARATOR[0]; \ + memcpy(p, add, len); \ + p += len; \ + slash = strchr(PATH_SEPARATOR, p[-1]) == NULL; \ + } \ +} + +/* + * __db_appname -- + * Given an optional DB environment, directory and file name and type + * of call, build a path based on the DBENV->open rules, and return + * it in allocated space. + * + * PUBLIC: int __db_appname __P((DB_ENV *, APPNAME, + * PUBLIC: const char *, const char *, u_int32_t, DB_FH *, char **)); + */ +int +__db_appname(dbenv, appname, dir, file, tmp_oflags, fhp, namep) + DB_ENV *dbenv; + APPNAME appname; + const char *dir, *file; + u_int32_t tmp_oflags; + DB_FH *fhp; + char **namep; +{ + DB_ENV etmp; + size_t len, str_len; + int data_entry, ret, slash, tmp_create, tmp_free; + const char *a, *b, *c; + char *p, *str; + + a = b = c = NULL; + data_entry = -1; + tmp_create = tmp_free = 0; + + /* + * We don't return a name when creating temporary files, just a + * file handle. Default to an error now. + */ + if (fhp != NULL) + F_CLR(fhp, DB_FH_VALID); + if (namep != NULL) + *namep = NULL; + + /* + * Absolute path names are never modified. If the file is an absolute + * path, we're done. If the directory is, simply append the file and + * return. + */ + if (file != NULL && __os_abspath(file)) + return (__os_strdup(dbenv, file, namep)); + if (dir != NULL && __os_abspath(dir)) { + a = dir; + goto done; + } + + /* + * DB_ENV DIR APPNAME RESULT + * ------------------------------------------- + * null null none <tmp>/file + * null set none DIR/file + * set null none DB_HOME/file + * set set none DB_HOME/DIR/file + * + * DB_ENV FILE APPNAME RESULT + * ------------------------------------------- + * null null DB_APP_DATA <tmp>/<create> + * null set DB_APP_DATA ./file + * set null DB_APP_DATA <tmp>/<create> + * set set DB_APP_DATA DB_HOME/DB_DATA_DIR/file + * + * DB_ENV DIR APPNAME RESULT + * ------------------------------------------- + * null null DB_APP_LOG <tmp>/file + * null set DB_APP_LOG DIR/file + * set null DB_APP_LOG DB_HOME/DB_LOG_DIR/file + * set set DB_APP_LOG DB_HOME/DB_LOG_DIR/DIR/file + * + * DB_ENV APPNAME RESULT + * ------------------------------------------- + * null DB_APP_TMP* <tmp>/<create> + * set DB_APP_TMP* DB_HOME/DB_TMP_DIR/<create> + */ +retry: switch (appname) { + case DB_APP_NONE: + if (dbenv == NULL || !F_ISSET(dbenv, DB_ENV_OPEN_CALLED)) { + if (dir == NULL) + goto tmp; + a = dir; + } else { + a = dbenv->db_home; + b = dir; + } + break; + case DB_APP_DATA: + if (dir != NULL) { + __db_err(dbenv, + "DB_APP_DATA: illegal directory specification"); + return (EINVAL); + } + + if (file == NULL) { + tmp_create = 1; + goto tmp; + } + if (dbenv != NULL && F_ISSET(dbenv, DB_ENV_OPEN_CALLED)) { + a = dbenv->db_home; + if (dbenv->db_data_dir != NULL && + (b = dbenv->db_data_dir[++data_entry]) == NULL) { + data_entry = -1; + b = dbenv->db_data_dir[0]; + } + } + break; + case DB_APP_LOG: + if (dbenv == NULL || !F_ISSET(dbenv, DB_ENV_OPEN_CALLED)) { + if (dir == NULL) + goto tmp; + a = dir; + } else { + a = dbenv->db_home; + b = dbenv->db_log_dir; + c = dir; + } + break; + case DB_APP_TMP: + if (dir != NULL || file != NULL) { + __db_err(dbenv, + "DB_APP_TMP: illegal directory or file specification"); + return (EINVAL); + } + + tmp_create = 1; + if (dbenv == NULL || !F_ISSET(dbenv, DB_ENV_OPEN_CALLED)) + goto tmp; + else { + a = dbenv->db_home; + b = dbenv->db_tmp_dir; + } + break; + } + + /* Reference a file from the appropriate temporary directory. */ + if (0) { +tmp: if (dbenv == NULL || !F_ISSET(dbenv, DB_ENV_OPEN_CALLED)) { + memset(&etmp, 0, sizeof(etmp)); + if ((ret = __os_tmpdir(&etmp, DB_USE_ENVIRON)) != 0) + return (ret); + tmp_free = 1; + a = etmp.db_tmp_dir; + } else + a = dbenv->db_tmp_dir; + } + +done: len = + (a == NULL ? 0 : strlen(a) + 1) + + (b == NULL ? 0 : strlen(b) + 1) + + (c == NULL ? 0 : strlen(c) + 1) + + (file == NULL ? 0 : strlen(file) + 1); + + /* + * Allocate space to hold the current path information, as well as any + * temporary space that we're going to need to create a temporary file + * name. + */ +#define DB_TRAIL "BDBXXXXXX" + str_len = len + sizeof(DB_TRAIL) + 10; + if ((ret = __os_malloc(dbenv, str_len, NULL, &str)) != 0) { + if (tmp_free) + __os_freestr(etmp.db_tmp_dir); + return (ret); + } + + slash = 0; + p = str; + DB_ADDSTR(a); + DB_ADDSTR(b); + DB_ADDSTR(file); + *p = '\0'; + + /* Discard any space allocated to find the temp directory. */ + if (tmp_free) { + __os_freestr(etmp.db_tmp_dir); + tmp_free = 0; + } + + /* + * If we're opening a data file, see if it exists. If it does, + * return it, otherwise, try and find another one to open. + */ + if (data_entry != -1 && __os_exists(str, NULL) != 0) { + __os_free(str, str_len); + a = b = c = NULL; + goto retry; + } + + /* Create the file if so requested. */ + if (tmp_create && + (ret = __db_tmp_open(dbenv, tmp_oflags, str, fhp)) != 0) { + __os_free(str, str_len); + return (ret); + } + + if (namep == NULL) + __os_free(str, str_len); + else + *namep = str; + return (0); +} + +/* + * __db_home -- + * Find the database home. + */ +static int +__db_home(dbenv, db_home, flags) + DB_ENV *dbenv; + const char *db_home; + u_int32_t flags; +{ + const char *p; + + /* + * Use db_home by default, this allows utilities to reasonably + * override the environment either explicitly or by using a -h + * option. Otherwise, use the environment if it's permitted + * and initialized. + */ + if ((p = db_home) == NULL && + (LF_ISSET(DB_USE_ENVIRON) || + (LF_ISSET(DB_USE_ENVIRON_ROOT) && __os_isroot())) && + (p = getenv("DB_HOME")) != NULL && p[0] == '\0') { + __db_err(dbenv, "illegal DB_HOME environment variable"); + return (EINVAL); + } + + return (p == NULL ? 0 : __os_strdup(dbenv, p, &dbenv->db_home)); +} + +/* + * __db_parse -- + * Parse a single NAME VALUE pair. + */ +static int +__db_parse(dbenv, s) + DB_ENV *dbenv; + char *s; +{ + u_long v1, v2, v3; + u_int32_t flags; + char *name, *p, *value, v4; + + /* + * !!! + * The value of 40 is hard-coded into format arguments to sscanf + * below, it can't be changed here without changing it there, too. + */ + char arg[40]; + + /* + * Name/value pairs are parsed as two white-space separated strings. + * Leading and trailing white-space is trimmed from the value, but + * it may contain embedded white-space. Note: we use the isspace(3) + * macro because it's more portable, but that means that you can use + * characters like form-feed to separate the strings. + */ + name = s; + for (p = name; *p != '\0' && !isspace((int)*p); ++p) + ; + if (*p == '\0' || p == name) + goto illegal; + *p = '\0'; + for (++p; isspace((int)*p); ++p) + ; + if (*p == '\0') + goto illegal; + value = p; + for (++p; *p != '\0'; ++p) + ; + for (--p; isspace((int)*p); --p) + ; + ++p; + if (p == value) { +illegal: __db_err(dbenv, "mis-formatted name-value pair: %s", s); + return (EINVAL); + } + *p = '\0'; + + if (!strcasecmp(name, "set_cachesize")) { + if (sscanf(value, "%lu %lu %lu %c", &v1, &v2, &v3, &v4) != 3) + goto badarg; + return (dbenv->set_cachesize(dbenv, v1, v2, v3)); + } + + if (!strcasecmp(name, "set_data_dir") || + !strcasecmp(name, "db_data_dir")) /* Compatibility. */ + return (dbenv->set_data_dir(dbenv, value)); + + if (!strcasecmp(name, "set_flags")) { + if (sscanf(value, "%40s %c", arg, &v4) != 1) + goto badarg; + + if (!strcasecmp(value, "db_cdb_alldb")) + return (dbenv->set_flags(dbenv, DB_CDB_ALLDB, 1)); + if (!strcasecmp(value, "db_nommap")) + return (dbenv->set_flags(dbenv, DB_NOMMAP, 1)); + if (!strcasecmp(value, "db_txn_nosync")) + return (dbenv->set_flags(dbenv, DB_TXN_NOSYNC, 1)); + goto badarg; + } + + if (!strcasecmp(name, "set_lg_bsize")) { + if (sscanf(value, "%lu %c", &v1, &v4) != 1) + goto badarg; + return (dbenv->set_lg_bsize(dbenv, v1)); + } + + if (!strcasecmp(name, "set_lg_max")) { + if (sscanf(value, "%lu %c", &v1, &v4) != 1) + goto badarg; + return (dbenv->set_lg_max(dbenv, v1)); + } + + if (!strcasecmp(name, "set_lg_dir") || + !strcasecmp(name, "db_log_dir")) /* Compatibility. */ + return (dbenv->set_lg_dir(dbenv, value)); + + if (!strcasecmp(name, "set_lk_detect")) { + if (sscanf(value, "%40s %c", arg, &v4) != 1) + goto badarg; + if (!strcasecmp(value, "db_lock_default")) + flags = DB_LOCK_DEFAULT; + else if (!strcasecmp(value, "db_lock_oldest")) + flags = DB_LOCK_OLDEST; + else if (!strcasecmp(value, "db_lock_random")) + flags = DB_LOCK_RANDOM; + else if (!strcasecmp(value, "db_lock_youngest")) + flags = DB_LOCK_YOUNGEST; + else + goto badarg; + return (dbenv->set_lk_detect(dbenv, flags)); + } + + if (!strcasecmp(name, "set_lk_max")) { + if (sscanf(value, "%lu %c", &v1, &v4) != 1) + goto badarg; + return (dbenv->set_lk_max(dbenv, v1)); + } + + if (!strcasecmp(name, "set_lk_max_locks")) { + if (sscanf(value, "%lu %c", &v1, &v4) != 1) + goto badarg; + return (dbenv->set_lk_max_locks(dbenv, v1)); + } + + if (!strcasecmp(name, "set_lk_max_lockers")) { + if (sscanf(value, "%lu %c", &v1, &v4) != 1) + goto badarg; + return (dbenv->set_lk_max_lockers(dbenv, v1)); + } + + if (!strcasecmp(name, "set_lk_max_objects")) { + if (sscanf(value, "%lu %c", &v1, &v4) != 1) + goto badarg; + return (dbenv->set_lk_max_objects(dbenv, v1)); + } + + if (!strcasecmp(name, "set_mp_mmapsize")) { + if (sscanf(value, "%lu %c", &v1, &v4) != 1) + goto badarg; + return (dbenv->set_mp_mmapsize(dbenv, v1)); + } + + if (!strcasecmp(name, "set_region_init")) { + if (sscanf(value, "%lu %c", &v1, &v4) != 1 || v1 != 1) + goto badarg; + return (db_env_set_region_init(v1)); + } + + if (!strcasecmp(name, "set_shm_key")) { + if (sscanf(value, "%lu %c", &v1, &v4) != 1) + goto badarg; + return (dbenv->set_shm_key(dbenv, (long)v1)); + } + + if (!strcasecmp(name, "set_tas_spins")) { + if (sscanf(value, "%lu %c", &v1, &v4) != 1) + goto badarg; + return (db_env_set_tas_spins(v1)); + } + + if (!strcasecmp(name, "set_tmp_dir") || + !strcasecmp(name, "db_tmp_dir")) /* Compatibility.*/ + return (dbenv->set_tmp_dir(dbenv, value)); + + if (!strcasecmp(name, "set_tx_max")) { + if (sscanf(value, "%lu %c", &v1, &v4) != 1) + goto badarg; + return (dbenv->set_tx_max(dbenv, v1)); + } + + if (!strcasecmp(name, "set_verbose")) { + if (sscanf(value, "%40s %c", arg, &v4) != 1) + goto badarg; + + if (!strcasecmp(value, "db_verb_chkpoint")) + flags = DB_VERB_CHKPOINT; + else if (!strcasecmp(value, "db_verb_deadlock")) + flags = DB_VERB_DEADLOCK; + else if (!strcasecmp(value, "db_verb_recovery")) + flags = DB_VERB_RECOVERY; + else if (!strcasecmp(value, "db_verb_waitsfor")) + flags = DB_VERB_WAITSFOR; + else + goto badarg; + return (dbenv->set_verbose(dbenv, flags, 1)); + } + + __db_err(dbenv, "unrecognized name-value pair: %s", s); + return (EINVAL); + +badarg: __db_err(dbenv, "incorrect arguments for name-value pair: %s", s); + return (EINVAL); +} + +/* + * __db_tmp_open -- + * Create a temporary file. + */ +static int +__db_tmp_open(dbenv, tmp_oflags, path, fhp) + DB_ENV *dbenv; + u_int32_t tmp_oflags; + char *path; + DB_FH *fhp; +{ + u_long pid; + int mode, isdir, ret; + const char *p; + char *trv; + + /* + * Check the target directory; if you have six X's and it doesn't + * exist, this runs for a *very* long time. + */ + if ((ret = __os_exists(path, &isdir)) != 0) { + __db_err(dbenv, "%s: %s", path, db_strerror(ret)); + return (ret); + } + if (!isdir) { + __db_err(dbenv, "%s: %s", path, db_strerror(EINVAL)); + return (EINVAL); + } + + /* Build the path. */ + for (trv = path; *trv != '\0'; ++trv) + ; + *trv = PATH_SEPARATOR[0]; + for (p = DB_TRAIL; (*++trv = *p) != '\0'; ++p) + ; + + /* + * Replace the X's with the process ID. Pid should be a pid_t, + * but we use unsigned long for portability. + */ + for (pid = getpid(); *--trv == 'X'; pid /= 10) + switch (pid % 10) { + case 0: *trv = '0'; break; + case 1: *trv = '1'; break; + case 2: *trv = '2'; break; + case 3: *trv = '3'; break; + case 4: *trv = '4'; break; + case 5: *trv = '5'; break; + case 6: *trv = '6'; break; + case 7: *trv = '7'; break; + case 8: *trv = '8'; break; + case 9: *trv = '9'; break; + } + ++trv; + + /* Set up open flags and mode. */ + mode = __db_omode("rw----"); + + /* Loop, trying to open a file. */ + for (;;) { + if ((ret = __os_open(dbenv, path, + tmp_oflags | DB_OSO_CREATE | DB_OSO_EXCL, mode, fhp)) == 0) + return (0); + + /* + * !!!: + * If we don't get an EEXIST error, then there's something + * seriously wrong. Unfortunately, if the implementation + * doesn't return EEXIST for O_CREAT and O_EXCL regardless + * of other possible errors, we've lost. + */ + if (ret != EEXIST) { + __db_err(dbenv, + "tmp_open: %s: %s", path, db_strerror(ret)); + return (ret); + } + + /* + * Tricky little algorithm for backward compatibility. + * Assumes sequential ordering of lower-case characters. + */ + for (;;) { + if (*trv == '\0') + return (EINVAL); + if (*trv == 'z') + *trv++ = 'a'; + else { + if (isdigit((int)*trv)) + *trv = 'a'; + else + ++*trv; + break; + } + } + } + /* NOTREACHED */ +} diff --git a/bdb/env/env_recover.c b/bdb/env/env_recover.c new file mode 100644 index 00000000000..bc5e4760584 --- /dev/null +++ b/bdb/env/env_recover.c @@ -0,0 +1,449 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ + +#include "db_config.h" + +#ifndef lint +static const char copyright[] = + "Copyright (c) 1996-2000\nSleepycat Software Inc. All rights reserved.\n"; +static const char revid[] = + "$Id: env_recover.c,v 11.33 2001/01/04 22:38:42 ubell Exp $"; +#endif + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#if TIME_WITH_SYS_TIME +#include <sys/time.h> +#include <time.h> +#else +#if HAVE_SYS_TIME_H +#include <sys/time.h> +#else +#include <time.h> +#endif +#endif + +#include <string.h> +#endif + +#include "db_int.h" +#include "db_page.h" +#include "db_dispatch.h" +#include "db_am.h" +#include "log.h" +#include "txn.h" + +static float __lsn_diff __P((DB_LSN *, DB_LSN *, DB_LSN *, u_int32_t, int)); +static int __log_earliest __P((DB_ENV *, int32_t *, DB_LSN *)); + +/* + * __db_apprec -- + * Perform recovery. + * + * PUBLIC: int __db_apprec __P((DB_ENV *, u_int32_t)); + */ +int +__db_apprec(dbenv, flags) + DB_ENV *dbenv; + u_int32_t flags; +{ + DBT data; + DB_LSN ckp_lsn, first_lsn, last_lsn, lowlsn, lsn, open_lsn; + DB_TXNREGION *region; + __txn_ckp_args *ckp_args; + time_t now, tlow; + float nfiles; + int32_t low; + int is_thread, progress, ret; + void *txninfo; + + COMPQUIET(nfiles, (float)0); + + /* + * Save the state of the thread flag -- we don't need it on at the + * moment because we're single-threaded until recovery is complete. + */ + is_thread = F_ISSET(dbenv, DB_ENV_THREAD) ? 1 : 0; + F_CLR(dbenv, DB_ENV_THREAD); + F_SET((DB_LOG *)dbenv->lg_handle, DBLOG_RECOVER); + + /* + * If the user is specifying recover to a particular point in time, + * verify that the logs present are sufficient to do this. + */ + ZERO_LSN(lowlsn); + if (dbenv->tx_timestamp != 0) { + if ((ret = __log_earliest(dbenv, &low, &lowlsn)) != 0) + return (ret); + if ((int32_t)dbenv->tx_timestamp < low) { + char t1[30], t2[30]; + + strcpy(t1, ctime(&dbenv->tx_timestamp)); + tlow = (time_t)low; + strcpy(t2, ctime(&tlow)); + __db_err(dbenv, + "Invalid recovery timestamp %.*s; earliest time is %.*s", + 24, t1, 24, t2); + return (EINVAL); + } + } + + /* Initialize the transaction list. */ + if ((ret = __db_txnlist_init(dbenv, &txninfo)) != 0) + return (ret); + + /* + * Recovery is done in three passes: + * Pass #0: + * We need to find the position from which we will open files + * We need to open files beginning with the last to next + * checkpoint because we might have crashed after writing the + * last checkpoint record, but before having written out all + * the open file information. + * + * Pass #1: + * Read forward through the log from the second to last checkpoint + * opening and closing files so that at the end of the log we have + * the "current" set of files open. + * + * Pass #2: + * Read backward through the log undoing any uncompleted TXNs. + * There are three cases: + * 1. If doing catastrophic recovery, we read to the beginning + * of the log + * 2. If we are doing normal reovery, then we have to roll + * back to the most recent checkpoint that occurs + * before the most recent checkpoint LSN, which is + * returned by __log_findckp(). + * 3. If we are recovering to a point in time, then we have + * to roll back to the checkpoint whose ckp_lsn is earlier + * than the specified time. __log_earliest will figure + * this out for us. + * In case 2, "uncompleted TXNs" include all those who commited + * after the user's specified timestamp. + * + * Pass #3: + * Read forward through the log from the LSN found in pass #2, + * redoing any committed TXNs (which commited after any user- + * specified rollback point). During this pass, checkpoint + * file information is ignored, and file openings and closings + * are redone. + */ + + /* + * Find out the last lsn, so that we can estimate how far along we + * are in recovery. This will help us determine how much log there + * is between the first LSN that we're going to be working with and + * the last one. We assume that each of the three phases takes the + * same amount of time (a false assumption) and then use the %-age + * of the amount of log traversed to figure out how much of the + * pass we've accomplished. + */ + memset(&data, 0, sizeof(data)); + if (dbenv->db_feedback != NULL && + (ret = log_get(dbenv, &last_lsn, &data, DB_LAST)) != 0) + goto out; + + /* + * Pass #0 + * Find the second to last checkpoint in the log. This is the point + * from which we want to begin pass #1 (the open files pass). + */ + ckp_args = NULL; + + if (LF_ISSET(DB_RECOVER_FATAL)) { + if ((ret = log_get(dbenv, &ckp_lsn, &data, DB_FIRST)) != 0) { + if (ret == DB_NOTFOUND) + ret = 0; + else + __db_err(dbenv, "First log record not found"); + goto out; + } + open_lsn = ckp_lsn; + } else if ((ret = + log_get(dbenv, &ckp_lsn, &data, DB_CHECKPOINT)) != 0) { + /* + * If we don't find a checkpoint, start from the beginning. + * If that fails, we're done. Note, we do not require that + * there be log records if we're performing recovery. + */ +first: if ((ret = log_get(dbenv, &ckp_lsn, &data, DB_FIRST)) != 0) { + if (ret == DB_NOTFOUND) + ret = 0; + else + __db_err(dbenv, "First log record not found"); + goto out; + } + open_lsn = ckp_lsn; + } else if ((ret = __txn_ckp_read(dbenv, data.data, &ckp_args)) != 0) { + __db_err(dbenv, "Invalid checkpoint record at [%ld][%ld]\n", + (u_long)ckp_lsn.file, (u_long)ckp_lsn.offset); + goto out; + } else if (IS_ZERO_LSN(ckp_args->last_ckp) || + (ret = log_get(dbenv, &ckp_args->last_ckp, &data, DB_SET)) != 0) + goto first; + else + open_lsn = ckp_args->last_ckp; + + if (dbenv->db_feedback != NULL) { + if (last_lsn.file == open_lsn.file) + nfiles = (float)(last_lsn.offset - open_lsn.offset) / + dbenv->lg_max; + else + nfiles = (float)(last_lsn.file - open_lsn.file) + + (float)(dbenv->lg_max - open_lsn.offset + + last_lsn.offset) / dbenv->lg_max; + /* We are going to divide by nfiles; make sure it isn't 0. */ + if (nfiles == 0) + nfiles = (float)0.001; + } + + /* + * Pass #1 + * Now, ckp_lsn is either the lsn of the last checkpoint + * or the lsn of the first record in the log. Open_lsn is + * the second to last checkpoint or the beinning of the log; + * begin the open files pass from that lsn, and proceed to + * the end of the log. + */ + lsn = open_lsn; + for (;;) { + if (dbenv->db_feedback != NULL) { + progress = (int)(33 * (__lsn_diff(&open_lsn, + &last_lsn, &lsn, dbenv->lg_max, 1) / nfiles)); + dbenv->db_feedback(dbenv, DB_RECOVER, progress); + } + ret = __db_dispatch(dbenv, + &data, &lsn, DB_TXN_OPENFILES, txninfo); + if (ret != 0 && ret != DB_TXN_CKP) + goto msgerr; + if ((ret = log_get(dbenv, &lsn, &data, DB_NEXT)) != 0) { + if (ret == DB_NOTFOUND) + break; + goto out; + } + } + + /* + * Pass #2. + * + * Before we can begin pass #2, backward roll phase, we determine how + * far back in the log to recover. If we are doing catastrophic + * recovery, then we go as far back as we have files. If we are + * doing normal recovery, we go as back to the most recent checkpoint + * that occurs before the most recent checkpoint LSN. If we are + * recovering to a point in time, then rollback to the checkpoint whose + * ckp_lsn precedes the first log record (and then roll forward to + * the appropriate timestamp in Pass #3). + */ + if (LF_ISSET(DB_RECOVER_FATAL)) { + ZERO_LSN(first_lsn); + } else if (dbenv->tx_timestamp != 0) + first_lsn = lowlsn; + else + if ((ret = __log_findckp(dbenv, &first_lsn)) == DB_NOTFOUND) { + /* + * We don't require that log files exist if recovery + * was specified. + */ + ret = 0; + goto out; + } + + if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY)) + __db_err(dbenv, "Recovery starting from [%lu][%lu]", + (u_long)first_lsn.file, (u_long)first_lsn.offset); + + for (ret = log_get(dbenv, &lsn, &data, DB_LAST); + ret == 0 && log_compare(&lsn, &first_lsn) > 0; + ret = log_get(dbenv, &lsn, &data, DB_PREV)) { + if (dbenv->db_feedback != NULL) { + progress = 34 + (int)(33 * (__lsn_diff(&open_lsn, + &last_lsn, &lsn, dbenv->lg_max, 0) / nfiles)); + dbenv->db_feedback(dbenv, DB_RECOVER, progress); + } + ret = __db_dispatch(dbenv, + &data, &lsn, DB_TXN_BACKWARD_ROLL, txninfo); + if (ret != 0) { + if (ret != DB_TXN_CKP) + goto msgerr; + else + ret = 0; + } + } + if (ret != 0 && ret != DB_NOTFOUND) + goto out; + + /* + * Pass #3. + */ + for (ret = log_get(dbenv, &lsn, &data, DB_NEXT); + ret == 0; ret = log_get(dbenv, &lsn, &data, DB_NEXT)) { + if (dbenv->db_feedback != NULL) { + progress = 67 + (int)(33 * (__lsn_diff(&open_lsn, + &last_lsn, &lsn, dbenv->lg_max, 1) / nfiles)); + dbenv->db_feedback(dbenv, DB_RECOVER, progress); + } + ret = __db_dispatch(dbenv, + &data, &lsn, DB_TXN_FORWARD_ROLL, txninfo); + if (ret != 0) { + if (ret != DB_TXN_CKP) + goto msgerr; + else + ret = 0; + } + } + if (ret != DB_NOTFOUND) + goto out; + + /* + * Process any pages that were on the limbo list + * and move them to the free list. Do this + * before checkpointing the database. + */ + if ((ret = __db_do_the_limbo(dbenv, txninfo)) != 0) + goto out; + + /* + * Now set the last checkpoint lsn and the current time, + * take a checkpoint, and reset the txnid. + */ + (void)time(&now); + region = ((DB_TXNMGR *)dbenv->tx_handle)->reginfo.primary; + region->last_txnid = ((DB_TXNHEAD *)txninfo)->maxid; + region->last_ckp = ckp_lsn; + region->time_ckp = (u_int32_t)now; + + /* + * Take two checkpoints so that we don't re-recover any of the + * work we've already done. + */ + if ((ret = txn_checkpoint(dbenv, 0, 0, DB_FORCE)) != 0) + goto out; + + /* Now close all the db files that are open. */ + __log_close_files(dbenv); + + if ((ret = txn_checkpoint(dbenv, 0, 0, DB_FORCE)) != 0) + goto out; + region->last_txnid = TXN_MINIMUM; + + if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY)) { + __db_err(dbenv, "Recovery complete at %.24s", ctime(&now)); + __db_err(dbenv, "%s %lx %s [%lu][%lu]", + "Maximum transaction ID", + ((DB_TXNHEAD *)txninfo)->maxid, + "Recovery checkpoint", + (u_long)region->last_ckp.file, + (u_long)region->last_ckp.offset); + } + + if (0) { +msgerr: __db_err(dbenv, "Recovery function for LSN %lu %lu failed", + (u_long)lsn.file, (u_long)lsn.offset); + } + +out: if (is_thread) + F_SET(dbenv, DB_ENV_THREAD); + __db_txnlist_end(dbenv, txninfo); + if (ckp_args != NULL) + __os_free(ckp_args, sizeof(*ckp_args)); + F_CLR((DB_LOG *)dbenv->lg_handle, DBLOG_RECOVER); + + dbenv->tx_timestamp = 0; + return (ret); +} + +/* + * Figure out how many logfiles we have processed. If we are moving + * forward (is_forward != 0), then we're computing current - low. If + * we are moving backward, we are computing high - current. max is + * the number of bytes per logfile. + */ +static float +__lsn_diff(low, high, current, max, is_forward) + DB_LSN *low, *high, *current; + u_int32_t max; + int is_forward; +{ + float nf; + + /* + * There are three cases in each direction. If you are in the + * same file, then all you need worry about is the difference in + * offsets. If you are in different files, then either your offsets + * put you either more or less than the integral difference in the + * number of files -- we need to handle both of these. + */ + if (is_forward) { + if (current->file == low->file) + nf = (float)(current->offset - low->offset) / max; + else if (current->offset < low->offset) + nf = (float)(current->file - low->file - 1) + + (float)(max - low->offset + current->offset) / max; + else + nf = (float)(current->file - low->file) + + (float)(current->offset - low->offset) / max; + } else { + if (current->file == high->file) + nf = (float)(high->offset - current->offset) / max; + else if (current->offset > high->offset) + nf = (float)(high->file - current->file - 1) + + (float)(max - current->offset + high->offset) / max; + else + nf = (float)(high->file - current->file) + + (float)(high->offset - current->offset) / max; + } + return (nf); +} + +/* + * __log_earliest -- + * + * Return the earliest recovery point for the log files present. The + * earliest recovery time is the time stamp of the first checkpoint record + * whose checkpoint LSN is greater than the first LSN we process. + */ +static int +__log_earliest(dbenv, lowtime, lowlsn) + DB_ENV *dbenv; + int32_t *lowtime; + DB_LSN *lowlsn; +{ + DB_LSN first_lsn, lsn; + DBT data; + __txn_ckp_args *ckpargs; + u_int32_t rectype; + int cmp, ret; + + memset(&data, 0, sizeof(data)); + /* + * Read forward through the log looking for the first checkpoint + * record whose ckp_lsn is greater than first_lsn. + */ + + for (ret = log_get(dbenv, &first_lsn, &data, DB_FIRST); + ret == 0; ret = log_get(dbenv, &lsn, &data, DB_NEXT)) { + if (ret != 0) + break; + memcpy(&rectype, data.data, sizeof(rectype)); + if (rectype != DB_txn_ckp) + continue; + if ((ret = __txn_ckp_read(dbenv, data.data, &ckpargs)) == 0) { + cmp = log_compare(&ckpargs->ckp_lsn, &first_lsn); + *lowlsn = ckpargs->ckp_lsn; + *lowtime = ckpargs->timestamp; + + __os_free(ckpargs, 0); + if (cmp >= 0) + break; + } + } + + return (ret); +} diff --git a/bdb/env/env_region.c b/bdb/env/env_region.c new file mode 100644 index 00000000000..f3df4bac184 --- /dev/null +++ b/bdb/env/env_region.c @@ -0,0 +1,1205 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Sleepycat Software. All rights reserved. + */ + +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: env_region.c,v 11.28 2000/12/12 17:36:10 bostic Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <ctype.h> +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "db_shash.h" +#include "lock.h" +#include "lock_ext.h" +#include "log.h" +#include "log_ext.h" +#include "mp.h" +#include "mp_ext.h" +#include "txn.h" +#include "txn_ext.h" + +static int __db_des_destroy __P((DB_ENV *, REGION *)); +static int __db_des_get __P((DB_ENV *, REGINFO *, REGINFO *, REGION **)); +static int __db_e_remfile __P((DB_ENV *)); +static int __db_faultmem __P((void *, size_t, int)); +static void __db_region_destroy __P((DB_ENV *, REGINFO *)); + +/* + * __db_e_attach + * Join/create the environment + * + * PUBLIC: int __db_e_attach __P((DB_ENV *, u_int32_t *)); + */ +int +__db_e_attach(dbenv, init_flagsp) + DB_ENV *dbenv; + u_int32_t *init_flagsp; +{ + REGENV *renv; + REGENV_REF ref; + REGINFO *infop; + REGION *rp, tregion; + size_t size; + size_t nrw; + u_int32_t mbytes, bytes; + int retry_cnt, ret, segid; + char buf[sizeof(DB_REGION_FMT) + 20]; + +#if !defined(HAVE_MUTEX_THREADS) + /* + * !!! + * If we don't have spinlocks, we need a file descriptor for fcntl(2) + * locking. We use the file handle from the REGENV file for this + * purpose. + * + * Since we may be using shared memory regions, e.g., shmget(2), and + * not a mapped-in regular file, the backing file may be only a few + * bytes in length. So, this depends on the ability to call fcntl to + * lock file offsets much larger than the actual physical file. I + * think that's safe -- besides, very few systems actually need this + * kind of support, SunOS is the only one still in wide use of which + * I'm aware. + * + * The error case is if an application lacks spinlocks and wants to be + * threaded. That doesn't work because fcntl may lock the underlying + * process, including all its threads. + */ + if (F_ISSET(dbenv, DB_ENV_THREAD)) { + __db_err(dbenv, +"architecture lacks fast mutexes: applications cannot be threaded"); + return (EINVAL); + } +#endif + + /* Initialization */ + retry_cnt = 0; + + /* Repeated initialization. */ +loop: renv = NULL; + + /* Set up the DB_ENV's REG_INFO structure. */ + if ((ret = __os_calloc(dbenv, 1, sizeof(REGINFO), &infop)) != 0) + return (ret); + infop->type = REGION_TYPE_ENV; + infop->id = REGION_ID_ENV; + infop->mode = dbenv->db_mode; + infop->flags = REGION_JOIN_OK; + if (F_ISSET(dbenv, DB_ENV_CREATE)) + F_SET(infop, REGION_CREATE_OK); + + /* + * We have to single-thread the creation of the REGENV region. Once + * it exists, we can do locking using locks in the region, but until + * then we have to be the only player in the game. + * + * If this is a private environment, we are only called once and there + * are no possible race conditions. + * + * If this is a public environment, we use the filesystem to ensure + * the creation of the environment file is single-threaded. + */ + if (F_ISSET(dbenv, DB_ENV_PRIVATE)) + goto creation; + + /* Build the region name. */ + (void)snprintf(buf, sizeof(buf), "%s", DB_REGION_ENV); + if ((ret = __db_appname(dbenv, + DB_APP_NONE, NULL, buf, 0, NULL, &infop->name)) != 0) + goto err; + + /* + * Try to create the file, if we have the authority. We have to ensure + * that multiple threads/processes attempting to simultaneously create + * the file are properly ordered. Open using the O_CREAT and O_EXCL + * flags so that multiple attempts to create the region will return + * failure in all but one. POSIX 1003.1 requires that EEXIST be the + * errno return value -- I sure hope they're right. + */ + if (F_ISSET(dbenv, DB_ENV_CREATE)) { + if ((ret = __os_open(dbenv, + infop->name, DB_OSO_REGION | DB_OSO_CREATE | DB_OSO_EXCL, + dbenv->db_mode, dbenv->lockfhp)) == 0) + goto creation; + if (ret != EEXIST) { + __db_err(dbenv, + "%s: %s", infop->name, db_strerror(ret)); + goto err; + } + } + + /* + * If we couldn't create the file, try and open it. (If that fails, + * we're done.) + */ + if ((ret = __os_open(dbenv, infop->name, + DB_OSO_REGION, dbenv->db_mode, dbenv->lockfhp)) != 0) + goto err; + + /* + * !!! + * The region may be in system memory not backed by the filesystem + * (more specifically, not backed by this file), and we're joining + * it. In that case, the process that created it will have written + * out a REGENV_REF structure as its only contents. We read that + * structure before we do anything further, e.g., we can't just map + * that file in and then figure out what's going on. + * + * All of this noise is because some systems don't have a coherent VM + * and buffer cache, and what's worse, when you mix operations on the + * VM and buffer cache, half the time you hang the system. + * + * If the file is the size of an REGENV_REF structure, then we know + * the real region is in some other memory. (The only way you get a + * file that size is to deliberately write it, as it's smaller than + * any possible disk sector created by writing a file or mapping the + * file into memory.) In which case, retrieve the structure from the + * file and use it to acquire the referenced memory. + * + * If the structure is larger than a REGENV_REF structure, then this + * file is backing the shared memory region, and we just map it into + * memory. + * + * And yes, this makes me want to take somebody and kill them. (I + * digress -- but you have no freakin' idea. This is unbelievably + * stupid and gross, and I've probably spent six months of my life, + * now, trying to make different versions of it work.) + */ + if ((ret = __os_ioinfo(dbenv, infop->name, + dbenv->lockfhp, &mbytes, &bytes, NULL)) != 0) { + __db_err(dbenv, "%s: %s", infop->name, db_strerror(ret)); + goto err; + } + + /* + * !!! + * A size_t is OK -- regions get mapped into memory, and so can't + * be larger than a size_t. + */ + size = mbytes * MEGABYTE + bytes; + + /* + * If the size is less than the size of a REGENV_REF structure, the + * region (or, possibly, the REGENV_REF structure) has not yet been + * completely written. Wait awhile and try again. + * + * Otherwise, if the size is the size of a REGENV_REF structure, + * read it into memory and use it as a reference to the real region. + */ + if (size <= sizeof(ref)) { + if (size != sizeof(ref)) + goto retry; + + if ((ret = __os_read(dbenv, dbenv->lockfhp, &ref, + sizeof(ref), &nrw)) != 0 || nrw < (size_t)sizeof(ref)) { + if (ret == 0) + ret = EIO; + __db_err(dbenv, + "%s: unable to read system-memory information from: %s", + infop->name, db_strerror(ret)); + goto err; + } + size = ref.size; + segid = ref.segid; + + F_SET(dbenv, DB_ENV_SYSTEM_MEM); + } else if (F_ISSET(dbenv, DB_ENV_SYSTEM_MEM)) { + ret = EINVAL; + __db_err(dbenv, + "%s: existing environment not created in system memory: %s", + infop->name, db_strerror(ret)); + goto err; + } else + segid = INVALID_REGION_SEGID; + + /* + * If not doing thread locking, we need to save the file handle for + * fcntl(2) locking. Otherwise, discard the handle, we no longer + * need it, and the less contact between the buffer cache and the VM, + * the better. + */ +#ifdef HAVE_MUTEX_THREADS + __os_closehandle(dbenv->lockfhp); +#endif + + /* Call the region join routine to acquire the region. */ + memset(&tregion, 0, sizeof(tregion)); + tregion.size = size; + tregion.segid = segid; + if ((ret = __os_r_attach(dbenv, infop, &tregion)) != 0) + goto err; + + /* + * The environment's REGENV structure has to live at offset 0 instead + * of the usual shalloc information. Set the primary reference and + * correct the "addr" value to reference the shalloc region. Note, + * this means that all of our offsets (R_ADDR/R_OFFSET) get shifted + * as well, but that should be fine. + */ + infop->primary = R_ADDR(infop, 0); + infop->addr = (u_int8_t *)infop->addr + sizeof(REGENV); + + /* + * Check if the environment has had a catastrophic failure. + * + * Check the magic number to ensure the region is initialized. If the + * magic number isn't set, the lock may not have been initialized, and + * an attempt to use it could lead to random behavior. + * + * The panic and magic values aren't protected by any lock, so we never + * use them in any check that's more complex than set/not-set. + * + * !!! + * I'd rather play permissions games using the underlying file, but I + * can't because Windows/NT filesystems won't open files mode 0. + */ + renv = infop->primary; + if (renv->panic) { + ret = __db_panic_msg(dbenv); + goto err; + } + if (renv->magic != DB_REGION_MAGIC) + goto retry; + + /* Make sure the region matches our build. */ + if (renv->majver != DB_VERSION_MAJOR || + renv->minver != DB_VERSION_MINOR || + renv->patch != DB_VERSION_PATCH) { + __db_err(dbenv, + "Program version %d.%d.%d doesn't match environment version %d.%d.%d", + DB_VERSION_MAJOR, DB_VERSION_MINOR, DB_VERSION_PATCH, + renv->majver, renv->minver, renv->patch); +#ifndef DIAGNOSTIC + ret = EINVAL; + goto err; +#endif + } + + /* Lock the environment. */ + MUTEX_LOCK(dbenv, &renv->mutex, dbenv->lockfhp); + + /* + * Finally! We own the environment now. Repeat the panic check, it's + * possible that it was set while we waited for the lock. + */ + if (renv->panic) { + ret = __db_panic_msg(dbenv); + goto err_unlock; + } + + /* + * Get a reference to the underlying REGION information for this + * environment. + */ + if ((ret = __db_des_get(dbenv, infop, infop, &rp)) != 0 || rp == NULL) { + MUTEX_UNLOCK(dbenv, &renv->mutex); + goto find_err; + } + infop->rp = rp; + + /* + * There's still a possibility for inconsistent data. When we acquired + * the size of the region and attached to it, it might have still been + * growing as part of its creation. We can detect this by checking the + * size we originally found against the region's current size. (The + * region's current size has to be final, the creator finished growing + * it before releasing the environment for us to lock.) + */ + if (rp->size != size) { +err_unlock: MUTEX_UNLOCK(dbenv, &renv->mutex); + goto retry; + } + + /* Increment the reference count. */ + ++renv->refcnt; + + /* + * If our caller wants them, return the flags this environment was + * initialized with. + */ + if (init_flagsp != NULL) + *init_flagsp = renv->init_flags; + + /* Discard our lock. */ + MUTEX_UNLOCK(dbenv, &renv->mutex); + + /* + * Fault the pages into memory. Note, do this AFTER releasing the + * lock, because we're only reading the pages, not writing them. + */ + (void)__db_faultmem(infop->primary, rp->size, 0); + + /* Everything looks good, we're done. */ + dbenv->reginfo = infop; + return (0); + +creation: + /* Create the environment region. */ + F_SET(infop, REGION_CREATE); + + /* + * Allocate room for 50 REGION structures plus overhead (we're going + * to use this space for last-ditch allocation requests), although we + * should never need anything close to that. + */ + memset(&tregion, 0, sizeof(tregion)); + tregion.size = 50 * sizeof(REGION) + 50 * sizeof(MUTEX) + 2048; + tregion.segid = INVALID_REGION_SEGID; + if ((ret = __os_r_attach(dbenv, infop, &tregion)) != 0) + goto err; + + /* + * Fault the pages into memory. Note, do this BEFORE we initialize + * anything, because we're writing the pages, not just reading them. + */ + (void)__db_faultmem(infop->addr, tregion.size, 1); + + /* + * The first object in the region is the REGENV structure. This is + * different from the other regions, and, from everything else in + * this region, where all objects are allocated from the pool, i.e., + * there aren't any fixed locations. The remaining space is made + * available for later allocation. + * + * The allocation space must be size_t aligned, because that's what + * the initialization routine is going to store there. To make sure + * that happens, the REGENV structure was padded with a final size_t. + * No other region needs to worry about it because all of them treat + * the entire region as allocation space. + * + * Set the primary reference and correct the "addr" value to reference + * the shalloc region. Note, this requires that we "uncorrect" it at + * region detach, and that all of our offsets (R_ADDR/R_OFFSET) will be + * shifted as well, but that should be fine. + */ + infop->primary = R_ADDR(infop, 0); + infop->addr = (u_int8_t *)infop->addr + sizeof(REGENV); + __db_shalloc_init(infop->addr, tregion.size - sizeof(REGENV)); + + /* + * Initialize the rest of the REGENV structure, except for the magic + * number which validates the file/environment. + */ + renv = infop->primary; + renv->panic = 0; + db_version(&renv->majver, &renv->minver, &renv->patch); + SH_LIST_INIT(&renv->regionq); + renv->refcnt = 1; + + /* + * Initialize init_flags to store the flags that any other environment + * handle that uses DB_JOINENV to join this environment will need. + */ + renv->init_flags = (init_flagsp == NULL) ? 0 : *init_flagsp; + + /* + * Lock the environment. + * + * Check the lock call return. This is the first lock we initialize + * and acquire, and we have to know if it fails. (It CAN fail, e.g., + * SunOS, when using fcntl(2) for locking and using an in-memory + * filesystem as the database home. But you knew that, I'm sure -- it + * probably wasn't even worth mentioning.) + */ + if ((ret = + __db_mutex_init(dbenv, &renv->mutex, DB_FCNTL_OFF_GEN, 0)) != 0) { + __db_err(dbenv, "%s: unable to initialize environment lock: %s", + infop->name, db_strerror(ret)); + goto err; + } + + if (!F_ISSET(&renv->mutex, MUTEX_IGNORE) && + (ret = __db_mutex_lock(dbenv, &renv->mutex, dbenv->lockfhp)) != 0) { + __db_err(dbenv, "%s: unable to acquire environment lock: %s", + infop->name, db_strerror(ret)); + goto err; + } + + /* + * Get the underlying REGION structure for this environment. Note, + * we created the underlying OS region before we acquired the REGION + * structure, which is backwards from the normal procedure. Update + * the REGION structure. + */ + if ((ret = __db_des_get(dbenv, infop, infop, &rp)) != 0) { +find_err: __db_err(dbenv, + "%s: unable to find environment", infop->name); + if (ret == 0) + ret = EINVAL; + goto err; + } + infop->rp = rp; + rp->size = tregion.size; + rp->segid = tregion.segid; + + /* + * !!! + * If we create an environment where regions are public and in system + * memory, we have to inform processes joining the environment how to + * attach to the shared memory segment. So, we write the shared memory + * identifier into the file, to be read by those other processes. + * + * XXX + * This is really OS-layer information, but I can't see any easy way + * to move it down there without passing down information that it has + * no right to know, e.g., that this is the one-and-only REGENV region + * and not some other random region. + */ + if (tregion.segid != INVALID_REGION_SEGID) { + ref.size = tregion.size; + ref.segid = tregion.segid; + if ((ret = __os_write(dbenv, dbenv->lockfhp, + &ref, sizeof(ref), &nrw)) != 0 || nrw != sizeof(ref)) { + __db_err(dbenv, + "%s: unable to write out public environment ID: %s", + infop->name, db_strerror(ret)); + goto err; + } + } + + /* + * If not doing thread locking, we need to save the file handle for + * fcntl(2) locking. Otherwise, discard the handle, we no longer + * need it, and the less contact between the buffer cache and the VM, + * the better. + */ +#if defined(HAVE_MUTEX_THREADS) + if (F_ISSET(dbenv->lockfhp, DB_FH_VALID)) + __os_closehandle(dbenv->lockfhp); +#endif + + /* Validate the file. */ + renv->magic = DB_REGION_MAGIC; + + /* Discard our lock. */ + MUTEX_UNLOCK(dbenv, &renv->mutex); + + /* Everything looks good, we're done. */ + dbenv->reginfo = infop; + return (0); + +err: +retry: /* Close any open file handle. */ + if (F_ISSET(dbenv->lockfhp, DB_FH_VALID)) + (void)__os_closehandle(dbenv->lockfhp); + + /* + * If we joined or created the region, detach from it. If we created + * it, destroy it. Note, there's a path in the above code where we're + * using a temporary REGION structure because we haven't yet allocated + * the real one. In that case the region address (addr) will be filled + * in, but the REGION pointer (rp) won't. Fix it. + */ + if (infop->addr != NULL) { + if (infop->rp == NULL) + infop->rp = &tregion; + + /* Reset the addr value that we "corrected" above. */ + infop->addr = infop->primary; + (void)__os_r_detach(dbenv, + infop, F_ISSET(infop, REGION_CREATE)); + } + + /* Free the allocated name and/or REGINFO structure. */ + if (infop->name != NULL) + __os_freestr(infop->name); + __os_free(infop, sizeof(REGINFO)); + + /* If we had a temporary error, wait awhile and try again. */ + if (ret == 0) { + if (++retry_cnt > 3) { + __db_err(dbenv, "unable to join the environment"); + ret = EAGAIN; + } else { + __os_sleep(dbenv, retry_cnt * 3, 0); + goto loop; + } + } + + return (ret); +} + +/* + * __db_e_detach -- + * Detach from the environment. + * + * PUBLIC: int __db_e_detach __P((DB_ENV *, int)); + */ +int +__db_e_detach(dbenv, destroy) + DB_ENV *dbenv; + int destroy; +{ + REGENV *renv; + REGINFO *infop; + + infop = dbenv->reginfo; + renv = infop->primary; + + /* Lock the environment. */ + MUTEX_LOCK(dbenv, &renv->mutex, dbenv->lockfhp); + + /* Decrement the reference count. */ + if (renv->refcnt == 0) { + __db_err(dbenv, + "region %lu (environment): reference count went negative", + infop->rp->id); + } else + --renv->refcnt; + + /* Release the lock. */ + MUTEX_UNLOCK(dbenv, &renv->mutex); + + /* Close the locking file handle. */ + if (F_ISSET(dbenv->lockfhp, DB_FH_VALID)) + (void)__os_closehandle(dbenv->lockfhp); + + /* Reset the addr value that we "corrected" above. */ + infop->addr = infop->primary; + + /* + * If we are destroying the environment, we need to + * destroy any system resources backing the mutex. + * Do that now before we free the memory in __os_r_detach. + */ + if (destroy) + __db_mutex_destroy(&renv->mutex); + + /* + * Release the region, and kill our reference. + * + * We set the DBENV->reginfo field to NULL here and discard its memory. + * DBENV->remove calls __dbenv_remove to do the region remove, and + * __dbenv_remove attached and then detaches from the region. We don't + * want to return to DBENV->remove with a non-NULL DBENV->reginfo field + * because it will attempt to detach again as part of its cleanup. + */ + (void)__os_r_detach(dbenv, infop, destroy); + + if (infop->name != NULL) + __os_free(infop->name, 0); + __os_free(dbenv->reginfo, sizeof(REGINFO)); + dbenv->reginfo = NULL; + + return (0); +} + +/* + * __db_e_remove -- + * Discard an environment if it's not in use. + * + * PUBLIC: int __db_e_remove __P((DB_ENV *, int)); + */ +int +__db_e_remove(dbenv, force) + DB_ENV *dbenv; + int force; +{ + REGENV *renv; + REGINFO *infop, reginfo; + REGION *rp; + int ret; + + /* + * This routine has to walk a nasty line between not looking into + * the environment (which may be corrupted after an app or system + * crash), and removing everything that needs removing. What we + * do is: + * 1. Connect to the environment (so it better be OK). + * 2. If the environment is in use (reference count is non-zero), + * return EBUSY. + * 3. Overwrite the magic number so that any threads of control + * attempting to connect will backoff and retry. + * 4. Walk the list of regions. Connect to each region and then + * disconnect with the destroy flag set. This shouldn't cause + * any problems, even if the region is corrupted, because we + * should never be looking inside the region. + * 5. Walk the list of files in the directory, unlinking any + * files that match a region name. Unlink the environment + * file last. + * + * If the force flag is set, we do not acquire any locks during this + * process. + */ + if (force) + dbenv->db_mutexlocks = 0; + + /* Join the environment. */ + if ((ret = __db_e_attach(dbenv, NULL)) != 0) { + /* + * If we can't join it, we assume that's because it doesn't + * exist. It would be better to know why we failed, but it + * probably isn't important. + */ + ret = 0; + if (force) + goto remfiles; + goto err; + } + + infop = dbenv->reginfo; + renv = infop->primary; + + /* Lock the environment. */ + MUTEX_LOCK(dbenv, &renv->mutex, dbenv->lockfhp); + + /* If it's in use, we're done. */ + if (renv->refcnt == 1 || force) { + /* + * Set the panic flag and overwrite the magic number. + * + * !!! + * From this point on, there's no going back, we pretty + * much ignore errors, and just whack on whatever we can. + */ + renv->panic = 1; + renv->magic = 0; + + /* + * Unlock the environment. We should no longer need the lock + * because we've poisoned the pool, but we can't continue to + * hold it either, because other routines may want it. + */ + MUTEX_UNLOCK(dbenv, &renv->mutex); + + /* + * Attach to each sub-region and destroy it. + * + * !!! + * The REGION_CREATE_OK flag is set for Windows/95 -- regions + * are zero'd out when the last reference to the region goes + * away, in which case the underlying OS region code requires + * callers be prepared to create the region in order to join it. + */ + memset(®info, 0, sizeof(reginfo)); +restart: for (rp = SH_LIST_FIRST(&renv->regionq, __db_region); + rp != NULL; rp = SH_LIST_NEXT(rp, q, __db_region)) { + if (rp->type == REGION_TYPE_ENV) + continue; + + reginfo.id = rp->id; + reginfo.flags = REGION_CREATE_OK; + if ((ret = __db_r_attach(dbenv, ®info, 0)) != 0) { + __db_err(dbenv, + "region %s attach: %s", db_strerror(ret)); + continue; + } + R_UNLOCK(dbenv, ®info); + if ((ret = __db_r_detach(dbenv, ®info, 1)) != 0) { + __db_err(dbenv, + "region detach: %s", db_strerror(ret)); + continue; + } + /* + * If we have an error, we continue so we eventually + * reach the end of the list. If we succeed, restart + * the list because it was relinked when we destroyed + * the entry. + */ + goto restart; + } + + /* Destroy the environment's region. */ + (void)__db_e_detach(dbenv, 1); + + /* Discard the physical files. */ +remfiles: (void)__db_e_remfile(dbenv); + } else { + /* Unlock the environment. */ + MUTEX_UNLOCK(dbenv, &renv->mutex); + + /* Discard the environment. */ + (void)__db_e_detach(dbenv, 0); + + ret = EBUSY; + } + +err: + return (ret); +} + +/* + * __db_e_remfile -- + * Discard any region files in the filesystem. + */ +static int +__db_e_remfile(dbenv) + DB_ENV *dbenv; +{ + static char *old_region_names[] = { + "__db_lock.share", + "__db_log.share", + "__db_mpool.share", + "__db_txn.share", + NULL, + }; + int cnt, fcnt, lastrm, ret; + u_int8_t saved_byte; + const char *dir; + char *p, **names, *path, buf[sizeof(DB_REGION_FMT) + 20]; + + /* Get the full path of a file in the environment. */ + (void)snprintf(buf, sizeof(buf), "%s", DB_REGION_ENV); + if ((ret = + __db_appname(dbenv, DB_APP_NONE, NULL, buf, 0, NULL, &path)) != 0) + return (ret); + + /* Get the parent directory for the environment. */ + if ((p = __db_rpath(path)) == NULL) { + p = path; + saved_byte = *p; + + dir = PATH_DOT; + } else { + saved_byte = *p; + *p = '\0'; + + dir = path; + } + + /* Get the list of file names. */ + ret = __os_dirlist(dbenv, dir, &names, &fcnt); + + /* Restore the path, and free it. */ + *p = saved_byte; + __os_freestr(path); + + if (ret != 0) { + __db_err(dbenv, "%s: %s", dir, db_strerror(ret)); + return (ret); + } + + /* + * Search for valid region names, and remove them. We remove the + * environment region last, because it's the key to this whole mess. + */ + for (lastrm = -1, cnt = fcnt; --cnt >= 0;) { + if (strlen(names[cnt]) != DB_REGION_NAME_LENGTH || + memcmp(names[cnt], DB_REGION_FMT, DB_REGION_NAME_NUM) != 0) + continue; + if (strcmp(names[cnt], DB_REGION_ENV) == 0) { + lastrm = cnt; + continue; + } + for (p = names[cnt] + DB_REGION_NAME_NUM; + *p != '\0' && isdigit((int)*p); ++p) + ; + if (*p != '\0') + continue; + + if (__db_appname(dbenv, + DB_APP_NONE, NULL, names[cnt], 0, NULL, &path) == 0) { + (void)__os_unlink(dbenv, path); + __os_freestr(path); + } + } + + if (lastrm != -1) + if (__db_appname(dbenv, + DB_APP_NONE, NULL, names[lastrm], 0, NULL, &path) == 0) { + (void)__os_unlink(dbenv, path); + __os_freestr(path); + } + __os_dirfree(names, fcnt); + + /* + * !!! + * Backward compatibility -- remove region files from releases + * before 2.8.XX. + */ + for (names = (char **)old_region_names; *names != NULL; ++names) + if (__db_appname(dbenv, + DB_APP_NONE, NULL, *names, 0, NULL, &path) == 0) { + (void)__os_unlink(dbenv, path); + __os_freestr(path); + } + + return (0); +} + +/* + * __db_e_stat + * Statistics for the environment. + * + * PUBLIC: int __db_e_stat __P((DB_ENV *, REGENV *, REGION *, int *)); + */ +int +__db_e_stat(dbenv, arg_renv, arg_regions, arg_regions_cnt) + DB_ENV *dbenv; + REGENV *arg_renv; + REGION *arg_regions; + int *arg_regions_cnt; +{ + REGENV *renv; + REGINFO *infop; + REGION *rp; + int n; + + infop = dbenv->reginfo; + renv = infop->primary; + rp = infop->rp; + + /* Lock the environment. */ + MUTEX_LOCK(dbenv, &rp->mutex, dbenv->lockfhp); + + *arg_renv = *renv; + + for (n = 0, rp = SH_LIST_FIRST(&renv->regionq, __db_region); + n < *arg_regions_cnt && rp != NULL; + ++n, rp = SH_LIST_NEXT(rp, q, __db_region)) + arg_regions[n] = *rp; + + /* Release the lock. */ + rp = infop->rp; + MUTEX_UNLOCK(dbenv, &rp->mutex); + + *arg_regions_cnt = n == 0 ? n : n - 1; + + return (0); +} + +/* + * __db_r_attach + * Join/create a region. + * + * PUBLIC: int __db_r_attach __P((DB_ENV *, REGINFO *, size_t)); + */ +int +__db_r_attach(dbenv, infop, size) + DB_ENV *dbenv; + REGINFO *infop; + size_t size; +{ + REGENV *renv; + REGION *rp; + int ret; + char buf[sizeof(DB_REGION_FMT) + 20]; + + renv = ((REGINFO *)dbenv->reginfo)->primary; + F_CLR(infop, REGION_CREATE); + + /* Lock the environment. */ + MUTEX_LOCK(dbenv, &renv->mutex, dbenv->lockfhp); + + /* Find or create a REGION structure for this region. */ + if ((ret = __db_des_get(dbenv, dbenv->reginfo, infop, &rp)) != 0) { + MUTEX_UNLOCK(dbenv, &renv->mutex); + return (ret); + } + infop->rp = rp; + infop->type = rp->type; + infop->id = rp->id; + + /* If we're creating the region, set the desired size. */ + if (F_ISSET(infop, REGION_CREATE)) + rp->size = size; + + /* Join/create the underlying region. */ + (void)snprintf(buf, sizeof(buf), DB_REGION_FMT, infop->id); + if ((ret = __db_appname(dbenv, + DB_APP_NONE, NULL, buf, 0, NULL, &infop->name)) != 0) + goto err; + if ((ret = __os_r_attach(dbenv, infop, rp)) != 0) + goto err; + + /* + * Fault the pages into memory. Note, do this BEFORE we initialize + * anything because we're writing pages in created regions, not just + * reading them. + */ + (void)__db_faultmem(infop->addr, + rp->size, F_ISSET(infop, REGION_CREATE)); + + /* + * !!! + * The underlying layer may have just decided that we are going + * to create the region. There are various system issues that + * can result in a useless region that requires re-initialization. + * + * If we created the region, initialize it for allocation. + */ + if (F_ISSET(infop, REGION_CREATE)) { + ((REGION *)(infop->addr))->magic = DB_REGION_MAGIC; + + (void)__db_shalloc_init(infop->addr, rp->size); + } + + /* + * If the underlying REGION isn't the environment, acquire a lock + * for it and release our lock on the environment. + */ + if (infop->type != REGION_TYPE_ENV) { + MUTEX_LOCK(dbenv, &rp->mutex, dbenv->lockfhp); + MUTEX_UNLOCK(dbenv, &renv->mutex); + } + + return (0); + + /* Discard the underlying region. */ +err: if (infop->addr != NULL) + (void)__os_r_detach(dbenv, + infop, F_ISSET(infop, REGION_CREATE)); + infop->rp = NULL; + infop->id = INVALID_REGION_ID; + + /* Discard the REGION structure if we created it. */ + if (F_ISSET(infop, REGION_CREATE)) + (void)__db_des_destroy(dbenv, rp); + + /* Release the environment lock. */ + MUTEX_UNLOCK(dbenv, &renv->mutex); + + return (ret); +} + +/* + * __db_r_detach -- + * Detach from a region. + * + * PUBLIC: int __db_r_detach __P((DB_ENV *, REGINFO *, int)); + */ +int +__db_r_detach(dbenv, infop, destroy) + DB_ENV *dbenv; + REGINFO *infop; + int destroy; +{ + REGENV *renv; + REGION *rp; + int ret, t_ret; + + renv = ((REGINFO *)dbenv->reginfo)->primary; + rp = infop->rp; + + /* Lock the environment. */ + MUTEX_LOCK(dbenv, &renv->mutex, dbenv->lockfhp); + + /* Acquire the lock for the REGION. */ + MUTEX_LOCK(dbenv, &rp->mutex, dbenv->lockfhp); + + /* + * We need to call destroy on per-subsystem info before + * we free the memory associated with the region. + */ + if (destroy) + __db_region_destroy(dbenv, infop); + + /* Detach from the underlying OS region. */ + ret = __os_r_detach(dbenv, infop, destroy); + + /* Release the REGION lock. */ + MUTEX_UNLOCK(dbenv, &rp->mutex); + + /* If we destroyed the region, discard the REGION structure. */ + if (destroy && + ((t_ret = __db_des_destroy(dbenv, rp)) != 0) && ret == 0) + ret = t_ret; + + /* Release the environment lock. */ + MUTEX_UNLOCK(dbenv, &renv->mutex); + + /* Destroy the structure. */ + if (infop->name != NULL) + __os_freestr(infop->name); + + return (ret); +} + +/* + * __db_des_get -- + * Return a reference to the shared information for a REGION, + * optionally creating a new entry. + */ +static int +__db_des_get(dbenv, env_infop, infop, rpp) + DB_ENV *dbenv; + REGINFO *env_infop, *infop; + REGION **rpp; +{ + REGENV *renv; + REGION *rp, *first_type; + u_int32_t maxid; + int ret; + + /* + * !!! + * Called with the environment already locked. + */ + *rpp = NULL; + renv = env_infop->primary; + + /* + * If the caller wants to join a region, walk through the existing + * regions looking for a matching ID (if ID specified) or matching + * type (if type specified). If we return based on a matching type + * return the "primary" region, that is, the first region that was + * created of this type. + * + * Track the maximum region ID so we can allocate a new region, + * note that we have to start at 1 because the primary environment + * uses ID == 1. + */ + maxid = REGION_ID_ENV; + for (first_type = NULL, + rp = SH_LIST_FIRST(&renv->regionq, __db_region); + rp != NULL; rp = SH_LIST_NEXT(rp, q, __db_region)) { + if (infop->id != INVALID_REGION_ID) { + if (infop->id == rp->id) + break; + continue; + } + if (infop->type == rp->type && + F_ISSET(infop, REGION_JOIN_OK) && + (first_type == NULL || first_type->id > rp->id)) + first_type = rp; + + if (rp->id > maxid) + maxid = rp->id; + } + if (rp == NULL) + rp = first_type; + + /* + * If we didn't find a region and we can't create the region, fail. + * The caller generates any error message. + */ + if (rp == NULL && !F_ISSET(infop, REGION_CREATE_OK)) + return (ENOENT); + + /* + * If we didn't find a region, create and initialize a REGION structure + * for the caller. If id was set, use that value, otherwise we use the + * next available ID. + */ + if (rp == NULL) { + if ((ret = __db_shalloc(env_infop->addr, + sizeof(REGION), MUTEX_ALIGN, &rp)) != 0) + return (ret); + + /* Initialize the region. */ + memset(rp, 0, sizeof(*rp)); + if ((ret = __db_mutex_init(dbenv, &rp->mutex, + R_OFFSET(env_infop, &rp->mutex) + DB_FCNTL_OFF_GEN, + 0)) != 0) { + __db_shalloc_free(env_infop->addr, rp); + return (ret); + } + rp->segid = INVALID_REGION_SEGID; + + /* + * Set the type and ID; if no region ID was specified, + * allocate one. + */ + rp->type = infop->type; + rp->id = infop->id == INVALID_REGION_ID ? maxid + 1 : infop->id; + + SH_LIST_INSERT_HEAD(&renv->regionq, rp, q, __db_region); + F_SET(infop, REGION_CREATE); + } + + *rpp = rp; + return (0); +} + +/* + * __db_des_destroy -- + * Destroy a reference to a REGION. + */ +static int +__db_des_destroy(dbenv, rp) + DB_ENV *dbenv; + REGION *rp; +{ + REGINFO *infop; + + /* + * !!! + * Called with the environment already locked. + */ + infop = dbenv->reginfo; + + SH_LIST_REMOVE(rp, q, __db_region); + __db_mutex_destroy(&rp->mutex); + __db_shalloc_free(infop->addr, rp); + + return (0); +} + +/* + * __db_faultmem -- + * Fault the region into memory. + */ +static int +__db_faultmem(addr, size, created) + void *addr; + size_t size; + int created; +{ + int ret; + u_int8_t *p, *t; + + /* + * It's sometimes significantly faster to page-fault in all of the + * region's pages before we run the application, as we see nasty + * side-effects when we page-fault while holding various locks, i.e., + * the lock takes a long time to acquire because of the underlying + * page fault, and the other threads convoy behind the lock holder. + * + * If we created the region, we write a non-zero value so that the + * system can't cheat. If we're just joining the region, we can + * only read the value and try to confuse the compiler sufficiently + * that it doesn't figure out that we're never really using it. + */ + ret = 0; + if (DB_GLOBAL(db_region_init)) { + if (created) + for (p = addr, t = (u_int8_t *)addr + size; + p < t; p += OS_VMPAGESIZE) + p[0] = 0xdb; + else + for (p = addr, t = (u_int8_t *)addr + size; + p < t; p += OS_VMPAGESIZE) + ret |= p[0]; + } + + return (ret); +} + +/* + * __db_region_destroy -- + * Destroy per-subsystem region information. + * Called with the region already locked. + */ +static void +__db_region_destroy(dbenv, infop) + DB_ENV *dbenv; + REGINFO *infop; +{ + switch (infop->type) { + case REGION_TYPE_LOCK: + __lock_region_destroy(dbenv, infop); + break; + case REGION_TYPE_MPOOL: + __mpool_region_destroy(dbenv, infop); + break; + case REGION_TYPE_ENV: + case REGION_TYPE_LOG: + case REGION_TYPE_MUTEX: + case REGION_TYPE_TXN: + break; + default: + DB_ASSERT(0); + break; + } +} |