diff options
author | Lorry <lorry@roadtrain.codethink.co.uk> | 2012-07-20 20:00:05 +0100 |
---|---|---|
committer | Lorry <lorry@roadtrain.codethink.co.uk> | 2012-07-20 20:00:05 +0100 |
commit | 3ef782d3745ea8f25a3151561a3cfb882190210e (patch) | |
tree | 86b9c2f5fde051dd0bced99b3fc9f5a3ba08db69 /src/env | |
download | berkeleydb-3ef782d3745ea8f25a3151561a3cfb882190210e.tar.gz |
Tarball conversion
Diffstat (limited to 'src/env')
-rw-r--r-- | src/env/env_alloc.c | 759 | ||||
-rw-r--r-- | src/env/env_backup.c | 166 | ||||
-rw-r--r-- | src/env/env_config.c | 737 | ||||
-rw-r--r-- | src/env/env_failchk.c | 558 | ||||
-rw-r--r-- | src/env/env_file.c | 128 | ||||
-rw-r--r-- | src/env/env_globals.c | 66 | ||||
-rw-r--r-- | src/env/env_method.c | 1918 | ||||
-rw-r--r-- | src/env/env_name.c | 285 | ||||
-rw-r--r-- | src/env/env_open.c | 1262 | ||||
-rw-r--r-- | src/env/env_recover.c | 1093 | ||||
-rw-r--r-- | src/env/env_region.c | 1497 | ||||
-rw-r--r-- | src/env/env_register.c | 730 | ||||
-rw-r--r-- | src/env/env_sig.c | 201 | ||||
-rw-r--r-- | src/env/env_stat.c | 879 |
14 files changed, 10279 insertions, 0 deletions
diff --git a/src/env/env_alloc.c b/src/env/env_alloc.c new file mode 100644 index 00000000..700bfb27 --- /dev/null +++ b/src/env/env_alloc.c @@ -0,0 +1,759 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" + +/* + * Implement shared memory region allocation. The initial list is a single + * memory "chunk" which is carved up as memory is requested. Chunks are + * coalesced when free'd. We maintain two types of linked-lists: a list of + * all chunks sorted by address, and a set of lists with free chunks sorted + * by size. + * + * The ALLOC_LAYOUT structure is the governing structure for the allocator. + * + * The ALLOC_ELEMENT structure is the structure that describes any single + * chunk of memory, and is immediately followed by the user's memory. + * + * The internal memory chunks are always aligned to a uintmax_t boundary so + * we don't drop core accessing the fields of the ALLOC_ELEMENT structure. + * + * The memory chunks returned to the user are aligned to a uintmax_t boundary. + * This is enforced by terminating the ALLOC_ELEMENT structure with a uintmax_t + * field as that immediately precedes the user's memory. Any caller needing + * more than uintmax_t alignment is responsible for doing alignment themselves. + */ + +typedef SH_TAILQ_HEAD(__sizeq) SIZEQ_HEAD; + +typedef struct __alloc_layout { + SH_TAILQ_HEAD(__addrq) addrq; /* Sorted by address */ + + /* + * A perfect Berkeley DB application does little allocation because + * most things are allocated on startup and never free'd. This is + * true even for the cache, because we don't free and re-allocate + * the memory associated with a cache buffer when swapping a page + * in memory for a page on disk -- unless the page is changing size. + * The latter problem is why we have multiple size queues. 
If the + * application's working set fits in cache, it's not a problem. If + * the application's working set doesn't fit in cache, but all of + * the databases have the same size pages, it's still not a problem. + * If the application's working set doesn't fit in cache, and its + * databases have different page sizes, we can end up walking a lot + * of 512B chunk allocations looking for an available 64KB chunk. + * + * So, we keep a set of queues, where we expect to find a chunk of + * roughly the right size at the front of the list. The first queue + * is chunks <= 1024, the second is <= 2048, and so on. With 11 + * queues, we have separate queues for chunks up to 1MB. + */ +#define DB_SIZE_Q_COUNT 11 + SIZEQ_HEAD sizeq[DB_SIZE_Q_COUNT]; /* Sorted by size */ +#ifdef HAVE_STATISTICS + u_int32_t pow2_size[DB_SIZE_Q_COUNT]; +#endif + +#ifdef HAVE_STATISTICS + u_int32_t success; /* Successful allocations */ + u_int32_t failure; /* Failed allocations */ + u_int32_t freed; /* Free calls */ + u_int32_t longest; /* Longest chain walked */ +#endif + uintmax_t unused; /* Guarantee alignment */ +} ALLOC_LAYOUT; + +typedef struct __alloc_element { + SH_TAILQ_ENTRY addrq; /* List by address */ + SH_TAILQ_ENTRY sizeq; /* List by size */ + + /* + * The "len" field is the total length of the chunk, not the size + * available to the caller. Use a uintmax_t to guarantee that the + * size of this struct will be aligned correctly. + */ + uintmax_t len; /* Chunk length */ + + /* + * The "ulen" field is the length returned to the caller. + * + * Set to 0 if the chunk is not currently in use. + */ + uintmax_t ulen; /* User's length */ +} ALLOC_ELEMENT; + +/* + * If the chunk can be split into two pieces, with the fragment holding at + * least 64 bytes of memory, we divide the chunk into two parts. + */ +#define SHALLOC_FRAGMENT (sizeof(ALLOC_ELEMENT) + 64) + +/* Macro to find the appropriate queue for a specific size chunk. 
*/ +#undef SET_QUEUE_FOR_SIZE +#define SET_QUEUE_FOR_SIZE(head, q, i, len) do { \ + for (i = 0; i < DB_SIZE_Q_COUNT; ++i) { \ + q = &(head)->sizeq[i]; \ + if ((len) <= (u_int64_t)1024 << i) \ + break; \ + } \ +} while (0) + +static void __env_size_insert __P((ALLOC_LAYOUT *, ALLOC_ELEMENT *)); + +/* + * __env_alloc_init -- + * Initialize the area as one large chunk. + * + * PUBLIC: void __env_alloc_init __P((REGINFO *, size_t)); + */ +void +__env_alloc_init(infop, size) + REGINFO *infop; + size_t size; +{ + ALLOC_ELEMENT *elp; + ALLOC_LAYOUT *head; + ENV *env; + u_int i; + + env = infop->env; + + /* No initialization needed for heap memory regions. */ + if (F_ISSET(env, ENV_PRIVATE)) + return; + + /* + * The first chunk of memory is the ALLOC_LAYOUT structure. + */ + head = infop->head; + memset(head, 0, sizeof(*head)); + SH_TAILQ_INIT(&head->addrq); + for (i = 0; i < DB_SIZE_Q_COUNT; ++i) + SH_TAILQ_INIT(&head->sizeq[i]); + COMPQUIET(head->unused, 0); + + /* + * The rest of the memory is the first available chunk. + */ + elp = (ALLOC_ELEMENT *)((u_int8_t *)head + sizeof(ALLOC_LAYOUT)); + elp->len = size - sizeof(ALLOC_LAYOUT); + elp->ulen = 0; + + SH_TAILQ_INSERT_HEAD(&head->addrq, elp, addrq, __alloc_element); + SH_TAILQ_INSERT_HEAD( + &head->sizeq[DB_SIZE_Q_COUNT - 1], elp, sizeq, __alloc_element); +} + +/* + * The length, the ALLOC_ELEMENT structure and an optional guard byte, + * rounded up to standard alignment. + */ +#ifdef DIAGNOSTIC +#define DB_ALLOC_SIZE(len) \ + (size_t)DB_ALIGN((len) + sizeof(ALLOC_ELEMENT) + 1, sizeof(uintmax_t)) +#else +#define DB_ALLOC_SIZE(len) \ + (size_t)DB_ALIGN((len) + sizeof(ALLOC_ELEMENT), sizeof(uintmax_t)) +#endif + +/* + * __env_alloc_overhead -- + * Return the overhead needed for an allocation. + * + * PUBLIC: size_t __env_alloc_overhead __P((void)); + */ +size_t +__env_alloc_overhead() +{ + return (sizeof(ALLOC_ELEMENT)); +} + +/* + * __env_alloc_size -- + * Return the space needed for an allocation, including alignment. 
+ * + * PUBLIC: size_t __env_alloc_size __P((size_t)); + */ +size_t +__env_alloc_size(len) + size_t len; +{ + return (DB_ALLOC_SIZE(len)); +} + +/* + * __env_alloc -- + * Allocate space from the shared region. + * + * PUBLIC: int __env_alloc __P((REGINFO *, size_t, void *)); + */ +int +__env_alloc(infop, len, retp) + REGINFO *infop; + size_t len; + void *retp; +{ + SIZEQ_HEAD *q; + ALLOC_ELEMENT *elp, *frag, *elp_tmp; + ALLOC_LAYOUT *head; + ENV *env; + REGION_MEM *mem; + REGINFO *envinfop; + size_t total_len; + u_int8_t *p; + u_int i; + int ret; +#ifdef HAVE_STATISTICS + u_int32_t st_search; +#endif + env = infop->env; + *(void **)retp = NULL; +#ifdef HAVE_MUTEX_SUPPORT + MUTEX_REQUIRED(env, infop->mtx_alloc); +#endif + + PERFMON3(env, mpool, env_alloc, len, infop->id, infop->type); + /* + * In a heap-backed environment, we call malloc for additional space. + * (Malloc must return memory correctly aligned for our use.) + * + * In a heap-backed environment, memory is laid out as follows: + * + * { uintmax_t total-length } { user-memory } { guard-byte } + */ + if (F_ISSET(env, ENV_PRIVATE)) { + /* + * If we are shared then we must track the allocation + * in the main environment region. + */ + if (F_ISSET(infop, REGION_SHARED)) + envinfop = env->reginfo; + else + envinfop = infop; + /* + * We need an additional uintmax_t to hold the length (and + * keep the buffer aligned on 32-bit systems). + */ + len += sizeof(uintmax_t); + if (F_ISSET(infop, REGION_TRACKED)) + len += sizeof(REGION_MEM); + +#ifdef DIAGNOSTIC + /* Plus one byte for the guard byte. */ + ++len; +#endif + /* Check if we're over the limit. */ + if (envinfop->max_alloc != 0 && + envinfop->allocated + len > envinfop->max_alloc) + return (ENOMEM); + + /* Allocate the space. 
*/ + if ((ret = __os_malloc(env, len, &p)) != 0) + return (ret); + infop->allocated += len; + if (infop != envinfop) + envinfop->allocated += len; + + *(uintmax_t *)p = len; +#ifdef DIAGNOSTIC + p[len - 1] = GUARD_BYTE; +#endif + if (F_ISSET(infop, REGION_TRACKED)) { + mem = (REGION_MEM *)(p + sizeof(uintmax_t)); + mem->next = infop->mem; + infop->mem = mem; + p += sizeof(mem); + } + *(void **)retp = p + sizeof(uintmax_t); + return (0); + } + + head = infop->head; + total_len = DB_ALLOC_SIZE(len); + + /* Find the first size queue that could satisfy the request. */ + COMPQUIET(q, NULL); +#ifdef HAVE_MMAP_EXTEND +retry: +#endif + SET_QUEUE_FOR_SIZE(head, q, i, total_len); + +#ifdef HAVE_STATISTICS + if (i >= DB_SIZE_Q_COUNT) + i = DB_SIZE_Q_COUNT - 1; + ++head->pow2_size[i]; /* Note the size of the request. */ +#endif + + /* + * Search this queue, and, if necessary, queues larger than this queue, + * looking for a chunk we can use. + */ + STAT(st_search = 0); + for (elp = NULL;; ++q) { + SH_TAILQ_FOREACH(elp_tmp, q, sizeq, __alloc_element) { + STAT(++st_search); + + /* + * Chunks are sorted from largest to smallest -- if + * this chunk is less than what we need, no chunk + * further down the list will be large enough. + */ + if (elp_tmp->len < total_len) + break; + + /* + * This chunk will do... maybe there's a better one, + * but this one will do. + */ + elp = elp_tmp; + + /* + * We might have many chunks of the same size. Stop + * looking if we won't fragment memory by picking the + * current one. + */ + if (elp_tmp->len - total_len <= SHALLOC_FRAGMENT) + break; + } + if (elp != NULL || ++i >= DB_SIZE_Q_COUNT) + break; + } + +#ifdef HAVE_STATISTICS + if (head->longest < st_search) { + head->longest = st_search; + STAT_PERFMON3(env, + mpool, longest_search, len, infop->id, st_search); + } +#endif + + /* + * If we don't find an element of the right size, try to extend + * the region, if not then we are done. 
+ */ + if (elp == NULL) { + ret = ENOMEM; +#ifdef HAVE_MMAP_EXTEND + if (infop->rp->size < infop->rp->max && + (ret = __env_region_extend(env, infop)) == 0) + goto retry; +#endif + STAT_INC_VERB(env, mpool, fail, head->failure, len, infop->id); + return (ret); + } + STAT_INC_VERB(env, mpool, alloc, head->success, len, infop->id); + + /* Pull the chunk off of the size queue. */ + SH_TAILQ_REMOVE(q, elp, sizeq, __alloc_element); + + if (elp->len - total_len > SHALLOC_FRAGMENT) { + frag = (ALLOC_ELEMENT *)((u_int8_t *)elp + total_len); + frag->len = elp->len - total_len; + frag->ulen = 0; + + elp->len = total_len; + + /* The fragment follows the chunk on the address queue. */ + SH_TAILQ_INSERT_AFTER( + &head->addrq, elp, frag, addrq, __alloc_element); + + /* Insert the frag into the correct size queue. */ + __env_size_insert(head, frag); + } + + p = (u_int8_t *)elp + sizeof(ALLOC_ELEMENT); + elp->ulen = len; +#ifdef DIAGNOSTIC + p[len] = GUARD_BYTE; +#endif + *(void **)retp = p; + + return (0); +} + +/* + * __env_alloc_free -- + * Free space into the shared region. + * + * PUBLIC: void __env_alloc_free __P((REGINFO *, void *)); + */ +void +__env_alloc_free(infop, ptr) + REGINFO *infop; + void *ptr; +{ + ALLOC_ELEMENT *elp, *elp_tmp; + ALLOC_LAYOUT *head; + ENV *env; + SIZEQ_HEAD *q; + size_t len; + u_int8_t i, *p; + + env = infop->env; + + /* In a private region, we call free. */ + if (F_ISSET(env, ENV_PRIVATE)) { + /* Find the start of the memory chunk and its length. */ + p = (u_int8_t *)((uintmax_t *)ptr - 1); + len = (size_t)*(uintmax_t *)p; + + infop->allocated -= len; + if (F_ISSET(infop, REGION_SHARED)) + env->reginfo->allocated -= len; + +#ifdef DIAGNOSTIC + /* Check the guard byte. */ + DB_ASSERT(env, p[len - 1] == GUARD_BYTE); + + /* Trash the memory chunk. 
*/ + memset(p, CLEAR_BYTE, len); +#endif + __os_free(env, p); + return; + } + +#ifdef HAVE_MUTEX_SUPPORT + MUTEX_REQUIRED(env, infop->mtx_alloc); +#endif + + head = infop->head; + + p = ptr; + elp = (ALLOC_ELEMENT *)(p - sizeof(ALLOC_ELEMENT)); + + STAT_INC_VERB(env, mpool, free, head->freed, elp->ulen, infop->id); + +#ifdef DIAGNOSTIC + /* Check the guard byte. */ + DB_ASSERT(env, p[elp->ulen] == GUARD_BYTE); + + /* Trash the memory chunk. */ + memset(p, CLEAR_BYTE, (size_t)elp->len - sizeof(ALLOC_ELEMENT)); +#endif + + /* Mark the memory as no longer in use. */ + elp->ulen = 0; + + /* + * Try and merge this chunk with chunks on either side of it. Two + * chunks can be merged if they're contiguous and not in use. + */ + if ((elp_tmp = + SH_TAILQ_PREV(&head->addrq, elp, addrq, __alloc_element)) != NULL && + elp_tmp->ulen == 0 && + (u_int8_t *)elp_tmp + elp_tmp->len == (u_int8_t *)elp) { + /* + * If we're merging the entry into a previous entry, remove the + * current entry from the addr queue and the previous entry from + * its size queue, and merge. + */ + SH_TAILQ_REMOVE(&head->addrq, elp, addrq, __alloc_element); + SET_QUEUE_FOR_SIZE(head, q, i, elp_tmp->len); + SH_TAILQ_REMOVE(q, elp_tmp, sizeq, __alloc_element); + + elp_tmp->len += elp->len; + elp = elp_tmp; + } + if ((elp_tmp = SH_TAILQ_NEXT(elp, addrq, __alloc_element)) != NULL && + elp_tmp->ulen == 0 && + (u_int8_t *)elp + elp->len == (u_int8_t *)elp_tmp) { + /* + * If we're merging the current entry into a subsequent entry, + * remove the subsequent entry from the addr and size queues + * and merge. + */ + SH_TAILQ_REMOVE(&head->addrq, elp_tmp, addrq, __alloc_element); + SET_QUEUE_FOR_SIZE(head, q, i, elp_tmp->len); + SH_TAILQ_REMOVE(q, elp_tmp, sizeq, __alloc_element); + + elp->len += elp_tmp->len; + } + + /* Insert in the correct place in the size queues. */ + __env_size_insert(head, elp); +} + +/* + * __env_alloc_extend -- + * Extend a previously allocated chunk at the end of a region. 
+ * + * PUBLIC: int __env_alloc_extend __P((REGINFO *, void *, size_t *)); + */ +int +__env_alloc_extend(infop, ptr, lenp) + REGINFO *infop; + void *ptr; + size_t *lenp; +{ + ALLOC_ELEMENT *elp, *elp_tmp; + ALLOC_LAYOUT *head; + ENV *env; + SIZEQ_HEAD *q; + size_t len, tlen; + u_int8_t i, *p; + int ret; + + env = infop->env; + + DB_ASSERT(env, !F_ISSET(env, ENV_PRIVATE)); + +#ifdef HAVE_MUTEX_SUPPORT + MUTEX_REQUIRED(env, infop->mtx_alloc); +#endif + + head = infop->head; + + p = ptr; + len = *lenp; + elp = (ALLOC_ELEMENT *)(p - sizeof(ALLOC_ELEMENT)); +#ifdef DIAGNOSTIC + /* Check the guard byte. */ + DB_ASSERT(env, p[elp->ulen] == GUARD_BYTE); +#endif + + /* See if there is anything left in the region. */ +again: if ((elp_tmp = SH_TAILQ_NEXT(elp, addrq, __alloc_element)) != NULL && + elp_tmp->ulen == 0 && + (u_int8_t *)elp + elp->len == (u_int8_t *)elp_tmp) { + /* + * If we're merging the current entry into a subsequent entry, + * remove the subsequent entry from the addr and size queues + * and merge. + */ + SH_TAILQ_REMOVE(&head->addrq, elp_tmp, addrq, __alloc_element); + SET_QUEUE_FOR_SIZE(head, q, i, elp_tmp->len); + SH_TAILQ_REMOVE(q, elp_tmp, sizeq, __alloc_element); + if (elp_tmp->len < len + SHALLOC_FRAGMENT) { + elp->len += elp_tmp->len; + if (elp_tmp->len < len) + len -= (size_t)elp_tmp->len; + else + len = 0; + } else { + tlen = (size_t)elp_tmp->len; + elp_tmp = (ALLOC_ELEMENT *) ((u_int8_t *)elp_tmp + len); + elp_tmp->len = tlen - len; + elp_tmp->ulen = 0; + elp->len += len; + len = 0; + + /* The fragment follows the on the address queue. */ + SH_TAILQ_INSERT_AFTER( + &head->addrq, elp, elp_tmp, addrq, __alloc_element); + + /* Insert the frag into the correct size queue. 
*/ + __env_size_insert(head, elp_tmp); + } + } else if (elp_tmp != NULL) { + __db_errx(env, DB_STR("1583", "block not at end of region")); + return (__env_panic(env, EINVAL)); + } + if (len == 0) + goto done; + + if ((ret = __env_region_extend(env, infop)) != 0) { + if (ret != ENOMEM) + return (ret); + goto done; + } + goto again; + +done: elp->ulen = elp->len - sizeof(ALLOC_ELEMENT); +#ifdef DIAGNOSTIC + elp->ulen -= sizeof(uintmax_t); + /* There was room for the guard byte in the chunk that came in. */ + p[elp->ulen] = GUARD_BYTE; +#endif + *lenp -= len; + infop->allocated += *lenp; + if (F_ISSET(infop, REGION_SHARED)) + env->reginfo->allocated += *lenp; + return (0); +} + +/* + * __env_size_insert -- + * Insert into the correct place in the size queues. + */ +static void +__env_size_insert(head, elp) + ALLOC_LAYOUT *head; + ALLOC_ELEMENT *elp; +{ + SIZEQ_HEAD *q; + ALLOC_ELEMENT *elp_tmp; + u_int i; + + /* Find the appropriate queue for the chunk. */ + SET_QUEUE_FOR_SIZE(head, q, i, elp->len); + + /* Find the correct slot in the size queue. */ + SH_TAILQ_FOREACH(elp_tmp, q, sizeq, __alloc_element) + if (elp->len >= elp_tmp->len) + break; + if (elp_tmp == NULL) + SH_TAILQ_INSERT_TAIL(q, elp, sizeq); + else + SH_TAILQ_INSERT_BEFORE(q, elp_tmp, elp, sizeq, __alloc_element); +} + +/* + * __env_region_extend -- + * Extend a region. 
+ * + * PUBLIC: int __env_region_extend __P((ENV *, REGINFO *)); + */ +int +__env_region_extend(env, infop) + ENV *env; + REGINFO *infop; +{ + ALLOC_ELEMENT *elp; + REGION *rp; + int ret; + + DB_ASSERT(env, !F_ISSET(env, ENV_PRIVATE)); + + ret = 0; + rp = infop->rp; + if (rp->size >= rp->max) + return (ENOMEM); + elp = (ALLOC_ELEMENT *)((u_int8_t *)infop->addr + rp->size); + if (rp->size + rp->alloc > rp->max) + rp->alloc = rp->max - rp->size; + rp->size += rp->alloc; + rp->size = (size_t)ALIGNP_INC(rp->size, sizeof(size_t)); + if (rp->max - rp->size <= SHALLOC_FRAGMENT) + rp->size = rp->max; + if (infop->fhp && + (ret = __db_file_extend(env, infop->fhp, rp->size)) != 0) + return (ret); + elp->len = rp->alloc; + elp->ulen = 0; +#ifdef DIAGNOSTIC + *(u_int8_t *)(elp+1) = GUARD_BYTE; +#endif + + SH_TAILQ_INSERT_TAIL(&((ALLOC_LAYOUT *)infop->head)->addrq, elp, addrq); + __env_alloc_free(infop, elp + 1); + if (rp->alloc < MEGABYTE) + rp->alloc += rp->size; + if (rp->alloc > MEGABYTE) + rp->alloc = MEGABYTE; + return (ret); +} + +/* + * __env_elem_size -- + * Return the size of an allocated element. + * PUBLIC: uintmax_t __env_elem_size __P((ENV *, void *)); + */ +uintmax_t +__env_elem_size(env, p) + ENV *env; + void *p; +{ + ALLOC_ELEMENT *elp; + uintmax_t size; + + if (F_ISSET(env, ENV_PRIVATE)) { + size = *((uintmax_t *)p - 1); + size -= sizeof(uintmax_t); + } else { + elp = (ALLOC_ELEMENT *)((u_int8_t *)p - sizeof(ALLOC_ELEMENT)); + size = elp->ulen; + } + return (size); +} + +/* + * __env_get_chunk -- + * Return the next chunk allocated in a private region. 
+ * PUBLIC: void * __env_get_chunk __P((REGINFO *, void **, uintmax_t *)); + */ +void * +__env_get_chunk(infop, nextp, sizep) + REGINFO *infop; + void **nextp; + uintmax_t *sizep; +{ + REGION_MEM *mem; + + if (infop->mem == NULL) + return (NULL); + if (*nextp == NULL) + *nextp = infop->mem; + mem = *(REGION_MEM **)nextp; + *nextp = mem->next; + + *sizep = __env_elem_size(infop->env, mem); + *sizep -= sizeof(*mem); + + return ((void *)(mem + 1)); +} + +#ifdef HAVE_STATISTICS +/* + * __env_alloc_print -- + * Display the lists of memory chunks. + * + * PUBLIC: void __env_alloc_print __P((REGINFO *, u_int32_t)); + */ +void +__env_alloc_print(infop, flags) + REGINFO *infop; + u_int32_t flags; +{ + ALLOC_ELEMENT *elp; + ALLOC_LAYOUT *head; + ENV *env; + u_int i; + + env = infop->env; + head = infop->head; + + if (F_ISSET(env, ENV_PRIVATE)) + return; + + __db_msg(env, + "Region allocations: %lu allocations, %lu failures, %lu frees, %lu longest", + (u_long)head->success, (u_long)head->failure, (u_long)head->freed, + (u_long)head->longest); + + if (!LF_ISSET(DB_STAT_ALL)) + return; + + __db_msg(env, "%s", "Allocations by power-of-two sizes:"); + for (i = 0; i < DB_SIZE_Q_COUNT; ++i) + __db_msg(env, "%3dKB\t%lu", + (1024 << i) / 1024, (u_long)head->pow2_size[i]); + + if (!LF_ISSET(DB_STAT_ALLOC)) + return; + /* + * We don't normally display the list of address/chunk pairs, a few + * thousand lines of output is too voluminous for even DB_STAT_ALL. 
+ */ + __db_msg(env, + "Allocation list by address, offset: {chunk length, user length}"); + SH_TAILQ_FOREACH(elp, &head->addrq, addrq, __alloc_element) + __db_msg(env, "\t%#lx, %lu {%lu, %lu}", + P_TO_ULONG(elp), (u_long)R_OFFSET(infop, elp), + (u_long)elp->len, (u_long)elp->ulen); + + __db_msg(env, "Allocation free list by size: KB {chunk length}"); + for (i = 0; i < DB_SIZE_Q_COUNT; ++i) { + __db_msg(env, "%3dKB", (1024 << i) / 1024); + SH_TAILQ_FOREACH(elp, &head->sizeq[i], sizeq, __alloc_element) + __db_msg(env, + "\t%#lx {%lu}", P_TO_ULONG(elp), (u_long)elp->len); + } +} +#endif diff --git a/src/env/env_backup.c b/src/env/env_backup.c new file mode 100644 index 00000000..9c79dbb4 --- /dev/null +++ b/src/env/env_backup.c @@ -0,0 +1,166 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2011, 2012 Oracle and/or its affiliates. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" + +static int __env_backup_alloc __P((DB_ENV *)); + +static int +__env_backup_alloc(dbenv) + DB_ENV *dbenv; +{ + ENV *env; + + env = dbenv->env; + if (env->backup_handle != NULL) + return (0); + return (__os_calloc(env, 1, + sizeof(*env->backup_handle), &env->backup_handle)); +} + +/* + * __env_get_backup_config -- + * + * PUBLIC: int __env_get_backup_config __P((DB_ENV *, + * PUBLIC: DB_BACKUP_CONFIG, u_int32_t*)); + */ +int +__env_get_backup_config(dbenv, config, valuep) + DB_ENV *dbenv; + DB_BACKUP_CONFIG config; + u_int32_t *valuep; +{ + DB_BACKUP *backup; + + backup = dbenv->env->backup_handle; + if (backup == NULL) + return (EINVAL); + + switch (config) { + case DB_BACKUP_WRITE_DIRECT: + *valuep = F_ISSET(backup, BACKUP_WRITE_DIRECT); + break; + + case DB_BACKUP_READ_COUNT: + *valuep = backup->read_count; + break; + + case DB_BACKUP_READ_SLEEP: + *valuep = backup->read_sleep; + break; + + case DB_BACKUP_SIZE: + *valuep = backup->size; + break; + } + return (0); +} + +/* + * __env_set_backup_config -- + * + * 
PUBLIC: int __env_set_backup_config __P((DB_ENV *, + * PUBLIC: DB_BACKUP_CONFIG, u_int32_t)); + */ +int +__env_set_backup_config(dbenv, config, value) + DB_ENV *dbenv; + DB_BACKUP_CONFIG config; + u_int32_t value; +{ + DB_BACKUP *backup; + int ret; + + if ((ret = __env_backup_alloc(dbenv)) != 0) + return (ret); + + backup = dbenv->env->backup_handle; + switch (config) { + case DB_BACKUP_WRITE_DIRECT: + if (value == 0) + F_CLR(backup, BACKUP_WRITE_DIRECT); + else + F_SET(backup, BACKUP_WRITE_DIRECT); + break; + + case DB_BACKUP_READ_COUNT: + backup->read_count = value; + break; + + case DB_BACKUP_READ_SLEEP: + backup->read_sleep = value; + break; + + case DB_BACKUP_SIZE: + backup->size = value; + break; + } + + return (0); +} + +/* + * __env_get_backup_callbacks -- + * + * PUBLIC: int __env_get_backup_callbacks __P((DB_ENV *, + * PUBLIC: int (**)(DB_ENV *, const char *, const char *, void **), + * PUBLIC: int (**)(DB_ENV *, + * PUBLIC: u_int32_t, u_int32_t, u_int32_t, u_int8_t *, void *), + * PUBLIC: int (**)(DB_ENV *, const char *, void *))); + */ +int +__env_get_backup_callbacks(dbenv, openp, writep, closep) + DB_ENV *dbenv; + int (**openp)(DB_ENV *, const char *, const char *, void **); + int (**writep)(DB_ENV *, + u_int32_t, u_int32_t, u_int32_t, u_int8_t *, void *); + int (**closep)(DB_ENV *, const char *, void *); +{ + DB_BACKUP *backup; + + backup = dbenv->env->backup_handle; + if (backup == NULL) + return (EINVAL); + + *openp = backup->open; + *writep = backup->write; + *closep = backup->close; + return (0); +} + +/* + * __env_set_backup_callbacks -- + * + * PUBLIC: int __env_set_backup_callbacks __P((DB_ENV *, + * PUBLIC: int (*)(DB_ENV *, const char *, const char *, void **), + * PUBLIC: int (*)(DB_ENV *, + * PUBLIC: u_int32_t, u_int32_t, u_int32_t, u_int8_t *, void *), + * PUBLIC: int (*)(DB_ENV *, const char *, void *))); + */ +int +__env_set_backup_callbacks(dbenv, open_func, write_func, close_func) + DB_ENV *dbenv; + int (*open_func)(DB_ENV *, const 
char *, const char *, void **); + int (*write_func)(DB_ENV *, + u_int32_t, u_int32_t, u_int32_t, u_int8_t *, void *); + int (*close_func)(DB_ENV *, const char *, void *); +{ + DB_BACKUP *backup; + int ret; + + if ((ret = __env_backup_alloc(dbenv)) != 0) + return (ret); + + backup = dbenv->env->backup_handle; + backup->open = open_func; + backup->write = write_func; + backup->close = close_func; + return (0); +} diff --git a/src/env/env_config.c b/src/env/env_config.c new file mode 100644 index 00000000..57496909 --- /dev/null +++ b/src/env/env_config.c @@ -0,0 +1,737 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/lock.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" +#include "dbinc/txn.h" +#include "dbinc/db_page.h" +#include "dbinc_auto/db_ext.h" + +/* + * DB_CONFIG lines are processed primarily by interpreting the command + * description tables initialized below. + * + * Most DB_CONFIG commands consist of a single token name followed by one or two + * integer or string arguments. These commands are described by entries in the + * config_descs[] array. + * + * The remaining, usually more complex, DB_CONFIG commands are handled by small + * code blocks in __config_parse(). Many of those commands need to translate + * option names to the integer values needed by the API configuration functions. + * Below the __config_descs[] initialization there are many FN array + * initializations which provide the mapping between user-specifiable strings + * and internally-used integer values. Typically there is one of these mappings + * defined for each complex DB_CONFIG command. Use __db_name_to_val() + * to translate a string to its integer value. + */ +typedef enum { + CFG_INT, /* The argument is 1 signed integer. */ + CFG_LONG, /* The argument is 1 signed long int. 
*/ + CFG_UINT, /* The argument is 1 unsigned integer. */ + CFG_2INT, /* The arguments are 2 signed integers. */ + CFG_2UINT, /* The arguments are 2 unsigned integers. */ + CFG_STRING /* The rest of the line is a string. */ +} __db_config_type; + +typedef struct __db_config_desc { + char *name; /* The name of a simple DB_CONFIG command. */ + __db_config_type type; /* The enum describing its argument type(s). */ + int (*func)(); /* The function to call with the argument(s). */ +} CFG_DESC; + +/* These typedefs help eliminate lint warnings where "func" above is used. */ +typedef int (*CFG_FUNC_STRING) __P((DB_ENV *, const char *)); +typedef int (*CFG_FUNC_INT) __P((DB_ENV *, int)); +typedef int (*CFG_FUNC_LONG) __P((DB_ENV *, long)); +typedef int (*CFG_FUNC_UINT) __P((DB_ENV *, u_int32_t)); +typedef int (*CFG_FUNC_2INT) __P((DB_ENV *, int, int)); +typedef int (*CFG_FUNC_2UINT) __P((DB_ENV *, u_int32_t, u_int32_t)); + +/* + * This table lists the simple DB_CONFIG configuration commands. It is sorted by + * the command name, so that __config_scan() can bsearch() it. After making an + * addition to this table, please be sure that it remains sorted. With vi or + * vim, the following command line will do it: + * :/^static const CFG_DESC config_descs/+1, /^}/-1 ! sort + * + * This table can contain aliases. Aliases have different names with identical + * types and functions. 
At this time there are four aliases: + * Outdated Name Current Name + * db_data_dir set_data_dir + * db_log_dir set_lg_dir + * db_tmp_dir set_tmp_dir + * set_tas_spins mutex_set_tas_spins + */ +static const CFG_DESC config_descs[] = { + { "add_data_dir", CFG_STRING, __env_add_data_dir }, + { "db_data_dir", CFG_STRING, __env_set_data_dir }, + { "db_log_dir", CFG_STRING, __log_set_lg_dir }, + { "db_tmp_dir", CFG_STRING, __env_set_tmp_dir }, + { "mutex_set_align", CFG_UINT, __mutex_set_align }, + { "mutex_set_increment", CFG_UINT, __mutex_set_increment }, + { "mutex_set_init", CFG_UINT, __mutex_set_init }, + { "mutex_set_max", CFG_UINT, __mutex_set_max }, + { "mutex_set_tas_spins", CFG_UINT, __mutex_set_tas_spins }, + { "rep_set_clockskew", CFG_2UINT, __rep_set_clockskew }, + { "rep_set_limit", CFG_2UINT, __rep_set_limit }, + { "rep_set_nsites", CFG_UINT, __rep_set_nsites_pp }, + { "rep_set_priority", CFG_UINT, __rep_set_priority }, + { "rep_set_request", CFG_2UINT, __rep_set_request }, + { "set_cache_max", CFG_2UINT, __memp_set_cache_max }, + { "set_create_dir", CFG_STRING, __env_set_create_dir }, + { "set_data_dir", CFG_STRING, __env_set_data_dir }, + { "set_data_len", CFG_UINT, __env_set_data_len }, + { "set_intermediate_dir_mode",CFG_STRING, __env_set_intermediate_dir_mode }, + { "set_lg_bsize", CFG_UINT, __log_set_lg_bsize }, + { "set_lg_dir", CFG_STRING, __log_set_lg_dir }, + { "set_lg_filemode", CFG_INT, __log_set_lg_filemode }, + { "set_lg_max", CFG_UINT, __log_set_lg_max }, + { "set_lg_regionmax", CFG_UINT, __log_set_lg_regionmax }, + { "set_lk_max_lockers", CFG_UINT, __lock_set_lk_max_lockers }, + { "set_lk_max_locks", CFG_UINT, __lock_set_lk_max_locks }, + { "set_lk_max_objects", CFG_UINT, __lock_set_lk_max_objects }, + { "set_lk_partitions", CFG_UINT, __lock_set_lk_partitions }, + { "set_lk_tablesize", CFG_UINT, __lock_set_lk_tablesize }, + { "set_memory_max", CFG_2UINT, __env_set_memory_max }, + { "set_metadata_dir", CFG_STRING, __env_set_metadata_dir }, 
+ { "set_mp_max_openfd", CFG_INT, __memp_set_mp_max_openfd }, + { "set_mp_max_write", CFG_2INT, __memp_set_mp_max_write }, + { "set_mp_mmapsize", CFG_UINT, __memp_set_mp_mmapsize }, + { "set_mp_mtxcount", CFG_UINT, __memp_set_mp_mtxcount }, + { "set_mp_pagesize", CFG_UINT, __memp_set_mp_pagesize }, + { "set_shm_key", CFG_LONG, __env_set_shm_key }, + { "set_tas_spins", CFG_UINT, __mutex_set_tas_spins }, + { "set_thread_count", CFG_UINT, __env_set_thread_count }, + { "set_tmp_dir", CFG_STRING, __env_set_tmp_dir }, + { "set_tx_max", CFG_UINT, __txn_set_tx_max } +}; + +/* + * Here are the option-name to option-value mappings used by complex commands. + */ +static const FN config_mem_init[] = { + { (u_int32_t) DB_MEM_LOCK, "DB_MEM_LOCK" }, + { (u_int32_t) DB_MEM_LOCKER, "DB_MEM_LOCKER" }, + { (u_int32_t) DB_MEM_LOCKOBJECT, "DB_MEM_LOCKOBJECT" }, + { (u_int32_t) DB_MEM_TRANSACTION, "DB_MEM_TRANSACTION" }, + { (u_int32_t) DB_MEM_THREAD, "DB_MEM_THREAD" }, + { (u_int32_t) DB_MEM_LOGID, "DB_MEM_LOGID" }, + { 0, NULL } +}; + +static const FN config_rep_config[] = { + { DB_REP_CONF_AUTOINIT, "db_rep_conf_autoinit" }, + { DB_REP_CONF_AUTOROLLBACK, "db_rep_conf_autorollback" }, + { DB_REP_CONF_BULK, "db_rep_conf_bulk" }, + { DB_REP_CONF_DELAYCLIENT, "db_rep_conf_delayclient" }, + { DB_REP_CONF_INMEM, "db_rep_conf_inmem" }, + { DB_REP_CONF_LEASE, "db_rep_conf_lease" }, + { DB_REP_CONF_NOWAIT, "db_rep_conf_nowait" }, + { DB_REPMGR_CONF_2SITE_STRICT, "db_repmgr_conf_2site_strict" }, + { DB_REPMGR_CONF_ELECTIONS, "db_repmgr_conf_elections" }, + { 0, NULL } +}; + +static const FN config_rep_timeout[] = { + { DB_REP_ACK_TIMEOUT, "db_rep_ack_timeout" }, + { DB_REP_CHECKPOINT_DELAY, "db_rep_checkpoint_delay" }, + { DB_REP_CONNECTION_RETRY, "db_rep_connection_retry" }, + { DB_REP_ELECTION_TIMEOUT, "db_rep_election_timeout" }, + { DB_REP_ELECTION_RETRY, "db_rep_election_retry" }, + { DB_REP_FULL_ELECTION_TIMEOUT, "db_rep_full_election_timeout" }, + { DB_REP_HEARTBEAT_MONITOR, 
"db_rep_heartbeat_monitor" }, + { DB_REP_HEARTBEAT_SEND, "db_rep_heartbeat_send" }, + { DB_REP_LEASE_TIMEOUT, "db_rep_lease_timeout" }, + { 0, NULL } +}; + +static const FN config_repmgr_ack_policy[] = { + { DB_REPMGR_ACKS_ALL, "db_repmgr_acks_all" }, + { DB_REPMGR_ACKS_ALL_AVAILABLE, "db_repmgr_acks_all_available" }, + { DB_REPMGR_ACKS_ALL_PEERS, "db_repmgr_acks_all_peers" }, + { DB_REPMGR_ACKS_NONE, "db_repmgr_acks_none" }, + { DB_REPMGR_ACKS_ONE, "db_repmgr_acks_one" }, + { DB_REPMGR_ACKS_ONE_PEER, "db_repmgr_acks_one_peer" }, + { DB_REPMGR_ACKS_QUORUM, "db_repmgr_acks_quorum" }, + { 0, NULL } +}; + +static const FN config_repmgr_site[] = { + { DB_BOOTSTRAP_HELPER, "db_bootstrap_helper" }, + { DB_GROUP_CREATOR, "db_group_creator" }, + { DB_LEGACY, "db_legacy" }, + { DB_LOCAL_SITE, "db_local_site" }, + { DB_REPMGR_PEER, "db_repmgr_peer" }, + { 0, NULL } +}; + +static const FN config_set_flags[] = { + { DB_AUTO_COMMIT, "db_auto_commit" }, + { DB_CDB_ALLDB, "db_cdb_alldb" }, + { DB_DIRECT_DB, "db_direct_db" }, + { DB_DSYNC_DB, "db_dsync_db" }, + { DB_MULTIVERSION, "db_multiversion" }, + { DB_NOLOCKING, "db_nolocking" }, + { DB_NOMMAP, "db_nommap" }, + { DB_NOPANIC, "db_nopanic" }, + { DB_OVERWRITE, "db_overwrite" }, + { DB_REGION_INIT, "db_region_init" }, + { DB_TIME_NOTGRANTED, "db_time_notgranted" }, + { DB_TXN_NOSYNC, "db_txn_nosync" }, + { DB_TXN_NOWAIT, "db_txn_nowait" }, + { DB_TXN_SNAPSHOT, "db_txn_snapshot" }, + { DB_TXN_WRITE_NOSYNC, "db_txn_write_nosync" }, + { DB_YIELDCPU, "db_yieldcpu" }, + { 0, NULL } +}; + +static const FN config_set_flags_forlog[] = { + { DB_LOG_DIRECT, "db_direct_log" }, + { DB_LOG_DSYNC, "db_dsync_log" }, + { DB_LOG_AUTO_REMOVE, "db_log_autoremove" }, + { DB_LOG_IN_MEMORY, "db_log_inmemory" }, + { 0, NULL } +}; + +static const FN config_log_set_config[] = { + { DB_LOG_DIRECT, "db_log_direct" }, + { DB_LOG_DSYNC, "db_log_dsync" }, + { DB_LOG_AUTO_REMOVE, "db_log_auto_remove" }, + { DB_LOG_IN_MEMORY, "db_log_in_memory" }, + { 
DB_LOG_ZERO, "db_log_zero" }, + { 0, NULL } +}; + +static const FN config_set_lk_detect[] = { + { DB_LOCK_DEFAULT, "db_lock_default" }, + { DB_LOCK_EXPIRE, "db_lock_expire" }, + { DB_LOCK_MAXLOCKS, "db_lock_maxlocks" }, + { DB_LOCK_MAXWRITE, "db_lock_maxwrite" }, + { DB_LOCK_MINLOCKS, "db_lock_minlocks" }, + { DB_LOCK_MINWRITE, "db_lock_minwrite" }, + { DB_LOCK_OLDEST, "db_lock_oldest" }, + { DB_LOCK_RANDOM, "db_lock_random" }, + { DB_LOCK_YOUNGEST, "db_lock_youngest" }, + { 0, NULL } +}; + +static const FN config_set_open_flags[] = { + { DB_INIT_REP, "db_init_rep" }, + { DB_PRIVATE, "db_private" }, + { DB_REGISTER, "db_register" }, + { DB_THREAD, "db_thread" }, + { 0, NULL } +}; + +static const FN config_set_verbose[] = { + { DB_VERB_BACKUP, "db_verb_backup" }, + { DB_VERB_DEADLOCK, "db_verb_deadlock" }, + { DB_VERB_FILEOPS, "db_verb_fileops" }, + { DB_VERB_FILEOPS_ALL, "db_verb_fileops_all" }, + { DB_VERB_RECOVERY, "db_verb_recovery" }, + { DB_VERB_REGISTER, "db_verb_register" }, + { DB_VERB_REPLICATION, "db_verb_replication" }, + { DB_VERB_REP_ELECT, "db_verb_rep_elect" }, + { DB_VERB_REP_LEASE, "db_verb_rep_lease" }, + { DB_VERB_REP_MISC, "db_verb_rep_misc" }, + { DB_VERB_REP_MSGS, "db_verb_rep_msgs" }, + { DB_VERB_REP_SYNC, "db_verb_rep_sync" }, + { DB_VERB_REP_SYSTEM, "db_verb_rep_system" }, + { DB_VERB_REP_TEST, "db_verb_rep_test" }, + { DB_VERB_REPMGR_CONNFAIL, "db_verb_repmgr_connfail" }, + { DB_VERB_REPMGR_MISC, "db_verb_repmgr_misc" }, + { DB_VERB_WAITSFOR, "db_verb_waitsfor" }, + { 0, NULL} +}; + +static int __config_parse __P((ENV *, char *, int)); +static int __config_scan __P((char *, char **, const CFG_DESC **)); +static int cmp_cfg_name __P((const void *, const void *element)); + +/* + * __env_read_db_config -- + * Read the DB_CONFIG file. + * + * PUBLIC: int __env_read_db_config __P((ENV *)); + */ +int +__env_read_db_config(env) + ENV *env; +{ + FILE *fp; + int lc, ret; + char *p, buf[256]; + + /* Parse the config file. 
*/ + p = NULL; + if ((ret = __db_appname(env, + DB_APP_NONE, "DB_CONFIG", NULL, &p)) != 0) + return (ret); + if (p == NULL) + fp = NULL; + else { + fp = fopen(p, "r"); + __os_free(env, p); + } + + if (fp == NULL) + return (0); + + for (lc = 1; fgets(buf, sizeof(buf), fp) != NULL; ++lc) { + if ((p = strchr(buf, '\n')) == NULL) + p = buf + strlen(buf); + if (p > buf && p[-1] == '\r') + --p; + *p = '\0'; + for (p = buf; *p != '\0' && isspace((int)*p); ++p) + ; + if (*p == '\0' || *p == '#') + continue; + + if ((ret = __config_parse(env, p, lc)) != 0) + break; + } + (void)fclose(fp); + + return (ret); +} + +#undef CFG_GET_INT +#define CFG_GET_INT(s, vp) do { \ + int __ret; \ + if ((__ret = \ + __db_getlong(env->dbenv, NULL, s, 0, INT_MAX, vp)) != 0) \ + return (__ret); \ +} while (0) +#undef CFG_GET_LONG +#define CFG_GET_LONG(s, vp) do { \ + int __ret; \ + if ((__ret = \ + __db_getlong(env->dbenv, NULL, s, 0, LONG_MAX, vp)) != 0) \ + return (__ret); \ +} while (0) +#undef CFG_GET_UINT +#define CFG_GET_UINT(s, vp) do { \ + int __ret; \ + if ((__ret = \ + __db_getulong(env->dbenv, NULL, s, 0, UINT_MAX, vp)) != 0) \ + return (__ret); \ +} while (0) +#undef CFG_GET_UINT32 +#define CFG_GET_UINT32(s, vp) do { \ + if (__db_getulong(env->dbenv, NULL, s, 0, UINT32_MAX, vp) != 0) \ + return (EINVAL); \ +} while (0) + +/* This is the maximum number of tokens in a DB_CONFIG line. */ +#undef CFG_SLOTS +#define CFG_SLOTS 10 + +/* + * __config_parse -- + * Parse a single NAME VALUE pair. + */ +static int +__config_parse(env, s, lc) + ENV *env; + char *s; + int lc; +{ + DB_ENV *dbenv; + DB_SITE *site; + u_long uv1, uv2; + long lv1, lv2; + u_int port; + int i, nf, onoff, bad, ret, t_ret; + char *argv[CFG_SLOTS]; + const CFG_DESC *desc; + + bad = 0; + dbenv = env->dbenv; + + /* + * Split the input line in 's' into its argv-like components, returning + * the number of fields. If the command is one of the "simple" ones in + * config_descs, also return its command descriptor. 
+ */ + if ((nf = __config_scan(s, argv, &desc)) < 2) { +format: __db_errx(env, DB_STR_A("1584", + "line %d: %s: incorrect name-value pair", "%d %s"), + lc, argv[0]); + return (EINVAL); + } + + /* Handle simple configuration lines here. */ + if (desc != NULL) { + ret = 0; + switch (desc->type) { + case CFG_INT: /* <command> <int> */ + if (nf != 2) + goto format; + CFG_GET_INT(argv[1], &lv1); + ret = ((CFG_FUNC_INT)desc->func)(dbenv, (int) lv1); + break; + + case CFG_LONG: /* <command> <long int> */ + if (nf != 2) + goto format; + CFG_GET_LONG(argv[1], &lv1); + ret = ((CFG_FUNC_LONG)desc->func)(dbenv, lv1); + break; + + case CFG_UINT: /* <command> <uint> */ + if (nf != 2) + goto format; + CFG_GET_UINT(argv[1], &uv1); + ret = ((CFG_FUNC_UINT)desc->func) + (dbenv, (u_int32_t) uv1); + break; + + case CFG_2INT: /* <command> <int1> <int2> */ + if (nf != 3) + goto format; + CFG_GET_INT(argv[1], &lv1); + CFG_GET_INT(argv[2], &lv2); + ret = ((CFG_FUNC_2INT)desc->func) + (dbenv, (int) lv1, (int) lv2); + break; + + case CFG_2UINT: /* <command> <uint1> <uint2> */ + if (nf != 3) + goto format; + CFG_GET_UINT(argv[1], &uv1); + CFG_GET_UINT(argv[2], &uv2); + ret = ((CFG_FUNC_2UINT)desc->func) + (dbenv, (u_int32_t) uv1, (u_int32_t) uv2); + break; + + case CFG_STRING: /* <command> <rest of line as string> */ + ret = ((CFG_FUNC_STRING) desc->func)(dbenv, argv[1]); + break; + } + return (ret); + } + + /* + * The commands not covered in config_descs are handled below, each + * with their own command-specific block of code. Most of them are + * fairly similar to each other, but not quite enough to warrant + * that they all be table-driven too. 
+ */ + + /* set_memory_init db_mem_XXX <unsigned> */ + if (strcasecmp(argv[0], "set_memory_init") == 0) { + if (nf != 3) + goto format; + if ((lv1 = __db_name_to_val(config_mem_init, argv[1])) == -1) + goto format; + CFG_GET_UINT32(argv[2], &uv2); + return (__env_set_memory_init(dbenv, + (DB_MEM_CONFIG) lv1, (u_int32_t)uv2)); + } + + /* rep_set_config { db_rep_conf_XXX | db_repmgr_conf_XXX } [on|off] */ + if (strcasecmp(argv[0], "rep_set_config") == 0) { + if (nf != 2 && nf != 3) + goto format; + onoff = 1; + if (nf == 3) { + if (strcasecmp(argv[2], "off") == 0) + onoff = 0; + else if (strcasecmp(argv[2], "on") != 0) + goto format; + } + if ((lv1 = __db_name_to_val(config_rep_config, argv[1])) == -1) + goto format; + return (__rep_set_config(dbenv, (u_int32_t)lv1, onoff)); + } + + /* rep_set_timeout db_rep_XXX <unsigned> */ + if (strcasecmp(argv[0], "rep_set_timeout") == 0) { + if (nf != 3) + goto format; + if ((lv1 = __db_name_to_val(config_rep_timeout, argv[1])) == -1) + goto format; + CFG_GET_UINT32(argv[2], &uv2); + return (__rep_set_timeout(dbenv, lv1, (db_timeout_t)uv2)); + } + + /* repmgr_set_ack_policy db_repmgr_acks_XXX */ + if (strcasecmp(argv[0], "repmgr_set_ack_policy") == 0) { + if (nf != 2) + goto format; + if ((lv1 = + __db_name_to_val(config_repmgr_ack_policy, argv[1])) == -1) + goto format; + return (__repmgr_set_ack_policy(dbenv, lv1)); + } + + /* + * Configure name/value pairs of config information for a site (local or + * remote). + * + * repmgr_site host port [which value(on | off | unsigned)}] ... 
+ */ + if (strcasecmp(argv[0], "repmgr_site") == 0) { + if (nf < 3 || (nf % 2) == 0) + goto format; + CFG_GET_UINT(argv[2], &uv2); + port = (u_int)uv2; + + if ((ret = __repmgr_site(dbenv, argv[1], port, &site, 0)) != 0) + return (ret); +#ifdef HAVE_REPLICATION_THREADS + for (i = 3; i < nf; i += 2) { + if ((lv1 = __db_name_to_val( + config_repmgr_site, argv[i])) == -1) { + bad = 1; + break; + } + + if (strcasecmp(argv[i + 1], "on") == 0) + uv2 = 1; + else if (strcasecmp(argv[i + 1], "off") == 0) + uv2 = 0; + else + CFG_GET_UINT32(argv[i + 1], &uv2); + if ((ret = __repmgr_site_config(site, + (u_int32_t)lv1, (u_int32_t)uv2)) != 0) + break; + } + if ((t_ret = __repmgr_site_close(site)) != 0 && ret == 0) + ret = t_ret; + if (bad) + goto format; +#else + /* If repmgr not built, __repmgr_site() returns DB_OPNOTSUP. */ + COMPQUIET(i, 0); + COMPQUIET(t_ret, 0); + DB_ASSERT(env, 0); +#endif + return (ret); + } + + /* set_cachesize <unsigned gbytes> <unsigned bytes> <int ncaches> */ + if (strcasecmp(argv[0], "set_cachesize") == 0) { + if (nf != 4) + goto format; + CFG_GET_UINT32(argv[1], &uv1); + CFG_GET_UINT32(argv[2], &uv2); + CFG_GET_INT(argv[3], &lv1); + return (__memp_set_cachesize( + dbenv, (u_int32_t)uv1, (u_int32_t)uv2, (int)lv1)); + } + + /* set_intermediate_dir <integer dir permission> */ + if (strcasecmp(argv[0], "set_intermediate_dir") == 0) { + if (nf != 2) + goto format; + CFG_GET_INT(argv[1], &lv1); + if (lv1 <= 0) + goto format; + env->dir_mode = (int)lv1; + return (0); + } + + /* set_flags <env or log flag name> [on | off] */ + if (strcasecmp(argv[0], "set_flags") == 0) { + if (nf != 2 && nf != 3) + goto format; + onoff = 1; + if (nf == 3) { + if (strcasecmp(argv[2], "off") == 0) + onoff = 0; + else if (strcasecmp(argv[2], "on") != 0) + goto format; + } + /* First see whether it is an env flag, then a log flag. 
*/ + if ((lv1 = __db_name_to_val(config_set_flags, argv[1])) != -1) + return (__env_set_flags(dbenv, (u_int32_t)lv1, onoff)); + else if ((lv1 = + __db_name_to_val(config_set_flags_forlog, argv[1])) != -1) + return (__log_set_config(dbenv, (u_int32_t)lv1, onoff)); + goto format; + } + + /* log_set_config <log flag name> [on | off] */ + if (strcasecmp(argv[0], "log_set_config") == 0) { + if (nf != 2 && nf != 3) + goto format; + onoff = 1; + if (nf == 3) { + if (strcasecmp(argv[2], "off") == 0) + onoff = 0; + else if (strcasecmp(argv[2], "on") != 0) + goto format; + } + if ((lv1 = + __db_name_to_val(config_log_set_config, argv[1])) == -1) + goto format; + return (__log_set_config(dbenv, (u_int32_t)lv1, onoff)); + } + + /* set_lk_detect db_lock_xxx */ + if (strcasecmp(argv[0], "set_lk_detect") == 0) { + if (nf != 2) + goto format; + if ((lv1 = + __db_name_to_val(config_set_lk_detect, argv[1])) == -1) + goto format; + return (__lock_set_lk_detect(dbenv, (u_int32_t)lv1)); + } + + /* set_lock_timeout <unsigned lock timeout> */ + if (strcasecmp(argv[0], "set_lock_timeout") == 0) { + if (nf != 2) + goto format; + CFG_GET_UINT32(argv[1], &uv1); + return (__lock_set_env_timeout( + dbenv, (u_int32_t)uv1, DB_SET_LOCK_TIMEOUT)); + } + + /* set_open_flags <env open flag name> [on | off] */ + if (strcasecmp(argv[0], "set_open_flags") == 0) { + if (nf != 2 && nf != 3) + goto format; + onoff = 1; + if (nf == 3) { + if (strcasecmp(argv[2], "off") == 0) + onoff = 0; + else if (strcasecmp(argv[2], "on") != 0) + goto format; + } + if ((lv1 = + __db_name_to_val(config_set_open_flags, argv[1])) == -1) + goto format; + if (onoff == 1) + FLD_SET(env->open_flags, (u_int32_t)lv1); + else + FLD_CLR(env->open_flags, (u_int32_t)lv1); + return (0); + } + + /* set_region_init <0 or 1> */ + if (strcasecmp(argv[0], "set_region_init") == 0) { + if (nf != 2) + goto format; + CFG_GET_INT(argv[1], &lv1); + if (lv1 != 0 && lv1 != 1) + goto format; + return (__env_set_flags( + dbenv, DB_REGION_INIT, lv1 
== 0 ? 0 : 1)); + } + + /* set_reg_timeout <unsigned timeout> */ + if (strcasecmp(argv[0], "set_reg_timeout") == 0) { + if (nf != 2) + goto format; + CFG_GET_UINT32(argv[1], &uv1); + return (__env_set_timeout( + dbenv, (u_int32_t)uv1, DB_SET_REG_TIMEOUT)); + } + + /* set_txn_timeout <unsigned timeout> */ + if (strcasecmp(argv[0], "set_txn_timeout") == 0) { + if (nf != 2) + goto format; + CFG_GET_UINT32(argv[1], &uv1); + return (__lock_set_env_timeout( + dbenv, (u_int32_t)uv1, DB_SET_TXN_TIMEOUT)); + } + + /* set_verbose db_verb_XXX [on | off] */ + if (strcasecmp(argv[0], "set_verbose") == 0) { + if (nf != 2 && nf != 3) + goto format; + onoff = 1; + if (nf == 3) { + if (strcasecmp(argv[2], "off") == 0) + onoff = 0; + else if (strcasecmp(argv[2], "on") != 0) + goto format; + } + if ((lv1 = __db_name_to_val(config_set_verbose, argv[1])) == -1) + goto format; + return (__env_set_verbose(dbenv, (u_int32_t)lv1, onoff)); + } + + __db_errx(env, + DB_STR_A("1585", "unrecognized name-value pair: %s", "%s"), s); + return (EINVAL); +} + +/* cmp_cfg_name -- + * Bsearch comparison function for CFG_DESC.name, for looking up + * the names of simple commmands. + */ +static int +cmp_cfg_name(sought, element) + const void *sought; + const void *element; +{ + return + (strcmp((const char *) sought, ((const CFG_DESC *) element)->name)); +} + +/* + * __config_scan -- + * Split DB_CONFIG lines into fields. Usually each whitespace separated + * field is scanned as a distinct argument. However, if the command is + * recognized as one needing a single string value, then the rest of the + * line is returned as the one argument. That supports strings which + * contain whitespaces, such as some directory paths. + * + * This returns the number of fields. It sets *descptr to the command + * descriptor (if it is recognized), or NULL. 
+ */ +static int +__config_scan(input, argv, descptr) + char *input, *argv[CFG_SLOTS]; + const CFG_DESC **descptr; +{ + size_t tablecount; + int count; + char **ap; + + tablecount = sizeof(config_descs) / sizeof(config_descs[0]); + *descptr = NULL; + for (count = 0, ap = argv; (*ap = strsep(&input, " \t\n")) != NULL;) { + /* Empty tokens are adjacent whitespaces; skip them. */ + if (**ap == '\0') + continue; + /* Accept a non-empty token as the next field. */ + count++; + ap++; + /* + * If that was the first token, look it up in the simple command + * table. If it is there and takes a single string value, then + * return the remainder of the line (after skipping over any + * leading whitespaces) without splitting it further. + */ + if (count == 1) { + *descptr = bsearch(argv[0], config_descs, + tablecount, sizeof(config_descs[0]), cmp_cfg_name); + if (*descptr != NULL && + (*descptr)->type == CFG_STRING) { + count++; + while (isspace(*input)) + input++; + *ap++ = input; + break; + } + } + /* Stop scanning if the line has too many tokens. */ + if (count >= CFG_SLOTS) + break; + } + return (count); +} diff --git a/src/env/env_failchk.c b/src/env/env_failchk.c new file mode 100644 index 00000000..05752f07 --- /dev/null +++ b/src/env/env_failchk.c @@ -0,0 +1,558 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2005, 2012 Oracle and/or its affiliates. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#ifndef HAVE_SIMPLE_THREAD_TYPE +#include "dbinc/db_page.h" +#include "dbinc/hash.h" /* Needed for call to __ham_func5. */ +#endif +#include "dbinc/lock.h" +#include "dbinc/log.h" +#include "dbinc/mp.h" +#include "dbinc/txn.h" + +static int __env_in_api __P((ENV *)); +static void __env_clear_state __P((ENV *)); + +/* + * __env_failchk_pp -- + * ENV->failchk pre/post processing. 
+ * + * PUBLIC: int __env_failchk_pp __P((DB_ENV *, u_int32_t)); + */ +int +__env_failchk_pp(dbenv, flags) + DB_ENV *dbenv; + u_int32_t flags; +{ + DB_THREAD_INFO *ip; + ENV *env; + int ret; + + env = dbenv->env; + + ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->failchk"); + + /* + * ENV->failchk requires self and is-alive functions. We + * have a default self function, but no is-alive function. + */ + if (!ALIVE_ON(env)) { + __db_errx(env, DB_STR("1503", + "DB_ENV->failchk requires DB_ENV->is_alive be configured")); + return (EINVAL); + } + + if (flags != 0) + return (__db_ferr(env, "DB_ENV->failchk", 0)); + + ENV_ENTER(env, ip); + FAILCHK_THREAD(env, ip); /* mark as failchk thread */ + ret = __env_failchk_int(dbenv); + ENV_LEAVE(env, ip); + return (ret); +} +/* + * __env_failchk_int -- + * Process the subsystem failchk routines + * + * PUBLIC: int __env_failchk_int __P((DB_ENV *)); + */ +int +__env_failchk_int(dbenv) + DB_ENV *dbenv; +{ + ENV *env; + int ret; + + env = dbenv->env; + F_SET(dbenv, DB_ENV_FAILCHK); + + /* + * We check for dead threads in the API first as this would be likely + * to hang other things we try later, like locks and transactions. + */ + if ((ret = __env_in_api(env)) != 0) + goto err; + + if (LOCKING_ON(env) && (ret = __lock_failchk(env)) != 0) + goto err; + + if (TXN_ON(env) && + ((ret = __txn_failchk(env)) != 0 || + (ret = __dbreg_failchk(env)) != 0)) + goto err; + + if ((ret = __memp_failchk(env)) != 0) + goto err; + +#ifdef HAVE_REPLICATION_THREADS + if (REP_ON(env) && (ret = __repmgr_failchk(env)) != 0) + goto err; +#endif + + /* Mark any dead blocked threads as dead. */ + __env_clear_state(env); + +#ifdef HAVE_MUTEX_SUPPORT + ret = __mut_failchk(env); +#endif + +err: F_CLR(dbenv, DB_ENV_FAILCHK); + return (ret); +} + +/* + * __env_thread_size -- + * Initial amount of memory for thread info blocks. 
+ * PUBLIC: size_t __env_thread_size __P((ENV *, size_t)); + */ +size_t +__env_thread_size(env, other_alloc) + ENV *env; + size_t other_alloc; +{ + DB_ENV *dbenv; + size_t size; + u_int32_t max; + + dbenv = env->dbenv; + size = 0; + + max = dbenv->thr_max; + if (dbenv->thr_init != 0) { + size = + dbenv->thr_init * __env_alloc_size(sizeof(DB_THREAD_INFO)); + if (max < dbenv->thr_init) + max = dbenv->thr_init; + } else if (max == 0 && ALIVE_ON(env)) { + if ((max = dbenv->tx_init) == 0) { + /* + * They want thread tracking, but don't say how much. + * Arbitrarily assume 1/10 of the remaining memory + * or at least 100. We just use this to size + * the hash table. + */ + if (dbenv->memory_max != 0) + max = (u_int32_t) + (((dbenv->memory_max - other_alloc) / 10) / + sizeof(DB_THREAD_INFO)); + if (max < 100) + max = 100; + } + } + /* + * Set the number of buckets to be 1/8th the number of + * thread control blocks. This is rather arbitrary. + */ + dbenv->thr_max = max; + if (max != 0) + size += __env_alloc_size(sizeof(DB_HASHTAB) * + __db_tablesize(max / 8)); + return (size); +} + +/* + * __env_thread_max -- + * Return the amount of extra memory to hold thread information. + * PUBLIC: size_t __env_thread_max __P((ENV *)); + */ +size_t +__env_thread_max(env) + ENV *env; +{ + DB_ENV *dbenv; + size_t size; + + dbenv = env->dbenv; + + /* + * Allocate space for thread info blocks. Max is only advisory, + * so we allocate 25% more. + */ + if (dbenv->thr_max > dbenv->thr_init) { + size = dbenv->thr_max - dbenv->thr_init; + size += size / 4; + } else { + dbenv->thr_max = dbenv->thr_init; + size = dbenv->thr_init / 4; + } + + size = size * __env_alloc_size(sizeof(DB_THREAD_INFO)); + return (size); +} + +/* + * __env_thread_init -- + * Initialize the thread control block table. 
+ * + * PUBLIC: int __env_thread_init __P((ENV *, int)); + */ +int +__env_thread_init(env, during_creation) + ENV *env; + int during_creation; +{ + DB_ENV *dbenv; + DB_HASHTAB *htab; + REGENV *renv; + REGINFO *infop; + THREAD_INFO *thread; + int ret; + + dbenv = env->dbenv; + infop = env->reginfo; + renv = infop->primary; + + if (renv->thread_off == INVALID_ROFF) { + if (dbenv->thr_max == 0) { + env->thr_hashtab = NULL; + if (ALIVE_ON(env)) { + __db_errx(env, DB_STR("1504", + "is_alive method specified but no thread region allocated")); + return (EINVAL); + } + return (0); + } + + if (!during_creation) { + __db_errx(env, DB_STR("1505", +"thread table must be allocated when the database environment is created")); + return (EINVAL); + } + + if ((ret = + __env_alloc(infop, sizeof(THREAD_INFO), &thread)) != 0) { + __db_err(env, ret, DB_STR("1506", + "unable to allocate a thread status block")); + return (ret); + } + memset(thread, 0, sizeof(*thread)); + renv->thread_off = R_OFFSET(infop, thread); + thread->thr_nbucket = __db_tablesize(dbenv->thr_max / 8); + if ((ret = __env_alloc(infop, + thread->thr_nbucket * sizeof(DB_HASHTAB), &htab)) != 0) + return (ret); + thread->thr_hashoff = R_OFFSET(infop, htab); + __db_hashinit(htab, thread->thr_nbucket); + thread->thr_max = dbenv->thr_max; + thread->thr_init = dbenv->thr_init; + } else { + thread = R_ADDR(infop, renv->thread_off); + htab = R_ADDR(infop, thread->thr_hashoff); + } + + env->thr_hashtab = htab; + env->thr_nbucket = thread->thr_nbucket; + dbenv->thr_max = thread->thr_max; + dbenv->thr_init = thread->thr_init; + return (0); +} + +/* + * __env_thread_destroy -- + * Destroy the thread control block table. 
+ * + * PUBLIC: void __env_thread_destroy __P((ENV *)); + */ +void +__env_thread_destroy(env) + ENV *env; +{ + DB_HASHTAB *htab; + DB_THREAD_INFO *ip, *np; + REGENV *renv; + REGINFO *infop; + THREAD_INFO *thread; + u_int32_t i; + + infop = env->reginfo; + renv = infop->primary; + if (renv->thread_off == INVALID_ROFF) + return; + + thread = R_ADDR(infop, renv->thread_off); + if ((htab = env->thr_hashtab) != NULL) { + for (i = 0; i < env->thr_nbucket; i++) { + ip = SH_TAILQ_FIRST(&htab[i], __db_thread_info); + for (; ip != NULL; ip = np) { + np = SH_TAILQ_NEXT(ip, + dbth_links, __db_thread_info); + __env_alloc_free(infop, ip); + } + } + __env_alloc_free(infop, htab); + } + + __env_alloc_free(infop, thread); + return; +} + +/* + * __env_in_api -- + * Look for threads which died in the api and complain. + * If no threads died but there are blocked threads unpin + * any buffers they may have locked. + */ +static int +__env_in_api(env) + ENV *env; +{ + DB_ENV *dbenv; + DB_HASHTAB *htab; + DB_THREAD_INFO *ip; + REGENV *renv; + REGINFO *infop; + THREAD_INFO *thread; + u_int32_t i; + int unpin, ret; + + if ((htab = env->thr_hashtab) == NULL) + return (EINVAL); + + dbenv = env->dbenv; + infop = env->reginfo; + renv = infop->primary; + thread = R_ADDR(infop, renv->thread_off); + unpin = 0; + + for (i = 0; i < env->thr_nbucket; i++) + SH_TAILQ_FOREACH(ip, &htab[i], dbth_links, __db_thread_info) { + if (ip->dbth_state == THREAD_SLOT_NOT_IN_USE || + (ip->dbth_state == THREAD_OUT && + thread->thr_count < thread->thr_max)) + continue; + if (dbenv->is_alive( + dbenv, ip->dbth_pid, ip->dbth_tid, 0)) + continue; + if (ip->dbth_state == THREAD_BLOCKED) { + ip->dbth_state = THREAD_BLOCKED_DEAD; + unpin = 1; + continue; + } + if (ip->dbth_state == THREAD_OUT) { + ip->dbth_state = THREAD_SLOT_NOT_IN_USE; + continue; + } + return (__db_failed(env, DB_STR("1507", + "Thread died in Berkeley DB library"), + ip->dbth_pid, ip->dbth_tid)); + } + + if (unpin == 0) + return (0); + + for (i = 0; i 
< env->thr_nbucket; i++) + SH_TAILQ_FOREACH(ip, &htab[i], dbth_links, __db_thread_info) + if (ip->dbth_state == THREAD_BLOCKED_DEAD && + (ret = __memp_unpin_buffers(env, ip)) != 0) + return (ret); + + return (0); +} + +/* + * __env_clear_state -- + * Look for threads which died while blockedi and clear them.. + */ +static void +__env_clear_state(env) + ENV *env; +{ + DB_HASHTAB *htab; + DB_THREAD_INFO *ip; + u_int32_t i; + + htab = env->thr_hashtab; + for (i = 0; i < env->thr_nbucket; i++) + SH_TAILQ_FOREACH(ip, &htab[i], dbth_links, __db_thread_info) + if (ip->dbth_state == THREAD_BLOCKED_DEAD) + ip->dbth_state = THREAD_SLOT_NOT_IN_USE; +} + +struct __db_threadid { + pid_t pid; + db_threadid_t tid; +}; + +/* + * PUBLIC: int __env_set_state __P((ENV *, DB_THREAD_INFO **, DB_THREAD_STATE)); + */ +int +__env_set_state(env, ipp, state) + ENV *env; + DB_THREAD_INFO **ipp; + DB_THREAD_STATE state; +{ + struct __db_threadid id; + DB_ENV *dbenv; + DB_HASHTAB *htab; + DB_THREAD_INFO *ip; + REGENV *renv; + REGINFO *infop; + THREAD_INFO *thread; + u_int32_t indx; + int ret; + + dbenv = env->dbenv; + htab = env->thr_hashtab; + + if (F_ISSET(dbenv, DB_ENV_NOLOCKING)) { + *ipp = NULL; + return (0); + } + dbenv->thread_id(dbenv, &id.pid, &id.tid); + + /* + * Hashing of thread ids. This is simple but could be replaced with + * something more expensive if needed. + */ +#ifdef HAVE_SIMPLE_THREAD_TYPE + /* + * A thread ID may be a pointer, so explicitly cast to a pointer of + * the appropriate size before doing the bitwise XOR. 
+ */ + indx = (u_int32_t)((uintptr_t)id.pid ^ (uintptr_t)id.tid); +#else + indx = __ham_func5(NULL, &id.tid, sizeof(id.tid)); +#endif + indx %= env->thr_nbucket; + SH_TAILQ_FOREACH(ip, &htab[indx], dbth_links, __db_thread_info) { +#ifdef HAVE_SIMPLE_THREAD_TYPE + if (id.pid == ip->dbth_pid && id.tid == ip->dbth_tid) + break; +#else + if (memcmp(&id.pid, &ip->dbth_pid, sizeof(id.pid)) != 0) + continue; +#ifdef HAVE_MUTEX_PTHREADS + if (pthread_equal(id.tid, ip->dbth_tid) == 0) +#else + if (memcmp(&id.tid, &ip->dbth_tid, sizeof(id.tid)) != 0) +#endif + continue; + break; +#endif + } + + /* + * If ipp is not null, return the thread control block if found. + * Check to ensure the thread of control has been registered. + */ + if (state == THREAD_VERIFY) { + DB_ASSERT(env, ip != NULL && ip->dbth_state != THREAD_OUT); + if (ipp != NULL) { + if (ip == NULL) /* The control block wasn't found */ + return (EINVAL); + *ipp = ip; + } + return (0); + } + + *ipp = NULL; + ret = 0; + if (ip == NULL) { + infop = env->reginfo; + renv = infop->primary; + thread = R_ADDR(infop, renv->thread_off); + MUTEX_LOCK(env, renv->mtx_regenv); + + /* + * If we are passed the specified max, try to reclaim one from + * our queue. If failcheck has marked the slot not in use, we + * can take it, otherwise we must call is_alive before freeing + * it. + */ + if (thread->thr_count >= thread->thr_max) { + SH_TAILQ_FOREACH( + ip, &htab[indx], dbth_links, __db_thread_info) + if (ip->dbth_state == THREAD_SLOT_NOT_IN_USE || + (ip->dbth_state == THREAD_OUT && + ALIVE_ON(env) && !dbenv->is_alive( + dbenv, ip->dbth_pid, ip->dbth_tid, 0))) + break; + + if (ip != NULL) { + DB_ASSERT(env, ip->dbth_pincount == 0); + goto init; + } + } + + thread->thr_count++; + if ((ret = __env_alloc(infop, + sizeof(DB_THREAD_INFO), &ip)) == 0) { + memset(ip, 0, sizeof(*ip)); + /* + * This assumes we can link atomically since we do + * no locking here. 
We never use the backpointer + * so we only need to be able to write an offset + * atomically. + */ + SH_TAILQ_INSERT_HEAD( + &htab[indx], ip, dbth_links, __db_thread_info); + ip->dbth_pincount = 0; + ip->dbth_pinmax = PINMAX; + ip->dbth_pinlist = R_OFFSET(infop, ip->dbth_pinarray); + +init: ip->dbth_pid = id.pid; + ip->dbth_tid = id.tid; + ip->dbth_state = state; + SH_TAILQ_INIT(&ip->dbth_xatxn); + } + MUTEX_UNLOCK(env, renv->mtx_regenv); + } else + ip->dbth_state = state; + *ipp = ip; + + DB_ASSERT(env, ret == 0); + if (ret != 0) + __db_errx(env, DB_STR("1508", + "Unable to allocate thread control block")); + return (ret); +} + +/* + * __env_thread_id_string -- + * Convert a thread id to a string. + * + * PUBLIC: char *__env_thread_id_string + * PUBLIC: __P((DB_ENV *, pid_t, db_threadid_t, char *)); + */ +char * +__env_thread_id_string(dbenv, pid, tid, buf) + DB_ENV *dbenv; + pid_t pid; + db_threadid_t tid; + char *buf; +{ +#ifdef HAVE_SIMPLE_THREAD_TYPE +#ifdef UINT64_FMT + char fmt[20]; + + snprintf(fmt, sizeof(fmt), "%s/%s", UINT64_FMT, UINT64_FMT); + snprintf(buf, + DB_THREADID_STRLEN, fmt, (u_int64_t)pid, (u_int64_t)(uintptr_t)tid); +#else + snprintf(buf, DB_THREADID_STRLEN, "%lu/%lu", (u_long)pid, (u_long)tid); +#endif +#else +#ifdef UINT64_FMT + char fmt[20]; + + snprintf(fmt, sizeof(fmt), "%s/TID", UINT64_FMT); + snprintf(buf, DB_THREADID_STRLEN, fmt, (u_int64_t)pid); +#else + snprintf(buf, DB_THREADID_STRLEN, "%lu/TID", (u_long)pid); +#endif +#endif + COMPQUIET(dbenv, NULL); + COMPQUIET(*(u_int8_t *)&tid, 0); + + return (buf); +} diff --git a/src/env/env_file.c b/src/env/env_file.c new file mode 100644 index 00000000..b102404d --- /dev/null +++ b/src/env/env_file.c @@ -0,0 +1,128 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 2002, 2012 Oracle and/or its affiliates. All rights reserved. 
+ * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" + +/* + * __db_file_extend -- + * Initialize a regular file by writing the last page of the file. + * + * PUBLIC: int __db_file_extend __P((ENV *, DB_FH *, size_t)); + */ +int +__db_file_extend(env, fhp, size) + ENV *env; + DB_FH *fhp; + size_t size; +{ + db_pgno_t pages; + size_t nw; + u_int32_t relative; + int ret; + char buf; + + buf = '\0'; + /* + * Extend the file by writing the last page. If the region is >4Gb, + * increment may be larger than the maximum possible seek "relative" + * argument, as it's an unsigned 32-bit value. Break the offset into + * pages of 1MB each so we don't overflow -- (2^20 * 2^32 is bigger + * than any memory I expect to see for awhile). + */ + pages = (db_pgno_t)((size - sizeof(buf)) / MEGABYTE); + relative = (u_int32_t)((size - sizeof(buf)) % MEGABYTE); + if ((ret = __os_seek(env, fhp, pages, MEGABYTE, relative)) == 0) + ret = __os_write(env, fhp, &buf, sizeof(buf), &nw); + + return (ret); +} + +/* + * __db_file_multi_write -- + * Overwrite a file with multiple passes to corrupt the data. + * + * PUBLIC: int __db_file_multi_write __P((ENV *, const char *)); + */ +int +__db_file_multi_write(env, path) + ENV *env; + const char *path; +{ + DB_FH *fhp; + u_int32_t mbytes, bytes; + int ret; + + if ((ret = __os_open(env, path, 0, DB_OSO_REGION, 0, &fhp)) == 0 && + (ret = __os_ioinfo(env, path, fhp, &mbytes, &bytes, NULL)) == 0) { + /* + * !!! + * Overwrite a regular file with alternating 0xff, 0x00 and 0xff + * byte patterns. Implies a fixed-block filesystem, journaling + * or logging filesystems will require operating system support. 
+ */ + if ((ret = + __db_file_write(env, fhp, mbytes, bytes, 255)) != 0) + goto err; + if ((ret = + __db_file_write(env, fhp, mbytes, bytes, 0)) != 0) + goto err; + if ((ret = + __db_file_write(env, fhp, mbytes, bytes, 255)) != 0) + goto err; + } else + __db_err(env, ret, "%s", path); + +err: if (fhp != NULL) + (void)__os_closehandle(env, fhp); + return (ret); +} + +/* + * __db_file_write -- + * A single pass over the file, writing the specified byte pattern. + * + * PUBLIC: int __db_file_write __P((ENV *, + * PUBLIC: DB_FH *, u_int32_t, u_int32_t, int)); + */ +int +__db_file_write(env, fhp, mbytes, bytes, pattern) + ENV *env; + DB_FH *fhp; + int pattern; + u_int32_t mbytes, bytes; +{ + size_t len, nw; + int i, ret; + char *buf; + +#undef FILE_WRITE_IO_SIZE +#define FILE_WRITE_IO_SIZE (64 * 1024) + if ((ret = __os_malloc(env, FILE_WRITE_IO_SIZE, &buf)) != 0) + return (ret); + memset(buf, pattern, FILE_WRITE_IO_SIZE); + + if ((ret = __os_seek(env, fhp, 0, 0, 0)) != 0) + goto err; + for (; mbytes > 0; --mbytes) + for (i = MEGABYTE / FILE_WRITE_IO_SIZE; i > 0; --i) + if ((ret = __os_write( + env, fhp, buf, FILE_WRITE_IO_SIZE, &nw)) != 0) + goto err; + for (; bytes > 0; bytes -= (u_int32_t)len) { + len = bytes < FILE_WRITE_IO_SIZE ? bytes : FILE_WRITE_IO_SIZE; + if ((ret = __os_write(env, fhp, buf, len, &nw)) != 0) + goto err; + } + + ret = __os_fsync(env, fhp); + +err: __os_free(env, buf); + return (ret); +} diff --git a/src/env/env_globals.c b/src/env/env_globals.c new file mode 100644 index 00000000..955e6738 --- /dev/null +++ b/src/env/env_globals.c @@ -0,0 +1,66 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" + +/* + * A structure with static initialization values for all of the global fields + * used by Berkeley DB. + * See dbinc/globals.h for the structure definition. 
+ */ +DB_GLOBALS __db_global_values = { +#ifdef HAVE_VXWORKS + 0, /* VxWorks: db_global_init */ + NULL, /* VxWorks: db_global_lock */ +#endif +#ifdef DB_WIN32 +#ifndef DB_WINCE + { 0 }, /* SECURITY_DESCRIPTOR win_default_sec_desc */ + { 0 }, /* SECURITY_ATTRIBUTES win_default_sec_attr */ +#endif + NULL, /* SECURITY_ATTRIBUTES *win_sec_attr */ +#endif + { NULL, NULL }, /* XA env list */ + + "=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=", /* db_line */ + { 0 }, /* error_buf */ + 0, /* uid_init */ + 0, /* rand_next */ + 0, /* fid_serial */ + 0, /* db_errno */ + 0, /* num_active_pids */ + 0, /* size_active_pids */ + NULL, /* active_pids */ + NULL, /* saved_errstr */ + NULL, /* j_assert */ + NULL, /* j_close */ + NULL, /* j_dirfree */ + NULL, /* j_dirlist */ + NULL, /* j_exists*/ + NULL, /* j_free */ + NULL, /* j_fsync */ + NULL, /* j_ftruncate */ + NULL, /* j_ioinfo */ + NULL, /* j_malloc */ + NULL, /* j_file_map */ + NULL, /* j_file_unmap */ + NULL, /* j_open */ + NULL, /* j_pread */ + NULL, /* j_pwrite */ + NULL, /* j_read */ + NULL, /* j_realloc */ + NULL, /* j_region_map */ + NULL, /* j_region_unmap */ + NULL, /* j_rename */ + NULL, /* j_seek */ + NULL, /* j_unlink */ + NULL, /* j_write */ + NULL /* j_yield */ +}; diff --git a/src/env/env_method.c b/src/env/env_method.c new file mode 100644 index 00000000..63deacea --- /dev/null +++ b/src/env/env_method.c @@ -0,0 +1,1918 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1999, 2012 Oracle and/or its affiliates. All rights reserved. 
+ * + * $Id: env_method.c,v dabaaeb7d839 2010/08/03 17:28:53 mike $ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/crypto.h" +#include "dbinc/hmac.h" +#include "dbinc/db_page.h" +#include "dbinc/db_am.h" +#include "dbinc/lock.h" +#include "dbinc/mp.h" +#include "dbinc/txn.h" + +static int __db_env_init __P((DB_ENV *)); +static void __env_err __P((const DB_ENV *, int, const char *, ...)); +static void __env_errx __P((const DB_ENV *, const char *, ...)); +static int __env_get_create_dir __P((DB_ENV *, const char **)); +static int __env_get_data_dirs __P((DB_ENV *, const char ***)); +static int __env_get_data_len __P((DB_ENV *, u_int32_t *)); +static int __env_get_flags __P((DB_ENV *, u_int32_t *)); +static int __env_get_home __P((DB_ENV *, const char **)); +static int __env_get_intermediate_dir_mode __P((DB_ENV *, const char **)); +static int __env_get_metadata_dir __P((DB_ENV *, const char **)); +static int __env_get_shm_key __P((DB_ENV *, long *)); +static int __env_get_thread_count __P((DB_ENV *, u_int32_t *)); +static int __env_get_thread_id_fn __P((DB_ENV *, + void (**)(DB_ENV *, pid_t *, db_threadid_t *))); +static int __env_get_thread_id_string_fn __P((DB_ENV *, + char * (**)(DB_ENV *, pid_t, db_threadid_t, char *))); +static int __env_get_timeout __P((DB_ENV *, db_timeout_t *, u_int32_t)); +static int __env_get_tmp_dir __P((DB_ENV *, const char **)); +static int __env_get_verbose __P((DB_ENV *, u_int32_t, int *)); +static int __env_get_app_dispatch + __P((DB_ENV *, int (**)(DB_ENV *, DBT *, DB_LSN *, db_recops))); +static int __env_set_app_dispatch + __P((DB_ENV *, int (*)(DB_ENV *, DBT *, DB_LSN *, db_recops))); +static int __env_set_event_notify + __P((DB_ENV *, void (*)(DB_ENV *, u_int32_t, void *))); +static int __env_get_feedback __P((DB_ENV *, void (**)(DB_ENV *, int, int))); +static int __env_set_feedback __P((DB_ENV *, void (*)(DB_ENV *, int, int))); +static int __env_get_isalive __P((DB_ENV *, + int (**)(DB_ENV *, pid_t, 
db_threadid_t, u_int32_t))); +static int __env_set_isalive __P((DB_ENV *, + int (*)(DB_ENV *, pid_t, db_threadid_t, u_int32_t))); +static int __env_set_thread_id __P((DB_ENV *, void (*)(DB_ENV *, + pid_t *, db_threadid_t *))); +static int __env_set_thread_id_string __P((DB_ENV *, + char * (*)(DB_ENV *, pid_t, db_threadid_t, char *))); + +/* + * db_env_create -- + * DB_ENV constructor. + * + * EXTERN: int db_env_create __P((DB_ENV **, u_int32_t)); + */ +int +db_env_create(dbenvpp, flags) + DB_ENV **dbenvpp; + u_int32_t flags; +{ + DB_ENV *dbenv; + ENV *env; + int ret; + + /* + * !!! + * Our caller has not yet had the opportunity to reset the panic + * state or turn off mutex locking, and so we can neither check + * the panic state or acquire a mutex in the DB_ENV create path. + * + * !!! + * We can't call the flags-checking routines, we don't have an + * environment yet. + */ + if (flags != 0) + return (EINVAL); + + /* Allocate the DB_ENV and ENV structures -- we always have both. */ + if ((ret = __os_calloc(NULL, 1, sizeof(DB_ENV), &dbenv)) != 0) + return (ret); + if ((ret = __os_calloc(NULL, 1, sizeof(ENV), &env)) != 0) + goto err; + dbenv->env = env; + env->dbenv = dbenv; + + if ((ret = __db_env_init(dbenv)) != 0 || + (ret = __lock_env_create(dbenv)) != 0 || + (ret = __log_env_create(dbenv)) != 0 || + (ret = __memp_env_create(dbenv)) != 0 || +#ifdef HAVE_REPLICATION + (ret = __rep_env_create(dbenv)) != 0 || +#endif + (ret = __txn_env_create(dbenv))) + goto err; + + *dbenvpp = dbenv; + return (0); + +err: __db_env_destroy(dbenv); + return (ret); +} + +/* + * __db_env_destroy -- + * DB_ENV destructor. + * + * PUBLIC: void __db_env_destroy __P((DB_ENV *)); + */ +void +__db_env_destroy(dbenv) + DB_ENV *dbenv; +{ + __lock_env_destroy(dbenv); + __log_env_destroy(dbenv); + __memp_env_destroy(dbenv); +#ifdef HAVE_REPLICATION + __rep_env_destroy(dbenv); +#endif + __txn_env_destroy(dbenv); + + /* + * Discard the underlying ENV structure. 
+ * + * XXX + * This is wrong, but can't be fixed until we finish the work of + * splitting up the DB_ENV and ENV structures so that we don't + * touch anything in the ENV as part of the above calls to subsystem + * DB_ENV cleanup routines. + */ + memset(dbenv->env, CLEAR_BYTE, sizeof(ENV)); + __os_free(NULL, dbenv->env); + + memset(dbenv, CLEAR_BYTE, sizeof(DB_ENV)); + __os_free(NULL, dbenv); +} + +/* + * __db_env_init -- + * Initialize a DB_ENV structure. + */ +static int +__db_env_init(dbenv) + DB_ENV *dbenv; +{ + ENV *env; + /* + * !!! + * Our caller has not yet had the opportunity to reset the panic + * state or turn off mutex locking, and so we can neither check + * the panic state or acquire a mutex in the DB_ENV create path. + * + * Initialize the method handles. + */ + /* DB_ENV PUBLIC HANDLE LIST BEGIN */ + dbenv->add_data_dir = __env_add_data_dir; + dbenv->backup = __db_backup; + dbenv->dbbackup = __db_dbbackup_pp; + dbenv->cdsgroup_begin = __cdsgroup_begin_pp; + dbenv->close = __env_close_pp; + dbenv->dbremove = __env_dbremove_pp; + dbenv->dbrename = __env_dbrename_pp; + dbenv->err = __env_err; + dbenv->errx = __env_errx; + dbenv->failchk = __env_failchk_pp; + dbenv->fileid_reset = __env_fileid_reset_pp; + dbenv->get_alloc = __env_get_alloc; + dbenv->get_app_dispatch = __env_get_app_dispatch; + dbenv->get_cache_max = __memp_get_cache_max; + dbenv->get_cachesize = __memp_get_cachesize; + dbenv->get_backup_callbacks = __env_get_backup_callbacks; + dbenv->get_backup_config = __env_get_backup_config; + dbenv->get_create_dir = __env_get_create_dir; + dbenv->get_data_dirs = __env_get_data_dirs; + dbenv->get_data_len = __env_get_data_len; + dbenv->get_encrypt_flags = __env_get_encrypt_flags; + dbenv->get_errcall = __env_get_errcall; + dbenv->get_errfile = __env_get_errfile; + dbenv->get_errpfx = __env_get_errpfx; + dbenv->get_feedback = __env_get_feedback; + dbenv->get_flags = __env_get_flags; + dbenv->get_home = __env_get_home; + 
dbenv->get_intermediate_dir_mode = __env_get_intermediate_dir_mode; + dbenv->get_isalive = __env_get_isalive; + dbenv->get_lg_bsize = __log_get_lg_bsize; + dbenv->get_lg_dir = __log_get_lg_dir; + dbenv->get_lg_filemode = __log_get_lg_filemode; + dbenv->get_lg_max = __log_get_lg_max; + dbenv->get_lg_regionmax = __log_get_lg_regionmax; + dbenv->get_lk_conflicts = __lock_get_lk_conflicts; + dbenv->get_lk_detect = __lock_get_lk_detect; + dbenv->get_lk_max_lockers = __lock_get_lk_max_lockers; + dbenv->get_lk_max_locks = __lock_get_lk_max_locks; + dbenv->get_lk_max_objects = __lock_get_lk_max_objects; + dbenv->get_lk_partitions = __lock_get_lk_partitions; + dbenv->get_lk_priority = __lock_get_lk_priority; + dbenv->get_lk_tablesize = __lock_get_lk_tablesize; + dbenv->get_memory_init = __env_get_memory_init; + dbenv->get_memory_max = __env_get_memory_max; + dbenv->get_metadata_dir = __env_get_metadata_dir; + dbenv->get_mp_max_openfd = __memp_get_mp_max_openfd; + dbenv->get_mp_max_write = __memp_get_mp_max_write; + dbenv->get_mp_mmapsize = __memp_get_mp_mmapsize; + dbenv->get_mp_mtxcount = __memp_get_mp_mtxcount; + dbenv->get_mp_pagesize = __memp_get_mp_pagesize; + dbenv->get_mp_tablesize = __memp_get_mp_tablesize; + dbenv->get_msgcall = __env_get_msgcall; + dbenv->get_msgfile = __env_get_msgfile; + dbenv->get_open_flags = __env_get_open_flags; + dbenv->get_shm_key = __env_get_shm_key; + dbenv->get_thread_count = __env_get_thread_count; + dbenv->get_thread_id_fn = __env_get_thread_id_fn; + dbenv->get_thread_id_string_fn = __env_get_thread_id_string_fn; + dbenv->get_timeout = __env_get_timeout; + dbenv->get_tmp_dir = __env_get_tmp_dir; + dbenv->get_tx_max = __txn_get_tx_max; + dbenv->get_tx_timestamp = __txn_get_tx_timestamp; + dbenv->get_verbose = __env_get_verbose; + dbenv->is_bigendian = __db_isbigendian; + dbenv->lock_detect = __lock_detect_pp; + dbenv->lock_get = __lock_get_pp; + dbenv->lock_id = __lock_id_pp; + dbenv->lock_id_free = __lock_id_free_pp; + dbenv->lock_put 
= __lock_put_pp; + dbenv->lock_stat = __lock_stat_pp; + dbenv->lock_stat_print = __lock_stat_print_pp; + dbenv->lock_vec = __lock_vec_pp; + dbenv->log_archive = __log_archive_pp; + dbenv->log_cursor = __log_cursor_pp; + dbenv->log_file = __log_file_pp; + dbenv->log_flush = __log_flush_pp; + dbenv->log_get_config = __log_get_config; + dbenv->log_printf = __log_printf_capi; + dbenv->log_put = __log_put_pp; + dbenv->log_put_record = __log_put_record_pp; + dbenv->log_read_record = __log_read_record_pp; + dbenv->log_set_config = __log_set_config; + dbenv->log_stat = __log_stat_pp; + dbenv->log_stat_print = __log_stat_print_pp; + dbenv->log_verify = __log_verify_pp; + dbenv->lsn_reset = __env_lsn_reset_pp; + dbenv->memp_fcreate = __memp_fcreate_pp; + dbenv->memp_register = __memp_register_pp; + dbenv->memp_stat = __memp_stat_pp; + dbenv->memp_stat_print = __memp_stat_print_pp; + dbenv->memp_sync = __memp_sync_pp; + dbenv->memp_trickle = __memp_trickle_pp; + dbenv->mutex_alloc = __mutex_alloc_pp; + dbenv->mutex_free = __mutex_free_pp; + dbenv->mutex_get_align = __mutex_get_align; + dbenv->mutex_get_increment = __mutex_get_increment; + dbenv->mutex_get_init = __mutex_get_init; + dbenv->mutex_get_max = __mutex_get_max; + dbenv->mutex_get_tas_spins = __mutex_get_tas_spins; + dbenv->mutex_lock = __mutex_lock_pp; + dbenv->mutex_set_align = __mutex_set_align; + dbenv->mutex_set_increment = __mutex_set_increment; + dbenv->mutex_set_init = __mutex_set_init; + dbenv->mutex_set_max = __mutex_set_max; + dbenv->mutex_set_tas_spins = __mutex_set_tas_spins; + dbenv->mutex_stat = __mutex_stat_pp; + dbenv->mutex_stat_print = __mutex_stat_print_pp; + dbenv->mutex_unlock = __mutex_unlock_pp; + dbenv->open = __env_open_pp; + dbenv->remove = __env_remove; + dbenv->rep_elect = __rep_elect_pp; + dbenv->rep_flush = __rep_flush; + dbenv->rep_get_clockskew = __rep_get_clockskew; + dbenv->rep_get_config = __rep_get_config; + dbenv->rep_get_limit = __rep_get_limit; + dbenv->rep_get_nsites = 
__rep_get_nsites; + dbenv->rep_get_priority = __rep_get_priority; + dbenv->rep_get_request = __rep_get_request; + dbenv->rep_get_timeout = __rep_get_timeout; + dbenv->rep_process_message = __rep_process_message_pp; + dbenv->rep_set_clockskew = __rep_set_clockskew; + dbenv->rep_set_config = __rep_set_config; + dbenv->rep_set_limit = __rep_set_limit; + dbenv->rep_set_nsites = __rep_set_nsites_pp; + dbenv->rep_set_priority = __rep_set_priority; + dbenv->rep_set_request = __rep_set_request; + dbenv->rep_set_timeout = __rep_set_timeout; + dbenv->rep_set_transport = __rep_set_transport_pp; + dbenv->rep_start = __rep_start_pp; + dbenv->rep_stat = __rep_stat_pp; + dbenv->rep_stat_print = __rep_stat_print_pp; + dbenv->rep_sync = __rep_sync; + dbenv->repmgr_channel = __repmgr_channel; + dbenv->repmgr_get_ack_policy = __repmgr_get_ack_policy; + dbenv->repmgr_local_site = __repmgr_local_site; + dbenv->repmgr_msg_dispatch = __repmgr_set_msg_dispatch; + dbenv->repmgr_set_ack_policy = __repmgr_set_ack_policy; + dbenv->repmgr_site = __repmgr_site; + dbenv->repmgr_site_by_eid = __repmgr_site_by_eid; + dbenv->repmgr_site_list = __repmgr_site_list; + dbenv->repmgr_start = __repmgr_start; + dbenv->repmgr_stat = __repmgr_stat_pp; + dbenv->repmgr_stat_print = __repmgr_stat_print_pp; + dbenv->set_alloc = __env_set_alloc; + dbenv->set_app_dispatch = __env_set_app_dispatch; + dbenv->set_backup_callbacks = __env_set_backup_callbacks; + dbenv->set_backup_config = __env_set_backup_config; + dbenv->set_cache_max = __memp_set_cache_max; + dbenv->set_cachesize = __memp_set_cachesize; + dbenv->set_create_dir = __env_set_create_dir; + dbenv->set_data_dir = __env_set_data_dir; + dbenv->set_data_len = __env_set_data_len; + dbenv->set_encrypt = __env_set_encrypt; + dbenv->set_errcall = __env_set_errcall; + dbenv->set_errfile = __env_set_errfile; + dbenv->set_errpfx = __env_set_errpfx; + dbenv->set_event_notify = __env_set_event_notify; + dbenv->set_feedback = __env_set_feedback; + dbenv->set_flags = 
__env_set_flags; + dbenv->set_intermediate_dir_mode = __env_set_intermediate_dir_mode; + dbenv->set_isalive = __env_set_isalive; + dbenv->set_lg_bsize = __log_set_lg_bsize; + dbenv->set_lg_dir = __log_set_lg_dir; + dbenv->set_lg_filemode = __log_set_lg_filemode; + dbenv->set_lg_max = __log_set_lg_max; + dbenv->set_lg_regionmax = __log_set_lg_regionmax; + dbenv->set_lk_conflicts = __lock_set_lk_conflicts; + dbenv->set_lk_detect = __lock_set_lk_detect; + dbenv->set_lk_max_lockers = __lock_set_lk_max_lockers; + dbenv->set_lk_max_locks = __lock_set_lk_max_locks; + dbenv->set_lk_max_objects = __lock_set_lk_max_objects; + dbenv->set_lk_partitions = __lock_set_lk_partitions; + dbenv->set_lk_priority = __lock_set_lk_priority; + dbenv->set_lk_tablesize = __lock_set_lk_tablesize; + dbenv->set_memory_init = __env_set_memory_init; + dbenv->set_memory_max = __env_set_memory_max; + dbenv->set_metadata_dir = __env_set_metadata_dir; + dbenv->set_mp_max_openfd = __memp_set_mp_max_openfd; + dbenv->set_mp_max_write = __memp_set_mp_max_write; + dbenv->set_mp_mmapsize = __memp_set_mp_mmapsize; + dbenv->set_mp_mtxcount = __memp_set_mp_mtxcount; + dbenv->set_mp_pagesize = __memp_set_mp_pagesize; + dbenv->set_mp_tablesize = __memp_set_mp_tablesize; + dbenv->set_msgcall = __env_set_msgcall; + dbenv->set_msgfile = __env_set_msgfile; + dbenv->set_paniccall = __env_set_paniccall; + dbenv->set_shm_key = __env_set_shm_key; + dbenv->set_thread_count = __env_set_thread_count; + dbenv->set_thread_id = __env_set_thread_id; + dbenv->set_thread_id_string = __env_set_thread_id_string; + dbenv->set_timeout = __env_set_timeout; + dbenv->set_tmp_dir = __env_set_tmp_dir; + dbenv->set_tx_max = __txn_set_tx_max; + dbenv->set_tx_timestamp = __txn_set_tx_timestamp; + dbenv->set_verbose = __env_set_verbose; + dbenv->stat_print = __env_stat_print_pp; + dbenv->txn_applied = __txn_applied_pp; + dbenv->txn_begin = __txn_begin_pp; + dbenv->txn_checkpoint = __txn_checkpoint_pp; + dbenv->txn_recover = 
/*
 * __env_err --
 *	DbEnv.err method: report a message together with the string for the
 *	given error number, via the configured error callback/file (stderr
 *	by default).
 */
static void
#ifdef STDC_HEADERS
__env_err(const DB_ENV *dbenv, int error, const char *fmt, ...)
#else
__env_err(dbenv, error, fmt, va_alist)
	const DB_ENV *dbenv;
	int error;
	const char *fmt;
	va_dcl
#endif
{
	/* Message with error string, to stderr by default. */
	DB_REAL_ERR(dbenv, error, DB_ERROR_SET, 1, fmt);
}

/*
 * __env_errx --
 *	DbEnv.errx method: like __env_err but with no error number, so no
 *	error string is appended to the message.
 */
static void
#ifdef STDC_HEADERS
__env_errx(const DB_ENV *dbenv, const char *fmt, ...)
#else
__env_errx(dbenv, fmt, va_alist)
	const DB_ENV *dbenv;
	const char *fmt;
	va_dcl
#endif
{
	/* Message without error string, to stderr by default. */
	DB_REAL_ERR(dbenv, 0, DB_ERROR_NOT_SET, 1, fmt);
}

/*
 * __env_get_home --
 *	DB_ENV->get_home: return the environment's home directory.  Only
 *	legal after DB_ENV->open, since that is when db_home is resolved.
 */
static int
__env_get_home(dbenv, homep)
	DB_ENV *dbenv;
	const char **homep;
{
	ENV *env;

	env = dbenv->env;

	ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->get_home");
	*homep = env->db_home;

	return (0);
}
+ * + * PUBLIC: int __env_get_alloc __P((DB_ENV *, void *(**)(size_t), + * PUBLIC: void *(**)(void *, size_t), void (**)(void *))); + */ +int +__env_get_alloc(dbenv, mal_funcp, real_funcp, free_funcp) + DB_ENV *dbenv; + void *(**mal_funcp) __P((size_t)); + void *(**real_funcp) __P((void *, size_t)); + void (**free_funcp) __P((void *)); +{ + + if (mal_funcp != NULL) + *mal_funcp = dbenv->db_malloc; + if (real_funcp != NULL) + *real_funcp = dbenv->db_realloc; + if (free_funcp != NULL) + *free_funcp = dbenv->db_free; + return (0); +} + +/* + * __env_set_alloc -- + * {DB_ENV,DB}->set_alloc. + * + * PUBLIC: int __env_set_alloc __P((DB_ENV *, void *(*)(size_t), + * PUBLIC: void *(*)(void *, size_t), void (*)(void *))); + */ +int +__env_set_alloc(dbenv, mal_func, real_func, free_func) + DB_ENV *dbenv; + void *(*mal_func) __P((size_t)); + void *(*real_func) __P((void *, size_t)); + void (*free_func) __P((void *)); +{ + ENV *env; + + env = dbenv->env; + + ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_alloc"); + + dbenv->db_malloc = mal_func; + dbenv->db_realloc = real_func; + dbenv->db_free = free_func; + return (0); +} +/* + * __env_get_memory_init -- + * DB_ENV->get_memory_init. 
+ * + * PUBLIC: int __env_get_memory_init __P((DB_ENV *, + * PUBLIC: DB_MEM_CONFIG, u_int32_t *)); + */ +int +__env_get_memory_init(dbenv, type, countp) + DB_ENV *dbenv; + DB_MEM_CONFIG type; + u_int32_t *countp; +{ + ENV *env; + + env = dbenv->env; + + switch (type) { + case DB_MEM_LOCK: + ENV_NOT_CONFIGURED(env, + env->lk_handle, "DB_ENV->get_memory_init", DB_INIT_LOCK); + if (LOCKING_ON(env)) + *countp = ((DB_LOCKREGION *) + env->lk_handle->reginfo.primary)->stat.st_initlocks; + else + *countp = dbenv->lk_init; + break; + case DB_MEM_LOCKOBJECT: + ENV_NOT_CONFIGURED(env, + env->lk_handle, "DB_ENV->get_memory_init", DB_INIT_LOCK); + if (LOCKING_ON(env)) + *countp = ((DB_LOCKREGION *) env-> + lk_handle->reginfo.primary)->stat.st_initobjects; + else + *countp = dbenv->lk_init_objects; + break; + case DB_MEM_LOCKER: + ENV_NOT_CONFIGURED(env, + env->lk_handle, "DB_ENV->get_memory_init", DB_INIT_LOCK); + if (LOCKING_ON(env)) + *countp = ((DB_LOCKREGION *) env-> + lk_handle->reginfo.primary)->stat.st_initlockers; + else + *countp = dbenv->lk_init_lockers; + break; + case DB_MEM_LOGID: + ENV_NOT_CONFIGURED(env, + env->lg_handle, "DB_ENV->get_memory_init", DB_INIT_LOG); + + if (LOGGING_ON(env)) + *countp = ((LOG *)env->lg_handle-> + reginfo.primary)->stat.st_fileid_init; + else + *countp = dbenv->lg_fileid_init; + break; + case DB_MEM_TRANSACTION: + ENV_NOT_CONFIGURED(env, + env->tx_handle, "DB_ENV->memory_init", DB_INIT_TXN); + + if (TXN_ON(env)) + *countp = ((DB_TXNREGION *) + env->tx_handle->reginfo.primary)->inittxns; + else + *countp = dbenv->tx_init; + break; + case DB_MEM_THREAD: + /* We always update thr_init when joining an env. */ + *countp = dbenv->thr_init; + break; + } + + return (0); +} + +/* + * __env_set_memory_init -- + * DB_ENV->set_memory_init. 
+ * + * PUBLIC: int __env_set_memory_init __P((DB_ENV *, DB_MEM_CONFIG, u_int32_t)); + */ +int +__env_set_memory_init(dbenv, type, count) + DB_ENV *dbenv; + DB_MEM_CONFIG type; + u_int32_t count; +{ + ENV *env; + + env = dbenv->env; + + ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_memory_init"); + switch (type) { + case DB_MEM_LOCK: + dbenv->lk_init = count; + break; + case DB_MEM_LOCKOBJECT: + dbenv->lk_init_objects = count; + break; + case DB_MEM_LOCKER: + dbenv->lk_init_lockers = count; + break; + case DB_MEM_LOGID: + dbenv->lg_fileid_init = count; + break; + case DB_MEM_TRANSACTION: + dbenv->tx_init = count; + break; + case DB_MEM_THREAD: + dbenv->thr_init = count; + break; + } + + return (0); +} +/* + * __env_get_memory_max -- + * DB_ENV->get_memory_max. + * + * PUBLIC: int __env_get_memory_max __P((DB_ENV *, u_int32_t *, u_int32_t *)); + */ +int +__env_get_memory_max(dbenv, gbytes, bytes) + DB_ENV *dbenv; + u_int32_t *gbytes, *bytes; +{ + ENV *env; + env = dbenv->env; + + if (F_ISSET(env, ENV_OPEN_CALLED)) { + *gbytes = (u_int32_t)(env->reginfo->rp->max / GIGABYTE); + *bytes = (u_int32_t)(env->reginfo->rp->max % GIGABYTE); + } else { + *gbytes = (u_int32_t)(dbenv->memory_max / GIGABYTE); + *bytes = (u_int32_t)(dbenv->memory_max % GIGABYTE); + } + return (0); +} + +/* + * __env_set_memory_max -- + * DB_ENV->set_memory_max. + * + * PUBLIC: int __env_set_memory_max __P((DB_ENV *, u_int32_t, u_int32_t)); + */ +int +__env_set_memory_max(dbenv, gbytes, bytes) + DB_ENV *dbenv; + u_int32_t gbytes, bytes; +{ + ENV *env; + + env = dbenv->env; + + ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_memory_max"); + + /* + * If they are asking for 4GB exactly on a 32 bit platform, they + * really meant 4GB - 1. Give it to them. + */ + if (sizeof(roff_t) == 4 && gbytes == 4 && bytes == 0) { + --gbytes; + bytes = GIGABYTE - 1; + } + /* + * Make sure they wouldn't overflow the memory_max field on a + * 32 bit architecture. 
+ */ + if (sizeof(roff_t) == 4 && gbytes >= 4) { + __db_errx(env, DB_STR("1588", + "Maximum memory size too large: maximum is 4GB")); + return (EINVAL); + } + dbenv->memory_max = ((roff_t)gbytes * GIGABYTE) + bytes; + return (0); +} + +/* + * __env_get_app_dispatch -- + * Get the transaction abort recover function. + */ +static int +__env_get_app_dispatch(dbenv, app_dispatchp) + DB_ENV *dbenv; + int (**app_dispatchp) __P((DB_ENV *, DBT *, DB_LSN *, db_recops)); +{ + + if (app_dispatchp != NULL) + *app_dispatchp = dbenv->app_dispatch; + return (0); +} + +/* + * __env_set_app_dispatch -- + * Set the transaction abort recover function. + */ +static int +__env_set_app_dispatch(dbenv, app_dispatch) + DB_ENV *dbenv; + int (*app_dispatch) __P((DB_ENV *, DBT *, DB_LSN *, db_recops)); +{ + ENV *env; + + env = dbenv->env; + + ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_app_dispatch"); + + dbenv->app_dispatch = app_dispatch; + return (0); +} + +/* + * __env_get_encrypt_flags -- + * {DB_ENV,DB}->get_encrypt_flags. + * + * PUBLIC: int __env_get_encrypt_flags __P((DB_ENV *, u_int32_t *)); + */ +int +__env_get_encrypt_flags(dbenv, flagsp) + DB_ENV *dbenv; + u_int32_t *flagsp; +{ +#ifdef HAVE_CRYPTO + DB_CIPHER *db_cipher; +#endif + ENV *env; + + env = dbenv->env; + +#ifdef HAVE_CRYPTO + db_cipher = env->crypto_handle; + if (db_cipher != NULL && db_cipher->alg == CIPHER_AES) + *flagsp = DB_ENCRYPT_AES; + else + *flagsp = 0; + return (0); +#else + COMPQUIET(flagsp, 0); + __db_errx(env, DB_STR("1555", + "library build did not include support for cryptography")); + return (DB_OPNOTSUP); +#endif +} + +/* + * __env_set_encrypt -- + * DB_ENV->set_encrypt. 
+ * + * PUBLIC: int __env_set_encrypt __P((DB_ENV *, const char *, u_int32_t)); + */ +int +__env_set_encrypt(dbenv, passwd, flags) + DB_ENV *dbenv; + const char *passwd; + u_int32_t flags; +{ +#ifdef HAVE_CRYPTO + DB_THREAD_INFO *ip; + DB_CIPHER *db_cipher; + ENV *env; + int ret; + + env = dbenv->env; + + ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_encrypt"); +#define OK_CRYPTO_FLAGS (DB_ENCRYPT_AES) + + if (flags != 0 && LF_ISSET(~OK_CRYPTO_FLAGS)) + return (__db_ferr(env, "DB_ENV->set_encrypt", 0)); + + if (passwd == NULL || strlen(passwd) == 0) { + __db_errx(env, DB_STR("1556", + "Empty password specified to set_encrypt")); + return (EINVAL); + } + ENV_ENTER(env, ip); + if (!CRYPTO_ON(env)) { + if ((ret = __os_calloc(env, 1, sizeof(DB_CIPHER), &db_cipher)) + != 0) + goto err; + env->crypto_handle = db_cipher; + } else + db_cipher = env->crypto_handle; + + if (dbenv->passwd != NULL) + __os_free(env, dbenv->passwd); + if ((ret = __os_strdup(env, passwd, &dbenv->passwd)) != 0) { + __os_free(env, db_cipher); + goto err; + } + /* + * We're going to need this often enough to keep around + */ + dbenv->passwd_len = strlen(dbenv->passwd) + 1; + /* + * The MAC key is for checksumming, and is separate from + * the algorithm. So initialize it here, even if they + * are using CIPHER_ANY. + */ + __db_derive_mac( + (u_int8_t *)dbenv->passwd, dbenv->passwd_len, db_cipher->mac_key); + switch (flags) { + case 0: + F_SET(db_cipher, CIPHER_ANY); + break; + case DB_ENCRYPT_AES: + if ((ret = + __crypto_algsetup(env, db_cipher, CIPHER_AES, 0)) != 0) + goto err1; + break; + default: /* Impossible. 
*/ + break; + } + ENV_LEAVE(env, ip); + return (0); + +err1: + __os_free(env, dbenv->passwd); + __os_free(env, db_cipher); + env->crypto_handle = NULL; +err: + ENV_LEAVE(env, ip); + return (ret); +#else + COMPQUIET(passwd, NULL); + COMPQUIET(flags, 0); + + __db_errx(dbenv->env, DB_STR("1557", + "library build did not include support for cryptography")); + return (DB_OPNOTSUP); +#endif +} +#ifndef HAVE_BREW +static +#endif +const FLAG_MAP EnvMap[] = { + { DB_AUTO_COMMIT, DB_ENV_AUTO_COMMIT }, + { DB_CDB_ALLDB, DB_ENV_CDB_ALLDB }, + { DB_DATABASE_LOCKING, DB_ENV_DATABASE_LOCKING }, + { DB_DIRECT_DB, DB_ENV_DIRECT_DB }, + { DB_DSYNC_DB, DB_ENV_DSYNC_DB }, + { DB_HOTBACKUP_IN_PROGRESS, DB_ENV_HOTBACKUP }, + { DB_MULTIVERSION, DB_ENV_MULTIVERSION }, + { DB_NOFLUSH, DB_ENV_NOFLUSH }, + { DB_NOLOCKING, DB_ENV_NOLOCKING }, + { DB_NOMMAP, DB_ENV_NOMMAP }, + { DB_NOPANIC, DB_ENV_NOPANIC }, + { DB_OVERWRITE, DB_ENV_OVERWRITE }, + { DB_REGION_INIT, DB_ENV_REGION_INIT }, + { DB_TIME_NOTGRANTED, DB_ENV_TIME_NOTGRANTED }, + { DB_TXN_NOSYNC, DB_ENV_TXN_NOSYNC }, + { DB_TXN_NOWAIT, DB_ENV_TXN_NOWAIT }, + { DB_TXN_SNAPSHOT, DB_ENV_TXN_SNAPSHOT }, + { DB_TXN_WRITE_NOSYNC, DB_ENV_TXN_WRITE_NOSYNC }, + { DB_YIELDCPU, DB_ENV_YIELDCPU } +}; + +/* + * __env_map_flags -- map from external to internal flags. + * PUBLIC: void __env_map_flags __P((const FLAG_MAP *, + * PUBLIC: u_int, u_int32_t *, u_int32_t *)); + */ +void +__env_map_flags(flagmap, mapsize, inflagsp, outflagsp) + const FLAG_MAP *flagmap; + u_int mapsize; + u_int32_t *inflagsp, *outflagsp; +{ + + const FLAG_MAP *fmp; + u_int i; + + for (i = 0, fmp = flagmap; + i < mapsize / sizeof(flagmap[0]); ++i, ++fmp) + if (FLD_ISSET(*inflagsp, fmp->inflag)) { + FLD_SET(*outflagsp, fmp->outflag); + FLD_CLR(*inflagsp, fmp->inflag); + if (*inflagsp == 0) + break; + } +} + +/* + * __env_fetch_flags -- map from internal to external flags. 
+ * PUBLIC: void __env_fetch_flags __P((const FLAG_MAP *, + * PUBLIC: u_int, u_int32_t *, u_int32_t *)); + */ +void +__env_fetch_flags(flagmap, mapsize, inflagsp, outflagsp) + const FLAG_MAP *flagmap; + u_int mapsize; + u_int32_t *inflagsp, *outflagsp; +{ + const FLAG_MAP *fmp; + u_int32_t i; + + *outflagsp = 0; + for (i = 0, fmp = flagmap; + i < mapsize / sizeof(flagmap[0]); ++i, ++fmp) + if (FLD_ISSET(*inflagsp, fmp->outflag)) + FLD_SET(*outflagsp, fmp->inflag); +} + +static int +__env_get_flags(dbenv, flagsp) + DB_ENV *dbenv; + u_int32_t *flagsp; +{ + ENV *env; + DB_THREAD_INFO *ip; + + __env_fetch_flags(EnvMap, sizeof(EnvMap), &dbenv->flags, flagsp); + + env = dbenv->env; + /* Some flags are persisted in the regions. */ + if (env->reginfo != NULL && + ((REGENV *)env->reginfo->primary)->panic != 0) + FLD_SET(*flagsp, DB_PANIC_ENVIRONMENT); + + /* If the hotbackup counter is positive, set the flag indicating so. */ + if (TXN_ON(env)) { + ENV_ENTER(env, ip); + TXN_SYSTEM_LOCK(env); + if (((DB_TXNREGION *) + env->tx_handle->reginfo.primary)->n_hotbackup > 0) + FLD_SET(*flagsp, DB_HOTBACKUP_IN_PROGRESS); + TXN_SYSTEM_UNLOCK(env); + ENV_LEAVE(env, ip); + } + + return (0); +} + +/* + * __env_set_flags -- + * DB_ENV->set_flags. 
 *
 * PUBLIC: int __env_set_flags __P((DB_ENV *, u_int32_t, int));
 */
int
__env_set_flags(dbenv, flags, on)
	DB_ENV *dbenv;
	u_int32_t flags;
	int on;
{
	ENV *env;
	DB_THREAD_INFO *ip;
	u_int32_t mapped_flags;
	int mem_on, ret;

	env = dbenv->env;

#define	OK_FLAGS							\
	(DB_AUTO_COMMIT | DB_CDB_ALLDB | DB_DATABASE_LOCKING |		\
	DB_DIRECT_DB | DB_DSYNC_DB | DB_MULTIVERSION |			\
	DB_NOLOCKING | DB_NOMMAP | DB_NOPANIC | DB_OVERWRITE |		\
	DB_PANIC_ENVIRONMENT | DB_REGION_INIT |				\
	DB_TIME_NOTGRANTED | DB_TXN_NOSYNC | DB_TXN_NOWAIT |		\
	DB_TXN_SNAPSHOT | DB_TXN_WRITE_NOSYNC | DB_YIELDCPU |		\
	DB_HOTBACKUP_IN_PROGRESS | DB_NOFLUSH)

	/* Reject any bit outside the accepted set. */
	if (LF_ISSET(~OK_FLAGS))
		return (__db_ferr(env, "DB_ENV->set_flags", 0));
	if (on) {
		/* DB_TXN_NOSYNC and DB_TXN_WRITE_NOSYNC are exclusive. */
		if ((ret = __db_fcchk(env, "DB_ENV->set_flags",
		    flags, DB_TXN_NOSYNC, DB_TXN_WRITE_NOSYNC)) != 0)
			return (ret);
		if (LF_ISSET(DB_DIRECT_DB) && __os_support_direct_io() == 0) {
			__db_errx(env,
	"DB_ENV->set_flags: direct I/O either not configured or not supported");
			return (EINVAL);
		}
	}

	/* DB_CDB_ALLDB sizes shared state, so must precede open. */
	if (LF_ISSET(DB_CDB_ALLDB))
		ENV_ILLEGAL_AFTER_OPEN(env,
		    "DB_ENV->set_flags: DB_CDB_ALLDB");
	/*
	 * DB_PANIC_ENVIRONMENT toggles the region's panic state, so the
	 * region must already exist; setting it panics the environment
	 * immediately, clearing it lifts an existing panic.
	 */
	if (LF_ISSET(DB_PANIC_ENVIRONMENT)) {
		ENV_ILLEGAL_BEFORE_OPEN(env,
		    "DB_ENV->set_flags: DB_PANIC_ENVIRONMENT");
		if (on) {
			__db_errx(env, DB_STR("1558",
			    "Environment panic set"));
			(void)__env_panic(env, DB_RUNRECOVERY);
		} else
			__env_panic_set(env, 0);
	}
	if (LF_ISSET(DB_REGION_INIT))
		ENV_ILLEGAL_AFTER_OPEN(env,
		    "DB_ENV->set_flags: DB_REGION_INIT");

	/*
	 * DB_LOG_IN_MEMORY, DB_TXN_NOSYNC and DB_TXN_WRITE_NOSYNC are
	 * mutually incompatible.  If we're setting one of them, clear all
	 * current settings.  If the environment is open, check to see that
	 * logging is not in memory.
	 */
	if (on && LF_ISSET(DB_TXN_NOSYNC | DB_TXN_WRITE_NOSYNC)) {
		F_CLR(dbenv, DB_ENV_TXN_NOSYNC | DB_ENV_TXN_WRITE_NOSYNC);
		if (!F_ISSET(env, ENV_OPEN_CALLED)) {
			if ((ret =
			    __log_set_config(dbenv, DB_LOG_IN_MEMORY, 0)) != 0)
				return (ret);
		} else if (LOGGING_ON(env)) {
			if ((ret = __log_get_config(dbenv,
			    DB_LOG_IN_MEMORY, &mem_on)) != 0)
				return (ret);
			if (mem_on == 1) {
				__db_errx(env, DB_STR("1559",
				    "DB_TXN_NOSYNC and DB_TXN_WRITE_NOSYNC"
				    " may not be used with DB_LOG_IN_MEMORY"));
				return (EINVAL);
			}
		}
	}

	/*
	 * Settings of DB_HOTBACKUP_IN_PROGRESS are reference-counted
	 * in REGENV.
	 */
	if (LF_ISSET(DB_HOTBACKUP_IN_PROGRESS)) {
		/* You can't take a hot backup without transactions. */
		ENV_REQUIRES_CONFIG(env, env->tx_handle,
		    "DB_ENV->set_flags: DB_HOTBACKUP_IN_PROGRESS", DB_INIT_TXN);

		ENV_ENTER(env, ip);
		ret = __env_set_backup(env, on);
		ENV_LEAVE(env, ip);
		if (ret != 0)
			return (ret);
	}

	/* Finally, translate the remaining bits into the handle's flags. */
	mapped_flags = 0;
	__env_map_flags(EnvMap, sizeof(EnvMap), &flags, &mapped_flags);
	if (on)
		F_SET(dbenv, mapped_flags);
	else
		F_CLR(dbenv, mapped_flags);

	return (0);
}

/*
 * __env_set_backup --
 *	Adjust the region's hot-backup reference count.  Incrementing while
 *	bulk transactions are active forces a checkpoint; decrementing past
 *	zero is an application error.
 *
 * PUBLIC: int __env_set_backup __P((ENV *, int));
 */
int
__env_set_backup(env, on)
	ENV *env;
	int on;
{
	DB_TXNREGION *tenv;
	int needs_checkpoint, ret;

	tenv = (DB_TXNREGION *)env->tx_handle->reginfo.primary;
	needs_checkpoint = 0;

	TXN_SYSTEM_LOCK(env);
	if (on) {
		tenv->n_hotbackup++;
		if (tenv->n_bulk_txn > 0)
			needs_checkpoint = 1;
	} else {
		if (tenv->n_hotbackup == 0)
			needs_checkpoint = -1;	/* signal count error */
		else
			tenv->n_hotbackup--;
	}
	TXN_SYSTEM_UNLOCK(env);

	if (needs_checkpoint == -1) {
		__db_errx(env, DB_STR("1560",
		    "Attempt to decrement hotbackup counter past zero"));
		return (EINVAL);
	}

	if (needs_checkpoint && (ret = __txn_checkpoint(env, 0, 0, 0)))
		return (ret);
	return (0);
}
const char ***dirpp; +{ + *dirpp = (const char **)dbenv->db_data_dir; + return (0); +} + +/* + * __env_set_data_dir -- + * DB_ENV->set_data_dir. + * + * PUBLIC: int __env_set_data_dir __P((DB_ENV *, const char *)); + */ +int +__env_set_data_dir(dbenv, dir) + DB_ENV *dbenv; + const char *dir; +{ + int ret; + + if ((ret = __env_add_data_dir(dbenv, dir)) != 0) + return (ret); + + if (dbenv->data_next == 1) + return (__env_set_create_dir(dbenv, dir)); + + return (0); +} + +/* + * __env_add_data_dir -- + * DB_ENV->add_data_dir. + * + * PUBLIC: int __env_add_data_dir __P((DB_ENV *, const char *)); + */ +int +__env_add_data_dir(dbenv, dir) + DB_ENV *dbenv; + const char *dir; +{ + ENV *env; + int ret; + + env = dbenv->env; + ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->add_data_dir"); + + /* + * The array is NULL-terminated so it can be returned by get_data_dirs + * without a length. + */ + +#define DATA_INIT_CNT 20 /* Start with 20 data slots. */ + if (dbenv->db_data_dir == NULL) { + if ((ret = __os_calloc(env, DATA_INIT_CNT, + sizeof(char **), &dbenv->db_data_dir)) != 0) + return (ret); + dbenv->data_cnt = DATA_INIT_CNT; + } else if (dbenv->data_next == dbenv->data_cnt - 2) { + dbenv->data_cnt *= 2; + if ((ret = __os_realloc(env, + (u_int)dbenv->data_cnt * sizeof(char **), + &dbenv->db_data_dir)) != 0) + return (ret); + } + + ret = __os_strdup(env, + dir, &dbenv->db_data_dir[dbenv->data_next++]); + dbenv->db_data_dir[dbenv->data_next] = NULL; + return (ret); +} + +/* + * __env_set_create_dir -- + * DB_ENV->set_create_dir. + * The list of directories cannot change after opening the env and setting + * a pointer must be atomic so we do not need to mutex here even if multiple + * threads are using the DB_ENV handle. 
+ * + * PUBLIC: int __env_set_create_dir __P((DB_ENV *, const char *)); + */ +int +__env_set_create_dir(dbenv, dir) + DB_ENV *dbenv; + const char *dir; +{ + ENV *env; + int i; + + env = dbenv->env; + + for (i = 0; i < dbenv->data_next; i++) + if (strcmp(dir, dbenv->db_data_dir[i]) == 0) + break; + + if (i == dbenv->data_next) { + __db_errx(env, DB_STR_A("1561", + "Directory %s not in environment list.", "%s"), dir); + return (EINVAL); + } + + dbenv->db_create_dir = dbenv->db_data_dir[i]; + return (0); +} + +static int +__env_get_create_dir(dbenv, dirp) + DB_ENV *dbenv; + const char **dirp; +{ + *dirp = dbenv->db_create_dir; + return (0); +} + +static int +__env_get_intermediate_dir_mode(dbenv, modep) + DB_ENV *dbenv; + const char **modep; +{ + *modep = dbenv->intermediate_dir_mode; + return (0); +} + +/* + * __env_set_metadata_dir -- + * DB_ENV->set_metadata_dir. + * + * PUBLIC: int __env_set_metadata_dir __P((DB_ENV *, const char *)); + */ +int +__env_set_metadata_dir(dbenv, dir) + DB_ENV *dbenv; + const char *dir; +{ + ENV *env; + int i, ret; + + env = dbenv->env; + + ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_metadata_dir"); + + /* If metadata_dir is not already on data_dir list, add it. */ + for (i = 0; i < dbenv->data_next; i++) + if (strcmp(dir, dbenv->db_data_dir[i]) == 0) + break; + if (i == dbenv->data_next && + (ret = __env_add_data_dir(dbenv, dir)) != 0) { + __db_errx(env, DB_STR_A("1590", + "Could not add %s to environment list.", "%s"), dir); + return (ret); + } + + if (dbenv->db_md_dir != NULL) + __os_free(env, dbenv->db_md_dir); + return (__os_strdup(env, dir, &dbenv->db_md_dir)); +} + +static int +__env_get_metadata_dir(dbenv, dirp) + DB_ENV *dbenv; + const char **dirp; +{ + *dirp = dbenv->db_md_dir; + return (0); +} + +/* + * __env_set_data_len -- + * DB_ENV->set_data_len. 
+ * + * PUBLIC: int __env_set_data_len __P((DB_ENV *, u_int32_t)); + */ +int +__env_set_data_len(dbenv, data_len) + DB_ENV *dbenv; + u_int32_t data_len; +{ + + dbenv->env->data_len = data_len; + return (0); +} + +static int +__env_get_data_len(dbenv, data_lenp) + DB_ENV *dbenv; + u_int32_t *data_lenp; +{ + *data_lenp = dbenv->env->data_len; + return (0); +} + +/* + * __env_set_intermediate_dir_mode -- + * DB_ENV->set_intermediate_dir_mode. + * + * PUBLIC: int __env_set_intermediate_dir_mode __P((DB_ENV *, const char *)); + */ +int +__env_set_intermediate_dir_mode(dbenv, mode) + DB_ENV *dbenv; + const char *mode; +{ + ENV *env; + u_int t; + int ret; + + env = dbenv->env; + + ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_intermediate_dir_mode"); + +#define __SETMODE(offset, valid_ch, mask) { \ + if (mode[offset] == (valid_ch)) \ + t |= (mask); \ + else if (mode[offset] != '-') \ + goto format_err; \ +} + t = 0; + __SETMODE(0, 'r', S_IRUSR); + __SETMODE(1, 'w', S_IWUSR); + __SETMODE(2, 'x', S_IXUSR); + __SETMODE(3, 'r', S_IRGRP); + __SETMODE(4, 'w', S_IWGRP); + __SETMODE(5, 'x', S_IXGRP); + __SETMODE(6, 'r', S_IROTH); + __SETMODE(7, 'w', S_IWOTH); + __SETMODE(8, 'x', S_IXOTH); + if (mode[9] != '\0' || t == 0) { + /* + * We disallow modes of 0 -- we use 0 to decide the application + * never configured intermediate directory permissions, and we + * shouldn't create intermediate directories. Besides, setting + * the permissions to 0 makes no sense. + */ +format_err: __db_errx(env, + "DB_ENV->set_intermediate_dir_mode: illegal mode \"%s\"", mode); + return (EINVAL); + } + + if (dbenv->intermediate_dir_mode != NULL) + __os_free(env, dbenv->intermediate_dir_mode); + if ((ret = __os_strdup(env, mode, &dbenv->intermediate_dir_mode)) != 0) + return (ret); + + env->dir_mode = (int)t; + return (0); +} + +/* + * __env_get_errcall -- + * {DB_ENV,DB}->get_errcall. 
+ * + * PUBLIC: void __env_get_errcall __P((DB_ENV *, + * PUBLIC: void (**)(const DB_ENV *, const char *, const char *))); + */ +void +__env_get_errcall(dbenv, errcallp) + DB_ENV *dbenv; + void (**errcallp) __P((const DB_ENV *, const char *, const char *)); +{ + *errcallp = dbenv->db_errcall; +} + +/* + * __env_set_errcall -- + * {DB_ENV,DB}->set_errcall. + * + * PUBLIC: void __env_set_errcall __P((DB_ENV *, + * PUBLIC: void (*)(const DB_ENV *, const char *, const char *))); + */ +void +__env_set_errcall(dbenv, errcall) + DB_ENV *dbenv; + void (*errcall) __P((const DB_ENV *, const char *, const char *)); +{ + ENV *env; + + env = dbenv->env; + + F_CLR(env, ENV_NO_OUTPUT_SET); + dbenv->db_errcall = errcall; +} + +/* + * __env_get_errfile -- + * {DB_ENV,DB}->get_errfile. + * + * PUBLIC: void __env_get_errfile __P((DB_ENV *, FILE **)); + */ +void +__env_get_errfile(dbenv, errfilep) + DB_ENV *dbenv; + FILE **errfilep; +{ + *errfilep = dbenv->db_errfile; +} + +/* + * __env_set_errfile -- + * {DB_ENV,DB}->set_errfile. + * + * PUBLIC: void __env_set_errfile __P((DB_ENV *, FILE *)); + */ +void +__env_set_errfile(dbenv, errfile) + DB_ENV *dbenv; + FILE *errfile; +{ + ENV *env; + + env = dbenv->env; + + F_CLR(env, ENV_NO_OUTPUT_SET); + dbenv->db_errfile = errfile; +} + +/* + * __env_get_errpfx -- + * {DB_ENV,DB}->get_errpfx. + * + * PUBLIC: void __env_get_errpfx __P((DB_ENV *, const char **)); + */ +void +__env_get_errpfx(dbenv, errpfxp) + DB_ENV *dbenv; + const char **errpfxp; +{ + *errpfxp = dbenv->db_errpfx; +} + +/* + * __env_set_errpfx -- + * {DB_ENV,DB}->set_errpfx. 
+ * + * PUBLIC: void __env_set_errpfx __P((DB_ENV *, const char *)); + */ +void +__env_set_errpfx(dbenv, errpfx) + DB_ENV *dbenv; + const char *errpfx; +{ + dbenv->db_errpfx = errpfx; +} + +static int +__env_get_feedback(dbenv, feedbackp) + DB_ENV *dbenv; + void (**feedbackp) __P((DB_ENV *, int, int)); +{ + if (feedbackp != NULL) + *feedbackp = dbenv->db_feedback; + return (0); +} + +static int +__env_set_feedback(dbenv, feedback) + DB_ENV *dbenv; + void (*feedback) __P((DB_ENV *, int, int)); +{ + dbenv->db_feedback = feedback; + return (0); +} + +/* + * __env_get_thread_id_fn -- + * DB_ENV->get_thread_id_fn + */ +static int +__env_get_thread_id_fn(dbenv, idp) + DB_ENV *dbenv; + void (**idp) __P((DB_ENV *, pid_t *, db_threadid_t *)); +{ + if (idp != NULL) + *idp = dbenv->thread_id; + return (0); +} + +/* + * __env_set_thread_id -- + * DB_ENV->set_thread_id + */ +static int +__env_set_thread_id(dbenv, id) + DB_ENV *dbenv; + void (*id) __P((DB_ENV *, pid_t *, db_threadid_t *)); +{ + dbenv->thread_id = id; + return (0); +} + +/* + * __env_get_threadid_string_fn -- + * DB_ENV->get_threadid_string_fn + */ +static int +__env_get_thread_id_string_fn(dbenv, thread_id_stringp) + DB_ENV *dbenv; + char *(**thread_id_stringp) + __P((DB_ENV *, pid_t, db_threadid_t, char *)); +{ + if (thread_id_stringp != NULL) + *thread_id_stringp = dbenv->thread_id_string; + return (0); +} + +/* + * __env_set_threadid_string -- + * DB_ENV->set_threadid_string + */ +static int +__env_set_thread_id_string(dbenv, thread_id_string) + DB_ENV *dbenv; + char *(*thread_id_string) __P((DB_ENV *, pid_t, db_threadid_t, char *)); +{ + dbenv->thread_id_string = thread_id_string; + return (0); +} + +/* + * __env_get_isalive -- + * DB_ENV->get_isalive + */ +static int +__env_get_isalive(dbenv, is_alivep) + DB_ENV *dbenv; + int (**is_alivep) __P((DB_ENV *, pid_t, db_threadid_t, u_int32_t)); +{ + ENV *env; + + env = dbenv->env; + + if (F_ISSET(env, ENV_OPEN_CALLED) && env->thr_nbucket == 0) { + __db_errx(env, 
DB_STR("1562", + "is_alive method specified but no thread region allocated")); + return (EINVAL); + } + if (is_alivep != NULL) + *is_alivep = dbenv->is_alive; + return (0); +} + +/* + * __env_set_isalive -- + * DB_ENV->set_isalive + */ +static int +__env_set_isalive(dbenv, is_alive) + DB_ENV *dbenv; + int (*is_alive) __P((DB_ENV *, pid_t, db_threadid_t, u_int32_t)); +{ + ENV *env; + + env = dbenv->env; + + if (F_ISSET(env, ENV_OPEN_CALLED) && env->thr_nbucket == 0) { + __db_errx(env, DB_STR("1563", + "is_alive method specified but no thread region allocated")); + return (EINVAL); + } + dbenv->is_alive = is_alive; + return (0); +} + +/* + * __env_get_thread_count -- + * DB_ENV->get_thread_count + */ +static int +__env_get_thread_count(dbenv, countp) + DB_ENV *dbenv; + u_int32_t *countp; +{ + *countp = dbenv->thr_max; + return (0); +} + +/* + * __env_set_thread_count -- + * DB_ENV->set_thread_count + * + * PUBLIC: int __env_set_thread_count __P((DB_ENV *, u_int32_t)); + */ +int +__env_set_thread_count(dbenv, count) + DB_ENV *dbenv; + u_int32_t count; +{ + ENV *env; + + env = dbenv->env; + + ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_thread_count"); + dbenv->thr_max = count; + + return (0); +} + +/* + * __env_get_msgcall -- + * {DB_ENV,DB}->get_msgcall. + * + * PUBLIC: void __env_get_msgcall + * PUBLIC: __P((DB_ENV *, void (**)(const DB_ENV *, const char *))); + */ +void +__env_get_msgcall(dbenv, msgcallp) + DB_ENV *dbenv; + void (**msgcallp) __P((const DB_ENV *, const char *)); +{ + if (msgcallp != NULL) + *msgcallp = dbenv->db_msgcall; +} + +/* + * __env_set_msgcall -- + * {DB_ENV,DB}->set_msgcall. + * + * PUBLIC: void __env_set_msgcall + * PUBLIC: __P((DB_ENV *, void (*)(const DB_ENV *, const char *))); + */ +void +__env_set_msgcall(dbenv, msgcall) + DB_ENV *dbenv; + void (*msgcall) __P((const DB_ENV *, const char *)); +{ + dbenv->db_msgcall = msgcall; +} + +/* + * __env_get_msgfile -- + * {DB_ENV,DB}->get_msgfile. 
+ * + * PUBLIC: void __env_get_msgfile __P((DB_ENV *, FILE **)); + */ +void +__env_get_msgfile(dbenv, msgfilep) + DB_ENV *dbenv; + FILE **msgfilep; +{ + *msgfilep = dbenv->db_msgfile; +} + +/* + * __env_set_msgfile -- + * {DB_ENV,DB}->set_msgfile. + * + * PUBLIC: void __env_set_msgfile __P((DB_ENV *, FILE *)); + */ +void +__env_set_msgfile(dbenv, msgfile) + DB_ENV *dbenv; + FILE *msgfile; +{ + dbenv->db_msgfile = msgfile; +} + +/* + * __env_set_paniccall -- + * {DB_ENV,DB}->set_paniccall. + * + * PUBLIC: int __env_set_paniccall __P((DB_ENV *, void (*)(DB_ENV *, int))); + */ +int +__env_set_paniccall(dbenv, paniccall) + DB_ENV *dbenv; + void (*paniccall) __P((DB_ENV *, int)); +{ + dbenv->db_paniccall = paniccall; + return (0); +} + +/* + * __env_set_event_notify -- + * DB_ENV->set_event_notify. + */ +static int +__env_set_event_notify(dbenv, event_func) + DB_ENV *dbenv; + void (*event_func) __P((DB_ENV *, u_int32_t, void *)); +{ + dbenv->db_event_func = event_func; + return (0); +} + +static int +__env_get_shm_key(dbenv, shm_keyp) + DB_ENV *dbenv; + long *shm_keyp; /* !!!: really a key_t *. */ +{ + *shm_keyp = dbenv->shm_key; + return (0); +} + +/* + * __env_set_shm_key -- + * DB_ENV->set_shm_key. + * + * PUBLIC: int __env_set_shm_key __P((DB_ENV *, long)); + */ +int +__env_set_shm_key(dbenv, shm_key) + DB_ENV *dbenv; + long shm_key; /* !!!: really a key_t. */ +{ + ENV *env; + + env = dbenv->env; + + ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->set_shm_key"); + + dbenv->shm_key = shm_key; + return (0); +} + +static int +__env_get_tmp_dir(dbenv, dirp) + DB_ENV *dbenv; + const char **dirp; +{ + *dirp = dbenv->db_tmp_dir; + return (0); +} + +/* + * __env_set_tmp_dir -- + * DB_ENV->set_tmp_dir. 
+ * + * PUBLIC: int __env_set_tmp_dir __P((DB_ENV *, const char *)); + */ +int +__env_set_tmp_dir(dbenv, dir) + DB_ENV *dbenv; + const char *dir; +{ + ENV *env; + + env = dbenv->env; + + if (dbenv->db_tmp_dir != NULL) + __os_free(env, dbenv->db_tmp_dir); + return (__os_strdup(env, dir, &dbenv->db_tmp_dir)); +} + +static int +__env_get_verbose(dbenv, which, onoffp) + DB_ENV *dbenv; + u_int32_t which; + int *onoffp; +{ + switch (which) { + case DB_VERB_BACKUP: + case DB_VERB_DEADLOCK: + case DB_VERB_FILEOPS: + case DB_VERB_FILEOPS_ALL: + case DB_VERB_RECOVERY: + case DB_VERB_REGISTER: + case DB_VERB_REPLICATION: + case DB_VERB_REP_ELECT: + case DB_VERB_REP_LEASE: + case DB_VERB_REP_MISC: + case DB_VERB_REP_MSGS: + case DB_VERB_REP_SYNC: + case DB_VERB_REP_SYSTEM: + case DB_VERB_REP_TEST: + case DB_VERB_REPMGR_CONNFAIL: + case DB_VERB_REPMGR_MISC: + case DB_VERB_WAITSFOR: + *onoffp = FLD_ISSET(dbenv->verbose, which) ? 1 : 0; + break; + default: + return (EINVAL); + } + return (0); +} + +/* + * __env_set_verbose -- + * DB_ENV->set_verbose. + * + * PUBLIC: int __env_set_verbose __P((DB_ENV *, u_int32_t, int)); + */ +int +__env_set_verbose(dbenv, which, on) + DB_ENV *dbenv; + u_int32_t which; + int on; +{ + switch (which) { + case DB_VERB_BACKUP: + case DB_VERB_DEADLOCK: + case DB_VERB_FILEOPS: + case DB_VERB_FILEOPS_ALL: + case DB_VERB_RECOVERY: + case DB_VERB_REGISTER: + case DB_VERB_REPLICATION: + case DB_VERB_REP_ELECT: + case DB_VERB_REP_LEASE: + case DB_VERB_REP_MISC: + case DB_VERB_REP_MSGS: + case DB_VERB_REP_SYNC: + case DB_VERB_REP_SYSTEM: + case DB_VERB_REP_TEST: + case DB_VERB_REPMGR_CONNFAIL: + case DB_VERB_REPMGR_MISC: + case DB_VERB_WAITSFOR: + if (on) + FLD_SET(dbenv->verbose, which); + else + FLD_CLR(dbenv->verbose, which); + break; + default: + return (EINVAL); + } + return (0); +} + +/* + * __db_mi_env -- + * Method illegally called with public environment. 
+ * + * PUBLIC: int __db_mi_env __P((ENV *, const char *)); + */ +int +__db_mi_env(env, name) + ENV *env; + const char *name; +{ + __db_errx(env, DB_STR_A("1564", + "%s: method not permitted when environment specified", "%s"), + name); + return (EINVAL); +} + +/* + * __db_mi_open -- + * Method illegally called after open. + * + * PUBLIC: int __db_mi_open __P((ENV *, const char *, int)); + */ +int +__db_mi_open(env, name, after) + ENV *env; + const char *name; + int after; +{ + __db_errx(env, DB_STR_A("1565", + "%s: method not permitted %s handle's open method", "%s %s"), + name, after ? DB_STR_P("after") : DB_STR_P("before")); + return (EINVAL); +} + +/* + * __env_not_config -- + * Method or function called without required configuration. + * + * PUBLIC: int __env_not_config __P((ENV *, char *, u_int32_t)); + */ +int +__env_not_config(env, i, flags) + ENV *env; + char *i; + u_int32_t flags; +{ + char *sub; + int is_sub; + + is_sub = 1; + + switch (flags) { + case DB_INIT_CDB: + sub = "DB_INIT_CDB"; + is_sub = 0; + break; + case DB_INIT_LOCK: + sub = "locking"; + break; + case DB_INIT_LOG: + sub = "logging"; + break; + case DB_INIT_MPOOL: + sub = "memory pool"; + break; + case DB_INIT_MUTEX: + sub = "mutex"; + break; + case DB_INIT_REP: + sub = "replication"; + break; + case DB_INIT_TXN: + sub = "transaction"; + break; + default: + sub = "<unspecified>"; + break; + } + + if (is_sub) { + __db_errx(env, DB_STR_A("1566", + "%s interface requires an environment configured for the %s subsystem", + "%s %s"), i, sub); + } else { + __db_errx(env, DB_STR_A("1587", + "%s interface requires an environment configured with %s", + "%s %s"), i, sub); + } + + return (EINVAL); +} + +/* + * __env_get_timeout -- + * DB_ENV->get_timeout + */ +static int +__env_get_timeout(dbenv, timeoutp, flags) + DB_ENV *dbenv; + db_timeout_t *timeoutp; + u_int32_t flags; +{ + int ret; + + ret = 0; + if (flags == DB_SET_REG_TIMEOUT) { + *timeoutp = dbenv->envreg_timeout; + } else + ret = 
__lock_get_env_timeout(dbenv, timeoutp, flags); + return (ret); +} + +/* + * __env_set_timeout -- + * DB_ENV->set_timeout + * + * PUBLIC: int __env_set_timeout __P((DB_ENV *, db_timeout_t, u_int32_t)); + */ +int +__env_set_timeout(dbenv, timeout, flags) + DB_ENV *dbenv; + db_timeout_t timeout; + u_int32_t flags; +{ + int ret; + + ret = 0; + if (flags == DB_SET_REG_TIMEOUT) + dbenv->envreg_timeout = timeout; + else + ret = __lock_set_env_timeout(dbenv, timeout, flags); + return (ret); +} diff --git a/src/env/env_name.c b/src/env/env_name.c new file mode 100644 index 00000000..a3a0b371 --- /dev/null +++ b/src/env/env_name.c @@ -0,0 +1,285 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" + +static int __db_fullpath + __P((ENV *, const char *, const char *, int, int, char **)); + +#define DB_ADDSTR(add) { \ + /* \ + * The string might be NULL or zero-length, and the p[-1] \ + * might indirect to before the beginning of our buffer. \ + */ \ + if ((add) != NULL && (add)[0] != '\0') { \ + /* If leading slash, start over. */ \ + if (__os_abspath(add)) { \ + p = str; \ + slash = 0; \ + } \ + /* Append to the current string. */ \ + len = strlen(add); \ + if (slash) \ + *p++ = PATH_SEPARATOR[0]; \ + memcpy(p, add, len); \ + p += len; \ + slash = strchr(PATH_SEPARATOR, p[-1]) == NULL; \ + } \ +} + +/* + * __db_fullpath -- + * Constructs a path name relative to the environment home, and optionally + * checks whether the file or directory exist. + */ +static int +__db_fullpath(env, dir, file, check_file, check_dir, namep) + ENV *env; + const char *dir; + const char *file; + int check_file; + int check_dir; + char **namep; +{ + size_t len; + const char *home; + char *p, *str; + int isdir, ret, slash; + + /* All paths are relative to the environment home. */ + home = (env == NULL) ? 
NULL : env->db_home; + + len = + (home == NULL ? 0 : strlen(home) + 1) + + (dir == NULL ? 0 : strlen(dir) + 1) + + (file == NULL ? 0 : strlen(file) + 1); + + if ((ret = __os_malloc(env, len, &str)) != 0) + return (ret); + + slash = 0; + p = str; + DB_ADDSTR(home); + DB_ADDSTR(dir); + *p = '\0'; + if (check_dir && (__os_exists(env, str, &isdir) != 0 || !isdir)) { + __os_free(env, str); + return (ENOENT); + } + DB_ADDSTR(file); + *p = '\0'; + + /* + * If we're opening a data file, see if it exists. If not, keep + * trying. + */ + if (check_file && __os_exists(env, str, NULL) != 0) { + __os_free(env, str); + return (ENOENT); + } + + if (namep == NULL) + __os_free(env, str); + else + *namep = str; + return (0); +} + +#define DB_CHECKFILE(file, dir, check_file, check_dir, namep, ret_dir) do { \ + ret = __db_fullpath(env, dir, file, \ + check_file, check_dir, namep); \ + if (ret == 0 && (ret_dir) != NULL) \ + *(ret_dir) = (dir); \ + if (ret != ENOENT) \ + return (ret); \ +} while (0) + +/* + * __db_appname -- + * Given an optional DB environment, directory and file name and type + * of call, build a path based on the ENV->open rules, and return + * it in allocated space. Dirp can be used to specify a data directory + * to use. If not and one is used then drip will contain a pointer + * to the directory name. + * + * PUBLIC: int __db_appname __P((ENV *, APPNAME, + * PUBLIC: const char *, const char **, char **)); + */ +int +__db_appname(env, appname, file, dirp, namep) + ENV *env; + APPNAME appname; + const char *file; + const char **dirp; + char **namep; +{ + DB_ENV *dbenv; + char **ddp; + const char *dir; + int ret; + + dbenv = env->dbenv; + dir = NULL; + + if (namep != NULL) + *namep = NULL; + + /* + * Absolute path names are never modified. If the file is an absolute + * path, we're done. 
+ */ + if (file != NULL && __os_abspath(file)) + return (__os_strdup(env, file, namep)); + + /* + * DB_APP_NONE: + * DB_HOME/file + * DB_APP_DATA: + * DB_HOME/DB_DATA_DIR/file + * DB_APP_LOG: + * DB_HOME/DB_LOG_DIR/file + * DB_APP_TMP: + * DB_HOME/DB_TMP_DIR/<create> + */ + switch (appname) { + case DB_APP_NONE: + break; + case DB_APP_RECOVER: + case DB_APP_DATA: + /* + * First, step through the data_dir entries, if any, looking + * for the file. + */ + if (dbenv != NULL && dbenv->db_data_dir != NULL) + for (ddp = dbenv->db_data_dir; *ddp != NULL; ddp++) + DB_CHECKFILE(file, *ddp, 1, 0, namep, dirp); + + /* Second, look in the environment home directory. */ + DB_CHECKFILE(file, NULL, 1, 0, namep, dirp); + + /* + * Otherwise, we're going to create. Use the specified + * directory unless we're in recovery and it doesn't exist. + */ + if (dirp != NULL && *dirp != NULL) + DB_CHECKFILE(file, *dirp, 0, + appname == DB_APP_RECOVER, namep, dirp); + + /* Finally, use the create directory, if set. */ + if (dbenv != NULL && dbenv->db_create_dir != NULL) + dir = dbenv->db_create_dir; + break; + case DB_APP_LOG: + if (dbenv != NULL) + dir = dbenv->db_log_dir; + break; + case DB_APP_TMP: + if (dbenv != NULL) + dir = dbenv->db_tmp_dir; + break; + case DB_APP_META: + if (dbenv != NULL) + dir = dbenv->db_md_dir; + break; + } + + /* + * Construct the full path. For temporary files, it is an error if the + * directory does not exist: if it doesn't, checking whether millions + * of temporary files exist inside it takes a *very* long time. + */ + DB_CHECKFILE(file, dir, 0, appname == DB_APP_TMP, namep, dirp); + + return (ret); +} + +/* + * __db_tmp_open -- + * Create a temporary file. 
+ * + * PUBLIC: int __db_tmp_open __P((ENV *, u_int32_t, DB_FH **)); + */ +int +__db_tmp_open(env, oflags, fhpp) + ENV *env; + u_int32_t oflags; + DB_FH **fhpp; +{ + pid_t pid; + int filenum, i, ipid, ret; + char *path; + char *firstx, *trv; + + DB_ASSERT(env, fhpp != NULL); + *fhpp = NULL; + +#define DB_TRAIL "BDBXXXXX" + if ((ret = __db_appname(env, DB_APP_TMP, DB_TRAIL, NULL, &path)) != 0) + goto done; + + /* Replace the X's with the process ID (in decimal). */ + __os_id(env->dbenv, &pid, NULL); + ipid = (int)pid; + if (ipid < 0) + ipid = -ipid; + for (trv = path + strlen(path); *--trv == 'X'; ipid /= 10) + *trv = '0' + (u_char)(ipid % 10); + firstx = trv + 1; + + /* Loop, trying to open a file. */ + for (filenum = 1;; filenum++) { + if ((ret = __os_open(env, path, 0, + oflags | DB_OSO_CREATE | DB_OSO_EXCL | DB_OSO_TEMP, + DB_MODE_600, fhpp)) == 0) { + ret = 0; + goto done; + } + + /* + * !!!: + * If we don't get an EEXIST error, then there's something + * seriously wrong. Unfortunately, if the implementation + * doesn't return EEXIST for O_CREAT and O_EXCL regardless + * of other possible errors, we've lost. + */ + if (ret != EEXIST) { + __db_err(env, ret, DB_STR_A("1586", + "temporary open: %s", "%s"), path); + goto done; + } + + /* + * Generate temporary file names in a backwards-compatible way. + * If pid == 12345, the result is: + * <path>/DB12345 (tried above, the first time through). + * <path>/DBa2345 ... <path>/DBz2345 + * <path>/DBaa345 ... <path>/DBaz345 + * <path>/DBba345, and so on. + * + * XXX + * This algorithm is O(n**2) -- that is, creating 100 temporary + * files requires 5,000 opens, creating 1000 files requires + * 500,000. If applications open a lot of temporary files, we + * could improve performance by switching to timestamp-based + * file names. 
+ */ + for (i = filenum, trv = firstx; i > 0; i = (i - 1) / 26) + if (*trv++ == '\0') { + ret = EINVAL; + goto done; + } + + for (i = filenum; i > 0; i = (i - 1) / 26) + *--trv = 'a' + ((i - 1) % 26); + } +done: + __os_free(env, path); + return (ret); +} diff --git a/src/env/env_open.c b/src/env/env_open.c new file mode 100644 index 00000000..7eddca3a --- /dev/null +++ b/src/env/env_open.c @@ -0,0 +1,1262 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/crypto.h" +#include "dbinc/db_page.h" +#include "dbinc/btree.h" +#include "dbinc/lock.h" +#include "dbinc/mp.h" +#include "dbinc/txn.h" + +static int __env_open_arg __P((DB_ENV *, u_int32_t)); +static int __file_handle_cleanup __P((ENV *)); + +/* + * db_version -- + * Return legacy version information, including DB Major Version, + * DB Minor Version, and DB Patch/Build numbers. + * + * EXTERN: char *db_version __P((int *, int *, int *)); + */ +char * +db_version(majverp, minverp, patchp) + int *majverp, *minverp, *patchp; +{ + if (majverp != NULL) + *majverp = DB_VERSION_MAJOR; + if (minverp != NULL) + *minverp = DB_VERSION_MINOR; + if (patchp != NULL) + *patchp = DB_VERSION_PATCH; + return ((char *)DB_VERSION_STRING); +} + +/* + * db_full_version -- + * Return complete version information, including Oracle Family, + * Oracle Release, DB Major Version, DB Minor Version, and DB + * Patch/Build numbers. 
+ * + * EXTERN: char *db_full_version __P((int *, int *, int *, int *, int *)); + */ +char * +db_full_version(familyp, releasep, majverp, minverp, patchp) + int *familyp, *releasep, *majverp, *minverp, *patchp; +{ + if (familyp != NULL) + *familyp = DB_VERSION_FAMILY; + if (releasep != NULL) + *releasep = DB_VERSION_RELEASE; + if (majverp != NULL) + *majverp = DB_VERSION_MAJOR; + if (minverp != NULL) + *minverp = DB_VERSION_MINOR; + if (patchp != NULL) + *patchp = DB_VERSION_PATCH; + return ((char *)DB_VERSION_FULL_STRING); +} + +/* + * __env_open_pp -- + * DB_ENV->open pre/post processing. + * + * PUBLIC: int __env_open_pp __P((DB_ENV *, const char *, u_int32_t, int)); + */ +int +__env_open_pp(dbenv, db_home, flags, mode) + DB_ENV *dbenv; + const char *db_home; + u_int32_t flags; + int mode; +{ + ENV *env; + int ret; + + env = dbenv->env; + ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->open"); + +#undef OKFLAGS +#define OKFLAGS \ + (DB_CREATE | DB_FAILCHK | DB_FAILCHK_ISALIVE | DB_INIT_CDB | \ + DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_REP | \ + DB_INIT_TXN | DB_LOCKDOWN | DB_NO_CHECKPOINT | DB_PRIVATE | \ + DB_RECOVER | DB_RECOVER_FATAL | DB_REGISTER | DB_SYSTEM_MEM | \ + DB_THREAD | DB_USE_ENVIRON | DB_USE_ENVIRON_ROOT) +#undef OKFLAGS_CDB +#define OKFLAGS_CDB \ + (DB_CREATE | DB_INIT_CDB | DB_INIT_MPOOL | DB_LOCKDOWN | \ + DB_PRIVATE | DB_SYSTEM_MEM | DB_THREAD | \ + DB_USE_ENVIRON | DB_USE_ENVIRON_ROOT) + + if ((ret = __db_fchk(env, "DB_ENV->open", flags, OKFLAGS)) != 0) + return (ret); + if ((ret = __db_fcchk( + env, "DB_ENV->open", flags, DB_INIT_CDB, ~OKFLAGS_CDB)) != 0) + return (ret); + +#if defined(HAVE_MIXED_SIZE_ADDRESSING) && (SIZEOF_CHAR_P == 8) + if (F_ISSET(env, DB_PRIVATE)) { + __db_errx(env, DB_STR("1589", "DB_PRIVATE is not " + "supported by 64-bit applications in " + "mixed-size-addressing mode")); + return (EINVAL); + } +#endif + + return (__env_open(dbenv, db_home, flags, mode)); +} + +/* + * __env_open -- + * DB_ENV->open. 
+ * + * PUBLIC: int __env_open __P((DB_ENV *, const char *, u_int32_t, int)); + */ +int +__env_open(dbenv, db_home, flags, mode) + DB_ENV *dbenv; + const char *db_home; + u_int32_t flags; + int mode; +{ + DB_THREAD_INFO *ip; + ENV *env; + u_int32_t orig_flags; + int register_recovery, ret, t_ret; + + ip = NULL; + env = dbenv->env; + register_recovery = 0; + + /* Initial configuration. */ + if ((ret = __env_config(dbenv, db_home, &flags, mode)) != 0) + return (ret); + + /* + * Save the DB_ENV handle's configuration flags as set by user-called + * configuration methods and the environment directory's DB_CONFIG + * file. If we use this DB_ENV structure to recover the existing + * environment or to remove an environment we created after failure, + * we'll restore the DB_ENV flags to these values. + */ + orig_flags = dbenv->flags; + + /* Check open flags. */ + if ((ret = __env_open_arg(dbenv, flags)) != 0) + return (ret); + + /* + * If we're going to register with the environment, that's the first + * thing we do. + */ + if (LF_ISSET(DB_REGISTER)) { + /* + * Through the SQL interface (btree.c) we set + * DB_FAILCHK_ISALIVE. When set, we want to run failchk + * if a recovery is needed. Set up the infrastructure to run + * it. SQL applications have no way to specify the thread + * count or an isalive, so force it here. Failchk is run + * inside of register code. + */ + if (LF_ISSET(DB_FAILCHK_ISALIVE)) { + (void)__env_set_thread_count(dbenv, 50); + dbenv->is_alive = __envreg_isalive; + } + + if ((ret = + __envreg_register(env, ®ister_recovery, flags)) != 0) + goto err; + if (register_recovery) { + if (!LF_ISSET(DB_RECOVER)) { + __db_errx(env, DB_STR("1567", + "The DB_RECOVER flag was not specified, and recovery is needed")); + ret = DB_RUNRECOVERY; + goto err; + } + } else + LF_CLR(DB_RECOVER); + } + + /* + * If we're doing recovery, destroy the environment so that we create + * all the regions from scratch. 
The major concern I have is if the + * application stomps the environment with a rogue pointer. We have + * no way of detecting that, and we could be forced into a situation + * where we start up and then crash, repeatedly. + * + * We do not check any flags like DB_PRIVATE before calling remove. + * We don't care if the current environment was private or not, we + * want to remove files left over for any reason, from any session. + */ +retry: if (LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL)) +#ifdef HAVE_REPLICATION + if ((ret = __rep_reset_init(env)) != 0 || + (ret = __env_remove_env(env)) != 0 || +#else + if ((ret = __env_remove_env(env)) != 0 || +#endif + (ret = __env_refresh(dbenv, orig_flags, 0)) != 0) + goto err; + + if ((ret = __env_attach_regions(dbenv, flags, orig_flags, 1)) != 0) + goto err; + + /* + * After attached to env, run failchk if not doing register + * recovery. Not providing this option with the DB_FAILCHK_ISALIVE + * flag. + */ + if (LF_ISSET(DB_FAILCHK) && !register_recovery) { + ENV_ENTER(env, ip); + if ((ret = __env_failchk_int(dbenv)) != 0) + goto err; + ENV_LEAVE(env, ip); + } + +err: if (ret != 0) + (void)__env_refresh(dbenv, orig_flags, 0); + + if (register_recovery) { + /* + * If recovery succeeded, release our exclusive lock, other + * processes can now proceed. + * + * If recovery failed, unregister now and let another process + * clean up. + */ + if (ret == 0 && (t_ret = __envreg_xunlock(env)) != 0) + ret = t_ret; + if (ret != 0) + (void)__envreg_unregister(env, 1); + } + + /* + * If the open is called with DB_REGISTER we can potentially skip + * running recovery on a panicked environment. We can't check the panic + * bit earlier since checking requires opening the environment. + * Only retry if DB_RECOVER was specified - the register_recovery flag + * indicates that. 
 */
	if (ret == DB_RUNRECOVERY && !register_recovery &&
	    !LF_ISSET(DB_RECOVER) && LF_ISSET(DB_REGISTER)) {
		LF_SET(DB_RECOVER);
		goto retry;
	}

	return (ret);
}

/*
 * __env_open_arg --
 *	DB_ENV->open flags checking.
 *
 * Sanity-check the caller's flag combination before any region work is
 * done: DB_REGISTER, replication, recovery and DB_FAILCHK each require
 * specific supporting flags or prior handle configuration.  Returns 0
 * when the combination is legal, EINVAL (or the __db_fcchk error)
 * otherwise.
 */
static int
__env_open_arg(dbenv, flags)
	DB_ENV *dbenv;		/* Environment handle being opened. */
	u_int32_t flags;	/* DB_ENV->open flags to validate. */
{
	ENV *env;
	int ret;

	env = dbenv->env;
	ret = 0;

	if (LF_ISSET(DB_REGISTER)) {
		if (!__os_support_db_register()) {
			__db_errx(env, DB_STR("1568",
    "Berkeley DB library does not support DB_REGISTER on this system"));
			return (EINVAL);
		}
		/* DB_REGISTER is incompatible with private/system memory. */
		if ((ret = __db_fcchk(env, "DB_ENV->open", flags,
		    DB_PRIVATE, DB_REGISTER | DB_SYSTEM_MEM)) != 0)
			return (ret);
		if (LF_ISSET(DB_CREATE) && !LF_ISSET(DB_INIT_TXN)) {
			__db_errx(env, DB_STR("1569",
			    "registration requires transaction support"));
			return (EINVAL);
		}
	}
	/*
	 * Only check for flags compatible with DB_INIT_REP when creating
	 * since otherwise it'll be ignored anyway.
	 */
	if (LF_ISSET(DB_INIT_REP) && LF_ISSET(DB_CREATE)) {
		if (!__os_support_replication()) {
			__db_errx(env, DB_STR("1570",
    "Berkeley DB library does not support replication on this system"));
			return (EINVAL);
		}
		if (!LF_ISSET(DB_INIT_LOCK)) {
			__db_errx(env, DB_STR("1571",
			    "replication requires locking support"));
			return (EINVAL);
		}
		if (!LF_ISSET(DB_INIT_TXN)) {
			__db_errx(env, DB_STR("1572",
			    "replication requires transaction support"));
			return (EINVAL);
		}
	}
	if (LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL)) {
		/* The two recovery flavors are mutually exclusive. */
		if ((ret = __db_fcchk(env,
		    "DB_ENV->open", flags, DB_RECOVER, DB_RECOVER_FATAL)) != 0)
			return (ret);
		if ((ret = __db_fcchk(env,
		    "DB_ENV->open", flags, DB_REGISTER, DB_RECOVER_FATAL)) != 0)
			return (ret);
		if (!LF_ISSET(DB_CREATE)) {
			__db_errx(env, DB_STR("1573",
			    "recovery requires the create flag"));
			return (EINVAL);
		}
		if (!LF_ISSET(DB_INIT_TXN)) {
			__db_errx(env, DB_STR("1574",
			    "recovery requires transaction support"));
			return (EINVAL);
		}
	}
	if (LF_ISSET(DB_FAILCHK)) {
		/* Failchk needs both an is_alive callback and a thread count. */
		if (!ALIVE_ON(env)) {
			__db_errx(env, DB_STR("1575",
		"DB_FAILCHK requires DB_ENV->is_alive be configured"));
			return (EINVAL);
		}
		if (dbenv->thr_max == 0) {
			__db_errx(env, DB_STR("1576",
	"DB_FAILCHK requires DB_ENV->set_thread_count be configured"));
			return (EINVAL);
		}
	}

#ifdef HAVE_MUTEX_THREAD_ONLY
	/*
	 * Currently we support one kind of mutex that is intra-process only,
	 * POSIX 1003.1 pthreads, because a variety of systems don't support
	 * the full pthreads API, and our only alternative is test-and-set.
	 */
	if (!LF_ISSET(DB_PRIVATE)) {
		__db_errx(env, DB_STR("1577",
"Berkeley DB library configured to support only private environments"));
		return (EINVAL);
	}
#endif

#ifdef HAVE_MUTEX_FCNTL
	/*
	 * !!!
	 * We need a file descriptor for fcntl(2) locking.  We use the file
	 * handle from the REGENV file for this purpose.
	 *
	 * Since we may be using shared memory regions, e.g., shmget(2), and
	 * not a mapped-in regular file, the backing file may be only a few
	 * bytes in length.  So, this depends on the ability to call fcntl to
	 * lock file offsets much larger than the actual physical file.  I
	 * think that's safe -- besides, very few systems actually need this
	 * kind of support, SunOS is the only one still in wide use of which
	 * I'm aware.
	 *
	 * The error case is if an application lacks spinlocks and wants to be
	 * threaded.  That doesn't work because fcntl will lock the underlying
	 * process, including all its threads.
	 */
	if (F_ISSET(env, ENV_THREAD)) {
		__db_errx(env, DB_STR("1578",
"architecture lacks fast mutexes: applications cannot be threaded"));
		return (EINVAL);
	}
#endif
	return (ret);
}

/*
 * __env_remove --
 *	DB_ENV->remove.
 * Removes the environment regions after validating the flags.  The
 * DB_ENV handle is closed before returning, even on error.
 *
 * PUBLIC: int __env_remove __P((DB_ENV *, const char *, u_int32_t));
 */
int
__env_remove(dbenv, db_home, flags)
	DB_ENV *dbenv;		/* Unopened environment handle. */
	const char *db_home;	/* Environment home directory, may be NULL. */
	u_int32_t flags;	/* DB_FORCE | DB_USE_ENVIRON[_ROOT]. */
{
	ENV *env;
	int ret, t_ret;

	env = dbenv->env;

#undef	OKFLAGS
#define	OKFLAGS \
	(DB_FORCE | DB_USE_ENVIRON | DB_USE_ENVIRON_ROOT)

	/* Validate arguments. */
	if ((ret = __db_fchk(env, "DB_ENV->remove", flags, OKFLAGS)) != 0)
		return (ret);

	/* Remove is only legal on a handle that was never opened. */
	ENV_ILLEGAL_AFTER_OPEN(env, "DB_ENV->remove");

	if ((ret = __env_config(dbenv, db_home, &flags, 0)) != 0)
		return (ret);

	/*
	 * Turn the environment off -- if the environment is corrupted, this
	 * could fail.  Ignore any error if we're forcing the question.
	 */
	if ((ret = __env_turn_off(env, flags)) == 0 || LF_ISSET(DB_FORCE))
		ret = __env_remove_env(env);

	/* Always close the handle; preserve the first error seen. */
	if ((t_ret = __env_close(dbenv, 0)) != 0 && ret == 0)
		ret = t_ret;

	return (ret);
}

/*
 * __env_config --
 *	Argument-based initialization: resolve the home directory, set the
 *	file mode, and read DB_CONFIG.  *flagsp may be updated to reflect
 *	flags changed while reading the DB_CONFIG file.
 *
 * PUBLIC: int __env_config __P((DB_ENV *, const char *, u_int32_t *, int));
 */
int
__env_config(dbenv, db_home, flagsp, mode)
	DB_ENV *dbenv;
	const char *db_home;	/* Explicit home directory, may be NULL. */
	u_int32_t *flagsp;	/* In/out: DB_ENV->open flags. */
	int mode;		/* File mode; 0 means the DB_MODE_660 default. */
{
	ENV *env;
	int ret;
	u_int32_t flags;
	char *home, home_buf[DB_MAXPATHLEN];

	env = dbenv->env;
	flags = *flagsp;

	/*
	 * Set the database home.
	 *
	 * Use db_home by default, this allows utilities to reasonably
	 * override the environment either explicitly or by using a -h
	 * option.  Otherwise, use the environment if it's permitted
	 * and initialized.
	 */
	home = (char *)db_home;
	if (home == NULL && (LF_ISSET(DB_USE_ENVIRON) ||
	    (LF_ISSET(DB_USE_ENVIRON_ROOT) && __os_isroot()))) {
		home = home_buf;
		if ((ret = __os_getenv(
		    env, "DB_HOME", &home, sizeof(home_buf))) != 0)
			return (ret);
		/*
		 * home set to NULL if __os_getenv failed to find DB_HOME.
		 */
	}
	if (home != NULL) {
		/* Replace any previously configured home path. */
		if (env->db_home != NULL)
			__os_free(env, env->db_home);
		if ((ret = __os_strdup(env, home, &env->db_home)) != 0)
			return (ret);
	}

	/* Save a copy of the DB_ENV->open method flags. */
	env->open_flags = flags;

	/* Default permissions are read-write for both owner and group. */
	env->db_mode = mode == 0 ? DB_MODE_660 : mode;

	/* Read the DB_CONFIG file. */
	if ((ret = __env_read_db_config(env)) != 0)
		return (ret);

	/*
	 * Update the DB_ENV->open method flags.  The copy of the flags might
	 * have been changed during reading DB_CONFIG file.
	 */
	flags = env->open_flags;

	/*
	 * If no temporary directory path was specified in the config file,
	 * choose one.
	 */
	if (dbenv->db_tmp_dir == NULL && (ret = __os_tmpdir(env, flags)) != 0)
		return (ret);

	*flagsp = flags;
	return (0);
}

/*
 * __env_close_pp --
 *	DB_ENV->close pre/post processor.
 *
 * As a handle destructor this cannot fail outright: argument errors are
 * noted but teardown proceeds regardless.
 *
 * PUBLIC: int __env_close_pp __P((DB_ENV *, u_int32_t));
 */
int
__env_close_pp(dbenv, flags)
	DB_ENV *dbenv;
	u_int32_t flags;	/* 0 or DB_FORCESYNC. */
{
	DB_THREAD_INFO *ip;
	ENV *env;
	int rep_check, ret, t_ret;
	u_int32_t close_flags, flags_orig;

	env = dbenv->env;
	ret = 0;
	close_flags = flags_orig = 0;

	/*
	 * Validate arguments, but as a DB_ENV handle destructor, we can't
	 * fail.
	 */
	if (flags != 0 && flags != DB_FORCESYNC &&
	    (t_ret = __db_ferr(env, "DB_ENV->close", 0)) != 0 && ret == 0)
		ret = t_ret;

#define	DBENV_FORCESYNC		0x00000001
#define	DBENV_CLOSE_REPCHECK	0x00000010
	if (flags == DB_FORCESYNC)
		close_flags |= DBENV_FORCESYNC;

	/*
	 * If the environment has panic'd, all we do is try and discard
	 * the important resources.
	 */
	if (PANIC_ISSET(env)) {
		/* clean up from registry file */
		if (dbenv->registry != NULL) {
			/*
			 * Temporarily set no panic so we do not trigger the
			 * LAST_PANIC_CHECK_BEFORE_IO check in __os_physwr
			 * thus allowing the unregister to happen correctly.
			 */
			flags_orig = F_ISSET(dbenv, DB_ENV_NOPANIC);
			F_SET(dbenv, DB_ENV_NOPANIC);
			(void)__envreg_unregister(env, 0);
			dbenv->registry = NULL;
			if (!flags_orig)
				F_CLR(dbenv, DB_ENV_NOPANIC);
		}

		/* Close all underlying threads and sockets. */
		if (IS_ENV_REPLICATED(env))
			(void)__repmgr_close(env);

		/* Close all underlying file handles. */
		(void)__file_handle_cleanup(env);

		/* Report the panic to the caller. */
		PANIC_CHECK(env);
	}

	ENV_ENTER(env, ip);

	rep_check = IS_ENV_REPLICATED(env) ? 1 : 0;
	if (rep_check) {
#ifdef HAVE_REPLICATION_THREADS
		/*
		 * Shut down Replication Manager threads first of all.  This
		 * must be done before __env_rep_enter to avoid a deadlock that
		 * could occur if repmgr's background threads try to do a rep
		 * operation that needs __rep_lockout.
		 */
		if ((t_ret = __repmgr_close(env)) != 0 && ret == 0)
			ret = t_ret;
#endif
		if ((t_ret = __env_rep_enter(env, 0)) != 0 && ret == 0)
			ret = t_ret;
	}

	if (rep_check)
		close_flags |= DBENV_CLOSE_REPCHECK;
	if ((t_ret = __env_close(dbenv, close_flags)) != 0 && ret == 0)
		ret = t_ret;

	/* Don't ENV_LEAVE as we have already detached from the region. */
	return (ret);
}

/*
 * __env_close --
 *	DB_ENV->close.
 *
 * Errors along the way are recorded (first one wins) but never stop the
 * teardown; the DB_ENV structure is always destroyed on return.
 *
 * PUBLIC: int __env_close __P((DB_ENV *, u_int32_t));
 */
int
__env_close(dbenv, flags)
	DB_ENV *dbenv;
	u_int32_t flags;	/* DBENV_FORCESYNC | DBENV_CLOSE_REPCHECK. */
{
	DB *dbp;
	ENV *env;
	int ret, rep_check, t_ret;
	char **p;
	u_int32_t close_flags;

	env = dbenv->env;
	ret = 0;
	close_flags = LF_ISSET(DBENV_FORCESYNC) ? 0 : DB_NOSYNC;
	rep_check = LF_ISSET(DBENV_CLOSE_REPCHECK);

	/*
	 * Check to see if we were in the middle of restoring transactions and
	 * need to close the open files.
	 */
	if (TXN_ON(env) && (t_ret = __txn_preclose(env)) != 0 && ret == 0)
		ret = t_ret;

#ifdef HAVE_REPLICATION
	if ((t_ret = __rep_env_close(env)) != 0 && ret == 0)
		ret = t_ret;
#endif

	/*
	 * Close all databases opened in this environment after the rep region
	 * is closed.
 Rep region's internal database is already closed now.
	 */
	while ((dbp = TAILQ_FIRST(&env->dblist)) != NULL) {
		/*
		 * Do not close the handle on a database partition, since it
		 * will be closed when closing the handle on the main database.
		 */
		while (dbp != NULL && F_ISSET(dbp, DB_AM_PARTDB))
			dbp = TAILQ_NEXT(dbp, dblistlinks);
		DB_ASSERT(env, dbp != NULL);
		/*
		 * Note down and ignore the error code.  Since we can't do
		 * anything about the dbp handle anyway if the close
		 * operation fails.  But we want to return the error to the
		 * caller.  This is how this function takes care of various
		 * close operation errors.
		 */
		if (dbp->alt_close != NULL)
			t_ret = dbp->alt_close(dbp, close_flags);
		else
			t_ret = __db_close(dbp, NULL, close_flags);
		if (t_ret != 0 && ret == 0)
			ret = t_ret;
	}

	/*
	 * Detach from the regions and undo the allocations done by
	 * DB_ENV->open.
	 */
	if ((t_ret = __env_refresh(dbenv, 0, rep_check)) != 0 && ret == 0)
		ret = t_ret;

#ifdef HAVE_CRYPTO
	/*
	 * Crypto comes last, because higher level close functions need
	 * cryptography.
	 */
	if ((t_ret = __crypto_env_close(env)) != 0 && ret == 0)
		ret = t_ret;
#endif

	/* If we're registered, clean up. */
	if (dbenv->registry != NULL) {
		(void)__envreg_unregister(env, 0);
		dbenv->registry = NULL;
	}

	/* Check we've closed all underlying file handles. */
	if ((t_ret = __file_handle_cleanup(env)) != 0 && ret == 0)
		ret = t_ret;

	/* Release any string-based configuration parameters we've copied.
*/ + if (dbenv->db_log_dir != NULL) + __os_free(env, dbenv->db_log_dir); + dbenv->db_log_dir = NULL; + if (dbenv->db_tmp_dir != NULL) + __os_free(env, dbenv->db_tmp_dir); + dbenv->db_tmp_dir = NULL; + if (dbenv->db_md_dir != NULL) + __os_free(env, dbenv->db_md_dir); + dbenv->db_md_dir = NULL; + if (dbenv->db_data_dir != NULL) { + for (p = dbenv->db_data_dir; *p != NULL; ++p) + __os_free(env, *p); + __os_free(env, dbenv->db_data_dir); + dbenv->db_data_dir = NULL; + dbenv->data_next = 0; + } + if (dbenv->intermediate_dir_mode != NULL) + __os_free(env, dbenv->intermediate_dir_mode); + if (env->db_home != NULL) { + __os_free(env, env->db_home); + env->db_home = NULL; + } + + if (env->backup_handle != NULL) { + __os_free(env, env->backup_handle); + env->backup_handle = NULL; + } + + /* Discard the structure. */ + __db_env_destroy(dbenv); + + return (ret); +} + +/* + * __env_refresh -- + * Refresh the DB_ENV structure. + * PUBLIC: int __env_refresh __P((DB_ENV *, u_int32_t, int)); + */ +int +__env_refresh(dbenv, orig_flags, rep_check) + DB_ENV *dbenv; + u_int32_t orig_flags; + int rep_check; +{ + DB *ldbp; + DB_THREAD_INFO *ip; + ENV *env; + int ret, t_ret; + + env = dbenv->env; + ret = 0; + + /* + * Release resources allocated by DB_ENV->open, and return it to the + * state it was in just before __env_open was called. (This means + * state set by pre-open configuration functions must be preserved.) + * + * Refresh subsystems, in the reverse order they were opened (txn + * must be first, it may want to discard locks and flush the log). + * + * !!! + * Note that these functions, like all of __env_refresh, only undo + * the effects of __env_open. Functions that undo work done by + * db_env_create or by a configuration function should go in + * __env_close. 
+ */ + if (TXN_ON(env) && + (t_ret = __txn_env_refresh(env)) != 0 && ret == 0) + ret = t_ret; + + if (LOGGING_ON(env) && + (t_ret = __log_env_refresh(env)) != 0 && ret == 0) + ret = t_ret; + + /* + * Locking should come after logging, because closing log results + * in files closing which may require locks being released. + */ + if (LOCKING_ON(env)) { + if (!F_ISSET(env, ENV_THREAD) && + env->env_lref != NULL && (t_ret = + __lock_id_free(env, env->env_lref)) != 0 && ret == 0) + ret = t_ret; + env->env_lref = NULL; + + if ((t_ret = __lock_env_refresh(env)) != 0 && ret == 0) + ret = t_ret; + } + + /* Discard the DB_ENV, ENV handle mutexes. */ + if ((t_ret = __mutex_free(env, &dbenv->mtx_db_env)) != 0 && ret == 0) + ret = t_ret; + if ((t_ret = __mutex_free(env, &env->mtx_env)) != 0 && ret == 0) + ret = t_ret; + + /* + * Discard DB list and its mutex. + * Discard the MT mutex. + * + * !!! + * This must be done after we close the log region, because we close + * database handles and so acquire this mutex when we close log file + * handles. + */ + if (env->db_ref != 0) { + __db_errx(env, DB_STR("1579", + "Database handles still open at environment close")); + TAILQ_FOREACH(ldbp, &env->dblist, dblistlinks) + __db_errx(env, DB_STR_A("1580", + "Open database handle: %s%s%s", "%s %s %s"), + ldbp->fname == NULL ? "unnamed" : ldbp->fname, + ldbp->dname == NULL ? "" : "/", + ldbp->dname == NULL ? "" : ldbp->dname); + if (ret == 0) + ret = EINVAL; + } + TAILQ_INIT(&env->dblist); + if ((t_ret = __mutex_free(env, &env->mtx_dblist)) != 0 && ret == 0) + ret = t_ret; + if ((t_ret = __mutex_free(env, &env->mtx_mt)) != 0 && ret == 0) + ret = t_ret; + + if (env->mt != NULL) { + __os_free(env, env->mt); + env->mt = NULL; + } + + if (MPOOL_ON(env)) { + /* + * If it's a private environment, flush the contents to disk. + * Recovery would have put everything back together, but it's + * faster and cleaner to flush instead. 
+ * + * Ignore application max-write configuration, we're shutting + * down. + */ + if (F_ISSET(env, ENV_PRIVATE) && + !F_ISSET(dbenv, DB_ENV_NOFLUSH) && + (t_ret = __memp_sync_int(env, NULL, 0, + DB_SYNC_CACHE | DB_SYNC_SUPPRESS_WRITE, NULL, NULL)) != 0 && + ret == 0) + ret = t_ret; + + if ((t_ret = __memp_env_refresh(env)) != 0 && ret == 0) + ret = t_ret; + } + + /* + * If we're included in a shared replication handle count, this + * is our last chance to decrement that count. + * + * !!! + * We can't afford to do anything dangerous after we decrement the + * handle count, of course, as replication may be proceeding with + * client recovery. However, since we're discarding the regions + * as soon as we drop the handle count, there's little opportunity + * to do harm. + */ + if (rep_check && (t_ret = __env_db_rep_exit(env)) != 0 && ret == 0) + ret = t_ret; + + /* + * Refresh the replication region. + * + * Must come after we call __env_db_rep_exit above. + */ + if (REP_ON(env) && (t_ret = __rep_env_refresh(env)) != 0 && ret == 0) + ret = t_ret; + +#ifdef HAVE_CRYPTO + /* + * Crypto comes last, because higher level close functions need + * cryptography. + */ + if (env->reginfo != NULL && + (t_ret = __crypto_env_refresh(env)) != 0 && ret == 0) + ret = t_ret; +#endif + + /* + * Mark the thread as out of the env before we get rid of the handles + * needed to do so. + */ + if (env->thr_hashtab != NULL && + (t_ret = __env_set_state(env, &ip, THREAD_OUT)) != 0 && ret == 0) + ret = t_ret; + + /* + * We are about to detach from the mutex region. This is the last + * chance we have to acquire/destroy a mutex -- acquire/destroy the + * mutex and release our reference. + * + * !!! + * There are two DbEnv methods that care about environment reference + * counts: DbEnv.close and DbEnv.remove. 
The DbEnv.close method is + * not a problem because it only decrements the reference count and + * no actual resources are discarded -- lots of threads of control + * can call DbEnv.close at the same time, and regardless of racing + * on the reference count mutex, we wouldn't have a problem. Since + * the DbEnv.remove method actually discards resources, we can have + * a problem. + * + * If we decrement the reference count to 0 here, go to sleep, and + * the DbEnv.remove method is called, by the time we run again, the + * underlying shared regions could have been removed. That's fine, + * except we might actually need the regions to resolve outstanding + * operations in the various subsystems, and if we don't have hard + * OS references to the regions, we could get screwed. Of course, + * we should have hard OS references to everything we need, but just + * in case, we put off decrementing the reference count as long as + * possible. + */ + if ((t_ret = __env_ref_decrement(env)) != 0 && ret == 0) + ret = t_ret; + +#ifdef HAVE_MUTEX_SUPPORT + if (MUTEX_ON(env) && + (t_ret = __mutex_env_refresh(env)) != 0 && ret == 0) + ret = t_ret; +#endif + /* Free memory for thread tracking. */ + if (env->reginfo != NULL) { + if (F_ISSET(env, ENV_PRIVATE)) { + __env_thread_destroy(env); + t_ret = __env_detach(env, 1); + } else + t_ret = __env_detach(env, 0); + + if (t_ret != 0 && ret == 0) + ret = t_ret; + + /* + * !!! + * Don't free env->reginfo or set the reference to NULL, + * that was done by __env_detach(). 
		 */
	}

	/* Free the recovery dispatch tables allocated by __env_open. */
	if (env->recover_dtab.int_dispatch != NULL) {
		__os_free(env, env->recover_dtab.int_dispatch);
		env->recover_dtab.int_size = 0;
		env->recover_dtab.int_dispatch = NULL;
	}
	if (env->recover_dtab.ext_dispatch != NULL) {
		__os_free(env, env->recover_dtab.ext_dispatch);
		env->recover_dtab.ext_size = 0;
		env->recover_dtab.ext_dispatch = NULL;
	}

	/* Restore the pre-open flags the caller saved. */
	dbenv->flags = orig_flags;

	return (ret);
}

/*
 * __file_handle_cleanup --
 *	Close any underlying open file handles so we don't leak system
 *	resources.
 *
 * Returns 0 if no handles were open, otherwise reports each leaked
 * handle, force-closes it, and returns EINVAL.
 */
static int
__file_handle_cleanup(env)
	ENV *env;
{
	DB_FH *fhp;

	if (TAILQ_FIRST(&env->fdlist) == NULL)
		return (0);

	__db_errx(env, DB_STR("1581",
	    "File handles still open at environment close"));
	while ((fhp = TAILQ_FIRST(&env->fdlist)) != NULL) {
		__db_errx(env, DB_STR_A("1582", "Open file handle: %s", "%s"),
		    fhp->name);
		(void)__os_closehandle(env, fhp);
	}
	return (EINVAL);
}

/*
 * __env_get_open_flags --
 *	DbEnv.get_open_flags method.
 *
 * PUBLIC: int __env_get_open_flags __P((DB_ENV *, u_int32_t *));
 */
int
__env_get_open_flags(dbenv, flagsp)
	DB_ENV *dbenv;
	u_int32_t *flagsp;	/* Out: flags the environment was opened with. */
{
	ENV *env;

	env = dbenv->env;

	/* Only meaningful once the environment has been opened. */
	ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->get_open_flags");

	*flagsp = env->open_flags;
	return (0);
}
/*
 * __env_attach_regions --
 *	Perform attaches to env and required regions (subsystems)
 *
 * PUBLIC: int __env_attach_regions __P((DB_ENV *, u_int32_t, u_int32_t, int));
 */
int
__env_attach_regions(dbenv, flags, orig_flags, retry_ok)
	DB_ENV *dbenv;
	u_int32_t flags;	/* DB_ENV->open flags. */
	u_int32_t orig_flags;	/* Flags to restore on failure/refresh. */
	int retry_ok;		/* Non-zero if __env_attach may retry. */
{
	DB_THREAD_INFO *ip;
	ENV *env;
	REGINFO *infop;
	u_int32_t init_flags;
	int create_ok, rep_check, ret;

	ip = NULL;
	env = dbenv->env;
	rep_check = 0;

	/* Convert the DB_ENV->open flags to internal flags. */
	create_ok = LF_ISSET(DB_CREATE) ?
1 : 0; + if (LF_ISSET(DB_LOCKDOWN)) + F_SET(env, ENV_LOCKDOWN); + if (LF_ISSET(DB_PRIVATE)) + F_SET(env, ENV_PRIVATE); + if (LF_ISSET(DB_RECOVER_FATAL)) + F_SET(env, ENV_RECOVER_FATAL); + if (LF_ISSET(DB_SYSTEM_MEM)) + F_SET(env, ENV_SYSTEM_MEM); + if (LF_ISSET(DB_THREAD)) + F_SET(env, ENV_THREAD); + + /* + * Create/join the environment. We pass in the flags of interest to + * a thread subsequently joining an environment we create. If we're + * not the ones to create the environment, our flags will be updated + * to match the existing environment. + */ + init_flags = 0; + if (LF_ISSET(DB_INIT_CDB)) + FLD_SET(init_flags, DB_INITENV_CDB); + if (F_ISSET(dbenv, DB_ENV_CDB_ALLDB)) + FLD_SET(init_flags, DB_INITENV_CDB_ALLDB); + if (LF_ISSET(DB_INIT_LOCK)) + FLD_SET(init_flags, DB_INITENV_LOCK); + if (LF_ISSET(DB_INIT_LOG)) + FLD_SET(init_flags, DB_INITENV_LOG); + if (LF_ISSET(DB_INIT_MPOOL)) + FLD_SET(init_flags, DB_INITENV_MPOOL); + if (LF_ISSET(DB_INIT_REP)) + FLD_SET(init_flags, DB_INITENV_REP); + if (LF_ISSET(DB_INIT_TXN)) + FLD_SET(init_flags, DB_INITENV_TXN); + if ((ret = __env_attach(env, &init_flags, create_ok, retry_ok)) != 0) + goto err; + + /* + * __env_attach will return the saved init_flags field, which contains + * the DB_INIT_* flags used when the environment was created. + * + * We may be joining an environment -- reset our flags to match the + * ones in the environment. + */ + if (FLD_ISSET(init_flags, DB_INITENV_CDB)) + LF_SET(DB_INIT_CDB); + if (FLD_ISSET(init_flags, DB_INITENV_LOCK)) + LF_SET(DB_INIT_LOCK); + if (FLD_ISSET(init_flags, DB_INITENV_LOG)) + LF_SET(DB_INIT_LOG); + if (FLD_ISSET(init_flags, DB_INITENV_MPOOL)) + LF_SET(DB_INIT_MPOOL); + if (FLD_ISSET(init_flags, DB_INITENV_REP)) + LF_SET(DB_INIT_REP); + if (FLD_ISSET(init_flags, DB_INITENV_TXN)) + LF_SET(DB_INIT_TXN); + if (FLD_ISSET(init_flags, DB_INITENV_CDB_ALLDB) && + (ret = __env_set_flags(dbenv, DB_CDB_ALLDB, 1)) != 0) + goto err; + + /* Initialize for CDB product. 
*/ + if (LF_ISSET(DB_INIT_CDB)) { + LF_SET(DB_INIT_LOCK); + F_SET(env, ENV_CDB); + } + + /* + * Update the flags to match the database environment. The application + * may have specified flags of 0 to join the environment, and this line + * replaces that value with the flags corresponding to the existing, + * underlying set of subsystems. This means the DbEnv.get_open_flags + * method returns the flags to open the existing environment instead of + * the specific flags passed to the DbEnv.open method. + */ + env->open_flags = flags; + + /* + * The DB_ENV structure has now been initialized. Turn off further + * use of the DB_ENV structure and most initialization methods, we're + * about to act on the values we currently have. + */ + F_SET(env, ENV_OPEN_CALLED); + + infop = env->reginfo; + +#ifdef HAVE_MUTEX_SUPPORT + /* + * Initialize the mutex regions first before ENV_ENTER(). + * Mutexes need to be 'on' when attaching to an existing env + * in order to safely allocate the thread tracking info. + */ + if ((ret = __mutex_open(env, create_ok)) != 0) + goto err; + /* The MUTEX_REQUIRED() in __env_alloc() expects this to be set. */ + infop->mtx_alloc = ((REGENV *)infop->primary)->mtx_regenv; +#endif + /* + * Initialize thread tracking and enter the API. + */ + if ((ret = + __env_thread_init(env, F_ISSET(infop, REGION_CREATE) ? 1 : 0)) != 0) + goto err; + + ENV_ENTER(env, ip); + + /* + * Initialize the subsystems. + */ + /* + * We can now acquire/create mutexes: increment the region's reference + * count. + */ + if ((ret = __env_ref_increment(env)) != 0) + goto err; + + /* + * Initialize the handle mutexes. + */ + if ((ret = __mutex_alloc(env, + MTX_ENV_HANDLE, DB_MUTEX_PROCESS_ONLY, &dbenv->mtx_db_env)) != 0 || + (ret = __mutex_alloc(env, + MTX_ENV_HANDLE, DB_MUTEX_PROCESS_ONLY, &env->mtx_env)) != 0) + goto err; + + /* + * Initialize the replication area next, so that we can lock out this + * call if we're currently running recovery for replication. 
+ */ + if (LF_ISSET(DB_INIT_REP) && (ret = __rep_open(env)) != 0) + goto err; + + rep_check = IS_ENV_REPLICATED(env) ? 1 : 0; + if (rep_check && (ret = __env_rep_enter(env, 0)) != 0) + goto err; + + if (LF_ISSET(DB_INIT_MPOOL)) { + if ((ret = __memp_open(env, create_ok)) != 0) + goto err; + + /* + * BDB does do cache I/O during recovery and when starting up + * replication. If creating a new environment, then suppress + * any application max-write configuration. + */ + if (create_ok) + (void)__memp_set_config( + dbenv, DB_MEMP_SUPPRESS_WRITE, 1); + + /* + * Initialize the DB list and its mutex. If the mpool is + * not initialized, we can't ever open a DB handle, which + * is why this code lives here. + */ + TAILQ_INIT(&env->dblist); + if ((ret = __mutex_alloc(env, MTX_ENV_DBLIST, + DB_MUTEX_PROCESS_ONLY, &env->mtx_dblist)) != 0) + goto err; + + /* Register DB's pgin/pgout functions. */ + if ((ret = __memp_register( + env, DB_FTYPE_SET, __db_pgin, __db_pgout)) != 0) + goto err; + } + + /* + * Initialize the ciphering area prior to any running of recovery so + * that we can initialize the keys, etc. before recovery, including + * the MT mutex. + * + * !!! + * This must be after the mpool init, but before the log initialization + * because log_open may attempt to run log_recover during its open. + */ + if (LF_ISSET(DB_INIT_MPOOL | DB_INIT_LOG | DB_INIT_TXN) && + (ret = __crypto_region_init(env)) != 0) + goto err; + if ((ret = __mutex_alloc( + env, MTX_TWISTER, DB_MUTEX_PROCESS_ONLY, &env->mtx_mt)) != 0) + goto err; + + /* + * Transactions imply logging but do not imply locking. While almost + * all applications want both locking and logging, it would not be + * unreasonable for a single threaded process to want transactions for + * atomicity guarantees, but not necessarily need concurrency. 
+ */ + if (LF_ISSET(DB_INIT_LOG | DB_INIT_TXN)) + if ((ret = __log_open(env)) != 0) + goto err; + if (LF_ISSET(DB_INIT_LOCK)) + if ((ret = __lock_open(env)) != 0) + goto err; + + if (LF_ISSET(DB_INIT_TXN)) { + if ((ret = __txn_open(env)) != 0) + goto err; + + /* + * If the application is running with transactions, initialize + * the function tables. + */ + if ((ret = __env_init_rec(env, + ((LOG *)env->lg_handle->reginfo.primary)->persist.version)) + != 0) + goto err; + } + + /* Perform recovery for any previous run. */ + if (LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL) && + (ret = __db_apprec(env, ip, NULL, NULL, 1, + LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL | DB_NO_CHECKPOINT))) != 0) + goto err; + + /* + * If we've created the regions, are running with transactions, and did + * not just run recovery, we need to log the fact that the transaction + * IDs got reset. + * + * If we ran recovery, there may be prepared-but-not-yet-committed + * transactions that need to be resolved. Recovery resets the minimum + * transaction ID and logs the reset if that's appropriate, so we + * don't need to do anything here in the recover case. + */ + if (TXN_ON(env) && + !FLD_ISSET(dbenv->lg_flags, DB_LOG_IN_MEMORY) && + F_ISSET(infop, REGION_CREATE) && + !LF_ISSET(DB_RECOVER | DB_RECOVER_FATAL) && + (ret = __txn_reset(env)) != 0) + goto err; + + /* The database environment is ready for business. */ + if ((ret = __env_turn_on(env)) != 0) + goto err; + + if (rep_check) + ret = __env_db_rep_exit(env); + + /* Turn any application-specific max-write configuration back on. */ + if (LF_ISSET(DB_INIT_MPOOL)) + (void)__memp_set_config(dbenv, DB_MEMP_SUPPRESS_WRITE, 0); + +err: if (ret == 0) + ENV_LEAVE(env, ip); + else { + /* + * If we fail after creating regions, panic and remove them. + * + * !!! + * No need to call __env_db_rep_exit, that work is done by the + * calls to __env_refresh. 
+ */ + infop = env->reginfo; + if (infop != NULL && F_ISSET(infop, REGION_CREATE)) { + ret = __env_panic(env, ret); + + /* Refresh the DB_ENV so can use it to call remove. */ + (void)__env_refresh(dbenv, orig_flags, rep_check); + (void)__env_remove_env(env); + (void)__env_refresh(dbenv, orig_flags, 0); + } else + (void)__env_refresh(dbenv, orig_flags, rep_check); + /* clear the fact that the region had been opened */ + F_CLR(env, ENV_OPEN_CALLED); + } + + return (ret); +} diff --git a/src/env/env_recover.c b/src/env/env_recover.c new file mode 100644 index 00000000..9636554a --- /dev/null +++ b/src/env/env_recover.c @@ -0,0 +1,1093 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/db_page.h" +#include "dbinc/fop.h" +#include "dbinc/btree.h" +#include "dbinc/hash.h" +#include "dbinc/heap.h" +#include "dbinc/mp.h" +#include "dbinc/qam.h" +#include "dbinc/txn.h" + +#ifndef lint +static const char copyright[] = + "Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved.\n"; +#endif + +static int __db_log_corrupt __P((ENV *, DB_LSN *)); +static int __env_init_rec_42 __P((ENV *)); +static int __env_init_rec_43 __P((ENV *)); +static int __env_init_rec_46 __P((ENV *)); +static int __env_init_rec_47 __P((ENV *)); +static int __env_init_rec_48 __P((ENV *)); +static int __log_earliest __P((ENV *, DB_LOGC *, int32_t *, DB_LSN *)); + +static double __lsn_diff __P((DB_LSN *, DB_LSN *, DB_LSN *, u_int32_t, int)); +static int __log_backup __P((ENV *, DB_LOGC *, DB_LSN *, DB_LSN*)); + +/* + * __db_apprec -- + * Perform recovery. If max_lsn is non-NULL, then we are trying + * to synchronize this system up with another system that has a max + * LSN of max_lsn, so we need to roll back sufficiently far for that + * to work. See __log_backup for details. 
+ * + * PUBLIC: int __db_apprec __P((ENV *, + * PUBLIC: DB_THREAD_INFO *, DB_LSN *, DB_LSN *, int, u_int32_t)); + */ +int +__db_apprec(env, ip, max_lsn, trunclsn, update, flags) + ENV *env; + DB_THREAD_INFO *ip; + DB_LSN *max_lsn, *trunclsn; + int update; + u_int32_t flags; +{ + DBT data; + DB_ENV *dbenv; + DB_LOGC *logc; + DB_LSN ckp_lsn, first_lsn, last_lsn, lowlsn, lsn, stop_lsn, tlsn; + DB_LSN *vtrunc_ckp, *vtrunc_lsn; + DB_TXNHEAD *txninfo; + DB_TXNREGION *region; + REGENV *renv; + REGINFO *infop; + __txn_ckp_args *ckp_args; + time_t now, tlow; + double nfiles; + u_int32_t hi_txn, log_size, txnid; + int32_t low; + int all_recovered, progress, rectype, ret, t_ret; + char *p, *pass; + char t1[CTIME_BUFLEN], t2[CTIME_BUFLEN], time_buf[CTIME_BUFLEN]; + + COMPQUIET(nfiles, (double)0.001); + + dbenv = env->dbenv; + logc = NULL; + ckp_args = NULL; + hi_txn = TXN_MAXIMUM; + txninfo = NULL; + pass = DB_STR_P("initial"); + ZERO_LSN(lsn); + + /* + * XXX + * Get the log size. No locking required because we're single-threaded + * during recovery. + */ + log_size = ((LOG *)env->lg_handle->reginfo.primary)->log_size; + + /* + * If we need to, update the env handle timestamp. + */ + if (update && REP_ON(env)) { + infop = env->reginfo; + renv = infop->primary; + (void)time(&renv->rep_timestamp); + } + + /* Set in-recovery flags. */ + F_SET(env->lg_handle, DBLOG_RECOVER); + region = env->tx_handle->reginfo.primary; + F_SET(region, TXN_IN_RECOVERY); + + /* Allocate a cursor for the log. */ + if ((ret = __log_cursor(env, &logc)) != 0) + goto err; + + /* + * If the user is specifying recovery to a particular point in time + * or to a particular LSN, find the point to start recovery from. 
+ */ + ZERO_LSN(lowlsn); + if (max_lsn != NULL) { + if ((ret = __log_backup(env, logc, max_lsn, &lowlsn)) != 0) + goto err; + } else if (dbenv->tx_timestamp != 0) { + if ((ret = __log_earliest(env, logc, &low, &lowlsn)) != 0) + goto err; + if ((int32_t)dbenv->tx_timestamp < low) { + t1[sizeof(t1) - 1] = '\0'; + (void)strncpy(t1, __os_ctime( + &dbenv->tx_timestamp, time_buf), sizeof(t1) - 1); + if ((p = strchr(t1, '\n')) != NULL) + *p = '\0'; + + t2[sizeof(t2) - 1] = '\0'; + tlow = (time_t)low; + (void)strncpy(t2, __os_ctime( + &tlow, time_buf), sizeof(t2) - 1); + if ((p = strchr(t2, '\n')) != NULL) + *p = '\0'; + + __db_errx(env, DB_STR_A("1509", + "Invalid recovery timestamp %s; earliest time is %s", + "%s %s"), t1, t2); + ret = EINVAL; + goto err; + } + } + + /* + * Recovery is done in three passes: + * Pass #0: + * We need to find the position from which we will open files. + * We need to open files beginning with the earlier of the + * most recent checkpoint LSN and a checkpoint LSN before the + * recovery timestamp, if specified. We need to be before the + * most recent checkpoint LSN because we are going to collect + * information about which transactions were begun before we + * start rolling forward. Those that were should never be undone + * because queue cannot use LSNs to determine what operations can + * safely be aborted and it cannot rollback operations in + * transactions for which there may be records not processed + * during recovery. We need to consider earlier points in time + * in case we are recovering to a particular timestamp. + * + * Pass #1: + * Read forward through the log from the position found in pass 0 + * opening and closing files, and recording transactions for which + * we've seen their first record (the transaction's prev_lsn is + * 0,0). At the end of this pass, we know all transactions for + * which we've seen begins and we have the "current" set of files + * open. 
2. If we are doing normal recovery, then we have to roll
+ */ +#ifdef UMRW + ZERO_LSN(last_lsn); +#endif + memset(&data, 0, sizeof(data)); + /* + * Pass #0 + * Find the LSN from which we begin OPENFILES. + * + * If this is a catastrophic recovery, or if no checkpoint exists + * in the log, the LSN is the first LSN in the log. + * + * Otherwise, it is the minimum of (1) the LSN in the last checkpoint + * and (2) the LSN in the checkpoint before any specified recovery + * timestamp or max_lsn. + */ + /* + * Get the first LSN in the log; it's an initial default + * even if this is not a catastrophic recovery. + */ + if ((ret = __logc_get(logc, &ckp_lsn, &data, DB_FIRST)) != 0) { + if (ret == DB_NOTFOUND) + ret = 0; + else + __db_errx(env, DB_STR("1510", + "First log record not found")); + goto err; + } + first_lsn = ckp_lsn; + + if (!LF_ISSET(DB_RECOVER_FATAL)) { + if ((ret = __txn_getckp(env, &ckp_lsn)) == 0 && + (ret = __logc_get(logc, &ckp_lsn, &data, DB_SET)) == 0) { + /* We have a recent checkpoint. This is LSN (1). */ + if ((ret = __txn_ckp_read(env, + data.data, &ckp_args)) != 0) { + __db_errx(env, DB_STR_A("1511", + "Invalid checkpoint record at [%ld][%ld]", + "%ld %ld"), (u_long)ckp_lsn.file, + (u_long)ckp_lsn.offset); + goto err; + } + first_lsn = ckp_args->ckp_lsn; + __os_free(env, ckp_args); + } + + /* + * If LSN (2) exists, use it if it's before LSN (1). + * (If LSN (1) doesn't exist, first_lsn is the + * beginning of the log, so will "win" this check.) + * + * XXX + * In the recovery-to-a-timestamp case, lowlsn is chosen by + * __log_earliest, and is the checkpoint LSN of the + * *earliest* checkpoint in the unreclaimed log. I + * (krinsky) believe that we could optimize this by looking + * instead for the LSN of the *latest* checkpoint before + * the timestamp of interest, but I'm not sure that this + * is worth doing right now. 
(We have to look for lowlsn + * and low anyway, to make sure the requested timestamp is + * somewhere in the logs we have, and all that's required + * is that we pick *some* checkpoint after the beginning of + * the logs and before the timestamp. + */ + if ((dbenv->tx_timestamp != 0 || max_lsn != NULL) && + LOG_COMPARE(&lowlsn, &first_lsn) < 0) { + first_lsn = lowlsn; + } + } + + if ((ret = __logc_get(logc, &last_lsn, &data, DB_LAST)) != 0) { + if (ret == DB_NOTFOUND) + ret = 0; + else + __db_errx(env, DB_STR("1512", + "Last log record not found")); + goto err; + } + + rectype = 0; + txnid = 0; + do { + if (LOG_COMPARE(&lsn, &first_lsn) == 0) + break; + /* check if we have a recycle record. */ + if (rectype != DB___txn_recycle) + LOGCOPY_32(env, &rectype, data.data); + /* txnid is after rectype, which is a u_int32. */ + LOGCOPY_32(env, &txnid, + (u_int8_t *)data.data + sizeof(u_int32_t)); + + if (txnid != 0) + break; + } while ((ret = __logc_get(logc, &lsn, &data, DB_PREV)) == 0); + + /* + * There are no transactions, so there is nothing to do unless + * we're recovering to an LSN. If we are, we need to proceed since + * we'll still need to do a vtruncate based on information we haven't + * yet collected. + */ + if (ret == DB_NOTFOUND) + ret = 0; + else if (ret != 0) + goto err; + + hi_txn = txnid; + + /* Get the record at first_lsn. */ + if ((ret = __logc_get(logc, &first_lsn, &data, DB_SET)) != 0) { + __db_errx(env, DB_STR_A("1513", + "Checkpoint LSN record [%ld][%ld] not found", "%ld %ld"), + (u_long)first_lsn.file, (u_long)first_lsn.offset); + goto err; + } + + if (dbenv->db_feedback != NULL) { + if (last_lsn.file == first_lsn.file) + nfiles = (double) + (last_lsn.offset - first_lsn.offset) / log_size; + else + nfiles = (double)(last_lsn.file - first_lsn.file) + + (double)((log_size - first_lsn.offset) + + last_lsn.offset) / log_size; + /* We are going to divide by nfiles; make sure it isn't 0. 
*/ + if (nfiles < 0.001) + nfiles = 0.001; + } + + /* Find a low txnid. */ + ret = 0; + if (hi_txn != 0) do { + /* txnid is after rectype, which is a u_int32. */ + LOGCOPY_32(env, &txnid, + (u_int8_t *)data.data + sizeof(u_int32_t)); + + if (txnid != 0) + break; + } while ((ret = __logc_get(logc, &lsn, &data, DB_NEXT)) == 0); + + /* + * There are no transactions and we're not recovering to an LSN (see + * above), so there is nothing to do. + */ + if (ret == DB_NOTFOUND) { + if (LOG_COMPARE(&lsn, &last_lsn) != 0) + ret = __db_log_corrupt(env, &lsn); + else + ret = 0; + } + + /* Reset to the first lsn. */ + if (ret != 0 || + (ret = __logc_get(logc, &first_lsn, &data, DB_SET)) != 0) + goto err; + + /* Initialize the transaction list. */ + if ((ret = __db_txnlist_init(env, ip, + txnid, hi_txn, max_lsn, &txninfo)) != 0) + goto err; + + /* + * Pass #1 + * Run forward through the log starting at the first relevant lsn. + */ + if ((ret = __env_openfiles(env, logc, + txninfo, &data, &first_lsn, &last_lsn, nfiles, 1)) != 0) + goto err; + + /* If there were no transactions, then we can bail out early. */ + if (hi_txn == 0 && max_lsn == NULL) { + lsn = last_lsn; + goto done; + } + + /* + * Pass #2. + * + * We used first_lsn to tell us how far back we need to recover, + * use it here. 
+ */ + if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY)) + __db_msg(env, DB_STR_A("1514", + "Recovery starting from [%lu][%lu]", "%lu %lu"), + (u_long)first_lsn.file, (u_long)first_lsn.offset); + + pass = DB_STR_P("backward"); + for (ret = __logc_get(logc, &lsn, &data, DB_LAST); + ret == 0 && LOG_COMPARE(&lsn, &first_lsn) >= 0; + ret = __logc_get(logc, &lsn, &data, DB_PREV)) { + if (dbenv->db_feedback != NULL) { + progress = 34 + (int)(33 * (__lsn_diff(&first_lsn, + &last_lsn, &lsn, log_size, 0) / nfiles)); + dbenv->db_feedback(dbenv, DB_RECOVER, progress); + } + + tlsn = lsn; + ret = __db_dispatch(env, &env->recover_dtab, + &data, &tlsn, DB_TXN_BACKWARD_ROLL, txninfo); + if (ret != 0) { + if (ret != DB_TXN_CKP) + goto msgerr; + else + ret = 0; + } + } + if (ret == DB_NOTFOUND) { + if (LOG_COMPARE(&lsn, &first_lsn) > 0) + ret = __db_log_corrupt(env, &lsn); + else + ret = 0; + } + if (ret != 0) + goto err; + + /* + * Pass #3. If we are recovering to a timestamp or to an LSN, + * we need to make sure that we don't roll-forward beyond that + * point because there may be non-transactional operations (e.g., + * closes that would fail). The last_lsn variable is used for + * feedback calculations, but use it to set an initial stopping + * point for the forward pass, and then reset appropriately to + * derive a real stop_lsn that tells how far the forward pass + * should go. 
+ */ + pass = DB_STR_P("forward"); + stop_lsn = last_lsn; + if (max_lsn != NULL || dbenv->tx_timestamp != 0) + stop_lsn = ((DB_TXNHEAD *)txninfo)->maxlsn; + + for (ret = __logc_get(logc, &lsn, &data, DB_NEXT); + ret == 0; ret = __logc_get(logc, &lsn, &data, DB_NEXT)) { + if (dbenv->db_feedback != NULL) { + progress = 67 + (int)(33 * (__lsn_diff(&first_lsn, + &last_lsn, &lsn, log_size, 1) / nfiles)); + dbenv->db_feedback(dbenv, DB_RECOVER, progress); + } + + tlsn = lsn; + ret = __db_dispatch(env, &env->recover_dtab, + &data, &tlsn, DB_TXN_FORWARD_ROLL, txninfo); + if (ret != 0) { + if (ret != DB_TXN_CKP) + goto msgerr; + else + ret = 0; + } + /* + * If we are recovering to a timestamp or an LSN, + * we need to make sure that we don't try to roll + * forward beyond the soon-to-be end of log. + */ + if (LOG_COMPARE(&lsn, &stop_lsn) >= 0) + break; + + } + if (ret == DB_NOTFOUND) + ret = __db_log_corrupt(env, &lsn); + if (ret != 0) + goto err; + + if (max_lsn == NULL) + region->last_txnid = ((DB_TXNHEAD *)txninfo)->maxid; + +done: + /* We are going to truncate, so we'd best close the cursor. */ + if (logc != NULL) { + if ((ret = __logc_close(logc)) != 0) + goto err; + logc = NULL; + } + /* + * Also flush the cache before truncating the log. It's recovery, + * ignore any application max-write configuration. + */ + if ((ret = __memp_sync_int(env, + NULL, 0, DB_SYNC_CACHE | DB_SYNC_SUPPRESS_WRITE, NULL, NULL)) != 0) + goto err; + if (dbenv->tx_timestamp != 0) { + /* Run recovery up to this timestamp. */ + region->last_ckp = ((DB_TXNHEAD *)txninfo)->ckplsn; + vtrunc_lsn = &((DB_TXNHEAD *)txninfo)->maxlsn; + vtrunc_ckp = &((DB_TXNHEAD *)txninfo)->ckplsn; + } else if (max_lsn != NULL) { + /* This is a HA client syncing to the master. 
*/ + if (!IS_ZERO_LSN(((DB_TXNHEAD *)txninfo)->ckplsn)) + region->last_ckp = ((DB_TXNHEAD *)txninfo)->ckplsn; + else if ((ret = + __txn_findlastckp(env, ®ion->last_ckp, max_lsn)) != 0) + goto err; + vtrunc_lsn = max_lsn; + vtrunc_ckp = &((DB_TXNHEAD *)txninfo)->ckplsn; + } else { + /* + * The usual case: we recovered the whole (valid) log; clear + * out any partial record after the recovery point. + */ + vtrunc_lsn = &lsn; + vtrunc_ckp = ®ion->last_ckp; + } + if ((ret = __log_vtruncate(env, vtrunc_lsn, vtrunc_ckp, trunclsn)) != 0) + goto err; + + /* If we had no txns, figure out if we need a checkpoint. */ + if (hi_txn == 0 && __dbreg_log_nofiles(env)) + LF_SET(DB_NO_CHECKPOINT); + /* + * Usually we close all files at the end of recovery, unless there are + * prepared transactions or errors in the checkpoint. + */ + all_recovered = region->stat.st_nrestores == 0; + /* + * Log a checkpoint here so subsequent recoveries can skip what's been + * done; this is unnecessary for HA rep clients, as they do not write + * log records. + */ + if (max_lsn == NULL && !LF_ISSET(DB_NO_CHECKPOINT) && + (ret = __txn_checkpoint(env, + 0, 0, DB_CKP_INTERNAL | DB_FORCE)) != 0) { + /* + * If there was no space for the checkpoint or flushing db + * pages we can still bring the environment up, if only for + * read-only access. We must not close the open files because a + * subsequent recovery might still need to redo this portion + * of the log [#18590]. + */ + if (max_lsn == NULL && ret == ENOSPC) { + if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY)) + __db_msg(env, DB_STR_A("1515", + "Recovery continuing after non-fatal checkpoint error: %s", + "%s"), db_strerror(ret)); + all_recovered = 0; + } + else + goto err; + } + + if (all_recovered ) { + /* Close all the db files that are open. 
*/ + if ((ret = __dbreg_close_files(env, 0)) != 0) + goto err; + } else { + if ((ret = __dbreg_mark_restored(env)) != 0) + goto err; + F_SET(env->lg_handle, DBLOG_OPENFILES); + } + + if (max_lsn != NULL) { + /* + * Now we need to open files that should be open in order for + * client processing to continue. However, since we've + * truncated the log, we need to recompute from where the + * openfiles pass should begin. + */ + if ((ret = __log_cursor(env, &logc)) != 0) + goto err; + if ((ret = + __logc_get(logc, &first_lsn, &data, DB_FIRST)) != 0) { + if (ret == DB_NOTFOUND) + ret = 0; + else + __db_errx(env, DB_STR("1516", + "First log record not found")); + goto err; + } + if ((ret = __txn_getckp(env, &first_lsn)) == 0 && + (ret = __logc_get(logc, &first_lsn, &data, DB_SET)) == 0) { + /* We have a recent checkpoint. This is LSN (1). */ + if ((ret = __txn_ckp_read(env, + data.data, &ckp_args)) != 0) { + __db_errx(env, DB_STR_A("1517", + "Invalid checkpoint record at [%ld][%ld]", + "%ld %ld"), (u_long)first_lsn.file, + (u_long)first_lsn.offset); + goto err; + } + first_lsn = ckp_args->ckp_lsn; + __os_free(env, ckp_args); + } + if ((ret = __logc_get(logc, &first_lsn, &data, DB_SET)) != 0) + goto err; + if ((ret = __env_openfiles(env, logc, + txninfo, &data, &first_lsn, max_lsn, nfiles, 1)) != 0) + goto err; + } else if (all_recovered) { + /* + * If there are no transactions that need resolution, whether + * because they are prepared or because recovery will need to + * process them, we need to reset the transaction ID space and + * log this fact. 
+ */ + if ((rectype != DB___txn_recycle || hi_txn != 0) && + (ret = __txn_reset(env)) != 0) + goto err; + } else { + if ((ret = __txn_recycle_id(env, 0)) != 0) + goto err; + } + + if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY)) { + (void)time(&now); + __db_msg(env, DB_STR_A("1518", + "Recovery complete at %.24s", "%.24s"), + __os_ctime(&now, time_buf)); + __db_msg(env, DB_STR_A("1519", + "Maximum transaction ID %lx recovery checkpoint [%lu][%lu]", + "%lx %lu %lu"), (u_long)(txninfo == NULL ? + TXN_MINIMUM : ((DB_TXNHEAD *)txninfo)->maxid), + (u_long)region->last_ckp.file, + (u_long)region->last_ckp.offset); + } + + if (0) { +msgerr: __db_errx(env, DB_STR_A("1520", + "Recovery function for LSN %lu %lu failed on %s pass", + "%lu %lu %s"), (u_long)lsn.file, (u_long)lsn.offset, pass); + } + +err: if (logc != NULL && (t_ret = __logc_close(logc)) != 0 && ret == 0) + ret = t_ret; + + if (txninfo != NULL) + __db_txnlist_end(env, txninfo); + + dbenv->tx_timestamp = 0; + + F_CLR(env->lg_handle, DBLOG_RECOVER); + F_CLR(region, TXN_IN_RECOVERY); + + return (ret); +} + +/* + * Figure out how many logfiles we have processed. If we are moving + * forward (is_forward != 0), then we're computing current - low. If + * we are moving backward, we are computing high - current. max is + * the number of bytes per logfile. + */ +static double +__lsn_diff(low, high, current, max, is_forward) + DB_LSN *low, *high, *current; + u_int32_t max; + int is_forward; +{ + double nf; + + /* + * There are three cases in each direction. If you are in the + * same file, then all you need worry about is the difference in + * offsets. If you are in different files, then either your offsets + * put you either more or less than the integral difference in the + * number of files -- we need to handle both of these. 
+ */ + if (is_forward) { + if (current->file == low->file) + nf = (double)(current->offset - low->offset) / max; + else if (current->offset < low->offset) + nf = (double)((current->file - low->file) - 1) + + (double)((max - low->offset) + current->offset) / + max; + else + nf = (double)(current->file - low->file) + + (double)(current->offset - low->offset) / max; + } else { + if (current->file == high->file) + nf = (double)(high->offset - current->offset) / max; + else if (current->offset > high->offset) + nf = (double)((high->file - current->file) - 1) + + (double) + ((max - current->offset) + high->offset) / max; + else + nf = (double)(high->file - current->file) + + (double)(high->offset - current->offset) / max; + } + return (nf); +} + +/* + * __log_backup -- + * + * This is used to find the earliest log record to process when a client + * is trying to sync up with a master whose max LSN is less than this + * client's max lsn; we want to roll back everything after that. + * + * Find the latest checkpoint whose ckp_lsn is less than the max lsn. + */ +static int +__log_backup(env, logc, max_lsn, start_lsn) + ENV *env; + DB_LOGC *logc; + DB_LSN *max_lsn, *start_lsn; +{ + DBT data; + DB_LSN lsn; + __txn_ckp_args *ckp_args; + int ret; + + memset(&data, 0, sizeof(data)); + ckp_args = NULL; + + if ((ret = __txn_getckp(env, &lsn)) != 0) + goto err; + while ((ret = __logc_get(logc, &lsn, &data, DB_SET)) == 0) { + if ((ret = __txn_ckp_read(env, data.data, &ckp_args)) != 0) + return (ret); + /* + * Follow checkpoints through the log until + * we find one with a ckp_lsn less than + * or equal max_lsn. + */ + if (LOG_COMPARE(&ckp_args->ckp_lsn, max_lsn) <= 0) { + *start_lsn = ckp_args->ckp_lsn; + break; + } + + lsn = ckp_args->last_ckp; + /* + * If there are no more checkpoints behind us, we're + * done. Break with DB_NOTFOUND. 
+ */ + if (IS_ZERO_LSN(lsn)) { + ret = DB_NOTFOUND; + break; + } + __os_free(env, ckp_args); + ckp_args = NULL; + } + + if (ckp_args != NULL) + __os_free(env, ckp_args); + /* + * If we walked back through all the checkpoints, + * set the cursor on the first log record. + */ +err: if (IS_ZERO_LSN(*start_lsn) && (ret == 0 || ret == DB_NOTFOUND)) + ret = __logc_get(logc, start_lsn, &data, DB_FIRST); + return (ret); +} + +/* + * __log_earliest -- + * + * Return the earliest recovery point for the log files present. The + * earliest recovery time is the time stamp of the first checkpoint record + * whose checkpoint LSN is greater than the first LSN we process. + */ +static int +__log_earliest(env, logc, lowtime, lowlsn) + ENV *env; + DB_LOGC *logc; + int32_t *lowtime; + DB_LSN *lowlsn; +{ + __txn_ckp_args *ckpargs; + DB_LSN first_lsn, lsn; + DBT data; + u_int32_t rectype; + int cmp, ret; + + memset(&data, 0, sizeof(data)); + + /* + * Read forward through the log looking for the first checkpoint + * record whose ckp_lsn is greater than first_lsn. + */ + for (ret = __logc_get(logc, &first_lsn, &data, DB_FIRST); + ret == 0; ret = __logc_get(logc, &lsn, &data, DB_NEXT)) { + LOGCOPY_32(env, &rectype, data.data); + if (rectype != DB___txn_ckp) + continue; + if ((ret = + __txn_ckp_read(env, data.data, &ckpargs)) == 0) { + cmp = LOG_COMPARE(&ckpargs->ckp_lsn, &first_lsn); + *lowlsn = ckpargs->ckp_lsn; + *lowtime = ckpargs->timestamp; + + __os_free(env, ckpargs); + if (cmp >= 0) + break; + } + } + + return (ret); +} + +/* + * __env_openfiles -- + * Perform the pass of recovery that opens files. This is used + * both during regular recovery and an initial call to txn_recover (since + * we need files open in order to abort prepared, but not yet committed + * transactions). + * + * See the comments in db_apprec for a detailed description of the + * various recovery passes. 
+ * + * If we are not doing feedback processing (i.e., we are doing txn_recover + * processing and in_recovery is zero), then last_lsn can be NULL. + * + * PUBLIC: int __env_openfiles __P((ENV *, + * PUBLIC: DB_LOGC *, void *, DBT *, DB_LSN *, DB_LSN *, double, int)); + */ +int +__env_openfiles(env, logc, txninfo, + data, open_lsn, last_lsn, nfiles, in_recovery) + ENV *env; + DB_LOGC *logc; + void *txninfo; + DBT *data; + DB_LSN *open_lsn, *last_lsn; + double nfiles; + int in_recovery; +{ + DB_ENV *dbenv; + DB_LSN lsn, tlsn; + u_int32_t log_size; + int progress, ret; + + dbenv = env->dbenv; + + /* + * XXX + * Get the log size. No locking required because we're single-threaded + * during recovery. + */ + log_size = ((LOG *)env->lg_handle->reginfo.primary)->log_size; + + lsn = *open_lsn; + for (;;) { + if (in_recovery && dbenv->db_feedback != NULL) { + DB_ASSERT(env, last_lsn != NULL); + progress = (int)(33 * (__lsn_diff(open_lsn, + last_lsn, &lsn, log_size, 1) / nfiles)); + dbenv->db_feedback(dbenv, DB_RECOVER, progress); + } + + tlsn = lsn; + ret = __db_dispatch(env, &env->recover_dtab, data, &tlsn, + in_recovery ? 
DB_TXN_OPENFILES : DB_TXN_POPENFILES, + txninfo); + if (ret != 0 && ret != DB_TXN_CKP) { + __db_errx(env, DB_STR_A("1521", + "Recovery function for LSN %lu %lu failed", + "%lu %lu"), (u_long)lsn.file, (u_long)lsn.offset); + break; + } + if ((ret = __logc_get(logc, &lsn, data, DB_NEXT)) != 0) { + if (ret == DB_NOTFOUND) { + if (last_lsn != NULL && + LOG_COMPARE(&lsn, last_lsn) != 0) + ret = __db_log_corrupt(env, &lsn); + else + ret = 0; + } + break; + } + } + + return (ret); +} + +static int +__db_log_corrupt(env, lsnp) + ENV *env; + DB_LSN *lsnp; +{ + __db_errx(env, DB_STR_A("1522", + "Log file corrupt at LSN: [%lu][%lu]", "%lu %lu"), + (u_long)lsnp->file, (u_long)lsnp->offset); + return (EINVAL); +} + +/* + * __env_init_rec -- + * + * PUBLIC: int __env_init_rec __P((ENV *, u_int32_t)); + */ +int +__env_init_rec(env, version) + ENV *env; + u_int32_t version; +{ + int ret; + + /* + * We need to prime the recovery table with the current recovery + * functions. Then we overwrite only specific entries based on + * each previous version we support. 
+ */ + if ((ret = __bam_init_recover(env, &env->recover_dtab)) != 0) + goto err; + if ((ret = __crdel_init_recover(env, &env->recover_dtab)) != 0) + goto err; + if ((ret = __db_init_recover(env, &env->recover_dtab)) != 0) + goto err; + if ((ret = __dbreg_init_recover(env, &env->recover_dtab)) != 0) + goto err; + if ((ret = __fop_init_recover(env, &env->recover_dtab)) != 0) + goto err; + if ((ret = __ham_init_recover(env, &env->recover_dtab)) != 0) + goto err; + if ((ret = __heap_init_recover(env, &env->recover_dtab)) != 0) + goto err; + if ((ret = __qam_init_recover(env, &env->recover_dtab)) != 0) + goto err; + if ((ret = __repmgr_init_recover(env, &env->recover_dtab)) != 0) + goto err; + if ((ret = __txn_init_recover(env, &env->recover_dtab)) != 0) + goto err; + + /* + * After installing all the current recovery routines, we want to + * override them with older versions if we are reading a down rev + * log (from a downrev replication master). If a log record is + * changed then we must use the previous version for all older + * logs. If a record is changed in multiple revisions then the + * oldest revision that applies must be used. Therefore we override + * the recovery functions in reverse log version order. + */ + /* + * DB_LOGVERSION_53 is a strict superset of DB_LOGVERSION_50. + * So, only check > DB_LOGVERSION_48p2. If/When log records are + * altered, the condition below will need to change. + */ + if (version > DB_LOGVERSION_48p2) + goto done; + if ((ret = __env_init_rec_48(env)) != 0) + goto err; + /* + * Patch 2 added __db_pg_trunc but did not replace any log records + * so we want to override the same functions as in the original release. + */ + if (version >= DB_LOGVERSION_48) + goto done; + if ((ret = __env_init_rec_47(env)) != 0) + goto err; + if (version == DB_LOGVERSION_47) + goto done; + if ((ret = __env_init_rec_46(env)) != 0) + goto err; + /* + * There are no log record/recovery differences between 4.4 and 4.5. 
+ * The log version changed due to checksum. There are no log recovery + * differences between 4.5 and 4.6. The name of the rep_gen in + * txn_checkpoint changed (to spare, since we don't use it anymore). + */ + if (version >= DB_LOGVERSION_44) + goto done; + if ((ret = __env_init_rec_43(env)) != 0) + goto err; + if (version == DB_LOGVERSION_43) + goto done; + if (version != DB_LOGVERSION_42) { + __db_errx(env, DB_STR_A("1523", "Unknown version %lu", + "%lu"), (u_long)version); + ret = EINVAL; + goto err; + } + ret = __env_init_rec_42(env); + +done: +err: return (ret); +} + +static int +__env_init_rec_42(env) + ENV *env; +{ + int ret; + + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + __db_relink_42_recover, DB___db_relink_42)) != 0) + goto err; + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + __db_pg_alloc_42_recover, DB___db_pg_alloc_42)) != 0) + goto err; + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + __db_pg_free_42_recover, DB___db_pg_free_42)) != 0) + goto err; + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + __db_pg_freedata_42_recover, DB___db_pg_freedata_42)) != 0) + goto err; +#ifdef HAVE_HASH + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + __ham_metagroup_42_recover, DB___ham_metagroup_42)) != 0) + goto err; + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + __ham_groupalloc_42_recover, DB___ham_groupalloc_42)) != 0) + goto err; +#endif + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + __txn_ckp_42_recover, DB___txn_ckp_42)) != 0) + goto err; +err: + return (ret); +} + +static int +__env_init_rec_43(env) + ENV *env; +{ + int ret; + + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + __bam_relink_43_recover, DB___bam_relink_43)) != 0) + goto err; + /* + * We want to use the 4.2-based txn_regop record. 
+ */ + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + __txn_regop_42_recover, DB___txn_regop_42)) != 0) + goto err; +err: + return (ret); +} + +static int +__env_init_rec_46(env) + ENV *env; +{ + int ret; + + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + __bam_merge_44_recover, DB___bam_merge_44)) != 0) + goto err; + +err: return (ret); +} + +static int +__env_init_rec_47(env) + ENV *env; +{ + int ret; + + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + __bam_split_42_recover, DB___bam_split_42)) != 0) + goto err; + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + __db_pg_sort_44_recover, DB___db_pg_sort_44)) != 0) + goto err; + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + __fop_create_42_recover, DB___fop_create_42)) != 0) + goto err; + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + __fop_write_42_recover, DB___fop_write_42)) != 0) + goto err; + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + __fop_rename_42_recover, DB___fop_rename_42)) != 0) + goto err; + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + __fop_rename_noundo_46_recover, DB___fop_rename_noundo_46)) != 0) + goto err; + +err: + return (ret); +} + +static int +__env_init_rec_48(env) + ENV *env; +{ + int ret; + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + __db_pg_sort_44_recover, DB___db_pg_sort_44)) != 0) + goto err; + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + __db_addrem_42_recover, DB___db_addrem_42)) != 0) + goto err; + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + __db_big_42_recover, DB___db_big_42)) != 0) + goto err; + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + __bam_split_48_recover, DB___bam_split_48)) != 0) + goto err; +#ifdef HAVE_HASH + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + __ham_insdel_42_recover, DB___ham_insdel_42)) != 0) + goto err; + if ((ret = __db_add_recovery_int(env, &env->recover_dtab, + 
__ham_replace_42_recover, DB___ham_replace_42)) != 0) + goto err; +#endif +err: + return (ret); +} diff --git a/src/env/env_region.c b/src/env/env_region.c new file mode 100644 index 00000000..113bea21 --- /dev/null +++ b/src/env/env_region.c @@ -0,0 +1,1497 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996, 2012 Oracle and/or its affiliates. All rights reserved. + * + * $Id$ + */ + +#include "db_config.h" + +#include "db_int.h" +#include "dbinc/mp.h" +#include "dbinc/lock.h" +#include "dbinc/log.h" +#include "dbinc/txn.h" + +static int __env_des_get __P((ENV *, REGINFO *, REGINFO *, REGION **)); +static int __env_faultmem __P((ENV *, void *, size_t, int)); +static int __env_sys_attach __P((ENV *, REGINFO *, REGION *)); +static int __env_sys_detach __P((ENV *, REGINFO *, int)); +static void __env_des_destroy __P((ENV *, REGION *)); +static void __env_remove_file __P((ENV *)); + +/* + * __env_attach + * Join/create the environment + * + * PUBLIC: int __env_attach __P((ENV *, u_int32_t *, int, int)); + */ +int +__env_attach(env, init_flagsp, create_ok, retry_ok) + ENV *env; + u_int32_t *init_flagsp; + int create_ok, retry_ok; +{ + DB_ENV *dbenv; + REGENV rbuf, *renv; + REGENV_REF ref; + REGINFO *infop; + REGION *rp, tregion; + size_t max, nrw, size; + long segid; + u_int32_t bytes, i, mbytes, nregions, signature; + u_int retry_cnt; + int majver, minver, patchver, ret; + char buf[sizeof(DB_REGION_FMT) + 20]; + + /* Initialization */ + dbenv = env->dbenv; + retry_cnt = 0; + signature = __env_struct_sig(); + + /* Repeated initialization. */ +loop: renv = NULL; + rp = NULL; + + /* Set up the ENV's REG_INFO structure. */ + if ((ret = __os_calloc(env, 1, sizeof(REGINFO), &infop)) != 0) + return (ret); + infop->env = env; + infop->type = REGION_TYPE_ENV; + infop->id = REGION_ID_ENV; + infop->flags = REGION_JOIN_OK; + if (create_ok) + F_SET(infop, REGION_CREATE_OK); + + /* Build the region name. 
*/ + if (F_ISSET(env, ENV_PRIVATE)) + ret = __os_strdup(env, "process-private", &infop->name); + else { + (void)snprintf(buf, sizeof(buf), "%s", DB_REGION_ENV); + ret = __db_appname(env, DB_APP_NONE, buf, NULL, &infop->name); + } + if (ret != 0) + goto err; + + /* + * We have to single-thread the creation of the REGENV region. Once + * it exists, we can serialize using region mutexes, but until then + * we have to be the only player in the game. + * + * If this is a private environment, we are only called once and there + * are no possible race conditions. + * + * If this is a public environment, we use the filesystem to ensure + * the creation of the environment file is single-threaded. + * + * If the application has specified their own mapping functions, try + * and create the region. The application will have to let us know if + * it's actually a creation or not, and we'll have to fall-back to a + * join if it's not a create. + */ + if (F_ISSET(env, ENV_PRIVATE) || DB_GLOBAL(j_region_map) != NULL) + goto creation; + + /* + * Try to create the file, if we have the authority. We have to ensure + * that multiple threads/processes attempting to simultaneously create + * the file are properly ordered. Open using the O_CREAT and O_EXCL + * flags so that multiple attempts to create the region will return + * failure in all but one. POSIX 1003.1 requires that EEXIST be the + * errno return value -- I sure hope they're right. + */ + if (create_ok) { + if ((ret = __os_open(env, infop->name, 0, + DB_OSO_CREATE | DB_OSO_EXCL | DB_OSO_REGION, + env->db_mode, &env->lockfhp)) == 0) + goto creation; + if (ret != EEXIST) { + __db_err(env, ret, "%s", infop->name); + goto err; + } + } + + /* The region must exist, it's not okay to recreate it. */ + F_CLR(infop, REGION_CREATE_OK); + + /* + * If we couldn't create the file, try and open it. (If that fails, + * we're done.) 
+ */ + if ((ret = __os_open( + env, infop->name, 0, DB_OSO_REGION, 0, &env->lockfhp)) != 0) + goto err; + + /* + * !!! + * The region may be in system memory not backed by the filesystem + * (more specifically, not backed by this file), and we're joining + * it. In that case, the process that created it will have written + * out a REGENV_REF structure as its only contents. We read that + * structure before we do anything further, e.g., we can't just map + * that file in and then figure out what's going on. + * + * All of this noise is because some systems don't have a coherent VM + * and buffer cache, and what's worse, when you mix operations on the + * VM and buffer cache, half the time you hang the system. + * + * If the file is the size of an REGENV_REF structure, then we know + * the real region is in some other memory. (The only way you get a + * file that size is to deliberately write it, as it's smaller than + * any possible disk sector created by writing a file or mapping the + * file into memory.) In which case, retrieve the structure from the + * file and use it to acquire the referenced memory. + * + * If the structure is larger than a REGENV_REF structure, then this + * file is backing the shared memory region, and we just map it into + * memory. + * + * And yes, this makes me want to take somebody and kill them. (I + * digress -- but you have no freakin' idea. This is unbelievably + * stupid and gross, and I've probably spent six months of my life, + * now, trying to make different versions of it work.) + */ + if ((ret = __os_ioinfo(env, infop->name, + env->lockfhp, &mbytes, &bytes, NULL)) != 0) { + __db_err(env, ret, "%s", infop->name); + goto err; + } + + /* + * !!! + * A size_t is OK -- regions get mapped into memory, and so can't + * be larger than a size_t. 
+ */
+ size = mbytes * MEGABYTE + bytes;
+
+ /*
+ * If the size is less than the size of a REGENV_REF structure, the
+ * region (or, possibly, the REGENV_REF structure) has not yet been
+ * completely written. Shouldn't be possible, but there's no reason
+ * not to wait awhile and try again.
+ *
+ * If the region is precisely the size of a ref, then we don't
+ * have the region here, just the meta-data, which implies that
+ * we are using SYSTEM V shared memory (SYSTEM_MEM). However,
+ * if the flags say that we are using SYSTEM_MEM and the region is
+ * bigger than the ref, something bad has happened -- we are storing
+ * something in the region file other than meta-data and that
+ * shouldn't happen.
+ */
+ if (size < sizeof(ref))
+ goto retry;
+ else {
+
+ if (size == sizeof(ref))
+ F_SET(env, ENV_SYSTEM_MEM);
+ else if (F_ISSET(env, ENV_SYSTEM_MEM)) {
+ ret = EINVAL;
+ __db_err(env, ret, DB_STR_A("1535",
+ "%s: existing environment not created in system memory",
+ "%s"), infop->name);
+ goto err;
+ } else {
+ if ((ret = __os_read(env, env->lockfhp, &rbuf,
+ sizeof(rbuf), &nrw)) != 0 ||
+ nrw < (size_t)sizeof(rbuf) ||
+ (ret = __os_seek(env,
+ env->lockfhp, 0, 0, rbuf.region_off)) != 0) {
+ __db_err(env, ret, DB_STR_A("1536",
+ "%s: unable to read region info", "%s"),
+ infop->name);
+ goto err;
+ }
+ }
+
+ if ((ret = __os_read(env, env->lockfhp, &ref,
+ sizeof(ref), &nrw)) != 0 || nrw < (size_t)sizeof(ref)) {
+ if (ret == 0)
+ ret = EIO;
+ __db_err(env, ret, DB_STR_A("1537",
+ "%s: unable to read system-memory information",
+ "%s"), infop->name);
+ goto err;
+ }
+ size = ref.size;
+ max = ref.max;
+ segid = ref.segid;
+ }
+
+#ifndef HAVE_MUTEX_FCNTL
+ /*
+ * If we're not doing fcntl locking, we can close the file handle. We
+ * no longer need it and the less contact between the buffer cache and
+ * the VM, the better.
+ */
+ (void)__os_closehandle(env, env->lockfhp);
+ env->lockfhp = NULL;
+#endif
+
+ /* Call the region join routine to acquire the region. 
*/ + memset(&tregion, 0, sizeof(tregion)); + tregion.size = (roff_t)size; + tregion.max = (roff_t)max; + tregion.segid = segid; + if ((ret = __env_sys_attach(env, infop, &tregion)) != 0) + goto err; + +user_map_functions: + /* + * The environment's REGENV structure has to live at offset 0 instead + * of the usual alloc information. Set the primary reference and + * correct the "head" value to reference the alloc region. + */ + infop->primary = infop->addr; + infop->head = (u_int8_t *)infop->addr + sizeof(REGENV); + renv = infop->primary; + + /* + * Make sure the region matches our build. Special case a region + * that's all nul bytes, just treat it like any other corruption. + */ + if (renv->majver != DB_VERSION_MAJOR || + renv->minver != DB_VERSION_MINOR) { + if (renv->majver != 0 || renv->minver != 0) { + __db_errx(env, DB_STR_A("1538", + "Program version %d.%d doesn't match environment version %d.%d", + "%d %d %d %d"), DB_VERSION_MAJOR, DB_VERSION_MINOR, + renv->majver, renv->minver); + ret = DB_VERSION_MISMATCH; + } else + ret = EINVAL; + goto err; + } + if (renv->signature != signature) { + __db_errx(env, DB_STR("1539", + "Build signature doesn't match environment")); + ret = DB_VERSION_MISMATCH; + goto err; + } + + /* + * Check if the environment has had a catastrophic failure. + * + * Check the magic number to ensure the region is initialized. If the + * magic number isn't set, the lock may not have been initialized, and + * an attempt to use it could lead to random behavior. + * + * The panic and magic values aren't protected by any lock, so we never + * use them in any check that's more complex than set/not-set. + * + * !!! + * I'd rather play permissions games using the underlying file, but I + * can't because Windows/NT filesystems won't open files mode 0. 
+ */ + if (renv->panic && !F_ISSET(dbenv, DB_ENV_NOPANIC)) { + ret = __env_panic_msg(env); + goto err; + } + if (renv->magic != DB_REGION_MAGIC) + goto retry; + + /* + * Get a reference to the underlying REGION information for this + * environment. + */ + if ((ret = __env_des_get(env, infop, infop, &rp)) != 0 || rp == NULL) + goto find_err; + infop->rp = rp; + + /* + * There's still a possibility for inconsistent data. When we acquired + * the size of the region and attached to it, it might have still been + * growing as part of its creation. We can detect this by checking the + * size we originally found against the region's current size. (The + * region's current size has to be final, the creator finished growing + * it before setting the magic number in the region.) + * + * !!! + * Skip this test when the application specified its own map functions. + * The size of the region is essentially unknown in that case: some + * other process asked the application's map function for some bytes, + * but we were never told the final size of the region. We could get + * a size back from the map function, but for all we know, our process' + * map function only knows how to join regions, it has no clue how big + * those regions are. + */ + if (DB_GLOBAL(j_region_map) == NULL && rp->size != size) + goto retry; + + /* + * Check our callers configuration flags, it's an error to configure + * incompatible or additional subsystems in an existing environment. + * Return the total set of flags to the caller so they initialize the + * correct set of subsystems. + */ + if (init_flagsp != NULL) { + FLD_CLR(*init_flagsp, renv->init_flags); + if (*init_flagsp != 0) { + __db_errx(env, DB_STR("1540", + "configured environment flags incompatible with existing environment")); + ret = EINVAL; + goto err; + } + *init_flagsp = renv->init_flags; + } + + /* + * Fault the pages into memory. Note, do this AFTER releasing the + * lock, because we're only reading the pages, not writing them. 
+ */ + (void)__env_faultmem(env, infop->primary, rp->size, 0); + + /* Everything looks good, we're done. */ + env->reginfo = infop; + return (0); + +creation: + /* Create the environment region. */ + F_SET(infop, REGION_CREATE); + + /* + * Allocate room for REGION structures plus overhead. + */ + memset(&tregion, 0, sizeof(tregion)); + nregions = __memp_max_regions(env) + 5; + size = nregions * sizeof(REGION); + size += dbenv->passwd_len; + size += (dbenv->thr_max + dbenv->thr_max / 4) * + __env_alloc_size(sizeof(DB_THREAD_INFO)); + /* Space for replication buffer. */ + if (init_flagsp != NULL && FLD_ISSET(*init_flagsp, DB_INITENV_REP)) + size += MEGABYTE; + size += __txn_region_size(env); + size += __log_region_size(env); + size += __env_thread_size(env, size); + size += __lock_region_size(env, size); + + tregion.size = (roff_t)size; + tregion.segid = INVALID_REGION_SEGID; + + if ((tregion.max = dbenv->memory_max) == 0) { + /* Add some slop. */ + size += 16 * 1024; + tregion.max = (roff_t)size; + + tregion.max += (roff_t)__lock_region_max(env); + tregion.max += (roff_t)__txn_region_max(env); + tregion.max += (roff_t)__log_region_max(env); + tregion.max += (roff_t)__env_thread_max(env); + } else if (tregion.size > tregion.max) { + __db_errx(env, DB_STR_A("1542", + "Minimum environment memory size %ld is bigger than spcified max %ld.", + "%ld %ld"), (u_long)tregion.size, (u_long)tregion.max); + ret = EINVAL; + goto err; + } else if (F_ISSET(env, ENV_PRIVATE)) + infop->max_alloc = dbenv->memory_max; + + if ((ret = __env_sys_attach(env, infop, &tregion)) != 0) + goto err; + + /* + * If the application has specified its own mapping functions, we don't + * know until we get here if we are creating the region or not. The + * way we find out is underlying functions clear the REGION_CREATE flag. + */ + if (!F_ISSET(infop, REGION_CREATE)) + goto user_map_functions; + + /* + * Fault the pages into memory. 
Note, do this BEFORE we initialize + * anything, because we're writing the pages, not just reading them. + */ + (void)__env_faultmem(env, infop->addr, tregion.size, 1); + + /* + * The first object in the region is the REGENV structure. This is + * different from the other regions, and, from everything else in + * this region, where all objects are allocated from the pool, i.e., + * there aren't any fixed locations. The remaining space is made + * available for later allocation. + * + * The allocation space must be size_t aligned, because that's what + * the initialization routine is going to store there. To make sure + * that happens, the REGENV structure was padded with a final size_t. + * No other region needs to worry about it because all of them treat + * the entire region as allocation space. + * + * Set the primary reference and correct the "head" value to reference + * the alloc region. + */ + infop->primary = infop->addr; + infop->head = (u_int8_t *)infop->addr + sizeof(REGENV); + __env_alloc_init(infop, tregion.size - sizeof(REGENV)); + + /* + * Initialize the rest of the REGENV structure. (Don't set the magic + * number to the correct value, that would validate the environment). + */ + renv = infop->primary; + renv->magic = 0; + renv->panic = 0; + + (void)db_version(&majver, &minver, &patchver); + renv->majver = (u_int32_t)majver; + renv->minver = (u_int32_t)minver; + renv->patchver = (u_int32_t)patchver; + renv->signature = signature; + + (void)time(&renv->timestamp); + __os_unique_id(env, &renv->envid); + + /* + * Initialize init_flags to store the flags that any other environment + * handle that uses DB_JOINENV to join this environment will need. + */ + renv->init_flags = (init_flagsp == NULL) ? 0 : *init_flagsp; + + /* + * Set up the region array. 
We use an array rather than a linked list + * as we have to traverse this list after failure in some cases, and + * we don't want to infinitely loop should the application fail while + * we're manipulating the list. + */ + renv->region_cnt = nregions; + if ((ret = __env_alloc(infop, nregions * sizeof(REGION), &rp)) != 0) { + __db_err(env, ret, DB_STR("1543", + "unable to create new master region array")); + goto err; + } + renv->region_off = R_OFFSET(infop, rp); + for (i = 0; i < nregions; ++i, ++rp) + rp->id = INVALID_REGION_ID; + + renv->cipher_off = renv->thread_off = renv->rep_off = INVALID_ROFF; + renv->flags = 0; + renv->op_timestamp = renv->rep_timestamp = 0; + renv->mtx_regenv = MUTEX_INVALID; + renv->reg_panic = 0; + + /* + * Get the underlying REGION structure for this environment. Note, + * we created the underlying OS region before we acquired the REGION + * structure, which is backwards from the normal procedure. Update + * the REGION structure. + */ + if ((ret = __env_des_get(env, infop, infop, &rp)) != 0) { +find_err: __db_errx(env, DB_STR_A("1544", + "%s: unable to find environment", "%s"), infop->name); + if (ret == 0) + ret = EINVAL; + goto err; + } + infop->rp = rp; + rp->alloc = rp->size = tregion.size; + rp->max = tregion.max; + rp->segid = tregion.segid; + + /* + * !!! + * If we create an environment where regions are public and in system + * memory, we have to inform processes joining the environment how to + * attach to the shared memory segment. So, we write the shared memory + * identifier into the file, to be read by those other processes. + * + * XXX + * This is really OS-layer information, but I can't see any easy way + * to move it down there without passing down information that it has + * no right to know, e.g., that this is the one-and-only REGENV region + * and not some other random region. 
+ */ + if (tregion.segid != INVALID_REGION_SEGID) { + ref.size = tregion.size; + ref.segid = tregion.segid; + ref.max = tregion.max; + if ((ret = __os_write( + env, env->lockfhp, &ref, sizeof(ref), &nrw)) != 0) { + __db_err(env, ret, DB_STR_A("1545", + "%s: unable to write out public environment ID", + "%s"), infop->name); + goto err; + } + } + +#ifndef HAVE_MUTEX_FCNTL + /* + * If we're not doing fcntl locking, we can close the file handle. We + * no longer need it and the less contact between the buffer cache and + * the VM, the better. + */ + if (env->lockfhp != NULL) { + (void)__os_closehandle(env, env->lockfhp); + env->lockfhp = NULL; + } +#endif + + /* Everything looks good, we're done. */ + env->reginfo = infop; + return (0); + +err: +retry: /* Close any open file handle. */ + if (env->lockfhp != NULL) { + (void)__os_closehandle(env, env->lockfhp); + env->lockfhp = NULL; + } + + /* + * If we joined or created the region, detach from it. If we created + * it, destroy it. Note, there's a path in the above code where we're + * using a temporary REGION structure because we haven't yet allocated + * the real one. In that case the region address (addr) will be filled + * in, but the REGION pointer (rp) won't. Fix it. + */ + if (infop->addr != NULL) { + if (infop->rp == NULL) + infop->rp = &tregion; + + (void)__env_sys_detach(env, + infop, F_ISSET(infop, REGION_CREATE)); + + if (rp != NULL && F_ISSET(env, DB_PRIVATE)) + __env_alloc_free(infop, rp); + } + + /* Free the allocated name and/or REGINFO structure. */ + if (infop->name != NULL) + __os_free(env, infop->name); + __os_free(env, infop); + + /* If we had a temporary error, wait awhile and try again. */ + if (ret == 0) { + if (!retry_ok || ++retry_cnt > 3) { + __db_errx(env, DB_STR("1546", + "unable to join the environment")); + ret = EAGAIN; + } else { + __os_yield(env, retry_cnt * 3, 0); + goto loop; + } + } + + return (ret); +} + +/* + * __env_turn_on -- + * Turn on the created environment. 
 *
 * PUBLIC: int __env_turn_on __P((ENV *));
 */
int
__env_turn_on(env)
	ENV *env;
{
	REGENV *renv;
	REGINFO *infop;

	infop = env->reginfo;
	renv = infop->primary;

	/* If we didn't create the region, there's no need for further work. */
	if (!F_ISSET(infop, REGION_CREATE))
		return (0);

	/*
	 * Validate the file.  All other threads of control are waiting
	 * on this value to be written -- "Let slip the hounds of war!"
	 *
	 * Joiners in __env_attach spin (goto retry) until they observe
	 * DB_REGION_MAGIC here, so this single store is what publishes
	 * the fully-initialized environment.
	 */
	renv->magic = DB_REGION_MAGIC;

	return (0);
}

/*
 * __env_turn_off --
 *	Turn off the environment.
 *
 *	Only the DB_FORCE bit of "flags" is examined here.
 *
 * PUBLIC: int __env_turn_off __P((ENV *, u_int32_t));
 */
int
__env_turn_off(env, flags)
	ENV *env;
	u_int32_t flags;
{
	REGENV *renv;
	REGINFO *infop;
	int ret, t_ret;

	ret = 0;

	/*
	 * Connect to the environment: If we can't join the environment, we
	 * guess it's because it doesn't exist and we're done.
	 *
	 * If the environment exists, attach and lock the environment.
	 */
	if (__env_attach(env, NULL, 0, 1) != 0)
		return (0);

	infop = env->reginfo;
	renv = infop->primary;

	MUTEX_LOCK(env, renv->mtx_regenv);

	/*
	 * If the environment is in use, we're done unless we're forcing the
	 * issue or the environment has panic'd.  (If the environment panic'd,
	 * the thread holding the reference count may not have cleaned up, so
	 * we clean up.  It's possible the application didn't plan on removing
	 * the environment in this particular call, but panic'd environments
	 * aren't useful to anyone.)
	 *
	 * Otherwise, panic the environment and overwrite the magic number so
	 * any thread of control attempting to connect (or racing with us) will
	 * back off and retry, or just die.
	 */
	if (renv->refcnt > 0 && !LF_ISSET(DB_FORCE) && !renv->panic)
		ret = EBUSY;
	else
		renv->panic = 1;

	/*
	 * Unlock the environment (nobody should need this lock because
	 * we've poisoned the pool) and detach from the environment.
	 */
	MUTEX_UNLOCK(env, renv->mtx_regenv);

	/* Preserve the first error: EBUSY from above wins over detach errors. */
	if ((t_ret = __env_detach(env, 0)) != 0 && ret == 0)
		ret = t_ret;

	return (ret);
}

/*
 * __env_panic_set --
 *	Set/clear unrecoverable error.
 *
 *	The write is deliberately unlocked: the panic flag is only ever used
 *	in set/not-set tests (see the __env_attach checks), never in any
 *	compound decision that would need atomicity.
 *
 * PUBLIC: void __env_panic_set __P((ENV *, int));
 */
void
__env_panic_set(env, on)
	ENV *env;
	int on;
{
	if (env != NULL && env->reginfo != NULL)
		((REGENV *)env->reginfo->primary)->panic = on ? 1 : 0;
}

/*
 * __env_ref_increment --
 *	Increment the environment's reference count.
 *
 * PUBLIC: int __env_ref_increment __P((ENV *));
 */
int
__env_ref_increment(env)
	ENV *env;
{
	REGENV *renv;
	REGINFO *infop;
	int ret;

	infop = env->reginfo;
	renv = infop->primary;

	/*
	 * If we're creating the primary region, allocate a mutex.  The
	 * creator's own reference is counted by starting refcnt at 1; no
	 * locking is needed because nobody else can see the region until
	 * __env_turn_on publishes the magic number.
	 */
	if (F_ISSET(infop, REGION_CREATE)) {
		if ((ret = __mutex_alloc(
		    env, MTX_ENV_REGION, 0, &renv->mtx_regenv)) != 0)
			return (ret);
		renv->refcnt = 1;
	} else {
		/* Lock the environment, increment the reference, unlock. */
		MUTEX_LOCK(env, renv->mtx_regenv);
		++renv->refcnt;
		MUTEX_UNLOCK(env, renv->mtx_regenv);
	}

	F_SET(env, ENV_REF_COUNTED);
	return (0);
}

/*
 * __env_ref_decrement --
 *	Decrement the environment's reference count.
 *
 * PUBLIC: int __env_ref_decrement __P((ENV *));
 */
int
__env_ref_decrement(env)
	ENV *env;
{
	REGENV *renv;
	REGINFO *infop;

	/* Be cautious -- we may not have an environment. */
	if ((infop = env->reginfo) == NULL)
		return (0);

	renv = infop->primary;

	/* Even if we have an environment, may not have reference counted it. */
	if (F_ISSET(env, ENV_REF_COUNTED)) {
		/*
		 * Lock the environment, decrement the reference, unlock.
		 * The explicit zero check keeps the unsigned count from
		 * wrapping if the bookkeeping is ever unbalanced.
		 */
		MUTEX_LOCK(env, renv->mtx_regenv);
		if (renv->refcnt == 0)
			__db_errx(env, DB_STR("1547",
			    "environment reference count went negative"));
		else
			--renv->refcnt;
		MUTEX_UNLOCK(env, renv->mtx_regenv);

		F_CLR(env, ENV_REF_COUNTED);
	}

	/* If a private environment, we're done with the mutex, destroy it. */
	return (F_ISSET(env, ENV_PRIVATE) ?
	    __mutex_free(env, &renv->mtx_regenv) : 0);
}

/*
 * __env_ref_get --
 *	Get the number of environment references.  This is an unprotected
 *	read of refcnt to simply provide a spot check of the value.  It
 *	is only intended for use as an internal utility routine.
 *
 * PUBLIC: int __env_ref_get __P((DB_ENV *, u_int32_t *));
 */
int
__env_ref_get(dbenv, countp)
	DB_ENV *dbenv;
	u_int32_t *countp;
{
	ENV *env;
	REGENV *renv;
	REGINFO *infop;

	env = dbenv->env;
	infop = env->reginfo;
	renv = infop->primary;
	*countp = renv->refcnt;
	return (0);
}

/*
 * __env_detach --
 *	Detach from the environment.
 *
 *	"destroy" is non-zero when the caller wants the underlying region
 *	discarded as well as unmapped.
 *
 * PUBLIC: int __env_detach __P((ENV *, int));
 */
int
__env_detach(env, destroy)
	ENV *env;
	int destroy;
{
	REGENV *renv;
	REGINFO *infop;
	REGION rp;
	int ret, t_ret;

	infop = env->reginfo;
	renv = infop->primary;
	ret = 0;

	/* Close the locking file handle. */
	if (env->lockfhp != NULL) {
		if ((t_ret =
		    __os_closehandle(env, env->lockfhp)) != 0 && ret == 0)
			ret = t_ret;
		env->lockfhp = NULL;
	}

	/*
	 * If a private region, return the memory to the heap.  Not needed for
	 * filesystem-backed or system shared memory regions, that memory isn't
	 * owned by any particular process.
	 */
	if (destroy) {
		/*
		 * Free the REGION array.
		 *
		 * The actual underlying region structure is allocated from the
		 * primary shared region, and we're about to free it.  Save a
		 * copy on our stack for the REGINFO to reference when it calls
		 * down into the OS layer to release the shared memory segment.
		 */
		rp = *infop->rp;
		infop->rp = &rp;

		if (renv->region_off != INVALID_ROFF)
			__env_alloc_free(
			    infop, R_ADDR(infop, renv->region_off));
	}

	/*
	 * Set the ENV->reginfo field to NULL.  BDB uses the ENV->reginfo
	 * field to decide if the underlying region can be accessed or needs
	 * cleanup.  We're about to destroy what it references, so it needs to
	 * be cleared.
	 */
	env->reginfo = NULL;
	env->thr_hashtab = NULL;

	if ((t_ret = __env_sys_detach(env, infop, destroy)) != 0 && ret == 0)
		ret = t_ret;
	if (infop->name != NULL)
		__os_free(env, infop->name);

	/* Discard the ENV->reginfo field's memory. */
	__os_free(env, infop);

	return (ret);
}

/*
 * __env_remove_env --
 *	Remove an environment.
 *
 * PUBLIC: int __env_remove_env __P((ENV *));
 */
int
__env_remove_env(env)
	ENV *env;
{
	DB_ENV *dbenv;
	REGENV *renv;
	REGINFO *infop, reginfo;
	REGION *rp;
	u_int32_t flags_orig, i;

	dbenv = env->dbenv;

	/*
	 * We do not want to hang on a mutex request, nor do we care about
	 * panics.
	 */
	flags_orig = F_ISSET(dbenv, DB_ENV_NOLOCKING | DB_ENV_NOPANIC);
	F_SET(dbenv, DB_ENV_NOLOCKING | DB_ENV_NOPANIC);

	/*
	 * This routine has to walk a nasty line between not looking into the
	 * environment (which may be corrupted after an app or system crash),
	 * and removing everything that needs removing.
	 *
	 * Connect to the environment: If we can't join the environment, we
	 * guess it's because it doesn't exist.  Remove the underlying files,
	 * at least.
	 */
	if (__env_attach(env, NULL, 0, 0) != 0)
		goto remfiles;

	infop = env->reginfo;
	renv = infop->primary;

	/*
	 * Kill the environment, if it's not already dead.
	 */
	renv->panic = 1;

	/*
	 * Walk the array of regions.  Connect to each region and disconnect
	 * with the destroy flag set.  This shouldn't cause any problems, even
	 * if the region is corrupted, because we never look inside the region
	 * (with the single exception of mutex regions on systems where we have
	 * to return resources to the underlying system).
	 */
	for (rp = R_ADDR(infop, renv->region_off),
	    i = 0; i < renv->region_cnt; ++i, ++rp) {
		if (rp->id == INVALID_REGION_ID || rp->type == REGION_TYPE_ENV)
			continue;
		/*
		 * !!!
		 * The REGION_CREATE_OK flag is set for Windows/95 -- regions
		 * are zero'd out when the last reference to the region goes
		 * away, in which case the underlying OS region code requires
		 * callers be prepared to create the region in order to join it.
		 */
		memset(&reginfo, 0, sizeof(reginfo));
		reginfo.id = rp->id;
		reginfo.flags = REGION_CREATE_OK;

		/*
		 * If we get here and can't attach and/or detach to the
		 * region, it's a mess.  Ignore errors, there's nothing
		 * we can do about them.
		 */
		if (__env_region_attach(env, &reginfo, 0, 0) != 0)
			continue;

#ifdef HAVE_MUTEX_SYSTEM_RESOURCES
		/*
		 * If destroying the mutex region, return any system
		 * resources to the system.
		 */
		if (reginfo.type == REGION_TYPE_MUTEX)
			__mutex_resource_return(env, &reginfo);
#endif
		(void)__env_region_detach(env, &reginfo, 1);
	}

	/* Detach from the environment's primary region. */
	(void)__env_detach(env, 1);

remfiles:
	/*
	 * Walk the list of files in the directory, unlinking files in the
	 * Berkeley DB name space.
	 */
	__env_remove_file(env);

	/*
	 * Restore only the NOLOCKING/NOPANIC bits that were set on entry:
	 * clear both, then re-set whatever flags_orig captured.
	 */
	F_CLR(dbenv, DB_ENV_NOLOCKING | DB_ENV_NOPANIC);
	F_SET(dbenv, flags_orig);

	return (0);
}

/*
 * __env_remove_file --
 *	Discard any region files in the filesystem.
 */
static void
__env_remove_file(env)
	ENV *env;
{
	int cnt, fcnt, lastrm, ret;
	const char *dir;
	char saved_char, *p, **names, *path, buf[sizeof(DB_REGION_FMT) + 20];

	/* Get the full path of a file in the environment. */
	(void)snprintf(buf, sizeof(buf), "%s", DB_REGION_ENV);
	if ((ret = __db_appname(env,
	    DB_APP_NONE, buf, NULL, &path)) != 0)
		return;

	/*
	 * Get the parent directory for the environment.  Temporarily
	 * truncate the path at the last separator; saved_char lets us
	 * restore it before freeing.
	 */
	if ((p = __db_rpath(path)) == NULL) {
		p = path;
		saved_char = *p;

		dir = PATH_DOT;
	} else {
		saved_char = *p;
		*p = '\0';

		dir = path;
	}

	/* Get the list of file names. */
	if ((ret = __os_dirlist(env, dir, 0, &names, &fcnt)) != 0)
		__db_err(env, ret, "%s", dir);

	/* Restore the path, and free it. */
	*p = saved_char;
	__os_free(env, path);

	if (ret != 0)
		return;

	/*
	 * Remove files from the region directory.
	 *
	 * lastrm remembers the index of the primary region file so it can
	 * be unlinked after the loop; names[] stays valid until the final
	 * __os_dirfree.
	 */
	for (lastrm = -1, cnt = fcnt; --cnt >= 0;) {
		/* Skip anything outside our name space. */
		if (!IS_DB_FILE(names[cnt]))
			continue;

		/* Skip queue extent files. */
		if (strncmp(names[cnt], "__dbq.", 6) == 0)
			continue;
		if (strncmp(names[cnt], "__dbp.", 6) == 0)
			continue;

		/* Skip registry files. */
		if (strncmp(names[cnt], "__db.register", 13) == 0)
			continue;

		/* Skip replication files. */
		if (strncmp(names[cnt], "__db.rep", 8) == 0)
			continue;

		/*
		 * Remove the primary environment region last, because it's
		 * the key to this whole mess.
		 */
		if (strcmp(names[cnt], DB_REGION_ENV) == 0) {
			lastrm = cnt;
			continue;
		}

		/* Remove the file. */
		if (__db_appname(env,
		    DB_APP_NONE, names[cnt], NULL, &path) == 0) {
			/*
			 * Overwrite region files.  Temporary files would have
			 * been maintained in encrypted format, so there's no
			 * reason to overwrite them.  This is not an exact
			 * check on the file being a region file, but it's
			 * not likely to be wrong, and the worst thing that can
			 * happen is we overwrite a file that didn't need to be
			 * overwritten.
			 */
			(void)__os_unlink(env, path, 1);
			__os_free(env, path);
		}
	}

	if (lastrm != -1)
		if (__db_appname(env,
		    DB_APP_NONE, names[lastrm], NULL, &path) == 0) {
			(void)__os_unlink(env, path, 1);
			__os_free(env, path);
		}
	__os_dirfree(env, names, fcnt);
}

/*
 * __env_region_attach
 *	Join/create a region.
 *
 *	"init" and "max" are the initial and maximum region sizes; they are
 *	only applied when this call ends up creating the region.
 *
 * PUBLIC: int __env_region_attach __P((ENV *, REGINFO *, size_t, size_t));
 */
int
__env_region_attach(env, infop, init, max)
	ENV *env;
	REGINFO *infop;
	size_t init, max;
{
	REGION *rp;
	int ret;
	char buf[sizeof(DB_REGION_FMT) + 20];

	/*
	 * Find or create a REGION structure for this region.  If we create
	 * it, the REGION_CREATE flag will be set in the infop structure.
	 */
	F_CLR(infop, REGION_CREATE);
	if ((ret = __env_des_get(env, env->reginfo, infop, &rp)) != 0)
		return (ret);
	infop->env = env;
	infop->rp = rp;
	infop->type = rp->type;
	infop->id = rp->id;

	/*
	 * __env_des_get may have created the region and reset the create
	 * flag.  If we're creating the region, set the desired size.
	 */
	if (F_ISSET(infop, REGION_CREATE)) {
		rp->alloc = rp->size = (roff_t)init;
		rp->max = (roff_t)max;
	}

	/* Join/create the underlying region. */
	(void)snprintf(buf, sizeof(buf), DB_REGION_FMT, infop->id);
	if ((ret = __db_appname(env,
	    DB_APP_NONE, buf, NULL, &infop->name)) != 0)
		goto err;
	if ((ret = __env_sys_attach(env, infop, rp)) != 0)
		goto err;

	/*
	 * Fault the pages into memory.  Note, do this BEFORE we initialize
	 * anything because we're writing pages in created regions, not just
	 * reading them.
	 */
	(void)__env_faultmem(env,
	    infop->addr, rp->size, F_ISSET(infop, REGION_CREATE));

	/*
	 * !!!
	 * The underlying layer may have just decided that we are going
	 * to create the region.  There are various system issues that
	 * can result in a useless region that requires re-initialization.
	 *
	 * If we created the region, initialize it for allocation.
	 */
	if (F_ISSET(infop, REGION_CREATE))
		__env_alloc_init(infop, rp->size);

	return (0);

err:	/*
	 * Discard the underlying region.  If the OS attach never happened
	 * (addr is NULL), only the name allocation needs undoing.
	 */
	if (infop->addr != NULL)
		(void)__env_sys_detach(env,
		    infop, F_ISSET(infop, REGION_CREATE));
	else if (infop->name != NULL) {
		__os_free(env, infop->name);
		infop->name = NULL;
	}
	infop->rp = NULL;
	infop->id = INVALID_REGION_ID;

	/* Discard the REGION structure if we created it. */
	if (F_ISSET(infop, REGION_CREATE)) {
		__env_des_destroy(env, rp);
		F_CLR(infop, REGION_CREATE);
	}

	return (ret);
}

/*
 * __env_region_share
 *	Share the primary region.
 *
 * PUBLIC: int __env_region_share __P((ENV *, REGINFO *));
 */
int
__env_region_share(env, infop)
	ENV *env;
	REGINFO *infop;
{
	REGINFO *envinfo;
	REGION *rp;

	/*
	 * Alias the primary environment region: copy its mapping, name and
	 * file handle into the caller's REGINFO rather than performing a
	 * separate OS attach.  REGION_SHARED marks the REGINFO so detach
	 * knows the mapping isn't privately owned.
	 */
	envinfo = env->reginfo;
	rp = envinfo->rp;
	F_SET(infop, F_ISSET(envinfo, REGION_CREATE) | REGION_SHARED);
	infop->addr = envinfo->addr;
	infop->head = envinfo->head;

	infop->env = env;
	infop->rp = rp;
	infop->name = envinfo->name;
	infop->fhp = envinfo->fhp;
	infop->type = rp->type;
	infop->id = rp->id;

	return (0);
}

/*
 * __env_region_detach --
 *	Detach from a region.
 *
 * PUBLIC: int __env_region_detach __P((ENV *, REGINFO *, int));
 */
int
__env_region_detach(env, infop, destroy)
	ENV *env;
	REGINFO *infop;
	int destroy;
{
	REGION *rp;
	REGION_MEM *mem, *next;
	int ret;

	/*
	 * Private regions are always destroyed on detach.  Shared (aliased)
	 * regions in a non-private environment have nothing of their own to
	 * release; shared regions in a private environment fall through so
	 * their heap allocations below are freed, then return early.
	 */
	if (F_ISSET(env, ENV_PRIVATE))
		destroy = 1;
	else if (F_ISSET(infop, REGION_SHARED))
		return (0);

	rp = infop->rp;

	/*
	 * When discarding the regions as we shut down a database environment,
	 * discard any allocated shared memory segments.  This is the last time
	 * we use them, and db_region_destroy is the last region-specific call
	 * we make.
	 */
	if (F_ISSET(env, ENV_PRIVATE) && infop->primary != NULL) {
		/* Free the chain of per-chunk heap allocations. */
		for (mem = infop->mem; mem != NULL; mem = next) {
			next = mem->next;
			__env_alloc_free(infop, mem);
		}
		__env_alloc_free(infop, infop->primary);
	}

	if (F_ISSET(infop, REGION_SHARED))
		return (0);

	/* Detach from the underlying OS region. */
	ret = __env_sys_detach(env, infop, destroy);

	/* If we destroyed the region, discard the REGION structure. */
	if (destroy)
		__env_des_destroy(env, rp);

	/* Destroy the structure. */
	if (infop->name != NULL)
		__os_free(env, infop->name);

	return (ret);
}

/*
 * __env_sys_attach --
 *	Prep and call the underlying OS attach function.
 */
static int
__env_sys_attach(env, infop, rp)
	ENV *env;
	REGINFO *infop;
	REGION *rp;
{
	int ret;

	/*
	 * All regions are created on 8K boundaries out of sheer paranoia,
	 * so we don't make some underlying VM unhappy.  Make sure we don't
	 * overflow or underflow.
	 */
#define	OS_VMPAGESIZE	(8 * 1024)
#define	OS_VMROUNDOFF(i) {						\
	if ((i) + OS_VMPAGESIZE - 1 > (i))				\
		(i) += OS_VMPAGESIZE - 1;				\
	(i) -= (i) % OS_VMPAGESIZE;					\
}
	if (F_ISSET(infop, REGION_CREATE)) {
		OS_VMROUNDOFF(rp->size);
		OS_VMROUNDOFF(rp->max);
	}

#ifdef DB_REGIONSIZE_MAX
	/* Some architectures have hard limits on the maximum region size. */
	if (rp->size > DB_REGIONSIZE_MAX) {
		__db_errx(env, DB_STR_A("1548",
		    "region size %lu is too large; maximum is %lu", "%lu %lu"),
		    (u_long)rp->size, (u_long)DB_REGIONSIZE_MAX);
		return (EINVAL);
	}
	if (rp->max > DB_REGIONSIZE_MAX) {
		__db_errx(env, DB_STR_A("1549",
		    "region max %lu is too large; maximum is %lu", "%lu %lu"),
		    (u_long)rp->max, (u_long)DB_REGIONSIZE_MAX);
		return (EINVAL);
	}
#endif

	/*
	 * If a region is private, malloc the memory.
	 *
	 * !!!
	 * If this fails because the region is too large to malloc, mmap(2)
	 * using the MAP_ANON or MAP_ANONYMOUS flags would be an alternative.
	 * I don't know of any architectures (yet!) where malloc is a problem.
	 */
	if (F_ISSET(env, ENV_PRIVATE)) {
#if defined(HAVE_MUTEX_HPPA_MSEM_INIT)
		/*
		 * !!!
		 * There exist spinlocks that don't work in malloc memory, e.g.,
		 * the HP/UX msemaphore interface.  If we don't have locks that
		 * will work in malloc memory, we better not be private or not
		 * be threaded.
		 */
		if (F_ISSET(env, ENV_THREAD)) {
			__db_errx(env, DB_STR("1550",
"architecture does not support locks inside process-local (malloc) memory"));
			__db_errx(env, DB_STR("1551",
	    "application may not specify both DB_PRIVATE and DB_THREAD"));
			return (EINVAL);
		}
#endif
		/*
		 * NOTE(review): only sizeof(REGENV) bytes are allocated here
		 * even though rp->size bytes will be managed -- presumably
		 * private-region allocations are satisfied by separate mallocs
		 * tracked via the infop->mem chain (freed in
		 * __env_region_detach); confirm against env_alloc.c.
		 */
		if ((ret = __os_malloc(
		    env, sizeof(REGENV), &infop->addr)) != 0)
			return (ret);

	} else {
#if !defined(HAVE_MMAP_EXTEND)
		/* Extend any disk file to its full size before mapping it. */
		rp->size = rp->max;
#endif
		if ((ret = __os_attach(env, infop, rp)) != 0)
			return (ret);
	}

	/* Set the start of the allocation region. */
	infop->head = infop->addr;

	/*
	 * We require that the memory is aligned to fit the largest integral
	 * type.  Otherwise, multiple processes mapping the same shared region
	 * would have to memcpy every value before reading it.
	 */
	if (infop->addr != ALIGNP_INC(infop->addr, sizeof(uintmax_t))) {
		__db_errx(env, DB_STR("1552",
		    "region memory was not correctly aligned"));
		(void)__env_sys_detach(env, infop,
		    F_ISSET(infop, REGION_CREATE));
		return (EINVAL);
	}

	return (0);
}

/*
 * __env_sys_detach --
 *	Prep and call the underlying OS detach function.
 */
static int
__env_sys_detach(env, infop, destroy)
	ENV *env;
	REGINFO *infop;
	int destroy;
{

	/*
	 * If a region is private, free the memory; the destroy flag is
	 * irrelevant, heap memory always goes away with this process.
	 */
	if (F_ISSET(env, ENV_PRIVATE)) {
		__os_free(env, infop->addr);
		return (0);
	}

	return (__os_detach(env, infop, destroy));
}

/*
 * __env_des_get --
 *	Return a reference to the shared information for a REGION,
 *	optionally creating a new entry.
 */
static int
__env_des_get(env, env_infop, infop, rpp)
	ENV *env;
	REGINFO *env_infop, *infop;
	REGION **rpp;
{
	REGENV *renv;
	REGION *rp, *empty_slot, *first_type;
	u_int32_t i, maxid;

	*rpp = NULL;
	renv = env_infop->primary;

	/*
	 * If the caller wants to join a region, walk through the existing
	 * regions looking for a matching ID (if ID specified) or matching
	 * type (if type specified).  If we return based on a matching type
	 * return the "primary" region, that is, the first region that was
	 * created of this type.
	 *
	 * Track the first empty slot and maximum region ID for new region
	 * allocation.
	 *
	 * MaxID starts at REGION_ID_ENV, the ID of the primary environment.
	 */
	maxid = REGION_ID_ENV;
	empty_slot = first_type = NULL;
	for (rp = R_ADDR(env_infop, renv->region_off),
	    i = 0; i < renv->region_cnt; ++i, ++rp) {
		if (rp->id == INVALID_REGION_ID) {
			/* Remember the first reusable slot we see. */
			if (empty_slot == NULL)
				empty_slot = rp;
			continue;
		}
		if (infop->id != INVALID_REGION_ID) {
			/* Caller asked for a specific region ID. */
			if (infop->id == rp->id)
				break;
			continue;
		}
		/*
		 * Matching by type: keep the region of this type with the
		 * smallest ID, i.e. the first one ever created.
		 */
		if (infop->type == rp->type &&
		    F_ISSET(infop, REGION_JOIN_OK) &&
		    (first_type == NULL || first_type->id > rp->id))
			first_type = rp;

		if (rp->id > maxid)
			maxid = rp->id;
	}

	/* If we found a matching ID (or a matching type), return it. */
	if (i >= renv->region_cnt)
		rp = first_type;
	if (rp != NULL) {
		*rpp = rp;
		return (0);
	}

	/*
	 * If we didn't find a region and we don't have permission to create
	 * the region, fail.  The caller generates any error message.
	 */
	if (!F_ISSET(infop, REGION_CREATE_OK))
		return (ENOENT);

	/*
	 * If we didn't find a region and don't have room to create the region
	 * fail with an error message, there's a sizing problem.
	 */
	if (empty_slot == NULL) {
		__db_errx(env, DB_STR("1553",
		    "no room remaining for additional REGIONs"));
		return (ENOENT);
	}

	/*
	 * Initialize a REGION structure for the caller.  If id was set, use
	 * that value, otherwise we use the next available ID.
	 */
	memset(empty_slot, 0, sizeof(REGION));
	empty_slot->segid = INVALID_REGION_SEGID;

	/*
	 * Set the type and ID; if no region ID was specified,
	 * allocate one.
	 */
	empty_slot->type = infop->type;
	empty_slot->id = infop->id == INVALID_REGION_ID ? maxid + 1 : infop->id;

	/* Tell the caller it must initialize the region's contents. */
	F_SET(infop, REGION_CREATE);

	*rpp = empty_slot;
	return (0);
}

/*
 * __env_des_destroy --
 *	Destroy a reference to a REGION.
 *
 *	Marking the slot's ID invalid is all that's needed: __env_des_get
 *	treats INVALID_REGION_ID slots as free for reuse.
 */
static void
__env_des_destroy(env, rp)
	ENV *env;
	REGION *rp;
{
	COMPQUIET(env, NULL);

	rp->id = INVALID_REGION_ID;
}

/*
 * __env_faultmem --
 *	Fault the region into memory.
 */
static int
__env_faultmem(env, addr, size, created)
	ENV *env;
	void *addr;
	size_t size;
	int created;
{
	int ret;
	u_int8_t *p, *t;

	/* Ignore heap regions. */
	if (F_ISSET(env, ENV_PRIVATE))
		return (0);

	/*
	 * It's sometimes significantly faster to page-fault in all of the
	 * region's pages before we run the application, as we see nasty
	 * side-effects when we page-fault while holding various locks, i.e.,
	 * the lock takes a long time to acquire because of the underlying
	 * page fault, and the other threads convoy behind the lock holder.
	 *
	 * If we created the region, we write a non-zero value so that the
	 * system can't cheat.  If we're just joining the region, we can
	 * only read the value and try to confuse the compiler sufficiently
	 * that it doesn't figure out that we're never really using it.
	 *
	 * Touch every page (assuming pages are 512B, the smallest VM page
	 * size used in any general purpose processor).
	 */
	ret = 0;
	if (F_ISSET(env->dbenv, DB_ENV_REGION_INIT)) {
		if (created)
			/* Writer: dirty one byte per 512B "page". */
			for (p = addr,
			    t = (u_int8_t *)addr + size; p < t; p += 512)
				p[0] = 0xdb;
		else
			/* Joiner: read-only touch; fold into ret so the
			 * compiler can't optimize the loads away. */
			for (p = addr,
			    t = (u_int8_t *)addr + size; p < t; p += 512)
				ret |= p[0];
	}

	return (ret);
}
diff --git a/src/env/env_register.c b/src/env/env_register.c
new file mode 100644
index 00000000..7475444d
--- /dev/null
+++ b/src/env/env_register.c
@@ -0,0 +1,730 @@
/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 2004, 2012 Oracle and/or its affiliates.  All rights reserved.
 *
 * $Id$
 */

#include "db_config.h"

#include "db_int.h"

#define	REGISTER_FILE	"__db.register"

/*
 * NOTE(review): PID_EMPTY must be exactly PID_LEN (25) bytes -- a leading
 * non-digit, space padding, "0" and a newline -- so that PID_ISEMPTY's
 * memcmp and the fixed-length slot arithmetic work.  The padding here was
 * restored after whitespace was collapsed in transit; confirm against the
 * shipped source.
 */
#define	PID_EMPTY	"X                      0\n"	/* Unused PID entry */
#define	PID_FMT		"%24lu\n"	/* PID entry format */
					/* Unused PID test */
#define	PID_ISEMPTY(p)	(memcmp(p, PID_EMPTY, PID_LEN) == 0)
#define	PID_LEN		(25)		/* PID entry length */

/* Byte-range locks on the registry file; slot locks use the slot offset. */
#define	REGISTRY_LOCK(env, pos, nowait)					\
	__os_fdlock(env, (env)->dbenv->registry, (off_t)(pos), 1, nowait)
#define	REGISTRY_UNLOCK(env, pos)					\
	__os_fdlock(env, (env)->dbenv->registry, (off_t)(pos), 0, 0)
#define	REGISTRY_EXCL_LOCK(env, nowait)					\
	REGISTRY_LOCK(env, 1, nowait)
#define	REGISTRY_EXCL_UNLOCK(env)					\
	REGISTRY_UNLOCK(env, 1)

static int __envreg_add __P((ENV *, int *, u_int32_t));
static int __envreg_pid_compare __P((const void *, const void *));
static int __envreg_create_active_pid __P((ENV *, char *));

/*
 * Support for portable, multi-process database environment locking, based on
 * the Subversion SR (#11511).
 *
 * The registry feature is configured by specifying the DB_REGISTER flag to the
 * DbEnv.open method.  If DB_REGISTER is specified, DB opens the registry file
 * in the database environment home directory.
The registry file is formatted + * as follows: + * + * 12345 # process ID slot 1 + * X # empty slot + * 12346 # process ID slot 2 + * X # empty slot + * 12347 # process ID slot 3 + * 12348 # process ID slot 4 + * X 12349 # empty slot + * X # empty slot + * + * All lines are fixed-length. All lines are process ID slots. Empty slots + * are marked with leading non-digit characters. + * + * To modify the file, you get an exclusive lock on the first byte of the file. + * + * While holding any DbEnv handle, each process has an exclusive lock on the + * first byte of a process ID slot. There is a restriction on having more + * than one DbEnv handle open at a time, because Berkeley DB uses per-process + * locking to implement this feature, that is, a process may never have more + * than a single slot locked. + * + * This work requires that if a process dies or the system crashes, locks held + * by the dying processes will be dropped. (We can't use system shared + * memory-backed or filesystem-backed locks because they're persistent when a + * process dies.) On POSIX systems, we use fcntl(2) locks; on Win32 we have + * LockFileEx/UnlockFile, except for Win/9X and Win/ME which have to loop on + * Lockfile/UnlockFile. + * + * We could implement the same solution with flock locking instead of fcntl, + * but flock would require a separate file for each process of control (and + * probably each DbEnv handle) in the database environment, which is fairly + * ugly. + * + * Whenever a process opens a new DbEnv handle, it walks the registry file and + * verifies it CANNOT acquire the lock for any non-empty slot. If a lock for + * a non-empty slot is available, we know a process died holding an open handle, + * and recovery needs to be run. + * + * It's possible to get corruption in the registry file. If a write system + * call fails after partially completing, there can be corrupted entries in + * the registry file, or a partial entry at the end of the file. This is OK. 
+ * A corrupted entry will be flagged as a non-empty line during the registry + * file walk. Since the line was corrupted by process failure, no process will + * hold a lock on the slot, which will lead to recovery being run. + * + * There can still be processes running in the environment when we recover it, + * and, in fact, there can still be processes running in the old environment + * after we're up and running in a new one. This is safe because performing + * recovery panics (and removes) the existing environment, so the window of + * vulnerability is small. Further, we check the panic flag in the DB API + * methods, when waking from spinning on a mutex, and whenever we're about to + * write to disk). The only window of corruption is if the write check of the + * panic were to complete, the region subsequently be recovered, and then the + * write continues. That's very, very unlikely to happen. This vulnerability + * already exists in Berkeley DB, too, the registry code doesn't make it any + * worse than it already is. + * + * The only way to avoid that window entirely is to ensure that all processes + * in the Berkeley DB environment exit before we run recovery. Applications + * can do that if they maintain their own process registry outside of Berkeley + * DB, but it's a little more difficult to do here. The obvious approach is + * to send signals to any process using the database environment as soon as we + * decide to run recovery, but there are problems with that approach: we might + * not have permission to send signals to the process, the process might have + * signal handlers installed, the cookie stored might not be the same as kill's + * argument, we may not be able to reliably tell if the process died, and there + * are probably other problems. However, if we can send a signal, it reduces + * the window, and so we include the code here. To configure it, turn on the + * DB_ENVREG_KILL_ALL #define. 
 */
#define	DB_ENVREG_KILL_ALL	0

/*
 * __envreg_register --
 *	Register an ENV handle.
 *
 *	Opens (creating if necessary) the registry file, takes the exclusive
 *	lock, and adds this process' slot.  On return with *need_recoveryp
 *	set, the exclusive lock is still held and ENV->open is expected to
 *	call back in (via __envreg_xunlock) once recovery completes.
 *
 * PUBLIC: int __envreg_register __P((ENV *, int *, u_int32_t));
 */
int
__envreg_register(env, need_recoveryp, flags)
	ENV *env;
	int *need_recoveryp;
	u_int32_t flags;
{
	DB_ENV *dbenv;
	pid_t pid;
	u_int32_t bytes, mbytes;
	int ret;
	char *pp;

	*need_recoveryp = 0;

	dbenv = env->dbenv;
	dbenv->thread_id(dbenv, &pid, NULL);
	pp = NULL;

	if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
		__db_msg(env, DB_STR_A("1524",
		    "%lu: register environment", "%lu"), (u_long)pid);

	/* Build the path name and open the registry file. */
	if ((ret = __db_appname(env,
	    DB_APP_NONE, REGISTER_FILE, NULL, &pp)) != 0)
		goto err;
	if ((ret = __os_open(env, pp, 0,
	    DB_OSO_CREATE, DB_MODE_660, &dbenv->registry)) != 0)
		goto err;

	/*
	 * Wait for an exclusive lock on the file.
	 *
	 * !!!
	 * We're locking bytes that don't yet exist, but that's OK as far as
	 * I know.
	 */
	if ((ret = REGISTRY_EXCL_LOCK(env, 0)) != 0)
		goto err;

	/*
	 * If the file size is 0, initialize the file.
	 *
	 * Run recovery if we create the file, that means we can clean up the
	 * system by removing the registry file and restarting the application.
	 */
	if ((ret = __os_ioinfo(
	    env, pp, dbenv->registry, &mbytes, &bytes, NULL)) != 0)
		goto err;
	if (mbytes == 0 && bytes == 0) {
		if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
			__db_msg(env, DB_STR_A("1525",
			    "%lu: creating %s", "%lu %s"), (u_long)pid, pp);
		*need_recoveryp = 1;
	}

	/* Register this process. */
	if ((ret = __envreg_add(env, need_recoveryp, flags)) != 0)
		goto err;

	/*
	 * Release our exclusive lock if we don't need to run recovery.  If
	 * we need to run recovery, ENV->open will call back into register
	 * code once recovery has completed.
	 */
	if (*need_recoveryp == 0 && (ret = REGISTRY_EXCL_UNLOCK(env)) != 0)
		goto err;

	if (0) {
err:		*need_recoveryp = 0;

		/*
		 * !!!
		 * Closing the file handle must release all of our locks.
		 */
		if (dbenv->registry != NULL)
			(void)__os_closehandle(env, dbenv->registry);
		dbenv->registry = NULL;
	}

	if (pp != NULL)
		__os_free(env, pp);

	return (ret);
}

/*
 * __envreg_add --
 *	Add the process' pid to the register.
 *
 *	Walks the registry checking each occupied slot is still locked by a
 *	live process; an occupied-but-lockable slot means a process died with
 *	an open handle and recovery (or failchk) is required.  Finally claims
 *	and locks the first empty slot for this process.
 */
static int
__envreg_add(env, need_recoveryp, flags)
	ENV *env;
	int *need_recoveryp;
	u_int32_t flags;
{
	DB_ENV *dbenv;
	DB_THREAD_INFO *ip;
	REGENV * renv;
	REGINFO *infop;
	pid_t pid;
	off_t end, pos, dead;
	size_t nr, nw;
	u_int lcnt;
	u_int32_t bytes, mbytes, orig_flags;
	int need_recovery, ret, t_ret;
	char *p, buf[PID_LEN + 10], pid_buf[PID_LEN + 10];

	dbenv = env->dbenv;
	need_recovery = 0;
	COMPQUIET(dead, 0);
	COMPQUIET(p, NULL);
	ip = NULL;

	/* Get a copy of our process ID. */
	dbenv->thread_id(dbenv, &pid, NULL);
	snprintf(pid_buf, sizeof(pid_buf), PID_FMT, (u_long)pid);

	if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
		__db_msg(env, DB_STR_A("1526",
		    "%lu: adding self to registry", "%lu"), (u_long)pid);

#if DB_ENVREG_KILL_ALL
	if (0) {
kill_all:	/*
		 * A second pass through the file, this time killing any
		 * processes still running.
		 */
		if ((ret = __os_seek(env, dbenv->registry, 0, 0, 0)) != 0)
			return (ret);
	}
#endif

	/*
	 * Read the file.  Skip empty slots, and check that a lock is held
	 * for any allocated slots.  An allocated slot which we can lock
	 * indicates a process died holding a handle and recovery needs to
	 * be run.
	 */
	for (lcnt = 0;; ++lcnt) {
		if ((ret = __os_read(
		    env, dbenv->registry, buf, PID_LEN, &nr)) != 0)
			return (ret);
		if (nr == 0)
			break;

		/*
		 * A partial record at the end of the file is possible if a
		 * previously un-registered process was interrupted while
		 * registering.
		 */
		if (nr != PID_LEN) {
			need_recovery = 1;
			break;
		}

		if (PID_ISEMPTY(buf)) {
			if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
				__db_msg(env, DB_STR_A("1527",
				    "%02u: EMPTY", "%02u"), lcnt);
			continue;
		}

		/*
		 * !!!
		 * DB_REGISTER is implemented using per-process locking, only
		 * a single ENV handle may be open per process.  Enforce
		 * that restriction.
		 */
		if (memcmp(buf, pid_buf, PID_LEN) == 0) {
			__db_errx(env, DB_STR("1528",
"DB_REGISTER limits processes to one open DB_ENV handle per environment"));
			return (EINVAL);
		}

		/* For verbose output only: point p past the space padding. */
		if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER)) {
			for (p = buf; *p == ' ';)
				++p;
			buf[nr - 1] = '\0';
		}

#if DB_ENVREG_KILL_ALL
		if (need_recovery) {
			pid = (pid_t)strtoul(buf, NULL, 10);
			(void)kill(pid, SIGKILL);

			if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
				__db_msg(env, DB_STR_A("1529",
				    "%02u: %s: KILLED", "%02u %s"), lcnt, p);
			continue;
		}
#endif
		pos = (off_t)lcnt * PID_LEN;
		if (REGISTRY_LOCK(env, pos, 1) == 0) {
			/*
			 * We could lock an occupied slot: its owner died
			 * without unregistering, so recovery is needed.
			 */
			if ((ret = REGISTRY_UNLOCK(env, pos)) != 0)
				return (ret);

			if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
				__db_msg(env, DB_STR_A("1530",
				    "%02u: %s: FAILED", "%02u %s"), lcnt, p);

			need_recovery = 1;
			dead = pos;
#if DB_ENVREG_KILL_ALL
			goto kill_all;
#else
			break;
#endif
		} else
			if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
				__db_msg(env, DB_STR_A("1531",
				    "%02u: %s: LOCKED", "%02u %s"), lcnt, p);
	}

	/*
	 * If we have to perform recovery...
	 *
	 * Mark all slots empty.  Registry ignores empty slots we can't lock,
	 * so it doesn't matter if any of the processes are in the middle of
	 * exiting Berkeley DB -- they'll discard their lock when they exit.
	 */
	if (need_recovery) {
		if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
			__db_msg(env, "%lu: recovery required", (u_long)pid);

		if (LF_ISSET(DB_FAILCHK) || LF_ISSET(DB_FAILCHK_ISALIVE)) {
			if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
				__db_msg(env,
				    "%lu: performing failchk", (u_long)pid);

			if (LF_ISSET(DB_FAILCHK_ISALIVE))
				if ((ret = __envreg_create_active_pid(
				    env, pid_buf)) != 0)
					goto sig_proc;

			/* The environment will already exist, so we do not
			 * want DB_CREATE set, nor do we want any recovery at
			 * this point.  No need to put values back as flags is
			 * passed in by value.  Save original dbenv flags in
			 * case we need to recover/remove existing environment.
			 * Set DB_ENV_FAILCHK before attach to help ensure we
			 * dont block on a mutex held by the dead process.
			 */
			LF_CLR(DB_CREATE | DB_RECOVER | DB_RECOVER_FATAL);
			orig_flags = dbenv->flags;
			F_SET(dbenv, DB_ENV_FAILCHK);
			/* Attach to environment and subsystems. */
			if ((ret = __env_attach_regions(
			    dbenv, flags, orig_flags, 0)) != 0)
				goto sig_proc;
			if ((t_ret =
			    __env_set_state(env, &ip, THREAD_FAILCHK)) != 0 &&
			    ret == 0)
				ret = t_ret;
			if ((t_ret =
			    __env_failchk_int(dbenv)) != 0 && ret == 0)
				ret = t_ret;

			/* Free active pid array if used. */
			if (LF_ISSET(DB_FAILCHK_ISALIVE)) {
				DB_GLOBAL(num_active_pids) = 0;
				DB_GLOBAL(size_active_pids) = 0;
				__os_free( env, DB_GLOBAL(active_pids));
			}

			/* Detach from environment and deregister thread. */
			if ((t_ret =
			    __env_refresh(dbenv, orig_flags, 0)) != 0 &&
			    ret == 0)
				ret = t_ret;
			if (ret == 0) {
				/* Failchk succeeded: clear the dead slot and
				 * register without full recovery. */
				if ((ret = __os_seek(env, dbenv->registry,
				    0, 0,(u_int32_t)dead)) != 0 ||
				    (ret = __os_write(env, dbenv->registry,
				    PID_EMPTY, PID_LEN, &nw)) != 0)
					return (ret);
				need_recovery = 0;
				goto add;
			}

		}
		/* If we can't attach, then we cannot set DB_REGISTER panic.
		 */
sig_proc:	if (__env_attach(env, NULL, 0, 0) == 0) {
			infop = env->reginfo;
			renv = infop->primary;
			/* Indicate DB_REGISTER panic.  Also, set environment
			 * panic as this is the panic trigger mechanism in
			 * the code that everything looks for.
			 */
			renv->reg_panic = 1;
			renv->panic = 1;
			(void)__env_detach(env, 0);
		}

		/* Wait for processes to see the panic and leave. */
		__os_yield(env, 0, dbenv->envreg_timeout);

		/* FIGURE out how big the file is. */
		if ((ret = __os_ioinfo(
		    env, NULL, dbenv->registry, &mbytes, &bytes, NULL)) != 0)
			return (ret);
		end = (off_t)mbytes * MEGABYTE + bytes;

		/*
		 * Seek to the beginning of the file and overwrite slots to
		 * the end of the file.
		 *
		 * It's possible for there to be a partial entry at the end of
		 * the file if a process died when trying to register.  If so,
		 * correct for it and overwrite it as well.
		 */
		if ((ret = __os_seek(env, dbenv->registry, 0, 0, 0)) != 0)
			return (ret);
		for (lcnt = 0; lcnt < ((u_int)end / PID_LEN +
		    ((u_int)end % PID_LEN == 0 ? 0 : 1)); ++lcnt) {

			if ((ret = __os_read(
			    env, dbenv->registry, buf, PID_LEN, &nr)) != 0)
				return (ret);

			pos = (off_t)lcnt * PID_LEN;
			/* do not notify on dead process */
			if (pos != dead) {
				pid = (pid_t)strtoul(buf, NULL, 10);
				DB_EVENT(env, DB_EVENT_REG_ALIVE, &pid);
			}

			if ((ret = __os_seek(env,
			    dbenv->registry, 0, 0, (u_int32_t)pos)) != 0 ||
			    (ret = __os_write(env,
			    dbenv->registry, PID_EMPTY, PID_LEN, &nw)) != 0)
				return (ret);
		}
		/* wait one last time to get everyone out */
		__os_yield(env, 0, dbenv->envreg_timeout);
	}

	/*
	 * Seek to the first process slot and add ourselves to the first empty
	 * slot we can lock.
	 */
add:	if ((ret = __os_seek(env, dbenv->registry, 0, 0, 0)) != 0)
		return (ret);
	for (lcnt = 0;; ++lcnt) {
		if ((ret = __os_read(
		    env, dbenv->registry, buf, PID_LEN, &nr)) != 0)
			return (ret);
		/* Occupied full-length slot: keep scanning.  A short or
		 * zero-length read means end of file -- claim a new slot. */
		if (nr == PID_LEN && !PID_ISEMPTY(buf))
			continue;
		pos = (off_t)lcnt * PID_LEN;
		if (REGISTRY_LOCK(env, pos, 1) == 0) {
			if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
				__db_msg(env, DB_STR_A("1532",
				    "%lu: locking slot %02u at offset %lu",
				    "%lu %02u %lu"), (u_long)pid, lcnt,
				    (u_long)pos);

			if ((ret = __os_seek(env,
			    dbenv->registry, 0, 0, (u_int32_t)pos)) != 0 ||
			    (ret = __os_write(env,
			    dbenv->registry, pid_buf, PID_LEN, &nw)) != 0)
				return (ret);
			dbenv->registry_off = (u_int32_t)pos;
			break;
		}
	}

	if (need_recovery)
		*need_recoveryp = 1;

	return (ret);
}

/*
 * __envreg_unregister --
 *	Unregister an ENV handle.
 *
 * PUBLIC: int __envreg_unregister __P((ENV *, int));
 */
int
__envreg_unregister(env, recovery_failed)
	ENV *env;
	int recovery_failed;
{
	DB_ENV *dbenv;
	size_t nw;
	int ret, t_ret;

	dbenv = env->dbenv;
	ret = 0;

	/*
	 * If recovery failed, we want to drop our locks and return, but still
	 * make sure any subsequent process doesn't decide everything is just
	 * fine and try to get into the database environment.  In the case of
	 * an error, discard our locks, but leave our slot filled-in.
	 */
	if (recovery_failed)
		goto err;

	/*
	 * Why isn't an exclusive lock necessary to discard an ENV handle?
	 *
	 * We mark our process ID slot empty before we discard the process slot
	 * lock, and threads of control reviewing the register file ignore any
	 * slots which they can't lock.
	 */
	if ((ret = __os_seek(env,
	    dbenv->registry, 0, 0, dbenv->registry_off)) != 0 ||
	    (ret = __os_write(
	    env, dbenv->registry, PID_EMPTY, PID_LEN, &nw)) != 0)
		goto err;

	/*
	 * !!!
	 * This code assumes that closing the file descriptor discards all
	 * held locks.
	 *
	 * !!!
	 * There is an ordering problem here -- in the case of a process that
	 * failed in recovery, we're unlocking both the exclusive lock and our
	 * slot lock.  If the OS unlocked the exclusive lock and then allowed
	 * another thread of control to acquire the exclusive lock before also
	 * also releasing our slot lock, we could race.  That can't happen, I
	 * don't think.
	 */
err:	if ((t_ret =
	    __os_closehandle(env, dbenv->registry)) != 0 && ret == 0)
		ret = t_ret;

	dbenv->registry = NULL;
	return (ret);
}

/*
 * __envreg_xunlock --
 *	Discard the exclusive lock held by the ENV handle.
 *
 *	Called after recovery completes; failure to unlock is treated as
 *	fatal and panics the environment.
 *
 * PUBLIC: int __envreg_xunlock __P((ENV *));
 */
int
__envreg_xunlock(env)
	ENV *env;
{
	DB_ENV *dbenv;
	pid_t pid;
	int ret;

	dbenv = env->dbenv;
	dbenv->thread_id(dbenv, &pid, NULL);

	if (FLD_ISSET(dbenv->verbose, DB_VERB_REGISTER))
		__db_msg(env, DB_STR_A("1533",
		    "%lu: recovery completed, unlocking", "%lu"), (u_long)pid);

	if ((ret = REGISTRY_EXCL_UNLOCK(env)) == 0)
		return (ret);

	__db_err(env, ret, DB_STR_A("1534",
	    "%s: exclusive file unlock", "%s"), REGISTER_FILE);
	return (__env_panic(env, ret));
}

/*
 * __envreg_pid_compare --
 *	Compare routine for qsort and bsearch calls.
 *	returns neg if key is less than membr, 0 if equal and
 *	pos if key is greater than membr.
 *
 *	NOTE(review): the subtraction can overflow for pid_t values of
 *	opposite sign with large magnitude; real PIDs are small positive
 *	integers so this is benign in practice -- confirm if ported.
 */
static int
__envreg_pid_compare(key, membr)
	const void *key;
	const void *membr;
{
	return ( *(pid_t*)key - *(pid_t*)membr );
}

/*
 * __envreg_isalive --
 *	Default isalive function that uses contents of an array of active pids
 *	gotten from the db_register file to determine if process is still
 *	alive.
 *
 * PUBLIC: int __envreg_isalive
 * PUBLIC: __P((DB_ENV *, pid_t, db_threadid_t, u_int32_t));
 */
int
__envreg_isalive(dbenv, pid, tid, flags )
	DB_ENV *dbenv;
	pid_t pid;
	db_threadid_t tid;
	u_int32_t flags;
{
	/* in this case we really do not care about tid, simply for lint */
	DB_THREADID_INIT(tid);

	/* if is not an expected value then return early */
	if (!((flags == 0) || (flags == DB_MUTEX_PROCESS_ONLY)))
		return (EINVAL);

	if (DB_GLOBAL(active_pids) == NULL ||
	    DB_GLOBAL(num_active_pids) == 0 || dbenv == NULL)
		return (0);
	/*
	 * bsearch returns a pointer to an entry in active_pids if a match
	 * is found on pid, else no match found it returns NULL.  This
	 * routine will return a 1 if a match is found, else a 0.
	 */
	if (bsearch(&pid, DB_GLOBAL(active_pids), DB_GLOBAL(num_active_pids),
	    sizeof(pid_t), __envreg_pid_compare))
		return 1;

	return (0);
}

/*
 * __envreg_create_active_pid --
 *	Create array of pids, if need more room in array then double size.
 *	Only add active pids from DB_REGISTER file into array.
 */
static int
__envreg_create_active_pid(env, my_pid)
	ENV *env;
	char *my_pid;
{
	DB_ENV *dbenv;
	char buf[PID_LEN + 10];
	int ret;
	off_t pos;
	pid_t pid, *tmparray;
	size_t tmpsize, nr;
	u_int lcnt;

	dbenv = env->dbenv;
	pos = 0;
	ret = 0;

	/*
	 * Walk through DB_REGISTER file, we grab pid entries that are locked
	 * as those represent processes that are still alive.  Ignore empty
	 * slots, or those that are unlocked.
	 */
	if ((ret = __os_seek(env, dbenv->registry, 0, 0, 0)) != 0)
		return (ret);
	for (lcnt = 0;; ++lcnt) {
		if ((ret = __os_read(
		    env, dbenv->registry, buf, PID_LEN, &nr)) != 0)
			return (ret);

		/* all done if we read nothing, or get a partial record */
		if (nr == 0 || nr != PID_LEN)
			break;
		if (PID_ISEMPTY(buf))
			continue;

		pos = (off_t)lcnt * PID_LEN;
		if (REGISTRY_LOCK(env, pos, 1) == 0) {
			/* got lock, so process died.  Do not add to array */
			if ((ret = REGISTRY_UNLOCK(env, pos)) != 0)
				return (ret);
		} else {
			/* first, check to make sure we have room in array */
			if (DB_GLOBAL(num_active_pids) + 1 >
			    DB_GLOBAL(size_active_pids)) {
				/*
				 * NOTE(review): size_active_pids is compared
				 * against an entry count above but assigned
				 * tmpsize (a byte count) below, so the array
				 * is over-allocated rather than overrun --
				 * confirm intent before changing.
				 */
				tmpsize =
				    DB_GLOBAL(size_active_pids) * sizeof(pid_t);

				/* start with 512, then double if must grow */
				tmpsize = tmpsize>0 ? tmpsize*2 : 512;
				if ((ret = __os_malloc
				    (env, tmpsize, &tmparray )) != 0)
					return (ret);

				/* if array exists, then copy and free */
				if (DB_GLOBAL(active_pids)) {
					memcpy( tmparray,
					    DB_GLOBAL(active_pids),
					    DB_GLOBAL(num_active_pids) *
					    sizeof(pid_t));
					__os_free( env, DB_GLOBAL(active_pids));
				}

				DB_GLOBAL(active_pids) = tmparray;
				DB_GLOBAL(size_active_pids) = tmpsize;

				/*
				 * The process getting here has not been added
				 * to the DB_REGISTER file yet, so include it
				 * as the first item in array
				 */
				if (DB_GLOBAL(num_active_pids) == 0) {
					pid = (pid_t)strtoul(my_pid, NULL, 10);
					DB_GLOBAL(active_pids)
					    [DB_GLOBAL(num_active_pids)++] = pid;
				}
			}

			/* insert into array */
			pid = (pid_t)strtoul(buf, NULL, 10);
			DB_GLOBAL(active_pids)
			    [DB_GLOBAL(num_active_pids)++] = pid;

		}

	}

	/* lets sort the array to allow for binary search in isalive func */
	qsort(DB_GLOBAL(active_pids), DB_GLOBAL(num_active_pids),
	    sizeof(pid_t), __envreg_pid_compare);
	return (ret);
}
diff --git a/src/env/env_sig.c b/src/env/env_sig.c
new file mode 100644
index 00000000..6d127f85
--- /dev/null
+++ b/src/env/env_sig.c
@@ -0,0 +1,201 @@
/*-
 * DO NOT EDIT: automatically built by dist/s_sig.
 *
 * $Id$
 */

#include "db_config.h"

#include "db_int.h"

#include "dbinc/db_page.h"
#include "dbinc/btree.h"
#include "dbinc/crypto.h"
#include "dbinc/db_join.h"
#include "dbinc/db_verify.h"
#include "dbinc/hash.h"
#include "dbinc/heap.h"
#include "dbinc/lock.h"
#include "dbinc/log_verify.h"
#include "dbinc/mp.h"
#include "dbinc/partition.h"
#include "dbinc/qam.h"
#include "dbinc/txn.h"

/*
 * For a pure 32bit/64bit environment, we check all structures and calculate a
 * signature.  For compatible environment, we only check the structures in
 * shared memory.
 */
#ifdef HAVE_MIXED_SIZE_ADDRESSING
#define	__STRUCTURE_COUNT	41
#else
#define	__STRUCTURE_COUNT	(41 + 104)
#endif

/*
 * __env_struct_sig --
 *	Compute signature of structures.
 *
 *	Records sizeof() for every structure of interest and hashes the
 *	resulting array; a signature mismatch between processes indicates
 *	incompatible structure layouts (compiler/options/version skew).
 *
 * PUBLIC: u_int32_t __env_struct_sig __P((void));
 */
u_int32_t
__env_struct_sig()
{
	u_short	t[__STRUCTURE_COUNT + 5];
	u_int	i;

	i = 0;
#define	__ADD(s)	(t[i++] = sizeof(struct s))

#ifdef HAVE_MUTEX_SUPPORT
	__ADD(__db_mutex_stat);
#endif
	__ADD(__db_lock_stat);
	__ADD(__db_lock_hstat);
	__ADD(__db_lock_pstat);
	__ADD(__db_ilock);
	__ADD(__db_lock_u);
	__ADD(__db_lsn);
	__ADD(__db_log_stat);
	__ADD(__db_mpool_stat);
	__ADD(__db_rep_stat);
	__ADD(__db_repmgr_stat);
	__ADD(__db_seq_stat);
	__ADD(__db_bt_stat);
	__ADD(__db_h_stat);
	__ADD(__db_heap_stat);
	__ADD(__db_qam_stat);
	__ADD(__db_thread_info);
	__ADD(__db_lockregion);
	__ADD(__sh_dbt);
	__ADD(__db_lockobj);
	__ADD(__db_locker);
	__ADD(__db_lockpart);
	__ADD(__db_lock);
	__ADD(__log);
	__ADD(__mpool);
	__ADD(__db_mpool_fstat_int);
	__ADD(__mpoolfile);
	__ADD(__bh);
#ifdef HAVE_MUTEX_SUPPORT
	__ADD(__db_mutexregion);
#endif
#ifdef HAVE_MUTEX_SUPPORT
	__ADD(__db_mutex_t);
#endif
	__ADD(__db_reg_env);
	__ADD(__db_region);
	__ADD(__rep);
	__ADD(__db_txn_stat_int);
	__ADD(__db_txnregion);

/* Non-shared structures: only checked for pure 32/64-bit environments. */
#ifndef HAVE_MIXED_SIZE_ADDRESSING
	__ADD(__db_dbt);
	__ADD(__db_lockreq);
	__ADD(__db_log_cursor);
	__ADD(__log_rec_spec);
	__ADD(__db_mpoolfile);
	__ADD(__db_mpool_fstat);
	__ADD(__db_txn);
	__ADD(__kids);
	__ADD(__my_cursors);
	__ADD(__femfs);
	__ADD(__db_preplist);
	__ADD(__db_txn_active);
	__ADD(__db_txn_stat);
	__ADD(__db_txn_token);
	__ADD(__db_repmgr_site);
	__ADD(__db_repmgr_conn_err);
	__ADD(__db_seq_record);
	__ADD(__db_sequence);
	__ADD(__db);
	__ADD(__cq_fq);
	__ADD(__cq_aq);
	__ADD(__cq_jq);
	__ADD(__db_heap_rid);
	__ADD(__dbc);
	__ADD(__key_range);
	__ADD(__db_compact);
	__ADD(__db_env);
	__ADD(__db_distab);
	__ADD(__db_logvrfy_config);
	__ADD(__db_channel);
	__ADD(__db_site);
	__ADD(__fn);
	__ADD(__db_msgbuf);
	__ADD(__pin_list);
	__ADD(__env_thread_info);
	__ADD(__flag_map);
	__ADD(__db_backup_handle);
	__ADD(__env);
	__ADD(__dbc_internal);
	__ADD(__dbpginfo);
	__ADD(__epg);
	__ADD(__cursor);
	__ADD(__btree);
	__ADD(__db_cipher);
	__ADD(__db_foreign_info);
	__ADD(__db_txnhead);
	__ADD(__db_txnlist);
	__ADD(__join_cursor);
	__ADD(__pg_chksum);
	__ADD(__pg_crypto);
	__ADD(__heaphdr);
	__ADD(__heaphdrsplt);
	__ADD(__pglist);
	__ADD(__vrfy_dbinfo);
	__ADD(__vrfy_pageinfo);
	__ADD(__vrfy_childinfo);
	__ADD(__db_globals);
	__ADD(__envq);
	__ADD(__heap);
	__ADD(__heap_cursor);
	__ADD(__db_locktab);
	__ADD(__db_entry);
	__ADD(__fname);
	__ADD(__db_log);
	__ADD(__hdr);
	__ADD(__log_persist);
	__ADD(__db_commit);
	__ADD(__db_filestart);
	__ADD(__log_rec_hdr);
	__ADD(__db_log_verify_info);
	__ADD(__txn_verify_info);
	__ADD(__lv_filereg_info);
	__ADD(__lv_filelife);
	__ADD(__lv_ckp_info);
	__ADD(__lv_timestamp_info);
	__ADD(__lv_txnrange);
	__ADD(__add_recycle_params);
	__ADD(__ckp_verify_params);
	__ADD(__db_mpool);
	__ADD(__db_mpreg);
	__ADD(__db_mpool_hash);
	__ADD(__bh_frozen_p);
	__ADD(__bh_frozen_a);
#ifdef HAVE_MUTEX_SUPPORT
	__ADD(__db_mutexmgr);
#endif
	__ADD(__fh_t);
	__ADD(__db_partition);
	__ADD(__part_internal);
	__ADD(__qcursor);
	__ADD(__mpfarray);
	__ADD(__qmpf);
	__ADD(__queue);
	__ADD(__qam_filelist);
	__ADD(__db_reg_env_ref);
	__ADD(__db_region_mem_t);
	__ADD(__db_reginfo_t);
	__ADD(__rep_waiter);
	__ADD(__db_rep);
	__ADD(__rep_lease_entry);
	__ADD(__txn_detail);
	__ADD(__db_txnmgr);
	__ADD(__db_commit_info);
	__ADD(__txn_logrec);
#endif

	return (__ham_func5(NULL, t, i * sizeof(t[0])));
}
diff --git a/src/env/env_stat.c b/src/env/env_stat.c
new file mode 100644
index 00000000..9bc3fe7e
--- /dev/null
+++ b/src/env/env_stat.c
@@ -0,0 +1,879 @@
/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996, 2012 Oracle and/or its affiliates.  All rights reserved.
 *
 * $Id$
 */

#include "db_config.h"

#include "db_int.h"
#include "dbinc/db_page.h"
#include "dbinc/db_am.h"
#include "dbinc/lock.h"
#include "dbinc/mp.h"
#include "dbinc/txn.h"

#ifdef HAVE_STATISTICS
static int __env_print_all __P((ENV *, u_int32_t));
static int __env_print_dbenv_all __P((ENV *, u_int32_t));
static int __env_print_env_all __P((ENV *, u_int32_t));
static int __env_print_fh __P((ENV *));
static int __env_print_stats __P((ENV *, u_int32_t));
static int __env_print_thread __P((ENV *));
static int __env_stat_print __P((ENV *, u_int32_t));
static char *__env_thread_state_print __P((DB_THREAD_STATE));
static const char *
	__reg_type __P((reg_type_t));

/*
 * __env_stat_print_pp --
 *	ENV->stat_print pre/post processor.
 *
 * PUBLIC: int __env_stat_print_pp __P((DB_ENV *, u_int32_t));
 */
int
__env_stat_print_pp(dbenv, flags)
	DB_ENV *dbenv;
	u_int32_t flags;
{
	DB_THREAD_INFO *ip;
	ENV *env;
	int ret;

	env = dbenv->env;

	ENV_ILLEGAL_BEFORE_OPEN(env, "DB_ENV->stat_print");

	/* Reject any flags outside the documented set. */
	if ((ret = __db_fchk(env, "DB_ENV->stat_print",
	    flags, DB_STAT_ALL | DB_STAT_ALLOC |
	    DB_STAT_CLEAR | DB_STAT_SUBSYSTEM)) != 0)
		return (ret);

	ENV_ENTER(env, ip);
	REPLICATION_WRAP(env, (__env_stat_print(env, flags)), 0, ret);
	ENV_LEAVE(env, ip);
	return (ret);
}

/*
 * __env_stat_print --
 *	ENV->stat_print method.
 *
 *	Prints the environment summary, then (with DB_STAT_SUBSYSTEM) the
 *	per-subsystem statistics for each subsystem that is configured.
 */
static int
__env_stat_print(env, flags)
	ENV *env;
	u_int32_t flags;
{
	time_t now;
	int ret;
	char time_buf[CTIME_BUFLEN];

	(void)time(&now);
	__db_msg(env, "%.24s\tLocal time", __os_ctime(&now, time_buf));

	if ((ret = __env_print_stats(env, flags)) != 0)
		return (ret);

	if (LF_ISSET(DB_STAT_ALL) &&
	    (ret = __env_print_all(env, flags)) != 0)
		return (ret);

	if ((ret = __env_print_thread(env)) != 0)
		return (ret);

	if ((ret = __env_print_fh(env)) != 0)
		return (ret);

	if (!LF_ISSET(DB_STAT_SUBSYSTEM))
		return (0);

	if (LOGGING_ON(env)) {
		__db_msg(env, "%s", DB_GLOBAL(db_line));
		if ((ret = __log_stat_print(env, flags)) != 0)
			return (ret);

		__db_msg(env, "%s", DB_GLOBAL(db_line));
		if ((ret = __dbreg_stat_print(env, flags)) != 0)
			return (ret);
	}

	if (LOCKING_ON(env)) {
		__db_msg(env, "%s", DB_GLOBAL(db_line));
		if ((ret = __lock_stat_print(env, flags)) != 0)
			return (ret);
	}

	if (MPOOL_ON(env)) {
		__db_msg(env, "%s", DB_GLOBAL(db_line));
		if ((ret = __memp_stat_print(env, flags)) != 0)
			return (ret);
	}

	if (REP_ON(env)) {
		__db_msg(env, "%s", DB_GLOBAL(db_line));
		if ((ret = __rep_stat_print(env, flags)) != 0)
			return (ret);
#ifdef HAVE_REPLICATION_THREADS
		if ((ret = __repmgr_stat_print(env, flags)) != 0)
			return (ret);
#endif
	}

	if (TXN_ON(env)) {
		__db_msg(env, "%s", DB_GLOBAL(db_line));
		if ((ret = __txn_stat_print(env, flags)) != 0)
			return (ret);
	}

#ifdef HAVE_MUTEX_SUPPORT
	/*
	 * Dump the mutexes last.  If DB_STAT_CLEAR is set this will
	 * clear out the mutex counters and we want to see them in
	 * the context of the other subsystems first.
	 */
	if (MUTEX_ON(env)) {
		__db_msg(env, "%s", DB_GLOBAL(db_line));
		if ((ret = __mutex_stat_print(env, flags)) != 0)
			return (ret);
	}
#endif

	return (0);
}

/*
 * __env_print_stats --
 *	Display the default environment statistics.
 *
 */
static int
__env_print_stats(env, flags)
	ENV *env;
	u_int32_t flags;
{
	REGENV *renv;
	REGINFO *infop;
	char time_buf[CTIME_BUFLEN];

	infop = env->reginfo;
	renv = infop->primary;

	if (LF_ISSET(DB_STAT_ALL)) {
		__db_msg(env, "%s", DB_GLOBAL(db_line));
		__db_msg(env, "Default database environment information:");
	}
	STAT_HEX("Magic number", renv->magic);
	STAT_LONG("Panic value", renv->panic);
	__db_msg(env, "%d.%d.%d\tEnvironment version",
	    renv->majver, renv->minver, renv->patchver);
	STAT_LONG("Btree version", DB_BTREEVERSION);
	STAT_LONG("Hash version", DB_HASHVERSION);
	STAT_LONG("Lock version", DB_LOCKVERSION);
	STAT_LONG("Log version", DB_LOGVERSION);
	STAT_LONG("Queue version", DB_QAMVERSION);
	STAT_LONG("Sequence version", DB_SEQUENCE_VERSION);
	STAT_LONG("Txn version", DB_TXNVERSION);
	__db_msg(env,
	    "%.24s\tCreation time", __os_ctime(&renv->timestamp, time_buf));
	STAT_HEX("Environment ID", renv->envid);
	__mutex_print_debug_single(env,
	    "Primary region allocation and reference count mutex",
	    renv->mtx_regenv, flags);
	STAT_LONG("References", renv->refcnt);
	__db_dlbytes(env, "Current region size",
	    (u_long)0, (u_long)0, (u_long)infop->rp->size);
	__db_dlbytes(env, "Maximum region size",
	    (u_long)0, (u_long)0, (u_long)infop->rp->max);

	return (0);
}

/*
 * __env_print_all --
 *	Display the debugging environment statistics.
+ */ +static int +__env_print_all(env, flags) + ENV *env; + u_int32_t flags; +{ + int ret, t_ret; + + /* + * There are two structures -- DB_ENV and ENV. + */ + ret = __env_print_dbenv_all(env, flags); + if ((t_ret = __env_print_env_all(env, flags)) != 0 && ret == 0) + ret = t_ret; + + return (ret); +} + +/* + * __env_print_dbenv_all -- + * Display the debugging environment statistics. + */ +static int +__env_print_dbenv_all(env, flags) + ENV *env; + u_int32_t flags; +{ + static const FN db_env_fn[] = { + { DB_ENV_AUTO_COMMIT, "DB_ENV_AUTO_COMMIT" }, + { DB_ENV_CDB_ALLDB, "DB_ENV_CDB_ALLDB" }, + { DB_ENV_DIRECT_DB, "DB_ENV_DIRECT_DB" }, + { DB_ENV_DSYNC_DB, "DB_ENV_DSYNC_DB" }, + { DB_ENV_MULTIVERSION, "DB_ENV_MULTIVERSION" }, + { DB_ENV_NOLOCKING, "DB_ENV_NOLOCKING" }, + { DB_ENV_NOMMAP, "DB_ENV_NOMMAP" }, + { DB_ENV_NOPANIC, "DB_ENV_NOPANIC" }, + { DB_ENV_OVERWRITE, "DB_ENV_OVERWRITE" }, + { DB_ENV_REGION_INIT, "DB_ENV_REGION_INIT" }, + { DB_ENV_TIME_NOTGRANTED, "DB_ENV_TIME_NOTGRANTED" }, + { DB_ENV_TXN_NOSYNC, "DB_ENV_TXN_NOSYNC" }, + { DB_ENV_TXN_NOWAIT, "DB_ENV_TXN_NOWAIT" }, + { DB_ENV_TXN_SNAPSHOT, "DB_ENV_TXN_SNAPSHOT" }, + { DB_ENV_TXN_WRITE_NOSYNC, "DB_ENV_TXN_WRITE_NOSYNC" }, + { DB_ENV_YIELDCPU, "DB_ENV_YIELDCPU" }, + { 0, NULL } + }; + static const FN vfn[] = { + { DB_VERB_DEADLOCK, "DB_VERB_DEADLOCK" }, + { DB_VERB_FILEOPS, "DB_VERB_FILEOPS" }, + { DB_VERB_FILEOPS_ALL, "DB_VERB_FILEOPS_ALL" }, + { DB_VERB_RECOVERY, "DB_VERB_RECOVERY" }, + { DB_VERB_REGISTER, "DB_VERB_REGISTER" }, + { DB_VERB_REPLICATION, "DB_VERB_REPLICATION" }, + { DB_VERB_REP_ELECT, "DB_VERB_REP_ELECT" }, + { DB_VERB_REP_LEASE, "DB_VERB_REP_LEASE" }, + { DB_VERB_REP_MISC, "DB_VERB_REP_MISC" }, + { DB_VERB_REP_MSGS, "DB_VERB_REP_MSGS" }, + { DB_VERB_REP_SYNC, "DB_VERB_REP_SYNC" }, + { DB_VERB_REP_SYSTEM, "DB_VERB_REP_SYSTEM" }, + { DB_VERB_REP_TEST, "DB_VERB_REP_TEST" }, + { DB_VERB_REPMGR_CONNFAIL, "DB_VERB_REPMGR_CONNFAIL" }, + { DB_VERB_REPMGR_MISC, "DB_VERB_REPMGR_MISC" }, + { 
DB_VERB_WAITSFOR, "DB_VERB_WAITSFOR" }, + { 0, NULL } + }; + DB_ENV *dbenv; + DB_MSGBUF mb; + char **p; + + dbenv = env->dbenv; + DB_MSGBUF_INIT(&mb); + + __db_msg(env, "%s", DB_GLOBAL(db_line)); + STAT_POINTER("ENV", dbenv->env); + __mutex_print_debug_single( + env, "DB_ENV handle mutex", dbenv->mtx_db_env, flags); + STAT_ISSET("Errcall", dbenv->db_errcall); + STAT_ISSET("Errfile", dbenv->db_errfile); + STAT_STRING("Errpfx", dbenv->db_errpfx); + STAT_ISSET("Msgfile", dbenv->db_msgfile); + STAT_ISSET("Msgcall", dbenv->db_msgcall); + + STAT_ISSET("AppDispatch", dbenv->app_dispatch); + STAT_ISSET("Event", dbenv->db_event_func); + STAT_ISSET("Feedback", dbenv->db_feedback); + STAT_ISSET("Free", dbenv->db_free); + STAT_ISSET("Panic", dbenv->db_paniccall); + STAT_ISSET("Malloc", dbenv->db_malloc); + STAT_ISSET("Realloc", dbenv->db_realloc); + STAT_ISSET("IsAlive", dbenv->is_alive); + STAT_ISSET("ThreadId", dbenv->thread_id); + STAT_ISSET("ThreadIdString", dbenv->thread_id_string); + + STAT_STRING("Log dir", dbenv->db_log_dir); + STAT_STRING("Metadata dir", dbenv->db_md_dir); + STAT_STRING("Tmp dir", dbenv->db_tmp_dir); + if (dbenv->db_data_dir == NULL) + STAT_ISSET("Data dir", dbenv->db_data_dir); + else { + for (p = dbenv->db_data_dir; *p != NULL; ++p) + __db_msgadd(env, &mb, "%s\tData dir", *p); + DB_MSGBUF_FLUSH(env, &mb); + } + + STAT_STRING( + "Intermediate directory mode", dbenv->intermediate_dir_mode); + + STAT_LONG("Shared memory key", dbenv->shm_key); + + STAT_ISSET("Password", dbenv->passwd); + + STAT_ISSET("App private", dbenv->app_private); + STAT_ISSET("Api1 internal", dbenv->api1_internal); + STAT_ISSET("Api2 internal", dbenv->api2_internal); + + __db_prflags(env, NULL, dbenv->verbose, vfn, NULL, "\tVerbose flags"); + + STAT_ULONG("Mutex align", dbenv->mutex_align); + STAT_ULONG("Mutex cnt", dbenv->mutex_cnt); + STAT_ULONG("Mutex inc", dbenv->mutex_inc); + STAT_ULONG("Mutex tas spins", dbenv->mutex_tas_spins); + + STAT_ISSET("Lock conflicts", 
dbenv->lk_conflicts); + STAT_LONG("Lock modes", dbenv->lk_modes); + STAT_ULONG("Lock detect", dbenv->lk_detect); + STAT_ULONG("Lock init", dbenv->lk_init); + STAT_ULONG("Lock init lockers", dbenv->lk_init_lockers); + STAT_ULONG("Lock init objects", dbenv->lk_init_objects); + STAT_ULONG("Lock max", dbenv->lk_max); + STAT_ULONG("Lock max lockers", dbenv->lk_max_lockers); + STAT_ULONG("Lock max objects", dbenv->lk_max_objects); + STAT_ULONG("Lock partitions", dbenv->lk_partitions); + STAT_ULONG("Lock object hash table size", dbenv->object_t_size); + STAT_ULONG("Lock timeout", dbenv->lk_timeout); + + STAT_ULONG("Log bsize", dbenv->lg_bsize); + STAT_FMT("Log file mode", "%#o", int, dbenv->lg_filemode); + STAT_ULONG("Log region max", dbenv->lg_regionmax); + STAT_ULONG("Log size", dbenv->lg_size); + + STAT_ULONG("Cache GB", dbenv->mp_gbytes); + STAT_ULONG("Cache B", dbenv->mp_bytes); + STAT_ULONG("Cache max GB", dbenv->mp_max_gbytes); + STAT_ULONG("Cache max B", dbenv->mp_max_bytes); + STAT_ULONG("Cache mmap size", dbenv->mp_mmapsize); + STAT_ULONG("Cache max open fd", dbenv->mp_maxopenfd); + STAT_ULONG("Cache max write", dbenv->mp_maxwrite); + STAT_ULONG("Cache number", dbenv->mp_ncache); + STAT_ULONG("Cache max write sleep", dbenv->mp_maxwrite_sleep); + + STAT_ULONG("Txn init", dbenv->tx_init); + STAT_ULONG("Txn max", dbenv->tx_max); + STAT_ULONG("Txn timestamp", dbenv->tx_timestamp); + STAT_ULONG("Txn timeout", dbenv->tx_timeout); + + STAT_ULONG("Thread count", dbenv->thr_max); + + STAT_ISSET("Registry", dbenv->registry); + STAT_ULONG("Registry offset", dbenv->registry_off); + STAT_ULONG("Registry timeout", dbenv->envreg_timeout); + + __db_prflags(env, + NULL, dbenv->flags, db_env_fn, NULL, "\tPublic environment flags"); + + return (0); +} + +/* + * __env_print_env_all -- + * Display the debugging environment statistics. 
+ */ +static int +__env_print_env_all(env, flags) + ENV *env; + u_int32_t flags; +{ + static const FN env_fn[] = { + { ENV_CDB, "ENV_CDB" }, + { ENV_DBLOCAL, "ENV_DBLOCAL" }, + { ENV_LOCKDOWN, "ENV_LOCKDOWN" }, + { ENV_NO_OUTPUT_SET, "ENV_NO_OUTPUT_SET" }, + { ENV_OPEN_CALLED, "ENV_OPEN_CALLED" }, + { ENV_PRIVATE, "ENV_PRIVATE" }, + { ENV_RECOVER_FATAL, "ENV_RECOVER_FATAL" }, + { ENV_REF_COUNTED, "ENV_REF_COUNTED" }, + { ENV_SYSTEM_MEM, "ENV_SYSTEM_MEM" }, + { ENV_THREAD, "ENV_THREAD" }, + { 0, NULL } + }; + static const FN ofn[] = { + { DB_CREATE, "DB_CREATE" }, + { DB_FORCE, "DB_FORCE" }, + { DB_INIT_CDB, "DB_INIT_CDB" }, + { DB_INIT_LOCK, "DB_INIT_LOCK" }, + { DB_INIT_LOG, "DB_INIT_LOG" }, + { DB_INIT_MPOOL, "DB_INIT_MPOOL" }, + { DB_INIT_REP, "DB_INIT_REP" }, + { DB_INIT_TXN, "DB_INIT_TXN" }, + { DB_LOCKDOWN, "DB_LOCKDOWN" }, + { DB_NOMMAP, "DB_NOMMAP" }, + { DB_PRIVATE, "DB_PRIVATE" }, + { DB_RDONLY, "DB_RDONLY" }, + { DB_RECOVER, "DB_RECOVER" }, + { DB_RECOVER_FATAL, "DB_RECOVER_FATAL" }, + { DB_SYSTEM_MEM, "DB_SYSTEM_MEM" }, + { DB_THREAD, "DB_THREAD" }, + { DB_TRUNCATE, "DB_TRUNCATE" }, + { DB_TXN_NOSYNC, "DB_TXN_NOSYNC" }, + { DB_USE_ENVIRON, "DB_USE_ENVIRON" }, + { DB_USE_ENVIRON_ROOT, "DB_USE_ENVIRON_ROOT" }, + { 0, NULL } + }; + static const FN regenvfn[] = { + { DB_REGENV_REPLOCKED, "DB_REGENV_REPLOCKED" }, + { 0, NULL } + }; + REGENV *renv; + REGINFO *infop; + REGION *rp; + u_int32_t i; + char time_buf[CTIME_BUFLEN]; + + infop = env->reginfo; + renv = infop->primary; + + __db_msg(env, "%s", DB_GLOBAL(db_line)); + STAT_POINTER("DB_ENV", env->dbenv); + __mutex_print_debug_single( + env, "ENV handle mutex", env->mtx_env, flags); + + STAT_STRING("Home", env->db_home); + __db_prflags(env, NULL, env->open_flags, ofn, NULL, "\tOpen flags"); + STAT_FMT("Mode", "%#o", int, env->db_mode); + + STAT_ULONG("Pid cache", env->pid_cache); + + STAT_ISSET("Lockfhp", env->lockfhp); + + STAT_ISSET("Locker", env->env_lref); + + STAT_ISSET("Internal recovery table", 
env->recover_dtab.int_dispatch); + STAT_ULONG("Number of recovery table slots", + env->recover_dtab.int_size); + STAT_ISSET("External recovery table", env->recover_dtab.ext_dispatch); + STAT_ULONG("Number of recovery table slots", + env->recover_dtab.ext_size); + + STAT_ULONG("Thread hash buckets", env->thr_nbucket); + STAT_ISSET("Thread hash table", env->thr_hashtab); + + __mutex_print_debug_single( + env, "ENV list of DB handles mutex", env->mtx_dblist, flags); + STAT_LONG("DB reference count", env->db_ref); + + __mutex_print_debug_single(env, "MT mutex", env->mtx_mt, flags); + + STAT_ISSET("Crypto handle", env->crypto_handle); + STAT_ISSET("Lock handle", env->lk_handle); + STAT_ISSET("Log handle", env->lg_handle); + STAT_ISSET("Cache handle", env->mp_handle); + STAT_ISSET("Mutex handle", env->mutex_handle); + STAT_ISSET("Replication handle", env->rep_handle); + STAT_ISSET("Txn handle", env->tx_handle); + + STAT_ISSET("User copy", env->dbt_usercopy); + + STAT_LONG("Test abort", env->test_abort); + STAT_LONG("Test check", env->test_check); + STAT_LONG("Test copy", env->test_copy); + + __db_prflags(env, + NULL, env->flags, env_fn, NULL, "\tPrivate environment flags"); + + __db_print_reginfo(env, infop, "Primary", flags); + __db_msg(env, "%s", DB_GLOBAL(db_line)); + __db_msg(env, "Per region database environment information:"); + for (rp = R_ADDR(infop, renv->region_off), + i = 0; i < renv->region_cnt; ++i, ++rp) { + if (rp->id == INVALID_REGION_ID) + continue; + __db_msg(env, "%s Region:", __reg_type(rp->type)); + STAT_LONG("Region ID", rp->id); + STAT_LONG("Segment ID", rp->segid); + __db_dlbytes(env, + "Size", (u_long)0, (u_long)0, (u_long)rp->size); + } + __db_prflags(env, + NULL, renv->init_flags, ofn, NULL, "\tInitialization flags"); + STAT_ULONG("Region slots", renv->region_cnt); + __db_prflags(env, + NULL, renv->flags, regenvfn, NULL, "\tReplication flags"); + __db_msg(env, "%.24s\tOperation timestamp", + renv->op_timestamp == 0 ? 
+ "!Set" : __os_ctime(&renv->op_timestamp, time_buf)); + __db_msg(env, "%.24s\tReplication timestamp", + renv->rep_timestamp == 0 ? + "!Set" : __os_ctime(&renv->rep_timestamp, time_buf)); + + return (0); +} + +static char * +__env_thread_state_print(state) + DB_THREAD_STATE state; +{ + switch (state) { + case THREAD_ACTIVE: + return ("active"); + case THREAD_BLOCKED: + return ("blocked"); + case THREAD_BLOCKED_DEAD: + return ("blocked and dead"); + case THREAD_OUT: + return ("out"); + default: + return ("unknown"); + } + /* NOTREACHED */ +} + +/* + * __env_print_thread -- + * Display the thread block state. + */ +static int +__env_print_thread(env) + ENV *env; +{ + BH *bhp; + DB_ENV *dbenv; + DB_HASHTAB *htab; + DB_MPOOL *dbmp; + DB_THREAD_INFO *ip; + PIN_LIST *list, *lp; + REGENV *renv; + REGINFO *infop; + THREAD_INFO *thread; + u_int32_t i; + char buf[DB_THREADID_STRLEN]; + + dbenv = env->dbenv; + + /* The thread table may not be configured. */ + if ((htab = env->thr_hashtab) == NULL) + return (0); + + dbmp = env->mp_handle; + __db_msg(env, "%s", DB_GLOBAL(db_line)); + __db_msg(env, "Thread tracking information"); + + /* Dump out the info we have on thread tracking. */ + infop = env->reginfo; + renv = infop->primary; + thread = R_ADDR(infop, renv->thread_off); + STAT_ULONG("Thread blocks allocated", thread->thr_count); + STAT_ULONG("Thread allocation threshold", thread->thr_max); + STAT_ULONG("Thread hash buckets", thread->thr_nbucket); + + /* Dump out the info we have on active threads. 
*/ + __db_msg(env, "Thread status blocks:"); + for (i = 0; i < env->thr_nbucket; i++) + SH_TAILQ_FOREACH(ip, &htab[i], dbth_links, __db_thread_info) { + if (ip->dbth_state == THREAD_SLOT_NOT_IN_USE) + continue; + __db_msg(env, "\tprocess/thread %s: %s", + dbenv->thread_id_string( + dbenv, ip->dbth_pid, ip->dbth_tid, buf), + __env_thread_state_print(ip->dbth_state)); + list = R_ADDR(env->reginfo, ip->dbth_pinlist); + for (lp = list; lp < &list[ip->dbth_pinmax]; lp++) { + if (lp->b_ref == INVALID_ROFF) + continue; + bhp = R_ADDR( + &dbmp->reginfo[lp->region], lp->b_ref); + __db_msg(env, + "\t\tpins: %lu", (u_long)bhp->pgno); + } + } + return (0); +} + +/* + * __env_print_fh -- + * Display statistics for all handles open in this environment. + */ +static int +__env_print_fh(env) + ENV *env; +{ + DB_FH *fhp; + + if (TAILQ_FIRST(&env->fdlist) == NULL) + return (0); + + __db_msg(env, "%s", DB_GLOBAL(db_line)); + __db_msg(env, "Environment file handle information"); + + MUTEX_LOCK(env, env->mtx_env); + + TAILQ_FOREACH(fhp, &env->fdlist, q) + __db_print_fh(env, NULL, fhp, 0); + + MUTEX_UNLOCK(env, env->mtx_env); + + return (0); +} + +/* + * __db_print_fh -- + * Print out a file handle. 
+ * + * PUBLIC: void __db_print_fh __P((ENV *, const char *, DB_FH *, u_int32_t)); + */ +void +__db_print_fh(env, tag, fh, flags) + ENV *env; + const char *tag; + DB_FH *fh; + u_int32_t flags; +{ + static const FN fn[] = { + { DB_FH_NOSYNC, "DB_FH_NOSYNC" }, + { DB_FH_OPENED, "DB_FH_OPENED" }, + { DB_FH_UNLINK, "DB_FH_UNLINK" }, + { 0, NULL } + }; + + if (fh == NULL) { + STAT_ISSET(tag, fh); + return; + } + + STAT_STRING("file-handle.file name", fh->name); + + __mutex_print_debug_single( + env, "file-handle.mutex", fh->mtx_fh, flags); + + STAT_LONG("file-handle.reference count", fh->ref); + STAT_LONG("file-handle.file descriptor", fh->fd); + + STAT_ULONG("file-handle.page number", fh->pgno); + STAT_ULONG("file-handle.page size", fh->pgsize); + STAT_ULONG("file-handle.page offset", fh->offset); + + STAT_ULONG("file-handle.seek count", fh->seek_count); + STAT_ULONG("file-handle.read count", fh->read_count); + STAT_ULONG("file-handle.write count", fh->write_count); + + __db_prflags(env, NULL, fh->flags, fn, NULL, "\tfile-handle.flags"); +} + +/* + * __db_print_fileid -- + * Print out a file ID. + * + * PUBLIC: void __db_print_fileid __P((ENV *, u_int8_t *, const char *)); + */ +void +__db_print_fileid(env, id, suffix) + ENV *env; + u_int8_t *id; + const char *suffix; +{ + DB_MSGBUF mb; + int i; + + if (id == NULL) { + STAT_ISSET("ID", id); + return; + } + + DB_MSGBUF_INIT(&mb); + for (i = 0; i < DB_FILE_ID_LEN; ++i, ++id) { + __db_msgadd(env, &mb, "%x", (u_int)*id); + if (i < DB_FILE_ID_LEN - 1) + __db_msgadd(env, &mb, " "); + } + if (suffix != NULL) + __db_msgadd(env, &mb, "%s", suffix); + DB_MSGBUF_FLUSH(env, &mb); +} + +/* + * __db_dl -- + * Display a big value. + * + * PUBLIC: void __db_dl __P((ENV *, const char *, u_long)); + */ +void +__db_dl(env, msg, value) + ENV *env; + const char *msg; + u_long value; +{ + /* + * Two formats: if less than 10 million, display as the number, if + * greater than 10 million display as ###M. 
+ */ + if (value < 10000000) + __db_msg(env, "%lu\t%s", value, msg); + else + __db_msg(env, "%luM\t%s (%lu)", value / 1000000, msg, value); +} + +/* + * __db_dl_pct -- + * Display a big value, and related percentage. + * + * PUBLIC: void __db_dl_pct + * PUBLIC: __P((ENV *, const char *, u_long, int, const char *)); + */ +void +__db_dl_pct(env, msg, value, pct, tag) + ENV *env; + const char *msg, *tag; + u_long value; + int pct; +{ + DB_MSGBUF mb; + + DB_MSGBUF_INIT(&mb); + + /* + * Two formats: if less than 10 million, display as the number, if + * greater than 10 million, round it off and display as ###M. + */ + if (value < 10000000) + __db_msgadd(env, &mb, "%lu\t%s", value, msg); + else + __db_msgadd(env, + &mb, "%luM\t%s", (value + 500000) / 1000000, msg); + if (tag == NULL) + __db_msgadd(env, &mb, " (%d%%)", pct); + else + __db_msgadd(env, &mb, " (%d%% %s)", pct, tag); + + DB_MSGBUF_FLUSH(env, &mb); +} + +/* + * __db_dlbytes -- + * Display a big number of bytes. + * + * PUBLIC: void __db_dlbytes + * PUBLIC: __P((ENV *, const char *, u_long, u_long, u_long)); + */ +void +__db_dlbytes(env, msg, gbytes, mbytes, bytes) + ENV *env; + const char *msg; + u_long gbytes, mbytes, bytes; +{ + DB_MSGBUF mb; + const char *sep; + + DB_MSGBUF_INIT(&mb); + + /* Normalize the values. 
*/ + while (bytes >= MEGABYTE) { + ++mbytes; + bytes -= MEGABYTE; + } + while (mbytes >= GIGABYTE / MEGABYTE) { + ++gbytes; + mbytes -= GIGABYTE / MEGABYTE; + } + + if (gbytes == 0 && mbytes == 0 && bytes == 0) + __db_msgadd(env, &mb, "0"); + else { + sep = ""; + if (gbytes > 0) { + __db_msgadd(env, &mb, "%luGB", gbytes); + sep = " "; + } + if (mbytes > 0) { + __db_msgadd(env, &mb, "%s%luMB", sep, mbytes); + sep = " "; + } + if (bytes >= 1024) { + __db_msgadd(env, &mb, "%s%luKB", sep, bytes / 1024); + bytes %= 1024; + sep = " "; + } + if (bytes > 0) + __db_msgadd(env, &mb, "%s%luB", sep, bytes); + } + + __db_msgadd(env, &mb, "\t%s", msg); + + DB_MSGBUF_FLUSH(env, &mb); +} + +/* + * __db_print_reginfo -- + * Print out underlying shared region information. + * + * PUBLIC: void __db_print_reginfo + * PUBLIC: __P((ENV *, REGINFO *, const char *, u_int32_t)); + */ +void +__db_print_reginfo(env, infop, s, flags) + ENV *env; + REGINFO *infop; + const char *s; + u_int32_t flags; +{ + static const FN fn[] = { + { REGION_CREATE, "REGION_CREATE" }, + { REGION_CREATE_OK, "REGION_CREATE_OK" }, + { REGION_JOIN_OK, "REGION_JOIN_OK" }, + { REGION_SHARED, "REGION_SHARED" }, + { 0, NULL } + }; + + __db_msg(env, "%s", DB_GLOBAL(db_line)); + __db_msg(env, "%s REGINFO information:", s); + STAT_STRING("Region type", __reg_type(infop->type)); + STAT_ULONG("Region ID", infop->id); + STAT_STRING("Region name", infop->name); + STAT_POINTER("Region address", infop->addr); + STAT_POINTER("Region allocation head", infop->head); + STAT_POINTER("Region primary address", infop->primary); + STAT_ULONG("Region maximum allocation", infop->max_alloc); + STAT_ULONG("Region allocated", infop->allocated); + __env_alloc_print(infop, flags); + + __db_prflags(env, NULL, infop->flags, fn, NULL, "\tRegion flags"); +} + +/* + * __reg_type -- + * Return the region type string. 
+ */ +static const char * +__reg_type(t) + reg_type_t t; +{ + switch (t) { + case REGION_TYPE_ENV: + return ("Environment"); + case REGION_TYPE_LOCK: + return ("Lock"); + case REGION_TYPE_LOG: + return ("Log"); + case REGION_TYPE_MPOOL: + return ("Mpool"); + case REGION_TYPE_MUTEX: + return ("Mutex"); + case REGION_TYPE_TXN: + return ("Transaction"); + case INVALID_REGION_TYPE: + return ("Invalid"); + } + return ("Unknown"); +} + +#else /* !HAVE_STATISTICS */ + +/* + * __db_stat_not_built -- + * Common error routine when library not built with statistics. + * + * PUBLIC: int __db_stat_not_built __P((ENV *)); + */ +int +__db_stat_not_built(env) + ENV *env; +{ + __db_errx(env, DB_STR("1554", + "Library build did not include statistics support")); + return (DB_OPNOTSUP); +} + +int +__env_stat_print_pp(dbenv, flags) + DB_ENV *dbenv; + u_int32_t flags; +{ + COMPQUIET(flags, 0); + + return (__db_stat_not_built(dbenv->env)); +} +#endif |