diff options
Diffstat (limited to 'bdb/log')
-rw-r--r-- | bdb/log/log.c | 881 | ||||
-rw-r--r-- | bdb/log/log.src | 46 | ||||
-rw-r--r-- | bdb/log/log_archive.c | 263 | ||||
-rw-r--r-- | bdb/log/log_compare.c | 6 | ||||
-rw-r--r-- | bdb/log/log_findckp.c | 135 | ||||
-rw-r--r-- | bdb/log/log_get.c | 1185 | ||||
-rw-r--r-- | bdb/log/log_method.c | 113 | ||||
-rw-r--r-- | bdb/log/log_put.c | 1038 | ||||
-rw-r--r-- | bdb/log/log_rec.c | 647 | ||||
-rw-r--r-- | bdb/log/log_register.c | 433 |
10 files changed, 2579 insertions, 2168 deletions
diff --git a/bdb/log/log.c b/bdb/log/log.c index 8ddb7bcaf7d..f57caeccb95 100644 --- a/bdb/log/log.c +++ b/bdb/log/log.c @@ -1,40 +1,34 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: log.c,v 11.42 2001/01/15 16:42:37 bostic Exp $"; +static const char revid[] = "$Id: log.c,v 11.111 2002/08/16 00:27:44 ubell Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include <sys/types.h> +#include <ctype.h> #include <stdlib.h> #include <string.h> #include <unistd.h> #endif -#ifdef HAVE_RPC -#include "db_server.h" -#endif - #include "db_int.h" -#include "log.h" -#include "db_dispatch.h" -#include "txn.h" -#include "txn_auto.h" - -#ifdef HAVE_RPC -#include "gen_client_ext.h" -#include "rpc_client_ext.h" -#endif +#include "dbinc/crypto.h" +#include "dbinc/hmac.h" +#include "dbinc/log.h" +#include "dbinc/txn.h" static int __log_init __P((DB_ENV *, DB_LOG *)); static int __log_recover __P((DB_LOG *)); +static size_t __log_region_size __P((DB_ENV *)); +static int __log_zero __P((DB_ENV *, DB_LSN *, DB_LSN *)); /* * __log_open -- @@ -49,16 +43,10 @@ __log_open(dbenv) DB_LOG *dblp; LOG *lp; int ret; - u_int8_t *readbufp; - - readbufp = NULL; /* Create/initialize the DB_LOG structure. */ if ((ret = __os_calloc(dbenv, 1, sizeof(DB_LOG), &dblp)) != 0) return (ret); - if ((ret = __os_calloc(dbenv, 1, dbenv->lg_bsize, &readbufp)) != 0) - goto err; - ZERO_LSN(dblp->c_lsn); dblp->dbenv = dbenv; /* Join/create the log region. */ @@ -69,40 +57,66 @@ __log_open(dbenv) if (F_ISSET(dbenv, DB_ENV_CREATE)) F_SET(&dblp->reginfo, REGION_CREATE_OK); if ((ret = __db_r_attach( - dbenv, &dblp->reginfo, LG_BASE_REGION_SIZE + dbenv->lg_bsize)) != 0) + dbenv, &dblp->reginfo, __log_region_size(dbenv))) != 0) goto err; - dblp->readbufp = readbufp; - /* If we created the region, initialize it. 
*/ - if (F_ISSET(&dblp->reginfo, REGION_CREATE) && - (ret = __log_init(dbenv, dblp)) != 0) - goto err; + if (F_ISSET(&dblp->reginfo, REGION_CREATE)) + if ((ret = __log_init(dbenv, dblp)) != 0) + goto err; /* Set the local addresses. */ lp = dblp->reginfo.primary = R_ADDR(&dblp->reginfo, dblp->reginfo.rp->primary); - dblp->bufp = R_ADDR(&dblp->reginfo, lp->buffer_off); /* * If the region is threaded, then we have to lock both the handles * and the region, and we need to allocate a mutex for that purpose. */ - if (F_ISSET(dbenv, DB_ENV_THREAD)) { - if ((ret = __db_mutex_alloc( - dbenv, &dblp->reginfo, &dblp->mutexp)) != 0) - goto err; - if ((ret = __db_mutex_init( - dbenv, dblp->mutexp, 0, MUTEX_THREAD)) != 0) + if (F_ISSET(dbenv, DB_ENV_THREAD) && + (ret = __db_mutex_setup(dbenv, &dblp->reginfo, &dblp->mutexp, + MUTEX_ALLOC | MUTEX_NO_RLOCK)) != 0) + goto err; + + /* Initialize the rest of the structure. */ + dblp->bufp = R_ADDR(&dblp->reginfo, lp->buffer_off); + + /* + * Set the handle -- we may be about to run recovery, which allocates + * log cursors. Log cursors require logging be already configured, + * and the handle being set is what demonstrates that. + * + * If we created the region, run recovery. If that fails, make sure + * we reset the log handle before cleaning up, otherwise we will try + * and clean up again in the mainline DB_ENV initialization code. + */ + dbenv->lg_handle = dblp; + + if (F_ISSET(&dblp->reginfo, REGION_CREATE)) { + if ((ret = __log_recover(dblp)) != 0) { + dbenv->lg_handle = NULL; goto err; + } + + /* + * We first take the log file size from the environment, if + * specified. If that wasn't set, recovery may have set it + * from the persistent information in a log file header. If + * that didn't set it either, we default. + */ + if (lp->log_size == 0) + lp->log_size = lp->log_nsize = LG_MAX_DEFAULT; + } else { + /* + * A process joining the region may have reset the log file + * size, too. 
If so, it only affects the next log file we + * create. + */ + if (dbenv->lg_size != 0) + lp->log_nsize = dbenv->lg_size; } R_UNLOCK(dbenv, &dblp->reginfo); - - dblp->r_file = 0; - dblp->r_off = 0; - dblp->r_size = 0; - dbenv->lg_handle = dblp; return (0); err: if (dblp->reginfo.addr != NULL) { @@ -112,11 +126,11 @@ err: if (dblp->reginfo.addr != NULL) { (void)__db_r_detach(dbenv, &dblp->reginfo, 0); } - if (readbufp != NULL) - __os_free(readbufp, dbenv->lg_bsize); if (dblp->mutexp != NULL) __db_mutex_free(dbenv, &dblp->reginfo, dblp->mutexp); - __os_free(dblp, sizeof(*dblp)); + + __os_free(dbenv, dblp); + return (ret); } @@ -129,9 +143,13 @@ __log_init(dbenv, dblp) DB_ENV *dbenv; DB_LOG *dblp; { + DB_MUTEX *flush_mutexp; LOG *region; int ret; void *p; +#ifdef HAVE_MUTEX_SYSTEM_RESOURCES + u_int8_t *addr; +#endif if ((ret = __db_shalloc(dblp->reginfo.addr, sizeof(*region), 0, &dblp->reginfo.primary)) != 0) @@ -141,15 +159,55 @@ __log_init(dbenv, dblp) region = dblp->reginfo.primary; memset(region, 0, sizeof(*region)); - region->persist.lg_max = dbenv->lg_max; - region->persist.magic = DB_LOGMAGIC; - region->persist.version = DB_LOGVERSION; - region->persist.mode = dbenv->db_mode; + region->fid_max = 0; SH_TAILQ_INIT(®ion->fq); + region->free_fid_stack = INVALID_ROFF; + region->free_fids = region->free_fids_alloced = 0; /* Initialize LOG LSNs. */ - region->lsn.file = 1; - region->lsn.offset = 0; + INIT_LSN(region->lsn); + INIT_LSN(region->ready_lsn); + INIT_LSN(region->t_lsn); + + /* + * It's possible to be waiting for an LSN of [1][0], if a replication + * client gets the first log record out of order. An LSN of [0][0] + * signifies that we're not waiting. + */ + ZERO_LSN(region->waiting_lsn); + + /* + * Log makes note of the fact that it ran into a checkpoint on + * startup if it did so, as a recovery optimization. A zero + * LSN signifies that it hasn't found one [yet]. 
+ */ + ZERO_LSN(region->cached_ckp_lsn); + +#ifdef HAVE_MUTEX_SYSTEM_RESOURCES + /* Allocate room for the log maintenance info and initialize it. */ + if ((ret = __db_shalloc(dblp->reginfo.addr, + sizeof(REGMAINT) + LG_MAINT_SIZE, 0, &addr)) != 0) + goto mem_err; + __db_maintinit(&dblp->reginfo, addr, LG_MAINT_SIZE); + region->maint_off = R_OFFSET(&dblp->reginfo, addr); +#endif + + if ((ret = __db_mutex_setup(dbenv, &dblp->reginfo, ®ion->fq_mutex, + MUTEX_NO_RLOCK)) != 0) + return (ret); + + /* + * We must create a place for the flush mutex separately; mutexes have + * to be aligned to MUTEX_ALIGN, and the only way to guarantee that is + * to make sure they're at the beginning of a shalloc'ed chunk. + */ + if ((ret = __db_shalloc(dblp->reginfo.addr, + sizeof(DB_MUTEX), MUTEX_ALIGN, &flush_mutexp)) != 0) + goto mem_err; + if ((ret = __db_mutex_setup(dbenv, &dblp->reginfo, flush_mutexp, + MUTEX_NO_RLOCK)) != 0) + return (ret); + region->flush_mutex_off = R_OFFSET(&dblp->reginfo, flush_mutexp); /* Initialize the buffer. */ if ((ret = @@ -159,9 +217,23 @@ mem_err: __db_err(dbenv, "Unable to allocate memory for the log buffer"); } region->buffer_size = dbenv->lg_bsize; region->buffer_off = R_OFFSET(&dblp->reginfo, p); + region->log_size = region->log_nsize = dbenv->lg_size; - /* Try and recover any previous log files before releasing the lock. */ - return (__log_recover(dblp)); + /* Initialize the commit Queue. */ + SH_TAILQ_INIT(®ion->free_commits); + SH_TAILQ_INIT(®ion->commits); + region->ncommit = 0; + + /* + * Fill in the log's persistent header. Don't fill in the log file + * sizes, as they may change at any time and so have to be filled in + * as each log file is created. 
+ */ + region->persist.magic = DB_LOGMAGIC; + region->persist.version = DB_LOGVERSION; + region->persist.mode = (u_int32_t)dbenv->db_mode; + + return (0); } /* @@ -173,12 +245,16 @@ __log_recover(dblp) DB_LOG *dblp; { DBT dbt; + DB_ENV *dbenv; + DB_LOGC *logc; DB_LSN lsn; LOG *lp; - int cnt, found_checkpoint, ret; - u_int32_t chk; + u_int32_t cnt, rectype; + int ret; logfile_validity status; + logc = NULL; + dbenv = dblp->dbenv; lp = dblp->reginfo.primary; /* @@ -192,8 +268,9 @@ __log_recover(dblp) /* * If the last file is an old version, readable or no, start a new - * file. Don't bother finding checkpoints; if we didn't take a - * checkpoint right before upgrading, the user screwed up anyway. + * file. Don't bother finding the end of the last log file; + * we assume that it's valid in its entirety, since the user + * should have shut down cleanly or run recovery before upgrading. */ if (status == DB_LV_OLD_READABLE || status == DB_LV_OLD_UNREADABLE) { lp->lsn.file = lp->s_lsn.file = cnt + 1; @@ -213,25 +290,35 @@ __log_recover(dblp) lsn.file = cnt; lsn.offset = 0; - /* Set the cursor. Shouldn't fail; leave error messages on. */ - memset(&dbt, 0, sizeof(dbt)); - if ((ret = __log_get(dblp, &lsn, &dbt, DB_SET, 0)) != 0) + /* + * Allocate a cursor and set it to the first record. This shouldn't + * fail, leave error messages on. + */ + if ((ret = dbenv->log_cursor(dbenv, &logc, 0)) != 0) return (ret); + F_SET(logc, DB_LOG_LOCKED); + memset(&dbt, 0, sizeof(dbt)); + if ((ret = logc->get(logc, &lsn, &dbt, DB_SET)) != 0) + goto err; /* - * Read to the end of the file, saving checkpoints. This will fail - * at some point, so turn off error messages. + * Read to the end of the file. This may fail at some point, so + * turn off error messages. 
*/ - found_checkpoint = 0; - while (__log_get(dblp, &lsn, &dbt, DB_NEXT, 1) == 0) { + F_SET(logc, DB_LOG_SILENT_ERR); + while (logc->get(logc, &lsn, &dbt, DB_NEXT) == 0) { if (dbt.size < sizeof(u_int32_t)) continue; - memcpy(&chk, dbt.data, sizeof(u_int32_t)); - if (chk == DB_txn_ckp) { - lp->chkpt_lsn = lsn; - found_checkpoint = 1; - } + memcpy(&rectype, dbt.data, sizeof(u_int32_t)); + if (rectype == DB___txn_ckp) + /* + * If we happen to run into a checkpoint, cache its + * LSN so that the transaction system doesn't have + * to walk this log file again looking for it. + */ + lp->cached_ckp_lsn = lsn; } + F_CLR(logc, DB_LOG_SILENT_ERR); /* * We now know where the end of the log is. Set the first LSN that @@ -240,59 +327,24 @@ __log_recover(dblp) */ lp->lsn = lsn; lp->s_lsn = lsn; - lp->lsn.offset += dblp->c_len; - lp->s_lsn.offset += dblp->c_len; + lp->lsn.offset += logc->c_len; + lp->s_lsn.offset += logc->c_len; /* Set up the current buffer information, too. */ - lp->len = dblp->c_len; + lp->len = logc->c_len; lp->b_off = 0; lp->w_off = lp->lsn.offset; - /* - * It's possible that we didn't find a checkpoint because there wasn't - * one in the last log file. Start searching. - */ - if (!found_checkpoint && cnt > 1) { - lsn.file = cnt; - lsn.offset = 0; - - /* Set the cursor. Shouldn't fail, leave error messages on. */ - if ((ret = __log_get(dblp, &lsn, &dbt, DB_SET, 0)) != 0) - return (ret); - - /* - * Read to the end of the file, saving checkpoints. Again, - * this can fail if there are no checkpoints in any log file, - * so turn error messages off. - */ - while (__log_get(dblp, &lsn, &dbt, DB_PREV, 1) == 0) { - if (dbt.size < sizeof(u_int32_t)) - continue; - memcpy(&chk, dbt.data, sizeof(u_int32_t)); - if (chk == DB_txn_ckp) { - lp->chkpt_lsn = lsn; - found_checkpoint = 1; - break; - } - } - } - - /* If we never find a checkpoint, that's okay, just 0 it out. 
*/ - if (!found_checkpoint) -skipsearch: ZERO_LSN(lp->chkpt_lsn); - - /* - * Reset the cursor lsn to the beginning of the log, so that an - * initial call to DB_NEXT does the right thing. - */ - ZERO_LSN(dblp->c_lsn); - - if (FLD_ISSET(dblp->dbenv->verbose, DB_VERB_RECOVERY)) - __db_err(dblp->dbenv, +skipsearch: + if (FLD_ISSET(dbenv->verbose, DB_VERB_RECOVERY)) + __db_err(dbenv, "Finding last valid log LSN: file: %lu offset %lu", (u_long)lp->lsn.file, (u_long)lp->lsn.offset); - return (0); +err: if (logc != NULL) + (void)logc->close(logc, 0); + + return (ret); } /* @@ -301,20 +353,23 @@ skipsearch: ZERO_LSN(lp->chkpt_lsn); * the number of the first readable log file, else it will contain the number * of the last log file (which may be too old to read). * - * PUBLIC: int __log_find __P((DB_LOG *, int, int *, logfile_validity *)); + * PUBLIC: int __log_find __P((DB_LOG *, int, u_int32_t *, logfile_validity *)); */ int __log_find(dblp, find_first, valp, statusp) DB_LOG *dblp; - int find_first, *valp; + int find_first; + u_int32_t *valp; logfile_validity *statusp; { + DB_ENV *dbenv; logfile_validity logval_status, status; u_int32_t clv, logval; int cnt, fcnt, ret; const char *dir; - char **names, *p, *q, savech; + char *c, **names, *p, *q, savech; + dbenv = dblp->dbenv; logval_status = status = DB_LV_NONEXISTENT; /* Return a value of 0 as the log file number on failure. */ @@ -333,7 +388,7 @@ __log_find(dblp, find_first, valp, statusp) } /* Get the list of file names. */ - ret = __os_dirlist(dblp->dbenv, dir, &names, &fcnt); + ret = __os_dirlist(dbenv, dir, &names, &fcnt); /* * !!! 
@@ -345,8 +400,8 @@ __log_find(dblp, find_first, valp, statusp) *q = savech; if (ret != 0) { - __db_err(dblp->dbenv, "%s: %s", dir, db_strerror(ret)); - __os_freestr(p); + __db_err(dbenv, "%s: %s", dir, db_strerror(ret)); + __os_free(dbenv, p); return (ret); } @@ -356,74 +411,92 @@ __log_find(dblp, find_first, valp, statusp) continue; /* + * Names of the form log\.[0-9]* are reserved for DB. Other + * names sharing LFPREFIX, such as "log.db", are legal. + */ + for (c = names[cnt] + sizeof(LFPREFIX) - 1; *c != '\0'; c++) + if (!isdigit((int)*c)) + break; + if (*c != '\0') + continue; + + /* * Use atol, not atoi; if an "int" is 16-bits, the largest * log file name won't fit. */ clv = atol(names[cnt] + (sizeof(LFPREFIX) - 1)); + + /* + * If searching for the first log file, we want to return the + * oldest log file we can read, or, if no readable log files + * exist, the newest log file we can't read (the crossover + * point between the old and new versions of the log file). + * + * If we're searching for the last log file, we want to return + * the newest log file, period. + * + * Readable log files should never preceede unreadable log + * files, that would mean the admin seriously screwed up. + */ if (find_first) { - if (logval != 0 && clv > logval) + if (logval != 0 && + status != DB_LV_OLD_UNREADABLE && clv > logval) continue; } else if (logval != 0 && clv < logval) continue; - /* - * Take note of whether the log file logval is - * an old version or incompletely initialized. - */ - if ((ret = __log_valid(dblp, clv, 1, &status)) != 0) + if ((ret = __log_valid(dblp, clv, 1, &status)) != 0) { + __db_err(dbenv, "Invalid log file: %s: %s", + names[cnt], db_strerror(ret)); goto err; + } switch (status) { + case DB_LV_NONEXISTENT: + /* __log_valid never returns DB_LV_NONEXISTENT. 
*/ + DB_ASSERT(0); + break; case DB_LV_INCOMPLETE: /* - * It's acceptable for the last log file to - * have been incompletely initialized--it's possible - * to create a log file but not write anything to it, - * and recovery needs to gracefully handle this. - * - * Just ignore it; we don't want to return this - * as a valid log file. + * The last log file may not have been initialized -- + * it's possible to create a log file but not write + * anything to it. If performing recovery (that is, + * if find_first isn't set), ignore the file, it's + * not interesting. If we're searching for the first + * log record, return the file (assuming we don't find + * something better), as the "real" first log record + * is likely to be in the log buffer, and we want to + * set the file LSN for our return. */ + if (find_first) + goto found; break; - case DB_LV_NONEXISTENT: - /* Should never happen. */ - DB_ASSERT(0); + case DB_LV_OLD_UNREADABLE: + /* + * If we're searching for the first log file, then we + * only want this file if we don't yet have a file or + * already have an unreadable file and this one is + * newer than that one. If we're searching for the + * last log file, we always want this file because we + * wouldn't be here if it wasn't newer than our current + * choice. + */ + if (!find_first || logval == 0 || + (status == DB_LV_OLD_UNREADABLE && clv > logval)) + goto found; break; case DB_LV_NORMAL: case DB_LV_OLD_READABLE: - logval = clv; +found: logval = clv; logval_status = status; break; - case DB_LV_OLD_UNREADABLE: - /* - * Continue; we want the oldest valid log, - * and clv is too old to be useful. We don't - * want it to supplant logval if we're looking for - * the oldest valid log, but we do want to return - * it if it's the last log file--we want the very - * last file number, so that our caller can - * start a new file after it. 
- * - * The code here assumes that there will never - * be a too-old log that's preceded by a log - * of the current version, but in order to - * attain that state of affairs the user - * would have had to really seriously screw - * up; I think we can safely assume this won't - * happen. - */ - if (!find_first) { - logval = clv; - logval_status = status; - } - break; } } *valp = logval; -err: __os_dirfree(names, fcnt); - __os_freestr(p); +err: __os_dirfree(dbenv, names, fcnt); + __os_free(dbenv, p); *statusp = logval_status; return (ret); @@ -446,30 +519,48 @@ __log_valid(dblp, number, set_persist, statusp) int set_persist; logfile_validity *statusp; { + DB_CIPHER *db_cipher; + DB_ENV *dbenv; DB_FH fh; + HDR *hdr; LOG *region; - LOGP persist; - char *fname; - int ret; + LOGP *persist; logfile_validity status; - size_t nw; + size_t hdrsize, nw, recsize; + int is_hmac, need_free, ret; + u_int8_t *tmp; + char *fname; + dbenv = dblp->dbenv; + db_cipher = dbenv->crypto_handle; + persist = NULL; status = DB_LV_NORMAL; /* Try to open the log file. */ if ((ret = __log_name(dblp, number, &fname, &fh, DB_OSO_RDONLY | DB_OSO_SEQ)) != 0) { - __os_freestr(fname); + __os_free(dbenv, fname); return (ret); } + need_free = 0; + hdrsize = HDR_NORMAL_SZ; + is_hmac = 0; + recsize = sizeof(LOGP); + if (CRYPTO_ON(dbenv)) { + hdrsize = HDR_CRYPTO_SZ; + recsize = sizeof(LOGP); + recsize += db_cipher->adj_size(recsize); + is_hmac = 1; + } + if ((ret = __os_calloc(dbenv, 1, recsize + hdrsize, &tmp)) != 0) + return (ret); + need_free = 1; + hdr = (HDR *)tmp; + persist = (LOGP *)(tmp + hdrsize); /* Try to read the header. 
*/ - if ((ret = - __os_seek(dblp->dbenv, - &fh, 0, 0, sizeof(HDR), 0, DB_OS_SEEK_SET)) != 0 || - (ret = - __os_read(dblp->dbenv, &fh, &persist, sizeof(LOGP), &nw)) != 0 || - nw != sizeof(LOGP)) { + if ((ret = __os_read(dbenv, &fh, tmp, recsize + hdrsize, &nw)) != 0 || + nw != recsize + hdrsize) { if (ret == 0) status = DB_LV_INCOMPLETE; else @@ -477,19 +568,63 @@ __log_valid(dblp, number, set_persist, statusp) * The error was a fatal read error, not just an * incompletely initialized log file. */ - __db_err(dblp->dbenv, "Ignoring log file: %s: %s", + __db_err(dbenv, "Ignoring log file: %s: %s", fname, db_strerror(ret)); - (void)__os_closehandle(&fh); + (void)__os_closehandle(dbenv, &fh); goto err; } - (void)__os_closehandle(&fh); + (void)__os_closehandle(dbenv, &fh); + + /* + * Now we have to validate the persistent record. We have + * several scenarios we have to deal with: + * + * 1. User has crypto turned on: + * - They're reading an old, unencrypted log file + * . We will fail the record size match check below. + * - They're reading a current, unencrypted log file + * . We will fail the record size match check below. + * - They're reading an old, encrypted log file [NOT YET] + * . After decryption we'll fail the version check. [NOT YET] + * - They're reading a current, encrypted log file + * . We should proceed as usual. + * 2. User has crypto turned off: + * - They're reading an old, unencrypted log file + * . We will fail the version check. + * - They're reading a current, unencrypted log file + * . We should proceed as usual. + * - They're reading an old, encrypted log file [NOT YET] + * . We'll fail the magic number check (it is encrypted). + * - They're reading a current, encrypted log file + * . We'll fail the magic number check (it is encrypted). + */ + if (CRYPTO_ON(dbenv)) { + /* + * If we are trying to decrypt an unencrypted log + * we can only detect that by having an unreasonable + * data length for our persistent data. 
+ */ + if ((hdr->len - hdrsize) != sizeof(LOGP)) { + __db_err(dbenv, "log record size mismatch"); + goto err; + } + /* Check the checksum and decrypt. */ + if ((ret = __db_check_chksum(dbenv, db_cipher, &hdr->chksum[0], + (u_int8_t *)persist, hdr->len - hdrsize, is_hmac)) != 0) { + __db_err(dbenv, "log record checksum mismatch"); + goto err; + } + if ((ret = db_cipher->decrypt(dbenv, db_cipher->data, + &hdr->iv[0], (u_int8_t *)persist, hdr->len - hdrsize)) != 0) + goto err; + } /* Validate the header. */ - if (persist.magic != DB_LOGMAGIC) { - __db_err(dblp->dbenv, + if (persist->magic != DB_LOGMAGIC) { + __db_err(dbenv, "Ignoring log file: %s: magic number %lx, not %lx", - fname, (u_long)persist.magic, (u_long)DB_LOGMAGIC); + fname, (u_long)persist->magic, (u_long)DB_LOGMAGIC); ret = EINVAL; goto err; } @@ -499,135 +634,162 @@ __log_valid(dblp, number, set_persist, statusp) * belongs to an unreadable or readable old version; leave it * alone if and only if the log file version is the current one. */ - if (persist.version > DB_LOGVERSION) { + if (persist->version > DB_LOGVERSION) { /* This is a fatal error--the log file is newer than DB. */ - __db_err(dblp->dbenv, + __db_err(dbenv, "Ignoring log file: %s: unsupported log version %lu", - fname, (u_long)persist.version); + fname, (u_long)persist->version); ret = EINVAL; goto err; - } else if (persist.version < DB_LOGOLDVER) { + } else if (persist->version < DB_LOGOLDVER) { status = DB_LV_OLD_UNREADABLE; /* * We don't want to set persistent info based on an * unreadable region, so jump to "err". */ goto err; - } else if (persist.version < DB_LOGVERSION) + } else if (persist->version < DB_LOGVERSION) status = DB_LV_OLD_READABLE; /* - * If the log is thus far readable and we're doing system - * initialization, set the region's persistent information - * based on the headers. + * Only if we have a current log do we verify the checksum. 
+ * We could not check the checksum before checking the magic + * and version because old log hdrs have the length and checksum + * in a different location. + */ + if (!CRYPTO_ON(dbenv) && ((ret = __db_check_chksum(dbenv, + db_cipher, &hdr->chksum[0], (u_int8_t *)persist, + hdr->len - hdrsize, is_hmac)) != 0)) { + __db_err(dbenv, "log record checksum mismatch"); + goto err; + } + + /* + * If the log is readable so far and we're doing system initialization, + * set the region's persistent information based on the headers. + * + * Always set the current log file size. Only set the next log file's + * size if the application hasn't set it already. + * + * XXX + * Always use the persistent header's mode, regardless of what was set + * in the current environment. We've always done it this way, but it's + * probably a bug -- I can't think of a way not-changing the mode would + * be a problem, though. */ if (set_persist) { region = dblp->reginfo.primary; - region->persist.lg_max = persist.lg_max; - region->persist.mode = persist.mode; + region->log_size = persist->log_size; + if (region->log_nsize == 0) + region->log_nsize = persist->log_size; + region->persist.mode = persist->mode; } -err: __os_freestr(fname); +err: __os_free(dbenv, fname); + if (need_free) + __os_free(dbenv, tmp); *statusp = status; return (ret); } /* - * __log_close -- - * Internal version of log_close: only called from dbenv_refresh. + * __log_dbenv_refresh -- + * Clean up after the log system on a close or failed open. Called only + * from __dbenv_refresh. (Formerly called __log_close.) * - * PUBLIC: int __log_close __P((DB_ENV *)); + * PUBLIC: int __log_dbenv_refresh __P((DB_ENV *)); */ int -__log_close(dbenv) +__log_dbenv_refresh(dbenv) DB_ENV *dbenv; { DB_LOG *dblp; int ret, t_ret; - ret = 0; dblp = dbenv->lg_handle; /* We may have opened files as part of XA; if so, close them. 
*/ F_SET(dblp, DBLOG_RECOVER); - __log_close_files(dbenv); + ret = __dbreg_close_files(dbenv); /* Discard the per-thread lock. */ if (dblp->mutexp != NULL) __db_mutex_free(dbenv, &dblp->reginfo, dblp->mutexp); /* Detach from the region. */ - ret = __db_r_detach(dbenv, &dblp->reginfo, 0); + if ((t_ret = + __db_r_detach(dbenv, &dblp->reginfo, 0)) != 0 && ret == 0) + ret = t_ret; /* Close open files, release allocated memory. */ if (F_ISSET(&dblp->lfh, DB_FH_VALID) && - (t_ret = __os_closehandle(&dblp->lfh)) != 0 && ret == 0) - ret = t_ret; - if (dblp->c_dbt.data != NULL) - __os_free(dblp->c_dbt.data, dblp->c_dbt.ulen); - if (F_ISSET(&dblp->c_fh, DB_FH_VALID) && - (t_ret = __os_closehandle(&dblp->c_fh)) != 0 && ret == 0) + (t_ret = __os_closehandle(dbenv, &dblp->lfh)) != 0 && ret == 0) ret = t_ret; if (dblp->dbentry != NULL) - __os_free(dblp->dbentry, - (dblp->dbentry_cnt * sizeof(DB_ENTRY))); - if (dblp->readbufp != NULL) - __os_free(dblp->readbufp, dbenv->lg_bsize); + __os_free(dbenv, dblp->dbentry); - __os_free(dblp, sizeof(*dblp)); + __os_free(dbenv, dblp); dbenv->lg_handle = NULL; return (ret); } /* - * log_stat -- - * Return LOG statistics. + * __log_stat -- + * Return log statistics. 
+ * + * PUBLIC: int __log_stat __P((DB_ENV *, DB_LOG_STAT **, u_int32_t)); */ int -log_stat(dbenv, statp, db_malloc) +__log_stat(dbenv, statp, flags) DB_ENV *dbenv; DB_LOG_STAT **statp; - void *(*db_malloc) __P((size_t)); + u_int32_t flags; { DB_LOG *dblp; DB_LOG_STAT *stats; LOG *region; int ret; -#ifdef HAVE_RPC - if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) - return (__dbcl_log_stat(dbenv, statp, db_malloc)); -#endif - PANIC_CHECK(dbenv); - ENV_REQUIRES_CONFIG(dbenv, dbenv->lg_handle, DB_INIT_LOG); + ENV_REQUIRES_CONFIG(dbenv, + dbenv->lg_handle, "DB_ENV->log_stat", DB_INIT_LOG); *statp = NULL; + if ((ret = __db_fchk(dbenv, + "DB_ENV->log_stat", flags, DB_STAT_CLEAR)) != 0) + return (ret); dblp = dbenv->lg_handle; region = dblp->reginfo.primary; - if ((ret = __os_malloc(dbenv, - sizeof(DB_LOG_STAT), db_malloc, &stats)) != 0) + if ((ret = __os_umalloc(dbenv, sizeof(DB_LOG_STAT), &stats)) != 0) return (ret); /* Copy out the global statistics. */ R_LOCK(dbenv, &dblp->reginfo); *stats = region->stat; + if (LF_ISSET(DB_STAT_CLEAR)) + memset(®ion->stat, 0, sizeof(region->stat)); stats->st_magic = region->persist.magic; stats->st_version = region->persist.version; stats->st_mode = region->persist.mode; stats->st_lg_bsize = region->buffer_size; - stats->st_lg_max = region->persist.lg_max; + stats->st_lg_size = region->log_nsize; stats->st_region_wait = dblp->reginfo.rp->mutex.mutex_set_wait; stats->st_region_nowait = dblp->reginfo.rp->mutex.mutex_set_nowait; + if (LF_ISSET(DB_STAT_CLEAR)) { + dblp->reginfo.rp->mutex.mutex_set_wait = 0; + dblp->reginfo.rp->mutex.mutex_set_nowait = 0; + } stats->st_regsize = dblp->reginfo.rp->size; stats->st_cur_file = region->lsn.file; stats->st_cur_offset = region->lsn.offset; + stats->st_disk_file = region->s_lsn.file; + stats->st_disk_offset = region->s_lsn.offset; R_UNLOCK(dbenv, &dblp->reginfo); @@ -636,22 +798,287 @@ log_stat(dbenv, statp, db_malloc) } /* - * __log_lastckp -- - * Return the current chkpt_lsn, so that we can store it in - 
* the transaction region and keep the chain of checkpoints - * unbroken across environment recreates. + * __log_get_cached_ckp_lsn -- + * Retrieve any last checkpoint LSN that we may have found on startup. + * + * PUBLIC: void __log_get_cached_ckp_lsn __P((DB_ENV *, DB_LSN *)); + */ +void +__log_get_cached_ckp_lsn(dbenv, ckp_lsnp) + DB_ENV *dbenv; + DB_LSN *ckp_lsnp; +{ + DB_LOG *dblp; + LOG *lp; + + dblp = (DB_LOG *)dbenv->lg_handle; + lp = (LOG *)dblp->reginfo.primary; + + R_LOCK(dbenv, &dblp->reginfo); + *ckp_lsnp = lp->cached_ckp_lsn; + R_UNLOCK(dbenv, &dblp->reginfo); +} + +/* + * __log_region_size -- + * Return the amount of space needed for the log region. + * Make the region large enough to hold txn_max transaction + * detail structures plus some space to hold thread handles + * and the beginning of the shalloc region and anything we + * need for mutex system resource recording. + */ +static size_t +__log_region_size(dbenv) + DB_ENV *dbenv; +{ + size_t s; + + s = dbenv->lg_regionmax + dbenv->lg_bsize; +#ifdef HAVE_MUTEX_SYSTEM_RESOURCES + if (F_ISSET(dbenv, DB_ENV_THREAD)) + s += sizeof(REGMAINT) + LG_MAINT_SIZE; +#endif + return (s); +} + +/* + * __log_region_destroy + * Destroy any region maintenance info. + * + * PUBLIC: void __log_region_destroy __P((DB_ENV *, REGINFO *)); + */ +void +__log_region_destroy(dbenv, infop) + DB_ENV *dbenv; + REGINFO *infop; +{ + __db_shlocks_destroy(infop, (REGMAINT *)R_ADDR(infop, + ((LOG *)R_ADDR(infop, infop->rp->primary))->maint_off)); + + COMPQUIET(dbenv, NULL); + COMPQUIET(infop, NULL); +} + +/* + * __log_vtruncate + * This is a virtual truncate. We set up the log indicators to + * make everyone believe that the given record is the last one in the + * log. Returns with the next valid LSN (i.e., the LSN of the next + * record to be written). This is used in replication to discard records + * in the log file that do not agree with the master. 
+ * + * PUBLIC: int __log_vtruncate __P((DB_ENV *, DB_LSN *, DB_LSN *)); + */ +int +__log_vtruncate(dbenv, lsn, ckplsn) + DB_ENV *dbenv; + DB_LSN *lsn, *ckplsn; +{ + DBT log_dbt; + DB_FH fh; + DB_LOG *dblp; + DB_LOGC *logc; + DB_LSN end_lsn; + LOG *lp; + u_int32_t bytes, c_len; + int fn, ret, t_ret; + char *fname; + + /* Need to find out the length of this soon-to-be-last record. */ + if ((ret = dbenv->log_cursor(dbenv, &logc, 0)) != 0) + return (ret); + memset(&log_dbt, 0, sizeof(log_dbt)); + ret = logc->get(logc, lsn, &log_dbt, DB_SET); + c_len = logc->c_len; + if ((t_ret = logc->close(logc, 0)) != 0 && ret == 0) + ret = t_ret; + if (ret != 0) + return (ret); + + /* Now do the truncate. */ + dblp = (DB_LOG *)dbenv->lg_handle; + lp = (LOG *)dblp->reginfo.primary; + + R_LOCK(dbenv, &dblp->reginfo); + end_lsn = lp->lsn; + lp->lsn = *lsn; + lp->len = c_len; + lp->lsn.offset += lp->len; + + /* + * I am going to assume that the number of bytes written since + * the last checkpoint doesn't exceed a 32-bit number. + */ + DB_ASSERT(lp->lsn.file >= ckplsn->file); + bytes = 0; + if (ckplsn->file != lp->lsn.file) { + bytes = lp->log_size - ckplsn->offset; + if (lp->lsn.file > ckplsn->file + 1) + bytes += lp->log_size * + (lp->lsn.file - ckplsn->file - 1); + bytes += lp->lsn.offset; + } else + bytes = lp->lsn.offset - ckplsn->offset; + + lp->stat.st_wc_mbytes += bytes / MEGABYTE; + lp->stat.st_wc_bytes += bytes % MEGABYTE; + + /* + * If the saved lsn is greater than our new end of log, reset it + * to our current end of log. + */ + if (log_compare(&lp->s_lsn, lsn) > 0) + lp->s_lsn = lp->lsn; + + /* + * If the new end of log is in the middle of the buffer, + * don't change the w_off or f_lsn. If the new end is + * before the w_off then reset w_off and f_lsn to the new + * end of log. 
+ */ + if (lp->w_off >= lp->lsn.offset) { + lp->f_lsn = lp->lsn; + lp->w_off = lp->lsn.offset; + lp->b_off = 0; + } else + lp->b_off = lp->lsn.offset - lp->w_off; + + ZERO_LSN(lp->waiting_lsn); + lp->ready_lsn = lp->lsn; + lp->wait_recs = 0; + lp->rcvd_recs = 0; + + /* Now throw away any extra log files that we have around. */ + for (fn = lp->lsn.file + 1;; fn++) { + if (__log_name(dblp, fn, &fname, &fh, DB_OSO_RDONLY) != 0) { + __os_free(dbenv, fname); + break; + } + (void)__os_closehandle(dbenv, &fh); + ret = __os_unlink(dbenv, fname); + __os_free(dbenv, fname); + if (ret != 0) + goto err; + } + + /* Truncate the log to the new point. */ + if ((ret = __log_zero(dbenv, &lp->lsn, &end_lsn)) != 0) + goto err; + +err: R_UNLOCK(dbenv, &dblp->reginfo); + return (ret); +} + +/* + * __log_is_outdated -- + * Used by the replication system to identify if a client's logs + * are too old. The log represented by dbenv is compared to the file + * number passed in fnum. If the log file fnum does not exist and is + * lower-numbered than the current logs, the we return *outdatedp non + * zero, else we return it 0. * - * PUBLIC: int __log_lastckp __P((DB_ENV *, DB_LSN *)); + * PUBLIC: int __log_is_outdated __P((DB_ENV *dbenv, + * PUBLIC: u_int32_t fnum, int *outdatedp)); */ int -__log_lastckp(dbenv, lsnp) +__log_is_outdated(dbenv, fnum, outdatedp) DB_ENV *dbenv; - DB_LSN *lsnp; + u_int32_t fnum; + int *outdatedp; { + DB_LOG *dblp; LOG *lp; + char *name; + int ret; + u_int32_t cfile; - lp = (LOG *)(((DB_LOG *)dbenv->lg_handle)->reginfo.primary); + dblp = dbenv->lg_handle; + *outdatedp = 0; + + if ((ret = __log_name(dblp, fnum, &name, NULL, 0)) != 0) + return (ret); + + /* If the file exists, we're just fine. */ + if (__os_exists(name, NULL) == 0) + goto out; + + /* + * It didn't exist, decide if the file number is too big or + * too little. If it's too little, then we need to indicate + * that the LSN is outdated. 
+	 */
+	R_LOCK(dbenv, &dblp->reginfo);
+	lp = (LOG *)dblp->reginfo.primary;
+	cfile = lp->lsn.file;
+	R_UNLOCK(dbenv, &dblp->reginfo);
+
+	if (cfile > fnum)
+		*outdatedp = 1;
+out:	__os_free(dbenv, name);
+	return (ret);
+}
+
+/*
+ * __log_zero --
+ *	Zero out the tail of a log after a truncate.
+ *
+ *	from_lsn is the position at which zeroing starts and to_lsn is the
+ *	position at which it stops.  When both LSNs name the same file, the
+ *	bytes in [from_lsn->offset, to_lsn->offset) are overwritten with
+ *	zeros; nothing is written if to_lsn is not past from_lsn.  When they
+ *	name different files, everything from from_lsn->offset to the current
+ *	size of the file (as reported by __os_ioinfo) is overwritten.
+ *
+ *	Neither LSN is modified.  Returns 0 on success, or the error returned
+ *	by the first failing __log_name, __os_ioinfo, __os_seek or __os_write
+ *	call.  Any file name allocated along the way is released on every
+ *	exit path.
+ */
+static int
+__log_zero(dbenv, from_lsn, to_lsn)
+	DB_ENV *dbenv;
+	DB_LSN *from_lsn, *to_lsn;
+{
+	char *lname;
+	DB_LOG *dblp;
+	LOG *lp;
+	int ret;
+	size_t nbytes, len, nw;
+	u_int8_t buf[4096];
+	u_int32_t mbytes, bytes;
+
+	dblp = dbenv->lg_handle;
+	lp = (LOG *)dblp->reginfo.primary;
+	lname = NULL;
+
+	/* A handle cached for a different log file is of no use here. */
+	if (dblp->lfname != lp->lsn.file) {
+		if (F_ISSET(&dblp->lfh, DB_FH_VALID))
+			(void)__os_closehandle(dbenv, &dblp->lfh);
+		dblp->lfname = lp->lsn.file;
+	}
+
+	if (from_lsn->file != to_lsn->file) {
+		/* We removed some log files; have to 0 to end of file. */
+		if (!F_ISSET(&dblp->lfh, DB_FH_VALID) && (ret =
+		    __log_name(dblp, dblp->lfname, &lname, &dblp->lfh, 0)) != 0)
+			goto err;
+		if ((ret = __os_ioinfo(dbenv,
+		    NULL, &dblp->lfh, &mbytes, &bytes, NULL)) != 0)
+			goto err;
+		len = mbytes * MEGABYTE + bytes - from_lsn->offset;
+	} else if (to_lsn->offset <= from_lsn->offset)
+		return (0);
+	else
+		/* Same file: only the gap between the two offsets. */
+		len = to_lsn->offset - from_lsn->offset;
+
+	memset(buf, 0, sizeof(buf));
+
+	/* Initialize the write position. */
+	if (!F_ISSET(&dblp->lfh, DB_FH_VALID) &&
+	    (ret = __log_name(dblp, dblp->lfname, &lname, &dblp->lfh, 0)) != 0)
+		goto err;
+
+	if ((ret = __os_seek(dbenv,
+	    &dblp->lfh, 0, 0, from_lsn->offset, 0, DB_OS_SEEK_SET)) != 0)
+		goto err;
+
+	/* Write zeros one buffer at a time until the range is covered. */
+	while (len > 0) {
+		nbytes = len > sizeof(buf) ? sizeof(buf) : len;
+		if ((ret =
+		    __os_write(dbenv, &dblp->lfh, buf, nbytes, &nw)) != 0)
+			goto err;
+		len -= nbytes;
+	}
+
+	/* Success falls through with ret == 0 from the last call above. */
+err:	if (lname != NULL)
+		__os_free(dbenv, lname);
 
-	*lsnp = lp->chkpt_lsn;
-	return (0);
+	return (ret);
 }
diff --git a/bdb/log/log.src b/bdb/log/log.src
deleted file mode 100644
index a92fae8de26..00000000000
--- a/bdb/log/log.src
+++ /dev/null
@@ -1,46 +0,0 @@
-/*-
- * See the file LICENSE for redistribution information.
- *
- * Copyright (c) 1996, 1997, 1998, 1999, 2000
- * Sleepycat Software. All rights reserved.
- *
- * $Id: log.src,v 10.12 2000/02/17 20:24:10 bostic Exp $
- */
-
-PREFIX log
-
-INCLUDE #include "db_config.h"
-INCLUDE
-INCLUDE #ifndef NO_SYSTEM_INCLUDES
-INCLUDE #include <sys/types.h>
-INCLUDE
-INCLUDE #include <ctype.h>
-INCLUDE #include <errno.h>
-INCLUDE #include <string.h>
-INCLUDE #endif
-INCLUDE
-INCLUDE #include "db_int.h"
-INCLUDE #include "db_page.h"
-INCLUDE #include "db_dispatch.h"
-INCLUDE #include "db_am.h"
-INCLUDE #include "log.h"
-INCLUDE #include "txn.h"
-INCLUDE
-
-/* Used for registering name/id translations at open or close. */
-DEPRECATED register1 1
-ARG opcode u_int32_t lu
-DBT name DBT s
-DBT uid DBT s
-ARG fileid int32_t ld
-ARG ftype DBTYPE lx
-END
-
-BEGIN register 2
-ARG opcode u_int32_t lu
-DBT name DBT s
-DBT uid DBT s
-ARG fileid int32_t ld
-ARG ftype DBTYPE lx
-ARG meta_pgno db_pgno_t lu
-END
diff --git a/bdb/log/log_archive.c b/bdb/log/log_archive.c
index 83728c79e55..19e1af5a93e 100644
--- a/bdb/log/log_archive.c
+++ b/bdb/log/log_archive.c
@@ -1,14 +1,14 @@
 /*-
  * See the file LICENSE for redistribution information.
  *
- * Copyright (c) 1997, 1998, 1999, 2000
+ * Copyright (c) 1997-2002
  * Sleepycat Software. All rights reserved.
*/ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: log_archive.c,v 11.13 2000/11/30 00:58:40 ubell Exp $"; +static const char revid[] = "$Id: log_archive.c,v 11.39 2002/08/06 05:00:31 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -19,50 +19,41 @@ static const char revid[] = "$Id: log_archive.c,v 11.13 2000/11/30 00:58:40 ubel #include <unistd.h> #endif -#ifdef HAVE_RPC -#include "db_server.h" -#endif - #include "db_int.h" -#include "db_dispatch.h" -#include "log.h" -#include "clib_ext.h" /* XXX: needed for getcwd. */ - -#ifdef HAVE_RPC -#include "gen_client_ext.h" -#include "rpc_client_ext.h" -#endif +#include "dbinc/db_page.h" +#include "dbinc/log.h" +#include "dbinc/qam.h" +#include "dbinc/txn.h" static int __absname __P((DB_ENV *, char *, char *, char **)); -static int __build_data __P((DB_ENV *, char *, char ***, void *(*)(size_t))); +static int __build_data __P((DB_ENV *, char *, char ***)); static int __cmpfunc __P((const void *, const void *)); -static int __usermem __P((DB_ENV *, char ***, void *(*)(size_t))); +static int __usermem __P((DB_ENV *, char ***)); /* - * log_archive -- + * __log_archive -- * Supporting function for db_archive(1). 
+ * + * PUBLIC: int __log_archive __P((DB_ENV *, char **[], u_int32_t)); */ int -log_archive(dbenv, listp, flags, db_malloc) +__log_archive(dbenv, listp, flags) DB_ENV *dbenv; char ***listp; u_int32_t flags; - void *(*db_malloc) __P((size_t)); { DBT rec; DB_LOG *dblp; + DB_LOGC *logc; DB_LSN stable_lsn; - u_int32_t fnum; - int array_size, n, ret; + __txn_ckp_args *ckp_args; char **array, **arrayp, *name, *p, *pref, buf[MAXPATHLEN]; - -#ifdef HAVE_RPC - if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) - return (__dbcl_log_archive(dbenv, listp, flags, db_malloc)); -#endif + int array_size, db_arch_abs, n, ret; + u_int32_t fnum; PANIC_CHECK(dbenv); - ENV_REQUIRES_CONFIG(dbenv, dbenv->lg_handle, DB_INIT_LOG); + ENV_REQUIRES_CONFIG(dbenv, + dbenv->lg_handle, "DB_ENV->log_archive", DB_INIT_LOG); name = NULL; dblp = dbenv->lg_handle; @@ -70,15 +61,24 @@ log_archive(dbenv, listp, flags, db_malloc) #define OKFLAGS (DB_ARCH_ABS | DB_ARCH_DATA | DB_ARCH_LOG) if (flags != 0) { - if ((ret = - __db_fchk(dbenv, "log_archive", flags, OKFLAGS)) != 0) + if ((ret = __db_fchk( + dbenv, "DB_ENV->log_archive", flags, OKFLAGS)) != 0) return (ret); - if ((ret = - __db_fcchk(dbenv, - "log_archive", flags, DB_ARCH_DATA, DB_ARCH_LOG)) != 0) + if ((ret = __db_fcchk(dbenv, "DB_ENV->log_archive", + flags, DB_ARCH_DATA, DB_ARCH_LOG)) != 0) return (ret); } + if (LF_ISSET(DB_ARCH_ABS)) { + db_arch_abs = 1; + LF_CLR(DB_ARCH_ABS); + } else + db_arch_abs = 0; + + if (flags == 0 || flags == DB_ARCH_DATA) + ENV_REQUIRES_CONFIG(dbenv, + dbenv->tx_handle, "DB_ENV->log_archive", DB_INIT_TXN); + /* * Get the absolute pathname of the current directory. It would * be nice to get the shortest pathname of the database directory, @@ -88,7 +88,7 @@ log_archive(dbenv, listp, flags, db_malloc) * Can't trust getcwd(3) to set a valid errno. If it doesn't, just * guess that we ran out of memory. 
*/ - if (LF_ISSET(DB_ARCH_ABS)) { + if (db_arch_abs) { __os_set_errno(0); if ((pref = getcwd(buf, sizeof(buf))) == NULL) { if (__os_get_errno() == 0) @@ -98,31 +98,55 @@ log_archive(dbenv, listp, flags, db_malloc) } else pref = NULL; - switch (LF_ISSET(~DB_ARCH_ABS)) { + switch (flags) { case DB_ARCH_DATA: - return (__build_data(dbenv, pref, listp, db_malloc)); + return (__build_data(dbenv, pref, listp)); case DB_ARCH_LOG: memset(&rec, 0, sizeof(rec)); - if (F_ISSET(dbenv, DB_ENV_THREAD)) - F_SET(&rec, DB_DBT_MALLOC); - if ((ret = log_get(dbenv, &stable_lsn, &rec, DB_LAST)) != 0) + if ((ret = dbenv->log_cursor(dbenv, &logc, 0)) != 0) + return (ret); +#ifdef UMRW + ZERO_LSN(stable_lsn); +#endif + ret = logc->get(logc, &stable_lsn, &rec, DB_LAST); + (void)logc->close(logc, 0); + if (ret != 0) return (ret); - if (F_ISSET(dbenv, DB_ENV_THREAD)) - __os_free(rec.data, rec.size); fnum = stable_lsn.file; break; case 0: - if ((ret = __log_findckp(dbenv, &stable_lsn)) != 0) { + memset(&rec, 0, sizeof(rec)); + if (__txn_getckp(dbenv, &stable_lsn) != 0) { /* - * A return of DB_NOTFOUND means that we didn't find - * any records in the log (so we are not going to be - * deleting any log files). + * A failure return means that there's no checkpoint + * in the log (so we are not going to be deleting + * any log files). */ - if (ret != DB_NOTFOUND) - return (ret); *listp = NULL; return (0); } + if ((ret = dbenv->log_cursor(dbenv, &logc, 0)) != 0) + return (ret); + if ((ret = logc->get(logc, &stable_lsn, &rec, DB_SET)) != 0 || + (ret = __txn_ckp_read(dbenv, rec.data, &ckp_args)) != 0) { + /* + * A return of DB_NOTFOUND may only mean that the + * checkpoint LSN is before the beginning of the + * log files that we still have. This is not + * an error; it just means our work is done. 
+ */ + if (ret == DB_NOTFOUND) { + *listp = NULL; + ret = 0; + } + (void)logc->close(logc, 0); + return (ret); + } + if ((ret = logc->close(logc, 0)) != 0) + return (ret); + stable_lsn = ckp_args->ckp_lsn; + __os_free(dbenv, ckp_args); + /* Remove any log files before the last stable LSN. */ fnum = stable_lsn.file - 1; break; @@ -130,9 +154,9 @@ log_archive(dbenv, listp, flags, db_malloc) #define LIST_INCREMENT 64 /* Get some initial space. */ - array_size = 10; + array_size = 64; if ((ret = __os_malloc(dbenv, - sizeof(char *) * array_size, NULL, &array)) != 0) + sizeof(char *) * array_size, &array)) != 0) return (ret); array[0] = NULL; @@ -143,27 +167,27 @@ log_archive(dbenv, listp, flags, db_malloc) if (__os_exists(name, NULL) != 0) { if (LF_ISSET(DB_ARCH_LOG) && fnum == stable_lsn.file) continue; - __os_freestr(name); + __os_free(dbenv, name); name = NULL; break; } - if (n >= array_size - 1) { + if (n >= array_size - 2) { array_size += LIST_INCREMENT; if ((ret = __os_realloc(dbenv, - sizeof(char *) * array_size, NULL, &array)) != 0) + sizeof(char *) * array_size, &array)) != 0) goto err; } - if (LF_ISSET(DB_ARCH_ABS)) { + if (db_arch_abs) { if ((ret = __absname(dbenv, pref, name, &array[n])) != 0) goto err; - __os_freestr(name); + __os_free(dbenv, name); } else if ((p = __db_rpath(name)) != NULL) { if ((ret = __os_strdup(dbenv, p + 1, &array[n])) != 0) goto err; - __os_freestr(name); + __os_free(dbenv, name); } else array[n] = name; @@ -182,7 +206,7 @@ log_archive(dbenv, listp, flags, db_malloc) qsort(array, (size_t)n, sizeof(char *), __cmpfunc); /* Rework the memory. 
*/ - if ((ret = __usermem(dbenv, &array, db_malloc)) != 0) + if ((ret = __usermem(dbenv, &array)) != 0) goto err; *listp = array; @@ -190,11 +214,11 @@ log_archive(dbenv, listp, flags, db_malloc) err: if (array != NULL) { for (arrayp = array; *arrayp != NULL; ++arrayp) - __os_freestr(*arrayp); - __os_free(array, sizeof(char *) * array_size); + __os_free(dbenv, *arrayp); + __os_free(dbenv, array); } if (name != NULL) - __os_freestr(name); + __os_free(dbenv, name); return (ret); } @@ -203,73 +227,89 @@ err: if (array != NULL) { * Build a list of datafiles for return. */ static int -__build_data(dbenv, pref, listp, db_malloc) +__build_data(dbenv, pref, listp) DB_ENV *dbenv; char *pref, ***listp; - void *(*db_malloc) __P((size_t)); { DBT rec; + DB_LOGC *logc; DB_LSN lsn; - __log_register_args *argp; + __dbreg_register_args *argp; u_int32_t rectype; - int array_size, last, n, nxt, ret; - char **array, **arrayp, *p, *real_name; + int array_size, last, n, nxt, ret, t_ret; + char **array, **arrayp, **list, **lp, *p, *real_name; /* Get some initial space. 
*/ - array_size = 10; + array_size = 64; if ((ret = __os_malloc(dbenv, - sizeof(char *) * array_size, NULL, &array)) != 0) + sizeof(char *) * array_size, &array)) != 0) return (ret); array[0] = NULL; memset(&rec, 0, sizeof(rec)); - if (F_ISSET(dbenv, DB_ENV_THREAD)) - F_SET(&rec, DB_DBT_MALLOC); - for (n = 0, ret = log_get(dbenv, &lsn, &rec, DB_FIRST); - ret == 0; ret = log_get(dbenv, &lsn, &rec, DB_NEXT)) { + if ((ret = dbenv->log_cursor(dbenv, &logc, 0)) != 0) + return (ret); + for (n = 0; (ret = logc->get(logc, &lsn, &rec, DB_PREV)) == 0;) { if (rec.size < sizeof(rectype)) { ret = EINVAL; - __db_err(dbenv, "log_archive: bad log record"); - goto lg_free; + __db_err(dbenv, "DB_ENV->log_archive: bad log record"); + goto free_continue; } memcpy(&rectype, rec.data, sizeof(rectype)); - if (rectype != DB_log_register) { - if (F_ISSET(dbenv, DB_ENV_THREAD)) { - __os_free(rec.data, rec.size); - rec.data = NULL; - } + if (rectype != DB___dbreg_register) continue; - } - if ((ret = __log_register_read(dbenv, rec.data, &argp)) != 0) { + if ((ret = + __dbreg_register_read(dbenv, rec.data, &argp)) != 0) { ret = EINVAL; __db_err(dbenv, - "log_archive: unable to read log record"); - goto lg_free; + "DB_ENV->log_archive: unable to read log record"); + goto free_continue; } - if (n >= array_size - 1) { + if (n >= array_size - 2) { array_size += LIST_INCREMENT; if ((ret = __os_realloc(dbenv, - sizeof(char *) * array_size, NULL, &array)) != 0) - goto lg_free; + sizeof(char *) * array_size, &array)) != 0) + goto free_continue; } if ((ret = __os_strdup(dbenv, - argp->name.data, &array[n])) != 0) { -lg_free: if (F_ISSET(&rec, DB_DBT_MALLOC) && rec.data != NULL) - __os_free(rec.data, rec.size); - goto err1; - } - - array[++n] = NULL; - __os_free(argp, 0); - - if (F_ISSET(dbenv, DB_ENV_THREAD)) { - __os_free(rec.data, rec.size); - rec.data = NULL; + argp->name.data, &array[n++])) != 0) + goto free_continue; + array[n] = NULL; + + if (argp->ftype == DB_QUEUE) { + if ((ret = 
__qam_extent_names(dbenv, + argp->name.data, &list)) != 0) + goto q_err; + for (lp = list; + lp != NULL && *lp != NULL; lp++) { + if (n >= array_size - 2) { + array_size += LIST_INCREMENT; + if ((ret = __os_realloc(dbenv, + sizeof(char *) * + array_size, &array)) != 0) + goto q_err; + } + if ((ret = + __os_strdup(dbenv, *lp, &array[n++])) != 0) + goto q_err; + array[n] = NULL; + } +q_err: if (list != NULL) + __os_free(dbenv, list); } +free_continue: __os_free(dbenv, argp); + if (ret != 0) + break; } + if (ret == DB_NOTFOUND) + ret = 0; + if ((t_ret = logc->close(logc, 0)) != 0 && ret == 0) + ret = t_ret; + if (ret != 0) + goto err1; /* If there's nothing to return, we're done. */ if (n == 0) { @@ -297,34 +337,34 @@ lg_free: if (F_ISSET(&rec, DB_DBT_MALLOC) && rec.data != NULL) } for (++nxt; nxt < n && strcmp(array[last], array[nxt]) == 0; ++nxt) { - __os_freestr(array[nxt]); + __os_free(dbenv, array[nxt]); array[nxt] = NULL; } /* Get the real name. */ if ((ret = __db_appname(dbenv, - DB_APP_DATA, NULL, array[last], 0, NULL, &real_name)) != 0) + DB_APP_DATA, array[last], 0, NULL, &real_name)) != 0) goto err2; /* If the file doesn't exist, ignore it. */ if (__os_exists(real_name, NULL) != 0) { - __os_freestr(real_name); - __os_freestr(array[last]); + __os_free(dbenv, real_name); + __os_free(dbenv, array[last]); array[last] = NULL; continue; } /* Rework the name as requested by the user. */ - __os_freestr(array[last]); + __os_free(dbenv, array[last]); array[last] = NULL; if (pref != NULL) { ret = __absname(dbenv, pref, real_name, &array[last]); - __os_freestr(real_name); + __os_free(dbenv, real_name); if (ret != 0) goto err2; } else if ((p = __db_rpath(real_name)) != NULL) { ret = __os_strdup(dbenv, p + 1, &array[last]); - __os_freestr(real_name); + __os_free(dbenv, real_name); if (ret != 0) goto err2; } else @@ -336,7 +376,7 @@ lg_free: if (F_ISSET(&rec, DB_DBT_MALLOC) && rec.data != NULL) array[last] = NULL; /* Rework the memory. 
*/ - if ((ret = __usermem(dbenv, &array, db_malloc)) != 0) + if ((ret = __usermem(dbenv, &array)) != 0) goto err1; *listp = array; @@ -349,13 +389,13 @@ err2: /* */ if (array != NULL) for (; nxt < n; ++nxt) - __os_freestr(array[nxt]); + __os_free(dbenv, array[nxt]); /* FALLTHROUGH */ err1: if (array != NULL) { for (arrayp = array; *arrayp != NULL; ++arrayp) - __os_freestr(*arrayp); - __os_free(array, array_size * sizeof(char *)); + __os_free(dbenv, *arrayp); + __os_free(dbenv, array); } return (ret); } @@ -379,7 +419,7 @@ __absname(dbenv, pref, name, newnamep) /* Malloc space for concatenating the two. */ if ((ret = __os_malloc(dbenv, - l_pref + l_name + 2, NULL, &newname)) != 0) + l_pref + l_name + 2, &newname)) != 0) return (ret); *newnamep = newname; @@ -400,10 +440,9 @@ __absname(dbenv, pref, name, newnamep) * If the user has their own malloc routine, use it. */ static int -__usermem(dbenv, listp, db_malloc) +__usermem(dbenv, listp) DB_ENV *dbenv; char ***listp; - void *(*db_malloc) __P((size_t)); { size_t len; int ret; @@ -415,7 +454,7 @@ __usermem(dbenv, listp, db_malloc) len += sizeof(char *); /* Allocate it and set up the pointers. */ - if ((ret = __os_malloc(dbenv, len, db_malloc, &array)) != 0) + if ((ret = __os_umalloc(dbenv, len, &array)) != 0) return (ret); strp = (char *)(array + (orig - *listp) + 1); @@ -427,13 +466,13 @@ __usermem(dbenv, listp, db_malloc) *arrayp = strp; strp += len + 1; - __os_freestr(*orig); + __os_free(dbenv, *orig); } /* NULL-terminate the list. */ *arrayp = NULL; - __os_free(*listp, 0); + __os_free(dbenv, *listp); *listp = array; return (0); diff --git a/bdb/log/log_compare.c b/bdb/log/log_compare.c index 9bc3c028a5f..115f9c21b76 100644 --- a/bdb/log/log_compare.c +++ b/bdb/log/log_compare.c @@ -1,13 +1,13 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. 
*/ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: log_compare.c,v 11.3 2000/02/14 02:59:59 bostic Exp $"; +static const char revid[] = "$Id: log_compare.c,v 11.6 2002/01/11 15:52:50 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -19,6 +19,8 @@ static const char revid[] = "$Id: log_compare.c,v 11.3 2000/02/14 02:59:59 bosti /* * log_compare -- * Compare two LSN's; return 1, 0, -1 if first is >, == or < second. + * + * EXTERN: int log_compare __P((const DB_LSN *, const DB_LSN *)); */ int log_compare(lsn0, lsn1) diff --git a/bdb/log/log_findckp.c b/bdb/log/log_findckp.c deleted file mode 100644 index b1e8fddbdb7..00000000000 --- a/bdb/log/log_findckp.c +++ /dev/null @@ -1,135 +0,0 @@ -/*- - * See the file LICENSE for redistribution information. - * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 - * Sleepycat Software. All rights reserved. - */ - -#include "db_config.h" - -#ifndef lint -static const char revid[] = "$Id: log_findckp.c,v 11.5 2000/11/30 00:58:40 ubell Exp $"; -#endif /* not lint */ - -#ifndef NO_SYSTEM_INCLUDES -#include <sys/types.h> - -#include <string.h> -#endif - -#include "db_int.h" -#include "log.h" -#include "txn.h" - -/* - * __log_findckp -- - * - * Looks for the most recent checkpoint that occurs before the most recent - * checkpoint LSN, subject to the constraint that there must be at least two - * checkpoints. The reason you need two checkpoints is that you might have - * crashed during the most recent one and may not have a copy of all the - * open files. This is the point from which recovery can start and the - * point up to which archival/truncation can take place. Checkpoints in - * the log look like: - * - * ------------------------------------------------------------------- - * | ckp A, ckplsn 100 | .... record .... | ckp B, ckplsn 600 | ... 
- * ------------------------------------------------------------------- - * LSN 500 LSN 1000 - * - * If we read what log returns from using the DB_CKP parameter to logput, - * we'll get the record at LSN 1000. The checkpoint LSN there is 600. - * Now we have to scan backwards looking for a checkpoint before LSN 600. - * We find one at 500. This means that we can truncate the log before - * 500 or run recovery beginning at 500. - * - * Returns 0 if we find a suitable checkpoint or we retrieved the first - * record in the log from which to start. Returns DB_NOTFOUND if there - * are no log records, errno on error. - * - * PUBLIC: int __log_findckp __P((DB_ENV *, DB_LSN *)); - */ -int -__log_findckp(dbenv, lsnp) - DB_ENV *dbenv; - DB_LSN *lsnp; -{ - DBT data; - DB_LSN ckp_lsn, final_ckp, last_ckp, next_lsn; - __txn_ckp_args *ckp_args; - int ret; - - /* - * Need to find the appropriate point from which to begin - * recovery. - */ - memset(&data, 0, sizeof(data)); - if (F_ISSET(dbenv, DB_ENV_THREAD)) - F_SET(&data, DB_DBT_MALLOC); - ZERO_LSN(ckp_lsn); - if ((ret = log_get(dbenv, &last_ckp, &data, DB_CHECKPOINT)) != 0) { - if (ret == ENOENT) - goto get_first; - else - return (ret); - } - final_ckp = last_ckp; - - next_lsn = last_ckp; - do { - if (F_ISSET(dbenv, DB_ENV_THREAD)) - __os_free(data.data, data.size); - - if ((ret = log_get(dbenv, &next_lsn, &data, DB_SET)) != 0) - return (ret); - if ((ret = __txn_ckp_read(dbenv, data.data, &ckp_args)) != 0) { - if (F_ISSET(dbenv, DB_ENV_THREAD)) - __os_free(data.data, data.size); - return (ret); - } - if (IS_ZERO_LSN(ckp_lsn)) - ckp_lsn = ckp_args->ckp_lsn; - if (FLD_ISSET(dbenv->verbose, DB_VERB_CHKPOINT)) { - __db_err(dbenv, "Checkpoint at: [%lu][%lu]", - (u_long)last_ckp.file, (u_long)last_ckp.offset); - __db_err(dbenv, "Checkpoint LSN: [%lu][%lu]", - (u_long)ckp_args->ckp_lsn.file, - (u_long)ckp_args->ckp_lsn.offset); - __db_err(dbenv, "Previous checkpoint: [%lu][%lu]", - (u_long)ckp_args->last_ckp.file, - 
(u_long)ckp_args->last_ckp.offset); - } - last_ckp = next_lsn; - next_lsn = ckp_args->last_ckp; - __os_free(ckp_args, sizeof(*ckp_args)); - - /* - * Keep looping until either you 1) run out of checkpoints, - * 2) you've found a checkpoint before the most recent - * checkpoint's LSN and you have at least 2 checkpoints. - */ - } while (!IS_ZERO_LSN(next_lsn) && - (log_compare(&last_ckp, &ckp_lsn) > 0 || - log_compare(&final_ckp, &last_ckp) == 0)); - - if (F_ISSET(dbenv, DB_ENV_THREAD)) - __os_free(data.data, data.size); - - /* - * At this point, either, next_lsn is ZERO or ckp_lsn is the - * checkpoint lsn and last_ckp is the LSN of the last checkpoint - * before ckp_lsn. If the compare in the loop is still true, then - * next_lsn must be 0 and we need to roll forward from the - * beginning of the log. - */ - if (log_compare(&last_ckp, &ckp_lsn) >= 0 || - log_compare(&final_ckp, &last_ckp) == 0) { -get_first: if ((ret = log_get(dbenv, &last_ckp, &data, DB_FIRST)) != 0) - return (ret); - if (F_ISSET(dbenv, DB_ENV_THREAD)) - __os_free(data.data, data.size); - } - *lsnp = last_ckp; - - return (IS_ZERO_LSN(last_ckp) ? DB_NOTFOUND : 0); -} diff --git a/bdb/log/log_get.c b/bdb/log/log_get.c index b75d50a62fd..c8b028da0fb 100644 --- a/bdb/log/log_get.c +++ b/bdb/log/log_get.c @@ -1,13 +1,13 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. 
*/ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: log_get.c,v 11.32 2001/01/11 18:19:53 bostic Exp $"; +static const char revid[] = "$Id: log_get.c,v 11.81 2002/08/14 20:09:27 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -17,81 +17,175 @@ static const char revid[] = "$Id: log_get.c,v 11.32 2001/01/11 18:19:53 bostic E #include <unistd.h> #endif -#ifdef HAVE_RPC -#include "db_server.h" -#endif - #include "db_int.h" -#include "db_page.h" -#include "log.h" -#include "hash.h" +#include "dbinc/crypto.h" +#include "dbinc/db_page.h" +#include "dbinc/hmac.h" +#include "dbinc/log.h" +#include "dbinc/hash.h" -#ifdef HAVE_RPC -#include "gen_client_ext.h" -#include "rpc_client_ext.h" -#endif +typedef enum { L_ALREADY, L_ACQUIRED, L_NONE } RLOCK; + +static int __log_c_close __P((DB_LOGC *, u_int32_t)); +static int __log_c_get __P((DB_LOGC *, DB_LSN *, DBT *, u_int32_t)); +static int __log_c_get_int __P((DB_LOGC *, DB_LSN *, DBT *, u_int32_t)); +static int __log_c_hdrchk __P((DB_LOGC *, HDR *, int *)); +static int __log_c_incursor __P((DB_LOGC *, DB_LSN *, HDR *, u_int8_t **)); +static int __log_c_inregion __P((DB_LOGC *, + DB_LSN *, RLOCK *, DB_LSN *, HDR *, u_int8_t **)); +static int __log_c_io __P((DB_LOGC *, + u_int32_t, u_int32_t, void *, size_t *, int *)); +static int __log_c_ondisk __P((DB_LOGC *, + DB_LSN *, DB_LSN *, int, HDR *, u_int8_t **, int *)); +static int __log_c_set_maxrec __P((DB_LOGC *, char *)); +static int __log_c_shortread __P((DB_LOGC *, int)); /* - * log_get -- - * Get a log record. + * __log_cursor -- + * Create a log cursor. + * + * PUBLIC: int __log_cursor __P((DB_ENV *, DB_LOGC **, u_int32_t)); */ int -log_get(dbenv, alsn, dbt, flags) +__log_cursor(dbenv, logcp, flags) + DB_ENV *dbenv; + DB_LOGC **logcp; + u_int32_t flags; +{ + DB_LOGC *logc; + int ret; + + PANIC_CHECK(dbenv); + ENV_REQUIRES_CONFIG(dbenv, + dbenv->lg_handle, "DB_ENV->log_cursor", DB_INIT_LOG); + + *logcp = NULL; + + /* Validate arguments. 
*/ + if ((ret = __db_fchk(dbenv, "DB_ENV->log_cursor", flags, 0)) != 0) + return (ret); + + /* Allocate memory for the cursor. */ + if ((ret = __os_calloc(dbenv, 1, sizeof(DB_LOGC), &logc)) != 0) + goto err; + if ((ret = __os_calloc(dbenv, 1, sizeof(DB_FH), &logc->c_fh)) != 0) + goto err; + + logc->bp_size = DB_LOGC_BUF_SIZE; + if ((ret = __os_malloc(dbenv, logc->bp_size, &logc->bp)) != 0) + goto err; + + logc->dbenv = dbenv; + logc->close = __log_c_close; + logc->get = __log_c_get; + + *logcp = logc; + return (0); + +err: if (logc != NULL) { + if (logc->c_fh != NULL) + __os_free(dbenv, logc->c_fh); + __os_free(dbenv, logc); + } + + return (ret); +} + +/* + * __log_c_close -- + * Close a log cursor. + */ +static int +__log_c_close(logc, flags) + DB_LOGC *logc; + u_int32_t flags; +{ DB_ENV *dbenv; + int ret; + + dbenv = logc->dbenv; + + PANIC_CHECK(dbenv); + if ((ret = __db_fchk(dbenv, "DB_LOGC->close", flags, 0)) != 0) + return (ret); + + if (F_ISSET(logc->c_fh, DB_FH_VALID)) + (void)__os_closehandle(dbenv, logc->c_fh); + + if (logc->c_dbt.data != NULL) + __os_free(dbenv, logc->c_dbt.data); + + __os_free(dbenv, logc->bp); + __os_free(dbenv, logc->c_fh); + __os_free(dbenv, logc); + + return (0); +} + +/* + * __log_c_get -- + * Get a log record. + */ +static int +__log_c_get(logc, alsn, dbt, flags) + DB_LOGC *logc; DB_LSN *alsn; DBT *dbt; u_int32_t flags; { - DB_LOG *dblp; + DB_ENV *dbenv; DB_LSN saved_lsn; int ret; -#ifdef HAVE_RPC - if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) - return (__dbcl_log_get(dbenv, alsn, dbt, flags)); -#endif + dbenv = logc->dbenv; PANIC_CHECK(dbenv); - ENV_REQUIRES_CONFIG(dbenv, dbenv->lg_handle, DB_INIT_LOG); /* Validate arguments. 
*/ - if (flags != DB_CHECKPOINT && flags != DB_CURRENT && - flags != DB_FIRST && flags != DB_LAST && - flags != DB_NEXT && flags != DB_PREV && flags != DB_SET) - return (__db_ferr(dbenv, "log_get", 1)); - - if (F_ISSET(dbenv, DB_ENV_THREAD)) { - if (flags == DB_NEXT || flags == DB_PREV || flags == DB_CURRENT) - return (__db_ferr(dbenv, "log_get", 1)); - if (!F_ISSET(dbt, - DB_DBT_MALLOC | DB_DBT_REALLOC | DB_DBT_USERMEM)) - return (__db_ferr(dbenv, "threaded data", 1)); + switch (flags) { + case DB_CURRENT: + case DB_FIRST: + case DB_LAST: + case DB_NEXT: + case DB_PREV: + break; + case DB_SET: + if (IS_ZERO_LSN(*alsn)) { + __db_err(dbenv, "DB_LOGC->get: invalid LSN"); + return (EINVAL); + } + break; + default: + return (__db_ferr(dbenv, "DB_LOGC->get", 1)); } - dblp = dbenv->lg_handle; - R_LOCK(dbenv, &dblp->reginfo); - /* - * The alsn field is only initialized if DB_SET is the flag, so this - * assignment causes uninitialized memory complaints for other flag - * values. + * On error, we take care not to overwrite the caller's LSN. This + * is because callers looking for the end of the log loop using the + * DB_NEXT flag, and expect to take the last successful lsn out of + * the passed-in structure after DB_LOGC->get fails with DB_NOTFOUND. + * + * !!! + * This line is often flagged an uninitialized memory read during a + * Purify or similar tool run, as the application didn't initialize + * *alsn. If the application isn't setting the DB_SET flag, there is + * no reason it should have initialized *alsn, but we can't know that + * and we want to make sure we never overwrite whatever the application + * put in there. */ -#ifdef UMRW - if (flags == DB_SET) - saved_lsn = *alsn; - else - ZERO_LSN(saved_lsn); -#else saved_lsn = *alsn; -#endif /* - * If we get one of the log's header records, repeat the operation. - * This assumes that applications don't ever request the log header - * records by LSN, but that seems reasonable to me. 
+ * If we get one of the log's header records as a result of doing a + * DB_FIRST, DB_NEXT, DB_LAST or DB_PREV, repeat the operation, log + * file header records aren't useful to applications. */ - if ((ret = __log_get(dblp, - alsn, dbt, flags, 0)) == 0 && alsn->offset == 0) { + if ((ret = __log_c_get_int(logc, alsn, dbt, flags)) != 0) { + *alsn = saved_lsn; + return (ret); + } + if (alsn->offset == 0 && (flags == DB_FIRST || + flags == DB_NEXT || flags == DB_LAST || flags == DB_PREV)) { switch (flags) { case DB_FIRST: flags = DB_NEXT; @@ -101,92 +195,100 @@ log_get(dbenv, alsn, dbt, flags) break; } if (F_ISSET(dbt, DB_DBT_MALLOC)) { - __os_free(dbt->data, dbt->size); + __os_free(dbenv, dbt->data); dbt->data = NULL; } - ret = __log_get(dblp, alsn, dbt, flags, 0); + if ((ret = __log_c_get_int(logc, alsn, dbt, flags)) != 0) { + *alsn = saved_lsn; + return (ret); + } } - if (ret != 0) - *alsn = saved_lsn; - R_UNLOCK(dbenv, &dblp->reginfo); - - return (ret); + return (0); } /* - * __log_get -- + * __log_c_get_int -- * Get a log record; internal version. - * - * PUBLIC: int __log_get __P((DB_LOG *, DB_LSN *, DBT *, u_int32_t, int)); */ -int -__log_get(dblp, alsn, dbt, flags, silent) - DB_LOG *dblp; +static int +__log_c_get_int(logc, alsn, dbt, flags) + DB_LOGC *logc; DB_LSN *alsn; DBT *dbt; u_int32_t flags; - int silent; { + DB_CIPHER *db_cipher; DB_ENV *dbenv; - DB_LSN nlsn; + DB_LOG *dblp; + DB_LSN last_lsn, nlsn; HDR hdr; LOG *lp; - const char *fail; - char *np, *tbuf; - int cnt, ret; + RLOCK rlock; logfile_validity status; - size_t len, nr; - u_int32_t offset; - u_int8_t *p; - void *shortp, *readp; + u_int32_t cnt; + u_int8_t *rp; + int eof, is_hmac, ret; + dbenv = logc->dbenv; + dblp = dbenv->lg_handle; lp = dblp->reginfo.primary; - fail = np = tbuf = NULL; - dbenv = dblp->dbenv; + is_hmac = 0; - nlsn = dblp->c_lsn; + /* + * We don't acquire the log region lock until we need it, and we + * release it as soon as we're done. 
+ */ + rlock = F_ISSET(logc, DB_LOG_LOCKED) ? L_ALREADY : L_NONE; + + nlsn = logc->c_lsn; switch (flags) { - case DB_CHECKPOINT: - nlsn = lp->chkpt_lsn; - if (IS_ZERO_LSN(nlsn)) { - /* No db_err. The caller may expect this. */ - ret = ENOENT; - goto err2; - } - break; case DB_NEXT: /* Next log record. */ if (!IS_ZERO_LSN(nlsn)) { /* Increment the cursor by the cursor record size. */ - nlsn.offset += dblp->c_len; + nlsn.offset += logc->c_len; break; } + flags = DB_FIRST; /* FALLTHROUGH */ - case DB_FIRST: /* Find the first log record. */ + case DB_FIRST: /* First log record. */ /* Find the first log file. */ if ((ret = __log_find(dblp, 1, &cnt, &status)) != 0) - goto err2; + goto err; /* - * We want any readable version, so either DB_LV_NORMAL - * or DB_LV_OLD_READABLE is acceptable here. If it's - * not one of those two, there is no first log record that - * we can read. + * DB_LV_INCOMPLETE: + * Theoretically, the log file we want could be created + * but not yet written, the "first" log record must be + * in the log buffer. + * DB_LV_NORMAL: + * DB_LV_OLD_READABLE: + * We found a log file we can read. + * DB_LV_NONEXISTENT: + * No log files exist, the "first" log record must be in + * the log buffer. + * DB_LV_OLD_UNREADABLE: + * No readable log files exist, we're at the cross-over + * point between two versions. The "first" log record + * must be in the log buffer. */ - if (status != DB_LV_NORMAL && status != DB_LV_OLD_READABLE) { - ret = DB_NOTFOUND; - goto err2; + switch (status) { + case DB_LV_INCOMPLETE: + DB_ASSERT(lp->lsn.file == cnt); + /* FALLTHROUGH */ + case DB_LV_NORMAL: + case DB_LV_OLD_READABLE: + nlsn.file = cnt; + break; + case DB_LV_NONEXISTENT: + nlsn.file = 1; + DB_ASSERT(lp->lsn.file == nlsn.file); + break; + case DB_LV_OLD_UNREADABLE: + nlsn.file = cnt + 1; + DB_ASSERT(lp->lsn.file == nlsn.file); + break; } - - /* - * We may have only entered records in the buffer, and not - * yet written a log file. 
If no log files were found and - * there's anything in the buffer, it belongs to file 1. - */ - if (cnt == 0) - cnt = 1; - - nlsn.file = cnt; nlsn.offset = 0; break; case DB_CURRENT: /* Current log record. */ @@ -197,21 +299,28 @@ __log_get(dblp, alsn, dbt, flags, silent) if (nlsn.offset == 0) { if (nlsn.file == 1 || __log_valid(dblp, - nlsn.file - 1, 0, &status) != 0) - return (DB_NOTFOUND); + nlsn.file - 1, 0, &status) != 0) { + ret = DB_NOTFOUND; + goto err; + } if (status != DB_LV_NORMAL && - status != DB_LV_OLD_READABLE) - return (DB_NOTFOUND); + status != DB_LV_OLD_READABLE) { + ret = DB_NOTFOUND; + goto err; + } --nlsn.file; - nlsn.offset = dblp->c_off; - } else - nlsn.offset = dblp->c_off; + } + nlsn.offset = logc->c_prev; break; } /* FALLTHROUGH */ case DB_LAST: /* Last log record. */ + if (rlock == L_NONE) { + rlock = L_ACQUIRED; + R_LOCK(dbenv, &dblp->reginfo); + } nlsn.file = lp->lsn.file; nlsn.offset = lp->lsn.offset - lp->len; break; @@ -225,241 +334,725 @@ next_file: ++nlsn.file; nlsn.offset = 0; } - /* Return 1 if the request is past the end of the log. */ - if (nlsn.file > lp->lsn.file || - (nlsn.file == lp->lsn.file && nlsn.offset >= lp->lsn.offset)) - return (DB_NOTFOUND); + /* + * The above switch statement should have set nlsn to the lsn of + * the requested record. + */ - /* If we've switched files, discard the current file handle. */ - if (dblp->c_lsn.file != nlsn.file && - F_ISSET(&dblp->c_fh, DB_FH_VALID)) { - (void)__os_closehandle(&dblp->c_fh); + if (CRYPTO_ON(dbenv)) { + hdr.size = HDR_CRYPTO_SZ; + is_hmac = 1; + } else { + hdr.size = HDR_NORMAL_SZ; + is_hmac = 0; } - - /* If the entire record is in the in-memory buffer, copy it out. */ - if (nlsn.file == lp->lsn.file && nlsn.offset >= lp->w_off) { - /* Copy the header. */ - p = dblp->bufp + (nlsn.offset - lp->w_off); - memcpy(&hdr, p, sizeof(HDR)); - - /* Copy the record. 
*/ - len = hdr.len - sizeof(HDR); - if ((ret = __db_retcopy(NULL, dbt, p + sizeof(HDR), - len, &dblp->c_dbt.data, &dblp->c_dbt.ulen)) != 0) - goto err2; + /* Check to see if the record is in the cursor's buffer. */ + if ((ret = __log_c_incursor(logc, &nlsn, &hdr, &rp)) != 0) + goto err; + if (rp != NULL) goto cksum; - } - shortp = NULL; + /* + * Look to see if we're moving backward in the log with the last record + * coming from the disk -- it means the record can't be in the region's + * buffer. Else, check the region's buffer. + * + * If the record isn't in the region's buffer, we're going to have to + * read the record from disk. We want to make a point of not reading + * past the end of the logical log (after recovery, there may be data + * after the end of the logical log, not to mention the log file may + * have been pre-allocated). So, zero out last_lsn, and initialize it + * inside __log_c_inregion -- if it's still zero when we check it in + * __log_c_ondisk, that's OK, it just means the logical end of the log + * isn't an issue for this request. + */ + ZERO_LSN(last_lsn); + if (!F_ISSET(logc, DB_LOG_DISK) || + log_compare(&nlsn, &logc->c_lsn) > 0) { + F_CLR(logc, DB_LOG_DISK); - /* Acquire a file descriptor. */ - if (!F_ISSET(&dblp->c_fh, DB_FH_VALID)) { - if ((ret = __log_name(dblp, nlsn.file, - &np, &dblp->c_fh, DB_OSO_RDONLY | DB_OSO_SEQ)) != 0) { - fail = np; - goto err1; - } - __os_freestr(np); - np = NULL; + if ((ret = __log_c_inregion(logc, + &nlsn, &rlock, &last_lsn, &hdr, &rp)) != 0) + goto err; + if (rp != NULL) + goto cksum; } - /* See if we've already read this */ - if (nlsn.file == dblp->r_file && nlsn.offset > dblp->r_off - && nlsn.offset + sizeof(HDR) < dblp->r_off + dblp->r_size) - goto got_header; - /* - * Seek to the header offset and read the header. Because the file - * may be pre-allocated, we have to make sure that we're not reading - * past the information in the start of the in-memory buffer. 
+ * We have to read from an on-disk file to retrieve the record. + * If we ever can't retrieve the record at offset 0, we're done, + * return EOF/DB_NOTFOUND. + * + * Discard the region lock if we're still holding it, the on-disk + * reading routines don't need it. */ - - readp = &hdr; - offset = nlsn.offset; - if (nlsn.file == lp->lsn.file && offset + sizeof(HDR) > lp->w_off) - nr = lp->w_off - offset; - else if (dblp->readbufp == NULL) - nr = sizeof(HDR); - else { - nr = lp->buffer_size; - readp = dblp->readbufp; - dblp->r_file = nlsn.file; - /* Going backwards. Put the current in the middle. */ - if (flags == DB_PREV || flags == DB_LAST) { - if (offset <= lp->buffer_size/2) - offset = 0; - else - offset = offset - lp->buffer_size/2; - } - if (nlsn.file == lp->lsn.file && offset + nr > lp->lsn.offset) - nr = lp->lsn.offset - offset; - dblp->r_off = offset; + if (rlock == L_ACQUIRED) { + rlock = L_NONE; + R_UNLOCK(dbenv, &dblp->reginfo); + } + if ((ret = __log_c_ondisk( + logc, &nlsn, &last_lsn, flags, &hdr, &rp, &eof)) != 0) + goto err; + if (eof == 1) { + /* + * Only DB_NEXT automatically moves to the next file, and + * it only happens once. + */ + if (flags != DB_NEXT || nlsn.offset == 0) + return (DB_NOTFOUND); + goto next_file; } + F_SET(logc, DB_LOG_DISK); - if ((ret = __os_seek(dblp->dbenv, - &dblp->c_fh, 0, 0, offset, 0, DB_OS_SEEK_SET)) != 0) { - fail = "seek"; - goto err1; +cksum: /* + * Discard the region lock if we're still holding it. (The path to + * get here is that we acquired the lock because of the caller's + * flag argument, but we found the record in the cursor's buffer. + * Improbable, but it's easy to avoid. + */ + if (rlock == L_ACQUIRED) { + rlock = L_NONE; + R_UNLOCK(dbenv, &dblp->reginfo); } - if ((ret = __os_read(dblp->dbenv, &dblp->c_fh, readp, nr, &nr)) != 0) { - fail = "read"; - goto err1; + + /* + * Checksum: there are two types of errors -- a configuration error + * or a checksum mismatch. The former is always bad. 
The latter is + * OK if we're searching for the end of the log, and very, very bad + * if we're reading random log records. + */ + db_cipher = dbenv->crypto_handle; + if ((ret = __db_check_chksum(dbenv, db_cipher, + hdr.chksum, rp + hdr.size, hdr.len - hdr.size, is_hmac)) != 0) { + if (F_ISSET(logc, DB_LOG_SILENT_ERR)) { + if (ret == 0 || ret == -1) + ret = EIO; + } else if (ret == -1) { + __db_err(dbenv, + "DB_LOGC->get: log record checksum mismatch"); + __db_err(dbenv, + "DB_LOGC->get: catastrophic recovery may be required"); + ret = __db_panic(dbenv, DB_RUNRECOVERY); + } + goto err; } - if (nr < sizeof(HDR)) { - /* If read returns EOF, try the next file. */ - if (nr == 0) { - if (flags != DB_NEXT || nlsn.file == lp->lsn.file) - goto corrupt; + + /* + * If we got a 0-length record, that means we're in the midst of + * some bytes that got 0'd as the result of a vtruncate. We're + * going to have to retry. + */ + if (hdr.len == 0) { + switch (flags) { + case DB_FIRST: + case DB_NEXT: + /* Zero'd records always indicate the end of a file. */ goto next_file; + + case DB_LAST: + case DB_PREV: + /* + * We should never get here. If we recover a log + * file with 0's at the end, we'll treat the 0'd + * headers as the end of log and ignore them. If + * we're reading backwards from another file, then + * the first record in that new file should have its + * prev field set correctly. + */ + __db_err(dbenv, + "Encountered zero length records while traversing backwards"); + DB_ASSERT(0); + case DB_SET: + default: + /* Return the 0-length record. */ + break; } + } - if (dblp->readbufp != NULL) - memcpy((u_int8_t *) &hdr, readp, nr); + /* Copy the record into the user's DBT. 
*/ + if ((ret = __db_retcopy(dbenv, dbt, rp + hdr.size, + (u_int32_t)(hdr.len - hdr.size), + &logc->c_dbt.data, &logc->c_dbt.ulen)) != 0) + goto err; + if (CRYPTO_ON(dbenv)) { + if ((ret = db_cipher->decrypt(dbenv, db_cipher->data, + hdr.iv, dbt->data, hdr.len - hdr.size)) != 0) { + ret = EAGAIN; + goto err; + } /* - * If read returns a short count the rest of the record has - * to be in the in-memory buffer. + * Return the original log record size to the user, + * even though we've allocated more than that, possibly. + * The log record is decrypted in the user dbt, not in + * the buffer, so we must do this here after decryption, + * not adjust the len passed to the __db_retcopy call. */ - if (lp->b_off < sizeof(HDR) - nr) - goto corrupt; + dbt->size = hdr.orig_size; + } - /* Get the rest of the header from the in-memory buffer. */ - memcpy((u_int8_t *)&hdr + nr, dblp->bufp, sizeof(HDR) - nr); + /* Update the cursor and the returned LSN. */ + *alsn = nlsn; + logc->c_lsn = nlsn; + logc->c_len = hdr.len; + logc->c_prev = hdr.prev; - if (hdr.len == 0) - goto next_file; +err: if (rlock == L_ACQUIRED) + R_UNLOCK(dbenv, &dblp->reginfo); - shortp = dblp->bufp + (sizeof(HDR) - nr); - } + return (ret); +} - else if (dblp->readbufp != NULL) { - dblp->r_size = nr; -got_header: memcpy((u_int8_t *)&hdr, - dblp->readbufp + (nlsn.offset - dblp->r_off), sizeof(HDR)); - } +/* + * __log_c_incursor -- + * Check to see if the requested record is in the cursor's buffer. + */ +static int +__log_c_incursor(logc, lsn, hdr, pp) + DB_LOGC *logc; + DB_LSN *lsn; + HDR *hdr; + u_int8_t **pp; +{ + u_int8_t *p; + + *pp = NULL; /* - * Check for buffers of 0's, that's what we usually see during recovery, - * although it's certainly not something on which we can depend. Check - * for impossibly large records. The malloc should fail later, but we - * have customers that run mallocs that handle allocation failure as a - * fatal error. 
+ * Test to see if the requested LSN could be part of the cursor's + * buffer. + * + * The record must be part of the same file as the cursor's buffer. + * The record must start at a byte offset equal to or greater than + * the cursor buffer. + * The record must not start at a byte offset after the cursor + * buffer's end. */ - if (hdr.len == 0) - goto next_file; - if (hdr.len <= sizeof(HDR) || hdr.len > lp->persist.lg_max) - goto corrupt; - len = hdr.len - sizeof(HDR); - - /* If we've already moved to the in-memory buffer, fill from there. */ - if (shortp != NULL) { - if (lp->b_off < ((u_int8_t *)shortp - dblp->bufp) + len) - goto corrupt; - if ((ret = __db_retcopy(NULL, dbt, shortp, len, - &dblp->c_dbt.data, &dblp->c_dbt.ulen)) != 0) - goto err2; - goto cksum; - } + if (logc->bp_lsn.file != lsn->file) + return (0); + if (logc->bp_lsn.offset > lsn->offset) + return (0); + if (logc->bp_lsn.offset + logc->bp_rlen <= lsn->offset + hdr->size) + return (0); - if (dblp->readbufp != NULL) { - if (nlsn.offset + hdr.len < dblp->r_off + dblp->r_size) { - if ((ret = __db_retcopy(NULL, dbt, dblp->readbufp + - (nlsn.offset - dblp->r_off) + sizeof(HDR), - len, &dblp->c_dbt.data, &dblp->c_dbt.ulen)) != 0) - goto err2; - goto cksum; - } else if ((ret = __os_seek(dblp->dbenv, &dblp->c_fh, 0, - 0, nlsn.offset + sizeof(HDR), 0, DB_OS_SEEK_SET)) != 0) { - fail = "seek"; - goto err1; - } + /* + * Read the record's header and check if the record is entirely held + * in the buffer. If the record is not entirely held, get it again. + * (The only advantage in having part of the record locally is that + * we might avoid a system call because we already have the HDR in + * memory.) + * + * If the header check fails for any reason, it must be because the + * LSN is bogus. Fail hard. 
+ */ + p = logc->bp + (lsn->offset - logc->bp_lsn.offset); + memcpy(hdr, p, hdr->size); + if (__log_c_hdrchk(logc, hdr, NULL)) + return (DB_NOTFOUND); + if (logc->bp_lsn.offset + logc->bp_rlen <= lsn->offset + hdr->len) + return (0); + + *pp = p; /* Success. */ + + return (0); +} + +/* + * __log_c_inregion -- + * Check to see if the requested record is in the region's buffer. + */ +static int +__log_c_inregion(logc, lsn, rlockp, last_lsn, hdr, pp) + DB_LOGC *logc; + DB_LSN *lsn, *last_lsn; + RLOCK *rlockp; + HDR *hdr; + u_int8_t **pp; +{ + DB_ENV *dbenv; + DB_LOG *dblp; + LOG *lp; + size_t len, nr; + u_int32_t b_disk, b_region; + int ret; + u_int8_t *p; + + dbenv = logc->dbenv; + dblp = dbenv->lg_handle; + lp = ((DB_LOG *)logc->dbenv->lg_handle)->reginfo.primary; + + ret = 0; + *pp = NULL; + + /* If we haven't yet acquired the log region lock, do so. */ + if (*rlockp == L_NONE) { + *rlockp = L_ACQUIRED; + R_LOCK(dbenv, &dblp->reginfo); } /* - * Allocate temporary memory to hold the record. + * The routines to read from disk must avoid reading past the logical + * end of the log, so pass that information back to it. * - * XXX - * We're calling malloc(3) with a region locked. This isn't - * a good idea. + * Since they're reading directly from the disk, they must also avoid + * reading past the offset we've written out. If the log was + * truncated, it's possible that there are zeroes or garbage on + * disk after this offset, and the logical end of the log can + * come later than this point if the log buffer isn't empty. */ - if ((ret = __os_malloc(dbenv, len, NULL, &tbuf)) != 0) - goto err1; + *last_lsn = lp->lsn; + if (last_lsn->offset > lp->w_off) + last_lsn->offset = lp->w_off; /* - * Read the record into the buffer. If read returns a short count, - * there was an error or the rest of the record is in the in-memory - * buffer. Note, the information may be garbage if we're in recovery, - * so don't read past the end of the buffer's memory. 
- * - * Because the file may be pre-allocated, we have to make sure that - * we're not reading past the information in the start of the in-memory + * Test to see if the requested LSN could be part of the region's * buffer. + * + * During recovery, we read the log files getting the information to + * initialize the region. In that case, the region's lsn field will + * not yet have been filled in, use only the disk. + * + * The record must not start at a byte offset after the region buffer's + * end, since that means the request is for a record after the end of + * the log. Do this test even if the region's buffer is empty -- after + * recovery, the log files may continue past the declared end-of-log, + * and the disk reading routine will incorrectly attempt to read the + * remainder of the log. + * + * Otherwise, test to see if the region's buffer actually has what we + * want: + * + * The buffer must have some useful content. + * The record must be in the same file as the region's buffer and must + * start at a byte offset equal to or greater than the region's buffer. + */ + if (IS_ZERO_LSN(lp->lsn)) + return (0); + if (lsn->file > lp->lsn.file || + (lsn->file == lp->lsn.file && lsn->offset >= lp->lsn.offset)) + return (DB_NOTFOUND); + if (lp->b_off == 0) + return (0); + if (lsn->file < lp->f_lsn.file || lsn->offset < lp->f_lsn.offset) + return (0); + + /* + * The current contents of the cursor's buffer will be useless for a + * future call -- trash it rather than try and make it look correct. + */ + ZERO_LSN(logc->bp_lsn); + + /* + * If the requested LSN is greater than the region buffer's first + * byte, we know the entire record is in the buffer. + * + * If the header check fails for any reason, it must be because the + * LSN is bogus. Fail hard. 
*/ - if (nlsn.file == lp->lsn.file && - nlsn.offset + sizeof(HDR) + len > lp->w_off) - nr = lp->w_off - (nlsn.offset + sizeof(HDR)); + if (lsn->offset > lp->f_lsn.offset) { + p = dblp->bufp + (lsn->offset - lp->w_off); + memcpy(hdr, p, hdr->size); + if (__log_c_hdrchk(logc, hdr, NULL)) + return (DB_NOTFOUND); + if (logc->bp_size <= hdr->len) { + len = ALIGN(hdr->len * 2, 128); + if ((ret = + __os_realloc(logc->dbenv, len, &logc->bp)) != 0) + return (ret); + logc->bp_size = (u_int32_t)len; + } + memcpy(logc->bp, p, hdr->len); + *pp = logc->bp; + return (0); + } + + /* + * There's a partial record, that is, the requested record starts + * in a log file and finishes in the region buffer. We have to + * find out how many bytes of the record are in the region buffer + * so we can copy them out into the cursor buffer. First, check + * to see if the requested record is the only record in the region + * buffer, in which case we should copy the entire region buffer. + * + * Else, walk back through the region's buffer to find the first LSN + * after the record that crosses the buffer boundary -- we can detect + * that LSN, because its "prev" field will reference the record we + * want. The bytes we need to copy from the region buffer are the + * bytes up to the record we find. The bytes we'll need to allocate + * to hold the log record are the bytes between the two offsets. + */ + b_disk = lp->w_off - lsn->offset; + if (lp->b_off <= lp->len) + b_region = (u_int32_t)lp->b_off; else - nr = len; - if ((ret = __os_read(dblp->dbenv, &dblp->c_fh, tbuf, nr, &nr)) != 0) { - fail = "read"; - goto err1; + for (p = dblp->bufp + (lp->b_off - lp->len);;) { + memcpy(hdr, p, hdr->size); + if (hdr->prev == lsn->offset) { + b_region = (u_int32_t)(p - dblp->bufp); + break; + } + p = dblp->bufp + (hdr->prev - lp->w_off); + } + + /* + * If we don't have enough room for the record, we have to allocate + * space. 
We have to do it while holding the region lock, which is + * truly annoying, but there's no way around it. This call is why + * we allocate cursor buffer space when allocating the cursor instead + * of waiting. + */ + if (logc->bp_size <= b_region + b_disk) { + len = ALIGN((b_region + b_disk) * 2, 128); + if ((ret = __os_realloc(logc->dbenv, len, &logc->bp)) != 0) + return (ret); + logc->bp_size = (u_int32_t)len; } - if (len - nr > lp->buffer_size) - goto corrupt; - if (nr != len) { - if (lp->b_off < len - nr) - goto corrupt; - - /* Get the rest of the record from the in-memory buffer. */ - memcpy((u_int8_t *)tbuf + nr, dblp->bufp, len - nr); + + /* Copy the region's bytes to the end of the cursor's buffer. */ + p = (logc->bp + logc->bp_size) - b_region; + memcpy(p, dblp->bufp, b_region); + + /* Release the region lock. */ + if (*rlockp == L_ACQUIRED) { + *rlockp = L_NONE; + R_UNLOCK(dbenv, &dblp->reginfo); } - /* Copy the record into the user's DBT. */ - if ((ret = __db_retcopy(NULL, dbt, tbuf, len, - &dblp->c_dbt.data, &dblp->c_dbt.ulen)) != 0) - goto err2; - __os_free(tbuf, 0); - tbuf = NULL; + /* + * Read the rest of the information from disk. Neither short reads + * or EOF are acceptable, the bytes we want had better be there. + */ + if (b_disk != 0) { + p -= b_disk; + nr = b_disk; + if ((ret = __log_c_io( + logc, lsn->file, lsn->offset, p, &nr, NULL)) != 0) + return (ret); + if (nr < b_disk) + return (__log_c_shortread(logc, 0)); + } -cksum: /* - * If the user specified a partial record read, the checksum can't - * match. It's not an obvious thing to do, but a user testing for - * the length of a record might do it. + /* Copy the header information into the caller's structure. */ + memcpy(hdr, p, hdr->size); + + *pp = p; + return (0); +} + +/* + * __log_c_ondisk -- + * Read a record off disk. 
+ */ +static int +__log_c_ondisk(logc, lsn, last_lsn, flags, hdr, pp, eofp) + DB_LOGC *logc; + DB_LSN *lsn, *last_lsn; + int flags, *eofp; + HDR *hdr; + u_int8_t **pp; +{ + DB_ENV *dbenv; + size_t len, nr; + u_int32_t offset; + int ret; + + dbenv = logc->dbenv; + *eofp = 0; + + nr = hdr->size; + if ((ret = + __log_c_io(logc, lsn->file, lsn->offset, hdr, &nr, eofp)) != 0) + return (ret); + if (*eofp) + return (0); + + /* If we read 0 bytes, assume we've hit EOF. */ + if (nr == 0) { + *eofp = 1; + return (0); + } + + /* Check the HDR. */ + if ((ret = __log_c_hdrchk(logc, hdr, eofp)) != 0) + return (ret); + if (*eofp) + return (0); + + /* Otherwise, we should have gotten the bytes we wanted. */ + if (nr < hdr->size) + return (__log_c_shortread(logc, 0)); + + /* + * Regardless of how we return, the previous contents of the cursor's + * buffer are useless -- trash it. */ - if (!F_ISSET(dbt, DB_DBT_PARTIAL) && - hdr.cksum != __ham_func4(NULL, dbt->data, dbt->size)) { - if (!silent) - __db_err(dbenv, "log_get: checksum mismatch"); - goto corrupt; + ZERO_LSN(logc->bp_lsn); + + /* + * Otherwise, we now (finally!) know how big the record is. (Maybe + * we should have just stuck the length of the record into the LSN!?) + * Make sure we have enough space. + */ + if (logc->bp_size <= hdr->len) { + len = ALIGN(hdr->len * 2, 128); + if ((ret = __os_realloc(dbenv, len, &logc->bp)) != 0) + return (ret); + logc->bp_size = (u_int32_t)len; } - /* Update the cursor and the return lsn. */ - dblp->c_off = hdr.prev; - dblp->c_len = hdr.len; - dblp->c_lsn = nlsn; - *alsn = nlsn; + /* + * If we're moving forward in the log file, read this record in at the + * beginning of the buffer. Otherwise, read this record in at the end + * of the buffer, making sure we don't try and read before the start + * of the file. (We prefer positioning at the end because transaction + * aborts use DB_SET to move backward through the log and we might get + * lucky.) 
+ * + * Read a buffer's worth, without reading past the logical EOF. The + * last_lsn may be a zero LSN, but that's OK, the test works anyway. + */ + if (flags == DB_FIRST || flags == DB_NEXT) + offset = lsn->offset; + else if (lsn->offset + hdr->len < logc->bp_size) + offset = 0; + else + offset = (lsn->offset + hdr->len) - logc->bp_size; + + nr = logc->bp_size; + if (lsn->file == last_lsn->file && offset + nr >= last_lsn->offset) + nr = last_lsn->offset - offset; + + if ((ret = + __log_c_io(logc, lsn->file, offset, logc->bp, &nr, eofp)) != 0) + return (ret); + + /* + * We should have at least gotten the bytes up-to-and-including the + * record we're reading. + */ + if (nr < (lsn->offset + hdr->len) - offset) + return (__log_c_shortread(logc, 1)); + + /* Set up the return information. */ + logc->bp_rlen = (u_int32_t)nr; + logc->bp_lsn.file = lsn->file; + logc->bp_lsn.offset = offset; + *pp = logc->bp + (lsn->offset - offset); + + return (0); +} + +/* + * __log_c_hdrchk -- + * + * Check for corrupted HDRs before we use them to allocate memory or find + * records. + * + * If the log files were pre-allocated, a zero-filled HDR structure is the + * logical file end. However, we can see buffers filled with 0's during + * recovery, too (because multiple log buffers were written asynchronously, + * and one made it to disk before a different one that logically precedes + * it in the log file. + * + * XXX + * I think there's a potential pre-allocation recovery flaw here -- if we + * fail to write a buffer at the end of a log file (by scheduling its + * write asynchronously, and it never making it to disk), then succeed in + * writing a log file block to a subsequent log file, I don't think we will + * detect that the buffer of 0's should have marked the end of the log files + * during recovery. I think we may need to always write some garbage after + * each block write if we pre-allocate log files. 
(At the moment, we do not + * pre-allocate, so this isn't currently an issue.) + * + * Check for impossibly large records. The malloc should fail later, but we + * have customers that run mallocs that treat all allocation failures as fatal + * errors. + * + * Note that none of this is necessarily something awful happening. We let + * the application hand us any LSN they want, and it could be a pointer into + * the middle of a log record, there's no way to tell. + */ +static int +__log_c_hdrchk(logc, hdr, eofp) + DB_LOGC *logc; + HDR *hdr; + int *eofp; +{ + DB_ENV *dbenv; + int ret; + + dbenv = logc->dbenv; + + /* Sanity check the log record's size. */ + if (hdr->len <= hdr->size) + goto err; + /* + * If the cursor's max-record value isn't yet set, it means we aren't + * reading these records from a log file and no check is necessary. + */ + if (logc->bp_maxrec != 0 && hdr->len > logc->bp_maxrec) { + /* + * If we fail the check, there's the pathological case that + * we're reading the last file, it's growing, and our initial + * check information was wrong. Get it again, to be sure. + */ + if ((ret = __log_c_set_maxrec(logc, NULL)) != 0) { + __db_err(dbenv, "DB_LOGC->get: %s", db_strerror(ret)); + return (ret); + } + if (logc->bp_maxrec != 0 && hdr->len > logc->bp_maxrec) + goto err; + } + + if (eofp != NULL) { + if (hdr->prev == 0 && hdr->chksum[0] == 0 && hdr->len == 0) { + *eofp = 1; + return (0); + } + *eofp = 0; + } return (0); -corrupt:/* - * This is the catchall -- for some reason we didn't find enough - * information or it wasn't reasonable information, and it wasn't - * because a system call failed. +err: if (!F_ISSET(logc, DB_LOG_SILENT_ERR)) + __db_err(dbenv, "DB_LOGC->get: invalid log record header"); + return (EIO); +} + +/* + * __log_c_io -- + * Read records from a log file. 
+ */ +static int +__log_c_io(logc, fnum, offset, p, nrp, eofp) + DB_LOGC *logc; + u_int32_t fnum, offset; + void *p; + size_t *nrp; + int *eofp; +{ + DB_ENV *dbenv; + DB_LOG *dblp; + int ret; + char *np; + + dbenv = logc->dbenv; + dblp = dbenv->lg_handle; + + /* + * If we've switched files, discard the current file handle and acquire + * a new one. */ - ret = EIO; - fail = "read"; + if (F_ISSET(logc->c_fh, DB_FH_VALID) && logc->bp_lsn.file != fnum) + if ((ret = __os_closehandle(dbenv, logc->c_fh)) != 0) + return (ret); + if (!F_ISSET(logc->c_fh, DB_FH_VALID)) { + if ((ret = __log_name(dblp, fnum, + &np, logc->c_fh, DB_OSO_RDONLY | DB_OSO_SEQ)) != 0) { + /* + * If we're allowed to return EOF, assume that's the + * problem, set the EOF status flag and return 0. + */ + if (eofp != NULL) { + *eofp = 1; + ret = 0; + } else if (!F_ISSET(logc, DB_LOG_SILENT_ERR)) + __db_err(dbenv, "DB_LOGC->get: %s: %s", + np, db_strerror(ret)); + __os_free(dbenv, np); + return (ret); + } -err1: if (!silent) { - if (fail == NULL) - __db_err(dbenv, "log_get: %s", db_strerror(ret)); - else + if ((ret = __log_c_set_maxrec(logc, np)) != 0) { __db_err(dbenv, - "log_get: %s: %s", fail, db_strerror(ret)); + "DB_LOGC->get: %s: %s", np, db_strerror(ret)); + __os_free(dbenv, np); + return (ret); + } + __os_free(dbenv, np); } -err2: if (np != NULL) - __os_freestr(np); - if (tbuf != NULL) - __os_free(tbuf, 0); - return (ret); + /* Seek to the record's offset. */ + if ((ret = __os_seek(dbenv, + logc->c_fh, 0, 0, offset, 0, DB_OS_SEEK_SET)) != 0) { + if (!F_ISSET(logc, DB_LOG_SILENT_ERR)) + __db_err(dbenv, + "DB_LOGC->get: seek: %s", db_strerror(ret)); + return (ret); + } + + /* Read the data. */ + if ((ret = __os_read(dbenv, logc->c_fh, p, *nrp, nrp)) != 0) { + if (!F_ISSET(logc, DB_LOG_SILENT_ERR)) + __db_err(dbenv, + "DB_LOGC->get: read: %s", db_strerror(ret)); + return (ret); + } + + return (0); +} + +/* + * __log_c_shortread -- + * Read was short -- return a consistent error message and error. 
+ */ +static int +__log_c_shortread(logc, silent) + DB_LOGC *logc; + int silent; +{ + if (!silent || !F_ISSET(logc, DB_LOG_SILENT_ERR)) + __db_err(logc->dbenv, "DB_LOGC->get: short read"); + return (EIO); +} + +/* + * __log_c_set_maxrec -- + * Bound the maximum log record size in a log file. + */ +static int +__log_c_set_maxrec(logc, np) + DB_LOGC *logc; + char *np; +{ + DB_ENV *dbenv; + DB_LOG *dblp; + LOG *lp; + u_int32_t mbytes, bytes; + int ret; + + dbenv = logc->dbenv; + dblp = dbenv->lg_handle; + + /* + * We don't want to try and allocate huge chunks of memory because + * applications with error-checking malloc's often consider that a + * hard failure. If we're about to look at a corrupted record with + * a bizarre size, we need to know before trying to allocate space + * to hold it. We could read the persistent data at the beginning + * of the file but that's hard -- we may have to decrypt it, checksum + * it and so on. Stat the file instead. + */ + if ((ret = + __os_ioinfo(dbenv, np, logc->c_fh, &mbytes, &bytes, NULL)) != 0) + return (ret); + + logc->bp_maxrec = mbytes * MEGABYTE + bytes; + + /* + * If reading from the log file currently being written, we could get + * an incorrect size, that is, if the cursor was opened on the file + * when it had only a few hundred bytes, and then the cursor used to + * move forward in the file, after more log records were written, the + * original stat value would be wrong. Use the maximum of the current + * log file size and the size of the buffer -- that should represent + * the max of any log record currently in the file. + * + * The log buffer size is set when the environment is opened and never + * changed, we don't need a lock on it. 
+ */ + lp = dblp->reginfo.primary; + logc->bp_maxrec += lp->buffer_size; + + return (0); } diff --git a/bdb/log/log_method.c b/bdb/log/log_method.c index 883f485d891..42adaf11c6c 100644 --- a/bdb/log/log_method.c +++ b/bdb/log/log_method.c @@ -1,38 +1,39 @@ /*- * See the file LICENSE for redistribution information. * - * Copyright (c) 1999, 2000 + * Copyright (c) 1999-2002 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: log_method.c,v 11.14 2000/11/30 00:58:40 ubell Exp $"; +static const char revid[] = "$Id: log_method.c,v 11.32 2002/05/30 22:16:47 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES #include <sys/types.h> +#ifdef HAVE_RPC +#include <rpc/rpc.h> +#endif + #include <stdlib.h> #include <string.h> #include <unistd.h> #endif -#ifdef HAVE_RPC -#include "db_server.h" -#endif - #include "db_int.h" -#include "log.h" +#include "dbinc/log.h" #ifdef HAVE_RPC -#include "gen_client_ext.h" -#include "rpc_client_ext.h" +#include "dbinc_auto/db_server.h" +#include "dbinc_auto/rpc_client_ext.h" #endif -static int __log_set_lg_max __P((DB_ENV *, u_int32_t)); static int __log_set_lg_bsize __P((DB_ENV *, u_int32_t)); static int __log_set_lg_dir __P((DB_ENV *, const char *)); +static int __log_set_lg_max __P((DB_ENV *, u_int32_t)); +static int __log_set_lg_regionmax __P((DB_ENV *, u_int32_t)); /* * __log_dbenv_create -- @@ -44,13 +45,16 @@ void __log_dbenv_create(dbenv) DB_ENV *dbenv; { - dbenv->lg_bsize = LG_BSIZE_DEFAULT; - dbenv->set_lg_bsize = __log_set_lg_bsize; + /* + * !!! + * Our caller has not yet had the opportunity to reset the panic + * state or turn off mutex locking, and so we can neither check + * the panic state or acquire a mutex in the DB_ENV create path. 
+ */ - dbenv->lg_max = LG_MAX_DEFAULT; - dbenv->set_lg_max = __log_set_lg_max; + dbenv->lg_bsize = LG_BSIZE_DEFAULT; + dbenv->lg_regionmax = LG_BASE_REGION_SIZE; - dbenv->set_lg_dir = __log_set_lg_dir; #ifdef HAVE_RPC /* * If we have a client, overwrite what we just setup to @@ -58,10 +62,29 @@ __log_dbenv_create(dbenv) */ if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) { dbenv->set_lg_bsize = __dbcl_set_lg_bsize; - dbenv->set_lg_max = __dbcl_set_lg_max; dbenv->set_lg_dir = __dbcl_set_lg_dir; - } + dbenv->set_lg_max = __dbcl_set_lg_max; + dbenv->set_lg_regionmax = __dbcl_set_lg_regionmax; + dbenv->log_archive = __dbcl_log_archive; + dbenv->log_cursor = __dbcl_log_cursor; + dbenv->log_file = __dbcl_log_file; + dbenv->log_flush = __dbcl_log_flush; + dbenv->log_put = __dbcl_log_put; + dbenv->log_stat = __dbcl_log_stat; + } else #endif + { + dbenv->set_lg_bsize = __log_set_lg_bsize; + dbenv->set_lg_dir = __log_set_lg_dir; + dbenv->set_lg_max = __log_set_lg_max; + dbenv->set_lg_regionmax = __log_set_lg_regionmax; + dbenv->log_archive = __log_archive; + dbenv->log_cursor = __log_cursor; + dbenv->log_file = __log_file; + dbenv->log_flush = __log_flush; + dbenv->log_put = __log_put; + dbenv->log_stat = __log_stat; + } } /* @@ -73,10 +96,16 @@ __log_set_lg_bsize(dbenv, lg_bsize) DB_ENV *dbenv; u_int32_t lg_bsize; { + u_int32_t lg_max; + ENV_ILLEGAL_AFTER_OPEN(dbenv, "set_lg_bsize"); + if (lg_bsize == 0) + lg_bsize = LG_BSIZE_DEFAULT; + /* Let's not be silly. */ - if (lg_bsize > dbenv->lg_max / 4) { + lg_max = dbenv->lg_size == 0 ? 
LG_MAX_DEFAULT : dbenv->lg_size; + if (lg_bsize > lg_max / 4) { __db_err(dbenv, "log buffer size must be <= log file size / 4"); return (EINVAL); } @@ -94,15 +123,53 @@ __log_set_lg_max(dbenv, lg_max) DB_ENV *dbenv; u_int32_t lg_max; { - ENV_ILLEGAL_AFTER_OPEN(dbenv, "set_lg_max"); + LOG *region; + + if (lg_max == 0) + lg_max = LG_MAX_DEFAULT; + + if (F_ISSET(dbenv, DB_ENV_OPEN_CALLED)) { + if (!LOGGING_ON(dbenv)) + return (__db_env_config( + dbenv, "set_lg_max", DB_INIT_LOG)); + region = ((DB_LOG *)dbenv->lg_handle)->reginfo.primary; + + /* Let's not be silly. */ + if (lg_max < region->buffer_size * 4) + goto err; + region->log_nsize = lg_max; + } else { + /* Let's not be silly. */ + if (lg_max < dbenv->lg_bsize * 4) + goto err; + dbenv->lg_size = lg_max; + } + + return (0); + +err: __db_err(dbenv, "log file size must be >= log buffer size * 4"); + return (EINVAL); +} + +/* + * __log_set_lg_regionmax -- + * Set the region size. + */ +static int +__log_set_lg_regionmax(dbenv, lg_regionmax) + DB_ENV *dbenv; + u_int32_t lg_regionmax; +{ + ENV_ILLEGAL_AFTER_OPEN(dbenv, "set_lg_regionmax"); /* Let's not be silly. */ - if (lg_max < dbenv->lg_bsize * 4) { - __db_err(dbenv, "log file size must be >= log buffer size * 4"); + if (lg_regionmax != 0 && lg_regionmax < LG_BASE_REGION_SIZE) { + __db_err(dbenv, + "log file size must be >= %d", LG_BASE_REGION_SIZE); return (EINVAL); } - dbenv->lg_max = lg_max; + dbenv->lg_regionmax = lg_regionmax; return (0); } @@ -116,6 +183,6 @@ __log_set_lg_dir(dbenv, dir) const char *dir; { if (dbenv->db_log_dir != NULL) - __os_freestr(dbenv->db_log_dir); + __os_free(dbenv, dbenv->db_log_dir); return (__os_strdup(dbenv, dir, &dbenv->db_log_dir)); } diff --git a/bdb/log/log_put.c b/bdb/log/log_put.c index c61f53e6c3d..bf6de2b0f7b 100644 --- a/bdb/log/log_put.c +++ b/bdb/log/log_put.c @@ -1,13 +1,13 @@ /*- * See the file LICENSE for redistribution information. 
* - * Copyright (c) 1996, 1997, 1998, 1999, 2000 + * Copyright (c) 1996-2002 * Sleepycat Software. All rights reserved. */ #include "db_config.h" #ifndef lint -static const char revid[] = "$Id: log_put.c,v 11.26 2000/11/30 00:58:40 ubell Exp $"; +static const char revid[] = "$Id: log_put.c,v 11.112 2002/09/10 02:39:26 bostic Exp $"; #endif /* not lint */ #ifndef NO_SYSTEM_INCLUDES @@ -29,109 +29,424 @@ static const char revid[] = "$Id: log_put.c,v 11.26 2000/11/30 00:58:40 ubell Ex #include <unistd.h> #endif -#ifdef HAVE_RPC -#include "db_server.h" -#endif - #include "db_int.h" -#include "db_page.h" -#include "log.h" -#include "hash.h" -#include "clib_ext.h" - -#ifdef HAVE_RPC -#include "gen_client_ext.h" -#include "rpc_client_ext.h" -#endif +#include "dbinc/crypto.h" +#include "dbinc/hmac.h" +#include "dbinc/log.h" +#include "dbinc/rep.h" +#include "dbinc/txn.h" +static int __log_encrypt_record __P((DB_ENV *, DBT *, HDR *, u_int32_t)); static int __log_fill __P((DB_LOG *, DB_LSN *, void *, u_int32_t)); -static int __log_flush __P((DB_LOG *, const DB_LSN *)); +static int __log_flush_commit __P((DB_ENV *, const DB_LSN *, u_int32_t)); +static int __log_flush_int __P((DB_LOG *, const DB_LSN *, int)); static int __log_newfh __P((DB_LOG *)); -static int __log_putr __P((DB_LOG *, DB_LSN *, const DBT *, u_int32_t)); -static int __log_open_files __P((DB_ENV *)); +static int __log_put_next __P((DB_ENV *, + DB_LSN *, const DBT *, HDR *, DB_LSN *)); +static int __log_putr __P((DB_LOG *, + DB_LSN *, const DBT *, u_int32_t, HDR *)); static int __log_write __P((DB_LOG *, void *, u_int32_t)); /* - * log_put -- - * Write a log record. + * __log_put -- + * Write a log record. This is the public interface, DB_ENV->log_put. 
+ * + * PUBLIC: int __log_put __P((DB_ENV *, DB_LSN *, const DBT *, u_int32_t)); */ int -log_put(dbenv, lsn, dbt, flags) +__log_put(dbenv, lsnp, udbt, flags) DB_ENV *dbenv; - DB_LSN *lsn; - const DBT *dbt; + DB_LSN *lsnp; + const DBT *udbt; u_int32_t flags; { + DB_CIPHER *db_cipher; + DBT *dbt, t; DB_LOG *dblp; - int ret; - -#ifdef HAVE_RPC - if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) - return (__dbcl_log_put(dbenv, lsn, dbt, flags)); -#endif + DB_LSN lsn, old_lsn; + HDR hdr; + LOG *lp; + u_int32_t do_flush, op, writeonly; + int lock_held, need_free, ret; + u_int8_t *key; PANIC_CHECK(dbenv); - ENV_REQUIRES_CONFIG(dbenv, dbenv->lg_handle, DB_INIT_LOG); + ENV_REQUIRES_CONFIG(dbenv, + dbenv->lg_handle, "DB_ENV->log_put", DB_INIT_LOG); /* Validate arguments. */ - if (flags != 0 && flags != DB_CHECKPOINT && - flags != DB_CURLSN && flags != DB_FLUSH) - return (__db_ferr(dbenv, "log_put", 0)); + op = DB_OPFLAGS_MASK & flags; + if (op != 0 && op != DB_COMMIT) + return (__db_ferr(dbenv, "DB_ENV->log_put", 0)); + + /* Check for allowed bit-flags. */ + if (LF_ISSET(~(DB_OPFLAGS_MASK | + DB_FLUSH | DB_NOCOPY | DB_PERMANENT | DB_WRNOSYNC))) + return (__db_ferr(dbenv, "DB_ENV->log_put", 0)); + + /* DB_WRNOSYNC and DB_FLUSH are mutually exclusive. */ + if (LF_ISSET(DB_WRNOSYNC) && LF_ISSET(DB_FLUSH)) + return (__db_ferr(dbenv, "DB_ENV->log_put", 1)); + + /* Replication clients should never write log records. */ + if (F_ISSET(dbenv, DB_ENV_REP_CLIENT) || + F_ISSET(dbenv, DB_ENV_REP_LOGSONLY)) { + __db_err(dbenv, + "DB_ENV->log_put is illegal on replication clients"); + return (EINVAL); + } dblp = dbenv->lg_handle; + lp = dblp->reginfo.primary; + db_cipher = dbenv->crypto_handle; + dbt = &t; + t = *udbt; + lock_held = need_free = 0; + do_flush = LF_ISSET(DB_FLUSH); + writeonly = LF_ISSET(DB_WRNOSYNC); + + /* + * If we are coming from the logging code, we use an internal + * flag, DB_NOCOPY, because we know we can overwrite/encrypt + * the log record in place. 
Otherwise, if a user called log_put + * then we must copy it to new memory so that we know we can + * write it. + * + * We also must copy it to new memory if we are a replication + * master so that we retain an unencrypted copy of the log + * record to send to clients. + */ + if (!LF_ISSET(DB_NOCOPY) || F_ISSET(dbenv, DB_ENV_REP_MASTER)) { + if (CRYPTO_ON(dbenv)) + t.size += db_cipher->adj_size(udbt->size); + if ((ret = __os_calloc(dbenv, 1, t.size, &t.data)) != 0) + goto err; + need_free = 1; + memcpy(t.data, udbt->data, udbt->size); + } + if ((ret = __log_encrypt_record(dbenv, dbt, &hdr, udbt->size)) != 0) + goto err; + if (CRYPTO_ON(dbenv)) + key = db_cipher->mac_key; + else + key = NULL; + /* Otherwise, we actually have a record to put. Put it. */ + + /* Before we grab the region lock, calculate the record's checksum. */ + __db_chksum(dbt->data, dbt->size, key, hdr.chksum); + R_LOCK(dbenv, &dblp->reginfo); - ret = __log_put(dbenv, lsn, dbt, flags); - R_UNLOCK(dbenv, &dblp->reginfo); + lock_held = 1; + + ZERO_LSN(old_lsn); + if ((ret = __log_put_next(dbenv, &lsn, dbt, &hdr, &old_lsn)) != 0) + goto err; + + if (F_ISSET(dbenv, DB_ENV_REP_MASTER)) { + /* + * Replication masters need to drop the lock to send + * messages, but we want to drop and reacquire it a minimal + * number of times. + */ + R_UNLOCK(dbenv, &dblp->reginfo); + lock_held = 0; + + /* + * If we changed files and we're in a replicated + * environment, we need to inform our clients now that + * we've dropped the region lock. + * + * Note that a failed NEWFILE send is a dropped message + * that our client can handle, so we can ignore it. It's + * possible that the record we already put is a commit, so + * we don't just want to return failure. + */ + if (!IS_ZERO_LSN(old_lsn)) + (void)__rep_send_message(dbenv, + DB_EID_BROADCAST, REP_NEWFILE, &old_lsn, NULL, 0); + + /* + * Then send the log record itself on to our clients. 
+ * + * If the send fails and we're a commit or checkpoint, + * there's nothing we can do; the record's in the log. + * Flush it, even if we're running with TXN_NOSYNC, on the + * grounds that it should be in durable form somewhere. + */ + /* + * !!! + * In the crypto case, we MUST send the udbt, not the + * now-encrypted dbt. Clients have no way to decrypt + * without the header. + */ + if ((__rep_send_message(dbenv, + DB_EID_BROADCAST, REP_LOG, &lsn, udbt, flags) != 0) && + LF_ISSET(DB_PERMANENT)) + do_flush |= DB_FLUSH; + } + + /* + * If needed, do a flush. Note that failures at this point + * are only permissible if we know we haven't written a commit + * record; __log_flush_commit is responsible for enforcing this. + * + * If a flush is not needed, see if WRITE_NOSYNC was set and we + * need to write out the log buffer. + */ + if (do_flush || writeonly) { + if (!lock_held) { + R_LOCK(dbenv, &dblp->reginfo); + lock_held = 1; + } + if (do_flush) + ret = __log_flush_commit(dbenv, &lsn, flags); + else if (lp->b_off != 0) + /* + * writeonly: if there's anything in the current + * log buffer, we need to write it out. + */ + if ((ret = __log_write(dblp, + dblp->bufp, (u_int32_t)lp->b_off)) == 0) + lp->b_off = 0; + } + +err: if (lock_held) + R_UNLOCK(dbenv, &dblp->reginfo); + if (need_free) + __os_free(dbenv, dbt->data); + + if (ret == 0) + *lsnp = lsn; + return (ret); } /* - * __log_put -- - * Write a log record; internal version. 
+ * __log_txn_lsn -- * - * PUBLIC: int __log_put __P((DB_ENV *, DB_LSN *, const DBT *, u_int32_t)); + * PUBLIC: void __log_txn_lsn + * PUBLIC: __P((DB_ENV *, DB_LSN *, u_int32_t *, u_int32_t *)); */ -int -__log_put(dbenv, lsn, dbt, flags) +void +__log_txn_lsn(dbenv, lsnp, mbytesp, bytesp) + DB_ENV *dbenv; + DB_LSN *lsnp; + u_int32_t *mbytesp, *bytesp; +{ + DB_LOG *dblp; + LOG *lp; + + dblp = dbenv->lg_handle; + lp = dblp->reginfo.primary; + + R_LOCK(dbenv, &dblp->reginfo); + + /* + * We are trying to get the LSN of the last entry in the log. We use + * this in two places: 1) DB_ENV->txn_checkpoint uses it as a first + * value when trying to compute an LSN such that all transactions begun + * before it are complete. 2) DB_ENV->txn_begin uses it as the + * begin_lsn. + * + * Typically, it's easy to get the last written LSN, you simply look + * at the current log pointer and back up the number of bytes of the + * last log record. However, if the last thing we did was write the + * log header of a new log file, then, this doesn't work, so we return + * the first log record that will be written in this new file. + */ + *lsnp = lp->lsn; + if (lp->lsn.offset > lp->len) + lsnp->offset -= lp->len; + + /* + * Since we're holding the log region lock, return the bytes put into + * the log since the last checkpoint, transaction checkpoint needs it. + * + * We add the current buffer offset so as to count bytes that have not + * yet been written, but are sitting in the log buffer. + */ + if (mbytesp != NULL) { + *mbytesp = lp->stat.st_wc_mbytes; + *bytesp = (u_int32_t)(lp->stat.st_wc_bytes + lp->b_off); + } + + R_UNLOCK(dbenv, &dblp->reginfo); +} + +/* + * __log_put_next -- + * Put the given record as the next in the log, wherever that may + * turn out to be. 
+ */ +static int +__log_put_next(dbenv, lsn, dbt, hdr, old_lsnp) DB_ENV *dbenv; DB_LSN *lsn; const DBT *dbt; - u_int32_t flags; + HDR *hdr; + DB_LSN *old_lsnp; { - DBT t; DB_LOG *dblp; + DB_LSN old_lsn; LOG *lp; - u_int32_t lastoff; - int ret; + int newfile, ret; dblp = dbenv->lg_handle; lp = dblp->reginfo.primary; /* - * If the application just wants to know where we are, fill in - * the information. Currently used by the transaction manager - * to avoid writing TXN_begin records. + * Save a copy of lp->lsn before we might decide to switch log + * files and change it. If we do switch log files, and we're + * doing replication, we'll need to tell our clients about the + * switch, and they need to receive a NEWFILE message + * with this "would-be" LSN in order to know they're not + * missing any log records. */ - if (flags == DB_CURLSN) { - lsn->file = lp->lsn.file; - lsn->offset = lp->lsn.offset; - return (0); - } + old_lsn = lp->lsn; + newfile = 0; - /* If this information won't fit in the file, swap files. */ - if (lp->lsn.offset + sizeof(HDR) + dbt->size > lp->persist.lg_max) { - if (sizeof(HDR) + - sizeof(LOGP) + dbt->size > lp->persist.lg_max) { + /* + * If this information won't fit in the file, or if we're a + * replication client environment and have been told to do so, + * swap files. + */ + if (lp->lsn.offset == 0 || + lp->lsn.offset + hdr->size + dbt->size > lp->log_size) { + if (hdr->size + sizeof(LOGP) + dbt->size > lp->log_size) { __db_err(dbenv, - "log_put: record larger than maximum file size"); + "DB_ENV->log_put: record larger than maximum file size"); return (EINVAL); } - /* Flush the log. */ - if ((ret = __log_flush(dblp, NULL)) != 0) + if ((ret = __log_newfile(dblp, NULL)) != 0) return (ret); /* + * Flag that we switched files, in case we're a master + * and need to send this information to our clients. + * We postpone doing the actual send until we can + * safely release the log region lock and are doing so + * anyway. 
+ */ + newfile = 1; + + if (dbenv->db_noticecall != NULL) + dbenv->db_noticecall(dbenv, DB_NOTICE_LOGFILE_CHANGED); + } + + /* + * The offset into the log file at this point is the LSN where + * we're about to put this record, and is the LSN the caller wants. + */ + *lsn = lp->lsn; + + /* If we switched log files, let our caller know where. */ + if (newfile) + *old_lsnp = old_lsn; + + /* Actually put the record. */ + return (__log_putr(dblp, lsn, dbt, lp->lsn.offset - lp->len, hdr)); +} + +/* + * __log_flush_commit -- + * Flush a record for which the DB_FLUSH flag to log_put has been set. + */ +static int +__log_flush_commit(dbenv, lsnp, flags) + DB_ENV *dbenv; + const DB_LSN *lsnp; + u_int32_t flags; +{ + DB_LOG *dblp; + DB_LSN flush_lsn; + LOG *lp; + int ret; + u_int32_t op; + + dblp = dbenv->lg_handle; + lp = dblp->reginfo.primary; + flush_lsn = *lsnp; + op = DB_OPFLAGS_MASK & flags; + + if ((ret = __log_flush_int(dblp, &flush_lsn, 1)) == 0) + return (0); + + /* + * If a flush supporting a transaction commit fails, we must abort the + * transaction. (If we aren't doing a commit, return the failure; if + * the commit we care about made it to disk successfully, we just + * ignore the failure, because there's no way to undo the commit.) + */ + if (op != DB_COMMIT) + return (ret); + + if (flush_lsn.file != lp->lsn.file || flush_lsn.offset < lp->w_off) + return (0); + + /* + * Else, make sure that the commit record does not get out after we + * abort the transaction. Do this by overwriting the commit record + * in the buffer. (Note that other commits in this buffer will wait + * until a successful write happens, we do not wake them.) We + * point at the right part of the buffer and write an abort record + * over the commit. We must then try and flush the buffer again, + * since the interesting part of the buffer may have actually made + * it out to disk before there was a failure, we can't know for sure. 
+ */ + if (__txn_force_abort(dbenv, + dblp->bufp + flush_lsn.offset - lp->w_off) == 0) + (void)__log_flush_int(dblp, &flush_lsn, 0); + + return (ret); +} + +/* + * __log_newfile -- + * Initialize and switch to a new log file. (Note that this is + * called both when no log yet exists and when we fill a log file.) + * + * PUBLIC: int __log_newfile __P((DB_LOG *, DB_LSN *)); + */ +int +__log_newfile(dblp, lsnp) + DB_LOG *dblp; + DB_LSN *lsnp; +{ + DB_CIPHER *db_cipher; + DB_ENV *dbenv; + DB_LSN lsn; + DBT t; + HDR hdr; + LOG *lp; + int need_free, ret; + u_int32_t lastoff; + size_t tsize; + u_int8_t *tmp; + + dbenv = dblp->dbenv; + lp = dblp->reginfo.primary; + + /* If we're not at the beginning of a file already, start a new one. */ + if (lp->lsn.offset != 0) { + /* + * Flush the log so this file is out and can be closed. We + * cannot release the region lock here because we need to + * protect the end of the file while we switch. In + * particular, a thread with a smaller record than ours + * could detect that there is space in the log. Even + * blocking that event by declaring the file full would + * require all threads to wait here so that the lsn.file + * can be moved ahead after the flush completes. This + * probably can be changed if we had an lsn for the + * previous file and one for the current, but it does not + * seem like this would get much more throughput, if any. + */ + if ((ret = __log_flush_int(dblp, NULL, 0)) != 0) + return (ret); + + DB_ASSERT(lp->b_off == 0); + /* * Save the last known offset from the previous file, we'll * need it to initialize the persistent header information. */ @@ -143,78 +458,50 @@ __log_put(dbenv, lsn, dbt, flags) /* Reset the file write offset. */ lp->w_off = 0; - - if (dbenv->db_noticecall != NULL) - dbenv->db_noticecall(dbenv, DB_NOTICE_LOGFILE_CHANGED); } else lastoff = 0; - /* Initialize the LSN information returned to the user. 
*/ - lsn->file = lp->lsn.file; - lsn->offset = lp->lsn.offset; - /* * Insert persistent information as the first record in every file. * Note that the previous length is wrong for the very first record * of the log, but that's okay, we check for it during retrieval. */ - if (lp->lsn.offset == 0) { - t.data = &lp->persist; - t.size = sizeof(LOGP); - if ((ret = __log_putr(dblp, lsn, - &t, lastoff == 0 ? 0 : lastoff - lp->len)) != 0) - return (ret); + DB_ASSERT(lp->b_off == 0); - /* - * Record files open in this log. - * If we are recovering then we are in the - * process of outputting the files, don't do - * it again. - */ - if (!F_ISSET(dblp, DBLOG_RECOVER) && - (ret = __log_open_files(dbenv)) != 0) - return (ret); - - /* Update the LSN information returned to the user. */ - lsn->file = lp->lsn.file; - lsn->offset = lp->lsn.offset; - } + memset(&t, 0, sizeof(t)); + memset(&hdr, 0, sizeof(HDR)); - /* Write the application's log record. */ - if ((ret = __log_putr(dblp, lsn, dbt, lp->lsn.offset - lp->len)) != 0) + need_free = 0; + tsize = sizeof(LOGP); + db_cipher = dbenv->crypto_handle; + if (CRYPTO_ON(dbenv)) + tsize += db_cipher->adj_size(tsize); + if ((ret = __os_calloc(dbenv, 1, tsize, &tmp)) != 0) return (ret); + lp->persist.log_size = lp->log_size = lp->log_nsize; + memcpy(tmp, &lp->persist, sizeof(LOGP)); + t.data = tmp; + t.size = (u_int32_t)tsize; + need_free = 1; - /* - * On a checkpoint, we: - * Put out the checkpoint record (above). - * Save the LSN of the checkpoint in the shared region. - * Append the set of file name information into the log. - */ - if (flags == DB_CHECKPOINT) { - lp->chkpt_lsn = *lsn; - if ((ret = __log_open_files(dbenv)) != 0) - return (ret); - } + if ((ret = + __log_encrypt_record(dbenv, &t, &hdr, (u_int32_t)tsize)) != 0) + goto err; + __db_chksum(t.data, t.size, + (CRYPTO_ON(dbenv)) ? db_cipher->mac_key : NULL, hdr.chksum); + lsn = lp->lsn; + if ((ret = __log_putr(dblp, &lsn, + &t, lastoff == 0 ? 
0 : lastoff - lp->len, &hdr)) != 0) + goto err; - /* - * On a checkpoint or when flush is requested, we: - * Flush the current buffer contents to disk. - * Sync the log to disk. - */ - if (flags == DB_FLUSH || flags == DB_CHECKPOINT) - if ((ret = __log_flush(dblp, NULL)) != 0) - return (ret); + /* Update the LSN information returned to the caller. */ + if (lsnp != NULL) + *lsnp = lp->lsn; - /* - * On a checkpoint, we: - * Save the time the checkpoint was written. - * Reset the bytes written since the last checkpoint. - */ - if (flags == DB_CHECKPOINT) { - (void)time(&lp->chkpt); - lp->stat.st_wc_bytes = lp->stat.st_wc_mbytes = 0; - } - return (0); +err: + if (need_free) + __os_free(dbenv, tmp); + return (ret); } /* @@ -222,100 +509,253 @@ __log_put(dbenv, lsn, dbt, flags) * Actually put a record into the log. */ static int -__log_putr(dblp, lsn, dbt, prev) +__log_putr(dblp, lsn, dbt, prev, h) DB_LOG *dblp; DB_LSN *lsn; const DBT *dbt; u_int32_t prev; + HDR *h; { - HDR hdr; + DB_CIPHER *db_cipher; + DB_ENV *dbenv; + DB_LSN f_lsn; LOG *lp; - int ret; + HDR tmp, *hdr; + int ret, t_ret; + size_t b_off, nr; + u_int32_t w_off; + dbenv = dblp->dbenv; lp = dblp->reginfo.primary; /* + * If we weren't given a header, use a local one. + */ + db_cipher = dbenv->crypto_handle; + if (h == NULL) { + hdr = &tmp; + memset(hdr, 0, sizeof(HDR)); + if (CRYPTO_ON(dbenv)) + hdr->size = HDR_CRYPTO_SZ; + else + hdr->size = HDR_NORMAL_SZ; + } else + hdr = h; + + /* Save our position in case we fail. */ + b_off = lp->b_off; + w_off = lp->w_off; + f_lsn = lp->f_lsn; + + /* * Initialize the header. If we just switched files, lsn.offset will * be 0, and what we really want is the offset of the previous record * in the previous file. Fortunately, prev holds the value we want. 
*/ - hdr.prev = prev; - hdr.len = sizeof(HDR) + dbt->size; - hdr.cksum = __ham_func4(NULL, dbt->data, dbt->size); + hdr->prev = prev; + hdr->len = (u_int32_t)hdr->size + dbt->size; - if ((ret = __log_fill(dblp, lsn, &hdr, sizeof(HDR))) != 0) - return (ret); - lp->len = sizeof(HDR); - lp->lsn.offset += sizeof(HDR); + /* + * If we were passed in a nonzero checksum, our caller calculated + * the checksum before acquiring the log mutex, as an optimization. + * + * If our caller calculated a real checksum of 0, we'll needlessly + * recalculate it. C'est la vie; there's no out-of-bounds value + * here. + */ + if (hdr->chksum[0] == 0) + __db_chksum(dbt->data, dbt->size, + (CRYPTO_ON(dbenv)) ? db_cipher->mac_key : NULL, + hdr->chksum); + + if ((ret = __log_fill(dblp, lsn, hdr, (u_int32_t)hdr->size)) != 0) + goto err; if ((ret = __log_fill(dblp, lsn, dbt->data, dbt->size)) != 0) - return (ret); - lp->len += dbt->size; - lp->lsn.offset += dbt->size; + goto err; + + lp->len = (u_int32_t)(hdr->size + dbt->size); + lp->lsn.offset += (u_int32_t)(hdr->size + dbt->size); return (0); +err: + /* + * If we wrote more than one buffer before failing, get the + * first one back. The extra buffers will fail the checksums + * and be ignored. + */ + if (w_off + lp->buffer_size < lp->w_off) { + if ((t_ret = + __os_seek(dbenv, + &dblp->lfh, 0, 0, w_off, 0, DB_OS_SEEK_SET)) != 0 || + (t_ret = __os_read(dbenv, &dblp->lfh, dblp->bufp, + b_off, &nr)) != 0) + return (__db_panic(dbenv, t_ret)); + if (nr != b_off) { + __db_err(dbenv, "Short read while restoring log"); + return (__db_panic(dbenv, EIO)); + } + } + + /* Reset to where we started. */ + lp->w_off = w_off; + lp->b_off = b_off; + lp->f_lsn = f_lsn; + + return (ret); } /* - * log_flush -- + * __log_flush -- * Write all records less than or equal to the specified LSN. 
+ * + * PUBLIC: int __log_flush __P((DB_ENV *, const DB_LSN *)); */ int -log_flush(dbenv, lsn) +__log_flush(dbenv, lsn) DB_ENV *dbenv; const DB_LSN *lsn; { DB_LOG *dblp; int ret; -#ifdef HAVE_RPC - if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) - return (__dbcl_log_flush(dbenv, lsn)); -#endif - PANIC_CHECK(dbenv); - ENV_REQUIRES_CONFIG(dbenv, dbenv->lg_handle, DB_INIT_LOG); + ENV_REQUIRES_CONFIG(dbenv, + dbenv->lg_handle, "DB_ENV->log_flush", DB_INIT_LOG); dblp = dbenv->lg_handle; R_LOCK(dbenv, &dblp->reginfo); - ret = __log_flush(dblp, lsn); + ret = __log_flush_int(dblp, lsn, 1); R_UNLOCK(dbenv, &dblp->reginfo); return (ret); } /* - * __log_flush -- + * __log_flush_int -- * Write all records less than or equal to the specified LSN; internal * version. */ static int -__log_flush(dblp, lsn) +__log_flush_int(dblp, lsnp, release) DB_LOG *dblp; - const DB_LSN *lsn; + const DB_LSN *lsnp; + int release; { - DB_LSN t_lsn; + DB_ENV *dbenv; + DB_LSN flush_lsn, f_lsn; + DB_MUTEX *flush_mutexp; LOG *lp; - int current, ret; + int current, do_flush, first, ret; + size_t b_off; + struct __db_commit *commit; + u_int32_t ncommit, w_off; ret = 0; + ncommit = 0; + dbenv = dblp->dbenv; lp = dblp->reginfo.primary; + flush_mutexp = R_ADDR(&dblp->reginfo, lp->flush_mutex_off); /* * If no LSN specified, flush the entire log by setting the flush LSN * to the last LSN written in the log. Otherwise, check that the LSN * isn't a non-existent record for the log. 
*/ - if (lsn == NULL) { - t_lsn.file = lp->lsn.file; - t_lsn.offset = lp->lsn.offset - lp->len; - lsn = &t_lsn; - } else - if (lsn->file > lp->lsn.file || - (lsn->file == lp->lsn.file && - lsn->offset > lp->lsn.offset - lp->len)) { - __db_err(dblp->dbenv, - "log_flush: LSN past current end-of-log"); - return (EINVAL); - } + if (lsnp == NULL) { + flush_lsn.file = lp->lsn.file; + flush_lsn.offset = lp->lsn.offset - lp->len; + } else if (lsnp->file > lp->lsn.file || + (lsnp->file == lp->lsn.file && + lsnp->offset > lp->lsn.offset - lp->len)) { + __db_err(dbenv, + "DB_ENV->log_flush: LSN past current end-of-log"); + return (EINVAL); + } else { + /* + * See if we need to wait. s_lsn is not locked so some + * care is needed. The sync point can only move forward. + * If the file we want is in the past we are done. + * If the file numbers are the same check the offset. + * If this fails check the file numbers again since the + * offset might have changed while we were looking. + * This all assumes we can read an integer in one + * state or the other, not in transition. + */ + if (lp->s_lsn.file > lsnp->file) + return (0); + + if (lp->s_lsn.file == lsnp->file && + lp->s_lsn.offset > lsnp->offset) + return (0); + + if (lp->s_lsn.file > lsnp->file) + return (0); + + flush_lsn = *lsnp; + } + + /* + * If a flush is in progress and we're allowed to do so, drop + * the region lock and block waiting for the next flush. 
+ */ + if (release && lp->in_flush != 0) { + if ((commit = SH_TAILQ_FIRST( + &lp->free_commits, __db_commit)) == NULL) { + if ((ret = + __db_shalloc(dblp->reginfo.addr, + sizeof(struct __db_commit), + MUTEX_ALIGN, &commit)) != 0) + goto flush; + memset(commit, 0, sizeof(*commit)); + if ((ret = __db_mutex_setup(dbenv, &dblp->reginfo, + &commit->mutex, MUTEX_SELF_BLOCK | + MUTEX_NO_RLOCK)) != 0) { + __db_shalloc_free(dblp->reginfo.addr, commit); + return (ret); + } + MUTEX_LOCK(dbenv, &commit->mutex); + } else + SH_TAILQ_REMOVE( + &lp->free_commits, commit, links, __db_commit); + + lp->ncommit++; + + /* + * Flushes may be requested out of LSN order; be + * sure we only move lp->t_lsn forward. + */ + if (log_compare(&lp->t_lsn, &flush_lsn) < 0) + lp->t_lsn = flush_lsn; + + commit->lsn = flush_lsn; + SH_TAILQ_INSERT_HEAD( + &lp->commits, commit, links, __db_commit); + R_UNLOCK(dbenv, &dblp->reginfo); + /* Wait here for the in-progress flush to finish. */ + MUTEX_LOCK(dbenv, &commit->mutex); + R_LOCK(dbenv, &dblp->reginfo); + + lp->ncommit--; + /* + * Grab the flag before freeing the struct to see if + * we need to flush the log to commit. If so, + * use the maximal lsn for any committing thread. + */ + do_flush = F_ISSET(commit, DB_COMMIT_FLUSH); + F_CLR(commit, DB_COMMIT_FLUSH); + SH_TAILQ_INSERT_HEAD( + &lp->free_commits, commit, links, __db_commit); + if (do_flush) { + lp->in_flush--; + flush_lsn = lp->t_lsn; + } else + return (0); + } + + /* + * Protect flushing with its own mutex so we can release + * the region lock except during file switches. + */ +flush: MUTEX_LOCK(dbenv, flush_mutexp); /* * If the LSN is less than or equal to the last-sync'd LSN, we're done. @@ -323,9 +763,12 @@ __log_flush(dblp, lsn) * after the byte we absolutely know was written to disk, so the test * is <, not <=. 
*/ - if (lsn->file < lp->s_lsn.file || - (lsn->file == lp->s_lsn.file && lsn->offset < lp->s_lsn.offset)) - return (0); + if (flush_lsn.file < lp->s_lsn.file || + (flush_lsn.file == lp->s_lsn.file && + flush_lsn.offset < lp->s_lsn.offset)) { + MUTEX_UNLOCK(dbenv, flush_mutexp); + goto done; + } /* * We may need to write the current buffer. We have to write the @@ -333,9 +776,12 @@ __log_flush(dblp, lsn) * buffer's starting LSN. */ current = 0; - if (lp->b_off != 0 && log_compare(lsn, &lp->f_lsn) >= 0) { - if ((ret = __log_write(dblp, dblp->bufp, lp->b_off)) != 0) - return (ret); + if (lp->b_off != 0 && log_compare(&flush_lsn, &lp->f_lsn) >= 0) { + if ((ret = __log_write(dblp, + dblp->bufp, (u_int32_t)lp->b_off)) != 0) { + MUTEX_UNLOCK(dbenv, flush_mutexp); + goto done; + } lp->b_off = 0; current = 1; @@ -348,23 +794,90 @@ __log_flush(dblp, lsn) * buffer, don't bother. We have nothing to write and nothing to * sync. */ - if (dblp->lfname != lp->lsn.file) { - if (!current) - return (0); - if ((ret = __log_newfh(dblp)) != 0) - return (ret); - } + if (!F_ISSET(&dblp->lfh, DB_FH_VALID) || dblp->lfname != lp->lsn.file) + if (!current || (ret = __log_newfh(dblp)) != 0) { + MUTEX_UNLOCK(dbenv, flush_mutexp); + goto done; + } + + /* + * We are going to flush, release the region. + * First get the current state of the buffer since + * another write may come in, but we may not flush it. + */ + b_off = lp->b_off; + w_off = lp->w_off; + f_lsn = lp->f_lsn; + lp->in_flush++; + if (release) + R_UNLOCK(dbenv, &dblp->reginfo); /* Sync all writes to disk. */ - if ((ret = __os_fsync(dblp->dbenv, &dblp->lfh)) != 0) - return (__db_panic(dblp->dbenv, ret)); + if ((ret = __os_fsync(dbenv, &dblp->lfh)) != 0) { + MUTEX_UNLOCK(dbenv, flush_mutexp); + if (release) + R_LOCK(dbenv, &dblp->reginfo); + ret = __db_panic(dbenv, ret); + return (ret); + } + + /* + * Set the last-synced LSN. + * This value must be set to the LSN past the last complete + * record that has been flushed. 
This is at least the first + * lsn, f_lsn. If the buffer is empty, b_off == 0, then + * we can move up to write point since the first lsn is not + * set for the new buffer. + */ + lp->s_lsn = f_lsn; + if (b_off == 0) + lp->s_lsn.offset = w_off; + + MUTEX_UNLOCK(dbenv, flush_mutexp); + if (release) + R_LOCK(dbenv, &dblp->reginfo); + + lp->in_flush--; ++lp->stat.st_scount; - /* Set the last-synced LSN, using the on-disk write offset. */ - lp->s_lsn.file = lp->f_lsn.file; - lp->s_lsn.offset = lp->w_off; + /* + * How many flush calls (usually commits) did this call actually sync? + * At least one, if it got here. + */ + ncommit = 1; +done: + if (lp->ncommit != 0) { + first = 1; + for (commit = SH_TAILQ_FIRST(&lp->commits, __db_commit); + commit != NULL; + commit = SH_TAILQ_NEXT(commit, links, __db_commit)) + if (log_compare(&lp->s_lsn, &commit->lsn) > 0) { + MUTEX_UNLOCK(dbenv, &commit->mutex); + SH_TAILQ_REMOVE( + &lp->commits, commit, links, __db_commit); + ncommit++; + } else if (first == 1) { + F_SET(commit, DB_COMMIT_FLUSH); + MUTEX_UNLOCK(dbenv, &commit->mutex); + SH_TAILQ_REMOVE( + &lp->commits, commit, links, __db_commit); + /* + * This thread will wake and flush. + * If another thread commits and flushes + * first we will waste a trip through the + * mutex. + */ + lp->in_flush++; + first = 0; + } + } + if (lp->stat.st_maxcommitperflush < ncommit) + lp->stat.st_maxcommitperflush = ncommit; + if (lp->stat.st_mincommitperflush > ncommit || + lp->stat.st_mincommitperflush == 0) + lp->stat.st_mincommitperflush = ncommit; - return (0); + return (ret); } /* @@ -415,7 +928,7 @@ __log_fill(dblp, lsn, addr, len) nw = remain > len ? len : remain; memcpy(dblp->bufp + lp->b_off, addr, nw); addr = (u_int8_t *)addr + nw; - len -= nw; + len -= (u_int32_t)nw; lp->b_off += nw; /* If we fill the buffer, flush it. 
*/ @@ -439,15 +952,18 @@ __log_write(dblp, addr, len) void *addr; u_int32_t len; { + DB_ENV *dbenv; LOG *lp; size_t nw; int ret; + dbenv = dblp->dbenv; + lp = dblp->reginfo.primary; + /* * If we haven't opened the log file yet or the current one * has changed, acquire a new log file. */ - lp = dblp->reginfo.primary; if (!F_ISSET(&dblp->lfh, DB_FH_VALID) || dblp->lfname != lp->lsn.file) if ((ret = __log_newfh(dblp)) != 0) return (ret); @@ -457,14 +973,10 @@ __log_write(dblp, addr, len) * since we last did). */ if ((ret = - __os_seek(dblp->dbenv, + __os_seek(dbenv, &dblp->lfh, 0, 0, lp->w_off, 0, DB_OS_SEEK_SET)) != 0 || - (ret = __os_write(dblp->dbenv, &dblp->lfh, addr, len, &nw)) != 0) - return (__db_panic(dblp->dbenv, ret)); - if (nw != len) { - __db_err(dblp->dbenv, "Short write while writing log"); - return (EIO); - } + (ret = __os_write(dbenv, &dblp->lfh, addr, len, &nw)) != 0) + return (ret); /* Reset the buffer offset and update the seek offset. */ lp->w_off += len; @@ -484,11 +996,13 @@ __log_write(dblp, addr, len) } /* - * log_file -- + * __log_file -- * Map a DB_LSN to a file name. + * + * PUBLIC: int __log_file __P((DB_ENV *, const DB_LSN *, char *, size_t)); */ int -log_file(dbenv, lsn, namep, len) +__log_file(dbenv, lsn, namep, len) DB_ENV *dbenv; const DB_LSN *lsn; char *namep; @@ -498,13 +1012,9 @@ log_file(dbenv, lsn, namep, len) int ret; char *name; -#ifdef HAVE_RPC - if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) - return (__dbcl_log_file(dbenv, lsn, namep, len)); -#endif - PANIC_CHECK(dbenv); - ENV_REQUIRES_CONFIG(dbenv, dbenv->lg_handle, DB_INIT_LOG); + ENV_REQUIRES_CONFIG(dbenv, + dbenv->lg_handle, "DB_ENV->log_file", DB_INIT_LOG); dblp = dbenv->lg_handle; R_LOCK(dbenv, &dblp->reginfo); @@ -516,11 +1026,11 @@ log_file(dbenv, lsn, namep, len) /* Check to make sure there's enough room and copy the name. 
*/ if (len < strlen(name) + 1) { *namep = '\0'; - __db_err(dbenv, "log_file: name buffer is too short"); + __db_err(dbenv, "DB_ENV->log_file: name buffer is too short"); return (EINVAL); } (void)strcpy(namep, name); - __os_freestr(name); + __os_free(dbenv, name); return (0); } @@ -533,19 +1043,21 @@ static int __log_newfh(dblp) DB_LOG *dblp; { + DB_ENV *dbenv; LOG *lp; int ret; char *name; + dbenv = dblp->dbenv; + lp = dblp->reginfo.primary; + /* Close any previous file descriptor. */ if (F_ISSET(&dblp->lfh, DB_FH_VALID)) - (void)__os_closehandle(&dblp->lfh); - - /* Get the path of the new file and open it. */ - lp = dblp->reginfo.primary; - dblp->lfname = lp->lsn.file; + (void)__os_closehandle(dbenv, &dblp->lfh); /* + * Get the path of the new file and open it. + * * Adding DB_OSO_LOG to the flags may add additional platform-specific * optimizations. On WinNT, the logfile is preallocated, which may * have a time penalty at startup, but have better overall throughput. @@ -557,14 +1069,16 @@ __log_newfh(dblp) * maximum size down into the Windows __os_open routine, because it * wants to pre-allocate it. */ - dblp->lfh.log_size = dblp->dbenv->lg_max; + dblp->lfname = lp->lsn.file; + dblp->lfh.log_size = lp->log_size; if ((ret = __log_name(dblp, dblp->lfname, &name, &dblp->lfh, - DB_OSO_CREATE |/* DB_OSO_LOG |*/ DB_OSO_SEQ)) != 0) - __db_err(dblp->dbenv, - "log_put: %s: %s", name, db_strerror(ret)); + DB_OSO_CREATE |/* DB_OSO_LOG |*/ DB_OSO_SEQ | + (F_ISSET(dbenv, DB_ENV_DIRECT_LOG) ? 
DB_OSO_DIRECT : 0))) != 0) + __db_err(dbenv, + "DB_ENV->log_put: %s: %s", name, db_strerror(ret)); - __os_freestr(name); + __os_free(dbenv, name); return (ret); } @@ -582,11 +1096,13 @@ __log_name(dblp, filenumber, namep, fhp, flags) char **namep; DB_FH *fhp; { + DB_ENV *dbenv; LOG *lp; int ret; char *oname; char old[sizeof(LFPREFIX) + 5 + 20], new[sizeof(LFPREFIX) + 10 + 20]; + dbenv = dblp->dbenv; lp = dblp->reginfo.primary; /* @@ -608,13 +1124,12 @@ __log_name(dblp, filenumber, namep, fhp, flags) * file, return regardless. */ (void)snprintf(new, sizeof(new), LFNAME, filenumber); - if ((ret = __db_appname(dblp->dbenv, - DB_APP_LOG, NULL, new, 0, NULL, namep)) != 0 || fhp == NULL) + if ((ret = __db_appname(dbenv, + DB_APP_LOG, new, 0, NULL, namep)) != 0 || fhp == NULL) return (ret); /* Open the new-style file -- if we succeed, we're done. */ - if ((ret = __os_open(dblp->dbenv, - *namep, flags, lp->persist.mode, fhp)) == 0) + if ((ret = __os_open(dbenv, *namep, flags, lp->persist.mode, fhp)) == 0) return (0); /* @@ -622,15 +1137,14 @@ __log_name(dblp, filenumber, namep, fhp, flags) * the caller isn't interested in old-style files. */ if (!LF_ISSET(DB_OSO_RDONLY)) { - __db_err(dblp->dbenv, + __db_err(dbenv, "%s: log file open failed: %s", *namep, db_strerror(ret)); - return (__db_panic(dblp->dbenv, ret)); + return (__db_panic(dbenv, ret)); } /* Create an old-style file name. */ (void)snprintf(old, sizeof(old), LFNAME_V1, filenumber); - if ((ret = __db_appname(dblp->dbenv, - DB_APP_LOG, NULL, old, 0, NULL, &oname)) != 0) + if ((ret = __db_appname(dbenv, DB_APP_LOG, old, 0, NULL, &oname)) != 0) goto err; /* @@ -638,9 +1152,9 @@ __log_name(dblp, filenumber, namep, fhp, flags) * space allocated for the new-style name and return the old-style * name to the caller. 
*/ - if ((ret = __os_open(dblp->dbenv, + if ((ret = __os_open(dbenv, oname, flags, lp->persist.mode, fhp)) == 0) { - __os_freestr(*namep); + __os_free(dbenv, *namep); *namep = oname; return (0); } @@ -653,52 +1167,82 @@ __log_name(dblp, filenumber, namep, fhp, flags) * old-style name, but we expected it to exist and we weren't just * looking for any log file. That's not a likely error. */ -err: __os_freestr(oname); +err: __os_free(dbenv, oname); return (ret); } -static int -__log_open_files(dbenv) +/* + * __log_rep_put -- + * Short-circuit way for replication clients to put records into the + * log. Replication clients' logs need to be laid out exactly their masters' + * are, so we let replication take responsibility for when the log gets + * flushed, when log switches files, etc. This is just a thin PUBLIC wrapper + * for __log_putr with a slightly prettier interface. + * + * Note that the log region mutex should be held when this is called. + * + * PUBLIC: int __log_rep_put __P((DB_ENV *, DB_LSN *, const DBT *)); + */ +int +__log_rep_put(dbenv, lsnp, rec) DB_ENV *dbenv; + DB_LSN *lsnp; + const DBT *rec; { + DB_CIPHER *db_cipher; DB_LOG *dblp; - DB_LSN r_unused; - DBT fid_dbt, t; - FNAME *fnp; + HDR hdr; + DBT *dbt, t; LOG *lp; - int ret; + int need_free, ret; dblp = dbenv->lg_handle; lp = dblp->reginfo.primary; - for (fnp = SH_TAILQ_FIRST(&lp->fq, __fname); - fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname)) { - if (fnp->ref == 0) /* Entry not in use. */ - continue; - if (fnp->name_off != INVALID_ROFF) { - memset(&t, 0, sizeof(t)); - t.data = R_ADDR(&dblp->reginfo, fnp->name_off); - t.size = strlen(t.data) + 1; - } - memset(&fid_dbt, 0, sizeof(fid_dbt)); - fid_dbt.data = fnp->ufid; - fid_dbt.size = DB_FILE_ID_LEN; - /* - * Output LOG_CHECKPOINT records which will be - * processed during the OPENFILES pass of recovery. 
- * At the end of recovery we want to output the - * files that were open so that a future recovery - * run will have the correct files open during - * a backward pass. For this we output LOG_CLOSE - * records so that the files will be closed on - * the forward pass. - */ - if ((ret = __log_register_log(dbenv, - NULL, &r_unused, 0, - F_ISSET(dblp, DBLOG_RECOVER) ? LOG_CLOSE : LOG_CHECKPOINT, - fnp->name_off == INVALID_ROFF ? NULL : &t, - &fid_dbt, fnp->id, fnp->s_type, fnp->meta_pgno)) != 0) + memset(&hdr, 0, sizeof(HDR)); + t = *rec; + dbt = &t; + need_free = 0; + db_cipher = (DB_CIPHER *)dbenv->crypto_handle; + if (CRYPTO_ON(dbenv)) + t.size += db_cipher->adj_size(rec->size); + if ((ret = __os_calloc(dbenv, 1, t.size, &t.data)) != 0) + goto err; + need_free = 1; + memcpy(t.data, rec->data, rec->size); + + if ((ret = __log_encrypt_record(dbenv, dbt, &hdr, rec->size)) != 0) + goto err; + __db_chksum(t.data, t.size, + (CRYPTO_ON(dbenv)) ? db_cipher->mac_key : NULL, hdr.chksum); + + DB_ASSERT(log_compare(lsnp, &lp->lsn) == 0); + ret = __log_putr(dblp, lsnp, dbt, lp->lsn.offset - lp->len, &hdr); +err: + if (need_free) + __os_free(dbenv, t.data); + return (ret); +} + +static int +__log_encrypt_record(dbenv, dbt, hdr, orig) + DB_ENV *dbenv; + DBT *dbt; + HDR *hdr; + u_int32_t orig; +{ + DB_CIPHER *db_cipher; + int ret; + + if (CRYPTO_ON(dbenv)) { + db_cipher = (DB_CIPHER *)dbenv->crypto_handle; + hdr->size = HDR_CRYPTO_SZ; + hdr->orig_size = orig; + if ((ret = db_cipher->encrypt(dbenv, db_cipher->data, + hdr->iv, dbt->data, dbt->size)) != 0) return (ret); + } else { + hdr->size = HDR_NORMAL_SZ; } return (0); } diff --git a/bdb/log/log_rec.c b/bdb/log/log_rec.c deleted file mode 100644 index 493dd06d4c6..00000000000 --- a/bdb/log/log_rec.c +++ /dev/null @@ -1,647 +0,0 @@ -/*- - * See the file LICENSE for redistribution information. - * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 - * Sleepycat Software. All rights reserved. 
- */ -/* - * Copyright (c) 1995, 1996 - * The President and Fellows of Harvard University. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- */ - -#include "db_config.h" - -#ifndef lint -static const char revid[] = "$Id: log_rec.c,v 11.48 2001/01/11 18:19:53 bostic Exp $"; -#endif /* not lint */ - -#ifndef NO_SYSTEM_INCLUDES -#include <sys/types.h> - -#include <string.h> -#endif - -#include "db_int.h" -#include "db_page.h" -#include "db_am.h" -#include "log.h" - -static int __log_check_master __P((DB_ENV *, u_int8_t *, char *)); -static int __log_do_open __P((DB_ENV *, DB_LOG *, - u_int8_t *, char *, DBTYPE, int32_t, db_pgno_t)); -static int __log_open_file __P((DB_ENV *, DB_LOG *, __log_register_args *)); - -/* - * PUBLIC: int __log_register_recover - * PUBLIC: __P((DB_ENV *, DBT *, DB_LSN *, db_recops, void *)); - */ -int -__log_register_recover(dbenv, dbtp, lsnp, op, info) - DB_ENV *dbenv; - DBT *dbtp; - DB_LSN *lsnp; - db_recops op; - void *info; -{ - DB_ENTRY *dbe; - DB_LOG *logp; - DB *dbp; - __log_register_args *argp; - int do_rem, ret, t_ret; - - logp = dbenv->lg_handle; - dbp = NULL; - -#ifdef DEBUG_RECOVER - REC_PRINT(__log_register_print); -#endif - COMPQUIET(lsnp, NULL); - - if ((ret = __log_register_read(dbenv, dbtp->data, &argp)) != 0) - goto out; - - if ((argp->opcode == LOG_OPEN && - (DB_REDO(op) || op == DB_TXN_OPENFILES)) || - (argp->opcode == LOG_CLOSE && DB_UNDO(op))) { - /* - * If we are redoing an open or undoing a close, then we need - * to open a file. We must open the file even if - * the meta page is not yet written as we may be creating it. - */ - if (op == DB_TXN_OPENFILES) - F_SET(logp, DBLOG_FORCE_OPEN); - ret = __log_open_file(dbenv, logp, argp); - F_CLR(logp, DBLOG_FORCE_OPEN); - if (ret == ENOENT || ret == EINVAL) { - if (op == DB_TXN_OPENFILES && argp->name.size != 0 && - (ret = __db_txnlist_delete(dbenv, info, - argp->name.data, argp->fileid, 0)) != 0) - goto out; - ret = 0; - } - } else if (argp->opcode != LOG_CHECKPOINT) { - /* - * If we are undoing an open, then we need to close the file. - * - * If the file is deleted, then we can just ignore this close. 
- * Otherwise, we should usually have a valid dbp we should - * close or whose reference count should be decremented. - * However, if we shut down without closing a file, we may, in - * fact, not have the file open, and that's OK. - */ - do_rem = 0; - MUTEX_THREAD_LOCK(dbenv, logp->mutexp); - if (argp->fileid < logp->dbentry_cnt) { - dbe = &logp->dbentry[argp->fileid]; - - DB_ASSERT(dbe->refcount == 1); - - ret = __db_txnlist_close(info, - argp->fileid, dbe->count); - if ((dbp = TAILQ_FIRST(&dbe->dblist)) != NULL) - (void)log_unregister(dbenv, dbp); - do_rem = 1; - } - MUTEX_THREAD_UNLOCK(dbenv, logp->mutexp); - if (do_rem) { - (void)__log_rem_logid(logp, dbp, argp->fileid); - /* - * If remove or rename has closed the file, don't - * sync. - */ - if (dbp != NULL && - (t_ret = dbp->close(dbp, - dbp->mpf == NULL ? DB_NOSYNC : 0)) != 0 && ret == 0) - ret = t_ret; - } - } else if (DB_UNDO(op) || op == DB_TXN_OPENFILES) { - /* - * It's a checkpoint and we are rolling backward. It - * is possible that the system was shut down and thus - * ended with a stable checkpoint; this file was never - * closed and has therefore not been reopened yet. If - * so, we need to try to open it. - */ - ret = __log_open_file(dbenv, logp, argp); - if (ret == ENOENT || ret == EINVAL) { - if (argp->name.size != 0 && (ret = - __db_txnlist_delete(dbenv, info, - argp->name.data, argp->fileid, 0)) != 0) - goto out; - ret = 0; - } - } - -out: if (argp != NULL) - __os_free(argp, 0); - return (ret); -} - -/* - * __log_open_file -- - * Called during log_register recovery. Make sure that we have an - * entry in the dbentry table for this ndx. Returns 0 on success, - * non-zero on error. - */ -static int -__log_open_file(dbenv, lp, argp) - DB_ENV *dbenv; - DB_LOG *lp; - __log_register_args *argp; -{ - DB_ENTRY *dbe; - DB *dbp; - - /* - * We never re-open temporary files. Temp files are only - * useful during aborts in which case the dbp was entered - * when the file was registered. 
During recovery, we treat - * temp files as properly deleted files, allowing the open to - * fail and not reporting any errors when recovery fails to - * get a valid dbp from db_fileid_to_db. - */ - if (argp->name.size == 0) { - (void)__log_add_logid(dbenv, lp, NULL, argp->fileid); - return (ENOENT); - } - - /* - * Because of reference counting, we cannot automatically close files - * during recovery, so when we're opening, we have to check that the - * name we are opening is what we expect. If it's not, then we close - * the old file and open the new one. - */ - MUTEX_THREAD_LOCK(dbenv, lp->mutexp); - if (argp->fileid < lp->dbentry_cnt) - dbe = &lp->dbentry[argp->fileid]; - else - dbe = NULL; - - if (dbe != NULL) { - dbe->deleted = 0; - if ((dbp = TAILQ_FIRST(&dbe->dblist)) != NULL) { - if (dbp->meta_pgno != argp->meta_pgno || - memcmp(dbp->fileid, - argp->uid.data, DB_FILE_ID_LEN) != 0) { - MUTEX_THREAD_UNLOCK(dbenv, lp->mutexp); - goto reopen; - } - if (!F_ISSET(lp, DBLOG_RECOVER)) - dbe->refcount++; - MUTEX_THREAD_UNLOCK(dbenv, lp->mutexp); - return (0); - } - } - - MUTEX_THREAD_UNLOCK(dbenv, lp->mutexp); - if (0) { -reopen: (void)log_unregister(dbp->dbenv, dbp); - (void)__log_rem_logid(lp, dbp, argp->fileid); - dbp->close(dbp, 0); - } - - return (__log_do_open(dbenv, lp, - argp->uid.data, argp->name.data, - argp->ftype, argp->fileid, argp->meta_pgno)); -} - -/* - * log_reopen_file -- close and reopen a db file. - * Must be called when a metadata page changes. 
- * - * PUBLIC: int __log_reopen_file __P((DB_ENV *, - * PUBLIC: char *, int32_t, u_int8_t *, db_pgno_t)); - * - */ -int -__log_reopen_file(dbenv, name, ndx, fileid, meta_pgno) - DB_ENV *dbenv; - char *name; - int32_t ndx; - u_int8_t *fileid; - db_pgno_t meta_pgno; -{ - DB *dbp; - DB_LOG *logp; - DBTYPE ftype; - FNAME *fnp; - LOG *lp; - char *tmp_name; - int ret; - - logp = dbenv->lg_handle; - - if (name == NULL) { - R_LOCK(dbenv, &logp->reginfo); - - lp = logp->reginfo.primary; - - for (fnp = SH_TAILQ_FIRST(&lp->fq, __fname); - fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname)) { - if (fnp->ref == 0) /* Entry not in use. */ - continue; - if (memcmp(fnp->ufid, fileid, DB_FILE_ID_LEN) == 0) - break; - } - - if (fnp == 0 || fnp->name_off == INVALID_ROFF) { - __db_err(dbenv, - "metasub recover: non-existent file id"); - return (EINVAL); - } - - name = R_ADDR(&logp->reginfo, fnp->name_off); - ret = __os_strdup(dbenv, name, &tmp_name); - R_UNLOCK(dbenv, &logp->reginfo); - if (ret != 0) - goto out; - name = tmp_name; - } else - tmp_name = NULL; - - if ((ret = __db_fileid_to_db(dbenv, &dbp, ndx, 0)) != 0) - goto out; - ftype = dbp->type; - (void)log_unregister(dbenv, dbp); - (void)__log_rem_logid(logp, dbp, ndx); - (void)dbp->close(dbp, 0); - - ret = __log_do_open(dbenv, logp, fileid, name, ftype, ndx, meta_pgno); - - if (tmp_name != NULL) - __os_free(tmp_name, 0); - -out: return (ret); -} - -/* - * __log_do_open -- - * Open files referenced in the log. This is the part of the open that - * is not protected by the thread mutex. 
- */ -static int -__log_do_open(dbenv, lp, uid, name, ftype, ndx, meta_pgno) - DB_ENV *dbenv; - DB_LOG *lp; - u_int8_t *uid; - char *name; - DBTYPE ftype; - int32_t ndx; - db_pgno_t meta_pgno; -{ - DB *dbp; - int ret; - u_int8_t zeroid[DB_FILE_ID_LEN]; - - if ((ret = db_create(&dbp, lp->dbenv, 0)) != 0) - return (ret); - - dbp->log_fileid = ndx; - - /* - * This is needed to signal to the locking routines called while - * opening databases that we are potentially undoing a transaction - * from an XA process. Since the XA process does not share - * locks with the aborting transaction this prevents us from - * deadlocking during the open during rollback. - * Because this routine is called either during recovery or during an - * XA_ABORT, we can safely set DB_AM_RECOVER in the dbp since it - * will not be shared with other threads. - */ - F_SET(dbp, DB_AM_RECOVER); - if (meta_pgno != PGNO_BASE_MD) - memcpy(dbp->fileid, uid, DB_FILE_ID_LEN); - dbp->type = ftype; - if ((ret = - __db_dbopen(dbp, name, 0, __db_omode("rw----"), meta_pgno)) == 0) { - /* - * Verify that we are opening the same file that we were - * referring to when we wrote this log record. 
- */ - if (meta_pgno != PGNO_BASE_MD && - __log_check_master(dbenv, uid, name) != 0) - goto not_right; - if (memcmp(uid, dbp->fileid, DB_FILE_ID_LEN) != 0) { - memset(zeroid, 0, DB_FILE_ID_LEN); - if (memcmp(dbp->fileid, zeroid, DB_FILE_ID_LEN) != 0) - goto not_right; - memcpy(dbp->fileid, uid, DB_FILE_ID_LEN); - } - if (IS_RECOVERING(dbenv)) { - (void)log_register(dbp->dbenv, dbp, name); - (void)__log_add_logid(dbenv, lp, dbp, ndx); - } - return (0); - } - -not_right: - (void)dbp->close(dbp, 0); - (void)__log_add_logid(dbenv, lp, NULL, ndx); - - return (ENOENT); -} - -static int -__log_check_master(dbenv, uid, name) - DB_ENV *dbenv; - u_int8_t *uid; - char *name; -{ - DB *dbp; - int ret; - - ret = 0; - if ((ret = db_create(&dbp, dbenv, 0)) != 0) - return (ret); - dbp->type = DB_BTREE; - ret = __db_dbopen(dbp, name, 0, __db_omode("rw----"), PGNO_BASE_MD); - - if (ret == 0 && memcmp(uid, dbp->fileid, DB_FILE_ID_LEN) != 0) - ret = EINVAL; - - (void) dbp->close(dbp, 0); - return (ret); -} - -/* - * __log_add_logid -- - * Adds a DB entry to the log's DB entry table. - * - * PUBLIC: int __log_add_logid __P((DB_ENV *, DB_LOG *, DB *, int32_t)); - */ -int -__log_add_logid(dbenv, logp, dbp, ndx) - DB_ENV *dbenv; - DB_LOG *logp; - DB *dbp; - int32_t ndx; -{ - DB *dbtmp; - int32_t i; - int ret; - - ret = 0; - - MUTEX_THREAD_LOCK(dbenv, logp->mutexp); - - /* - * Check if we need to grow the table. Note, ndx is 0-based (the - * index into the DB entry table) an dbentry_cnt is 1-based, the - * number of available slots. - */ - if (logp->dbentry_cnt <= ndx) { - if ((ret = __os_realloc(dbenv, - (ndx + DB_GROW_SIZE) * sizeof(DB_ENTRY), - NULL, &logp->dbentry)) != 0) - goto err; - - /* - * We have moved the head of the queue. - * Fix up the queue header of an empty queue or the previous - * pointer of the first element. 
- */ - for (i = 0; i < logp->dbentry_cnt; i++) { - if ((dbtmp = - TAILQ_FIRST(&logp->dbentry[i].dblist)) == NULL) - TAILQ_INIT(&logp->dbentry[i].dblist); - else - TAILQ_REINSERT_HEAD( - &logp->dbentry[i].dblist, dbtmp, links); - } - - /* Initialize the new entries. */ - for (i = logp->dbentry_cnt; i < ndx + DB_GROW_SIZE; i++) { - logp->dbentry[i].count = 0; - TAILQ_INIT(&logp->dbentry[i].dblist); - logp->dbentry[i].deleted = 0; - logp->dbentry[i].refcount = 0; - } - - logp->dbentry_cnt = i; - } - - if (logp->dbentry[ndx].deleted == 0 && - TAILQ_FIRST(&logp->dbentry[ndx].dblist) == NULL) { - logp->dbentry[ndx].count = 0; - if (dbp != NULL) - TAILQ_INSERT_HEAD(&logp->dbentry[ndx].dblist, - dbp, links); - logp->dbentry[ndx].deleted = dbp == NULL; - logp->dbentry[ndx].refcount = 1; - } else if (!F_ISSET(logp, DBLOG_RECOVER)) { - if (dbp != NULL) - TAILQ_INSERT_HEAD(&logp->dbentry[ndx].dblist, - dbp, links); - logp->dbentry[ndx].refcount++; - } - -err: MUTEX_THREAD_UNLOCK(dbenv, logp->mutexp); - return (ret); -} - -/* - * __db_fileid_to_db -- - * Return the DB corresponding to the specified fileid. - * - * PUBLIC: int __db_fileid_to_db __P((DB_ENV *, DB **, int32_t, int)); - */ -int -__db_fileid_to_db(dbenv, dbpp, ndx, inc) - DB_ENV *dbenv; - DB **dbpp; - int32_t ndx; - int inc; -{ - DB_LOG *logp; - DB *dbp; - FNAME *fname; - int ret; - char *name; - - ret = 0; - logp = dbenv->lg_handle; - - MUTEX_THREAD_LOCK(dbenv, logp->mutexp); - - /* - * Under XA, a process different than the one issuing DB operations - * may abort a transaction. In this case, recovery routines are run - * by a process that does not necessarily have the file open, so we - * we must open the file explicitly. 
- */ - if (ndx >= logp->dbentry_cnt || - (!logp->dbentry[ndx].deleted && - (dbp = TAILQ_FIRST(&logp->dbentry[ndx].dblist)) == NULL)) { - if (F_ISSET(logp, DBLOG_RECOVER)) { - ret = ENOENT; - goto err; - } - if (__log_lid_to_fname(logp, ndx, &fname) != 0) { - /* Couldn't find entry; this is a fatal error. */ - __db_err(dbenv, "Missing log fileid entry"); - ret = EINVAL; - goto err; - } - name = R_ADDR(&logp->reginfo, fname->name_off); - - /* - * __log_do_open is called without protection of the - * log thread lock. - */ - MUTEX_THREAD_UNLOCK(dbenv, logp->mutexp); - - /* - * At this point, we are not holding the thread lock, so exit - * directly instead of going through the exit code at the - * bottom. If the __log_do_open succeeded, then we don't need - * to do any of the remaining error checking at the end of this - * routine. - */ - if ((ret = __log_do_open(dbenv, logp, - fname->ufid, name, fname->s_type, - ndx, fname->meta_pgno)) != 0) - return (ret); - - *dbpp = TAILQ_FIRST(&logp->dbentry[ndx].dblist); - return (0); - } - - /* - * Return DB_DELETED if the file has been deleted (it's not an error). - */ - if (logp->dbentry[ndx].deleted) { - ret = DB_DELETED; - if (inc) - logp->dbentry[ndx].count++; - goto err; - } - - /* - * Otherwise return 0, but if we don't have a corresponding DB, it's - * an error. - */ - if ((*dbpp = TAILQ_FIRST(&logp->dbentry[ndx].dblist)) == NULL) - ret = ENOENT; - -err: MUTEX_THREAD_UNLOCK(dbenv, logp->mutexp); - return (ret); -} - -/* - * __log_close_files -- - * Close files that were opened by the recovery daemon. We sync the - * file, unless its mpf pointer has been NULLed by a db_remove or - * db_rename. We may not have flushed the log_register record that - * closes the file. 
- * - * PUBLIC: void __log_close_files __P((DB_ENV *)); - */ -void -__log_close_files(dbenv) - DB_ENV *dbenv; -{ - DB_ENTRY *dbe; - DB_LOG *logp; - DB *dbp; - int32_t i; - - logp = dbenv->lg_handle; - MUTEX_THREAD_LOCK(dbenv, logp->mutexp); - for (i = 0; i < logp->dbentry_cnt; i++) { - dbe = &logp->dbentry[i]; - while ((dbp = TAILQ_FIRST(&dbe->dblist)) != NULL) { - (void)log_unregister(dbenv, dbp); - TAILQ_REMOVE(&dbe->dblist, dbp, links); - (void)dbp->close(dbp, dbp->mpf == NULL ? DB_NOSYNC : 0); - } - dbe->deleted = 0; - dbe->refcount = 0; - } - MUTEX_THREAD_UNLOCK(dbenv, logp->mutexp); -} - -/* - * __log_rem_logid - * Remove an entry from the log table. Find the appropriate DB and - * unlink it from the linked list off the table. If the DB is NULL, treat - * this as a simple refcount decrement. - * - * PUBLIC: void __log_rem_logid __P((DB_LOG *, DB *, int32_t)); - */ -void -__log_rem_logid(logp, dbp, ndx) - DB_LOG *logp; - DB *dbp; - int32_t ndx; -{ - DB *xdbp; - - MUTEX_THREAD_LOCK(logp->dbenv, logp->mutexp); - if (--logp->dbentry[ndx].refcount == 0) { - TAILQ_INIT(&logp->dbentry[ndx].dblist); - logp->dbentry[ndx].deleted = 0; - } else if (dbp != NULL) - for (xdbp = TAILQ_FIRST(&logp->dbentry[ndx].dblist); - xdbp != NULL; - xdbp = TAILQ_NEXT(xdbp, links)) - if (xdbp == dbp) { - TAILQ_REMOVE(&logp->dbentry[ndx].dblist, - xdbp, links); - break; - } - - MUTEX_THREAD_UNLOCK(logp->dbenv, logp->mutexp); -} - -/* - * __log_lid_to_fname -- - * Traverse the shared-memory region looking for the entry that - * matches the passed log fileid. Returns 0 on success; -1 on error. - * PUBLIC: int __log_lid_to_fname __P((DB_LOG *, int32_t, FNAME **)); - */ -int -__log_lid_to_fname(dblp, lid, fnamep) - DB_LOG *dblp; - int32_t lid; - FNAME **fnamep; -{ - FNAME *fnp; - LOG *lp; - - lp = dblp->reginfo.primary; - - for (fnp = SH_TAILQ_FIRST(&lp->fq, __fname); - fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname)) { - if (fnp->ref == 0) /* Entry not in use. 
*/ - continue; - if (fnp->id == lid) { - *fnamep = fnp; - return (0); - } - } - return (-1); -} diff --git a/bdb/log/log_register.c b/bdb/log/log_register.c deleted file mode 100644 index 1e0e523d8b9..00000000000 --- a/bdb/log/log_register.c +++ /dev/null @@ -1,433 +0,0 @@ -/*- - * See the file LICENSE for redistribution information. - * - * Copyright (c) 1996, 1997, 1998, 1999, 2000 - * Sleepycat Software. All rights reserved. - */ -#include "db_config.h" - -#ifndef lint -static const char revid[] = "$Id: log_register.c,v 11.35 2001/01/10 16:04:19 bostic Exp $"; -#endif /* not lint */ - -#ifndef NO_SYSTEM_INCLUDES -#include <sys/types.h> - -#include <string.h> -#endif - -#ifdef HAVE_RPC -#include "db_server.h" -#endif - -#include "db_int.h" -#include "log.h" - -#ifdef HAVE_RPC -#include "gen_client_ext.h" -#include "rpc_client_ext.h" -#endif - -/* - * log_register -- - * Register a file name. - */ -int -log_register(dbenv, dbp, name) - DB_ENV *dbenv; - DB *dbp; - const char *name; -{ - DBT fid_dbt, r_name; - DB_LOG *dblp; - DB_LSN r_unused; - FNAME *found_fnp, *fnp, *recover_fnp, *reuse_fnp; - LOG *lp; - size_t len; - int32_t maxid; - int inserted, ok, ret; - void *namep; - -#ifdef HAVE_RPC - if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) - return (__dbcl_log_register(dbenv, dbp, name)); -#endif - - PANIC_CHECK(dbenv); - ENV_REQUIRES_CONFIG(dbenv, dbenv->lg_handle, DB_INIT_LOG); - - dblp = dbenv->lg_handle; - lp = dblp->reginfo.primary; - fnp = reuse_fnp = NULL; - inserted = ret = 0; - namep = NULL; - - /* Check the arguments. 
*/ - if (dbp->type != DB_BTREE && dbp->type != DB_QUEUE && - dbp->type != DB_HASH && dbp->type != DB_RECNO) { - __db_err(dbenv, "log_register: unknown DB file type"); - return (EINVAL); - } - - R_LOCK(dbenv, &dblp->reginfo); - - /* - * See if we've already got this file in the log, finding the - * (maximum+1) in-use file id and some available file id (if we - * find an available fid, we'll use it, else we'll have to allocate - * one after the maximum that we found). - */ - ok = 0; - found_fnp = recover_fnp = NULL; - for (maxid = 0, fnp = SH_TAILQ_FIRST(&lp->fq, __fname); - fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname)) { - if (F_ISSET(dblp, DBLOG_RECOVER) && fnp->id == dbp->log_fileid) - recover_fnp = fnp; - if (fnp->ref == 0) { /* Entry is not in use. */ - if (reuse_fnp == NULL) - reuse_fnp = fnp; - continue; - } - if (memcmp(dbp->fileid, fnp->ufid, DB_FILE_ID_LEN) == 0) { - if (fnp->meta_pgno == 0) { - if (fnp->locked == 1) { - __db_err(dbenv, "File is locked"); - return (EINVAL); - } - if (found_fnp != NULL) { - fnp = found_fnp; - goto found; - } - ok = 1; - } - if (dbp->meta_pgno == fnp->meta_pgno) { - if (F_ISSET(dblp, DBLOG_RECOVER)) { - if (fnp->id != dbp->log_fileid) { - /* - * If we are in recovery, there - * is only one dbp on the list. - * If the refcount goes to 0, - * we will clear the list. If - * it doesn't, we want to leave - * the dbp where it is, so - * passing a NULL to rem_logid - * is correct. - */ - __log_rem_logid(dblp, - NULL, fnp->id); - if (recover_fnp != NULL) - break; - continue; - } - fnp->ref = 1; - goto found; - } - ++fnp->ref; - if (ok) - goto found; - found_fnp = fnp; - } - } - if (maxid <= fnp->id) - maxid = fnp->id + 1; - } - if ((fnp = found_fnp) != NULL) - goto found; - - /* Fill in fnp structure. */ - if (recover_fnp != NULL) /* This has the right number */ - fnp = recover_fnp; - else if (reuse_fnp != NULL) /* Reuse existing one. */ - fnp = reuse_fnp; - else { /* Allocate a new one. 
*/ - if ((ret = __db_shalloc(dblp->reginfo.addr, - sizeof(FNAME), 0, &fnp)) != 0) - goto mem_err; - fnp->id = maxid; - } - - if (F_ISSET(dblp, DBLOG_RECOVER)) - fnp->id = dbp->log_fileid; - - fnp->ref = 1; - fnp->locked = 0; - fnp->s_type = dbp->type; - memcpy(fnp->ufid, dbp->fileid, DB_FILE_ID_LEN); - fnp->meta_pgno = dbp->meta_pgno; - - if (name != NULL) { - len = strlen(name) + 1; - if ((ret = - __db_shalloc(dblp->reginfo.addr, len, 0, &namep)) != 0) { -mem_err: __db_err(dbenv, - "Unable to allocate memory to register %s", name); - goto err; - } - fnp->name_off = R_OFFSET(&dblp->reginfo, namep); - memcpy(namep, name, len); - } else - fnp->name_off = INVALID_ROFF; - - /* Only do the insert if we allocated a new fnp. */ - if (reuse_fnp == NULL && recover_fnp == NULL) - SH_TAILQ_INSERT_HEAD(&lp->fq, fnp, q, __fname); - inserted = 1; - - /* Log the registry. */ - if (!F_ISSET(dblp, DBLOG_RECOVER)) { - /* - * We allow logging on in-memory databases, so the name here - * could be NULL. - */ - if (name != NULL) { - r_name.data = (void *)name; - r_name.size = strlen(name) + 1; - } - memset(&fid_dbt, 0, sizeof(fid_dbt)); - fid_dbt.data = dbp->fileid; - fid_dbt.size = DB_FILE_ID_LEN; - if ((ret = __log_register_log(dbenv, NULL, &r_unused, - 0, LOG_OPEN, name == NULL ? NULL : &r_name, - &fid_dbt, fnp->id, dbp->type, dbp->meta_pgno)) != 0) - goto err; - } - -found: /* - * If we found the entry in the shared area, then the file is - * already open, so there is no need to log the open. We only - * log the open and closes on the first open and last close. 
- */ - if (!F_ISSET(dblp, DBLOG_RECOVER) && - (ret = __log_add_logid(dbenv, dblp, dbp, fnp->id)) != 0) - goto err; - - if (!F_ISSET(dblp, DBLOG_RECOVER)) - dbp->log_fileid = fnp->id; - - if (0) { -err: if (inserted) - SH_TAILQ_REMOVE(&lp->fq, fnp, q, __fname); - if (namep != NULL) - __db_shalloc_free(dblp->reginfo.addr, namep); - if (fnp != NULL) - __db_shalloc_free(dblp->reginfo.addr, fnp); - } - - R_UNLOCK(dbenv, &dblp->reginfo); - - return (ret); -} - -/* - * log_unregister -- - * Discard a registered file name. - */ -int -log_unregister(dbenv, dbp) - DB_ENV *dbenv; - DB *dbp; -{ - int ret; - -#ifdef HAVE_RPC - if (F_ISSET(dbenv, DB_ENV_RPCCLIENT)) - return (__dbcl_log_unregister(dbenv, dbp)); -#endif - - PANIC_CHECK(dbenv); - ENV_REQUIRES_CONFIG(dbenv, dbenv->lg_handle, DB_INIT_LOG); - - ret = __log_filelist_update(dbenv, dbp, dbp->log_fileid, NULL, NULL); - dbp->log_fileid = DB_LOGFILEID_INVALID; - return (ret); -} - -/* - * PUBLIC: int __log_filelist_update - * PUBLIC: __P((DB_ENV *, DB *, int32_t, const char *, int *)); - * - * Utility player for updating and logging the file list. Called - * for 3 reasons: - * 1) mark file closed: newname == NULL. - * 2) change filename: newname != NULL. - * 3) from recovery to verify & change filename if necessary, set != NULL. - */ -int -__log_filelist_update(dbenv, dbp, fid, newname, set) - DB_ENV *dbenv; - DB *dbp; - int32_t fid; - const char *newname; - int *set; -{ - DBT fid_dbt, r_name; - DB_LOG *dblp; - DB_LSN r_unused; - FNAME *fnp; - LOG *lp; - u_int32_t len, newlen; - int ret; - void *namep; - - ret = 0; - dblp = dbenv->lg_handle; - lp = dblp->reginfo.primary; - - R_LOCK(dbenv, &dblp->reginfo); - - /* Find the entry in the log. 
*/ - for (fnp = SH_TAILQ_FIRST(&lp->fq, __fname); - fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname)) - if (fid == fnp->id) - break; - if (fnp == NULL) { - __db_err(dbenv, "log_unregister: non-existent file id"); - ret = EINVAL; - goto ret1; - } - - /* - * Log the unregistry only if this is the last one and we are - * really closing the file or if this is an abort of a created - * file and we need to make sure there is a record in the log. - */ - namep = NULL; - len = 0; - if (fnp->name_off != INVALID_ROFF) { - namep = R_ADDR(&dblp->reginfo, fnp->name_off); - len = strlen(namep) + 1; - } - if (!F_ISSET(dblp, DBLOG_RECOVER) && fnp->ref == 1) { - if (namep != NULL) { - memset(&r_name, 0, sizeof(r_name)); - r_name.data = namep; - r_name.size = len; - } - memset(&fid_dbt, 0, sizeof(fid_dbt)); - fid_dbt.data = fnp->ufid; - fid_dbt.size = DB_FILE_ID_LEN; - if ((ret = __log_register_log(dbenv, NULL, &r_unused, - 0, LOG_CLOSE, - fnp->name_off == INVALID_ROFF ? NULL : &r_name, - &fid_dbt, fid, fnp->s_type, fnp->meta_pgno)) - != 0) - goto ret1; - } - - /* - * If we are changing the name we must log this fact. - */ - if (newname != NULL) { - DB_ASSERT(fnp->ref == 1); - newlen = strlen(newname) + 1; - if (!F_ISSET(dblp, DBLOG_RECOVER)) { - r_name.data = (void *) newname; - r_name.size = newlen; - if ((ret = __log_register_log(dbenv, - NULL, &r_unused, 0, LOG_OPEN, &r_name, &fid_dbt, - fnp->id, fnp->s_type, fnp->meta_pgno)) != 0) - goto ret1; - } - - /* - * Check to see if the name is already correct. 
- */ - if (set != NULL) { - if (len != newlen || memcmp(namep, newname, len) != 0) - *set = 1; - else { - *set = 0; - goto ret1; - } - } - - /* - * Change the name, realloc memory if necessary - */ - if (len < newlen) { - __db_shalloc_free(dblp->reginfo.addr, - R_ADDR(&dblp->reginfo, fnp->name_off)); - if ((ret = __db_shalloc( - dblp->reginfo.addr, newlen, 0, &namep)) != 0) { - __db_err(dbenv, - "Unable to allocate memory to register %s", - newname); - goto ret1; - } - fnp->name_off = R_OFFSET(&dblp->reginfo, namep); - } else - namep = R_ADDR(&dblp->reginfo, fnp->name_off); - memcpy(namep, newname, newlen); - } else { - - /* - * If more than 1 reference, just decrement the reference - * and return. Otherwise, free the name if one exists. - */ - DB_ASSERT(fnp->ref >= 1); - --fnp->ref; - if (fnp->ref == 0) { - if (fnp->name_off != INVALID_ROFF) - __db_shalloc_free(dblp->reginfo.addr, - R_ADDR(&dblp->reginfo, fnp->name_off)); - fnp->name_off = INVALID_ROFF; - } - - /* - * Remove from the process local table. If this - * operation is taking place during recovery, then - * the logid was never added to the table, so do not remove it. - */ - if (!F_ISSET(dblp, DBLOG_RECOVER)) - __log_rem_logid(dblp, dbp, fid); - } - -ret1: R_UNLOCK(dbenv, &dblp->reginfo); - return (ret); -} - -/* - * __log_file_lock -- lock a file for single access - * This only works if logging is on. 
- * - * PUBLIC: int __log_file_lock __P((DB *)); - */ -int -__log_file_lock(dbp) - DB *dbp; -{ - DB_ENV *dbenv; - DB_LOG *dblp; - FNAME *fnp; - LOG *lp; - int ret; - - dbenv = dbp->dbenv; - dblp = dbenv->lg_handle; - lp = dblp->reginfo.primary; - - ret = 0; - R_LOCK(dbenv, &dblp->reginfo); - - for (fnp = SH_TAILQ_FIRST(&lp->fq, __fname); - fnp != NULL; fnp = SH_TAILQ_NEXT(fnp, q, __fname)) { - if (fnp->ref == 0) - continue; - - if (!memcmp(dbp->fileid, fnp->ufid, DB_FILE_ID_LEN)) { - if (fnp->meta_pgno == 0) { - if (fnp->ref != 1) - goto err; - - fnp->locked = 1; - } else { -err: __db_err(dbp->dbenv, "File is open"); - ret = EINVAL; - goto done; - } - - } - } -done: R_UNLOCK(dbenv, &dblp->reginfo); - return (ret); -} |