diff options
Diffstat (limited to 'storage/bdb/log/log_get.c')
-rw-r--r-- | storage/bdb/log/log_get.c | 1058 |
1 files changed, 1058 insertions, 0 deletions
diff --git a/storage/bdb/log/log_get.c b/storage/bdb/log/log_get.c new file mode 100644 index 00000000000..c8b028da0fb --- /dev/null +++ b/storage/bdb/log/log_get.c @@ -0,0 +1,1058 @@ +/*- + * See the file LICENSE for redistribution information. + * + * Copyright (c) 1996-2002 + * Sleepycat Software. All rights reserved. + */ +#include "db_config.h" + +#ifndef lint +static const char revid[] = "$Id: log_get.c,v 11.81 2002/08/14 20:09:27 bostic Exp $"; +#endif /* not lint */ + +#ifndef NO_SYSTEM_INCLUDES +#include <sys/types.h> + +#include <string.h> +#include <unistd.h> +#endif + +#include "db_int.h" +#include "dbinc/crypto.h" +#include "dbinc/db_page.h" +#include "dbinc/hmac.h" +#include "dbinc/log.h" +#include "dbinc/hash.h" + +typedef enum { L_ALREADY, L_ACQUIRED, L_NONE } RLOCK; + +static int __log_c_close __P((DB_LOGC *, u_int32_t)); +static int __log_c_get __P((DB_LOGC *, DB_LSN *, DBT *, u_int32_t)); +static int __log_c_get_int __P((DB_LOGC *, DB_LSN *, DBT *, u_int32_t)); +static int __log_c_hdrchk __P((DB_LOGC *, HDR *, int *)); +static int __log_c_incursor __P((DB_LOGC *, DB_LSN *, HDR *, u_int8_t **)); +static int __log_c_inregion __P((DB_LOGC *, + DB_LSN *, RLOCK *, DB_LSN *, HDR *, u_int8_t **)); +static int __log_c_io __P((DB_LOGC *, + u_int32_t, u_int32_t, void *, size_t *, int *)); +static int __log_c_ondisk __P((DB_LOGC *, + DB_LSN *, DB_LSN *, int, HDR *, u_int8_t **, int *)); +static int __log_c_set_maxrec __P((DB_LOGC *, char *)); +static int __log_c_shortread __P((DB_LOGC *, int)); + +/* + * __log_cursor -- + * Create a log cursor. + * + * PUBLIC: int __log_cursor __P((DB_ENV *, DB_LOGC **, u_int32_t)); + */ +int +__log_cursor(dbenv, logcp, flags) + DB_ENV *dbenv; + DB_LOGC **logcp; + u_int32_t flags; +{ + DB_LOGC *logc; + int ret; + + PANIC_CHECK(dbenv); + ENV_REQUIRES_CONFIG(dbenv, + dbenv->lg_handle, "DB_ENV->log_cursor", DB_INIT_LOG); + + *logcp = NULL; + + /* Validate arguments. */ + if ((ret = __db_fchk(dbenv, "DB_ENV->log_cursor", flags, 0)) != 0) + return (ret); + + /* Allocate memory for the cursor. */ + if ((ret = __os_calloc(dbenv, 1, sizeof(DB_LOGC), &logc)) != 0) + goto err; + if ((ret = __os_calloc(dbenv, 1, sizeof(DB_FH), &logc->c_fh)) != 0) + goto err; + + logc->bp_size = DB_LOGC_BUF_SIZE; + if ((ret = __os_malloc(dbenv, logc->bp_size, &logc->bp)) != 0) + goto err; + + logc->dbenv = dbenv; + logc->close = __log_c_close; + logc->get = __log_c_get; + + *logcp = logc; + return (0); + +err: if (logc != NULL) { + if (logc->c_fh != NULL) + __os_free(dbenv, logc->c_fh); + __os_free(dbenv, logc); + } + + return (ret); +} + +/* + * __log_c_close -- + * Close a log cursor. + */ +static int +__log_c_close(logc, flags) + DB_LOGC *logc; + u_int32_t flags; +{ + DB_ENV *dbenv; + int ret; + + dbenv = logc->dbenv; + + PANIC_CHECK(dbenv); + if ((ret = __db_fchk(dbenv, "DB_LOGC->close", flags, 0)) != 0) + return (ret); + + if (F_ISSET(logc->c_fh, DB_FH_VALID)) + (void)__os_closehandle(dbenv, logc->c_fh); + + if (logc->c_dbt.data != NULL) + __os_free(dbenv, logc->c_dbt.data); + + __os_free(dbenv, logc->bp); + __os_free(dbenv, logc->c_fh); + __os_free(dbenv, logc); + + return (0); +} + +/* + * __log_c_get -- + * Get a log record. + */ +static int +__log_c_get(logc, alsn, dbt, flags) + DB_LOGC *logc; + DB_LSN *alsn; + DBT *dbt; + u_int32_t flags; +{ + DB_ENV *dbenv; + DB_LSN saved_lsn; + int ret; + + dbenv = logc->dbenv; + + PANIC_CHECK(dbenv); + + /* Validate arguments. */ + switch (flags) { + case DB_CURRENT: + case DB_FIRST: + case DB_LAST: + case DB_NEXT: + case DB_PREV: + break; + case DB_SET: + if (IS_ZERO_LSN(*alsn)) { + __db_err(dbenv, "DB_LOGC->get: invalid LSN"); + return (EINVAL); + } + break; + default: + return (__db_ferr(dbenv, "DB_LOGC->get", 1)); + } + + /* + * On error, we take care not to overwrite the caller's LSN. This + * is because callers looking for the end of the log loop using the + * DB_NEXT flag, and expect to take the last successful lsn out of + * the passed-in structure after DB_LOGC->get fails with DB_NOTFOUND. + * + * !!! + * This line is often flagged an uninitialized memory read during a + * Purify or similar tool run, as the application didn't initialize + * *alsn. If the application isn't setting the DB_SET flag, there is + * no reason it should have initialized *alsn, but we can't know that + * and we want to make sure we never overwrite whatever the application + * put in there. + */ + saved_lsn = *alsn; + + /* + * If we get one of the log's header records as a result of doing a + * DB_FIRST, DB_NEXT, DB_LAST or DB_PREV, repeat the operation, log + * file header records aren't useful to applications. + */ + if ((ret = __log_c_get_int(logc, alsn, dbt, flags)) != 0) { + *alsn = saved_lsn; + return (ret); + } + if (alsn->offset == 0 && (flags == DB_FIRST || + flags == DB_NEXT || flags == DB_LAST || flags == DB_PREV)) { + switch (flags) { + case DB_FIRST: + flags = DB_NEXT; + break; + case DB_LAST: + flags = DB_PREV; + break; + } + if (F_ISSET(dbt, DB_DBT_MALLOC)) { + __os_free(dbenv, dbt->data); + dbt->data = NULL; + } + if ((ret = __log_c_get_int(logc, alsn, dbt, flags)) != 0) { + *alsn = saved_lsn; + return (ret); + } + } + + return (0); +} + +/* + * __log_c_get_int -- + * Get a log record; internal version. + */ +static int +__log_c_get_int(logc, alsn, dbt, flags) + DB_LOGC *logc; + DB_LSN *alsn; + DBT *dbt; + u_int32_t flags; +{ + DB_CIPHER *db_cipher; + DB_ENV *dbenv; + DB_LOG *dblp; + DB_LSN last_lsn, nlsn; + HDR hdr; + LOG *lp; + RLOCK rlock; + logfile_validity status; + u_int32_t cnt; + u_int8_t *rp; + int eof, is_hmac, ret; + + dbenv = logc->dbenv; + dblp = dbenv->lg_handle; + lp = dblp->reginfo.primary; + is_hmac = 0; + + /* + * We don't acquire the log region lock until we need it, and we + * release it as soon as we're done. + */ + rlock = F_ISSET(logc, DB_LOG_LOCKED) ? L_ALREADY : L_NONE; + + nlsn = logc->c_lsn; + switch (flags) { + case DB_NEXT: /* Next log record. */ + if (!IS_ZERO_LSN(nlsn)) { + /* Increment the cursor by the cursor record size. */ + nlsn.offset += logc->c_len; + break; + } + flags = DB_FIRST; + /* FALLTHROUGH */ + case DB_FIRST: /* First log record. */ + /* Find the first log file. */ + if ((ret = __log_find(dblp, 1, &cnt, &status)) != 0) + goto err; + + /* + * DB_LV_INCOMPLETE: + * Theoretically, the log file we want could be created + * but not yet written, the "first" log record must be + * in the log buffer. + * DB_LV_NORMAL: + * DB_LV_OLD_READABLE: + * We found a log file we can read. + * DB_LV_NONEXISTENT: + * No log files exist, the "first" log record must be in + * the log buffer. + * DB_LV_OLD_UNREADABLE: + * No readable log files exist, we're at the cross-over + * point between two versions. The "first" log record + * must be in the log buffer. + */ + switch (status) { + case DB_LV_INCOMPLETE: + DB_ASSERT(lp->lsn.file == cnt); + /* FALLTHROUGH */ + case DB_LV_NORMAL: + case DB_LV_OLD_READABLE: + nlsn.file = cnt; + break; + case DB_LV_NONEXISTENT: + nlsn.file = 1; + DB_ASSERT(lp->lsn.file == nlsn.file); + break; + case DB_LV_OLD_UNREADABLE: + nlsn.file = cnt + 1; + DB_ASSERT(lp->lsn.file == nlsn.file); + break; + } + nlsn.offset = 0; + break; + case DB_CURRENT: /* Current log record. */ + break; + case DB_PREV: /* Previous log record. */ + if (!IS_ZERO_LSN(nlsn)) { + /* If at start-of-file, move to the previous file. */ + if (nlsn.offset == 0) { + if (nlsn.file == 1 || + __log_valid(dblp, + nlsn.file - 1, 0, &status) != 0) { + ret = DB_NOTFOUND; + goto err; + } + + if (status != DB_LV_NORMAL && + status != DB_LV_OLD_READABLE) { + ret = DB_NOTFOUND; + goto err; + } + + --nlsn.file; + } + nlsn.offset = logc->c_prev; + break; + } + /* FALLTHROUGH */ + case DB_LAST: /* Last log record. */ + if (rlock == L_NONE) { + rlock = L_ACQUIRED; + R_LOCK(dbenv, &dblp->reginfo); + } + nlsn.file = lp->lsn.file; + nlsn.offset = lp->lsn.offset - lp->len; + break; + case DB_SET: /* Set log record. */ + nlsn = *alsn; + break; + } + + if (0) { /* Move to the next file. */ +next_file: ++nlsn.file; + nlsn.offset = 0; + } + + /* + * The above switch statement should have set nlsn to the lsn of + * the requested record. + */ + + if (CRYPTO_ON(dbenv)) { + hdr.size = HDR_CRYPTO_SZ; + is_hmac = 1; + } else { + hdr.size = HDR_NORMAL_SZ; + is_hmac = 0; + } + /* Check to see if the record is in the cursor's buffer. */ + if ((ret = __log_c_incursor(logc, &nlsn, &hdr, &rp)) != 0) + goto err; + if (rp != NULL) + goto cksum; + + /* + * Look to see if we're moving backward in the log with the last record + * coming from the disk -- it means the record can't be in the region's + * buffer. Else, check the region's buffer. + * + * If the record isn't in the region's buffer, we're going to have to + * read the record from disk. We want to make a point of not reading + * past the end of the logical log (after recovery, there may be data + * after the end of the logical log, not to mention the log file may + * have been pre-allocated). So, zero out last_lsn, and initialize it + * inside __log_c_inregion -- if it's still zero when we check it in + * __log_c_ondisk, that's OK, it just means the logical end of the log + * isn't an issue for this request. + */ + ZERO_LSN(last_lsn); + if (!F_ISSET(logc, DB_LOG_DISK) || + log_compare(&nlsn, &logc->c_lsn) > 0) { + F_CLR(logc, DB_LOG_DISK); + + if ((ret = __log_c_inregion(logc, + &nlsn, &rlock, &last_lsn, &hdr, &rp)) != 0) + goto err; + if (rp != NULL) + goto cksum; + } + + /* + * We have to read from an on-disk file to retrieve the record. + * If we ever can't retrieve the record at offset 0, we're done, + * return EOF/DB_NOTFOUND. + * + * Discard the region lock if we're still holding it, the on-disk + * reading routines don't need it. + */ + if (rlock == L_ACQUIRED) { + rlock = L_NONE; + R_UNLOCK(dbenv, &dblp->reginfo); + } + if ((ret = __log_c_ondisk( + logc, &nlsn, &last_lsn, flags, &hdr, &rp, &eof)) != 0) + goto err; + if (eof == 1) { + /* + * Only DB_NEXT automatically moves to the next file, and + * it only happens once. + */ + if (flags != DB_NEXT || nlsn.offset == 0) + return (DB_NOTFOUND); + goto next_file; + } + F_SET(logc, DB_LOG_DISK); + +cksum: /* + * Discard the region lock if we're still holding it. (The path to + * get here is that we acquired the lock because of the caller's + * flag argument, but we found the record in the cursor's buffer. + * Improbable, but it's easy to avoid. + */ + if (rlock == L_ACQUIRED) { + rlock = L_NONE; + R_UNLOCK(dbenv, &dblp->reginfo); + } + + /* + * Checksum: there are two types of errors -- a configuration error + * or a checksum mismatch. The former is always bad. The latter is + * OK if we're searching for the end of the log, and very, very bad + * if we're reading random log records. + */ + db_cipher = dbenv->crypto_handle; + if ((ret = __db_check_chksum(dbenv, db_cipher, + hdr.chksum, rp + hdr.size, hdr.len - hdr.size, is_hmac)) != 0) { + if (F_ISSET(logc, DB_LOG_SILENT_ERR)) { + if (ret == 0 || ret == -1) + ret = EIO; + } else if (ret == -1) { + __db_err(dbenv, + "DB_LOGC->get: log record checksum mismatch"); + __db_err(dbenv, + "DB_LOGC->get: catastrophic recovery may be required"); + ret = __db_panic(dbenv, DB_RUNRECOVERY); + } + goto err; + } + + /* + * If we got a 0-length record, that means we're in the midst of + * some bytes that got 0'd as the result of a vtruncate. We're + * going to have to retry. + */ + if (hdr.len == 0) { + switch (flags) { + case DB_FIRST: + case DB_NEXT: + /* Zero'd records always indicate the end of a file. */ + goto next_file; + + case DB_LAST: + case DB_PREV: + /* + * We should never get here. If we recover a log + * file with 0's at the end, we'll treat the 0'd + * headers as the end of log and ignore them. If + * we're reading backwards from another file, then + * the first record in that new file should have its + * prev field set correctly. + */ + __db_err(dbenv, + "Encountered zero length records while traversing backwards"); + DB_ASSERT(0); + case DB_SET: + default: + /* Return the 0-length record. */ + break; + } + } + + /* Copy the record into the user's DBT. */ + if ((ret = __db_retcopy(dbenv, dbt, rp + hdr.size, + (u_int32_t)(hdr.len - hdr.size), + &logc->c_dbt.data, &logc->c_dbt.ulen)) != 0) + goto err; + + if (CRYPTO_ON(dbenv)) { + if ((ret = db_cipher->decrypt(dbenv, db_cipher->data, + hdr.iv, dbt->data, hdr.len - hdr.size)) != 0) { + ret = EAGAIN; + goto err; + } + /* + * Return the original log record size to the user, + * even though we've allocated more than that, possibly. + * The log record is decrypted in the user dbt, not in + * the buffer, so we must do this here after decryption, + * not adjust the len passed to the __db_retcopy call. + */ + dbt->size = hdr.orig_size; + } + + /* Update the cursor and the returned LSN. */ + *alsn = nlsn; + logc->c_lsn = nlsn; + logc->c_len = hdr.len; + logc->c_prev = hdr.prev; + +err: if (rlock == L_ACQUIRED) + R_UNLOCK(dbenv, &dblp->reginfo); + + return (ret); +} + +/* + * __log_c_incursor -- + * Check to see if the requested record is in the cursor's buffer. + */ +static int +__log_c_incursor(logc, lsn, hdr, pp) + DB_LOGC *logc; + DB_LSN *lsn; + HDR *hdr; + u_int8_t **pp; +{ + u_int8_t *p; + + *pp = NULL; + + /* + * Test to see if the requested LSN could be part of the cursor's + * buffer. + * + * The record must be part of the same file as the cursor's buffer. + * The record must start at a byte offset equal to or greater than + * the cursor buffer. + * The record must not start at a byte offset after the cursor + * buffer's end. + */ + if (logc->bp_lsn.file != lsn->file) + return (0); + if (logc->bp_lsn.offset > lsn->offset) + return (0); + if (logc->bp_lsn.offset + logc->bp_rlen <= lsn->offset + hdr->size) + return (0); + + /* + * Read the record's header and check if the record is entirely held + * in the buffer. If the record is not entirely held, get it again. + * (The only advantage in having part of the record locally is that + * we might avoid a system call because we already have the HDR in + * memory.) + * + * If the header check fails for any reason, it must be because the + * LSN is bogus. Fail hard. + */ + p = logc->bp + (lsn->offset - logc->bp_lsn.offset); + memcpy(hdr, p, hdr->size); + if (__log_c_hdrchk(logc, hdr, NULL)) + return (DB_NOTFOUND); + if (logc->bp_lsn.offset + logc->bp_rlen <= lsn->offset + hdr->len) + return (0); + + *pp = p; /* Success. */ + + return (0); +} + +/* + * __log_c_inregion -- + * Check to see if the requested record is in the region's buffer. + */ +static int +__log_c_inregion(logc, lsn, rlockp, last_lsn, hdr, pp) + DB_LOGC *logc; + DB_LSN *lsn, *last_lsn; + RLOCK *rlockp; + HDR *hdr; + u_int8_t **pp; +{ + DB_ENV *dbenv; + DB_LOG *dblp; + LOG *lp; + size_t len, nr; + u_int32_t b_disk, b_region; + int ret; + u_int8_t *p; + + dbenv = logc->dbenv; + dblp = dbenv->lg_handle; + lp = ((DB_LOG *)logc->dbenv->lg_handle)->reginfo.primary; + + ret = 0; + *pp = NULL; + + /* If we haven't yet acquired the log region lock, do so. */ + if (*rlockp == L_NONE) { + *rlockp = L_ACQUIRED; + R_LOCK(dbenv, &dblp->reginfo); + } + + /* + * The routines to read from disk must avoid reading past the logical + * end of the log, so pass that information back to it. + * + * Since they're reading directly from the disk, they must also avoid + * reading past the offset we've written out. If the log was + * truncated, it's possible that there are zeroes or garbage on + * disk after this offset, and the logical end of the log can + * come later than this point if the log buffer isn't empty. + */ + *last_lsn = lp->lsn; + if (last_lsn->offset > lp->w_off) + last_lsn->offset = lp->w_off; + + /* + * Test to see if the requested LSN could be part of the region's + * buffer. + * + * During recovery, we read the log files getting the information to + * initialize the region. In that case, the region's lsn field will + * not yet have been filled in, use only the disk. + * + * The record must not start at a byte offset after the region buffer's + * end, since that means the request is for a record after the end of + * the log. Do this test even if the region's buffer is empty -- after + * recovery, the log files may continue past the declared end-of-log, + * and the disk reading routine will incorrectly attempt to read the + * remainder of the log. + * + * Otherwise, test to see if the region's buffer actually has what we + * want: + * + * The buffer must have some useful content. + * The record must be in the same file as the region's buffer and must + * start at a byte offset equal to or greater than the region's buffer. + */ + if (IS_ZERO_LSN(lp->lsn)) + return (0); + if (lsn->file > lp->lsn.file || + (lsn->file == lp->lsn.file && lsn->offset >= lp->lsn.offset)) + return (DB_NOTFOUND); + if (lp->b_off == 0) + return (0); + if (lsn->file < lp->f_lsn.file || lsn->offset < lp->f_lsn.offset) + return (0); + + /* + * The current contents of the cursor's buffer will be useless for a + * future call -- trash it rather than try and make it look correct. + */ + ZERO_LSN(logc->bp_lsn); + + /* + * If the requested LSN is greater than the region buffer's first + * byte, we know the entire record is in the buffer. + * + * If the header check fails for any reason, it must be because the + * LSN is bogus. Fail hard. + */ + if (lsn->offset > lp->f_lsn.offset) { + p = dblp->bufp + (lsn->offset - lp->w_off); + memcpy(hdr, p, hdr->size); + if (__log_c_hdrchk(logc, hdr, NULL)) + return (DB_NOTFOUND); + if (logc->bp_size <= hdr->len) { + len = ALIGN(hdr->len * 2, 128); + if ((ret = + __os_realloc(logc->dbenv, len, &logc->bp)) != 0) + return (ret); + logc->bp_size = (u_int32_t)len; + } + memcpy(logc->bp, p, hdr->len); + *pp = logc->bp; + return (0); + } + + /* + * There's a partial record, that is, the requested record starts + * in a log file and finishes in the region buffer. We have to + * find out how many bytes of the record are in the region buffer + * so we can copy them out into the cursor buffer. First, check + * to see if the requested record is the only record in the region + * buffer, in which case we should copy the entire region buffer. + * + * Else, walk back through the region's buffer to find the first LSN + * after the record that crosses the buffer boundary -- we can detect + * that LSN, because its "prev" field will reference the record we + * want. The bytes we need to copy from the region buffer are the + * bytes up to the record we find. The bytes we'll need to allocate + * to hold the log record are the bytes between the two offsets. + */ + b_disk = lp->w_off - lsn->offset; + if (lp->b_off <= lp->len) + b_region = (u_int32_t)lp->b_off; + else + for (p = dblp->bufp + (lp->b_off - lp->len);;) { + memcpy(hdr, p, hdr->size); + if (hdr->prev == lsn->offset) { + b_region = (u_int32_t)(p - dblp->bufp); + break; + } + p = dblp->bufp + (hdr->prev - lp->w_off); + } + + /* + * If we don't have enough room for the record, we have to allocate + * space. We have to do it while holding the region lock, which is + * truly annoying, but there's no way around it. This call is why + * we allocate cursor buffer space when allocating the cursor instead + * of waiting. + */ + if (logc->bp_size <= b_region + b_disk) { + len = ALIGN((b_region + b_disk) * 2, 128); + if ((ret = __os_realloc(logc->dbenv, len, &logc->bp)) != 0) + return (ret); + logc->bp_size = (u_int32_t)len; + } + + /* Copy the region's bytes to the end of the cursor's buffer. */ + p = (logc->bp + logc->bp_size) - b_region; + memcpy(p, dblp->bufp, b_region); + + /* Release the region lock. */ + if (*rlockp == L_ACQUIRED) { + *rlockp = L_NONE; + R_UNLOCK(dbenv, &dblp->reginfo); + } + + /* + * Read the rest of the information from disk. Neither short reads + * or EOF are acceptable, the bytes we want had better be there. + */ + if (b_disk != 0) { + p -= b_disk; + nr = b_disk; + if ((ret = __log_c_io( + logc, lsn->file, lsn->offset, p, &nr, NULL)) != 0) + return (ret); + if (nr < b_disk) + return (__log_c_shortread(logc, 0)); + } + + /* Copy the header information into the caller's structure. */ + memcpy(hdr, p, hdr->size); + + *pp = p; + return (0); +} + +/* + * __log_c_ondisk -- + * Read a record off disk. + */ +static int +__log_c_ondisk(logc, lsn, last_lsn, flags, hdr, pp, eofp) + DB_LOGC *logc; + DB_LSN *lsn, *last_lsn; + int flags, *eofp; + HDR *hdr; + u_int8_t **pp; +{ + DB_ENV *dbenv; + size_t len, nr; + u_int32_t offset; + int ret; + + dbenv = logc->dbenv; + *eofp = 0; + + nr = hdr->size; + if ((ret = + __log_c_io(logc, lsn->file, lsn->offset, hdr, &nr, eofp)) != 0) + return (ret); + if (*eofp) + return (0); + + /* If we read 0 bytes, assume we've hit EOF. */ + if (nr == 0) { + *eofp = 1; + return (0); + } + + /* Check the HDR. */ + if ((ret = __log_c_hdrchk(logc, hdr, eofp)) != 0) + return (ret); + if (*eofp) + return (0); + + /* Otherwise, we should have gotten the bytes we wanted. */ + if (nr < hdr->size) + return (__log_c_shortread(logc, 0)); + + /* + * Regardless of how we return, the previous contents of the cursor's + * buffer are useless -- trash it. + */ + ZERO_LSN(logc->bp_lsn); + + /* + * Otherwise, we now (finally!) know how big the record is. (Maybe + * we should have just stuck the length of the record into the LSN!?) + * Make sure we have enough space. + */ + if (logc->bp_size <= hdr->len) { + len = ALIGN(hdr->len * 2, 128); + if ((ret = __os_realloc(dbenv, len, &logc->bp)) != 0) + return (ret); + logc->bp_size = (u_int32_t)len; + } + + /* + * If we're moving forward in the log file, read this record in at the + * beginning of the buffer. Otherwise, read this record in at the end + * of the buffer, making sure we don't try and read before the start + * of the file. (We prefer positioning at the end because transaction + * aborts use DB_SET to move backward through the log and we might get + * lucky.) + * + * Read a buffer's worth, without reading past the logical EOF. The + * last_lsn may be a zero LSN, but that's OK, the test works anyway. + */ + if (flags == DB_FIRST || flags == DB_NEXT) + offset = lsn->offset; + else if (lsn->offset + hdr->len < logc->bp_size) + offset = 0; + else + offset = (lsn->offset + hdr->len) - logc->bp_size; + + nr = logc->bp_size; + if (lsn->file == last_lsn->file && offset + nr >= last_lsn->offset) + nr = last_lsn->offset - offset; + + if ((ret = + __log_c_io(logc, lsn->file, offset, logc->bp, &nr, eofp)) != 0) + return (ret); + + /* + * We should have at least gotten the bytes up-to-and-including the + * record we're reading. + */ + if (nr < (lsn->offset + hdr->len) - offset) + return (__log_c_shortread(logc, 1)); + + /* Set up the return information. */ + logc->bp_rlen = (u_int32_t)nr; + logc->bp_lsn.file = lsn->file; + logc->bp_lsn.offset = offset; + + *pp = logc->bp + (lsn->offset - offset); + + return (0); +} + +/* + * __log_c_hdrchk -- + * + * Check for corrupted HDRs before we use them to allocate memory or find + * records. + * + * If the log files were pre-allocated, a zero-filled HDR structure is the + * logical file end. However, we can see buffers filled with 0's during + * recovery, too (because multiple log buffers were written asynchronously, + * and one made it to disk before a different one that logically precedes + * it in the log file. + * + * XXX + * I think there's a potential pre-allocation recovery flaw here -- if we + * fail to write a buffer at the end of a log file (by scheduling its + * write asynchronously, and it never making it to disk), then succeed in + * writing a log file block to a subsequent log file, I don't think we will + * detect that the buffer of 0's should have marked the end of the log files + * during recovery. I think we may need to always write some garbage after + * each block write if we pre-allocate log files. (At the moment, we do not + * pre-allocate, so this isn't currently an issue.) + * + * Check for impossibly large records. The malloc should fail later, but we + * have customers that run mallocs that treat all allocation failures as fatal + * errors. + * + * Note that none of this is necessarily something awful happening. We let + * the application hand us any LSN they want, and it could be a pointer into + * the middle of a log record, there's no way to tell. + */ +static int +__log_c_hdrchk(logc, hdr, eofp) + DB_LOGC *logc; + HDR *hdr; + int *eofp; +{ + DB_ENV *dbenv; + int ret; + + dbenv = logc->dbenv; + + /* Sanity check the log record's size. */ + if (hdr->len <= hdr->size) + goto err; + /* + * If the cursor's max-record value isn't yet set, it means we aren't + * reading these records from a log file and no check is necessary. + */ + if (logc->bp_maxrec != 0 && hdr->len > logc->bp_maxrec) { + /* + * If we fail the check, there's the pathological case that + * we're reading the last file, it's growing, and our initial + * check information was wrong. Get it again, to be sure. + */ + if ((ret = __log_c_set_maxrec(logc, NULL)) != 0) { + __db_err(dbenv, "DB_LOGC->get: %s", db_strerror(ret)); + return (ret); + } + if (logc->bp_maxrec != 0 && hdr->len > logc->bp_maxrec) + goto err; + } + + if (eofp != NULL) { + if (hdr->prev == 0 && hdr->chksum[0] == 0 && hdr->len == 0) { + *eofp = 1; + return (0); + } + *eofp = 0; + } + return (0); + +err: if (!F_ISSET(logc, DB_LOG_SILENT_ERR)) + __db_err(dbenv, "DB_LOGC->get: invalid log record header"); + return (EIO); +} + +/* + * __log_c_io -- + * Read records from a log file. + */ +static int +__log_c_io(logc, fnum, offset, p, nrp, eofp) + DB_LOGC *logc; + u_int32_t fnum, offset; + void *p; + size_t *nrp; + int *eofp; +{ + DB_ENV *dbenv; + DB_LOG *dblp; + int ret; + char *np; + + dbenv = logc->dbenv; + dblp = dbenv->lg_handle; + + /* + * If we've switched files, discard the current file handle and acquire + * a new one. + */ + if (F_ISSET(logc->c_fh, DB_FH_VALID) && logc->bp_lsn.file != fnum) + if ((ret = __os_closehandle(dbenv, logc->c_fh)) != 0) + return (ret); + if (!F_ISSET(logc->c_fh, DB_FH_VALID)) { + if ((ret = __log_name(dblp, fnum, + &np, logc->c_fh, DB_OSO_RDONLY | DB_OSO_SEQ)) != 0) { + /* + * If we're allowed to return EOF, assume that's the + * problem, set the EOF status flag and return 0. + */ + if (eofp != NULL) { + *eofp = 1; + ret = 0; + } else if (!F_ISSET(logc, DB_LOG_SILENT_ERR)) + __db_err(dbenv, "DB_LOGC->get: %s: %s", + np, db_strerror(ret)); + __os_free(dbenv, np); + return (ret); + } + + if ((ret = __log_c_set_maxrec(logc, np)) != 0) { + __db_err(dbenv, + "DB_LOGC->get: %s: %s", np, db_strerror(ret)); + __os_free(dbenv, np); + return (ret); + } + __os_free(dbenv, np); + } + + /* Seek to the record's offset. */ + if ((ret = __os_seek(dbenv, + logc->c_fh, 0, 0, offset, 0, DB_OS_SEEK_SET)) != 0) { + if (!F_ISSET(logc, DB_LOG_SILENT_ERR)) + __db_err(dbenv, + "DB_LOGC->get: seek: %s", db_strerror(ret)); + return (ret); + } + + /* Read the data. */ + if ((ret = __os_read(dbenv, logc->c_fh, p, *nrp, nrp)) != 0) { + if (!F_ISSET(logc, DB_LOG_SILENT_ERR)) + __db_err(dbenv, + "DB_LOGC->get: read: %s", db_strerror(ret)); + return (ret); + } + + return (0); +} + +/* + * __log_c_shortread -- + * Read was short -- return a consistent error message and error. + */ +static int +__log_c_shortread(logc, silent) + DB_LOGC *logc; + int silent; +{ + if (!silent || !F_ISSET(logc, DB_LOG_SILENT_ERR)) + __db_err(logc->dbenv, "DB_LOGC->get: short read"); + return (EIO); +} + +/* + * __log_c_set_maxrec -- + * Bound the maximum log record size in a log file. + */ +static int +__log_c_set_maxrec(logc, np) + DB_LOGC *logc; + char *np; +{ + DB_ENV *dbenv; + DB_LOG *dblp; + LOG *lp; + u_int32_t mbytes, bytes; + int ret; + + dbenv = logc->dbenv; + dblp = dbenv->lg_handle; + + /* + * We don't want to try and allocate huge chunks of memory because + * applications with error-checking malloc's often consider that a + * hard failure. If we're about to look at a corrupted record with + * a bizarre size, we need to know before trying to allocate space + * to hold it. We could read the persistent data at the beginning + * of the file but that's hard -- we may have to decrypt it, checksum + * it and so on. Stat the file instead. + */ + if ((ret = + __os_ioinfo(dbenv, np, logc->c_fh, &mbytes, &bytes, NULL)) != 0) + return (ret); + + logc->bp_maxrec = mbytes * MEGABYTE + bytes; + + /* + * If reading from the log file currently being written, we could get + * an incorrect size, that is, if the cursor was opened on the file + * when it had only a few hundred bytes, and then the cursor used to + * move forward in the file, after more log records were written, the + * original stat value would be wrong. Use the maximum of the current + * log file size and the size of the buffer -- that should represent + * the max of any log record currently in the file. + * + * The log buffer size is set when the environment is opened and never + * changed, we don't need a lock on it. + */ + lp = dblp->reginfo.primary; + logc->bp_maxrec += lp->buffer_size; + + return (0); +} |