diff options
Diffstat (limited to 'src/log/log.c')
-rw-r--r-- | src/log/log.c | 182 |
1 files changed, 118 insertions, 64 deletions
diff --git a/src/log/log.c b/src/log/log.c index 3bf04d025d8..ce2d7191491 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -105,7 +105,7 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn) * LSN has moved into a later log file and there should be a * log file ready to close. */ - while (log->sync_lsn.file < min_lsn->file) { + while (log->sync_lsn.l.file < min_lsn->l.file) { WT_ERR(__wt_cond_signal(session, S2C(session)->log_file_cond)); WT_ERR(__wt_cond_wait(session, log->log_sync_cond, 10000)); @@ -116,10 +116,11 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn) * Sync the directory if the log file entry hasn't been written * into the directory. */ - if (log->sync_dir_lsn.file < min_lsn->file) { + if (log->sync_dir_lsn.l.file < min_lsn->l.file) { WT_ERR(__wt_verbose(session, WT_VERB_LOG, "log_force_sync: sync directory %s to LSN %d/%lu", - log->log_dir_fh->name, min_lsn->file, min_lsn->offset)); + log->log_dir_fh->name, + min_lsn->l.file, min_lsn->l.offset)); WT_ERR(__wt_directory_sync_fh(session, log->log_dir_fh)); log->sync_dir_lsn = *min_lsn; WT_STAT_FAST_CONN_INCR(session, log_sync_dir); @@ -130,7 +131,7 @@ __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn) if (__wt_log_cmp(&log->sync_lsn, min_lsn) < 0) { WT_ERR(__wt_verbose(session, WT_VERB_LOG, "log_force_sync: sync %s to LSN %d/%lu", - log->log_fh->name, min_lsn->file, min_lsn->offset)); + log->log_fh->name, min_lsn->l.file, min_lsn->l.offset)); WT_ERR(__wt_fsync(session, log->log_fh)); log->sync_lsn = *min_lsn; WT_STAT_FAST_CONN_INCR(session, log_sync); @@ -174,7 +175,7 @@ __wt_log_needs_recovery(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn, bool *recp) * we can skip recovery. */ WT_RET(__wt_curlog_open(session, "log:", NULL, &c)); - c->set_key(c, ckp_lsn->file, ckp_lsn->offset, 0); + c->set_key(c, ckp_lsn->l.file, ckp_lsn->l.offset, 0); if ((ret = c->search(c)) == 0) { while ((ret = c->next(c)) == 0) { /* @@ -278,7 +279,7 @@ __wt_log_get_all_files(WT_SESSION_IMPL *session, /* Filter out any files that are below the checkpoint LSN. */ for (max = 0, i = 0; i < count; ) { WT_ERR(__wt_log_extract_lognum(session, files[i], &id)); - if (active_only && id < log->ckpt_lsn.file) { + if (active_only && id < log->ckpt_lsn.l.file) { __wt_free(session, files[i]); files[i] = files[count - 1]; files[--count] = NULL; @@ -459,8 +460,8 @@ __log_size_fit(WT_SESSION_IMPL *session, WT_LSN *lsn, uint64_t recsize) conn = S2C(session); log = conn->log; - return (lsn->offset == WT_LOG_FIRST_RECORD || - lsn->offset + (wt_off_t)recsize < conn->log_file_max); + return (lsn->l.offset == WT_LOG_FIRST_RECORD || + lsn->l.offset + (wt_off_t)recsize < conn->log_file_max); } /* @@ -536,9 +537,7 @@ __log_fill(WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bool force, WT_ITEM *record, WT_LSN *lsnp) { WT_DECL_RET; - WT_LOG_RECORD *logrec; - logrec = (WT_LOG_RECORD *)record->mem; /* * Call __wt_write or copy into the buffer. For now the offset is the * real byte offset. If the offset becomes a unit of WT_LOG_ALIGN this @@ -547,19 +546,19 @@ __log_fill(WT_SESSION_IMPL *session, */ if (!force && !F_ISSET(myslot, WT_MYSLOT_UNBUFFERED)) memcpy((char *)myslot->slot->slot_buf.mem + myslot->offset, - logrec, logrec->len); + record->mem, record->size); else /* * If this is a force or unbuffered write, write it now. */ WT_ERR(__wt_write(session, myslot->slot->slot_fh, myslot->offset + myslot->slot->slot_start_offset, - (size_t)logrec->len, (void *)logrec)); + record->size, record->mem)); - WT_STAT_FAST_CONN_INCRV(session, log_bytes_written, logrec->len); + WT_STAT_FAST_CONN_INCRV(session, log_bytes_written, record->size); if (lsnp != NULL) { *lsnp = myslot->slot->slot_start_lsn; - lsnp->offset += (wt_off_t)myslot->offset; + lsnp->l.offset += (uint32_t)myslot->offset; } err: if (ret != 0 && myslot->slot->slot_error == 0) @@ -596,19 +595,31 @@ __log_file_header( WT_ASSERT(session, sizeof(WT_LOG_DESC) < log->allocsize); WT_RET(__wt_scr_alloc(session, log->allocsize, &buf)); memset(buf->mem, 0, log->allocsize); + buf->size = log->allocsize; + logrec = (WT_LOG_RECORD *)buf->mem; desc = (WT_LOG_DESC *)logrec->record; desc->log_magic = WT_LOG_MAGIC; desc->majorv = WT_LOG_MAJOR_VERSION; desc->minorv = WT_LOG_MINOR_VERSION; desc->log_size = (uint64_t)conn->log_file_max; + __wt_log_desc_byteswap(desc); /* * Now that the record is set up, initialize the record header. + * + * Checksum a little-endian version of the header, and write everything + * in little-endian format. The checksum is (potentially) returned in a + * big-endian format, swap it into place in a separate step. */ logrec->len = log->allocsize; logrec->checksum = 0; + __wt_log_record_byteswap(logrec); logrec->checksum = __wt_cksum(logrec, log->allocsize); +#ifdef WORDS_BIGENDIAN + logrec->checksum = __wt_bswap32(logrec->checksum); +#endif + WT_CLEAR(tmp); memset(&myslot, 0, sizeof(myslot)); myslot.slot = &tmp; @@ -624,7 +635,7 @@ __log_file_header( tmp.slot_fh = fh; } else { WT_ASSERT(session, fh == NULL); - WT_ERR(__wt_log_acquire(session, logrec->len, &tmp)); + WT_ERR(__wt_log_acquire(session, log->allocsize, &tmp)); } WT_ERR(__log_fill(session, &myslot, true, buf, NULL)); /* @@ -673,7 +684,9 @@ __log_openfile(WT_SESSION_IMPL *session, memset(buf->mem, 0, allocsize); WT_ERR(__wt_read(session, *fh, 0, allocsize, buf->mem)); logrec = (WT_LOG_RECORD *)buf->mem; + __wt_log_record_byteswap(logrec); desc = (WT_LOG_DESC *)logrec->record; + __wt_log_desc_byteswap(desc); if (desc->log_magic != WT_LOG_MAGIC) WT_PANIC_RET(session, WT_ERROR, "log file %s corrupted: Bad magic number %" PRIu32, @@ -829,8 +842,7 @@ __log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created) * We need to setup the LSNs. Set the end LSN and alloc LSN to * the end of the header. */ - log->alloc_lsn.file = log->fileid; - log->alloc_lsn.offset = WT_LOG_FIRST_RECORD; + WT_SET_LSN(&log->alloc_lsn, log->fileid, WT_LOG_FIRST_RECORD); end_lsn = log->alloc_lsn; /* @@ -890,7 +902,7 @@ __wt_log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot) * Pre-allocate on the first real write into the log file, if it * was just created (i.e. not pre-allocated). */ - if (log->alloc_lsn.offset == WT_LOG_FIRST_RECORD && created_log) + if (log->alloc_lsn.l.offset == WT_LOG_FIRST_RECORD && created_log) WT_RET(__log_prealloc(session, log->log_fh)); /* * Initialize the slot for activation. @@ -931,8 +943,9 @@ __log_truncate(WT_SESSION_IMPL *session, /* * Truncate the log file to the given LSN. */ - WT_ERR(__log_openfile(session, false, &log_fh, file_prefix, lsn->file)); - WT_ERR(__wt_ftruncate(session, log_fh, lsn->offset)); + WT_ERR(__log_openfile(session, + false, &log_fh, file_prefix, lsn->l.file)); + WT_ERR(__wt_ftruncate(session, log_fh, lsn->l.offset)); WT_ERR(__wt_fsync(session, log_fh)); WT_ERR(__wt_close(session, &log_fh)); @@ -946,7 +959,8 @@ __log_truncate(WT_SESSION_IMPL *session, WT_LOG_FILENAME, &logfiles, &logcount)); for (i = 0; i < logcount; i++) { WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); - if (lognum > lsn->file && lognum < log->trunc_lsn.file) { + if (lognum > lsn->l.file && + lognum < log->trunc_lsn.l.file) { WT_ERR(__log_openfile(session, false, &log_fh, file_prefix, lognum)); /* @@ -1111,10 +1125,8 @@ __wt_log_open(WT_SESSION_IMPL *session) if (firstlog == UINT32_MAX) { WT_ASSERT(session, logcount == 0); WT_INIT_LSN(&log->first_lsn); - } else { - log->first_lsn.file = firstlog; - log->first_lsn.offset = 0; - } + } else + WT_SET_LSN(&log->first_lsn, firstlog, 0); /* * Start logging at the beginning of the next log file, no matter @@ -1346,7 +1358,7 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) * sync operations. The most recent one will set the LSN to the * beginning of our file. */ - if (log->sync_lsn.file < slot->slot_end_lsn.file || + if (log->sync_lsn.l.file < slot->slot_end_lsn.l.file || __wt_spin_trylock(session, &log->log_sync_lock) != 0) { WT_ERR(__wt_cond_wait( session, log->log_sync_cond, 10000)); @@ -1366,12 +1378,12 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) * now if needed. */ if (F_ISSET(slot, WT_SLOT_SYNC_DIR) && - (log->sync_dir_lsn.file < sync_lsn.file)) { + (log->sync_dir_lsn.l.file < sync_lsn.l.file)) { WT_ASSERT(session, log->log_dir_fh != NULL); WT_ERR(__wt_verbose(session, WT_VERB_LOG, - "log_release: sync directory %s to LSN %d/%lu", + "log_release: sync directory %s to LSN %u/%lu", log->log_dir_fh->name, - sync_lsn.file, sync_lsn.offset)); + sync_lsn.l.file, sync_lsn.l.offset)); WT_ERR(__wt_directory_sync_fh( session, log->log_dir_fh)); log->sync_dir_lsn = sync_lsn; @@ -1384,8 +1396,9 @@ __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) if (F_ISSET(slot, WT_SLOT_SYNC) && __wt_log_cmp(&log->sync_lsn, &slot->slot_end_lsn) < 0) { WT_ERR(__wt_verbose(session, WT_VERB_LOG, - "log_release: sync log %s to LSN %d/%lu", - log->log_fh->name, sync_lsn.file, sync_lsn.offset)); + "log_release: sync log %s to LSN %u/%lu", + log->log_fh->name, + sync_lsn.l.file, sync_lsn.l.offset)); WT_STAT_FAST_CONN_INCR(session, log_sync); WT_ERR(__wt_fsync(session, log->log_fh)); log->sync_lsn = sync_lsn; @@ -1426,10 +1439,11 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, WT_LOG_RECORD *logrec; WT_LSN end_lsn, next_lsn, rd_lsn, start_lsn; wt_off_t log_size; - uint32_t allocsize, cksum, firstlog, lastlog, lognum, rdup_len, reclen; + uint32_t allocsize, firstlog, lastlog, lognum, rdup_len, reclen; + uint32_t cksum_calculate, cksum_tmp; u_int i, logcount; int firstrecord; - bool eol; + bool eol, partial_record; char **logfiles; conn = S2C(session); @@ -1449,8 +1463,8 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, if (LF_ISSET(WT_LOGSCAN_RECOVER)) WT_RET(__wt_verbose(session, WT_VERB_LOG, - "__wt_log_scan truncating to %u/%" PRIuMAX, - log->trunc_lsn.file, (uintmax_t)log->trunc_lsn.offset)); + "__wt_log_scan truncating to %u/%u", + log->trunc_lsn.l.file, log->trunc_lsn.l.offset)); if (log != NULL) { allocsize = log->allocsize; @@ -1468,8 +1482,8 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, "choose either a start LSN or a start flag"); /* Offsets must be on allocation boundaries. */ - if (lsnp->offset % allocsize != 0 || - lsnp->file > log->fileid) + if (lsnp->l.offset % allocsize != 0 || + lsnp->l.file > log->fileid) return (WT_NOTFOUND); /* @@ -1509,14 +1523,13 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, lastlog = WT_MAX(lastlog, lognum); firstlog = WT_MIN(firstlog, lognum); } - start_lsn.file = firstlog; - end_lsn.file = lastlog; - start_lsn.offset = end_lsn.offset = 0; + WT_SET_LSN(&start_lsn, firstlog, 0); + WT_SET_LSN(&end_lsn, lastlog, 0); __wt_log_files_free(session, logfiles, logcount); logfiles = NULL; } WT_ERR(__log_openfile( - session, false, &log_fh, WT_LOG_FILENAME, start_lsn.file)); + session, false, &log_fh, WT_LOG_FILENAME, start_lsn.l.file)); WT_ERR(__wt_filesize(session, log_fh, &log_size)); rd_lsn = start_lsn; @@ -1524,8 +1537,17 @@ __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *lsnp, uint32_t flags, WT_ERR(__wt_scr_alloc(session, 0, &decryptitem)); WT_ERR(__wt_scr_alloc(session, 0, &uncitem)); for (;;) { - if (rd_lsn.offset + allocsize > log_size) { + if (rd_lsn.l.offset + allocsize > log_size) { advance: + if (rd_lsn.l.offset == log_size) + partial_record = false; + else + /* + * See if there is anything non-zero at the + * end of this log file. + */ + WT_ERR(__log_has_hole(session, log_fh, + rd_lsn.l.offset, &partial_record)); /* * If we read the last record, go to the next file. */ @@ -1538,16 +1560,24 @@ advance: if (LF_ISSET(WT_LOGSCAN_RECOVER)) WT_ERR(__log_truncate(session, &rd_lsn, WT_LOG_FILENAME, 1)); - rd_lsn.file++; - rd_lsn.offset = 0; + /* + * If we had a partial record, we'll want to break + * now after closing and truncating. Although for now + * log_truncate does not modify the LSN passed in, + * this code does not assume it is unmodified after that + * call which is why it uses the boolean set earlier. + */ + if (partial_record) + break; + WT_SET_LSN(&rd_lsn, rd_lsn.l.file + 1, 0); /* * Avoid an error message when we reach end of log * by checking here. */ - if (rd_lsn.file > end_lsn.file) + if (rd_lsn.l.file > end_lsn.l.file) break; WT_ERR(__log_openfile(session, - false, &log_fh, WT_LOG_FILENAME, rd_lsn.file)); + false, &log_fh, WT_LOG_FILENAME, rd_lsn.l.file)); WT_ERR(__wt_filesize(session, log_fh, &log_size)); eol = false; continue; @@ -1557,14 +1587,16 @@ advance: */ WT_ASSERT(session, buf->memsize >= allocsize); WT_ERR(__wt_read(session, - log_fh, rd_lsn.offset, (size_t)allocsize, buf->mem)); + log_fh, rd_lsn.l.offset, (size_t)allocsize, buf->mem)); /* - * First 4 bytes is the real record length. See if we - * need to read more than the allocation size. We expect - * that we rarely will have to read more. Most log records - * will be fairly small. + * See if we need to read more than the allocation size. We + * expect that we rarely will have to read more. Most log + * records will be fairly small. */ - reclen = *(uint32_t *)buf->mem; + reclen = ((WT_LOG_RECORD *)buf->mem)->len; +#ifdef WORDS_BIGENDIAN + reclen = __wt_bswap32(reclen); +#endif /* * Log files are pre-allocated. We need to detect the * difference between a hole in the file (where this location @@ -1578,7 +1610,7 @@ advance: */ if (reclen == 0) { WT_ERR(__log_has_hole( - session, log_fh, rd_lsn.offset, &eol)); + session, log_fh, rd_lsn.l.offset, &eol)); if (eol) /* Found a hole. This LSN is the end. */ break; @@ -1590,28 +1622,40 @@ advance: if (reclen > allocsize) { /* * The log file end could be the middle of this - * log record. + * log record. If we have a partially written record + * then this is considered the end of the log. */ - if (rd_lsn.offset + rdup_len > log_size) - goto advance; + if (rd_lsn.l.offset + rdup_len > log_size) { + eol = true; + break; + } /* * We need to round up and read in the full padded * record, especially for direct I/O. */ WT_ERR(__wt_buf_grow(session, buf, rdup_len)); - WT_ERR(__wt_read(session, - log_fh, rd_lsn.offset, (size_t)rdup_len, buf->mem)); + WT_ERR(__wt_read(session, log_fh, + rd_lsn.l.offset, (size_t)rdup_len, buf->mem)); WT_STAT_FAST_CONN_INCR(session, log_scan_rereads); } /* * We read in the record, verify checksum. + * + * Handle little- and big-endian objects. Objects are written + * in little-endian format: save the header checksum, and + * calculate the checksum for the header in its little-endian + * form. Then, restore the header's checksum, and byte-swap + * the whole thing as necessary, leaving us with a calculated + * checksum that should match the checksum in the header. */ buf->size = reclen; logrec = (WT_LOG_RECORD *)buf->mem; - cksum = logrec->checksum; + cksum_tmp = logrec->checksum; logrec->checksum = 0; - logrec->checksum = __wt_cksum(logrec, logrec->len); - if (logrec->checksum != cksum) { + cksum_calculate = __wt_cksum(logrec, reclen); + logrec->checksum = cksum_tmp; + __wt_log_record_byteswap(logrec); + if (logrec->checksum != cksum_calculate) { /* * A checksum mismatch means we have reached the end of * the useful part of the log. This should be found on @@ -1636,8 +1680,8 @@ advance: */ WT_STAT_FAST_CONN_INCR(session, log_scan_records); next_lsn = rd_lsn; - next_lsn.offset += (wt_off_t)rdup_len; - if (rd_lsn.offset != 0) { + next_lsn.l.offset += rdup_len; + if (rd_lsn.l.offset != 0) { /* * We need to manage the different buffers here. * Buf is the buffer this function uses to read from @@ -1890,10 +1934,19 @@ __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, rdup_len - record->size); record->size = rdup_len; } + /* + * Checksum a little-endian version of the header, and write everything + * in little-endian format. The checksum is (potentially) returned in a + * big-endian format, swap it into place in a separate step. + */ logrec = (WT_LOG_RECORD *)record->mem; logrec->len = (uint32_t)record->size; logrec->checksum = 0; + __wt_log_record_byteswap(logrec); logrec->checksum = __wt_cksum(logrec, record->size); +#ifdef WORDS_BIGENDIAN + logrec->checksum = __wt_bswap32(logrec->checksum); +#endif WT_STAT_FAST_CONN_INCR(session, log_writes); @@ -2061,7 +2114,8 @@ __wt_log_flush(WT_SESSION_IMPL *session, uint32_t flags) WT_RET(__wt_log_flush_lsn(session, &lsn, false)); WT_RET(__wt_verbose(session, WT_VERB_LOG, - "log_flush: flags %d LSN %d/%lu", flags, lsn.file, lsn.offset)); + "log_flush: flags %d LSN %u/%lu", + flags, lsn.l.file, lsn.l.offset)); /* * If the user wants write-no-sync, there is nothing more to do. * If the user wants background sync, set the LSN and we're done. |