/*- * Copyright (c) 2014-present MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * * See the file LICENSE for redistribution information. */ #include "wt_internal.h" static int __log_newfile(WT_SESSION_IMPL *, bool, bool *); static int __log_openfile(WT_SESSION_IMPL *, uint32_t, uint32_t, WT_FH **); static int __log_truncate(WT_SESSION_IMPL *, WT_LSN *, bool, bool); static int __log_write_internal(WT_SESSION_IMPL *, WT_ITEM *, WT_LSN *, uint32_t); #define WT_LOG_COMPRESS_SKIP (offsetof(WT_LOG_RECORD, record)) #define WT_LOG_ENCRYPT_SKIP (offsetof(WT_LOG_RECORD, record)) /* AUTOMATIC FLAG VALUE GENERATION START 0 */ #define WT_LOG_OPEN_CREATE_OK 0x1u /* Flag to __log_openfile() */ /* AUTOMATIC FLAG VALUE GENERATION STOP 32 */ /* * __wt_log_printf -- * Write a text message to the log. */ int __wt_log_printf(WT_SESSION_IMPL *session, const char *format, ...) { WT_DECL_RET; va_list ap; va_start(ap, format); ret = __wt_log_vprintf(session, format, ap); va_end(ap); return (ret); } /* * __log_checksum_match -- * Given a log record, return whether the checksum matches. */ static bool __log_checksum_match(WT_ITEM *buf, uint32_t reclen) { WT_LOG_RECORD *logrec; uint32_t checksum_saved, checksum_tmp; bool checksum_matched; logrec = buf->mem; checksum_saved = checksum_tmp = logrec->checksum; #ifdef WORDS_BIGENDIAN checksum_tmp = __wt_bswap32(checksum_tmp); #endif logrec->checksum = 0; checksum_matched = __wt_checksum_match(logrec, reclen, checksum_tmp); logrec->checksum = checksum_saved; return (checksum_matched); } /* * __log_get_files -- * Retrieve the list of all log-related files of the given prefix type. */ static int __log_get_files(WT_SESSION_IMPL *session, const char *file_prefix, char ***filesp, u_int *countp) { WT_CONNECTION_IMPL *conn; const char *log_path; *countp = 0; *filesp = NULL; conn = S2C(session); log_path = conn->log_path; if (log_path == NULL) log_path = ""; return (__wt_fs_directory_list(session, log_path, file_prefix, filesp, countp)); } /* * __log_get_files_single -- * Retrieve a single log-related file of the given prefix type. */ static int __log_get_files_single( WT_SESSION_IMPL *session, const char *file_prefix, char ***filesp, u_int *countp) { WT_CONNECTION_IMPL *conn; const char *log_path; *countp = 0; *filesp = NULL; conn = S2C(session); log_path = conn->log_path; if (log_path == NULL) log_path = ""; return (__wt_fs_directory_list_single(session, log_path, file_prefix, filesp, countp)); } /* * __log_prealloc_remove -- * Remove all previously created pre-allocated files. */ static int __log_prealloc_remove(WT_SESSION_IMPL *session) { WT_DECL_RET; WT_LOG *log; uint32_t lognum; u_int i, logcount; char **logfiles; logfiles = NULL; logcount = 0; log = S2C(session)->log; __wt_spin_lock(session, &log->log_fs_lock); /* * Clean up any old interim pre-allocated files. We clean up these files because settings may * have changed upon reboot and we want those settings to take effect right away. */ WT_ERR(__log_get_files(session, WT_LOG_TMPNAME, &logfiles, &logcount)); for (i = 0; i < logcount; i++) { WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); WT_ERR(__wt_log_remove(session, WT_LOG_TMPNAME, lognum)); } WT_ERR(__wt_fs_directory_list_free(session, &logfiles, logcount)); WT_ERR(__log_get_files(session, WT_LOG_PREPNAME, &logfiles, &logcount)); for (i = 0; i < logcount; i++) { WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); WT_ERR(__wt_log_remove(session, WT_LOG_PREPNAME, lognum)); } err: WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount)); __wt_spin_unlock(session, &log->log_fs_lock); return (ret); } /* * __log_wait_for_earlier_slot -- * Wait for write_lsn to catch up to this slot. */ static void __log_wait_for_earlier_slot(WT_SESSION_IMPL *session, WT_LOGSLOT *slot) { WT_CONNECTION_IMPL *conn; WT_LOG *log; int yield_count; conn = S2C(session); log = conn->log; yield_count = 0; while (__wt_log_cmp(&log->write_lsn, &slot->slot_release_lsn) != 0) { /* * If we're on a locked path and the write LSN is not advancing, unlock in case an earlier * thread is trying to switch its slot and complete its operation. */ if (FLD_ISSET(session->lock_flags, WT_SESSION_LOCKED_SLOT)) __wt_spin_unlock(session, &log->log_slot_lock); /* * This may not be initialized if we are starting at an older log file version. So only * signal if valid. */ if (conn->log_wrlsn_cond != NULL) __wt_cond_signal(session, conn->log_wrlsn_cond); if (++yield_count < WT_THOUSAND) __wt_yield(); else __wt_cond_wait(session, log->log_write_cond, 200, NULL); if (FLD_ISSET(session->lock_flags, WT_SESSION_LOCKED_SLOT)) __wt_spin_lock(session, &log->log_slot_lock); } } /* * __log_fs_read -- * Wrapper when reading from a log file. */ static int __log_fs_read(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t offset, size_t len, void *buf) { WT_DECL_RET; __wt_capacity_throttle(session, len, WT_THROTTLE_LOG); if ((ret = __wt_read(session, fh, offset, len, buf)) != 0) WT_RET_MSG(session, ret, "%s: log read failure", fh->name); return (ret); } /* * __log_fs_write -- * Wrapper when writing to a log file. If we're writing to a new log file for the first time * wait for writes to the previous log file. */ static int __log_fs_write( WT_SESSION_IMPL *session, WT_LOGSLOT *slot, wt_off_t offset, size_t len, const void *buf) { WT_DECL_RET; /* * If we're writing into a new log file and we're running in compatibility mode to an older * release, we have to wait for all writes to the previous log file to complete otherwise there * could be a hole at the end of the previous log file that we cannot detect. * * NOTE: Check for a version less than the one writing the system record since we've had a log * version change without any actual file format changes. */ if (S2C(session)->log->log_version < WT_LOG_VERSION_SYSTEM && slot->slot_release_lsn.l.file < slot->slot_start_lsn.l.file) { __log_wait_for_earlier_slot(session, slot); WT_RET(__wt_log_force_sync(session, &slot->slot_release_lsn)); } __wt_capacity_throttle(session, len, WT_THROTTLE_LOG); if ((ret = __wt_write(session, slot->slot_fh, offset, len, buf)) != 0) WT_RET_PANIC(session, ret, "%s: fatal log failure", slot->slot_fh->name); return (ret); } /* * __wt_log_ckpt -- * Record the given LSN as the checkpoint LSN and signal the archive thread as needed. */ void __wt_log_ckpt(WT_SESSION_IMPL *session, WT_LSN *ckpt_lsn) { WT_CONNECTION_IMPL *conn; WT_LOG *log; int i; conn = S2C(session); log = conn->log; WT_ASSIGN_LSN(&log->ckpt_lsn, ckpt_lsn); if (conn->log_cond != NULL) __wt_cond_signal(session, conn->log_cond); /* * If we are storing debugging LSNs to retain additional log files from archiving, then rotate * the newest LSN into the array. */ if (conn->debug_ckpt_cnt != 0) { for (i = (int)conn->debug_ckpt_cnt - 1; i > 0; --i) conn->debug_ckpt[i] = conn->debug_ckpt[i - 1]; conn->debug_ckpt[0] = *ckpt_lsn; } } /* * __wt_log_flush_lsn -- * Force out buffered records and return the LSN, either the write_start_lsn or write_lsn * depending on the argument. */ int __wt_log_flush_lsn(WT_SESSION_IMPL *session, WT_LSN *lsn, bool start) { WT_CONNECTION_IMPL *conn; WT_LOG *log; conn = S2C(session); log = conn->log; WT_RET(__wt_log_force_write(session, 1, NULL)); __wt_log_wrlsn(session, NULL); if (start) WT_ASSIGN_LSN(lsn, &log->write_start_lsn); else WT_ASSIGN_LSN(lsn, &log->write_lsn); return (0); } /* * __wt_log_force_sync -- * Force a sync of the log and files. */ int __wt_log_force_sync(WT_SESSION_IMPL *session, WT_LSN *min_lsn) { WT_DECL_RET; WT_FH *log_fh; WT_LOG *log; uint64_t fsync_duration_usecs, time_start, time_stop; log = S2C(session)->log; log_fh = NULL; /* * We need to wait for the previous log file to get written to disk before we sync out the * current one and advance the LSN. Signal the worker thread because we know the LSN has moved * into a later log file and there should be a log file ready to close. */ while (log->sync_lsn.l.file < min_lsn->l.file) { __wt_cond_signal(session, S2C(session)->log_file_cond); __wt_cond_wait(session, log->log_sync_cond, 10000, NULL); } __wt_spin_lock(session, &log->log_sync_lock); WT_ASSERT(session, log->log_dir_fh != NULL); /* * Sync the directory if the log file entry hasn't been written into the directory. */ if (log->sync_dir_lsn.l.file < min_lsn->l.file) { __wt_verbose(session, WT_VERB_LOG, "log_force_sync: sync directory %s to LSN %" PRIu32 "/%" PRIu32, log->log_dir_fh->name, min_lsn->l.file, min_lsn->l.offset); time_start = __wt_clock(session); WT_ERR(__wt_fsync(session, log->log_dir_fh, true)); time_stop = __wt_clock(session); fsync_duration_usecs = WT_CLOCKDIFF_US(time_stop, time_start); WT_ASSIGN_LSN(&log->sync_dir_lsn, min_lsn); WT_STAT_CONN_INCR(session, log_sync_dir); WT_STAT_CONN_INCRV(session, log_sync_dir_duration, fsync_duration_usecs); } /* * Sync the log file if needed. */ if (__wt_log_cmp(&log->sync_lsn, min_lsn) < 0) { /* * Get our own file handle to the log file. It is possible for the file handle in the log * structure to change out from under us and either be NULL or point to a different file * than we want. */ WT_ERR(__log_openfile(session, min_lsn->l.file, 0, &log_fh)); __wt_verbose(session, WT_VERB_LOG, "log_force_sync: sync %s to LSN %" PRIu32 "/%" PRIu32, log_fh->name, min_lsn->l.file, min_lsn->l.offset); time_start = __wt_clock(session); WT_ERR(__wt_fsync(session, log_fh, true)); time_stop = __wt_clock(session); fsync_duration_usecs = WT_CLOCKDIFF_US(time_stop, time_start); WT_ASSIGN_LSN(&log->sync_lsn, min_lsn); WT_STAT_CONN_INCR(session, log_sync); WT_STAT_CONN_INCRV(session, log_sync_duration, fsync_duration_usecs); __wt_cond_signal(session, log->log_sync_cond); } err: __wt_spin_unlock(session, &log->log_sync_lock); if (log_fh != NULL) WT_TRET(__wt_close(session, &log_fh)); return (ret); } /* * __wt_log_needs_recovery -- * Return 0 if we encounter a clean shutdown and 1 if recovery must be run in the given * variable. */ int __wt_log_needs_recovery(WT_SESSION_IMPL *session, WT_LSN *ckp_lsn, bool *recp) { WT_CONNECTION_IMPL *conn; WT_CURSOR *c; WT_DECL_RET; WT_ITEM dummy_key, dummy_value; WT_LOG *log; uint64_t dummy_txnid; uint32_t dummy_fileid, dummy_optype, rectype; /* * Default is to run recovery always (regardless of whether this connection has logging * enabled). */ *recp = true; conn = S2C(session); log = conn->log; if (log == NULL) return (0); /* * See if there are any data modification records between the checkpoint LSN and the end of the * log. If there are none then we can skip recovery. */ WT_RET(__wt_curlog_open(session, "log:", NULL, &c)); c->set_key(c, ckp_lsn->l.file, ckp_lsn->l.offset, 0); if ((ret = c->search(c)) == 0) { while ((ret = c->next(c)) == 0) { /* * The only thing we care about is the rectype. */ WT_ERR(c->get_value( c, &dummy_txnid, &rectype, &dummy_optype, &dummy_fileid, &dummy_key, &dummy_value)); if (rectype == WT_LOGREC_COMMIT) break; } /* * If we get to the end of the log, we can skip recovery. */ if (ret == WT_NOTFOUND) { *recp = false; ret = 0; } } else if (ret == WT_NOTFOUND) /* * We should always find the checkpoint LSN as it now points to the beginning of a written * log record. But if we're running recovery on an earlier database we may not. In that * case, we need to run recovery, don't return an error. */ ret = 0; else WT_ERR(ret); err: WT_TRET(c->close(c)); return (ret); } /* * __wt_log_written_reset -- * Interface to reset the amount of log written during this checkpoint period. Called from the * checkpoint code. */ void __wt_log_written_reset(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; conn = S2C(session); if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) conn->log->log_written = 0; } /* * __wt_log_get_backup_files -- * Retrieve the list of log files for taking a backup, either all of them or only the active * ones (those that are not candidates for archiving). The caller is responsible for freeing the * directory list returned. */ int __wt_log_get_backup_files( WT_SESSION_IMPL *session, char ***filesp, u_int *countp, uint32_t *maxid, bool active_only) { WT_DECL_RET; WT_LOG *log; uint32_t id, max, max_file, min_file; u_int count, i; char **files; *filesp = NULL; *countp = 0; *maxid = 0; id = 0; log = S2C(session)->log; /* * Capture the next file utilized for writing to the log, before forcing a new log file. This * represents the latest journal file that needs to be copied. Note the checkpoint selected for * backup may be writing to an even later log file. In that case, copying the journal files is * correct, but wasteful. */ max_file = log->alloc_lsn.l.file; /* * Capture the journal file the current checkpoint started in. The current checkpoint or a later * one may be selected for backing up, requiring log files as early as this file. Together with * max_file, this defines the range of journal files to include. */ min_file = log->ckpt_lsn.l.file; /* * Force the current slot to get written to the file. Also switch to using a new log file. That * log file will be removed from the list of files returned. New writes will not be included in * the backup. */ F_SET(log, WT_LOG_FORCE_NEWFILE); WT_RET(__wt_log_force_write(session, 1, NULL)); WT_RET(__log_get_files(session, WT_LOG_FILENAME, &files, &count)); for (max = 0, i = 0; i < count;) { WT_ERR(__wt_log_extract_lognum(session, files[i], &id)); if ((active_only && id < min_file) || id > max_file) { /* * Any files not being returned are individually freed and the array adjusted. */ __wt_free(session, files[i]); files[i] = files[count - 1]; files[--count] = NULL; } else { if (id > max) max = id; i++; } } *maxid = max; *filesp = files; *countp = count; /* * Only free on error. The caller is responsible for calling free once it is done using the * returned list. */ if (0) { err: WT_TRET(__wt_fs_directory_list_free(session, &files, count)); } return (ret); } /* * __wt_log_filename -- * Given a log number, return a WT_ITEM of a generated log file name of the given prefix type. */ int __wt_log_filename(WT_SESSION_IMPL *session, uint32_t id, const char *file_prefix, WT_ITEM *buf) { return ( __wt_filename_construct(session, S2C(session)->log_path, file_prefix, UINTMAX_MAX, id, buf)); } /* * __wt_log_extract_lognum -- * Given a log file name, extract out the log number. */ int __wt_log_extract_lognum(WT_SESSION_IMPL *session, const char *name, uint32_t *id) { const char *p; if (id == NULL || name == NULL) WT_RET_MSG(session, EINVAL, "unexpected usage: no id or no name"); if ((p = strrchr(name, '.')) == NULL || /* NOLINTNEXTLINE(cert-err34-c) */ sscanf(++p, "%" SCNu32, id) != 1) WT_RET_MSG(session, WT_ERROR, "Bad log file name '%s'", name); return (0); } /* * __wt_log_reset -- * Reset the existing log file to after the given file number. Called from recovery when * toggling logging back on, it was off the previous open but it was on earlier before that * toggle. */ int __wt_log_reset(WT_SESSION_IMPL *session, uint32_t lognum) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LOG *log; uint32_t old_lognum; u_int i, logcount; char **logfiles; conn = S2C(session); log = conn->log; if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED) || log->fileid > lognum) return (0); WT_ASSERT(session, F_ISSET(conn, WT_CONN_RECOVERING)); WT_ASSERT(session, !F_ISSET(conn, WT_CONN_READONLY)); /* * We know we're single threaded and called from recovery only when toggling logging back on. * Therefore the only log files we have are old and outdated and the new one created when * logging opened before recovery. We have to remove all old log files first and then create the * new one so that log file numbers are contiguous in the file system. */ WT_RET(__wt_close(session, &log->log_fh)); WT_RET(__log_get_files(session, WT_LOG_FILENAME, &logfiles, &logcount)); for (i = 0; i < logcount; i++) { WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &old_lognum)); WT_ASSERT(session, old_lognum < lognum || lognum == 1); WT_ERR(__wt_log_remove(session, WT_LOG_FILENAME, old_lognum)); } log->fileid = lognum; /* Send in true to update connection creation LSNs. */ WT_WITH_SLOT_LOCK(session, log, ret = __log_newfile(session, true, NULL)); WT_ERR(__wt_log_slot_init(session, false)); err: WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount)); return (ret); } /* * __log_prealloc -- * Pre-allocate a log file. */ static int __log_prealloc(WT_SESSION_IMPL *session, WT_FH *fh) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LOG *log; conn = S2C(session); log = conn->log; /* * If the user configured zero filling, pre-allocate the log file manually. Otherwise use the * file extension method to create and zero the log file based on what is available. */ if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ZERO_FILL)) return (__wt_file_zero(session, fh, log->first_record, conn->log_file_max)); /* If configured to not extend the file, we're done. */ if (conn->log_extend_len == 0) return (0); /* * We have exclusive access to the log file and there are no other writes happening * concurrently, so there are no locking issues. */ ret = __wt_fextend(session, fh, conn->log_extend_len); return (ret == EBUSY || ret == ENOTSUP ? 0 : ret); } /* * __log_size_fit -- * Return whether or not recsize will fit in the log file. */ static int __log_size_fit(WT_SESSION_IMPL *session, WT_LSN *lsn, uint64_t recsize) { WT_CONNECTION_IMPL *conn; WT_LOG *log; conn = S2C(session); log = conn->log; return ( lsn->l.offset == log->first_record || lsn->l.offset + (wt_off_t)recsize < conn->log_file_max); } /* * __log_decompress -- * Decompress a log record. */ static int __log_decompress(WT_SESSION_IMPL *session, WT_ITEM *in, WT_ITEM *out) { WT_COMPRESSOR *compressor; WT_CONNECTION_IMPL *conn; WT_LOG_RECORD *logrec; size_t result_len, skip; uint32_t uncompressed_size; conn = S2C(session); logrec = (WT_LOG_RECORD *)in->mem; skip = WT_LOG_COMPRESS_SKIP; compressor = conn->log_compressor; if (compressor == NULL || compressor->decompress == NULL) WT_RET_MSG(session, WT_ERROR, "Compressed record with no configured compressor"); uncompressed_size = logrec->mem_len; WT_RET(__wt_buf_initsize(session, out, uncompressed_size)); memcpy(out->mem, in->mem, skip); WT_RET(compressor->decompress(compressor, &session->iface, (uint8_t *)in->mem + skip, in->size - skip, (uint8_t *)out->mem + skip, uncompressed_size - skip, &result_len)); /* * If checksums were turned off because we're depending on the decompression to fail on any * corrupted data, we'll end up here after corruption happens. If we're salvaging the file, it's * OK, otherwise it's really, really bad. */ if (result_len != uncompressed_size - WT_LOG_COMPRESS_SKIP) WT_RET_MSG(session, WT_ERROR, "decompression failed with incorrect size"); return (0); } /* * __log_decrypt -- * Decrypt a log record. */ static int __log_decrypt(WT_SESSION_IMPL *session, WT_ITEM *in, WT_ITEM *out) { WT_CONNECTION_IMPL *conn; WT_ENCRYPTOR *encryptor; WT_KEYED_ENCRYPTOR *kencryptor; conn = S2C(session); kencryptor = conn->kencryptor; if (kencryptor == NULL || (encryptor = kencryptor->encryptor) == NULL || encryptor->decrypt == NULL) WT_RET_MSG(session, WT_ERROR, "Encrypted record with no configured decrypt method"); return (__wt_decrypt(session, encryptor, WT_LOG_ENCRYPT_SKIP, in, out)); } /* * __wt_log_fill -- * Copy a thread's log records into the assigned slot. */ int __wt_log_fill( WT_SESSION_IMPL *session, WT_MYSLOT *myslot, bool force, WT_ITEM *record, WT_LSN *lsnp) { WT_DECL_RET; /* * Call write or copy into the buffer. For now the offset is the real byte offset. If the offset * becomes a unit of WT_LOG_ALIGN this is where we would multiply by WT_LOG_ALIGN to get the * real file byte offset for write(). */ if (!force && !F_ISSET(myslot, WT_MYSLOT_UNBUFFERED)) memcpy((char *)myslot->slot->slot_buf.mem + myslot->offset, record->mem, record->size); else /* * If this is a force or unbuffered write, write it now. */ WT_ERR(__log_fs_write(session, myslot->slot, myslot->offset + myslot->slot->slot_start_offset, record->size, record->mem)); WT_STAT_CONN_INCRV(session, log_bytes_written, record->size); if (lsnp != NULL) { *lsnp = myslot->slot->slot_start_lsn; lsnp->l.offset += (uint32_t)myslot->offset; } err: if (ret != 0 && myslot->slot->slot_error == 0) myslot->slot->slot_error = ret; return (ret); } /* * __log_file_header -- * Create and write a log file header into a file handle. If writing into the main log, it will * be called locked. If writing into a pre-allocated log, it will be called unlocked. */ static int __log_file_header(WT_SESSION_IMPL *session, WT_FH *fh, WT_LSN *end_lsn, bool prealloc) { WT_CONNECTION_IMPL *conn; WT_DECL_ITEM(buf); WT_DECL_RET; WT_LOG *log; WT_LOGSLOT tmp; WT_LOG_DESC *desc; WT_LOG_RECORD *logrec; WT_MYSLOT myslot; conn = S2C(session); log = conn->log; /* * Set up the log descriptor record. Use a scratch buffer to get correct alignment for direct * I/O. */ WT_ASSERT(session, sizeof(WT_LOG_DESC) < log->allocsize); WT_RET(__wt_scr_alloc(session, log->allocsize, &buf)); memset(buf->mem, 0, log->allocsize); buf->size = log->allocsize; logrec = (WT_LOG_RECORD *)buf->mem; desc = (WT_LOG_DESC *)logrec->record; desc->log_magic = WT_LOG_MAGIC; desc->version = log->log_version; desc->log_size = (uint64_t)conn->log_file_max; __wt_log_desc_byteswap(desc); /* * Now that the record is set up, initialize the record header. * * Checksum a little-endian version of the header, and write everything in little-endian format. * The checksum is (potentially) returned in a big-endian format, swap it into place in a * separate step. */ logrec->len = log->allocsize; logrec->checksum = 0; __wt_log_record_byteswap(logrec); logrec->checksum = __wt_checksum(logrec, log->allocsize); #ifdef WORDS_BIGENDIAN logrec->checksum = __wt_bswap32(logrec->checksum); #endif WT_CLEAR(tmp); memset(&myslot, 0, sizeof(myslot)); myslot.slot = &tmp; /* * We may recursively call __wt_log_acquire to allocate log space for the log descriptor record. * Call __wt_log_fill to write it, but we do not need to call __wt_log_release because we're not * waiting for any earlier operations to complete. */ if (prealloc) { WT_ASSERT(session, fh != NULL); tmp.slot_fh = fh; } else { WT_ASSERT(session, fh == NULL); WT_ERR(__wt_log_acquire(session, log->allocsize, &tmp)); } WT_ERR(__wt_log_fill(session, &myslot, true, buf, NULL)); /* * Make sure the header gets to disk. */ WT_ERR(__wt_fsync(session, tmp.slot_fh, true)); if (end_lsn != NULL) WT_ASSIGN_LSN(end_lsn, &tmp.slot_end_lsn); err: __wt_scr_free(session, &buf); return (ret); } /* * __log_openfile -- * Open a log file with the given log file number and return the WT_FH. */ static int __log_openfile(WT_SESSION_IMPL *session, uint32_t id, uint32_t flags, WT_FH **fhp) { WT_CONNECTION_IMPL *conn; WT_DECL_ITEM(buf); WT_DECL_RET; u_int wtopen_flags; conn = S2C(session); WT_RET(__wt_scr_alloc(session, 0, &buf)); /* * If we are creating the file then we use a temporary file name. Otherwise it is a log file * name. */ if (LF_ISSET(WT_LOG_OPEN_CREATE_OK)) { wtopen_flags = WT_FS_OPEN_CREATE; WT_ERR(__wt_log_filename(session, id, WT_LOG_TMPNAME, buf)); } else { wtopen_flags = 0; WT_ERR(__wt_log_filename(session, id, WT_LOG_FILENAME, buf)); } __wt_verbose(session, WT_VERB_LOG, "opening log %s", (const char *)buf->data); if (FLD_ISSET(conn->direct_io, WT_DIRECT_IO_LOG)) FLD_SET(wtopen_flags, WT_FS_OPEN_DIRECTIO); WT_ERR(__wt_open(session, buf->data, WT_FS_OPEN_FILE_TYPE_LOG, wtopen_flags, fhp)); err: __wt_scr_free(session, &buf); return (ret); } /* * __log_open_verify -- * Open a log file with the given log file number, verify its header and return various pieces * of system information about this log file. */ static int __log_open_verify(WT_SESSION_IMPL *session, uint32_t id, WT_FH **fhp, WT_LSN *lsnp, uint16_t *versionp, bool *need_salvagep) { WT_CONNECTION_IMPL *conn; WT_DECL_ITEM(buf); WT_DECL_RET; WT_FH *fh; WT_LOG *log; WT_LOG_DESC *desc; WT_LOG_RECORD *logrec; uint32_t allocsize, rectype; const uint8_t *end, *p; bool need_salvage, salvage_mode; conn = S2C(session); fh = NULL; log = conn->log; need_salvage = false; WT_RET(__wt_scr_alloc(session, 0, &buf)); salvage_mode = (need_salvagep != NULL && F_ISSET(conn, WT_CONN_SALVAGE)); if (log == NULL) allocsize = WT_LOG_ALIGN; else allocsize = log->allocsize; if (lsnp != NULL) WT_ZERO_LSN(lsnp); WT_ERR(__wt_buf_grow(session, buf, allocsize)); memset(buf->mem, 0, allocsize); /* * Any operation that fails from here on out indicates corruption that could be salvaged. */ need_salvage = true; /* * Read in the log file header and verify it. */ WT_ERR(__log_openfile(session, id, 0, &fh)); WT_ERR(__log_fs_read(session, fh, 0, allocsize, buf->mem)); logrec = (WT_LOG_RECORD *)buf->mem; __wt_log_record_byteswap(logrec); desc = (WT_LOG_DESC *)logrec->record; __wt_log_desc_byteswap(desc); if (desc->log_magic != WT_LOG_MAGIC) { if (salvage_mode) WT_ERR_MSG(session, WT_ERROR, "log file %s corrupted: Bad magic number %" PRIu32, fh->name, desc->log_magic); else WT_ERR_MSG(session, WT_TRY_SALVAGE, "log file %s corrupted: Bad magic number %" PRIu32, fh->name, desc->log_magic); } /* * We cannot read future log file formats. */ if (desc->version > WT_LOG_VERSION) WT_ERR_MSG(session, WT_ERROR, "unsupported WiredTiger file version: this build only supports versions up to %d, and " "the file is version %" PRIu16, WT_LOG_VERSION, desc->version); /* * We error if the log version is less than the required minimum or larger than the required * maximum. */ if (conn->req_max_major != WT_CONN_COMPAT_NONE && desc->version > conn->log_req_max) WT_ERR_MSG(session, WT_ERROR, WT_COMPAT_MSG_PREFIX "unsupported WiredTiger file version: this build requires a maximum version of %" PRIu16 ", and the file is version %" PRIu16, conn->log_req_max, desc->version); if (conn->req_min_major != WT_CONN_COMPAT_NONE && desc->version < conn->log_req_min) WT_ERR_MSG(session, WT_ERROR, WT_COMPAT_MSG_PREFIX "unsupported WiredTiger file version: this build requires a minimum version of %" PRIu16 ", and the file is version %" PRIu16, conn->log_req_min, desc->version); /* * Set up the return values since the header is valid. */ if (versionp != NULL) *versionp = desc->version; /* * Skip reading in the previous LSN if log file is an old version or if the caller doesn't care * about the LSN. Otherwise read that record in and set up the LSN. We already have a buffer * that is the correct size. Reuse it. */ if (lsnp == NULL || (desc->version < WT_LOG_VERSION_SYSTEM)) goto err; memset(buf->mem, 0, allocsize); WT_ERR(__log_fs_read(session, fh, allocsize, allocsize, buf->mem)); logrec = (WT_LOG_RECORD *)buf->mem; /* * We have a valid header but the system record is not there. The log ends here. Return without * setting the LSN. */ if (logrec->len == 0) { __wt_verbose(session, WT_VERB_LOG, "Log %s found empty log after header", fh->name); goto err; } if (!__log_checksum_match(buf, allocsize)) WT_ERR_MSG(session, WT_ERROR, "%s: System log record checksum mismatch", fh->name); __wt_log_record_byteswap(logrec); p = WT_LOG_SKIP_HEADER(buf->data); end = (const uint8_t *)buf->data + allocsize; WT_ERR(__wt_logrec_read(session, &p, end, &rectype)); if (rectype != WT_LOGREC_SYSTEM) WT_ERR_MSG(session, WT_ERROR, "System log record missing"); WT_ERR(__wt_log_recover_system(session, &p, end, lsnp)); err: __wt_scr_free(session, &buf); /* * Return the file handle if needed, otherwise close it. */ if (fhp != NULL && ret == 0) *fhp = fh; else if (ret != 0 && need_salvage && salvage_mode) { /* Let the caller know this file must be salvaged. */ ret = 0; WT_TRET(__wt_close(session, &fh)); if (fhp != NULL) *fhp = NULL; *need_salvagep = true; } else WT_TRET(__wt_close(session, &fh)); return (ret); } /* * __log_record_verify -- * Check that values of the log record header are valid. No byteswap of the header has been done * at this point. */ static int __log_record_verify( WT_SESSION_IMPL *session, WT_FH *log_fh, uint32_t offset, WT_LOG_RECORD *logrecp, bool *corrupt) { WT_LOG_RECORD logrec; size_t i; *corrupt = false; /* * Make our own copy of the header so we can get the bytes in the proper order. */ logrec = *logrecp; __wt_log_record_byteswap(&logrec); if (F_ISSET(&logrec, ~(WT_LOG_RECORD_ALL_FLAGS))) { WT_RET( __wt_msg(session, "%s: log record at position %" PRIu32 " has flag corruption 0x%" PRIx16, log_fh->name, offset, logrec.flags)); *corrupt = true; } for (i = 0; i < sizeof(logrec.unused); i++) if (logrec.unused[i] != 0) { WT_RET(__wt_msg(session, "%s: log record at position %" PRIu32 " has unused[%" WT_SIZET_FMT "] corruption 0x%" PRIx8, log_fh->name, offset, i, logrec.unused[i])); *corrupt = true; } if (logrec.mem_len != 0 && !F_ISSET(&logrec, WT_LOG_RECORD_COMPRESSED | WT_LOG_RECORD_ENCRYPTED)) { WT_RET(__wt_msg(session, "%s: log record at position %" PRIu32 " has memory len corruption 0x%" PRIx32, log_fh->name, offset, logrec.mem_len)); *corrupt = true; } if (logrec.len <= offsetof(WT_LOG_RECORD, record)) { WT_RET(__wt_msg(session, "%s: log record at position %" PRIu32 " has record len corruption 0x%" PRIx32, log_fh->name, offset, logrec.len)); *corrupt = true; } return (0); } /* * __log_alloc_prealloc -- * Look for a pre-allocated log file and rename it to use as the next real log file. Called * locked. */ static int __log_alloc_prealloc(WT_SESSION_IMPL *session, uint32_t to_num) { WT_CONNECTION_IMPL *conn; WT_DECL_ITEM(from_path); WT_DECL_ITEM(to_path); WT_DECL_RET; WT_LOG *log; uint32_t from_num; u_int logcount; char **logfiles; bool locked; conn = S2C(session); log = conn->log; logfiles = NULL; locked = false; /* * If there are no pre-allocated files, return WT_NOTFOUND. */ WT_RET(__log_get_files_single(session, WT_LOG_PREPNAME, &logfiles, &logcount)); if (logcount == 0) return (WT_NOTFOUND); /* We have a file to use. */ WT_ERR(__wt_log_extract_lognum(session, logfiles[0], &from_num)); WT_ERR(__wt_scr_alloc(session, 0, &from_path)); WT_ERR(__wt_scr_alloc(session, 0, &to_path)); WT_ERR(__wt_log_filename(session, from_num, WT_LOG_PREPNAME, from_path)); WT_ERR(__wt_log_filename(session, to_num, WT_LOG_FILENAME, to_path)); __wt_spin_lock(session, &log->log_fs_lock); locked = true; __wt_verbose(session, WT_VERB_LOG, "log_alloc_prealloc: rename log %s to %s", (const char *)from_path->data, (const char *)to_path->data); WT_STAT_CONN_INCR(session, log_prealloc_used); /* * All file setup, writing the header and pre-allocation was done before. We only need to rename * it. */ WT_ERR(__wt_fs_rename(session, from_path->data, to_path->data, false)); err: __wt_scr_free(session, &from_path); __wt_scr_free(session, &to_path); if (locked) __wt_spin_unlock(session, &log->log_fs_lock); WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount)); return (ret); } /* * __log_newfile -- * Create the next log file and write the file header record into it. */ static int __log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_FH *log_fh; WT_LOG *log; WT_LSN end_lsn, logrec_lsn; u_int yield_cnt; bool create_log, skipp; conn = S2C(session); log = conn->log; /* * Set aside the log file handle to be closed later. Other threads may still be using it to * write to the log. If the log file size is small we could fill a log file before the previous * one is closed. Wait for that to close. */ WT_ASSERT(session, FLD_ISSET(session->lock_flags, WT_SESSION_LOCKED_SLOT)); for (yield_cnt = 0; log->log_close_fh != NULL;) { WT_STAT_CONN_INCR(session, log_close_yields); /* * Processing slots will conditionally signal the file close server thread. But if we've * tried a while, signal the thread directly here. */ __wt_log_wrlsn(session, NULL); if (++yield_cnt % WT_THOUSAND == 0) { __wt_spin_unlock(session, &log->log_slot_lock); __wt_cond_signal(session, conn->log_file_cond); __wt_spin_lock(session, &log->log_slot_lock); } if (++yield_cnt > WT_THOUSAND * 10) return (__wt_set_return(session, EBUSY)); __wt_yield(); } /* * Note, the file server worker thread requires the LSN be set once the close file handle is * set, force that ordering. */ if (log->log_fh == NULL) log->log_close_fh = NULL; else { WT_ASSIGN_LSN(&log->log_close_lsn, &log->alloc_lsn); WT_PUBLISH(log->log_close_fh, log->log_fh); } log->fileid++; /* * If pre-allocating log files look for one; otherwise, or if we don't find one, create a log * file. We can't use pre-allocated log files while a hot backup is in progress: applications * can copy the files in any way they choose, and a log file rename might confuse things. */ create_log = true; if (conn->log_prealloc > 0 && conn->hot_backup_start == 0) { WT_WITH_HOTBACKUP_READ_LOCK( session, ret = __log_alloc_prealloc(session, log->fileid), &skipp); if (!skipp) { /* * If ret is 0 it means we found a pre-allocated file. If ret is WT_NOTFOUND, create the * new log file and signal the server, we missed our pre-allocation. If ret is non-zero * but not WT_NOTFOUND, return the error. */ WT_RET_NOTFOUND_OK(ret); if (ret == 0) create_log = false; else { WT_STAT_CONN_INCR(session, log_prealloc_missed); if (conn->log_cond != NULL) __wt_cond_signal(session, conn->log_cond); } } } /* * If we need to create the log file, do so now. */ if (create_log) { /* * Increment the missed pre-allocated file counter only if a hot backup is not in progress. * We are deliberately not using pre-allocated log files during backup (see comment above). */ if (conn->hot_backup_start == 0) log->prep_missed++; WT_RET(__wt_log_allocfile(session, log->fileid, WT_LOG_FILENAME)); } /* * Since the file system clears the output file handle pointer before searching the handle list * and filling in the new file handle, we must pass in a local file handle. Otherwise there is a * wide window where another thread could see a NULL log file handle. */ WT_RET(__log_open_verify(session, log->fileid, &log_fh, NULL, NULL, NULL)); /* * Write the LSN at the end of the last record in the previous log file as the first record in * this log file. */ if (log->fileid == 1) WT_INIT_LSN(&logrec_lsn); else WT_ASSIGN_LSN(&logrec_lsn, &log->alloc_lsn); /* * We need to setup the LSNs. Set the end LSN and alloc LSN to the end of the header. */ WT_SET_LSN(&log->alloc_lsn, log->fileid, WT_LOG_END_HEADER); /* * If we're running the version where we write a system record do so now and update the * alloc_lsn. */ if (log->log_version >= WT_LOG_VERSION_SYSTEM) { WT_RET(__wt_log_system_record(session, log_fh, &logrec_lsn)); WT_SET_LSN(&log->alloc_lsn, log->fileid, log->first_record); } WT_ASSIGN_LSN(&end_lsn, &log->alloc_lsn); WT_PUBLISH(log->log_fh, log_fh); /* * If we're called from connection creation code, we need to update the LSNs since we're the * only write in progress. */ if (conn_open) { WT_RET(__wt_fsync(session, log->log_fh, true)); WT_ASSIGN_LSN(&log->sync_lsn, &end_lsn); WT_ASSIGN_LSN(&log->write_lsn, &end_lsn); WT_ASSIGN_LSN(&log->write_start_lsn, &end_lsn); } WT_ASSIGN_LSN(&log->dirty_lsn, &log->alloc_lsn); if (created != NULL) *created = create_log; return (0); } /* * __log_set_version -- * Set version related information under lock. */ static int __log_set_version( WT_SESSION_IMPL *session, uint16_t version, uint32_t first_rec, bool live_chg, bool downgrade) { WT_CONNECTION_IMPL *conn; WT_LOG *log; conn = S2C(session); log = conn->log; log->log_version = version; log->first_record = first_rec; if (downgrade) FLD_SET(conn->log_flags, WT_CONN_LOG_DOWNGRADED); else FLD_CLR(conn->log_flags, WT_CONN_LOG_DOWNGRADED); if (live_chg) F_SET(log, WT_LOG_FORCE_NEWFILE); if (!F_ISSET(conn, WT_CONN_READONLY)) return (__log_prealloc_remove(session)); return (0); } /* * __wt_log_set_version -- * Change the version number in logging. Will be done with locking. We need to force the log * file to advance and remove all old pre-allocated files. */ int __wt_log_set_version(WT_SESSION_IMPL *session, uint16_t version, uint32_t first_rec, bool downgrade, bool live_chg, uint32_t *lognump) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LOG *log; conn = S2C(session); log = conn->log; /* * The steps are: * - Set up versions and remove files under lock. * - Set a flag so that the next slot change forces a file change. * - Force out the slot that is currently active in the current log. * - Write a log record to force a record into the new log file. */ WT_WITH_SLOT_LOCK( session, log, ret = __log_set_version(session, version, first_rec, live_chg, downgrade)); if (!live_chg) return (ret); WT_ERR(ret); /* * A new log file will be used when we force out the earlier slot. */ WT_ERR(__wt_log_force_write(session, 1, NULL)); /* * We need to write a record to the new version log file so that a potential checkpoint finds * LSNs in that new log file and an archive correctly removes all earlier logs. Write an * internal printf record. */ WT_ERR(__wt_log_printf(session, "COMPATIBILITY: Version now %" PRIu16, log->log_version)); if (lognump != NULL) *lognump = log->alloc_lsn.l.file; err: return (ret); } /* * __wt_log_acquire -- * Called serially when switching slots. Can be called recursively from __log_newfile when we * change log files. */ int __wt_log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot) { WT_CONNECTION_IMPL *conn; WT_LOG *log; bool created_log; conn = S2C(session); log = conn->log; created_log = true; /* * Add recsize to alloc_lsn. Save our starting LSN where the previous allocation finished for * the release LSN. That way when log files switch, we're waiting for the correct LSN from * outstanding writes. */ WT_ASSERT(session, FLD_ISSET(session->lock_flags, WT_SESSION_LOCKED_SLOT)); /* * We need to set the release LSN earlier, before a log file change. */ WT_ASSIGN_LSN(&slot->slot_release_lsn, &log->alloc_lsn); /* * Make sure that the size can fit in the file. Proactively switch if it cannot. This reduces, * but does not eliminate, log files that exceed the maximum file size. We want to minimize the * risk of an error due to no space. */ if (F_ISSET(log, WT_LOG_FORCE_NEWFILE) || !__log_size_fit(session, &log->alloc_lsn, recsize)) { WT_RET(__log_newfile(session, false, &created_log)); F_CLR(log, WT_LOG_FORCE_NEWFILE); if (log->log_close_fh != NULL) F_SET(slot, WT_SLOT_CLOSEFH); } /* * Pre-allocate on the first real write into the log file, if it was just created (i.e. not * pre-allocated). */ if (log->alloc_lsn.l.offset == log->first_record && created_log) WT_RET(__log_prealloc(session, log->log_fh)); /* * Initialize the slot for activation. */ __wt_log_slot_activate(session, slot); return (0); } /* * __log_truncate_file -- * Truncate a log file to the specified offset. If the underlying file system doesn't support * truncate then we need to zero out the rest of the file, doing an effective truncate. */ static int __log_truncate_file(WT_SESSION_IMPL *session, WT_FH *log_fh, wt_off_t offset) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LOG *log; bool skipp; conn = S2C(session); log = conn->log; if (!F_ISSET(log, WT_LOG_TRUNCATE_NOTSUP) && conn->hot_backup_start == 0) { WT_WITH_HOTBACKUP_READ_LOCK(session, ret = __wt_ftruncate(session, log_fh, offset), &skipp); if (!skipp) { if (ret != ENOTSUP) return (ret); F_SET(log, WT_LOG_TRUNCATE_NOTSUP); } } return (__wt_file_zero(session, log_fh, offset, conn->log_file_max)); } /* * __log_truncate -- * Truncate the log to the given LSN. If this_log is set, it will only truncate the log file * indicated in the given LSN. If not set, it will truncate between the given LSN and the * trunc_lsn. That is, since we pre-allocate log files, it will free that space and allow the * log to be traversed. We use the trunc_lsn because logging has already opened the new/next log * file before recovery ran. If salvage_mode is set, we verify headers of log files visited and * recreate them if they are damaged. This function assumes we are in recovery or other * dedicated time and not during live running. */ static int __log_truncate(WT_SESSION_IMPL *session, WT_LSN *lsn, bool this_log, bool salvage_mode) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_FH *log_fh; WT_LOG *log; uint32_t lognum, salvage_first, salvage_last; u_int i, logcount; char **logfiles; bool need_salvage, opened; conn = S2C(session); log = conn->log; log_fh = NULL; logcount = 0; logfiles = NULL; salvage_first = salvage_last = 0; need_salvage = false; /* * Truncate the log file to the given LSN. * * It's possible the underlying file system doesn't support truncate (there are existing * examples), which is fine, but we don't want to repeatedly do the setup work just to find that * out every time. Check before doing work, and if there's a not-supported error, turn off * future truncates. */ WT_ERR(__log_openfile(session, lsn->l.file, 0, &log_fh)); WT_ERR(__log_truncate_file(session, log_fh, lsn->l.offset)); WT_ERR(__wt_fsync(session, log_fh, true)); WT_ERR(__wt_close(session, &log_fh)); if (salvage_mode) WT_ERR(__wt_msg(session, "salvage: log file %" PRIu32 " truncated", lsn->l.file)); /* * If we just want to truncate the current log, return and skip looking for intervening logs. */ if (this_log) goto err; WT_ERR(__log_get_files(session, WT_LOG_FILENAME, &logfiles, &logcount)); for (i = 0; i < logcount; i++) { WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); if (lognum > lsn->l.file && lognum < log->trunc_lsn.l.file) { opened = false; if (salvage_mode) { /* * When salvaging, we verify that the header of the log file is valid. If not, * create a new, empty one. */ need_salvage = false; WT_ERR(__log_open_verify(session, lognum, &log_fh, NULL, NULL, &need_salvage)); if (need_salvage) { WT_ASSERT(session, log_fh == NULL); WT_ERR(__wt_log_remove(session, WT_LOG_FILENAME, lognum)); WT_ERR(__wt_log_allocfile(session, lognum, WT_LOG_FILENAME)); } else opened = true; if (salvage_first == 0) salvage_first = lognum; salvage_last = lognum; } if (!opened) WT_ERR(__log_openfile(session, lognum, 0, &log_fh)); /* * If there are intervening files pre-allocated, truncate them to the end of the log * file header. */ WT_ERR(__log_truncate_file(session, log_fh, log->first_record)); WT_ERR(__wt_fsync(session, log_fh, true)); WT_ERR(__wt_close(session, &log_fh)); } } err: WT_TRET(__wt_close(session, &log_fh)); WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount)); if (salvage_first != 0) { if (salvage_last > salvage_first) WT_TRET( __wt_msg(session, "salvage: log files %" PRIu32 "-%" PRIu32 " truncated at beginning", salvage_first, salvage_last)); else WT_TRET(__wt_msg( session, "salvage: log file %" PRIu32 " truncated at beginning", salvage_first)); } return (ret); } /* * __wt_log_allocfile -- * Given a log number, create a new log file by writing the header, pre-allocating the file and * moving it to the destination name. */ int __wt_log_allocfile(WT_SESSION_IMPL *session, uint32_t lognum, const char *dest) { WT_CONNECTION_IMPL *conn; WT_DECL_ITEM(from_path); WT_DECL_ITEM(to_path); WT_DECL_RET; WT_FH *log_fh; WT_LOG *log; uint32_t tmp_id; conn = S2C(session); log = conn->log; log_fh = NULL; /* * Preparing a log file entails creating a temporary file: * - Writing the header. * - Truncating to the offset of the first record. * - Pre-allocating the file if needed. * - Renaming it to the desired file name. */ WT_RET(__wt_scr_alloc(session, 0, &from_path)); WT_ERR(__wt_scr_alloc(session, 0, &to_path)); tmp_id = __wt_atomic_add32(&log->tmp_fileid, 1); WT_ERR(__wt_log_filename(session, tmp_id, WT_LOG_TMPNAME, from_path)); WT_ERR(__wt_log_filename(session, lognum, dest, to_path)); __wt_spin_lock(session, &log->log_fs_lock); /* * Set up the temporary file. */ WT_ERR(__log_openfile(session, tmp_id, WT_LOG_OPEN_CREATE_OK, &log_fh)); WT_ERR(__log_file_header(session, log_fh, NULL, true)); WT_ERR(__log_prealloc(session, log_fh)); WT_ERR(__wt_fsync(session, log_fh, true)); WT_ERR(__wt_close(session, &log_fh)); __wt_verbose(session, WT_VERB_LOG, "log_allocfile: rename %s to %s", (const char *)from_path->data, (const char *)to_path->data); /* * Rename it into place and make it available. */ WT_ERR(__wt_fs_rename(session, from_path->data, to_path->data, false)); err: __wt_scr_free(session, &from_path); __wt_scr_free(session, &to_path); __wt_spin_unlock(session, &log->log_fs_lock); WT_TRET(__wt_close(session, &log_fh)); return (ret); } /* * __wt_log_remove -- * Given a log number, remove that log file. */ int __wt_log_remove(WT_SESSION_IMPL *session, const char *file_prefix, uint32_t lognum) { WT_DECL_ITEM(path); WT_DECL_RET; WT_RET(__wt_scr_alloc(session, 0, &path)); WT_ERR(__wt_log_filename(session, lognum, file_prefix, path)); __wt_verbose(session, WT_VERB_LOG, "log_remove: remove log %s", (const char *)path->data); WT_ERR(__wt_fs_remove(session, path->data, false)); err: __wt_scr_free(session, &path); return (ret); } /* * __wt_log_compat_verify -- * Verify the last log when opening for the compatibility settings. This is separate because we * need to do it very early in the startup process. */ int __wt_log_compat_verify(WT_SESSION_IMPL *session) { WT_DECL_RET; uint32_t lastlog, lognum; u_int i, logcount; char **logfiles; lastlog = 0; WT_ERR(__log_get_files(session, WT_LOG_FILENAME, &logfiles, &logcount)); for (i = 0; i < logcount; i++) { WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); lastlog = WT_MAX(lastlog, lognum); } if (lastlog != 0) WT_ERR(__log_open_verify(session, lastlog, NULL, NULL, NULL, NULL)); err: WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount)); return (ret); } /* * __wt_log_open -- * Open the appropriate log file for the connection. The purpose is to find the last log file * that exists, open it and set our initial LSNs to the end of that file. If none exist, call * __log_newfile to create it. */ int __wt_log_open(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LOG *log; uint32_t firstlog, lastlog, lognum; uint16_t version; u_int i, logcount; char **logfiles; bool need_salvage; conn = S2C(session); log = conn->log; logfiles = NULL; logcount = 0; /* * Open up a file handle to the log directory if we haven't. */ if (log->log_dir_fh == NULL) { __wt_verbose(session, WT_VERB_LOG, "log_open: open fh to directory %s", conn->log_path); WT_RET( __wt_open(session, conn->log_path, WT_FS_OPEN_FILE_TYPE_DIRECTORY, 0, &log->log_dir_fh)); } if (!F_ISSET(conn, WT_CONN_READONLY)) WT_ERR(__log_prealloc_remove(session)); again: /* * Now look at the log files and set our LSNs. */ lastlog = 0; firstlog = UINT32_MAX; need_salvage = false; WT_ERR(__log_get_files(session, WT_LOG_FILENAME, &logfiles, &logcount)); for (i = 0; i < logcount; i++) { WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); lastlog = WT_MAX(lastlog, lognum); firstlog = WT_MIN(firstlog, lognum); } log->fileid = lastlog; __wt_verbose( session, WT_VERB_LOG, "log_open: first log %" PRIu32 " last log %" PRIu32, firstlog, lastlog); if (firstlog == UINT32_MAX) { WT_ASSERT(session, logcount == 0); WT_INIT_LSN(&log->first_lsn); } else { WT_SET_LSN(&log->first_lsn, firstlog, 0); /* * If we have existing log files, check the last log now before we create a new log file so * that we can detect an unsupported version before modifying the file space. */ WT_ERR(__log_open_verify(session, lastlog, NULL, NULL, &version, &need_salvage)); /* * If we were asked to salvage and the last log file was indeed corrupt, remove it and try * all over again. */ if (need_salvage) { WT_ERR(__wt_log_remove(session, WT_LOG_FILENAME, lastlog)); WT_ERR(__wt_msg(session, "salvage: log file %" PRIu32 " removed", lastlog)); WT_ERR(__wt_fs_directory_list_free(session, &logfiles, logcount)); logfiles = NULL; goto again; } } /* * Start logging at the beginning of the next log file, no matter where the previous log file * ends. */ if (!F_ISSET(conn, WT_CONN_READONLY)) { WT_WITH_SLOT_LOCK(session, log, ret = __log_newfile(session, true, NULL)); WT_ERR(ret); } /* If we found log files, save the new state. */ if (logcount > 0) { /* * If we're running in a downgraded mode and there are earlier logs detect if they're at a * higher version. If so, we need to force recovery (to write a full checkpoint) and force * archiving to remove all higher version logs. */ if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_DOWNGRADED)) { for (i = 0; i < logcount; ++i) { WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); /* * By sending in a NULL file handle, we don't have to close the file. */ WT_ERR(__log_open_verify(session, lognum, NULL, NULL, &version, NULL)); /* * If we find any log file at the wrong version set the flag and we're done. */ if (log->log_version != version) { FLD_SET(conn->log_flags, WT_CONN_LOG_FORCE_DOWNGRADE); break; } } } WT_ASSIGN_LSN(&log->trunc_lsn, &log->alloc_lsn); FLD_SET(conn->log_flags, WT_CONN_LOG_EXISTED); } err: WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount)); if (ret == 0) F_SET(log, WT_LOG_OPENED); return (ret); } /* * __wt_log_close -- * Close the log file. */ int __wt_log_close(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; WT_LOG *log; conn = S2C(session); log = conn->log; if (log->log_close_fh != NULL && log->log_close_fh != log->log_fh) { __wt_verbose(session, WT_VERB_LOG, "closing old log %s", log->log_close_fh->name); if (!F_ISSET(conn, WT_CONN_READONLY)) WT_RET(__wt_fsync(session, log->log_close_fh, true)); WT_RET(__wt_close(session, &log->log_close_fh)); } if (log->log_fh != NULL) { __wt_verbose(session, WT_VERB_LOG, "closing log %s", log->log_fh->name); if (!F_ISSET(conn, WT_CONN_READONLY)) WT_RET(__wt_fsync(session, log->log_fh, true)); WT_RET(__wt_close(session, &log->log_fh)); log->log_fh = NULL; } if (log->log_dir_fh != NULL) { __wt_verbose(session, WT_VERB_LOG, "closing log directory %s", log->log_dir_fh->name); if (!F_ISSET(conn, WT_CONN_READONLY)) WT_RET(__wt_fsync(session, log->log_dir_fh, true)); WT_RET(__wt_close(session, &log->log_dir_fh)); log->log_dir_fh = NULL; } F_CLR(log, WT_LOG_OPENED); return (0); } /* * __log_has_hole -- * Determine if the current offset represents a hole in the log file (i.e. there is valid data * somewhere after the hole), or if this is the end of this log file and the remainder of the * file is zeroes. */ static int __log_has_hole(WT_SESSION_IMPL *session, WT_FH *fh, wt_off_t log_size, wt_off_t offset, wt_off_t *error_offset, bool *hole) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LOG *log; WT_LOG_RECORD *logrec; wt_off_t off, remainder; size_t allocsize, buf_left, bufsz, rdlen; char *buf, *p, *zerobuf; bool corrupt; *error_offset = 0; corrupt = *hole = false; conn = S2C(session); log = conn->log; remainder = log_size - offset; /* * It can be very slow looking for the last real record in the log in very small chunks. Walk a * megabyte at a time. If we find a part of the log that is not just zeroes we know this log * file has a hole in it. */ buf = zerobuf = NULL; if (log == NULL || log->allocsize < WT_MEGABYTE) bufsz = WT_MEGABYTE; else bufsz = log->allocsize; if ((size_t)remainder < bufsz) bufsz = (size_t)remainder; WT_RET(__wt_calloc_def(session, bufsz, &buf)); WT_ERR(__wt_calloc_def(session, bufsz, &zerobuf)); /* * Read in a chunk starting at the given offset. Compare against a known zero byte chunk. */ for (off = offset; remainder > 0; remainder -= (wt_off_t)rdlen, off += (wt_off_t)rdlen) { rdlen = WT_MIN(bufsz, (size_t)remainder); WT_ERR(__log_fs_read(session, fh, off, rdlen, buf)); allocsize = (log == NULL ? WT_LOG_ALIGN : log->allocsize); if (memcmp(buf, zerobuf, rdlen) != 0) { /* * Find where the next log record starts after the hole. */ for (p = buf, buf_left = rdlen; buf_left > 0; buf_left -= rdlen, p += rdlen) { rdlen = WT_MIN(allocsize, buf_left); if (memcmp(p, zerobuf, rdlen) != 0) break; } /* * A presumed log record begins here where the buffer becomes non-zero. If we have * enough of a log record present in the buffer, we either have a valid header or * corruption. Verify the header of this record to determine whether it is just a hole * or corruption. * * We don't bother making this check for backup copies, as records may have their * beginning zeroed, hence the part after a hole may in fact be the middle of the * record. */ if (!F_ISSET(conn, WT_CONN_WAS_BACKUP)) { logrec = (WT_LOG_RECORD *)p; if (buf_left >= sizeof(WT_LOG_RECORD)) { off += p - buf; WT_ERR(__log_record_verify(session, fh, (uint32_t)off, logrec, &corrupt)); if (corrupt) *error_offset = off; } } *hole = true; break; } } err: __wt_free(session, buf); __wt_free(session, zerobuf); return (ret); } /* * __log_check_partial_write -- * Determine if the log record may be a partial write. If that's possible, return true, * otherwise false. Since the log file is initially zeroed up to a predetermined size, any * record that falls within that boundary that ends in one or more zeroes may be partial (or the * initial record may have been padded with zeroes before writing). The only way we have any * certainty is if the last byte is non-zero, when that happens, we know that the write cannot * be partial. */ static bool __log_check_partial_write(WT_SESSION_IMPL *session, WT_ITEM *buf, uint32_t reclen) { uint8_t *rec; WT_UNUSED(session); /* * We only check the final byte since that's the only way have any certainty. Even if the second * to last byte is non-zero and the last byte is zero, that could still technically be the * result of a partial write, however unlikely it may be. */ rec = buf->mem; return (reclen > 0 && rec[reclen - 1] == 0); } /* * __wt_log_release -- * Release a log slot. */ int __wt_log_release(WT_SESSION_IMPL *session, WT_LOGSLOT *slot, bool *freep) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LOG *log; WT_LSN sync_lsn; uint64_t fsync_duration_usecs, time_start, time_stop; int64_t release_buffered, release_bytes; bool locked; conn = S2C(session); log = conn->log; locked = false; if (freep != NULL) *freep = 1; release_buffered = WT_LOG_SLOT_RELEASED_BUFFERED(slot->slot_state); release_bytes = release_buffered + slot->slot_unbuffered; /* * Checkpoints can be configured based on amount of log written. Add in this log record to the * sum and if needed, signal the checkpoint condition. The logging subsystem manages the * accumulated field. There is a bit of layering violation here checking the connection ckpt * field and using its condition. */ if (WT_CKPT_LOGSIZE(conn)) { log->log_written += (wt_off_t)release_bytes; __wt_checkpoint_signal(session, log->log_written); } /* Write the buffered records */ if (release_buffered != 0) WT_ERR(__log_fs_write( session, slot, slot->slot_start_offset, (size_t)release_buffered, slot->slot_buf.mem)); /* * If we have to wait for a synchronous operation, we do not pass handling of this slot off to * the worker thread. The caller is responsible for freeing the slot in that case. Otherwise the * worker thread will free it. */ if (!F_ISSET(slot, WT_SLOT_FLUSH | WT_SLOT_SYNC_FLAGS)) { if (freep != NULL) *freep = 0; slot->slot_state = WT_LOG_SLOT_WRITTEN; /* * After this point the worker thread owns the slot. There is nothing more to do but return. */ /* * !!! Signaling the wrlsn_cond condition here results in * worse performance because it causes more scheduling churn * and more walking of the slot pool for a very small number * of slots to process. Don't signal here. */ return (0); } /* * Wait for earlier groups to finish, otherwise there could be holes in the log file. */ WT_STAT_CONN_INCR(session, log_release_write_lsn); __log_wait_for_earlier_slot(session, slot); WT_ASSIGN_LSN(&log->write_start_lsn, &slot->slot_start_lsn); WT_ASSIGN_LSN(&log->write_lsn, &slot->slot_end_lsn); WT_ASSERT(session, slot != log->active_slot); __wt_cond_signal(session, log->log_write_cond); F_CLR(slot, WT_SLOT_FLUSH); /* * Signal the close thread if needed. */ if (F_ISSET(slot, WT_SLOT_CLOSEFH)) __wt_cond_signal(session, conn->log_file_cond); if (F_ISSET(slot, WT_SLOT_SYNC_DIRTY) && !F_ISSET(slot, WT_SLOT_SYNC) && (ret = __wt_fsync(session, log->log_fh, false)) != 0) { /* * Ignore ENOTSUP, but don't try again. */ if (ret != ENOTSUP) WT_ERR(ret); conn->log_dirty_max = 0; } /* * Try to consolidate calls to fsync to wait less. Acquire a spin lock so that threads finishing * writing to the log will wait while the current fsync completes and advance log->sync_lsn. */ while (F_ISSET(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR)) { /* * We have to wait until earlier log files have finished their sync operations. The most * recent one will set the LSN to the beginning of our file. */ if (log->sync_lsn.l.file < slot->slot_end_lsn.l.file || __wt_spin_trylock(session, &log->log_sync_lock) != 0) { __wt_cond_wait(session, log->log_sync_cond, 10000, NULL); continue; } locked = true; /* * Record the current end of our update after the lock. That is how far our calls can * guarantee. */ WT_ASSIGN_LSN(&sync_lsn, &slot->slot_end_lsn); /* * Check if we have to sync the parent directory. Some combinations of sync flags may result * in the log file not yet stable in its parent directory. Do that now if needed. */ if (F_ISSET(slot, WT_SLOT_SYNC_DIR) && (log->sync_dir_lsn.l.file < sync_lsn.l.file)) { WT_ASSERT(session, log->log_dir_fh != NULL); __wt_verbose(session, WT_VERB_LOG, "log_release: sync directory %s to LSN %" PRIu32 "/%" PRIu32, log->log_dir_fh->name, sync_lsn.l.file, sync_lsn.l.offset); time_start = __wt_clock(session); WT_ERR(__wt_fsync(session, log->log_dir_fh, true)); time_stop = __wt_clock(session); fsync_duration_usecs = WT_CLOCKDIFF_US(time_stop, time_start); WT_ASSIGN_LSN(&log->sync_dir_lsn, &sync_lsn); WT_STAT_CONN_INCR(session, log_sync_dir); WT_STAT_CONN_INCRV(session, log_sync_dir_duration, fsync_duration_usecs); } /* * Sync the log file if needed. */ if (F_ISSET(slot, WT_SLOT_SYNC) && __wt_log_cmp(&log->sync_lsn, &slot->slot_end_lsn) < 0) { __wt_verbose(session, WT_VERB_LOG, "log_release: sync log %s to LSN %" PRIu32 "/%" PRIu32, log->log_fh->name, sync_lsn.l.file, sync_lsn.l.offset); WT_STAT_CONN_INCR(session, log_sync); time_start = __wt_clock(session); WT_ERR(__wt_fsync(session, log->log_fh, true)); time_stop = __wt_clock(session); fsync_duration_usecs = WT_CLOCKDIFF_US(time_stop, time_start); WT_STAT_CONN_INCRV(session, log_sync_duration, fsync_duration_usecs); WT_ASSIGN_LSN(&log->sync_lsn, &sync_lsn); __wt_cond_signal(session, log->log_sync_cond); } /* * Clear the flags before leaving the loop. */ F_CLR(slot, WT_SLOT_SYNC | WT_SLOT_SYNC_DIR); locked = false; __wt_spin_unlock(session, &log->log_sync_lock); } err: if (locked) __wt_spin_unlock(session, &log->log_sync_lock); if (ret != 0 && slot->slot_error == 0) slot->slot_error = ret; return (ret); } /* * __log_salvage_message -- * Show messages consistently for a salvageable error. */ static int __log_salvage_message( WT_SESSION_IMPL *session, const char *log_name, const char *extra_msg, wt_off_t offset) { WT_RET(__wt_msg(session, "log file %s corrupted%s at position %" PRIuMAX ", truncated", log_name, extra_msg, (uintmax_t)offset)); F_SET(S2C(session), WT_CONN_DATA_CORRUPTION); return (WT_ERROR); } /* * __wt_log_scan -- * Scan the logs, calling a function on each record found. */ int __wt_log_scan(WT_SESSION_IMPL *session, WT_LSN *start_lsnp, WT_LSN *end_lsnp, uint32_t flags, int (*func)(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, WT_LSN *next_lsnp, void *cookie, int firstrecord), void *cookie) { WT_CONNECTION_IMPL *conn; WT_DECL_ITEM(buf); WT_DECL_ITEM(decryptitem); WT_DECL_ITEM(uncitem); WT_DECL_RET; WT_FH *log_fh; WT_ITEM *cbbuf; WT_LOG *log; WT_LOG_RECORD *logrec; WT_LSN end_lsn, next_lsn, prev_eof, prev_lsn, rd_lsn, start_lsn; wt_off_t bad_offset, log_size; uint32_t allocsize, firstlog, lastlog, lognum, rdup_len, reclen; uint16_t version; u_int i, logcount; int firstrecord; char **logfiles; bool corrupt, eol, need_salvage, partial_record; conn = S2C(session); log = conn->log; log_fh = NULL; logcount = 0; logfiles = NULL; corrupt = eol = false; firstrecord = 1; need_salvage = false; /* * If the caller did not give us a callback function there is nothing to do. */ if (func == NULL) return (0); if (start_lsnp != NULL && LF_ISSET(WT_LOGSCAN_FIRST | WT_LOGSCAN_FROM_CKP)) WT_RET_MSG(session, WT_ERROR, "choose either a start LSN or a start flag"); /* * Set up the allocation size, starting and ending LSNs. The values for those depend on whether * logging is currently enabled or not. */ lastlog = 0; if (log != NULL) { allocsize = log->allocsize; WT_ASSIGN_LSN(&end_lsn, &log->alloc_lsn); WT_ASSIGN_LSN(&start_lsn, &log->first_lsn); if (start_lsnp == NULL) { if (LF_ISSET(WT_LOGSCAN_FROM_CKP)) WT_ASSIGN_LSN(&start_lsn, &log->ckpt_lsn); else if (!LF_ISSET(WT_LOGSCAN_FIRST)) WT_RET_MSG(session, WT_ERROR, "WT_LOGSCAN_FIRST not set"); } lastlog = log->fileid; } else { /* * If logging is not configured, we can still print out the log if log files exist. We just * need to set the LSNs from what is in the files versus what is in the live connection. */ /* * Set allocsize to the minimum alignment it could be. Larger records and larger allocation * boundaries should always be a multiple of this. */ allocsize = WT_LOG_ALIGN; firstlog = UINT32_MAX; WT_RET(__log_get_files(session, WT_LOG_FILENAME, &logfiles, &logcount)); if (logcount == 0) WT_RET_MSG(session, ENOTSUP, "no log files found"); for (i = 0; i < logcount; i++) { WT_ERR(__wt_log_extract_lognum(session, logfiles[i], &lognum)); lastlog = WT_MAX(lastlog, lognum); firstlog = WT_MIN(firstlog, lognum); } WT_SET_LSN(&start_lsn, firstlog, 0); WT_SET_LSN(&end_lsn, lastlog, 0); WT_ERR(__wt_fs_directory_list_free(session, &logfiles, logcount)); } if (start_lsnp != NULL) { /* * Offsets must be on allocation boundaries. An invalid LSN from a user should just return * WT_NOTFOUND. It is not an error. But if it is from recovery, we expect valid LSNs so give * more information about that. */ if (start_lsnp->l.offset % allocsize != 0) { if (LF_ISSET(WT_LOGSCAN_RECOVER | WT_LOGSCAN_RECOVER_METADATA)) WT_ERR_MSG(session, WT_NOTFOUND, "__wt_log_scan unaligned LSN %" PRIu32 "/%" PRIu32, start_lsnp->l.file, start_lsnp->l.offset); else WT_ERR(WT_NOTFOUND); } /* * If the file is in the future it doesn't exist. An invalid LSN from a user should just * return WT_NOTFOUND. It is not an error. But if it is from recovery, we expect valid LSNs * so give more information about that. */ if (start_lsnp->l.file > lastlog) { if (LF_ISSET(WT_LOGSCAN_RECOVER | WT_LOGSCAN_RECOVER_METADATA)) WT_ERR_MSG(session, WT_NOTFOUND, "__wt_log_scan LSN %" PRIu32 "/%" PRIu32 " larger than biggest log file %" PRIu32, start_lsnp->l.file, start_lsnp->l.offset, lastlog); else WT_ERR(WT_NOTFOUND); } /* * Log cursors may not know the starting LSN. If an LSN is passed in that it is equal to the * smallest LSN, start from the beginning of the log. */ if (!WT_IS_INIT_LSN(start_lsnp)) WT_ASSIGN_LSN(&start_lsn, start_lsnp); } WT_ERR(__log_open_verify(session, start_lsn.l.file, &log_fh, &prev_lsn, NULL, &need_salvage)); if (need_salvage) WT_ERR_MSG(session, WT_ERROR, "log file requires salvage"); WT_ERR(__wt_filesize(session, log_fh, &log_size)); WT_ASSIGN_LSN(&rd_lsn, &start_lsn); if (LF_ISSET(WT_LOGSCAN_RECOVER | WT_LOGSCAN_RECOVER_METADATA)) __wt_verbose(session, WT_VERB_RECOVERY_PROGRESS, "Recovering log %" PRIu32 " through %" PRIu32, rd_lsn.l.file, end_lsn.l.file); WT_ERR(__wt_scr_alloc(session, WT_LOG_ALIGN, &buf)); WT_ERR(__wt_scr_alloc(session, 0, &decryptitem)); WT_ERR(__wt_scr_alloc(session, 0, &uncitem)); for (;;) { if (rd_lsn.l.offset + allocsize > log_size) { advance: if (rd_lsn.l.offset == log_size) partial_record = false; else { /* * See if there is anything non-zero at the end of this log file. */ WT_ERR(__log_has_hole( session, log_fh, log_size, rd_lsn.l.offset, &bad_offset, &partial_record)); if (bad_offset != 0) { need_salvage = true; WT_ERR(__log_salvage_message(session, log_fh->name, "", bad_offset)); } } /* * If we read the last record, go to the next file. */ WT_ERR(__wt_close(session, &log_fh)); log_fh = NULL; eol = true; /* * Truncate this log file before we move to the next. */ if (LF_ISSET(WT_LOGSCAN_RECOVER) && __wt_log_cmp(&rd_lsn, &log->trunc_lsn) < 0) { __wt_verbose(session, WT_VERB_LOG, "Truncate end of log %" PRIu32 "/%" PRIu32, rd_lsn.l.file, rd_lsn.l.offset); WT_ERR(__log_truncate(session, &rd_lsn, true, false)); } /* * If we had a partial record, we'll want to break now after closing and truncating. * Although for now log_truncate does not modify the LSN passed in, this code does not * assume it is unmodified after that call which is why it uses the boolean set earlier. */ if (partial_record) break; /* * Avoid an error message when we reach end of log by checking here. */ prev_eof = rd_lsn; WT_SET_LSN(&rd_lsn, rd_lsn.l.file + 1, 0); if (rd_lsn.l.file > end_lsn.l.file) break; if (LF_ISSET(WT_LOGSCAN_RECOVER | WT_LOGSCAN_RECOVER_METADATA)) __wt_verbose(session, WT_VERB_RECOVERY_PROGRESS, "Recovering log %" PRIu32 " through %" PRIu32, rd_lsn.l.file, end_lsn.l.file); WT_ERR(__log_open_verify( session, rd_lsn.l.file, &log_fh, &prev_lsn, &version, &need_salvage)); if (need_salvage) WT_ERR_MSG(session, WT_ERROR, "log file requires salvage"); /* * Opening the log file reads with verify sets up the previous LSN from the first * record. This detects a "hole" at the end of the previous log file. */ if (LF_ISSET(WT_LOGSCAN_RECOVER) && !WT_IS_INIT_LSN(&prev_lsn) && !WT_IS_ZERO_LSN(&prev_lsn) && prev_lsn.l.offset != prev_eof.l.offset) { WT_ASSERT(session, prev_eof.l.file == prev_lsn.l.file); break; } /* * If we read a current version log file without a previous LSN record the log ended * after writing that header. We're done. */ if (LF_ISSET(WT_LOGSCAN_RECOVER) && version == WT_LOG_VERSION_SYSTEM && WT_IS_ZERO_LSN(&prev_lsn)) { __wt_verbose(session, WT_VERB_LOG, "log_scan: Stopping, no system record detected in %s.", log_fh->name); break; } WT_ERR(__wt_filesize(session, log_fh, &log_size)); eol = false; continue; } /* * Read the minimum allocation size a record could be. Conditionally set the need_salvage * flag so that if the read fails, we know this is an situation we can salvage. */ WT_ASSERT(session, buf->memsize >= allocsize); need_salvage = F_ISSET(conn, WT_CONN_SALVAGE); WT_ERR(__log_fs_read(session, log_fh, rd_lsn.l.offset, (size_t)allocsize, buf->mem)); need_salvage = false; /* * See if we need to read more than the allocation size. We expect that we rarely will have * to read more. Most log records will be fairly small. */ reclen = ((WT_LOG_RECORD *)buf->mem)->len; #ifdef WORDS_BIGENDIAN reclen = __wt_bswap32(reclen); #endif /* * Log files are pre-allocated. We need to detect the difference between a hole in the file * (where this location would be considered the end of log) and the last record in the log * and we're at the zeroed part of the file. If we find a zeroed record, scan forward in the * log looking for any data. If we detect any we have a hole and stop. Otherwise if the rest * is all zeroes advance to the next file. When recovery finds the end of the log, truncate * the file and remove any later log files that may exist. */ if (reclen == 0) { WT_ERR(__log_has_hole(session, log_fh, log_size, rd_lsn.l.offset, &bad_offset, &eol)); if (bad_offset != 0) { need_salvage = true; WT_ERR(__log_salvage_message(session, log_fh->name, "", bad_offset)); } if (eol) /* Found a hole. This LSN is the end. */ break; /* Last record in log. Look for more. */ goto advance; } rdup_len = __wt_rduppo2(reclen, allocsize); if (reclen > allocsize) { /* * The log file end could be the middle of this log record. If we have a partially * written record then this is considered the end of the log. */ if (rd_lsn.l.offset + rdup_len > log_size) { eol = true; break; } /* * We need to round up and read in the full padded record, especially for direct I/O. */ WT_ERR(__wt_buf_grow(session, buf, rdup_len)); WT_ERR(__log_fs_read(session, log_fh, rd_lsn.l.offset, (size_t)rdup_len, buf->mem)); WT_STAT_CONN_INCR(session, log_scan_rereads); } /* * We read in the record, now verify the checksum. A failed checksum does not imply * corruption, it may be the result of a partial write. */ buf->size = reclen; logrec = (WT_LOG_RECORD *)buf->mem; if (!__log_checksum_match(buf, reclen)) { /* * A checksum mismatch means we have reached the end of the useful part of the log. This * should be found on the first pass through recovery. In the second pass where we * truncate the log, this is where it should end. Continue processing where possible, so * remember any error returns, but don't skip to the error handler. */ if (log != NULL) WT_ASSIGN_LSN(&log->trunc_lsn, &rd_lsn); /* * If the user asked for a specific LSN and it is not a valid LSN, return WT_NOTFOUND. */ if (LF_ISSET(WT_LOGSCAN_ONE)) ret = WT_NOTFOUND; /* * When we have a checksum mismatch, we would like * to determine whether it may be the result of: * 1) some expected corruption that can occur during * backups * 2) a partial write that can naturally occur when * an application crashes * 3) some other corruption * so that we can (in case 3) flag cases of file system * or hardware failures. Unfortunately, we have found * on some systems that file system writes may in fact * be lost, and this can readily be triggered with * normal operations. Rather than force users to * salvage in these situations, we merely truncate the * log at this point and issue a message. */ if (F_ISSET(conn, WT_CONN_WAS_BACKUP)) break; if (!__log_check_partial_write(session, buf, reclen)) { /* * It's not a partial write, and we have a bad checksum. We treat it as a corruption * that must be salvaged. */ need_salvage = true; WT_TRET( __log_salvage_message(session, log_fh->name, ", bad checksum", rd_lsn.l.offset)); } else { /* * It may be a partial write, or it's possible that the header is corrupt. Make a * sanity check of the log record header. */ WT_TRET(__log_record_verify(session, log_fh, rd_lsn.l.offset, logrec, &corrupt)); if (corrupt) { need_salvage = true; WT_TRET(__log_salvage_message(session, log_fh->name, "", rd_lsn.l.offset)); } } break; } __wt_log_record_byteswap(logrec); /* * We have a valid log record. If it is not the log file header, invoke the callback. */ WT_STAT_CONN_INCR(session, log_scan_records); WT_ASSIGN_LSN(&next_lsn, &rd_lsn); next_lsn.l.offset += rdup_len; if (rd_lsn.l.offset != 0) { /* * We need to manage the different buffers here. Buf is the buffer this function uses to * read from the disk. The callback buffer may change based on whether encryption and * compression are used. * * We want to free any buffers from compression and encryption but keep the one we use * for reading. */ cbbuf = buf; if (F_ISSET(logrec, WT_LOG_RECORD_ENCRYPTED)) { WT_ERR(__log_decrypt(session, cbbuf, decryptitem)); cbbuf = decryptitem; } if (F_ISSET(logrec, WT_LOG_RECORD_COMPRESSED)) { WT_ERR(__log_decompress(session, cbbuf, uncitem)); cbbuf = uncitem; } WT_ERR((*func)(session, cbbuf, &rd_lsn, &next_lsn, cookie, firstrecord)); firstrecord = 0; if (LF_ISSET(WT_LOGSCAN_ONE)) break; } /* * Exit the scanning loop if the next LSN seen is greater than our user set end range LSN. */ if (end_lsnp != NULL && __wt_log_cmp(&next_lsn, end_lsnp) > 0) break; WT_ASSIGN_LSN(&rd_lsn, &next_lsn); } /* Truncate if we're in recovery. */ if (LF_ISSET(WT_LOGSCAN_RECOVER) && __wt_log_cmp(&rd_lsn, &log->trunc_lsn) < 0) { __wt_verbose(session, WT_VERB_LOG, "End of recovery truncate end of log %" PRIu32 "/%" PRIu32, rd_lsn.l.file, rd_lsn.l.offset); /* Preserve prior error and fall through to error handling. */ WT_TRET(__log_truncate(session, &rd_lsn, false, false)); } err: WT_STAT_CONN_INCR(session, log_scans); /* * If we are salvaging and failed a salvageable operation, then truncate the log at the fail * point. */ if (ret != 0 && ret != WT_PANIC && need_salvage) { WT_TRET(__wt_close(session, &log_fh)); log_fh = NULL; WT_TRET(__log_truncate(session, &rd_lsn, false, true)); ret = 0; } /* * If the first attempt to read a log record results in an error recovery is likely going to * fail. Try to provide a helpful failure message. */ if (ret != 0 && firstrecord && LF_ISSET(WT_LOGSCAN_RECOVER | WT_LOGSCAN_RECOVER_METADATA)) { __wt_err(session, ret, "WiredTiger is unable to read the recovery log"); __wt_err(session, ret, "This may be due to the log files being encrypted, being from an older version or due to " "corruption on disk"); __wt_err(session, ret, "You should confirm that you have opened the database with the correct options including " "all encryption and compression options"); } WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount)); __wt_scr_free(session, &buf); __wt_scr_free(session, &decryptitem); __wt_scr_free(session, &uncitem); /* * If the caller wants one record and it is at the end of log, return WT_NOTFOUND. */ if (LF_ISSET(WT_LOGSCAN_ONE) && eol && ret == 0) ret = WT_NOTFOUND; WT_TRET(__wt_close(session, &log_fh)); return (ret); } /* * __wt_log_force_write -- * Force a switch and release and write of the current slot. Wrapper function that takes the * lock. */ int __wt_log_force_write(WT_SESSION_IMPL *session, bool retry, bool *did_work) { WT_LOG *log; WT_MYSLOT myslot; log = S2C(session)->log; memset(&myslot, 0, sizeof(myslot)); WT_STAT_CONN_INCR(session, log_force_write); if (did_work != NULL) *did_work = true; myslot.slot = log->active_slot; return (__wt_log_slot_switch(session, &myslot, retry, true, did_work)); } /* * __wt_log_write -- * Write a record into the log, compressing as necessary. */ int __wt_log_write(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, uint32_t flags) { WT_COMPRESSOR *compressor; WT_CONNECTION_IMPL *conn; WT_DECL_ITEM(citem); WT_DECL_ITEM(eitem); WT_DECL_RET; WT_ITEM *ip; WT_KEYED_ENCRYPTOR *kencryptor; WT_LOG *log; WT_LOG_RECORD *newlrp; size_t dst_len, len, new_size, result_len, src_len; uint8_t *dst, *src; int compression_failed; conn = S2C(session); log = conn->log; /* * An error during opening the logging subsystem can result in it being enabled, but without an * open log file. In that case, just return. We can also have logging opened for reading in a * read-only database and attempt to write a record on close. */ if (!F_ISSET(log, WT_LOG_OPENED) || F_ISSET(conn, WT_CONN_READONLY)) return (0); ip = record; if ((compressor = conn->log_compressor) != NULL && record->size < log->allocsize) { WT_STAT_CONN_INCR(session, log_compress_small); } else if (compressor != NULL) { /* Skip the log header */ src = (uint8_t *)record->mem + WT_LOG_COMPRESS_SKIP; src_len = record->size - WT_LOG_COMPRESS_SKIP; /* * Compute the size needed for the destination buffer. We only allocate enough memory for a * copy of the original by default, if any compressed version is bigger than the original, * we won't use it. However, some compression engines (snappy is one example), may need more * memory because they don't stop just because there's no more memory into which to * compress. */ if (compressor->pre_size == NULL) len = src_len; else WT_ERR(compressor->pre_size(compressor, &session->iface, src, src_len, &len)); new_size = len + WT_LOG_COMPRESS_SKIP; WT_ERR(__wt_scr_alloc(session, new_size, &citem)); /* Skip the header bytes of the destination data. */ dst = (uint8_t *)citem->mem + WT_LOG_COMPRESS_SKIP; dst_len = len; compression_failed = 0; WT_ERR(compressor->compress(compressor, &session->iface, src, src_len, dst, dst_len, &result_len, &compression_failed)); result_len += WT_LOG_COMPRESS_SKIP; /* * If compression fails, or doesn't gain us at least one unit of allocation, fallback to the * original version. This isn't unexpected: if compression doesn't work for some chunk of * data for some reason (noting likely additional format/header information which compressed * output requires), it just means the uncompressed version is as good as it gets, and * that's what we use. */ if (compression_failed || result_len / log->allocsize >= record->size / log->allocsize) WT_STAT_CONN_INCR(session, log_compress_write_fails); else { WT_STAT_CONN_INCR(session, log_compress_writes); WT_STAT_CONN_INCRV(session, log_compress_mem, record->size); WT_STAT_CONN_INCRV(session, log_compress_len, result_len); /* * Copy in the skipped header bytes, set the final data size. */ memcpy(citem->mem, record->mem, WT_LOG_COMPRESS_SKIP); citem->size = result_len; ip = citem; newlrp = (WT_LOG_RECORD *)citem->mem; F_SET(newlrp, WT_LOG_RECORD_COMPRESSED); WT_ASSERT(session, result_len < UINT32_MAX && record->size < UINT32_MAX); newlrp->mem_len = WT_STORE_SIZE(record->size); } } if ((kencryptor = conn->kencryptor) != NULL) { /* * Allocate enough space for the original record plus the encryption size constant plus the * length we store. */ __wt_encrypt_size(session, kencryptor, ip->size, &new_size); WT_ERR(__wt_scr_alloc(session, new_size, &eitem)); WT_ERR(__wt_encrypt(session, kencryptor, WT_LOG_ENCRYPT_SKIP, ip, eitem)); /* * Final setup of new buffer. Set the flag for encryption in the record header. */ ip = eitem; newlrp = (WT_LOG_RECORD *)eitem->mem; F_SET(newlrp, WT_LOG_RECORD_ENCRYPTED); WT_ASSERT(session, new_size < UINT32_MAX && ip->size < UINT32_MAX); } ret = __log_write_internal(session, ip, lsnp, flags); err: __wt_scr_free(session, &citem); __wt_scr_free(session, &eitem); return (ret); } /* * __log_write_internal -- * Write a record into the log. */ static int __log_write_internal(WT_SESSION_IMPL *session, WT_ITEM *record, WT_LSN *lsnp, uint32_t flags) { WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LOG *log; WT_LOG_RECORD *logrec; WT_LSN lsn; WT_MYSLOT myslot; int64_t release_size; uint32_t fill_size, force, rdup_len; bool free_slot; conn = S2C(session); log = conn->log; if (record->size > UINT32_MAX) WT_RET_MSG(session, EFBIG, "Log record size of %" WT_SIZET_FMT " exceeds the maximum supported size of %" PRIu32, record->size, UINT32_MAX); WT_INIT_LSN(&lsn); myslot.slot = NULL; memset(&myslot, 0, sizeof(myslot)); /* * Assume the WT_ITEM the caller passed is a WT_LOG_RECORD, which has a header at the beginning * for us to fill in. * * If using direct_io, the caller should pass us an aligned record. But we need to make sure it * is big enough and zero-filled so that we can write the full amount. Do this whether or not * direct_io is in use because it makes the reading code cleaner. */ WT_STAT_CONN_INCRV(session, log_bytes_payload, record->size); rdup_len = __wt_rduppo2((uint32_t)record->size, log->allocsize); WT_ERR(__wt_buf_grow(session, record, rdup_len)); WT_ASSERT(session, record->data == record->mem); /* * If the caller's record only partially fills the necessary space, we need to zero-fill the * remainder. * * The cast is safe, we've already checked to make sure it's in range. */ fill_size = rdup_len - (uint32_t)record->size; if (fill_size != 0) { memset((uint8_t *)record->mem + record->size, 0, fill_size); /* * Set the last byte of the log record to a non-zero value, that allows us, on the input * side, to tell that a log record was completely written; there couldn't have been a * partial write. That means that any checksum mismatch in those conditions is a log * corruption. * * Without this changed byte, when we see a zeroed last byte, we must always treat a * checksum error as a possible partial write. Since partial writes can happen as a result * of an interrupted process (for example, a shutdown), we must treat a checksum error as a * normal occurrence, and merely the place where the log must be truncated. So any real * corruption within log records is hard to detect as such. * * However, we can only make this modification if there is more than one byte being filled, * as the first zero byte past the actual record is needed to terminate the loop in * txn_commit_apply. * * This is not a log format change, as we only are changing a byte in the padding portion of * a record, and no logging code has ever checked that it is any particular value up to now. */ if (fill_size > 1) *((uint8_t *)record->mem + rdup_len - 1) = WT_DEBUG_BYTE; record->size = rdup_len; } /* * Checksum a little-endian version of the header, and write everything in little-endian format. * The checksum is (potentially) returned in a big-endian format, swap it into place in a * separate step. */ logrec = (WT_LOG_RECORD *)record->mem; logrec->len = (uint32_t)record->size; logrec->checksum = 0; __wt_log_record_byteswap(logrec); logrec->checksum = __wt_checksum(logrec, record->size); #ifdef WORDS_BIGENDIAN logrec->checksum = __wt_bswap32(logrec->checksum); #endif WT_STAT_CONN_INCR(session, log_writes); /* * The only time joining a slot should ever return an error is if it detects a panic. */ __wt_log_slot_join(session, rdup_len, flags, &myslot); /* * If the addition of this record crosses the buffer boundary, switch in a new slot. */ force = LF_ISSET(WT_LOG_FLUSH | WT_LOG_FSYNC); ret = 0; if (myslot.end_offset >= WT_LOG_SLOT_BUF_MAX || F_ISSET(&myslot, WT_MYSLOT_UNBUFFERED) || force) ret = __wt_log_slot_switch(session, &myslot, true, false, NULL); if (ret == 0) ret = __wt_log_fill(session, &myslot, false, record, &lsn); release_size = __wt_log_slot_release(&myslot, (int64_t)rdup_len); /* * If we get an error we still need to do proper accounting in the slot fields. XXX On error we * may still need to call release and free. */ if (ret != 0) myslot.slot->slot_error = ret; WT_ASSERT(session, ret == 0); if (WT_LOG_SLOT_DONE(release_size)) { WT_ERR(__wt_log_release(session, myslot.slot, &free_slot)); if (free_slot) __wt_log_slot_free(session, myslot.slot); } else if (force) { /* * If we are going to wait for this slot to get written, signal the log server thread. * * XXX I've seen times when conditions are NULL. */ if (conn->log_cond != NULL) { __wt_cond_signal(session, conn->log_cond); __wt_yield(); } else WT_ERR(__wt_log_force_write(session, 1, NULL)); } if (LF_ISSET(WT_LOG_FLUSH)) { /* Wait for our writes to reach the OS */ while (__wt_log_cmp(&log->write_lsn, &lsn) <= 0 && myslot.slot->slot_error == 0) __wt_cond_wait(session, log->log_write_cond, 10000, NULL); } else if (LF_ISSET(WT_LOG_FSYNC)) { /* Wait for our writes to reach disk */ while (__wt_log_cmp(&log->sync_lsn, &lsn) <= 0 && myslot.slot->slot_error == 0) __wt_cond_wait(session, log->log_sync_cond, 10000, NULL); } err: if (ret == 0 && lsnp != NULL) *lsnp = lsn; /* * If we're synchronous and some thread had an error, we don't know if our write made it out to * the file or not. The error could be before or after us. So, if anyone got an error, we report * it. If we're not synchronous, only report if our own operation got an error. */ if (LF_ISSET(WT_LOG_DSYNC | WT_LOG_FSYNC) && ret == 0 && myslot.slot != NULL) ret = myslot.slot->slot_error; /* * If one of the sync flags is set, assert the proper LSN has moved to match on success. */ WT_ASSERT( session, ret != 0 || !LF_ISSET(WT_LOG_FLUSH) || __wt_log_cmp(&log->write_lsn, &lsn) >= 0); WT_ASSERT( session, ret != 0 || !LF_ISSET(WT_LOG_FSYNC) || __wt_log_cmp(&log->sync_lsn, &lsn) >= 0); return (ret); } /* * __wt_log_vprintf -- * Write a message into the log. */ int __wt_log_vprintf(WT_SESSION_IMPL *session, const char *fmt, va_list ap) { WT_CONNECTION_IMPL *conn; WT_DECL_ITEM(logrec); WT_DECL_RET; size_t header_size, len; uint32_t rectype; const char *rec_fmt; va_list ap_copy; conn = S2C(session); rectype = WT_LOGREC_MESSAGE; rec_fmt = WT_UNCHECKED_STRING(I); if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) return (0); va_copy(ap_copy, ap); len = 1; ret = __wt_vsnprintf_len_incr(NULL, 0, &len, fmt, ap_copy); va_end(ap_copy); WT_RET(ret); WT_RET(__wt_logrec_alloc(session, sizeof(WT_LOG_RECORD) + len, &logrec)); /* * We're writing a record with the type (an integer) followed by a string (NUL-terminated data). * To avoid writing the string into a buffer before copying it, we write the header first, then * the raw bytes of the string. */ WT_ERR(__wt_struct_size(session, &header_size, rec_fmt, rectype)); WT_ERR(__wt_struct_pack( session, (uint8_t *)logrec->data + logrec->size, header_size, rec_fmt, rectype)); logrec->size += (uint32_t)header_size; WT_ERR(__wt_vsnprintf((char *)logrec->data + logrec->size, len, fmt, ap)); __wt_verbose(session, WT_VERB_LOG, "log_printf: %s", (char *)logrec->data + logrec->size); logrec->size += len; WT_ERR(__wt_log_write(session, logrec, NULL, 0)); err: __wt_scr_free(session, &logrec); return (ret); } /* * __wt_log_flush -- * Forcibly flush the log to the synchronization level specified. Wait until it has been * completed. */ int __wt_log_flush(WT_SESSION_IMPL *session, uint32_t flags) { WT_CONNECTION_IMPL *conn; WT_LOG *log; WT_LSN last_lsn, lsn; conn = S2C(session); WT_ASSERT(session, FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)); log = conn->log; /* * We need to flush out the current slot first to get the real end of log LSN in log->alloc_lsn. */ WT_RET(__wt_log_flush_lsn(session, &lsn, false)); WT_ASSIGN_LSN(&last_lsn, &log->alloc_lsn); /* * If the last write caused a switch to a new log file, we should only wait for the last write * to be flushed. Otherwise, if the workload is single-threaded we could wait here forever * because the write LSN doesn't switch into the new file until it contains a record. */ if (last_lsn.l.offset == log->first_record) WT_ASSIGN_LSN(&last_lsn, &log->log_close_lsn); /* * Wait until all current outstanding writes have been written to the file system. */ while (__wt_log_cmp(&last_lsn, &lsn) > 0) { __wt_sleep(0, WT_THOUSAND); WT_RET(__wt_log_flush_lsn(session, &lsn, false)); } __wt_verbose(session, WT_VERB_LOG, "log_flush: flags %#" PRIx32 " LSN %" PRIu32 "/%" PRIu32, flags, lsn.l.file, lsn.l.offset); /* * If the user wants write-no-sync, there is nothing more to do. If the user wants background * sync, set the LSN and we're done. If the user wants sync, force it now. */ if (LF_ISSET(WT_LOG_FSYNC)) WT_RET(__wt_log_force_sync(session, &lsn)); return (0); }