/*-
 * Copyright (c) 2014-present MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __logmgr_sync_cfg --
 *     Interpret the transaction_sync config.
 */
static int
__logmgr_sync_cfg(WT_SESSION_IMPL *session, const char **cfg)
{
    WT_CONFIG_ITEM cval;
    WT_CONNECTION_IMPL *conn;
    uint32_t txn_logsync;

    conn = S2C(session);

    /*
     * Collect all the flag settings into a local variable and then assign into the connection after
     * we're done so that there is no chance of another thread seeing an interim value while we're
     * processing during a reconfigure.
     */
    txn_logsync = 0;
    WT_RET(__wt_config_gets(session, cfg, "transaction_sync.enabled", &cval));
    if (cval.val)
        FLD_SET(txn_logsync, WT_LOG_SYNC_ENABLED);
    else
        FLD_CLR(txn_logsync, WT_LOG_SYNC_ENABLED);

    WT_RET(__wt_config_gets(session, cfg, "transaction_sync.method", &cval));
    if (WT_STRING_MATCH("dsync", cval.str, cval.len))
        FLD_SET(txn_logsync, WT_LOG_DSYNC | WT_LOG_FLUSH);
    else if (WT_STRING_MATCH("fsync", cval.str, cval.len))
        FLD_SET(txn_logsync, WT_LOG_FSYNC);
    else if (WT_STRING_MATCH("none", cval.str, cval.len))
        FLD_SET(txn_logsync, WT_LOG_FLUSH);
    WT_PUBLISH(conn->txn_logsync, txn_logsync);
    return (0);
}

/*
 * __logmgr_force_remove --
 *     Force a checkpoint out and then force a removal, waiting for the first log to be removed up
 *     to the given log number.
 */
static int
__logmgr_force_remove(WT_SESSION_IMPL *session, uint32_t lognum)
{
    WT_CONNECTION_IMPL *conn;
    WT_LOG *log;
    WT_SESSION_IMPL *tmp_session;
    uint64_t sleep_usecs, yield_cnt;

    conn = S2C(session);
    log = conn->log;
    sleep_usecs = yield_cnt = 0;

    WT_RET(__wt_open_internal_session(conn, "compatibility-reconfig", true, 0, 0, &tmp_session));
    while (log->first_lsn.l.file < lognum) {
        /*
         * Force a checkpoint to be written in the new log file and force the removal of all
         * previous log files. We do the checkpoint in the loop because the checkpoint LSN in the
         * log record could still reflect the previous log file in cases such as the write LSN has
         * not yet advanced into the new log file due to another group of threads still in progress
         * with their slot copies or writes.
         */
        WT_RET(tmp_session->iface.checkpoint(&tmp_session->iface, "force=1"));
        /*
         * It's reasonable to start the back off prior to trying at all because the backoff is very
         * gradual.
         */
        __wt_spin_backoff(&yield_cnt, &sleep_usecs);
        WT_STAT_CONN_INCRV(session, log_force_remove_sleep, sleep_usecs);

        WT_RET(WT_SESSION_CHECK_PANIC(tmp_session));
        WT_RET(__wt_log_truncate_files(tmp_session, NULL, true));
    }
    WT_RET(__wt_session_close_internal(tmp_session));
    return (0);
}

/*
 * __logmgr_get_log_version --
 *     Get the log version required for the given WiredTiger version.
 */
static uint16_t
__logmgr_get_log_version(WT_VERSION version)
{
    if (!__wt_version_defined(version))
        return (WT_NO_VALUE);

    if (__wt_version_lt(version, WT_LOG_V2_VERSION))
        return (1);
    else if (__wt_version_lt(version, WT_LOG_V3_VERSION))
        return (2);
    else if (__wt_version_lt(version, WT_LOG_V4_VERSION))
        return (3);
    else if (__wt_version_lt(version, WT_LOG_V5_VERSION))
        return (4);
    else
        return (WT_LOG_VERSION);
}

/*
 * __wt_logmgr_compat_version --
 *     Set up the compatibility versions in the log manager. This is split out because it is called
 *     much earlier than log subsystem creation on startup so that we can verify the system state in
 *     files before modifying files.
 */
void
__wt_logmgr_compat_version(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;

    conn = S2C(session);
    conn->log_req_max = __logmgr_get_log_version(conn->compat_req_max);
    conn->log_req_min = __logmgr_get_log_version(conn->compat_req_min);
}

/*
 * __logmgr_version --
 *     Set up the versions in the log manager.
 */
static int
__logmgr_version(WT_SESSION_IMPL *session, bool reconfig)
{
    WT_CONNECTION_IMPL *conn;
    WT_LOG *log;
    uint32_t first_record, lognum;
    uint16_t new_version;
    bool downgrade;

    conn = S2C(session);
    log = conn->log;
    if (log == NULL)
        return (0);

    /*
     * Set the log file format versions based on compatibility versions set in the connection. The
     * compatibility version must be set at this point. We must set this before we call log_open to
     * open or create a log file.
     */
    WT_ASSERT(session, __wt_version_defined(conn->compat_version));
    new_version = __logmgr_get_log_version(conn->compat_version);

    if (new_version > 1)
        first_record = WT_LOG_END_HEADER + log->allocsize;
    else
        first_record = WT_LOG_END_HEADER;

    __wt_logmgr_compat_version(session);

    /*
     * If the version is the same, there is nothing to do.
     */
    if (log->log_version == new_version)
        return (0);

    /*
     * Note: downgrade in this context means the new version is not the latest possible version. It
     * does not mean the direction of change from the release we may be running currently.
     */
    downgrade = new_version != WT_LOG_VERSION;

    /*
     * If we are reconfiguring and at a new version we need to force the log file to advance so that
     * we write out a log file at the correct version. When we are downgrading we must force a
     * checkpoint and finally log removal, even if disabled, so that all new version log files are
     * gone.
     *
     * All of the version changes must be handled with locks on reconfigure because other threads
     * may be changing log files, using pre-allocated files.
     */
    /*
     * Set the version. If it is a live change the logging subsystem will do other work as well to
     * move to a new log file.
     */
    WT_RET(__wt_log_set_version(session, new_version, first_record, downgrade, reconfig, &lognum));
    if (reconfig && FLD_ISSET(conn->log_flags, WT_CONN_LOG_DOWNGRADED))
        WT_RET(__logmgr_force_remove(session, lognum));
    return (0);
}

/*
 * __wt_logmgr_config --
 *     Parse and setup the logging server options.
 */
int
__wt_logmgr_config(WT_SESSION_IMPL *session, const char **cfg, bool reconfig)
{
    WT_CONFIG_ITEM cval;
    WT_CONNECTION_IMPL *conn;
    bool enabled;

    /*
     * A note on reconfiguration: the standard "is this configuration string allowed" checks should
     * fail if reconfiguration has invalid strings, for example, "log=(enabled)", or
     * "statistics_log=(path=XXX)", because the connection reconfiguration method doesn't allow
     * those strings. Additionally, the base configuration values during reconfiguration are the
     * currently configured values (so we don't revert to default values when repeatedly
     * reconfiguring), and configuration processing of a currently set value should not change the
     * currently set value.
     *
     * In this code path, log server reconfiguration does not stop/restart the log server, so
     * there's no point in re-evaluating configuration strings that cannot be reconfigured, risking
     * bugs in configuration setup, and depending on evaluation of currently set values to always
     * result in the currently set value. Skip tests for any configuration strings which don't make
     * sense during reconfiguration, but don't worry about error reporting because it should never
     * happen.
     */

    conn = S2C(session);

    WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval));
    enabled = cval.val != 0;

    /*
     * If we're reconfiguring, enabled must match the already existing setting.
     *
     * If it is off and the user it turning it on, or it is on and the user is turning it off,
     * return an error.
     *
     * See above: should never happen.
     */
    if (reconfig &&
      ((enabled && !FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) ||
        (!enabled && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))))
        WT_RET_MSG(
          session, EINVAL, "log manager reconfigure: enabled mismatch with existing setting");

    /* Logging is incompatible with in-memory */
    if (enabled) {
        WT_RET(__wt_config_gets(session, cfg, "in_memory", &cval));
        if (cval.val != 0)
            WT_RET_MSG(
              session, EINVAL, "In-memory configuration incompatible with log=(enabled=true)");
    }

    if (enabled)
        FLD_SET(conn->log_flags, WT_CONN_LOG_CONFIG_ENABLED);
    else
        FLD_CLR(conn->log_flags, WT_CONN_LOG_CONFIG_ENABLED);

    /*
     * Setup a log path and compression even if logging is disabled in case we are going to print a
     * log. Only do this on creation. Once a compressor or log path are set they cannot be changed.
     *
     * See above: should never happen.
     */
    if (!reconfig) {
        conn->log_compressor = NULL;
        WT_RET(__wt_config_gets_none(session, cfg, "log.compressor", &cval));
        WT_RET(__wt_compressor_config(session, &cval, &conn->log_compressor));

        conn->log_path = NULL;
        WT_RET(__wt_config_gets(session, cfg, "log.path", &cval));
        WT_RET(__wt_strndup(session, cval.str, cval.len, &conn->log_path));
    }

    /* We are done if logging isn't enabled. */
    if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_CONFIG_ENABLED))
        return (0);

    /*
     * The configuration string log.archive is deprecated, only take it if it's explicitly set by
     * the application, that is, ignore its default value. Look for an explicit log.remove setting,
     * then an explicit log.archive setting, then the default log.remove setting.
     */
    if (__wt_config_gets(session, cfg + 1, "log.remove", &cval) != 0 &&
      __wt_config_gets(session, cfg + 1, "log.archive", &cval) != 0)
        WT_RET(__wt_config_gets(session, cfg, "log.remove", &cval));
    if (cval.val != 0)
        FLD_SET(conn->log_flags, WT_CONN_LOG_REMOVE);

    /*
     * The file size cannot be reconfigured. The amount of memory allocated to the log slots may be
     * based on the log file size at creation and we don't want to re-allocate that memory while
     * running.
     *
     * See above: should never happen.
     */
    if (!reconfig) {
        WT_RET(__wt_config_gets(session, cfg, "log.file_max", &cval));
        conn->log_file_max = (wt_off_t)cval.val;
        /*
         * With the default log file extend configuration or if the log file extension size is
         * larger than the configured maximum log file size, set the log file extension size to the
         * configured maximum log file size.
         */
        if (conn->log_extend_len == WT_CONFIG_UNSET || conn->log_extend_len > conn->log_file_max)
            conn->log_extend_len = conn->log_file_max;
        WT_STAT_CONN_SET(session, log_max_filesize, conn->log_file_max);
    }

    WT_RET(__wt_config_gets(session, cfg, "log.os_cache_dirty_pct", &cval));
    if (cval.val != 0)
        conn->log_dirty_max = (conn->log_file_max * cval.val) / 100;

    /*
     * If pre-allocation is configured, set the initial number to a few. We'll adapt as load
     * dictates.
     */
    WT_RET(__wt_config_gets(session, cfg, "log.prealloc", &cval));
    if (cval.val != 0)
        conn->log_prealloc = 1;

    WT_RET(__wt_config_gets(session, cfg, "log.force_write_wait", &cval));
    if (cval.val != 0)
        conn->log_force_write_wait = (uint32_t)cval.val;

    /*
     * Note it's meaningless to reconfigure this value during runtime, it only matters on create
     * before recovery runs.
     *
     * See above: should never happen.
     */
    if (!reconfig) {
        WT_RET(__wt_config_gets_def(session, cfg, "log.recover", 0, &cval));
        if (WT_STRING_MATCH("error", cval.str, cval.len))
            FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR);
    }

    WT_RET(__wt_config_gets(session, cfg, "log.zero_fill", &cval));
    if (cval.val != 0) {
        if (F_ISSET(conn, WT_CONN_READONLY))
            WT_RET_MSG(
              session, EINVAL, "Read-only configuration incompatible with zero-filling log files");
        FLD_SET(conn->log_flags, WT_CONN_LOG_ZERO_FILL);
    }

    WT_RET(__logmgr_sync_cfg(session, cfg));
    if (conn->log_cond != NULL)
        __wt_cond_signal(session, conn->log_cond);
    return (0);
}

/*
 * __wt_logmgr_reconfig --
 *     Reconfigure logging.
 */
int
__wt_logmgr_reconfig(WT_SESSION_IMPL *session, const char **cfg)
{
    WT_RET(__wt_logmgr_config(session, cfg, true));
    return (__logmgr_version(session, true));
}

/*
 * __log_remove_once_int --
 *     Helper for __log_remove_once. Intended to be called while holding the hot backup read lock.
 */
static int
__log_remove_once_int(
  WT_SESSION_IMPL *session, char **logfiles, u_int logcount, uint32_t min_lognum)
{
    uint32_t lognum;
    u_int i;

    for (i = 0; i < logcount; i++) {
        WT_RET(__wt_log_extract_lognum(session, logfiles[i], &lognum));
        if (lognum < min_lognum)
            WT_RET(__wt_log_remove(session, WT_LOG_FILENAME, lognum));
    }

    return (0);
}

/*
 * __log_remove_once --
 *     Perform one iteration of log removal. Must be called with the log removal lock held.
 */
static int
__log_remove_once(WT_SESSION_IMPL *session, uint32_t backup_file)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_LOG *log;
    uint32_t dbg_val, min_lognum;
    u_int logcount;
    char **logfiles;

    conn = S2C(session);
    log = conn->log;
    logcount = 0;
    logfiles = NULL;

    /*
     * If we're coming from a backup cursor we want the smaller of the last full log file copied in
     * backup or the checkpoint LSN. Otherwise we want the minimum of the last log file written to
     * disk and the checkpoint LSN.
     */
    min_lognum = backup_file == 0 ? WT_MIN(log->ckpt_lsn.l.file, log->sync_lsn.l.file) :
                                    WT_MIN(log->ckpt_lsn.l.file, backup_file);

    /* Adjust the number of log files to retain based on debugging options. */
    WT_ORDERED_READ(dbg_val, conn->debug_ckpt_cnt);
    if (FLD_ISSET(conn->debug_flags, WT_CONN_DEBUG_CKPT_RETAIN) && dbg_val != 0)
        min_lognum = WT_MIN(conn->debug_ckpt[dbg_val - 1].l.file, min_lognum);
    WT_ORDERED_READ(dbg_val, conn->debug_log_cnt);
    if (dbg_val != 0) {
        /*
         * If we're performing checkpoints, apply the retain value as a minimum, increasing the
         * number the log files we keep. If not performing checkpoints, it's an absolute number of
         * log files to keep. This means we can potentially remove log files required for recovery
         * if the number of log files exceeds the configured value and the system has yet to be
         * checkpointed.
         *
         * Check for N+1, that is, we retain N full log files, and one partial.
         */
        if ((dbg_val + 1) >= log->fileid)
            return (0);
        if (WT_IS_INIT_LSN(&log->ckpt_lsn))
            min_lognum = log->fileid - (dbg_val + 1);
        else
            min_lognum = WT_MIN(log->fileid - (dbg_val + 1), min_lognum);
    }
    __wt_verbose(session, WT_VERB_LOG, "log_remove: remove to log number %" PRIu32, min_lognum);

    /*
     * Main remove code. Get the list of all log files and remove any earlier than the minimum log
     * number.
     */
    WT_ERR(__wt_fs_directory_list(session, conn->log_path, WT_LOG_FILENAME, &logfiles, &logcount));

    /*
     * If backup_file is non-zero we know we're coming from an incremental backup cursor. In that
     * case just perform the remove operation without the lock.
     */
    if (backup_file != 0)
        ret = __log_remove_once_int(session, logfiles, logcount, min_lognum);
    else
        WT_WITH_HOTBACKUP_READ_LOCK(
          session, ret = __log_remove_once_int(session, logfiles, logcount, min_lognum), NULL);
    WT_ERR(ret);

    /*
     * Indicate what is our new earliest LSN. It is the start of the log file containing the last
     * checkpoint.
     */
    WT_SET_LSN(&log->first_lsn, min_lognum, 0);

    if (0)
err:
        __wt_err(session, ret, "log removal server error");
    WT_TRET(__wt_fs_directory_list_free(session, &logfiles, logcount));
    return (ret);
}

/*
 * __log_prealloc_once --
 *     Perform one iteration of log pre-allocation.
 */
static int
__log_prealloc_once(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_LOG *log;
    u_int i, reccount;
    char **recfiles;

    conn = S2C(session);
    log = conn->log;
    reccount = 0;
    recfiles = NULL;

    /*
     * Allocate up to the maximum number, accounting for any existing files that may not have been
     * used yet.
     */
    WT_ERR(__wt_fs_directory_list(session, conn->log_path, WT_LOG_PREPNAME, &recfiles, &reccount));

    /*
     * Adjust the number of files to pre-allocate if we find that the critical path had to allocate
     * them since we last ran.
     */
    if (log->prep_missed > 0) {
        conn->log_prealloc += log->prep_missed;
        __wt_verbose(session, WT_VERB_LOG, "Missed %" PRIu32 ". Now pre-allocating up to %" PRIu32,
          log->prep_missed, conn->log_prealloc);
    } else if (reccount > conn->log_prealloc / 2 && conn->log_prealloc > 2) {
        /*
         * If we used less than half, then start adjusting down.
         */
        --conn->log_prealloc;
        __wt_verbose(session, WT_VERB_LOG,
          "Adjust down. Did not use %" PRIu32 ". Now pre-allocating %" PRIu32, reccount,
          conn->log_prealloc);
    }

    WT_STAT_CONN_SET(session, log_prealloc_max, conn->log_prealloc);
    /*
     * Allocate up to the maximum number that we just computed and detected.
     */
    for (i = reccount; i < (u_int)conn->log_prealloc; i++) {
        WT_ERR(__wt_log_allocfile(session, ++log->prep_fileid, WT_LOG_PREPNAME));
        WT_STAT_CONN_INCR(session, log_prealloc_files);
    }
    /*
     * Reset the missed count now. If we missed during pre-allocating the log files, it means the
     * allocation is not keeping up, not that we didn't allocate enough. So we don't just want to
     * keep adding in more.
     */
    log->prep_missed = 0;

    if (0)
err:
        __wt_err(session, ret, "log pre-alloc server error");
    WT_TRET(__wt_fs_directory_list_free(session, &recfiles, reccount));
    return (ret);
}

/*
 * __wt_log_truncate_files --
 *     Truncate log files via remove once. Requires that the server is not currently running.
 */
int
__wt_log_truncate_files(WT_SESSION_IMPL *session, WT_CURSOR *cursor, bool force)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_LOG *log;
    uint32_t backup_file;

    conn = S2C(session);
    if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
        return (0);
    if (!force && FLD_ISSET(conn->server_flags, WT_CONN_SERVER_LOG) &&
      FLD_ISSET(conn->log_flags, WT_CONN_LOG_REMOVE))
        WT_RET_MSG(session, EINVAL, "Attempt to remove manually while a server is running");

    log = conn->log;

    backup_file = 0;
    if (cursor != NULL) {
        WT_ASSERT(session, force == false);
        backup_file = WT_CURSOR_BACKUP_ID(cursor);
    }
    WT_ASSERT(session, backup_file <= log->alloc_lsn.l.file);
    __wt_verbose(
      session, WT_VERB_LOG, "log_truncate_files: remove once up to %" PRIu32, backup_file);

    __wt_writelock(session, &log->log_remove_lock);
    ret = __log_remove_once(session, backup_file);
    __wt_writeunlock(session, &log->log_remove_lock);
    return (ret);
}

/*
 * __log_file_server --
 *     The log file server thread. This worker thread manages log file operations such as closing
 *     and syncing.
 */
static WT_THREAD_RET
__log_file_server(void *arg)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_FH *close_fh;
    WT_LOG *log;
    WT_LSN close_end_lsn;
    WT_SESSION_IMPL *session;
    uint32_t filenum;

    session = arg;
    conn = S2C(session);
    log = conn->log;
    while (FLD_ISSET(conn->server_flags, WT_CONN_SERVER_LOG)) {
        /*
         * If there is a log file to close, make sure any outstanding write operations have
         * completed, then fsync and close it.
         */
        if ((close_fh = log->log_close_fh) != NULL) {
            WT_ERR(__wt_log_extract_lognum(session, close_fh->name, &filenum));
            /*
             * The closing file handle should have a correct close LSN.
             */
            WT_ASSERT(session, log->log_close_lsn.l.file == filenum);

            if (__wt_log_cmp(&log->write_lsn, &log->log_close_lsn) >= 0) {
                /*
                 * We've copied the file handle, clear out the one in the log structure to allow it
                 * to be set again. Copy the LSN before clearing the file handle. Use a barrier to
                 * make sure the compiler does not reorder the following two statements.
                 */
                WT_ASSIGN_LSN(&close_end_lsn, &log->log_close_lsn);
                WT_FULL_BARRIER();
                log->log_close_fh = NULL;
                /*
                 * Set the close_end_lsn to the LSN immediately after ours. That is, the beginning
                 * of the next log file. We need to know the LSN file number of our own close in
                 * case earlier calls are still in progress and the next one to move the sync_lsn
                 * into the next file for later syncs.
                 */
                WT_ERR(__wt_fsync(session, close_fh, true));

                /*
                 * We want to have the file size reflect actual data with minimal pre-allocated
                 * zeroed space. We can't truncate the file during hot backup, or the underlying
                 * file system may not support truncate: both are OK, it's just more work during
                 * cursor traversal.
                 */
                if (conn->hot_backup_start == 0 && conn->log_cursors == 0) {
                    WT_WITH_HOTBACKUP_READ_LOCK(session,
                      ret = __wt_ftruncate(session, close_fh, close_end_lsn.l.offset), NULL);
                    WT_ERR_ERROR_OK(ret, ENOTSUP, false);
                }
                WT_SET_LSN(&close_end_lsn, close_end_lsn.l.file + 1, 0);
                __wt_spin_lock(session, &log->log_sync_lock);
                WT_ERR(__wt_close(session, &close_fh));
                WT_ASSERT(session, __wt_log_cmp(&close_end_lsn, &log->sync_lsn) >= 0);
                WT_ASSIGN_LSN(&log->sync_lsn, &close_end_lsn);
                __wt_cond_signal(session, log->log_sync_cond);
                __wt_spin_unlock(session, &log->log_sync_lock);
            }
        }

        /* Wait until the next event. */
        __wt_cond_wait(session, conn->log_file_cond, 100 * WT_THOUSAND, NULL);
    }

    if (0) {
err:
        WT_IGNORE_RET(__wt_panic(session, ret, "log close server error"));
    }
    __wt_spin_unlock_if_owned(session, &log->log_sync_lock);
    return (WT_THREAD_RET_VALUE);
}

/*
 * Simple structure for sorting written slots.
 */
typedef struct {
    WT_LSN lsn;
    uint32_t slot_index;
} WT_LOG_WRLSN_ENTRY;

/*
 * WT_WRLSN_ENTRY_CMP_LT --
 *	Return comparison of a written slot pair by LSN.
 */
#define WT_WRLSN_ENTRY_CMP_LT(entry1, entry2)        \
    ((entry1).lsn.l.file < (entry2).lsn.l.file ||    \
      ((entry1).lsn.l.file == (entry2).lsn.l.file && \
        (entry1).lsn.l.offset < (entry2).lsn.l.offset))

/*
 * __wt_log_wrlsn --
 *     Process written log slots and attempt to coalesce them if the LSNs are contiguous. The
 *     purpose of this function is to advance the write_lsn in LSN order after the buffer is written
 *     to the log file.
 */
void
__wt_log_wrlsn(WT_SESSION_IMPL *session, int *yield)
{
    WT_CONNECTION_IMPL *conn;
    WT_LOG *log;
    WT_LOGSLOT *coalescing, *slot;
    WT_LOG_WRLSN_ENTRY written[WT_SLOT_POOL];
    WT_LSN save_lsn;
    size_t written_i;
    uint32_t i, save_i;

    conn = S2C(session);
    log = conn->log;
    __wt_spin_lock(session, &log->log_writelsn_lock);
restart:
    coalescing = NULL;
    WT_INIT_LSN(&save_lsn);
    written_i = 0;
    i = 0;

    /*
     * Walk the array once saving any slots that are in the WT_LOG_SLOT_WRITTEN state.
     */
    while (i < WT_SLOT_POOL) {
        save_i = i;
        slot = &log->slot_pool[i++];
        if (slot->slot_state != WT_LOG_SLOT_WRITTEN)
            continue;
        written[written_i].slot_index = save_i;
        WT_ASSIGN_LSN(&written[written_i++].lsn, &slot->slot_release_lsn);
    }
    /*
     * If we found any written slots process them. We sort them based on the release LSN, and then
     * look for them in order.
     */
    if (written_i > 0) {
        if (yield != NULL)
            *yield = 0;
        WT_INSERTION_SORT(written, written_i, WT_LOG_WRLSN_ENTRY, WT_WRLSN_ENTRY_CMP_LT);
        /*
         * We know the written array is sorted by LSN. Go through them either advancing write_lsn or
         * coalesce contiguous ranges of written slots.
         */
        for (i = 0; i < written_i; i++) {
            slot = &log->slot_pool[written[i].slot_index];
            /*
             * The log server thread pushes out slots periodically. Sometimes they are empty slots.
             * If we find an empty slot, where empty means the start and end LSN are the same, free
             * it and continue.
             */
            if (__wt_log_cmp(&slot->slot_start_lsn, &slot->slot_release_lsn) == 0 &&
              __wt_log_cmp(&slot->slot_start_lsn, &slot->slot_end_lsn) == 0) {
                __wt_log_slot_free(session, slot);
                continue;
            }
            if (coalescing != NULL) {
                /*
                 * If the write_lsn changed, we may be able to process slots. Try again.
                 */
                if (__wt_log_cmp(&log->write_lsn, &save_lsn) != 0)
                    goto restart;
                if (__wt_log_cmp(&coalescing->slot_end_lsn, &written[i].lsn) != 0) {
                    coalescing = slot;
                    continue;
                }
                /*
                 * If we get here we have a slot to coalesce and free.
                 */
                coalescing->slot_last_offset = slot->slot_last_offset;
                WT_ASSIGN_LSN(&coalescing->slot_end_lsn, &slot->slot_end_lsn);
                WT_STAT_CONN_INCR(session, log_slot_coalesced);
                /*
                 * Copy the flag for later closing.
                 */
                if (F_ISSET_ATOMIC_16(slot, WT_SLOT_CLOSEFH))
                    F_SET_ATOMIC_16(coalescing, WT_SLOT_CLOSEFH);
            } else {
                /*
                 * If this written slot is not the next LSN, try to start coalescing with later
                 * slots. A synchronous write may update write_lsn so save the last one we saw to
                 * check when coalescing slots.
                 */
                WT_ASSIGN_LSN(&save_lsn, &log->write_lsn);
                if (__wt_log_cmp(&log->write_lsn, &written[i].lsn) != 0) {
                    coalescing = slot;
                    continue;
                }
                /*
                 * If we get here we have a slot to process. Advance the LSN and process the slot.
                 */
                WT_ASSERT(session, __wt_log_cmp(&written[i].lsn, &slot->slot_release_lsn) == 0);
                /*
                 * We need to maintain the starting offset of a log record so that the checkpoint
                 * LSN refers to the beginning of a real record. The last offset in a slot is kept
                 * so that the checkpoint LSN is close to the end of the record.
                 */
                if (slot->slot_start_lsn.l.offset != slot->slot_last_offset)
                    slot->slot_start_lsn.l.offset = (uint32_t)slot->slot_last_offset;
                WT_ASSIGN_LSN(&log->write_start_lsn, &slot->slot_start_lsn);
                WT_ASSIGN_LSN(&log->write_lsn, &slot->slot_end_lsn);
                __wt_cond_signal(session, log->log_write_cond);
                WT_STAT_CONN_INCR(session, log_write_lsn);
                /*
                 * Signal the close thread if needed.
                 */
                if (F_ISSET_ATOMIC_16(slot, WT_SLOT_CLOSEFH))
                    __wt_cond_signal(session, conn->log_file_cond);
            }
            __wt_log_slot_free(session, slot);
        }
    }
    __wt_spin_unlock(session, &log->log_writelsn_lock);
}

/*
 * __log_wrlsn_server --
 *     The log wrlsn server thread.
 */
static WT_THREAD_RET
__log_wrlsn_server(void *arg)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_LOG *log;
    WT_LSN prev;
    WT_SESSION_IMPL *session;
    int yield;
    bool did_work;

    session = arg;
    conn = S2C(session);
    log = conn->log;
    yield = 0;
    WT_INIT_LSN(&prev);
    while (FLD_ISSET(conn->server_flags, WT_CONN_SERVER_LOG)) {
        /*
         * Write out any log record buffers if anything was done since last time. Only call the
         * function to walk the slots if the system is not idle. On an idle system the alloc_lsn
         * will not advance and the written lsn will match the alloc_lsn.
         */
        if (__wt_log_cmp(&prev, &log->alloc_lsn) != 0 ||
          __wt_log_cmp(&log->write_lsn, &log->alloc_lsn) != 0)
            __wt_log_wrlsn(session, &yield);
        else
            WT_STAT_CONN_INCR(session, log_write_lsn_skip);
        prev = log->alloc_lsn;
        did_work = yield == 0;

        /*
         * If __wt_log_wrlsn did work we want to yield instead of sleep.
         */
        if (yield++ < WT_THOUSAND)
            __wt_yield();
        else
            __wt_cond_auto_wait(session, conn->log_wrlsn_cond, did_work, NULL);
    }
    /*
     * On close we need to do this one more time because there could be straggling log writes that
     * need to be written.
     */
    WT_ERR(__wt_log_force_write(session, 1, NULL));
    __wt_log_wrlsn(session, NULL);
    if (0) {
err:
        WT_IGNORE_RET(__wt_panic(session, ret, "log wrlsn server error"));
    }
    return (WT_THREAD_RET_VALUE);
}

/*
 * __log_server --
 *     The log server thread.
 */
static WT_THREAD_RET
__log_server(void *arg)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;
    WT_LOG *log;
    WT_SESSION_IMPL *session;
    uint64_t force_write_time_start, force_write_timediff;
    uint64_t time_start, time_stop, timediff;
    bool did_work, signalled;

    session = arg;
    conn = S2C(session);
    log = conn->log;
    force_write_timediff = 0;
    signalled = false;

    /*
     * Set this to the number of milliseconds we want to run log force write, remove and
     * pre-allocation. Start it so that we run on the first time through.
     */
    timediff = WT_THOUSAND;
    force_write_time_start = time_start = __wt_clock(session);

    /*
     * The log server thread does a variety of work. It forces out any buffered log writes. It
     * pre-allocates log files and it performs log removal. The reason the wrlsn thread does not
     * force out the buffered writes is because we want to process and move the write_lsn forward as
     * quickly as possible. The same reason applies to why the log file server thread does not force
     * out the writes. That thread does fsync calls which can take a long time and we don't want log
     * records sitting in the buffer over the time it takes to sync out an earlier file.
     */
    did_work = true;
    while (FLD_ISSET(conn->server_flags, WT_CONN_SERVER_LOG)) {
        /*
         * Slots depend on future activity. Force out buffered writes in case we are idle. This
         * cannot be part of the wrlsn thread because of interaction advancing the write_lsn and a
         * buffer may need to wait for the write_lsn to advance in the case of a synchronous buffer.
         * We end up with a hang.
         */
        if (conn->log_force_write_wait == 0 ||
          force_write_timediff >= conn->log_force_write_wait * WT_THOUSAND) {
            WT_ERR_ERROR_OK(__wt_log_force_write(session, 0, &did_work), EBUSY, false);
            force_write_time_start = __wt_clock(session);
        }
        /*
         * We don't want to remove or pre-allocate files as often as we want to force out log
         * buffers. Only do it once per second or if the condition was signalled.
         */
        if (timediff >= WT_THOUSAND || signalled) {

            /*
             * Perform log pre-allocation.
             */
            if (conn->log_prealloc > 0) {
                /*
                 * Log file pre-allocation is disabled when a hot backup cursor is open because we
                 * have agreed not to rename or remove any files in the database directory.
                 */
                WT_WITH_HOTBACKUP_READ_LOCK(session, ret = __log_prealloc_once(session), NULL);
                WT_ERR(ret);
            }

            /*
             * Perform the removal.
             */
            if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_REMOVE)) {
                if (__wt_try_writelock(session, &log->log_remove_lock) == 0) {
                    ret = __log_remove_once(session, 0);
                    __wt_writeunlock(session, &log->log_remove_lock);
                    WT_ERR(ret);
                } else
                    __wt_verbose(session, WT_VERB_LOG, "%s",
                      "log_remove: Blocked due to open log cursor holding remove lock");
            }
            time_start = __wt_clock(session);
        }

        /* Wait until the next event. */
        __wt_cond_auto_wait_signal(session, conn->log_cond, did_work, NULL, &signalled);
        time_stop = __wt_clock(session);
        timediff = WT_CLOCKDIFF_MS(time_stop, time_start);
        force_write_timediff = WT_CLOCKDIFF_MS(time_stop, force_write_time_start);
    }

    if (0) {
err:
        WT_IGNORE_RET(__wt_panic(session, ret, "log server error"));
    }
    return (WT_THREAD_RET_VALUE);
}

/*
 * __wt_logmgr_create --
 *     Initialize the log subsystem (before running recovery).
 */
int
__wt_logmgr_create(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_LOG *log;

    conn = S2C(session);

    /*
     * Logging configuration is parsed early on for compatibility checking. It is separated from
     * turning on the subsystem. We only need to proceed here if logging is enabled.
     */
    if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_CONFIG_ENABLED))
        return (0);

    FLD_SET(conn->log_flags, WT_CONN_LOG_ENABLED);
    /*
     * Logging is on, allocate the WT_LOG structure and open the log file.
     */
    WT_RET(__wt_calloc_one(session, &conn->log));
    log = conn->log;
    WT_RET(__wt_spin_init(session, &log->log_lock, "log"));
    WT_RET(__wt_spin_init(session, &log->log_fs_lock, "log files"));
    WT_RET(__wt_spin_init(session, &log->log_slot_lock, "log slot"));
    WT_RET(__wt_spin_init(session, &log->log_sync_lock, "log sync"));
    WT_RET(__wt_spin_init(session, &log->log_writelsn_lock, "log write LSN"));
    WT_RET(__wt_rwlock_init(session, &log->log_remove_lock));
    if (FLD_ISSET(conn->direct_io, WT_DIRECT_IO_LOG))
        log->allocsize = (uint32_t)WT_MAX(conn->buffer_alignment, WT_LOG_ALIGN);
    else
        log->allocsize = WT_LOG_ALIGN;
    WT_INIT_LSN(&log->alloc_lsn);
    WT_INIT_LSN(&log->ckpt_lsn);
    WT_INIT_LSN(&log->first_lsn);
    WT_INIT_LSN(&log->sync_lsn);
    /*
     * We only use file numbers for directory sync, so this needs to initialized to zero.
     */
    WT_ZERO_LSN(&log->sync_dir_lsn);
    WT_INIT_LSN(&log->trunc_lsn);
    WT_INIT_LSN(&log->write_lsn);
    WT_INIT_LSN(&log->write_start_lsn);
    log->fileid = 0;
    WT_RET(__logmgr_version(session, false));

    WT_RET(__wt_cond_alloc(session, "log sync", &log->log_sync_cond));
    WT_RET(__wt_cond_alloc(session, "log write", &log->log_write_cond));
    WT_RET(__wt_log_open(session));
    WT_RET(__wt_log_slot_init(session, true));

    return (0);
}

/*
 * __wt_logmgr_open --
 *     Start the log service threads.
 */
int
__wt_logmgr_open(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    uint32_t session_flags;

    conn = S2C(session);

    /* If no log thread services are configured, we're done. */
    if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED))
        return (0);

    FLD_SET(conn->server_flags, WT_CONN_SERVER_LOG);

    /*
     * Start the log close thread. It is not configurable. If logging is enabled, this thread runs.
     */
    session_flags = WT_SESSION_NO_DATA_HANDLES;
    WT_RET(__wt_open_internal_session(
      conn, "log-close-server", false, session_flags, 0, &conn->log_file_session));
    WT_RET(__wt_cond_alloc(conn->log_file_session, "log close server", &conn->log_file_cond));

    /*
     * Start the log file close thread.
     */
    WT_RET(__wt_thread_create(
      conn->log_file_session, &conn->log_file_tid, __log_file_server, conn->log_file_session));
    conn->log_file_tid_set = true;

    /*
     * Start the log write LSN thread. It is not configurable. If logging is enabled, this thread
     * runs.
     */
    WT_RET(__wt_open_internal_session(
      conn, "log-wrlsn-server", false, session_flags, 0, &conn->log_wrlsn_session));
    WT_RET(__wt_cond_auto_alloc(conn->log_wrlsn_session, "log write lsn server", 10 * WT_THOUSAND,
      WT_MILLION, &conn->log_wrlsn_cond));
    WT_RET(__wt_thread_create(
      conn->log_wrlsn_session, &conn->log_wrlsn_tid, __log_wrlsn_server, conn->log_wrlsn_session));
    conn->log_wrlsn_tid_set = true;

    /*
     * If a log server thread exists, the user may have reconfigured removal or pre-allocation.
     * Signal the thread. Otherwise the user wants removal and/or allocation and we need to start up
     * the thread.
     */
    if (conn->log_session != NULL) {
        WT_ASSERT(session, conn->log_cond != NULL);
        WT_ASSERT(session, conn->log_tid_set == true);
        __wt_cond_signal(session, conn->log_cond);
    } else {
        /* The log server gets its own session. */
        WT_RET(__wt_open_internal_session(
          conn, "log-server", false, session_flags, 0, &conn->log_session));
        WT_RET(__wt_cond_auto_alloc(
          conn->log_session, "log server", 50 * WT_THOUSAND, WT_MILLION, &conn->log_cond));

        /*
         * Start the thread.
         */
        WT_RET(
          __wt_thread_create(conn->log_session, &conn->log_tid, __log_server, conn->log_session));
        conn->log_tid_set = true;
    }

    return (0);
}

/*
 * __wt_logmgr_destroy --
 *     Destroy the log removal server thread and logging subsystem.
 */
int
__wt_logmgr_destroy(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;

    conn = S2C(session);

    FLD_CLR(conn->server_flags, WT_CONN_SERVER_LOG);

    if (!FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) {
        /*
         * We always set up the log_path so printlog can work without recovery. Therefore, always
         * free it, even if logging isn't on.
         */
        __wt_free(session, conn->log_path);
        return (0);
    }
    if (conn->log_tid_set) {
        __wt_cond_signal(session, conn->log_cond);
        WT_TRET(__wt_thread_join(session, &conn->log_tid));
        conn->log_tid_set = false;
    }
    if (conn->log_file_tid_set) {
        __wt_cond_signal(session, conn->log_file_cond);
        WT_TRET(__wt_thread_join(session, &conn->log_file_tid));
        conn->log_file_tid_set = false;
    }
    if (conn->log_file_session != NULL) {
        WT_TRET(__wt_session_close_internal(conn->log_file_session));
        conn->log_file_session = NULL;
    }
    if (conn->log_wrlsn_tid_set) {
        __wt_cond_signal(session, conn->log_wrlsn_cond);
        WT_TRET(__wt_thread_join(session, &conn->log_wrlsn_tid));
        conn->log_wrlsn_tid_set = false;
    }
    if (conn->log_wrlsn_session != NULL) {
        WT_TRET(__wt_session_close_internal(conn->log_wrlsn_session));
        conn->log_wrlsn_session = NULL;
    }

    WT_TRET(__wt_log_slot_destroy(session));
    WT_TRET(__wt_log_close(session));

    /* Close the server thread's session. */
    if (conn->log_session != NULL) {
        WT_TRET(__wt_session_close_internal(conn->log_session));
        conn->log_session = NULL;
    }

    /* Destroy the condition variables now that all threads are stopped */
    __wt_cond_destroy(session, &conn->log_cond);
    __wt_cond_destroy(session, &conn->log_file_cond);
    __wt_cond_destroy(session, &conn->log_wrlsn_cond);

    __wt_cond_destroy(session, &conn->log->log_sync_cond);
    __wt_cond_destroy(session, &conn->log->log_write_cond);
    __wt_rwlock_destroy(session, &conn->log->log_remove_lock);
    __wt_spin_destroy(session, &conn->log->log_lock);
    __wt_spin_destroy(session, &conn->log->log_fs_lock);
    __wt_spin_destroy(session, &conn->log->log_slot_lock);
    __wt_spin_destroy(session, &conn->log->log_sync_lock);
    __wt_spin_destroy(session, &conn->log->log_writelsn_lock);
    __wt_free(session, conn->log_path);
    __wt_free(session, conn->log);
    return (ret);
}