/*- * Copyright (c) 2014-2016 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * * See the file LICENSE for redistribution information. */ #include "wt_internal.h" static int __checkpoint_lock_tree( WT_SESSION_IMPL *, bool, bool, const char *[]); static int __checkpoint_mark_deletes(WT_SESSION_IMPL *, const char *[]); static int __checkpoint_presync(WT_SESSION_IMPL *, const char *[]); static int __checkpoint_tree_helper(WT_SESSION_IMPL *, const char *[]); /* * __checkpoint_name_ok -- * Complain if the checkpoint name isn't acceptable. */ static int __checkpoint_name_ok(WT_SESSION_IMPL *session, const char *name, size_t len) { /* Check for characters we don't want to see in a metadata file. */ WT_RET(__wt_name_check(session, name, len)); /* * The internal checkpoint name is special, applications aren't allowed * to use it. Be aggressive and disallow any matching prefix, it makes * things easier when checking in other places. */ if (len < strlen(WT_CHECKPOINT)) return (0); if (!WT_PREFIX_MATCH(name, WT_CHECKPOINT)) return (0); WT_RET_MSG(session, EINVAL, "the checkpoint name \"%s\" is reserved", WT_CHECKPOINT); } /* * __checkpoint_name_check -- * Check for an attempt to name a checkpoint that includes anything * other than a file object. */ static int __checkpoint_name_check(WT_SESSION_IMPL *session, const char *uri) { WT_CURSOR *cursor; WT_DECL_RET; const char *fail; cursor = NULL; fail = NULL; /* * This function exists as a place for this comment: named checkpoints * are only supported on file objects, and not on LSM trees or Helium * devices. If a target list is configured for the checkpoint, this * function is called with each target list entry; check the entry to * make sure it's backed by a file. If no target list is configured, * confirm the metadata file contains no non-file objects. */ if (uri == NULL) { WT_RET(__wt_metadata_cursor(session, &cursor)); while ((ret = cursor->next(cursor)) == 0) { WT_ERR(cursor->get_key(cursor, &uri)); if (!WT_PREFIX_MATCH(uri, "colgroup:") && !WT_PREFIX_MATCH(uri, "file:") && !WT_PREFIX_MATCH(uri, "index:") && !WT_PREFIX_MATCH(uri, "table:")) { fail = uri; break; } } WT_ERR_NOTFOUND_OK(ret); } else if (!WT_PREFIX_MATCH(uri, "colgroup:") && !WT_PREFIX_MATCH(uri, "file:") && !WT_PREFIX_MATCH(uri, "index:") && !WT_PREFIX_MATCH(uri, "table:")) fail = uri; if (fail != NULL) WT_ERR_MSG(session, EINVAL, "%s object does not support named checkpoints", fail); err: WT_TRET(__wt_metadata_cursor_release(session, &cursor)); return (ret); } /* * __checkpoint_apply_all -- * Apply an operation to all files involved in a checkpoint. */ static int __checkpoint_apply_all(WT_SESSION_IMPL *session, const char *cfg[], int (*op)(WT_SESSION_IMPL *, const char *[]), bool *fullp) { WT_CONFIG targetconf; WT_CONFIG_ITEM cval, k, v; WT_DECL_ITEM(tmp); WT_DECL_RET; bool ckpt_closed, named, target_list; target_list = false; /* Flag if this is a named checkpoint, and check if the name is OK. */ WT_RET(__wt_config_gets(session, cfg, "name", &cval)); named = cval.len != 0; if (named) WT_RET(__checkpoint_name_ok(session, cval.str, cval.len)); /* Step through the targets and optionally operate on each one. */ WT_ERR(__wt_config_gets(session, cfg, "target", &cval)); __wt_config_subinit(session, &targetconf, &cval); while ((ret = __wt_config_next(&targetconf, &k, &v)) == 0) { if (!target_list) { WT_ERR(__wt_scr_alloc(session, 512, &tmp)); target_list = true; } if (v.len != 0) WT_ERR_MSG(session, EINVAL, "invalid checkpoint target %.*s: URIs may require " "quoting", (int)cval.len, (char *)cval.str); /* Some objects don't support named checkpoints. */ if (named) WT_ERR(__checkpoint_name_check(session, k.str)); if (op == NULL) continue; WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)k.len, k.str)); if ((ret = __wt_schema_worker( session, tmp->data, op, NULL, cfg, 0)) != 0) WT_ERR_MSG(session, ret, "%s", (const char *)tmp->data); } WT_ERR_NOTFOUND_OK(ret); if (!target_list && named) /* Some objects don't support named checkpoints. */ WT_ERR(__checkpoint_name_check(session, NULL)); if (!target_list && op != NULL) { /* * If the checkpoint is named or we're dropping checkpoints, we * checkpoint both open and closed files; else, only checkpoint * open files. * * XXX * We don't optimize unnamed checkpoints of a list of targets, * we open the targets and checkpoint them even if they are * quiescent and don't need a checkpoint, believing applications * unlikely to checkpoint a list of closed targets. */ ckpt_closed = named; if (!ckpt_closed) { WT_ERR(__wt_config_gets(session, cfg, "drop", &cval)); ckpt_closed = cval.len != 0; } WT_ERR(ckpt_closed ? __wt_meta_apply_all(session, op, NULL, cfg) : __wt_conn_btree_apply(session, NULL, op, NULL, cfg)); } if (fullp != NULL) *fullp = !target_list; err: __wt_scr_free(session, &tmp); return (ret); } /* * __checkpoint_apply -- * Apply an operation to all handles locked for a checkpoint. */ static int __checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[], int (*op)(WT_SESSION_IMPL *, const char *[])) { WT_DECL_RET; u_int i; /* If we have already locked the handles, apply the operation. */ for (i = 0; i < session->ckpt_handle_next; ++i) { if (session->ckpt_handle[i] == NULL) continue; WT_WITH_DHANDLE(session, session->ckpt_handle[i], ret = (*op)(session, cfg)); WT_RET(ret); } return (0); } /* * __checkpoint_data_source -- * Checkpoint all data sources. */ static int __checkpoint_data_source(WT_SESSION_IMPL *session, const char *cfg[]) { WT_NAMED_DATA_SOURCE *ndsrc; WT_DATA_SOURCE *dsrc; /* * A place-holder, to support Helium devices: we assume calling the * underlying data-source session checkpoint function is sufficient to * checkpoint all objects in the data source, open or closed, and we * don't attempt to optimize the checkpoint of individual targets. * Those assumptions is correct for the Helium device, but it's not * necessarily going to be true for other data sources. * * It's not difficult to support data-source checkpoints of individual * targets (__wt_schema_worker is the underlying function that will do * the work, and it's already written to support data-sources, although * we'd probably need to pass the URI of the object to the data source * checkpoint function which we don't currently do). However, doing a * full data checkpoint is trickier: currently, the connection code is * written to ignore all objects other than "file:", and that code will * require significant changes to work with data sources. */ TAILQ_FOREACH(ndsrc, &S2C(session)->dsrcqh, q) { dsrc = ndsrc->dsrc; if (dsrc->checkpoint != NULL) WT_RET(dsrc->checkpoint(dsrc, (WT_SESSION *)session, (WT_CONFIG_ARG *)cfg)); } return (0); } /* * __wt_checkpoint_get_handles -- * Get a list of handles to flush. */ int __wt_checkpoint_get_handles(WT_SESSION_IMPL *session, const char *cfg[]) { WT_BTREE *btree; WT_DECL_RET; const char *name; /* Should not be called with anything other than a file object. */ WT_ASSERT(session, session->dhandle->checkpoint == NULL); WT_ASSERT(session, WT_PREFIX_MATCH(session->dhandle->name, "file:")); /* Skip files that are never involved in a checkpoint. */ if (F_ISSET(S2BT(session), WT_BTREE_NO_CHECKPOINT)) return (0); /* Make sure there is space for the next entry. */ WT_RET(__wt_realloc_def(session, &session->ckpt_handle_allocated, session->ckpt_handle_next + 1, &session->ckpt_handle)); /* Not strictly necessary, but cleaner to clear the current handle. */ name = session->dhandle->name; session->dhandle = NULL; if ((ret = __wt_session_get_btree(session, name, NULL, NULL, 0)) != 0) return (ret == EBUSY ? 0 : ret); /* * Save the current eviction walk setting: checkpoint can interfere * with eviction and we don't want to unfairly penalize (or promote) * eviction in trees due to checkpoints. */ btree = S2BT(session); btree->evict_walk_saved = btree->evict_walk_period; WT_SAVE_DHANDLE(session, ret = __checkpoint_lock_tree(session, true, true, cfg)); if (ret != 0) { WT_TRET(__wt_session_release_btree(session)); return (ret); } /* * Flag that the handle is part of a checkpoint for the purposes * of transaction visibility checks. */ WT_PUBLISH(btree->include_checkpoint_txn, true); session->ckpt_handle[session->ckpt_handle_next++] = session->dhandle; return (0); } /* * __checkpoint_update_generation -- * Update the checkpoint generation of the current tree. * * This indicates that the tree will not be visited again by the current * checkpoint. */ static void __checkpoint_update_generation(WT_SESSION_IMPL *session) { WT_BTREE *btree; btree = S2BT(session); if (!WT_IS_METADATA(session->dhandle)) WT_PUBLISH(btree->include_checkpoint_txn, false); WT_PUBLISH(btree->checkpoint_gen, S2C(session)->txn_global.checkpoint_gen); WT_STAT_DATA_SET(session, btree_checkpoint_generation, btree->checkpoint_gen); } /* * __checkpoint_reduce_dirty_cache -- * Release clean trees from the list cached for checkpoints. */ static void __checkpoint_reduce_dirty_cache(WT_SESSION_IMPL *session) { WT_CACHE *cache; WT_CONNECTION_IMPL *conn; struct timespec start, last, stop; double current_dirty, delta; uint64_t bytes_written_last, bytes_written_start, bytes_written_total; uint64_t cache_size, max_write; uint64_t current_us, stepdown_us, total_ms, work_us; bool progress; conn = S2C(session); cache = conn->cache; /* Give up if scrubbing is disabled. */ if (cache->eviction_checkpoint_target == 0 || cache->eviction_checkpoint_target >= cache->eviction_dirty_trigger) return; __wt_epoch(session, &start); last = start; bytes_written_last = 0; bytes_written_start = cache->bytes_written; cache_size = conn->cache_size; /* * If the cache size is zero or very small, we're done. The cache * size can briefly become zero if we're transitioning to a shared * cache via reconfigure. This avoids potential divide by zero. */ if (cache_size < 10 * WT_MEGABYTE) return; stepdown_us = 10000; work_us = 0; progress = false; /* Step down the scrub target (as a percentage) in units of 10MB. */ delta = WT_MIN(1.0, (100 * 10.0 * WT_MEGABYTE) / cache_size); /* * Start with the scrub target equal to the expected maximum percentage * of dirty data in cache. */ cache->eviction_scrub_limit = cache->eviction_dirty_trigger; /* Stop if we write as much dirty data as is currently in cache. */ max_write = __wt_cache_dirty_leaf_inuse(cache); /* Step down the dirty target to the eviction trigger */ for (;;) { current_dirty = (100.0 * __wt_cache_dirty_leaf_inuse(cache)) / cache_size; if (current_dirty <= (double)cache->eviction_checkpoint_target) break; __wt_sleep(0, stepdown_us / 10); __wt_epoch(session, &stop); current_us = WT_TIMEDIFF_US(stop, last); total_ms = WT_TIMEDIFF_MS(stop, start); bytes_written_total = cache->bytes_written - bytes_written_start; if (current_dirty > cache->eviction_scrub_limit) { /* * We haven't reached the current target. * * Don't wait indefinitely: there might be dirty pages * that can't be evicted. If we can't meet the target, * give up and start the checkpoint for real. */ if (current_us > WT_MAX(WT_MILLION, 10 * stepdown_us) || bytes_written_total > max_write) break; continue; } /* * Estimate how long the next step down of dirty data should * take. * * The calculation here assumes that the system is writing from * cache as fast as it can, and determines the write throughput * based on the change in the bytes written from cache since * the start of the call. We use that to estimate how long it * will take to step the dirty target down by delta. * * Take care to avoid dividing by zero. */ if (bytes_written_total - bytes_written_last > WT_MEGABYTE && work_us > 0) { stepdown_us = (uint64_t)((delta * cache_size / 100) / ((double)bytes_written_total / work_us)); stepdown_us = WT_MAX(1, stepdown_us); if (!progress) stepdown_us = WT_MIN(stepdown_us, 200000); progress = true; bytes_written_last = bytes_written_total; } work_us += current_us; /* * Smooth out step down: try to limit the impact on * performance to 10% by waiting once we reach the last * level. */ __wt_sleep(0, 10 * stepdown_us); cache->eviction_scrub_limit = WT_MAX(cache->eviction_dirty_target, current_dirty - delta); WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, cache->eviction_scrub_limit); __wt_epoch(session, &last); } __wt_epoch(session, &stop); total_ms = WT_TIMEDIFF_MS(stop, start); WT_STAT_CONN_SET(session, txn_checkpoint_scrub_time, total_ms); } /* * __checkpoint_release_clean_trees -- * Release clean trees from the list cached for checkpoints. */ static int __checkpoint_release_clean_trees(WT_SESSION_IMPL *session) { WT_BTREE *btree; WT_DATA_HANDLE *dhandle; WT_DECL_RET; u_int i; for (i = 0; i < session->ckpt_handle_next; i++) { dhandle = session->ckpt_handle[i]; btree = dhandle->handle; if (!F_ISSET(btree, WT_BTREE_SKIP_CKPT)) continue; __wt_meta_ckptlist_free(session, btree->ckpt); btree->ckpt = NULL; WT_WITH_DHANDLE(session, dhandle, __checkpoint_update_generation(session)); session->ckpt_handle[i] = NULL; WT_WITH_DHANDLE(session, dhandle, ret = __wt_session_release_btree(session)); WT_RET(ret); } return (0); } /* * __checkpoint_stats -- * Update checkpoint timer stats. */ static void __checkpoint_stats( WT_SESSION_IMPL *session, struct timespec *start, struct timespec *stop) { WT_CONNECTION_IMPL *conn; uint64_t msec; conn = S2C(session); /* * Get time diff in microseconds. */ msec = WT_TIMEDIFF_MS(*stop, *start); if (msec > conn->ckpt_time_max) conn->ckpt_time_max = msec; if (conn->ckpt_time_min == 0 || msec < conn->ckpt_time_min) conn->ckpt_time_min = msec; conn->ckpt_time_recent = msec; conn->ckpt_time_total += msec; } /* * __checkpoint_verbose_track -- * Output a verbose message with timing information */ static void __checkpoint_verbose_track(WT_SESSION_IMPL *session, const char *msg, struct timespec *start) { #ifdef HAVE_VERBOSE struct timespec stop; uint64_t msec; if (!WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT)) return; __wt_epoch(session, &stop); /* * Get time diff in microseconds. */ msec = WT_TIMEDIFF_MS(stop, *start); __wt_verbose(session, WT_VERB_CHECKPOINT, "time: %" PRIu64 " us, gen: %" PRIu64 ": Full database checkpoint %s", msec, S2C(session)->txn_global.checkpoint_gen, msg); /* Update the timestamp so we are reporting intervals. */ memcpy(start, &stop, sizeof(*start)); #else WT_UNUSED(session); WT_UNUSED(msg); WT_UNUSED(start); #endif } /* * __txn_checkpoint -- * Checkpoint a database or a list of objects in the database. */ static int __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) { struct timespec fsync_start, fsync_stop; struct timespec start, stop, verb_timer; WT_CACHE *cache; WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_TXN *txn; WT_TXN_GLOBAL *txn_global; WT_TXN_ISOLATION saved_isolation; WT_TXN_STATE *txn_state; void *saved_meta_next; u_int i; uint64_t fsync_duration_usecs; bool full, idle, logging, tracking; const char *txn_cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_begin_transaction), "isolation=snapshot", NULL }; conn = S2C(session); cache = conn->cache; txn = &session->txn; txn_global = &conn->txn_global; txn_state = WT_SESSION_TXN_STATE(session); saved_isolation = session->isolation; full = idle = logging = tracking = false; /* * Do a pass over the configuration arguments and figure out what kind * of checkpoint this is. */ WT_RET(__checkpoint_apply_all(session, cfg, NULL, &full)); /* Configure logging only if doing a full checkpoint. */ logging = FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED); /* Reset the maximum page size seen by eviction. */ conn->cache->evict_max_page_size = 0; /* Initialize the verbose tracking timer */ __wt_epoch(session, &verb_timer); /* * Update the global oldest ID so we do all possible cleanup. * * This is particularly important for compact, so that all dirty pages * can be fully written. */ WT_ERR(__wt_txn_update_oldest( session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT)); /* Flush data-sources before we start the checkpoint. */ WT_ERR(__checkpoint_data_source(session, cfg)); /* * Try to reduce the amount of dirty data in cache so there is less * work do during the critical section of the checkpoint. */ __checkpoint_reduce_dirty_cache(session); /* Tell logging that we are about to start a database checkpoint. */ if (full && logging) WT_ERR(__wt_txn_checkpoint_log( session, full, WT_TXN_LOG_CKPT_PREPARE, NULL)); __checkpoint_verbose_track(session, "starting transaction", &verb_timer); if (full) __wt_epoch(session, &start); /* * Start the checkpoint for real. * * Bump the global checkpoint generation, used to figure out whether * checkpoint has visited a tree. Use an atomic increment even though * we are single-threaded because readers of the checkpoint generation * don't hold the checkpoint lock. * * We do need to update it before clearing the checkpoint's entry out * of the transaction table, or a thread evicting in a tree could * ignore the checkpoint's transaction. */ (void)__wt_atomic_addv64(&txn_global->checkpoint_gen, 1); WT_STAT_CONN_SET(session, txn_checkpoint_generation, txn_global->checkpoint_gen); /* Keep track of handles acquired for locking. */ WT_ERR(__wt_meta_track_on(session)); tracking = true; /* * Get a list of handles we want to flush; for named checkpoints this * may pull closed objects into the session cache. * * We want to skip checkpointing clean handles whenever possible. That * is, when the checkpoint is not named or forced. However, we need to * take care about ordering with respect to the checkpoint transaction. * * If we skip clean handles before starting the transaction, the * checkpoint can miss updates in trees that become dirty as the * checkpoint is starting. If we wait until the transaction has * started before locking a handle, there could be a metadata-changing * operation in between (e.g., salvage) that will cause a write * conflict when the checkpoint goes to write the metadata. * * First, gather all handles, then start the checkpoint transaction, * then release any clean handles. */ WT_ASSERT(session, session->ckpt_handle_next == 0); WT_WITH_SCHEMA_LOCK(session, WT_WITH_TABLE_LOCK(session, WT_WITH_HANDLE_LIST_LOCK(session, ret = __checkpoint_apply_all( session, cfg, __wt_checkpoint_get_handles, NULL)))); WT_ERR(ret); /* * Start a snapshot transaction for the checkpoint. * * Note: we don't go through the public API calls because they have * side effects on cursors, which applications can hold open across * calls to checkpoint. */ WT_ERR(__wt_txn_begin(session, txn_cfg)); /* Ensure a transaction ID is allocated prior to sharing it globally */ WT_ERR(__wt_txn_id_check(session)); /* * Mark the connection as clean. If some data gets modified after * generating checkpoint transaction id, connection will be reset to * dirty when reconciliation marks the btree dirty on encountering the * dirty page. */ conn->modified = false; /* * Save the checkpoint session ID. * * We never do checkpoints in the default session (with id zero). */ WT_ASSERT(session, session->id != 0 && txn_global->checkpoint_id == 0); txn_global->checkpoint_id = session->id; /* * Remove the checkpoint transaction from the global table. * * This allows ordinary visibility checks to move forward because * checkpoints often take a long time and only write to the metadata. */ __wt_writelock(session, txn_global->scan_rwlock); txn_global->checkpoint_txnid = txn->id; txn_global->checkpoint_pinned = WT_MIN(txn->id, txn->snap_min); /* * Sanity check that the oldest ID hasn't moved on before we have * cleared our entry. */ WT_ASSERT(session, WT_TXNID_LE(txn_global->oldest_id, txn_state->id) && WT_TXNID_LE(txn_global->oldest_id, txn_state->pinned_id)); /* * Clear our entry from the global transaction session table. Any * operation that needs to know about the ID for this checkpoint will * consider the checkpoint ID in the global structure. Most operations * can safely ignore the checkpoint ID (see the visible all check for * details). */ txn_state->id = txn_state->pinned_id = txn_state->metadata_pinned = WT_TXN_NONE; __wt_writeunlock(session, txn_global->scan_rwlock); /* * Unblock updates -- we can figure out that any updates to clean pages * after this point are too new to be written in the checkpoint. */ cache->eviction_scrub_limit = 0.0; WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0); /* * Mark old checkpoints that are being deleted and figure out which * trees we can skip in this checkpoint. * * Release clean trees. Any updates made after this point will not * visible to the checkpoint transaction. */ WT_ERR(__checkpoint_apply(session, cfg, __checkpoint_mark_deletes)); WT_ERR(__checkpoint_release_clean_trees(session)); /* Tell logging that we have started a database checkpoint. */ if (full && logging) WT_ERR(__wt_txn_checkpoint_log( session, full, WT_TXN_LOG_CKPT_START, NULL)); WT_ERR(__checkpoint_apply(session, cfg, __checkpoint_tree_helper)); /* * Clear the dhandle so the visibility check doesn't get confused about * the snap min. Don't bother restoring the handle since it doesn't * make sense to carry a handle across a checkpoint. */ session->dhandle = NULL; /* Release the snapshot so we aren't pinning updates in cache. */ __wt_txn_release_snapshot(session); /* Mark all trees as open for business (particularly eviction). */ WT_ERR(__checkpoint_apply(session, cfg, __checkpoint_presync)); __wt_evict_server_wake(session); __checkpoint_verbose_track(session, "committing transaction", &verb_timer); /* * Checkpoints have to hit disk (it would be reasonable to configure for * lazy checkpoints, but we don't support them yet). */ __wt_epoch(session, &fsync_start); WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_sync)); __wt_epoch(session, &fsync_stop); fsync_duration_usecs = WT_TIMEDIFF_US(fsync_stop, fsync_start); WT_STAT_CONN_INCR(session, txn_checkpoint_fsync_post); WT_STAT_CONN_SET(session, txn_checkpoint_fsync_post_duration, fsync_duration_usecs); __checkpoint_verbose_track(session, "sync completed", &verb_timer); /* * Commit the transaction now that we are sure that all files in the * checkpoint have been flushed to disk. It's OK to commit before * checkpointing the metadata since we know that all files in the * checkpoint are now in a consistent state. */ WT_ERR(__wt_txn_commit(session, NULL)); /* * Ensure that the metadata changes are durable before the checkpoint * is resolved. Do this by either checkpointing the metadata or syncing * the log file. * Recovery relies on the checkpoint LSN in the metadata only being * updated by full checkpoints so only checkpoint the metadata for * full or non-logged checkpoints. * * This is very similar to __wt_meta_track_off, ideally they would be * merged. */ if (full || !logging) { session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED; /* Disable metadata tracking during the metadata checkpoint. */ saved_meta_next = session->meta_track_next; session->meta_track_next = NULL; WT_WITH_METADATA_LOCK(session, WT_WITH_DHANDLE(session, WT_SESSION_META_DHANDLE(session), ret = __wt_checkpoint(session, cfg))); session->meta_track_next = saved_meta_next; WT_ERR(ret); WT_WITH_DHANDLE(session, WT_SESSION_META_DHANDLE(session), ret = __wt_checkpoint_sync(session, NULL)); WT_ERR(ret); __checkpoint_verbose_track(session, "metadata sync completed", &verb_timer); } else WT_WITH_DHANDLE(session, WT_SESSION_META_DHANDLE(session), ret = __wt_txn_checkpoint_log( session, false, WT_TXN_LOG_CKPT_SYNC, NULL)); /* * Now that the metadata is stable, re-open the metadata file for * regular eviction by clearing the checkpoint_pinned flag. */ txn_global->checkpoint_pinned = WT_TXN_NONE; if (full) { __wt_epoch(session, &stop); __checkpoint_stats(session, &start, &stop); } err: /* * XXX * Rolling back the changes here is problematic. * * If we unroll here, we need a way to roll back changes to the avail * list for each tree that was successfully synced before the error * occurred. Otherwise, the next time we try this operation, we will * try to free an old checkpoint again. * * OTOH, if we commit the changes after a failure, we have partially * overwritten the checkpoint, so what ends up on disk is not * consistent. */ if (ret != 0 && !conn->modified) conn->modified = true; session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED; if (tracking) WT_TRET(__wt_meta_track_off(session, false, ret != 0)); cache->eviction_scrub_limit = 0.0; WT_STAT_CONN_SET(session, txn_checkpoint_scrub_target, 0); if (F_ISSET(txn, WT_TXN_RUNNING)) { /* * Clear the dhandle so the visibility check doesn't get * confused about the snap min. Don't bother restoring the * handle since it doesn't make sense to carry a handle across * a checkpoint. */ session->dhandle = NULL; WT_TRET(__wt_txn_rollback(session, NULL)); } /* * Tell logging that we have finished a database checkpoint. Do not * write a log record if the database was idle. */ if (full && logging) { if (ret == 0 && F_ISSET(((WT_CURSOR_BTREE *) session->meta_cursor)->btree, WT_BTREE_SKIP_CKPT)) idle = true; WT_TRET(__wt_txn_checkpoint_log(session, full, (ret == 0 && !idle) ? WT_TXN_LOG_CKPT_STOP : WT_TXN_LOG_CKPT_CLEANUP, NULL)); } for (i = 0; i < session->ckpt_handle_next; ++i) { if (session->ckpt_handle[i] == NULL) continue; WT_WITH_DHANDLE(session, session->ckpt_handle[i], WT_TRET(__wt_session_release_btree(session))); } __wt_free(session, session->ckpt_handle); session->ckpt_handle_allocated = session->ckpt_handle_next = 0; session->isolation = txn->isolation = saved_isolation; return (ret); } /* * __txn_checkpoint_wrapper -- * Checkpoint wrapper. */ static int __txn_checkpoint_wrapper(WT_SESSION_IMPL *session, const char *cfg[]) { WT_DECL_RET; WT_TXN_GLOBAL *txn_global; txn_global = &S2C(session)->txn_global; WT_STAT_CONN_SET(session, txn_checkpoint_running, 1); txn_global->checkpoint_running = true; WT_FULL_BARRIER(); ret = __txn_checkpoint(session, cfg); WT_STAT_CONN_SET(session, txn_checkpoint_running, 0); txn_global->checkpoint_running = false; WT_FULL_BARRIER(); return (ret); } /* * __wt_txn_checkpoint -- * Checkpoint a database or a list of objects in the database. */ int __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[], bool waiting) { WT_DECL_RET; uint32_t mask; /* * Reset open cursors. Do this explicitly, even though it will happen * implicitly in the call to begin_transaction for the checkpoint, the * checkpoint code will acquire the schema lock before we do that, and * some implementation of WT_CURSOR::reset might need the schema lock. */ WT_RET(__wt_session_reset_cursors(session, false)); /* Ensure the metadata table is open before taking any locks. */ WT_RET(__wt_metadata_cursor(session, NULL)); /* * Don't highjack the session checkpoint thread for eviction. * * Application threads are not generally available for potentially slow * operations, but checkpoint does enough I/O it may be called upon to * perform slow operations for the block manager. * * Application checkpoints wait until the checkpoint lock is available, * compaction checkpoints don't. */ #define WT_TXN_SESSION_MASK \ (WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION) mask = F_MASK(session, WT_TXN_SESSION_MASK); F_SET(session, WT_SESSION_CAN_WAIT | WT_SESSION_NO_EVICTION); /* * Only one checkpoint can be active at a time, and checkpoints must run * in the same order as they update the metadata. It's probably a bad * idea to run checkpoints out of multiple threads, but as compaction * calls checkpoint directly, it can be tough to avoid. Serialize here * to ensure we don't get into trouble. */ if (waiting) WT_WITH_CHECKPOINT_LOCK(session, ret = __txn_checkpoint_wrapper(session, cfg)); else WT_WITH_CHECKPOINT_LOCK_NOWAIT(session, ret, ret = __txn_checkpoint_wrapper(session, cfg)); F_CLR(session, WT_TXN_SESSION_MASK); F_SET(session, mask); return (ret); } /* * __drop -- * Drop all checkpoints with a specific name. */ static void __drop(WT_CKPT *ckptbase, const char *name, size_t len) { WT_CKPT *ckpt; /* * If we're dropping internal checkpoints, match to the '.' separating * the checkpoint name from the generational number, and take all that * we can find. Applications aren't allowed to use any variant of this * name, so the test is still pretty simple, if the leading bytes match, * it's one we want to drop. */ if (strncmp(WT_CHECKPOINT, name, len) == 0) { WT_CKPT_FOREACH(ckptbase, ckpt) if (WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) F_SET(ckpt, WT_CKPT_DELETE); } else WT_CKPT_FOREACH(ckptbase, ckpt) if (WT_STRING_MATCH(ckpt->name, name, len)) F_SET(ckpt, WT_CKPT_DELETE); } /* * __drop_from -- * Drop all checkpoints after, and including, the named checkpoint. */ static void __drop_from(WT_CKPT *ckptbase, const char *name, size_t len) { WT_CKPT *ckpt; bool matched; /* * There's a special case -- if the name is "all", then we delete all * of the checkpoints. */ if (WT_STRING_MATCH("all", name, len)) { WT_CKPT_FOREACH(ckptbase, ckpt) F_SET(ckpt, WT_CKPT_DELETE); return; } /* * We use the first checkpoint we can find, that is, if there are two * checkpoints with the same name in the list, we'll delete from the * first match to the end. */ matched = false; WT_CKPT_FOREACH(ckptbase, ckpt) { if (!matched && !WT_STRING_MATCH(ckpt->name, name, len)) continue; matched = true; F_SET(ckpt, WT_CKPT_DELETE); } } /* * __drop_to -- * Drop all checkpoints before, and including, the named checkpoint. */ static void __drop_to(WT_CKPT *ckptbase, const char *name, size_t len) { WT_CKPT *ckpt, *mark; /* * We use the last checkpoint we can find, that is, if there are two * checkpoints with the same name in the list, we'll delete from the * beginning to the second match, not the first. */ mark = NULL; WT_CKPT_FOREACH(ckptbase, ckpt) if (WT_STRING_MATCH(ckpt->name, name, len)) mark = ckpt; if (mark == NULL) return; WT_CKPT_FOREACH(ckptbase, ckpt) { F_SET(ckpt, WT_CKPT_DELETE); if (ckpt == mark) break; } } /* * __checkpoint_lock_tree -- * Acquire the locks required to checkpoint a tree. */ static int __checkpoint_lock_tree(WT_SESSION_IMPL *session, bool is_checkpoint, bool need_tracking, const char *cfg[]) { WT_BTREE *btree; WT_CKPT *ckpt, *ckptbase; WT_CONFIG dropconf; WT_CONFIG_ITEM cval, k, v; WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_DECL_RET; char *name_alloc; const char *name; bool hot_backup_locked; btree = S2BT(session); conn = S2C(session); ckpt = ckptbase = NULL; dhandle = session->dhandle; hot_backup_locked = false; name_alloc = NULL; /* Only referenced in diagnostic builds. */ WT_UNUSED(is_checkpoint); /* * Only referenced in diagnostic builds and gcc 5.1 isn't satisfied * with wrapping the entire assert condition in the unused macro. */ WT_UNUSED(need_tracking); /* * Most callers need meta tracking to be on here, otherwise it is * possible for this checkpoint to cleanup handles that are still in * use. The exceptions are: * - Checkpointing the metadata handle itself. * - On connection close when we know there can't be any races. */ WT_ASSERT(session, !need_tracking || WT_IS_METADATA(dhandle) || WT_META_TRACKING(session)); /* Get the list of checkpoints for this file. */ WT_RET(__wt_meta_ckptlist_get(session, dhandle->name, &ckptbase)); /* This may be a named checkpoint, check the configuration. */ cval.len = 0; if (cfg != NULL) WT_ERR(__wt_config_gets(session, cfg, "name", &cval)); if (cval.len == 0) name = WT_CHECKPOINT; else { WT_ERR(__checkpoint_name_ok(session, cval.str, cval.len)); WT_ERR(__wt_strndup(session, cval.str, cval.len, &name_alloc)); name = name_alloc; } /* We may be dropping specific checkpoints, check the configuration. */ if (cfg != NULL) { cval.len = 0; WT_ERR(__wt_config_gets(session, cfg, "drop", &cval)); if (cval.len != 0) { __wt_config_subinit(session, &dropconf, &cval); while ((ret = __wt_config_next(&dropconf, &k, &v)) == 0) { /* Disallow unsafe checkpoint names. */ if (v.len == 0) WT_ERR(__checkpoint_name_ok( session, k.str, k.len)); else WT_ERR(__checkpoint_name_ok( session, v.str, v.len)); if (v.len == 0) __drop(ckptbase, k.str, k.len); else if (WT_STRING_MATCH("from", k.str, k.len)) __drop_from(ckptbase, v.str, v.len); else if (WT_STRING_MATCH("to", k.str, k.len)) __drop_to(ckptbase, v.str, v.len); else WT_ERR_MSG(session, EINVAL, "unexpected value for checkpoint " "key: %.*s", (int)k.len, k.str); } WT_ERR_NOTFOUND_OK(ret); } } /* Drop checkpoints with the same name as the one we're taking. */ __drop(ckptbase, name, strlen(name)); /* Add a new checkpoint entry at the end of the list. */ WT_CKPT_FOREACH(ckptbase, ckpt) ; WT_ERR(__wt_strdup(session, name, &ckpt->name)); /* * We are now done with the local use of the name. Free the local * allocation, if needed. */ __wt_free(session, name_alloc); F_SET(ckpt, WT_CKPT_ADD); /* * We can't delete checkpoints if a backup cursor is open. WiredTiger * checkpoints are uniquely named and it's OK to have multiple of them * in the system: clear the delete flag for them, and otherwise fail. * Hold the lock until we're done (blocking hot backups from starting), * we don't want to race with a future hot backup. */ __wt_readlock(session, conn->hot_backup_lock); hot_backup_locked = true; if (conn->hot_backup) WT_CKPT_FOREACH(ckptbase, ckpt) { if (!F_ISSET(ckpt, WT_CKPT_DELETE)) continue; if (WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) { F_CLR(ckpt, WT_CKPT_DELETE); continue; } WT_ERR_MSG(session, EBUSY, "checkpoint %s blocked by hot backup: it would " "delete an existing checkpoint, and checkpoints " "cannot be deleted during a hot backup", ckpt->name); } /* * Lock the checkpoints that will be deleted. * * Checkpoints are only locked when tracking is enabled, which covers * checkpoint and drop operations, but not close. The reasoning is * there should be no access to a checkpoint during close, because any * thread accessing a checkpoint will also have the current file handle * open. */ if (WT_META_TRACKING(session)) WT_CKPT_FOREACH(ckptbase, ckpt) { if (!F_ISSET(ckpt, WT_CKPT_DELETE)) continue; /* * We can't delete checkpoints referenced by a cursor. * WiredTiger checkpoints are uniquely named and it's * OK to have multiple in the system: clear the delete * flag for them, and otherwise fail. */ ret = __wt_session_lock_checkpoint(session, ckpt->name); if (ret == 0) continue; if (ret == EBUSY && WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) { F_CLR(ckpt, WT_CKPT_DELETE); continue; } WT_ERR_MSG(session, ret, "checkpoints cannot be dropped when in-use"); } /* * There are special files: those being bulk-loaded, salvaged, upgraded * or verified during the checkpoint. We have to do something for those * objects because a checkpoint is an external name the application can * reference and the name must exist no matter what's happening during * the checkpoint. For bulk-loaded files, we could block until the load * completes, checkpoint the partial load, or magic up an empty-file * checkpoint. The first is too slow, the second is insane, so do the * third. * Salvage, upgrade and verify don't currently require any work, all * three hold the schema lock, blocking checkpoints. If we ever want to * fix that (and I bet we eventually will, at least for verify), we can * copy the last checkpoint the file has. That works if we guarantee * salvage, upgrade and verify act on objects with previous checkpoints * (true if handles are closed/re-opened between object creation and a * subsequent salvage, upgrade or verify operation). Presumably, * salvage and upgrade will discard all previous checkpoints when they * complete, which is fine with us. This change will require reference * counting checkpoints, and once that's done, we should use checkpoint * copy instead of forcing checkpoints on clean objects to associate * names with checkpoints. */ WT_ASSERT(session, !is_checkpoint || !F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS)); __wt_readunlock(session, conn->hot_backup_lock); WT_ASSERT(session, btree->ckpt == NULL); btree->ckpt = ckptbase; return (0); err: if (hot_backup_locked) __wt_readunlock(session, conn->hot_backup_lock); __wt_meta_ckptlist_free(session, ckptbase); __wt_free(session, name_alloc); return (ret); } /* * __checkpoint_mark_deletes -- * Figure out what old checkpoints will be deleted, and whether the * checkpoint can be skipped entirely. */ static int __checkpoint_mark_deletes( WT_SESSION_IMPL *session, const char *cfg[]) { WT_BTREE *btree; WT_CKPT *ckpt, *ckptbase; WT_CONFIG_ITEM cval; const char *name; int deleted; bool force; btree = S2BT(session); ckptbase = btree->ckpt; /* * Check for clean objects not requiring a checkpoint. * * If we're closing a handle, and the object is clean, we can skip the * checkpoint, whatever checkpoints we have are sufficient. (We might * not have any checkpoints if the object was never modified, and that's * OK: the object creation code doesn't mark the tree modified so we can * skip newly created trees here.) * * If the application repeatedly checkpoints an object (imagine hourly * checkpoints using the same explicit or internal name), there's no * reason to repeat the checkpoint for clean objects. The test is if * the only checkpoint we're deleting is the last one in the list and * it has the same name as the checkpoint we're about to take, skip the * work. (We can't skip checkpoints that delete more than the last * checkpoint because deleting those checkpoints might free up space in * the file.) This means an application toggling between two (or more) * checkpoint names will repeatedly take empty checkpoints, but that's * not likely enough to make detection worthwhile. * * Checkpoint read-only objects otherwise: the application must be able * to open the checkpoint in a cursor after taking any checkpoint, which * means it must exist. */ force = false; F_CLR(btree, WT_BTREE_SKIP_CKPT); if (!btree->modified && cfg != NULL) { WT_RET(__wt_config_gets(session, cfg, "force", &cval)); force = cval.val != 0; } if (!btree->modified && !force) { deleted = 0; WT_CKPT_FOREACH(ckptbase, ckpt) if (F_ISSET(ckpt, WT_CKPT_DELETE)) ++deleted; /* * Complicated test: if the tree is clean and last two * checkpoints have the same name (correcting for internal * checkpoint names with their generational suffix numbers), we * can skip the checkpoint, there's nothing to do. The * exception is if we're deleting two or more checkpoints: then * we may save space. */ name = (ckpt - 1)->name; if (ckpt > ckptbase + 1 && deleted < 2 && (strcmp(name, (ckpt - 2)->name) == 0 || (WT_PREFIX_MATCH(name, WT_CHECKPOINT) && WT_PREFIX_MATCH((ckpt - 2)->name, WT_CHECKPOINT)))) { F_SET(btree, WT_BTREE_SKIP_CKPT); return (0); } } return (0); } /* * __checkpoint_tree -- * Checkpoint a single tree. * Assumes all necessary locks have been acquired by the caller. */ static int __checkpoint_tree( WT_SESSION_IMPL *session, bool is_checkpoint, const char *cfg[]) { WT_BM *bm; WT_BTREE *btree; WT_CKPT *ckpt, *ckptbase; WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_DECL_RET; WT_LSN ckptlsn; int was_modified; bool fake_ckpt; WT_UNUSED(cfg); btree = S2BT(session); bm = btree->bm; ckptbase = btree->ckpt; conn = S2C(session); dhandle = session->dhandle; fake_ckpt = false; was_modified = btree->modified; /* * Set the checkpoint LSN to the maximum LSN so that if logging is * disabled, recovery will never roll old changes forward over the * non-logged changes in this checkpoint. If logging is enabled, a * real checkpoint LSN will be assigned for this checkpoint and * overwrite this. */ WT_MAX_LSN(&ckptlsn); /* * If an object has never been used (in other words, if it could become * a bulk-loaded file), then we must fake the checkpoint. This is good * because we don't write physical checkpoint blocks for just-created * files, but it's not just a good idea. The reason is because deleting * a physical checkpoint requires writing the file, and fake checkpoints * can't write the file. If you (1) create a physical checkpoint for an * empty file which writes blocks, (2) start bulk-loading records into * the file, (3) during the bulk-load perform another checkpoint with * the same name; in order to keep from having two checkpoints with the * same name you would have to use the bulk-load's fake checkpoint to * delete a physical checkpoint, and that will end in tears. */ if (is_checkpoint) if (btree->bulk_load_ok) { fake_ckpt = true; goto fake; } /* * Mark the root page dirty to ensure something gets written. (If the * tree is modified, we must write the root page anyway, this doesn't * add additional writes to the process. If the tree is not modified, * we have to dirty the root page to ensure something gets written.) * This is really about paranoia: if the tree modification value gets * out of sync with the set of dirty pages (modify is set, but there * are no dirty pages), we perform a checkpoint without any writes, no * checkpoint is created, and then things get bad. * While marking the root page as dirty, we do not want to dirty the * btree because we are marking the btree as clean just after this call. * Also, marking the btree dirty at this stage will unnecessarily mark * the connection as dirty causing checkpoint-skip code to fail. */ WT_ERR(__wt_page_modify_init(session, btree->root.page)); __wt_page_only_modify_set(session, btree->root.page); /* * Clear the tree's modified flag; any changes before we clear the flag * are guaranteed to be part of this checkpoint (unless reconciliation * skips updates for transactional reasons), and changes subsequent to * the checkpoint start, which might not be included, will re-set the * modified flag. The "unless reconciliation skips updates" problem is * handled in the reconciliation code: if reconciliation skips updates, * it sets the modified flag itself. Use a full barrier so we get the * store done quickly, this isn't a performance path. */ btree->modified = false; WT_FULL_BARRIER(); /* Tell logging that a file checkpoint is starting. */ if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) WT_ERR(__wt_txn_checkpoint_log( session, false, WT_TXN_LOG_CKPT_START, &ckptlsn)); /* Flush the file from the cache, creating the checkpoint. */ if (is_checkpoint) WT_ERR(__wt_cache_op(session, WT_SYNC_CHECKPOINT)); else WT_ERR(__wt_cache_op(session, WT_SYNC_CLOSE)); /* * All blocks being written have been written; set the object's write * generation. */ WT_CKPT_FOREACH(ckptbase, ckpt) if (F_ISSET(ckpt, WT_CKPT_ADD)) ckpt->write_gen = btree->write_gen; fake: /* * If we're faking a checkpoint and logging is enabled, recovery should * roll forward any changes made between now and the next checkpoint, * so set the checkpoint LSN to the beginning of time. */ if (fake_ckpt && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) WT_INIT_LSN(&ckptlsn); /* * Update the object's metadata. * * If the object is the metadata, the call to __wt_meta_ckptlist_set * will update the turtle file and swap the new one into place. We * need to make sure the metadata is on disk before the turtle file is * updated. * * If we are doing a checkpoint in a file without a transaction (e.g., * closing a dirty tree before an exclusive operation like verify), * the metadata update will be auto-committed. In that case, we need to * sync the file here or we could roll forward the metadata in * recovery and open a checkpoint that isn't yet durable. */ if (WT_IS_METADATA(dhandle) || !F_ISSET(&session->txn, WT_TXN_RUNNING)) WT_ERR(__wt_checkpoint_sync(session, NULL)); WT_ERR(__wt_meta_ckptlist_set( session, dhandle->name, ckptbase, &ckptlsn)); /* * If we wrote a checkpoint (rather than faking one), pages may be * available for re-use. If tracking is enabled, defer making pages * available until transaction end. The exception is if the handle is * being discarded, in which case the handle will be gone by the time * we try to apply or unroll the meta tracking event. */ if (!fake_ckpt) { if (WT_META_TRACKING(session) && is_checkpoint) WT_ERR(__wt_meta_track_checkpoint(session)); else WT_ERR(bm->checkpoint_resolve(bm, session)); } /* Tell logging that the checkpoint is complete. */ if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) WT_ERR(__wt_txn_checkpoint_log( session, false, WT_TXN_LOG_CKPT_STOP, NULL)); err: /* * If the checkpoint didn't complete successfully, make sure the * tree is marked dirty. */ if (ret != 0 && !btree->modified && was_modified) { btree->modified = true; if (!S2C(session)->modified) S2C(session)->modified = true; } __wt_meta_ckptlist_free(session, ckptbase); btree->ckpt = NULL; return (ret); } /* * __checkpoint_presync -- * Visit all handles after the checkpoint writes are complete and before * syncing. At this point, all trees should be completely open for * business. */ static int __checkpoint_presync(WT_SESSION_IMPL *session, const char *cfg[]) { WT_BTREE *btree; WT_UNUSED(cfg); btree = S2BT(session); WT_ASSERT(session, !btree->include_checkpoint_txn); btree->evict_walk_period = btree->evict_walk_saved; return (0); } /* * __checkpoint_tree_helper -- * Checkpoint a tree (suitable for use in *_apply functions). */ static int __checkpoint_tree_helper(WT_SESSION_IMPL *session, const char *cfg[]) { WT_BTREE *btree; WT_DECL_RET; btree = S2BT(session); ret = __checkpoint_tree(session, true, cfg); /* * Whatever happened, we aren't visiting this tree again in this * checkpoint. Don't keep updates pinned any longer. */ __checkpoint_update_generation(session); /* * In case this tree was being skipped by the eviction server * during the checkpoint, restore the previous state. */ btree->evict_walk_period = btree->evict_walk_saved; /* * Wake the eviction server, in case application threads have * stalled while the eviction server decided it couldn't make * progress. Without this, application threads will be stalled * until the eviction server next wakes. */ __wt_evict_server_wake(session); return (ret); } /* * __wt_checkpoint -- * Checkpoint a file. */ int __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) { WT_DECL_RET; /* Should not be called with a checkpoint handle. */ WT_ASSERT(session, session->dhandle->checkpoint == NULL); /* We must hold the metadata lock if checkpointing the metadata. */ WT_ASSERT(session, !WT_IS_METADATA(session->dhandle) || F_ISSET(session, WT_SESSION_LOCKED_METADATA)); WT_SAVE_DHANDLE(session, ret = __checkpoint_lock_tree(session, true, true, cfg)); WT_RET(ret); WT_SAVE_DHANDLE(session, ret = __checkpoint_mark_deletes(session, cfg)); WT_RET(ret); return (__checkpoint_tree(session, true, cfg)); } /* * __wt_checkpoint_sync -- * Sync a file that has been checkpointed, and wait for the result. */ int __wt_checkpoint_sync(WT_SESSION_IMPL *session, const char *cfg[]) { WT_BM *bm; WT_UNUSED(cfg); bm = S2BT(session)->bm; /* Should not be called with a checkpoint handle. */ WT_ASSERT(session, session->dhandle->checkpoint == NULL); /* Unnecessary if checkpoint_sync has been configured "off". */ if (!F_ISSET(S2C(session), WT_CONN_CKPT_SYNC)) return (0); return (bm->sync(bm, session, true)); } /* * __wt_checkpoint_close -- * Checkpoint a single file as part of closing the handle. */ int __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final) { WT_BTREE *btree; WT_DECL_RET; bool bulk, need_tracking; btree = S2BT(session); bulk = F_ISSET(btree, WT_BTREE_BULK); /* * If the handle is already dead or the file isn't durable, force the * discard. * * If the file isn't durable, mark the handle dead, there are asserts * later on that only dead handles can have modified pages. */ if (F_ISSET(btree, WT_BTREE_NO_CHECKPOINT)) F_SET(session->dhandle, WT_DHANDLE_DEAD); if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) return (__wt_cache_op(session, WT_SYNC_DISCARD)); /* * If closing an unmodified file, check that no update is required * for active readers. */ if (!btree->modified && !bulk) { WT_RET(__wt_txn_update_oldest( session, WT_TXN_OLDEST_STRICT | WT_TXN_OLDEST_WAIT)); return (__wt_txn_visible_all(session, btree->rec_max_txn) ? __wt_cache_op(session, WT_SYNC_DISCARD) : EBUSY); } /* * Turn on metadata tracking if: * - The session is not already doing metadata tracking. * - The file was not bulk loaded. * - The close is not during connection close. */ need_tracking = !WT_META_TRACKING(session) && !bulk && !final; if (need_tracking) WT_RET(__wt_meta_track_on(session)); WT_SAVE_DHANDLE(session, ret = __checkpoint_lock_tree(session, false, need_tracking, NULL)); WT_ASSERT(session, ret == 0); if (ret == 0) { WT_SAVE_DHANDLE(session, ret = __checkpoint_mark_deletes(session, NULL)); WT_ASSERT(session, ret == 0); } if (ret == 0) ret = __checkpoint_tree(session, false, NULL); if (need_tracking) WT_TRET(__wt_meta_track_off(session, true, ret != 0)); return (ret); }