/*- * Copyright (c) 2014-2015 MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * * See the file LICENSE for redistribution information. */ #include "wt_internal.h" /* * __wt_checkpoint_name_ok -- * Complain if the checkpoint name isn't acceptable. */ int __wt_checkpoint_name_ok(WT_SESSION_IMPL *session, const char *name, size_t len) { /* Check for characters we don't want to see in a metadata file. */ WT_RET(__wt_name_check(session, name, len)); /* * The internal checkpoint name is special, applications aren't allowed * to use it. Be aggressive and disallow any matching prefix, it makes * things easier when checking in other places. */ if (len < strlen(WT_CHECKPOINT)) return (0); if (!WT_PREFIX_MATCH(name, WT_CHECKPOINT)) return (0); WT_RET_MSG(session, EINVAL, "the checkpoint name \"%s\" is reserved", WT_CHECKPOINT); } /* * __checkpoint_name_check -- * Check for an attempt to name a checkpoint that includes anything * other than a file object. */ static int __checkpoint_name_check(WT_SESSION_IMPL *session, const char *uri) { WT_CURSOR *cursor; WT_DECL_RET; const char *fail; cursor = NULL; fail = NULL; /* * This function exists as a place for this comment: named checkpoints * are only supported on file objects, and not on LSM trees or Helium * devices. If a target list is configured for the checkpoint, this * function is called with each target list entry; check the entry to * make sure it's backed by a file. If no target list is configured, * confirm the metadata file contains no non-file objects. */ if (uri == NULL) { WT_ERR(__wt_metadata_cursor(session, NULL, &cursor)); while ((ret = cursor->next(cursor)) == 0) { WT_ERR(cursor->get_key(cursor, &uri)); if (!WT_PREFIX_MATCH(uri, "colgroup:") && !WT_PREFIX_MATCH(uri, "file:") && !WT_PREFIX_MATCH(uri, "index:") && !WT_PREFIX_MATCH(uri, "table:")) { fail = uri; break; } } WT_ERR_NOTFOUND_OK(ret); } else if (!WT_PREFIX_MATCH(uri, "colgroup:") && !WT_PREFIX_MATCH(uri, "file:") && !WT_PREFIX_MATCH(uri, "index:") && !WT_PREFIX_MATCH(uri, "table:")) fail = uri; if (fail != NULL) WT_ERR_MSG(session, EINVAL, "%s object does not support named checkpoints", fail); err: if (cursor != NULL) WT_TRET(cursor->close(cursor)); return (ret); } /* * __checkpoint_apply_all -- * Apply an operation to all files involved in a checkpoint. */ static int __checkpoint_apply_all(WT_SESSION_IMPL *session, const char *cfg[], int (*op)(WT_SESSION_IMPL *, const char *[]), bool *fullp) { WT_CONFIG targetconf; WT_CONFIG_ITEM cval, k, v; WT_DECL_ITEM(tmp); WT_DECL_RET; bool ckpt_closed, named, target_list; target_list = false; /* Flag if this is a named checkpoint, and check if the name is OK. */ WT_RET(__wt_config_gets(session, cfg, "name", &cval)); named = cval.len != 0; if (named) WT_RET(__wt_checkpoint_name_ok(session, cval.str, cval.len)); /* Step through the targets and optionally operate on each one. */ WT_ERR(__wt_config_gets(session, cfg, "target", &cval)); WT_ERR(__wt_config_subinit(session, &targetconf, &cval)); while ((ret = __wt_config_next(&targetconf, &k, &v)) == 0) { if (!target_list) { WT_ERR(__wt_scr_alloc(session, 512, &tmp)); target_list = true; } if (v.len != 0) WT_ERR_MSG(session, EINVAL, "invalid checkpoint target %.*s: URIs may require " "quoting", (int)cval.len, (char *)cval.str); /* Some objects don't support named checkpoints. 
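		 * For illustration (the URI and checkpoint name here are
		 * hypothetical): a target list entry such as "table:orders" in
		 *
		 *	wt_session->checkpoint(wt_session,
		 *	    "target=(\"table:orders\"),name=noon")
		 *
		 * passes this check, while an "lsm:orders" entry in the same
		 * call fails with EINVAL, because named checkpoints are only
		 * supported on objects ultimately backed by a file.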
		 */
		if (named)
			WT_ERR(__checkpoint_name_check(session, k.str));

		if (op == NULL)
			continue;
		WT_ERR(__wt_buf_fmt(session, tmp, "%.*s", (int)k.len, k.str));
		if ((ret = __wt_schema_worker(
		    session, tmp->data, op, NULL, cfg, 0)) != 0)
			WT_ERR_MSG(session, ret, "%s", (const char *)tmp->data);
	}
	WT_ERR_NOTFOUND_OK(ret);

	if (!target_list && named)
		/* Some objects don't support named checkpoints. */
		WT_ERR(__checkpoint_name_check(session, NULL));

	if (!target_list && op != NULL) {
		/*
		 * If the checkpoint is named or we're dropping checkpoints, we
		 * checkpoint both open and closed files; else, only checkpoint
		 * open files.
		 *
		 * XXX
		 * We don't optimize unnamed checkpoints of a list of targets,
		 * we open the targets and checkpoint them even if they are
		 * quiescent and don't need a checkpoint, believing applications
		 * unlikely to checkpoint a list of closed targets.
		 */
		ckpt_closed = named;
		if (!ckpt_closed) {
			WT_ERR(__wt_config_gets(session, cfg, "drop", &cval));
			ckpt_closed = cval.len != 0;
		}
		WT_ERR(ckpt_closed ?
		    __wt_meta_btree_apply(session, op, cfg) :
		    __wt_conn_btree_apply(session, false, NULL, op, cfg));
	}

	if (fullp != NULL)
		*fullp = !target_list;

err:	__wt_scr_free(session, &tmp);
	return (ret);
}

/*
 * __checkpoint_apply --
 *	Apply an operation to all handles locked for a checkpoint.
 */
static int
__checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[],
    int (*op)(WT_SESSION_IMPL *, const char *[]))
{
	WT_DECL_RET;
	u_int i;

	/* If we have already locked the handles, apply the operation. */
	for (i = 0; i < session->ckpt_handle_next; ++i) {
		if (session->ckpt_handle[i].dhandle != NULL)
			WT_WITH_DHANDLE(session,
			    session->ckpt_handle[i].dhandle,
			    ret = (*op)(session, cfg));
		else
			WT_WITH_HANDLE_LIST_LOCK(session,
			    ret = __wt_conn_btree_apply_single(session,
			    session->ckpt_handle[i].name, NULL, op, cfg));
		WT_RET(ret);
	}

	return (0);
}

/*
 * __checkpoint_data_source --
 *	Checkpoint all data sources.
 */
static int
__checkpoint_data_source(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_NAMED_DATA_SOURCE *ndsrc;
	WT_DATA_SOURCE *dsrc;

	/*
	 * A place-holder, to support Helium devices: we assume calling the
	 * underlying data-source session checkpoint function is sufficient to
	 * checkpoint all objects in the data source, open or closed, and we
	 * don't attempt to optimize the checkpoint of individual targets.
	 * Those assumptions are correct for the Helium device, but they're
	 * not necessarily going to be true for other data sources.
	 *
	 * It's not difficult to support data-source checkpoints of individual
	 * targets (__wt_schema_worker is the underlying function that will do
	 * the work, and it's already written to support data sources,
	 * although we'd probably need to pass the URI of the object to the
	 * data-source checkpoint function, which we don't currently do).
	 * However, doing a full data checkpoint is trickier: currently, the
	 * connection code is written to ignore all objects other than
	 * "file:", and that code will require significant changes to work
	 * with data sources.
	 */
	TAILQ_FOREACH(ndsrc, &S2C(session)->dsrcqh, q) {
		dsrc = ndsrc->dsrc;
		if (dsrc->checkpoint != NULL)
			WT_RET(dsrc->checkpoint(dsrc,
			    (WT_SESSION *)session, (WT_CONFIG_ARG *)cfg));
	}

	return (0);
}

/*
 * __wt_checkpoint_list --
 *	Get a list of handles to flush.
 */
int
__wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_DECL_RET;
	const char *name;

	WT_UNUSED(cfg);

	/* Should not be called with anything other than a file object. */
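	/*
	 * This function is the per-handle operation __wt_txn_checkpoint
	 * passes to __checkpoint_apply_all: target-list entries are resolved
	 * to their underlying "file:" objects through __wt_schema_worker, and
	 * when no target list is given only btree handles are walked, which
	 * is why the assertions below expect a "file:" handle that is not
	 * itself a checkpoint handle.
	 */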
	WT_ASSERT(session, session->dhandle->checkpoint == NULL);
	WT_ASSERT(session, WT_PREFIX_MATCH(session->dhandle->name, "file:"));

	/* Make sure there is space for the next entry. */
	WT_RET(__wt_realloc_def(session, &session->ckpt_handle_allocated,
	    session->ckpt_handle_next + 1, &session->ckpt_handle));

	/* Not strictly necessary, but cleaner to clear the current handle. */
	name = session->dhandle->name;
	session->dhandle = NULL;

	/* Record busy file names; we'll deal with them in the checkpoint. */
	if ((ret = __wt_session_get_btree(session, name, NULL, NULL, 0)) == 0)
		session->ckpt_handle[session->ckpt_handle_next++].dhandle =
		    session->dhandle;
	else if (ret == EBUSY)
		ret = __wt_strdup(session, name,
		    &session->ckpt_handle[session->ckpt_handle_next++].name);

	return (ret);
}

/*
 * __checkpoint_write_leaves --
 *	Write any dirty leaf pages for all checkpoint handles.
 */
static int
__checkpoint_write_leaves(WT_SESSION_IMPL *session, const char *cfg[])
{
	WT_UNUSED(cfg);

	return (__wt_cache_op(session, NULL, WT_SYNC_WRITE_LEAVES));
}

/*
 * __checkpoint_stats --
 *	Update checkpoint timer stats.
 */
static void
__checkpoint_stats(
    WT_SESSION_IMPL *session, struct timespec *start, struct timespec *stop)
{
	uint64_t msec;

	/* Get the time difference in milliseconds. */
	msec = WT_TIMEDIFF(*stop, *start) / WT_MILLION;

	if (msec > WT_CONN_STAT(session, txn_checkpoint_time_max))
		WT_STAT_FAST_CONN_SET(session, txn_checkpoint_time_max, msec);
	if (WT_CONN_STAT(session, txn_checkpoint_time_min) == 0 ||
	    msec < WT_CONN_STAT(session, txn_checkpoint_time_min))
		WT_STAT_FAST_CONN_SET(session, txn_checkpoint_time_min, msec);
	WT_STAT_FAST_CONN_SET(session, txn_checkpoint_time_recent, msec);
	WT_STAT_FAST_CONN_INCRV(session, txn_checkpoint_time_total, msec);
}

/*
 * __checkpoint_verbose_track --
 *	Output a verbose message with timing information.
 */
static int
__checkpoint_verbose_track(WT_SESSION_IMPL *session,
    const char *msg, struct timespec *start)
{
#ifdef HAVE_VERBOSE
	struct timespec stop;
	uint64_t msec;

	if (!WT_VERBOSE_ISSET(session, WT_VERB_CHECKPOINT))
		return (0);

	WT_RET(__wt_epoch(session, &stop));

	/* Get the time difference in milliseconds. */
	msec = WT_TIMEDIFF(stop, *start) / WT_MILLION;
	WT_RET(__wt_verbose(session,
	    WT_VERB_CHECKPOINT, "time: %" PRIu64 " ms, gen: %" PRIu64
	    ": Full database checkpoint %s",
	    msec, S2C(session)->txn_global.checkpoint_gen, msg));

	/* Update the timestamp so we are reporting intervals. */
	memcpy(start, &stop, sizeof(*start));
#else
	WT_UNUSED(session);
	WT_UNUSED(msg);
	WT_UNUSED(start);
#endif
	return (0);
}

/*
 * __wt_txn_checkpoint --
 *	Checkpoint a database or a list of objects in the database.
 */
int
__wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[])
{
	struct timespec start, stop, verb_timer;
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_TXN *txn;
	WT_TXN_GLOBAL *txn_global;
	WT_TXN_ISOLATION saved_isolation;
	WT_TXN_STATE *txn_state;
	const char *txn_cfg[] = { WT_CONFIG_BASE(session,
	    session_begin_transaction), "isolation=snapshot", NULL };
	void *saved_meta_next;
	u_int i;
	bool full, idle, logging, tracking;

	conn = S2C(session);
	txn = &session->txn;
	txn_global = &conn->txn_global;
	txn_state = WT_SESSION_TXN_STATE(session);
	saved_isolation = session->isolation;
	full = idle = logging = tracking = false;

	/* Ensure the metadata table is open before taking any locks. */
	WT_RET(__wt_metadata_open(session));

	/*
	 * Do a pass over the configuration arguments and figure out what
	 * kind of checkpoint this is.
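	 *
	 * A sketch of the mapping at the application level (the checkpoint
	 * name and URI are hypothetical):
	 *
	 *	wt_session->checkpoint(wt_session, NULL)           full, unnamed
	 *	wt_session->checkpoint(wt_session, "name=noon")    full, named
	 *	wt_session->checkpoint(wt_session,
	 *	    "target=(\"table:orders\")")                   not full
	 *	wt_session->checkpoint(wt_session, "drop=(noon)")  full, with drop
	 *
	 * This first pass (op is NULL) only checks the name and the target
	 * list and sets "full"; the later passes do the work.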
*/ WT_RET(__checkpoint_apply_all(session, cfg, NULL, &full)); /* Configure logging only if doing a full checkpoint. */ logging = FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED); /* * Get a list of handles we want to flush; this may pull closed objects * into the session cache, but we're going to do that eventually anyway. */ WT_WITH_SCHEMA_LOCK(session, WT_WITH_TABLE_LOCK(session, WT_WITH_HANDLE_LIST_LOCK(session, ret = __checkpoint_apply_all( session, cfg, __wt_checkpoint_list, NULL)))); WT_ERR(ret); /* * Update the global oldest ID so we do all possible cleanup. * * This is particularly important for compact, so that all dirty pages * can be fully written. */ __wt_txn_update_oldest(session, true); /* Flush data-sources before we start the checkpoint. */ WT_ERR(__checkpoint_data_source(session, cfg)); WT_ERR(__wt_epoch(session, &verb_timer)); WT_ERR(__checkpoint_verbose_track(session, "starting write leaves", &verb_timer)); /* Flush dirty leaf pages before we start the checkpoint. */ session->isolation = txn->isolation = WT_ISO_READ_COMMITTED; WT_ERR(__checkpoint_apply(session, cfg, __checkpoint_write_leaves)); /* * The underlying flush routine scheduled an asynchronous flush * after writing the leaf pages, but in order to minimize I/O * while holding the schema lock, do a flush and wait for the * completion. Do it after flushing the pages to give the * asynchronous flush as much time as possible before we wait. */ if (F_ISSET(conn, WT_CONN_CKPT_SYNC)) WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_sync)); /* Acquire the schema lock. */ F_SET(session, WT_SESSION_LOCKED_SCHEMA); __wt_spin_lock(session, &conn->schema_lock); WT_ERR(__wt_meta_track_on(session)); tracking = true; /* Tell logging that we are about to start a database checkpoint. */ if (full && logging) WT_ERR(__wt_txn_checkpoint_log( session, full, WT_TXN_LOG_CKPT_PREPARE, NULL)); WT_ERR(__checkpoint_verbose_track(session, "starting transaction", &verb_timer)); if (full) WT_ERR(__wt_epoch(session, &start)); /* * Bump the global checkpoint generation, used to figure out whether * checkpoint has visited a tree. There is no need for this to be * atomic: it is only written while holding the checkpoint lock. * * We do need to update it before clearing the checkpoint's entry out * of the transaction table, or a thread evicting in a tree could * ignore the checkpoint's transaction. */ ++txn_global->checkpoint_gen; WT_STAT_FAST_CONN_SET(session, txn_checkpoint_generation, txn_global->checkpoint_gen); /* * Start a snapshot transaction for the checkpoint. * * Note: we don't go through the public API calls because they have * side effects on cursors, which applications can hold open across * calls to checkpoint. */ WT_ERR(__wt_txn_begin(session, txn_cfg)); /* Ensure a transaction ID is allocated prior to sharing it globally */ WT_ERR(__wt_txn_id_check(session)); /* * Save the checkpoint session ID. We never do checkpoints in the * default session (with id zero). */ WT_ASSERT(session, session->id != 0 && txn_global->checkpoint_id == 0); txn_global->checkpoint_id = session->id; txn_global->checkpoint_pinned = WT_MIN(txn_state->id, txn_state->snap_min); /* * We're about to clear the checkpoint transaction from the global * state table so the oldest ID can move forward. Make sure everything * we've done above is scheduled. */ WT_FULL_BARRIER(); /* * Sanity check that the oldest ID hasn't moved on before we have * cleared our entry. 
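	 * Our ID and snap_min are still published in the global transaction
	 * table and checkpoint_pinned was set above, so the oldest ID cannot
	 * have advanced past either value; for example, with an ID of 100
	 * and a snap_min of 90, oldest_id must still be at or below 90. The
	 * assertion below checks both bounds.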
*/ WT_ASSERT(session, WT_TXNID_LE(txn_global->oldest_id, txn_state->id) && WT_TXNID_LE(txn_global->oldest_id, txn_state->snap_min)); /* * Clear our entry from the global transaction session table. Any * operation that needs to know about the ID for this checkpoint will * consider the checkpoint ID in the global structure. Most operations * can safely ignore the checkpoint ID (see the visible all check for * details). */ txn_state->id = txn_state->snap_min = WT_TXN_NONE; /* Tell logging that we have started a database checkpoint. */ if (full && logging) WT_ERR(__wt_txn_checkpoint_log( session, full, WT_TXN_LOG_CKPT_START, NULL)); WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint)); /* * Clear the dhandle so the visibility check doesn't get confused about * the snap min. Don't bother restoring the handle since it doesn't * make sense to carry a handle across a checkpoint. */ session->dhandle = NULL; /* Release the snapshot so we aren't pinning pages in cache. */ __wt_txn_release_snapshot(session); WT_ERR(__checkpoint_verbose_track(session, "committing transaction", &verb_timer)); /* * Checkpoints have to hit disk (it would be reasonable to configure for * lazy checkpoints, but we don't support them yet). */ if (F_ISSET(conn, WT_CONN_CKPT_SYNC)) WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_sync)); WT_ERR(__checkpoint_verbose_track(session, "sync completed", &verb_timer)); /* * Commit the transaction now that we are sure that all files in the * checkpoint have been flushed to disk. It's OK to commit before * checkpointing the metadata since we know that all files in the * checkpoint are now in a consistent state. */ WT_ERR(__wt_txn_commit(session, NULL)); /* * Ensure that the metadata changes are durable before the checkpoint * is resolved. Do this by either checkpointing the metadata or syncing * the log file. * Recovery relies on the checkpoint LSN in the metadata only being * updated by full checkpoints so only checkpoint the metadata for * full or non-logged checkpoints. */ if (full || !logging) { session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED; /* Disable metadata tracking during the metadata checkpoint. */ saved_meta_next = session->meta_track_next; session->meta_track_next = NULL; WT_WITH_DHANDLE(session, session->meta_dhandle, ret = __wt_checkpoint(session, cfg)); session->meta_track_next = saved_meta_next; WT_ERR(ret); WT_ERR(__checkpoint_verbose_track(session, "metadata sync completed", &verb_timer)); } else WT_WITH_DHANDLE(session, session->meta_dhandle, ret = __wt_txn_checkpoint_log( session, false, WT_TXN_LOG_CKPT_SYNC, NULL)); if (full) { WT_ERR(__wt_epoch(session, &stop)); __checkpoint_stats(session, &start, &stop); } err: /* * XXX * Rolling back the changes here is problematic. * * If we unroll here, we need a way to roll back changes to the avail * list for each tree that was successfully synced before the error * occurred. Otherwise, the next time we try this operation, we will * try to free an old checkpoint again. * * OTOH, if we commit the changes after a failure, we have partially * overwritten the checkpoint, so what ends up on disk is not * consistent. */ session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED; if (tracking) WT_TRET(__wt_meta_track_off(session, false, ret != 0)); if (F_ISSET(txn, WT_TXN_RUNNING)) { /* * Clear the dhandle so the visibility check doesn't get * confused about the snap min. Don't bother restoring the * handle since it doesn't make sense to carry a handle across * a checkpoint. 
*/ session->dhandle = NULL; WT_TRET(__wt_txn_rollback(session, NULL)); } /* * Tell logging that we have finished a database checkpoint. Do not * write a log record if the database was idle. */ if (full && logging) { if (ret == 0 && F_ISSET((WT_BTREE *)session->meta_dhandle->handle, WT_BTREE_SKIP_CKPT)) idle = true; WT_TRET(__wt_txn_checkpoint_log(session, full, (ret == 0 && !idle) ? WT_TXN_LOG_CKPT_STOP : WT_TXN_LOG_CKPT_CLEANUP, NULL)); } for (i = 0; i < session->ckpt_handle_next; ++i) { if (session->ckpt_handle[i].dhandle == NULL) { __wt_free(session, session->ckpt_handle[i].name); continue; } WT_WITH_DHANDLE(session, session->ckpt_handle[i].dhandle, WT_TRET(__wt_session_release_btree(session))); } __wt_free(session, session->ckpt_handle); session->ckpt_handle_allocated = session->ckpt_handle_next = 0; if (F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)) { F_CLR(session, WT_SESSION_LOCKED_SCHEMA); __wt_spin_unlock(session, &conn->schema_lock); } session->isolation = txn->isolation = saved_isolation; return (ret); } /* * __drop -- * Drop all checkpoints with a specific name. */ static void __drop(WT_CKPT *ckptbase, const char *name, size_t len) { WT_CKPT *ckpt; /* * If we're dropping internal checkpoints, match to the '.' separating * the checkpoint name from the generational number, and take all that * we can find. Applications aren't allowed to use any variant of this * name, so the test is still pretty simple, if the leading bytes match, * it's one we want to drop. */ if (strncmp(WT_CHECKPOINT, name, len) == 0) { WT_CKPT_FOREACH(ckptbase, ckpt) if (WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) F_SET(ckpt, WT_CKPT_DELETE); } else WT_CKPT_FOREACH(ckptbase, ckpt) if (WT_STRING_MATCH(ckpt->name, name, len)) F_SET(ckpt, WT_CKPT_DELETE); } /* * __drop_from -- * Drop all checkpoints after, and including, the named checkpoint. */ static void __drop_from(WT_CKPT *ckptbase, const char *name, size_t len) { WT_CKPT *ckpt; bool matched; /* * There's a special case -- if the name is "all", then we delete all * of the checkpoints. */ if (WT_STRING_MATCH("all", name, len)) { WT_CKPT_FOREACH(ckptbase, ckpt) F_SET(ckpt, WT_CKPT_DELETE); return; } /* * We use the first checkpoint we can find, that is, if there are two * checkpoints with the same name in the list, we'll delete from the * first match to the end. */ matched = false; WT_CKPT_FOREACH(ckptbase, ckpt) { if (!matched && !WT_STRING_MATCH(ckpt->name, name, len)) continue; matched = true; F_SET(ckpt, WT_CKPT_DELETE); } } /* * __drop_to -- * Drop all checkpoints before, and including, the named checkpoint. */ static void __drop_to(WT_CKPT *ckptbase, const char *name, size_t len) { WT_CKPT *ckpt, *mark; /* * We use the last checkpoint we can find, that is, if there are two * checkpoints with the same name in the list, we'll delete from the * beginning to the second match, not the first. */ mark = NULL; WT_CKPT_FOREACH(ckptbase, ckpt) if (WT_STRING_MATCH(ckpt->name, name, len)) mark = ckpt; if (mark == NULL) return; WT_CKPT_FOREACH(ckptbase, ckpt) { F_SET(ckpt, WT_CKPT_DELETE); if (ckpt == mark) break; } } /* * __checkpoint_worker -- * Checkpoint a tree. 
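 *
 *	There are two callers: __wt_checkpoint, which passes is_checkpoint
 *	as true to checkpoint the tree, and __wt_checkpoint_close, which
 *	passes false to flush the tree as part of closing its handle.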
*/ static int __checkpoint_worker( WT_SESSION_IMPL *session, const char *cfg[], bool is_checkpoint) { WT_BM *bm; WT_BTREE *btree; WT_CKPT *ckpt, *ckptbase; WT_CONFIG dropconf; WT_CONFIG_ITEM cval, k, v; WT_CONNECTION_IMPL *conn; WT_DATA_HANDLE *dhandle; WT_DECL_RET; WT_LSN ckptlsn; int deleted, was_modified; bool fake_ckpt, force, hot_backup_locked; const char *name; char *name_alloc; btree = S2BT(session); bm = btree->bm; conn = S2C(session); ckpt = ckptbase = NULL; dhandle = session->dhandle; was_modified = btree->modified; fake_ckpt = hot_backup_locked = false; name_alloc = NULL; /* * Set the checkpoint LSN to the maximum LSN so that if logging is * disabled, recovery will never roll old changes forward over the * non-logged changes in this checkpoint. If logging is enabled, a * real checkpoint LSN will be assigned later for this checkpoint and * overwrite this. */ WT_MAX_LSN(&ckptlsn); /* Get the list of checkpoints for this file. */ WT_RET(__wt_meta_ckptlist_get(session, dhandle->name, &ckptbase)); /* This may be a named checkpoint, check the configuration. */ cval.len = 0; if (cfg != NULL) WT_ERR(__wt_config_gets(session, cfg, "name", &cval)); if (cval.len == 0) name = WT_CHECKPOINT; else { WT_ERR(__wt_checkpoint_name_ok(session, cval.str, cval.len)); WT_ERR(__wt_strndup(session, cval.str, cval.len, &name_alloc)); name = name_alloc; } /* We may be dropping specific checkpoints, check the configuration. */ if (cfg != NULL) { cval.len = 0; WT_ERR(__wt_config_gets(session, cfg, "drop", &cval)); if (cval.len != 0) { WT_ERR(__wt_config_subinit(session, &dropconf, &cval)); while ((ret = __wt_config_next(&dropconf, &k, &v)) == 0) { /* Disallow unsafe checkpoint names. */ if (v.len == 0) WT_ERR(__wt_checkpoint_name_ok( session, k.str, k.len)); else WT_ERR(__wt_checkpoint_name_ok( session, v.str, v.len)); if (v.len == 0) __drop(ckptbase, k.str, k.len); else if (WT_STRING_MATCH("from", k.str, k.len)) __drop_from(ckptbase, v.str, v.len); else if (WT_STRING_MATCH("to", k.str, k.len)) __drop_to(ckptbase, v.str, v.len); else WT_ERR_MSG(session, EINVAL, "unexpected value for checkpoint " "key: %.*s", (int)k.len, k.str); } WT_ERR_NOTFOUND_OK(ret); } } /* Drop checkpoints with the same name as the one we're taking. */ __drop(ckptbase, name, strlen(name)); /* * Check for clean objects not requiring a checkpoint. * * If we're closing a handle, and the object is clean, we can skip the * checkpoint, whatever checkpoints we have are sufficient. (We might * not have any checkpoints if the object was never modified, and that's * OK: the object creation code doesn't mark the tree modified so we can * skip newly created trees here.) * * If the application repeatedly checkpoints an object (imagine hourly * checkpoints using the same explicit or internal name), there's no * reason to repeat the checkpoint for clean objects. The test is if * the only checkpoint we're deleting is the last one in the list and * it has the same name as the checkpoint we're about to take, skip the * work. (We can't skip checkpoints that delete more than the last * checkpoint because deleting those checkpoints might free up space in * the file.) This means an application toggling between two (or more) * checkpoint names will repeatedly take empty checkpoints, but that's * not likely enough to make detection worthwhile. * * Checkpoint read-only objects otherwise: the application must be able * to open the checkpoint in a cursor after taking any checkpoint, which * means it must exist. 
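	 *
	 * For example (the names are illustrative): an application taking
	 * hourly checkpoints of a clean table as "name=hourly" skips the
	 * work after the first one, because the only checkpoint being
	 * deleted is the previous "hourly" and the new checkpoint has the
	 * same name; an application alternating "name=odd" and "name=even"
	 * never matches, and so takes a new, empty checkpoint of the clean
	 * tree every time.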
*/ force = false; F_CLR(btree, WT_BTREE_SKIP_CKPT); if (!btree->modified && cfg != NULL) { ret = __wt_config_gets(session, cfg, "force", &cval); if (ret != 0 && ret != WT_NOTFOUND) WT_ERR(ret); if (ret == 0 && cval.val != 0) force = true; } if (!btree->modified && !force) { if (!is_checkpoint) goto nockpt; deleted = 0; WT_CKPT_FOREACH(ckptbase, ckpt) if (F_ISSET(ckpt, WT_CKPT_DELETE)) ++deleted; /* * Complicated test: if the last checkpoint in the object has * the same name as the checkpoint we're taking (correcting for * internal checkpoint names with their generational suffix * numbers), we can skip the checkpoint, there's nothing to do. * The exception is if we're deleting two or more checkpoints: * then we may save space. */ if (ckpt > ckptbase && (strcmp(name, (ckpt - 1)->name) == 0 || (WT_PREFIX_MATCH(name, WT_CHECKPOINT) && WT_PREFIX_MATCH((ckpt - 1)->name, WT_CHECKPOINT))) && deleted < 2) { nockpt: F_SET(btree, WT_BTREE_SKIP_CKPT); WT_PUBLISH(btree->checkpoint_gen, S2C(session)->txn_global.checkpoint_gen); WT_STAT_FAST_DATA_SET(session, btree_checkpoint_generation, btree->checkpoint_gen); goto done; } } /* Add a new checkpoint entry at the end of the list. */ WT_CKPT_FOREACH(ckptbase, ckpt) ; WT_ERR(__wt_strdup(session, name, &ckpt->name)); F_SET(ckpt, WT_CKPT_ADD); /* * We can't delete checkpoints if a backup cursor is open. WiredTiger * checkpoints are uniquely named and it's OK to have multiple of them * in the system: clear the delete flag for them, and otherwise fail. * Hold the lock until we're done (blocking hot backups from starting), * we don't want to race with a future hot backup. */ __wt_spin_lock(session, &conn->hot_backup_lock); hot_backup_locked = true; if (conn->hot_backup) WT_CKPT_FOREACH(ckptbase, ckpt) { if (!F_ISSET(ckpt, WT_CKPT_DELETE)) continue; if (WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) { F_CLR(ckpt, WT_CKPT_DELETE); continue; } WT_ERR_MSG(session, EBUSY, "checkpoint %s blocked by hot backup: it would " "delete an existing checkpoint, and checkpoints " "cannot be deleted during a hot backup", ckpt->name); } /* * Lock the checkpoints that will be deleted. * * Checkpoints are only locked when tracking is enabled, which covers * checkpoint and drop operations, but not close. The reasoning is * there should be no access to a checkpoint during close, because any * thread accessing a checkpoint will also have the current file handle * open. */ if (WT_META_TRACKING(session)) WT_CKPT_FOREACH(ckptbase, ckpt) { if (!F_ISSET(ckpt, WT_CKPT_DELETE)) continue; /* * We can't delete checkpoints referenced by a cursor. * WiredTiger checkpoints are uniquely named and it's * OK to have multiple in the system: clear the delete * flag for them, and otherwise fail. */ ret = __wt_session_lock_checkpoint(session, ckpt->name); if (ret == 0) continue; if (ret == EBUSY && WT_PREFIX_MATCH(ckpt->name, WT_CHECKPOINT)) { F_CLR(ckpt, WT_CKPT_DELETE); continue; } WT_ERR_MSG(session, ret, "checkpoints cannot be dropped when in-use"); } /* * There are special files: those being bulk-loaded, salvaged, upgraded * or verified during the checkpoint. We have to do something for those * objects because a checkpoint is an external name the application can * reference and the name must exist no matter what's happening during * the checkpoint. For bulk-loaded files, we could block until the load * completes, checkpoint the partial load, or magic up an empty-file * checkpoint. The first is too slow, the second is insane, so do the * third. 
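	 * (The empty-file checkpoint is the "fake" checkpoint below: for
	 * bulk-loaded files we set fake_ckpt and jump to the fake label
	 * without involving the underlying block manager.)
	 *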
* Salvage, upgrade and verify don't currently require any work, all * three hold the schema lock, blocking checkpoints. If we ever want to * fix that (and I bet we eventually will, at least for verify), we can * copy the last checkpoint the file has. That works if we guarantee * salvage, upgrade and verify act on objects with previous checkpoints * (true if handles are closed/re-opened between object creation and a * subsequent salvage, upgrade or verify operation). Presumably, * salvage and upgrade will discard all previous checkpoints when they * complete, which is fine with us. This change will require reference * counting checkpoints, and once that's done, we should use checkpoint * copy instead of forcing checkpoints on clean objects to associate * names with checkpoints. */ if (is_checkpoint) switch (F_MASK(btree, WT_BTREE_SPECIAL_FLAGS)) { case 0: break; case WT_BTREE_BULK: /* * The only checkpoints a bulk-loaded file should have * are fake ones we created without the underlying block * manager. I'm leaving this code here because it's a * cheap test and a nasty race. */ WT_CKPT_FOREACH(ckptbase, ckpt) if (!F_ISSET(ckpt, WT_CKPT_ADD | WT_CKPT_FAKE)) WT_ERR_MSG(session, ret, "block-manager checkpoint found " "for a bulk-loaded file"); fake_ckpt = true; goto fake; case WT_BTREE_SALVAGE: case WT_BTREE_UPGRADE: case WT_BTREE_VERIFY: WT_ERR_MSG(session, EINVAL, "checkpoints are blocked during salvage, upgrade " "or verify operations"); } /* * If an object has never been used (in other words, if it could become * a bulk-loaded file), then we must fake the checkpoint. This is good * because we don't write physical checkpoint blocks for just-created * files, but it's not just a good idea. The reason is because deleting * a physical checkpoint requires writing the file, and fake checkpoints * can't write the file. If you (1) create a physical checkpoint for an * empty file which writes blocks, (2) start bulk-loading records into * the file, (3) during the bulk-load perform another checkpoint with * the same name; in order to keep from having two checkpoints with the * same name you would have to use the bulk-load's fake checkpoint to * delete a physical checkpoint, and that will end in tears. */ if (is_checkpoint) if (btree->bulk_load_ok) { fake_ckpt = true; goto fake; } /* * Mark the root page dirty to ensure something gets written. (If the * tree is modified, we must write the root page anyway, this doesn't * add additional writes to the process. If the tree is not modified, * we have to dirty the root page to ensure something gets written.) * This is really about paranoia: if the tree modification value gets * out of sync with the set of dirty pages (modify is set, but there * are no dirty pages), we perform a checkpoint without any writes, no * checkpoint is created, and then things get bad. */ WT_ERR(__wt_page_modify_init(session, btree->root.page)); __wt_page_modify_set(session, btree->root.page); /* * Clear the tree's modified flag; any changes before we clear the flag * are guaranteed to be part of this checkpoint (unless reconciliation * skips updates for transactional reasons), and changes subsequent to * the checkpoint start, which might not be included, will re-set the * modified flag. The "unless reconciliation skips updates" problem is * handled in the reconciliation code: if reconciliation skips updates, * it sets the modified flag itself. Use a full barrier so we get the * store done quickly, this isn't a performance path. 
*/ btree->modified = 0; WT_FULL_BARRIER(); /* Tell logging that a file checkpoint is starting. */ if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) WT_ERR(__wt_txn_checkpoint_log( session, false, WT_TXN_LOG_CKPT_START, &ckptlsn)); /* Flush the file from the cache, creating the checkpoint. */ if (is_checkpoint) WT_ERR(__wt_cache_op(session, ckptbase, WT_SYNC_CHECKPOINT)); else WT_ERR(__wt_cache_op(session, ckptbase, WT_SYNC_CLOSE)); /* * All blocks being written have been written; set the object's write * generation. */ WT_CKPT_FOREACH(ckptbase, ckpt) if (F_ISSET(ckpt, WT_CKPT_ADD)) ckpt->write_gen = btree->write_gen; fake: /* * If we're faking a checkpoint and logging is enabled, recovery should * roll forward any changes made between now and the next checkpoint, * so set the checkpoint LSN to the beginning of time. */ if (fake_ckpt && FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) WT_INIT_LSN(&ckptlsn); /* * Update the object's metadata. * * If the object is the metadata, the call to __wt_meta_ckptlist_set * will update the turtle file and swap the new one into place. We * need to make sure the metadata is on disk before the turtle file is * updated. * * If we are doing a checkpoint in a file without a transaction (e.g., * closing a dirty tree before an exclusive operation like verify), * the metadata update will be auto-committed. In that case, we need to * sync the file here or we could roll forward the metadata in * recovery and open a checkpoint that isn't yet durable. */ if (F_ISSET(conn, WT_CONN_CKPT_SYNC) && (WT_IS_METADATA(dhandle) || !F_ISSET(&session->txn, WT_TXN_RUNNING))) WT_ERR(__wt_checkpoint_sync(session, NULL)); WT_ERR(__wt_meta_ckptlist_set( session, dhandle->name, ckptbase, &ckptlsn)); /* * If we wrote a checkpoint (rather than faking one), pages may be * available for re-use. If tracking enabled, defer making pages * available until transaction end. The exception is if the handle * is being discarded, in which case the handle will be gone by the * time we try to apply or unroll the meta tracking event. */ if (!fake_ckpt) { if (WT_META_TRACKING(session) && is_checkpoint) WT_ERR(__wt_meta_track_checkpoint(session)); else WT_ERR(bm->checkpoint_resolve(bm, session)); } /* Tell logging that the checkpoint is complete. */ if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) WT_ERR(__wt_txn_checkpoint_log( session, false, WT_TXN_LOG_CKPT_STOP, NULL)); done: err: /* * If the checkpoint didn't complete successfully, make sure the * tree is marked dirty. */ if (ret != 0 && !btree->modified && was_modified) btree->modified = 1; if (hot_backup_locked) __wt_spin_unlock(session, &conn->hot_backup_lock); __wt_meta_ckptlist_free(session, ckptbase); __wt_free(session, name_alloc); return (ret); } /* * __wt_checkpoint -- * Checkpoint a file. */ int __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) { /* Should not be called with a checkpoint handle. */ WT_ASSERT(session, session->dhandle->checkpoint == NULL); /* Should be holding the schema lock. */ WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)); return (__checkpoint_worker(session, cfg, true)); } /* * __wt_checkpoint_sync -- * Sync a file that has been checkpointed, and wait for the result. */ int __wt_checkpoint_sync(WT_SESSION_IMPL *session, const char *cfg[]) { WT_BM *bm; WT_UNUSED(cfg); bm = S2BT(session)->bm; /* Should not be called with a checkpoint handle. */ WT_ASSERT(session, session->dhandle->checkpoint == NULL); /* Should have an underlying block manager reference. 
*/ WT_ASSERT(session, bm != NULL); return (bm->sync(bm, session, false)); } /* * __wt_checkpoint_close -- * Checkpoint a single file as part of closing the handle. */ int __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final) { WT_BTREE *btree; WT_DECL_RET; bool bulk, need_tracking; btree = S2BT(session); bulk = F_ISSET(btree, WT_BTREE_BULK); /* If the handle is already dead, force the discard. */ if (F_ISSET(session->dhandle, WT_DHANDLE_DEAD)) return (__wt_cache_op(session, NULL, WT_SYNC_DISCARD_FORCE)); /* * If closing an unmodified file, check that no update is required * for active readers. */ if (!btree->modified && !bulk) { __wt_txn_update_oldest(session, true); return (__wt_txn_visible_all(session, btree->rec_max_txn) ? __wt_cache_op(session, NULL, WT_SYNC_DISCARD) : EBUSY); } /* * We should already have the schema lock unless we're finishing a bulk * load -- the only other paths to closing files (sweep and LSM) have * already checked for read-only trees. */ WT_ASSERT(session, final || bulk || F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)); /* * Turn on metadata tracking if: * - The session is not already doing metadata tracking. * - The file was bulk loaded. * - The close is not during connection close. */ need_tracking = !WT_META_TRACKING(session) && !bulk && !final; if (need_tracking) WT_RET(__wt_meta_track_on(session)); WT_TRET(__checkpoint_worker(session, NULL, false)); if (need_tracking) WT_RET(__wt_meta_track_off(session, true, ret != 0)); return (ret); }
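
/*
 * Example (illustrative only, not part of the library): the checkpoint
 * paths in this file are driven by the public WT_SESSION::checkpoint API.
 * A minimal application sketch, assuming a hypothetical "WT_HOME"
 * directory and "table:example" object:
 *
 *	#include <wiredtiger.h>
 *
 *	int
 *	main(void)
 *	{
 *		WT_CONNECTION *conn;
 *		WT_SESSION *session;
 *
 *		if (wiredtiger_open("WT_HOME", NULL, "create", &conn) != 0)
 *			return (1);
 *		if (conn->open_session(conn, NULL, NULL, &session) != 0)
 *			return (1);
 *		if (session->create(session,
 *		    "table:example", "key_format=S,value_format=S") != 0)
 *			return (1);
 *
 *		(void)session->checkpoint(session, NULL);
 *		(void)session->checkpoint(session, "name=noon");
 *		(void)session->checkpoint(session, "drop=(noon)");
 *
 *		return (conn->close(conn, NULL) == 0 ? 0 : 1);
 *	}
 *
 * The first call takes an unnamed full checkpoint, the second a full
 * checkpoint named "noon", and the third takes another unnamed checkpoint
 * while dropping the earlier "noon" checkpoint.
 */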