/*-
 * Copyright (c) 2014-present MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *  All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __session_add_dhandle --
 *     Add a handle to the session's cache.
 */
static int
__session_add_dhandle(WT_SESSION_IMPL *session)
{
    WT_DATA_HANDLE_CACHE *dhandle_cache;
    uint64_t bucket;

    /* Allocate a handle cache entry. */
    WT_RET(__wt_calloc_one(session, &dhandle_cache));
    dhandle_cache->dhandle = session->dhandle;

    bucket = dhandle_cache->dhandle->name_hash & (S2C(session)->dh_hash_size - 1);
    TAILQ_INSERT_HEAD(&session->dhandles, dhandle_cache, q);
    TAILQ_INSERT_HEAD(&session->dhhash[bucket], dhandle_cache, hashq);

    return (0);
}

/*
 * __session_discard_dhandle --
 *     Remove a data handle from the session cache.
 */
static void
__session_discard_dhandle(WT_SESSION_IMPL *session, WT_DATA_HANDLE_CACHE *dhandle_cache)
{
    uint64_t bucket;

    bucket = dhandle_cache->dhandle->name_hash & (S2C(session)->dh_hash_size - 1);
    TAILQ_REMOVE(&session->dhandles, dhandle_cache, q);
    TAILQ_REMOVE(&session->dhhash[bucket], dhandle_cache, hashq);

    WT_DHANDLE_RELEASE(dhandle_cache->dhandle);
    __wt_overwrite_and_free(session, dhandle_cache);
}

/*
 * __session_find_dhandle --
 *     Search for a data handle in the session cache.
 */
static void
__session_find_dhandle(WT_SESSION_IMPL *session, const char *uri, const char *checkpoint,
  WT_DATA_HANDLE_CACHE **dhandle_cachep)
{
    WT_DATA_HANDLE *dhandle;
    WT_DATA_HANDLE_CACHE *dhandle_cache;
    uint64_t bucket;

    dhandle = NULL;

    bucket = __wt_hash_city64(uri, strlen(uri)) & (S2C(session)->dh_hash_size - 1);
retry:
    TAILQ_FOREACH (dhandle_cache, &session->dhhash[bucket], hashq) {
        dhandle = dhandle_cache->dhandle;
        if (WT_DHANDLE_INACTIVE(dhandle) && !WT_IS_METADATA(dhandle)) {
            __session_discard_dhandle(session, dhandle_cache);
            /* We deleted our entry, retry from the start. */
            goto retry;
        }

        if (strcmp(uri, dhandle->name) != 0)
            continue;
        if (checkpoint == NULL && dhandle->checkpoint == NULL)
            break;
        if (checkpoint != NULL && dhandle->checkpoint != NULL &&
          strcmp(checkpoint, dhandle->checkpoint) == 0)
            break;
    }

    *dhandle_cachep = dhandle_cache;
}
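
/*
 * The three helpers above implement the per-session data handle cache: every cached handle is
 * linked onto both an all-entries list and one hash bucket, and the bucket count is a power of
 * two so a hash value can be reduced to a bucket index with a simple mask. The block below is an
 * illustrative, compiled-out sketch of that structure using hypothetical example_* names; it is
 * not part of the WiredTiger sources.
 */
#if 0
#include <sys/queue.h>
#include <stdint.h>
#include <string.h>

struct example_entry {
    char *name;
    uint64_t name_hash;
    TAILQ_ENTRY(example_entry) q;     /* All-entries list linkage. */
    TAILQ_ENTRY(example_entry) hashq; /* Hash-bucket list linkage. */
};

struct example_cache {
    uint64_t hash_size; /* Must be a power of two for the mask below to work. */
    TAILQ_HEAD(example_qh, example_entry) all;
    struct example_qh *buckets; /* Array of hash_size bucket lists. */
};

/* Insert at the head of both the all-entries list and the entry's hash bucket. */
static void
example_cache_insert(struct example_cache *cache, struct example_entry *entry)
{
    uint64_t bucket;

    bucket = entry->name_hash & (cache->hash_size - 1);
    TAILQ_INSERT_HEAD(&cache->all, entry, q);
    TAILQ_INSERT_HEAD(&cache->buckets[bucket], entry, hashq);
}

/* Look up by name, scanning only the relevant hash bucket. */
static struct example_entry *
example_cache_find(struct example_cache *cache, const char *name, uint64_t name_hash)
{
    struct example_entry *entry;
    uint64_t bucket;

    bucket = name_hash & (cache->hash_size - 1);
    TAILQ_FOREACH (entry, &cache->buckets[bucket], hashq)
        if (strcmp(name, entry->name) == 0)
            return (entry);
    return (NULL);
}
#endif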

/*
 * __wt_session_lock_dhandle --
 *     Return when the current data handle is either (a) open with the requested lock mode; or (b)
 *     closed and write locked. If exclusive access is requested and cannot be granted immediately
 *     because the handle is in use, fail with EBUSY.
 *
 *     Here is a brief summary of how different operations synchronize using either the schema
 *     lock, handle locks or handle flags:
 *
 *     open -- one thread gets the handle exclusive, reverts to a shared handle lock once the
 *     handle is open;
 *     bulk load -- sets bulk and exclusive;
 *     salvage, truncate, update, verify -- hold the schema lock, get the handle exclusive, set a
 *     "special" flag;
 *     sweep -- gets a write lock on the handle, doesn't set exclusive.
 *
 *     The principle is that some application operations can cause other application operations to
 *     fail (so attempting to open a cursor on a file while it is being bulk-loaded will fail), but
 *     internal or database-wide operations should not prevent application-initiated operations.
 *     For example, attempting to verify a file should not fail because the sweep server happens to
 *     be in the process of closing that file.
 */
int
__wt_session_lock_dhandle(WT_SESSION_IMPL *session, uint32_t flags, bool *is_deadp)
{
    WT_BTREE *btree;
    WT_DATA_HANDLE *dhandle;
    WT_DECL_RET;
    bool is_open, lock_busy, want_exclusive;

    *is_deadp = false;

    dhandle = session->dhandle;
    btree = dhandle->handle;
    lock_busy = false;
    want_exclusive = LF_ISSET(WT_DHANDLE_EXCLUSIVE);

    /*
     * If this session already has exclusive access to the handle, there is no point trying to
     * lock it again.
     *
     * This should only happen if a checkpoint handle is locked multiple times during a checkpoint
     * operation, or the handle is already open without any special flags. In particular, it must
     * fail if attempting to checkpoint a handle opened for a bulk load, even in the same session.
     */
    if (dhandle->excl_session == session) {
        if (!LF_ISSET(WT_DHANDLE_LOCK_ONLY) &&
          (!F_ISSET(dhandle, WT_DHANDLE_OPEN) ||
            (btree != NULL && F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS))))
            return (__wt_set_return(session, EBUSY));
        ++dhandle->excl_ref;
        return (0);
    }

    /*
     * Check that the handle is open. We've already incremented the reference count, so once the
     * handle is open it won't be closed by another thread.
     *
     * If we can see the WT_DHANDLE_OPEN flag set while holding a lock on the handle, then it's
     * really open and we can start using it. Alternatively, if we can get an exclusive lock and
     * WT_DHANDLE_OPEN is still not set, we need to do the open.
     */
    for (;;) {
        /* If the handle is dead, give up. */
        if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) {
            *is_deadp = true;
            return (0);
        }

        /*
         * If the handle is already open for a special operation, give up.
         */
        if (btree != NULL && F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS))
            return (__wt_set_return(session, EBUSY));

        /*
         * If the handle is open, get a read lock and recheck.
         *
         * Wait for a read lock if we want exclusive access and failed to get it: the sweep server
         * may be closing this handle, and we need to wait for it to release its lock. If we want
         * exclusive access and find the handle open once we get the read lock, give up: some
         * other thread has it locked for real.
         */
        if (F_ISSET(dhandle, WT_DHANDLE_OPEN) && (!want_exclusive || lock_busy)) {
            WT_WITH_DHANDLE(session, dhandle, __wt_session_dhandle_readlock(session));
            if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) {
                *is_deadp = true;
                WT_WITH_DHANDLE(session, dhandle, __wt_session_dhandle_readunlock(session));
                return (0);
            }

            is_open = F_ISSET(dhandle, WT_DHANDLE_OPEN);
            if (is_open && !want_exclusive)
                return (0);
            WT_WITH_DHANDLE(session, dhandle, __wt_session_dhandle_readunlock(session));
        } else
            is_open = false;

        /*
         * It isn't open or we want it exclusive: try to get an exclusive lock. There is some
         * subtlety here: if we race with another thread that successfully opens the file, we
         * don't want to block waiting to get exclusive access.
         */
        WT_WITH_DHANDLE(session, dhandle, ret = __wt_session_dhandle_try_writelock(session));
        if (ret == 0) {
            if (F_ISSET(dhandle, WT_DHANDLE_DEAD)) {
                *is_deadp = true;
                WT_WITH_DHANDLE(session, dhandle, __wt_session_dhandle_writeunlock(session));
                return (0);
            }

            /*
             * If it was opened while we waited, drop the write lock and get a read lock instead.
             */
            if (F_ISSET(dhandle, WT_DHANDLE_OPEN) && !want_exclusive) {
                lock_busy = false;
                WT_WITH_DHANDLE(session, dhandle, __wt_session_dhandle_writeunlock(session));
                continue;
            }

            /* We have an exclusive lock, we're done. */
            F_SET(dhandle, WT_DHANDLE_EXCLUSIVE);
            WT_ASSERT(session, dhandle->excl_session == NULL && dhandle->excl_ref == 0);
            dhandle->excl_session = session;
            dhandle->excl_ref = 1;
            WT_ASSERT(session, !F_ISSET(dhandle, WT_DHANDLE_DEAD));
            return (0);
        }
        if (ret != EBUSY || (is_open && want_exclusive) || LF_ISSET(WT_DHANDLE_LOCK_ONLY))
            return (ret);
        lock_busy = true;

        /* Give other threads a chance to make progress. */
        WT_STAT_CONN_INCR(session, dhandle_lock_blocked);
        __wt_yield();
    }
}
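
/*
 * A compiled-out, generic model of the locking dance above, using plain pthreads and hypothetical
 * example_* names (the real code uses the dhandle's rwlock together with the WT_DHANDLE_OPEN and
 * WT_DHANDLE_EXCLUSIVE flags): if the resource is open and shared access is enough, take a read
 * lock and re-check under the lock; otherwise try the write lock without blocking so we don't
 * stall behind a racing opener, and yield and retry when that fails. This is a sketch, not part
 * of the WiredTiger sources.
 */
#if 0
#include <errno.h>
#include <pthread.h>
#include <sched.h>
#include <stdbool.h>

struct example_handle {
    pthread_rwlock_t rwlock;
    bool open;
};

static int
example_lock_handle(struct example_handle *h, bool want_exclusive)
{
    int ret;

    for (;;) {
        if (h->open && !want_exclusive) {
            /* Shared access: take a read lock and re-check the flag while holding it. */
            (void)pthread_rwlock_rdlock(&h->rwlock);
            if (h->open)
                return (0);
            (void)pthread_rwlock_unlock(&h->rwlock);
        }

        /* Closed, or exclusive access wanted: try the write lock without blocking. */
        ret = pthread_rwlock_trywrlock(&h->rwlock);
        if (ret == 0)
            return (0);
        if (ret != EBUSY)
            return (ret);

        /* Give the racing thread a chance to finish. */
        sched_yield();
    }
    /* NOTREACHED */
}
#endif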

/*
 * __wt_session_release_dhandle --
 *     Unlock a data handle.
 */
int
__wt_session_release_dhandle(WT_SESSION_IMPL *session)
{
    WT_BTREE *btree;
    WT_DATA_HANDLE *dhandle;
    WT_DATA_HANDLE_CACHE *dhandle_cache;
    WT_DECL_RET;
    bool locked, write_locked;

    dhandle = session->dhandle;
    btree = dhandle->handle;
    write_locked = F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE);
    locked = true;

    /*
     * If we had special flags set, close the handle so that future access can get a handle
     * without special flags.
     */
    if (F_ISSET(dhandle, WT_DHANDLE_DISCARD | WT_DHANDLE_DISCARD_KILL)) {
        WT_SAVE_DHANDLE(session,
          __session_find_dhandle(session, dhandle->name, dhandle->checkpoint, &dhandle_cache));
        if (dhandle_cache != NULL)
            __session_discard_dhandle(session, dhandle_cache);
    }

    /*
     * Close the handle if we are finishing a bulk load or if the handle is set to discard on
     * release. Bulk loads are special because they may have huge root pages in memory, and we
     * need to push those pages out of the cache. The only way to do that is to close the handle.
     */
    if (btree != NULL && F_ISSET(btree, WT_BTREE_BULK)) {
        WT_ASSERT(session,
          F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) && !F_ISSET(dhandle, WT_DHANDLE_DISCARD));

        /*
         * Acquire the schema lock while closing out the handles. This avoids racing with a
         * checkpoint while it gathers a set of handles.
         */
        WT_WITH_SCHEMA_LOCK(session, ret = __wt_conn_dhandle_close(session, false, false));
    } else if ((btree != NULL && F_ISSET(btree, WT_BTREE_SPECIAL_FLAGS)) ||
      F_ISSET(dhandle, WT_DHANDLE_DISCARD | WT_DHANDLE_DISCARD_KILL)) {
        WT_ASSERT(session, F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE));

        ret = __wt_conn_dhandle_close(session, false, F_ISSET(dhandle, WT_DHANDLE_DISCARD_KILL));
        F_CLR(dhandle, WT_DHANDLE_DISCARD | WT_DHANDLE_DISCARD_KILL);
    }

    if (session == dhandle->excl_session) {
        if (--dhandle->excl_ref == 0)
            dhandle->excl_session = NULL;
        else
            locked = false;
    }
    if (locked) {
        if (write_locked) {
            F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
            WT_WITH_DHANDLE(session, dhandle, __wt_session_dhandle_writeunlock(session));
        } else
            WT_WITH_DHANDLE(session, dhandle, __wt_session_dhandle_readunlock(session));
    }

    session->dhandle = NULL;
    return (ret);
}

/*
 * __session_fetch_checkpoint_meta --
 *     Retrieve information about the selected checkpoint. Notes on the returned values are found
 *     under __session_lookup_checkpoint.
 */
static int
__session_fetch_checkpoint_meta(WT_SESSION_IMPL *session, const char *ckpt_name,
  WT_CKPT_SNAPSHOT *info_ret, uint64_t *snapshot_time_ret, uint64_t *stable_time_ret,
  uint64_t *oldest_time_ret)
{
    /* Get the timestamps. */
    WT_RET(__wt_meta_read_checkpoint_timestamp(
      session, ckpt_name, &info_ret->stable_ts, stable_time_ret));
    WT_RET(
      __wt_meta_read_checkpoint_oldest(session, ckpt_name, &info_ret->oldest_ts, oldest_time_ret));

    /* Get the snapshot. */
    WT_RET(__wt_meta_read_checkpoint_snapshot(session, ckpt_name, &info_ret->snapshot_write_gen,
      &info_ret->snapshot_min, &info_ret->snapshot_max, &info_ret->snapshot_txns,
      &info_ret->snapshot_count, snapshot_time_ret));

    /*
     * If we successfully read a null snapshot, set the min and max to WT_TXN_MAX so everything is
     * visible. (Whether this is desirable isn't entirely clear, but if we leave them set to
     * WT_TXN_NONE, then nothing is visible, and that's clearly not useful. The other choices are
     * to fail, which doesn't help, or to signal somehow to the checkpoint cursor that it should
     * run without a dummy transaction, which doesn't work.)
     */
    if (info_ret->snapshot_min == WT_TXN_NONE && info_ret->snapshot_max == WT_TXN_NONE) {
        info_ret->snapshot_min = info_ret->snapshot_max = WT_TXN_MAX;
        WT_ASSERT(session, info_ret->snapshot_txns == NULL && info_ret->snapshot_count == 0);
    }

    return (0);
}
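
/*
 * A compiled-out, simplified model (not WiredTiger's actual visibility code) of why a null
 * snapshot is rewritten to WT_TXN_MAX above: under a typical snapshot rule, transaction ids below
 * snap_min are visible and ids at or above snap_max are not, so leaving an all-zero snapshot in
 * place would make every update invisible, while min == max == WT_TXN_MAX makes every committed
 * update visible. The example_* name is hypothetical.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool
example_id_visible(uint64_t id, uint64_t snap_min, uint64_t snap_max)
{
    if (id < snap_min)
        return (true); /* Committed before the snapshot was taken. */
    if (id >= snap_max)
        return (false); /* Started after the snapshot was taken. */

    /* In between, a real implementation checks the concurrent-transaction list; assume invisible. */
    return (false);
}
#endif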

/*
 * __session_fetch_checkpoint_snapshot_wall_time --
 *     Like __session_fetch_checkpoint_meta, but retrieves just the wall clock time of the
 *     snapshot.
 */
static int
__session_fetch_checkpoint_snapshot_wall_time(
  WT_SESSION_IMPL *session, const char *ckpt_name, uint64_t *walltime)
{
    return (__wt_meta_read_checkpoint_snapshot(
      session, ckpt_name, NULL, NULL, NULL, NULL, NULL, walltime));
}

/*
 * __session_open_hs_ckpt --
 *     Get a btree handle for the requested checkpoint of the history store and return it.
 */
static int
__session_open_hs_ckpt(WT_SESSION_IMPL *session, const char *checkpoint, const char *cfg[],
  uint32_t flags, int64_t order_expected, WT_DATA_HANDLE **hs_dhandlep)
{
    WT_RET(__wt_session_get_dhandle(session, WT_HS_URI, checkpoint, cfg, flags));
    if (session->dhandle->checkpoint_order != order_expected) {
        /* Not what we were expecting; treat as EBUSY and let the caller retry. */
        WT_RET(__wt_session_release_dhandle(session));
        return (__wt_set_return(session, EBUSY));
    }

    /* The handle is left in the session; return it explicitly for the caller's convenience. */
    *hs_dhandlep = session->dhandle;
    return (0);
}

/*
 * __wt_session_get_btree_ckpt --
 *     Check the configuration strings for a checkpoint name. If opening a checkpoint, resolve the
 *     checkpoint name, get a btree handle for it, load that into the session, and if requested
 *     with non-null pointers, also resolve a matching history store checkpoint, open a handle for
 *     it, return that, and also find and return the corresponding snapshot/timestamp metadata.
 *     The transactions array in the snapshot info is allocated and must be freed by the caller on
 *     success. If not opening a checkpoint, the history store dhandle and snapshot info are
 *     immaterial; if the return pointers are not null, send back nulls and in particular never
 *     allocate or open anything.
 */
int
__wt_session_get_btree_ckpt(WT_SESSION_IMPL *session, const char *uri, const char *cfg[],
  uint32_t flags, WT_DATA_HANDLE **hs_dhandlep, WT_CKPT_SNAPSHOT *ckpt_snapshot)
{
    WT_CONFIG_ITEM cval;
    WT_DECL_RET;
    uint64_t ds_time, first_snapshot_time, hs_time, oldest_time, snapshot_time, stable_time;
    int64_t ds_order, hs_order;
    const char *checkpoint, *hs_checkpoint;
    bool is_hs, is_unnamed_ckpt, is_reserved_name, must_resolve;

    ds_time = first_snapshot_time = hs_time = oldest_time = snapshot_time = stable_time = 0;
    ds_order = hs_order = 0;
    checkpoint = NULL;
    hs_checkpoint = NULL;
    is_hs = is_unnamed_ckpt = is_reserved_name = must_resolve = false;

    /* These should only be set together. Asking for only one doesn't make sense. */
    WT_ASSERT(session, (hs_dhandlep == NULL) == (ckpt_snapshot == NULL));

    if (hs_dhandlep != NULL)
        *hs_dhandlep = NULL;
    if (ckpt_snapshot != NULL) {
        ckpt_snapshot->ckpt_id = 0;
        ckpt_snapshot->oldest_ts = WT_TS_NONE;
        ckpt_snapshot->stable_ts = WT_TS_NONE;
        ckpt_snapshot->snapshot_write_gen = 0;
        ckpt_snapshot->snapshot_min = WT_TXN_MAX;
        ckpt_snapshot->snapshot_max = WT_TXN_MAX;
        ckpt_snapshot->snapshot_txns = NULL;
        ckpt_snapshot->snapshot_count = 0;
    }

    /*
     * This function exists to handle checkpoint configuration. Callers that never open a
     * checkpoint call the underlying function directly.
     */
    WT_RET_NOTFOUND_OK(__wt_config_gets_def(session, cfg, "checkpoint", 0, &cval));
    if (cval.len == 0) {
        /* We are not opening a checkpoint. This is the simple case; retire it immediately. */
        return (__wt_session_get_dhandle(session, uri, NULL, cfg, flags));
    }

    /*
     * Here and below is only for checkpoints.
     *
     * Ultimately, unless we're being opened from a context where we won't ever need to access the
     * history store, we need two dhandles and a set of snapshot/timestamp info that all match.
     *
     * "Match" here is a somewhat complex issue. In the simple case, it means trees and a snapshot
     * that came from the same global checkpoint. But because checkpoints skip clean trees, either
     * tree can potentially be from an earlier global checkpoint. This means we cannot readily
     * identify matching trees by looking at them (or by looking at their metadata either) -- both
     * the order numbers and the wall clock times can easily be different. Consequently we don't
     * try to actively find or check matching trees; instead we rely on the system to not produce
     * mutually inconsistent checkpoints, and read out whatever exists, taking active steps to
     * avoid racing with a currently running checkpoint.
     *
     * Note that this fundamentally relies on partial checkpoints being prohibited. In the
     * presence of partial checkpoints we would have to actively find matching trees, and in many
     * cases (because old unnamed checkpoints are garbage collected) the proper matching history
     * store wouldn't exist any more and we'd be stuck.
     *
     * The scheme is as follows: 1. Read checkpoint info out of the metadata, and retry until we
     * get a consistent set; then 2. Open both dhandles and retry the whole thing if we didn't get
     * the trees we expected.
     *
     * For the first part, we look up the requested checkpoint in both the data store and history
     * store's metadata (either by name or for WiredTigerCheckpoint by picking the most recent
     * checkpoint), and look up the snapshot and timestamps in the global metadata. For all of
     * these we retrieve the wall clock time of the checkpoint, which we'll use to check for
     * consistency. For the trees we also retrieve the order numbers of the checkpoints, which
     * we'll use to check that the dhandles we open are the ones we wanted. (For unnamed
     * checkpoints, they must be, because unnamed checkpoints are never replaced, but for named
     * checkpoints it's possible for the open to race with regeneration of the checkpoint.)
     *
     * Because the snapshot information is always written by every checkpoint, and is written
     * last, we use its wall clock time as the reference. This is always the wall clock time of
     * the most recent completed global checkpoint of the same name, or the most recent completed
     * unnamed checkpoint, as appropriate. We read this time twice, once at the very beginning and
     * again along with the snapshot information itself at the end after the other items. If these
     * two times don't match, a global checkpoint completed while we were reading. In this case we
     * cannot tell for sure if we read one of the trees' metadata before the checkpoint updated
     * it; if the tree's wall clock time is older than the snapshot's, it might be because that
     * tree was skipped, but it might also be because there was an update but we read before the
     * update happened. Therefore, we need to retry.
     *
     * If the two copies of the snapshot time match, we check the other wall clock times against
     * the snapshot time. If any of the items are newer, they were written by a currently running
     * checkpoint that hasn't finished yet, and we need to retry.
     *
     * (For the timestamps it is slightly easier; either timestamp might not be present, in which
     * case both the timestamp and its associated time will read back as zero. We take advantage
     * of the knowledge that for both these timestamps the system cannot transition from a state
     * with the timestamp set to one where it is not, and therefore once any checkpoint includes
     * either timestamp, every subsequent checkpoint will too. Therefore, the timestamps' wall
     * times should either match the snapshot or be zero; and if they're zero, it doesn't matter
     * if they were actually zero in a newer, currently running checkpoint, because then they must
     * have always been zero.)
     *
     * This scheme relies on the fact that the checkpoint wall clock time always moves forward.
     * Each checkpoint is given a wall clock time at least one second greater than the previous
     * checkpoint. Before recovery, we load the time of the last successful checkpoint in the
     * previous database so we can ensure checkpoint times increase across restarts. This avoids
     * trouble if the system clock moves backwards between runs, and also avoids possible issues
     * if the checkpoint clock runs forward. (See comment about that in
     * __txn_checkpoint_establish_time().) When reading from a previous database, the checkpoint
     * time in the snapshot and timestamp metadata defaults to zero if not present, avoiding
     * confusion caused by older versions that don't include these values.
     *
     * Also note that only the exact name "WiredTigerCheckpoint" needs to be resolved. Requests to
     * open specific versions, such as "WiredTigerCheckpoint.6", are forbidden.
     *
     * It is also at least theoretically possible for there to be no matching history store
     * checkpoint. If looking up the checkpoint names finds no history store checkpoint, its name
     * will come back as null and we must avoid trying to open it, either here or later on in the
     * life of the checkpoint cursor.
     */

    is_hs = strcmp(uri, WT_HS_URI) == 0;
    if (is_hs)
        /* We're opening the history store directly, so don't open it twice. */
        hs_dhandlep = NULL;

    /*
     * Applications can use the internal reserved name "WiredTigerCheckpoint" to open the latest
     * checkpoint, but they are not allowed to directly open specific checkpoint versions, such as
     * "WiredTigerCheckpoint.6". However, internally it is allowed to open the history store with
     * a specific version.
     */
    is_reserved_name =
      cval.len > strlen(WT_CHECKPOINT) && WT_PREFIX_MATCH(cval.str, WT_CHECKPOINT);
    if (is_reserved_name && (!is_hs || session->hs_checkpoint == NULL))
        WT_RET_MSG(
          session, EINVAL, "the prefix \"%s\" for checkpoint cursors is reserved", WT_CHECKPOINT);

    /*
     * Test for the internal checkpoint name (WiredTigerCheckpoint). Note: must_resolve is true in
     * a subset of the cases where is_unnamed_ckpt is true.
     */
    must_resolve = cval.len == strlen(WT_CHECKPOINT) && WT_PREFIX_MATCH(cval.str, WT_CHECKPOINT);
    is_unnamed_ckpt =
      cval.len >= strlen(WT_CHECKPOINT) && WT_PREFIX_MATCH(cval.str, WT_CHECKPOINT);

    /* This is the top of a retry loop. */
    do {
        ret = 0;

        if (!must_resolve)
            /* Copy the checkpoint name first because we may need it to get the first wall time. */
            WT_RET(__wt_strndup(session, cval.str, cval.len, &checkpoint));

        if (ckpt_snapshot != NULL) {
            /* We're about to re-fetch this; discard the prior version. No effect the first time. */
            __wt_free(session, ckpt_snapshot->snapshot_txns);

            /*
             * Now, as the first step of the retrieval process, get the wall-clock time of the
             * snapshot metadata (only). If we need the name, we'll have copied it already.
             */
            WT_RET(__session_fetch_checkpoint_snapshot_wall_time(
              session, is_unnamed_ckpt ? NULL : checkpoint, &first_snapshot_time));
        }

        if (must_resolve)
            /* Look up the most recent data store checkpoint. This fetches the exact name to use. */
            WT_RET(__wt_meta_checkpoint_last_name(session, uri, &checkpoint, &ds_order, &ds_time));
        else
            /* Look up the checkpoint by name and get its time and order information. */
            WT_RET(__wt_meta_checkpoint_by_name(session, uri, checkpoint, &ds_order, &ds_time));

        /* Look up the history store checkpoint. */
        if (hs_dhandlep != NULL) {
            if (must_resolve)
                WT_RET_NOTFOUND_OK(__wt_meta_checkpoint_last_name(
                  session, WT_HS_URI, &hs_checkpoint, &hs_order, &hs_time));
            else {
                ret = __wt_meta_checkpoint_by_name(
                  session, WT_HS_URI, checkpoint, &hs_order, &hs_time);
                WT_RET_NOTFOUND_OK(ret);
                if (ret == WT_NOTFOUND)
                    ret = 0;
                else
                    WT_RET(__wt_strdup(session, checkpoint, &hs_checkpoint));
            }
        }

        /*
         * If we were asked for snapshot metadata, fetch it now, including the time (comparable to
         * checkpoint times) for each element.
         */
        if (ckpt_snapshot != NULL) {
            WT_RET(__session_fetch_checkpoint_meta(session, is_unnamed_ckpt ? NULL : checkpoint,
              ckpt_snapshot, &snapshot_time, &stable_time, &oldest_time));

            /*
             * If we have not raced with a checkpoint, we may still have an inconsistency in a
             * specific scenario that involves bulk operations. When a bulk operation finishes, it
             * generates a single file checkpoint, which is different from a system wide
             * checkpoint. The single file checkpoint only bumps the data store time, which makes
             * it ahead of the last system wide checkpoint time and leads to the inconsistency.
             */
            if (first_snapshot_time == snapshot_time && ds_time > snapshot_time &&
              hs_time <= snapshot_time) {
                /*
                 * If a system wide checkpoint is running, the inconsistency should be resolved, so
                 * it is worth retrying.
                 */
                if (S2C(session)->txn_global.checkpoint_running)
                    ret = __wt_set_return(session, EBUSY);
                else {
                    __wt_verbose_warning(session, WT_VERB_DEFAULT,
                      "Session (@: 0x%p name: %s) could not open the checkpoint '%s' (config: %s) "
                      "on the file '%s'.",
                      (void *)session, session->name == NULL ? "EMPTY" : session->name, checkpoint,
                      cval.str, uri);
                    ret = __wt_set_return(session, WT_NOTFOUND);
                    goto err;
                }
            }
            /*
             * Check if we raced with a running checkpoint.
             *
             * If the two copies of the snapshot don't match, or if any of the other metadata
             * items' time is newer than the snapshot, we read in the middle of that material
             * being updated and we need to retry.
             *
             * Otherwise we have successfully gotten a matching set, as described above.
             *
             * If there is no history store checkpoint, its time will be zero, which will be
             * accepted.
             *
             * We skip the test entirely if we aren't trying to return a snapshot (and therefore
             * not history either) because there's nothing to check, and if we didn't retrieve the
             * snapshot its time will be 0 and the check will fail gratuitously and lead to
             * retrying forever.
             */
            else if (first_snapshot_time != snapshot_time || ds_time > snapshot_time ||
              hs_time > snapshot_time || stable_time > snapshot_time ||
              oldest_time > snapshot_time)
                ret = __wt_set_return(session, EBUSY);
            else {
                /* Crosscheck that we didn't somehow get an older timestamp. */
                WT_ASSERT(session, stable_time == snapshot_time || stable_time == 0);
                WT_ASSERT(session, oldest_time == snapshot_time || oldest_time == 0);
            }

            /*
             * Return the snapshot's wall time as the (global) checkpoint ID. The ID is a 64-bit
             * value of unspecified semantics such that if you open the same checkpoint name and
             * get different IDs, the cursors you got are looking at different versions of that
             * checkpoint, which usually isn't what you want. Test code uses this to check whether
             * a collection of checkpoint cursors they opened on different files all came from the
             * same global checkpoint or not. This is the same problem as checking if the history
             * store checkpoint and data store checkpoint match, so the wall time is the right
             * thing to use for it.
             */
            ckpt_snapshot->ckpt_id = snapshot_time;
        }

        if (ret == 0) {
            /* Get a handle for the data store. */
            ret = __wt_session_get_dhandle(session, uri, checkpoint, cfg, flags);
            if (ret == 0 && session->dhandle->checkpoint_order != ds_order) {
                /* The tree we opened is newer than the one we expected; need to retry. */
                WT_TRET(__wt_session_release_dhandle(session));
                WT_TRET(__wt_set_return(session, EBUSY));
            }
        }

        if (ret == 0 && hs_checkpoint != NULL) {
            /* Get a handle for the history store. */
            WT_ASSERT(session, hs_dhandlep != NULL);
            WT_WITHOUT_DHANDLE(session,
              ret =
                __session_open_hs_ckpt(session, hs_checkpoint, cfg, flags, hs_order, hs_dhandlep));
            if (ret != 0)
                WT_TRET(__wt_session_release_dhandle(session));
        }

        /* Drop the names; we don't need them any more. Nulls the pointers; retry relies on that. */
        __wt_free(session, checkpoint);
        __wt_free(session, hs_checkpoint);

        /*
         * There's a potential race: we get the name of the most recent unnamed checkpoint, but if
         * it's discarded (or locked so it can be discarded) by the time we try to open it, we'll
         * fail the open. Retry in those cases; a new checkpoint version should surface, and we
         * can't return an error. The application will be justifiably upset if we can't open the
         * last checkpoint instance of an object.
         *
         * The WT_NOTFOUND condition will eventually clear; some unnamed checkpoint existed when
         * we looked up the name (otherwise we would have failed then) so a new one must be in
         * progress.
         *
         * At this point we should either have ret == 0 and the handles we were asked for, or ret
         * != 0 and no handles.
         *
         * For named checkpoints, we don't retry, on the grounds that the application ought not to
         * be opening its checkpoints while regenerating them.
         */
    } while (is_unnamed_ckpt && (ret == WT_NOTFOUND || ret == EBUSY));

err:
    __wt_free(session, checkpoint);
    __wt_free(session, hs_checkpoint);

    return (ret);
}
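
/*
 * A compiled-out, condensed sketch of the consistency protocol used by the retry loop above. The
 * example_read_* helpers are hypothetical stand-ins for the metadata reads: the snapshot's
 * wall-clock time is read once at the start and again with the snapshot itself at the end, and
 * the set is only accepted when the two copies match and no other item is newer than the
 * snapshot. This is a sketch, not part of the WiredTiger sources.
 */
#if 0
static int
example_read_consistent_ckpt_meta(WT_SESSION_IMPL *session)
{
    uint64_t ds_time, first_snapshot_time, hs_time, snapshot_time;

    for (;;) {
        example_read_snapshot_time(session, &first_snapshot_time);
        example_read_data_store_meta(session, &ds_time);
        example_read_history_store_meta(session, &hs_time);
        example_read_snapshot(session, &snapshot_time);

        /* Accept only a set that a completed or in-progress checkpoint cannot have split. */
        if (first_snapshot_time == snapshot_time && ds_time <= snapshot_time &&
          hs_time <= snapshot_time)
            return (0);

        /* A checkpoint raced with us; yield and re-read everything. */
        __wt_yield();
    }
    /* NOTREACHED */
}
#endif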

/*
 * __wt_session_close_cache --
 *     Close any cached handles in a session.
 */
void
__wt_session_close_cache(WT_SESSION_IMPL *session)
{
    WT_DATA_HANDLE_CACHE *dhandle_cache, *dhandle_cache_tmp;

    WT_TAILQ_SAFE_REMOVE_BEGIN(dhandle_cache, &session->dhandles, q, dhandle_cache_tmp)
    {
        __session_discard_dhandle(session, dhandle_cache);
    }
    WT_TAILQ_SAFE_REMOVE_END
}

/*
 * __wt_session_dhandle_sweep --
 *     Discard any session dhandles that are not open.
 */
void
__wt_session_dhandle_sweep(WT_SESSION_IMPL *session)
{
    WT_CONNECTION_IMPL *conn;
    WT_DATA_HANDLE *dhandle;
    WT_DATA_HANDLE_CACHE *dhandle_cache, *dhandle_cache_tmp;
    uint64_t now;

    conn = S2C(session);

    /*
     * Periodically sweep for dead handles; if we've swept recently, don't do it again.
     */
    __wt_seconds(session, &now);
    if (now - session->last_sweep < conn->sweep_interval)
        return;
    session->last_sweep = now;

    WT_STAT_CONN_INCR(session, dh_session_sweeps);

    TAILQ_FOREACH_SAFE(dhandle_cache, &session->dhandles, q, dhandle_cache_tmp)
    {
        dhandle = dhandle_cache->dhandle;

        /*
         * Only discard handles that are dead or dying and, in the case of btrees, have been
         * evicted. These checks are not done with any locks in place, other than the data handle
         * reference, so we cannot peer past what is in the dhandle directly.
         */
        if (dhandle != session->dhandle && dhandle->session_inuse == 0 &&
          (WT_DHANDLE_INACTIVE(dhandle) ||
            (dhandle->timeofdeath != 0 && now - dhandle->timeofdeath > conn->sweep_idle_time)) &&
          (!WT_DHANDLE_BTREE(dhandle) || F_ISSET(dhandle, WT_DHANDLE_EVICTED))) {
            WT_STAT_CONN_INCR(session, dh_session_handles);
            WT_ASSERT(session, !WT_IS_METADATA(dhandle));
            __session_discard_dhandle(session, dhandle_cache);
        }
    }
}

/*
 * __session_find_shared_dhandle --
 *     Search for a data handle in the connection and add it to a session's cache. We must
 *     increment the handle's reference count while holding the handle list lock.
 */
static int
__session_find_shared_dhandle(WT_SESSION_IMPL *session, const char *uri, const char *checkpoint)
{
    WT_DECL_RET;

    WT_WITH_HANDLE_LIST_READ_LOCK(session,
      if ((ret = __wt_conn_dhandle_find(session, uri, checkpoint)) == 0)
        WT_DHANDLE_ACQUIRE(session->dhandle));

    if (ret != WT_NOTFOUND)
        return (ret);

    WT_WITH_HANDLE_LIST_WRITE_LOCK(session,
      if ((ret = __wt_conn_dhandle_alloc(session, uri, checkpoint)) == 0)
        WT_DHANDLE_ACQUIRE(session->dhandle));

    return (ret);
}

/*
 * __session_get_dhandle --
 *     Search for a data handle, first in the session cache, then in the connection.
 */
static int
__session_get_dhandle(WT_SESSION_IMPL *session, const char *uri, const char *checkpoint)
{
    WT_DATA_HANDLE_CACHE *dhandle_cache;
    WT_DECL_RET;

    __session_find_dhandle(session, uri, checkpoint, &dhandle_cache);
    if (dhandle_cache != NULL) {
        session->dhandle = dhandle_cache->dhandle;
        return (0);
    }

    /* Sweep the handle list to remove any dead handles. */
    __wt_session_dhandle_sweep(session);

    /*
     * We didn't find a match in the session cache, search the shared handle list and cache the
     * handle we find.
     */
    WT_RET(__session_find_shared_dhandle(session, uri, checkpoint));

    /*
     * Fix up the reference count on failure (we incremented the reference count while holding the
     * handle-list lock).
     */
    if ((ret = __session_add_dhandle(session)) != 0) {
        WT_DHANDLE_RELEASE(session->dhandle);
        session->dhandle = NULL;
    }

    return (ret);
}
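
/*
 * A compiled-out sketch of the find-or-create pattern used by __session_find_shared_dhandle
 * above, with hypothetical example_find/example_create helpers: look up under the shared
 * handle-list read lock first, and only take the write lock to create the entry when the lookup
 * fails; in both cases the reference count is bumped while the lock is held, which is why
 * __session_get_dhandle releases the reference if caching the handle in the session fails.
 */
#if 0
static int
example_find_or_create(WT_SESSION_IMPL *session, const char *uri)
{
    WT_DECL_RET;

    /* Common case: the handle already exists, a read lock is enough. */
    WT_WITH_HANDLE_LIST_READ_LOCK(session, ret = example_find(session, uri));
    if (ret != WT_NOTFOUND)
        return (ret);

    /* Rare case: create it under the write lock (re-checking for a racing create). */
    WT_WITH_HANDLE_LIST_WRITE_LOCK(session, ret = example_create(session, uri));
    return (ret);
}
#endif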

/*
 * __wt_session_dhandle_readlock --
 *     Acquire read lock for the session's current dhandle.
 */
void
__wt_session_dhandle_readlock(WT_SESSION_IMPL *session)
{
    WT_ASSERT(session, session->dhandle != NULL);
    __wt_readlock(session, &session->dhandle->rwlock);
}

/*
 * __wt_session_dhandle_readunlock --
 *     Release read lock for the session's current dhandle.
 */
void
__wt_session_dhandle_readunlock(WT_SESSION_IMPL *session)
{
    WT_ASSERT(session, session->dhandle != NULL);
    __wt_readunlock(session, &session->dhandle->rwlock);
}

/*
 * __wt_session_dhandle_writeunlock --
 *     Release write lock for the session's current dhandle.
 */
void
__wt_session_dhandle_writeunlock(WT_SESSION_IMPL *session)
{
    WT_ASSERT(session, session->dhandle != NULL);
    WT_ASSERT(session, FLD_ISSET(session->dhandle->lock_flags, WT_DHANDLE_LOCK_WRITE));

    FLD_CLR(session->dhandle->lock_flags, WT_DHANDLE_LOCK_WRITE);
    __wt_writeunlock(session, &session->dhandle->rwlock);
}

/*
 * __wt_session_dhandle_try_writelock --
 *     Try to acquire write lock for the session's current dhandle.
 */
int
__wt_session_dhandle_try_writelock(WT_SESSION_IMPL *session)
{
    WT_DECL_RET;

    WT_ASSERT(session, session->dhandle != NULL);

    if ((ret = __wt_try_writelock(session, &session->dhandle->rwlock)) == 0)
        FLD_SET(session->dhandle->lock_flags, WT_DHANDLE_LOCK_WRITE);

    return (ret);
}

/*
 * __wt_session_get_dhandle --
 *     Get a data handle for the given name, set session->dhandle. Optionally if we opened a
 *     checkpoint return its checkpoint order number.
 */
int
__wt_session_get_dhandle(WT_SESSION_IMPL *session, const char *uri, const char *checkpoint,
  const char *cfg[], uint32_t flags)
{
    WT_DATA_HANDLE *dhandle;
    WT_DECL_RET;
    bool is_dead;

    WT_ASSERT(session, !F_ISSET(session, WT_SESSION_NO_DATA_HANDLES));

    for (;;) {
        WT_RET(__session_get_dhandle(session, uri, checkpoint));
        dhandle = session->dhandle;

        /* Try to lock the handle. */
        WT_RET(__wt_session_lock_dhandle(session, flags, &is_dead));
        if (is_dead)
            continue;

        /* If the handle is open in the mode we want, we're done. */
        if (LF_ISSET(WT_DHANDLE_LOCK_ONLY) ||
          (F_ISSET(dhandle, WT_DHANDLE_OPEN) && !LF_ISSET(WT_BTREE_SPECIAL_FLAGS)))
            break;

        WT_ASSERT(session, F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE));

        /*
         * For now, we need the schema lock and handle list locks to open a file for real.
         *
         * Code needing exclusive access (such as drop or verify) assumes that it can close all
         * open handles, then open an exclusive handle on the active tree and no other threads can
         * reopen handles in the meantime. A combination of the schema and handle list locks is
         * used to enforce this.
         */
        if (!FLD_ISSET(session->lock_flags, WT_SESSION_LOCKED_SCHEMA)) {
            dhandle->excl_session = NULL;
            dhandle->excl_ref = 0;
            F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
            WT_WITH_DHANDLE(session, dhandle, __wt_session_dhandle_writeunlock(session));

            WT_WITH_SCHEMA_LOCK(
              session, ret = __wt_session_get_dhandle(session, uri, checkpoint, cfg, flags));

            return (ret);
        }

        /* Open the handle. */
        if ((ret = __wt_conn_dhandle_open(session, cfg, flags)) == 0 &&
          LF_ISSET(WT_DHANDLE_EXCLUSIVE))
            break;

        /*
         * If we got the handle exclusive to open it but only want ordinary access, drop our lock
         * and retry the open.
         */
        dhandle->excl_session = NULL;
        dhandle->excl_ref = 0;
        F_CLR(dhandle, WT_DHANDLE_EXCLUSIVE);
        WT_WITH_DHANDLE(session, dhandle, __wt_session_dhandle_writeunlock(session));

        WT_RET(ret);
    }

    WT_ASSERT(session, !F_ISSET(dhandle, WT_DHANDLE_DEAD));
    WT_ASSERT(session, LF_ISSET(WT_DHANDLE_LOCK_ONLY) || F_ISSET(dhandle, WT_DHANDLE_OPEN));

    WT_ASSERT(session,
      LF_ISSET(WT_DHANDLE_EXCLUSIVE) == F_ISSET(dhandle, WT_DHANDLE_EXCLUSIVE) ||
        dhandle->excl_ref > 1);

    return (0);
}
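
/*
 * A compiled-out sketch of the typical caller pattern for the function above (the example_*
 * wrapper is hypothetical): get a handle for ordinary shared access, use it through
 * session->dhandle, then release it, which also clears session->dhandle.
 */
#if 0
static int
example_with_dhandle(WT_SESSION_IMPL *session, const char *uri, const char *cfg[])
{
    WT_DECL_RET;

    /* Passing no flags requests ordinary, non-exclusive access to an open handle. */
    WT_RET(__wt_session_get_dhandle(session, uri, NULL, cfg, 0));

    /* ... operate on session->dhandle here ... */

    /* Unlock the handle and clear session->dhandle. */
    WT_TRET(__wt_session_release_dhandle(session));
    return (ret);
}
#endif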

/*
 * __wt_session_lock_checkpoint --
 *     Lock the btree handle for the given checkpoint name.
 */
int
__wt_session_lock_checkpoint(WT_SESSION_IMPL *session, const char *checkpoint)
{
    WT_DATA_HANDLE *saved_dhandle;
    WT_DECL_RET;

    WT_ASSERT(session, WT_META_TRACKING(session));
    saved_dhandle = session->dhandle;

    /*
     * Get the checkpoint handle exclusive, so no one else can access it while we are creating the
     * new checkpoint. Hold the lock until the checkpoint completes.
     */
    WT_ERR(__wt_session_get_dhandle(session, saved_dhandle->name, checkpoint, NULL,
      WT_DHANDLE_EXCLUSIVE | WT_DHANDLE_LOCK_ONLY));
    if ((ret = __wt_meta_track_handle_lock(session, false)) != 0) {
        WT_TRET(__wt_session_release_dhandle(session));
        goto err;
    }

    /*
     * Get exclusive access to the handle and then flush any pages in this checkpoint from the
     * cache (we are about to re-write the checkpoint which will mean cached pages no longer have
     * valid contents). This is especially noticeable with memory mapped files, since changes to
     * the underlying file are visible to the in-memory pages.
     */
    WT_ERR(__wt_evict_file_exclusive_on(session));
    ret = __wt_evict_file(session, WT_SYNC_DISCARD);
    __wt_evict_file_exclusive_off(session);
    WT_ERR(ret);

    /*
     * We lock checkpoint handles that we are overwriting, so the handle must be closed when we
     * release it.
     */
    F_SET(session->dhandle, WT_DHANDLE_DISCARD);

/* Restore the original data handle in the session. */
err:
    session->dhandle = saved_dhandle;
    return (ret);
}
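
/*
 * A compiled-out sketch of the eviction bracket used above (the wrapper name is hypothetical; the
 * calls are the same ones made by __wt_session_lock_checkpoint): turn on exclusive eviction for
 * the file, discard its in-memory pages, and always turn exclusive eviction back off before
 * acting on the eviction result.
 */
#if 0
static int
example_discard_file_pages(WT_SESSION_IMPL *session)
{
    WT_DECL_RET;

    WT_RET(__wt_evict_file_exclusive_on(session));
    ret = __wt_evict_file(session, WT_SYNC_DISCARD);
    __wt_evict_file_exclusive_off(session);

    return (ret);
}
#endif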