summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/tiered
diff options
context:
space:
mode:
author Luke Chen <luke.chen@mongodb.com> 2021-06-02 16:11:25 +1000
committer Evergreen Agent <no-reply@evergreen.mongodb.com> 2021-06-02 06:26:56 +0000
commit 6c317bc5fa081cd800ccdc865e5775fab31b3d88 (patch)
tree 616a4aac7f533a383322ebe4c4bbd24c99483e5b /src/third_party/wiredtiger/src/tiered
parent 727cb006bc1723840d5cf98775d56f2b0eaca8a3 (diff)
download mongo-6c317bc5fa081cd800ccdc865e5775fab31b3d88.tar.gz
Import wiredtiger: 7374df6c344587d433853d01f0c6241428ab7a80 from branch mongodb-4.4
ref: bae0c1c914..7374df6c34 for: 4.4.7 WT-6230 Sanitize python test suite directory naming WT-6436 Fix not resetting the key when retrying to search the history store WT-6555 Fix memory error in test_txn13 WT-7135 Additional checks to detect when writing corrupted metadata WT-7267 Compare entire history store key when inferring cursor position in `search_near` WT-7348 Complete CMake POSIX support WT-7379 Disable column store tests in compatibility test WT-7440 Integrate file cursor with tiered storage WT-7452 Improve logging when recovery (and RTS) is taking a long time WT-7469 Fix potential hot backup read lock never unlocked WT-7493 Add a new connection config to control the page eviction with update restore eviction WT-7498 Implement tiered storage internal thread operations WT-7504 Fix test_hs21 cache stuck dirty WT-7510 Disable import when direct I/O is enabled in test/format WT-7532 Hold schema lock when tiered manager calls flush_tier_once WT-7541 Updated evergreen command to parse folder names with undesirable characters WT-7542 Add a Python test to reconfigure zstd compression level after restart WT-7545 Limit upgrade/downgrade testing to timestamp transactions at snapshot isolation WT-7548 Create macro to identify dhandles directly associated with a Btree WT-7549 clean up block manager identifiers to use object id naming WT-7550 Properly check pinned page and fix not resetting cursor if error WT-7565 Update invalid backup configurations WT-7566 Resolve write after free for dead dhandle WT-7567 Rework tiered storage reconfigure WT-7569 Fix wrongly squash an out of order timestamp update WT-7573 Print an error message and exit for invalid backup configurations in wtperf tests WT-7574 disable compact tests for OS/X WT-7581 Make wt_cache_config args consistent with other config functions WT-7595 Add flag to history store cursor to track whether underlying table insertion was successful WT-7602 Fix MacOS CMake Compilation Issues Reverted ticket(s): WT-7503 
Change default compressor for WT HS to Zstandard
Diffstat (limited to 'src/third_party/wiredtiger/src/tiered')
-rw-r--r-- src/third_party/wiredtiger/src/tiered/tiered_config.c 10
-rw-r--r-- src/third_party/wiredtiger/src/tiered/tiered_cursor.c 1228
-rw-r--r-- src/third_party/wiredtiger/src/tiered/tiered_handle.c 63
-rw-r--r-- src/third_party/wiredtiger/src/tiered/tiered_work.c 151
4 files changed, 176 insertions, 1276 deletions
diff --git a/src/third_party/wiredtiger/src/tiered/tiered_config.c b/src/third_party/wiredtiger/src/tiered/tiered_config.c
index 6971ec4b7b5..8c0ec27333e 100644
--- a/src/third_party/wiredtiger/src/tiered/tiered_config.c
+++ b/src/third_party/wiredtiger/src/tiered/tiered_config.c
@@ -42,6 +42,8 @@ __tiered_common_config(WT_SESSION_IMPL *session, const char **cfg, WT_BUCKET_STO
{
WT_CONFIG_ITEM cval;
+ if (bstorage == NULL)
+ return (0);
WT_RET(__wt_config_gets(session, cfg, "tiered_storage.local_retention", &cval));
bstorage->retain_secs = (uint64_t)cval.val;
@@ -153,6 +155,8 @@ __wt_tiered_conn_config(WT_SESSION_IMPL *session, const char **cfg, bool reconfi
if (!reconfig)
WT_RET(__wt_tiered_bucket_config(session, cfg, &conn->bstorage));
+ else
+ WT_ERR(__tiered_common_config(session, cfg, conn->bstorage));
/* If the connection is not set up for tiered storage there is nothing more to do. */
if (conn->bstorage == NULL)
@@ -161,13 +165,7 @@ __wt_tiered_conn_config(WT_SESSION_IMPL *session, const char **cfg, bool reconfi
__wt_verbose(
session, WT_VERB_TIERED, "TIERED_CONFIG: prefix %s", conn->bstorage->bucket_prefix);
- /*
- * If reconfiguring, see if the other settings have changed on the system bucket storage.
- */
WT_ASSERT(session, conn->bstorage != NULL);
- if (reconfig)
- WT_ERR(__tiered_common_config(session, cfg, conn->bstorage));
-
WT_STAT_CONN_SET(session, tiered_object_size, conn->bstorage->object_size);
WT_STAT_CONN_SET(session, tiered_retention, conn->bstorage->retain_secs);
diff --git a/src/third_party/wiredtiger/src/tiered/tiered_cursor.c b/src/third_party/wiredtiger/src/tiered/tiered_cursor.c
deleted file mode 100644
index c913f9b33ca..00000000000
--- a/src/third_party/wiredtiger/src/tiered/tiered_cursor.c
+++ /dev/null
@@ -1,1228 +0,0 @@
-/*-
- * Copyright (c) 2014-present MongoDB, Inc.
- * Copyright (c) 2008-2014 WiredTiger, Inc.
- * All rights reserved.
- *
- * See the file LICENSE for redistribution information.
- */
-
-#include "wt_internal.h"
-
-#define WT_FORALL_CURSORS(curtiered, c, i) \
- for ((i) = 0; i < WT_TIERED_MAX_TIERS;) \
- if (((c) = (curtiered)->cursors[(i)++]) != NULL)
-
-#define WT_TIERED_CURCMP(s, tiered, c1, c2, cmp) \
- __wt_compare(s, (tiered)->collator, &(c1)->key, &(c2)->key, &(cmp))
-
-/*
- * __curtiered_open_cursors --
- * Open cursors for the current set of files.
- */
-static int
-__curtiered_open_cursors(WT_CURSOR_TIERED *curtiered)
-{
- WT_CURSOR *cursor;
- WT_DATA_HANDLE *dhandle;
- WT_DECL_RET;
- WT_SESSION_IMPL *session;
- WT_TIERED *tiered;
- u_int i;
-
- cursor = &curtiered->iface;
- session = CUR2S(curtiered);
- dhandle = NULL;
- tiered = curtiered->tiered;
-
- /*
- * If the key is pointing to memory that is pinned by a tier cursor, take a copy before closing
- * cursors.
- */
- if (F_ISSET(cursor, WT_CURSTD_KEY_INT))
- WT_ERR(__cursor_needkey(cursor));
-
- F_CLR(curtiered, WT_CURTIERED_ITERATE_NEXT | WT_CURTIERED_ITERATE_PREV);
-
- WT_ASSERT(session, curtiered->cursors == NULL);
- WT_ERR(__wt_calloc_def(session, WT_TIERED_MAX_TIERS, &curtiered->cursors));
-
- /* Open the cursors for tiers that have changed. */
- __wt_verbose(session, WT_VERB_TIERED,
- "tiered opening cursor session(%p):tiered cursor(%p), tiers: %d", (void *)session,
- (void *)curtiered, (int)WT_TIERED_MAX_TIERS);
- for (i = 0; i < WT_TIERED_MAX_TIERS; i++) {
- dhandle = tiered->tiers[i].tier;
- if (dhandle == NULL)
- continue;
-
- /*
- * Read from the checkpoint if the file has been written. Once all cursors switch, the
- * in-memory tree can be evicted.
- */
- WT_ASSERT(session, curtiered->cursors[i] == NULL);
- WT_ERR(__wt_open_cursor(session, dhandle->name, cursor, NULL, &curtiered->cursors[i]));
-
- /* Child cursors always use overwrite and raw mode. */
- F_SET(curtiered->cursors[i], WT_CURSTD_OVERWRITE | WT_CURSTD_RAW);
- }
-
-err:
- return (ret);
-}
-
-/*
- * __curtiered_close_cursors --
- * Close any btree cursors that are not needed.
- */
-static int
-__curtiered_close_cursors(WT_SESSION_IMPL *session, WT_CURSOR_TIERED *curtiered)
-{
- WT_CURSOR *c;
- u_int i;
-
- __wt_verbose(session, WT_VERB_TIERED, "tiered close cursors session(%p):tiered cursor(%p)",
- (void *)session, (void *)curtiered);
-
- if (curtiered->cursors == NULL)
- return (0);
-
- /* Walk the cursors, closing them. */
- for (i = 0; i < WT_TIERED_MAX_TIERS; i++) {
- if ((c = (curtiered)->cursors[i]) != NULL) {
- curtiered->cursors[i] = NULL;
- WT_RET(c->close(c));
- }
- }
-
- __wt_free(session, curtiered->cursors);
- return (0);
-}
-
-/*
- * __curtiered_reset_cursors --
- * Reset any positioned tier cursors. If the skip parameter is non-NULL, that cursor is about to
- * be used, so there is no need to reset it.
- */
-static int
-__curtiered_reset_cursors(WT_CURSOR_TIERED *curtiered, WT_CURSOR *skip)
-{
- WT_CURSOR *c;
- WT_DECL_RET;
- u_int i;
-
- /* Fast path if the cursor is not positioned. */
- if ((curtiered->current == NULL || curtiered->current == skip) &&
- !F_ISSET(curtiered, WT_CURTIERED_ITERATE_NEXT | WT_CURTIERED_ITERATE_PREV))
- return (0);
-
- WT_FORALL_CURSORS(curtiered, c, i)
- {
- if (c == skip)
- continue;
- if (F_ISSET(c, WT_CURSTD_KEY_INT))
- WT_TRET(c->reset(c));
- }
-
- curtiered->current = NULL;
- F_CLR(curtiered, WT_CURTIERED_ITERATE_NEXT | WT_CURTIERED_ITERATE_PREV);
-
- return (ret);
-}
-
-/*
- * __curtiered_enter --
- * Start an operation on a tiered cursor.
- */
-static inline int
-__curtiered_enter(WT_CURSOR_TIERED *curtiered, bool reset)
-{
- WT_SESSION_IMPL *session;
-
- session = CUR2S(curtiered);
-
- if (curtiered->cursors == NULL)
- WT_RET(__curtiered_open_cursors(curtiered));
-
- if (reset) {
- WT_ASSERT(session, !F_ISSET(&curtiered->iface, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT));
- WT_RET(__curtiered_reset_cursors(curtiered, NULL));
- }
-
- if (!F_ISSET(curtiered, WT_CURTIERED_ACTIVE)) {
- /*
- * Opening this tiered cursor has opened a number of other cursors, ensure we don't mistake
- * this as the first cursor in a session.
- */
- ++session->ncursors;
- WT_RET(__cursor_enter(session));
- F_SET(curtiered, WT_CURTIERED_ACTIVE);
- }
-
- return (0);
-}
-/*
- * __curtiered_leave --
- * Finish an operation on a tiered cursor.
- */
-static void
-__curtiered_leave(WT_CURSOR_TIERED *curtiered)
-{
- WT_SESSION_IMPL *session;
-
- session = CUR2S(curtiered);
-
- if (F_ISSET(curtiered, WT_CURTIERED_ACTIVE)) {
- --session->ncursors;
- __cursor_leave(session);
- F_CLR(curtiered, WT_CURTIERED_ACTIVE);
- }
-}
-
-/*
- * We need a tombstone to mark deleted records, and we use the special value below for that purpose.
- * We use two 0x14 (Device Control 4) bytes to minimize the likelihood of colliding with an
- * application-chosen encoding byte; if the application uses two leading DC4 bytes for some reason,
- * we'll do a wasted data copy each time a new value is inserted into the object.
- */
-static const WT_ITEM __tombstone = {"\x14\x14", 2, NULL, 0, 0};
-
-/*
- * __curtiered_deleted --
- * Check whether the current value is a tombstone.
- */
-static inline bool
-__curtiered_deleted(WT_CURSOR_TIERED *curtiered, const WT_ITEM *item)
-{
- WT_UNUSED(curtiered);
- return (item->size == __tombstone.size &&
- memcmp(item->data, __tombstone.data, __tombstone.size) == 0);
-}
-
-/*
- * __curtiered_deleted_encode --
- * Encode values that are in the encoded name space.
- */
-static inline int
-__curtiered_deleted_encode(
- WT_SESSION_IMPL *session, const WT_ITEM *value, WT_ITEM *final_value, WT_ITEM **tmpp)
-{
- WT_ITEM *tmp;
-
- /*
- * If value requires encoding, get a scratch buffer of the right size and create a copy of the
- * data with the first byte of the tombstone appended.
- */
- if (value->size >= __tombstone.size &&
- memcmp(value->data, __tombstone.data, __tombstone.size) == 0) {
- WT_RET(__wt_scr_alloc(session, value->size + 1, tmpp));
- tmp = *tmpp;
-
- memcpy(tmp->mem, value->data, value->size);
- memcpy((uint8_t *)tmp->mem + value->size, __tombstone.data, 1);
- final_value->data = tmp->mem;
- final_value->size = value->size + 1;
- } else {
- final_value->data = value->data;
- final_value->size = value->size;
- }
-
- return (0);
-}
-
-/*
- * __curtiered_deleted_decode --
- * Decode values that start with the tombstone.
- */
-static inline void
-__curtiered_deleted_decode(WT_CURSOR_TIERED *curtiered, WT_ITEM *value)
-{
- WT_UNUSED(curtiered);
- /*
- * Take care with this check: when a tiered cursor is used for a merge, it is valid to return
- * the tombstone value.
- */
- if (value->size > __tombstone.size &&
- memcmp(value->data, __tombstone.data, __tombstone.size) == 0)
- --value->size;
-}
-
-/*
- * __wt_curtiered_close --
- * WT_CURSOR->close method for the tiered cursor type.
- */
-int
-__wt_curtiered_close(WT_CURSOR *cursor)
-{
- WT_CURSOR_TIERED *curtiered;
- WT_DECL_RET;
- WT_SESSION_IMPL *session;
-
- /*
- * Don't use the normal __curtiered_enter path: that is wasted work when closing, and the cursor
- * may never have been used.
- */
- curtiered = (WT_CURSOR_TIERED *)cursor;
- CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, close, NULL);
-err:
- WT_TRET(__curtiered_close_cursors(session, curtiered));
-
- /* In case we were somehow left positioned, clear that. */
- __curtiered_leave(curtiered);
-
- if (curtiered->tiered != NULL)
- WT_WITH_DHANDLE(session, (WT_DATA_HANDLE *)curtiered->tiered,
- WT_TRET(__wt_session_release_dhandle(session)));
- __wt_cursor_close(cursor);
-
- API_END_RET(session, ret);
-}
-
-/*
- * __curtiered_get_current --
- * Find the smallest / largest of the cursors and copy its key/value.
- */
-static int
-__curtiered_get_current(
- WT_SESSION_IMPL *session, WT_CURSOR_TIERED *curtiered, bool smallest, bool *deletedp)
-{
- WT_CURSOR *c, *current;
- u_int i;
- int cmp;
- bool multiple;
-
- current = NULL;
- multiple = false;
-
- WT_FORALL_CURSORS(curtiered, c, i)
- {
- if (!F_ISSET(c, WT_CURSTD_KEY_INT))
- continue;
- if (current == NULL) {
- current = c;
- continue;
- }
- WT_RET(WT_TIERED_CURCMP(session, curtiered->tiered, c, current, cmp));
- if (smallest ? cmp < 0 : cmp > 0) {
- current = c;
- multiple = false;
- } else if (cmp == 0)
- multiple = true;
- }
-
- c = &curtiered->iface;
- if ((curtiered->current = current) == NULL) {
- F_CLR(c, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
- return (WT_NOTFOUND);
- }
-
- if (multiple)
- F_SET(curtiered, WT_CURTIERED_MULTIPLE);
- else
- F_CLR(curtiered, WT_CURTIERED_MULTIPLE);
-
- WT_RET(current->get_key(current, &c->key));
- WT_RET(current->get_value(current, &c->value));
-
- F_CLR(c, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
- if ((*deletedp = __curtiered_deleted(curtiered, &c->value)) == false)
- F_SET(c, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
-
- return (0);
-}
-
-/*
- * __curtiered_compare --
- * WT_CURSOR->compare implementation for the tiered cursor type.
- */
-static int
-__curtiered_compare(WT_CURSOR *a, WT_CURSOR *b, int *cmpp)
-{
- WT_CURSOR_TIERED *atiered;
- WT_DECL_RET;
- WT_SESSION_IMPL *session;
-
- atiered = (WT_CURSOR_TIERED *)a;
- CURSOR_API_CALL(a, session, compare, NULL);
-
- /*
- * Confirm both cursors refer to the same source and have keys, then compare the keys.
- */
- if (strcmp(a->uri, b->uri) != 0)
- WT_ERR_MSG(session, EINVAL, "comparison method cursors must reference the same object");
-
- WT_ERR(__cursor_needkey(a));
- WT_ERR(__cursor_needkey(b));
-
- WT_ERR(__wt_compare(session, atiered->tiered->collator, &a->key, &b->key, cmpp));
-
-err:
- API_END_RET(session, ret);
-}
-
-/*
- * __curtiered_position_tier --
- * Position a tier cursor.
- */
-static int
-__curtiered_position_tier(WT_CURSOR_TIERED *curtiered, WT_CURSOR *c, bool forward, int *cmpp)
-{
- WT_CURSOR *cursor;
- WT_SESSION_IMPL *session;
-
- cursor = &curtiered->iface;
- session = CUR2S(cursor);
-
- c->set_key(c, &cursor->key);
- WT_RET(c->search_near(c, cmpp));
-
- while (forward ? *cmpp < 0 : *cmpp > 0) {
- WT_RET(forward ? c->next(c) : c->prev(c));
-
- /*
- * With higher isolation levels, where we have stable reads, we're done: the cursor is now
- * positioned as expected.
- *
- * With read-uncommitted isolation, a new record could have appeared in between the search
- * and stepping forward / back. In that case, keep going until we see a key in the expected
- * range.
- */
- if (session->txn->isolation != WT_ISO_READ_UNCOMMITTED)
- return (0);
-
- WT_RET(WT_TIERED_CURCMP(session, curtiered->tiered, c, cursor, *cmpp));
- }
-
- return (0);
-}
-
-/*
- * __curtiered_next --
- * WT_CURSOR->next method for the tiered cursor type.
- */
-static int
-__curtiered_next(WT_CURSOR *cursor)
-{
- WT_CURSOR *c;
- WT_CURSOR_TIERED *curtiered;
- WT_DECL_RET;
- WT_SESSION_IMPL *session;
- u_int i;
- int cmp;
- bool deleted;
-
- curtiered = (WT_CURSOR_TIERED *)cursor;
-
- CURSOR_API_CALL(cursor, session, next, NULL);
- __cursor_novalue(cursor);
- WT_ERR(__curtiered_enter(curtiered, false));
-
- /* If we aren't positioned for a forward scan, get started. */
- if (curtiered->current == NULL || !F_ISSET(curtiered, WT_CURTIERED_ITERATE_NEXT)) {
- WT_FORALL_CURSORS(curtiered, c, i)
- {
- if (!F_ISSET(cursor, WT_CURSTD_KEY_SET)) {
- WT_ERR(c->reset(c));
- ret = c->next(c);
- } else if (c != curtiered->current &&
- (ret = __curtiered_position_tier(curtiered, c, true, &cmp)) == 0 && cmp == 0 &&
- curtiered->current == NULL)
- curtiered->current = c;
- WT_ERR_NOTFOUND_OK(ret, false);
- }
- F_SET(curtiered, WT_CURTIERED_ITERATE_NEXT | WT_CURTIERED_MULTIPLE);
- F_CLR(curtiered, WT_CURTIERED_ITERATE_PREV);
-
- /* We just positioned *at* the key, now move. */
- if (curtiered->current != NULL)
- goto retry;
- } else {
-retry:
- /*
- * If there are multiple cursors on that key, move them forward.
- */
- if (F_ISSET(curtiered, WT_CURTIERED_MULTIPLE)) {
- WT_FORALL_CURSORS(curtiered, c, i)
- {
- if (!F_ISSET(c, WT_CURSTD_KEY_INT))
- continue;
- if (c != curtiered->current) {
- WT_ERR(
- WT_TIERED_CURCMP(session, curtiered->tiered, c, curtiered->current, cmp));
- if (cmp == 0)
- WT_ERR_NOTFOUND_OK(c->next(c), false);
- }
- }
- }
-
- /* Move the smallest cursor forward. */
- c = curtiered->current;
- WT_ERR_NOTFOUND_OK(c->next(c), false);
- }
-
- /* Find the cursor(s) with the smallest key. */
- if ((ret = __curtiered_get_current(session, curtiered, true, &deleted)) == 0 && deleted)
- goto retry;
-
-err:
- __curtiered_leave(curtiered);
- if (ret == 0)
- __curtiered_deleted_decode(curtiered, &cursor->value);
- API_END_RET(session, ret);
-}
-
-/*
- * __curtiered_prev --
- * WT_CURSOR->prev method for the tiered cursor type.
- */
-static int
-__curtiered_prev(WT_CURSOR *cursor)
-{
- WT_CURSOR *c;
- WT_CURSOR_TIERED *curtiered;
- WT_DECL_RET;
- WT_SESSION_IMPL *session;
- u_int i;
- int cmp;
- bool deleted;
-
- curtiered = (WT_CURSOR_TIERED *)cursor;
-
- CURSOR_API_CALL(cursor, session, prev, NULL);
- __cursor_novalue(cursor);
- WT_ERR(__curtiered_enter(curtiered, false));
-
- /* If we aren't positioned for a reverse scan, get started. */
- if (curtiered->current == NULL || !F_ISSET(curtiered, WT_CURTIERED_ITERATE_PREV)) {
- WT_FORALL_CURSORS(curtiered, c, i)
- {
- if (!F_ISSET(cursor, WT_CURSTD_KEY_SET)) {
- WT_ERR(c->reset(c));
- ret = c->prev(c);
- } else if (c != curtiered->current &&
- (ret = __curtiered_position_tier(curtiered, c, false, &cmp)) == 0 && cmp == 0 &&
- curtiered->current == NULL)
- curtiered->current = c;
- WT_ERR_NOTFOUND_OK(ret, false);
- }
- F_SET(curtiered, WT_CURTIERED_ITERATE_PREV | WT_CURTIERED_MULTIPLE);
- F_CLR(curtiered, WT_CURTIERED_ITERATE_NEXT);
-
- /* We just positioned *at* the key, now move. */
- if (curtiered->current != NULL)
- goto retry;
- } else {
-retry:
- /*
- * If there are multiple cursors on that key, move them backwards.
- */
- if (F_ISSET(curtiered, WT_CURTIERED_MULTIPLE)) {
- WT_FORALL_CURSORS(curtiered, c, i)
- {
- if (!F_ISSET(c, WT_CURSTD_KEY_INT))
- continue;
- if (c != curtiered->current) {
- WT_ERR(
- WT_TIERED_CURCMP(session, curtiered->tiered, c, curtiered->current, cmp));
- if (cmp == 0)
- WT_ERR_NOTFOUND_OK(c->prev(c), false);
- }
- }
- }
-
- /* Move the largest cursor backwards. */
- c = curtiered->current;
- WT_ERR_NOTFOUND_OK(c->prev(c), false);
- }
-
- /* Find the cursor(s) with the largest key. */
- if ((ret = __curtiered_get_current(session, curtiered, false, &deleted)) == 0 && deleted)
- goto retry;
-
-err:
- __curtiered_leave(curtiered);
- if (ret == 0)
- __curtiered_deleted_decode(curtiered, &cursor->value);
- API_END_RET(session, ret);
-}
-
-/*
- * __curtiered_reset --
- * WT_CURSOR->reset method for the tiered cursor type.
- */
-static int
-__curtiered_reset(WT_CURSOR *cursor)
-{
- WT_CURSOR_TIERED *curtiered;
- WT_DECL_RET;
- WT_SESSION_IMPL *session;
-
- /*
- * Don't use the normal __curtiered_enter path: that is wasted work when all we want to do is
- * give up our position.
- */
- curtiered = (WT_CURSOR_TIERED *)cursor;
- CURSOR_API_CALL_PREPARE_ALLOWED(cursor, session, reset, NULL);
- F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
-
- WT_TRET(__curtiered_reset_cursors(curtiered, NULL));
-
- /* In case we were left positioned, clear that. */
- __curtiered_leave(curtiered);
-
-err:
- API_END_RET(session, ret);
-}
-
-/*
- * __curtiered_lookup --
- * Position a tiered cursor.
- */
-static int
-__curtiered_lookup(WT_CURSOR_TIERED *curtiered, WT_ITEM *value)
-{
- WT_CURSOR *c, *cursor;
- WT_DECL_RET;
- u_int i;
-
- c = NULL;
- cursor = &curtiered->iface;
-
- WT_FORALL_CURSORS(curtiered, c, i)
- {
- c->set_key(c, &cursor->key);
- if ((ret = c->search(c)) == 0) {
- WT_ERR(c->get_key(c, &cursor->key));
- WT_ERR(c->get_value(c, value));
- if (__curtiered_deleted(curtiered, value))
- ret = WT_NOTFOUND;
- goto done;
- }
- WT_ERR_NOTFOUND_OK(ret, false);
- F_CLR(c, WT_CURSTD_KEY_SET);
- }
- WT_ERR(WT_NOTFOUND);
-
-done:
-err:
- if (ret == 0) {
- F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
- F_SET(cursor, WT_CURSTD_KEY_INT);
- curtiered->current = c;
- if (value == &cursor->value)
- F_SET(cursor, WT_CURSTD_VALUE_INT);
- } else if (c != NULL)
- WT_TRET(c->reset(c));
-
- return (ret);
-}
-
-/*
- * __curtiered_search --
- * WT_CURSOR->search method for the tiered cursor type.
- */
-static int
-__curtiered_search(WT_CURSOR *cursor)
-{
- WT_CURSOR_TIERED *curtiered;
- WT_DECL_RET;
- WT_SESSION_IMPL *session;
-
- curtiered = (WT_CURSOR_TIERED *)cursor;
-
- CURSOR_API_CALL(cursor, session, search, NULL);
- WT_ERR(__cursor_needkey(cursor));
- __cursor_novalue(cursor);
- WT_ERR(__curtiered_enter(curtiered, true));
- F_CLR(curtiered, WT_CURTIERED_ITERATE_NEXT | WT_CURTIERED_ITERATE_PREV);
-
- ret = __curtiered_lookup(curtiered, &cursor->value);
-
-err:
- __curtiered_leave(curtiered);
- if (ret == 0)
- __curtiered_deleted_decode(curtiered, &cursor->value);
- API_END_RET(session, ret);
-}
-
-/*
- * __curtiered_search_near --
- * WT_CURSOR->search_near method for the tiered cursor type.
- */
-static int
-__curtiered_search_near(WT_CURSOR *cursor, int *exactp)
-{
- WT_CURSOR *c, *closest;
- WT_CURSOR_TIERED *curtiered;
- WT_DECL_RET;
- WT_SESSION_IMPL *session;
- u_int i;
- int cmp, exact;
- bool deleted;
-
- closest = NULL;
- curtiered = (WT_CURSOR_TIERED *)cursor;
- exact = 0;
-
- CURSOR_API_CALL(cursor, session, search_near, NULL);
- WT_ERR(__cursor_needkey(cursor));
- __cursor_novalue(cursor);
- WT_ERR(__curtiered_enter(curtiered, true));
- F_CLR(curtiered, WT_CURTIERED_ITERATE_NEXT | WT_CURTIERED_ITERATE_PREV);
-
- /*
- * search_near is somewhat fiddly: we can't just use a nearby key from the first tier because
- * there could be a closer key in a lower tier.
- *
- * As we search down the tiers, we stop as soon as we find an exact match. Otherwise, we
- * maintain the smallest cursor larger than the search key and the largest cursor smaller than
- * the search key. At the end, we prefer the larger cursor, but if no record is larger, position
- * on the last record in the tree.
- */
- WT_FORALL_CURSORS(curtiered, c, i)
- {
- c->set_key(c, &cursor->key);
- if ((ret = c->search_near(c, &cmp)) == WT_NOTFOUND) {
- ret = 0;
- continue;
- }
- if (ret != 0)
- goto err;
-
- /* Do we have an exact match? */
- if (cmp == 0) {
- closest = c;
- exact = 1;
- break;
- }
-
- /*
- * Prefer larger cursors. There are two reasons: (1) we expect prefix searches to be a
- * common case (as in our own indices); and (2) we need a way to unambiguously know we have
- * the "closest" result.
- */
- if (cmp < 0) {
- if ((ret = c->next(c)) == WT_NOTFOUND) {
- ret = 0;
- continue;
- }
- if (ret != 0)
- goto err;
- }
-
- /*
- * We are trying to find the smallest cursor greater than the search key.
- */
- if (closest == NULL)
- closest = c;
- else {
- WT_ERR(WT_TIERED_CURCMP(session, curtiered->tiered, c, closest, cmp));
- if (cmp < 0)
- closest = c;
- }
- }
-
- /*
- * At this point, we either have an exact match, or closest is the smallest cursor larger than
- * the search key, or it is NULL if the search key is larger than any record in the tree.
- */
- cmp = exact ? 0 : 1;
-
- /*
- * If we land on a deleted item, try going forwards or backwards to find one that isn't deleted.
- * If the whole tree is empty, we'll end up with WT_NOTFOUND, as expected.
- */
- if (closest == NULL)
- deleted = true;
- else {
- WT_ERR(closest->get_key(closest, &cursor->key));
- WT_ERR(closest->get_value(closest, &cursor->value));
- curtiered->current = closest;
- closest = NULL;
- deleted = __curtiered_deleted(curtiered, &cursor->value);
- if (!deleted)
- __curtiered_deleted_decode(curtiered, &cursor->value);
- else {
- /*
- * We have a key pointing at memory that is pinned by the current tier cursor. In the
- * unlikely event that we have to reopen cursors to move to the next record, make sure
- * the cursor flags are set so a copy is made before the current tier cursor releases
- * its position.
- */
- F_CLR(cursor, WT_CURSTD_KEY_SET);
- F_SET(cursor, WT_CURSTD_KEY_INT);
- /*
- * We call __curtiered_next here as we want to advance forward. If we are a random
- * tiered cursor, calling next on the cursor will not advance as we intend.
- */
- if ((ret = __curtiered_next(cursor)) == 0) {
- cmp = 1;
- deleted = false;
- }
- }
- WT_ERR_NOTFOUND_OK(ret, false);
- }
- if (deleted) {
- curtiered->current = NULL;
- /*
- * We call prev directly here as cursor->prev may be "invalid" if this is a random cursor.
- */
- WT_ERR(__curtiered_prev(cursor));
- cmp = -1;
- }
- *exactp = cmp;
-
-err:
- __curtiered_leave(curtiered);
- if (closest != NULL)
- WT_TRET(closest->reset(closest));
-
- F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
- if (ret == 0) {
- F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
- } else
- curtiered->current = NULL;
-
- API_END_RET(session, ret);
-}
-
-/*
- * __curtiered_put --
- * Put an entry into the primary tree.
- */
-static inline int
-__curtiered_put(WT_CURSOR_TIERED *curtiered, const WT_ITEM *key, const WT_ITEM *value,
- bool position, bool reserve)
-{
- WT_CURSOR *primary;
- int (*func)(WT_CURSOR *);
-
- /*
- * Clear the existing cursor position. Don't clear the primary cursor: we're about to use it
- * anyway.
- */
- primary = curtiered->cursors[WT_TIERED_INDEX_LOCAL];
- WT_RET(__curtiered_reset_cursors(curtiered, primary));
-
- /* If necessary, set the position for future scans. */
- if (position)
- curtiered->current = primary;
-
- primary->set_key(primary, key);
-
- /* Our API always leaves the cursor positioned after a reserve call. */
- WT_ASSERT(CUR2S(curtiered), !reserve || position);
- func = primary->insert;
- if (position)
- func = reserve ? primary->reserve : primary->update;
- if (!reserve)
- primary->set_value(primary, value);
- return (func(primary));
-}
-
-/*
- * __curtiered_insert --
- * WT_CURSOR->insert method for the tiered cursor type.
- */
-static int
-__curtiered_insert(WT_CURSOR *cursor)
-{
- WT_CURSOR_TIERED *curtiered;
- WT_DECL_ITEM(buf);
- WT_DECL_RET;
- WT_ITEM value;
- WT_SESSION_IMPL *session;
-
- curtiered = (WT_CURSOR_TIERED *)cursor;
-
- CURSOR_UPDATE_API_CALL(cursor, session, insert);
- WT_ERR(__cursor_needkey(cursor));
- WT_ERR(__cursor_needvalue(cursor));
- WT_ERR(__curtiered_enter(curtiered, false));
-
- /*
- * It isn't necessary to copy the key out after the lookup in this case because any non-failed
- * lookup results in an error, and a failed lookup leaves the original key intact.
- */
- if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE) &&
- (ret = __curtiered_lookup(curtiered, &value)) != WT_NOTFOUND) {
- if (ret == 0)
- ret = WT_DUPLICATE_KEY;
- goto err;
- }
-
- WT_ERR(__curtiered_deleted_encode(session, &cursor->value, &value, &buf));
- WT_ERR(__curtiered_put(curtiered, &cursor->key, &value, false, false));
-
- /*
- * WT_CURSOR.insert doesn't leave the cursor positioned, and the application may want to free
- * the memory used to configure the insert; don't read that memory again (matching the
- * underlying file object cursor insert semantics).
- */
- F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
-
-err:
- __wt_scr_free(session, &buf);
- __curtiered_leave(curtiered);
- CURSOR_UPDATE_API_END(session, ret);
- return (ret);
-}
-
-/*
- * __curtiered_update --
- * WT_CURSOR->update method for the tiered cursor type.
- */
-static int
-__curtiered_update(WT_CURSOR *cursor)
-{
- WT_CURSOR_TIERED *curtiered;
- WT_DECL_ITEM(buf);
- WT_DECL_RET;
- WT_ITEM value;
- WT_SESSION_IMPL *session;
-
- curtiered = (WT_CURSOR_TIERED *)cursor;
-
- CURSOR_UPDATE_API_CALL(cursor, session, update);
- WT_ERR(__cursor_needkey(cursor));
- WT_ERR(__cursor_needvalue(cursor));
- WT_ERR(__curtiered_enter(curtiered, false));
-
- if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
- WT_ERR(__curtiered_lookup(curtiered, &value));
- /*
- * Copy the key out, since the insert resets non-primary tier cursors which our lookup may
- * have landed on.
- */
- WT_ERR(__cursor_needkey(cursor));
- }
- WT_ERR(__curtiered_deleted_encode(session, &cursor->value, &value, &buf));
- WT_ERR(__curtiered_put(curtiered, &cursor->key, &value, true, false));
-
- /*
- * Set the cursor to reference the internal key/value of the positioned cursor.
- */
- F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
- WT_ITEM_SET(cursor->key, curtiered->current->key);
- WT_ITEM_SET(cursor->value, curtiered->current->value);
- WT_ASSERT(session, F_MASK(curtiered->current, WT_CURSTD_KEY_SET) == WT_CURSTD_KEY_INT);
- WT_ASSERT(session, F_MASK(curtiered->current, WT_CURSTD_VALUE_SET) == WT_CURSTD_VALUE_INT);
- F_SET(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
-
-err:
- __wt_scr_free(session, &buf);
- __curtiered_leave(curtiered);
- CURSOR_UPDATE_API_END(session, ret);
- return (ret);
-}
-
-/*
- * __curtiered_remove --
- * WT_CURSOR->remove method for the tiered cursor type.
- */
-static int
-__curtiered_remove(WT_CURSOR *cursor)
-{
- WT_CURSOR_TIERED *curtiered;
- WT_DECL_RET;
- WT_ITEM value;
- WT_SESSION_IMPL *session;
- bool positioned;
-
- curtiered = (WT_CURSOR_TIERED *)cursor;
-
- /* Check if the cursor is positioned. */
- positioned = F_ISSET(cursor, WT_CURSTD_KEY_INT);
-
- CURSOR_REMOVE_API_CALL(cursor, session, NULL);
- WT_ERR(__cursor_needkey(cursor));
- __cursor_novalue(cursor);
- WT_ERR(__curtiered_enter(curtiered, false));
-
- if (!F_ISSET(cursor, WT_CURSTD_OVERWRITE)) {
- WT_ERR(__curtiered_lookup(curtiered, &value));
- /*
- * Copy the key out, since the insert resets non-primary tier cursors which our lookup may
- * have landed on.
- */
- WT_ERR(__cursor_needkey(cursor));
- }
- WT_ERR(__curtiered_put(curtiered, &cursor->key, &__tombstone, positioned, false));
-
- /*
- * If the cursor was positioned, it stays positioned with a key but no value; otherwise, there's
- * no position, key or value. This isn't just cosmetic: without a reset, iteration on this
- * cursor won't start at the beginning/end of the table.
- */
- F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
- if (positioned)
- F_SET(cursor, WT_CURSTD_KEY_INT);
- else
- WT_TRET(cursor->reset(cursor));
-
-err:
- __curtiered_leave(curtiered);
- CURSOR_UPDATE_API_END(session, ret);
- return (ret);
-}
-
-/*
- * __curtiered_reserve --
- * WT_CURSOR->reserve method for the tiered cursor type.
- *
- * The cursor must already have a key (checked with __cursor_needkey); __cursor_novalue is called
- * before any work is done. After the transaction context check and entering the tiered cursor,
- * the key is looked up with __curtiered_lookup and any lookup error is returned without calling
- * __curtiered_put. On a successful lookup the key is handed to __curtiered_put with a NULL value.
- * Once the cursor has been left and the API call ended, a successful reserve repeats the search
- * so the cursor holds a value for the caller. Returns zero on success, otherwise an error code.
- */
-static int
-__curtiered_reserve(WT_CURSOR *cursor)
-{
- WT_CURSOR_TIERED *curtiered;
- WT_DECL_RET;
- WT_ITEM value;
- WT_SESSION_IMPL *session;
-
- curtiered = (WT_CURSOR_TIERED *)cursor;
-
- CURSOR_UPDATE_API_CALL(cursor, session, reserve);
- WT_ERR(__cursor_needkey(cursor));
- __cursor_novalue(cursor);
- WT_ERR(__wt_txn_context_check(session, true));
- WT_ERR(__curtiered_enter(curtiered, false));
-
- WT_ERR(__curtiered_lookup(curtiered, &value));
- /*
- * Copy the key out, since the insert resets non-primary tier cursors which our lookup may have
- * landed on. The looked-up value itself is not used; the put below is given a NULL value, and
- * its result is stored in ret directly because the error label follows immediately.
- */
- WT_ERR(__cursor_needkey(cursor));
- ret = __curtiered_put(curtiered, &cursor->key, NULL, true, true);
-
-err:
- __curtiered_leave(curtiered);
- CURSOR_UPDATE_API_END(session, ret);
-
- /*
- * The application might do a WT_CURSOR.get_value call when we return, so we need a value and
- * the underlying functions didn't set one up. For various reasons, those functions may not have
- * done a search and any previous value in the cursor might race with WT_CURSOR.reserve (and in
- * cases like tiered, the reserve never encountered the original key). For simplicity, repeat
- * the search here. The search is only attempted when everything above succeeded; otherwise the
- * earlier error is returned unchanged.
- */
- return (ret == 0 ? cursor->search(cursor) : ret);
-}
-
-/*
- * __curtiered_next_random --
- * WT_CURSOR->next method for the tiered cursor type when configured with next_random.
- *
- * A starting tier is chosen from the session's random state, modulo WT_TIERED_MAX_TIERS, and up
- * to WT_TIERED_MAX_TIERS tiers are tried in order, wrapping back to tier zero. The first tier
- * whose __wt_curfile_next_random call does not return WT_NOTFOUND supplies a key, which is
- * copied into this cursor and then resolved with __curtiered_search_near. If every tier returns
- * WT_NOTFOUND the loop ends with that value still in ret and it is returned to the caller. On
- * any non-zero return the internal key and value flags are cleared so the cursor is not left
- * positioned.
- *
- * NOTE(review): curtiered->cursors[tier] is passed to __wt_curfile_next_random without a NULL
- * check; presumably __curtiered_enter guarantees every tier cursor is open -- confirm.
- */
-static int
-__curtiered_next_random(WT_CURSOR *cursor)
-{
- WT_CURSOR *c;
- WT_CURSOR_TIERED *curtiered;
- WT_DECL_RET;
- WT_SESSION_IMPL *session;
- u_int i, tier;
- int exact;
-
- c = NULL;
- curtiered = (WT_CURSOR_TIERED *)cursor;
-
- CURSOR_API_CALL(cursor, session, next, NULL);
- __cursor_novalue(cursor);
- WT_ERR(__curtiered_enter(curtiered, false));
-
- /*
- * Select a random tier. If it is empty, try the next tier and so on, wrapping around until we
- * find something or run out of tiers. The loop counter bounds the number of attempts while
- * the tier index is advanced separately so it can wrap.
- */
- tier = __wt_random(&session->rnd) % WT_TIERED_MAX_TIERS;
- for (i = 0; i < WT_TIERED_MAX_TIERS; i++) {
- c = curtiered->cursors[tier];
- WT_ERR_NOTFOUND_OK(__wt_curfile_next_random(c), true);
- if (ret == WT_NOTFOUND) {
- if (++tier == WT_TIERED_MAX_TIERS)
- tier = 0;
- continue;
- }
-
- F_SET(cursor, WT_CURSTD_KEY_INT);
- WT_ERR(c->get_key(c, &cursor->key));
- /*
- * Search near the current key to resolve any tombstones and position to a valid record. If
- * we see a WT_NOTFOUND here that is valid, as the tree has no documents visible to us. The
- * exact comparison result is not examined.
- */
- WT_ERR(__curtiered_search_near(cursor, &exact));
- break;
- }
-
-err:
- if (ret != 0) {
- /* We didn't find a valid record. Don't leave cursor positioned */
- F_CLR(cursor, WT_CURSTD_KEY_INT | WT_CURSTD_VALUE_INT);
- }
- __curtiered_leave(curtiered);
- API_END_RET(session, ret);
-}
-
-/*
- * __curtiered_insert_bulk --
- * WT_CURSOR->insert method for tiered bulk cursors.
- *
- * The tiered cursor's key and value items are handed to the cursor stored in the
- * WT_TIERED_INDEX_LOCAL slot and the insert is performed there; the other slots are not
- * touched. The key and value are passed as WT_ITEM pointers, presumably relying on the child
- * cursor being in raw mode (set by __curtiered_open_bulk) -- confirm. Returns the result of the
- * child cursor's insert.
- */
-static int
-__curtiered_insert_bulk(WT_CURSOR *cursor)
-{
- WT_CURSOR *bulk_cursor;
- WT_CURSOR_TIERED *curtiered;
- WT_SESSION_IMPL *session;
-
- curtiered = (WT_CURSOR_TIERED *)cursor;
- session = CUR2S(curtiered);
- bulk_cursor = curtiered->cursors[WT_TIERED_INDEX_LOCAL];
-
- WT_ASSERT(session, bulk_cursor != NULL);
- bulk_cursor->set_key(bulk_cursor, &cursor->key);
- bulk_cursor->set_value(bulk_cursor, &cursor->value);
- WT_RET(bulk_cursor->insert(bulk_cursor));
-
- return (0);
-}
-
-/*
- * __curtiered_open_bulk --
- * WT_SESSION->open_cursor method for tiered bulk cursors.
- *
- * Turns an already initialized tiered cursor into a bulk cursor: every method is first replaced
- * through __wt_cursor_set_notsup and then only insert and close are reinstated. The child
- * cursor array is allocated with WT_TIERED_MAX_TIERS slots, but only the WT_TIERED_INDEX_LOCAL
- * slot is opened, using the local tier's data handle name, the tiered cursor as owner and the
- * caller's configuration. If the allocation or the child open fails the cursor array is freed
- * and the error is returned; zero is returned on success.
- */
-static int
-__curtiered_open_bulk(WT_CURSOR_TIERED *curtiered, const char *cfg[])
-{
- WT_CURSOR *cursor;
- WT_DATA_HANDLE *dhandle;
- WT_DECL_RET;
- WT_SESSION_IMPL *session;
- WT_TIERED *tiered;
-
- cursor = &curtiered->iface;
- session = CUR2S(curtiered);
- tiered = curtiered->tiered;
-
- /* Bulk cursors only support insert and close. */
- __wt_cursor_set_notsup(cursor);
- cursor->insert = __curtiered_insert_bulk;
- cursor->close = __wt_curtiered_close;
-
- WT_ASSERT(session, curtiered->cursors == NULL);
- WT_ERR(__wt_calloc_def(session, WT_TIERED_MAX_TIERS, &curtiered->cursors));
-
- /* Open a bulk cursor on the local tier. */
- dhandle = tiered->tiers[WT_TIERED_INDEX_LOCAL].tier;
- WT_ASSERT(session, dhandle != NULL);
- WT_ERR(__wt_open_cursor(
- session, dhandle->name, cursor, cfg, &curtiered->cursors[WT_TIERED_INDEX_LOCAL]));
-
- /* Child cursors always use overwrite and raw mode. */
- F_SET(curtiered->cursors[WT_TIERED_INDEX_LOCAL], WT_CURSTD_OVERWRITE | WT_CURSTD_RAW);
-
- if (0) {
-err:
- __wt_free(session, curtiered->cursors);
- }
- return (ret);
-}
-
-/*
- * __wt_curtiered_open --
- * WT_SESSION->open_cursor method for tiered cursors.
- *
- * The URI must start with "tiered:". Opening by checkpoint is rejected with EINVAL. When the
- * "bulk" configuration is set the data handle is requested with WT_DHANDLE_EXCLUSIVE, and an
- * EBUSY result from that request is reported to the caller as EINVAL. Those early validation
- * failures return directly and do not write to *cursorp.
- *
- * Once the handle is held, the cursor is allocated, initialized from the static method table and
- * given the handle's name and key/value formats. Storing the handle in curtiered->tiered and
- * clearing the local pointer hands responsibility for it to the cursor, which is why the error
- * path either closes the cursor or releases the handle, never both. A "next_random"
- * configuration replaces all methods with not-supported stubs except next; a bulk open then
- * reconfigures the methods again in __curtiered_open_bulk. On any error after the handle was
- * requested, *cursorp is set to NULL and the error is returned.
- *
- * NOTE(review): if __wt_strdup fails, curtiered is non-NULL but curtiered->tiered has not been
- * set yet, so the error path takes the close branch and skips the explicit handle release;
- * whether the handle is still released depends on __wt_curtiered_close -- confirm.
- */
-int
-__wt_curtiered_open(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[],
- WT_CURSOR **cursorp)
-{
- WT_CONFIG_ITEM cval;
- WT_CURSOR_STATIC_INIT(iface, __wt_cursor_get_key, /* get-key */
- __wt_cursor_get_value, /* get-value */
- __wt_cursor_set_key, /* set-key */
- __wt_cursor_set_value, /* set-value */
- __curtiered_compare, /* compare */
- __wt_cursor_equals, /* equals */
- __curtiered_next, /* next */
- __curtiered_prev, /* prev */
- __curtiered_reset, /* reset */
- __curtiered_search, /* search */
- __curtiered_search_near, /* search-near */
- __curtiered_insert, /* insert */
- __wt_cursor_modify_value_format_notsup, /* modify */
- __curtiered_update, /* update */
- __curtiered_remove, /* remove */
- __curtiered_reserve, /* reserve */
- __wt_cursor_reconfigure, /* reconfigure */
- __wt_cursor_notsup, /* cache */
- __wt_cursor_reopen_notsup, /* reopen */
- __wt_curtiered_close); /* close */
- WT_CURSOR *cursor;
- WT_CURSOR_TIERED *curtiered;
- WT_DECL_RET;
- WT_TIERED *tiered;
- bool bulk;
-
- WT_STATIC_ASSERT(offsetof(WT_CURSOR_TIERED, iface) == 0);
-
- curtiered = NULL;
- cursor = NULL;
- tiered = NULL;
-
- if (!WT_PREFIX_MATCH(uri, "tiered:"))
- return (__wt_unexpected_object_type(session, uri, "tiered:"));
-
- WT_RET(__wt_config_gets_def(session, cfg, "checkpoint", 0, &cval));
- if (cval.len != 0)
- WT_RET_MSG(session, EINVAL, "tiered tables do not support opening by checkpoint");
-
- WT_RET(__wt_config_gets_def(session, cfg, "bulk", 0, &cval));
- bulk = cval.val != 0;
-
- /* Get the tiered data handle. */
- ret = __wt_session_get_dhandle(session, uri, NULL, cfg, bulk ? WT_DHANDLE_EXCLUSIVE : 0);
-
- /* Check whether the exclusive open for a bulk load succeeded. */
- if (bulk && ret == EBUSY)
- ret = EINVAL;
- /* Flag any errors from the tree get. */
- WT_ERR(ret);
-
- tiered = (WT_TIERED *)session->dhandle;
-
- /* Make sure we have exclusive access if and only if we want it */
- WT_ASSERT(session, !bulk || tiered->iface.excl_session != NULL);
-
- WT_ERR(__wt_calloc_one(session, &curtiered));
- cursor = (WT_CURSOR *)curtiered;
- *cursor = iface;
- cursor->session = (WT_SESSION *)session;
- WT_ERR(__wt_strdup(session, tiered->iface.name, &cursor->uri));
- cursor->key_format = tiered->key_format;
- cursor->value_format = tiered->value_format;
-
- curtiered->tiered = tiered;
- tiered = NULL;
-
- /* If the next_random option is set, configure a random cursor */
- WT_ERR(__wt_config_gets_def(session, cfg, "next_random", 0, &cval));
- if (cval.val != 0) {
- __wt_cursor_set_notsup(cursor);
- cursor->next = __curtiered_next_random;
- }
-
- WT_ERR(__wt_cursor_init(cursor, cursor->uri, owner, cfg, cursorp));
-
- if (bulk)
- WT_ERR(__curtiered_open_bulk(curtiered, cfg));
-
- if (0) {
-err:
- if (curtiered != NULL)
- WT_TRET(__wt_curtiered_close(cursor));
- else if (tiered != NULL)
- WT_WITH_DHANDLE(
- session, (WT_DATA_HANDLE *)tiered, WT_TRET(__wt_session_release_dhandle(session)));
-
- *cursorp = NULL;
- }
-
- return (ret);
-}
diff --git a/src/third_party/wiredtiger/src/tiered/tiered_handle.c b/src/third_party/wiredtiger/src/tiered/tiered_handle.c
index 363a9c97140..11ecd0a4941 100644
--- a/src/third_party/wiredtiger/src/tiered/tiered_handle.c
+++ b/src/third_party/wiredtiger/src/tiered/tiered_handle.c
@@ -138,7 +138,7 @@ __tiered_create_object(WT_SESSION_IMPL *session, WT_TIERED *tiered)
orig_name = tiered->tiers[WT_TIERED_INDEX_LOCAL].name;
/*
* If we have an existing local file in the tier, alter the table to indicate this one is now
- * readonly.
+ * readonly. We are already holding the schema lock so we can call alter.
*/
if (orig_name != NULL) {
cfg[0] = "readonly=true";
@@ -161,13 +161,6 @@ __tiered_create_object(WT_SESSION_IMPL *session, WT_TIERED *tiered)
/* Create the new shared object. */
WT_ERR(__wt_schema_create(session, name, config));
-#if 0
- /*
- * If we get here we have successfully created the object. It is ready to be fully flushed to
- * the cloud. Push a work element to let the internal thread do that here.
- */
-#endif
-
err:
__wt_free(session, config);
__wt_free(session, name);
@@ -313,10 +306,6 @@ static int
__tiered_switch(WT_SESSION_IMPL *session, const char *config)
{
WT_DECL_RET;
-#if 0
- WT_FILE_SYSTEM *fs;
- WT_STORAGE_SOURCE *storage_source;
-#endif
WT_TIERED *tiered;
bool need_object, need_tree, tracking;
@@ -357,42 +346,22 @@ __tiered_switch(WT_SESSION_IMPL *session, const char *config)
WT_RET(__wt_meta_track_on(session));
tracking = true;
- /* Create the object: entry in the metadata. */
- if (need_object)
- WT_ERR(__tiered_create_object(session, tiered));
-
if (need_tree)
WT_ERR(__tiered_create_tier_tree(session, tiered));
+ /* Create the object: entry in the metadata. */
+ if (need_object) {
+ WT_ERR(__tiered_create_object(session, tiered));
+#if 1
+ WT_ERR(__wt_tiered_put_flush(session, tiered));
+#else
+ WT_ERR(__wt_tier_flush(session, tiered, tiered->current_id));
+#endif
+ }
+
/* We always need to create a local object. */
WT_ERR(__tiered_create_local(session, tiered));
-#if 0
- /*
- * We expect this part to be done asynchronously in its own thread. First flush the contents of
- * the data file to the new cloud object.
- */
- storage_source = tiered->bstorage->storage_source;
- fs = tiered->bucket_storage->file_system;
- WT_ASSERT(session, storage_source != NULL);
-
- /* This call make take a while, and may fail due to network timeout. */
- WT_ERR(storage_source->ss_flush(storage_source, &session->iface,
- fs, old_filename, object_name, NULL));
-
- /*
- * The metadata for the old local object will be initialized with "flush=0". When the flush call
- * completes, it can be marked as "flush=1". When that's done, we can finish the flush. The
- * flush finish call moves the file from the home directory to the extension's cache. Then the
- * extension will own it.
- *
- * We may need a way to restart flushes for those not completed (after a crash), or failed (due
- * to previous network outage).
- */
- WT_ERR(storage_source->ss_flush_finish(storage_source, &session->iface,
- fs, old_filename, object_name, NULL));
-#endif
-
/* Update the tiered: metadata to new object number and tiered array. */
WT_ERR(__tiered_update_metadata(session, tiered, config));
tracking = false;
@@ -485,7 +454,10 @@ __tiered_open(WT_SESSION_IMPL *session, const char *cfg[])
WT_DECL_ITEM(tmp);
WT_DECL_RET;
WT_TIERED *tiered;
+#if 1
+ WT_TIERED_WORK_UNIT *entry;
uint32_t unused;
+#endif
char *metaconf;
const char *obj_cfg[] = {WT_CONFIG_BASE(session, object_meta), NULL, NULL};
const char **tiered_cfg, *config;
@@ -549,10 +521,17 @@ __tiered_open(WT_SESSION_IMPL *session, const char *cfg[])
__wt_free(session, dhandle->cfg[1]);
dhandle->cfg[1] = metaconf;
}
+#if 1
if (0) {
/* Temp code to keep s_all happy. */
FLD_SET(unused, WT_TIERED_OBJ_LOCAL | WT_TIERED_TREE_UNUSED);
+ FLD_SET(unused, WT_TIERED_WORK_FORCE | WT_TIERED_WORK_FREE);
+ WT_ERR(__wt_tiered_put_drop_local(session, tiered, tiered->current_id));
+ WT_ERR(__wt_tiered_put_drop_shared(session, tiered, tiered->current_id));
+ __wt_tiered_get_drop_local(session, 0, &entry);
+ __wt_tiered_get_drop_shared(session, &entry);
}
+#endif
if (0) {
err:
diff --git a/src/third_party/wiredtiger/src/tiered/tiered_work.c b/src/third_party/wiredtiger/src/tiered/tiered_work.c
new file mode 100644
index 00000000000..728a7a0b3b2
--- /dev/null
+++ b/src/third_party/wiredtiger/src/tiered/tiered_work.c
@@ -0,0 +1,151 @@
+/*-
+ * Copyright (c) 2014-present MongoDB, Inc.
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_tiered_push_work --
+ *     Append a work unit to the tail of the connection's tiered work queue and signal the tiered
+ *     condition. The entry must already be filled out by the caller; the pointer itself is
+ *     linked into the queue, nothing is copied.
+ */
+void
+__wt_tiered_push_work(WT_SESSION_IMPL *session, WT_TIERED_WORK_UNIT *entry)
+{
+    WT_CONNECTION_IMPL *connection;
+
+    connection = S2C(session);
+
+    /* Link the entry and count it while holding the tiered lock. */
+    __wt_spin_lock(session, &connection->tiered_lock);
+    TAILQ_INSERT_TAIL(&connection->tieredqh, entry, q);
+    WT_STAT_CONN_INCR(session, tiered_work_units_created);
+    __wt_spin_unlock(session, &connection->tiered_lock);
+
+    /* The signal is sent after the lock has been dropped. */
+    __wt_cond_signal(session, connection->tiered_cond);
+}
+
+/*
+ * __wt_tiered_pop_work --
+ *     Remove and return the first queued work unit whose type is included in the given type
+ *     flags. A non-zero maximum value additionally restricts the match to units whose op_val is
+ *     strictly less than that maximum; zero disables the restriction. The result is NULL when
+ *     nothing matches. The caller owns the returned work unit and is responsible for freeing it.
+ */
+void
+__wt_tiered_pop_work(
+  WT_SESSION_IMPL *session, uint32_t type, uint64_t maxval, WT_TIERED_WORK_UNIT **entryp)
+{
+    WT_CONNECTION_IMPL *conn;
+    WT_TIERED_WORK_UNIT *unit;
+
+    *entryp = NULL;
+    conn = S2C(session);
+
+    /* Peek at the queue without the lock: if it is empty there is nothing to hand out. */
+    if (TAILQ_EMPTY(&conn->tieredqh))
+        return;
+
+    __wt_spin_lock(session, &conn->tiered_lock);
+    TAILQ_FOREACH (unit, &conn->tieredqh, q) {
+        /* Skip units whose type was not asked for. */
+        if (!FLD_ISSET(type, unit->type))
+            continue;
+        /* With a maximum in effect, skip units at or beyond it. */
+        if (maxval != 0 && unit->op_val >= maxval)
+            continue;
+
+        /* First match wins: unlink it, count it and stop scanning. */
+        TAILQ_REMOVE(&conn->tieredqh, unit, q);
+        WT_STAT_CONN_INCR(session, tiered_work_units_dequeued);
+        *entryp = unit;
+        break;
+    }
+    __wt_spin_unlock(session, &conn->tiered_lock);
+}
+
+/*
+ * __wt_tiered_get_flush --
+ * Get the first flush work unit from the queue. The id information cannot change between our
+ * caller and here. The caller is responsible for freeing the work unit.
+ *
+ * This is a wrapper around __wt_tiered_pop_work for the WT_TIERED_WORK_FLUSH type. The maximum
+ * value passed is zero, which means the work unit's op_val is not checked. *entryp is set to
+ * NULL when no flush work unit is queued.
+ */
+void
+__wt_tiered_get_flush(WT_SESSION_IMPL *session, WT_TIERED_WORK_UNIT **entryp)
+{
+ __wt_tiered_pop_work(session, WT_TIERED_WORK_FLUSH, 0, entryp);
+ return;
+}
+
+/*
+ * __wt_tiered_get_drop_local --
+ * Get a drop local work unit if it is less than the time given. The caller is responsible for
+ * freeing the work unit.
+ *
+ * This is a wrapper around __wt_tiered_pop_work for the WT_TIERED_WORK_DROP_LOCAL type, with
+ * the given time used as the maximum value: only a unit whose op_val is strictly less than
+ * "now" is returned. Passing zero for "now" disables that comparison, so the first queued drop
+ * local unit is returned regardless of its op_val. *entryp is set to NULL when nothing
+ * qualifies.
+ */
+void
+__wt_tiered_get_drop_local(WT_SESSION_IMPL *session, uint64_t now, WT_TIERED_WORK_UNIT **entryp)
+{
+ __wt_tiered_pop_work(session, WT_TIERED_WORK_DROP_LOCAL, now, entryp);
+ return;
+}
+
+/*
+ * __wt_tiered_get_drop_shared --
+ * Get a drop shared work unit. The caller is responsible for freeing the work unit.
+ *
+ * This is a wrapper around __wt_tiered_pop_work for the WT_TIERED_WORK_DROP_SHARED type. The
+ * maximum value passed is zero, which means the work unit's op_val is not checked. *entryp is
+ * set to NULL when no drop shared work unit is queued.
+ */
+void
+__wt_tiered_get_drop_shared(WT_SESSION_IMPL *session, WT_TIERED_WORK_UNIT **entryp)
+{
+ __wt_tiered_pop_work(session, WT_TIERED_WORK_DROP_SHARED, 0, entryp);
+ return;
+}
+
+/*
+ * __wt_tiered_put_drop_local --
+ *     Allocate a drop local work unit for the given object ID and push it onto the work queue.
+ *     The unit's op_val is the time at which the object expires: the current time plus the
+ *     bucket storage's retain_secs. Returns zero on success or the allocation error.
+ */
+int
+__wt_tiered_put_drop_local(WT_SESSION_IMPL *session, WT_TIERED *tiered, uint64_t id)
+{
+    WT_TIERED_WORK_UNIT *unit;
+    uint64_t current_secs;
+
+    WT_RET(__wt_calloc_one(session, &unit));
+    unit->tiered = tiered;
+    unit->id = id;
+    unit->type = WT_TIERED_WORK_DROP_LOCAL;
+
+    /* The expiry is derived from the bucket storage, so it has to be present. */
+    WT_ASSERT(session, tiered->bstorage != NULL);
+    __wt_seconds(session, &current_secs);
+    unit->op_val = current_secs + tiered->bstorage->retain_secs;
+
+    /* The queue now holds the unit; whoever pops it frees it. */
+    __wt_tiered_push_work(session, unit);
+    return (0);
+}
+
+/*
+ * __wt_tiered_put_drop_shared --
+ *     Allocate a drop shared work unit for the given object ID and push it onto the work queue.
+ *     The unit's op_val is left at the zero value from allocation. Returns zero on success or
+ *     the allocation error.
+ */
+int
+__wt_tiered_put_drop_shared(WT_SESSION_IMPL *session, WT_TIERED *tiered, uint64_t id)
+{
+    WT_TIERED_WORK_UNIT *unit;
+
+    WT_RET(__wt_calloc_one(session, &unit));
+    unit->tiered = tiered;
+    unit->id = id;
+    unit->type = WT_TIERED_WORK_DROP_SHARED;
+
+    /* The queue now holds the unit; whoever pops it frees it. */
+    __wt_tiered_push_work(session, unit);
+    return (0);
+}
+
+/*
+ * __wt_tiered_put_flush --
+ *     Allocate a flush work unit for the tiered structure's current object ID and push it onto
+ *     the work queue. We're single threaded so the tiered structure's id information cannot
+ *     change between our caller and here. Returns zero on success or the allocation error.
+ */
+int
+__wt_tiered_put_flush(WT_SESSION_IMPL *session, WT_TIERED *tiered)
+{
+    WT_TIERED_WORK_UNIT *unit;
+
+    WT_RET(__wt_calloc_one(session, &unit));
+    unit->tiered = tiered;
+    /* Record the ID as it is right now rather than reading it when the unit is processed. */
+    unit->id = tiered->current_id;
+    unit->type = WT_TIERED_WORK_FLUSH;
+
+    /* The queue now holds the unit; whoever pops it frees it. */
+    __wt_tiered_push_work(session, unit);
+    return (0);
+}