summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/btree/bt_cursor.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/third_party/wiredtiger/src/btree/bt_cursor.c')
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_cursor.c295
1 files changed, 137 insertions, 158 deletions
diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c
index c45d9ed8b6b..8f64dd5562e 100644
--- a/src/third_party/wiredtiger/src/btree/bt_cursor.c
+++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c
@@ -54,7 +54,7 @@ __cursor_state_restore(WT_CURSOR *cursor, WT_CURFILE_STATE *state)
* Return if we have a page pinned.
*/
static inline bool
-__cursor_page_pinned(WT_CURSOR_BTREE *cbt)
+__cursor_page_pinned(WT_CURSOR_BTREE *cbt, bool search_operation)
{
WT_CURSOR *cursor;
WT_SESSION_IMPL *session;
@@ -72,11 +72,25 @@ __cursor_page_pinned(WT_CURSOR_BTREE *cbt)
}
/*
- * Check if the key references the page. When returning from search, the page is active and the
- * key is internal. After the application sets a key, the key is external, and the page is
- * useless.
+ * Check if the key references an item on a page. When returning from search, the page is pinned
+ * and the key is internal. After the application sets a key, the key becomes external. For the
+ * search and search-near operations, we assume locality and check any pinned page first on each
+ * new search operation. For operations other than search and search-near, check if we have an
+ * internal key. If the page is pinned and we're pointing into the page, we don't need to search
+ * at all, we can proceed with the operation. However, if the key has been set, that is, it's an
+ * external key, we're going to have to do a full search.
*/
- if (!F_ISSET(cursor, WT_CURSTD_KEY_INT))
+ if (!search_operation && !F_ISSET(cursor, WT_CURSTD_KEY_INT))
+ return (false);
+
+ /*
+ * XXX No fast-path searches at read-committed isolation. Underlying transactional functions
+ * called by the fast and slow path search code handle transaction IDs differently, resulting in
+ * different search results at read-committed isolation. This makes no difference for the update
+ * functions, but in the case of a search, we will see different results based on the cursor's
+ * initial location. See WT-5134 for the details.
+ */
+ if (search_operation && session->txn.isolation == WT_ISO_READ_COMMITTED)
return (false);
/*
@@ -181,15 +195,13 @@ static inline bool
__cursor_fix_implicit(WT_BTREE *btree, WT_CURSOR_BTREE *cbt)
{
/*
- * When there's no exact match, column-store search returns the key
- * nearest the searched-for key (continuing past keys smaller than the
- * searched-for key to return the next-largest key). Therefore, if the
- * returned comparison is -1, the searched-for key was larger than any
- * row on the page's standard information or column-store insert list.
+ * When there's no exact match, column-store search returns the key nearest the searched-for key
+ * (continuing past keys smaller than the searched-for key to return the next-largest key).
+ * Therefore, if the returned comparison is -1, the searched-for key was larger than any row on
+ * the page's standard information or column-store insert list.
*
- * If the returned comparison is NOT -1, there was a row equal to or
- * larger than the searched-for key, and we implicitly create missing
- * rows.
+ * If the returned comparison is NOT -1, there was a row equal to or larger than the
+ * searched-for key, and we implicitly create missing rows.
*/
return (btree->type == BTREE_COL_FIX && cbt->compare != -1);
}
@@ -541,7 +553,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt)
* pinned page doesn't find an exact match, search from the root.
*/
valid = false;
- if (__cursor_page_pinned(cbt)) {
+ if (__cursor_page_pinned(cbt, true)) {
__wt_txn_cursor_op(session);
WT_ERR(btree->type == BTREE_ROW ? __cursor_row_search(session, cbt, cbt->ref, false) :
@@ -630,19 +642,17 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
__cursor_state_save(cursor, &state);
/*
- * If we have a row-store page pinned, search it; if we don't have a
- * page pinned, or the search of the pinned page doesn't find an exact
- * match, search from the root. Unlike WT_CURSOR.search, ignore pinned
- * pages in the case of column-store, search-near isn't an interesting
- * enough case for column-store to add the complexity needed to avoid
- * the tree search.
+ * If we have a row-store page pinned, search it; if we don't have a page pinned, or the search
+ * of the pinned page doesn't find an exact match, search from the root. Unlike
+ * WT_CURSOR.search, ignore pinned pages in the case of column-store, search-near isn't an
+ * interesting enough case for column-store to add the complexity needed to avoid the tree
+ * search.
*
- * Set the "insert" flag for the btree row-store search; we may intend
- * to position the cursor at the end of the tree, rather than match an
- * existing record.
+ * Set the "insert" flag for the btree row-store search; we may intend to position the cursor at
+ * the end of the tree, rather than match an existing record.
*/
valid = false;
- if (btree->type == BTREE_ROW && __cursor_page_pinned(cbt)) {
+ if (btree->type == BTREE_ROW && __cursor_page_pinned(cbt, true)) {
__wt_txn_cursor_op(session);
WT_ERR(__cursor_row_search(session, cbt, cbt->ref, true));
@@ -667,17 +677,15 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp)
/*
* If we find a valid key, return it.
*
- * Else, creating a record past the end of the tree in a fixed-length
- * column-store implicitly fills the gap with empty records. In this
- * case, we instantiate the empty record, it's an exact match.
+ * Else, creating a record past the end of the tree in a fixed-length column-store implicitly
+ * fills the gap with empty records. In this case, we instantiate the empty record, it's an
+ * exact match.
*
- * Else, move to the next key in the tree (bias for prefix searches).
- * Cursor next skips invalid rows, so we don't have to test for them
- * again.
+ * Else, move to the next key in the tree (bias for prefix searches). Cursor next skips invalid
+ * rows, so we don't have to test for them again.
*
- * Else, redo the search and move to the previous key in the tree.
- * Cursor previous skips invalid rows, so we don't have to test for
- * them again.
+ * Else, redo the search and move to the previous key in the tree. Cursor previous skips invalid
+ * rows, so we don't have to test for them again.
*
* If that fails, quit, there's no record to return.
*/
@@ -784,16 +792,14 @@ __wt_btcur_insert(WT_CURSOR_BTREE *cbt)
__cursor_state_save(cursor, &state);
/*
- * If inserting with overwrite configured, and positioned to an on-page
- * key, the update doesn't require another search. Cursors configured
- * for append aren't included, regardless of whether or not they meet
- * all other criteria.
+ * If inserting with overwrite configured, and positioned to an on-page key, the update doesn't
+ * require another search. Cursors configured for append aren't included, regardless of whether
+ * or not they meet all other criteria.
*
- * Fixed-length column store can never use a positioned cursor to update
- * because the cursor may not be positioned to the correct record in the
- * case of implicit records in the append list.
+ * Fixed-length column store can never use a positioned cursor to update because the cursor may
+ * not be positioned to the correct record in the case of implicit records in the append list.
*/
- if (btree->type != BTREE_COL_FIX && __cursor_page_pinned(cbt) &&
+ if (btree->type != BTREE_COL_FIX && __cursor_page_pinned(cbt, false) &&
F_ISSET(cursor, WT_CURSTD_OVERWRITE) && !append_key) {
WT_ERR(__wt_txn_autocommit_check(session));
/*
@@ -997,29 +1003,24 @@ __wt_btcur_remove(WT_CURSOR_BTREE *cbt, bool positioned)
__cursor_state_save(cursor, &state);
/*
- * If remove positioned to an on-page key, the remove doesn't require
- * another search. We don't care about the "overwrite" configuration
- * because regardless of the overwrite setting, any existing record is
- * removed, and the record must exist with a positioned cursor.
+ * If remove positioned to an on-page key, the remove doesn't require another search. We don't
+ * care about the "overwrite" configuration because regardless of the overwrite setting, any
+ * existing record is removed, and the record must exist with a positioned cursor.
*
- * There's trickiness in the page-pinned check. By definition a remove
- * operation leaves a cursor positioned if it's initially positioned.
- * However, if every item on the page is deleted and we unpin the page,
- * eviction might delete the page and our search will re-instantiate an
- * empty page for us. Cursor remove returns not-found whether or not
- * that eviction/deletion happens and it's OK unless cursor-overwrite
- * is configured (which means we return success even if there's no item
- * to delete). In that case, we'll fail when we try to point the cursor
- * at the key on the page to satisfy the positioned requirement. It's
- * arguably safe to simply leave the key initialized in the cursor (as
- * that's all a positioned cursor implies), but it's probably safer to
- * avoid page eviction entirely in the positioned case.
+ * There's trickiness in the page-pinned check. By definition a remove operation leaves a cursor
+ * positioned if it's initially positioned. However, if every item on the page is deleted and we
+ * unpin the page, eviction might delete the page and our search will re-instantiate an empty
+ * page for us. Cursor remove returns not-found whether or not that eviction/deletion happens
+ * and it's OK unless cursor-overwrite is configured (which means we return success even if
+ * there's no item to delete). In that case, we'll fail when we try to point the cursor at the
+ * key on the page to satisfy the positioned requirement. It's arguably safe to simply leave the
+ * key initialized in the cursor (as that's all a positioned cursor implies), but it's probably
+ * safer to avoid page eviction entirely in the positioned case.
*
- * Fixed-length column store can never use a positioned cursor to update
- * because the cursor may not be positioned to the correct record in the
- * case of implicit records in the append list.
+ * Fixed-length column store can never use a positioned cursor to update because the cursor may
+ * not be positioned to the correct record in the case of implicit records in the append list.
*/
- if (btree->type != BTREE_COL_FIX && __cursor_page_pinned(cbt)) {
+ if (btree->type != BTREE_COL_FIX && __cursor_page_pinned(cbt, false)) {
WT_ERR(__wt_txn_autocommit_check(session));
/*
@@ -1036,12 +1037,11 @@ __wt_btcur_remove(WT_CURSOR_BTREE *cbt, bool positioned)
retry:
/*
- * Note these steps must be repeatable, we'll continue to take this path
- * as long as we encounter WT_RESTART.
+ * Note these steps must be repeatable, we'll continue to take this path as long as we encounter
+ * WT_RESTART.
*
- * Any pinned page goes away if we do a search, including as a result of
- * a restart. Get a local copy of any pinned key and re-save the cursor
- * state: we may retry but eventually fail.
+ * Any pinned page goes away if we do a search, including as a result of a restart. Get a local
+ * copy of any pinned key and re-save the cursor state: we may retry but eventually fail.
*/
WT_ERR(__cursor_localkey(cursor));
__cursor_state_save(cursor, &state);
@@ -1085,14 +1085,12 @@ retry:
if (!__cursor_fix_implicit(btree, cbt))
goto search_notfound;
/*
- * Creating a record past the end of the tree in a
- * fixed-length column-store implicitly fills the
- * gap with empty records. Return success in that
- * case, the record was deleted successfully.
+ * Creating a record past the end of the tree in a fixed-length column-store implicitly
+ * fills the gap with empty records. Return success in that case, the record was deleted
+ * successfully.
*
- * Correct the btree cursor's location: the search
- * will have pointed us at the previous/next item,
- * and that's not correct.
+ * Correct the btree cursor's location: the search will have pointed us at the
+ * previous/next item, and that's not correct.
*/
cbt->recno = cursor->recno;
} else
@@ -1107,11 +1105,10 @@ err:
if (ret == 0) {
/*
- * If positioned originally, but we had to do a search, acquire
- * a position so we can return success.
+ * If positioned originally, but we had to do a search, acquire a position so we can return
+ * success.
*
- * If not positioned originally, leave it that way, clear any
- * key and reset the cursor.
+ * If not positioned originally, leave it that way, clear any key and reset the cursor.
*/
if (positioned) {
if (searched)
@@ -1192,16 +1189,14 @@ __btcur_update(WT_CURSOR_BTREE *cbt, WT_ITEM *value, u_int modify_type)
__cursor_state_save(cursor, &state);
/*
- * If update positioned to an on-page key, the update doesn't require
- * another search. We don't care about the "overwrite" configuration
- * because regardless of the overwrite setting, any existing record is
- * updated, and the record must exist with a positioned cursor.
+ * If update positioned to an on-page key, the update doesn't require another search. We don't
+ * care about the "overwrite" configuration because regardless of the overwrite setting, any
+ * existing record is updated, and the record must exist with a positioned cursor.
*
- * Fixed-length column store can never use a positioned cursor to update
- * because the cursor may not be positioned to the correct record in the
- * case of implicit records in the append list.
+ * Fixed-length column store can never use a positioned cursor to update because the cursor may
+ * not be positioned to the correct record in the case of implicit records in the append list.
*/
- if (btree->type != BTREE_COL_FIX && __cursor_page_pinned(cbt)) {
+ if (btree->type != BTREE_COL_FIX && __cursor_page_pinned(cbt, false)) {
WT_ERR(__wt_txn_autocommit_check(session));
/*
@@ -1349,23 +1344,20 @@ __cursor_chain_exceeded(WT_CURSOR_BTREE *cbt)
/*
* Step through the modify operations at the beginning of the chain.
*
- * Deleted or standard updates are anticipated to be sufficient to base
- * the modify (although that's not guaranteed: they may not be visible
- * or might abort before we read them). Also, this is not a hard
- * limit, threads can race modifying updates.
+ * Deleted or standard updates are anticipated to be sufficient to base the modify (although
+ * that's not guaranteed: they may not be visible or might abort before we read them). Also,
+ * this is not a hard limit, threads can race modifying updates.
*
- * If the total size in bytes of the updates exceeds some factor of the
- * underlying value size (which we know because the cursor is
- * positioned), create a new full copy of the value. This limits the
- * cache pressure from creating full copies to that factor: with the
- * default factor of 1, the total size in memory of a set of modify
- * updates is limited to double the size of the modifies.
+ * If the total size in bytes of the updates exceeds some factor of the underlying value size
+ * (which we know because the cursor is positioned), create a new full copy of the value. This
+ * limits the cache pressure from creating full copies to that factor: with the default factor
+ * of 1, the total size in memory of a set of modify updates is limited to double the size of
+ * the modifies.
*
- * Otherwise, limit the length of the update chain to a fixed size to
- * bound the cost of rebuilding the value during reads. When history
- * has to be maintained, creating extra copies of large documents
- * multiplies cache pressure because the old ones cannot be freed, so
- * allow the modify chain to grow.
+ * Otherwise, limit the length of the update chain to a fixed size to bound the cost of
+ * rebuilding the value during reads. When history has to be maintained, creating extra copies
+ * of large documents multiplies cache pressure because the old ones cannot be freed, so allow
+ * the modify chain to grow.
*/
for (i = 0, upd_size = 0; upd != NULL && upd->type == WT_UPDATE_MODIFY; ++i, upd = upd->next) {
upd_size += WT_UPDATE_MEMSIZE(upd);
@@ -1400,26 +1392,22 @@ __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries)
__cursor_state_save(cursor, &state);
/*
- * Get the current value and apply the modification to it, for a few
- * reasons: first, we set the updated value so the application can
- * retrieve the cursor's value; second, we use the updated value as
- * the update if the update chain is too long; third, there's a check
- * if the updated value is too large to store; fourth, to simplify the
- * count of bytes being added/removed; fifth, we can get into serious
- * trouble if we attempt to modify a value that doesn't exist or read
- * a value that might not exist in the future. For the fifth reason,
- * fail if in anything other than a snapshot transaction, read-committed
- * and read-uncommitted imply values that might disappear out from under
- * us or an inability to repeat point-in-time reads.
+ * Get the current value and apply the modification to it, for a few reasons: first, we set the
+ * updated value so the application can retrieve the cursor's value; second, we use the updated
+ * value as the update if the update chain is too long; third, there's a check if the updated
+ * value is too large to store; fourth, to simplify the count of bytes being added/removed;
+ * fifth, we can get into serious trouble if we attempt to modify a value that doesn't exist or
+ * read a value that might not exist in the future. For the fifth reason, fail if in anything
+ * other than a snapshot transaction, read-committed and read-uncommitted imply values that
+ * might disappear out from under us or an inability to repeat point-in-time reads.
*
- * Also, an application might read a value outside of a transaction and
- * then call modify. For that to work, the read must be part of the
- * transaction that performs the update for correctness, otherwise we
- * could race with another thread and end up modifying the wrong value.
- * A clever application could get this right (imagine threads that only
- * updated non-overlapping, fixed-length byte strings), but it's unsafe
- * because it will work most of the time and the failure is unlikely to
- * be detected. Require explicit transactions for modify operations.
+ * Also, an application might read a value outside of a transaction and then call modify. For
+ * that to work, the read must be part of the transaction that performs the update for
+ * correctness, otherwise we could race with another thread and end up modifying the wrong
+ * value. A clever application could get this right (imagine threads that only updated
+ * non-overlapping, fixed-length byte strings), but it's unsafe because it will work most of the
+ * time and the failure is unlikely to be detected. Require explicit transactions for modify
+ * operations.
*/
if (session->txn.isolation != WT_ISO_SNAPSHOT)
WT_ERR_MSG(session, ENOTSUP,
@@ -1444,9 +1432,8 @@ __wt_btcur_modify(WT_CURSOR_BTREE *cbt, WT_MODIFY *entries, int nentries)
/*
* WT_CURSOR.modify is update-without-overwrite.
*
- * Use the modify buffer as the update if the data package saves us some
- * memory and the update chain is under the limit, else use the complete
- * value.
+ * Use the modify buffer as the update if the data package saves us some memory and the update
+ * chain is under the limit, else use the complete value.
*/
overwrite = F_ISSET(cursor, WT_CURSTD_OVERWRITE);
F_CLR(cursor, WT_CURSTD_OVERWRITE);
@@ -1645,23 +1632,19 @@ __cursor_truncate(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSOR_BT
yield_count = sleep_usecs = 0;
/*
- * First, call the cursor search method to re-position the cursor: we
- * may not have a cursor position (if the higher-level truncate code
- * switched the cursors to have an "external" cursor key, and because
- * we don't save a copy of the page's write generation information,
- * which we need to remove records).
+ * First, call the cursor search method to re-position the cursor: we may not have a cursor position
+ * (if the higher-level truncate code switched the cursors to have an "external" cursor key, and
+ * because we don't save a copy of the page's write generation information, which we need to remove
+ * records).
*
- * Once that's done, we can delete records without a full search, unless
- * we encounter a restart error because the page was modified by some
- * other thread of control; in that case, repeat the full search to
- * refresh the page's modification information.
+ * Once that's done, we can delete records without a full search, unless we encounter a restart
+ * error because the page was modified by some other thread of control; in that case, repeat the
+ * full search to refresh the page's modification information.
*
- * If this is a row-store, we delete leaf pages having no overflow items
- * without reading them; for that to work, we have to ensure we read the
- * page referenced by the ending cursor, since we may be deleting only a
- * partial page at the end of the truncation. Our caller already fully
- * instantiated the end cursor, so we know that page is pinned in memory
- * and we can proceed without concern.
+ * If this is a row-store, we delete leaf pages having no overflow items without reading them; for
+ * that to work, we have to ensure we read the page referenced by the ending cursor, since we may be
+ * deleting only a partial page at the end of the truncation. Our caller already fully instantiated
+ * the end cursor, so we know that page is pinned in memory and we can proceed without concern.
*/
retry:
WT_ERR(__wt_btcur_search(start));
@@ -1703,23 +1686,20 @@ __cursor_truncate_fix(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *start, WT_CURSO
yield_count = sleep_usecs = 0;
/*
- * Handle fixed-length column-store objects separately: for row-store
- * and variable-length column-store objects we have "deleted" values
- * and so returned objects actually exist: fixed-length column-store
- * objects are filled-in if they don't exist, that is, if you create
- * record 37, records 1-36 magically appear. Those records can't be
- * deleted, which means we have to ignore already "deleted" records.
+ * Handle fixed-length column-store objects separately: for row-store and variable-length
+ * column-store objects we have "deleted" values and so returned objects actually exist:
+ * fixed-length column-store objects are filled-in if they don't exist, that is, if you create
+ * record 37, records 1-36 magically appear. Those records can't be deleted, which means we have to
+ * ignore already "deleted" records.
*
- * First, call the cursor search method to re-position the cursor: we
- * may not have a cursor position (if the higher-level truncate code
- * switched the cursors to have an "external" cursor key, and because
- * we don't save a copy of the page's write generation information,
- * which we need to remove records).
+ * First, call the cursor search method to re-position the cursor: we may not have a cursor position
+ * (if the higher-level truncate code switched the cursors to have an "external" cursor key, and
+ * because we don't save a copy of the page's write generation information, which we need to remove
+ * records).
*
- * Once that's done, we can delete records without a full search, unless
- * we encounter a restart error because the page was modified by some
- * other thread of control; in that case, repeat the full search to
- * refresh the page's modification information.
+ * Once that's done, we can delete records without a full search, unless we encounter a restart
+ * error because the page was modified by some other thread of control; in that case, repeat the
+ * full search to refresh the page's modification information.
*/
retry:
WT_ERR(__wt_btcur_search(start));
@@ -1764,13 +1744,12 @@ __wt_btcur_range_truncate(WT_CURSOR_BTREE *start, WT_CURSOR_BTREE *stop)
WT_STAT_DATA_INCR(session, cursor_truncate);
/*
- * For recovery, log the start and stop keys for a truncate operation,
- * not the individual records removed. On the other hand, for rollback
- * we need to keep track of all the in-memory operations.
+ * For recovery, log the start and stop keys for a truncate operation, not the individual
+ * records removed. On the other hand, for rollback we need to keep track of all the in-memory
+ * operations.
*
- * We deal with this here by logging the truncate range first, then (in
- * the logging code) disabling writing of the in-memory remove records
- * to disk.
+ * We deal with this here by logging the truncate range first, then (in the logging code)
+ * disabling writing of the in-memory remove records to disk.
*/
if (FLD_ISSET(S2C(session)->log_flags, WT_CONN_LOG_ENABLED))
WT_RET(__wt_txn_truncate_log(session, start, stop));