summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/btree/row_modify.c
diff options
context:
space:
mode:
authorEliot Horowitz <eliot@10gen.com>2014-11-04 15:46:40 -0500
committerEliot Horowitz <eliot@10gen.com>2014-11-05 11:21:19 -0500
commit5ca2daf551a2c631a5f573cb054406f5d49fbef5 (patch)
treeb0a23d34ffdb376bac0b79ed17b5619cfc0d9b47 /src/third_party/wiredtiger/src/btree/row_modify.c
parent017704acdfc7517efadb3fab167bba06c025c01a (diff)
downloadmongo-5ca2daf551a2c631a5f573cb054406f5d49fbef5.tar.gz
SERVER-15953: add wiredtiger to third_party
Diffstat (limited to 'src/third_party/wiredtiger/src/btree/row_modify.c')
-rw-r--r--src/third_party/wiredtiger/src/btree/row_modify.c346
1 files changed, 346 insertions, 0 deletions
diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c
new file mode 100644
index 00000000000..e0036d14cbb
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/row_modify.c
@@ -0,0 +1,346 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_page_modify_alloc --
+ * Allocate a page's modification structure.
+ */
+int
+__wt_page_modify_alloc(WT_SESSION_IMPL *session, WT_PAGE *page)
+{
+ WT_CONNECTION_IMPL *conn;
+ WT_PAGE_MODIFY *modify;
+
+ conn = S2C(session);
+
+ WT_RET(__wt_calloc_def(session, 1, &modify));
+
+ /*
+ * Select a spinlock for the page; let the barrier immediately below
+ * keep things from racing too badly.
+ */
+ modify->page_lock = ++conn->page_lock_cnt % WT_PAGE_LOCKS(conn);
+
+ /*
+ * Multiple threads of control may be searching and deciding to modify
+ * a page. If our modify structure is used, update the page's memory
+ * footprint, else discard the modify structure, another thread did the
+ * work.
+ */
+ if (WT_ATOMIC_CAS8(page->modify, NULL, modify))
+ __wt_cache_page_inmem_incr(session, page, sizeof(*modify));
+ else
+ __wt_free(session, modify);
+ return (0);
+}
+
+/*
+ * __wt_row_modify --
+ * Row-store insert, update and delete.
+ */
+int
+__wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt,
+ WT_ITEM *key, WT_ITEM *value, WT_UPDATE *upd, int is_remove)
+{
+ WT_DECL_RET;
+ WT_INSERT *ins;
+ WT_INSERT_HEAD *ins_head, **ins_headp;
+ WT_PAGE *page;
+ WT_UPDATE *old_upd, **upd_entry;
+ size_t ins_size, upd_size;
+ uint32_t ins_slot;
+ u_int i, skipdepth;
+ int logged;
+
+ ins = NULL;
+ page = cbt->ref->page;
+ logged = 0;
+
+ /* This code expects a remove to have a NULL value. */
+ if (is_remove)
+ value = NULL;
+
+ /* If we don't yet have a modify structure, we'll need one. */
+ WT_RET(__wt_page_modify_init(session, page));
+
+ /*
+ * Modify: allocate an update array as necessary, build a WT_UPDATE
+ * structure, and call a serialized function to insert the WT_UPDATE
+ * structure.
+ *
+ * Insert: allocate an insert array as necessary, build a WT_INSERT
+ * and WT_UPDATE structure pair, and call a serialized function to
+ * insert the WT_INSERT structure.
+ */
+ if (cbt->compare == 0) {
+ if (cbt->ins == NULL) {
+ /* Allocate an update array as necessary. */
+ WT_PAGE_ALLOC_AND_SWAP(session, page,
+ page->pg_row_upd, upd_entry, page->pg_row_entries);
+
+ /* Set the WT_UPDATE array reference. */
+ upd_entry = &page->pg_row_upd[cbt->slot];
+ } else
+ upd_entry = &cbt->ins->upd;
+
+ if (upd == NULL) {
+ /* Make sure the update can proceed. */
+ WT_ERR(__wt_txn_update_check(
+ session, old_upd = *upd_entry));
+
+ /* Allocate a WT_UPDATE structure and transaction ID. */
+ WT_ERR(
+ __wt_update_alloc(session, value, &upd, &upd_size));
+ WT_ERR(__wt_txn_modify(session, upd));
+ logged = 1;
+
+ /* Avoid WT_CURSOR.update data copy. */
+ cbt->modify_update = upd;
+ } else {
+ upd_size = sizeof(WT_UPDATE) +
+ (WT_UPDATE_DELETED_ISSET(upd) ? 0 : upd->size);
+
+ /*
+ * We are restoring updates that couldn't be evicted,
+ * there should only be one update list per key.
+ */
+ WT_ASSERT(session, *upd_entry == NULL);
+ /*
+ * Set the "old" entry to the second update in the list
+ * so that the serialization function succeeds in
+ * swapping the first update into place.
+ */
+ old_upd = *upd_entry = upd->next;
+ }
+
+ /*
+ * Point the new WT_UPDATE item to the next element in the list.
+ * If we get it right, the serialization function lock acts as
+ * our memory barrier to flush this write.
+ */
+ upd->next = old_upd;
+
+ /* Serialize the update. */
+ WT_ERR(__wt_update_serial(
+ session, page, upd_entry, &upd, upd_size));
+ } else {
+ /*
+ * Allocate the insert array as necessary.
+ *
+ * We allocate an additional insert array slot for insert keys
+ * sorting less than any key on the page. The test to select
+ * that slot is baroque: if the search returned the first page
+ * slot, we didn't end up processing an insert list, and the
+ * comparison value indicates the search key was smaller than
+ * the returned slot, then we're using the smallest-key insert
+ * slot. That's hard, so we set a flag.
+ */
+ WT_PAGE_ALLOC_AND_SWAP(session, page,
+ page->pg_row_ins, ins_headp, page->pg_row_entries + 1);
+
+ ins_slot = F_ISSET(cbt, WT_CBT_SEARCH_SMALLEST) ?
+ page->pg_row_entries: cbt->slot;
+ ins_headp = &page->pg_row_ins[ins_slot];
+
+ /* Allocate the WT_INSERT_HEAD structure as necessary. */
+ WT_PAGE_ALLOC_AND_SWAP(session, page, *ins_headp, ins_head, 1);
+ ins_head = *ins_headp;
+
+ /* Choose a skiplist depth for this insert. */
+ skipdepth = __wt_skip_choose_depth(session);
+
+ /*
+ * Allocate a WT_INSERT/WT_UPDATE pair and transaction ID, and
+ * update the cursor to reference it (the WT_INSERT_HEAD might
+ * be allocated, the WT_INSERT was allocated).
+ */
+ WT_ERR(__wt_row_insert_alloc(
+ session, key, skipdepth, &ins, &ins_size));
+ cbt->ins_head = ins_head;
+ cbt->ins = ins;
+
+ if (upd == NULL) {
+ WT_ERR(
+ __wt_update_alloc(session, value, &upd, &upd_size));
+ WT_ERR(__wt_txn_modify(session, upd));
+ logged = 1;
+
+ /* Avoid WT_CURSOR.update data copy. */
+ cbt->modify_update = upd;
+ } else
+ upd_size = sizeof(WT_UPDATE) +
+ (WT_UPDATE_DELETED_ISSET(upd) ? 0 : upd->size);
+
+ ins->upd = upd;
+ ins_size += upd_size;
+
+ /*
+ * If there was no insert list during the search, the cursor's
+ * information cannot be correct, search couldn't have
+ * initialized it.
+ *
+ * Otherwise, point the new WT_INSERT item's skiplist to the
+ * next elements in the insert list (which we will check are
+ * still valid inside the serialization function).
+ *
+ * The serial mutex acts as our memory barrier to flush these
+ * writes before inserting them into the list.
+ */
+ if (WT_SKIP_FIRST(ins_head) == NULL)
+ for (i = 0; i < skipdepth; i++) {
+ cbt->ins_stack[i] = &ins_head->head[i];
+ ins->next[i] = cbt->next_stack[i] = NULL;
+ }
+ else
+ for (i = 0; i < skipdepth; i++)
+ ins->next[i] = cbt->next_stack[i];
+
+ /* Insert the WT_INSERT structure. */
+ WT_ERR(__wt_insert_serial(
+ session, page, cbt->ins_head, cbt->ins_stack,
+ &ins, ins_size, skipdepth));
+ }
+
+ if (logged)
+ WT_ERR(__wt_txn_log_op(session, cbt));
+
+ if (0) {
+err: /*
+ * Remove the update from the current transaction, so we don't
+ * try to modify it on rollback.
+ */
+ if (logged)
+ __wt_txn_unmodify(session);
+ __wt_free(session, ins);
+ cbt->ins = NULL;
+ __wt_free(session, upd);
+ }
+
+ return (ret);
+}
+
+/*
+ * __wt_row_insert_alloc --
+ * Row-store insert: allocate a WT_INSERT structure and fill it in.
+ */
+int
+__wt_row_insert_alloc(WT_SESSION_IMPL *session,
+ WT_ITEM *key, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep)
+{
+ WT_INSERT *ins;
+ size_t ins_size;
+
+ /*
+ * Allocate the WT_INSERT structure, next pointers for the skip list,
+ * and room for the key. Then copy the key into place.
+ */
+ ins_size = sizeof(WT_INSERT) +
+ skipdepth * sizeof(WT_INSERT *) + key->size;
+ WT_RET(__wt_calloc(session, 1, ins_size, &ins));
+
+ ins->u.key.offset = WT_STORE_SIZE(ins_size - key->size);
+ WT_INSERT_KEY_SIZE(ins) = WT_STORE_SIZE(key->size);
+ memcpy(WT_INSERT_KEY(ins), key->data, key->size);
+
+ *insp = ins;
+ if (ins_sizep != NULL)
+ *ins_sizep = ins_size;
+ return (0);
+}
+
+/*
+ * __wt_update_alloc --
+ * Allocate a WT_UPDATE structure and associated value and fill it in.
+ */
+int
+__wt_update_alloc(
+ WT_SESSION_IMPL *session, WT_ITEM *value, WT_UPDATE **updp, size_t *sizep)
+{
+ WT_UPDATE *upd;
+ size_t size;
+
+ /*
+ * Allocate the WT_UPDATE structure and room for the value, then copy
+ * the value into place.
+ */
+ size = value == NULL ? 0 : value->size;
+ WT_RET(__wt_calloc(session, 1, sizeof(WT_UPDATE) + size, &upd));
+ if (value == NULL)
+ WT_UPDATE_DELETED_SET(upd);
+ else {
+ upd->size = WT_STORE_SIZE(size);
+ memcpy(WT_UPDATE_DATA(upd), value->data, size);
+ }
+
+ *updp = upd;
+ *sizep = sizeof(WT_UPDATE) + size;
+ return (0);
+}
+
+/*
+ * __wt_update_obsolete_check --
+ * Check for obsolete updates.
+ */
+WT_UPDATE *
+__wt_update_obsolete_check(WT_SESSION_IMPL *session, WT_UPDATE *upd)
+{
+ WT_UPDATE *first, *next;
+
+ /*
+ * This function identifies obsolete updates, and truncates them from
+ * the rest of the chain; because this routine is called from inside
+ * a serialization function, the caller has responsibility for actually
+ * freeing the memory.
+ *
+ * Walk the list of updates, looking for obsolete updates at the end.
+ */
+ for (first = NULL; upd != NULL; upd = upd->next)
+ if (__wt_txn_visible_all(session, upd->txnid)) {
+ if (first == NULL)
+ first = upd;
+ } else if (upd->txnid != WT_TXN_ABORTED)
+ first = NULL;
+
+ /*
+ * We cannot discard this WT_UPDATE structure, we can only discard
+ * WT_UPDATE structures subsequent to it, other threads of control will
+ * terminate their walk in this element. Save a reference to the list
+ * we will discard, and terminate the list.
+ */
+ if (first != NULL &&
+ (next = first->next) != NULL &&
+ WT_ATOMIC_CAS8(first->next, next, NULL))
+ return (next);
+
+ return (NULL);
+}
+
+/*
+ * __wt_update_obsolete_free --
+ * Free an obsolete update list.
+ */
+void
+__wt_update_obsolete_free(
+ WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd)
+{
+ WT_UPDATE *next;
+ size_t size;
+
+ /* Free a WT_UPDATE list. */
+ for (size = 0; upd != NULL; upd = next) {
+ /* Deleted items have a dummy size: don't include that. */
+ size += sizeof(WT_UPDATE) +
+ (WT_UPDATE_DELETED_ISSET(upd) ? 0 : upd->size);
+
+ next = upd->next;
+ __wt_free(session, upd);
+ }
+ if (size != 0)
+ __wt_cache_page_inmem_decr(session, page, size);
+}