diff options
author | Michael Cahill <michael.cahill@wiredtiger.com> | 2012-02-03 13:25:00 +1100 |
---|---|---|
committer | Michael Cahill <michael.cahill@wiredtiger.com> | 2012-02-03 13:25:00 +1100 |
commit | 3438d9140b9937ad16350087c7cfed379c7ebf9b (patch) | |
tree | 966fc0e033bf0fd29d188f03dae93fe5e4687080 /src/btree/row_modify.c | |
parent | c01aa57f3619417152d3b27d91856b9bf03d3bb0 (diff) | |
parent | 8347925459db58c6c74dc80f437d90ed6ffeac54 (diff) | |
download | mongo-3438d9140b9937ad16350087c7cfed379c7ebf9b.tar.gz |
merge tip
Diffstat (limited to 'src/btree/row_modify.c')
-rw-r--r-- | src/btree/row_modify.c | 321 |
1 file changed, 321 insertions, 0 deletions
/*-
 * Copyright (c) 2008-2012 WiredTiger, Inc.
 * All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __wt_row_modify --
 *	Row-store insert, update and delete.
 *
 *	If the cursor's search found an exact match (cbt->compare == 0),
 *	chain a new WT_UPDATE onto the matching entry; otherwise build a
 *	WT_INSERT/WT_UPDATE pair and hand it to the insert skiplist.  A
 *	remove is recorded as an update with a NULL value (a deleted marker),
 *	not by physically removing anything.
 *
 *	session:	current session
 *	cbt:		btree cursor, already positioned by a search
 *	is_remove:	non-zero to delete the key, zero to insert/update
 *
 *	Returns 0 on success, else a WiredTiger error code.
 */
int
__wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, int is_remove)
{
	WT_INSERT *ins;
	WT_INSERT_HEAD **inshead, *new_inshead, **new_inslist;
	WT_ITEM *key, *value;
	WT_PAGE *page;
	WT_UPDATE **new_upd, *upd, **upd_entry;
	size_t ins_size, upd_size;
	size_t new_inshead_size, new_inslist_size, new_upd_size;
	uint32_t ins_slot;
	u_int skipdepth;
	int i, ret;

	key = &cbt->iface.key;
	/* A NULL value is how a deletion is represented downstream. */
	value = is_remove ? NULL : &cbt->iface.value;

	page = cbt->page;

	ins = NULL;
	new_inshead = NULL;
	new_inslist = NULL;
	new_upd = NULL;
	upd = NULL;
	ret = 0;

	/*
	 * Modify: allocate an update array as necessary, build a WT_UPDATE
	 * structure, and call a serialized function to insert the WT_UPDATE
	 * structure.
	 *
	 * Insert: allocate an insert array as necessary, build a WT_INSERT
	 * and WT_UPDATE structure pair, and call a serialized function to
	 * insert the WT_INSERT structure.
	 *
	 * NOTE(review): the serialized functions below appear to take
	 * ownership of the structures they consume (via the *_taken calls);
	 * the frees at the bottom of this function presumably see NULLed or
	 * still-owned pointers for anything consumed -- confirm against the
	 * serialization macros, otherwise this would double-free.
	 */
	if (cbt->compare == 0) {
		new_upd_size = 0;
		if (cbt->ins == NULL) {
			/*
			 * Allocate an update array as necessary.
			 *
			 * Set the WT_UPDATE array reference: either a slot in
			 * the freshly allocated array or the page's existing
			 * one.
			 */
			if (page->u.row.upd == NULL) {
				WT_ERR(__wt_calloc_def(
				    session, page->entries, &new_upd));
				new_upd_size =
				    page->entries * sizeof(WT_UPDATE *);
				upd_entry = &new_upd[cbt->slot];
			} else
				upd_entry = &page->u.row.upd[cbt->slot];
		} else
			/* Key lives on an insert list: update chain is there. */
			upd_entry = &cbt->ins->upd;

		/* Allocate room for the new value from per-thread memory. */
		WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));

		/* Insert the WT_UPDATE structure (serialized). */
		ret = __wt_update_serial(session, page, cbt->write_gen,
		    upd_entry, &new_upd, new_upd_size, &upd, upd_size);
	} else {
		/*
		 * Allocate insert array if necessary, and set the array
		 * reference.
		 *
		 * We allocate an additional insert array slot for insert keys
		 * sorting less than any key on the page.  The test to select
		 * that slot is baroque: if the search returned the first page
		 * slot, we didn't end up processing an insert list, and the
		 * comparison value indicates the search key was smaller than
		 * the returned slot, then we're using the smallest-key insert
		 * slot.  That's hard, so we set a flag.
		 */
		ins_slot = F_ISSET(
		    cbt, WT_CBT_SEARCH_SMALLEST) ? page->entries : cbt->slot;

		new_inshead_size = new_inslist_size = 0;
		if (page->u.row.ins == NULL) {
			/* +1 for the smallest-key slot described above. */
			WT_ERR(__wt_calloc_def(
			    session, page->entries + 1, &new_inslist));
			new_inslist_size =
			    (page->entries + 1) * sizeof(WT_INSERT_HEAD *);
			inshead = &new_inslist[ins_slot];
		} else
			inshead = &page->u.row.ins[ins_slot];

		/*
		 * Allocate a new insert list head as necessary.
		 *
		 * If allocating a new insert list head, we have to initialize
		 * the cursor's insert list stack and insert head reference as
		 * well, search couldn't have (there was no list to search).
		 */
		if (*inshead == NULL) {
			new_inshead_size = sizeof(WT_INSERT_HEAD);
			WT_ERR(__wt_calloc_def(session, 1, &new_inshead));
			for (i = 0; i < WT_SKIP_MAXDEPTH; i++)
				cbt->ins_stack[i] = &new_inshead->head[i];
			cbt->ins_head = new_inshead;
		}

		/* Choose a skiplist depth for this insert. */
		skipdepth = __wt_skip_choose_depth();

		/*
		 * Allocate a WT_INSERT/WT_UPDATE pair, and update the cursor
		 * to reference it.  ins_size accumulates the pair's total
		 * footprint for the serialized function's accounting.
		 */
		WT_ERR(__wt_row_insert_alloc(
		    session, key, skipdepth, &ins, &ins_size));
		WT_ERR(__wt_update_alloc(session, value, &upd, &upd_size));
		ins->upd = upd;
		ins_size += upd_size;
		cbt->ins = ins;

		/* Insert the WT_INSERT structure (serialized). */
		ret = __wt_insert_serial(session, page, cbt->write_gen,
		    inshead, cbt->ins_stack,
		    &new_inslist, new_inslist_size,
		    &new_inshead, new_inshead_size,
		    &ins, ins_size, skipdepth);
	}

	/*
	 * Error path: free the WT_INSERT/WT_UPDATE pair if they were never
	 * published.  (The err label is inside the if on purpose: WT_ERR
	 * jumps here, and a failing serial call falls into the same cleanup.)
	 */
	if (ret != 0) {
err:		if (ins != NULL)
			__wt_free(session, ins);
		if (upd != NULL)
			__wt_free(session, upd);
	}

	/* Free any insert, update arrays not taken by the serial functions. */
	__wt_free(session, new_inslist);
	__wt_free(session, new_inshead);
	__wt_free(session, new_upd);

	return (ret);
}

/*
 * __wt_row_insert_alloc --
 *	Row-store insert: allocate a WT_INSERT structure from the session's
 *	buffer and fill it in.
 *
 *	The WT_INSERT, its skiplist next-pointer array (skipdepth entries)
 *	and a copy of the key are carved out of one allocation; the key is
 *	stored at the tail, located via u.key.offset.
 *
 *	insp:		out: the new structure
 *	ins_sizep:	out (optional): total bytes allocated
 */
int
__wt_row_insert_alloc(WT_SESSION_IMPL *session,
    WT_ITEM *key, u_int skipdepth, WT_INSERT **insp, size_t *ins_sizep)
{
	WT_INSERT *ins;
	size_t ins_size;

	/*
	 * Allocate the WT_INSERT structure, next pointers for the skip list,
	 * and room for the key.  Then copy the key into place.
	 */
	ins_size = sizeof(WT_INSERT) +
	    skipdepth * sizeof(WT_INSERT *) + key->size;
	WT_RET(__wt_calloc(session, 1, ins_size, &ins));

	/* The key starts right after the variable-length next[] array. */
	ins->u.key.offset = WT_STORE_SIZE(ins_size - key->size);
	WT_INSERT_KEY_SIZE(ins) = key->size;
	memcpy(WT_INSERT_KEY(ins), key->data, key->size);

	*insp = ins;
	if (ins_sizep != NULL)
		*ins_sizep = ins_size;
	return (0);
}

/*
 * __wt_insert_serial_func --
 *	Server function to add an WT_INSERT entry to the page.
 *
 *	Runs serialized: unpacks the arguments packed by __wt_insert_serial,
 *	re-checks the page's write generation (fails if the page changed
 *	since the caller searched it), installs any caller-allocated insert
 *	array / insert head, and finally links the new WT_INSERT into the
 *	skiplist with publish ordering so concurrent readers never see an
 *	inconsistent list.
 */
void
__wt_insert_serial_func(WT_SESSION_IMPL *session)
{
	WT_INSERT *new_ins, ***ins_stack;
	WT_INSERT_HEAD **inshead, **new_inslist, *new_inshead;
	WT_PAGE *page;
	uint32_t write_gen;
	u_int i, skipdepth;
	int ret;

	ret = 0;

	__wt_insert_unpack(session, &page, &write_gen, &inshead,
	    &ins_stack, &new_inslist, &new_inshead, &new_ins, &skipdepth);

	/* Check the page's write-generation. */
	WT_ERR(__wt_page_write_gen_check(page, write_gen));

	/*
	 * If the page does not yet have an insert array, our caller passed
	 * us one.  Row-store leaf pages keep it in u.row.ins; other page
	 * types keep it in the page-modify update field.
	 */
	if (page->type == WT_PAGE_ROW_LEAF) {
		if (page->u.row.ins == NULL) {
			page->u.row.ins = new_inslist;
			__wt_insert_new_inslist_taken(session, page);
		}
	} else
		if (page->modify->update == NULL) {
			page->modify->update = new_inslist;
			__wt_insert_new_inslist_taken(session, page);
		}

	/*
	 * If the insert head does not yet have an insert list, our caller
	 * passed us one.
	 */
	if (*inshead == NULL) {
		*inshead = new_inshead;
		__wt_insert_new_inshead_taken(session, page);
	}

	/*
	 * Publish: First, point the new WT_INSERT item's skiplist references
	 * to the next elements in the insert list, then flush memory.  Second,
	 * update the skiplist elements that reference the new WT_INSERT item,
	 * this ensures the list is never inconsistent.  (The barrier between
	 * the two loops is what makes the node fully formed before it becomes
	 * reachable.)
	 */
	for (i = 0; i < skipdepth; i++)
		new_ins->next[i] = *ins_stack[i];
	WT_WRITE_BARRIER();
	for (i = 0; i < skipdepth; i++) {
		/* Maintain the per-level tail pointer when appending. */
		if ((*inshead)->tail[i] == NULL ||
		    ins_stack[i] == &(*inshead)->tail[i]->next[i])
			(*inshead)->tail[i] = new_ins;
		*ins_stack[i] = new_ins;
	}

	__wt_insert_new_ins_taken(session, page);

err:	__wt_session_serialize_wrapup(session, page, ret);
}

/*
 * __wt_update_alloc --
 *	Allocate a WT_UPDATE structure and associated value from the session's
 *	buffer and fill it in.
 *
 *	A NULL value produces a "deleted" marker (WT_UPDATE_DELETED_SET)
 *	rather than a zero-length value.
 *
 *	updp:	out: the new structure
 *	sizep:	out (optional): total bytes allocated
 */
int
__wt_update_alloc(WT_SESSION_IMPL *session,
    WT_ITEM *value, WT_UPDATE **updp, size_t *sizep)
{
	WT_UPDATE *upd;
	size_t size;

	/*
	 * Allocate the WT_UPDATE structure and room for the value, then copy
	 * the value into place.
	 */
	size = value == NULL ? 0 : value->size;
	WT_RET(__wt_calloc(session, 1, sizeof(WT_UPDATE) + size, &upd));
	if (value == NULL)
		WT_UPDATE_DELETED_SET(upd);
	else {
		upd->size = WT_STORE_SIZE(size);
		memcpy(WT_UPDATE_DATA(upd), value->data, size);
	}

	*updp = upd;
	if (sizep != NULL)
		*sizep = sizeof(WT_UPDATE) + size;
	return (0);
}

/*
 * __wt_update_serial_func --
 *	Server function to add an WT_UPDATE entry in the page array.
 *
 *	Runs serialized: re-checks the page's write generation, installs a
 *	caller-allocated update array if the page needs one, then pushes the
 *	new WT_UPDATE onto the front of the entry's update chain with a
 *	publish barrier so readers see a fully linked node.
 */
void
__wt_update_serial_func(WT_SESSION_IMPL *session)
{
	WT_PAGE *page;
	WT_UPDATE **new_upd, *upd, **upd_entry;
	uint32_t write_gen;
	int ret;

	ret = 0;

	__wt_update_unpack(
	    session, &page, &write_gen, &upd_entry, &new_upd, &upd);

	/* Check the page's write-generation. */
	WT_ERR(__wt_page_write_gen_check(page, write_gen));

	/*
	 * If the page needs an update array (column-store pages and inserts on
	 * row-store pages do not use the update array), our caller passed us
	 * one of the correct size.  Check the page still needs one (the write
	 * generation test should have caught that, though).
	 */
	if (new_upd != NULL && page->u.row.upd == NULL) {
		page->u.row.upd = new_upd;
		__wt_update_new_upd_taken(session, page);
	}

	/* Newest update goes at the head of the chain. */
	upd->next = *upd_entry;
	/*
	 * Publish: there must be a barrier to ensure the new entry's next
	 * pointer is set before we update the linked list.
	 */
	WT_PUBLISH(*upd_entry, upd);

	__wt_update_upd_taken(session, page);

err:	__wt_session_serialize_wrapup(session, page, ret);
}