author     Michael Cahill <michael.cahill@wiredtiger.com>   2012-09-28 17:39:45 +1000
committer  Michael Cahill <michael.cahill@wiredtiger.com>   2012-09-28 17:39:45 +1000
commit     4be1afdf665bf4bd4449910049fcb0c0ccb9286e (patch)
tree       43388e016235ac31f5c9a51e58e0325162ac52ec /src
parent     952577567aaaeb2076d445b18fb88d28a2e8da67 (diff)
parent     c8a4341d852d323d0029b0b0a1e48a58ab2f23b6 (diff)
download   mongo-4be1afdf665bf4bd4449910049fcb0c0ccb9286e.tar.gz
Merge branch 'develop' into data-handles
Conflicts:
src/btree/bt_evict.c
Diffstat (limited to 'src')
-rw-r--r--  src/btree/bt_curnext.c      2
-rw-r--r--  src/btree/bt_curprev.c      1
-rw-r--r--  src/btree/bt_cursor.c       1
-rw-r--r--  src/btree/bt_evict.c       10
-rw-r--r--  src/btree/bt_page.c        12
-rw-r--r--  src/btree/rec_evict.c       6
-rw-r--r--  src/btree/rec_write.c      28
-rw-r--r--  src/conn/conn_handle.c      3
-rw-r--r--  src/docs/cursor-ops.dox    10
-rw-r--r--  src/docs/transactions.dox   5
-rw-r--r--  src/include/api.h           8
-rw-r--r--  src/include/cursor.i       13
-rw-r--r--  src/include/lsm.h           9
-rw-r--r--  src/include/txn.h          14
-rw-r--r--  src/include/txn.i           4
-rw-r--r--  src/include/wt_internal.h   2
-rw-r--r--  src/lsm/lsm_meta.c          5
-rw-r--r--  src/lsm/lsm_worker.c       84
-rw-r--r--  src/txn/txn.c              52
19 files changed, 166 insertions, 103 deletions
diff --git a/src/btree/bt_curnext.c b/src/btree/bt_curnext.c
index d8c188f3198..3fcf8dcd716 100644
--- a/src/btree/bt_curnext.c
+++ b/src/btree/bt_curnext.c
@@ -403,6 +403,7 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, int discard)
         LF_SET(WT_TREE_DISCARD);
     __cursor_func_init(cbt, 0);
+    __cursor_position_clear(cbt);
     /*
      * If we aren't already iterating in the right direction, there's
@@ -507,6 +508,7 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
     WT_BSTAT_INCR(session, cursor_read_next);
     __cursor_func_init(cbt, 1);
+    __cursor_position_clear(cbt);
     /*
      * Only supports row-store: applications can trivially select a random
diff --git a/src/btree/bt_curprev.c b/src/btree/bt_curprev.c
index fe21472f2fe..baa256010e8 100644
--- a/src/btree/bt_curprev.c
+++ b/src/btree/bt_curprev.c
@@ -491,6 +491,7 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, int discard)
         LF_SET(WT_TREE_DISCARD);
     __cursor_func_init(cbt, 0);
+    __cursor_position_clear(cbt);
     /*
      * If we aren't already iterating in the right direction, there's
diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c
index 3b880d6a749..8265c4ccecf 100644
--- a/src/btree/bt_cursor.c
+++ b/src/btree/bt_cursor.c
@@ -112,6 +112,7 @@ __wt_btcur_reset(WT_CURSOR_BTREE *cbt)
     __cursor_leave(cbt);
     __cursor_search_clear(cbt);
+    __cursor_position_clear(cbt);
     return (0);
 }
diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c
index 973f623893a..24236459966 100644
--- a/src/btree/bt_evict.c
+++ b/src/btree/bt_evict.c
@@ -946,6 +946,7 @@ __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app)
 {
     WT_BTREE *btree;
     WT_DATA_HANDLE *saved_dhandle;
+    WT_DECL_RET;
     WT_PAGE *page;
     __evict_get_page(session, is_app, &btree, &page);
@@ -958,19 +959,14 @@ __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app)
     saved_dhandle = session->dhandle;
     WT_SET_BTREE_IN_SESSION(session, btree);
-    /*
-     * We don't care why eviction failed (maybe the page was dirty and
-     * we're out of disk space, or the page had an in-memory subtree
-     * already being evicted).
-     */
-    (void)__evict_page(session, page);
+    ret = __evict_page(session, page);
     (void)WT_ATOMIC_SUB(btree->lru_count, 1);
     WT_CLEAR_BTREE_IN_SESSION(session);
     session->dhandle = saved_dhandle;
-    return (0);
+    return (ret);
 }
 /*
diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c
index fbf8985cf37..d5973dbd3f3 100644
--- a/src/btree/bt_page.c
+++ b/src/btree/bt_page.c
@@ -42,9 +42,19 @@ __wt_page_in_func(
     case WT_REF_DISK:
     case WT_REF_DELETED:
         /* The page isn't in memory, attempt to read it. */
+
+        /* Check if there is space in the cache. */
         __wt_eviction_check(session, &read_lockout, wake);
         wake = 0;
-        if (read_lockout)
+
+        /*
+         * If the cache is full, give up, but only if we are
+         * not holding the schema lock.  The schema lock can
+         * block checkpoints, and thus eviction, so it is not
+         * safe to wait for eviction if we are holding it.
+         */
+        if (read_lockout &&
+            !F_ISSET(session, WT_SESSION_SCHEMA_LOCKED))
             break;
         WT_RET(__wt_cache_read(session, parent, ref));
diff --git a/src/btree/rec_evict.c b/src/btree/rec_evict.c
index c20d9ea4534..92bba2bed82 100644
--- a/src/btree/rec_evict.c
+++ b/src/btree/rec_evict.c
@@ -339,10 +339,10 @@ __rec_review(WT_SESSION_IMPL *session,
         }
         /*
-         * If no pages are referenced, there are no consistency
-         * issues: try to bump our snapshot.
+         * If there aren't multiple cursors active, there
+         * are no consistency issues: try to bump our snapshot.
         */
-        if (session->nhazard == 0) {
+        if (session->ncursors <= 1) {
             __wt_txn_read_last(session);
             __wt_txn_read_first(session);
         }
diff --git a/src/btree/rec_write.c b/src/btree/rec_write.c
index b23b911c345..aab438ebb15 100644
--- a/src/btree/rec_write.c
+++ b/src/btree/rec_write.c
@@ -1965,7 +1965,7 @@ __rec_col_var(WT_SESSION_IMPL *session,
     WT_INSERT *ins;
     WT_INSERT_HEAD *append;
     WT_ITEM *last;
-    WT_UPDATE *next_upd, *upd;
+    WT_UPDATE *upd;
     uint64_t n, nrepeat, repeat_count, rle, slvg_missing, src_recno;
     uint32_t i, size;
     int deleted, last_deleted, orig_deleted, update_no_copy;
@@ -2016,21 +2016,13 @@ __rec_col_var(WT_SESSION_IMPL *session,
     WT_COL_FOREACH(page, cip, i) {
         ovfl_state = OVFL_IGNORE;
         if ((cell = WT_COL_PTR(page, cip)) == NULL) {
-            ins = NULL;
             nrepeat = 1;
+            ins = NULL;
             orig_deleted = 1;
         } else {
             __wt_cell_unpack(cell, unpack);
             nrepeat = __wt_cell_rle(unpack);
-            ins = WT_SKIP_FIRST(WT_COL_UPDATE(page, cip));
-            while (ins != NULL) {
-                WT_ERR(
-                    __rec_txn_read(session, r, ins->upd, &upd));
-                if (upd != NULL)
-                    break;
-                ins = WT_SKIP_NEXT(ins);
-            }
             /*
              * If the original value is "deleted", there's no value
@@ -2090,19 +2082,13 @@ record_loop:	/*
         */
        for (n = 0; n < nrepeat;
            n += repeat_count, src_recno += repeat_count) {
-            if (ins != NULL &&
-                WT_INSERT_RECNO(ins) == src_recno) {
+            upd = NULL;
+            if (ins != NULL && WT_INSERT_RECNO(ins) == src_recno) {
                 WT_ERR(
                     __rec_txn_read(session, r, ins->upd, &upd));
-                WT_ASSERT(session, upd != NULL);
-                do {
-                    ins = WT_SKIP_NEXT(ins);
-                    if (ins == NULL)
-                        break;
-                    WT_ERR(__rec_txn_read(
-                        session, r, ins->upd, &next_upd));
-                } while (next_upd == NULL);
-
+                ins = WT_SKIP_NEXT(ins);
+            }
+            if (upd != NULL) {
                 update_no_copy = 1;	/* No data copy */
                 repeat_count = 1;
diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c
index 4b393d1550d..d673a37cfc5 100644
--- a/src/conn/conn_handle.c
+++ b/src/conn/conn_handle.c
@@ -73,8 +73,9 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn)
     __wt_spin_destroy(session, &conn->api_lock);
     __wt_spin_destroy(session, &conn->fh_lock);
-    __wt_spin_destroy(session, &conn->serial_lock);
+    __wt_spin_destroy(session, &conn->metadata_lock);
     __wt_spin_destroy(session, &conn->schema_lock);
+    __wt_spin_destroy(session, &conn->serial_lock);
     /* Free allocated memory. */
     __wt_free(session, conn->home);
diff --git a/src/docs/cursor-ops.dox b/src/docs/cursor-ops.dox
index 84d089446cb..54a174a09c2 100644
--- a/src/docs/cursor-ops.dox
+++ b/src/docs/cursor-ops.dox
@@ -101,8 +101,14 @@ To remove existing data using a cursor, use the WT_CURSOR::remove method:
 @section cursor_error Cursor position after error
 After any cursor handle method failure, the cursor's position is
-undetermined. Applications that cannot re-position the cursor after
-failure must duplicate the cursor before calling a cursor method that will
+undetermined. For cursor operations that expect a key to be set before the
+operation begins (including WT_CURSOR::search, WT_CURSOR::insert,
+WT_CURSOR::update and WT_CURSOR::remove), the application's key and value
+will not be cleared by an error.
+
+Applications that cannot re-position the cursor after failure must
+duplicate the cursor by calling WT_SESSION::open_cursor and passing the
+cursor as the \c to_dup parameter before calling a cursor method that will
 attempt to re-position the cursor.
 */
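[Editor's note] The cursor-ops.dox hunk above points applications that cannot re-position a cursor after an error at WT_SESSION::open_cursor with the \c to_dup parameter. The following sketch is not part of this commit; it illustrates that duplication pattern against the public WiredTiger C API, with an illustrative table name and a home directory ("WT_HOME") that is assumed to already exist:

```c
#include <stdio.h>
#include <stdlib.h>
#include <wiredtiger.h>

/* Exit on any WiredTiger error; a real application would recover. */
#define CHECK(call) do {						\
	int __ret = (call);						\
	if (__ret != 0) {						\
		fprintf(stderr, "%s\n", wiredtiger_strerror(__ret));	\
		exit(EXIT_FAILURE);					\
	}								\
} while (0)

int
main(void)
{
	WT_CONNECTION *conn;
	WT_SESSION *session;
	WT_CURSOR *cursor, *dup;

	/* "WT_HOME" must be an existing, empty directory. */
	CHECK(wiredtiger_open("WT_HOME", NULL, "create", &conn));
	CHECK(conn->open_session(conn, NULL, NULL, &session));
	CHECK(session->create(session,
	    "table:example", "key_format=S,value_format=S"));
	CHECK(session->open_cursor(
	    session, "table:example", NULL, NULL, &cursor));

	cursor->set_key(cursor, "key1");
	cursor->set_value(cursor, "value1");
	CHECK(cursor->insert(cursor));

	/* Position the cursor on the record we just inserted. */
	cursor->set_key(cursor, "key1");
	CHECK(cursor->search(cursor));

	/*
	 * Duplicate the positioned cursor: pass NULL for the URI and the
	 * original cursor as the to_dup argument.  If a later call on
	 * "cursor" fails and its position becomes undetermined, "dup"
	 * still remembers where we were.
	 */
	CHECK(session->open_cursor(session, NULL, cursor, NULL, &dup));

	CHECK(dup->close(dup));
	CHECK(cursor->close(cursor));
	CHECK(conn->close(conn, NULL));
	return (0);
}
```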
diff --git a/src/docs/transactions.dox b/src/docs/transactions.dox
index efc5934f9aa..4bc745858b8 100644
--- a/src/docs/transactions.dox
+++ b/src/docs/transactions.dox
@@ -72,6 +72,11 @@ updating the same value will fail with ::WT_DEADLOCK. Some applications may
 benefit from application-level synchronization to avoid repeated attempts
 to rollback and update the same value.
 
+Operations in transactions may also fail with the ::WT_DEADLOCK error if
+some resource cannot be allocated after repeated attempts. For example, if
+the cache is not large enough to hold the updates required to satisfy
+transactional readers, an operation may fail and return ::WT_DEADLOCK.
+
 @section transaction_isolation Isolation levels
 
 WiredTiger supports <code>read-uncommitted</code>,
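[Editor's note] The transactions.dox addition above documents that ::WT_DEADLOCK can come back both from update conflicts and from cache pressure, and that applications are expected to roll back and retry. The following sketch is not part of this commit; the helper name and retry bound are made up for illustration, and the session and cursor are assumed to be already open:

```c
#include <wiredtiger.h>

/*
 * Update a single key inside an explicit transaction, retrying a bounded
 * number of times when WiredTiger returns WT_DEADLOCK (an update conflict,
 * or the cache cannot hold the updates needed by concurrent readers).
 */
static int
update_with_retry(WT_SESSION *session, WT_CURSOR *cursor,
    const char *key, const char *value, int max_retries)
{
	int attempt, ret;

	for (attempt = 0; attempt < max_retries; attempt++) {
		if ((ret = session->begin_transaction(session, NULL)) != 0)
			return (ret);

		cursor->set_key(cursor, key);
		cursor->set_value(cursor, value);
		ret = cursor->update(cursor);

		if (ret == 0)
			return (session->commit_transaction(session, NULL));

		/* Always release the transaction before deciding to retry. */
		(void)session->rollback_transaction(session, NULL);
		if (ret != WT_DEADLOCK)
			return (ret);
		/* WT_DEADLOCK: another thread won; back off and try again. */
	}
	return (WT_DEADLOCK);
}
```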
diff --git a/src/include/api.h b/src/include/api.h
index f438ab15ef7..c5799f2d4ef 100644
--- a/src/include/api.h
+++ b/src/include/api.h
@@ -334,14 +334,18 @@ struct __wt_connection_impl {
         } \
         ret = __wt_txn_commit((s), NULL); \
     } else { \
-        WT_TRET(WT_DEADLOCK); \
         (void)__wt_txn_rollback((s), NULL); \
+        if (ret == 0 || ret == WT_DEADLOCK) { \
+            ret = 0; \
+            continue; \
+        } \
     } \
 } else if ((ret) != 0 && \
     (ret) != WT_NOTFOUND && \
     (ret) != WT_DUPLICATE_KEY) \
     F_SET(&(s)->txn, TXN_ERROR); \
-} while (0)
+    break; \
+} while (1)
 
 /*
  * If a session or connection method is about to return WT_NOTFOUND (some
diff --git a/src/include/cursor.i b/src/include/cursor.i
index 46ac684aa1f..5077409e8cc 100644
--- a/src/include/cursor.i
+++ b/src/include/cursor.i
@@ -17,6 +17,16 @@ __cursor_set_recno(WT_CURSOR_BTREE *cbt, uint64_t v)
 }
 
 /*
+ * __cursor_position_clear --
+ *	Forget the current key and value in a cursor.
+ */
+static inline void
+__cursor_position_clear(WT_CURSOR_BTREE *cbt)
+{
+	F_CLR(&cbt->iface, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+}
+
+/*
 * __cursor_search_clear --
 *	Reset the cursor's state for a search.
 */
@@ -60,9 +70,6 @@ __cursor_leave(WT_CURSOR_BTREE *cbt)
     __wt_stack_release(session, cbt->page);
     cbt->page = NULL;
 
-    /* Reset the returned key/value state. */
-    F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
-
     if (F_ISSET(cbt, WT_CBT_ACTIVE)) {
         WT_ASSERT(session, session->ncursors > 0);
         if (--session->ncursors == 0)
diff --git a/src/include/lsm.h b/src/include/lsm.h
index c97f940437f..30054491975 100644
--- a/src/include/lsm.h
+++ b/src/include/lsm.h
@@ -82,3 +82,12 @@ struct __wt_lsm_data_source {
     WT_RWLOCK *rwlock;
 };
+
+struct __wt_lsm_worker_cookie {
+	WT_LSM_CHUNK **chunk_array;
+	size_t chunk_alloc;
+	int nchunks;
+#define	WT_LSM_WORKER_MERGE		0x01
+#define	WT_LSM_WORKER_CHECKPOINT	0x02
+	uint32_t flags;
+};
diff --git a/src/include/txn.h b/src/include/txn.h
index 1adc68e0caf..ed85867a74a 100644
--- a/src/include/txn.h
+++ b/src/include/txn.h
@@ -34,15 +34,17 @@ typedef uint32_t wt_txnid_t;
  * remains in the system after 2 billion transactions it can no longer be
  * compared with current transaction ID.
  */
-#define	TXNID_LT(t1, t2) \
-	(((t1) == (t2) || \
-	    (t1) == WT_TXN_ABORTED || (t2) == WT_TXN_NONE) ? 0 : \
-	    ((t1) == WT_TXN_NONE || (t2) == WT_TXN_ABORTED) ? 1 : \
+#define	TXNID_LE(t1, t2) \
+	(((t1) == WT_TXN_ABORTED || (t2) == WT_TXN_NONE) ? 0 : \
+	    ((t1) == WT_TXN_NONE || (t2) == WT_TXN_ABORTED) ? 1 : \
 	    (t2) - (t1) < (UINT32_MAX / 2))
 
+#define	TXNID_LT(t1, t2) \
+	((t1) != (t2) && TXNID_LE(t1, t2))
+
 struct __wt_txn_state {
-	wt_txnid_t id;
-	wt_txnid_t snap_min;
+	volatile wt_txnid_t id;
+	volatile wt_txnid_t snap_min;
 };
 
 struct __wt_txn_global {
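[Editor's note] The txn.h hunk above rewrites TXNID_LT in terms of a new TXNID_LE that compares 32-bit transaction IDs in a circular space, so ordering survives counter wrap-around as long as live IDs span less than half the range. The following demonstration is not part of this commit; it trims the macros to the arithmetic, omitting the reserved-value special cases for WT_TXN_NONE and WT_TXN_ABORTED:

```c
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

typedef uint32_t wt_txnid_t;

/* Circular comparison only; the real macros also special-case the
 * reserved WT_TXN_NONE and WT_TXN_ABORTED values. */
#define TXNID_LE(t1, t2)	((wt_txnid_t)((t2) - (t1)) < (UINT32_MAX / 2))
#define TXNID_LT(t1, t2)	((t1) != (t2) && TXNID_LE(t1, t2))

int
main(void)
{
	/* Ordinary case: ID 10 was allocated before ID 20. */
	assert(TXNID_LT(10u, 20u));
	assert(!TXNID_LT(20u, 10u));

	/*
	 * Wrap-around case: an ID allocated just before the 32-bit counter
	 * wraps still compares as older than one allocated just after it.
	 */
	assert(TXNID_LT(UINT32_MAX - 5, 3u));
	assert(!TXNID_LT(3u, UINT32_MAX - 5));

	printf("transaction ID comparisons behave as expected\n");
	return (0);
}
```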
diff --git a/src/include/txn.i b/src/include/txn.i
index 08d626efddd..e410ad07c68 100644
--- a/src/include/txn.i
+++ b/src/include/txn.i
@@ -133,9 +133,7 @@ __wt_txn_visible_all(WT_SESSION_IMPL *session, wt_txnid_t id)
     WT_TXN *txn;
 
     txn = &session->txn;
-    if (TXNID_LT(txn->oldest_snap_min, id))
-        return (0);
-    return (1);
+    return (TXNID_LT(id, txn->oldest_snap_min));
 }
 
 /*
diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h
index 6ebb69a9d5d..8bdb3a7bc20 100644
--- a/src/include/wt_internal.h
+++ b/src/include/wt_internal.h
@@ -137,6 +137,8 @@ struct __wt_lsm_data_source;
     typedef struct __wt_lsm_data_source WT_LSM_DATA_SOURCE;
 struct __wt_lsm_tree;
     typedef struct __wt_lsm_tree WT_LSM_TREE;
+struct __wt_lsm_worker_cookie;
+    typedef struct __wt_lsm_worker_cookie WT_LSM_WORKER_COOKIE;
 struct __wt_named_collator;
     typedef struct __wt_named_collator WT_NAMED_COLLATOR;
 struct __wt_named_compressor;
diff --git a/src/lsm/lsm_meta.c b/src/lsm/lsm_meta.c
index f2fba6422d6..8377dbaeac6 100644
--- a/src/lsm/lsm_meta.c
+++ b/src/lsm/lsm_meta.c
@@ -170,7 +170,10 @@ __wt_lsm_meta_write(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
         WT_ERR(__wt_buf_catfmt(session, buf, "\"%s\"", chunk->uri));
     }
     WT_ERR(__wt_buf_catfmt(session, buf, "]"));
-    WT_ERR(__wt_metadata_update(session, lsm_tree->name, buf->data));
+    __wt_spin_lock(session, &S2C(session)->metadata_lock);
+    ret = __wt_metadata_update(session, lsm_tree->name, buf->data);
+    __wt_spin_unlock(session, &S2C(session)->metadata_lock);
+    WT_ERR(ret);
 
 err:	__wt_scr_free(&buf);
     return (ret);
diff --git a/src/lsm/lsm_worker.c b/src/lsm/lsm_worker.c
index d29f0f93de7..86f5ec504f4 100644
--- a/src/lsm/lsm_worker.c
+++ b/src/lsm/lsm_worker.c
@@ -8,8 +8,7 @@
 #include "wt_internal.h"
 
 static int __lsm_free_chunks(WT_SESSION_IMPL *, WT_LSM_TREE *);
-static int __lsm_copy_chunks(
-    WT_LSM_TREE *, size_t *, WT_LSM_CHUNK ***, int *, int);
+static int __lsm_copy_chunks(WT_LSM_TREE *, WT_LSM_WORKER_COOKIE *);
 
 /*
  * __wt_lsm_worker --
@@ -20,31 +19,30 @@ void *
 __wt_lsm_worker(void *arg)
 {
     WT_DECL_RET;
-    WT_LSM_CHUNK *chunk, **chunk_array;
+    WT_LSM_CHUNK *chunk;
     WT_LSM_TREE *lsm_tree;
+    WT_LSM_WORKER_COOKIE cookie;
     WT_SESSION_IMPL *session;
     const char *cfg[] = API_CONF_DEFAULTS(session, checkpoint, NULL);
-    size_t chunk_alloc;
-    int i, nchunks, progress;
+    int i, progress;
 
     lsm_tree = arg;
     session = lsm_tree->worker_session;
-    chunk_array = NULL;
-    chunk_alloc = 0;
+    memset(&cookie, 0, sizeof(cookie));
+    F_SET(&cookie, WT_LSM_WORKER_MERGE);
 
     while (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) {
         progress = 0;
-        WT_ERR(__lsm_copy_chunks(
-            lsm_tree, &chunk_alloc, &chunk_array, &nchunks, 0));
+        WT_ERR(__lsm_copy_chunks(lsm_tree, &cookie));
 
         /*
          * Write checkpoints in all completed files, then find
          * something to merge.
         */
-        for (i = 0; i < nchunks; i++) {
-            chunk = chunk_array[i];
+        for (i = 0; i < cookie.nchunks; i++) {
+            chunk = cookie.chunk_array[i];
             if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) ||
                 chunk->ncursor > 0)
                 continue;
@@ -54,8 +52,8 @@ __wt_lsm_worker(void *arg)
             * NOTE: we pass a non-NULL config, because otherwise
             * __wt_checkpoint thinks we're closing the file.
             */
-            WT_WITH_SCHEMA_LOCK(session, ret =
-                __wt_schema_worker(session, chunk->uri,
+            WT_WITH_SCHEMA_LOCK(session,
+                ret =__wt_schema_worker(session, chunk->uri,
                 __wt_checkpoint, cfg, 0));
             if (ret == 0) {
                 __wt_spin_lock(session, &lsm_tree->lock);
@@ -80,7 +78,7 @@ __wt_lsm_worker(void *arg)
         __wt_sleep(0, 10);
     }
 
-err:	__wt_free(session, chunk_array);
+err:	__wt_free(session, cookie.chunk_array);
 
     return (NULL);
 }
@@ -94,26 +92,25 @@ void *
 __wt_lsm_checkpoint_worker(void *arg)
 {
     WT_DECL_RET;
-    WT_LSM_CHUNK *chunk, **chunk_array;
+    WT_LSM_CHUNK *chunk;
     WT_LSM_TREE *lsm_tree;
+    WT_LSM_WORKER_COOKIE cookie;
     WT_SESSION_IMPL *session;
     const char *cfg[] = { "name=,drop=", NULL };
-    size_t chunk_alloc;
-    int i, j, nchunks;
+    int i, j;
 
     lsm_tree = arg;
     session = lsm_tree->ckpt_session;
-    chunk_array = NULL;
-    chunk_alloc = 0;
+    memset(&cookie, 0, sizeof(cookie));
+    F_SET(&cookie, WT_LSM_WORKER_CHECKPOINT);
 
     while (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) {
-        WT_ERR(__lsm_copy_chunks(
-            lsm_tree, &chunk_alloc, &chunk_array, &nchunks, 1));
+        WT_ERR(__lsm_copy_chunks(lsm_tree, &cookie));
 
         /* Write checkpoints in all completed files. */
-        for (i = 0, j = 0; i < nchunks; i++) {
-            chunk = chunk_array[i];
+        for (i = 0, j = 0; i < cookie.nchunks; i++) {
+            chunk = cookie.chunk_array[i];
             if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))
                 continue;
             ++j;
@@ -122,12 +119,12 @@ __wt_lsm_checkpoint_worker(void *arg)
             * NOTE: we pass a non-NULL config, because otherwise
             * __wt_checkpoint thinks we're closing the file.
             */
-            WT_WITH_SCHEMA_LOCK(session, ret =
-                __wt_schema_worker(session, chunk->uri,
+            WT_WITH_SCHEMA_LOCK(session,
+                ret = __wt_schema_worker(session, chunk->uri,
                 __wt_checkpoint, cfg, 0));
             if (ret == 0) {
                 __wt_spin_lock(session, &lsm_tree->lock);
-                F_SET(lsm_tree->chunk[i], WT_LSM_CHUNK_ONDISK);
+                F_SET(chunk, WT_LSM_CHUNK_ONDISK);
                 lsm_tree->dsk_gen++;
                 __wt_spin_unlock(session, &lsm_tree->lock);
             }
@@ -137,27 +134,29 @@ __wt_lsm_checkpoint_worker(void *arg)
             "LSM worker checkpointed %d.", j);
         __wt_sleep(0, 10);
     }
-err:	__wt_free(session, chunk_array);
+err:	__wt_free(session, cookie.chunk_array);
 
     return (NULL);
 }
 
+/*
+ * Take a copy of part of the LSM tree chunk array so that we can work on
+ * the contents without holding the LSM tree handle lock long term.
+ */
 static int
-__lsm_copy_chunks(WT_LSM_TREE *lsm_tree,
-    size_t *allocp, WT_LSM_CHUNK ***chunkp, int *nchunkp, int checkpoint)
+__lsm_copy_chunks(WT_LSM_TREE *lsm_tree, WT_LSM_WORKER_COOKIE *cookie)
 {
     WT_DECL_RET;
     WT_SESSION_IMPL *session;
-    WT_LSM_CHUNK **chunk_array;
-    size_t chunk_alloc;
     int nchunks;
 
-    if (checkpoint == 1)
+    /* Always return zero chunks on error. */
+    cookie->nchunks = 0;
+
+    if (F_ISSET(cookie, WT_LSM_WORKER_CHECKPOINT))
         session = lsm_tree->ckpt_session;
     else
         session = lsm_tree->worker_session;
-    chunk_array = *chunkp;
-    chunk_alloc = *allocp;
 
     __wt_spin_lock(session, &lsm_tree->lock);
     if (!F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) {
@@ -172,7 +171,7 @@ __lsm_copy_chunks(WT_LSM_TREE *lsm_tree,
     */
    nchunks = lsm_tree->nchunks - 1;
    /* Checkpoint doesn't care if there are active cursors, merge does. */
-    if (checkpoint == 0) {
+    if (F_ISSET(cookie, WT_LSM_WORKER_MERGE)) {
        for (; nchunks > 0 &&
            lsm_tree->chunk[nchunks - 1]->ncursor > 0;
            --nchunks)
            ;
@@ -181,20 +180,17 @@ __lsm_copy_chunks(WT_LSM_TREE *lsm_tree,
     * If the tree array of active chunks is larger than our current buffer,
     * increase the size of our current buffer to match.
     */
-    if (chunk_alloc < lsm_tree->chunk_alloc)
+    if (cookie->chunk_alloc < lsm_tree->chunk_alloc)
        ret = __wt_realloc(session,
-            &chunk_alloc, lsm_tree->chunk_alloc,
-            &chunk_array);
+            &cookie->chunk_alloc, lsm_tree->chunk_alloc,
+            &cookie->chunk_array);
    if (ret == 0 && nchunks > 0)
-        memcpy(chunk_array, lsm_tree->chunk,
+        memcpy(cookie->chunk_array, lsm_tree->chunk,
            nchunks * sizeof(*lsm_tree->chunk));
    __wt_spin_unlock(session, &lsm_tree->lock);
 
-    if (ret == 0) {
-        *chunkp = chunk_array;
-        *allocp = chunk_alloc;
-        *nchunkp = nchunks;
-    }
+    if (ret == 0)
+        cookie->nchunks = nchunks;
 
    return (ret);
 }
diff --git a/src/txn/txn.c b/src/txn/txn.c
index eaddc34d708..68a5c805323 100644
--- a/src/txn/txn.c
+++ b/src/txn/txn.c
@@ -74,11 +74,10 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session, wt_txnid_t max_id)
     conn = S2C(session);
     txn = &session->txn;
     txn_global = &conn->txn_global;
-    oldest_snap_min = WT_TXN_ABORTED;
 
     do {
         /* Take a copy of the current session ID. */
-        current_id = txn_global->current;
+        current_id = oldest_snap_min = txn_global->current;
 
         /* Copy the array of concurrent transactions. */
         WT_ORDERED_READ(session_cnt, conn->session_cnt);
@@ -93,6 +92,12 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session, wt_txnid_t max_id)
             else if (max_id == WT_TXN_NONE || TXNID_LT(id, max_id))
                 txn->snapshot[n++] = id;
         }
+
+        /*
+         * Ensure the snapshot reads are scheduled before re-checking
+         * the global current ID.
+         */
+        WT_READ_BARRIER();
     } while (current_id != txn_global->current);
 
     __txn_sort_snapshot(session, n,
@@ -116,11 +121,10 @@ __wt_txn_get_evict_snapshot(WT_SESSION_IMPL *session)
     conn = S2C(session);
     txn_global = &conn->txn_global;
-    oldest_snap_min = WT_TXN_ABORTED;
 
     do {
         /* Take a copy of the current session ID. */
-        current_id = txn_global->current;
+        current_id = oldest_snap_min = txn_global->current;
 
         /* Walk the array of concurrent transactions. */
         WT_ORDERED_READ(session_cnt, conn->session_cnt);
@@ -128,6 +132,12 @@ __wt_txn_get_evict_snapshot(WT_SESSION_IMPL *session)
             if ((id = s->snap_min) != WT_TXN_NONE &&
                 TXNID_LT(id, oldest_snap_min))
                 oldest_snap_min = id;
+
+        /*
+         * Ensure the snapshot reads are scheduled before re-checking
+         * the global current ID.
+         */
+        WT_READ_BARRIER();
     } while (current_id != txn_global->current);
 
     __txn_sort_snapshot(session, 0, oldest_snap_min, oldest_snap_min);
@@ -169,8 +179,26 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
     F_SET(txn, TXN_RUNNING);
 
     do {
-        /* Take a copy of the current session ID. */
-        txn->id = txn_global->current;
+        /*
+         * Allocate a transaction ID.
+         *
+         * We use an atomic increment to ensure that we get a unique
+         * ID, then publish that to the global state table.
+         *
+         * If two threads race to allocate an ID, only the latest ID
+         * will proceed.  The winning thread can be sure its snapshot
+         * contains all of the earlier active IDs.  Threads that race
+         * and get an earlier ID may not appear in the snapshot,
+         * but they will loop and allocate a new ID before proceeding
+         * to make any updates.
+         *
+         * This potentially wastes transaction IDs when threads race to
+         * begin transactions, but that is the price we pay to keep
+         * this path latch free.
+         */
+        do {
+            txn->id = WT_ATOMIC_ADD(txn_global->current, 1);
+        } while (txn->id == WT_TXN_NONE || txn->id == WT_TXN_ABORTED);
         WT_PUBLISH(txn_state->id, txn->id);
 
         /*
@@ -200,8 +228,13 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
                 session, n, txn->id, oldest_snap_min);
             txn_state->snap_min = txn->snap_min;
         }
-    } while (!WT_ATOMIC_CAS(txn_global->current, txn->id, txn->id + 1) ||
-        txn->id == WT_TXN_NONE || txn->id == WT_TXN_ABORTED);
+
+        /*
+         * Ensure the snapshot reads are scheduled before re-checking
+         * the global current ID.
+         */
+        WT_READ_BARRIER();
+    } while (txn->id != txn_global->current);
 
     return (0);
 }
@@ -223,7 +256,8 @@ __wt_txn_release(WT_SESSION_IMPL *session)
     /* Clear the transaction's ID from the global table. */
     WT_ASSERT(session,
         txn_state->id != WT_TXN_NONE && txn->id != WT_TXN_NONE);
-    txn_state->id = txn_state->snap_min = WT_TXN_NONE;
+    WT_PUBLISH(txn_state->id, WT_TXN_NONE);
+    txn_state->snap_min = WT_TXN_NONE;
 
     /* Reset the transaction state to not running. */
     txn->id = WT_TXN_NONE;
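[Editor's note] The txn.c hunks above repeatedly use the same lock-free reading pattern: copy the shared state, issue a read barrier, then re-check the global current ID and retry if another thread advanced it in the meantime. The following sketch is not part of this commit; it illustrates that read-retry loop with C11 atomics in place of WiredTiger's WT_READ_BARRIER/WT_PUBLISH macros, and with plain counters standing in for the real transaction structures:

```c
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

#define NSESSIONS 4

/* Stand-ins for the global transaction state. */
static _Atomic uint32_t current_id = 100;
static _Atomic uint32_t active_ids[NSESSIONS];

/*
 * Copy the active-ID array consistently: read the global counter, copy the
 * array, then confirm the counter did not move while we were copying.  If
 * it did, another thread allocated an ID concurrently and we retry,
 * mirroring the do { ... } while (current_id != txn_global->current)
 * loops in __wt_txn_get_snapshot and __wt_txn_begin.
 */
static void
copy_snapshot(uint32_t snap[NSESSIONS])
{
	uint32_t start;
	int i;

	do {
		start = atomic_load(&current_id);
		for (i = 0; i < NSESSIONS; i++)
			snap[i] = atomic_load(&active_ids[i]);
		/* Sequentially consistent loads order the re-check below. */
	} while (atomic_load(&current_id) != start);
}

int
main(void)
{
	uint32_t snap[NSESSIONS];

	copy_snapshot(snap);
	printf("snapshot taken at id %u\n",
	    (unsigned)atomic_load(&current_id));
	return (0);
}
```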