author     Michael Cahill <michael.cahill@wiredtiger.com>  2012-09-28 17:39:45 +1000
committer  Michael Cahill <michael.cahill@wiredtiger.com>  2012-09-28 17:39:45 +1000
commit     4be1afdf665bf4bd4449910049fcb0c0ccb9286e (patch)
tree       43388e016235ac31f5c9a51e58e0325162ac52ec /src
parent     952577567aaaeb2076d445b18fb88d28a2e8da67 (diff)
parent     c8a4341d852d323d0029b0b0a1e48a58ab2f23b6 (diff)
Merge branch 'develop' into data-handles
Conflicts: src/btree/bt_evict.c
Diffstat (limited to 'src')
-rw-r--r--  src/btree/bt_curnext.c      2
-rw-r--r--  src/btree/bt_curprev.c      1
-rw-r--r--  src/btree/bt_cursor.c       1
-rw-r--r--  src/btree/bt_evict.c       10
-rw-r--r--  src/btree/bt_page.c        12
-rw-r--r--  src/btree/rec_evict.c       6
-rw-r--r--  src/btree/rec_write.c      28
-rw-r--r--  src/conn/conn_handle.c      3
-rw-r--r--  src/docs/cursor-ops.dox    10
-rw-r--r--  src/docs/transactions.dox   5
-rw-r--r--  src/include/api.h           8
-rw-r--r--  src/include/cursor.i       13
-rw-r--r--  src/include/lsm.h           9
-rw-r--r--  src/include/txn.h          14
-rw-r--r--  src/include/txn.i           4
-rw-r--r--  src/include/wt_internal.h   2
-rw-r--r--  src/lsm/lsm_meta.c          5
-rw-r--r--  src/lsm/lsm_worker.c       84
-rw-r--r--  src/txn/txn.c              52
19 files changed, 166 insertions(+), 103 deletions(-)
diff --git a/src/btree/bt_curnext.c b/src/btree/bt_curnext.c
index d8c188f3198..3fcf8dcd716 100644
--- a/src/btree/bt_curnext.c
+++ b/src/btree/bt_curnext.c
@@ -403,6 +403,7 @@ __wt_btcur_next(WT_CURSOR_BTREE *cbt, int discard)
LF_SET(WT_TREE_DISCARD);
__cursor_func_init(cbt, 0);
+ __cursor_position_clear(cbt);
/*
* If we aren't already iterating in the right direction, there's
@@ -507,6 +508,7 @@ __wt_btcur_next_random(WT_CURSOR_BTREE *cbt)
WT_BSTAT_INCR(session, cursor_read_next);
__cursor_func_init(cbt, 1);
+ __cursor_position_clear(cbt);
/*
* Only supports row-store: applications can trivially select a random
diff --git a/src/btree/bt_curprev.c b/src/btree/bt_curprev.c
index fe21472f2fe..baa256010e8 100644
--- a/src/btree/bt_curprev.c
+++ b/src/btree/bt_curprev.c
@@ -491,6 +491,7 @@ __wt_btcur_prev(WT_CURSOR_BTREE *cbt, int discard)
LF_SET(WT_TREE_DISCARD);
__cursor_func_init(cbt, 0);
+ __cursor_position_clear(cbt);
/*
* If we aren't already iterating in the right direction, there's
diff --git a/src/btree/bt_cursor.c b/src/btree/bt_cursor.c
index 3b880d6a749..8265c4ccecf 100644
--- a/src/btree/bt_cursor.c
+++ b/src/btree/bt_cursor.c
@@ -112,6 +112,7 @@ __wt_btcur_reset(WT_CURSOR_BTREE *cbt)
__cursor_leave(cbt);
__cursor_search_clear(cbt);
+ __cursor_position_clear(cbt);
return (0);
}
diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c
index 973f623893a..24236459966 100644
--- a/src/btree/bt_evict.c
+++ b/src/btree/bt_evict.c
@@ -946,6 +946,7 @@ __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app)
{
WT_BTREE *btree;
WT_DATA_HANDLE *saved_dhandle;
+ WT_DECL_RET;
WT_PAGE *page;
__evict_get_page(session, is_app, &btree, &page);
@@ -958,19 +959,14 @@ __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app)
saved_dhandle = session->dhandle;
WT_SET_BTREE_IN_SESSION(session, btree);
- /*
- * We don't care why eviction failed (maybe the page was dirty and
- * we're out of disk space, or the page had an in-memory subtree
- * already being evicted).
- */
- (void)__evict_page(session, page);
+ ret = __evict_page(session, page);
(void)WT_ATOMIC_SUB(btree->lru_count, 1);
WT_CLEAR_BTREE_IN_SESSION(session);
session->dhandle = saved_dhandle;
- return (0);
+ return (ret);
}
/*
diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c
index fbf8985cf37..d5973dbd3f3 100644
--- a/src/btree/bt_page.c
+++ b/src/btree/bt_page.c
@@ -42,9 +42,19 @@ __wt_page_in_func(
case WT_REF_DISK:
case WT_REF_DELETED:
/* The page isn't in memory, attempt to read it. */
+
+ /* Check if there is space in the cache. */
__wt_eviction_check(session, &read_lockout, wake);
wake = 0;
- if (read_lockout)
+
+ /*
+ * If the cache is full, give up, but only if we are
+ * not holding the schema lock. The schema lock can
+ * block checkpoints, and thus eviction, so it is not
+ * safe to wait for eviction if we are holding it.
+ */
+ if (read_lockout &&
+ !F_ISSET(session, WT_SESSION_SCHEMA_LOCKED))
break;
WT_RET(__wt_cache_read(session, parent, ref));
diff --git a/src/btree/rec_evict.c b/src/btree/rec_evict.c
index c20d9ea4534..92bba2bed82 100644
--- a/src/btree/rec_evict.c
+++ b/src/btree/rec_evict.c
@@ -339,10 +339,10 @@ __rec_review(WT_SESSION_IMPL *session,
}
/*
- * If no pages are referenced, there are no consistency
- * issues: try to bump our snapshot.
+ * If there aren't multiple cursors active, there
+ * are no consistency issues: try to bump our snapshot.
*/
- if (session->nhazard == 0) {
+ if (session->ncursors <= 1) {
__wt_txn_read_last(session);
__wt_txn_read_first(session);
}
diff --git a/src/btree/rec_write.c b/src/btree/rec_write.c
index b23b911c345..aab438ebb15 100644
--- a/src/btree/rec_write.c
+++ b/src/btree/rec_write.c
@@ -1965,7 +1965,7 @@ __rec_col_var(WT_SESSION_IMPL *session,
WT_INSERT *ins;
WT_INSERT_HEAD *append;
WT_ITEM *last;
- WT_UPDATE *next_upd, *upd;
+ WT_UPDATE *upd;
uint64_t n, nrepeat, repeat_count, rle, slvg_missing, src_recno;
uint32_t i, size;
int deleted, last_deleted, orig_deleted, update_no_copy;
@@ -2016,21 +2016,13 @@ __rec_col_var(WT_SESSION_IMPL *session,
WT_COL_FOREACH(page, cip, i) {
ovfl_state = OVFL_IGNORE;
if ((cell = WT_COL_PTR(page, cip)) == NULL) {
- ins = NULL;
nrepeat = 1;
+ ins = NULL;
orig_deleted = 1;
} else {
__wt_cell_unpack(cell, unpack);
nrepeat = __wt_cell_rle(unpack);
-
ins = WT_SKIP_FIRST(WT_COL_UPDATE(page, cip));
- while (ins != NULL) {
- WT_ERR(
- __rec_txn_read(session, r, ins->upd, &upd));
- if (upd != NULL)
- break;
- ins = WT_SKIP_NEXT(ins);
- }
/*
* If the original value is "deleted", there's no value
@@ -2090,19 +2082,13 @@ record_loop: /*
*/
for (n = 0;
n < nrepeat; n += repeat_count, src_recno += repeat_count) {
- if (ins != NULL &&
- WT_INSERT_RECNO(ins) == src_recno) {
+ upd = NULL;
+ if (ins != NULL && WT_INSERT_RECNO(ins) == src_recno) {
WT_ERR(
__rec_txn_read(session, r, ins->upd, &upd));
- WT_ASSERT(session, upd != NULL);
- do {
- ins = WT_SKIP_NEXT(ins);
- if (ins == NULL)
- break;
- WT_ERR(__rec_txn_read(
- session, r, ins->upd, &next_upd));
- } while (next_upd == NULL);
-
+ ins = WT_SKIP_NEXT(ins);
+ }
+ if (upd != NULL) {
update_no_copy = 1; /* No data copy */
repeat_count = 1;
diff --git a/src/conn/conn_handle.c b/src/conn/conn_handle.c
index 4b393d1550d..d673a37cfc5 100644
--- a/src/conn/conn_handle.c
+++ b/src/conn/conn_handle.c
@@ -73,8 +73,9 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn)
__wt_spin_destroy(session, &conn->api_lock);
__wt_spin_destroy(session, &conn->fh_lock);
- __wt_spin_destroy(session, &conn->serial_lock);
+ __wt_spin_destroy(session, &conn->metadata_lock);
__wt_spin_destroy(session, &conn->schema_lock);
+ __wt_spin_destroy(session, &conn->serial_lock);
/* Free allocated memory. */
__wt_free(session, conn->home);
diff --git a/src/docs/cursor-ops.dox b/src/docs/cursor-ops.dox
index 84d089446cb..54a174a09c2 100644
--- a/src/docs/cursor-ops.dox
+++ b/src/docs/cursor-ops.dox
@@ -101,8 +101,14 @@ To remove existing data using a cursor, use the WT_CURSOR::remove method:
@section cursor_error Cursor position after error
After any cursor handle method failure, the cursor's position is
-undetermined. Applications that cannot re-position the cursor after
-failure must duplicate the cursor before calling a cursor method that will
+undetermined. For cursor operations that expect a key to be set before the
+operation begins (including WT_CURSOR::search, WT_CURSOR::insert,
+WT_CURSOR::update and WT_CURSOR::remove), the application's key and value
+will not be cleared by an error.
+
+Applications that cannot re-position the cursor after failure must
+duplicate the cursor by calling WT_SESSION::open_cursor and passing the
+cursor as the \c to_dup parameter before calling a cursor method that will
attempt to re-position the cursor.
*/
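The duplication described above is done by passing the existing cursor as the \c to_dup
argument to WT_SESSION::open_cursor, with a NULL URI. A minimal sketch, assuming \c orig
is an open, positioned cursor and error handling is elided:

    WT_CURSOR *orig, *dup;

    /* Duplicate the cursor: dup starts at the same position as orig. */
    ret = session->open_cursor(session, NULL, orig, NULL, &dup);

    /*
     * If a later call on orig fails and leaves its position undetermined,
     * dup still holds the saved position.
     */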
diff --git a/src/docs/transactions.dox b/src/docs/transactions.dox
index efc5934f9aa..4bc745858b8 100644
--- a/src/docs/transactions.dox
+++ b/src/docs/transactions.dox
@@ -72,6 +72,11 @@ updating the same value will fail with ::WT_DEADLOCK. Some applications
may benefit from application-level synchronization to avoid repeated
attempts to rollback and update the same value.
+Operations in transactions may also fail with the ::WT_DEADLOCK error if
+some resource cannot be allocated after repeated attempts. For example, if
+the cache is not large enough to hold the updates required to satisfy
+transactional readers, an operation may fail and return ::WT_DEADLOCK.
+
@section transaction_isolation Isolation levels
WiredTiger supports <code>read-uncommitted</code>,
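The retry implied by the new paragraph can be written as a loop that rolls back and
restarts the transaction whenever an operation returns ::WT_DEADLOCK. A sketch only,
with the session, cursor and key/value setup assumed and error handling abbreviated:

    /* Retry an update while it fails because resources are exhausted. */
    for (;;) {
        if ((ret = session->begin_transaction(session, NULL)) != 0)
            break;
        cursor->set_key(cursor, key);
        cursor->set_value(cursor, value);
        if ((ret = cursor->update(cursor)) == 0) {
            ret = session->commit_transaction(session, NULL);
            break;
        }
        (void)session->rollback_transaction(session, NULL);
        if (ret != WT_DEADLOCK)
            break;          /* Only WT_DEADLOCK is worth retrying. */
    }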
diff --git a/src/include/api.h b/src/include/api.h
index f438ab15ef7..c5799f2d4ef 100644
--- a/src/include/api.h
+++ b/src/include/api.h
@@ -334,14 +334,18 @@ struct __wt_connection_impl {
} \
ret = __wt_txn_commit((s), NULL); \
} else { \
- WT_TRET(WT_DEADLOCK); \
(void)__wt_txn_rollback((s), NULL); \
+ if (ret == 0 || ret == WT_DEADLOCK) { \
+ ret = 0; \
+ continue; \
+ } \
} \
} else if ((ret) != 0 && \
(ret) != WT_NOTFOUND && \
(ret) != WT_DUPLICATE_KEY) \
F_SET(&(s)->txn, TXN_ERROR); \
-} while (0)
+ break; \
+} while (1)
/*
* If a session or connection method is about to return WT_NOTFOUND (some
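The macro change means an implicitly started (autocommit) transaction is now rolled back
and retried when the wrapped operation fails with WT_DEADLOCK, instead of returning the
error to the application. Equivalent control flow, sketched with a hypothetical
do_operation() standing in for the wrapped API call:

    for (;;) {
        ret = do_operation(session);            /* hypothetical operation */
        if (ret == 0) {
            ret = __wt_txn_commit(session, NULL);
            break;
        }
        (void)__wt_txn_rollback(session, NULL);
        if (ret != WT_DEADLOCK)
            break;                              /* real errors propagate */
        ret = 0;                                /* retry the operation */
    }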
diff --git a/src/include/cursor.i b/src/include/cursor.i
index 46ac684aa1f..5077409e8cc 100644
--- a/src/include/cursor.i
+++ b/src/include/cursor.i
@@ -17,6 +17,16 @@ __cursor_set_recno(WT_CURSOR_BTREE *cbt, uint64_t v)
}
/*
+ * __cursor_position_clear --
+ * Forget the current key and value in a cursor.
+ */
+static inline void
+__cursor_position_clear(WT_CURSOR_BTREE *cbt)
+{
+ F_CLR(&cbt->iface, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
+}
+
+/*
* __cursor_search_clear --
* Reset the cursor's state for a search.
*/
@@ -60,9 +70,6 @@ __cursor_leave(WT_CURSOR_BTREE *cbt)
__wt_stack_release(session, cbt->page);
cbt->page = NULL;
- /* Reset the returned key/value state. */
- F_CLR(cursor, WT_CURSTD_KEY_SET | WT_CURSTD_VALUE_SET);
-
if (F_ISSET(cbt, WT_CBT_ACTIVE)) {
WT_ASSERT(session, session->ncursors > 0);
if (--session->ncursors == 0)
diff --git a/src/include/lsm.h b/src/include/lsm.h
index c97f940437f..30054491975 100644
--- a/src/include/lsm.h
+++ b/src/include/lsm.h
@@ -82,3 +82,12 @@ struct __wt_lsm_data_source {
WT_RWLOCK *rwlock;
};
+
+struct __wt_lsm_worker_cookie {
+ WT_LSM_CHUNK **chunk_array;
+ size_t chunk_alloc;
+ int nchunks;
+#define WT_LSM_WORKER_MERGE 0x01
+#define WT_LSM_WORKER_CHECKPOINT 0x02
+ uint32_t flags;
+};
diff --git a/src/include/txn.h b/src/include/txn.h
index 1adc68e0caf..ed85867a74a 100644
--- a/src/include/txn.h
+++ b/src/include/txn.h
@@ -34,15 +34,17 @@ typedef uint32_t wt_txnid_t;
* remains in the system after 2 billion transactions it can no longer be
* compared with current transaction ID.
*/
-#define TXNID_LT(t1, t2) \
- (((t1) == (t2) || \
- (t1) == WT_TXN_ABORTED || (t2) == WT_TXN_NONE) ? 0 : \
- ((t1) == WT_TXN_NONE || (t2) == WT_TXN_ABORTED) ? 1 : \
+#define TXNID_LE(t1, t2) \
+ (((t1) == WT_TXN_ABORTED || (t2) == WT_TXN_NONE) ? 0 : \
+ ((t1) == WT_TXN_NONE || (t2) == WT_TXN_ABORTED) ? 1 : \
(t2) - (t1) < (UINT32_MAX / 2))
+#define TXNID_LT(t1, t2) \
+ ((t1) != (t2) && TXNID_LE(t1, t2))
+
struct __wt_txn_state {
- wt_txnid_t id;
- wt_txnid_t snap_min;
+ volatile wt_txnid_t id;
+ volatile wt_txnid_t snap_min;
};
struct __wt_txn_global {
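The split into TXNID_LE and TXNID_LT keeps the modular-arithmetic comparison: two IDs
order correctly as long as they differ by less than UINT32_MAX / 2, even across counter
wrap. A small worked example (values chosen purely for illustration):

    uint32_t t1 = UINT32_MAX - 5;   /* allocated just before the counter wraps */
    uint32_t t2 = 10;               /* allocated just after the wrap */

    /*
     * Assuming neither value is a reserved ID (WT_TXN_NONE, WT_TXN_ABORTED),
     * the comparison falls through to (t2) - (t1): 10 - (UINT32_MAX - 5)
     * wraps to 16, which is < UINT32_MAX / 2, so TXNID_LT(t1, t2) is true.
     */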
diff --git a/src/include/txn.i b/src/include/txn.i
index 08d626efddd..e410ad07c68 100644
--- a/src/include/txn.i
+++ b/src/include/txn.i
@@ -133,9 +133,7 @@ __wt_txn_visible_all(WT_SESSION_IMPL *session, wt_txnid_t id)
WT_TXN *txn;
txn = &session->txn;
- if (TXNID_LT(txn->oldest_snap_min, id))
- return (0);
- return (1);
+ return (TXNID_LT(id, txn->oldest_snap_min));
}
/*
diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h
index 6ebb69a9d5d..8bdb3a7bc20 100644
--- a/src/include/wt_internal.h
+++ b/src/include/wt_internal.h
@@ -137,6 +137,8 @@ struct __wt_lsm_data_source;
typedef struct __wt_lsm_data_source WT_LSM_DATA_SOURCE;
struct __wt_lsm_tree;
typedef struct __wt_lsm_tree WT_LSM_TREE;
+struct __wt_lsm_worker_cookie;
+ typedef struct __wt_lsm_worker_cookie WT_LSM_WORKER_COOKIE;
struct __wt_named_collator;
typedef struct __wt_named_collator WT_NAMED_COLLATOR;
struct __wt_named_compressor;
diff --git a/src/lsm/lsm_meta.c b/src/lsm/lsm_meta.c
index f2fba6422d6..8377dbaeac6 100644
--- a/src/lsm/lsm_meta.c
+++ b/src/lsm/lsm_meta.c
@@ -170,7 +170,10 @@ __wt_lsm_meta_write(WT_SESSION_IMPL *session, WT_LSM_TREE *lsm_tree)
WT_ERR(__wt_buf_catfmt(session, buf, "\"%s\"", chunk->uri));
}
WT_ERR(__wt_buf_catfmt(session, buf, "]"));
- WT_ERR(__wt_metadata_update(session, lsm_tree->name, buf->data));
+ __wt_spin_lock(session, &S2C(session)->metadata_lock);
+ ret = __wt_metadata_update(session, lsm_tree->name, buf->data);
+ __wt_spin_unlock(session, &S2C(session)->metadata_lock);
+ WT_ERR(ret);
err: __wt_scr_free(&buf);
return (ret);
diff --git a/src/lsm/lsm_worker.c b/src/lsm/lsm_worker.c
index d29f0f93de7..86f5ec504f4 100644
--- a/src/lsm/lsm_worker.c
+++ b/src/lsm/lsm_worker.c
@@ -8,8 +8,7 @@
#include "wt_internal.h"
static int __lsm_free_chunks(WT_SESSION_IMPL *, WT_LSM_TREE *);
-static int __lsm_copy_chunks(
- WT_LSM_TREE *, size_t *, WT_LSM_CHUNK ***, int *, int);
+static int __lsm_copy_chunks(WT_LSM_TREE *, WT_LSM_WORKER_COOKIE *);
/*
* __wt_lsm_worker --
@@ -20,31 +19,30 @@ void *
__wt_lsm_worker(void *arg)
{
WT_DECL_RET;
- WT_LSM_CHUNK *chunk, **chunk_array;
+ WT_LSM_CHUNK *chunk;
WT_LSM_TREE *lsm_tree;
+ WT_LSM_WORKER_COOKIE cookie;
WT_SESSION_IMPL *session;
const char *cfg[] = API_CONF_DEFAULTS(session, checkpoint, NULL);
- size_t chunk_alloc;
- int i, nchunks, progress;
+ int i, progress;
lsm_tree = arg;
session = lsm_tree->worker_session;
- chunk_array = NULL;
- chunk_alloc = 0;
+ memset(&cookie, 0, sizeof(cookie));
+ F_SET(&cookie, WT_LSM_WORKER_MERGE);
while (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) {
progress = 0;
- WT_ERR(__lsm_copy_chunks(
- lsm_tree, &chunk_alloc, &chunk_array, &nchunks, 0));
+ WT_ERR(__lsm_copy_chunks(lsm_tree, &cookie));
/*
* Write checkpoints in all completed files, then find
* something to merge.
*/
- for (i = 0; i < nchunks; i++) {
- chunk = chunk_array[i];
+ for (i = 0; i < cookie.nchunks; i++) {
+ chunk = cookie.chunk_array[i];
if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK) ||
chunk->ncursor > 0)
continue;
@@ -54,8 +52,8 @@ __wt_lsm_worker(void *arg)
* NOTE: we pass a non-NULL config, because otherwise
* __wt_checkpoint thinks we're closing the file.
*/
- WT_WITH_SCHEMA_LOCK(session, ret =
- __wt_schema_worker(session, chunk->uri,
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_worker(session, chunk->uri,
__wt_checkpoint, cfg, 0));
if (ret == 0) {
__wt_spin_lock(session, &lsm_tree->lock);
@@ -80,7 +78,7 @@ __wt_lsm_worker(void *arg)
__wt_sleep(0, 10);
}
-err: __wt_free(session, chunk_array);
+err: __wt_free(session, cookie.chunk_array);
return (NULL);
}
@@ -94,26 +92,25 @@ void *
__wt_lsm_checkpoint_worker(void *arg)
{
WT_DECL_RET;
- WT_LSM_CHUNK *chunk, **chunk_array;
+ WT_LSM_CHUNK *chunk;
WT_LSM_TREE *lsm_tree;
+ WT_LSM_WORKER_COOKIE cookie;
WT_SESSION_IMPL *session;
const char *cfg[] = { "name=,drop=", NULL };
- size_t chunk_alloc;
- int i, j, nchunks;
+ int i, j;
lsm_tree = arg;
session = lsm_tree->ckpt_session;
- chunk_array = NULL;
- chunk_alloc = 0;
+ memset(&cookie, 0, sizeof(cookie));
+ F_SET(&cookie, WT_LSM_WORKER_CHECKPOINT);
while (F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) {
- WT_ERR(__lsm_copy_chunks(
- lsm_tree, &chunk_alloc, &chunk_array, &nchunks, 1));
+ WT_ERR(__lsm_copy_chunks(lsm_tree, &cookie));
/* Write checkpoints in all completed files. */
- for (i = 0, j = 0; i < nchunks; i++) {
- chunk = chunk_array[i];
+ for (i = 0, j = 0; i < cookie.nchunks; i++) {
+ chunk = cookie.chunk_array[i];
if (F_ISSET(chunk, WT_LSM_CHUNK_ONDISK))
continue;
++j;
@@ -122,12 +119,12 @@ __wt_lsm_checkpoint_worker(void *arg)
* NOTE: we pass a non-NULL config, because otherwise
* __wt_checkpoint thinks we're closing the file.
*/
- WT_WITH_SCHEMA_LOCK(session, ret =
- __wt_schema_worker(session, chunk->uri,
+ WT_WITH_SCHEMA_LOCK(session,
+ ret = __wt_schema_worker(session, chunk->uri,
__wt_checkpoint, cfg, 0));
if (ret == 0) {
__wt_spin_lock(session, &lsm_tree->lock);
- F_SET(lsm_tree->chunk[i], WT_LSM_CHUNK_ONDISK);
+ F_SET(chunk, WT_LSM_CHUNK_ONDISK);
lsm_tree->dsk_gen++;
__wt_spin_unlock(session, &lsm_tree->lock);
}
@@ -137,27 +134,29 @@ __wt_lsm_checkpoint_worker(void *arg)
"LSM worker checkpointed %d.", j);
__wt_sleep(0, 10);
}
-err: __wt_free(session, chunk_array);
+err: __wt_free(session, cookie.chunk_array);
return (NULL);
}
+/*
+ * Take a copy of part of the LSM tree chunk array so that we can work on
+ * the contents without holding the LSM tree handle lock long term.
+ */
static int
-__lsm_copy_chunks(WT_LSM_TREE *lsm_tree,
- size_t *allocp, WT_LSM_CHUNK ***chunkp, int *nchunkp, int checkpoint)
+__lsm_copy_chunks(WT_LSM_TREE *lsm_tree, WT_LSM_WORKER_COOKIE *cookie)
{
WT_DECL_RET;
WT_SESSION_IMPL *session;
- WT_LSM_CHUNK **chunk_array;
- size_t chunk_alloc;
int nchunks;
- if (checkpoint == 1)
+ /* Always return zero chunks on error. */
+ cookie->nchunks = 0;
+
+ if (F_ISSET(cookie, WT_LSM_WORKER_CHECKPOINT))
session = lsm_tree->ckpt_session;
else
session = lsm_tree->worker_session;
- chunk_array = *chunkp;
- chunk_alloc = *allocp;
__wt_spin_lock(session, &lsm_tree->lock);
if (!F_ISSET(lsm_tree, WT_LSM_TREE_WORKING)) {
@@ -172,7 +171,7 @@ __lsm_copy_chunks(WT_LSM_TREE *lsm_tree,
*/
nchunks = lsm_tree->nchunks - 1;
/* Checkpoint doesn't care if there are active cursors, merge does. */
- if (checkpoint == 0) {
+ if (F_ISSET(cookie, WT_LSM_WORKER_MERGE)) {
for (; nchunks > 0 && lsm_tree->chunk[nchunks - 1]->ncursor > 0;
--nchunks)
;
@@ -181,20 +180,17 @@ __lsm_copy_chunks(WT_LSM_TREE *lsm_tree,
* If the tree array of active chunks is larger than our current buffer,
* increase the size of our current buffer to match.
*/
- if (chunk_alloc < lsm_tree->chunk_alloc)
+ if (cookie->chunk_alloc < lsm_tree->chunk_alloc)
ret = __wt_realloc(session,
- &chunk_alloc, lsm_tree->chunk_alloc,
- &chunk_array);
+ &cookie->chunk_alloc, lsm_tree->chunk_alloc,
+ &cookie->chunk_array);
if (ret == 0 && nchunks > 0)
- memcpy(chunk_array, lsm_tree->chunk,
+ memcpy(cookie->chunk_array, lsm_tree->chunk,
nchunks * sizeof(*lsm_tree->chunk));
__wt_spin_unlock(session, &lsm_tree->lock);
- if (ret == 0) {
- *chunkp = chunk_array;
- *allocp = chunk_alloc;
- *nchunkp = nchunks;
- }
+ if (ret == 0)
+ cookie->nchunks = nchunks;
return (ret);
}
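Both workers now use the same calling pattern: zero a stack-allocated cookie, set the
worker type flag, let __lsm_copy_chunks fill it while briefly holding lsm_tree->lock,
then iterate the private copy. Condensed from the worker changes above:

    WT_LSM_WORKER_COOKIE cookie;
    int i;

    memset(&cookie, 0, sizeof(cookie));
    F_SET(&cookie, WT_LSM_WORKER_MERGE);    /* or WT_LSM_WORKER_CHECKPOINT */

    WT_ERR(__lsm_copy_chunks(lsm_tree, &cookie));
    for (i = 0; i < cookie.nchunks; i++)
        /* Work on cookie.chunk_array[i] without holding the tree lock. */
        ;

err:    __wt_free(session, cookie.chunk_array);  /* the caller owns the copy */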
diff --git a/src/txn/txn.c b/src/txn/txn.c
index eaddc34d708..68a5c805323 100644
--- a/src/txn/txn.c
+++ b/src/txn/txn.c
@@ -74,11 +74,10 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session, wt_txnid_t max_id)
conn = S2C(session);
txn = &session->txn;
txn_global = &conn->txn_global;
- oldest_snap_min = WT_TXN_ABORTED;
do {
/* Take a copy of the current session ID. */
- current_id = txn_global->current;
+ current_id = oldest_snap_min = txn_global->current;
/* Copy the array of concurrent transactions. */
WT_ORDERED_READ(session_cnt, conn->session_cnt);
@@ -93,6 +92,12 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session, wt_txnid_t max_id)
else if (max_id == WT_TXN_NONE || TXNID_LT(id, max_id))
txn->snapshot[n++] = id;
}
+
+ /*
+ * Ensure the snapshot reads are scheduled before re-checking
+ * the global current ID.
+ */
+ WT_READ_BARRIER();
} while (current_id != txn_global->current);
__txn_sort_snapshot(session, n,
@@ -116,11 +121,10 @@ __wt_txn_get_evict_snapshot(WT_SESSION_IMPL *session)
conn = S2C(session);
txn_global = &conn->txn_global;
- oldest_snap_min = WT_TXN_ABORTED;
do {
/* Take a copy of the current session ID. */
- current_id = txn_global->current;
+ current_id = oldest_snap_min = txn_global->current;
/* Walk the array of concurrent transactions. */
WT_ORDERED_READ(session_cnt, conn->session_cnt);
@@ -128,6 +132,12 @@ __wt_txn_get_evict_snapshot(WT_SESSION_IMPL *session)
if ((id = s->snap_min) != WT_TXN_NONE &&
TXNID_LT(id, oldest_snap_min))
oldest_snap_min = id;
+
+ /*
+ * Ensure the snapshot reads are scheduled before re-checking
+ * the global current ID.
+ */
+ WT_READ_BARRIER();
} while (current_id != txn_global->current);
__txn_sort_snapshot(session, 0, oldest_snap_min, oldest_snap_min);
@@ -169,8 +179,26 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
F_SET(txn, TXN_RUNNING);
do {
- /* Take a copy of the current session ID. */
- txn->id = txn_global->current;
+ /*
+ * Allocate a transaction ID.
+ *
+ * We use an atomic increment to ensure that we get a unique
+ * ID, then publish that to the global state table.
+ *
+ * If two threads race to allocate an ID, only the latest ID
+ * will proceed. The winning thread can be sure its snapshot
+ * contains all of the earlier active IDs. Threads that race
+ * race and get an earlier ID may not appear in the snapshot,
+ * but they will loop and allocate a new ID before proceeding
+ * to make any updates.
+ *
+ * This potentially wastes transaction IDs when threads race to
+ * begin transactions, but that is the price we pay to keep
+ * this path latch free.
+ */
+ do {
+ txn->id = WT_ATOMIC_ADD(txn_global->current, 1);
+ } while (txn->id == WT_TXN_NONE || txn->id == WT_TXN_ABORTED);
WT_PUBLISH(txn_state->id, txn->id);
/*
@@ -200,8 +228,13 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[])
session, n, txn->id, oldest_snap_min);
txn_state->snap_min = txn->snap_min;
}
- } while (!WT_ATOMIC_CAS(txn_global->current, txn->id, txn->id + 1) ||
- txn->id == WT_TXN_NONE || txn->id == WT_TXN_ABORTED);
+
+ /*
+ * Ensure the snapshot reads are scheduled before re-checking
+ * the global current ID.
+ */
+ WT_READ_BARRIER();
+ } while (txn->id != txn_global->current);
return (0);
}
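Condensing the scheme the comment describes: atomically claim a non-reserved ID, publish
it, build the snapshot, then re-check the global counter; a thread that lost the race
loops and allocates a fresh ID before making any updates. In outline:

    do {
        /* Claim a unique, non-reserved transaction ID. */
        do {
            txn->id = WT_ATOMIC_ADD(txn_global->current, 1);
        } while (txn->id == WT_TXN_NONE || txn->id == WT_TXN_ABORTED);
        WT_PUBLISH(txn_state->id, txn->id);

        /* ... copy the IDs of concurrent transactions ... */

        WT_READ_BARRIER();      /* snapshot reads before the re-check */
    } while (txn->id != txn_global->current);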
@@ -223,7 +256,8 @@ __wt_txn_release(WT_SESSION_IMPL *session)
/* Clear the transaction's ID from the global table. */
WT_ASSERT(session, txn_state->id != WT_TXN_NONE &&
txn->id != WT_TXN_NONE);
- txn_state->id = txn_state->snap_min = WT_TXN_NONE;
+ WT_PUBLISH(txn_state->id, WT_TXN_NONE);
+ txn_state->snap_min = WT_TXN_NONE;
/* Reset the transaction state to not running. */
txn->id = WT_TXN_NONE;