diff options
-rw-r--r-- | dist/api_data.py | 6 | ||||
-rw-r--r-- | dist/s_style | 2 | ||||
-rw-r--r-- | src/btree/bt_curprev.c | 8 | ||||
-rw-r--r-- | src/btree/bt_evict.c | 110 | ||||
-rw-r--r-- | src/btree/rec_write.c | 27 | ||||
-rw-r--r-- | src/config/config_def.c | 4 | ||||
-rw-r--r-- | src/include/extern.h | 1 | ||||
-rw-r--r-- | src/include/txn.i | 26 | ||||
-rw-r--r-- | src/include/wiredtiger.in | 39 | ||||
-rw-r--r-- | src/meta/meta_apply.c | 11 | ||||
-rw-r--r-- | src/txn/txn.c | 29 |
11 files changed, 138 insertions, 125 deletions
diff --git a/dist/api_data.py b/dist/api_data.py index 862b464cbcb..c30a020e5c5 100644 --- a/dist/api_data.py +++ b/dist/api_data.py @@ -296,12 +296,8 @@ methods = { 'session.rollback_transaction' : Method([]), 'session.checkpoint' : Method([ - Config('archive', 'false', r''' - remove log files no longer required for transactional - durability''', - type='boolean'), Config('snapshot', '', r''' - if non-empty, create a named snapshot'''), + if non-empty, create named snapshots in files'''), ]), 'connection.add_collator' : Method([]), diff --git a/dist/s_style b/dist/s_style index 2bb6a5b458e..416c12285b5 100644 --- a/dist/s_style +++ b/dist/s_style @@ -73,7 +73,7 @@ for f in `find examples ext src test -name '*.[chisy]' -o -name '*.in' | -e 's/\([ ]\)if(/\1if (/' \ -e 's/\([ ]\)index(/\1strchr(/' \ -e 's/\([ ]\)return(/\1return (/' \ - -e 's/^\([ ]+\)return \([^()]*\);/\1return (\2);/' \ + -e 's/\([ ]\)return \([^()]*\);/\1return (\2);/' \ -e 's/\([ ]\)rindex(/\1strrchr(/' \ -e 's/\([ ]\)sizeof (/\1sizeof(/g' \ -e 's/\([ ]\)switch(/\1switch (/' \ diff --git a/src/btree/bt_curprev.c b/src/btree/bt_curprev.c index 520db6f71aa..6fd35996811 100644 --- a/src/btree/bt_curprev.c +++ b/src/btree/bt_curprev.c @@ -140,6 +140,14 @@ __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, int newpage) --cbt->recno; } + /* + * Column store appends are inherently non-transactional. + * + * Even a non-visible update by a concurrent or aborted transaction + * changes the effective end of the data. The effect is subtle because + * of the blurring between deleted and empty values, but ideally we + * would skip all uncommitted changes at the end of the data. + */ cbt->iface.recno = cbt->recno; if (cbt->recno > WT_INSERT_RECNO(cbt->ins) || (upd = __wt_txn_read(session, cbt->ins->upd)) == NULL) { diff --git a/src/btree/bt_evict.c b/src/btree/bt_evict.c index e883232b332..f3333f085b2 100644 --- a/src/btree/bt_evict.c +++ b/src/btree/bt_evict.c @@ -378,6 +378,60 @@ __evict_clear_tree_walk(WT_SESSION_IMPL *session) } /* + * __evict_page -- + * Evict a given page. + */ +static int +__evict_page(WT_SESSION_IMPL *session, WT_PAGE *page) +{ + WT_DECL_RET; + WT_TXN_GLOBAL *txn_global; + WT_TXN saved_txn, *txn, *txn_ckpt; + const char *txn_cfg[] = { "isolation=snapshot", NULL }; + int was_running; + + /* + * We have to take care when evicting pages not to write a change that: + * (a) is not yet committed; or + * (b) is committed more recently than an in-progress checkpoint. + * + * We handle both of these cases by setting up the transaction context + * before evicting. If a checkpoint is in progress, copy the + * checkpoint's transaction. Otherwise, we need a snapshot to avoid + * uncommitted changes. If a transaction is in progress in the + * evicting session, we save and restore its state. + */ + txn = &session->txn; + was_running = (F_ISSET(txn, TXN_RUNNING) != 0); + if (was_running) + saved_txn = *txn; + + txn_global = &S2C(session)->txn_global; + if ((txn_ckpt = txn_global->checkpoint_txn) == NULL) { + if (was_running) { + WT_RET(__wt_txn_init(session)); + WT_ERR(__wt_txn_get_snapshot(session)); + } else + WT_ERR(__wt_txn_begin(session, txn_cfg)); + } else + session->txn = *txn_ckpt; + + ret = __wt_rec_evict(session, page, 0); + +err: if (txn_ckpt == NULL) { + if (was_running) + __wt_txn_destroy(session); + else + WT_TRET(__wt_txn_commit(session, NULL)); + } + + if (was_running) + session->txn = saved_txn; + + return (ret); +} + +/* * __evict_file_request_walk -- * Walk the session list looking for sync/close requests. If we find a * request, perform it, clear the request, and wake up the requesting @@ -597,7 +651,7 @@ __evict_page_request_walk(WT_SESSION_IMPL *session) * while trying to get another (e.g., if they have two cursors * open), so blocking indefinitely leads to deadlock. */ - (void)__wt_evict_page(session, page); + (void)__evict_page(session, page); __wt_spin_unlock(session, &cache->lru_lock); @@ -883,58 +937,6 @@ __evict_get_page( } /* - * __wt_evict_page -- - * Evict a given page. - */ -int -__wt_evict_page(WT_SESSION_IMPL *session, WT_PAGE *page) -{ - WT_DECL_RET; - WT_TXN_GLOBAL *txn_global; - WT_TXN saved_txn, *txn, *txn_ckpt; - const char *txn_cfg[] = { "isolation=snapshot", NULL }; - int was_running; - - /* - * We have to take care when evicting pages not to write changes that: - * (a) is not yet committed; or - * (b) is committed more recently than an in-progress checkpoint. - * - * We handle both of these cases by setting up the transaction context - * before evicting. If a checkpoint is in progress, copy the - * checkpoint's transaction. Otherwise, we need a snapshot to avoid - * uncommitted changes. If a transaction is in progress in the - * evicting session, we save and restore its state. - */ - txn = &session->txn; - saved_txn = *txn; - was_running = (F_ISSET(txn, TXN_RUNNING) != 0); - - txn_global = &S2C(session)->txn_global; - if ((txn_ckpt = txn_global->checkpoint_txn) == NULL) { - if (was_running) { - WT_RET(__wt_txn_init(session)); - WT_ERR(__wt_txn_get_snapshot(session)); - } else - WT_ERR(__wt_txn_begin(session, txn_cfg)); - } else - session->txn = *txn_ckpt; - - ret = __wt_rec_evict(session, page, 0); - -err: if (txn_ckpt == NULL) { - if (was_running) - __wt_txn_destroy(session); - else - WT_TRET(__wt_txn_commit(session, NULL)); - } - - session->txn = saved_txn; - - return (ret); -} - -/* * __wt_evict_lru_page -- * Called by both eviction and application threads to evict a page. */ @@ -959,7 +961,7 @@ __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app) * we're out of disk space, or the page had an in-memory subtree * already being evicted). */ - (void)__wt_evict_page(session, page); + (void)__evict_page(session, page); WT_ATOMIC_ADD(btree->lru_count, -1); diff --git a/src/btree/rec_write.c b/src/btree/rec_write.c index d28bd296cb1..a8118790968 100644 --- a/src/btree/rec_write.c +++ b/src/btree/rec_write.c @@ -24,8 +24,8 @@ typedef struct { WT_ITEM dsk; /* Temporary disk-image buffer */ /* Track whether all changes to the page are written. */ - uint32_t old_write_gen; - uint32_t old_disk_gen; + uint32_t orig_write_gen; + uint32_t orig_disk_gen; int upd_skipped; /* @@ -398,8 +398,8 @@ __rec_write_init(WT_SESSION_IMPL *session, WT_PAGE *page) r->page = page; /* Read the disk generation before we read anything from the page. */ - r->old_disk_gen = page->modify->disk_gen; - WT_ORDERED_READ(r->old_write_gen, page->modify->write_gen); + r->orig_disk_gen = page->modify->disk_gen; + WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen); r->upd_skipped = 0; return (0); @@ -1425,7 +1425,7 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_PAGE *page) /* Update any changes to the original on-page data items. */ WT_SKIP_FOREACH(ins, WT_COL_UPDATE_SINGLE(page)) { - upd = __wt_txn_read_int(session, ins->upd, &r->upd_skipped); + upd = __wt_txn_read_skip(session, ins->upd, &r->upd_skipped); if (upd == NULL) continue; __bit_setv_recno( @@ -1449,7 +1449,7 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_PAGE *page) /* Walk any append list. */ append = WT_COL_APPEND(page); WT_SKIP_FOREACH(ins, append) { - upd = __wt_txn_read_int(session, ins->upd, &r->upd_skipped); + upd = __wt_txn_read_skip(session, ins->upd, &r->upd_skipped); if (upd == NULL) continue; for (;;) { @@ -1756,7 +1756,7 @@ __rec_col_var( nrepeat = __wt_cell_rle(unpack); ins = WT_SKIP_FIRST(WT_COL_UPDATE(page, cip)); - while (ins != NULL && __wt_txn_read_int( + while (ins != NULL && __wt_txn_read_skip( session, ins->upd, &r->upd_skipped) == NULL) ins = WT_SKIP_NEXT(ins); @@ -1820,13 +1820,13 @@ record_loop: /* n < nrepeat; n += repeat_count, src_recno += repeat_count) { if (ins != NULL && WT_INSERT_RECNO(ins) == src_recno) { - upd = __wt_txn_read_int( + upd = __wt_txn_read_skip( session, ins->upd, &r->upd_skipped); WT_ASSERT(session, upd != NULL); do { ins = WT_SKIP_NEXT(ins); } while (ins != NULL && - __wt_txn_read_int(session, + __wt_txn_read_skip(session, ins->upd, &r->upd_skipped) == NULL); update_no_copy = 1; /* No data copy */ @@ -1975,7 +1975,7 @@ compare: /* /* Walk any append list. */ append = WT_COL_APPEND(page); WT_SKIP_FOREACH(ins, append) { - upd = __wt_txn_read_int(session, ins->upd, &r->upd_skipped); + upd = __wt_txn_read_skip(session, ins->upd, &r->upd_skipped); if (upd == NULL) continue; for (n = WT_INSERT_RECNO(ins); src_recno <= n; ++src_recno) { @@ -2510,7 +2510,7 @@ __rec_row_leaf( /* Build value cell. */ if ((val_cell = __wt_row_value(page, rip)) != NULL) __wt_cell_unpack(val_cell, unpack); - upd = __wt_txn_read_int( + upd = __wt_txn_read_skip( session, WT_ROW_UPDATE(page, rip), &r->upd_skipped); if (upd == NULL) { /* @@ -2755,7 +2755,7 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_INSERT *ins) for (; ins != NULL; ins = WT_SKIP_NEXT(ins)) { /* Build value cell. */ - upd = __wt_txn_read_int(session, ins->upd, &r->upd_skipped); + upd = __wt_txn_read_skip(session, ins->upd, &r->upd_skipped); if (upd == NULL || WT_UPDATE_DELETED_ISSET(upd)) continue; if (upd->size == 0) @@ -3051,7 +3051,8 @@ err: __wt_scr_free(&tkey); * generation when reconciliation started. */ if (ret == 0 && !r->upd_skipped) - WT_ATOMIC_CAS(mod->disk_gen, r->old_disk_gen, r->old_write_gen); + (void)WT_ATOMIC_CAS( + mod->disk_gen, r->orig_disk_gen, r->orig_write_gen); return (ret); } diff --git a/src/config/config_def.c b/src/config/config_def.c index 8c8d683d1ea..25ce3fd6459 100644 --- a/src/config/config_def.c +++ b/src/config/config_def.c @@ -114,11 +114,11 @@ __wt_confchk_session_begin_transaction = const char * __wt_confdfl_session_checkpoint = - "archive=false,snapshot="""; + "snapshot="""; const char * __wt_confchk_session_checkpoint = - "archive=(type=boolean),snapshot=()"; + "snapshot=()"; const char * __wt_confdfl_session_close = diff --git a/src/include/extern.h b/src/include/extern.h index d78595c98b6..baacb7a4e1c 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -263,7 +263,6 @@ extern void __wt_evict_server_wake(WT_SESSION_IMPL *session); extern void __wt_sync_file_serial_func(WT_SESSION_IMPL *session); extern int __wt_evict_page_request(WT_SESSION_IMPL *session, WT_PAGE *page); extern void *__wt_cache_evict_server(void *arg); -extern int __wt_evict_page(WT_SESSION_IMPL *session, WT_PAGE *page); extern int __wt_evict_lru_page(WT_SESSION_IMPL *session, int is_app); extern int __wt_btree_create(WT_SESSION_IMPL *session, const char *filename); extern int __wt_btree_truncate(WT_SESSION_IMPL *session, const char *filename); diff --git a/src/include/txn.i b/src/include/txn.i index e1905846948..ce17a612bfb 100644 --- a/src/include/txn.i +++ b/src/include/txn.i @@ -57,7 +57,7 @@ __wt_txn_visible(WT_SESSION_IMPL *session, wt_txnid_t id) /* Nobody sees the results of aborted transactions. */ if (id == WT_TXN_INVALID) - return 0; + return (0); /* * Changes with no associated transaction are always visible, and @@ -65,34 +65,35 @@ __wt_txn_visible(WT_SESSION_IMPL *session, wt_txnid_t id) */ txn = &session->txn; if (id == WT_TXN_NONE || txn->isolation != TXN_ISO_SNAPSHOT) - return 1; + return (1); /* * The snapshot test. */ if (TXNID_LT(id, txn->snap_min)) - return 1; + return (1); if (TXNID_LT(txn->id, id)) - return 0; + return (0); /* * Otherwise, the ID is visible if it is not the result of a concurrent - * transaction. That is, if it is not in the snapshot list. + * transaction. That is, if it is not in the snapshot list. Fast path + * the single-threaded case where there are no concurrent transactions. */ - return (bsearch(&id, txn->snapshot, txn->snapshot_count, - sizeof(wt_txnid_t), __wt_txnid_cmp) == NULL); + return (txn->snapshot_count == 0 || bsearch(&id, txn->snapshot, + txn->snapshot_count, sizeof(wt_txnid_t), __wt_txnid_cmp) == NULL); } /* - * __wt_txn_read_int -- + * __wt_txn_read_skip -- * Get the first visible update in a list (or NULL if none are visible), * and report whether uncommitted changes were skipped. */ static inline WT_UPDATE * -__wt_txn_read_int(WT_SESSION_IMPL *session, WT_UPDATE *upd, int *skipp) +__wt_txn_read_skip(WT_SESSION_IMPL *session, WT_UPDATE *upd, int *skipp) { while (upd != NULL && !__wt_txn_visible(session, upd->txnid)) { - if (skipp != NULL && upd->txnid != WT_TXN_NONE) + if (upd->txnid != WT_TXN_NONE) *skipp = 1; upd = upd->next; } @@ -107,7 +108,10 @@ __wt_txn_read_int(WT_SESSION_IMPL *session, WT_UPDATE *upd, int *skipp) static inline WT_UPDATE * __wt_txn_read(WT_SESSION_IMPL *session, WT_UPDATE *upd) { - return (__wt_txn_read_int(session, upd, NULL)); + while (upd != NULL && !__wt_txn_visible(session, upd->txnid)) + upd = upd->next; + + return (upd); } /* diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index fee73e5309a..b8445b072bb 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -768,16 +768,15 @@ struct __wt_session { /*! @name Transactions * @{ */ - /*! Start a transaction in this session. @notyet{transactions} + /*! Start a transaction in this session. * * All cursors opened in this session that support transactional * semantics will operate in the context of the transaction. The * transaction remains active until ended with * WT_SESSION::commit_transaction or WT_SESSION::rollback_transaction. * - * Ignored if a transaction is in progress. - * - * @todo describe nested transactions / savepoints + * Not permitted if a transaction is in progress or cursors are open in + * the session. * * @snippet ex_all.c session begin transaction * @@ -799,12 +798,11 @@ struct __wt_session { */ int __F(begin_transaction)(WT_SESSION *session, const char *config); - /*! Commit the current transaction. @notyet{transactions} - * - * Any cursors opened during the transaction will be closed before - * the commit is processed. + /*! Commit the current transaction. * - * Ignored if no transaction is in progress. + * A transaction must be in progress when this method is called. Any + * cursors opened during the transaction will be closed before the + * commit is processed. * * @snippet ex_all.c session commit transaction * @@ -814,12 +812,11 @@ struct __wt_session { */ int __F(commit_transaction)(WT_SESSION *session, const char *config); - /*! Roll back the current transaction. @notyet{transactions} + /*! Roll back the current transaction. * - * Any cursors opened during the transaction will be closed before - * the rollback is processed. - * - * Ignored if no transaction is in progress. + * A transaction must be in progress when this method is called. Any + * cursors opened during the transaction will be closed before the + * rollback is processed. * * @snippet ex_all.c session rollback transaction * @@ -829,17 +826,17 @@ struct __wt_session { */ int __F(rollback_transaction)(WT_SESSION *session, const char *config); - /*! Flush the cache and/or the log and optionally archive log files. - * @notyet{checkpoint} + /*! Write a transactionally consistent snapshot of a database. + * + * All data files in the database are updated with snapshots that + * reflect the transactions committed before the checkpoint starts. * * @snippet ex_all.c session checkpoint * * @param session the session handle * @configstart{session.checkpoint, see dist/api_data.py} - * @config{archive, remove log files no longer required for - * transactional durability.,a boolean flag; default \c false.} - * @config{snapshot, if non-empty\, create a named snapshot.,a string; - * default empty.} + * @config{snapshot, if non-empty\, create named snapshots in files.,a + * string; default empty.} * @configend * @errors */ @@ -927,7 +924,7 @@ struct __wt_connection { int __F(add_data_source)(WT_CONNECTION *connection, const char *prefix, WT_DATA_SOURCE *data_source, const char *config); - /*! Add a custom collation function. @notyet{custom collation} + /*! Add a custom collation function. * * The application must first implement the WT_COLLATOR interface and * then register the implementation with WiredTiger: diff --git a/src/meta/meta_apply.c b/src/meta/meta_apply.c index ca11eab1655..60a2659953e 100644 --- a/src/meta/meta_apply.c +++ b/src/meta/meta_apply.c @@ -8,8 +8,8 @@ #include "wt_internal.h" /* - * wt_metadata_get -- - * Public entry point to __wt_metadata_read (for wt dump and list). + * __wt_meta_btree_apply -- + * Apply a function to all files listed in the metadata. */ int __wt_meta_btree_apply(WT_SESSION_IMPL *session, @@ -29,14 +29,15 @@ __wt_meta_btree_apply(WT_SESSION_IMPL *session, if (tret == 0 && cmp < 0) tret = cursor->next(cursor); for (; tret == 0; tret = cursor->next(cursor)) { - cursor->get_key(cursor, &uri); + if ((tret = cursor->get_key(cursor, &uri)) != 0) + break; if (!WT_PREFIX_MATCH(uri, "file:")) break; else if (strcmp(uri, WT_METADATA_URI) == 0) continue; - if ((ret = + if ((tret = __wt_session_get_btree(session, uri, NULL, flags)) != 0) { - WT_TRET(ret); + WT_TRET(tret); continue; } WT_TRET(func(session, cfg)); diff --git a/src/txn/txn.c b/src/txn/txn.c index 4086dd8074e..3be979e6e3e 100644 --- a/src/txn/txn.c +++ b/src/txn/txn.c @@ -24,7 +24,7 @@ __wt_txnid_cmp(const void *v1, const void *v2) /* * __wt_txn_get_snapshot -- - * Set up a snapshot in the current transaction. + * Set up a snapshot in the current transaction, without allocating an ID. */ int __wt_txn_get_snapshot(WT_SESSION_IMPL *session) @@ -33,6 +33,7 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) WT_TXN *txn; WT_TXN_GLOBAL *txn_global; wt_txnid_t id, *lastid; + uint32_t i, session_cnt; conn = S2C(session); txn = &session->txn; @@ -43,15 +44,16 @@ __wt_txn_get_snapshot(WT_SESSION_IMPL *session) id = txn_global->current; /* Copy the array of concurrent transactions. */ - memcpy(txn->snapshot, txn_global->ids, - sizeof(wt_txnid_t) * conn->session_size); + WT_ORDERED_READ(session_cnt, conn->session_cnt); + for (i = 0; i < session_cnt; i++) + txn->snapshot[i] = txn_global->ids[i]; } while (txn_global->current != id); /* Sort the snapshot and size for faster searching. */ - qsort(txn->snapshot, conn->session_size, sizeof(wt_txnid_t), + qsort(txn->snapshot, session_cnt, sizeof(wt_txnid_t), + __wt_txnid_cmp); + lastid = bsearch(&id, txn->snapshot, session_cnt, sizeof(wt_txnid_t), __wt_txnid_cmp); - lastid = bsearch(&id, txn->snapshot, conn->session_size, - sizeof(wt_txnid_t), __wt_txnid_cmp); WT_ASSERT(session, lastid != NULL); while (lastid > txn->snapshot && lastid[-1] == lastid[0]) --lastid; @@ -73,6 +75,7 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]) WT_TXN *txn; WT_TXN_GLOBAL *txn_global; wt_txnid_t *lastid; + uint32_t i, session_cnt; conn = S2C(session); txn = &session->txn; @@ -95,18 +98,20 @@ __wt_txn_begin(WT_SESSION_IMPL *session, const char *cfg[]) txn->id = txn_global->current; WT_PUBLISH(txn_global->ids[session->id], txn->id); - if (txn->isolation == TXN_ISO_SNAPSHOT) + if (txn->isolation == TXN_ISO_SNAPSHOT) { /* Copy the array of concurrent transactions. */ - memcpy(txn->snapshot, txn_global->ids, - sizeof(wt_txnid_t) * conn->session_size); + WT_ORDERED_READ(session_cnt, conn->session_cnt); + for (i = 0; i < conn->session_cnt; i++) + txn->snapshot[i] = txn_global->ids[i]; + } } while (!WT_ATOMIC_CAS(txn_global->current, txn->id, txn->id + 1) || txn->id == WT_TXN_NONE || txn->id == WT_TXN_INVALID); if (txn->isolation == TXN_ISO_SNAPSHOT) { /* Sort the snapshot and size for faster searching. */ - qsort(txn->snapshot, conn->session_size, sizeof(wt_txnid_t), + qsort(txn->snapshot, session_cnt, sizeof(wt_txnid_t), __wt_txnid_cmp); - lastid = bsearch(&txn->id, txn->snapshot, conn->session_size, + lastid = bsearch(&txn->id, txn->snapshot, session_cnt, sizeof(wt_txnid_t), __wt_txnid_cmp); WT_ASSERT(session, lastid != NULL); while (lastid > txn->snapshot && lastid[-1] == lastid[0]) @@ -235,7 +240,7 @@ __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) /* * If we're doing an ordinary unnamed checkpoint, we only need to flush - * open files. If we're creating a named snapshot, we need to walk the + * open files. If we're creating a named snapshot, we need to walk the * entire list of files in the metadata. */ WT_TRET((snapshot == NULL) ? |