From 9c7823141754c8099b784867d2dbed6f2dd28562 Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Fri, 18 Sep 2015 02:08:10 +0000 Subject: WT-2114 Update application eviction loop. Have it use a global evict count rather than a count of evictions done by this thread. --- src/evict/evict_lru.c | 35 +++++++++++++++-------------------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index e0866521c7c..4772a42db16 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -1454,14 +1454,12 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, u_int pct_full) WT_DECL_RET; WT_TXN_GLOBAL *txn_global; WT_TXN_STATE *txn_state; - int count, q_found, txn_busy; + int txn_busy; + uint64_t init_evict_count, max_pages_evicted; conn = S2C(session); cache = conn->cache; - /* First, wake the eviction server. */ - WT_RET(__wt_evict_server_wake(session)); - /* * If the current transaction is keeping the oldest ID pinned, it is in * the middle of an operation. This may prevent the oldest ID from @@ -1481,15 +1479,20 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, u_int pct_full) busy = 1; } + /* Wake the eviction server if we need to do work. */ + WT_RET(__wt_evict_server_wake(session)); + /* * If we're busy, either because of the transaction check we just did, * or because our caller is waiting on a longer-than-usual event (such * as a page read), limit the work to a single eviction and return. If * that's not the case, we can do more. */ - count = busy ? 1 : 10; + init_evict_count = cache->pages_evict; for (;;) { + max_pages_evicted = busy ? 10 : 50; + /* * A pathological case: if we're the oldest transaction in the * system and the eviction server is stuck trying to find space, @@ -1503,16 +1506,16 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, u_int pct_full) return (WT_ROLLBACK); } + /* See if eviction is still needed. 
*/ + if (!__wt_eviction_needed(session, NULL) || + cache->pages_evict > init_evict_count + max_pages_evicted) + return (0); + /* Evict a page. */ - q_found = 0; switch (ret = __evict_page(session, 0)) { case 0: cache->app_evicts++; - if (--count == 0) - return (0); - - q_found = 1; - break; + /* Fallthrough */ case EBUSY: continue; case WT_NOTFOUND: @@ -1521,14 +1524,6 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, u_int pct_full) return (ret); } - /* See if eviction is still needed. */ - if (!__wt_eviction_needed(session, NULL)) - return (0); - - /* If we found pages in the eviction queue, continue there. */ - if (q_found) - continue; - /* Wait for the queue to re-populate before trying again. */ WT_RET( __wt_cond_wait(session, cache->evict_waiter_cond, 100000)); @@ -1537,7 +1532,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, u_int pct_full) /* Check if things have changed so that we are busy. */ if (!busy && txn_state->snap_min != WT_TXN_NONE && txn_global->current != txn_global->oldest_id) - busy = count = 1; + busy = 1; } /* NOTREACHED */ } -- cgit v1.2.1 From 6e8c93f3c397094649eeff35f7aebd81bb1c0a8d Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Fri, 18 Sep 2015 07:21:54 +0000 Subject: Further tweaks to evict wait function. These are more experimental, they deserve discussion prior to merge. 
--- src/evict/evict_lru.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 4772a42db16..859898d9edb 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -1473,11 +1473,11 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, u_int pct_full) session->nhazard > 0 || (txn_state->snap_min != WT_TXN_NONE && txn_global->current != txn_global->oldest_id); - if (txn_busy) { - if (pct_full < 100) - return (0); - busy = 1; - } + if (txn_busy && pct_full < 100) + return (0); + + if (busy == 1) + txn_busy = 1; /* Wake the eviction server if we need to do work. */ WT_RET(__wt_evict_server_wake(session)); @@ -1491,7 +1491,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, u_int pct_full) init_evict_count = cache->pages_evict; for (;;) { - max_pages_evicted = busy ? 10 : 50; + max_pages_evicted = txn_busy ? 5 : 20; /* * A pathological case: if we're the oldest transaction in the @@ -1515,24 +1515,25 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, u_int pct_full) switch (ret = __evict_page(session, 0)) { case 0: cache->app_evicts++; + if (txn_busy) + return (0); /* Fallthrough */ case EBUSY: - continue; + break; case WT_NOTFOUND: + /* Allow the queue to re-populate before retrying. */ + WT_RET(__wt_cond_wait( + session, cache->evict_waiter_cond, 100000)); + cache->app_waits++; break; default: return (ret); } - /* Wait for the queue to re-populate before trying again. */ - WT_RET( - __wt_cond_wait(session, cache->evict_waiter_cond, 100000)); - - cache->app_waits++; - /* Check if things have changed so that we are busy. */ + /* Check if we have become busy. 
*/ if (!busy && txn_state->snap_min != WT_TXN_NONE && txn_global->current != txn_global->oldest_id) - busy = 1; + txn_busy = 1; } /* NOTREACHED */ } -- cgit v1.2.1 From 30824b710849574b9d6644193d86f447fa2af3cc Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Mon, 21 Sep 2015 15:03:40 +1000 Subject: WT-2114 Fix a bug testing against wrong variable. --- src/evict/evict_lru.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 859898d9edb..e69752422fe 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -1531,7 +1531,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, int busy, u_int pct_full) } /* Check if we have become busy. */ - if (!busy && txn_state->snap_min != WT_TXN_NONE && + if (!txn_busy && txn_state->snap_min != WT_TXN_NONE && txn_global->current != txn_global->oldest_id) txn_busy = 1; } -- cgit v1.2.1 From 5bcbed387bd398b87f8fab0c47d04ed4ffb640e9 Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Thu, 24 Sep 2015 07:18:16 +0000 Subject: WT-2131 Switch to using a lock to control page splits. We used to spin on setting an atomic flag, but we saw cases where that wasn't fair (and burned CPU). This implementation adds a lock into the page structure - we could probably find a better place for that if the change is a general improvement. --- src/btree/bt_compact.c | 14 ++++++++++++-- src/btree/bt_page.c | 2 ++ src/btree/bt_split.c | 45 ++++++++++++++++++--------------------------- src/include/btmem.h | 6 ++++++ src/reconcile/rec_write.c | 10 ++++++++-- 5 files changed, 46 insertions(+), 31 deletions(-) diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c index f7997d8b909..a9707ea2371 100644 --- a/src/btree/bt_compact.c +++ b/src/btree/bt_compact.c @@ -55,10 +55,20 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) * The page's modification information can change underfoot if * the page is being reconciled, serialize with reconciliation. 
*/ - F_CAS_ATOMIC_WAIT(page, WT_PAGE_RECONCILIATION); + if (WT_PAGE_IS_INTERNAL(page)) + WT_RET(__wt_writelock( + session, page->pg_intl_split_lock)); + else + F_CAS_ATOMIC_WAIT(page, WT_PAGE_RECONCILIATION); + ret = bm->compact_page_skip(bm, session, mod->mod_replace.addr, mod->mod_replace.size, skipp); - F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION); + + if (WT_PAGE_IS_INTERNAL(page)) + WT_TRET(__wt_writeunlock( + session, page->pg_intl_split_lock)); + else + F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION); WT_RET(ret); } return (0); diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c index 34964def54c..82b57c6f9e7 100644 --- a/src/btree/bt_page.c +++ b/src/btree/bt_page.c @@ -94,6 +94,8 @@ __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, session, &pindex->index[i])); size += sizeof(WT_REF); } + WT_ERR(__wt_rwlock_alloc(session, + &page->pg_intl_split_lock, "internal page split lock")); if (0) { err: if ((pindex = WT_INTL_INDEX_GET_SAFE(page)) != NULL) { for (i = 0; i < pindex->entries; ++i) diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 8843cf7baa7..e064cd00e0e 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -866,6 +866,18 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, parent_entries = 0; complete = hazard = false; + /* + * A checkpoint reconciling this parent page can deadlock with + * our split. We have an exclusive page lock on the child before + * we acquire the page's reconciliation lock, and reconciliation + * acquires the page's reconciliation lock before it encounters + * the child's exclusive lock (which causes reconciliation to + * loop until the exclusive lock is resolved). If we want to split + * the parent, give up to avoid that deadlock. + */ + if (S2BT(session)->checkpointing) + return (EBUSY); + /* * Get a page-level lock on the parent to single-thread splits into the * page because we need to single-thread sizing/growing the page index. 
@@ -883,32 +895,11 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, */ for (;;) { parent = ref->home; - F_CAS_ATOMIC(parent, WT_PAGE_RECONCILIATION, ret); - if (ret == 0) { - /* - * We can race with another thread deepening our parent. - * To deal with that, read the parent pointer each time - * we try to lock it, and check it's still correct after - * it's locked. - */ - if (parent == ref->home) - break; - F_CLR_ATOMIC(parent, WT_PAGE_RECONCILIATION); - continue; - } - - /* - * A checkpoint reconciling this parent page can deadlock with - * our split. We have an exclusive page lock on the child before - * we acquire the page's reconciliation lock, and reconciliation - * acquires the page's reconciliation lock before it encounters - * the child's exclusive lock (which causes reconciliation to - * loop until the exclusive lock is resolved). If we can't lock - * the parent, give up to avoid that deadlock. - */ - if (S2BT(session)->checkpointing) - return (EBUSY); - __wt_yield(); + WT_RET(__wt_writelock(session, parent->u.intl.split_lock)); + if (parent == ref->home) + break; + /* Try again if the page deepened while we were waiting */ + WT_RET(__wt_writeunlock(session, parent->pg_intl_split_lock)); } /* @@ -1138,7 +1129,7 @@ err: if (!complete) if (next_ref->state == WT_REF_SPLIT) next_ref->state = WT_REF_DELETED; } - F_CLR_ATOMIC(parent, WT_PAGE_RECONCILIATION); + WT_TRET(__wt_writeunlock(session, parent->pg_intl_split_lock)); if (hazard) WT_TRET(__wt_hazard_clear(session, parent)); diff --git a/src/include/btmem.h b/src/include/btmem.h index e2a1d4269ee..4910893495e 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -451,12 +451,18 @@ struct __wt_page { */ uint32_t deepen_split_append; uint32_t deepen_split_last; + /* + * Used to protect and fairly co-ordinate splits into + * this page. 
+ */ + WT_RWLOCK *split_lock; } intl; #undef pg_intl_recno #define pg_intl_recno u.intl.recno #define pg_intl_parent_ref u.intl.parent_ref #define pg_intl_deepen_split_append u.intl.deepen_split_append #define pg_intl_deepen_split_last u.intl.deepen_split_last +#define pg_intl_split_lock u.intl.split_lock /* * Macros to copy/set the index because the name is obscured to ensure diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index 3678429d0ff..4718e26de52 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -394,7 +394,10 @@ __wt_reconcile(WT_SESSION_IMPL *session, * In-memory splits: reconciliation of an internal page cannot handle * a child page splitting during the reconciliation. */ - F_CAS_ATOMIC_WAIT(page, WT_PAGE_RECONCILIATION); + if (WT_PAGE_IS_INTERNAL(page)) + WT_RET(__wt_writelock(session, page->pg_intl_split_lock)); + else + F_CAS_ATOMIC_WAIT(page, WT_PAGE_RECONCILIATION); /* Reconcile the page. */ switch (page->type) { @@ -432,7 +435,10 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_TRET(__rec_write_wrapup_err(session, r, page)); /* Release the reconciliation lock. */ - F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION); + if (WT_PAGE_IS_INTERNAL(page)) + WT_TRET(__wt_writeunlock(session, page->pg_intl_split_lock)); + else + F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION); /* Update statistics. */ WT_STAT_FAST_CONN_INCR(session, rec_pages); -- cgit v1.2.1 From ce1fc84bfe5914e0dd3cf9b715cece89ad684f88 Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Mon, 28 Sep 2015 01:41:43 +0000 Subject: WT-2131 Add a new fair-lock type. That is a simple ticket based lock implementation. Switch the page lock from a read/write lock to a fair lock. 
--- src/btree/bt_compact.c | 8 ++++---- src/btree/bt_page.c | 2 -- src/btree/bt_split.c | 6 +++--- src/include/btmem.h | 2 +- src/include/mutex.h | 9 +++++++++ src/include/mutex.i | 51 +++++++++++++++++++++++++++++++++++++++++++++++ src/include/wt_internal.h | 2 ++ src/reconcile/rec_write.c | 4 ++-- 8 files changed, 72 insertions(+), 12 deletions(-) diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c index a9707ea2371..bcd24b8bc0b 100644 --- a/src/btree/bt_compact.c +++ b/src/btree/bt_compact.c @@ -56,8 +56,8 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) * the page is being reconciled, serialize with reconciliation. */ if (WT_PAGE_IS_INTERNAL(page)) - WT_RET(__wt_writelock( - session, page->pg_intl_split_lock)); + WT_RET(__wt_fair_lock( + session, &page->pg_intl_split_lock)); else F_CAS_ATOMIC_WAIT(page, WT_PAGE_RECONCILIATION); @@ -65,8 +65,8 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) mod->mod_replace.addr, mod->mod_replace.size, skipp); if (WT_PAGE_IS_INTERNAL(page)) - WT_TRET(__wt_writeunlock( - session, page->pg_intl_split_lock)); + WT_TRET(__wt_fair_unlock( + session, &page->pg_intl_split_lock)); else F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION); WT_RET(ret); diff --git a/src/btree/bt_page.c b/src/btree/bt_page.c index 82b57c6f9e7..34964def54c 100644 --- a/src/btree/bt_page.c +++ b/src/btree/bt_page.c @@ -94,8 +94,6 @@ __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, session, &pindex->index[i])); size += sizeof(WT_REF); } - WT_ERR(__wt_rwlock_alloc(session, - &page->pg_intl_split_lock, "internal page split lock")); if (0) { err: if ((pindex = WT_INTL_INDEX_GET_SAFE(page)) != NULL) { for (i = 0; i < pindex->entries; ++i) diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index b04b0009f9b..90f90fa57f5 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -890,11 +890,11 @@ __split_parent_lock( */ for (;;) { parent = ref->home; - WT_RET(__wt_writelock(session, 
parent->u.intl.split_lock)); + WT_RET(__wt_fair_lock(session, &parent->pg_intl_split_lock)); if (parent == ref->home) break; /* Try again if the page deepened while we were waiting */ - WT_RET(__wt_writeunlock(session, parent->pg_intl_split_lock)); + WT_RET(__wt_fair_unlock(session, &parent->pg_intl_split_lock)); } /* @@ -1174,7 +1174,7 @@ err: if (!complete) if (next_ref->state == WT_REF_SPLIT) next_ref->state = WT_REF_DELETED; } - WT_TRET(__wt_writeunlock(session, parent->pg_intl_split_lock)); + WT_TRET(__wt_fair_unlock(session, &parent->pg_intl_split_lock)); __wt_free_ref_index(session, NULL, alloc_index, false); diff --git a/src/include/btmem.h b/src/include/btmem.h index 22157e3c398..374d970acd0 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -453,7 +453,7 @@ struct __wt_page { * Used to protect and fairly co-ordinate splits into * this page. */ - WT_RWLOCK *split_lock; + WT_FAIR_LOCK split_lock; } intl; #undef pg_intl_recno #define pg_intl_recno u.intl.recno diff --git a/src/include/mutex.h b/src/include/mutex.h index 1f1bb8f4b5c..ed9f2b2aec0 100644 --- a/src/include/mutex.h +++ b/src/include/mutex.h @@ -51,6 +51,15 @@ struct __wt_rwlock { wt_rwlock_t rwlock; /* Read/write lock */ }; +/* + * A light weight lock that can be used to replace spinlocks if fairness is + * necessasry. Implements a ticket-based backoff spin lock. + */ +struct __wt_fair_lock { + uint16_t owner; /* Ticket number for current owner */ + uint16_t waiter; /* Ticket number of last allocated */ +}; + /* * Spin locks: * diff --git a/src/include/mutex.i b/src/include/mutex.i index 5ea4583a2ab..efc8f588853 100644 --- a/src/include/mutex.i +++ b/src/include/mutex.i @@ -251,3 +251,54 @@ __wt_spin_unlock(WT_SESSION_IMPL *session, WT_SPINLOCK *t) #error Unknown spinlock type #endif + +/* + * __wt_fair_lock -- + * Get a lock. 
+ */ +static inline int +__wt_fair_lock(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock) +{ + uint16_t ticket; + int pause_cnt; + + WT_UNUSED(session); + + /* + * Possibly wrap: if we have more than 64K lockers waiting, the ticket + * value will wrap and two lockers will simultaneously be granted the + * lock. + */ + ticket = __wt_atomic_fetch_add16(&lock->waiter, 1); + for (pause_cnt = 0; ticket != lock->owner;) { + /* + * We failed to get the lock; pause before retrying and if we've + * paused enough, sleep so we don't burn CPU to no purpose. This + * situation happens if there are more threads than cores in the + * system and we're thrashing on shared resources. + */ + if (++pause_cnt < 1000) + WT_PAUSE(); + else + __wt_sleep(0, 10); + } + + return (0); +} + +/* + * __wt_fair_unlock -- + * Release a shared lock. + */ +static inline int +__wt_fair_unlock(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock) +{ + WT_UNUSED(session); + + /* + * We have exclusive access - the update does not need to be atomic. + */ + ++lock->owner; + + return (0); +} diff --git a/src/include/wt_internal.h b/src/include/wt_internal.h index 4d46a25b63c..3f4e0ada7f1 100644 --- a/src/include/wt_internal.h +++ b/src/include/wt_internal.h @@ -164,6 +164,8 @@ struct __wt_ext; typedef struct __wt_ext WT_EXT; struct __wt_extlist; typedef struct __wt_extlist WT_EXTLIST; +struct __wt_fair_lock; + typedef struct __wt_fair_lock WT_FAIR_LOCK; struct __wt_fh; typedef struct __wt_fh WT_FH; struct __wt_hazard; diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index 5dfcb6c8a92..7b146ecf490 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -395,7 +395,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, * a child page splitting during the reconciliation. 
*/ if (WT_PAGE_IS_INTERNAL(page)) - WT_RET(__wt_writelock(session, page->pg_intl_split_lock)); + WT_RET(__wt_fair_lock(session, &page->pg_intl_split_lock)); else F_CAS_ATOMIC_WAIT(page, WT_PAGE_RECONCILIATION); @@ -436,7 +436,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, /* Release the reconciliation lock. */ if (WT_PAGE_IS_INTERNAL(page)) - WT_TRET(__wt_writeunlock(session, page->pg_intl_split_lock)); + WT_TRET(__wt_fair_unlock(session, &page->pg_intl_split_lock)); else F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION); -- cgit v1.2.1 From b8d17a07e4e209756407a9ca30fe233d551aee84 Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Mon, 28 Sep 2015 01:54:26 +0000 Subject: lint --- src/include/mutex.h | 2 +- src/include/mutex.i | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/include/mutex.h b/src/include/mutex.h index ed9f2b2aec0..cdcd2bd46b9 100644 --- a/src/include/mutex.h +++ b/src/include/mutex.h @@ -53,7 +53,7 @@ struct __wt_rwlock { /* * A light weight lock that can be used to replace spinlocks if fairness is - * necessasry. Implements a ticket-based backoff spin lock. + * necessary. Implements a ticket-based back off spin lock. */ struct __wt_fair_lock { uint16_t owner; /* Ticket number for current owner */ diff --git a/src/include/mutex.i b/src/include/mutex.i index efc8f588853..47d2e24ee73 100644 --- a/src/include/mutex.i +++ b/src/include/mutex.i @@ -262,7 +262,7 @@ __wt_fair_lock(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock) uint16_t ticket; int pause_cnt; - WT_UNUSED(session); + WT_UNUSED(session); /* * Possibly wrap: if we have more than 64K lockers waiting, the ticket @@ -293,7 +293,7 @@ __wt_fair_lock(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock) static inline int __wt_fair_unlock(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock) { - WT_UNUSED(session); + WT_UNUSED(session); /* * We have exclusive access - the update does not need to be atomic. 
-- cgit v1.2.1 From 744696e318c0d8042f32f8ab5d858fa1a8365d69 Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Mon, 28 Sep 2015 04:21:23 +0000 Subject: WT-2131 Switch all WT_PAGE_RECONCILIATION locks to fair locks. The main purpose is to keep a single lock for all page operations. --- src/btree/bt_compact.c | 12 ++---------- src/btree/bt_debug.c | 7 +++++-- src/btree/bt_discard.c | 6 +++++- src/btree/bt_split.c | 9 ++++----- src/include/btmem.h | 15 +++++++-------- src/include/mutex.i | 20 ++++++++++++++++++++ src/include/serial.i | 5 ++--- src/reconcile/rec_write.c | 10 ++-------- 8 files changed, 47 insertions(+), 37 deletions(-) diff --git a/src/btree/bt_compact.c b/src/btree/bt_compact.c index bcd24b8bc0b..35326b6f272 100644 --- a/src/btree/bt_compact.c +++ b/src/btree/bt_compact.c @@ -55,20 +55,12 @@ __compact_rewrite(WT_SESSION_IMPL *session, WT_REF *ref, bool *skipp) * The page's modification information can change underfoot if * the page is being reconciled, serialize with reconciliation. 
*/ - if (WT_PAGE_IS_INTERNAL(page)) - WT_RET(__wt_fair_lock( - session, &page->pg_intl_split_lock)); - else - F_CAS_ATOMIC_WAIT(page, WT_PAGE_RECONCILIATION); + WT_RET(__wt_fair_lock(session, &page->page_lock)); ret = bm->compact_page_skip(bm, session, mod->mod_replace.addr, mod->mod_replace.size, skipp); - if (WT_PAGE_IS_INTERNAL(page)) - WT_TRET(__wt_fair_unlock( - session, &page->pg_intl_split_lock)); - else - F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION); + WT_TRET(__wt_fair_unlock(session, &page->page_lock)); WT_RET(ret); } return (0); diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c index 4d3add47052..957955ee502 100644 --- a/src/btree/bt_debug.c +++ b/src/btree/bt_debug.c @@ -645,11 +645,14 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page) __dmsg(ds, ", disk-mapped"); if (F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)) __dmsg(ds, ", evict-lru"); - if (F_ISSET_ATOMIC(page, WT_PAGE_RECONCILIATION)) - __dmsg(ds, ", reconciliation"); if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT)) __dmsg(ds, ", split-insert"); + if (__wt_fair_trylock(session, &page->page_lock) != 0) + __dmsg(ds, ", locked"); + else + WT_RET(__wt_fair_unlock(session, &page->page_lock)); + if (mod != NULL) switch (mod->rec_result) { case WT_PM_REC_EMPTY: diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c index c27d42d38f4..c3da536170d 100644 --- a/src/btree/bt_discard.c +++ b/src/btree/bt_discard.c @@ -55,7 +55,11 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) */ WT_ASSERT(session, !__wt_page_is_modified(page)); WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)); - WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_RECONCILIATION)); + /* + * This will leave the page lock held - but that doesn't matter - we + * are about to free the memory anyway. 
+ */ + WT_ASSERT(session, __wt_fair_trylock(session, &page->page_lock) == 0); #ifdef HAVE_DIAGNOSTIC { diff --git a/src/btree/bt_split.c b/src/btree/bt_split.c index 90f90fa57f5..620603dcf8e 100644 --- a/src/btree/bt_split.c +++ b/src/btree/bt_split.c @@ -890,11 +890,11 @@ __split_parent_lock( */ for (;;) { parent = ref->home; - WT_RET(__wt_fair_lock(session, &parent->pg_intl_split_lock)); + WT_RET(__wt_fair_lock(session, &parent->page_lock)); if (parent == ref->home) break; /* Try again if the page deepened while we were waiting */ - WT_RET(__wt_fair_unlock(session, &parent->pg_intl_split_lock)); + WT_RET(__wt_fair_unlock(session, &parent->page_lock)); } /* @@ -917,7 +917,7 @@ __split_parent_lock( *parentp = parent; return (0); -err: F_CLR_ATOMIC(parent, WT_PAGE_RECONCILIATION); +err: WT_TRET(__wt_fair_unlock(session, &parent->page_lock)); return (ret); } @@ -933,7 +933,7 @@ __split_parent_unlock(WT_SESSION_IMPL *session, WT_PAGE *parent, bool hazard) if (hazard) ret = __wt_hazard_clear(session, parent); - F_CLR_ATOMIC(parent, WT_PAGE_RECONCILIATION); + WT_TRET(__wt_fair_unlock(session, &parent->page_lock)); return (ret); } @@ -1174,7 +1174,6 @@ err: if (!complete) if (next_ref->state == WT_REF_SPLIT) next_ref->state = WT_REF_DELETED; } - WT_TRET(__wt_fair_unlock(session, &parent->pg_intl_split_lock)); __wt_free_ref_index(session, NULL, alloc_index, false); diff --git a/src/include/btmem.h b/src/include/btmem.h index 374d970acd0..04f8b77f84a 100644 --- a/src/include/btmem.h +++ b/src/include/btmem.h @@ -449,18 +449,12 @@ struct __wt_page { */ uint32_t deepen_split_append; uint32_t deepen_split_last; - /* - * Used to protect and fairly co-ordinate splits into - * this page. 
- */ - WT_FAIR_LOCK split_lock; } intl; #undef pg_intl_recno #define pg_intl_recno u.intl.recno #define pg_intl_parent_ref u.intl.parent_ref #define pg_intl_deepen_split_append u.intl.deepen_split_append #define pg_intl_deepen_split_last u.intl.deepen_split_last -#define pg_intl_split_lock u.intl.split_lock /* * Macros to copy/set the index because the name is obscured to ensure @@ -585,8 +579,7 @@ struct __wt_page { #define WT_PAGE_DISK_ALLOC 0x02 /* Disk image in allocated memory */ #define WT_PAGE_DISK_MAPPED 0x04 /* Disk image in mapped memory */ #define WT_PAGE_EVICT_LRU 0x08 /* Page is on the LRU queue */ -#define WT_PAGE_RECONCILIATION 0x10 /* Page reconciliation lock */ -#define WT_PAGE_SPLIT_INSERT 0x20 /* A leaf page was split for append */ +#define WT_PAGE_SPLIT_INSERT 0x10 /* A leaf page was split for append */ uint8_t flags_atomic; /* Atomic flags, use F_*_ATOMIC */ /* @@ -610,6 +603,12 @@ struct __wt_page { #define WT_READGEN_STEP 100 uint64_t read_gen; + /* + * Used to protect and co-ordinate splits for internal pages and + * reconciliation for all pages. + */ + WT_FAIR_LOCK page_lock; + size_t memory_footprint; /* Memory attached to the page */ /* Page's on-disk representation: NULL for pages created in memory. */ diff --git a/src/include/mutex.i b/src/include/mutex.i index 47d2e24ee73..1f6445c5315 100644 --- a/src/include/mutex.i +++ b/src/include/mutex.i @@ -252,6 +252,26 @@ __wt_spin_unlock(WT_SESSION_IMPL *session, WT_SPINLOCK *t) #endif +/* + * __wt_fair_trylock -- + * Try to get a lock - give up if it is not immediately available. + */ +static inline int +__wt_fair_trylock(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock) +{ + uint16_t ticket; + + WT_UNUSED(session); + + /* Do the cheap test first. */ + if (lock->waiter != lock->owner) + return (EBUSY); + + ticket = lock->owner; + return (__wt_atomic_cas16( + &lock->waiter, ticket, ticket + 1) ? 0 : EBUSY); +} + /* * __wt_fair_lock -- * Get a lock. 
diff --git a/src/include/serial.i b/src/include/serial.i index 5358b874c06..ca22ce12d81 100644 --- a/src/include/serial.i +++ b/src/include/serial.i @@ -316,12 +316,11 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page, } /* If we can't lock it, don't scan, that's okay. */ - F_CAS_ATOMIC(page, WT_PAGE_RECONCILIATION, ret); - if (ret != 0) + if (__wt_fair_trylock(session, &page->page_lock) != 0) return (0); obsolete = __wt_update_obsolete_check(session, page, upd->next); - F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION); + WT_RET(__wt_fair_unlock(session, &page->page_lock)); if (obsolete != NULL) __wt_update_obsolete_free(session, page, obsolete); diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index 7b146ecf490..fb1634b46d8 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -394,10 +394,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, * In-memory splits: reconciliation of an internal page cannot handle * a child page splitting during the reconciliation. */ - if (WT_PAGE_IS_INTERNAL(page)) - WT_RET(__wt_fair_lock(session, &page->pg_intl_split_lock)); - else - F_CAS_ATOMIC_WAIT(page, WT_PAGE_RECONCILIATION); + WT_RET(__wt_fair_lock(session, &page->page_lock)); /* Reconcile the page. */ switch (page->type) { @@ -435,10 +432,7 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_TRET(__rec_write_wrapup_err(session, r, page)); /* Release the reconciliation lock. */ - if (WT_PAGE_IS_INTERNAL(page)) - WT_TRET(__wt_fair_unlock(session, &page->pg_intl_split_lock)); - else - F_CLR_ATOMIC(page, WT_PAGE_RECONCILIATION); + WT_TRET(__wt_fair_unlock(session, &page->page_lock)); /* Update statistics. */ WT_STAT_FAST_CONN_INCR(session, rec_pages); -- cgit v1.2.1 From 6d7678f0c92f310139b53427f1fa6d0bea3857ae Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Mon, 28 Sep 2015 05:14:33 +0000 Subject: WT-2131 Fix __wt_fair_trylock implementation. 
--- src/include/hardware.h | 10 ---------- src/include/mutex.h | 11 +++++++++-- src/include/mutex.i | 21 ++++++++++++--------- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/include/hardware.h b/src/include/hardware.h index 32353072c5b..c9b72f8a609 100644 --- a/src/include/hardware.h +++ b/src/include/hardware.h @@ -50,16 +50,6 @@ &(p)->flags_atomic, __orig, __orig | (uint8_t)(mask))); \ } while (0) -#define F_CAS_ATOMIC_WAIT(p, mask) do { \ - int __ret; \ - for (;;) { \ - F_CAS_ATOMIC(p, mask, __ret); \ - if (__ret == 0) \ - break; \ - __wt_yield(); \ - } \ -} while (0) - #define F_CLR_ATOMIC(p, mask) do { \ uint8_t __orig; \ do { \ diff --git a/src/include/mutex.h b/src/include/mutex.h index cdcd2bd46b9..f497a30c20b 100644 --- a/src/include/mutex.h +++ b/src/include/mutex.h @@ -56,8 +56,15 @@ struct __wt_rwlock { * necessary. Implements a ticket-based back off spin lock. */ struct __wt_fair_lock { - uint16_t owner; /* Ticket number for current owner */ - uint16_t waiter; /* Ticket number of last allocated */ + union { + uint32_t lock; /* Used when needing to atomically switch */ + struct { + uint16_t owner; /* Ticket number for current owner */ + uint16_t waiter;/* Ticket number of last allocated */ + } s; + } u; +#define fair_lock_owner u.s.owner +#define fair_lock_waiter u.s.waiter }; /* diff --git a/src/include/mutex.i b/src/include/mutex.i index 1f6445c5315..befa8d4dda8 100644 --- a/src/include/mutex.i +++ b/src/include/mutex.i @@ -259,17 +259,20 @@ __wt_spin_unlock(WT_SESSION_IMPL *session, WT_SPINLOCK *t) static inline int __wt_fair_trylock(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock) { - uint16_t ticket; + WT_FAIR_LOCK new, old; WT_UNUSED(session); - /* Do the cheap test first. */ - if (lock->waiter != lock->owner) + old = new = *lock; + + /* Exit early if there is no chance we can get the lock. 
*/ + if (old.fair_lock_waiter != old.fair_lock_owner) return (EBUSY); - ticket = lock->owner; - return (__wt_atomic_cas16( - &lock->waiter, ticket, ticket + 1) ? 0 : EBUSY); + /* The replacement lock value is a result of allocating a new ticket. */ + ++new.fair_lock_waiter; + return (__wt_atomic_cas32( + &lock->u.lock, old.u.lock, new.u.lock) ? 0 : EBUSY); } /* @@ -289,8 +292,8 @@ __wt_fair_lock(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock) * value will wrap and two lockers will simultaneously be granted the * lock. */ - ticket = __wt_atomic_fetch_add16(&lock->waiter, 1); - for (pause_cnt = 0; ticket != lock->owner;) { + ticket = __wt_atomic_fetch_add16(&lock->fair_lock_waiter, 1); + for (pause_cnt = 0; ticket != lock->fair_lock_owner;) { /* * We failed to get the lock; pause before retrying and if we've * paused enough, sleep so we don't burn CPU to no purpose. This @@ -318,7 +321,7 @@ __wt_fair_unlock(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock) /* * We have exclusive access - the update does not need to be atomic. */ - ++lock->owner; + ++lock->fair_lock_owner; return (0); } -- cgit v1.2.1 From ab5b1b4f8dfc19c1ff9cb1f7ed871bd8c09a6f71 Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Mon, 28 Sep 2015 06:31:48 +0000 Subject: Expand on some comments. --- src/include/mutex.h | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/include/mutex.h b/src/include/mutex.h index f497a30c20b..b67e5e610e8 100644 --- a/src/include/mutex.h +++ b/src/include/mutex.h @@ -54,13 +54,15 @@ struct __wt_rwlock { /* * A light weight lock that can be used to replace spinlocks if fairness is * necessary. Implements a ticket-based back off spin lock. + * The fields are available as a union to allow for atomically setting + * the state of the entire lock. 
*/ struct __wt_fair_lock { union { - uint32_t lock; /* Used when needing to atomically switch */ + uint32_t lock; struct { - uint16_t owner; /* Ticket number for current owner */ - uint16_t waiter;/* Ticket number of last allocated */ + uint16_t owner; /* Ticket for current owner */ + uint16_t waiter; /* Last allocated ticket */ } s; } u; #define fair_lock_owner u.s.owner -- cgit v1.2.1 From cce60f8be381d6d881fd95b97303bcc429b93087 Mon Sep 17 00:00:00 2001 From: Susan LoVerso Date: Tue, 29 Sep 2015 12:52:59 -0400 Subject: WT-2151 Add zero_fill config to manually zero out log files. --- dist/api_data.py | 3 ++ src/config/config_def.c | 69 ++++++++++++++++++++++++---------------------- src/conn/conn_log.c | 6 +++- src/include/connection.h | 1 + src/include/wiredtiger.in | 2 ++ src/log/log.c | 70 ++++++++++++++++++++++++++++++++++++++++++++++- 6 files changed, 116 insertions(+), 35 deletions(-) diff --git a/dist/api_data.py b/dist/api_data.py index 5652edc4ebe..6faee4ddd7f 100644 --- a/dist/api_data.py +++ b/dist/api_data.py @@ -642,6 +642,9 @@ common_wiredtiger_open = [ run recovery or error if recovery needs to run after an unclean shutdown.''', choices=['error','on']), + Config('zero_fill', 'false', r''' + manually write zeroes into log files''', + type='boolean'), ]), Config('mmap', 'true', r''' Use memory mapping to access files when possible''', diff --git a/src/config/config_def.c b/src/config/config_def.c index a3dc24fafc4..990e604cf39 100644 --- a/src/config/config_def.c +++ b/src/config/config_def.c @@ -463,6 +463,7 @@ static const WT_CONFIG_CHECK { "recover", "string", NULL, "choices=[\"error\",\"on\"]", NULL, 0 }, + { "zero_fill", "boolean", NULL, NULL, NULL, 0 }, { NULL, NULL, NULL, NULL, NULL, 0 } }; @@ -517,7 +518,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open[] = { { "hazard_max", "int", NULL, "min=15", NULL, 0 }, { "log", "category", NULL, NULL, - confchk_wiredtiger_open_log_subconfigs, 7 }, + confchk_wiredtiger_open_log_subconfigs, 8 }, { 
"lsm_manager", "category", NULL, NULL, confchk_wiredtiger_open_lsm_manager_subconfigs, 2 }, @@ -592,7 +593,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_all[] = { { "hazard_max", "int", NULL, "min=15", NULL, 0 }, { "log", "category", NULL, NULL, - confchk_wiredtiger_open_log_subconfigs, 7 }, + confchk_wiredtiger_open_log_subconfigs, 8 }, { "lsm_manager", "category", NULL, NULL, confchk_wiredtiger_open_lsm_manager_subconfigs, 2 }, @@ -665,7 +666,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_basecfg[] = { { "hazard_max", "int", NULL, "min=15", NULL, 0 }, { "log", "category", NULL, NULL, - confchk_wiredtiger_open_log_subconfigs, 7 }, + confchk_wiredtiger_open_log_subconfigs, 8 }, { "lsm_manager", "category", NULL, NULL, confchk_wiredtiger_open_lsm_manager_subconfigs, 2 }, @@ -737,7 +738,7 @@ static const WT_CONFIG_CHECK confchk_wiredtiger_open_usercfg[] = { { "hazard_max", "int", NULL, "min=15", NULL, 0 }, { "log", "category", NULL, NULL, - confchk_wiredtiger_open_log_subconfigs, 7 }, + confchk_wiredtiger_open_log_subconfigs, 8 }, { "lsm_manager", "category", NULL, NULL, confchk_wiredtiger_open_lsm_manager_subconfigs, 2 }, @@ -969,13 +970,14 @@ static const WT_CONFIG_ENTRY config_entries[] = { "file_extend=,file_manager=(close_handle_minimum=250," "close_idle_time=30,close_scan_interval=10),hazard_max=1000," "log=(archive=,compressor=,enabled=0,file_max=100MB,path=," - "prealloc=,recover=on),lsm_manager=(merge=,worker_thread_max=4)," - "lsm_merge=,mmap=,multiprocess=0,session_max=100," - "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0," - "reserve=0,size=500MB),statistics=none,statistics_log=(on_close=0" - ",path=\"WiredTigerStat.%d.%H\",sources=," - "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0" - ",method=fsync),use_environment_priv=0,verbose=", + "prealloc=,recover=on,zero_fill=0),lsm_manager=(merge=," + "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0," + 
"session_max=100,session_scratch_max=2MB,shared_cache=(chunk=10MB" + ",name=,quota=0,reserve=0,size=500MB),statistics=none," + "statistics_log=(on_close=0,path=\"WiredTigerStat.%d.%H\"," + "sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," + "transaction_sync=(enabled=0,method=fsync),use_environment_priv=0" + ",verbose=", confchk_wiredtiger_open, 34 }, { "wiredtiger_open_all", @@ -989,14 +991,14 @@ static const WT_CONFIG_ENTRY config_entries[] = { "file_extend=,file_manager=(close_handle_minimum=250," "close_idle_time=30,close_scan_interval=10),hazard_max=1000," "log=(archive=,compressor=,enabled=0,file_max=100MB,path=," - "prealloc=,recover=on),lsm_manager=(merge=,worker_thread_max=4)," - "lsm_merge=,mmap=,multiprocess=0,session_max=100," - "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0," - "reserve=0,size=500MB),statistics=none,statistics_log=(on_close=0" - ",path=\"WiredTigerStat.%d.%H\",sources=," - "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0" - ",method=fsync),use_environment_priv=0,verbose=,version=(major=0," - "minor=0)", + "prealloc=,recover=on,zero_fill=0),lsm_manager=(merge=," + "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0," + "session_max=100,session_scratch_max=2MB,shared_cache=(chunk=10MB" + ",name=,quota=0,reserve=0,size=500MB),statistics=none," + "statistics_log=(on_close=0,path=\"WiredTigerStat.%d.%H\"," + "sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," + "transaction_sync=(enabled=0,method=fsync),use_environment_priv=0" + ",verbose=,version=(major=0,minor=0)", confchk_wiredtiger_open_all, 35 }, { "wiredtiger_open_basecfg", @@ -1009,13 +1011,14 @@ static const WT_CONFIG_ENTRY config_entries[] = { ",extensions=,file_extend=,file_manager=(close_handle_minimum=250" ",close_idle_time=30,close_scan_interval=10),hazard_max=1000," "log=(archive=,compressor=,enabled=0,file_max=100MB,path=," - "prealloc=,recover=on),lsm_manager=(merge=,worker_thread_max=4)," - "lsm_merge=,mmap=,multiprocess=0,session_max=100," - 
"session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0," - "reserve=0,size=500MB),statistics=none,statistics_log=(on_close=0" - ",path=\"WiredTigerStat.%d.%H\",sources=," - "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0" - ",method=fsync),verbose=,version=(major=0,minor=0)", + "prealloc=,recover=on,zero_fill=0),lsm_manager=(merge=," + "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0," + "session_max=100,session_scratch_max=2MB,shared_cache=(chunk=10MB" + ",name=,quota=0,reserve=0,size=500MB),statistics=none," + "statistics_log=(on_close=0,path=\"WiredTigerStat.%d.%H\"," + "sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," + "transaction_sync=(enabled=0,method=fsync),verbose=," + "version=(major=0,minor=0)", confchk_wiredtiger_open_basecfg, 31 }, { "wiredtiger_open_usercfg", @@ -1028,13 +1031,13 @@ static const WT_CONFIG_ENTRY config_entries[] = { ",extensions=,file_extend=,file_manager=(close_handle_minimum=250" ",close_idle_time=30,close_scan_interval=10),hazard_max=1000," "log=(archive=,compressor=,enabled=0,file_max=100MB,path=," - "prealloc=,recover=on),lsm_manager=(merge=,worker_thread_max=4)," - "lsm_merge=,mmap=,multiprocess=0,session_max=100," - "session_scratch_max=2MB,shared_cache=(chunk=10MB,name=,quota=0," - "reserve=0,size=500MB),statistics=none,statistics_log=(on_close=0" - ",path=\"WiredTigerStat.%d.%H\",sources=," - "timestamp=\"%b %d %H:%M:%S\",wait=0),transaction_sync=(enabled=0" - ",method=fsync),verbose=", + "prealloc=,recover=on,zero_fill=0),lsm_manager=(merge=," + "worker_thread_max=4),lsm_merge=,mmap=,multiprocess=0," + "session_max=100,session_scratch_max=2MB,shared_cache=(chunk=10MB" + ",name=,quota=0,reserve=0,size=500MB),statistics=none," + "statistics_log=(on_close=0,path=\"WiredTigerStat.%d.%H\"," + "sources=,timestamp=\"%b %d %H:%M:%S\",wait=0)," + "transaction_sync=(enabled=0,method=fsync),verbose=", confchk_wiredtiger_open_usercfg, 30 }, { NULL, NULL, NULL, 0 } diff --git a/src/conn/conn_log.c 
b/src/conn/conn_log.c index eba0a2769d6..336a195f8e9 100644 --- a/src/conn/conn_log.c +++ b/src/conn/conn_log.c @@ -74,6 +74,10 @@ __logmgr_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp) if (cval.val != 0) FLD_SET(conn->log_flags, WT_CONN_LOG_ARCHIVE); + WT_RET(__wt_config_gets(session, cfg, "log.zero_fill", &cval)); + if (cval.val != 0) + FLD_SET(conn->log_flags, WT_CONN_LOG_ZERO_FILL); + WT_RET(__wt_config_gets(session, cfg, "log.file_max", &cval)); conn->log_file_max = (wt_off_t)cval.val; WT_STAT_FAST_CONN_SET(session, log_max_filesize, conn->log_file_max); @@ -85,7 +89,7 @@ __logmgr_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp) */ if (cval.val != 0) { FLD_SET(conn->log_flags, WT_CONN_LOG_PREALLOC); - conn->log_prealloc = 1; + conn->log_prealloc = 5; } WT_RET(__wt_config_gets_def(session, cfg, "log.recover", 0, &cval)); if (cval.len != 0 && WT_STRING_MATCH("error", cval.str, cval.len)) diff --git a/src/include/connection.h b/src/include/connection.h index 2c20c2f7936..0273414f42e 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -342,6 +342,7 @@ struct __wt_connection_impl { #define WT_CONN_LOG_PREALLOC 0x08 /* Pre-allocation is enabled */ #define WT_CONN_LOG_RECOVER_DONE 0x10 /* Recovery completed */ #define WT_CONN_LOG_RECOVER_ERR 0x20 /* Error if recovery required */ +#define WT_CONN_LOG_ZERO_FILL 0x40 /* Manually zero files */ uint32_t log_flags; /* Global logging configuration */ WT_CONDVAR *log_cond; /* Log server wait mutex */ WT_SESSION_IMPL *log_session; /* Log server session */ diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index 9078a0e2e99..57207a9f2d2 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -2212,6 +2212,8 @@ struct __wt_connection { * @config{    recover, run recovery * or error if recovery needs to run after an unclean shutdown., a string\, * chosen from the following options: \c "error"\, \c "on"; default \c on.} + * @config{    zero_fill, 
manually write zeroes into log + * files., a boolean flag; default \c false.} * @config{ ),,} * @config{lsm_manager = (, configure database wide options for LSM tree * management. The LSM manager is started automatically the first time an LSM diff --git a/src/log/log.c b/src/log/log.c index ca0b81c4cf6..d78cc0c9134 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -356,6 +356,66 @@ __wt_log_extract_lognum( return (0); } +/* + * __log_zero -- + * Zero a log file. + */ +static int +__log_zero(WT_SESSION_IMPL *session, + WT_FH *fh, wt_off_t start_off, wt_off_t len) +{ + WT_CONNECTION_IMPL *conn; + WT_DECL_ITEM(zerobuf); + WT_DECL_RET; + WT_LOG *log; + wt_off_t off, partial; + uint32_t allocsize, bufsz, wrlen; + + conn = S2C(session); + log = conn->log; + allocsize = log->allocsize; + zerobuf = NULL; + if (allocsize < WT_MEGABYTE) + bufsz = WT_MEGABYTE; + else + bufsz = allocsize; + /* + * If they're using smaller log files, cap it at the file size. + */ + if (conn->log_file_max < bufsz) + bufsz = conn->log_file_max; + WT_RET(__wt_scr_alloc(session, bufsz, &zerobuf)); + memset(zerobuf->mem, 0, zerobuf->size); + + /* + * Read in a chunk starting at the end of the file. Keep going until + * we reach the beginning or we find a chunk that contains any non-zero + * bytes. Compare against a known zero byte chunk. + */ + off = start_off; + while (off < len) { + /* + * Typically we start to zero the file after the log header + * and the bufsz is a sector-aligned size. So we want to + * align our writes when we can. + */ + partial = off % (wt_off_t)bufsz; + if (partial != 0) + wrlen = bufsz - partial; + else + wrlen = bufsz; + /* + * Check if we're writing a partial amount at the end too. + */ + if (len - off < bufsz) + wrlen = len - off; + WT_ERR(__wt_write(session, fh, off, wrlen, zerobuf->mem)); + off += wrlen; + } +err: __wt_scr_free(session, &zerobuf); + return (ret); +} + /* * __log_prealloc -- * Pre-allocate a log file. 
@@ -370,7 +430,15 @@ __log_prealloc(WT_SESSION_IMPL *session, WT_FH *fh) conn = S2C(session); log = conn->log; ret = 0; - if (fh->fallocate_available == WT_FALLOCATE_NOT_AVAILABLE || + /* + * If the user configured zero filling, pre-allocate the log file + * manually. Otherwise use either fallocate or ftruncate to create + * and zero the log file based on what is available. + */ + if (F_ISSET(conn, WT_CONN_LOG_ZERO_FILL)) + ret = __log_zero(session, fh, + WT_LOG_FIRST_RECORD, conn->log_file_max); + else if (fh->fallocate_available == WT_FALLOCATE_NOT_AVAILABLE || (ret = __wt_fallocate(session, fh, WT_LOG_FIRST_RECORD, conn->log_file_max)) == ENOTSUP) ret = __wt_ftruncate(session, fh, -- cgit v1.2.1 From 0db386b0aeb4a8d3fdc0b669bcdfdaebd9f09733 Mon Sep 17 00:00:00 2001 From: Susan LoVerso Date: Tue, 29 Sep 2015 13:51:43 -0400 Subject: WT-2151 Add zero-fill stat. Check correct flags field. --- dist/stat_data.py | 1 + src/include/stat.h | 1 + src/include/wiredtiger.in | 86 ++++++++++++++++++++++++----------------------- src/log/log.c | 3 +- src/support/stat.c | 3 ++ 5 files changed, 51 insertions(+), 43 deletions(-) diff --git a/dist/stat_data.py b/dist/stat_data.py index 5bf7000f402..76fdf185137 100644 --- a/dist/stat_data.py +++ b/dist/stat_data.py @@ -272,6 +272,7 @@ connection_stats = [ LogStat('log_sync_dir', 'log sync_dir operations'), LogStat('log_write_lsn', 'log server thread advances write LSN'), LogStat('log_writes', 'log write operations'), + LogStat('log_zero_fills', 'log files manually zero-filled'), ########################################## # Reconciliation statistics diff --git a/src/include/stat.h b/src/include/stat.h index 3f7d8985a84..1ebe253e5db 100644 --- a/src/include/stat.h +++ b/src/include/stat.h @@ -338,6 +338,7 @@ struct __wt_connection_stats { int64_t log_sync_dir; int64_t log_write_lsn; int64_t log_writes; + int64_t log_zero_fills; int64_t lsm_checkpoint_throttle; int64_t lsm_merge_throttle; int64_t lsm_rows_merged; diff --git 
a/src/include/wiredtiger.in b/src/include/wiredtiger.in index 57207a9f2d2..bc22c4bdc84 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -3795,90 +3795,92 @@ extern int wiredtiger_extension_terminate(WT_CONNECTION *connection); #define WT_STAT_CONN_LOG_WRITE_LSN 1109 /*! log: log write operations */ #define WT_STAT_CONN_LOG_WRITES 1110 +/*! log: log files manually zero-filled */ +#define WT_STAT_CONN_LOG_ZERO_FILLS 1111 /*! LSM: sleep for LSM checkpoint throttle */ -#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1111 +#define WT_STAT_CONN_LSM_CHECKPOINT_THROTTLE 1112 /*! LSM: sleep for LSM merge throttle */ -#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1112 +#define WT_STAT_CONN_LSM_MERGE_THROTTLE 1113 /*! LSM: rows merged in an LSM tree */ -#define WT_STAT_CONN_LSM_ROWS_MERGED 1113 +#define WT_STAT_CONN_LSM_ROWS_MERGED 1114 /*! LSM: application work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1114 +#define WT_STAT_CONN_LSM_WORK_QUEUE_APP 1115 /*! LSM: merge work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1115 +#define WT_STAT_CONN_LSM_WORK_QUEUE_MANAGER 1116 /*! LSM: tree queue hit maximum */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1116 +#define WT_STAT_CONN_LSM_WORK_QUEUE_MAX 1117 /*! LSM: switch work units currently queued */ -#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1117 +#define WT_STAT_CONN_LSM_WORK_QUEUE_SWITCH 1118 /*! LSM: tree maintenance operations scheduled */ -#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1118 +#define WT_STAT_CONN_LSM_WORK_UNITS_CREATED 1119 /*! LSM: tree maintenance operations discarded */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1119 +#define WT_STAT_CONN_LSM_WORK_UNITS_DISCARDED 1120 /*! LSM: tree maintenance operations executed */ -#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1120 +#define WT_STAT_CONN_LSM_WORK_UNITS_DONE 1121 /*! connection: memory allocations */ -#define WT_STAT_CONN_MEMORY_ALLOCATION 1121 +#define WT_STAT_CONN_MEMORY_ALLOCATION 1122 /*! 
connection: memory frees */ -#define WT_STAT_CONN_MEMORY_FREE 1122 +#define WT_STAT_CONN_MEMORY_FREE 1123 /*! connection: memory re-allocations */ -#define WT_STAT_CONN_MEMORY_GROW 1123 +#define WT_STAT_CONN_MEMORY_GROW 1124 /*! thread-yield: page acquire busy blocked */ -#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1124 +#define WT_STAT_CONN_PAGE_BUSY_BLOCKED 1125 /*! thread-yield: page acquire eviction blocked */ -#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1125 +#define WT_STAT_CONN_PAGE_FORCIBLE_EVICT_BLOCKED 1126 /*! thread-yield: page acquire locked blocked */ -#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1126 +#define WT_STAT_CONN_PAGE_LOCKED_BLOCKED 1127 /*! thread-yield: page acquire read blocked */ -#define WT_STAT_CONN_PAGE_READ_BLOCKED 1127 +#define WT_STAT_CONN_PAGE_READ_BLOCKED 1128 /*! thread-yield: page acquire time sleeping (usecs) */ -#define WT_STAT_CONN_PAGE_SLEEP 1128 +#define WT_STAT_CONN_PAGE_SLEEP 1129 /*! connection: total read I/Os */ -#define WT_STAT_CONN_READ_IO 1129 +#define WT_STAT_CONN_READ_IO 1130 /*! reconciliation: page reconciliation calls */ -#define WT_STAT_CONN_REC_PAGES 1130 +#define WT_STAT_CONN_REC_PAGES 1131 /*! reconciliation: page reconciliation calls for eviction */ -#define WT_STAT_CONN_REC_PAGES_EVICTION 1131 +#define WT_STAT_CONN_REC_PAGES_EVICTION 1132 /*! reconciliation: split bytes currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1132 +#define WT_STAT_CONN_REC_SPLIT_STASHED_BYTES 1133 /*! reconciliation: split objects currently awaiting free */ -#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1133 +#define WT_STAT_CONN_REC_SPLIT_STASHED_OBJECTS 1134 /*! connection: pthread mutex shared lock read-lock calls */ -#define WT_STAT_CONN_RWLOCK_READ 1134 +#define WT_STAT_CONN_RWLOCK_READ 1135 /*! connection: pthread mutex shared lock write-lock calls */ -#define WT_STAT_CONN_RWLOCK_WRITE 1135 +#define WT_STAT_CONN_RWLOCK_WRITE 1136 /*! 
session: open cursor count */ -#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1136 +#define WT_STAT_CONN_SESSION_CURSOR_OPEN 1137 /*! session: open session count */ -#define WT_STAT_CONN_SESSION_OPEN 1137 +#define WT_STAT_CONN_SESSION_OPEN 1138 /*! transaction: transaction begins */ -#define WT_STAT_CONN_TXN_BEGIN 1138 +#define WT_STAT_CONN_TXN_BEGIN 1139 /*! transaction: transaction checkpoints */ -#define WT_STAT_CONN_TXN_CHECKPOINT 1139 +#define WT_STAT_CONN_TXN_CHECKPOINT 1140 /*! transaction: transaction checkpoint generation */ -#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1140 +#define WT_STAT_CONN_TXN_CHECKPOINT_GENERATION 1141 /*! transaction: transaction checkpoint currently running */ -#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1141 +#define WT_STAT_CONN_TXN_CHECKPOINT_RUNNING 1142 /*! transaction: transaction checkpoint max time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1142 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MAX 1143 /*! transaction: transaction checkpoint min time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1143 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_MIN 1144 /*! transaction: transaction checkpoint most recent time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1144 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_RECENT 1145 /*! transaction: transaction checkpoint total time (msecs) */ -#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1145 +#define WT_STAT_CONN_TXN_CHECKPOINT_TIME_TOTAL 1146 /*! transaction: transactions committed */ -#define WT_STAT_CONN_TXN_COMMIT 1146 +#define WT_STAT_CONN_TXN_COMMIT 1147 /*! transaction: transaction failures due to cache overflow */ -#define WT_STAT_CONN_TXN_FAIL_CACHE 1147 +#define WT_STAT_CONN_TXN_FAIL_CACHE 1148 /*! transaction: transaction range of IDs currently pinned by a checkpoint */ -#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1148 +#define WT_STAT_CONN_TXN_PINNED_CHECKPOINT_RANGE 1149 /*! 
transaction: transaction range of IDs currently pinned */ -#define WT_STAT_CONN_TXN_PINNED_RANGE 1149 +#define WT_STAT_CONN_TXN_PINNED_RANGE 1150 /*! transaction: transactions rolled back */ -#define WT_STAT_CONN_TXN_ROLLBACK 1150 +#define WT_STAT_CONN_TXN_ROLLBACK 1151 /*! transaction: transaction sync calls */ -#define WT_STAT_CONN_TXN_SYNC 1151 +#define WT_STAT_CONN_TXN_SYNC 1152 /*! connection: total write I/Os */ -#define WT_STAT_CONN_WRITE_IO 1152 +#define WT_STAT_CONN_WRITE_IO 1153 /*! * @} diff --git a/src/log/log.c b/src/log/log.c index d78cc0c9134..54a415e4a64 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -386,6 +386,7 @@ __log_zero(WT_SESSION_IMPL *session, bufsz = conn->log_file_max; WT_RET(__wt_scr_alloc(session, bufsz, &zerobuf)); memset(zerobuf->mem, 0, zerobuf->size); + WT_STAT_FAST_CONN_INCR(session, log_zero_fills); /* * Read in a chunk starting at the end of the file. Keep going until @@ -435,7 +436,7 @@ __log_prealloc(WT_SESSION_IMPL *session, WT_FH *fh) * manually. Otherwise use either fallocate or ftruncate to create * and zero the log file based on what is available. 
*/ - if (F_ISSET(conn, WT_CONN_LOG_ZERO_FILL)) + if (FLD_ISSET(conn->log_flags, WT_CONN_LOG_ZERO_FILL)) ret = __log_zero(session, fh, WT_LOG_FIRST_RECORD, conn->log_file_max); else if (fh->fallocate_available == WT_FALLOCATE_NOT_AVAILABLE || diff --git a/src/support/stat.c b/src/support/stat.c index 4e7f54937f4..9e817fad512 100644 --- a/src/support/stat.c +++ b/src/support/stat.c @@ -595,6 +595,7 @@ static const char * const __stats_connection_desc[] = { "log: log sync_dir operations", "log: log server thread advances write LSN", "log: log write operations", + "log: log files manually zero-filled", "LSM: sleep for LSM checkpoint throttle", "LSM: sleep for LSM merge throttle", "LSM: rows merged in an LSM tree", @@ -760,6 +761,7 @@ __wt_stat_connection_clear_single(WT_CONNECTION_STATS *stats) stats->log_slot_unbuffered = 0; stats->log_bytes_payload = 0; stats->log_bytes_written = 0; + stats->log_zero_fills = 0; stats->log_flush = 0; stats->log_compress_writes = 0; stats->log_compress_write_fails = 0; @@ -944,6 +946,7 @@ __wt_stat_connection_aggregate( to->log_slot_unbuffered += WT_STAT_READ(from, log_slot_unbuffered); to->log_bytes_payload += WT_STAT_READ(from, log_bytes_payload); to->log_bytes_written += WT_STAT_READ(from, log_bytes_written); + to->log_zero_fills += WT_STAT_READ(from, log_zero_fills); to->log_flush += WT_STAT_READ(from, log_flush); to->log_compress_writes += WT_STAT_READ(from, log_compress_writes); to->log_compress_write_fails += -- cgit v1.2.1 From 1b2fe0a2f8b8acb7650ddffe72b08330e39221e8 Mon Sep 17 00:00:00 2001 From: Susan LoVerso Date: Tue, 29 Sep 2015 14:28:43 -0400 Subject: WT-2151 Add zero-filled log workload --- bench/wtperf/runners/log-append-zero.wtperf | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 bench/wtperf/runners/log-append-zero.wtperf diff --git a/bench/wtperf/runners/log-append-zero.wtperf b/bench/wtperf/runners/log-append-zero.wtperf new file mode 100644 index 00000000000..973d2cddd0d --- /dev/null +++ 
b/bench/wtperf/runners/log-append-zero.wtperf @@ -0,0 +1,8 @@ +# wtperf options file: Test a log file with a multi-threaded +# append workload. +conn_config="cache_size=1G,log=(enabled=true,file_max=20MB,zero_fill=true),checkpoint=(log_size=1G)" +table_config="type=file" +icount=50000000 +report_interval=5 +run_time=0 +populate_threads=8 -- cgit v1.2.1 From e1ee9e924f82090c706da41ffe1681bd4e425586 Mon Sep 17 00:00:00 2001 From: Susan LoVerso Date: Tue, 29 Sep 2015 14:47:38 -0400 Subject: WT-2151 Alphabetize config processing. --- src/conn/conn_log.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c index 336a195f8e9..776cb0bf8e3 100644 --- a/src/conn/conn_log.c +++ b/src/conn/conn_log.c @@ -74,10 +74,6 @@ __logmgr_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp) if (cval.val != 0) FLD_SET(conn->log_flags, WT_CONN_LOG_ARCHIVE); - WT_RET(__wt_config_gets(session, cfg, "log.zero_fill", &cval)); - if (cval.val != 0) - FLD_SET(conn->log_flags, WT_CONN_LOG_ZERO_FILL); - WT_RET(__wt_config_gets(session, cfg, "log.file_max", &cval)); conn->log_file_max = (wt_off_t)cval.val; WT_STAT_FAST_CONN_SET(session, log_max_filesize, conn->log_file_max); @@ -95,6 +91,10 @@ __logmgr_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp) if (cval.len != 0 && WT_STRING_MATCH("error", cval.str, cval.len)) FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR); + WT_RET(__wt_config_gets(session, cfg, "log.zero_fill", &cval)); + if (cval.val != 0) + FLD_SET(conn->log_flags, WT_CONN_LOG_ZERO_FILL); + WT_RET(__logmgr_sync_cfg(session, cfg)); return (0); } -- cgit v1.2.1 From 89162212bc17a4bbed3ddb9d3439930d7e2327ca Mon Sep 17 00:00:00 2001 From: David Hows Date: Wed, 30 Sep 2015 14:25:23 +1000 Subject: Add support for WTPERF operations receiving a rollback --- bench/wtperf/wtperf.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/bench/wtperf/wtperf.c
b/bench/wtperf/wtperf.c index 8dceeab2832..3a203f5e629 100644 --- a/bench/wtperf/wtperf.c +++ b/bench/wtperf/wtperf.c @@ -385,7 +385,7 @@ worker(void *arg) size_t i; uint64_t next_val, usecs; uint8_t *op, *op_end; - int measure_latency, ret, truncated; + int hit_rollback, measure_latency, ret, truncated; char *value_buf, *key_buf, *value; char buf[512]; @@ -393,6 +393,7 @@ worker(void *arg) cfg = thread->cfg; conn = cfg->conn; cursors = NULL; + hit_rollback = 0; ops = 0; ops_per_txn = thread->workload->ops_per_txn; session = NULL; @@ -603,7 +604,20 @@ worker(void *arg) op_err: lprintf(cfg, ret, 0, "%s failed for: %s, range: %"PRIu64, op_name(op), key_buf, wtperf_value_range(cfg)); - goto err; + /* + * If we get a rollback error we rollback and continue + */ + if (ret == WT_ROLLBACK) { + if ((ret = session->rollback_transaction( + session, NULL)) != 0) { + lprintf(cfg, ret, 0, + "Failed rollback_transaction"); + goto err; + } + hit_rollback = 1; + break; + } else + goto err; default: goto err; /* can't happen */ } @@ -635,12 +649,14 @@ op_err: lprintf(cfg, ret, 0, /* Commit our work if configured for explicit transactions */ if (ops_per_txn != 0 && ops++ % ops_per_txn == 0) { - if ((ret = session->commit_transaction( + + if (!hit_rollback && (ret = session->commit_transaction( session, NULL)) != 0) { lprintf(cfg, ret, 0, "Worker transaction commit failed"); goto err; } + hit_rollback = 0; if ((ret = session->begin_transaction( session, NULL)) != 0) { lprintf(cfg, ret, 0, @@ -892,6 +908,7 @@ populate_thread(void *arg) goto err; } intxn = 0; +printf("next\n"); continue; } else if (ret != 0) { lprintf(cfg, ret, 0, "Failed inserting"); -- cgit v1.2.1 From 94d6cfa2c5260a18a25260aa1c9f3a1a0e594b7c Mon Sep 17 00:00:00 2001 From: David Hows Date: Wed, 30 Sep 2015 14:33:27 +1000 Subject: Remove debug statement --- bench/wtperf/wtperf.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c index 3a203f5e629..837beb0eb63 100644 --- 
a/bench/wtperf/wtperf.c +++ b/bench/wtperf/wtperf.c @@ -649,7 +649,6 @@ op_err: lprintf(cfg, ret, 0, /* Commit our work if configured for explicit transactions */ if (ops_per_txn != 0 && ops++ % ops_per_txn == 0) { - if (!hit_rollback && (ret = session->commit_transaction( session, NULL)) != 0) { lprintf(cfg, ret, 0, @@ -908,7 +907,6 @@ populate_thread(void *arg) goto err; } intxn = 0; -printf("next\n"); continue; } else if (ret != 0) { lprintf(cfg, ret, 0, "Failed inserting"); -- cgit v1.2.1 From a2349b6a03a68fa914c8c9acabae592da0302955 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Wed, 30 Sep 2015 16:51:35 +0000 Subject: SERVER-20303: This change tunes for a test with a maximum page size of 10MB and multi-threaded append of 100K key/value pairs. That means 100 inserts is sufficient to trigger forced eviction, and the previous test would refuse in-memory splits unless there were approximately 4K items on the insert list, that is, it assumed small insert objects. Change the code to allow in-memory splits for both large numbers of items and large insert objects. --- src/include/btree.i | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/src/include/btree.i b/src/include/btree.i index c7466019e14..f43fb73dc5b 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -977,6 +977,7 @@ __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) WT_BTREE *btree; WT_INSERT_HEAD *ins_head; WT_INSERT *ins; + uint64_t mem; int i; btree = S2BT(session); @@ -1007,25 +1008,33 @@ __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) return (false); /* - * There is no point splitting if the list is small, no deep items is - * our heuristic for that. A 1/4 probability of adding a new skiplist - * level, with level-0 always created, means there will be a 5th level - * entry for roughly every 1024 entries in the list. If there are at - * least 4 5th level entries (4K items), the list is large enough. 
+ * There is no point in splitting in-memory if the insert list doesn't + * represent most of the page footprint. Split if there are many items, + * or if there are enough items and the items are a significant part + * of the page's footprint. A 1/4 probability of adding a new skiplist + * level (with level-0 always created), implies a 2nd level entry for + * every 16 entries in the list. If there are at least 256 2nd level + * entries (4K items), or if the update list hits the maximum leaf page + * size, split. The reason we're walking the 2nd level list (rather than + * walking, for example, the 5th level list and looking for at least 4 + * entries), is it combines the number of entries test and the size of + * the entries test in one loop. */ -#define WT_MIN_SPLIT_SKIPLIST_DEPTH WT_MIN(5, WT_SKIP_MAXDEPTH - 1) +#define WT_MIN_SPLIT_SKIPLIST_DEPTH WT_MIN(2, WT_SKIP_MAXDEPTH - 1) ins_head = page->pg_row_entries == 0 ? WT_ROW_INSERT_SMALLEST(page) : WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1); if (ins_head == NULL) return (false); - for (i = 0, ins = ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH]; - ins != NULL; ins = ins->next[WT_MIN_SPLIT_SKIPLIST_DEPTH]) - if (++i == 4) { + for (i = 0, mem = 0, ins = ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH]; + ins != NULL; ins = ins->next[WT_MIN_SPLIT_SKIPLIST_DEPTH]) { + mem += WT_UPDATE_MEMSIZE(ins->upd); + if (++i == 256 || mem > btree->maxleafpage) { WT_STAT_FAST_CONN_INCR(session, cache_inmem_splittable); WT_STAT_FAST_DATA_INCR(session, cache_inmem_splittable); return (true); } + } return (false); } -- cgit v1.2.1 From ea98d2dd8820011bea884fdab2a8cb51199e1dc0 Mon Sep 17 00:00:00 2001 From: Susan LoVerso Date: Wed, 30 Sep 2015 14:05:20 -0400 Subject: WT-2151 Fix bug - use the memsize field of the item. 
--- src/log/log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/log/log.c b/src/log/log.c index 54a415e4a64..6277785940c 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -385,7 +385,7 @@ __log_zero(WT_SESSION_IMPL *session, if (conn->log_file_max < bufsz) bufsz = conn->log_file_max; WT_RET(__wt_scr_alloc(session, bufsz, &zerobuf)); - memset(zerobuf->mem, 0, zerobuf->size); + memset(zerobuf->mem, 0, zerobuf->memsize); WT_STAT_FAST_CONN_INCR(session, log_zero_fills); /* -- cgit v1.2.1 From ca84628d0ffeb19f94488695cf39237e63373e2b Mon Sep 17 00:00:00 2001 From: Susan LoVerso Date: Wed, 30 Sep 2015 14:06:10 -0400 Subject: WT-2151 Add zero-fill configuration to recovery test. --- test/suite/test_txn02.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/test/suite/test_txn02.py b/test/suite/test_txn02.py index 83c10f41244..17d0b97b50f 100644 --- a/test/suite/test_txn02.py +++ b/test/suite/test_txn02.py @@ -104,9 +104,18 @@ class test_txn02(wttest.WiredTigerTestCase, suite_subprocess): # deterministic manner. self.txn_sync = self.sync_list[ self.scenario_number % len(self.sync_list)] + # + # We don't want to run zero fill with only the same settings, such + # as archive or sync, which are an even number of options. + # + freq = 3 + zerofill = 'false' + if self.scenario_number % freq == 0: + zerofill = 'true' self.backup_dir = os.path.join(self.home, "WT_BACKUP") conn_params = \ 'log=(archive=false,enabled,file_max=%s),' % self.logmax + \ + 'log=(zero_fill=%s),' % zerofill + \ 'create,error_prefix="%s: ",' % self.shortid() + \ 'transaction_sync="%s",' % self.txn_sync # print "Creating conn at '%s' with config '%s'" % (dir, conn_params) -- cgit v1.2.1 From e6b6d12fd0ff483af48e26a7445ce49334964f74 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Wed, 30 Sep 2015 15:12:47 -0400 Subject: WT-2149: open private lookaside cursors as part of creating an internal thread's session, in order to avoid deadlocks later. 
Change callers of __wt_open_internal_session() to pass a set of session flags. If those flags include WT_SESSION_LOOKASIDE_CURSOR, a lookaside table cursor is opened as part of creating the session. Previously, __wt_open_internal_session() had a uses_dhandles boolean. If that boolean was false, the session flag WT_SESSION_NO_DATA_HANDLES was set. That boolean is gone, callers of __wt_open_internal_session() should pass WT_SESSION_NO_DATA_HANDLES to __wt_open_internal_session(). Create the lookaside table before starting the worker threads, the order is required if lookaside table cursors are opened as part of the worker threads opening internal sessions. Flip the order of creation in the initial lookaside table setup: we now first create the lookaside table itself, then open the WT_CONNECTION_IMPL lookaside session/cursor, that allows the cursor open to happen when the connection's lookaside table session is first opened, like every other internal thread. Re-organize the lookaside table code to expect a lookaside table cursor to be available in any session with the WT_SESSION_LOOKASIDE_CURSOR flag, including the WT_CONNECTION_IMPL structure's lookaside session. 
--- src/async/async_api.c | 17 ++++++------- src/cache/cache_las.c | 62 +++++++++++++++++++--------------------------- src/conn/conn_api.c | 6 ++--- src/conn/conn_cache_pool.c | 4 ++- src/conn/conn_ckpt.c | 12 +++++---- src/conn/conn_log.c | 14 ++++++----- src/conn/conn_open.c | 2 +- src/conn/conn_stat.c | 3 ++- src/conn/conn_sweep.c | 12 ++++----- src/evict/evict_lru.c | 41 ++++++++++++++++-------------- src/include/connection.h | 1 - src/include/extern.h | 3 ++- src/lsm/lsm_manager.c | 8 +++--- src/session/session_api.c | 23 ++++++++++++----- 14 files changed, 110 insertions(+), 98 deletions(-) diff --git a/src/async/async_api.c b/src/async/async_api.c index 1d819474728..dc26f2d11c3 100644 --- a/src/async/async_api.c +++ b/src/async/async_api.c @@ -53,7 +53,7 @@ __async_get_format(WT_CONNECTION_IMPL *conn, const char *uri, * for the cursor. */ WT_RET(__wt_open_internal_session( - conn, "async-cursor", true, true, &session)); + conn, "async-cursor", true, 0, &session)); __wt_spin_lock(session, &async->ops_lock); WT_ERR(__wt_calloc_one(session, &af)); WT_ERR(__wt_strdup(session, uri, &af->uri)); @@ -229,7 +229,7 @@ __async_start(WT_SESSION_IMPL *session) { WT_ASYNC *async; WT_CONNECTION_IMPL *conn; - uint32_t i; + uint32_t i, session_flags; conn = S2C(session); conn->async_cfg = 1; @@ -256,9 +256,9 @@ __async_start(WT_SESSION_IMPL *session) * workers and we may want to selectively stop some workers * while leaving the rest running. 
*/ - WT_RET(__wt_open_internal_session(conn, - "async-worker", true, true, &async->worker_sessions[i])); - F_SET(async->worker_sessions[i], WT_SESSION_SERVER_ASYNC); + session_flags = WT_SESSION_SERVER_ASYNC; + WT_RET(__wt_open_internal_session(conn, "async-worker", + true, session_flags, &async->worker_sessions[i])); } for (i = 0; i < conn->async_workers; i++) { /* @@ -305,7 +305,7 @@ __wt_async_reconfig(WT_SESSION_IMPL *session, const char *cfg[]) WT_DECL_RET; WT_SESSION *wt_session; bool run; - uint32_t i; + uint32_t i, session_flags; conn = S2C(session); async = conn->async; @@ -371,10 +371,9 @@ __wt_async_reconfig(WT_SESSION_IMPL *session, const char *cfg[]) /* * Each worker has its own session. */ + session_flags = WT_SESSION_SERVER_ASYNC; WT_RET(__wt_open_internal_session(conn, "async-worker", - true, true, &async->worker_sessions[i])); - F_SET(async->worker_sessions[i], - WT_SESSION_SERVER_ASYNC); + true, session_flags, &async->worker_sessions[i])); } for (i = conn->async_workers; i < tmp_conn.async_workers; i++) { /* diff --git a/src/cache/cache_las.c b/src/cache/cache_las.c index a964ac39874..90bf6b1d59c 100644 --- a/src/cache/cache_las.c +++ b/src/cache/cache_las.c @@ -27,7 +27,7 @@ __wt_las_stats_update(WT_SESSION_IMPL *session) * remain 0. In the current system, there's always a lookaside table, * but there's no reason not to be cautious. */ - if (conn->las_cursor == NULL) + if (conn->las_session->las_cursor == NULL) return; /* @@ -35,7 +35,8 @@ __wt_las_stats_update(WT_SESSION_IMPL *session) * to it by way of the underlying btree handle, but it's a little ugly. 
*/ cstats = conn->stats; - dstats = ((WT_CURSOR_BTREE *)conn->las_cursor)->btree->dhandle->stats; + dstats = ((WT_CURSOR_BTREE *) + conn->las_session->las_cursor)->btree->dhandle->stats; WT_STAT_SET(session, cstats, cache_lookaside_insert, WT_STAT_READ(dstats, cursor_insert)); @@ -44,11 +45,11 @@ __wt_las_stats_update(WT_SESSION_IMPL *session) } /* - * __las_cursor_create -- + * __wt_las_cursor_create -- * Open a new lookaside table cursor. */ -static int -__las_cursor_create(WT_SESSION_IMPL *session, WT_CURSOR **cursorp) +int +__wt_las_cursor_create(WT_SESSION_IMPL *session, WT_CURSOR **cursorp) { WT_BTREE *btree; const char *open_cursor_cfg[] = { @@ -85,7 +86,7 @@ int __wt_las_create(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; - WT_DECL_RET; + uint32_t session_flags; const char *drop_cfg[] = { WT_CONFIG_BASE(session, WT_SESSION_drop), "force=true", NULL }; @@ -96,27 +97,22 @@ __wt_las_create(WT_SESSION_IMPL *session) * schema lock to create and drop the file, and it may not always be * available. * - * Open an internal session, used for the shared lookaside cursor. - * - * Sessions associated with a lookaside cursor should never be tapped - * for eviction. + * Discard any previous incarnation of the file. */ - WT_RET(__wt_open_internal_session( - conn, "lookaside table", true, true, &conn->las_session)); - session = conn->las_session; - F_SET(session, WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_NO_EVICTION); - - /* Discard any previous incarnation of the file. */ WT_RET(__wt_session_drop(session, WT_LAS_URI, drop_cfg)); /* Re-create the file. */ WT_RET(__wt_session_create(session, WT_LAS_URI, WT_LAS_FORMAT)); - /* Open the shared cursor. */ - WT_WITHOUT_DHANDLE(session, - ret = __las_cursor_create(session, &conn->las_cursor)); - - return (ret); + /* + * Open an internal session, used for the shared lookaside cursor. + * + * Sessions associated with a lookaside cursor should never be tapped + * for eviction. 
+ */ + session_flags = WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_NO_EVICTION; + return (__wt_open_internal_session( + conn, "lookaside table", true, session_flags, &conn->las_session)); } /* @@ -138,7 +134,6 @@ __wt_las_destroy(WT_SESSION_IMPL *session) wt_session = &conn->las_session->iface; ret = wt_session->close(wt_session, NULL); - conn->las_cursor = NULL; conn->las_session = NULL; return (ret); @@ -202,20 +197,15 @@ __wt_las_cursor( conn = S2C(session); - /* Eviction and sweep threads have their own lookaside table cursors. */ - if (F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR)) { - if (session->las_cursor == NULL) { - WT_WITHOUT_DHANDLE(session, ret = - __las_cursor_create(session, &session->las_cursor)); - WT_RET(ret); - } - + /* + * Some threads have their own lookaside table cursors, else lock the + * shared lookaside cursor. + */ + if (F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR)) *cursorp = session->las_cursor; - } else { - /* Lock the shared lookaside cursor. */ + else { __wt_spin_lock(session, &conn->las_lock); - - *cursorp = conn->las_cursor; + *cursorp = conn->las_session->las_cursor; } /* Turn caching and eviction off. */ @@ -253,8 +243,8 @@ __wt_las_cursor_close( F_SET(session, session_flags); /* - * Eviction and sweep threads have their own lookaside table cursors; - * else, unlock the shared lookaside cursor. + * Some threads have their own lookaside table cursors, else unlock the + * shared lookaside cursor. */ if (!F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR)) __wt_spin_unlock(session, &conn->las_lock); diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c index b5d0e8f2883..5cb39bb055a 100644 --- a/src/conn/conn_api.c +++ b/src/conn/conn_api.c @@ -2034,12 +2034,12 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, WT_ERR(__wt_turtle_init(session)); WT_ERR(__wt_metadata_open(session)); - /* Start the worker threads and run recovery. */ - WT_ERR(__wt_connection_workers(session, cfg)); - /* Create the lookaside table. 
*/ WT_ERR(__wt_las_create(session)); + /* Start the worker threads and run recovery. */ + WT_ERR(__wt_connection_workers(session, cfg)); + WT_STATIC_ASSERT(offsetof(WT_CONNECTION_IMPL, iface) == 0); *wt_connp = &conn->iface; diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c index 6294e3b01a7..22af88c66b1 100644 --- a/src/conn/conn_cache_pool.c +++ b/src/conn/conn_cache_pool.c @@ -243,6 +243,7 @@ __wt_conn_cache_pool_open(WT_SESSION_IMPL *session) WT_CACHE_POOL *cp; WT_CONNECTION_IMPL *conn; WT_DECL_RET; + uint32_t session_flags; conn = S2C(session); cache = conn->cache; @@ -252,8 +253,9 @@ __wt_conn_cache_pool_open(WT_SESSION_IMPL *session) * Create a session that can be used by the cache pool thread, do * it in the main thread to avoid shutdown races */ + session_flags = WT_SESSION_NO_DATA_HANDLES; if ((ret = __wt_open_internal_session( - conn, "cache-pool", false, false, &cache->cp_session)) != 0) + conn, "cache-pool", false, session_flags, &cache->cp_session)) != 0) WT_RET_MSG(NULL, ret, "Failed to create session for cache pool"); diff --git a/src/conn/conn_ckpt.c b/src/conn/conn_ckpt.c index 7fc790d5efa..caf0c3b68f0 100644 --- a/src/conn/conn_ckpt.c +++ b/src/conn/conn_ckpt.c @@ -123,22 +123,24 @@ static int __ckpt_server_start(WT_CONNECTION_IMPL *conn) { WT_SESSION_IMPL *session; + uint32_t session_flags; /* Nothing to do if the server is already running. */ if (conn->ckpt_session != NULL) return (0); F_SET(conn, WT_CONN_SERVER_CHECKPOINT); - /* The checkpoint server gets its own session. */ - WT_RET(__wt_open_internal_session( - conn, "checkpoint-server", true, true, &conn->ckpt_session)); - session = conn->ckpt_session; /* + * The checkpoint server gets its own session. + * * Checkpoint does enough I/O it may be called upon to perform slow * operations for the block manager. 
*/ - F_SET(session, WT_SESSION_CAN_WAIT); + session_flags = WT_SESSION_CAN_WAIT; + WT_RET(__wt_open_internal_session(conn, + "checkpoint-server", true, session_flags, &conn->ckpt_session)); + session = conn->ckpt_session; WT_RET(__wt_cond_alloc( session, "checkpoint server", false, &conn->ckpt_cond)); diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c index eba0a2769d6..1e22042f711 100644 --- a/src/conn/conn_log.c +++ b/src/conn/conn_log.c @@ -777,6 +777,7 @@ int __wt_logmgr_open(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; + uint32_t session_flags; conn = S2C(session); @@ -788,8 +789,9 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) * Start the log close thread. It is not configurable. * If logging is enabled, this thread runs. */ - WT_RET(__wt_open_internal_session( - conn, "log-close-server", false, false, &conn->log_file_session)); + session_flags = WT_SESSION_NO_DATA_HANDLES; + WT_RET(__wt_open_internal_session(conn, + "log-close-server", false, session_flags, &conn->log_file_session)); WT_RET(__wt_cond_alloc(conn->log_file_session, "log close server", false, &conn->log_file_cond)); @@ -804,8 +806,8 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) * Start the log write LSN thread. It is not configurable. * If logging is enabled, this thread runs. */ - WT_RET(__wt_open_internal_session( - conn, "log-wrlsn-server", false, false, &conn->log_wrlsn_session)); + WT_RET(__wt_open_internal_session(conn, "log-wrlsn-server", + false, session_flags, &conn->log_wrlsn_session)); WT_RET(__wt_cond_alloc(conn->log_wrlsn_session, "log write lsn server", false, &conn->log_wrlsn_cond)); WT_RET(__wt_thread_create(conn->log_wrlsn_session, @@ -829,8 +831,8 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) WT_RET(__wt_cond_signal(session, conn->log_cond)); } else { /* The log server gets its own session. 
*/ - WT_RET(__wt_open_internal_session( - conn, "log-server", false, false, &conn->log_session)); + WT_RET(__wt_open_internal_session(conn, + "log-server", false, session_flags, &conn->log_session)); WT_RET(__wt_cond_alloc(conn->log_session, "log server", false, &conn->log_cond)); diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c index 199cf213e0a..675c3765d1a 100644 --- a/src/conn/conn_open.c +++ b/src/conn/conn_open.c @@ -38,7 +38,7 @@ __wt_connection_open(WT_CONNECTION_IMPL *conn, const char *cfg[]) * need to get cleaned up on close. */ WT_RET(__wt_open_internal_session( - conn, "connection", true, false, &session)); + conn, "connection", false, 0, &session)); /* * The connection's default session is originally a static structure, diff --git a/src/conn/conn_stat.c b/src/conn/conn_stat.c index d8c7227ae61..ec3a630581a 100644 --- a/src/conn/conn_stat.c +++ b/src/conn/conn_stat.c @@ -447,9 +447,10 @@ __statlog_start(WT_CONNECTION_IMPL *conn) return (0); F_SET(conn, WT_CONN_SERVER_STATISTICS); + /* The statistics log server gets its own session. */ WT_RET(__wt_open_internal_session( - conn, "statlog-server", true, true, &conn->stat_session)); + conn, "statlog-server", true, 0, &conn->stat_session)); session = conn->stat_session; WT_RET(__wt_cond_alloc( diff --git a/src/conn/conn_sweep.c b/src/conn/conn_sweep.c index 2de0cc12069..23846f978fe 100644 --- a/src/conn/conn_sweep.c +++ b/src/conn/conn_sweep.c @@ -353,16 +353,13 @@ int __wt_sweep_create(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; + uint32_t session_flags; conn = S2C(session); /* Set first, the thread might run before we finish up. */ F_SET(conn, WT_CONN_SERVER_SWEEP); - WT_RET(__wt_open_internal_session( - conn, "sweep-server", true, true, &conn->sweep_session)); - session = conn->sweep_session; - /* * Handle sweep does enough I/O it may be called upon to perform slow * operations for the block manager. 
@@ -372,8 +369,11 @@ __wt_sweep_create(WT_SESSION_IMPL *session) * * Don't tap the sweep thread for eviction. */ - F_SET(session, WT_SESSION_CAN_WAIT | - WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_NO_EVICTION); + session_flags = WT_SESSION_CAN_WAIT | + WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_NO_EVICTION; + WT_RET(__wt_open_internal_session( + conn, "sweep-server", true, session_flags, &conn->sweep_session)); + session = conn->sweep_session; WT_RET(__wt_cond_alloc( session, "handle sweep server", false, &conn->sweep_cond)); diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 3c00ee30896..c7da699b77e 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -236,7 +236,7 @@ __evict_workers_resize(WT_SESSION_IMPL *session) WT_DECL_RET; WT_EVICT_WORKER *workers; size_t alloc; - uint32_t i; + uint32_t i, session_flags; conn = S2C(session); @@ -246,17 +246,17 @@ __evict_workers_resize(WT_SESSION_IMPL *session) workers = conn->evict_workctx; for (i = conn->evict_workers_alloc; i < conn->evict_workers_max; i++) { - WT_ERR(__wt_open_internal_session(conn, - "eviction-worker", true, false, &workers[i].session)); - workers[i].id = i; - /* + * Eviction worker threads get their own session. * Eviction worker threads get their own lookaside table cursor. * Eviction worker threads may be called upon to perform slow * operations for the block manager. */ - F_SET(workers[i].session, - WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_CAN_WAIT); + session_flags = + WT_SESSION_CAN_WAIT | WT_SESSION_LOOKASIDE_CURSOR; + WT_ERR(__wt_open_internal_session(conn, "eviction-worker", + false, session_flags, &workers[i].session)); + workers[i].id = i; if (i < conn->evict_workers_min) { ++conn->evict_workers; @@ -278,33 +278,36 @@ int __wt_evict_create(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; + uint32_t session_flags; conn = S2C(session); /* Set first, the thread might run before we finish up. 
*/ F_SET(conn, WT_CONN_EVICTION_RUN); - /* We need a session handle because we're reading/writing pages. */ - WT_RET(__wt_open_internal_session( - conn, "eviction-server", true, false, &conn->evict_session)); + /* + * We need a session handle because we're reading/writing pages. + * + * The eviction server gets its own lookaside table cursor. + * + * If there's only a single eviction thread, it may be called upon to + * perform slow operations for the block manager. (The flag is not + * reset if reconfigured later, but I doubt that's a problem.) + */ + session_flags = WT_SESSION_LOOKASIDE_CURSOR; + if (conn->evict_workers_max == 0) + FLD_SET(session_flags, WT_SESSION_CAN_WAIT); + WT_RET(__wt_open_internal_session(conn, + "eviction-server", false, session_flags, &conn->evict_session)); session = conn->evict_session; /* * If eviction workers were configured, allocate sessions for them now. * This is done to reduce the chance that we will open new eviction * sessions after WT_CONNECTION::close is called. - * - * If there's only a single eviction thread, it may be called upon to - * perform slow operations for the block manager. (The flag is not - * reset if reconfigured later, but I doubt that's a problem.) */ if (conn->evict_workers_max > 0) WT_RET(__evict_workers_resize(session)); - else - F_SET(session, WT_SESSION_CAN_WAIT); - - /* The eviction server gets its own lookaside table cursor. 
*/ - F_SET(session, WT_SESSION_LOOKASIDE_CURSOR); /* * Start the primary eviction server thread after the worker threads diff --git a/src/include/connection.h b/src/include/connection.h index 2c20c2f7936..4a4560d930e 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -377,7 +377,6 @@ struct __wt_connection_impl { */ WT_SPINLOCK las_lock; /* Lookaside table spinlock */ WT_SESSION_IMPL *las_session; /* Lookaside table session */ - WT_CURSOR *las_cursor; /* Lookaside table cursor */ bool las_written; /* Lookaside table has been written */ WT_ITEM las_sweep_key; /* Sweep server's saved key */ diff --git a/src/include/extern.h b/src/include/extern.h index cfc1dc8f26e..eee03f4f868 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -182,6 +182,7 @@ extern int __wt_search_insert( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, W extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool insert); extern int __wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt); extern void __wt_las_stats_update(WT_SESSION_IMPL *session); +extern int __wt_las_cursor_create(WT_SESSION_IMPL *session, WT_CURSOR **cursorp); extern int __wt_las_create(WT_SESSION_IMPL *session); extern int __wt_las_destroy(WT_SESSION_IMPL *session); extern void __wt_las_set_written(WT_SESSION_IMPL *session); @@ -592,7 +593,7 @@ extern int __wt_session_copy_values(WT_SESSION_IMPL *session); extern int __wt_open_cursor(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp); extern int __wt_session_create( WT_SESSION_IMPL *session, const char *uri, const char *config); extern int __wt_session_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]); -extern int __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name, bool uses_dhandles, bool open_metadata, WT_SESSION_IMPL **sessionp); +extern int __wt_open_internal_session(WT_CONNECTION_IMPL *conn, 
const char *name, bool open_metadata, uint32_t session_flags, WT_SESSION_IMPL **sessionp); extern int __wt_open_session(WT_CONNECTION_IMPL *conn, WT_EVENT_HANDLER *event_handler, const char *config, bool open_metadata, WT_SESSION_IMPL **sessionp); extern int __wt_compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, bool *skipp); extern int __wt_session_compact( WT_SESSION *wt_session, const char *uri, const char *config); diff --git a/src/lsm/lsm_manager.c b/src/lsm/lsm_manager.c index bd3adb3a528..1c5124c32af 100644 --- a/src/lsm/lsm_manager.c +++ b/src/lsm/lsm_manager.c @@ -203,12 +203,14 @@ __wt_lsm_manager_reconfig(WT_SESSION_IMPL *session, const char **cfg) int __wt_lsm_manager_start(WT_SESSION_IMPL *session) { + WT_CONNECTION_IMPL *conn; WT_DECL_RET; WT_LSM_MANAGER *manager; WT_SESSION_IMPL *worker_session; uint32_t i; - manager = &S2C(session)->lsm_manager; + conn = S2C(session); + manager = &conn->lsm_manager; /* * We need at least a manager, a switch thread and a generic @@ -225,7 +227,7 @@ __wt_lsm_manager_start(WT_SESSION_IMPL *session) */ for (i = 0; i < WT_LSM_MAX_WORKERS; i++) { WT_ERR(__wt_open_internal_session( - S2C(session), "lsm-worker", true, false, &worker_session)); + conn, "lsm-worker", false, 0, &worker_session)); worker_session->isolation = WT_ISO_READ_UNCOMMITTED; manager->lsm_worker_cookies[i].session = worker_session; } @@ -234,7 +236,7 @@ __wt_lsm_manager_start(WT_SESSION_IMPL *session) WT_ERR(__wt_thread_create(session, &manager->lsm_worker_cookies[0].tid, __lsm_worker_manager, &manager->lsm_worker_cookies[0])); - F_SET(S2C(session), WT_CONN_SERVER_LSM); + F_SET(conn, WT_CONN_SERVER_LSM); if (0) { err: for (i = 0; diff --git a/src/session/session_api.c b/src/session/session_api.c index 1bb519e80e0..1f326b09d5c 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -1149,8 +1149,10 @@ __session_strerror(WT_SESSION *wt_session, int error) */ int __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char 
*name, - bool uses_dhandles, bool open_metadata, WT_SESSION_IMPL **sessionp) + bool open_metadata, uint32_t session_flags, WT_SESSION_IMPL **sessionp) { + WT_DECL_RET; + WT_SESSION *wt_session; WT_SESSION_IMPL *session; *sessionp = NULL; @@ -1164,17 +1166,26 @@ __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name, * list, there would be complex ordering issues during close. Set a * flag to avoid this: internal sessions are not closed automatically. */ - F_SET(session, WT_SESSION_INTERNAL); + F_SET(session, session_flags | WT_SESSION_INTERNAL); /* - * Some internal threads must keep running after we close all data - * handles. Make sure these threads don't open their own handles. + * Acquiring the lookaside table cursor requires various locks; we've + * seen problems in the past where deadlocks happened because sessions + * deadlocked getting the cursor late in the process. Be defensive, + * get it now. */ - if (!uses_dhandles) - F_SET(session, WT_SESSION_NO_DATA_HANDLES); + if (F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR)) { + WT_WITHOUT_DHANDLE(session, ret = + __wt_las_cursor_create(session, &session->las_cursor)); + WT_ERR(ret); + } *sessionp = session; return (0); + +err: wt_session = &session->iface; + WT_TRET(wt_session->close(wt_session, NULL)); + return (ret); } /* -- cgit v1.2.1 From 2c3b92b9bb14cffe121d02a7ff471ac6a320228f Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Wed, 30 Sep 2015 15:35:10 -0400 Subject: If the __wt_metadata_open() call inside __wt_open_session() fails, we'll leak the session, because the caller won't know to close the allocated session. Shouldn't ever happen, but it's wrong, clean up. 
--- src/include/extern.h | 2 +- src/session/session_api.c | 130 +++++++++++++++++++++++++++------------------- 2 files changed, 78 insertions(+), 54 deletions(-) diff --git a/src/include/extern.h b/src/include/extern.h index eee03f4f868..607a0a81d53 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -593,8 +593,8 @@ extern int __wt_session_copy_values(WT_SESSION_IMPL *session); extern int __wt_open_cursor(WT_SESSION_IMPL *session, const char *uri, WT_CURSOR *owner, const char *cfg[], WT_CURSOR **cursorp); extern int __wt_session_create( WT_SESSION_IMPL *session, const char *uri, const char *config); extern int __wt_session_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]); -extern int __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name, bool open_metadata, uint32_t session_flags, WT_SESSION_IMPL **sessionp); extern int __wt_open_session(WT_CONNECTION_IMPL *conn, WT_EVENT_HANDLER *event_handler, const char *config, bool open_metadata, WT_SESSION_IMPL **sessionp); +extern int __wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name, bool open_metadata, uint32_t session_flags, WT_SESSION_IMPL **sessionp); extern int __wt_compact_uri_analyze(WT_SESSION_IMPL *session, const char *uri, bool *skipp); extern int __wt_session_compact( WT_SESSION *wt_session, const char *uri, const char *config); extern int __wt_session_lock_dhandle( WT_SESSION_IMPL *session, uint32_t flags, bool *is_deadp); diff --git a/src/session/session_api.c b/src/session/session_api.c index 1f326b09d5c..a766829afad 100644 --- a/src/session/session_api.c +++ b/src/session/session_api.c @@ -1144,58 +1144,12 @@ __session_strerror(WT_SESSION *wt_session, int error) } /* - * __wt_open_internal_session -- - * Allocate a session for WiredTiger's use. 
- */ -int -__wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name, - bool open_metadata, uint32_t session_flags, WT_SESSION_IMPL **sessionp) -{ - WT_DECL_RET; - WT_SESSION *wt_session; - WT_SESSION_IMPL *session; - - *sessionp = NULL; - - WT_RET(__wt_open_session(conn, NULL, NULL, open_metadata, &session)); - session->name = name; - - /* - * Public sessions are automatically closed during WT_CONNECTION->close. - * If the session handles for internal threads were to go on the public - * list, there would be complex ordering issues during close. Set a - * flag to avoid this: internal sessions are not closed automatically. - */ - F_SET(session, session_flags | WT_SESSION_INTERNAL); - - /* - * Acquiring the lookaside table cursor requires various locks; we've - * seen problems in the past where deadlocks happened because sessions - * deadlocked getting the cursor late in the process. Be defensive, - * get it now. - */ - if (F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR)) { - WT_WITHOUT_DHANDLE(session, ret = - __wt_las_cursor_create(session, &session->las_cursor)); - WT_ERR(ret); - } - - *sessionp = session; - return (0); - -err: wt_session = &session->iface; - WT_TRET(wt_session->close(wt_session, NULL)); - return (ret); -} - -/* - * __wt_open_session -- - * Allocate a session handle. The internal parameter is used for sessions - * opened by WiredTiger for its own use. + * __open_session -- + * Allocate a session handle. 
*/ -int -__wt_open_session(WT_CONNECTION_IMPL *conn, - WT_EVENT_HANDLER *event_handler, const char *config, bool open_metadata, +static int +__open_session(WT_CONNECTION_IMPL *conn, + WT_EVENT_HANDLER *event_handler, const char *config, WT_SESSION_IMPL **sessionp) { static const WT_SESSION stds = { @@ -1335,7 +1289,26 @@ __wt_open_session(WT_CONNECTION_IMPL *conn, WT_STAT_FAST_CONN_INCR(session, session_open); err: __wt_spin_unlock(session, &conn->api_lock); - WT_RET(ret); + return (ret); +} + +/* + * __wt_open_session -- + * Allocate a session handle. + */ +int +__wt_open_session(WT_CONNECTION_IMPL *conn, + WT_EVENT_HANDLER *event_handler, const char *config, + bool open_metadata, WT_SESSION_IMPL **sessionp) +{ + WT_DECL_RET; + WT_SESSION_IMPL *session; + WT_SESSION *wt_session; + + *sessionp = NULL; + + /* Acquire a session. */ + WT_RET(__open_session(conn, event_handler, config, &session)); /* * Acquiring the metadata handle requires the schema lock; we've seen @@ -1347,8 +1320,59 @@ err: __wt_spin_unlock(session, &conn->api_lock); */ if (open_metadata) { WT_ASSERT(session, !F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)); - WT_RET(__wt_metadata_open(session_ret)); + if ((ret = __wt_metadata_open(session)) != 0) { + wt_session = &session->iface; + WT_TRET(wt_session->close(wt_session, NULL)); + return (ret); + } + } + + *sessionp = session; + return (0); +} + +/* + * __wt_open_internal_session -- + * Allocate a session for WiredTiger's use. + */ +int +__wt_open_internal_session(WT_CONNECTION_IMPL *conn, const char *name, + bool open_metadata, uint32_t session_flags, WT_SESSION_IMPL **sessionp) +{ + WT_DECL_RET; + WT_SESSION *wt_session; + WT_SESSION_IMPL *session; + + *sessionp = NULL; + + /* Acquire a session. */ + WT_RET(__wt_open_session(conn, NULL, NULL, open_metadata, &session)); + session->name = name; + + /* + * Public sessions are automatically closed during WT_CONNECTION->close. 
+ * If the session handles for internal threads were to go on the public + * list, there would be complex ordering issues during close. Set a + * flag to avoid this: internal sessions are not closed automatically. + */ + F_SET(session, session_flags | WT_SESSION_INTERNAL); + + /* + * Acquiring the lookaside table cursor requires various locks; we've + * seen problems in the past where deadlocks happened because sessions + * deadlocked getting the cursor late in the process. Be defensive, + * get it now. + */ + if (F_ISSET(session, WT_SESSION_LOOKASIDE_CURSOR)) { + WT_WITHOUT_DHANDLE(session, ret = + __wt_las_cursor_create(session, &session->las_cursor)); + if (ret != 0) { + wt_session = &session->iface; + WT_TRET(wt_session->close(wt_session, NULL)); + return (ret); + } } + *sessionp = session; return (0); } -- cgit v1.2.1 From 7ef27616d6ae7626814eb1a27fe5902b1a6dcadb Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Wed, 30 Sep 2015 15:36:00 -0400 Subject: Minor shuffling of functions in the lookaside table code to group related functions (now the "create a lookaside cursor" code is no longer private to the file, there's a cleaner grouping than before). --- src/cache/cache_las.c | 68 +++++++++++++++++++++++++-------------------------- src/include/extern.h | 2 +- 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/src/cache/cache_las.c b/src/cache/cache_las.c index 90bf6b1d59c..2c85cb70bd2 100644 --- a/src/cache/cache_las.c +++ b/src/cache/cache_las.c @@ -44,40 +44,6 @@ __wt_las_stats_update(WT_SESSION_IMPL *session) cache_lookaside_remove, WT_STAT_READ(dstats, cursor_remove)); } -/* - * __wt_las_cursor_create -- - * Open a new lookaside table cursor. 
- */ -int -__wt_las_cursor_create(WT_SESSION_IMPL *session, WT_CURSOR **cursorp) -{ - WT_BTREE *btree; - const char *open_cursor_cfg[] = { - WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL }; - - WT_RET(__wt_open_cursor( - session, WT_LAS_URI, NULL, open_cursor_cfg, cursorp)); - - /* - * Set special flags for the lookaside table: the lookaside flag (used, - * for example, to avoid writing records during reconciliation), also - * turn off checkpoints and logging. - * - * Test flags before setting them so updates can't race in subsequent - * opens (the first update is safe because it's single-threaded from - * wiredtiger_open). - */ - btree = S2BT(session); - if (!F_ISSET(btree, WT_BTREE_LOOKASIDE)) - F_SET(btree, WT_BTREE_LOOKASIDE); - if (!F_ISSET(btree, WT_BTREE_NO_CHECKPOINT)) - F_SET(btree, WT_BTREE_NO_CHECKPOINT); - if (!F_ISSET(btree, WT_BTREE_NO_LOGGING)) - F_SET(btree, WT_BTREE_NO_LOGGING); - - return (0); -} - /* * __wt_las_create -- * Initialize the database's lookaside store. @@ -170,6 +136,40 @@ __wt_las_is_written(WT_SESSION_IMPL *session) return (S2C(session)->las_written); } +/* + * __wt_las_cursor_create -- + * Open a new lookaside table cursor. + */ +int +__wt_las_cursor_create(WT_SESSION_IMPL *session, WT_CURSOR **cursorp) +{ + WT_BTREE *btree; + const char *open_cursor_cfg[] = { + WT_CONFIG_BASE(session, WT_SESSION_open_cursor), NULL }; + + WT_RET(__wt_open_cursor( + session, WT_LAS_URI, NULL, open_cursor_cfg, cursorp)); + + /* + * Set special flags for the lookaside table: the lookaside flag (used, + * for example, to avoid writing records during reconciliation), also + * turn off checkpoints and logging. + * + * Test flags before setting them so updates can't race in subsequent + * opens (the first update is safe because it's single-threaded from + * wiredtiger_open). 
+ */ + btree = S2BT(session); + if (!F_ISSET(btree, WT_BTREE_LOOKASIDE)) + F_SET(btree, WT_BTREE_LOOKASIDE); + if (!F_ISSET(btree, WT_BTREE_NO_CHECKPOINT)) + F_SET(btree, WT_BTREE_NO_CHECKPOINT); + if (!F_ISSET(btree, WT_BTREE_NO_LOGGING)) + F_SET(btree, WT_BTREE_NO_LOGGING); + + return (0); +} + /* * __wt_las_cursor -- * Return a lookaside cursor. diff --git a/src/include/extern.h b/src/include/extern.h index 607a0a81d53..6c844f38f5d 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -182,11 +182,11 @@ extern int __wt_search_insert( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, W extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool insert); extern int __wt_row_random(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt); extern void __wt_las_stats_update(WT_SESSION_IMPL *session); -extern int __wt_las_cursor_create(WT_SESSION_IMPL *session, WT_CURSOR **cursorp); extern int __wt_las_create(WT_SESSION_IMPL *session); extern int __wt_las_destroy(WT_SESSION_IMPL *session); extern void __wt_las_set_written(WT_SESSION_IMPL *session); extern bool __wt_las_is_written(WT_SESSION_IMPL *session); +extern int __wt_las_cursor_create(WT_SESSION_IMPL *session, WT_CURSOR **cursorp); extern int __wt_las_cursor( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags); extern int __wt_las_cursor_close( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t session_flags); extern int __wt_las_sweep(WT_SESSION_IMPL *session); -- cgit v1.2.1 From ba9797f684c4eeb1eaf0fc8ae70d562e50c27bc4 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Wed, 30 Sep 2015 15:38:08 -0400 Subject: lint --- src/cache/cache_las.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/cache/cache_las.c b/src/cache/cache_las.c index 2c85cb70bd2..714963b18d4 100644 --- a/src/cache/cache_las.c +++ b/src/cache/cache_las.c @@ -179,7 +179,6 @@ __wt_las_cursor( WT_SESSION_IMPL *session, WT_CURSOR **cursorp, uint32_t *session_flags) { 
WT_CONNECTION_IMPL *conn; - WT_DECL_RET; *cursorp = NULL; -- cgit v1.2.1 From fc51ae17d2dab4d76e4bac3cbbc1f0eb5bb29b1a Mon Sep 17 00:00:00 2001 From: Susan LoVerso Date: Wed, 30 Sep 2015 16:08:34 -0400 Subject: WT-2151 Add logging reconfiguration support and testing. --- dist/api_data.py | 70 ++++++++++++------------- src/config/config_def.c | 43 ++++++++------- src/conn/conn_api.c | 1 + src/conn/conn_log.c | 76 +++++++++++++++++++++------ src/include/extern.h | 1 + src/include/wiredtiger.in | 27 ++++++++++ test/suite/test_reconfig.py | 119 ------------------------------------------ test/suite/test_reconfig01.py | 119 ++++++++++++++++++++++++++++++++++++++++++ test/suite/test_reconfig02.py | 108 ++++++++++++++++++++++++++++++++++++++ 9 files changed, 374 insertions(+), 190 deletions(-) delete mode 100644 test/suite/test_reconfig.py create mode 100644 test/suite/test_reconfig01.py create mode 100644 test/suite/test_reconfig02.py diff --git a/dist/api_data.py b/dist/api_data.py index 6faee4ddd7f..6fd7dcd0093 100644 --- a/dist/api_data.py +++ b/dist/api_data.py @@ -411,6 +411,41 @@ connection_runtime_config = [ interval in seconds at which to check for files that are inactive and close them''', min=1, max=100000), ]), + Config('log', '', r''' + enable logging. Enabling logging uses three sessions from the + configured session_max''', + type='category', subconfig=[ + Config('archive', 'true', r''' + automatically archive unneeded log files''', + type='boolean'), + Config('compressor', 'none', r''' + configure a compressor for log records. Permitted values are + \c "none" or custom compression engine name created with + WT_CONNECTION::add_compressor. If WiredTiger has builtin support + for \c "bzip2", \c "snappy", \c "lz4" or \c "zlib" compression, + these names are also available. 
See @ref compression for more + information'''), + Config('enabled', 'false', r''' + enable logging subsystem''', + type='boolean'), + Config('file_max', '100MB', r''' + the maximum size of log files''', + min='100KB', max='2GB'), + Config('path', '', r''' + the path to a directory into which the log files are written. + If the value is not an absolute path name, the files are created + relative to the database home'''), + Config('prealloc', 'true', r''' + pre-allocate log files.''', + type='boolean'), + Config('recover', 'on', r''' + run recovery or error if recovery needs to run after an + unclean shutdown.''', + choices=['error','on']), + Config('zero_fill', 'false', r''' + manually write zeroes into log files''', + type='boolean'), + ]), Config('lsm_manager', '', r''' configure database wide options for LSM tree management. The LSM manager is started automatically the first time an LSM tree is opened. @@ -611,41 +646,6 @@ common_wiredtiger_open = [ maximum number of simultaneous hazard pointers per session handle''', min='15'), - Config('log', '', r''' - enable logging. Enabling logging uses three sessions from the - configured session_max''', - type='category', subconfig=[ - Config('archive', 'true', r''' - automatically archive unneeded log files''', - type='boolean'), - Config('compressor', 'none', r''' - configure a compressor for log records. Permitted values are - \c "none" or custom compression engine name created with - WT_CONNECTION::add_compressor. If WiredTiger has builtin support - for \c "bzip2", \c "snappy", \c "lz4" or \c "zlib" compression, - these names are also available. See @ref compression for more - information'''), - Config('enabled', 'false', r''' - enable logging subsystem''', - type='boolean'), - Config('file_max', '100MB', r''' - the maximum size of log files''', - min='100KB', max='2GB'), - Config('path', '', r''' - the path to a directory into which the log files are written. 
- If the value is not an absolute path name, the files are created - relative to the database home'''), - Config('prealloc', 'true', r''' - pre-allocate log files.''', - type='boolean'), - Config('recover', 'on', r''' - run recovery or error if recovery needs to run after an - unclean shutdown.''', - choices=['error','on']), - Config('zero_fill', 'false', r''' - manually write zeroes into log files''', - type='boolean'), - ]), Config('mmap', 'true', r''' Use memory mapping to access files when possible''', type='boolean'), diff --git a/src/config/config_def.c b/src/config/config_def.c index 990e604cf39..419f4124133 100644 --- a/src/config/config_def.c +++ b/src/config/config_def.c @@ -65,6 +65,21 @@ static const WT_CONFIG_CHECK { NULL, NULL, NULL, NULL, NULL, 0 } }; +static const WT_CONFIG_CHECK + confchk_wiredtiger_open_log_subconfigs[] = { + { "archive", "boolean", NULL, NULL, NULL, 0 }, + { "compressor", "string", NULL, NULL, NULL, 0 }, + { "enabled", "boolean", NULL, NULL, NULL, 0 }, + { "file_max", "int", NULL, "min=100KB,max=2GB", NULL, 0 }, + { "path", "string", NULL, NULL, NULL, 0 }, + { "prealloc", "boolean", NULL, NULL, NULL, 0 }, + { "recover", "string", + NULL, "choices=[\"error\",\"on\"]", + NULL, 0 }, + { "zero_fill", "boolean", NULL, NULL, NULL, 0 }, + { NULL, NULL, NULL, NULL, NULL, 0 } +}; + static const WT_CONFIG_CHECK confchk_wiredtiger_open_lsm_manager_subconfigs[] = { { "merge", "boolean", NULL, NULL, NULL, 0 }, @@ -116,6 +131,9 @@ static const WT_CONFIG_CHECK confchk_WT_CONNECTION_reconfigure[] = { { "file_manager", "category", NULL, NULL, confchk_wiredtiger_open_file_manager_subconfigs, 3 }, + { "log", "category", + NULL, NULL, + confchk_wiredtiger_open_log_subconfigs, 8 }, { "lsm_manager", "category", NULL, NULL, confchk_wiredtiger_open_lsm_manager_subconfigs, 2 }, @@ -452,21 +470,6 @@ static const WT_CONFIG_CHECK { NULL, NULL, NULL, NULL, NULL, 0 } }; -static const WT_CONFIG_CHECK - confchk_wiredtiger_open_log_subconfigs[] = { - { 
"archive", "boolean", NULL, NULL, NULL, 0 }, - { "compressor", "string", NULL, NULL, NULL, 0 }, - { "enabled", "boolean", NULL, NULL, NULL, 0 }, - { "file_max", "int", NULL, "min=100KB,max=2GB", NULL, 0 }, - { "path", "string", NULL, NULL, NULL, 0 }, - { "prealloc", "boolean", NULL, NULL, NULL, 0 }, - { "recover", "string", - NULL, "choices=[\"error\",\"on\"]", - NULL, 0 }, - { "zero_fill", "boolean", NULL, NULL, NULL, 0 }, - { NULL, NULL, NULL, NULL, NULL, 0 } -}; - static const WT_CONFIG_CHECK confchk_wiredtiger_open_transaction_sync_subconfigs[] = { { "enabled", "boolean", NULL, NULL, NULL, 0 }, @@ -815,12 +818,14 @@ static const WT_CONFIG_ENTRY config_entries[] = { "eviction=(threads_max=1,threads_min=1),eviction_dirty_target=80," "eviction_dirty_trigger=95,eviction_target=80,eviction_trigger=95" ",file_manager=(close_handle_minimum=250,close_idle_time=30," - "close_scan_interval=10),lsm_manager=(merge=,worker_thread_max=4)" - ",lsm_merge=,shared_cache=(chunk=10MB,name=,quota=0,reserve=0," - "size=500MB),statistics=none,statistics_log=(on_close=0," + "close_scan_interval=10),log=(archive=,compressor=,enabled=0," + "file_max=100MB,path=,prealloc=,recover=on,zero_fill=0)," + "lsm_manager=(merge=,worker_thread_max=4),lsm_merge=," + "shared_cache=(chunk=10MB,name=,quota=0,reserve=0,size=500MB)," + "statistics=none,statistics_log=(on_close=0," "path=\"WiredTigerStat.%d.%H\",sources=," "timestamp=\"%b %d %H:%M:%S\",wait=0),verbose=", - confchk_WT_CONNECTION_reconfigure, 17 + confchk_WT_CONNECTION_reconfigure, 18 }, { "WT_CURSOR.close", "", diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c index b5d0e8f2883..7ce84b624a3 100644 --- a/src/conn/conn_api.c +++ b/src/conn/conn_api.c @@ -1051,6 +1051,7 @@ __conn_reconfigure(WT_CONNECTION *wt_conn, const char *config) WT_ERR(__wt_async_reconfig(session, cfg)); WT_ERR(__wt_cache_config(session, true, cfg)); WT_ERR(__wt_checkpoint_server_create(session, cfg)); + WT_ERR(__wt_logmgr_reconfig(session, cfg)); 
WT_ERR(__wt_lsm_manager_reconfig(session, cfg)); WT_ERR(__wt_statlog_create(session, cfg)); WT_ERR(__wt_sweep_config(session, cfg)); diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c index 776cb0bf8e3..41420462f6e 100644 --- a/src/conn/conn_log.c +++ b/src/conn/conn_log.c @@ -42,30 +42,47 @@ __logmgr_sync_cfg(WT_SESSION_IMPL *session, const char **cfg) * Parse and setup the logging server options. */ static int -__logmgr_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp) +__logmgr_config( + WT_SESSION_IMPL *session, const char **cfg, bool *runp, bool reconfig) { WT_CONFIG_ITEM cval; WT_CONNECTION_IMPL *conn; conn = S2C(session); + WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval)); /* - * The logging configuration is off by default. + * If we're reconfiguring, enabled must match the already + * existing setting. */ - WT_RET(__wt_config_gets(session, cfg, "log.enabled", &cval)); + /* + * If it is off and the user it turning it on, or it is on + * and the user is turning it off, return an error. + */ + if (reconfig && + ((cval.val != 0 && + !FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)) || + (cval.val == 0 && + FLD_ISSET(conn->log_flags, WT_CONN_LOG_ENABLED)))) + return (EINVAL); *runp = cval.val != 0; /* - * Setup a log path, compression and encryption even if logging is - * disabled in case we are going to print a log. + * Setup a log path and compression even if logging is disabled in case + * we are going to print a log. Only do this on creation. Once a + * compressor or log path are set they cannot be changed. 
*/ - conn->log_compressor = NULL; - WT_RET(__wt_config_gets_none(session, cfg, "log.compressor", &cval)); - WT_RET(__wt_compressor_config(session, &cval, &conn->log_compressor)); - - WT_RET(__wt_config_gets(session, cfg, "log.path", &cval)); - WT_RET(__wt_strndup(session, cval.str, cval.len, &conn->log_path)); - + if (!reconfig) { + conn->log_compressor = NULL; + WT_RET(__wt_config_gets_none( + session, cfg, "log.compressor", &cval)); + WT_RET(__wt_compressor_config( + session, &cval, &conn->log_compressor)); + + WT_RET(__wt_config_gets(session, cfg, "log.path", &cval)); + WT_RET(__wt_strndup( + session, cval.str, cval.len, &conn->log_path)); + } /* We are done if logging isn't enabled. */ if (!*runp) return (0); @@ -74,13 +91,22 @@ __logmgr_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp) if (cval.val != 0) FLD_SET(conn->log_flags, WT_CONN_LOG_ARCHIVE); - WT_RET(__wt_config_gets(session, cfg, "log.file_max", &cval)); - conn->log_file_max = (wt_off_t)cval.val; - WT_STAT_FAST_CONN_SET(session, log_max_filesize, conn->log_file_max); + if (!reconfig) { + /* + * Ignore if the user tries to change the file size. The + * amount of memory allocated to the log slots may be based + * on the log file size at creation and we don't want to + * re-allocate that memory while running. + */ + WT_RET(__wt_config_gets(session, cfg, "log.file_max", &cval)); + conn->log_file_max = (wt_off_t)cval.val; + WT_STAT_FAST_CONN_SET(session, + log_max_filesize, conn->log_file_max); + } WT_RET(__wt_config_gets(session, cfg, "log.prealloc", &cval)); /* - * If pre-allocation is configured, set the initial number to one. + * If pre-allocation is configured, set the initial number to a few. * We'll adapt as load dictates. 
*/ if (cval.val != 0) { @@ -88,6 +114,10 @@ __logmgr_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp) conn->log_prealloc = 5; } WT_RET(__wt_config_gets_def(session, cfg, "log.recover", 0, &cval)); + /* + * Note that it is meaningless to reconfigure this value during + * runtime. It only matters on create before recovery runs. + */ if (cval.len != 0 && WT_STRING_MATCH("error", cval.str, cval.len)) FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR); @@ -99,6 +129,18 @@ __logmgr_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp) return (0); } +/* + * __wt_logmgr_reconfig -- + * Reconfigure logging. + */ +int +__wt_logmgr_reconfig(WT_SESSION_IMPL *session, const char **cfg) +{ + bool dummy; + + return (__logmgr_config(session, cfg, &dummy, true)); +} + /* * __log_archive_once -- * Perform one iteration of log archiving. Must be called with the @@ -726,7 +768,7 @@ __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]) conn = S2C(session); /* Handle configuration. */ - WT_RET(__logmgr_config(session, cfg, &run)); + WT_RET(__logmgr_config(session, cfg, &run, false)); /* If logging is not configured, we're done. 
*/ if (!run) diff --git a/src/include/extern.h b/src/include/extern.h index cfc1dc8f26e..ec25b96aa22 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -246,6 +246,7 @@ extern int __wt_conn_dhandle_discard_single( WT_SESSION_IMPL *session, bool fina extern int __wt_conn_dhandle_discard(WT_SESSION_IMPL *session); extern int __wt_connection_init(WT_CONNECTION_IMPL *conn); extern int __wt_connection_destroy(WT_CONNECTION_IMPL *conn); +extern int __wt_logmgr_reconfig(WT_SESSION_IMPL *session, const char **cfg); extern int __wt_log_truncate_files( WT_SESSION_IMPL *session, WT_CURSOR *cursor, const char *cfg[]); extern int __wt_log_wrlsn(WT_SESSION_IMPL *session); extern int __wt_logmgr_create(WT_SESSION_IMPL *session, const char *cfg[]); diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in index bc22c4bdc84..b7ebb8fbc14 100644 --- a/src/include/wiredtiger.in +++ b/src/include/wiredtiger.in @@ -1750,6 +1750,33 @@ struct __wt_connection { * seconds at which to check for files that are inactive and close * them., an integer between 1 and 100000; default \c 10.} * @config{ ),,} + * @config{log = (, enable logging. Enabling logging uses three + * sessions from the configured session_max., a set of related + * configuration options defined below.} + * @config{    archive, automatically archive + * unneeded log files., a boolean flag; default \c true.} + * @config{    compressor, configure a compressor + * for log records. Permitted values are \c "none" or custom + * compression engine name created with WT_CONNECTION::add_compressor. + * If WiredTiger has builtin support for \c "bzip2"\, \c "snappy"\, \c + * "lz4" or \c "zlib" compression\, these names are also available. 
See + * @ref compression for more information., a string; default \c none.} + * @config{    enabled, enable logging subsystem., a + * boolean flag; default \c false.} + * @config{    file_max, the maximum size of log + * files., an integer between 100KB and 2GB; default \c 100MB.} + * @config{    path, the path to a directory into + * which the log files are written. If the value is not an absolute + * path name\, the files are created relative to the database home., a + * string; default empty.} + * @config{    prealloc, + * pre-allocate log files., a boolean flag; default \c true.} + * @config{    recover, run recovery or error if + * recovery needs to run after an unclean shutdown., a string\, chosen + * from the following options: \c "error"\, \c "on"; default \c on.} + * @config{    zero_fill, manually write zeroes into + * log files., a boolean flag; default \c false.} + * @config{ ),,} * @config{lsm_manager = (, configure database wide options for LSM tree * management. The LSM manager is started automatically the first time * an LSM tree is opened. The LSM manager uses a session from the diff --git a/test/suite/test_reconfig.py b/test/suite/test_reconfig.py deleted file mode 100644 index b464895f155..00000000000 --- a/test/suite/test_reconfig.py +++ /dev/null @@ -1,119 +0,0 @@ -#!/usr/bin/env python -# -# Public Domain 2014-2015 MongoDB, Inc. -# Public Domain 2008-2014 WiredTiger, Inc. -# -# This is free and unencumbered software released into the public domain. -# -# Anyone is free to copy, modify, publish, use, compile, sell, or -# distribute this software, either in source code form or as a compiled -# binary, for any purpose, commercial or non-commercial, and by any -# means. -# -# In jurisdictions that recognize copyright laws, the author or authors -# of this software dedicate any and all copyright interest in the -# software to the public domain. 
We make this dedication for the benefit -# of the public at large and to the detriment of our heirs and -# successors. We intend this dedication to be an overt act of -# relinquishment in perpetuity of all present and future rights to this -# software under copyright law. -# -# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. -# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR -# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, -# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR -# OTHER DEALINGS IN THE SOFTWARE. - -import time -import wiredtiger, wttest -from helper import simple_populate - -# test_reconfig.py -# Smoke-test the connection reconfiguration operations. -class test_reconfig(wttest.WiredTigerTestCase): - - def test_reconfig_shared_cache(self): - self.conn.reconfigure("shared_cache=(name=pool,size=300M)") - - def test_reconfig_async(self): - # Async starts off. Reconfigure through all the various cases, each - # building from the previous state. - # Async is off, and remains off. - self.conn.reconfigure("async=(enabled=false)") - # Async is off, turn it on. - self.conn.reconfigure("async=(enabled=true)") - # Async is on, and remains on. - self.conn.reconfigure("async=(enabled=true)") - # Async is on, turn it off. - self.conn.reconfigure("async=(enabled=false)") - # Async is off, turn it on with ops_max and threads. - self.conn.reconfigure("async=(enabled=true,ops_max=512,threads=10)") - # Reconfigure and use same thread count. (no-op) - self.conn.reconfigure("async=(threads=10)") - # Reconfigure more threads. - self.conn.reconfigure("async=(threads=14)") - # Reconfigure fewer threads. - self.conn.reconfigure("async=(threads=8)") - # Reconfigure illegal ops_max (ignored). - self.conn.reconfigure("async=(ops_max=1024)") - # Turn async off. 
- self.conn.reconfigure("async=(enabled=false)") - # Async is off, turn it on. Should end up with the - # same ops_max of 512 and thread of 8. - self.conn.reconfigure("async=(enabled=true)") - - def test_reconfig_lsm_manager(self): - # We create and populate a tiny LSM so that we can start off with - # the LSM threads running and change the numbers of threads. - # Take all the defaults. - uri = "lsm:test_reconfig" - nrecs = 10 - simple_populate(self, uri, 'key_format=S', nrecs) - # Sleep to make sure all threads are started. - time.sleep(2) - # Now that an LSM tree exists, reconfigure LSM manager threads. - # We start with the default, which is 4. Configure more threads. - self.conn.reconfigure("lsm_manager=(worker_thread_max=10)") - # Generate some work - nrecs = 20 - simple_populate(self, uri, 'key_format=S', nrecs) - # Now reconfigure fewer threads. - self.conn.reconfigure("lsm_manager=(worker_thread_max=3)") - - def test_reconfig_statistics(self): - self.conn.reconfigure("statistics=(all)") - self.conn.reconfigure("statistics=(fast)") - self.conn.reconfigure("statistics=(none)") - - def test_reconfig_checkpoints(self): - self.conn.reconfigure("checkpoint=(wait=0)") - self.conn.reconfigure("checkpoint=(wait=5)") - self.conn.reconfigure("checkpoint=(log_size=0)") - self.conn.reconfigure("checkpoint=(log_size=1M)") - self.conn.reconfigure("checkpoint=(wait=0,name=hi)") - self.conn.reconfigure("checkpoint=(wait=5,name=hi)") - - def test_reconfig_stat_log(self): - self.conn.reconfigure("statistics=[all],statistics_log=(wait=0)") - self.conn.reconfigure("statistics_log=(wait=0)") - self.conn.reconfigure("statistics_log=(wait=2)") - self.conn.reconfigure("statistics_log=(wait=0)") - self.conn.reconfigure("statistics_log=(wait=2,sources=[lsm:])") - self.conn.reconfigure("statistics_log=(wait=0)") - self.conn.reconfigure("statistics_log=(wait=2,timestamp=\"t%b %d\")") - self.conn.reconfigure("statistics_log=(wait=0)") - 
self.conn.reconfigure("statistics_log=(wait=2,path=\"wts.%d.%H\")") - self.conn.reconfigure("statistics_log=(wait=0)") - self.conn.reconfigure( - "statistics_log=(wait=2,sources=[lsm:],timestamp=\"%b\")") - - def test_file_manager(self): - self.conn.reconfigure("file_manager=(close_scan_interval=3)") - self.conn.reconfigure("file_manager=(close_idle_time=4)") - self.conn.reconfigure( - "file_manager=(close_idle_time=4,close_scan_interval=100)") - -if __name__ == '__main__': - wttest.run() diff --git a/test/suite/test_reconfig01.py b/test/suite/test_reconfig01.py new file mode 100644 index 00000000000..2528f856a08 --- /dev/null +++ b/test/suite/test_reconfig01.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2015 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import time +import wiredtiger, wttest +from helper import simple_populate + +# test_reconfig01.py +# Smoke-test the connection reconfiguration operations. +class test_reconfig01(wttest.WiredTigerTestCase): + + def test_reconfig_shared_cache(self): + self.conn.reconfigure("shared_cache=(name=pool,size=300M)") + + def test_reconfig_async(self): + # Async starts off. Reconfigure through all the various cases, each + # building from the previous state. + # Async is off, and remains off. + self.conn.reconfigure("async=(enabled=false)") + # Async is off, turn it on. + self.conn.reconfigure("async=(enabled=true)") + # Async is on, and remains on. + self.conn.reconfigure("async=(enabled=true)") + # Async is on, turn it off. + self.conn.reconfigure("async=(enabled=false)") + # Async is off, turn it on with ops_max and threads. + self.conn.reconfigure("async=(enabled=true,ops_max=512,threads=10)") + # Reconfigure and use same thread count. (no-op) + self.conn.reconfigure("async=(threads=10)") + # Reconfigure more threads. + self.conn.reconfigure("async=(threads=14)") + # Reconfigure fewer threads. + self.conn.reconfigure("async=(threads=8)") + # Reconfigure illegal ops_max (ignored). + self.conn.reconfigure("async=(ops_max=1024)") + # Turn async off. + self.conn.reconfigure("async=(enabled=false)") + # Async is off, turn it on. Should end up with the + # same ops_max of 512 and thread of 8. + self.conn.reconfigure("async=(enabled=true)") + + def test_reconfig_lsm_manager(self): + # We create and populate a tiny LSM so that we can start off with + # the LSM threads running and change the numbers of threads. + # Take all the defaults. 
+ uri = "lsm:test_reconfig" + nrecs = 10 + simple_populate(self, uri, 'key_format=S', nrecs) + # Sleep to make sure all threads are started. + time.sleep(2) + # Now that an LSM tree exists, reconfigure LSM manager threads. + # We start with the default, which is 4. Configure more threads. + self.conn.reconfigure("lsm_manager=(worker_thread_max=10)") + # Generate some work + nrecs = 20 + simple_populate(self, uri, 'key_format=S', nrecs) + # Now reconfigure fewer threads. + self.conn.reconfigure("lsm_manager=(worker_thread_max=3)") + + def test_reconfig_statistics(self): + self.conn.reconfigure("statistics=(all)") + self.conn.reconfigure("statistics=(fast)") + self.conn.reconfigure("statistics=(none)") + + def test_reconfig_checkpoints(self): + self.conn.reconfigure("checkpoint=(wait=0)") + self.conn.reconfigure("checkpoint=(wait=5)") + self.conn.reconfigure("checkpoint=(log_size=0)") + self.conn.reconfigure("checkpoint=(log_size=1M)") + self.conn.reconfigure("checkpoint=(wait=0,name=hi)") + self.conn.reconfigure("checkpoint=(wait=5,name=hi)") + + def test_reconfig_stat_log(self): + self.conn.reconfigure("statistics=[all],statistics_log=(wait=0)") + self.conn.reconfigure("statistics_log=(wait=0)") + self.conn.reconfigure("statistics_log=(wait=2)") + self.conn.reconfigure("statistics_log=(wait=0)") + self.conn.reconfigure("statistics_log=(wait=2,sources=[lsm:])") + self.conn.reconfigure("statistics_log=(wait=0)") + self.conn.reconfigure("statistics_log=(wait=2,timestamp=\"t%b %d\")") + self.conn.reconfigure("statistics_log=(wait=0)") + self.conn.reconfigure("statistics_log=(wait=2,path=\"wts.%d.%H\")") + self.conn.reconfigure("statistics_log=(wait=0)") + self.conn.reconfigure( + "statistics_log=(wait=2,sources=[lsm:],timestamp=\"%b\")") + + def test_file_manager(self): + self.conn.reconfigure("file_manager=(close_scan_interval=3)") + self.conn.reconfigure("file_manager=(close_idle_time=4)") + self.conn.reconfigure( + 
"file_manager=(close_idle_time=4,close_scan_interval=100)") + +if __name__ == '__main__': + wttest.run() diff --git a/test/suite/test_reconfig02.py b/test/suite/test_reconfig02.py new file mode 100644 index 00000000000..e0981a887fb --- /dev/null +++ b/test/suite/test_reconfig02.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2015 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import fnmatch, os, time +import wiredtiger, wttest +from helper import simple_populate + +# test_reconfig02.py +# Smoke-test the connection reconfiguration operations. 
+class test_reconfig02(wttest.WiredTigerTestCase): + init_config = 'log=(archive=false,enabled,file_max=100K,prealloc=false,zero_fill=false)' + uri = "table:reconfig02" + entries = 1000 + + def setUpConnectionOpen(self, dir): + self.conn_config = self.init_config + return wttest.WiredTigerTestCase.setUpConnectionOpen(self, dir) + + # Call reconfigure for zero filling a file. There is nothing + # we can actually look for to confirm it did anything. + # Also changing the log file size is a no-op, but should not fail. + def test_reconfig02_simple(self): + self.conn.reconfigure("log=(zero_fill=true)") + self.conn.reconfigure("log=(file_max=1MB)") + + # Test that we get an error if we try to turn logging off. + def test_reconfig02_disable(self): + msg = 'Invalid argument' + gotException = False + try: + self.conn.reconfigure("log=(enabled=false)") + except wiredtiger.WiredTigerError as e: + gotException = True + self.pr('got exception: ' + str(e)) + self.assertTrue(str(e).find(msg) >= 0) + self.assertTrue(gotException) + + # Logging starts on, but prealloc is off. Verify it is off. + # Reconfigure it on and run again, making sure that log files + # get pre-allocated. + def test_reconfig02_prealloc(self): + # Create a table just to write something into the log. Sleep + # to give the worker thread a chance to run. + self.session.create(self.uri, 'key_format=i,value_format=i') + time.sleep(2) + prep_logs = fnmatch.filter(os.listdir('.'), "*Prep*") + # Make sure no pre-allocated log files exist. + self.assertEqual(0, len(prep_logs)) + + # Now turn on pre-allocation. Sleep to give the worker thread + # a chance to run and verify pre-allocated log files exist. + self.conn.reconfigure("log=(prealloc=true)") + time.sleep(2) + prep_logs = fnmatch.filter(os.listdir('.'), "*Prep*") + self.assertNotEqual(0, len(prep_logs)) + + # Logging starts on, but archive is off. Verify it is off. + # Reconfigure it on and run again, making sure that log files + # get archived. 
+ def test_reconfig02_archive(self): + self.session.create(self.uri, 'key_format=i,value_format=i') + c = self.session.open_cursor(self.uri, None, None) + for i in range(self.entries): + c[i] = i + 1 + c.close() + # Close and reopen connection to write a checkpoint, move to the + # next log file and verify that archive did not run. + orig_logs = fnmatch.filter(os.listdir('.'), "*Log*") + self.reopen_conn() + cur_logs = fnmatch.filter(os.listdir('.'), "*Log*") + for o in orig_logs: + self.assertEqual(True, o in cur_logs) + + # Now turn on archive, sleep a bit to allow the archive thread + # to run and then confirm that all original logs are gone. + self.conn.reconfigure("log=(archive=true)") + time.sleep(2) + cur_logs = fnmatch.filter(os.listdir('.'), "*Log*") + for o in orig_logs: + self.assertEqual(False, o in cur_logs) + +if __name__ == '__main__': + wttest.run() -- cgit v1.2.1 From 8ae70e3025eacc753d9c83b7ca3b0a6843850d8c Mon Sep 17 00:00:00 2001 From: Susan LoVerso Date: Wed, 30 Sep 2015 16:09:56 -0400 Subject: WT-2153 Fix bug. Now we always need to start the log_server thread. --- src/conn/conn_log.c | 9 +-------- src/include/connection.h | 5 ++--- 2 files changed, 3 insertions(+), 11 deletions(-) diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c index eba0a2769d6..cee74b6e969 100644 --- a/src/conn/conn_log.c +++ b/src/conn/conn_log.c @@ -83,10 +83,8 @@ __logmgr_config(WT_SESSION_IMPL *session, const char **cfg, bool *runp) * If pre-allocation is configured, set the initial number to one. * We'll adapt as load dictates. 
*/ - if (cval.val != 0) { - FLD_SET(conn->log_flags, WT_CONN_LOG_PREALLOC); + if (cval.val != 0) conn->log_prealloc = 1; - } WT_RET(__wt_config_gets_def(session, cfg, "log.recover", 0, &cval)); if (cval.len != 0 && WT_STRING_MATCH("error", cval.str, cval.len)) FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR); @@ -812,11 +810,6 @@ __wt_logmgr_open(WT_SESSION_IMPL *session) &conn->log_wrlsn_tid, __log_wrlsn_server, conn->log_wrlsn_session)); conn->log_wrlsn_tid_set = true; - /* If no log thread services are configured, we're done. */ - if (!FLD_ISSET(conn->log_flags, - (WT_CONN_LOG_ARCHIVE | WT_CONN_LOG_PREALLOC))) - return (0); - /* * If a log server thread exists, the user may have reconfigured * archiving or pre-allocation. Signal the thread. Otherwise the diff --git a/src/include/connection.h b/src/include/connection.h index 2c20c2f7936..4e62140b9ae 100644 --- a/src/include/connection.h +++ b/src/include/connection.h @@ -339,9 +339,8 @@ struct __wt_connection_impl { #define WT_CONN_LOG_ARCHIVE 0x01 /* Archive is enabled */ #define WT_CONN_LOG_ENABLED 0x02 /* Logging is enabled */ #define WT_CONN_LOG_EXISTED 0x04 /* Log files found */ -#define WT_CONN_LOG_PREALLOC 0x08 /* Pre-allocation is enabled */ -#define WT_CONN_LOG_RECOVER_DONE 0x10 /* Recovery completed */ -#define WT_CONN_LOG_RECOVER_ERR 0x20 /* Error if recovery required */ +#define WT_CONN_LOG_RECOVER_DONE 0x08 /* Recovery completed */ +#define WT_CONN_LOG_RECOVER_ERR 0x10 /* Error if recovery required */ uint32_t log_flags; /* Global logging configuration */ WT_CONDVAR *log_cond; /* Log server wait mutex */ WT_SESSION_IMPL *log_session; /* Log server session */ -- cgit v1.2.1 From 181f469ad2cecb6ea911ad445c87804e4af0f235 Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Thu, 1 Oct 2015 09:06:33 +1000 Subject: Add a wtperf configuration that triggers long latencies that are improved with fair parent-split locking. 
--- bench/wtperf/runners/multi-btree-stress.wtperf | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 bench/wtperf/runners/multi-btree-stress.wtperf diff --git a/bench/wtperf/runners/multi-btree-stress.wtperf b/bench/wtperf/runners/multi-btree-stress.wtperf new file mode 100644 index 00000000000..b10b08f6035 --- /dev/null +++ b/bench/wtperf/runners/multi-btree-stress.wtperf @@ -0,0 +1,17 @@ +# wtperf options file: multi-database configuration attempting to +# trigger slow operations by overloading CPU and disk. +# References Jira WT-2131 +conn_config="cache_size=2GB,eviction=(threads_min=2,threads_max=2),log=(enabled=false),direct_io=(data,checkpoint),buffer_alignment=4096,checkpoint_sync=true,checkpoint=(wait=60)" +table_config="allocation_size=4k,prefix_compression=false,split_pct=75,leaf_page_max=4k,internal_page_max=16k,leaf_item_max=1433,internal_item_max=3100,type=file" +# Divide original icount by database_count. +database_count=5 +icount=50000 +populate_threads=1 +random_range=50000000 +report_interval=5 +run_time=3600 +threads=((count=1,inserts=1),(count=10,reads=1)) +value_sz=100 +max_latency=1000 +sample_interval=5 +sample_rate=1 -- cgit v1.2.1 From 30cac7ee553a03786901db41e99d26d0b1ec32b6 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Wed, 30 Sep 2015 19:06:47 -0400 Subject: whitespace --- src/include/btree.i | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/include/btree.i b/src/include/btree.i index f43fb73dc5b..ccf7d79a40a 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -1008,7 +1008,7 @@ __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) return (false); /* - * There is no point in splitting in-memory if the insert list doesn't + * There is no point in splitting in-memory if the insert list doesn't * represent most of the page footprint. Split if there are many items, * or if there are enough items and the items are a significant part * of the page's footprint. 
A 1/4 probability of adding a new skiplist -- cgit v1.2.1 From 702b27f4336f5f0942099664e8aa56a3b785e710 Mon Sep 17 00:00:00 2001 From: David Hows Date: Thu, 1 Oct 2015 10:50:40 +1000 Subject: WT-2152 change how ROLLBACK is dealt with. Make error message lvl 1 --- bench/wtperf/wtperf.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c index 837beb0eb63..5beb6e5fdeb 100644 --- a/bench/wtperf/wtperf.c +++ b/bench/wtperf/wtperf.c @@ -385,7 +385,7 @@ worker(void *arg) size_t i; uint64_t next_val, usecs; uint8_t *op, *op_end; - int hit_rollback, measure_latency, ret, truncated; + int measure_latency, ret, truncated; char *value_buf, *key_buf, *value; char buf[512]; @@ -393,7 +393,6 @@ worker(void *arg) cfg = thread->cfg; conn = cfg->conn; cursors = NULL; - hit_rollback = 0; ops = 0; ops_per_txn = thread->workload->ops_per_txn; session = NULL; @@ -601,23 +600,36 @@ worker(void *arg) if (ret == WT_NOTFOUND) break; -op_err: lprintf(cfg, ret, 0, - "%s failed for: %s, range: %"PRIu64, - op_name(op), key_buf, wtperf_value_range(cfg)); +op_err: if (ret == WT_ROLLBACK && ops_per_txn != 0) { /* - * If we get a rollback error we rollback and continue - */ - if (ret == WT_ROLLBACK) { + * If we are runnibg with explicit eplicit transactions + * configured and we hit a WT_ROLLBACK, then we should + * rollback the current transaction and attmpt to + * continue. + * This does break the guarantee of insertion order in + * cases of ordered inserts, as we aren't retrying here. 
+ */ + lprintf(cfg, ret, 1, + "%s for: %s, range: %"PRIu64, op_name(op), + key_buf, wtperf_value_range(cfg)); if ((ret = session->rollback_transaction( session, NULL)) != 0) { lprintf(cfg, ret, 0, "Failed rollback_transaction"); goto err; } - hit_rollback = 1; + if ((ret = session->begin_transaction( + session, NULL)) != 0) { + lprintf(cfg, ret, 0, + "Worker transaction commit failed"); + goto err; + } break; - } else - goto err; + } + lprintf(cfg, ret, 0, + "%s failed for: %s, range: %"PRIu64, + op_name(op), key_buf, wtperf_value_range(cfg)); + goto err; default: goto err; /* can't happen */ } @@ -649,13 +661,12 @@ op_err: lprintf(cfg, ret, 0, /* Commit our work if configured for explicit transactions */ if (ops_per_txn != 0 && ops++ % ops_per_txn == 0) { - if (!hit_rollback && (ret = session->commit_transaction( + if ((ret = session->commit_transaction( session, NULL)) != 0) { lprintf(cfg, ret, 0, "Worker transaction commit failed"); goto err; } - hit_rollback = 0; if ((ret = session->begin_transaction( session, NULL)) != 0) { lprintf(cfg, ret, 0, -- cgit v1.2.1 From 757851d0ecde135275767cfa060fb36e173d9540 Mon Sep 17 00:00:00 2001 From: David Hows Date: Thu, 1 Oct 2015 10:56:14 +1000 Subject: WT-2152 s_all linting --- bench/wtperf/wtperf.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c index 5beb6e5fdeb..eb2b2185bb1 100644 --- a/bench/wtperf/wtperf.c +++ b/bench/wtperf/wtperf.c @@ -602,13 +602,13 @@ worker(void *arg) op_err: if (ret == WT_ROLLBACK && ops_per_txn != 0) { /* - * If we are runnibg with explicit eplicit transactions - * configured and we hit a WT_ROLLBACK, then we should - * rollback the current transaction and attmpt to + * If we are running with explicit transactions + * configured and we hit a WT_ROLLBACK, then we should + * rollback the current transaction and attempt to * continue. 
* This does break the guarantee of insertion order in * cases of ordered inserts, as we aren't retrying here. - */ + */ lprintf(cfg, ret, 1, "%s for: %s, range: %"PRIu64, op_name(op), key_buf, wtperf_value_range(cfg)); -- cgit v1.2.1 From 53bf2383f6748e1d6219bd56d52c53f6d0fe17bc Mon Sep 17 00:00:00 2001 From: David Hows Date: Thu, 1 Oct 2015 11:31:47 +1000 Subject: WT-2152 fix style problems --- bench/wtperf/wtperf.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c index eb2b2185bb1..0d30bb6b2b7 100644 --- a/bench/wtperf/wtperf.c +++ b/bench/wtperf/wtperf.c @@ -601,14 +601,15 @@ worker(void *arg) break; op_err: if (ret == WT_ROLLBACK && ops_per_txn != 0) { - /* - * If we are running with explicit transactions - * configured and we hit a WT_ROLLBACK, then we should - * rollback the current transaction and attempt to - * continue. - * This does break the guarantee of insertion order in - * cases of ordered inserts, as we aren't retrying here. - */ + /* + * If we are running with explicit transactions + * configured and we hit a WT_ROLLBACK, then we + * should rollback the current transaction and + * attempt to continue. + * This does break the guarantee of insertion + * order in cases of ordered inserts, as we + * aren't retrying here. 
+ */ lprintf(cfg, ret, 1, "%s for: %s, range: %"PRIu64, op_name(op), key_buf, wtperf_value_range(cfg)); @@ -621,7 +622,7 @@ op_err: if (ret == WT_ROLLBACK && ops_per_txn != 0) { if ((ret = session->begin_transaction( session, NULL)) != 0) { lprintf(cfg, ret, 0, - "Worker transaction commit failed"); + "Worker begin transaction failed"); goto err; } break; @@ -670,7 +671,7 @@ op_err: if (ret == WT_ROLLBACK && ops_per_txn != 0) { if ((ret = session->begin_transaction( session, NULL)) != 0) { lprintf(cfg, ret, 0, - "Worker transaction commit failed"); + "Worker begin transaction failed"); goto err; } } -- cgit v1.2.1 From 4123e425cbcf67ee4910ec778655b9218bc82ac7 Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Thu, 1 Oct 2015 12:57:26 +1000 Subject: SERVER-20303 Require a minimum item count for in-memory splits. --- src/include/btree.i | 39 +++++++++++++++++++++------------------ 1 file changed, 21 insertions(+), 18 deletions(-) diff --git a/src/include/btree.i b/src/include/btree.i index ccf7d79a40a..14b5303cca9 100644 --- a/src/include/btree.i +++ b/src/include/btree.i @@ -977,8 +977,8 @@ __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) WT_BTREE *btree; WT_INSERT_HEAD *ins_head; WT_INSERT *ins; - uint64_t mem; - int i; + size_t size; + int count; btree = S2BT(session); @@ -1008,28 +1008,31 @@ __wt_page_can_split(WT_SESSION_IMPL *session, WT_PAGE *page) return (false); /* - * There is no point in splitting in-memory if the insert list doesn't - * represent most of the page footprint. Split if there are many items, - * or if there are enough items and the items are a significant part - * of the page's footprint. A 1/4 probability of adding a new skiplist - * level (with level-0 always created), implies a 2nd level entry for - * every 16 entries in the list. If there are at least 256 2nd level - * entries (4K items), or if the update list hits the maximum leaf page - * size, split. 
The reason we're walking the 2nd level list (rather than - * walking, for example, the 5th level list and looking for at least 4 - * entries), is it combines the number of entries test and the size of - * the entries test in one loop. + * There is no point doing an in-memory split unless there is a lot of + * data in the last skiplist on the page. Split if there are enough + * items and the skiplist does not fit within a single disk page. + * + * Rather than scanning the whole list, walk a higher level, which + * gives a sample of the items -- at level 0 we have all the items, at + * level 1 we have 1/4 and at level 2 we have 1/16th. If we see more + * than 30 items and more data than would fit in a disk page, split. */ -#define WT_MIN_SPLIT_SKIPLIST_DEPTH WT_MIN(2, WT_SKIP_MAXDEPTH - 1) +#define WT_MIN_SPLIT_DEPTH 2 +#define WT_MIN_SPLIT_COUNT 30 +#define WT_MIN_SPLIT_MULTIPLIER 16 /* At level 2, we see 1/16th entries */ + ins_head = page->pg_row_entries == 0 ? WT_ROW_INSERT_SMALLEST(page) : WT_ROW_INSERT_SLOT(page, page->pg_row_entries - 1); if (ins_head == NULL) return (false); - for (i = 0, mem = 0, ins = ins_head->head[WT_MIN_SPLIT_SKIPLIST_DEPTH]; - ins != NULL; ins = ins->next[WT_MIN_SPLIT_SKIPLIST_DEPTH]) { - mem += WT_UPDATE_MEMSIZE(ins->upd); - if (++i == 256 || mem > btree->maxleafpage) { + for (count = 0, size = 0, ins = ins_head->head[WT_MIN_SPLIT_DEPTH]; + ins != NULL; ins = ins->next[WT_MIN_SPLIT_DEPTH]) { + count += WT_MIN_SPLIT_MULTIPLIER; + size += WT_MIN_SPLIT_MULTIPLIER * + (WT_INSERT_KEY_SIZE(ins) + WT_UPDATE_MEMSIZE(ins->upd)); + if (count > WT_MIN_SPLIT_COUNT && + size > (size_t)btree->maxleafpage) { WT_STAT_FAST_CONN_INCR(session, cache_inmem_splittable); WT_STAT_FAST_DATA_INCR(session, cache_inmem_splittable); return (true); -- cgit v1.2.1 From 41c9d76d52abce27add669ccaec0d690372e5c47 Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Thu, 1 Oct 2015 04:40:32 +0000 Subject: Fair lock review feedback - cleanup diagnostics --- dist/s_string.ok 
| 1 + src/btree/bt_debug.c | 10 ++++------ src/btree/bt_discard.c | 6 +----- src/include/mutex.i | 14 ++++++++++++++ 4 files changed, 20 insertions(+), 11 deletions(-) diff --git a/dist/s_string.ok b/dist/s_string.ok index 021e222919e..d234a3c101f 100644 --- a/dist/s_string.ok +++ b/dist/s_string.ok @@ -646,6 +646,7 @@ intrin inuse io ip +islocked ispo iteratively jnr diff --git a/src/btree/bt_debug.c b/src/btree/bt_debug.c index f6971631fec..15ae93522a7 100644 --- a/src/btree/bt_debug.c +++ b/src/btree/bt_debug.c @@ -636,7 +636,10 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page) __dmsg(ds, ": %s\n", __wt_page_type_string(page->type)); __dmsg(ds, "\t" "disk %p, entries %" PRIu32, page->dsk, entries); - __dmsg(ds, "%s", __wt_page_is_modified(page) ? ", dirty" : ", clean"); + __dmsg(ds, ", %s", __wt_page_is_modified(page) ? "dirty" : "clean"); + __dmsg(ds, ", %s", __wt_fair_islocked( + session, &page->page_lock) ? "locked" : "unlocked"); + if (F_ISSET_ATOMIC(page, WT_PAGE_BUILD_KEYS)) __dmsg(ds, ", keys-built"); if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC)) @@ -650,11 +653,6 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page) if (F_ISSET_ATOMIC(page, WT_PAGE_SPLIT_INSERT)) __dmsg(ds, ", split-insert"); - if (__wt_fair_trylock(session, &page->page_lock) != 0) - __dmsg(ds, ", locked"); - else - WT_RET(__wt_fair_unlock(session, &page->page_lock)); - if (mod != NULL) switch (mod->rec_result) { case WT_PM_REC_EMPTY: diff --git a/src/btree/bt_discard.c b/src/btree/bt_discard.c index c3da536170d..998667e3e1f 100644 --- a/src/btree/bt_discard.c +++ b/src/btree/bt_discard.c @@ -55,11 +55,7 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) */ WT_ASSERT(session, !__wt_page_is_modified(page)); WT_ASSERT(session, !F_ISSET_ATOMIC(page, WT_PAGE_EVICT_LRU)); - /* - * This will leave the page lock held - but that doesn't matter - we - * are about to free the memory anyway. 
- */ - WT_ASSERT(session, __wt_fair_trylock(session, &page->page_lock) == 0); + WT_ASSERT(session, !__wt_fair_islocked(session, &page->page_lock)); #ifdef HAVE_DIAGNOSTIC { diff --git a/src/include/mutex.i b/src/include/mutex.i index befa8d4dda8..54a9cc6f9fd 100644 --- a/src/include/mutex.i +++ b/src/include/mutex.i @@ -325,3 +325,17 @@ __wt_fair_unlock(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock) return (0); } + +#ifdef HAVE_DIAGNOSTIC +/* + * __wt_fair_islocked -- + * Test whether the lock is currently held + */ +static inline bool +__wt_fair_islocked(WT_SESSION_IMPL *session, WT_FAIR_LOCK *lock) +{ + WT_UNUSED(session); + + return (lock->fair_lock_waiter != lock->fair_lock_owner); +} +#endif -- cgit v1.2.1 From 0f2ff00afe60668d119da7e71541a3a6fc68b733 Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Thu, 1 Oct 2015 04:50:44 +0000 Subject: Lint: Fallthrough -> FALLTHROUGH --- src/evict/evict_lru.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index 22bfa57ee62..d0cc60b583d 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -1521,7 +1521,7 @@ __wt_cache_eviction_worker(WT_SESSION_IMPL *session, bool busy, u_int pct_full) cache->app_evicts++; if (txn_busy) return (0); - /* Fallthrough */ + /* FALLTHROUGH */ case EBUSY: break; case WT_NOTFOUND: -- cgit v1.2.1 From 1ea12d03e2132a5c2e8c0e0822749c509b55b773 Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Thu, 1 Oct 2015 14:52:24 +1000 Subject: WT-2149 Fix the order of creation of the lookaside table. The lookaside table has to be created after recovery has completed because it needs a file ID and otherwise recovery could overwrite its entry in the metadata. However, recovery needs eviction threads running because recovery could touch more data than fits in cache. Resolve this ordering problem by having recovery start a special set of eviction threads without lookaside table cursors. 
Once it is done, they are shut down, the lookaside table can be created, and a new set of eviction threads started. --- src/conn/conn_api.c | 3 --- src/conn/conn_open.c | 27 +++++++++++++++------------ src/evict/evict_lru.c | 15 +++++++++------ src/include/extern.h | 2 +- src/txn/txn_recover.c | 21 ++++++++++++++++++++- 5 files changed, 45 insertions(+), 23 deletions(-) diff --git a/src/conn/conn_api.c b/src/conn/conn_api.c index 5cb39bb055a..5601936cd19 100644 --- a/src/conn/conn_api.c +++ b/src/conn/conn_api.c @@ -2034,9 +2034,6 @@ wiredtiger_open(const char *home, WT_EVENT_HANDLER *event_handler, WT_ERR(__wt_turtle_init(session)); WT_ERR(__wt_metadata_open(session)); - /* Create the lookaside table. */ - WT_ERR(__wt_las_create(session)); - /* Start the worker threads and run recovery. */ WT_ERR(__wt_connection_workers(session, cfg)); diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c index 675c3765d1a..6db0c4bb10c 100644 --- a/src/conn/conn_open.c +++ b/src/conn/conn_open.c @@ -227,11 +227,6 @@ __wt_connection_close(WT_CONNECTION_IMPL *conn) int __wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[]) { - /* - * Start the eviction thread. - */ - WT_RET(__wt_evict_create(session)); - /* * Start the optional statistics thread. Start statistics first so that * other optional threads can know if statistics are enabled or not. @@ -242,19 +237,27 @@ __wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[]) /* Run recovery. */ WT_RET(__wt_txn_recover(session)); + /* + * Start the optional logging/archive threads. + * NOTE: The log manager must be started before checkpoints so that the + * checkpoint server knows if logging is enabled. It must also be + * started before any operation that can commit, or the commit can + * block. + */ + WT_RET(__wt_logmgr_open(session)); + + /* Create the lookaside table. */ + WT_RET(__wt_las_create(session)); + + /* Start eviction threads. 
*/ + WT_RET(__wt_evict_create(session, true)); + /* Start the handle sweep thread. */ WT_RET(__wt_sweep_create(session)); /* Start the optional async threads. */ WT_RET(__wt_async_create(session, cfg)); - /* - * Start the optional logging/archive thread. - * NOTE: The log manager must be started before checkpoints so that the - * checkpoint server knows if logging is enabled. - */ - WT_RET(__wt_logmgr_open(session)); - /* Start the optional checkpoint thread. */ WT_RET(__wt_checkpoint_server_create(session, cfg)); diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index c7da699b77e..42cd3e0b53d 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -240,10 +240,13 @@ __evict_workers_resize(WT_SESSION_IMPL *session) conn = S2C(session); - alloc = conn->evict_workers_alloc * sizeof(*workers); - WT_RET(__wt_realloc(session, &alloc, - conn->evict_workers_max * sizeof(*workers), &conn->evict_workctx)); - workers = conn->evict_workctx; + if (conn->evict_workers_alloc < conn->evict_workers_max) { + alloc = conn->evict_workers_alloc * sizeof(*workers); + WT_RET(__wt_realloc(session, &alloc, + conn->evict_workers_max * sizeof(*workers), + &conn->evict_workctx)); + workers = conn->evict_workctx; + } for (i = conn->evict_workers_alloc; i < conn->evict_workers_max; i++) { /* @@ -275,7 +278,7 @@ err: conn->evict_workers_alloc = conn->evict_workers_max; * Start the eviction server thread. */ int -__wt_evict_create(WT_SESSION_IMPL *session) +__wt_evict_create(WT_SESSION_IMPL *session, bool with_las) { WT_CONNECTION_IMPL *conn; uint32_t session_flags; @@ -294,7 +297,7 @@ __wt_evict_create(WT_SESSION_IMPL *session) * perform slow operations for the block manager. (The flag is not * reset if reconfigured later, but I doubt that's a problem.) */ - session_flags = WT_SESSION_LOOKASIDE_CURSOR; + session_flags = with_las ? 
WT_SESSION_LOOKASIDE_CURSOR : 0; if (conn->evict_workers_max == 0) FLD_SET(session_flags, WT_SESSION_CAN_WAIT); WT_RET(__wt_open_internal_session(conn, diff --git a/src/include/extern.h b/src/include/extern.h index 6c844f38f5d..c28a24343ee 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -318,7 +318,7 @@ extern int __wt_curtable_open(WT_SESSION_IMPL *session, const char *uri, const c extern int __wt_evict_file(WT_SESSION_IMPL *session, int syncop); extern void __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_evict_server_wake(WT_SESSION_IMPL *session); -extern int __wt_evict_create(WT_SESSION_IMPL *session); +extern int __wt_evict_create(WT_SESSION_IMPL *session, bool with_las); extern int __wt_evict_destroy(WT_SESSION_IMPL *session); extern int __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp); extern void __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session); diff --git a/src/txn/txn_recover.c b/src/txn/txn_recover.c index f2b181711d1..9d8a19cbff3 100644 --- a/src/txn/txn_recover.c +++ b/src/txn/txn_recover.c @@ -412,11 +412,12 @@ __wt_txn_recover(WT_SESSION_IMPL *session) WT_RECOVERY r; struct WT_RECOVERY_FILE *metafile; char *config; - bool needs_rec, was_backup; + bool eviction_started, needs_rec, was_backup; conn = S2C(session); WT_CLEAR(r); WT_INIT_LSN(&r.ckpt_lsn); + eviction_started = false; was_backup = F_ISSET(conn, WT_CONN_WAS_BACKUP); /* We need a real session for recovery. */ @@ -494,6 +495,15 @@ __wt_txn_recover(WT_SESSION_IMPL *session) */ if (needs_rec && FLD_ISSET(conn->log_flags, WT_CONN_LOG_RECOVER_ERR)) WT_ERR(WT_RUN_RECOVERY); + + /* + * Recovery can touch more data than fits in cache, so it relies on + * regular eviction to manage paging. Start eviction threads for + * recovery without LAS cursors. + */ + WT_ERR(__wt_evict_create(session, false)); + eviction_started = true; + /* * Always run recovery even if it was a clean shutdown. 
* We can consider skipping it in the future. @@ -522,6 +532,15 @@ __wt_txn_recover(WT_SESSION_IMPL *session) done: FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE); err: WT_TRET(__recovery_free(&r)); __wt_free(session, config); + + /* + * Destroy the eviction threads that were started in support of + * recovery. They will be restarted once the lookaside table is + * created. + */ + if (eviction_started) + WT_TRET(__wt_evict_destroy(session)); + WT_TRET(session->iface.close(&session->iface, NULL)); return (ret); -- cgit v1.2.1 From 735e3ddcd3a12a1a53af1bf6469ec8b34ab60f9a Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Thu, 1 Oct 2015 15:59:30 +1000 Subject: WT-2149 Remove files from the dummy session during connection close so the lookaside handle isn't referenced after it is closed. --- src/conn/conn_dhandle.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c index 77e7693042b..0b364b5fd4b 100644 --- a/src/conn/conn_dhandle.c +++ b/src/conn/conn_dhandle.c @@ -678,11 +678,15 @@ __wt_conn_dhandle_discard(WT_SESSION_IMPL *session) conn = S2C(session); /* - * Close open data handles: first, everything but the metadata file - * (as closing a normal file may open and write the metadata file), - * then the metadata file. This function isn't called often, and I - * don't want to "know" anything about the metadata file's position on - * the list, so we do it the hard way. + * Empty the session cache: any data handles created in a connection + * method may be cached here, and we're about to close them. + */ + __wt_session_close_cache(session); + + /* + * Close open data handles: first, everything but the metadata file (as + * closing a normal file may open and write the metadata file), then + * the metadata file. 
*/ restart: TAILQ_FOREACH(dhandle, &conn->dhqh, q) { -- cgit v1.2.1 From 996fe0cc5924708e1476228bfbb28a04ba3115ae Mon Sep 17 00:00:00 2001 From: Alex Gorrod Date: Thu, 1 Oct 2015 17:00:09 +1000 Subject: WT-2155 Remove last use of F_CAS_ATOMIC and the associated macro. --- src/conn/conn_cache_pool.c | 17 ++++++++--------- src/include/cache.h | 7 ++++--- src/include/hardware.h | 13 ------------- 3 files changed, 12 insertions(+), 25 deletions(-) diff --git a/src/conn/conn_cache_pool.c b/src/conn/conn_cache_pool.c index 6294e3b01a7..ae9e7eb1d60 100644 --- a/src/conn/conn_cache_pool.c +++ b/src/conn/conn_cache_pool.c @@ -275,7 +275,7 @@ __wt_conn_cache_pool_open(WT_SESSION_IMPL *session) * in each connection saves having a complex election process when * the active connection shuts down. */ - F_SET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE); + F_SET(cp, WT_CACHE_POOL_ACTIVE); F_SET(cache, WT_CACHE_POOL_RUN); WT_RET(__wt_thread_create(session, &cache->cp_tid, __wt_cache_pool_server, cache->cp_session)); @@ -366,10 +366,10 @@ __wt_conn_cache_pool_destroy(WT_SESSION_IMPL *session) if (--cp->refs == 0) { WT_ASSERT(session, TAILQ_EMPTY(&cp->cache_pool_qh)); - F_CLR_ATOMIC(cp, WT_CACHE_POOL_ACTIVE); + F_CLR(cp, WT_CACHE_POOL_ACTIVE); } - if (!F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE)) { + if (!F_ISSET(cp, WT_CACHE_POOL_ACTIVE)) { WT_TRET(__wt_verbose( session, WT_VERB_SHARED_CACHE, "Destroying cache pool")); __wt_spin_lock(session, &__wt_process.spinlock); @@ -398,7 +398,7 @@ __wt_conn_cache_pool_destroy(WT_SESSION_IMPL *session) /* Notify other participants if we were managing */ if (F_ISSET(cache, WT_CACHE_POOL_MANAGER)) { - F_CLR_ATOMIC(cp, WT_CACHE_POOL_MANAGED); + cp->pool_managed = 0; WT_TRET(__wt_verbose(session, WT_VERB_SHARED_CACHE, "Shutting down shared cache manager connection")); } @@ -438,7 +438,7 @@ __cache_pool_balance(WT_SESSION_IMPL *session, bool forward) * - Reduce the amount allocated, if we are over the budget * - Increase the amount used if there is capacity and 
any pressure. */ - while (F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) && + while (F_ISSET(cp, WT_CACHE_POOL_ACTIVE) && F_ISSET(S2C(session)->cache, WT_CACHE_POOL_RUN)) { WT_ERR(__cache_pool_adjust( session, highest, bump_threshold, forward, &adjusted)); @@ -728,7 +728,7 @@ __wt_cache_pool_server(void *arg) cache = S2C(session)->cache; forward = true; - while (F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) && + while (F_ISSET(cp, WT_CACHE_POOL_ACTIVE) && F_ISSET(cache, WT_CACHE_POOL_RUN)) { if (cp->currently_used <= cp->size) WT_ERR(__wt_cond_wait(session, @@ -738,13 +738,12 @@ __wt_cache_pool_server(void *arg) * Re-check pool run flag - since we want to avoid getting the * lock on shutdown. */ - if (!F_ISSET_ATOMIC(cp, WT_CACHE_POOL_ACTIVE) && + if (!F_ISSET(cp, WT_CACHE_POOL_ACTIVE) && F_ISSET(cache, WT_CACHE_POOL_RUN)) break; /* Try to become the managing thread */ - F_CAS_ATOMIC(cp, WT_CACHE_POOL_MANAGED, ret); - if (ret == 0) { + if (__wt_atomic_cas8(&cp->pool_managed, 0, 1)) { F_SET(cache, WT_CACHE_POOL_MANAGER); WT_ERR(__wt_verbose(session, WT_VERB_SHARED_CACHE, "Cache pool switched manager thread")); diff --git a/src/include/cache.h b/src/include/cache.h index f199372ea5e..caf8996e68b 100644 --- a/src/include/cache.h +++ b/src/include/cache.h @@ -151,7 +151,8 @@ struct __wt_cache_pool { /* Locked: List of connections participating in the cache pool. 
*/ TAILQ_HEAD(__wt_cache_pool_qh, __wt_connection_impl) cache_pool_qh; -#define WT_CACHE_POOL_MANAGED 0x01 /* Cache pool has a manager thread */ -#define WT_CACHE_POOL_ACTIVE 0x02 /* Cache pool is active */ - uint8_t flags_atomic; + uint8_t pool_managed; /* Cache pool has a manager thread */ + +#define WT_CACHE_POOL_ACTIVE 0x01 /* Cache pool is active */ + uint8_t flags; }; diff --git a/src/include/hardware.h b/src/include/hardware.h index c9b72f8a609..1ab2c3d39c4 100644 --- a/src/include/hardware.h +++ b/src/include/hardware.h @@ -37,19 +37,6 @@ &(p)->flags_atomic, __orig, __orig | (uint8_t)(mask))); \ } while (0) -#define F_CAS_ATOMIC(p, mask, ret) do { \ - uint8_t __orig; \ - ret = 0; \ - do { \ - __orig = (p)->flags_atomic; \ - if ((__orig & (uint8_t)(mask)) != 0) { \ - ret = EBUSY; \ - break; \ - } \ - } while (!__wt_atomic_cas8( \ - &(p)->flags_atomic, __orig, __orig | (uint8_t)(mask))); \ -} while (0) - #define F_CLR_ATOMIC(p, mask) do { \ uint8_t __orig; \ do { \ -- cgit v1.2.1 From 20f68fc491d27e854adf10bac87b443f6cb7efa3 Mon Sep 17 00:00:00 2001 From: Susan LoVerso Date: Thu, 1 Oct 2015 13:56:41 -0400 Subject: WT-2151 Reset pre-alloc files to 1, it didn't affect performance. --- src/conn/conn_log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c index 6f9021ccfc3..bf2447fb646 100644 --- a/src/conn/conn_log.c +++ b/src/conn/conn_log.c @@ -109,7 +109,7 @@ __logmgr_config( */ WT_RET(__wt_config_gets(session, cfg, "log.prealloc", &cval)); if (cval.val != 0) - conn->log_prealloc = 5; + conn->log_prealloc = 1; /* * Note that it is meaningless to reconfigure this value during -- cgit v1.2.1 From 2c6072d869d0945812af49aaa8ba0613565f9284 Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Fri, 2 Oct 2015 16:48:55 +1000 Subject: WT-2149 WT-2156 Allow eviction workers to restart. 
--- src/evict/evict_lru.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index a597079ff47..dd1c86b5ad8 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -364,6 +364,8 @@ __wt_evict_destroy(WT_SESSION_IMPL *session) WT_TRET(__wt_cond_signal(session, cache->evict_waiter_cond)); WT_TRET(__wt_thread_join(session, workers[i].tid)); } + conn->evict_workers = 0; + /* Handle shutdown when cleaning up after a failed open. */ if (conn->evict_workctx != NULL) { for (i = 0; i < conn->evict_workers_alloc; i++) { @@ -373,6 +375,7 @@ __wt_evict_destroy(WT_SESSION_IMPL *session) } __wt_free(session, conn->evict_workctx); } + conn->evict_workers_alloc = 0; if (conn->evict_session != NULL) { wt_session = &conn->evict_session->iface; -- cgit v1.2.1 From 4e7974c2c4fa4a6ec06ae21efea477635bfc14d9 Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Mon, 5 Oct 2015 09:51:36 -0400 Subject: We lost a return in 67fd264, add it back in so we don't check the config twice on success in one path. --- src/config/config.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/config/config.c b/src/config/config.c index 27de6264a28..505b843aa86 100644 --- a/src/config/config.c +++ b/src/config/config.c @@ -745,11 +745,16 @@ __wt_config_gets_def(WT_SESSION_IMPL *session, *value = false_value; value->val = def; + if (cfg == NULL || cfg[0] == NULL || cfg[1] == NULL) return (0); - else if (cfg[2] == NULL) + + if (cfg[2] == NULL) { WT_RET_NOTFOUND_OK( __wt_config_getones(session, cfg[1], key, value)); + return (0); + } + return (__wt_config_gets(session, cfg, key, value)); } -- cgit v1.2.1 From a20a23a50a63a466504b7149f957c5eac0c5e657 Mon Sep 17 00:00:00 2001 From: Susan LoVerso Date: Mon, 5 Oct 2015 12:26:38 -0400 Subject: Add "WARNING" string to max_latency failure so Jenkins can detect. 
--- bench/wtperf/wtperf.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bench/wtperf/wtperf.c b/bench/wtperf/wtperf.c index 0d30bb6b2b7..20c30e10482 100644 --- a/bench/wtperf/wtperf.c +++ b/bench/wtperf/wtperf.c @@ -1198,8 +1198,12 @@ monitor(void *arg) if (latency_max != 0 && (read_max > latency_max || insert_max > latency_max || update_max > latency_max)) + /* + * Make this a non-fatal error and print WARNING in + * the output so Jenkins can flag it as unstable. + */ lprintf(cfg, 0, 0, - "max latency exceeded: threshold %" PRIu32 + "WARNING: max latency exceeded: threshold %" PRIu32 " read max %" PRIu32 " insert max %" PRIu32 " update max %" PRIu32, latency_max, read_max, insert_max, update_max); -- cgit v1.2.1 From 54150b85ec2d1647bf56a949e85d2d61c8b8e04b Mon Sep 17 00:00:00 2001 From: Keith Bostic Date: Mon, 5 Oct 2015 15:15:41 -0400 Subject: WT-2161: the statistics log starts before we create the lookaside table, WT_CONNECTION_IMPL.las_session may be NULL, and because the session is created before the cursor, it has to be checked as well. --- src/cache/cache_las.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/cache/cache_las.c b/src/cache/cache_las.c index 714963b18d4..30de8ce2301 100644 --- a/src/cache/cache_las.c +++ b/src/cache/cache_las.c @@ -24,10 +24,12 @@ __wt_las_stats_update(WT_SESSION_IMPL *session) /* * Lookaside table statistics are copied from the underlying lookaside * table data-source statistics. If there's no lookaside table, values - * remain 0. In the current system, there's always a lookaside table, - * but there's no reason not to be cautious. + * remain 0. + * + * The statistics log is started before we create the lookaside table, + * check the session and the cursor for validity. 
*/ - if (conn->las_session->las_cursor == NULL) + if (conn->las_session == NULL || conn->las_session->las_cursor == NULL) return; /* -- cgit v1.2.1 From d2db4c424dd5513bfd5cf64525914099fc7774fd Mon Sep 17 00:00:00 2001 From: Susan LoVerso Date: Tue, 6 Oct 2015 12:19:00 -0400 Subject: WT-2158 Lint --- src/log/log.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/log/log.c b/src/log/log.c index 6277785940c..832f15f4821 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -368,8 +368,7 @@ __log_zero(WT_SESSION_IMPL *session, WT_DECL_ITEM(zerobuf); WT_DECL_RET; WT_LOG *log; - wt_off_t off, partial; - uint32_t allocsize, bufsz, wrlen; + uint32_t allocsize, bufsz, off, partial, wrlen; conn = S2C(session); log = conn->log; @@ -383,7 +382,7 @@ __log_zero(WT_SESSION_IMPL *session, * If they're using smaller log files, cap it at the file size. */ if (conn->log_file_max < bufsz) - bufsz = conn->log_file_max; + bufsz = (uint32_t)conn->log_file_max; WT_RET(__wt_scr_alloc(session, bufsz, &zerobuf)); memset(zerobuf->mem, 0, zerobuf->memsize); WT_STAT_FAST_CONN_INCR(session, log_zero_fills); @@ -393,14 +392,14 @@ __log_zero(WT_SESSION_IMPL *session, * we reach the beginning or we find a chunk that contains any non-zero * bytes. Compare against a known zero byte chunk. */ - off = start_off; - while (off < len) { + off = (uint32_t)start_off; + while (off < (uint32_t)len) { /* * Typically we start to zero the file after the log header * and the bufsz is a sector-aligned size. So we want to * align our writes when we can. */ - partial = off % (wt_off_t)bufsz; + partial = off % bufsz; if (partial != 0) wrlen = bufsz - partial; else @@ -408,9 +407,10 @@ __log_zero(WT_SESSION_IMPL *session, /* * Check if we're writing a partial amount at the end too. 
*/ - if (len - off < bufsz) - wrlen = len - off; - WT_ERR(__wt_write(session, fh, off, wrlen, zerobuf->mem)); + if ((uint32_t)len - off < bufsz) + wrlen = (uint32_t)len - off; + WT_ERR(__wt_write(session, + fh, (wt_off_t)off, wrlen, zerobuf->mem)); off += wrlen; } err: __wt_scr_free(session, &zerobuf); -- cgit v1.2.1 From 5c0cc8192507cf52f7cda30109b07ba11685602b Mon Sep 17 00:00:00 2001 From: Susan LoVerso Date: Tue, 6 Oct 2015 17:19:46 -0400 Subject: WT-2163 Remove unneeded arg. --- src/conn/conn_log.c | 2 +- src/include/extern.h | 2 +- src/log/log.c | 7 +++---- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/src/conn/conn_log.c b/src/conn/conn_log.c index a2555ba536f..9068e7e85a2 100644 --- a/src/conn/conn_log.c +++ b/src/conn/conn_log.c @@ -260,7 +260,7 @@ __log_prealloc_once(WT_SESSION_IMPL *session) */ for (i = reccount; i < (u_int)conn->log_prealloc; i++) { WT_ERR(__wt_log_allocfile( - session, ++log->prep_fileid, WT_LOG_PREPNAME, true)); + session, ++log->prep_fileid, WT_LOG_PREPNAME)); WT_STAT_FAST_CONN_INCR(session, log_prealloc_files); } /* diff --git a/src/include/extern.h b/src/include/extern.h index c196d550c7e..171f6b02768 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -337,7 +337,7 @@ extern int __wt_log_get_all_files(WT_SESSION_IMPL *session, char ***filesp, u_in extern void __wt_log_files_free(WT_SESSION_IMPL *session, char **files, u_int count); extern int __wt_log_extract_lognum( WT_SESSION_IMPL *session, const char *name, uint32_t *id); extern int __wt_log_acquire(WT_SESSION_IMPL *session, uint64_t recsize, WT_LOGSLOT *slot); -extern int __wt_log_allocfile( WT_SESSION_IMPL *session, uint32_t lognum, const char *dest, bool prealloc); +extern int __wt_log_allocfile( WT_SESSION_IMPL *session, uint32_t lognum, const char *dest); extern int __wt_log_remove(WT_SESSION_IMPL *session, const char *file_prefix, uint32_t lognum); extern int __wt_log_open(WT_SESSION_IMPL *session); extern int __wt_log_close(WT_SESSION_IMPL 
*session); diff --git a/src/log/log.c b/src/log/log.c index 832f15f4821..efe4d22eeca 100644 --- a/src/log/log.c +++ b/src/log/log.c @@ -822,7 +822,7 @@ __log_newfile(WT_SESSION_IMPL *session, bool conn_open, bool *created) if (create_log) { log->prep_missed++; WT_RET(__wt_log_allocfile( - session, log->fileid, WT_LOG_FILENAME, true)); + session, log->fileid, WT_LOG_FILENAME)); } WT_RET(__log_openfile(session, false, &log->log_fh, WT_LOG_FILENAME, log->fileid)); @@ -973,7 +973,7 @@ err: WT_TRET(__wt_close(session, &log_fh)); */ int __wt_log_allocfile( - WT_SESSION_IMPL *session, uint32_t lognum, const char *dest, bool prealloc) + WT_SESSION_IMPL *session, uint32_t lognum, const char *dest) { WT_CONNECTION_IMPL *conn; WT_DECL_ITEM(from_path); @@ -1005,8 +1005,7 @@ __wt_log_allocfile( WT_ERR(__log_openfile(session, true, &log_fh, WT_LOG_TMPNAME, tmp_id)); WT_ERR(__log_file_header(session, log_fh, NULL, true)); WT_ERR(__wt_ftruncate(session, log_fh, WT_LOG_FIRST_RECORD)); - if (prealloc) - WT_ERR(__log_prealloc(session, log_fh)); + WT_ERR(__log_prealloc(session, log_fh)); WT_ERR(__wt_fsync(session, log_fh)); WT_ERR(__wt_close(session, &log_fh)); WT_ERR(__wt_verbose(session, WT_VERB_LOG, -- cgit v1.2.1 From 6def405726ba8060f968dac388cc8eb07e09a242 Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Wed, 7 Oct 2015 15:55:26 +1100 Subject: WT-2157 If we give up trying to split a page, make sure it is written by the next checkpoint. 
--- src/btree/bt_sync.c | 3 +-- src/reconcile/rec_write.c | 37 +++++++++++-------------------------- 2 files changed, 12 insertions(+), 28 deletions(-) diff --git a/src/btree/bt_sync.c b/src/btree/bt_sync.c index 247bdef65c8..237d900c3d1 100644 --- a/src/btree/bt_sync.c +++ b/src/btree/bt_sync.c @@ -140,8 +140,7 @@ __sync_file(WT_SESSION_IMPL *session, int syncop) */ if (!WT_PAGE_IS_INTERNAL(page) && F_ISSET(txn, WT_TXN_HAS_SNAPSHOT) && - WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn) && - mod->rec_result != WT_PM_REC_REWRITE) { + WT_TXNID_LT(txn->snap_max, mod->first_dirty_txn)) { __wt_page_modify_set(session, page); continue; } diff --git a/src/reconcile/rec_write.c b/src/reconcile/rec_write.c index f2c32a434bf..40917bebf56 100644 --- a/src/reconcile/rec_write.c +++ b/src/reconcile/rec_write.c @@ -44,7 +44,6 @@ typedef struct { * Track maximum transaction ID seen and first unwritten transaction ID. */ uint64_t max_txn; - uint64_t first_dirty_txn; /* * When we can't mark the page clean (for example, checkpoint found some @@ -292,7 +291,7 @@ typedef struct { } WT_RECONCILE; static void __rec_bnd_cleanup(WT_SESSION_IMPL *, WT_RECONCILE *, bool); -static void __rec_cell_build_addr( +static void __rec_cell_build_addr(WT_SESSION_IMPL *, WT_RECONCILE *, const void *, size_t, u_int, uint64_t); static int __rec_cell_build_int_key(WT_SESSION_IMPL *, WT_RECONCILE *, const void *, size_t, bool *); @@ -537,11 +536,6 @@ __rec_write_status(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) * Set the page's status based on whether or not we cleaned the page. */ if (r->leave_dirty) { - /* - * Update the page's first unwritten transaction ID. - */ - mod->first_dirty_txn = r->first_dirty_txn; - /* * The page remains dirty. 
* @@ -880,12 +874,6 @@ __rec_write_init(WT_SESSION_IMPL *session, r->cache_write_lookaside = r->cache_write_restore = false; - /* - * Running transactions may update the page after we write it, so - * this is the highest ID we can be confident we will see. - */ - r->first_dirty_txn = conn->txn_global.last_running; - return (0); } @@ -1083,17 +1071,11 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, if ((txnid = upd->txnid) == WT_TXN_ABORTED) continue; - /* - * Track the largest/smallest transaction IDs on the list and - * the smallest not-globally-visible transaction on the page. - */ + /* Track the largest/smallest transaction IDs on the list. */ if (WT_TXNID_LT(max_txn, txnid)) max_txn = txnid; if (WT_TXNID_LT(txnid, min_txn)) min_txn = txnid; - if (WT_TXNID_LT(txnid, r->first_dirty_txn) && - !__wt_txn_visible_all(session, txnid)) - r->first_dirty_txn = txnid; /* * Find the first update we can use. @@ -3837,7 +3819,8 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) val->cell_len = 0; val->len = val->buf.size; } else - __rec_cell_build_addr(r, addr->addr, addr->size, + __rec_cell_build_addr(session, r, + addr->addr, addr->size, __rec_vtype(addr), ref->key.recno); WT_CHILD_RELEASE_ERR(session, hazard, ref); @@ -3883,7 +3866,7 @@ __rec_col_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) /* Build the value cell. */ addr = &multi->addr; - __rec_cell_build_addr(r, + __rec_cell_build_addr(session, r, addr->addr, addr->size, __rec_vtype(addr), r->recno); /* Boundary: split or write the page. */ @@ -4708,7 +4691,7 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) vtype = state == WT_CHILD_PROXY ? 
WT_CELL_ADDR_DEL : (u_int)vpack->raw; } - __rec_cell_build_addr(r, p, size, vtype, WT_RECNO_OOB); + __rec_cell_build_addr(session, r, p, size, vtype, WT_RECNO_OOB); WT_CHILD_RELEASE_ERR(session, hazard, ref); /* @@ -4794,8 +4777,8 @@ __rec_row_merge(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) r->cell_zero = false; addr = &multi->addr; - __rec_cell_build_addr( - r, addr->addr, addr->size, __rec_vtype(addr), WT_RECNO_OOB); + __rec_cell_build_addr(session, r, + addr->addr, addr->size, __rec_vtype(addr), WT_RECNO_OOB); /* Boundary: split or write the page. */ if (key->len + val->len > r->space_avail) @@ -5863,13 +5846,15 @@ __rec_cell_build_leaf_key(WT_SESSION_IMPL *session, * on the page. */ static void -__rec_cell_build_addr(WT_RECONCILE *r, +__rec_cell_build_addr(WT_SESSION_IMPL *session, WT_RECONCILE *r, const void *addr, size_t size, u_int cell_type, uint64_t recno) { WT_KV *val; val = &r->v; + WT_ASSERT(session, size != 0 || cell_type == WT_CELL_ADDR_DEL); + /* * We don't check the address size because we can't store an address on * an overflow page: if the address won't fit, the overflow page's -- cgit v1.2.1 From a792371e4584c49f3c94b9e5843e0d57b2341fd8 Mon Sep 17 00:00:00 2001 From: Michael Cahill Date: Wed, 7 Oct 2015 16:13:12 +1100 Subject: WT-2149 Track if the lookaside table is open when starting eviction workers. 
--- dist/flags.py | 1 + src/cache/cache_las.c | 24 ++++++++++++------------ src/conn/conn_open.c | 15 ++++++++++++--- src/evict/evict_lru.c | 16 +++++++++++----- src/include/extern.h | 2 +- src/include/flags.h | 23 ++++++++++++----------- src/txn/txn_recover.c | 5 ++++- 7 files changed, 53 insertions(+), 33 deletions(-) diff --git a/dist/flags.py b/dist/flags.py index d98f249335e..65b68cf4277 100644 --- a/dist/flags.py +++ b/dist/flags.py @@ -92,6 +92,7 @@ flags = { 'CONN_CKPT_SYNC', 'CONN_CLOSING', 'CONN_EVICTION_RUN', + 'CONN_LAS_OPEN', 'CONN_LEAK_MEMORY', 'CONN_LOG_SERVER_RUN', 'CONN_LSM_MERGE', diff --git a/src/cache/cache_las.c b/src/cache/cache_las.c index 30de8ce2301..2eb406c2af8 100644 --- a/src/cache/cache_las.c +++ b/src/cache/cache_las.c @@ -25,11 +25,8 @@ __wt_las_stats_update(WT_SESSION_IMPL *session) * Lookaside table statistics are copied from the underlying lookaside * table data-source statistics. If there's no lookaside table, values * remain 0. - * - * The statistics log is started before we create the lookaside table, - * check the session and the cursor for validity. */ - if (conn->las_session == NULL || conn->las_session->las_cursor == NULL) + if (!F_ISSET(conn, WT_CONN_LAS_OPEN)) return; /* @@ -62,25 +59,28 @@ __wt_las_create(WT_SESSION_IMPL *session) /* * Done at startup: we cannot do it on demand because we require the - * schema lock to create and drop the file, and it may not always be + * schema lock to create and drop the table, and it may not always be * available. * - * Discard any previous incarnation of the file. + * Discard any previous incarnation of the table. */ WT_RET(__wt_session_drop(session, WT_LAS_URI, drop_cfg)); - /* Re-create the file. */ + /* Re-create the table. */ WT_RET(__wt_session_create(session, WT_LAS_URI, WT_LAS_FORMAT)); /* - * Open an internal session, used for the shared lookaside cursor. - * - * Sessions associated with a lookaside cursor should never be tapped - * for eviction. 
+ * Open a shared internal session used to access the lookaside table. + * This session should never be tapped for eviction. */ session_flags = WT_SESSION_LOOKASIDE_CURSOR | WT_SESSION_NO_EVICTION; - return (__wt_open_internal_session( + WT_RET(__wt_open_internal_session( conn, "lookaside table", true, session_flags, &conn->las_session)); + + /* Flag that the lookaside table has been created. */ + F_SET(conn, WT_CONN_LAS_OPEN); + + return (0); } /* diff --git a/src/conn/conn_open.c b/src/conn/conn_open.c index 6db0c4bb10c..04815c8e152 100644 --- a/src/conn/conn_open.c +++ b/src/conn/conn_open.c @@ -234,7 +234,13 @@ __wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[]) WT_RET(__wt_statlog_create(session, cfg)); WT_RET(__wt_logmgr_create(session, cfg)); - /* Run recovery. */ + /* + * Run recovery. + * NOTE: This call will start (and stop) eviction if recovery is + * required. Recovery must run before the lookaside table is created + * (because recovery will update the metadata), and before eviction is + * started for real. + */ WT_RET(__wt_txn_recover(session)); /* @@ -249,8 +255,11 @@ __wt_connection_workers(WT_SESSION_IMPL *session, const char *cfg[]) /* Create the lookaside table. */ WT_RET(__wt_las_create(session)); - /* Start eviction threads. */ - WT_RET(__wt_evict_create(session, true)); + /* + * Start eviction threads. + * NOTE: Eviction must be started after the lookaside table is created. + */ + WT_RET(__wt_evict_create(session)); /* Start the handle sweep thread. */ WT_RET(__wt_sweep_create(session)); diff --git a/src/evict/evict_lru.c b/src/evict/evict_lru.c index dd1c86b5ad8..f9171900ca4 100644 --- a/src/evict/evict_lru.c +++ b/src/evict/evict_lru.c @@ -251,12 +251,17 @@ __evict_workers_resize(WT_SESSION_IMPL *session) for (i = conn->evict_workers_alloc; i < conn->evict_workers_max; i++) { /* * Eviction worker threads get their own session. - * Eviction worker threads get their own lookaside table cursor. 
* Eviction worker threads may be called upon to perform slow * operations for the block manager. + * + * Eviction worker threads get their own lookaside table cursor + * if the lookaside table is open. Note that eviction is also + * started during recovery, before the lookaside table is + * created. */ - session_flags = - WT_SESSION_CAN_WAIT | WT_SESSION_LOOKASIDE_CURSOR; + session_flags = WT_SESSION_CAN_WAIT; + if (F_ISSET(conn, WT_CONN_LAS_OPEN)) + FLD_SET(session_flags, WT_SESSION_LOOKASIDE_CURSOR); WT_ERR(__wt_open_internal_session(conn, "eviction-worker", false, session_flags, &workers[i].session)); workers[i].id = i; @@ -278,7 +283,7 @@ err: conn->evict_workers_alloc = conn->evict_workers_max; * Start the eviction server thread. */ int -__wt_evict_create(WT_SESSION_IMPL *session, bool with_las) +__wt_evict_create(WT_SESSION_IMPL *session) { WT_CONNECTION_IMPL *conn; uint32_t session_flags; @@ -297,7 +302,8 @@ __wt_evict_create(WT_SESSION_IMPL *session, bool with_las) * perform slow operations for the block manager. (The flag is not * reset if reconfigured later, but I doubt that's a problem.) */ - session_flags = with_las ? WT_SESSION_LOOKASIDE_CURSOR : 0; + session_flags = F_ISSET(conn, WT_CONN_LAS_OPEN) ? 
+ WT_SESSION_LOOKASIDE_CURSOR : 0; if (conn->evict_workers_max == 0) FLD_SET(session_flags, WT_SESSION_CAN_WAIT); WT_RET(__wt_open_internal_session(conn, diff --git a/src/include/extern.h b/src/include/extern.h index 171f6b02768..1f63f07646e 100644 --- a/src/include/extern.h +++ b/src/include/extern.h @@ -319,7 +319,7 @@ extern int __wt_curtable_open(WT_SESSION_IMPL *session, const char *uri, const c extern int __wt_evict_file(WT_SESSION_IMPL *session, int syncop); extern void __wt_evict_list_clear_page(WT_SESSION_IMPL *session, WT_REF *ref); extern int __wt_evict_server_wake(WT_SESSION_IMPL *session); -extern int __wt_evict_create(WT_SESSION_IMPL *session, bool with_las); +extern int __wt_evict_create(WT_SESSION_IMPL *session); extern int __wt_evict_destroy(WT_SESSION_IMPL *session); extern int __wt_evict_file_exclusive_on(WT_SESSION_IMPL *session, bool *evict_resetp); extern void __wt_evict_file_exclusive_off(WT_SESSION_IMPL *session); diff --git a/src/include/flags.h b/src/include/flags.h index ca3c3c38245..24dccd30913 100644 --- a/src/include/flags.h +++ b/src/include/flags.h @@ -6,17 +6,18 @@ #define WT_CONN_CKPT_SYNC 0x00000002 #define WT_CONN_CLOSING 0x00000004 #define WT_CONN_EVICTION_RUN 0x00000008 -#define WT_CONN_LEAK_MEMORY 0x00000010 -#define WT_CONN_LOG_SERVER_RUN 0x00000020 -#define WT_CONN_LSM_MERGE 0x00000040 -#define WT_CONN_PANIC 0x00000080 -#define WT_CONN_SERVER_ASYNC 0x00000100 -#define WT_CONN_SERVER_CHECKPOINT 0x00000200 -#define WT_CONN_SERVER_LSM 0x00000400 -#define WT_CONN_SERVER_RUN 0x00000800 -#define WT_CONN_SERVER_STATISTICS 0x00001000 -#define WT_CONN_SERVER_SWEEP 0x00002000 -#define WT_CONN_WAS_BACKUP 0x00004000 +#define WT_CONN_LAS_OPEN 0x00000010 +#define WT_CONN_LEAK_MEMORY 0x00000020 +#define WT_CONN_LOG_SERVER_RUN 0x00000040 +#define WT_CONN_LSM_MERGE 0x00000080 +#define WT_CONN_PANIC 0x00000100 +#define WT_CONN_SERVER_ASYNC 0x00000200 +#define WT_CONN_SERVER_CHECKPOINT 0x00000400 +#define WT_CONN_SERVER_LSM 0x00000800 +#define 
WT_CONN_SERVER_RUN 0x00001000 +#define WT_CONN_SERVER_STATISTICS 0x00002000 +#define WT_CONN_SERVER_SWEEP 0x00004000 +#define WT_CONN_WAS_BACKUP 0x00008000 #define WT_EVICTING 0x00000001 #define WT_EVICT_LOOKASIDE 0x00000002 #define WT_EVICT_UPDATE_RESTORE 0x00000004 diff --git a/src/txn/txn_recover.c b/src/txn/txn_recover.c index 9d8a19cbff3..63d86969311 100644 --- a/src/txn/txn_recover.c +++ b/src/txn/txn_recover.c @@ -501,7 +501,7 @@ __wt_txn_recover(WT_SESSION_IMPL *session) * regular eviction to manage paging. Start eviction threads for * recovery without LAS cursors. */ - WT_ERR(__wt_evict_create(session, false)); + WT_ERR(__wt_evict_create(session)); eviction_started = true; /* @@ -533,6 +533,9 @@ done: FLD_SET(conn->log_flags, WT_CONN_LOG_RECOVER_DONE); err: WT_TRET(__recovery_free(&r)); __wt_free(session, config); + if (ret != 0) + __wt_err(session, ret, "Recovery failed"); + /* * Destroy the eviction threads that were started in support of * recovery. They will be restarted once the lookaside table is -- cgit v1.2.1 From d746adb7e1cce9f0532c311e788b0ab24155a4fe Mon Sep 17 00:00:00 2001 From: Don Anderson Date: Wed, 7 Oct 2015 21:27:36 -0400 Subject: WT-2162. Add a null pointer check for the WT_TABLE.indices array. This array may be partially populated after an index is dropped. --- src/schema/schema_open.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/schema/schema_open.c b/src/schema/schema_open.c index 42b578946f4..a86cff4d723 100644 --- a/src/schema/schema_open.c +++ b/src/schema/schema_open.c @@ -571,7 +571,7 @@ __wt_schema_get_index(WT_SESSION_IMPL *session, /* Try to find the index in the table. 
*/ for (i = 0; i < table->nindices; i++) { idx = table->indices[i]; - if (strcmp(idx->name, uri) == 0) { + if (idx != NULL && strcmp(idx->name, uri) == 0) { if (tablep != NULL) *tablep = table; else -- cgit v1.2.1 From 419889ba63717c9e92a180aba695e81de947dc71 Mon Sep 17 00:00:00 2001 From: Don Anderson Date: Wed, 7 Oct 2015 21:27:59 -0400 Subject: WT-2162. Add a Python test equivalent to the reported test case. --- test/suite/test_bug015.py | 48 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) create mode 100644 test/suite/test_bug015.py diff --git a/test/suite/test_bug015.py b/test/suite/test_bug015.py new file mode 100644 index 00000000000..65b5b8e1755 --- /dev/null +++ b/test/suite/test_bug015.py @@ -0,0 +1,48 @@ +#!/usr/bin/env python +# +# Public Domain 2014-2015 MongoDB, Inc. +# Public Domain 2008-2014 WiredTiger, Inc. +# +# This is free and unencumbered software released into the public domain. +# +# Anyone is free to copy, modify, publish, use, compile, sell, or +# distribute this software, either in source code form or as a compiled +# binary, for any purpose, commercial or non-commercial, and by any +# means. +# +# In jurisdictions that recognize copyright laws, the author or authors +# of this software dedicate any and all copyright interest in the +# software to the public domain. We make this dedication for the benefit +# of the public at large and to the detriment of our heirs and +# successors. We intend this dedication to be an overt act of +# relinquishment in perpetuity of all present and future rights to this +# software under copyright law. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+# IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +import wiredtiger, wttest +from helper import copy_wiredtiger_home, key_populate, simple_populate + +# test_bug015.py +# JIRA WT-2162: index drop in a certain order triggers NULL pointer deref +class test_bug015(wttest.WiredTigerTestCase): + def test_bug015(self): + table = 'table:test_bug015' + idx1 = 'index:test_bug015:aab' + idx2 = 'index:test_bug015:aaa' + self.session.create(table, "columns=(k,v)") + self.session.create(idx1, "columns=(v)") + self.session.create(idx2, "columns=(v)") + self.session.drop(idx1, "force=true") + self.session.create(idx1, "columns=(v)") + self.session.drop(idx2, "force=true") + self.session.create(idx2, "columns=(v)") + +if __name__ == '__main__': + wttest.run() -- cgit v1.2.1