author    Keith Bostic <keith@wiredtiger.com>  2014-07-22 12:00:26 -0400
committer Keith Bostic <keith@wiredtiger.com>  2014-07-22 12:00:26 -0400
commit    51de923778091761c99a48f5e786c7cef39f73da (patch)
tree      2a01e75debedf303aa4dca7b0f77aa5ed5acfebe
parent    6ada92843ec70dea42f1cc461fda0836c4ab62f9 (diff)
download  mongo-51de923778091761c99a48f5e786c7cef39f73da.tar.gz
Switch from using the checkpoint lock to a new WT_BTREE handle lock to
serialize bulk-load close and database checkpoints. Reference #1114.
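
In rough terms, the change narrows the serialization scope from the
connection to the handle. The following sketch models that shift; it is
not WiredTiger source: pthread mutexes stand in for WT_SPINLOCK, and the
struct and function names are invented for illustration.

    #include <pthread.h>

    struct conn {                           /* models WT_CONNECTION_IMPL */
            pthread_mutex_t checkpoint_lock;
    };
    struct btree {                          /* models WT_BTREE */
            pthread_mutex_t bulk_ckpt_lock; /* the new handle lock */
    };

    /* Before: bulk-load close blocked checkpoints of every file. */
    static void
    bulk_close_before(struct conn *conn)
    {
            pthread_mutex_lock(&conn->checkpoint_lock);
            /* ... checkpoint the file, clear the bulk-load flag ... */
            pthread_mutex_unlock(&conn->checkpoint_lock);
    }

    /* After: only checkpoints of this one handle have to wait. */
    static void
    bulk_close_after(struct btree *btree)
    {
            pthread_mutex_lock(&btree->bulk_ckpt_lock);
            /* ... checkpoint the file, clear the bulk-load flag ... */
            pthread_mutex_unlock(&btree->bulk_ckpt_lock);
    }
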
-rw-r--r--  src/btree/bt_handle.c    5
-rw-r--r--  src/conn/conn_dhandle.c  32
-rw-r--r--  src/include/btree.h      7
-rw-r--r--  src/include/extern.h     3
-rw-r--r--  src/txn/txn_ckpt.c       70
5 files changed, 79 insertions(+), 38 deletions(-)
diff --git a/src/btree/bt_handle.c b/src/btree/bt_handle.c
index 5c1d7c1a2d5..a4587aeb40b 100644
--- a/src/btree/bt_handle.c
+++ b/src/btree/bt_handle.c
@@ -158,6 +158,7 @@ __wt_btree_close(WT_SESSION_IMPL *session)
/* Destroy locks. */
WT_TRET(__wt_rwlock_destroy(session, &btree->ovfl_lock));
+ __wt_spin_destroy(session, &btree->bulk_ckpt_lock);
/* Free allocated memory. */
__wt_free(session, btree->key_format);
@@ -305,9 +306,11 @@ __btree_conf(WT_SESSION_IMPL *session, WT_CKPT *ckpt)
(int)cval.len, cval.str);
}
- /* Overflow lock. */
+ /* Allocate locks. */
WT_RET(__wt_rwlock_alloc(
session, "btree overflow lock", &btree->ovfl_lock));
+ WT_RET(__wt_spin_init(
+ session, &btree->bulk_ckpt_lock, "bulk/checkpoint lock"));
__wt_stat_init_dsrc_stats(&btree->dhandle->stats);
diff --git a/src/conn/conn_dhandle.c b/src/conn/conn_dhandle.c
index d95ab86b887..6d6ef8879c2 100644
--- a/src/conn/conn_dhandle.c
+++ b/src/conn/conn_dhandle.c
@@ -170,7 +170,7 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session)
WT_BTREE *btree;
WT_DATA_HANDLE *dhandle;
WT_DECL_RET;
- int ckpt_lock;
+ int is_bulk;
dhandle = session->dhandle;
btree = S2BT(session);
@@ -179,26 +179,13 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session)
return (0);
/*
- * Checkpoint to flush out the file's changes. This usually happens on
- * data handle close (which means we're holding the handle lock, so
- * this call serializes with any session checkpoint). Bulk-cursors are
- * a special case: they do not hold the handle lock and they still must
- * serialize with checkpoints. Acquire the lower-level checkpoint lock
- * and hold it until the handle is closed and the bulk-cursor flag has
- * been cleared.
- * We hold the lock so long for two reasons: first, checkpoint uses
- * underlying btree handle structures (for example, the meta-tracking
- * checkpoint resolution uses the block-manager reference), and because
- * checkpoint writes "fake" checkpoint records for bulk-loaded files,
- * and a real checkpoint, which we're creating here, can't be followed
- * by more fake checkpoints. In summary, don't let a checkpoint happen
- * unless all of the bulk cursor's information has been cleared.
+ * Bulk-load has special checkpoint locking requirements (see the lock
+ * function for details). Lock/unlock around the checkpoint and the
+ * clearing of the bulk-load flag.
*/
- ckpt_lock = 0;
- if (F_ISSET(btree, WT_BTREE_BULK)) {
- ckpt_lock = 1;
- __wt_spin_lock(session, &S2C(session)->checkpoint_lock);
- }
+ is_bulk = F_ISSET(btree, WT_BTREE_BULK) ? 1 : 0;
+ if (is_bulk)
+ __wt_checkpoint_bulk_lock(session, btree, 1);
/*
* The close can fail if an update cannot be written, return the EBUSY
@@ -215,9 +202,8 @@ __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session)
F_CLR(dhandle, WT_DHANDLE_OPEN);
F_CLR(btree, WT_BTREE_SPECIAL_FLAGS);
-err:
- if (ckpt_lock)
- __wt_spin_unlock(session, &S2C(session)->checkpoint_lock);
+err: if (is_bulk)
+ __wt_checkpoint_bulk_lock(session, btree, 0);
return (ret);
}
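
The close path's discipline, sketched below under the same assumptions
as before (pthread locks and invented names, not WiredTiger's
internals), is to latch the bulk flag once, take the lock before the
checkpoint, and release it on the shared error path whether or not the
close succeeds:

    #include <pthread.h>

    struct btree {
            pthread_mutex_t bulk_ckpt_lock;
            int is_bulk;                    /* models the WT_BTREE_BULK flag */
    };

    static int
    do_checkpoint(struct btree *btree)      /* stub standing in for the work */
    {
            (void)btree;
            return (0);
    }

    /*
     * Sketch of the sync-and-close shape: the lock is held across the
     * checkpoint and the flag clear, and released even on failure.
     */
    static int
    sync_and_close(struct btree *btree)
    {
            int is_bulk, ret;

            is_bulk = btree->is_bulk;
            if (is_bulk)
                    pthread_mutex_lock(&btree->bulk_ckpt_lock);

            if ((ret = do_checkpoint(btree)) != 0)
                    goto err;
            btree->is_bulk = 0;     /* cleared while still holding the lock */

    err:    if (is_bulk)
                    pthread_mutex_unlock(&btree->bulk_ckpt_lock);
            return (ret);
    }
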
diff --git a/src/include/btree.h b/src/include/btree.h
index e78f81b3e1a..8181db1686c 100644
--- a/src/include/btree.h
+++ b/src/include/btree.h
@@ -60,8 +60,6 @@
struct __wt_btree {
WT_DATA_HANDLE *dhandle;
- WT_CKPT *ckpt; /* Checkpoint information */
-
enum { BTREE_COL_FIX=1, /* Fixed-length column store */
BTREE_COL_VAR=2, /* Variable-length column store */
BTREE_ROW=3 /* Row-store */
@@ -105,7 +103,6 @@ struct __wt_btree {
WT_REF root; /* Root page reference */
int modified; /* If the tree ever modified */
- int bulk_load_ok; /* Bulk-load is a possibility */
WT_BM *bm; /* Block manager reference */
u_int block_header; /* WT_PAGE_HEADER_BYTE_SIZE */
@@ -118,8 +115,12 @@ struct __wt_btree {
u_int evict_walk_skips; /* Number of walks skipped */
volatile uint32_t evict_busy; /* Count of threads in eviction */
+ WT_CKPT *ckpt; /* Checkpoint information */
int checkpointing; /* Checkpoint in progress */
+ int bulk_load_ok; /* Bulk-load is a possibility */
+ WT_SPINLOCK bulk_ckpt_lock; /* Lock checkpoint and bulk close */
+
/* Flags values up to 0xff are reserved for WT_DHANDLE_* */
#define WT_BTREE_BULK 0x00100 /* Bulk-load handle */
#define WT_BTREE_NO_EVICTION 0x00200 /* Disable eviction */
diff --git a/src/include/extern.h b/src/include/extern.h
index 715659eaeff..42e5b11b99c 100644
--- a/src/include/extern.h
+++ b/src/include/extern.h
@@ -1566,6 +1566,9 @@ extern void __wt_txn_destroy(WT_SESSION_IMPL *session);
extern int __wt_txn_global_init(WT_CONNECTION_IMPL *conn, const char *cfg[]);
extern void __wt_txn_global_destroy(WT_CONNECTION_IMPL *conn);
extern int __wt_txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]);
+extern void __wt_checkpoint_bulk_lock(WT_SESSION_IMPL *session,
+    WT_BTREE *btree, int getlock);
extern int __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]);
extern int __wt_checkpoint_write_leaves(WT_SESSION_IMPL *session,
const char *cfg[]);
diff --git a/src/txn/txn_ckpt.c b/src/txn/txn_ckpt.c
index c066254926c..a8cc34df485 100644
--- a/src/txn/txn_ckpt.c
+++ b/src/txn/txn_ckpt.c
@@ -434,6 +434,32 @@ __drop_to(WT_CKPT *ckptbase, const char *name, size_t len)
}
/*
+ * __wt_checkpoint_bulk_lock --
+ * Lock/unlock when checkpointing a bulk-load file.
+ */
+void
+__wt_checkpoint_bulk_lock(
+ WT_SESSION_IMPL *session, WT_BTREE *btree, int getlock)
+{
+ /*
+ * This function exists as a place for this comment: checkpoint does a
+ * read-modify-write cycle of the file's metadata, which means there's
+ * a potential race with other threads calling the checkpoint function.
+ * There are two paths into the checkpoint worker function, database
+ * checkpoints and handle close. Both exclusively hold the handle's
+ * lock, but bulk-load is a special case. Because bulk-load is likely
+ * a long-lived operation, database checkpoint is allowed to proceed
+ * without a bulk-load handle's lock. To avoid a database checkpoint
+ * racing with bulk-load handle close, we use a separate per-file lock
+ * acquired/released around the checkpoint.
+ */
+ if (getlock)
+ __wt_spin_lock(session, &btree->bulk_ckpt_lock);
+ else
+ __wt_spin_unlock(session, &btree->bulk_ckpt_lock);
+}
+
+/*
* __checkpoint_worker --
* Checkpoint a tree.
*/
@@ -451,19 +477,19 @@ __checkpoint_worker(
WT_DECL_RET;
WT_LSN ckptlsn;
const char *name;
- int deleted, force, hot_backup_locked, track_ckpt;
+ int bulk_ckpt_locked, deleted, force, hot_backup_locked, track_ckpt;
char *name_alloc;
btree = S2BT(session);
bm = btree->bm;
conn = S2C(session);
+ dhandle = session->dhandle;
+
ckpt = ckptbase = NULL;
INIT_LSN(&ckptlsn);
- dhandle = session->dhandle;
- name_alloc = NULL;
- hot_backup_locked = 0;
- name_alloc = NULL;
+ bulk_ckpt_locked = hot_backup_locked = 0;
track_ckpt = 1;
+ name_alloc = NULL;
/*
* If closing a file that's never been modified, discard its blocks.
@@ -475,14 +501,32 @@ __checkpoint_worker(
return (__wt_cache_op(session, NULL, WT_SYNC_DISCARD));
/*
- * Get the list of checkpoints for this file. If there's no reference
- * to the file in the metadata (the file is dead), then discard it from
+ * Bulk-load has special checkpoint locking requirements (see the lock
+ * function for details). Lock/unlock around the read-modify-write
+ * cycle of the file's metadata. We only lock here in the case of a
+ * database checkpoint; the bulk-load handle close path has already
+ * acquired the lock.
+ */
+ if (is_checkpoint && F_ISSET(btree, WT_BTREE_BULK)) {
+ bulk_ckpt_locked = 1;
+ /*
+ * The bulk-load flag could be cleared after we test and before
+ * we acquire the lock. We don't care: all this lock does is
+ * single-thread this function, and the race just means a bulk-load
+ * handle had the lock and closed while we waited.
+ */
+ __wt_checkpoint_bulk_lock(session, btree, 1);
+ }
+
+ /*
+ * If the file has no metadata (the file is dead), then discard it from
* the cache without bothering to write any dirty pages.
*/
if ((ret = __wt_meta_ckptlist_get(
session, dhandle->name, &ckptbase)) == WT_NOTFOUND) {
WT_ASSERT(session, session->dhandle->session_ref == 0);
- return (__wt_cache_op(session, NULL, WT_SYNC_DISCARD));
+ ret = __wt_cache_op(session, NULL, WT_SYNC_DISCARD);
+ goto done;
}
WT_ERR(ret);
@@ -567,7 +611,7 @@ __checkpoint_worker(
}
if (!btree->modified && !force) {
if (!is_checkpoint)
- goto skip;
+ goto done;
deleted = 0;
WT_CKPT_FOREACH(ckptbase, ckpt)
@@ -585,7 +629,7 @@ __checkpoint_worker(
(strcmp(name, (ckpt - 1)->name) == 0 ||
(WT_PREFIX_MATCH(name, WT_CHECKPOINT) &&
WT_PREFIX_MATCH((ckpt - 1)->name, WT_CHECKPOINT))))
- goto skip;
+ goto done;
}
/* Add a new checkpoint entry at the end of the list. */
@@ -786,9 +830,13 @@ fake: /* Update the object's metadata. */
WT_ERR(__wt_txn_checkpoint_log(
session, 0, WT_TXN_LOG_CKPT_STOP, NULL));
+done:
err: if (hot_backup_locked)
__wt_spin_unlock(session, &conn->hot_backup_lock);
-skip: __wt_meta_ckptlist_free(session, ckptbase);
+ if (bulk_ckpt_locked)
+ __wt_checkpoint_bulk_lock(session, btree, 0);
+
+ __wt_meta_ckptlist_free(session, ckptbase);
__wt_free(session, name_alloc);
return (ret);
}
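
To make the race the comments describe concrete, here is a small,
self-contained model (again pthreads, with invented names; it
demonstrates the protocol, not WiredTiger code): a database-checkpoint
thread and a bulk-close thread each want to read-modify-write the
file's metadata, and the per-file lock single-threads them.

    #include <pthread.h>
    #include <stdio.h>

    struct btree {
            pthread_mutex_t bulk_ckpt_lock; /* models the new per-file lock */
            int meta_version;               /* models the file's metadata */
    };

    /* Read-modify-write of the metadata; unsafe if run concurrently. */
    static void
    checkpoint_metadata(struct btree *btree, const char *who)
    {
            int v = btree->meta_version;    /* read */
            v = v + 1;                      /* modify */
            btree->meta_version = v;        /* write */
            printf("%s wrote metadata version %d\n", who, v);
    }

    static void *
    db_checkpoint_thread(void *arg)
    {
            struct btree *btree = arg;

            pthread_mutex_lock(&btree->bulk_ckpt_lock);
            checkpoint_metadata(btree, "database checkpoint");
            pthread_mutex_unlock(&btree->bulk_ckpt_lock);
            return (NULL);
    }

    static void *
    bulk_close_thread(void *arg)
    {
            struct btree *btree = arg;

            pthread_mutex_lock(&btree->bulk_ckpt_lock);
            checkpoint_metadata(btree, "bulk-load close");
            pthread_mutex_unlock(&btree->bulk_ckpt_lock);
            return (NULL);
    }

    int
    main(void)
    {
            struct btree btree = { PTHREAD_MUTEX_INITIALIZER, 0 };
            pthread_t t1, t2;

            pthread_create(&t1, NULL, db_checkpoint_thread, &btree);
            pthread_create(&t2, NULL, bulk_close_thread, &btree);
            pthread_join(t1, NULL);
            pthread_join(t2, NULL);
            return (0);
    }

Built with -pthread, the two read-modify-write cycles always run back
to back; without the lock their steps could interleave and lose an
update, which is the metadata race __wt_checkpoint_bulk_lock exists to
prevent.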