summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Luke Chen <luke.chen@mongodb.com> 2019-10-09 16:07:42 +1100
committer: Luke Chen <luke.chen@mongodb.com> 2019-10-09 16:07:42 +1100
commit: 5e9b683bee02abb30c4f710392c809feb25d14c6 (patch)
tree: a2b4ad73b07c51e273a9c8d2492633023e359c8a
parent: b5ff43f92c0e562121477e8253a56b2d83825571 (diff)
download: mongo-5e9b683bee02abb30c4f710392c809feb25d14c6.tar.gz
Import wiredtiger: 3af8f2dc2c6028b3c18caa6be430d14c4da93c30 from branch mongodb-3.4
ref: 1d7a748f9f..3af8f2dc2c
for: 3.4.24
WT-4956 Handle the case where 4 billion updates are made to a page without eviction
-rw-r--r-- src/third_party/wiredtiger/dist/s_string.ok 1
-rw-r--r-- src/third_party/wiredtiger/import.data 8
-rw-r--r-- src/third_party/wiredtiger/src/btree/bt_debug.c 2
-rw-r--r-- src/third_party/wiredtiger/src/include/btmem.h 17
-rw-r--r-- src/third_party/wiredtiger/src/include/btree.i 31
-rw-r--r-- src/third_party/wiredtiger/src/include/serial.i 32
-rw-r--r-- src/third_party/wiredtiger/src/reconcile/rec_write.c 37
7 files changed, 67 insertions, 61 deletions
diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok
index 0bfebbf8458..0b414b89894 100644
--- a/src/third_party/wiredtiger/dist/s_string.ok
+++ b/src/third_party/wiredtiger/dist/s_string.ok
@@ -1219,6 +1219,7 @@ unmodify
unordered
unpackv
unpadded
+unreconciled
unreferenced
unregister
unsized
diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data
index bfad8007be0..e448a673a88 100644
--- a/src/third_party/wiredtiger/import.data
+++ b/src/third_party/wiredtiger/import.data
@@ -1,6 +1,6 @@
{
- "vendor": "wiredtiger",
- "github": "wiredtiger/wiredtiger.git",
- "branch": "mongodb-3.4",
- "commit": "1d7a748f9f096ebf39e80ea442b6d3be3fc69381"
+ "commit": "3af8f2dc2c6028b3c18caa6be430d14c4da93c30",
+ "github": "wiredtiger/wiredtiger.git",
+ "vendor": "wiredtiger",
+ "branch": "mongodb-3.4"
}
diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c
index fdc33b608ec..6512dcd5d72 100644
--- a/src/third_party/wiredtiger/src/btree/bt_debug.c
+++ b/src/third_party/wiredtiger/src/btree/bt_debug.c
@@ -722,7 +722,7 @@ __debug_page_metadata(WT_DBG *ds, WT_REF *ref)
}
if (mod != NULL)
WT_RET(
- ds->f(ds, ", write generation=%" PRIu32, mod->write_gen));
+ ds->f(ds, ", page-state=%" PRIu32, mod->page_state));
WT_RET(ds->f(ds, "\n"));
return (0);
diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h
index e965724dffe..b1423d976f2 100644
--- a/src/third_party/wiredtiger/src/include/btmem.h
+++ b/src/third_party/wiredtiger/src/include/btmem.h
@@ -423,10 +423,21 @@ struct __wt_page_modify {
WT_SPINLOCK page_lock; /* Page's spinlock */
/*
- * The write generation is incremented when a page is modified, a page
- * is clean if the write generation is 0.
+ * The page state is incremented when a page is modified.
+ *
+ * WT_PAGE_CLEAN --
+ * The page is clean.
+ * WT_PAGE_DIRTY_FIRST --
+ * The page is in this state after the first operation that marks a
+ * page dirty, or when reconciliation is checking to see if it has
+ * done enough work to be able to mark the page clean.
+ * WT_PAGE_DIRTY --
+ * Two or more updates have been added to the page.
*/
- uint32_t write_gen;
+#define WT_PAGE_CLEAN 0
+#define WT_PAGE_DIRTY_FIRST 1
+#define WT_PAGE_DIRTY 2
+ uint32_t page_state;
#define WT_PM_REC_EMPTY 1 /* Reconciliation: no replacement */
#define WT_PM_REC_MULTIBLOCK 2 /* Reconciliation: multiple blocks */
diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i
index 1d6fcd6272c..3286c84be3f 100644
--- a/src/third_party/wiredtiger/src/include/btree.i
+++ b/src/third_party/wiredtiger/src/include/btree.i
@@ -34,7 +34,8 @@ __wt_page_is_empty(WT_PAGE *page)
static inline bool
__wt_page_is_modified(WT_PAGE *page)
{
- return (page->modify != NULL && page->modify->write_gen != 0);
+ return (page->modify != NULL &&
+ page->modify->page_state != WT_PAGE_CLEAN);
}
/*
@@ -505,19 +506,25 @@ __wt_page_only_modify_set(WT_SESSION_IMPL *session, WT_PAGE *page)
WT_ASSERT(session, !F_ISSET(session->dhandle, WT_DHANDLE_DEAD));
last_running = 0;
- if (page->modify->write_gen == 0)
+ if (page->modify->page_state == WT_PAGE_CLEAN)
last_running = S2C(session)->txn_global.last_running;
/*
- * We depend on atomic-add being a write barrier, that is, a barrier to
- * ensure all changes to the page are flushed before updating the page
- * write generation and/or marking the tree dirty, otherwise checkpoints
+ * We depend on the atomic operation being a write barrier, that is, a
+ * barrier to ensure all changes to the page are flushed before updating
+ * the page state and/or marking the tree dirty, otherwise checkpoints
* and/or page reconciliation might be looking at a clean page/tree.
*
* Every time the page transitions from clean to dirty, update the cache
* and transactional information.
+ *
+ * The page state can only ever be incremented above dirty by the number
+ * of concurrently running threads, so the counter will never approach
+ * the point where it would wrap.
*/
- if (__wt_atomic_add32(&page->modify->write_gen, 1) == 1) {
+ if (page->modify->page_state < WT_PAGE_DIRTY &&
+ __wt_atomic_add32(&page->modify->page_state, 1) ==
+ WT_PAGE_DIRTY_FIRST) {
__wt_cache_dirty_incr(session, page);
/*
@@ -588,7 +595,17 @@ __wt_page_modify_clear(WT_SESSION_IMPL *session, WT_PAGE *page)
* Allow the call to be made on clean pages.
*/
if (__wt_page_is_modified(page)) {
- page->modify->write_gen = 0;
+ /*
+ * The only part where ordering matters is during
+ * reconciliation where updates on other threads are performing
+ * writes to the page state that need to be visible to the
+ * reconciliation thread.
+ *
+ * Since clearing of the page state is not going to be happening
+ * during reconciliation on a separate thread, there's no write
+ * barrier needed here.
+ */
+ page->modify->page_state = WT_PAGE_CLEAN;
__wt_cache_dirty_decr(session, page);
}
}
diff --git a/src/third_party/wiredtiger/src/include/serial.i b/src/third_party/wiredtiger/src/include/serial.i
index 0134e1a9c20..59072278dcf 100644
--- a/src/third_party/wiredtiger/src/include/serial.i
+++ b/src/third_party/wiredtiger/src/include/serial.i
@@ -7,29 +7,6 @@
*/
/*
- * __page_write_gen_wrapped_check --
- * Confirm the page's write generation number won't wrap.
- */
-static inline int
-__page_write_gen_wrapped_check(WT_PAGE *page)
-{
- /*
- * Check to see if the page's write generation is about to wrap (wildly
- * unlikely as it implies 4B updates between clean page reconciliations,
- * but technically possible), and fail the update.
- *
- * The check is outside of the serialization mutex because the page's
- * write generation is going to be a hot cache line, so technically it's
- * possible for the page's write generation to wrap between the test and
- * our subsequent modification of it. However, the test is (4B-1M), and
- * there cannot be a million threads that have done the test but not yet
- * completed their modification.
- */
- return (page->modify->write_gen >
- UINT32_MAX - WT_MILLION ? WT_RESTART : 0);
-}
-
-/*
* __insert_simple_func --
* Worker function to add a WT_INSERT entry to the middle of a skiplist.
*/
@@ -159,9 +136,6 @@ __wt_col_append_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
WT_INSERT *new_ins = *new_insp;
WT_DECL_RET;
- /* Check for page write generation wrap. */
- WT_RET(__page_write_gen_wrapped_check(page));
-
/* Clear references to memory we now own and must free on error. */
*new_insp = NULL;
@@ -210,9 +184,6 @@ __wt_insert_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
u_int i;
bool simple;
- /* Check for page write generation wrap. */
- WT_RET(__page_write_gen_wrapped_check(page));
-
/* Clear references to memory we now own and must free on error. */
*new_insp = NULL;
@@ -266,9 +237,6 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page,
WT_UPDATE *obsolete, *upd = *updp;
uint64_t txn;
- /* Check for page write generation wrap. */
- WT_RET(__page_write_gen_wrapped_check(page));
-
/* Clear references to memory we now own and must free on error. */
*updp = NULL;
diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c
index b76192c0cf9..fd67939d7ca 100644
--- a/src/third_party/wiredtiger/src/reconcile/rec_write.c
+++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c
@@ -33,12 +33,6 @@ typedef struct {
WT_ITEM *interim_buf;
/*
- * Track start/stop write generation to decide if all changes to the
- * page are written.
- */
- uint32_t orig_write_gen;
-
- /*
* Track start/stop checkpoint generations to decide if lookaside table
* records are correct.
*/
@@ -708,14 +702,20 @@ __rec_write_page_status(WT_SESSION_IMPL *session, WT_RECONCILE *r)
btree->rec_max_txn = r->max_txn;
/*
- * The page only might be clean; if the write generation is
- * unchanged since reconciliation started, it's clean.
+ * We set the page state to mark it as having been dirtied for
+ * the first time prior to reconciliation. A failed atomic cas
+ * indicates that an update has taken place during
+ * reconciliation.
+ *
+ * The page only might be clean; if the page state is unchanged
+ * since reconciliation started, it's clean.
*
- * If the write generation changed, the page has been written
- * since reconciliation started and remains dirty (that can't
- * happen when evicting, the page is exclusively locked).
+ * If the page state changed, the page has been written since
+ * reconciliation started and remains dirty (that can't happen
+ * when evicting, the page is exclusively locked).
*/
- if (__wt_atomic_cas32(&mod->write_gen, r->orig_write_gen, 0))
+ if (__wt_atomic_cas32(
+ &mod->page_state, WT_PAGE_DIRTY_FIRST, WT_PAGE_CLEAN))
__wt_cache_dirty_decr(session, page);
else
WT_ASSERT(session, !F_ISSET(r, WT_EVICTING));
@@ -898,13 +898,22 @@ __rec_write_init(WT_SESSION_IMPL *session,
r->page = page;
/*
- * Save the page's write generation before reading the page.
* Save the transaction generations before reading the page.
* These are all ordered reads, but we only need one.
*/
r->orig_btree_checkpoint_gen = btree->checkpoint_gen;
r->orig_txn_checkpoint_gen = conn->txn_global.checkpoint_gen;
- WT_ORDERED_READ(r->orig_write_gen, page->modify->write_gen);
+
+ /*
+ * Update the page state to indicate that all currently installed
+ * updates will be included in this reconciliation if it would mark the
+ * page clean.
+ *
+ * Add a write barrier to make it more likely that a thread adding an
+ * update will see this state change.
+ */
+ page->modify->page_state = WT_PAGE_DIRTY_FIRST;
+ WT_FULL_BARRIER();
/*
* Cache the oldest running transaction ID. This is used to check