diff options
author | Luke Chen <luke.chen@mongodb.com> | 2017-11-22 20:21:34 +1100 |
---|---|---|
committer | Luke Chen <luke.chen@mongodb.com> | 2017-11-22 20:21:34 +1100 |
commit | 734205a16d1fb75f4db81108ed3eb40fe06a0a07 (patch) | |
tree | c3be5b78b6a46912c0e3c4f63373e2ed7d09eb07 /src/third_party | |
parent | e35e66b50e4a0b362004730c8481025917c4e152 (diff) | |
download | mongo-734205a16d1fb75f4db81108ed3eb40fe06a0a07.tar.gz |
Import wiredtiger: 7a7d6ffec48afeef4e1d7b23bceb4015986f47b9 from branch mongodb-3.6 (tag: r3.6.0-rc5)
ref: 923937e6b2..7a7d6ffec4
for: 3.6.0-rc5
WT-3751 Allow pages to split when nothing is visible
WT-3752 WiredTiger can walk long update chains for hot rows.
WT-3754 serial updates to the oldest should consider pinned timestamp for visibility
WT-3758 Turn on the snappy compression for the lookaside table
WT-3760 Avoid writing overflow values into the lookaside file
Diffstat (limited to 'src/third_party')
-rw-r--r-- | src/third_party/wiredtiger/dist/s_define.list | 1 |
-rw-r--r-- | src/third_party/wiredtiger/import.data | 2 |
-rw-r--r-- | src/third_party/wiredtiger/src/btree/row_modify.c | 29 |
-rw-r--r-- | src/third_party/wiredtiger/src/include/btmem.h | 10 |
-rw-r--r-- | src/third_party/wiredtiger/src/include/serial.i | 8 |
-rw-r--r-- | src/third_party/wiredtiger/src/reconcile/rec_write.c | 76 |
6 files changed, 92 insertions, 34 deletions
diff --git a/src/third_party/wiredtiger/dist/s_define.list b/src/third_party/wiredtiger/dist/s_define.list index fb0162079d9..4be8ceee0e3 100644 --- a/src/third_party/wiredtiger/dist/s_define.list +++ b/src/third_party/wiredtiger/dist/s_define.list @@ -35,6 +35,7 @@ WT_LOG_SLOT_MASK_OFF WT_LOG_SLOT_MASK_ON WT_LOG_SLOT_MAXBITS WT_LOG_SLOT_UNBUFFERED_ISSET +WT_LOOKASIDE_COMPRESSOR WT_PACKED_STRUCT_BEGIN WT_PACKED_STRUCT_END WT_PADDING_CHECK diff --git a/src/third_party/wiredtiger/import.data b/src/third_party/wiredtiger/import.data index cf67d077b45..546c9967ece 100644 --- a/src/third_party/wiredtiger/import.data +++ b/src/third_party/wiredtiger/import.data @@ -1,5 +1,5 @@ { - "commit": "923937e6b2bb7ffc1ff86bfaa8c2f130930286de", + "commit": "7a7d6ffec48afeef4e1d7b23bceb4015986f47b9", "github": "wiredtiger/wiredtiger.git", "vendor": "wiredtiger", "branch": "mongodb-3.6" diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c index 5e84899999a..a2aaeb7673f 100644 --- a/src/third_party/wiredtiger/src/btree/row_modify.c +++ b/src/third_party/wiredtiger/src/btree/row_modify.c @@ -299,9 +299,12 @@ WT_UPDATE * __wt_update_obsolete_check( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd) { + WT_TXN_GLOBAL *txn_global; WT_UPDATE *first, *next; u_int count; + txn_global = &S2C(session)->txn_global; + /* * This function identifies obsolete updates, and truncates them from * the rest of the chain; because this routine is called from inside @@ -313,13 +316,14 @@ __wt_update_obsolete_check( * Only updates with globally visible, self-contained data can terminate * update chains. 
*/ - for (first = NULL, count = 0; upd != NULL; upd = upd->next, count++) - if (WT_UPDATE_DATA_VALUE(upd) && - __wt_txn_upd_visible_all(session, upd)) { - if (first == NULL) - first = upd; - } else if (upd->txnid != WT_TXN_ABORTED) + for (first = NULL, count = 0; upd != NULL; upd = upd->next, count++) { + if (upd->txnid == WT_TXN_ABORTED) + continue; + if (!__wt_txn_upd_visible_all(session, upd)) first = NULL; + else if (first == NULL && WT_UPDATE_DATA_VALUE(upd)) + first = upd; + } /* * We cannot discard this WT_UPDATE structure, we can only discard @@ -338,9 +342,16 @@ __wt_update_obsolete_check( * trim update lists independently of the page state, ensure there * is a modify structure. */ - if (count > 20 && page->modify != NULL) - page->modify->obsolete_check_txn = - S2C(session)->txn_global.last_running; + if (count > 20 && page->modify != NULL) { + page->modify->obsolete_check_txn = txn_global->last_running; +#ifdef HAVE_TIMESTAMPS + if (txn_global->has_pinned_timestamp) + WT_WITH_TIMESTAMP_READLOCK(session, &txn_global->rwlock, + __wt_timestamp_set( + &page->modify->obsolete_check_timestamp, + &txn_global->pinned_timestamp)); +#endif + } return (NULL); } diff --git a/src/third_party/wiredtiger/src/include/btmem.h b/src/third_party/wiredtiger/src/include/btmem.h index bd881af0ecf..d45b68d1972 100644 --- a/src/third_party/wiredtiger/src/include/btmem.h +++ b/src/third_party/wiredtiger/src/include/btmem.h @@ -182,9 +182,16 @@ struct __wt_ovfl_reuse { * makes the lookaside table's value more likely to overflow the page size when * the row-store key is relatively large. 
*/ +#ifdef HAVE_BUILTIN_EXTENSION_SNAPPY +#define WT_LOOKASIDE_COMPRESSOR "snappy" +#else +#define WT_LOOKASIDE_COMPRESSOR "none" +#endif #define WT_LAS_CONFIG \ "key_format=" WT_UNCHECKED_STRING(QIQu) \ - ",value_format=" WT_UNCHECKED_STRING(QuBu) + ",value_format=" WT_UNCHECKED_STRING(QuBu) \ + ",block_compressor=" WT_LOOKASIDE_COMPRESSOR \ + ",leaf_value_max=64MB" /* * WT_PAGE_LOOKASIDE -- @@ -218,6 +225,7 @@ struct __wt_page_modify { /* Avoid checking for obsolete updates during checkpoints. */ uint64_t obsolete_check_txn; + WT_DECL_TIMESTAMP(obsolete_check_timestamp) /* The largest transaction seen on the page by reconciliation. */ uint64_t rec_max_txn; diff --git a/src/third_party/wiredtiger/src/include/serial.i b/src/third_party/wiredtiger/src/include/serial.i index c5758ee605a..d471ebb399c 100644 --- a/src/third_party/wiredtiger/src/include/serial.i +++ b/src/third_party/wiredtiger/src/include/serial.i @@ -263,6 +263,7 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page, { WT_DECL_RET; WT_UPDATE *obsolete, *upd = *updp; + wt_timestamp_t *obsolete_timestamp; uint64_t txn; /* Clear references to memory we now own and must free on error. */ @@ -309,11 +310,14 @@ __wt_update_serial(WT_SESSION_IMPL *session, WT_PAGE *page, * is used as an indicator of there being further updates on this page. */ if ((txn = page->modify->obsolete_check_txn) != WT_TXN_NONE) { - if (!__wt_txn_visible_all(session, txn, NULL)) { + obsolete_timestamp = + WT_TIMESTAMP_NULL(&page->modify->obsolete_check_timestamp); + if (!__wt_txn_visible_all(session, txn, obsolete_timestamp)) { /* Try to move the oldest ID forward and re-check. 
*/ WT_RET(__wt_txn_update_oldest(session, 0)); - if (!__wt_txn_visible_all(session, txn, NULL)) + if (!__wt_txn_visible_all( + session, txn, obsolete_timestamp)) return (0); } diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index 7e9980e0887..77b8c2a2e78 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -862,10 +862,11 @@ __rec_raw_compression_config(WT_SESSION_IMPL *session, /* * XXX - * Turn off if lookaside is configured: lookaside potentially writes - * blocks without entries and raw compression isn't ready for that. + * Turn off if lookaside or update/restore are configured: those modes + * potentially write blocks without entries and raw compression isn't + * ready for that. */ - if (LF_ISSET(WT_REC_LOOKASIDE)) + if (LF_ISSET(WT_REC_LOOKASIDE | WT_REC_UPDATE_RESTORE)) return (false); /* @@ -1249,7 +1250,8 @@ err: __wt_scr_free(session, &tmp); */ static int __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, - WT_INSERT *ins, void *ripcip, WT_CELL_UNPACK *vpack, WT_UPDATE **updp) + WT_INSERT *ins, void *ripcip, WT_CELL_UNPACK *vpack, + bool *upd_savedp, WT_UPDATE **updp) { WT_PAGE *page; WT_UPDATE *first_txn_upd, *first_upd, *upd; @@ -1263,6 +1265,8 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, first_ts_upd = NULL; #endif + if (upd_savedp != NULL) + *upd_savedp = false; *updp = NULL; page = r->page; @@ -1476,6 +1480,9 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, */ WT_RET(__rec_update_save(session, r, ins, ripcip, *updp, upd_memsize)); + if (upd_savedp != NULL) + *upd_savedp = true; + #ifdef HAVE_TIMESTAMPS /* Track the oldest saved timestamp for lookaside. */ if (F_ISSET(r, WT_REC_LOOKASIDE)) { @@ -1505,11 +1512,15 @@ check_original_value: /* * Returning an update means the original on-page value might be lost, * and that's a problem if there's a reader that needs it. 
There are - * three cases: any update from a modify operation (because the modify - * has to be applied to a stable update, not the new on-page update), - * any lookaside table eviction (because the backing disk image is - * rewritten), or any reconciliation of a backing overflow record that - * will be physically removed once it's no longer needed. + * several cases: + * - any update with no backing record (because we will store an empty + * value on page and returning that is wrong). + * - any update from a modify operation (because the modify has to be + * applied to a stable update, not the new on-page update), + * - any lookaside table eviction (because the backing disk image is + * rewritten), + * - or any reconciliation of a backing overflow record that will be + * physically removed once it's no longer needed. */ if (*updp != NULL && ((*updp)->type == WT_UPDATE_MODIFIED || F_ISSET(r, WT_REC_LOOKASIDE) || (vpack != NULL && @@ -4206,7 +4217,7 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) /* Update any changes to the original on-page data items. 
*/ WT_SKIP_FOREACH(ins, WT_COL_UPDATE_SINGLE(page)) { - WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, &upd)); + WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, NULL, &upd)); if (upd != NULL) __bit_setv(r->first_free, WT_INSERT_RECNO(ins) - pageref->ref_recno, @@ -4250,8 +4261,8 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_REF *pageref) break; upd = NULL; } else { - WT_RET( - __rec_txn_read(session, r, ins, NULL, NULL, &upd)); + WT_RET(__rec_txn_read( + session, r, ins, NULL, NULL, NULL, &upd)); recno = WT_INSERT_RECNO(ins); } for (;;) { @@ -4602,7 +4613,7 @@ record_loop: /* upd = NULL; if (ins != NULL && WT_INSERT_RECNO(ins) == src_recno) { WT_ERR(__rec_txn_read( - session, r, ins, cip, vpack, &upd)); + session, r, ins, cip, vpack, NULL, &upd)); ins = WT_SKIP_NEXT(ins); } @@ -4819,8 +4830,8 @@ compare: /* upd = NULL; } else { - WT_ERR( - __rec_txn_read(session, r, ins, NULL, NULL, &upd)); + WT_ERR(__rec_txn_read( + session, r, ins, NULL, NULL, NULL, &upd)); n = WT_INSERT_RECNO(ins); } while (src_recno <= n) { @@ -5318,7 +5329,8 @@ __rec_row_leaf(WT_SESSION_IMPL *session, vpack = &_vpack; __wt_cell_unpack(val_cell, vpack); } - WT_ERR(__rec_txn_read(session, r, NULL, rip, vpack, &upd)); + WT_ERR(__rec_txn_read( + session, r, NULL, rip, vpack, NULL, &upd)); /* Build value cell. 
*/ dictionary = false; @@ -5632,7 +5644,7 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) WT_CURSOR_BTREE *cbt; WT_KV *key, *val; WT_UPDATE *upd; - bool ovfl_key; + bool ovfl_key, upd_saved; btree = S2BT(session); cbt = &r->update_modify_cbt; @@ -5641,11 +5653,32 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) val = &r->v; for (; ins != NULL; ins = WT_SKIP_NEXT(ins)) { - WT_RET(__rec_txn_read(session, r, ins, NULL, NULL, &upd)); + WT_RET(__rec_txn_read( + session, r, ins, NULL, NULL, &upd_saved, &upd)); + + if (upd == NULL) { + /* + * If no update is visible but some were saved, check + * for splits. + */ + if (!upd_saved) + continue; + if (!__rec_need_split(r, WT_INSERT_KEY_SIZE(ins))) + continue; + + /* Copy the current key into place and then split. */ + WT_RET(__wt_buf_set(session, r->cur, + WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins))); + WT_RET(__rec_split_crossing_bnd( + session, r, WT_INSERT_KEY_SIZE(ins))); - /* If no updates are visible there's no work to do. */ - if (upd == NULL) + /* + * Turn off prefix compression until a full key is + * written into the new page. + */ + r->key_pfx_compress = false; continue; + } switch (upd->type) { case WT_UPDATE_DELETED: @@ -5671,7 +5704,8 @@ __rec_row_leaf_insert(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_INSERT *ins) break; WT_ILLEGAL_VALUE(session); } - /* Build key cell. */ + + /* Build key cell. */ WT_RET(__rec_cell_build_leaf_key(session, r, WT_INSERT_KEY(ins), WT_INSERT_KEY_SIZE(ins), &ovfl_key)); |