diff options
author | Ramon Fernandez <ramon@mongodb.com> | 2016-05-05 07:32:30 -0400 |
---|---|---|
committer | Ramon Fernandez <ramon@mongodb.com> | 2016-05-05 07:32:36 -0400 |
commit | 150aa821caa327529a0996793c55a6b2e74acaf8 (patch) | |
tree | bd116d3f08cad05243dcdbf90e4a45791e0da1b9 /src/third_party/wiredtiger/src/btree | |
parent | 0ae4fb568aa6472a6030fd83a86fb2965d472095 (diff) | |
download | mongo-150aa821caa327529a0996793c55a6b2e74acaf8.tar.gz |
Import wiredtiger-wiredtiger-2.8.0-410-g636a7b2.tar.gz from wiredtiger branch mongodb-3.4
ref: eaa7b5f..636a7b2
WT-2103 add incremental backup testing to format
WT-2223 Add stress testing for in-memory
WT-2343 Assert we don't remove or rename when backup cursor is open
WT-2349 Add ability to open databases read-only
WT-2359 WiredTiger with Python will hang if a calloc failure occurs during __wt_connection_close
WT-2360 Allow disjunctions and combinations of operations in join cursors
WT-2446 Estimate WT cache hit ratio
WT-2450 salvage releases pages, then explicitly evicts them.
WT-2453 Throughput drop in wtperf evict Jenkins tests
WT-2479 dump utility discards table config (JSON)
WT-2504 Should READONLY always read basecfg file?
WT-2505 Review clang analyzer warnings
WT-2508 test programs should remove test directories on the "clean" target
WT-2518 LSM checkpoint handle acquisition optimization
WT-2520 WT_SESSION::verify should not alter tables
WT-2526 mixing and matching readonly and read/write handles
WT-2535 Extend test/format to test for transactions reading their writes
WT-2537 cannot open DB written by WT2.6.1 with WT2.8.0 due to WT_NOTFOUND on recovery
WT-2539 Implement file streaming above pluggable filesystems
WT-2540 Separate stream and file handle methods
WT-2542 fixed-length column store reconciliation overwrites original values
WT-2544 Investigate any thread populating eviction queue
WT-2546 Eviction server does not help evict pages sometimes
WT-2547 Add 1-eviction-worker jobs to Jenkins
WT-2548 Cap the amount of data handed to raw compression.
WT-2549 joins using recno keys return no values
WT-2550 java ex_schema example fails
WT-2552 Public API for pluggable filesystems
WT-2553 Document in-memory configuration and WT_CACHE_FULL error return
WT-2556 typo in the Java example code
WT-2557 format test program should discard log files after incremental backup
WT-2558 WT_PAGE structure reorganization
WT-2559 Jenkins Windows segfault in logging code
WT-2560 test/format workload stuck trying to update oldest transaction ID
WT-2562 reconfig02 test failing sometimes on PPC
WT-2565 item 3573 on page at [write-check] is a corrupted cell
WT-2566 All lock operations should be barriers
WT-2567 segfault in test/format log truncate
WT-2568 Java PackTest.java compilation error
WT-2569 win_handle_read should always call GetLastError on error
WT-2570 Minor lint cleanups.
WT-2571 join code cleanup
WT-2572 don't select an in-memory format run if incompatible options configured
WT-2573 free of stack-allocated WT_REF
WT-2574 format doesn't free all allocated configure memory
WT-2576 variable-length column-store out-of-order return
WT-2577 core dump discarding non-existent addresses
WT-2579 in-memory configurations break debugging support
WT-2580 potential SWIG naming conflict in Java
WT-2581 assert multi->disk_image == NULL
WT-2582 cache eviction server error: WT_RESTART
WT-2583 incremental backup can prevent future recovery
WT-2584 don't use periods in error messages
WT-2586 Remove ex_config.c until config cursors are supported
WT-2592 Joins using non-recno key types not working
WT-2593 disk full with pre-allocated log files
WT-2595 Fix compiler warning in packing tests
WT-2598 in-memory FS needs fast lookup on file names
WT-2599 split out the checksum code from the support directory
WT-2600 clean up test program #includes
WT-2602 LSM stress hangs with very large uncompressed pages
WT-2609 Incorrect "skips API_END call" error.
WT-2612 The dist/s_prototypes script is creating a debugging file xxx.
WT-2613 WT Compile windows Alt is returning a C4100 error
WT-2615 Enabling checkpoints in test/format leads to reduced concurrency
WT-2616 In-memory deadlock getting size
WT-2621 WiredTiger fails to compile on MSVC 2013
SERVER-23661 $sample takes disproportionately long time on newly created collection
SERVER-23904 WiredTiger changes for MongoDB 3.3.6
Diffstat (limited to 'src/third_party/wiredtiger/src/btree')
21 files changed, 323 insertions, 287 deletions
diff --git a/src/third_party/wiredtiger/src/btree/bt_curnext.c b/src/third_party/wiredtiger/src/btree/bt_curnext.c index 63b2e2abebc..70b3ba56e31 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curnext.c +++ b/src/third_party/wiredtiger/src/btree/bt_curnext.c @@ -86,10 +86,10 @@ __cursor_fix_next(WT_CURSOR_BTREE *cbt, bool newpage) /* Initialize for each new page. */ if (newpage) { - cbt->last_standard_recno = __col_fix_last_recno(page); + cbt->last_standard_recno = __col_fix_last_recno(cbt->ref); if (cbt->last_standard_recno == 0) return (WT_NOTFOUND); - __cursor_set_recno(cbt, page->pg_fix_recno); + __cursor_set_recno(cbt, cbt->ref->ref_recno); goto new_page; } @@ -107,7 +107,7 @@ new_page: cbt->ins = NULL; upd = cbt->ins == NULL ? NULL : __wt_txn_read(session, cbt->ins->upd); if (upd == NULL) { - cbt->v = __bit_getv_recno(page, cbt->recno, btree->bitcnt); + cbt->v = __bit_getv_recno(cbt->ref, cbt->recno, btree->bitcnt); val->data = &cbt->v; } else val->data = WT_UPDATE_DATA(upd); @@ -179,10 +179,10 @@ __cursor_var_next(WT_CURSOR_BTREE *cbt, bool newpage) /* Initialize for each new page. */ if (newpage) { - cbt->last_standard_recno = __col_var_last_recno(page); + cbt->last_standard_recno = __col_var_last_recno(cbt->ref); if (cbt->last_standard_recno == 0) return (WT_NOTFOUND); - __cursor_set_recno(cbt, page->pg_var_recno); + __cursor_set_recno(cbt, cbt->ref->ref_recno); goto new_page; } @@ -194,7 +194,7 @@ __cursor_var_next(WT_CURSOR_BTREE *cbt, bool newpage) new_page: /* Find the matching WT_COL slot. */ if ((cip = - __col_var_search(page, cbt->recno, &rle_start)) == NULL) + __col_var_search(cbt->ref, cbt->recno, &rle_start)) == NULL) return (WT_NOTFOUND); cbt->slot = WT_COL_SLOT(page, cip); @@ -558,7 +558,8 @@ __wt_btcur_iterate_setup(WT_CURSOR_BTREE *cbt) * page. */ cbt->last_standard_recno = page->type == WT_PAGE_COL_VAR ? 
- __col_var_last_recno(page) : __col_fix_last_recno(page); + __col_var_last_recno(cbt->ref) : + __col_fix_last_recno(cbt->ref); /* If we're traversing the append list, set the reference. */ if (cbt->ins_head != NULL && diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c index 7475c0f1312..872f648446c 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curprev.c +++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c @@ -128,12 +128,10 @@ static inline int __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage) { WT_ITEM *val; - WT_PAGE *page; WT_SESSION_IMPL *session; WT_UPDATE *upd; session = (WT_SESSION_IMPL *)cbt->iface.session; - page = cbt->ref->page; val = &cbt->iface.value; if (newpage) { @@ -176,8 +174,8 @@ __cursor_fix_append_prev(WT_CURSOR_BTREE *cbt, bool newpage) * to a record number matching the first record on the page. */ if (cbt->ins == NULL && - (cbt->recno == page->pg_fix_recno || - __col_fix_last_recno(page) != 0)) + (cbt->recno == cbt->ref->ref_recno || + __col_fix_last_recno(cbt->ref) != 0)) return (WT_NOTFOUND); } @@ -234,7 +232,7 @@ __cursor_fix_prev(WT_CURSOR_BTREE *cbt, bool newpage) /* Initialize for each new page. */ if (newpage) { - cbt->last_standard_recno = __col_fix_last_recno(page); + cbt->last_standard_recno = __col_fix_last_recno(cbt->ref); if (cbt->last_standard_recno == 0) return (WT_NOTFOUND); __cursor_set_recno(cbt, cbt->last_standard_recno); @@ -242,7 +240,7 @@ __cursor_fix_prev(WT_CURSOR_BTREE *cbt, bool newpage) } /* Move to the previous entry and return the item. */ - if (cbt->recno == page->pg_fix_recno) + if (cbt->recno == cbt->ref->ref_recno) return (WT_NOTFOUND); __cursor_set_recno(cbt, cbt->recno - 1); @@ -255,7 +253,7 @@ new_page: cbt->ins = NULL; upd = cbt->ins == NULL ? 
NULL : __wt_txn_read(session, cbt->ins->upd); if (upd == NULL) { - cbt->v = __bit_getv_recno(page, cbt->recno, btree->bitcnt); + cbt->v = __bit_getv_recno(cbt->ref, cbt->recno, btree->bitcnt); val->data = &cbt->v; } else val->data = WT_UPDATE_DATA(upd); @@ -327,7 +325,7 @@ __cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage) /* Initialize for each new page. */ if (newpage) { - cbt->last_standard_recno = __col_var_last_recno(page); + cbt->last_standard_recno = __col_var_last_recno(cbt->ref); if (cbt->last_standard_recno == 0) return (WT_NOTFOUND); __cursor_set_recno(cbt, cbt->last_standard_recno); @@ -338,12 +336,12 @@ __cursor_var_prev(WT_CURSOR_BTREE *cbt, bool newpage) for (;;) { __cursor_set_recno(cbt, cbt->recno - 1); -new_page: if (cbt->recno < page->pg_var_recno) +new_page: if (cbt->recno < cbt->ref->ref_recno) return (WT_NOTFOUND); /* Find the matching WT_COL slot. */ if ((cip = - __col_var_search(page, cbt->recno, &rle_start)) == NULL) + __col_var_search(cbt->ref, cbt->recno, &rle_start)) == NULL) return (WT_NOTFOUND); cbt->slot = WT_COL_SLOT(page, cip); diff --git a/src/third_party/wiredtiger/src/btree/bt_cursor.c b/src/third_party/wiredtiger/src/btree/bt_cursor.c index 1f3ac443495..4b73b76c8c8 100644 --- a/src/third_party/wiredtiger/src/btree/bt_cursor.c +++ b/src/third_party/wiredtiger/src/btree/bt_cursor.c @@ -164,12 +164,12 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) * column-store pages don't have slots, but map one-to-one to * keys, check for retrieval past the end of the page. */ - if (cbt->recno >= page->pg_fix_recno + page->pg_fix_entries) + if (cbt->recno >= cbt->ref->ref_recno + page->pg_fix_entries) return (false); /* - * Updates aren't stored on the page, an update would have - * appeared as an "insert" object; no further checks to do. + * An update would have appeared as an "insert" object; no + * further checks to do. 
*/ break; case BTREE_COL_VAR: @@ -179,19 +179,18 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) WT_ASSERT(session, cbt->slot < page->pg_var_entries); /* - * Column-store updates aren't stored on the page, instead they - * are stored as "insert" objects. If search returned an insert - * object we can't return, the returned on-page object must be - * checked for a match. + * Column-store updates are stored as "insert" objects. If + * search returned an insert object we can't return, the + * returned on-page object must be checked for a match. */ if (cbt->ins != NULL && !F_ISSET(cbt, WT_CBT_VAR_ONPAGE_MATCH)) return (false); /* - * Updates aren't stored on the page, an update would have - * appeared as an "insert" object; however, variable-length - * column store deletes are written into the backing store, - * check the cell for a record already deleted when read. + * Although updates would have appeared as an "insert" objects, + * variable-length column store deletes are written into the + * backing store; check the cell for a record already deleted + * when read. */ cip = &page->pg_var_d[cbt->slot]; if ((cell = WT_COL_PTR(page, cip)) == NULL || @@ -211,9 +210,11 @@ __cursor_valid(WT_CURSOR_BTREE *cbt, WT_UPDATE **updp) if (cbt->ins != NULL) return (false); - /* Updates are stored on the page, check for a delete. */ - if (page->pg_row_upd != NULL && (upd = __wt_txn_read( - session, page->pg_row_upd[cbt->slot])) != NULL) { + /* Check for an update. */ + if (page->modify != NULL && + page->modify->mod_row_update != NULL && + (upd = __wt_txn_read(session, + page->modify->mod_row_update[cbt->slot])) != NULL) { if (WT_UPDATE_DELETED_ISSET(upd)) return (false); if (updp != NULL) @@ -325,7 +326,7 @@ __wt_btcur_search(WT_CURSOR_BTREE *cbt) valid = false; if (F_ISSET(cbt, WT_CBT_ACTIVE) && cbt->ref->page->read_gen != WT_READGEN_OLDEST) { - __wt_txn_cursor_op(session); + WT_ERR(__wt_txn_cursor_op(session)); WT_ERR(btree->type == BTREE_ROW ? 
__cursor_row_search(session, cbt, cbt->ref, false) : @@ -405,7 +406,7 @@ __wt_btcur_search_near(WT_CURSOR_BTREE *cbt, int *exactp) if (btree->type == BTREE_ROW && F_ISSET(cbt, WT_CBT_ACTIVE) && cbt->ref->page->read_gen != WT_READGEN_OLDEST) { - __wt_txn_cursor_op(session); + WT_ERR(__wt_txn_cursor_op(session)); WT_ERR(__cursor_row_search(session, cbt, cbt->ref, true)); @@ -596,9 +597,12 @@ __curfile_update_check(WT_CURSOR_BTREE *cbt) return (0); if (cbt->ins != NULL) return (__wt_txn_update_check(session, cbt->ins->upd)); - if (btree->type == BTREE_ROW && cbt->ref->page->pg_row_upd != NULL) - return (__wt_txn_update_check( - session, cbt->ref->page->pg_row_upd[cbt->slot])); + + if (btree->type == BTREE_ROW && + cbt->ref->page->modify != NULL && + cbt->ref->page->modify->mod_row_update != NULL) + return (__wt_txn_update_check(session, + cbt->ref->page->modify->mod_row_update[cbt->slot])); return (0); } diff --git a/src/third_party/wiredtiger/src/btree/bt_debug.c b/src/third_party/wiredtiger/src/btree/bt_debug.c index 8ce1463a0db..bd5970ecf86 100644 --- a/src/third_party/wiredtiger/src/btree/bt_debug.c +++ b/src/third_party/wiredtiger/src/btree/bt_debug.c @@ -19,7 +19,7 @@ typedef struct { * When using the standard event handlers, the debugging output has to * do its own message handling because its output isn't line-oriented. 
*/ - WT_FH *fh; /* Output file stream */ + FILE *fp; WT_ITEM *msg; /* Buffered message */ WT_ITEM *tmp; /* Temporary space */ @@ -36,17 +36,17 @@ static int __debug_config(WT_SESSION_IMPL *, WT_DBG *, const char *); static int __debug_dsk_cell(WT_DBG *, const WT_PAGE_HEADER *); static void __debug_dsk_col_fix(WT_DBG *, const WT_PAGE_HEADER *); static void __debug_item(WT_DBG *, const char *, const void *, size_t); -static int __debug_page(WT_DBG *, WT_PAGE *, uint32_t); -static void __debug_page_col_fix(WT_DBG *, WT_PAGE *); +static int __debug_page(WT_DBG *, WT_REF *, uint32_t); +static void __debug_page_col_fix(WT_DBG *, WT_REF *); static int __debug_page_col_int(WT_DBG *, WT_PAGE *, uint32_t); -static int __debug_page_col_var(WT_DBG *, WT_PAGE *); -static int __debug_page_metadata(WT_DBG *, WT_PAGE *); +static int __debug_page_col_var(WT_DBG *, WT_REF *); +static int __debug_page_metadata(WT_DBG *, WT_REF *); static int __debug_page_row_int(WT_DBG *, WT_PAGE *, uint32_t); static int __debug_page_row_leaf(WT_DBG *, WT_PAGE *); static void __debug_ref(WT_DBG *, WT_REF *); static void __debug_row_skip(WT_DBG *, WT_INSERT_HEAD *); static int __debug_tree( - WT_SESSION_IMPL *, WT_BTREE *, WT_PAGE *, const char *, uint32_t); + WT_SESSION_IMPL *, WT_BTREE *, WT_REF *, const char *, uint32_t); static void __debug_update(WT_DBG *, WT_UPDATE *, bool); static void __dmsg(WT_DBG *, const char *, ...) WT_GCC_FUNC_DECL_ATTRIBUTE((format (printf, 2, 3))); @@ -97,8 +97,11 @@ __debug_config(WT_SESSION_IMPL *session, WT_DBG *ds, const char *ofile) if (ofile == NULL) return (__wt_scr_alloc(session, 512, &ds->msg)); - return (__wt_open(session, ofile, WT_FILE_TYPE_REGULAR, - WT_OPEN_CREATE | WT_STREAM_LINE_BUFFER | WT_STREAM_WRITE, &ds->fh)); + if ((ds->fp = fopen(ofile, "w")) == NULL) + return (EIO); + __wt_stream_set_line_buffer(ds->fp); + + return (0); } /* @@ -127,7 +130,8 @@ __dmsg_wrapup(WT_DBG *ds) } /* Close any file we opened. 
*/ - (void)__wt_close(session, &ds->fh); + if (ds->fp != NULL) + (void)fclose(ds->fp); } /* @@ -152,7 +156,7 @@ __dmsg(WT_DBG *ds, const char *fmt, ...) * the output chunk, and pass it to the event handler once we see a * terminating newline. */ - if (ds->fh == NULL) { + if (ds->fp == NULL) { msg = ds->msg; for (;;) { p = (char *)msg->mem + msg->size; @@ -184,7 +188,7 @@ __dmsg(WT_DBG *ds, const char *fmt, ...) } } else { va_start(ap, fmt); - (void)__wt_vfprintf(session, ds->fh, fmt, ap); + (void)vfprintf(ds->fp, fmt, ap); va_end(ap); } } @@ -498,10 +502,10 @@ __wt_debug_tree_shape( */ int __wt_debug_tree_all( - WT_SESSION_IMPL *session, WT_BTREE *btree, WT_PAGE *page, const char *ofile) + WT_SESSION_IMPL *session, WT_BTREE *btree, WT_REF *ref, const char *ofile) { return (__debug_tree(session, - btree, page, ofile, WT_DEBUG_TREE_LEAF | WT_DEBUG_TREE_WALK)); + btree, ref, ofile, WT_DEBUG_TREE_LEAF | WT_DEBUG_TREE_WALK)); } /* @@ -513,9 +517,9 @@ __wt_debug_tree_all( */ int __wt_debug_tree( - WT_SESSION_IMPL *session, WT_BTREE *btree, WT_PAGE *page, const char *ofile) + WT_SESSION_IMPL *session, WT_BTREE *btree, WT_REF *ref, const char *ofile) { - return (__debug_tree(session, btree, page, ofile, WT_DEBUG_TREE_WALK)); + return (__debug_tree(session, btree, ref, ofile, WT_DEBUG_TREE_WALK)); } /* @@ -523,7 +527,7 @@ __wt_debug_tree( * Dump the in-memory information for a page. 
*/ int -__wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile) +__wt_debug_page(WT_SESSION_IMPL *session, WT_REF *ref, const char *ofile) { WT_DBG *ds, _ds; WT_DECL_RET; @@ -533,7 +537,7 @@ __wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile) ds = &_ds; WT_RET(__debug_config(session, ds, ofile)); - ret = __debug_page(ds, page, WT_DEBUG_TREE_LEAF); + ret = __debug_page(ds, ref, WT_DEBUG_TREE_LEAF); __dmsg_wrapup(ds); @@ -549,9 +553,8 @@ __wt_debug_page(WT_SESSION_IMPL *session, WT_PAGE *page, const char *ofile) * in this function */ static int -__debug_tree( - WT_SESSION_IMPL *session, WT_BTREE *btree, - WT_PAGE *page, const char *ofile, uint32_t flags) +__debug_tree(WT_SESSION_IMPL *session, + WT_BTREE *btree, WT_REF *ref, const char *ofile, uint32_t flags) { WT_DBG *ds, _ds; WT_DECL_RET; @@ -560,10 +563,10 @@ __debug_tree( WT_RET(__debug_config(session, ds, ofile)); /* A NULL page starts at the top of the tree -- it's a convenience. */ - if (page == NULL) - page = btree->root.page; + if (ref == NULL) + ref = &btree->root; - WT_WITH_BTREE(session, btree, ret = __debug_page(ds, page, flags)); + WT_WITH_BTREE(session, btree, ret = __debug_page(ds, ref, flags)); __dmsg_wrapup(ds); @@ -575,7 +578,7 @@ __debug_tree( * Dump the in-memory information for an in-memory page. */ static int -__debug_page(WT_DBG *ds, WT_PAGE *page, uint32_t flags) +__debug_page(WT_DBG *ds, WT_REF *ref, uint32_t flags) { WT_DECL_RET; WT_SESSION_IMPL *session; @@ -583,32 +586,32 @@ __debug_page(WT_DBG *ds, WT_PAGE *page, uint32_t flags) session = ds->session; /* Dump the page metadata. */ - WT_WITH_PAGE_INDEX(session, ret = __debug_page_metadata(ds, page)); + WT_WITH_PAGE_INDEX(session, ret = __debug_page_metadata(ds, ref)); WT_RET(ret); /* Dump the page. 
*/ - switch (page->type) { + switch (ref->page->type) { case WT_PAGE_COL_FIX: if (LF_ISSET(WT_DEBUG_TREE_LEAF)) - __debug_page_col_fix(ds, page); + __debug_page_col_fix(ds, ref); break; case WT_PAGE_COL_INT: WT_WITH_PAGE_INDEX(session, - ret = __debug_page_col_int(ds, page, flags)); + ret = __debug_page_col_int(ds, ref->page, flags)); WT_RET(ret); break; case WT_PAGE_COL_VAR: if (LF_ISSET(WT_DEBUG_TREE_LEAF)) - WT_RET(__debug_page_col_var(ds, page)); + WT_RET(__debug_page_col_var(ds, ref)); break; case WT_PAGE_ROW_INT: WT_WITH_PAGE_INDEX(session, - ret = __debug_page_row_int(ds, page, flags)); + ret = __debug_page_row_int(ds, ref->page, flags)); WT_RET(ret); break; case WT_PAGE_ROW_LEAF: if (LF_ISSET(WT_DEBUG_TREE_LEAF)) - WT_RET(__debug_page_row_leaf(ds, page)); + WT_RET(__debug_page_row_leaf(ds, ref->page)); break; WT_ILLEGAL_VALUE(session); } @@ -621,30 +624,32 @@ __debug_page(WT_DBG *ds, WT_PAGE *page, uint32_t flags) * Dump an in-memory page's metadata. */ static int -__debug_page_metadata(WT_DBG *ds, WT_PAGE *page) +__debug_page_metadata(WT_DBG *ds, WT_REF *ref) { + WT_PAGE *page; WT_PAGE_INDEX *pindex; WT_PAGE_MODIFY *mod; WT_SESSION_IMPL *session; uint32_t entries; session = ds->session; + page = ref->page; mod = page->modify; __dmsg(ds, "%p", page); switch (page->type) { case WT_PAGE_COL_INT: - __dmsg(ds, " recno %" PRIu64, page->pg_intl_recno); + __dmsg(ds, " recno %" PRIu64, ref->ref_recno); WT_INTL_INDEX_GET(session, page, pindex); entries = pindex->entries; break; case WT_PAGE_COL_FIX: - __dmsg(ds, " recno %" PRIu64, page->pg_fix_recno); + __dmsg(ds, " recno %" PRIu64, ref->ref_recno); entries = page->pg_fix_entries; break; case WT_PAGE_COL_VAR: - __dmsg(ds, " recno %" PRIu64, page->pg_var_recno); + __dmsg(ds, " recno %" PRIu64, ref->ref_recno); entries = page->pg_var_entries; break; case WT_PAGE_ROW_INT: @@ -707,10 +712,11 @@ __debug_page_metadata(WT_DBG *ds, WT_PAGE *page) * Dump an in-memory WT_PAGE_COL_FIX page. 
*/ static void -__debug_page_col_fix(WT_DBG *ds, WT_PAGE *page) +__debug_page_col_fix(WT_DBG *ds, WT_REF *ref) { WT_BTREE *btree; WT_INSERT *ins; + WT_PAGE *page; const WT_PAGE_HEADER *dsk; WT_SESSION_IMPL *session; uint64_t recno; @@ -721,8 +727,9 @@ __debug_page_col_fix(WT_DBG *ds, WT_PAGE *page) session = ds->session; btree = S2BT(session); + page = ref->page; dsk = page->dsk; - recno = page->pg_fix_recno; + recno = ref->ref_recno; if (dsk != NULL) { ins = WT_SKIP_FIRST(WT_COL_UPDATE_SINGLE(page)); @@ -767,7 +774,7 @@ __debug_page_col_int(WT_DBG *ds, WT_PAGE *page, uint32_t flags) session = ds->session; WT_INTL_FOREACH_BEGIN(session, page, ref) { - __dmsg(ds, "\trecno %" PRIu64 "\n", ref->key.recno); + __dmsg(ds, "\trecno %" PRIu64 "\n", ref->ref_recno); __debug_ref(ds, ref); } WT_INTL_FOREACH_END; @@ -775,7 +782,7 @@ __debug_page_col_int(WT_DBG *ds, WT_PAGE *page, uint32_t flags) WT_INTL_FOREACH_BEGIN(session, page, ref) { if (ref->state == WT_REF_MEM) { __dmsg(ds, "\n"); - WT_RET(__debug_page(ds, ref->page, flags)); + WT_RET(__debug_page(ds, ref, flags)); } } WT_INTL_FOREACH_END; @@ -787,18 +794,20 @@ __debug_page_col_int(WT_DBG *ds, WT_PAGE *page, uint32_t flags) * Dump an in-memory WT_PAGE_COL_VAR page. 
*/ static int -__debug_page_col_var(WT_DBG *ds, WT_PAGE *page) +__debug_page_col_var(WT_DBG *ds, WT_REF *ref) { WT_CELL *cell; WT_CELL_UNPACK *unpack, _unpack; WT_COL *cip; WT_INSERT_HEAD *update; + WT_PAGE *page; uint64_t recno, rle; uint32_t i; char tag[64]; unpack = &_unpack; - recno = page->pg_var_recno; + page = ref->page; + recno = ref->ref_recno; WT_COL_FOREACH(page, cip, i) { if ((cell = WT_COL_PTR(page, cip)) == NULL) { @@ -849,7 +858,7 @@ __debug_page_row_int(WT_DBG *ds, WT_PAGE *page, uint32_t flags) WT_INTL_FOREACH_BEGIN(session, page, ref) { if (ref->state == WT_REF_MEM) { __dmsg(ds, "\n"); - WT_RET(__debug_page(ds, ref->page, flags)); + WT_RET(__debug_page(ds, ref, flags)); } } WT_INTL_FOREACH_END; return (0); @@ -952,8 +961,7 @@ __debug_update(WT_DBG *ds, WT_UPDATE *upd, bool hexbyte) __dmsg(ds, "\tvalue {deleted}\n"); else if (hexbyte) { __dmsg(ds, "\t{"); - __debug_hex_byte(ds, - ((uint8_t *)WT_UPDATE_DATA(upd))[0]); + __debug_hex_byte(ds, *(uint8_t *)WT_UPDATE_DATA(upd)); __dmsg(ds, "}\n"); } else __debug_item(ds, diff --git a/src/third_party/wiredtiger/src/btree/bt_delete.c b/src/third_party/wiredtiger/src/btree/bt_delete.c index ba16dd204e8..54b7fedb31d 100644 --- a/src/third_party/wiredtiger/src/btree/bt_delete.c +++ b/src/third_party/wiredtiger/src/btree/bt_delete.c @@ -288,10 +288,9 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) * read-only or if the application never modifies the tree, we're not * able to do so.) */ - if (btree->modified) { - WT_RET(__wt_page_modify_init(session, page)); + WT_RET(__wt_page_modify_init(session, page)); + if (btree->modified) __wt_page_modify_set(session, page); - } /* * An operation is accessing a "deleted" page, and we're building an @@ -326,7 +325,7 @@ __wt_delete_page_instantiate(WT_SESSION_IMPL *session, WT_REF *ref) /* Allocate the per-page update array. 
*/ WT_ERR(__wt_calloc_def(session, page->pg_row_entries, &upd_array)); - page->pg_row_upd = upd_array; + page->modify->mod_row_update = upd_array; /* * Fill in the per-reference update array with references to update diff --git a/src/third_party/wiredtiger/src/btree/bt_discard.c b/src/third_party/wiredtiger/src/btree/bt_discard.c index 1181d92609f..9807d5bc88f 100644 --- a/src/third_party/wiredtiger/src/btree/bt_discard.c +++ b/src/third_party/wiredtiger/src/btree/bt_discard.c @@ -40,7 +40,6 @@ __wt_ref_out(WT_SESSION_IMPL *session, WT_REF *ref) void __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) { - WT_FH *fh; WT_PAGE *page; WT_PAGE_HEADER *dsk; WT_PAGE_MODIFY *mod; @@ -134,10 +133,11 @@ __wt_page_out(WT_SESSION_IMPL *session, WT_PAGE **pagep) dsk = (WT_PAGE_HEADER *)page->dsk; if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_ALLOC)) __wt_overwrite_and_free_len(session, dsk, dsk->mem_size); - if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_MAPPED)) { - fh = S2BT(session)->bm->block->fh; - (void)fh->fh_map_discard(session, fh, dsk, dsk->mem_size); - } + + /* Discard any mapped image. */ + if (F_ISSET_ATOMIC(page, WT_PAGE_DISK_MAPPED)) + (void)S2BT(session)->bm->map_discard( + S2BT(session)->bm, session, dsk, (size_t)dsk->mem_size); __wt_overwrite_and_free(session, page); } @@ -194,16 +194,33 @@ __free_page_modify(WT_SESSION_IMPL *session, WT_PAGE *page) __free_skip_list( session, WT_SKIP_FIRST(append), update_ignore); __wt_free(session, append); - __wt_free(session, mod->mod_append); + __wt_free(session, mod->mod_col_append); } /* Free the insert/update array. */ - if (mod->mod_update != NULL) - __free_skip_array(session, mod->mod_update, + if (mod->mod_col_update != NULL) + __free_skip_array(session, mod->mod_col_update, page->type == WT_PAGE_COL_FIX ? 1 : page->pg_var_entries, update_ignore); break; + case WT_PAGE_ROW_LEAF: + /* + * Free the insert array. 
+ * + * Row-store tables have one additional slot in the insert array + * (the insert array has an extra slot to hold keys that sort + * before keys found on the original page). + */ + if (mod->mod_row_insert != NULL) + __free_skip_array(session, mod->mod_row_insert, + page->pg_row_entries + 1, update_ignore); + + /* Free the update array. */ + if (mod->mod_row_update != NULL) + __free_update(session, mod->mod_row_update, + page->pg_row_entries, update_ignore); + break; } /* Free the overflow on-page, reuse and transaction-cache skiplists. */ @@ -324,10 +341,6 @@ __free_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) WT_ROW *rip; uint32_t i; void *copy; - bool update_ignore; - - /* In some failed-split cases, we can't discard updates. */ - update_ignore = F_ISSET_ATOMIC(page, WT_PAGE_UPDATE_IGNORE); /* * Free the in-memory index array. @@ -342,22 +355,6 @@ __free_page_row_leaf(WT_SESSION_IMPL *session, WT_PAGE *page) page, copy, &ikey, NULL, NULL, NULL); __wt_free(session, ikey); } - - /* - * Free the insert array. - * - * Row-store tables have one additional slot in the insert array (the - * insert array has an extra slot to hold keys that sort before keys - * found on the original page). - */ - if (page->pg_row_ins != NULL) - __free_skip_array(session, - page->pg_row_ins, page->pg_row_entries + 1, update_ignore); - - /* Free the update array. */ - if (page->pg_row_upd != NULL) - __free_update(session, - page->pg_row_upd, page->pg_row_entries, update_ignore); } /* diff --git a/src/third_party/wiredtiger/src/btree/bt_handle.c b/src/third_party/wiredtiger/src/btree/bt_handle.c index 02eea9c2f0c..ba545859d07 100644 --- a/src/third_party/wiredtiger/src/btree/bt_handle.c +++ b/src/third_party/wiredtiger/src/btree/bt_handle.c @@ -371,7 +371,7 @@ __wt_root_ref_init(WT_REF *root_ref, WT_PAGE *root, bool is_recno) root_ref->page = root; root_ref->state = WT_REF_MEM; - root_ref->key.recno = is_recno ? 1 : WT_RECNO_OOB; + root_ref->ref_recno = is_recno ? 
1 : WT_RECNO_OOB; root->pg_intl_parent_ref = root_ref; } @@ -495,7 +495,7 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, bool creation) case BTREE_COL_FIX: case BTREE_COL_VAR: WT_ERR(__wt_page_alloc( - session, WT_PAGE_COL_INT, 1, 1, true, &root)); + session, WT_PAGE_COL_INT, 1, true, &root)); root->pg_intl_parent_ref = &btree->root; pindex = WT_INTL_INDEX_GET_SAFE(root); @@ -504,11 +504,11 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, bool creation) ref->page = NULL; ref->addr = NULL; ref->state = WT_REF_DELETED; - ref->key.recno = 1; + ref->ref_recno = 1; break; case BTREE_ROW: WT_ERR(__wt_page_alloc( - session, WT_PAGE_ROW_INT, 0, 1, true, &root)); + session, WT_PAGE_ROW_INT, 1, true, &root)); root->pg_intl_parent_ref = &btree->root; pindex = WT_INTL_INDEX_GET_SAFE(root); @@ -524,7 +524,7 @@ __btree_tree_open_empty(WT_SESSION_IMPL *session, bool creation) /* Bulk loads require a leaf page for reconciliation: create it now. */ if (F_ISSET(btree, WT_BTREE_BULK)) { - WT_ERR(__wt_btree_new_leaf_page(session, 1, &leaf)); + WT_ERR(__wt_btree_new_leaf_page(session, &leaf)); ref->page = leaf; ref->state = WT_REF_MEM; WT_ERR(__wt_page_modify_init(session, leaf)); @@ -548,8 +548,7 @@ err: if (leaf != NULL) * Create an empty leaf page. 
*/ int -__wt_btree_new_leaf_page( - WT_SESSION_IMPL *session, uint64_t recno, WT_PAGE **pagep) +__wt_btree_new_leaf_page(WT_SESSION_IMPL *session, WT_PAGE **pagep) { WT_BTREE *btree; @@ -558,15 +557,15 @@ __wt_btree_new_leaf_page( switch (btree->type) { case BTREE_COL_FIX: WT_RET(__wt_page_alloc( - session, WT_PAGE_COL_FIX, recno, 0, false, pagep)); + session, WT_PAGE_COL_FIX, 0, false, pagep)); break; case BTREE_COL_VAR: WT_RET(__wt_page_alloc( - session, WT_PAGE_COL_VAR, recno, 0, false, pagep)); + session, WT_PAGE_COL_VAR, 0, false, pagep)); break; case BTREE_ROW: WT_RET(__wt_page_alloc( - session, WT_PAGE_ROW_LEAF, WT_RECNO_OOB, 0, false, pagep)); + session, WT_PAGE_ROW_LEAF, 0, false, pagep)); break; WT_ILLEGAL_VALUE(session); } @@ -639,7 +638,7 @@ __btree_get_last_recno(WT_SESSION_IMPL *session) page = next_walk->page; btree->last_recno = page->type == WT_PAGE_COL_VAR ? - __col_var_last_recno(page) : __col_fix_last_recno(page); + __col_var_last_recno(next_walk) : __col_fix_last_recno(next_walk); return (__wt_page_release(session, next_walk, 0)); } diff --git a/src/third_party/wiredtiger/src/btree/bt_huffman.c b/src/third_party/wiredtiger/src/btree/bt_huffman.c index a1aaf2c7ea0..9e9d69c342e 100644 --- a/src/third_party/wiredtiger/src/btree/bt_huffman.c +++ b/src/third_party/wiredtiger/src/btree/bt_huffman.c @@ -133,10 +133,10 @@ static int __wt_huffman_read(WT_SESSION_IMPL *, * Check for a Huffman configuration file and return the file name. */ static int -__huffman_confchk_file( - WT_SESSION_IMPL *session, WT_CONFIG_ITEM *v, bool *is_utf8p, WT_FH **fhp) +__huffman_confchk_file(WT_SESSION_IMPL *session, + WT_CONFIG_ITEM *v, bool *is_utf8p, WT_FSTREAM **fsp) { - WT_FH *fh; + WT_FSTREAM *fs; WT_DECL_RET; size_t len; char *fname; @@ -157,14 +157,13 @@ __huffman_confchk_file( /* Check the file exists. 
*/ WT_RET(__wt_strndup(session, v->str + len, v->len - len, &fname)); - WT_ERR(__wt_open(session, fname, WT_FILE_TYPE_REGULAR, - WT_OPEN_FIXED | WT_OPEN_READONLY | WT_STREAM_READ, &fh)); + WT_ERR(__wt_fopen(session, fname, WT_OPEN_FIXED, WT_STREAM_READ, &fs)); /* Optionally return the file handle. */ - if (fhp == NULL) - (void)__wt_close(session, &fh); + if (fsp == NULL) + (void)__wt_fclose(session, &fs); else - *fhp = fh; + *fsp = fs; err: __wt_free(session, fname); @@ -300,7 +299,7 @@ __wt_huffman_read(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *ip, struct __wt_huffman_table *table, *tp; WT_DECL_ITEM(tmp); WT_DECL_RET; - WT_FH *fh; + WT_FSTREAM *fs; int64_t symbol, frequency; u_int entries, lineno; int n; @@ -309,13 +308,13 @@ __wt_huffman_read(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *ip, *tablep = NULL; *entriesp = *numbytesp = 0; - fh = NULL; + fs = NULL; table = NULL; /* * Try and open the backing file. */ - WT_RET(__huffman_confchk_file(session, ip, &is_utf8, &fh)); + WT_RET(__huffman_confchk_file(session, ip, &is_utf8, &fs)); /* * UTF-8 table is 256 bytes, with a range of 0-255. 
@@ -333,7 +332,7 @@ __wt_huffman_read(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *ip, WT_ERR(__wt_scr_alloc(session, 0, &tmp)); for (tp = table, lineno = 1;; ++tp, ++lineno) { - WT_ERR(__wt_getline(session, tmp, fh)); + WT_ERR(__wt_getline(session, fs, tmp)); if (tmp->size == 0) break; n = sscanf( @@ -378,7 +377,7 @@ __wt_huffman_read(WT_SESSION_IMPL *session, WT_CONFIG_ITEM *ip, if (0) { err: __wt_free(session, table); } - (void)__wt_close(session, &fh); + (void)__wt_fclose(session, &fs); __wt_scr_free(session, &tmp); return (ret); diff --git a/src/third_party/wiredtiger/src/btree/bt_page.c b/src/third_party/wiredtiger/src/btree/bt_page.c index 9fa0145bbdd..00ec8aa4494 100644 --- a/src/third_party/wiredtiger/src/btree/bt_page.c +++ b/src/third_party/wiredtiger/src/btree/bt_page.c @@ -10,7 +10,7 @@ static void __inmem_col_fix(WT_SESSION_IMPL *, WT_PAGE *); static void __inmem_col_int(WT_SESSION_IMPL *, WT_PAGE *); -static int __inmem_col_var(WT_SESSION_IMPL *, WT_PAGE *, size_t *); +static int __inmem_col_var(WT_SESSION_IMPL *, WT_PAGE *, uint64_t, size_t *); static int __inmem_row_int(WT_SESSION_IMPL *, WT_PAGE *, size_t *); static int __inmem_row_leaf(WT_SESSION_IMPL *, WT_PAGE *); static int __inmem_row_leaf_entries( @@ -21,8 +21,8 @@ static int __inmem_row_leaf_entries( * Create or read a page into the cache. */ int -__wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, - uint64_t recno, uint32_t alloc_entries, bool alloc_refs, WT_PAGE **pagep) +__wt_page_alloc(WT_SESSION_IMPL *session, + uint8_t type, uint32_t alloc_entries, bool alloc_refs, WT_PAGE **pagep) { WT_CACHE *cache; WT_DECL_RET; @@ -67,13 +67,10 @@ __wt_page_alloc(WT_SESSION_IMPL *session, uint8_t type, switch (type) { case WT_PAGE_COL_FIX: - page->pg_fix_recno = recno; page->pg_fix_entries = alloc_entries; break; case WT_PAGE_COL_INT: case WT_PAGE_ROW_INT: - page->pg_intl_recno = recno; - /* * Internal pages have an array of references to objects so they * can split. 
Allocate the array of references and optionally, @@ -105,7 +102,6 @@ err: if ((pindex = WT_INTL_INDEX_GET_SAFE(page)) != NULL) { } break; case WT_PAGE_COL_VAR: - page->pg_var_recno = recno; page->pg_var_d = (WT_COL *)((uint8_t *)page + sizeof(WT_PAGE)); page->pg_var_entries = alloc_entries; break; @@ -191,8 +187,7 @@ __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, } /* Allocate and initialize a new WT_PAGE. */ - WT_RET(__wt_page_alloc( - session, dsk->type, dsk->recno, alloc_entries, true, &page)); + WT_RET(__wt_page_alloc(session, dsk->type, alloc_entries, true, &page)); page->dsk = dsk; F_SET_ATOMIC(page, flags); @@ -211,7 +206,7 @@ __wt_page_inmem(WT_SESSION_IMPL *session, WT_REF *ref, __inmem_col_int(session, page); break; case WT_PAGE_COL_VAR: - WT_ERR(__inmem_col_var(session, page, &size)); + WT_ERR(__inmem_col_var(session, page, dsk->recno, &size)); break; case WT_PAGE_ROW_INT: WT_ERR(__inmem_row_int(session, page, &size)); @@ -292,7 +287,7 @@ __inmem_col_int(WT_SESSION_IMPL *session, WT_PAGE *page) __wt_cell_unpack(cell, unpack); ref->addr = cell; - ref->key.recno = unpack->v; + ref->ref_recno = unpack->v; } } @@ -329,7 +324,8 @@ __inmem_col_var_repeats(WT_SESSION_IMPL *session, WT_PAGE *page, uint32_t *np) * column-store trees. 
*/ static int -__inmem_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep) +__inmem_col_var( + WT_SESSION_IMPL *session, WT_PAGE *page, uint64_t recno, size_t *sizep) { WT_BTREE *btree; WT_COL *cip; @@ -337,13 +333,12 @@ __inmem_col_var(WT_SESSION_IMPL *session, WT_PAGE *page, size_t *sizep) WT_CELL *cell; WT_CELL_UNPACK *unpack, _unpack; const WT_PAGE_HEADER *dsk; - uint64_t recno, rle; + uint64_t rle; size_t bytes_allocated; uint32_t i, indx, n, repeat_off; btree = S2BT(session); dsk = page->dsk; - recno = page->pg_var_recno; repeats = NULL; repeat_off = 0; diff --git a/src/third_party/wiredtiger/src/btree/bt_read.c b/src/third_party/wiredtiger/src/btree/bt_read.c index 5cf6a9bf2bc..89d16a3f827 100644 --- a/src/third_party/wiredtiger/src/btree/bt_read.c +++ b/src/third_party/wiredtiger/src/btree/bt_read.c @@ -326,7 +326,7 @@ __evict_force_check(WT_SESSION_IMPL *session, WT_REF *ref) __wt_page_evict_soon(page); /* Bump the oldest ID, we're about to do some visibility checks. */ - __wt_txn_update_oldest(session, false); + WT_RET(__wt_txn_update_oldest(session, false)); /* If eviction cannot succeed, don't try. */ return (__wt_page_can_evict(session, ref, NULL)); @@ -377,9 +377,7 @@ __page_read(WT_SESSION_IMPL *session, WT_REF *ref) if (addr == NULL) { WT_ASSERT(session, previous_state == WT_REF_DELETED); - WT_ERR(__wt_btree_new_leaf_page(session, - btree->type == BTREE_ROW ? 
WT_RECNO_OOB : ref->key.recno, - &page)); + WT_ERR(__wt_btree_new_leaf_page(session, &page)); ref->page = page; goto done; } @@ -463,6 +461,8 @@ __wt_page_in_func(WT_SESSION_IMPL *session, WT_REF *ref, uint32_t flags btree = S2BT(session); + WT_STAT_FAST_CONN_INCR(session, cache_pages_requested); + WT_STAT_FAST_DATA_INCR(session, cache_pages_requested); for (evict_soon = stalled = false, force_attempts = 0, sleep_cnt = wait_cnt = 0;;) { switch (ref->state) { diff --git a/src/third_party/wiredtiger/src/btree/bt_rebalance.c b/src/third_party/wiredtiger/src/btree/bt_rebalance.c index d94eb2ddd80..de54e8433a8 100644 --- a/src/third_party/wiredtiger/src/btree/bt_rebalance.c +++ b/src/third_party/wiredtiger/src/btree/bt_rebalance.c @@ -90,7 +90,7 @@ __rebalance_leaf_append(WT_SESSION_IMPL *session, if (recno == WT_RECNO_OOB) WT_RET(__wt_row_ikey(session, 0, key, key_len, copy)); else - copy->key.recno = recno; + copy->ref_recno = recno; copy->page_del = NULL; return (0); @@ -147,8 +147,7 @@ __rebalance_internal(WT_SESSION_IMPL *session, WT_REBALANCE_STUFF *rs) leaf_next = (uint32_t)rs->leaf_next; /* Allocate a row-store root (internal) page and fill it in. */ - WT_RET(__wt_page_alloc(session, rs->type, - rs->type == WT_PAGE_COL_INT ? 1 : 0, leaf_next, false, &page)); + WT_RET(__wt_page_alloc(session, rs->type, leaf_next, false, &page)); page->pg_intl_parent_ref = &btree->root; WT_ERR(__wt_page_modify_init(session, page)); __wt_page_modify_set(session, page); diff --git a/src/third_party/wiredtiger/src/btree/bt_ret.c b/src/third_party/wiredtiger/src/btree/bt_ret.c index ebc0499f6a2..8ef2db67e7b 100644 --- a/src/third_party/wiredtiger/src/btree/bt_ret.c +++ b/src/third_party/wiredtiger/src/btree/bt_ret.c @@ -46,7 +46,7 @@ __wt_kv_return(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_UPDATE *upd) } /* Take the value from the original page. 
*/ - v = __bit_getv_recno(page, cbt->iface.recno, btree->bitcnt); + v = __bit_getv_recno(cbt->ref, cursor->recno, btree->bitcnt); return (__wt_buf_set(session, &cursor->value, &v, 1)); case WT_PAGE_COL_VAR: /* diff --git a/src/third_party/wiredtiger/src/btree/bt_slvg.c b/src/third_party/wiredtiger/src/btree/bt_slvg.c index 0e064d306b6..9b5e4daf74a 100644 --- a/src/third_party/wiredtiger/src/btree/bt_slvg.c +++ b/src/third_party/wiredtiger/src/btree/bt_slvg.c @@ -116,8 +116,8 @@ struct __wt_track { static int __slvg_cleanup(WT_SESSION_IMPL *, WT_STUFF *); static int __slvg_col_build_internal(WT_SESSION_IMPL *, uint32_t, WT_STUFF *); static int __slvg_col_build_leaf(WT_SESSION_IMPL *, WT_TRACK *, WT_REF *); -static int __slvg_col_ovfl( - WT_SESSION_IMPL *, WT_TRACK *, WT_PAGE *, uint64_t, uint64_t); +static int __slvg_col_ovfl(WT_SESSION_IMPL *, + WT_TRACK *, WT_PAGE *, uint64_t, uint64_t, uint64_t); static int __slvg_col_range(WT_SESSION_IMPL *, WT_STUFF *); static int __slvg_col_range_missing(WT_SESSION_IMPL *, WT_STUFF *); static int __slvg_col_range_overlap( @@ -166,11 +166,13 @@ __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]) WT_DECL_RET; WT_STUFF *ss, stuff; uint32_t i, leaf_cnt; + bool evict_reset; WT_UNUSED(cfg); btree = S2BT(session); bm = btree->bm; + evict_reset = false; WT_CLEAR(stuff); ss = &stuff; @@ -182,6 +184,13 @@ __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]) WT_ERR(__wt_scr_alloc(session, 0, &ss->tmp2)); /* + * Salvage handles its own page eviction; get exclusive access to the + * file, have eviction ignore the tree entirely. + */ + WT_ERR(__wt_evict_file_exclusive_on(session)); + evict_reset = true; + + /* * Step 1: * Inform the underlying block manager that we're salvaging the file. 
*/ @@ -295,13 +304,13 @@ __wt_bt_salvage(WT_SESSION_IMPL *session, WT_CKPT *ckptbase, const char *cfg[]) case WT_PAGE_COL_VAR: WT_WITH_PAGE_INDEX(session, ret = __slvg_col_build_internal( - session, leaf_cnt, ss)); + session, leaf_cnt, ss)); WT_ERR(ret); break; case WT_PAGE_ROW_LEAF: WT_WITH_PAGE_INDEX(session, ret = __slvg_row_build_internal( - session, leaf_cnt, ss)); + session, leaf_cnt, ss)); WT_ERR(ret); break; } @@ -341,6 +350,9 @@ err: WT_TRET(bm->salvage_end(bm, session)); if (ss->root_ref.page != NULL) __wt_ref_out(session, &ss->root_ref); + if (evict_reset) + __wt_evict_file_exclusive_off(session); + /* Discard the leaf and overflow page memory. */ WT_TRET(__slvg_cleanup(session, ss)); @@ -1159,7 +1171,7 @@ __slvg_col_build_internal( /* Allocate a column-store root (internal) page and fill it in. */ WT_RET(__wt_page_alloc( - session, WT_PAGE_COL_INT, 1, leaf_cnt, true, &page)); + session, WT_PAGE_COL_INT, leaf_cnt, true, &page)); WT_ERR(__slvg_modify_init(session, page)); pindex = WT_INTL_INDEX_GET_SAFE(page); @@ -1180,7 +1192,7 @@ __slvg_col_build_internal( ref->addr = addr; addr = NULL; - ref->key.recno = trk->col_start; + ref->ref_recno = trk->col_start; ref->state = WT_REF_DISK; /* @@ -1223,7 +1235,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) WT_DECL_RET; WT_PAGE *page; WT_SALVAGE_COOKIE *cookie, _cookie; - uint64_t skip, take; + uint64_t recno, skip, take; uint32_t *entriesp, save_entries; cookie = &_cookie; @@ -1243,7 +1255,8 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) * Calculate the number of K/V entries we are going to skip, and * the total number of K/V entries we'll take from this page. 
*/ - cookie->skip = skip = trk->col_start - page->pg_var_recno; + recno = page->dsk->recno; + cookie->skip = skip = trk->col_start - recno; cookie->take = take = (trk->col_stop - trk->col_start) + 1; WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE, @@ -1255,7 +1268,7 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) /* Set the referenced flag on overflow pages we're using. */ if (page->type == WT_PAGE_COL_VAR && trk->trk_ovfl_cnt != 0) - WT_ERR(__slvg_col_ovfl(session, trk, page, skip, take)); + WT_ERR(__slvg_col_ovfl(session, trk, page, recno, skip, take)); /* * If we're missing some part of the range, the real start range is in @@ -1263,9 +1276,9 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) * reference as well as the page itself. */ if (trk->col_missing == 0) - page->pg_var_recno = trk->col_start; + ref->ref_recno = trk->col_start; else { - page->pg_var_recno = trk->col_missing; + ref->ref_recno = trk->col_missing; cookie->missing = trk->col_start - trk->col_missing; WT_ERR(__wt_verbose(session, WT_VERB_SALVAGE, @@ -1274,7 +1287,6 @@ __slvg_col_build_leaf(WT_SESSION_IMPL *session, WT_TRACK *trk, WT_REF *ref) session, trk->trk_addr, trk->trk_addr_size, trk->ss->tmp1), cookie->missing)); } - ref->key.recno = page->pg_var_recno; /* * We can't discard the original blocks associated with this page now. @@ -1338,21 +1350,20 @@ __slvg_col_ovfl_single( * Mark overflow items referenced by the merged page. */ static int -__slvg_col_ovfl(WT_SESSION_IMPL *session, - WT_TRACK *trk, WT_PAGE *page, uint64_t skip, uint64_t take) +__slvg_col_ovfl(WT_SESSION_IMPL *session, WT_TRACK *trk, + WT_PAGE *page, uint64_t recno, uint64_t skip, uint64_t take) { WT_CELL_UNPACK unpack; WT_CELL *cell; WT_COL *cip; WT_DECL_RET; - uint64_t recno, start, stop; + uint64_t start, stop; uint32_t i; /* * Merging a variable-length column-store page, and we took some number * of records, figure out which (if any) overflow records we used. 
*/ - recno = page->pg_var_recno; start = recno + skip; stop = (recno + skip + take) - 1; @@ -1816,7 +1827,7 @@ __slvg_row_build_internal( /* Allocate a row-store root (internal) page and fill it in. */ WT_RET(__wt_page_alloc( - session, WT_PAGE_ROW_INT, WT_RECNO_OOB, leaf_cnt, true, &page)); + session, WT_PAGE_ROW_INT, leaf_cnt, true, &page)); WT_ERR(__slvg_modify_init(session, page)); pindex = WT_INTL_INDEX_GET_SAFE(page); diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index 4f16a290958..2d7b0a0030f 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -207,8 +207,8 @@ __split_verify_intl_key_order(WT_SESSION_IMPL *session, WT_PAGE *page) WT_INTL_FOREACH_BEGIN(session, page, ref) { WT_ASSERT(session, ref->home == page); - WT_ASSERT(session, ref->key.recno > recno); - recno = ref->key.recno; + WT_ASSERT(session, ref->ref_recno > recno); + recno = ref->ref_recno; } WT_INTL_FOREACH_END; break; case WT_PAGE_ROW_INT: @@ -335,7 +335,7 @@ __split_ref_move(WT_SESSION_IMPL *session, WT_PAGE *from_home, if ((ikey = __wt_ref_key_instantiated(ref)) == NULL) { __wt_ref_key(from_home, ref, &key, &size); WT_RET(__wt_row_ikey(session, 0, key, size, ref)); - ikey = ref->key.ikey; + ikey = ref->ref_ikey; } else { WT_RET( __split_ovfl_key_cleanup(session, from_home, ref)); @@ -529,7 +529,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_REF **child_refp, *ref, **root_refp; WT_SPLIT_ERROR_PHASE complete; size_t child_incr, root_decr, root_incr, size; - uint64_t recno, split_gen; + uint64_t split_gen; uint32_t children, chunk, i, j, remain; uint32_t slots; void *p; @@ -593,10 +593,8 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) alloc_refp = alloc_index->index, i = 0; i < children; ++i) { slots = i == children - 1 ? remain : chunk; - recno = root->type == WT_PAGE_COL_INT ? 
- (*root_refp)->key.recno : WT_RECNO_OOB; WT_ERR(__wt_page_alloc( - session, root->type, recno, slots, false, &child)); + session, root->type, slots, false, &child)); /* * Initialize the page's child reference; we need a copy of the @@ -611,7 +609,7 @@ __split_root(WT_SESSION_IMPL *session, WT_PAGE *root) WT_ERR(__wt_row_ikey(session, 0, p, size, ref)); root_incr += sizeof(WT_IKEY) + size; } else - ref->key.recno = recno; + ref->ref_recno = (*root_refp)->ref_recno; ref->state = WT_REF_MEM; /* Initialize the child page. */ @@ -737,7 +735,6 @@ __split_parent(WT_SESSION_IMPL *session, WT_REF *ref, WT_REF **ref_new, alloc_index = pindex = NULL; parent_decr = 0; - parent_entries = 0; empty_parent = false; complete = WT_ERR_RETURN; @@ -1014,7 +1011,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_REF **child_refp, *page_ref, **page_refp, *ref; WT_SPLIT_ERROR_PHASE complete; size_t child_incr, page_decr, page_incr, parent_incr, size; - uint64_t recno, split_gen; + uint64_t split_gen; uint32_t children, chunk, i, j, remain; uint32_t slots; void *p; @@ -1099,10 +1096,8 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) for (alloc_refp = alloc_index->index + 1, i = 1; i < children; ++i) { slots = i == children - 1 ? remain : chunk; - recno = page->type == WT_PAGE_COL_INT ? - (*page_refp)->key.recno : WT_RECNO_OOB; WT_ERR(__wt_page_alloc( - session, page->type, recno, slots, false, &child)); + session, page->type, slots, false, &child)); /* * Initialize the page's child reference; we need a copy of the @@ -1117,7 +1112,7 @@ __split_internal(WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE *page) WT_ERR(__wt_row_ikey(session, 0, p, size, ref)); parent_incr += sizeof(WT_IKEY) + size; } else - ref->key.recno = recno; + ref->ref_recno = (*page_refp)->ref_recno; ref->state = WT_REF_MEM; /* Initialize the child page. 
*/ @@ -1483,6 +1478,15 @@ __split_multi_inmem( uint32_t i, slot; /* + * In 04/2016, we removed column-store record numbers from the WT_PAGE + * structure, leading to hard-to-debug problems because we corrupt the + * page if we search it using the wrong initial record number. For now, + * assert the record number is set. + */ + WT_ASSERT(session, + orig->type != WT_PAGE_COL_VAR || ref->ref_recno != 0); + + /* * This code re-creates an in-memory page that is part of a set created * while evicting a large page, and adds references to any unresolved * update chains to the new page. We get here due to choosing to keep @@ -1525,7 +1529,7 @@ __split_multi_inmem( /* Build a key. */ if (supd->ins == NULL) { slot = WT_ROW_SLOT(orig, supd->rip); - upd = orig->pg_row_upd[slot]; + upd = orig->modify->mod_row_update[slot]; WT_ERR(__wt_row_leaf_key( session, orig, supd->rip, key, false)); @@ -1588,7 +1592,7 @@ __split_multi_inmem_final(WT_PAGE *orig, WT_MULTI *multi) case WT_PAGE_ROW_LEAF: if (supd->ins == NULL) { slot = WT_ROW_SLOT(orig, supd->rip); - orig->pg_row_upd[slot] = NULL; + orig->modify->mod_row_update[slot] = NULL; } else supd->ins->upd = NULL; break; @@ -1605,11 +1609,16 @@ __split_multi_inmem_fail(WT_SESSION_IMPL *session, WT_PAGE *orig, WT_REF *ref) /* * We failed creating new in-memory pages. For error-handling reasons, * we've left the update chains referenced by both the original and - * new pages. Discard the new pages, setting a flag so the discard code - * doesn't discard the updates on the page. + * new pages. Discard the new allocated WT_REF structures and their + * pages (setting a flag so the discard code doesn't discard the updates + * on the page). + * + * Our callers allocate WT_REF arrays, then individual WT_REFs, check + * for uninitialized information. 
*/ - if (ref->page != NULL) { - F_SET_ATOMIC(ref->page, WT_PAGE_UPDATE_IGNORE); + if (ref != NULL) { + if (ref->page != NULL) + F_SET_ATOMIC(ref->page, WT_PAGE_UPDATE_IGNORE); __wt_free_ref(session, ref, orig->type, true); } } @@ -1627,7 +1636,6 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, WT_REF *ref; size_t incr; - addr = NULL; incr = 0; /* Allocate an underlying WT_REF. */ @@ -1635,9 +1643,24 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, ref = *refp; incr += sizeof(WT_REF); - /* Any parent reference is filled in by our caller. */ - ref->home = NULL; + /* + * Set the WT_REF key before (optionally) building the page, underlying + * column-store functions need the page's key space to search it. + */ + switch (page->type) { + case WT_PAGE_ROW_INT: + case WT_PAGE_ROW_LEAF: + ikey = multi->key.ikey; + WT_RET(__wt_row_ikey( + session, 0, WT_IKEY_DATA(ikey), ikey->size, ref)); + incr += sizeof(WT_IKEY) + ikey->size; + break; + default: + ref->ref_recno = multi->key.recno; + break; + } + /* If there's a disk image, build a page, otherwise set the address. */ if (multi->disk_image == NULL) { /* * Copy the address: we could simply take the buffer, but that @@ -1651,28 +1674,13 @@ __wt_multi_to_ref(WT_SESSION_IMPL *session, addr->type = multi->addr.type; WT_RET(__wt_strndup(session, multi->addr.addr, addr->size, &addr->addr)); - } else + ref->state = WT_REF_DISK; + } else { WT_RET(__split_multi_inmem(session, page, ref, multi)); - - switch (page->type) { - case WT_PAGE_ROW_INT: - case WT_PAGE_ROW_LEAF: - ikey = multi->key.ikey; - WT_RET(__wt_row_ikey( - session, 0, WT_IKEY_DATA(ikey), ikey->size, ref)); - incr += sizeof(WT_IKEY) + ikey->size; - break; - default: - ref->key.recno = multi->key.recno; - break; + ref->state = WT_REF_MEM; } - ref->state = addr != NULL ? WT_REF_DISK : WT_REF_MEM; - - /* - * If our caller wants to track the memory allocations, we have a return - * reference. - */ + /* Optionally return changes in the memory footprint. 
*/ if (incrp != NULL) *incrp += incr; return (0); @@ -1773,17 +1781,12 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) parent_incr += sizeof(WT_IKEY) + key->size; __wt_scr_free(session, &key); } else - child->key.recno = ref->key.recno; + child->ref_recno = ref->ref_recno; /* * The second page in the split is a new WT_REF/page pair. */ - if (type == WT_PAGE_ROW_LEAF) - WT_ERR(__wt_page_alloc(session, - type, WT_RECNO_OOB, 0, false, &right)); - else - WT_ERR(__wt_page_alloc(session, - type, WT_INSERT_RECNO(moved_ins), 0, false, &right)); + WT_ERR(__wt_page_alloc(session, type, 0, false, &right)); /* * The new page is dirty by definition, plus column-store splits update @@ -1793,11 +1796,15 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) __wt_page_modify_set(session, right); if (type == WT_PAGE_ROW_LEAF) { - WT_ERR(__wt_calloc_one(session, &right->pg_row_ins)); - WT_ERR(__wt_calloc_one(session, &right->pg_row_ins[0])); + WT_ERR(__wt_calloc_one( + session, &right->modify->mod_row_insert)); + WT_ERR(__wt_calloc_one( + session, &right->modify->mod_row_insert[0])); } else { - WT_ERR(__wt_calloc_one(session, &right->modify->mod_append)); - WT_ERR(__wt_calloc_one(session, &right->modify->mod_append[0])); + WT_ERR(__wt_calloc_one( + session, &right->modify->mod_col_append)); + WT_ERR(__wt_calloc_one( + session, &right->modify->mod_col_append[0])); } right_incr += sizeof(WT_INSERT_HEAD); right_incr += sizeof(WT_INSERT_HEAD *); @@ -1814,7 +1821,7 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) child)); parent_incr += sizeof(WT_IKEY) + WT_INSERT_KEY_SIZE(moved_ins); } else - child->key.recno = WT_INSERT_RECNO(moved_ins); + child->ref_recno = WT_INSERT_RECNO(moved_ins); /* * Allocation operations completed, we're going to split. 
@@ -1823,8 +1830,8 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) */ if (type != WT_PAGE_ROW_LEAF) { WT_ASSERT(session, - page->modify->mod_split_recno == WT_RECNO_OOB); - page->modify->mod_split_recno = child->key.recno; + page->modify->mod_col_split_recno == WT_RECNO_OOB); + page->modify->mod_col_split_recno = child->ref_recno; } /* @@ -1848,7 +1855,7 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) * can be ignored.) */ tmp_ins_head = type == WT_PAGE_ROW_LEAF ? - right->pg_row_ins[0] : right->modify->mod_append[0]; + right->modify->mod_row_insert[0] : right->modify->mod_col_append[0]; tmp_ins_head->head[0] = tmp_ins_head->tail[0] = moved_ins; /* @@ -1970,7 +1977,7 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) * Reset the split column-store page record. */ if (type != WT_PAGE_ROW_LEAF) - page->modify->mod_split_recno = WT_RECNO_OOB; + page->modify->mod_col_split_recno = WT_RECNO_OOB; /* * Clear the allocated page's reference to the moved insert list element @@ -1983,11 +1990,11 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) * lists have. 
*/ if (type == WT_PAGE_ROW_LEAF) - right->pg_row_ins[0]->head[0] = - right->pg_row_ins[0]->tail[0] = NULL; + right->modify->mod_row_insert[0]->head[0] = + right->modify->mod_row_insert[0]->tail[0] = NULL; else - right->modify->mod_append[0]->head[0] = - right->modify->mod_append[0]->tail[0] = NULL; + right->modify->mod_col_append[0]->head[0] = + right->modify->mod_col_append[0]->tail[0] = NULL; ins_head->tail[0]->next[0] = moved_ins; ins_head->tail[0] = moved_ins; @@ -1999,12 +2006,12 @@ err: if (split_ref[0] != NULL) { ref->addr = split_ref[0]->addr; if (type == WT_PAGE_ROW_LEAF) - __wt_free(session, split_ref[0]->key.ikey); + __wt_free(session, split_ref[0]->ref_ikey); __wt_free(session, split_ref[0]); } if (split_ref[1] != NULL) { if (type == WT_PAGE_ROW_LEAF) - __wt_free(session, split_ref[1]->key.ikey); + __wt_free(session, split_ref[1]->ref_ikey); __wt_free(session, split_ref[1]); } if (right != NULL) { @@ -2170,7 +2177,7 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref) WT_DECL_RET; WT_PAGE *page; WT_PAGE_MODIFY *mod; - WT_REF new; + WT_REF *new; page = ref->page; mod = page->modify; @@ -2187,9 +2194,15 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref) * exactly what we want to do. * * Build the new page. + * + * Allocate a WT_REF because the error path uses routines that will + * free memory. The only field we need to set is the record number, as + * it's used by the search routines. */ - memset(&new, 0, sizeof(new)); - WT_ERR(__split_multi_inmem(session, page, &new, &mod->mod_multi[0])); + WT_RET(__wt_calloc_one(session, &new)); + new->ref_recno = ref->ref_recno; + + WT_ERR(__split_multi_inmem(session, page, new, &mod->mod_multi[0])); /* * The rewrite succeeded, we can no longer fail. @@ -2209,11 +2222,12 @@ __wt_split_rewrite(WT_SESSION_IMPL *session, WT_REF *ref) __wt_ref_out(session, ref); /* Swap the new page into place. 
*/ - ref->page = new.page; + ref->page = new->page; WT_PUBLISH(ref->state, WT_REF_MEM); + __wt_free(session, new); return (0); -err: __split_multi_inmem_fail(session, page, &new); +err: __split_multi_inmem_fail(session, page, new); return (ret); } diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c index 826589f8bdd..5d60c436a08 100644 --- a/src/third_party/wiredtiger/src/btree/bt_sync.c +++ b/src/third_party/wiredtiger/src/btree/bt_sync.c @@ -81,7 +81,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) if (__wt_page_is_modified(page) && WT_TXNID_LT(page->modify->update_txn, oldest_id)) { if (txn->isolation == WT_ISO_READ_COMMITTED) - __wt_txn_get_snapshot(session); + WT_ERR(__wt_txn_get_snapshot(session)); leaf_bytes += page->memory_footprint; ++leaf_pages; WT_ERR(__wt_reconcile(session, walk, NULL, 0)); @@ -100,7 +100,7 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) * the metadata shouldn't be that big, and (b) if we do ever */ if (txn->isolation == WT_ISO_READ_COMMITTED) - __wt_txn_get_snapshot(session); + WT_ERR(__wt_txn_get_snapshot(session)); /* * We cannot check the tree modified flag in the case of a diff --git a/src/third_party/wiredtiger/src/btree/bt_vrfy.c b/src/third_party/wiredtiger/src/btree/bt_vrfy.c index 83dc7924312..531a0dc125a 100644 --- a/src/third_party/wiredtiger/src/btree/bt_vrfy.c +++ b/src/third_party/wiredtiger/src/btree/bt_vrfy.c @@ -355,7 +355,7 @@ __verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs) if (vs->dump_blocks) WT_RET(__wt_debug_disk(session, page->dsk, NULL)); if (vs->dump_pages) - WT_RET(__wt_debug_page(session, page, NULL)); + WT_RET(__wt_debug_page(session, ref, NULL)); #endif /* @@ -364,13 +364,11 @@ __verify_tree(WT_SESSION_IMPL *session, WT_REF *ref, WT_VSTUFF *vs) */ switch (page->type) { case WT_PAGE_COL_FIX: - recno = page->pg_fix_recno; - goto recno_chk; case WT_PAGE_COL_INT: - recno = page->pg_intl_recno; + recno = 
ref->ref_recno; goto recno_chk; case WT_PAGE_COL_VAR: - recno = page->pg_var_recno; + recno = ref->ref_recno; recno_chk: if (recno != vs->record_total + 1) WT_RET_MSG(session, WT_ERROR, "page at %s has a starting record of %" PRIu64 @@ -485,7 +483,7 @@ celltype_err: WT_RET_MSG(session, WT_ERROR, * reviewed to this point. */ ++entry; - if (child_ref->key.recno != vs->record_total + 1) { + if (child_ref->ref_recno != vs->record_total + 1) { WT_RET_MSG(session, WT_ERROR, "the starting record number in entry %" PRIu32 " of the column internal page at " @@ -494,7 +492,7 @@ celltype_err: WT_RET_MSG(session, WT_ERROR, entry, __wt_page_addr_string( session, child_ref, vs->tmp1), - child_ref->key.recno, + child_ref->ref_recno, vs->record_total + 1); } diff --git a/src/third_party/wiredtiger/src/btree/col_modify.c b/src/third_party/wiredtiger/src/btree/col_modify.c index fd60b12538a..a7920da5267 100644 --- a/src/third_party/wiredtiger/src/btree/col_modify.c +++ b/src/third_party/wiredtiger/src/btree/col_modify.c @@ -55,7 +55,8 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, */ if (recno == WT_RECNO_OOB || recno > (btree->type == BTREE_COL_VAR ? - __col_var_last_recno(page) : __col_fix_last_recno(page))) + __col_var_last_recno(cbt->ref) : + __col_fix_last_recno(cbt->ref))) append = true; } @@ -107,17 +108,17 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, /* Allocate the append/update list reference as necessary. 
*/ if (append) { WT_PAGE_ALLOC_AND_SWAP(session, - page, mod->mod_append, ins_headp, 1); - ins_headp = &mod->mod_append[0]; + page, mod->mod_col_append, ins_headp, 1); + ins_headp = &mod->mod_col_append[0]; } else if (page->type == WT_PAGE_COL_FIX) { WT_PAGE_ALLOC_AND_SWAP(session, - page, mod->mod_update, ins_headp, 1); - ins_headp = &mod->mod_update[0]; + page, mod->mod_col_update, ins_headp, 1); + ins_headp = &mod->mod_col_update[0]; } else { WT_PAGE_ALLOC_AND_SWAP(session, - page, mod->mod_update, ins_headp, + page, mod->mod_col_update, ins_headp, page->pg_var_entries); - ins_headp = &mod->mod_update[cbt->slot]; + ins_headp = &mod->mod_col_update[cbt->slot]; } /* Allocate the WT_INSERT_HEAD structure as necessary. */ @@ -142,8 +143,9 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, * it's easy (as opposed to in row-store) and a difficult bug to * otherwise diagnose. */ - WT_ASSERT(session, mod->mod_split_recno == WT_RECNO_OOB || - (recno != WT_RECNO_OOB && mod->mod_split_recno > recno)); + WT_ASSERT(session, mod->mod_col_split_recno == WT_RECNO_OOB || + (recno != WT_RECNO_OOB && + mod->mod_col_split_recno > recno)); if (upd_arg == NULL) { WT_ERR( diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c index 4730267a545..6c96181d3bf 100644 --- a/src/third_party/wiredtiger/src/btree/col_srch.c +++ b/src/third_party/wiredtiger/src/btree/col_srch.c @@ -30,7 +30,7 @@ __check_leaf_key_range(WT_SESSION_IMPL *session, * Check if the search key is smaller than the parent's starting key for * this page. 
*/ - if (recno < leaf->key.recno) { + if (recno < leaf->ref_recno) { cbt->compare = 1; /* page keys > search key */ return (0); } @@ -48,7 +48,7 @@ __check_leaf_key_range(WT_SESSION_IMPL *session, WT_INTL_INDEX_GET(session, leaf->home, pindex); indx = leaf->pindex_hint; if (indx + 1 < pindex->entries && pindex->index[indx] == leaf) - if (recno >= pindex->index[indx + 1]->key.recno) { + if (recno >= pindex->index[indx + 1]->ref_recno) { cbt->compare = -1; /* page keys < search key */ return (0); } @@ -133,14 +133,12 @@ restart: /* if (page->type != WT_PAGE_COL_INT) break; - WT_ASSERT(session, current->key.recno == page->pg_intl_recno); - WT_INTL_INDEX_GET(session, page, pindex); base = pindex->entries; descent = pindex->index[base - 1]; /* Fast path appends. */ - if (recno >= descent->key.recno) { + if (recno >= descent->ref_recno) { /* * If on the last slot (the key is larger than any key * on the page), check for an internal page split race. @@ -158,9 +156,9 @@ restart: /* indx = base + (limit >> 1); descent = pindex->index[indx]; - if (recno == descent->key.recno) + if (recno == descent->ref_recno) break; - if (recno < descent->key.recno) + if (recno < descent->ref_recno) continue; base = indx + 1; --limit; @@ -172,7 +170,7 @@ descend: /* * (last + 1) index. The slot for descent is the one before * base. */ - if (recno != descent->key.recno) { + if (recno != descent->ref_recno) { /* * We don't have to correct for base == 0 because the * only way for base to be 0 is if recno is the page's @@ -237,13 +235,13 @@ leaf_only: * do in that case, the record may be appended to the page. 
*/ if (page->type == WT_PAGE_COL_FIX) { - if (recno < page->pg_fix_recno) { - cbt->recno = page->pg_fix_recno; + if (recno < current->ref_recno) { + cbt->recno = current->ref_recno; cbt->compare = 1; return (0); } - if (recno >= page->pg_fix_recno + page->pg_fix_entries) { - cbt->recno = page->pg_fix_recno + page->pg_fix_entries; + if (recno >= current->ref_recno + page->pg_fix_entries) { + cbt->recno = current->ref_recno + page->pg_fix_entries; goto past_end; } else { cbt->recno = recno; @@ -251,14 +249,14 @@ leaf_only: ins_head = WT_COL_UPDATE_SINGLE(page); } } else { - if (recno < page->pg_var_recno) { - cbt->recno = page->pg_var_recno; + if (recno < current->ref_recno) { + cbt->recno = current->ref_recno; cbt->slot = 0; cbt->compare = 1; return (0); } - if ((cip = __col_var_search(page, recno, NULL)) == NULL) { - cbt->recno = __col_var_last_recno(page); + if ((cip = __col_var_search(current, recno, NULL)) == NULL) { + cbt->recno = __col_var_last_recno(current); cbt->slot = page->pg_var_entries == 0 ? 
0 : page->pg_var_entries - 1; goto past_end; diff --git a/src/third_party/wiredtiger/src/btree/row_key.c b/src/third_party/wiredtiger/src/btree/row_key.c index 9fff092d079..83fd2dad9e4 100644 --- a/src/third_party/wiredtiger/src/btree/row_key.c +++ b/src/third_party/wiredtiger/src/btree/row_key.c @@ -517,7 +517,7 @@ __wt_row_ikey(WT_SESSION_IMPL *session, { uintptr_t oldv; - oldv = (uintptr_t)ref->key.ikey; + oldv = (uintptr_t)ref->ref_ikey; WT_DIAGNOSTIC_YIELD; /* @@ -527,10 +527,10 @@ __wt_row_ikey(WT_SESSION_IMPL *session, WT_ASSERT(session, oldv == 0 || (oldv & WT_IK_FLAG) != 0); WT_ASSERT(session, ref->state != WT_REF_SPLIT); WT_ASSERT(session, - __wt_atomic_cas_ptr(&ref->key.ikey, (WT_IKEY *)oldv, ikey)); + __wt_atomic_cas_ptr(&ref->ref_ikey, (WT_IKEY *)oldv, ikey)); } #else - ref->key.ikey = ikey; + ref->ref_ikey = ikey; #endif return (0); } diff --git a/src/third_party/wiredtiger/src/btree/row_modify.c b/src/third_party/wiredtiger/src/btree/row_modify.c index 176016bb340..f0424ff93b4 100644 --- a/src/third_party/wiredtiger/src/btree/row_modify.c +++ b/src/third_party/wiredtiger/src/btree/row_modify.c @@ -53,6 +53,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT *ins; WT_INSERT_HEAD *ins_head, **ins_headp; WT_PAGE *page; + WT_PAGE_MODIFY *mod; WT_UPDATE *old_upd, *upd, **upd_entry; size_t ins_size, upd_size; uint32_t ins_slot; @@ -70,6 +71,7 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, /* If we don't yet have a modify structure, we'll need one. */ WT_RET(__wt_page_modify_init(session, page)); + mod = page->modify; /* * Modify: allocate an update array as necessary, build a WT_UPDATE @@ -83,11 +85,12 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, if (cbt->compare == 0) { if (cbt->ins == NULL) { /* Allocate an update array as necessary. 
*/ - WT_PAGE_ALLOC_AND_SWAP(session, page, - page->pg_row_upd, upd_entry, page->pg_row_entries); + WT_PAGE_ALLOC_AND_SWAP(session, + page, mod->mod_row_update, + upd_entry, page->pg_row_entries); /* Set the WT_UPDATE array reference. */ - upd_entry = &page->pg_row_upd[cbt->slot]; + upd_entry = &mod->mod_row_update[cbt->slot]; } else upd_entry = &cbt->ins->upd; @@ -144,11 +147,11 @@ __wt_row_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, * slot. That's hard, so we set a flag. */ WT_PAGE_ALLOC_AND_SWAP(session, page, - page->pg_row_ins, ins_headp, page->pg_row_entries + 1); + mod->mod_row_insert, ins_headp, page->pg_row_entries + 1); ins_slot = F_ISSET(cbt, WT_CBT_SEARCH_SMALLEST) ? page->pg_row_entries: cbt->slot; - ins_headp = &page->pg_row_ins[ins_slot]; + ins_headp = &mod->mod_row_insert[ins_slot]; /* Allocate the WT_INSERT_HEAD structure as necessary. */ WT_PAGE_ALLOC_AND_SWAP(session, page, *ins_headp, ins_head, 1); diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c index 6169a0a810a..a631764be7e 100644 --- a/src/third_party/wiredtiger/src/btree/row_srch.c +++ b/src/third_party/wiredtiger/src/btree/row_srch.c @@ -634,6 +634,7 @@ __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_INSERT *ins, **start, **stop; WT_INSERT_HEAD *ins_head; WT_PAGE *page; + uint64_t samples; uint32_t choice, entries, i; int level; @@ -688,7 +689,7 @@ __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) * Step down the skip list levels, selecting a random chunk of the name * space at each level. */ - while (level > 0) { + for (samples = entries; level > 0; samples += entries) { /* * There are (entries) or (entries + 1) chunks of the name space * considered at each level. 
They are: between start and the 1st @@ -765,6 +766,16 @@ __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) cbt->ins_head = ins_head; cbt->compare = 0; + /* + * Random lookups in newly created collections can be slow if a page + * consists of a large skiplist. Schedule the page for eviction if we + * encounter a large skiplist. This is worthwhile because applications + * that take a sample often take many samples, so the overhead of + * traversing the skip list each time accumulates to real time. + */ + if (samples > 5000) + __wt_page_evict_soon(page); + return (0); } |