diff options
author | Michael Cahill <michael.cahill@mongodb.com> | 2016-02-19 15:36:42 +1100 |
---|---|---|
committer | Michael Cahill <michael.cahill@mongodb.com> | 2016-02-19 15:36:47 +1100 |
commit | 70db6ed51f90f627570de9bf32ab8c5cd23886ca (patch) | |
tree | 1f5ae19d38fad8488657ae80ae9014b832ec858e /src/third_party | |
parent | da2441b59b742c077306be6515c999c33cd955a6 (diff) | |
download | mongo-70db6ed51f90f627570de9bf32ab8c5cd23886ca.tar.gz |
Import wiredtiger-wiredtiger-2.7.0-675-g4f38287.tar.gz from wiredtiger branch mongodb-3.4
ref: cc96d99..4f38287
SERVER-22676 WiredTiger fails to open databases created by 3.0.0 or 3.0.1
WT-2280 Add CRC32 Optimized code for PPC64LE
WT-2295 WT_SESSION.create does a full-scan of the main table
WT-2346 Don't hold schema lock during checkpoint I/O
WT-2361 Column-store starting record number error
WT-2367 WT_CURSOR.next out-of-order returns failure
WT-2374 Read error on index file
WT-2375 Need tests for collators
WT-2382 Problem with custom collator for 'u' format with join cursor
WT-2387 Fix cursor random unit test on Windows
WT-2390 OS X build is broken
WT-2393 Unnecessary error handling labels.
WT-2396 Jenkins Spinlock GCC task Hung
WT-2397 Cursor traversal from end of the tree skips records.
WT-2399 Add test case that verifies cursor traversal
WT-2411 LSM drop hang
Diffstat (limited to 'src/third_party')
68 files changed, 4076 insertions, 697 deletions
diff --git a/src/third_party/wiredtiger/build_posix/Make.subdirs b/src/third_party/wiredtiger/build_posix/Make.subdirs index e1f8a05c613..bc4283a4876 100644 --- a/src/third_party/wiredtiger/build_posix/Make.subdirs +++ b/src/third_party/wiredtiger/build_posix/Make.subdirs @@ -26,6 +26,7 @@ lang/python PYTHON # Make the tests test/bloom test/checkpoint +test/cursor_order test/fops test/format test/huge diff --git a/src/third_party/wiredtiger/build_posix/configure.ac.in b/src/third_party/wiredtiger/build_posix/configure.ac.in index 875c8b436a8..06d73e2fe12 100644 --- a/src/third_party/wiredtiger/build_posix/configure.ac.in +++ b/src/third_party/wiredtiger/build_posix/configure.ac.in @@ -32,6 +32,7 @@ AC_SUBST([LIBTOOL_DEPS]) AC_PROG_CC(cc gcc) AC_PROG_CXX(c++ g++) +AM_PROG_AS(as gas) if test "$GCC" = "yes"; then # The Solaris gcc compiler gets the additional -pthreads flag. diff --git a/src/third_party/wiredtiger/dist/filelist b/src/third_party/wiredtiger/dist/filelist index edd59435841..4ed7d7e3beb 100644 --- a/src/third_party/wiredtiger/dist/filelist +++ b/src/third_party/wiredtiger/dist/filelist @@ -163,6 +163,8 @@ src/support/hazard.c src/support/hex.c src/support/huffman.c src/support/pow.c +src/support/power8/crc32.S +src/support/power8/crc32_wrapper.c src/support/rand.c src/support/scratch.c src/support/stat.c diff --git a/src/third_party/wiredtiger/dist/flags.py b/src/third_party/wiredtiger/dist/flags.py index b97235b965a..d5784630ab8 100644 --- a/src/third_party/wiredtiger/dist/flags.py +++ b/src/third_party/wiredtiger/dist/flags.py @@ -114,6 +114,7 @@ flags = { 'SESSION_LOCK_NO_WAIT', 'SESSION_LOCKED_CHECKPOINT', 'SESSION_LOCKED_HANDLE_LIST', + 'SESSION_LOCKED_METADATA', 'SESSION_LOCKED_SCHEMA', 'SESSION_LOCKED_SLOT', 'SESSION_LOCKED_TABLE', diff --git a/src/third_party/wiredtiger/dist/s_funcs.list b/src/third_party/wiredtiger/dist/s_funcs.list index ed6cf43bb2f..8d32eecdfb7 100644 --- a/src/third_party/wiredtiger/dist/s_funcs.list +++ 
b/src/third_party/wiredtiger/dist/s_funcs.list @@ -1,4 +1,6 @@ # List of functions that aren't found by s_funcs, but that's OK. +FUNC_END +FUNC_START WT_CURDUMP_PASS __bit_ffs __bit_nclr diff --git a/src/third_party/wiredtiger/dist/s_longlines b/src/third_party/wiredtiger/dist/s_longlines index decedb58f44..000f33d51d5 100755 --- a/src/third_party/wiredtiger/dist/s_longlines +++ b/src/third_party/wiredtiger/dist/s_longlines @@ -9,8 +9,9 @@ l=`(cd .. && find dist -name '*.py' && find src -name '*.in') | sed -e '/dist\/stat_data\.py/d' \ - -e '/support\/stat\.c/d' \ - -e '/include\/extern\.h/d'` + -e '/include\/extern\.h/d' \ + -e '/support\/power8/d' \ + -e '/support\/stat\.c/d'` for f in $l ; do expand -t8 < ../$f | awk -- \ diff --git a/src/third_party/wiredtiger/dist/s_string.ok b/src/third_party/wiredtiger/dist/s_string.ok index 19fa27cd719..7a8f3a9b0bd 100644 --- a/src/third_party/wiredtiger/dist/s_string.ok +++ b/src/third_party/wiredtiger/dist/s_string.ok @@ -189,6 +189,7 @@ MALLOC MEM MEMALIGN MERCHANTABILITY +METADATA MONGODB MSVC MULTIBLOCK @@ -686,6 +687,7 @@ jnr jrx json kb +kbits keycmp keyid keyv @@ -838,6 +840,7 @@ pos posint posix postsize +powerpc pragmas pre prealloc diff --git a/src/third_party/wiredtiger/dist/s_style b/src/third_party/wiredtiger/dist/s_style index 44a5bdda741..78fb7a6eb03 100755 --- a/src/third_party/wiredtiger/dist/s_style +++ b/src/third_party/wiredtiger/dist/s_style @@ -18,7 +18,9 @@ if [ $# -ne 1 ]; then find bench examples ext src test \ -name '*.[chisy]' -o -name '*.in' -o -name '*.dox' | - sed -e '/Makefile.in/d' -e '/build_win\/wiredtiger_config.h/d' | + sed -e '/Makefile.in/d' \ + -e '/build_win\/wiredtiger_config.h/d' \ + -e '/support\/power8/d' | xargs $xp -n 1 -I{} sh ./dist/s_style {} else # General style correction and cleanup for a single file diff --git a/src/third_party/wiredtiger/dist/s_whitespace b/src/third_party/wiredtiger/dist/s_whitespace index d13de4b5989..74820a4f0e9 100755 --- 
a/src/third_party/wiredtiger/dist/s_whitespace +++ b/src/third_party/wiredtiger/dist/s_whitespace @@ -36,10 +36,9 @@ for f in `find bench examples ext src test \ -name '*.[chi]' -o \ -name '*.dox' -o \ -name '*.in' -o \ - -name 'Makefile.am'`; do - if expr "$f" : ".*/Makefile.in" > /dev/null; then - continue - fi + -name 'Makefile.am' | + sed -e '/Makefile.in/d' \ + -e '/support\/power8/d'`; do whitespace_and_empty_line $f done diff --git a/src/third_party/wiredtiger/dist/s_win b/src/third_party/wiredtiger/dist/s_win index 1eb4702d517..0b7d5184037 100755 --- a/src/third_party/wiredtiger/dist/s_win +++ b/src/third_party/wiredtiger/dist/s_win @@ -44,7 +44,7 @@ win_filelist() f='../build_win/filelist.win' # Process the files for which there's a Windows-specific version, then - # append Windows-only files. (There aren't yet any POSIX-only files.) + # append Windows-only files and discard POSIX-only files. (sed \ -e 's;os_posix/os_dir.c;os_win/os_dir.c;' \ -e 's;os_posix/os_dlopen.c;os_win/os_dlopen.c;' \ @@ -71,7 +71,9 @@ win_filelist() -e 's;os_posix/os_sleep.c;os_win/os_sleep.c;' \ -e 's;os_posix/os_thread.c;os_win/os_thread.c;' \ -e 's;os_posix/os_time.c;os_win/os_time.c;' \ - -e 's;os_posix/os_yield.c;os_win/os_yield.c;' + -e 's;os_posix/os_yield.c;os_win/os_yield.c;' \ + -e '/src\/support\/power8\/crc32.S/d' \ + -e '/src\/support\/power8\/crc32_wrapper.c/d' echo 'src/os_win/os_snprintf.c' echo 'src/os_win/os_vsnprintf.c') < filelist | sort > $t cmp $t $f > /dev/null 2>&1 || diff --git a/src/third_party/wiredtiger/src/btree/bt_curprev.c b/src/third_party/wiredtiger/src/btree/bt_curprev.c index a083ec4016e..7475c0f1312 100644 --- a/src/third_party/wiredtiger/src/btree/bt_curprev.c +++ b/src/third_party/wiredtiger/src/btree/bt_curprev.c @@ -51,7 +51,8 @@ restart: if (cbt->btree->type == BTREE_ROW) { key.data = WT_INSERT_KEY(current); key.size = WT_INSERT_KEY_SIZE(current); - WT_RET(__wt_search_insert(session, cbt, &key)); + WT_RET(__wt_search_insert( + session, cbt, 
cbt->ins_head, &key)); } else cbt->ins = __col_insert_search(cbt->ins_head, cbt->ins_stack, cbt->next_stack, diff --git a/src/third_party/wiredtiger/src/btree/bt_split.c b/src/third_party/wiredtiger/src/btree/bt_split.c index bd38451d5d1..3dea03316ce 100644 --- a/src/third_party/wiredtiger/src/btree/bt_split.c +++ b/src/third_party/wiredtiger/src/btree/bt_split.c @@ -1383,11 +1383,27 @@ __split_internal_should_split(WT_SESSION_IMPL *session, WT_REF *ref) static int __split_parent_climb(WT_SESSION_IMPL *session, WT_PAGE *page, bool page_hazard) { + WT_BTREE *btree; WT_DECL_RET; WT_PAGE *parent; WT_REF *ref; bool parent_hazard; + btree = S2BT(session); + + /* + * Disallow internal splits during the final pass of a checkpoint. Most + * splits are already disallowed during checkpoints, but an important + * exception is insert splits. The danger is an insert split creates a + * new chunk of the namespace, and then the internal split will move it + * to a different part of the tree where it will be written; in other + * words, in one part of the tree we'll skip the newly created insert + * split chunk, but we'll write it upon finding it in a different part + * of the tree. + */ + if (btree->checkpointing != WT_CKPT_OFF) + return (__split_internal_unlock(session, page, page_hazard)); + /* * Page splits trickle up the tree, that is, as leaf pages grow large * enough and are evicted, they'll split into their parent. And, as @@ -1771,8 +1787,8 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) type, WT_INSERT_RECNO(moved_ins), 0, false, &right)); /* - * The new page is dirty by definition, column-store splits update the - * page-modify structure, so create it now. + * The new page is dirty by definition, plus column-store splits update + * the page-modify structure, so create it now. 
*/ WT_ERR(__wt_page_modify_init(session, right)); __wt_page_modify_set(session, right); @@ -1813,15 +1829,6 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) } /* - * We modified the page above, which will have set the first dirty - * transaction to the last transaction current running. However, the - * updates we installed may be older than that. Set the first dirty - * transaction to an impossibly old value so this page is never skipped - * in a checkpoint. - */ - right->modify->first_dirty_txn = WT_TXN_FIRST; - - /* * Calculate how much memory we're moving: figure out how deep the skip * list stack is for the element we are moving, and the memory used by * the item's list of updates. @@ -1919,6 +1926,24 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) #endif /* + * We perform insert splits concurrently with checkpoints, where the + * requirement is a checkpoint must include either the original page + * or both new pages. The page we're splitting is dirty, but that's + * insufficient: set the first dirty transaction to an impossibly old + * value so this page is not skipped by a checkpoint. + */ + page->modify->first_dirty_txn = WT_TXN_FIRST; + + /* + * We modified the page above, which will have set the first dirty + * transaction to the last transaction current running. However, the + * updates we installed may be older than that. Set the first dirty + * transaction to an impossibly old value so this page is never skipped + * in a checkpoint. + */ + right->modify->first_dirty_txn = WT_TXN_FIRST; + + /* * Update the page accounting. * * XXX @@ -1928,10 +1953,14 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) __wt_cache_page_inmem_incr(session, right, right_incr); /* - * Split into the parent. On successful return, the original page is no - * longer locked, so we cannot safely look at it. + * The act of splitting into the parent releases the pages for eviction; + * ensure the page contents are consistent. 
+ */ + WT_WRITE_BARRIER(); + + /* + * Split into the parent. */ - page = NULL; if ((ret = __split_parent( session, ref, split_ref, 2, parent_incr, false, true)) == 0) return (0); @@ -1941,7 +1970,8 @@ __split_insert(WT_SESSION_IMPL *session, WT_REF *ref) * * Reset the split column-store page record. */ - page->modify->mod_split_recno = WT_RECNO_OOB; + if (type != WT_PAGE_ROW_LEAF) + page->modify->mod_split_recno = WT_RECNO_OOB; /* * Clear the allocated page's reference to the moved insert list element diff --git a/src/third_party/wiredtiger/src/btree/bt_sync.c b/src/third_party/wiredtiger/src/btree/bt_sync.c index 5cbd8d1e996..bbfb06c636f 100644 --- a/src/third_party/wiredtiger/src/btree/bt_sync.c +++ b/src/third_party/wiredtiger/src/btree/bt_sync.c @@ -105,13 +105,13 @@ __sync_file(WT_SESSION_IMPL *session, WT_CACHE_OP syncop) __wt_spin_lock(session, &btree->flush_lock); /* - * When internal pages are being reconciled by checkpoint their - * child pages cannot disappear from underneath them or be split - * into them, nor can underlying blocks be freed until the block - * lists for the checkpoint are stable. Set the checkpointing - * flag to block eviction of dirty pages until the checkpoint's - * internal page pass is complete, then wait for any existing - * eviction to complete. + * In the final checkpoint pass, child pages cannot be evicted + * from underneath internal pages nor can underlying blocks be + * freed until the checkpoint's block lists are stable. Also, + * we cannot split child pages into parents unless we know the + * final pass will write a consistent view of that namespace. + * Set the checkpointing flag to block such actions and wait for + * any problematic eviction or page splits to complete. 
*/ WT_PUBLISH(btree->checkpointing, WT_CKPT_PREPARE); diff --git a/src/third_party/wiredtiger/src/btree/bt_walk.c b/src/third_party/wiredtiger/src/btree/bt_walk.c index d7785c689d9..55b11d7b2d1 100644 --- a/src/third_party/wiredtiger/src/btree/bt_walk.c +++ b/src/third_party/wiredtiger/src/btree/bt_walk.c @@ -89,11 +89,11 @@ __ref_is_leaf(WT_REF *ref) } /* - * __page_ascend -- + * __ref_ascend -- * Ascend the tree one level. */ -static void -__page_ascend(WT_SESSION_IMPL *session, +static inline void +__ref_ascend(WT_SESSION_IMPL *session, WT_REF **refp, WT_PAGE_INDEX **pindexp, uint32_t *slotp) { WT_REF *parent_ref, *ref; @@ -163,12 +163,12 @@ __page_ascend(WT_SESSION_IMPL *session, } /* - * __page_descend -- - * Descend the tree one level. + * __ref_descend_prev -- + * Descend the tree one level, during a previous-cursor walk. */ -static void -__page_descend(WT_SESSION_IMPL *session, - WT_PAGE *page, WT_PAGE_INDEX **pindexp, uint32_t *slotp, bool prev) +static inline void +__ref_descend_prev( + WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp) { WT_PAGE_INDEX *pindex; @@ -177,9 +177,6 @@ __page_descend(WT_SESSION_IMPL *session, * we have a hazard pointer. */ for (;; __wt_yield()) { - WT_INTL_INDEX_GET(session, page, pindex); - *slotp = prev ? pindex->entries - 1 : 0; - /* * There's a split race when a cursor moving backwards through * the tree descends the tree. If we're splitting an internal @@ -233,21 +230,41 @@ __page_descend(WT_SESSION_IMPL *session, * being split and part of its namespace moved. We have the * correct page and we don't have to move, all we have to do is * wait until the split page's page index is updated. - * - * No test is necessary for a next-cursor movement because we - * do right-hand splits on internal pages and the initial part - * of the page's namespace won't change as part of a split. 
- * Instead of testing the direction boolean, do the test the - * previous cursor movement requires in all cases, even though - * it will always succeed for a next-cursor movement. */ - if (pindex->index[*slotp]->home == page) + WT_INTL_INDEX_GET(session, ref->page, pindex); + if (pindex->index[pindex->entries - 1]->home == ref->page) break; } *pindexp = pindex; } /* + * __ref_initial_descent_prev -- + * Descend the tree one level, when setting up the initial cursor position + * for a previous-cursor walk. + */ +static inline bool +__ref_initial_descent_prev( + WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX **pindexp) +{ + WT_PAGE_INDEX *pindex; + + /* + * We're passed a child page into which we're descending, and on which + * we have a hazard pointer. + * + * Acquire a page index for the child page and then confirm we haven't + * raced with a parent split. + */ + WT_INTL_INDEX_GET(session, ref->page, pindex); + if (__wt_split_descent_race(session, ref, *pindexp)) + return (false); + + *pindexp = pindex; + return (true); +} + +/* * __tree_walk_internal -- * Move to the next/previous page in the tree. */ @@ -259,11 +276,12 @@ __tree_walk_internal(WT_SESSION_IMPL *session, WT_DECL_RET; WT_PAGE_INDEX *pindex; WT_REF *couple, *couple_orig, *ref; - bool empty_internal, prev, skip; + bool empty_internal, initial_descent, prev, skip; uint32_t slot; btree = S2BT(session); - empty_internal = false; + pindex = NULL; + empty_internal = initial_descent = false; /* * Tree walks are special: they look inside page structures that splits @@ -323,22 +341,30 @@ __tree_walk_internal(WT_SESSION_IMPL *session, couple = couple_orig = ref = *refp; *refp = NULL; - /* If no page is active, begin a walk from the start of the tree. */ + /* If no page is active, begin a walk from the start/end of the tree. 
*/ if (ref == NULL) { - ref = &btree->root; +restart: /* + * We can reach here with a NULL or root reference; the release + * function handles them internally, don't complicate this code + * by calling them out. + */ + WT_ERR(__wt_page_release(session, couple, flags)); + + couple = couple_orig = ref = &btree->root; if (ref->page == NULL) goto done; + + initial_descent = true; goto descend; } /* - * If the active page was the root, we've reached the walk's end. - * Release any hazard-pointer we're holding. + * If the active page was the root, we've reached the walk's end; we + * only get here if we've returned the root to our caller, so we're + * holding no hazard pointers. */ - if (__wt_ref_is_root(ref)) { - WT_ERR(__wt_page_release(session, couple, flags)); + if (__wt_ref_is_root(ref)) goto done; - } /* Figure out the current slot in the WT_REF array. */ __ref_index_slot(session, ref, &pindex, &slot); @@ -352,7 +378,7 @@ __tree_walk_internal(WT_SESSION_IMPL *session, while ((prev && slot == 0) || (!prev && slot == pindex->entries - 1)) { /* Ascend to the parent. */ - __page_ascend(session, &ref, &pindex, &slot); + __ref_ascend(session, &ref, &pindex, &slot); /* * If we got all the way through an internal page and @@ -521,16 +547,21 @@ __tree_walk_internal(WT_SESSION_IMPL *session, ret = 0; /* + * If a cursor is setting up at the end of the + * tree, we can't use our parent page's index, + * because it may have already split; restart + * the walk. + */ + if (prev && initial_descent) + goto restart; + + /* * If a new walk that never coupled from the * root to a new saved position in the tree, * restart the walk. 
*/ - if (couple == &btree->root) { - ref = &btree->root; - if (ref->page == NULL) - goto done; - goto descend; - } + if (couple == &btree->root) + goto restart; /* * If restarting from some original position, @@ -561,10 +592,56 @@ __tree_walk_internal(WT_SESSION_IMPL *session, descend: couple = ref; empty_internal = true; - __page_descend( - session, ref->page, &pindex, &slot, prev); + /* + * There's a split race when a cursor is setting + * up at the end of the tree or moving backwards + * through the tree and descending a level. When + * splitting an internal page into its parent, + * we move the WT_REF structures and update the + * parent's page index before updating the split + * page's page index, and it's not an atomic + * update. A thread can read the parent page's + * replacement page index, then read the split + * page's original index, or the parent page's + * original and the split page's replacement. + * + * This isn't a problem for a cursor setting up + * at the start of the tree or moving forwards + * through the tree because we do right-hand + * splits on internal pages and the initial part + * of the split page's namespace won't change as + * part of a split. A thread reading the parent + * page's and split page's indexes will move to + * the same slot no matter what order of indexes + * are read. + * + * Handle a cursor setting up at the end of the + * tree or moving backwards through the tree. + */ + if (!prev) { + WT_INTL_INDEX_GET( + session, ref->page, pindex); + slot = 0; + } else if (initial_descent) { + if (!__ref_initial_descent_prev( + session, ref, &pindex)) + goto restart; + slot = pindex->entries - 1; + } else { + __ref_descend_prev( + session, ref, &pindex); + slot = pindex->entries - 1; + } } else { /* + * At the lowest tree level (considering a leaf + * page), turn off the initial-descent state. + * Descent race tests are different when moving + * through the tree vs. the initial descent. 
+ */ + initial_descent = false; + + /* * Optionally skip leaf pages, the second half. * We didn't have an on-page cell to figure out * if it was a leaf page, we had to acquire the @@ -605,7 +682,7 @@ __wt_tree_walk(WT_SESSION_IMPL *session, WT_REF **refp, uint32_t flags) /* * __wt_tree_walk_count -- * Move to the next/previous page in the tree, tracking how many - * references were visited to get there. + * references were visited to get there. */ int __wt_tree_walk_count(WT_SESSION_IMPL *session, diff --git a/src/third_party/wiredtiger/src/btree/col_modify.c b/src/third_party/wiredtiger/src/btree/col_modify.c index 645d98d9c9b..fd60b12538a 100644 --- a/src/third_party/wiredtiger/src/btree/col_modify.c +++ b/src/third_party/wiredtiger/src/btree/col_modify.c @@ -25,6 +25,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, **ins_headp; WT_ITEM _value; WT_PAGE *page; + WT_PAGE_MODIFY *mod; WT_UPDATE *old_upd, *upd; size_t ins_size, upd_size; u_int i, skipdepth; @@ -60,6 +61,7 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, /* If we don't yet have a modify structure, we'll need one. */ WT_RET(__wt_page_modify_init(session, page)); + mod = page->modify; /* * Delete, insert or update a column-store entry. @@ -105,17 +107,17 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, /* Allocate the append/update list reference as necessary. 
*/ if (append) { WT_PAGE_ALLOC_AND_SWAP(session, - page, page->modify->mod_append, ins_headp, 1); - ins_headp = &page->modify->mod_append[0]; + page, mod->mod_append, ins_headp, 1); + ins_headp = &mod->mod_append[0]; } else if (page->type == WT_PAGE_COL_FIX) { WT_PAGE_ALLOC_AND_SWAP(session, - page, page->modify->mod_update, ins_headp, 1); - ins_headp = &page->modify->mod_update[0]; + page, mod->mod_update, ins_headp, 1); + ins_headp = &mod->mod_update[0]; } else { WT_PAGE_ALLOC_AND_SWAP(session, - page, page->modify->mod_update, ins_headp, + page, mod->mod_update, ins_headp, page->pg_var_entries); - ins_headp = &page->modify->mod_update[cbt->slot]; + ins_headp = &mod->mod_update[cbt->slot]; } /* Allocate the WT_INSERT_HEAD structure as necessary. */ @@ -135,6 +137,14 @@ __wt_col_modify(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, cbt->ins_head = ins_head; cbt->ins = ins; + /* + * Check for insert split and checkpoint races in column-store: + * it's easy (as opposed to in row-store) and a difficult bug to + * otherwise diagnose. + */ + WT_ASSERT(session, mod->mod_split_recno == WT_RECNO_OOB || + (recno != WT_RECNO_OOB && mod->mod_split_recno > recno)); + if (upd_arg == NULL) { WT_ERR( __wt_update_alloc(session, value, &upd, &upd_size)); diff --git a/src/third_party/wiredtiger/src/btree/col_srch.c b/src/third_party/wiredtiger/src/btree/col_srch.c index cb5a227495f..23eae75ec2b 100644 --- a/src/third_party/wiredtiger/src/btree/col_srch.c +++ b/src/third_party/wiredtiger/src/btree/col_srch.c @@ -77,6 +77,7 @@ __wt_col_search(WT_SESSION_IMPL *session, int depth; btree = S2BT(session); + current = NULL; __cursor_pos_clear(cbt); @@ -116,12 +117,19 @@ __wt_col_search(WT_SESSION_IMPL *session, goto leaf_only; } -restart_root: + if (0) { +restart: /* + * Discard the currently held page and restart the search from + * the root. + */ + WT_RET(__wt_page_release(session, current, 0)); + } + /* Search the internal pages of the tree. 
*/ current = &btree->root; for (depth = 2, pindex = NULL;; ++depth) { parent_pindex = pindex; -restart_page: page = current->page; + page = current->page; if (page->type != WT_PAGE_COL_INT) break; @@ -137,12 +145,10 @@ restart_page: page = current->page; * If on the last slot (the key is larger than any key * on the page), check for an internal page split race. */ - if (parent_pindex != NULL && - __wt_split_intl_race( - session, current->home, parent_pindex)) { - WT_RET(__wt_page_release(session, current, 0)); - goto restart_root; - } + if (__wt_split_descent_race( + session, current, parent_pindex)) + goto restart; + goto descend; } @@ -178,8 +184,14 @@ descend: /* /* * Swap the current page for the child page. If the page splits - * while we're retrieving it, restart the search in the current - * page; otherwise return on error, the swap call ensures we're + * while we're retrieving it, restart the search at the root. + * We cannot restart in the "current" page; for example, if a + * thread is appending to the tree, the page it's waiting for + * did an insert-split into the parent, then the parent split + * into its parent, the name space we are searching for may have + * moved above the current page in the tree. + * + * On other error, simply return, the swap call ensures we're * holding nothing on failure. */ if ((ret = __wt_page_swap( @@ -188,7 +200,7 @@ descend: /* continue; } if (ret == WT_RESTART) - goto restart_page; + goto restart; return (ret); } diff --git a/src/third_party/wiredtiger/src/btree/row_srch.c b/src/third_party/wiredtiger/src/btree/row_srch.c index 71564a7b3c5..9d68c8e0ce7 100644 --- a/src/third_party/wiredtiger/src/btree/row_srch.c +++ b/src/third_party/wiredtiger/src/btree/row_srch.c @@ -9,18 +9,17 @@ #include "wt_internal.h" /* - * __wt_search_insert_append -- + * __search_insert_append -- * Fast append search of a row-store insert list, creating a skiplist stack * as we go. 
*/ static inline int -__wt_search_insert_append(WT_SESSION_IMPL *session, - WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key, bool *donep) +__search_insert_append(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, + WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key, bool *donep) { WT_BTREE *btree; WT_COLLATOR *collator; WT_INSERT *ins; - WT_INSERT_HEAD *inshead; WT_ITEM key; int cmp, i; @@ -28,8 +27,7 @@ __wt_search_insert_append(WT_SESSION_IMPL *session, collator = btree->collator; *donep = 0; - inshead = cbt->ins_head; - if ((ins = WT_SKIP_LAST(inshead)) == NULL) + if ((ins = WT_SKIP_LAST(ins_head)) == NULL) return (0); key.data = WT_INSERT_KEY(ins); key.size = WT_INSERT_KEY_SIZE(ins); @@ -48,12 +46,13 @@ __wt_search_insert_append(WT_SESSION_IMPL *session, */ for (i = WT_SKIP_MAXDEPTH - 1; i >= 0; i--) { cbt->ins_stack[i] = (i == 0) ? &ins->next[0] : - (inshead->tail[i] != NULL) ? - &inshead->tail[i]->next[i] : &inshead->head[i]; + (ins_head->tail[i] != NULL) ? + &ins_head->tail[i]->next[i] : &ins_head->head[i]; cbt->next_stack[i] = NULL; } cbt->compare = -cmp; cbt->ins = ins; + cbt->ins_head = ins_head; *donep = 1; } return (0); @@ -64,20 +63,18 @@ __wt_search_insert_append(WT_SESSION_IMPL *session, * Search a row-store insert list, creating a skiplist stack as we go. 
*/ int -__wt_search_insert( - WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key) +__wt_search_insert(WT_SESSION_IMPL *session, + WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key) { WT_BTREE *btree; WT_COLLATOR *collator; WT_INSERT *ins, **insp, *last_ins; - WT_INSERT_HEAD *inshead; WT_ITEM key; size_t match, skiphigh, skiplow; int cmp, i; btree = S2BT(session); collator = btree->collator; - inshead = cbt->ins_head; cmp = 0; /* -Wuninitialized */ /* @@ -86,7 +83,7 @@ __wt_search_insert( */ match = skiphigh = skiplow = 0; ins = last_ins = NULL; - for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0;) { + for (i = WT_SKIP_MAXDEPTH - 1, insp = &ins_head->head[i]; i >= 0;) { if ((ins = *insp) == NULL) { cbt->next_stack[i] = NULL; cbt->ins_stack[i--] = insp--; @@ -128,6 +125,7 @@ __wt_search_insert( */ cbt->compare = -cmp; cbt->ins = (ins != NULL) ? ins : last_ins; + cbt->ins_head = ins_head; return (0); } @@ -212,6 +210,7 @@ __wt_row_search(WT_SESSION_IMPL *session, WT_BTREE *btree; WT_COLLATOR *collator; WT_DECL_RET; + WT_INSERT_HEAD *ins_head; WT_ITEM *item; WT_PAGE *page; WT_PAGE_INDEX *pindex, *parent_pindex; @@ -276,12 +275,20 @@ __wt_row_search(WT_SESSION_IMPL *session, goto leaf_only; } + if (0) { +restart: /* + * Discard the currently held page and restart the search from + * the root. + */ + WT_RET(__wt_page_release(session, current, 0)); + skiphigh = skiplow = 0; + } + /* Search the internal pages of the tree. */ -restart_root: current = &btree->root; for (depth = 2, pindex = NULL;; ++depth) { parent_pindex = pindex; -restart_page: page = current->page; + page = current->page; if (page->type != WT_PAGE_ROW_INT) break; @@ -418,22 +425,21 @@ restart_page: page = current->page; * page), check for an internal page split race. 
*/ if (pindex->entries == base) { -append: if (parent_pindex != NULL && - __wt_split_intl_race( - session, current->home, parent_pindex)) { - if ((ret = __wt_page_release( - session, current, 0)) != 0) - return (ret); - - skiplow = skiphigh = 0; - goto restart_root; - } +append: if (__wt_split_descent_race( + session, current, parent_pindex)) + goto restart; } descend: /* * Swap the current page for the child page. If the page splits - * while we're retrieving it, restart the search in the current - * page; otherwise return on error, the swap call ensures we're + * while we're retrieving it, restart the search at the root. + * We cannot restart in the "current" page; for example, if a + * thread is appending to the tree, the page it's waiting for + * did an insert-split into the parent, then the parent split + * into its parent, the name space we are searching for may have + * moved above the current page in the tree. + * + * On other error, simply return, the swap call ensures we're * holding nothing on failure. */ if ((ret = __wt_page_swap( @@ -441,10 +447,8 @@ descend: /* current = descent; continue; } - if (ret == WT_RESTART) { - skiphigh = skiplow = 0; - goto restart_page; - } + if (ret == WT_RESTART) + goto restart; return (ret); } @@ -480,24 +484,18 @@ leaf_only: cbt->slot = WT_ROW_SLOT(page, page->pg_row_d); F_SET(cbt, WT_CBT_SEARCH_SMALLEST); - cbt->ins_head = WT_ROW_INSERT_SMALLEST(page); + ins_head = WT_ROW_INSERT_SMALLEST(page); } else { cbt->slot = WT_ROW_SLOT(page, page->pg_row_d + (page->pg_row_entries - 1)); - cbt->ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot); + ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot); } - WT_ERR( - __wt_search_insert_append(session, cbt, srch_key, &done)); + WT_ERR(__search_insert_append( + session, cbt, ins_head, srch_key, &done)); if (done) return (0); - - /* - * Don't leave the insert list head set, code external to the - * search uses it. 
- */ - cbt->ins_head = NULL; } /* @@ -590,16 +588,16 @@ leaf_match: cbt->compare = 0; cbt->slot = WT_ROW_SLOT(page, page->pg_row_d); F_SET(cbt, WT_CBT_SEARCH_SMALLEST); - cbt->ins_head = WT_ROW_INSERT_SMALLEST(page); + ins_head = WT_ROW_INSERT_SMALLEST(page); } else { cbt->compare = -1; cbt->slot = WT_ROW_SLOT(page, page->pg_row_d + (base - 1)); - cbt->ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot); + ins_head = WT_ROW_INSERT_SLOT(page, cbt->slot); } /* If there's no insert list, we're done. */ - if (WT_SKIP_FIRST(cbt->ins_head) == NULL) + if (WT_SKIP_FIRST(ins_head) == NULL) return (0); /* @@ -607,12 +605,12 @@ leaf_match: cbt->compare = 0; * catch cursors repeatedly inserting at a single point. */ if (insert) { - WT_ERR( - __wt_search_insert_append(session, cbt, srch_key, &done)); + WT_ERR(__search_insert_append( + session, cbt, ins_head, srch_key, &done)); if (done) return (0); } - WT_ERR(__wt_search_insert(session, cbt, srch_key)); + WT_ERR(__wt_search_insert(session, cbt, ins_head, srch_key)); return (0); @@ -661,19 +659,16 @@ __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) /* * If the tree is new (and not empty), it might have a large insert * list. - */ - F_SET(cbt, WT_CBT_SEARCH_SMALLEST); - if ((cbt->ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL) - return (WT_NOTFOUND); - - /* + * * Walk down the list until we find a level with at least 50 entries, * that's where we'll start rolling random numbers. The value 50 is * used to ignore levels with only a few entries, that is, levels which * are potentially badly skewed. 
*/ - for (ins_head = cbt->ins_head, - level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) { + F_SET(cbt, WT_CBT_SEARCH_SMALLEST); + if ((ins_head = WT_ROW_INSERT_SMALLEST(page)) == NULL) + return (WT_NOTFOUND); + for (level = WT_SKIP_MAXDEPTH - 1; level >= 0; --level) { start = &ins_head->head[level]; for (entries = 0, stop = start; *stop != NULL; stop = &(*stop)->next[level]) @@ -768,6 +763,7 @@ __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) ins = ins->next[0]; cbt->ins = ins; + cbt->ins_head = ins_head; cbt->compare = 0; return (0); @@ -787,11 +783,19 @@ __wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt) WT_REF *current, *descent; btree = S2BT(session); + current = NULL; __cursor_pos_clear(cbt); -restart_root: - /* Walk the internal pages of the tree. */ + if (0) { +restart: /* + * Discard the currently held page and restart the search from + * the root. + */ + WT_RET(__wt_page_release(session, current, 0)); + } + + /* Search the internal pages of the tree. */ current = &btree->root; for (;;) { page = current->page; @@ -803,22 +807,19 @@ restart_root: __wt_random(&session->rnd) % pindex->entries]; /* - * Swap the parent page for the child page; return on error, - * the swap function ensures we're holding nothing on failure. + * Swap the current page for the child page. If the page splits + * while we're retrieving it, restart the search at the root. + * + * On other error, simply return, the swap call ensures we're + * holding nothing on failure. */ if ((ret = __wt_page_swap( session, current, descent, WT_READ_RESTART_OK)) == 0) { current = descent; continue; } - /* - * Restart is returned if we find a page that's been split; the - * held page isn't discarded when restart is returned, discard - * it and restart the search from the top of the tree. 
- */ - if (ret == WT_RESTART && - (ret = __wt_page_release(session, current, 0)) == 0) - goto restart_root; + if (ret == WT_RESTART) + goto restart; return (ret); } diff --git a/src/third_party/wiredtiger/src/conn/conn_dhandle.c b/src/third_party/wiredtiger/src/conn/conn_dhandle.c index 60136a71b99..3bea24be508 100644 --- a/src/third_party/wiredtiger/src/conn/conn_dhandle.c +++ b/src/third_party/wiredtiger/src/conn/conn_dhandle.c @@ -368,24 +368,21 @@ __conn_btree_apply_internal(WT_SESSION_IMPL *session, WT_DATA_HANDLE *dhandle, * sure it's referenced to stop other internal code dropping the handle * (e.g in LSM when cleaning up obsolete chunks). */ - ret = __wt_session_get_btree(session, - dhandle->name, dhandle->checkpoint, NULL, 0); - if (ret == 0) { - WT_SAVE_DHANDLE(session, - ret = func(session, cfg)); - if (WT_META_TRACKING(session)) - WT_TRET(__wt_meta_track_handle_lock(session, false)); - else - WT_TRET(__wt_session_release_btree(session)); - } else if (ret == EBUSY) - ret = __wt_conn_btree_apply_single(session, dhandle->name, - dhandle->checkpoint, func, cfg); + if ((ret = __wt_session_get_btree(session, + dhandle->name, dhandle->checkpoint, NULL, 0)) != 0) + return (ret == EBUSY ? 0 : ret); + + WT_SAVE_DHANDLE(session, ret = func(session, cfg)); + if (WT_META_TRACKING(session)) + WT_TRET(__wt_meta_track_handle_lock(session, false)); + else + WT_TRET(__wt_session_release_btree(session)); return (ret); } /* * __wt_conn_btree_apply -- - * Apply a function to all open btree handles apart from the metadata. + * Apply a function to all open btree handles with the given URI. */ int __wt_conn_btree_apply(WT_SESSION_IMPL *session, @@ -430,98 +427,6 @@ __wt_conn_btree_apply(WT_SESSION_IMPL *session, } /* - * __wt_conn_btree_apply_single_ckpt -- - * Decode any checkpoint information from the configuration string then - * call btree apply single. 
- */ -int -__wt_conn_btree_apply_single_ckpt(WT_SESSION_IMPL *session, - const char *uri, - int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]) -{ - WT_CONFIG_ITEM cval; - WT_DECL_RET; - const char *checkpoint; - - checkpoint = NULL; - - /* - * This function exists to handle checkpoint configuration. Callers - * that never open a checkpoint call the underlying function directly. - */ - WT_RET_NOTFOUND_OK( - __wt_config_gets_def(session, cfg, "checkpoint", 0, &cval)); - if (cval.len != 0) { - /* - * The internal checkpoint name is special, find the last - * unnamed checkpoint of the object. - */ - if (WT_STRING_MATCH(WT_CHECKPOINT, cval.str, cval.len)) { - WT_RET(__wt_meta_checkpoint_last_name( - session, uri, &checkpoint)); - } else - WT_RET(__wt_strndup( - session, cval.str, cval.len, &checkpoint)); - } - - ret = __wt_conn_btree_apply_single(session, uri, checkpoint, func, cfg); - - __wt_free(session, checkpoint); - - return (ret); -} - -/* - * __wt_conn_btree_apply_single -- - * Apply a function to a single btree handle that couldn't be locked - * (attempting to get the handle returned EBUSY). 
- */ -int -__wt_conn_btree_apply_single(WT_SESSION_IMPL *session, - const char *uri, const char *checkpoint, - int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]) -{ - WT_CONNECTION_IMPL *conn; - WT_DATA_HANDLE *dhandle; - WT_DECL_RET; - uint64_t bucket, hash; - - conn = S2C(session); - - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST)); - - hash = __wt_hash_city64(uri, strlen(uri)); - bucket = hash % WT_HASH_ARRAY_SIZE; - TAILQ_FOREACH(dhandle, &conn->dhhash[bucket], hashq) - if (F_ISSET(dhandle, WT_DHANDLE_OPEN) && - !F_ISSET(dhandle, WT_DHANDLE_DEAD) && - (hash == dhandle->name_hash && - strcmp(uri, dhandle->name) == 0) && - ((dhandle->checkpoint == NULL && checkpoint == NULL) || - (dhandle->checkpoint != NULL && checkpoint != NULL && - strcmp(dhandle->checkpoint, checkpoint) == 0))) { - /* - * We're holding the handle list lock which locks out - * handle open (which might change the state of the - * underlying object). However, closing a handle - * doesn't require the handle list lock, lock out - * closing the handle and then confirm the handle is - * still open. - */ - __wt_spin_lock(session, &dhandle->close_lock); - if (F_ISSET(dhandle, WT_DHANDLE_OPEN) && - !F_ISSET(dhandle, WT_DHANDLE_DEAD)) { - WT_WITH_DHANDLE(session, dhandle, - ret = func(session, cfg)); - } - __wt_spin_unlock(session, &dhandle->close_lock); - WT_RET(ret); - } - - return (0); -} - -/* * __wt_conn_dhandle_close_all -- * Close all data handles handles with matching name (including all * checkpoint handles). 
diff --git a/src/third_party/wiredtiger/src/conn/conn_handle.c b/src/third_party/wiredtiger/src/conn/conn_handle.c index 12b4e87e921..b33ec18dfca 100644 --- a/src/third_party/wiredtiger/src/conn/conn_handle.c +++ b/src/third_party/wiredtiger/src/conn/conn_handle.c @@ -56,6 +56,7 @@ __wt_connection_init(WT_CONNECTION_IMPL *conn) WT_RET(__wt_rwlock_alloc(session, &conn->hot_backup_lock, "hot backup")); WT_RET(__wt_spin_init(session, &conn->las_lock, "lookaside table")); + WT_RET(__wt_spin_init(session, &conn->metadata_lock, "metadata")); WT_RET(__wt_spin_init(session, &conn->reconfig_lock, "reconfigure")); WT_RET(__wt_spin_init(session, &conn->schema_lock, "schema")); WT_RET(__wt_spin_init(session, &conn->table_lock, "table creation")); @@ -143,6 +144,7 @@ __wt_connection_destroy(WT_CONNECTION_IMPL *conn) __wt_spin_destroy(session, &conn->fh_lock); WT_TRET(__wt_rwlock_destroy(session, &conn->hot_backup_lock)); __wt_spin_destroy(session, &conn->las_lock); + __wt_spin_destroy(session, &conn->metadata_lock); __wt_spin_destroy(session, &conn->reconfig_lock); __wt_spin_destroy(session, &conn->schema_lock); __wt_spin_destroy(session, &conn->table_lock); diff --git a/src/third_party/wiredtiger/src/conn/conn_log.c b/src/third_party/wiredtiger/src/conn/conn_log.c index 60f46288072..fa3928a8539 100644 --- a/src/third_party/wiredtiger/src/conn/conn_log.c +++ b/src/third_party/wiredtiger/src/conn/conn_log.c @@ -633,7 +633,7 @@ restart: if (slot->slot_start_lsn.l.offset != slot->slot_last_offset) slot->slot_start_lsn.l.offset = - slot->slot_last_offset; + (uint32_t)slot->slot_last_offset; log->write_start_lsn = slot->slot_start_lsn; log->write_lsn = slot->slot_end_lsn; WT_ERR(__wt_cond_signal( diff --git a/src/third_party/wiredtiger/src/cursor/cur_join.c b/src/third_party/wiredtiger/src/cursor/cur_join.c index 2cbefa68c5e..797e6e5879a 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_join.c +++ b/src/third_party/wiredtiger/src/cursor/cur_join.c @@ -53,7 +53,9 @@ 
__curjoin_entry_iter_init(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, iter->session = session; iter->entry = entry; iter->cursor = newcur; - iter->advance = false; + iter->positioned = false; + iter->isequal = (entry->ends_next == 1 && + WT_CURJOIN_END_RANGE(&entry->ends[0]) == WT_CURJOIN_END_EQ); *iterp = iter; if (0) { @@ -72,18 +74,16 @@ static int __curjoin_pack_recno(WT_SESSION_IMPL *session, uint64_t r, uint8_t *buf, size_t bufsize, WT_ITEM *item) { - WT_DECL_RET; WT_SESSION *wtsession; size_t sz; wtsession = (WT_SESSION *)session; - WT_ERR(wiredtiger_struct_size(wtsession, &sz, "r", r)); + WT_RET(wiredtiger_struct_size(wtsession, &sz, "r", r)); WT_ASSERT(session, sz < bufsize); - WT_ERR(wiredtiger_struct_pack(wtsession, buf, bufsize, "r", r)); + WT_RET(wiredtiger_struct_pack(wtsession, buf, bufsize, "r", r)); item->size = sz; item->data = buf; - -err: return (ret); + return (0); } /* @@ -97,14 +97,13 @@ __curjoin_entry_iter_next(WT_CURSOR_JOIN_ITER *iter, WT_ITEM *primkey, { WT_CURSOR *firstcg_cur; WT_CURSOR_JOIN *cjoin; - WT_DECL_RET; WT_SESSION_IMPL *session; uint64_t r; - if (iter->advance) - WT_ERR(iter->cursor->next(iter->cursor)); + if (iter->positioned) + WT_RET(iter->cursor->next(iter->cursor)); else - iter->advance = true; + iter->positioned = true; session = iter->session; cjoin = iter->cjoin; @@ -119,7 +118,7 @@ __curjoin_entry_iter_next(WT_CURSOR_JOIN_ITER *iter, WT_ITEM *primkey, firstcg_cur = ((WT_CURSOR_TABLE *)iter->cursor)->cg_cursors[0]; if (WT_CURSOR_RECNO(&cjoin->iface)) { r = *(uint64_t *)firstcg_cur->key.data; - WT_ERR(__curjoin_pack_recno(session, r, cjoin->recno_buf, + WT_RET(__curjoin_pack_recno(session, r, cjoin->recno_buf, sizeof(cjoin->recno_buf), primkey)); *rp = r; } else { @@ -129,8 +128,7 @@ __curjoin_entry_iter_next(WT_CURSOR_JOIN_ITER *iter, WT_ITEM *primkey, iter->curkey = primkey; iter->entry->stats.actual_count++; iter->entry->stats.accesses++; - -err: return (ret); + return (0); } /* @@ -141,17 +139,14 @@ err: 
return (ret); static int __curjoin_entry_iter_reset(WT_CURSOR_JOIN_ITER *iter) { - WT_DECL_RET; - - if (iter->advance) { - WT_ERR(iter->cursor->reset(iter->cursor)); - WT_ERR(__wt_cursor_dup_position( + if (iter->positioned) { + WT_RET(iter->cursor->reset(iter->cursor)); + WT_RET(__wt_cursor_dup_position( iter->cjoin->entries[0].ends[0].cursor, iter->cursor)); - iter->advance = false; + iter->positioned = false; iter->entry->stats.actual_count = 0; } - -err: return (ret); + return (0); } /* @@ -162,7 +157,7 @@ err: return (ret); static bool __curjoin_entry_iter_ready(WT_CURSOR_JOIN_ITER *iter) { - return (iter->advance); + return (iter->positioned); } /* @@ -255,18 +250,16 @@ __curjoin_init_bloom(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, WT_CURSOR_JOIN_ENDPOINT *end, *endmax; WT_DECL_RET; WT_DECL_ITEM(uribuf); - WT_ITEM curkey, curvalue, *k; + WT_ITEM curkey, curvalue; WT_TABLE *maintable; const char *raw_cfg[] = { WT_CONFIG_BASE( session, WT_SESSION_open_cursor), "raw", NULL }; const char *mainkey_str, *p; - void *allocbuf; size_t mainkey_len, size; u_int i; int cmp, skip; c = NULL; - allocbuf = NULL; skip = 0; if (entry->index != NULL) { @@ -305,7 +298,7 @@ __curjoin_init_bloom(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, if ((end = &entry->ends[0]) < endmax && F_ISSET(end, WT_CURJOIN_END_GE)) { WT_ERR(__wt_cursor_dup_position(end->cursor, c)); - if (end->flags == WT_CURJOIN_END_GE) + if (WT_CURJOIN_END_RANGE(end) == WT_CURJOIN_END_GE) skip = 1; } collator = (entry->index == NULL) ? NULL : entry->index->collator; @@ -313,18 +306,15 @@ __curjoin_init_bloom(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, WT_ERR(c->get_key(c, &curkey)); if (entry->index != NULL) { cindex = (WT_CURSOR_INDEX *)c; - if (cindex->index->extractor == NULL) { - /* - * Repack so it's comparable to the - * reference endpoints. 
- */ - k = &cindex->child->key; - WT_ERR(__wt_struct_repack(session, - cindex->child->key_format, - entry->main->value_format, k, &curkey, - &allocbuf)); - } else - curkey = cindex->child->key; + /* + * Repack so it's comparable to the + * reference endpoints. + */ + WT_ERR(__wt_struct_repack(session, + cindex->child->key_format, + (entry->repack_format != NULL ? + entry->repack_format : cindex->iface.key_format), + &cindex->child->key, &curkey)); } for (end = &entry->ends[skip]; end < endmax; end++) { WT_ERR(__wt_compare(session, collator, &curkey, @@ -361,7 +351,6 @@ done: err: if (c != NULL) WT_TRET(c->close(c)); __wt_scr_free(session, &uribuf); - __wt_free(session, allocbuf); return (ret); } @@ -375,27 +364,23 @@ __curjoin_endpoint_init_key(WT_SESSION_IMPL *session, { WT_CURSOR *cursor; WT_CURSOR_INDEX *cindex; - WT_DECL_RET; WT_ITEM *k; uint64_t r; - void *allocbuf; - allocbuf = NULL; if ((cursor = endpoint->cursor) != NULL) { if (entry->index != NULL) { /* Extract and save the index's logical key. */ cindex = (WT_CURSOR_INDEX *)endpoint->cursor; - WT_ERR(__wt_struct_repack(session, + WT_RET(__wt_struct_repack(session, cindex->child->key_format, - cindex->iface.key_format, - &cindex->child->key, &endpoint->key, &allocbuf)); - if (allocbuf != NULL) - F_SET(endpoint, WT_CURJOIN_END_OWN_KEY); + (entry->repack_format != NULL ? 
+ entry->repack_format : cindex->iface.key_format), + &cindex->child->key, &endpoint->key)); } else { k = &((WT_CURSOR_TABLE *)cursor)->cg_cursors[0]->key; if (WT_CURSOR_RECNO(cursor)) { r = *(uint64_t *)k->data; - WT_ERR(__curjoin_pack_recno(session, r, + WT_RET(__curjoin_pack_recno(session, r, endpoint->recno_buf, sizeof(endpoint->recno_buf), &endpoint->key)); @@ -404,10 +389,7 @@ __curjoin_endpoint_init_key(WT_SESSION_IMPL *session, endpoint->key = *k; } } - if (0) { -err: __wt_free(session, allocbuf); - } - return (ret); + return (0); } /* @@ -520,35 +502,34 @@ __curjoin_entry_in_range(WT_SESSION_IMPL *session, WT_CURSOR_JOIN_ENTRY *entry, { WT_COLLATOR *collator; WT_CURSOR_JOIN_ENDPOINT *end, *endmax; - WT_DECL_RET; int cmp; collator = (entry->index != NULL) ? entry->index->collator : NULL; endmax = &entry->ends[entry->ends_next]; for (end = &entry->ends[skip_left ? 1 : 0]; end < endmax; end++) { - WT_ERR(__wt_compare(session, collator, curkey, &end->key, + WT_RET(__wt_compare(session, collator, curkey, &end->key, &cmp)); if (!F_ISSET(end, WT_CURJOIN_END_LT)) { if (cmp < 0 || (cmp == 0 && !F_ISSET(end, WT_CURJOIN_END_EQ)) || (cmp > 0 && !F_ISSET(end, WT_CURJOIN_END_GT))) - WT_ERR(WT_NOTFOUND); + WT_RET(WT_NOTFOUND); } else { if (cmp > 0 || (cmp == 0 && !F_ISSET(end, WT_CURJOIN_END_EQ)) || (cmp < 0 && !F_ISSET(end, WT_CURJOIN_END_LT))) - WT_ERR(WT_NOTFOUND); + WT_RET(WT_NOTFOUND); } } -err: return (ret); + return (0); } typedef struct { WT_CURSOR iface; WT_CURSOR_JOIN_ENTRY *entry; - int ismember; + bool ismember; } WT_CURJOIN_EXTRACTOR; /* @@ -584,8 +565,8 @@ __curjoin_extract_insert(WT_CURSOR *cursor) { ret = __curjoin_entry_in_range(session, cextract->entry, &ikey, false); if (ret == WT_NOTFOUND) ret = 0; - else - cextract->ismember = 1; + else if (ret == 0) + cextract->ismember = true; return (ret); } @@ -659,10 +640,11 @@ __curjoin_entry_member(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, v = *key; if ((idx = entry->index) != NULL && idx->extractor != 
NULL) { + WT_CLEAR(extract_cursor); extract_cursor.iface = iface; extract_cursor.iface.session = &session->iface; extract_cursor.iface.key_format = idx->exkey_format; - extract_cursor.ismember = 0; + extract_cursor.ismember = false; extract_cursor.entry = entry; WT_ERR(idx->extractor->extract(idx->extractor, &session->iface, key, &v, &extract_cursor.iface)); @@ -715,8 +697,15 @@ nextkey: for (i = 0; i < cjoin->entries_next; i++) { ret = __curjoin_entry_member(session, cjoin, &cjoin->entries[i], skip_left); - if (ret == WT_NOTFOUND) + if (ret == WT_NOTFOUND) { + /* + * If this is compare=eq on our outer iterator, + * and we've moved past it, we're done. + */ + if (cjoin->iter->isequal && i == 0) + break; goto nextkey; + } skip_left = false; WT_ERR(ret); } @@ -783,12 +772,10 @@ __curjoin_close(WT_CURSOR *cursor) if (F_ISSET(entry, WT_CURJOIN_ENTRY_OWN_BLOOM)) WT_TRET(__wt_bloom_close(entry->bloom)); for (end = &entry->ends[0]; - end < &entry->ends[entry->ends_next]; end++) { + end < &entry->ends[entry->ends_next]; end++) F_CLR(end->cursor, WT_CURSTD_JOINED); - if (F_ISSET(end, WT_CURJOIN_END_OWN_KEY)) - __wt_free(session, end->key.data); - } __wt_free(session, entry->ends); + __wt_free(session, entry->repack_format); } if (cjoin->iter != NULL) @@ -891,22 +878,22 @@ __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, WT_INDEX *idx, WT_CURSOR *ref_cursor, uint8_t flags, uint8_t range, uint64_t count, uint32_t bloom_bit_count, uint32_t bloom_hash_count) { + WT_CURSOR_INDEX *cindex; + WT_CURSOR_JOIN_ENDPOINT *end, *newend; WT_CURSOR_JOIN_ENTRY *entry; WT_DECL_RET; - WT_CURSOR_JOIN_ENDPOINT *end, *newend; bool hasins, needbloom, range_eq; - u_int i, ins, nonbloom; + char *main_uri, *newformat; const char *raw_cfg[] = { WT_CONFIG_BASE( session, WT_SESSION_open_cursor), "raw", NULL }; - char *main_uri; - size_t namesize, newsize; + size_t len, newsize; + u_int i, ins, nonbloom; entry = NULL; hasins = needbloom = false; ins = 0; /* -Wuninitialized */ main_uri 
= NULL; nonbloom = 0; /* -Wuninitialized */ - namesize = strlen(cjoin->table->name); for (i = 0; i < cjoin->entries_next; i++) { if (cjoin->entries[i].index == idx) { @@ -982,13 +969,13 @@ __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, ((range & WT_CURJOIN_END_GT) != 0 || range_eq)) || (F_ISSET(end, WT_CURJOIN_END_LT) && ((range & WT_CURJOIN_END_LT) != 0 || range_eq)) || - (end->flags == WT_CURJOIN_END_EQ && + (WT_CURJOIN_END_RANGE(end) == WT_CURJOIN_END_EQ && (range & (WT_CURJOIN_END_LT | WT_CURJOIN_END_GT)) != 0)) WT_ERR_MSG(session, EINVAL, "join has overlapping ranges"); if (range == WT_CURJOIN_END_EQ && - end->flags == WT_CURJOIN_END_EQ && + WT_CURJOIN_END_RANGE(end) == WT_CURJOIN_END_EQ && !F_ISSET(entry, WT_CURJOIN_ENTRY_DISJUNCTION)) WT_ERR_MSG(session, EINVAL, "compare=eq can only be combined " @@ -1026,15 +1013,40 @@ __wt_curjoin_join(WT_SESSION_IMPL *session, WT_CURSOR_JOIN *cjoin, F_SET(newend, range); /* Open the main file with a projection of the indexed columns. */ - if (entry->main == NULL && entry->index != NULL) { - namesize = strlen(cjoin->table->name); - newsize = namesize + entry->index->colconf.len + 1; + if (entry->main == NULL && idx != NULL) { + newsize = strlen(cjoin->table->name) + idx->colconf.len + 1; WT_ERR(__wt_calloc(session, 1, newsize, &main_uri)); snprintf(main_uri, newsize, "%s%.*s", - cjoin->table->name, (int)entry->index->colconf.len, - entry->index->colconf.str); + cjoin->table->name, (int)idx->colconf.len, + idx->colconf.str); WT_ERR(__wt_open_cursor(session, main_uri, (WT_CURSOR *)cjoin, raw_cfg, &entry->main)); + if (idx->extractor == NULL) { + /* + * Add no-op padding so trailing 'u' formats are not + * transformed to 'U'. This matches what happens in + * the index. We don't do this when we have an + * extractor, extractors already use the padding + * byte trick. 
+ */ + len = strlen(entry->main->value_format) + 3; + WT_ERR(__wt_calloc(session, len, 1, &newformat)); + snprintf(newformat, len, "%s0x", + entry->main->value_format); + __wt_free(session, entry->main->value_format); + entry->main->value_format = newformat; + } + + /* + * When we are repacking index keys to remove the primary + * key, we never want to transform trailing 'u'. Use no-op + * padding to force this. + */ + cindex = (WT_CURSOR_INDEX *)ref_cursor; + len = strlen(cindex->iface.key_format) + 3; + WT_ERR(__wt_calloc(session, len, 1, &entry->repack_format)); + snprintf(entry->repack_format, len, "%s0x", + cindex->iface.key_format); } err: if (main_uri != NULL) diff --git a/src/third_party/wiredtiger/src/cursor/cur_stat.c b/src/third_party/wiredtiger/src/cursor/cur_stat.c index 00a6ade21c6..bb492c66ace 100644 --- a/src/third_party/wiredtiger/src/cursor/cur_stat.c +++ b/src/third_party/wiredtiger/src/cursor/cur_stat.c @@ -504,14 +504,13 @@ __curstat_join_init(WT_SESSION_IMPL *session, WT_CURSOR *curjoin, const char *cfg[], WT_CURSOR_STAT *cst) { WT_CURSOR_JOIN *cjoin; - WT_DECL_RET; WT_UNUSED(cfg); if (curjoin == NULL && cst->u.join_stats_group.join_cursor != NULL) curjoin = &cst->u.join_stats_group.join_cursor->iface; if (curjoin == NULL || !WT_PREFIX_MATCH(curjoin->uri, "join:")) - WT_ERR_MSG(session, EINVAL, + WT_RET_MSG(session, EINVAL, "join cursor must be used with statistics:join"); cjoin = (WT_CURSOR_JOIN *)curjoin; memset(&cst->u.join_stats_group, 0, sizeof(WT_JOIN_STATS_GROUP)); @@ -522,8 +521,7 @@ __curstat_join_init(WT_SESSION_IMPL *session, cst->stats_count = sizeof(WT_JOIN_STATS) / sizeof(int64_t); cst->stats_desc = __curstat_join_desc; cst->next_set = __curstat_join_next_set; - -err: return (ret); + return (0); } /* diff --git a/src/third_party/wiredtiger/src/docs/checkpoint.dox b/src/third_party/wiredtiger/src/docs/checkpoint.dox index 523c0887859..ec28fea13c3 100644 --- a/src/third_party/wiredtiger/src/docs/checkpoint.dox +++ 
b/src/third_party/wiredtiger/src/docs/checkpoint.dox @@ -23,11 +23,16 @@ All transactional updates committed before a checkpoint are made durable by the checkpoint, therefore the frequency of checkpoints limits the volume of data that may be lost due to application or system failure. -When WiredTiger data sources are first opened, they are opened in the -state of the most recent checkpoint taken on the file, in other words, -updates after the most recent checkpoint will not appear in the data -source. If no checkpoint is found when the data source is opened, the -data source will appear empty. +Data sources that are involved in an exclusive operation when the +checkpoint starts, including bulk load, verify or salvage, will be skipped +by the checkpoint. Operations requiring exclusive access may fail with +an \c EBUSY error if attempted during a checkpoint. + +When data sources are first opened, they are opened in the state of the +most recent checkpoint taken on the file, in other words, updates after the +most recent checkpoint will not appear in the data source. If no +checkpoint is found when the data source is opened, the data source will +appear empty. @section checkpoint_server Automatic checkpoints @@ -54,15 +59,16 @@ checkpoint cursor is closed. @section checkpoint_naming Checkpoint naming -Additionally, checkpoints that do not include LSM trees may optionally -be given names by the application. Checkpoints named by the application -persist until explicitly discarded or the application creates a new -checkpoint with the same name (which replaces the previous checkpoint -of that name). If the previous checkpoint cannot be replaced, either -because a cursor is reading from the previous checkpoint, or backups are -in progress, the checkpoint will fail. Because named checkpoints -persist until discarded or replaced, they can be used to periodically -snapshot data for later use. 
+Additionally, checkpoints that do not include LSM trees may optionally be +given names by the application. Because named checkpoints persist until +discarded or replaced, they can be used to periodically snapshot data for +later use. + +Checkpoints named by the application persist until explicitly discarded or +the application creates a new checkpoint with the same name (which replaces +the previous checkpoint of that name). If the previous checkpoint cannot be +replaced, either because a cursor is reading from the previous checkpoint, +or backups are in progress, the checkpoint will fail. Internal checkpoints (that is, checkpoints not named by the application) use the reserved name "WiredTigerCheckpoint". Applications can open the diff --git a/src/third_party/wiredtiger/src/docs/license.dox b/src/third_party/wiredtiger/src/docs/license.dox index febced2c6af..d7814d04fd6 100644 --- a/src/third_party/wiredtiger/src/docs/license.dox +++ b/src/third_party/wiredtiger/src/docs/license.dox @@ -2,16 +2,16 @@ The complete WiredTiger software package is Open Source software: you are welcome to modify and redistribute it under the terms of -<a href="http://www.gnu.org/licenses/gpl-2.0-standalone.html"> -<b>version 2</b></a> or -<a href="http://www.gnu.org/licenses/gpl-3.0-standalone.html"> -<b>version 3</b></a> of the -<b>GNU General Public License</b></a> +<a href="http://www.gnu.org/licenses/gpl-2.0-standalone.html">version 2</a> +or +<a href="http://www.gnu.org/licenses/gpl-3.0-standalone.html">version 3</a> +of the +<b>GNU General Public License</b> as published by the Free Software Foundation. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -<b>GNU General Public License</b></a> for details. +<b>GNU General Public License</b> for details. 
Additionally, portions of the WiredTiger distribution are distributed under the terms of the @@ -31,10 +31,10 @@ those described above, or for technical support for this software, please contact MongoDB, Inc. at <a mailto="info@wiredtiger.com">info@wiredtiger.com</a>. -@section license_library 3rd party software included in the WiredTiger library +@section license_library 3rd party software always included in the WiredTiger library Every build of the WiredTiger library binary includes the following 3rd -party software, distributed under their license terms. Redistribution +party software, distributed under separate license terms. Redistribution of the WiredTiger library should comply with these copyrights. <table> @@ -46,14 +46,26 @@ of the WiredTiger library should comply with these copyrights. @row{\c src/support/hash_fnv.c, Authors, Public Domain} </table> +@section license_crc32-vpmsum 3rd party software optionally included in the WiredTiger library: PPC64 + +PPC64 and PPC64LE builds of the WiredTiger library binary include additional +3rd party software, distributed under separate license terms. Redistribution +of the WiredTiger library PPC64 and PPC64LE builds should comply with these +copyrights. + +<table> +@hrow{Distribution Files, Copyright Holder, License} +@row{\c src/support/power8/*, Anton Blanchard, <a href="http://opensource.org/licenses/Apache-2.0">Apache License\, Version 2.0</a> or the <a href="http://www.gnu.org/licenses/gpl-2.0-standalone.html">GNU General Public License\, version 2 or later</a>} +</table> + @section license_leveldb 3rd party software optionally included in the WiredTiger library: LevelDB If the \c --enable-leveldb configuration option is specified when configuring the WiredTiger build, additional 3rd party software is -included in the WiredTiger LevelDB library binary, distributed under -their license terms. 
Redistribution of the WiredTiger library built -with the \c --enable-leveldb configuration option should comply with -these copyrights. +included in the WiredTiger library binary, distributed under separate +license terms. Redistribution of the WiredTiger library built with the +\c --enable-leveldb configuration option should comply with these +copyrights. <table> @hrow{Distribution Files, Copyright Holder, License} diff --git a/src/third_party/wiredtiger/src/docs/spell.ok b/src/third_party/wiredtiger/src/docs/spell.ok index 80597302cbb..ac71214f8b1 100644 --- a/src/third_party/wiredtiger/src/docs/spell.ok +++ b/src/third_party/wiredtiger/src/docs/spell.ok @@ -7,11 +7,13 @@ Atomicity BLOBs CFLAGS CPPFLAGS +CRC Cheng Christoph Collet's Coverity Coverity's +crc DB's DBTs Datastore @@ -64,6 +66,7 @@ NOTFOUND NUMA NoSQL OPTYPE +PPC PRELOAD README Rebalance @@ -78,6 +81,7 @@ Seward's SiH TXT URIs +vpmsum WiredTiger WiredTiger's WiredTigerCheckpoint diff --git a/src/third_party/wiredtiger/src/docs/upgrading.dox b/src/third_party/wiredtiger/src/docs/upgrading.dox index e4d85003a1e..df0a22ba0fe 100644 --- a/src/third_party/wiredtiger/src/docs/upgrading.dox +++ b/src/third_party/wiredtiger/src/docs/upgrading.dox @@ -4,25 +4,24 @@ <dl> <dt>Column-store bulk-load cursors</dt> <dd> -Historically, bulk-load of a column-store object ignored any key set in -the cursor and automatically assigned each inserted row the next -sequential record number for its key. In the 2.7.1 release, column-store -objects match row-store behavior and require the cursor key be set -before an insert. (This also allows allows sparse tables to be created -in column-store objects, any skipped records are created as -already-deleted rows.) To match the previous behavior, specify the -\c append configuration string when opening the column-store bulk-load -cursor; this causes the cursor's key to be ignored and each inserted row -will be assigned the next record number. 
+Historically, bulk-load of a column-store object ignored any key set in the +cursor and automatically assigned each inserted row the next sequential +record number for its key. In the 2.7.1 release, column-store objects match +row-store behavior and require the cursor key be set before an insert. +(This allows sparse tables to be created in column-store objects, any +skipped records are created as already-deleted rows.) To match the previous +behavior, specify the \c append configuration string when opening the +column-store bulk-load cursor; this causes the cursor's key to be ignored +and each inserted row will be assigned the next record number. </dd> <dt>Change to WT_SESSION::truncate with URI</dt> <dd> If using the WT_SESSION::truncate API with a file: URI for a full table -truncate, underlying algorithmic changes result in some visible differences. -This call can now return WT_ROLLBACK. Applications should be prepared to -handle this error. This method no longer requires exclusive access to the -table. Also the underlying disk space may not be immediately +truncate, underlying algorithmic changes result in some visible +differences. This call can now return WT_ROLLBACK. Applications should be +prepared to handle this error. This method no longer requires exclusive +access to the table. Also the underlying disk space may not be immediately reclaimed when the call returns. The performance of this API may differ from earlier releases. </dd> @@ -34,6 +33,14 @@ from the WiredTiger release; remaining compression engines include LZ4, snappy and zlib. </dd> +<dt>Change to named checkpoints with bulk loads</dt> +<dd> +Previous versions of WiredTiger created empty named checkpoints in files +being bulk-loaded. In this release, checkpoint skips files being +bulk-loaded, so they do not get named checkpoints that complete during the +bulk load. 
+</dd> + </dl><hr> @section version_270 Upgrading to Version 2.7.0 diff --git a/src/third_party/wiredtiger/src/evict/evict_lru.c b/src/third_party/wiredtiger/src/evict/evict_lru.c index 0536a06bc22..8ef7164dbc6 100644 --- a/src/third_party/wiredtiger/src/evict/evict_lru.c +++ b/src/third_party/wiredtiger/src/evict/evict_lru.c @@ -1209,7 +1209,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp) uint64_t pages_walked; uint32_t walk_flags; int internal_pages, restarts; - bool enough, modified, would_split; + bool enough, modified; conn = S2C(session); btree = S2BT(session); @@ -1298,7 +1298,7 @@ __evict_walk_file(WT_SESSION_IMPL *session, u_int *slotp) page->read_gen = __wt_cache_read_gen_new(session); fast: /* If the page can't be evicted, give up. */ - if (!__wt_page_can_evict(session, ref, &would_split)) + if (!__wt_page_can_evict(session, ref, NULL)) continue; /* diff --git a/src/third_party/wiredtiger/src/include/btree.i b/src/third_party/wiredtiger/src/include/btree.i index 94111397abd..6df7f87073f 100644 --- a/src/third_party/wiredtiger/src/include/btree.i +++ b/src/third_party/wiredtiger/src/include/btree.i @@ -1149,7 +1149,7 @@ __wt_page_can_evict(WT_SESSION_IMPL *session, WT_REF *ref, bool *inmem_splitp) * parent frees the backing blocks for any no-longer-used overflow keys, * which will corrupt the checkpoint's block management. */ - if (btree->checkpointing && + if (btree->checkpointing != WT_CKPT_OFF && F_ISSET_ATOMIC(ref->home, WT_PAGE_OVERFLOW_KEYS)) return (false); @@ -1294,19 +1294,19 @@ __wt_page_swap_func( bool acquired; /* - * In rare cases when walking the tree, we try to swap to the same - * page. Fast-path that to avoid thinking about error handling. - */ - if (held == want) - return (0); - - /* * This function is here to simplify the error handling during hazard * pointer coupling so we never leave a hazard pointer dangling. 
The * assumption is we're holding a hazard pointer on "held", and want to * acquire a hazard pointer on "want", releasing the hazard pointer on * "held" when we're done. + * + * When walking the tree, we sometimes swap to the same page. Fast-path + * that to avoid thinking about error handling. */ + if (held == want) + return (0); + + /* Get the wanted page. */ ret = __wt_page_in_func(session, want, flags #ifdef HAVE_DIAGNOSTIC , file, line @@ -1446,15 +1446,19 @@ __wt_btree_lsm_over_size(WT_SESSION_IMPL *session, uint64_t maxsize) } /* - * __wt_split_intl_race -- + * __wt_split_descent_race -- * Return if we raced with an internal page split when descending the tree. */ static inline bool -__wt_split_intl_race( - WT_SESSION_IMPL *session, WT_PAGE *parent, WT_PAGE_INDEX *saved_pindex) +__wt_split_descent_race( + WT_SESSION_IMPL *session, WT_REF *ref, WT_PAGE_INDEX *saved_pindex) { WT_PAGE_INDEX *pindex; + /* No test when starting the descent (there's no home to check). */ + if (__wt_ref_is_root(ref)) + return (false); + /* * A place to hang this comment... * @@ -1509,6 +1513,6 @@ __wt_split_intl_race( * content the split page retains after the split, and we ignore this * race. */ - WT_INTL_INDEX_GET(session, parent, pindex); + WT_INTL_INDEX_GET(session, ref->home, pindex); return (pindex != saved_pindex); } diff --git a/src/third_party/wiredtiger/src/include/column.i b/src/third_party/wiredtiger/src/include/column.i index 9f3e2101f6f..d64e68420a5 100644 --- a/src/third_party/wiredtiger/src/include/column.i +++ b/src/third_party/wiredtiger/src/include/column.i @@ -11,13 +11,13 @@ * Search a column-store insert list for the next larger record. */ static inline WT_INSERT * -__col_insert_search_gt(WT_INSERT_HEAD *inshead, uint64_t recno) +__col_insert_search_gt(WT_INSERT_HEAD *ins_head, uint64_t recno) { WT_INSERT *ins, **insp; int i; /* If there's no insert chain to search, we're done. 
*/ - if ((ins = WT_SKIP_LAST(inshead)) == NULL) + if ((ins = WT_SKIP_LAST(ins_head)) == NULL) return (NULL); /* Fast path check for targets past the end of the skiplist. */ @@ -29,7 +29,7 @@ __col_insert_search_gt(WT_INSERT_HEAD *inshead, uint64_t recno) * go as far as possible at each level before stepping down to the next. */ ins = NULL; - for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0;) + for (i = WT_SKIP_MAXDEPTH - 1, insp = &ins_head->head[i]; i >= 0;) if (*insp != NULL && recno >= WT_INSERT_RECNO(*insp)) { ins = *insp; /* GTE: keep going at this level */ insp = &(*insp)->next[i]; @@ -50,7 +50,7 @@ __col_insert_search_gt(WT_INSERT_HEAD *inshead, uint64_t recno) * such a record exists before searching. */ if (ins == NULL) - ins = WT_SKIP_FIRST(inshead); + ins = WT_SKIP_FIRST(ins_head); while (recno >= WT_INSERT_RECNO(ins)) ins = WT_SKIP_NEXT(ins); return (ins); @@ -61,13 +61,13 @@ __col_insert_search_gt(WT_INSERT_HEAD *inshead, uint64_t recno) * Search a column-store insert list for the next smaller record. */ static inline WT_INSERT * -__col_insert_search_lt(WT_INSERT_HEAD *inshead, uint64_t recno) +__col_insert_search_lt(WT_INSERT_HEAD *ins_head, uint64_t recno) { WT_INSERT *ins, **insp; int i; /* If there's no insert chain to search, we're done. */ - if ((ins = WT_SKIP_FIRST(inshead)) == NULL) + if ((ins = WT_SKIP_FIRST(ins_head)) == NULL) return (NULL); /* Fast path check for targets before the skiplist. */ @@ -78,7 +78,7 @@ __col_insert_search_lt(WT_INSERT_HEAD *inshead, uint64_t recno) * The insert list is a skip list: start at the highest skip level, then * go as far as possible at each level before stepping down to the next. 
*/ - for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0;) + for (i = WT_SKIP_MAXDEPTH - 1, insp = &ins_head->head[i]; i >= 0;) if (*insp != NULL && recno > WT_INSERT_RECNO(*insp)) { ins = *insp; /* GT: keep going at this level */ insp = &(*insp)->next[i]; @@ -95,14 +95,14 @@ __col_insert_search_lt(WT_INSERT_HEAD *inshead, uint64_t recno) * Search a column-store insert list for an exact match. */ static inline WT_INSERT * -__col_insert_search_match(WT_INSERT_HEAD *inshead, uint64_t recno) +__col_insert_search_match(WT_INSERT_HEAD *ins_head, uint64_t recno) { WT_INSERT **insp, *ret_ins; uint64_t ins_recno; int cmp, i; /* If there's no insert chain to search, we're done. */ - if ((ret_ins = WT_SKIP_LAST(inshead)) == NULL) + if ((ret_ins = WT_SKIP_LAST(ins_head)) == NULL) return (NULL); /* Fast path the check for values at the end of the skiplist. */ @@ -115,7 +115,7 @@ __col_insert_search_match(WT_INSERT_HEAD *inshead, uint64_t recno) * The insert list is a skip list: start at the highest skip level, then * go as far as possible at each level before stepping down to the next. */ - for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0; ) { + for (i = WT_SKIP_MAXDEPTH - 1, insp = &ins_head->head[i]; i >= 0; ) { if (*insp == NULL) { --i; --insp; @@ -143,7 +143,7 @@ __col_insert_search_match(WT_INSERT_HEAD *inshead, uint64_t recno) * Search a column-store insert list, creating a skiplist stack as we go. */ static inline WT_INSERT * -__col_insert_search(WT_INSERT_HEAD *inshead, +__col_insert_search(WT_INSERT_HEAD *ins_head, WT_INSERT ***ins_stack, WT_INSERT **next_stack, uint64_t recno) { WT_INSERT **insp, *ret_ins; @@ -151,15 +151,15 @@ __col_insert_search(WT_INSERT_HEAD *inshead, int cmp, i; /* If there's no insert chain to search, we're done. */ - if ((ret_ins = WT_SKIP_LAST(inshead)) == NULL) + if ((ret_ins = WT_SKIP_LAST(ins_head)) == NULL) return (NULL); /* Fast path appends. 
*/ if (recno >= WT_INSERT_RECNO(ret_ins)) { for (i = 0; i < WT_SKIP_MAXDEPTH; i++) { ins_stack[i] = (i == 0) ? &ret_ins->next[0] : - (inshead->tail[i] != NULL) ? - &inshead->tail[i]->next[i] : &inshead->head[i]; + (ins_head->tail[i] != NULL) ? + &ins_head->tail[i]->next[i] : &ins_head->head[i]; next_stack[i] = NULL; } return (ret_ins); @@ -169,7 +169,7 @@ __col_insert_search(WT_INSERT_HEAD *inshead, * The insert list is a skip list: start at the highest skip level, then * go as far as possible at each level before stepping down to the next. */ - for (i = WT_SKIP_MAXDEPTH - 1, insp = &inshead->head[i]; i >= 0; ) { + for (i = WT_SKIP_MAXDEPTH - 1, insp = &ins_head->head[i]; i >= 0; ) { if ((ret_ins = *insp) == NULL) { next_stack[i] = NULL; ins_stack[i--] = insp--; diff --git a/src/third_party/wiredtiger/src/include/connection.h b/src/third_party/wiredtiger/src/include/connection.h index 88797e83ad6..b0edcef718b 100644 --- a/src/third_party/wiredtiger/src/include/connection.h +++ b/src/third_party/wiredtiger/src/include/connection.h @@ -175,6 +175,7 @@ struct __wt_connection_impl { WT_SPINLOCK checkpoint_lock; /* Checkpoint spinlock */ WT_SPINLOCK dhandle_lock; /* Data handle list spinlock */ WT_SPINLOCK fh_lock; /* File handle queue spinlock */ + WT_SPINLOCK metadata_lock; /* Metadata update spinlock */ WT_SPINLOCK reconfig_lock; /* Single thread reconfigure */ WT_SPINLOCK schema_lock; /* Schema operation spinlock */ WT_SPINLOCK table_lock; /* Table creation spinlock */ diff --git a/src/third_party/wiredtiger/src/include/cursor.h b/src/third_party/wiredtiger/src/include/cursor.h index 7f7b5dceb79..f9bd20c8ba1 100644 --- a/src/third_party/wiredtiger/src/include/cursor.h +++ b/src/third_party/wiredtiger/src/include/cursor.h @@ -289,7 +289,8 @@ struct __wt_cursor_join_iter { WT_CURSOR_JOIN_ENTRY *entry; WT_CURSOR *cursor; WT_ITEM *curkey; - bool advance; + bool positioned; + bool isequal; /* advancing means we're done */ }; struct __wt_cursor_join_endpoint { @@ -302,14 
+303,17 @@ struct __wt_cursor_join_endpoint { #define WT_CURJOIN_END_GT 0x04 /* include values > cursor */ #define WT_CURJOIN_END_GE (WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ) #define WT_CURJOIN_END_LE (WT_CURJOIN_END_LT | WT_CURJOIN_END_EQ) -#define WT_CURJOIN_END_OWN_KEY 0x08 /* must free key's data */ uint8_t flags; /* range for this endpoint */ }; +#define WT_CURJOIN_END_RANGE(endp) \ + ((endp)->flags & \ + (WT_CURJOIN_END_GT | WT_CURJOIN_END_EQ | WT_CURJOIN_END_LT)) struct __wt_cursor_join_entry { WT_INDEX *index; WT_CURSOR *main; /* raw main table cursor */ WT_BLOOM *bloom; /* Bloom filter handle */ + char *repack_format; /* target format for repack */ uint32_t bloom_bit_count; /* bits per item in bloom */ uint32_t bloom_hash_count; /* hash functions in bloom */ uint64_t count; /* approx number of matches */ diff --git a/src/third_party/wiredtiger/src/include/extern.h b/src/third_party/wiredtiger/src/include/extern.h index 1999ff6b732..07b4adfe698 100644 --- a/src/third_party/wiredtiger/src/include/extern.h +++ b/src/third_party/wiredtiger/src/include/extern.h @@ -190,7 +190,7 @@ extern int __wt_row_insert_alloc(WT_SESSION_IMPL *session, WT_ITEM *key, u_int s extern int __wt_update_alloc( WT_SESSION_IMPL *session, WT_ITEM *value, WT_UPDATE **updp, size_t *sizep); extern WT_UPDATE *__wt_update_obsolete_check( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd); extern void __wt_update_obsolete_free( WT_SESSION_IMPL *session, WT_PAGE *page, WT_UPDATE *upd); -extern int __wt_search_insert( WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_ITEM *srch_key); +extern int __wt_search_insert(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt, WT_INSERT_HEAD *ins_head, WT_ITEM *srch_key); extern int __wt_row_search(WT_SESSION_IMPL *session, WT_ITEM *srch_key, WT_REF *leaf, WT_CURSOR_BTREE *cbt, bool insert); extern int __wt_row_random_leaf(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt); extern int __wt_row_random_descent(WT_SESSION_IMPL *session, WT_CURSOR_BTREE *cbt); 
@@ -253,8 +253,6 @@ extern int __wt_conn_dhandle_find( WT_SESSION_IMPL *session, const char *uri, co extern int __wt_conn_btree_sync_and_close(WT_SESSION_IMPL *session, bool final, bool force); extern int __wt_conn_btree_open( WT_SESSION_IMPL *session, const char *cfg[], uint32_t flags); extern int __wt_conn_btree_apply(WT_SESSION_IMPL *session, bool apply_checkpoints, const char *uri, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]); -extern int __wt_conn_btree_apply_single_ckpt(WT_SESSION_IMPL *session, const char *uri, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]); -extern int __wt_conn_btree_apply_single(WT_SESSION_IMPL *session, const char *uri, const char *checkpoint, int (*func)(WT_SESSION_IMPL *, const char *[]), const char *cfg[]); extern int __wt_conn_dhandle_close_all( WT_SESSION_IMPL *session, const char *uri, bool force); extern int __wt_conn_dhandle_discard_single( WT_SESSION_IMPL *session, bool final, bool force); extern int __wt_conn_dhandle_discard(WT_SESSION_IMPL *session); @@ -553,7 +551,7 @@ extern int __wt_struct_size(WT_SESSION_IMPL *session, size_t *sizep, const char extern int __wt_struct_pack(WT_SESSION_IMPL *session, void *buffer, size_t size, const char *fmt, ...); extern int __wt_struct_unpack(WT_SESSION_IMPL *session, const void *buffer, size_t size, const char *fmt, ...); extern int __wt_struct_unpack_size(WT_SESSION_IMPL *session, const void *buffer, size_t size, const char *fmt, size_t *resultp); -extern int __wt_struct_repack(WT_SESSION_IMPL *session, const char *infmt, const char *outfmt, const WT_ITEM *inbuf, WT_ITEM *outbuf, void **reallocp); +extern int __wt_struct_repack(WT_SESSION_IMPL *session, const char *infmt, const char *outfmt, const WT_ITEM *inbuf, WT_ITEM *outbuf); extern int __wt_ovfl_discard_add(WT_SESSION_IMPL *session, WT_PAGE *page, WT_CELL *cell); extern void __wt_ovfl_discard_free(WT_SESSION_IMPL *session, WT_PAGE *page); extern int __wt_ovfl_reuse_search(WT_SESSION_IMPL 
*session, WT_PAGE *page, uint8_t **addrp, size_t *addr_sizep, const void *value, size_t value_size); diff --git a/src/third_party/wiredtiger/src/include/flags.h b/src/third_party/wiredtiger/src/include/flags.h index 24fae4abccd..e610c07f432 100644 --- a/src/third_party/wiredtiger/src/include/flags.h +++ b/src/third_party/wiredtiger/src/include/flags.h @@ -55,20 +55,21 @@ #define WT_SESSION_INTERNAL 0x00000004 #define WT_SESSION_LOCKED_CHECKPOINT 0x00000008 #define WT_SESSION_LOCKED_HANDLE_LIST 0x00000010 -#define WT_SESSION_LOCKED_SCHEMA 0x00000020 -#define WT_SESSION_LOCKED_SLOT 0x00000040 -#define WT_SESSION_LOCKED_TABLE 0x00000080 -#define WT_SESSION_LOCKED_TURTLE 0x00000100 -#define WT_SESSION_LOCK_NO_WAIT 0x00000200 -#define WT_SESSION_LOGGING_INMEM 0x00000400 -#define WT_SESSION_LOOKASIDE_CURSOR 0x00000800 -#define WT_SESSION_NO_CACHE 0x00001000 -#define WT_SESSION_NO_DATA_HANDLES 0x00002000 -#define WT_SESSION_NO_EVICTION 0x00004000 -#define WT_SESSION_NO_LOGGING 0x00008000 -#define WT_SESSION_NO_SCHEMA_LOCK 0x00010000 -#define WT_SESSION_QUIET_CORRUPT_FILE 0x00020000 -#define WT_SESSION_SERVER_ASYNC 0x00040000 +#define WT_SESSION_LOCKED_METADATA 0x00000020 +#define WT_SESSION_LOCKED_SCHEMA 0x00000040 +#define WT_SESSION_LOCKED_SLOT 0x00000080 +#define WT_SESSION_LOCKED_TABLE 0x00000100 +#define WT_SESSION_LOCKED_TURTLE 0x00000200 +#define WT_SESSION_LOCK_NO_WAIT 0x00000400 +#define WT_SESSION_LOGGING_INMEM 0x00000800 +#define WT_SESSION_LOOKASIDE_CURSOR 0x00001000 +#define WT_SESSION_NO_CACHE 0x00002000 +#define WT_SESSION_NO_DATA_HANDLES 0x00004000 +#define WT_SESSION_NO_EVICTION 0x00008000 +#define WT_SESSION_NO_LOGGING 0x00010000 +#define WT_SESSION_NO_SCHEMA_LOCK 0x00020000 +#define WT_SESSION_QUIET_CORRUPT_FILE 0x00040000 +#define WT_SESSION_SERVER_ASYNC 0x00080000 #define WT_TXN_LOG_CKPT_CLEANUP 0x00000001 #define WT_TXN_LOG_CKPT_PREPARE 0x00000002 #define WT_TXN_LOG_CKPT_START 0x00000004 diff --git a/src/third_party/wiredtiger/src/include/meta.h 
b/src/third_party/wiredtiger/src/include/meta.h index d61022c0c44..ac0f5fedac4 100644 --- a/src/third_party/wiredtiger/src/include/meta.h +++ b/src/third_party/wiredtiger/src/include/meta.h @@ -21,6 +21,7 @@ #define WT_METADATA_TURTLE_SET "WiredTiger.turtle.set" /* Turtle temp file */ #define WT_METADATA_URI "metadata:" /* Metadata alias */ +#define WT_METAFILE "WiredTiger.wt" /* Metadata table */ #define WT_METAFILE_URI "file:WiredTiger.wt" /* Metadata table URI */ #define WT_LAS_URI "file:WiredTigerLAS.wt" /* Lookaside table URI*/ diff --git a/src/third_party/wiredtiger/src/include/schema.h b/src/third_party/wiredtiger/src/include/schema.h index a51030870c1..f93c596e2ca 100644 --- a/src/third_party/wiredtiger/src/include/schema.h +++ b/src/third_party/wiredtiger/src/include/schema.h @@ -133,6 +133,14 @@ struct __wt_table { &S2C(session)->dhandle_lock, WT_SESSION_LOCKED_HANDLE_LIST, op) /* + * WT_WITH_METADATA_LOCK -- + * Acquire the metadata lock, perform an operation, drop the lock. + */ +#define WT_WITH_METADATA_LOCK(session, ret, op) \ + WT_WITH_LOCK(session, ret, \ + &S2C(session)->metadata_lock, WT_SESSION_LOCKED_METADATA, op) + +/* * WT_WITH_SCHEMA_LOCK -- * Acquire the schema lock, perform an operation, drop the lock. 
* Check that we are not already holding some other lock: the schema lock @@ -166,6 +174,8 @@ struct __wt_table { */ #define WT_WITHOUT_LOCKS(session, op) do { \ WT_CONNECTION_IMPL *__conn = S2C(session); \ + bool __checkpoint_locked = \ + F_ISSET(session, WT_SESSION_LOCKED_CHECKPOINT); \ bool __handle_locked = \ F_ISSET(session, WT_SESSION_LOCKED_HANDLE_LIST); \ bool __table_locked = \ @@ -184,7 +194,15 @@ struct __wt_table { F_CLR(session, WT_SESSION_LOCKED_SCHEMA); \ __wt_spin_unlock(session, &__conn->schema_lock); \ } \ + if (__checkpoint_locked) { \ + F_CLR(session, WT_SESSION_LOCKED_CHECKPOINT); \ + __wt_spin_unlock(session, &__conn->checkpoint_lock); \ + } \ op; \ + if (__checkpoint_locked) { \ + __wt_spin_lock(session, &__conn->checkpoint_lock); \ + F_SET(session, WT_SESSION_LOCKED_CHECKPOINT); \ + } \ if (__schema_locked) { \ __wt_spin_lock(session, &__conn->schema_lock); \ F_SET(session, WT_SESSION_LOCKED_SCHEMA); \ diff --git a/src/third_party/wiredtiger/src/include/session.h b/src/third_party/wiredtiger/src/include/session.h index 5c3291230b4..b3c475805a4 100644 --- a/src/third_party/wiredtiger/src/include/session.h +++ b/src/third_party/wiredtiger/src/include/session.h @@ -127,10 +127,7 @@ struct WT_COMPILER_TYPE_ALIGN(WT_CACHE_LINE_ALIGNMENT) __wt_session_impl { int (*block_manager_cleanup)(WT_SESSION_IMPL *); /* Checkpoint support */ - struct { - WT_DATA_HANDLE *dhandle; - const char *name; - } *ckpt_handle; /* Handle list */ + WT_DATA_HANDLE **ckpt_handle; /* Handle list */ u_int ckpt_handle_next; /* Next empty slot */ size_t ckpt_handle_allocated; /* Bytes allocated */ diff --git a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c index d5d81df6785..099bde176f7 100644 --- a/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c +++ b/src/third_party/wiredtiger/src/lsm/lsm_work_unit.c @@ -334,14 +334,27 @@ __wt_lsm_checkpoint_chunk(WT_SESSION_IMPL *session, /* * Turn on metadata tracking to ensure the 
checkpoint gets the * necessary handle locks. + * + * Ensure that we don't race with a running checkpoint: the checkpoint + * lock protects against us racing with an application checkpoint in + * this chunk. Don't wait for it, though: checkpoints can take a long + * time, and our checkpoint operation should be very quick. */ WT_ERR(__wt_meta_track_on(session)); - WT_WITH_SCHEMA_LOCK(session, ret, - ret = __wt_schema_worker( - session, chunk->uri, __wt_checkpoint, NULL, NULL, 0)); + F_SET(session, WT_SESSION_LOCK_NO_WAIT); + WT_WITH_CHECKPOINT_LOCK(session, ret, + WT_WITH_SCHEMA_LOCK(session, ret, + ret = __wt_schema_worker( + session, chunk->uri, __wt_checkpoint, NULL, NULL, 0))); WT_TRET(__wt_meta_track_off(session, false, ret != 0)); - if (ret != 0) + F_CLR(session, WT_SESSION_LOCK_NO_WAIT); + if (ret != 0) { + if (ret == EBUSY) { + ret = 0; + goto err; + } WT_ERR_MSG(session, ret, "LSM checkpoint"); + } /* Now the file is written, get the chunk size. */ WT_ERR(__wt_lsm_tree_set_chunk_size(session, chunk)); diff --git a/src/third_party/wiredtiger/src/meta/meta_apply.c b/src/third_party/wiredtiger/src/meta/meta_apply.c index 92766213b33..7722cd55fbd 100644 --- a/src/third_party/wiredtiger/src/meta/meta_apply.c +++ b/src/third_party/wiredtiger/src/meta/meta_apply.c @@ -37,17 +37,15 @@ __meta_btree_apply(WT_SESSION_IMPL *session, WT_CURSOR *cursor, * dropping the handle (e.g in LSM when cleaning up obsolete * chunks). Holding the metadata lock isn't enough. */ - ret = __wt_session_get_btree(session, uri, NULL, NULL, 0); - if (ret == 0) { - WT_SAVE_DHANDLE(session, ret = func(session, cfg)); - if (WT_META_TRACKING(session)) - WT_TRET(__wt_meta_track_handle_lock( - session, false)); - else - WT_TRET(__wt_session_release_btree(session)); - } else if (ret == EBUSY) - ret = __wt_conn_btree_apply_single( - session, uri, NULL, func, cfg); + if ((ret = __wt_session_get_btree( + session, uri, NULL, NULL, 0)) != 0) + return (ret == EBUSY ? 
0 : ret); + WT_SAVE_DHANDLE(session, ret = func(session, cfg)); + if (WT_META_TRACKING(session)) + WT_TRET(__wt_meta_track_handle_lock( + session, false)); + else + WT_TRET(__wt_session_release_btree(session)); WT_RET(ret); } WT_RET_NOTFOUND_OK(ret); diff --git a/src/third_party/wiredtiger/src/meta/meta_track.c b/src/third_party/wiredtiger/src/meta/meta_track.c index 1baab2deae1..a73b7e09d37 100644 --- a/src/third_party/wiredtiger/src/meta/meta_track.c +++ b/src/third_party/wiredtiger/src/meta/meta_track.c @@ -284,11 +284,12 @@ __wt_meta_track_off(WT_SESSION_IMPL *session, bool need_sync, bool unroll) * should be included in the checkpoint. */ ckpt_session->txn.id = session->txn.id; - F_SET(ckpt_session, WT_SESSION_LOCKED_SCHEMA); - WT_WITH_DHANDLE(ckpt_session, - WT_SESSION_META_DHANDLE(session), - ret = __wt_checkpoint(ckpt_session, NULL)); - F_CLR(ckpt_session, WT_SESSION_LOCKED_SCHEMA); + F_SET(ckpt_session, WT_SESSION_LOCKED_METADATA); + WT_WITH_METADATA_LOCK(session, ret, + WT_WITH_DHANDLE(ckpt_session, + WT_SESSION_META_DHANDLE(session), + ret = __wt_checkpoint(ckpt_session, NULL))); + F_CLR(ckpt_session, WT_SESSION_LOCKED_METADATA); ckpt_session->txn.id = WT_TXN_NONE; WT_RET(ret); WT_WITH_DHANDLE(session, diff --git a/src/third_party/wiredtiger/src/meta/meta_turtle.c b/src/third_party/wiredtiger/src/meta/meta_turtle.c index 7182bb0fe5f..3d27f0b5845 100644 --- a/src/third_party/wiredtiger/src/meta/meta_turtle.c +++ b/src/third_party/wiredtiger/src/meta/meta_turtle.c @@ -153,10 +153,11 @@ int __wt_turtle_init(WT_SESSION_IMPL *session) { WT_DECL_RET; - bool exist, exist_incr; + bool exist_backup, exist_incr, exist_turtle, load; char *metaconf; metaconf = NULL; + load = false; /* * Discard any turtle setup file left-over from previous runs. This @@ -179,13 +180,29 @@ __wt_turtle_init(WT_SESSION_IMPL *session) * done. 
*/ WT_RET(__wt_exist(session, WT_INCREMENTAL_BACKUP, &exist_incr)); - WT_RET(__wt_exist(session, WT_METADATA_TURTLE, &exist)); - if (exist) { + WT_RET(__wt_exist(session, WT_METADATA_BACKUP, &exist_backup)); + WT_RET(__wt_exist(session, WT_METADATA_TURTLE, &exist_turtle)); + if (exist_turtle) { if (exist_incr) WT_RET_MSG(session, EINVAL, "Incremental backup after running recovery " "is not allowed."); - } else { + /* + * If we have a backup file and metadata and turtle files, + * we want to recreate the metadata from the backup. + */ + if (exist_backup) { + WT_RET(__wt_msg(session, "Both %s and %s exist. " + "Recreating metadata from backup.", + WT_METADATA_TURTLE, WT_METADATA_BACKUP)); + WT_RET(__wt_remove_if_exists(session, WT_METAFILE)); + WT_RET(__wt_remove_if_exists( + session, WT_METADATA_TURTLE)); + load = true; + } + } else + load = true; + if (load) { if (exist_incr) F_SET(S2C(session), WT_CONN_WAS_BACKUP); diff --git a/src/third_party/wiredtiger/src/packing/pack_impl.c b/src/third_party/wiredtiger/src/packing/pack_impl.c index 0e3ed44ba6a..2bd850bfc9a 100644 --- a/src/third_party/wiredtiger/src/packing/pack_impl.c +++ b/src/third_party/wiredtiger/src/packing/pack_impl.c @@ -144,70 +144,43 @@ __wt_struct_unpack_size(WT_SESSION_IMPL *session, */ int __wt_struct_repack(WT_SESSION_IMPL *session, const char *infmt, - const char *outfmt, const WT_ITEM *inbuf, WT_ITEM *outbuf, void **reallocp) + const char *outfmt, const WT_ITEM *inbuf, WT_ITEM *outbuf) { WT_DECL_PACK_VALUE(pvin); WT_DECL_PACK_VALUE(pvout); WT_DECL_RET; WT_PACK packin, packout; const uint8_t *before, *end, *p; - uint8_t *pout; - size_t len; const void *start; start = NULL; p = inbuf->data; end = p + inbuf->size; - /* - * Handle this non-contiguous case: 'U' -> 'u' at the end of the buf. - * The former case has the size embedded before the item, the latter - * does not. 
- */ - if ((len = strlen(outfmt)) > 1 && outfmt[len - 1] == 'u' && - strlen(infmt) > len && infmt[len - 1] == 'U') { - WT_ERR(__wt_realloc(session, NULL, inbuf->size, reallocp)); - pout = *reallocp; - } else - pout = NULL; - - WT_ERR(__pack_init(session, &packout, outfmt)); - WT_ERR(__pack_init(session, &packin, infmt)); + WT_RET(__pack_init(session, &packout, outfmt)); + WT_RET(__pack_init(session, &packin, infmt)); /* Outfmt should complete before infmt */ while ((ret = __pack_next(&packout, &pvout)) == 0) { if (p >= end) - WT_ERR(EINVAL); - WT_ERR(__pack_next(&packin, &pvin)); + WT_RET(EINVAL); + if (pvout.type == 'x' && pvout.size == 0 && pvout.havesize) + continue; + WT_RET(__pack_next(&packin, &pvin)); before = p; - WT_ERR(__unpack_read(session, &pvin, &p, (size_t)(end - p))); - if (pvout.type != pvin.type) { - if (pvout.type == 'u' && pvin.type == 'U') { - /* Skip the prefixed size, we don't need it */ - WT_ERR(__wt_struct_unpack_size(session, before, - (size_t)(end - before), "I", &len)); - before += len; - } else - WT_ERR(ENOTSUP); - } - if (pout != NULL) { - memcpy(pout, before, WT_PTRDIFF(p, before)); - pout += p - before; - } else if (start == NULL) + WT_RET(__unpack_read(session, &pvin, &p, (size_t)(end - p))); + if (pvout.type != pvin.type) + WT_RET(ENOTSUP); + if (start == NULL) start = before; } - WT_ERR_NOTFOUND_OK(ret); + WT_RET_NOTFOUND_OK(ret); /* Be paranoid - __pack_write should never overflow. 
*/ WT_ASSERT(session, p <= end); - if (pout != NULL) { - outbuf->data = *reallocp; - outbuf->size = WT_PTRDIFF(pout, *reallocp); - } else { - outbuf->data = start; - outbuf->size = WT_PTRDIFF(p, start); - } + outbuf->data = start; + outbuf->size = WT_PTRDIFF(p, start); -err: return (ret); + return (0); } diff --git a/src/third_party/wiredtiger/src/reconcile/rec_write.c b/src/third_party/wiredtiger/src/reconcile/rec_write.c index c25d7b5e493..f245ff5d921 100644 --- a/src/third_party/wiredtiger/src/reconcile/rec_write.c +++ b/src/third_party/wiredtiger/src/reconcile/rec_write.c @@ -363,6 +363,17 @@ __wt_reconcile(WT_SESSION_IMPL *session, WT_ASSERT(session, __wt_page_is_modified(page)); /* + * Reconciliation locks the page for three reasons: + * Reconciliation reads the lists of page updates, obsolete updates + * cannot be discarded while reconciliation is in progress; + * The compaction process reads page modification information, which + * reconciliation modifies; + * In-memory splits: reconciliation of an internal page cannot handle + * a child page splitting during the reconciliation. + */ + WT_RET(__wt_fair_lock(session, &page->page_lock)); + + /* * Check that transaction time always moves forward for a given page. * If this check fails, reconciliation can free something that a future * reconciliation will need. @@ -376,17 +387,6 @@ __wt_reconcile(WT_SESSION_IMPL *session, session, ref, flags, salvage, &session->reconcile)); r = session->reconcile; - /* - * Reconciliation locks the page for three reasons: - * Reconciliation reads the lists of page updates, obsolete updates - * cannot be discarded while reconciliation is in progress; - * The compaction process reads page modification information, which - * reconciliation modifies; - * In-memory splits: reconciliation of an internal page cannot handle - * a child page splitting during the reconciliation. - */ - WT_RET(__wt_fair_lock(session, &page->page_lock)); - /* Reconcile the page. 
*/ switch (page->type) { case WT_PAGE_COL_FIX: @@ -1313,7 +1313,7 @@ __rec_txn_read(WT_SESSION_IMPL *session, WT_RECONCILE *r, } while (0) typedef enum { - WT_CHILD_IGNORE, /* Deleted child: ignore */ + WT_CHILD_IGNORE, /* Ignored child */ WT_CHILD_MODIFIED, /* Modified child */ WT_CHILD_ORIGINAL, /* Original child */ WT_CHILD_PROXY /* Deleted child: proxy */ @@ -1450,16 +1450,15 @@ __rec_child_modify(WT_SESSION_IMPL *session, /* * This function is called when walking an internal page to decide how - * to handle child pages referenced by the internal page, specifically - * if the child page is to be merged into its parent. + * to handle child pages referenced by the internal page. * * Internal pages are reconciled for two reasons: first, when evicting * an internal page, second by the checkpoint code when writing internal - * pages. During eviction, the subtree is locked down so all pages - * should be in the WT_REF_DISK or WT_REF_LOCKED state. During - * checkpoint, any eviction that might affect our review of an internal - * page is prohibited, however, as the subtree is not reserved for our - * exclusive use, there are other page states that must be considered. + * pages. During eviction, all pages should be in the WT_REF_DISK or + * WT_REF_DELETED state. During checkpoint, eviction that might affect + * review of an internal page is prohibited, however, as the subtree is + * not reserved for our exclusive use, there are other page states that + * must be considered. */ for (;; __wt_yield()) switch (r->tested_ref_state = ref->state) { @@ -1488,15 +1487,14 @@ __rec_child_modify(WT_SESSION_IMPL *session, /* * Locked. * - * If evicting, the evicted page's subtree, including - * this child, was selected for eviction by us and the - * state is stable until we reset it, it's an in-memory - * state. This is the expected state for a child being - * merged into a page (where the page was selected by - * the eviction server for eviction). 
+ * We should never be here during eviction; active child + * pages in an evicted page's subtree fail the eviction + * attempt. */ - if (F_ISSET(r, WT_EVICTING)) - goto in_memory; + if (F_ISSET(r, WT_EVICTING)) { + WT_ASSERT(session, !F_ISSET(r, WT_EVICTING)); + return (EBUSY); + } /* * If called during checkpoint, the child is being @@ -1514,24 +1512,21 @@ __rec_child_modify(WT_SESSION_IMPL *session, /* * In memory. * - * If evicting, the evicted page's subtree, including - * this child, was selected for eviction by us and the - * state is stable until we reset it, it's an in-memory - * state. This is the expected state for a child being - * merged into a page (where the page belongs to a file - * being discarded from the cache during close). + * We should never be here during eviction; active child + * pages in an evicted page's subtree fail the eviction + * attempt. */ - if (F_ISSET(r, WT_EVICTING)) - goto in_memory; + if (F_ISSET(r, WT_EVICTING)) { + WT_ASSERT(session, !F_ISSET(r, WT_EVICTING)); + return (EBUSY); + } /* * If called during checkpoint, acquire a hazard pointer * so the child isn't evicted, it's an in-memory case. * - * This call cannot return split/restart, eviction of - * pages that split into their parent is shutout during - * checkpoint, all splits in process will have completed - * before we walk any pages for checkpoint. + * This call cannot return split/restart, we have a lock + * on the parent which prevents a child page split. */ ret = __wt_page_in(session, ref, WT_READ_CACHE | WT_READ_NO_EVICT | @@ -1548,29 +1543,31 @@ __rec_child_modify(WT_SESSION_IMPL *session, /* * Being read, not modified by definition. * - * We should never be here during eviction, a child page - * in this state within an evicted page's subtree would - * have caused normally eviction to fail, and exclusive - * eviction shouldn't ever see pages being read. 
+ * We should never be here during eviction; active child + * pages in an evicted page's subtree fail the eviction + * attempt. */ - WT_ASSERT(session, !F_ISSET(r, WT_EVICTING)); + if (F_ISSET(r, WT_EVICTING)) { + WT_ASSERT(session, !F_ISSET(r, WT_EVICTING)); + return (EBUSY); + } goto done; case WT_REF_SPLIT: /* * The page was split out from under us. * - * We should never be here during eviction, a child page - * in this state within an evicted page's subtree would - * have caused eviction to fail. + * We should never be here during eviction; active child + * pages in an evicted page's subtree fail the eviction + * attempt. * * We should never be here during checkpoint, dirty page * eviction is shutout during checkpoint, all splits in * process will have completed before we walk any pages * for checkpoint. */ - WT_ASSERT(session, ref->state != WT_REF_SPLIT); - /* FALLTHROUGH */ + WT_ASSERT(session, WT_REF_SPLIT != WT_REF_SPLIT); + return (EBUSY); WT_ILLEGAL_VALUE(session); } @@ -1581,11 +1578,21 @@ in_memory: * modify structure has been instantiated. If the modify structure * exists and the page has actually been modified, set that state. * If that's not the case, we would normally use the original cell's - * disk address as our reference, but, if we're forced to instantiate - * a deleted child page and it's never modified, we end up here with - * a page that has a modify structure, no modifications, and no disk - * address. Ignore those pages, they're not modified and there is no - * reason to write the cell. + * disk address as our reference, however there are two special cases, + * both flagged by a missing block address. + * + * First, if forced to instantiate a deleted child page and it's never + * modified, we end up here with a page that has a modify structure, no + * modifications, and no disk address. Ignore those pages, they're not + * modified and there is no reason to write the cell. + * + * Second, insert splits are permitted during checkpoint. 
When doing the + * final checkpoint pass, we first walk the internal page's page-index + * and write out any dirty pages we find, then we write out the internal + * page in post-order traversal. If we found the split page in the first + * step, it will have an address; if we didn't find the split page in + * the first step, it won't have an address and we ignore it, it's not + * part of the checkpoint. */ mod = ref->page->modify; if (mod != NULL && mod->rec_result != 0) @@ -3808,7 +3815,7 @@ __rec_col_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) switch (state) { case WT_CHILD_IGNORE: - /* Deleted child we don't have to write. */ + /* Ignored child. */ WT_CHILD_RELEASE_ERR(session, hazard, ref); continue; @@ -3977,7 +3984,7 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) * record 100 moves to another page. When we reconcile * the original page, we write record 98, then we don't * see record 99 for whatever reason. If we've moved - * record 1000, we don't know to write a deleted record + * record 100, we don't know to write a deleted record * 99 on the page.) * * The record number recorded during the split is the @@ -3999,8 +4006,6 @@ __rec_col_fix(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) } else { WT_RET( __rec_txn_read(session, r, ins, NULL, NULL, &upd)); - if (upd == NULL) - continue; recno = WT_INSERT_RECNO(ins); } for (;;) { @@ -4536,22 +4541,25 @@ compare: /* * record 100 moves to another page. When we reconcile * the original page, we write record 98, then we don't * see record 99 for whatever reason. If we've moved - * record 1000, we don't know to write a deleted record + * record 100, we don't know to write a deleted record * 99 on the page.) * + * Assert the recorded record number is past the end of + * the page. + * * The record number recorded during the split is the * first key on the split page, that is, one larger than * the last key on this page, we have to decrement it. 
*/ if ((n = page->modify->mod_split_recno) == WT_RECNO_OOB) break; + WT_ASSERT(session, n >= src_recno); n -= 1; + upd = NULL; } else { WT_ERR( __rec_txn_read(session, r, ins, NULL, NULL, &upd)); - if (upd == NULL) - continue; n = WT_INSERT_RECNO(ins); } while (src_recno <= n) { @@ -4734,10 +4742,10 @@ __rec_row_int(WT_SESSION_IMPL *session, WT_RECONCILE *r, WT_PAGE *page) switch (state) { case WT_CHILD_IGNORE: /* - * Deleted child we don't have to write. + * Ignored child. * - * Overflow keys referencing discarded pages are no - * longer useful, schedule them for discard. Don't + * Overflow keys referencing pages we're not writing are + * no longer useful, schedule them for discard. Don't * worry about instantiation, internal page keys are * always instantiated. Don't worry about reuse, * reusing this key in this reconciliation is unlikely. diff --git a/src/third_party/wiredtiger/src/schema/schema_create.c b/src/third_party/wiredtiger/src/schema/schema_create.c index 25bbd496798..9b3b76b62de 100644 --- a/src/third_party/wiredtiger/src/schema/schema_create.c +++ b/src/third_party/wiredtiger/src/schema/schema_create.c @@ -275,15 +275,11 @@ __create_colgroup(WT_SESSION_IMPL *session, WT_ERR(__wt_schema_create(session, source, sourceconf)); WT_ERR(__wt_config_collapse(session, cfg, &cgconf)); - if (exists) { - if (strcmp(cgconf, origconf) != 0) - WT_ERR_MSG(session, EINVAL, - "%s: does not match existing configuration", name); - goto err; - } - WT_ERR(__wt_metadata_insert(session, name, cgconf)); - WT_ERR(__wt_schema_open_colgroups(session, table)); + if (!exists) { + WT_ERR(__wt_metadata_insert(session, name, cgconf)); + WT_ERR(__wt_schema_open_colgroups(session, table)); + } err: __wt_free(session, cgconf); __wt_free(session, sourceconf); @@ -539,20 +535,17 @@ __create_index(WT_SESSION_IMPL *session, cfg[1] = sourceconf; cfg[2] = confbuf.data; WT_ERR(__wt_config_collapse(session, cfg, &idxconf)); - if (exists) { - if (strcmp(idxconf, origconf) != 0) - 
WT_ERR_MSG(session, EINVAL, - "%s: does not match existing configuration", name); - goto err; - } - WT_ERR(__wt_metadata_insert(session, name, idxconf)); - /* Make sure that the configuration is valid. */ - WT_ERR(__wt_schema_open_index( - session, table, idxname, strlen(idxname), &idx)); + if (!exists) { + WT_ERR(__wt_metadata_insert(session, name, idxconf)); - /* If there is data in the table, fill the index. */ - WT_ERR(__fill_index(session, table, idx)); + /* Make sure that the configuration is valid. */ + WT_ERR(__wt_schema_open_index( + session, table, idxname, strlen(idxname), &idx)); + + /* If there is data in the table, fill the index. */ + WT_ERR(__fill_index(session, table, idx)); + } err: __wt_free(session, idxconf); __wt_free(session, origconf); @@ -612,23 +605,21 @@ __create_table(WT_SESSION_IMPL *session, WT_ERR_NOTFOUND_OK(ret); WT_ERR(__wt_config_collapse(session, cfg, &tableconf)); - if (exists) { - if (strcmp(tableconf, table->config) != 0) - WT_ERR_MSG(session, EINVAL, - "%s: does not match existing configuration", name); - goto err; - } - WT_ERR(__wt_metadata_insert(session, name, tableconf)); - /* Attempt to open the table now to catch any errors. */ - WT_ERR(__wt_schema_get_table( - session, tablename, strlen(tablename), true, &table)); + if (!exists) { + WT_ERR(__wt_metadata_insert(session, name, tableconf)); + + /* Attempt to open the table now to catch any errors. 
*/ + WT_ERR(__wt_schema_get_table( + session, tablename, strlen(tablename), true, &table)); - if (ncolgroups == 0) { - cgsize = strlen("colgroup:") + strlen(tablename) + 1; - WT_ERR(__wt_calloc_def(session, cgsize, &cgname)); - snprintf(cgname, cgsize, "colgroup:%s", tablename); - WT_ERR(__create_colgroup(session, cgname, exclusive, config)); + if (ncolgroups == 0) { + cgsize = strlen("colgroup:") + strlen(tablename) + 1; + WT_ERR(__wt_calloc_def(session, cgsize, &cgname)); + snprintf(cgname, cgsize, "colgroup:%s", tablename); + WT_ERR(__create_colgroup( + session, cgname, exclusive, config)); + } } if (0) { diff --git a/src/third_party/wiredtiger/src/schema/schema_worker.c b/src/third_party/wiredtiger/src/schema/schema_worker.c index b5ee3bb7f7d..e60a7107786 100644 --- a/src/third_party/wiredtiger/src/schema/schema_worker.c +++ b/src/third_party/wiredtiger/src/schema/schema_worker.c @@ -55,18 +55,11 @@ __wt_schema_worker(WT_SESSION_IMPL *session, WT_ERR(ret); } - if ((ret = __wt_session_get_btree_ckpt( - session, uri, cfg, open_flags)) == 0) { - WT_SAVE_DHANDLE(session, - ret = file_func(session, cfg)); - WT_TRET(__wt_session_release_btree(session)); - } else if (ret == EBUSY) { - WT_ASSERT(session, !FLD_ISSET( - open_flags, WT_DHANDLE_EXCLUSIVE)); - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_conn_btree_apply_single_ckpt( - session, uri, file_func, cfg)); - } + WT_ERR(__wt_session_get_btree_ckpt( + session, uri, cfg, open_flags)); + WT_SAVE_DHANDLE(session, + ret = file_func(session, cfg)); + WT_TRET(__wt_session_release_btree(session)); WT_ERR(ret); } } else if (WT_PREFIX_MATCH(uri, "colgroup:")) { diff --git a/src/third_party/wiredtiger/src/session/session_api.c b/src/third_party/wiredtiger/src/session/session_api.c index c03b5fdc044..5511674dc5e 100644 --- a/src/third_party/wiredtiger/src/session/session_api.c +++ b/src/third_party/wiredtiger/src/session/session_api.c @@ -597,9 +597,10 @@ __session_rename(WT_SESSION *wt_session, 
WT_ERR(__wt_str_name_check(session, uri)); WT_ERR(__wt_str_name_check(session, newuri)); - WT_WITH_SCHEMA_LOCK(session, ret, - WT_WITH_TABLE_LOCK(session, ret, - ret = __wt_schema_rename(session, uri, newuri, cfg))); + WT_WITH_CHECKPOINT_LOCK(session, ret, + WT_WITH_SCHEMA_LOCK(session, ret, + WT_WITH_TABLE_LOCK(session, ret, + ret = __wt_schema_rename(session, uri, newuri, cfg)))); err: API_END_RET_NOTFOUND_MAP(session, ret); } @@ -646,9 +647,10 @@ __wt_session_drop(WT_SESSION_IMPL *session, const char *uri, const char *cfg[]) if (!lock_wait) F_SET(session, WT_SESSION_LOCK_NO_WAIT); - WT_WITH_SCHEMA_LOCK(session, ret, - WT_WITH_TABLE_LOCK(session, ret, - ret = __wt_schema_drop(session, uri, cfg))); + WT_WITH_CHECKPOINT_LOCK(session, ret, + WT_WITH_SCHEMA_LOCK(session, ret, + WT_WITH_TABLE_LOCK(session, ret, + ret = __wt_schema_drop(session, uri, cfg)))); if (!lock_wait) F_CLR(session, WT_SESSION_LOCK_NO_WAIT); diff --git a/src/third_party/wiredtiger/src/support/cksum.c b/src/third_party/wiredtiger/src/support/cksum.c index c2982c40015..0b086753406 100644 --- a/src/third_party/wiredtiger/src/support/cksum.c +++ b/src/third_party/wiredtiger/src/support/cksum.c @@ -1260,6 +1260,23 @@ __wt_cksum_hw(const void *chunk, size_t len) } #endif +#if defined(__powerpc64__) + +unsigned int crc32_vpmsum(unsigned int crc, const unsigned char *p, + unsigned long len); + +/* + * __wt_cksum_hw -- + * Return a checksum for a chunk of memory, computed in hardware + * using 8 byte steps. 
+ */ +static uint32_t +__wt_cksum_hw(const void *chunk, size_t len) +{ + return crc32_vpmsum(0, chunk, len); +} +#endif + /* * __wt_cksum -- * Return a checksum for a chunk of memory using the fastest method @@ -1302,6 +1319,8 @@ __wt_cksum_init(void) __wt_cksum_func = __wt_cksum_hw; else __wt_cksum_func = __wt_cksum_sw; +#elif defined(__powerpc64__) + __wt_cksum_func = __wt_cksum_hw; #else __wt_cksum_func = __wt_cksum_sw; #endif diff --git a/src/third_party/wiredtiger/src/support/power8/LICENSE.TXT b/src/third_party/wiredtiger/src/support/power8/LICENSE.TXT new file mode 100644 index 00000000000..2f4bb91f574 --- /dev/null +++ b/src/third_party/wiredtiger/src/support/power8/LICENSE.TXT @@ -0,0 +1,476 @@ +Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM + +crc32-vpmsum is free software; you can redistribute it and/or +modify it under the terms of either: + + a) the GNU General Public License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later version., or + b) the Apache License, Version 2.0 + + + + + + + + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. 
+ + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. 
+ + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. 
You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. 
+ +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. 
However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. 
+You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. 
If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. 
EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + + + + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS diff --git a/src/third_party/wiredtiger/src/support/power8/README.md b/src/third_party/wiredtiger/src/support/power8/README.md new file mode 100644 index 00000000000..3e2976650cd --- /dev/null +++ b/src/third_party/wiredtiger/src/support/power8/README.md @@ -0,0 +1,208 @@ +crc32-vpmsum +============ + +A set of examples for accelerating CRC32 calculations using the vector +polynomial multiply sum (vpmsum) instructions introduced in POWER8. These +instructions implement byte, halfword, word and doubleword carryless +multiply/add. 
+ +Performance +----------- + +An implementation of slice-by-8, one of the fastest lookup table methods +is included so we can compare performance against it. Testing 5000000 +iterations of a CRC of 32 kB of data (to keep it L1 cache contained): + +``` +# time slice_by_8_bench 32768 5000000 +122.220 seconds + +# time crc32_bench 32768 5000000 +2.937 seconds +``` + +The vpmsum accelerated CRC is just over 41x faster. + +This test was run on a 4.1 GHz POWER8, so the algorithm sustains about +52 GiB/sec or 13.6 bytes/cycle. The theoretical limit is 16 bytes/cycle +since we can execute a maximum of one vpmsum instruction per cycle. + +In another test, a version was added to the kernel and btrfs write +performance was shown to be 3.8x faster. The test was done to a ramdisk +to mitigate any I/O induced variability. + +Quick start +----------- + +- Modify CRC and OPTIONS in the Makefile. There are examples for the two most + common crc32s. + +- Type make to create the constants (crc32_constants.h) + +- Import the code into your application (crc32.S crc32_wrapper.c + crc32_constants.h ppc-opcode.h) and call the CRC: + +``` +unsigned int crc32_vpmsum(unsigned int crc, unsigned char *p, unsigned long len); +``` + +CRC background +-------------- + +For a good background on CRCs, check out: + +http://www.ross.net/crc/download/crc_v3.txt + +A few key points: + +- A CRC is the remainder after dividing a message by the CRC polynomial, + ie M mod CRC_POLY +- multiply/divide is carryless +- add/subtract is an xor +- n (where n is the order of the CRC) bits of zeroes are appended to the + end of the message. + +One more important piece of information - a CRC is a linear function, so: + +``` + CRC(A xor B) = CRC(A) xor CRC(B) + + CRC(A . B) = CRC(A) . 
CRC(B) (remember this is carryless multiply) +``` + +If we take 64 bits of data, represented by two 32 bit chunks (AAAAAAAA +and BBBBBBBB): + +``` +CRC(AAAAAAAABBBBBBBB) + = CRC(AAAAAAAA00000000 xor BBBBBBBB) + = CRC(AAAAAAAA00000000) xor CRC(BBBBBBBB) +``` + +If we operate on AAAAAAAA: + +``` +CRC(AAAAAAAA00000000) + = CRC(AAAAAAAA . 100000000) + = CRC(AAAAAAAA) . CRC(100000000) +``` + +And CRC(100000000) is a constant which we can pre-calculate: + +``` +CRC(100000000) + = 100000000 mod CRC_POLY + = 2^32 mod CRC_POLY +``` + +Finally we can add our modified AAAAAAAA to BBBBBBBB: + +``` +CRC(AAAAAAAABBBBBBBB) + = ((2^32 mod CRC_POLY) . CRC(AAAAAAAA)) xor CRC(BBBBBBBB) +``` + +In other words, with the right constants pre-calculated we can shift the +input data around and we can also calculate the CRC in as many parallel +chunks as we want. + +No matter how much shifting we do, the final result will be 64 bits of +data (63 actually, because there is no carry into the top bit). To reduce +it further we need another trick, and that is Barrett reduction: + +http://en.wikipedia.org/wiki/Barrett_reduction + +Barrett reduction is a method of calculating a mod n. The idea is to +calculate q, the multiple of our polynomial that we need to subtract. By +doing the computation 2x bits higher (ie 64 bits) and shifting the +result back down 2x bits, we round down to the nearest multiple. + +``` + k = 32 + m = floor((4^k)/n) = floor((4^32)/n) + n = CRC polynomial + a = 64 bits of data + + q = floor(ma/(2^64)) + result = a - qn +``` + +An example in the floating point domain makes it clearer how this works: + +``` +a mod n = a - floor(am) * n +``` + +Let's use it to calculate 22 mod 10: + +``` + a = 22 + n = 10 + m = 1/n = 1/10 = 0.1 + +22 mod 10 + = 22 - floor(22*0.1) * 10 + = 22 - 2 * 10 + = 22 - 20 + = 2 +``` + +There is one more issue left - bit reflection. Some CRCs are defined to +operate on the least significant bit first (eg CRC32c). 
Let's look at +how this would get laid out in a register, and let's simplify it to just +two bytes (vs a 16 byte VMX register): + + [ 8..15 ] [ 0..7 ] + +Notice how the bits and bytes are out of order. Since we are doing +multi word multiplication on these values we need them to both be +in order. + +The simplest way to fix this is to reflect the bits in each byte: + + [ 15..8 ] [ 7..0 ] + +However, shuffling bits in a byte is expensive on most CPUs. It is, +however, relatively cheap to shuffle bytes around. What if we load +the bytes in reverse: + + [ 0..7 ] [ 8..15 ] + +Now the bits and bytes are in order, except the least significant bit +of the register is now on the left and the most significant bit is on the +right. We operate as if the register is reflected, which normally we +cannot do. The reason we get away with this is our multiplies are carryless +and our addition and subtraction is xor, so our operations never create +carries. + +The only trick is we have to shift the result of multiplies left one +because the high bit of the multiply is always 0, and we want that high bit +on the right not the left. + +Implementation +-------------- + +The vpmsum instructions on POWER8 have a 6 cycle latency and we can +execute one every cycle. In light of this the main loop has 8 parallel +streams which consume 8 x 16 B each iteration. At the completion of this +loop we have taken 32 kB of data and reduced it to 8 x 16 B (128 B). + +The next step is to take this 128 B and reduce it to 8 B. At this stage +we also add 32 bits of 0 to the end. + +We then apply Barrett reduction to get our CRC. 
+ +Examples +-------- +- barrett_reduction: An example of Barrett reduction + +- final_fold: Starting with 128 bits, add 32 bits of zeros and reduce it to + 64 bits, then apply Barrett reduction + +- final_fold2: A second method of reduction + +Acknowledgements +---------------- + +Thanks to Michael Gschwind, Jeff Derby, Lorena Pesantez and Stewart Smith +for their ideas and assistance. diff --git a/src/third_party/wiredtiger/src/support/power8/crc32.S b/src/third_party/wiredtiger/src/support/power8/crc32.S new file mode 100644 index 00000000000..4bc1fad416d --- /dev/null +++ b/src/third_party/wiredtiger/src/support/power8/crc32.S @@ -0,0 +1,741 @@ +#if defined(__powerpc64__) +/* + * Calculate the checksum of data that is 16 byte aligned and a multiple of + * 16 bytes. + * + * The first step is to reduce it to 1024 bits. We do this in 8 parallel + * chunks in order to mask the latency of the vpmsum instructions. If we + * have more than 32 kB of data to checksum we repeat this step multiple + * times, passing in the previous 1024 bits. + * + * The next step is to reduce the 1024 bits to 64 bits. This step adds + * 32 bits of 0s to the end - this matches what a CRC does. We just + * calculate constants that land the data in this 32 bits. + * + * We then use fixed point Barrett reduction to compute a mod n over GF(2) + * for n = CRC using POWER8 instructions. We use x = 32. + * + * http://en.wikipedia.org/wiki/Barrett_reduction + * + * Copyright (C) 2015 Anton Blanchard <anton@au.ibm.com>, IBM + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ +#include <ppc-asm.h> +#include "ppc-opcode.h" + +#undef toc + +#ifndef r1 +#define r1 1 +#endif + +#ifndef r2 +#define r2 2 +#endif + + .section .rodata +.balign 16 + +.byteswap_constant: + /* byte reverse permute constant */ + .octa 0x0F0E0D0C0B0A09080706050403020100 + +#define __ASSEMBLY__ +#include "crc32_constants.h" + + .text + +#if defined(__BIG_ENDIAN__) && defined(REFLECT) +#define BYTESWAP_DATA +#elif defined(__LITTLE_ENDIAN__) && !defined(REFLECT) +#define BYTESWAP_DATA +#else +#undef BYTESWAP_DATA +#endif + +#define off16 r25 +#define off32 r26 +#define off48 r27 +#define off64 r28 +#define off80 r29 +#define off96 r30 +#define off112 r31 + +#define const1 v25 +#define const2 v26 + +#define byteswap v27 +#define mask_32bit v28 +#define mask_64bit v29 +#define zeroes v30 +#define ones v31 + +#ifdef BYTESWAP_DATA +#define VPERM(A, B, C, D) vperm A, B, C, D +#else +#define VPERM(A, B, C, D) +#endif + +/* unsigned int __crc32_vpmsum(unsigned int crc, void *p, unsigned long len) */ +FUNC_START(__crc32_vpmsum) + std r31,-8(r1) + std r30,-16(r1) + std r29,-24(r1) + std r28,-32(r1) + std r27,-40(r1) + std r26,-48(r1) + std r25,-56(r1) + + li off16,16 + li off32,32 + li off48,48 + li off64,64 + li off80,80 + li off96,96 + li off112,112 + li r0,0 + + mr r10,r3 + + vxor zeroes,zeroes,zeroes + vspltisw ones,-1 + + vsldoi mask_32bit,zeroes,ones,4 + vsldoi mask_64bit,zeroes,ones,8 + + /* Get the initial value into v8 */ + vxor v8,v8,v8 + MTVRD(v8, r3) +#ifdef REFLECT + vsldoi v8,zeroes,v8,8 /* shift into bottom 32 bits */ +#else + vsldoi v8,v8,zeroes,4 /* shift into top 32 bits */ +#endif + +#ifdef BYTESWAP_DATA + addis r3,r2,.byteswap_constant@toc@ha + addi r3,r3,.byteswap_constant@toc@l + + lvx byteswap,0,r3 + addi r3,r3,16 +#endif + + cmpdi r5,256 + blt .Lshort + + rldicr r6,r5,0,56 + + /* Checksum in blocks of MAX_SIZE */ +1: lis r7,MAX_SIZE@h + ori r7,r7,MAX_SIZE@l + mr r9,r7 + cmpd r6,r7 + bgt 2f + mr r7,r6 +2: subf r6,r7,r6 + + /* our main loop does 128 
bytes at a time */ + srdi r7,r7,7 + + /* + * Work out the offset into the constants table to start at. Each + * constant is 16 bytes, and it is used against 128 bytes of input + * data - 128 / 16 = 8 + */ + sldi r8,r7,4 + srdi r9,r9,3 + subf r8,r8,r9 + + /* We reduce our final 128 bytes in a separate step */ + addi r7,r7,-1 + mtctr r7 + + addis r3,r2,.constants@toc@ha + addi r3,r3,.constants@toc@l + + /* Find the start of our constants */ + add r3,r3,r8 + + /* zero v0-v7 which will contain our checksums */ + vxor v0,v0,v0 + vxor v1,v1,v1 + vxor v2,v2,v2 + vxor v3,v3,v3 + vxor v4,v4,v4 + vxor v5,v5,v5 + vxor v6,v6,v6 + vxor v7,v7,v7 + + lvx const1,0,r3 + + /* + * If we are looping back to consume more data we use the values + * already in v16-v23. + */ + cmpdi r0,1 + beq 2f + + /* First warm up pass */ + lvx v16,0,r4 + lvx v17,off16,r4 + VPERM(v16,v16,v16,byteswap) + VPERM(v17,v17,v17,byteswap) + lvx v18,off32,r4 + lvx v19,off48,r4 + VPERM(v18,v18,v18,byteswap) + VPERM(v19,v19,v19,byteswap) + lvx v20,off64,r4 + lvx v21,off80,r4 + VPERM(v20,v20,v20,byteswap) + VPERM(v21,v21,v21,byteswap) + lvx v22,off96,r4 + lvx v23,off112,r4 + VPERM(v22,v22,v22,byteswap) + VPERM(v23,v23,v23,byteswap) + addi r4,r4,8*16 + + /* xor in initial value */ + vxor v16,v16,v8 + +2: bdz .Lfirst_warm_up_done + + addi r3,r3,16 + lvx const2,0,r3 + + /* Second warm up pass */ + VPMSUMD(v8,v16,const1) + lvx v16,0,r4 + VPERM(v16,v16,v16,byteswap) + ori r2,r2,0 + + VPMSUMD(v9,v17,const1) + lvx v17,off16,r4 + VPERM(v17,v17,v17,byteswap) + ori r2,r2,0 + + VPMSUMD(v10,v18,const1) + lvx v18,off32,r4 + VPERM(v18,v18,v18,byteswap) + ori r2,r2,0 + + VPMSUMD(v11,v19,const1) + lvx v19,off48,r4 + VPERM(v19,v19,v19,byteswap) + ori r2,r2,0 + + VPMSUMD(v12,v20,const1) + lvx v20,off64,r4 + VPERM(v20,v20,v20,byteswap) + ori r2,r2,0 + + VPMSUMD(v13,v21,const1) + lvx v21,off80,r4 + VPERM(v21,v21,v21,byteswap) + ori r2,r2,0 + + VPMSUMD(v14,v22,const1) + lvx v22,off96,r4 + VPERM(v22,v22,v22,byteswap) + ori r2,r2,0 + + 
VPMSUMD(v15,v23,const1) + lvx v23,off112,r4 + VPERM(v23,v23,v23,byteswap) + + addi r4,r4,8*16 + + bdz .Lfirst_cool_down + + /* + * main loop. We modulo schedule it such that it takes three iterations + * to complete - first iteration load, second iteration vpmsum, third + * iteration xor. + */ + .balign 16 +4: lvx const1,0,r3 + addi r3,r3,16 + ori r2,r2,0 + + vxor v0,v0,v8 + VPMSUMD(v8,v16,const2) + lvx v16,0,r4 + VPERM(v16,v16,v16,byteswap) + ori r2,r2,0 + + vxor v1,v1,v9 + VPMSUMD(v9,v17,const2) + lvx v17,off16,r4 + VPERM(v17,v17,v17,byteswap) + ori r2,r2,0 + + vxor v2,v2,v10 + VPMSUMD(v10,v18,const2) + lvx v18,off32,r4 + VPERM(v18,v18,v18,byteswap) + ori r2,r2,0 + + vxor v3,v3,v11 + VPMSUMD(v11,v19,const2) + lvx v19,off48,r4 + VPERM(v19,v19,v19,byteswap) + lvx const2,0,r3 + ori r2,r2,0 + + vxor v4,v4,v12 + VPMSUMD(v12,v20,const1) + lvx v20,off64,r4 + VPERM(v20,v20,v20,byteswap) + ori r2,r2,0 + + vxor v5,v5,v13 + VPMSUMD(v13,v21,const1) + lvx v21,off80,r4 + VPERM(v21,v21,v21,byteswap) + ori r2,r2,0 + + vxor v6,v6,v14 + VPMSUMD(v14,v22,const1) + lvx v22,off96,r4 + VPERM(v22,v22,v22,byteswap) + ori r2,r2,0 + + vxor v7,v7,v15 + VPMSUMD(v15,v23,const1) + lvx v23,off112,r4 + VPERM(v23,v23,v23,byteswap) + + addi r4,r4,8*16 + + bdnz 4b + +.Lfirst_cool_down: + /* First cool down pass */ + lvx const1,0,r3 + addi r3,r3,16 + + vxor v0,v0,v8 + VPMSUMD(v8,v16,const1) + ori r2,r2,0 + + vxor v1,v1,v9 + VPMSUMD(v9,v17,const1) + ori r2,r2,0 + + vxor v2,v2,v10 + VPMSUMD(v10,v18,const1) + ori r2,r2,0 + + vxor v3,v3,v11 + VPMSUMD(v11,v19,const1) + ori r2,r2,0 + + vxor v4,v4,v12 + VPMSUMD(v12,v20,const1) + ori r2,r2,0 + + vxor v5,v5,v13 + VPMSUMD(v13,v21,const1) + ori r2,r2,0 + + vxor v6,v6,v14 + VPMSUMD(v14,v22,const1) + ori r2,r2,0 + + vxor v7,v7,v15 + VPMSUMD(v15,v23,const1) + ori r2,r2,0 + +.Lsecond_cool_down: + /* Second cool down pass */ + vxor v0,v0,v8 + vxor v1,v1,v9 + vxor v2,v2,v10 + vxor v3,v3,v11 + vxor v4,v4,v12 + vxor v5,v5,v13 + vxor v6,v6,v14 + vxor v7,v7,v15 + 
+#ifdef REFLECT + /* + * vpmsumd produces a 96 bit result in the least significant bits + * of the register. Since we are bit reflected we have to shift it + * left 32 bits so it occupies the least significant bits in the + * bit reflected domain. + */ + vsldoi v0,v0,zeroes,4 + vsldoi v1,v1,zeroes,4 + vsldoi v2,v2,zeroes,4 + vsldoi v3,v3,zeroes,4 + vsldoi v4,v4,zeroes,4 + vsldoi v5,v5,zeroes,4 + vsldoi v6,v6,zeroes,4 + vsldoi v7,v7,zeroes,4 +#endif + + /* xor with last 1024 bits */ + lvx v8,0,r4 + lvx v9,off16,r4 + VPERM(v8,v8,v8,byteswap) + VPERM(v9,v9,v9,byteswap) + lvx v10,off32,r4 + lvx v11,off48,r4 + VPERM(v10,v10,v10,byteswap) + VPERM(v11,v11,v11,byteswap) + lvx v12,off64,r4 + lvx v13,off80,r4 + VPERM(v12,v12,v12,byteswap) + VPERM(v13,v13,v13,byteswap) + lvx v14,off96,r4 + lvx v15,off112,r4 + VPERM(v14,v14,v14,byteswap) + VPERM(v15,v15,v15,byteswap) + + addi r4,r4,8*16 + + vxor v16,v0,v8 + vxor v17,v1,v9 + vxor v18,v2,v10 + vxor v19,v3,v11 + vxor v20,v4,v12 + vxor v21,v5,v13 + vxor v22,v6,v14 + vxor v23,v7,v15 + + li r0,1 + cmpdi r6,0 + addi r6,r6,128 + bne 1b + + /* Work out how many bytes we have left */ + andi. 
r5,r5,127 + + /* Calculate where in the constant table we need to start */ + subfic r6,r5,128 + add r3,r3,r6 + + /* How many 16 byte chunks are in the tail */ + srdi r7,r5,4 + mtctr r7 + + /* + * Reduce the previously calculated 1024 bits to 64 bits, shifting + * 32 bits to include the trailing 32 bits of zeros + */ + lvx v0,0,r3 + lvx v1,off16,r3 + lvx v2,off32,r3 + lvx v3,off48,r3 + lvx v4,off64,r3 + lvx v5,off80,r3 + lvx v6,off96,r3 + lvx v7,off112,r3 + addi r3,r3,8*16 + + VPMSUMW(v0,v16,v0) + VPMSUMW(v1,v17,v1) + VPMSUMW(v2,v18,v2) + VPMSUMW(v3,v19,v3) + VPMSUMW(v4,v20,v4) + VPMSUMW(v5,v21,v5) + VPMSUMW(v6,v22,v6) + VPMSUMW(v7,v23,v7) + + /* Now reduce the tail (0 - 112 bytes) */ + cmpdi r7,0 + beq 1f + + lvx v16,0,r4 + lvx v17,0,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off16,r4 + lvx v17,off16,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off32,r4 + lvx v17,off32,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off48,r4 + lvx v17,off48,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off64,r4 + lvx v17,off64,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off80,r4 + lvx v17,off80,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + bdz 1f + + lvx v16,off96,r4 + lvx v17,off96,r3 + VPERM(v16,v16,v16,byteswap) + VPMSUMW(v16,v16,v17) + vxor v0,v0,v16 + + /* Now xor all the parallel chunks together */ +1: vxor v0,v0,v1 + vxor v2,v2,v3 + vxor v4,v4,v5 + vxor v6,v6,v7 + + vxor v0,v0,v2 + vxor v4,v4,v6 + + vxor v0,v0,v4 + +.Lbarrett_reduction: + /* Barrett constants */ + addis r3,r2,.barrett_constants@toc@ha + addi r3,r3,.barrett_constants@toc@l + + lvx const1,0,r3 + lvx const2,off16,r3 + + vsldoi v1,v0,v0,8 + vxor v0,v0,v1 /* xor two 64 bit results together */ + +#ifdef REFLECT + /* shift left one bit */ + 
vspltisb v1,1 + vsl v0,v0,v1 +#endif + + vand v0,v0,mask_64bit + +#ifndef REFLECT + /* + * Now for the Barrett reduction algorithm. The idea is to calculate q, + * the multiple of our polynomial that we need to subtract. By + * doing the computation 2x bits higher (ie 64 bits) and shifting the + * result back down 2x bits, we round down to the nearest multiple. + */ + VPMSUMD(v1,v0,const1) /* ma */ + vsldoi v1,zeroes,v1,8 /* q = floor(ma/(2^64)) */ + VPMSUMD(v1,v1,const2) /* qn */ + vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */ + + /* + * Get the result into r3. We need to shift it left 8 bytes: + * V0 [ 0 1 2 X ] + * V0 [ 0 X 2 3 ] + */ + vsldoi v0,v0,zeroes,8 /* shift result into top 64 bits */ +#else + /* + * The reflected version of Barrett reduction. Instead of bit + * reflecting our data (which is expensive to do), we bit reflect our + * constants and our algorithm, which means the intermediate data in + * our vector registers goes from 0-63 instead of 63-0. We can reflect + * the algorithm because we don't carry in mod 2 arithmetic. + */ + vand v1,v0,mask_32bit /* bottom 32 bits of a */ + VPMSUMD(v1,v1,const1) /* ma */ + vand v1,v1,mask_32bit /* bottom 32bits of ma */ + VPMSUMD(v1,v1,const2) /* qn */ + vxor v0,v0,v1 /* a - qn, subtraction is xor in GF(2) */ + + /* + * Since we are bit reflected, the result (ie the low 32 bits) is in + * the high 32 bits. 
We just need to shift it left 4 bytes + * V0 [ 0 1 X 3 ] + * V0 [ 0 X 2 3 ] + */ + vsldoi v0,v0,zeroes,4 /* shift result into top 64 bits of */ +#endif + + /* Get it into r3 */ + MFVRD(r3, v0) + + ld r31,-8(r1) + ld r30,-16(r1) + ld r29,-24(r1) + ld r28,-32(r1) + ld r27,-40(r1) + ld r26,-48(r1) + ld r25,-56(r1) + + blr + +.Lfirst_warm_up_done: + lvx const1,0,r3 + addi r3,r3,16 + + VPMSUMD(v8,v16,const1) + VPMSUMD(v9,v17,const1) + VPMSUMD(v10,v18,const1) + VPMSUMD(v11,v19,const1) + VPMSUMD(v12,v20,const1) + VPMSUMD(v13,v21,const1) + VPMSUMD(v14,v22,const1) + VPMSUMD(v15,v23,const1) + + b .Lsecond_cool_down + +.Lshort: + cmpdi r5,0 + beq .Lzero + + addis r3,r2,.short_constants@toc@ha + addi r3,r3,.short_constants@toc@l + + /* Calculate where in the constant table we need to start */ + subfic r6,r5,256 + add r3,r3,r6 + + /* How many 16 byte chunks? */ + srdi r7,r5,4 + mtctr r7 + + vxor v19,v19,v19 + vxor v20,v20,v20 + + lvx v0,0,r4 + lvx v16,0,r3 + VPERM(v0,v0,v16,byteswap) + vxor v0,v0,v8 /* xor in initial value */ + VPMSUMW(v0,v0,v16) + bdz .Lv0 + + lvx v1,off16,r4 + lvx v17,off16,r3 + VPERM(v1,v1,v17,byteswap) + VPMSUMW(v1,v1,v17) + bdz .Lv1 + + lvx v2,off32,r4 + lvx v16,off32,r3 + VPERM(v2,v2,v16,byteswap) + VPMSUMW(v2,v2,v16) + bdz .Lv2 + + lvx v3,off48,r4 + lvx v17,off48,r3 + VPERM(v3,v3,v17,byteswap) + VPMSUMW(v3,v3,v17) + bdz .Lv3 + + lvx v4,off64,r4 + lvx v16,off64,r3 + VPERM(v4,v4,v16,byteswap) + VPMSUMW(v4,v4,v16) + bdz .Lv4 + + lvx v5,off80,r4 + lvx v17,off80,r3 + VPERM(v5,v5,v17,byteswap) + VPMSUMW(v5,v5,v17) + bdz .Lv5 + + lvx v6,off96,r4 + lvx v16,off96,r3 + VPERM(v6,v6,v16,byteswap) + VPMSUMW(v6,v6,v16) + bdz .Lv6 + + lvx v7,off112,r4 + lvx v17,off112,r3 + VPERM(v7,v7,v17,byteswap) + VPMSUMW(v7,v7,v17) + bdz .Lv7 + + addi r3,r3,128 + addi r4,r4,128 + + lvx v8,0,r4 + lvx v16,0,r3 + VPERM(v8,v8,v16,byteswap) + VPMSUMW(v8,v8,v16) + bdz .Lv8 + + lvx v9,off16,r4 + lvx v17,off16,r3 + VPERM(v9,v9,v17,byteswap) + VPMSUMW(v9,v9,v17) + bdz .Lv9 + + lvx 
v10,off32,r4 + lvx v16,off32,r3 + VPERM(v10,v10,v16,byteswap) + VPMSUMW(v10,v10,v16) + bdz .Lv10 + + lvx v11,off48,r4 + lvx v17,off48,r3 + VPERM(v11,v11,v17,byteswap) + VPMSUMW(v11,v11,v17) + bdz .Lv11 + + lvx v12,off64,r4 + lvx v16,off64,r3 + VPERM(v12,v12,v16,byteswap) + VPMSUMW(v12,v12,v16) + bdz .Lv12 + + lvx v13,off80,r4 + lvx v17,off80,r3 + VPERM(v13,v13,v17,byteswap) + VPMSUMW(v13,v13,v17) + bdz .Lv13 + + lvx v14,off96,r4 + lvx v16,off96,r3 + VPERM(v14,v14,v16,byteswap) + VPMSUMW(v14,v14,v16) + bdz .Lv14 + + lvx v15,off112,r4 + lvx v17,off112,r3 + VPERM(v15,v15,v17,byteswap) + VPMSUMW(v15,v15,v17) + +.Lv15: vxor v19,v19,v15 +.Lv14: vxor v20,v20,v14 +.Lv13: vxor v19,v19,v13 +.Lv12: vxor v20,v20,v12 +.Lv11: vxor v19,v19,v11 +.Lv10: vxor v20,v20,v10 +.Lv9: vxor v19,v19,v9 +.Lv8: vxor v20,v20,v8 +.Lv7: vxor v19,v19,v7 +.Lv6: vxor v20,v20,v6 +.Lv5: vxor v19,v19,v5 +.Lv4: vxor v20,v20,v4 +.Lv3: vxor v19,v19,v3 +.Lv2: vxor v20,v20,v2 +.Lv1: vxor v19,v19,v1 +.Lv0: vxor v20,v20,v0 + + vxor v0,v19,v20 + + b .Lbarrett_reduction + +.Lzero: + mr r3,r10 + blr +FUNC_END(__crc32_vpmsum) +#endif diff --git a/src/third_party/wiredtiger/src/support/power8/crc32_constants.h b/src/third_party/wiredtiger/src/support/power8/crc32_constants.h new file mode 100644 index 00000000000..02c471d1c56 --- /dev/null +++ b/src/third_party/wiredtiger/src/support/power8/crc32_constants.h @@ -0,0 +1,901 @@ +#define CRC 0x1edc6f41 +#define CRC_XOR +#define REFLECT + +#ifndef __ASSEMBLY__ +#ifdef CRC_TABLE +static const unsigned int crc_table[] = { + 0x00000000, 0xf26b8303, 0xe13b70f7, 0x1350f3f4, + 0xc79a971f, 0x35f1141c, 0x26a1e7e8, 0xd4ca64eb, + 0x8ad958cf, 0x78b2dbcc, 0x6be22838, 0x9989ab3b, + 0x4d43cfd0, 0xbf284cd3, 0xac78bf27, 0x5e133c24, + 0x105ec76f, 0xe235446c, 0xf165b798, 0x030e349b, + 0xd7c45070, 0x25afd373, 0x36ff2087, 0xc494a384, + 0x9a879fa0, 0x68ec1ca3, 0x7bbcef57, 0x89d76c54, + 0x5d1d08bf, 0xaf768bbc, 0xbc267848, 0x4e4dfb4b, + 0x20bd8ede, 0xd2d60ddd, 0xc186fe29, 0x33ed7d2a, + 
0xe72719c1, 0x154c9ac2, 0x061c6936, 0xf477ea35, + 0xaa64d611, 0x580f5512, 0x4b5fa6e6, 0xb93425e5, + 0x6dfe410e, 0x9f95c20d, 0x8cc531f9, 0x7eaeb2fa, + 0x30e349b1, 0xc288cab2, 0xd1d83946, 0x23b3ba45, + 0xf779deae, 0x05125dad, 0x1642ae59, 0xe4292d5a, + 0xba3a117e, 0x4851927d, 0x5b016189, 0xa96ae28a, + 0x7da08661, 0x8fcb0562, 0x9c9bf696, 0x6ef07595, + 0x417b1dbc, 0xb3109ebf, 0xa0406d4b, 0x522bee48, + 0x86e18aa3, 0x748a09a0, 0x67dafa54, 0x95b17957, + 0xcba24573, 0x39c9c670, 0x2a993584, 0xd8f2b687, + 0x0c38d26c, 0xfe53516f, 0xed03a29b, 0x1f682198, + 0x5125dad3, 0xa34e59d0, 0xb01eaa24, 0x42752927, + 0x96bf4dcc, 0x64d4cecf, 0x77843d3b, 0x85efbe38, + 0xdbfc821c, 0x2997011f, 0x3ac7f2eb, 0xc8ac71e8, + 0x1c661503, 0xee0d9600, 0xfd5d65f4, 0x0f36e6f7, + 0x61c69362, 0x93ad1061, 0x80fde395, 0x72966096, + 0xa65c047d, 0x5437877e, 0x4767748a, 0xb50cf789, + 0xeb1fcbad, 0x197448ae, 0x0a24bb5a, 0xf84f3859, + 0x2c855cb2, 0xdeeedfb1, 0xcdbe2c45, 0x3fd5af46, + 0x7198540d, 0x83f3d70e, 0x90a324fa, 0x62c8a7f9, + 0xb602c312, 0x44694011, 0x5739b3e5, 0xa55230e6, + 0xfb410cc2, 0x092a8fc1, 0x1a7a7c35, 0xe811ff36, + 0x3cdb9bdd, 0xceb018de, 0xdde0eb2a, 0x2f8b6829, + 0x82f63b78, 0x709db87b, 0x63cd4b8f, 0x91a6c88c, + 0x456cac67, 0xb7072f64, 0xa457dc90, 0x563c5f93, + 0x082f63b7, 0xfa44e0b4, 0xe9141340, 0x1b7f9043, + 0xcfb5f4a8, 0x3dde77ab, 0x2e8e845f, 0xdce5075c, + 0x92a8fc17, 0x60c37f14, 0x73938ce0, 0x81f80fe3, + 0x55326b08, 0xa759e80b, 0xb4091bff, 0x466298fc, + 0x1871a4d8, 0xea1a27db, 0xf94ad42f, 0x0b21572c, + 0xdfeb33c7, 0x2d80b0c4, 0x3ed04330, 0xccbbc033, + 0xa24bb5a6, 0x502036a5, 0x4370c551, 0xb11b4652, + 0x65d122b9, 0x97baa1ba, 0x84ea524e, 0x7681d14d, + 0x2892ed69, 0xdaf96e6a, 0xc9a99d9e, 0x3bc21e9d, + 0xef087a76, 0x1d63f975, 0x0e330a81, 0xfc588982, + 0xb21572c9, 0x407ef1ca, 0x532e023e, 0xa145813d, + 0x758fe5d6, 0x87e466d5, 0x94b49521, 0x66df1622, + 0x38cc2a06, 0xcaa7a905, 0xd9f75af1, 0x2b9cd9f2, + 0xff56bd19, 0x0d3d3e1a, 0x1e6dcdee, 0xec064eed, + 0xc38d26c4, 0x31e6a5c7, 0x22b65633, 0xd0ddd530, + 
0x0417b1db, 0xf67c32d8, 0xe52cc12c, 0x1747422f, + 0x49547e0b, 0xbb3ffd08, 0xa86f0efc, 0x5a048dff, + 0x8ecee914, 0x7ca56a17, 0x6ff599e3, 0x9d9e1ae0, + 0xd3d3e1ab, 0x21b862a8, 0x32e8915c, 0xc083125f, + 0x144976b4, 0xe622f5b7, 0xf5720643, 0x07198540, + 0x590ab964, 0xab613a67, 0xb831c993, 0x4a5a4a90, + 0x9e902e7b, 0x6cfbad78, 0x7fab5e8c, 0x8dc0dd8f, + 0xe330a81a, 0x115b2b19, 0x020bd8ed, 0xf0605bee, + 0x24aa3f05, 0xd6c1bc06, 0xc5914ff2, 0x37faccf1, + 0x69e9f0d5, 0x9b8273d6, 0x88d28022, 0x7ab90321, + 0xae7367ca, 0x5c18e4c9, 0x4f48173d, 0xbd23943e, + 0xf36e6f75, 0x0105ec76, 0x12551f82, 0xe03e9c81, + 0x34f4f86a, 0xc69f7b69, 0xd5cf889d, 0x27a40b9e, + 0x79b737ba, 0x8bdcb4b9, 0x988c474d, 0x6ae7c44e, + 0xbe2da0a5, 0x4c4623a6, 0x5f16d052, 0xad7d5351,}; + +#endif +#else +#define MAX_SIZE 32768 +.constants: + + /* Reduce 262144 kbits to 1024 bits */ + /* x^261120 mod p(x)` << 1, x^261184 mod p(x)` << 1 */ + .octa 0x00000000b6ca9e20000000009c37c408 + + /* x^260096 mod p(x)` << 1, x^260160 mod p(x)` << 1 */ + .octa 0x00000000350249a800000001b51df26c + + /* x^259072 mod p(x)` << 1, x^259136 mod p(x)` << 1 */ + .octa 0x00000001862dac54000000000724b9d0 + + /* x^258048 mod p(x)` << 1, x^258112 mod p(x)` << 1 */ + .octa 0x00000001d87fb48c00000001c00532fe + + /* x^257024 mod p(x)` << 1, x^257088 mod p(x)` << 1 */ + .octa 0x00000001f39b699e00000000f05a9362 + + /* x^256000 mod p(x)` << 1, x^256064 mod p(x)` << 1 */ + .octa 0x0000000101da11b400000001e1007970 + + /* x^254976 mod p(x)` << 1, x^255040 mod p(x)` << 1 */ + .octa 0x00000001cab571e000000000a57366ee + + /* x^253952 mod p(x)` << 1, x^254016 mod p(x)` << 1 */ + .octa 0x00000000c7020cfe0000000192011284 + + /* x^252928 mod p(x)` << 1, x^252992 mod p(x)` << 1 */ + .octa 0x00000000cdaed1ae0000000162716d9a + + /* x^251904 mod p(x)` << 1, x^251968 mod p(x)` << 1 */ + .octa 0x00000001e804effc00000000cd97ecde + + /* x^250880 mod p(x)` << 1, x^250944 mod p(x)` << 1 */ + .octa 0x0000000077c3ea3a0000000058812bc0 + + /* x^249856 mod p(x)` << 1, 
x^249920 mod p(x)` << 1 */ + .octa 0x0000000068df31b40000000088b8c12e + + /* x^248832 mod p(x)` << 1, x^248896 mod p(x)` << 1 */ + .octa 0x00000000b059b6c200000001230b234c + + /* x^247808 mod p(x)` << 1, x^247872 mod p(x)` << 1 */ + .octa 0x0000000145fb8ed800000001120b416e + + /* x^246784 mod p(x)` << 1, x^246848 mod p(x)` << 1 */ + .octa 0x00000000cbc0916800000001974aecb0 + + /* x^245760 mod p(x)` << 1, x^245824 mod p(x)` << 1 */ + .octa 0x000000005ceeedc2000000008ee3f226 + + /* x^244736 mod p(x)` << 1, x^244800 mod p(x)` << 1 */ + .octa 0x0000000047d74e8600000001089aba9a + + /* x^243712 mod p(x)` << 1, x^243776 mod p(x)` << 1 */ + .octa 0x00000001407e9e220000000065113872 + + /* x^242688 mod p(x)` << 1, x^242752 mod p(x)` << 1 */ + .octa 0x00000001da967bda000000005c07ec10 + + /* x^241664 mod p(x)` << 1, x^241728 mod p(x)` << 1 */ + .octa 0x000000006c8983680000000187590924 + + /* x^240640 mod p(x)` << 1, x^240704 mod p(x)` << 1 */ + .octa 0x00000000f2d14c9800000000e35da7c6 + + /* x^239616 mod p(x)` << 1, x^239680 mod p(x)` << 1 */ + .octa 0x00000001993c6ad4000000000415855a + + /* x^238592 mod p(x)` << 1, x^238656 mod p(x)` << 1 */ + .octa 0x000000014683d1ac0000000073617758 + + /* x^237568 mod p(x)` << 1, x^237632 mod p(x)` << 1 */ + .octa 0x00000001a7c93e6c0000000176021d28 + + /* x^236544 mod p(x)` << 1, x^236608 mod p(x)` << 1 */ + .octa 0x000000010211e90a00000001c358fd0a + + /* x^235520 mod p(x)` << 1, x^235584 mod p(x)` << 1 */ + .octa 0x000000001119403e00000001ff7a2c18 + + /* x^234496 mod p(x)` << 1, x^234560 mod p(x)` << 1 */ + .octa 0x000000001c3261aa00000000f2d9f7e4 + + /* x^233472 mod p(x)` << 1, x^233536 mod p(x)` << 1 */ + .octa 0x000000014e37a634000000016cf1f9c8 + + /* x^232448 mod p(x)` << 1, x^232512 mod p(x)` << 1 */ + .octa 0x0000000073786c0c000000010af9279a + + /* x^231424 mod p(x)` << 1, x^231488 mod p(x)` << 1 */ + .octa 0x000000011dc037f80000000004f101e8 + + /* x^230400 mod p(x)` << 1, x^230464 mod p(x)` << 1 */ + .octa 
0x0000000031433dfc0000000070bcf184 + + /* x^229376 mod p(x)` << 1, x^229440 mod p(x)` << 1 */ + .octa 0x000000009cde8348000000000a8de642 + + /* x^228352 mod p(x)` << 1, x^228416 mod p(x)` << 1 */ + .octa 0x0000000038d3c2a60000000062ea130c + + /* x^227328 mod p(x)` << 1, x^227392 mod p(x)` << 1 */ + .octa 0x000000011b25f26000000001eb31cbb2 + + /* x^226304 mod p(x)` << 1, x^226368 mod p(x)` << 1 */ + .octa 0x000000001629e6f00000000170783448 + + /* x^225280 mod p(x)` << 1, x^225344 mod p(x)` << 1 */ + .octa 0x0000000160838b4c00000001a684b4c6 + + /* x^224256 mod p(x)` << 1, x^224320 mod p(x)` << 1 */ + .octa 0x000000007a44011c00000000253ca5b4 + + /* x^223232 mod p(x)` << 1, x^223296 mod p(x)` << 1 */ + .octa 0x00000000226f417a0000000057b4b1e2 + + /* x^222208 mod p(x)` << 1, x^222272 mod p(x)` << 1 */ + .octa 0x0000000045eb2eb400000000b6bd084c + + /* x^221184 mod p(x)` << 1, x^221248 mod p(x)` << 1 */ + .octa 0x000000014459d70c0000000123c2d592 + + /* x^220160 mod p(x)` << 1, x^220224 mod p(x)` << 1 */ + .octa 0x00000001d406ed8200000000159dafce + + /* x^219136 mod p(x)` << 1, x^219200 mod p(x)` << 1 */ + .octa 0x0000000160c8e1a80000000127e1a64e + + /* x^218112 mod p(x)` << 1, x^218176 mod p(x)` << 1 */ + .octa 0x0000000027ba80980000000056860754 + + /* x^217088 mod p(x)` << 1, x^217152 mod p(x)` << 1 */ + .octa 0x000000006d92d01800000001e661aae8 + + /* x^216064 mod p(x)` << 1, x^216128 mod p(x)` << 1 */ + .octa 0x000000012ed7e3f200000000f82c6166 + + /* x^215040 mod p(x)` << 1, x^215104 mod p(x)` << 1 */ + .octa 0x000000002dc8778800000000c4f9c7ae + + /* x^214016 mod p(x)` << 1, x^214080 mod p(x)` << 1 */ + .octa 0x0000000018240bb80000000074203d20 + + /* x^212992 mod p(x)` << 1, x^213056 mod p(x)` << 1 */ + .octa 0x000000001ad381580000000198173052 + + /* x^211968 mod p(x)` << 1, x^212032 mod p(x)` << 1 */ + .octa 0x00000001396b78f200000001ce8aba54 + + /* x^210944 mod p(x)` << 1, x^211008 mod p(x)` << 1 */ + .octa 0x000000011a68133400000001850d5d94 + + /* x^209920 mod p(x)` 
<< 1, x^209984 mod p(x)` << 1 */ + .octa 0x000000012104732e00000001d609239c + + /* x^208896 mod p(x)` << 1, x^208960 mod p(x)` << 1 */ + .octa 0x00000000a140d90c000000001595f048 + + /* x^207872 mod p(x)` << 1, x^207936 mod p(x)` << 1 */ + .octa 0x00000001b7215eda0000000042ccee08 + + /* x^206848 mod p(x)` << 1, x^206912 mod p(x)` << 1 */ + .octa 0x00000001aaf1df3c000000010a389d74 + + /* x^205824 mod p(x)` << 1, x^205888 mod p(x)` << 1 */ + .octa 0x0000000029d15b8a000000012a840da6 + + /* x^204800 mod p(x)` << 1, x^204864 mod p(x)` << 1 */ + .octa 0x00000000f1a96922000000001d181c0c + + /* x^203776 mod p(x)` << 1, x^203840 mod p(x)` << 1 */ + .octa 0x00000001ac80d03c0000000068b7d1f6 + + /* x^202752 mod p(x)` << 1, x^202816 mod p(x)` << 1 */ + .octa 0x000000000f11d56a000000005b0f14fc + + /* x^201728 mod p(x)` << 1, x^201792 mod p(x)` << 1 */ + .octa 0x00000001f1c022a20000000179e9e730 + + /* x^200704 mod p(x)` << 1, x^200768 mod p(x)` << 1 */ + .octa 0x0000000173d00ae200000001ce1368d6 + + /* x^199680 mod p(x)` << 1, x^199744 mod p(x)` << 1 */ + .octa 0x00000001d4ffe4ac0000000112c3a84c + + /* x^198656 mod p(x)` << 1, x^198720 mod p(x)` << 1 */ + .octa 0x000000016edc5ae400000000de940fee + + /* x^197632 mod p(x)` << 1, x^197696 mod p(x)` << 1 */ + .octa 0x00000001f1a0214000000000fe896b7e + + /* x^196608 mod p(x)` << 1, x^196672 mod p(x)` << 1 */ + .octa 0x00000000ca0b28a000000001f797431c + + /* x^195584 mod p(x)` << 1, x^195648 mod p(x)` << 1 */ + .octa 0x00000001928e30a20000000053e989ba + + /* x^194560 mod p(x)` << 1, x^194624 mod p(x)` << 1 */ + .octa 0x0000000097b1b002000000003920cd16 + + /* x^193536 mod p(x)` << 1, x^193600 mod p(x)` << 1 */ + .octa 0x00000000b15bf90600000001e6f579b8 + + /* x^192512 mod p(x)` << 1, x^192576 mod p(x)` << 1 */ + .octa 0x00000000411c5d52000000007493cb0a + + /* x^191488 mod p(x)` << 1, x^191552 mod p(x)` << 1 */ + .octa 0x00000001c36f330000000001bdd376d8 + + /* x^190464 mod p(x)` << 1, x^190528 mod p(x)` << 1 */ + .octa 
0x00000001119227e0000000016badfee6 + + /* x^189440 mod p(x)` << 1, x^189504 mod p(x)` << 1 */ + .octa 0x00000000114d47020000000071de5c58 + + /* x^188416 mod p(x)` << 1, x^188480 mod p(x)` << 1 */ + .octa 0x00000000458b5b9800000000453f317c + + /* x^187392 mod p(x)` << 1, x^187456 mod p(x)` << 1 */ + .octa 0x000000012e31fb8e0000000121675cce + + /* x^186368 mod p(x)` << 1, x^186432 mod p(x)` << 1 */ + .octa 0x000000005cf619d800000001f409ee92 + + /* x^185344 mod p(x)` << 1, x^185408 mod p(x)` << 1 */ + .octa 0x0000000063f4d8b200000000f36b9c88 + + /* x^184320 mod p(x)` << 1, x^184384 mod p(x)` << 1 */ + .octa 0x000000004138dc8a0000000036b398f4 + + /* x^183296 mod p(x)` << 1, x^183360 mod p(x)` << 1 */ + .octa 0x00000001d29ee8e000000001748f9adc + + /* x^182272 mod p(x)` << 1, x^182336 mod p(x)` << 1 */ + .octa 0x000000006a08ace800000001be94ec00 + + /* x^181248 mod p(x)` << 1, x^181312 mod p(x)` << 1 */ + .octa 0x0000000127d4201000000000b74370d6 + + /* x^180224 mod p(x)` << 1, x^180288 mod p(x)` << 1 */ + .octa 0x0000000019d76b6200000001174d0b98 + + /* x^179200 mod p(x)` << 1, x^179264 mod p(x)` << 1 */ + .octa 0x00000001b1471f6e00000000befc06a4 + + /* x^178176 mod p(x)` << 1, x^178240 mod p(x)` << 1 */ + .octa 0x00000001f64c19cc00000001ae125288 + + /* x^177152 mod p(x)` << 1, x^177216 mod p(x)` << 1 */ + .octa 0x00000000003c0ea00000000095c19b34 + + /* x^176128 mod p(x)` << 1, x^176192 mod p(x)` << 1 */ + .octa 0x000000014d73abf600000001a78496f2 + + /* x^175104 mod p(x)` << 1, x^175168 mod p(x)` << 1 */ + .octa 0x00000001620eb84400000001ac5390a0 + + /* x^174080 mod p(x)` << 1, x^174144 mod p(x)` << 1 */ + .octa 0x0000000147655048000000002a80ed6e + + /* x^173056 mod p(x)` << 1, x^173120 mod p(x)` << 1 */ + .octa 0x0000000067b5077e00000001fa9b0128 + + /* x^172032 mod p(x)` << 1, x^172096 mod p(x)` << 1 */ + .octa 0x0000000010ffe20600000001ea94929e + + /* x^171008 mod p(x)` << 1, x^171072 mod p(x)` << 1 */ + .octa 0x000000000fee8f1e0000000125f4305c + + /* x^169984 mod p(x)` 
<< 1, x^170048 mod p(x)` << 1 */ + .octa 0x00000001da26fbae00000001471e2002 + + /* x^168960 mod p(x)` << 1, x^169024 mod p(x)` << 1 */ + .octa 0x00000001b3a8bd880000000132d2253a + + /* x^167936 mod p(x)` << 1, x^168000 mod p(x)` << 1 */ + .octa 0x00000000e8f3898e00000000f26b3592 + + /* x^166912 mod p(x)` << 1, x^166976 mod p(x)` << 1 */ + .octa 0x00000000b0d0d28c00000000bc8b67b0 + + /* x^165888 mod p(x)` << 1, x^165952 mod p(x)` << 1 */ + .octa 0x0000000030f2a798000000013a826ef2 + + /* x^164864 mod p(x)` << 1, x^164928 mod p(x)` << 1 */ + .octa 0x000000000fba10020000000081482c84 + + /* x^163840 mod p(x)` << 1, x^163904 mod p(x)` << 1 */ + .octa 0x00000000bdb9bd7200000000e77307c2 + + /* x^162816 mod p(x)` << 1, x^162880 mod p(x)` << 1 */ + .octa 0x0000000075d3bf5a00000000d4a07ec8 + + /* x^161792 mod p(x)` << 1, x^161856 mod p(x)` << 1 */ + .octa 0x00000000ef1f98a00000000017102100 + + /* x^160768 mod p(x)` << 1, x^160832 mod p(x)` << 1 */ + .octa 0x00000000689c760200000000db406486 + + /* x^159744 mod p(x)` << 1, x^159808 mod p(x)` << 1 */ + .octa 0x000000016d5fa5fe0000000192db7f88 + + /* x^158720 mod p(x)` << 1, x^158784 mod p(x)` << 1 */ + .octa 0x00000001d0d2b9ca000000018bf67b1e + + /* x^157696 mod p(x)` << 1, x^157760 mod p(x)` << 1 */ + .octa 0x0000000041e7b470000000007c09163e + + /* x^156672 mod p(x)` << 1, x^156736 mod p(x)` << 1 */ + .octa 0x00000001cbb6495e000000000adac060 + + /* x^155648 mod p(x)` << 1, x^155712 mod p(x)` << 1 */ + .octa 0x000000010052a0b000000000bd8316ae + + /* x^154624 mod p(x)` << 1, x^154688 mod p(x)` << 1 */ + .octa 0x00000001d8effb5c000000019f09ab54 + + /* x^153600 mod p(x)` << 1, x^153664 mod p(x)` << 1 */ + .octa 0x00000001d969853c0000000125155542 + + /* x^152576 mod p(x)` << 1, x^152640 mod p(x)` << 1 */ + .octa 0x00000000523ccce2000000018fdb5882 + + /* x^151552 mod p(x)` << 1, x^151616 mod p(x)` << 1 */ + .octa 0x000000001e2436bc00000000e794b3f4 + + /* x^150528 mod p(x)` << 1, x^150592 mod p(x)` << 1 */ + .octa 
0x00000000ddd1c3a2000000016f9bb022 + + /* x^149504 mod p(x)` << 1, x^149568 mod p(x)` << 1 */ + .octa 0x0000000019fcfe3800000000290c9978 + + /* x^148480 mod p(x)` << 1, x^148544 mod p(x)` << 1 */ + .octa 0x00000001ce95db640000000083c0f350 + + /* x^147456 mod p(x)` << 1, x^147520 mod p(x)` << 1 */ + .octa 0x00000000af5828060000000173ea6628 + + /* x^146432 mod p(x)` << 1, x^146496 mod p(x)` << 1 */ + .octa 0x00000001006388f600000001c8b4e00a + + /* x^145408 mod p(x)` << 1, x^145472 mod p(x)` << 1 */ + .octa 0x0000000179eca00a00000000de95d6aa + + /* x^144384 mod p(x)` << 1, x^144448 mod p(x)` << 1 */ + .octa 0x0000000122410a6a000000010b7f7248 + + /* x^143360 mod p(x)` << 1, x^143424 mod p(x)` << 1 */ + .octa 0x000000004288e87c00000001326e3a06 + + /* x^142336 mod p(x)` << 1, x^142400 mod p(x)` << 1 */ + .octa 0x000000016c5490da00000000bb62c2e6 + + /* x^141312 mod p(x)` << 1, x^141376 mod p(x)` << 1 */ + .octa 0x00000000d1c71f6e0000000156a4b2c2 + + /* x^140288 mod p(x)` << 1, x^140352 mod p(x)` << 1 */ + .octa 0x00000001b4ce08a6000000011dfe763a + + /* x^139264 mod p(x)` << 1, x^139328 mod p(x)` << 1 */ + .octa 0x00000001466ba60c000000007bcca8e2 + + /* x^138240 mod p(x)` << 1, x^138304 mod p(x)` << 1 */ + .octa 0x00000001f6c488a40000000186118faa + + /* x^137216 mod p(x)` << 1, x^137280 mod p(x)` << 1 */ + .octa 0x000000013bfb06820000000111a65a88 + + /* x^136192 mod p(x)` << 1, x^136256 mod p(x)` << 1 */ + .octa 0x00000000690e9e54000000003565e1c4 + + /* x^135168 mod p(x)` << 1, x^135232 mod p(x)` << 1 */ + .octa 0x00000000281346b6000000012ed02a82 + + /* x^134144 mod p(x)` << 1, x^134208 mod p(x)` << 1 */ + .octa 0x000000015646402400000000c486ecfc + + /* x^133120 mod p(x)` << 1, x^133184 mod p(x)` << 1 */ + .octa 0x000000016063a8dc0000000001b951b2 + + /* x^132096 mod p(x)` << 1, x^132160 mod p(x)` << 1 */ + .octa 0x0000000116a663620000000048143916 + + /* x^131072 mod p(x)` << 1, x^131136 mod p(x)` << 1 */ + .octa 0x000000017e8aa4d200000001dc2ae124 + + /* x^130048 mod p(x)` 
<< 1, x^130112 mod p(x)` << 1 */ + .octa 0x00000001728eb10c00000001416c58d6 + + /* x^129024 mod p(x)` << 1, x^129088 mod p(x)` << 1 */ + .octa 0x00000001b08fd7fa00000000a479744a + + /* x^128000 mod p(x)` << 1, x^128064 mod p(x)` << 1 */ + .octa 0x00000001092a16e80000000096ca3a26 + + /* x^126976 mod p(x)` << 1, x^127040 mod p(x)` << 1 */ + .octa 0x00000000a505637c00000000ff223d4e + + /* x^125952 mod p(x)` << 1, x^126016 mod p(x)` << 1 */ + .octa 0x00000000d94869b2000000010e84da42 + + /* x^124928 mod p(x)` << 1, x^124992 mod p(x)` << 1 */ + .octa 0x00000001c8b203ae00000001b61ba3d0 + + /* x^123904 mod p(x)` << 1, x^123968 mod p(x)` << 1 */ + .octa 0x000000005704aea000000000680f2de8 + + /* x^122880 mod p(x)` << 1, x^122944 mod p(x)` << 1 */ + .octa 0x000000012e295fa2000000008772a9a8 + + /* x^121856 mod p(x)` << 1, x^121920 mod p(x)` << 1 */ + .octa 0x000000011d0908bc0000000155f295bc + + /* x^120832 mod p(x)` << 1, x^120896 mod p(x)` << 1 */ + .octa 0x0000000193ed97ea00000000595f9282 + + /* x^119808 mod p(x)` << 1, x^119872 mod p(x)` << 1 */ + .octa 0x000000013a0f1c520000000164b1c25a + + /* x^118784 mod p(x)` << 1, x^118848 mod p(x)` << 1 */ + .octa 0x000000010c2c40c000000000fbd67c50 + + /* x^117760 mod p(x)` << 1, x^117824 mod p(x)` << 1 */ + .octa 0x00000000ff6fac3e0000000096076268 + + /* x^116736 mod p(x)` << 1, x^116800 mod p(x)` << 1 */ + .octa 0x000000017b3609c000000001d288e4cc + + /* x^115712 mod p(x)` << 1, x^115776 mod p(x)` << 1 */ + .octa 0x0000000088c8c92200000001eaac1bdc + + /* x^114688 mod p(x)` << 1, x^114752 mod p(x)` << 1 */ + .octa 0x00000001751baae600000001f1ea39e2 + + /* x^113664 mod p(x)` << 1, x^113728 mod p(x)` << 1 */ + .octa 0x000000010795297200000001eb6506fc + + /* x^112640 mod p(x)` << 1, x^112704 mod p(x)` << 1 */ + .octa 0x0000000162b00abe000000010f806ffe + + /* x^111616 mod p(x)` << 1, x^111680 mod p(x)` << 1 */ + .octa 0x000000000d7b404c000000010408481e + + /* x^110592 mod p(x)` << 1, x^110656 mod p(x)` << 1 */ + .octa 
0x00000000763b13d40000000188260534 + + /* x^109568 mod p(x)` << 1, x^109632 mod p(x)` << 1 */ + .octa 0x00000000f6dc22d80000000058fc73e0 + + /* x^108544 mod p(x)` << 1, x^108608 mod p(x)` << 1 */ + .octa 0x000000007daae06000000000391c59b8 + + /* x^107520 mod p(x)` << 1, x^107584 mod p(x)` << 1 */ + .octa 0x000000013359ab7c000000018b638400 + + /* x^106496 mod p(x)` << 1, x^106560 mod p(x)` << 1 */ + .octa 0x000000008add438a000000011738f5c4 + + /* x^105472 mod p(x)` << 1, x^105536 mod p(x)` << 1 */ + .octa 0x00000001edbefdea000000008cf7c6da + + /* x^104448 mod p(x)` << 1, x^104512 mod p(x)` << 1 */ + .octa 0x000000004104e0f800000001ef97fb16 + + /* x^103424 mod p(x)` << 1, x^103488 mod p(x)` << 1 */ + .octa 0x00000000b48a82220000000102130e20 + + /* x^102400 mod p(x)` << 1, x^102464 mod p(x)` << 1 */ + .octa 0x00000001bcb4684400000000db968898 + + /* x^101376 mod p(x)` << 1, x^101440 mod p(x)` << 1 */ + .octa 0x000000013293ce0a00000000b5047b5e + + /* x^100352 mod p(x)` << 1, x^100416 mod p(x)` << 1 */ + .octa 0x00000001710d0844000000010b90fdb2 + + /* x^99328 mod p(x)` << 1, x^99392 mod p(x)` << 1 */ + .octa 0x0000000117907f6e000000004834a32e + + /* x^98304 mod p(x)` << 1, x^98368 mod p(x)` << 1 */ + .octa 0x0000000087ddf93e0000000059c8f2b0 + + /* x^97280 mod p(x)` << 1, x^97344 mod p(x)` << 1 */ + .octa 0x000000005970e9b00000000122cec508 + + /* x^96256 mod p(x)` << 1, x^96320 mod p(x)` << 1 */ + .octa 0x0000000185b2b7d0000000000a330cda + + /* x^95232 mod p(x)` << 1, x^95296 mod p(x)` << 1 */ + .octa 0x00000001dcee0efc000000014a47148c + + /* x^94208 mod p(x)` << 1, x^94272 mod p(x)` << 1 */ + .octa 0x0000000030da27220000000042c61cb8 + + /* x^93184 mod p(x)` << 1, x^93248 mod p(x)` << 1 */ + .octa 0x000000012f925a180000000012fe6960 + + /* x^92160 mod p(x)` << 1, x^92224 mod p(x)` << 1 */ + .octa 0x00000000dd2e357c00000000dbda2c20 + + /* x^91136 mod p(x)` << 1, x^91200 mod p(x)` << 1 */ + .octa 0x00000000071c80de000000011122410c + + /* x^90112 mod p(x)` << 1, x^90176 mod 
p(x)` << 1 */ + .octa 0x000000011513140a00000000977b2070 + + /* x^89088 mod p(x)` << 1, x^89152 mod p(x)` << 1 */ + .octa 0x00000001df876e8e000000014050438e + + /* x^88064 mod p(x)` << 1, x^88128 mod p(x)` << 1 */ + .octa 0x000000015f81d6ce0000000147c840e8 + + /* x^87040 mod p(x)` << 1, x^87104 mod p(x)` << 1 */ + .octa 0x000000019dd94dbe00000001cc7c88ce + + /* x^86016 mod p(x)` << 1, x^86080 mod p(x)` << 1 */ + .octa 0x00000001373d206e00000001476b35a4 + + /* x^84992 mod p(x)` << 1, x^85056 mod p(x)` << 1 */ + .octa 0x00000000668ccade000000013d52d508 + + /* x^83968 mod p(x)` << 1, x^84032 mod p(x)` << 1 */ + .octa 0x00000001b192d268000000008e4be32e + + /* x^82944 mod p(x)` << 1, x^83008 mod p(x)` << 1 */ + .octa 0x00000000e30f3a7800000000024120fe + + /* x^81920 mod p(x)` << 1, x^81984 mod p(x)` << 1 */ + .octa 0x000000010ef1f7bc00000000ddecddb4 + + /* x^80896 mod p(x)` << 1, x^80960 mod p(x)` << 1 */ + .octa 0x00000001f5ac738000000000d4d403bc + + /* x^79872 mod p(x)` << 1, x^79936 mod p(x)` << 1 */ + .octa 0x000000011822ea7000000001734b89aa + + /* x^78848 mod p(x)` << 1, x^78912 mod p(x)` << 1 */ + .octa 0x00000000c3a33848000000010e7a58d6 + + /* x^77824 mod p(x)` << 1, x^77888 mod p(x)` << 1 */ + .octa 0x00000001bd151c2400000001f9f04e9c + + /* x^76800 mod p(x)` << 1, x^76864 mod p(x)` << 1 */ + .octa 0x0000000056002d7600000000b692225e + + /* x^75776 mod p(x)` << 1, x^75840 mod p(x)` << 1 */ + .octa 0x000000014657c4f4000000019b8d3f3e + + /* x^74752 mod p(x)` << 1, x^74816 mod p(x)` << 1 */ + .octa 0x0000000113742d7c00000001a874f11e + + /* x^73728 mod p(x)` << 1, x^73792 mod p(x)` << 1 */ + .octa 0x000000019c5920ba000000010d5a4254 + + /* x^72704 mod p(x)` << 1, x^72768 mod p(x)` << 1 */ + .octa 0x000000005216d2d600000000bbb2f5d6 + + /* x^71680 mod p(x)` << 1, x^71744 mod p(x)` << 1 */ + .octa 0x0000000136f5ad8a0000000179cc0e36 + + /* x^70656 mod p(x)` << 1, x^70720 mod p(x)` << 1 */ + .octa 0x000000018b07beb600000001dca1da4a + + /* x^69632 mod p(x)` << 1, x^69696 mod 
p(x)` << 1 */ + .octa 0x00000000db1e93b000000000feb1a192 + + /* x^68608 mod p(x)` << 1, x^68672 mod p(x)` << 1 */ + .octa 0x000000000b96fa3a00000000d1eeedd6 + + /* x^67584 mod p(x)` << 1, x^67648 mod p(x)` << 1 */ + .octa 0x00000001d9968af0000000008fad9bb4 + + /* x^66560 mod p(x)` << 1, x^66624 mod p(x)` << 1 */ + .octa 0x000000000e4a77a200000001884938e4 + + /* x^65536 mod p(x)` << 1, x^65600 mod p(x)` << 1 */ + .octa 0x00000000508c2ac800000001bc2e9bc0 + + /* x^64512 mod p(x)` << 1, x^64576 mod p(x)` << 1 */ + .octa 0x0000000021572a8000000001f9658a68 + + /* x^63488 mod p(x)` << 1, x^63552 mod p(x)` << 1 */ + .octa 0x00000001b859daf2000000001b9224fc + + /* x^62464 mod p(x)` << 1, x^62528 mod p(x)` << 1 */ + .octa 0x000000016f7884740000000055b2fb84 + + /* x^61440 mod p(x)` << 1, x^61504 mod p(x)` << 1 */ + .octa 0x00000001b438810e000000018b090348 + + /* x^60416 mod p(x)` << 1, x^60480 mod p(x)` << 1 */ + .octa 0x0000000095ddc6f2000000011ccbd5ea + + /* x^59392 mod p(x)` << 1, x^59456 mod p(x)` << 1 */ + .octa 0x00000001d977c20c0000000007ae47f8 + + /* x^58368 mod p(x)` << 1, x^58432 mod p(x)` << 1 */ + .octa 0x00000000ebedb99a0000000172acbec0 + + /* x^57344 mod p(x)` << 1, x^57408 mod p(x)` << 1 */ + .octa 0x00000001df9e9e9200000001c6e3ff20 + + /* x^56320 mod p(x)` << 1, x^56384 mod p(x)` << 1 */ + .octa 0x00000001a4a3f95200000000e1b38744 + + /* x^55296 mod p(x)` << 1, x^55360 mod p(x)` << 1 */ + .octa 0x00000000e2f5122000000000791585b2 + + /* x^54272 mod p(x)` << 1, x^54336 mod p(x)` << 1 */ + .octa 0x000000004aa01f3e00000000ac53b894 + + /* x^53248 mod p(x)` << 1, x^53312 mod p(x)` << 1 */ + .octa 0x00000000b3e90a5800000001ed5f2cf4 + + /* x^52224 mod p(x)` << 1, x^52288 mod p(x)` << 1 */ + .octa 0x000000000c9ca2aa00000001df48b2e0 + + /* x^51200 mod p(x)` << 1, x^51264 mod p(x)` << 1 */ + .octa 0x000000015168231600000000049c1c62 + + /* x^50176 mod p(x)` << 1, x^50240 mod p(x)` << 1 */ + .octa 0x0000000036fce78c000000017c460c12 + + /* x^49152 mod p(x)` << 1, x^49216 mod 
p(x)` << 1 */ + .octa 0x000000009037dc10000000015be4da7e + + /* x^48128 mod p(x)` << 1, x^48192 mod p(x)` << 1 */ + .octa 0x00000000d3298582000000010f38f668 + + /* x^47104 mod p(x)` << 1, x^47168 mod p(x)` << 1 */ + .octa 0x00000001b42e8ad60000000039f40a00 + + /* x^46080 mod p(x)` << 1, x^46144 mod p(x)` << 1 */ + .octa 0x00000000142a983800000000bd4c10c4 + + /* x^45056 mod p(x)` << 1, x^45120 mod p(x)` << 1 */ + .octa 0x0000000109c7f1900000000042db1d98 + + /* x^44032 mod p(x)` << 1, x^44096 mod p(x)` << 1 */ + .octa 0x0000000056ff931000000001c905bae6 + + /* x^43008 mod p(x)` << 1, x^43072 mod p(x)` << 1 */ + .octa 0x00000001594513aa00000000069d40ea + + /* x^41984 mod p(x)` << 1, x^42048 mod p(x)` << 1 */ + .octa 0x00000001e3b5b1e8000000008e4fbad0 + + /* x^40960 mod p(x)` << 1, x^41024 mod p(x)` << 1 */ + .octa 0x000000011dd5fc080000000047bedd46 + + /* x^39936 mod p(x)` << 1, x^40000 mod p(x)` << 1 */ + .octa 0x00000001675f0cc20000000026396bf8 + + /* x^38912 mod p(x)` << 1, x^38976 mod p(x)` << 1 */ + .octa 0x00000000d1c8dd4400000000379beb92 + + /* x^37888 mod p(x)` << 1, x^37952 mod p(x)` << 1 */ + .octa 0x0000000115ebd3d8000000000abae54a + + /* x^36864 mod p(x)` << 1, x^36928 mod p(x)` << 1 */ + .octa 0x00000001ecbd0dac0000000007e6a128 + + /* x^35840 mod p(x)` << 1, x^35904 mod p(x)` << 1 */ + .octa 0x00000000cdf67af2000000000ade29d2 + + /* x^34816 mod p(x)` << 1, x^34880 mod p(x)` << 1 */ + .octa 0x000000004c01ff4c00000000f974c45c + + /* x^33792 mod p(x)` << 1, x^33856 mod p(x)` << 1 */ + .octa 0x00000000f2d8657e00000000e77ac60a + + /* x^32768 mod p(x)` << 1, x^32832 mod p(x)` << 1 */ + .octa 0x000000006bae74c40000000145895816 + + /* x^31744 mod p(x)` << 1, x^31808 mod p(x)` << 1 */ + .octa 0x0000000152af8aa00000000038e362be + + /* x^30720 mod p(x)` << 1, x^30784 mod p(x)` << 1 */ + .octa 0x0000000004663802000000007f991a64 + + /* x^29696 mod p(x)` << 1, x^29760 mod p(x)` << 1 */ + .octa 0x00000001ab2f5afc00000000fa366d3a + + /* x^28672 mod p(x)` << 1, x^28736 mod 
p(x)` << 1 */ + .octa 0x0000000074a4ebd400000001a2bb34f0 + + /* x^27648 mod p(x)` << 1, x^27712 mod p(x)` << 1 */ + .octa 0x00000001d7ab3a4c0000000028a9981e + + /* x^26624 mod p(x)` << 1, x^26688 mod p(x)` << 1 */ + .octa 0x00000001a8da60c600000001dbc672be + + /* x^25600 mod p(x)` << 1, x^25664 mod p(x)` << 1 */ + .octa 0x000000013cf6382000000000b04d77f6 + + /* x^24576 mod p(x)` << 1, x^24640 mod p(x)` << 1 */ + .octa 0x00000000bec12e1e0000000124400d96 + + /* x^23552 mod p(x)` << 1, x^23616 mod p(x)` << 1 */ + .octa 0x00000001c6368010000000014ca4b414 + + /* x^22528 mod p(x)` << 1, x^22592 mod p(x)` << 1 */ + .octa 0x00000001e6e78758000000012fe2c938 + + /* x^21504 mod p(x)` << 1, x^21568 mod p(x)` << 1 */ + .octa 0x000000008d7f2b3c00000001faed01e6 + + /* x^20480 mod p(x)` << 1, x^20544 mod p(x)` << 1 */ + .octa 0x000000016b4a156e000000007e80ecfe + + /* x^19456 mod p(x)` << 1, x^19520 mod p(x)` << 1 */ + .octa 0x00000001c63cfeb60000000098daee94 + + /* x^18432 mod p(x)` << 1, x^18496 mod p(x)` << 1 */ + .octa 0x000000015f902670000000010a04edea + + /* x^17408 mod p(x)` << 1, x^17472 mod p(x)` << 1 */ + .octa 0x00000001cd5de11e00000001c00b4524 + + /* x^16384 mod p(x)` << 1, x^16448 mod p(x)` << 1 */ + .octa 0x000000001acaec540000000170296550 + + /* x^15360 mod p(x)` << 1, x^15424 mod p(x)` << 1 */ + .octa 0x000000002bd0ca780000000181afaa48 + + /* x^14336 mod p(x)` << 1, x^14400 mod p(x)` << 1 */ + .octa 0x0000000032d63d5c0000000185a31ffa + + /* x^13312 mod p(x)` << 1, x^13376 mod p(x)` << 1 */ + .octa 0x000000001c6d4e4c000000002469f608 + + /* x^12288 mod p(x)` << 1, x^12352 mod p(x)` << 1 */ + .octa 0x0000000106a60b92000000006980102a + + /* x^11264 mod p(x)` << 1, x^11328 mod p(x)` << 1 */ + .octa 0x00000000d3855e120000000111ea9ca8 + + /* x^10240 mod p(x)` << 1, x^10304 mod p(x)` << 1 */ + .octa 0x00000000e312563600000001bd1d29ce + + /* x^9216 mod p(x)` << 1, x^9280 mod p(x)` << 1 */ + .octa 0x000000009e8f7ea400000001b34b9580 + + /* x^8192 mod p(x)` << 1, x^8256 mod 
p(x)` << 1 */ + .octa 0x00000001c82e562c000000003076054e + + /* x^7168 mod p(x)` << 1, x^7232 mod p(x)` << 1 */ + .octa 0x00000000ca9f09ce000000012a608ea4 + + /* x^6144 mod p(x)` << 1, x^6208 mod p(x)` << 1 */ + .octa 0x00000000c63764e600000000784d05fe + + /* x^5120 mod p(x)` << 1, x^5184 mod p(x)` << 1 */ + .octa 0x0000000168d2e49e000000016ef0d82a + + /* x^4096 mod p(x)` << 1, x^4160 mod p(x)` << 1 */ + .octa 0x00000000e986c1480000000075bda454 + + /* x^3072 mod p(x)` << 1, x^3136 mod p(x)` << 1 */ + .octa 0x00000000cfb65894000000003dc0a1c4 + + /* x^2048 mod p(x)` << 1, x^2112 mod p(x)` << 1 */ + .octa 0x0000000111cadee400000000e9a5d8be + + /* x^1024 mod p(x)` << 1, x^1088 mod p(x)` << 1 */ + .octa 0x0000000171fb63ce00000001609bc4b4 + +.short_constants: + + /* Reduce final 1024-2048 bits to 64 bits, shifting 32 bits to include the trailing 32 bits of zeros */ + /* x^1952 mod p(x)`, x^1984 mod p(x)`, x^2016 mod p(x)`, x^2048 mod p(x)` */ + .octa 0x7fec2963e5bf80485cf015c388e56f72 + + /* x^1824 mod p(x)`, x^1856 mod p(x)`, x^1888 mod p(x)`, x^1920 mod p(x)` */ + .octa 0x38e888d4844752a9963a18920246e2e6 + + /* x^1696 mod p(x)`, x^1728 mod p(x)`, x^1760 mod p(x)`, x^1792 mod p(x)` */ + .octa 0x42316c00730206ad419a441956993a31 + + /* x^1568 mod p(x)`, x^1600 mod p(x)`, x^1632 mod p(x)`, x^1664 mod p(x)` */ + .octa 0x543d5c543e65ddf9924752ba2b830011 + + /* x^1440 mod p(x)`, x^1472 mod p(x)`, x^1504 mod p(x)`, x^1536 mod p(x)` */ + .octa 0x78e87aaf56767c9255bd7f9518e4a304 + + /* x^1312 mod p(x)`, x^1344 mod p(x)`, x^1376 mod p(x)`, x^1408 mod p(x)` */ + .octa 0x8f68fcec1903da7f6d76739fe0553f1e + + /* x^1184 mod p(x)`, x^1216 mod p(x)`, x^1248 mod p(x)`, x^1280 mod p(x)` */ + .octa 0x3f4840246791d588c133722b1fe0b5c3 + + /* x^1056 mod p(x)`, x^1088 mod p(x)`, x^1120 mod p(x)`, x^1152 mod p(x)` */ + .octa 0x34c96751b04de25a64b67ee0e55ef1f3 + + /* x^928 mod p(x)`, x^960 mod p(x)`, x^992 mod p(x)`, x^1024 mod p(x)` */ + .octa 0x156c8e180b4a395b069db049b8fdb1e7 + + /* x^800 mod 
p(x)`, x^832 mod p(x)`, x^864 mod p(x)`, x^896 mod p(x)` */ + .octa 0xe0b99ccbe661f7bea11bfaf3c9e90b9e + + /* x^672 mod p(x)`, x^704 mod p(x)`, x^736 mod p(x)`, x^768 mod p(x)` */ + .octa 0x041d37768cd75659817cdc5119b29a35 + + /* x^544 mod p(x)`, x^576 mod p(x)`, x^608 mod p(x)`, x^640 mod p(x)` */ + .octa 0x3a0777818cfaa9651ce9d94b36c41f1c + + /* x^416 mod p(x)`, x^448 mod p(x)`, x^480 mod p(x)`, x^512 mod p(x)` */ + .octa 0x0e148e8252377a554f256efcb82be955 + + /* x^288 mod p(x)`, x^320 mod p(x)`, x^352 mod p(x)`, x^384 mod p(x)` */ + .octa 0x9c25531d19e65ddeec1631edb2dea967 + + /* x^160 mod p(x)`, x^192 mod p(x)`, x^224 mod p(x)`, x^256 mod p(x)` */ + .octa 0x790606ff9957c0a65d27e147510ac59a + + /* x^32 mod p(x)`, x^64 mod p(x)`, x^96 mod p(x)`, x^128 mod p(x)` */ + .octa 0x82f63b786ea2d55ca66805eb18b8ea18 + + +.barrett_constants: + /* 33 bit reflected Barrett constant m - (4^32)/n */ + .octa 0x000000000000000000000000dea713f1 /* x^64 div p(x)` */ + /* 33 bit reflected Barrett constant n */ + .octa 0x00000000000000000000000105ec76f1 +#endif diff --git a/src/third_party/wiredtiger/src/support/power8/crc32_wrapper.c b/src/third_party/wiredtiger/src/support/power8/crc32_wrapper.c new file mode 100644 index 00000000000..34ac4150338 --- /dev/null +++ b/src/third_party/wiredtiger/src/support/power8/crc32_wrapper.c @@ -0,0 +1,66 @@ +#if defined(__powerpc64__) +#define CRC_TABLE +#include "crc32_constants.h" + +#define VMX_ALIGN 16 +#define VMX_ALIGN_MASK (VMX_ALIGN-1) + +#ifdef REFLECT +static unsigned int crc32_align(unsigned int crc, unsigned char *p, + unsigned long len) +{ + while (len--) + crc = crc_table[(crc ^ *p++) & 0xff] ^ (crc >> 8); + return crc; +} +#else +static unsigned int crc32_align(unsigned int crc, unsigned char *p, + unsigned long len) +{ + while (len--) + crc = crc_table[((crc >> 24) ^ *p++) & 0xff] ^ (crc << 8); + return crc; +} +#endif + +unsigned int __crc32_vpmsum(unsigned int crc, unsigned char *p, + unsigned long len); + +unsigned int 
crc32_vpmsum(unsigned int crc, unsigned char *p, + unsigned long len) +{ + unsigned int prealign; + unsigned int tail; + +#ifdef CRC_XOR + crc ^= 0xffffffff; +#endif + + if (len < VMX_ALIGN + VMX_ALIGN_MASK) { + crc = crc32_align(crc, p, len); + goto out; + } + + if ((unsigned long)p & VMX_ALIGN_MASK) { + prealign = VMX_ALIGN - ((unsigned long)p & VMX_ALIGN_MASK); + crc = crc32_align(crc, p, prealign); + len -= prealign; + p += prealign; + } + + crc = __crc32_vpmsum(crc, p, len & ~VMX_ALIGN_MASK); + + tail = len & VMX_ALIGN_MASK; + if (tail) { + p += len & ~VMX_ALIGN_MASK; + crc = crc32_align(crc, p, tail); + } + +out: +#ifdef CRC_XOR + crc ^= 0xffffffff; +#endif + + return crc; +} +#endif diff --git a/src/third_party/wiredtiger/src/support/power8/ppc-opcode.h b/src/third_party/wiredtiger/src/support/power8/ppc-opcode.h new file mode 100644 index 00000000000..b63feea60a0 --- /dev/null +++ b/src/third_party/wiredtiger/src/support/power8/ppc-opcode.h @@ -0,0 +1,23 @@ +#ifndef __OPCODES_H +#define __OPCODES_H + +#define __PPC_RA(a) (((a) & 0x1f) << 16) +#define __PPC_RB(b) (((b) & 0x1f) << 11) +#define __PPC_XA(a) ((((a) & 0x1f) << 16) | (((a) & 0x20) >> 3)) +#define __PPC_XB(b) ((((b) & 0x1f) << 11) | (((b) & 0x20) >> 4)) +#define __PPC_XS(s) ((((s) & 0x1f) << 21) | (((s) & 0x20) >> 5)) +#define __PPC_XT(s) __PPC_XS(s) +#define VSX_XX3(t, a, b) (__PPC_XT(t) | __PPC_XA(a) | __PPC_XB(b)) +#define VSX_XX1(s, a, b) (__PPC_XS(s) | __PPC_RA(a) | __PPC_RB(b)) + +#define PPC_INST_VPMSUMW 0x10000488 +#define PPC_INST_VPMSUMD 0x100004c8 +#define PPC_INST_MFVSRD 0x7c000066 +#define PPC_INST_MTVSRD 0x7c000166 + +#define VPMSUMW(t, a, b) .long PPC_INST_VPMSUMW | VSX_XX3((t), a, b) +#define VPMSUMD(t, a, b) .long PPC_INST_VPMSUMD | VSX_XX3((t), a, b) +#define MFVRD(a, t) .long PPC_INST_MFVSRD | VSX_XX1((t)+32, a, 0) +#define MTVRD(t, a) .long PPC_INST_MTVSRD | VSX_XX1((t)+32, a, 0) + +#endif diff --git a/src/third_party/wiredtiger/src/txn/txn_ckpt.c 
b/src/third_party/wiredtiger/src/txn/txn_ckpt.c index 6a2c1eef826..85102ae8cfe 100644 --- a/src/third_party/wiredtiger/src/txn/txn_ckpt.c +++ b/src/third_party/wiredtiger/src/txn/txn_ckpt.c @@ -179,14 +179,8 @@ __checkpoint_apply(WT_SESSION_IMPL *session, const char *cfg[], /* If we have already locked the handles, apply the operation. */ for (i = 0; i < session->ckpt_handle_next; ++i) { - if (session->ckpt_handle[i].dhandle != NULL) - WT_WITH_DHANDLE(session, - session->ckpt_handle[i].dhandle, - ret = (*op)(session, cfg)); - else - WT_WITH_HANDLE_LIST_LOCK(session, - ret = __wt_conn_btree_apply_single(session, - session->ckpt_handle[i].name, NULL, op, cfg)); + WT_WITH_DHANDLE(session, session->ckpt_handle[i], + ret = (*op)(session, cfg)); WT_RET(ret); } @@ -257,15 +251,11 @@ __wt_checkpoint_list(WT_SESSION_IMPL *session, const char *cfg[]) name = session->dhandle->name; session->dhandle = NULL; - /* Record busy file names, we'll deal with them in the checkpoint. */ - if ((ret = __wt_session_get_btree(session, name, NULL, NULL, 0)) == 0) - session->ckpt_handle[session->ckpt_handle_next++].dhandle = - session->dhandle; - else if (ret == EBUSY) - ret = __wt_strdup(session, name, - &session->ckpt_handle[session->ckpt_handle_next++].name); + if ((ret = __wt_session_get_btree(session, name, NULL, NULL, 0)) != 0) + return (ret == EBUSY ? 0 : ret); - return (ret); + session->ckpt_handle[session->ckpt_handle_next++] = session->dhandle; + return (0); } /* @@ -421,10 +411,7 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) if (F_ISSET(conn, WT_CONN_CKPT_SYNC)) WT_ERR(__checkpoint_apply(session, cfg, __wt_checkpoint_sync)); - /* Acquire the schema lock. */ - F_SET(session, WT_SESSION_LOCKED_SCHEMA); - __wt_spin_lock(session, &conn->schema_lock); - + /* Start the checkpoint for real. 
*/ WT_ERR(__wt_meta_track_on(session)); tracking = true; @@ -543,16 +530,25 @@ __txn_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) * Recovery relies on the checkpoint LSN in the metadata only being * updated by full checkpoints so only checkpoint the metadata for * full or non-logged checkpoints. + * + * This is very similar to __wt_meta_track_off, ideally they would be + * merged. */ if (full || !logging) { session->isolation = txn->isolation = WT_ISO_READ_UNCOMMITTED; /* Disable metadata tracking during the metadata checkpoint. */ saved_meta_next = session->meta_track_next; session->meta_track_next = NULL; + WT_WITH_METADATA_LOCK(session, ret, + WT_WITH_DHANDLE(session, + WT_SESSION_META_DHANDLE(session), + ret = __wt_checkpoint(session, cfg))); + session->meta_track_next = saved_meta_next; + WT_ERR(ret); + WT_WITH_DHANDLE(session, WT_SESSION_META_DHANDLE(session), - ret = __wt_checkpoint(session, cfg)); - session->meta_track_next = saved_meta_next; + ret = __wt_checkpoint_sync(session, NULL)); WT_ERR(ret); WT_ERR(__checkpoint_verbose_track(session, @@ -610,23 +606,13 @@ err: /* WT_TXN_LOG_CKPT_STOP : WT_TXN_LOG_CKPT_CLEANUP, NULL)); } - for (i = 0; i < session->ckpt_handle_next; ++i) { - if (session->ckpt_handle[i].dhandle == NULL) { - __wt_free(session, session->ckpt_handle[i].name); - continue; - } - WT_WITH_DHANDLE(session, session->ckpt_handle[i].dhandle, + for (i = 0; i < session->ckpt_handle_next; ++i) + WT_WITH_DHANDLE(session, session->ckpt_handle[i], WT_TRET(__wt_session_release_btree(session))); - } __wt_free(session, session->ckpt_handle); session->ckpt_handle_allocated = session->ckpt_handle_next = 0; - if (F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)) { - F_CLR(session, WT_SESSION_LOCKED_SCHEMA); - __wt_spin_unlock(session, &conn->schema_lock); - } - session->isolation = txn->isolation = saved_isolation; return (ret); } @@ -1189,7 +1175,8 @@ __wt_checkpoint(WT_SESSION_IMPL *session, const char *cfg[]) WT_ASSERT(session, 
session->dhandle->checkpoint == NULL); /* The metadata lock must be held when checkpointing the metadata. */ - WT_ASSERT(session, F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)); + WT_ASSERT(session, !WT_IS_METADATA(session, session->dhandle) || + F_ISSET(session, WT_SESSION_LOCKED_METADATA)); return (__checkpoint_worker(session, cfg, true, true)); } @@ -1253,17 +1240,9 @@ __wt_checkpoint_close(WT_SESSION_IMPL *session, bool final) } /* - * We should already have the schema lock unless we're finishing a bulk - * load -- the only other paths to closing files (sweep and LSM) have - * already checked for read-only trees. - */ - WT_ASSERT(session, - final || bulk || F_ISSET(session, WT_SESSION_LOCKED_SCHEMA)); - - /* * Turn on metadata tracking if: * - The session is not already doing metadata tracking. - * - The file was bulk loaded. + * - The file was not bulk loaded. * - The close is not during connection close. */ need_tracking = !WT_META_TRACKING(session) && !bulk && !final; diff --git a/src/third_party/wiredtiger/test/cursor_order/Makefile.am b/src/third_party/wiredtiger/test/cursor_order/Makefile.am new file mode 100644 index 00000000000..c0c0ed639bf --- /dev/null +++ b/src/third_party/wiredtiger/test/cursor_order/Makefile.am @@ -0,0 +1,13 @@ +AM_CPPFLAGS = -I$(top_builddir) -I$(top_srcdir)/src/include \ + -I$(top_srcdir)/test/utility + +noinst_PROGRAMS = cursor_order +cursor_order_LDADD = $(top_builddir)/libwiredtiger.la + +cursor_order_SOURCES = cursor_order_file.c cursor_order_ops.c cursor_order.c +cursor_order_LDFLAGS = -static + +TESTS = $(noinst_PROGRAMS) + +clean-local: + rm -rf WiredTiger* wt.* *.core __stats diff --git a/src/third_party/wiredtiger/test/cursor_order/cursor_order.c b/src/third_party/wiredtiger/test/cursor_order/cursor_order.c new file mode 100644 index 00000000000..14709a2e88e --- /dev/null +++ b/src/third_party/wiredtiger/test/cursor_order/cursor_order.c @@ -0,0 +1,303 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. 
+ * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include "cursor_order.h" + +static char home[512]; /* Program working dir */ +static char *progname; /* Program name */ +static FILE *logfp; /* Log file */ + +static int handle_error(WT_EVENT_HANDLER *, WT_SESSION *, int, const char *); +static int handle_message(WT_EVENT_HANDLER *, WT_SESSION *, const char *); +static void onint(int); +static void shutdown(void); +static int usage(void); +static void wt_connect(SHARED_CONFIG *, char *); +static void wt_shutdown(SHARED_CONFIG *); + +extern int __wt_optind; +extern char *__wt_optarg; + +int +main(int argc, char *argv[]) +{ + SHARED_CONFIG _cfg, *cfg; + int ch, cnt, runs; + char *config_open, *working_dir; + + if ((progname = strrchr(argv[0], DIR_DELIM)) == NULL) + progname = argv[0]; + else + ++progname; + + cfg = &_cfg; + config_open = NULL; + working_dir = NULL; + runs = 1; + + /* + * Explicitly initialize the shared configuration object before + * parsing command line options. + */ + cfg->append_inserters = 1; + cfg->conn = NULL; + cfg->ftype = ROW; + cfg->max_nops = 1000000; + cfg->multiple_files = false; + cfg->nkeys = 1000; + cfg->reverse_scanners = 5; + cfg->reverse_scan_ops = 10; + cfg->thread_finish = false; + cfg->vary_nops = false; + + while ((ch = __wt_getopt( + progname, argc, argv, "C:Fk:h:l:n:R:r:t:vw:W:")) != EOF) + switch (ch) { + case 'C': /* wiredtiger_open config */ + config_open = __wt_optarg; + break; + case 'F': /* multiple files */ + cfg->multiple_files = true; + break; + case 'h': + working_dir = __wt_optarg; + break; + case 'k': /* rows */ + cfg->nkeys = (uint64_t)atol(__wt_optarg); + break; + case 'l': /* log */ + if ((logfp = fopen(__wt_optarg, "w")) == NULL) { + fprintf(stderr, + "%s: %s\n", __wt_optarg, strerror(errno)); + return (EXIT_FAILURE); + } + break; + case 'n': /* operations */ + cfg->max_nops = (uint64_t)atol(__wt_optarg); + break; + case 'R': + cfg->reverse_scanners = (uint64_t)atol(__wt_optarg); + break; + case 'r': /* runs */ + runs = atoi(__wt_optarg); + break; + 
case 't': + switch (__wt_optarg[0]) { + case 'f': + cfg->ftype = FIX; + break; + case 'r': + cfg->ftype = ROW; + break; + case 'v': + cfg->ftype = VAR; + break; + default: + return (usage()); + } + break; + case 'v': /* vary operation count */ + cfg->vary_nops = true; + break; + case 'w': + cfg->reverse_scan_ops = (uint64_t)atol(__wt_optarg); + break; + case 'W': + cfg->append_inserters = (uint64_t)atol(__wt_optarg); + break; + default: + return (usage()); + } + + argc -= __wt_optind; + argv += __wt_optind; + if (argc != 0) + return (usage()); + + testutil_work_dir_from_path(home, 512, working_dir); + + if (cfg->vary_nops && !cfg->multiple_files) { + fprintf(stderr, + "Variable op counts only supported with multiple tables\n"); + return (usage()); + } + + /* Clean up on signal. */ + (void)signal(SIGINT, onint); + + printf("%s: process %" PRIu64 "\n", progname, (uint64_t)getpid()); + for (cnt = 1; runs == 0 || cnt <= runs; ++cnt) { + printf(" %d: %u reverse scanners, %u writers\n", cnt, + (int)cfg->reverse_scanners, (int)cfg->append_inserters); + + shutdown(); /* Clean up previous runs */ + + wt_connect(cfg, config_open); /* WiredTiger connection */ + + if (ops_start(cfg)) + return (EXIT_FAILURE); + + wt_shutdown(cfg); /* WiredTiger shut down */ + } + return (0); +} + +/* + * wt_connect -- + * Configure the WiredTiger connection. + */ +static void +wt_connect(SHARED_CONFIG *cfg, char *config_open) +{ + static WT_EVENT_HANDLER event_handler = { + handle_error, + handle_message, + NULL, + NULL /* Close handler. */ + }; + int ret; + char config[512]; + size_t print_count; + + testutil_clean_work_dir(home); + testutil_make_work_dir(home); + + print_count = (size_t)snprintf(config, sizeof(config), + "create,statistics=(all),error_prefix=\"%s\",%s%s", + progname, + config_open == NULL ? "" : ",", + config_open == NULL ? 
"" : config_open); + + if (print_count >= sizeof(config)) + testutil_die(EINVAL, "Config string too long"); + + if ((ret = wiredtiger_open( + home, &event_handler, config, &cfg->conn)) != 0) + testutil_die(ret, "wiredtiger_open"); +} + +/* + * wt_shutdown -- + * Flush the file to disk and shut down the WiredTiger connection. + */ +static void +wt_shutdown(SHARED_CONFIG *cfg) +{ + WT_CONNECTION *conn; + WT_SESSION *session; + int ret; + + conn = cfg->conn; + + if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) + testutil_die(ret, "conn.session"); + + if ((ret = session->checkpoint(session, NULL)) != 0) + testutil_die(ret, "session.checkpoint"); + + if ((ret = conn->close(conn, NULL)) != 0) + testutil_die(ret, "conn.close"); +} + +/* + * shutdown -- + * Clean up from previous runs. + */ +static void +shutdown(void) +{ + testutil_clean_work_dir(home); +} + +static int +handle_error(WT_EVENT_HANDLER *handler, + WT_SESSION *session, int error, const char *errmsg) +{ + (void)(handler); + (void)(session); + (void)(error); + + return (fprintf(stderr, "%s\n", errmsg) < 0 ? -1 : 0); +} + +static int +handle_message(WT_EVENT_HANDLER *handler, + WT_SESSION *session, const char *message) +{ + (void)(handler); + (void)(session); + + if (logfp != NULL) + return (fprintf(logfp, "%s\n", message) < 0 ? -1 : 0); + + return (printf("%s\n", message) < 0 ? -1 : 0); +} + +/* + * onint -- + * Interrupt signal handler. + */ +static void +onint(int signo) +{ + (void)(signo); + + shutdown(); + + fprintf(stderr, "\n"); + exit(EXIT_FAILURE); +} + +/* + * usage -- + * Display usage statement and exit failure. 
+ */ +static int +usage(void) +{ + fprintf(stderr, + "usage: %s " + "[-Fv] [-C wiredtiger-config] [-h home] [-k keys] [-l log]\n\t" + "[-n ops] [-R reverse_scanners] [-r runs] [-t f|r|v]\n\t" + "[-w reverse_scan_ops] [-W append_inserters]\n", + progname); + fprintf(stderr, "%s", + "\t-C specify wiredtiger_open configuration arguments\n" + "\t-F create a file per thread\n" + "\t-h set the program working directory\n" + "\t-k set number of keys to load\n" + "\t-l specify a log file\n" + "\t-n set number of operations each thread does\n" + "\t-R set number of reverse scanner threads\n" + "\t-r set number of runs (0 for continuous)\n" + "\t-t set a file type (fix | row | var)\n" + "\t-v do a different number of operations on different tables\n" + "\t-w set number of items to walk in a reverse scan\n" + "\t-W set number of threads doing append inserts\n"); + return (EXIT_FAILURE); +} diff --git a/src/third_party/wiredtiger/test/cursor_order/cursor_order.h b/src/third_party/wiredtiger/test/cursor_order/cursor_order.h new file mode 100644 index 00000000000..dd49fce124b --- /dev/null +++ b/src/third_party/wiredtiger/test/cursor_order/cursor_order.h @@ -0,0 +1,54 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include <signal.h> + +#include "test_util.i" + +#define FNAME "file:cursor_order.%03d" /* File name */ + +typedef enum { FIX, ROW, VAR } __ftype; /* File type */ + +typedef struct { + uint64_t append_inserters; /* Number of append threads */ + WT_CONNECTION *conn; /* WiredTiger connection */ + __ftype ftype; + uint64_t key_range; /* Current key range */ + uint64_t max_nops; /* Operations per thread */ + bool multiple_files; /* File per thread */ + uint64_t nkeys; /* Keys to load */ + uint64_t reverse_scanners; /* Number of scan threads */ + uint64_t reverse_scan_ops; /* Keys to visit per scan */ + bool thread_finish; /* Signal to finish run. */ + bool vary_nops; /* Operations per thread */ + +} SHARED_CONFIG; + +void load(SHARED_CONFIG *, const char *); +int ops_start(SHARED_CONFIG *); +void verify(SHARED_CONFIG *, const char *); diff --git a/src/third_party/wiredtiger/test/cursor_order/cursor_order_file.c b/src/third_party/wiredtiger/test/cursor_order/cursor_order_file.c new file mode 100644 index 00000000000..e5dd76fa1a1 --- /dev/null +++ b/src/third_party/wiredtiger/test/cursor_order/cursor_order_file.c @@ -0,0 +1,130 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. 
+ * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "cursor_order.h" + +static void +file_create(SHARED_CONFIG *cfg, const char *name) +{ + WT_CONNECTION *conn; + WT_SESSION *session; + int ret; + char *p, *end, config[128]; + + conn = cfg->conn; + + if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) + testutil_die(ret, "conn.session"); + + p = config; + end = config + sizeof(config); + p += snprintf(p, (size_t)(end - p), + "key_format=%s," + "internal_page_max=%d," + "split_deepen_min_child=200," + "leaf_page_max=%d,", + cfg->ftype == ROW ? 
"S" : "r", 16 * 1024, 128 * 1024); + if (cfg->ftype == FIX) + (void)snprintf(p, (size_t)(end - p), ",value_format=3t"); + + if ((ret = session->create(session, name, config)) != 0) + if (ret != EEXIST) + testutil_die(ret, "session.create"); + + if ((ret = session->close(session, NULL)) != 0) + testutil_die(ret, "session.close"); +} + +void +load(SHARED_CONFIG *cfg, const char *name) +{ + WT_CONNECTION *conn; + WT_CURSOR *cursor; + WT_ITEM *value, _value; + WT_SESSION *session; + char keybuf[64], valuebuf[64]; + int64_t keyno; + int ret; + + conn = cfg->conn; + + file_create(cfg, name); + + if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) + testutil_die(ret, "conn.session"); + + if ((ret = + session->open_cursor(session, name, NULL, "bulk", &cursor)) != 0) + testutil_die(ret, "cursor.open"); + + value = &_value; + for (keyno = 1; keyno <= (int64_t)cfg->nkeys; ++keyno) { + if (cfg->ftype == ROW) { + snprintf(keybuf, sizeof(keybuf), "%016u", (u_int)keyno); + cursor->set_key(cursor, &keybuf); + } else + cursor->set_key(cursor, (uint32_t)keyno); + value->data = valuebuf; + if (cfg->ftype == FIX) + cursor->set_value(cursor, 0x01); + else { + value->size = (uint32_t)snprintf( + valuebuf, sizeof(valuebuf), "%37u", (u_int)keyno); + cursor->set_value(cursor, value); + } + if ((ret = cursor->insert(cursor)) != 0) + testutil_die(ret, "cursor.insert"); + } + + /* Setup the starting key range for the workload phase. 
*/ + cfg->key_range = cfg->nkeys; + cursor->close(cursor); + session->checkpoint(session, NULL); + + if ((ret = session->close(session, NULL)) != 0) + testutil_die(ret, "session.close"); +} + +void +verify(SHARED_CONFIG *cfg, const char *name) +{ + WT_CONNECTION *conn; + WT_SESSION *session; + int ret; + + conn = cfg->conn; + + if ((ret = conn->open_session(conn, NULL, NULL, &session)) != 0) + testutil_die(ret, "conn.session"); + + if ((ret = session->verify(session, name, NULL)) != 0) + testutil_die(ret, "session.verify"); + + if ((ret = session->close(session, NULL)) != 0) + testutil_die(ret, "session.close"); +} diff --git a/src/third_party/wiredtiger/test/cursor_order/cursor_order_ops.c b/src/third_party/wiredtiger/test/cursor_order/cursor_order_ops.c new file mode 100644 index 00000000000..9077f500594 --- /dev/null +++ b/src/third_party/wiredtiger/test/cursor_order/cursor_order_ops.c @@ -0,0 +1,364 @@ +/*- + * Public Domain 2014-2016 MongoDB, Inc. + * Public Domain 2008-2014 WiredTiger, Inc. + * + * This is free and unencumbered software released into the public domain. + * + * Anyone is free to copy, modify, publish, use, compile, sell, or + * distribute this software, either in source code form or as a compiled + * binary, for any purpose, commercial or non-commercial, and by any + * means. + * + * In jurisdictions that recognize copyright laws, the author or authors + * of this software dedicate any and all copyright interest in the + * software to the public domain. We make this dedication for the benefit + * of the public at large and to the detriment of our heirs and + * successors. We intend this dedication to be an overt act of + * relinquishment in perpetuity of all present and future rights to this + * software under copyright law. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "cursor_order.h" + +static void *append_insert(void *); +static void print_stats(SHARED_CONFIG *); +static void *reverse_scan(void *); + +typedef struct { + char *name; /* object name */ + uint64_t nops; /* Thread op count */ + + WT_RAND_STATE rnd; /* RNG */ + + int append_insert; /* cursor.insert */ + int reverse_scans; /* cursor.prev sequences */ + SHARED_CONFIG *cfg; +} INFO; + +static INFO *run_info; + +int +ops_start(SHARED_CONFIG *cfg) +{ + struct timeval start, stop; + double seconds; + pthread_t *tids; + uint64_t i, name_index, offset, total_nops; + int ret; + void *thread_ret; + + tids = NULL; /* Keep GCC 4.1 happy. */ + total_nops = 0; + + /* Create per-thread structures. */ + if ((run_info = calloc( + (size_t)(cfg->reverse_scanners + cfg->append_inserters), + sizeof(*run_info))) == NULL) + testutil_die(errno, "calloc"); + + if ((tids = calloc( + (size_t)(cfg->reverse_scanners + cfg->append_inserters), + sizeof(*tids))) == NULL) + testutil_die(errno, "calloc"); + + /* Create the files and load the initial records. */ + for (i = 0; i < cfg->append_inserters; ++i) { + run_info[i].cfg = cfg; + if (i == 0 || cfg->multiple_files) { + if ((run_info[i].name = malloc(64)) == NULL) + testutil_die(errno, "malloc"); + snprintf(run_info[i].name, 64, FNAME, (int)i); + + /* Vary by orders of magnitude */ + if (cfg->vary_nops) + run_info[i].nops = + WT_MAX(1000, cfg->max_nops >> i); + load(cfg, run_info[i].name); + } else + run_info[i].name = run_info[0].name; + + /* Setup op count if not varying ops. 
*/ + if (run_info[i].nops == 0) + run_info[i].nops = cfg->max_nops; + total_nops += run_info[i].nops; + } + + /* Setup the reverse scanner configurations */ + for (i = 0; i < cfg->reverse_scanners; ++i) { + offset = i + cfg->append_inserters; + run_info[offset].cfg = cfg; + if (cfg->multiple_files) { + if ((run_info[offset].name = malloc(64)) == NULL) + testutil_die(errno, "malloc"); + /* Have reverse scans read from tables with writes. */ + name_index = i % cfg->append_inserters; + snprintf( + run_info[offset].name, 64, FNAME, (int)name_index); + + /* Vary by orders of magnitude */ + if (cfg->vary_nops) + run_info[offset].nops = + WT_MAX(1000, cfg->max_nops >> name_index); + } else + run_info[offset].name = run_info[0].name; + + /* Setup op count if not varying ops. */ + if (run_info[offset].nops == 0) + run_info[offset].nops = cfg->max_nops; + total_nops += run_info[offset].nops; + } + + (void)gettimeofday(&start, NULL); + + /* Create threads. */ + for (i = 0; i < cfg->reverse_scanners; ++i) + if ((ret = pthread_create( + &tids[i], NULL, reverse_scan, (void *)(uintptr_t)i)) != 0) + testutil_die(ret, "pthread_create"); + for (; i < cfg->reverse_scanners + cfg->append_inserters; ++i) { + if ((ret = pthread_create( + &tids[i], NULL, append_insert, (void *)(uintptr_t)i)) != 0) + testutil_die(ret, "pthread_create"); + } + + /* Wait for the threads. */ + for (i = 0; i < cfg->reverse_scanners + cfg->append_inserters; ++i) + (void)pthread_join(tids[i], &thread_ret); + + (void)gettimeofday(&stop, NULL); + seconds = (stop.tv_sec - start.tv_sec) + + (stop.tv_usec - start.tv_usec) * 1e-6; + fprintf(stderr, "timer: %.2lf seconds (%d ops/second)\n", + seconds, (int)(((cfg->reverse_scanners + cfg->append_inserters) * + total_nops) / seconds)); + + /* Verify the files. */ + for (i = 0; i < cfg->reverse_scanners + cfg->append_inserters; ++i) { + verify(cfg, run_info[i].name); + if (!cfg->multiple_files) + break; + } + + /* Output run statistics. 
*/ + print_stats(cfg); + + /* Free allocated memory. */ + for (i = 0; i < cfg->reverse_scanners + cfg->append_inserters; ++i) { + free(run_info[i].name); + if (!cfg->multiple_files) + break; + } + + free(run_info); + free(tids); + + return (0); +} + +/* + * reverse_scan_op -- + * Walk a cursor back from the end of the file. + */ +static inline void +reverse_scan_op( + SHARED_CONFIG *cfg, WT_SESSION *session, WT_CURSOR *cursor, INFO *s) +{ + uint64_t i; + int ret; + char *strkey; + uint64_t initial_key_range; + uint64_t prev_key, this_key; + + WT_UNUSED(session); + WT_UNUSED(s); + + /* Make GCC 4.1 happy */ + prev_key = this_key = 0; + + /* Reset the cursor */ + cursor->reset(cursor); + + /* Save the key range. */ + initial_key_range = cfg->key_range - cfg->append_inserters; + + for (i = 0; i < cfg->reverse_scan_ops; i++) { + if ((ret = cursor->prev(cursor)) != 0) { + if (ret == WT_NOTFOUND) + break; + testutil_die(ret, "cursor.prev"); + } + + if (cfg->ftype == ROW) { + cursor->get_key(cursor, &strkey); + this_key = (uint64_t)atol(strkey); + } else + cursor->get_key(cursor, (uint64_t*)&this_key); + + if (i == 0 && this_key < initial_key_range) + testutil_die(ret, + "cursor scan start range wrong first prev %" PRIu64 + " initial range: %" PRIu64, + this_key, initial_key_range); + if (i != 0 && this_key >= prev_key) + testutil_die(ret, + "cursor scan out of order this: %" PRIu64 + " prev: %" PRIu64, + this_key, prev_key); + prev_key = this_key; + } +} + +/* + * reverse_scan -- + * Reader thread start function. + */ +static void * +reverse_scan(void *arg) +{ + INFO *s; + SHARED_CONFIG *cfg; + WT_CURSOR *cursor; + WT_SESSION *session; + int id, ret; + char tid[128]; + uint64_t i; + + id = (int)(uintptr_t)arg; + s = &run_info[id]; + cfg = s->cfg; + __wt_thread_id(tid, sizeof(tid)); + __wt_random_init(&s->rnd); + + printf(" reverse scan thread %2d starting: tid: %s, file: %s\n", + id, tid, s->name); + + __wt_yield(); /* Get all the threads created. 
*/ + + if ((ret = cfg->conn->open_session( + cfg->conn, NULL, "isolation=snapshot", &session)) != 0) + testutil_die(ret, "conn.open_session"); + if ((ret = session->open_cursor( + session, s->name, NULL, NULL, &cursor)) != 0) + testutil_die(ret, "session.open_cursor"); + for (i = 0; i < s->nops && !cfg->thread_finish; + ++i, ++s->reverse_scans, __wt_yield()) + reverse_scan_op(cfg, session, cursor, s); + if ((ret = session->close(session, NULL)) != 0) + testutil_die(ret, "session.close"); + + printf(" reverse scan thread %2d stopping: tid: %s, file: %s\n", + id, tid, s->name); + + /* Notify all other threads to finish once the first thread is done */ + cfg->thread_finish = true; + + return (NULL); +} + +/* + * append_insert_op -- + * Write operation. + */ +static inline void +append_insert_op( + SHARED_CONFIG *cfg, WT_SESSION *session, WT_CURSOR *cursor, INFO *s) +{ + WT_ITEM *value, _value; + uint64_t keyno; + int ret; + char keybuf[64], valuebuf[64]; + + WT_UNUSED(session); + + value = &_value; + + keyno = __wt_atomic_add64(&cfg->key_range, 1); + if (cfg->ftype == ROW) { + snprintf(keybuf, sizeof(keybuf), "%016u", (u_int)keyno); + cursor->set_key(cursor, &keybuf); + } else + cursor->set_key(cursor, (uint32_t)keyno); + + ++s->append_insert; + value->data = valuebuf; + if (cfg->ftype == FIX) + cursor->set_value(cursor, 0x10); + else { + value->size = (uint32_t)snprintf( + valuebuf, sizeof(valuebuf), "XXX %37u", (u_int)keyno); + cursor->set_value(cursor, value); + } + if ((ret = cursor->insert(cursor)) != 0) + testutil_die(ret, "cursor.insert"); +} + +/* + * append_insert -- + * Writer thread start function. 
+ */ +static void * +append_insert(void *arg) +{ + INFO *s; + SHARED_CONFIG *cfg; + WT_CURSOR *cursor; + WT_SESSION *session; + uint64_t i; + int id, ret; + char tid[128]; + + id = (int)(uintptr_t)arg; + s = &run_info[id]; + cfg = s->cfg; + __wt_thread_id(tid, sizeof(tid)); + __wt_random_init(&s->rnd); + + printf("write thread %2d starting: tid: %s, file: %s\n", + id, tid, s->name); + + __wt_yield(); /* Get all the threads created. */ + + if ((ret = cfg->conn->open_session( + cfg->conn, NULL, "isolation=snapshot", &session)) != 0) + testutil_die(ret, "conn.open_session"); + if ((ret = session->open_cursor( + session, s->name, NULL, NULL, &cursor)) != 0) + testutil_die(ret, "session.open_cursor"); + for (i = 0; i < s->nops && !cfg->thread_finish; ++i, __wt_yield()) + append_insert_op(cfg, session, cursor, s); + if ((ret = session->close(session, NULL)) != 0) + testutil_die(ret, "session.close"); + + printf("write thread %2d stopping: tid: %s, file: %s\n", + id, tid, s->name); + + /* Notify all other threads to finish once the first thread is done */ + cfg->thread_finish = true; + + return (NULL); +} + +/* + * print_stats -- + * Display reverse scan/writer thread stats. 
+ */ +static void +print_stats(SHARED_CONFIG *cfg) +{ + INFO *s; + uint64_t id, total_threads; + + total_threads = cfg->reverse_scanners + cfg->append_inserters; + s = run_info; + for (id = 0; id < total_threads; ++id, ++s) + printf("%3d: reverse scans %6d, append inserts %6d\n", + (int)id, (int)s->reverse_scans, (int)s->append_insert); +} diff --git a/src/third_party/wiredtiger/test/format/config.h b/src/third_party/wiredtiger/test/format/config.h index d8b11b005d4..a17614bc044 100644 --- a/src/third_party/wiredtiger/test/format/config.h +++ b/src/third_party/wiredtiger/test/format/config.h @@ -246,6 +246,10 @@ static CONFIG c[] = { "minimum gain before prefix compression is used", 0x0, 0, 8, 256, &g.c_prefix_compression_min, NULL }, + { "quiet", + "quiet run (same as -q)", + C_IGNORE|C_BOOL, 0, 0, 0, &g.c_quiet, NULL }, + { "repeat_data_pct", "percent duplicate values in row- or var-length column-stores", 0x0, 0, 90, 90, &g.c_repeat_data_pct, NULL }, diff --git a/src/third_party/wiredtiger/test/format/format.h b/src/third_party/wiredtiger/test/format/format.h index 41c9de3dd30..03da1a84c9c 100644 --- a/src/third_party/wiredtiger/test/format/format.h +++ b/src/third_party/wiredtiger/test/format/format.h @@ -142,7 +142,6 @@ typedef struct { FILE *logfp; /* Log file */ int replay; /* Replaying a run. 
*/ - int track; /* Track progress */ int workers_finished; /* Operations completed */ pthread_rwlock_t backup_lock; /* Hot backup running */ @@ -210,6 +209,7 @@ typedef struct { uint32_t c_merge_max; uint32_t c_mmap; uint32_t c_ops; + uint32_t c_quiet; uint32_t c_prefix_compression; uint32_t c_prefix_compression_min; uint32_t c_repeat_data_pct; diff --git a/src/third_party/wiredtiger/test/format/t.c b/src/third_party/wiredtiger/test/format/t.c index ccbc0442e4a..0c0485c8bfe 100644 --- a/src/third_party/wiredtiger/test/format/t.c +++ b/src/third_party/wiredtiger/test/format/t.c @@ -64,7 +64,7 @@ main(int argc, char *argv[]) #endif /* Track progress unless we're re-directing output to a file. */ - g.track = isatty(1) ? 1 : 0; + g.c_quiet = isatty(1) ? 0 : 1; /* Set values from the command line. */ home = NULL; @@ -99,7 +99,7 @@ main(int argc, char *argv[]) g.logging = LOG_OPS; break; case 'q': /* Quiet */ - g.track = 0; + g.c_quiet = 1; break; case 'r': /* Replay a run */ g.replay = 1; @@ -259,7 +259,7 @@ main(int argc, char *argv[]) wts_salvage(); /* Overwrite the progress line with a completion line. */ - if (g.track) + if (!g.c_quiet) printf("\r%78s\r", " "); printf("%4d: %s, %s (%.0f seconds)\n", g.run_cnt, g.c_data_source, @@ -322,8 +322,8 @@ die(int e, const char *fmt, ...) (void)pthread_rwlock_wrlock(&g.death_lock); /* Try and turn off tracking so it doesn't obscure the error message. */ - if (g.track) { - g.track = 0; + if (!g.c_quiet) { + g.c_quiet = 1; fprintf(stderr, "\n"); } if (fmt != NULL) { /* Death message. 
*/ diff --git a/src/third_party/wiredtiger/test/format/util.c b/src/third_party/wiredtiger/test/format/util.c index 2b6b9d67fc3..82a6de97ab6 100644 --- a/src/third_party/wiredtiger/test/format/util.c +++ b/src/third_party/wiredtiger/test/format/util.c @@ -236,7 +236,7 @@ track(const char *tag, uint64_t cnt, TINFO *tinfo) int len; char msg[128]; - if (!g.track || tag == NULL) + if (g.c_quiet || tag == NULL) return; if (tinfo == NULL && cnt == 0) diff --git a/src/third_party/wiredtiger/test/suite/test_backup05.py b/src/third_party/wiredtiger/test/suite/test_backup05.py index 8b176d0f7d7..8ffeb6752df 100644 --- a/src/third_party/wiredtiger/test/suite/test_backup05.py +++ b/src/third_party/wiredtiger/test/suite/test_backup05.py @@ -71,7 +71,7 @@ class test_backup05(wttest.WiredTigerTestCase, suite_subprocess): session.verify(self.uri) conn.close() - def test_backup(self): + def backup(self): '''Check manual fsyncLock backup strategy''' # Here's the strategy: @@ -95,5 +95,9 @@ class test_backup05(wttest.WiredTigerTestCase, suite_subprocess): else: self.session.verify(self.uri) + def test_backup(self): + with self.expectedStdoutPattern('Recreating metadata'): + self.backup() + if __name__ == '__main__': wttest.run() diff --git a/src/third_party/wiredtiger/test/suite/test_bulk02.py b/src/third_party/wiredtiger/test/suite/test_bulk02.py index eeca6a56967..fe8118209f2 100644 --- a/src/third_party/wiredtiger/test/suite/test_bulk02.py +++ b/src/third_party/wiredtiger/test/suite/test_bulk02.py @@ -49,8 +49,7 @@ class test_bulkload_checkpoint(wttest.WiredTigerTestCase, suite_subprocess): scenarios = number_scenarios(multiply_scenarios('.', types, ckpt_type)) - # Bulk-load handles return EBUSY to the checkpoint code, causing the - # checkpoint call to find a handle anyway, and create fake checkpoint. + # Bulk-load handles are skipped by checkpoints. # Named and unnamed checkpoint versions. def test_bulkload_checkpoint(self): # Open a bulk cursor and insert a few records. 
@@ -72,11 +71,8 @@ class test_bulkload_checkpoint(wttest.WiredTigerTestCase, suite_subprocess): # In the case of named checkpoints, verify they're still there, # reflecting an empty file. if self.ckpt_type == 'named': - cursor = self.session.open_cursor( - self.uri, None, 'checkpoint=myckpt') - self.assertEquals(cursor.next(), wiredtiger.WT_NOTFOUND) - cursor.close() - + self.assertRaises(wiredtiger.WiredTigerError, + lambda: self.session.open_cursor(self.uri, None, 'checkpoint=myckpt')) # test_bulkload_backup # Test bulk-load with hot-backup. diff --git a/src/third_party/wiredtiger/test/suite/test_cursor_random.py b/src/third_party/wiredtiger/test/suite/test_cursor_random.py index 2cef62b218a..1fd30d93c11 100644 --- a/src/third_party/wiredtiger/test/suite/test_cursor_random.py +++ b/src/third_party/wiredtiger/test/suite/test_cursor_random.py @@ -137,7 +137,7 @@ class test_cursor_random_column(wttest.WiredTigerTestCase): def test_cursor_random_column(self): self.session.create(self.uri, 'key_format=r,value_format=S') - msg = '/Operation not supported/' + msg = '/next_random .* not supported/' self.assertRaisesWithMessage(wiredtiger.WiredTigerError, lambda: self.session.open_cursor(self.uri, None, "next_random=true"), msg) diff --git a/src/third_party/wiredtiger/test/suite/test_index01.py b/src/third_party/wiredtiger/test/suite/test_index01.py index bebeb191ef0..5dfa5506277 100644 --- a/src/third_party/wiredtiger/test/suite/test_index01.py +++ b/src/third_party/wiredtiger/test/suite/test_index01.py @@ -226,10 +226,6 @@ class test_index01(wttest.WiredTigerTestCase): self.assertRaises(wiredtiger.WiredTigerError, lambda: self.session.create(self.index[0], 'columns=(dept),exclusive')) - # non-exclusive create with differing configuration - self.assertRaisesWithMessage(wiredtiger.WiredTigerError, - lambda: self.session.create(self.index[0], - 'columns=(salary)'), '/does not match existing configuration/') self.drop_table() if __name__ == '__main__': diff --git 
a/src/third_party/wiredtiger/test/suite/test_schema02.py b/src/third_party/wiredtiger/test/suite/test_schema02.py index 6895e947efe..b404261c066 100644 --- a/src/third_party/wiredtiger/test/suite/test_schema02.py +++ b/src/third_party/wiredtiger/test/suite/test_schema02.py @@ -103,10 +103,6 @@ class test_schema02(wttest.WiredTigerTestCase): self.expect_failure_colgroup("main:c1", "columns=(S1,i2),exclusive", "") - # exists with different config - self.expect_failure_colgroup("main:c1", "columns=(S1,i4)", - "/does not match existing configuration/") - # colgroup not declared in initial create self.expect_failure_colgroup("main:c3", "columns=(S3,i4)", "/Column group 'c3' not found in" diff --git a/src/third_party/wiredtiger/test/suite/test_txn04.py b/src/third_party/wiredtiger/test/suite/test_txn04.py index de49c5fe235..bbd6ce8c4e2 100644 --- a/src/third_party/wiredtiger/test/suite/test_txn04.py +++ b/src/third_party/wiredtiger/test/suite/test_txn04.py @@ -121,17 +121,14 @@ class test_txn04(wttest.WiredTigerTestCase, suite_subprocess): cmd += self.backup_dir self.runWt(cmd.split()) - self.exception='false' backup_conn_params = 'log=(enabled,file_max=%s)' % self.logmax backup_conn = self.wiredtiger_open(self.backup_dir, backup_conn_params) try: self.check(backup_conn.open_session(), None, committed) - except: - self.exception='true' finally: backup_conn.close() - def test_ops(self): + def ops(self): self.session.create(self.uri, self.create_params) c = self.session.open_cursor(self.uri, None, 'overwrite') # Set up the table with entries for 1-5. @@ -149,7 +146,6 @@ class test_txn04(wttest.WiredTigerTestCase, suite_subprocess): # The runWt command closes our connection and sessions so # we need to reopen them here. self.hot_backup(None, committed) - self.assertEqual(True, self.exception == 'false') c = self.session.open_cursor(self.uri, None, 'overwrite') c.set_value(1) # Then do the given modification. 
@@ -192,14 +188,13 @@ class test_txn04(wttest.WiredTigerTestCase, suite_subprocess): # Check the state after each commit/rollback. self.check_all(current, committed) - # Backup the target we modified. We expect that running - # recovery now will generate an exception if we committed. + # Backup the target we modified and verify the data. # print 'Call hot_backup with ' + self.uri self.hot_backup(self.uri, committed) - if txn == 'commit': - self.assertEqual(True, self.exception == 'true') - else: - self.assertEqual(True, self.exception == 'false') + + def test_ops(self): + with self.expectedStdoutPattern('Recreating metadata'): + self.ops() if __name__ == '__main__': wttest.run() |