/*- * Copyright (c) 2014-present MongoDB, Inc. * Copyright (c) 2008-2014 WiredTiger, Inc. * All rights reserved. * * See the file LICENSE for redistribution information. */ /* * __col_insert_search_gt -- * Search a column-store insert list for the next larger record. */ static inline WT_INSERT * __col_insert_search_gt(WT_INSERT_HEAD *ins_head, uint64_t recno) { WT_INSERT *ins, **insp, *ret_ins; int i; /* * Compiler may replace the following usage of the variable with another read. * * Place a read barrier to avoid this issue. */ WT_ORDERED_READ_WEAK_MEMORDER(ins, WT_SKIP_LAST(ins_head)); /* If there's no insert chain to search, we're done. */ if (ins == NULL) return (NULL); /* Fast path check for targets past the end of the skiplist. */ if (recno >= WT_INSERT_RECNO(ins)) return (NULL); /* * The insert list is a skip list: start at the highest skip level, then go as far as possible * at each level before stepping down to the next. */ ret_ins = NULL; for (i = WT_SKIP_MAXDEPTH - 1, insp = &ins_head->head[i]; i >= 0;) { /* * CPUs with weak memory ordering may reorder the reads which may lead us to read a stale * and inconsistent value in the lower level. Place a read barrier to avoid this issue. * * This should use WT_ORDERED_READ_WEAK_MEMORDER. But to lower the risk of the change, we * keep this as before for now. */ WT_ORDERED_READ(ins, *insp); if (ins != NULL && recno >= WT_INSERT_RECNO(ins)) { /* GTE: keep going at this level */ insp = &ins->next[i]; ret_ins = ins; } else { --i; /* LT: drop down a level */ --insp; } } /* * If we didn't find any records greater than or equal to the target, we never set the return * value, set it to the first record in the list. * * Otherwise, it references a record less-than-or-equal to the target, move to a later record, * that is, a subsequent record greater than the target. Because inserts happen concurrently, * additional records might be inserted after the searched-for record that are still smaller * than the target, continue to move forward until reaching a record larger than the target. * There isn't any safety testing because we confirmed such a record exists before searching. */ if ((ins = ret_ins) == NULL) ins = WT_SKIP_FIRST(ins_head); while (recno >= WT_INSERT_RECNO(ins)) /* * CPUs with weak memory ordering may reorder the read and we may read a stale next value * and incorrectly skip a key that is concurrently inserted. For example, if we have A -> C * -> E initially, D is inserted, then B is inserted. If the current thread sees B, it would * be consistent to not see D. * * Place a read barrier to avoid this issue. */ WT_ORDERED_READ_WEAK_MEMORDER(ins, WT_SKIP_NEXT(ins)); return (ins); } /* * __col_insert_search_lt -- * Search a column-store insert list for the next smaller record. */ static inline WT_INSERT * __col_insert_search_lt(WT_INSERT_HEAD *ins_head, uint64_t recno) { WT_INSERT *ins, **insp, *ret_ins; int i; /* * Compiler may replace the following usage of the variable with another read. * * Place a read barrier to avoid this issue. */ WT_ORDERED_READ_WEAK_MEMORDER(ins, WT_SKIP_FIRST(ins_head)); /* If there's no insert chain to search, we're done. */ if (ins == NULL) return (NULL); /* Fast path check for targets before the skiplist. */ if (recno <= WT_INSERT_RECNO(ins)) return (NULL); /* * The insert list is a skip list: start at the highest skip level, then go as far as possible * at each level before stepping down to the next. */ ret_ins = NULL; for (i = WT_SKIP_MAXDEPTH - 1, insp = &ins_head->head[i]; i >= 0;) { /* * CPUs with weak memory ordering may reorder the reads which may lead us to read a stale * and inconsistent value in the lower level. Place a read barrier to avoid this issue. * * This should use WT_ORDERED_READ_WEAK_MEMORDER. But to lower the risk of the change, we * keep this as before for now. */ WT_ORDERED_READ(ins, *insp); if (ins != NULL && recno > WT_INSERT_RECNO(ins)) { /* GT: keep going at this level */ insp = &ins->next[i]; ret_ins = ins; } else { --i; /* LTE: drop down a level */ --insp; } } return (ret_ins); } /* * __col_insert_search_match -- * Search a column-store insert list for an exact match. */ static inline WT_INSERT * __col_insert_search_match(WT_INSERT_HEAD *ins_head, uint64_t recno) { WT_INSERT *ins, **insp; uint64_t ins_recno; int cmp, i; /* * Compiler may replace the following usage of the variable with another read. * * Place a read barrier to avoid this issue. */ WT_ORDERED_READ_WEAK_MEMORDER(ins, WT_SKIP_LAST(ins_head)); /* If there's no insert chain to search, we're done. */ if (ins == NULL) return (NULL); /* Fast path the check for values at the end of the skiplist. */ if (recno > WT_INSERT_RECNO(ins)) return (NULL); if (recno == WT_INSERT_RECNO(ins)) return (ins); /* * The insert list is a skip list: start at the highest skip level, then go as far as possible * at each level before stepping down to the next. */ for (i = WT_SKIP_MAXDEPTH - 1, insp = &ins_head->head[i]; i >= 0;) { /* * CPUs with weak memory ordering may reorder the reads which may lead us to read a stale * and inconsistent value in the lower level. Place a read barrier to avoid this issue. * * This should use WT_ORDERED_READ_WEAK_MEMORDER. But to lower the risk of the change, we * keep this as before for now. */ WT_ORDERED_READ(ins, *insp); if (ins == NULL) { --i; --insp; continue; } ins_recno = WT_INSERT_RECNO(ins); cmp = (recno == ins_recno) ? 0 : (recno < ins_recno) ? -1 : 1; if (cmp == 0) /* Exact match: return */ return (ins); if (cmp > 0) /* Keep going at this level */ insp = &ins->next[i]; else { /* Drop down a level */ --i; --insp; } } return (NULL); } /* * __col_insert_search -- * Search a column-store insert list, creating a skiplist stack as we go. */ static inline WT_INSERT * __col_insert_search( WT_INSERT_HEAD *ins_head, WT_INSERT ***ins_stack, WT_INSERT **next_stack, uint64_t recno) { WT_INSERT **insp, *ret_ins; uint64_t ins_recno; int cmp, i; /* * Compiler may replace the following usage of the variable with another read. * * Place a read barrier to avoid this issue. */ WT_ORDERED_READ_WEAK_MEMORDER(ret_ins, WT_SKIP_LAST(ins_head)); /* If there's no insert chain to search, we're done. */ if (ret_ins == NULL) return (NULL); /* Fast path appends. */ if (recno >= WT_INSERT_RECNO(ret_ins)) { for (i = 0; i < WT_SKIP_MAXDEPTH; i++) { ins_stack[i] = (i == 0) ? &ret_ins->next[0] : (ins_head->tail[i] != NULL) ? &ins_head->tail[i]->next[i] : &ins_head->head[i]; next_stack[i] = NULL; } return (ret_ins); } /* * The insert list is a skip list: start at the highest skip level, then go as far as possible * at each level before stepping down to the next. */ for (i = WT_SKIP_MAXDEPTH - 1, insp = &ins_head->head[i]; i >= 0;) { /* * Compiler and CPUs with weak memory ordering may reorder the reads causing us to read a * stale value here. Different to the row store version, it is generally OK here to read a * stale value as we don't have prefix search optimization for column store. Therefore, we * cannot wrongly skip the prefix comparison. However, we should still place a read barrier * here to ensure we see consistent values in the lower levels to prevent any unexpected * behavior. */ WT_ORDERED_READ_WEAK_MEMORDER(ret_ins, *insp); if (ret_ins == NULL) { next_stack[i] = NULL; ins_stack[i--] = insp--; continue; } /* * When no exact match is found, the search returns the smallest key larger than the * searched-for key, or the largest key smaller than the searched-for key, if there is no * larger key. Our callers depend on that: specifically, the fixed-length column store * cursor code interprets returning a key smaller than the searched-for key to mean the * searched-for key is larger than any key on the page. Don't change that behavior, things * will break. */ ins_recno = WT_INSERT_RECNO(ret_ins); cmp = (recno == ins_recno) ? 0 : (recno < ins_recno) ? -1 : 1; if (cmp > 0) /* Keep going at this level */ insp = &ret_ins->next[i]; else if (cmp == 0) /* Exact match: return */ for (; i >= 0; i--) { /* * It is possible that we read an old value that is inconsistent to the higher * levels of the skip list due to read reordering on CPUs with weak memory ordering. * Add a read barrier to avoid this issue. */ WT_ORDERED_READ_WEAK_MEMORDER(next_stack[i], ret_ins->next[i]); ins_stack[i] = &ret_ins->next[i]; } else { /* Drop down a level */ next_stack[i] = ret_ins; ins_stack[i--] = insp--; } } return (ret_ins); } /* * __col_var_last_recno -- * Return the last record number for a variable-length column-store page. */ static inline uint64_t __col_var_last_recno(WT_REF *ref) { WT_COL_RLE *repeat; WT_PAGE *page; page = ref->page; /* * If there's an append list, there may be more records on the page. This function ignores those * records, our callers must handle that explicitly, if they care. */ if (!WT_COL_VAR_REPEAT_SET(page)) return (page->entries == 0 ? WT_RECNO_OOB : ref->ref_recno + (page->entries - 1)); repeat = &page->pg_var_repeats[page->pg_var_nrepeats - 1]; return ((repeat->recno + repeat->rle) - 1 + (page->entries - (repeat->indx + 1))); } /* * __col_fix_last_recno -- * Return the last record number for a fixed-length column-store page. */ static inline uint64_t __col_fix_last_recno(WT_REF *ref) { WT_PAGE *page; page = ref->page; /* * If there's an append list, there may be more records on the page. This function ignores those * records, our callers must handle that explicitly, if they care. */ return (page->entries == 0 ? WT_RECNO_OOB : ref->ref_recno + (page->entries - 1)); } /* * __col_var_search -- * Search a variable-length column-store page for a record. */ static inline WT_COL * __col_var_search(WT_REF *ref, uint64_t recno, uint64_t *start_recnop) { WT_COL_RLE *repeat; WT_PAGE *page; uint64_t start_recno; uint32_t base, indx, limit, start_indx; page = ref->page; /* * Find the matching slot. * * This is done in two stages: first, we do a binary search among any repeating records to find * largest repeating less than the search key. Once there, we can do a simple offset calculation * to find the correct slot for this record number, because we know any intervening records have * repeat counts of 1. */ for (base = 0, limit = WT_COL_VAR_REPEAT_SET(page) ? page->pg_var_nrepeats : 0; limit != 0; limit >>= 1) { indx = base + (limit >> 1); repeat = page->pg_var_repeats + indx; if (recno >= repeat->recno && recno < repeat->recno + repeat->rle) { if (start_recnop != NULL) *start_recnop = repeat->recno; return (page->pg_var + repeat->indx); } if (recno < repeat->recno) continue; base = indx + 1; --limit; } /* * We didn't find an exact match, move forward from the largest repeat less than the search key. */ if (base == 0) { start_indx = 0; start_recno = ref->ref_recno; } else { repeat = page->pg_var_repeats + (base - 1); start_indx = repeat->indx + 1; start_recno = repeat->recno + repeat->rle; } /* * !!! * The test could be written more simply as: * * (recno >= start_recno + (page->entries - start_indx)) * * It's split into two parts because the simpler test will overflow if * searching for large record numbers. */ if (recno >= start_recno && recno - start_recno >= page->entries - start_indx) return (NULL); return (page->pg_var + start_indx + (uint32_t)(recno - start_recno)); }