summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarko Mäkelä <marko.makela@mariadb.com>2021-09-17 09:14:20 +0300
committerMarko Mäkelä <marko.makela@mariadb.com>2021-09-17 14:40:41 +0300
commit1e9c922fa726b22f4522f2a4de0fcb6595404086 (patch)
tree38b86f6a6218af0142e45d87629e01305997cc95
parent1a4a7dddb6505352a7c3b4a2e1583ed92cad1dcf (diff)
downloadmariadb-git-1e9c922fa726b22f4522f2a4de0fcb6595404086.tar.gz
MDEV-26623 Possible race condition between statistics and bulk insert
When computing statistics, let us play it safe and check whether an insert into an empty table is in progress, once we have acquired the root page latch. If yes, let us pretend that the table is empty, just like MVCC reads would do. It is unlikely that this race condition could lead into any crashes before MDEV-24621, because the index tree structure should be protected by the page latches. But, at least we can avoid some busy work and return earlier. As part of this, some code that is only used for statistics calculation is being moved into static functions in that compilation unit.
-rw-r--r--storage/innobase/btr/btr0btr.cc74
-rw-r--r--storage/innobase/btr/btr0cur.cc344
-rw-r--r--storage/innobase/dict/dict0stats.cc463
-rw-r--r--storage/innobase/include/btr0btr.h24
-rw-r--r--storage/innobase/include/btr0cur.h31
5 files changed, 424 insertions, 512 deletions
diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc
index 56e84e47fda..02aa89361c5 100644
--- a/storage/innobase/btr/btr0btr.cc
+++ b/storage/innobase/btr/btr0btr.cc
@@ -274,38 +274,6 @@ btr_root_get(
}
/**************************************************************//**
-Gets the height of the B-tree (the level of the root, when the leaf
-level is assumed to be 0). The caller must hold an S or X latch on
-the index.
-@return tree height (level of the root) */
-ulint
-btr_height_get(
-/*===========*/
- const dict_index_t* index, /*!< in: index tree */
- mtr_t* mtr) /*!< in/out: mini-transaction */
-{
- ulint height=0;
- buf_block_t* root_block;
-
- ut_ad(srv_read_only_mode
- || mtr->memo_contains_flagged(&index->lock, MTR_MEMO_S_LOCK
- | MTR_MEMO_X_LOCK
- | MTR_MEMO_SX_LOCK));
-
- /* S latches the page */
- root_block = btr_root_block_get(index, RW_S_LATCH, mtr);
-
- if (root_block) {
- height = btr_page_get_level(buf_block_get_frame(root_block));
-
- /* Release the S latch on the root page. */
- mtr->memo_release(root_block, MTR_MEMO_PAGE_S_FIX);
- }
-
- return(height);
-}
-
-/**************************************************************//**
Checks a file segment header within a B-tree root page and updates
the segment header space id.
@return TRUE if valid */
@@ -563,48 +531,6 @@ btr_page_alloc(
}
/**************************************************************//**
-Gets the number of pages in a B-tree.
-@return number of pages, or ULINT_UNDEFINED if the index is unavailable */
-ulint
-btr_get_size(
-/*=========*/
- const dict_index_t* index, /*!< in: index */
- ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
- mtr_t* mtr) /*!< in/out: mini-transaction where index
- is s-latched */
-{
- ulint n=0;
-
- ut_ad(srv_read_only_mode
- || mtr->memo_contains(index->lock, MTR_MEMO_S_LOCK));
- ut_ad(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE);
-
- if (dict_index_is_online_ddl(index)
- || !index->is_committed()) {
- return(ULINT_UNDEFINED);
- }
-
- buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr);
- if (!root) {
- return ULINT_UNDEFINED;
- }
- mtr->x_lock_space(index->table->space);
- if (flag == BTR_N_LEAF_PAGES) {
- fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF
- + root->frame, &n, mtr);
- } else {
- ulint dummy;
- n = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_TOP
- + root->frame, &dummy, mtr);
- n += fseg_n_reserved_pages(*root,
- PAGE_HEADER + PAGE_BTR_SEG_LEAF
- + root->frame, &dummy, mtr);
- }
-
- return(n);
-}
-
-/**************************************************************//**
Frees a page used in an ibuf tree. Puts the page to the free list of the
ibuf tree. */
static
diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc
index adba676808c..32bdf9a8a51 100644
--- a/storage/innobase/btr/btr0cur.cc
+++ b/storage/innobase/btr/btr0cur.cc
@@ -137,17 +137,6 @@ can be released by page reorganize, then it is reorganized */
#define BTR_BLOB_HDR_SIZE 8 /*!< Size of a BLOB
part header, in bytes */
-/** Estimated table level stats from sampled value.
-@param value sampled stats
-@param index index being sampled
-@param sample number of sampled rows
-@param ext_size external stored data size
-@param not_empty table not empty
-@return estimated table wide stats from sampled value */
-#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty) \
- (((value) * static_cast<ib_uint64_t>(index->stat_n_leaf_pages) \
- + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size)))
-
/* @} */
/*******************************************************************//**
@@ -6552,339 +6541,6 @@ btr_estimate_n_rows_in_range(
index, tuple1, tuple2, 1);
}
-/*******************************************************************//**
-Record the number of non_null key values in a given index for
-each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
-The estimates are eventually stored in the array:
-index->stat_n_non_null_key_vals[], which is indexed from 0 to n-1. */
-static
-void
-btr_record_not_null_field_in_rec(
-/*=============================*/
- ulint n_unique, /*!< in: dict_index_get_n_unique(index),
- number of columns uniquely determine
- an index entry */
- const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index),
- its size could be for all fields or
- that of "n_unique" */
- ib_uint64_t* n_not_null) /*!< in/out: array to record number of
- not null rows for n-column prefix */
-{
- ulint i;
-
- ut_ad(rec_offs_n_fields(offsets) >= n_unique);
-
- if (n_not_null == NULL) {
- return;
- }
-
- for (i = 0; i < n_unique; i++) {
- if (rec_offs_nth_sql_null(offsets, i)) {
- break;
- }
-
- n_not_null[i]++;
- }
-}
-
-/** Estimates the number of different key values in a given index, for
-each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
-The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
-0..n_uniq-1) and the number of pages that were sampled is saved in
-result.n_sample_sizes[].
-If innodb_stats_method is nulls_ignored, we also record the number of
-non-null values for each prefix and stored the estimates in
-array result.n_non_null_key_vals.
-@param[in] index index
-@return vector with statistics information
-empty vector if the index is unavailable. */
-std::vector<index_field_stats_t>
-btr_estimate_number_of_different_key_vals(dict_index_t* index)
-{
- btr_cur_t cursor;
- page_t* page;
- rec_t* rec;
- ulint n_cols;
- ib_uint64_t* n_diff;
- ib_uint64_t* n_not_null;
- ibool stats_null_not_equal;
- uintmax_t n_sample_pages=1; /* number of pages to sample */
- ulint not_empty_flag = 0;
- ulint total_external_size = 0;
- ulint i;
- ulint j;
- uintmax_t add_on;
- mtr_t mtr;
- mem_heap_t* heap = NULL;
- rec_offs* offsets_rec = NULL;
- rec_offs* offsets_next_rec = NULL;
-
- std::vector<index_field_stats_t> result;
-
- /* For spatial index, there is no such stats can be
- fetched. */
- ut_ad(!dict_index_is_spatial(index));
-
- n_cols = dict_index_get_n_unique(index);
-
- heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null)
- * n_cols
- + dict_index_get_n_fields(index)
- * (sizeof *offsets_rec
- + sizeof *offsets_next_rec));
-
- n_diff = (ib_uint64_t*) mem_heap_zalloc(
- heap, n_cols * sizeof(n_diff[0]));
-
- n_not_null = NULL;
-
- /* Check srv_innodb_stats_method setting, and decide whether we
- need to record non-null value and also decide if NULL is
- considered equal (by setting stats_null_not_equal value) */
- switch (srv_innodb_stats_method) {
- case SRV_STATS_NULLS_IGNORED:
- n_not_null = (ib_uint64_t*) mem_heap_zalloc(
- heap, n_cols * sizeof *n_not_null);
- /* fall through */
-
- case SRV_STATS_NULLS_UNEQUAL:
- /* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL
- case, we will treat NULLs as unequal value */
- stats_null_not_equal = TRUE;
- break;
-
- case SRV_STATS_NULLS_EQUAL:
- stats_null_not_equal = FALSE;
- break;
-
- default:
- ut_error;
- }
-
- if (srv_stats_sample_traditional) {
- /* It makes no sense to test more pages than are contained
- in the index, thus we lower the number if it is too high */
- if (srv_stats_transient_sample_pages > index->stat_index_size) {
- if (index->stat_index_size > 0) {
- n_sample_pages = index->stat_index_size;
- }
- } else {
- n_sample_pages = srv_stats_transient_sample_pages;
- }
- } else {
- /* New logaritmic number of pages that are estimated.
- Number of pages estimated should be between 1 and
- index->stat_index_size.
-
- If we have only 0 or 1 index pages then we can only take 1
- sample. We have already initialized n_sample_pages to 1.
-
- So taking index size as I and sample as S and log(I)*S as L
-
- requirement 1) we want the out limit of the expression to not exceed I;
- requirement 2) we want the ideal pages to be at least S;
- so the current expression is min(I, max( min(S,I), L)
-
- looking for simplifications:
-
- case 1: assume S < I
- min(I, max( min(S,I), L) -> min(I , max( S, L))
-
- but since L=LOG2(I)*S and log2(I) >=1 L>S always so max(S,L) = L.
-
- so we have: min(I , L)
-
- case 2: assume I < S
- min(I, max( min(S,I), L) -> min(I, max( I, L))
-
- case 2a: L > I
- min(I, max( I, L)) -> min(I, L) -> I
-
- case 2b: when L < I
- min(I, max( I, L)) -> min(I, I ) -> I
-
- so taking all case2 paths is I, our expression is:
- n_pages = S < I? min(I,L) : I
- */
- if (index->stat_index_size > 1) {
- n_sample_pages = (srv_stats_transient_sample_pages < index->stat_index_size)
- ? ut_min(index->stat_index_size,
- static_cast<ulint>(
- log2(double(index->stat_index_size))
- * double(srv_stats_transient_sample_pages)))
- : index->stat_index_size;
- }
- }
-
- /* Sanity check */
- ut_ad(n_sample_pages > 0 && n_sample_pages <= (index->stat_index_size <= 1 ? 1 : index->stat_index_size));
-
- /* We sample some pages in the index to get an estimate */
-
- for (i = 0; i < n_sample_pages; i++) {
- mtr_start(&mtr);
-
- bool available;
-
- available = btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF,
- &cursor, &mtr);
-
- if (!available) {
- mtr_commit(&mtr);
- mem_heap_free(heap);
-
- return result;
- }
-
- /* Count the number of different key values for each prefix of
- the key on this index page. If the prefix does not determine
- the index record uniquely in the B-tree, then we subtract one
- because otherwise our algorithm would give a wrong estimate
- for an index where there is just one key value. */
-
- if (!index->is_readable()) {
- mtr_commit(&mtr);
- goto exit_loop;
- }
-
- page = btr_cur_get_page(&cursor);
-
- rec = page_rec_get_next(page_get_infimum_rec(page));
- const ulint n_core = page_is_leaf(page)
- ? index->n_core_fields : 0;
-
- if (!page_rec_is_supremum(rec)) {
- not_empty_flag = 1;
- offsets_rec = rec_get_offsets(rec, index, offsets_rec,
- n_core,
- ULINT_UNDEFINED, &heap);
-
- if (n_not_null != NULL) {
- btr_record_not_null_field_in_rec(
- n_cols, offsets_rec, n_not_null);
- }
- }
-
- while (!page_rec_is_supremum(rec)) {
- ulint matched_fields;
- rec_t* next_rec = page_rec_get_next(rec);
- if (page_rec_is_supremum(next_rec)) {
- total_external_size +=
- btr_rec_get_externally_stored_len(
- rec, offsets_rec);
- break;
- }
-
- offsets_next_rec = rec_get_offsets(next_rec, index,
- offsets_next_rec,
- n_core,
- ULINT_UNDEFINED,
- &heap);
-
- cmp_rec_rec(rec, next_rec,
- offsets_rec, offsets_next_rec,
- index, stats_null_not_equal,
- &matched_fields);
-
- for (j = matched_fields; j < n_cols; j++) {
- /* We add one if this index record has
- a different prefix from the previous */
-
- n_diff[j]++;
- }
-
- if (n_not_null != NULL) {
- btr_record_not_null_field_in_rec(
- n_cols, offsets_next_rec, n_not_null);
- }
-
- total_external_size
- += btr_rec_get_externally_stored_len(
- rec, offsets_rec);
-
- rec = next_rec;
- /* Initialize offsets_rec for the next round
- and assign the old offsets_rec buffer to
- offsets_next_rec. */
- {
- rec_offs* offsets_tmp = offsets_rec;
- offsets_rec = offsets_next_rec;
- offsets_next_rec = offsets_tmp;
- }
- }
-
- if (n_cols == dict_index_get_n_unique_in_tree(index)
- && page_has_siblings(page)) {
-
- /* If there is more than one leaf page in the tree,
- we add one because we know that the first record
- on the page certainly had a different prefix than the
- last record on the previous index page in the
- alphabetical order. Before this fix, if there was
- just one big record on each clustered index page, the
- algorithm grossly underestimated the number of rows
- in the table. */
-
- n_diff[n_cols - 1]++;
- }
-
- mtr_commit(&mtr);
- }
-
-exit_loop:
- /* If we saw k borders between different key values on
- n_sample_pages leaf pages, we can estimate how many
- there will be in index->stat_n_leaf_pages */
-
- /* We must take into account that our sample actually represents
- also the pages used for external storage of fields (those pages are
- included in index->stat_n_leaf_pages) */
-
- result.reserve(n_cols);
-
- for (j = 0; j < n_cols; j++) {
- index_field_stats_t stat;
-
- stat.n_diff_key_vals
- = BTR_TABLE_STATS_FROM_SAMPLE(
- n_diff[j], index, n_sample_pages,
- total_external_size, not_empty_flag);
-
- /* If the tree is small, smaller than
- 10 * n_sample_pages + total_external_size, then
- the above estimate is ok. For bigger trees it is common that we
- do not see any borders between key values in the few pages
- we pick. But still there may be n_sample_pages
- different key values, or even more. Let us try to approximate
- that: */
-
- add_on = index->stat_n_leaf_pages
- / (10 * (n_sample_pages
- + total_external_size));
-
- if (add_on > n_sample_pages) {
- add_on = n_sample_pages;
- }
-
- stat.n_diff_key_vals += add_on;
-
- stat.n_sample_sizes = n_sample_pages;
-
- if (n_not_null != NULL) {
- stat.n_non_null_key_vals =
- BTR_TABLE_STATS_FROM_SAMPLE(
- n_not_null[j], index, n_sample_pages,
- total_external_size, not_empty_flag);
- }
-
- result.push_back(stat);
- }
-
- mem_heap_free(heap);
-
- return result;
-}
-
/*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/
/***********************************************************//**
diff --git a/storage/innobase/dict/dict0stats.cc b/storage/innobase/dict/dict0stats.cc
index d7466ae5f8a..f92beac0f67 100644
--- a/storage/innobase/dict/dict0stats.cc
+++ b/storage/innobase/dict/dict0stats.cc
@@ -1024,6 +1024,367 @@ dict_stats_snapshot_free(
dict_stats_table_clone_free(t);
}
+/** Statistics for one field of an index. */
+struct index_field_stats_t
+{
+ ib_uint64_t n_diff_key_vals;
+ ib_uint64_t n_sample_sizes;
+ ib_uint64_t n_non_null_key_vals;
+
+ index_field_stats_t(ib_uint64_t n_diff_key_vals= 0,
+ ib_uint64_t n_sample_sizes= 0,
+ ib_uint64_t n_non_null_key_vals= 0)
+ : n_diff_key_vals(n_diff_key_vals), n_sample_sizes(n_sample_sizes),
+ n_non_null_key_vals(n_non_null_key_vals)
+ {
+ }
+};
+
+/*******************************************************************//**
+Record the number of non_null key values in a given index for
+each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
+The estimates are eventually stored in the array:
+index->stat_n_non_null_key_vals[], which is indexed from 0 to n-1. */
+static
+void
+btr_record_not_null_field_in_rec(
+/*=============================*/
+ ulint n_unique, /*!< in: dict_index_get_n_unique(index),
+ number of columns uniquely determine
+ an index entry */
+ const rec_offs* offsets, /*!< in: rec_get_offsets(rec, index),
+ its size could be for all fields or
+ that of "n_unique" */
+ ib_uint64_t* n_not_null) /*!< in/out: array to record number of
+ not null rows for n-column prefix */
+{
+ ulint i;
+
+ ut_ad(rec_offs_n_fields(offsets) >= n_unique);
+
+ if (n_not_null == NULL) {
+ return;
+ }
+
+ for (i = 0; i < n_unique; i++) {
+ if (rec_offs_nth_sql_null(offsets, i)) {
+ break;
+ }
+
+ n_not_null[i]++;
+ }
+}
+
+/** Estimated table level stats from sampled value.
+@param value sampled stats
+@param index index being sampled
+@param sample number of sampled rows
+@param ext_size external stored data size
+@param not_empty table not empty
+@return estimated table wide stats from sampled value */
+#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty) \
+ (((value) * static_cast<ib_uint64_t>(index->stat_n_leaf_pages) \
+ + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size)))
+
+/** Estimates the number of different key values in a given index, for
+each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
+The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
+0..n_uniq-1) and the number of pages that were sampled is saved in
+result.n_sample_sizes[].
+If innodb_stats_method is nulls_ignored, we also record the number of
+non-null values for each prefix and stored the estimates in
+array result.n_non_null_key_vals.
+@param index B-tree index
+@param bulk_trx_id the value of index->table->bulk_trx_id at the start
+@return vector with statistics information
+empty vector if the index is unavailable. */
+static
+std::vector<index_field_stats_t>
+btr_estimate_number_of_different_key_vals(dict_index_t* index,
+ trx_id_t bulk_trx_id)
+{
+ btr_cur_t cursor;
+ page_t* page;
+ rec_t* rec;
+ ulint n_cols;
+ ib_uint64_t* n_diff;
+ ib_uint64_t* n_not_null;
+ ibool stats_null_not_equal;
+ uintmax_t n_sample_pages=1; /* number of pages to sample */
+ ulint not_empty_flag = 0;
+ ulint total_external_size = 0;
+ ulint i;
+ ulint j;
+ uintmax_t add_on;
+ mtr_t mtr;
+ mem_heap_t* heap = NULL;
+ rec_offs* offsets_rec = NULL;
+ rec_offs* offsets_next_rec = NULL;
+
+ std::vector<index_field_stats_t> result;
+
+ ut_ad(!index->is_spatial());
+
+ n_cols = dict_index_get_n_unique(index);
+
+ heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null)
+ * n_cols
+ + dict_index_get_n_fields(index)
+ * (sizeof *offsets_rec
+ + sizeof *offsets_next_rec));
+
+ n_diff = (ib_uint64_t*) mem_heap_zalloc(
+ heap, n_cols * sizeof(n_diff[0]));
+
+ n_not_null = NULL;
+
+ /* Check srv_innodb_stats_method setting, and decide whether we
+ need to record non-null value and also decide if NULL is
+ considered equal (by setting stats_null_not_equal value) */
+ switch (srv_innodb_stats_method) {
+ case SRV_STATS_NULLS_IGNORED:
+ n_not_null = (ib_uint64_t*) mem_heap_zalloc(
+ heap, n_cols * sizeof *n_not_null);
+ /* fall through */
+
+ case SRV_STATS_NULLS_UNEQUAL:
+ /* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL
+ case, we will treat NULLs as unequal value */
+ stats_null_not_equal = TRUE;
+ break;
+
+ case SRV_STATS_NULLS_EQUAL:
+ stats_null_not_equal = FALSE;
+ break;
+
+ default:
+ ut_error;
+ }
+
+ if (srv_stats_sample_traditional) {
+ /* It makes no sense to test more pages than are contained
+ in the index, thus we lower the number if it is too high */
+ if (srv_stats_transient_sample_pages > index->stat_index_size) {
+ if (index->stat_index_size > 0) {
+ n_sample_pages = index->stat_index_size;
+ }
+ } else {
+ n_sample_pages = srv_stats_transient_sample_pages;
+ }
+ } else {
+ /* New logaritmic number of pages that are estimated.
+ Number of pages estimated should be between 1 and
+ index->stat_index_size.
+
+ If we have only 0 or 1 index pages then we can only take 1
+ sample. We have already initialized n_sample_pages to 1.
+
+ So taking index size as I and sample as S and log(I)*S as L
+
+ requirement 1) we want the out limit of the expression to not exceed I;
+ requirement 2) we want the ideal pages to be at least S;
+ so the current expression is min(I, max( min(S,I), L)
+
+ looking for simplifications:
+
+ case 1: assume S < I
+ min(I, max( min(S,I), L) -> min(I , max( S, L))
+
+ but since L=LOG2(I)*S and log2(I) >=1 L>S always so max(S,L) = L.
+
+ so we have: min(I , L)
+
+ case 2: assume I < S
+ min(I, max( min(S,I), L) -> min(I, max( I, L))
+
+ case 2a: L > I
+ min(I, max( I, L)) -> min(I, L) -> I
+
+ case 2b: when L < I
+ min(I, max( I, L)) -> min(I, I ) -> I
+
+ so taking all case2 paths is I, our expression is:
+ n_pages = S < I? min(I,L) : I
+ */
+ if (index->stat_index_size > 1) {
+ n_sample_pages = (srv_stats_transient_sample_pages < index->stat_index_size)
+ ? ut_min(index->stat_index_size,
+ static_cast<ulint>(
+ log2(double(index->stat_index_size))
+ * double(srv_stats_transient_sample_pages)))
+ : index->stat_index_size;
+ }
+ }
+
+ /* Sanity check */
+ ut_ad(n_sample_pages > 0 && n_sample_pages <= (index->stat_index_size <= 1 ? 1 : index->stat_index_size));
+
+ /* We sample some pages in the index to get an estimate */
+
+ for (i = 0; i < n_sample_pages; i++) {
+ mtr.start();
+
+ bool available;
+
+ available = btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF,
+ &cursor, &mtr);
+
+ if (!available || index->table->bulk_trx_id != bulk_trx_id) {
+ mtr.commit();
+ mem_heap_free(heap);
+
+ return result;
+ }
+
+ /* Count the number of different key values for each prefix of
+ the key on this index page. If the prefix does not determine
+ the index record uniquely in the B-tree, then we subtract one
+ because otherwise our algorithm would give a wrong estimate
+ for an index where there is just one key value. */
+
+ if (!index->is_readable()) {
+ mtr.commit();
+ goto exit_loop;
+ }
+
+ page = btr_cur_get_page(&cursor);
+
+ rec = page_rec_get_next(page_get_infimum_rec(page));
+ const ulint n_core = page_is_leaf(page)
+ ? index->n_core_fields : 0;
+
+ if (!page_rec_is_supremum(rec)) {
+ not_empty_flag = 1;
+ offsets_rec = rec_get_offsets(rec, index, offsets_rec,
+ n_core,
+ ULINT_UNDEFINED, &heap);
+
+ if (n_not_null != NULL) {
+ btr_record_not_null_field_in_rec(
+ n_cols, offsets_rec, n_not_null);
+ }
+ }
+
+ while (!page_rec_is_supremum(rec)) {
+ ulint matched_fields;
+ rec_t* next_rec = page_rec_get_next(rec);
+ if (page_rec_is_supremum(next_rec)) {
+ total_external_size +=
+ btr_rec_get_externally_stored_len(
+ rec, offsets_rec);
+ break;
+ }
+
+ offsets_next_rec = rec_get_offsets(next_rec, index,
+ offsets_next_rec,
+ n_core,
+ ULINT_UNDEFINED,
+ &heap);
+
+ cmp_rec_rec(rec, next_rec,
+ offsets_rec, offsets_next_rec,
+ index, stats_null_not_equal,
+ &matched_fields);
+
+ for (j = matched_fields; j < n_cols; j++) {
+ /* We add one if this index record has
+ a different prefix from the previous */
+
+ n_diff[j]++;
+ }
+
+ if (n_not_null != NULL) {
+ btr_record_not_null_field_in_rec(
+ n_cols, offsets_next_rec, n_not_null);
+ }
+
+ total_external_size
+ += btr_rec_get_externally_stored_len(
+ rec, offsets_rec);
+
+ rec = next_rec;
+ /* Initialize offsets_rec for the next round
+ and assign the old offsets_rec buffer to
+ offsets_next_rec. */
+ {
+ rec_offs* offsets_tmp = offsets_rec;
+ offsets_rec = offsets_next_rec;
+ offsets_next_rec = offsets_tmp;
+ }
+ }
+
+ if (n_cols == dict_index_get_n_unique_in_tree(index)
+ && page_has_siblings(page)) {
+
+ /* If there is more than one leaf page in the tree,
+ we add one because we know that the first record
+ on the page certainly had a different prefix than the
+ last record on the previous index page in the
+ alphabetical order. Before this fix, if there was
+ just one big record on each clustered index page, the
+ algorithm grossly underestimated the number of rows
+ in the table. */
+
+ n_diff[n_cols - 1]++;
+ }
+
+ mtr.commit();
+ }
+
+exit_loop:
+ /* If we saw k borders between different key values on
+ n_sample_pages leaf pages, we can estimate how many
+ there will be in index->stat_n_leaf_pages */
+
+ /* We must take into account that our sample actually represents
+ also the pages used for external storage of fields (those pages are
+ included in index->stat_n_leaf_pages) */
+
+ result.reserve(n_cols);
+
+ for (j = 0; j < n_cols; j++) {
+ index_field_stats_t stat;
+
+ stat.n_diff_key_vals
+ = BTR_TABLE_STATS_FROM_SAMPLE(
+ n_diff[j], index, n_sample_pages,
+ total_external_size, not_empty_flag);
+
+ /* If the tree is small, smaller than
+ 10 * n_sample_pages + total_external_size, then
+ the above estimate is ok. For bigger trees it is common that we
+ do not see any borders between key values in the few pages
+ we pick. But still there may be n_sample_pages
+ different key values, or even more. Let us try to approximate
+ that: */
+
+ add_on = index->stat_n_leaf_pages
+ / (10 * (n_sample_pages
+ + total_external_size));
+
+ if (add_on > n_sample_pages) {
+ add_on = n_sample_pages;
+ }
+
+ stat.n_diff_key_vals += add_on;
+
+ stat.n_sample_sizes = n_sample_pages;
+
+ if (n_not_null != NULL) {
+ stat.n_non_null_key_vals =
+ BTR_TABLE_STATS_FROM_SAMPLE(
+ n_not_null[j], index, n_sample_pages,
+ total_external_size, not_empty_flag);
+ }
+
+ result.push_back(stat);
+ }
+
+ mem_heap_free(heap);
+
+ return result;
+}
+
/*********************************************************************//**
Calculates new estimates for index statistics. This function is
relatively quick and is used to calculate transient statistics that
@@ -1054,39 +1415,48 @@ dummy_empty:
} else if (ibuf_debug && !dict_index_is_clust(index)) {
goto dummy_empty;
#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+ } else if (dict_index_is_online_ddl(index) || !index->is_committed()
+ || !index->table->space) {
+ goto dummy_empty;
} else {
mtr_t mtr;
- ulint size;
mtr.start();
mtr_s_lock_index(index, &mtr);
- size = btr_get_size(index, BTR_TOTAL_SIZE, &mtr);
-
- if (size != ULINT_UNDEFINED) {
- index->stat_index_size = size;
+ buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH,
+ &mtr);
+ if (!root) {
+invalid:
+ mtr.commit();
+ goto dummy_empty;
+ }
- size = btr_get_size(
- index, BTR_N_LEAF_PAGES, &mtr);
+ const auto bulk_trx_id = index->table->bulk_trx_id;
+ if (bulk_trx_id && trx_sys.find(nullptr, bulk_trx_id, false)) {
+ goto invalid;
}
- mtr.commit();
+ mtr.x_lock_space(index->table->space);
- switch (size) {
- case ULINT_UNDEFINED:
- goto dummy_empty;
- case 0:
- /* The root node of the tree is a leaf */
- size = 1;
- }
+ ulint dummy, size;
+ index->stat_index_size
+ = fseg_n_reserved_pages(*root, PAGE_HEADER
+ + PAGE_BTR_SEG_LEAF
+ + root->frame, &size, &mtr)
+ + fseg_n_reserved_pages(*root, PAGE_HEADER
+ + PAGE_BTR_SEG_TOP
+ + root->frame, &dummy, &mtr);
- index->stat_n_leaf_pages = size;
+ mtr.commit();
+
+ index->stat_n_leaf_pages = size ? size : 1;
/* Do not continue if table decryption has failed or
table is already marked as corrupted. */
if (index->is_readable()) {
std::vector<index_field_stats_t> stats
= btr_estimate_number_of_different_key_vals(
- index);
+ index, bulk_trx_id);
if (!stats.empty()) {
index->table->stats_mutex_lock();
@@ -2037,7 +2407,7 @@ struct index_stats_t
{
stats.reserve(n_uniq);
for (ulint i= 0; i < n_uniq; ++i)
- stats.push_back(index_field_stats_t(0, 1, 0));
+ stats.push_back(index_field_stats_t{0, 1, 0});
}
};
@@ -2123,15 +2493,12 @@ stat_n_leaf_pages. This function can be slow.
@return index stats */
static index_stats_t dict_stats_analyze_index(dict_index_t* index)
{
- ulint root_level;
- ulint level;
bool level_is_analyzed;
ulint n_uniq;
ulint n_prefix;
ib_uint64_t total_recs;
ib_uint64_t total_pages;
mtr_t mtr;
- ulint size;
index_stats_t result(index->n_uniq);
DBUG_ENTER("dict_stats_analyze_index");
@@ -2150,30 +2517,43 @@ static index_stats_t dict_stats_analyze_index(dict_index_t* index)
mtr.start();
mtr_s_lock_index(index, &mtr);
- size = btr_get_size(index, BTR_TOTAL_SIZE, &mtr);
+ uint16_t root_level;
+
+ {
+ buf_block_t* root;
+ root = btr_root_block_get(index, RW_SX_LATCH, &mtr);
+ if (!root) {
+empty_index:
+ mtr.commit();
+ dict_stats_assert_initialized_index(index);
+ DBUG_RETURN(result);
+ }
- if (size != ULINT_UNDEFINED) {
- result.index_size = size;
- size = btr_get_size(index, BTR_N_LEAF_PAGES, &mtr);
+ root_level = btr_page_get_level(root->frame);
+
+ mtr.x_lock_space(index->table->space);
+ ulint dummy, size;
+ result.index_size
+ = fseg_n_reserved_pages(*root, PAGE_HEADER
+ + PAGE_BTR_SEG_LEAF
+ + root->frame, &size, &mtr)
+ + fseg_n_reserved_pages(*root, PAGE_HEADER
+ + PAGE_BTR_SEG_TOP
+ + root->frame, &dummy, &mtr);
+ result.n_leaf_pages = size ? size : 1;
}
- /* Release the X locks on the root page taken by btr_get_size() */
- mtr.commit();
-
- switch (size) {
- case ULINT_UNDEFINED:
- dict_stats_assert_initialized_index(index);
- DBUG_RETURN(result);
- case 0:
- /* The root node of the tree is a leaf */
- size = 1;
+ const auto bulk_trx_id = index->table->bulk_trx_id;
+ if (bulk_trx_id && trx_sys.find(nullptr, bulk_trx_id, false)) {
+ result.index_size = 1;
+ result.n_leaf_pages = 1;
+ goto empty_index;
}
- result.n_leaf_pages = size;
+ mtr.commit();
mtr.start();
mtr_sx_lock_index(index, &mtr);
- root_level = btr_height_get(index, &mtr);
n_uniq = dict_index_get_n_unique(index);
@@ -2251,7 +2631,7 @@ static index_stats_t dict_stats_analyze_index(dict_index_t* index)
So if we find that the first level containing D distinct
keys (on n_prefix columns) is L, we continue from L when
searching for D distinct keys on n_prefix-1 columns. */
- level = root_level;
+ auto level = root_level;
level_is_analyzed = false;
for (n_prefix = n_uniq; n_prefix >= 1; n_prefix--) {
@@ -2265,7 +2645,10 @@ static index_stats_t dict_stats_analyze_index(dict_index_t* index)
mtr.commit();
mtr.start();
mtr_sx_lock_index(index, &mtr);
- if (root_level != btr_height_get(index, &mtr)) {
+ buf_block_t *root = btr_root_block_get(index, RW_S_LATCH,
+ &mtr);
+ if (!root || root_level != btr_page_get_level(root->frame)
+ || index->table->bulk_trx_id != bulk_trx_id) {
/* Just quit if the tree has changed beyond
recognition here. The old stats from previous
runs will remain in the values that we have
@@ -2277,6 +2660,8 @@ static index_stats_t dict_stats_analyze_index(dict_index_t* index)
break;
}
+ mtr.memo_release(root, MTR_MEMO_PAGE_S_FIX);
+
/* check whether we should pick the current level;
we pick level 1 even if it does not have enough
distinct records because we do not want to scan the
diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h
index 4543fc21e8a..40293df21e5 100644
--- a/storage/innobase/include/btr0btr.h
+++ b/storage/innobase/include/btr0btr.h
@@ -196,18 +196,6 @@ btr_root_adjust_on_import(
const dict_index_t* index) /*!< in: index tree */
MY_ATTRIBUTE((warn_unused_result));
-/**************************************************************//**
-Gets the height of the B-tree (the level of the root, when the leaf
-level is assumed to be 0). The caller must hold an S or X latch on
-the index.
-@return tree height (level of the root) */
-ulint
-btr_height_get(
-/*===========*/
- const dict_index_t* index, /*!< in: index tree */
- mtr_t* mtr) /*!< in/out: mini-transaction */
- MY_ATTRIBUTE((warn_unused_result));
-
/** Get an index page and declare its latching order level.
@param[in] index index tree
@param[in] page page number
@@ -537,17 +525,6 @@ btr_discard_page(
btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on
the root page */
mtr_t* mtr); /*!< in: mtr */
-/**************************************************************//**
-Gets the number of pages in a B-tree.
-@return number of pages, or ULINT_UNDEFINED if the index is unavailable */
-ulint
-btr_get_size(
-/*=========*/
- const dict_index_t* index, /*!< in: index */
- ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
- mtr_t* mtr) /*!< in/out: mini-transaction where index
- is s-latched */
- MY_ATTRIBUTE((warn_unused_result));
/**************************************************************//**
Allocates a new file page to be used in an index tree. NOTE: we assume
@@ -614,7 +591,6 @@ btr_root_block_get(
rw_lock_type_t mode, /*!< in: either RW_S_LATCH
or RW_X_LATCH */
mtr_t* mtr); /*!< in: mtr */
-
/*************************************************************//**
Reorganizes an index page.
diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h
index c7f25aff4b7..911c79c29ba 100644
--- a/storage/innobase/include/btr0cur.h
+++ b/storage/innobase/include/btr0cur.h
@@ -575,37 +575,6 @@ btr_estimate_n_rows_in_range(
btr_pos_t* range_start,
btr_pos_t* range_end);
-
-/** Statistics for one field of an index. */
-struct index_field_stats_t
-{
- ib_uint64_t n_diff_key_vals;
- ib_uint64_t n_sample_sizes;
- ib_uint64_t n_non_null_key_vals;
-
- index_field_stats_t(ib_uint64_t n_diff_key_vals= 0,
- ib_uint64_t n_sample_sizes= 0,
- ib_uint64_t n_non_null_key_vals= 0)
- : n_diff_key_vals(n_diff_key_vals), n_sample_sizes(n_sample_sizes),
- n_non_null_key_vals(n_non_null_key_vals)
- {
- }
-};
-
-/** Estimates the number of different key values in a given index, for
-each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
-The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
-0..n_uniq-1) and the number of pages that were sampled is saved in
-index->stat_n_sample_sizes[].
-If innodb_stats_method is nulls_ignored, we also record the number of
-non-null values for each prefix and stored the estimates in
-array index->stat_n_non_null_key_vals.
-@param[in] index index
-@return stat vector if the index is available and we get the estimated numbers,
-empty vector if the index is unavailable. */
-std::vector<index_field_stats_t>
-btr_estimate_number_of_different_key_vals(dict_index_t* index);
-
/** Gets the externally stored size of a record, in units of a database page.
@param[in] rec record
@param[in] offsets array returned by rec_get_offsets()