MDEV-26623 Possible race condition between statistics and bulk insert

When computing statistics, let us play it safe and check whether an insert into an empty table is in progress, once we have acquired the root page latch. If yes, let us pretend that the table is empty, just like MVCC reads would do. It is unlikely that this race condition could lead into any crashes before MDEV-24621, because the index tree structure should be protected by the page latches. But, at least we can avoid some busy work and return earlier. As part of this, some code that is only used for statistics calculation is being moved into static functions in that compilation unit.
author: Marko Mäkelä <marko.makela@mariadb.com> 2021-09-17 09:14:20 +0300
committer: Marko Mäkelä <marko.makela@mariadb.com> 2021-09-17 14:40:41 +0300
commit: 1e9c922fa726b22f4522f2a4de0fcb6595404086 (patch)
tree: 38b86f6a6218af0142e45d87629e01305997cc95
parent: 1a4a7dddb6505352a7c3b4a2e1583ed92cad1dcf (diff)
download: mariadb-git-1e9c922fa726b22f4522f2a4de0fcb6595404086.tar.gz
5 files changed, 424 insertions, 512 deletions
diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc
index 56e84e47fda..02aa89361c5 100644
--- a/storage/innobase/btr/btr0btr.cc
+++ b/storage/innobase/btr/btr0btr.cc
@@ -274,38 +274,6 @@ btr_root_get(
 }
 
 /**************************************************************//**
-Gets the height of the B-tree (the level of the root, when the leaf
-level is assumed to be 0). The caller must hold an S or X latch on
-the index.
-@return tree height (level of the root) */
-ulint
-btr_height_get(
-/*===========*/
-	const dict_index_t*	index,	/*!< in: index tree */
-	mtr_t*		mtr)	/*!< in/out: mini-transaction */
-{
-	ulint		height=0;
-	buf_block_t*	root_block;
-
-	ut_ad(srv_read_only_mode
-	      || mtr->memo_contains_flagged(&index->lock, MTR_MEMO_S_LOCK
-					    | MTR_MEMO_X_LOCK
-					    | MTR_MEMO_SX_LOCK));
-
-	/* S latches the page */
-	root_block = btr_root_block_get(index, RW_S_LATCH, mtr);
-
-	if (root_block) {
-		height = btr_page_get_level(buf_block_get_frame(root_block));
-
-		/* Release the S latch on the root page. */
-		mtr->memo_release(root_block, MTR_MEMO_PAGE_S_FIX);
-	}
-
-	return(height);
-}
-
-/**************************************************************//**
 Checks a file segment header within a B-tree root page and updates
 the segment header space id.
 @return TRUE if valid */
@@ -563,48 +531,6 @@ btr_page_alloc(
 }
 
 /**************************************************************//**
-Gets the number of pages in a B-tree.
-@return number of pages, or ULINT_UNDEFINED if the index is unavailable */
-ulint
-btr_get_size(
-/*=========*/
-	const dict_index_t*	index,	/*!< in: index */
-	ulint		flag,	/*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
-	mtr_t*		mtr)	/*!< in/out: mini-transaction where index
-				is s-latched */
-{
-	ulint		n=0;
-
-	ut_ad(srv_read_only_mode
-	      || mtr->memo_contains(index->lock, MTR_MEMO_S_LOCK));
-	ut_ad(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE);
-
-	if (dict_index_is_online_ddl(index)
-	    || !index->is_committed()) {
-		return(ULINT_UNDEFINED);
-	}
-
-	buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH, mtr);
-	if (!root) {
-		return ULINT_UNDEFINED;
-	}
-	mtr->x_lock_space(index->table->space);
-	if (flag == BTR_N_LEAF_PAGES) {
-		fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_LEAF
-				      + root->frame, &n, mtr);
-	} else {
-		ulint dummy;
-		n = fseg_n_reserved_pages(*root, PAGE_HEADER + PAGE_BTR_SEG_TOP
-					  + root->frame, &dummy, mtr);
-		n += fseg_n_reserved_pages(*root,
-					   PAGE_HEADER + PAGE_BTR_SEG_LEAF
-					   + root->frame, &dummy, mtr);
-	}
-
-	return(n);
-}
-
-/**************************************************************//**
 Frees a page used in an ibuf tree. Puts the page to the free list of the
 ibuf tree. */
 static
diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc
index adba676808c..32bdf9a8a51 100644
--- a/storage/innobase/btr/btr0cur.cc
+++ b/storage/innobase/btr/btr0cur.cc
@@ -137,17 +137,6 @@ can be released by page reorganize, then it is reorganized */
 #define BTR_BLOB_HDR_SIZE		8	/*!< Size of a BLOB
 						part header, in bytes */
 
-/** Estimated table level stats from sampled value.
-@param value sampled stats
-@param index index being sampled
-@param sample number of sampled rows
-@param ext_size external stored data size
-@param not_empty table not empty
-@return estimated table wide stats from sampled value */
-#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty) \
-	(((value) * static_cast<ib_uint64_t>(index->stat_n_leaf_pages) \
-	  + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size)))
-
 /* @} */
 
 /*******************************************************************//**
@@ -6552,339 +6541,6 @@ btr_estimate_n_rows_in_range(
 		index, tuple1, tuple2, 1);
 }
 
-/*******************************************************************//**
-Record the number of non_null key values in a given index for
-each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
-The estimates are eventually stored in the array:
-index->stat_n_non_null_key_vals[], which is indexed from 0 to n-1. */
-static
-void
-btr_record_not_null_field_in_rec(
-/*=============================*/
-	ulint		n_unique,	/*!< in: dict_index_get_n_unique(index),
-					number of columns uniquely determine
-					an index entry */
-	const rec_offs*	offsets,	/*!< in: rec_get_offsets(rec, index),
-					its size could be for all fields or
-					that of "n_unique" */
-	ib_uint64_t*	n_not_null)	/*!< in/out: array to record number of
-					not null rows for n-column prefix */
-{
-	ulint	i;
-
-	ut_ad(rec_offs_n_fields(offsets) >= n_unique);
-
-	if (n_not_null == NULL) {
-		return;
-	}
-
-	for (i = 0; i < n_unique; i++) {
-		if (rec_offs_nth_sql_null(offsets, i)) {
-			break;
-		}
-
-		n_not_null[i]++;
-	}
-}
-
-/** Estimates the number of different key values in a given index, for
-each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
-The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
-0..n_uniq-1) and the number of pages that were sampled is saved in
-result.n_sample_sizes[].
-If innodb_stats_method is nulls_ignored, we also record the number of
-non-null values for each prefix and stored the estimates in
-array result.n_non_null_key_vals.
-@param[in]	index	index
-@return vector with statistics information
-empty vector if the index is unavailable. */
-std::vector<index_field_stats_t>
-btr_estimate_number_of_different_key_vals(dict_index_t* index)
-{
-	btr_cur_t	cursor;
-	page_t*		page;
-	rec_t*		rec;
-	ulint		n_cols;
-	ib_uint64_t*	n_diff;
-	ib_uint64_t*	n_not_null;
-	ibool		stats_null_not_equal;
-	uintmax_t	n_sample_pages=1; /* number of pages to sample */
-	ulint		not_empty_flag	= 0;
-	ulint		total_external_size = 0;
-	ulint		i;
-	ulint		j;
-	uintmax_t	add_on;
-	mtr_t		mtr;
-	mem_heap_t*	heap		= NULL;
-	rec_offs*	offsets_rec	= NULL;
-	rec_offs*	offsets_next_rec = NULL;
-
-	std::vector<index_field_stats_t> result;
-
-	/* For spatial index, there is no such stats can be
-	fetched. */
-	ut_ad(!dict_index_is_spatial(index));
-
-	n_cols = dict_index_get_n_unique(index);
-
-	heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null)
-			       * n_cols
-			       + dict_index_get_n_fields(index)
-			       * (sizeof *offsets_rec
-				  + sizeof *offsets_next_rec));
-
-	n_diff = (ib_uint64_t*) mem_heap_zalloc(
-		heap, n_cols * sizeof(n_diff[0]));
-
-	n_not_null = NULL;
-
-	/* Check srv_innodb_stats_method setting, and decide whether we
-	need to record non-null value and also decide if NULL is
-	considered equal (by setting stats_null_not_equal value) */
-	switch (srv_innodb_stats_method) {
-	case SRV_STATS_NULLS_IGNORED:
-		n_not_null = (ib_uint64_t*) mem_heap_zalloc(
-			heap, n_cols * sizeof *n_not_null);
-		/* fall through */
-
-	case SRV_STATS_NULLS_UNEQUAL:
-		/* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL
-		case, we will treat NULLs as unequal value */
-		stats_null_not_equal = TRUE;
-		break;
-
-	case SRV_STATS_NULLS_EQUAL:
-		stats_null_not_equal = FALSE;
-		break;
-
-	default:
-		ut_error;
-	}
-
-	if (srv_stats_sample_traditional) {
-		/* It makes no sense to test more pages than are contained
-		in the index, thus we lower the number if it is too high */
-		if (srv_stats_transient_sample_pages > index->stat_index_size) {
-			if (index->stat_index_size > 0) {
-				n_sample_pages = index->stat_index_size;
-			}
-		} else {
-			n_sample_pages = srv_stats_transient_sample_pages;
-		}
-	} else {
-		/* New logaritmic number of pages that are estimated.
-		Number of pages estimated should be between 1 and
-		index->stat_index_size.
-
-		If we have only 0 or 1 index pages then we can only take 1
-		sample. We have already initialized n_sample_pages to 1.
-
-		So taking index size as I and sample as S and log(I)*S as L
-
-		requirement 1) we want the out limit of the expression to not exceed I;
-		requirement 2) we want the ideal pages to be at least S;
-		so the current expression is min(I, max( min(S,I), L)
-
-		looking for simplifications:
-
-		case 1: assume S < I
-		min(I, max( min(S,I), L) -> min(I , max( S, L))
-
-		but since L=LOG2(I)*S and log2(I) >=1   L>S always so max(S,L) = L.
-
-		so we have: min(I , L)
-
-		case 2: assume I < S
-		    min(I, max( min(S,I), L) -> min(I, max( I, L))
-
-		case 2a: L > I
-		    min(I, max( I, L)) -> min(I, L) -> I
-
-		case 2b: when L < I
-		    min(I, max( I, L))  ->  min(I, I ) -> I
-
-		so taking all case2 paths is I, our expression is:
-		n_pages = S < I? min(I,L) : I
-                */
-		if (index->stat_index_size > 1) {
-			n_sample_pages = (srv_stats_transient_sample_pages < index->stat_index_size)
-				? ut_min(index->stat_index_size,
-					 static_cast<ulint>(
-						 log2(double(index->stat_index_size))
-						 * double(srv_stats_transient_sample_pages)))
-				: index->stat_index_size;
-		}
-	}
-
-	/* Sanity check */
-	ut_ad(n_sample_pages > 0 && n_sample_pages <= (index->stat_index_size <= 1 ? 1 : index->stat_index_size));
-
-	/* We sample some pages in the index to get an estimate */
-
-	for (i = 0; i < n_sample_pages; i++) {
-		mtr_start(&mtr);
-
-		bool	available;
-
-		available = btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF,
-						    &cursor, &mtr);
-
-		if (!available) {
-			mtr_commit(&mtr);
-			mem_heap_free(heap);
-
-			return result;
-		}
-
-		/* Count the number of different key values for each prefix of
-		the key on this index page. If the prefix does not determine
-		the index record uniquely in the B-tree, then we subtract one
-		because otherwise our algorithm would give a wrong estimate
-		for an index where there is just one key value. */
-
-		if (!index->is_readable()) {
-			mtr_commit(&mtr);
-			goto exit_loop;
-		}
-
-		page = btr_cur_get_page(&cursor);
-
-		rec = page_rec_get_next(page_get_infimum_rec(page));
-		const ulint n_core = page_is_leaf(page)
-			? index->n_core_fields : 0;
-
-		if (!page_rec_is_supremum(rec)) {
-			not_empty_flag = 1;
-			offsets_rec = rec_get_offsets(rec, index, offsets_rec,
-						      n_core,
-						      ULINT_UNDEFINED, &heap);
-
-			if (n_not_null != NULL) {
-				btr_record_not_null_field_in_rec(
-					n_cols, offsets_rec, n_not_null);
-			}
-		}
-
-		while (!page_rec_is_supremum(rec)) {
-			ulint	matched_fields;
-			rec_t*	next_rec = page_rec_get_next(rec);
-			if (page_rec_is_supremum(next_rec)) {
-				total_external_size +=
-					btr_rec_get_externally_stored_len(
-						rec, offsets_rec);
-				break;
-			}
-
-			offsets_next_rec = rec_get_offsets(next_rec, index,
-							   offsets_next_rec,
-							   n_core,
-							   ULINT_UNDEFINED,
-							   &heap);
-
-			cmp_rec_rec(rec, next_rec,
-				    offsets_rec, offsets_next_rec,
-				    index, stats_null_not_equal,
-				    &matched_fields);
-
-			for (j = matched_fields; j < n_cols; j++) {
-				/* We add one if this index record has
-				a different prefix from the previous */
-
-				n_diff[j]++;
-			}
-
-			if (n_not_null != NULL) {
-				btr_record_not_null_field_in_rec(
-					n_cols, offsets_next_rec, n_not_null);
-			}
-
-			total_external_size
-				+= btr_rec_get_externally_stored_len(
-					rec, offsets_rec);
-
-			rec = next_rec;
-			/* Initialize offsets_rec for the next round
-			and assign the old offsets_rec buffer to
-			offsets_next_rec. */
-			{
-				rec_offs* offsets_tmp = offsets_rec;
-				offsets_rec = offsets_next_rec;
-				offsets_next_rec = offsets_tmp;
-			}
-		}
-
-		if (n_cols == dict_index_get_n_unique_in_tree(index)
-		    && page_has_siblings(page)) {
-
-			/* If there is more than one leaf page in the tree,
-			we add one because we know that the first record
-			on the page certainly had a different prefix than the
-			last record on the previous index page in the
-			alphabetical order. Before this fix, if there was
-			just one big record on each clustered index page, the
-			algorithm grossly underestimated the number of rows
-			in the table. */
-
-			n_diff[n_cols - 1]++;
-		}
-
-		mtr_commit(&mtr);
-	}
-
-exit_loop:
-	/* If we saw k borders between different key values on
-	n_sample_pages leaf pages, we can estimate how many
-	there will be in index->stat_n_leaf_pages */
-
-	/* We must take into account that our sample actually represents
-	also the pages used for external storage of fields (those pages are
-	included in index->stat_n_leaf_pages) */
-
-	result.reserve(n_cols);
-
-	for (j = 0; j < n_cols; j++) {
-		index_field_stats_t stat;
-
-		stat.n_diff_key_vals
-			= BTR_TABLE_STATS_FROM_SAMPLE(
-				n_diff[j], index, n_sample_pages,
-				total_external_size, not_empty_flag);
-
-		/* If the tree is small, smaller than
-		10 * n_sample_pages + total_external_size, then
-		the above estimate is ok. For bigger trees it is common that we
-		do not see any borders between key values in the few pages
-		we pick. But still there may be n_sample_pages
-		different key values, or even more. Let us try to approximate
-		that: */
-
-		add_on = index->stat_n_leaf_pages
-			/ (10 * (n_sample_pages
-				 + total_external_size));
-
-		if (add_on > n_sample_pages) {
-			add_on = n_sample_pages;
-		}
-
-		stat.n_diff_key_vals += add_on;
-
-		stat.n_sample_sizes = n_sample_pages;
-
-		if (n_not_null != NULL) {
-			stat.n_non_null_key_vals =
-				 BTR_TABLE_STATS_FROM_SAMPLE(
-					n_not_null[j], index, n_sample_pages,
-					total_external_size, not_empty_flag);
-		}
-
-		result.push_back(stat);
-	}
-
-	mem_heap_free(heap);
-
-	return result;
-}
-
 /*================== EXTERNAL STORAGE OF BIG FIELDS ===================*/
 
 /***********************************************************//**
diff --git a/storage/innobase/dict/dict0stats.cc b/storage/innobase/dict/dict0stats.cc
index d7466ae5f8a..f92beac0f67 100644
--- a/storage/innobase/dict/dict0stats.cc
+++ b/storage/innobase/dict/dict0stats.cc
@@ -1024,6 +1024,367 @@ dict_stats_snapshot_free(
 	dict_stats_table_clone_free(t);
 }
 
+/** Statistics for one field of an index. */
+struct index_field_stats_t
+{
+  ib_uint64_t n_diff_key_vals;
+  ib_uint64_t n_sample_sizes;
+  ib_uint64_t n_non_null_key_vals;
+
+  index_field_stats_t(ib_uint64_t n_diff_key_vals= 0,
+                      ib_uint64_t n_sample_sizes= 0,
+                      ib_uint64_t n_non_null_key_vals= 0)
+      : n_diff_key_vals(n_diff_key_vals), n_sample_sizes(n_sample_sizes),
+        n_non_null_key_vals(n_non_null_key_vals)
+  {
+  }
+};
+
+/*******************************************************************//**
+Record the number of non_null key values in a given index for
+each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
+The estimates are eventually stored in the array:
+index->stat_n_non_null_key_vals[], which is indexed from 0 to n-1. */
+static
+void
+btr_record_not_null_field_in_rec(
+/*=============================*/
+	ulint		n_unique,	/*!< in: dict_index_get_n_unique(index),
+					number of columns uniquely determine
+					an index entry */
+	const rec_offs*	offsets,	/*!< in: rec_get_offsets(rec, index),
+					its size could be for all fields or
+					that of "n_unique" */
+	ib_uint64_t*	n_not_null)	/*!< in/out: array to record number of
+					not null rows for n-column prefix */
+{
+	ulint	i;
+
+	ut_ad(rec_offs_n_fields(offsets) >= n_unique);
+
+	if (n_not_null == NULL) {
+		return;
+	}
+
+	for (i = 0; i < n_unique; i++) {
+		if (rec_offs_nth_sql_null(offsets, i)) {
+			break;
+		}
+
+		n_not_null[i]++;
+	}
+}
+
+/** Estimated table level stats from sampled value.
+@param value sampled stats
+@param index index being sampled
+@param sample number of sampled rows
+@param ext_size external stored data size
+@param not_empty table not empty
+@return estimated table wide stats from sampled value */
+#define BTR_TABLE_STATS_FROM_SAMPLE(value, index, sample, ext_size, not_empty) \
+	(((value) * static_cast<ib_uint64_t>(index->stat_n_leaf_pages) \
+	  + (sample) - 1 + (ext_size) + (not_empty)) / ((sample) + (ext_size)))
+
+/** Estimates the number of different key values in a given index, for
+each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
+The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
+0..n_uniq-1) and the number of pages that were sampled is saved in
+result.n_sample_sizes[].
+If innodb_stats_method is nulls_ignored, we also record the number of
+non-null values for each prefix and stored the estimates in
+array result.n_non_null_key_vals.
+@param index          B-tree index
+@param bulk_trx_id    the value of index->table->bulk_trx_id at the start
+@return vector with statistics information
+empty vector if the index is unavailable. */
+static
+std::vector<index_field_stats_t>
+btr_estimate_number_of_different_key_vals(dict_index_t* index,
+					  trx_id_t bulk_trx_id)
+{
+	btr_cur_t	cursor;
+	page_t*		page;
+	rec_t*		rec;
+	ulint		n_cols;
+	ib_uint64_t*	n_diff;
+	ib_uint64_t*	n_not_null;
+	ibool		stats_null_not_equal;
+	uintmax_t	n_sample_pages=1; /* number of pages to sample */
+	ulint		not_empty_flag	= 0;
+	ulint		total_external_size = 0;
+	ulint		i;
+	ulint		j;
+	uintmax_t	add_on;
+	mtr_t		mtr;
+	mem_heap_t*	heap		= NULL;
+	rec_offs*	offsets_rec	= NULL;
+	rec_offs*	offsets_next_rec = NULL;
+
+	std::vector<index_field_stats_t> result;
+
+	ut_ad(!index->is_spatial());
+
+	n_cols = dict_index_get_n_unique(index);
+
+	heap = mem_heap_create((sizeof *n_diff + sizeof *n_not_null)
+			       * n_cols
+			       + dict_index_get_n_fields(index)
+			       * (sizeof *offsets_rec
+				  + sizeof *offsets_next_rec));
+
+	n_diff = (ib_uint64_t*) mem_heap_zalloc(
+		heap, n_cols * sizeof(n_diff[0]));
+
+	n_not_null = NULL;
+
+	/* Check srv_innodb_stats_method setting, and decide whether we
+	need to record non-null value and also decide if NULL is
+	considered equal (by setting stats_null_not_equal value) */
+	switch (srv_innodb_stats_method) {
+	case SRV_STATS_NULLS_IGNORED:
+		n_not_null = (ib_uint64_t*) mem_heap_zalloc(
+			heap, n_cols * sizeof *n_not_null);
+		/* fall through */
+
+	case SRV_STATS_NULLS_UNEQUAL:
+		/* for both SRV_STATS_NULLS_IGNORED and SRV_STATS_NULLS_UNEQUAL
+		case, we will treat NULLs as unequal value */
+		stats_null_not_equal = TRUE;
+		break;
+
+	case SRV_STATS_NULLS_EQUAL:
+		stats_null_not_equal = FALSE;
+		break;
+
+	default:
+		ut_error;
+	}
+
+	if (srv_stats_sample_traditional) {
+		/* It makes no sense to test more pages than are contained
+		in the index, thus we lower the number if it is too high */
+		if (srv_stats_transient_sample_pages > index->stat_index_size) {
+			if (index->stat_index_size > 0) {
+				n_sample_pages = index->stat_index_size;
+			}
+		} else {
+			n_sample_pages = srv_stats_transient_sample_pages;
+		}
+	} else {
+		/* New logaritmic number of pages that are estimated.
+		Number of pages estimated should be between 1 and
+		index->stat_index_size.
+
+		If we have only 0 or 1 index pages then we can only take 1
+		sample. We have already initialized n_sample_pages to 1.
+
+		So taking index size as I and sample as S and log(I)*S as L
+
+		requirement 1) we want the out limit of the expression to not exceed I;
+		requirement 2) we want the ideal pages to be at least S;
+		so the current expression is min(I, max( min(S,I), L)
+
+		looking for simplifications:
+
+		case 1: assume S < I
+		min(I, max( min(S,I), L) -> min(I , max( S, L))
+
+		but since L=LOG2(I)*S and log2(I) >=1   L>S always so max(S,L) = L.
+
+		so we have: min(I , L)
+
+		case 2: assume I < S
+		    min(I, max( min(S,I), L) -> min(I, max( I, L))
+
+		case 2a: L > I
+		    min(I, max( I, L)) -> min(I, L) -> I
+
+		case 2b: when L < I
+		    min(I, max( I, L))  ->  min(I, I ) -> I
+
+		so taking all case2 paths is I, our expression is:
+		n_pages = S < I? min(I,L) : I
+                */
+		if (index->stat_index_size > 1) {
+			n_sample_pages = (srv_stats_transient_sample_pages < index->stat_index_size)
+				? ut_min(index->stat_index_size,
+					 static_cast<ulint>(
+						 log2(double(index->stat_index_size))
+						 * double(srv_stats_transient_sample_pages)))
+				: index->stat_index_size;
+		}
+	}
+
+	/* Sanity check */
+	ut_ad(n_sample_pages > 0 && n_sample_pages <= (index->stat_index_size <= 1 ? 1 : index->stat_index_size));
+
+	/* We sample some pages in the index to get an estimate */
+
+	for (i = 0; i < n_sample_pages; i++) {
+		mtr.start();
+
+		bool	available;
+
+		available = btr_cur_open_at_rnd_pos(index, BTR_SEARCH_LEAF,
+						    &cursor, &mtr);
+
+		if (!available || index->table->bulk_trx_id != bulk_trx_id) {
+			mtr.commit();
+			mem_heap_free(heap);
+
+			return result;
+		}
+
+		/* Count the number of different key values for each prefix of
+		the key on this index page. If the prefix does not determine
+		the index record uniquely in the B-tree, then we subtract one
+		because otherwise our algorithm would give a wrong estimate
+		for an index where there is just one key value. */
+
+		if (!index->is_readable()) {
+			mtr.commit();
+			goto exit_loop;
+		}
+
+		page = btr_cur_get_page(&cursor);
+
+		rec = page_rec_get_next(page_get_infimum_rec(page));
+		const ulint n_core = page_is_leaf(page)
+			? index->n_core_fields : 0;
+
+		if (!page_rec_is_supremum(rec)) {
+			not_empty_flag = 1;
+			offsets_rec = rec_get_offsets(rec, index, offsets_rec,
+						      n_core,
+						      ULINT_UNDEFINED, &heap);
+
+			if (n_not_null != NULL) {
+				btr_record_not_null_field_in_rec(
+					n_cols, offsets_rec, n_not_null);
+			}
+		}
+
+		while (!page_rec_is_supremum(rec)) {
+			ulint	matched_fields;
+			rec_t*	next_rec = page_rec_get_next(rec);
+			if (page_rec_is_supremum(next_rec)) {
+				total_external_size +=
+					btr_rec_get_externally_stored_len(
+						rec, offsets_rec);
+				break;
+			}
+
+			offsets_next_rec = rec_get_offsets(next_rec, index,
+							   offsets_next_rec,
+							   n_core,
+							   ULINT_UNDEFINED,
+							   &heap);
+
+			cmp_rec_rec(rec, next_rec,
+				    offsets_rec, offsets_next_rec,
+				    index, stats_null_not_equal,
+				    &matched_fields);
+
+			for (j = matched_fields; j < n_cols; j++) {
+				/* We add one if this index record has
+				a different prefix from the previous */
+
+				n_diff[j]++;
+			}
+
+			if (n_not_null != NULL) {
+				btr_record_not_null_field_in_rec(
+					n_cols, offsets_next_rec, n_not_null);
+			}
+
+			total_external_size
+				+= btr_rec_get_externally_stored_len(
+					rec, offsets_rec);
+
+			rec = next_rec;
+			/* Initialize offsets_rec for the next round
+			and assign the old offsets_rec buffer to
+			offsets_next_rec. */
+			{
+				rec_offs* offsets_tmp = offsets_rec;
+				offsets_rec = offsets_next_rec;
+				offsets_next_rec = offsets_tmp;
+			}
+		}
+
+		if (n_cols == dict_index_get_n_unique_in_tree(index)
+		    && page_has_siblings(page)) {
+
+			/* If there is more than one leaf page in the tree,
+			we add one because we know that the first record
+			on the page certainly had a different prefix than the
+			last record on the previous index page in the
+			alphabetical order. Before this fix, if there was
+			just one big record on each clustered index page, the
+			algorithm grossly underestimated the number of rows
+			in the table. */
+
+			n_diff[n_cols - 1]++;
+		}
+
+		mtr.commit();
+	}
+
+exit_loop:
+	/* If we saw k borders between different key values on
+	n_sample_pages leaf pages, we can estimate how many
+	there will be in index->stat_n_leaf_pages */
+
+	/* We must take into account that our sample actually represents
+	also the pages used for external storage of fields (those pages are
+	included in index->stat_n_leaf_pages) */
+
+	result.reserve(n_cols);
+
+	for (j = 0; j < n_cols; j++) {
+		index_field_stats_t stat;
+
+		stat.n_diff_key_vals
+			= BTR_TABLE_STATS_FROM_SAMPLE(
+				n_diff[j], index, n_sample_pages,
+				total_external_size, not_empty_flag);
+
+		/* If the tree is small, smaller than
+		10 * n_sample_pages + total_external_size, then
+		the above estimate is ok. For bigger trees it is common that we
+		do not see any borders between key values in the few pages
+		we pick. But still there may be n_sample_pages
+		different key values, or even more. Let us try to approximate
+		that: */
+
+		add_on = index->stat_n_leaf_pages
+			/ (10 * (n_sample_pages
+				 + total_external_size));
+
+		if (add_on > n_sample_pages) {
+			add_on = n_sample_pages;
+		}
+
+		stat.n_diff_key_vals += add_on;
+
+		stat.n_sample_sizes = n_sample_pages;
+
+		if (n_not_null != NULL) {
+			stat.n_non_null_key_vals =
+				 BTR_TABLE_STATS_FROM_SAMPLE(
+					n_not_null[j], index, n_sample_pages,
+					total_external_size, not_empty_flag);
+		}
+
+		result.push_back(stat);
+	}
+
+	mem_heap_free(heap);
+
+	return result;
+}
+
 /*********************************************************************//**
 Calculates new estimates for index statistics. This function is
 relatively quick and is used to calculate transient statistics that
@@ -1054,39 +1415,48 @@ dummy_empty:
 	} else if (ibuf_debug && !dict_index_is_clust(index)) {
 		goto dummy_empty;
 #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
+	} else if (dict_index_is_online_ddl(index) || !index->is_committed()
+		   || !index->table->space) {
+		goto dummy_empty;
 	} else {
 		mtr_t	mtr;
-		ulint	size;
 
 		mtr.start();
 		mtr_s_lock_index(index, &mtr);
-		size = btr_get_size(index, BTR_TOTAL_SIZE, &mtr);
-
-		if (size != ULINT_UNDEFINED) {
-			index->stat_index_size = size;
+		buf_block_t* root = btr_root_block_get(index, RW_SX_LATCH,
+						       &mtr);
+		if (!root) {
+invalid:
+			mtr.commit();
+			goto dummy_empty;
+		}
 
-			size = btr_get_size(
-				index, BTR_N_LEAF_PAGES, &mtr);
+		const auto bulk_trx_id = index->table->bulk_trx_id;
+		if (bulk_trx_id && trx_sys.find(nullptr, bulk_trx_id, false)) {
+			goto invalid;
 		}
 
-		mtr.commit();
+		mtr.x_lock_space(index->table->space);
 
-		switch (size) {
-		case ULINT_UNDEFINED:
-			goto dummy_empty;
-		case 0:
-			/* The root node of the tree is a leaf */
-			size = 1;
-		}
+		ulint dummy, size;
+		index->stat_index_size
+			= fseg_n_reserved_pages(*root, PAGE_HEADER
+						+ PAGE_BTR_SEG_LEAF
+						+ root->frame, &size, &mtr)
+			+ fseg_n_reserved_pages(*root, PAGE_HEADER
+						+ PAGE_BTR_SEG_TOP
+						+ root->frame, &dummy, &mtr);
 
-		index->stat_n_leaf_pages = size;
+		mtr.commit();
+
+		index->stat_n_leaf_pages = size ? size : 1;
 
 		/* Do not continue if table decryption has failed or
 		table is already marked as corrupted. */
 		if (index->is_readable()) {
 			std::vector<index_field_stats_t> stats
 				= btr_estimate_number_of_different_key_vals(
-					index);
+					index, bulk_trx_id);
 
 			if (!stats.empty()) {
 				index->table->stats_mutex_lock();
@@ -2037,7 +2407,7 @@ struct index_stats_t
   {
     stats.reserve(n_uniq);
     for (ulint i= 0; i < n_uniq; ++i)
-      stats.push_back(index_field_stats_t(0, 1, 0));
+      stats.push_back(index_field_stats_t{0, 1, 0});
   }
 };
 
@@ -2123,15 +2493,12 @@ stat_n_leaf_pages. This function can be slow.
 @return index stats */
 static index_stats_t dict_stats_analyze_index(dict_index_t* index)
 {
-	ulint		root_level;
-	ulint		level;
 	bool		level_is_analyzed;
 	ulint		n_uniq;
 	ulint		n_prefix;
 	ib_uint64_t	total_recs;
 	ib_uint64_t	total_pages;
 	mtr_t		mtr;
-	ulint		size;
 	index_stats_t	result(index->n_uniq);
 	DBUG_ENTER("dict_stats_analyze_index");
 
@@ -2150,30 +2517,43 @@ static index_stats_t dict_stats_analyze_index(dict_index_t* index)
 
 	mtr.start();
 	mtr_s_lock_index(index, &mtr);
-	size = btr_get_size(index, BTR_TOTAL_SIZE, &mtr);
+	uint16_t root_level;
+
+	{
+		buf_block_t* root;
+		root = btr_root_block_get(index, RW_SX_LATCH, &mtr);
+		if (!root) {
+empty_index:
+			mtr.commit();
+			dict_stats_assert_initialized_index(index);
+			DBUG_RETURN(result);
+		}
 
-	if (size != ULINT_UNDEFINED) {
-		result.index_size = size;
-		size = btr_get_size(index, BTR_N_LEAF_PAGES, &mtr);
+		root_level = btr_page_get_level(root->frame);
+
+		mtr.x_lock_space(index->table->space);
+		ulint dummy, size;
+		result.index_size
+			= fseg_n_reserved_pages(*root, PAGE_HEADER
+						+ PAGE_BTR_SEG_LEAF
+						+ root->frame, &size, &mtr)
+			+ fseg_n_reserved_pages(*root, PAGE_HEADER
+						+ PAGE_BTR_SEG_TOP
+						+ root->frame, &dummy, &mtr);
+		result.n_leaf_pages = size ? size : 1;
 	}
 
-	/* Release the X locks on the root page taken by btr_get_size() */
-	mtr.commit();
-
-	switch (size) {
-	case ULINT_UNDEFINED:
-		dict_stats_assert_initialized_index(index);
-		DBUG_RETURN(result);
-	case 0:
-		/* The root node of the tree is a leaf */
-		size = 1;
+	const auto bulk_trx_id = index->table->bulk_trx_id;
+	if (bulk_trx_id && trx_sys.find(nullptr, bulk_trx_id, false)) {
+		result.index_size = 1;
+		result.n_leaf_pages = 1;
+		goto empty_index;
 	}
 
-	result.n_leaf_pages = size;
+	mtr.commit();
 
 	mtr.start();
 	mtr_sx_lock_index(index, &mtr);
-	root_level = btr_height_get(index, &mtr);
 
 	n_uniq = dict_index_get_n_unique(index);
 
@@ -2251,7 +2631,7 @@ static index_stats_t dict_stats_analyze_index(dict_index_t* index)
 	So if we find that the first level containing D distinct
 	keys (on n_prefix columns) is L, we continue from L when
 	searching for D distinct keys on n_prefix-1 columns. */
-	level = root_level;
+	auto level = root_level;
 	level_is_analyzed = false;
 
 	for (n_prefix = n_uniq; n_prefix >= 1; n_prefix--) {
@@ -2265,7 +2645,10 @@ static index_stats_t dict_stats_analyze_index(dict_index_t* index)
 		mtr.commit();
 		mtr.start();
 		mtr_sx_lock_index(index, &mtr);
-		if (root_level != btr_height_get(index, &mtr)) {
+		buf_block_t *root = btr_root_block_get(index, RW_S_LATCH,
+						       &mtr);
+		if (!root || root_level != btr_page_get_level(root->frame)
+		    || index->table->bulk_trx_id != bulk_trx_id) {
 			/* Just quit if the tree has changed beyond
 			recognition here. The old stats from previous
 			runs will remain in the values that we have
@@ -2277,6 +2660,8 @@ static index_stats_t dict_stats_analyze_index(dict_index_t* index)
 			break;
 		}
 
+		mtr.memo_release(root, MTR_MEMO_PAGE_S_FIX);
+
 		/* check whether we should pick the current level;
 		we pick level 1 even if it does not have enough
 		distinct records because we do not want to scan the
diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h
index 4543fc21e8a..40293df21e5 100644
--- a/storage/innobase/include/btr0btr.h
+++ b/storage/innobase/include/btr0btr.h
@@ -196,18 +196,6 @@ btr_root_adjust_on_import(
 	const dict_index_t*	index)	/*!< in: index tree */
 	MY_ATTRIBUTE((warn_unused_result));
 
-/**************************************************************//**
-Gets the height of the B-tree (the level of the root, when the leaf
-level is assumed to be 0). The caller must hold an S or X latch on
-the index.
-@return tree height (level of the root) */
-ulint
-btr_height_get(
-/*===========*/
-	const dict_index_t*	index,	/*!< in: index tree */
-	mtr_t*		mtr)	/*!< in/out: mini-transaction */
-	MY_ATTRIBUTE((warn_unused_result));
-
 /** Get an index page and declare its latching order level.
 @param[in]	index	index tree
 @param[in]	page	page number
@@ -537,17 +525,6 @@ btr_discard_page(
 	btr_cur_t*	cursor,	/*!< in: cursor on the page to discard: not on
 				the root page */
 	mtr_t*		mtr);	/*!< in: mtr */
-/**************************************************************//**
-Gets the number of pages in a B-tree.
-@return number of pages, or ULINT_UNDEFINED if the index is unavailable */
-ulint
-btr_get_size(
-/*=========*/
-	const dict_index_t*	index,	/*!< in: index */
-	ulint		flag,	/*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
-	mtr_t*		mtr)	/*!< in/out: mini-transaction where index
-				is s-latched */
-	MY_ATTRIBUTE((warn_unused_result));
 
 /**************************************************************//**
 Allocates a new file page to be used in an index tree. NOTE: we assume
@@ -614,7 +591,6 @@ btr_root_block_get(
 	rw_lock_type_t		mode,	/*!< in: either RW_S_LATCH
 					or RW_X_LATCH */
 	mtr_t*			mtr);	/*!< in: mtr */
-
 /*************************************************************//**
 Reorganizes an index page.
 
diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h
index c7f25aff4b7..911c79c29ba 100644
--- a/storage/innobase/include/btr0cur.h
+++ b/storage/innobase/include/btr0cur.h
@@ -575,37 +575,6 @@ btr_estimate_n_rows_in_range(
         btr_pos_t*      range_start,
         btr_pos_t*      range_end);
 
-
-/** Statistics for one field of an index. */
-struct index_field_stats_t
-{
-  ib_uint64_t n_diff_key_vals;
-  ib_uint64_t n_sample_sizes;
-  ib_uint64_t n_non_null_key_vals;
-
-  index_field_stats_t(ib_uint64_t n_diff_key_vals= 0,
-                      ib_uint64_t n_sample_sizes= 0,
-                      ib_uint64_t n_non_null_key_vals= 0)
-      : n_diff_key_vals(n_diff_key_vals), n_sample_sizes(n_sample_sizes),
-        n_non_null_key_vals(n_non_null_key_vals)
-  {
-  }
-};
-
-/** Estimates the number of different key values in a given index, for
-each n-column prefix of the index where 1 <= n <= dict_index_get_n_unique(index).
-The estimates are stored in the array index->stat_n_diff_key_vals[] (indexed
-0..n_uniq-1) and the number of pages that were sampled is saved in
-index->stat_n_sample_sizes[].
-If innodb_stats_method is nulls_ignored, we also record the number of
-non-null values for each prefix and stored the estimates in
-array index->stat_n_non_null_key_vals.
-@param[in]	index	index
-@return stat vector if the index is available and we get the estimated numbers,
-empty vector if the index is unavailable. */
-std::vector<index_field_stats_t>
-btr_estimate_number_of_different_key_vals(dict_index_t* index);
-
 /** Gets the externally stored size of a record, in units of a database page.
 @param[in]	rec	record
 @param[in]	offsets	array returned by rec_get_offsets()
author	Marko Mäkelä <marko.makela@mariadb.com>	2021-09-17 09:14:20 +0300
committer	Marko Mäkelä <marko.makela@mariadb.com>	2021-09-17 14:40:41 +0300
commit	1e9c922fa726b22f4522f2a4de0fcb6595404086 (patch)
tree	38b86f6a6218af0142e45d87629e01305997cc95
parent	1a4a7dddb6505352a7c3b4a2e1583ed92cad1dcf (diff)
download	mariadb-git-1e9c922fa726b22f4522f2a4de0fcb6595404086.tar.gz