path: root/sql/ha_partition.cc
author     Mattias Jonsson <mattias.jonsson@oracle.com>  2012-02-22 23:13:36 +0100
committer  Mattias Jonsson <mattias.jonsson@oracle.com>  2012-02-22 23:13:36 +0100
commit     42282c10ce0e2c808d1818e77a6a72eadc3a560d (patch)
tree       0c7483648c83a2186a53487994a5de5ffcccd422 /sql/ha_partition.cc
parent     4af454634809b9a87af30022ea0edd40d72d0b4b (diff)
download   mariadb-git-42282c10ce0e2c808d1818e77a6a72eadc3a560d.tar.gz
Bug#13694811: THE OPTIMIZER WRONGLY USES THE FIRST INNODB
PARTITION STATISTICS

Problem was the fix for bug#11756867: it always used the first
partitions, and stopped after it had checked 10 [sub]partitions
(or until it found a partition which would contain a match).
This results in bad statistics for tables where the first 10
partitions do not represent the majority of the data, e.g. when
the first 10 partitions only contain a few rows in total.

The solution is to take the statistics from the partitions
containing the most rows instead:

Added an array of partition ids, sorted by number of records in
descending order. This array is used in records_in_range() to
cover as many records as possible in as few calls as possible.

Also changed the limit on how many partitions to use for the
statistics from a static maximum of 10 partitions into a dynamic
model: the maximum number of partitions is now
log2(total number of partitions), taken from the ordered array.
It will continue calling the partitions' records_in_range() until
it has checked at least:
(total rows in matching partitions) * (maximum number of partitions)
/ (number of used partitions)
rows.

Also reverted the changes to ha_partition::scan_time() and
ha_partition::estimate_rows_upper_bound() to their state before
the fix of bug#11756867, since they are not as slow as
records_in_range().
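As a reading aid, here is a minimal standalone C++ sketch of the dynamic
model described above (hypothetical names and simplified types; the
authoritative implementation is ha_partition::min_rows_for_estimate() in
the diff below):

#include <cstdint>

/*
  Sketch: cap the number of sampled partitions at roughly
  log2(total_parts), then turn that cap into a row threshold.
  records_in_range() keeps sampling partitions, biggest first,
  until at least this many rows have been covered.
*/
static uint64_t min_rows_to_check(uint32_t total_parts,
                                  uint32_t used_parts,
                                  uint64_t rows_in_used_parts)
{
  uint32_t max_parts= 1;
  for (uint32_t i= 2; i < total_parts; i <<= 1)
    max_parts++;                     /* max_parts ~ log2(total_parts) */
  if (max_parts > used_parts)
    max_parts= used_parts;           /* cannot sample more than is used */
  return rows_in_used_parts * max_parts / used_parts;
}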
Diffstat (limited to 'sql/ha_partition.cc')
-rw-r--r--  sql/ha_partition.cc  300
1 file changed, 195 insertions, 105 deletions
diff --git a/sql/ha_partition.cc b/sql/ha_partition.cc
index 7c7cf5a4302..ddfe271f6a3 100644
--- a/sql/ha_partition.cc
+++ b/sql/ha_partition.cc
@@ -286,6 +286,7 @@ void ha_partition::init_handler_variables()
m_is_sub_partitioned= 0;
m_is_clone_of= NULL;
m_clone_mem_root= NULL;
+ m_part_ids_sorted_by_num_of_records= NULL;
#ifdef DONT_HAVE_TO_BE_INITALIZED
m_start_key.flag= 0;
@@ -321,6 +322,8 @@ ha_partition::~ha_partition()
delete m_file[i];
}
my_free((char*) m_ordered_rec_buffer, MYF(MY_ALLOW_ZERO_PTR));
+ my_free((char*) m_part_ids_sorted_by_num_of_records,
+ MYF(MY_ALLOW_ZERO_PTR));
clear_handler_file();
DBUG_VOID_RETURN;
@@ -2638,6 +2641,16 @@ int ha_partition::open(const char *name, int mode, uint test_if_locked)
m_start_key.key= (const uchar*)ptr;
}
}
+ if (!m_part_ids_sorted_by_num_of_records)
+ {
+ if (!(m_part_ids_sorted_by_num_of_records=
+ (uint32*) my_malloc(m_tot_parts * sizeof(uint32), MYF(MY_WME))))
+ DBUG_RETURN(error);
+ uint32 i;
+ /* Initialize it with all partition ids. */
+ for (i= 0; i < m_tot_parts; i++)
+ m_part_ids_sorted_by_num_of_records[i]= i;
+ }
/* Initialize the bitmap we use to minimize ha_start_bulk_insert calls */
if (bitmap_init(&m_bulk_insert_started, NULL, m_tot_parts + 1, FALSE))
@@ -5146,6 +5159,24 @@ int ha_partition::handle_ordered_prev(uchar *buf)
and read_time calls
*/
+/**
+ Helper function for sorting according to number of rows in descending order.
+*/
+
+int ha_partition::compare_number_of_records(ha_partition *me,
+ const uint32 *a,
+ const uint32 *b)
+{
+ handler **file= me->m_file;
+ /* Note: sorting in descending order! */
+ if (file[*a]->stats.records > file[*b]->stats.records)
+ return -1;
+ if (file[*a]->stats.records < file[*b]->stats.records)
+ return 1;
+ return 0;
+}
+
+
/*
General method to gather info from handler
@@ -5387,6 +5418,15 @@ int ha_partition::info(uint flag)
}
i++;
} while (*(++file_array));
+ /*
+ Sort the array of part_ids by number of records in
+ descending order.
+ */
+ my_qsort2((void*) m_part_ids_sorted_by_num_of_records,
+ m_tot_parts,
+ sizeof(uint32),
+ (qsort2_cmp) compare_number_of_records,
+ this);
file= m_file[handler_instance];
file->info(HA_STATUS_CONST);
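For illustration only: my_qsort2() is the server's reentrant qsort
variant, which forwards the extra argument (here `this`) to the
comparator. The same descending order can be sketched with std::sort,
where the records vector is a hypothetical stand-in for each handler's
stats.records:

#include <algorithm>
#include <cstdint>
#include <vector>

/* Sketch: order partition ids so the partition with the most rows
   comes first, mirroring compare_number_of_records() above. */
static void sort_parts_by_records_desc(std::vector<uint32_t> &part_ids,
                                       const std::vector<uint64_t> &records)
{
  std::sort(part_ids.begin(), part_ids.end(),
            [&](uint32_t a, uint32_t b) { return records[a] > records[b]; });
}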
@@ -6124,116 +6164,113 @@ const key_map *ha_partition::keys_to_use_for_scanning()
DBUG_RETURN(m_file[0]->keys_to_use_for_scanning());
}
-#define MAX_PARTS_FOR_OPTIMIZER_CALLS 10
-/*
- Prepare start variables for estimating optimizer costs.
-
- @param[out] num_used_parts Number of partitions after pruning.
- @param[out] check_min_num Number of partitions to call.
- @param[out] first first used partition.
+/**
+ Minimum number of rows to base optimizer estimate on.
*/
-void ha_partition::partitions_optimizer_call_preparations(uint *first,
- uint *num_used_parts,
- uint *check_min_num)
+
+ha_rows ha_partition::min_rows_for_estimate()
{
- *first= bitmap_get_first_set(&(m_part_info->used_partitions));
- *num_used_parts= bitmap_bits_set(&(m_part_info->used_partitions));
- *check_min_num= min(MAX_PARTS_FOR_OPTIMIZER_CALLS, *num_used_parts);
+ uint i, max_used_partitions, tot_used_partitions;
+ DBUG_ENTER("ha_partition::min_rows_for_estimate");
+
+ tot_used_partitions= bitmap_bits_set(&m_part_info->used_partitions);
+ DBUG_ASSERT(tot_used_partitions);
+
+ /*
+ Allow O(log2(tot_partitions)) increase in number of used partitions.
+ This gives O(1/log2(tot_partitions)) of rows to base the estimate on.
+ I.e. when the total number of partitions doubles, allow one more
+ partition to be checked.
+ */
+ i= 2;
+ max_used_partitions= 1;
+ while (i < m_tot_parts)
+ {
+ max_used_partitions++;
+ i= i << 1;
+ }
+ if (max_used_partitions > tot_used_partitions)
+ max_used_partitions= tot_used_partitions;
+
+ /* stats.records is already updated by the info(HA_STATUS_VARIABLE) call. */
+ DBUG_PRINT("info", ("max_used_partitions: %u tot_rows: %lu",
+ max_used_partitions,
+ (ulong) stats.records));
+ DBUG_PRINT("info", ("tot_used_partitions: %u min_rows_to_check: %lu",
+ tot_used_partitions,
+ (ulong) stats.records * max_used_partitions
+ / tot_used_partitions));
+ DBUG_RETURN(stats.records * max_used_partitions / tot_used_partitions);
}
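A worked example with invented numbers: for a table with 64 partitions
the loop above yields max_used_partitions = 6 (starting at 1 and
incrementing once per doubling: 2, 4, 8, 16, 32). If all 64 partitions
are used and stats.records is 1,000,000, the function returns
1,000,000 * 6 / 64 = 93,750, i.e. records_in_range() will keep sampling
partitions, biggest first, until at least 93,750 rows have been covered.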
-/*
- Return time for a scan of the table
+/**
+ Get the biggest used partition.
- SYNOPSIS
- scan_time()
+ Starting at the N:th biggest partition, skips all non-used
+ partitions and returns the biggest used partition found.
- RETURN VALUE
- time for scan
+ @param[in,out] part_index Skip the *part_index biggest partitions
+
+ @return The biggest used partition with index not lower than *part_index.
+ @retval NO_CURRENT_PART_ID No more used partitions.
+ @retval != NO_CURRENT_PART_ID partition id of biggest used partition with
+ index >= *part_index supplied. Note that
+ *part_index will be updated to the next
+ partition index to use.
*/
-double ha_partition::scan_time()
+uint ha_partition::get_biggest_used_partition(uint *part_index)
{
- double scan_time= 0.0;
- uint first, part_id, num_used_parts, check_min_num, partitions_called= 0;
- DBUG_ENTER("ha_partition::scan_time");
-
- partitions_optimizer_call_preparations(&first, &num_used_parts, &check_min_num);
- for (part_id= first; partitions_called < num_used_parts ; part_id++)
+ uint part_id;
+ while ((*part_index) < m_tot_parts)
{
- if (!bitmap_is_set(&(m_part_info->used_partitions), part_id))
- continue;
- scan_time+= m_file[part_id]->scan_time();
- partitions_called++;
- if (partitions_called >= check_min_num && scan_time != 0.0)
- {
- DBUG_RETURN(scan_time *
- (double) num_used_parts / (double) partitions_called);
- }
+ part_id= m_part_ids_sorted_by_num_of_records[(*part_index)++];
+ if (bitmap_is_set(&m_part_info->used_partitions, part_id))
+ return part_id;
}
- DBUG_RETURN(scan_time);
+ return NO_CURRENT_PART_ID;
}
/*
- Estimate rows for records_in_range or estimate_rows_upper_bound.
+ Return time for a scan of the table
- @param is_records_in_range call records_in_range instead of
- estimate_rows_upper_bound.
- @param inx (only for records_in_range) index to use.
- @param min_key (only for records_in_range) start of range.
- @param max_key (only for records_in_range) end of range.
+ SYNOPSIS
+ scan_time()
- @return Number of rows or HA_POS_ERROR.
+ RETURN VALUE
+ time for scan
*/
-ha_rows ha_partition::estimate_rows(bool is_records_in_range, uint inx,
- key_range *min_key, key_range *max_key)
+
+double ha_partition::scan_time()
{
- ha_rows rows, estimated_rows= 0;
- uint first, part_id, num_used_parts, check_min_num, partitions_called= 0;
- DBUG_ENTER("ha_partition::records_in_range");
+ double scan_time= 0;
+ handler **file;
+ DBUG_ENTER("ha_partition::scan_time");
- partitions_optimizer_call_preparations(&first, &num_used_parts, &check_min_num);
- for (part_id= first; partitions_called < num_used_parts ; part_id++)
- {
- if (!bitmap_is_set(&(m_part_info->used_partitions), part_id))
- continue;
- if (is_records_in_range)
- rows= m_file[part_id]->records_in_range(inx, min_key, max_key);
- else
- rows= m_file[part_id]->estimate_rows_upper_bound();
- if (rows == HA_POS_ERROR)
- DBUG_RETURN(HA_POS_ERROR);
- estimated_rows+= rows;
- partitions_called++;
- if (partitions_called >= check_min_num && estimated_rows)
- {
- DBUG_RETURN(estimated_rows * num_used_parts / partitions_called);
- }
- }
- DBUG_RETURN(estimated_rows);
+ for (file= m_file; *file; file++)
+ if (bitmap_is_set(&(m_part_info->used_partitions), (file - m_file)))
+ scan_time+= (*file)->scan_time();
+ DBUG_RETURN(scan_time);
}
-/*
- Find number of records in a range
-
- SYNOPSIS
- records_in_range()
- inx Index number
- min_key Start of range
- max_key End of range
+/**
+ Find number of records in a range.
+ @param inx Index number
+ @param min_key Start of range
+ @param max_key End of range
- RETURN VALUE
- Number of rows in range
+ @return Number of rows in range.
- DESCRIPTION
- Given a starting key, and an ending key estimate the number of rows that
- will exist between the two. end_key may be empty which in case determine
- if start_key matches any rows.
+ Given a starting key and an ending key, estimate the number of rows that
+ will exist between the two. end_key may be empty, in which case determine
+ if start_key matches any rows.
- Called from opt_range.cc by check_quick_keys().
+ Called from opt_range.cc by check_quick_keys().
+ @note
monty: MUST be called for each range and added.
Note that MySQL will assume that if this returns 0 there is no
matching rows for the range!
@@ -6242,27 +6279,80 @@ ha_rows ha_partition::estimate_rows(bool is_records_in_range, uint inx,
ha_rows ha_partition::records_in_range(uint inx, key_range *min_key,
key_range *max_key)
{
+ ha_rows min_rows_to_check, rows, estimated_rows= 0, checked_rows= 0;
+ uint partition_index= 0, part_id;
DBUG_ENTER("ha_partition::records_in_range");
- DBUG_RETURN(estimate_rows(TRUE, inx, min_key, max_key));
-}
+ min_rows_to_check= min_rows_for_estimate();
+ while ((part_id= get_biggest_used_partition(&partition_index))
+ != NO_CURRENT_PART_ID)
+ {
+ rows= m_file[part_id]->records_in_range(inx, min_key, max_key);
+
+ DBUG_PRINT("info", ("part %u match %lu rows of %lu", part_id, (ulong) rows,
+ (ulong) m_file[part_id]->stats.records));
-/*
- Estimate upper bound of number of rows
+ if (rows == HA_POS_ERROR)
+ DBUG_RETURN(HA_POS_ERROR);
+ estimated_rows+= rows;
+ checked_rows+= m_file[part_id]->stats.records;
+ /*
+ Returning 0 means no rows can be found, so we must continue
+ this loop as long as we have estimated_rows == 0.
+ Also, many engines return 1 to indicate that there may exist
+ a matching row; we do not normalize this by dividing by the number
+ of used partitions, but leave it to be returned as a sum, which will
+ reflect that we will need to scan each partition's index.
+
+ Note that these statistics may not always be correct, so we must
+ continue even if the current partition has 0 rows, since we might have
+ deleted rows from the current partition, or inserted into the next
+ partition.
+ */
+ if (estimated_rows && checked_rows &&
+ checked_rows >= min_rows_to_check)
+ {
+ DBUG_PRINT("info",
+ ("records_in_range(inx %u): %lu (%lu * %lu / %lu)",
+ inx,
+ (ulong) (estimated_rows * stats.records / checked_rows),
+ (ulong) estimated_rows,
+ (ulong) stats.records,
+ (ulong) checked_rows));
+ DBUG_RETURN(estimated_rows * stats.records / checked_rows);
+ }
+ }
+ DBUG_PRINT("info", ("records_in_range(inx %u): %lu",
+ inx,
+ (ulong) estimated_rows));
+ DBUG_RETURN(estimated_rows);
+}
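Continuing the invented numbers from above: suppose the biggest used
partition holds checked_rows = 400,000 of stats.records = 1,000,000
(the total rows in the matching partitions) and reports
estimated_rows = 2,000 for the range. Since 400,000 exceeds
min_rows_to_check (93,750), the function returns
2,000 * 1,000,000 / 400,000 = 5,000, extrapolating the sampled match
density to all matching partitions.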
- SYNOPSIS
- estimate_rows_upper_bound()
- RETURN VALUE
- Number of rows
+/**
+ Estimate upper bound of number of rows.
+
+ @return Number of rows.
*/
ha_rows ha_partition::estimate_rows_upper_bound()
{
+ ha_rows rows, tot_rows= 0;
+ handler **file= m_file;
DBUG_ENTER("ha_partition::estimate_rows_upper_bound");
- DBUG_RETURN(estimate_rows(FALSE, 0, NULL, NULL));
+ do
+ {
+ if (bitmap_is_set(&(m_part_info->used_partitions), (file - m_file)))
+ {
+ rows= (*file)->estimate_rows_upper_bound();
+ if (rows == HA_POS_ERROR)
+ DBUG_RETURN(HA_POS_ERROR);
+ tot_rows+= rows;
+ }
+ } while (*(++file));
+ DBUG_RETURN(tot_rows);
}
@@ -6494,20 +6584,20 @@ int ha_partition::add_index(TABLE *table_arg, KEY *key_info, uint num_of_keys)
return ret;
err:
if (file > m_file)
- {
- uint *key_numbers= (uint*) ha_thd()->alloc(sizeof(uint) * num_of_keys);
- KEY *old_key_info= table_arg->key_info;
- uint i;
- /* Use the newly added key_info as table->key_info to remove them. */
- for (i= 0; i < num_of_keys; i++)
- key_numbers[i]= i;
- table_arg->key_info= key_info;
- while (--file >= m_file)
- {
- (void) (*file)->prepare_drop_index(table_arg, key_numbers, num_of_keys);
- (void) (*file)->final_drop_index(table_arg);
- }
- table_arg->key_info= old_key_info;
+ {
+ uint *key_numbers= (uint*) ha_thd()->alloc(sizeof(uint) * num_of_keys);
+ KEY *old_key_info= table_arg->key_info;
+ uint i;
+ /* Use the newly added key_info as table->key_info to remove them. */
+ for (i= 0; i < num_of_keys; i++)
+ key_numbers[i]= i;
+ table_arg->key_info= key_info;
+ while (--file >= m_file)
+ {
+ (void) (*file)->prepare_drop_index(table_arg, key_numbers, num_of_keys);
+ (void) (*file)->final_drop_index(table_arg);
+ }
+ table_arg->key_info= old_key_info;
}
return ret;
}