diff options
author | Monty <monty@mariadb.org> | 2022-10-04 12:59:43 +0300 |
---|---|---|
committer | Monty <monty@mariadb.org> | 2022-10-04 16:11:19 +0300 |
commit | a35f715f440eb9ce93e4245cb1e4e5bc6f6972a2 (patch) | |
tree | 1efc35fe4ccfa66afdd067483d9ceff74830d105 | |
parent | a5d799298aa88d78998bc964743964abc70b84bd (diff) | |
download | mariadb-git-a35f715f440eb9ce93e4245cb1e4e5bc6f6972a2.tar.gz |
Changed aggregate distinct optimization with indexes to be cost based.
Before the cost of an aggregate distinct (COUNT(DISTINCT ...)) was set
to 0 if the values were part of an index and the cost of grouping
was higher than the best cost so far. This was shown in explain with
"Using index for group-by (scanning)".
This patch fixes it by calculating the cost of aggregate distinct
and using scanning only if the cost was better than group-by-optimization.
Things taken into account:
- When using aggregate distinct on an index, the filtering is done before
the row is checked against the WHERE clause, so we thus have a lower WHERE cost.
- When comparing a cost from aggregate distinct, we add to the competing
plan the cost of doing the duplicate filtering later at the SQL level.
-rw-r--r-- | sql/opt_range.cc | 79 | ||||
-rw-r--r-- | sql/optimizer_costs.h | 2 |
2 files changed, 64 insertions, 17 deletions
diff --git a/sql/opt_range.cc b/sql/opt_range.cc index 1f6675b8c15..b5de9887e36 100644 --- a/sql/opt_range.cc +++ b/sql/opt_range.cc @@ -2461,20 +2461,21 @@ void TRP_INDEX_MERGE::trace_basic_info(PARAM *param, class TRP_GROUP_MIN_MAX : public TABLE_READ_PLAN { private: - bool have_min, have_max, have_agg_distinct; - KEY_PART_INFO *min_max_arg_part; uint group_prefix_len; uint used_key_parts; uint group_key_parts; - KEY *index_info; uint index; uint key_infix_len; + uint param_idx; /* Index of used key in param->key. */ uchar key_infix[MAX_KEY_LENGTH]; + KEY *index_info; + KEY_PART_INFO *min_max_arg_part; SEL_TREE *range_tree; /* Represents all range predicates in the query. */ SEL_ARG *index_tree; /* The SEL_ARG sub-tree corresponding to index_info. */ - uint param_idx; /* Index of used key in param->key. */ - bool is_index_scan; /* Use index_next() instead of random read */ + bool have_min, have_max; public: + bool have_agg_distinct; + bool is_index_scan; /* Use index_next() instead of random read */ /* Number of records selected by the ranges in index_tree. 
*/ ha_rows quick_prefix_records; public: @@ -2487,13 +2488,14 @@ public: uchar *key_infix_arg, SEL_TREE *tree_arg, SEL_ARG *index_tree_arg, uint param_idx_arg, ha_rows quick_prefix_records_arg) - : have_min(have_min_arg), have_max(have_max_arg), + : group_prefix_len(group_prefix_len_arg), used_key_parts(used_key_parts_arg), + group_key_parts(group_key_parts_arg), + index(index_arg), key_infix_len(key_infix_len_arg), param_idx(param_idx_arg), + index_info(index_info_arg),min_max_arg_part(min_max_arg_part_arg), + range_tree(tree_arg), index_tree(index_tree_arg), + have_min(have_min_arg), have_max(have_max_arg), have_agg_distinct(have_agg_distinct_arg), - min_max_arg_part(min_max_arg_part_arg), - group_prefix_len(group_prefix_len_arg), used_key_parts(used_key_parts_arg), - group_key_parts(group_key_parts_arg), index_info(index_info_arg), - index(index_arg), key_infix_len(key_infix_len_arg), range_tree(tree_arg), - index_tree(index_tree_arg), param_idx(param_idx_arg), is_index_scan(FALSE), + is_index_scan(FALSE), quick_prefix_records(quick_prefix_records_arg) { if (key_infix_len) @@ -3060,6 +3062,7 @@ int SQL_SELECT::test_quick_select(THD *thd, key_map keys_to_use, if (!only_single_index_range_scan) { TRP_GROUP_MIN_MAX *group_trp; + double duplicate_removal_cost= 0; if (tree) restore_nonrange_trees(¶m, tree, backup_keys); if ((group_trp= get_best_group_min_max(¶m, tree, read_time))) @@ -3078,9 +3081,31 @@ int SQL_SELECT::test_quick_select(THD *thd, key_map keys_to_use, if (unlikely(thd->trace_started())) group_trp->trace_basic_info(¶m, &grp_summary); - if (group_trp->read_cost < best_read_time || force_group_by) + if (group_trp->have_agg_distinct && group_trp->is_index_scan) + { + /* + We are optimization a distinct aggregate, like + SELECT count(DISTINCT a,b,c) FROM ... 
+ + The group cost includes removal of the distinct, so to be + able to compare costs, we add small cost to the original cost + that stands for the extra work we have to do on the outside of + the engine to do duplicate elimination for each output row if + we are not using the grouping code. + */ + duplicate_removal_cost= (DUPLICATE_REMOVAL_COST * + (best_trp ? best_trp->records : + table_records)); + } + if (group_trp->read_cost < best_read_time + duplicate_removal_cost || + force_group_by) { - grp_summary.add("chosen", true); + if (thd->trace_started()) + { + if (duplicate_removal_cost) + grp_summary.add("duplicate_removal_cost", duplicate_removal_cost); + grp_summary.add("chosen", true); + } best_trp= group_trp; } else @@ -14484,11 +14509,31 @@ get_best_group_min_max(PARAM *param, SEL_TREE *tree, double read_time) read_plan->read_cost= best_read_cost; read_plan->records= best_records; - if (read_time < best_read_cost && is_agg_distinct) + if (is_agg_distinct) { - trace_group.add("index_scan", true); - read_plan->read_cost= 0; - read_plan->use_index_scan(); + double best_cost, duplicate_removal_cost; + ulonglong records; + handler *file= table->file; + + /* Calculate cost of distinct scan on index */ + if (best_index_tree && read_plan->quick_prefix_records) + records= read_plan->quick_prefix_records; + else + records= table->stat_records(); + + best_cost= file->cost(file->ha_key_scan_time(index, records)); + /* We only have 'best_records' left after duplication elimination */ + best_cost+= best_records * WHERE_COST_THD(thd); + duplicate_removal_cost= (DUPLICATE_REMOVAL_COST * records); + + if (best_cost < read_plan->read_cost + duplicate_removal_cost) + { + read_plan->read_cost= best_cost; + read_plan->use_index_scan(); + trace_group. + add("scan_cost", best_cost). 
+ add("index_scan", true); + } } DBUG_PRINT("info", diff --git a/sql/optimizer_costs.h b/sql/optimizer_costs.h index 698cdbfe41e..3b2300b9019 100644 --- a/sql/optimizer_costs.h +++ b/sql/optimizer_costs.h @@ -87,6 +87,8 @@ extern OPTIMIZER_COSTS heap_optimizer_costs, tmp_table_optimizer_costs; #define TABLE_SCAN_SETUP_COST optimizer_scan_setup_cost #define TABLE_SCAN_SETUP_COST_THD(THD) (THD)->variables.optimizer_scan_setup_cost #define INDEX_SCAN_SETUP_COST optimizer_scan_setup_cost/2 +/* Cost for doing duplicate removal in test_quick_select */ +#define DUPLICATE_REMOVAL_COST default_optimizer_costs.key_copy_cost /* Default fill factors of an (b-tree) index block is assumed to be 0.75 */ #define INDEX_BLOCK_FILL_FACTOR_DIV 3 |