54 files changed, 1961 insertions, 569 deletions
diff --git a/Docs/optimizer_costs.txt b/Docs/optimizer_costs.txt
new file mode 100644
index 00000000000..4e0ca40c811
--- /dev/null
+++ b/Docs/optimizer_costs.txt
@@ -0,0 +1,383 @@
+This file is intended to explain some of the optimizer cost variables
+in MariaDB 10.11.
+
+Timings made on:
+CPU: Intel(R) Xeon(R) W-2295 CPU @ 3.00GHz
+Memory: 256G
+Disk: Samsung SSD 860 (not really relevant in this case)
+
+Rows in test: 1M
+
+Most timings have come from running:
+
+perl check_costs.pl --skip-drop --engine=heap --user=monty --rows=1000000 --socket=/tmp/mysql-dbug.sock --init-query="set @@optimizer_where_compare_cost=0.01,@@optimizer_row_copy_cost=0.1,@@optimizer_key_copy_cost=0.025,@@optimizer_index_block_copy_cost=0.2,@@optimizer_index_next_find_cost=0.0125,@@optimizer_key_compare_cost=0.005" --comment="--aria-pagecache-buffer-size=10G --innodb-buffer_pool_size=10G --key_buffer-size=1G --max-heap-table-size=10G"
+
+- All costs are changed to be milliseconds for the engine operations
+  + calculation of the WHERE clause. This is a big change from before the
+  patch that added this file, where the basic cost was one disk seek and
+  one index read.
+- I am using Aria as the 'base' cost. This is because it caches all data,
+  which most engines would do.
+- MyISAM cannot be used as it does not cache row data (which gives a high
+  overhead when doing row lookups).
+- Heap is in memory and a bit too special (no caching).
+- InnoDB is a clustered engine where secondary indexes have to use
+  the clustered index to find a row (not the common case among engines).
+
+The assumption in the optimizer has 'always' been that 1 cost = 1 seek = 0.10ms.
+However this has not been reflected in most costs.
+This document is the base of changing things so that 1 cost = 1 ms.
+
+--------
+
+optimizer_where_cost:
+
+MariaDB [test]> select benchmark(100000000,l_commitDate >= '2000-01-01' and l_tax >= 0.0) from test.check_costs limit 1;
++--------------------------------------------------------------------+
+| benchmark(100000000,l_commitDate >= '2000-01-01' and l_tax >= 0.0) |
++--------------------------------------------------------------------+
+| 0 |
++--------------------------------------------------------------------+
+1 row in set (3.198 sec)
+
+Time of where in seconds: 3.198 / 100000000 (100,000,000)
+
+Verification:
+
+select sum(1) from seq_1_to_100000000 where seq>=0.0 and seq>=-1.0;
++-----------+
+| sum(1) |
++-----------+
+| 100000000 |
++-----------+
+1 row in set (8.564 sec)
+
+MariaDB [test]> select sum(1) from seq_1_to_100000000;
++-----------+
+| sum(1) |
++-----------+
+| 100000000 |
++-----------+
+1 row in set (5.162 sec)
+
+Time of where= (8.564-5.162)/100000000 = 3.402/100000000 (100,000,000)
+(OK, as these are slightly different computations)
+
+check_costs.pl provides these numbers when using heap tables and 1M rows:
+
+simple where: 118.689 ms
+complex where: 138.474 ms
+no where: 83.699 ms
+
+Which gives for the simple where:
+(118.689-83.699)/1000000 = 3.499e-05 ms
+which is in the same ballpark.
+
+We use the result from the select benchmark run as this has the least
+overhead and is easiest to repeat and verify in a test.
+Which gives optimizer_where_cost= 3.2e-05 ms / where.
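+The arithmetic above is easy to re-check mechanically. A minimal standalone
+sketch (editorial illustration; the numbers are the measurements quoted
+above, not values read from a server):
+
+  #include <stdio.h>
+
+  int main()
+  {
+    /* benchmark(): 100M evaluations of the WHERE clause took 3.198 sec */
+    double evals= 100000000.0;
+    printf("where cost: %.3g ms/eval\n", 3.198 * 1000.0 / evals);
+    /* verification: (8.564 - 5.162) sec for 100M rows via seq_1_to_* */
+    printf("verify:     %.3g ms/eval\n", (8.564 - 5.162) * 1000.0 / evals);
+    return 0;
+  }
+
+Both come out at roughly 3.2e-05 to 3.4e-05 ms per evaluation, which is
+where the 3.2e-05 above comes from.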
+
+----------------------------
+
+HEAP_SCAN_TIME & ROW_COPY_COST
+
+We start with heap as all rows are in memory and we don't have to take
+disk reads into account.
+
+select sum(l_partkey) from test.check_costs
+table_scan ms: 11.72391252
+rows: 1000000
+
+Cost should be 11.72391252 (scan cost) + 32 (where cost)
+
+cost= scan_time() * optimizer_cache_cost * SCAN_LOOKUP_COST +
+      TABLE_SCAN_SETUP_COST * avg_io_cost() +
+      records * (ROW_COPY_COST + WHERE_COMPARE_COST);
+
+=>
+We are ignoring TABLE_SCAN_SETUP_COST, which exists just to make the
+optimizer prefer index scans.
+We can also ignore records * WHERE_COMPARE_COST as it is not part of
+the above 'ms'.
+
+cost= scan_time() * 1 * 1 +
+      1000000.0 * row_copy_cost
+=>
+cost= time_per_row * 1000000 + row_copy_cost * 1000000;
+=>
+time_per_row + row_copy_cost= cost/1000000
+
+Let's assume that for heap, finding the next row is 80% of the time and
+copying the row (a memcpy) to the upper level is the other 20%.
+
+time_per_row= 11.72/1000000*0.8 = 9.3760000000000014e-06
+row_copy_cost= 11.72/1000000*0.2 = 2.3340000000000001e-06
+
+Conclusion:
+heap_scan_time= 9.376e-06
+row_copy_cost= 2.334e-06
+
+I set key_copy_cost to row_copy_cost/5 to promote key scans.
+
+----------------------------
+HEAP_RANGE_TIME
+
+select sum(l_orderkey) from test.check_costs force index(commitdate) where l_commitDate >= '2000-01-01' and l_tax >= 0.0
+range_scan ms: 54.04171921
+
+This is doing a scan through a binary tree and then finding the row
+and copying it.
+The cost of copying the record is the same as above (row_copy_cost),
+which means that the cost of finding a row through an index scan is:
+
+expected_cost/1000000.0 - row_copy_cost
+54.04171921/1000000.0 - 2.3344e-06 = 5.1707319209999997e-05
+
+--------------------------
+
+SEQUENCE SCAN:
+
+analyze format=json select sum(seq+1) from seq_1_to_1000000;
+r_table_time_ms: 15.78074764
+
+Taking key_copy_cost into account:
+
+scan_cost/row= (15.78074764 - key_copy_cost * 1000000.0)/1000000 =
+
+(15.78074764 - 2.334e-06/5*1000000.0)/1000000= 1.531394764e-05
+
+---------------------
+
+HEAP_KEY_LOOKUP
+
+We can use this code to check the cost of an index read in a table:
+
+analyze format=json select straight_join count(*) from seq_1_to_1000000,check_costs where seq=l_orderkey
+
+"query_block": {
+    "select_id": 1,
+    "r_loops": 1,
+    "r_total_time_ms": 420.5083447,
+    "table": {
+      "table_name": "seq_1_to_1000000",
+      "access_type": "index",
+      "possible_keys": ["PRIMARY"],
+      "key": "PRIMARY",
+      "key_length": "8",
+      "used_key_parts": ["seq"],
+      "r_loops": 1,
+      "rows": 1000000,
+      "r_rows": 1000000,
+      "r_table_time_ms": 12.47830611,
+      "r_other_time_ms": 44.0671283,
+      "filtered": 100,
+      "r_filtered": 100,
+      "using_index": true
+    },
+    "table": {
+      "table_name": "check_costs",
+      "access_type": "eq_ref",
+      "possible_keys": ["PRIMARY"],
+      "key": "PRIMARY",
+      "key_length": "4",
+      "used_key_parts": ["l_orderkey"],
+      "ref": ["test.seq_1_to_1000000.seq"],
+      "r_loops": 1000000,
+      "rows": 1,
+      "r_rows": 1,
+      "r_table_time_ms": 193.8698744,
+      "r_other_time_ms": 143.4234354,
+      "filtered": 100,
+      "r_filtered": 100,
+      "attached_condition": "seq_1_to_1000000.seq = check_costs.l_orderkey"
+    }
+  }
+
+This gives the time for a key lookup on a hash key as:
+193.869874/1000000.0 - row_copy_cost =
+193.869874/1000000.0 - 2.3344e-06 = 0.000191535474 ~= 1.91e-4
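+
+The same split, written out as a small program (editorial sketch; the
+80%/20% split is the assumption made above, not a measured fact):
+
+  #include <stdio.h>
+
+  int main()
+  {
+    double scan_ms= 11.72391252, lookup_ms= 193.8698744, rows= 1000000.0;
+    double time_per_row=  scan_ms / rows * 0.8;  /* finding the next row */
+    double row_copy_cost= scan_ms / rows * 0.2;  /* copy to upper level */
+    /* eq_ref time per row minus the copy gives the pure hash lookup */
+    double key_lookup= lookup_ms / rows - row_copy_cost;
+    printf("heap_scan_time= %.4g\nrow_copy_cost= %.4g\nkey_lookup= %.4g\n",
+           time_per_row, row_copy_cost, key_lookup);
+    return 0;
+  }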
+
+--------------------
+
+ARIA TABLE SCAN
+(When all rows are cached)
+
+table_scan ms: 119.4276553
+
+Cost is calculated as:
+
+blocks= stats.data_file_length / stats.block_size = 125681664/8192 = 15342
+
+cost= blocks * DISK_SEEK_BASE_COST * avg_io_cost() *
+      optimizer_cache_cost * SCAN_LOOKUP_COST +
+      blocks * INDEX_BLOCK_COPY_COST +
+      TABLE_SCAN_SETUP_COST +
+      records * (ROW_COPY_COST + WHERE_COST);
+
+When all is in memory (optimizer_cache_cost= 0) we get:
+
+cost= blocks * INDEX_BLOCK_COPY_COST +
+      TABLE_SCAN_SETUP_COST +
+      records * (ROW_COPY_COST + WHERE_COST);
+
+To calculate INDEX_BLOCK_COPY_COST I added a temporary tracker in
+ma_pagecache.cc::pagecache_read() and ran the same query.
+I got the data:
+{counter = 17755, sum = 1890559}
+which gives the time for copying a block as:
+1000.0*1890559/sys_timer_info.cycles.frequency/17755 = 3.558138826971332e-05 ms
+
+And thus INDEX_BLOCK_COPY_COST = 3.56e-05
+
+Replacing known constants (and ignoring WHERE as it's not part of the table
+scan):
+
+cost= 119.49 = 15342 * 3.56e-5 + 1 + 1000000 * row_copy_cost;
+
+row_copy_cost= (119.49 - (15342 * 3.56e-5 + 1))/1000000 = 0.000118
+
+-------------------
+
+Finding out the cost of reading X keys from an index (no row lookup) in Aria.
+
+From check_costs.pl for a range scan of 1M rows:
+select sum(l_discount) from test.check_costs force index(commitdate) where l_commitDate > '2000-01-01' and l_discount >= 0.0
+
+Index scan for Aria: index_scan sec: 0.1039406005 cost: 49664
+
+key_read_time()= rows * index_length / INDEX_BLOCK_FILL_FACTOR / block_size *
+                 (avg_io_cost + INDEX_BLOCK_COPY_COST / optimizer_cache_cost);
+
+index_scan_cost= key_read_time() * optimizer_cache_cost +
+                 rows * (INDEX_NEXT_FIND_COST + KEY_COPY_COST + WHERE_COMPARE_COST)
+
+avg_io_cost() = 1 (constant)
+INDEX_BLOCK_FILL_FACTOR= 0.75 (constant)
+optimizer_cache_cost= 0.5 (default)
+block_size= 8192 (aria default)
+index_length= 19 (from test case)
+INDEX_BLOCK_COPY_COST= 0.2
+
+The WHERE time should be 0.03499 (as seen from the above calculation of scan
+time). Note that it is not part of the times below, but it is part of the
+cost calculation!
+
+key_read_time= rows * (19 / 0.75 / 8192 * (1.0 + 0.2 / 0.5)) =
+               3092 (blocks) * (1 + 0.4) = 4328
+
+index_scan_cost= 4328 +
+                 rows * (INDEX_NEXT_FIND_COST + KEY_COPY_COST + WHERE_COMPARE_COST);
+
+The above should cover the cost of 0.1167679423 seconds (read) + 0.03499 (where)
+= 0.1517579423 sec
+
+Which means that the cost for the above scan, 0.1517579423 sec, should have
+been ~151. However everything is in memory for this test, so reading the
+3092 blocks takes no time, instead of the ~3 seconds it would take if they
+were not in the cache.
+
+Running with an optimizer_cache_hit_ratio of 99 (almost all rows in cache)
+gives us:
+ha_key_scan_time()= 649 + records() * INDEX_NEXT_FIND_COST;
+
+WHERE_COMPARE_COST = 0.03499 / 1000000 * 100 = 3.5e-06
+
+Results from check_costs.pl per engine:
+
+Aria   sec: 0.1167679423 cost: 37164.449327
+InnoDB sec: 0.1270633150 cost: 49847.613142
+MyISAM sec: 0.1502838113 cost: 39328.370352
+heap   sec: 0.0675109278 cost: 295587.370000
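+
+The key_read_time arithmetic above, as a standalone cross-check (editorial
+sketch; the constants are the defaults listed above):
+
+  #include <stdio.h>
+
+  int main()
+  {
+    double rows= 1000000.0, index_length= 19.0, block_size= 8192.0;
+    double fill_factor= 0.75;        /* INDEX_BLOCK_FILL_FACTOR */
+    double avg_io_cost= 1.0, block_copy_cost= 0.2, cache_cost= 0.5;
+    /* number of index blocks that have to be touched */
+    double blocks= rows * index_length / fill_factor / block_size;
+    double key_read_time= blocks * (avg_io_cost + block_copy_cost / cache_cost);
+    printf("blocks= %.0f  key_read_time= %.0f\n", blocks, key_read_time);
+    return 0;
+  }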
+
+--------------------------------------------
+
+Finding the cost of reading X rows through an index
+
+Aria   sec: 0.3594678257 cost: 477749.469327
+InnoDB sec: 1.1580623840 cost: 724847.613142
+MyISAM sec: 1.3118995710 cost: 482999.390352
+heap   sec: 0.0593971533 cost: 295587.370000
+
+row_lookup_cost= rows * ((ROW_LOOKUP_COST + INDEX_LOOKUP_COST) * avg_io_cost() * optimizer_cache_cost +
+                         INDEX_NEXT_FIND_COST + ROW_COPY_COST + WHERE_COMPARE_COST)
+
+Assuming ROW_LOOKUP_COST=0.5 and INDEX_LOOKUP_COST=1
+
+For Aria:
+
+rows * ((1 + 0.5) * 0.5 + INDEX_NEXT_FIND_COST + ROW_COPY_COST + WHERE_COMPARE_COST)
+
+---------------------------
+
+Finding out table scan costs:
+
+scan cost=
+  file_size / block_size * io_cost (1.0) * optimizer_cache_cost (0.5) *
+  scan_lookup_cost (1.0) + table_scan_setup_cost (1.0) * io_cost(1.0) +
+  rows (1000000) * (ROW_COPY_COST + WHERE_COMPARE_COST);
+
+= file_size / block_size * 0.5 + 1.0 +
+  1000000 * (ROW_COPY_COST + WHERE_COMPARE_COST);
+
+analyze format=json select sum(1) from seq_1_to_100000000;
+r_table_time_ms: 1189.239022
+
+From check_costs.pl:
+
+Aria:   table_scan sec: 0.1125753767 cost: 117672.5000
+InnoDB: table_scan sec: 0.1327371202 cost: 113766.4600
+MyISAM: table_scan sec: 0.1495176535 cost: 123697.2632
+heap:   table_scan sec: 0.0123179988 cost: 143334.3833
+
+---------------------
+
+I tried to use the performance schema to get costs, but I was not successful,
+as all timings I got for tables showed the total time of executing the
+statement, not the time for doing the actual reads.
+
+---------------------
+
+With --performance-schema=on
+
+MariaDB [test]> select sum(1) from seq_1_to_100000000;
++-----------+
+| sum(1) |
++-----------+
+| 100000000 |
++-----------+
+1 row in set (4.950 sec)
+
+Performance schema overhead: 30.1%
+
+With:
+UPDATE performance_schema.setup_consumers SET ENABLED = 'YES';
+UPDATE performance_schema.setup_instruments SET ENABLED = 'YES', TIMED = 'YES';
+
+Flush with:
+CALL sys.ps_truncate_all_tables(FALSE);
+
+Performance schema overhead now: 32.9%
+
+Timings from:
+select * from events_statements_current where thread_id=80;
+
+MariaDB [test]> select 885402302809000-884884140290000;
++---------------------------------+
+| 885402302809000-884884140290000 |
++---------------------------------+
+| 518162519000 |
++---------------------------------+
+-> Need to divide by 1000000000000.0 to get seconds
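+
+For reference, the cycle-to-millisecond conversion used for the
+INDEX_BLOCK_COPY_COST tracker data earlier in this file, as code (editorial
+sketch; 3.0e9 stands in for sys_timer_info.cycles.frequency on the 3.00GHz
+test machine):
+
+  #include <stdio.h>
+
+  int main()
+  {
+    /* data from the temporary tracker in ma_pagecache.cc::pagecache_read() */
+    double counter= 17755.0, cycles= 1890559.0;
+    double frequency= 3.0e9;               /* assumed cycles per second */
+    printf("ms per block copy= %.3g\n", 1000.0 * cycles / frequency / counter);
+    return 0;
+  }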
diff --git a/include/my_global.h b/include/my_global.h
index 2eed45e8407..a392429f46b 100644
--- a/include/my_global.h
+++ b/include/my_global.h
@@ -671,6 +671,7 @@ typedef SOCKET_SIZE_TYPE size_socket;
   Io buffer size; Must be a power of 2 and a multiple of 512. May be smaller
   what the disk page size. This influences the speed of the isam btree
   library. eg to big to slow.
+  4096 is a common block size on SSDs.
 */
 #define IO_SIZE 4096U
 /*
diff --git a/include/my_tracker.h b/include/my_tracker.h
new file mode 100644
index 00000000000..88cefe5ef5d
--- /dev/null
+++ b/include/my_tracker.h
@@ -0,0 +1,41 @@
+/* Copyright (c) 2022, MariaDB Corporation.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation; version 2 of the License.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; if not, write to the Free Software
+   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1335 USA */
+
+/*
+  Trivial framework to add a tracker to a C function
+*/
+
+#include "my_rdtsc.h"
+
+struct my_time_tracker
+{
+  ulonglong counter;
+  ulonglong cycles;
+};
+
+#ifdef HAVE_TIME_TRACKING
+#define START_TRACKING ulonglong my_start_time= my_timer_cycles()
+#define END_TRACKING(var) \
+  { \
+    ulonglong my_end_time= my_timer_cycles(); \
+    (var)->counter++; \
+    (var)->cycles+= (unlikely(my_end_time < my_start_time) ? \
+                     my_end_time - my_start_time + ULONGLONG_MAX : \
+                     my_end_time - my_start_time); \
+  }
+#else
+#define START_TRACKING
+#define END_TRACKING(var) do { } while(0)
+#endif
diff --git a/sql/filesort_utils.cc b/sql/filesort_utils.cc
index 36ae0e23855..d0744becb07 100644
--- a/sql/filesort_utils.cc
+++ b/sql/filesort_utils.cc
@@ -106,12 +106,13 @@ double get_pq_sort_cost(size_t num_rows, size_t queue_size,
 
 static double get_merge_cost(ha_rows num_elements, ha_rows num_buffers,
-                             size_t elem_size, double compare_cost)
+                             size_t elem_size, double compare_cost,
+                             double disk_read_cost)
 {
   /* 2 -> 1 read + 1 write */
   const double io_cost= (2.0 * (num_elements * elem_size +
                                 DISK_CHUNK_SIZE - 1) /
-                         DISK_CHUNK_SIZE);
+                         DISK_CHUNK_SIZE) * disk_read_cost;
   /* 2 -> 1 insert, 1 pop for the priority queue used to merge the buffers. */
   const double cpu_cost= (2.0 * num_elements * log2(1.0 + num_buffers) *
                           compare_cost) * PQ_SORT_SLOWNESS_CORRECTION_FACTOR;
@@ -131,6 +132,7 @@ double get_merge_many_buffs_cost_fast(ha_rows num_rows,
                                       ha_rows num_keys_per_buffer,
                                       size_t elem_size,
                                       double key_compare_cost,
+                                      double disk_read_cost,
                                       bool with_addon_fields)
 {
   DBUG_ASSERT(num_keys_per_buffer != 0);
@@ -162,7 +164,7 @@ double get_merge_many_buffs_cost_fast(ha_rows num_rows,
     total_cost+=
       num_merge_calls *
       get_merge_cost(num_keys_per_buffer * MERGEBUFF, MERGEBUFF, elem_size,
-                     key_compare_cost);
+                     key_compare_cost, disk_read_cost);
 
     // # of records in remaining buffers.
     last_n_elems+= num_remaining_buffs * num_keys_per_buffer;
@@ -170,7 +172,7 @@ double get_merge_many_buffs_cost_fast(ha_rows num_rows,
     // Cost of merge sort of remaining buffers.
     total_cost+=
       get_merge_cost(last_n_elems, 1 + num_remaining_buffs, elem_size,
-                     key_compare_cost);
+                     key_compare_cost, disk_read_cost);
 
     num_buffers= num_merge_calls;
     num_keys_per_buffer*= MERGEBUFF;
@@ -179,7 +181,7 @@ double get_merge_many_buffs_cost_fast(ha_rows num_rows,
   // Simulate final merge_buff call.
  last_n_elems+= num_keys_per_buffer * num_buffers;
  total_cost+= get_merge_cost(last_n_elems, 1 + num_buffers, elem_size,
-                             key_compare_cost);
+                             key_compare_cost, disk_read_cost);
  return total_cost;
}
@@ -264,6 +266,7 @@ void Sort_costs::compute_merge_sort_costs(Sort_param *param,
{
  size_t row_length= param->sort_length + param->ref_length + sizeof(char *);
  size_t num_available_keys= memory_available / row_length;
+  THD *thd= param->sort_form->in_use;

  costs[MERGE_SORT_ALL_FIELDS]= DBL_MAX;
  costs[MERGE_SORT_ORDER_BY_FIELDS]= DBL_MAX;
@@ -272,6 +275,7 @@ void Sort_costs::compute_merge_sort_costs(Sort_param *param,
    costs[MERGE_SORT_ORDER_BY_FIELDS]=
      get_merge_many_buffs_cost_fast(num_rows, num_available_keys,
                                     row_length, DEFAULT_KEY_COMPARE_COST,
+                                    DISK_READ_COST_THD(thd),
                                     false) +
      param->sort_form->file->ha_rndpos_time(MY_MIN(param->limit_rows,
                                                    num_rows));
@@ -286,6 +290,7 @@ void Sort_costs::compute_merge_sort_costs(Sort_param *param,
    costs[MERGE_SORT_ALL_FIELDS]=
      get_merge_many_buffs_cost_fast(num_rows, num_available_keys,
                                     row_length, DEFAULT_KEY_COMPARE_COST,
+                                    DISK_READ_COST_THD(thd),
                                     true);
}
diff --git a/sql/ha_partition.cc b/sql/ha_partition.cc
index 51748f851de..0cedfb0085c 100644
--- a/sql/ha_partition.cc
+++ b/sql/ha_partition.cc
@@ -9707,24 +9707,27 @@ uint ha_partition::get_biggest_used_partition(uint *part_index)
    time for scan
*/

-double ha_partition::scan_time()
+IO_AND_CPU_COST ha_partition::scan_time()
{
-  double scan_time= 0;
+  IO_AND_CPU_COST scan_time= {0,0};
  uint i;
  DBUG_ENTER("ha_partition::scan_time");

  for (i= bitmap_get_first_set(&m_part_info->read_partitions);
       i < m_tot_parts;
       i= bitmap_get_next_set(&m_part_info->read_partitions, i))
-    scan_time+= m_file[i]->scan_time();
+  {
+    IO_AND_CPU_COST cost= m_file[i]->scan_time();
+    scan_time.io+= cost.io;
+    scan_time.cpu+= cost.cpu;
+  }
  if (m_tot_parts)
  {
    /*
      Add TABLE_SCAN_SETUP_COST for partitions to make cost similar to
      in ha_scan_time()
    */
-    scan_time+= (TABLE_SCAN_SETUP_COST * avg_io_cost() * (m_tot_parts - 1) /
-                 optimizer_cache_cost);
+    scan_time.cpu+= TABLE_SCAN_SETUP_COST * (m_tot_parts - 1);
  }
  DBUG_RETURN(scan_time);
}
@@ -9739,34 +9742,68 @@ double ha_partition::scan_time()
  @return time for scanning index inx
*/

-double ha_partition::key_scan_time(uint inx)
+IO_AND_CPU_COST ha_partition::key_scan_time(uint inx)
{
-  double scan_time= 0;
+  IO_AND_CPU_COST scan_time= {0,0};
  uint i;
  DBUG_ENTER("ha_partition::key_scan_time");

  for (i= bitmap_get_first_set(&m_part_info->read_partitions);
       i < m_tot_parts;
       i= bitmap_get_next_set(&m_part_info->read_partitions, i))
-    scan_time+= m_file[i]->key_scan_time(inx);
+  {
+    IO_AND_CPU_COST cost= m_file[i]->key_scan_time(inx);
+    scan_time.io+= cost.io;
+    scan_time.cpu+= cost.cpu;
+  }
  DBUG_RETURN(scan_time);
}

-double ha_partition::keyread_time(uint inx, uint ranges, ha_rows rows)
+IO_AND_CPU_COST ha_partition::keyread_time(uint inx, ulong ranges, ha_rows rows,
+                                           ulonglong blocks)
{
-  double read_time= 0;
+  IO_AND_CPU_COST read_time= {0,0};
  uint i;
+  uint partitions= bitmap_bits_set(&m_part_info->read_partitions);
  DBUG_ENTER("ha_partition::keyread_time");
-  if (!ranges)
-    DBUG_RETURN(handler::keyread_time(inx, ranges, rows));
+  if (partitions == 0)
+    DBUG_RETURN(read_time);
+
+  uint rows_per_part= (rows + partitions - 1)/partitions;
  for (i= bitmap_get_first_set(&m_part_info->read_partitions);
       i < m_tot_parts;
       i= bitmap_get_next_set(&m_part_info->read_partitions, i))
-    read_time+= m_file[i]->keyread_time(inx, ranges, rows);
+  {
+    IO_AND_CPU_COST cost= m_file[i]->keyread_time(inx, ranges,
                                                  rows_per_part,
+                                                  blocks);
+    read_time.io+= cost.io;
+    read_time.cpu+= cost.cpu;
+  }
  DBUG_RETURN(read_time);
}

+IO_AND_CPU_COST ha_partition::rndpos_time(ha_rows rows)
+{
+  IO_AND_CPU_COST read_time= {0,0};
+  uint i;
+  uint partitions= bitmap_bits_set(&m_part_info->read_partitions);
+  if (partitions == 0)
+    return read_time;
+
+  uint rows_per_part= (rows + partitions - 1)/partitions;
+  for (i= bitmap_get_first_set(&m_part_info->read_partitions);
+       i < m_tot_parts;
+       i= bitmap_get_next_set(&m_part_info->read_partitions, i))
+  {
+    IO_AND_CPU_COST cost= m_file[i]->rndpos_time(rows_per_part);
+    read_time.io+= cost.io;
+    read_time.cpu+= cost.cpu;
+  }
+  return read_time;
+}
+
+
/**
  Find number of records in a range.
  @param inx      Index number
@@ -9866,33 +9903,6 @@ ha_rows ha_partition::estimate_rows_upper_bound()
}


-/*
-  Get time to read
-
-  SYNOPSIS
-    read_time()
-    index     Index number used
-    ranges    Number of ranges
-    rows      Number of rows
-
-  RETURN VALUE
-    time for read
-
-  DESCRIPTION
-    This will be optimised later to include whether or not the index can
-    be used with partitioning. To achieve we need to add another parameter
-    that specifies how many of the index fields that are bound in the ranges.
-    Possibly added as a new call to handlers.
-*/
-
-double ha_partition::read_time(uint index, uint ranges, ha_rows rows)
-{
-  DBUG_ENTER("ha_partition::read_time");
-
-  DBUG_RETURN(get_open_file_sample()->read_time(index, ranges, rows));
-}
-
-
/**
  Number of rows in table. see handler.h
diff --git a/sql/ha_partition.h b/sql/ha_partition.h
index 14347ee9ac0..fea2ac8d567 100644
--- a/sql/ha_partition.h
+++ b/sql/ha_partition.h
@@ -1031,17 +1031,15 @@ public:
  /*
    Called in test_quick_select to determine if indexes should be used.
  */
-  double scan_time() override;
+  IO_AND_CPU_COST scan_time() override;

-  double key_scan_time(uint inx) override;
+  IO_AND_CPU_COST key_scan_time(uint inx) override;

-  double keyread_time(uint inx, uint ranges, ha_rows rows) override;
+  IO_AND_CPU_COST keyread_time(uint inx, ulong ranges, ha_rows rows,
+                               ulonglong blocks) override;
+  IO_AND_CPU_COST rndpos_time(ha_rows rows) override;

  /*
-    The next method will never be called if you do not implement indexes.
-  */
-  double read_time(uint index, uint ranges, ha_rows rows) override;
-  /*
    For the given range how many records are estimated to be in this range.
    Used by optimiser to calculate cost of using a particular index.
  */
diff --git a/sql/handler.cc b/sql/handler.cc
index 677ab9f7cbe..394e8ba6f9e 100644
--- a/sql/handler.cc
+++ b/sql/handler.cc
@@ -3224,56 +3224,46 @@ LEX_CSTRING *handler::engine_name()

/*
-  It is assumed that the value of the parameter 'ranges' can be only 0 or 1.
-  If ranges == 1 then the function returns the cost of index only scan
-  by index 'keyno' of one range containing 'rows' key entries.
-  If ranges == 0 then the function returns only the cost of copying
-  those key entries into the engine buffers.
-
-  This function doesn't take in account into copying the key to record
-  (KEY_COPY_COST) or comparing the key to the where clause (WHERE_COMPARE_COST)
+  Calculate cost of 'index_only' scan for given index and number of records.
+
+  @param index  index to use (not clustered)
+  @param ranges Number of ranges (b-tree dives in case of b-tree)
+  @param rows   Number of expected rows
+  @param blocks Number of disk blocks to read. 0 if not known
+
+  This function doesn't take into account copying the key to record
+  (INDEX_NEXT_FIND_COST + KEY_COPY_COST) or comparing the key to the WHERE
+  clause (WHERE_COST)
*/

-double handler::keyread_time(uint index, uint ranges, ha_rows rows)
+IO_AND_CPU_COST handler::keyread_time(uint index, ulong ranges, ha_rows rows,
+                                      ulonglong blocks)
{
-  size_t len;
-  double cost;
-  DBUG_ASSERT(ranges == 0 || ranges == 1);
-  len= table->key_info[index].key_length + ref_length;
-  if (table->file->is_clustering_key(index))
-    len= table->s->stored_rec_length;
-
-  cost= ((double)rows*len/(stats.block_size+1) *
-         INDEX_BLOCK_COPY_COST(table->in_use));
-  /*
-    We divide the cost with optimizer_cache_cost as ha_keyread_time()
-    and ha_key_scan_time() will multiply the result value with
-    optimizer_cache_cost and we want to keep the above 'memory operation'
-    cost unaffected by this multiplication.
-  */
-  cost/= optimizer_cache_cost;
-  if (ranges)
+  IO_AND_CPU_COST cost;
+  if (!blocks && stats.block_size)
  {
-    uint keys_per_block= (uint) (stats.block_size*3/4/len+1);
-    /*
-      We let the cost grow slowly in proportion to number of rows to
-      promote indexes with less rows.
-      We do not calculate exact number of block reads as then index
-      only reads will be more costly than normal reads, especially
-      compared to InnoDB clustered keys.
-
-      INDEX_LOOKUP_COST is the cost of finding the first key in the
-      range. Finding the next key is usually a fast operation so we
-      don't count it here, it is taken into account in
-      ha_keyread_and_copy_time()
-    */
-    cost+= (((double) (rows / keys_per_block) + INDEX_LOOKUP_COST) *
-            avg_io_cost());
+    ulonglong len;
+    if (table->file->is_clustering_key(index))
+      len= table->s->stored_rec_length;
+    else
+      len= table->key_info[index].key_length + ref_length;
+    blocks= (rows * len / INDEX_BLOCK_FILL_FACTOR) / stats.block_size + 1;
  }
+  cost.io=  blocks * stats.block_size/IO_SIZE * avg_io_cost();
+  cost.cpu= ranges * INDEX_LOOKUP_COST + blocks * INDEX_BLOCK_COPY_COST;
  return cost;
}

+double handler::ha_keyread_time(uint index, ulong ranges, ha_rows rows,
+                                ulonglong blocks)
+{
+  IO_AND_CPU_COST cost= keyread_time(index, ranges, rows, blocks);
+  return (cost.io * optimizer_cache_cost +
+          cost.cpu + rows * INDEX_NEXT_FIND_COST);
+}
+
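+/*
+  Editorial illustration (not part of this patch): an engine whose data is
+  always fully cached could override the split-cost API roughly like this;
+  ha_example is a hypothetical class name.
+
+    IO_AND_CPU_COST ha_example::keyread_time(uint index, ulong ranges,
+                                             ha_rows rows, ulonglong blocks)
+    {
+      IO_AND_CPU_COST cost;
+      cost.io=  0.0;                                 // never any disk reads
+      cost.cpu= (ranges * INDEX_LOOKUP_COST +        // one dive per range
+                 blocks * INDEX_BLOCK_COPY_COST);    // per-block copy work
+      return cost;
+    }
+*/
+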
THD *handler::ha_thd(void) const
{
  DBUG_ASSERT(!table || !table->in_use || table->in_use == current_thd);
@@ -3402,7 +3392,7 @@ int handler::ha_open(TABLE *table_arg, const char *name, int mode,
    DBUG_ASSERT(optimizer_key_copy_cost >= 0.0);
    DBUG_ASSERT(optimizer_index_next_find_cost >= 0.0);
    DBUG_ASSERT(optimizer_row_copy_cost >= 0.0);
-    DBUG_ASSERT(optimizer_where_cmp_cost >= 0.0);
+    DBUG_ASSERT(optimizer_where_cost >= 0.0);
    DBUG_ASSERT(optimizer_key_cmp_cost >= 0.0);
    reset_statistics();
  }
@@ -8729,23 +8719,39 @@ Table_scope_and_contents_source_st::fix_period_fields(THD *thd,
  This is needed to provide fast acccess to these variables during
  optimization (as we refer to them multiple times).
+  We also allow the engine to change these.
  The other option would be to access them from thd, but that would require a
  function call (as we cannot access THD from an inline handler function)
  and two extra memory accesses for each variable.
-
-  index_block_copy_cost is not copied as it is used so seldom.
*/ void handler::set_optimizer_costs(THD *thd) { - optimizer_key_copy_cost= thd->variables.optimizer_key_copy_cost; - optimizer_index_next_find_cost= - thd->variables.optimizer_index_next_find_cost; - optimizer_row_copy_cost= thd->variables.optimizer_row_copy_cost; - optimizer_where_cmp_cost= thd->variables.optimizer_where_cmp_cost; - optimizer_key_cmp_cost= thd->variables.optimizer_key_cmp_cost; - set_optimizer_cache_cost(thd->optimizer_cache_hit_ratio); + if (thd->variables.optimizer_cost_version != optimizer_cost_version) + { + optimizer_cost_version= thd->variables.optimizer_cost_version; + + optimizer_row_lookup_cost= thd->variables.optimizer_row_lookup_cost; + optimizer_index_lookup_cost= thd->variables.optimizer_index_lookup_cost; + optimizer_scan_lookup_cost= thd->variables.optimizer_scan_lookup_cost; + + optimizer_row_next_find_cost= thd->variables.optimizer_row_next_find_cost; + optimizer_index_next_find_cost= thd->variables.optimizer_index_next_find_cost; + optimizer_cache_cost= thd->optimizer_cache_hit_ratio; + optimizer_index_block_copy_cost= thd->variables.optimizer_index_block_copy_cost; + optimizer_disk_read_cost= thd->variables.optimizer_disk_read_cost; + + /* Should probably never have to be modified by the engine */ + optimizer_row_copy_cost= thd->variables.optimizer_row_copy_cost; + optimizer_key_copy_cost= thd->variables.optimizer_key_copy_cost; + + /* Should not be modified by the engine as these are upper level costs */ + optimizer_where_cost= thd->variables.optimizer_where_cost; + optimizer_key_cmp_cost= thd->variables.optimizer_key_cmp_cost; + + optimizer_costs_updated(); + } } diff --git a/sql/handler.h b/sql/handler.h index 0c44f51b9df..151cd802645 100644 --- a/sql/handler.h +++ b/sql/handler.h @@ -3099,6 +3099,21 @@ enum class Compare_keys : uint32_t NotEqual }; +/* Cost for reading a row through an index */ +struct INDEX_READ_COST +{ + double read_cost; + double index_only_cost; +}; + +/* Separated costs for IO and CPU. For handler::keyread_time() */ +struct IO_AND_CPU_COST +{ + double io; + double cpu; +}; + + /** The handler class is the interface for dynamically loadable storage engines. Do not add ifdefs and take care when adding or @@ -3224,6 +3239,7 @@ public: THD::first_successful_insert_id_in_cur_stmt. */ ulonglong insert_id_for_cur_row; + uint64 optimizer_cost_version; /** Interval returned by get_auto_increment() and being consumed by the inserter. @@ -3239,9 +3255,12 @@ public: Updated from THD in open_tables() */ double optimizer_cache_cost; - double optimizer_index_next_find_cost; + double optimizer_index_block_copy_cost; + double optimizer_row_next_find_cost, optimizer_index_next_find_cost; double optimizer_row_copy_cost, optimizer_key_copy_cost; - double optimizer_where_cmp_cost, optimizer_key_cmp_cost; + double optimizer_where_cost, optimizer_key_cmp_cost; + double optimizer_row_lookup_cost, optimizer_index_lookup_cost; + double optimizer_scan_lookup_cost,optimizer_disk_read_cost; ha_copy_info copy_info; @@ -3378,7 +3397,7 @@ public: ref_length(sizeof(my_off_t)), ft_handler(0), inited(NONE), pre_inited(NONE), pushed_cond(0), next_insert_id(0), insert_id_for_cur_row(0), - optimizer_cache_cost((100-DEFAULT_CACHE_HIT_RATIO)/100.0), + optimizer_cost_version((uint64) ~0), tracker(NULL), pushed_idx_cond(NULL), pushed_idx_cond_keyno(MAX_KEY), @@ -3610,10 +3629,13 @@ public: be used by the sql level. 
*/
protected:
-  virtual double scan_time()
+  virtual IO_AND_CPU_COST scan_time()
  {
-    return (((ulonglong2double(stats.data_file_length) / stats.block_size)) *
-            avg_io_cost());
+    double blocks= (double) (stats.data_file_length / IO_SIZE);
+    IO_AND_CPU_COST cost= { (double) blocks * avg_io_cost(),
+                            (double) blocks * INDEX_BLOCK_COPY_COST +
+                            (double) stats.records * ROW_NEXT_FIND_COST };
+    return cost;
  }
public:
@@ -3631,8 +3653,9 @@ public:
  inline double ha_scan_time()
  {
-    return (scan_time() * optimizer_cache_cost +
-            TABLE_SCAN_SETUP_COST * avg_io_cost());
+    IO_AND_CPU_COST cost= scan_time();
+    return (cost.io * optimizer_cache_cost * SCAN_LOOKUP_COST +
+            cost.cpu + TABLE_SCAN_SETUP_COST);
  }

  /*
@@ -3642,75 +3665,39 @@ public:
  inline double ha_scan_and_compare_time(ha_rows records)
  {
    return (ha_scan_time() +
-            (double) records * (ROW_COPY_COST + WHERE_COMPARE_COST));
+            (double) records * (ROW_COPY_COST + WHERE_COST));
  }

+  /* Cost of (random) reading a block of IO_SIZE */
  virtual double avg_io_cost()
  {
-    return 1.0;
+    return DISK_READ_COST;
  }

-  virtual void set_optimizer_costs(THD *thd);
-
  /*
-    Set cost for finding a row in the engine cache
-    This allows the handler to override the cost if there is no
-    caching of rows, like in heap or federatedx.
+    Set the optimizer cost variables in the handler. This is virtual mainly
+    for the partition engine.
  */
-  virtual void set_optimizer_cache_cost(double cost)
-  {
-    optimizer_cache_cost= cost;
-  }
-
-  /**
-    The cost of reading a set of ranges from the table using an index
-    to access it.
-
-    @param index  The index number.
-    @param ranges The number of ranges to be read. If 0, it means that
-                  we calculate separately the cost of reading the key.
-    @param rows   Total number of rows to be read.
-
-    This method can be used to calculate the total cost of scanning a table
-    using an index by calling it using read_time(index, 1, table_size).
-
-    This function is to be reimplemented by engines (if needed). The sql_level
-    should call ha_read_time(), ha_read_and_copy_time() or
-    ha_read_and_compare_time().
+  virtual void set_optimizer_costs(THD *thd);
+  /*
+    Signal that the optimizer cost variables in the handler have been updated
  */
-protected:
-  virtual double read_time(uint index, uint ranges, ha_rows rows)
-  {
-    return ((rows2double(rows) * ROW_LOOKUP_COST +
-             rows2double(ranges) * INDEX_LOOKUP_COST) * avg_io_cost());
-  }
-public:
-
-  /* Same as above, but take into account CACHE_COST */
-  inline double ha_read_time(uint index, uint ranges, ha_rows rows)
-  {
-    return read_time(index, ranges, rows) * optimizer_cache_cost;
-  }
+  virtual void optimizer_costs_updated() {}

-  /* Same as above, but take into account also copying of the row to 'record' */
-  inline double ha_read_and_copy_time(uint index, uint ranges, ha_rows rows)
-  {
-    return (ha_read_time(index, ranges, rows) +
-            rows2double(rows) * ROW_COPY_COST);
-  }
-
-  /* Same as above, but take into account also copying and comparing the row */
-  inline double ha_read_and_compare_time(uint index, uint ranges, ha_rows rows)
-  {
-    return (ha_read_time(index, ranges, rows) +
-            rows2double(rows) * (ROW_COPY_COST + WHERE_COMPARE_COST));
-  }
-
-  /* Cost of reading a row with rowid */
protected:
-  virtual double rndpos_time(ha_rows rows)
+  /*
+    Cost of reading 'rows' number of rows with a rowid
+    Should be the same as read_time() without the ranges calculation and
+    without ROW_LOOKUP_COST (which is added in ha_rndpos_time()).
+  */
+  virtual IO_AND_CPU_COST rndpos_time(ha_rows rows)
  {
-    return rows2double(rows) * ROW_LOOKUP_COST * avg_io_cost();
+    double r= rows2double(rows);
+    return
+    {
+      r * avg_io_cost() * stats.block_size/IO_SIZE,
+      r * INDEX_BLOCK_COPY_COST
+    };
  }
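+  /*
+    Editorial illustration with made-up numbers: if rndpos_time() returns
+    {io= 2.0, cpu= 0.07} for rows= 1000, then with optimizer_cache_cost= 0.2
+    (an 80% cache hit ratio) ha_rndpos_time() below charges
+      2.0 * 0.2 * ROW_LOOKUP_COST + 0.07 + 1000 * ROW_COPY_COST
+    i.e. the IO part is discounted by the expected hit ratio while the CPU
+    part is always charged in full.
+  */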
public:
  /*
@@ -3720,56 +3707,65 @@ public:
  */
  inline double ha_rndpos_time(ha_rows rows)
  {
-    return (rndpos_time(rows) * optimizer_cache_cost +
-            rows2double(rows) * ROW_COPY_COST);
+    IO_AND_CPU_COST cost= rndpos_time(rows);
+    return (cost.io * optimizer_cache_cost * ROW_LOOKUP_COST +
+            cost.cpu + rows2double(rows) * ROW_COPY_COST);
+  }
+
+  inline double ha_rndpos_and_compare_time(ha_rows rows)
+  {
+    IO_AND_CPU_COST cost= rndpos_time(rows);
+    return (cost.io * optimizer_cache_cost * ROW_LOOKUP_COST +
+            cost.cpu +
+            rows2double(rows) * (ROW_COPY_COST + WHERE_COST));
  }

  /**
-    Calculate cost of 'index_only' scan for given index and number of records.
-
-    @param index  Index to read
-    @param flag   If flag == 1 then the function returns the cost of
-                  index only scan by index 'index' of one range containing
-                  'rows' key entries.
-                  If flag == 0 then function returns only the cost of copying
-                  those key entries into the engine buffers.
-    @param rows   #of records to read
+    Calculate cost of 'index_only' scan for a given index, a number of ranges
+    and a number of records.
+
+    @param index  Index to read
+    @param ranges Number of key ranges
+    @param rows   #of records to read
+    @param blocks Number of IO blocks that need to be accessed. 0 if not known
  */
protected:
-  virtual double keyread_time(uint index, uint flag, ha_rows rows);
+  virtual IO_AND_CPU_COST keyread_time(uint index, ulong ranges, ha_rows rows,
+                                       ulonglong blocks);
public:

  /*
    Calculate cost of 'keyread' scan for given index and number of records
    including fetching the key to the 'record' buffer.
  */
-
-  inline double ha_keyread_time(uint index, uint flag, ha_rows rows)
-  {
-    return (keyread_time(index, flag, rows) * optimizer_cache_cost);
-  }
+  double ha_keyread_time(uint index, ulong ranges, ha_rows rows,
+                         ulonglong blocks);

  /* Same as above, but take into account copying the key the the SQL layer */
-  inline double ha_keyread_and_copy_time(uint index, uint flag, ha_rows rows)
+  inline double ha_keyread_and_copy_time(uint index, ulong ranges, ha_rows rows)
  {
-    return ha_keyread_time(index, flag, rows) + (double) rows * KEY_COPY_COST;
+    return (ha_keyread_time(index, ranges, rows, 0) +
+            (double) rows * KEY_COPY_COST);
  }

  /*
    Time for a full table index scan (without copy or compare cost).
    To be overrided by engines, sql level should use ha_key_scan_time().
+    Note that IO_AND_CPU_COST does not include avg_io_cost() !
*/ protected: - virtual double key_scan_time(uint index) + virtual IO_AND_CPU_COST key_scan_time(uint index) { - return keyread_time(index, 1, records()); + return keyread_time(index, 1, records(), 0); } public: /* Cost of doing a full index scan */ inline double ha_key_scan_time(uint index) { - return (key_scan_time(index) * optimizer_cache_cost); + IO_AND_CPU_COST cost= key_scan_time(index); + return (cost.io * optimizer_cache_cost + + cost.cpu + (double) records() * INDEX_NEXT_FIND_COST); } /* @@ -3779,7 +3775,7 @@ public: inline double ha_key_scan_and_compare_time(uint index, ha_rows rows) { return (ha_key_scan_time(index) + - (double) rows * (KEY_COPY_COST + WHERE_COMPARE_COST)); + (double) rows * (KEY_COPY_COST + WHERE_COST)); } virtual const key_map *keys_to_use_for_scanning() { return &key_map_empty; } diff --git a/sql/item_func.cc b/sql/item_func.cc index dae2be5aa4e..6cea984c931 100644 --- a/sql/item_func.cc +++ b/sql/item_func.cc @@ -5911,6 +5911,7 @@ bool Item_func_get_system_var::fix_length_and_dec() decimals=0; break; case SHOW_DOUBLE: + case SHOW_OPTIMIZER_COST: decimals= 6; collation= DTCollation_numeric(); fix_char_length(DBL_DIG + 6); @@ -5968,6 +5969,7 @@ const Type_handler *Item_func_get_system_var::type_handler() const case SHOW_CHAR_PTR: case SHOW_LEX_STRING: return &type_handler_varchar; + case SHOW_OPTIMIZER_COST: case SHOW_DOUBLE: return &type_handler_double; default: diff --git a/sql/multi_range_read.cc b/sql/multi_range_read.cc index 951832fffa7..f59b318dc09 100644 --- a/sql/multi_range_read.cc +++ b/sql/multi_range_read.cc @@ -302,48 +302,39 @@ handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq, if (total_rows != HA_POS_ERROR) { - double io_cost= avg_io_cost(); - double range_lookup_cost= (io_cost * INDEX_LOOKUP_COST * - optimizer_cache_cost); + double key_cost; set_if_smaller(total_rows, max_rows); /* The following calculation is the same as in multi_range_read_info(): */ *flags |= HA_MRR_USE_DEFAULT_IMPL; cost->reset(); - cost->avg_io_cost= cost->idx_avg_io_cost= io_cost; + cost->avg_io_cost= cost->idx_avg_io_cost= 0; // Not used! if (!is_clustering_key(keyno)) { - cost->idx_io_count= (double) io_blocks; + key_cost= ha_keyread_time(keyno, n_ranges, total_rows, io_blocks); + cost->idx_cpu_cost= key_cost; + if (!(*flags & HA_MRR_INDEX_ONLY)) { - cost->idx_cpu_cost= (ha_keyread_time(keyno, 1, total_rows) + - (n_ranges-1) * range_lookup_cost); - cost->cpu_cost= ha_read_time(keyno, 0, total_rows); - cost->copy_cost= rows2double(total_rows) * ROW_COPY_COST; + /* ha_rndpos_time includes ROW_COPY_COST */ + cost->cpu_cost= ha_rndpos_time(total_rows); } else { /* Index only read */ - cost->idx_cpu_cost= (ha_keyread_time(keyno, 1, total_rows) + - (n_ranges-1) * range_lookup_cost); - cost->copy_cost= rows2double(total_rows) * KEY_COPY_COST; + cost->copy_cost= rows2double(total_rows) * KEY_COPY_COST; } } else { - /* - Clustered index - If all index dives are to a few blocks, then limit the - ranges used by read_time to the number of dives. 
- */ + /* Clustered index */ io_blocks+= unassigned_single_point_ranges; - uint limited_ranges= (uint) MY_MIN((ulonglong) n_ranges, io_blocks); - cost->idx_cpu_cost= limited_ranges * range_lookup_cost; - cost->cpu_cost= ha_read_time(keyno, 0, total_rows); - cost->copy_cost= rows2double(total_rows) * ROW_COPY_COST; + key_cost= ha_keyread_time(keyno, n_ranges, total_rows, io_blocks); + cost->idx_cpu_cost= key_cost; + cost->copy_cost= rows2double(total_rows) * ROW_COPY_COST; } - cost->comp_cost= (rows2double(total_rows) * WHERE_COMPARE_COST + + cost->comp_cost= (rows2double(total_rows) * WHERE_COST + MULTI_RANGE_READ_SETUP_COST); } DBUG_PRINT("statistics", @@ -378,7 +369,7 @@ handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq, @param keyno Index number @param n_ranges Estimated number of ranges (i.e. intervals) in the range sequence. - @param n_rows Estimated total number of records contained within all + @param total_rows Estimated total number of records contained within all of the ranges @param bufsz INOUT IN: Size of the buffer available for use OUT: Size of the buffer that will be actually used, or @@ -393,7 +384,7 @@ handler::multi_range_read_info_const(uint keyno, RANGE_SEQ_IF *seq, other Error or can't perform the requested scan */ -ha_rows handler::multi_range_read_info(uint keyno, uint n_ranges, uint n_rows, +ha_rows handler::multi_range_read_info(uint keyno, uint n_ranges, uint total_rows, uint key_parts, uint *bufsz, uint *flags, Cost_estimate *cost) { @@ -410,38 +401,27 @@ ha_rows handler::multi_range_read_info(uint keyno, uint n_ranges, uint n_rows, /* Produce the same cost as non-MRR code does */ if (!is_clustering_key(keyno)) { - double range_lookup_cost= (avg_io_cost() * INDEX_LOOKUP_COST * - optimizer_cache_cost); - /* - idx_io_count could potentially be increased with the number of - index leaf blocks we have to read for finding n_rows. - */ - cost->idx_io_count= n_ranges; + double key_cost= ha_keyread_time(keyno, n_ranges, total_rows, 0); + cost->idx_cpu_cost= key_cost; + if (!(*flags & HA_MRR_INDEX_ONLY)) { - cost->idx_cpu_cost= (keyread_time(keyno, 1, n_rows) + - (n_ranges-1) * range_lookup_cost); - cost->cpu_cost= read_time(keyno, 0, n_rows); - cost->copy_cost= rows2double(n_rows) * ROW_COPY_COST; + /* ha_rndpos_time includes ROW_COPY_COST */ + cost->cpu_cost= ha_rndpos_time(total_rows); } else { - /* - Same as above, but take into account copying the key to the upper - level. - */ - cost->idx_cpu_cost= (keyread_time(keyno, 1, n_rows) + - (n_ranges-1) * range_lookup_cost); - cost->copy_cost= rows2double(n_rows) * KEY_COPY_COST; + /* Index only read */ + cost->copy_cost= rows2double(total_rows) * KEY_COPY_COST; } } else { /* Clustering key */ - cost->cpu_cost= read_time(keyno, n_ranges, n_rows); - cost->copy_cost= rows2double(n_rows) * ROW_COPY_COST; + cost->cpu_cost= ha_keyread_time(keyno, n_ranges, total_rows, 0); + cost->copy_cost= rows2double(total_rows) * ROW_COPY_COST; } - cost->comp_cost= rows2double(n_rows) * WHERE_COMPARE_COST; + cost->comp_cost= rows2double(total_rows) * WHERE_COST; return 0; } @@ -2081,42 +2061,6 @@ void get_sort_and_sweep_cost(TABLE *table, ha_rows nrows, Cost_estimate *cost) /** Get cost of reading nrows table records in a "disk sweep" - A disk sweep read is a sequence of handler->rnd_pos(rowid) calls that made - for an ordered sequence of rowids. - - We assume hard disk IO. The read is performed as follows: - - 1. The disk head is moved to the needed cylinder - 2. The controller waits for the plate to rotate - 3. 
     The data is transferred
-
-  Time to do #3 is insignificant compared to #2+#1.
-
-  Time to move the disk head is proportional to head travel distance.
-
-  Time to wait for the plate to rotate depends on whether the disk head
-  was moved or not.
-
-  If disk head wasn't moved, the wait time is proportional to distance
-  between the previous block and the block we're reading.
-
-  If the head was moved, we don't know how much we'll need to wait for the
-  plate to rotate. We assume the wait time to be a variate with a mean of
-  0.5 of full rotation time.
-
-  Our cost units are "random disk seeks". The cost of random disk seek is
-  actually not a constant, it depends one range of cylinders we're going
-  to access. We make it constant by introducing a fuzzy concept of "typical
-  datafile length" (it's fuzzy as it's hard to tell whether it should
-  include index file, temp.tables etc). Then random seek cost is:
-
-    1 = half_rotation_cost + move_cost * 1/3 * typical_data_file_length
-
-  We define half_rotation_cost as DISK_SEEK_BASE_COST=0.9.
-
-  If handler::avg_io_cost() < 1.0, then we will trust the handler
-  when it comes to the average cost (this is for example true for HEAP).
-
  @param table       Table to be accessed
  @param nrows       Number of rows to retrieve
  @param interrupted TRUE <=> Assume that the disk sweep will be
@@ -2132,7 +2076,7 @@ void get_sweep_read_cost(TABLE *table, ha_rows nrows, bool interrupted,
  cost->reset();
#ifndef OLD_SWEEP_COST
  cost->cpu_cost= table->file->ha_rndpos_time(nrows);
-  cost->avg_io_cost= table->file->avg_io_cost();
+  cost->avg_io_cost= 0;
#else
  if (table->file->pk_is_clustering_key(table->s->primary_key))
  {
diff --git a/sql/opt_range.cc b/sql/opt_range.cc
index 21c2100c236..d7220c45325 100644
--- a/sql/opt_range.cc
+++ b/sql/opt_range.cc
@@ -5059,7 +5059,7 @@ static void dbug_print_singlepoint_range(SEL_ARG **start, uint num)
    get_sweep_read_cost()
      param                Parameter from test_quick_select
      records              # of records to be retrieved
-      add_time_for_compare If set, add cost of WHERE clause (WHERE_COMPARE_COST)
+      add_time_for_compare If set, add cost of WHERE clause (WHERE_COST)
  RETURN
    cost of sweep
*/
@@ -5071,7 +5071,7 @@ static double get_sweep_read_cost(const PARAM *param, ha_rows records,
#ifndef OLD_SWEEP_COST
  double cost= (param->table->file->ha_rndpos_time(records) +
                (add_time_for_compare ?
-                 records * param->thd->variables.optimizer_where_cmp_cost : 0));
+                 records * param->thd->variables.optimizer_where_cost : 0));
  DBUG_PRINT("return", ("cost: %g", cost));
  DBUG_RETURN(cost);
#else
@@ -5123,7 +5123,7 @@ static double get_sweep_read_cost(const PARAM *param, ha_rows records,
    */
    result= busy_blocks;
  }
-  result+= rows2double(n_rows) * ROW_COPY_COST_THD(param->table->thd);
+  result+= rows2double(n_rows) * param->table->file->ROW_COPY_COST;
  DBUG_PRINT("return",("cost: %g", result));
  DBUG_RETURN(result);
@@ -5345,7 +5345,7 @@ TABLE_READ_PLAN *get_best_disjunct_quick(PARAM *param, SEL_IMERGE *imerge,
  /* Calculate cost(rowid_to_row_scan) */
  {
-    /* imerge_cost already includes WHERE_COMPARE_COST */
+    /* imerge_cost already includes WHERE_COST */
    double sweep_cost= get_sweep_read_cost(param, non_cpk_scan_records, 0);
    imerge_cost+= sweep_cost;
    trace_best_disjunct.
@@ -5379,7 +5379,7 @@ TABLE_READ_PLAN *get_best_disjunct_quick(PARAM *param, SEL_IMERGE *imerge, } { - const double dup_removal_cost= Unique::get_use_cost( + const double dup_removal_cost= Unique::get_use_cost(thd, param->imerge_cost_buff, (uint)non_cpk_scan_records, param->table->file->ref_length, (size_t)param->thd->variables.sortbuff_size, @@ -5454,9 +5454,7 @@ skip_to_ror_scan: if ((*cur_child)->is_ror) { /* Ok, we have index_only cost, now get full rows scan cost */ - cost= param->table->file-> - ha_read_and_compare_time(param->real_keynr[(*cur_child)->key_idx], 1, - (*cur_child)->records); + cost= param->table->file->ha_rndpos_and_compare_time((*cur_child)->records); } else cost= read_time; @@ -6314,7 +6312,8 @@ double get_cpk_filter_cost(ha_rows filtered_records, */ static -bool check_index_intersect_extension(PARTIAL_INDEX_INTERSECT_INFO *curr, +bool check_index_intersect_extension(THD *thd, + PARTIAL_INDEX_INTERSECT_INFO *curr, INDEX_SCAN_INFO *ext_index_scan, PARTIAL_INDEX_INTERSECT_INFO *next) { @@ -6361,7 +6360,7 @@ bool check_index_intersect_extension(PARTIAL_INDEX_INTERSECT_INFO *curr, size_t max_memory_size= common_info->max_memory_size; records_sent_to_unique+= ext_index_scan_records; - cost= Unique::get_use_cost(buff_elems, (size_t) records_sent_to_unique, + cost= Unique::get_use_cost(thd, buff_elems, (size_t) records_sent_to_unique, key_size, max_memory_size, compare_factor, TRUE, &next->in_memory); @@ -6372,7 +6371,7 @@ bool check_index_intersect_extension(PARTIAL_INDEX_INTERSECT_INFO *curr, double cost2; bool in_memory2; ha_rows records2= records_sent_to_unique-records_filtered_out_by_cpk; - cost2= Unique::get_use_cost(buff_elems, (size_t) records2, key_size, + cost2= Unique::get_use_cost(thd, buff_elems, (size_t) records2, key_size, max_memory_size, compare_factor, TRUE, &in_memory2); cost2+= get_cpk_filter_cost(ext_index_scan_records, common_info->cpk_scan, @@ -6432,7 +6431,8 @@ bool check_index_intersect_extension(PARTIAL_INDEX_INTERSECT_INFO *curr, */ static -void find_index_intersect_best_extension(PARTIAL_INDEX_INTERSECT_INFO *curr) +void find_index_intersect_best_extension(THD *thd, + PARTIAL_INDEX_INTERSECT_INFO *curr) { PARTIAL_INDEX_INTERSECT_INFO next; COMMON_INDEX_INTERSECT_INFO *common_info= curr->common_info; @@ -6465,8 +6465,9 @@ void find_index_intersect_best_extension(PARTIAL_INDEX_INTERSECT_INFO *curr) { *rem_first_index_scan_ptr= *index_scan_ptr; *index_scan_ptr= rem_first_index_scan; - if (check_index_intersect_extension(curr, *rem_first_index_scan_ptr, &next)) - find_index_intersect_best_extension(&next); + if (check_index_intersect_extension(thd, curr, *rem_first_index_scan_ptr, + &next)) + find_index_intersect_best_extension(thd, &next); *index_scan_ptr= *rem_first_index_scan_ptr; *rem_first_index_scan_ptr= rem_first_index_scan; } @@ -6518,7 +6519,7 @@ TRP_INDEX_INTERSECT *get_best_index_intersect(PARAM *param, SEL_TREE *tree, read_time)) DBUG_RETURN(NULL); - find_index_intersect_best_extension(&init); + find_index_intersect_best_extension(thd, &init); if (common.best_length <= 1 && !common.best_uses_cpk) DBUG_RETURN(NULL); @@ -7358,7 +7359,7 @@ TRP_ROR_INTERSECT *get_best_ror_intersect(const PARAM *param, SEL_TREE *tree, Adjust row count and add the cost of comparing the final rows to the WHERE clause */ - cmp_cost= intersect_best->out_rows * thd->variables.optimizer_where_cmp_cost; + cmp_cost= intersect_best->out_rows * thd->variables.optimizer_where_cost; /* Ok, return ROR-intersect plan if we have found one */ TRP_ROR_INTERSECT *trp= NULL; @@ 
-15058,7 +15059,7 @@ void cost_group_min_max(TABLE* table, KEY *index_info, uint used_key_parts, const double cpu_cost= (num_groups * (tree_traversal_cost + - thd->variables.optimizer_where_cmp_cost)); + thd->variables.optimizer_where_cost)); *read_cost= io_cost + cpu_cost; *records= num_groups; diff --git a/sql/opt_subselect.cc b/sql/opt_subselect.cc index 9e92231d519..53fbefb3795 100644 --- a/sql/opt_subselect.cc +++ b/sql/opt_subselect.cc @@ -1455,8 +1455,8 @@ void get_delayed_table_estimates(TABLE *table, hash_sj_engine->tmp_table->s->reclength); /* Do like in handler::ha_scan_and_compare_time, but ignore the where cost */ - *scan_time= ((data_size/table->file->stats.block_size+2) * - table->file->avg_io_cost()) + *out_rows * file->ROW_COPY_COST; + *scan_time= ((data_size/IO_SIZE * table->file->avg_io_cost()) + + *out_rows * file->ROW_COPY_COST); } @@ -2573,19 +2573,17 @@ bool optimize_semijoin_nests(JOIN *join, table_map all_table_map) consider doing sjm-scan). See ha_scan_time() for the basics of the calculations. We don't need to check the where clause for each row, so no - WHERE_COMPARE_COST is needed. + WHERE_COST is needed. */ - scan_cost= (TABLE_SCAN_SETUP_COST + - (cost.block_size == 0 ? 0 : - ((rowlen * (double) sjm->rows) / cost.block_size + - TABLE_SCAN_SETUP_COST))); + scan_cost= (rowlen * (double) sjm->rows) / cost.block_size; total_cost= (scan_cost * cost.cache_hit_ratio * cost.avg_io_cost + + TABLE_SCAN_SETUP_COST + row_copy_cost * sjm->rows); sjm->scan_cost.convert_from_cost(total_cost); /* When reading a row, we have also to check the where clause */ sjm->lookup_cost.convert_from_cost(cost.lookup + - WHERE_COMPARE_COST_THD(thd)); + WHERE_COST_THD(thd)); sj_nest->sj_mat_info= sjm; DBUG_EXECUTE("opt", print_sjm(sjm);); } @@ -2690,8 +2688,12 @@ get_tmp_table_costs(THD *thd, double row_count, uint row_size, bool blobs_used, blobs_used) { /* Disk based table */ - cost.lookup= ((DISK_TEMPTABLE_LOOKUP_COST * + cost.lookup= ((DISK_TEMPTABLE_LOOKUP_COST(thd) * thd->optimizer_cache_hit_ratio)) + row_copy_cost; + /* + On write we have to first copy the row to the record buffer and then + to the storage engine, so 2 copies. 
+ */ cost.write= cost.lookup + row_copy_cost; cost.create= DISK_TEMPTABLE_CREATE_COST; cost.block_size= DISK_TEMPTABLE_BLOCK_SIZE; @@ -2704,8 +2706,8 @@ get_tmp_table_costs(THD *thd, double row_count, uint row_size, bool blobs_used, cost.lookup= HEAP_TEMPTABLE_LOOKUP_COST + row_copy_cost; cost.write= cost.lookup + row_copy_cost; cost.create= HEAP_TEMPTABLE_CREATE_COST; - cost.block_size= 0; - cost.avg_io_cost= HEAP_TEMPTABLE_LOOKUP_COST; + cost.block_size= 1; + cost.avg_io_cost= 0; cost.cache_hit_ratio= 1.0; } return cost; @@ -6731,7 +6733,7 @@ bool JOIN::choose_subquery_plan(table_map join_tables) double materialization_cost= COST_ADD(cost.create, COST_ADD(inner_read_time_1, - COST_MULT((cost.write + WHERE_COMPARE_COST_THD(thd)), + COST_MULT((cost.write + WHERE_COST_THD(thd)), inner_record_count_1))); materialize_strategy_cost= diff --git a/sql/rowid_filter.cc b/sql/rowid_filter.cc index 610947c4613..62a2812c901 100644 --- a/sql/rowid_filter.cc +++ b/sql/rowid_filter.cc @@ -125,7 +125,7 @@ void Range_rowid_filter_cost_info::init(Rowid_filter_container_type cont_type, key_no= idx; est_elements= (ulonglong) table->opt_range[key_no].rows; cost_of_building_range_filter= build_cost(container_type); - where_cmp_cost= tab->in_use->variables.optimizer_where_cmp_cost; + where_cmp_cost= tab->in_use->variables.optimizer_where_cost; index_next_find_cost= tab->in_use->variables.optimizer_index_next_find_cost; selectivity= est_elements/((double) table->stat_records()); gain= avg_access_and_eval_gain_per_row(container_type, @@ -534,7 +534,7 @@ TABLE::best_range_rowid_filter_for_partial_join(uint access_key_no, new_cost= (cost_of_accepted_rows + cost_of_rejected_rows + records * filter->lookup_cost()); new_total_cost= ((new_cost + new_records * - in_use->variables.optimizer_where_cmp_cost) * + in_use->variables.optimizer_where_cost) * prev_records + filter->get_setup_cost()); if (best_filter_gain > new_total_cost) diff --git a/sql/set_var.cc b/sql/set_var.cc index 8cb5fcd4870..e597c6017cd 100644 --- a/sql/set_var.cc +++ b/sql/set_var.cc @@ -310,7 +310,12 @@ do { \ case SHOW_HA_ROWS: do_num_val (ha_rows,CMD); #define case_for_double(CMD) \ - case SHOW_DOUBLE: do_num_val (double,CMD) + case SHOW_DOUBLE: do_num_val (double,CMD); \ + case SHOW_OPTIMIZER_COST: \ + { \ + double val= (*(double*) value) * 1000; \ + CMD; \ + } while (0) #define case_get_string_as_lex_string \ case SHOW_CHAR: \ diff --git a/sql/set_var.h b/sql/set_var.h index 570703a8222..38a395adf0f 100644 --- a/sql/set_var.h +++ b/sql/set_var.h @@ -84,7 +84,7 @@ protected: typedef bool (*on_update_function)(sys_var *self, THD *thd, enum_var_type type); int flags; ///< or'ed flag_enum values - const SHOW_TYPE show_val_type; ///< what value_ptr() returns for sql_show.cc + SHOW_TYPE show_val_type; ///< what value_ptr() returns for sql_show.cc PolyLock *guard; ///< *second* lock that protects the variable ptrdiff_t offset; ///< offset to the value from global_system_variables on_check_function on_check; diff --git a/sql/sql_class.h b/sql/sql_class.h index 03c7fb6a847..b0bbf1c7230 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -683,9 +683,13 @@ typedef struct system_variables ulonglong slave_skip_counter; ulonglong max_relay_log_size; - double optimizer_index_block_copy_cost, optimizer_index_next_find_cost; + double optimizer_cache_hit_ratio; // Stored in handler::optimizer_cache_cost + double optimizer_index_block_copy_cost; + double optimizer_row_next_find_cost, optimizer_index_next_find_cost; double optimizer_row_copy_cost, 
                                               optimizer_key_copy_cost;
-  double optimizer_where_cmp_cost, optimizer_key_cmp_cost;
+  double optimizer_where_cost, optimizer_key_cmp_cost;
+  double optimizer_row_lookup_cost, optimizer_index_lookup_cost;
+  double optimizer_scan_lookup_cost, optimizer_disk_read_cost;
  double long_query_time_double, max_statement_time_double;
  double sample_percentage;
@@ -783,7 +787,6 @@ typedef struct system_variables
  uint group_concat_max_len;
  uint eq_range_index_dive_limit;
-  uint optimizer_cache_hit_ratio;  // Stored in handler::optimizer_cache_cost
  uint idle_transaction_timeout;
  uint idle_readonly_transaction_timeout;
  uint idle_write_transaction_timeout;
@@ -857,6 +860,7 @@ typedef struct system_variables
  uint wsrep_sync_wait;
  vers_asof_timestamp_t vers_asof_timestamp;
+  uint64 optimizer_cost_version;
} SV;

/**
diff --git a/sql/sql_const.h b/sql/sql_const.h
index 7d79ac85cae..9016c54e25d 100644
--- a/sql/sql_const.h
+++ b/sql/sql_const.h
@@ -121,8 +121,9 @@
/*
  This is used when reading large blocks, sequential read.
-  We assume that reading this much will be the same cost as 1 seek / fetching
-  one row from the storage engine.
+  We assume that reading this much will be roughly the same cost as 1
+  seek / fetching one row from the storage engine.
+  Cost of one read of DISK_CHUNK_SIZE is DISK_SEEK_BASE_COST (ms).
*/
#define DISK_CHUNK_SIZE (uint) (65536) /* Size of diskbuffer for tmpfiles */
#define TMPFILE_CREATE_COST 2.0        /* Creating and deleting tmp file */
@@ -219,24 +220,34 @@ Note that avg_io_cost() is multipled with this constant!
*/
-#define DEFAULT_CACHE_HIT_RATIO 50
+#define DEFAULT_CACHE_HIT_RATIO 80

/* Convert ratio to cost */
-static inline double cache_hit_ratio(uint ratio)
+static inline double cache_hit_ratio(double ratio)
{
-  return (((double) (100 - ratio)) / 100.0);
+  return ((100.0 - ratio) / 100.0);
}

/*
-  Base cost for finding keys and rows from the engine is 1.0
-  All other costs should be proportional to these
+  All costs should be based on milliseconds (1 cost = 1 ms)
*/

/* Cost for finding the first key in a key scan */
-#define INDEX_LOOKUP_COST ((double) 1.0)
+#define DEFAULT_INDEX_LOOKUP_COST ((double) 0.0005)
+/* Modifier for reading a block when doing a table scan */
+#define DEFAULT_SCAN_LOOKUP_COST ((double) 1.0)
+
/* Cost of finding a key from a row_ID (not used for clustered keys) */
-#define ROW_LOOKUP_COST ((double) 1.0)
+#define DEFAULT_ROW_LOOKUP_COST ((double) 0.0005)
+#define ROW_LOOKUP_COST_THD(thd) ((thd)->variables.optimizer_row_lookup_cost)
+
+#define INDEX_LOOKUP_COST optimizer_index_lookup_cost
+#define ROW_LOOKUP_COST   optimizer_row_lookup_cost
+#define SCAN_LOOKUP_COST  optimizer_scan_lookup_cost
+
+/* Default fill factor of a (b-tree) index block */
+#define INDEX_BLOCK_FILL_FACTOR 0.75

/*
  These constants impact the cost of QSORT and priority queue sorting,
*/
#define QSORT_SORT_SLOWNESS_CORRECTION_FACTOR (0.1)
#define PQ_SORT_SLOWNESS_CORRECTION_FACTOR (0.1)
+
+/*
+  Creating a record from the join cache is faster than getting a row from
+  the engine. JOIN_CACHE_ROW_COPY_COST_FACTOR is the factor used to
+  take this into account. This is multiplied with ROW_COPY_COST.
+*/
+#define JOIN_CACHE_ROW_COPY_COST_FACTOR(thd) (0.75 * ROW_LOOKUP_COST_THD(thd))
+
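+/*
+  Editorial sanity check of the millisecond scale, using numbers from
+  Docs/optimizer_costs.txt: 1000000 rows * DEFAULT_WHERE_COST (3.2e-05 ms,
+  defined below) = 32 ms, close to the measured 118.7 - 83.7 = 35 ms
+  difference between the 'simple where' and 'no where' heap scans of 1M rows.
+*/
+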
This includes all mutexes + that need to be taken to get exclusive access to a page. + The number is taken from accessing existing blocks from the Aria page cache. + Used in handler::scan_time() and handler::keyread_time() */ -#define DEFAULT_INDEX_BLOCK_COPY_COST ((double) 1 / 5.0) -#define INDEX_BLOCK_COPY_COST(THD) ((THD)->variables.optimizer_index_block_copy_cost) +#define DEFAULT_INDEX_BLOCK_COPY_COST ((double) 3.56e-05) +#define INDEX_BLOCK_COPY_COST optimizer_index_block_copy_cost /* Cost of finding the next row during table scan and copying it to @@ -269,18 +290,11 @@ static inline double cache_hit_ratio(uint ratio) too big then MariaDB will use key lookup even when table scan is better. */ -#define DEFAULT_ROW_COPY_COST ((double) 1.0 / 20.0) +#define DEFAULT_ROW_COPY_COST ((double) 2.334e-06) #define ROW_COPY_COST optimizer_row_copy_cost #define ROW_COPY_COST_THD(THD) ((THD)->variables.optimizer_row_copy_cost) /* - Creating a record from the join cache is faster than getting a row from - the engine. JOIN_CACHE_ROW_COPY_COST_FACTOR is the factor used to - take this into account. This is multiplied with ROW_COPY_COST. -*/ -#define JOIN_CACHE_ROW_COPY_COST_FACTOR 0.75 - -/* Cost of finding the next key during index scan and copying it to 'table->record' @@ -288,7 +302,7 @@ static inline double cache_hit_ratio(uint ratio) as with table scans there are no key reads (INDEX_LOOKUP_COST) and fewer disk reads. */ -#define DEFAULT_KEY_COPY_COST ((double) 1.0 / 40.0) +#define DEFAULT_KEY_COPY_COST DEFAULT_ROW_COPY_COST/5 #define KEY_COPY_COST optimizer_key_copy_cost #define KEY_COPY_COST_THD(THD) ((THD)->variables.optimizer_key_copy_cost) @@ -297,34 +311,38 @@ This cost is very low as it's done inside the storage engine. Should be smaller than KEY_COPY_COST. */ -#define DEFAULT_INDEX_NEXT_FIND_COST ((double) 1.0 / 80.0) -#define INDEX_NEXT_FIND_COST optimizer_next_find_cost +#define DEFAULT_INDEX_NEXT_FIND_COST DEFAULT_KEY_COPY_COST/10 +#define INDEX_NEXT_FIND_COST optimizer_index_next_find_cost +/* Cost of finding the next row when scanning a table */ +#define DEFAULT_ROW_NEXT_FIND_COST DEFAULT_INDEX_NEXT_FIND_COST +#define ROW_NEXT_FIND_COST optimizer_row_next_find_cost /** The following is used to decide if MariaDB should use table scanning instead of reading with keys. The number says how many evaluations of the WHERE clause are comparable to reading one extra row from a table. */ -#define DEFAULT_WHERE_COMPARE_COST (1 / 5.0) -#define WHERE_COMPARE_COST optimizer_where_cmp_cost -#define WHERE_COMPARE_COST_THD(THD) ((THD)->variables.optimizer_where_cmp_cost) +#define DEFAULT_WHERE_COST ((double) 3.2e-05) +#define WHERE_COST optimizer_where_cost +#define WHERE_COST_THD(THD) ((THD)->variables.optimizer_where_cost) -#define DEFAULT_KEY_COMPARE_COST (1 / 20.0) +/* The cost of comparing a key when using range access */ +#define DEFAULT_KEY_COMPARE_COST DEFAULT_WHERE_COST/4 #define KEY_COMPARE_COST optimizer_key_cmp_cost /* - Cost of comparing two rowids. This is set relative to WHERE_COMPARE_COST + Cost of comparing two rowids. This is set relative to KEY_COMPARE_COST */ -#define ROWID_COMPARE_COST (WHERE_COMPARE_COST / 100.0) -#define ROWID_COMPARE_COST_THD(THD) ((THD)->variables.optimizer_where_cmp_cost / 100.0) +#define ROWID_COMPARE_COST (KEY_COMPARE_COST/2) +#define ROWID_COMPARE_COST_THD(THD) ((THD)->variables.optimizer_key_cmp_cost / 2) /* Setup cost for different operations */ /* Extra cost for doing a range scan.
Used to prefer 'ref' over range */ -#define MULTI_RANGE_READ_SETUP_COST (double) (1.0 / 50.0) +#define MULTI_RANGE_READ_SETUP_COST ((double) 0.2) /* These costs are mainly to handle small tables, like the one we have in the @@ -348,21 +366,27 @@ static inline double cache_hit_ratio(uint ratio) test suite would vary depending on floating point calculations done in different paths. */ -#define COST_EPS 0.0001 +#define COST_EPS 0.0000001 /* - For sequential disk seeks the cost formula is: + Average disk seek time on a hard disk is 8-10 ms, which is also + about the time to read an IO_SIZE (8192) block. + + A medium SSD reads about 400MB/second, which gives the time for reading + an IO_SIZE block as IO_SIZE/400000000 = 0.0000205 sec = 0.02 ms. + + For sequential hard disk seeks the cost formula is: DISK_SEEK_BASE_COST + DISK_SEEK_PROP_COST * #blocks_to_skip The cost of average seek - DISK_SEEK_BASE_COST + DISK_SEEK_PROP_COST*BLOCKS_IN_AVG_SEEK =1.0. + DISK_SEEK_BASE_COST + DISK_SEEK_PROP_COST*BLOCKS_IN_AVG_SEEK = 10. */ -#define DISK_SEEK_BASE_COST ((double)0.9) - -#define BLOCKS_IN_AVG_SEEK 128 - -#define DISK_SEEK_PROP_COST ((double)0.1/BLOCKS_IN_AVG_SEEK) +#define DEFAULT_DISK_READ_COST ((double) IO_SIZE / 400000000.0 * 1000) +#define DISK_READ_COST optimizer_disk_read_cost +#define DISK_READ_COST_THD(thd) (thd)->variables.DISK_READ_COST +#define BLOCKS_IN_AVG_SEEK 1 +/* #define DISK_SEEK_PROP_COST ((double)1/BLOCKS_IN_AVG_SEEK) */ /** Number of rows in a reference table when referred to through a non-unique key. @@ -375,11 +399,11 @@ static inline double cache_hit_ratio(uint ratio) Subquery materialization-related constants */ /* This should match ha_heap::read_time() */ -#define HEAP_TEMPTABLE_LOOKUP_COST 0.05 +#define HEAP_TEMPTABLE_LOOKUP_COST 1.91e-4 /* see ha_heap.h */ #define HEAP_TEMPTABLE_CREATE_COST 1.0 -#define DISK_TEMPTABLE_LOOKUP_COST 1.0 +#define DISK_TEMPTABLE_LOOKUP_COST(thd) DISK_READ_COST_THD(thd) #define DISK_TEMPTABLE_CREATE_COST TMPFILE_CREATE_COST*2 /* 2 tmp tables */ -#define DISK_TEMPTABLE_BLOCK_SIZE 8192 +#define DISK_TEMPTABLE_BLOCK_SIZE IO_SIZE #define SORT_INDEX_CMP_COST 0.02 diff --git a/sql/sql_explain.cc b/sql/sql_explain.cc index 616a545aa0e..297b587e005 100644 --- a/sql/sql_explain.cc +++ b/sql/sql_explain.cc @@ -1885,6 +1885,9 @@ void Explain_table_access::print_explain_json(Explain_query *query, writer->add_double(jbuf_tracker.get_filtered_after_where()*100.0); else writer->add_null(); + + writer->add_member("r_unpack_time_ms"); + writer->add_double(jbuf_unpack_tracker.get_time_ms()); } } diff --git a/sql/sql_explain.h b/sql/sql_explain.h index bfd52290374..823a94a64f0 100644 --- a/sql/sql_explain.h +++ b/sql/sql_explain.h @@ -722,7 +722,7 @@ public: class Explain_table_access : public Sql_alloc { public: - Explain_table_access(MEM_ROOT *root) : + Explain_table_access(MEM_ROOT *root, bool timed) : derived_select_number(0), non_merged_sjm_number(0), extra_tags(root), @@ -735,6 +735,7 @@ public: pushed_index_cond(NULL), sjm_nest(NULL), pre_join_sort(NULL), + jbuf_unpack_tracker(timed), rowid_filter(NULL) {} ~Explain_table_access() { delete sjm_nest; } @@ -843,6 +844,7 @@ public: Gap_time_tracker extra_time_tracker; Table_access_tracker jbuf_tracker; + Time_and_counter_tracker jbuf_unpack_tracker; Explain_rowid_filter *rowid_filter; diff --git a/sql/sql_join_cache.cc b/sql/sql_join_cache.cc index 1347e38753d..5fcfe0e0e0a 100644 --- a/sql/sql_join_cache.cc +++ b/sql/sql_join_cache.cc @@ -1600,6 +1600,7 @@ bool JOIN_CACHE::put_record() bool JOIN_CACHE::get_record()
{ bool res; + ANALYZE_START_TRACKING(thd(), join_tab->jbuf_unpack_tracker); uchar *prev_rec_ptr= 0; if (with_length) pos+= size_of_rec_len; @@ -1615,6 +1616,7 @@ bool JOIN_CACHE::get_record() if (prev_cache) prev_cache->get_record_by_pos(prev_rec_ptr); } + ANALYZE_STOP_TRACKING(thd(), join_tab->jbuf_unpack_tracker); return res; } diff --git a/sql/sql_plugin.h b/sql/sql_plugin.h index d4df8c6468f..df5cd37c3c6 100644 --- a/sql/sql_plugin.h +++ b/sql/sql_plugin.h @@ -24,6 +24,7 @@ #define SHOW_always_last SHOW_KEY_CACHE_LONG, \ SHOW_HAVE, SHOW_MY_BOOL, SHOW_HA_ROWS, SHOW_SYS, \ SHOW_LONG_NOFLUSH, SHOW_LEX_STRING, SHOW_ATOMIC_COUNTER_UINT32_T, \ + SHOW_OPTIMIZER_COST, \ /* SHOW_*_STATUS must be at the end, SHOW_LONG_STATUS being first */ \ SHOW_LONG_STATUS, SHOW_DOUBLE_STATUS, SHOW_LONGLONG_STATUS, \ SHOW_UINT32_STATUS diff --git a/sql/sql_select.cc b/sql/sql_select.cc index 50cf4cfb72f..9ec6d0696ed 100644 --- a/sql/sql_select.cc +++ b/sql/sql_select.cc @@ -101,13 +101,6 @@ #define double_to_rows(A) ((A) >= HA_POS_ERROR ? HA_POS_ERROR : (ha_rows) (A)) -/* Cost for reading a row through an index */ -struct INDEX_READ_COST -{ - double read_cost; - double index_only_cost; -}; - const char *join_type_str[]={ "UNKNOWN","system","const","eq_ref","ref", "MAYBE_REF","ALL","range","index","fulltext", "ref_or_null","unique_subquery","index_subquery", @@ -7706,7 +7699,7 @@ static double matching_candidates_in_table(JOIN_TAB *s, KEY_COPY_COST as for filtering there is no copying of non-accepted keys. - WHERE_COMPARE_COST cost is not added to any result. + WHERE_COST cost is not added to any result. */ INDEX_READ_COST cost_for_index_read(const THD *thd, const TABLE *table, @@ -7724,32 +7717,26 @@ INDEX_READ_COST cost_for_index_read(const THD *thd, const TABLE *table, #endif if (file->is_clustering_key(key)) { - cost.index_only_cost= file->ha_read_time(key, 1, rows_adjusted); + cost.index_only_cost= file->ha_keyread_time(key, 1, rows_adjusted, 0); /* Same computation as in ha_read_and_copy_time() We do it explicitly here as we want to use the original value of records to compute the record copy cost. */ cost.read_cost= (cost.index_only_cost + - rows2double(records) * ROW_COPY_COST_THD(thd)); + rows2double(records) * file->ROW_COPY_COST); } else if (table->covering_keys.is_set(key) && !table->no_keyread) { - cost.index_only_cost= file->ha_keyread_time(key, 1, rows_adjusted); + cost.index_only_cost= file->ha_keyread_time(key, 1, rows_adjusted, 0); /* Same computation as in ha_keyread_and_copy_time() */ cost.read_cost= (cost.index_only_cost + - rows2double(records) * KEY_COPY_COST_THD(thd)); + rows2double(records) * file->KEY_COPY_COST); } else { - cost.index_only_cost= file->ha_keyread_time(key, 1, rows_adjusted); - /* - Note that ha_read_time() + ..ROW_COPY_COST should be same - as ha_rndpos_time(). - */ - cost.read_cost= (cost.index_only_cost + - file->ha_read_time(key, 0, rows_adjusted) + - rows2double(records) * ROW_COPY_COST_THD(thd)); + cost.index_only_cost= file->ha_keyread_time(key, 1, rows_adjusted, 0); + cost.read_cost= (cost.index_only_cost + file->ha_rndpos_time(records)); } DBUG_PRINT("statistics", ("index_cost: %.3f full_cost: %.3f", cost.index_only_cost, cost.read_cost)); @@ -7763,7 +7750,7 @@ INDEX_READ_COST cost_for_index_read(const THD *thd, const TABLE *table, @param thd Thread handler @param table Table @param cost Pointer to cost for *records_arg rows, not including - WHERE_COMPARE_COST cost. + WHERE_COST cost. Will be updated to new cost if filter is used.
@param records_arg Pointer to number of records for the current key. Will be updated to records after filter, if filter is @@ -7818,14 +7805,14 @@ apply_filter(THD *thd, TABLE *table, double *cost, double *records_arg, read even if selectivity (and thus new_records) would be very low. */ new_cost= (MY_MAX(cost_of_accepted_rows, - ranges * INDEX_LOOKUP_COST * io_cost * - table->file->optimizer_cache_cost) + + ranges * table->file->INDEX_LOOKUP_COST + + ranges * io_cost * table->file->optimizer_cache_cost) + cost_of_rejected_rows + filter_lookup_cost); - new_total_cost= ((new_cost + new_records * WHERE_COMPARE_COST_THD(thd)) * + new_total_cost= ((new_cost + new_records * WHERE_COST_THD(thd)) * prev_records + filter_startup_cost); DBUG_ASSERT(new_cost >= 0 && new_records >= 0); - use_filter= ((*cost + records * WHERE_COMPARE_COST_THD(thd)) * prev_records > + use_filter= ((*cost + records * WHERE_COST_THD(thd)) * prev_records > new_total_cost); if (unlikely(thd->trace_started())) @@ -8481,7 +8468,7 @@ best_access_path(JOIN *join, tmp, index_only_cost, 1, record_count); } - tmp= COST_ADD(tmp, records_after_filter * WHERE_COMPARE_COST_THD(thd)); + tmp= COST_ADD(tmp, records_after_filter * WHERE_COST_THD(thd)); tmp= COST_MULT(tmp, record_count); tmp= COST_ADD(tmp, startup_cost); if (unlikely(trace_access_idx.trace_started())) @@ -8595,8 +8582,8 @@ best_access_path(JOIN *join, row combinations, only a HASH_FANOUT (10%) rows in the cache. */ cmp_time= (rnd_records * record_count * HASH_FANOUT * - (ROW_COPY_COST_THD(thd) * JOIN_CACHE_ROW_COPY_COST_FACTOR + - WHERE_COMPARE_COST_THD(thd))); + (ROW_COPY_COST_THD(thd) * JOIN_CACHE_ROW_COPY_COST_FACTOR(thd) + + WHERE_COST_THD(thd))); tmp= COST_ADD(tmp, cmp_time); best_cost= tmp; @@ -8682,7 +8669,7 @@ best_access_path(JOIN *join, - skip rows which do not satisfy WHERE constraints Note that s->quick->read_time includes the cost of comparing - the row with the where clause (WHERE_COMPARE_COST) + the row with the where clause (WHERE_COST) TODO: We take into account possible use of join cache for ALL/index @@ -8732,9 +8719,9 @@ best_access_path(JOIN *join, if (filter) { tmp= filter_cost; - /* Filter returns cost without WHERE_COMPARE_COST */ + /* Filter returns cost without WHERE_COST */ tmp= COST_ADD(tmp, records_after_filter * - WHERE_COMPARE_COST_THD(thd)); + WHERE_COST_THD(thd)); tmp= COST_MULT(tmp, record_count); tmp= COST_ADD(tmp, startup_cost); startup_cost= 0; // Avoid adding it later @@ -8764,17 +8751,28 @@ best_access_path(JOIN *join, /* Estimate cost of reading table. */ if (table->force_index && !best_key) { - INDEX_READ_COST cost= cost_for_index_read(thd, table, s->ref.key, - s->records, - s->worst_seeks); /* The query is using 'force_index' and we did not find a usable key. Calculate cost of a table scan with the forced index.
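      If there is a usable covering key we reuse the scan-and-compare time cached by estimate_scan_time(); otherwise any index is as good as any other for a full index scan, so we cost the shortest usable key.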
*/ + if (s->cached_covering_key != MAX_KEY) + { + /* Use value from estimate_scan_time */ + tmp= s->cached_scan_and_compare_time; + } + else + { + /* No cached key, use shortest allowed key */ + uint shortest_key= find_shortest_key(table, + &table->keys_in_use_for_query); + INDEX_READ_COST cost= cost_for_index_read(thd, table, shortest_key, + s->records, + s->worst_seeks); + tmp= cost.read_cost; + /* Calculate cost of checking the attached WHERE */ + tmp= COST_ADD(cost.read_cost, s->records * WHERE_COST_THD(thd)); + } type= JT_NEXT; - tmp= cost.read_cost; - /* Calculate cost of checking the attached WHERE */ - tmp= COST_ADD(cost.read_cost, s->records * WHERE_COMPARE_COST_THD(thd)); } else // table scan { @@ -8827,8 +8825,8 @@ best_access_path(JOIN *join, */ cmp_time= (rnd_records * record_count * (ROW_COPY_COST_THD(thd) * (idx - join->const_tables) * - JOIN_CACHE_ROW_COPY_COST_FACTOR + - WHERE_COMPARE_COST_THD(thd))); + JOIN_CACHE_ROW_COPY_COST_FACTOR(thd) + + WHERE_COST_THD(thd))); tmp= COST_ADD(tmp, cmp_time); } } @@ -9428,7 +9426,7 @@ optimize_straight_join(JOIN *join, table_map remaining_tables) /* Compute the cost of the new plan extended with 's' */ record_count= COST_MULT(record_count, position->records_read); - read_time+= COST_ADD(read_time, position->read_time); + read_time= COST_ADD(read_time, position->read_time); optimize_semi_joins(join, remaining_tables, idx, &record_count, &read_time, &loose_scan_pos); remaining_tables&= ~(s->table->map); @@ -10546,7 +10544,9 @@ best_extension_by_limited_search(JOIN *join, Hence it may be wrong. */ trace_one_table.add("cost_for_sorting", current_record_count); - current_read_time= COST_ADD(current_read_time, current_record_count); + current_read_time= COST_ADD(current_read_time, + current_record_count * + DISK_TEMPTABLE_LOOKUP_COST(thd)); } if (current_read_time < join->best_read) { @@ -14450,14 +14450,14 @@ void JOIN_TAB::cleanup() /** Estimate the time to get rows of the joined table - Updates found_records, records, cached_scan_time, cached_covering_key, - read_time and cache_scan_and_compare_time + Updates found_records, records, cached_covering_key, read_time and + cached_scan_and_compare_time */ void JOIN_TAB::estimate_scan_time() { THD *thd= join->thd; - double copy_cost= ROW_COPY_COST_THD(thd); + double copy_cost; cached_covering_key= MAX_KEY; if (table->is_created()) @@ -14468,6 +14468,7 @@ void JOIN_TAB::estimate_scan_time() &startup_cost); table->opt_range_condition_rows= records; table->used_stat_records= records; + copy_cost= ROW_COPY_COST_THD(thd); } else { @@ -14482,10 +14483,13 @@ void JOIN_TAB::estimate_scan_time() { cached_covering_key= find_shortest_key(table, &table->covering_keys); read_time= table->file->ha_key_scan_time(cached_covering_key); - copy_cost= KEY_COPY_COST_THD(thd); + copy_cost= table->file->KEY_COPY_COST; } else + { read_time= table->file->ha_scan_time(); + copy_cost= table->file->ROW_COPY_COST; + } } } else { @@ -14493,11 +14497,11 @@ records= table->stat_records(); DBUG_ASSERT(table->opt_range_condition_rows == records); read_time= records ? (double) records: 10.0;// TODO:fix this stub + copy_cost= ROW_COPY_COST_THD(thd); } found_records= records; - cached_scan_time= read_time; cached_scan_and_compare_time= (read_time + records * - (copy_cost + WHERE_COMPARE_COST_THD(thd))); + (copy_cost + WHERE_COST_THD(thd))); } @@ -27650,6 +27654,7 @@ bool JOIN_TAB::save_explain_data(Explain_table_access *eta, // psergey-todo: data for filtering!
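+  /* The trackers below are used by ANALYZE; jbuf_unpack_tracker feeds the new r_unpack_time_ms value shown by ANALYZE FORMAT=JSON */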
tracker= &eta->tracker; jbuf_tracker= &eta->jbuf_tracker; + jbuf_unpack_tracker= &eta->jbuf_unpack_tracker; /* Enable the table access time tracker only for "ANALYZE stmt" */ if (thd->lex->analyze_stmt) @@ -28286,9 +28291,9 @@ int JOIN::save_explain_data_intern(Explain_query *output, continue; } - Explain_table_access *eta= (new (output->mem_root) - Explain_table_access(output->mem_root)); + Explain_table_access(output->mem_root, + thd->lex->analyze_stmt)); if (!eta) DBUG_RETURN(1); @@ -29406,7 +29411,7 @@ static bool get_range_limit_read_cost(const POSITION *pos, range_cost*= rows_limit / best_rows; range_rows= rows_limit; } - *read_cost= range_cost + range_rows * WHERE_COMPARE_COST_THD(table->in_use); + *read_cost= range_cost + range_rows * WHERE_COST_THD(table->in_use); *read_rows= range_rows; return 1; } @@ -29430,7 +29435,7 @@ (ha_rows) pos->table->worst_seeks : HA_ROWS_MAX); *read_cost= (cost.read_cost + - rows_to_scan * WHERE_COMPARE_COST_THD(table->in_use)); + rows_to_scan * WHERE_COST_THD(table->in_use)); *read_rows= rows_to_scan; return 0; } diff --git a/sql/sql_select.h b/sql/sql_select.h index 0fae9589335..e99ca23d5eb 100644 --- a/sql/sql_select.h +++ b/sql/sql_select.h @@ -309,6 +309,7 @@ typedef struct st_join_table { Table_access_tracker *tracker; Table_access_tracker *jbuf_tracker; + Time_and_counter_tracker *jbuf_unpack_tracker; // READ_RECORD::Setup_func materialize_table; READ_RECORD::Setup_func read_first_record; @@ -356,7 +357,6 @@ typedef struct st_join_table { double partial_join_cardinality; /* set by estimate_scan_time() */ - double cached_scan_time; double cached_scan_and_compare_time; /* diff --git a/sql/sql_show.cc b/sql/sql_show.cc index a3155349fe9..00dc787c5e6 100644 --- a/sql/sql_show.cc +++ b/sql/sql_show.cc @@ -3637,6 +3637,9 @@ const char* get_one_variable(THD *thd, /* 6 is the default precision for '%f' in sprintf() */ end= buff + my_fcvt(*value.as_double, 6, buff, NULL); break; + case SHOW_OPTIMIZER_COST: // Stored in ms, displayed in us + end= buff + my_fcvt(*value.as_double*1000, 6, buff, NULL); + break; case SHOW_LONG_STATUS: value.as_char= status_var_value.as_char + value.as_intptr; /* fall through */ diff --git a/sql/sys_vars.cc b/sql/sys_vars.cc index 97c2d1c5796..0e86b070c6f 100644 --- a/sql/sys_vars.cc +++ b/sql/sys_vars.cc @@ -6817,6 +6817,25 @@ static Sys_var_ulong Sys_optimizer_max_sel_arg_weight( SESSION_VAR(optimizer_max_sel_arg_weight), CMD_LINE(REQUIRED_ARG), VALID_RANGE(0, ULONG_MAX), DEFAULT(SEL_ARG::MAX_WEIGHT), BLOCK_SIZE(1)); + +static ulonglong optimizer_cost_version; + +/* + Ensure that we have a unique id whenever the costs are changed. + This ensures that handler::set_optimizer_costs() is called + for each new table. +*/ + +static bool update_optimizer_cost_version(sys_var *self, THD *thd, + enum_var_type type) +{ + if (type == OPT_GLOBAL) + global_system_variables.optimizer_cost_version= ++optimizer_cost_version; + else + thd->variables.optimizer_cost_version= ++optimizer_cost_version; + return 0; +} + /* We don't allow 100 for optimizer_cache_cost as there is always a small cost of finding the key, on cached pages, that we have to take into account.
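   For example, with the default optimizer_cache_hit_ratio of 80, engine io costs are multiplied by cache_hit_ratio(80) = (100.0 - 80) / 100.0 = 0.2.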
@@ -6828,57 +6847,94 @@ static bool update_optimizer_cache_hit_ratio(sys_var *self, THD *thd, if (type == OPT_SESSION) thd->optimizer_cache_hit_ratio= cache_hit_ratio(thd->variables.optimizer_cache_hit_ratio); + update_optimizer_cost_version(self, thd, type); return 0; } -static Sys_var_uint Sys_optimizer_cache_hit_ratio( + +static Sys_var_double Sys_optimizer_cache_hit_ratio( "optimizer_cache_hit_ratio", "Expected hit rate of the row and index cache in storage engines. " - "The value should be an integer between 0 and 99, where 0 means cache is " - "empty and 99 means that value is almost always in the cache", + "The value should be a number between 0 and 100, where 0 means cache is " + "empty and 100 means that value is almost always in the cache", SESSION_VAR(optimizer_cache_hit_ratio), CMD_LINE(REQUIRED_ARG), - VALID_RANGE(0, 99), DEFAULT(DEFAULT_CACHE_HIT_RATIO), 1, NO_MUTEX_GUARD, + VALID_RANGE(0, 100), DEFAULT(DEFAULT_CACHE_HIT_RATIO), NO_MUTEX_GUARD, NOT_IN_BINLOG, ON_CHECK(0), ON_UPDATE(update_optimizer_cache_hit_ratio)); -static Sys_var_double Sys_optimizer_key_copy_cost( +static Sys_var_double Sys_optimizer_scan_lookup_cost( + "optimizer_scan_lookup_cost", + "Cost modifier for reading a block when scanning a table", + SESSION_VAR(optimizer_scan_lookup_cost), CMD_LINE(REQUIRED_ARG), + VALID_RANGE(0, 10), DEFAULT(DEFAULT_SCAN_LOOKUP_COST), NO_MUTEX_GUARD, + NOT_IN_BINLOG, ON_CHECK(0), ON_UPDATE(update_optimizer_cost_version)); + +static Sys_var_optimizer_cost Sys_optimizer_row_lookup_cost( + "optimizer_row_lookup_cost", + "Cost of finding a row based on a rowid or a clustered key", + SESSION_VAR(optimizer_row_lookup_cost), CMD_LINE(REQUIRED_ARG), + VALID_RANGE(0, 10), DEFAULT(DEFAULT_ROW_LOOKUP_COST), NO_MUTEX_GUARD, + NOT_IN_BINLOG, ON_CHECK(0), ON_UPDATE(update_optimizer_cost_version)); + +static Sys_var_optimizer_cost Sys_optimizer_index_lookup_cost( + "optimizer_index_lookup_cost", + "Cost modifier for finding a row based on a non-clustered key", + SESSION_VAR(optimizer_index_lookup_cost), CMD_LINE(REQUIRED_ARG), + VALID_RANGE(0, 10), DEFAULT(DEFAULT_INDEX_LOOKUP_COST), NO_MUTEX_GUARD, + NOT_IN_BINLOG, ON_CHECK(0), ON_UPDATE(update_optimizer_cost_version)); + +static Sys_var_optimizer_cost Sys_optimizer_disk_read_cost( + "optimizer_disk_read_cost", + "Cost of reading a block of IO_SIZE bytes from disk (in ms)", + SESSION_VAR(optimizer_disk_read_cost), CMD_LINE(REQUIRED_ARG), + VALID_RANGE(0, 10), DEFAULT(DEFAULT_DISK_READ_COST), NO_MUTEX_GUARD, + NOT_IN_BINLOG, ON_CHECK(0), ON_UPDATE(update_optimizer_cost_version)); + +static Sys_var_optimizer_cost Sys_optimizer_key_copy_cost( "optimizer_key_copy_cost", "Cost of finding the next key in the engine and copying it to the SQL layer", SESSION_VAR(optimizer_key_copy_cost), CMD_LINE(REQUIRED_ARG), VALID_RANGE(0, 1), DEFAULT(DEFAULT_KEY_COPY_COST), NO_MUTEX_GUARD, - NOT_IN_BINLOG); + NOT_IN_BINLOG, ON_CHECK(0), ON_UPDATE(update_optimizer_cost_version)); -static Sys_var_double Sys_optimizer_index_block_copy_cost( +static Sys_var_optimizer_cost Sys_optimizer_index_block_copy_cost( "optimizer_index_block_copy_cost", "Cost of copying a key block from the cache to internal storage as part of an " "index scan", SESSION_VAR(optimizer_index_block_copy_cost), CMD_LINE(REQUIRED_ARG), VALID_RANGE(0, 1), DEFAULT(DEFAULT_INDEX_BLOCK_COPY_COST), NO_MUTEX_GUARD, - NOT_IN_BINLOG); + NOT_IN_BINLOG, ON_CHECK(0), ON_UPDATE(update_optimizer_cost_version)); + +static Sys_var_optimizer_cost Sys_optimizer_row_next_find_cost( +
"optimizer_row_next_find_cost", + "Cost of finding the next row when scanning the table", + SESSION_VAR(optimizer_row_next_find_cost), CMD_LINE(REQUIRED_ARG), + VALID_RANGE(0, 1), DEFAULT(DEFAULT_ROW_NEXT_FIND_COST), NO_MUTEX_GUARD, + NOT_IN_BINLOG, ON_CHECK(0), ON_UPDATE(update_optimizer_cost_version)); -static Sys_var_double Sys_optimizer_index_next_find_cost( +static Sys_var_optimizer_cost Sys_optimizer_index_next_find_cost( "optimizer_index_next_find_cost", "Cost of finding the next key and rowid when using filters", SESSION_VAR(optimizer_index_next_find_cost), CMD_LINE(REQUIRED_ARG), VALID_RANGE(0, 1), DEFAULT(DEFAULT_INDEX_NEXT_FIND_COST), NO_MUTEX_GUARD, - NOT_IN_BINLOG); + NOT_IN_BINLOG, ON_CHECK(0), ON_UPDATE(update_optimizer_cost_version)); -static Sys_var_double Sys_optimizer_row_copy_cost( +static Sys_var_optimizer_cost Sys_optimizer_row_copy_cost( "optimizer_row_copy_cost", "Cost of copying a row from the engine or the join cache to the SQL layer.", SESSION_VAR(optimizer_row_copy_cost), CMD_LINE(REQUIRED_ARG), VALID_RANGE(0, 1), DEFAULT(DEFAULT_ROW_COPY_COST), NO_MUTEX_GUARD, - NOT_IN_BINLOG); + NOT_IN_BINLOG, ON_CHECK(0), ON_UPDATE(update_optimizer_cost_version)); -static Sys_var_double Sys_optimizer_where_cmp_cost( - "optimizer_where_compare_cost", +static Sys_var_optimizer_cost Sys_optimizer_where_cost( + "optimizer_where_cost", "Cost of checking the row against the WHERE clause", - SESSION_VAR(optimizer_where_cmp_cost), CMD_LINE(REQUIRED_ARG), - VALID_RANGE(0, 1), DEFAULT(DEFAULT_WHERE_COMPARE_COST), NO_MUTEX_GUARD, - NOT_IN_BINLOG); + SESSION_VAR(optimizer_where_cost), CMD_LINE(REQUIRED_ARG), + VALID_RANGE(0, 1), DEFAULT(DEFAULT_WHERE_COST), NO_MUTEX_GUARD, + NOT_IN_BINLOG, ON_CHECK(0), ON_UPDATE(update_optimizer_cost_version)); -static Sys_var_double Sys_optimizer_key_cmp_cost( +static Sys_var_optimizer_cost Sys_optimizer_key_cmp_cost( "optimizer_key_compare_cost", "Cost of checking a key against the end key condition", SESSION_VAR(optimizer_key_cmp_cost), CMD_LINE(REQUIRED_ARG), VALID_RANGE(0, 1), DEFAULT(DEFAULT_KEY_COMPARE_COST), NO_MUTEX_GUARD, - NOT_IN_BINLOG); + NOT_IN_BINLOG, ON_CHECK(0), ON_UPDATE(update_optimizer_cost_version)); diff --git a/sql/sys_vars.inl b/sql/sys_vars.inl index 97e3a28b67e..07c7093fbf0 100644 --- a/sql/sys_vars.inl +++ b/sql/sys_vars.inl @@ -1228,6 +1228,50 @@ public: { var->save_result.double_value= getopt_ulonglong2double(option.def_value); } }; + +/* + Optimizer costs + Stored as cost factor (1 cost = 1 ms).
+ Given and displayed as microseconds (as most values are very small); + for example, SET GLOBAL optimizer_where_cost=0.032 (microseconds) is + stored as 3.2e-05 (ms). +*/ + +class Sys_var_optimizer_cost: public Sys_var_double +{ +public: + Sys_var_optimizer_cost(const char *name_arg, + const char *comment, int flag_args, ptrdiff_t off, size_t size, + CMD_LINE getopt, + double min_val, double max_val, double def_val, PolyLock *lock=0, + enum binlog_status_enum binlog_status_arg=VARIABLE_NOT_IN_BINLOG, + on_check_function on_check_func=0, + on_update_function on_update_func=0, + const char *substitute=0) + :Sys_var_double(name_arg, comment, flag_args, off, size, getopt, + min_val, max_val, def_val, lock, + binlog_status_arg, + on_check_func, + on_update_func, + substitute) + { + show_val_type= SHOW_OPTIMIZER_COST; + global_var(double)= (double)option.def_value/1000; // To us + } + bool session_update(THD *thd, set_var *var) + { + session_var(thd, double)= var->save_result.double_value/1000; + return false; + } + bool global_update(THD *thd, set_var *var) + { + global_var(double)= var->save_result.double_value/1000; + return false; + } + void global_save_default(THD *thd, set_var *var) + { + var->save_result.double_value= getopt_ulonglong2double(option.def_value)*1000; } +}; + + /** The class for the @max_user_connections. It's derived from Sys_var_uint, but non-standard session value diff --git a/sql/uniques.cc b/sql/uniques.cc index a09655bcaca..de7825eb061 100644 --- a/sql/uniques.cc +++ b/sql/uniques.cc @@ -159,7 +159,7 @@ inline double log2_n_fact(double x) total_buf_elems* log2(n_buffers) * ROWID_COMPARE_COST; */ -static double get_merge_buffers_cost(uint *buff_elems, uint elem_size, +static double get_merge_buffers_cost(THD *thd, uint *buff_elems, uint elem_size, uint *first, uint *last, double compare_factor) { @@ -171,7 +171,8 @@ static double get_merge_buffers_cost(uint *buff_elems, uint elem_size, size_t n_buffers= last - first + 1; /* Using log2(n)=log(n)/log(2) formula */ - return (2*((double)total_buf_elems*elem_size) / IO_SIZE + + return (2*((double)total_buf_elems*elem_size) / IO_SIZE * + thd->variables.optimizer_disk_read_cost + total_buf_elems*log((double) n_buffers) * compare_factor / M_LN2); } @@ -185,6 +186,7 @@ static double get_merge_buffers_cost(uint *buff_elems, uint elem_size, SYNOPSIS get_merge_many_buffs_cost() + thd THD, used to get disk_read_cost buffer buffer space for temporary data, at least Unique::get_cost_calc_buff_size bytes maxbuffer # of full buffers @@ -203,7 +205,8 @@ static double get_merge_buffers_cost(uint *buff_elems, uint elem_size, Cost of merge in disk seeks. */ -static double get_merge_many_buffs_cost(uint *buffer, +static double get_merge_many_buffs_cost(THD *thd, + uint *buffer, uint maxbuffer, uint max_n_elems, uint last_n_elems, int elem_size, double compare_factor) @@ -231,13 +234,13 @@ static double get_merge_many_buffs_cost(uint *buffer, uint lastbuff= 0; for (i = 0; i <= (int) maxbuffer - MERGEBUFF*3/2; i += MERGEBUFF) { - total_cost+=get_merge_buffers_cost(buff_elems, elem_size, + total_cost+=get_merge_buffers_cost(thd, buff_elems, elem_size, buff_elems + i, buff_elems + i + MERGEBUFF-1, compare_factor); lastbuff++; } - total_cost+=get_merge_buffers_cost(buff_elems, elem_size, + total_cost+=get_merge_buffers_cost(thd, buff_elems, elem_size, buff_elems + i, buff_elems + maxbuffer, compare_factor); @@ -246,7 +249,7 @@ } /* Simulate final merge_buff call.
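   This merges all the buffers that remain after the loop above in one pass.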
*/ - total_cost += get_merge_buffers_cost(buff_elems, elem_size, + total_cost += get_merge_buffers_cost(thd, buff_elems, elem_size, buff_elems, buff_elems + maxbuffer, compare_factor); return total_cost; @@ -304,7 +307,7 @@ static double get_merge_many_buffs_cost(uint *buffer, these will be random seeks. */ -double Unique::get_use_cost(uint *buffer, size_t nkeys, uint key_size, +double Unique::get_use_cost(THD *thd, uint *buffer, size_t nkeys, uint key_size, size_t max_in_memory_size, double compare_factor, bool intersect_fl, bool *in_memory) @@ -312,7 +315,7 @@ double Unique::get_use_cost(uint *buffer, size_t nkeys, uint key_size, size_t max_elements_in_tree; size_t last_tree_elems; size_t n_full_trees; /* number of trees in unique - 1 */ - double result; + double result, disk_read_cost; max_elements_in_tree= ((size_t) max_in_memory_size / ALIGN_SIZE(sizeof(TREE_ELEMENT)+key_size)); @@ -345,14 +348,15 @@ double Unique::get_use_cost(uint *buffer, size_t nkeys, uint key_size, First, add cost of writing all trees to disk, assuming that all disk writes are sequential. */ - result += DISK_SEEK_BASE_COST * n_full_trees * - ceil(((double) key_size)*max_elements_in_tree / IO_SIZE); - result += DISK_SEEK_BASE_COST * ceil(((double) key_size)*last_tree_elems / IO_SIZE); + disk_read_cost= DISK_READ_COST_THD(thd); + result += disk_read_cost * n_full_trees * + ceil(((double) key_size)*max_elements_in_tree / DISK_CHUNK_SIZE); + result += disk_read_cost * ceil(((double) key_size)*last_tree_elems / DISK_CHUNK_SIZE); /* Cost of merge */ if (intersect_fl) key_size+= sizeof(element_count); - double merge_cost= get_merge_many_buffs_cost(buffer, (uint)n_full_trees, + double merge_cost= get_merge_many_buffs_cost(thd, buffer, (uint)n_full_trees, (uint)max_elements_in_tree, (uint)last_tree_elems, key_size, compare_factor); diff --git a/sql/uniques.h b/sql/uniques.h index f4c45cde095..ecc49794efe 100644 --- a/sql/uniques.h +++ b/sql/uniques.h @@ -78,7 +78,7 @@ public: return log((double) tree_elems) * compare_factor / M_LN2; } - static double get_use_cost(uint *buffer, size_t nkeys, uint key_size, + static double get_use_cost(THD *thd, uint *buffer, size_t nkeys, uint key_size, size_t max_in_memory_size, double compare_factor, bool intersect_fl, bool *in_memory); inline static int get_cost_calc_buff_size(size_t nkeys, uint key_size, diff --git a/storage/connect/ha_connect.h b/storage/connect/ha_connect.h index d1aca22b01f..d5650163064 100644 --- a/storage/connect/ha_connect.h +++ b/storage/connect/ha_connect.h @@ -308,13 +308,18 @@ public: /** @brief Called in test_quick_select to determine if indexes should be used. */ - virtual double scan_time() { return (double) (stats.records+stats.deleted) / 20.0+10; } + virtual IO_AND_CPU_COST scan_time() + { return { 0, (double) (stats.records+stats.deleted) * avg_io_cost() }; }; /** @brief This method will never be called if you do not implement indexes. */ - virtual double read_time(uint, uint, ha_rows rows) - { return (double) rows / 20.0+1; } + virtual IO_AND_CPU_COST keyread_time(uint index, ulong ranges, ha_rows rows, + ulonglong blocks) + { + return { 0, (double) rows * 0.001 }; + } + /* Everything below are methods that we implement in ha_connect.cc. diff --git a/storage/csv/ha_tina.h b/storage/csv/ha_tina.h index 043183444da..5a56dc6c4dd 100644 --- a/storage/csv/ha_tina.h +++ b/storage/csv/ha_tina.h @@ -124,7 +124,12 @@ public: /* Called in test_quick_select to determine if indexes should be used. 
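     The cost below counts one disk read per IO_SIZE block of the data file (io) plus a next-row find cost for each scanned row (cpu).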
*/ - virtual double scan_time() { return (double) (stats.records+stats.deleted) / 20.0+10; } + virtual IO_AND_CPU_COST scan_time() + { + return { (double) ((share->saved_data_file_length + IO_SIZE-1))/ IO_SIZE * + avg_io_cost(), + (stats.records+stats.deleted) * ROW_NEXT_FIND_COST }; + } /* The next method will never be called */ virtual bool fast_key_read() { return 1;} /* diff --git a/storage/example/ha_example.h b/storage/example/ha_example.h index 2d3fa6d4216..ccd8bbd5446 100644 --- a/storage/example/ha_example.h +++ b/storage/example/ha_example.h @@ -150,15 +150,40 @@ public: uint max_supported_key_length() const { return 0; } /** @brief - Called in test_quick_select to determine if indexes should be used. + Called in test_quick_select to determine cost of table scan */ - virtual double scan_time() { return (double) (stats.records+stats.deleted) / 20.0+10; } + virtual IO_AND_CPU_COST scan_time() + { + IO_AND_CPU_COST cost; + /* avg_io_cost() per row as io cost, no cpu cost */ + cost.io= (double) (stats.records+stats.deleted) * avg_io_cost(); + cost.cpu= 0; + return cost; + } /** @brief This method will never be called if you do not implement indexes. */ - virtual double read_time(uint, uint, ha_rows rows) - { return (double) rows / 20.0+1; } + virtual IO_AND_CPU_COST keyread_time(uint, ulong, ha_rows rows, + ulonglong blocks) + { + IO_AND_CPU_COST cost; + cost.io= blocks * avg_io_cost(); + cost.cpu= (double) rows * 0.001; + return cost; + } + + /** @brief + Cost of fetching 'rows' records through rnd_pos() + */ + virtual IO_AND_CPU_COST rndpos_time(ha_rows rows) + { + IO_AND_CPU_COST cost; + /* 0 blocks, avg_io_cost() per row as cpu cost */ + cost.io= 0; + cost.cpu= (double) rows * avg_io_cost(); + return cost; + } /* Everything below are methods that we implement in ha_example.cc. diff --git a/storage/federated/ha_federated.h b/storage/federated/ha_federated.h index a8d5439bdae..94a30962663 100644 --- a/storage/federated/ha_federated.h +++ b/storage/federated/ha_federated.h @@ -180,19 +180,26 @@ public: The reason for "records * 1000" is that such a large number forces this to use indexes " */ - virtual double scan_time() + + IO_AND_CPU_COST scan_time() { DBUG_PRINT("info", ("records %lu", (ulong) stats.records)); - return (double)(stats.records*1000); + return + { + (double) (stats.mean_rec_length * stats.records)/IO_SIZE * avg_io_cost(), + 0 + }; } - virtual double read_time(uint index, uint ranges, ha_rows rows) + IO_AND_CPU_COST rndpos_time(ha_rows rows) { - return rows2double(rows) + rows2double(ranges); + return { (double) stats.records * avg_io_cost(), 0 }; } - virtual double rndpos_time(ha_rows rows) + IO_AND_CPU_COST keyread_time(uint index, ulong ranges, ha_rows rows, + ulonglong blocks) { - return rows2double(rows); + return { (double) (ranges + rows) * avg_io_cost(), 0 }; } + virtual void set_optimizer_cache_cost(double cost); const key_map *keys_to_use_for_scanning() { return &key_map_full; } diff --git a/storage/federatedx/ha_federatedx.cc b/storage/federatedx/ha_federatedx.cc index afa7e26d85d..4521d33bcbc 100644 --- a/storage/federatedx/ha_federatedx.cc +++ b/storage/federatedx/ha_federatedx.cc @@ -846,12 +846,14 @@ ha_federatedx::ha_federatedx(handlerton *hton, } /* - Federated doesn't need optimizer_cache_cost as everything is one a remote server and - nothing is cached locally + Federated doesn't need optimizer_cache_cost as everything is on a remote + server and nothing is cached locally */ void ha_federatedx::set_optimizer_cache_cost(double cost) -{} +{ + optimizer_cache_cost= 1.0; +} /* Convert MySQL
result set row to handler internal format diff --git a/storage/federatedx/ha_federatedx.h b/storage/federatedx/ha_federatedx.h index 28318842ac2..240b83554e8 100644 --- a/storage/federatedx/ha_federatedx.h +++ b/storage/federatedx/ha_federatedx.h @@ -368,19 +368,25 @@ public: The reason for "records * 1000" is that such a large number forces this to use indexes " */ - double scan_time() + IO_AND_CPU_COST scan_time() { DBUG_PRINT("info", ("records %lu", (ulong) stats.records)); - return (double)(stats.records*1000); + return + { + (double) (stats.mean_rec_length * stats.records)/8192 * avg_io_cost(), + 0 + }; } - double read_time(uint index, uint ranges, ha_rows rows) + IO_AND_CPU_COST keyread_time(uint index, ulong ranges, ha_rows rows, + ulonglong blocks) { - return rows2double(rows) + rows2double(ranges); + return { (double) (ranges + rows) * avg_io_cost(), 0 }; } - virtual double rndpos_time(ha_rows rows) + IO_AND_CPU_COST rndpos_time(ha_rows rows) { - return rows2double(rows); + return { (double) rows * avg_io_cost(), 0 }; } + virtual void set_optimizer_cache_cost(double cost); const key_map *keys_to_use_for_scanning() { return &key_map_full; } diff --git a/storage/heap/ha_heap.cc b/storage/heap/ha_heap.cc index f9b365cf91e..756f4d07573 100644 --- a/storage/heap/ha_heap.cc +++ b/storage/heap/ha_heap.cc @@ -230,6 +230,49 @@ void ha_heap::update_key_stats() } +void ha_heap::optimizer_costs_updated() +{ + /* + Heap doesn't need optimizer_cache_cost as everything is in memory and + it supports all needed _time() functions + */ + optimizer_cache_cost= 1.0; + optimizer_scan_lookup_cost= 1; + optimizer_index_next_find_cost= 0; +} + +#define HEAP_SCAN_TIME 9.376e-06 // See optimizer_costs.txt +#define BTREE_SCAN_TIME 5.171e-05 // See optimizer_costs.txt +#define HEAP_LOOKUP_TIME 1.91e-4 // See optimizer_costs.txt + + +IO_AND_CPU_COST ha_heap::keyread_time(uint index, ulong ranges, ha_rows rows, + ulonglong blocks) +{ + KEY *key=table->key_info+index; + if (key->algorithm == HA_KEY_ALG_BTREE) + return {0, (double) (rows + ranges + 1) * BTREE_SCAN_TIME }; + else + { + return {0, (double) ranges * HEAP_LOOKUP_TIME + (rows-1) * BTREE_SCAN_TIME }; + } +} + +IO_AND_CPU_COST ha_heap::scan_time() +{ + return {0, (double) (stats.records+stats.deleted) * HEAP_SCAN_TIME}; +} + +IO_AND_CPU_COST ha_heap::rndpos_time(ha_rows rows) +{ + /* + The row pointer is a direct pointer to the block. Thus almost instant + in practice. 
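+    We therefore return zero engine cost for both io and cpu.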
+ Note that ha_rndpos_time() will add ROW_COPY_COST to this result + */ + return { 0, 0 }; +} + int ha_heap::write_row(const uchar * buf) { int res; diff --git a/storage/heap/ha_heap.h b/storage/heap/ha_heap.h index 45495daf54c..61e20779e44 100644 --- a/storage/heap/ha_heap.h +++ b/storage/heap/ha_heap.h @@ -62,27 +62,13 @@ public: const key_map *keys_to_use_for_scanning() { return &btree_keys; } uint max_supported_keys() const { return MAX_KEY; } uint max_supported_key_part_length() const { return MAX_KEY_LENGTH; } - double scan_time() - { return (double) (stats.records+stats.deleted) / 20.0+10; } - double read_time(uint index, uint ranges, ha_rows rows) - { return (double) (rows +1)/ 20.0; } - double keyread_time(uint index, uint ranges, ha_rows rows) - { return (double) (rows + ranges) / 20.0 ; } - double rndpos_time(ha_rows rows) - { - return (double) rows/ 20.0; - } - double avg_io_cost() - { return 0.05; } /* 1/20 */ - - /* - Heap doesn't need optimizer_cache_cost as everything is in memory and - it supports all needed _time() functions - */ - void set_optimizer_cache_cost(double cost) - { - optimizer_cache_cost= 1.0; - } + IO_AND_CPU_COST scan_time(); + IO_AND_CPU_COST keyread_time(uint index, ulong ranges, ha_rows rows, + ulonglong blocks); + IO_AND_CPU_COST rndpos_time(ha_rows rows); + /* 0 for avg_io_cost ensures that there are no read-block calculations */ + double avg_io_cost() { return 0.0; } + void optimizer_costs_updated(); int open(const char *name, int mode, uint test_if_locked); int close(void); void set_keys_for_scanning(void); diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index ffd3f0bc0e3..4ab2ac8618c 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -14416,7 +14416,7 @@ comparable to the number returned by records_in_range so that we can decide if we should scan the table or use keys. @return estimated time measured in disk seeks */ -double +IO_AND_CPU_COST ha_innobase::scan_time() /*====================*/ { @@ -14436,17 +14436,19 @@ ha_innobase::scan_time() TODO: This will be further improved to return some approximate estimate but that would also needs pre-population of stats structure. As of now approach is in sync with MyISAM. */ - return(ulonglong2double(stats.data_file_length) / IO_SIZE + 2); + return { (ulonglong2double(stats.data_file_length) / IO_SIZE * avg_io_cost()), 0.0 }; } ulint stat_clustered_index_size; - + IO_AND_CPU_COST cost; ut_a(m_prebuilt->table->stat_initialized); stat_clustered_index_size = m_prebuilt->table->stat_clustered_index_size; - return((double) stat_clustered_index_size); + cost.io= (double) stat_clustered_index_size * avg_io_cost(); + cost.cpu= 0; + return(cost); } /******************************************************************//** @@ -14454,6 +14456,7 @@ Calculate the time it takes to read a set of ranges through an index This enables us to optimise reads for clustered indexes. @return estimated time measured in disk seeks */ +#ifdef NOT_USED double ha_innobase::read_time( /*===================*/ @@ -14478,14 +14481,13 @@ ha_innobase::read_time( return(time_for_scan); } - return(ranges + (double) rows / (double) total_rows * time_for_scan); + return(ranges * INDEX_LOOKUP_COST + (double) rows / (double) total_rows * time_for_scan); } /******************************************************************//** Calculate the time it takes to read a set of rows with primary key. 
-double ha_innobase::rndpos_time(ha_rows rows) { ha_rows total_rows; @@ -14502,6 +14504,7 @@ ha_innobase::rndpos_time(ha_rows rows) return((double) rows + (double) rows / (double) total_rows * time_for_scan); } +#endif /*********************************************************************//** Calculates the key number used inside MySQL for an Innobase index. diff --git a/storage/innobase/handler/ha_innodb.h b/storage/innobase/handler/ha_innodb.h index 664fa10d4da..4bb5179d832 100644 --- a/storage/innobase/handler/ha_innodb.h +++ b/storage/innobase/handler/ha_innodb.h @@ -105,11 +105,25 @@ public: int close(void) override; - double scan_time() override; + IO_AND_CPU_COST scan_time() override; +#ifdef NOT_USED double read_time(uint index, uint ranges, ha_rows rows) override; - double rndpos_time(ha_rows rows) override; +#endif + + void optimizer_costs_updated() + { +#ifdef QQQ + /* + The following number was found by check_costs.pl when using 1M rows + and all rows are cached + */ + optimizer_row_lookup_cost*= 2.2; +#endif + /* Accessing a row has the same cost as doing an index read */ + optimizer_row_lookup_cost= optimizer_index_lookup_cost; + } int write_row(const uchar * buf) override; diff --git a/storage/maria/ha_maria.cc b/storage/maria/ha_maria.cc index 1e6680d30de..f48373ea818 100644 --- a/storage/maria/ha_maria.cc +++ b/storage/maria/ha_maria.cc @@ -1100,14 +1100,37 @@ ulong ha_maria::index_flags(uint inx, uint part, bool all_parts) const } -double ha_maria::scan_time() +IO_AND_CPU_COST ha_maria::scan_time() { - if (file->s->data_file_type == BLOCK_RECORD) - return (ulonglong2double(stats.data_file_length - file->s->block_size) / - file->s->block_size) + 2; return handler::scan_time(); } +void ha_maria::optimizer_costs_updated() +{ + /* + The following numbers were found by check_costs.pl when using 1M rows + and all rows are cached. See optimizer_costs.txt + */ + if (file->s->data_file_type == BLOCK_RECORD) + { + /* + Aria row lookup is fast for BLOCK_RECORDS as the row data is cached + and we know exactly on which block a row is. + */ + optimizer_row_copy_cost= 0.000118; + optimizer_row_lookup_cost= 0.52; + } + else + { + /* + MyISAM format row lookups are slow as the row data is in a file that + is not cached. + */ + optimizer_row_lookup_cost*= 3; + } +} + + /* We need to be able to store at least 2 keys on an index page as the splitting algorithm depends on this.
(With only one key on a page diff --git a/storage/maria/ha_maria.h b/storage/maria/ha_maria.h index 6b4302145dd..46dce02239e 100644 --- a/storage/maria/ha_maria.h +++ b/storage/maria/ha_maria.h @@ -77,7 +77,7 @@ public: { return max_supported_key_length(); } enum row_type get_row_type() const override final; void change_table_ptr(TABLE *table_arg, TABLE_SHARE *share) override final; - virtual double scan_time() override final; + virtual IO_AND_CPU_COST scan_time() override final; int open(const char *name, int mode, uint test_if_locked) override; int close(void) override final; @@ -114,6 +114,7 @@ public: int remember_rnd_pos() override final; int restart_rnd_next(uchar * buf) override final; void position(const uchar * record) override final; + void optimizer_costs_updated(); int info(uint) override final; int info(uint, my_bool); int extra(enum ha_extra_function operation) override final; diff --git a/storage/maria/ma_pagecache.c b/storage/maria/ma_pagecache.c index de85ec51deb..58f8637abe7 100644 --- a/storage/maria/ma_pagecache.c +++ b/storage/maria/ma_pagecache.c @@ -3875,7 +3875,7 @@ restart: { pagecache_pthread_mutex_unlock(&pagecache->cache_lock); DBUG_ASSERT(0); - return (uchar*) 0; + DBUG_RETURN((uchar*) 0); } } /* diff --git a/storage/mroonga/ha_mroonga.cpp b/storage/mroonga/ha_mroonga.cpp index 83c2d15a36c..5dd74ea6240 100644 --- a/storage/mroonga/ha_mroonga.cpp +++ b/storage/mroonga/ha_mroonga.cpp @@ -13000,9 +13000,9 @@ int ha_mroonga::truncate() DBUG_RETURN(error); } -double ha_mroonga::wrapper_scan_time() +IO_AND_CPU_COST ha_mroonga::wrapper_scan_time() { - double res; + IO_AND_CPU_COST res; MRN_DBUG_ENTER_METHOD(); MRN_SET_WRAP_SHARE_KEY(share, table->s); MRN_SET_WRAP_TABLE_KEY(this, table); @@ -13012,17 +13012,16 @@ double ha_mroonga::wrapper_scan_time() DBUG_RETURN(res); } -double ha_mroonga::storage_scan_time() +IO_AND_CPU_COST ha_mroonga::storage_scan_time() { MRN_DBUG_ENTER_METHOD(); - double time = handler::scan_time(); - DBUG_RETURN(time); + DBUG_RETURN(handler::scan_time()); } -double ha_mroonga::scan_time() +IO_AND_CPU_COST ha_mroonga::scan_time() { MRN_DBUG_ENTER_METHOD(); - double time; + IO_AND_CPU_COST time; if (share->wrapper_mode) { time = wrapper_scan_time(); @@ -13032,51 +13031,87 @@ double ha_mroonga::scan_time() DBUG_RETURN(time); } -double ha_mroonga::wrapper_read_time(uint index, uint ranges, ha_rows rows) +IO_AND_CPU_COST ha_mroonga::wrapper_rndpos_time(ha_rows rows) +{ + IO_AND_CPU_COST res; + MRN_DBUG_ENTER_METHOD(); + MRN_SET_WRAP_SHARE_KEY(share, table->s); + MRN_SET_WRAP_TABLE_KEY(this, table); + res = wrap_handler->rndpos_time(rows); + MRN_SET_BASE_SHARE_KEY(share, table->s); + MRN_SET_BASE_TABLE_KEY(this, table); + DBUG_RETURN(res); +} + +IO_AND_CPU_COST ha_mroonga::storage_rndpos_time(ha_rows rows) { - double res; + MRN_DBUG_ENTER_METHOD(); + IO_AND_CPU_COST time = handler::rndpos_time(rows); + DBUG_RETURN(time); +} + + +IO_AND_CPU_COST ha_mroonga::rndpos_time(ha_rows rows) +{ + MRN_DBUG_ENTER_METHOD(); + IO_AND_CPU_COST time; + if (share->wrapper_mode) + { + time = wrapper_rndpos_time(rows); + } else { + time = storage_rndpos_time(rows); + } + DBUG_RETURN(time); +} + + +IO_AND_CPU_COST ha_mroonga::wrapper_keyread_time(uint index, ulong ranges, + ha_rows rows, ulonglong blocks) +{ + IO_AND_CPU_COST res; MRN_DBUG_ENTER_METHOD(); if (index < MAX_KEY) { KEY *key_info = &(table->key_info[index]); if (mrn_is_geo_key(key_info)) { - res = handler::read_time(index, ranges, rows); + res = handler::keyread_time(index, ranges, rows, blocks); 
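+      /* geo keys are handled by mroonga itself, so the default handler costs apply */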
DBUG_RETURN(res); } MRN_SET_WRAP_SHARE_KEY(share, table->s); MRN_SET_WRAP_TABLE_KEY(this, table); - res = wrap_handler->read_time(share->wrap_key_nr[index], ranges, rows); + res = wrap_handler->keyread_time(share->wrap_key_nr[index], ranges, rows, blocks); MRN_SET_BASE_SHARE_KEY(share, table->s); MRN_SET_BASE_TABLE_KEY(this, table); } else { MRN_SET_WRAP_SHARE_KEY(share, table->s); MRN_SET_WRAP_TABLE_KEY(this, table); - res = wrap_handler->read_time(index, ranges, rows); + res = wrap_handler->keyread_time(index, ranges, rows, blocks); MRN_SET_BASE_SHARE_KEY(share, table->s); MRN_SET_BASE_TABLE_KEY(this, table); } DBUG_RETURN(res); } -double ha_mroonga::storage_read_time(uint index, uint ranges, ha_rows rows) +IO_AND_CPU_COST ha_mroonga::storage_keyread_time(uint index, ulong ranges, ha_rows rows, ulonglong blocks) { MRN_DBUG_ENTER_METHOD(); - double time = handler::read_time(index, ranges, rows); + IO_AND_CPU_COST time = handler::keyread_time(index, ranges, rows, blocks); DBUG_RETURN(time); } -double ha_mroonga::read_time(uint index, uint ranges, ha_rows rows) +IO_AND_CPU_COST ha_mroonga::keyread_time(uint index, ulong ranges, ha_rows rows, ulonglong blocks) { MRN_DBUG_ENTER_METHOD(); - double time; + IO_AND_CPU_COST time; if (share->wrapper_mode) { - time = wrapper_read_time(index, ranges, rows); + time = wrapper_keyread_time(index, ranges, rows, blocks); } else { - time = storage_read_time(index, ranges, rows); + time = storage_keyread_time(index, ranges, rows, blocks); } DBUG_RETURN(time); } + #ifdef MRN_HANDLER_HAVE_KEYS_TO_USE_FOR_SCANNING const key_map *ha_mroonga::wrapper_keys_to_use_for_scanning() { diff --git a/storage/mroonga/ha_mroonga.hpp b/storage/mroonga/ha_mroonga.hpp index 66767899e21..38ed146676e 100644 --- a/storage/mroonga/ha_mroonga.hpp +++ b/storage/mroonga/ha_mroonga.hpp @@ -531,8 +531,9 @@ public: int end_bulk_insert() mrn_override; int delete_all_rows() mrn_override; int truncate() mrn_override; - double scan_time() mrn_override; - double read_time(uint index, uint ranges, ha_rows rows) mrn_override; + IO_AND_CPU_COST scan_time() mrn_override; + IO_AND_CPU_COST rndpos_time(ha_rows rows) mrn_override; + IO_AND_CPU_COST keyread_time(uint index, ulong ranges, ha_rows rows, ulonglong blocks) mrn_override; #ifdef MRN_HANDLER_HAVE_KEYS_TO_USE_FOR_SCANNING const key_map *keys_to_use_for_scanning() mrn_override; #endif @@ -1106,10 +1107,12 @@ private: int wrapper_truncate_index(); int storage_truncate(); int storage_truncate_index(); - double wrapper_scan_time(); - double storage_scan_time(); - double wrapper_read_time(uint index, uint ranges, ha_rows rows); - double storage_read_time(uint index, uint ranges, ha_rows rows); + IO_AND_CPU_COST wrapper_scan_time(); + IO_AND_CPU_COST storage_scan_time(); + IO_AND_CPU_COST wrapper_rndpos_time(ha_rows rows); + IO_AND_CPU_COST storage_rndpos_time(ha_rows rows); + IO_AND_CPU_COST wrapper_keyread_time(uint index, ulong ranges, ha_rows rows, ulonglong blocks); + IO_AND_CPU_COST storage_keyread_time(uint index, ulong ranges, ha_rows rows, ulonglong blocks); #ifdef MRN_HANDLER_HAVE_KEYS_TO_USE_FOR_SCANNING const key_map *wrapper_keys_to_use_for_scanning(); const key_map *storage_keys_to_use_for_scanning(); diff --git a/storage/myisam/ha_myisam.h b/storage/myisam/ha_myisam.h index 3843004cc6e..6ceabd8d9c5 100644 --- a/storage/myisam/ha_myisam.h +++ b/storage/myisam/ha_myisam.h @@ -102,6 +102,17 @@ class ha_myisam final : public handler int remember_rnd_pos(); int restart_rnd_next(uchar *buf); void position(const uchar *record); + void 
optimizer_costs_updated() + { +#ifdef QQQQ + /* + MyISAM row lookup costs are slow as the row data is in a file that is + not cached. + The following number was found by check_costs.pl when using 1M rows + and all rows are cached + */ + optimizer_row_lookup_cost*= 3.0; +#endif + } int info(uint); int extra(enum ha_extra_function operation); int extra_opt(enum ha_extra_function operation, ulong cache_size); diff --git a/storage/myisammrg/ha_myisammrg.h b/storage/myisammrg/ha_myisammrg.h index 6da327ec84b..7921b725bb2 100644 --- a/storage/myisammrg/ha_myisammrg.h +++ b/storage/myisammrg/ha_myisammrg.h @@ -102,8 +102,14 @@ public: uint max_supported_keys() const { return MI_MAX_KEY; } uint max_supported_key_length() const { return HA_MAX_KEY_LENGTH; } uint max_supported_key_part_length() const { return HA_MAX_KEY_LENGTH; } - double scan_time() - { return ulonglong2double(stats.data_file_length) / IO_SIZE + file->tables; } + IO_AND_CPU_COST scan_time() + { + IO_AND_CPU_COST cost; + cost.io= (ulonglong2double(stats.data_file_length) / IO_SIZE * avg_io_cost()+ + file->tables); + cost.cpu= records() * ROW_NEXT_FIND_COST; + return cost; + } int open(const char *name, int mode, uint test_if_locked); int add_children_list(void); diff --git a/storage/oqgraph/ha_oqgraph.h b/storage/oqgraph/ha_oqgraph.h index c8e175df616..dae81fd0c0c 100644 --- a/storage/oqgraph/ha_oqgraph.h +++ b/storage/oqgraph/ha_oqgraph.h @@ -74,9 +74,10 @@ public: const char **bas_ext() const; uint max_supported_keys() const { return MAX_KEY; } uint max_supported_key_part_length() const { return MAX_KEY_LENGTH; } - double scan_time() { return (double) 1000000000; } - double read_time(uint index, uint ranges, ha_rows rows) - { return 1; } + IO_AND_CPU_COST scan_time() + { return { (double) 1000000000, (double) 1000000000 }; } + IO_AND_CPU_COST rndpos_time(ha_rows rows) + { return { (double) rows, (double) rows }; } // Doesn't make sense to change the engine on a virtual table. virtual bool can_switch_engines() { return false; } diff --git a/storage/perfschema/ha_perfschema.h b/storage/perfschema/ha_perfschema.h index f3d84a3e264..dcad584224b 100644 --- a/storage/perfschema/ha_perfschema.h +++ b/storage/perfschema/ha_perfschema.h @@ -104,8 +104,10 @@ public: ha_rows estimate_rows_upper_bound(void) { return HA_POS_ERROR; } - double scan_time(void) - { return 1.0; } + IO_AND_CPU_COST scan_time(void) + { + return {0.0, 1.0}; + } /** Open a performance schema table.
diff --git a/storage/rocksdb/ha_rocksdb.cc b/storage/rocksdb/ha_rocksdb.cc index 72f88172a2a..b41851bd66a 100644 --- a/storage/rocksdb/ha_rocksdb.cc +++ b/storage/rocksdb/ha_rocksdb.cc @@ -14602,15 +14602,18 @@ bool ha_rocksdb::use_read_free_rpl() const { } #endif // MARIAROCKS_NOT_YET -double ha_rocksdb::read_time(uint index, uint ranges, ha_rows rows) { +IO_AND_CPU_COST ha_rocksdb::keyread_time(uint index, ulong ranges, + ha_rows rows, + ulonglong blocks) { DBUG_ENTER_FUNC(); if (index != table->s->primary_key) { /* Non covering index range scan */ - DBUG_RETURN(handler::read_time(index, ranges, rows)); + DBUG_RETURN(handler::keyread_time(index, ranges, rows, blocks)); } - DBUG_RETURN((rows / 20.0) + 1); + IO_AND_CPU_COST cost= {0, (rows / 20.0) + ranges }; + DBUG_RETURN(cost); } void ha_rocksdb::print_error(int error, myf errflag) { diff --git a/storage/rocksdb/ha_rocksdb.h b/storage/rocksdb/ha_rocksdb.h index 63bf7ffd602..d40fc539b0c 100644 --- a/storage/rocksdb/ha_rocksdb.h +++ b/storage/rocksdb/ha_rocksdb.h @@ -623,14 +623,17 @@ public: bool sorted) override MY_ATTRIBUTE((__warn_unused_result__)); - virtual double scan_time() override { + virtual IO_AND_CPU_COST scan_time() override + { + IO_AND_CPU_COST cost; DBUG_ENTER_FUNC(); - - DBUG_RETURN( - static_cast<double>((stats.records + stats.deleted) / 20.0 + 10)); + cost.io= 0; + cost.cpu= (stats.records + stats.deleted) * 0.001 + 1; + DBUG_RETURN(cost); } + IO_AND_CPU_COST keyread_time(uint index, ulong ranges, + ha_rows rows, ulonglong blocks) override; - virtual double read_time(uint, uint, ha_rows rows) override; virtual void print_error(int error, myf errflag) override; int open(const char *const name, int mode, uint test_if_locked) override diff --git a/storage/sequence/sequence.cc b/storage/sequence/sequence.cc index f5a18094521..1e558e6d3e4 100644 --- a/storage/sequence/sequence.cc +++ b/storage/sequence/sequence.cc @@ -32,6 +32,8 @@ static handlerton *sequence_hton; +#define SEQUENCE_SCAN_COST 1.53e-05 // See optimizer_costs.txt + class Sequence_share : public Handler_share { public: const char *name; @@ -100,9 +102,17 @@ public: int index_last(uchar *buf); ha_rows records_in_range(uint inx, const key_range *start_key, const key_range *end_key, page_range *pages); - double scan_time() { return (double)nvalues(); } - double read_time(uint index, uint ranges, ha_rows rows) { return (double)rows; } - double keyread_time(uint index, uint ranges, ha_rows rows) { return (double)rows; } + IO_AND_CPU_COST scan_time() + { + return {0, (double) nvalues() * SEQUENCE_SCAN_COST }; + } + IO_AND_CPU_COST keyread_time(uint index, ulong ranges, ha_rows rows, + ulonglong blocks) + { + return {0, rows * SEQUENCE_SCAN_COST }; // Very low value/row + } + /* 0 for avg_io_cost ensures that there are no read-block calculations */ + double avg_io_cost() { return 0.0; } private: void set(uchar *buf); diff --git a/storage/sphinx/ha_sphinx.h b/storage/sphinx/ha_sphinx.h index f03e9d8c797..c7c61a4738a 100644 --- a/storage/sphinx/ha_sphinx.h +++ b/storage/sphinx/ha_sphinx.h @@ -72,14 +72,30 @@ public: uint max_supported_key_length () const { return MAX_KEY_LENGTH; } uint max_supported_key_part_length () const { return MAX_KEY_LENGTH; } - #if MYSQL_VERSION_ID>50100 - virtual double scan_time () { return (double)( stats.records+stats.deleted )/20.0 + 10; } ///< called in test_quick_select to determine if indexes should be used - #else - virtual double scan_time () { return (double)( records+deleted )/20.0 + 10; } ///< called in test_quick_select to 
diff --git a/storage/sphinx/ha_sphinx.h b/storage/sphinx/ha_sphinx.h
index f03e9d8c797..c7c61a4738a 100644
--- a/storage/sphinx/ha_sphinx.h
+++ b/storage/sphinx/ha_sphinx.h
@@ -72,14 +72,30 @@ public:
  uint max_supported_key_length () const { return MAX_KEY_LENGTH; }
  uint max_supported_key_part_length () const { return MAX_KEY_LENGTH; }

-  #if MYSQL_VERSION_ID>50100
-  virtual double scan_time () { return (double)( stats.records+stats.deleted )/20.0 + 10; } ///< called in test_quick_select to determine if indexes should be used
-  #else
-  virtual double scan_time () { return (double)( records+deleted )/20.0 + 10; } ///< called in test_quick_select to determine if indexes should be used
-  #endif
+  IO_AND_CPU_COST scan_time ()
+  {
+    IO_AND_CPU_COST cost;
+    cost.io= 0;
+    cost.cpu= (double) (stats.records+stats.deleted) * avg_io_cost();
+    return cost;
+  }
+  IO_AND_CPU_COST keyread_time(uint index, ulong ranges, ha_rows rows,
+                               ulonglong blocks)
+  {
+    IO_AND_CPU_COST cost;
+    cost.io= ranges;
+    cost.cpu= (double) rows * DEFAULT_ROW_LOOKUP_COST;
+    return cost;
+  }
+  IO_AND_CPU_COST rndpos_time(ha_rows rows)
+  {
+    IO_AND_CPU_COST cost;
+    cost.io= 0;
+    cost.cpu= (double) rows * DEFAULT_ROW_LOOKUP_COST;
+    return cost;
+  }
+
-  virtual double read_time(uint index, uint ranges, ha_rows rows)
-  { return ranges + (double)rows/20.0 + 1; } ///< index read time estimate
 public:
  int open ( const char * name, int mode, uint test_if_locked );
diff --git a/storage/spider/ha_spider.cc b/storage/spider/ha_spider.cc
index 712397ffb92..e0c3e64ecb6 100644
--- a/storage/spider/ha_spider.cc
+++ b/storage/spider/ha_spider.cc
@@ -11350,37 +11350,44 @@ void ha_spider::bulk_req_exec()
 }
 #endif

-double ha_spider::scan_time()
+IO_AND_CPU_COST ha_spider::scan_time()
 {
+  IO_AND_CPU_COST cost;
  DBUG_ENTER("ha_spider::scan_time");
  DBUG_PRINT("info",("spider this=%p", this));
-  DBUG_PRINT("info",("spider scan_time = %.6f",
-    share->scan_rate * share->stat.records * share->stat.mean_rec_length + 2));
-  DBUG_RETURN(share->scan_rate * share->stat.records *
-    share->stat.mean_rec_length + 2);
+  cost.io=0;
+  cost.cpu= (share->scan_rate * share->stat.records *
+             share->stat.mean_rec_length);
+  DBUG_PRINT("info",("spider scan_time = %.6f", cost.cpu));
+  DBUG_RETURN(cost);
 }

-double ha_spider::read_time(
-  uint index,
-  uint ranges,
-  ha_rows rows
-) {
-  DBUG_ENTER("ha_spider::read_time");
+IO_AND_CPU_COST ha_spider::rndpos_time(ha_rows rows)
+{
+  IO_AND_CPU_COST cost= { 0.0, 0.0};            // Row is in memory
+  return cost;
+}
+
+IO_AND_CPU_COST ha_spider::keyread_time(uint index, ulong ranges, ha_rows rows,
+                                        ulonglong blocks)
+{
+  IO_AND_CPU_COST cost;
+  DBUG_ENTER("ha_spider::keyread_time");
  DBUG_PRINT("info",("spider this=%p", this));
  if (wide_handler->keyread)
  {
-    DBUG_PRINT("info",("spider read_time(keyread) = %.6f",
-      share->read_rate * table->key_info[index].key_length *
-      rows / 2 + 2));
-    DBUG_RETURN(share->read_rate * table->key_info[index].key_length *
-      rows / 2 + 2);
+    cost.io= ranges;
+    cost.cpu= (share->read_rate * table->key_info[index].key_length * rows / 2 +
+               2);
  } else {
-    DBUG_PRINT("info",("spider read_time = %.6f",
-      share->read_rate * share->stat.mean_rec_length * rows + 2));
-    DBUG_RETURN(share->read_rate * share->stat.mean_rec_length * rows + 2);
+    cost.io= ranges;
+    cost.cpu= share->read_rate * share->stat.mean_rec_length * rows + 2;
  }
+  DBUG_PRINT("info",("spider keyread_time = %.6f", cost.cpu));
+  DBUG_RETURN(cost);
 }
+
 const key_map *ha_spider::keys_to_use_for_scanning()
 {
  DBUG_ENTER("ha_spider::keys_to_use_for_scanning");
diff --git a/storage/spider/ha_spider.h b/storage/spider/ha_spider.h
index 3036f8d522a..8e8308c760a 100644
--- a/storage/spider/ha_spider.h
+++ b/storage/spider/ha_spider.h
@@ -787,12 +787,10 @@ public:
 #endif
  int delete_all_rows();
  int truncate();
-  double scan_time();
-  double read_time(
-    uint index,
-    uint ranges,
-    ha_rows rows
-  );
+  IO_AND_CPU_COST scan_time();
+  IO_AND_CPU_COST rndpos_time(ha_rows rows);
+  IO_AND_CPU_COST keyread_time(uint index, ulong ranges, ha_rows rows,
+                               ulonglong blocks);
 #ifdef HA_CAN_BULK_ACCESS
  void bulk_req_exec();
 #endif
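For a remote engine like Spider, the interesting split in keyread_time() is
between a covering index read, where only key bytes cross the wire, and a
non-covering read, where whole rows do. An illustrative restatement of that
arithmetic; read_rate and the sample values are made up, not taken from a
real Spider share:

    // Sketch: why covering index reads are much cheaper for a remote engine.
    #include <cstdio>

    struct IO_AND_CPU_COST { double io; double cpu; };

    static IO_AND_CPU_COST spider_keyread_cost(bool covering, double read_rate,
                                               double key_length,
                                               double mean_rec_length,
                                               double ranges, double rows)
    {
      IO_AND_CPU_COST cost;
      cost.io= ranges;                                 // one round trip/range
      cost.cpu= covering
        ? read_rate * key_length * rows / 2 + 2        // only key bytes travel
        : read_rate * mean_rec_length * rows + 2;      // whole rows travel
      return cost;
    }

    int main()
    {
      IO_AND_CPU_COST a= spider_keyread_cost(true,  0.0001, 8, 150, 1, 10000);
      IO_AND_CPU_COST b= spider_keyread_cost(false, 0.0001, 8, 150, 1, 10000);
      printf("covering cpu=%.2f  non-covering cpu=%.2f\n", a.cpu, b.cpu);
      return 0;
    }

rndpos_time() returning {0, 0} encodes the same idea from the other side:
once a row has been fetched from the remote server it is already in memory,
so a positioned re-read costs essentially nothing.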
diff --git a/tests/check_costs.pl b/tests/check_costs.pl
new file mode 100644
index 00000000000..54f0947dbf2
--- /dev/null
+++ b/tests/check_costs.pl
@@ -0,0 +1,635 @@
+#!/usr/bin/env perl
+
+# Copyright (C) 2000, 2001 MySQL AB
+# Use is subject to license terms
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; version 2 of the License.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA
+
+# This is a test that runs queries to measure whether the MariaDB cost
+# calculations are reasonable.
+#
+# The following tests are run:
+# - Full table scan of a table
+# - Range scan of the table
+# - Index scan of the table
+#
+# The output can be used to fine-tune the optimizer cost variables.
+#
+# The table in question is similar to the 'lineitem' table used by DBT3.
+# It has 16 fields and can be regarded as an 'average' kind of table.
+# The number of fields and the record length play only a small role when
+# comparing index scans and table scans.
+
+##################### Standard benchmark inits ##############################
+
+use DBI;
+use Getopt::Long;
+use Benchmark ':hireswallclock';
+use JSON;
+use Data::Dumper;
+
+package main;
+
+$opt_rows=1000000;
+$opt_verbose="";
+$opt_host="";
+$opt_db="test";
+$opt_user="test";
+$opt_password="";
+$opt_socket=undef;
+$opt_skip_drop= undef;
+$opt_skip_create= undef;
+$opt_init_query= undef;
+$opt_print_analyze= undef;
+$opt_skip_where_check= undef;
+$opt_engine=undef;
+$opt_comment=undef;
+
+@arguments= @ARGV;
+
+GetOptions("host=s","user=s","password=s", "db=s","rows=i","socket=s",
+           "skip-drop",
+           "init-query=s","engine=s","comment=s","skip-create",
+           "skip-where-check", "print-analyze", "verbose") ||
+  die "Aborted";
+
+$Mysql::db_errstr=undef;  # Ignore warnings from these
+
+my ($table, $dbh, $where_cost, $real_where_cost, $perf_ratio);
+
+@engines= ("Aria","InnoDB","MyISAM","heap");
+
+####
+#### Start timing and start test
+####
+
+$|= 1;                          # Autoflush
+if ($opt_verbose)
+{
+  $opt_print_analyze= 1;
+}
+
+####
+#### Create the table
+####
+
+my %attrib;
+
+$attrib{'PrintError'}=0;
+
+if (defined($opt_socket))
+{
+  $attrib{'mariadb_socket'}=$opt_socket;
+}
+
+$dbh = DBI->connect("DBI:MariaDB:$opt_db:$opt_host",
+                    $opt_user, $opt_password, \%attrib) || die $DBI::errstr;
+
+$table= "$opt_db.check_costs";
+
+print_mariadb_version();
+print "Server options: $opt_comment\n" if (defined($opt_comment));
+print "Running tests with $opt_rows rows\n";
+
+print "Program arguments:\n";
+for ($i= 0 ; $i <= $#arguments; $i++)
+{
+  my $arg=$arguments[$i];
+  if ($arg =~ / /)
+  {
+    if ($arg =~ /([^ =]*)=(.*)/)
+    {
+      print "$1=\"$2\" ";
+    }
+    else
+    {
+      print "\"$arg\"" . " ";
+    }
+  }
+  else
+  {
+    print $arguments[$i] . " ";
+  }
+}
+print "\n\n";
+
+@test_names=
+  ("table scan no where", "table scan simple where",
+   "table scan where no match", "table scan complex where", "table scan",
+   "range scan", "index scan", "eq_join");
+$where_tests=3;   # Number of where tests to be compared with test[0]
+
+if ($opt_engine)
+{
+  test_engine(0, $opt_engine);
+}
+else
+{
+  undef($opt_skip_create);
+  for ($i= 0 ; $i <= $#engines; $i++)
+  {
+    test_engine($i, $engines[$i]);
+
+    if ($i > 0)
+    {
+      print "\n";
+      my $j;
+
+      print "Ratios $engines[$i] / $engines[0]. Multiplier should be close to 1.0\n";
+      for ($j= 0 ; $j <= $#test_names ; $j++)
+      {
+        # Compare against the base engine's result for the same test
+        my $cmp_cost= $res[0][$j]->{'cost'};
+        my $cmp_time= $res[0][$j]->{'time'};
+        my $cur_cost= $res[$i][$j]->{'cost'};
+        my $cur_time= $res[$i][$j]->{'time'};
+
+        printf "%10s cost: %6.4f time: %6.4f cost_correction_multiplier: %6.4f\n",
+          $test_names[$j],
+          $cur_cost / $cmp_cost,
+          $cur_time / $cmp_time,
+          ($cmp_cost * ($cur_time / $cmp_time))/$cur_cost;
+      }
+    }
+#   if ($i + 1 <= $#engines)
+    {
+      print "-------------------------\n\n";
+    }
+  }
+  print_totals();
+}
+
+$dbh->do("drop table if exists $table") if (!defined($opt_skip_drop));
+$dbh->disconnect; $dbh=0;       # Close handler
+exit(0);
+
+
+sub test_engine()
+{
+  my ($i, $engine)= @_;
+
+  setup($opt_init_query);
+
+  if (!defined($opt_skip_create))
+  {
+    my $index_type="";
+
+    # We have to use a btree index with heap to get range scans
+    $index_type= "using btree" if (lc($engine) eq "heap");
+
+    print "Creating table $table of type $engine\n";
+    $dbh->do("drop table if exists $table");
+    $dbh->do("create table $table (
+             `l_orderkey` int(11) NOT NULL,
+             `l_partkey` int(11) DEFAULT NULL,
+             `l_suppkey` int(11) DEFAULT NULL,
+             `l_linenumber` int(11) NOT NULL,
+             `l_quantity` double DEFAULT NULL,
+             `l_extendedprice` double DEFAULT NULL,
+             `l_discount` double DEFAULT NULL,
+             `l_tax` double DEFAULT NULL,
+             `l_returnflag` char(1) DEFAULT NULL,
+             `l_linestatus` char(1) DEFAULT NULL,
+             `l_shipDATE` date DEFAULT NULL,
+             `l_commitDATE` date DEFAULT NULL,
+             `l_receiptDATE` date DEFAULT NULL,
+             `l_shipinstruct` char(25) DEFAULT NULL,
+             `l_shipmode` char(10) DEFAULT NULL,
+             `l_comment` varchar(44) DEFAULT NULL,
+             PRIMARY KEY (`l_orderkey`),
+             KEY `commitdate` $index_type (`l_commitDATE`,l_discount) )
+             ENGINE= $engine")
+      or die $DBI::errstr;
+
+    $dbh->do("insert into $table select
+             seq, seq/10, seq/100, seq, seq, seq, mod(seq,10)*10,
+             0, 'a','b',
+             date_add('2000-01-01', interval seq/500 day),
+             date_add('2000-01-10', interval seq/500 day),
+             date_add('2000-01-20', interval seq/500 day),
+             left(md5(seq),25),
+             if(seq & 1,'mail','ship'),
+             repeat('a',mod(seq,40))
+             from seq_1_to_$opt_rows")
+      or die $DBI::errstr;
+  }
+  else
+  {
+    $opt_rows= get_row_count();
+    die "Table $table is empty. Please run without --skip-create"
+      if ($opt_rows == 0);
+    print "Reusing old table $table, which has $opt_rows rows\n";
+  }
+
+  $where_cost=get_variable("optimizer_where_cost");
+  if (defined($where_cost) && !$opt_skip_where_check)
+  {
+    # Calculate the cost of the where once. Must be done after the table
+    # is created.
+    $real_where_cost= get_where_cost();
+    $perf_ratio= $real_where_cost/$where_cost;
+    printf "Performance ratio compared to base computer: %6.4f\n", $perf_ratio;
+  }
+  print "\n";
+  if (!$opt_skip_where_check)
+  {
+    $res[$i][0]= table_scan_without_where();
+    $res[$i][1]= table_scan_with_where();
+    $res[$i][2]= table_scan_with_where_no_match();
+    $res[$i][3]= table_scan_with_complex_where();
+  }
+  $res[$i][4]= table_scan_without_where_analyze();
+  $res[$i][5]= range_scan();
+  $res[$i][6]= index_scan();
+  $res[$i][7]= eq_ref_join();
+
+  if (!$opt_skip_where_check)
+  {
+    printf "Variable optimizer_where_cost: cur: %6.4f real: %6.4f prop: %6.4f\n",
+      $where_cost, $real_where_cost, $perf_ratio;
+    print "Ratio of WHERE costs compared to a scan without a WHERE\n";
+    for ($j= 1 ; $j <= $where_tests ; $j++)
+    {
+      print_where_costs($i,$j,0);
+    }
+  }
+
+  print "\nCost/time ratio for different scan types\n";
+  for ($j= $where_tests+1 ; $j <= $#test_names ; $j++)
+  {
+    print_costs($test_names[$j], $res[$i][$j]);
+  }
+}
+
+
+sub print_costs()
+{
+  my ($name, $cur_res)= @_;
+
+  # Cost without the where clause
+  my $cur_cost= $cur_res->{'cost'} - $cur_res->{'where_cost'};
+  my $cur_time= $cur_res->{'time'};
+
+  printf "%-20.20s cost: %8.4f time: %8.4f cost/time: %8.4f\n",
+    $name,
+    $cur_cost, $cur_time, $cur_cost/$cur_time;
+}
+
+sub print_where_costs()
+{
+  my ($index, $cmp, $base)= @_;
+
+  my $cmp_time= $res[$index][$cmp]->{'time'};
+  my $base_time= $res[$index][$base]->{'time'};
+
+  printf "%-30.30s time: %6.4f\n", $test_names[$cmp], $cmp_time / $base_time;
+}
+
+
+# Used to set up things like optimizer_switch or optimizer_cache_hit_ratio
+
+sub setup()
+{
+  my ($query)= @_;
+  my ($sth);
+
+  $sth= $dbh->do("analyze table $table") || die "Got error on 'analyze table': " . $dbh->errstr . "\n";
+  $sth= $dbh->do("flush tables") || die "Got error on 'flush tables': " . $dbh->errstr . "\n";
+  if (defined($query))
+  {
+    $sth= $dbh->do("$query") || die "Got error on '$query': " . $dbh->errstr . "\n";
+  }
+
+  # Set variables that may interfere with timings
+  $query= "set \@\@optimizer_switch='index_condition_pushdown=off'";
+  $sth= $dbh->do($query) || die "Got error on '$query': " . $dbh->errstr . "\n";
+}
+
+##############################################################################
+# Query functions
+##############################################################################
+
+# Calculate the cost of the WHERE clause
+
+sub table_scan_without_where()
+{
+  return run_query("table_scan", "ALL", $opt_rows, "select sum(l_partkey) from $table");
+}
+
+sub table_scan_with_where()
+{
+  return run_query("table_scan", "ALL", $opt_rows, "select sum(l_partkey) from $table where l_commitDate >= '2000-01-01' and l_tax >= 0.0");
+}
+
+sub table_scan_with_where_no_match()
+{
+  return run_query("table_scan", "ALL", $opt_rows, "select sum(l_partkey) from $table where l_commitDate >= '2000-01-01' and l_tax > 0.0 /* NO MATCH */");
+}
+
+
+sub table_scan_with_complex_where()
+{
+  return run_query("table_scan", "ALL", $opt_rows, "select sum(l_partkey) from $table where l_commitDate >= '2000-01-01' and l_quantity*l_extendedprice-l_discount+l_tax > 0.0");
+}
+
+
+# Calculate the table access times
+
+sub table_scan_without_where_analyze()
+{
+  return run_query_with_analyze("table_scan", "ALL", $opt_rows, "select sum(l_partkey) from $table");
+}
+
+sub range_scan()
+{
+  return run_query_with_analyze("range_scan", "range", $opt_rows, "select sum(l_orderkey) from $table force index(commitdate) where l_commitDate >= '2000-01-01' and l_tax >= 0.0");
+}
+
+sub index_scan()
+{
+  return run_query_with_analyze("index_scan", "index", $opt_rows, "select sum(l_discount) from $table force index(commitdate) where l_commitDate > '2000-01-01' and l_discount >= 0.0");
+}
+
+sub eq_ref_join()
+{
+  return run_query_with_analyze("eq_ref_join", "eq_ref", 1, "select straight_join count(*) from seq_1_to_1000000,check_costs where seq=l_orderkey");
+}
+
+sub get_where_cost()
+{
+  my ($loop);
+  $loop=10000000;
+  # Return the time in microseconds for one where (= optimizer_where_cost)
+  return query_time("select benchmark($loop, l_commitDate >= '2000-01-01' and l_tax >= 0.0) from $table limit 1")/$loop;
+}
+
+
+###############################################################################
+# Helper functions for running the queries
+###############################################################################
+
+
+# Run a query and return the time for the query in microseconds
+
+sub query_time()
+{
+  my ($query)= @_;
+  my ($start_time,$end_time,$time,$ms,$sth,$row);
+
+  $start_time= new Benchmark;
+  $sth= $dbh->prepare($query) || die "Got error on '$query': " . $dbh->errstr . "\n";
+  $sth->execute || die "Got error on '$query': " . $dbh->errstr . "\n";
+  $end_time=new Benchmark;
+  $row= $sth->fetchrow_arrayref();
+  $sth=0;
+
+  $time= timestr(timediff($end_time, $start_time),"nop");
+  $time =~ /([\d.]*)/;
+  return $1*1000000.0;
+}
+
+#
+# Run a query and compare the clock time
+#
+
+sub run_query()
+{
+  my ($name, $type, $expected_rows, $query)= @_;
+  my ($start_time,$end_time,$sth,@row,%res,$i, $optimizer_rows, $adjust_cost);
+  my ($ms);
+  $adjust_cost=1.0;
+
+  print "Timing full query: $query\n";
+
+  $sth= $dbh->prepare("explain $query") || die "Got error on 'explain $query': " . $dbh->errstr . "\n";
+  $sth->execute || die "Got error on 'explain $query': " . $dbh->errstr . "\n";
+
+  print "explain:\n";
+  while ($row= $sth->fetchrow_arrayref())
+  {
+    print $row->[0];
+    for ($i= 1 ; $i < @$row; $i++)
+    {
+      print " " . $row->[$i] if (defined($row->[$i]));
+    }
+    print "\n";
+
+    if ($row->[3] ne $type)
+    {
+      print "Warning: Wrong scan type: '$row->[3]', expected '$type'\n";
+    }
+    $optimizer_rows= $row->[8];
+  }
+  if ($expected_rows >= 0 &&
+      (abs($optimizer_rows - $expected_rows)/$expected_rows) > 0.1)
+  {
+    printf "Warning: Expected $expected_rows rows instead of $optimizer_rows from EXPLAIN. Adjusting costs\n";
+    $adjust_cost= $expected_rows / $optimizer_rows;
+  }
+
+  # Do one query to fill the cache
+  $sth= $dbh->prepare($query) || die "Got error on '$query': " . $dbh->errstr . "\n";
+  $sth->execute || die "Got error on '$query': " . $dbh->errstr . "\n";
+  $end_time=new Benchmark;
+  $row= $sth->fetchrow_arrayref();
+  $sth=0;
+
+  # Run the query for real
+  $start_time= new Benchmark;
+  $sth= $dbh->prepare($query) || die "Got error on '$query': " . $dbh->errstr . "\n";
+  $sth->execute || die "Got error on '$query': " . $dbh->errstr . "\n";
+  $end_time=new Benchmark;
+  $row= $sth->fetchrow_arrayref();
+  $sth=0;
+
+  $time= timestr(timediff($end_time, $start_time),"nop");
+  $time =~ /([\d.]*)/;
+  $ms= $1*1000.0;
+
+  $query= "show status like 'last_query_cost'";
+  $sth= $dbh->prepare($query) || die "Got error on '$query': " . $dbh->errstr . "\n";
+  $sth->execute || die "Got error on '$query': " . $dbh->errstr . "\n";
+  $row= $sth->fetchrow_arrayref();
+  $sth=0;
+  $cost= $row->[1] * $adjust_cost;
+  printf "%10s ms: %-10s cost: %6.4f", $name, $ms, $cost;
+  if ($adjust_cost != 1.0)
+  {
+    printf " (was %6.4f)", $row->[1];
+  }
+  print "\n\n";
+
+  $res{'cost'}= $cost;
+  $res{'time'}= $ms;
+  return \%res;
+}
+
+#
+# Run a query and compare the table access time from the analyze statement.
+# The cost calculation works for queries with one or two tables!
+#
+
+sub run_query_with_analyze()
+{
+  my ($name, $type, $expected_rows, $query)= @_;
+  my ($start_time,$end_time,$sth,@row,%res,$i, $optimizer_rows);
+  my ($adjust_cost, $ms, $first_ms, $analyze, $json, $local_where_cost);
+
+  $adjust_cost=1.0;
+  $local_where_cost= $where_cost/1000 * $opt_rows;
+
+  print "Timing table access for query: $query\n";
+
+  $sth= $dbh->prepare("explain $query") || die "Got error on 'explain $query': " . $dbh->errstr . "\n";
+  $sth->execute || die "Got error on 'explain $query': " . $dbh->errstr . "\n";
+
+  print "explain:\n";
+  while ($row= $sth->fetchrow_arrayref())
+  {
+    print $row->[0];
+    for ($i= 1 ; $i < @$row; $i++)
+    {
+      print " " . $row->[$i] if (defined($row->[$i]));
+    }
+    print "\n";
+
+    if ($row->[3] ne $type)
+    {
+      print "Warning: Wrong scan type: '$row->[3]', expected '$type'\n";
+    }
+    $optimizer_rows= $row->[8];
+  }
+  if ($expected_rows >= 0 &&
+      (abs($optimizer_rows - $expected_rows)/$expected_rows) > 0.1)
+  {
+    printf "Warning: Expected $expected_rows rows instead of $optimizer_rows from EXPLAIN. Adjusting costs\n";
+    $adjust_cost= $expected_rows / $optimizer_rows;
+  }
+
+  # Do one query to fill the cache
+  $sth= $dbh->prepare($query) || die "Got error on '$query': " . $dbh->errstr . "\n";
+  $sth->execute || die "Got error on '$query': " . $dbh->errstr . "\n";
+  $row= $sth->fetchrow_arrayref();
+  $sth=0;
+
+  # Run the query through the analyze statement
+
+  $sth= $dbh->prepare("analyze format=json $query") || die "Got error on 'analyze $query': " . $dbh->errstr . "\n";
+  $sth->execute || die "Got error on '$query': " . $dbh->errstr . "\n";
+  $row= $sth->fetchrow_arrayref();
+  $analyze= $row->[0];
+  $sth=0;
+
+  # Fetch the timing for the first table in the query
+  $analyze=~ /r_table_time_ms": ([0-9.]*)/;
+  $first_ms= $1;
+
+  # Fetch the timing for the last table in the query
+
+  $json= from_json($analyze);
+  $ms= $json->{'query_block'}->{'table'}->{'r_table_time_ms'};
+  die "Cannot find r_table_time_ms in the JSON object from the analyze statement\n" if (!defined($ms));
+
+  # If there are two tables, add the cost of the other table!
+  if ($ms != $first_ms)
+  {
+    $ms= $ms + $first_ms;
+    $local_where_cost= $local_where_cost*2;
+  }
+
+  if ($opt_print_analyze)
+  {
+    print "\nanalyze:\n" . $analyze . "\n\n";
+    # print "QQ: $analyze\n";
+    # print Dumper($json) . "\n";
+  }
+
+  # Get the last query cost
+  $query= "show status like 'last_query_cost'";
+  $sth= $dbh->prepare($query) || die "Got error on '$query': " . $dbh->errstr . "\n";
+  $sth->execute || die "Got error on '$query': " . $dbh->errstr . "\n";
+  $row= $sth->fetchrow_arrayref();
+  $sth=0;
+  $cost= $row->[1] * $adjust_cost;
+
+  printf "%10s ms: %-10s cost-where: %6.4f cost: %6.4f",
+    $name, $ms, $cost - $local_where_cost, $cost;
+  if ($adjust_cost != 1.0)
+  {
+    printf " (cost was %6.4f)", $row->[1];
+  }
+  print "\n\n";
+
+  $res{'cost'}= $cost;
+  $res{'where_cost'}= $local_where_cost;
+  $res{'time'}= $ms;
+  return \%res;
+}
+
+
+sub print_totals()
+{
+  my ($i, $j);
+  print "Totals per test\n";
+  for ($j= 0 ; $j <= $#test_names; $j++)
+  {
+    print "$test_names[$j]:\n";
+    for ($i= 0 ; $i <= $#engines ; $i++)
+    {
+      my $cost= $res[$i][$j]->{'cost'};
+      my $ms= $res[$i][$j]->{'time'};
+      printf "%-8s ms: %-10.10f cost: %10.6f\n", $engines[$i], $ms, $cost;
+    }
+  }
+}
+
+
+##############################################################################
+# Get various simple data from MariaDB
+##############################################################################
+
+sub print_mariadb_version()
+{
+  my ($query, $sth, $row);
+  $query= "select VERSION()";
+  $sth= $dbh->prepare($query) || die "Got error on '$query': " . $dbh->errstr . "\n";
+  $sth->execute || die "Got error on '$query': " . $dbh->errstr . "\n";
+  $row= $sth->fetchrow_arrayref();
+  print "Server: $row->[0]";
+
+  $query= "show variables like 'VERSION_SOURCE_REVISION'";
+  $sth= $dbh->prepare($query) || die "Got error on '$query': " . $dbh->errstr . "\n";
+  $sth->execute || die "Got error on '$query': " . $dbh->errstr . "\n";
+  $row= $sth->fetchrow_arrayref();
+  print " Commit: $row->[1]\n";
+}
+
+
+sub get_row_count()
+{
+  $query= "select count(*) from $table";
+  $sth= $dbh->prepare($query) || die "Got error on '$query': " . $dbh->errstr . "\n";
+  $sth->execute || die "Got error on '$query': " . $dbh->errstr . "\n";
+  $row= $sth->fetchrow_arrayref();
+  return $row->[0];
+}
+
+
+sub get_variable()
+{
+  my ($name)= @_;
+  $query= "select @@" . $name;
+  $sth= $dbh->prepare($query) || die "Got error on '$query': " . $dbh->errstr . "\n";
+  $sth->execute || die "Got error on '$query': " . $dbh->errstr . "\n";
+  $row= $sth->fetchrow_arrayref();
+  return $row->[0];
+}