summaryrefslogtreecommitdiff
path: root/sql/sql_statistics.cc
diff options
context:
space:
mode:
Diffstat (limited to 'sql/sql_statistics.cc')
-rw-r--r--sql/sql_statistics.cc311
1 files changed, 171 insertions, 140 deletions
diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc
index 59ae111f716..1222d0e0d5f 100644
--- a/sql/sql_statistics.cc
+++ b/sql/sql_statistics.cc
@@ -30,7 +30,6 @@
#include "sql_statistics.h"
#include "opt_range.h"
#include "uniques.h"
-#include "my_atomic.h"
#include "sql_show.h"
#include "sql_partition.h"
@@ -104,29 +103,6 @@ inline void init_table_list_for_stat_tables(TABLE_LIST *tables, bool for_write)
}
}
-
-/**
- @details
- The function builds a TABLE_LIST containing only one element 'tbl' for
- the statistical table called 'stat_tab_name'.
- The lock type of the element is set to TL_READ if for_write = FALSE,
- otherwise it is set to TL_WRITE.
-*/
-
-static inline
-void init_table_list_for_single_stat_table(TABLE_LIST *tbl,
- const LEX_CSTRING *stat_tab_name,
- bool for_write)
-{
- memset((char *) tbl, 0, sizeof(TABLE_LIST));
-
- tbl->db= MYSQL_SCHEMA_NAME;
- tbl->table_name= *stat_tab_name;
- tbl->alias= *stat_tab_name;
- tbl->lock_type= for_write ? TL_WRITE : TL_READ;
-}
-
-
static Table_check_intact_log_error stat_table_intact;
static const
@@ -288,16 +264,21 @@ inline int open_stat_tables(THD *thd, TABLE_LIST *tables,
/**
@brief
Open a statistical table and lock it
+
+ @details
+ This is used by DDLs. When a column or index is dropped or renamed,
+ stat tables need to be adjusted accordingly.
*/
-static
-inline int open_single_stat_table(THD *thd, TABLE_LIST *table,
- const LEX_CSTRING *stat_tab_name,
- Open_tables_backup *backup,
- bool for_write)
+static inline int open_stat_table_for_ddl(THD *thd, TABLE_LIST *table,
+ const LEX_CSTRING *stat_tab_name,
+ Open_tables_backup *backup)
{
- init_table_list_for_single_stat_table(table, stat_tab_name, for_write);
- init_mdl_requests(table);
- return open_system_tables_for_read(thd, table, backup);
+ table->init_one_table(&MYSQL_SCHEMA_NAME, stat_tab_name, NULL, TL_WRITE);
+ No_such_table_error_handler nst_handler;
+ thd->push_internal_handler(&nst_handler);
+ int res= open_system_tables_for_read(thd, table, backup);
+ thd->pop_internal_handler();
+ return res;
}
@@ -326,8 +307,8 @@ private:
public:
inline void init(THD *thd, Field * table_field);
- inline bool add(ha_rows rowno);
- inline void finish(ha_rows rows);
+ inline bool add();
+ inline void finish(ha_rows rows, double sample_fraction);
inline void cleanup();
};
@@ -1555,6 +1536,8 @@ class Histogram_builder
uint curr_bucket; /* number of the current bucket to be built */
ulonglong count; /* number of values retrieved */
ulonglong count_distinct; /* number of distinct values retrieved */
+ /* number of distinct values that occured only once */
+ ulonglong count_distinct_single_occurence;
public:
Histogram_builder(Field *col, uint col_len, ha_rows rows)
@@ -1568,14 +1551,21 @@ public:
bucket_capacity= (double) records / (hist_width + 1);
curr_bucket= 0;
count= 0;
- count_distinct= 0;
+ count_distinct= 0;
+ count_distinct_single_occurence= 0;
}
- ulonglong get_count_distinct() { return count_distinct; }
+ ulonglong get_count_distinct() const { return count_distinct; }
+ ulonglong get_count_single_occurence() const
+ {
+ return count_distinct_single_occurence;
+ }
int next(void *elem, element_count elem_cnt)
{
count_distinct++;
+ if (elem_cnt == 1)
+ count_distinct_single_occurence++;
count+= elem_cnt;
if (curr_bucket == hist_width)
return 0;
@@ -1589,7 +1579,7 @@ public:
count > bucket_capacity * (curr_bucket + 1))
{
histogram->set_prev_value(curr_bucket);
- curr_bucket++;
+ curr_bucket++;
}
}
return 0;
@@ -1605,9 +1595,18 @@ int histogram_build_walk(void *elem, element_count elem_cnt, void *arg)
return hist_builder->next(elem, elem_cnt);
}
-C_MODE_END
+static int count_distinct_single_occurence_walk(void *elem,
+ element_count count, void *arg)
+{
+ ((ulonglong*)arg)[0]+= 1;
+ if (count == 1)
+ ((ulonglong*)arg)[1]+= 1;
+ return 0;
+}
+
+C_MODE_END
/*
The class Count_distinct_field is a helper class used to calculate
the number of distinct values for a column. The class employs the
@@ -1626,6 +1625,9 @@ protected:
Unique *tree; /* The helper object to contain distinct values */
uint tree_key_length; /* The length of the keys for the elements of 'tree */
+ ulonglong distincts;
+ ulonglong distincts_single_occurence;
+
public:
Count_distinct_field() {}
@@ -1677,30 +1679,40 @@ public:
{
return tree->unique_add(table_field->ptr);
}
-
+
/*
@brief
Calculate the number of elements accumulated in the container of 'tree'
*/
- ulonglong get_value()
- {
- ulonglong count;
- if (tree->elements == 0)
- return (ulonglong) tree->elements_in_tree();
- count= 0;
- tree->walk(table_field->table, count_distinct_walk, (void*) &count);
- return count;
+ void walk_tree()
+ {
+ ulonglong counts[2] = {0, 0};
+ tree->walk(table_field->table,
+ count_distinct_single_occurence_walk, counts);
+ distincts= counts[0];
+ distincts_single_occurence= counts[1];
}
/*
@brief
- Build the histogram for the elements accumulated in the container of 'tree'
+ Calculate a histogram of the tree
*/
- ulonglong get_value_with_histogram(ha_rows rows)
+ void walk_tree_with_histogram(ha_rows rows)
{
Histogram_builder hist_builder(table_field, tree_key_length, rows);
tree->walk(table_field->table, histogram_build_walk, (void *) &hist_builder);
- return hist_builder.get_count_distinct();
+ distincts= hist_builder.get_count_distinct();
+ distincts_single_occurence= hist_builder.get_count_single_occurence();
+ }
+
+ ulonglong get_count_distinct()
+ {
+ return distincts;
+ }
+
+ ulonglong get_count_distinct_single_occurence()
+ {
+ return distincts_single_occurence;
}
/*
@@ -2506,7 +2518,7 @@ void Column_statistics_collected::init(THD *thd, Field *table_field)
*/
inline
-bool Column_statistics_collected::add(ha_rows rowno)
+bool Column_statistics_collected::add()
{
bool err= 0;
@@ -2515,9 +2527,11 @@ bool Column_statistics_collected::add(ha_rows rowno)
else
{
column_total_length+= column->value_length();
- if (min_value && column->update_min(min_value, rowno == nulls))
+ if (min_value && column->update_min(min_value,
+ is_null(COLUMN_STAT_MIN_VALUE)))
set_not_null(COLUMN_STAT_MIN_VALUE);
- if (max_value && column->update_max(max_value, rowno == nulls))
+ if (max_value && column->update_max(max_value,
+ is_null(COLUMN_STAT_MAX_VALUE)))
set_not_null(COLUMN_STAT_MAX_VALUE);
if (count_distinct)
err= count_distinct->add();
@@ -2535,7 +2549,7 @@ bool Column_statistics_collected::add(ha_rows rowno)
*/
inline
-void Column_statistics_collected::finish(ha_rows rows)
+void Column_statistics_collected::finish(ha_rows rows, double sample_fraction)
{
double val;
@@ -2553,16 +2567,44 @@ void Column_statistics_collected::finish(ha_rows rows)
}
if (count_distinct)
{
- ulonglong distincts;
uint hist_size= count_distinct->get_hist_size();
+
+ /* Compute cardinality statistics and optionally histogram. */
if (hist_size == 0)
- distincts= count_distinct->get_value();
+ count_distinct->walk_tree();
else
- distincts= count_distinct->get_value_with_histogram(rows - nulls);
+ count_distinct->walk_tree_with_histogram(rows - nulls);
+
+ ulonglong distincts= count_distinct->get_count_distinct();
+ ulonglong distincts_single_occurence=
+ count_distinct->get_count_distinct_single_occurence();
+
if (distincts)
{
- val= (double) (rows - nulls) / distincts;
- set_avg_frequency(val);
+ /*
+ We use the unsmoothed first-order jackknife estimator" to estimate
+ the number of distinct values.
+ With a sufficient large percentage of rows sampled (80%), we revert back
+ to computing the avg_frequency off of the raw data.
+ */
+ if (sample_fraction > 0.8)
+ val= (double) (rows - nulls) / distincts;
+ else
+ {
+ if (nulls == 1)
+ distincts_single_occurence+= 1;
+ if (nulls)
+ distincts+= 1;
+ double fraction_single_occurence=
+ static_cast<double>(distincts_single_occurence) / rows;
+ double total_number_of_rows= rows / sample_fraction;
+ double estimate_total_distincts= total_number_of_rows /
+ (distincts /
+ (1.0 - (1.0 - sample_fraction) * fraction_single_occurence));
+ val = std::fmax(estimate_total_distincts * (rows - nulls) / rows, 1.0);
+ }
+
+ set_avg_frequency(val);
set_not_null(COLUMN_STAT_AVG_FREQUENCY);
}
else
@@ -2750,12 +2792,28 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
Field *table_field;
ha_rows rows= 0;
handler *file=table->file;
+ double sample_fraction= thd->variables.sample_percentage / 100;
+ const ha_rows MIN_THRESHOLD_FOR_SAMPLING= 50000;
DBUG_ENTER("collect_statistics_for_table");
table->collected_stats->cardinality_is_null= TRUE;
table->collected_stats->cardinality= 0;
+ if (thd->variables.sample_percentage == 0)
+ {
+ if (file->records() < MIN_THRESHOLD_FOR_SAMPLING)
+ {
+ sample_fraction= 1;
+ }
+ else
+ {
+ sample_fraction= std::fmin(
+ (MIN_THRESHOLD_FOR_SAMPLING + 4096 *
+ log(200 * file->records())) / file->records(), 1);
+ }
+ }
+
for (field_ptr= table->field; *field_ptr; field_ptr++)
{
table_field= *field_ptr;
@@ -2768,7 +2826,7 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
/* Perform a full table scan to collect statistics on 'table's columns */
if (!(rc= file->ha_rnd_init(TRUE)))
- {
+ {
DEBUG_SYNC(table->in_use, "statistics_collection_start");
while ((rc= file->ha_rnd_next(table->record[0])) != HA_ERR_END_OF_FILE)
@@ -2779,17 +2837,20 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
if (rc)
break;
- for (field_ptr= table->field; *field_ptr; field_ptr++)
+ if (thd_rnd(thd) <= sample_fraction)
{
- table_field= *field_ptr;
- if (!bitmap_is_set(table->read_set, table_field->field_index))
- continue;
- if ((rc= table_field->collected_stats->add(rows)))
+ for (field_ptr= table->field; *field_ptr; field_ptr++)
+ {
+ table_field= *field_ptr;
+ if (!bitmap_is_set(table->read_set, table_field->field_index))
+ continue;
+ if ((rc= table_field->collected_stats->add()))
+ break;
+ }
+ if (rc)
break;
+ rows++;
}
- if (rc)
- break;
- rows++;
}
file->ha_rnd_end();
}
@@ -2803,7 +2864,8 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
if (!rc)
{
table->collected_stats->cardinality_is_null= FALSE;
- table->collected_stats->cardinality= rows;
+ table->collected_stats->cardinality=
+ static_cast<ha_rows>(rows / sample_fraction);
}
bitmap_clear_all(table->write_set);
@@ -2814,7 +2876,7 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
continue;
bitmap_set_bit(table->write_set, table_field->field_index);
if (!rc)
- table_field->collected_stats->finish(rows);
+ table_field->collected_stats->finish(rows, sample_fraction);
else
table_field->collected_stats->cleanup();
}
@@ -3280,7 +3342,6 @@ int read_statistics_for_tables_if_needed(THD *thd, TABLE_LIST *tables)
{
TABLE_LIST stat_tables[STATISTICS_TABLES];
Open_tables_backup open_tables_backup;
-
DBUG_ENTER("read_statistics_for_tables_if_needed");
DEBUG_SYNC(thd, "statistics_read_start");
@@ -3289,10 +3350,7 @@ int read_statistics_for_tables_if_needed(THD *thd, TABLE_LIST *tables)
DBUG_RETURN(0);
if (open_stat_tables(thd, stat_tables, &open_tables_backup, FALSE))
- {
- thd->clear_error();
DBUG_RETURN(1);
- }
for (TABLE_LIST *tl= tables; tl; tl= tl->next_global)
{
@@ -3345,7 +3403,7 @@ int read_statistics_for_tables_if_needed(THD *thd, TABLE_LIST *tables)
'db' from all statistical tables: table_stats, column_stats, index_stats.
@retval
- 0 If all deletions are successful
+ 0 If all deletions are successful or we couldn't open statistics table
@retval
1 Otherwise
@@ -3353,7 +3411,8 @@ int read_statistics_for_tables_if_needed(THD *thd, TABLE_LIST *tables)
The function is called when executing the statement DROP TABLE 'tab'.
*/
-int delete_statistics_for_table(THD *thd, const LEX_CSTRING *db, const LEX_CSTRING *tab)
+int delete_statistics_for_table(THD *thd, const LEX_CSTRING *db,
+ const LEX_CSTRING *tab)
{
int err;
enum_binlog_format save_binlog_format;
@@ -3361,11 +3420,10 @@ int delete_statistics_for_table(THD *thd, const LEX_CSTRING *db, const LEX_CSTRI
TABLE_LIST tables[STATISTICS_TABLES];
Open_tables_backup open_tables_backup;
int rc= 0;
-
DBUG_ENTER("delete_statistics_for_table");
if (open_stat_tables(thd, tables, &open_tables_backup, TRUE))
- DBUG_RETURN(rc);
+ DBUG_RETURN(0);
save_binlog_format= thd->set_current_stmt_binlog_format_stmt();
@@ -3418,21 +3476,16 @@ int delete_statistics_for_table(THD *thd, const LEX_CSTRING *db, const LEX_CSTRI
@brief
Delete statistics on a column of the specified table
- @param
- thd The thread handle
- @param
- tab The table the column belongs to
- @param
- col The field of the column whose statistics is to be deleted
+ @param thd The thread handle
+ @param tab The table the column belongs to
+ @param col The field of the column whose statistics is to be deleted
@details
The function delete statistics on the column 'col' belonging to the table
'tab' from the statistical table column_stats.
- @retval
- 0 If the deletion is successful
- @retval
- 1 Otherwise
+ @retval 0 If all deletions are successful or we couldn't open statistics table
+ @retval 1 Otherwise
@note
The function is called when dropping a table column or when changing
@@ -3447,15 +3500,11 @@ int delete_statistics_for_column(THD *thd, TABLE *tab, Field *col)
TABLE_LIST tables;
Open_tables_backup open_tables_backup;
int rc= 0;
-
DBUG_ENTER("delete_statistics_for_column");
- if (open_single_stat_table(thd, &tables, &stat_table_name[1],
- &open_tables_backup, TRUE))
- {
- thd->clear_error();
- DBUG_RETURN(rc);
- }
+ if (open_stat_table_for_ddl(thd, &tables, &stat_table_name[1],
+ &open_tables_backup))
+ DBUG_RETURN(0);
save_binlog_format= thd->set_current_stmt_binlog_format_stmt();
@@ -3481,24 +3530,18 @@ int delete_statistics_for_column(THD *thd, TABLE *tab, Field *col)
@brief
Delete statistics on an index of the specified table
- @param
- thd The thread handle
- @param
- tab The table the index belongs to
- @param
- key_info The descriptor of the index whose statistics is to be deleted
- @param
- ext_prefixes_only Delete statistics only on the index prefixes extended by
- the components of the primary key
+ @param thd The thread handle
+ @param tab The table the index belongs to
+ @param key_info The descriptor of the index whose statistics is to be deleted
+ @param ext_prefixes_only Delete statistics only on the index prefixes
+ extended by the components of the primary key
@details
The function delete statistics on the index specified by 'key_info'
defined on the table 'tab' from the statistical table index_stats.
- @retval
- 0 If the deletion is successful
- @retval
- 1 Otherwise
+ @retval 0 If all deletions are successful or we couldn't open statistics table
+ @retval 1 Otherwise
@note
The function is called when dropping an index, or dropping/changing the
@@ -3514,15 +3557,11 @@ int delete_statistics_for_index(THD *thd, TABLE *tab, KEY *key_info,
TABLE_LIST tables;
Open_tables_backup open_tables_backup;
int rc= 0;
-
DBUG_ENTER("delete_statistics_for_index");
- if (open_single_stat_table(thd, &tables, &stat_table_name[2],
- &open_tables_backup, TRUE))
- {
- thd->clear_error();
- DBUG_RETURN(rc);
- }
+ if (open_stat_table_for_ddl(thd, &tables, &stat_table_name[2],
+ &open_tables_backup))
+ DBUG_RETURN(0);
save_binlog_format= thd->set_current_stmt_binlog_format_stmt();
@@ -3583,7 +3622,7 @@ int delete_statistics_for_index(THD *thd, TABLE *tab, KEY *key_info,
index_stats.
@retval
- 0 If all updates of the table name are successful
+ 0 If all updates of the table name are successful
@retval
1 Otherwise
@@ -3591,8 +3630,10 @@ int delete_statistics_for_index(THD *thd, TABLE *tab, KEY *key_info,
The function is called when executing any statement that renames a table
*/
-int rename_table_in_stat_tables(THD *thd, const LEX_CSTRING *db, const LEX_CSTRING *tab,
- const LEX_CSTRING *new_db, const LEX_CSTRING *new_tab)
+int rename_table_in_stat_tables(THD *thd, const LEX_CSTRING *db,
+ const LEX_CSTRING *tab,
+ const LEX_CSTRING *new_db,
+ const LEX_CSTRING *new_tab)
{
int err;
enum_binlog_format save_binlog_format;
@@ -3603,7 +3644,9 @@ int rename_table_in_stat_tables(THD *thd, const LEX_CSTRING *db, const LEX_CSTRI
DBUG_ENTER("rename_table_in_stat_tables");
if (open_stat_tables(thd, tables, &open_tables_backup, TRUE))
+ {
DBUG_RETURN(0); // not an error
+ }
save_binlog_format= thd->set_current_stmt_binlog_format_stmt();
@@ -3660,26 +3703,19 @@ int rename_table_in_stat_tables(THD *thd, const LEX_CSTRING *db, const LEX_CSTRI
/**
- @brief
Rename a column in the statistical table column_stats
- @param
- thd The thread handle
- @param
- tab The table the column belongs to
- @param
- col The column to be renamed
- @param
- new_name The new column name
+ @param thd The thread handle
+ @param tab The table the column belongs to
+ @param col The column to be renamed
+ @param new_name The new column name
@details
The function replaces the name of the column 'col' belonging to the table
'tab' for 'new_name' in the statistical table column_stats.
- @retval
- 0 If all updates of the table name are successful
- @retval
- 1 Otherwise
+ @retval 0 If all updates of the table name are successful
+ @retval 1 Otherwise
@note
The function is called when executing any statement that renames a column,
@@ -3695,18 +3731,14 @@ int rename_column_in_stat_tables(THD *thd, TABLE *tab, Field *col,
TABLE_LIST tables;
Open_tables_backup open_tables_backup;
int rc= 0;
-
DBUG_ENTER("rename_column_in_stat_tables");
if (tab->s->tmp_table != NO_TMP_TABLE)
DBUG_RETURN(0);
- if (open_single_stat_table(thd, &tables, &stat_table_name[1],
- &open_tables_backup, TRUE))
- {
- thd->clear_error();
+ if (open_stat_table_for_ddl(thd, &tables, &stat_table_name[1],
+ &open_tables_backup))
DBUG_RETURN(rc);
- }
save_binlog_format= thd->set_current_stmt_binlog_format_stmt();
@@ -3748,9 +3780,8 @@ void set_statistics_for_table(THD *thd, TABLE *table)
{
TABLE_STATISTICS_CB *stats_cb= &table->s->stats_cb;
Table_statistics *read_stats= stats_cb->table_stats;
- Use_stat_tables_mode use_stat_table_mode= get_use_stat_tables_mode(thd);
table->used_stat_records=
- (use_stat_table_mode <= COMPLEMENTARY ||
+ (!check_eits_preferred(thd) ||
!table->stats_is_read || read_stats->cardinality_is_null) ?
table->file->stats.records : read_stats->cardinality;
@@ -3774,7 +3805,7 @@ void set_statistics_for_table(THD *thd, TABLE *table)
key_info < key_info_end; key_info++)
{
key_info->is_statistics_from_stat_tables=
- (use_stat_table_mode > COMPLEMENTARY &&
+ (check_eits_preferred(thd) &&
table->stats_is_read &&
key_info->read_stats->avg_frequency_is_inited() &&
key_info->read_stats->get_avg_frequency(0) > 0.5);