summaryrefslogtreecommitdiff
path: root/sql/sql_statistics.cc
diff options
context:
space:
mode:
authorIgor Babaev <igor@askmonty.org>2013-03-25 23:48:29 -0700
committerIgor Babaev <igor@askmonty.org>2013-03-25 23:48:29 -0700
commit1009832c13380365c03f77fcabd0fda470b73390 (patch)
tree73e123df951d60220a4cb0cac2ca19b2ebff7056 /sql/sql_statistics.cc
parentfc1c8ffdadfd14eb51969ecfde43e3204f10f6f8 (diff)
downloadmariadb-git-1009832c13380365c03f77fcabd0fda470b73390.tar.gz
Added histogams for table columns.
Diffstat (limited to 'sql/sql_statistics.cc')
-rw-r--r--sql/sql_statistics.cc326
1 files changed, 294 insertions, 32 deletions
diff --git a/sql/sql_statistics.cc b/sql/sql_statistics.cc
index 5338632067a..8c0b2730b02 100644
--- a/sql/sql_statistics.cc
+++ b/sql/sql_statistics.cc
@@ -889,7 +889,7 @@ public:
char buff[MAX_FIELD_WIDTH];
String val(buff, sizeof(buff), &my_charset_utf8_bin);
- for (uint i= COLUMN_STAT_MIN_VALUE; i <= COLUMN_STAT_AVG_FREQUENCY; i++)
+ for (uint i= COLUMN_STAT_MIN_VALUE; i <= COLUMN_STAT_HISTOGRAM; i++)
{
Field *stat_field= stat_table->field[i];
if (table_field->collected_stats->is_null(i))
@@ -924,7 +924,17 @@ public:
break;
case COLUMN_STAT_AVG_FREQUENCY:
stat_field->store(table_field->collected_stats->get_avg_frequency());
- break;
+ break;
+ case COLUMN_STAT_HIST_SIZE:
+ stat_field->store(table_field->collected_stats->histogram.get_size());
+ break;
+ case COLUMN_STAT_HISTOGRAM:
+ const char * col_histogram=
+ (const char *) (table_field->collected_stats->histogram.get_values());
+ stat_field->store(col_histogram,
+ table_field->collected_stats->histogram.get_size(),
+ &my_charset_bin);
+ break;
}
}
}
@@ -961,7 +971,7 @@ public:
char buff[MAX_FIELD_WIDTH];
String val(buff, sizeof(buff), &my_charset_utf8_bin);
- for (uint i= COLUMN_STAT_MIN_VALUE; i <= COLUMN_STAT_AVG_FREQUENCY; i++)
+ for (uint i= COLUMN_STAT_MIN_VALUE; i <= COLUMN_STAT_HIST_SIZE; i++)
{
Field *stat_field= stat_table->field[i];
@@ -993,6 +1003,9 @@ public:
break;
case COLUMN_STAT_AVG_FREQUENCY:
table_field->read_stats->set_avg_frequency(stat_field->val_real());
+ break;
+ case COLUMN_STAT_HIST_SIZE:
+ table_field->read_stats->histogram.set_size(stat_field->val_int());
break;
}
}
@@ -1000,6 +1013,21 @@ public:
}
}
+ void get_histogram_value()
+ {
+ if (find_stat())
+ {
+ char buff[MAX_FIELD_WIDTH];
+ String val(buff, sizeof(buff), &my_charset_utf8_bin);
+ uint fldno= COLUMN_STAT_HISTOGRAM;
+ Field *stat_field= stat_table->field[fldno];
+ table_field->read_stats->set_not_null(fldno);
+ stat_field->val_str(&val);
+ memcpy(table_field->read_stats->histogram.get_values(),
+ val.ptr(), table_field->read_stats->histogram.get_size());
+ }
+ }
+
};
@@ -1202,6 +1230,72 @@ public:
};
+class Histogram_builder
+{
+ Field *column;
+ uint col_length;
+ ha_rows records;
+ Field *min_value;
+ Field *max_value;
+ Histogram *histogram;
+ uint hist_size;
+ double bucket_capacity;
+ uint curr_bucket;
+ ulonglong count;
+ ulonglong count_distinct;
+
+public:
+ Histogram_builder(Field *col, uint col_len, ha_rows rows)
+ : column(col), col_length(col_len), records(rows)
+ {
+ Column_statistics *col_stats= col->collected_stats;
+ min_value= col_stats->min_value;
+ max_value= col_stats->max_value;
+ histogram= &col_stats->histogram;
+ hist_size= histogram->get_size();
+ bucket_capacity= (double) records / (hist_size + 1);
+ curr_bucket= 0;
+ count= 0;
+ count_distinct= 0;
+ }
+
+ ulonglong get_count_distinct() { return count_distinct; }
+
+ int next(void *elem, element_count elem_cnt)
+ {
+ count_distinct++;
+ count+= elem_cnt;
+ if (curr_bucket == hist_size)
+ return 0;
+ if (count > bucket_capacity * (curr_bucket + 1))
+ {
+ column->store_field_value((uchar *) elem, col_length);
+ histogram->set_value(curr_bucket,
+ column->middle_point_pos(min_value, max_value));
+ curr_bucket++;
+ while (curr_bucket != hist_size &&
+ count > bucket_capacity * (curr_bucket + 1))
+ {
+ histogram->set_prev_value(curr_bucket);
+ curr_bucket++;
+ }
+ }
+ return 0;
+ }
+};
+
+
+C_MODE_START
+
+int histogram_build_walk(void *elem, element_count elem_cnt, void *arg)
+{
+ Histogram_builder *hist_builder= (Histogram_builder *) arg;
+ return hist_builder->next(elem, elem_cnt);
+}
+
+C_MODE_END
+
+
/*
The class Count_distinct_field is a helper class used to calculate
the number of distinct values for a column. The class employs the
@@ -1221,6 +1315,8 @@ protected:
uint tree_key_length; /* The length of the keys for the elements of 'tree */
public:
+
+ Count_distinct_field() {}
/**
@param
@@ -1239,27 +1335,10 @@ public:
Count_distinct_field(Field *field, uint max_heap_table_size)
{
- qsort_cmp2 compare_key;
- void* cmp_arg;
- enum enum_field_types f_type= field->type();
-
table_field= field;
tree_key_length= field->pack_length();
- if ((f_type == MYSQL_TYPE_VARCHAR) ||
- (!field->binary() && (f_type == MYSQL_TYPE_STRING ||
- f_type == MYSQL_TYPE_VAR_STRING)))
- {
- compare_key= (qsort_cmp2) simple_str_key_cmp;
- cmp_arg= (void*) field;
- }
- else
- {
- cmp_arg= (void*) &tree_key_length;
- compare_key= (qsort_cmp2) simple_raw_key_cmp;
- }
-
- tree= new Unique(compare_key, cmp_arg,
+ tree= new Unique((qsort_cmp2) simple_str_key_cmp, (void*) field,
tree_key_length, max_heap_table_size);
}
@@ -1300,9 +1379,36 @@ public:
tree->walk(table_field->table, count_distinct_walk, (void*) &count);
return count;
}
+
+ ulonglong get_value_with_histogram(ha_rows rows)
+ {
+ Histogram_builder hist_builder(table_field, tree_key_length, rows);
+ tree->walk(table_field->table, histogram_build_walk, (void *) &hist_builder);
+ return hist_builder.get_count_distinct();
+ }
+
+ uint get_hist_size()
+ {
+ return table_field->collected_stats->histogram.get_size();
+ }
+
+ uchar *get_histogram()
+ {
+ return table_field->collected_stats->histogram.get_values();
+ }
+
};
+static
+int simple_ulonglong_key_cmp(void* arg, uchar* key1, uchar* key2)
+{
+ ulonglong *val1= (ulonglong *) key1;
+ ulonglong *val2= (ulonglong *) key2;
+ return *val1 > *val2 ? 1 : *val1 == *val2 ? 0 : -1;
+}
+
+
/*
The class Count_distinct_field_bit is derived from the class
Count_distinct_field to be used only for fields of the MYSQL_TYPE_BIT type.
@@ -1312,8 +1418,17 @@ public:
class Count_distinct_field_bit: public Count_distinct_field
{
public:
+
Count_distinct_field_bit(Field *field, uint max_heap_table_size)
- :Count_distinct_field(field, max_heap_table_size) {}
+ {
+ table_field= field;
+ tree_key_length= sizeof(ulonglong);
+
+ tree= new Unique((qsort_cmp2) simple_ulonglong_key_cmp,
+ (void*) &tree_key_length,
+ tree_key_length, max_heap_table_size);
+ }
+
bool add()
{
longlong val= table_field->val_int();
@@ -1672,13 +1787,26 @@ int alloc_statistics_for_table(THD* thd, TABLE *table)
ulong *idx_avg_frequency= (ulong*) alloc_root(&table->mem_root,
sizeof(ulong) * key_parts);
- if (!table_stats || !column_stats || !index_stats || !idx_avg_frequency)
+ uint columns= 0;
+ for (field_ptr= table->field; *field_ptr; field_ptr++)
+ {
+ if (bitmap_is_set(table->read_set, (*field_ptr)->field_index))
+ columns++;
+ }
+ uint hist_size= thd->variables.histogram_size;
+ uchar *histogram= NULL;
+ if (hist_size > 0)
+ histogram= (uchar *) alloc_root(&table->mem_root, hist_size * columns);
+
+ if (!table_stats || !column_stats || !index_stats || !idx_avg_frequency ||
+ (hist_size && !histogram))
DBUG_RETURN(1);
table->collected_stats= table_stats;
table_stats->column_stats= column_stats;
table_stats->index_stats= index_stats;
table_stats->idx_avg_frequency= idx_avg_frequency;
+ table_stats->histograms= histogram;
memset(column_stats, 0, sizeof(Column_statistics) * (fields+1));
@@ -1687,6 +1815,12 @@ int alloc_statistics_for_table(THD* thd, TABLE *table)
(*field_ptr)->collected_stats= column_stats;
(*field_ptr)->collected_stats->max_value= NULL;
(*field_ptr)->collected_stats->min_value= NULL;
+ if (bitmap_is_set(table->read_set, (*field_ptr)->field_index))
+ {
+ column_stats->histogram.set_size(hist_size);
+ column_stats->histogram.set_values(histogram);
+ histogram+= hist_size;
+ }
}
memset(idx_avg_frequency, 0, sizeof(ulong) * key_parts);
@@ -1903,10 +2037,51 @@ int alloc_statistics_for_table_share(THD* thd, TABLE_SHARE *table_share,
if (!is_safe)
mysql_mutex_unlock(&table_share->LOCK_ha_data);
-
DBUG_RETURN(0);
}
+static
+int alloc_histograms_for_table_share(THD* thd, TABLE_SHARE *table_share,
+ bool is_safe)
+{
+ TABLE_STATISTICS_CB *stats_cb= &table_share->stats_cb;
+
+ DBUG_ENTER("alloc_histograms_for_table_share");
+
+ if (!is_safe)
+ mysql_mutex_lock(&table_share->LOCK_ha_data);
+
+ if (stats_cb->histograms_can_be_read)
+ {
+ if (!is_safe)
+ mysql_mutex_unlock(&table_share->LOCK_ha_data);
+ DBUG_RETURN(0);
+ }
+
+ Table_statistics *table_stats= stats_cb->table_stats;
+ ulong total_hist_size= table_stats->total_hist_size;
+
+ if (total_hist_size && !table_stats->histograms)
+ {
+ uchar *histograms= (uchar *) alloc_root(&stats_cb->mem_root,
+ total_hist_size);
+ if (!histograms)
+ {
+ if (!is_safe)
+ mysql_mutex_unlock(&table_share->LOCK_ha_data);
+ DBUG_RETURN(1);
+ }
+ memset(histograms, 0, total_hist_size);
+ table_stats->histograms= histograms;
+ stats_cb->histograms_can_be_read= TRUE;
+ }
+
+ if (!is_safe)
+ mysql_mutex_unlock(&table_share->LOCK_ha_data);
+
+ DBUG_RETURN(0);
+
+}
/**
@brief
@@ -2006,14 +2181,28 @@ void Column_statistics_collected::finish(ha_rows rows)
set_not_null(COLUMN_STAT_AVG_LENGTH);
}
if (count_distinct)
- {
- ulonglong distincts= count_distinct->get_value();
+ {
+ ulonglong distincts;
+ uint hist_size= count_distinct->get_hist_size();
+ if (hist_size == 0)
+ distincts= count_distinct->get_value();
+ else
+ distincts= count_distinct->get_value_with_histogram(rows - nulls);
if (distincts)
{
val= (double) (rows - nulls) / distincts;
set_avg_frequency(val);
set_not_null(COLUMN_STAT_AVG_FREQUENCY);
}
+ else
+ hist_size= 0;
+ histogram.set_size(hist_size);
+ set_not_null(COLUMN_STAT_HIST_SIZE);
+ if (hist_size && distincts)
+ {
+ histogram.set_values(count_distinct->get_histogram());
+ set_not_null(COLUMN_STAT_HISTOGRAM);
+ }
delete count_distinct;
count_distinct= NULL;
}
@@ -2234,16 +2423,19 @@ int collect_statistics_for_table(THD *thd, TABLE *table)
table->collected_stats->cardinality= rows;
}
+ bitmap_clear_all(table->write_set);
for (field_ptr= table->field; *field_ptr; field_ptr++)
{
table_field= *field_ptr;
if (!bitmap_is_set(table->read_set, table_field->field_index))
continue;
+ bitmap_set_bit(table->write_set, table_field->field_index);
if (!rc)
table_field->collected_stats->finish(rows);
else
table_field->collected_stats->cleanup();
}
+bitmap_clear_all(table->write_set);
if (!rc)
{
@@ -2421,6 +2613,7 @@ int read_statistics_for_table(THD *thd, TABLE *table, TABLE_LIST *stat_tables)
Field **field_ptr;
KEY *key_info, *key_info_end;
TABLE_SHARE *table_share= table->s;
+ Table_statistics *read_stats= table_share->stats_cb.table_stats;
DBUG_ENTER("read_statistics_for_table");
@@ -2432,16 +2625,18 @@ int read_statistics_for_table(THD *thd, TABLE *table, TABLE_LIST *stat_tables)
/* Read statistics from the statistical table column_stats */
stat_table= stat_tables[COLUMN_STAT].table;
+ ulong total_hist_size= 0;
Column_stat column_stat(stat_table, table);
for (field_ptr= table_share->field; *field_ptr; field_ptr++)
{
table_field= *field_ptr;
column_stat.set_key_fields(table_field);
column_stat.get_stat_values();
+ total_hist_size+= table_field->read_stats->histogram.get_size();
}
+ read_stats->total_hist_size= total_hist_size;
/* Read statistics from the statistical table index_stats */
- Table_statistics *read_stats= table_share->stats_cb.table_stats;
stat_table= stat_tables[INDEX_STAT].table;
Index_stat index_stat(stat_table, table);
for (key_info= table_share->key_info,
@@ -2559,10 +2754,14 @@ bool statistics_for_tables_is_needed(THD *thd, TABLE_LIST *tables)
TABLE_SHARE *table_share= tl->table->s;
if (table_share &&
table_share->stats_cb.stats_can_be_read &&
- !table_share->stats_cb.stats_is_read)
+ (!table_share->stats_cb.stats_is_read ||
+ (!table_share->stats_cb.histograms_are_read &&
+ thd->variables.optimizer_use_condition_selectivity > 3)))
return TRUE;
if (table_share->stats_cb.stats_is_read)
tl->table->stats_is_read= TRUE;
+ if (table_share->stats_cb.histograms_are_read)
+ tl->table->histograms_are_read= TRUE;
}
}
@@ -2570,6 +2769,41 @@ bool statistics_for_tables_is_needed(THD *thd, TABLE_LIST *tables)
}
+static
+int read_histograms_for_table(THD *thd, TABLE *table, TABLE_LIST *stat_tables)
+{
+ TABLE_SHARE *table_share= table->s;
+
+ DBUG_ENTER("read_histograms_for_table");
+
+ if (!table_share->stats_cb.histograms_can_be_read)
+ {
+ (void) alloc_histograms_for_table_share(thd, table_share, FALSE);
+ }
+ if (table_share->stats_cb.histograms_can_be_read &&
+ !table_share->stats_cb.histograms_are_read)
+ {
+ Field **field_ptr;
+ uchar *histogram= table_share->stats_cb.table_stats->histograms;
+ TABLE *stat_table= stat_tables[COLUMN_STAT].table;
+ Column_stat column_stat(stat_table, table);
+ for (field_ptr= table_share->field; *field_ptr; field_ptr++)
+ {
+ Field *table_field= *field_ptr;
+ uint hist_size= table_field->read_stats->histogram.get_size();
+ if (hist_size)
+ {
+ column_stat.set_key_fields(table_field);
+ table_field->read_stats->histogram.set_values(histogram);
+ column_stat.get_histogram_value();
+ histogram+= hist_size;
+ }
+ }
+ }
+
+ DBUG_RETURN(0);
+}
+
/**
@brief
Read statistics for tables from a table list if it is needed
@@ -2597,7 +2831,7 @@ int read_statistics_for_tables_if_needed(THD *thd, TABLE_LIST *tables)
TABLE_LIST stat_tables[STATISTICS_TABLES];
Open_tables_backup open_tables_backup;
- DBUG_ENTER("read_statistics_for_table_if_needed");
+ DBUG_ENTER("read_statistics_for_tables_if_needed");
DEBUG_SYNC(thd, "statistics_read_start");
@@ -2624,6 +2858,14 @@ int read_statistics_for_tables_if_needed(THD *thd, TABLE_LIST *tables)
}
if (table_share->stats_cb.stats_is_read)
tl->table->stats_is_read= TRUE;
+ if (thd->variables.optimizer_use_condition_selectivity > 3 &&
+ table_share && !table_share->stats_cb.histograms_are_read)
+ {
+ (void) read_histograms_for_table(thd, tl->table, stat_tables);
+ table_share->stats_cb.histograms_are_read= TRUE;
+ }
+ if (table_share->stats_cb.stats_is_read)
+ tl->table->histograms_are_read= TRUE;
}
}
@@ -3083,20 +3325,40 @@ double get_column_range_cardinality(Field *field,
res= table->stat_records();
else if (min_endp->length == max_endp->length &&
!memcmp(min_endp->key, max_endp->key, min_endp->length))
- {
- res= col_stats->get_avg_frequency();
+ {
+ double avg_frequency= col_stats->get_avg_frequency();
+ res= avg_frequency;
+ if (avg_frequency > 1.0 + 0.000001 &&
+ col_stats->min_value && col_stats->max_value)
+ {
+ Histogram *hist= &col_stats->histogram;
+ if (hist->get_size() > 0)
+ {
+ double pos= field->middle_point_pos(col_stats->min_value,
+ col_stats->max_value);
+ res= table->stat_records() *
+ hist->point_selectivity(pos,
+ avg_frequency / table->stat_records());
+ }
+ }
}
else
{
if (col_stats->min_value && col_stats->max_value)
{
+ double sel;
store_key_image_to_rec(field, (uchar *) min_endp->key, min_endp->length);
double min_mp_pos= field->middle_point_pos(col_stats->min_value,
col_stats->max_value);
store_key_image_to_rec(field, (uchar *) max_endp->key, max_endp->length);
double max_mp_pos= field->middle_point_pos(col_stats->min_value,
col_stats->max_value);
- res= table->stat_records() * (max_mp_pos - min_mp_pos);
+ Histogram *hist= &col_stats->histogram;
+ if (hist->get_size() == 0)
+ sel= (max_mp_pos - min_mp_pos);
+ else
+ sel= hist->range_selectivity(min_mp_pos, max_mp_pos);
+ res= table->stat_records() * sel;
}
else
res= table->stat_records();