-rw-r--r-- | mysql-test/r/sum_distinct.result | 203
-rw-r--r-- | mysql-test/t/sum_distinct.test   | 188
-rw-r--r-- | sql/filesort.cc                  |  56
-rw-r--r-- | sql/item_sum.cc                  | 121
-rw-r--r-- | sql/item_sum.h                   |  39
-rw-r--r-- | sql/sql_class.h                  |  16
-rw-r--r-- | sql/sql_sort.h                   |   1
-rw-r--r-- | sql/sql_yacc.yy                  |  11
-rw-r--r-- | sql/uniques.cc                   | 234
9 files changed, 834 insertions, 35 deletions
diff --git a/mysql-test/r/sum_distinct.result b/mysql-test/r/sum_distinct.result
new file mode 100644
index 00000000000..c7f1a660267
--- /dev/null
+++ b/mysql-test/r/sum_distinct.result
@@ -0,0 +1,203 @@
+DROP TABLE IF EXISTS t1;
+CREATE TABLE t1 (
+id INTEGER NOT NULL PRIMARY KEY AUTO_INCREMENT,
+gender CHAR(1),
+name VARCHAR(20)
+);
+SELECT SUM(DISTINCT LENGTH(name)) s1 FROM t1;
+s1
+NULL
+INSERT INTO t1 (gender, name) VALUES (NULL, NULL);
+INSERT INTO t1 (gender, name) VALUES (NULL, NULL);
+INSERT INTO t1 (gender, name) VALUES (NULL, NULL);
+SELECT SUM(DISTINCT LENGTH(name)) s1 FROM t1;
+s1
+NULL
+INSERT INTO t1 (gender, name) VALUES ('F', 'Helen'), ('F', 'Anastasia'),
+('F', 'Katherine'), ('F', 'Margo'), ('F', 'Magdalene'), ('F', 'Mary');
+CREATE TABLE t2 SELECT name FROM t1;
+SELECT (SELECT SUM(DISTINCT LENGTH(name)) FROM t1) FROM t2;
+(SELECT SUM(DISTINCT LENGTH(name)) FROM t1)
+18
+18
+18
+18
+18
+18
+18
+18
+18
+DROP TABLE t2;
+INSERT INTO t1 (gender, name) VALUES ('F', 'Eva'), ('F', 'Sofia'),
+('F', 'Sara'), ('F', 'Golda'), ('F', 'Toba'), ('F', 'Victory'),
+('F', 'Faina'), ('F', 'Miriam'), ('F', 'Beki'), ('F', 'America'),
+('F', 'Susan'), ('F', 'Glory'), ('F', 'Priscilla'), ('F', 'Rosmary'),
+('F', 'Rose'), ('F', 'Margareth'), ('F', 'Elizabeth'), ('F', 'Meredith'),
+('F', 'Julie'), ('F', 'Xenia'), ('F', 'Zena'), ('F', 'Olga'),
+('F', 'Brunhilda'), ('F', 'Nataly'), ('F', 'Lara'), ('F', 'Svetlana'),
+('F', 'Grethem'), ('F', 'Irene');
+SELECT
+SUM(DISTINCT LENGTH(name)) s1,
+SUM(DISTINCT SUBSTRING(NAME, 1, 3)) s2,
+SUM(DISTINCT LENGTH(SUBSTRING(name, 1, 4))) s3
+FROM t1;
+s1 s2 s3
+42 0 7
+SELECT
+SUM(DISTINCT LENGTH(g1.name)) s1,
+SUM(DISTINCT SUBSTRING(g2.name, 1, 3)) s2,
+SUM(DISTINCT LENGTH(SUBSTRING(g3.name, 1, 4))) s3
+FROM t1 g1, t1 g2, t1 g3;
+s1 s2 s3
+42 0 7
+SELECT
+SUM(DISTINCT LENGTH(g1.name)) s1,
+SUM(DISTINCT SUBSTRING(g2.name, 1, 3)) s2,
+SUM(DISTINCT LENGTH(SUBSTRING(g3.name, 1, 4))) s3
+FROM t1 g1, t1 g2, t1 g3 GROUP BY LENGTH(SUBSTRING(g3.name, 5, 10));
+s1 s2 s3
+42 0 NULL
+42 0 7
+42 0 4
+42 0 4
+42 0 4
+42 0 4
+42 0 4
+SELECT SQL_BUFFER_RESULT
+SUM(DISTINCT LENGTH(name)) s1,
+SUM(DISTINCT SUBSTRING(NAME, 1, 3)) s2,
+SUM(DISTINCT LENGTH(SUBSTRING(name, 1, 4))) s3
+FROM t1;
+s1 s2 s3
+42 0 7
+SELECT SQL_BUFFER_RESULT
+SUM(DISTINCT LENGTH(g1.name)) s1,
+SUM(DISTINCT SUBSTRING(g2.name, 1, 3)) s2,
+SUM(DISTINCT LENGTH(SUBSTRING(g3.name, 1, 4))) s3
+FROM t1 g1, t1 g2, t1 g3 GROUP BY LENGTH(SUBSTRING(g3.name, 5, 10));
+s1 s2 s3
+42 0 NULL
+42 0 7
+42 0 4
+42 0 4
+42 0 4
+42 0 4
+42 0 4
+SET @l=1;
+UPDATE t1 SET name=CONCAT(name, @l:=@l+1);
+SELECT SUM(DISTINCT RIGHT(name, 1)) FROM t1;
+SUM(DISTINCT RIGHT(name, 1))
+45
+SELECT SUM(DISTINCT id) FROM t1;
+SUM(DISTINCT id)
+703
+SELECT SUM(DISTINCT id % 11) FROM t1;
+SUM(DISTINCT id % 11)
+55
+DROP TABLE t1;
+CREATE TABLE t1 (id INTEGER);
+CREATE TABLE t2 (id INTEGER);
+INSERT INTO t1 (id) VALUES (1), (1), (1),(1);
+INSERT INTO t2 (id) SELECT id FROM t1;
+INSERT INTO t1 (id) SELECT id FROM t2;
+/* 8 */
+INSERT INTO t1 (id) SELECT id FROM t2;
+/* 12 */
+INSERT INTO t1 (id) SELECT id FROM t2;
+/* 16 */
+INSERT INTO t1 (id) SELECT id FROM t2;
+/* 20 */
+INSERT INTO t1 (id) SELECT id FROM t2;
+/* 24 */
+DELETE FROM t2;
+INSERT INTO t2 (id) SELECT id+1 FROM t1;
+INSERT INTO t1 SELECT id FROM t2;
+DELETE FROM t2;
+INSERT INTO t2 (id) SELECT id+2 FROM t1;
+INSERT INTO t1 SELECT id FROM t2;
+DELETE FROM t2;
+INSERT INTO t2 (id) SELECT id+4 FROM t1;
+INSERT INTO t1 SELECT id FROM t2;
+DELETE FROM t2;
+INSERT INTO t2 (id) SELECT id+8 FROM t1;
+INSERT INTO t1 SELECT id FROM t2;
+DELETE FROM t2;
+INSERT INTO t2 (id) SELECT id+16 FROM t1;
+INSERT INTO t1 SELECT id FROM t2;
+DELETE FROM t2;
+INSERT INTO t2 (id) SELECT id+32 FROM t1;
+INSERT INTO t1 SELECT id FROM t2;
+DELETE FROM t2;
+INSERT INTO t2 (id) SELECT id+64 FROM t1;
+INSERT INTO t1 SELECT id FROM t2;
+DELETE FROM t2;
+INSERT INTO t2 (id) SELECT id+128 FROM t1;
+INSERT INTO t1 SELECT id FROM t2;
+DELETE FROM t2;
+INSERT INTO t2 (id) SELECT id+256 FROM t1;
+INSERT INTO t1 SELECT id FROM t2;
+DELETE FROM t2;
+INSERT INTO t2 (id) SELECT id+512 FROM t1;
+INSERT INTO t1 SELECT id FROM t2;
+DELETE FROM t2;
+INSERT INTO t2 (id) SELECT id+1024 FROM t1;
+INSERT INTO t1 SELECT id FROM t2;
+DELETE FROM t2;
+INSERT INTO t2 (id) SELECT id+2048 FROM t1;
+INSERT INTO t1 SELECT id FROM t2;
+DELETE FROM t2;
+INSERT INTO t2 (id) SELECT id+4096 FROM t1;
+INSERT INTO t1 SELECT id FROM t2;
+DELETE FROM t2;
+INSERT INTO t2 (id) SELECT id+8192 FROM t1;
+INSERT INTO t1 SELECT id FROM t2;
+DELETE FROM t2;
+INSERT INTO t2 SELECT id FROM t1 ORDER BY id*rand();
+SELECT SUM(DISTINCT id) sm FROM t1;
+sm
+134225920
+SELECT SUM(DISTINCT id) sm FROM t2;
+sm
+134225920
+SELECT SUM(DISTINCT id) sm FROM t1 group by id % 13;
+sm
+10327590
+10328851
+10330112
+10331373
+10332634
+10317510
+10318770
+10320030
+10321290
+10322550
+10323810
+10325070
+10326330
+SET max_heap_table_size=16384;
+SHOW variables LIKE 'max_heap_table_size';
+Variable_name Value
+max_heap_table_size 16384
+SELECT SUM(DISTINCT id) sm FROM t1;
+sm
+134225920
+SELECT SUM(DISTINCT id) sm FROM t2;
+sm
+134225920
+SELECT SUM(DISTINCT id) sm FROM t1 GROUP BY id % 13;
+sm
+10327590
+10328851
+10330112
+10331373
+10332634
+10317510
+10318770
+10320030
+10321290
+10322550
+10323810
+10325070
+10326330
+DROP TABLE t1;
+DROP TABLE t2;
diff --git a/mysql-test/t/sum_distinct.test b/mysql-test/t/sum_distinct.test
new file mode 100644
index 00000000000..efbb21a7b85
--- /dev/null
+++ b/mysql-test/t/sum_distinct.test
@@ -0,0 +1,188 @@
+#
+# Various tests for SUM(DISTINCT ...)
+#
+--disable_warnings
+DROP TABLE IF EXISTS t1;
+--enable_warnings
+
+CREATE TABLE t1 (
+  id INTEGER NOT NULL PRIMARY KEY AUTO_INCREMENT,
+  gender CHAR(1),
+  name VARCHAR(20)
+);
+
+# According to ANSI SQL, SUM(DISTINCT ...) should return NULL for an empty
+# record set
+
+SELECT SUM(DISTINCT LENGTH(name)) s1 FROM t1;
+
+# According to ANSI SQL, SUM(DISTINCT ...) should return NULL for record
+# sets consisting entirely of NULLs
+
+INSERT INTO t1 (gender, name) VALUES (NULL, NULL);
+INSERT INTO t1 (gender, name) VALUES (NULL, NULL);
+INSERT INTO t1 (gender, name) VALUES (NULL, NULL);
+
+SELECT SUM(DISTINCT LENGTH(name)) s1 FROM t1;
+
+
+# Filling table t1 with data
+
+INSERT INTO t1 (gender, name) VALUES ('F', 'Helen'), ('F', 'Anastasia'),
+('F', 'Katherine'), ('F', 'Margo'), ('F', 'Magdalene'), ('F', 'Mary');
+
+CREATE TABLE t2 SELECT name FROM t1;
+
+SELECT (SELECT SUM(DISTINCT LENGTH(name)) FROM t1) FROM t2;
+
+DROP TABLE t2;
+
+INSERT INTO t1 (gender, name) VALUES ('F', 'Eva'), ('F', 'Sofia'),
+('F', 'Sara'), ('F', 'Golda'), ('F', 'Toba'), ('F', 'Victory'),
+('F', 'Faina'), ('F', 'Miriam'), ('F', 'Beki'), ('F', 'America'),
+('F', 'Susan'), ('F', 'Glory'), ('F', 'Priscilla'), ('F', 'Rosmary'),
+('F', 'Rose'), ('F', 'Margareth'), ('F', 'Elizabeth'), ('F', 'Meredith'),
+('F', 'Julie'), ('F', 'Xenia'), ('F', 'Zena'), ('F', 'Olga'),
+('F', 'Brunhilda'), ('F', 'Nataly'), ('F', 'Lara'), ('F', 'Svetlana'),
+('F', 'Grethem'), ('F', 'Irene');
+
+SELECT
+  SUM(DISTINCT LENGTH(name)) s1,
+  SUM(DISTINCT SUBSTRING(NAME, 1, 3)) s2,
+  SUM(DISTINCT LENGTH(SUBSTRING(name, 1, 4))) s3
+FROM t1;
+
+SELECT
+  SUM(DISTINCT LENGTH(g1.name)) s1,
+  SUM(DISTINCT SUBSTRING(g2.name, 1, 3)) s2,
+  SUM(DISTINCT LENGTH(SUBSTRING(g3.name, 1, 4))) s3
+FROM t1 g1, t1 g2, t1 g3;
+
+SELECT
+  SUM(DISTINCT LENGTH(g1.name)) s1,
+  SUM(DISTINCT SUBSTRING(g2.name, 1, 3)) s2,
+  SUM(DISTINCT LENGTH(SUBSTRING(g3.name, 1, 4))) s3
+FROM t1 g1, t1 g2, t1 g3 GROUP BY LENGTH(SUBSTRING(g3.name, 5, 10));
+
+# here we explicitly request summing through a temporary table (so
+# Item_sum_sum_distinct::copy_or_same() is called)
+
+SELECT SQL_BUFFER_RESULT
+  SUM(DISTINCT LENGTH(name)) s1,
+  SUM(DISTINCT SUBSTRING(NAME, 1, 3)) s2,
+  SUM(DISTINCT LENGTH(SUBSTRING(name, 1, 4))) s3
+FROM t1;
+
+SELECT SQL_BUFFER_RESULT
+  SUM(DISTINCT LENGTH(g1.name)) s1,
+  SUM(DISTINCT SUBSTRING(g2.name, 1, 3)) s2,
+  SUM(DISTINCT LENGTH(SUBSTRING(g3.name, 1, 4))) s3
+FROM t1 g1, t1 g2, t1 g3 GROUP BY LENGTH(SUBSTRING(g3.name, 5, 10));
+
+# this test demonstrates that strings are automatically converted to numbers
+# before summing
+
+SET @l=1;
+UPDATE t1 SET name=CONCAT(name, @l:=@l+1);
+
+SELECT SUM(DISTINCT RIGHT(name, 1)) FROM t1;
+
+# plain SUM(DISTINCT) over an ordinary column of t1
+
+SELECT SUM(DISTINCT id) FROM t1;
+SELECT SUM(DISTINCT id % 11) FROM t1;
+
+DROP TABLE t1;
+
+#
+# Test the case when the distinct values don't fit in memory and
+# filesort is used (see uniques.cc:merge_walk)
+#
+
+CREATE TABLE t1 (id INTEGER);
+CREATE TABLE t2 (id INTEGER);
+
+INSERT INTO t1 (id) VALUES (1), (1), (1),(1);
+INSERT INTO t2 (id) SELECT id FROM t1;
+INSERT INTO t1 (id) SELECT id FROM t2; /* 8 */
+INSERT INTO t1 (id) SELECT id FROM t2; /* 12 */
+INSERT INTO t1 (id) SELECT id FROM t2; /* 16 */
+INSERT INTO t1 (id) SELECT id FROM t2; /* 20 */
+INSERT INTO t1 (id) SELECT id FROM t2; /* 24 */
+DELETE FROM t2;
+INSERT INTO t2 (id) SELECT id+1 FROM t1;
+INSERT INTO t1 SELECT id FROM t2;
+DELETE FROM t2;
+INSERT INTO t2 (id) SELECT id+2 FROM t1;
+INSERT INTO t1 SELECT id FROM t2;
+DELETE FROM t2;
+INSERT INTO t2 (id) SELECT id+4 FROM t1;
+INSERT INTO t1 SELECT id FROM t2;
+DELETE FROM t2;
+INSERT INTO t2 (id) SELECT id+8 FROM t1;
+INSERT INTO t1 SELECT id FROM t2;
+DELETE FROM t2;
+INSERT INTO t2 (id) SELECT id+16 FROM t1;
+INSERT INTO t1 SELECT id FROM t2;
+DELETE FROM t2;
+INSERT INTO t2 (id) SELECT id+32 FROM t1;
+INSERT INTO t1 SELECT id FROM t2;
+DELETE FROM t2;
+INSERT INTO t2 (id) SELECT id+64 FROM t1;
+INSERT INTO t1 SELECT id FROM t2;
+DELETE FROM t2;
+INSERT INTO t2 (id) SELECT id+128 FROM t1;
+INSERT INTO t1 SELECT id FROM t2;
+DELETE FROM t2;
+INSERT INTO t2 (id) SELECT id+256 FROM t1;
+INSERT INTO t1 SELECT id FROM t2;
+DELETE FROM t2;
+INSERT INTO t2 (id) SELECT id+512 FROM t1;
+INSERT INTO t1 SELECT id FROM t2;
+DELETE FROM t2;
+INSERT INTO t2 (id) SELECT id+1024 FROM t1;
+INSERT INTO t1 SELECT id FROM t2;
+DELETE FROM t2;
+INSERT INTO t2 (id) SELECT id+2048 FROM t1;
+INSERT INTO t1 SELECT id FROM t2;
+DELETE FROM t2;
+INSERT INTO t2 (id) SELECT id+4096 FROM t1;
+INSERT INTO t1 SELECT id FROM t2;
+DELETE FROM t2;
+INSERT INTO t2 (id) SELECT id+8192 FROM t1;
+INSERT INTO t1 SELECT id FROM t2;
+DELETE FROM t2;
+#INSERT INTO t2 (id) SELECT id+16384 FROM t1;
+#INSERT INTO t1 SELECT id FROM t2;
+#DELETE FROM t2;
+#INSERT INTO t2 (id) SELECT id+32768 FROM t1;
+#INSERT INTO t1 SELECT id FROM t2;
+#DELETE FROM t2;
+#INSERT INTO t2 (id) SELECT id+65536 FROM t1;
+#INSERT INTO t1 SELECT id FROM t2;
+#DELETE FROM t2;
+INSERT INTO t2 SELECT id FROM t1 ORDER BY id*rand();
+
+# SELECT '++++++++++++++++++++++++++++++++++++++++++++++++++';
+
+SELECT SUM(DISTINCT id) sm FROM t1;
+SELECT SUM(DISTINCT id) sm FROM t2;
+SELECT SUM(DISTINCT id) sm FROM t1 group by id % 13;
+
+# this limit for max_heap_table_size is set to force the case when all
+# distinct sum values cannot fit in memory and must be stored in a
+# temporary table
+
+SET max_heap_table_size=16384;
+
+# check that max_heap_table_size was actually set (the hard lower limit
+# for max_heap_table_size is set in mysqld.cc):
+
+SHOW variables LIKE 'max_heap_table_size';
+
+SELECT SUM(DISTINCT id) sm FROM t1;
+SELECT SUM(DISTINCT id) sm FROM t2;
+SELECT SUM(DISTINCT id) sm FROM t1 GROUP BY id % 13;
+
+DROP TABLE t1;
+DROP TABLE t2;
diff --git a/sql/filesort.cc b/sql/filesort.cc
index a1c7b5f421f..3722e8be32c 100644
--- a/sql/filesort.cc
+++ b/sql/filesort.cc
@@ -756,6 +756,39 @@ uint read_to_buffer(IO_CACHE *fromfile, BUFFPEK *buffpek,
 } /* read_to_buffer */
 
 
+/*
+  Put all room used by the freed buffer to use in the adjacent buffer.
+  Note that we can't simply distribute the memory evenly between all
+  buffers, because new areas must not overlap with old ones.
+  SYNOPSIS
+    reuse_freed_buff()
+      queue       IN  list of non-empty buffers, without the freed buffer
+      reuse       IN  empty buffer
+      key_length  IN  key length
+*/
+
+void reuse_freed_buff(QUEUE *queue, BUFFPEK *reuse, uint key_length)
+{
+  uchar *reuse_end= reuse->base + reuse->max_keys * key_length;
+  for (uint i= 0; i < queue->elements; ++i)
+  {
+    BUFFPEK *bp= (BUFFPEK *) queue_element(queue, i);
+    if (bp->base + bp->max_keys * key_length == reuse->base)
+    {
+      bp->max_keys+= reuse->max_keys;
+      return;
+    }
+    else if (bp->base == reuse_end)
+    {
+      bp->base= reuse->base;
+      bp->max_keys+= reuse->max_keys;
+      return;
+    }
+  }
+  DBUG_ASSERT(0);
+}
+
+
 /*
 	Merge buffers to one buffer
 */
@@ -881,29 +914,8 @@ int merge_buffers(SORTPARAM *param, IO_CACHE *from_file,
       if (!(error= (int) read_to_buffer(from_file,buffpek,
                                         rec_length)))
       {
-        uchar *base= buffpek->base;
-        ulong max_keys= buffpek->max_keys;
-
         VOID(queue_remove(&queue,0));
-
-        /* Put room used by buffer to use in other buffer */
-        for (refpek= (BUFFPEK**) &queue_top(&queue);
-             refpek <= (BUFFPEK**) &queue_end(&queue);
-             refpek++)
-        {
-          buffpek= *refpek;
-          if (buffpek->base+buffpek->max_keys*rec_length == base)
-          {
-            buffpek->max_keys+= max_keys;
-            break;
-          }
-          else if (base+max_keys*rec_length == buffpek->base)
-          {
-            buffpek->base= base;
-            buffpek->max_keys+= max_keys;
-            break;
-          }
-        }
+        reuse_freed_buff(&queue, buffpek, rec_length);
         break;                  /* One buffer has been removed */
       }
       else if (error == -1)
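The buffer-reuse trick factored out above is small enough to demonstrate in isolation. What follows is a minimal standalone C++ sketch of the same idea, using hypothetical Slice and reuse_freed_slice names rather than the server's BUFFPEK/QUEUE types: when one slice of a shared sort buffer is exhausted, its room is donated to a physically adjacent slice, so no live area ever has to move.

#include <cassert>
#include <cstddef>
#include <vector>

struct Slice                  /* hypothetical stand-in for BUFFPEK */
{
  unsigned char *base;        /* start of this slice's area */
  std::size_t max_keys;       /* capacity of the area, in keys */
};

/* Donate the area of 'freed' to whichever live slice physically
   borders it, mirroring the logic of reuse_freed_buff() above. */
static void reuse_freed_slice(std::vector<Slice> &live, const Slice &freed,
                              std::size_t key_length)
{
  unsigned char *freed_end= freed.base + freed.max_keys * key_length;
  for (Slice &s : live)
  {
    if (s.base + s.max_keys * key_length == freed.base)
    {                         /* 'freed' begins where 's' ends: grow forward */
      s.max_keys+= freed.max_keys;
      return;
    }
    if (s.base == freed_end)
    {                         /* 'freed' ends where 's' begins: grow backward */
      s.base= freed.base;
      s.max_keys+= freed.max_keys;
      return;
    }
  }
  assert(false && "freed slice must border some live slice");
}

Exactly as in reuse_freed_buff(), only a neighbour that starts or ends at the freed area may absorb it, which is why the function asserts if no adjacent slice is found.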
diff --git a/sql/item_sum.cc b/sql/item_sum.cc
index a3d652d2fe1..f1ecced250d 100644
--- a/sql/item_sum.cc
+++ b/sql/item_sum.cc
@@ -262,6 +262,122 @@ double Item_sum_sum::val()
 }
 
 
+/* Item_sum_sum_distinct */
+
+Item_sum_sum_distinct::Item_sum_sum_distinct(Item *item)
+  :Item_sum_num(item), sum(0.0), tree(0)
+{
+  /*
+    quick_group is an optimizer hint, which means that GROUP BY can be
+    handled with the help of an index on the grouped columns.
+    By setting quick_group to zero we force creation of a temporary
+    table to perform GROUP BY.
+  */
+  quick_group= 0;
+}
+
+
+Item_sum_sum_distinct::Item_sum_sum_distinct(THD *thd,
+                                             Item_sum_sum_distinct &original)
+  :Item_sum_num(thd, original), sum(0.0), tree(0)
+{
+  quick_group= 0;
+}
+
+
+Item_sum_sum_distinct::~Item_sum_sum_distinct()
+{
+  delete tree;
+}
+
+
+Item *
+Item_sum_sum_distinct::copy_or_same(THD *thd)
+{
+  return new (&thd->mem_root) Item_sum_sum_distinct(thd, *this);
+}
+
+
+C_MODE_START
+
+static int simple_raw_key_cmp(void* arg, const void* key1, const void* key2)
+{
+  return memcmp(key1, key2, *(uint *) arg);
+}
+
+C_MODE_END
+
+
+bool Item_sum_sum_distinct::setup(THD *thd)
+{
+  SELECT_LEX *select_lex= thd->lex->current_select;
+  /* what does it mean??? */
+  if (select_lex->linkage == GLOBAL_OPTIONS_TYPE)
+    return 1;
+
+  DBUG_ASSERT(tree == 0);               /* setup can not be called twice */
+
+  /*
+    Unique keeps all unique elements in a tree until they no longer fit
+    in memory; then the tree is dumped to a temporary file.
+    See class Unique for details.
+  */
+  null_value= maybe_null= 1;
+  /*
+    TODO: if the underlying item result fits in 4 bytes we can take
+    advantage of it and have a tree of long/ulong. It gives a 10%
+    performance boost.
+  */
+  static uint key_length= sizeof(double);
+  tree= new Unique(simple_raw_key_cmp, &key_length, key_length,
+                   thd->variables.max_heap_table_size);
+  return tree == 0;
+}
+
+
+void Item_sum_sum_distinct::clear()
+{
+  DBUG_ASSERT(tree);                    /* we always have a tree */
+  null_value= 1;
+  tree->reset();
+}
+
+
+bool Item_sum_sum_distinct::add()
+{
+  /* args[0]->val() may reset args[0]->null_value */
+  double val= args[0]->val();
+  if (!args[0]->null_value)
+  {
+    DBUG_ASSERT(tree);
+    null_value= 0;
+    if (val)
+      return tree->unique_add(&val);
+  }
+  return 0;
+}
+
+
+C_MODE_START
+
+static int sum_sum_distinct(void *element, element_count num_of_dups,
+                            void *item_sum_sum_distinct)
+{
+  ((Item_sum_sum_distinct *)
+   (item_sum_sum_distinct))->add(* (double *) element);
+  return 0;
+}
+
+C_MODE_END
+
+
+double Item_sum_sum_distinct::val()
+{
+  /*
+    We don't have a tree only if 'setup()' hasn't been called;
+    this is the case of sql_select.cc:return_zero_rows.
+  */
+  sum= 0.0;
+  if (tree)
+    tree->walk(sum_sum_distinct, (void *) this);
+  return sum;
+}
+
+/* end of Item_sum_sum_distinct */
+
+
 Item *Item_sum_count::copy_or_same(THD* thd)
 {
   return new (&thd->mem_root) Item_sum_count(thd, *this);
@@ -1036,11 +1152,6 @@ String *Item_variance_field::val_str(String *str)
 
 #include "sql_select.h"
 
-int simple_raw_key_cmp(void* arg, byte* key1, byte* key2)
-{
-  return memcmp(key1, key2, *(uint*) arg);
-}
-
 int simple_str_key_cmp(void* arg, byte* key1, byte* key2)
 {
   Item_sum_count_distinct* item = (Item_sum_count_distinct*)arg;
diff --git a/sql/item_sum.h b/sql/item_sum.h
index d454f06ccde..a80b6797068 100644
--- a/sql/item_sum.h
+++ b/sql/item_sum.h
@@ -27,9 +27,9 @@ class Item_sum :public Item_result_field
 {
 public:
   enum Sumfunctype
-  { COUNT_FUNC,COUNT_DISTINCT_FUNC,SUM_FUNC,AVG_FUNC,MIN_FUNC,
-    MAX_FUNC, UNIQUE_USERS_FUNC,STD_FUNC,VARIANCE_FUNC,SUM_BIT_FUNC,
-    UDF_SUM_FUNC, GROUP_CONCAT_FUNC
+  { COUNT_FUNC, COUNT_DISTINCT_FUNC, SUM_FUNC, SUM_DISTINCT_FUNC, AVG_FUNC,
+    MIN_FUNC, MAX_FUNC, UNIQUE_USERS_FUNC, STD_FUNC, VARIANCE_FUNC,
+    SUM_BIT_FUNC, UDF_SUM_FUNC, GROUP_CONCAT_FUNC
   };
 
   Item **args,*tmp_args[2];
@@ -138,6 +138,39 @@ class Item_sum_sum :public Item_sum_num
 };
 
 
+/*
+  Item_sum_sum_distinct -- implements SUM(DISTINCT expr), i.e. support
+  for SELECT SUM(DISTINCT expr) FROM ... queries. See also: the MySQL
+  manual, chapter 'Adding New Functions To MySQL', and the comments in
+  item_sum.cc.
+*/
+
+class Unique;
+
+class Item_sum_sum_distinct :public Item_sum_num
+{
+  double sum;
+  Unique *tree;
+private:
+  Item_sum_sum_distinct(THD *thd, Item_sum_sum_distinct &item);
+public:
+  Item_sum_sum_distinct(Item *item_par);
+  ~Item_sum_sum_distinct();
+
+  bool setup(THD *thd);
+  void clear();
+  bool add();
+  double val();
+
+  inline void add(double val) { sum+= val; }
+  enum Sumfunctype sum_func () const { return SUM_DISTINCT_FUNC; }
+  void reset_field() {}                 // not used
+  void update_field() {}                // not used
+  const char *func_name() const { return "sum_distinct"; }
+  Item *copy_or_same(THD* thd);
+  virtual void no_rows_in_result() {}
+};
+
+
 class Item_sum_count :public Item_sum_int
 {
   longlong count;
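Functionally, the new item computes what the following self-contained C++17 sketch computes; std::set stands in for the Unique tree and std::optional for SQL NULL (illustrative only, not server code):

#include <optional>
#include <set>
#include <vector>

/* Accumulates like SUM(DISTINCT expr): NULL inputs are skipped, and an
   empty or all-NULL input yields no value at all, i.e. SQL NULL. */
static std::optional<double>
sum_distinct(const std::vector<std::optional<double>> &rows)
{
  std::set<double> seen;               /* plays the role of the Unique tree */
  for (const std::optional<double> &v : rows)
    if (v)                             /* SQL NULLs never reach the tree */
      seen.insert(*v);
  if (seen.empty())
    return std::nullopt;               /* SUM(DISTINCT) over no values is NULL */
  double sum= 0.0;
  for (double d : seen)                /* one visit per distinct value, like */
    sum+= d;                           /* tree->walk(sum_sum_distinct, this) */
  return sum;
}

Note the two NULL rules exercised by the test file above: NULL inputs are skipped, and an empty or all-NULL record set produces SQL NULL rather than 0.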
diff --git a/sql/sql_class.h b/sql/sql_class.h
index 8263789a2a2..e81c2ba0110 100644
--- a/sql/sql_class.h
+++ b/sql/sql_class.h
@@ -1207,8 +1207,13 @@ class user_var_entry
   DTCollation collation;
 };
 
-
-/* Class for unique (removing of duplicates) */
+/*
+  Unique -- class for unique (duplicate-removing) value collection.
+  Puts all values into the TREE. If the tree becomes too big, it is
+  dumped to a file. The user can request sorted values, or just iterate
+  through them. In the latter case tree merging is performed in memory
+  simultaneously with iteration, so it should be ~2-3x faster.
+*/
 
 class Unique :public Sql_alloc
 {
@@ -1222,10 +1227,10 @@ class Unique :public Sql_alloc
 
 public:
   ulong elements;
-  Unique(qsort_cmp2 comp_func, void * comp_func_fixed_arg,
+  Unique(qsort_cmp2 comp_func, void *comp_func_fixed_arg,
 	 uint size_arg, ulong max_in_memory_size_arg);
   ~Unique();
-  inline bool unique_add(gptr ptr)
+  inline bool unique_add(void *ptr)
   {
     if (tree.elements_in_tree > max_elements && flush())
       return 1;
@@ -1234,6 +1239,9 @@ public:
 
   bool get(TABLE *table);
 
+  void reset();
+  bool walk(tree_walk_action action, void *walk_action_arg);
+
   friend int unique_write_to_file(gptr key, element_count count, Unique *unique);
   friend int unique_write_to_ptrs(gptr key, element_count count, Unique *unique);
 };
diff --git a/sql/sql_sort.h b/sql/sql_sort.h
index 9f95ffa4884..1831c4a2f3d 100644
--- a/sql/sql_sort.h
+++ b/sql/sql_sort.h
@@ -78,3 +78,4 @@ int merge_buffers(SORTPARAM *param,IO_CACHE *from_file,
 		  IO_CACHE *to_file, uchar *sort_buffer,
 		  BUFFPEK *lastbuff,BUFFPEK *Fb,
 		  BUFFPEK *Tb,int flag);
+void reuse_freed_buff(QUEUE *queue, BUFFPEK *reuse, uint key_length);
diff --git a/sql/sql_yacc.yy b/sql/sql_yacc.yy
index 66784e79bcb..dce3ed1ce48 100644
--- a/sql/sql_yacc.yy
+++ b/sql/sql_yacc.yy
@@ -4055,14 +4055,25 @@ sum_expr:
 	  { $$= new Item_sum_unique_users($3,atoi($5.str),atoi($7.str),$9); }
 	| MIN_SYM '(' in_sum_expr ')'
 	  { $$=new Item_sum_min($3); }
+/*
+  According to ANSI SQL, DISTINCT is allowed but has no meaning inside
+  the MIN and MAX grouping functions; so MIN|MAX(DISTINCT ...) is
+  processed like ordinary MIN|MAX().
+*/
+	| MIN_SYM '(' DISTINCT in_sum_expr ')'
+	  { $$=new Item_sum_min($4); }
 	| MAX_SYM '(' in_sum_expr ')'
 	  { $$=new Item_sum_max($3); }
+	| MAX_SYM '(' DISTINCT in_sum_expr ')'
+	  { $$=new Item_sum_max($4); }
 	| STD_SYM '(' in_sum_expr ')'
 	  { $$=new Item_sum_std($3); }
 	| VARIANCE_SYM '(' in_sum_expr ')'
 	  { $$=new Item_sum_variance($3); }
 	| SUM_SYM '(' in_sum_expr ')'
 	  { $$=new Item_sum_sum($3); }
+	| SUM_SYM '(' DISTINCT in_sum_expr ')'
+	  { $$=new Item_sum_sum_distinct($4); }
 	| GROUP_CONCAT_SYM '(' opt_distinct expr_list opt_gorder_clause
 	  opt_gconcat_separator ')'
 	  {
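Before the uniques.cc changes below, it may help to see the Unique contract from sql_class.h in miniature. A toy, memory-only C++ model (std::set for the TREE, a vector of sorted runs for the file; not the real API) showing the add/flush/walk/reset life cycle:

#include <cstddef>
#include <set>
#include <vector>

/* Toy model of the Unique contract described in sql_class.h above; the
   real class dumps the tree to an IO_CACHE file, here a flushed tree
   simply becomes another sorted in-memory run. */
class ToyUnique
{
  std::set<double> tree;                  /* models the TREE */
  std::vector<std::vector<double>> runs;  /* models runs dumped to the file */
  std::size_t max_elements;               /* models the in-memory budget */

  void flush()                            /* tree is sorted and duplicate-free */
  {
    runs.emplace_back(tree.begin(), tree.end());
    tree.clear();
  }

public:
  explicit ToyUnique(std::size_t max_elements_arg)
    : max_elements(max_elements_arg) {}

  void unique_add(double v)
  {
    if (tree.size() > max_elements)       /* over budget: spill, like flush() */
      flush();
    tree.insert(v);
  }

  /* Like Unique::walk(): visit every distinct value exactly once. The
     real code merges the runs with a queue (see merge_walk below); this
     toy just folds everything back into one set. */
  template <class Action> void walk(Action action)
  {
    std::set<double> merged(tree.begin(), tree.end());
    for (const std::vector<double> &run : runs)
      merged.insert(run.begin(), run.end());
    for (double v : merged)
      action(v);
  }

  void reset()                            /* required before reuse after walk() */
  {
    tree.clear();
    runs.clear();
  }
};

A caller does roughly what Item_sum_sum_distinct does: unique_add() each non-NULL value, walk() once to fold the distinct values, and reset() before the item is reused.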
diff --git a/sql/uniques.cc b/sql/uniques.cc
index 02146426782..7f8793abc91 100644
--- a/sql/uniques.cc
+++ b/sql/uniques.cc
@@ -84,6 +84,7 @@ bool Unique::flush()
   elements+= tree.elements_in_tree;
   file_ptr.count=tree.elements_in_tree;
   file_ptr.file_pos=my_b_tell(&file);
+
   if (tree_walk(&tree, (tree_walk_action) unique_write_to_file,
 		(void*) this, left_root_right) ||
       insert_dynamic(&file_ptrs, (gptr) &file_ptr))
@@ -94,6 +95,237 @@ bool Unique::flush()
 
 
 /*
+  Clear the tree and the file.
+  You must call reset() if you want to reuse Unique after walk().
+*/
+
+void
+Unique::reset()
+{
+  reset_tree(&tree);
+  /*
+    If elements != 0, some trees were stored in the file (see how flush()
+    works). Note that we can not rely on my_b_tell(&file) == 0 here,
+    because it can return 0 right after walk(), and walk() does not reset
+    any Unique member.
+  */
+  if (elements)
+  {
+    reset_dynamic(&file_ptrs);
+    reinit_io_cache(&file, WRITE_CACHE, 0L, 0, 1);
+  }
+  elements= 0;
+}
+
+
+/*
+  The comparison function passed to queue_init() in merge_walk() must use
+  the comparison function of Unique::tree, but compare members of struct
+  BUFFPEK.
+*/
+
+struct BUFFPEK_COMPARE_CONTEXT
+{
+  qsort_cmp2 key_compare;
+  void *key_compare_arg;
+};
+
+
+C_MODE_START
+
+static int buffpek_compare(void *arg, byte *key_ptr1, byte *key_ptr2)
+{
+  BUFFPEK_COMPARE_CONTEXT *ctx= (BUFFPEK_COMPARE_CONTEXT *) arg;
+  return ctx->key_compare(ctx->key_compare_arg,
+                          *((byte **) key_ptr1), *((byte **) key_ptr2));
+}
+
+C_MODE_END
+
+
+/*
+  DESCRIPTION
+    This function is very similar to merge_buffers, but instead of
+    writing sorted unique keys to the output file, it invokes walk_action
+    for each key. This saves I/O if you need to pass through all unique
+    keys only once.
+  SYNOPSIS
+    merge_walk()
+      All params are 'IN' (but see the comment for begin, end):
+      merge_buffer       buffer to perform cached piece-by-piece loading
+                         of trees; initially the buffer is empty
+      merge_buffer_size  size of merge_buffer. Must be aligned with
+                         key_length
+      key_length         size of a tree element; key_length * (end - begin)
+                         must be less than or equal to merge_buffer_size
+      begin              pointer to the BUFFPEK struct for the first tree
+      end                pointer to the BUFFPEK struct for the last tree;
+                         end > begin and [begin, end) form a consecutive
+                         range. The BUFFPEK structs in that range are used
+                         and overwritten in merge_walk()
+      walk_action        element visitor, called for each unique key
+      walk_action_arg    argument to the walk action, passed on each call
+      compare            elements comparison function
+      compare_arg        comparison function argument
+      file               file with all trees dumped. Trees in the file
+                         must contain sorted unique values. The cache must
+                         be initialized in read mode
+  RETURN VALUE
+    0     ok
+    <> 0  error
+*/
+
+static bool merge_walk(uchar *merge_buffer, uint merge_buffer_size,
+                       uint key_length, BUFFPEK *begin, BUFFPEK *end,
+                       tree_walk_action walk_action, void *walk_action_arg,
+                       qsort_cmp2 compare, void *compare_arg,
+                       IO_CACHE *file)
+{
+  BUFFPEK_COMPARE_CONTEXT compare_context = { compare, compare_arg };
+  QUEUE queue;
+  if (end <= begin ||
+      merge_buffer_size < key_length * (end - begin + 1) ||
+      init_queue(&queue, end - begin, offsetof(BUFFPEK, key), 0,
+                 buffpek_compare, &compare_context))
+    return 1;
+  /* we need space for one key when a piece of the merge buffer is re-read */
+  merge_buffer_size-= key_length;
+  uchar *save_key_buff= merge_buffer + merge_buffer_size;
+  uint max_key_count_per_piece= merge_buffer_size/(end-begin)/key_length;
+  /* if piece_size is aligned, reuse_freed_buff will always hit */
+  uint piece_size= max_key_count_per_piece * key_length;
+  uint bytes_read;              /* to hold the return value of read_to_buffer */
+  BUFFPEK *top;
+  int res= 1;
+  /*
+    Invariant: the queue must contain the top element from each tree,
+    until that tree is completely walked through.
+    Here we establish the invariant by inserting one element from each
+    tree into the queue.
+  */
+  for (top= begin; top != end; ++top)
+  {
+    top->base= merge_buffer + (top - begin) * piece_size;
+    top->max_keys= max_key_count_per_piece;
+    bytes_read= read_to_buffer(file, top, key_length);
+    if (bytes_read == (uint) (-1))
+      goto end;
+    DBUG_ASSERT(bytes_read);
+    queue_insert(&queue, (byte *) top);
+  }
+  top= (BUFFPEK *) queue_top(&queue);
+  while (queue.elements > 1)
+  {
+    /*
+      Each iteration one element is removed from the queue, and one is
+      inserted, by the rules of the invariant. If two adjacent elements
+      on top of the queue are not equal, the biggest one is unique,
+      because all elements in each tree are unique. The action is
+      applied only to unique elements.
+    */
+    void *old_key= top->key;
+    /*
+      Read the next key from the cache or from the file and push it to
+      the queue; this gives a new top.
+    */
+    top->key+= key_length;
+    if (--top->mem_count)
+      queue_replaced(&queue);
+    else                                /* next piece should be read */
+    {
+      /* save old_key not to overwrite it in read_to_buffer */
+      memcpy(save_key_buff, old_key, key_length);
+      old_key= save_key_buff;
+      bytes_read= read_to_buffer(file, top, key_length);
+      if (bytes_read == (uint) (-1))
+        goto end;
+      else if (bytes_read > 0)          /* top->key, top->mem_count are reset */
+        queue_replaced(&queue);         /* in read_to_buffer */
+      else
+      {
+        /*
+          The tree for the old 'top' element is empty: remove it from
+          the queue and give all its memory to the nearest tree.
+        */
+        queue_remove(&queue, 0);
+        reuse_freed_buff(&queue, top, key_length);
+      }
+    }
+    top= (BUFFPEK *) queue_top(&queue);
+    /* a new top has been obtained; if the old top is unique, apply the action */
+    if (compare(compare_arg, old_key, top->key))
+    {
+      if (walk_action(old_key, 1, walk_action_arg))
+        goto end;
+    }
+  }
+  /*
+    Apply walk_action to the tail of the last tree: this is safe because
+    either we had only one tree from the beginning, or we are working
+    with the last tree in the queue.
+  */
+  do
+  {
+    do
+    {
+      if (walk_action(top->key, 1, walk_action_arg))
+        goto end;
+      top->key+= key_length;
+    }
+    while (--top->mem_count);
+    bytes_read= read_to_buffer(file, top, key_length);
+    if (bytes_read == (uint) (-1))
+      goto end;
+  }
+  while (bytes_read);
+  res= 0;
+end:
+  delete_queue(&queue);
+  return res;
+}
+
+
+/*
+  DESCRIPTION
+    Walks consecutively through all unique elements: if all elements are
+    in memory, it simply invokes 'tree_walk'; otherwise all flushed trees
+    are loaded into memory piece-by-piece, the pieces are sorted, and the
+    action is called for each unique value.
+    Note: because merging resets the file_ptrs state, this method can
+    leave the internal Unique state undefined: if you want to reuse
+    Unique after walk(), you must call reset() first!
+  SYNOPSIS
+    Unique::walk()
+      All params are 'IN':
+      action  function-visitor, typed in include/my_tree.h;
+              the function is called for each unique element
+      arg     argument for the visitor, passed to it on each call
+  RETURN VALUE
+    0     OK
+    <> 0  error
+*/
+
+bool Unique::walk(tree_walk_action action, void *walk_action_arg)
+{
+  if (elements == 0)                    /* the whole tree is in memory */
+    return tree_walk(&tree, action, walk_action_arg, left_root_right);
+
+  /* flush the current tree to the file to free memory for the merge buffer */
+  if (flush())
+    return 1;
+  if (flush_io_cache(&file) || reinit_io_cache(&file, READ_CACHE, 0L, 0, 0))
+    return 1;
+  uchar *merge_buffer= (uchar *) my_malloc(max_in_memory_size, MYF(0));
+  if (merge_buffer == 0)
+    return 1;
+  int res= merge_walk(merge_buffer, max_in_memory_size, size,
+                      (BUFFPEK *) file_ptrs.buffer,
+                      (BUFFPEK *) file_ptrs.buffer + file_ptrs.elements,
+                      action, walk_action_arg,
+                      tree.compare, tree.custom_arg, &file);
+  x_free(merge_buffer);
+  return res;
+}
+
+
+/*
   Modify the TABLE element so that when one calls init_records()
   the rows will be read in priority order.
 */
@@ -114,7 +346,7 @@ bool Unique::get(TABLE *table)
       return 0;
     }
   }
-  /* Not enough memory; Save the result to file */
+  /* Not enough memory; save the result to file && free memory used by the tree */
  if (flush())
    return 1;
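Finally, the heart of merge_walk() is a k-way merge over sorted, duplicate-free runs that fires an action once per distinct key. Below is a compact standalone C++ sketch of that control flow (Cursor and merge_walk_sketch are illustrative names; the server version additionally re-reads run pieces from the IO_CACHE and recycles exhausted buffers via reuse_freed_buff):

#include <functional>
#include <queue>
#include <vector>

/* One input run: a sorted, duplicate-free array of keys. */
struct Cursor
{
  const double *pos;                    /* next key in this run */
  const double *end;                    /* one past the last key */
};

static void merge_walk_sketch(std::vector<Cursor> runs,
                              const std::function<void(double)> &action)
{
  /* min-heap ordered by each run's current key */
  auto greater= [](const Cursor *a, const Cursor *b)
  { return *a->pos > *b->pos; };
  std::priority_queue<Cursor *, std::vector<Cursor *>, decltype(greater)>
    heap(greater);
  for (Cursor &c : runs)
    if (c.pos != c.end)
      heap.push(&c);

  bool have_last= false;
  double last= 0.0;
  while (!heap.empty())
  {
    Cursor *top= heap.top();
    heap.pop();
    double key= *top->pos++;
    if (!have_last || key != last)      /* first occurrence of this key */
    {
      action(key);
      last= key;
      have_last= true;
    }
    if (top->pos != top->end)           /* run not exhausted: re-insert */
      heap.push(top);
  }
}

Because each run is individually duplicate-free, equal keys can only arrive from different runs and therefore pop out of the heap adjacently, so comparing against the previously emitted key is enough to suppress duplicates.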