summaryrefslogtreecommitdiff
path: root/sql/item_sum.h
diff options
context:
space:
mode:
authorunknown <cmiller@zippy.cornsilk.net>2006-12-22 15:37:37 -0500
committerunknown <cmiller@zippy.cornsilk.net>2006-12-22 15:37:37 -0500
commit82bd9b6b61f14b227f0fe9169f0a8754694d1924 (patch)
tree1192dea3a5bbb9777291296b2e70e43c87d40abd /sql/item_sum.h
parent4f4b46456a150880d5d93d77a2f9f0db68d9396c (diff)
downloadmariadb-git-82bd9b6b61f14b227f0fe9169f0a8754694d1924.tar.gz
Bug#22555: STDDEV yields positive result for groups with only one row
When only one row was present, the subtraction of nearly the same number resulted in catastropic cancellation, introducing an error in the VARIANCE calculation near 1e-15. That was sqrt()ed to get STDDEV, the error was escallated to near 1e-8. The simple fix of testing for a row count of 1 and forcing that to yield 0.0 is insufficient, as two rows of the same value should also have a variance of 0.0, yet the error would be about the same. So, this patch changes the formula that computes the VARIANCE to be one that is not subject to catastrophic cancellation. In addition, it now uses only (faster-than-decimal) floating point numbers to calculate, and renders that to other types on demand. mysql-test/r/func_group.result: Test that the bug is fixed, and that no unexpected behavior arises from the changes. mysql-test/t/func_group.test: Test that the bug is fixed, and that no unexpected behavior arises from the changes. sql/item_sum.cc: Serg's suggestion: Force all VARIANCE calculations to be done with floating- point types. It's faster, and the SQL standard says we may implement these functions any way we want. Additionally, use a form of variance calculation that is not subject to catastrophic cancellation. http://static.flickr.com/108/311308512_5c4e1c0c3d_b.jpg sql/item_sum.h: Remove unused members and add a comment describing the recurrence relation.
Diffstat (limited to 'sql/item_sum.h')
-rw-r--r--sql/item_sum.h20
1 files changed, 14 insertions, 6 deletions
diff --git a/sql/item_sum.h b/sql/item_sum.h
index fe7edd76ecf..989e72654fe 100644
--- a/sql/item_sum.h
+++ b/sql/item_sum.h
@@ -665,8 +665,10 @@ public:
double val_real();
longlong val_int()
{ /* can't be fix_fields()ed */ return (longlong) rint(val_real()); }
- String *val_str(String*);
- my_decimal *val_decimal(my_decimal *);
+ String *val_str(String *str)
+ { return val_string_from_real(str); }
+ my_decimal *val_decimal(my_decimal *dec_buf)
+ { return val_decimal_from_real(dec_buf); }
bool is_null() { (void) val_int(); return null_value; }
enum_field_types field_type() const
{
@@ -688,6 +690,14 @@ public:
= (sum(ai^2) - 2*sum(a)*sum(a)/count(a) + count(a)*sum(a)^2/count(a)^2 )/count(a) =
= (sum(ai^2) - 2*sum(a)^2/count(a) + sum(a)^2/count(a) )/count(a) =
= (sum(ai^2) - sum(a)^2/count(a))/count(a)
+
+But, this falls prey to catastrophic cancellation. Instead, use the recurrence formulas
+
+ M_{1} = x_{1}, ~ M_{k} = M_{k-1} + (x_{k} - M_{k-1}) / k newline
+ S_{1} = 0, ~ S_{k} = S_{k-1} + (x_{k} - M_{k-1}) times (x_{k} - M_{k}) newline
+ for 2 <= k <= n newline
+ ital variance = S_{n} / (n-1)
+
*/
class Item_sum_variance : public Item_sum_num
@@ -696,9 +706,8 @@ class Item_sum_variance : public Item_sum_num
public:
Item_result hybrid_type;
- double sum, sum_sqr;
- my_decimal dec_sum[2], dec_sqr[2];
int cur_dec;
+ double recurrence_m, recurrence_s; /* Used in recurrence relation. */
ulonglong count;
uint f_precision0, f_scale0;
uint f_precision1, f_scale1;
@@ -707,7 +716,7 @@ public:
uint prec_increment;
Item_sum_variance(Item *item_par, uint sample_arg) :Item_sum_num(item_par),
- hybrid_type(REAL_RESULT), cur_dec(0), count(0), sample(sample_arg)
+ hybrid_type(REAL_RESULT), count(0), sample(sample_arg)
{}
Item_sum_variance(THD *thd, Item_sum_variance *item);
enum Sumfunctype sum_func () const { return VARIANCE_FUNC; }
@@ -727,7 +736,6 @@ public:
enum Item_result result_type () const { return REAL_RESULT; }
void cleanup()
{
- cur_dec= 0;
count= 0;
Item_sum_num::cleanup();
}