diff options
author | Tom Lane <tgl@sss.pgh.pa.us> | 2017-11-29 22:00:29 -0500 |
---|---|---|
committer | Tom Lane <tgl@sss.pgh.pa.us> | 2017-11-29 22:00:37 -0500 |
commit | 7ca25b7de6aefa5537e0dbe56541bc41c0464f97 (patch) | |
tree | 76d9932a12711f63204a11a25d5f5083edae40fe | |
parent | 1145acc70debacc34de01fac238defde543f4ed4 (diff) | |
download | postgresql-7ca25b7de6aefa5537e0dbe56541bc41c0464f97.tar.gz |
Fix neqjoinsel's behavior for semi/anti join cases.
Previously, this function estimated the selectivity as 1 minus eqjoinsel()
for the negator equality operator, regardless of join type (I think there
was an expectation that eqjoinsel would handle the join type). But
actually this is completely wrong for semijoin cases: the fraction of the
LHS that has a non-matching row is not one minus the fraction of the LHS
that has a matching row. In reality a semijoin with <> will nearly always
succeed: it can only fail when the RHS is empty, or it contains a single
distinct value that is equal to the particular LHS value, or the LHS value
is null. The only one of those things we should have much confidence in
estimating is the fraction of LHS values that are null, so let's just take
the selectivity as 1 minus outer nullfrac.
Per coding convention, antijoin should be estimated the same as semijoin.
Arguably this is a bug fix, but in view of the lack of field complaints
and the risk of destabilizing plans, no back-patch.
Thomas Munro, reviewed by Ashutosh Bapat
Discussion: https://postgr.es/m/CAEepm=270ze2hVxWkJw-5eKzc3AB4C9KpH3L2kih75R5pdSogg@mail.gmail.com
-rw-r--r-- | src/backend/utils/adt/selfuncs.c | 70 | ||||
-rw-r--r-- | src/test/regress/expected/join.out | 22 | ||||
-rw-r--r-- | src/test/regress/sql/join.sql | 9 |
3 files changed, 85 insertions, 16 deletions
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c index edff6da410..ea95b8068d 100644 --- a/src/backend/utils/adt/selfuncs.c +++ b/src/backend/utils/adt/selfuncs.c @@ -2767,29 +2767,67 @@ neqjoinsel(PG_FUNCTION_ARGS) List *args = (List *) PG_GETARG_POINTER(2); JoinType jointype = (JoinType) PG_GETARG_INT16(3); SpecialJoinInfo *sjinfo = (SpecialJoinInfo *) PG_GETARG_POINTER(4); - Oid eqop; float8 result; - /* - * We want 1 - eqjoinsel() where the equality operator is the one - * associated with this != operator, that is, its negator. - */ - eqop = get_negator(operator); - if (eqop) + if (jointype == JOIN_SEMI || jointype == JOIN_ANTI) { - result = DatumGetFloat8(DirectFunctionCall5(eqjoinsel, - PointerGetDatum(root), - ObjectIdGetDatum(eqop), - PointerGetDatum(args), - Int16GetDatum(jointype), - PointerGetDatum(sjinfo))); + /* + * For semi-joins, if there is more than one distinct value in the RHS + * relation then every non-null LHS row must find a row to join since + * it can only be equal to one of them. We'll assume that there is + * always more than one distinct RHS value for the sake of stability, + * though in theory we could have special cases for empty RHS + * (selectivity = 0) and single-distinct-value RHS (selectivity = + * fraction of LHS that has the same value as the single RHS value). + * + * For anti-joins, if we use the same assumption that there is more + * than one distinct key in the RHS relation, then every non-null LHS + * row must be suppressed by the anti-join. + * + * So either way, the selectivity estimate should be 1 - nullfrac. + */ + VariableStatData leftvar; + VariableStatData rightvar; + bool reversed; + HeapTuple statsTuple; + double nullfrac; + + get_join_variables(root, args, sjinfo, &leftvar, &rightvar, &reversed); + statsTuple = reversed ? rightvar.statsTuple : leftvar.statsTuple; + if (HeapTupleIsValid(statsTuple)) + nullfrac = ((Form_pg_statistic) GETSTRUCT(statsTuple))->stanullfrac; + else + nullfrac = 0.0; + ReleaseVariableStats(leftvar); + ReleaseVariableStats(rightvar); + + result = 1.0 - nullfrac; } else { - /* Use default selectivity (should we raise an error instead?) */ - result = DEFAULT_EQ_SEL; + /* + * We want 1 - eqjoinsel() where the equality operator is the one + * associated with this != operator, that is, its negator. + */ + Oid eqop = get_negator(operator); + + if (eqop) + { + result = DatumGetFloat8(DirectFunctionCall5(eqjoinsel, + PointerGetDatum(root), + ObjectIdGetDatum(eqop), + PointerGetDatum(args), + Int16GetDatum(jointype), + PointerGetDatum(sjinfo))); + } + else + { + /* Use default selectivity (should we raise an error instead?) */ + result = DEFAULT_EQ_SEL; + } + result = 1.0 - result; } - result = 1.0 - result; + PG_RETURN_FLOAT8(result); } diff --git a/src/test/regress/expected/join.out b/src/test/regress/expected/join.out index f02ef3f978..b7d1790097 100644 --- a/src/test/regress/expected/join.out +++ b/src/test/regress/expected/join.out @@ -1846,6 +1846,28 @@ SELECT '' AS "xxx", * (1 row) -- +-- semijoin selectivity for <> +-- +explain (costs off) +select * from int4_tbl i4, tenk1 a +where exists(select * from tenk1 b + where a.twothousand = b.twothousand and a.fivethous <> b.fivethous) + and i4.f1 = a.tenthous; + QUERY PLAN +---------------------------------------------- + Hash Semi Join + Hash Cond: (a.twothousand = b.twothousand) + Join Filter: (a.fivethous <> b.fivethous) + -> Hash Join + Hash Cond: (a.tenthous = i4.f1) + -> Seq Scan on tenk1 a + -> Hash + -> Seq Scan on int4_tbl i4 + -> Hash + -> Seq Scan on tenk1 b +(10 rows) + +-- -- More complicated constructs -- -- diff --git a/src/test/regress/sql/join.sql b/src/test/regress/sql/join.sql index cbb71c5b1b..c6d4a513e8 100644 --- a/src/test/regress/sql/join.sql +++ b/src/test/regress/sql/join.sql @@ -193,6 +193,15 @@ SELECT '' AS "xxx", * SELECT '' AS "xxx", * FROM J1_TBL LEFT JOIN J2_TBL USING (i) WHERE (i = 1); +-- +-- semijoin selectivity for <> +-- +explain (costs off) +select * from int4_tbl i4, tenk1 a +where exists(select * from tenk1 b + where a.twothousand = b.twothousand and a.fivethous <> b.fivethous) + and i4.f1 = a.tenthous; + -- -- More complicated constructs |