summaryrefslogtreecommitdiff
path: root/src/backend/access
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2015-07-25 14:39:00 -0400
committerTom Lane <tgl@sss.pgh.pa.us>2015-07-25 14:39:00 -0400
commitdd7a8f66ed278eef2f001a98e2312336c61ee527 (patch)
tree4daf4c4b1daddc8fc31d7448522b9e66f4369fd7 /src/backend/access
parentb26e3d660df51a088d14c3c2cfce5990c13c1195 (diff)
downloadpostgresql-dd7a8f66ed278eef2f001a98e2312336c61ee527.tar.gz
Redesign tablesample method API, and do extensive code review.
The original implementation of TABLESAMPLE modeled the tablesample method API on index access methods, which wasn't a good choice because, without specialized DDL commands, there's no way to build an extension that can implement a TSM. (Raw inserts into system catalogs are not an acceptable thing to do, because we can't undo them during DROP EXTENSION, nor will pg_upgrade behave sanely.) Instead adopt an API more like procedural language handlers or foreign data wrappers, wherein the only SQL-level support object needed is a single handler function identified by having a special return type. This lets us get rid of the supporting catalog altogether, so that no custom DDL support is needed for the feature. Adjust the API so that it can support non-constant tablesample arguments (the original coding assumed we could evaluate the argument expressions at ExecInitSampleScan time, which is undesirable even if it weren't outright unsafe), and discourage sampling methods from looking at invisible tuples. Make sure that the BERNOULLI and SYSTEM methods are genuinely repeatable within and across queries, as required by the SQL standard, and deal more honestly with methods that can't support that requirement. Make a full code-review pass over the tablesample additions, and fix assorted bugs, omissions, infelicities, and cosmetic issues (such as failure to put the added code stanzas in a consistent ordering). Improve EXPLAIN's output of tablesample plans, too. Back-patch to 9.5 so that we don't have to support the original API in production.
Diffstat (limited to 'src/backend/access')
-rw-r--r--src/backend/access/heap/heapam.c61
-rw-r--r--src/backend/access/tablesample/Makefile6
-rw-r--r--src/backend/access/tablesample/bernoulli.c326
-rw-r--r--src/backend/access/tablesample/system.c312
-rw-r--r--src/backend/access/tablesample/tablesample.c355
5 files changed, 421 insertions, 639 deletions
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 6f4ff2718f..050efdc480 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -80,8 +80,11 @@ bool synchronize_seqscans = true;
static HeapScanDesc heap_beginscan_internal(Relation relation,
Snapshot snapshot,
int nkeys, ScanKey key,
- bool allow_strat, bool allow_sync, bool allow_pagemode,
- bool is_bitmapscan, bool is_samplescan,
+ bool allow_strat,
+ bool allow_sync,
+ bool allow_pagemode,
+ bool is_bitmapscan,
+ bool is_samplescan,
bool temp_snap);
static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup,
TransactionId xid, CommandId cid, int options);
@@ -207,7 +210,7 @@ static const int MultiXactStatusLock[MaxMultiXactStatus + 1] =
* ----------------
*/
static void
-initscan(HeapScanDesc scan, ScanKey key, bool is_rescan)
+initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock)
{
bool allow_strat;
bool allow_sync;
@@ -257,12 +260,12 @@ initscan(HeapScanDesc scan, ScanKey key, bool is_rescan)
scan->rs_strategy = NULL;
}
- if (is_rescan)
+ if (keep_startblock)
{
/*
- * If rescan, keep the previous startblock setting so that rewinding a
- * cursor doesn't generate surprising results. Reset the syncscan
- * setting, though.
+ * When rescanning, we want to keep the previous startblock setting,
+ * so that rewinding a cursor doesn't generate surprising results.
+ * Reset the active syncscan setting, though.
*/
scan->rs_syncscan = (allow_sync && synchronize_seqscans);
}
@@ -1313,6 +1316,10 @@ heap_openrv_extended(const RangeVar *relation, LOCKMODE lockmode,
/* ----------------
* heap_beginscan - begin relation scan
*
+ * heap_beginscan is the "standard" case.
+ *
+ * heap_beginscan_catalog differs in setting up its own temporary snapshot.
+ *
* heap_beginscan_strat offers an extended API that lets the caller control
* whether a nondefault buffer access strategy can be used, and whether
* syncscan can be chosen (possibly resulting in the scan not starting from
@@ -1323,8 +1330,11 @@ heap_openrv_extended(const RangeVar *relation, LOCKMODE lockmode,
* really quite unlike a standard seqscan, there is just enough commonality
* to make it worth using the same data structure.
*
- * heap_beginscan_samplingscan is alternate entry point for setting up a
- * HeapScanDesc for a TABLESAMPLE scan.
+ * heap_beginscan_sampling is an alternative entry point for setting up a
+ * HeapScanDesc for a TABLESAMPLE scan. As with bitmap scans, it's worth
+ * using the same data structure although the behavior is rather different.
+ * In addition to the options offered by heap_beginscan_strat, this call
+ * also allows control of whether page-mode visibility checking is used.
* ----------------
*/
HeapScanDesc
@@ -1366,18 +1376,22 @@ heap_beginscan_bm(Relation relation, Snapshot snapshot,
HeapScanDesc
heap_beginscan_sampling(Relation relation, Snapshot snapshot,
int nkeys, ScanKey key,
- bool allow_strat, bool allow_pagemode)
+ bool allow_strat, bool allow_sync, bool allow_pagemode)
{
return heap_beginscan_internal(relation, snapshot, nkeys, key,
- allow_strat, false, allow_pagemode,
+ allow_strat, allow_sync, allow_pagemode,
false, true, false);
}
static HeapScanDesc
heap_beginscan_internal(Relation relation, Snapshot snapshot,
int nkeys, ScanKey key,
- bool allow_strat, bool allow_sync, bool allow_pagemode,
- bool is_bitmapscan, bool is_samplescan, bool temp_snap)
+ bool allow_strat,
+ bool allow_sync,
+ bool allow_pagemode,
+ bool is_bitmapscan,
+ bool is_samplescan,
+ bool temp_snap)
{
HeapScanDesc scan;
@@ -1462,6 +1476,27 @@ heap_rescan(HeapScanDesc scan,
}
/* ----------------
+ * heap_rescan_set_params - restart a relation scan after changing params
+ *
+ * This call allows changing the buffer strategy, syncscan, and pagemode
+ * options before starting a fresh scan. Note that although the actual use
+ * of syncscan might change (effectively, enabling or disabling reporting),
+ * the previously selected startblock will be kept.
+ * ----------------
+ */
+void
+heap_rescan_set_params(HeapScanDesc scan, ScanKey key,
+ bool allow_strat, bool allow_sync, bool allow_pagemode)
+{
+ /* adjust parameters */
+ scan->rs_allow_strat = allow_strat;
+ scan->rs_allow_sync = allow_sync;
+ scan->rs_pageatatime = allow_pagemode && IsMVCCSnapshot(scan->rs_snapshot);
+ /* ... and rescan */
+ heap_rescan(scan, key);
+}
+
+/* ----------------
* heap_endscan - end relation scan
*
* See how to integrate with index scans.
diff --git a/src/backend/access/tablesample/Makefile b/src/backend/access/tablesample/Makefile
index 46eeb59f9c..68d9ab2814 100644
--- a/src/backend/access/tablesample/Makefile
+++ b/src/backend/access/tablesample/Makefile
@@ -1,10 +1,10 @@
#-------------------------------------------------------------------------
#
# Makefile--
-# Makefile for utils/tablesample
+# Makefile for access/tablesample
#
# IDENTIFICATION
-# src/backend/utils/tablesample/Makefile
+# src/backend/access/tablesample/Makefile
#
#-------------------------------------------------------------------------
@@ -12,6 +12,6 @@ subdir = src/backend/access/tablesample
top_builddir = ../../../..
include $(top_builddir)/src/Makefile.global
-OBJS = tablesample.o system.o bernoulli.o
+OBJS = bernoulli.o system.o tablesample.o
include $(top_srcdir)/src/backend/common.mk
diff --git a/src/backend/access/tablesample/bernoulli.c b/src/backend/access/tablesample/bernoulli.c
index 0a53900822..cf88f95e75 100644
--- a/src/backend/access/tablesample/bernoulli.c
+++ b/src/backend/access/tablesample/bernoulli.c
@@ -1,233 +1,231 @@
/*-------------------------------------------------------------------------
*
* bernoulli.c
- * interface routines for BERNOULLI tablesample method
+ * support routines for BERNOULLI tablesample method
*
- * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * To ensure repeatability of samples, it is necessary that selection of a
+ * given tuple be history-independent; otherwise syncscanning would break
+ * repeatability, to say nothing of logically-irrelevant maintenance such
+ * as physical extension or shortening of the relation.
+ *
+ * To achieve that, we proceed by hashing each candidate TID together with
+ * the active seed, and then selecting it if the hash is less than the
+ * cutoff value computed from the selection probability by BeginSampleScan.
+ *
+ *
+ * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * src/backend/utils/tablesample/bernoulli.c
+ * src/backend/access/tablesample/bernoulli.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
-#include "fmgr.h"
+#ifdef _MSC_VER
+#include <float.h> /* for _isnan */
+#endif
+#include <math.h>
-#include "access/tablesample.h"
-#include "access/relscan.h"
-#include "nodes/execnodes.h"
-#include "nodes/relation.h"
+#include "access/hash.h"
+#include "access/tsmapi.h"
+#include "catalog/pg_type.h"
#include "optimizer/clauses.h"
-#include "storage/bufmgr.h"
-#include "utils/sampling.h"
+#include "optimizer/cost.h"
+#include "utils/builtins.h"
-/* tsdesc */
+/* Private state */
typedef struct
{
+ uint64 cutoff; /* select tuples with hash less than this */
uint32 seed; /* random seed */
- BlockNumber startblock; /* starting block, we use ths for syncscan
- * support */
- BlockNumber nblocks; /* number of blocks */
- BlockNumber blockno; /* current block */
- float4 probability; /* probabilty that tuple will be returned
- * (0.0-1.0) */
OffsetNumber lt; /* last tuple returned from current block */
- SamplerRandomState randstate; /* random generator tsdesc */
} BernoulliSamplerData;
+
+static void bernoulli_samplescangetsamplesize(PlannerInfo *root,
+ RelOptInfo *baserel,
+ List *paramexprs,
+ BlockNumber *pages,
+ double *tuples);
+static void bernoulli_initsamplescan(SampleScanState *node,
+ int eflags);
+static void bernoulli_beginsamplescan(SampleScanState *node,
+ Datum *params,
+ int nparams,
+ uint32 seed);
+static OffsetNumber bernoulli_nextsampletuple(SampleScanState *node,
+ BlockNumber blockno,
+ OffsetNumber maxoffset);
+
+
/*
- * Initialize the state.
+ * Create a TsmRoutine descriptor for the BERNOULLI method.
*/
Datum
-tsm_bernoulli_init(PG_FUNCTION_ARGS)
+tsm_bernoulli_handler(PG_FUNCTION_ARGS)
{
- TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
- uint32 seed = PG_GETARG_UINT32(1);
- float4 percent = PG_ARGISNULL(2) ? -1 : PG_GETARG_FLOAT4(2);
- HeapScanDesc scan = tsdesc->heapScan;
- BernoulliSamplerData *sampler;
+ TsmRoutine *tsm = makeNode(TsmRoutine);
+
+ tsm->parameterTypes = list_make1_oid(FLOAT4OID);
+ tsm->repeatable_across_queries = true;
+ tsm->repeatable_across_scans = true;
+ tsm->SampleScanGetSampleSize = bernoulli_samplescangetsamplesize;
+ tsm->InitSampleScan = bernoulli_initsamplescan;
+ tsm->BeginSampleScan = bernoulli_beginsamplescan;
+ tsm->NextSampleBlock = NULL;
+ tsm->NextSampleTuple = bernoulli_nextsampletuple;
+ tsm->EndSampleScan = NULL;
+
+ PG_RETURN_POINTER(tsm);
+}
- if (percent < 0 || percent > 100)
- ereport(ERROR,
- (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
- errmsg("invalid sample size"),
- errhint("Sample size must be numeric value between 0 and 100 (inclusive).")));
+/*
+ * Sample size estimation.
+ */
+static void
+bernoulli_samplescangetsamplesize(PlannerInfo *root,
+ RelOptInfo *baserel,
+ List *paramexprs,
+ BlockNumber *pages,
+ double *tuples)
+{
+ Node *pctnode;
+ float4 samplefract;
- sampler = palloc0(sizeof(BernoulliSamplerData));
+ /* Try to extract an estimate for the sample percentage */
+ pctnode = (Node *) linitial(paramexprs);
+ pctnode = estimate_expression_value(root, pctnode);
- /* Remember initial values for reinit */
- sampler->seed = seed;
- sampler->startblock = scan->rs_startblock;
- sampler->nblocks = scan->rs_nblocks;
- sampler->blockno = InvalidBlockNumber;
- sampler->probability = percent / 100;
- sampler->lt = InvalidOffsetNumber;
- sampler_random_init_state(sampler->seed, sampler->randstate);
+ if (IsA(pctnode, Const) &&
+ !((Const *) pctnode)->constisnull)
+ {
+ samplefract = DatumGetFloat4(((Const *) pctnode)->constvalue);
+ if (samplefract >= 0 && samplefract <= 100 && !isnan(samplefract))
+ samplefract /= 100.0f;
+ else
+ {
+ /* Default samplefract if the value is bogus */
+ samplefract = 0.1f;
+ }
+ }
+ else
+ {
+ /* Default samplefract if we didn't obtain a non-null Const */
+ samplefract = 0.1f;
+ }
+
+ /* We'll visit all pages of the baserel */
+ *pages = baserel->pages;
- tsdesc->tsmdata = (void *) sampler;
+ *tuples = clamp_row_est(baserel->tuples * samplefract);
+}
- PG_RETURN_VOID();
+/*
+ * Initialize during executor setup.
+ */
+static void
+bernoulli_initsamplescan(SampleScanState *node, int eflags)
+{
+ node->tsm_state = palloc0(sizeof(BernoulliSamplerData));
}
/*
- * Get next block number to read or InvalidBlockNumber if we are at the
- * end of the relation.
+ * Examine parameters and prepare for a sample scan.
*/
-Datum
-tsm_bernoulli_nextblock(PG_FUNCTION_ARGS)
+static void
+bernoulli_beginsamplescan(SampleScanState *node,
+ Datum *params,
+ int nparams,
+ uint32 seed)
{
- TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
- BernoulliSamplerData *sampler = (BernoulliSamplerData *) tsdesc->tsmdata;
+ BernoulliSamplerData *sampler = (BernoulliSamplerData *) node->tsm_state;
+ double percent = DatumGetFloat4(params[0]);
+
+ if (percent < 0 || percent > 100 || isnan(percent))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT),
+ errmsg("sample percentage must be between 0 and 100")));
/*
- * Bernoulli sampling scans all blocks on the table and supports syncscan
- * so loop from startblock to startblock instead of from 0 to nblocks.
+ * The cutoff is sample probability times (PG_UINT32_MAX + 1); we have to
+ * store that as a uint64, of course. Note that this gives strictly
+ * correct behavior at the limits of zero or one probability.
*/
- if (sampler->blockno == InvalidBlockNumber)
- sampler->blockno = sampler->startblock;
- else
- {
- sampler->blockno++;
-
- if (sampler->blockno >= sampler->nblocks)
- sampler->blockno = 0;
-
- if (sampler->blockno == sampler->startblock)
- PG_RETURN_UINT32(InvalidBlockNumber);
- }
+ sampler->cutoff = rint(((double) PG_UINT32_MAX + 1) * percent / 100);
+ sampler->seed = seed;
+ sampler->lt = InvalidOffsetNumber;
- PG_RETURN_UINT32(sampler->blockno);
+ /*
+ * Use bulkread, since we're scanning all pages. But pagemode visibility
+ * checking is a win only at larger sampling fractions. The 25% cutoff
+ * here is based on very limited experimentation.
+ */
+ node->use_bulkread = true;
+ node->use_pagemode = (percent >= 25);
}
/*
- * Get next tuple from current block.
- *
- * This method implements the main logic in bernoulli sampling.
- * The algorithm simply generates new random number (in 0.0-1.0 range) and if
- * it falls within user specified probability (in the same range) return the
- * tuple offset.
- *
- * It is ok here to return tuple offset without knowing if tuple is visible
- * and not check it via examinetuple. The reason for that is that we do the
- * coinflip (random number generation) for every tuple in the table. Since all
- * tuples have same probability of being returned the visible and invisible
- * tuples will be returned in same ratio as they have in the actual table.
- * This means that there is no skew towards either visible or invisible tuples
- * and the number of visible tuples returned from the executor node should
- * match the fraction of visible tuples which was specified by user.
+ * Select next sampled tuple in current block.
*
- * This is faster than doing the coinflip in examinetuple because we don't
- * have to do visibility checks on uninteresting tuples.
+ * It is OK here to return an offset without knowing if the tuple is visible
+ * (or even exists). The reason is that we do the coinflip for every tuple
+ * offset in the table. Since all tuples have the same probability of being
+ * returned, it doesn't matter if we do extra coinflips for invisible tuples.
*
- * If we reach end of the block return InvalidOffsetNumber which tells
+ * When we reach end of the block, return InvalidOffsetNumber which tells
* SampleScan to go to next block.
*/
-Datum
-tsm_bernoulli_nexttuple(PG_FUNCTION_ARGS)
+static OffsetNumber
+bernoulli_nextsampletuple(SampleScanState *node,
+ BlockNumber blockno,
+ OffsetNumber maxoffset)
{
- TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
- OffsetNumber maxoffset = PG_GETARG_UINT16(2);
- BernoulliSamplerData *sampler = (BernoulliSamplerData *) tsdesc->tsmdata;
+ BernoulliSamplerData *sampler = (BernoulliSamplerData *) node->tsm_state;
OffsetNumber tupoffset = sampler->lt;
- float4 probability = sampler->probability;
+ uint32 hashinput[3];
+ /* Advance to first/next tuple in block */
if (tupoffset == InvalidOffsetNumber)
tupoffset = FirstOffsetNumber;
else
tupoffset++;
/*
- * Loop over tuple offsets until the random generator returns value that
- * is within the probability of returning the tuple or until we reach end
- * of the block.
+ * We compute the hash by applying hash_any to an array of 3 uint32's
+ * containing the block, offset, and seed. This is efficient to set up,
+ * and with the current implementation of hash_any, it gives
+ * machine-independent results, which is a nice property for regression
+ * testing.
*
- * (This is our implementation of bernoulli trial)
+ * These words in the hash input are the same throughout the block:
*/
- while (sampler_random_fract(sampler->randstate) > probability)
+ hashinput[0] = blockno;
+ hashinput[2] = sampler->seed;
+
+ /*
+ * Loop over tuple offsets until finding suitable TID or reaching end of
+ * block.
+ */
+ for (; tupoffset <= maxoffset; tupoffset++)
{
- tupoffset++;
+ uint32 hash;
- if (tupoffset > maxoffset)
+ hashinput[1] = tupoffset;
+
+ hash = DatumGetUInt32(hash_any((const unsigned char *) hashinput,
+ (int) sizeof(hashinput)));
+ if (hash < sampler->cutoff)
break;
}
if (tupoffset > maxoffset)
- /* Tell SampleScan that we want next block. */
tupoffset = InvalidOffsetNumber;
sampler->lt = tupoffset;
- PG_RETURN_UINT16(tupoffset);
-}
-
-/*
- * Cleanup method.
- */
-Datum
-tsm_bernoulli_end(PG_FUNCTION_ARGS)
-{
- TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
-
- pfree(tsdesc->tsmdata);
-
- PG_RETURN_VOID();
-}
-
-/*
- * Reset tsdesc (called by ReScan).
- */
-Datum
-tsm_bernoulli_reset(PG_FUNCTION_ARGS)
-{
- TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
- BernoulliSamplerData *sampler = (BernoulliSamplerData *) tsdesc->tsmdata;
-
- sampler->blockno = InvalidBlockNumber;
- sampler->lt = InvalidOffsetNumber;
- sampler_random_init_state(sampler->seed, sampler->randstate);
-
- PG_RETURN_VOID();
-}
-
-/*
- * Costing function.
- */
-Datum
-tsm_bernoulli_cost(PG_FUNCTION_ARGS)
-{
- PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
- Path *path = (Path *) PG_GETARG_POINTER(1);
- RelOptInfo *baserel = (RelOptInfo *) PG_GETARG_POINTER(2);
- List *args = (List *) PG_GETARG_POINTER(3);
- BlockNumber *pages = (BlockNumber *) PG_GETARG_POINTER(4);
- double *tuples = (double *) PG_GETARG_POINTER(5);
- Node *pctnode;
- float4 samplesize;
-
- *pages = baserel->pages;
-
- pctnode = linitial(args);
- pctnode = estimate_expression_value(root, pctnode);
-
- if (IsA(pctnode, RelabelType))
- pctnode = (Node *) ((RelabelType *) pctnode)->arg;
-
- if (IsA(pctnode, Const))
- {
- samplesize = DatumGetFloat4(((Const *) pctnode)->constvalue);
- samplesize /= 100.0;
- }
- else
- {
- /* Default samplesize if the estimation didn't return Const. */
- samplesize = 0.1f;
- }
-
- *tuples = path->rows * samplesize;
- path->rows = *tuples;
-
- PG_RETURN_VOID();
+ return tupoffset;
}
diff --git a/src/backend/access/tablesample/system.c b/src/backend/access/tablesample/system.c
index 1d834369a4..43c5dab716 100644
--- a/src/backend/access/tablesample/system.c
+++ b/src/backend/access/tablesample/system.c
@@ -1,186 +1,260 @@
/*-------------------------------------------------------------------------
*
* system.c
- * interface routines for system tablesample method
+ * support routines for SYSTEM tablesample method
*
+ * To ensure repeatability of samples, it is necessary that selection of a
+ * given tuple be history-independent; otherwise syncscanning would break
+ * repeatability, to say nothing of logically-irrelevant maintenance such
+ * as physical extension or shortening of the relation.
*
- * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group
+ * To achieve that, we proceed by hashing each candidate block number together
+ * with the active seed, and then selecting it if the hash is less than the
+ * cutoff value computed from the selection probability by BeginSampleScan.
+ *
+ *
+ * Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
*
* IDENTIFICATION
- * src/backend/utils/tablesample/system.c
+ * src/backend/access/tablesample/system.c
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"
-#include "fmgr.h"
+#ifdef _MSC_VER
+#include <float.h> /* for _isnan */
+#endif
+#include <math.h>
-#include "access/tablesample.h"
+#include "access/hash.h"
#include "access/relscan.h"
-#include "nodes/execnodes.h"
-#include "nodes/relation.h"
+#include "access/tsmapi.h"
+#include "catalog/pg_type.h"
#include "optimizer/clauses.h"
-#include "storage/bufmgr.h"
-#include "utils/sampling.h"
+#include "optimizer/cost.h"
+#include "utils/builtins.h"
-/*
- * State
- */
+/* Private state */
typedef struct
{
- BlockSamplerData bs;
+ uint64 cutoff; /* select blocks with hash less than this */
uint32 seed; /* random seed */
- BlockNumber nblocks; /* number of block in relation */
- int samplesize; /* number of blocks to return */
+ BlockNumber nextblock; /* next block to consider sampling */
OffsetNumber lt; /* last tuple returned from current block */
} SystemSamplerData;
-/*
- * Initializes the state.
- */
-Datum
-tsm_system_init(PG_FUNCTION_ARGS)
-{
- TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
- uint32 seed = PG_GETARG_UINT32(1);
- float4 percent = PG_ARGISNULL(2) ? -1 : PG_GETARG_FLOAT4(2);
- HeapScanDesc scan = tsdesc->heapScan;
- SystemSamplerData *sampler;
+static void system_samplescangetsamplesize(PlannerInfo *root,
+ RelOptInfo *baserel,
+ List *paramexprs,
+ BlockNumber *pages,
+ double *tuples);
+static void system_initsamplescan(SampleScanState *node,
+ int eflags);
+static void system_beginsamplescan(SampleScanState *node,
+ Datum *params,
+ int nparams,
+ uint32 seed);
+static BlockNumber system_nextsampleblock(SampleScanState *node);
+static OffsetNumber system_nextsampletuple(SampleScanState *node,
+ BlockNumber blockno,
+ OffsetNumber maxoffset);
- if (percent < 0 || percent > 100)
- ereport(ERROR,
- (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
- errmsg("invalid sample size"),
- errhint("Sample size must be numeric value between 0 and 100 (inclusive).")));
-
- sampler = palloc0(sizeof(SystemSamplerData));
-
- /* Remember initial values for reinit */
- sampler->seed = seed;
- sampler->nblocks = scan->rs_nblocks;
- sampler->samplesize = 1 + (int) (sampler->nblocks * (percent / 100.0));
- sampler->lt = InvalidOffsetNumber;
-
- BlockSampler_Init(&sampler->bs, sampler->nblocks, sampler->samplesize,
- sampler->seed);
-
- tsdesc->tsmdata = (void *) sampler;
-
- PG_RETURN_VOID();
-}
/*
- * Get next block number or InvalidBlockNumber when we're done.
- *
- * Uses the same logic as ANALYZE for picking the random blocks.
+ * Create a TsmRoutine descriptor for the SYSTEM method.
*/
Datum
-tsm_system_nextblock(PG_FUNCTION_ARGS)
+tsm_system_handler(PG_FUNCTION_ARGS)
{
- TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
- SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata;
- BlockNumber blockno;
-
- if (!BlockSampler_HasMore(&sampler->bs))
- PG_RETURN_UINT32(InvalidBlockNumber);
-
- blockno = BlockSampler_Next(&sampler->bs);
-
- PG_RETURN_UINT32(blockno);
+ TsmRoutine *tsm = makeNode(TsmRoutine);
+
+ tsm->parameterTypes = list_make1_oid(FLOAT4OID);
+ tsm->repeatable_across_queries = true;
+ tsm->repeatable_across_scans = true;
+ tsm->SampleScanGetSampleSize = system_samplescangetsamplesize;
+ tsm->InitSampleScan = system_initsamplescan;
+ tsm->BeginSampleScan = system_beginsamplescan;
+ tsm->NextSampleBlock = system_nextsampleblock;
+ tsm->NextSampleTuple = system_nextsampletuple;
+ tsm->EndSampleScan = NULL;
+
+ PG_RETURN_POINTER(tsm);
}
/*
- * Get next tuple offset in current block or InvalidOffsetNumber if we are done
- * with this block.
+ * Sample size estimation.
*/
-Datum
-tsm_system_nexttuple(PG_FUNCTION_ARGS)
+static void
+system_samplescangetsamplesize(PlannerInfo *root,
+ RelOptInfo *baserel,
+ List *paramexprs,
+ BlockNumber *pages,
+ double *tuples)
{
- TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
- OffsetNumber maxoffset = PG_GETARG_UINT16(2);
- SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata;
- OffsetNumber tupoffset = sampler->lt;
+ Node *pctnode;
+ float4 samplefract;
- if (tupoffset == InvalidOffsetNumber)
- tupoffset = FirstOffsetNumber;
- else
- tupoffset++;
+ /* Try to extract an estimate for the sample percentage */
+ pctnode = (Node *) linitial(paramexprs);
+ pctnode = estimate_expression_value(root, pctnode);
- if (tupoffset > maxoffset)
- tupoffset = InvalidOffsetNumber;
+ if (IsA(pctnode, Const) &&
+ !((Const *) pctnode)->constisnull)
+ {
+ samplefract = DatumGetFloat4(((Const *) pctnode)->constvalue);
+ if (samplefract >= 0 && samplefract <= 100 && !isnan(samplefract))
+ samplefract /= 100.0f;
+ else
+ {
+ /* Default samplefract if the value is bogus */
+ samplefract = 0.1f;
+ }
+ }
+ else
+ {
+ /* Default samplefract if we didn't obtain a non-null Const */
+ samplefract = 0.1f;
+ }
- sampler->lt = tupoffset;
+ /* We'll visit a sample of the pages ... */
+ *pages = clamp_row_est(baserel->pages * samplefract);
- PG_RETURN_UINT16(tupoffset);
+ /* ... and hopefully get a representative number of tuples from them */
+ *tuples = clamp_row_est(baserel->tuples * samplefract);
}
/*
- * Cleanup method.
+ * Initialize during executor setup.
*/
-Datum
-tsm_system_end(PG_FUNCTION_ARGS)
+static void
+system_initsamplescan(SampleScanState *node, int eflags)
{
- TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
-
- pfree(tsdesc->tsmdata);
-
- PG_RETURN_VOID();
+ node->tsm_state = palloc0(sizeof(SystemSamplerData));
}
/*
- * Reset state (called by ReScan).
+ * Examine parameters and prepare for a sample scan.
*/
-Datum
-tsm_system_reset(PG_FUNCTION_ARGS)
+static void
+system_beginsamplescan(SampleScanState *node,
+ Datum *params,
+ int nparams,
+ uint32 seed)
{
- TableSampleDesc *tsdesc = (TableSampleDesc *) PG_GETARG_POINTER(0);
- SystemSamplerData *sampler = (SystemSamplerData *) tsdesc->tsmdata;
+ SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state;
+ double percent = DatumGetFloat4(params[0]);
+ if (percent < 0 || percent > 100 || isnan(percent))
+ ereport(ERROR,
+ (errcode(ERRCODE_INVALID_TABLESAMPLE_ARGUMENT),
+ errmsg("sample percentage must be between 0 and 100")));
+
+ /*
+ * The cutoff is sample probability times (PG_UINT32_MAX + 1); we have to
+ * store that as a uint64, of course. Note that this gives strictly
+ * correct behavior at the limits of zero or one probability.
+ */
+ sampler->cutoff = rint(((double) PG_UINT32_MAX + 1) * percent / 100);
+ sampler->seed = seed;
+ sampler->nextblock = 0;
sampler->lt = InvalidOffsetNumber;
- BlockSampler_Init(&sampler->bs, sampler->nblocks, sampler->samplesize,
- sampler->seed);
- PG_RETURN_VOID();
+ /*
+ * Bulkread buffer access strategy probably makes sense unless we're
+ * scanning a very small fraction of the table. The 1% cutoff here is a
+ * guess. We should use pagemode visibility checking, since we scan all
+ * tuples on each selected page.
+ */
+ node->use_bulkread = (percent >= 1);
+ node->use_pagemode = true;
}
/*
- * Costing function.
+ * Select next block to sample.
*/
-Datum
-tsm_system_cost(PG_FUNCTION_ARGS)
+static BlockNumber
+system_nextsampleblock(SampleScanState *node)
{
- PlannerInfo *root = (PlannerInfo *) PG_GETARG_POINTER(0);
- Path *path = (Path *) PG_GETARG_POINTER(1);
- RelOptInfo *baserel = (RelOptInfo *) PG_GETARG_POINTER(2);
- List *args = (List *) PG_GETARG_POINTER(3);
- BlockNumber *pages = (BlockNumber *) PG_GETARG_POINTER(4);
- double *tuples = (double *) PG_GETARG_POINTER(5);
- Node *pctnode;
- float4 samplesize;
+ SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state;
+ HeapScanDesc scan = node->ss.ss_currentScanDesc;
+ BlockNumber nextblock = sampler->nextblock;
+ uint32 hashinput[2];
+
+ /*
+ * We compute the hash by applying hash_any to an array of 2 uint32's
+ * containing the block number and seed. This is efficient to set up, and
+ * with the current implementation of hash_any, it gives
+ * machine-independent results, which is a nice property for regression
+ * testing.
+ *
+ * These words in the hash input are the same throughout the block:
+ */
+ hashinput[1] = sampler->seed;
+
+ /*
+ * Loop over block numbers until finding suitable block or reaching end of
+ * relation.
+ */
+ for (; nextblock < scan->rs_nblocks; nextblock++)
+ {
+ uint32 hash;
- pctnode = linitial(args);
- pctnode = estimate_expression_value(root, pctnode);
+ hashinput[0] = nextblock;
- if (IsA(pctnode, RelabelType))
- pctnode = (Node *) ((RelabelType *) pctnode)->arg;
+ hash = DatumGetUInt32(hash_any((const unsigned char *) hashinput,
+ (int) sizeof(hashinput)));
+ if (hash < sampler->cutoff)
+ break;
+ }
- if (IsA(pctnode, Const))
+ if (nextblock < scan->rs_nblocks)
{
- samplesize = DatumGetFloat4(((Const *) pctnode)->constvalue);
- samplesize /= 100.0;
+ /* Found a suitable block; remember where we should start next time */
+ sampler->nextblock = nextblock + 1;
+ return nextblock;
}
+
+ /* Done, but let's reset nextblock to 0 for safety. */
+ sampler->nextblock = 0;
+ return InvalidBlockNumber;
+}
+
+/*
+ * Select next sampled tuple in current block.
+ *
+ * In block sampling, we just want to sample all the tuples in each selected
+ * block.
+ *
+ * It is OK here to return an offset without knowing if the tuple is visible
+ * (or even exists); nodeSamplescan.c will deal with that.
+ *
+ * When we reach end of the block, return InvalidOffsetNumber which tells
+ * SampleScan to go to next block.
+ */
+static OffsetNumber
+system_nextsampletuple(SampleScanState *node,
+ BlockNumber blockno,
+ OffsetNumber maxoffset)
+{
+ SystemSamplerData *sampler = (SystemSamplerData *) node->tsm_state;
+ OffsetNumber tupoffset = sampler->lt;
+
+ /* Advance to next possible offset on page */
+ if (tupoffset == InvalidOffsetNumber)
+ tupoffset = FirstOffsetNumber;
else
- {
- /* Default samplesize if the estimation didn't return Const. */
- samplesize = 0.1f;
- }
+ tupoffset++;
- *pages = baserel->pages * samplesize;
- *tuples = path->rows * samplesize;
- path->rows = *tuples;
+ /* Done? */
+ if (tupoffset > maxoffset)
+ tupoffset = InvalidOffsetNumber;
+
+ sampler->lt = tupoffset;
- PG_RETURN_VOID();
+ return tupoffset;
}
diff --git a/src/backend/access/tablesample/tablesample.c b/src/backend/access/tablesample/tablesample.c
index f21d42c8e3..b8ad7ced74 100644
--- a/src/backend/access/tablesample/tablesample.c
+++ b/src/backend/access/tablesample/tablesample.c
@@ -1,7 +1,7 @@
/*-------------------------------------------------------------------------
*
* tablesample.c
- * TABLESAMPLE internal API
+ * Support functions for TABLESAMPLE feature
*
* Portions Copyright (c) 1996-2015, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
@@ -10,356 +10,31 @@
* IDENTIFICATION
* src/backend/access/tablesample/tablesample.c
*
- * TABLESAMPLE is the SQL standard clause for sampling the relations.
- *
- * The API is interface between the Executor and the TABLESAMPLE Methods.
- *
- * TABLESAMPLE Methods are implementations of actual sampling algorithms which
- * can be used for returning a sample of the source relation.
- * Methods don't read the table directly but are asked for block number and
- * tuple offset which they want to examine (or return) and the tablesample
- * interface implemented here does the reading for them.
- *
- * We currently only support sampling of the physical relations, but in the
- * future we might extend the API to support subqueries as well.
- *
* -------------------------------------------------------------------------
*/
#include "postgres.h"
-#include "access/tablesample.h"
-
-#include "catalog/pg_tablesample_method.h"
-#include "miscadmin.h"
-#include "pgstat.h"
-#include "storage/bufmgr.h"
-#include "storage/predicate.h"
-#include "utils/rel.h"
-#include "utils/tqual.h"
-
-
-static bool SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan);
-
-
-/*
- * Initialize the TABLESAMPLE Descriptor and the TABLESAMPLE Method.
- */
-TableSampleDesc *
-tablesample_init(SampleScanState *scanstate, TableSampleClause *tablesample)
-{
- FunctionCallInfoData fcinfo;
- int i;
- List *args = tablesample->args;
- ListCell *arg;
- ExprContext *econtext = scanstate->ss.ps.ps_ExprContext;
- TableSampleDesc *tsdesc = (TableSampleDesc *) palloc0(sizeof(TableSampleDesc));
-
- /* Load functions */
- fmgr_info(tablesample->tsminit, &(tsdesc->tsminit));
- fmgr_info(tablesample->tsmnextblock, &(tsdesc->tsmnextblock));
- fmgr_info(tablesample->tsmnexttuple, &(tsdesc->tsmnexttuple));
- if (OidIsValid(tablesample->tsmexaminetuple))
- fmgr_info(tablesample->tsmexaminetuple, &(tsdesc->tsmexaminetuple));
- else
- tsdesc->tsmexaminetuple.fn_oid = InvalidOid;
- fmgr_info(tablesample->tsmreset, &(tsdesc->tsmreset));
- fmgr_info(tablesample->tsmend, &(tsdesc->tsmend));
-
- InitFunctionCallInfoData(fcinfo, &tsdesc->tsminit,
- list_length(args) + 2,
- InvalidOid, NULL, NULL);
-
- tsdesc->tupDesc = scanstate->ss.ss_ScanTupleSlot->tts_tupleDescriptor;
- tsdesc->heapScan = scanstate->ss.ss_currentScanDesc;
-
- /* First argument for init function is always TableSampleDesc */
- fcinfo.arg[0] = PointerGetDatum(tsdesc);
- fcinfo.argnull[0] = false;
+#include "access/tsmapi.h"
- /*
- * Second arg for init function is always REPEATABLE.
- *
- * If tablesample->repeatable is NULL then REPEATABLE clause was not
- * specified, and we insert a random value as default.
- *
- * When specified, the expression cannot evaluate to NULL.
- */
- if (tablesample->repeatable)
- {
- ExprState *argstate = ExecInitExpr((Expr *) tablesample->repeatable,
- (PlanState *) scanstate);
-
- fcinfo.arg[1] = ExecEvalExpr(argstate, econtext,
- &fcinfo.argnull[1], NULL);
- if (fcinfo.argnull[1])
- ereport(ERROR,
- (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
- errmsg("REPEATABLE clause must be NOT NULL numeric value")));
- }
- else
- {
- fcinfo.arg[1] = UInt32GetDatum(random());
- fcinfo.argnull[1] = false;
- }
-
- /* Rest of the arguments come from user. */
- i = 2;
- foreach(arg, args)
- {
- Expr *argexpr = (Expr *) lfirst(arg);
- ExprState *argstate = ExecInitExpr(argexpr, (PlanState *) scanstate);
-
- fcinfo.arg[i] = ExecEvalExpr(argstate, econtext,
- &fcinfo.argnull[i], NULL);
- i++;
- }
- Assert(i == fcinfo.nargs);
-
- (void) FunctionCallInvoke(&fcinfo);
-
- return tsdesc;
-}
/*
- * Get next tuple from TABLESAMPLE Method.
- */
-HeapTuple
-tablesample_getnext(TableSampleDesc *desc)
-{
- HeapScanDesc scan = desc->heapScan;
- HeapTuple tuple = &(scan->rs_ctup);
- bool pagemode = scan->rs_pageatatime;
- BlockNumber blockno;
- Page page;
- bool page_all_visible;
- ItemId itemid;
- OffsetNumber tupoffset,
- maxoffset;
-
- if (!scan->rs_inited)
- {
- /*
- * return null immediately if relation is empty
- */
- if (scan->rs_nblocks == 0)
- {
- Assert(!BufferIsValid(scan->rs_cbuf));
- tuple->t_data = NULL;
- return NULL;
- }
- blockno = DatumGetInt32(FunctionCall1(&desc->tsmnextblock,
- PointerGetDatum(desc)));
- if (!BlockNumberIsValid(blockno))
- {
- tuple->t_data = NULL;
- return NULL;
- }
-
- heapgetpage(scan, blockno);
- scan->rs_inited = true;
- }
- else
- {
- /* continue from previously returned page/tuple */
- blockno = scan->rs_cblock; /* current page */
- }
-
- /*
- * When pagemode is disabled, the scan will do visibility checks for each
- * tuple it finds so the buffer needs to be locked.
- */
- if (!pagemode)
- LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
-
- page = (Page) BufferGetPage(scan->rs_cbuf);
- page_all_visible = PageIsAllVisible(page);
- maxoffset = PageGetMaxOffsetNumber(page);
-
- for (;;)
- {
- CHECK_FOR_INTERRUPTS();
-
- tupoffset = DatumGetUInt16(FunctionCall3(&desc->tsmnexttuple,
- PointerGetDatum(desc),
- UInt32GetDatum(blockno),
- UInt16GetDatum(maxoffset)));
-
- if (OffsetNumberIsValid(tupoffset))
- {
- bool visible;
- bool found;
-
- /* Skip invalid tuple pointers. */
- itemid = PageGetItemId(page, tupoffset);
- if (!ItemIdIsNormal(itemid))
- continue;
-
- tuple->t_data = (HeapTupleHeader) PageGetItem((Page) page, itemid);
- tuple->t_len = ItemIdGetLength(itemid);
- ItemPointerSet(&(tuple->t_self), blockno, tupoffset);
-
- if (page_all_visible)
- visible = true;
- else
- visible = SampleTupleVisible(tuple, tupoffset, scan);
-
- /*
- * Let the sampling method examine the actual tuple and decide if
- * we should return it.
- *
- * Note that we let it examine even invisible tuples for
- * statistical purposes, but not return them since user should
- * never see invisible tuples.
- */
- if (OidIsValid(desc->tsmexaminetuple.fn_oid))
- {
- found = DatumGetBool(FunctionCall4(&desc->tsmexaminetuple,
- PointerGetDatum(desc),
- UInt32GetDatum(blockno),
- PointerGetDatum(tuple),
- BoolGetDatum(visible)));
- /* Should not happen if sampling method is well written. */
- if (found && !visible)
- elog(ERROR, "Sampling method wanted to return invisible tuple");
- }
- else
- found = visible;
-
- /* Found visible tuple, return it. */
- if (found)
- {
- if (!pagemode)
- LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
- break;
- }
- else
- {
- /* Try next tuple from same page. */
- continue;
- }
- }
-
-
- if (!pagemode)
- LockBuffer(scan->rs_cbuf, BUFFER_LOCK_UNLOCK);
-
- blockno = DatumGetInt32(FunctionCall1(&desc->tsmnextblock,
- PointerGetDatum(desc)));
-
- /*
- * Report our new scan position for synchronization purposes. We don't
- * do that when moving backwards, however. That would just mess up any
- * other forward-moving scanners.
- *
- * Note: we do this before checking for end of scan so that the final
- * state of the position hint is back at the start of the rel. That's
- * not strictly necessary, but otherwise when you run the same query
- * multiple times the starting position would shift a little bit
- * backwards on every invocation, which is confusing. We don't
- * guarantee any specific ordering in general, though.
- */
- if (scan->rs_syncscan)
- ss_report_location(scan->rs_rd, BlockNumberIsValid(blockno) ?
- blockno : scan->rs_startblock);
-
- /*
- * Reached end of scan.
- */
- if (!BlockNumberIsValid(blockno))
- {
- if (BufferIsValid(scan->rs_cbuf))
- ReleaseBuffer(scan->rs_cbuf);
- scan->rs_cbuf = InvalidBuffer;
- scan->rs_cblock = InvalidBlockNumber;
- tuple->t_data = NULL;
- scan->rs_inited = false;
- return NULL;
- }
-
- heapgetpage(scan, blockno);
-
- if (!pagemode)
- LockBuffer(scan->rs_cbuf, BUFFER_LOCK_SHARE);
-
- page = (Page) BufferGetPage(scan->rs_cbuf);
- page_all_visible = PageIsAllVisible(page);
- maxoffset = PageGetMaxOffsetNumber(page);
- }
-
- pgstat_count_heap_getnext(scan->rs_rd);
-
- return &(scan->rs_ctup);
-}
-
-/*
- * Reset the sampling to starting state
- */
-void
-tablesample_reset(TableSampleDesc *desc)
-{
- (void) FunctionCall1(&desc->tsmreset, PointerGetDatum(desc));
-}
-
-/*
- * Signal the sampling method that the scan has finished.
- */
-void
-tablesample_end(TableSampleDesc *desc)
-{
- (void) FunctionCall1(&desc->tsmend, PointerGetDatum(desc));
-}
-
-/*
- * Check visibility of the tuple.
+ * GetTsmRoutine --- get a TsmRoutine struct by invoking the handler.
+ *
+ * This is a convenience routine that's just meant to check for errors.
*/
-static bool
-SampleTupleVisible(HeapTuple tuple, OffsetNumber tupoffset, HeapScanDesc scan)
+TsmRoutine *
+GetTsmRoutine(Oid tsmhandler)
{
- /*
- * If this scan is reading whole pages at a time, there is already
- * visibility info present in rs_vistuples so we can just search it for
- * the tupoffset.
- */
- if (scan->rs_pageatatime)
- {
- int start = 0,
- end = scan->rs_ntuples - 1;
-
- /*
- * Do the binary search over rs_vistuples, it's already sorted by
- * OffsetNumber so we don't need to do any sorting ourselves here.
- *
- * We could use bsearch() here but it's slower for integers because of
- * the function call overhead and because it needs boiler plate code
- * it would not save us anything code-wise anyway.
- */
- while (start <= end)
- {
- int mid = start + (end - start) / 2;
- OffsetNumber curoffset = scan->rs_vistuples[mid];
-
- if (curoffset == tupoffset)
- return true;
- else if (curoffset > tupoffset)
- end = mid - 1;
- else
- start = mid + 1;
- }
-
- return false;
- }
- else
- {
- /* No pagemode, we have to check the tuple itself. */
- Snapshot snapshot = scan->rs_snapshot;
- Buffer buffer = scan->rs_cbuf;
+ Datum datum;
+ TsmRoutine *routine;
- bool visible = HeapTupleSatisfiesVisibility(tuple, snapshot, buffer);
+ datum = OidFunctionCall1(tsmhandler, PointerGetDatum(NULL));
+ routine = (TsmRoutine *) DatumGetPointer(datum);
- CheckForSerializableConflictOut(visible, scan->rs_rd, tuple, buffer,
- snapshot);
+ if (routine == NULL || !IsA(routine, TsmRoutine))
+ elog(ERROR, "tablesample handler function %u did not return a TsmRoutine struct",
+ tsmhandler);
- return visible;
- }
+ return routine;
}