summaryrefslogtreecommitdiff
path: root/contrib/auto_explain
diff options
context:
space:
mode:
authorTom Lane <tgl@sss.pgh.pa.us>2019-06-03 18:06:04 -0400
committerTom Lane <tgl@sss.pgh.pa.us>2019-06-03 18:06:04 -0400
commit2cd4e835701e15f5e32dd8bde488f8669ddf7ae8 (patch)
tree23b7f54b6935416d4dcfae8ec3d8e8f9fa1e205e /contrib/auto_explain
parenteaf0292c3ba7acac2735f99a8988bd49a26112f5 (diff)
downloadpostgresql-2cd4e835701e15f5e32dd8bde488f8669ddf7ae8.tar.gz
Fix contrib/auto_explain to not cause problems in parallel workers.
A parallel worker process should not be making any decisions of its own about whether to auto-explain. If the parent session process passed down flags asking for instrumentation data, do that, otherwise not. Trying to enable instrumentation anyway leads to bugs like the "could not find key N in shm TOC" failure reported in bug #15821 from Christian Hofstaedtler. We can implement this cheaply by piggybacking on the existing logic for not doing anything when we've chosen not to sample a statement. While at it, clean up some tin-eared coding related to the sampling feature, including an off-by-one error that meant that asking for 1.0 sampling rate didn't actually result in sampling every statement. Although the specific case reported here only manifested in >= v11, I believe that related misbehaviors can be demonstrated in any version that has parallel query; and the off-by-one error is certainly there back to 9.6 where that feature was added. So back-patch to 9.6. Discussion: https://postgr.es/m/15821-5eb422e980594075@postgresql.org
Diffstat (limited to 'contrib/auto_explain')
-rw-r--r--contrib/auto_explain/auto_explain.c43
1 files changed, 28 insertions, 15 deletions
diff --git a/contrib/auto_explain/auto_explain.c b/contrib/auto_explain/auto_explain.c
index c810a41a79..a9536c2de0 100644
--- a/contrib/auto_explain/auto_explain.c
+++ b/contrib/auto_explain/auto_explain.c
@@ -14,6 +14,7 @@
#include <limits.h>
+#include "access/parallel.h"
#include "commands/explain.h"
#include "executor/instrument.h"
#include "jit/jit.h"
@@ -59,19 +60,20 @@ static const struct config_enum_entry loglevel_options[] = {
/* Current nesting depth of ExecutorRun calls */
static int nesting_level = 0;
+/* Is the current top-level query to be sampled? */
+static bool current_query_sampled = false;
+
+#define auto_explain_enabled() \
+ (auto_explain_log_min_duration >= 0 && \
+ (nesting_level == 0 || auto_explain_log_nested_statements) && \
+ current_query_sampled)
+
/* Saved hook values in case of unload */
static ExecutorStart_hook_type prev_ExecutorStart = NULL;
static ExecutorRun_hook_type prev_ExecutorRun = NULL;
static ExecutorFinish_hook_type prev_ExecutorFinish = NULL;
static ExecutorEnd_hook_type prev_ExecutorEnd = NULL;
-/* Is the current query sampled, per backend */
-static bool current_query_sampled = true;
-
-#define auto_explain_enabled() \
- (auto_explain_log_min_duration >= 0 && \
- (nesting_level == 0 || auto_explain_log_nested_statements))
-
void _PG_init(void);
void _PG_fini(void);
@@ -249,14 +251,25 @@ static void
explain_ExecutorStart(QueryDesc *queryDesc, int eflags)
{
/*
- * For rate sampling, randomly choose top-level statement. Either all
- * nested statements will be explained or none will.
+ * At the beginning of each top-level statement, decide whether we'll
+ * sample this statement. If nested-statement explaining is enabled,
+ * either all nested statements will be explained or none will.
+ *
+ * When in a parallel worker, we should do nothing, which we can implement
+ * cheaply by pretending we decided not to sample the current statement.
+ * If EXPLAIN is active in the parent session, data will be collected and
+ * reported back to the parent, and it's no business of ours to interfere.
*/
- if (auto_explain_log_min_duration >= 0 && nesting_level == 0)
- current_query_sampled = (random() < auto_explain_sample_rate *
- MAX_RANDOM_VALUE);
+ if (nesting_level == 0)
+ {
+ if (auto_explain_log_min_duration >= 0 && !IsParallelWorker())
+ current_query_sampled = (random() < auto_explain_sample_rate *
+ ((double) MAX_RANDOM_VALUE + 1));
+ else
+ current_query_sampled = false;
+ }
- if (auto_explain_enabled() && current_query_sampled)
+ if (auto_explain_enabled())
{
/* Enable per-node instrumentation iff log_analyze is required. */
if (auto_explain_log_analyze && (eflags & EXEC_FLAG_EXPLAIN_ONLY) == 0)
@@ -275,7 +288,7 @@ explain_ExecutorStart(QueryDesc *queryDesc, int eflags)
else
standard_ExecutorStart(queryDesc, eflags);
- if (auto_explain_enabled() && current_query_sampled)
+ if (auto_explain_enabled())
{
/*
* Set up to track total elapsed time in ExecutorRun. Make sure the
@@ -346,7 +359,7 @@ explain_ExecutorFinish(QueryDesc *queryDesc)
static void
explain_ExecutorEnd(QueryDesc *queryDesc)
{
- if (queryDesc->totaltime && auto_explain_enabled() && current_query_sampled)
+ if (queryDesc->totaltime && auto_explain_enabled())
{
double msec;