summaryrefslogtreecommitdiff
path: root/src/test/recovery/t/017_shm.pl
diff options
context:
space:
mode:
author Noah Misch <noah@leadboat.com> 2019-04-12 22:36:38 -0700
committer Noah Misch <noah@leadboat.com> 2019-04-12 22:36:38 -0700
commit c098509927f9a49ebceb301a2cb6a477ecd4ac3c (patch)
tree c53f974f64d6915c4cb1172924ec94a7349fa779 /src/test/recovery/t/017_shm.pl
parent db8db624e826efbe16aab1ae921bae071f98f099 (diff)
download postgresql-c098509927f9a49ebceb301a2cb6a477ecd4ac3c.tar.gz
Consistently test for in-use shared memory.
postmaster startup scrutinizes any shared memory segment recorded in postmaster.pid, exiting if that segment matches the current data directory and has an attached process. When the postmaster.pid file was missing, a starting postmaster used weaker checks. Change to use the same checks in both scenarios. This increases the chance of a startup failure, in lieu of data corruption, if the DBA does "kill -9 `head -n1 postmaster.pid` && rm postmaster.pid && pg_ctl -w start". A postmaster will no longer stop if shmat() of an old segment fails with EACCES. A postmaster will no longer recycle segments pertaining to other data directories. That's good for production, but it's bad for integration tests that crash a postmaster and immediately delete its data directory. Such a test now leaks a segment indefinitely. No "make check-world" test does that. win32_shmem.c already avoided all these problems. In 9.6 and later, enhance PostgresNode to facilitate testing. Back-patch to 9.4 (all supported versions). Reviewed (in earlier versions) by Daniel Gustafsson and Kyotaro HORIGUCHI. Discussion: https://postgr.es/m/20190408064141.GA2016666@rfd.leadboat.com
Diffstat (limited to 'src/test/recovery/t/017_shm.pl')
-rw-r--r-- src/test/recovery/t/017_shm.pl 200
1 files changed, 200 insertions, 0 deletions
diff --git a/src/test/recovery/t/017_shm.pl b/src/test/recovery/t/017_shm.pl
new file mode 100644
index 0000000000..3cbe938ddd
--- /dev/null
+++ b/src/test/recovery/t/017_shm.pl
@@ -0,0 +1,200 @@
+#
+# Tests of pg_shmem.h functions
+#
+use strict;
+use warnings;
+use IPC::Run 'run';
+use PostgresNode;
+use Test::More;
+use TestLib;
+use Time::HiRes qw(usleep);
+
+plan(tests => 5);
+
+my $port;    # assigned below; one port is shared by all nodes
+my $tempdir = TestLib::tempdir();
+
+# Best-effort "ipcs" logging: snapshot a baseline now, swallowing any error.
+my $ipcs_before = "$tempdir/ipcs_before";
+eval { run_log([ 'ipcs', '-am' ], '>', $ipcs_before); };
+
+sub log_ipcs    # log a diff of current "ipcs -am" output against the baseline
+{
+    eval { run_log([ 'ipcs', '-am' ], '|', [ 'diff', $ipcs_before, '-' ]); };
+    return;
+}
+
+# Choosing $port.  These tests need a $port such that nothing creates or
+# removes a segment in $port's IpcMemoryKey range while this script runs.
+# That cannot be ensured in general, but it is ensured when PostgreSQL tests
+# are the only actors.  With TCP, the first get_new_node picks a port number.
+# With Unix sockets, a postmaster, $port_holder, represents a key space
+# reservation: it holds a reservation on the key space of port
+# 1+$port_holder->port if it created the first IpcMemoryKey of its own port's
+# key space.  Concurrent copies of this test script will thus pick different
+# ports.  $port_holder postmasters use odd-numbered ports, and tests use
+# even-numbered ports.  In the absence of collisions from other shmget()
+# activity, gnat starts with key 0x7d001 (512001), and flea starts with key
+# 0x7d002 (512002).
+my $port_holder;
+if (!$PostgresNode::use_tcp)
+{
+    my $lock_port = 511;
+    while ($lock_port < 711)
+    {
+        $port_holder = PostgresNode->get_new_node("port${lock_port}_holder",
+            port => $lock_port, own_host => 1);
+        $port_holder->init;
+        $port_holder->append_conf('postgresql.conf', 'max_connections = 5');
+        $port_holder->start;
+
+        # Match the AddToDataDirLockFile() call in sysv_shmem.c.  Assume all
+        # systems not using sysv_shmem.c do use TCP.
+        my $shmem_key_line_prefix = sprintf("%9lu ", 1 + $lock_port * 1000);
+        my $pid_file_text = slurp_file($port_holder->data_dir . '/postmaster.pid');
+        last if $pid_file_text =~ /^$shmem_key_line_prefix/m;
+
+        $port_holder->stop;
+        $lock_port += 2;
+    }
+    $port = $lock_port + 1;
+}
+
+# Create, configure and start a node called $name, then return it.
+sub init_start
+{
+    my ($name) = @_;
+    my $node = PostgresNode->get_new_node($name, port => $port, own_host => 1);
+    $port = $node->port unless defined($port);    # same port for all nodes
+    $node->init;
+    # Keep semaphore use low, because several nodes run concurrently.
+    $node->append_conf('postgresql.conf', 'max_connections = 5');
+    $node->start;
+    log_ipcs();
+    return $node;
+}
+my $gnat = init_start 'gnat'; # init_start gives both nodes the same $port
+my $flea = init_start 'flea';
+
+# Upon postmaster death, postmaster children exit automatically.
+$gnat->kill9; # postmaster dies without a clean shutdown
+log_ipcs();
+$flea->restart; # flea ignores the shm key gnat abandoned.
+log_ipcs();
+poll_start($gnat); # gnat recycles its former shm key (may need retries).
+log_ipcs();
+
+# After clean shutdown, the nodes swap shm keys.
+$gnat->stop; # clean shutdown this time
+$flea->restart; # presumably takes over the key gnat just released
+log_ipcs();
+$gnat->start;
+log_ipcs();
+
+# Scenarios involving no postmaster.pid, a dead postmaster, and a live
+# backend.  A regress.c function keeps that backend busy, emulating the
+# responsiveness of a backend working through a CPU-intensive task.
+$gnat->safe_psql('postgres', <<EOSQL);
+CREATE FUNCTION wait_pid(int)
+ RETURNS void
+ AS '$ENV{REGRESS_SHLIB}'
+ LANGUAGE C STRICT;
+EOSQL
+my $slow_query = 'SELECT wait_pid(pg_backend_pid())';
+my ($stdout, $stderr); # receive the psql client's stdout and stderr
+my $slow_client = IPC::Run::start(
+ [
+ 'psql', '-X', '-qAt', '-d', $gnat->connstr('postgres'),
+ '-c', $slow_query
+ ],
+ '<',
+ \undef,
+ '>',
+ \$stdout,
+ '2>',
+ \$stderr,
+ IPC::Run::timeout(900)); # five times the poll_query_until timeout
+ok( $gnat->poll_query_until(
+ 'postgres',
+ "SELECT 1 FROM pg_stat_activity WHERE query = '$slow_query'", '1'),
+ 'slow query started');
+my $slow_pid = $gnat->safe_psql('postgres',
+ "SELECT pid FROM pg_stat_activity WHERE query = '$slow_query'"); # the cleanup step signals this pid
+$gnat->kill9; # dead postmaster; the slow backend is expected to remain
+unlink($gnat->data_dir . '/postmaster.pid'); # ... and no postmaster.pid
+$gnat->rotate_logfile; # on Windows, can't open old log for writing
+log_ipcs();
+# Ordinary startup must be rejected.  Retry for the same reasons poll_start()
+# does: try every 0.1s for at least 180s, stopping early once the node starts
+# or its log reports the pre-existing shared memory block.
+my $pre_existing_msg = qr/pre-existing shared memory block/;
+for my $attempt (1 .. 180 * 10)
+{
+    last if $gnat->start(fail_ok => 1);
+    last if slurp_file($gnat->logfile) =~ $pre_existing_msg;
+    usleep(100_000);
+}
+like(
+    slurp_file($gnat->logfile),
+    $pre_existing_msg,
+    'detected live backend via shared memory');
+
+# Single-user startup must be rejected as well.
+my $single_stderr;
+my @single_cmd = ('postgres', '--single', '-D', $gnat->data_dir, 'template1');
+my $single_ran =
+  run_log(\@single_cmd, '<', \('SELECT 1 + 1'), '2>', \$single_stderr);
+ok(!$single_ran, 'live query blocks --single');
+print STDERR $single_stderr;
+like(
+    $single_stderr, $pre_existing_msg,
+    'single-user mode detected live backend via shared memory');
+log_ipcs();
+# Fail to reject startup if shm key N has become available and we crash while
+# using key N+1. This is unwanted, but expected. Windows is immune, because
+# its GetSharedMemName() uses DataDir strings, not numeric keys.
+$flea->stop; # release first key
+is( $gnat->start(fail_ok => 1),
+ $TestLib::windows_os ? 0 : 1,
+ 'key turnover fools only sysv_shmem.c');
+$gnat->stop; # release first key (no-op on $TestLib::windows_os)
+$flea->start; # grab first key
+# Cleanup: send SIGQUIT to the slow backend, then reap its psql client.
+TestLib::system_log('pg_ctl', 'kill', 'QUIT', $slow_pid);
+$slow_client->finish; # client has detected backend termination
+log_ipcs();
+poll_start($gnat); # recycle second key
+
+$gnat->stop;
+$flea->stop;
+$port_holder->stop if $port_holder; # set only when not using TCP
+log_ipcs();
+
+
+# Start $node, retrying as needed.
+#
+# We may need retries to start a new postmaster.  Causes:
+# - kernel is slow to deliver SIGKILL
+# - postmaster parent is slow to waitpid()
+# - postmaster child is slow to exit in response to SIGQUIT
+# - postmaster child is slow to exit after postmaster death
+# Returns 1 once the node has started, or 0 if even the last attempt fails.
+sub poll_start
+{
+    my $node = shift;
+
+    # Make up to 1800 attempts, 0.1 second apart, so we keep trying for at
+    # least 180 seconds.
+    for my $attempt (1 .. 180 * 10)
+    {
+        return 1 if $node->start(fail_ok => 1);
+
+        # Pause 0.1 second before the next attempt.
+        usleep(100_000);
+    }
+
+    # Still not started.  Try one last time without fail_ok, which will
+    # BAIL_OUT unless it succeeds.
+    return 1 if $node->start;
+    return 0;
+}