summary | refs | log | tree | commit | diff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r-- src/backend/access/transam/xlog.c 37
-rw-r--r-- src/backend/access/transam/xlogprefetcher.c 2
-rw-r--r-- src/backend/storage/buffer/bufmgr.c 16
-rw-r--r-- src/backend/storage/buffer/localbuf.c 7
-rw-r--r-- src/backend/storage/file/fd.c 98
-rw-r--r-- src/backend/storage/smgr/md.c 24
-rw-r--r-- src/backend/storage/smgr/smgr.c 1
-rw-r--r-- src/backend/utils/misc/guc_tables.c 12
-rw-r--r-- src/include/storage/fd.h 7
-rw-r--r-- src/include/storage/smgr.h 1
-rw-r--r-- src/include/utils/guc_hooks.h 2
-rw-r--r-- src/test/modules/test_misc/meson.build 1
-rw-r--r-- src/test/modules/test_misc/t/004_io_direct.pl 57
13 files changed, 231 insertions, 34 deletions
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index a5c74fdab8..18e16ae5b3 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -2926,6 +2926,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
XLogSegNo max_segno;
int fd;
int save_errno;
+ int open_flags = O_RDWR | O_CREAT | O_EXCL | PG_BINARY;
Assert(logtli != 0);
@@ -2959,8 +2960,11 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli,
unlink(tmppath);
+ if (io_direct_flags & IO_DIRECT_WAL_INIT)
+ open_flags |= PG_O_DIRECT;
+
/* do not use get_sync_bit() here --- want to fsync only at end of fill */
- fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
+ fd = BasicOpenFile(tmppath, open_flags);
if (fd < 0)
ereport(ERROR,
(errcode_for_file_access(),
@@ -3354,7 +3358,7 @@ XLogFileClose(void)
* use the cache to read the WAL segment.
*/
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
- if (!XLogIsNeeded())
+ if (!XLogIsNeeded() && (io_direct_flags & IO_DIRECT_WAL) == 0)
(void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
#endif
@@ -4445,7 +4449,6 @@ show_in_hot_standby(void)
return RecoveryInProgress() ? "on" : "off";
}
-
/*
* Read the control file, set respective GUCs.
*
@@ -8029,35 +8032,27 @@ xlog_redo(XLogReaderState *record)
}
/*
- * Return the (possible) sync flag used for opening a file, depending on the
- * value of the GUC wal_sync_method.
+ * Return the extra open flags used for opening a file, depending on the
+ * value of the GUCs wal_sync_method, fsync and io_direct.
*/
static int
get_sync_bit(int method)
{
int o_direct_flag = 0;
- /* If fsync is disabled, never open in sync mode */
- if (!enableFsync)
- return 0;
-
/*
- * Optimize writes by bypassing kernel cache with O_DIRECT when using
- * O_SYNC and O_DSYNC. But only if archiving and streaming are disabled,
- * otherwise the archive command or walsender process will read the WAL
- * soon after writing it, which is guaranteed to cause a physical read if
- * we bypassed the kernel cache. We also skip the
- * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
- * reason.
- *
- * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
+ * Use O_DIRECT if requested, except in walreceiver process. The WAL
* written by walreceiver is normally read by the startup process soon
- * after it's written. Also, walreceiver performs unaligned writes, which
+ * after it's written. Also, walreceiver performs unaligned writes, which
* don't work with O_DIRECT, so it is required for correctness too.
*/
- if (!XLogIsNeeded() && !AmWalReceiverProcess())
+ if ((io_direct_flags & IO_DIRECT_WAL) && !AmWalReceiverProcess())
o_direct_flag = PG_O_DIRECT;
+ /* If fsync is disabled, never open in sync mode */
+ if (!enableFsync)
+ return o_direct_flag;
+
switch (method)
{
/*
@@ -8069,7 +8064,7 @@ get_sync_bit(int method)
case SYNC_METHOD_FSYNC:
case SYNC_METHOD_FSYNC_WRITETHROUGH:
case SYNC_METHOD_FDATASYNC:
- return 0;
+ return o_direct_flag;
#ifdef O_SYNC
case SYNC_METHOD_OPEN:
return O_SYNC | o_direct_flag;
diff --git a/src/backend/access/transam/xlogprefetcher.c b/src/backend/access/transam/xlogprefetcher.c
index 046e40d143..7ba18f2a76 100644
--- a/src/backend/access/transam/xlogprefetcher.c
+++ b/src/backend/access/transam/xlogprefetcher.c
@@ -785,7 +785,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn)
block->prefetch_buffer = InvalidBuffer;
return LRQ_NEXT_IO;
}
- else
+ else if ((io_direct_flags & IO_DIRECT_DATA) == 0)
{
/*
* This shouldn't be possible, because we already determined
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 5a237d5606..7778dde3e5 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -541,8 +541,11 @@ PrefetchSharedBuffer(SMgrRelation smgr_reln,
* Try to initiate an asynchronous read. This returns false in
* recovery if the relation file doesn't exist.
*/
- if (smgrprefetch(smgr_reln, forkNum, blockNum))
+ if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
+ smgrprefetch(smgr_reln, forkNum, blockNum))
+ {
result.initiated_io = true;
+ }
#endif /* USE_PREFETCH */
}
else
@@ -588,11 +591,11 @@ PrefetchSharedBuffer(SMgrRelation smgr_reln,
* the kernel and therefore didn't really initiate I/O, and no way to know when
* the I/O completes other than using synchronous ReadBuffer().
*
- * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and either
+ * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and
* USE_PREFETCH is not defined (this build doesn't support prefetching due to
- * lack of a kernel facility), or the underlying relation file wasn't found and
- * we are in recovery. (If the relation file wasn't found and we are not in
- * recovery, an error is raised).
+ * lack of a kernel facility), direct I/O is enabled, or the underlying
+ * relation file wasn't found and we are in recovery. (If the relation file
+ * wasn't found and we are not in recovery, an error is raised).
*/
PrefetchBufferResult
PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum)
@@ -5440,6 +5443,9 @@ ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag)
{
PendingWriteback *pending;
+ if (io_direct_flags & IO_DIRECT_DATA)
+ return;
+
/*
* Add buffer to the pending writeback array, unless writeback control is
* disabled.
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index 3c6382456a..f684862d98 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -92,8 +92,11 @@ PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum,
{
#ifdef USE_PREFETCH
/* Not in buffers, so initiate prefetch */
- smgrprefetch(smgr, forkNum, blockNum);
- result.initiated_io = true;
+ if ((io_direct_flags & IO_DIRECT_DATA) == 0 &&
+ smgrprefetch(smgr, forkNum, blockNum))
+ {
+ result.initiated_io = true;
+ }
#endif /* USE_PREFETCH */
}
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index a280a1e7be..277a28fc13 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -98,7 +98,9 @@
#include "storage/fd.h"
#include "storage/ipc.h"
#include "utils/guc.h"
+#include "utils/guc_hooks.h"
#include "utils/resowner_private.h"
+#include "utils/varlena.h"
/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */
#if defined(HAVE_SYNC_FILE_RANGE)
@@ -162,6 +164,9 @@ bool data_sync_retry = false;
/* How SyncDataDirectory() should do its job. */
int recovery_init_sync_method = RECOVERY_INIT_SYNC_METHOD_FSYNC;
+/* Which kinds of files should be opened with PG_O_DIRECT. */
+int io_direct_flags;
+
/* Debugging.... */
#ifdef FDDEBUG
@@ -2022,6 +2027,9 @@ FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info)
if (nbytes <= 0)
return;
+ if (VfdCache[file].fileFlags & PG_O_DIRECT)
+ return;
+
returnCode = FileAccess(file);
if (returnCode < 0)
return;
@@ -3826,3 +3834,93 @@ data_sync_elevel(int elevel)
{
return data_sync_retry ? elevel : PANIC;
}
+
+/*
+ * GUC check hook for io_direct.
+ *
+ * Parses the comma-separated list in *newval into a bitmask of IO_DIRECT_*
+ * flags.  Recognized items, matched case-insensitively, are "data", "wal"
+ * and "wal_init".  On success the bitmask is stored in a guc_malloc'd int
+ * at *extra, where assign_io_direct picks it up, and true is returned.
+ *
+ * Returns false, without setting *extra, if the list cannot be parsed,
+ * contains an unknown item, requests direct I/O for a file class whose
+ * block size is smaller than PG_IO_ALIGN_SIZE, or if memory for *extra
+ * cannot be allocated.  In builds where PG_O_DIRECT is 0, anything other
+ * than the empty string is rejected.
+ */
+bool
+check_io_direct(char **newval, void **extra, GucSource source)
+{
+	bool		result = true;
+	int			flags;
+
+#if PG_O_DIRECT == 0
+	if (strcmp(*newval, "") != 0)
+	{
+		GUC_check_errdetail("io_direct is not supported on this platform.");
+		result = false;
+	}
+	flags = 0;
+#else
+	List	   *elemlist;
+	ListCell   *l;
+	char	   *rawstring;
+
+	/* Need a modifiable copy of string */
+	rawstring = pstrdup(*newval);
+
+	if (!SplitGUCList(rawstring, ',', &elemlist))
+	{
+		GUC_check_errdetail("invalid list syntax in parameter \"%s\"",
+							"io_direct");
+		pfree(rawstring);
+		list_free(elemlist);
+		return false;
+	}
+
+	flags = 0;
+	foreach(l, elemlist)
+	{
+		char	   *item = (char *) lfirst(l);
+
+		if (pg_strcasecmp(item, "data") == 0)
+			flags |= IO_DIRECT_DATA;
+		else if (pg_strcasecmp(item, "wal") == 0)
+			flags |= IO_DIRECT_WAL;
+		else if (pg_strcasecmp(item, "wal_init") == 0)
+			flags |= IO_DIRECT_WAL_INIT;
+		else
+		{
+			/* Stop at the first unknown item; it is named in the detail. */
+			GUC_check_errdetail("invalid option \"%s\"", item);
+			result = false;
+			break;
+		}
+	}
+
+	/*
+	 * It's possible to configure block sizes smaller than our assumed I/O
+	 * alignment size, which could result in invalid I/O requests.
+	 */
+#if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE
+	if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT)))
+	{
+		GUC_check_errdetail("io_direct is not supported for WAL because XLOG_BLCKSZ is too small");
+		result = false;
+	}
+#endif
+#if BLCKSZ < PG_IO_ALIGN_SIZE
+	if (result && (flags & IO_DIRECT_DATA))
+	{
+		GUC_check_errdetail("io_direct is not supported for data because BLCKSZ is too small");
+		result = false;
+	}
+#endif
+
+	/* The list cells point into rawstring, so release both together. */
+	pfree(rawstring);
+	list_free(elemlist);
+#endif
+
+	if (!result)
+		return result;
+
+	/*
+	 * Save the flags in *extra, for use by assign_io_direct.  Allocate at
+	 * LOG level and report failure through the return value, so that an
+	 * out-of-memory condition rejects the new value instead of throwing an
+	 * error out of the check hook.
+	 */
+	*extra = guc_malloc(LOG, sizeof(int));
+	if (*extra == NULL)
+		return false;
+	*((int *) *extra) = flags;
+
+	return result;
+}
+
+/*
+ * GUC assign hook for io_direct.
+ *
+ * Installs the flag bitmask that check_io_direct stored in "extra" as the
+ * new value of io_direct_flags.  "newval" is not examined here.
+ */
+void
+assign_io_direct(const char *newval, void *extra)
+{
+	int		   *flags = (int *) extra;
+
+	io_direct_flags = *flags;
+}
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index d1124d46f4..f1316eb4ce 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -142,6 +142,16 @@ static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forknum,
static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
MdfdVec *seg);
+/*
+ * _mdfd_open_flags() -- open(2) flags common to every file md.c opens.
+ *
+ * Always O_RDWR | PG_BINARY, with PG_O_DIRECT added when io_direct_flags
+ * has IO_DIRECT_DATA set.  Callers OR in O_CREAT, O_EXCL etc. themselves.
+ */
+static inline int
+_mdfd_open_flags(void)
+{
+	return (io_direct_flags & IO_DIRECT_DATA)
+		? (O_RDWR | PG_BINARY | PG_O_DIRECT)
+		: (O_RDWR | PG_BINARY);
+}
/*
* mdinit() -- Initialize private state for magnetic disk storage manager.
@@ -205,14 +215,14 @@ mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
path = relpath(reln->smgr_rlocator, forknum);
- fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
+ fd = PathNameOpenFile(path, _mdfd_open_flags() | O_CREAT | O_EXCL);
if (fd < 0)
{
int save_errno = errno;
if (isRedo)
- fd = PathNameOpenFile(path, O_RDWR | PG_BINARY);
+ fd = PathNameOpenFile(path, _mdfd_open_flags());
if (fd < 0)
{
/* be sure to report the error reported by create, not open */
@@ -635,7 +645,7 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior)
path = relpath(reln->smgr_rlocator, forknum);
- fd = PathNameOpenFile(path, O_RDWR | PG_BINARY);
+ fd = PathNameOpenFile(path, _mdfd_open_flags());
if (fd < 0)
{
@@ -706,6 +716,8 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
off_t seekpos;
MdfdVec *v;
+ Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
+
v = _mdfd_getseg(reln, forknum, blocknum, false,
InRecovery ? EXTENSION_RETURN_NULL : EXTENSION_FAIL);
if (v == NULL)
@@ -731,6 +743,8 @@ void
mdwriteback(SMgrRelation reln, ForkNumber forknum,
BlockNumber blocknum, BlockNumber nblocks)
{
+ Assert((io_direct_flags & IO_DIRECT_DATA) == 0);
+
/*
* Issue flush requests in as few requests as possible; have to split at
* segment boundaries though, since those are actually separate files.
@@ -1335,7 +1349,7 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
fullpath = _mdfd_segpath(reln, forknum, segno);
/* open the file */
- fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags);
+ fd = PathNameOpenFile(fullpath, _mdfd_open_flags() | oflags);
pfree(fullpath);
@@ -1546,7 +1560,7 @@ mdsyncfiletag(const FileTag *ftag, char *path)
strlcpy(path, p, MAXPGPATH);
pfree(p);
- file = PathNameOpenFile(path, O_RDWR | PG_BINARY);
+ file = PathNameOpenFile(path, _mdfd_open_flags());
if (file < 0)
return -1;
need_to_close = true;
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index c37c246b77..70d0d570b1 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -20,6 +20,7 @@
#include "access/xlogutils.h"
#include "lib/ilist.h"
#include "storage/bufmgr.h"
+#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/md.h"
#include "storage/smgr.h"
diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c
index 97edc61a14..cab3ddbe11 100644
--- a/src/backend/utils/misc/guc_tables.c
+++ b/src/backend/utils/misc/guc_tables.c
@@ -568,6 +568,7 @@ static char *locale_ctype;
static char *server_encoding_string;
static char *server_version_string;
static int server_version_num;
+static char *io_direct_string;
#ifdef HAVE_SYSLOG
#define DEFAULT_SYSLOG_FACILITY LOG_LOCAL0
@@ -4575,6 +4576,17 @@ struct config_string ConfigureNamesString[] =
check_backtrace_functions, assign_backtrace_functions, NULL
},
+ {
+ {"io_direct", PGC_POSTMASTER, DEVELOPER_OPTIONS,
+ gettext_noop("Use direct I/O for file access."),
+ NULL,
+ GUC_LIST_INPUT | GUC_NOT_IN_SAMPLE
+ },
+ &io_direct_string,
+ "",
+ check_io_direct, assign_io_direct, NULL
+ },
+
/* End-of-list marker */
{
{NULL, 0, 0, NULL, NULL}, NULL, NULL, NULL, NULL, NULL
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index faac4914fe..6791a406fc 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -44,6 +44,7 @@
#define FD_H
#include <dirent.h>
+#include <fcntl.h>
typedef enum RecoveryInitSyncMethod
{
@@ -54,10 +55,16 @@ typedef enum RecoveryInitSyncMethod
typedef int File;
+#define IO_DIRECT_DATA 0x01
+#define IO_DIRECT_WAL 0x02
+#define IO_DIRECT_WAL_INIT 0x04
+
+
/* GUC parameter */
extern PGDLLIMPORT int max_files_per_process;
extern PGDLLIMPORT bool data_sync_retry;
extern PGDLLIMPORT int recovery_init_sync_method;
+extern PGDLLIMPORT int io_direct_flags;
/*
* This is private to fd.c, but exported for save/restore_backend_variables()
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index a9a179aaba..17fba6f91a 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -17,6 +17,7 @@
#include "lib/ilist.h"
#include "storage/block.h"
#include "storage/relfilelocator.h"
+#include "utils/guc.h"
/*
* smgr.c maintains a table of SMgrRelation objects, which are essentially
diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h
index f722fb250a..a82a85c940 100644
--- a/src/include/utils/guc_hooks.h
+++ b/src/include/utils/guc_hooks.h
@@ -156,5 +156,7 @@ extern bool check_wal_consistency_checking(char **newval, void **extra,
GucSource source);
extern void assign_wal_consistency_checking(const char *newval, void *extra);
extern void assign_xlog_sync_method(int new_sync_method, void *extra);
+extern bool check_io_direct(char **newval, void **extra, GucSource source);
+extern void assign_io_direct(const char *newval, void *extra);
#endif /* GUC_HOOKS_H */
diff --git a/src/test/modules/test_misc/meson.build b/src/test/modules/test_misc/meson.build
index 21bde427b4..911084ac0f 100644
--- a/src/test/modules/test_misc/meson.build
+++ b/src/test/modules/test_misc/meson.build
@@ -9,6 +9,7 @@ tests += {
't/001_constraint_validation.pl',
't/002_tablespace.pl',
't/003_check_guc.pl',
+ 't/004_io_direct.pl',
],
},
}
diff --git a/src/test/modules/test_misc/t/004_io_direct.pl b/src/test/modules/test_misc/t/004_io_direct.pl
new file mode 100644
index 0000000000..f5bf0b11e4
--- /dev/null
+++ b/src/test/modules/test_misc/t/004_io_direct.pl
@@ -0,0 +1,57 @@
+# Very simple exercise of direct I/O GUC.
+
+use strict;
+use warnings;
+use PostgreSQL::Test::Cluster;
+use PostgreSQL::Test::Utils;
+use Test::More;
+
+# Systems that we know to have direct I/O support, and whose typical local
+# filesystems support it or at least won't fail with an error.  (illumos
+# should probably be in this list, but perl reports it as solaris.  Solaris
+# should not be in the list because we don't support its way of turning on
+# direct I/O, and even if we did, its version of ZFS rejects it.  OpenBSD
+# just doesn't have it.)
+if (!grep { $^O eq $_ } qw(aix darwin dragonfly freebsd linux MSWin32 netbsd))
+{
+	plan skip_all => "no direct I/O support";
+}
+
+my $node = PostgreSQL::Test::Cluster->new('main');
+$node->init;
+$node->append_conf(
+	'postgresql.conf', qq{
+io_direct = 'data,wal,wal_init'
+shared_buffers = '256kB' # tiny to force I/O
+});
+$node->start;
+
+# Do some work that is bound to generate shared and local writes and reads as a
+# simple exercise.
+$node->safe_psql('postgres',
+	'create table t1 as select 1 as i from generate_series(1, 10000)');
+$node->safe_psql('postgres', 'create table t2count (i int)');
+$node->safe_psql(
+	'postgres', qq{
+begin;
+create temporary table t2 as select 1 as i from generate_series(1, 10000);
+update t2 set i = i;
+insert into t2count select count(*) from t2;
+commit;
+});
+$node->safe_psql('postgres', 'update t1 set i = i');
+# Test::More's is() takes (got, expected, name), in that order.
+is( $node->safe_psql('postgres', 'select count(*) from t1'),
+	'10000',
+	"read back from shared");
+is( $node->safe_psql('postgres', 'select * from t2count'),
+	'10000',
+	"read back from local");
+# Immediate shutdown, so that the next start has to run crash recovery.
+$node->stop('immediate');
+
+$node->start;
+is( $node->safe_psql('postgres', 'select count(*) from t1'),
+	'10000',
+	"read back from shared after crash recovery");
+$node->stop;
+
+done_testing();