diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/backend/access/transam/xlog.c | 37 | ||||
-rw-r--r-- | src/backend/access/transam/xlogprefetcher.c | 2 | ||||
-rw-r--r-- | src/backend/storage/buffer/bufmgr.c | 16 | ||||
-rw-r--r-- | src/backend/storage/buffer/localbuf.c | 7 | ||||
-rw-r--r-- | src/backend/storage/file/fd.c | 98 | ||||
-rw-r--r-- | src/backend/storage/smgr/md.c | 24 | ||||
-rw-r--r-- | src/backend/storage/smgr/smgr.c | 1 | ||||
-rw-r--r-- | src/backend/utils/misc/guc_tables.c | 12 | ||||
-rw-r--r-- | src/include/storage/fd.h | 7 | ||||
-rw-r--r-- | src/include/storage/smgr.h | 1 | ||||
-rw-r--r-- | src/include/utils/guc_hooks.h | 2 | ||||
-rw-r--r-- | src/test/modules/test_misc/meson.build | 1 | ||||
-rw-r--r-- | src/test/modules/test_misc/t/004_io_direct.pl | 57 |
13 files changed, 231 insertions, 34 deletions
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index a5c74fdab8..18e16ae5b3 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -2926,6 +2926,7 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, XLogSegNo max_segno; int fd; int save_errno; + int open_flags = O_RDWR | O_CREAT | O_EXCL | PG_BINARY; Assert(logtli != 0); @@ -2959,8 +2960,11 @@ XLogFileInitInternal(XLogSegNo logsegno, TimeLineID logtli, unlink(tmppath); + if (io_direct_flags & IO_DIRECT_WAL_INIT) + open_flags |= PG_O_DIRECT; + /* do not use get_sync_bit() here --- want to fsync only at end of fill */ - fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); + fd = BasicOpenFile(tmppath, open_flags); if (fd < 0) ereport(ERROR, (errcode_for_file_access(), @@ -3354,7 +3358,7 @@ XLogFileClose(void) * use the cache to read the WAL segment. */ #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) - if (!XLogIsNeeded()) + if (!XLogIsNeeded() && (io_direct_flags & IO_DIRECT_WAL) == 0) (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED); #endif @@ -4445,7 +4449,6 @@ show_in_hot_standby(void) return RecoveryInProgress() ? "on" : "off"; } - /* * Read the control file, set respective GUCs. * @@ -8029,35 +8032,27 @@ xlog_redo(XLogReaderState *record) } /* - * Return the (possible) sync flag used for opening a file, depending on the - * value of the GUC wal_sync_method. + * Return the extra open flags used for opening a file, depending on the + * value of the GUCs wal_sync_method, fsync and io_direct. */ static int get_sync_bit(int method) { int o_direct_flag = 0; - /* If fsync is disabled, never open in sync mode */ - if (!enableFsync) - return 0; - /* - * Optimize writes by bypassing kernel cache with O_DIRECT when using - * O_SYNC and O_DSYNC. But only if archiving and streaming are disabled, - * otherwise the archive command or walsender process will read the WAL - * soon after writing it, which is guaranteed to cause a physical read if - * we bypassed the kernel cache. We also skip the - * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same - * reason. - * - * Never use O_DIRECT in walreceiver process for similar reasons; the WAL + * Use O_DIRECT if requested, except in walreceiver process. The WAL * written by walreceiver is normally read by the startup process soon - * after it's written. Also, walreceiver performs unaligned writes, which + * after it's written. Also, walreceiver performs unaligned writes, which * don't work with O_DIRECT, so it is required for correctness too. */ - if (!XLogIsNeeded() && !AmWalReceiverProcess()) + if ((io_direct_flags & IO_DIRECT_WAL) && !AmWalReceiverProcess()) o_direct_flag = PG_O_DIRECT; + /* If fsync is disabled, never open in sync mode */ + if (!enableFsync) + return o_direct_flag; + switch (method) { /* @@ -8069,7 +8064,7 @@ get_sync_bit(int method) case SYNC_METHOD_FSYNC: case SYNC_METHOD_FSYNC_WRITETHROUGH: case SYNC_METHOD_FDATASYNC: - return 0; + return o_direct_flag; #ifdef O_SYNC case SYNC_METHOD_OPEN: return O_SYNC | o_direct_flag; diff --git a/src/backend/access/transam/xlogprefetcher.c b/src/backend/access/transam/xlogprefetcher.c index 046e40d143..7ba18f2a76 100644 --- a/src/backend/access/transam/xlogprefetcher.c +++ b/src/backend/access/transam/xlogprefetcher.c @@ -785,7 +785,7 @@ XLogPrefetcherNextBlock(uintptr_t pgsr_private, XLogRecPtr *lsn) block->prefetch_buffer = InvalidBuffer; return LRQ_NEXT_IO; } - else + else if ((io_direct_flags & IO_DIRECT_DATA) == 0) { /* * This shouldn't be possible, because we already determined diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 5a237d5606..7778dde3e5 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -541,8 +541,11 @@ PrefetchSharedBuffer(SMgrRelation smgr_reln, * Try to initiate an asynchronous read. This returns false in * recovery if the relation file doesn't exist. */ - if (smgrprefetch(smgr_reln, forkNum, blockNum)) + if ((io_direct_flags & IO_DIRECT_DATA) == 0 && + smgrprefetch(smgr_reln, forkNum, blockNum)) + { result.initiated_io = true; + } #endif /* USE_PREFETCH */ } else @@ -588,11 +591,11 @@ PrefetchSharedBuffer(SMgrRelation smgr_reln, * the kernel and therefore didn't really initiate I/O, and no way to know when * the I/O completes other than using synchronous ReadBuffer(). * - * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and either + * 3. Otherwise, the buffer wasn't already cached by PostgreSQL, and * USE_PREFETCH is not defined (this build doesn't support prefetching due to - * lack of a kernel facility), or the underlying relation file wasn't found and - * we are in recovery. (If the relation file wasn't found and we are not in - * recovery, an error is raised). + * lack of a kernel facility), direct I/O is enabled, or the underlying + * relation file wasn't found and we are in recovery. (If the relation file + * wasn't found and we are not in recovery, an error is raised). */ PrefetchBufferResult PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum) @@ -5440,6 +5443,9 @@ ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag) { PendingWriteback *pending; + if (io_direct_flags & IO_DIRECT_DATA) + return; + /* * Add buffer to the pending writeback array, unless writeback control is * disabled. diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 3c6382456a..f684862d98 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -92,8 +92,11 @@ PrefetchLocalBuffer(SMgrRelation smgr, ForkNumber forkNum, { #ifdef USE_PREFETCH /* Not in buffers, so initiate prefetch */ - smgrprefetch(smgr, forkNum, blockNum); - result.initiated_io = true; + if ((io_direct_flags & IO_DIRECT_DATA) == 0 && + smgrprefetch(smgr, forkNum, blockNum)) + { + result.initiated_io = true; + } #endif /* USE_PREFETCH */ } diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index a280a1e7be..277a28fc13 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -98,7 +98,9 @@ #include "storage/fd.h" #include "storage/ipc.h" #include "utils/guc.h" +#include "utils/guc_hooks.h" #include "utils/resowner_private.h" +#include "utils/varlena.h" /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */ #if defined(HAVE_SYNC_FILE_RANGE) @@ -162,6 +164,9 @@ bool data_sync_retry = false; /* How SyncDataDirectory() should do its job. */ int recovery_init_sync_method = RECOVERY_INIT_SYNC_METHOD_FSYNC; +/* Which kinds of files should be opened with PG_O_DIRECT. */ +int io_direct_flags; + /* Debugging.... */ #ifdef FDDEBUG @@ -2022,6 +2027,9 @@ FileWriteback(File file, off_t offset, off_t nbytes, uint32 wait_event_info) if (nbytes <= 0) return; + if (VfdCache[file].fileFlags & PG_O_DIRECT) + return; + returnCode = FileAccess(file); if (returnCode < 0) return; @@ -3826,3 +3834,93 @@ data_sync_elevel(int elevel) { return data_sync_retry ? elevel : PANIC; } + +bool +check_io_direct(char **newval, void **extra, GucSource source) +{ + bool result = true; + int flags; + +#if PG_O_DIRECT == 0 + if (strcmp(*newval, "") != 0) + { + GUC_check_errdetail("io_direct is not supported on this platform."); + result = false; + } + flags = 0; +#else + List *elemlist; + ListCell *l; + char *rawstring; + + /* Need a modifiable copy of string */ + rawstring = pstrdup(*newval); + + if (!SplitGUCList(rawstring, ',', &elemlist)) + { + GUC_check_errdetail("invalid list syntax in parameter \"%s\"", + "io_direct"); + pfree(rawstring); + list_free(elemlist); + return false; + } + + flags = 0; + foreach(l, elemlist) + { + char *item = (char *) lfirst(l); + + if (pg_strcasecmp(item, "data") == 0) + flags |= IO_DIRECT_DATA; + else if (pg_strcasecmp(item, "wal") == 0) + flags |= IO_DIRECT_WAL; + else if (pg_strcasecmp(item, "wal_init") == 0) + flags |= IO_DIRECT_WAL_INIT; + else + { + GUC_check_errdetail("invalid option \"%s\"", item); + result = false; + break; + } + } + + /* + * It's possible to configure block sizes smaller than our assumed I/O + * alignment size, which could result in invalid I/O requests. + */ +#if XLOG_BLCKSZ < PG_IO_ALIGN_SIZE + if (result && (flags & (IO_DIRECT_WAL | IO_DIRECT_WAL_INIT))) + { + GUC_check_errdetail("io_direct is not supported for WAL because XLOG_BLCKSZ is too small"); + result = false; + } +#endif +#if BLCKSZ < PG_IO_ALIGN_SIZE + if (result && (flags & IO_DIRECT_DATA)) + { + GUC_check_errdetail("io_direct is not supported for data because BLCKSZ is too small"); + result = false; + } +#endif + + pfree(rawstring); + list_free(elemlist); +#endif + + if (!result) + return result; + + /* Save the flags in *extra, for use by assign_io_direct */ + *extra = guc_malloc(ERROR, sizeof(int)); + *((int *) *extra) = flags; + + return result; +} + +extern void +assign_io_direct(const char *newval, void *extra) +{ + int *flags = (int *) extra; + + io_direct_flags = *flags; +} diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index d1124d46f4..f1316eb4ce 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -142,6 +142,16 @@ static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg); +static inline int +_mdfd_open_flags(void) +{ + int flags = O_RDWR | PG_BINARY; + + if (io_direct_flags & IO_DIRECT_DATA) + flags |= PG_O_DIRECT; + + return flags; +} /* * mdinit() -- Initialize private state for magnetic disk storage manager. @@ -205,14 +215,14 @@ mdcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) path = relpath(reln->smgr_rlocator, forknum); - fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY); + fd = PathNameOpenFile(path, _mdfd_open_flags() | O_CREAT | O_EXCL); if (fd < 0) { int save_errno = errno; if (isRedo) - fd = PathNameOpenFile(path, O_RDWR | PG_BINARY); + fd = PathNameOpenFile(path, _mdfd_open_flags()); if (fd < 0) { /* be sure to report the error reported by create, not open */ @@ -635,7 +645,7 @@ mdopenfork(SMgrRelation reln, ForkNumber forknum, int behavior) path = relpath(reln->smgr_rlocator, forknum); - fd = PathNameOpenFile(path, O_RDWR | PG_BINARY); + fd = PathNameOpenFile(path, _mdfd_open_flags()); if (fd < 0) { @@ -706,6 +716,8 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) off_t seekpos; MdfdVec *v; + Assert((io_direct_flags & IO_DIRECT_DATA) == 0); + v = _mdfd_getseg(reln, forknum, blocknum, false, InRecovery ? EXTENSION_RETURN_NULL : EXTENSION_FAIL); if (v == NULL) @@ -731,6 +743,8 @@ void mdwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, BlockNumber nblocks) { + Assert((io_direct_flags & IO_DIRECT_DATA) == 0); + /* * Issue flush requests in as few requests as possible; have to split at * segment boundaries though, since those are actually separate files. @@ -1335,7 +1349,7 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, fullpath = _mdfd_segpath(reln, forknum, segno); /* open the file */ - fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags); + fd = PathNameOpenFile(fullpath, _mdfd_open_flags() | oflags); pfree(fullpath); @@ -1546,7 +1560,7 @@ mdsyncfiletag(const FileTag *ftag, char *path) strlcpy(path, p, MAXPGPATH); pfree(p); - file = PathNameOpenFile(path, O_RDWR | PG_BINARY); + file = PathNameOpenFile(path, _mdfd_open_flags()); if (file < 0) return -1; need_to_close = true; diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index c37c246b77..70d0d570b1 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -20,6 +20,7 @@ #include "access/xlogutils.h" #include "lib/ilist.h" #include "storage/bufmgr.h" +#include "storage/fd.h" #include "storage/ipc.h" #include "storage/md.h" #include "storage/smgr.h" diff --git a/src/backend/utils/misc/guc_tables.c b/src/backend/utils/misc/guc_tables.c index 97edc61a14..cab3ddbe11 100644 --- a/src/backend/utils/misc/guc_tables.c +++ b/src/backend/utils/misc/guc_tables.c @@ -568,6 +568,7 @@ static char *locale_ctype; static char *server_encoding_string; static char *server_version_string; static int server_version_num; +static char *io_direct_string; #ifdef HAVE_SYSLOG #define DEFAULT_SYSLOG_FACILITY LOG_LOCAL0 @@ -4575,6 +4576,17 @@ struct config_string ConfigureNamesString[] = check_backtrace_functions, assign_backtrace_functions, NULL }, + { + {"io_direct", PGC_POSTMASTER, DEVELOPER_OPTIONS, + gettext_noop("Use direct I/O for file access."), + NULL, + GUC_LIST_INPUT | GUC_NOT_IN_SAMPLE + }, + &io_direct_string, + "", + check_io_direct, assign_io_direct, NULL + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, NULL, NULL, NULL, NULL diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index faac4914fe..6791a406fc 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -44,6 +44,7 @@ #define FD_H #include <dirent.h> +#include <fcntl.h> typedef enum RecoveryInitSyncMethod { @@ -54,10 +55,16 @@ typedef enum RecoveryInitSyncMethod typedef int File; +#define IO_DIRECT_DATA 0x01 +#define IO_DIRECT_WAL 0x02 +#define IO_DIRECT_WAL_INIT 0x04 + + /* GUC parameter */ extern PGDLLIMPORT int max_files_per_process; extern PGDLLIMPORT bool data_sync_retry; extern PGDLLIMPORT int recovery_init_sync_method; +extern PGDLLIMPORT int io_direct_flags; /* * This is private to fd.c, but exported for save/restore_backend_variables() diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index a9a179aaba..17fba6f91a 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -17,6 +17,7 @@ #include "lib/ilist.h" #include "storage/block.h" #include "storage/relfilelocator.h" +#include "utils/guc.h" /* * smgr.c maintains a table of SMgrRelation objects, which are essentially diff --git a/src/include/utils/guc_hooks.h b/src/include/utils/guc_hooks.h index f722fb250a..a82a85c940 100644 --- a/src/include/utils/guc_hooks.h +++ b/src/include/utils/guc_hooks.h @@ -156,5 +156,7 @@ extern bool check_wal_consistency_checking(char **newval, void **extra, GucSource source); extern void assign_wal_consistency_checking(const char *newval, void *extra); extern void assign_xlog_sync_method(int new_sync_method, void *extra); +extern bool check_io_direct(char **newval, void **extra, GucSource source); +extern void assign_io_direct(const char *newval, void *extra); #endif /* GUC_HOOKS_H */ diff --git a/src/test/modules/test_misc/meson.build b/src/test/modules/test_misc/meson.build index 21bde427b4..911084ac0f 100644 --- a/src/test/modules/test_misc/meson.build +++ b/src/test/modules/test_misc/meson.build @@ -9,6 +9,7 @@ tests += { 't/001_constraint_validation.pl', 't/002_tablespace.pl', 't/003_check_guc.pl', + 't/004_io_direct.pl', ], }, } diff --git a/src/test/modules/test_misc/t/004_io_direct.pl b/src/test/modules/test_misc/t/004_io_direct.pl new file mode 100644 index 0000000000..f5bf0b11e4 --- /dev/null +++ b/src/test/modules/test_misc/t/004_io_direct.pl @@ -0,0 +1,57 @@ +# Very simple exercise of direct I/O GUC. + +use strict; +use warnings; +use PostgreSQL::Test::Cluster; +use PostgreSQL::Test::Utils; +use Test::More; + +# Systems that we know to have direct I/O support, and whose typical local +# filesystems support it or at least won't fail with an error. (illumos should +# probably be in this list, but perl reports it as solaris. Solaris should not +# be in the list because we don't support its way of turning on direct I/O, and +# even if we did, its version of ZFS rejects it, and OpenBSD just doesn't have +# it.) +if (!grep { $^O eq $_ } qw(aix darwin dragonfly freebsd linux MSWin32 netbsd)) +{ + plan skip_all => "no direct I/O support"; +} + +my $node = PostgreSQL::Test::Cluster->new('main'); +$node->init; +$node->append_conf( + 'postgresql.conf', qq{ +io_direct = 'data,wal,wal_init' +shared_buffers = '256kB' # tiny to force I/O +}); +$node->start; + +# Do some work that is bound to generate shared and local writes and reads as a +# simple exercise. +$node->safe_psql('postgres', + 'create table t1 as select 1 as i from generate_series(1, 10000)'); +$node->safe_psql('postgres', 'create table t2count (i int)'); +$node->safe_psql( + 'postgres', qq{ +begin; +create temporary table t2 as select 1 as i from generate_series(1, 10000); +update t2 set i = i; +insert into t2count select count(*) from t2; +commit; +}); +$node->safe_psql('postgres', 'update t1 set i = i'); +is( '10000', + $node->safe_psql('postgres', 'select count(*) from t1'), + "read back from shared"); +is( '10000', + $node->safe_psql('postgres', 'select * from t2count'), + "read back from local"); +$node->stop('immediate'); + +$node->start; +is( '10000', + $node->safe_psql('postgres', 'select count(*) from t1'), + "read back from shared after crash recovery"); +$node->stop; + +done_testing(); |