diff options
author | Andres Freund <andres@anarazel.de> | 2023-04-05 10:06:39 -0700 |
---|---|---|
committer | Andres Freund <andres@anarazel.de> | 2023-04-05 10:06:39 -0700 |
commit | 4d330a61bb1969df31f2cebfe1ba9d1d004346d8 (patch) | |
tree | be72b30736467697a10d744672b282afb6d13eb7 /src/backend/storage/file/fd.c | |
parent | 4766eef3174b0b1bd8d2428381a7a33813386a6a (diff) | |
download | postgresql-4d330a61bb1969df31f2cebfe1ba9d1d004346d8.tar.gz |
Add smgrzeroextend(), FileZero(), FileFallocate()
smgrzeroextend() uses FileFallocate() to efficiently extend files by multiple
blocks. When extending by a small number of blocks, use FileZero() instead, as
using posix_fallocate() for small numbers of blocks is inefficient for some
file systems / operating systems. FileZero() is also used as the fallback for
FileFallocate() on platforms / filesystems that don't support fallocate.
A big advantage of using posix_fallocate() is that it typically won't cause
dirty buffers in the kernel pagecache. So far the most common pattern in our
code is that we smgrextend() a page full of zeroes and put the corresponding
page into shared buffers, from where we later write out the actual contents of
the page. If the kernel, e.g. due to memory pressure or elapsed time, already
wrote back the all-zeroes page, this can lead to doubling the amount of writes
reaching storage.
There are no users of smgrzeroextend() as of this commit. That will follow in
future commits.
Reviewed-by: Melanie Plageman <melanieplageman@gmail.com>
Reviewed-by: Heikki Linnakangas <hlinnaka@iki.fi>
Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com>
Reviewed-by: David Rowley <dgrowleyml@gmail.com>
Reviewed-by: John Naylor <john.naylor@enterprisedb.com>
Discussion: https://postgr.es/m/20221029025420.eplyow6k7tgu6he3@awork3.anarazel.de
Diffstat (limited to 'src/backend/storage/file/fd.c')
-rw-r--r-- | src/backend/storage/file/fd.c | 88 |
1 files changed, 88 insertions, 0 deletions
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 2ac365e97c..a280a1e7be 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -2206,6 +2206,94 @@ FileSync(File file, uint32 wait_event_info) return returnCode; } +/* + * Zero a region of the file. + * + * Returns 0 on success, -1 otherwise. In the latter case errno is set to the + * appropriate error. + */ +int +FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info) +{ + int returnCode; + ssize_t written; + + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT, + file, VfdCache[file].fileName, + (int64) offset, (int64) amount)); + + returnCode = FileAccess(file); + if (returnCode < 0) + return returnCode; + + pgstat_report_wait_start(wait_event_info); + written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset); + pgstat_report_wait_end(); + + if (written < 0) + return -1; + else if (written != amount) + { + /* if errno is unset, assume problem is no disk space */ + if (errno == 0) + errno = ENOSPC; + return -1; + } + + return 0; +} + +/* + * Try to reserve file space with posix_fallocate(). If posix_fallocate() is + * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP, + * use FileZero() instead. + * + * Note that at least glibc() implements posix_fallocate() in userspace if not + * implemented by the filesystem. That's not the case for all environments + * though. + * + * Returns 0 on success, -1 otherwise. In the latter case errno is set to the + * appropriate error. + */ +int +FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info) +{ +#ifdef HAVE_POSIX_FALLOCATE + int returnCode; + + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT, + file, VfdCache[file].fileName, + (int64) offset, (int64) amount)); + + returnCode = FileAccess(file); + if (returnCode < 0) + return -1; + + pgstat_report_wait_start(wait_event_info); + returnCode = posix_fallocate(VfdCache[file].fd, offset, amount); + pgstat_report_wait_end(); + + if (returnCode == 0) + return 0; + + /* for compatibility with %m printing etc */ + errno = returnCode; + + /* + * Return in cases of a "real" failure, if fallocate is not supported, + * fall through to the FileZero() backed implementation. + */ + if (returnCode != EINVAL && returnCode != EOPNOTSUPP) + return -1; +#endif + + return FileZero(file, offset, amount, wait_event_info); +} + off_t FileSize(File file) { |