summaryrefslogtreecommitdiff
path: root/src/backend/storage/file/fd.c
diff options
context:
space:
mode:
authorAndres Freund <andres@anarazel.de>2023-04-05 10:06:39 -0700
committerAndres Freund <andres@anarazel.de>2023-04-05 10:06:39 -0700
commit4d330a61bb1969df31f2cebfe1ba9d1d004346d8 (patch)
treebe72b30736467697a10d744672b282afb6d13eb7 /src/backend/storage/file/fd.c
parent4766eef3174b0b1bd8d2428381a7a33813386a6a (diff)
downloadpostgresql-4d330a61bb1969df31f2cebfe1ba9d1d004346d8.tar.gz
Add smgrzeroextend(), FileZero(), FileFallocate()
smgrzeroextend() uses FileFallocate() to efficiently extend files by multiple blocks. When extending by a small number of blocks, use FileZero() instead, as using posix_fallocate() for small numbers of blocks is inefficient for some file systems / operating systems. FileZero() is also used as the fallback for FileFallocate() on platforms / filesystems that don't support fallocate. A big advantage of using posix_fallocate() is that it typically won't cause dirty buffers in the kernel pagecache. So far the most common pattern in our code is that we smgrextend() a page full of zeroes and put the corresponding page into shared buffers, from where we later write out the actual contents of the page. If the kernel, e.g. due to memory pressure or elapsed time, already wrote back the all-zeroes page, this can lead to doubling the amount of writes reaching storage. There are no users of smgrzeroextend() as of this commit. That will follow in future commits. Reviewed-by: Melanie Plageman <melanieplageman@gmail.com> Reviewed-by: Heikki Linnakangas <hlinnaka@iki.fi> Reviewed-by: Kyotaro Horiguchi <horikyota.ntt@gmail.com> Reviewed-by: David Rowley <dgrowleyml@gmail.com> Reviewed-by: John Naylor <john.naylor@enterprisedb.com> Discussion: https://postgr.es/m/20221029025420.eplyow6k7tgu6he3@awork3.anarazel.de
Diffstat (limited to 'src/backend/storage/file/fd.c')
-rw-r--r--src/backend/storage/file/fd.c88
1 files changed, 88 insertions, 0 deletions
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 2ac365e97c..a280a1e7be 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -2206,6 +2206,94 @@ FileSync(File file, uint32 wait_event_info)
return returnCode;
}
+/*
+ * Zero a region of the file.
+ *
+ * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
+ * appropriate error.
+ */
+int
+FileZero(File file, off_t offset, off_t amount, uint32 wait_event_info)
+{
+ int returnCode;
+ ssize_t written;
+
+ Assert(FileIsValid(file));
+
+ DO_DB(elog(LOG, "FileZero: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
+ file, VfdCache[file].fileName,
+ (int64) offset, (int64) amount));
+
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return returnCode;
+
+ pgstat_report_wait_start(wait_event_info);
+ written = pg_pwrite_zeros(VfdCache[file].fd, amount, offset);
+ pgstat_report_wait_end();
+
+ if (written < 0)
+ return -1;
+ else if (written != amount)
+ {
+ /* if errno is unset, assume problem is no disk space */
+ if (errno == 0)
+ errno = ENOSPC;
+ return -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Try to reserve file space with posix_fallocate(). If posix_fallocate() is
+ * not implemented on the operating system or fails with EINVAL / EOPNOTSUPP,
+ * use FileZero() instead.
+ *
+ * Note that at least glibc() implements posix_fallocate() in userspace if not
+ * implemented by the filesystem. That's not the case for all environments
+ * though.
+ *
+ * Returns 0 on success, -1 otherwise. In the latter case errno is set to the
+ * appropriate error.
+ */
+int
+FileFallocate(File file, off_t offset, off_t amount, uint32 wait_event_info)
+{
+#ifdef HAVE_POSIX_FALLOCATE
+ int returnCode;
+
+ Assert(FileIsValid(file));
+
+ DO_DB(elog(LOG, "FileFallocate: %d (%s) " INT64_FORMAT " " INT64_FORMAT,
+ file, VfdCache[file].fileName,
+ (int64) offset, (int64) amount));
+
+ returnCode = FileAccess(file);
+ if (returnCode < 0)
+ return -1;
+
+ pgstat_report_wait_start(wait_event_info);
+ returnCode = posix_fallocate(VfdCache[file].fd, offset, amount);
+ pgstat_report_wait_end();
+
+ if (returnCode == 0)
+ return 0;
+
+ /* for compatibility with %m printing etc */
+ errno = returnCode;
+
+ /*
+ * Return in cases of a "real" failure, if fallocate is not supported,
+ * fall through to the FileZero() backed implementation.
+ */
+ if (returnCode != EINVAL && returnCode != EOPNOTSUPP)
+ return -1;
+#endif
+
+ return FileZero(file, offset, amount, wait_event_info);
+}
+
off_t
FileSize(File file)
{