Diffstat (limited to 'src/third_party/wiredtiger/src/block/block_write.c')
-rw-r--r--  src/third_party/wiredtiger/src/block/block_write.c  203
1 file changed, 108 insertions(+), 95 deletions(-)
diff --git a/src/third_party/wiredtiger/src/block/block_write.c b/src/third_party/wiredtiger/src/block/block_write.c
index 1fefeee09da..032f72d551b 100644
--- a/src/third_party/wiredtiger/src/block/block_write.c
+++ b/src/third_party/wiredtiger/src/block/block_write.c
@@ -15,29 +15,47 @@
int
__wt_block_truncate(WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t len)
{
+ WT_CONNECTION_IMPL *conn;
+ WT_DECL_RET;
+
+ conn = S2C(session);
+
+ __wt_verbose(session,
+ WT_VERB_BLOCK, "truncate file to %" PRIuMAX, (uintmax_t)len);
+
+ /*
+ * Truncate requires serialization, we depend on our caller for that.
+ *
+ * Truncation isn't a requirement of the block manager, it's only used
+ * to conserve disk space. Regardless of the underlying file system
+ * call's result, the in-memory understanding of the file size changes.
+ */
+ block->size = block->extend_size = len;
+
/*
* Backups are done by copying files outside of WiredTiger, potentially
* by system utilities. We cannot truncate the file during the backup
* window, we might surprise an application.
*
- * Stop block truncation. This affects files that aren't involved in the
- * backup (for example, doing incremental backups, which only copies log
- * files, or targeted backups, stops all truncation). We may want a more
- * targeted solution at some point.
+ * This affects files that aren't involved in the backup (for example,
+ * an incremental backup that only copies log files, or a targeted
+ * backup, still stops all block truncation unnecessarily). We may want
+ * a more targeted solution at some point.
*/
- if (S2C(session)->hot_backup)
- return (EBUSY);
+ if (!conn->hot_backup) {
+ __wt_readlock(session, conn->hot_backup_lock);
+ if (!conn->hot_backup)
+ ret = __wt_ftruncate(session, block->fh, len);
+ __wt_readunlock(session, conn->hot_backup_lock);
+ }
/*
- * Additionally, the truncate might fail if there's a file mapping (if
- * there's an open checkpoint on the file), in which case the underlying
- * function returns EBUSY.
+ * The truncate may fail temporarily or permanently (for example, there
+ * may be a file mapping if there's an open checkpoint on the file on a
+ * POSIX system, in which case the underlying function returns EBUSY).
+ * It's OK, we don't have to be able to truncate files.
*/
- WT_RET(__wt_ftruncate(session, block->fh, len));
-
- block->size = block->extend_size = len;
-
- return (0);
+ return (ret == EBUSY || ret == ENOTSUP ? 0 : ret);
}
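
The rewritten truncate path relies on a double-checked read lock around the hot-backup flag, plus a tolerant return value. A minimal standalone sketch of the same pattern, using POSIX primitives (pthread_rwlock_t, ftruncate) and hypothetical names instead of WiredTiger's internal wrappers:

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <sys/types.h>
#include <unistd.h>

/* Hypothetical stand-in for the connection's backup state. */
struct backup_state {
    bool hot_backup;          /* set while a hot backup is running */
    pthread_rwlock_t lock;    /* backups hold this exclusively */
};

/*
 * Best-effort truncate: skip the system call entirely while a backup is
 * running, and tolerate EBUSY/ENOTSUP from file systems that refuse it.
 */
static int
truncate_unless_backup(struct backup_state *bs, int fd, off_t len)
{
    int ret = 0;

    if (!bs->hot_backup) {                 /* cheap unlocked check */
        pthread_rwlock_rdlock(&bs->lock);
        if (!bs->hot_backup &&             /* re-check under the lock */
            ftruncate(fd, len) != 0)
            ret = errno;
        pthread_rwlock_unlock(&bs->lock);
    }
    return (ret == EBUSY || ret == ENOTSUP ? 0 : ret);
}

The unlocked check keeps the common no-backup path cheap; the re-check under the shared lock closes the race with a backup that flips the flag while holding the lock exclusively.
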
/*
@@ -82,22 +100,18 @@ __wt_block_extend(WT_SESSION_IMPL *session, WT_BLOCK *block,
{
WT_DECL_RET;
WT_FILE_HANDLE *handle;
- bool locked;
/*
* The locking in this function is messy: by definition, the live system
* is locked when we're called, but that lock may have been acquired by
* our caller or our caller's caller. If our caller's lock, release_lock
- * comes in set, indicating this function can unlock it before returning
- * (either before extending the file or afterward, depending on the call
- * used). If it is our caller's caller, then release_lock comes in not
- * set, indicating it cannot be released here.
+ * comes in set and this function can unlock it before returning (so it
+ * isn't held while extending the file). If it is our caller's caller,
+ * then release_lock comes in not set, indicating it cannot be released
+ * here.
*
- * If we unlock here, we clear release_lock. But if we then find out we
- * need a lock after all, we re-acquire the lock and set release_lock so
- * our caller knows to release it.
+ * If we unlock here, we clear release_lock.
*/
- locked = true;
/* If not configured to extend the file, we're done. */
if (block->extend_len == 0)
@@ -122,62 +136,39 @@ __wt_block_extend(WT_SESSION_IMPL *session, WT_BLOCK *block,
* used to extend the file initialize the extended space. If a writing
* thread races with the extending thread, the extending thread might
* overwrite already written data, and that would be very, very bad.
- *
- * Some variants of the system call to extend the file fail at run-time
- * based on the filesystem type, fall back to ftruncate in that case,
- * and remember that ftruncate requires locking.
*/
handle = fh->handle;
- if (handle->fh_allocate != NULL ||
- handle->fh_allocate_nolock != NULL) {
- /*
- * Release any locally acquired lock if not needed to extend the
- * file, extending the file may require updating on-disk file's
- * metadata, which can be slow. (It may be a bad idea to
- * configure for file extension on systems that require locking
- * over the extend call.)
- */
- if (handle->fh_allocate_nolock != NULL && *release_lockp) {
- *release_lockp = locked = false;
- __wt_spin_unlock(session, &block->live_lock);
- }
-
- /*
- * Extend the file: there's a race between setting the value of
- * extend_size and doing the extension, but it should err on the
- * side of extend_size being smaller than the actual file size,
- * and that's OK, we simply may do another extension sooner than
- * otherwise.
- */
- block->extend_size = block->size + block->extend_len * 2;
- if ((ret = __wt_fallocate(
- session, fh, block->size, block->extend_len * 2)) == 0)
- return (0);
- WT_RET_ERROR_OK(ret, ENOTSUP);
- }
+ if (handle->fh_extend == NULL && handle->fh_extend_nolock == NULL)
+ return (0);
/*
- * We may have a caller lock or a locally acquired lock, but we need a
- * lock to call ftruncate.
+ * Set the extend_size before releasing the lock; we don't want to read
+ * and manipulate multiple values without holding a lock.
+ *
+ * There's a race between the calculation and doing the extension, but
+ * it should err on the side of extend_size being smaller than the
+ * actual file size, and that's OK, we simply may do another extension
+ * sooner than otherwise.
*/
- if (!locked) {
- __wt_spin_lock(session, &block->live_lock);
- *release_lockp = true;
- }
+ block->extend_size = block->size + block->extend_len * 2;
/*
- * The underlying truncate call initializes allocated space, reset the
- * extend length after locking so we don't overwrite already-written
- * blocks.
+ * Release any locally acquired lock if it's not needed to extend the
+ * file: extending the file may require updating the on-disk file's
+ * metadata, which can be slow. (It may be a bad idea to configure for
+ * file extension on systems that require locking over the extend call.)
*/
- block->extend_size = block->size + block->extend_len * 2;
+ if (handle->fh_extend_nolock != NULL && *release_lockp) {
+ *release_lockp = false;
+ __wt_spin_unlock(session, &block->live_lock);
+ }
/*
- * The truncate might fail if there's a mapped file (in other words, if
- * there's an open checkpoint on the file), that's OK.
+ * The extend might fail (for example, the file is mapped into memory),
+ * or discover file extension isn't supported; both are OK.
*/
- WT_RET_BUSY_OK(__wt_ftruncate(session, fh, block->extend_size));
- return (0);
+ ret = __wt_fextend(session, fh, block->extend_size);
+ return (ret == EBUSY || ret == ENOTSUP ? 0 : ret);
}
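
For comparison, the extend step reduces to two moves: advance the cached extend target, then ask the OS to pre-allocate, tolerating file systems that refuse. A sketch under those assumptions, with posix_fallocate and a hypothetical struct standing in for __wt_fextend and WT_BLOCK:

#include <errno.h>
#include <fcntl.h>        /* posix_fallocate */
#include <sys/types.h>

/* Hypothetical subset of the block's bookkeeping. */
struct blk {
    off_t size;           /* bytes allocated so far */
    off_t extend_size;    /* size at which the file is next extended */
    off_t extend_len;     /* configured extension quantum */
};

static int
extend_file(struct blk *b, int fd)
{
    int ret;

    /*
     * Advance the cached target before the (possibly slow) system call;
     * if the calculation races with a concurrent write it errs on the
     * small side, and we simply extend again a little sooner.
     */
    b->extend_size = b->size + b->extend_len * 2;

    /* posix_fallocate returns an error number directly, not via errno. */
    ret = posix_fallocate(fd, b->size, b->extend_len * 2);
    return (ret == EBUSY || ret == ENOTSUP ? 0 : ret);
}
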
/*
@@ -210,39 +201,39 @@ __wt_block_write_size(WT_SESSION_IMPL *session, WT_BLOCK *block, size_t *sizep)
* Write a buffer into a block, returning the block's address cookie.
*/
int
-__wt_block_write(WT_SESSION_IMPL *session, WT_BLOCK *block,
- WT_ITEM *buf, uint8_t *addr, size_t *addr_sizep, bool data_cksum)
+__wt_block_write(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf,
+ uint8_t *addr, size_t *addr_sizep, bool data_checksum, bool checkpoint_io)
{
wt_off_t offset;
- uint32_t size, cksum;
+ uint32_t checksum, size;
uint8_t *endp;
- WT_RET(__wt_block_write_off(
- session, block, buf, &offset, &size, &cksum, data_cksum, false));
+ WT_RET(__wt_block_write_off(session, block, buf,
+ &offset, &size, &checksum, data_checksum, checkpoint_io, false));
endp = addr;
- WT_RET(__wt_block_addr_to_buffer(block, &endp, offset, size, cksum));
+ WT_RET(__wt_block_addr_to_buffer(block, &endp, offset, size, checksum));
*addr_sizep = WT_PTRDIFF(endp, addr);
return (0);
}
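
The address cookie returned through addr/addr_sizep is an opaque encoding of the (offset, size, checksum) triple. The real __wt_block_addr_to_buffer packs the values more compactly; the fixed-width sketch below, with a hypothetical addr_to_cookie(), only illustrates the endp bookkeeping seen above:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Illustration only: pack an (offset, size, checksum) triple into a
 * caller-supplied buffer and report how many bytes were consumed,
 * mirroring the endp arithmetic in __wt_block_write.
 */
static size_t
addr_to_cookie(
    uint8_t *addr, uint64_t offset, uint32_t size, uint32_t checksum)
{
    uint8_t *endp = addr;

    memcpy(endp, &offset, sizeof(offset));
    endp += sizeof(offset);
    memcpy(endp, &size, sizeof(size));
    endp += sizeof(size);
    memcpy(endp, &checksum, sizeof(checksum));
    endp += sizeof(checksum);

    return ((size_t)(endp - addr));    /* i.e., WT_PTRDIFF(endp, addr) */
}
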
/*
- * __wt_block_write_off --
+ * __block_write_off --
* Write a buffer into a block, returning the block's offset, size and
* checksum.
*/
-int
-__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
- WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump,
- bool data_cksum, bool caller_locked)
+static int
+__block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
+ WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *checksump,
+ bool data_checksum, bool checkpoint_io, bool caller_locked)
{
WT_BLOCK_HEADER *blk;
WT_DECL_RET;
WT_FH *fh;
size_t align_size;
wt_off_t offset;
- uint32_t cksum;
+ uint32_t checksum;
bool local_locked;
fh = block->fh;
@@ -254,12 +245,6 @@ __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
blk = WT_BLOCK_HEADER_REF(buf->mem);
memset(blk, 0, sizeof(*blk));
- /*
- * Swap the page-header as needed; this doesn't belong here, but it's
- * the best place to catch all callers.
- */
- __wt_page_header_byteswap(buf->mem);
-
/* Buffers should be aligned for writing. */
if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
@@ -313,14 +298,14 @@ __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
* big-endian format, swap it into place in a separate step.
*/
blk->flags = 0;
- if (data_cksum)
+ if (data_checksum)
F_SET(blk, WT_BLOCK_DATA_CKSUM);
- blk->cksum = 0;
+ blk->checksum = 0;
__wt_block_header_byteswap(blk);
- blk->cksum = cksum = __wt_cksum(
- buf->mem, data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP);
+ blk->checksum = checksum = __wt_checksum(
+ buf->mem, data_checksum ? align_size : WT_BLOCK_COMPRESS_SKIP);
#ifdef WORDS_BIGENDIAN
- blk->cksum = __wt_bswap32(blk->cksum);
+ blk->checksum = __wt_bswap32(blk->checksum);
#endif
/* Pre-allocate some number of extension structures. */
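
The checksum block follows a fixed recipe: zero the stored checksum so it can't feed into its own computation, checksum either the whole aligned buffer or only the uncompressed header prefix, and keep the on-disk value little-endian. A self-contained sketch, with a toy checksum32() standing in for __wt_checksum (a CRC32C in WiredTiger) and a hypothetical 64-byte prefix standing in for WT_BLOCK_COMPRESS_SKIP:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define HDR_PREFIX 64    /* stand-in for WT_BLOCK_COMPRESS_SKIP */

/* Toy placeholder; the real __wt_checksum is a hardware-assisted CRC32C. */
static uint32_t
checksum32(const void *p, size_t len)
{
    const uint8_t *b = p;
    uint32_t sum = 0;

    while (len-- > 0)
        sum = sum * 31 + *b++;
    return (sum);
}

/* Hypothetical block header holding only the field the sketch needs. */
struct blk_header {
    uint32_t checksum;
};

static void
set_block_checksum(void *mem, size_t align_size, bool data_checksum)
{
    /* The header lives inside the buffer; at its start here for simplicity. */
    struct blk_header *blk = mem;

    /* Zero the field so the stored value doesn't checksum itself. */
    blk->checksum = 0;
    blk->checksum =
        checksum32(mem, data_checksum ? align_size : HDR_PREFIX);
#ifdef WORDS_BIGENDIAN
    /* Keep the on-disk value little-endian regardless of host order. */
    blk->checksum = __builtin_bswap32(blk->checksum);
#endif
}
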
@@ -380,14 +365,42 @@ __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
WT_STAT_FAST_CONN_INCR(session, block_write);
WT_STAT_FAST_CONN_INCRV(session, block_byte_write, align_size);
+ if (checkpoint_io)
+ WT_STAT_FAST_CONN_INCRV(
+ session, block_byte_write_checkpoint, align_size);
- WT_RET(__wt_verbose(session, WT_VERB_WRITE,
- "off %" PRIuMAX ", size %" PRIuMAX ", cksum %" PRIu32,
- (uintmax_t)offset, (uintmax_t)align_size, cksum));
+ __wt_verbose(session, WT_VERB_WRITE,
+ "off %" PRIuMAX ", size %" PRIuMAX ", checksum %" PRIu32,
+ (uintmax_t)offset, (uintmax_t)align_size, checksum);
*offsetp = offset;
*sizep = WT_STORE_SIZE(align_size);
- *cksump = cksum;
+ *checksump = checksum;
return (0);
}
+
+/*
+ * __wt_block_write_off --
+ * Write a buffer into a block, returning the block's offset, size and
+ * checksum.
+ */
+int
+__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
+ WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *checksump,
+ bool data_checksum, bool checkpoint_io, bool caller_locked)
+{
+ WT_DECL_RET;
+
+ /*
+ * Ensure the page header is in little endian order; this doesn't belong
+ * here, but it's the best place to catch all callers. After the write,
+ * swap values back to native order so callers never see anything other
+ * than their original content.
+ */
+ __wt_page_header_byteswap(buf->mem);
+ ret = __block_write_off(session, block, buf, offsetp,
+ sizep, checksump, data_checksum, checkpoint_io, caller_locked);
+ __wt_page_header_byteswap(buf->mem);
+ return (ret);
+}
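
The new __wt_block_write_off wrapper isolates byte-order handling: swap the page header into on-disk order, do the write, then swap back unconditionally so callers only ever see native-order fields, whether or not the write succeeded. The same shape, reduced to a toy header with hypothetical fields (the swap is a no-op on little-endian hosts):

#include <stdint.h>

/* Hypothetical page header; the real WT_PAGE_HEADER has more fields. */
struct page_header {
    uint64_t recno;
    uint32_t mem_size;
    uint32_t entries;
};

static void
page_header_byteswap(struct page_header *h)
{
#ifdef WORDS_BIGENDIAN
    h->recno = __builtin_bswap64(h->recno);
    h->mem_size = __builtin_bswap32(h->mem_size);
    h->entries = __builtin_bswap32(h->entries);
#else
    (void)h;    /* no-op on little-endian hosts */
#endif
}

static int
write_page(struct page_header *h, int (*do_write)(struct page_header *))
{
    int ret;

    page_header_byteswap(h);    /* native -> on-disk order */
    ret = do_write(h);
    page_header_byteswap(h);    /* back to native order, success or failure */
    return (ret);
}
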