diff options
Diffstat (limited to 'src/third_party/wiredtiger/src/block/block_write.c')
-rw-r--r-- | src/third_party/wiredtiger/src/block/block_write.c | 203 |
1 files changed, 108 insertions, 95 deletions
diff --git a/src/third_party/wiredtiger/src/block/block_write.c b/src/third_party/wiredtiger/src/block/block_write.c
index 1fefeee09da..032f72d551b 100644
--- a/src/third_party/wiredtiger/src/block/block_write.c
+++ b/src/third_party/wiredtiger/src/block/block_write.c
@@ -15,29 +15,47 @@
 int
 __wt_block_truncate(WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t len)
 {
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+
+	conn = S2C(session);
+
+	__wt_verbose(session,
+	    WT_VERB_BLOCK, "truncate file to %" PRIuMAX, (uintmax_t)len);
+
+	/*
+	 * Truncate requires serialization, we depend on our caller for that.
+	 *
+	 * Truncation isn't a requirement of the block manager, it's only used
+	 * to conserve disk space. Regardless of the underlying file system
+	 * call's result, the in-memory understanding of the file size changes.
+	 */
+	block->size = block->extend_size = len;
+
 	/*
 	 * Backups are done by copying files outside of WiredTiger, potentially
 	 * by system utilities. We cannot truncate the file during the backup
 	 * window, we might surprise an application.
 	 *
-	 * Stop block truncation. This affects files that aren't involved in the
-	 * backup (for example, doing incremental backups, which only copies log
-	 * files, or targeted backups, stops all truncation). We may want a more
-	 * targeted solution at some point.
+	 * This affects files that aren't involved in the backup (for example,
+	 * doing incremental backups, which only copies log files, or targeted
+	 * backups, stops all block truncation unnecessarily). We may want a
+	 * more targeted solution at some point.
 	 */
-	if (S2C(session)->hot_backup)
-		return (EBUSY);
+	if (!conn->hot_backup) {
+		__wt_readlock(session, conn->hot_backup_lock);
+		if (!conn->hot_backup)
+			ret = __wt_ftruncate(session, block->fh, len);
+		__wt_readunlock(session, conn->hot_backup_lock);
+	}
 
 	/*
-	 * Additionally, the truncate might fail if there's a file mapping (if
-	 * there's an open checkpoint on the file), in which case the underlying
-	 * function returns EBUSY.
+	 * The truncate may fail temporarily or permanently (for example, there
+	 * may be a file mapping if there's an open checkpoint on the file on a
+	 * POSIX system, in which case the underlying function returns EBUSY).
+	 * It's OK, we don't have to be able to truncate files.
 	 */
-	WT_RET(__wt_ftruncate(session, block->fh, len));
-
-	block->size = block->extend_size = len;
-
-	return (0);
+	return (ret == EBUSY || ret == ENOTSUP ? 0 : ret);
 }
 
 /*
@@ -82,22 +100,18 @@ __wt_block_extend(WT_SESSION_IMPL *session, WT_BLOCK *block,
 {
 	WT_DECL_RET;
 	WT_FILE_HANDLE *handle;
-	bool locked;
 
 	/*
 	 * The locking in this function is messy: by definition, the live system
 	 * is locked when we're called, but that lock may have been acquired by
 	 * our caller or our caller's caller. If our caller's lock, release_lock
-	 * comes in set, indicating this function can unlock it before returning
-	 * (either before extending the file or afterward, depending on the call
-	 * used). If it is our caller's caller, then release_lock comes in not
-	 * set, indicating it cannot be released here.
+	 * comes in set and this function can unlock it before returning (so it
+	 * isn't held while extending the file). If it is our caller's caller,
+	 * then release_lock comes in not set, indicating it cannot be released
+	 * here.
 	 *
-	 * If we unlock here, we clear release_lock. But if we then find out we
-	 * need a lock after all, we re-acquire the lock and set release_lock so
-	 * our caller knows to release it.
+	 * If we unlock here, we clear release_lock.
 	 */
-	locked = true;
 
 	/* If not configured to extend the file, we're done. */
 	if (block->extend_len == 0)
@@ -122,62 +136,39 @@ __wt_block_extend(WT_SESSION_IMPL *session, WT_BLOCK *block,
 	 * used to extend the file initialize the extended space. If a writing
 	 * thread races with the extending thread, the extending thread might
 	 * overwrite already written data, and that would be very, very bad.
-	 *
-	 * Some variants of the system call to extend the file fail at run-time
-	 * based on the filesystem type, fall back to ftruncate in that case,
-	 * and remember that ftruncate requires locking.
 	 */
 	handle = fh->handle;
-	if (handle->fh_allocate != NULL ||
-	    handle->fh_allocate_nolock != NULL) {
-		/*
-		 * Release any locally acquired lock if not needed to extend the
-		 * file, extending the file may require updating on-disk file's
-		 * metadata, which can be slow. (It may be a bad idea to
-		 * configure for file extension on systems that require locking
-		 * over the extend call.)
-		 */
-		if (handle->fh_allocate_nolock != NULL && *release_lockp) {
-			*release_lockp = locked = false;
-			__wt_spin_unlock(session, &block->live_lock);
-		}
-
-		/*
-		 * Extend the file: there's a race between setting the value of
-		 * extend_size and doing the extension, but it should err on the
-		 * side of extend_size being smaller than the actual file size,
-		 * and that's OK, we simply may do another extension sooner than
-		 * otherwise.
-		 */
-		block->extend_size = block->size + block->extend_len * 2;
-		if ((ret = __wt_fallocate(
-		    session, fh, block->size, block->extend_len * 2)) == 0)
-			return (0);
-		WT_RET_ERROR_OK(ret, ENOTSUP);
-	}
+	if (handle->fh_extend == NULL && handle->fh_extend_nolock == NULL)
+		return (0);
 
 	/*
-	 * We may have a caller lock or a locally acquired lock, but we need a
-	 * lock to call ftruncate.
+	 * Set the extend_size before releasing the lock, I don't want to read
+	 * and manipulate multiple values without holding a lock.
+	 *
+	 * There's a race between the calculation and doing the extension, but
+	 * it should err on the side of extend_size being smaller than the
+	 * actual file size, and that's OK, we simply may do another extension
+	 * sooner than otherwise.
 	 */
-	if (!locked) {
-		__wt_spin_lock(session, &block->live_lock);
-		*release_lockp = true;
-	}
+	block->extend_size = block->size + block->extend_len * 2;
 
 	/*
-	 * The underlying truncate call initializes allocated space, reset the
-	 * extend length after locking so we don't overwrite already-written
-	 * blocks.
+	 * Release any locally acquired lock if not needed to extend the file,
+	 * extending the file may require updating on-disk file's metadata,
+	 * which can be slow. (It may be a bad idea to configure for file
+	 * extension on systems that require locking over the extend call.)
 	 */
-	block->extend_size = block->size + block->extend_len * 2;
+	if (handle->fh_extend_nolock != NULL && *release_lockp) {
+		*release_lockp = false;
+		__wt_spin_unlock(session, &block->live_lock);
+	}
 
 	/*
-	 * The truncate might fail if there's a mapped file (in other words, if
-	 * there's an open checkpoint on the file), that's OK.
+	 * The extend might fail (for example, the file is mapped into memory),
+	 * or discover file extension isn't supported; both are OK.
 	 */
-	WT_RET_BUSY_OK(__wt_ftruncate(session, fh, block->extend_size));
-	return (0);
+	ret = __wt_fextend(session, fh, block->extend_size);
+	return (ret == EBUSY || ret == ENOTSUP ? 0 : ret);
 }
 
 /*
@@ -210,39 +201,39 @@ __wt_block_write_size(WT_SESSION_IMPL *session, WT_BLOCK *block, size_t *sizep)
  *	Write a buffer into a block, returning the block's address cookie.
  */
 int
-__wt_block_write(WT_SESSION_IMPL *session, WT_BLOCK *block,
-    WT_ITEM *buf, uint8_t *addr, size_t *addr_sizep, bool data_cksum)
+__wt_block_write(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf,
+    uint8_t *addr, size_t *addr_sizep, bool data_checksum, bool checkpoint_io)
 {
 	wt_off_t offset;
-	uint32_t size, cksum;
+	uint32_t checksum, size;
 	uint8_t *endp;
 
-	WT_RET(__wt_block_write_off(
-	    session, block, buf, &offset, &size, &cksum, data_cksum, false));
+	WT_RET(__wt_block_write_off(session, block, buf,
+	    &offset, &size, &checksum, data_checksum, checkpoint_io, false));
 
 	endp = addr;
-	WT_RET(__wt_block_addr_to_buffer(block, &endp, offset, size, cksum));
+	WT_RET(__wt_block_addr_to_buffer(block, &endp, offset, size, checksum));
 	*addr_sizep = WT_PTRDIFF(endp, addr);
 
 	return (0);
 }
 
 /*
- * __wt_block_write_off --
+ * __block_write_off --
  *	Write a buffer into a block, returning the block's offset, size and
  *	checksum.
  */
-int
-__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
-    WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump,
-    bool data_cksum, bool caller_locked)
+static int
+__block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
+    WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *checksump,
+    bool data_checksum, bool checkpoint_io, bool caller_locked)
 {
 	WT_BLOCK_HEADER *blk;
 	WT_DECL_RET;
 	WT_FH *fh;
 	size_t align_size;
 	wt_off_t offset;
-	uint32_t cksum;
+	uint32_t checksum;
 	bool local_locked;
 
 	fh = block->fh;
@@ -254,12 +245,6 @@ __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
 	blk = WT_BLOCK_HEADER_REF(buf->mem);
 	memset(blk, 0, sizeof(*blk));
 
-	/*
-	 * Swap the page-header as needed; this doesn't belong here, but it's
-	 * the best place to catch all callers.
-	 */
-	__wt_page_header_byteswap(buf->mem);
-
 	/* Buffers should be aligned for writing. */
 	if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
 		WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
@@ -313,14 +298,14 @@ __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
 	 * big-endian format, swap it into place in a separate step.
 	 */
 	blk->flags = 0;
-	if (data_cksum)
+	if (data_checksum)
 		F_SET(blk, WT_BLOCK_DATA_CKSUM);
-	blk->cksum = 0;
+	blk->checksum = 0;
 	__wt_block_header_byteswap(blk);
-	blk->cksum = cksum = __wt_cksum(
-	    buf->mem, data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP);
+	blk->checksum = checksum = __wt_checksum(
+	    buf->mem, data_checksum ? align_size : WT_BLOCK_COMPRESS_SKIP);
 #ifdef WORDS_BIGENDIAN
-	blk->cksum = __wt_bswap32(blk->cksum);
+	blk->checksum = __wt_bswap32(blk->checksum);
 #endif
 
 	/* Pre-allocate some number of extension structures. */
@@ -380,14 +365,42 @@ __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
 
 	WT_STAT_FAST_CONN_INCR(session, block_write);
 	WT_STAT_FAST_CONN_INCRV(session, block_byte_write, align_size);
+	if (checkpoint_io)
+		WT_STAT_FAST_CONN_INCRV(
+		    session, block_byte_write_checkpoint, align_size);
 
-	WT_RET(__wt_verbose(session, WT_VERB_WRITE,
-	    "off %" PRIuMAX ", size %" PRIuMAX ", cksum %" PRIu32,
-	    (uintmax_t)offset, (uintmax_t)align_size, cksum));
+	__wt_verbose(session, WT_VERB_WRITE,
+	    "off %" PRIuMAX ", size %" PRIuMAX ", checksum %" PRIu32,
+	    (uintmax_t)offset, (uintmax_t)align_size, checksum);
 
 	*offsetp = offset;
 	*sizep = WT_STORE_SIZE(align_size);
-	*cksump = cksum;
+	*checksump = checksum;
 
 	return (0);
 }
+
+/*
+ * __wt_block_write_off --
+ *	Write a buffer into a block, returning the block's offset, size and
+ *	checksum.
+ */
+int
+__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
+    WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *checksump,
+    bool data_checksum, bool checkpoint_io, bool caller_locked)
+{
+	WT_DECL_RET;
+
+	/*
+	 * Ensure the page header is in little endian order; this doesn't belong
+	 * here, but it's the best place to catch all callers. After the write,
+	 * swap values back to native order so callers never see anything other
+	 * than their original content.
+	 */
+	__wt_page_header_byteswap(buf->mem);
+	ret = __block_write_off(session, block, buf, offsetp,
+	    sizep, checksump, data_checksum, checkpoint_io, caller_locked);
+	__wt_page_header_byteswap(buf->mem);
+	return (ret);
+}