/*-
 * Copyright (c) 2014-present MongoDB, Inc.
 * Copyright (c) 2008-2014 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __wt_block_truncate --
 *     Truncate the file.
 */
int
__wt_block_truncate(WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t len)
{
    WT_CONNECTION_IMPL *conn;
    WT_DECL_RET;

    conn = S2C(session);

    __wt_verbose(session, WT_VERB_BLOCK, "truncate file to %" PRIuMAX, (uintmax_t)len);

    /*
     * Truncate requires serialization; we depend on our caller for that.
     *
     * Truncation isn't a requirement of the block manager, it's only used to conserve disk space.
     * Regardless of the underlying file system call's result, the in-memory understanding of the
     * file size changes.
     */
    block->size = block->extend_size = len;

    /*
     * Backups are done by copying files outside of WiredTiger, potentially by system utilities. We
     * cannot truncate the file during the backup window, we might surprise an application.
     *
     * This affects files that aren't involved in the backup (for example, doing incremental
     * backups, which only copy log files, or targeted backups, stops all block truncation
     * unnecessarily). We may want a more targeted solution at some point.
     */
    if (conn->hot_backup_start == 0)
        WT_WITH_HOTBACKUP_READ_LOCK(session, ret = __wt_ftruncate(session, block->fh, len), NULL);

    /*
     * The truncate may fail temporarily or permanently (for example, there may be a file mapping
     * if there's an open checkpoint on the file on a POSIX system, in which case the underlying
     * function returns EBUSY). It's OK, we don't have to be able to truncate files.
     */
    return (ret == EBUSY || ret == ENOTSUP ? 0 : ret);
}

/*
 * __wt_block_discard --
 *     Discard blocks from the system buffer cache.
 */
int
__wt_block_discard(WT_SESSION_IMPL *session, WT_BLOCK *block, size_t added_size)
{
    WT_DECL_RET;
    WT_FILE_HANDLE *handle;

    /* The file may not support this call. */
    handle = block->fh->handle;
    if (handle->fh_advise == NULL)
        return (0);

    /* The call may not be configured. */
    if (block->os_cache_max == 0)
        return (0);

    /*
     * We're racing on the addition, but we're not willing to serialize on it in the standard read
     * path without evidence it's needed.
     */
    if ((block->os_cache += added_size) <= block->os_cache_max)
        return (0);

    block->os_cache = 0;
    ret = handle->fh_advise(
      handle, (WT_SESSION *)session, (wt_off_t)0, (wt_off_t)0, WT_FILE_HANDLE_DONTNEED);
    return (ret == EBUSY || ret == ENOTSUP ? 0 : ret);
}
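
/*
 * An illustrative sketch, excluded from the build with #if 0: on POSIX systems, a file handle's
 * fh_advise method typically maps to posix_fadvise(2), with WT_FILE_HANDLE_DONTNEED translating
 * to POSIX_FADV_DONTNEED and a zero offset/length pair meaning "the whole file". The helper name
 * and the raw file descriptor are assumptions for illustration, not WiredTiger's actual OS layer.
 */
#if 0
#include <fcntl.h>

static int
example_advise_dontneed(int fd)
{
    /* posix_fadvise returns the error directly rather than setting errno. */
    return (posix_fadvise(fd, (off_t)0, (off_t)0, POSIX_FADV_DONTNEED));
}
#endif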
/*
 * __wt_block_extend --
 *     Extend the file.
 */
static inline int
__wt_block_extend(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_FH *fh, wt_off_t offset,
  size_t align_size, bool *release_lockp)
{
    WT_DECL_RET;
    WT_FILE_HANDLE *handle;

    /*
     * The locking in this function is messy: by definition, the live system is locked when we're
     * called, but that lock may have been acquired by our caller or our caller's caller. If the
     * lock is our caller's, release_lock comes in set and this function can unlock it before
     * returning (so it isn't held while extending the file). If the lock belongs to our caller's
     * caller, release_lock comes in not set, indicating it cannot be released here.
     *
     * If we unlock here, we clear release_lock.
     */

    /* If not configured to extend the file, we're done. */
    if (block->extend_len == 0)
        return (0);

    /*
     * Extend the file in chunks. We want to limit the number of threads extending the file at the
     * same time, so choose the one thread that's crossing the extended boundary. We don't extend
     * newly created files, and it's theoretically possible we might wait so long our extension of
     * the file is passed by another thread writing single blocks; that's why there's a check in
     * case the extended file size becomes too small: if the file size catches up, every thread
     * tries to extend it.
     */
    if (block->extend_size > block->size &&
      (offset > block->extend_size ||
        offset + block->extend_len + (wt_off_t)align_size < block->extend_size))
        return (0);

    /*
     * File extension may require locking: some variants of the system call used to extend the file
     * initialize the extended space. If a writing thread races with the extending thread, the
     * extending thread might overwrite already written data, and that would be very, very bad.
     */
    handle = fh->handle;
    if (handle->fh_extend == NULL && handle->fh_extend_nolock == NULL)
        return (0);

    /*
     * Set the extend_size before releasing the lock; we don't want to read and manipulate multiple
     * values without holding a lock.
     *
     * There's a race between the calculation and doing the extension, but it should err on the
     * side of extend_size being smaller than the actual file size, and that's OK, we simply may do
     * another extension sooner than otherwise.
     */
    block->extend_size = block->size + block->extend_len * 2;

    /*
     * Release any locally acquired lock if it's not needed to extend the file; extending the file
     * may require updating the on-disk file's metadata, which can be slow. (It may be a bad idea
     * to configure for file extension on systems that require locking over the extend call.)
     */
    if (handle->fh_extend_nolock != NULL && *release_lockp) {
        *release_lockp = false;
        __wt_spin_unlock(session, &block->live_lock);
    }

    /*
     * The extend might fail (for example, the file is mapped into memory or a backup is in
     * progress), or discover file extension isn't supported; both are OK.
     */
    if (S2C(session)->hot_backup_start == 0)
        WT_WITH_HOTBACKUP_READ_LOCK(
          session, ret = __wt_fextend(session, fh, block->extend_size), NULL);
    return (ret == EBUSY || ret == ENOTSUP ? 0 : ret);
}

/*
 * __wt_block_write_size --
 *     Return the buffer size required to write a block.
 */
int
__wt_block_write_size(WT_SESSION_IMPL *session, WT_BLOCK *block, size_t *sizep)
{
    WT_UNUSED(session);

    /*
     * We write the page size, in bytes, into the block's header as a 4B unsigned value, and it's
     * possible for the engine to accept an item we can't write. For example, a huge key/value
     * where the allocation size has been set to something large will overflow 4B when it tries to
     * align the write. We could make this work (for example, writing the page size in units of
     * allocation size or something else), but it's not worth the effort; writing 4GB objects into
     * a btree makes no sense. Limit the writes to (4GB - 1KB), it gives us potential mode bits,
     * and we're not interested in debugging corner cases anyway.
     */
    *sizep = (size_t)WT_ALIGN(*sizep + WT_BLOCK_HEADER_BYTE_SIZE, block->allocsize);
    return (*sizep > UINT32_MAX - 1024 ? EINVAL : 0);
}
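
/*
 * A worked example of the sizing rule above, as a sketch excluded from the build with #if 0: with
 * a 4KB allocation size, a 10,000-byte payload plus the block header rounds up to the next 4KB
 * boundary. The example function is hypothetical, and the 12,288-byte result assumes the combined
 * page and block header fits in the 2,288 bytes of slack (it's a few dozen bytes in practice).
 */
#if 0
#include <assert.h>

static void
example_write_size(void)
{
    size_t allocsize, size;

    allocsize = 4096;
    size = 10000;

    /* Mirror __wt_block_write_size: add the header, then round up to the allocation size. */
    size = (size_t)WT_ALIGN(size + WT_BLOCK_HEADER_BYTE_SIZE, allocsize);

    /* 10,000 bytes plus the header occupies three 4KB units. */
    assert(size == 12288);
}
#endif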
/*
 * __wt_block_write --
 *     Write a buffer into a block, returning the block's address cookie.
 */
int
__wt_block_write(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, uint8_t *addr,
  size_t *addr_sizep, bool data_checksum, bool checkpoint_io)
{
    wt_off_t offset;
    uint32_t checksum, objectid, size;
    uint8_t *endp;

    WT_RET(__wt_block_write_off(session, block, buf, &objectid, &offset, &size, &checksum,
      data_checksum, checkpoint_io, false));

    endp = addr;
    WT_RET(__wt_block_addr_pack(block, &endp, objectid, offset, size, checksum));
    *addr_sizep = WT_PTRDIFF(endp, addr);

    return (0);
}
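
/*
 * An illustrative caller, excluded from the build with #if 0: size the buffer with
 * __wt_block_write_size, fill it leaving room for the block header, and let __wt_block_write
 * return the address cookie naming the block. The helper is hypothetical (real callers live in
 * the btree layer), and it assumes the scratch buffer satisfies the aligned-buffer check in the
 * write path below.
 */
#if 0
static int
example_block_write(
  WT_SESSION_IMPL *session, WT_BLOCK *block, const void *payload, size_t payload_size)
{
    WT_DECL_ITEM(tmp);
    WT_DECL_RET;
    size_t addr_size, size;
    uint8_t addr[WT_BTREE_MAX_ADDR_COOKIE];

    /* Ask the block manager how large the aligned write will be. */
    size = payload_size;
    WT_RET(__wt_block_write_size(session, block, &size));

    /* Allocate a buffer and copy the payload in after the block header. */
    WT_RET(__wt_scr_alloc(session, size, &tmp));
    memcpy((uint8_t *)tmp->mem + WT_BLOCK_HEADER_BYTE_SIZE, payload, payload_size);
    tmp->size = WT_BLOCK_HEADER_BYTE_SIZE + payload_size;

    /* Write the block; addr and addr_size name the block for later reads. */
    ret = __wt_block_write(session, block, tmp, addr, &addr_size, true, false);

    __wt_scr_free(session, &tmp);
    return (ret);
}
#endif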
/*
 * __block_write_off --
 *     Write a buffer into a block, returning the block's offset, size and checksum.
 */
static int
__block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, uint32_t *objectidp,
  wt_off_t *offsetp, uint32_t *sizep, uint32_t *checksump, bool data_checksum, bool checkpoint_io,
  bool caller_locked)
{
    WT_BLOCK_HEADER *blk;
    WT_DECL_RET;
    WT_FH *fh;
    wt_off_t offset;
    size_t align_size;
    uint32_t checksum, objectid;
    uint8_t *file_sizep;
    bool local_locked;

    *offsetp = 0;   /* -Werror=maybe-uninitialized */
    *sizep = 0;     /* -Werror=maybe-uninitialized */
    *checksump = 0; /* -Werror=maybe-uninitialized */

    fh = block->fh;
    objectid = block->objectid;

    /* Buffers should be aligned for writing. */
    if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
        WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
        WT_RET_MSG(session, EINVAL, "direct I/O check: write buffer incorrectly allocated");
    }

    /*
     * File checkpoint/recovery magic: done before sizing the buffer as it may grow the buffer.
     */
    if (block->final_ckpt != NULL)
        WT_RET(__wt_block_checkpoint_final(session, block, buf, &file_sizep));

    /*
     * Align the size to an allocation unit.
     *
     * The buffer must be big enough for us to zero to the next allocsize boundary; this is one of
     * the reasons the btree layer must find out from the block-manager layer the maximum size of
     * the eventual write.
     */
    align_size = WT_ALIGN(buf->size, block->allocsize);
    if (align_size > buf->memsize) {
        WT_ASSERT(session, align_size <= buf->memsize);
        WT_RET_MSG(session, EINVAL, "buffer size check: write buffer incorrectly allocated");
    }
    if (align_size > UINT32_MAX) {
        WT_ASSERT(session, align_size <= UINT32_MAX);
        WT_RET_MSG(session, EINVAL, "buffer size check: write buffer too large to write");
    }

    /* Pre-allocate some number of extension structures. */
    WT_RET(__wt_block_ext_prealloc(session, 5));

    /*
     * Acquire a lock, if we don't already hold one. Allocate space for the write, and optionally
     * extend the file (note the block-extend function may release the lock). Release any locally
     * acquired lock.
     */
    local_locked = false;
    if (!caller_locked) {
        __wt_spin_lock(session, &block->live_lock);
        local_locked = true;
    }
    ret = __wt_block_alloc(session, block, &offset, (wt_off_t)align_size);
    if (ret == 0)
        ret = __wt_block_extend(session, block, fh, offset, align_size, &local_locked);
    if (local_locked)
        __wt_spin_unlock(session, &block->live_lock);
    WT_RET(ret);

    /*
     * The file has finished changing size. If this is the final write in a checkpoint, update the
     * checkpoint's information inline.
     */
    if (block->final_ckpt != NULL)
        WT_RET(__wt_vpack_uint(&file_sizep, 0, (uint64_t)block->size));

    /* Zero out any unused bytes at the end of the buffer. */
    memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size);

    /*
     * Clear the block header to ensure all of it is initialized, even the unused fields.
     */
    blk = WT_BLOCK_HEADER_REF(buf->mem);
    memset(blk, 0, sizeof(*blk));

    /*
     * Set the disk size so we don't have to incrementally read blocks during salvage.
     */
    blk->disk_size = WT_STORE_SIZE(align_size);

    /*
     * Update the block's checksum: checksum the complete data if our caller specifies, otherwise
     * checksum the leading WT_BLOCK_COMPRESS_SKIP bytes. Applications with a compression or
     * encryption engine that includes checksums won't need a separate checksum. However, if the
     * block was too small for compression, or compression failed to shrink the block, the block
     * wasn't compressed, in which case our caller will tell us to checksum the data. If skipping
     * checksums because of compression or encryption, we still need to checksum the first
     * WT_BLOCK_COMPRESS_SKIP bytes because they're not compressed or encrypted, both to give
     * salvage a quick test of whether a block is useful and to give us a test so we don't lose
     * the first WT_BLOCK_COMPRESS_SKIP bytes without noticing.
     *
     * Checksum a little-endian version of the header, and write everything in little-endian
     * format. The checksum is (potentially) returned in a big-endian format, swap it into place
     * in a separate step.
     */
    blk->flags = 0;
    if (data_checksum)
        F_SET(blk, WT_BLOCK_DATA_CKSUM);
    blk->checksum = 0;
    __wt_block_header_byteswap(blk);
    blk->checksum = checksum =
      __wt_checksum(buf->mem, data_checksum ? align_size : WT_BLOCK_COMPRESS_SKIP);
#ifdef WORDS_BIGENDIAN
    blk->checksum = __wt_bswap32(blk->checksum);
#endif

    /* Write the block. */
    if ((ret = __wt_write(session, fh, offset, align_size, buf->mem)) != 0) {
        if (!caller_locked)
            __wt_spin_lock(session, &block->live_lock);
        WT_TRET(__wt_block_off_free(session, block, objectid, offset, (wt_off_t)align_size));
        if (!caller_locked)
            __wt_spin_unlock(session, &block->live_lock);
        WT_RET(ret);
    }

    /*
     * Optionally schedule writes for dirty pages in the system buffer cache, but only if the
     * current session can wait.
     */
    if (block->os_cache_dirty_max != 0 && fh->written > block->os_cache_dirty_max &&
      __wt_session_can_wait(session)) {
        fh->written = 0;
        if ((ret = __wt_fsync(session, fh, false)) != 0) {
            /*
             * Ignore ENOTSUP, but don't try again.
             */
            if (ret != ENOTSUP)
                return (ret);
            block->os_cache_dirty_max = 0;
        }
    }

    /* Optionally discard blocks from the buffer cache. */
    WT_RET(__wt_block_discard(session, block, align_size));

    WT_STAT_CONN_INCR(session, block_write);
    WT_STAT_CONN_INCRV(session, block_byte_write, align_size);
    if (checkpoint_io)
        WT_STAT_CONN_INCRV(session, block_byte_write_checkpoint, align_size);

    __wt_verbose_debug2(session, WT_VERB_WRITE,
      "off %" PRIuMAX ", size %" PRIuMAX ", checksum %#" PRIx32, (uintmax_t)offset,
      (uintmax_t)align_size, checksum);

    *objectidp = objectid;
    *offsetp = offset;
    *sizep = WT_STORE_SIZE(align_size);
    *checksump = checksum;

    return (0);
}

/*
 * __wt_block_write_off --
 *     Write a buffer into a block, returning the block's offset, size and checksum.
 */
int
__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf, uint32_t *objectidp,
  wt_off_t *offsetp, uint32_t *sizep, uint32_t *checksump, bool data_checksum, bool checkpoint_io,
  bool caller_locked)
{
    WT_DECL_RET;

    /*
     * Ensure the page header is in little-endian order; this doesn't belong here, but it's the
     * best place to catch all callers. After the write, swap values back to native order so
     * callers never see anything other than their original content.
     */
    __wt_page_header_byteswap(buf->mem);
    ret = __block_write_off(session, block, buf, objectidp, offsetp, sizep, checksump,
      data_checksum, checkpoint_io, caller_locked);
    __wt_page_header_byteswap(buf->mem);
    return (ret);
}
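
/*
 * A final sketch, excluded from the build with #if 0: the swap-around-the-call pattern used by
 * __wt_block_write_off above, in isolation. On-disk data is always little-endian, so on a
 * big-endian host the header is swapped into disk order, the operation runs, and the header is
 * swapped back so callers only ever see native byte order (on little-endian hosts the swaps are
 * no-ops). The do_io callback is a hypothetical stand-in for the real write.
 */
#if 0
static int
example_swap_around_call(
  WT_SESSION_IMPL *session, WT_ITEM *buf, int (*do_io)(WT_SESSION_IMPL *, WT_ITEM *))
{
    WT_DECL_RET;

    __wt_page_header_byteswap(buf->mem); /* Native to little-endian. */
    ret = do_io(session, buf);           /* Run the I/O while the header is in disk order. */
    __wt_page_header_byteswap(buf->mem); /* Back to native order, even on error. */
    return (ret);
}
#endif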