diff options
Diffstat (limited to 'src/third_party/wiredtiger/src/block/block_write.c')
-rw-r--r-- | src/third_party/wiredtiger/src/block/block_write.c | 269 |
1 files changed, 269 insertions, 0 deletions
diff --git a/src/third_party/wiredtiger/src/block/block_write.c b/src/third_party/wiredtiger/src/block/block_write.c new file mode 100644 index 00000000000..0da6380e61f --- /dev/null +++ b/src/third_party/wiredtiger/src/block/block_write.c @@ -0,0 +1,269 @@ +/*- + * Copyright (c) 2008-2014 WiredTiger, Inc. + * All rights reserved. + * + * See the file LICENSE for redistribution information. + */ + +#include "wt_internal.h" + +/* + * __wt_block_header -- + * Return the size of the block-specific header. + */ +u_int +__wt_block_header(WT_BLOCK *block) +{ + WT_UNUSED(block); + + return ((u_int)WT_BLOCK_HEADER_SIZE); +} + +/* + * __wt_block_write_size -- + * Return the buffer size required to write a block. + */ +int +__wt_block_write_size(WT_SESSION_IMPL *session, WT_BLOCK *block, size_t *sizep) +{ + WT_UNUSED(session); + + /* + * We write the page size, in bytes, into the block's header as a 4B + * unsigned value, and it's possible for the engine to accept an item + * we can't write. For example, a huge key/value where the allocation + * size has been set to something large will overflow 4B when it tries + * to align the write. We could make this work (for example, writing + * the page size in units of allocation size or something else), but + * it's not worth the effort, writing 4GB objects into a btree makes + * no sense. Limit the writes to (4GB - 1KB), it gives us potential + * mode bits, and I'm not interested in debugging corner cases anyway. + */ + *sizep = (size_t) + WT_ALIGN(*sizep + WT_BLOCK_HEADER_BYTE_SIZE, block->allocsize); + return (*sizep > UINT32_MAX - 1024 ? EINVAL : 0); +} + +/* + * __wt_block_write -- + * Write a buffer into a block, returning the block's address cookie. + */ +int +__wt_block_write(WT_SESSION_IMPL *session, WT_BLOCK *block, + WT_ITEM *buf, uint8_t *addr, size_t *addr_sizep, int data_cksum) +{ + wt_off_t offset; + uint32_t size, cksum; + uint8_t *endp; + + WT_RET(__wt_block_write_off( + session, block, buf, &offset, &size, &cksum, data_cksum, 0)); + + endp = addr; + WT_RET(__wt_block_addr_to_buffer(block, &endp, offset, size, cksum)); + *addr_sizep = WT_PTRDIFF(endp, addr); + + return (0); +} + +/* + * __wt_block_write_off -- + * Write a buffer into a block, returning the block's offset, size and + * checksum. + */ +int +__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block, + WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump, + int data_cksum, int caller_locked) +{ + WT_BLOCK_HEADER *blk; + WT_DECL_RET; + WT_FH *fh; + size_t align_size; + wt_off_t offset; + int local_locked; + + blk = WT_BLOCK_HEADER_REF(buf->mem); + fh = block->fh; + local_locked = 0; + + /* Buffers should be aligned for writing. */ + if (!F_ISSET(buf, WT_ITEM_ALIGNED)) { + WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED)); + WT_RET_MSG(session, EINVAL, + "direct I/O check: write buffer incorrectly allocated"); + } + + /* + * Align the size to an allocation unit. + * + * The buffer must be big enough for us to zero to the next allocsize + * boundary, this is one of the reasons the btree layer must find out + * from the block-manager layer the maximum size of the eventual write. + */ + align_size = WT_ALIGN(buf->size, block->allocsize); + if (align_size > buf->memsize) { + WT_ASSERT(session, align_size <= buf->memsize); + WT_RET_MSG(session, EINVAL, + "buffer size check: write buffer incorrectly allocated"); + } + if (align_size > UINT32_MAX) { + WT_ASSERT(session, align_size <= UINT32_MAX); + WT_RET_MSG(session, EINVAL, + "buffer size check: write buffer too large to write"); + } + + /* Zero out any unused bytes at the end of the buffer. */ + memset((uint8_t *)buf->mem + buf->size, 0, align_size - buf->size); + + /* + * Set the disk size so we don't have to incrementally read blocks + * during salvage. + */ + blk->disk_size = WT_STORE_SIZE(align_size); + + /* + * Update the block's checksum: if our caller specifies, checksum the + * complete data, otherwise checksum the leading WT_BLOCK_COMPRESS_SKIP + * bytes. The assumption is applications with good compression support + * turn off checksums and assume corrupted blocks won't decompress + * correctly. However, if compression failed to shrink the block, the + * block wasn't compressed, in which case our caller will tell us to + * checksum the data to detect corruption. If compression succeeded, + * we still need to checksum the first WT_BLOCK_COMPRESS_SKIP bytes + * because they're not compressed, both to give salvage a quick test + * of whether a block is useful and to give us a test so we don't lose + * the first WT_BLOCK_COMPRESS_SKIP bytes without noticing. + */ + blk->flags = 0; + if (data_cksum) + F_SET(blk, WT_BLOCK_DATA_CKSUM); + blk->cksum = 0; + blk->cksum = __wt_cksum( + buf->mem, data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP); + + if (!caller_locked) { + WT_RET(__wt_block_ext_prealloc(session, 5)); + __wt_spin_lock(session, &block->live_lock); + local_locked = 1; + } + ret = __wt_block_alloc(session, block, &offset, (wt_off_t)align_size); + + /* + * Extend the file in chunks. We want to limit the number of threads + * extending the file at the same time, so choose the one thread that's + * crossing the extended boundary. We don't extend newly created files, + * and it's theoretically possible we might wait so long our extension + * of the file is passed by another thread writing single blocks, that's + * why there's a check in case the extended file size becomes too small: + * if the file size catches up, every thread tries to extend it. + * + * File extension may require locking: some variants of the system call + * used to extend the file initialize the extended space. If a writing + * thread races with the extending thread, the extending thread might + * overwrite already written data, and that would be very, very bad. + * + * Some variants of the system call to extend the file fail at run-time + * based on the filesystem type, fall back to ftruncate in that case, + * and remember that ftruncate requires locking. + */ + if (ret == 0 && + fh->extend_len != 0 && + (fh->extend_size <= fh->size || + (offset + fh->extend_len <= fh->extend_size && + offset + + fh->extend_len + (wt_off_t)align_size >= fh->extend_size))) { + fh->extend_size = offset + fh->extend_len * 2; + if (fh->fallocate_available) { + /* + * Release any locally acquired lock if it's not needed + * to extend the file, extending the file might require + * updating file metadata, which can be slow. (It may be + * a bad idea to configure for file extension on systems + * that require locking over the extend call.) + */ + if (!fh->fallocate_requires_locking && local_locked) { + __wt_spin_unlock(session, &block->live_lock); + local_locked = 0; + } + + /* Extend the file. */ + if ((ret = __wt_fallocate(session, + fh, offset, fh->extend_len * 2)) == ENOTSUP) { + ret = 0; + goto extend_truncate; + } + } else { +extend_truncate: /* + * We may have a caller lock or a locally acquired lock, + * but we need a lock to call ftruncate. + */ + if (!caller_locked && local_locked == 0) { + __wt_spin_lock(session, &block->live_lock); + local_locked = 1; + } + /* + * The truncate might fail if there's a file mapping + * (if there's an open checkpoint on the file), that's + * OK. + */ + if ((ret = __wt_ftruncate( + session, fh, offset + fh->extend_len * 2)) == EBUSY) + ret = 0; + } + } + /* Release any locally acquired lock. */ + if (local_locked) { + __wt_spin_unlock(session, &block->live_lock); + local_locked = 0; + } + WT_RET(ret); + + /* Write the block. */ + if ((ret = + __wt_write(session, fh, offset, align_size, buf->mem)) != 0) { + if (!caller_locked) + __wt_spin_lock(session, &block->live_lock); + WT_TRET(__wt_block_off_free( + session, block, offset, (wt_off_t)align_size)); + if (!caller_locked) + __wt_spin_unlock(session, &block->live_lock); + WT_RET(ret); + } + +#ifdef HAVE_SYNC_FILE_RANGE + /* + * Optionally schedule writes for dirty pages in the system buffer + * cache, but only if the current session can wait. + */ + if (block->os_cache_dirty_max != 0 && + (block->os_cache_dirty += align_size) > block->os_cache_dirty_max && + __wt_session_can_wait(session)) { + block->os_cache_dirty = 0; + WT_RET(__wt_fsync_async(session, fh)); + } +#endif +#ifdef HAVE_POSIX_FADVISE + /* Optionally discard blocks from the system buffer cache. */ + if (block->os_cache_max != 0 && + (block->os_cache += align_size) > block->os_cache_max) { + block->os_cache = 0; + if ((ret = posix_fadvise(fh->fd, + (wt_off_t)0, (wt_off_t)0, POSIX_FADV_DONTNEED)) != 0) + WT_RET_MSG( + session, ret, "%s: posix_fadvise", block->name); + } +#endif + WT_STAT_FAST_CONN_INCR(session, block_write); + WT_STAT_FAST_CONN_INCRV(session, block_byte_write, align_size); + + WT_RET(__wt_verbose(session, WT_VERB_WRITE, + "off %" PRIuMAX ", size %" PRIuMAX ", cksum %" PRIu32, + (uintmax_t)offset, (uintmax_t)align_size, blk->cksum)); + + *offsetp = offset; + *sizep = WT_STORE_SIZE(align_size); + *cksump = blk->cksum; + + return (ret); +} |