1 files changed, 108 insertions, 95 deletions
diff --git a/src/third_party/wiredtiger/src/block/block_write.c b/src/third_party/wiredtiger/src/block/block_write.c
index 1fefeee09da..032f72d551b 100644
--- a/src/third_party/wiredtiger/src/block/block_write.c
+++ b/src/third_party/wiredtiger/src/block/block_write.c
@@ -15,29 +15,47 @@
 int
 __wt_block_truncate(WT_SESSION_IMPL *session, WT_BLOCK *block, wt_off_t len)
 {
+	WT_CONNECTION_IMPL *conn;
+	WT_DECL_RET;
+
+	conn = S2C(session);
+
+	__wt_verbose(session,
+	    WT_VERB_BLOCK, "truncate file to %" PRIuMAX, (uintmax_t)len);
+
+	/*
+	 * Truncate requires serialization; we depend on our caller for that.
+	 *
+	 * Truncation isn't a requirement of the block manager, it's only used
+	 * to conserve disk space. Regardless of the underlying file system
+	 * call's result, the in-memory file size and extend size become len.
+	 */
+	block->size = block->extend_size = len;
+
 	/*
 	 * Backups are done by copying files outside of WiredTiger, potentially
 	 * by system utilities. We cannot truncate the file during the backup
 	 * window, we might surprise an application.
 	 *
-	 * Stop block truncation. This affects files that aren't involved in the
-	 * backup (for example, doing incremental backups, which only copies log
-	 * files, or targeted backups, stops all truncation). We may want a more
-	 * targeted solution at some point.
+	 * This affects files that aren't involved in the backup: for example,
+	 * an incremental backup (which only copies log files) or a targeted
+	 * backup stops all block truncation, even for files it never copies.
+	 * We may want a more targeted solution at some point.
 	 */
-	if (S2C(session)->hot_backup)
-		return (EBUSY);
+	if (!conn->hot_backup) {
+		__wt_readlock(session, conn->hot_backup_lock);
+		if (!conn->hot_backup)	/* Recheck with the lock held. */
+			ret = __wt_ftruncate(session, block->fh, len);
+		__wt_readunlock(session, conn->hot_backup_lock);
+	}
 
 	/*
-	 * Additionally, the truncate might fail if there's a file mapping (if
-	 * there's an open checkpoint on the file), in which case the underlying
-	 * function returns EBUSY.
+	 * The truncate may fail temporarily or permanently (for example, there
+	 * may be a file mapping if there's an open checkpoint on the file on a
+	 * POSIX system, in which case the underlying function returns EBUSY).
+	 * EBUSY and ENOTSUP are ignored, we don't have to truncate files.
 	 */
-	WT_RET(__wt_ftruncate(session, block->fh, len));
-
-	block->size = block->extend_size = len;
-
-	return (0);
+	return (ret == EBUSY || ret == ENOTSUP ? 0 : ret);
 }
 
 /*
@@ -82,22 +100,18 @@ __wt_block_extend(WT_SESSION_IMPL *session, WT_BLOCK *block,
 {
 	WT_DECL_RET;
 	WT_FILE_HANDLE *handle;
-	bool locked;
 
 	/*
 	 * The locking in this function is messy: by definition, the live system
 	 * is locked when we're called, but that lock may have been acquired by
 	 * our caller or our caller's caller. If our caller's lock, release_lock
-	 * comes in set, indicating this function can unlock it before returning
-	 * (either before extending the file or afterward, depending on the call
-	 * used). If it is our caller's caller, then release_lock comes in not
-	 * set, indicating it cannot be released here.
+	 * comes in set and this function can unlock it before returning (so it
+	 * isn't held while extending the file). If it is our caller's caller,
+	 * then release_lock comes in not set, indicating it cannot be released
+	 * here.
 	 *
-	 * If we unlock here, we clear release_lock. But if we then find out we
-	 * need a lock after all, we re-acquire the lock and set release_lock so
-	 * our caller knows to release it.
+	 * If we unlock here, we clear release_lock.
 	 */
-	locked = true;
 
 	/* If not configured to extend the file, we're done. */
 	if (block->extend_len == 0)
@@ -122,62 +136,39 @@ __wt_block_extend(WT_SESSION_IMPL *session, WT_BLOCK *block,
 	 * used to extend the file initialize the extended space. If a writing
 	 * thread races with the extending thread, the extending thread might
 	 * overwrite already written data, and that would be very, very bad.
-	 *
-	 * Some variants of the system call to extend the file fail at run-time
-	 * based on the filesystem type, fall back to ftruncate in that case,
-	 * and remember that ftruncate requires locking.
 	 */
 	handle = fh->handle;
-	if (handle->fh_allocate != NULL ||
-	    handle->fh_allocate_nolock != NULL) {
-		/*
-		 * Release any locally acquired lock if not needed to extend the
-		 * file, extending the file may require updating on-disk file's
-		 * metadata, which can be slow. (It may be a bad idea to
-		 * configure for file extension on systems that require locking
-		 * over the extend call.)
-		 */
-		if (handle->fh_allocate_nolock != NULL && *release_lockp) {
-			*release_lockp = locked = false;
-			__wt_spin_unlock(session, &block->live_lock);
-		}
-
-		/*
-		 * Extend the file: there's a race between setting the value of
-		 * extend_size and doing the extension, but it should err on the
-		 * side of extend_size being smaller than the actual file size,
-		 * and that's OK, we simply may do another extension sooner than
-		 * otherwise.
-		 */
-		block->extend_size = block->size + block->extend_len * 2;
-		if ((ret = __wt_fallocate(
-		    session, fh, block->size, block->extend_len * 2)) == 0)
-			return (0);
-		WT_RET_ERROR_OK(ret, ENOTSUP);
-	}
+	if (handle->fh_extend == NULL && handle->fh_extend_nolock == NULL)
+		return (0);
 
 	/*
-	 * We may have a caller lock or a locally acquired lock, but we need a
-	 * lock to call ftruncate.
+	 * Set the extend_size before releasing the lock: we don't want to read
+	 * and manipulate multiple values without holding a lock.
+	 *
+	 * There's a race between the calculation and doing the extension, but
+	 * it should err on the side of extend_size being smaller than the
+	 * actual file size, and that's OK, we simply may do another extension
+	 * sooner than otherwise.
 	 */
-	if (!locked) {
-		__wt_spin_lock(session, &block->live_lock);
-		*release_lockp = true;
-	}
+	block->extend_size = block->size + block->extend_len * 2;
 
 	/*
-	 * The underlying truncate call initializes allocated space, reset the
-	 * extend length after locking so we don't overwrite already-written
-	 * blocks.
+	 * Release any locally acquired lock if not needed to extend the file,
+	 * extending the file may require updating on-disk file's metadata,
+	 * which can be slow. (It may be a bad idea to configure for file
+	 * extension on systems that require locking over the extend call.)
 	 */
-	block->extend_size = block->size + block->extend_len * 2;
+	if (handle->fh_extend_nolock != NULL && *release_lockp) {
+		*release_lockp = false;
+		__wt_spin_unlock(session, &block->live_lock);
+	}
 
 	/*
-	 * The truncate might fail if there's a mapped file (in other words, if
-	 * there's an open checkpoint on the file), that's OK.
+	 * The extend might fail (for example, the file is mapped into memory),
+	 * or discover file extension isn't supported; both are OK.
 	 */
-	WT_RET_BUSY_OK(__wt_ftruncate(session, fh, block->extend_size));
-	return (0);
+	ret = __wt_fextend(session, fh, block->extend_size);
+	return (ret == EBUSY || ret == ENOTSUP ? 0 : ret);
 }
 
 /*
@@ -210,39 +201,39 @@ __wt_block_write_size(WT_SESSION_IMPL *session, WT_BLOCK *block, size_t *sizep)
  *	Write a buffer into a block, returning the block's address cookie.
  */
 int
-__wt_block_write(WT_SESSION_IMPL *session, WT_BLOCK *block,
-    WT_ITEM *buf, uint8_t *addr, size_t *addr_sizep, bool data_cksum)
+__wt_block_write(WT_SESSION_IMPL *session, WT_BLOCK *block, WT_ITEM *buf,
+    uint8_t *addr, size_t *addr_sizep, bool data_checksum, bool checkpoint_io)
 {
 	wt_off_t offset;
-	uint32_t size, cksum;
+	uint32_t checksum, size;	/* Set by the block write below. */
 	uint8_t *endp;
 
-	WT_RET(__wt_block_write_off(
-	    session, block, buf, &offset, &size, &cksum, data_cksum, false));
+	WT_RET(__wt_block_write_off(session, block, buf,
+	    &offset, &size, &checksum, data_checksum, checkpoint_io, false));
 
 	endp = addr;
-	WT_RET(__wt_block_addr_to_buffer(block, &endp, offset, size, cksum));
+	WT_RET(__wt_block_addr_to_buffer(block, &endp, offset, size, checksum));
 	*addr_sizep = WT_PTRDIFF(endp, addr);
 
 	return (0);
 }
 
 /*
- * __wt_block_write_off --
+ * __block_write_off --
  *	Write a buffer into a block, returning the block's offset, size and
  * checksum.
  */
-int
-__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
-    WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *cksump,
-    bool data_cksum, bool caller_locked)
+static int
+__block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
+    WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *checksump,
+    bool data_checksum, bool checkpoint_io, bool caller_locked)
 {
 	WT_BLOCK_HEADER *blk;
 	WT_DECL_RET;
 	WT_FH *fh;
 	size_t align_size;
 	wt_off_t offset;
-	uint32_t cksum;
+	uint32_t checksum;
 	bool local_locked;
 
 	fh = block->fh;
@@ -254,12 +245,6 @@ __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
 	blk = WT_BLOCK_HEADER_REF(buf->mem);
 	memset(blk, 0, sizeof(*blk));
 
-	/*
-	 * Swap the page-header as needed; this doesn't belong here, but it's
-	 * the best place to catch all callers.
-	 */
-	__wt_page_header_byteswap(buf->mem);
-
 	/* Buffers should be aligned for writing. */
 	if (!F_ISSET(buf, WT_ITEM_ALIGNED)) {
 		WT_ASSERT(session, F_ISSET(buf, WT_ITEM_ALIGNED));
@@ -313,14 +298,14 @@ __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
 	 * big-endian format, swap it into place in a separate step.
 	 */
 	blk->flags = 0;
-	if (data_cksum)
+	if (data_checksum)
 		F_SET(blk, WT_BLOCK_DATA_CKSUM);
-	blk->cksum = 0;
+	blk->checksum = 0;
 	__wt_block_header_byteswap(blk);
-	blk->cksum = cksum = __wt_cksum(
-	    buf->mem, data_cksum ? align_size : WT_BLOCK_COMPRESS_SKIP);
+	blk->checksum = checksum = __wt_checksum(
+	    buf->mem, data_checksum ? align_size : WT_BLOCK_COMPRESS_SKIP);
 #ifdef WORDS_BIGENDIAN
-	blk->cksum = __wt_bswap32(blk->cksum);
+	blk->checksum = __wt_bswap32(blk->checksum);
 #endif
 
 	/* Pre-allocate some number of extension structures. */
@@ -380,14 +365,42 @@ __wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
 
 	WT_STAT_FAST_CONN_INCR(session, block_write);
 	WT_STAT_FAST_CONN_INCRV(session, block_byte_write, align_size);
+	if (checkpoint_io)
+		WT_STAT_FAST_CONN_INCRV(
+		    session, block_byte_write_checkpoint, align_size);
 
-	WT_RET(__wt_verbose(session, WT_VERB_WRITE,
-	    "off %" PRIuMAX ", size %" PRIuMAX ", cksum %" PRIu32,
-	    (uintmax_t)offset, (uintmax_t)align_size, cksum));
+	__wt_verbose(session, WT_VERB_WRITE,
+	    "off %" PRIuMAX ", size %" PRIuMAX ", checksum %" PRIu32,
+	    (uintmax_t)offset, (uintmax_t)align_size, checksum);
 
 	*offsetp = offset;
 	*sizep = WT_STORE_SIZE(align_size);
-	*cksump = cksum;
+	*checksump = checksum;
 
 	return (0);
 }
+
+/*
+ * __wt_block_write_off --
+ *	Byte-swap the page header around __block_write_off, which writes the
+ * buffer and returns the block's offset, size and checksum.
+ */
+int
+__wt_block_write_off(WT_SESSION_IMPL *session, WT_BLOCK *block,
+    WT_ITEM *buf, wt_off_t *offsetp, uint32_t *sizep, uint32_t *checksump,
+    bool data_checksum, bool checkpoint_io, bool caller_locked)
+{
+	WT_DECL_RET;
+
+	/*
+	 * Ensure the page header is in little-endian order; this doesn't belong
+	 * here, but it's the best place to catch all callers. After the write,
+	 * whether or not it succeeded, swap values back to native order so
+	 * callers never see anything other than their original content.
+	 */
+	__wt_page_header_byteswap(buf->mem);
+	ret = __block_write_off(session, block, buf, offsetp,
+	    sizep, checksump, data_checksum, checkpoint_io, caller_locked);
+	__wt_page_header_byteswap(buf->mem);
+	return (ret);
+}