summaryrefslogtreecommitdiff
path: root/src/third_party/wiredtiger/src/btree/bt_io.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/third_party/wiredtiger/src/btree/bt_io.c')
-rw-r--r--src/third_party/wiredtiger/src/btree/bt_io.c304
1 file changed, 304 insertions, 0 deletions
diff --git a/src/third_party/wiredtiger/src/btree/bt_io.c b/src/third_party/wiredtiger/src/btree/bt_io.c
new file mode 100644
index 00000000000..ccc67c994dc
--- /dev/null
+++ b/src/third_party/wiredtiger/src/btree/bt_io.c
@@ -0,0 +1,304 @@
+/*-
+ * Copyright (c) 2008-2014 WiredTiger, Inc.
+ * All rights reserved.
+ *
+ * See the file LICENSE for redistribution information.
+ */
+
+#include "wt_internal.h"
+
+/*
+ * __wt_bt_read --
+ *	Read a cookie referenced block into a buffer.
+ *
+ *	The addr/addr_size pair is an opaque block-manager cookie; the block
+ * is read (and decompressed if necessary) into the caller's buffer.
+ */
+int
+__wt_bt_read(WT_SESSION_IMPL *session,
+    WT_ITEM *buf, const uint8_t *addr, size_t addr_size)
+{
+	WT_BM *bm;
+	WT_BTREE *btree;
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
+	const WT_PAGE_HEADER *dsk;
+	size_t result_len;
+
+	btree = S2BT(session);
+	bm = btree->bm;
+
+	/*
+	 * If anticipating a compressed block, read into a scratch buffer and
+	 * decompress into the caller's buffer. Else, read directly into the
+	 * caller's buffer.
+	 */
+	if (btree->compressor == NULL) {
+		WT_RET(bm->read(bm, session, buf, addr, addr_size));
+		dsk = buf->data;
+	} else {
+		WT_RET(__wt_scr_alloc(session, 0, &tmp));
+		WT_ERR(bm->read(bm, session, tmp, addr, addr_size));
+		/*
+		 * NOTE: dsk aliases the scratch buffer's memory from here on,
+		 * it remains valid until tmp is freed at the err label.
+		 */
+		dsk = tmp->data;
+	}
+
+	/*
+	 * If the block is compressed, copy the skipped bytes of the original
+	 * image into place, then decompress.
+	 */
+	if (F_ISSET(dsk, WT_PAGE_COMPRESSED)) {
+		if (btree->compressor == NULL ||
+		    btree->compressor->decompress == NULL)
+			WT_ERR_MSG(session, WT_ERROR,
+			    "read compressed block where no compression engine "
+			    "configured");
+
+		/*
+		 * We're allocating the exact number of bytes we're expecting
+		 * from decompression.
+		 */
+		WT_ERR(__wt_buf_initsize(session, buf, dsk->mem_size));
+
+		/*
+		 * Note the source length is NOT the number of compressed bytes,
+		 * it's the length of the block we just read (minus the skipped
+		 * bytes). We don't store the number of compressed bytes: some
+		 * compression engines need that length stored externally, they
+		 * don't have markers in the stream to signal the end of the
+		 * compressed bytes. Those engines must store the compressed
+		 * byte length somehow, see the snappy compression extension for
+		 * an example.
+		 */
+		/*
+		 * The leading WT_BLOCK_COMPRESS_SKIP bytes are stored
+		 * uncompressed (the page header fields, e.g. mem_size, were
+		 * read from the raw image above), so copy them by hand.
+		 */
+		memcpy(buf->mem, tmp->data, WT_BLOCK_COMPRESS_SKIP);
+		ret = btree->compressor->decompress(
+		    btree->compressor, &session->iface,
+		    (uint8_t *)tmp->data + WT_BLOCK_COMPRESS_SKIP,
+		    tmp->size - WT_BLOCK_COMPRESS_SKIP,
+		    (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP,
+		    dsk->mem_size - WT_BLOCK_COMPRESS_SKIP, &result_len);
+
+		/*
+		 * If checksums were turned off because we're depending on the
+		 * decompression to fail on any corrupted data, we'll end up
+		 * here after corruption happens. If we're salvaging the file,
+		 * it's OK, otherwise it's really, really bad.
+		 */
+		if (ret != 0 ||
+		    result_len != dsk->mem_size - WT_BLOCK_COMPRESS_SKIP)
+			WT_ERR(
+			    F_ISSET(btree, WT_BTREE_VERIFY) ||
+			    F_ISSET(session, WT_SESSION_SALVAGE_CORRUPT_OK) ?
+			    WT_ERROR :
+			    __wt_illegal_value(session, btree->dhandle->name));
+	} else
+		/*
+		 * The block wasn't compressed: if it was read directly into
+		 * the caller's buffer, all that's left is to set the final
+		 * size.
+		 */
+		if (btree->compressor == NULL)
+			buf->size = dsk->mem_size;
+		else
+			/*
+			 * We guessed wrong: there was a compressor, but this
+			 * block was not compressed, and now the page is in the
+			 * wrong buffer and the buffer may be of the wrong size.
+			 * This should be rare, but happens with small blocks
+			 * that aren't worth compressing.
+			 */
+			WT_ERR(__wt_buf_set(
+			    session, buf, tmp->data, dsk->mem_size));
+
+	/* If the handle is a verify handle, verify the physical page. */
+	if (F_ISSET(btree, WT_BTREE_VERIFY)) {
+		/* The scratch buffer only exists if a compressor was set. */
+		if (tmp == NULL)
+			WT_ERR(__wt_scr_alloc(session, 0, &tmp));
+		WT_ERR(bm->addr_string(bm, session, tmp, addr, addr_size));
+		WT_ERR(__wt_verify_dsk(session, (const char *)tmp->data, buf));
+	}
+
+	/*
+	 * Update read statistics; dsk->mem_size is the in-memory page size,
+	 * valid for both paths because the page header is never compressed.
+	 */
+	WT_STAT_FAST_CONN_INCR(session, cache_read);
+	WT_STAT_FAST_DATA_INCR(session, cache_read);
+	if (F_ISSET(dsk, WT_PAGE_COMPRESSED))
+		WT_STAT_FAST_DATA_INCR(session, compress_read);
+	WT_STAT_FAST_CONN_INCRV(session, cache_bytes_read, dsk->mem_size);
+	WT_STAT_FAST_DATA_INCRV(session, cache_bytes_read, dsk->mem_size);
+
+	/* Discard the scratch buffer (which dsk may still reference). */
+err:	__wt_scr_free(&tmp);
+	return (ret);
+}
+
+/*
+ * __wt_bt_write --
+ *	Write a buffer into a block, returning the block's addr/size and
+ * checksum.
+ *
+ *	"checkpoint" is set for checkpoint writes (in which case addr and
+ * addr_sizep must be NULL); "compressed" is set when the caller's image has
+ * already been compressed.
+ */
+int
+__wt_bt_write(WT_SESSION_IMPL *session, WT_ITEM *buf,
+    uint8_t *addr, size_t *addr_sizep, int checkpoint, int compressed)
+{
+	WT_BM *bm;
+	WT_BTREE *btree;
+	WT_ITEM *ip;
+	WT_DECL_ITEM(tmp);
+	WT_DECL_RET;
+	WT_PAGE_HEADER *dsk;
+	size_t len, src_len, dst_len, result_len, size;
+	int data_cksum, compression_failed;
+	uint8_t *src, *dst;
+
+	btree = S2BT(session);
+	bm = btree->bm;
+
+	/* Checkpoint calls are different than standard calls. */
+	WT_ASSERT(session,
+	    (checkpoint == 0 && addr != NULL && addr_sizep != NULL) ||
+	    (checkpoint == 1 && addr == NULL && addr_sizep == NULL));
+
+#ifdef HAVE_DIAGNOSTIC
+	/*
+	 * We're passed a table's disk image. Decompress if necessary and
+	 * verify the image. Always check the in-memory length for accuracy.
+	 */
+	dsk = buf->mem;
+	WT_ASSERT(session, dsk->u.entries != 0);
+	if (compressed) {
+		WT_ERR(__wt_scr_alloc(session, dsk->mem_size, &tmp));
+
+		/* The header prefix is stored uncompressed, copy it by hand. */
+		memcpy(tmp->mem, buf->data, WT_BLOCK_COMPRESS_SKIP);
+		WT_ERR(btree->compressor->decompress(
+		    btree->compressor, &session->iface,
+		    (uint8_t *)buf->data + WT_BLOCK_COMPRESS_SKIP,
+		    buf->size - WT_BLOCK_COMPRESS_SKIP,
+		    (uint8_t *)tmp->data + WT_BLOCK_COMPRESS_SKIP,
+		    tmp->memsize - WT_BLOCK_COMPRESS_SKIP,
+		    &result_len));
+		WT_ASSERT(session,
+		    dsk->mem_size == result_len + WT_BLOCK_COMPRESS_SKIP);
+		tmp->size = (uint32_t)result_len + WT_BLOCK_COMPRESS_SKIP;
+		ip = tmp;
+	} else {
+		WT_ASSERT(session, dsk->mem_size == buf->size);
+		ip = buf;
+	}
+	WT_ERR(__wt_verify_dsk(session, "[write-check]", ip));
+	__wt_scr_free(&tmp);
+#endif
+
+	/*
+	 * Optionally stream-compress the data, but don't compress blocks that
+	 * are already as small as they're going to get.
+	 */
+	if (btree->compressor == NULL ||
+	    btree->compressor->compress == NULL || compressed)
+		ip = buf;
+	else if (buf->size <= btree->allocsize) {
+		ip = buf;
+		WT_STAT_FAST_DATA_INCR(session, compress_write_too_small);
+	} else {
+		/* Skip the header bytes of the source data. */
+		src = (uint8_t *)buf->mem + WT_BLOCK_COMPRESS_SKIP;
+		src_len = buf->size - WT_BLOCK_COMPRESS_SKIP;
+
+		/*
+		 * Compute the size needed for the destination buffer. We only
+		 * allocate enough memory for a copy of the original by default,
+		 * if any compressed version is bigger than the original, we
+		 * won't use it. However, some compression engines (snappy is
+		 * one example), may need more memory because they don't stop
+		 * just because there's no more memory into which to compress.
+		 */
+		if (btree->compressor->pre_size == NULL)
+			len = src_len;
+		else
+			WT_ERR(btree->compressor->pre_size(btree->compressor,
+			    &session->iface, src, src_len, &len));
+
+		/* Let the block manager round the size up as it requires. */
+		size = len + WT_BLOCK_COMPRESS_SKIP;
+		WT_ERR(bm->write_size(bm, session, &size));
+		WT_ERR(__wt_scr_alloc(session, size, &tmp));
+
+		/* Skip the header bytes of the destination data. */
+		dst = (uint8_t *)tmp->mem + WT_BLOCK_COMPRESS_SKIP;
+		dst_len = len;
+
+		compression_failed = 0;
+		WT_ERR(btree->compressor->compress(btree->compressor,
+		    &session->iface,
+		    src, src_len,
+		    dst, dst_len,
+		    &result_len, &compression_failed));
+		/* Add back the uncompressed header bytes we skipped. */
+		result_len += WT_BLOCK_COMPRESS_SKIP;
+
+		/*
+		 * If compression fails, or doesn't gain us at least one unit of
+		 * allocation, fallback to the original version. This isn't
+		 * unexpected: if compression doesn't work for some chunk of
+		 * data for some reason (noting likely additional format/header
+		 * information which compressed output requires), it just means
+		 * the uncompressed version is as good as it gets, and that's
+		 * what we use.
+		 */
+		if (compression_failed ||
+		    buf->size / btree->allocsize ==
+		    result_len / btree->allocsize) {
+			ip = buf;
+			WT_STAT_FAST_DATA_INCR(session, compress_write_fail);
+		} else {
+			compressed = 1;
+			WT_STAT_FAST_DATA_INCR(session, compress_write);
+
+			/*
+			 * Copy in the skipped header bytes, set the final data
+			 * size.
+			 */
+			memcpy(tmp->mem, buf->mem, WT_BLOCK_COMPRESS_SKIP);
+			tmp->size = result_len;
+			ip = tmp;
+		}
+	}
+	/* From here on, work with whichever image we decided to write. */
+	dsk = ip->mem;
+
+	/* If the buffer is compressed, set the flag. */
+	if (compressed)
+		F_SET(dsk, WT_PAGE_COMPRESSED);
+
+	/*
+	 * We increment the block's write generation so it's easy to identify
+	 * newer versions of blocks during salvage. (It's common in WiredTiger,
+	 * at least for the default block manager, for multiple blocks to be
+	 * internally consistent with identical first and last keys, so we need
+	 * a way to know the most recent state of the block. We could check
+	 * which leaf is referenced by a valid internal page, but that implies
+	 * salvaging internal pages, which I don't want to do, and it's not
+	 * as good anyway, because the internal page may not have been written
+	 * after the leaf page was updated. So, write generations it is.
+	 *
+	 * Nothing is locked at this point but two versions of a page with the
+	 * same generation is pretty unlikely, and if we did, they're going to
+	 * be roughly identical for the purposes of salvage, anyway.
+	 */
+	dsk->write_gen = ++btree->write_gen;
+
+	/*
+	 * Decide whether the block manager should checksum the data: always
+	 * for CKSUM_ON, never for CKSUM_OFF, and only for uncompressed blocks
+	 * for CKSUM_UNCOMPRESSED (decompression is expected to fail on any
+	 * corrupted compressed data, so a checksum would be redundant).
+	 */
+	switch (btree->checksum) {
+	case CKSUM_ON:
+		data_cksum = 1;
+		break;
+	case CKSUM_OFF:
+		data_cksum = 0;
+		break;
+	case CKSUM_UNCOMPRESSED:
+	default:
+		data_cksum = !compressed;
+		break;
+	}
+
+	/* Call the block manager to write the block. */
+	WT_ERR(checkpoint ?
+	    bm->checkpoint(bm, session, ip, btree->ckpt, data_cksum) :
+	    bm->write(bm, session, ip, addr, addr_sizep, data_cksum));
+
+	WT_STAT_FAST_CONN_INCR(session, cache_write);
+	WT_STAT_FAST_DATA_INCR(session, cache_write);
+	WT_STAT_FAST_CONN_INCRV(session, cache_bytes_write, ip->size);
+	WT_STAT_FAST_DATA_INCRV(session, cache_bytes_write, ip->size);
+
+err:	__wt_scr_free(&tmp);
+	return (ret);
+}