summaryrefslogtreecommitdiff
path: root/src/block/block_read.c
blob: 1e5ce279db2b1fe93c2642626a6c28dcc6bdb408 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
/*-
 * Copyright (c) 2008-2013 WiredTiger, Inc.
 *	All rights reserved.
 *
 * See the file LICENSE for redistribution information.
 */

#include "wt_internal.h"

/*
 * __wt_bm_read --
 *	Map or read address cookie referenced block into a buffer.
 */
int
__wt_bm_read(WT_BM *bm, WT_SESSION_IMPL *session,
    WT_ITEM *buf, const uint8_t *addr, uint32_t addr_size)
{
	WT_BLOCK *block;
	off_t offset;
	uint32_t size, cksum;
	int mapped;

	WT_UNUSED(addr_size);
	block = bm->block;

	/* Crack the cookie. */
	WT_RET(__wt_block_buffer_to_addr(block, addr, &offset, &size, &cksum));

	/*
	 * Clear buffers previously used for mapped memory, we may be forced
	 * to read into this buffer.
	 */
	if (F_ISSET(buf, WT_ITEM_MAPPED))
		__wt_buf_free(session, buf);

	/*
	 * If we're going to be able to return mapped memory and the buffer
	 * has allocated memory, discard it.
	 */
	mapped = bm->map != NULL && offset + size <= (off_t)bm->maplen;
	if (buf->mem != NULL && mapped)
		__wt_buf_free(session, buf);

	/* Map the block if it's possible. */
	if (mapped) {
		buf->mem = (uint8_t *)bm->map + offset;
		buf->memsize = size;
		buf->data = buf->mem;
		buf->size = size;
		F_SET(buf, WT_ITEM_MAPPED);

		WT_CSTAT_INCR(session, block_map_read);
		WT_CSTAT_INCRV(session, block_byte_map_read, size);
		return (0);
	}

	/* Read the block. */
	WT_RET(__wt_block_read_off(session, block, buf, offset, size, cksum));

#ifdef HAVE_POSIX_FADVISE
	/* Optionally discard blocks from the system's buffer cache. */
	if (block->os_cache_max != 0 &&
	    (block->os_cache += offset) > block->os_cache_max) {
		WT_DECL_RET;

		block->os_cache = 0;
		if ((ret = posix_fadvise(block->fh->fd,
		    (off_t)0, (off_t)0, POSIX_FADV_DONTNEED)) != 0)
			WT_RET_MSG(
			    session, ret, "%s: posix_fadvise", block->name);
	}
#endif
	return (0);
}

/*
 * __wt_block_read_off --
 *	Read an addr/size pair referenced block into a buffer.
 */
int
__wt_block_read_off(WT_SESSION_IMPL *session,
    WT_BLOCK *block, WT_ITEM *buf, off_t offset, uint32_t size, uint32_t cksum)
{
	WT_BLOCK_HEADER *blk;
	uint32_t alloc_size, page_cksum;

	WT_VERBOSE_RET(session, read,
	    "off %" PRIuMAX ", size %" PRIu32 ", cksum %" PRIu32,
	    (uintmax_t)offset, size, cksum);

#ifdef HAVE_DIAGNOSTIC
	/*
	 * In diagnostic mode, verify the block we're about to read isn't on
	 * either the available or discard lists.
	 *
	 * Don't check during salvage, it's possible we're reading an already
	 * freed overflow page.
	 */
	if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR))
		WT_RET(
		    __wt_block_misplaced(session, block, "read", offset, size));
#endif

	/*
	 * Grow the buffer as necessary and read the block.  Buffers should be
	 * aligned for reading, but there are lots of buffers (for example, file
	 * cursors have two buffers each, key and value), and it's difficult to
	 * be sure we've found all of them.  If the buffer isn't aligned, it's
	 * an easy fix: set the flag and guarantee we reallocate it.  (Most of
	 * the time on reads, the buffer memory has not yet been allocated, so
	 * we're not adding any additional processing time.)
	 */
	if (F_ISSET(buf, WT_ITEM_ALIGNED))
		alloc_size = size;
	else {
		F_SET(buf, WT_ITEM_ALIGNED);
		alloc_size = (uint32_t)WT_MAX(size, buf->memsize + 10);
	}
	WT_RET(__wt_buf_init(session, buf, alloc_size));
	WT_RET(__wt_read(session, block->fh, offset, size, buf->mem));
	buf->size = size;

	blk = WT_BLOCK_HEADER_REF(buf->mem);
	blk->cksum = 0;
	page_cksum = __wt_cksum(buf->mem,
	    F_ISSET(blk, WT_BLOCK_DATA_CKSUM) ? size : WT_BLOCK_COMPRESS_SKIP);
	if (cksum != page_cksum) {
		if (!F_ISSET(session, WT_SESSION_SALVAGE_QUIET_ERR))
			__wt_errx(session,
			    "read checksum error [%"
			    PRIu32 "B @ %" PRIuMAX ", %"
			    PRIu32 " != %" PRIu32 "]",
			    size, (uintmax_t)offset, cksum, page_cksum);
		return (WT_ERROR);
	}

	WT_CSTAT_INCR(session, block_read);
	WT_CSTAT_INCRV(session, block_byte_read, size);
	return (0);
}