summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Keith Bostic <keith@wiredtiger.com> 2013-05-11 10:15:58 -0400
committer: Keith Bostic <keith@wiredtiger.com> 2013-05-11 10:15:58 -0400
commit: bc674d1a6e42a470958823eae92220630dd09369 (patch)
tree: c7afbc60d3fc711f7c42e2b6760825815d914a54
parent: d821e9a54b5725cddb45b307831e66a2ef530523 (diff)
download: mongo-bc674d1a6e42a470958823eae92220630dd09369.tar.gz
Increase the Linux default buffer alignment to 4KB; there are known cases
where I/O to/from buffers with lesser alignment causes reads to go through the buffer cache, leading to multiple versions of a block in the system. Add asserts to the read/write calls to ensure we detect misaligned buffers.
-rw-r--r--  build_posix/configure.ac.in   4
-rw-r--r--  dist/api_data.py              5
-rw-r--r--  src/include/wiredtiger.in     6
-rw-r--r--  src/os_posix/os_rw.c         12
4 files changed, 19 insertions, 8 deletions
diff --git a/build_posix/configure.ac.in b/build_posix/configure.ac.in
index 7b469b673ca..1b4afeb30ec 100644
--- a/build_posix/configure.ac.in
+++ b/build_posix/configure.ac.in
@@ -100,13 +100,13 @@ AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[#include <pthread.h>]],[[
AC_C_BIGENDIAN
-# Linux requires _GNU_SOURCE to be defined and buffers aligned to 512 byte
+# Linux requires _GNU_SOURCE to be defined and buffers aligned to 4KB
# boundaries for O_DIRECT to work.
BUFFER_ALIGNMENT=0
if test "$ac_cv_func_posix_memalign" = "yes" ; then
case "`uname -s`" in
Linux) AM_CFLAGS="$AM_CFLAGS -D_GNU_SOURCE"
- BUFFER_ALIGNMENT=512
+ BUFFER_ALIGNMENT=4096
;;
esac
fi
diff --git a/dist/api_data.py b/dist/api_data.py
index 1b2d1e0e834..efbb1e2108c 100644
--- a/dist/api_data.py
+++ b/dist/api_data.py
@@ -532,9 +532,8 @@ methods = {
'wiredtiger_open' : Method(connection_runtime_config + [
Config('buffer_alignment', '-1', r'''
in-memory alignment (in bytes) for buffers used for I/O. The
- default value of -1 indicates that a platform-specific
- alignment value should be used (512 bytes on Linux systems,
- zero elsewhere)''',
+ default value of -1 indicates a platform-specific alignment
+ value should be used (4KB on Linux systems, zero elsewhere)''',
min='-1', max='1MB'),
Config('checkpoint', '', r'''
periodically checkpoint the database''',
diff --git a/src/include/wiredtiger.in b/src/include/wiredtiger.in
index 24a0598bc0d..fd495f8d914 100644
--- a/src/include/wiredtiger.in
+++ b/src/include/wiredtiger.in
@@ -1320,9 +1320,9 @@ struct __wt_connection {
* handler is installed that writes error messages to stderr
* @configstart{wiredtiger_open, see dist/api_data.py}
* @config{buffer_alignment, in-memory alignment (in bytes) for buffers used for
- * I/O. The default value of -1 indicates that a platform-specific alignment
- * value should be used (512 bytes on Linux systems\, zero elsewhere)., an
- * integer between -1 and 1MB; default \c -1.}
+ * I/O. The default value of -1 indicates a platform-specific alignment value
+ * should be used (4KB on Linux systems\, zero elsewhere)., an integer between
+ * -1 and 1MB; default \c -1.}
* @config{cache_size, maximum heap memory to allocate for the cache. A
* database should configure either a cache_size or a shared_cache not both., an
* integer between 1MB and 10TB; default \c 100MB.}
diff --git a/src/os_posix/os_rw.c b/src/os_posix/os_rw.c
index 2b83d961592..2bdde670256 100644
--- a/src/os_posix/os_rw.c
+++ b/src/os_posix/os_rw.c
@@ -21,6 +21,12 @@ __wt_read(WT_SESSION_IMPL *session,
"%s: read %" PRIu32 " bytes at offset %" PRIuMAX,
fh->name, bytes, (uintmax_t)offset);
+ WT_ASSERT(session, /* Assert aligned I/O is aligned. */
+ !fh->direct_io ||
+ S2C(session)->buffer_alignment == 0 ||
+ !((uintptr_t)buf &
+ (uintptr_t)(S2C(session)->buffer_alignment - 1)));
+
if (pread(fh->fd, buf, (size_t)bytes, offset) != (ssize_t)bytes)
WT_RET_MSG(session, __wt_errno(),
"%s read error: failed to read %" PRIu32
@@ -44,6 +50,12 @@ __wt_write(WT_SESSION_IMPL *session,
"%s: write %" PRIu32 " bytes at offset %" PRIuMAX,
fh->name, bytes, (uintmax_t)offset);
+ WT_ASSERT(session, /* Assert aligned I/O is aligned. */
+ !fh->direct_io ||
+ S2C(session)->buffer_alignment == 0 ||
+ !((uintptr_t)buf &
+ (uintptr_t)(S2C(session)->buffer_alignment - 1)));
+
if (pwrite(fh->fd, buf, (size_t)bytes, offset) != (ssize_t)bytes)
WT_RET_MSG(session, __wt_errno(),
"%s write error: failed to write %" PRIu32