summaryrefslogtreecommitdiff
path: root/storage
diff options
context:
space:
mode:
authorJan Lindström <jplindst@mariadb.org>2014-02-03 10:08:15 +0200
committerJan Lindström <jplindst@mariadb.org>2014-02-03 10:08:15 +0200
commit8c5d5bc5de135ed143bfe91c99fd53a8c9b4487c (patch)
treecd274b3c051b562a306072d4ba9be6e9d7587752 /storage
parentfebe99ec8d6b30236982a127fd5d194a7deceb44 (diff)
downloadmariadb-git-8c5d5bc5de135ed143bfe91c99fd53a8c9b4487c.tar.gz
Fixed merge error on InnoDB page compression level handling.
Merged page compression feature to XtraDB storage engine. Added feature where page compression can use lz4 compression method (innodb_use_lz4, default OFF).
Diffstat (limited to 'storage')
-rw-r--r--storage/innobase/CMakeLists.txt1
-rw-r--r--storage/innobase/btr/btr0btr.cc4
-rw-r--r--storage/innobase/btr/btr0cur.cc4
-rw-r--r--storage/innobase/fil/fil0fil.cc2
-rw-r--r--storage/innobase/fil/fil0pagecompress.cc170
-rw-r--r--storage/innobase/fil/lz4.c822
-rw-r--r--storage/innobase/fil/lz4.h205
-rw-r--r--storage/innobase/handler/ha_innodb.cc44
-rw-r--r--storage/innobase/include/fil0fil.h1
-rw-r--r--storage/innobase/include/fsp0pagecompress.ic5
-rw-r--r--storage/innobase/include/page0zip.h2
-rw-r--r--storage/innobase/include/srv0srv.h7
-rw-r--r--storage/innobase/page/page0cur.cc2
-rw-r--r--storage/innobase/page/page0page.cc6
-rw-r--r--storage/innobase/page/page0zip.cc4
-rw-r--r--storage/innobase/srv/srv0srv.cc18
-rw-r--r--storage/xtradb/CMakeLists.txt4
-rw-r--r--storage/xtradb/buf/buf0buf.cc23
-rw-r--r--storage/xtradb/buf/buf0dblwr.cc26
-rw-r--r--storage/xtradb/buf/buf0flu.cc349
-rw-r--r--storage/xtradb/buf/buf0rea.cc5
-rw-r--r--storage/xtradb/dict/dict0dict.cc1
-rw-r--r--storage/xtradb/fil/fil0fil.cc152
-rw-r--r--storage/xtradb/fil/fil0pagecompress.cc324
-rw-r--r--storage/xtradb/fil/lz4.c822
-rw-r--r--storage/xtradb/fil/lz4.h205
-rw-r--r--storage/xtradb/handler/ha_innodb.cc246
-rw-r--r--storage/xtradb/handler/ha_innodb.h18
-rw-r--r--storage/xtradb/handler/handler0alter.cc28
-rw-r--r--storage/xtradb/include/buf0buf.h21
-rw-r--r--storage/xtradb/include/buf0flu.h7
-rw-r--r--storage/xtradb/include/dict0dict.h12
-rw-r--r--storage/xtradb/include/dict0dict.ic164
-rw-r--r--storage/xtradb/include/dict0mem.h56
-rw-r--r--storage/xtradb/include/dict0pagecompress.h94
-rw-r--r--storage/xtradb/include/dict0pagecompress.ic191
-rw-r--r--storage/xtradb/include/dict0types.h9
-rw-r--r--storage/xtradb/include/fil0fil.h43
-rw-r--r--storage/xtradb/include/fil0pagecompress.h118
-rw-r--r--storage/xtradb/include/fsp0fsp.h68
-rw-r--r--storage/xtradb/include/fsp0fsp.ic19
-rw-r--r--storage/xtradb/include/fsp0pagecompress.h73
-rw-r--r--storage/xtradb/include/fsp0pagecompress.ic177
-rw-r--r--storage/xtradb/include/os0file.h69
-rw-r--r--storage/xtradb/include/os0file.ic26
-rw-r--r--storage/xtradb/include/srv0mon.h11
-rw-r--r--storage/xtradb/include/srv0srv.h62
-rw-r--r--storage/xtradb/log/log0log.cc20
-rw-r--r--storage/xtradb/log/log0online.cc6
-rw-r--r--storage/xtradb/log/log0recv.cc19
-rw-r--r--storage/xtradb/os/os0file.cc553
-rw-r--r--storage/xtradb/srv/srv0mon.cc68
-rw-r--r--storage/xtradb/srv/srv0srv.cc43
-rw-r--r--storage/xtradb/srv/srv0start.cc730
54 files changed, 5839 insertions, 320 deletions
diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt
index e41d2406bd2..0b1043bc421 100644
--- a/storage/innobase/CMakeLists.txt
+++ b/storage/innobase/CMakeLists.txt
@@ -294,6 +294,7 @@ SET(INNOBASE_SOURCES
eval/eval0proc.cc
fil/fil0fil.cc
fil/fil0pagecompress.cc
+ fil/lz4.c
fsp/fsp0fsp.cc
fut/fut0fut.cc
fut/fut0lst.cc
diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc
index e3e127c3ace..3d7dc993146 100644
--- a/storage/innobase/btr/btr0btr.cc
+++ b/storage/innobase/btr/btr0btr.cc
@@ -1923,7 +1923,7 @@ btr_page_reorganize(
dict_index_t* index, /*!< in: record descriptor */
mtr_t* mtr) /*!< in: mtr */
{
- return(btr_page_reorganize_low(FALSE, page_compression_level,
+ return(btr_page_reorganize_low(FALSE, page_zip_level,
block, index, mtr));
}
#endif /* !UNIV_HOTBACKUP */
@@ -1942,7 +1942,7 @@ btr_parse_page_reorganize(
buf_block_t* block, /*!< in: page to be reorganized, or NULL */
mtr_t* mtr) /*!< in: mtr or NULL */
{
- ulint level = page_compression_level;
+ ulint level = page_zip_level;
ut_ad(ptr && end_ptr);
diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc
index ecc17188770..5feb1363867 100644
--- a/storage/innobase/btr/btr0cur.cc
+++ b/storage/innobase/btr/btr0cur.cc
@@ -1844,7 +1844,7 @@ btr_cur_update_alloc_zip(
/* Have a local copy of the variables as these can change
dynamically. */
bool log_compressed = page_log_compressed_pages;
- ulint compression_level = page_compression_level;
+ ulint compression_level = page_zip_level;
page_t* page = buf_block_get_frame(block);
ut_a(page_zip == buf_block_get_page_zip(block));
@@ -4334,7 +4334,7 @@ btr_store_big_rec_extern_fields(
heap = mem_heap_create(250000);
page_zip_set_alloc(&c_stream, heap);
- err = deflateInit2(&c_stream, page_compression_level,
+ err = deflateInit2(&c_stream, page_zip_level,
Z_DEFLATED, 15, 7, Z_DEFAULT_STRATEGY);
ut_a(err == Z_OK);
}
diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc
index 1718e68d667..3803d0a93aa 100644
--- a/storage/innobase/fil/fil0fil.cc
+++ b/storage/innobase/fil/fil0fil.cc
@@ -5303,7 +5303,7 @@ fil_io(
os_offset_t offset;
ibool ignore_nonexistent_pages;
ibool page_compressed = FALSE;
- ibool page_compression_level = 0;
+ ulint page_compression_level = 0;
is_log = type & OS_FILE_LOG;
type = type & ~OS_FILE_LOG;
diff --git a/storage/innobase/fil/fil0pagecompress.cc b/storage/innobase/fil/fil0pagecompress.cc
index 2da9d70e197..10ac273955f 100644
--- a/storage/innobase/fil/fil0pagecompress.cc
+++ b/storage/innobase/fil/fil0pagecompress.cc
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (C) 2013 SkySQL Ab. All Rights Reserved.
+Copyright (C) 2013, 2014, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -63,6 +63,7 @@ static ulint srv_data_read, srv_data_written;
#include <linux/falloc.h>
#endif
#include "row0mysql.h"
+#include "lz4.h"
/****************************************************************//**
For page compressed pages compress the page before actual write
@@ -100,7 +101,7 @@ fil_compress_page(
/* If no compression level was provided to this table, use system
default level */
if (level == 0) {
- level = srv_compress_zlib_level;
+ level = page_zip_level;
}
#ifdef UNIV_DEBUG
@@ -110,60 +111,88 @@ fil_compress_page(
#endif
write_size = UNIV_PAGE_SIZE - header_len;
- err = compress2(out_buf+header_len, &write_size, buf, len, level);
- if (err != Z_OK) {
- /* If error we leave the actual page as it was */
+ if (srv_use_lz4) {
+ err = LZ4_compress_limitedOutput((const char *)buf, (char *)out_buf+header_len, len, write_size);
+ write_size = err;
- fprintf(stderr,
- "InnoDB: Warning: Compression failed for space %lu name %s len %lu rt %d write %lu\n",
- space_id, fil_space_name(space), len, err, write_size);
+ if (err == 0) {
+ /* If error we leave the actual page as it was */
+
+ fprintf(stderr,
+ "InnoDB: Warning: Compression failed for space %lu name %s len %lu rt %d write %lu\n",
+ space_id, fil_space_name(space), len, err, write_size);
+
+ *out_len = len;
+ return (buf);
+ }
+ } else {
+ err = compress2(out_buf+header_len, &write_size, buf, len, level);
+
+ if (err != Z_OK) {
+ /* If error we leave the actual page as it was */
+
+ fprintf(stderr,
+ "InnoDB: Warning: Compression failed for space %lu name %s len %lu rt %d write %lu\n",
+ space_id, fil_space_name(space), len, err, write_size);
+
+ *out_len = len;
+ return (buf);
+ }
+ }
- *out_len = len;
- return (buf);
+ /* Set up the page header */
+ memcpy(out_buf, buf, FIL_PAGE_DATA);
+ /* Set up the checksum */
+ mach_write_to_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM, BUF_NO_CHECKSUM_MAGIC);
+ /* Set up the correct page type */
+ mach_write_to_2(out_buf+FIL_PAGE_TYPE, FIL_PAGE_PAGE_COMPRESSED);
+ /* Store the compression algorithm identifier in the flush LSN field */
+ if (srv_use_lz4) {
+ mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN, FIL_PAGE_COMPRESSION_LZ4);
} else {
- /* Set up the page header */
- memcpy(out_buf, buf, FIL_PAGE_DATA);
- /* Set up the checksum */
- mach_write_to_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM, BUF_NO_CHECKSUM_MAGIC);
- /* Set up the correct page type */
- mach_write_to_2(out_buf+FIL_PAGE_TYPE, FIL_PAGE_PAGE_COMPRESSED);
- /* Set up the flush lsn to be compression algorithm */
mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN, FIL_PAGE_COMPRESSION_ZLIB);
- /* Set up the actual payload lenght */
- mach_write_to_2(out_buf+FIL_PAGE_DATA, write_size);
+ }
+ /* Set up the actual payload length */
+ mach_write_to_2(out_buf+FIL_PAGE_DATA, write_size);
#ifdef UNIV_DEBUG
- /* Verify */
- ut_ad(fil_page_is_compressed(out_buf));
- ut_ad(mach_read_from_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM) == BUF_NO_CHECKSUM_MAGIC);
- ut_ad(mach_read_from_2(out_buf+FIL_PAGE_DATA) == write_size);
+ /* Verify */
+ ut_ad(fil_page_is_compressed(out_buf));
+ ut_ad(mach_read_from_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM) == BUF_NO_CHECKSUM_MAGIC);
+ ut_ad(mach_read_from_2(out_buf+FIL_PAGE_DATA) == write_size);
+ if (srv_use_lz4) {
+ ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN) == FIL_PAGE_COMPRESSION_LZ4);
+ } else {
ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN) == FIL_PAGE_COMPRESSION_ZLIB);
+ }
#endif
- write_size+=header_len;
- /* Actual write needs to be alligned on block size */
- if (write_size % OS_FILE_LOG_BLOCK_SIZE) {
- write_size = (write_size + (OS_FILE_LOG_BLOCK_SIZE - (write_size % OS_FILE_LOG_BLOCK_SIZE)));
- }
+ write_size+=header_len;
+ /* Actual write needs to be aligned on block size */
+ if (write_size % OS_FILE_LOG_BLOCK_SIZE) {
+ write_size = (write_size + (OS_FILE_LOG_BLOCK_SIZE - (write_size % OS_FILE_LOG_BLOCK_SIZE)));
+ }
#ifdef UNIV_DEBUG
- fprintf(stderr,
- "InnoDB: Note: Compression succeeded for space %lu name %s len %lu out_len %lu\n",
- space_id, fil_space_name(space), len, write_size);
+ fprintf(stderr,
+ "InnoDB: Note: Compression succeeded for space %lu name %s len %lu out_len %lu\n",
+ space_id, fil_space_name(space), len, write_size);
#endif
+
#define SECT_SIZE 512
- srv_stats.page_compression_saved.add((len - write_size));
- if ((len - write_size) > 0) {
- srv_stats.page_compression_trim_sect512.add(((len - write_size) / SECT_SIZE));
- srv_stats.page_compression_trim_sect4096.add(((len - write_size) / (SECT_SIZE*8)));
- }
- //srv_stats.page_compressed_trim_op.inc();
- srv_stats.pages_page_compressed.inc();
- *out_len = write_size;
- return(out_buf);
+ srv_stats.page_compression_saved.add((len - write_size));
+ if ((len - write_size) > 0) {
+ srv_stats.page_compression_trim_sect512.add(((len - write_size) / SECT_SIZE));
+ srv_stats.page_compression_trim_sect4096.add(((len - write_size) / (SECT_SIZE*8)));
}
+ //srv_stats.page_compressed_trim_op.inc();
+ srv_stats.pages_page_compressed.inc();
+ *out_len = write_size;
+
+ return(out_buf);
+
}
/****************************************************************//**
@@ -203,16 +232,30 @@ fil_decompress_page(
/* Get compression algorithm */
compression_alg = mach_read_from_8(buf+FIL_PAGE_FILE_FLUSH_LSN);
- if (compression_alg == FIL_PAGE_COMPRESSION_ZLIB) {
- // If no buffer was given, we need to allocate temporal buffer
- if (page_buf == NULL) {
- in_buf = static_cast<byte *>(ut_malloc(UNIV_PAGE_SIZE));
- } else {
- in_buf = page_buf;
- }
+ // If no buffer was given, we need to allocate a temporary buffer
+ if (page_buf == NULL) {
+#ifdef UNIV_DEBUG
+ fprintf(stderr,
+ "InnoDB: Note: Compression buffer not given, allocating...\n");
+#endif
+ in_buf = static_cast<byte *>(ut_malloc(UNIV_PAGE_SIZE));
+ } else {
+ in_buf = page_buf;
+ }
+
+ /* Get the actual size of compressed page */
+ actual_size = mach_read_from_2(buf+FIL_PAGE_DATA);
+ /* Check if payload size is corrupted */
+ if (actual_size == 0 || actual_size > UNIV_PAGE_SIZE) {
+ fprintf(stderr,
+ "InnoDB: Corruption: We try to uncompress corrupted page\n"
+ "InnoDB: actual size %lu compression %s\n",
+ actual_size, fil_get_compression_alg_name(compression_alg));
+ fflush(stderr);
+ ut_error;
+ }
- /* Get the actual size of compressed page */
- actual_size = mach_read_from_2(buf+FIL_PAGE_DATA);
+ if (compression_alg == FIL_PAGE_COMPRESSION_ZLIB) {
#ifdef UNIV_DEBUG
fprintf(stderr,
@@ -242,17 +285,19 @@ fil_decompress_page(
"InnoDB: Note: Decompression succeeded for len %lu \n",
len);
#endif
+ } else if (compression_alg == FIL_PAGE_COMPRESSION_LZ4) {
+ err = LZ4_decompress_fast((const char *)buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, (char *)in_buf, UNIV_PAGE_SIZE);
- /* Copy the uncompressed page to the buffer pool, not
- really any other options. */
- memcpy(buf, in_buf, len);
+ if (err != actual_size) {
+ fprintf(stderr,
+ "InnoDB: Corruption: Page is marked as compressed\n"
+ "InnoDB: but decompression read only %d bytes.\n"
+ "InnoDB: size %lu len %lu\n",
+ err, actual_size, len);
+ fflush(stderr);
- // Need to free temporal buffer if no buffer was given
- if (page_buf == NULL) {
- ut_free(in_buf);
+ ut_error;
}
-
- srv_stats.pages_page_decompressed.inc();
} else {
fprintf(stderr,
"InnoDB: Corruption: Page is marked as compressed\n"
@@ -263,6 +308,17 @@ fil_decompress_page(
fflush(stderr);
ut_error;
}
+
+ srv_stats.pages_page_decompressed.inc();
+
+ /* Copy the uncompressed page to the buffer pool, not
+ really any other options. */
+ memcpy(buf, in_buf, len);
+
+ // Need to free the temporary buffer if no buffer was given
+ if (page_buf == NULL) {
+ ut_free(in_buf);
+ }
}
diff --git a/storage/innobase/fil/lz4.c b/storage/innobase/fil/lz4.c
new file mode 100644
index 00000000000..4e864de67d3
--- /dev/null
+++ b/storage/innobase/fil/lz4.c
@@ -0,0 +1,822 @@
+/*
+ LZ4 - Fast LZ compression algorithm
+ Copyright (C) 2011-2013, Yann Collet.
+ BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following disclaimer
+ in the documentation and/or other materials provided with the
+ distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ You can contact the author at :
+ - LZ4 source repository : http://code.google.com/p/lz4/
+ - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+
+//**************************************
+// Tuning parameters
+//**************************************
+// MEMORY_USAGE :
+// Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+// Increasing memory usage improves compression ratio
+// Reduced memory usage can improve speed, due to cache effect
+// Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache
+#define MEMORY_USAGE 14
+
+// HEAPMODE :
+// Select how default compression functions will allocate memory for their hash table,
+// in memory stack (0:default, fastest), or in memory heap (1:requires memory allocation (malloc)).
+#define HEAPMODE 0
+
+
+//**************************************
+// CPU Feature Detection
+//**************************************
+// 32 or 64 bits ?
+#if (defined(__x86_64__) || defined(_M_X64) || defined(_WIN64) \
+ || defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) \
+ || defined(__64BIT__) || defined(_LP64) || defined(__LP64__) \
+ || defined(__ia64) || defined(__itanium__) || defined(_M_IA64) ) // Detects 64 bits mode
+# define LZ4_ARCH64 1
+#else
+# define LZ4_ARCH64 0
+#endif
+
+// Little Endian or Big Endian ?
+// Overwrite the #define below if you know your architecture endianness
+#if defined (__GLIBC__)
+# include <endian.h>
+# if (__BYTE_ORDER == __BIG_ENDIAN)
+# define LZ4_BIG_ENDIAN 1
+# endif
+#elif (defined(__BIG_ENDIAN__) || defined(__BIG_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) || defined(_LITTLE_ENDIAN))
+# define LZ4_BIG_ENDIAN 1
+#elif defined(__sparc) || defined(__sparc__) \
+ || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) \
+ || defined(__hpux) || defined(__hppa) \
+ || defined(_MIPSEB) || defined(__s390__)
+# define LZ4_BIG_ENDIAN 1
+#else
+// Little Endian assumed. PDP Endian and other very rare endian formats are unsupported.
+#endif
+
+// Unaligned memory access is automatically enabled for "common" CPU, such as x86.
+// For other CPUs, such as ARM, the compiler may be more cautious, inserting unnecessary extra code to ensure aligned access
+// If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance
+#if defined(__ARM_FEATURE_UNALIGNED)
+# define LZ4_FORCE_UNALIGNED_ACCESS 1
+#endif
+
+// Define this parameter if your target system or compiler does not support hardware bit count
+#if defined(_MSC_VER) && defined(_WIN32_WCE) // Visual Studio for Windows CE does not support Hardware bit count
+# define LZ4_FORCE_SW_BITCOUNT
+#endif
+
+// BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE :
+// This option may provide a small boost to performance for some big endian cpu, although probably modest.
+// You may set this option to 1 if data will remain within closed environment.
+// This option is useless on Little_Endian CPU (such as x86)
+//#define BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1
+
+
+//**************************************
+// Compiler Options
+//**************************************
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) // C99
+/* "restrict" is a known keyword */
+#else
+# define restrict // Disable restrict
+#endif
+
+#ifdef _MSC_VER // Visual Studio
+# define FORCE_INLINE static __forceinline
+# include <intrin.h> // For Visual 2005
+# if LZ4_ARCH64 // 64-bits
+# pragma intrinsic(_BitScanForward64) // For Visual 2005
+# pragma intrinsic(_BitScanReverse64) // For Visual 2005
+# else // 32-bits
+# pragma intrinsic(_BitScanForward) // For Visual 2005
+# pragma intrinsic(_BitScanReverse) // For Visual 2005
+# endif
+# pragma warning(disable : 4127) // disable: C4127: conditional expression is constant
+#else
+# ifdef __GNUC__
+# define FORCE_INLINE static inline __attribute__((always_inline))
+# else
+# define FORCE_INLINE static inline
+# endif
+#endif
+
+#ifdef _MSC_VER
+# define lz4_bswap16(x) _byteswap_ushort(x)
+#else
+# define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | (((x) & 0xffu) << 8)))
+#endif
+
+#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+#if (GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__)
+# define expect(expr,value) (__builtin_expect ((expr),(value)) )
+#else
+# define expect(expr,value) (expr)
+#endif
+
+#define likely(expr) expect((expr) != 0, 1)
+#define unlikely(expr) expect((expr) != 0, 0)
+
+
+//**************************************
+// Memory routines
+//**************************************
+#include <stdlib.h> // malloc, calloc, free
+#define ALLOCATOR(n,s) calloc(n,s)
+#define FREEMEM free
+#include <string.h> // memset, memcpy
+#define MEM_INIT memset
+
+
+//**************************************
+// Includes
+//**************************************
+#include "lz4.h"
+
+
+//**************************************
+// Basic Types
+//**************************************
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L // C99
+# include <stdint.h>
+ typedef uint8_t BYTE;
+ typedef uint16_t U16;
+ typedef uint32_t U32;
+ typedef int32_t S32;
+ typedef uint64_t U64;
+#else
+ typedef unsigned char BYTE;
+ typedef unsigned short U16;
+ typedef unsigned int U32;
+ typedef signed int S32;
+ typedef unsigned long long U64;
+#endif
+
+#if defined(__GNUC__) && !defined(LZ4_FORCE_UNALIGNED_ACCESS)
+# define _PACKED __attribute__ ((packed))
+#else
+# define _PACKED
+#endif
+
+#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__)
+# if defined(__IBMC__) || defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+# pragma pack(1)
+# else
+# pragma pack(push, 1)
+# endif
+#endif
+
+typedef struct { U16 v; } _PACKED U16_S;
+typedef struct { U32 v; } _PACKED U32_S;
+typedef struct { U64 v; } _PACKED U64_S;
+typedef struct {size_t v;} _PACKED size_t_S;
+
+#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__)
+# if defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+# pragma pack(0)
+# else
+# pragma pack(pop)
+# endif
+#endif
+
+#define A16(x) (((U16_S *)(x))->v)
+#define A32(x) (((U32_S *)(x))->v)
+#define A64(x) (((U64_S *)(x))->v)
+#define AARCH(x) (((size_t_S *)(x))->v)
+
+
+//**************************************
+// Constants
+//**************************************
+#define LZ4_HASHLOG (MEMORY_USAGE-2)
+#define HASHTABLESIZE (1 << MEMORY_USAGE)
+#define HASHNBCELLS4 (1 << LZ4_HASHLOG)
+
+#define MINMATCH 4
+
+#define COPYLENGTH 8
+#define LASTLITERALS 5
+#define MFLIMIT (COPYLENGTH+MINMATCH)
+const int LZ4_minLength = (MFLIMIT+1);
+
+#define LZ4_64KLIMIT ((1<<16) + (MFLIMIT-1))
+#define SKIPSTRENGTH 6 // Increasing this value will make the compression run slower on incompressible data
+
+#define MAXD_LOG 16
+#define MAX_DISTANCE ((1 << MAXD_LOG) - 1)
+
+#define ML_BITS 4
+#define ML_MASK ((1U<<ML_BITS)-1)
+#define RUN_BITS (8-ML_BITS)
+#define RUN_MASK ((1U<<RUN_BITS)-1)
+
+#define KB *(1U<<10)
+#define MB *(1U<<20)
+#define GB *(1U<<30)
+
+
+//**************************************
+// Structures and local types
+//**************************************
+
+typedef struct {
+ U32 hashTable[HASHNBCELLS4];
+ const BYTE* bufferStart;
+ const BYTE* base;
+ const BYTE* nextBlock;
+} LZ4_Data_Structure;
+
+typedef enum { notLimited = 0, limited = 1 } limitedOutput_directive;
+typedef enum { byPtr, byU32, byU16 } tableType_t;
+
+typedef enum { noPrefix = 0, withPrefix = 1 } prefix64k_directive;
+
+typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive;
+typedef enum { full = 0, partial = 1 } earlyEnd_directive;
+
+
+//**************************************
+// Architecture-specific macros
+//**************************************
+#define STEPSIZE sizeof(size_t)
+#define LZ4_COPYSTEP(d,s) { AARCH(d) = AARCH(s); d+=STEPSIZE; s+=STEPSIZE; }
+#define LZ4_COPY8(d,s) { LZ4_COPYSTEP(d,s); if (STEPSIZE<8) LZ4_COPYSTEP(d,s); }
+#define LZ4_SECURECOPY(d,s,e) { if ((STEPSIZE==4)||(d<e)) LZ4_WILDCOPY(d,s,e); }
+
+#if LZ4_ARCH64 // 64-bit
+# define HTYPE U32
+# define INITBASE(base) const BYTE* const base = ip
+#else // 32-bit
+# define HTYPE const BYTE*
+# define INITBASE(base) const int base = 0
+#endif
+
+#if (defined(LZ4_BIG_ENDIAN) && !defined(BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE))
+# define LZ4_READ_LITTLEENDIAN_16(d,s,p) { U16 v = A16(p); v = lz4_bswap16(v); d = (s) - v; }
+# define LZ4_WRITE_LITTLEENDIAN_16(p,i) { U16 v = (U16)(i); v = lz4_bswap16(v); A16(p) = v; p+=2; }
+#else // Little Endian
+# define LZ4_READ_LITTLEENDIAN_16(d,s,p) { d = (s) - A16(p); }
+# define LZ4_WRITE_LITTLEENDIAN_16(p,v) { A16(p) = v; p+=2; }
+#endif
+
+
+//**************************************
+// Macros
+//**************************************
+#define LZ4_WILDCOPY(d,s,e) { do { LZ4_COPY8(d,s) } while (d<e); } // at the end, d>=e;
+
+
+//****************************
+// Private functions
+//****************************
+#if LZ4_ARCH64
+
+FORCE_INLINE int LZ4_NbCommonBytes (register U64 val)
+{
+# if defined(LZ4_BIG_ENDIAN)
+# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
+ unsigned long r = 0;
+ _BitScanReverse64( &r, val );
+ return (int)(r>>3);
+# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
+ return (__builtin_clzll(val) >> 3);
+# else
+ int r;
+ if (!(val>>32)) { r=4; } else { r=0; val>>=32; }
+ if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
+ r += (!val);
+ return r;
+# endif
+# else
+# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
+ unsigned long r = 0;
+ _BitScanForward64( &r, val );
+ return (int)(r>>3);
+# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
+ return (__builtin_ctzll(val) >> 3);
+# else
+ static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
+ return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
+# endif
+# endif
+}
+
+#else
+
+FORCE_INLINE int LZ4_NbCommonBytes (register U32 val)
+{
+# if defined(LZ4_BIG_ENDIAN)
+# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
+ unsigned long r = 0;
+ _BitScanReverse( &r, val );
+ return (int)(r>>3);
+# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
+ return (__builtin_clz(val) >> 3);
+# else
+ int r;
+ if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
+ r += (!val);
+ return r;
+# endif
+# else
+# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
+ unsigned long r;
+ _BitScanForward( &r, val );
+ return (int)(r>>3);
+# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
+ return (__builtin_ctz(val) >> 3);
+# else
+ static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
+ return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
+# endif
+# endif
+}
+
+#endif
+
+
+//****************************
+// Compression functions
+//****************************
+FORCE_INLINE int LZ4_hashSequence(U32 sequence, tableType_t tableType)
+{
+ if (tableType == byU16)
+ return (((sequence) * 2654435761U) >> ((MINMATCH*8)-(LZ4_HASHLOG+1)));
+ else
+ return (((sequence) * 2654435761U) >> ((MINMATCH*8)-LZ4_HASHLOG));
+}
+
+FORCE_INLINE int LZ4_hashPosition(const BYTE* p, tableType_t tableType) { return LZ4_hashSequence(A32(p), tableType); }
+
+FORCE_INLINE void LZ4_putPositionOnHash(const BYTE* p, U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase)
+{
+ switch (tableType)
+ {
+ case byPtr: { const BYTE** hashTable = (const BYTE**) tableBase; hashTable[h] = p; break; }
+ case byU32: { U32* hashTable = (U32*) tableBase; hashTable[h] = (U32)(p-srcBase); break; }
+ case byU16: { U16* hashTable = (U16*) tableBase; hashTable[h] = (U16)(p-srcBase); break; }
+ }
+}
+
+FORCE_INLINE void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase)
+{
+ U32 h = LZ4_hashPosition(p, tableType);
+ LZ4_putPositionOnHash(p, h, tableBase, tableType, srcBase);
+}
+
+FORCE_INLINE const BYTE* LZ4_getPositionOnHash(U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase)
+{
+ if (tableType == byPtr) { const BYTE** hashTable = (const BYTE**) tableBase; return hashTable[h]; }
+ if (tableType == byU32) { U32* hashTable = (U32*) tableBase; return hashTable[h] + srcBase; }
+ { U16* hashTable = (U16*) tableBase; return hashTable[h] + srcBase; } // default, to ensure a return
+}
+
+FORCE_INLINE const BYTE* LZ4_getPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase)
+{
+ U32 h = LZ4_hashPosition(p, tableType);
+ return LZ4_getPositionOnHash(h, tableBase, tableType, srcBase);
+}
+
+
+FORCE_INLINE int LZ4_compress_generic(
+ void* ctx,
+ const char* source,
+ char* dest,
+ int inputSize,
+ int maxOutputSize,
+
+ limitedOutput_directive limitedOutput,
+ tableType_t tableType,
+ prefix64k_directive prefix)
+{
+ const BYTE* ip = (const BYTE*) source;
+ const BYTE* const base = (prefix==withPrefix) ? ((LZ4_Data_Structure*)ctx)->base : (const BYTE*) source;
+ const BYTE* const lowLimit = ((prefix==withPrefix) ? ((LZ4_Data_Structure*)ctx)->bufferStart : (const BYTE*)source);
+ const BYTE* anchor = (const BYTE*) source;
+ const BYTE* const iend = ip + inputSize;
+ const BYTE* const mflimit = iend - MFLIMIT;
+ const BYTE* const matchlimit = iend - LASTLITERALS;
+
+ BYTE* op = (BYTE*) dest;
+ BYTE* const oend = op + maxOutputSize;
+
+ int length;
+ const int skipStrength = SKIPSTRENGTH;
+ U32 forwardH;
+
+ // Init conditions
+ if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) return 0; // Unsupported input size, too large (or negative)
+ if ((prefix==withPrefix) && (ip != ((LZ4_Data_Structure*)ctx)->nextBlock)) return 0; // must continue from end of previous block
+ if (prefix==withPrefix) ((LZ4_Data_Structure*)ctx)->nextBlock=iend; // do it now, due to potential early exit
+ if ((tableType == byU16) && (inputSize>=LZ4_64KLIMIT)) return 0; // Size too large (not within 64K limit)
+ if (inputSize<LZ4_minLength) goto _last_literals; // Input too small, no compression (all literals)
+
+ // First Byte
+ LZ4_putPosition(ip, ctx, tableType, base);
+ ip++; forwardH = LZ4_hashPosition(ip, tableType);
+
+ // Main Loop
+ for ( ; ; )
+ {
+ int findMatchAttempts = (1U << skipStrength) + 3;
+ const BYTE* forwardIp = ip;
+ const BYTE* ref;
+ BYTE* token;
+
+ // Find a match
+ do {
+ U32 h = forwardH;
+ int step = findMatchAttempts++ >> skipStrength;
+ ip = forwardIp;
+ forwardIp = ip + step;
+
+ if unlikely(forwardIp > mflimit) { goto _last_literals; }
+
+ forwardH = LZ4_hashPosition(forwardIp, tableType);
+ ref = LZ4_getPositionOnHash(h, ctx, tableType, base);
+ LZ4_putPositionOnHash(ip, h, ctx, tableType, base);
+
+ } while ((ref + MAX_DISTANCE < ip) || (A32(ref) != A32(ip)));
+
+ // Catch up
+ while ((ip>anchor) && (ref > lowLimit) && unlikely(ip[-1]==ref[-1])) { ip--; ref--; }
+
+ // Encode Literal length
+ length = (int)(ip - anchor);
+ token = op++;
+ if ((limitedOutput) && unlikely(op + length + (2 + 1 + LASTLITERALS) + (length/255) > oend)) return 0; // Check output limit
+ if (length>=(int)RUN_MASK)
+ {
+ int len = length-RUN_MASK;
+ *token=(RUN_MASK<<ML_BITS);
+ for(; len >= 255 ; len-=255) *op++ = 255;
+ *op++ = (BYTE)len;
+ }
+ else *token = (BYTE)(length<<ML_BITS);
+
+ // Copy Literals
+ { BYTE* end=(op)+(length); LZ4_WILDCOPY(op,anchor,end); op=end; }
+
+_next_match:
+ // Encode Offset
+ LZ4_WRITE_LITTLEENDIAN_16(op,(U16)(ip-ref));
+
+ // Start Counting
+ ip+=MINMATCH; ref+=MINMATCH; // MinMatch already verified
+ anchor = ip;
+ while likely(ip<matchlimit-(STEPSIZE-1))
+ {
+ size_t diff = AARCH(ref) ^ AARCH(ip);
+ if (!diff) { ip+=STEPSIZE; ref+=STEPSIZE; continue; }
+ ip += LZ4_NbCommonBytes(diff);
+ goto _endCount;
+ }
+ if (LZ4_ARCH64) if ((ip<(matchlimit-3)) && (A32(ref) == A32(ip))) { ip+=4; ref+=4; }
+ if ((ip<(matchlimit-1)) && (A16(ref) == A16(ip))) { ip+=2; ref+=2; }
+ if ((ip<matchlimit) && (*ref == *ip)) ip++;
+_endCount:
+
+ // Encode MatchLength
+ length = (int)(ip - anchor);
+ if ((limitedOutput) && unlikely(op + (1 + LASTLITERALS) + (length>>8) > oend)) return 0; // Check output limit
+ if (length>=(int)ML_MASK)
+ {
+ *token += ML_MASK;
+ length -= ML_MASK;
+ for (; length > 509 ; length-=510) { *op++ = 255; *op++ = 255; }
+ if (length >= 255) { length-=255; *op++ = 255; }
+ *op++ = (BYTE)length;
+ }
+ else *token += (BYTE)(length);
+
+ // Test end of chunk
+ if (ip > mflimit) { anchor = ip; break; }
+
+ // Fill table
+ LZ4_putPosition(ip-2, ctx, tableType, base);
+
+ // Test next position
+ ref = LZ4_getPosition(ip, ctx, tableType, base);
+ LZ4_putPosition(ip, ctx, tableType, base);
+ if ((ref + MAX_DISTANCE >= ip) && (A32(ref) == A32(ip))) { token = op++; *token=0; goto _next_match; }
+
+ // Prepare next loop
+ anchor = ip++;
+ forwardH = LZ4_hashPosition(ip, tableType);
+ }
+
+_last_literals:
+ // Encode Last Literals
+ {
+ int lastRun = (int)(iend - anchor);
+ if ((limitedOutput) && (((char*)op - dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize)) return 0; // Check output limit
+ if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK<<ML_BITS); lastRun-=RUN_MASK; for(; lastRun >= 255 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; }
+ else *op++ = (BYTE)(lastRun<<ML_BITS);
+ memcpy(op, anchor, iend - anchor);
+ op += iend-anchor;
+ }
+
+ // End
+ return (int) (((char*)op)-dest);
+}
+
+
+int LZ4_compress(const char* source, char* dest, int inputSize) // Compresses inputSize bytes of source into dest; returns the number of bytes written, or 0 on failure
+{
+#if (HEAPMODE)
+ void* ctx = ALLOCATOR(HASHNBCELLS4, 4); // Aligned on 4-bytes boundaries
+#else
+ U32 ctx[1U<<(MEMORY_USAGE-2)] = {0}; // Ensure data is aligned on 4-bytes boundaries
+#endif
+ int result;
+
+#if (HEAPMODE)
+ if (!ctx) return 0; // Hash table allocation failed: report failure instead of handing a null table to the compressor
+#endif
+
+ // Inputs below LZ4_64KLIMIT use the byU16 table; larger inputs use byU32 with 8-byte pointers, byPtr otherwise
+ if (inputSize < (int)LZ4_64KLIMIT)
+ result = LZ4_compress_generic((void*)ctx, source, dest, inputSize, 0, notLimited, byU16, noPrefix);
+ else
+ result = LZ4_compress_generic((void*)ctx, source, dest, inputSize, 0, notLimited, (sizeof(void*)==8) ? byU32 : byPtr, noPrefix);
+
+#if (HEAPMODE)
+ FREEMEM(ctx);
+#endif
+ return result;
+}
+
+int LZ4_compress_continue (void* LZ4_Data, const char* source, char* dest, int inputSize)
+{
+ // Streaming variant without output limit: byU32 table, withPrefix mode, state supplied by the caller
+ int written;
+
+ written = LZ4_compress_generic(LZ4_Data, source, dest, inputSize, 0, notLimited, byU32, withPrefix);
+ return written;
+}
+
+
+int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize) // Like LZ4_compress(), but passes maxOutputSize as the output limit; returns bytes written, or 0 on failure
+{
+#if (HEAPMODE)
+ void* ctx = ALLOCATOR(HASHNBCELLS4, 4); // Aligned on 4-bytes boundaries
+#else
+ U32 ctx[1U<<(MEMORY_USAGE-2)] = {0}; // Ensure data is aligned on 4-bytes boundaries
+#endif
+ int result;
+
+#if (HEAPMODE)
+ if (!ctx) return 0; // Hash table allocation failed: report failure instead of handing a null table to the compressor
+#endif
+
+ // Inputs below LZ4_64KLIMIT use the byU16 table; larger inputs use byU32 with 8-byte pointers, byPtr otherwise
+ if (inputSize < (int)LZ4_64KLIMIT)
+ result = LZ4_compress_generic((void*)ctx, source, dest, inputSize, maxOutputSize, limited, byU16, noPrefix);
+ else
+ result = LZ4_compress_generic((void*)ctx, source, dest, inputSize, maxOutputSize, limited, (sizeof(void*)==8) ? byU32 : byPtr, noPrefix);
+
+#if (HEAPMODE)
+ FREEMEM(ctx);
+#endif
+ return result;
+}
+
+int LZ4_compress_limitedOutput_continue (void* LZ4_Data, const char* source, char* dest, int inputSize, int maxOutputSize)
+{
+ // Streaming variant with an output limit: byU32 table, withPrefix mode, state supplied by the caller
+ int written;
+
+ written = LZ4_compress_generic(LZ4_Data, source, dest, inputSize, maxOutputSize, limited, byU32, withPrefix);
+ return written;
+}
+
+
+//****************************
+// Stream functions
+//****************************
+
+FORCE_INLINE void LZ4_init(LZ4_Data_Structure* lz4ds, const BYTE* base)
+{
+ // All three buffer pointers start at the same position
+ lz4ds->nextBlock = base;
+ lz4ds->base = base;
+ lz4ds->bufferStart = base;
+
+ // Start from an empty hash table
+ MEM_INIT(lz4ds->hashTable, 0, sizeof(lz4ds->hashTable));
+}
+
+
+void* LZ4_create (const char* inputBuffer) // Allocates the streaming state and initialises it on inputBuffer; returns NULL if the allocation fails
+{
+ void* lz4ds = ALLOCATOR(1, sizeof(LZ4_Data_Structure));
+ if (lz4ds) LZ4_init ((LZ4_Data_Structure*)lz4ds, (const BYTE*)inputBuffer); // Skip initialisation when the allocation failed, so the caller receives NULL instead of a crash
+ return lz4ds;
+}
+
+
+int LZ4_free (void* LZ4_Data)
+{
+ // Hand the streaming state back through FREEMEM; the result is always 0
+ FREEMEM(LZ4_Data);
+ return 0;
+}
+
+
+char* LZ4_slideInputBuffer (void* LZ4_Data) // Copies the last 64 KB of input to the start of the buffer and returns the position where the next block must start
+{
+ LZ4_Data_Structure* lz4ds = (LZ4_Data_Structure*)LZ4_Data;
+ size_t delta = lz4ds->nextBlock - (lz4ds->bufferStart + 64 KB); // distance by which the retained 64 KB window moves back
+
+ if ( (lz4ds->base - delta > lz4ds->base) // underflow control
+ || ((size_t)(lz4ds->nextBlock - lz4ds->base) > 0xE0000000) ) // close to 32-bits limit
+ { // base cannot simply be shifted by delta: rewrite the stored hash table positions instead
+ size_t deltaLimit = (lz4ds->nextBlock - 64 KB) - lz4ds->base; // offset from base of the first retained byte
+ int nH;
+
+ for (nH=0; nH < HASHNBCELLS4; nH++)
+ {
+ if ((size_t)(lz4ds->hashTable[nH]) < deltaLimit) lz4ds->hashTable[nH] = 0; // entry lies before the retained window: reset it
+ else lz4ds->hashTable[nH] -= (U32)deltaLimit; // entry lies inside the window: re-express it against the new base
+ }
+ memcpy((void*)(lz4ds->bufferStart), (const void*)(lz4ds->nextBlock - 64 KB), 64 KB);
+ lz4ds->base = lz4ds->bufferStart; // base restarts at the beginning of the buffer
+ lz4ds->nextBlock = lz4ds->base + 64 KB;
+ }
+ else
+ { // otherwise: move the window, then shift nextBlock and base back by the same delta
+ memcpy((void*)(lz4ds->bufferStart), (const void*)(lz4ds->nextBlock - 64 KB), 64 KB); // NOTE(review): source and destination overlap if nextBlock is less than 128 KB past bufferStart; presumably excluded by the documented 192 KB minimum buffer - confirm
+ lz4ds->nextBlock -= delta;
+ lz4ds->base -= delta;
+ }
+
+ return (char*)(lz4ds->nextBlock);
+}
+
+
+//****************************
+// Decompression functions
+//****************************
+
+// Generic LZ4 block decoder covering all use cases; the LZ4_decompress_*() wrappers
+// instantiate it several times, each with a different set of constant directives.
+// Note that it is essential this generic function is really inlined,
+// in order to remove useless branches during compilation optimisation.
+FORCE_INLINE int LZ4_decompress_generic(
+ const char* source, // compressed input
+ char* dest, // output buffer
+ int inputSize, // size of the compressed input; the endOnOutputSize wrappers pass 0
+ int outputSize, // If endOnInput==endOnInputSize, this value is the max size of Output Buffer.
+
+ int endOnInput, // endOnOutputSize, endOnInputSize
+ int prefix64k, // noPrefix, withPrefix
+ int partialDecoding, // full, partial
+ int targetOutputSize // only used if partialDecoding==partial
+ )
+{
+ // Local Variables
+ const BYTE* restrict ip = (const BYTE*) source; // read cursor in the compressed input
+ const BYTE* ref; // source position of the match being copied
+ const BYTE* const iend = ip + inputSize; // one past the end of the input
+
+ BYTE* op = (BYTE*) dest; // write cursor in the output
+ BYTE* const oend = op + outputSize; // one past the end of the output buffer
+ BYTE* cpy; // end position of the copy in progress
+ BYTE* oexit = op + targetOutputSize; // output position at which partial decoding may stop
+
+ const size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; // static reduces speed for LZ4_decompress_safe() on GCC64
+ static const size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3};
+
+
+ // Special cases
+ if ((partialDecoding) && (oexit> oend-MFLIMIT)) oexit = oend-MFLIMIT; // targetOutputSize too high => decode everything
+ if ((endOnInput) && unlikely(outputSize==0)) return ((inputSize==1) && (*ip==0)) ? 0 : -1; // Empty output buffer
+ if ((!endOnInput) && unlikely(outputSize==0)) return (*ip==0?1:-1); // Nothing to produce: accepted only when the first input byte is 0, reported as 1 byte read
+
+
+ // Main Loop
+ while (1)
+ {
+ unsigned token; // (token>>ML_BITS) is the literal run length, (token&ML_MASK) the match length
+ size_t length;
+
+ // get runlength
+ token = *ip++;
+ if ((length=(token>>ML_BITS)) == RUN_MASK)
+ {
+ unsigned s=255;
+ while (((endOnInput)?ip<iend:1) && (s==255)) // extension bytes are added until one differs from 255
+ {
+ s = *ip++;
+ length += s; // NOTE(review): the accumulated length is not checked for overflow here; compare with later upstream LZ4 releases
+ }
+ }
+
+ // copy literals
+ cpy = op+length;
+ if (((endOnInput) && ((cpy>(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) )
+ || ((!endOnInput) && (cpy>oend-COPYLENGTH)))
+ { // too close to the end of a buffer for a wild copy: validate, copy exactly, then stop
+ if (partialDecoding)
+ {
+ if (cpy > oend) goto _output_error; // Error : write attempt beyond end of output buffer
+ if ((endOnInput) && (ip+length > iend)) goto _output_error; // Error : read attempt beyond end of input buffer
+ }
+ else
+ {
+ if ((!endOnInput) && (cpy != oend)) goto _output_error; // Error : block decoding must stop exactly there
+ if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error; // Error : input must be consumed
+ }
+ memcpy(op, ip, length);
+ ip += length;
+ op += length;
+ break; // Necessarily EOF, due to parsing restrictions
+ }
+ LZ4_WILDCOPY(op, ip, cpy); ip -= (op-cpy); op = cpy; // pull ip back by the amount op ran past cpy, then settle op on cpy
+
+ // get offset
+ LZ4_READ_LITTLEENDIAN_16(ref,cpy,ip); ip+=2; // the offset occupies 2 input bytes
+ if ((prefix64k==noPrefix) && unlikely(ref < (BYTE* const)dest)) goto _output_error; // Error : offset outside destination buffer
+
+ // get matchlength
+ if ((length=(token&ML_MASK)) == ML_MASK)
+ {
+ while ((!endOnInput) || (ip<iend-(LASTLITERALS+1))) // Ensure enough bytes remain for LASTLITERALS + token
+ {
+ unsigned s = *ip++;
+ length += s;
+ if (s==255) continue;
+ break;
+ }
+ }
+
+ // copy repeated sequence
+ if unlikely((op-ref)<(int)STEPSIZE) // match source lies less than STEPSIZE bytes behind the write cursor
+ {
+ const size_t dec64 = dec64table[(sizeof(void*)==4) ? 0 : op-ref];
+ op[0] = ref[0]; // the first four bytes are copied one at a time
+ op[1] = ref[1];
+ op[2] = ref[2];
+ op[3] = ref[3];
+ op += 4, ref += 4; ref -= dec32table[op-ref];
+ A32(op) = A32(ref);
+ op += STEPSIZE-4; ref -= dec64;
+ } else { LZ4_COPYSTEP(op,ref); }
+ cpy = op + length - (STEPSIZE-4);
+
+ if unlikely(cpy>oend-COPYLENGTH-(STEPSIZE-4))
+ { // match ends near the end of the output buffer: take the bounded copy path
+ if (cpy > oend-LASTLITERALS) goto _output_error; // Error : last 5 bytes must be literals
+ LZ4_SECURECOPY(op, ref, (oend-COPYLENGTH));
+ while(op<cpy) *op++=*ref++; // remaining bytes are copied one at a time
+ op=cpy;
+ continue;
+ }
+ LZ4_WILDCOPY(op, ref, cpy);
+ op=cpy; // correction
+ }
+
+ // end of decoding
+ if (endOnInput)
+ return (int) (((char*)op)-dest); // Nb of output bytes decoded
+ else
+ return (int) (((char*)ip)-source); // Nb of input bytes read
+
+ // Overflow error detected
+_output_error:
+ return (int) (-(((char*)ip)-source))-1; // negative result derived from the input position reached
+}
+
+
+int LZ4_decompress_safe(const char* source, char* dest, int inputSize, int maxOutputSize)
+{
+ // Directive set: stop on input size, no 64 KB prefix, full decoding
+ int decoded;
+
+ decoded = LZ4_decompress_generic(source, dest, inputSize, maxOutputSize, endOnInputSize, noPrefix, full, 0);
+ return decoded;
+}
+
+int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int inputSize, int maxOutputSize)
+{
+ // Directive set: stop on input size, 64 KB prefix allowed, full decoding
+ int decoded;
+
+ decoded = LZ4_decompress_generic(source, dest, inputSize, maxOutputSize, endOnInputSize, withPrefix, full, 0);
+ return decoded;
+}
+
+int LZ4_decompress_safe_partial(const char* source, char* dest, int inputSize, int targetOutputSize, int maxOutputSize)
+{
+ // Directive set: stop on input size, no 64 KB prefix, partial decoding aimed at targetOutputSize
+ int decoded;
+
+ decoded = LZ4_decompress_generic(source, dest, inputSize, maxOutputSize, endOnInputSize, noPrefix, partial, targetOutputSize);
+ return decoded;
+}
+
+int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int outputSize)
+{
+ // Directive set: stop on output size (input size passed as 0), 64 KB prefix allowed, full decoding
+ int consumed;
+
+ consumed = LZ4_decompress_generic(source, dest, 0, outputSize, endOnOutputSize, withPrefix, full, 0);
+ return consumed;
+}
+
+int LZ4_decompress_fast(const char* source, char* dest, int outputSize)
+{
+ // Stops on output size (input size passed as 0); only the prefix directive depends on the compiler
+ int consumed;
+
+#ifdef _MSC_VER // This version is faster with Visual
+ consumed = LZ4_decompress_generic(source, dest, 0, outputSize, endOnOutputSize, noPrefix, full, 0);
+#else
+ consumed = LZ4_decompress_generic(source, dest, 0, outputSize, endOnOutputSize, withPrefix, full, 0);
+#endif
+ return consumed;
+}
+
diff --git a/storage/innobase/fil/lz4.h b/storage/innobase/fil/lz4.h
new file mode 100644
index 00000000000..9ef58862947
--- /dev/null
+++ b/storage/innobase/fil/lz4.h
@@ -0,0 +1,205 @@
+/*
+ LZ4 - Fast LZ compression algorithm
+ Header File
+ Copyright (C) 2011-2013, Yann Collet.
+ BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following disclaimer
+ in the documentation and/or other materials provided with the
+ distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ You can contact the author at :
+ - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
+ - LZ4 source repository : http://code.google.com/p/lz4/
+*/
+#pragma once
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+//**************************************
+// Compiler Options
+//**************************************
+#if defined(_MSC_VER) && !defined(__cplusplus) // Visual Studio
+# define inline __inline // Visual C is not C99, but supports some kind of inline
+#endif
+
+
+//****************************
+// Simple Functions
+//****************************
+
+int LZ4_compress (const char* source, char* dest, int inputSize);
+int LZ4_decompress_safe (const char* source, char* dest, int inputSize, int maxOutputSize);
+
+/*
+LZ4_compress() :
+ Compresses 'inputSize' bytes from 'source' into 'dest'.
+ Destination buffer must be already allocated,
+ and must be sized to handle worst cases situations (input data not compressible)
+ Worst case size evaluation is provided by function LZ4_compressBound()
+ inputSize : Max supported value is LZ4_MAX_INPUT_SIZE
+ return : the number of bytes written in buffer dest
+ or 0 if the compression fails
+
+LZ4_decompress_safe() :
+ maxOutputSize : is the size of the destination buffer (which must be already allocated)
+ return : the number of bytes decoded in the destination buffer (necessarily <= maxOutputSize)
+ If the source stream is detected malformed, the function will stop decoding and return a negative result.
+ This function is protected against buffer overflow exploits (never writes outside of output buffer, and never reads outside of input buffer). Therefore, it is protected against malicious data packets
+*/
+
+
+//****************************
+// Advanced Functions
+//****************************
+#define LZ4_MAX_INPUT_SIZE 0x7E000000 // 2 113 929 216 bytes
+#define LZ4_COMPRESSBOUND(isize) ((unsigned int)(isize) > (unsigned int)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16)
+static inline int LZ4_compressBound(int isize)
+{
+ // Function form of the LZ4_COMPRESSBOUND macro
+ return LZ4_COMPRESSBOUND(isize);
+}
+
+/*
+LZ4_compressBound() :
+ Provides the maximum size that LZ4 may output in a "worst case" scenario (input data not compressible)
+ primarily useful for memory allocation of output buffer.
+ inline function is recommended for the general case,
+ macro is also provided when result needs to be evaluated at compilation (such as stack memory allocation).
+
+ isize : is the input size. Max supported value is LZ4_MAX_INPUT_SIZE
+ return : maximum output size in a "worst case" scenario
+ or 0, if input size is too large ( > LZ4_MAX_INPUT_SIZE)
+*/
+
+
+int LZ4_compress_limitedOutput (const char* source, char* dest, int inputSize, int maxOutputSize);
+
+/*
+LZ4_compress_limitedOutput() :
+ Compress 'inputSize' bytes from 'source' into an output buffer 'dest' of maximum size 'maxOutputSize'.
+ If it cannot achieve it, compression will stop, and result of the function will be zero.
+ This function never writes outside of provided output buffer.
+
+ inputSize : Max supported value is LZ4_MAX_INPUT_SIZE
+ maxOutputSize : is the size of the destination buffer (which must be already allocated)
+ return : the number of bytes written in buffer 'dest'
+ or 0 if the compression fails
+*/
+
+
+int LZ4_decompress_fast (const char* source, char* dest, int outputSize);
+
+/*
+LZ4_decompress_fast() :
+ outputSize : is the original (uncompressed) size
+ return : the number of bytes read from the source buffer (in other words, the compressed size)
+ If the source stream is malformed, the function will stop decoding and return a negative result.
+ note : This function is a bit faster than LZ4_decompress_safe()
+ This function never writes outside of output buffers, but may read beyond input buffer in case of malicious data packet.
+ Use this function preferably into a trusted environment (data to decode comes from a trusted source).
+ Destination buffer must be already allocated. Its size must be a minimum of 'outputSize' bytes.
+*/
+
+int LZ4_decompress_safe_partial (const char* source, char* dest, int inputSize, int targetOutputSize, int maxOutputSize);
+
+/*
+LZ4_decompress_safe_partial() :
+ This function decompresses a compressed block of size 'inputSize' at position 'source'
+ into output buffer 'dest' of size 'maxOutputSize'.
+ The function tries to stop decompressing operation as soon as 'targetOutputSize' has been reached,
+ reducing decompression time.
+ return : the number of bytes decoded in the destination buffer (necessarily <= maxOutputSize)
+ Note : this number can be < 'targetOutputSize' should the compressed block to decode be smaller.
+ Always control how many bytes were decoded.
+ If the source stream is detected malformed, the function will stop decoding and return a negative result.
+ This function never writes outside of output buffer, and never reads outside of input buffer. It is therefore protected against malicious data packets
+*/
+
+
+//****************************
+// Stream Functions
+//****************************
+
+void* LZ4_create (const char* inputBuffer);
+int LZ4_compress_continue (void* LZ4_Data, const char* source, char* dest, int inputSize);
+int LZ4_compress_limitedOutput_continue (void* LZ4_Data, const char* source, char* dest, int inputSize, int maxOutputSize);
+char* LZ4_slideInputBuffer (void* LZ4_Data);
+int LZ4_free (void* LZ4_Data);
+
+/*
+These functions allow the compression of dependent blocks, where each block benefits from prior 64 KB within preceding blocks.
+In order to achieve this, it is necessary to start creating the LZ4 Data Structure, thanks to the function :
+
+void* LZ4_create (const char* inputBuffer);
+The result of the function is the (void*) pointer on the LZ4 Data Structure.
+This pointer will be needed in all other functions.
+If the pointer returned is NULL, then the allocation has failed, and compression must be aborted.
+The only parameter 'const char* inputBuffer' must, obviously, point at the beginning of input buffer.
+The input buffer must be already allocated, and size at least 192KB.
+'inputBuffer' will also be the 'const char* source' of the first block.
+
+All blocks are expected to lay next to each other within the input buffer, starting from 'inputBuffer'.
+To compress each block, use either LZ4_compress_continue() or LZ4_compress_limitedOutput_continue().
+Their behavior is identical to that of LZ4_compress() or LZ4_compress_limitedOutput(),
+but require the LZ4 Data Structure as their first argument, and check that each block starts right after the previous one.
+If next block does not begin immediately after the previous one, the compression will fail (return 0).
+
+When it's no longer possible to lay the next block after the previous one (not enough space left into input buffer), a call to :
+char* LZ4_slideInputBuffer(void* LZ4_Data);
+must be performed. It will typically copy the latest 64KB of input at the beginning of input buffer.
+Note that, for this function to work properly, minimum size of an input buffer must be 192KB.
+==> The memory position where the next input data block must start is provided as the result of the function.
+
+Compression can then resume, using LZ4_compress_continue() or LZ4_compress_limitedOutput_continue(), as usual.
+
+When compression is completed, a call to LZ4_free() will release the memory used by the LZ4 Data Structure.
+*/
+
+
+int LZ4_decompress_safe_withPrefix64k (const char* source, char* dest, int inputSize, int maxOutputSize);
+int LZ4_decompress_fast_withPrefix64k (const char* source, char* dest, int outputSize);
+
+/*
+*_withPrefix64k() :
+ These decoding functions work the same as their "normal name" versions,
+ but can use up to 64KB of data in front of 'char* dest'.
+ These functions are necessary to decode inter-dependent blocks.
+*/
+
+
+//****************************
+// Obsolete Functions
+//****************************
+
+static inline int LZ4_uncompress (const char* source, char* dest, int outputSize)
+{
+ // Obsolete name: forwards unchanged to LZ4_decompress_fast()
+ return LZ4_decompress_fast(source, dest, outputSize);
+}
+static inline int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize)
+{
+ // Obsolete name: forwards unchanged to LZ4_decompress_safe()
+ return LZ4_decompress_safe(source, dest, isize, maxOutputSize);
+}
+
+/*
+These functions are deprecated and should no longer be used.
+They are provided here for compatibility with existing user programs.
+*/
+
+
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index eda7da81d5c..d4ce4eb9c4f 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -4,7 +4,7 @@ Copyright (c) 2000, 2012, Oracle and/or its affiliates. All rights reserved.
Copyright (c) 2008, 2009 Google Inc.
Copyright (c) 2009, Percona Inc.
Copyright (c) 2012, Facebook Inc.
-Copyright (c) 2013, SkySQL Ab.
+Copyright (c) 2013, 2014, SkySQL Ab.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -15430,29 +15430,6 @@ innodb_reset_all_monitor_update(
}
/****************************************************************//**
-Update the system variable innodb_compression_level using the "saved"
-value. This function is registered as a callback with MySQL. */
-static
-void
-innodb_compression_level_update(
-/*============================*/
- THD* thd, /*!< in: thread handle */
- struct st_mysql_sys_var* var, /*!< in: pointer to
- system variable */
- void* var_ptr,/*!< out: where the
- formal string goes */
- const void* save) /*!< in: immediate result
- from check function */
-{
- /* We have this call back just to avoid confusion between
- ulong and ulint datatypes. */
- innobase_compression_level =
- (*static_cast<const ulong*>(save));
- page_compression_level =
- (static_cast<const ulint>(innobase_compression_level));
-}
-
-/****************************************************************//**
Parse and enable InnoDB monitor counters during server startup.
User can list the monitor counters/groups to be enable by specifying
"loose-innodb_monitor_enable=monitor_name1;monitor_name2..."
@@ -16140,11 +16117,11 @@ static MYSQL_SYSVAR_ULONG(replication_delay, srv_replication_delay,
"innodb_thread_concurrency is reached (0 by default)",
NULL, NULL, 0, 0, ~0UL, 0);
-static MYSQL_SYSVAR_ULONG(compression_level, innobase_compression_level,
+static MYSQL_SYSVAR_UINT(compression_level, page_zip_level,
PLUGIN_VAR_RQCMDARG,
- "Compression level used for compressed row format. 0 is no compression"
+ "Compression level used for zlib compression. 0 is no compression"
", 1 is fastest, 9 is best compression and default is 6.",
- NULL, innodb_compression_level_update,
+ NULL, NULL,
DEFAULT_COMPRESSION_LEVEL, 0, 9, 0);
static MYSQL_SYSVAR_LONG(additional_mem_pool_size, innobase_additional_mem_pool_size,
@@ -16620,11 +16597,6 @@ static MYSQL_SYSVAR_LONG(trim_pct, srv_trim_pct,
"How many percent of compressed pages should be trimmed",
NULL, NULL, 100, 0, 100, 0);
-static MYSQL_SYSVAR_LONG(compress_zlib_level, srv_compress_zlib_level,
- PLUGIN_VAR_OPCMDARG ,
- "Default zlib compression level",
- NULL, NULL, 6, 0, 9, 0);
-
static MYSQL_SYSVAR_BOOL(compress_index_pages, srv_page_compress_index_pages,
PLUGIN_VAR_OPCMDARG,
"Use page compression for only index pages.",
@@ -16635,6 +16607,12 @@ static MYSQL_SYSVAR_BOOL(use_trim, srv_use_trim,
"Use trim.",
NULL, NULL, TRUE);
+static MYSQL_SYSVAR_BOOL(use_lz4, srv_use_lz4,
+ PLUGIN_VAR_OPCMDARG ,
+ "Use LZ4 for page compression",
+ NULL, NULL, FALSE);
+
+
static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(additional_mem_pool_size),
MYSQL_SYSVAR(api_trx_level),
@@ -16782,9 +16760,9 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
#endif /* UNIV_DEBUG */
MYSQL_SYSVAR(compress_pages),
MYSQL_SYSVAR(trim_pct),
- MYSQL_SYSVAR(compress_zlib_level),
MYSQL_SYSVAR(compress_index_pages),
MYSQL_SYSVAR(use_trim),
+ MYSQL_SYSVAR(use_lz4),
NULL
};
diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h
index 01084d52365..918a92fa811 100644
--- a/storage/innobase/include/fil0fil.h
+++ b/storage/innobase/include/fil0fil.h
@@ -134,6 +134,7 @@ extern fil_addr_t fil_addr_null;
actual payload data size on
compressed pages. */
#define FIL_PAGE_COMPRESSION_ZLIB 1 /*!< Compressin algorithm ZLIB. */
+#define FIL_PAGE_COMPRESSION_LZ4 2 /*!< Compression algorithm LZ4. */
/* @} */
/** File page trailer @{ */
diff --git a/storage/innobase/include/fsp0pagecompress.ic b/storage/innobase/include/fsp0pagecompress.ic
index 755d91b3cd9..10f9d30d1f8 100644
--- a/storage/innobase/include/fsp0pagecompress.ic
+++ b/storage/innobase/include/fsp0pagecompress.ic
@@ -1,6 +1,6 @@
/*****************************************************************************
-Copyright (C) 2013 SkySQL Ab. All Rights Reserved.
+Copyright (C) 2013,2014 SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -144,6 +144,9 @@ fil_get_compression_alg_name(
case FIL_PAGE_COMPRESSION_ZLIB:
return ("ZLIB");
break;
+ case FIL_PAGE_COMPRESSION_LZ4:
+ return ("LZ4");
+ break;
default:
return("UNKNOWN");
break;
diff --git a/storage/innobase/include/page0zip.h b/storage/innobase/include/page0zip.h
index 12781bd61b8..89260d0984e 100644
--- a/storage/innobase/include/page0zip.h
+++ b/storage/innobase/include/page0zip.h
@@ -41,7 +41,7 @@ Created June 2005 by Marko Makela
#include "mem0mem.h"
/* Compression level to be used by zlib. Settable by user. */
-extern ulint page_compression_level;
+extern uint page_zip_level;
/* Default compression level. */
#define DEFAULT_COMPRESSION_LEVEL 6
diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
index f4fa8b434fe..a11c213d534 100644
--- a/storage/innobase/include/srv0srv.h
+++ b/storage/innobase/include/srv0srv.h
@@ -3,7 +3,7 @@
Copyright (c) 1995, 2012, Oracle and/or its affiliates. All rights reserved.
Copyright (c) 2008, 2009, Google Inc.
Copyright (c) 2009, Percona Inc.
-Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
+Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -254,9 +254,8 @@ extern my_bool srv_use_posix_fallocate;
/* Use atomic writes i.e disable doublewrite buffer */
extern my_bool srv_use_atomic_writes;
-/* Default zlib compression level */
-extern long srv_compress_zlib_level;
-
+/* If this flag IS TRUE, then we use lz4 to compress/decompress pages */
+extern my_bool srv_use_lz4;
#ifdef __WIN__
extern ibool srv_use_native_conditions;
diff --git a/storage/innobase/page/page0cur.cc b/storage/innobase/page/page0cur.cc
index f416d38cc35..9d6a62cae8f 100644
--- a/storage/innobase/page/page0cur.cc
+++ b/storage/innobase/page/page0cur.cc
@@ -1180,7 +1180,7 @@ page_cur_insert_rec_zip_reorg(
/* Make a local copy as the values can change dynamically. */
bool log_compressed = page_log_compressed_pages;
- ulint level = page_compression_level;
+ ulint level = page_zip_level;
/* Recompress or reorganize and recompress the page. */
if (page_zip_compress(page_zip, page, index, level,
diff --git a/storage/innobase/page/page0page.cc b/storage/innobase/page/page0page.cc
index 6b7b8424856..bf73a249f95 100644
--- a/storage/innobase/page/page0page.cc
+++ b/storage/innobase/page/page0page.cc
@@ -514,7 +514,7 @@ page_create_zip(
mach_write_to_2(page + PAGE_HEADER + PAGE_LEVEL, level);
if (!page_zip_compress(page_zip, page, index,
- page_compression_level, mtr)) {
+ page_zip_level, mtr)) {
/* The compression of a newly created page
should always succeed. */
ut_error;
@@ -663,7 +663,7 @@ page_copy_rec_list_end(
if (!page_zip_compress(new_page_zip,
new_page,
index,
- page_compression_level,
+ page_zip_level,
mtr)) {
/* Before trying to reorganize the page,
store the number of preceding records on the page. */
@@ -788,7 +788,7 @@ page_copy_rec_list_start(
goto zip_reorganize;);
if (!page_zip_compress(new_page_zip, new_page, index,
- page_compression_level, mtr)) {
+ page_zip_level, mtr)) {
ulint ret_pos;
#ifndef DBUG_OFF
diff --git a/storage/innobase/page/page0zip.cc b/storage/innobase/page/page0zip.cc
index dee37580002..3fba6216430 100644
--- a/storage/innobase/page/page0zip.cc
+++ b/storage/innobase/page/page0zip.cc
@@ -69,7 +69,7 @@ UNIV_INTERN mysql_pfs_key_t page_zip_stat_per_index_mutex_key;
#endif /* !UNIV_HOTBACKUP */
/* Compression level to be used by zlib. Settable by user. */
-UNIV_INTERN ulint page_compression_level = 6;
+UNIV_INTERN uint page_zip_level = DEFAULT_COMPRESSION_LEVEL;
/* Whether or not to log compressed page images to avoid possible
compression algorithm changes in zlib. */
@@ -4631,7 +4631,7 @@ page_zip_reorganize(
mtr_set_log_mode(mtr, log_mode);
if (!page_zip_compress(page_zip, page, index,
- page_compression_level, mtr)) {
+ page_zip_level, mtr)) {
#ifndef UNIV_HOTBACKUP
buf_block_free(temp_block);
diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc
index 90864cee9ef..cffd3f928c3 100644
--- a/storage/innobase/srv/srv0srv.cc
+++ b/storage/innobase/srv/srv0srv.cc
@@ -3,6 +3,7 @@
Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, 2009 Google Inc.
Copyright (c) 2009, Percona Inc.
+Copyright (c) 2013, 2014, SkySQL Ab.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -147,21 +148,20 @@ UNIV_INTERN my_bool srv_use_native_aio = TRUE;
/* If this flag is TRUE, then we will use page compression
to the pages */
-UNIV_INTERN my_bool srv_compress_pages = FALSE;
+UNIV_INTERN my_bool srv_compress_pages = FALSE;
/* If this flag is TRUE, then we will use page compression
only for index pages */
-UNIV_INTERN my_bool srv_page_compress_index_pages = FALSE;
-UNIV_INTERN long srv_trim_pct = 100;
-/* Default compression level if page compression is used and no compression
-level is set for the table*/
-UNIV_INTERN long srv_compress_zlib_level = 6;
+UNIV_INTERN my_bool srv_page_compress_index_pages = FALSE;
+UNIV_INTERN long srv_trim_pct = 100;
/* If this flag is TRUE, then we will use fallocate(PUCH_HOLE)
to the pages */
-UNIV_INTERN my_bool srv_use_trim = TRUE;
+UNIV_INTERN my_bool srv_use_trim = TRUE;
/* If this flag is TRUE, then we will use posix fallocate for file extentsion */
-UNIV_INTERN my_bool srv_use_posix_fallocate = FALSE;
+UNIV_INTERN my_bool srv_use_posix_fallocate = FALSE;
/* If this flag is TRUE, then we disable doublewrite buffer */
-UNIV_INTERN my_bool srv_use_atomic_writes = FALSE;
+UNIV_INTERN my_bool srv_use_atomic_writes = FALSE;
+/* If this flag IS TRUE, then we use lz4 to compress/decompress pages */
+UNIV_INTERN my_bool srv_use_lz4 = FALSE;
#ifdef __WIN__
/* Windows native condition variables. We use runtime loading / function
diff --git a/storage/xtradb/CMakeLists.txt b/storage/xtradb/CMakeLists.txt
index 282db2ddf31..5050ca34da9 100644
--- a/storage/xtradb/CMakeLists.txt
+++ b/storage/xtradb/CMakeLists.txt
@@ -284,6 +284,8 @@ SET(INNOBASE_SOURCES
buf/buf0flu.cc
buf/buf0lru.cc
buf/buf0rea.cc
+# TODO: JAN uncomment
+# buf/buf0mtflu.cc
data/data0data.cc
data/data0type.cc
dict/dict0boot.cc
@@ -297,6 +299,8 @@ SET(INNOBASE_SOURCES
eval/eval0eval.cc
eval/eval0proc.cc
fil/fil0fil.cc
+ fil/fil0pagecompress.cc
+ fil/lz4.c
fsp/fsp0fsp.cc
fut/fut0fut.cc
fut/fut0lst.cc
diff --git a/storage/xtradb/buf/buf0buf.cc b/storage/xtradb/buf/buf0buf.cc
index d4b170028d9..b995e3ee737 100644
--- a/storage/xtradb/buf/buf0buf.cc
+++ b/storage/xtradb/buf/buf0buf.cc
@@ -2,6 +2,7 @@
Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, Google Inc.
+Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -3371,6 +3372,7 @@ buf_page_init_low(
bpage->access_time = 0;
bpage->newest_modification = 0;
bpage->oldest_modification = 0;
+ bpage->write_size = 0;
HASH_INVALIDATE(bpage, hash);
bpage->is_corrupt = FALSE;
#if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG
@@ -5501,3 +5503,24 @@ buf_page_init_for_backup_restore(
}
}
#endif /* !UNIV_HOTBACKUP */
+
+/*********************************************************************//**
+Acquire the LRU list mutex of a buffer pool instance.
+Despite the function name, the only mutex taken here is
+buf_pool->LRU_list_mutex. */
+void
+buf_pool_mutex_enter(
+/*=================*/
+ buf_pool_t* buf_pool) /*!< in: buffer pool whose LRU list mutex is taken */
+{
+ /* Checked before locking: the mutex must not be owned yet.
+ NOTE(review): assumes ut_ad() is a debug-only assertion and that
+ mutex_own() reports ownership by the calling thread - confirm. */
+ ut_ad(!mutex_own(&buf_pool->LRU_list_mutex));
+ mutex_enter(&buf_pool->LRU_list_mutex);
+}
+/*********************************************************************//**
+Release the LRU list mutex of a buffer pool instance.
+Despite the function name, the only mutex released here is
+buf_pool->LRU_list_mutex. */
+void
+buf_pool_mutex_exit(
+/*================*/
+ buf_pool_t* buf_pool) /*!< in: buffer pool whose LRU list mutex is released */
+{
+ /* Checked before unlocking: the mutex must currently be owned.
+ NOTE(review): assumes ut_ad() is a debug-only assertion and that
+ mutex_own() reports ownership by the calling thread - confirm. */
+ ut_ad(mutex_own(&buf_pool->LRU_list_mutex));
+ mutex_exit(&buf_pool->LRU_list_mutex);
+}
diff --git a/storage/xtradb/buf/buf0dblwr.cc b/storage/xtradb/buf/buf0dblwr.cc
index 506a5b177ba..30b41dc754e 100644
--- a/storage/xtradb/buf/buf0dblwr.cc
+++ b/storage/xtradb/buf/buf0dblwr.cc
@@ -1,6 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -382,7 +383,7 @@ buf_dblwr_init_or_restore_pages(
buffer */
fil_io(OS_FILE_READ, true, TRX_SYS_SPACE, 0, TRX_SYS_PAGE_NO, 0,
- UNIV_PAGE_SIZE, read_buf, NULL);
+ UNIV_PAGE_SIZE, read_buf, NULL, 0);
doublewrite = read_buf + TRX_SYS_DOUBLEWRITE;
if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC)
@@ -418,11 +419,11 @@ buf_dblwr_init_or_restore_pages(
fil_io(OS_FILE_READ, true, TRX_SYS_SPACE, 0, block1, 0,
TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
- buf, NULL);
+ buf, NULL, 0);
fil_io(OS_FILE_READ, true, TRX_SYS_SPACE, 0, block2, 0,
TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
buf + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE,
- NULL);
+ NULL, 0);
/* Check if any of these pages is half-written in data files, in the
intended position */
@@ -450,7 +451,7 @@ buf_dblwr_init_or_restore_pages(
}
fil_io(OS_FILE_WRITE, true, 0, 0, source_page_no, 0,
- UNIV_PAGE_SIZE, page, NULL);
+ UNIV_PAGE_SIZE, page, NULL, 0);
} else {
space_id = mach_read_from_4(
@@ -492,7 +493,7 @@ buf_dblwr_init_or_restore_pages(
fil_io(OS_FILE_READ, true, space_id, zip_size,
page_no, 0,
zip_size ? zip_size : UNIV_PAGE_SIZE,
- read_buf, NULL);
+ read_buf, NULL, 0);
/* Check if the page is corrupt */
@@ -544,7 +545,7 @@ buf_dblwr_init_or_restore_pages(
fil_io(OS_FILE_WRITE, true, space_id,
zip_size, page_no, 0,
zip_size ? zip_size : UNIV_PAGE_SIZE,
- page, NULL);
+ page, NULL, 0);
ib_logf(IB_LOG_LEVEL_INFO,
"Recovered the page from"
@@ -763,7 +764,7 @@ buf_dblwr_write_block_to_datafile(
buf_page_get_page_no(bpage), 0,
buf_page_get_zip_size(bpage),
(void*) bpage->zip.data,
- (void*) bpage);
+ (void*) bpage, 0);
return;
}
@@ -775,7 +776,8 @@ buf_dblwr_write_block_to_datafile(
fil_io(flags, sync, buf_block_get_space(block), 0,
buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE,
- (void*) block->frame, (void*) block);
+ (void*) block->frame, (void*) block,
+ (ulint *)&bpage->write_size);
}
/********************************************************************//**
@@ -869,7 +871,7 @@ try_again:
fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
buf_dblwr->block1, 0, len,
- (void*) write_buf, NULL);
+ (void*) write_buf, NULL, 0);
if (buf_dblwr->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) {
/* No unwritten pages in the second block. */
@@ -885,7 +887,7 @@ try_again:
fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
buf_dblwr->block2, 0, len,
- (void*) write_buf, NULL);
+ (void*) write_buf, NULL, 0);
flush:
/* increment the doublewrite flushed pages counter */
@@ -1115,14 +1117,14 @@ retry:
fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
offset, 0, UNIV_PAGE_SIZE,
(void*) (buf_dblwr->write_buf
- + UNIV_PAGE_SIZE * i), NULL);
+ + UNIV_PAGE_SIZE * i), NULL, 0);
} else {
/* It is a regular page. Write it directly to the
doublewrite buffer */
fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0,
offset, 0, UNIV_PAGE_SIZE,
(void*) ((buf_block_t*) bpage)->frame,
- NULL);
+ NULL, 0);
}
/* Now flush the doublewrite buffer data to disk */
diff --git a/storage/xtradb/buf/buf0flu.cc b/storage/xtradb/buf/buf0flu.cc
index abcee504d2e..3c030eb60ee 100644
--- a/storage/xtradb/buf/buf0flu.cc
+++ b/storage/xtradb/buf/buf0flu.cc
@@ -1,6 +1,8 @@
/*****************************************************************************
Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved.
+Copyright (c) 2013, 2014, Fusion-io. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -48,6 +50,7 @@ Created 11/11/1995 Heikki Tuuri
#include "srv0mon.h"
#include "mysql/plugin.h"
#include "mysql/service_thd_wait.h"
+#include "fil0pagecompress.h"
/** Number of pages flushed through non flush_list flushes. */
// static ulint buf_lru_flush_page_count = 0;
@@ -71,11 +74,6 @@ in thrashing. */
/* @} */
-/** Handled page counters for a single flush */
-struct flush_counters_t {
- ulint flushed; /*!< number of dirty pages flushed */
- ulint evicted; /*!< number of clean pages evicted */
-};
/******************************************************************//**
Increases flush_list size in bytes with zip_size for compressed page,
@@ -721,8 +719,10 @@ buf_flush_write_complete(
buf_pool->n_flush[flush_type]--;
- /* fprintf(stderr, "n pending flush %lu\n",
- buf_pool->n_flush[flush_type]); */
+#ifdef UNIV_DEBUG
+ fprintf(stderr, "n pending flush %lu\n",
+ buf_pool->n_flush[flush_type]);
+#endif
if (buf_pool->n_flush[flush_type] == 0
&& buf_pool->init_flush[flush_type] == FALSE) {
@@ -880,6 +880,8 @@ buf_flush_write_block_low(
{
ulint zip_size = buf_page_get_zip_size(bpage);
page_t* frame = NULL;
+ ulint space_id = buf_page_get_space(bpage);
+ atomic_writes_t awrites = fil_space_get_atomic_writes(space_id);
#ifdef UNIV_DEBUG
buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
@@ -955,12 +957,26 @@ buf_flush_write_block_low(
sync, buf_page_get_space(bpage), zip_size,
buf_page_get_page_no(bpage), 0,
zip_size ? zip_size : UNIV_PAGE_SIZE,
- frame, bpage);
- } else if (flush_type == BUF_FLUSH_SINGLE_PAGE) {
- buf_dblwr_write_single_page(bpage, sync);
+ frame, bpage, &bpage->write_size);
} else {
- ut_ad(!sync);
- buf_dblwr_add_to_batch(bpage);
+ /* InnoDB uses doublewrite buffer and doublewrite buffer
+ is initialized. User can define do we use atomic writes
+ on a file space (table) or not. If atomic writes are
+ not used we should use doublewrite buffer and if
+ atomic writes should be used, no doublewrite buffer
+ is used. */
+
+ if (awrites == ATOMIC_WRITES_ON) {
+ fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER,
+ FALSE, buf_page_get_space(bpage), zip_size,
+ buf_page_get_page_no(bpage), 0,
+ zip_size ? zip_size : UNIV_PAGE_SIZE,
+ frame, bpage, &bpage->write_size);
+ } else if (flush_type == BUF_FLUSH_SINGLE_PAGE) {
+ buf_dblwr_write_single_page(bpage, sync);
+ } else {
+ buf_dblwr_add_to_batch(bpage);
+ }
}
/* When doing single page flushing the IO is done synchronously
@@ -1747,7 +1763,6 @@ end up waiting for these latches! NOTE 2: in the case of a flush list flush,
the calling thread is not allowed to own any latches on pages!
@return number of blocks for which the write request was queued */
__attribute__((nonnull))
-static
void
buf_flush_batch(
/*============*/
@@ -1806,7 +1821,6 @@ buf_flush_batch(
/******************************************************************//**
Gather the aggregated stats for both flush list and LRU list flushing */
-static
void
buf_flush_common(
/*=============*/
@@ -1833,7 +1847,6 @@ buf_flush_common(
/******************************************************************//**
Start a buffer flush batch for LRU or flush list */
-static
ibool
buf_flush_start(
/*============*/
@@ -1862,7 +1875,6 @@ buf_flush_start(
/******************************************************************//**
End a buffer flush batch for LRU or flush list */
-static
void
buf_flush_end(
/*==========*/
@@ -1912,11 +1924,55 @@ buf_flush_wait_batch_end(
}
} else {
thd_wait_begin(NULL, THD_WAIT_DISKIO);
- os_event_wait(buf_pool->no_flush[type]);
+ os_event_wait(buf_pool->no_flush[type]);
thd_wait_end(NULL);
}
}
+/* JAN: TODO: */
+/*******************************************************************//**
+Runs one LRU flush batch on a single buffer pool instance. The batch
+flushes dirty blocks from the end of the LRU list and also puts
+replaceable clean pages from the end of the LRU list to the free list.
+Only the number of flushed pages is reported back to the caller.
+NOTE: The calling thread is not allowed to own any latches on pages!
+@return true if the batch was run; false if buf_flush_start() refused
+to start it because another batch of same type was already running */
+static
+bool
+pgcomp_buf_flush_LRU(
+/*==========*/
+	buf_pool_t*	buf_pool,	/*!< in/out: buffer pool instance */
+	ulint		min_n,		/*!< in: wished minimum number of
+					blocks flushed (it is not guaranteed
+					that the actual number is that big,
+					though) */
+	ulint*		n_processed)	/*!< out: the number of pages which
+					were processed is passed back to
+					caller. Ignored if NULL */
+{
+	flush_counters_t	counters;
+	const bool		want_count = (n_processed != NULL);
+
+	/* The out parameter is zeroed even when no batch gets started. */
+	if (want_count) {
+		*n_processed = 0;
+	}
+
+	if (buf_flush_start(buf_pool, BUF_FLUSH_LRU)) {
+
+		buf_flush_batch(buf_pool, BUF_FLUSH_LRU, min_n, 0, false,
+				&counters);
+
+		buf_flush_end(buf_pool, BUF_FLUSH_LRU);
+
+		buf_flush_common(BUF_FLUSH_LRU, counters.flushed);
+
+		if (want_count) {
+			*n_processed = counters.flushed;
+		}
+
+		return(true);
+	}
+
+	/* A batch of this type is already in progress. */
+	return(false);
+}
+/* JAN: TODO: END: */
+
/*******************************************************************//**
This utility flushes dirty blocks from the end of the LRU list and also
puts replaceable clean pages from the end of the LRU list to the free
@@ -1954,6 +2010,168 @@ buf_flush_LRU(
return(true);
}
+/* JAN: TODO: */
+/*******************************************************************//**/
+extern int is_pgcomp_wrk_init_done(void);
+extern int pgcomp_flush_work_items(int buf_pool_inst, int *pages_flushed,
+ int flush_type, int min_n, unsigned long long lsn_limit);
+
+#define MT_COMP_WATER_MARK 50
+
+#include <time.h>
+/*******************************************************************//**
+Computes the elapsed time g_time - s_time and stores it in d_time.
+The carry between the seconds and microseconds fields is performed by
+adjusting s_time in place, so the structure s_time points to may be
+modified by this call. For normalized inputs with g_time >= s_time the
+result has 0 <= d_time->tv_usec < 1000000.
+@param g_time	in: the later ("got") time stamp
+@param s_time	in/out: the earlier ("start") time stamp; adjusted for carry
+@param d_time	out: the difference g_time - s_time
+@return always 0 */
+int timediff(struct timeval *g_time, struct timeval *s_time, struct timeval *d_time)
+{
+	/* Borrow whole seconds into s_time when its microsecond part is
+	larger than that of g_time, so the subtraction below cannot yield
+	a negative tv_usec. */
+	if (g_time->tv_usec < s_time->tv_usec)
+	{
+		long nsec = (s_time->tv_usec - g_time->tv_usec) / 1000000 + 1;
+		s_time->tv_usec -= 1000000 * nsec;
+		s_time->tv_sec += nsec;
+	}
+	/* Conversely, move surplus microseconds of the difference back
+	into whole seconds. The surplus is g_time - s_time; computing it
+	with the operands swapped would leave tv_usec above one second. */
+	if (g_time->tv_usec - s_time->tv_usec > 1000000)
+	{
+		long nsec = (g_time->tv_usec - s_time->tv_usec) / 1000000;
+		s_time->tv_usec += 1000000 * nsec;
+		s_time->tv_sec -= nsec;
+	}
+	d_time->tv_sec = g_time->tv_sec - s_time->tv_sec;
+	d_time->tv_usec = g_time->tv_usec - s_time->tv_usec;
+
+	return 0;
+}
+
+static pthread_mutex_t pgcomp_mtx = PTHREAD_MUTEX_INITIALIZER;
+/*******************************************************************//**
+Multi-threaded version of buf_flush_list.
+When the page compression worker threads are initialized and the
+per-instance share of min_n exceeds MT_COMP_WATER_MARK, the flush is
+handed to pgcomp_flush_work_items() while holding pgcomp_mtx. In every
+other case each buffer pool instance is flushed in turn by the calling
+thread. The worker path is also skipped when there are more buffer pool
+instances than slots in the per-instance counter array, because the
+counters are indexed by instance number.
+NOTE(review): pgcomp_flush_work_items() is declared with an int min_n;
+min_n == ULINT_MAX is narrowed on that call - confirm the callee copes.
+@return true on the worker path, and on the single-threaded path when a
+batch was started for every instance; false if at least one instance was
+skipped because a batch of the same type was already running */
+UNIV_INTERN
+bool
+pgcomp_buf_flush_list(
+/*==================*/
+	ulint		min_n,		/*!< in: wished minimum number of
+					blocks flushed (it is not guaranteed
+					that the actual number is that big,
+					though) */
+	lsn_t		lsn_limit,	/*!< in the case BUF_FLUSH_LIST all
+					blocks whose oldest_modification is
+					smaller than this should be flushed
+					(if their number does not exceed
+					min_n), otherwise ignored */
+	ulint*		n_processed)	/*!< out: the number of pages
+					which were processed is passed
+					back to caller. Ignored if NULL */
+
+{
+	ulint			i;
+	bool			success = true;
+	/* Running total; kept locally so it is available for the debug
+	output even when the caller passed n_processed == NULL. */
+	ulint			n_total = 0;
+	/* Per-instance flush counts filled by the worker threads; zeroed
+	so that untouched slots never contribute garbage. */
+	int			cnt_flush[32] = {0};
+	const ulint		cnt_flush_len
+		= sizeof(cnt_flush) / sizeof(cnt_flush[0]);
+	flush_counters_t	n;
+#ifdef UNIV_DEBUG
+	struct timeval		p_start_time, p_end_time, d_time;
+#endif
+
+	if (n_processed) {
+		*n_processed = 0;
+	}
+
+	if (min_n != ULINT_MAX) {
+		/* Ensure that flushing is spread evenly amongst the
+		buffer pool instances. When min_n is ULINT_MAX
+		we need to flush everything up to the lsn limit
+		so no limit here. */
+		min_n = (min_n + srv_buf_pool_instances - 1)
+			 / srv_buf_pool_instances;
+	}
+
+#ifdef UNIV_DEBUG
+	gettimeofday(&p_start_time, 0x0);
+#endif
+	if(is_pgcomp_wrk_init_done() && (min_n > MT_COMP_WATER_MARK)
+	   && srv_buf_pool_instances <= cnt_flush_len) {
+
+		pthread_mutex_lock(&pgcomp_mtx);
+		pgcomp_flush_work_items(srv_buf_pool_instances,
+			cnt_flush, BUF_FLUSH_LIST,
+			min_n, lsn_limit);
+
+		for (i = 0; i < srv_buf_pool_instances; i++) {
+			n_total += cnt_flush[i];
+
+			if (cnt_flush[i]) {
+				MONITOR_INC_VALUE_CUMULATIVE(
+					MONITOR_FLUSH_BATCH_TOTAL_PAGE,
+					MONITOR_FLUSH_BATCH_COUNT,
+					MONITOR_FLUSH_BATCH_PAGES,
+					cnt_flush[i]);
+
+			}
+		}
+
+		pthread_mutex_unlock(&pgcomp_mtx);
+
+		if (n_processed) {
+			*n_processed = n_total;
+		}
+
+#ifdef UNIV_DEBUG
+		gettimeofday(&p_end_time, 0x0);
+		timediff(&p_end_time, &p_start_time, &d_time);
+		fprintf(stderr, "[1] [*n_processed: (min:%lu)%lu %llu usec]\n", (
+			min_n * srv_buf_pool_instances), n_total,
+			(unsigned long long)(d_time.tv_usec+(d_time.tv_sec*1000000)));
+#endif
+		return(success);
+	}
+	/* Flush to lsn_limit in all buffer pool instances */
+	for (i = 0; i < srv_buf_pool_instances; i++) {
+		buf_pool_t*	buf_pool;
+
+		buf_pool = buf_pool_from_array(i);
+
+		if (!buf_flush_start(buf_pool, BUF_FLUSH_LIST)) {
+			/* We have two choices here. If lsn_limit was
+			specified then skipping an instance of buffer
+			pool means we cannot guarantee that all pages
+			up to lsn_limit has been flushed. We can
+			return right now with failure or we can try
+			to flush remaining buffer pools up to the
+			lsn_limit. We attempt to flush other buffer
+			pools based on the assumption that it will
+			help in the retry which will follow the
+			failure. */
+			success = false;
+
+			continue;
+		}
+
+		buf_flush_batch(
+			buf_pool, BUF_FLUSH_LIST, min_n, lsn_limit, false, &n);
+
+		buf_flush_end(buf_pool, BUF_FLUSH_LIST);
+
+		buf_flush_common(BUF_FLUSH_LIST, n.flushed);
+
+		n_total += n.flushed;
+
+		if (n.flushed) {
+			MONITOR_INC_VALUE_CUMULATIVE(
+				MONITOR_FLUSH_BATCH_TOTAL_PAGE,
+				MONITOR_FLUSH_BATCH_COUNT,
+				MONITOR_FLUSH_BATCH_PAGES,
+				n.flushed);
+		}
+	}
+
+	if (n_processed) {
+		*n_processed = n_total;
+	}
+
+#ifdef UNIV_DEBUG
+	gettimeofday(&p_end_time, 0x0);
+	timediff(&p_end_time, &p_start_time, &d_time);
+
+	fprintf(stderr, "[2] [*n_processed: (min:%lu)%lu %llu usec]\n", (
+		min_n * srv_buf_pool_instances), n_total,
+		(unsigned long long)(d_time.tv_usec+(d_time.tv_sec*1000000)));
+#endif
+	return(success);
+}
+
+/* JAN: TODO: END: */
+
/*******************************************************************//**
This utility flushes dirty blocks from the end of the flush list of
all buffer pool instances.
@@ -1986,6 +2204,12 @@ buf_flush_list(
bool timeout = false;
ulint flush_start_time = 0;
+ /* JAN: TODO: */
+ if (is_pgcomp_wrk_init_done()) {
+ return(pgcomp_buf_flush_list(min_n, lsn_limit, n_processed));
+ }
+ /* JAN: TODO: END: */
+
for (i = 0; i < srv_buf_pool_instances; i++) {
requested_pages[i] = 0;
active_instance[i] = true;
@@ -2179,6 +2403,60 @@ buf_flush_single_page_from_LRU(
return(freed);
}
+/* JAN: TODO: */
+/*********************************************************************//**
+Multi-threaded variant of clearing up the tail of the LRU lists:
+* Put replaceable pages at the tail of LRU to the free list
+* Flush dirty pages at the tail of LRU to the disk
+The work is handed to pgcomp_flush_work_items() for all buffer pool
+instances while holding pgcomp_mtx; srv_LRU_scan_depth (the dynamic
+config parameter innodb_LRU_scan_depth) is passed as the scan depth.
+The worker threads must already be initialized, and the number of
+buffer pool instances must not exceed the number of slots in the
+per-instance counter array; both are asserted.
+@return total pages flushed */
+UNIV_INTERN
+ulint
+pgcomp_buf_flush_LRU_tail(void)
+/*====================*/
+{
+#ifdef UNIV_DEBUG
+	struct timeval	p_start_time, p_end_time, d_time;
+#endif
+	ulint		total_flushed = 0;
+	ulint		i;
+	/* Per-instance flush counts filled by the worker threads; zeroed
+	so that untouched slots never contribute garbage. */
+	int		cnt_flush[32] = {0};
+
+#ifdef UNIV_DEBUG
+	gettimeofday(&p_start_time, 0x0);
+#endif
+	assert(is_pgcomp_wrk_init_done());
+	/* cnt_flush is indexed by buffer pool instance number below. */
+	assert(srv_buf_pool_instances
+	       <= sizeof(cnt_flush) / sizeof(cnt_flush[0]));
+
+	pthread_mutex_lock(&pgcomp_mtx);
+	pgcomp_flush_work_items(srv_buf_pool_instances,
+		cnt_flush, BUF_FLUSH_LRU, srv_LRU_scan_depth, 0);
+
+	for (i = 0; i < srv_buf_pool_instances; i++) {
+		if (cnt_flush[i]) {
+			total_flushed += cnt_flush[i];
+
+			MONITOR_INC_VALUE_CUMULATIVE(
+				MONITOR_LRU_BATCH_TOTAL_PAGE,
+				MONITOR_LRU_BATCH_COUNT,
+				MONITOR_LRU_BATCH_PAGES,
+				cnt_flush[i]);
+		}
+	}
+
+	pthread_mutex_unlock(&pgcomp_mtx);
+
+#ifdef UNIV_DEBUG
+	gettimeofday(&p_end_time, 0x0);
+	timediff(&p_end_time, &p_start_time, &d_time);
+
+	fprintf(stderr, "[1] [*n_processed: (min:%lu)%lu %llu usec]\n", (
+		srv_LRU_scan_depth * srv_buf_pool_instances), total_flushed,
+		(unsigned long long)(d_time.tv_usec+(d_time.tv_sec*1000000)));
+#endif
+
+	return(total_flushed);
+}
+/* JAN: TODO: END: */
+
+
/*********************************************************************//**
Clears up tail of the LRU lists:
* Put replaceable pages at the tail of LRU to the free list
@@ -2203,6 +2481,13 @@ buf_flush_LRU_tail(void)
ulint free_list_lwm = srv_LRU_scan_depth / 100
* srv_cleaner_free_list_lwm;
+ /* JAN: TODO: */
+ if(is_pgcomp_wrk_init_done())
+ {
+ return(pgcomp_buf_flush_LRU_tail());
+ }
+ /* JAN: TODO: END */
+
for (ulint i = 0; i < srv_buf_pool_instances; i++) {
const buf_pool_t* buf_pool = buf_pool_from_array(i);
@@ -2640,6 +2925,7 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)(
ulint n_flushed = 0;
ulint last_activity = srv_get_activity_count();
ulint lru_sleep_time = srv_cleaner_max_lru_time;
+ ulint n_lru=0, n_pgc_flush=0, n_pgc_batch=0;
ut_ad(!srv_read_only_mode);
@@ -2684,15 +2970,25 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)(
next_loop_time = ut_time_ms() + page_cleaner_sleep_time;
/* Flush pages from end of LRU if required */
- n_flushed = buf_flush_LRU_tail();
+ n_lru = n_flushed = buf_flush_LRU_tail();
+#ifdef UNIV_DEBUG
+ if (n_lru) {
+ fprintf(stderr,"n_lru:%lu ",n_lru);
+ }
+#endif
if (srv_check_activity(last_activity)) {
last_activity = srv_get_activity_count();
/* Flush pages from flush_list if required */
- n_flushed += page_cleaner_flush_pages_if_needed();
+ n_flushed += n_pgc_flush = page_cleaner_flush_pages_if_needed();
+#ifdef UNIV_DEBUG
+ if (n_pgc_flush) {
+ fprintf(stderr,"n_pgc_flush:%lu ",n_pgc_flush);
+ }
+#endif
} else {
- n_flushed = page_cleaner_do_flush_batch(
+ n_pgc_batch = n_flushed = page_cleaner_do_flush_batch(
PCT_IO(100),
LSN_MAX);
@@ -2703,7 +2999,20 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)(
MONITOR_FLUSH_BACKGROUND_PAGES,
n_flushed);
}
+#ifdef UNIV_DEBUG
+ if (n_pgc_batch) {
+ fprintf(stderr,"n_pgc_batch:%lu ",n_pgc_batch);
+ }
+#endif
}
+
+#ifdef UNIV_DEBUG
+ if (n_lru || n_pgc_flush || n_pgc_batch) {
+ fprintf(stderr,"\n");
+ n_lru = n_pgc_flush = n_pgc_batch = 0;
+ }
+#endif
+
}
ut_ad(srv_shutdown_state > 0);
diff --git a/storage/xtradb/buf/buf0rea.cc b/storage/xtradb/buf/buf0rea.cc
index 6e348bbf004..3dec3df6f2b 100644
--- a/storage/xtradb/buf/buf0rea.cc
+++ b/storage/xtradb/buf/buf0rea.cc
@@ -1,6 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -229,14 +230,14 @@ not_to_recover:
*err = _fil_io(OS_FILE_READ | wake_later
| ignore_nonexistent_pages,
sync, space, zip_size, offset, 0, zip_size,
- bpage->zip.data, bpage, trx);
+ bpage->zip.data, bpage, 0, trx);
} else {
ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
*err = _fil_io(OS_FILE_READ | wake_later
| ignore_nonexistent_pages,
sync, space, 0, offset, 0, UNIV_PAGE_SIZE,
- ((buf_block_t*) bpage)->frame, bpage, trx);
+ ((buf_block_t*) bpage)->frame, bpage, 0, trx);
}
if (sync) {
diff --git a/storage/xtradb/dict/dict0dict.cc b/storage/xtradb/dict/dict0dict.cc
index a20456fe3cf..d6a05d2b214 100644
--- a/storage/xtradb/dict/dict0dict.cc
+++ b/storage/xtradb/dict/dict0dict.cc
@@ -2,6 +2,7 @@
Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
diff --git a/storage/xtradb/fil/fil0fil.cc b/storage/xtradb/fil/fil0fil.cc
index 9861f85b814..f3e952299ff 100644
--- a/storage/xtradb/fil/fil0fil.cc
+++ b/storage/xtradb/fil/fil0fil.cc
@@ -1,6 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013 SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -54,6 +55,15 @@ Created 10/25/1995 Heikki Tuuri
# include "srv0srv.h"
static ulint srv_data_read, srv_data_written;
#endif /* !UNIV_HOTBACKUP */
+#include "fil0pagecompress.h"
+#include "zlib.h"
+#ifdef __linux__
+#include <linux/fs.h>
+#include <sys/ioctl.h>
+#include <fcntl.h>
+#include <linux/falloc.h>
+#endif
+#include "row0mysql.h"
/*
IMPLEMENTATION OF THE TABLESPACE MEMORY CACHE
@@ -434,11 +444,16 @@ fil_read(
block size multiple */
void* buf, /*!< in/out: buffer where to store data read;
in aio this must be appropriately aligned */
- void* message) /*!< in: message for aio handler if non-sync
- aio used, else ignored */
+ void* message, /*!< in: message for aio handler if non-sync
+ aio used, else ignored */
+ ulint* write_size) /*!< in/out: Actual write size initialized
+ after first successful trim
+ operation for this page and if
+ initialized we do not trim again if
+ actual page size does not decrease. */
{
return(fil_io(OS_FILE_READ, sync, space_id, zip_size, block_offset,
- byte_offset, len, buf, message));
+ byte_offset, len, buf, message, write_size));
}
/********************************************************************//**
@@ -463,18 +478,22 @@ fil_write(
be a block size multiple */
void* buf, /*!< in: buffer from which to write; in aio
this must be appropriately aligned */
- void* message) /*!< in: message for aio handler if non-sync
- aio used, else ignored */
+ void* message, /*!< in: message for aio handler if non-sync
+ aio used, else ignored */
+ ulint* write_size) /*!< in/out: Actual write size initialized
+ after first successful trim
+ operation for this page and if
+ initialized we do not trim again if
+ actual page size does not decrease. */
{
ut_ad(!srv_read_only_mode);
return(fil_io(OS_FILE_WRITE, sync, space_id, zip_size, block_offset,
- byte_offset, len, buf, message));
+ byte_offset, len, buf, message, write_size));
}
/*******************************************************************//**
Returns the table space by a given id, NULL if not found. */
-UNIV_INLINE
fil_space_t*
fil_space_get_by_id(
/*================*/
@@ -492,6 +511,19 @@ fil_space_get_by_id(
return(space);
}
+/****************************************************************//**
+Get the id of the tablespace that a file node belongs to.
+@return node->space->id */
+ulint
+fil_node_get_space_id(
+/*==================*/
+ fil_node_t* node) /*!< in: file node; nothing here requires it to
+ be page compressed */
+{
+ /* Both the node and its owning space are dereferenced below, so
+ neither may be NULL. */
+ ut_ad(node);
+ ut_ad(node->space);
+
+ return (node->space->id);
+}
+
/*******************************************************************//**
Returns the table space by a given name, NULL if not found. */
UNIV_INLINE
@@ -712,8 +744,9 @@ fil_node_open_file(
byte* buf2;
byte* page;
ulint space_id;
- ulint flags;
+ ulint flags=0;
ulint page_size;
+ ibool atomic_writes=FALSE;
ut_ad(mutex_own(&(system->mutex)));
ut_a(node->n_pending == 0);
@@ -730,7 +763,7 @@ fil_node_open_file(
node->handle = os_file_create_simple_no_error_handling(
innodb_file_data_key, node->name, OS_FILE_OPEN,
- OS_FILE_READ_ONLY, &success);
+ OS_FILE_READ_ONLY, &success, 0);
if (!success) {
/* The following call prints an error message */
os_file_get_last_error(true);
@@ -782,6 +815,7 @@ fil_node_open_file(
space_id = fsp_header_get_space_id(page);
flags = fsp_header_get_flags(page);
page_size = fsp_flags_get_page_size(flags);
+ atomic_writes = fsp_flags_get_atomic_writes(flags);
ut_free(buf2);
@@ -832,6 +866,17 @@ fil_node_open_file(
ut_error;
}
+ if (UNIV_UNLIKELY(space->flags != flags)) {
+ if (!dict_tf_verify_flags(space->flags, flags)) {
+ fprintf(stderr,
+ "InnoDB: Error: table flags are 0x%lx"
+ " in the data dictionary\n"
+ "InnoDB: but the flags in file %s are 0x%lx!\n",
+ space->flags, node->name, flags);
+ ut_error;
+ }
+ }
+
if (size_bytes >= 1024 * 1024) {
/* Truncate the size to whole megabytes. */
size_bytes = ut_2pow_round(size_bytes, 1024 * 1024);
@@ -851,6 +896,8 @@ add_size:
space->size += node->size;
}
+ atomic_writes = fsp_flags_get_atomic_writes(space->flags);
+
/* printf("Opening file %s\n", node->name); */
/* Open the file for reading and writing, in Windows normally in the
@@ -861,18 +908,18 @@ add_size:
node->handle = os_file_create(innodb_file_log_key,
node->name, OS_FILE_OPEN,
OS_FILE_AIO, OS_LOG_FILE,
- &ret);
+ &ret, atomic_writes);
} else if (node->is_raw_disk) {
node->handle = os_file_create(innodb_file_data_key,
node->name,
OS_FILE_OPEN_RAW,
OS_FILE_AIO, OS_DATA_FILE,
- &ret);
+ &ret, atomic_writes);
} else {
node->handle = os_file_create(innodb_file_data_key,
node->name, OS_FILE_OPEN,
OS_FILE_AIO, OS_DATA_FILE,
- &ret);
+ &ret, atomic_writes);
}
ut_a(ret);
@@ -1932,12 +1979,12 @@ fil_write_lsn_and_arch_no_to_file(
buf = static_cast<byte*>(ut_align(buf1, UNIV_PAGE_SIZE));
err = fil_read(TRUE, space, 0, sum_of_sizes, 0,
- UNIV_PAGE_SIZE, buf, NULL);
+ UNIV_PAGE_SIZE, buf, NULL, 0);
if (err == DB_SUCCESS) {
mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn);
err = fil_write(TRUE, space, 0, sum_of_sizes, 0,
- UNIV_PAGE_SIZE, buf, NULL);
+ UNIV_PAGE_SIZE, buf, NULL, 0);
}
mem_free(buf1);
@@ -3222,7 +3269,7 @@ fil_create_link_file(
file = os_file_create_simple_no_error_handling(
innodb_file_data_key, link_filepath,
- OS_FILE_CREATE, OS_FILE_READ_WRITE, &success);
+ OS_FILE_CREATE, OS_FILE_READ_WRITE, &success, 0);
if (!success) {
/* The following call will print an error message */
@@ -3331,8 +3378,9 @@ fil_open_linked_file(
/*===============*/
const char* tablename, /*!< in: database/tablename */
char** remote_filepath,/*!< out: remote filepath */
- os_file_t* remote_file) /*!< out: remote file handle */
-
+ os_file_t* remote_file, /*!< out: remote file handle */
+ ulint atomic_writes) /*!< in: atomic writes table option
+ value */
{
ibool success;
@@ -3346,7 +3394,7 @@ fil_open_linked_file(
*remote_file = os_file_create_simple_no_error_handling(
innodb_file_data_key, *remote_filepath,
OS_FILE_OPEN, OS_FILE_READ_ONLY,
- &success);
+ &success, atomic_writes);
if (!success) {
char* link_filepath = fil_make_isl_name(tablename);
@@ -3401,6 +3449,7 @@ fil_create_new_single_table_tablespace(
/* TRUE if a table is created with CREATE TEMPORARY TABLE */
bool is_temp = !!(flags2 & DICT_TF2_TEMPORARY);
bool has_data_dir = FSP_FLAGS_HAS_DATA_DIR(flags);
+ bool atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(flags);
ut_a(space_id > 0);
ut_ad(!srv_read_only_mode);
@@ -3433,7 +3482,8 @@ fil_create_new_single_table_tablespace(
OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT,
OS_FILE_NORMAL,
OS_DATA_FILE,
- &ret);
+ &ret,
+ atomic_writes);
if (ret == FALSE) {
/* The following call will print an error message */
@@ -3498,6 +3548,7 @@ fil_create_new_single_table_tablespace(
flags = fsp_flags_set_page_size(flags, UNIV_PAGE_SIZE);
fsp_header_init_fields(page, space_id, flags);
mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id);
+ ut_ad(fsp_flags_is_valid(flags));
if (!(fsp_flags_is_compressed(flags))) {
buf_flush_init_for_writing(page, NULL, 0);
@@ -3685,6 +3736,7 @@ fil_open_single_table_tablespace(
fsp_open_info remote;
ulint tablespaces_found = 0;
ulint valid_tablespaces_found = 0;
+ ibool atomic_writes = FALSE;
#ifdef UNIV_SYNC_DEBUG
ut_ad(!fix_dict || rw_lock_own(&dict_operation_lock, RW_LOCK_EX));
@@ -3719,7 +3771,7 @@ fil_open_single_table_tablespace(
}
link_file_found = fil_open_linked_file(
- tablename, &remote.filepath, &remote.file);
+ tablename, &remote.filepath, &remote.file, atomic_writes);
remote.success = link_file_found;
if (remote.success) {
/* possibility of multiple files. */
@@ -3747,7 +3799,7 @@ fil_open_single_table_tablespace(
if (dict.filepath) {
dict.file = os_file_create_simple_no_error_handling(
innodb_file_data_key, dict.filepath, OS_FILE_OPEN,
- OS_FILE_READ_ONLY, &dict.success);
+ OS_FILE_READ_ONLY, &dict.success, atomic_writes);
if (dict.success) {
/* possibility of multiple files. */
validate = true;
@@ -3759,7 +3811,7 @@ fil_open_single_table_tablespace(
ut_a(def.filepath);
def.file = os_file_create_simple_no_error_handling(
innodb_file_data_key, def.filepath, OS_FILE_OPEN,
- OS_FILE_READ_ONLY, &def.success);
+ OS_FILE_READ_ONLY, &def.success, atomic_writes);
if (def.success) {
tablespaces_found++;
}
@@ -4155,7 +4207,7 @@ fil_load_single_table_tablespace(
/* Check for a link file which locates a remote tablespace. */
remote.success = fil_open_linked_file(
- tablename, &remote.filepath, &remote.file);
+ tablename, &remote.filepath, &remote.file, FALSE);
/* Read the first page of the remote tablespace */
if (remote.success) {
@@ -4170,7 +4222,7 @@ fil_load_single_table_tablespace(
/* Try to open the tablespace in the datadir. */
def.file = os_file_create_simple_no_error_handling(
innodb_file_data_key, def.filepath, OS_FILE_OPEN,
- OS_FILE_READ_ONLY, &def.success);
+ OS_FILE_READ_ONLY, &def.success, FALSE);
/* Read the first page of the remote tablespace */
if (def.success) {
@@ -4938,7 +4990,6 @@ retry:
#ifdef HAVE_POSIX_FALLOCATE
if (srv_use_posix_fallocate) {
- mutex_exit(&fil_system->mutex);
success = os_file_set_size(node->name, node->handle,
(size_after_extend
- file_start_page_no) * page_size);
@@ -4975,7 +5026,7 @@ retry:
success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC,
node->name, node->handle, buf,
offset, page_size * n_pages,
- NULL, NULL, space_id, NULL);
+ NULL, NULL, space_id, NULL, 0, 0, 0);
#endif /* UNIV_HOTBACKUP */
if (success) {
os_has_said_disk_full = FALSE;
@@ -5361,7 +5412,12 @@ _fil_io(
or from where to write; in aio this must be
appropriately aligned */
void* message, /*!< in: message for aio handler if non-sync
- aio used, else ignored */
+ aio used, else ignored */
+ ulint* write_size, /*!< in/out: Actual write size initialized
+ after first successful trim
+ operation for this page and if
+ initialized we do not trim again if
+ actual page size does not decrease. */
trx_t* trx)
{
ulint mode;
@@ -5372,6 +5428,8 @@ _fil_io(
ulint wake_later;
os_offset_t offset;
ibool ignore_nonexistent_pages;
+ ibool page_compressed = FALSE;
+ ulint page_compression_level = 0;
is_log = type & OS_FILE_LOG;
type = type & ~OS_FILE_LOG;
@@ -5425,6 +5483,9 @@ _fil_io(
} else if (type == OS_FILE_WRITE) {
ut_ad(!srv_read_only_mode);
srv_stats.data_written.add(len);
+ if (fil_page_is_index_page((byte *)buf)) {
+ srv_stats.index_pages_written.inc();
+ }
}
/* Reserve the fil_system mutex and make sure that we can open at
@@ -5434,6 +5495,8 @@ _fil_io(
space = fil_space_get_by_id(space_id);
+ page_compressed = fsp_flags_is_page_compressed(space->flags);
+ page_compression_level = fsp_flags_get_page_compression_level(space->flags);
/* If we are deleting a tablespace we don't allow any read
operations on that. However, we do allow write operations. */
if (space == 0 || (type == OS_FILE_READ && space->stop_new_ops)) {
@@ -5579,7 +5642,8 @@ _fil_io(
/* Queue the aio request */
ret = os_aio(type, mode | wake_later, node->name, node->handle, buf,
- offset, len, node, message, space_id, trx);
+ offset, len, node, message, space_id, trx,
+ page_compressed, page_compression_level, write_size);
#else
/* In ibbackup do normal i/o, not aio */
@@ -6214,7 +6278,7 @@ fil_tablespace_iterate(
file = os_file_create_simple_no_error_handling(
innodb_file_data_key, filepath,
- OS_FILE_OPEN, OS_FILE_READ_WRITE, &success);
+ OS_FILE_OPEN, OS_FILE_READ_WRITE, &success, FALSE);
DBUG_EXECUTE_IF("fil_tablespace_iterate_failure",
{
@@ -6501,3 +6565,33 @@ fil_space_set_corrupt(
mutex_exit(&fil_system->mutex);
}
+
+/****************************************************************//**
+Acquire fil_system mutex.
+The calling thread must not already hold fil_system->mutex: ut_ad()
+checks that before mutex_enter() is called. Pair every call with
+fil_system_exit(). */
+void
+fil_system_enter(void)
+/*==================*/
+{
+ ut_ad(!mutex_own(&fil_system->mutex));
+ mutex_enter(&fil_system->mutex);
+}
+
+/****************************************************************//**
+Release fil_system mutex.
+The calling thread must currently hold fil_system->mutex: ut_ad()
+checks that before mutex_exit() is called. */
+void
+fil_system_exit(void)
+/*=================*/
+{
+ ut_ad(mutex_own(&fil_system->mutex));
+ mutex_exit(&fil_system->mutex);
+}
+
+/*******************************************************************//**
+Return space name.
+The space pointer is dereferenced without a NULL check, so it must be
+valid. The returned pointer is the name member of the space object
+itself, not a copy.
+@return space->name */
+char*
+fil_space_name(
+/*===========*/
+ fil_space_t* space) /*!< in: space */
+{
+ return (space->name);
+}
diff --git a/storage/xtradb/fil/fil0pagecompress.cc b/storage/xtradb/fil/fil0pagecompress.cc
new file mode 100644
index 00000000000..10ac273955f
--- /dev/null
+++ b/storage/xtradb/fil/fil0pagecompress.cc
@@ -0,0 +1,324 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2014, SkySQL Ab. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file fil/fil0pagecompress.cc
+Implementation for page compressed file spaces.
+
+Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com
+***********************************************************************/
+
+#include "fil0fil.h"
+#include "fil0pagecompress.h"
+
+#include <debug_sync.h>
+#include <my_dbug.h>
+
+#include "mem0mem.h"
+#include "hash0hash.h"
+#include "os0file.h"
+#include "mach0data.h"
+#include "buf0buf.h"
+#include "buf0flu.h"
+#include "log0recv.h"
+#include "fsp0fsp.h"
+#include "srv0srv.h"
+#include "srv0start.h"
+#include "mtr0mtr.h"
+#include "mtr0log.h"
+#include "dict0dict.h"
+#include "page0page.h"
+#include "page0zip.h"
+#include "trx0sys.h"
+#include "row0mysql.h"
+#ifndef UNIV_HOTBACKUP
+# include "buf0lru.h"
+# include "ibuf0ibuf.h"
+# include "sync0sync.h"
+# include "os0sync.h"
+#else /* !UNIV_HOTBACKUP */
+# include "srv0srv.h"
+static ulint srv_data_read, srv_data_written;
+#endif /* !UNIV_HOTBACKUP */
+#include "zlib.h"
+#ifdef __linux__
+#include <linux/fs.h>
+#include <sys/ioctl.h>
+#include <fcntl.h>
+#include <linux/falloc.h>
+#endif
+#include "row0mysql.h"
+#include "lz4.h"
+
+/****************************************************************//**
+For page compressed pages compress the page before actual write
+operation.
+
+On success out_buf contains:
+- a copy of the first FIL_PAGE_DATA bytes of buf, with the checksum
+  field set to BUF_NO_CHECKSUM_MAGIC, the page type set to
+  FIL_PAGE_PAGE_COMPRESSED and the flush lsn field reused to store the
+  compression algorithm (FIL_PAGE_COMPRESSION_LZ4 or
+  FIL_PAGE_COMPRESSION_ZLIB),
+- a 2-byte payload length at offset FIL_PAGE_DATA,
+- the compressed payload at offset
+  FIL_PAGE_DATA + FIL_PAGE_COMPRESSED_SIZE,
+- zero-filled padding up to the next multiple of
+  OS_FILE_LOG_BLOCK_SIZE.
+*out_len is the padded size. If compression fails a warning is printed,
+*out_len is set to len and the unmodified buf is returned.
+@return compressed page to be written (out_buf), or buf on failure */
+byte*
+fil_compress_page(
+/*==============*/
+	ulint		space_id,	/*!< in: tablespace id of the
+					table. */
+	byte*		buf,		/*!< in: buffer from which to write; in aio
+					this must be appropriately aligned */
+	byte*		out_buf,	/*!< out: compressed buffer */
+	ulint		len,		/*!< in: length of input buffer.*/
+	ulint		compression_level, /* in: compression level */
+	ulint*		out_len)	/*!< out: actual length of compressed page */
+{
+	int		err = Z_OK;
+	int		level = 0;
+	ulint		header_len = FIL_PAGE_DATA + FIL_PAGE_COMPRESSED_SIZE;
+	ulint		write_size = 0;
+	bool		failed = false;
+	/* Read the algorithm switch exactly once, so that the compressor
+	used below and the algorithm id stored in the page header always
+	agree even if the setting is changed while we are running. */
+	const bool	use_lz4 = srv_use_lz4;
+
+	ut_ad(buf);
+	ut_ad(out_buf);
+	ut_ad(len);
+	ut_ad(out_len);
+
+	level = compression_level;
+	ut_ad(fil_space_is_page_compressed(space_id));
+
+	fil_system_enter();
+	fil_space_t*	space = fil_space_get_by_id(space_id);
+	fil_system_exit();
+
+	/* The name is needed only for diagnostics; do not dereference a
+	NULL space if the lookup did not find anything. */
+	const char*	space_name = space ? fil_space_name(space) : "(unknown)";
+
+	/* If no compression level was provided to this table, use system
+	default level */
+	if (level == 0) {
+		level = page_zip_level;
+	}
+
+#ifdef UNIV_DEBUG
+	fprintf(stderr,
+		"InnoDB: Note: Preparing for compress for space %lu name %s len %lu\n",
+		space_id, space_name, len);
+#endif
+
+	/* Room available for the payload after the header */
+	write_size = UNIV_PAGE_SIZE - header_len;
+
+	if (use_lz4) {
+		/* Returns the compressed size, or 0 if the output did
+		not fit into write_size bytes */
+		err = LZ4_compress_limitedOutput((const char *)buf, (char *)out_buf+header_len, len, write_size);
+		write_size = err;
+		failed = (err == 0);
+	} else {
+		/* write_size is in/out: capacity on entry, compressed
+		size on successful return */
+		err = compress2(out_buf+header_len, &write_size, buf, len, level);
+		failed = (err != Z_OK);
+	}
+
+	if (failed) {
+		/* If error we leave the actual page as it was */
+
+		fprintf(stderr,
+			"InnoDB: Warning: Compression failed for space %lu name %s len %lu rt %d write %lu\n",
+			space_id, space_name, len, err, write_size);
+
+		*out_len = len;
+		return (buf);
+	}
+
+	/* Set up the page header */
+	memcpy(out_buf, buf, FIL_PAGE_DATA);
+	/* Set up the checksum */
+	mach_write_to_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM, BUF_NO_CHECKSUM_MAGIC);
+	/* Set up the correct page type */
+	mach_write_to_2(out_buf+FIL_PAGE_TYPE, FIL_PAGE_PAGE_COMPRESSED);
+	/* Set up the flush lsn to be compression algorithm */
+	if (use_lz4) {
+		mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN, FIL_PAGE_COMPRESSION_LZ4);
+	} else {
+		mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN, FIL_PAGE_COMPRESSION_ZLIB);
+	}
+	/* Set up the actual payload length */
+	mach_write_to_2(out_buf+FIL_PAGE_DATA, write_size);
+
+#ifdef UNIV_DEBUG
+	/* Verify */
+	ut_ad(fil_page_is_compressed(out_buf));
+	ut_ad(mach_read_from_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM) == BUF_NO_CHECKSUM_MAGIC);
+	ut_ad(mach_read_from_2(out_buf+FIL_PAGE_DATA) == write_size);
+	if (use_lz4) {
+		ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN) == FIL_PAGE_COMPRESSION_LZ4);
+	} else {
+		ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN) == FIL_PAGE_COMPRESSION_ZLIB);
+	}
+#endif
+
+	write_size+=header_len;
+	/* Actual write needs to be aligned on block size */
+	if (write_size % OS_FILE_LOG_BLOCK_SIZE) {
+		ulint	unpadded_size = write_size;
+
+		write_size += OS_FILE_LOG_BLOCK_SIZE - (write_size % OS_FILE_LOG_BLOCK_SIZE);
+		/* Zero the padding so that stale contents of out_buf
+		are never written out */
+		memset(out_buf + unpadded_size, 0, write_size - unpadded_size);
+	}
+
+#ifdef UNIV_DEBUG
+	fprintf(stderr,
+		"InnoDB: Note: Compression succeeded for space %lu name %s len %lu out_len %lu\n",
+		space_id, space_name, len, write_size);
+#endif
+
+#define SECT_SIZE 512
+
+	/* len and write_size are unsigned: only subtract when the result
+	cannot wrap around */
+	if (len > write_size) {
+		ulint	saved = len - write_size;
+
+		srv_stats.page_compression_saved.add(saved);
+		srv_stats.page_compression_trim_sect512.add((saved / SECT_SIZE));
+		srv_stats.page_compression_trim_sect4096.add((saved / (SECT_SIZE*8)));
+	}
+	//srv_stats.page_compressed_trim_op.inc();
+	srv_stats.pages_page_compressed.inc();
+	*out_len = write_size;
+
+	return(out_buf);
+
+}
+
+/****************************************************************//**
+For page compressed pages decompress the page after actual read
+operation.
+
+The page in buf must carry BUF_NO_CHECKSUM_MAGIC in the checksum field
+and FIL_PAGE_PAGE_COMPRESSED as page type. The compression algorithm is
+read from the flush lsn field and the payload length from the 2 bytes
+at FIL_PAGE_DATA. The payload is decompressed into page_buf (or into a
+temporary buffer of UNIV_PAGE_SIZE bytes when page_buf is NULL) and the
+result is then copied back over buf. Any inconsistency is treated as
+corruption: a message is printed and ut_error is raised. */
+void
+fil_decompress_page(
+/*================*/
+	byte*		page_buf,	/*!< in: preallocated buffer or NULL */
+	byte*		buf,		/*!< in/out: compressed page as read;
+					overwritten with the uncompressed
+					page; in aio this must be
+					appropriately aligned */
+	ulint		len)		/*!< in: length of output buffer.*/
+{
+	int		err = 0;
+	ulint		actual_size = 0;
+	ulint		compression_alg = 0;
+	byte*		in_buf;
+	const ulint	header_len = FIL_PAGE_DATA + FIL_PAGE_COMPRESSED_SIZE;
+
+	ut_ad(buf);
+	ut_ad(len);
+
+	/* Before actual decompress, make sure that page type is correct */
+
+	if (mach_read_from_4(buf+FIL_PAGE_SPACE_OR_CHKSUM) != BUF_NO_CHECKSUM_MAGIC ||
+	    mach_read_from_2(buf+FIL_PAGE_TYPE) != FIL_PAGE_PAGE_COMPRESSED) {
+		fprintf(stderr,
+			"InnoDB: Corruption: We try to uncompress corrupted page\n"
+			"InnoDB: CRC %lu type %lu.\n"
+			"InnoDB: len %lu\n",
+			mach_read_from_4(buf+FIL_PAGE_SPACE_OR_CHKSUM),
+			mach_read_from_2(buf+FIL_PAGE_TYPE), len);
+
+		fflush(stderr);
+		ut_error;
+	}
+
+	/* Get compression algorithm */
+	compression_alg = mach_read_from_8(buf+FIL_PAGE_FILE_FLUSH_LSN);
+
+	/* Get the actual size of compressed page */
+	actual_size = mach_read_from_2(buf+FIL_PAGE_DATA);
+	/* Check if payload size is corrupted. The payload starts after
+	the header, so it can never be longer than what is left of the
+	page after it. */
+	if (actual_size == 0 || actual_size > UNIV_PAGE_SIZE - header_len) {
+		fprintf(stderr,
+			"InnoDB: Corruption: We try to uncompress corrupted page\n"
+			"InnoDB: actual size %lu compression %s\n",
+			actual_size, fil_get_compression_alg_name(compression_alg));
+		fflush(stderr);
+		ut_error;
+	}
+
+	/* If no buffer was given, we need to allocate a temporary buffer */
+	if (page_buf == NULL) {
+#ifdef UNIV_DEBUG
+		fprintf(stderr,
+			"InnoDB: Note: Compression buffer not given, allocating...\n");
+#endif
+		in_buf = static_cast<byte *>(ut_malloc(UNIV_PAGE_SIZE));
+	} else {
+		in_buf = page_buf;
+	}
+
+	if (compression_alg == FIL_PAGE_COMPRESSION_ZLIB) {
+
+#ifdef UNIV_DEBUG
+		fprintf(stderr,
+			"InnoDB: Note: Preparing for decompress for len %lu\n",
+			actual_size);
+#endif
+
+		/* len is in/out here: capacity on entry, number of
+		uncompressed bytes on successful return */
+		err= uncompress(in_buf, &len, buf+header_len, (unsigned long)actual_size);
+
+		/* If uncompress fails it means that page is corrupted */
+		if (err != Z_OK) {
+
+			fprintf(stderr,
+				"InnoDB: Corruption: Page is marked as compressed\n"
+				"InnoDB: but uncompress failed with error %d.\n"
+				"InnoDB: size %lu len %lu\n",
+				err, actual_size, len);
+
+			fflush(stderr);
+
+			ut_error;
+		}
+
+#ifdef UNIV_DEBUG
+		fprintf(stderr,
+			"InnoDB: Note: Decompression succeeded for len %lu \n",
+			len);
+#endif
+	} else if (compression_alg == FIL_PAGE_COMPRESSION_LZ4) {
+		/* The payload comes from disk and may be damaged, so use
+		the bounds-checked decoder: it is told how many compressed
+		bytes it may read and how many bytes it may write. It
+		returns the number of bytes produced, or a negative value
+		if the stream is malformed. A full page must come out. */
+		err = LZ4_decompress_safe((const char *)buf+header_len, (char *)in_buf, (int)actual_size, (int)UNIV_PAGE_SIZE);
+
+		if (err < 0 || (ulint)err != UNIV_PAGE_SIZE) {
+			fprintf(stderr,
+				"InnoDB: Corruption: Page is marked as compressed\n"
+				"InnoDB: but decompression produced %d bytes.\n"
+				"InnoDB: size %lu len %lu\n",
+				err, actual_size, len);
+			fflush(stderr);
+
+			ut_error;
+		}
+	} else {
+		fprintf(stderr,
+			"InnoDB: Corruption: Page is marked as compressed\n"
+			"InnoDB: but compression algorithm %s\n"
+			"InnoDB: is not known.\n"
+			,fil_get_compression_alg_name(compression_alg));
+
+		fflush(stderr);
+		ut_error;
+	}
+
+	srv_stats.pages_page_decompressed.inc();
+
+	/* Copy the uncompressed page to the buffer pool, not
+	really any other options. */
+	memcpy(buf, in_buf, len);
+
+	/* Need to free the temporary buffer if no buffer was given */
+	if (page_buf == NULL) {
+		ut_free(in_buf);
+	}
+}
+
+
diff --git a/storage/xtradb/fil/lz4.c b/storage/xtradb/fil/lz4.c
new file mode 100644
index 00000000000..4e864de67d3
--- /dev/null
+++ b/storage/xtradb/fil/lz4.c
@@ -0,0 +1,822 @@
+/*
+ LZ4 - Fast LZ compression algorithm
+ Copyright (C) 2011-2013, Yann Collet.
+ BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following disclaimer
+ in the documentation and/or other materials provided with the
+ distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ You can contact the author at :
+ - LZ4 source repository : http://code.google.com/p/lz4/
+ - LZ4 public forum : https://groups.google.com/forum/#!forum/lz4c
+*/
+
+//**************************************
+// Tuning parameters
+//**************************************
+// MEMORY_USAGE :
+// Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+// Increasing memory usage improves compression ratio
+// Reduced memory usage can improve speed, due to cache effect
+// Default value is 14, for 16KB, which nicely fits into Intel x86 L1 cache
+#define MEMORY_USAGE 14
+
+// HEAPMODE :
+// Select how default compression functions will allocate memory for their hash table,
+// in memory stack (0:default, fastest), or in memory heap (1:requires memory allocation (malloc)).
+#define HEAPMODE 0
+
+
+//**************************************
+// CPU Feature Detection
+//**************************************
+// 32 or 64 bits ?
+#if (defined(__x86_64__) || defined(_M_X64) || defined(_WIN64) \
+ || defined(__powerpc64__) || defined(__ppc64__) || defined(__PPC64__) \
+ || defined(__64BIT__) || defined(_LP64) || defined(__LP64__) \
+ || defined(__ia64) || defined(__itanium__) || defined(_M_IA64) ) // Detects 64 bits mode
+# define LZ4_ARCH64 1
+#else
+# define LZ4_ARCH64 0
+#endif
+
+// Little Endian or Big Endian ?
+// Override the #define below if you know your architecture's endianness
+#if defined (__GLIBC__)
+# include <endian.h>
+# if (__BYTE_ORDER == __BIG_ENDIAN)
+# define LZ4_BIG_ENDIAN 1
+# endif
+#elif (defined(__BIG_ENDIAN__) || defined(__BIG_ENDIAN) || defined(_BIG_ENDIAN)) && !(defined(__LITTLE_ENDIAN__) || defined(__LITTLE_ENDIAN) || defined(_LITTLE_ENDIAN))
+# define LZ4_BIG_ENDIAN 1
+#elif defined(__sparc) || defined(__sparc__) \
+ || defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) \
+ || defined(__hpux) || defined(__hppa) \
+ || defined(_MIPSEB) || defined(__s390__)
+# define LZ4_BIG_ENDIAN 1
+#else
+// Little Endian assumed. PDP Endian and other very rare endian format are unsupported.
+#endif
+
+// Unaligned memory access is automatically enabled for "common" CPU, such as x86.
+// For other CPUs, such as ARM, the compiler may be more cautious, inserting unnecessary extra code to ensure aligned access property
+// If you know your target CPU supports unaligned memory access, you want to force this option manually to improve performance
+#if defined(__ARM_FEATURE_UNALIGNED)
+# define LZ4_FORCE_UNALIGNED_ACCESS 1
+#endif
+
+// Define this parameter if your target system or compiler does not support hardware bit count
+#if defined(_MSC_VER) && defined(_WIN32_WCE) // Visual Studio for Windows CE does not support Hardware bit count
+# define LZ4_FORCE_SW_BITCOUNT
+#endif
+
+// BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE :
+// This option may provide a small boost to performance for some big endian cpu, although probably modest.
+// You may set this option to 1 if data will remain within closed environment.
+// This option is useless on Little_Endian CPU (such as x86)
+//#define BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1
+
+
+//**************************************
+// Compiler Options
+//**************************************
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) // C99
+/* "restrict" is a known keyword */
+#else
+# define restrict // Disable restrict
+#endif
+
+#ifdef _MSC_VER // Visual Studio
+# define FORCE_INLINE static __forceinline
+# include <intrin.h> // For Visual 2005
+# if LZ4_ARCH64 // 64-bits
+# pragma intrinsic(_BitScanForward64) // For Visual 2005
+# pragma intrinsic(_BitScanReverse64) // For Visual 2005
+# else // 32-bits
+# pragma intrinsic(_BitScanForward) // For Visual 2005
+# pragma intrinsic(_BitScanReverse) // For Visual 2005
+# endif
+# pragma warning(disable : 4127) // disable: C4127: conditional expression is constant
+#else
+# ifdef __GNUC__
+# define FORCE_INLINE static inline __attribute__((always_inline))
+# else
+# define FORCE_INLINE static inline
+# endif
+#endif
+
+#ifdef _MSC_VER
+# define lz4_bswap16(x) _byteswap_ushort(x)
+#else
+# define lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | (((x) & 0xffu) << 8)))
+#endif
+
+#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+#if (GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__)
+# define expect(expr,value) (__builtin_expect ((expr),(value)) )
+#else
+# define expect(expr,value) (expr)
+#endif
+
+#define likely(expr) expect((expr) != 0, 1)
+#define unlikely(expr) expect((expr) != 0, 0)
+
+
+//**************************************
+// Memory routines
+//**************************************
+#include <stdlib.h> // malloc, calloc, free
+#define ALLOCATOR(n,s) calloc(n,s)
+#define FREEMEM free
+#include <string.h> // memset, memcpy
+#define MEM_INIT memset
+
+
+//**************************************
+// Includes
+//**************************************
+#include "lz4.h"
+
+
+//**************************************
+// Basic Types
+//**************************************
+#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L // C99
+# include <stdint.h>
+ typedef uint8_t BYTE;
+ typedef uint16_t U16;
+ typedef uint32_t U32;
+ typedef int32_t S32;
+ typedef uint64_t U64;
+#else
+ typedef unsigned char BYTE;
+ typedef unsigned short U16;
+ typedef unsigned int U32;
+ typedef signed int S32;
+ typedef unsigned long long U64;
+#endif
+
+#if defined(__GNUC__) && !defined(LZ4_FORCE_UNALIGNED_ACCESS)
+# define _PACKED __attribute__ ((packed))
+#else
+# define _PACKED
+#endif
+
+#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__)
+# if defined(__IBMC__) || defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+# pragma pack(1)
+# else
+# pragma pack(push, 1)
+# endif
+#endif
+
+typedef struct { U16 v; } _PACKED U16_S;
+typedef struct { U32 v; } _PACKED U32_S;
+typedef struct { U64 v; } _PACKED U64_S;
+typedef struct {size_t v;} _PACKED size_t_S;
+
+#if !defined(LZ4_FORCE_UNALIGNED_ACCESS) && !defined(__GNUC__)
+# if defined(__SUNPRO_C) || defined(__SUNPRO_CC)
+# pragma pack(0)
+# else
+# pragma pack(pop)
+# endif
+#endif
+
+#define A16(x) (((U16_S *)(x))->v)
+#define A32(x) (((U32_S *)(x))->v)
+#define A64(x) (((U64_S *)(x))->v)
+#define AARCH(x) (((size_t_S *)(x))->v)
+
+
+//**************************************
+// Constants
+//**************************************
+#define LZ4_HASHLOG (MEMORY_USAGE-2)
+#define HASHTABLESIZE (1 << MEMORY_USAGE)
+#define HASHNBCELLS4 (1 << LZ4_HASHLOG)
+
+#define MINMATCH 4
+
+#define COPYLENGTH 8
+#define LASTLITERALS 5
+#define MFLIMIT (COPYLENGTH+MINMATCH)
+const int LZ4_minLength = (MFLIMIT+1);
+
+#define LZ4_64KLIMIT ((1<<16) + (MFLIMIT-1))
+#define SKIPSTRENGTH 6 // Increasing this value will make the compression run slower on incompressible data
+
+#define MAXD_LOG 16
+#define MAX_DISTANCE ((1 << MAXD_LOG) - 1)
+
+#define ML_BITS 4
+#define ML_MASK ((1U<<ML_BITS)-1)
+#define RUN_BITS (8-ML_BITS)
+#define RUN_MASK ((1U<<RUN_BITS)-1)
+
+#define KB *(1U<<10)
+#define MB *(1U<<20)
+#define GB *(1U<<30)
+
+
+//**************************************
+// Structures and local types
+//**************************************
+
+typedef struct {
+ U32 hashTable[HASHNBCELLS4];
+ const BYTE* bufferStart;
+ const BYTE* base;
+ const BYTE* nextBlock;
+} LZ4_Data_Structure;
+
+typedef enum { notLimited = 0, limited = 1 } limitedOutput_directive;
+typedef enum { byPtr, byU32, byU16 } tableType_t;
+
+typedef enum { noPrefix = 0, withPrefix = 1 } prefix64k_directive;
+
+typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive;
+typedef enum { full = 0, partial = 1 } earlyEnd_directive;
+
+
+//**************************************
+// Architecture-specific macros
+//**************************************
+#define STEPSIZE sizeof(size_t)
+#define LZ4_COPYSTEP(d,s) { AARCH(d) = AARCH(s); d+=STEPSIZE; s+=STEPSIZE; }
+#define LZ4_COPY8(d,s) { LZ4_COPYSTEP(d,s); if (STEPSIZE<8) LZ4_COPYSTEP(d,s); }
+#define LZ4_SECURECOPY(d,s,e) { if ((STEPSIZE==4)||(d<e)) LZ4_WILDCOPY(d,s,e); }
+
+#if LZ4_ARCH64 // 64-bit
+# define HTYPE U32
+# define INITBASE(base) const BYTE* const base = ip
+#else // 32-bit
+# define HTYPE const BYTE*
+# define INITBASE(base) const int base = 0
+#endif
+
+#if (defined(LZ4_BIG_ENDIAN) && !defined(BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE))
+# define LZ4_READ_LITTLEENDIAN_16(d,s,p) { U16 v = A16(p); v = lz4_bswap16(v); d = (s) - v; }
+# define LZ4_WRITE_LITTLEENDIAN_16(p,i) { U16 v = (U16)(i); v = lz4_bswap16(v); A16(p) = v; p+=2; }
+#else // Little Endian
+# define LZ4_READ_LITTLEENDIAN_16(d,s,p) { d = (s) - A16(p); }
+# define LZ4_WRITE_LITTLEENDIAN_16(p,v) { A16(p) = v; p+=2; }
+#endif
+
+
+//**************************************
+// Macros
+//**************************************
+#define LZ4_WILDCOPY(d,s,e) { do { LZ4_COPY8(d,s) } while (d<e); } // at the end, d>=e;
+
+
+//****************************
+// Private functions
+//****************************
+// LZ4_NbCommonBytes :
+// Converts a bit position inside 'val' into a byte count (bit index >> 3).
+// Big endian builds count from the most significant end (leading zero bits),
+// little endian builds from the least significant end (trailing zero bits).
+// The match-length loop calls it with the XOR of two machine words, and only
+// when that XOR is non-zero; the result is added to the input pointer there.
+// Three implementations are selected at compile time : MSVC _BitScan*
+// intrinsics, GCC __builtin_clz* / __builtin_ctz*, or a portable software
+// fallback (shifts on big endian, De Bruijn lookup table on little endian).
+#if LZ4_ARCH64
+
+// 64-bit variant : 'val' is an 8-byte word, result is in the range 0..7
+FORCE_INLINE int LZ4_NbCommonBytes (register U64 val)
+{
+# if defined(LZ4_BIG_ENDIAN)
+# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
+ unsigned long r = 0;
+ _BitScanReverse64( &r, val );
+ return (int)(r>>3);
+# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
+ return (__builtin_clzll(val) >> 3);
+# else
+ int r;
+ if (!(val>>32)) { r=4; } else { r=0; val>>=32; }
+ if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; }
+ r += (!val);
+ return r;
+# endif
+# else
+# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
+ unsigned long r = 0;
+ _BitScanForward64( &r, val );
+ return (int)(r>>3);
+# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
+ return (__builtin_ctzll(val) >> 3);
+# else
+ // (val & -val) isolates the lowest set bit; multiplying by the De Bruijn
+ // constant and keeping the top 6 bits yields the table index
+ static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5, 3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5, 5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4, 4, 5, 7, 2, 6, 5, 7, 6, 7, 7 };
+ return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58];
+# endif
+# endif
+}
+
+#else
+
+// 32-bit variant : 'val' is a 4-byte word, result is in the range 0..3
+FORCE_INLINE int LZ4_NbCommonBytes (register U32 val)
+{
+# if defined(LZ4_BIG_ENDIAN)
+# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
+ unsigned long r = 0;
+ _BitScanReverse( &r, val );
+ return (int)(r>>3);
+# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
+ return (__builtin_clz(val) >> 3);
+# else
+ int r;
+ if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
+ r += (!val);
+ return r;
+# endif
+# else
+# if defined(_MSC_VER) && !defined(LZ4_FORCE_SW_BITCOUNT)
+ unsigned long r;
+ _BitScanForward( &r, val );
+ return (int)(r>>3);
+# elif defined(__GNUC__) && (GCC_VERSION >= 304) && !defined(LZ4_FORCE_SW_BITCOUNT)
+ return (__builtin_ctz(val) >> 3);
+# else
+ // Same lowest-set-bit / De Bruijn lookup as the 64-bit variant, with a
+ // 32-entry table indexed by the top 5 bits of the product
+ static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, 3, 2, 2, 1, 3, 2, 0, 1, 3, 3, 1, 2, 2, 2, 2, 0, 3, 1, 2, 0, 1, 0, 1, 1 };
+ return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27];
+# endif
+# endif
+}
+
+#endif
+
+
+//****************************
+// Compression functions
+//****************************
+// Hash a 4-byte sequence : multiply by a fixed constant and keep the top
+// bits of the 32-bit product. byU16 tables use one more hash bit
+// (LZ4_HASHLOG+1) than the other table types (LZ4_HASHLOG).
+FORCE_INLINE int LZ4_hashSequence(U32 sequence, tableType_t tableType)
+{
+    const int hashBits = (tableType == byU16) ? (LZ4_HASHLOG+1) : LZ4_HASHLOG;
+
+    return ((sequence * 2654435761U) >> ((MINMATCH*8) - hashBits));
+}
+
+// Hash of the 4 bytes located at p (read through the A32 accessor).
+FORCE_INLINE int LZ4_hashPosition(const BYTE* p, tableType_t tableType)
+{
+    const U32 sequence = A32(p);
+
+    return LZ4_hashSequence(sequence, tableType);
+}
+
+// Record position p in slot h of the hash table.
+// What is stored depends on the table type : the pointer itself (byPtr),
+// or the offset of p from srcBase truncated to 32 bits (byU32) or
+// 16 bits (byU16).
+FORCE_INLINE void LZ4_putPositionOnHash(const BYTE* p, U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase)
+{
+    if (tableType == byPtr)
+    {
+        const BYTE** ptrTable = (const BYTE**) tableBase;
+        ptrTable[h] = p;
+    }
+    else if (tableType == byU32)
+    {
+        U32* offsetTable32 = (U32*) tableBase;
+        offsetTable32[h] = (U32)(p-srcBase);
+    }
+    else if (tableType == byU16)
+    {
+        U16* offsetTable16 = (U16*) tableBase;
+        offsetTable16[h] = (U16)(p-srcBase);
+    }
+}
+
+// Hash the bytes at p, then record p in the matching hash table slot.
+FORCE_INLINE void LZ4_putPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase)
+{
+    const U32 slot = LZ4_hashPosition(p, tableType);
+
+    LZ4_putPositionOnHash(p, slot, tableBase, tableType, srcBase);
+}
+
+// Fetch the position stored in slot h of the hash table.
+// byPtr tables hold pointers; byU32 tables hold offsets that are added
+// back to srcBase; every other table type is read as U16 offsets.
+FORCE_INLINE const BYTE* LZ4_getPositionOnHash(U32 h, void* tableBase, tableType_t tableType, const BYTE* srcBase)
+{
+    switch (tableType)
+    {
+    case byPtr:
+        return ((const BYTE**) tableBase)[h];
+    case byU32:
+        return ((U32*) tableBase)[h] + srcBase;
+    default:   // byU16; also guarantees a return on every path
+        return ((U16*) tableBase)[h] + srcBase;
+    }
+}
+
+// Hash the bytes at p, then return the position stored in that slot.
+FORCE_INLINE const BYTE* LZ4_getPosition(const BYTE* p, void* tableBase, tableType_t tableType, const BYTE* srcBase)
+{
+    const U32 slot = LZ4_hashPosition(p, tableType);
+
+    return LZ4_getPositionOnHash(slot, tableBase, tableType, srcBase);
+}
+
+
+// LZ4_compress_generic :
+// Single compression routine behind all public LZ4_compress* entry points;
+// the last three parameters select the variant.
+//   ctx           : hash table memory. When prefix==withPrefix it is accessed
+//                   as a LZ4_Data_Structure (base, bufferStart, nextBlock).
+//   source        : input data, inputSize bytes
+//   dest          : output buffer
+//   maxOutputSize : capacity of dest; only checked when limitedOutput is set
+//   limitedOutput : limited => return 0 instead of writing past maxOutputSize
+//   tableType     : how positions are stored in the hash table
+//   prefix        : withPrefix => source must start at ctx->nextBlock
+// Returns the number of bytes written into dest, or 0 when
+//   - inputSize is larger than LZ4_MAX_INPUT_SIZE (or negative),
+//   - prefix==withPrefix and source does not continue the previous block,
+//   - tableType==byU16 and inputSize >= LZ4_64KLIMIT,
+//   - limitedOutput is set and the output would not fit.
+// Inputs shorter than LZ4_minLength are emitted as literals only.
+FORCE_INLINE int LZ4_compress_generic(
+ void* ctx,
+ const char* source,
+ char* dest,
+ int inputSize,
+ int maxOutputSize,
+
+ limitedOutput_directive limitedOutput,
+ tableType_t tableType,
+ prefix64k_directive prefix)
+{
+ const BYTE* ip = (const BYTE*) source;
+ const BYTE* const base = (prefix==withPrefix) ? ((LZ4_Data_Structure*)ctx)->base : (const BYTE*) source;
+ const BYTE* const lowLimit = ((prefix==withPrefix) ? ((LZ4_Data_Structure*)ctx)->bufferStart : (const BYTE*)source);
+ const BYTE* anchor = (const BYTE*) source;
+ const BYTE* const iend = ip + inputSize;
+ const BYTE* const mflimit = iend - MFLIMIT;
+ const BYTE* const matchlimit = iend - LASTLITERALS;
+
+ BYTE* op = (BYTE*) dest;
+ BYTE* const oend = op + maxOutputSize;
+
+ int length;
+ const int skipStrength = SKIPSTRENGTH;
+ U32 forwardH;
+
+ // Init conditions
+ if ((U32)inputSize > (U32)LZ4_MAX_INPUT_SIZE) return 0; // Unsupported input size, too large (or negative)
+ if ((prefix==withPrefix) && (ip != ((LZ4_Data_Structure*)ctx)->nextBlock)) return 0; // must continue from end of previous block
+ if (prefix==withPrefix) ((LZ4_Data_Structure*)ctx)->nextBlock=iend; // do it now, due to potential early exit
+ if ((tableType == byU16) && (inputSize>=LZ4_64KLIMIT)) return 0; // Size too large (not within 64K limit)
+ if (inputSize<LZ4_minLength) goto _last_literals; // Input too small, no compression (all literals)
+
+ // First Byte
+ LZ4_putPosition(ip, ctx, tableType, base);
+ ip++; forwardH = LZ4_hashPosition(ip, tableType);
+
+ // Main Loop
+ for ( ; ; )
+ {
+ int findMatchAttempts = (1U << skipStrength) + 3;
+ const BYTE* forwardIp = ip;
+ const BYTE* ref;
+ BYTE* token;
+
+ // Find a match
+ // The step between probes is findMatchAttempts >> skipStrength, so it
+ // grows as more attempts fail. The search stops when the candidate is
+ // within MAX_DISTANCE and its first 4 bytes equal those at ip.
+ do {
+ U32 h = forwardH;
+ int step = findMatchAttempts++ >> skipStrength;
+ ip = forwardIp;
+ forwardIp = ip + step;
+
+ if unlikely(forwardIp > mflimit) { goto _last_literals; }
+
+ forwardH = LZ4_hashPosition(forwardIp, tableType);
+ ref = LZ4_getPositionOnHash(h, ctx, tableType, base);
+ LZ4_putPositionOnHash(ip, h, ctx, tableType, base);
+
+ } while ((ref + MAX_DISTANCE < ip) || (A32(ref) != A32(ip)));
+
+ // Catch up
+ // Extend the match backwards while the preceding bytes are equal too
+ while ((ip>anchor) && (ref > lowLimit) && unlikely(ip[-1]==ref[-1])) { ip--; ref--; }
+
+ // Encode Literal length
+ // High bits of the token hold the literal length; lengths >= RUN_MASK
+ // continue in extra bytes of value 255 followed by the remainder
+ length = (int)(ip - anchor);
+ token = op++;
+ if ((limitedOutput) && unlikely(op + length + (2 + 1 + LASTLITERALS) + (length/255) > oend)) return 0; // Check output limit
+ if (length>=(int)RUN_MASK)
+ {
+ int len = length-RUN_MASK;
+ *token=(RUN_MASK<<ML_BITS);
+ for(; len >= 255 ; len-=255) *op++ = 255;
+ *op++ = (BYTE)len;
+ }
+ else *token = (BYTE)(length<<ML_BITS);
+
+ // Copy Literals
+ { BYTE* end=(op)+(length); LZ4_WILDCOPY(op,anchor,end); op=end; }
+
+_next_match:
+ // Encode Offset
+ LZ4_WRITE_LITTLEENDIAN_16(op,(U16)(ip-ref));
+
+ // Start Counting
+ ip+=MINMATCH; ref+=MINMATCH; // MinMatch already verified
+ anchor = ip;
+ // Compare one machine word at a time; on the first difference
+ // LZ4_NbCommonBytes gives the number of bytes that still matched
+ while likely(ip<matchlimit-(STEPSIZE-1))
+ {
+ size_t diff = AARCH(ref) ^ AARCH(ip);
+ if (!diff) { ip+=STEPSIZE; ref+=STEPSIZE; continue; }
+ ip += LZ4_NbCommonBytes(diff);
+ goto _endCount;
+ }
+ if (LZ4_ARCH64) if ((ip<(matchlimit-3)) && (A32(ref) == A32(ip))) { ip+=4; ref+=4; }
+ if ((ip<(matchlimit-1)) && (A16(ref) == A16(ip))) { ip+=2; ref+=2; }
+ if ((ip<matchlimit) && (*ref == *ip)) ip++;
+_endCount:
+
+ // Encode MatchLength
+ // Low bits of the token hold the match length beyond MINMATCH;
+ // lengths >= ML_MASK continue in extra bytes like literal lengths
+ length = (int)(ip - anchor);
+ if ((limitedOutput) && unlikely(op + (1 + LASTLITERALS) + (length>>8) > oend)) return 0; // Check output limit
+ if (length>=(int)ML_MASK)
+ {
+ *token += ML_MASK;
+ length -= ML_MASK;
+ for (; length > 509 ; length-=510) { *op++ = 255; *op++ = 255; }
+ if (length >= 255) { length-=255; *op++ = 255; }
+ *op++ = (BYTE)length;
+ }
+ else *token += (BYTE)(length);
+
+ // Test end of chunk
+ if (ip > mflimit) { anchor = ip; break; }
+
+ // Fill table
+ LZ4_putPosition(ip-2, ctx, tableType, base);
+
+ // Test next position
+ // An immediate new match is encoded with a zero literal length token
+ ref = LZ4_getPosition(ip, ctx, tableType, base);
+ LZ4_putPosition(ip, ctx, tableType, base);
+ if ((ref + MAX_DISTANCE >= ip) && (A32(ref) == A32(ip))) { token = op++; *token=0; goto _next_match; }
+
+ // Prepare next loop
+ anchor = ip++;
+ forwardH = LZ4_hashPosition(ip, tableType);
+ }
+
+_last_literals:
+ // Encode Last Literals
+ // Everything from anchor to the end of input is stored uncompressed
+ {
+ int lastRun = (int)(iend - anchor);
+ if ((limitedOutput) && (((char*)op - dest) + lastRun + 1 + ((lastRun+255-RUN_MASK)/255) > (U32)maxOutputSize)) return 0; // Check output limit
+ if (lastRun>=(int)RUN_MASK) { *op++=(RUN_MASK<<ML_BITS); lastRun-=RUN_MASK; for(; lastRun >= 255 ; lastRun-=255) *op++ = 255; *op++ = (BYTE) lastRun; }
+ else *op++ = (BYTE)(lastRun<<ML_BITS);
+ memcpy(op, anchor, iend - anchor);
+ op += iend-anchor;
+ }
+
+ // End
+ return (int) (((char*)op)-dest);
+}
+
+
+// LZ4_compress :
+// Compress inputSize bytes from source into dest without any output limit;
+// the caller must provide a dest buffer that is large enough.
+// The hash table lives on the stack, or on the heap when HEAPMODE is set.
+// Inputs below LZ4_64KLIMIT use a U16 table, larger ones a U32 table
+// (8-byte pointers) or a pointer table (otherwise).
+// Returns the number of bytes written into dest, or 0 on failure
+// (including a failed hash table allocation in HEAPMODE).
+int LZ4_compress(const char* source, char* dest, int inputSize)
+{
+#if (HEAPMODE)
+    void* ctx = ALLOCATOR(HASHNBCELLS4, 4);   // Aligned on 4-bytes boundaries
+#else
+    U32 ctx[1U<<(MEMORY_USAGE-2)] = {0};      // Ensure data is aligned on 4-bytes boundaries
+#endif
+    int result;
+
+#if (HEAPMODE)
+    if (ctx == NULL) return 0;                // Allocation failed : report it as a compression failure
+#endif
+
+    if (inputSize < (int)LZ4_64KLIMIT)
+        result = LZ4_compress_generic((void*)ctx, source, dest, inputSize, 0, notLimited, byU16, noPrefix);
+    else
+        result = LZ4_compress_generic((void*)ctx, source, dest, inputSize, 0, notLimited, (sizeof(void*)==8) ? byU32 : byPtr, noPrefix);
+
+#if (HEAPMODE)
+    FREEMEM(ctx);
+#endif
+    return result;
+}
+
+// LZ4_compress_continue :
+// Variant that compresses the next block using the state in LZ4_Data.
+// Forwards to LZ4_compress_generic with no output limit, a U32 hash table
+// and withPrefix, so LZ4_Data is accessed as a LZ4_Data_Structure and source
+// must start where the previous block ended (otherwise 0 is returned).
+int LZ4_compress_continue (void* LZ4_Data, const char* source, char* dest, int inputSize)
+{
+ return LZ4_compress_generic(LZ4_Data, source, dest, inputSize, 0, notLimited, byU32, withPrefix);
+}
+
+
+// LZ4_compress_limitedOutput :
+// Compress inputSize bytes from source into dest, writing at most
+// maxOutputSize bytes.
+// The hash table lives on the stack, or on the heap when HEAPMODE is set.
+// Inputs below LZ4_64KLIMIT use a U16 table, larger ones a U32 table
+// (8-byte pointers) or a pointer table (otherwise).
+// Returns the number of bytes written into dest, or 0 when the compressed
+// data does not fit or on any other failure (including a failed hash table
+// allocation in HEAPMODE).
+int LZ4_compress_limitedOutput(const char* source, char* dest, int inputSize, int maxOutputSize)
+{
+#if (HEAPMODE)
+    void* ctx = ALLOCATOR(HASHNBCELLS4, 4);   // Aligned on 4-bytes boundaries
+#else
+    U32 ctx[1U<<(MEMORY_USAGE-2)] = {0};      // Ensure data is aligned on 4-bytes boundaries
+#endif
+    int result;
+
+#if (HEAPMODE)
+    if (ctx == NULL) return 0;                // Allocation failed : report it as a compression failure
+#endif
+
+    if (inputSize < (int)LZ4_64KLIMIT)
+        result = LZ4_compress_generic((void*)ctx, source, dest, inputSize, maxOutputSize, limited, byU16, noPrefix);
+    else
+        result = LZ4_compress_generic((void*)ctx, source, dest, inputSize, maxOutputSize, limited, (sizeof(void*)==8) ? byU32 : byPtr, noPrefix);
+
+#if (HEAPMODE)
+    FREEMEM(ctx);
+#endif
+    return result;
+}
+
+// LZ4_compress_limitedOutput_continue :
+// Same as LZ4_compress_continue, but the output is limited to maxOutputSize
+// bytes : forwards to LZ4_compress_generic with limited, a U32 hash table
+// and withPrefix. Returns 0 when the compressed data does not fit.
+int LZ4_compress_limitedOutput_continue (void* LZ4_Data, const char* source, char* dest, int inputSize, int maxOutputSize)
+{
+ return LZ4_compress_generic(LZ4_Data, source, dest, inputSize, maxOutputSize, limited, byU32, withPrefix);
+}
+
+
+//****************************
+// Stream functions
+//****************************
+
+// Reset a LZ4_Data_Structure : all three buffer pointers are set to 'base'
+// and every hash table cell is cleared.
+FORCE_INLINE void LZ4_init(LZ4_Data_Structure* lz4ds, const BYTE* base)
+{
+    lz4ds->nextBlock   = base;
+    lz4ds->base        = base;
+    lz4ds->bufferStart = base;
+    MEM_INIT(lz4ds->hashTable, 0, sizeof(lz4ds->hashTable));
+}
+
+
+// LZ4_create :
+// Allocate a LZ4_Data_Structure and initialise it so that inputBuffer is
+// the start of the input buffer.
+// Returns the new structure, to be released with LZ4_free(), or NULL if
+// the allocation failed.
+void* LZ4_create (const char* inputBuffer)
+{
+    LZ4_Data_Structure* lz4ds = (LZ4_Data_Structure*) ALLOCATOR(1, sizeof(LZ4_Data_Structure));
+
+    if (lz4ds == NULL) return NULL;   // Out of memory : nothing to initialise
+
+    LZ4_init (lz4ds, (const BYTE*)inputBuffer);
+    return lz4ds;
+}
+
+
+// Releases a stream state (the pointer returned by LZ4_create()) via FREEMEM.
+// Always returns 0.
+int LZ4_free (void* LZ4_Data)
+{
+ FREEMEM(LZ4_Data);
+ return (0);
+}
+
+
+// Moves the last 64 KB of already-processed input to the start of the input
+// buffer so that the next block can be appended right after it.
+// Returns the position at which the caller must place the next input block.
+// NOTE(review): both paths use memcpy(); source and destination are disjoint only
+// when nextBlock is at least 128 KB past bufferStart - presumably guaranteed by
+// the 192KB minimum buffer size documented in lz4.h; confirm.
+char* LZ4_slideInputBuffer (void* LZ4_Data)
+{
+ LZ4_Data_Structure* lz4ds = (LZ4_Data_Structure*)LZ4_Data;
+ // Distance by which the trailing 64 KB window moves back towards bufferStart
+ size_t delta = lz4ds->nextBlock - (lz4ds->bufferStart + 64 KB);
+
+ // Rebase path: taken when shifting 'base' back by delta would wrap around,
+ // or when offsets relative to 'base' are getting close to the 32-bits limit.
+ if ( (lz4ds->base - delta > lz4ds->base) // underflow control
+ || ((size_t)(lz4ds->nextBlock - lz4ds->base) > 0xE0000000) ) // close to 32-bits limit
+ {
+ // Offset (relative to base) of the start of the 64 KB window that is kept
+ size_t deltaLimit = (lz4ds->nextBlock - 64 KB) - lz4ds->base;
+ int nH;
+
+ // Entries below the kept window are cleared; the others are shifted so
+ // that they stay valid once 'base' is reset to bufferStart below.
+ for (nH=0; nH < HASHNBCELLS4; nH++)
+ {
+ if ((size_t)(lz4ds->hashTable[nH]) < deltaLimit) lz4ds->hashTable[nH] = 0;
+ else lz4ds->hashTable[nH] -= (U32)deltaLimit;
+ }
+ memcpy((void*)(lz4ds->bufferStart), (const void*)(lz4ds->nextBlock - 64 KB), 64 KB);
+ lz4ds->base = lz4ds->bufferStart;
+ lz4ds->nextBlock = lz4ds->base + 64 KB;
+ }
+ else
+ {
+ // Normal path: copy the window and move base and nextBlock back by the
+ // same amount; the hash table is left untouched.
+ memcpy((void*)(lz4ds->bufferStart), (const void*)(lz4ds->nextBlock - 64 KB), 64 KB);
+ lz4ds->nextBlock -= delta;
+ lz4ds->base -= delta;
+ }
+
+ return (char*)(lz4ds->nextBlock);
+}
+
+
+//****************************
+// Decompression functions
+//****************************
+
+// This generic decompression function covers all use cases.
+// It shall be instantiated several times, using different sets of directives.
+// Note that it is essential this generic function is really inlined,
+// in order to remove useless branches during compilation optimisation.
+//
+// Return value :
+//   - endOnInput set     : number of bytes written into 'dest'
+//   - endOnInput not set : number of bytes read from 'source'
+//   - on error           : a negative value, -(number of input bytes consumed) - 1
+FORCE_INLINE int LZ4_decompress_generic(
+ const char* source,
+ char* dest,
+ int inputSize, // Size of the compressed input; only consulted when endOnInput is set
+ int outputSize, // If endOnInput==endOnInputSize, this value is the max size of Output Buffer.
+
+ int endOnInput, // endOnOutputSize, endOnInputSize
+ int prefix64k, // noPrefix, withPrefix
+ int partialDecoding, // full, partial
+ int targetOutputSize // only used if partialDecoding==partial
+ )
+{
+ // Local Variables
+ const BYTE* restrict ip = (const BYTE*) source;
+ const BYTE* ref;
+ const BYTE* const iend = ip + inputSize;
+
+ BYTE* op = (BYTE*) dest;
+ BYTE* const oend = op + outputSize;
+ BYTE* cpy;
+ BYTE* oexit = op + targetOutputSize;
+
+ // Correction tables applied to 'ref' when the match distance (op-ref) is smaller than STEPSIZE
+ const size_t dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; // static reduces speed for LZ4_decompress_safe() on GCC64
+ static const size_t dec64table[] = {0, 0, 0, (size_t)-1, 0, 1, 2, 3};
+
+
+ // Special cases
+ if ((partialDecoding) && (oexit> oend-MFLIMIT)) oexit = oend-MFLIMIT; // targetOutputSize too high => decode everything
+ if ((endOnInput) && unlikely(outputSize==0)) return ((inputSize==1) && (*ip==0)) ? 0 : -1; // Empty output buffer
+ // Empty output, size-driven mode : valid only if the first input byte is 0 (1 byte read)
+ if ((!endOnInput) && unlikely(outputSize==0)) return (*ip==0?1:-1);
+
+
+ // Main Loop
+ while (1)
+ {
+ unsigned token;
+ size_t length;
+
+ // get runlength
+ // High bits of the token hold the literal length; RUN_MASK means that
+ // extra length bytes follow, each added to the length until one is != 255
+ token = *ip++;
+ if ((length=(token>>ML_BITS)) == RUN_MASK)
+ {
+ unsigned s=255;
+ while (((endOnInput)?ip<iend:1) && (s==255))
+ {
+ s = *ip++;
+ length += s;
+ }
+ }
+
+ // copy literals
+ cpy = op+length;
+ // Close to the end of the input or output buffer : validate, copy exactly
+ // 'length' bytes with memcpy and leave the loop
+ if (((endOnInput) && ((cpy>(partialDecoding?oexit:oend-MFLIMIT)) || (ip+length>iend-(2+1+LASTLITERALS))) )
+ || ((!endOnInput) && (cpy>oend-COPYLENGTH)))
+ {
+ if (partialDecoding)
+ {
+ if (cpy > oend) goto _output_error; // Error : write attempt beyond end of output buffer
+ if ((endOnInput) && (ip+length > iend)) goto _output_error; // Error : read attempt beyond end of input buffer
+ }
+ else
+ {
+ if ((!endOnInput) && (cpy != oend)) goto _output_error; // Error : block decoding must stop exactly there
+ if ((endOnInput) && ((ip+length != iend) || (cpy > oend))) goto _output_error; // Error : input must be consumed
+ }
+ memcpy(op, ip, length);
+ ip += length;
+ op += length;
+ break; // Necessarily EOF, due to parsing restrictions
+ }
+ LZ4_WILDCOPY(op, ip, cpy); ip -= (op-cpy); op = cpy;
+
+ // get offset
+ LZ4_READ_LITTLEENDIAN_16(ref,cpy,ip); ip+=2;
+ // The "match starts before dest" check is only performed in noPrefix mode
+ if ((prefix64k==noPrefix) && unlikely(ref < (BYTE* const)dest)) goto _output_error; // Error : offset outside destination buffer
+
+ // get matchlength
+ // Low bits of the token hold the match length; ML_MASK means that extra
+ // length bytes follow, each added to the length until one is != 255
+ if ((length=(token&ML_MASK)) == ML_MASK)
+ {
+ while ((!endOnInput) || (ip<iend-(LASTLITERALS+1))) // Ensure enough bytes remain for LASTLITERALS + token
+ {
+ unsigned s = *ip++;
+ length += s;
+ if (s==255) continue;
+ break;
+ }
+ }
+
+ // copy repeated sequence
+ // Match distance smaller than STEPSIZE : copy the first bytes one by one
+ // and adjust 'ref' with the correction tables before the wider copies
+ if unlikely((op-ref)<(int)STEPSIZE)
+ {
+ const size_t dec64 = dec64table[(sizeof(void*)==4) ? 0 : op-ref];
+ op[0] = ref[0];
+ op[1] = ref[1];
+ op[2] = ref[2];
+ op[3] = ref[3];
+ op += 4, ref += 4; ref -= dec32table[op-ref];
+ A32(op) = A32(ref);
+ op += STEPSIZE-4; ref -= dec64;
+ } else { LZ4_COPYSTEP(op,ref); }
+ cpy = op + length - (STEPSIZE-4);
+
+ // Match ends close to the end of the output buffer : bounded copy up to
+ // oend-COPYLENGTH, then finish byte by byte
+ if unlikely(cpy>oend-COPYLENGTH-(STEPSIZE-4))
+ {
+ if (cpy > oend-LASTLITERALS) goto _output_error; // Error : last 5 bytes must be literals
+ LZ4_SECURECOPY(op, ref, (oend-COPYLENGTH));
+ while(op<cpy) *op++=*ref++;
+ op=cpy;
+ continue;
+ }
+ LZ4_WILDCOPY(op, ref, cpy);
+ op=cpy; // correction
+ }
+
+ // end of decoding
+ if (endOnInput)
+ return (int) (((char*)op)-dest); // Nb of output bytes decoded
+ else
+ return (int) (((char*)ip)-source); // Nb of input bytes read
+
+ // Overflow error detected
+ // Negative result : -(number of input bytes consumed so far) - 1
+_output_error:
+ return (int) (-(((char*)ip)-source))-1;
+}
+
+
+// Decodes a block of 'inputSize' bytes into 'dest' (capacity 'maxOutputSize').
+// Instantiates LZ4_decompress_generic() with endOnInputSize, noPrefix and full decoding.
+// Returns the number of bytes written into 'dest', or a negative value on error.
+int LZ4_decompress_safe(const char* source, char* dest, int inputSize, int maxOutputSize)
+{
+ return LZ4_decompress_generic(source, dest, inputSize, maxOutputSize, endOnInputSize, noPrefix, full, 0);
+}
+
+// Same as LZ4_decompress_safe(), but instantiated in withPrefix mode : the generic
+// decoder then does not reject matches that start before 'dest'.
+// Returns the number of bytes written into 'dest', or a negative value on error.
+int LZ4_decompress_safe_withPrefix64k(const char* source, char* dest, int inputSize, int maxOutputSize)
+{
+ return LZ4_decompress_generic(source, dest, inputSize, maxOutputSize, endOnInputSize, withPrefix, full, 0);
+}
+
+// Partial decoding : instantiates LZ4_decompress_generic() with endOnInputSize,
+// noPrefix and partial mode, passing 'targetOutputSize' as the position after
+// which decoding may stop; 'maxOutputSize' is the capacity of 'dest'.
+// Returns the number of bytes written into 'dest', or a negative value on error.
+int LZ4_decompress_safe_partial(const char* source, char* dest, int inputSize, int targetOutputSize, int maxOutputSize)
+{
+ return LZ4_decompress_generic(source, dest, inputSize, maxOutputSize, endOnInputSize, noPrefix, partial, targetOutputSize);
+}
+
+// Size-driven decoding : 'outputSize' is the exact decompressed size; no input
+// size is known (0 is passed), so the generic decoder performs no input bound checks.
+// Instantiated with endOnOutputSize, withPrefix and full decoding.
+// Returns the number of bytes read from 'source', or a negative value on error.
+int LZ4_decompress_fast_withPrefix64k(const char* source, char* dest, int outputSize)
+{
+ return LZ4_decompress_generic(source, dest, 0, outputSize, endOnOutputSize, withPrefix, full, 0);
+}
+
+// Size-driven decoding : 'outputSize' is the exact decompressed size; no input
+// size is known (0 is passed), so the generic decoder performs no input bound checks.
+// Returns the number of bytes read from 'source', or a negative value on error.
+// Note : only the _MSC_VER build uses the noPrefix instantiation; every other
+// build uses withPrefix, for which the generic decoder skips its
+// "match starts before dest" check.
+int LZ4_decompress_fast(const char* source, char* dest, int outputSize)
+{
+#ifdef _MSC_VER // This version is faster with Visual
+ return LZ4_decompress_generic(source, dest, 0, outputSize, endOnOutputSize, noPrefix, full, 0);
+#else
+ return LZ4_decompress_generic(source, dest, 0, outputSize, endOnOutputSize, withPrefix, full, 0);
+#endif
+}
+
diff --git a/storage/xtradb/fil/lz4.h b/storage/xtradb/fil/lz4.h
new file mode 100644
index 00000000000..9ef58862947
--- /dev/null
+++ b/storage/xtradb/fil/lz4.h
@@ -0,0 +1,205 @@
+/*
+ LZ4 - Fast LZ compression algorithm
+ Header File
+ Copyright (C) 2011-2013, Yann Collet.
+ BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are
+ met:
+
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above
+ copyright notice, this list of conditions and the following disclaimer
+ in the documentation and/or other materials provided with the
+ distribution.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ You can contact the author at :
+ - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
+ - LZ4 source repository : http://code.google.com/p/lz4/
+*/
+#pragma once
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+//**************************************
+// Compiler Options
+//**************************************
+#if defined(_MSC_VER) && !defined(__cplusplus) // Visual Studio
+# define inline __inline // Visual C is not C99, but supports some kind of inline
+#endif
+
+
+//****************************
+// Simple Functions
+//****************************
+
+int LZ4_compress (const char* source, char* dest, int inputSize);
+int LZ4_decompress_safe (const char* source, char* dest, int inputSize, int maxOutputSize);
+
+/*
+LZ4_compress() :
+ Compresses 'inputSize' bytes from 'source' into 'dest'.
+ Destination buffer must be already allocated,
+ and must be sized to handle worst cases situations (input data not compressible)
+ Worst case size evaluation is provided by function LZ4_compressBound()
+ inputSize : Max supported value is LZ4_MAX_INPUT_SIZE
+ return : the number of bytes written in buffer dest
+ or 0 if the compression fails
+
+LZ4_decompress_safe() :
+ maxOutputSize : is the size of the destination buffer (which must be already allocated)
+ return : the number of bytes decoded in the destination buffer (necessarily <= maxOutputSize)
+ If the source stream is detected malformed, the function will stop decoding and return a negative result.
+ This function is protected against buffer overflow exploits (never writes outside of output buffer, and never reads outside of input buffer). Therefore, it is protected against malicious data packets
+*/
+
+
+//****************************
+// Advanced Functions
+//****************************
+#define LZ4_MAX_INPUT_SIZE 0x7E000000 // 2 113 929 216 bytes
+#define LZ4_COMPRESSBOUND(isize) ((unsigned int)(isize) > (unsigned int)LZ4_MAX_INPUT_SIZE ? 0 : (isize) + ((isize)/255) + 16)
+static inline int LZ4_compressBound(int isize) { return LZ4_COMPRESSBOUND(isize); }
+
+/*
+LZ4_compressBound() :
+ Provides the maximum size that LZ4 may output in a "worst case" scenario (input data not compressible)
+ primarily useful for memory allocation of output buffer.
+ inline function is recommended for the general case,
+ macro is also provided when result needs to be evaluated at compilation (such as stack memory allocation).
+
+ isize : is the input size. Max supported value is LZ4_MAX_INPUT_SIZE
+ return : maximum output size in a "worst case" scenario
+ or 0, if input size is too large ( > LZ4_MAX_INPUT_SIZE)
+*/
+
+
+int LZ4_compress_limitedOutput (const char* source, char* dest, int inputSize, int maxOutputSize);
+
+/*
+LZ4_compress_limitedOutput() :
+ Compress 'inputSize' bytes from 'source' into an output buffer 'dest' of maximum size 'maxOutputSize'.
+ If it cannot achieve it, compression will stop, and result of the function will be zero.
+ This function never writes outside of provided output buffer.
+
+ inputSize : Max supported value is LZ4_MAX_INPUT_SIZE
+ maxOutputSize : is the size of the destination buffer (which must be already allocated)
+ return : the number of bytes written in buffer 'dest'
+ or 0 if the compression fails
+*/
+
+
+int LZ4_decompress_fast (const char* source, char* dest, int outputSize);
+
+/*
+LZ4_decompress_fast() :
+ outputSize : is the original (uncompressed) size
+ return : the number of bytes read from the source buffer (in other words, the compressed size)
+ If the source stream is malformed, the function will stop decoding and return a negative result.
+ note : This function is a bit faster than LZ4_decompress_safe()
+ This function never writes outside of output buffers, but may read beyond input buffer in case of malicious data packet.
+ Use this function preferably into a trusted environment (data to decode comes from a trusted source).
+ Destination buffer must be already allocated. Its size must be a minimum of 'outputSize' bytes.
+*/
+
+int LZ4_decompress_safe_partial (const char* source, char* dest, int inputSize, int targetOutputSize, int maxOutputSize);
+
+/*
+LZ4_decompress_safe_partial() :
+ This function decompress a compressed block of size 'inputSize' at position 'source'
+ into output buffer 'dest' of size 'maxOutputSize'.
+ The function tries to stop decompressing operation as soon as 'targetOutputSize' has been reached,
+ reducing decompression time.
+ return : the number of bytes decoded in the destination buffer (necessarily <= maxOutputSize)
+ Note : this number can be < 'targetOutputSize' should the compressed block to decode be smaller.
+ Always control how many bytes were decoded.
+ If the source stream is detected malformed, the function will stop decoding and return a negative result.
+ This function never writes outside of output buffer, and never reads outside of input buffer. It is therefore protected against malicious data packets
+*/
+
+
+//****************************
+// Stream Functions
+//****************************
+
+void* LZ4_create (const char* inputBuffer);
+int LZ4_compress_continue (void* LZ4_Data, const char* source, char* dest, int inputSize);
+int LZ4_compress_limitedOutput_continue (void* LZ4_Data, const char* source, char* dest, int inputSize, int maxOutputSize);
+char* LZ4_slideInputBuffer (void* LZ4_Data);
+int LZ4_free (void* LZ4_Data);
+
+/*
+These functions allow the compression of dependent blocks, where each block benefits from prior 64 KB within preceding blocks.
+In order to achieve this, it is necessary to start creating the LZ4 Data Structure, thanks to the function :
+
+void* LZ4_create (const char* inputBuffer);
+The result of the function is the (void*) pointer on the LZ4 Data Structure.
+This pointer will be needed in all other functions.
+If the pointer returned is NULL, then the allocation has failed, and compression must be aborted.
+The only parameter 'const char* inputBuffer' must, obviously, point at the beginning of input buffer.
+The input buffer must be already allocated, and size at least 192KB.
+'inputBuffer' will also be the 'const char* source' of the first block.
+
+All blocks are expected to lay next to each other within the input buffer, starting from 'inputBuffer'.
+To compress each block, use either LZ4_compress_continue() or LZ4_compress_limitedOutput_continue().
+Their behavior is identical to that of LZ4_compress() or LZ4_compress_limitedOutput(),
+but require the LZ4 Data Structure as their first argument, and check that each block starts right after the previous one.
+If next block does not begin immediately after the previous one, the compression will fail (return 0).
+
+When it's no longer possible to lay the next block after the previous one (not enough space left into input buffer), a call to :
+char* LZ4_slideInputBuffer(void* LZ4_Data);
+must be performed. It will typically copy the latest 64KB of input at the beginning of input buffer.
+Note that, for this function to work properly, minimum size of an input buffer must be 192KB.
+==> The memory position where the next input data block must start is provided as the result of the function.
+
+Compression can then resume, using LZ4_compress_continue() or LZ4_compress_limitedOutput_continue(), as usual.
+
+When compression is completed, a call to LZ4_free() will release the memory used by the LZ4 Data Structure.
+*/
+
+
+int LZ4_decompress_safe_withPrefix64k (const char* source, char* dest, int inputSize, int maxOutputSize);
+int LZ4_decompress_fast_withPrefix64k (const char* source, char* dest, int outputSize);
+
+/*
+*_withPrefix64k() :
+ These decoding functions work the same as their "normal name" versions,
+ but can use up to 64KB of data in front of 'char* dest'.
+ These functions are necessary to decode inter-dependent blocks.
+*/
+
+
+//****************************
+// Obsolete Functions
+//****************************
+
+static inline int LZ4_uncompress (const char* source, char* dest, int outputSize) { return LZ4_decompress_fast(source, dest, outputSize); }
+static inline int LZ4_uncompress_unknownOutputSize (const char* source, char* dest, int isize, int maxOutputSize) { return LZ4_decompress_safe(source, dest, isize, maxOutputSize); }
+
+/*
+These functions are deprecated and should no longer be used.
+They are provided here for compatibility with existing user programs.
+*/
+
+
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc
index 43cfa23a99f..ead0b0fc902 100644
--- a/storage/xtradb/handler/ha_innodb.cc
+++ b/storage/xtradb/handler/ha_innodb.cc
@@ -4,6 +4,7 @@ Copyright (c) 2000, 2013, Oracle and/or its affiliates. All rights reserved.
Copyright (c) 2008, 2009 Google Inc.
Copyright (c) 2009, Percona Inc.
Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, 2014, SkySQL Ab.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -558,6 +559,27 @@ ib_cb_t innodb_api_cb[] = {
(ib_cb_t) ib_cfg_bk_commit_interval
};
+/**
+ List of engine specific CREATE TABLE options (table options).
+ The parsed values are stored in the structure that needs to be called
+ ha_table_option_struct; each entry below names the member it fills in.
+
+ The option values can be specified in the CREATE TABLE at the end:
+ CREATE TABLE ( ... ) *here*
+*/
+
+ha_create_table_option innodb_table_option_list[]=
+{
+ /* With this option user can enable page compression feature for the
+ table */
+ HA_TOPTION_BOOL("PAGE_COMPRESSED", page_compressed, 0),
+ /* With this option user can set zip compression level for page
+ compression for this table. ULINT_UNDEFINED marks "not specified";
+ innobase_table_flags() then falls back to page_zip_level. */
+ HA_TOPTION_NUMBER("PAGE_COMPRESSION_LEVEL", page_compression_level, ULINT_UNDEFINED, 0, 9, 1),
+ /* With this option user can enable atomic writes feature for this table */
+ HA_TOPTION_ENUM("ATOMIC_WRITES", atomic_writes, "DEFAULT,ON,OFF", 0),
+ HA_TOPTION_END
+};
+
/*************************************************************//**
Check whether valid argument given to innodb_ft_*_stopword_table.
This function is registered as a callback with MySQL.
@@ -873,6 +895,25 @@ static SHOW_VAR innodb_status_variables[]= {
(char*) &export_vars.innodb_x_lock_spin_rounds, SHOW_LONGLONG},
{"x_lock_spin_waits",
(char*) &export_vars.innodb_x_lock_spin_waits, SHOW_LONGLONG},
+
+ /* Status variables for page compression */
+ {"page_compression_saved",
+ (char*) &export_vars.innodb_page_compression_saved, SHOW_LONGLONG},
+ {"page_compression_trim_sect512",
+ (char*) &export_vars.innodb_page_compression_trim_sect512, SHOW_LONGLONG},
+ {"page_compression_trim_sect4096",
+ (char*) &export_vars.innodb_page_compression_trim_sect4096, SHOW_LONGLONG},
+ {"num_index_pages_written",
+ (char*) &export_vars.innodb_index_pages_written, SHOW_LONGLONG},
+ {"num_pages_page_compressed",
+ (char*) &export_vars.innodb_pages_page_compressed, SHOW_LONGLONG},
+ {"num_page_compressed_trim_op",
+ (char*) &export_vars.innodb_page_compressed_trim_op, SHOW_LONGLONG},
+ {"num_page_compressed_trim_op_saved",
+ (char*) &export_vars.innodb_page_compressed_trim_op_saved, SHOW_LONGLONG},
+ {"num_pages_page_decompressed",
+ (char*) &export_vars.innodb_pages_page_decompressed, SHOW_LONGLONG},
+
{NullS, NullS, SHOW_LONG}
};
@@ -3156,6 +3197,8 @@ innobase_init(
if (srv_file_per_table)
innobase_hton->tablefile_extensions = ha_innobase_exts;
+ innobase_hton->table_options = innodb_table_option_list;
+
ut_a(DATA_MYSQL_TRUE_VARCHAR == (ulint)MYSQL_TYPE_VARCHAR);
#ifndef DBUG_OFF
@@ -10010,11 +10053,16 @@ innobase_table_flags(
enum row_type row_format;
rec_format_t innodb_row_format = REC_FORMAT_COMPACT;
bool use_data_dir;
+ ha_table_option_struct *options= form->s->option_struct;
/* Cache the value of innodb_file_format, in case it is
modified by another thread while the table is being created. */
const ulint file_format_allowed = srv_file_format;
+ /* Cache the value of innobase_compression_level, in case it is
+ modified by another thread while the table is being created. */
+ const ulint default_compression_level = page_zip_level;
+
*flags = 0;
*flags2 = 0;
@@ -10063,6 +10111,8 @@ index_bad:
}
}
+ row_format = form->s->row_type;
+
if (create_info->key_block_size) {
/* The requested compressed page size (key_block_size)
is given in kilobytes. If it is a valid number, store
@@ -10110,8 +10160,6 @@ index_bad:
}
}
- row_format = form->s->row_type;
-
if (zip_ssize && zip_allowed) {
/* if ROW_FORMAT is set to default,
automatically change it to COMPRESSED.*/
@@ -10166,10 +10214,18 @@ index_bad:
" innodb_file_format > Antelope.",
get_row_format_name(row_format));
} else {
- innodb_row_format = (row_format == ROW_TYPE_DYNAMIC
- ? REC_FORMAT_DYNAMIC
- : REC_FORMAT_COMPRESSED);
- break;
+ switch(row_format) {
+ case ROW_TYPE_COMPRESSED:
+ innodb_row_format = REC_FORMAT_COMPRESSED;
+ break;
+ case ROW_TYPE_DYNAMIC:
+ innodb_row_format = REC_FORMAT_DYNAMIC;
+ break;
+ default:
+ /* Not possible, avoid compiler warning */
+ break;
+ }
+ break; /* Correct row_format */
}
zip_allowed = FALSE;
/* fall through to set row_format = COMPACT */
@@ -10196,7 +10252,15 @@ index_bad:
&& ((create_info->data_file_name != NULL)
&& !(create_info->options & HA_LEX_CREATE_TMP_TABLE));
- dict_tf_set(flags, innodb_row_format, zip_ssize, use_data_dir);
+ /* Set up table dictionary flags */
+ dict_tf_set(flags,
+ innodb_row_format,
+ zip_ssize,
+ use_data_dir,
+ options->page_compressed,
+ (ulint)options->page_compression_level == ULINT_UNDEFINED ?
+ default_compression_level : options->page_compression_level,
+ options->atomic_writes);
if (create_info->options & HA_LEX_CREATE_TMP_TABLE) {
*flags2 |= DICT_TF2_TEMPORARY;
@@ -10210,6 +10274,112 @@ index_bad:
}
/*****************************************************************//**
+Check engine specific table options (PAGE_COMPRESSED,
+PAGE_COMPRESSION_LEVEL, ATOMIC_WRITES) not handled by SQL-parser.
+For the first violated requirement a warning is pushed to the session.
+@return NULL if valid, otherwise the name of the offending option */
+UNIV_INTERN
+const char*
+ha_innobase::check_table_options(
+    THD *thd,                       /*!< in: thread handle */
+    TABLE* table,                   /*!< in: information on table
+                                    columns and indexes */
+    HA_CREATE_INFO* create_info,    /*!< in: more information of the
+                                    created table, contains also the
+                                    create statement string */
+    const bool use_tablespace,      /*!< in: use file per table */
+    const ulint file_format)        /*!< in: file format the options
+                                    are validated against */
+{
+    enum row_type row_format = table->s->row_type;
+    ha_table_option_struct *options= table->s->option_struct;
+    atomic_writes_t awrites = (atomic_writes_t)options->atomic_writes;
+
+    /* Check page compression requirements */
+    if (options->page_compressed) {
+        if (!srv_compress_pages) {
+            /* The adjacent literals are concatenated by the
+            compiler, so the second one must start with a space. */
+            push_warning(
+                thd, Sql_condition::WARN_LEVEL_WARN,
+                HA_WRONG_CREATE_OPTION,
+                "InnoDB: PAGE_COMPRESSED requires"
+                " innodb_compress_pages to be enabled");
+            return "PAGE_COMPRESSED";
+        }
+
+        if (row_format == ROW_TYPE_COMPRESSED) {
+            push_warning(
+                thd, Sql_condition::WARN_LEVEL_WARN,
+                HA_WRONG_CREATE_OPTION,
+                "InnoDB: PAGE_COMPRESSED table can't have"
+                " ROW_TYPE=COMPRESSED");
+            return "PAGE_COMPRESSED";
+        }
+
+        if (!use_tablespace) {
+            push_warning(
+                thd, Sql_condition::WARN_LEVEL_WARN,
+                HA_WRONG_CREATE_OPTION,
+                "InnoDB: PAGE_COMPRESSED requires"
+                " innodb_file_per_table.");
+            return "PAGE_COMPRESSED";
+        }
+
+        if (file_format < UNIV_FORMAT_B) {
+            push_warning(
+                thd, Sql_condition::WARN_LEVEL_WARN,
+                HA_WRONG_CREATE_OPTION,
+                "InnoDB: PAGE_COMPRESSED requires"
+                " innodb_file_format > Antelope.");
+            return "PAGE_COMPRESSED";
+        }
+
+        if (create_info->key_block_size) {
+            push_warning(
+                thd, Sql_condition::WARN_LEVEL_WARN,
+                HA_WRONG_CREATE_OPTION,
+                "InnoDB: PAGE_COMPRESSED table can't have"
+                " key_block_size");
+            return "PAGE_COMPRESSED";
+        }
+    }
+
+    /* Check page compression level requirements, some of them are
+    already checked above */
+    if ((ulint)options->page_compression_level != ULINT_UNDEFINED) {
+        if (options->page_compressed == false) {
+            push_warning(
+                thd, Sql_condition::WARN_LEVEL_WARN,
+                HA_WRONG_CREATE_OPTION,
+                "InnoDB: PAGE_COMPRESSION_LEVEL requires"
+                " PAGE_COMPRESSED");
+            return "PAGE_COMPRESSION_LEVEL";
+        }
+
+        if (options->page_compression_level < 0 || options->page_compression_level > 9) {
+            /* Report the rejected level itself (not
+            key_block_size); the cast matches the %lu
+            conversion. */
+            push_warning_printf(
+                thd, Sql_condition::WARN_LEVEL_WARN,
+                HA_WRONG_CREATE_OPTION,
+                "InnoDB: invalid PAGE_COMPRESSION_LEVEL = %lu."
+                " Valid values are [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]",
+                (unsigned long) options->page_compression_level);
+            return "PAGE_COMPRESSION_LEVEL";
+        }
+    }
+
+    /* Check atomic writes requirements: they apply when the option is
+    ON, or DEFAULT while srv_use_atomic_writes is set */
+    if (awrites == ATOMIC_WRITES_ON ||
+        (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes)) {
+        if (!use_tablespace) {
+            push_warning(
+                thd, Sql_condition::WARN_LEVEL_WARN,
+                HA_WRONG_CREATE_OPTION,
+                "InnoDB: ATOMIC_WRITES requires"
+                " innodb_file_per_table.");
+            return "ATOMIC_WRITES";
+        }
+    }
+
+    return 0;
+}
+
+/*****************************************************************//**
Creates a new table to an InnoDB database.
@return error number */
UNIV_INTERN
@@ -10240,6 +10410,7 @@ ha_innobase::create(
while creating the table. So we read the current value here
and make all further decisions based on this. */
bool use_tablespace = srv_file_per_table;
+ const ulint file_format = srv_file_format;
/* Zip Shift Size - log2 - 9 of compressed page size,
zero for uncompressed */
@@ -10263,6 +10434,12 @@ ha_innobase::create(
/* Create the table definition in InnoDB */
+ /* Validate table options not handled by the SQL-parser */
+ if(check_table_options(thd, form, create_info, use_tablespace,
+ file_format)) {
+ DBUG_RETURN(HA_WRONG_CREATE_OPTION);
+ }
+
/* Validate create options if innodb_strict_mode is set. */
if (create_options_are_invalid(
thd, form, create_info, use_tablespace)) {
@@ -14578,6 +14755,12 @@ ha_innobase::check_if_incompatible_data(
HA_CREATE_INFO* info,
uint table_changes)
{
+ ha_table_option_struct *param_old, *param_new;
+
+ /* Cache engine specific options */
+ param_new = info->option_struct;
+ param_old = table->s->option_struct;
+
innobase_copy_frm_flags_from_create_info(prebuilt->table, info);
if (table_changes != IS_EQUAL_YES) {
@@ -14604,6 +14787,13 @@ ha_innobase::check_if_incompatible_data(
return(COMPATIBLE_DATA_NO);
}
+ /* Changes on engine specific table options requests a rebuild of the table. */
+ if (param_new->page_compressed != param_old->page_compressed ||
+ param_new->page_compression_level != param_old->page_compression_level ||
+ param_new->atomic_writes != param_old->atomic_writes) {
+ return(COMPATIBLE_DATA_NO);
+ }
+
return(COMPATIBLE_DATA_YES);
}
@@ -17079,12 +17269,6 @@ static MYSQL_SYSVAR_ULONG(replication_delay, srv_replication_delay,
"innodb_thread_concurrency is reached (0 by default)",
NULL, NULL, 0, 0, ~0UL, 0);
-static MYSQL_SYSVAR_UINT(compression_level, page_zip_level,
- PLUGIN_VAR_RQCMDARG,
- "Compression level used for compressed row format. 0 is no compression"
- ", 1 is fastest, 9 is best compression and default is 6.",
- NULL, NULL, DEFAULT_COMPRESSION_LEVEL, 0, 9, 0);
-
static MYSQL_SYSVAR_BOOL(log_compressed_pages, page_zip_log_pages,
PLUGIN_VAR_OPCMDARG,
"Enables/disables the logging of entire compressed page images."
@@ -17758,6 +17942,37 @@ static MYSQL_SYSVAR_BOOL(use_stacktrace, srv_use_stacktrace,
"Print stacktrace on long semaphore wait (off by default supported only on linux)",
NULL, NULL, FALSE);
+static MYSQL_SYSVAR_BOOL(compress_pages, srv_compress_pages,
+ PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+ "Use page compression.",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_LONG(trim_pct, srv_trim_pct,
+ PLUGIN_VAR_OPCMDARG ,
+ "How many percent of compressed pages should be trimmed",
+ NULL, NULL, 100, 0, 100, 0);
+
+static MYSQL_SYSVAR_UINT(compression_level, page_zip_level,
+ PLUGIN_VAR_RQCMDARG,
+ "Compression level used for zlib compression. 0 is no compression"
+ ", 1 is fastest, 9 is best compression and default is 6.",
+ NULL, NULL, DEFAULT_COMPRESSION_LEVEL, 0, 9, 0);
+
+static MYSQL_SYSVAR_BOOL(compress_index_pages, srv_page_compress_index_pages,
+ PLUGIN_VAR_OPCMDARG,
+ "Use page compression for only index pages.",
+ NULL, NULL, FALSE);
+
+static MYSQL_SYSVAR_BOOL(use_trim, srv_use_trim,
+ PLUGIN_VAR_OPCMDARG,
+ "Use trim.",
+ NULL, NULL, TRUE);
+
+static MYSQL_SYSVAR_BOOL(use_lz4, srv_use_lz4,
+ PLUGIN_VAR_OPCMDARG ,
+ "Use LZ4 for page compression",
+ NULL, NULL, FALSE);
+
static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(log_block_size),
MYSQL_SYSVAR(additional_mem_pool_size),
@@ -17948,6 +18163,11 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
MYSQL_SYSVAR(fake_changes),
MYSQL_SYSVAR(locking_fake_changes),
MYSQL_SYSVAR(use_stacktrace),
+ MYSQL_SYSVAR(compress_pages),
+ MYSQL_SYSVAR(trim_pct),
+ MYSQL_SYSVAR(compress_index_pages),
+ MYSQL_SYSVAR(use_trim),
+ MYSQL_SYSVAR(use_lz4),
NULL
};
diff --git a/storage/xtradb/handler/ha_innodb.h b/storage/xtradb/handler/ha_innodb.h
index 773a9b6b04d..b4df711356c 100644
--- a/storage/xtradb/handler/ha_innodb.h
+++ b/storage/xtradb/handler/ha_innodb.h
@@ -1,6 +1,7 @@
/*****************************************************************************
Copyright (c) 2000, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -57,6 +58,21 @@ typedef struct st_innobase_share {
/** Prebuilt structures in an InnoDB table handle used within MySQL */
struct row_prebuilt_t;
+/** Engine specific table options are defined using this struct.
+The members are filled in from the options listed in
+innodb_table_option_list (ha_innodb.cc). */
+struct ha_table_option_struct
+{
+ bool page_compressed; /*!< Table is using page compression
+ if this option is true. */
+ int page_compression_level; /*!< Table page compression level
+ or ULINT_UNDEFINED (compared after
+ a cast to ulint) if not specified.
+ NOTE(review): confirm that an int
+ member matches the storage width
+ the table option framework writes
+ for HA_TOPTION_NUMBER options. */
+ uint atomic_writes; /*!< Use atomic writes for this
+ table if this options is ON or
+ in DEFAULT if
+ srv_use_atomic_writes=1.
+ Atomic writes are not used if
+ value OFF.*/
+};
+
/** The class defining a handle to an Innodb table */
class ha_innobase: public handler
{
@@ -184,6 +200,8 @@ class ha_innobase: public handler
char* norm_name,
char* temp_path,
char* remote_path);
+ const char* check_table_options(THD *thd, TABLE* table,
+ HA_CREATE_INFO* create_info, const bool use_tablespace, const ulint file_format);
int create(const char *name, register TABLE *form,
HA_CREATE_INFO *create_info);
int truncate();
diff --git a/storage/xtradb/handler/handler0alter.cc b/storage/xtradb/handler/handler0alter.cc
index 9c535285d1e..24dc1086cc5 100644
--- a/storage/xtradb/handler/handler0alter.cc
+++ b/storage/xtradb/handler/handler0alter.cc
@@ -1,6 +1,7 @@
/*****************************************************************************
Copyright (c) 2005, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -252,6 +253,22 @@ ha_innobase::check_if_supported_inplace_alter(
update_thd();
trx_search_latch_release_if_reserved(prebuilt->trx);
+ /* Change on engine specific table options require rebuild of the
+ table */
+ if (ha_alter_info->handler_flags
+ == Alter_inplace_info::CHANGE_CREATE_OPTION) {
+ ha_table_option_struct *new_options= ha_alter_info->create_info->option_struct;
+ ha_table_option_struct *old_options= table->s->option_struct;
+
+ if (new_options->page_compressed != old_options->page_compressed ||
+ new_options->page_compression_level != old_options->page_compression_level ||
+ new_options->atomic_writes != old_options->atomic_writes) {
+ ha_alter_info->unsupported_reason = innobase_get_err_msg(
+ ER_ALTER_OPERATION_NOT_SUPPORTED_REASON);
+ DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED);
+ }
+ }
+
if (ha_alter_info->handler_flags
& ~(INNOBASE_INPLACE_IGNORE
| INNOBASE_ALTER_NOREBUILD
@@ -3372,6 +3389,17 @@ ha_innobase::prepare_inplace_alter_table(
if (ha_alter_info->handler_flags
& Alter_inplace_info::CHANGE_CREATE_OPTION) {
+ /* Check engine specific table options */
+ if (const char* invalid_tbopt = check_table_options(
+ user_thd, altered_table,
+ ha_alter_info->create_info,
+ prebuilt->table->space != 0,
+ srv_file_format)) {
+ my_error(ER_ILLEGAL_HA_CREATE_OPTION, MYF(0),
+ table_type(), invalid_tbopt);
+ goto err_exit_no_heap;
+ }
+
if (const char* invalid_opt = create_options_are_invalid(
user_thd, altered_table,
ha_alter_info->create_info,
diff --git a/storage/xtradb/include/buf0buf.h b/storage/xtradb/include/buf0buf.h
index ba2f413429c..8fedeeaa832 100644
--- a/storage/xtradb/include/buf0buf.h
+++ b/storage/xtradb/include/buf0buf.h
@@ -1,6 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -1489,6 +1490,12 @@ struct buf_page_t{
state == BUF_BLOCK_ZIP_PAGE and
zip.data == NULL means an active
buf_pool->watch */
+
+ ulint write_size; /* Write size is set when this
+ page is written for the first time;
+ when it is written again we check
+ whether a TRIM operation is needed. */
+
#ifndef UNIV_HOTBACKUP
buf_page_t* hash; /*!< node used in chaining to
buf_pool->page_hash or
@@ -2118,6 +2125,20 @@ struct CheckUnzipLRUAndLRUList {
};
#endif /* UNIV_DEBUG || defined UNIV_BUF_DEBUG */
+/*********************************************************************//**
+Acquire LRU list mutex */
+void
+buf_pool_mutex_enter(
+/*=================*/
+ buf_pool_t* buf_pool); /*!< in: buffer pool */
+/*********************************************************************//**
+Exit LRU list mutex */
+void
+buf_pool_mutex_exit(
+/*================*/
+ buf_pool_t* buf_pool); /*!< in: buffer pool */
+
+
#ifndef UNIV_NONINL
#include "buf0buf.ic"
#endif
diff --git a/storage/xtradb/include/buf0flu.h b/storage/xtradb/include/buf0flu.h
index f4542e7c206..6b2827e77a7 100644
--- a/storage/xtradb/include/buf0flu.h
+++ b/storage/xtradb/include/buf0flu.h
@@ -36,6 +36,13 @@ Created 11/5/1995 Heikki Tuuri
/** Flag indicating if the page_cleaner is in active state. */
extern ibool buf_page_cleaner_is_active;
+/** Counters of the pages handled by a single flush */
+struct flush_counters_t {
+ ulint flushed; /*!< number of dirty pages flushed */
+ ulint evicted; /*!< number of clean pages evicted */
+};
+
+
/********************************************************************//**
Remove a block from the flush list of modified blocks. */
UNIV_INTERN
diff --git a/storage/xtradb/include/dict0dict.h b/storage/xtradb/include/dict0dict.h
index 6669f60b95a..8ab05c50dbd 100644
--- a/storage/xtradb/include/dict0dict.h
+++ b/storage/xtradb/include/dict0dict.h
@@ -2,6 +2,7 @@
Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -42,6 +43,8 @@ Created 1/8/1996 Heikki Tuuri
#include "ut0byte.h"
#include "trx0types.h"
#include "row0types.h"
+#include "fsp0fsp.h"
+#include "dict0pagecompress.h"
#ifndef UNIV_HOTBACKUP
# include "sync0sync.h"
@@ -904,7 +907,14 @@ dict_tf_set(
ulint* flags, /*!< in/out: table */
rec_format_t format, /*!< in: file format */
ulint zip_ssize, /*!< in: zip shift size */
- bool remote_path) /*!< in: table uses DATA DIRECTORY */
+ bool remote_path, /*!< in: table uses DATA DIRECTORY
+ */
+ bool page_compressed,/*!< in: table uses page compressed
+ pages */
+ ulint page_compression_level, /*!< in: table page compression
+ level */
+ ulint atomic_writes) /*!< in: table atomic
+ writes option value*/
__attribute__((nonnull));
/********************************************************************//**
Convert a 32 bit integer table flags to the 32 bit integer that is
diff --git a/storage/xtradb/include/dict0dict.ic b/storage/xtradb/include/dict0dict.ic
index c261d6a3aee..502b1d028d8 100644
--- a/storage/xtradb/include/dict0dict.ic
+++ b/storage/xtradb/include/dict0dict.ic
@@ -1,6 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -537,10 +538,27 @@ dict_tf_is_valid(
ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(flags);
ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(flags);
ulint unused = DICT_TF_GET_UNUSED(flags);
+ ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(flags);
+ ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags);
+ ulint data_dir = DICT_TF_HAS_DATA_DIR(flags);
+ ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(flags);
/* Make sure there are no bits that we do not know about. */
if (unused != 0) {
+ fprintf(stderr,
+ "InnoDB: Error: table unused flags are %ld"
+ " in the data dictionary and are corrupted\n"
+ "InnoDB: Error: data dictionary flags are\n"
+ "InnoDB: compact %ld atomic_blobs %ld\n"
+ "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n"
+ "InnoDB: page_compression %ld page_compression_level %ld\n"
+ "InnoDB: atomic_writes %ld\n",
+ unused,
+ compact, atomic_blobs, unused, data_dir, zip_ssize,
+ page_compression, page_compression_level, atomic_writes
+ );
+
return(false);
} else if (atomic_blobs) {
@@ -550,12 +568,36 @@ dict_tf_is_valid(
data stored off-page in the clustered index. */
if (!compact) {
+ fprintf(stderr,
+ "InnoDB: Error: table compact flags are %ld"
+ " in the data dictionary and are corrupted\n"
+ "InnoDB: Error: data dictionary flags are\n"
+ "InnoDB: compact %ld atomic_blobs %ld\n"
+ "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n"
+ "InnoDB: page_compression %ld page_compression_level %ld\n"
+ "InnoDB: atomic_writes %ld\n",
+ compact, compact, atomic_blobs, unused, data_dir, zip_ssize,
+ page_compression, page_compression_level, atomic_writes
+ );
+
return(false);
}
} else if (zip_ssize) {
/* Antelope does not support COMPRESSED row format. */
+ fprintf(stderr,
+ "InnoDB: Error: table flags are %ld"
+ " in the data dictionary and are corrupted\n"
+ "InnoDB: Error: data dictionary flags are\n"
+ "InnoDB: compact %ld atomic_blobs %ld\n"
+ "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n"
+ "InnoDB: page_compression %ld page_compression_level %ld\n"
+ "InnoDB: atomic_writes %ld\n",
+ flags, compact, atomic_blobs, unused, data_dir, zip_ssize,
+ page_compression, page_compression_level, atomic_writes
+ );
+
return(false);
}
@@ -568,6 +610,41 @@ dict_tf_is_valid(
|| !atomic_blobs
|| zip_ssize > PAGE_ZIP_SSIZE_MAX) {
+ fprintf(stderr,
+ "InnoDB: Error: table compact flags are %ld in the data dictionary and are corrupted\n"
+ "InnoDB: Error: data dictionary flags are\n"
+ "InnoDB: compact %ld atomic_blobs %ld\n"
+ "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n"
+ "InnoDB: page_compression %ld page_compression_level %ld\n"
+ "InnoDB: atomic_writes %ld\n",
+ flags,
+ compact, atomic_blobs, unused, data_dir, zip_ssize,
+ page_compression, page_compression_level, atomic_writes
+
+ );
+ return(false);
+ }
+ }
+
+ if (page_compression || page_compression_level) {
+ /* Page compression format must have compact and
+ atomic_blobs and page_compression_level requires
+ page_compression */
+ if (!compact
+ || !page_compression
+ || !atomic_blobs) {
+
+ fprintf(stderr,
+ "InnoDB: Error: table flags are %ld in the data dictionary and are corrupted\n"
+ "InnoDB: Error: data dictionary flags are\n"
+ "InnoDB: compact %ld atomic_blobs %ld\n"
+ "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n"
+ "InnoDB: page_compression %ld page_compression_level %ld\n"
+ "InnoDB: atomic_writes %ld\n",
+ flags, compact, atomic_blobs, unused, data_dir, zip_ssize,
+ page_compression, page_compression_level, atomic_writes
+ );
+
return(false);
}
}
@@ -594,6 +671,10 @@ dict_sys_tables_type_validate(
ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(type);
ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(type);
ulint unused = DICT_TF_GET_UNUSED(type);
+ ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(type);
+ ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(type);
+ ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(type);
+ atomic_writes_t awrites = (atomic_writes_t)atomic_writes;
/* The low order bit of SYS_TABLES.TYPE is always set to 1.
If the format is UNIV_FORMAT_B or higher, this field is the same
@@ -647,6 +728,24 @@ dict_sys_tables_type_validate(
format, so the DATA_DIR flag is compatible with any other
table flags. However, it is not used with TEMPORARY tables.*/
+ if (page_compression || page_compression_level) {
+ /* page compressed row format must have low_order_bit and
+ atomic_blobs bits set and the DICT_N_COLS_COMPACT flag
+ should be in N_COLS, but we already know about the
+ low_order_bit and DICT_N_COLS_COMPACT flags. */
+
+ if (!atomic_blobs || !page_compression) {
+ return(ULINT_UNDEFINED);
+ }
+ }
+
+ if (awrites == ATOMIC_WRITES_ON ||
+ (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes)) {
+ if (!atomic_blobs) {
+ return(ULINT_UNDEFINED);
+ }
+ }
+
/* Return the validated SYS_TABLES.TYPE. */
return(type);
}
@@ -719,8 +818,16 @@ dict_tf_set(
ulint* flags, /*!< in/out: table flags */
rec_format_t format, /*!< in: file format */
ulint zip_ssize, /*!< in: zip shift size */
- bool use_data_dir) /*!< in: table uses DATA DIRECTORY */
+ bool use_data_dir, /*!< in: table uses DATA DIRECTORY
+ */
+ bool page_compressed,/*!< in: table uses page compressed
+ pages */
+ ulint page_compression_level, /*!< in: table page compression
+ level */
+ ulint atomic_writes) /*!< in: table atomic writes setup */
{
+ atomic_writes_t awrites = (atomic_writes_t)atomic_writes;
+
switch (format) {
case REC_FORMAT_REDUNDANT:
*flags = 0;
@@ -745,6 +852,28 @@ dict_tf_set(
if (use_data_dir) {
*flags |= (1 << DICT_TF_POS_DATA_DIR);
}
+
+ if (page_compressed) {
+ *flags = DICT_TF_COMPACT
+ | (1 << DICT_TF_POS_ATOMIC_BLOBS)
+ | (1 << DICT_TF_POS_PAGE_COMPRESSION)
+ | (page_compression_level << DICT_TF_POS_PAGE_COMPRESSION_LEVEL);
+
+ ut_ad(zip_ssize == 0);
+ ut_ad(dict_tf_get_page_compression(*flags) == TRUE);
+ ut_ad(dict_tf_get_page_compression_level(*flags) == page_compression_level);
+ }
+
+ if (awrites != ATOMIC_WRITES_DEFAULT) {
+ *flags |= (atomic_writes << DICT_TF_POS_ATOMIC_WRITES);
+ ut_ad(dict_tf_get_atomic_writes(*flags) == awrites);
+ }
+
+ if (awrites == ATOMIC_WRITES_ON ||
+ (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes )) {
+ *flags |= (1 << DICT_TF_POS_ATOMIC_BLOBS);
+ }
+
}
/********************************************************************//**
@@ -765,6 +894,9 @@ dict_tf_to_fsp_flags(
ulint table_flags) /*!< in: dict_table_t::flags */
{
ulint fsp_flags;
+ ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(table_flags);
+ ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(table_flags);
+ ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(table_flags);
DBUG_EXECUTE_IF("dict_tf_to_fsp_flags_failure",
return(ULINT_UNDEFINED););
@@ -783,7 +915,20 @@ dict_tf_to_fsp_flags(
fsp_flags |= DICT_TF_HAS_DATA_DIR(table_flags)
? FSP_FLAGS_MASK_DATA_DIR : 0;
+ /* In addition, tablespace flags also contain if the page
+ compression is used for this table. */
+ fsp_flags |= FSP_FLAGS_SET_PAGE_COMPRESSION(fsp_flags, page_compression);
+
+ /* In addition, tablespace flags also contain page compression level
+ if page compression is used for this table. */
+ fsp_flags |= FSP_FLAGS_SET_PAGE_COMPRESSION_LEVEL(fsp_flags, page_compression_level);
+
+ /* In addition, tablespace flags also contain flag if atomic writes
+ is used for this table */
+ fsp_flags |= FSP_FLAGS_SET_ATOMIC_WRITES(fsp_flags, atomic_writes);
+
ut_a(fsp_flags_is_valid(fsp_flags));
+ ut_a(dict_tf_verify_flags(table_flags, fsp_flags));
return(fsp_flags);
}
@@ -811,10 +956,15 @@ dict_sys_tables_type_to_tf(
/* Adjust bit zero. */
flags = redundant ? 0 : 1;
- /* ZIP_SSIZE, ATOMIC_BLOBS & DATA_DIR are the same. */
+ /* ZIP_SSIZE, ATOMIC_BLOBS, DATA_DIR, PAGE_COMPRESSION,
+ PAGE_COMPRESSION_LEVEL, ATOMIC_WRITES are the same. */
flags |= type & (DICT_TF_MASK_ZIP_SSIZE
| DICT_TF_MASK_ATOMIC_BLOBS
- | DICT_TF_MASK_DATA_DIR);
+ | DICT_TF_MASK_DATA_DIR
+ | DICT_TF_MASK_PAGE_COMPRESSION
+ | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL
+ | DICT_TF_MASK_ATOMIC_WRITES
+ );
return(flags);
}
@@ -842,10 +992,14 @@ dict_tf_to_sys_tables_type(
/* Adjust bit zero. It is always 1 in SYS_TABLES.TYPE */
type = 1;
- /* ZIP_SSIZE, ATOMIC_BLOBS & DATA_DIR are the same. */
+ /* ZIP_SSIZE, ATOMIC_BLOBS, DATA_DIR, PAGE_COMPRESSION,
+ PAGE_COMPRESSION_LEVEL, ATOMIC_WRITES are the same. */
type |= flags & (DICT_TF_MASK_ZIP_SSIZE
| DICT_TF_MASK_ATOMIC_BLOBS
- | DICT_TF_MASK_DATA_DIR);
+ | DICT_TF_MASK_DATA_DIR
+ | DICT_TF_MASK_PAGE_COMPRESSION
+ | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL
+ | DICT_TF_MASK_ATOMIC_WRITES);
return(type);
}
diff --git a/storage/xtradb/include/dict0mem.h b/storage/xtradb/include/dict0mem.h
index bde0ce16094..087fde0ccb7 100644
--- a/storage/xtradb/include/dict0mem.h
+++ b/storage/xtradb/include/dict0mem.h
@@ -2,6 +2,7 @@
Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -125,11 +126,26 @@ This flag prevents older engines from attempting to open the table and
allows InnoDB to update_create_info() accordingly. */
#define DICT_TF_WIDTH_DATA_DIR 1
+/**
+Width of the page compression flag
+*/
+#define DICT_TF_WIDTH_PAGE_COMPRESSION 1
+#define DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL 4
+
+/**
+Width of atomic writes flag
+DEFAULT=0, ON = 1, OFF = 2
+*/
+#define DICT_TF_WIDTH_ATOMIC_WRITES 2
+
/** Width of all the currently known table flags */
#define DICT_TF_BITS (DICT_TF_WIDTH_COMPACT \
+ DICT_TF_WIDTH_ZIP_SSIZE \
+ DICT_TF_WIDTH_ATOMIC_BLOBS \
- + DICT_TF_WIDTH_DATA_DIR)
+ + DICT_TF_WIDTH_DATA_DIR \
+ + DICT_TF_WIDTH_PAGE_COMPRESSION \
+ + DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL \
+ + DICT_TF_WIDTH_ATOMIC_WRITES)
/** A mask of all the known/used bits in table flags */
#define DICT_TF_BIT_MASK (~(~0 << DICT_TF_BITS))
@@ -145,9 +161,18 @@ allows InnoDB to update_create_info() accordingly. */
/** Zero relative shift position of the DATA_DIR field */
#define DICT_TF_POS_DATA_DIR (DICT_TF_POS_ATOMIC_BLOBS \
+ DICT_TF_WIDTH_ATOMIC_BLOBS)
+/** Zero relative shift position of the PAGE_COMPRESSION field */
+#define DICT_TF_POS_PAGE_COMPRESSION (DICT_TF_POS_DATA_DIR \
+ + DICT_TF_WIDTH_DATA_DIR)
+/** Zero relative shift position of the PAGE_COMPRESSION_LEVEL field */
+#define DICT_TF_POS_PAGE_COMPRESSION_LEVEL (DICT_TF_POS_PAGE_COMPRESSION \
+ + DICT_TF_WIDTH_PAGE_COMPRESSION)
+/** Zero relative shift position of the ATOMIC_WRITES field */
+#define DICT_TF_POS_ATOMIC_WRITES (DICT_TF_POS_PAGE_COMPRESSION_LEVEL \
+ + DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL)
/** Zero relative shift position of the start of the UNUSED bits */
-#define DICT_TF_POS_UNUSED (DICT_TF_POS_DATA_DIR \
- + DICT_TF_WIDTH_DATA_DIR)
+#define DICT_TF_POS_UNUSED (DICT_TF_POS_ATOMIC_WRITES \
+ + DICT_TF_WIDTH_ATOMIC_WRITES)
/** Bit mask of the COMPACT field */
#define DICT_TF_MASK_COMPACT \
@@ -165,6 +190,18 @@ allows InnoDB to update_create_info() accordingly. */
#define DICT_TF_MASK_DATA_DIR \
((~(~0 << DICT_TF_WIDTH_DATA_DIR)) \
<< DICT_TF_POS_DATA_DIR)
+/** Bit mask of the PAGE_COMPRESSION field */
+#define DICT_TF_MASK_PAGE_COMPRESSION \
+ ((~(~0 << DICT_TF_WIDTH_PAGE_COMPRESSION)) \
+ << DICT_TF_POS_PAGE_COMPRESSION)
+/** Bit mask of the PAGE_COMPRESSION_LEVEL field */
+#define DICT_TF_MASK_PAGE_COMPRESSION_LEVEL \
+ ((~(~0 << DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL)) \
+ << DICT_TF_POS_PAGE_COMPRESSION_LEVEL)
+/** Bit mask of the ATOMIC_WRITES field */
+#define DICT_TF_MASK_ATOMIC_WRITES \
+ ((~(~0 << DICT_TF_WIDTH_ATOMIC_WRITES)) \
+ << DICT_TF_POS_ATOMIC_WRITES)
/** Return the value of the COMPACT field */
#define DICT_TF_GET_COMPACT(flags) \
@@ -185,6 +222,19 @@ allows InnoDB to update_create_info() accordingly. */
/** Return the contents of the UNUSED bits */
#define DICT_TF_GET_UNUSED(flags) \
(flags >> DICT_TF_POS_UNUSED)
+
+/** Return the value of the PAGE_COMPRESSION field.
+The argument is parenthesized so that any expression may be passed. */
+#define DICT_TF_GET_PAGE_COMPRESSION(flags)			\
+		(((flags) & DICT_TF_MASK_PAGE_COMPRESSION)	\
+		>> DICT_TF_POS_PAGE_COMPRESSION)
+/** Return the value of the PAGE_COMPRESSION_LEVEL field.
+The argument is parenthesized so that any expression may be passed. */
+#define DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags)		\
+		(((flags) & DICT_TF_MASK_PAGE_COMPRESSION_LEVEL)	\
+		>> DICT_TF_POS_PAGE_COMPRESSION_LEVEL)
+/** Return the value of the ATOMIC_WRITES field.
+The argument is parenthesized so that any expression may be passed. */
+#define DICT_TF_GET_ATOMIC_WRITES(flags)			\
+		(((flags) & DICT_TF_MASK_ATOMIC_WRITES)		\
+		>> DICT_TF_POS_ATOMIC_WRITES)
/* @} */
#ifndef UNIV_INNOCHECKSUM
diff --git a/storage/xtradb/include/dict0pagecompress.h b/storage/xtradb/include/dict0pagecompress.h
new file mode 100644
index 00000000000..19a2a6c52f3
--- /dev/null
+++ b/storage/xtradb/include/dict0pagecompress.h
@@ -0,0 +1,94 @@
+/*****************************************************************************
+
+Copyright (C) 2013 SkySQL Ab. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0pagecompress.h
+Helper functions for extracting/storing page compression information
+to dictionary.
+
+Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com
+***********************************************************************/
+
+#ifndef dict0pagecompress_h
+#define dict0pagecompress_h
+
+/********************************************************************//**
+Extract the page compression level from table flags.
+@return page compression level, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_tf_get_page_compression_level(
+/*===============================*/
+ ulint flags) /*!< in: flags */
+ __attribute__((const));
+/********************************************************************//**
+Extract the page compression flag from table flags
+@return page compression flag, or false if not compressed */
+UNIV_INLINE
+ibool
+dict_tf_get_page_compression(
+/*==========================*/
+ ulint flags) /*!< in: flags */
+ __attribute__((const));
+
+/********************************************************************//**
+Get the page compression level of the table.
+@return page compression level, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_table_page_compression_level(
+/*==============================*/
+ const dict_table_t* table) /*!< in: table */
+ __attribute__((const));
+
+/********************************************************************//**
+Verify that dictionary flags match tablespace flags
+@return true if flags match, false if not */
+UNIV_INLINE
+ibool
+dict_tf_verify_flags(
+/*=================*/
+ ulint table_flags, /*!< in: dict_table_t::flags */
+ ulint fsp_flags) /*!< in: fil_space_t::flags */
+ __attribute__((const));
+
+/********************************************************************//**
+Extract the atomic writes flag from table flags.
+@return enumerated value of atomic writes */
+UNIV_INLINE
+atomic_writes_t
+dict_tf_get_atomic_writes(
+/*======================*/
+ ulint flags) /*!< in: flags */
+ __attribute__((const));
+
+/********************************************************************//**
+Get the atomic writes setting of the table.
+@return enumerated value of atomic writes */
+UNIV_INLINE
+atomic_writes_t
+dict_table_get_atomic_writes(
+/*=========================*/
+ const dict_table_t* table); /*!< in: table */
+
+
+#ifndef UNIV_NONINL
+#include "dict0pagecompress.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/dict0pagecompress.ic b/storage/xtradb/include/dict0pagecompress.ic
new file mode 100644
index 00000000000..fb9581fc657
--- /dev/null
+++ b/storage/xtradb/include/dict0pagecompress.ic
@@ -0,0 +1,191 @@
+/*****************************************************************************
+
+Copyright (C) 2013 SkySQL Ab. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/dict0pagecompress.ic
+Inline implementation for helper functions for extracting/storing
+page compression and atomic writes information to dictionary.
+
+Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com
+***********************************************************************/
+
+/********************************************************************//**
+Verify that dictionary flags match tablespace flags.
+The ZIP_SSIZE, ATOMIC_BLOBS, PAGE_COMPRESSION, PAGE_COMPRESSION_LEVEL and
+ATOMIC_WRITES fields are compared in that order; the first mismatch found
+is reported to stderr and ends the check.
+@return TRUE if flags match, FALSE if not */
+UNIV_INLINE
+ibool
+dict_tf_verify_flags(
+/*=================*/
+	ulint	table_flags,	/*!< in: dict_table_t::flags */
+	ulint	fsp_flags)	/*!< in: fil_space_t::flags */
+{
+	ulint	table_unused = DICT_TF_GET_UNUSED(table_flags);
+	ulint	compact = DICT_TF_GET_COMPACT(table_flags);
+	ulint	ssize = DICT_TF_GET_ZIP_SSIZE(table_flags);
+	ulint	atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(table_flags);
+	ulint	data_dir = DICT_TF_HAS_DATA_DIR(table_flags);
+	ulint	page_compression = DICT_TF_GET_PAGE_COMPRESSION(table_flags);
+	ulint	page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(table_flags);
+	ulint	atomic_writes = DICT_TF_GET_ATOMIC_WRITES(table_flags);
+	ulint	post_antelope = FSP_FLAGS_GET_POST_ANTELOPE(fsp_flags);
+	ulint	zip_ssize = FSP_FLAGS_GET_ZIP_SSIZE(fsp_flags);
+	ulint	fsp_atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(fsp_flags);
+	ulint	page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(fsp_flags);
+	ulint	fsp_unused = FSP_FLAGS_GET_UNUSED(fsp_flags);
+	ulint	fsp_page_compression = FSP_FLAGS_GET_PAGE_COMPRESSION(fsp_flags);
+	ulint	fsp_page_compression_level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(fsp_flags);
+	ulint	fsp_atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(fsp_flags);
+
+	/* NOTE(review): the injected return value is ULINT_UNDEFINED rather
+	than FALSE; presumably it is nonzero and so reads as "flags match"
+	to callers -- confirm whether FALSE was intended. */
+	DBUG_EXECUTE_IF("dict_tf_verify_flags_failure",
+			return(ULINT_UNDEFINED););
+
+	ut_ad(!table_unused);
+	ut_ad(!fsp_unused);
+	ut_ad(page_ssize == 0 || page_ssize != 0); /* silence compiler */
+	ut_ad(compact == 0 || compact == 1); /* silence compiler */
+	ut_ad(data_dir == 0 || data_dir == 1); /* silence compiler */
+	ut_ad(post_antelope == 0 || post_antelope == 1); /* silence compiler */
+
+	/* The values printed below are ulint, hence the unsigned %lu
+	conversion. NOTE(review): if the tree provides a dedicated ulint
+	format macro, prefer it here -- confirm. */
+	if (ssize != zip_ssize) {
+		fprintf(stderr,
+			"InnoDB: Error: table flags has zip_ssize %lu"
+			" in the data dictionary\n"
+			"InnoDB: but the flags in file has zip_ssize %lu\n",
+			ssize, zip_ssize);
+		return (FALSE);
+	}
+	if (atomic_blobs != fsp_atomic_blobs) {
+		fprintf(stderr,
+			"InnoDB: Error: table flags has atomic_blobs %lu"
+			" in the data dictionary\n"
+			"InnoDB: but the flags in file has atomic_blobs %lu\n",
+			atomic_blobs, fsp_atomic_blobs);
+
+		return (FALSE);
+	}
+	if (page_compression != fsp_page_compression) {
+		fprintf(stderr,
+			"InnoDB: Error: table flags has page_compression %lu"
+			" in the data dictionary\n"
+			"InnoDB: but the flags in file has page_compression %lu\n",
+			page_compression, fsp_page_compression);
+
+		return (FALSE);
+	}
+	if (page_compression_level != fsp_page_compression_level) {
+		fprintf(stderr,
+			"InnoDB: Error: table flags has page_compression_level %lu"
+			" in the data dictionary\n"
+			"InnoDB: but the flags in file has page_compression_level %lu\n",
+			page_compression_level, fsp_page_compression_level);
+
+		return (FALSE);
+	}
+
+	if (atomic_writes != fsp_atomic_writes) {
+		fprintf(stderr,
+			"InnoDB: Error: table flags has atomic writes %lu"
+			" in the data dictionary\n"
+			"InnoDB: but the flags in file has atomic_writes %lu\n",
+			atomic_writes, fsp_atomic_writes);
+
+		return (FALSE);
+	}
+
+	return(TRUE);
+}
+
+/********************************************************************//**
+Extract the page compression level from dict_table_t::flags.
+These flags are in memory, so assert that they are valid.
+@return page compression level, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_tf_get_page_compression_level(
+/*===============================*/
+	ulint	flags)	/*!< in: flags */
+{
+	ulint	page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags);
+
+	/* ulint is unsigned, so a lower-bound test would always be true;
+	only the upper bound is worth asserting. */
+	ut_ad(page_compression_level <= 9);
+
+	return(page_compression_level);
+}
+
+/********************************************************************//**
+Get the page compression level stored in the flags of a table.
+The table pointer and its page compression flag are checked with ut_ad().
+@return page compression level, or 0 if not compressed */
+UNIV_INLINE
+ulint
+dict_table_page_compression_level(
+/*==============================*/
+	const dict_table_t*	table)	/*!< in: table */
+{
+	ut_ad(table != NULL);
+
+	const ulint	tf = table->flags;
+
+	ut_ad(dict_tf_get_page_compression(tf));
+
+	return(dict_tf_get_page_compression_level(tf));
+}
+
+/********************************************************************//**
+Extract the page compression flag from table flags.
+@return value of the PAGE_COMPRESSION field: nonzero if page compressed,
+zero if not */
+UNIV_INLINE
+ibool
+dict_tf_get_page_compression(
+/*=========================*/
+	ulint	flags)	/*!< in: flags */
+{
+	const ibool	page_compressed = DICT_TF_GET_PAGE_COMPRESSION(flags);
+
+	return(page_compressed);
+}
+
+/********************************************************************//**
+Tell whether a table uses the page compressed page format, based on the
+PAGE_COMPRESSION field of its dictionary flags.
+@return true if page compressed, false if not */
+UNIV_INLINE
+ibool
+dict_table_is_page_compressed(
+/*==========================*/
+	const dict_table_t*	table)	/*!< in: table */
+{
+	const ulint	table_flags = table->flags;
+
+	return(dict_tf_get_page_compression(table_flags));
+}
+
+/********************************************************************//**
+Read the ATOMIC_WRITES field of the table flags and convert it to the
+atomic_writes_t enumeration.
+@return enumerated value of atomic writes */
+UNIV_INLINE
+atomic_writes_t
+dict_tf_get_atomic_writes(
+/*======================*/
+	ulint	flags)	/*!< in: flags */
+{
+	const ulint	field = DICT_TF_GET_ATOMIC_WRITES(flags);
+
+	return((atomic_writes_t) field);
+}
+
+/********************************************************************//**
+Get the atomic writes setting recorded in the flags of a table.
+@return enumerated value of atomic writes */
+UNIV_INLINE
+atomic_writes_t
+dict_table_get_atomic_writes(
+/*=========================*/
+	const dict_table_t*	table)	/*!< in: table */
+{
+	/* dict_tf_get_atomic_writes() already returns atomic_writes_t,
+	so no further conversion is needed. */
+	return(dict_tf_get_atomic_writes(table->flags));
+}
diff --git a/storage/xtradb/include/dict0types.h b/storage/xtradb/include/dict0types.h
index 6acb6a2dcbe..9e210117580 100644
--- a/storage/xtradb/include/dict0types.h
+++ b/storage/xtradb/include/dict0types.h
@@ -1,6 +1,7 @@
/*****************************************************************************
Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -82,4 +83,12 @@ enum ib_quiesce_t {
#define TEMP_TABLE_PREFIX "#sql"
#define TEMP_TABLE_PATH_PREFIX "/" TEMP_TABLE_PREFIX
+/** Enum values for atomic_writes table option. The numeric values are
+written into the two-bit ATOMIC_WRITES field of the table flags and read
+back by casting, so renumbering them would change the meaning of flags
+that were already stored. */
+typedef enum {
+ ATOMIC_WRITES_DEFAULT = 0, /*!< no per-table choice; the code that checks this value falls back to srv_use_atomic_writes */
+ ATOMIC_WRITES_ON = 1, /*!< atomic writes requested for the table */
+ ATOMIC_WRITES_OFF = 2 /*!< atomic writes are not used for the table */
+} atomic_writes_t;
+
+
#endif
diff --git a/storage/xtradb/include/fil0fil.h b/storage/xtradb/include/fil0fil.h
index 472c57fcbfc..6b69a899690 100644
--- a/storage/xtradb/include/fil0fil.h
+++ b/storage/xtradb/include/fil0fil.h
@@ -1,6 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -129,6 +130,13 @@ extern fil_addr_t fil_addr_null;
#define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID 34 /*!< starting from 4.1.x this
contains the space id of the page */
#define FIL_PAGE_DATA 38 /*!< start of the data on the page */
+/* Following are used when page compression is used */
+#define FIL_PAGE_COMPRESSED_SIZE 2 /*!< Number of bytes used to store
+ actual payload data size on
+ compressed pages. */
+#define FIL_PAGE_COMPRESSION_ZLIB 1 /*!< Compression algorithm ZLIB. */
+#define FIL_PAGE_COMPRESSION_LZ4 2 /*!< Compression algorithm LZ4. */
+
/* @} */
/** File page trailer @{ */
#define FIL_PAGE_END_LSN_OLD_CHKSUM 8 /*!< the low 4 bytes of this are used
@@ -139,6 +147,7 @@ extern fil_addr_t fil_addr_null;
/* @} */
/** File page types (values of FIL_PAGE_TYPE) @{ */
+#define FIL_PAGE_PAGE_COMPRESSED 34354 /*!< Page compressed page */
#define FIL_PAGE_INDEX 17855 /*!< B-tree node */
#define FIL_PAGE_UNDO_LOG 2 /*!< Undo log page */
#define FIL_PAGE_INODE 3 /*!< Index node */
@@ -721,8 +730,8 @@ fil_space_get_n_reserved_extents(
Reads or writes data. This operation is asynchronous (aio).
@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do
i/o on a tablespace which does not exist */
-#define fil_io(type, sync, space_id, zip_size, block_offset, byte_offset, len, buf, message) \
- _fil_io(type, sync, space_id, zip_size, block_offset, byte_offset, len, buf, message, NULL)
+#define fil_io(type, sync, space_id, zip_size, block_offset, byte_offset, len, buf, message, write_size) \
+ _fil_io(type, sync, space_id, zip_size, block_offset, byte_offset, len, buf, message, write_size, NULL)
UNIV_INTERN
dberr_t
@@ -752,7 +761,12 @@ _fil_io(
or from where to write; in aio this must be
appropriately aligned */
void* message, /*!< in: message for aio handler if non-sync
- aio used, else ignored */
+ aio used, else ignored */
+ ulint* write_size, /*!< in/out: Actual write size initialized
+ after first successful trim
+ operation for this page and if
+ initialized we do not trim again if
+ actual page size does not decrease. */
trx_t* trx)
__attribute__((nonnull(8)));
/**********************************************************************//**
@@ -1018,4 +1032,27 @@ fil_space_set_corrupt(
/*==================*/
ulint space_id);
+/****************************************************************//**
+Acquire fil_system mutex */
+void
+fil_system_enter(void);
+/*==================*/
+/****************************************************************//**
+Release fil_system mutex */
+void
+fil_system_exit(void);
+/*==================*/
+/*******************************************************************//**
+Returns the table space by a given id, NULL if not found. */
+fil_space_t*
+fil_space_get_by_id(
+/*================*/
+ ulint id); /*!< in: space id */
+/*******************************************************************//**
+Return space name */
+char*
+fil_space_name(
+/*===========*/
+ fil_space_t* space); /*!< in: space */
+
#endif /* fil0fil_h */
diff --git a/storage/xtradb/include/fil0pagecompress.h b/storage/xtradb/include/fil0pagecompress.h
new file mode 100644
index 00000000000..342b105401c
--- /dev/null
+++ b/storage/xtradb/include/fil0pagecompress.h
@@ -0,0 +1,118 @@
+/*****************************************************************************
+
+Copyright (C) 2013 SkySQL Ab. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+
+#ifndef fil0pagecompress_h
+#define fil0pagecompress_h
+
+#include "fsp0fsp.h"
+#include "fsp0pagecompress.h"
+
+/******************************************************************//**
+@file include/fil0pagecompress.h
+Helper functions for extracting/storing page compression and
+atomic writes information to table space.
+
+Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com
+***********************************************************************/
+
+/*******************************************************************//**
+Returns the page compression level flag of the space, or 0 if the space
+is not compressed. The tablespace must be cached in the memory cache.
+@return page compression level if page compressed, ULINT_UNDEFINED if space not found */
+ulint
+fil_space_get_page_compression_level(
+/*=================================*/
+ ulint id); /*!< in: space id */
+/*******************************************************************//**
+Returns the page compression flag of the space, or false if the space
+is not compressed. The tablespace must be cached in the memory cache.
+@return true if page compressed, false if not or space not found */
+ibool
+fil_space_is_page_compressed(
+/*=========================*/
+ ulint id); /*!< in: space id */
+/*******************************************************************//**
+Returns the atomic writes flag of the space, or false if the space
+is not using atomic writes. The tablespace must be cached in the memory cache.
+@return atomic write table option value */
+atomic_writes_t
+fil_space_get_atomic_writes(
+/*=========================*/
+ ulint id); /*!< in: space id */
+/*******************************************************************//**
+Find out whether the page is an index page or not
+@return true if page type is FIL_PAGE_INDEX, false if not */
+ibool
+fil_page_is_index_page(
+/*===================*/
+ byte *buf); /*!< in: page */
+
+/****************************************************************//**
+Get the name of the compression algorithm used for page
+compression.
+@return compression algorithm name or "UNKNOWN" if not known*/
+const char*
+fil_get_compression_alg_name(
+/*=========================*/
+ ulint comp_alg); /*!<in: compression algorithm number */
+
+/****************************************************************//**
+For page compressed pages compress the page before actual write
+operation.
+@return compressed page to be written*/
+byte*
+fil_compress_page(
+/*==============*/
+ ulint space_id, /*!< in: tablespace id of the
+ table. */
+ byte* buf, /*!< in: buffer from which to write; in aio
+ this must be appropriately aligned */
+ byte* out_buf, /*!< out: compressed buffer */
+ ulint len, /*!< in: length of input buffer.*/
+ ulint compression_level, /*!< in: compression level */
+ ulint* out_len); /*!< out: actual length of compressed page */
+
+/****************************************************************//**
+For page compressed pages decompress the page after actual read
+operation.
+Returns nothing. NOTE(review): result is presumably left in buf -- confirm. */
+void
+fil_decompress_page(
+/*================*/
+ byte* page_buf, /*!< in: preallocated buffer or NULL */
+ byte* buf, /*!< out: buffer from which to read; in aio
+ this must be appropriately aligned */
+ ulint len); /*!< in: length of output buffer.*/
+
+/****************************************************************//**
+Get space id from fil node
+@return space id*/
+ulint
+fil_node_get_space_id(
+/*==================*/
+ fil_node_t* node); /*!< in: Node where to get space id*/
+
+/*******************************************************************//**
+Find out whether the page is page compressed
+@return true if page type is FIL_PAGE_PAGE_COMPRESSED, false if not */
+ibool
+fil_page_is_compressed(
+/*===================*/
+ byte *buf); /*!< in: page */
+
+#endif
diff --git a/storage/xtradb/include/fsp0fsp.h b/storage/xtradb/include/fsp0fsp.h
index a587ccc9f20..6fe44a0ef16 100644
--- a/storage/xtradb/include/fsp0fsp.h
+++ b/storage/xtradb/include/fsp0fsp.h
@@ -1,6 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -53,12 +54,21 @@ to the two Barracuda row formats COMPRESSED and DYNAMIC. */
/** Width of the DATA_DIR flag. This flag indicates that the tablespace
is found in a remote location, not the default data directory. */
#define FSP_FLAGS_WIDTH_DATA_DIR 1
+/** Number of flag bits used to indicate the page compression and compression level */
+#define FSP_FLAGS_WIDTH_PAGE_COMPRESSION 1
+#define FSP_FLAGS_WIDTH_PAGE_COMPRESSION_LEVEL 4
+/** Number of flag bits used to indicate atomic writes for this tablespace */
+#define FSP_FLAGS_WIDTH_ATOMIC_WRITES 2
+
/** Width of all the currently known tablespace flags */
#define FSP_FLAGS_WIDTH (FSP_FLAGS_WIDTH_POST_ANTELOPE \
+ FSP_FLAGS_WIDTH_ZIP_SSIZE \
+ FSP_FLAGS_WIDTH_ATOMIC_BLOBS \
+ FSP_FLAGS_WIDTH_PAGE_SSIZE \
- + FSP_FLAGS_WIDTH_DATA_DIR)
+ + FSP_FLAGS_WIDTH_DATA_DIR \
+ + FSP_FLAGS_WIDTH_PAGE_COMPRESSION \
+ + FSP_FLAGS_WIDTH_PAGE_COMPRESSION_LEVEL \
+ + FSP_FLAGS_WIDTH_ATOMIC_WRITES)
/** A mask of all the known/used bits in tablespace flags */
#define FSP_FLAGS_MASK (~(~0 << FSP_FLAGS_WIDTH))
@@ -71,10 +81,21 @@ is found in a remote location, not the default data directory. */
/** Zero relative shift position of the ATOMIC_BLOBS field */
#define FSP_FLAGS_POS_ATOMIC_BLOBS (FSP_FLAGS_POS_ZIP_SSIZE \
+ FSP_FLAGS_WIDTH_ZIP_SSIZE)
-/** Zero relative shift position of the PAGE_SSIZE field */
-#define FSP_FLAGS_POS_PAGE_SSIZE (FSP_FLAGS_POS_ATOMIC_BLOBS \
+/** Note that these need to be before the page size to be compatible with
+dictionary */
+/** Zero relative shift position of the PAGE_COMPRESSION field */
+#define FSP_FLAGS_POS_PAGE_COMPRESSION (FSP_FLAGS_POS_ATOMIC_BLOBS \
+ FSP_FLAGS_WIDTH_ATOMIC_BLOBS)
-/** Zero relative shift position of the start of the UNUSED bits */
+/** Zero relative shift position of the PAGE_COMPRESSION_LEVEL field */
+#define FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL (FSP_FLAGS_POS_PAGE_COMPRESSION \
+ + FSP_FLAGS_WIDTH_PAGE_COMPRESSION)
+/** Zero relative shift position of the ATOMIC_WRITES field */
+#define FSP_FLAGS_POS_ATOMIC_WRITES (FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL \
+ + FSP_FLAGS_WIDTH_PAGE_COMPRESSION_LEVEL)
+ /** Zero relative shift position of the PAGE_SSIZE field */
+#define FSP_FLAGS_POS_PAGE_SSIZE (FSP_FLAGS_POS_ATOMIC_WRITES \
+ + FSP_FLAGS_WIDTH_ATOMIC_WRITES)
+/** Zero relative shift position of the start of the DATA DIR bits */
#define FSP_FLAGS_POS_DATA_DIR (FSP_FLAGS_POS_PAGE_SSIZE \
+ FSP_FLAGS_WIDTH_PAGE_SSIZE)
/** Zero relative shift position of the start of the UNUSED bits */
@@ -101,6 +122,19 @@ is found in a remote location, not the default data directory. */
#define FSP_FLAGS_MASK_DATA_DIR \
((~(~0 << FSP_FLAGS_WIDTH_DATA_DIR)) \
<< FSP_FLAGS_POS_DATA_DIR)
+/** Bit mask of the PAGE_COMPRESSION field */
+#define FSP_FLAGS_MASK_PAGE_COMPRESSION \
+ ((~(~0 << FSP_FLAGS_WIDTH_PAGE_COMPRESSION)) \
+ << FSP_FLAGS_POS_PAGE_COMPRESSION)
+/** Bit mask of the PAGE_COMPRESSION_LEVEL field */
+#define FSP_FLAGS_MASK_PAGE_COMPRESSION_LEVEL \
+ ((~(~0 << FSP_FLAGS_WIDTH_PAGE_COMPRESSION_LEVEL)) \
+ << FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL)
+/** Bit mask of the ATOMIC_WRITES field */
+#define FSP_FLAGS_MASK_ATOMIC_WRITES \
+ ((~(~0 << FSP_FLAGS_WIDTH_ATOMIC_WRITES)) \
+ << FSP_FLAGS_POS_ATOMIC_WRITES)
+
/** Return the value of the POST_ANTELOPE field */
#define FSP_FLAGS_GET_POST_ANTELOPE(flags) \
@@ -125,12 +159,38 @@ is found in a remote location, not the default data directory. */
/** Return the contents of the UNUSED bits */
#define FSP_FLAGS_GET_UNUSED(flags) \
(flags >> FSP_FLAGS_POS_UNUSED)
+/** Return the value of the PAGE_COMPRESSION field (flags may be any expression) */
+#define FSP_FLAGS_GET_PAGE_COMPRESSION(flags) \
+ (((flags) & FSP_FLAGS_MASK_PAGE_COMPRESSION) \
+ >> FSP_FLAGS_POS_PAGE_COMPRESSION)
+/** Return the value of the PAGE_COMPRESSION_LEVEL field (flags may be any expression) */
+#define FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags) \
+ (((flags) & FSP_FLAGS_MASK_PAGE_COMPRESSION_LEVEL) \
+ >> FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL)
+/** Return the value of the ATOMIC_WRITES field (flags may be any expression) */
+#define FSP_FLAGS_GET_ATOMIC_WRITES(flags) \
+ (((flags) & FSP_FLAGS_MASK_ATOMIC_WRITES) \
+ >> FSP_FLAGS_POS_ATOMIC_WRITES)
/** Set a PAGE_SSIZE into the correct bits in a given
tablespace flags. */
#define FSP_FLAGS_SET_PAGE_SSIZE(flags, ssize) \
(flags | (ssize << FSP_FLAGS_POS_PAGE_SSIZE))
+/** Evaluate to the given tablespace flags with PAGE_COMPRESSION OR-ed into
+its bit position. The flags argument itself is not modified. */
+#define FSP_FLAGS_SET_PAGE_COMPRESSION(flags, compression) \
+ ((flags) | ((compression) << FSP_FLAGS_POS_PAGE_COMPRESSION))
+
+/** Evaluate to the given tablespace flags with PAGE_COMPRESSION_LEVEL OR-ed
+into its bit position. The flags argument itself is not modified. */
+#define FSP_FLAGS_SET_PAGE_COMPRESSION_LEVEL(flags, level) \
+ ((flags) | ((level) << FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL))
+/** Evaluate to the given tablespace flags with ATOMIC_WRITES OR-ed into
+its bit position. The flags argument itself is not modified. */
+#define FSP_FLAGS_SET_ATOMIC_WRITES(flags, atomics) \
+ ((flags) | ((atomics) << FSP_FLAGS_POS_ATOMIC_WRITES))
+
/* @} */
/* @defgroup Tablespace Header Constants (moved from fsp0fsp.c) @{ */
diff --git a/storage/xtradb/include/fsp0fsp.ic b/storage/xtradb/include/fsp0fsp.ic
index 0d81e817cc9..bc46967fab0 100644
--- a/storage/xtradb/include/fsp0fsp.ic
+++ b/storage/xtradb/include/fsp0fsp.ic
@@ -1,6 +1,7 @@
/*****************************************************************************
Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -63,6 +64,10 @@ fsp_flags_is_valid(
ulint atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(flags);
ulint page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(flags);
ulint unused = FSP_FLAGS_GET_UNUSED(flags);
+ ulint page_compression = FSP_FLAGS_GET_PAGE_COMPRESSION(flags);
+ ulint page_compression_level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags);
+ ulint atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(flags);
+ atomic_writes_t awrites = (atomic_writes_t)atomic_writes;
DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", return(false););
@@ -108,6 +113,20 @@ fsp_flags_is_valid(
# error "UNIV_FORMAT_MAX != UNIV_FORMAT_B, Add more validations."
#endif
+ /* Page compression level requires page compression and atomic blobs
+ to be set */
+ if (page_compression_level || page_compression) {
+ if (!page_compression || !atomic_blobs) {
+ return(false);
+ }
+ }
+
+ if ((awrites == ATOMIC_WRITES_ON ||
+ (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes))
+ && !atomic_blobs) {
+ return (false);
+ }
+
/* The DATA_DIR field can be used for any row type so there is
nothing here to validate. */
diff --git a/storage/xtradb/include/fsp0pagecompress.h b/storage/xtradb/include/fsp0pagecompress.h
new file mode 100644
index 00000000000..4913f1d6b29
--- /dev/null
+++ b/storage/xtradb/include/fsp0pagecompress.h
@@ -0,0 +1,73 @@
+/*****************************************************************************
+
+Copyright (C) 2013 SkySQL Ab. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fsp0pagecompress.h
+Helper functions for extracting/storing page compression and
+atomic writes information to file space.
+
+Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com
+***********************************************************************/
+
+#ifndef fsp0pagecompress_h
+#define fsp0pagecompress_h
+
+/**********************************************************************//**
+Reads the page compression level from the first page of a tablespace.
+@return page compression level, or 0 if uncompressed */
+UNIV_INTERN
+ulint
+fsp_header_get_compression_level(
+/*=============================*/
+ const page_t* page); /*!< in: first page of a tablespace */
+
+/********************************************************************//**
+Determine if the tablespace is page compressed from dict_table_t::flags.
+@return TRUE if page compressed, FALSE if not compressed */
+UNIV_INLINE
+ibool
+fsp_flags_is_page_compressed(
+/*=========================*/
+ ulint flags); /*!< in: tablespace flags */
+
+/********************************************************************//**
+Extract the page compression level from tablespace flags.
+A tablespace has only one physical page compression level
+whether that page is compressed or not.
+@return page compression level of the file-per-table tablespace,
+or zero if the table is not compressed. */
+UNIV_INLINE
+ulint
+fsp_flags_get_page_compression_level(
+/*=================================*/
+ ulint flags); /*!< in: tablespace flags */
+
+/********************************************************************//**
+Extract the atomic writes setting from tablespace flags.
+@return atomic writes table option value stored in the flags */
+UNIV_INLINE
+atomic_writes_t
+fsp_flags_get_atomic_writes(
+/*========================*/
+ ulint flags); /*!< in: tablespace flags */
+
+#ifndef UNIV_NONINL
+#include "fsp0pagecompress.ic"
+#endif
+
+#endif
diff --git a/storage/xtradb/include/fsp0pagecompress.ic b/storage/xtradb/include/fsp0pagecompress.ic
new file mode 100644
index 00000000000..873f6cd401d
--- /dev/null
+++ b/storage/xtradb/include/fsp0pagecompress.ic
@@ -0,0 +1,177 @@
+/*****************************************************************************
+
+Copyright (C) 2013, 2014, SkySQL Ab. All Rights Reserved.
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; version 2 of the License.
+
+This program is distributed in the hope that it will be useful, but WITHOUT
+ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along with
+this program; if not, write to the Free Software Foundation, Inc.,
+51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+*****************************************************************************/
+
+/******************************************************************//**
+@file include/fsp0pagecompress.ic
+Implementation for helper functions for extracting/storing page
+compression and atomic writes information to file space.
+
+Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com
+***********************************************************************/
+
+#include "fsp0fsp.h"
+
+
+/********************************************************************//**
+Determine if the tablespace is page compressed (PAGE_COMPRESSION flag field).
+@return value of the PAGE_COMPRESSION field: nonzero if page compressed, 0 if not */
+UNIV_INLINE
+ibool
+fsp_flags_is_page_compressed(
+/*=========================*/
+ ulint flags) /*!< in: tablespace flags */
+{
+ return(FSP_FLAGS_GET_PAGE_COMPRESSION(flags));
+}
+
+/********************************************************************//**
+Extract the page compression level from the tablespace flags.
+@return value of the PAGE_COMPRESSION_LEVEL field (0 when no level is stored) */
+UNIV_INLINE
+ulint
+fsp_flags_get_page_compression_level(
+/*=================================*/
+ ulint flags) /*!< in: tablespace flags */
+{
+ return(FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags));
+}
+
+/********************************************************************//**
+Extract the atomic writes setting from the tablespace flags.
+@return ATOMIC_WRITES field of the flags cast to atomic_writes_t */
+UNIV_INLINE
+atomic_writes_t
+fsp_flags_get_atomic_writes(
+/*========================*/
+ ulint flags) /*!< in: tablespace flags */
+{
+ return((atomic_writes_t)FSP_FLAGS_GET_ATOMIC_WRITES(flags));
+}
+
+/*******************************************************************//**
+Find out whether the page is an index page or not
+@return true if the 2-byte FIL_PAGE_TYPE field equals FIL_PAGE_INDEX, false if not */
+UNIV_INLINE
+ibool
+fil_page_is_index_page(
+/*===================*/
+ byte *buf) /*!< in: page */
+{
+ return(mach_read_from_2(buf+FIL_PAGE_TYPE) == FIL_PAGE_INDEX);
+}
+
+/*******************************************************************//**
+Find out whether the page is page compressed
+@return true if the 2-byte FIL_PAGE_TYPE field equals FIL_PAGE_PAGE_COMPRESSED, false if not */
+UNIV_INLINE
+ibool
+fil_page_is_compressed(
+/*===================*/
+ byte *buf) /*!< in: page */
+{
+ return(mach_read_from_2(buf+FIL_PAGE_TYPE) == FIL_PAGE_PAGE_COMPRESSED);
+}
+
+/*******************************************************************//**
+Returns the page compression level of the space, or 0 if the flags of the
+space are 0. The tablespace must be cached in the memory cache.
+@return page compression level, ULINT_UNDEFINED if space not found */
+UNIV_INLINE
+ulint
+fil_space_get_page_compression_level(
+/*=================================*/
+ ulint id) /*!< in: space id */
+{
+ ulint flags;
+
+ flags = fil_space_get_flags(id);
+
+ if (flags && flags != ULINT_UNDEFINED) {
+
+ return(fsp_flags_get_page_compression_level(flags));
+ }
+ /* flags is either 0 or ULINT_UNDEFINED here; hand it back unchanged */
+ return(flags);
+}
+
+/*******************************************************************//**
+Extract the page compression from space.
+@return true if space is page compressed, false if space is not found
+or space is not page compressed. */
+UNIV_INLINE
+ibool
+fil_space_is_page_compressed(
+/*=========================*/
+ ulint id) /*!< in: space id */
+{
+ ulint flags;
+
+ flags = fil_space_get_flags(id);
+
+ if (flags && flags != ULINT_UNDEFINED) {
+
+ return(fsp_flags_is_page_compressed(flags));
+ }
+ /* flags is 0 or ULINT_UNDEFINED here: neither means page compressed */
+ return(0);
+}
+
+/****************************************************************//**
+Map a page compression algorithm number to a printable name.
+Numbers other than the known FIL_PAGE_COMPRESSION_* values map to "UNKNOWN".
+@return compression algorithm name or "UNKNOWN" if not known*/
+UNIV_INLINE
+const char*
+fil_get_compression_alg_name(
+/*=========================*/
+ ulint comp_alg) /*!<in: compression algorithm number */
+{
+ const char* alg_name = "UNKNOWN";
+
+ switch (comp_alg) {
+ case FIL_PAGE_COMPRESSION_ZLIB:
+ alg_name = "ZLIB";
+ break;
+ case FIL_PAGE_COMPRESSION_LZ4:
+ alg_name = "LZ4";
+ break;
+ }
+ return(alg_name);
+}
+
+/*******************************************************************//**
+Returns the atomic writes setting of the space, or (atomic_writes_t)0 if the
+flags of the space are 0 or ULINT_UNDEFINED. The tablespace must be cached in the memory cache.
+@return atomic writes table option value */
+UNIV_INLINE
+atomic_writes_t
+fil_space_get_atomic_writes(
+/*========================*/
+ ulint id) /*!< in: space id */
+{
+ ulint flags;
+
+ flags = fil_space_get_flags(id);
+
+ if (flags && flags != ULINT_UNDEFINED) {
+
+ return((atomic_writes_t)fsp_flags_get_atomic_writes(flags));
+ }
+
+ return((atomic_writes_t)0);
+}
diff --git a/storage/xtradb/include/os0file.h b/storage/xtradb/include/os0file.h
index 564b579edc8..e5abd4e2961 100644
--- a/storage/xtradb/include/os0file.h
+++ b/storage/xtradb/include/os0file.h
@@ -2,6 +2,7 @@
Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2009, Percona Inc.
+Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
Portions of this file contain modifications contributed and copyrighted
by Percona Inc.. Those modifications are
@@ -155,6 +156,7 @@ enum os_file_create_t {
#define OS_FILE_INSUFFICIENT_RESOURCE 78
#define OS_FILE_AIO_INTERRUPTED 79
#define OS_FILE_OPERATION_ABORTED 80
+#define OS_FILE_OPERATION_NOT_SUPPORTED 125
/* @} */
/** Types for aio operations @{ */
@@ -300,26 +302,28 @@ os_file_write
The wrapper functions have the prefix of "innodb_". */
#ifdef UNIV_PFS_IO
-# define os_file_create(key, name, create, purpose, type, success) \
+# define os_file_create(key, name, create, purpose, type, success, atomic_writes) \
pfs_os_file_create_func(key, name, create, purpose, type, \
- success, __FILE__, __LINE__)
+ success, atomic_writes, __FILE__, __LINE__)
# define os_file_create_simple(key, name, create, access, success) \
pfs_os_file_create_simple_func(key, name, create, access, \
success, __FILE__, __LINE__)
# define os_file_create_simple_no_error_handling( \
- key, name, create_mode, access, success) \
+ key, name, create_mode, access, success, atomic_writes) \
pfs_os_file_create_simple_no_error_handling_func( \
- key, name, create_mode, access, success, __FILE__, __LINE__)
+ key, name, create_mode, access, success, atomic_writes, __FILE__, __LINE__)
# define os_file_close(file) \
pfs_os_file_close_func(file, __FILE__, __LINE__)
# define os_aio(type, mode, name, file, buf, offset, \
- n, message1, message2, space_id, trx) \
+ n, message1, message2, space_id, \
+ trx, page_compressed, page_compression_level, write_size) \
pfs_os_aio_func(type, mode, name, file, buf, offset, \
n, message1, message2, space_id, trx, \
+ page_compressed, page_compression_level, write_size, \
__FILE__, __LINE__)
# define os_file_read(file, buf, offset, n) \
@@ -353,23 +357,25 @@ The wrapper functions have the prefix of "innodb_". */
/* If UNIV_PFS_IO is not defined, these I/O APIs point
to original un-instrumented file I/O APIs */
-# define os_file_create(key, name, create, purpose, type, success) \
- os_file_create_func(name, create, purpose, type, success)
+# define os_file_create(key, name, create, purpose, type, success, atomic_writes) \
+ os_file_create_func(name, create, purpose, type, success, atomic_writes)
# define os_file_create_simple(key, name, create_mode, access, success) \
os_file_create_simple_func(name, create_mode, access, success)
# define os_file_create_simple_no_error_handling( \
- key, name, create_mode, access, success) \
+ key, name, create_mode, access, success, atomic_writes) \
os_file_create_simple_no_error_handling_func( \
- name, create_mode, access, success)
+ name, create_mode, access, success, atomic_writes)
# define os_file_close(file) os_file_close_func(file)
# define os_aio(type, mode, name, file, buf, offset, n, message1, \
- message2, space_id, trx) \
+ message2, space_id, trx, \
+ page_compressed, page_compression_level, write_size) \
os_aio_func(type, mode, name, file, buf, offset, n, \
- message1, message2, space_id, trx)
+ message1, message2, space_id, trx, \
+ page_compressed, page_compression_level, write_size)
# define os_file_read(file, buf, offset, n) \
os_file_read_func(file, buf, offset, n, NULL)
@@ -520,7 +526,9 @@ os_file_create_simple_func(
ulint create_mode,/*!< in: create mode */
ulint access_type,/*!< in: OS_FILE_READ_ONLY or
OS_FILE_READ_WRITE */
- ibool* success);/*!< out: TRUE if succeed, FALSE if error */
+ ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ ulint atomic_writes);/*!< in: atomic writes table option
+ value */
/****************************************************************//**
NOTE! Use the corresponding macro
os_file_create_simple_no_error_handling(), not directly this function!
@@ -538,7 +546,9 @@ os_file_create_simple_no_error_handling_func(
OS_FILE_READ_WRITE, or
OS_FILE_READ_ALLOW_DELETE; the last option is
used by a backup program reading the file */
- ibool* success)/*!< out: TRUE if succeed, FALSE if error */
+ ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ ulint atomic_writes)/*!< in: atomic writes table option
+ value */
__attribute__((nonnull, warn_unused_result));
/****************************************************************//**
Tries to disable OS caching on an opened file descriptor. */
@@ -572,7 +582,9 @@ os_file_create_func(
async i/o or unbuffered i/o: look in the
function source code for the exact rules */
ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */
- ibool* success)/*!< out: TRUE if succeed, FALSE if error */
+ ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ ulint atomic_writes) /*!< in: atomic writes table option
+ value */
__attribute__((nonnull, warn_unused_result));
/***********************************************************************//**
Deletes a file. The file has to be closed before calling this.
@@ -637,6 +649,8 @@ pfs_os_file_create_simple_func(
ulint access_type,/*!< in: OS_FILE_READ_ONLY or
OS_FILE_READ_WRITE */
ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ ulint atomic_writes,/*!< in: atomic writes table option
+ value */
const char* src_file,/*!< in: file name where func invoked */
ulint src_line)/*!< in: line where the func invoked */
__attribute__((nonnull, warn_unused_result));
@@ -662,6 +676,8 @@ pfs_os_file_create_simple_no_error_handling_func(
OS_FILE_READ_ALLOW_DELETE; the last option is
used by a backup program reading the file */
ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ ulint atomic_writes,/*!< in: atomic writes table option
+ value*/
const char* src_file,/*!< in: file name where func invoked */
ulint src_line)/*!< in: line where the func invoked */
__attribute__((nonnull, warn_unused_result));
@@ -690,6 +706,8 @@ pfs_os_file_create_func(
function source code for the exact rules */
ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */
ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ ulint atomic_writes,/*!< in: atomic writes table option
+ value */
const char* src_file,/*!< in: file name where func invoked */
ulint src_line)/*!< in: line where the func invoked */
__attribute__((nonnull, warn_unused_result));
@@ -721,6 +739,8 @@ pfs_os_file_read_func(
os_offset_t offset, /*!< in: file offset where to read */
ulint n, /*!< in: number of bytes to read */
trx_t* trx,
+ ulint atomic_writes,/*!< in: atomic writes table option
+ value */
const char* src_file,/*!< in: file name where func invoked */
ulint src_line);/*!< in: line where the func invoked */
@@ -771,6 +791,15 @@ pfs_os_aio_func(
OS_AIO_SYNC */
ulint space_id,
trx_t* trx,
+ ibool page_compression, /*!< in: is page compression used
+ on this file space */
+ ulint page_compression_level, /*!< page compression
+ level to be used */
+ ulint* write_size,/*!< in/out: Actual write size initialized
+ after first successful trim
+ operation for this page and if
+ initialized we do not trim again if
+ actual page size does not decrease. */
const char* src_file,/*!< in: file name where func invoked */
ulint src_line);/*!< in: line where the func invoked */
/*******************************************************************//**
@@ -1143,7 +1172,17 @@ os_aio_func(
aio operation); ignored if mode is
OS_AIO_SYNC */
ulint space_id,
- trx_t* trx);
+ trx_t* trx,
+ ibool page_compression, /*!< in: is page compression used
+ on this file space */
+ ulint page_compression_level, /*!< page compression
+ level to be used */
+ ulint* write_size);/*!< in/out: Actual write size initialized
+ after first successful trim
+ operation for this page and if
+ initialized we do not trim again if
+ actual page size does not decrease. */
+
/************************************************************************//**
Wakes up all async i/o threads so that they know to exit themselves in
shutdown. */
diff --git a/storage/xtradb/include/os0file.ic b/storage/xtradb/include/os0file.ic
index 25a1397147e..5ad9e3f5461 100644
--- a/storage/xtradb/include/os0file.ic
+++ b/storage/xtradb/include/os0file.ic
@@ -1,6 +1,7 @@
/*****************************************************************************
Copyright (c) 2010, 2011, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -44,6 +45,8 @@ pfs_os_file_create_simple_func(
ulint access_type,/*!< in: OS_FILE_READ_ONLY or
OS_FILE_READ_WRITE */
ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ ulint atomic_writes,/*!< in: atomic writes table option
+ value */
const char* src_file,/*!< in: file name where func invoked */
ulint src_line)/*!< in: line where the func invoked */
{
@@ -59,7 +62,7 @@ pfs_os_file_create_simple_func(
name, src_file, src_line);
file = os_file_create_simple_func(name, create_mode,
- access_type, success);
+ access_type, success, atomic_writes);
/* Regsiter the returning "file" value with the system */
register_pfs_file_open_end(locker, file);
@@ -88,6 +91,8 @@ pfs_os_file_create_simple_no_error_handling_func(
OS_FILE_READ_ALLOW_DELETE; the last option is
used by a backup program reading the file */
ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ ulint atomic_writes,/*!< in: atomic writes table option
+ value */
const char* src_file,/*!< in: file name where func invoked */
ulint src_line)/*!< in: line where the func invoked */
{
@@ -103,7 +108,7 @@ pfs_os_file_create_simple_no_error_handling_func(
name, src_file, src_line);
file = os_file_create_simple_no_error_handling_func(
- name, create_mode, access_type, success);
+ name, create_mode, access_type, success, atomic_writes);
register_pfs_file_open_end(locker, file);
@@ -134,6 +139,8 @@ pfs_os_file_create_func(
function source code for the exact rules */
ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */
ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ ulint atomic_writes, /*!< in: atomic writes table option
+ value */
const char* src_file,/*!< in: file name where func invoked */
ulint src_line)/*!< in: line where the func invoked */
{
@@ -148,7 +155,8 @@ pfs_os_file_create_func(
: PSI_FILE_OPEN),
name, src_file, src_line);
- file = os_file_create_func(name, create_mode, purpose, type, success);
+ file = os_file_create_func(name, create_mode, purpose, type,
+ success, atomic_writes);
register_pfs_file_open_end(locker, file);
@@ -212,6 +220,15 @@ pfs_os_aio_func(
OS_AIO_SYNC */
ulint space_id,
trx_t* trx,
+ ibool page_compression, /*!< in: is page compression used
+ on this file space */
+ ulint page_compression_level, /*!< page compression
+ level to be used */
+ ulint* write_size,/*!< in/out: Actual write size initialized
+ after first successful trim
+ operation for this page and if
+ initialized we do not trim again if
+ actual page size does not decrease. */
const char* src_file,/*!< in: file name where func invoked */
ulint src_line)/*!< in: line where the func invoked */
{
@@ -227,7 +244,8 @@ pfs_os_aio_func(
src_file, src_line);
result = os_aio_func(type, mode, name, file, buf, offset,
- n, message1, message2, space_id, trx);
+ n, message1, message2, space_id, trx,
+ page_compression, page_compression_level, write_size);
register_pfs_file_io_end(locker, n);
diff --git a/storage/xtradb/include/srv0mon.h b/storage/xtradb/include/srv0mon.h
index 209894833a0..4f1de8a3eb7 100644
--- a/storage/xtradb/include/srv0mon.h
+++ b/storage/xtradb/include/srv0mon.h
@@ -2,6 +2,7 @@
Copyright (c) 2010, 2013, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the
@@ -163,6 +164,7 @@ enum monitor_id_t {
MONITOR_OVLD_BUF_POOL_PAGES_FREE,
MONITOR_OVLD_PAGE_CREATED,
MONITOR_OVLD_PAGES_WRITTEN,
+ MONITOR_OVLD_INDEX_PAGES_WRITTEN,
MONITOR_OVLD_PAGES_READ,
MONITOR_OVLD_BYTE_READ,
MONITOR_OVLD_BYTE_WRITTEN,
@@ -305,6 +307,15 @@ enum monitor_id_t {
MONITOR_PAD_INCREMENTS,
MONITOR_PAD_DECREMENTS,
+ /* New monitor variables for page compression */
+ MONITOR_OVLD_PAGE_COMPRESS_SAVED,
+ MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT512,
+ MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT4096,
+ MONITOR_OVLD_PAGES_PAGE_COMPRESSED,
+ MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP,
+ MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP_SAVED,
+ MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED,
+
/* Index related counters */
MONITOR_MODULE_INDEX,
MONITOR_INDEX_SPLIT,
diff --git a/storage/xtradb/include/srv0srv.h b/storage/xtradb/include/srv0srv.h
index d278782daa8..cc2221fc3c6 100644
--- a/storage/xtradb/include/srv0srv.h
+++ b/storage/xtradb/include/srv0srv.h
@@ -3,6 +3,7 @@
Copyright (c) 1995, 2012, Oracle and/or its affiliates. All rights reserved.
Copyright (c) 2008, 2009, Google Inc.
Copyright (c) 2009, Percona Inc.
+Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -102,6 +103,23 @@ struct srv_stats_t {
a disk page */
ulint_ctr_1_t buf_pool_reads;
+ /** Number of bytes saved by page compression */
+ ulint_ctr_64_t page_compression_saved;
+ /** Number of 512Byte TRIM by page compression */
+ ulint_ctr_64_t page_compression_trim_sect512;
+ /** Number of 4K TRIM by page compression */
+ ulint_ctr_64_t page_compression_trim_sect4096;
+ /* Number of index pages written */
+ ulint_ctr_64_t index_pages_written;
+ /* Number of pages compressed with page compression */
+ ulint_ctr_64_t pages_page_compressed;
+ /* Number of TRIM operations induced by page compression */
+ ulint_ctr_64_t page_compressed_trim_op;
+ /* Number of TRIM operations saved by using actual write size knowledge */
+ ulint_ctr_64_t page_compressed_trim_op_saved;
+ /* Number of pages decompressed with page compression */
+ ulint_ctr_64_t pages_page_decompressed;
+
/** Number of data read in total (in bytes) */
ulint_ctr_1_t data_read;
@@ -238,6 +256,27 @@ extern ibool srv_use_native_conditions;
#endif /* __WIN__ */
#endif /* !UNIV_HOTBACKUP */
+/* Is page compression used */
+extern my_bool srv_compress_pages;
+
+/* Is page compression used only for index pages */
+extern my_bool srv_page_compress_index_pages;
+
+/* Frequency of trim operations */
+extern long srv_trim_pct;
+
+/* Use trim operation */
+extern my_bool srv_use_trim;
+
+/* Use posix fallocate */
+extern my_bool srv_use_posix_fallocate;
+
+/* Use atomic writes i.e disable doublewrite buffer */
+extern my_bool srv_use_atomic_writes;
+
+/* If this flag IS TRUE, then we use lz4 to compress/decompress pages */
+extern my_bool srv_use_lz4;
+
/** Server undo tablespaces directory, can be absolute path. */
extern char* srv_undo_dir;
@@ -411,10 +450,6 @@ extern my_bool srv_stats_auto_recalc;
extern ibool srv_use_doublewrite_buf;
extern ulong srv_doublewrite_batch_size;
-extern ibool srv_use_atomic_writes;
-#ifdef HAVE_POSIX_FALLOCATE
-extern ibool srv_use_posix_fallocate;
-#endif
extern ulong srv_checksum_algorithm;
extern ulong srv_log_arch_expire_sec;
@@ -1058,6 +1093,25 @@ struct export_var_t{
ulint innodb_purge_view_trx_id_age; /*!< rw_max_trx_id
- purged view's min trx_id */
#endif /* UNIV_DEBUG */
+
+
+ ib_int64_t innodb_page_compression_saved;/*!< Number of bytes saved
+ by page compression */
+ ib_int64_t innodb_page_compression_trim_sect512;/*!< Number of 512b TRIM
+ by page compression */
+ ib_int64_t innodb_page_compression_trim_sect4096;/*!< Number of 4K byte TRIM
+ by page compression */
+ ib_int64_t innodb_index_pages_written; /*!< Number of index pages
+ written */
+ ib_int64_t innodb_pages_page_compressed;/*!< Number of pages
+ compressed by page compression */
+ ib_int64_t innodb_page_compressed_trim_op;/*!< Number of TRIM operations
+ induced by page compression */
+ ib_int64_t innodb_page_compressed_trim_op_saved;/*!< Number of TRIM operations
+ saved by page compression */
+ ib_int64_t innodb_pages_page_decompressed;/*!< Number of pages
+ decompressed by page
+ compression */
};
/** Thread slot in the thread table. */
diff --git a/storage/xtradb/log/log0log.cc b/storage/xtradb/log/log0log.cc
index 403ceda7a10..787944c23d6 100644
--- a/storage/xtradb/log/log0log.cc
+++ b/storage/xtradb/log/log0log.cc
@@ -1257,7 +1257,7 @@ log_group_file_header_flush(
(ulint) (dest_offset / UNIV_PAGE_SIZE),
(ulint) (dest_offset % UNIV_PAGE_SIZE),
OS_FILE_LOG_BLOCK_SIZE,
- buf, group);
+ buf, group, 0);
srv_stats.os_log_pending_writes.dec();
}
@@ -1385,7 +1385,7 @@ loop:
fil_io(OS_FILE_WRITE | OS_FILE_LOG, true, group->space_id, 0,
(ulint) (next_offset / UNIV_PAGE_SIZE),
(ulint) (next_offset % UNIV_PAGE_SIZE), write_len, buf,
- group);
+ group, 0);
srv_stats.os_log_pending_writes.dec();
@@ -1975,7 +1975,7 @@ log_group_checkpoint(
write_offset / UNIV_PAGE_SIZE,
write_offset % UNIV_PAGE_SIZE,
OS_FILE_LOG_BLOCK_SIZE,
- buf, ((byte*) group + 1));
+ buf, ((byte*) group + 1), 0);
ut_ad(((ulint) group & 0x1UL) == 0);
}
@@ -2055,7 +2055,7 @@ log_group_read_checkpoint_info(
fil_io(OS_FILE_READ | OS_FILE_LOG, true, group->space_id, 0,
field / UNIV_PAGE_SIZE, field % UNIV_PAGE_SIZE,
- OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL);
+ OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL, 0);
}
/******************************************************//**
@@ -2438,7 +2438,7 @@ loop:
fil_io(OS_FILE_READ | OS_FILE_LOG, sync, group->space_id, 0,
(ulint) (source_offset / UNIV_PAGE_SIZE),
(ulint) (source_offset % UNIV_PAGE_SIZE),
- len, buf, (type == LOG_ARCHIVE) ? &log_archive_io : NULL);
+ len, buf, (type == LOG_ARCHIVE) ? &log_archive_io : NULL, 0);
start_lsn += len;
buf += len;
@@ -2563,7 +2563,7 @@ log_group_archive_file_header_write(
dest_offset / UNIV_PAGE_SIZE,
dest_offset % UNIV_PAGE_SIZE,
2 * OS_FILE_LOG_BLOCK_SIZE,
- buf, &log_archive_io);
+ buf, &log_archive_io, 0);
}
/******************************************************//**
@@ -2600,7 +2600,7 @@ log_group_archive_completed_header_write(
dest_offset % UNIV_PAGE_SIZE,
OS_FILE_LOG_BLOCK_SIZE,
buf + LOG_FILE_ARCH_COMPLETED,
- &log_archive_io);
+ &log_archive_io, 0);
}
/******************************************************//**
@@ -2663,12 +2663,12 @@ loop:
file_handle = os_file_create(innodb_file_log_key,
name, open_mode,
OS_FILE_AIO,
- OS_DATA_FILE, &ret);
+ OS_DATA_FILE, &ret, FALSE);
if (!ret && (open_mode == OS_FILE_CREATE)) {
file_handle = os_file_create(
innodb_file_log_key, name, OS_FILE_OPEN,
- OS_FILE_AIO, OS_DATA_FILE, &ret);
+ OS_FILE_AIO, OS_DATA_FILE, &ret, FALSE);
}
if (!ret) {
@@ -2737,7 +2737,7 @@ loop:
(ulint) (next_offset / UNIV_PAGE_SIZE),
(ulint) (next_offset % UNIV_PAGE_SIZE),
ut_calc_align(len, OS_FILE_LOG_BLOCK_SIZE), buf,
- &log_archive_io);
+ &log_archive_io, 0);
start_lsn += len;
next_offset += len;
diff --git a/storage/xtradb/log/log0online.cc b/storage/xtradb/log/log0online.cc
index 8c2bc5602a9..2438303043c 100644
--- a/storage/xtradb/log/log0online.cc
+++ b/storage/xtradb/log/log0online.cc
@@ -547,7 +547,7 @@ log_online_start_bitmap_file(void)
log_bmp_sys->out.name,
OS_FILE_CREATE,
OS_FILE_READ_WRITE,
- &success);
+ &success, FALSE);
}
if (UNIV_UNLIKELY(!success)) {
@@ -707,7 +707,7 @@ log_online_read_init(void)
log_bmp_sys->out.file
= os_file_create_simple_no_error_handling
(innodb_file_bmp_key, log_bmp_sys->out.name, OS_FILE_OPEN,
- OS_FILE_READ_WRITE, &success);
+ OS_FILE_READ_WRITE, &success, FALSE);
if (!success) {
@@ -1491,7 +1491,7 @@ log_online_open_bitmap_file_read_only(
bitmap_file->name,
OS_FILE_OPEN,
OS_FILE_READ_ONLY,
- &success);
+ &success, FALSE);
if (UNIV_UNLIKELY(!success)) {
/* Here and below assume that bitmap file names do not
diff --git a/storage/xtradb/log/log0recv.cc b/storage/xtradb/log/log0recv.cc
index d0b833f2bba..1772def9f9b 100644
--- a/storage/xtradb/log/log0recv.cc
+++ b/storage/xtradb/log/log0recv.cc
@@ -2,6 +2,7 @@
Copyright (c) 1997, 2013, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
+Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
@@ -2131,7 +2132,7 @@ recv_apply_log_recs_for_backup(void)
error = fil_io(OS_FILE_READ, true,
recv_addr->space, zip_size,
recv_addr->page_no, 0, zip_size,
- block->page.zip.data, NULL);
+ block->page.zip.data, NULL, 0);
if (error == DB_SUCCESS
&& !buf_zip_decompress(block, TRUE)) {
exit(1);
@@ -2141,7 +2142,7 @@ recv_apply_log_recs_for_backup(void)
recv_addr->space, 0,
recv_addr->page_no, 0,
UNIV_PAGE_SIZE,
- block->frame, NULL);
+ block->frame, NULL, 0);
}
if (error != DB_SUCCESS) {
@@ -2170,13 +2171,13 @@ recv_apply_log_recs_for_backup(void)
recv_addr->space, zip_size,
recv_addr->page_no, 0,
zip_size,
- block->page.zip.data, NULL);
+ block->page.zip.data, NULL, 0);
} else {
error = fil_io(OS_FILE_WRITE, true,
recv_addr->space, 0,
recv_addr->page_no, 0,
UNIV_PAGE_SIZE,
- block->frame, NULL);
+ block->frame, NULL, 0);
}
skip_this_recv_addr:
recv_addr = HASH_GET_NEXT(addr_hash, recv_addr);
@@ -3144,7 +3145,7 @@ recv_recovery_from_checkpoint_start_func(
fil_io(OS_FILE_READ | OS_FILE_LOG, true, max_cp_group->space_id, 0,
0, 0, LOG_FILE_HDR_SIZE,
- log_hdr_buf, max_cp_group);
+ log_hdr_buf, max_cp_group, 0);
if (0 == ut_memcmp(log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP,
(byte*)"ibbackup", (sizeof "ibbackup") - 1)) {
@@ -3175,7 +3176,7 @@ recv_recovery_from_checkpoint_start_func(
fil_io(OS_FILE_WRITE | OS_FILE_LOG, true,
max_cp_group->space_id, 0,
0, 0, OS_FILE_LOG_BLOCK_SIZE,
- log_hdr_buf, max_cp_group);
+ log_hdr_buf, max_cp_group, 0);
}
log_hdr_log_block_size
@@ -3775,7 +3776,7 @@ try_open_again:
file_handle = os_file_create(innodb_file_log_key,
name, OS_FILE_OPEN,
- OS_FILE_LOG, OS_FILE_AIO, &ret);
+ OS_FILE_LOG, OS_FILE_AIO, &ret, FALSE);
if (ret == FALSE) {
ask_again:
@@ -3827,7 +3828,7 @@ ask_again:
/* Read the archive file header */
fil_io(OS_FILE_READ | OS_FILE_LOG, true, group->archive_space_id, 0,
0, 0,
- LOG_FILE_HDR_SIZE, buf, NULL);
+ LOG_FILE_HDR_SIZE, buf, NULL, 0);
/* Check if the archive file header is consistent */
@@ -3901,7 +3902,7 @@ ask_again:
fil_io(OS_FILE_READ | OS_FILE_LOG, true,
group->archive_space_id, 0,
read_offset / UNIV_PAGE_SIZE,
- read_offset % UNIV_PAGE_SIZE, len, buf, NULL);
+ read_offset % UNIV_PAGE_SIZE, len, buf, NULL, 0);
ret = recv_scan_log_recs(
(buf_pool_get_n_pages()
diff --git a/storage/xtradb/os/os0file.cc b/storage/xtradb/os/os0file.cc
index 38eb5241da1..43adf78c63c 100644
--- a/storage/xtradb/os/os0file.cc
+++ b/storage/xtradb/os/os0file.cc
@@ -2,6 +2,7 @@
Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2009, Percona Inc.
+Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
Portions of this file contain modifications contributed and copyrighted
by Percona Inc.. Those modifications are
@@ -42,10 +43,16 @@ Created 10/21/1995 Heikki Tuuri
#include "srv0srv.h"
#include "srv0start.h"
#include "fil0fil.h"
+#include "fil0pagecompress.h"
#include "buf0buf.h"
#include "btr0types.h"
#include "trx0trx.h"
#include "srv0mon.h"
+#include "srv0srv.h"
+#ifdef HAVE_POSIX_FALLOCATE
+#include "fcntl.h"
+#include "linux/falloc.h"
+#endif
#ifndef UNIV_HOTBACKUP
# include "os0sync.h"
# include "os0thread.h"
@@ -196,6 +203,28 @@ struct os_aio_slot_t{
and which can be used to identify
which pending aio operation was
completed */
+ ulint bitmap;
+
+ byte* page_compression_page; /*!< Memory allocated for
+ page compressed page and
+ freed after the write
+ has been completed */
+
+ ibool page_compression;
+ ulint page_compression_level;
+
+ ulint* write_size; /*!< Actual write size initialized
+ after first successful trim
+ operation for this page and if
+ initialized we do not trim again if
+ actual page size does not decrease. */
+
+ byte* page_buf; /*!< Actual page buffer for
+ page compressed pages, do not
+ free this */
+
+ ibool page_compress_success;
+
#ifdef LINUX_NATIVE_AIO
struct iocb control; /* Linux control block for aio */
int n_bytes; /* bytes written/read. */
@@ -301,6 +330,58 @@ UNIV_INTERN ulint os_n_pending_writes = 0;
/** Number of pending read operations */
UNIV_INTERN ulint os_n_pending_reads = 0;
+/** After first fallocate failure we will disable os_file_trim */
+UNIV_INTERN ibool os_fallocate_failed = FALSE;
+
+/**********************************************************************//**
+Directly manipulate the allocated disk space by deallocating for the file referred to
+by fd for the byte range starting at offset and continuing for len bytes.
+Within the specified range, partial file system blocks are zeroed, and whole
+file system blocks are removed from the file. After a successful call,
+subsequent reads from this range will return zeroes.
+@return true if success, false if error */
+UNIV_INTERN
+ibool
+os_file_trim(
+/*=========*/
+ os_file_t file, /*!< in: file to be trimmed */
+ os_aio_slot_t* slot, /*!< in: slot structure */
+ ulint len); /*!< in: length of area */
+
+/**********************************************************************//**
+Allocate memory for a temporary buffer used for page compression. This
+buffer is freed later. */
+UNIV_INTERN
+void
+os_slot_alloc_page_buf(
+/*===================*/
+ os_aio_slot_t* slot); /*!< in: slot structure */
+
+/****************************************************************//**
+Does error handling when a file operation fails.
+@return TRUE if we should retry the operation */
+static
+ibool
+os_file_handle_error_no_exit(
+/*=========================*/
+ const char* name, /*!< in: name of a file or NULL */
+ const char* operation, /*!< in: operation */
+ ibool on_error_silent,/*!< in: if TRUE then don't print
+ any message to the log. */
+ const char* file, /*!< in: file name */
+ const ulint line); /*!< in: line */
+
+/****************************************************************//**
+Tries to enable the atomic write feature, if available, for the specified file
+handle.
+@return TRUE if success */
+static __attribute__((warn_unused_result))
+ibool
+os_file_set_atomic_writes(
+/*======================*/
+ const char* name, /*!< in: name of the file */
+ os_file_t file); /*!< in: handle to the file */
+
#ifdef UNIV_DEBUG
# ifndef UNIV_HOTBACKUP
/**********************************************************************//**
@@ -537,6 +618,16 @@ os_file_get_last_error_low(
"InnoDB: because of either a thread exit"
" or an application request.\n"
"InnoDB: Retry attempt is made.\n");
+ } else if (err == ECANCELED) {
+ fprintf(stderr,
+ "InnoDB: Operation canceled (%d):%s\n",
+ err, strerror(err));
+
+ if(srv_use_atomic_writes) {
+ fprintf(stderr,
+ "InnoDB: Error trying to enable atomic writes on "
+ "non-supported destination!\n");
+ }
} else {
fprintf(stderr,
"InnoDB: Some operating system error numbers"
@@ -633,6 +724,8 @@ os_file_get_last_error_low(
return(OS_FILE_AIO_RESOURCES_RESERVED);
}
break;
+ case ECANCELED:
+ return(OS_FILE_OPERATION_NOT_SUPPORTED);
case EINTR:
if (srv_use_native_aio) {
return(OS_FILE_AIO_INTERRUPTED);
@@ -672,9 +765,11 @@ os_file_handle_error_cond_exit(
const char* operation, /*!< in: operation */
ibool should_exit, /*!< in: call exit(3) if unknown error
and this parameter is TRUE */
- ibool on_error_silent)/*!< in: if TRUE then don't print
+ ibool on_error_silent,/*!< in: if TRUE then don't print
any message to the log iff it is
an unknown non-fatal error */
+ const char* file, /*!< in: file name */
+ const ulint line) /*!< in: line */
{
ulint err;
@@ -706,6 +801,9 @@ os_file_handle_error_cond_exit(
os_has_said_disk_full = TRUE;
+ fprintf(stderr,
+ " InnoDB: at file %s and at line %ld\n", file, line);
+
fflush(stderr);
return(FALSE);
@@ -737,6 +835,9 @@ os_file_handle_error_cond_exit(
is better to ignore on_error_silent and print an error message
to the log. */
+ fprintf(stderr,
+ " InnoDB: at file %s and at line %ld\n", file, line);
+
if (should_exit || !on_error_silent) {
ib_logf(IB_LOG_LEVEL_ERROR, "File %s: '%s' returned OS "
"error " ULINTPF ".%s", name ? name : "(unknown)",
@@ -760,10 +861,12 @@ ibool
os_file_handle_error(
/*=================*/
const char* name, /*!< in: name of a file or NULL */
- const char* operation) /*!< in: operation */
+ const char* operation, /*!< in: operation */
+ const char* file, /*!< in: file name */
+ const ulint line) /*!< in: line */
{
/* exit in case of unknown error */
- return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE));
+ return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE, file, line));
}
/****************************************************************//**
@@ -775,12 +878,14 @@ os_file_handle_error_no_exit(
/*=========================*/
const char* name, /*!< in: name of a file or NULL */
const char* operation, /*!< in: operation */
- ibool on_error_silent)/*!< in: if TRUE then don't print
+ ibool on_error_silent,/*!< in: if TRUE then don't print
any message to the log. */
+ const char* file, /*!< in: file name */
+ const ulint line) /*!< in: line */
{
/* don't exit in case of unknown error */
return(os_file_handle_error_cond_exit(
- name, operation, FALSE, on_error_silent));
+ name, operation, FALSE, on_error_silent, file, line));
}
#undef USE_FILE_LOCK
@@ -923,7 +1028,7 @@ os_file_opendir(
if (dir == INVALID_HANDLE_VALUE) {
if (error_is_fatal) {
- os_file_handle_error(dirname, "opendir");
+ os_file_handle_error(dirname, "opendir", __FILE__, __LINE__);
}
return(NULL);
@@ -934,7 +1039,7 @@ os_file_opendir(
dir = opendir(dirname);
if (dir == NULL && error_is_fatal) {
- os_file_handle_error(dirname, "opendir");
+ os_file_handle_error(dirname, "opendir", __FILE__, __LINE__);
}
return(dir);
@@ -956,7 +1061,7 @@ os_file_closedir(
ret = FindClose(dir);
if (!ret) {
- os_file_handle_error_no_exit(NULL, "closedir", FALSE);
+ os_file_handle_error_no_exit(NULL, "closedir", FALSE, __FILE__, __LINE__);
return(-1);
}
@@ -968,7 +1073,7 @@ os_file_closedir(
ret = closedir(dir);
if (ret) {
- os_file_handle_error_no_exit(NULL, "closedir", FALSE);
+ os_file_handle_error_no_exit(NULL, "closedir", FALSE, __FILE__, __LINE__);
}
return(ret);
@@ -1040,7 +1145,7 @@ next_file:
return(1);
} else {
- os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE);
+ os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE, __FILE__, __LINE__);
return(-1);
}
#else
@@ -1126,7 +1231,7 @@ next_file:
goto next_file;
}
- os_file_handle_error_no_exit(full_path, "stat", FALSE);
+ os_file_handle_error_no_exit(full_path, "stat", FALSE, __FILE__, __LINE__);
ut_free(full_path);
@@ -1177,7 +1282,7 @@ os_file_create_directory(
&& !fail_if_exists))) {
os_file_handle_error_no_exit(
- pathname, "CreateDirectory", FALSE);
+ pathname, "CreateDirectory", FALSE, __FILE__, __LINE__);
return(FALSE);
}
@@ -1190,7 +1295,7 @@ os_file_create_directory(
if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) {
/* failure */
- os_file_handle_error_no_exit(pathname, "mkdir", FALSE);
+ os_file_handle_error_no_exit(pathname, "mkdir", FALSE, __FILE__, __LINE__);
return(FALSE);
}
@@ -1300,7 +1405,7 @@ os_file_create_simple_func(
retry = os_file_handle_error(
name, create_mode == OS_FILE_OPEN ?
- "open" : "create");
+ "open" : "create", __FILE__, __LINE__);
} else {
*success = TRUE;
@@ -1368,7 +1473,7 @@ os_file_create_simple_func(
retry = os_file_handle_error(
name,
create_mode == OS_FILE_OPEN
- ? "open" : "create");
+ ? "open" : "create", __FILE__, __LINE__);
} else {
*success = TRUE;
retry = false;
@@ -1410,9 +1515,12 @@ os_file_create_simple_no_error_handling_func(
OS_FILE_READ_WRITE, or
OS_FILE_READ_ALLOW_DELETE; the last option is
used by a backup program reading the file */
- ibool* success)/*!< out: TRUE if succeed, FALSE if error */
+ ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ ulint atomic_writes) /*! in: atomic writes table option
+ value */
{
os_file_t file;
+ atomic_writes_t awrites = (atomic_writes_t) atomic_writes;
*success = FALSE;
#ifdef __WIN__
@@ -1473,6 +1581,15 @@ os_file_create_simple_no_error_handling_func(
attributes,
NULL); // No template file
+ if (file != INVALID_HANDLE_VALUE
+ && (awrites == ATOMIC_WRITES_ON ||
+ (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT))
+ && !os_file_set_atomic_writes(name, file)) {
+ CloseHandle(file);
+ *success = FALSE;
+ file = INVALID_HANDLE_VALUE;
+ }
+
*success = (file != INVALID_HANDLE_VALUE);
#else /* __WIN__ */
int create_flag;
@@ -1533,6 +1650,15 @@ os_file_create_simple_no_error_handling_func(
}
#endif /* USE_FILE_LOCK */
+ if (file != -1
+ && (awrites == ATOMIC_WRITES_ON ||
+ (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT))
+ && !os_file_set_atomic_writes(name, file)) {
+ *success = FALSE;
+ close(file);
+ file = -1;
+ }
+
#endif /* __WIN__ */
return(file);
@@ -1602,7 +1728,7 @@ os_file_set_atomic_writes(
if (ioctl(file, DFS_IOCTL_ATOMIC_WRITE_SET, &atomic_option)) {
- os_file_handle_error_no_exit(name, "ioctl", FALSE);
+ os_file_handle_error_no_exit(name, "ioctl(DFS_IOCTL_ATOMIC_WRITE_SET)", FALSE, __FILE__, __LINE__);
return(FALSE);
}
@@ -1636,12 +1762,15 @@ os_file_create_func(
async i/o or unbuffered i/o: look in the
function source code for the exact rules */
ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */
- ibool* success)/*!< out: TRUE if succeed, FALSE if error */
+ ibool* success,/*!< out: TRUE if succeed, FALSE if error */
+ ulint atomic_writes) /*! in: atomic writes table option
+ value */
{
os_file_t file;
ibool retry;
ibool on_error_no_exit;
ibool on_error_silent;
+ atomic_writes_t awrites = (atomic_writes_t) atomic_writes;
#ifdef __WIN__
DBUG_EXECUTE_IF(
@@ -1784,9 +1913,9 @@ os_file_create_func(
if (on_error_no_exit) {
retry = os_file_handle_error_no_exit(
- name, operation, on_error_silent);
+ name, operation, on_error_silent, __FILE__, __LINE__);
} else {
- retry = os_file_handle_error(name, operation);
+ retry = os_file_handle_error(name, operation, __FILE__, __LINE__);
}
} else {
*success = TRUE;
@@ -1795,8 +1924,10 @@ os_file_create_func(
} while (retry);
- if (srv_use_atomic_writes && type == OS_DATA_FILE &&
- !os_file_set_atomic_writes(name, file)) {
+ if (file != INVALID_HANDLE_VALUE
+ && (awrites == ATOMIC_WRITES_ON ||
+ (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT))
+ && !os_file_set_atomic_writes(name, file)) {
CloseHandle(file);
*success = FALSE;
file = INVALID_HANDLE_VALUE;
@@ -1876,9 +2007,9 @@ os_file_create_func(
if (on_error_no_exit) {
retry = os_file_handle_error_no_exit(
- name, operation, on_error_silent);
+ name, operation, on_error_silent, __FILE__, __LINE__);
} else {
- retry = os_file_handle_error(name, operation);
+ retry = os_file_handle_error(name, operation, __FILE__, __LINE__);
}
} else {
*success = TRUE;
@@ -1932,14 +2063,16 @@ os_file_create_func(
}
#endif /* USE_FILE_LOCK */
- if (srv_use_atomic_writes && type == OS_DATA_FILE
+ if (file != -1
+ && (awrites == ATOMIC_WRITES_ON ||
+ (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT))
&& !os_file_set_atomic_writes(name, file)) {
-
*success = FALSE;
close(file);
file = -1;
}
+
#endif /* __WIN__ */
return(file);
@@ -1998,7 +2131,7 @@ loop:
ret = unlink(name);
if (ret != 0 && errno != ENOENT) {
- os_file_handle_error_no_exit(name, "delete", FALSE);
+ os_file_handle_error_no_exit(name, "delete", FALSE, __FILE__, __LINE__);
return(false);
}
@@ -2062,7 +2195,7 @@ loop:
ret = unlink(name);
if (ret != 0) {
- os_file_handle_error_no_exit(name, "delete", FALSE);
+ os_file_handle_error_no_exit(name, "delete", FALSE, __FILE__, __LINE__);
return(false);
}
@@ -2106,7 +2239,7 @@ os_file_rename_func(
return(TRUE);
}
- os_file_handle_error_no_exit(oldpath, "rename", FALSE);
+ os_file_handle_error_no_exit(oldpath, "rename", FALSE, __FILE__, __LINE__);
return(FALSE);
#else
@@ -2115,7 +2248,7 @@ os_file_rename_func(
ret = rename(oldpath, newpath);
if (ret != 0) {
- os_file_handle_error_no_exit(oldpath, "rename", FALSE);
+ os_file_handle_error_no_exit(oldpath, "rename", FALSE, __FILE__, __LINE__);
return(FALSE);
}
@@ -2146,7 +2279,7 @@ os_file_close_func(
return(TRUE);
}
- os_file_handle_error(NULL, "close");
+ os_file_handle_error(NULL, "close", __FILE__, __LINE__);
return(FALSE);
#else
@@ -2155,7 +2288,7 @@ os_file_close_func(
ret = close(file);
if (ret == -1) {
- os_file_handle_error(NULL, "close");
+ os_file_handle_error(NULL, "close", __FILE__, __LINE__);
return(FALSE);
}
@@ -2247,6 +2380,12 @@ os_file_set_size(
current_size = 0;
+#ifdef UNIV_DEBUG
+ fprintf(stderr, "InnoDB: Note: File %s current_size %lu extended_size %lu\n",
+ name, os_file_get_size(file), size);
+#endif
+
+
#ifdef HAVE_POSIX_FALLOCATE
if (srv_use_posix_fallocate) {
@@ -2257,7 +2396,7 @@ os_file_set_size(
INT64PF ", desired size " INT64PF "\n",
name, current_size, size);
os_file_handle_error_no_exit (name, "posix_fallocate",
- FALSE);
+ FALSE, __FILE__, __LINE__);
return(FALSE);
}
return(TRUE);
@@ -2446,7 +2585,7 @@ os_file_flush_func(
return(TRUE);
}
- os_file_handle_error(NULL, "flush");
+ os_file_handle_error(NULL, "flush", __FILE__, __LINE__);
/* It is a fatal error if a file flush does not succeed, because then
the database can get corrupt on disk */
@@ -2500,7 +2639,7 @@ os_file_flush_func(
ib_logf(IB_LOG_LEVEL_ERROR, "The OS said file flush did not succeed");
- os_file_handle_error(NULL, "flush");
+ os_file_handle_error(NULL, "flush", __FILE__, __LINE__);
/* It is a fatal error if a file flush does not succeed, because then
the database can get corrupt on disk */
@@ -2855,6 +2994,9 @@ try_again:
os_mutex_exit(os_file_count_mutex);
if (ret && len == n) {
+ if (fil_page_is_compressed((byte *)buf)) {
+ fil_decompress_page(NULL, (byte *)buf, len);
+ }
return(TRUE);
}
#else /* __WIN__ */
@@ -2868,6 +3010,10 @@ try_again:
if ((ulint) ret == n) {
+ if (fil_page_is_compressed((byte *)buf)) {
+ fil_decompress_page(NULL, (byte *)buf, n);
+ }
+
return(TRUE);
}
@@ -2875,7 +3021,7 @@ try_again:
"Tried to read "ULINTPF" bytes at offset " UINT64PF". "
"Was only able to read %ld.", n, offset, (lint) ret);
#endif /* __WIN__ */
- retry = os_file_handle_error(NULL, "read");
+ retry = os_file_handle_error(NULL, "read", __FILE__, __LINE__);
if (retry) {
goto try_again;
@@ -2968,10 +3114,14 @@ try_again:
if ((ulint) ret == n) {
+ if (fil_page_is_compressed((byte *)buf)) {
+ fil_decompress_page(NULL, (byte *)buf, n);
+ }
+
return(TRUE);
}
#endif /* __WIN__ */
- retry = os_file_handle_error_no_exit(NULL, "read", FALSE);
+ retry = os_file_handle_error_no_exit(NULL, "read", FALSE, __FILE__, __LINE__);
if (retry) {
goto try_again;
@@ -3183,7 +3333,7 @@ os_file_status(
} else if (ret) {
/* file exists, but stat call failed */
- os_file_handle_error_no_exit(path, "stat", FALSE);
+ os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__);
return(FALSE);
}
@@ -3211,7 +3361,7 @@ os_file_status(
} else if (ret) {
/* file exists, but stat call failed */
- os_file_handle_error_no_exit(path, "stat", FALSE);
+ os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__);
return(FALSE);
}
@@ -3260,7 +3410,7 @@ os_file_get_status(
} else if (ret) {
/* file exists, but stat call failed */
- os_file_handle_error_no_exit(path, "stat", FALSE);
+ os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__);
return(DB_FAIL);
@@ -3313,7 +3463,7 @@ os_file_get_status(
} else if (ret) {
/* file exists, but stat call failed */
- os_file_handle_error_no_exit(path, "stat", FALSE);
+ os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__);
return(DB_FAIL);
@@ -3866,7 +4016,7 @@ os_aio_array_create(
array->slots = static_cast<os_aio_slot_t*>(
ut_malloc(n * sizeof(*array->slots)));
- memset(array->slots, 0x0, sizeof(n * sizeof(*array->slots)));
+ memset(array->slots, 0x0, n * sizeof(*array->slots));
#if defined(LINUX_NATIVE_AIO)
array->aio_ctx = NULL;
@@ -3941,6 +4091,8 @@ os_aio_array_free(
/*==============*/
os_aio_array_t*& array) /*!< in, own: array to free */
{
+ ulint i;
+
os_mutex_free(array->mutex);
os_event_free(array->not_full);
os_event_free(array->is_empty);
@@ -3952,6 +4104,14 @@ os_aio_array_free(
}
#endif /* LINUX_NATIVE_AIO */
+ for (i = 0; i < array->n_slots; i++) {
+ os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i);
+ if (slot->page_compression_page) {
+ ut_free(slot->page_compression_page);
+ slot->page_compression_page = NULL;
+ }
+ }
+
ut_free(array->slots);
ut_free(array);
@@ -4296,7 +4456,16 @@ os_aio_array_reserve_slot(
to write */
os_offset_t offset, /*!< in: file offset */
ulint len, /*!< in: length of the block to read or write */
- ulint space_id)
+ ulint space_id,
+ ibool page_compression, /*!< in: is page compression used
+ on this file space */
+ ulint page_compression_level, /*!< page compression
+ level to be used */
+ ulint* write_size)/*!< in/out: Actual write size initialized
+ after fist successfull trim
+ operation for this page and if
+ initialized we do not trim again if
+ actual page size does not decrease. */
{
os_aio_slot_t* slot = NULL;
#ifdef WIN_ASYNC_IO
@@ -4388,6 +4557,55 @@ found:
slot->io_already_done = FALSE;
slot->space_id = space_id;
+ slot->page_compress_success = FALSE;
+ slot->write_size = write_size;
+ slot->page_compression_level = page_compression_level;
+ slot->page_compression = page_compression;
+
+ /* If the space is page compressed, this is a write operation,
+ and either index-pages-only compression is disabled or it is
+ enabled and the page is an index page, then we compress the
+ page */
+ if (message1 &&
+ type == OS_FILE_WRITE &&
+ page_compression &&
+ (srv_page_compress_index_pages == false ||
+ (srv_page_compress_index_pages == true && fil_page_is_index_page(slot->buf)))) {
+ ulint real_len = len;
+ byte* tmp = NULL;
+
+ /* Release the array mutex while compressing */
+ os_mutex_exit(array->mutex);
+
+ // We allocate memory for page compressed buffer if and only
+ // if it is not yet allocated.
+ if (slot->page_buf == NULL) {
+ os_slot_alloc_page_buf(slot);
+ }
+
+ ut_ad(slot->page_buf);
+
+ /* Write buffer full of zeros, this is needed for trim,
+ can't really avoid this now. */
+ memset(slot->page_buf, 0, len);
+
+ tmp = fil_compress_page(fil_node_get_space_id(slot->message1), (byte *)buf, slot->page_buf, len, page_compression_level, &real_len);
+
+ /* If compression succeeded, set up the length and buffer */
+ if (tmp != buf) {
+ len = real_len;
+ buf = slot->page_buf;
+ slot->len = real_len;
+ slot->page_compress_success = TRUE;
+ } else {
+ slot->page_compress_success = FALSE;
+ }
+
+ /* Take array mutex back */
+ os_mutex_enter(array->mutex);
+
+ }
+
#ifdef WIN_ASYNC_IO
control = &slot->control;
control->Offset = (DWORD) offset & 0xFFFFFFFF;
@@ -4663,7 +4881,16 @@ os_aio_func(
aio operation); ignored if mode is
OS_AIO_SYNC */
ulint space_id,
- trx_t* trx)
+ trx_t* trx,
+ ibool page_compression, /*!< in: is page compression used
+ on this file space */
+ ulint page_compression_level, /*!< page compression
+ level to be used */
+ ulint* write_size)/*!< in/out: Actual write size initialized
+ after first successful trim
+ operation for this page and if
+ initialized we do not trim again if
+ actual page size does not decrease. */
{
os_aio_array_t* array;
os_aio_slot_t* slot;
@@ -4686,7 +4913,7 @@ os_aio_func(
wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER;
mode = mode & (~OS_AIO_SIMULATED_WAKE_LATER);
- if (mode == OS_AIO_SYNC)
+ if (mode == OS_AIO_SYNC)
{
ibool ret;
/* This is actually an ordinary synchronous read or write:
@@ -4753,7 +4980,8 @@ try_again:
trx->io_read += n;
}
slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
- name, buf, offset, n, space_id);
+ name, buf, offset, n, space_id,
+ page_compression, page_compression_level, write_size);
if (type == OS_FILE_READ) {
if (srv_use_native_aio) {
os_n_file_reads++;
@@ -4811,7 +5039,7 @@ err_exit:
os_aio_array_free_slot(array, slot);
if (os_file_handle_error(
- name,type == OS_FILE_READ ? "aio read" : "aio write")) {
+ name,type == OS_FILE_READ ? "aio read" : "aio write", __FILE__, __LINE__)) {
goto try_again;
}
@@ -4911,7 +5139,7 @@ os_aio_windows_handle(
if (ret && len == slot->len) {
ret_val = TRUE;
- } else if (os_file_handle_error(slot->name, "Windows aio")) {
+ } else if (os_file_handle_error(slot->name, "Windows aio", __FILE__, __LINE__)) {
retry = TRUE;
} else {
@@ -4939,11 +5167,17 @@ os_aio_windows_handle(
switch (slot->type) {
case OS_FILE_WRITE:
- ret_val = os_file_write(slot->name, slot->file, slot->buf,
- slot->control.Offset, slot->control.OffsetHigh, slot->len);
+ if (slot->message1 && page_compression && slot->page_buf) {
+ ret_val = os_file_write(slot->name, slot->file, slot->page_buf,
+ slot->control.Offset, slot->control.OffsetHigh, slot->len);
+ } else {
+
+ ret_val = os_file_write(slot->name, slot->file, slot->buf,
+ slot->control.Offset, slot->control.OffsetHigh, slot->len);
+ }
break;
case OS_FILE_READ:
- ret_val = os_file_read(slot->file, slot->buf,
+ ret_val = os_file_read(slot->file, slot->buf,
slot->control.Offset, slot->control.OffsetHigh, slot->len);
break;
default:
@@ -4969,6 +5203,28 @@ os_aio_windows_handle(
ret_val = ret && len == slot->len;
}
+ if (slot->message1 && page_compression) {
+ // We allocate memory for page compressed buffer if and only
+ // if it is not yet allocated.
+ if (slot->page_buf == NULL) {
+ os_slot_alloc_page_buf(slot);
+ }
+ ut_ad(slot->page_buf);
+
+ if (slot->type == OS_FILE_READ) {
+ if (fil_page_is_compressed(slot->buf)) {
+ fil_decompress_page(slot->page_buf, slot->buf, slot->len);
+ }
+ } else {
+ if (slot->page_compress_success && fil_page_is_compressed(slot->page_buf)) {
+ if (srv_use_trim && os_fallocate_failed == FALSE) {
+ // Deallocate unused blocks from file system
+ os_file_trim(slot->file, slot, slot->len);
+ }
+ }
+ }
+ }
+
os_aio_array_free_slot((os_aio_array_t *)slot->arr, slot);
return(ret_val);
@@ -5058,6 +5314,33 @@ retry:
/* We have not overstepped to next segment. */
ut_a(slot->pos < end_pos);
+ /* If the table is page compressed and this is read,
+ we decompress before we announce the read is
+ complete. For writes, we trim the unused part of the compressed page. */
+ if (slot->message1 && slot->page_compression) {
+ // We allocate memory for page compressed buffer if and only
+ // if it is not yet allocated.
+ if (slot->page_buf == NULL) {
+ os_slot_alloc_page_buf(slot);
+ }
+ ut_ad(slot->page_buf);
+
+ if (slot->type == OS_FILE_READ) {
+ if (fil_page_is_compressed(slot->buf)) {
+ fil_decompress_page(slot->page_buf, slot->buf, slot->len);
+ }
+ } else {
+ if (slot->page_compress_success &&
+ fil_page_is_compressed(slot->page_buf)) {
+ ut_ad(slot->page_compression_page);
+ if (srv_use_trim && os_fallocate_failed == FALSE) {
+ // Deallocate unused blocks from file system
+ os_file_trim(slot->file, slot, slot->len);
+ }
+ }
+ }
+ }
+
/* Mark this request as completed. The error handling
will be done in the calling function. */
os_mutex_enter(array->mutex);
@@ -5203,6 +5486,13 @@ found:
} else {
errno = -slot->ret;
+ if (slot->ret == 0) {
+ fprintf(stderr,
+ "InnoDB: Number of bytes after aio %d requested %lu\n"
+ "InnoDB: from file %s\n",
+ slot->n_bytes, slot->len, slot->name);
+ }
+
/* os_file_handle_error does tell us if we should retry
this IO. As it stands now, we don't do this retry when
reaping requests from a different context than
@@ -5210,7 +5500,7 @@ found:
windows and linux native AIO.
We should probably look into this to transparently
re-submit the IO. */
- os_file_handle_error(slot->name, "Linux aio");
+ os_file_handle_error(slot->name, "Linux aio", __FILE__, __LINE__);
ret = FALSE;
}
@@ -5884,3 +6174,162 @@ os_aio_all_slots_free(void)
#endif /* UNIV_DEBUG */
#endif /* !UNIV_HOTBACKUP */
+
+#ifdef _WIN32
+#include <winioctl.h>
+#ifndef FSCTL_FILE_LEVEL_TRIM
+#define FSCTL_FILE_LEVEL_TRIM CTL_CODE(FILE_DEVICE_FILE_SYSTEM, 130, METHOD_BUFFERED, FILE_WRITE_DATA)
+typedef struct _FILE_LEVEL_TRIM_RANGE {
+ DWORDLONG Offset;
+ DWORDLONG Length;
+} FILE_LEVEL_TRIM_RANGE, *PFILE_LEVEL_TRIM_RANGE;
+
+typedef struct _FILE_LEVEL_TRIM {
+ DWORD Key;
+ DWORD NumRanges;
+ FILE_LEVEL_TRIM_RANGE Ranges[1];
+} FILE_LEVEL_TRIM, *PFILE_LEVEL_TRIM;
+#endif
+#endif
+
+/**********************************************************************//**
+Deallocate the unused tail of a page-compressed page: the byte range that
+starts at slot->offset + len and continues to the end of the UNIV_PAGE_SIZE
+page. Within the specified range, partial file system blocks are zeroed, and
+whole file system blocks are removed from the file. After a successful call,
+subsequent reads from this range will return zeroes.
+
+slot->write_size is optional (may be NULL). When present it caches the
+length written by the last successful trim of this page: the trim is skipped
+as long as the written length does not shrink, and the cache is reset to 0
+when a trim attempt fails.
+@return TRUE if success, FALSE if the trim failed or hole punching is not
+available in this build */
+UNIV_INTERN
+ibool
+os_file_trim(
+/*=========*/
+	os_file_t	file,	/*!< in: file to be trimmed */
+	os_aio_slot_t*	slot,	/*!< in: slot structure */
+	ulint		len)	/*!< in: length of area */
+{
+
+	size_t		trim_len = UNIV_PAGE_SIZE - len;
+	os_offset_t	off = slot->offset + len;
+	/* Snapshot of the cached write size; 0 means "not initialized",
+	which is also what we use when the caller passed no cache at all. */
+	ulint		prev_write_size = slot->write_size
+		? *slot->write_size : 0;
+
+	// Nothing to do if trim length is zero or if actual write
+	// size is initialized and it is smaller than current write size.
+	// In first write if we trim we set write_size to actual bytes
+	// written and rest of the page is trimmed. In following writes
+	// there is no need to trim again if write_size only increases
+	// because rest of the page is already trimmed. If actual write
+	// size decreases we need to trim again.
+	if (trim_len == 0 ||
+	    (prev_write_size > 0 && len >= prev_write_size)) {
+
+#ifdef UNIV_DEBUG
+		fprintf(stderr, "Note: TRIM: write_size %lu trim_len %lu len %lu\n",
+			(unsigned long) prev_write_size,
+			(unsigned long) trim_len,
+			(unsigned long) len);
+#endif
+
+		if (prev_write_size > 0 && len >= prev_write_size) {
+			srv_stats.page_compressed_trim_op_saved.inc();
+		}
+
+		/* The cache pointer is optional; never write through NULL. */
+		if (slot->write_size) {
+			*slot->write_size = len;
+		}
+
+		return (TRUE);
+	}
+
+#ifdef __linux__
+#if defined(FALLOC_FL_PUNCH_HOLE) && defined (FALLOC_FL_KEEP_SIZE)
+	int ret = fallocate(file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, trim_len);
+
+	if (ret) {
+		/* fallocate() reports the reason in errno, its return value
+		is just -1. Save it before any other call can overwrite it. */
+		int	fallocate_errno = errno;
+
+		/* After first failure do not try to trim again */
+		os_fallocate_failed = TRUE;
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: [Warning] fallocate call failed with error code %d.\n"
+			" InnoDB: start: %llx len: %lu payload: %lu\n"
+			" InnoDB: Disabling fallocate for now.\n",
+			fallocate_errno,
+			(unsigned long long) off,
+			(unsigned long) trim_len,
+			(unsigned long) len);
+
+		/* Give the generic error handler the original error code. */
+		errno = fallocate_errno;
+		os_file_handle_error_no_exit(slot->name,
+			" fallocate(FALLOC_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE) ",
+			FALSE, __FILE__, __LINE__);
+
+		if (slot->write_size) {
+			*slot->write_size = 0;
+		}
+
+		return (FALSE);
+	} else {
+		if (slot->write_size) {
+			*slot->write_size = len;
+		}
+	}
+#else
+	ut_print_timestamp(stderr);
+	fprintf(stderr,
+		" InnoDB: [Warning] fallocate not supported on this installation."
+		" InnoDB: Disabling fallocate for now.");
+	os_fallocate_failed = TRUE;
+
+	/* Nothing was trimmed: forget the cached size and report failure
+	instead of falling through to the trim statistics below. */
+	if (slot->write_size) {
+		*slot->write_size = 0;
+	}
+
+	return (FALSE);
+
+#endif /* HAVE_FALLOCATE ... */
+
+#elif defined(_WIN32)
+	FILE_LEVEL_TRIM	flt;
+	DWORD		bytes_returned = 0;
+
+	flt.Key = 0;
+	flt.NumRanges = 1;
+	flt.Ranges[0].Offset = off;
+	flt.Ranges[0].Length = trim_len;
+
+	/* DeviceIoControl() needs a valid lpBytesReturned whenever no
+	OVERLAPPED structure is supplied, even if there is no output. */
+	BOOL ret = DeviceIoControl(file, FSCTL_FILE_LEVEL_TRIM, &flt,
+				   sizeof(flt), NULL, 0, &bytes_returned, NULL);
+
+	if (!ret) {
+		/* After first failure do not try to trim again */
+		os_fallocate_failed = TRUE;
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: [Warning] fallocate call failed with error.\n"
+			" InnoDB: start: %llx len: %lu payload: %lu\n"
+			" InnoDB: Disabling fallocate for now.\n",
+			(unsigned long long) off,
+			(unsigned long) trim_len,
+			(unsigned long) len);
+
+		os_file_handle_error_no_exit(slot->name,
+			" DeviceIOControl(FSCTL_FILE_LEVEL_TRIM) ",
+			FALSE, __FILE__, __LINE__);
+
+		/* Reset the cached value, not the pointer itself. */
+		if (slot->write_size) {
+			*slot->write_size = 0;
+		}
+		return (FALSE);
+	} else {
+		if (slot->write_size) {
+			*slot->write_size = len;
+		}
+	}
+#endif
+
+#define SECT_SIZE 512
+	srv_stats.page_compression_trim_sect512.add((trim_len / SECT_SIZE));
+	srv_stats.page_compression_trim_sect4096.add((trim_len / (SECT_SIZE*8)));
+	srv_stats.page_compressed_trim_op.inc();
+
+	return (TRUE);
+
+}
+
+/**********************************************************************//**
+Allocate the scratch buffer a slot uses for page compression. The raw
+allocation is remembered in slot->page_compression_page and the
+UNIV_PAGE_SIZE-aligned address inside it is stored in slot->page_buf.
+The buffer is expected to be freed later by the owner of the slot. */
+UNIV_INTERN
+void
+os_slot_alloc_page_buf(
+/*===================*/
+	os_aio_slot_t*	slot)	/*!< in: slot structure */
+{
+	/* Two pages are requested so that a full page still fits after
+	the start address has been aligned to UNIV_PAGE_SIZE. */
+	byte*	raw = static_cast<byte *>(ut_malloc(UNIV_PAGE_SIZE*2));
+	byte*	aligned = static_cast<byte *>(ut_align(raw, UNIV_PAGE_SIZE));
+
+	slot->page_compression_page = raw;
+	slot->page_buf = aligned;
+}
diff --git a/storage/xtradb/srv/srv0mon.cc b/storage/xtradb/srv/srv0mon.cc
index d98315ae9a2..0b5556ab61a 100644
--- a/storage/xtradb/srv/srv0mon.cc
+++ b/storage/xtradb/srv/srv0mon.cc
@@ -290,6 +290,12 @@ static monitor_info_t innodb_counter_info[] =
MONITOR_EXISTING | MONITOR_DEFAULT_ON),
MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_WRITTEN},
+ {"buffer_index_pages_written", "buffer",
+ "Number of index pages written (innodb_index_pages_written)",
+ static_cast<monitor_type_t>(
+ MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+ MONITOR_DEFAULT_START, MONITOR_OVLD_INDEX_PAGES_WRITTEN},
+
{"buffer_pages_read", "buffer",
"Number of pages read (innodb_pages_read)",
static_cast<monitor_type_t>(
@@ -879,6 +885,41 @@ static monitor_info_t innodb_counter_info[] =
MONITOR_NONE,
MONITOR_DEFAULT_START, MONITOR_PAD_DECREMENTS},
+ {"compress_saved", "compression",
+ "Number of bytes saved by page compression",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_SAVED},
+
+ {"compress_trim_sect512", "compression",
+ "Number of sect-512 TRIMed by page compression",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT512},
+
+ {"compress_trim_sect4096", "compression",
+ "Number of sect-4K TRIMed by page compression",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT4096},
+
+ {"compress_pages_page_compressed", "compression",
+ "Number of pages compressed by page compression",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_PAGE_COMPRESSED},
+
+ {"compress_page_compressed_trim_op", "compression",
+ "Number of TRIM operation performed by page compression",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP},
+
+ {"compress_page_compressed_trim_op_saved", "compression",
+ "Number of TRIM operation saved by page compression",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP_SAVED},
+
+ {"compress_pages_page_decompressed", "compression",
+ "Number of pages decompressed by page compression",
+ MONITOR_NONE,
+ MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED},
+
/* ========== Counters for Index ========== */
{"module_index", "index", "Index Manager",
MONITOR_MODULE,
@@ -1532,6 +1573,11 @@ srv_mon_process_existing_counter(
value = stat.n_pages_written;
break;
+ /* innodb_index_pages_written, the number of index pages written */
+ case MONITOR_OVLD_INDEX_PAGES_WRITTEN:
+ value = srv_stats.index_pages_written;
+ break;
+
/* innodb_pages_read */
case MONITOR_OVLD_PAGES_READ:
buf_get_total_stat(&stat);
@@ -1773,6 +1819,28 @@ srv_mon_process_existing_counter(
value = btr_cur_n_non_sea;
break;
+ case MONITOR_OVLD_PAGE_COMPRESS_SAVED:
+ value = srv_stats.page_compression_saved;
+ break;
+ case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT512:
+ value = srv_stats.page_compression_trim_sect512;
+ break;
+ case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT4096:
+ value = srv_stats.page_compression_trim_sect4096;
+ break;
+ case MONITOR_OVLD_PAGES_PAGE_COMPRESSED:
+ value = srv_stats.pages_page_compressed;
+ break;
+ case MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP:
+ value = srv_stats.page_compressed_trim_op;
+ break;
+ case MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP_SAVED:
+ value = srv_stats.page_compressed_trim_op_saved;
+ break;
+ case MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED:
+ value = srv_stats.pages_page_decompressed;
+ break;
+
default:
ut_error;
}
diff --git a/storage/xtradb/srv/srv0srv.cc b/storage/xtradb/srv/srv0srv.cc
index 953bbba11f7..92acf847ca1 100644
--- a/storage/xtradb/srv/srv0srv.cc
+++ b/storage/xtradb/srv/srv0srv.cc
@@ -3,6 +3,7 @@
Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, 2009 Google Inc.
Copyright (c) 2009, Percona Inc.
+Copyright (c) 2013, 2014, SkySQL Ab.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -160,6 +161,26 @@ use simulated aio we build below with threads.
Currently we support native aio on windows and linux */
UNIV_INTERN my_bool srv_use_native_aio = TRUE;
+/* If this flag is TRUE, then we will use page compression
+to the pages */
+UNIV_INTERN my_bool srv_compress_pages = FALSE;
+/* If this flag is TRUE, then we will use page compression
+only for index pages */
+UNIV_INTERN my_bool srv_page_compress_index_pages = FALSE;
+UNIV_INTERN long srv_trim_pct = 100;
+/* Default compression level if page compression is used and no compression
+level is set for the table*/
+UNIV_INTERN long srv_compress_zlib_level = 6;
+/* If this flag is TRUE, then we will use fallocate(PUNCH_HOLE)
+to trim the unused part of page compressed pages */
+UNIV_INTERN my_bool srv_use_trim = TRUE;
+/* If this flag is TRUE, then we will use posix fallocate for file extension */
+UNIV_INTERN my_bool srv_use_posix_fallocate = FALSE;
+/* If this flag is TRUE, then we disable doublewrite buffer */
+UNIV_INTERN my_bool srv_use_atomic_writes = FALSE;
+/* If this flag IS TRUE, then we use lz4 to compress/decompress pages */
+UNIV_INTERN my_bool srv_use_lz4 = FALSE;
+
#ifdef __WIN__
/* Windows native condition variables. We use runtime loading / function
pointers, because they are not available on Windows Server 2003 and
@@ -454,10 +475,6 @@ UNIV_INTERN unsigned long long srv_stats_persistent_sample_pages = 20;
UNIV_INTERN my_bool srv_stats_auto_recalc = TRUE;
UNIV_INTERN ibool srv_use_doublewrite_buf = TRUE;
-UNIV_INTERN ibool srv_use_atomic_writes = FALSE;
-#ifdef HAVE_POSIX_FALLOCATE
-UNIV_INTERN ibool srv_use_posix_fallocate = FALSE;
-#endif
/** doublewrite buffer is 1MB is size i.e.: it can hold 128 16K pages.
The following parameter is the size of the buffer that is used for
@@ -493,6 +510,15 @@ static ulint srv_n_rows_read_old = 0;
UNIV_INTERN ulint srv_truncated_status_writes = 0;
UNIV_INTERN ulint srv_available_undo_logs = 0;
+UNIV_INTERN ib_uint64_t srv_page_compression_saved = 0;
+UNIV_INTERN ib_uint64_t srv_page_compression_trim_sect512 = 0;
+UNIV_INTERN ib_uint64_t srv_page_compression_trim_sect4096 = 0;
+UNIV_INTERN ib_uint64_t srv_index_pages_written = 0;
+UNIV_INTERN ib_uint64_t srv_pages_page_compressed = 0;
+UNIV_INTERN ib_uint64_t srv_page_compressed_trim_op = 0;
+UNIV_INTERN ib_uint64_t srv_page_compressed_trim_op_saved = 0;
+UNIV_INTERN ib_uint64_t srv_index_page_decompressed = 0;
+
/* Ensure status variables are on separate cache lines */
#define CACHE_LINE_SIZE 64
@@ -1835,6 +1861,15 @@ srv_export_innodb_status(void)
export_vars.innodb_descriptors_memory
= os_atomic_increment_ulint(&srv_descriptors_memory, 0);
+ export_vars.innodb_page_compression_saved = srv_stats.page_compression_saved;
+ export_vars.innodb_page_compression_trim_sect512 = srv_stats.page_compression_trim_sect512;
+ export_vars.innodb_page_compression_trim_sect4096 = srv_stats.page_compression_trim_sect4096;
+ export_vars.innodb_index_pages_written = srv_stats.index_pages_written;
+ export_vars.innodb_pages_page_compressed = srv_stats.pages_page_compressed;
+ export_vars.innodb_page_compressed_trim_op = srv_stats.page_compressed_trim_op;
+ export_vars.innodb_page_compressed_trim_op_saved = srv_stats.page_compressed_trim_op_saved;
+ export_vars.innodb_pages_page_decompressed = srv_stats.pages_page_decompressed;
+
#ifdef UNIV_DEBUG
rw_lock_s_lock(&purge_sys->latch);
trx_id_t done_trx_no = purge_sys->done.trx_no;
diff --git a/storage/xtradb/srv/srv0start.cc b/storage/xtradb/srv/srv0start.cc
index 3ddfd9ab3a4..faad8c3c133 100644
--- a/storage/xtradb/srv/srv0start.cc
+++ b/storage/xtradb/srv/srv0start.cc
@@ -3,6 +3,7 @@
Copyright (c) 1996, 2013, Oracle and/or its affiliates. All rights reserved.
Copyright (c) 2008, Google Inc.
Copyright (c) 2009, Percona Inc.
+Copyright (c) 2013, SkySQL Ab. All Rights Reserved.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -64,6 +65,8 @@ Created 2/16/1996 Heikki Tuuri
#include "ibuf0ibuf.h"
#include "srv0start.h"
#include "srv0srv.h"
+#include "buf0flu.h"
+
#ifndef UNIV_HOTBACKUP
# include "trx0rseg.h"
# include "os0proc.h"
@@ -128,8 +131,14 @@ static os_file_t files[1000];
/** io_handler_thread parameters for thread identification */
static ulint n[SRV_MAX_N_IO_THREADS + 6];
/** io_handler_thread identifiers, 32 is the maximum number of purge threads */
-static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6
- + SRV_MAX_N_PURGE_THREADS];
+/*
+ static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6
+/ + SRV_MAX_N_PURGE_THREADS];
+*/
+/** pgcomp_thread are 16 total */
+#define START_PGCOMP_CNT (SRV_MAX_N_IO_THREADS + 6 + SRV_MAX_N_PURGE_THREADS)
+#define PGCOMP_MAX_WORKER 16
+static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6 + SRV_MAX_N_PURGE_THREADS + PGCOMP_MAX_WORKER];
/** We use this mutex to test the return value of pthread_mutex_trylock
on successful locking. HP-UX does NOT return 0, though Linux et al do. */
@@ -537,7 +546,7 @@ create_log_file(
*file = os_file_create(
innodb_file_log_key, name,
OS_FILE_CREATE|OS_FILE_ON_ERROR_NO_EXIT, OS_FILE_NORMAL,
- OS_LOG_FILE, &ret);
+ OS_LOG_FILE, &ret, FALSE);
if (!ret) {
ib_logf(IB_LOG_LEVEL_ERROR, "Cannot create %s", name);
@@ -754,7 +763,7 @@ open_log_file(
*file = os_file_create(innodb_file_log_key, name,
OS_FILE_OPEN, OS_FILE_AIO,
- OS_LOG_FILE, &ret);
+ OS_LOG_FILE, &ret, FALSE);
if (!ret) {
ib_logf(IB_LOG_LEVEL_ERROR, "Unable to open '%s'", name);
return(DB_ERROR);
@@ -845,7 +854,7 @@ open_or_create_data_files(
files[i] = os_file_create(
innodb_file_data_key, name, OS_FILE_CREATE,
- OS_FILE_NORMAL, OS_DATA_FILE, &ret);
+ OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE);
if (srv_read_only_mode) {
@@ -888,7 +897,7 @@ open_or_create_data_files(
files[i] = os_file_create(
innodb_file_data_key, name, OS_FILE_OPEN_RAW,
- OS_FILE_NORMAL, OS_DATA_FILE, &ret);
+ OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE);
if (!ret) {
ib_logf(IB_LOG_LEVEL_ERROR,
@@ -921,17 +930,17 @@ open_or_create_data_files(
files[i] = os_file_create(
innodb_file_data_key,
name, OS_FILE_OPEN_RAW,
- OS_FILE_NORMAL, OS_DATA_FILE, &ret);
+ OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE);
} else if (i == 0) {
files[i] = os_file_create(
innodb_file_data_key,
name, OS_FILE_OPEN_RETRY,
- OS_FILE_NORMAL, OS_DATA_FILE, &ret);
+ OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE);
} else {
files[i] = os_file_create(
innodb_file_data_key,
name, OS_FILE_OPEN, OS_FILE_NORMAL,
- OS_DATA_FILE, &ret);
+ OS_DATA_FILE, &ret, FALSE);
}
if (!ret) {
@@ -1122,7 +1131,7 @@ srv_undo_tablespace_create(
innodb_file_data_key,
name,
srv_read_only_mode ? OS_FILE_OPEN : OS_FILE_CREATE,
- OS_FILE_NORMAL, OS_DATA_FILE, &ret);
+ OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE);
if (srv_read_only_mode && ret) {
ib_logf(IB_LOG_LEVEL_INFO,
@@ -1209,7 +1218,8 @@ srv_undo_tablespace_open(
| OS_FILE_ON_ERROR_SILENT,
OS_FILE_NORMAL,
OS_DATA_FILE,
- &ret);
+ &ret,
+ FALSE);
/* If the file open was successful then load the tablespace. */
@@ -1503,6 +1513,694 @@ init_log_online(void)
}
}
+/* JAN: TODO: */
+/**********************************************************************************/
+extern int timediff(struct timeval *g_time, struct timeval *s_time, struct timeval *d_time);
+extern ibool buf_flush_start(buf_pool_t* buf_pool, buf_flush_t flush_type);
+extern void buf_flush_end(buf_pool_t* buf_pool, buf_flush_t flush_type);
+extern void buf_flush_common(buf_flush_t flush_type, ulint page_count);
+extern ulint buf_flush_batch(buf_pool_t* buf_pool, buf_flush_t flush_type, ulint min_n, lsn_t lsn_limit, bool limited_lru_scan, flush_counters_t*);
+
+typedef enum wrk_status { /* processing state of one work item, kept in wrk_t::wi_status */
+ WRK_ITEM_SET=0,
+ WRK_ITEM_START=1, /* stored by service_page_comp_io() right before the flush is run */
+ WRK_ITEM_DONE=2,
+ WRK_ITEM_SUCCESS=2, /* deliberately shares its value with WRK_ITEM_DONE */
+ WRK_ITEM_FAILED=3, /* stored by service_page_comp_io() when flush_pool_instance() returns non-zero */
+ WRK_ITEM_STATUS_UNDEFINED /* initial value assigned by setup_wrk_itm() */
+} wrk_status_t;
+
+typedef enum wthr_status { /* state of one worker thread, kept in thread_sync_t::wt_status */
+ WTHR_NOT_INIT=0,
+ WTHR_INITIALIZED=1,
+ WTHR_SIG_WAITING=2, /* set just before the worker blocks on the work queue condition variable */
+ WTHR_RUNNING=3, /* set as soon as the worker returns from that wait */
+ WTHR_NO_WORK=4, /* set when q_remove_wrk() found the work queue empty */
+ WTHR_KILL_IT=5,
+ WTHR_STATUS_UNDEFINED
+} wthr_status_t;
+
+typedef struct wrk_itm /* one flush request for one buffer pool instance; doubles as a singly linked queue node */
+{
+ /****************************/
+ /* Need to group into struct*/
+ buf_pool_t* buf_pool; // buffer pool instance to flush
+ int flush_type; // flush type; cast to buf_flush_t by flush_pool_instance()
+ int min; // minimum number of pages requested to be flushed; overwritten for BUF_FLUSH_LRU
+ unsigned long long lsn_limit; // lsn limit handed to buf_flush_batch()
+ /****************************/
+
+ unsigned long result; // number of pages flushed (flush_counters_t::flushed)
+ unsigned long t_usec; // time taken in usec; only measured when UNIV_DEBUG is defined, else 0
+ long id_usr; // thread currently working on the item; -1 means the item is free
+ wrk_status_t wi_status; // processing state, see wrk_status_t
+ struct wrk_itm *next; /* next item in the queue, NULL for the last one */
+} wrk_t;
+
+typedef enum op_q_status { /* state of a work/completion queue, kept in opq_t::flag */
+ Q_NOT_INIT=0,
+ Q_EMPTY=1,
+ Q_INITIALIZED=2, /* value assigned by init_queue() */
+ Q_PROCESS=3, /* workers only pick up items while the queue is in this state */
+ Q_DONE=4, /* set by cv_done_inc_flag_sig() once all expected items are finished */
+ Q_ERROR=5,
+ Q_STATUS_UNDEFINED
+} q_status_t;
+
+typedef struct op_queue /* singly linked FIFO of wrk_t items with its own lock and condition variable */
+{
+ pthread_mutex_t mtx; /* held by q_insert_wrk_list()/q_remove_wrk() while head/tail are changed */
+ pthread_cond_t cv; /* used to wake threads waiting for a change of flag */
+ q_status_t flag; /* current queue state, see q_status_t */
+ wrk_t *head; /* first item, NULL when the queue is empty */
+ wrk_t *tail; /* last item, NULL when the queue is empty */
+} opq_t;
+
+opq_t wq, cq; /* work queue and completion queue used by pgcomp_handler() */
+
+typedef struct thread_sync /* per-worker context handed to page_comp_io_thread() as its argument */
+{
+ int wthread_id;
+ pthread_t wthread; /* copied into wrk_t::id_usr while this worker owns an item */
+ opq_t *wq; /* queue the worker takes items from */
+ opq_t *cq; /* queue the worker puts finished items on */
+ wthr_status_t wt_status; /* current worker state, see wthr_status_t */
+ unsigned long stat_universal_num_processed; /* items finished by this worker; only ever incremented */
+ unsigned long stat_cycle_num_processed; /* items finished since page_comp_io_thread() last reset it to 0 */
+} thread_sync_t;
+
+/* Global XXX:DD needs to be cleaned */
+int exit_flag; /* NOTE(review): not referenced in the code visible here - confirm it is still needed */
+ulint check_wrk_done_count; /* number of finished items at which a flush cycle counts as complete */
+static ulint done_cnt_flag; /* items finished so far in the current cycle; incremented under cq.mtx */
+static int pgc_n_threads = 8; /* presumably the number of worker threads to start - verify against its users */
+
+thread_sync_t pc_sync[PGCOMP_MAX_WORKER]; /* one context slot per possible worker thread */
+static wrk_t work_items[PGCOMP_MAX_WORKER]; /* pool of work items (re)initialized by setup_wrk_itm() */
+static int pgcomp_wrk_initialized = -1; /* -1 until set_pgcomp_wrk_init_done() stores 1 */
+
+/* Store the number of finished work items at which a flush cycle is
+considered complete.
+@return the stored value, converted to int */
+int set_check_done_flag_count(int cnt)
+{
+	check_wrk_done_count = cnt;
+
+	return(check_wrk_done_count);
+}
+
+/* Flag the page compression worker setup as finished.
+@return always 0 */
+int set_pgcomp_wrk_init_done(void)
+{
+	const int	initialized = 1;
+
+	pgcomp_wrk_initialized = initialized;
+
+	return(0);
+}
+
+/* Tell whether set_pgcomp_wrk_init_done() has been called.
+@return 1 if the worker setup is flagged as finished, 0 otherwise */
+int is_pgcomp_wrk_init_done(void)
+{
+	if (pgcomp_wrk_initialized == 1) {
+		return(1);
+	}
+
+	return(0);
+}
+
+/* Overwrite the counter of finished work items. The calling thread must
+hold cq.mtx, otherwise the update is not safe.
+@return the value just stored */
+ulint set_done_cnt_flag(ulint val)
+{
+	return(done_cnt_flag = val);
+}
+
+
+/* Account for one finished work item. When it was the last item expected
+in this cycle, mark both queues Q_DONE and wake the thread waiting on the
+completion queue.
+
+The counter is sampled while cq->mtx is held and only that sample is used
+afterwards, so exactly one worker sees "last item" and the return value
+cannot be disturbed by a concurrent reset of the counter.
+@return value of the finished-item counter right after this increment */
+ulint cv_done_inc_flag_sig(thread_sync_t * ppc)
+{
+	ulint	done;
+	ulint	expected;
+
+	pthread_mutex_lock(&ppc->cq->mtx);
+	ppc->stat_universal_num_processed++;
+	ppc->stat_cycle_num_processed++;
+	done = ++done_cnt_flag;
+	expected = check_wrk_done_count;
+	if(!(done <= expected)) {
+		fprintf(stderr, "ERROR: done_cnt:%lu check_wrk_done_count:%lu\n",
+			done, expected);
+	}
+	assert(done <= expected);
+	pthread_mutex_unlock(&ppc->cq->mtx);
+
+	if(done == expected) {
+		/* The work queue must be marked before the waiter is woken:
+		once signalled it may immediately start the next cycle and
+		set the flag again. The two mutexes are never held together. */
+		pthread_mutex_lock(&ppc->wq->mtx);
+		ppc->wq->flag = Q_DONE;
+		pthread_mutex_unlock(&ppc->wq->mtx);
+
+		pthread_mutex_lock(&ppc->cq->mtx);
+		ppc->cq->flag = Q_DONE;
+		pthread_cond_signal(&ppc->cq->cv);
+		pthread_mutex_unlock(&ppc->cq->mtx);
+	}
+	return(done);
+}
+
+/* Detach the first work item from a queue.
+@param q	queue to take the item from
+@param wi	out: the detached item, or NULL if the queue was empty
+@return 0 if an item was removed, 1 if the queue was empty, -1 if q or wi
+is NULL */
+int q_remove_wrk(opq_t *q, wrk_t **wi)
+{
+	int	rc;
+	wrk_t*	first;
+
+	if(q == NULL || wi == NULL) {
+		return -1;
+	}
+
+	pthread_mutex_lock(&q->mtx);
+
+	/* head and tail are either both set or both NULL */
+	assert((q->head == NULL) == (q->tail == NULL));
+
+	first = q->head;
+	*wi = first;
+
+	if(first == NULL) {
+		/* nothing queued: removal failed */
+		q->tail = NULL;
+		rc = 1;
+	} else {
+		q->head = first->next;
+		first->next = NULL;
+		if(q->head == NULL) {
+			/* that was the only item */
+			q->tail = NULL;
+		}
+		rc = 0;
+	}
+
+	pthread_mutex_unlock(&q->mtx);
+	return (rc);
+}
+
+/* Tell whether a work item is currently owned by a thread.
+@return 1 if id_usr holds a thread id, 0 if it is -1 (free), -1 if wi is
+NULL */
+int is_busy_wrk_itm(wrk_t *wi)
+{
+	if(wi == NULL) {
+		return -1;
+	}
+
+	return(wi->id_usr != -1);
+}
+
+/* Reset the first `items` entries of work_items[] and chain them into a
+NULL-terminated list in array order.
+@param items	number of entries to prepare, 1..PGCOMP_MAX_WORKER
+@return 0 on success, -1 if items is outside the valid range (the array
+is left untouched in that case) */
+int setup_wrk_itm(int items)
+{
+	int i;
+
+	/* work_items[] has PGCOMP_MAX_WORKER entries; anything else would
+	index outside of it (including work_items[-1] for items == 0). */
+	if(items <= 0 || items > PGCOMP_MAX_WORKER) {
+		return -1;
+	}
+
+	for(i=0; i<items; i++) {
+		work_items[i].buf_pool = NULL;
+		work_items[i].result = 0;
+		work_items[i].t_usec = 0;
+		work_items[i].id_usr = -1;
+		work_items[i].wi_status = WRK_ITEM_STATUS_UNDEFINED;
+		/* last node is the tail of the list */
+		work_items[i].next = (i + 1 < items) ? &work_items[i+1] : NULL;
+	}
+	return 0;
+}
+
+/* Prepare an empty queue: create its mutex and condition variable with
+default attributes and put it into state Q_INITIALIZED.
+@return 0 on success, -1 if q is NULL */
+int init_queue(opq_t *q)
+{
+	if(q == NULL) {
+		return -1;
+	}
+
+	pthread_mutex_init(&q->mtx, NULL);
+	pthread_cond_init(&q->cv, NULL);
+
+	q->flag = Q_INITIALIZED;
+	q->tail = NULL;
+	q->head = NULL;
+
+	return 0;
+}
+
+#if 0 /* compiled out: would empty the completion queue and mark the first `items` work items as free */
+int drain_cq(opq_t *cq, int items)
+{
+ int i=0;
+
+ if(!cq) {
+ return -1;
+ }
+ pthread_mutex_lock(&cq->mtx);
+ for(i=0; i<items; i++) {
+ work_items[i].result=0;
+ work_items[i].t_usec = 0;
+ work_items[i].id_usr = -1; /* -1 marks the item as free, see is_busy_wrk_itm() */
+ }
+ cq->head = cq->tail = NULL;
+ pthread_mutex_unlock(&cq->mtx);
+ return 0;
+}
+#endif
+
+/* Append a NULL-terminated list of work items to the end of a queue.
+@param q	queue to append to
+@param w_list	first item of the list to append
+@return 0 on success, -1 if q or w_list is NULL */
+int q_insert_wrk_list(opq_t *q, wrk_t *w_list)
+{
+	wrk_t*	last;
+
+	if(q == NULL || w_list == NULL) {
+		fprintf(stderr, "insert failed q:%p w:%p\n", q, w_list);
+		return -1;
+	}
+
+	pthread_mutex_lock(&q->mtx);
+
+	/* head and tail are either both set or both NULL */
+	assert((q->head == NULL) == (q->tail == NULL));
+
+	if(q->tail == NULL) {
+		/* empty queue: the list becomes the queue */
+		q->head = w_list;
+		q->tail = w_list;
+	} else {
+		/* hook the list behind the current last item */
+		assert(q->head != NULL);
+		q->tail->next = w_list;
+	}
+
+	/* walk to the end of the appended list to find the new tail */
+	last = q->tail;
+	while(last->next != NULL) {
+		last = last->next;
+	}
+	q->tail = last;
+
+	pthread_mutex_unlock(&q->mtx);
+
+	return 0;
+}
+
+/* Run one flush batch for the buffer pool instance described by a work
+item and store the number of flushed pages in wi->result. When UNIV_DEBUG
+is defined the elapsed time is stored in wi->t_usec, otherwise it stays 0.
+@return 0 on success, -1 if wi is NULL or buf_flush_start() refused to
+start a flush of this type */
+int flush_pool_instance(wrk_t *wi)
+{
+	struct timeval p_start_time, p_end_time, d_time;
+	flush_counters_t n;
+
+	if(wi == NULL) {
+		fprintf(stderr, "work item invalid wi:%p\n", wi);
+		return -1;
+	}
+
+	const buf_flush_t	type = (buf_flush_t)wi->flush_type;
+
+	wi->t_usec = 0;
+
+	if (!buf_flush_start(wi->buf_pool, type)) {
+		/* The flush cannot be started for this instance. If an
+		lsn_limit was requested we can no longer promise that
+		everything up to it gets flushed, so report the failure
+		for this item; the other instances are still attempted
+		by their own work items, which helps the retry that
+		follows the failure. */
+		fprintf(stderr, "flush_start Failed, flush_type:%d\n",
+			type);
+		return -1;
+	}
+
+#ifdef UNIV_DEBUG
+	/* Record time taken for the OP in usec */
+	gettimeofday(&p_start_time, 0x0);
+#endif
+
+	if(type == BUF_FLUSH_LRU) {
+		/* srv_LRU_scan_depth may be arbitrarily large, so never
+		ask for more pages than the LRU list currently holds. */
+		buf_pool_mutex_enter(wi->buf_pool);
+		wi->min = UT_LIST_GET_LEN(wi->buf_pool->LRU);
+		buf_pool_mutex_exit(wi->buf_pool);
+		wi->min = ut_min(srv_LRU_scan_depth,wi->min);
+	}
+
+	buf_flush_batch(wi->buf_pool, type,
+			wi->min, wi->lsn_limit, false, &n);
+
+	wi->result = n.flushed;
+
+	buf_flush_end(wi->buf_pool, type);
+	buf_flush_common(type, wi->result);
+
+#ifdef UNIV_DEBUG
+	gettimeofday(&p_end_time, 0x0);
+	timediff(&p_end_time, &p_start_time, &d_time);
+
+	wi->t_usec = (unsigned long)(d_time.tv_usec+(d_time.tv_sec*1000000));
+#endif
+
+	return 0;
+}
+
+/* One service round of a page compression worker thread: block until the
+work queue is signalled, then keep taking work items from the work queue,
+flushing them and moving them to the completion queue while the completion
+queue is in state Q_PROCESS.
+
+Each item's final status (WRK_ITEM_SUCCESS or WRK_ITEM_FAILED) is stored
+before the item is put on the completion queue, so whoever picks it up
+there sees the real outcome of the flush.
+@return 0 when the round ended normally, -1 if the thread was woken while
+the work queue was not in state Q_PROCESS or the work queue ran empty */
+int service_page_comp_io(thread_sync_t * ppc)
+{
+	wrk_t *wi = NULL;
+	int ret=0;
+
+	/* Wait for the submitter to announce work. A wake-up that does not
+	find the queue in Q_PROCESS ends the round; the caller simply calls
+	this function again. */
+	pthread_mutex_lock(&ppc->wq->mtx);
+	ppc->wt_status = WTHR_SIG_WAITING;
+	ret = pthread_cond_wait(&ppc->wq->cv, &ppc->wq->mtx);
+	ppc->wt_status = WTHR_RUNNING;
+	if(ret == ETIMEDOUT) {
+		fprintf(stderr, "ERROR ETIMEDOUT cnt_flag:[%lu] ret:%d\n",
+			done_cnt_flag, ret);
+	} else if(ret == EINVAL || ret == EPERM) {
+		fprintf(stderr, "ERROR EINVAL/EPERM cnt_flag:[%lu] ret:%d\n",
+			done_cnt_flag, ret);
+	}
+	if(ppc->wq->flag != Q_PROCESS) {
+		pthread_mutex_unlock(&ppc->wq->mtx);
+		return -1;
+	}
+	pthread_mutex_unlock(&ppc->wq->mtx);
+
+	while (ppc->cq->flag == Q_PROCESS) {
+		wi = NULL;
+		/* Get the work item */
+		if (0 != (ret = q_remove_wrk(ppc->wq, &wi))) {
+			ppc->wt_status = WTHR_NO_WORK;
+			return -1;
+		}
+
+		assert(ret==0);
+		assert(wi != NULL);
+		assert(0 == is_busy_wrk_itm(wi));
+		assert(wi->id_usr == -1);
+
+		wi->id_usr = ppc->wthread;
+		wi->wi_status = WRK_ITEM_START;
+
+		/* Process work item and record its outcome before it is
+		handed over to the completion queue. */
+		if(0 != (ret = flush_pool_instance(wi))) {
+			fprintf(stderr, "FLUSH op failed ret:%d\n", ret);
+			wi->wi_status = WRK_ITEM_FAILED;
+		} else {
+			wi->wi_status = WRK_ITEM_SUCCESS;
+		}
+
+		ret = q_insert_wrk_list(ppc->cq, wi);
+
+		assert(0==ret);
+		assert(check_wrk_done_count >= done_cnt_flag);
+		/* Stop as soon as this item completed the cycle. */
+		if(check_wrk_done_count == cv_done_inc_flag_sig(ppc)) {
+			break;
+		}
+	}
+	return(0);
+}
+
+/******************************************************************//**
+Entry point of a page compression worker thread. Runs service rounds
+until srv_shutdown_state reaches SRV_SHUTDOWN_EXIT_THREADS and clears the
+per-cycle counter of the worker after every round.
+@return a dummy parameter*/
+extern "C" UNIV_INTERN
+os_thread_ret_t
+DECLARE_THREAD(page_comp_io_thread)(
+/*==========================================*/
+	void * arg)
+{
+	thread_sync_t*	worker = static_cast<thread_sync_t*>(arg);
+
+	for (;;) {
+		if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
+			break;
+		}
+
+		service_page_comp_io(worker);
+		worker->stat_cycle_num_processed = 0;
+	}
+
+	os_thread_exit(NULL);
+	OS_THREAD_DUMMY_RETURN;
+}
+
+/* Debug helper: print every item of a queue to stderr together with its
+free/busy state. Compiled to a no-op unless UNIV_DEBUG is defined.
+@return 0 on success (or when compiled out), -1 if q is NULL */
+int print_queue_wrk_itm(opq_t *q)
+{
+#ifdef UNIV_DEBUG
+	wrk_t *wi = NULL;
+
+	if(!q) {
+		fprintf(stderr, "queue NULL\n");
+		return -1;
+	}
+
+	/* head/tail may only be inspected while the queue mutex is held */
+	pthread_mutex_lock(&q->mtx);
+
+	if(!q->head || !q->tail) {
+		/* an empty queue must have both ends NULL */
+		assert((q->head == NULL) == (q->tail == NULL));
+		fprintf(stderr, "queue empty (h:%p t:%p)\n", q->head, q->tail);
+		pthread_mutex_unlock(&q->mtx);
+		return 0;
+	}
+
+	for(wi = q->head; (wi != NULL) ; wi = wi->next) {
+		fprintf(stderr, "- [%p] [%s] >%p\n",
+			wi, (wi->id_usr == -1)?"free":"Busy", wi->next);
+	}
+	pthread_mutex_unlock(&q->mtx);
+#endif
+	return(0);
+}
+
+/* Print every item of a NULL-terminated work item list to stderr (free or
+busy, flushed pages, time taken, next pointer) followed by the list length.
+A NULL list is reported as such and has length 0.
+@return always 0 */
+int print_wrk_list(wrk_t *wi_list)
+{
+	int	len = 0;
+
+	if(wi_list == NULL) {
+		fprintf(stderr, "list NULL\n");
+	}
+
+	for(wrk_t* wi = wi_list; wi != NULL; wi = wi->next) {
+		const char*	state = (wi->id_usr == -1) ? "free" : "Busy";
+
+		fprintf(stderr, "-\t[%p]\t[%s]\t[%lu]\t[%luus] > %p\n",
+			wi, state, wi->result, wi->t_usec, wi->next);
+		len++;
+	}
+
+	fprintf(stderr, "list len: %d\n", len);
+	return 0;
+}
+
+/******************************************************************//**
+Submits a list of work items through the global work queue (wq) and blocks
+on the completion queue (cq) condition variable until cq is flagged Q_DONE.
+Steps: mark wq Q_EMPTY, insert w_list with q_insert_wrk_list(), reset the
+done counter with set_done_cnt_flag(0), flag both queues Q_PROCESS,
+broadcast on wq.cv, wait on cq.cv, and finally flag wq Q_DONE.
+@param w_list	head of the work item list to insert into wq
+@return 0 once the wait is over, -1 if q_insert_wrk_list() or
+set_done_cnt_flag() reports failure */
+int pgcomp_handler(wrk_t *w_list)
+{
+ int ret=0;
+ opq_t *wrk_q=NULL, *comp_q=NULL;
+
+ wrk_q=&wq;
+ comp_q=&cq;
+
+ pthread_mutex_lock(&wrk_q->mtx);
+ /* setup work queue here.. */
+ wrk_q->flag = Q_EMPTY;
+ pthread_mutex_unlock(&wrk_q->mtx);
+
+ ret = q_insert_wrk_list(wrk_q, w_list);
+ if(ret != 0) {
+ fprintf(stderr, "%s():work-queue setup FAILED wq:%p w_list:%p \n",
+ __FUNCTION__, &wq, w_list);
+ return -1;
+ }
+
+ /* Re-entry point used when cq reports Q_DONE but the done counter does
+ not match. NOTE(review): the work list is not inserted again on this
+ path; only the queue flags and the done counter are reset. Confirm that
+ the worker side re-processes the items in that case. */
+retry_submit:
+ pthread_mutex_lock(&wrk_q->mtx);
+ /* setup work queue here.. */
+ wrk_q->flag = Q_INITIALIZED;
+ pthread_mutex_unlock(&wrk_q->mtx);
+
+
+ /* Reset the done counter and open the completion queue. */
+ pthread_mutex_lock(&comp_q->mtx);
+ if(0 != set_done_cnt_flag(0)) {
+ fprintf(stderr, "FAILED %s:%d\n", __FILE__, __LINE__);
+ pthread_mutex_unlock(&comp_q->mtx);
+ return -1;
+ }
+ comp_q->flag = Q_PROCESS;
+ pthread_mutex_unlock(&comp_q->mtx);
+
+ /* if threads are waiting request them to start */
+ pthread_mutex_lock(&wrk_q->mtx);
+ wrk_q->flag = Q_PROCESS;
+ pthread_cond_broadcast(&wrk_q->cv);
+ pthread_mutex_unlock(&wrk_q->mtx);
+
+ /* Wait on all worker-threads to complete */
+ pthread_mutex_lock(&comp_q->mtx);
+ if (comp_q->flag != Q_DONE) {
+ /* NOTE(review): a `continue` inside do/while jumps to the loop
+ condition below. That condition only keeps looping while the flag
+ is Q_INITIALIZED, so a wakeup that finds the flag in any other
+ non-Q_DONE state (for example Q_PROCESS, set above) leaves the loop
+ and the function returns 0. With asserts compiled out nothing else
+ checks completion. Confirm this is intended. */
+ do {
+ pthread_cond_wait(&comp_q->cv, &comp_q->mtx);
+ if(comp_q->flag != Q_DONE) {
+ /* Woken up although cq is not done yet: print diagnostics. */
+ fprintf(stderr, "[1] cv wait on CQ failed flag:%d cnt:%lu\n",
+ comp_q->flag, done_cnt_flag);
+ if (done_cnt_flag != srv_buf_pool_instances) {
+ fprintf(stderr, "[2] cv wait on CQ failed flag:%d cnt:%lu\n",
+ comp_q->flag, done_cnt_flag);
+ fprintf(stderr, "============\n");
+ print_wrk_list(w_list);
+ fprintf(stderr, "============\n");
+ }
+ continue;
+ } else if (done_cnt_flag != srv_buf_pool_instances) {
+ /* cq is Q_DONE but the done counter differs from the number of
+ buffer pool instances: reset cq and submit again. */
+ fprintf(stderr, "[3]cv wait on CQ failed flag:%d cnt:%lu\n",
+ comp_q->flag, done_cnt_flag);
+ fprintf(stderr, "============\n");
+ print_wrk_list(w_list);
+ fprintf(stderr, "============\n");
+ comp_q->flag = Q_INITIALIZED;
+ pthread_mutex_unlock(&comp_q->mtx);
+ goto retry_submit;
+
+ /* The two statements below follow an unconditional goto and are
+ never executed. */
+ assert(!done_cnt_flag);
+ continue;
+ }
+ assert(done_cnt_flag == srv_buf_pool_instances);
+
+ if ((comp_q->flag == Q_DONE) &&
+ (done_cnt_flag == srv_buf_pool_instances)) {
+ break;
+ }
+ } while((comp_q->flag == Q_INITIALIZED) &&
+ (done_cnt_flag != srv_buf_pool_instances));
+ } else {
+ /* cq was already Q_DONE when the mutex was taken, so no wait is
+ needed. NOTE(review): the message below still says the wait
+ "failed" although no wait happened on this path. */
+ fprintf(stderr, "[4] cv wait on CQ failed flag:%d cnt:%lu\n",
+ comp_q->flag, done_cnt_flag);
+ if (!done_cnt_flag) {
+ /* Q_DONE with a zero done counter: reset cq and submit again. */
+ fprintf(stderr, "============\n");
+ print_wrk_list(w_list);
+ fprintf(stderr, "============\n");
+ comp_q->flag = Q_INITIALIZED;
+ pthread_mutex_unlock(&comp_q->mtx);
+ goto retry_submit;
+ /* Never executed: follows an unconditional goto. */
+ assert(!done_cnt_flag);
+ }
+ assert(done_cnt_flag == srv_buf_pool_instances);
+ }
+
+ /* Completion observed: release cq and mark the work queue done. */
+ pthread_mutex_unlock(&comp_q->mtx);
+ pthread_mutex_lock(&wrk_q->mtx);
+ wrk_q->flag = Q_DONE;
+ pthread_mutex_unlock(&wrk_q->mtx);
+
+ return 0;
+}
+
+/******************************************************************//**
+Sets up the page-compression flush machinery: prepares the work items,
+initializes the work and completion queues and starts the worker threads.
+The function refuses to run a second time once initialization is recorded.
+@param num_threads	number of worker threads to start; must not exceed
+			PGCOMP_MAX_WORKER
+@param wrk_cnt		number of work items to prepare
+@param wq		work queue handed to every worker
+@param cq		completion queue handed to every worker
+@return 0 on success, -1 if already initialized or an argument is invalid */
+int pgcomp_handler_init(int num_threads, int wrk_cnt, opq_t *wq, opq_t *cq)
+{
+	int i=0;
+
+	if(is_pgcomp_wrk_init_done()) {
+		fprintf(stderr, "pgcomp_handler_init(): ERROR already initialized\n");
+		return -1;
+	}
+
+	if(!wq || !cq) {
+		fprintf(stderr, "%s() FAILED wq:%p cq:%p\n", __FUNCTION__, wq, cq);
+		return -1;
+	}
+
+	/* pc_sync is indexed up to PGCOMP_MAX_WORKER below; a larger thread
+	count would write past the slots this function initializes. */
+	if(num_threads > PGCOMP_MAX_WORKER) {
+		fprintf(stderr, "%s() FAILED num_threads:%d max:%d\n",
+			__FUNCTION__, num_threads, (int) PGCOMP_MAX_WORKER);
+		return -1;
+	}
+
+	/* work-item setup */
+	setup_wrk_itm(wrk_cnt);
+
+	/* wq & cq setup */
+	init_queue(wq);
+	init_queue(cq);
+
+	/* Mark each of the thread sync entries */
+	for(i=0; i < PGCOMP_MAX_WORKER; i++) {
+		pc_sync[i].wthread_id = i;
+	}
+
+	/* Create threads for page-compression-flush */
+	for(i=0; i < num_threads; i++) {
+		/* Fill in the whole slot before the thread is started: the
+		new thread receives a pointer to this slot and may read or
+		update it as soon as it runs. */
+		pc_sync[i].wthread_id = i;
+		pc_sync[i].wq = wq;
+		pc_sync[i].cq = cq;
+		/* wthread records the index into thread_ids, not the OS
+		thread identifier itself. */
+		pc_sync[i].wthread = (START_PGCOMP_CNT + i);
+		pc_sync[i].wt_status = WTHR_INITIALIZED;
+		os_thread_create(page_comp_io_thread, ((void *)(pc_sync + i)),
+				 thread_ids + START_PGCOMP_CNT + i);
+	}
+
+	set_check_done_flag_count(wrk_cnt);
+	set_pgcomp_wrk_init_done();
+
+	return 0;
+}
+
+
+/******************************************************************//**
+Prints the stat_universal_num_processed counter of each worker thread slot
+and the sum over all slots to stderr.
+@param wthr		array of worker thread slots
+@param num_threads	number of slots in wthr to report
+@return always 0 */
+int wrk_thread_stat(thread_sync_t *wthr, unsigned int num_threads)
+{
+	/* Unsigned so that the accumulator matches the %lu conversion used
+	for the total below. */
+	unsigned long stat_tot=0;
+	unsigned int i=0;
+
+	for(i=0; i< num_threads;i++) {
+		stat_tot+=wthr[i].stat_universal_num_processed;
+		fprintf(stderr, "[%d] stat [%lu]\n", wthr[i].wthread_id,
+			wthr[i].stat_universal_num_processed);
+	}
+	fprintf(stderr, "Stat-Total:%lu\n", stat_tot);
+	return (0);
+}
+
+/******************************************************************//**
+Empties the global work and completion queues and marks the first
+`items` entries of work_items as unclaimed (id_usr = -1).
+@param items	number of work items to reset
+@return always 0 */
+int reset_wrk_itm(int items)
+{
+	/* Detach everything from the work queue. */
+	pthread_mutex_lock(&wq.mtx);
+	wq.tail = NULL;
+	wq.head = NULL;
+	pthread_mutex_unlock(&wq.mtx);
+
+	/* The work items are reset under the completion queue mutex. */
+	pthread_mutex_lock(&cq.mtx);
+	for (int left = items; left > 0; left--) {
+		work_items[left - 1].id_usr = -1;
+	}
+	cq.tail = NULL;
+	cq.head = NULL;
+	pthread_mutex_unlock(&cq.mtx);
+
+	return 0;
+}
+
+/******************************************************************//**
+Builds one work item per buffer pool instance, hands the list to
+pgcomp_handler() and collects the per-instance results.
+Both global queues must be empty on entry (asserted).
+@param buf_pool_inst		number of buffer pool instances; one work
+				item is prepared for each, must be > 0
+@param per_pool_pages_flushed	output array with buf_pool_inst entries;
+				entry i receives the result of work item i,
+				or 0 when that result is -1
+@param flush_type		stored in every work item
+@param min_n			stored in every work item as `min`
+@param lsn_limit		stored in every work item
+@return 0 on success; -1 if an argument is invalid, pgcomp_handler()
+fails, or any work item reports a result of -1 */
+int pgcomp_flush_work_items(int buf_pool_inst, int *per_pool_pages_flushed,
+		int flush_type, int min_n, unsigned long long lsn_limit)
+{
+	int ret=0, i=0;
+
+	/* With no instances the list terminator would be written to
+	work_items[-1]; a NULL output array cannot receive results. */
+	if(buf_pool_inst <= 0 || per_pool_pages_flushed == NULL) {
+		fprintf(stderr, "%s() FAILED buf_pool_inst:%d per_pool_pages_flushed:%p\n",
+			__FUNCTION__, buf_pool_inst,
+			(void *) per_pool_pages_flushed);
+		return -1;
+	}
+
+	pthread_mutex_lock(&wq.mtx);
+	pthread_mutex_lock(&cq.mtx);
+
+	assert(wq.head == NULL);
+	assert(wq.tail == NULL);
+	if(cq.head) {
+		/* Print the leftover items before the assert below fires. */
+		print_wrk_list(cq.head);
+	}
+	assert(cq.head == NULL);
+	assert(cq.tail == NULL);
+
+	/* Chain the work items into a NULL-terminated list. */
+	for(i=0;i<buf_pool_inst; i++) {
+		work_items[i].buf_pool = buf_pool_from_array(i);
+		work_items[i].flush_type = flush_type;
+		work_items[i].min = min_n;
+		work_items[i].lsn_limit = lsn_limit;
+		work_items[i].id_usr = -1;
+		work_items[i].next = (i + 1 < buf_pool_inst)
+			? &work_items[i + 1] : NULL;
+		work_items[i].wi_status = WRK_ITEM_SET;
+	}
+
+	pthread_mutex_unlock(&cq.mtx);
+	pthread_mutex_unlock(&wq.mtx);
+
+	/* Record a submission failure instead of dropping it silently. */
+	if(pgcomp_handler(work_items) != 0) {
+		ret = -1;
+	}
+
+	pthread_mutex_lock(&wq.mtx);
+	pthread_mutex_lock(&cq.mtx);
+	/* collect data/results total pages flushed */
+	for(i=0; i<buf_pool_inst; i++) {
+		if(work_items[i].result == -1) {
+			ret = -1;
+			per_pool_pages_flushed[i] = 0;
+		} else {
+			per_pool_pages_flushed[i] = work_items[i].result;
+		}
+		/* An item that is still unclaimed and still in the SET state
+		was never picked up. */
+		if((work_items[i].id_usr == -1) && (work_items[i].wi_status == WRK_ITEM_SET )) {
+			fprintf(stderr, "**Set/Unused work_item[%d] flush_type=%d\n", i, work_items[i].flush_type);
+			assert(0);
+		}
+	}
+
+	wq.flag = cq.flag = Q_INITIALIZED;
+
+	pthread_mutex_unlock(&cq.mtx);
+	pthread_mutex_unlock(&wq.mtx);
+
+#if UNIV_DEBUG
+	/* Print work-list stats */
+	fprintf(stderr, "==wq== [DONE]\n");
+	print_wrk_list(wq.head);
+	fprintf(stderr, "==cq== [DONE]\n");
+	print_wrk_list(cq.head);
+	fprintf(stderr, "==worker-thread-stats==\n");
+	wrk_thread_stat(pc_sync, pgc_n_threads);
+#endif
+
+	/* clear up work-queue for next flush */
+	reset_wrk_itm(buf_pool_inst);
+	return(ret);
+}
+
+/* JAN: TODO: END: */
+
/********************************************************************
Starts InnoDB and creates a new database if database files
are not found and the user wants.
@@ -2710,6 +3408,16 @@ files_checked:
}
if (!srv_read_only_mode) {
+ /* JAN: TODO: */
+ if (srv_buf_pool_instances <= PGCOMP_MAX_WORKER) {
+ pgc_n_threads = srv_buf_pool_instances;
+ }
+ /* else we default to 8 worker-threads */
+ pgcomp_handler_init(pgc_n_threads, srv_buf_pool_instances, &wq, &cq);
+ /* JAN: TODO: END */
+#if UNIV_DEBUG
+ fprintf(stderr, "%s:%d buf-pool-instances:%lu\n", __FILE__, __LINE__, srv_buf_pool_instances);
+#endif
os_thread_create(buf_flush_page_cleaner_thread, NULL, NULL);
}