diff options
Diffstat (limited to 'storage/innobase')
93 files changed, 10943 insertions, 802 deletions
diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt index 425d0bd0f1c..964294a962d 100644 --- a/storage/innobase/CMakeLists.txt +++ b/storage/innobase/CMakeLists.txt @@ -18,6 +18,15 @@ INCLUDE(CheckFunctionExists) INCLUDE(CheckCSourceCompiles) INCLUDE(CheckCSourceRuns) +INCLUDE(lz4) +INCLUDE(lzo) +INCLUDE(lzma) +INCLUDE(bzip2) + +MYSQL_CHECK_LZ4() +MYSQL_CHECK_LZO() +MYSQL_CHECK_LZMA() +MYSQL_CHECK_BZIP2() # OS tests IF(UNIX) @@ -328,6 +337,7 @@ SET(INNOBASE_SOURCES btr/btr0cur.cc btr/btr0pcur.cc btr/btr0sea.cc + btr/btr0defragment.cc buf/buf0buddy.cc buf/buf0buf.cc buf/buf0dblwr.cc @@ -336,6 +346,7 @@ SET(INNOBASE_SOURCES buf/buf0flu.cc buf/buf0lru.cc buf/buf0rea.cc + buf/buf0mtflu.cc data/data0data.cc data/data0type.cc dict/dict0boot.cc @@ -349,6 +360,7 @@ SET(INNOBASE_SOURCES eval/eval0eval.cc eval/eval0proc.cc fil/fil0fil.cc + fil/fil0pagecompress.cc fsp/fsp0fsp.cc fut/fut0fut.cc fut/fut0lst.cc @@ -436,7 +448,8 @@ SET(INNOBASE_SOURCES ut/ut0rnd.cc ut/ut0ut.cc ut/ut0vec.cc - ut/ut0wqueue.cc) + ut/ut0wqueue.cc + ut/ut0timer.cc) IF(WITH_INNODB) # Legacy option diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc index 79b533481b7..ff27b470974 100644 --- a/storage/innobase/btr/btr0btr.cc +++ b/storage/innobase/btr/btr0btr.cc @@ -38,6 +38,7 @@ Created 6/2/1994 Heikki Tuuri #include "btr0cur.h" #include "btr0sea.h" #include "btr0pcur.h" +#include "btr0defragment.h" #include "rem0cmp.h" #include "lock0lock.h" #include "ibuf0ibuf.h" @@ -1193,6 +1194,32 @@ btr_get_size( mtr_t* mtr) /*!< in/out: mini-transaction where index is s-latched */ { + ulint used; + if (flag == BTR_N_LEAF_PAGES) { + btr_get_size_and_reserved(index, flag, &used, mtr); + return used; + } else if (flag == BTR_TOTAL_SIZE) { + return btr_get_size_and_reserved(index, flag, &used, mtr); + } else { + ut_error; + } + return (ULINT_UNDEFINED); +} + +/**************************************************************//** +Gets the number of reserved and used 
pages in a B-tree. +@return number of pages reserved, or ULINT_UNDEFINED if the index +is unavailable */ +UNIV_INTERN +ulint +btr_get_size_and_reserved( +/*======================*/ + dict_index_t* index, /*!< in: index */ + ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */ + ulint* used, /*!< out: number of pages used (<= reserved) */ + mtr_t* mtr) /*!< in/out: mini-transaction where index + is s-latched */ +{ fseg_header_t* seg_header; page_t* root; ulint n; @@ -1201,6 +1228,8 @@ btr_get_size( ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), MTR_MEMO_S_LOCK)); + ut_a(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE); + if (index->page == FIL_NULL || dict_index_is_online_ddl(index) || *index->name == TEMP_INDEX_PREFIX) { return(ULINT_UNDEFINED); @@ -1208,21 +1237,16 @@ btr_get_size( root = btr_root_get(index, mtr); - if (flag == BTR_N_LEAF_PAGES) { - seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; + seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; - fseg_n_reserved_pages(seg_header, &n, mtr); + n = fseg_n_reserved_pages(seg_header, used, mtr); - } else if (flag == BTR_TOTAL_SIZE) { + if (flag == BTR_TOTAL_SIZE) { seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP; - n = fseg_n_reserved_pages(seg_header, &dummy, mtr); - - seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; - n += fseg_n_reserved_pages(seg_header, &dummy, mtr); - } else { - ut_error; + *used += dummy; + } return(n); @@ -1971,7 +1995,7 @@ IBUF_BITMAP_FREE is unaffected by reorganization. 
@retval true if the operation was successful @retval false if it is a compressed page, and recompression failed */ -static __attribute__((nonnull)) +UNIV_INTERN bool btr_page_reorganize_block( /*======================*/ @@ -2031,7 +2055,7 @@ btr_parse_page_reorganize( buf_block_t* block, /*!< in: page to be reorganized, or NULL */ mtr_t* mtr) /*!< in: mtr or NULL */ { - ulint level; + ulint level = page_zip_level; ut_ad(ptr && end_ptr); @@ -3059,6 +3083,12 @@ func_start: new_page_zip = buf_block_get_page_zip(new_block); btr_page_create(new_block, new_page_zip, cursor->index, btr_page_get_level(page, mtr), mtr); + /* Only record the leaf level page splits. */ + if (btr_page_get_level(page, mtr) == 0) { + cursor->index->stat_defrag_n_page_split ++; + cursor->index->stat_defrag_modified_counter ++; + btr_defragment_save_defrag_stats_if_needed(cursor->index); + } /* 3. Calculate the first record on the upper half-page, and the first record (move_limit) on original page which ends up on the @@ -3317,31 +3347,9 @@ func_exit: return(rec); } -#ifdef UNIV_SYNC_DEBUG -/*************************************************************//** -Removes a page from the level list of pages. -@param space in: space where removed -@param zip_size in: compressed page size in bytes, or 0 for uncompressed -@param page in/out: page to remove -@param index in: index tree -@param mtr in/out: mini-transaction */ -# define btr_level_list_remove(space,zip_size,page,index,mtr) \ - btr_level_list_remove_func(space,zip_size,page,index,mtr) -#else /* UNIV_SYNC_DEBUG */ -/*************************************************************//** -Removes a page from the level list of pages. 
-@param space in: space where removed -@param zip_size in: compressed page size in bytes, or 0 for uncompressed -@param page in/out: page to remove -@param index in: index tree -@param mtr in/out: mini-transaction */ -# define btr_level_list_remove(space,zip_size,page,index,mtr) \ - btr_level_list_remove_func(space,zip_size,page,mtr) -#endif /* UNIV_SYNC_DEBUG */ - /*************************************************************//** Removes a page from the level list of pages. */ -static __attribute__((nonnull)) +UNIV_INTERN void btr_level_list_remove_func( /*=======================*/ @@ -3513,7 +3521,7 @@ btr_node_ptr_delete( If page is the only on its level, this function moves its records to the father page, thus reducing the tree height. @return father block */ -static +UNIV_INTERN buf_block_t* btr_lift_page_up( /*=============*/ diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc index acc3a4d1c98..b030fd7da79 100644 --- a/storage/innobase/btr/btr0cur.cc +++ b/storage/innobase/btr/btr0cur.cc @@ -1873,9 +1873,13 @@ btr_cur_update_alloc_zip_func( false=update-in-place */ mtr_t* mtr) /*!< in/out: mini-transaction */ { + + /* Have a local copy of the variables as these can change + dynamically. */ const page_t* page = page_cur_get_page(cursor); ut_ad(page_zip == page_cur_get_page_zip(cursor)); + ut_ad(page_zip); ut_ad(!dict_index_is_ibuf(index)); ut_ad(rec_offs_validate(page_cur_get_rec(cursor), index, offsets)); diff --git a/storage/innobase/btr/btr0defragment.cc b/storage/innobase/btr/btr0defragment.cc new file mode 100644 index 00000000000..dfb2cd8dffd --- /dev/null +++ b/storage/innobase/btr/btr0defragment.cc @@ -0,0 +1,818 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014 Facebook, Inc. All Rights Reserved. +Copyright (C) 2014, SkySQL Ab. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ +/**************************************************//** +@file btr/btr0defragment.cc +Index defragmentation. + +Created 05/29/2014 Rongrong Zhong +Modified 16/07/2014 Sunguck Lee +Modified 30/07/2014 Jan Lindström jan.lindstrom@skysql.com +*******************************************************/ + +#include "btr0defragment.h" +#ifndef UNIV_HOTBACKUP +#include "btr0cur.h" +#include "btr0sea.h" +#include "btr0pcur.h" +#include "dict0stats.h" +#include "dict0stats_bg.h" +#include "ibuf0ibuf.h" +#include "lock0lock.h" +#include "srv0start.h" +#include "ut0timer.h" + +#include <list> + +/**************************************************//** +Custom nullptr implementation for under g++ 4.6 +*******************************************************/ +// #pragma once +/* +namespace std +{ + // based on SC22/WG21/N2431 = J16/07-0301 + struct nullptr_t + { + template<typename any> operator any * () const + { + return 0; + } + template<class any, typename T> operator T any:: * () const + { + return 0; + } + +#ifdef _MSC_VER + struct pad {}; + pad __[sizeof(void*)/sizeof(pad)]; +#else + char __[sizeof(void*)]; +#endif +private: + // nullptr_t();// {} + // nullptr_t(const nullptr_t&); + // void operator = (const nullptr_t&); + void operator &() const; + 
template<typename any> void operator +(any) const + { + // I Love MSVC 2005! + } + template<typename any> void operator -(any) const + { + // I Love MSVC 2005! + } + }; +static const nullptr_t __nullptr = {}; +} + +#ifndef nullptr +#define nullptr std::__nullptr +#endif +*/ + +/**************************************************//** +End of Custom nullptr implementation for under g++ 4.6 +*******************************************************/ + +/* When there's no work, either because defragment is disabled, or because no +query is submitted, thread checks state every BTR_DEFRAGMENT_SLEEP_IN_USECS.*/ +#define BTR_DEFRAGMENT_SLEEP_IN_USECS 1000000 +/* Reduce the target page size by this amount when compression failure happens +during defragmentaiton. 512 is chosen because it's a power of 2 and it is about +3% of the page size. When there are compression failures in defragmentation, +our goal is to get a decent defrag ratio with as few compression failure as +possible. From experimentation it seems that reduce the target size by 512 every +time will make sure the page is compressible within a couple of iterations. */ +#define BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE 512 + +/* Work queue for defragmentation. */ +typedef std::list<btr_defragment_item_t*> btr_defragment_wq_t; +static btr_defragment_wq_t btr_defragment_wq; + +/* Mutex protecting the defragmentation work queue.*/ +ib_mutex_t btr_defragment_mutex; +#ifdef UNIV_PFS_MUTEX +UNIV_INTERN mysql_pfs_key_t btr_defragment_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +/* Number of compression failures caused by defragmentation since server +start. */ +ulint btr_defragment_compression_failures = 0; +/* Number of btr_defragment_n_pages calls that altered page but didn't +manage to release any page. */ +ulint btr_defragment_failures = 0; +/* Total number of btr_defragment_n_pages calls that altered page. +The difference between btr_defragment_count and btr_defragment_failures shows +the amount of effort wasted. 
*/ +ulint btr_defragment_count = 0; + +/******************************************************************//** +Constructor for btr_defragment_item_t. */ +btr_defragment_item_t::btr_defragment_item_t( + btr_pcur_t* pcur, + os_event_t event) +{ + this->pcur = pcur; + this->event = event; + this->removed = false; + this->last_processed = 0; +} + +/******************************************************************//** +Destructor for btr_defragment_item_t. */ +btr_defragment_item_t::~btr_defragment_item_t() { + if (this->pcur) { + btr_pcur_free_for_mysql(this->pcur); + } + if (this->event) { + os_event_set(this->event); + } +} + +/******************************************************************//** +Initialize defragmentation. */ +void +btr_defragment_init() +{ + srv_defragment_interval = ut_microseconds_to_timer( + 1000000.0 / srv_defragment_frequency); + mutex_create(btr_defragment_mutex_key, &btr_defragment_mutex, + SYNC_ANY_LATCH); + os_thread_create(btr_defragment_thread, NULL, NULL); +} + +/******************************************************************//** +Shutdown defragmentation. Release all resources. */ +void +btr_defragment_shutdown() +{ + mutex_enter(&btr_defragment_mutex); + list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + while(iter != btr_defragment_wq.end()) { + btr_defragment_item_t* item = *iter; + iter = btr_defragment_wq.erase(iter); + delete item; + } + mutex_exit(&btr_defragment_mutex); + mutex_free(&btr_defragment_mutex); +} + + +/******************************************************************//** +Functions used by the query threads: btr_defragment_xxx_index +Query threads find/add/remove index. */ +/******************************************************************//** +Check whether the given index is in btr_defragment_wq. We use index->id +to identify indices. */ +bool +btr_defragment_find_index( + dict_index_t* index) /*!< Index to find. 
*/ +{ + mutex_enter(&btr_defragment_mutex); + for (list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + iter != btr_defragment_wq.end(); + ++iter) { + btr_defragment_item_t* item = *iter; + btr_pcur_t* pcur = item->pcur; + btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur); + dict_index_t* idx = btr_cur_get_index(cursor); + if (index->id == idx->id) { + mutex_exit(&btr_defragment_mutex); + return true; + } + } + mutex_exit(&btr_defragment_mutex); + return false; +} + +/******************************************************************//** +Query thread uses this function to add an index to btr_defragment_wq. +Return a pointer to os_event for the query thread to wait on if this is a +synchronized defragmentation. */ +os_event_t +btr_defragment_add_index( + dict_index_t* index, /*!< index to be added */ + bool async) /*!< whether this is an async defragmentation */ +{ + mtr_t mtr; + ulint space = dict_index_get_space(index); + ulint zip_size = dict_table_zip_size(index->table); + ulint page_no = dict_index_get_page(index); + mtr_start(&mtr); + // Load index rood page. + page_t* page = btr_page_get(space, zip_size, page_no, + RW_NO_LATCH, index, &mtr); + if (btr_page_get_level(page, &mtr) == 0) { + // Index root is a leaf page, no need to defragment. 
+ mtr_commit(&mtr); + return NULL; + } + btr_pcur_t* pcur = btr_pcur_create_for_mysql(); + os_event_t event = NULL; + if (!async) { + event = os_event_create(); + } + btr_pcur_open_at_index_side(true, index, BTR_SEARCH_LEAF, pcur, + true, 0, &mtr); + btr_pcur_move_to_next(pcur, &mtr); + btr_pcur_store_position(pcur, &mtr); + mtr_commit(&mtr); + dict_stats_empty_defrag_summary(index); + btr_defragment_item_t* item = new btr_defragment_item_t(pcur, event); + mutex_enter(&btr_defragment_mutex); + btr_defragment_wq.push_back(item); + mutex_exit(&btr_defragment_mutex); + return event; +} + +/******************************************************************//** +When table is dropped, this function is called to mark a table as removed in +btr_efragment_wq. The difference between this function and the remove_index +function is this will not NULL the event. */ +void +btr_defragment_remove_table( + dict_table_t* table) /*!< Index to be removed. */ +{ + mutex_enter(&btr_defragment_mutex); + for (list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + iter != btr_defragment_wq.end(); + ++iter) { + btr_defragment_item_t* item = *iter; + btr_pcur_t* pcur = item->pcur; + btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur); + dict_index_t* idx = btr_cur_get_index(cursor); + if (table->id == idx->table->id) { + item->removed = true; + } + } + mutex_exit(&btr_defragment_mutex); +} + +/******************************************************************//** +Query thread uses this function to mark an index as removed in +btr_efragment_wq. */ +void +btr_defragment_remove_index( + dict_index_t* index) /*!< Index to be removed. 
*/ +{ + mutex_enter(&btr_defragment_mutex); + for (list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + iter != btr_defragment_wq.end(); + ++iter) { + btr_defragment_item_t* item = *iter; + btr_pcur_t* pcur = item->pcur; + btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur); + dict_index_t* idx = btr_cur_get_index(cursor); + if (index->id == idx->id) { + item->removed = true; + item->event = NULL; + break; + } + } + mutex_exit(&btr_defragment_mutex); +} + +/******************************************************************//** +Functions used by defragmentation thread: btr_defragment_xxx_item. +Defragmentation thread operates on the work *item*. It gets/removes +item from the work queue. */ +/******************************************************************//** +Defragment thread uses this to remove an item from btr_defragment_wq. +When an item is removed from the work queue, all resources associated with it +are free as well. */ +void +btr_defragment_remove_item( + btr_defragment_item_t* item) /*!< Item to be removed. */ +{ + mutex_enter(&btr_defragment_mutex); + for (list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + iter != btr_defragment_wq.end(); + ++iter) { + if (item == *iter) { + btr_defragment_wq.erase(iter); + delete item; + break; + } + } + mutex_exit(&btr_defragment_mutex); +} + +/******************************************************************//** +Defragment thread uses this to get an item from btr_defragment_wq to work on. +The item is not removed from the work queue so query threads can still access +this item. We keep it this way so query threads can find and kill a +defragmentation even if that index is being worked on. Be aware that while you +work on this item you have no lock protection on it whatsoever. This is OK as +long as the query threads and defragment thread won't modify the same fields +without lock protection. 
+*/ +btr_defragment_item_t* +btr_defragment_get_item() +{ + if (btr_defragment_wq.empty()) { + return NULL; + //return nullptr; + } + mutex_enter(&btr_defragment_mutex); + list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + if (iter == btr_defragment_wq.end()) { + iter = btr_defragment_wq.begin(); + } + btr_defragment_item_t* item = *iter; + iter++; + mutex_exit(&btr_defragment_mutex); + return item; +} + +/*********************************************************************//** +Check whether we should save defragmentation statistics to persistent storage. +Currently we save the stats to persistent storage every 100 updates. */ +UNIV_INTERN +void +btr_defragment_save_defrag_stats_if_needed( + dict_index_t* index) /*!< in: index */ +{ + if (srv_defragment_stats_accuracy != 0 // stats tracking disabled + && dict_index_get_space(index) != 0 // do not track system tables + && index->stat_defrag_modified_counter + >= srv_defragment_stats_accuracy) { + dict_stats_defrag_pool_add(index); + index->stat_defrag_modified_counter = 0; + } +} + +/*********************************************************************//** +Main defragment functionalities used by defragment thread.*/ +/*************************************************************//** +Calculate number of records from beginning of block that can +fit into size_limit +@return number of records */ +UNIV_INTERN +ulint +btr_defragment_calc_n_recs_for_size( + buf_block_t* block, /*!< in: B-tree page */ + dict_index_t* index, /*!< in: index of the page */ + ulint size_limit, /*!< in: size limit to fit records in */ + ulint* n_recs_size) /*!< out: actual size of the records that fit + in size_limit. 
*/ +{ + page_t* page = buf_block_get_frame(block); + ulint n_recs = 0; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + mem_heap_t* heap = NULL; + ulint size = 0; + page_cur_t cur; + + page_cur_set_before_first(block, &cur); + page_cur_move_to_next(&cur); + while (page_cur_get_rec(&cur) != page_get_supremum_rec(page)) { + rec_t* cur_rec = page_cur_get_rec(&cur); + offsets = rec_get_offsets(cur_rec, index, offsets, + ULINT_UNDEFINED, &heap); + ulint rec_size = rec_offs_size(offsets); + size += rec_size; + if (size > size_limit) { + size = size - rec_size; + break; + } + n_recs ++; + page_cur_move_to_next(&cur); + } + *n_recs_size = size; + return n_recs; +} + +/*************************************************************//** +Merge as many records from the from_block to the to_block. Delete +the from_block if all records are successfully merged to to_block. +@return the to_block to target for next merge operation. */ +UNIV_INTERN +buf_block_t* +btr_defragment_merge_pages( + dict_index_t* index, /*!< in: index tree */ + buf_block_t* from_block, /*!< in: origin of merge */ + buf_block_t* to_block, /*!< in: destination of merge */ + ulint zip_size, /*!< in: zip size of the block */ + ulint reserved_space, /*!< in: space reserved for future + insert to avoid immediate page split */ + ulint* max_data_size, /*!< in/out: max data size to + fit in a single compressed page. 
*/ + mem_heap_t* heap, /*!< in/out: pointer to memory heap */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + page_t* from_page = buf_block_get_frame(from_block); + page_t* to_page = buf_block_get_frame(to_block); + ulint space = dict_index_get_space(index); + ulint level = btr_page_get_level(from_page, mtr); + ulint n_recs = page_get_n_recs(from_page); + ulint new_data_size = page_get_data_size(to_page); + ulint max_ins_size = + page_get_max_insert_size(to_page, n_recs); + ulint max_ins_size_reorg = + page_get_max_insert_size_after_reorganize( + to_page, n_recs); + ulint max_ins_size_to_use = max_ins_size_reorg > reserved_space + ? max_ins_size_reorg - reserved_space : 0; + ulint move_size = 0; + ulint n_recs_to_move = 0; + rec_t* rec = NULL; + ulint target_n_recs = 0; + rec_t* orig_pred; + + // Estimate how many records can be moved from the from_page to + // the to_page. + if (zip_size) { + ulint page_diff = UNIV_PAGE_SIZE - *max_data_size; + max_ins_size_to_use = (max_ins_size_to_use > page_diff) + ? max_ins_size_to_use - page_diff : 0; + } + n_recs_to_move = btr_defragment_calc_n_recs_for_size( + from_block, index, max_ins_size_to_use, &move_size); + + // If max_ins_size >= move_size, we can move the records without + // reorganizing the page, otherwise we need to reorganize the page + // first to release more space. + if (move_size > max_ins_size) { + if (!btr_page_reorganize_block(false, page_zip_level, + to_block, index, + mtr)) { + if (!dict_index_is_clust(index) + && page_is_leaf(to_page)) { + ibuf_reset_free_bits(to_block); + } + // If reorganization fails, that means page is + // not compressable. There's no point to try + // merging into this page. Continue to the + // next page. + return from_block; + } + ut_ad(page_validate(to_page, index)); + max_ins_size = page_get_max_insert_size(to_page, n_recs); + ut_a(max_ins_size >= move_size); + } + + // Move records to pack to_page more full. 
+ orig_pred = NULL; + target_n_recs = n_recs_to_move; + while (n_recs_to_move > 0) { + rec = page_rec_get_nth(from_page, + n_recs_to_move + 1); + orig_pred = page_copy_rec_list_start( + to_block, from_block, rec, index, mtr); + if (orig_pred) + break; + // If we reach here, that means compression failed after packing + // n_recs_to_move number of records to to_page. We try to reduce + // the targeted data size on the to_page by + // BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE and try again. + os_atomic_increment_ulint( + &btr_defragment_compression_failures, 1); + max_ins_size_to_use = + move_size > BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE + ? move_size - BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE + : 0; + if (max_ins_size_to_use == 0) { + n_recs_to_move = 0; + move_size = 0; + break; + } + n_recs_to_move = btr_defragment_calc_n_recs_for_size( + from_block, index, max_ins_size_to_use, &move_size); + } + // If less than target_n_recs are moved, it means there are + // compression failures during page_copy_rec_list_start. Adjust + // the max_data_size estimation to reduce compression failures + // in the following runs. + if (target_n_recs > n_recs_to_move + && *max_data_size > new_data_size + move_size) { + *max_data_size = new_data_size + move_size; + } + // Set ibuf free bits if necessary. + if (!dict_index_is_clust(index) + && page_is_leaf(to_page)) { + if (zip_size) { + ibuf_reset_free_bits(to_block); + } else { + ibuf_update_free_bits_if_full( + to_block, + UNIV_PAGE_SIZE, + ULINT_UNDEFINED); + } + } + if (n_recs_to_move == n_recs) { + /* The whole page is merged with the previous page, + free it. 
*/ + lock_update_merge_left(to_block, orig_pred, + from_block); + btr_search_drop_page_hash_index(from_block); + btr_level_list_remove(space, zip_size, from_page, + index, mtr); + btr_node_ptr_delete(index, from_block, mtr); + btr_blob_dbg_remove(from_page, index, + "btr_defragment_n_pages"); + btr_page_free(index, from_block, mtr); + } else { + // There are still records left on the page, so + // increment n_defragmented. Node pointer will be changed + // so remove the old node pointer. + if (n_recs_to_move > 0) { + // Part of the page is merged to left, remove + // the merged records, update record locks and + // node pointer. + dtuple_t* node_ptr; + page_delete_rec_list_start(rec, from_block, + index, mtr); + lock_update_split_and_merge(to_block, + orig_pred, + from_block); + btr_node_ptr_delete(index, from_block, mtr); + rec = page_rec_get_next( + page_get_infimum_rec(from_page)); + node_ptr = dict_index_build_node_ptr( + index, rec, page_get_page_no(from_page), + heap, level + 1); + btr_insert_on_non_leaf_level(0, index, level+1, + node_ptr, mtr); + } + to_block = from_block; + } + return to_block; +} + +/*************************************************************//** +Tries to merge N consecutive pages, starting from the page pointed by the +cursor. Skip space 0. Only consider leaf pages. +This function first loads all N pages into memory, then for each of +the pages other than the first page, it tries to move as many records +as possible to the left sibling to keep the left sibling full. During +the process, if any page becomes empty, that page will be removed from +the level list. Record locks, hash, and node pointers are updated after +page reorganization. 
+@return pointer to the last block processed, or NULL if reaching end of index */ +UNIV_INTERN +buf_block_t* +btr_defragment_n_pages( + buf_block_t* block, /*!< in: starting block for defragmentation */ + dict_index_t* index, /*!< in: index tree */ + uint n_pages,/*!< in: number of pages to defragment */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint space; + ulint zip_size; + /* We will need to load the n+1 block because if the last page is freed + and we need to modify the prev_page_no of that block. */ + buf_block_t* blocks[BTR_DEFRAGMENT_MAX_N_PAGES + 1]; + page_t* first_page; + buf_block_t* current_block; + ulint total_data_size = 0; + ulint total_n_recs = 0; + ulint data_size_per_rec; + ulint optimal_page_size; + ulint reserved_space; + ulint level; + ulint max_data_size = 0; + uint n_defragmented = 0; + uint n_new_slots; + mem_heap_t* heap; + ibool end_of_index = FALSE; + + /* It doesn't make sense to call this function with n_pages = 1. */ + ut_ad(n_pages > 1); + + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK)); + space = dict_index_get_space(index); + if (space == 0) { + /* Ignore space 0. */ + return NULL; + } + + if (n_pages > BTR_DEFRAGMENT_MAX_N_PAGES) { + n_pages = BTR_DEFRAGMENT_MAX_N_PAGES; + } + + zip_size = dict_table_zip_size(index->table); + first_page = buf_block_get_frame(block); + level = btr_page_get_level(first_page, mtr); + + if (level != 0) { + return NULL; + } + + /* 1. Load the pages and calculate the total data size. 
*/ + blocks[0] = block; + for (uint i = 1; i <= n_pages; i++) { + page_t* page = buf_block_get_frame(blocks[i-1]); + ulint page_no = btr_page_get_next(page, mtr); + total_data_size += page_get_data_size(page); + total_n_recs += page_get_n_recs(page); + if (page_no == FIL_NULL) { + n_pages = i; + end_of_index = TRUE; + break; + } + blocks[i] = btr_block_get(space, zip_size, page_no, + RW_X_LATCH, index, mtr); + } + + if (n_pages == 1) { + if (btr_page_get_prev(first_page, mtr) == FIL_NULL) { + /* last page in the index */ + if (dict_index_get_page(index) + == page_get_page_no(first_page)) + return NULL; + /* given page is the last page. + Lift the records to father. */ + btr_lift_page_up(index, block, mtr); + } + return NULL; + } + + /* 2. Calculate how many pages data can fit in. If not compressable, + return early. */ + ut_a(total_n_recs != 0); + data_size_per_rec = total_data_size / total_n_recs; + // For uncompressed pages, the optimal data size if the free space of a + // empty page. + optimal_page_size = page_get_free_space_of_empty( + page_is_comp(first_page)); + // For compressed pages, we take compression failures into account. + if (zip_size) { + ulint size = 0; + int i = 0; + // We estimate the optimal data size of the index use samples of + // data size. These samples are taken when pages failed to + // compress due to insertion on the page. We use the average + // of all samples we have as the estimation. Different pages of + // the same index vary in compressibility. Average gives a good + // enough estimation. 
+ for (;i < STAT_DEFRAG_DATA_SIZE_N_SAMPLE; i++) { + if (index->stat_defrag_data_size_sample[i] == 0) { + break; + } + size += index->stat_defrag_data_size_sample[i]; + } + if (i != 0) { + size = size / i; + optimal_page_size = min(optimal_page_size, size); + } + max_data_size = optimal_page_size; + } + + reserved_space = min((ulint)(optimal_page_size + * (1 - srv_defragment_fill_factor)), + (data_size_per_rec + * srv_defragment_fill_factor_n_recs)); + optimal_page_size -= reserved_space; + n_new_slots = (total_data_size + optimal_page_size - 1) + / optimal_page_size; + if (n_new_slots >= n_pages) { + /* Can't defragment. */ + if (end_of_index) + return NULL; + return blocks[n_pages-1]; + } + + /* 3. Defragment pages. */ + heap = mem_heap_create(256); + // First defragmented page will be the first page. + current_block = blocks[0]; + // Start from the second page. + for (uint i = 1; i < n_pages; i ++) { + buf_block_t* new_block = btr_defragment_merge_pages( + index, blocks[i], current_block, zip_size, + reserved_space, &max_data_size, heap, mtr); + if (new_block != current_block) { + n_defragmented ++; + current_block = new_block; + } + } + mem_heap_free(heap); + n_defragmented ++; + os_atomic_increment_ulint( + &btr_defragment_count, 1); + if (n_pages == n_defragmented) { + os_atomic_increment_ulint( + &btr_defragment_failures, 1); + } else { + index->stat_defrag_n_pages_freed += (n_pages - n_defragmented); + } + if (end_of_index) + return NULL; + return current_block; +} + +/******************************************************************//** +Thread that merges consecutive b-tree pages into fewer pages to defragment +the index. 
*/ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(btr_defragment_thread)( +/*==========================================*/ + void* arg) /*!< in: work queue */ +{ + btr_pcur_t* pcur; + btr_cur_t* cursor; + dict_index_t* index; + mtr_t mtr; + buf_block_t* first_block; + buf_block_t* last_block; + + while (srv_shutdown_state == SRV_SHUTDOWN_NONE) { + /* If defragmentation is disabled, sleep before + checking whether it's enabled. */ + if (!srv_defragment) { + os_thread_sleep(BTR_DEFRAGMENT_SLEEP_IN_USECS); + continue; + } + /* The following call won't remove the item from work queue. + We only get a pointer to it to work on. This will make sure + when user issue a kill command, all indices are in the work + queue to be searched. This also means that the user thread + cannot directly remove the item from queue (since we might be + using it). So user thread only marks index as removed. */ + btr_defragment_item_t* item = btr_defragment_get_item(); + /* If work queue is empty, sleep and check later. */ + if (!item) { + os_thread_sleep(BTR_DEFRAGMENT_SLEEP_IN_USECS); + continue; + } + /* If an index is marked as removed, we remove it from the work + queue. No other thread could be using this item at this point so + it's safe to remove now. */ + if (item->removed) { + btr_defragment_remove_item(item); + continue; + } + + pcur = item->pcur; + ulonglong now = ut_timer_now(); + ulonglong elapsed = now - item->last_processed; + + if (elapsed < srv_defragment_interval) { + /* If we see an index again before the interval + determined by the configured frequency is reached, + we just sleep until the interval pass. Since + defragmentation of all indices queue up on a single + thread, it's likely other indices that follow this one + don't need to sleep again. 
*/ + os_thread_sleep(((ulint)ut_timer_to_microseconds( + srv_defragment_interval - elapsed))); + } + + now = ut_timer_now(); + mtr_start(&mtr); + btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, &mtr); + cursor = btr_pcur_get_btr_cur(pcur); + index = btr_cur_get_index(cursor); + first_block = btr_cur_get_block(cursor); + last_block = btr_defragment_n_pages(first_block, index, + srv_defragment_n_pages, + &mtr); + if (last_block) { + /* If we haven't reached the end of the index, + place the cursor on the last record of last page, + store the cursor position, and put back in queue. */ + page_t* last_page = buf_block_get_frame(last_block); + rec_t* rec = page_rec_get_prev( + page_get_supremum_rec(last_page)); + ut_a(page_rec_is_user_rec(rec)); + page_cur_position(rec, last_block, + btr_cur_get_page_cur(cursor)); + btr_pcur_store_position(pcur, &mtr); + mtr_commit(&mtr); + /* Update the last_processed time of this index. */ + item->last_processed = now; + } else { + mtr_commit(&mtr); + /* Reaching the end of the index. */ + dict_stats_empty_defrag_stats(index); + dict_stats_save_defrag_stats(index); + dict_stats_save_defrag_summary(index); + btr_defragment_remove_item(item); + } + } + btr_defragment_shutdown(); + os_thread_exit(NULL); + OS_THREAD_DUMMY_RETURN; +} + +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index 9fceae0f880..33c9eb7a0f2 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -2,6 +2,7 @@ Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Google, Inc. 
Those modifications are gratefully acknowledged and are described @@ -54,6 +55,8 @@ Created 11/5/1995 Heikki Tuuri #include "srv0mon.h" #include "buf0checksum.h" +#include <new> + /* IMPLEMENTATION OF THE BUFFER POOL ================================= @@ -829,6 +832,11 @@ buf_page_print( mach_read_from_4(read_buf + FIL_PAGE_OFFSET), mach_read_from_4(read_buf + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID)); + + ulint page_type = mach_read_from_4(read_buf + FIL_PAGE_TYPE); + + fprintf(stderr, "InnoDB: page type %ld meaning %s\n", page_type, + fil_get_page_type_name(page_type)); } #ifndef UNIV_HOTBACKUP @@ -1323,6 +1331,19 @@ buf_pool_init_instance( buf_pool->try_LRU_scan = TRUE; + /* Initialize the hazard pointer for flush_list batches */ + new(&buf_pool->flush_hp) + FlushHp(buf_pool, &buf_pool->flush_list_mutex); + + /* Initialize the hazard pointer for LRU batches */ + new(&buf_pool->lru_hp) LRUHp(buf_pool, &buf_pool->mutex); + + /* Initialize the iterator for LRU scan search */ + new(&buf_pool->lru_scan_itr) LRUItr(buf_pool, &buf_pool->mutex); + + /* Initialize the iterator for single page scan search */ + new(&buf_pool->single_scan_itr) LRUItr(buf_pool, &buf_pool->mutex); + buf_pool_mutex_exit(buf_pool); return(DB_SUCCESS); @@ -1413,6 +1434,8 @@ buf_pool_init( btr_search_sys_create(buf_pool_get_curr_size() / sizeof(void*) / 64); + buf_flush_event = os_event_create(); + return(DB_SUCCESS); } @@ -1529,6 +1552,10 @@ buf_relocate( memcpy(dpage, bpage, sizeof *dpage); + /* Important that we adjust the hazard pointer before + removing bpage from LRU list. */ + buf_LRU_adjust_hp(buf_pool, bpage); + ut_d(bpage->in_LRU_list = FALSE); ut_d(bpage->in_page_hash = FALSE); @@ -1567,6 +1594,84 @@ buf_relocate( HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, dpage); } +/** Hazard Pointer implementation. 
*/ + +/** Set current value +@param bpage buffer block to be set as hp */ +void +HazardPointer::set(buf_page_t* bpage) +{ + ut_ad(mutex_own(m_mutex)); + ut_ad(!bpage || buf_pool_from_bpage(bpage) == m_buf_pool); + ut_ad(!bpage || buf_page_in_file(bpage)); + + m_hp = bpage; +} + +/** Checks if a bpage is the hp +@param bpage buffer block to be compared +@return true if it is hp */ + +bool +HazardPointer::is_hp(const buf_page_t* bpage) +{ + ut_ad(mutex_own(m_mutex)); + ut_ad(!m_hp || buf_pool_from_bpage(m_hp) == m_buf_pool); + ut_ad(!bpage || buf_pool_from_bpage(bpage) == m_buf_pool); + + return(bpage == m_hp); +} + +/** Adjust the value of hp. This happens when some other thread working +on the same list attempts to remove the hp from the list. +@param bpage buffer block to be compared */ + +void +FlushHp::adjust(const buf_page_t* bpage) +{ + ut_ad(bpage != NULL); + + /** We only support reverse traversal for now. */ + if (is_hp(bpage)) { + m_hp = UT_LIST_GET_PREV(list, m_hp); + } + + ut_ad(!m_hp || m_hp->in_flush_list); +} + +/** Adjust the value of hp. This happens when some other thread working +on the same list attempts to remove the hp from the list. +@param bpage buffer block to be compared */ + +void +LRUHp::adjust(const buf_page_t* bpage) +{ + ut_ad(bpage); + + /** We only support reverse traversal for now. */ + if (is_hp(bpage)) { + m_hp = UT_LIST_GET_PREV(LRU, m_hp); + } + + ut_ad(!m_hp || m_hp->in_LRU_list); +} + +/** Selects from where to start a scan. If we have scanned too deep into +the LRU list it resets the value to the tail of the LRU list. +@return buf_page_t from where to start scan. */ + +buf_page_t* +LRUItr::start() +{ + ut_ad(mutex_own(m_mutex)); + + if (!m_hp || m_hp->old) { + m_hp = UT_LIST_GET_LAST(m_buf_pool->LRU); + } + + return(m_hp); +} + /********************************************************************//** Determine if a block is a sentinel for a buffer pool watch. 
@return TRUE if a sentinel for a buffer pool watch, FALSE if not */ @@ -3363,6 +3468,7 @@ buf_page_init_low( bpage->access_time = 0; bpage->newest_modification = 0; bpage->oldest_modification = 0; + bpage->write_size = 0; HASH_INVALIDATE(bpage, hash); #if defined UNIV_DEBUG_FILE_ACCESSES || defined UNIV_DEBUG bpage->file_page_was_freed = FALSE; @@ -4050,7 +4156,10 @@ UNIV_INTERN bool buf_page_io_complete( /*=================*/ - buf_page_t* bpage) /*!< in: pointer to the block in question */ + buf_page_t* bpage, /*!< in: pointer to the block in question */ + bool evict) /*!< in: whether or not to evict the page + from LRU list. */ + { enum buf_io_fix io_type; buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); @@ -4232,6 +4341,7 @@ corrupt: id. */ buf_page_set_io_fix(bpage, BUF_IO_NONE); + buf_page_monitor(bpage, io_type); switch (io_type) { case BUF_IO_READ: @@ -4248,6 +4358,8 @@ corrupt: BUF_IO_READ); } + mutex_exit(buf_page_get_mutex(bpage)); + break; case BUF_IO_WRITE: @@ -4263,14 +4375,30 @@ corrupt: buf_pool->stat.n_pages_written++; + /* In case of flush batches i.e.: BUF_FLUSH_LIST and + BUF_FLUSH_LRU this function is always called from IO + helper thread. In this case, we decide whether or not + to evict the page based on flush type. The value + passed as evict is the default value in function + definition which is false. + We always evict in case of LRU batch and never evict + in case of flush list batch. For single page flush + the caller sets the appropriate value. 
*/ + if (buf_page_get_flush_type(bpage) == BUF_FLUSH_LRU) { + evict = true; + } + + mutex_exit(buf_page_get_mutex(bpage)); + if (evict) { + buf_LRU_free_page(bpage, true); + } + break; default: ut_error; } - buf_page_monitor(bpage, io_type); - #ifdef UNIV_DEBUG if (buf_debug_prints) { fprintf(stderr, "Has %s page space %lu page no %lu\n", @@ -4280,7 +4408,6 @@ corrupt: } #endif /* UNIV_DEBUG */ - mutex_exit(buf_page_get_mutex(bpage)); buf_pool_mutex_exit(buf_pool); return(true); diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc index 62222993622..c903f5fbffa 100644 --- a/storage/innobase/buf/buf0dblwr.cc +++ b/storage/innobase/buf/buf0dblwr.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -382,7 +383,7 @@ buf_dblwr_init_or_load_pages( /* Read the trx sys header to check if we are using the doublewrite buffer */ off_t trx_sys_page = TRX_SYS_PAGE_NO * UNIV_PAGE_SIZE; - os_file_read(file, read_buf, trx_sys_page, UNIV_PAGE_SIZE); + os_file_read(file, read_buf, trx_sys_page, UNIV_PAGE_SIZE, FALSE); doublewrite = read_buf + TRX_SYS_DOUBLEWRITE; @@ -419,9 +420,9 @@ buf_dblwr_init_or_load_pages( block_bytes = TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * UNIV_PAGE_SIZE; - os_file_read(file, buf, block1 * UNIV_PAGE_SIZE, block_bytes); + os_file_read(file, buf, block1 * UNIV_PAGE_SIZE, block_bytes, FALSE); os_file_read(file, buf + block_bytes, block2 * UNIV_PAGE_SIZE, - block_bytes); + block_bytes, FALSE); /* Check if any of these pages is half-written in data files, in the intended position */ @@ -451,7 +452,6 @@ buf_dblwr_init_or_load_pages( os_file_write(path, file, page, source_page_no * UNIV_PAGE_SIZE, 
UNIV_PAGE_SIZE); - } else if (load_corrupt_pages) { recv_dblwr.add(page); @@ -514,7 +514,7 @@ buf_dblwr_process() fil_io(OS_FILE_READ, true, space_id, zip_size, page_no, 0, zip_size ? zip_size : UNIV_PAGE_SIZE, - read_buf, NULL); + read_buf, NULL, 0); /* Check if the page is corrupt */ @@ -566,7 +566,7 @@ buf_dblwr_process() fil_io(OS_FILE_WRITE, true, space_id, zip_size, page_no, 0, zip_size ? zip_size : UNIV_PAGE_SIZE, - page, NULL); + page, NULL, 0); ib_logf(IB_LOG_LEVEL_INFO, "Recovered the page from" @@ -586,7 +586,7 @@ buf_dblwr_process() zip_size, page_no, 0, zip_size ? zip_size : UNIV_PAGE_SIZE, - page, NULL); + page, NULL, 0); } } } @@ -798,7 +798,7 @@ buf_dblwr_write_block_to_datafile( buf_page_get_page_no(bpage), 0, buf_page_get_zip_size(bpage), (void*) bpage->zip.data, - (void*) bpage); + (void*) bpage, 0); return; } @@ -810,8 +810,7 @@ buf_dblwr_write_block_to_datafile( fil_io(flags, sync, buf_block_get_space(block), 0, buf_block_get_page_no(block), 0, UNIV_PAGE_SIZE, - (void*) block->frame, (void*) block); - + (void*) block->frame, (void*) block, (ulint *)&bpage->write_size); } /********************************************************************//** @@ -905,7 +904,7 @@ try_again: fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0, buf_dblwr->block1, 0, len, - (void*) write_buf, NULL); + (void*) write_buf, NULL, 0); if (buf_dblwr->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { /* No unwritten pages in the second block. */ @@ -921,7 +920,7 @@ try_again: fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0, buf_dblwr->block2, 0, len, - (void*) write_buf, NULL); + (void*) write_buf, NULL, 0); flush: /* increment the doublewrite flushed pages counter */ @@ -1150,14 +1149,14 @@ retry: fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0, offset, 0, UNIV_PAGE_SIZE, (void*) (buf_dblwr->write_buf - + UNIV_PAGE_SIZE * i), NULL); + + UNIV_PAGE_SIZE * i), NULL, 0); } else { /* It is a regular page. 
Write it directly to the doublewrite buffer */ fil_io(OS_FILE_WRITE, true, TRX_SYS_SPACE, 0, offset, 0, UNIV_PAGE_SIZE, (void*) ((buf_block_t*) bpage)->frame, - NULL); + NULL, 0); } /* Now flush the doublewrite buffer data to disk */ diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 6b219262207..32c3c816a85 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -1,6 +1,8 @@ /***************************************************************************** Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. +Copyright (c) 2013, 2014, Fusion-io. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -30,6 +32,7 @@ Created 11/11/1995 Heikki Tuuri #endif #include "buf0buf.h" +#include "buf0mtflu.h" #include "buf0checksum.h" #include "srv0start.h" #include "srv0srv.h" @@ -44,10 +47,12 @@ Created 11/11/1995 Heikki Tuuri #include "ibuf0ibuf.h" #include "log0log.h" #include "os0file.h" +#include "os0sync.h" #include "trx0sys.h" #include "srv0mon.h" #include "mysql/plugin.h" #include "mysql/service_thd_wait.h" +#include "fil0pagecompress.h" /** Number of pages flushed through non flush_list flushes. */ static ulint buf_lru_flush_page_count = 0; @@ -59,14 +64,13 @@ need to protect it by a mutex. It is only ever read by the thread doing the shutdown */ UNIV_INTERN ibool buf_page_cleaner_is_active = FALSE; -/** LRU flush batch is further divided into this chunk size to -reduce the wait time for the threads waiting for a clean block */ -#define PAGE_CLEANER_LRU_BATCH_CHUNK_SIZE 100 - #ifdef UNIV_PFS_THREAD UNIV_INTERN mysql_pfs_key_t buf_page_cleaner_thread_key; #endif /* UNIV_PFS_THREAD */ +/** Event to synchronise with the flushing. 
*/ + os_event_t buf_flush_event; + /** If LRU list of a buf_pool is less than this size then LRU eviction should not happen. This is because when we do LRU flushing we also put the blocks on free list. If LRU list is very small then we can end up @@ -75,15 +79,6 @@ in thrashing. */ /* @} */ -/** Handled page counters for a single flush */ -struct flush_counters_t { - ulint flushed; /*!< number of dirty pages flushed */ - ulint evicted; /*!< number of clean pages evicted, including - evicted uncompressed page images */ - ulint unzip_LRU_evicted;/*!< number of uncompressed page images - evicted */ -}; - /******************************************************************//** Increases flush_list size in bytes with zip_size for compressed page, UNIV_PAGE_SIZE for uncompressed page in inline function */ @@ -139,60 +134,6 @@ buf_flush_validate_skip( } #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ -/*******************************************************************//** -Sets hazard pointer during flush_list iteration. 
*/ -UNIV_INLINE -void -buf_flush_set_hp( -/*=============*/ - buf_pool_t* buf_pool,/*!< in/out: buffer pool instance */ - const buf_page_t* bpage) /*!< in: buffer control block */ -{ - ut_ad(buf_flush_list_mutex_own(buf_pool)); - ut_ad(buf_pool->flush_list_hp == NULL || bpage == NULL); - ut_ad(!bpage || buf_page_in_file(bpage)); - ut_ad(!bpage || bpage->in_flush_list); - ut_ad(!bpage || buf_pool_from_bpage(bpage) == buf_pool); - - buf_pool->flush_list_hp = bpage; -} - -/*******************************************************************//** -Checks if the given block is a hazard pointer -@return true if bpage is hazard pointer */ -UNIV_INLINE -bool -buf_flush_is_hp( -/*============*/ - buf_pool_t* buf_pool,/*!< in: buffer pool instance */ - const buf_page_t* bpage) /*!< in: buffer control block */ -{ - ut_ad(buf_flush_list_mutex_own(buf_pool)); - - return(buf_pool->flush_list_hp == bpage); -} - -/*******************************************************************//** -Whenever we move a block in flush_list (either to remove it or to -relocate it) we check the hazard pointer set by some other thread -doing the flush list scan. If the hazard pointer is the same as the -one we are about going to move then we set it to NULL to force a rescan -in the thread doing the batch. */ -UNIV_INLINE -void -buf_flush_update_hp( -/*================*/ - buf_pool_t* buf_pool, /*!< in: buffer pool instance */ - buf_page_t* bpage) /*!< in: buffer control block */ -{ - ut_ad(buf_flush_list_mutex_own(buf_pool)); - - if (buf_flush_is_hp(buf_pool, bpage)) { - buf_flush_set_hp(buf_pool, NULL); - MONITOR_INC(MONITOR_FLUSH_HP_RESCAN); - } -} - /******************************************************************//** Insert a block in the flush_rbt and returns a pointer to its predecessor or NULL if no predecessor. 
The ordering is maintained @@ -591,6 +532,10 @@ buf_flush_remove( buf_flush_list_mutex_enter(buf_pool); + /* Important that we adjust the hazard pointer before removing + the bpage from flush list. */ + buf_pool->flush_hp.adjust(bpage); + switch (buf_page_get_state(bpage)) { case BUF_BLOCK_POOL_WATCH: case BUF_BLOCK_ZIP_PAGE: @@ -631,7 +576,6 @@ buf_flush_remove( ut_a(buf_flush_validate_skip(buf_pool)); #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ - buf_flush_update_hp(buf_pool, bpage); buf_flush_list_mutex_exit(buf_pool); } @@ -682,6 +626,10 @@ buf_flush_relocate_on_flush_list( prev_b = buf_flush_insert_in_flush_rbt(dpage); } + /* Important that we adjust the hazard pointer before removing + the bpage from the flush list. */ + buf_pool->flush_hp.adjust(bpage); + /* Must be done after we have removed it from the flush_rbt because we assert on in_flush_list in comparison function. */ ut_d(bpage->in_flush_list = FALSE); @@ -710,7 +658,6 @@ buf_flush_relocate_on_flush_list( ut_a(buf_flush_validate_low(buf_pool)); #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ - buf_flush_update_hp(buf_pool, bpage); buf_flush_list_mutex_exit(buf_pool); } @@ -732,8 +679,10 @@ buf_flush_write_complete( flush_type = buf_page_get_flush_type(bpage); buf_pool->n_flush[flush_type]--; +#ifdef UNIV_DEBUG /* fprintf(stderr, "n pending flush %lu\n", buf_pool->n_flush[flush_type]); */ +#endif if (buf_pool->n_flush[flush_type] == 0 && buf_pool->init_flush[flush_type] == FALSE) { @@ -891,6 +840,8 @@ buf_flush_write_block_low( { ulint zip_size = buf_page_get_zip_size(bpage); page_t* frame = NULL; + ulint space_id = buf_page_get_space(bpage); + atomic_writes_t awrites = fil_space_get_atomic_writes(space_id); #ifdef UNIV_DEBUG buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); @@ -967,12 +918,28 @@ buf_flush_write_block_low( sync, buf_page_get_space(bpage), zip_size, buf_page_get_page_no(bpage), 0, zip_size ? 
zip_size : UNIV_PAGE_SIZE, - frame, bpage); - } else if (flush_type == BUF_FLUSH_SINGLE_PAGE) { - buf_dblwr_write_single_page(bpage, sync); + frame, bpage, &bpage->write_size); } else { - ut_ad(!sync); - buf_dblwr_add_to_batch(bpage); + + /* InnoDB uses doublewrite buffer and doublewrite buffer + is initialized. User can define do we use atomic writes + on a file space (table) or not. If atomic writes are + not used we should use doublewrite buffer and if + atomic writes should be used, no doublewrite buffer + is used. */ + + if (awrites == ATOMIC_WRITES_ON) { + fil_io(OS_FILE_WRITE | OS_AIO_SIMULATED_WAKE_LATER, + FALSE, buf_page_get_space(bpage), zip_size, + buf_page_get_page_no(bpage), 0, + zip_size ? zip_size : UNIV_PAGE_SIZE, + frame, bpage, &bpage->write_size); + } else if (flush_type == BUF_FLUSH_SINGLE_PAGE) { + buf_dblwr_write_single_page(bpage, sync); + } else { + ut_ad(!sync); + buf_dblwr_add_to_batch(bpage); + } } /* When doing single page flushing the IO is done synchronously @@ -981,7 +948,10 @@ buf_flush_write_block_low( if (sync) { ut_ad(flush_type == BUF_FLUSH_SINGLE_PAGE); fil_flush(buf_page_get_space(bpage)); - buf_page_io_complete(bpage); + + /* true means we want to evict this page from the + LRU list as well. */ + buf_page_io_complete(bpage, true); } /* Increment the counter of I/O operations used @@ -1075,10 +1045,10 @@ buf_flush_page( rw_lock_s_lock_gen(rw_lock, BUF_IO_WRITE); } - /* Even though bpage is not protected by any mutex at this - point, it is safe to access bpage, because it is io_fixed and - oldest_modification != 0. Thus, it cannot be relocated in the - buffer pool or removed from flush_list or LRU_list. */ + /* Even though bpage is not protected by any mutex at this + point, it is safe to access bpage, because it is io_fixed and + oldest_modification != 0. Thus, it cannot be relocated in the + buffer pool or removed from flush_list or LRU_list. 
*/ buf_flush_write_block_low(bpage, flush_type, sync); } @@ -1232,7 +1202,9 @@ buf_flush_try_neighbors( } } +#ifdef UNIV_DEBUG /* fprintf(stderr, "Flush area: low %lu high %lu\n", low, high); */ +#endif if (high > fil_space_get_size(space)) { high = fil_space_get_size(space); @@ -1440,9 +1412,8 @@ This utility flushes dirty blocks from the end of the LRU list. The calling thread is not allowed to own any latches on pages! It attempts to make 'max' blocks available in the free list. Note that it is a best effort attempt and it is not guaranteed that after a call -to this function there will be 'max' blocks in the free list. -@return number of blocks for which the write request was queued. */ -static +to this function there will be 'max' blocks in the free list.*/ +__attribute__((nonnull)) void buf_flush_LRU_list_batch( /*=====================*/ @@ -1453,96 +1424,54 @@ buf_flush_LRU_list_batch( counts */ { buf_page_t* bpage; - ulint count = 0; ulint scanned = 0; ulint free_len = UT_LIST_GET_LEN(buf_pool->free); ulint lru_len = UT_LIST_GET_LEN(buf_pool->LRU); - ut_ad(buf_pool_mutex_own(buf_pool)); - n->flushed = 0; n->evicted = 0; n->unzip_LRU_evicted = 0; - bpage = UT_LIST_GET_LAST(buf_pool->LRU); - while (bpage != NULL && count < max - && (n->flushed + n->evicted) < max - && free_len < srv_LRU_scan_depth - && lru_len > BUF_LRU_MIN_LEN) { + ut_ad(buf_pool_mutex_own(buf_pool)); - ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); - ibool evict; + for (bpage = UT_LIST_GET_LAST(buf_pool->LRU); + bpage != NULL && (n->evicted + n->flushed) < max + && free_len < srv_LRU_scan_depth + && lru_len > BUF_LRU_MIN_LEN; + ++scanned, + bpage = buf_pool->lru_hp.get()) { + buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage); + buf_pool->lru_hp.set(prev); + + ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); mutex_enter(block_mutex); - evict = buf_flush_ready_for_replace(bpage); + bool evict = buf_flush_ready_for_replace(bpage); mutex_exit(block_mutex); - ++scanned; - - /* If the 
block is ready to be replaced we try to - free it i.e.: put it on the free list. - Otherwise we try to flush the block and its - neighbors. In this case we'll put it on the - free list in the next pass. We do this extra work - of putting blocks to the free list instead of - just flushing them because after every flush - we have to restart the scan from the tail of - the LRU list and if we don't clear the tail - of the flushed pages then the scan becomes - O(n*n). */ if (evict) { + /* block is ready for eviction i.e., it is + clean and is not IO-fixed or buffer fixed. */ if (buf_LRU_free_page(bpage, true)) { - /* buf_pool->mutex was potentially - released and reacquired. */ n->evicted++; - bpage = UT_LIST_GET_LAST(buf_pool->LRU); - } else { - bpage = UT_LIST_GET_PREV(LRU, bpage); } } else { - ulint space; - ulint offset; - buf_page_t* prev_bpage; - - prev_bpage = UT_LIST_GET_PREV(LRU, bpage); - - /* Save the previous bpage */ - - if (prev_bpage != NULL) { - space = prev_bpage->space; - offset = prev_bpage->offset; - } else { - space = ULINT_UNDEFINED; - offset = ULINT_UNDEFINED; - } - - if (!buf_flush_page_and_try_neighbors( - bpage, BUF_FLUSH_LRU, max, &n->flushed)) { - - bpage = prev_bpage; - } else { - /* buf_pool->mutex was released. - reposition the iterator. Note: the - prev block could have been repositioned - too but that should be rare. */ - - if (prev_bpage != NULL) { - - ut_ad(space != ULINT_UNDEFINED); - ut_ad(offset != ULINT_UNDEFINED); - - prev_bpage = buf_page_hash_get( - buf_pool, space, offset); - } - - bpage = prev_bpage; - } + /* Block is ready for flush. Dispatch an IO + request. The IO helper thread will put it on + free list in IO completion routine. 
*/ + buf_flush_page_and_try_neighbors( + bpage, BUF_FLUSH_LRU, max, &n->flushed); } + ut_ad(!mutex_own(block_mutex)); + ut_ad(buf_pool_mutex_own(buf_pool)); + free_len = UT_LIST_GET_LEN(buf_pool->free); lru_len = UT_LIST_GET_LEN(buf_pool->LRU); } + buf_pool->lru_hp.set(NULL); + /* We keep track of all flushes happening as part of LRU flush. When estimating the desired rate at which flush_list should be flushed, we factor in this value. */ @@ -1561,10 +1490,8 @@ buf_flush_LRU_list_batch( /*******************************************************************//** Flush and move pages from LRU or unzip_LRU list to the free list. -Whether LRU or unzip_LRU is used depends on the state of the system. -@return number of blocks for which either the write request was queued -or in case of unzip_LRU the number of blocks actually moved to the -free list */ +Whether LRU or unzip_LRU is used depends on the state of the system.*/ +__attribute__((nonnull)) static void buf_do_LRU_batch( @@ -1575,7 +1502,6 @@ buf_do_LRU_batch( flush_counters_t* n) /*!< out: flushed/evicted page counts */ { - if (buf_LRU_evict_from_unzip_LRU(buf_pool)) { n->unzip_LRU_evicted = buf_free_from_unzip_LRU_list_batch(buf_pool, max); } else { @@ -1588,6 +1514,10 @@ buf_do_LRU_batch( n->evicted = 0; n->flushed = 0; } + + /* Add evicted pages from unzip_LRU to the evicted pages from + the simple LRU. 
*/ + n->evicted += n->unzip_LRU_evicted; } /*******************************************************************//** @@ -1629,6 +1559,7 @@ buf_do_flush_list_batch( for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool->flush_list); count < min_n && bpage != NULL && len > 0 && bpage->oldest_modification < lsn_limit; + bpage = buf_pool->flush_hp.get(), ++scanned) { buf_page_t* prev; @@ -1637,8 +1568,7 @@ buf_do_flush_list_batch( ut_ad(bpage->in_flush_list); prev = UT_LIST_GET_PREV(list, bpage); - buf_flush_set_hp(buf_pool, prev); - + buf_pool->flush_hp.set(prev); buf_flush_list_mutex_exit(buf_pool); #ifdef UNIV_DEBUG @@ -1649,23 +1579,12 @@ buf_do_flush_list_batch( buf_flush_list_mutex_enter(buf_pool); - ut_ad(flushed || buf_flush_is_hp(buf_pool, prev)); + ut_ad(flushed || buf_pool->flush_hp.is_hp(prev)); - if (!buf_flush_is_hp(buf_pool, prev)) { - /* The hazard pointer was reset by some other - thread. Restart the scan. */ - ut_ad(buf_flush_is_hp(buf_pool, NULL)); - bpage = UT_LIST_GET_LAST(buf_pool->flush_list); - len = UT_LIST_GET_LEN(buf_pool->flush_list); - } else { - bpage = prev; - --len; - buf_flush_set_hp(buf_pool, NULL); - } - - ut_ad(!bpage || bpage->in_flush_list); + --len; } + buf_pool->flush_hp.set(NULL); buf_flush_list_mutex_exit(buf_pool); MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_SCANNED, @@ -1683,9 +1602,8 @@ This utility flushes dirty blocks from the end of the LRU list or flush_list. NOTE 1: in the case of an LRU flush the calling thread may own latches to pages: to avoid deadlocks, this function must be written so that it cannot end up waiting for these latches! NOTE 2: in the case of a flush list flush, -the calling thread is not allowed to own any latches on pages! -@return number of blocks for which the write request was queued */ -static +the calling thread is not allowed to own any latches on pages! 
*/ +__attribute__((nonnull)) void buf_flush_batch( /*============*/ @@ -1705,7 +1623,6 @@ buf_flush_batch( flush_counters_t* n) /*!< out: flushed/evicted page counts */ { - ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); #ifdef UNIV_SYNC_DEBUG ut_ad((flush_type != BUF_FLUSH_LIST) @@ -1742,7 +1659,6 @@ buf_flush_batch( /******************************************************************//** Gather the aggregated stats for both flush list and LRU list flushing */ -static void buf_flush_common( /*=============*/ @@ -1767,7 +1683,6 @@ buf_flush_common( /******************************************************************//** Start a buffer flush batch for LRU or flush list */ -static ibool buf_flush_start( /*============*/ @@ -1796,7 +1711,6 @@ buf_flush_start( /******************************************************************//** End a buffer flush batch for LRU or flush list */ -static void buf_flush_end( /*==========*/ @@ -1852,40 +1766,6 @@ buf_flush_wait_batch_end( } /*******************************************************************//** -This utility flushes dirty blocks from the end of the LRU list and also -puts replaceable clean pages from the end of the LRU list to the free -list. -NOTE: The calling thread is not allowed to own any latches on pages! -@return true if a batch was queued successfully. false if another batch -of same type was already running. 
*/ -static -bool -buf_flush_LRU( -/*==========*/ - buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ - ulint min_n, /*!< in: wished minimum mumber of blocks - flushed (it is not guaranteed that the - actual number is that big, though) */ - flush_counters_t *n) /*!< out: flushed/evicted page - counts */ -{ - if (!buf_flush_start(buf_pool, BUF_FLUSH_LRU)) { - n->flushed = 0; - n->evicted = 0; - n->unzip_LRU_evicted = 0; - return(false); - } - - buf_flush_batch(buf_pool, BUF_FLUSH_LRU, min_n, 0, n); - - buf_flush_end(buf_pool, BUF_FLUSH_LRU); - - buf_flush_common(BUF_FLUSH_LRU, n->flushed); - - return(true); -} - -/*******************************************************************//** This utility flushes dirty blocks from the end of the flush list of all buffer pool instances. NOTE: The calling thread is not allowed to own any latches on pages! @@ -1912,6 +1792,10 @@ buf_flush_list( ulint i; bool success = true; + if (buf_mtflu_init_done()) { + return(buf_mtflu_flush_list(min_n, lsn_limit, n_processed)); + } + if (n_processed) { *n_processed = 0; } @@ -1927,8 +1811,8 @@ buf_flush_list( /* Flush to lsn_limit in all buffer pool instances */ for (i = 0; i < srv_buf_pool_instances; i++) { - buf_pool_t* buf_pool; - flush_counters_t n; + buf_pool_t* buf_pool; + flush_counters_t n; buf_pool = buf_pool_from_array(i); @@ -1972,12 +1856,12 @@ buf_flush_list( } /******************************************************************//** -This function picks up a single dirty page from the tail of the LRU -list, flushes it, removes it from page_hash and LRU list and puts -it on the free list. It is called from user threads when they are -unable to find a replaceable page at the tail of the LRU list i.e.: -when the background LRU flushing in the page_cleaner thread is not -fast enough to keep pace with the workload. 
+This function picks up a single page from the tail of the LRU +list, flushes it (if it is dirty), removes it from page_hash and LRU +list and puts it on the free list. It is called from user threads when +they are unable to find a replaceable page at the tail of the LRU +list i.e.: when the background LRU flushing in the page_cleaner thread +is not fast enough to keep pace with the workload. @return TRUE if success. */ UNIV_INTERN ibool @@ -1987,84 +1871,67 @@ buf_flush_single_page_from_LRU( { ulint scanned; buf_page_t* bpage; + ibool freed; buf_pool_mutex_enter(buf_pool); - for (bpage = UT_LIST_GET_LAST(buf_pool->LRU), scanned = 1; + for (bpage = buf_pool->single_scan_itr.start(), + scanned = 0, freed = FALSE; bpage != NULL; - bpage = UT_LIST_GET_PREV(LRU, bpage), ++scanned) { + ++scanned, bpage = buf_pool->single_scan_itr.get()) { - ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); + ut_ad(buf_pool_mutex_own(buf_pool)); - mutex_enter(block_mutex); - - if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_SINGLE_PAGE)) { - - /* The following call will release the buffer pool - and block mutex. */ + buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage); + buf_pool->single_scan_itr.set(prev); - ibool flushed = buf_flush_page( - buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE, true); + ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); + mutex_enter(block_mutex); - if (flushed) { - /* buf_flush_page() will release the - block mutex */ + if (buf_flush_ready_for_replace(bpage)) { + /* block is ready for eviction i.e., it is + clean and is not IO-fixed or buffer fixed. */ + mutex_exit(block_mutex); + if (buf_LRU_free_page(bpage, true)) { + buf_pool_mutex_exit(buf_pool); + freed = TRUE; + break; + } + } else if (buf_flush_ready_for_flush( + bpage, BUF_FLUSH_SINGLE_PAGE)) { + /* Block is ready for flush. Dispatch an IO + request. We'll put it on free list in IO + completion routine. The following call, if + successful, will release the buffer pool and + block mutex. 
*/ + freed = buf_flush_page(buf_pool, bpage, + BUF_FLUSH_SINGLE_PAGE, true); + if (freed) { + /* block and buffer pool mutex have + already been reelased. */ break; } + mutex_exit(block_mutex); + } else { + mutex_exit(block_mutex); } - - mutex_exit(block_mutex); } - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_LRU_SINGLE_FLUSH_SCANNED, - MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL, - MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL, - scanned); - - if (bpage == NULL) { + if (!freed) { /* Can't find a single flushable page. */ + ut_ad(!bpage); buf_pool_mutex_exit(buf_pool); - return(FALSE); } - - ibool freed = FALSE; - - /* At this point the page has been written to the disk. - As we are not holding buffer pool or block mutex therefore - we cannot use the bpage safely. It may have been plucked out - of the LRU list by some other thread or it may even have - relocated in case of a compressed page. We need to start - the scan of LRU list again to remove the block from the LRU - list and put it on the free list. 
*/ - buf_pool_mutex_enter(buf_pool); - - for (bpage = UT_LIST_GET_LAST(buf_pool->LRU); - bpage != NULL; - bpage = UT_LIST_GET_PREV(LRU, bpage)) { - - ib_mutex_t* block_mutex = buf_page_get_mutex(bpage); - - mutex_enter(block_mutex); - - ibool ready = buf_flush_ready_for_replace(bpage); - - mutex_exit(block_mutex); - - if (ready) { - bool evict_zip; - - evict_zip = !buf_LRU_evict_from_unzip_LRU(buf_pool);; - - freed = buf_LRU_free_page(bpage, evict_zip); - - break; - } + if (scanned) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_SINGLE_FLUSH_SCANNED, + MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL, + MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL, + scanned); } - buf_pool_mutex_exit(buf_pool); - + ut_ad(!buf_pool_mutex_own(buf_pool)); return(freed); } @@ -2082,10 +1949,16 @@ buf_flush_LRU_tail(void) { ulint total_flushed = 0; + if(buf_mtflu_init_done()) + { + return(buf_mtflu_flush_LRU_tail()); + } + for (ulint i = 0; i < srv_buf_pool_instances; i++) { buf_pool_t* buf_pool = buf_pool_from_array(i); ulint scan_depth; + flush_counters_t n; /* srv_LRU_scan_depth can be arbitrarily large value. We cap it with current LRU size. */ @@ -2095,44 +1968,37 @@ buf_flush_LRU_tail(void) scan_depth = ut_min(srv_LRU_scan_depth, scan_depth); - /* We divide LRU flush into smaller chunks because - there may be user threads waiting for the flush to - end in buf_LRU_get_free_block(). */ - for (ulint j = 0; - j < scan_depth; - j += PAGE_CLEANER_LRU_BATCH_CHUNK_SIZE) { - - flush_counters_t n; - - /* Currently page_cleaner is the only thread - that can trigger an LRU flush. It is possible - that a batch triggered during last iteration is - still running, */ - if (buf_flush_LRU(buf_pool, - PAGE_CLEANER_LRU_BATCH_CHUNK_SIZE, - &n)) { - - /* Allowed only one batch per - buffer pool instance. */ - buf_flush_wait_batch_end( - buf_pool, BUF_FLUSH_LRU); - } + /* Currently page_cleaner is the only thread + that can trigger an LRU flush. 
It is possible + that a batch triggered during last iteration is + still running, */ + if (!buf_flush_start(buf_pool, BUF_FLUSH_LRU)) { + continue; + } - if (n.flushed) { - total_flushed += n.flushed; - } else { - /* Nothing to flush */ - break; - } + buf_flush_batch(buf_pool, BUF_FLUSH_LRU, scan_depth, 0, &n); + + buf_flush_end(buf_pool, BUF_FLUSH_LRU); + + buf_flush_common(BUF_FLUSH_LRU, n.flushed); + + if (n.flushed) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE, + MONITOR_LRU_BATCH_FLUSH_COUNT, + MONITOR_LRU_BATCH_FLUSH_PAGES, + n.flushed); } - } - if (total_flushed) { - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_LRU_BATCH_TOTAL_PAGE, - MONITOR_LRU_BATCH_COUNT, - MONITOR_LRU_BATCH_PAGES, - total_flushed); + if (n.evicted) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE, + MONITOR_LRU_BATCH_EVICT_COUNT, + MONITOR_LRU_BATCH_EVICT_PAGES, + n.evicted); + } + + total_flushed += (n.flushed + n.evicted); } return(total_flushed); @@ -2390,14 +2256,19 @@ page_cleaner_sleep_if_needed( if (next_loop_time > cur_time) { /* Get sleep interval in micro seconds. We use - ut_min() to avoid long sleep in case of - wrap around. */ - os_thread_sleep(ut_min(1000000, - (next_loop_time - cur_time) - * 1000)); + ut_min() to avoid long sleep in case of wrap around. */ + ulint sleep_us; + + sleep_us = ut_min(1000000, (next_loop_time - cur_time) * 1000); + + ib_int64_t sig_count = os_event_reset(buf_flush_event); + + os_event_wait_time_low(buf_flush_event, sleep_us, sig_count); } } + + /******************************************************************//** page_cleaner thread tasked with flushing dirty pages from the buffer pools. As of now we'll have only one instance of this thread. 
@@ -2424,7 +2295,6 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( fprintf(stderr, "InnoDB: page_cleaner thread running, id %lu\n", os_thread_pf(os_thread_get_curr_id())); #endif /* UNIV_DEBUG_THREAD_CREATION */ - buf_page_cleaner_is_active = TRUE; while (srv_shutdown_state == SRV_SHUTDOWN_NONE) { @@ -2437,12 +2307,12 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( last_activity = srv_get_activity_count(); /* Flush pages from flush_list if required */ - page_cleaner_flush_pages_if_needed(); - n_flushed = 0; - } else { + n_flushed += page_cleaner_flush_pages_if_needed(); + + } else if (srv_idle_flush_pct) { n_flushed = page_cleaner_do_flush_batch( - PCT_IO(100), - LSN_MAX); + PCT_IO(100), + LSN_MAX); if (n_flushed) { MONITOR_INC_VALUE_CUMULATIVE( @@ -2454,10 +2324,11 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( } /* Flush pages from end of LRU if required */ - n_flushed = buf_flush_LRU_tail(); + buf_flush_LRU_tail(); } ut_ad(srv_shutdown_state > 0); + if (srv_fast_shutdown == 2) { /* In very fast shutdown we simulate a crash of buffer pool. We are not required to do any flushing */ @@ -2522,6 +2393,8 @@ DECLARE_THREAD(buf_flush_page_cleaner_thread)( thread_exit: buf_page_cleaner_is_active = FALSE; + os_event_free(buf_flush_event); + /* We count the number of threads in os_thread_exit(). A created thread should always use that to exit and not use return() to exit. 
*/ os_thread_exit(NULL); @@ -2623,9 +2496,11 @@ buf_flush_validate( return(ret); } + #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ #endif /* !UNIV_HOTBACKUP */ + #ifdef UNIV_DEBUG /******************************************************************//** Check if there are any dirty pages that belong to a space id in the flush diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc index 36eae54c17f..952f0fc3083 100644 --- a/storage/innobase/buf/buf0lru.cc +++ b/storage/innobase/buf/buf0lru.cc @@ -81,6 +81,10 @@ are not blocked for extended period of time when using very large buffer pools. */ #define BUF_LRU_DROP_SEARCH_SIZE 1024 +/** We scan these many blocks when looking for a clean page to evict +during LRU eviction. */ +#define BUF_LRU_SEARCH_SCAN_THRESHOLD 100 + /** If we switch on the InnoDB monitor because there are too few available frames in the buffer pool, we set this to TRUE */ static ibool buf_lru_switched_on_innodb_mon = FALSE; @@ -961,7 +965,7 @@ buf_LRU_free_from_unzip_LRU_list( } for (block = UT_LIST_GET_LAST(buf_pool->unzip_LRU), - scanned = 1, freed = FALSE; + scanned = 0, freed = FALSE; block != NULL && !freed && (scan_all || scanned < srv_LRU_scan_depth); ++scanned) { @@ -978,11 +982,13 @@ buf_LRU_free_from_unzip_LRU_list( block = prev_block; } - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_LRU_UNZIP_SEARCH_SCANNED, - MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL, - MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL, - scanned); + if (scanned) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_UNZIP_SEARCH_SCANNED, + MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL, + MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL, + scanned); + } return(freed); } @@ -1004,21 +1010,30 @@ buf_LRU_free_from_common_LRU_list( ut_ad(buf_pool_mutex_own(buf_pool)); - for (bpage = UT_LIST_GET_LAST(buf_pool->LRU), - scanned = 1, freed = FALSE; + for (bpage = buf_pool->lru_scan_itr.start(), + scanned = 0, freed = false; bpage != NULL && !freed - && (scan_all || scanned < 
srv_LRU_scan_depth); - ++scanned) { + && (scan_all || scanned < BUF_LRU_SEARCH_SCAN_THRESHOLD); + ++scanned, bpage = buf_pool->lru_scan_itr.get()) { - unsigned accessed; - buf_page_t* prev_bpage = UT_LIST_GET_PREV(LRU, - bpage); + buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage); + buf_pool->lru_scan_itr.set(prev); + + ib_mutex_t* mutex = buf_page_get_mutex(bpage); + mutex_enter(mutex); ut_ad(buf_page_in_file(bpage)); ut_ad(bpage->in_LRU_list); - accessed = buf_page_is_accessed(bpage); - freed = buf_LRU_free_page(bpage, true); + unsigned accessed = buf_page_is_accessed(bpage); + + if (buf_flush_ready_for_replace(bpage)) { + mutex_exit(mutex); + freed = buf_LRU_free_page(bpage, true); + } else { + mutex_exit(mutex); + } + if (freed && !accessed) { /* Keep track of pages that are evicted without ever being accessed. This gives us a measure of @@ -1026,14 +1041,17 @@ buf_LRU_free_from_common_LRU_list( ++buf_pool->stat.n_ra_pages_evicted; } - bpage = prev_bpage; + ut_ad(buf_pool_mutex_own(buf_pool)); + ut_ad(!mutex_own(mutex)); } - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_LRU_SEARCH_SCANNED, - MONITOR_LRU_SEARCH_SCANNED_NUM_CALL, - MONITOR_LRU_SEARCH_SCANNED_PER_CALL, - scanned); + if (scanned) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_SEARCH_SCANNED, + MONITOR_LRU_SEARCH_SCANNED_NUM_CALL, + MONITOR_LRU_SEARCH_SCANNED_PER_CALL, + scanned); + } return(freed); } @@ -1217,8 +1235,6 @@ the free list. Even when we flush a page or find a page in LRU scan we put it to free list to be used. * iteration 0: * get a block from free list, success:done - * if there is an LRU flush batch in progress: - * wait for batch to end: retry free list * if buf_pool->try_LRU_scan is set * scan LRU up to srv_LRU_scan_depth to find a clean block * the above will put the block on free list @@ -1231,7 +1247,7 @@ we put it to free list to be used. 
* scan whole LRU list * scan LRU list even if buf_pool->try_LRU_scan is not set * iteration > 1: - * same as iteration 1 but sleep 100ms + * same as iteration 1 but sleep 10ms @return the free control block, in state BUF_BLOCK_READY_FOR_USE */ UNIV_INTERN buf_block_t* @@ -1269,20 +1285,6 @@ loop: return(block); } - if (buf_pool->init_flush[BUF_FLUSH_LRU] - && srv_use_doublewrite_buf - && buf_dblwr != NULL) { - - /* If there is an LRU flush happening in the background - then we wait for it to end instead of trying a single - page flush. If, however, we are not using doublewrite - buffer then it is better to do our own single page - flush instead of waiting for LRU flush to end. */ - buf_pool_mutex_exit(buf_pool); - buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU); - goto loop; - } - freed = FALSE; if (buf_pool->try_LRU_scan || n_iterations > 0) { /* If no block was in the free list, search from the @@ -1299,6 +1301,10 @@ loop: TRUE again when we flush a batch from this buffer pool. */ buf_pool->try_LRU_scan = FALSE; + + /* Also tell the page_cleaner thread that + there is work for it to do. */ + os_event_set(buf_flush_event); } } @@ -1347,12 +1353,10 @@ loop: /* If we have scanned the whole LRU and still are unable to find a free block then we should sleep here to let the - page_cleaner do an LRU batch for us. - TODO: It'd be better if we can signal the page_cleaner. Perhaps - we should use timed wait for page_cleaner. */ - if (n_iterations > 1) { + page_cleaner do an LRU batch for us. */ - os_thread_sleep(100000); + if (n_iterations > 1) { + os_thread_sleep(10000); } /* No free block was found: try to flush the LRU list. @@ -1503,6 +1507,20 @@ buf_unzip_LRU_remove_block_if_needed( } /******************************************************************//** +Adjust LRU hazard pointers if needed. 
*/ + +void +buf_LRU_adjust_hp( +/*==============*/ + buf_pool_t* buf_pool,/*!< in: buffer pool instance */ + const buf_page_t* bpage) /*!< in: control block */ +{ + buf_pool->lru_hp.adjust(bpage); + buf_pool->lru_scan_itr.adjust(bpage); + buf_pool->single_scan_itr.adjust(bpage); +} + +/******************************************************************//** Removes a block from the LRU list. */ UNIV_INLINE void @@ -1521,6 +1539,10 @@ buf_LRU_remove_block( ut_ad(bpage->in_LRU_list); + /* Important that we adjust the hazard pointers before removing + bpage from the LRU list. */ + buf_LRU_adjust_hp(buf_pool, bpage); + /* If the LRU_old pointer is defined and points to just this block, move it backward one step */ diff --git a/storage/innobase/buf/buf0mtflu.cc b/storage/innobase/buf/buf0mtflu.cc new file mode 100644 index 00000000000..c14f9048ae5 --- /dev/null +++ b/storage/innobase/buf/buf0mtflu.cc @@ -0,0 +1,746 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014, Fusion-io. All Rights Reserved. +Copyright (C) 2013, 2014, SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file buf/buf0mtflu.cc +Multi-threaded flush method implementation + +Created 06/11/2013 Dhananjoy Das DDas@fusionio.com +Modified 12/12/2013 Jan Lindström jan.lindstrom@skysql.com +Modified 03/02/2014 Dhananjoy Das DDas@fusionio.com +Modified 06/02/2014 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#include "buf0buf.h" +#include "buf0flu.h" +#include "buf0mtflu.h" +#include "buf0checksum.h" +#include "srv0start.h" +#include "srv0srv.h" +#include "page0zip.h" +#include "ut0byte.h" +#include "ut0lst.h" +#include "page0page.h" +#include "fil0fil.h" +#include "buf0lru.h" +#include "buf0rea.h" +#include "ibuf0ibuf.h" +#include "log0log.h" +#include "os0file.h" +#include "os0sync.h" +#include "trx0sys.h" +#include "srv0mon.h" +#include "mysql/plugin.h" +#include "mysql/service_thd_wait.h" +#include "fil0pagecompress.h" + +#define MT_COMP_WATER_MARK 50 +/** Time to wait for a message. 
*/ +#define MT_WAIT_IN_USECS 5000000 + +/* Work item status */ +typedef enum wrk_status { + WRK_ITEM_UNSET=0, /*!< Work item is not set */ + WRK_ITEM_START=1, /*!< Processing of work item has started */ + WRK_ITEM_DONE=2, /*!< Processing is done usually set to + SUCCESS/FAILED */ + WRK_ITEM_SUCCESS=2, /*!< Work item successfully processed */ + WRK_ITEM_FAILED=3, /*!< Work item process failed */ + WRK_ITEM_EXIT=4, /*!< Exiting */ + WRK_ITEM_SET=5, /*!< Work item is set */ + WRK_ITEM_STATUS_UNDEFINED +} wrk_status_t; + +/* Work item task type */ +typedef enum mt_wrk_tsk { + MT_WRK_NONE=0, /*!< Exit queue-wait */ + MT_WRK_WRITE=1, /*!< Flush operation */ + MT_WRK_READ=2, /*!< Read operation */ + MT_WRK_UNDEFINED +} mt_wrk_tsk_t; + +/* Work thread status */ +typedef enum wthr_status { + WTHR_NOT_INIT=0, /*!< Work thread not initialized */ + WTHR_INITIALIZED=1, /*!< Work thread initialized */ + WTHR_SIG_WAITING=2, /*!< Work thread wating signal */ + WTHR_RUNNING=3, /*!< Work thread running */ + WTHR_NO_WORK=4, /*!< Work thread has no work */ + WTHR_KILL_IT=5, /*!< Work thread should exit */ + WTHR_STATUS_UNDEFINED +} wthr_status_t; + +/* Write work task */ +typedef struct wr_tsk { + buf_pool_t *buf_pool; /*!< buffer-pool instance */ + buf_flush_t flush_type; /*!< flush-type for buffer-pool + flush operation */ + ulint min; /*!< minimum number of pages + requested to be flushed */ + lsn_t lsn_limit; /*!< lsn limit for the buffer-pool + flush operation */ +} wr_tsk_t; + +/* Read work task */ +typedef struct rd_tsk { + buf_pool_t *page_pool; /*!< list of pages to decompress; */ +} rd_tsk_t; + +/* Work item */ +typedef struct wrk_itm +{ + mt_wrk_tsk_t tsk; /*!< Task type. 
Based on task-type + one of the entries wr_tsk/rd_tsk + will be used */ + wr_tsk_t wr; /*!< Flush page list */ + rd_tsk_t rd; /*!< Decompress page list */ + ulint n_flushed; /*!< Number of flushed pages */ + ulint n_evicted; /*!< Number of evicted pages */ + os_thread_id_t id_usr; /*!< Thread-id currently working */ + wrk_status_t wi_status; /*!< Work item status */ + mem_heap_t *wheap; /*!< Heap were to allocate memory + for queue nodes */ + mem_heap_t *rheap; +} wrk_t; + +typedef struct thread_data +{ + os_thread_id_t wthread_id; /*!< Identifier */ + os_thread_t wthread; /*!< Thread id */ + wthr_status_t wt_status; /*!< Worker thread status */ +} thread_data_t; + +/* Thread syncronization data */ +typedef struct thread_sync +{ + /* Global variables used by all threads */ + os_fast_mutex_t thread_global_mtx; /*!< Mutex used protecting below + variables */ + ulint n_threads; /*!< Number of threads */ + ib_wqueue_t *wq; /*!< Work Queue */ + ib_wqueue_t *wr_cq; /*!< Write Completion Queue */ + ib_wqueue_t *rd_cq; /*!< Read Completion Queue */ + mem_heap_t* wheap; /*!< Work heap where memory + is allocated */ + mem_heap_t* rheap; /*!< Work heap where memory + is allocated */ + wthr_status_t gwt_status; /*!< Global thread status */ + + /* Variables used by only one thread at a time */ + thread_data_t* thread_data; /*!< Thread specific data */ + +} thread_sync_t; + +static int mtflush_work_initialized = -1; +static thread_sync_t* mtflush_ctx=NULL; +static os_fast_mutex_t mtflush_mtx; + +/******************************************************************//** +Set multi-threaded flush work initialized. 
*/ +static inline +void +buf_mtflu_work_init(void) +/*=====================*/ +{ + mtflush_work_initialized = 1; +} + +/******************************************************************//** +Return true if multi-threaded flush is initialized +@return true if initialized */ +bool +buf_mtflu_init_done(void) +/*=====================*/ +{ + return(mtflush_work_initialized == 1); +} + +/******************************************************************//** +Fush buffer pool instance. +@return number of flushed pages, or 0 if error happened +*/ +static +ulint +buf_mtflu_flush_pool_instance( +/*==========================*/ + wrk_t *work_item) /*!< inout: work item to be flushed */ +{ + flush_counters_t n; + ut_a(work_item != NULL); + ut_a(work_item->wr.buf_pool != NULL); + + if (!buf_flush_start(work_item->wr.buf_pool, work_item->wr.flush_type)) { + /* We have two choices here. If lsn_limit was + specified then skipping an instance of buffer + pool means we cannot guarantee that all pages + up to lsn_limit has been flushed. We can + return right now with failure or we can try + to flush remaining buffer pools up to the + lsn_limit. We attempt to flush other buffer + pools based on the assumption that it will + help in the retry which will follow the + failure. */ +#ifdef UNIV_MTFLUSH_DEBUG + fprintf(stderr, "InnoDB: Note: buf flush start failed there is already active flush for this buffer pool.\n"); +#endif + return 0; + } + + + if (work_item->wr.flush_type == BUF_FLUSH_LRU) { + /* srv_LRU_scan_depth can be arbitrarily large value. + * We cap it with current LRU size. 
+ */ + buf_pool_mutex_enter(work_item->wr.buf_pool); + work_item->wr.min = UT_LIST_GET_LEN(work_item->wr.buf_pool->LRU); + buf_pool_mutex_exit(work_item->wr.buf_pool); + work_item->wr.min = ut_min(srv_LRU_scan_depth,work_item->wr.min); + } + + buf_flush_batch(work_item->wr.buf_pool, + work_item->wr.flush_type, + work_item->wr.min, + work_item->wr.lsn_limit, + &n); + + buf_flush_end(work_item->wr.buf_pool, work_item->wr.flush_type); + buf_flush_common(work_item->wr.flush_type, n.flushed); + work_item->n_flushed = n.flushed; + work_item->n_evicted = n.evicted; + + return work_item->n_flushed; +} + +/******************************************************************//** +Worker function to wait for work items and processing them and +sending reply back. +*/ +static +void +mtflush_service_io( +/*===============*/ + thread_sync_t* mtflush_io, /*!< inout: multi-threaded flush + syncronization data */ + thread_data_t* thread_data) /* Thread status data */ +{ + wrk_t *work_item = NULL; + ulint n_flushed=0; + + ut_a(mtflush_io != NULL); + ut_a(thread_data != NULL); + + thread_data->wt_status = WTHR_SIG_WAITING; + + work_item = (wrk_t *)ib_wqueue_nowait(mtflush_io->wq); + + if (work_item == NULL) { + work_item = (wrk_t *)ib_wqueue_wait(mtflush_io->wq); + } + + if (work_item) { + thread_data->wt_status = WTHR_RUNNING; + } else { + /* Thread did not get any work */ + thread_data->wt_status = WTHR_NO_WORK; + return; + } + + if (work_item->wi_status != WRK_ITEM_EXIT) { + work_item->wi_status = WRK_ITEM_SET; + } + +#ifdef UNIV_MTFLUSH_DEBUG + ut_a(work_item->id_usr == 0); +#endif + work_item->id_usr = os_thread_get_curr_id(); + + /* This works as a producer/consumer model, where in tasks are + * inserted into the work-queue (wq) and completions are based + * on the type of operations performed and as a result the WRITE/ + * compression/flush operation completions get posted to wr_cq. + * And READ/decompress operations completions get posted to rd_cq. 
+ * in future we may have others. + */ + + switch(work_item->tsk) { + case MT_WRK_NONE: + ut_a(work_item->wi_status == WRK_ITEM_EXIT); + work_item->wi_status = WRK_ITEM_EXIT; + ib_wqueue_add(mtflush_io->wr_cq, work_item, work_item->rheap); + thread_data->wt_status = WTHR_KILL_IT; + break; + + case MT_WRK_WRITE: + ut_a(work_item->wi_status == WRK_ITEM_SET); + work_item->wi_status = WRK_ITEM_START; + /* Process work item */ + if (0 == (n_flushed = buf_mtflu_flush_pool_instance(work_item))) { + work_item->wi_status = WRK_ITEM_FAILED; + } + work_item->wi_status = WRK_ITEM_SUCCESS; + ib_wqueue_add(mtflush_io->wr_cq, work_item, work_item->rheap); + break; + + case MT_WRK_READ: + ut_a(0); + break; + + default: + /* None other than Write/Read handling planned */ + ut_a(0); + break; + } +} + +/******************************************************************//** +Thead used to flush dirty pages when multi-threaded flush is +used. +@return a dummy parameter*/ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(mtflush_io_thread)( +/*==============================*/ + void * arg) +{ + thread_sync_t *mtflush_io = ((thread_sync_t *)arg); + thread_data_t *this_thread_data = NULL; + ulint i; + + /* Find correct slot for this thread */ + os_fast_mutex_lock(&(mtflush_io->thread_global_mtx)); + for(i=0; i < mtflush_io->n_threads; i ++) { + if (mtflush_io->thread_data[i].wthread_id == os_thread_get_curr_id()) { + break; + } + } + + ut_a(i <= mtflush_io->n_threads); + this_thread_data = &mtflush_io->thread_data[i]; + os_fast_mutex_unlock(&(mtflush_io->thread_global_mtx)); + + while (TRUE) { + +#ifdef UNIV_MTFLUSH_DEBUG + fprintf(stderr, "InnoDB: Note. 
Thread %lu work queue len %lu return queue len %lu\n", + os_thread_get_curr_id(), + ib_wqueue_len(mtflush_io->wq), + ib_wqueue_len(mtflush_io->wr_cq)); +#endif /* UNIV_MTFLUSH_DEBUG */ + + mtflush_service_io(mtflush_io, this_thread_data); + + + if (this_thread_data->wt_status == WTHR_KILL_IT) { + break; + } + } + + os_thread_exit(NULL); + OS_THREAD_DUMMY_RETURN; +} + +/******************************************************************//** +Add exit work item to work queue to signal multi-threded flush +threads that they should exit. +*/ +void +buf_mtflu_io_thread_exit(void) +/*==========================*/ +{ + ulint i; + thread_sync_t* mtflush_io = mtflush_ctx; + wrk_t* work_item = NULL; + + ut_a(mtflush_io != NULL); + + /* Allocate work items for shutdown message */ + work_item = (wrk_t*)mem_heap_alloc(mtflush_io->wheap, sizeof(wrk_t)*srv_mtflush_threads); + + /* Confirm if the io-thread KILL is in progress, bailout */ + if (mtflush_io->gwt_status == WTHR_KILL_IT) { + return; + } + + mtflush_io->gwt_status = WTHR_KILL_IT; + + fprintf(stderr, "InnoDB: [Note]: Signal mtflush_io_threads to exit [%lu]\n", + srv_mtflush_threads); + + /* This lock is to safequard against timing bug: flush request take + this mutex before sending work items to be processed by flush + threads. Inside flush thread we assume that work queue contains only + a constant number of items. Thus, we may not install new work items + below before all previous ones are processed. This mutex is released + by flush request after all work items sent to flush threads have + been processed. Thus, we can get this mutex if and only if work + queue is empty. 
*/ + + os_fast_mutex_lock(&mtflush_mtx); + + /* Make sure the work queue is empty */ + ut_a(ib_wqueue_is_empty(mtflush_io->wq)); + + /* Send one exit work item/thread */ + for (i=0; i < (ulint)srv_mtflush_threads; i++) { + work_item[i].tsk = MT_WRK_NONE; + work_item[i].wi_status = WRK_ITEM_EXIT; + work_item[i].wheap = mtflush_io->wheap; + work_item[i].rheap = mtflush_io->rheap; + work_item[i].id_usr = 0; + + ib_wqueue_add(mtflush_io->wq, + (void *)&(work_item[i]), + mtflush_io->wheap); + } + + /* Wait until all work items on a work queue are processed */ + while(!ib_wqueue_is_empty(mtflush_io->wq)) { + /* Wait */ + os_thread_sleep(MT_WAIT_IN_USECS); + } + + ut_a(ib_wqueue_is_empty(mtflush_io->wq)); + + /* Collect all work done items */ + for (i=0; i < (ulint)srv_mtflush_threads;) { + wrk_t* work_item = NULL; + + work_item = (wrk_t *)ib_wqueue_timedwait(mtflush_io->wr_cq, MT_WAIT_IN_USECS); + + /* If we receive reply to work item and it's status is exit, + thead has processed this message and existed */ + if (work_item && work_item->wi_status == WRK_ITEM_EXIT) { + i++; + } + } + + /* Wait about 1/2 sec to allow threads really exit */ + os_thread_sleep(MT_WAIT_IN_USECS); + + while(!ib_wqueue_is_empty(mtflush_io->wq)) + { + ib_wqueue_nowait(mtflush_io->wq); + } + + ut_a(ib_wqueue_is_empty(mtflush_io->wq)); + ut_a(ib_wqueue_is_empty(mtflush_io->wr_cq)); + ut_a(ib_wqueue_is_empty(mtflush_io->rd_cq)); + + /* Free all queues */ + ib_wqueue_free(mtflush_io->wq); + ib_wqueue_free(mtflush_io->wr_cq); + ib_wqueue_free(mtflush_io->rd_cq); + + /* Requests sent */ + os_fast_mutex_unlock(&mtflush_mtx); + os_fast_mutex_free(&mtflush_mtx); + os_fast_mutex_free(&mtflush_io->thread_global_mtx); + + /* Free heap */ + mem_heap_free(mtflush_io->wheap); + mem_heap_free(mtflush_io->rheap); +} + +/******************************************************************//** +Initialize multi-threaded flush thread syncronization data. 
+@return Initialized multi-threaded flush thread syncroniztion data. */ +void* +buf_mtflu_handler_init( +/*===================*/ + ulint n_threads, /*!< in: Number of threads to create */ + ulint wrk_cnt) /*!< in: Number of work items */ +{ + ulint i; + mem_heap_t* mtflush_heap; + mem_heap_t* mtflush_heap2; + + /* Create heap, work queue, write completion queue, read + completion queue for multi-threaded flush, and init + handler. */ + mtflush_heap = mem_heap_create(0); + ut_a(mtflush_heap != NULL); + mtflush_heap2 = mem_heap_create(0); + ut_a(mtflush_heap2 != NULL); + + mtflush_ctx = (thread_sync_t *)mem_heap_alloc(mtflush_heap, + sizeof(thread_sync_t)); + memset(mtflush_ctx, 0, sizeof(thread_sync_t)); + ut_a(mtflush_ctx != NULL); + mtflush_ctx->thread_data = (thread_data_t*)mem_heap_alloc( + mtflush_heap, sizeof(thread_data_t) * n_threads); + ut_a(mtflush_ctx->thread_data); + memset(mtflush_ctx->thread_data, 0, sizeof(thread_data_t) * n_threads); + + mtflush_ctx->n_threads = n_threads; + mtflush_ctx->wq = ib_wqueue_create(); + ut_a(mtflush_ctx->wq); + mtflush_ctx->wr_cq = ib_wqueue_create(); + ut_a(mtflush_ctx->wr_cq); + mtflush_ctx->rd_cq = ib_wqueue_create(); + ut_a(mtflush_ctx->rd_cq); + mtflush_ctx->wheap = mtflush_heap; + mtflush_ctx->rheap = mtflush_heap2; + + os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mtflush_ctx->thread_global_mtx); + os_fast_mutex_init(PFS_NOT_INSTRUMENTED, &mtflush_mtx); + + /* Create threads for page-compression-flush */ + for(i=0; i < n_threads; i++) { + os_thread_id_t new_thread_id; + + mtflush_ctx->thread_data[i].wt_status = WTHR_INITIALIZED; + + mtflush_ctx->thread_data[i].wthread = os_thread_create( + mtflush_io_thread, + ((void *) mtflush_ctx), + &new_thread_id); + + mtflush_ctx->thread_data[i].wthread_id = new_thread_id; + } + + buf_mtflu_work_init(); + + return((void *)mtflush_ctx); +} + +/******************************************************************//** +Flush buffer pool instances. +@return number of pages flushed. 
*/ +ulint +buf_mtflu_flush_work_items( +/*=======================*/ + ulint buf_pool_inst, /*!< in: Number of buffer pool instances */ + flush_counters_t *per_pool_cnt, /*!< out: Number of pages + flushed or evicted /instance */ + buf_flush_t flush_type, /*!< in: Type of flush */ + ulint min_n, /*!< in: Wished minimum number of + blocks to be flushed */ + lsn_t lsn_limit) /*!< in: All blocks whose + oldest_modification is smaller than + this should be flushed (if their + number does not exceed min_n) */ +{ + ulint n_flushed=0, i; + mem_heap_t* work_heap; + mem_heap_t* reply_heap; + wrk_t work_item[MTFLUSH_MAX_WORKER]; + + /* Allocate heap where all work items used and queue + node items areallocated */ + work_heap = mem_heap_create(0); + reply_heap = mem_heap_create(0); + + + for(i=0;i<buf_pool_inst; i++) { + work_item[i].tsk = MT_WRK_WRITE; + work_item[i].wr.buf_pool = buf_pool_from_array(i); + work_item[i].wr.flush_type = flush_type; + work_item[i].wr.min = min_n; + work_item[i].wr.lsn_limit = lsn_limit; + work_item[i].wi_status = WRK_ITEM_UNSET; + work_item[i].wheap = work_heap; + work_item[i].rheap = reply_heap; + work_item[i].n_flushed = 0; + work_item[i].n_evicted = 0; + work_item[i].id_usr = 0; + + ib_wqueue_add(mtflush_ctx->wq, + (void *)(work_item + i), + work_heap); + } + + /* wait on the completion to arrive */ + for(i=0; i< buf_pool_inst;) { + wrk_t *done_wi = NULL; + done_wi = (wrk_t *)ib_wqueue_wait(mtflush_ctx->wr_cq); + + if (done_wi != NULL) { + per_pool_cnt[i].flushed = done_wi->n_flushed; + per_pool_cnt[i].evicted = done_wi->n_evicted; + +#ifdef UNIV_MTFLUSH_DEBUG + if((int)done_wi->id_usr == 0 && + (done_wi->wi_status == WRK_ITEM_SET || + done_wi->wi_status == WRK_ITEM_UNSET)) { + fprintf(stderr, + "**Set/Unused work_item[%lu] flush_type=%d\n", + i, + done_wi->wr.flush_type); + ut_a(0); + } +#endif + + n_flushed+= done_wi->n_flushed+done_wi->n_evicted; + i++; + } + } + + /* Release used work_items and queue nodes */ + mem_heap_free(work_heap); + 
mem_heap_free(reply_heap); + + return(n_flushed); +} + +/*******************************************************************//** +Multi-threaded version of buf_flush_list +*/ +bool +buf_mtflu_flush_list( +/*=================*/ + ulint min_n, /*!< in: wished minimum mumber of blocks + flushed (it is not guaranteed that the + actual number is that big, though) */ + lsn_t lsn_limit, /*!< in the case BUF_FLUSH_LIST all + blocks whose oldest_modification is + smaller than this should be flushed + (if their number does not exceed + min_n), otherwise ignored */ + ulint* n_processed) /*!< out: the number of pages + which were processed is passed + back to caller. Ignored if NULL */ + +{ + ulint i; + bool success = true; + flush_counters_t cnt[MTFLUSH_MAX_WORKER]; + + if (n_processed) { + *n_processed = 0; + } + + if (min_n != ULINT_MAX) { + /* Ensure that flushing is spread evenly amongst the + buffer pool instances. When min_n is ULINT_MAX + we need to flush everything up to the lsn limit + so no limit here. */ + min_n = (min_n + srv_buf_pool_instances - 1) + / srv_buf_pool_instances; + } + + /* This lock is to safequard against re-entry if any. 
*/ + os_fast_mutex_lock(&mtflush_mtx); + buf_mtflu_flush_work_items(srv_buf_pool_instances, + cnt, BUF_FLUSH_LIST, + min_n, lsn_limit); + os_fast_mutex_unlock(&mtflush_mtx); + + for (i = 0; i < srv_buf_pool_instances; i++) { + if (n_processed) { + *n_processed += cnt[i].flushed+cnt[i].evicted; + } + + if (cnt[i].flushed) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_FLUSH_BATCH_TOTAL_PAGE, + MONITOR_FLUSH_BATCH_COUNT, + MONITOR_FLUSH_BATCH_PAGES, + cnt[i].flushed); + } + + if(cnt[i].evicted) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE, + MONITOR_LRU_BATCH_EVICT_COUNT, + MONITOR_LRU_BATCH_EVICT_PAGES, + cnt[i].evicted); + } + } +#ifdef UNIV_MTFLUSH_DEBUG + fprintf(stderr, "%s: [1] [*n_processed: (min:%lu)%lu ]\n", + __FUNCTION__, (min_n * srv_buf_pool_instances), *n_processed); +#endif + return(success); +} + +/*********************************************************************//** +Clears up tail of the LRU lists: +* Put replaceable pages at the tail of LRU to the free list +* Flush dirty pages at the tail of LRU to the disk +The depth to which we scan each buffer pool is controlled by dynamic +config parameter innodb_LRU_scan_depth. 
+@return total pages flushed */ +UNIV_INTERN +ulint +buf_mtflu_flush_LRU_tail(void) +/*==========================*/ +{ + ulint total_flushed=0, i; + flush_counters_t cnt[MTFLUSH_MAX_WORKER]; + + ut_a(buf_mtflu_init_done()); + + /* At shutdown do not send requests anymore */ + if (!mtflush_ctx || mtflush_ctx->gwt_status == WTHR_KILL_IT) { + return (total_flushed); + } + + /* This lock is to safeguard against re-entry if any */ + os_fast_mutex_lock(&mtflush_mtx); + buf_mtflu_flush_work_items(srv_buf_pool_instances, + cnt, BUF_FLUSH_LRU, srv_LRU_scan_depth, 0); + os_fast_mutex_unlock(&mtflush_mtx); + + for (i = 0; i < srv_buf_pool_instances; i++) { + total_flushed += cnt[i].flushed+cnt[i].evicted; + + if (cnt[i].flushed) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE, + MONITOR_LRU_BATCH_FLUSH_COUNT, + MONITOR_LRU_BATCH_FLUSH_PAGES, + cnt[i].flushed); + } + + if(cnt[i].evicted) { + MONITOR_INC_VALUE_CUMULATIVE( + MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE, + MONITOR_LRU_BATCH_EVICT_COUNT, + MONITOR_LRU_BATCH_EVICT_PAGES, + cnt[i].evicted); + } + } + +#if UNIV_MTFLUSH_DEBUG + fprintf(stderr, "[1] [*n_processed: (min:%lu)%lu ]\n", ( + srv_LRU_scan_depth * srv_buf_pool_instances), total_flushed); +#endif + + return(total_flushed); +} + +/*********************************************************************//** +Set correct thread identifiers to io thread array based on +information we have. 
*/ +void +buf_mtflu_set_thread_ids( +/*=====================*/ + ulint n_threads, /*!<in: Number of threads to fill */ + void* ctx, /*!<in: thread context */ + os_thread_id_t* thread_ids) /*!<in: thread id array */ +{ + thread_sync_t *mtflush_io = ((thread_sync_t *)ctx); + ulint i; + ut_a(mtflush_io != NULL); + ut_a(thread_ids != NULL); + + for(i = 0; i < n_threads; i++) { + thread_ids[i] = mtflush_io->thread_data[i].wthread_id; + } +} diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc index 7c8369c0c09..9e81d010d0f 100644 --- a/storage/innobase/buf/buf0rea.cc +++ b/storage/innobase/buf/buf0rea.cc @@ -184,14 +184,15 @@ buf_read_page_low( *err = fil_io(OS_FILE_READ | wake_later | ignore_nonexistent_pages, sync, space, zip_size, offset, 0, zip_size, - bpage->zip.data, bpage); + bpage->zip.data, bpage, &bpage->write_size); } else { ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); *err = fil_io(OS_FILE_READ | wake_later | ignore_nonexistent_pages, sync, space, 0, offset, 0, UNIV_PAGE_SIZE, - ((buf_block_t*) bpage)->frame, bpage); + ((buf_block_t*) bpage)->frame, bpage, + &bpage->write_size); } if (sync) { diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc index 947476e0e02..b13f68a08a7 100644 --- a/storage/innobase/dict/dict0dict.cc +++ b/storage/innobase/dict/dict0dict.cc @@ -2,6 +2,7 @@ Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -498,7 +499,7 @@ dict_table_try_drop_aborted( if (table == NULL) { table = dict_table_open_on_id_low( - table_id, DICT_ERR_IGNORE_NONE); + table_id, DICT_ERR_IGNORE_NONE, FALSE); } else { ut_ad(table->id == table_id); } @@ -747,17 +748,24 @@ dict_index_get_nth_col_or_prefix_pos( /*=================================*/ const dict_index_t* index, /*!< in: index */ ulint n, /*!< in: column number */ - ibool inc_prefix) /*!< in: TRUE=consider + ibool inc_prefix, /*!< in: TRUE=consider column prefixes too */ + ulint* prefix_col_pos) /*!< out: col num if prefix */ { const dict_field_t* field; const dict_col_t* col; ulint pos; ulint n_fields; + ulint prefixed_pos_dummy; ut_ad(index); ut_ad(index->magic_n == DICT_INDEX_MAGIC_N); + if (!prefix_col_pos) { + prefix_col_pos = &prefixed_pos_dummy; + } + *prefix_col_pos = ULINT_UNDEFINED; + col = dict_table_get_nth_col(index->table, n); if (dict_index_is_clust(index)) { @@ -770,10 +778,11 @@ dict_index_get_nth_col_or_prefix_pos( for (pos = 0; pos < n_fields; pos++) { field = dict_index_get_nth_field(index, pos); - if (col == field->col - && (inc_prefix || field->prefix_len == 0)) { - - return(pos); + if (col == field->col) { + *prefix_col_pos = pos; + if (inc_prefix || field->prefix_len == 0) { + return(pos); + } } } @@ -885,7 +894,8 @@ dict_table_open_on_id( table_id, table_op == DICT_TABLE_OP_LOAD_TABLESPACE ? 
DICT_ERR_IGNORE_RECOVER_LOCK - : DICT_ERR_IGNORE_NONE); + : DICT_ERR_IGNORE_NONE, + table_op == DICT_TABLE_OP_OPEN_ONLY_IF_CACHED); if (table != NULL) { @@ -917,7 +927,7 @@ dict_table_get_nth_col_pos( ulint n) /*!< in: column number */ { return(dict_index_get_nth_col_pos(dict_table_get_first_index(table), - n)); + n, NULL)); } /********************************************************************//** @@ -1403,7 +1413,7 @@ dict_table_move_from_non_lru_to_lru( /**********************************************************************//** Looks for an index with the given id given a table instance. @return index or NULL */ -static +UNIV_INTERN dict_index_t* dict_table_find_index_on_id( /*========================*/ @@ -2525,6 +2535,13 @@ undo_size_ok: new_index->stat_index_size = 1; new_index->stat_n_leaf_pages = 1; + new_index->stat_defrag_n_pages_freed = 0; + new_index->stat_defrag_n_page_split = 0; + + new_index->stat_defrag_sample_next_slot = 0; + memset(&new_index->stat_defrag_data_size_sample, + 0x0, sizeof(ulint) * STAT_DEFRAG_DATA_SIZE_N_SAMPLE); + /* Add the new index as the last index for the table */ UT_LIST_ADD_LAST(indexes, table->indexes, new_index); @@ -3334,7 +3351,29 @@ dict_foreign_find_index( return(NULL); } - +#ifdef WITH_WSREP +dict_index_t* +wsrep_dict_foreign_find_index( +/*====================*/ + dict_table_t* table, /*!< in: table */ + const char** col_names, /*!< in: column names, or NULL + to use table->col_names */ + const char** columns,/*!< in: array of column names */ + ulint n_cols, /*!< in: number of columns */ + dict_index_t* types_idx, /*!< in: NULL or an index to whose types the + column types must match */ + ibool check_charsets, + /*!< in: whether to check charsets. 
+ only has an effect if types_idx != NULL */ + ulint check_null) + /*!< in: nonzero if none of the columns must + be declared NOT NULL */ +{ + return dict_foreign_find_index( + table, col_names, columns, n_cols, types_idx, check_charsets, + check_null); +} +#endif /* WITH_WSREP */ /**********************************************************************//** Report an error in a foreign key definition. */ static diff --git a/storage/innobase/dict/dict0stats.cc b/storage/innobase/dict/dict0stats.cc index 1eac9e0df51..001623a49bc 100644 --- a/storage/innobase/dict/dict0stats.cc +++ b/storage/innobase/dict/dict0stats.cc @@ -194,7 +194,7 @@ dict_stats_persistent_storage_check( {"table_name", DATA_VARMYSQL, DATA_NOT_NULL, 192}, - {"last_update", DATA_INT, + {"last_update", DATA_FIXBINARY, DATA_NOT_NULL, 4}, {"n_rows", DATA_INT, @@ -225,7 +225,7 @@ dict_stats_persistent_storage_check( {"index_name", DATA_VARMYSQL, DATA_NOT_NULL, 192}, - {"last_update", DATA_INT, + {"last_update", DATA_FIXBINARY, DATA_NOT_NULL, 4}, {"stat_name", DATA_VARMYSQL, @@ -496,6 +496,9 @@ dict_stats_table_clone_create( heap, idx->n_uniq * sizeof(idx->stat_n_non_null_key_vals[0])); ut_d(idx->magic_n = DICT_INDEX_MAGIC_N); + + idx->stat_defrag_n_page_split = 0; + idx->stat_defrag_n_pages_freed = 0; } ut_d(t->magic_n = DICT_TABLE_MAGIC_N); @@ -525,7 +528,9 @@ static void dict_stats_empty_index( /*===================*/ - dict_index_t* index) /*!< in/out: index */ + dict_index_t* index, /*!< in/out: index */ + bool empty_defrag_stats) + /*!< in: whether to empty defrag stats */ { ut_ad(!(index->type & DICT_FTS)); ut_ad(!dict_index_is_univ(index)); @@ -540,6 +545,34 @@ dict_stats_empty_index( index->stat_index_size = 1; index->stat_n_leaf_pages = 1; + + if (empty_defrag_stats) { + dict_stats_empty_defrag_stats(index); + dict_stats_empty_defrag_summary(index); + } +} + +/**********************************************************************//** +Clear defragmentation summary. 
*/ +UNIV_INTERN +void +dict_stats_empty_defrag_summary( +/*==================*/ + dict_index_t* index) /*!< in: index to clear defragmentation stats */ +{ + index->stat_defrag_n_pages_freed = 0; +} + +/**********************************************************************//** +Clear defragmentation related index stats. */ +UNIV_INTERN +void +dict_stats_empty_defrag_stats( +/*==================*/ + dict_index_t* index) /*!< in: index to clear defragmentation stats */ +{ + index->stat_defrag_modified_counter = 0; + index->stat_defrag_n_page_split = 0; } /*********************************************************************//** @@ -549,7 +582,9 @@ static void dict_stats_empty_table( /*===================*/ - dict_table_t* table) /*!< in/out: table */ + dict_table_t* table, /*!< in/out: table */ + bool empty_defrag_stats) + /*!< in: whether to empty defrag stats */ { /* Zero the stats members */ @@ -574,7 +609,7 @@ dict_stats_empty_table( ut_ad(!dict_index_is_univ(index)); - dict_stats_empty_index(index); + dict_stats_empty_index(index, empty_defrag_stats); } table->stat_initialized = TRUE; @@ -709,7 +744,7 @@ dict_stats_copy( } if (!INDEX_EQ(src_idx, dst_idx)) { - dict_stats_empty_index(dst_idx); + dict_stats_empty_index(dst_idx, true); continue; } @@ -720,7 +755,7 @@ dict_stats_copy( /* Since src is smaller some elements in dst will remain untouched by the following memmove(), thus we init all of them here. 
*/ - dict_stats_empty_index(dst_idx); + dict_stats_empty_index(dst_idx, true); } else { n_copy_el = dst_idx->n_uniq; } @@ -740,6 +775,13 @@ dict_stats_copy( dst_idx->stat_index_size = src_idx->stat_index_size; dst_idx->stat_n_leaf_pages = src_idx->stat_n_leaf_pages; + + dst_idx->stat_defrag_modified_counter = + src_idx->stat_defrag_modified_counter; + dst_idx->stat_defrag_n_pages_freed = + src_idx->stat_defrag_n_pages_freed; + dst_idx->stat_defrag_n_page_split = + src_idx->stat_defrag_n_page_split; } dst->stat_initialized = TRUE; @@ -763,6 +805,9 @@ dict_index_t::stat_n_sample_sizes[] dict_index_t::stat_n_non_null_key_vals[] dict_index_t::stat_index_size dict_index_t::stat_n_leaf_pages +dict_index_t::stat_defrag_modified_counter +dict_index_t::stat_defrag_n_pages_freed +dict_index_t::stat_defrag_n_page_split The returned object should be freed with dict_stats_snapshot_free() when no longer needed. @return incomplete table object */ @@ -812,7 +857,9 @@ dict_stats_snapshot_free( Calculates new estimates for index statistics. This function is relatively quick and is used to calculate transient statistics that are not saved on disk. This was the only way to calculate statistics -before the Persistent Statistics feature was introduced. */ +before the Persistent Statistics feature was introduced. +This function doesn't update the defragmentation related stats. +Only persistent statistics supports defragmentation stats. */ static void dict_stats_update_transient_for_index( @@ -828,10 +875,10 @@ dict_stats_update_transient_for_index( Initialize some bogus index cardinality statistics, so that the data can be queried in various means, also via secondary indexes. 
*/ - dict_stats_empty_index(index); + dict_stats_empty_index(index, false); #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG } else if (ibuf_debug && !dict_index_is_clust(index)) { - dict_stats_empty_index(index); + dict_stats_empty_index(index, false); #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ } else { mtr_t mtr; @@ -852,7 +899,7 @@ dict_stats_update_transient_for_index( switch (size) { case ULINT_UNDEFINED: - dict_stats_empty_index(index); + dict_stats_empty_index(index, false); return; case 0: /* The root node of the tree is a leaf */ @@ -887,7 +934,7 @@ dict_stats_update_transient( if (dict_table_is_discarded(table)) { /* Nothing to do. */ - dict_stats_empty_table(table); + dict_stats_empty_table(table, false); return; } else if (index == NULL) { /* Table definition is corrupt */ @@ -897,7 +944,7 @@ dict_stats_update_transient( fprintf(stderr, " InnoDB: table %s has no indexes. " "Cannot calculate statistics.\n", ut_format_name(table->name, TRUE, buf, sizeof(buf))); - dict_stats_empty_table(table); + dict_stats_empty_table(table, false); return; } @@ -909,7 +956,7 @@ dict_stats_update_transient( continue; } - dict_stats_empty_index(index); + dict_stats_empty_index(index, false); if (dict_stats_should_ignore_index(index)) { continue; @@ -1903,7 +1950,7 @@ dict_stats_analyze_index( DEBUG_PRINTF(" %s(index=%s)\n", __func__, index->name); - dict_stats_empty_index(index); + dict_stats_empty_index(index, false); mtr_start(&mtr); @@ -2201,7 +2248,7 @@ dict_stats_update_persistent( /* Table definition is corrupt */ dict_table_stats_unlock(table, RW_X_LATCH); - dict_stats_empty_table(table); + dict_stats_empty_table(table, true); return(DB_CORRUPTION); } @@ -2230,7 +2277,7 @@ dict_stats_update_persistent( continue; } - dict_stats_empty_index(index); + dict_stats_empty_index(index, false); if (dict_stats_should_ignore_index(index)) { continue; @@ -2803,6 +2850,16 @@ dict_stats_fetch_index_stats_step( == 0) { index->stat_n_leaf_pages = (ulint) stat_value; 
arg->stats_were_modified = true; + } else if (stat_name_len == 12 /* strlen("n_page_split") */ + && strncasecmp("n_page_split", stat_name, stat_name_len) + == 0) { + index->stat_defrag_n_page_split = (ulint) stat_value; + arg->stats_were_modified = true; + } else if (stat_name_len == 13 /* strlen("n_pages_freed") */ + && strncasecmp("n_pages_freed", stat_name, stat_name_len) + == 0) { + index->stat_defrag_n_pages_freed = (ulint) stat_value; + arg->stats_were_modified = true; } else if (stat_name_len > PFX_LEN /* e.g. stat_name=="n_diff_pfx01" */ && strncasecmp(PFX, stat_name, PFX_LEN) == 0) { @@ -2922,7 +2979,7 @@ dict_stats_fetch_from_ps( the persistent storage contains incomplete stats (e.g. missing stats for some index) then we would end up with (partially) uninitialized stats. */ - dict_stats_empty_table(table); + dict_stats_empty_table(table, true); trx = trx_allocate_for_background(); @@ -3024,6 +3081,22 @@ dict_stats_fetch_from_ps( } /*********************************************************************//** +Clear defragmentation stats modified counter for all indices in table. */ +static +void +dict_stats_empty_defrag_modified_counter( + dict_table_t* table) /*!< in: table */ +{ + dict_index_t* index; + ut_a(table); + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + index->stat_defrag_modified_counter = 0; + } +} + +/*********************************************************************//** Fetches or calculates new estimates for index statistics. */ UNIV_INTERN void @@ -3099,13 +3172,13 @@ dict_stats_update( "because the .ibd file is missing. 
For help, please " "refer to " REFMAN "innodb-troubleshooting.html\n", ut_format_name(table->name, TRUE, buf, sizeof(buf))); - dict_stats_empty_table(table); + dict_stats_empty_table(table, true); return(DB_TABLESPACE_DELETED); } else if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) { /* If we have set a high innodb_force_recovery level, do not calculate statistics, as a badly corrupted index can cause a crash in it. */ - dict_stats_empty_table(table); + dict_stats_empty_table(table, false); return(DB_SUCCESS); } @@ -3168,7 +3241,7 @@ dict_stats_update( case DICT_STATS_EMPTY_TABLE: - dict_stats_empty_table(table); + dict_stats_empty_table(table, true); /* If table is using persistent stats, then save the stats on disk */ @@ -3231,6 +3304,7 @@ dict_stats_update( t->stats_last_recalc = table->stats_last_recalc; t->stat_modified_counter = 0; + dict_stats_empty_defrag_modified_counter(t); switch (err) { case DB_SUCCESS: @@ -3241,7 +3315,7 @@ dict_stats_update( copying because dict_stats_table_clone_create() does skip corrupted indexes so our dummy object 't' may have less indexes than the real object 'table'. */ - dict_stats_empty_table(table); + dict_stats_empty_table(table, true); dict_stats_copy(table, t); @@ -3811,6 +3885,117 @@ dict_stats_rename_table( return(ret); } +/*********************************************************************//** +Save defragmentation result. 
+@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_stats_save_defrag_summary( + dict_index_t* index) /*!< in: index */ +{ + dberr_t ret; + lint now = (lint) ut_time(); + if (dict_index_is_univ(index)) { + return DB_SUCCESS; + } + rw_lock_x_lock(&dict_operation_lock); + mutex_enter(&dict_sys->mutex); + ret = dict_stats_save_index_stat(index, now, "n_pages_freed", + index->stat_defrag_n_pages_freed, + NULL, + "Number of pages freed during" + " last defragmentation run.", + NULL); + + mutex_exit(&dict_sys->mutex); + rw_lock_x_unlock(&dict_operation_lock); + return (ret); +} + +/*********************************************************************//** +Save defragmentation stats for a given index. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_stats_save_defrag_stats( + dict_index_t* index) /*!< in: index */ +{ + dberr_t ret; + + if (index->table->ibd_file_missing) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Cannot save defragment stats because " + ".ibd file is missing.\n"); + return (DB_TABLESPACE_DELETED); + } + if (dict_index_is_corrupted(index)) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Cannot save defragment stats because " + "index is corrupted.\n"); + return(DB_CORRUPTION); + } + + if (dict_index_is_univ(index)) { + return DB_SUCCESS; + } + + lint now = (lint) ut_time(); + mtr_t mtr; + ulint n_leaf_pages; + ulint n_leaf_reserved; + mtr_start(&mtr); + mtr_s_lock(dict_index_get_lock(index), &mtr); + n_leaf_reserved = btr_get_size_and_reserved(index, BTR_N_LEAF_PAGES, + &n_leaf_pages, &mtr); + mtr_commit(&mtr); + + if (n_leaf_reserved == ULINT_UNDEFINED) { + // The index name is different during fast index creation, + // so the stats won't be associated with the right index + // for later use. We just return without saving. 
+ return DB_SUCCESS; + } + + rw_lock_x_lock(&dict_operation_lock); + + mutex_enter(&dict_sys->mutex); + ret = dict_stats_save_index_stat(index, now, "n_page_split", + index->stat_defrag_n_page_split, + NULL, + "Number of new page splits on leaves" + " since last defragmentation.", + NULL); + if (ret != DB_SUCCESS) { + goto end; + } + + ret = dict_stats_save_index_stat( + index, now, "n_leaf_pages_defrag", + n_leaf_pages, + NULL, + "Number of leaf pages when this stat is saved to disk", + NULL); + if (ret != DB_SUCCESS) { + goto end; + } + + ret = dict_stats_save_index_stat( + index, now, "n_leaf_pages_reserved", + n_leaf_reserved, + NULL, + "Number of pages reserved for this index leaves when this stat " + "is saved to disk", + NULL); + +end: + mutex_exit(&dict_sys->mutex); + rw_lock_x_unlock(&dict_operation_lock); + + return (ret); +} + /* tests @{ */ #ifdef UNIV_COMPILE_TEST_FUNCS diff --git a/storage/innobase/dict/dict0stats_bg.cc b/storage/innobase/dict/dict0stats_bg.cc index ecd723ca39a..0089f9897ae 100644 --- a/storage/innobase/dict/dict0stats_bg.cc +++ b/storage/innobase/dict/dict0stats_bg.cc @@ -25,6 +25,7 @@ Created Apr 25, 2012 Vasil Dimov #include "row0mysql.h" #include "srv0start.h" +#include "dict0dict.h" #include "dict0stats.h" #include "dict0stats_bg.h" @@ -44,8 +45,10 @@ UNIV_INTERN os_event_t dict_stats_event = NULL; /** This mutex protects the "recalc_pool" variable. 
*/ static ib_mutex_t recalc_pool_mutex; +static ib_mutex_t defrag_pool_mutex; #ifdef HAVE_PSI_INTERFACE static mysql_pfs_key_t recalc_pool_mutex_key; +static mysql_pfs_key_t defrag_pool_mutex_key; #endif /* HAVE_PSI_INTERFACE */ /** The number of tables that can be added to "recalc_pool" before @@ -59,16 +62,26 @@ static recalc_pool_t recalc_pool; typedef recalc_pool_t::iterator recalc_pool_iterator_t; +/** Indices whose defrag stats need to be saved to persistent storage.*/ +struct defrag_pool_item_t { + table_id_t table_id; + index_id_t index_id; +}; +typedef std::vector<defrag_pool_item_t> defrag_pool_t; +static defrag_pool_t defrag_pool; +typedef defrag_pool_t::iterator defrag_pool_iterator_t; + /*****************************************************************//** Initialize the recalc pool, called once during thread initialization. */ static void -dict_stats_recalc_pool_init() +dict_stats_pool_init() /*=========================*/ { ut_ad(!srv_read_only_mode); recalc_pool.reserve(RECALC_POOL_INITIAL_SLOTS); + defrag_pool.reserve(RECALC_POOL_INITIAL_SLOTS); } /*****************************************************************//** @@ -76,12 +89,13 @@ Free the resources occupied by the recalc pool, called once during thread de-initialization. */ static void -dict_stats_recalc_pool_deinit() -/*===========================*/ +dict_stats_pool_deinit() +/*====================*/ { ut_ad(!srv_read_only_mode); recalc_pool.clear(); + defrag_pool.clear(); /* recalc_pool may still have its buffer allocated. It will free it when its destructor is called. @@ -90,8 +104,12 @@ dict_stats_recalc_pool_deinit() memory. 
To avoid that, we force recalc_pool to surrender its buffer to empty_pool object, which will free it when leaving this function: */ - recalc_pool_t empty_pool; - recalc_pool.swap(empty_pool); + recalc_pool_t recalc_empty_pool; + defrag_pool_t defrag_empty_pool; + memset(&recalc_empty_pool, 0, sizeof(recalc_pool_t)); + memset(&defrag_empty_pool, 0, sizeof(defrag_pool_t)); + recalc_pool.swap(recalc_empty_pool); + defrag_pool.swap(defrag_empty_pool); } /*****************************************************************//** @@ -188,6 +206,111 @@ dict_stats_recalc_pool_del( } /*****************************************************************//** +Add an index in a table to the defrag pool, which is processed by the +background stats gathering thread. Only the table id and index id are +added to the list, so the table can be closed after being enqueued and +it will be opened when needed. If the table or index does not exist later +(has been DROPped), then it will be removed from the pool and skipped. */ +UNIV_INTERN +void +dict_stats_defrag_pool_add( +/*=======================*/ + const dict_index_t* index) /*!< in: table to add */ +{ + defrag_pool_item_t item; + + ut_ad(!srv_read_only_mode); + + mutex_enter(&defrag_pool_mutex); + + /* quit if already in the list */ + for (defrag_pool_iterator_t iter = defrag_pool.begin(); + iter != defrag_pool.end(); + ++iter) { + if ((*iter).table_id == index->table->id + && (*iter).index_id == index->id) { + mutex_exit(&defrag_pool_mutex); + return; + } + } + + item.table_id = index->table->id; + item.index_id = index->id; + defrag_pool.push_back(item); + + mutex_exit(&defrag_pool_mutex); + + os_event_set(dict_stats_event); +} + +/*****************************************************************//** +Get an index from the auto defrag pool. The returned index id is removed +from the pool. 
+@return true if the pool was non-empty and "id" was set, false otherwise */ +static +bool +dict_stats_defrag_pool_get( +/*=======================*/ + table_id_t* table_id, /*!< out: table id, or unmodified if + list is empty */ + index_id_t* index_id) /*!< out: index id, or unmodified if + list is empty */ +{ + ut_ad(!srv_read_only_mode); + + mutex_enter(&defrag_pool_mutex); + + if (defrag_pool.empty()) { + mutex_exit(&defrag_pool_mutex); + return(false); + } + + defrag_pool_item_t& item = defrag_pool.back(); + *table_id = item.table_id; + *index_id = item.index_id; + + defrag_pool.pop_back(); + + mutex_exit(&defrag_pool_mutex); + + return(true); +} + +/*****************************************************************//** +Delete a given index from the auto defrag pool. */ +UNIV_INTERN +void +dict_stats_defrag_pool_del( +/*=======================*/ + const dict_table_t* table, /*!<in: if given, remove + all entries for the table */ + const dict_index_t* index) /*!< in: if given, remove this index */ +{ + ut_a((table && !index) || (!table && index)); + ut_ad(!srv_read_only_mode); + ut_ad(mutex_own(&dict_sys->mutex)); + + mutex_enter(&defrag_pool_mutex); + + defrag_pool_iterator_t iter = defrag_pool.begin(); + while (iter != defrag_pool.end()) { + if ((table && (*iter).table_id == table->id) + || (index + && (*iter).table_id == index->table->id + && (*iter).index_id == index->id)) { + /* erase() invalidates the iterator */ + iter = defrag_pool.erase(iter); + if (index) + break; + } else { + iter++; + } + } + + mutex_exit(&defrag_pool_mutex); +} + +/*****************************************************************//** Wait until background stats thread has stopped using the specified table. 
The caller must have locked the data dictionary using row_mysql_lock_data_dictionary() and this function may unlock it temporarily @@ -237,7 +360,10 @@ dict_stats_thread_init() mutex_create(recalc_pool_mutex_key, &recalc_pool_mutex, SYNC_STATS_AUTO_RECALC); - dict_stats_recalc_pool_init(); + /* We choose SYNC_STATS_DEFRAG to be below SYNC_FSP_PAGE. */ + mutex_create(defrag_pool_mutex_key, &defrag_pool_mutex, + SYNC_STATS_DEFRAG); + dict_stats_pool_init(); } /*****************************************************************//** @@ -251,11 +377,14 @@ dict_stats_thread_deinit() ut_a(!srv_read_only_mode); ut_ad(!srv_dict_stats_thread_active); - dict_stats_recalc_pool_deinit(); + dict_stats_pool_deinit(); mutex_free(&recalc_pool_mutex); memset(&recalc_pool_mutex, 0x0, sizeof(recalc_pool_mutex)); + mutex_free(&defrag_pool_mutex); + memset(&defrag_pool_mutex, 0x0, sizeof(defrag_pool_mutex)); + os_event_free(dict_stats_event); dict_stats_event = NULL; } @@ -333,6 +462,63 @@ dict_stats_process_entry_from_recalc_pool() } /*****************************************************************//** +Get the first index that has been added for updating persistent defrag +stats and eventually save its stats. */ +static +void +dict_stats_process_entry_from_defrag_pool() +/*=======================================*/ +{ + table_id_t table_id; + index_id_t index_id; + + ut_ad(!srv_read_only_mode); + + /* pop the first index from the auto defrag pool */ + if (!dict_stats_defrag_pool_get(&table_id, &index_id)) { + /* no index in defrag pool */ + return; + } + + dict_table_t* table; + + mutex_enter(&dict_sys->mutex); + + /* If the table is no longer cached, we've already lost the in + memory stats so there's nothing really to write to disk. 
*/ + table = dict_table_open_on_id(table_id, TRUE, + DICT_TABLE_OP_OPEN_ONLY_IF_CACHED); + + if (table == NULL) { + mutex_exit(&dict_sys->mutex); + return; + } + + /* Check whether table is corrupted */ + if (table->corrupted) { + dict_table_close(table, TRUE, FALSE); + mutex_exit(&dict_sys->mutex); + return; + } + mutex_exit(&dict_sys->mutex); + + dict_index_t* index = dict_table_find_index_on_id(table, index_id); + + if (index == NULL) { + return; + } + + /* Check whether index is corrupted */ + if (dict_index_is_corrupted(index)) { + dict_table_close(table, FALSE, FALSE); + return; + } + + dict_stats_save_defrag_stats(index); + dict_table_close(table, FALSE, FALSE); +} + +/*****************************************************************//** This is the thread for background stats gathering. It pops tables, from the auto recalc list and proceeds them, eventually recalculating their statistics. @@ -364,6 +550,9 @@ DECLARE_THREAD(dict_stats_thread)( dict_stats_process_entry_from_recalc_pool(); + while (defrag_pool.size()) + dict_stats_process_entry_from_defrag_pool(); + os_event_reset(dict_stats_event); } diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index ba0476b1772..bc12774d475 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, MariaDB Corporation. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -24,6 +25,8 @@ Created 10/25/1995 Heikki Tuuri *******************************************************/ #include "fil0fil.h" +#include "fil0pagecompress.h" +#include "fsp0pagecompress.h" #include <debug_sync.h> #include <my_dbug.h> @@ -45,6 +48,7 @@ Created 10/25/1995 Heikki Tuuri #include "page0zip.h" #include "trx0sys.h" #include "row0mysql.h" +#include "os0file.h" #ifndef UNIV_HOTBACKUP # include "buf0lru.h" # include "ibuf0ibuf.h" @@ -54,6 +58,13 @@ Created 10/25/1995 Heikki Tuuri # include "srv0srv.h" static ulint srv_data_read, srv_data_written; #endif /* !UNIV_HOTBACKUP */ +#include "zlib.h" +#ifdef __linux__ +#include <linux/fs.h> +#include <sys/ioctl.h> +#include <fcntl.h> +#endif +#include "row0mysql.h" /* IMPLEMENTATION OF THE TABLESPACE MEMORY CACHE @@ -260,11 +271,16 @@ fil_read( block size multiple */ void* buf, /*!< in/out: buffer where to store data read; in aio this must be appropriately aligned */ - void* message) /*!< in: message for aio handler if non-sync + void* message, /*!< in: message for aio handler if non-sync aio used, else ignored */ + ulint* write_size) /*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. 
*/ { return(fil_io(OS_FILE_READ, sync, space_id, zip_size, block_offset, - byte_offset, len, buf, message)); + byte_offset, len, buf, message, write_size)); } /********************************************************************//** @@ -289,18 +305,22 @@ fil_write( be a block size multiple */ void* buf, /*!< in: buffer from which to write; in aio this must be appropriately aligned */ - void* message) /*!< in: message for aio handler if non-sync + void* message, /*!< in: message for aio handler if non-sync aio used, else ignored */ + ulint* write_size) /*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ { ut_ad(!srv_read_only_mode); return(fil_io(OS_FILE_WRITE, sync, space_id, zip_size, block_offset, - byte_offset, len, buf, message)); + byte_offset, len, buf, message, write_size)); } /*******************************************************************//** Returns the table space by a given id, NULL if not found. */ -UNIV_INLINE fil_space_t* fil_space_get_by_id( /*================*/ @@ -318,6 +338,19 @@ fil_space_get_by_id( return(space); } +/****************************************************************//** +Get space id from fil node */ +ulint +fil_node_get_space_id( +/*==================*/ + fil_node_t* node) /*!< in: Compressed node*/ +{ + ut_ad(node); + ut_ad(node->space); + + return (node->space->id); +} + /*******************************************************************//** Returns the table space by a given name, NULL if not found. 
*/ UNIV_INLINE @@ -538,8 +571,9 @@ fil_node_open_file( byte* buf2; byte* page; ulint space_id; - ulint flags; + ulint flags=0; ulint page_size; + ulint atomic_writes=0; ut_ad(mutex_own(&(system->mutex))); ut_a(node->n_pending == 0); @@ -556,7 +590,7 @@ fil_node_open_file( node->handle = os_file_create_simple_no_error_handling( innodb_file_data_key, node->name, OS_FILE_OPEN, - OS_FILE_READ_ONLY, &success); + OS_FILE_READ_ONLY, &success, 0); if (!success) { /* The following call prints an error message */ os_file_get_last_error(true); @@ -573,6 +607,8 @@ fil_node_open_file( size_bytes = os_file_get_size(node->handle); ut_a(size_bytes != (os_offset_t) -1); + + node->file_block_size = os_file_get_block_size(node->handle, node->name); #ifdef UNIV_HOTBACKUP if (space->id == 0) { node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE); @@ -604,10 +640,14 @@ fil_node_open_file( set */ page = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE)); - success = os_file_read(node->handle, page, 0, UNIV_PAGE_SIZE); + success = os_file_read(node->handle, page, 0, UNIV_PAGE_SIZE, + space->flags); + space_id = fsp_header_get_space_id(page); flags = fsp_header_get_flags(page); page_size = fsp_flags_get_page_size(flags); + atomic_writes = fsp_flags_get_atomic_writes(flags); + ut_free(buf2); @@ -658,13 +698,28 @@ fil_node_open_file( ut_error; } - if (size_bytes >= 1024 * 1024) { - /* Truncate the size to whole megabytes. */ - size_bytes = ut_2pow_round(size_bytes, 1024 * 1024); + if (UNIV_UNLIKELY(space->flags != flags)) { + if (!dict_tf_verify_flags(space->flags, flags)) { + fprintf(stderr, + "InnoDB: Error: table flags are 0x%lx" + " in the data dictionary\n" + "InnoDB: but the flags in file %s are 0x%lx!\n", + space->flags, node->name, flags); + ut_error; + } + } + + if (size_bytes >= FSP_EXTENT_SIZE * UNIV_PAGE_SIZE) { + /* Truncate the size to whole extent size. 
*/ + size_bytes = ut_2pow_round(size_bytes, + FSP_EXTENT_SIZE * + UNIV_PAGE_SIZE); } if (!fsp_flags_is_compressed(flags)) { - node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE); + node->size = (ulint) + (size_bytes + / fsp_flags_get_page_size(flags)); } else { node->size = (ulint) (size_bytes @@ -677,6 +732,8 @@ add_size: space->size += node->size; } + atomic_writes = fsp_flags_get_atomic_writes(space->flags); + /* printf("Opening file %s\n", node->name); */ /* Open the file for reading and writing, in Windows normally in the @@ -687,18 +744,22 @@ add_size: node->handle = os_file_create(innodb_file_log_key, node->name, OS_FILE_OPEN, OS_FILE_AIO, OS_LOG_FILE, - &ret); + &ret, atomic_writes); } else if (node->is_raw_disk) { node->handle = os_file_create(innodb_file_data_key, node->name, OS_FILE_OPEN_RAW, OS_FILE_AIO, OS_DATA_FILE, - &ret); + &ret, atomic_writes); } else { node->handle = os_file_create(innodb_file_data_key, node->name, OS_FILE_OPEN, OS_FILE_AIO, OS_DATA_FILE, - &ret); + &ret, atomic_writes); + } + + if (node->file_block_size == 0) { + node->file_block_size = os_file_get_block_size(node->handle, node->name); } ut_a(ret); @@ -1069,7 +1130,6 @@ fil_space_create( DBUG_EXECUTE_IF("fil_space_create_failure", return(false);); ut_a(fil_system); - ut_a(fsp_flags_is_valid(flags)); /* Look for a matching tablespace and if found free it. 
*/ do { @@ -1723,12 +1783,12 @@ fil_write_lsn_and_arch_no_to_file( buf = static_cast<byte*>(ut_align(buf1, UNIV_PAGE_SIZE)); err = fil_read(TRUE, space, 0, sum_of_sizes, 0, - UNIV_PAGE_SIZE, buf, NULL); + UNIV_PAGE_SIZE, buf, NULL, 0); if (err == DB_SUCCESS) { mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn); err = fil_write(TRUE, space, 0, sum_of_sizes, 0, - UNIV_PAGE_SIZE, buf, NULL); + UNIV_PAGE_SIZE, buf, NULL, 0); } mem_free(buf1); @@ -1816,6 +1876,9 @@ fil_check_first_page( flags = mach_read_from_4(FSP_HEADER_OFFSET + FSP_SPACE_FLAGS + page); if (UNIV_PAGE_SIZE != fsp_flags_get_page_size(flags)) { + fprintf(stderr, "InnoDB: Error: Current page size %lu != page size on page %lu\n", + UNIV_PAGE_SIZE, fsp_flags_get_page_size(flags)); + return("innodb-page-size mismatch"); } @@ -1868,8 +1931,10 @@ fil_read_first_page( #endif /* UNIV_LOG_ARCHIVE */ lsn_t* min_flushed_lsn, /*!< out: min of flushed lsn values in data files */ - lsn_t* max_flushed_lsn) /*!< out: max of flushed + lsn_t* max_flushed_lsn, /*!< out: max of flushed lsn values in data files */ + ulint orig_space_id) /*!< in: original file space + id */ { byte* buf; byte* page; @@ -1882,7 +1947,10 @@ fil_read_first_page( page = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE)); - os_file_read(data_file, page, 0, UNIV_PAGE_SIZE); + os_file_read(data_file, page, 0, UNIV_PAGE_SIZE, + orig_space_id != ULINT_UNDEFINED ? + fil_space_is_page_compressed(orig_space_id) : + FALSE); /* The FSP_HEADER on page 0 is only valid for the first file in a tablespace. So if this is not the first datafile, leave @@ -1891,7 +1959,16 @@ fil_read_first_page( if (!one_read_already) { *flags = fsp_header_get_flags(page); *space_id = fsp_header_get_space_id(page); + } + /* Page is page compressed page, need to decompress, before + continue. 
*/ + if (fil_page_is_compressed(page)) { + ulint write_size=0; + fil_decompress_page(NULL, page, UNIV_PAGE_SIZE, &write_size); + } + + if (!one_read_already) { check_msg = fil_check_first_page(page); } @@ -3022,7 +3099,7 @@ fil_create_link_file( file = os_file_create_simple_no_error_handling( innodb_file_data_key, link_filepath, - OS_FILE_CREATE, OS_FILE_READ_WRITE, &success); + OS_FILE_CREATE, OS_FILE_READ_WRITE, &success, 0); if (!success) { /* The following call will print an error message */ @@ -3038,10 +3115,10 @@ fil_create_link_file( ut_print_filename(stderr, filepath); fputs(" already exists.\n", stderr); err = DB_TABLESPACE_EXISTS; - } else if (error == OS_FILE_DISK_FULL) { err = DB_OUT_OF_FILE_SPACE; - + } else if (error == OS_FILE_OPERATION_NOT_SUPPORTED) { + err = DB_UNSUPPORTED; } else { err = DB_ERROR; } @@ -3052,7 +3129,7 @@ fil_create_link_file( } if (!os_file_write(link_filepath, file, filepath, 0, - strlen(filepath))) { + strlen(filepath))) { err = DB_ERROR; } @@ -3131,8 +3208,9 @@ fil_open_linked_file( /*===============*/ const char* tablename, /*!< in: database/tablename */ char** remote_filepath,/*!< out: remote filepath */ - os_file_t* remote_file) /*!< out: remote file handle */ - + os_file_t* remote_file, /*!< out: remote file handle */ + ulint atomic_writes) /*!< in: atomic writes table option + value */ { ibool success; @@ -3146,7 +3224,7 @@ fil_open_linked_file( *remote_file = os_file_create_simple_no_error_handling( innodb_file_data_key, *remote_filepath, OS_FILE_OPEN, OS_FILE_READ_ONLY, - &success); + &success, atomic_writes); if (!success) { char* link_filepath = fil_make_isl_name(tablename); @@ -3201,6 +3279,7 @@ fil_create_new_single_table_tablespace( /* TRUE if a table is created with CREATE TEMPORARY TABLE */ bool is_temp = !!(flags2 & DICT_TF2_TEMPORARY); bool has_data_dir = FSP_FLAGS_HAS_DATA_DIR(flags); + ulint atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(flags); ut_a(space_id > 0); ut_ad(!srv_read_only_mode); @@ -3233,7 +3312,8 
@@ fil_create_new_single_table_tablespace( OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT, OS_FILE_NORMAL, OS_DATA_FILE, - &ret); + &ret, + atomic_writes); if (ret == FALSE) { /* The following call will print an error message */ @@ -3260,6 +3340,11 @@ fil_create_new_single_table_tablespace( goto error_exit_3; } + if (error == OS_FILE_OPERATION_NOT_SUPPORTED) { + err = DB_UNSUPPORTED; + goto error_exit_3; + } + if (error == OS_FILE_DISK_FULL) { err = DB_OUT_OF_FILE_SPACE; goto error_exit_3; @@ -3298,6 +3383,7 @@ fil_create_new_single_table_tablespace( flags = fsp_flags_set_page_size(flags, UNIV_PAGE_SIZE); fsp_header_init_fields(page, space_id, flags); mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space_id); + ut_ad(fsp_flags_is_valid(flags)); if (!(fsp_flags_is_compressed(flags))) { buf_flush_init_for_writing(page, NULL, 0); @@ -3474,16 +3560,25 @@ fil_open_single_table_tablespace( fsp_open_info remote; ulint tablespaces_found = 0; ulint valid_tablespaces_found = 0; + ulint atomic_writes = 0; #ifdef UNIV_SYNC_DEBUG ut_ad(!fix_dict || rw_lock_own(&dict_operation_lock, RW_LOCK_EX)); #endif /* UNIV_SYNC_DEBUG */ ut_ad(!fix_dict || mutex_own(&(dict_sys->mutex))); - if (!fsp_flags_is_valid(flags)) { + /* Table flags can be ULINT_UNDEFINED if + dict_tf_to_fsp_flags_failure is set. */ + if (flags != ULINT_UNDEFINED) { + if (!fsp_flags_is_valid(flags)) { + return(DB_CORRUPTION); + } + } else { return(DB_CORRUPTION); } + atomic_writes = fsp_flags_get_atomic_writes(flags); + /* If the tablespace was relocated, we do not compare the DATA_DIR flag */ ulint mod_flags = flags & ~FSP_FLAGS_MASK_DATA_DIR; @@ -3508,7 +3603,7 @@ fil_open_single_table_tablespace( } link_file_found = fil_open_linked_file( - tablename, &remote.filepath, &remote.file); + tablename, &remote.filepath, &remote.file, atomic_writes); remote.success = link_file_found; if (remote.success) { /* possibility of multiple files. 
*/ @@ -3536,7 +3631,7 @@ fil_open_single_table_tablespace( if (dict.filepath) { dict.file = os_file_create_simple_no_error_handling( innodb_file_data_key, dict.filepath, OS_FILE_OPEN, - OS_FILE_READ_ONLY, &dict.success); + OS_FILE_READ_ONLY, &dict.success, atomic_writes); if (dict.success) { /* possibility of multiple files. */ validate = true; @@ -3548,7 +3643,7 @@ fil_open_single_table_tablespace( ut_a(def.filepath); def.file = os_file_create_simple_no_error_handling( innodb_file_data_key, def.filepath, OS_FILE_OPEN, - OS_FILE_READ_ONLY, &def.success); + OS_FILE_READ_ONLY, &def.success, atomic_writes); if (def.success) { tablespaces_found++; } @@ -3567,7 +3662,7 @@ fil_open_single_table_tablespace( #ifdef UNIV_LOG_ARCHIVE &space_arch_log_no, &space_arch_log_no, #endif /* UNIV_LOG_ARCHIVE */ - &def.lsn, &def.lsn); + &def.lsn, &def.lsn, id); def.valid = !def.check_msg; /* Validate this single-table-tablespace with SYS_TABLES, @@ -3592,7 +3687,7 @@ fil_open_single_table_tablespace( #ifdef UNIV_LOG_ARCHIVE &remote.arch_log_no, &remote.arch_log_no, #endif /* UNIV_LOG_ARCHIVE */ - &remote.lsn, &remote.lsn); + &remote.lsn, &remote.lsn, id); remote.valid = !remote.check_msg; /* Validate this single-table-tablespace with SYS_TABLES, @@ -3618,7 +3713,7 @@ fil_open_single_table_tablespace( #ifdef UNIV_LOG_ARCHIVE &dict.arch_log_no, &dict.arch_log_no, #endif /* UNIV_LOG_ARCHIVE */ - &dict.lsn, &dict.lsn); + &dict.lsn, &dict.lsn, id); dict.valid = !dict.check_msg; /* Validate this single-table-tablespace with SYS_TABLES, @@ -3882,7 +3977,8 @@ fil_user_tablespace_find_space_id( for (ulint j = 0; j < page_count; ++j) { - st = os_file_read(fsp->file, page, (j* page_size), page_size); + st = os_file_read(fsp->file, page, (j* page_size), page_size, + fsp_flags_is_page_compressed(fsp->flags)); if (!st) { ib_logf(IB_LOG_LEVEL_INFO, @@ -3995,7 +4091,7 @@ fil_user_tablespace_restore_page( err = os_file_write(fsp->filepath, fsp->file, page, (zip_size ? 
zip_size : page_size) * page_no, - buflen); + buflen); os_file_flush(fsp->file); out: @@ -4022,7 +4118,7 @@ check_first_page: #ifdef UNIV_LOG_ARCHIVE &fsp->arch_log_no, &fsp->arch_log_no, #endif /* UNIV_LOG_ARCHIVE */ - &fsp->lsn, &fsp->lsn)) { + &fsp->lsn, &fsp->lsn, ULINT_UNDEFINED)) { ib_logf(IB_LOG_LEVEL_ERROR, "%s in tablespace %s (table %s)", check_msg, fsp->filepath, tablename); @@ -4095,9 +4191,7 @@ fil_load_single_table_tablespace( fsp_open_info def; fsp_open_info remote; os_offset_t size; -#ifdef UNIV_HOTBACKUP fil_space_t* space; -#endif memset(&def, 0, sizeof(def)); memset(&remote, 0, sizeof(remote)); @@ -4119,7 +4213,8 @@ fil_load_single_table_tablespace( one of them is sent to this function. So if this table has already been loaded, there is nothing to do.*/ mutex_enter(&fil_system->mutex); - if (fil_space_get_by_name(tablename)) { + space = fil_space_get_by_name(tablename); + if (space) { mem_free(tablename); mutex_exit(&fil_system->mutex); return; @@ -4144,7 +4239,7 @@ fil_load_single_table_tablespace( /* Check for a link file which locates a remote tablespace. */ remote.success = fil_open_linked_file( - tablename, &remote.filepath, &remote.file); + tablename, &remote.filepath, &remote.file, FALSE); /* Read the first page of the remote tablespace */ if (remote.success) { @@ -4159,7 +4254,7 @@ fil_load_single_table_tablespace( /* Try to open the tablespace in the datadir. 
*/ def.file = os_file_create_simple_no_error_handling( innodb_file_data_key, def.filepath, OS_FILE_OPEN, - OS_FILE_READ_WRITE, &def.success); + OS_FILE_READ_WRITE, &def.success, FALSE); /* Read the first page of the remote tablespace */ if (def.success) { @@ -4887,6 +4982,7 @@ retry: } page_size = fsp_flags_get_zip_size(space->flags); + if (!page_size) { page_size = UNIV_PAGE_SIZE; } @@ -4924,6 +5020,11 @@ retry: start_page_no = space->size; file_start_page_no = space->size - node->size; + /* Determine correct file block size */ + if (node->file_block_size == 0) { + node->file_block_size = os_file_get_block_size(node->handle, node->name); + } + #ifdef HAVE_POSIX_FALLOCATE if (srv_use_posix_fallocate) { os_offset_t start_offset = start_page_no * page_size; @@ -4935,16 +5036,18 @@ retry: "space for file \'%s\' failed. Current size " INT64PF ", desired size " INT64PF "\n", node->name, start_offset, len+start_offset); - os_file_handle_error_no_exit(node->name, "posix_fallocate", FALSE); + os_file_handle_error_no_exit(node->name, "posix_fallocate", FALSE, __FILE__, __LINE__); success = FALSE; } else { success = TRUE; } mutex_enter(&fil_system->mutex); + if (success) { - node->size += n_pages; - space->size += n_pages; + node->size += (size_after_extend - start_page_no); + space->size += (size_after_extend - start_page_no); + os_has_said_disk_full = FALSE; } @@ -4980,7 +5083,7 @@ retry: success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC, node->name, node->handle, buf, offset, page_size * n_pages, - NULL, NULL); + node, NULL, 0, FALSE, 0); #endif /* UNIV_HOTBACKUP */ if (success) { os_has_said_disk_full = FALSE; @@ -5074,7 +5177,7 @@ fil_extend_tablespaces_to_stored_len(void) single-threaded operation */ error = fil_read(TRUE, space->id, fsp_flags_get_zip_size(space->flags), - 0, 0, UNIV_PAGE_SIZE, buf, NULL); + 0, 0, UNIV_PAGE_SIZE, buf, NULL, 0); ut_a(error == DB_SUCCESS); size_in_header = fsp_get_size_low(buf); @@ -5354,8 +5457,13 @@ fil_io( void* buf, /*!< in/out: buffer 
where to store read data or from where to write; in aio this must be appropriately aligned */ - void* message) /*!< in: message for aio handler if non-sync + void* message, /*!< in: message for aio handler if non-sync aio used, else ignored */ + ulint* write_size) /*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ { ulint mode; fil_space_t* space; @@ -5365,6 +5473,8 @@ fil_io( ulint wake_later; os_offset_t offset; ibool ignore_nonexistent_pages; + ibool page_compressed = FALSE; + ulint page_compression_level = 0; is_log = type & OS_FILE_LOG; type = type & ~OS_FILE_LOG; @@ -5418,6 +5528,11 @@ fil_io( } else if (type == OS_FILE_WRITE) { ut_ad(!srv_read_only_mode); srv_stats.data_written.add(len); + if (fil_page_is_index_page((byte *)buf)) { + srv_stats.index_pages_written.inc(); + } else { + srv_stats.non_index_pages_written.inc(); + } } /* Reserve the fil_system mutex and make sure that we can open at @@ -5543,6 +5658,9 @@ fil_io( ut_a(byte_offset % OS_FILE_LOG_BLOCK_SIZE == 0); ut_a((len % OS_FILE_LOG_BLOCK_SIZE) == 0); + page_compressed = fsp_flags_is_page_compressed(space->flags); + page_compression_level = fsp_flags_get_page_compression_level(space->flags); + #ifdef UNIV_HOTBACKUP /* In mysqlbackup do normal i/o, not aio */ if (type == OS_FILE_READ) { @@ -5555,7 +5673,8 @@ fil_io( #else /* Queue the aio request */ ret = os_aio(type, mode | wake_later, node->name, node->handle, buf, - offset, len, node, message); + offset, len, node, message, write_size, + page_compressed, page_compression_level); #endif /* UNIV_HOTBACKUP */ @@ -6095,7 +6214,8 @@ fil_iterate( ut_ad(!(n_bytes % iter.page_size)); if (!os_file_read(iter.file, io_buffer, offset, - (ulint) n_bytes)) { + (ulint) n_bytes, + fil_space_is_page_compressed(space_id))) { ib_logf(IB_LOG_LEVEL_ERROR, "os_file_read() failed"); @@ -6182,7 +6302,7 @@ fil_tablespace_iterate( file = 
os_file_create_simple_no_error_handling( innodb_file_data_key, filepath, - OS_FILE_OPEN, OS_FILE_READ_WRITE, &success); + OS_FILE_OPEN, OS_FILE_READ_WRITE, &success, FALSE); DBUG_EXECUTE_IF("fil_tablespace_iterate_failure", { @@ -6234,7 +6354,8 @@ fil_tablespace_iterate( /* Read the first page and determine the page and zip size. */ - if (!os_file_read(file, page, 0, UNIV_PAGE_SIZE)) { + if (!os_file_read(file, page, 0, UNIV_PAGE_SIZE, + dict_tf_get_page_compression(table->flags))) { err = DB_IO_ERROR; @@ -6400,3 +6521,87 @@ fil_mtr_rename_log( 0, 0, new_name, old_name, mtr); } } + +/****************************************************************//** +Acquire fil_system mutex */ +void +fil_system_enter(void) +/*==================*/ +{ + ut_ad(!mutex_own(&fil_system->mutex)); + mutex_enter(&fil_system->mutex); +} + +/****************************************************************//** +Release fil_system mutex */ +void +fil_system_exit(void) +/*=================*/ +{ + ut_ad(mutex_own(&fil_system->mutex)); + mutex_exit(&fil_system->mutex); +} + +/*******************************************************************//** +Return space name */ +char* +fil_space_name( +/*===========*/ + fil_space_t* space) /*!< in: space */ +{ + return (space->name); +} + +/*******************************************************************//** +Return page type name */ +const char* +fil_get_page_type_name( +/*===================*/ + ulint page_type) /*!< in: FIL_PAGE_TYPE */ +{ + switch(page_type) { + case FIL_PAGE_PAGE_COMPRESSED: + return (const char*)"PAGE_COMPRESSED"; + case FIL_PAGE_INDEX: + return (const char*)"INDEX"; + case FIL_PAGE_UNDO_LOG: + return (const char*)"UNDO LOG"; + case FIL_PAGE_INODE: + return (const char*)"INODE"; + case FIL_PAGE_IBUF_FREE_LIST: + return (const char*)"IBUF_FREE_LIST"; + case FIL_PAGE_TYPE_ALLOCATED: + return (const char*)"ALLOCATED"; + case FIL_PAGE_IBUF_BITMAP: + return (const char*)"IBUF_BITMAP"; + case FIL_PAGE_TYPE_SYS: + return (const 
char*)"SYS"; + case FIL_PAGE_TYPE_TRX_SYS: + return (const char*)"TRX_SYS"; + case FIL_PAGE_TYPE_FSP_HDR: + return (const char*)"FSP_HDR"; + case FIL_PAGE_TYPE_XDES: + return (const char*)"XDES"; + case FIL_PAGE_TYPE_BLOB: + return (const char*)"BLOB"; + case FIL_PAGE_TYPE_ZBLOB: + return (const char*)"ZBLOB"; + case FIL_PAGE_TYPE_ZBLOB2: + return (const char*)"ZBLOB2"; + case FIL_PAGE_TYPE_COMPRESSED: + return (const char*)"ORACLE PAGE COMPRESSED"; + default: + return (const char*)"PAGE TYPE CORRUPTED"; + } +} +/****************************************************************//** +Get block size from fil node +@return block size*/ +ulint +fil_node_get_block_size( +/*====================*/ + fil_node_t* node) /*!< in: Node where to get block + size */ +{ + return (node->file_block_size); +} diff --git a/storage/innobase/fil/fil0pagecompress.cc b/storage/innobase/fil/fil0pagecompress.cc new file mode 100644 index 00000000000..2b0196c9017 --- /dev/null +++ b/storage/innobase/fil/fil0pagecompress.cc @@ -0,0 +1,740 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014, SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file fil/fil0pagecompress.cc +Implementation for page compressed file spaces. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#include "fil0fil.h" +#include "fil0pagecompress.h" + +#include <debug_sync.h> +#include <my_dbug.h> + +#include "mem0mem.h" +#include "hash0hash.h" +#include "os0file.h" +#include "mach0data.h" +#include "buf0buf.h" +#include "buf0flu.h" +#include "log0recv.h" +#include "fsp0fsp.h" +#include "srv0srv.h" +#include "srv0start.h" +#include "mtr0mtr.h" +#include "mtr0log.h" +#include "dict0dict.h" +#include "page0page.h" +#include "page0zip.h" +#include "trx0sys.h" +#include "row0mysql.h" +#ifndef UNIV_HOTBACKUP +# include "buf0lru.h" +# include "ibuf0ibuf.h" +# include "sync0sync.h" +# include "os0sync.h" +#else /* !UNIV_HOTBACKUP */ +# include "srv0srv.h" +static ulint srv_data_read, srv_data_written; +#endif /* !UNIV_HOTBACKUP */ +#include "zlib.h" +#ifdef __linux__ +#include <linux/fs.h> +#include <sys/ioctl.h> +#include <fcntl.h> +#endif +#include "row0mysql.h" +#ifdef HAVE_LZ4 +#include "lz4.h" +#endif +#ifdef HAVE_LZO +#include "lzo/lzo1x.h" +#endif +#ifdef HAVE_LZMA +#include "lzma.h" +#endif +#ifdef HAVE_BZIP2 +#include "bzlib.h" +#endif + +/* Used for debugging */ +//#define UNIV_PAGECOMPRESS_DEBUG 1 + +/****************************************************************//** +For page compressed pages decompress the page after actual read +operation. 
*/ +static +void +fil_decompress_page_2( +/*==================*/ + byte* page_buf, /*!< out: destination buffer for + uncompressed data */ + byte* buf, /*!< in: source compressed data */ + ulong len, /*!< in: length of output buffer.*/ + ulint* write_size) /*!< in/out: Actual payload size of + the compressed data. */ +{ + ulint page_type = mach_read_from_2(buf + FIL_PAGE_TYPE); + + if (page_type != FIL_PAGE_TYPE_COMPRESSED) { + /* It is not a compressed page */ + return; + } + + byte* ptr = buf + FIL_PAGE_DATA; + ulint version = mach_read_from_1(buf + FIL_PAGE_VERSION); + int err = 0; + + ut_a(version == 1); + + /* Read the original page type, before we compressed the data. */ + page_type = mach_read_from_2(buf + FIL_PAGE_ORIGINAL_TYPE_V1); + + ulint original_len = mach_read_from_2(buf + FIL_PAGE_ORIGINAL_SIZE_V1); + + if (original_len < UNIV_PAGE_SIZE_MIN - (FIL_PAGE_DATA + 8) + || original_len > UNIV_PAGE_SIZE_MAX - FIL_PAGE_DATA + || len < original_len + FIL_PAGE_DATA) { + fprintf(stderr, + "InnoDB: Corruption: We try to uncompress corrupted page\n" + "InnoDB: Original len %lu len %lu.\n", + original_len, len); + + fflush(stderr); + ut_error; + + } + + ulint algorithm = mach_read_from_1(buf + FIL_PAGE_ALGORITHM_V1); + + switch(algorithm) { + case PAGE_ZLIB_ALGORITHM: { + + fprintf(stderr, "InnoDB: [Note]: zlib\n"); + + err = uncompress(page_buf, &len, ptr, original_len); + /* If uncompress fails it means that page is corrupted */ + if (err != Z_OK) { + + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but uncompress failed with error %d.\n" + "InnoDB: size %lu len %lu\n", + err, original_len, len); + + fflush(stderr); + + ut_error; + } + + break; + } +#ifdef HAVE_LZ4 + case PAGE_LZ4_ALGORITHM: { + fprintf(stderr, "InnoDB: [Note]: lz4\n"); + err = LZ4_decompress_fast( + (const char*) ptr, (char*) (page_buf), original_len); + + if (err < 0) { + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but 
decompression read only %d bytes.\n" + "InnoDB: size %lu len %lu\n", + err, original_len, len); + fflush(stderr); + + ut_error; + } + break; + } +#endif /* HAVE_LZ4 */ + +#ifdef HAVE_LZMA + case PAGE_LZMA_ALGORITHM: { + + lzma_ret ret; + size_t src_pos = 0; + size_t dst_pos = 0; + uint64_t memlimit = UINT64_MAX; + + fprintf(stderr, "InnoDB: [Note]: lzma\n"); + ret = lzma_stream_buffer_decode( + &memlimit, + 0, + NULL, + ptr, + &src_pos, + original_len, + (page_buf), + &dst_pos, + len); + + + if (ret != LZMA_OK || (dst_pos <= 0 || dst_pos > len)) { + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but decompression read only %ld bytes.\n" + "InnoDB: size %lu len %lu\n", + dst_pos, original_len, len); + fflush(stderr); + + ut_error; + } + + break; + } +#endif /* HAVE_LZMA */ + +#ifdef HAVE_LZO + case PAGE_LZO_ALGORITHM: { + ulint olen = 0; + fprintf(stderr, "InnoDB: [Note]: lzo \n"); + err = lzo1x_decompress((const unsigned char *)ptr, + original_len,(unsigned char *)(page_buf), &olen, NULL); + + if (err != LZO_E_OK || (olen == 0 || olen > UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but decompression read only %ld bytes.\n" + "InnoDB: size %lu len %lu\n", + olen, original_len, len); + fflush(stderr); + + ut_error; + } + break; + } +#endif /* HAVE_LZO */ + + default: + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but compression algorithm %s\n" + "InnoDB: is not known.\n" + ,fil_get_compression_alg_name(algorithm)); + + fflush(stderr); + ut_error; + break; + } + + /* Leave the header alone */ + memmove(buf+FIL_PAGE_DATA, page_buf, original_len); + + mach_write_to_2(buf + FIL_PAGE_TYPE, page_type); + + ut_ad(memcmp(buf + FIL_PAGE_LSN + 4, + buf + (original_len + FIL_PAGE_DATA) + - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4) == 0); +} + +/****************************************************************//** +For page compressed pages compress the page 
before actual write +operation. +@return compressed page to be written*/ +byte* +fil_compress_page( +/*==============*/ + ulint space_id, /*!< in: tablespace id of the + table. */ + byte* buf, /*!< in: buffer from which to write; in aio + this must be appropriately aligned */ + byte* out_buf, /*!< out: compressed buffer */ + ulint len, /*!< in: length of input buffer.*/ + ulint compression_level, /* in: compression level */ + ulint block_size, /*!< in: block size */ + ulint* out_len, /*!< out: actual length of compressed + page */ + byte* lzo_mem) /*!< in: temporal memory used by LZO */ +{ + int err = Z_OK; + int level = 0; + ulint header_len = FIL_PAGE_DATA + FIL_PAGE_COMPRESSED_SIZE; + ulint write_size=0; + ulint comp_method = innodb_compression_algorithm; /* Cache to avoid + change during + function execution */ + + ut_ad(buf); + ut_ad(out_buf); + ut_ad(len); + ut_ad(out_len); + + level = compression_level; + ut_ad(fil_space_is_page_compressed(space_id)); + + fil_system_enter(); + fil_space_t* space = fil_space_get_by_id(space_id); + fil_system_exit(); + + /* If no compression level was provided to this table, use system + default level */ + if (level == 0) { + level = page_zip_level; + } + +#ifdef UNIV_PAGECOMPRESS_DEBUG + fprintf(stderr, + "InnoDB: Note: Preparing for compress for space %lu name %s len %lu\n", + space_id, fil_space_name(space), len); +#endif /* UNIV_PAGECOMPRESS_DEBUG */ + + write_size = UNIV_PAGE_SIZE - header_len; + + switch(comp_method) { +#ifdef HAVE_LZ4 + case PAGE_LZ4_ALGORITHM: + err = LZ4_compress_limitedOutput((const char *)buf, + (char *)out_buf+header_len, len, write_size); + write_size = err; + + if (err == 0) { + /* If error we leave the actual page as it was */ + + fprintf(stderr, + "InnoDB: Warning: Compression failed for space %lu name %s len %lu rt %d write %lu\n", + space_id, fil_space_name(space), len, err, write_size); + + srv_stats.pages_page_compression_error.inc(); + *out_len = len; + return (buf); + } + break; +#endif 
/* HAVE_LZ4 */ +#ifdef HAVE_LZO + case PAGE_LZO_ALGORITHM: + err = lzo1x_1_15_compress( + buf, len, out_buf+header_len, &write_size, lzo_mem); + + if (err != LZO_E_OK || write_size > UNIV_PAGE_SIZE-header_len) { + fprintf(stderr, + "InnoDB: Warning: Compression failed for space %lu name %s len %lu err %d write_size %lu\n", + space_id, fil_space_name(space), len, err, write_size); + srv_stats.pages_page_compression_error.inc(); + *out_len = len; + return (buf); + } + + break; +#endif /* HAVE_LZO */ +#ifdef HAVE_LZMA + case PAGE_LZMA_ALGORITHM: { + size_t out_pos=0; + + err = lzma_easy_buffer_encode( + compression_level, + LZMA_CHECK_NONE, + NULL, /* No custom allocator, use malloc/free */ + reinterpret_cast<uint8_t*>(buf), + len, + reinterpret_cast<uint8_t*>(out_buf + header_len), + &out_pos, + (size_t)write_size); + + if (err != LZMA_OK || out_pos > UNIV_PAGE_SIZE-header_len) { + fprintf(stderr, + "InnoDB: Warning: Compression failed for space %lu name %s len %lu err %d write_size %lu\n", + space_id, fil_space_name(space), len, err, out_pos); + + srv_stats.pages_page_compression_error.inc(); + *out_len = len; + return (buf); + } + + write_size = out_pos; + + break; + } +#endif /* HAVE_LZMA */ + +#ifdef HAVE_BZIP2 + case PAGE_BZIP2_ALGORITHM: { + + err = BZ2_bzBuffToBuffCompress( + (char *)(out_buf + header_len), + (unsigned int *)&write_size, + (char *)buf, + len, + 1, + 0, + 0); + + if (err != BZ_OK || write_size > UNIV_PAGE_SIZE-header_len) { + fprintf(stderr, + "InnoDB: Warning: Compression failed for space %lu name %s len %lu err %d write_size %lu\n", + space_id, fil_space_name(space), len, err, write_size); + srv_stats.pages_page_compression_error.inc(); + *out_len = len; + return (buf); + } + break; + } +#endif /* HAVE_BZIP2 */ + + case PAGE_ZLIB_ALGORITHM: + err = compress2(out_buf+header_len, (ulong*)&write_size, buf, len, level); + + if (err != Z_OK) { + /* If error we leave the actual page as it was */ + + fprintf(stderr, + "InnoDB: Warning: Compression 
failed for space %lu name %s len %lu rt %d write %lu\n", + space_id, fil_space_name(space), len, err, write_size); + + srv_stats.pages_page_compression_error.inc(); + *out_len = len; + return (buf); + } + break; + + case PAGE_UNCOMPRESSED: + *out_len = len; + return (buf); + break; + + default: + ut_error; + break; + } + + /* Set up the page header */ + memcpy(out_buf, buf, FIL_PAGE_DATA); + /* Set up the checksum */ + mach_write_to_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM, BUF_NO_CHECKSUM_MAGIC); + /* Set up the correct page type */ + mach_write_to_2(out_buf+FIL_PAGE_TYPE, FIL_PAGE_PAGE_COMPRESSED); + /* Set up the flush lsn to be compression algorithm */ + mach_write_to_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN, comp_method); + /* Set up the actual payload lenght */ + mach_write_to_2(out_buf+FIL_PAGE_DATA, write_size); + +#ifdef UNIV_DEBUG + /* Verify */ + ut_ad(fil_page_is_compressed(out_buf)); + ut_ad(mach_read_from_4(out_buf+FIL_PAGE_SPACE_OR_CHKSUM) == BUF_NO_CHECKSUM_MAGIC); + ut_ad(mach_read_from_2(out_buf+FIL_PAGE_DATA) == write_size); + ut_ad(mach_read_from_8(out_buf+FIL_PAGE_FILE_FLUSH_LSN) == (ulint)comp_method); + + /* Verify that page can be decompressed */ + { + byte *comp_page; + byte *uncomp_page; + + comp_page = static_cast<byte *>(ut_malloc(UNIV_PAGE_SIZE*2)); + uncomp_page = static_cast<byte *>(ut_malloc(UNIV_PAGE_SIZE*2)); + memcpy(comp_page, out_buf, UNIV_PAGE_SIZE); + + fil_decompress_page(uncomp_page, comp_page, len, NULL); + if(buf_page_is_corrupted(false, uncomp_page, 0)) { + buf_page_print(uncomp_page, 0, BUF_PAGE_PRINT_NO_CRASH); + ut_error; + } + ut_free(comp_page); + ut_free(uncomp_page); + } +#endif /* UNIV_DEBUG */ + + write_size+=header_len; + + /* Actual write needs to be alligned on block size */ + if (write_size % block_size) { +#ifdef UNIV_DEBUG + size_t tmp = write_size; + ut_a(block_size > 0); +#endif + write_size = (size_t)ut_uint64_align_up((ib_uint64_t)write_size, block_size); +#ifdef UNIV_DEBUG + ut_a(write_size > 0 && ((write_size % 
block_size) == 0)); + ut_a(write_size >= tmp); +#endif + } + +#ifdef UNIV_PAGECOMPRESS_DEBUG + fprintf(stderr, + "InnoDB: Note: Compression succeeded for space %lu name %s len %lu out_len %lu\n", + space_id, fil_space_name(space), len, write_size); +#endif /* UNIV_PAGECOMPRESS_DEBUG */ + + + srv_stats.page_compression_saved.add((len - write_size)); + srv_stats.pages_page_compressed.inc(); + +#if defined (__linux__) && (!defined(FALLOC_FL_PUNCH_HOLE) || !defined (FALLOC_FL_KEEP_SIZE)) + if (srv_use_trim) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: [Warning] System does not support FALLOC_FL_PUNCH_HOLE || FALLOC_FL_KEEP_SIZE.\n" + " InnoDB: Disabling trim for now.\n"); + srv_use_trim = FALSE; + } +#endif + + if (!srv_use_trim) { + /* If persistent trims are not used we always write full + page */ + write_size = len; + } + + *out_len = write_size; + + return(out_buf); + +} + +/****************************************************************//** +For page compressed pages decompress the page after actual read +operation. */ +void +fil_decompress_page( +/*================*/ + byte* page_buf, /*!< in: preallocated buffer or NULL */ + byte* buf, /*!< out: buffer from which to read; in aio + this must be appropriately aligned */ + ulong len, /*!< in: length of output buffer.*/ + ulint* write_size) /*!< in/out: Actual payload size of + the compressed data. 
*/ +{ + int err = 0; + ulint actual_size = 0; + ulint compression_alg = 0; + byte *in_buf; + ulint ptype; + + ut_ad(buf); + ut_ad(len); + + ptype = mach_read_from_2(buf+FIL_PAGE_TYPE); + + /* Do not try to uncompressed pages that are not compressed */ + if (ptype != FIL_PAGE_PAGE_COMPRESSED && ptype != FIL_PAGE_TYPE_COMPRESSED) { + return; + } + + // If no buffer was given, we need to allocate temporal buffer + if (page_buf == NULL) { +#ifdef UNIV_PAGECOMPRESS_DEBUG + fprintf(stderr, + "InnoDB: Note: FIL: Compression buffer not given, allocating...\n"); +#endif /* UNIV_PAGECOMPRESS_DEBUG */ + in_buf = static_cast<byte *>(ut_malloc(UNIV_PAGE_SIZE*2)); + } else { + in_buf = page_buf; + } + + if (ptype == FIL_PAGE_TYPE_COMPRESSED) { + + fil_decompress_page_2(in_buf, buf, len, write_size); + // Need to free temporal buffer if no buffer was given + if (page_buf == NULL) { + ut_free(in_buf); + } + return; + } + + /* Before actual decompress, make sure that page type is correct */ + + if (mach_read_from_4(buf+FIL_PAGE_SPACE_OR_CHKSUM) != BUF_NO_CHECKSUM_MAGIC || + mach_read_from_2(buf+FIL_PAGE_TYPE) != FIL_PAGE_PAGE_COMPRESSED) { + fprintf(stderr, + "InnoDB: Corruption: We try to uncompress corrupted page\n" + "InnoDB: CRC %lu type %lu.\n" + "InnoDB: len %lu\n", + mach_read_from_4(buf+FIL_PAGE_SPACE_OR_CHKSUM), + mach_read_from_2(buf+FIL_PAGE_TYPE), len); + + fflush(stderr); + ut_error; + } + + /* Get compression algorithm */ + compression_alg = mach_read_from_8(buf+FIL_PAGE_FILE_FLUSH_LSN); + + /* Get the actual size of compressed page */ + actual_size = mach_read_from_2(buf+FIL_PAGE_DATA); + /* Check if payload size is corrupted */ + if (actual_size == 0 || actual_size > UNIV_PAGE_SIZE) { + fprintf(stderr, + "InnoDB: Corruption: We try to uncompress corrupted page\n" + "InnoDB: actual size %lu compression %s\n", + actual_size, fil_get_compression_alg_name(compression_alg)); + fflush(stderr); + ut_error; + } + + /* Store actual payload size of the compressed data. 
This pointer + points to buffer pool. */ + if (write_size) { + *write_size = actual_size; + } + +#ifdef UNIV_PAGECOMPRESS_DEBUG + fprintf(stderr, + "InnoDB: Note: Preparing for decompress for len %lu\n", + actual_size); +#endif /* UNIV_PAGECOMPRESS_DEBUG */ + + + switch(compression_alg) { + case PAGE_ZLIB_ALGORITHM: + err= uncompress(in_buf, &len, buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, (unsigned long)actual_size); + + /* If uncompress fails it means that page is corrupted */ + if (err != Z_OK) { + + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but uncompress failed with error %d.\n" + "InnoDB: size %lu len %lu\n", + err, actual_size, len); + + fflush(stderr); + + ut_error; + } + break; + +#ifdef HAVE_LZ4 + case PAGE_LZ4_ALGORITHM: + err = LZ4_decompress_fast((const char *)buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, (char *)in_buf, len); + + if (err != (int)actual_size) { + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but decompression read only %d bytes.\n" + "InnoDB: size %lu len %lu\n", + err, actual_size, len); + fflush(stderr); + + ut_error; + } + break; +#endif /* HAVE_LZ4 */ +#ifdef HAVE_LZO + case PAGE_LZO_ALGORITHM: + { + ulint olen=0; + err = lzo1x_decompress((const unsigned char *)buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, + actual_size,(unsigned char *)in_buf, &olen, NULL); + + if (err != LZO_E_OK || (olen == 0 || olen > UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but decompression read only %ld bytes.\n" + "InnoDB: size %lu len %lu\n", + olen, actual_size, len); + fflush(stderr); + + ut_error; + } + break; + } +#endif /* HAVE_LZO */ +#ifdef HAVE_LZMA + case PAGE_LZMA_ALGORITHM: { + + lzma_ret ret; + size_t src_pos = 0; + size_t dst_pos = 0; + uint64_t memlimit = UINT64_MAX; + + ret = lzma_stream_buffer_decode( + &memlimit, + 0, + NULL, + buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE, + &src_pos, + actual_size, + in_buf, 
+ &dst_pos, + len); + + + if (ret != LZMA_OK || (dst_pos == 0 || dst_pos > UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but decompression read only %ld bytes.\n" + "InnoDB: size %lu len %lu\n", + dst_pos, actual_size, len); + fflush(stderr); + + ut_error; + } + + break; + } +#endif /* HAVE_LZMA */ +#ifdef HAVE_BZIP2 + case PAGE_BZIP2_ALGORITHM: { + unsigned int dst_pos = UNIV_PAGE_SIZE; + + err = BZ2_bzBuffToBuffDecompress( + (char *)in_buf, + &dst_pos, + (char *)(buf+FIL_PAGE_DATA+FIL_PAGE_COMPRESSED_SIZE), + actual_size, + 1, + 0); + + if (err != BZ_OK || (dst_pos == 0 || dst_pos > UNIV_PAGE_SIZE)) { + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but decompression read only %du bytes.\n" + "InnoDB: size %lu len %lu err %d\n", + dst_pos, actual_size, len, err); + fflush(stderr); + + ut_error; + } + break; + } +#endif /* HAVE_BZIP2 */ + + default: + fprintf(stderr, + "InnoDB: Corruption: Page is marked as compressed\n" + "InnoDB: but compression algorithm %s\n" + "InnoDB: is not known.\n" + ,fil_get_compression_alg_name(compression_alg)); + + fflush(stderr); + ut_error; + break; + } + +#ifdef UNIV_PAGECOMPRESS_DEBUG + fprintf(stderr, + "InnoDB: Note: Decompression succeeded for len %lu \n", + len); +#endif /* UNIV_PAGECOMPRESS_DEBUG */ + + srv_stats.pages_page_decompressed.inc(); + + /* Copy the uncompressed page to the buffer pool, not + really any other options. */ + memcpy(buf, in_buf, len); + + // Need to free temporal buffer if no buffer was given + if (page_buf == NULL) { + ut_free(in_buf); + } +} + + diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 3c52a35b2b4..2d6b9881bd3 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -4,7 +4,7 @@ Copyright (c) 2000, 2014, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, 2009 Google Inc. 
Copyright (c) 2009, Percona Inc. Copyright (c) 2012, Facebook Inc. -Copyright (c) 2013, 2014 SkySQL Ab. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -57,6 +57,7 @@ this program; if not, write to the Free Software Foundation, Inc., #include "buf0flu.h" #include "buf0dblwr.h" #include "btr0sea.h" +#include "btr0defragment.h" #include "os0file.h" #include "os0thread.h" #include "srv0start.h" @@ -65,7 +66,6 @@ this program; if not, write to the Free Software Foundation, Inc., #include "trx0trx.h" #include "trx0sys.h" -#include "mtr0mtr.h" #include "rem0types.h" #include "row0ins.h" #include "row0mysql.h" @@ -86,6 +86,7 @@ this program; if not, write to the Free Software Foundation, Inc., #include "dict0stats_bg.h" #include "ha_prototypes.h" #include "ut0mem.h" +#include "ut0timer.h" #include "ibuf0ibuf.h" #include "dict0dict.h" #include "srv0mon.h" @@ -101,6 +102,7 @@ this program; if not, write to the Free Software Foundation, Inc., #endif /* UNIV_DEBUG */ #include "fts0priv.h" #include "page0zip.h" +#include "fil0pagecompress.h" #define thd_get_trx_isolation(X) ((enum_tx_isolation)thd_tx_isolation(X)) @@ -112,10 +114,40 @@ this program; if not, write to the Free Software Foundation, Inc., #include "ha_innodb.h" #include "i_s.h" +#include <mysql/plugin.h> +#include <mysql/service_wsrep.h> + # ifndef MYSQL_PLUGIN_IMPORT # define MYSQL_PLUGIN_IMPORT /* nothing */ # endif /* MYSQL_PLUGIN_IMPORT */ +#ifdef WITH_WSREP +#include "dict0priv.h" +#include "../storage/innobase/include/ut0byte.h" +#include <mysql/service_md5.h> + +class binlog_trx_data; +extern handlerton *binlog_hton; + +extern MYSQL_PLUGIN_IMPORT MYSQL_BIN_LOG mysql_bin_log; + +static inline wsrep_ws_handle_t* +wsrep_ws_handle(THD* thd, const trx_t* trx) { + return wsrep_ws_handle_for_trx(wsrep_thd_ws_handle(thd), + (wsrep_trx_id_t)trx->id); 
+} + +extern TC_LOG* tc_log; +extern void wsrep_cleanup_transaction(THD *thd); +static int +wsrep_abort_transaction(handlerton* hton, THD *bf_thd, THD *victim_thd, + my_bool signal); +static void +wsrep_fake_trx_id(handlerton* hton, THD *thd); +static int innobase_wsrep_set_checkpoint(handlerton* hton, const XID* xid); +static int innobase_wsrep_get_checkpoint(handlerton* hton, XID* xid); +#endif /* WITH_WSREP */ + /** to protect innobase_open_files */ static mysql_mutex_t innobase_share_mutex; /** to force correct commit order in binlog */ @@ -224,12 +256,12 @@ static TYPELIB innodb_stats_method_typelib = { /** Possible values for system variable "innodb_checksum_algorithm". */ static const char* innodb_checksum_algorithm_names[] = { - "crc32", - "strict_crc32", - "innodb", - "strict_innodb", - "none", - "strict_none", + "CRC32", + "STRICT_CRC32", + "INNODB", + "STRICT_INNODB", + "NONE", + "STRICT_NONE", NullS }; @@ -501,6 +533,28 @@ ib_cb_t innodb_api_cb[] = { (ib_cb_t) ib_cursor_stmt_begin }; +/** + Structure for CREATE TABLE options (table options). + It needs to be called ha_table_option_struct. + + The option values can be specified in the CREATE TABLE at the end: + CREATE TABLE ( ... ) *here* +*/ + +ha_create_table_option innodb_table_option_list[]= +{ + /* With this option user can enable page compression feature for the + table */ + HA_TOPTION_BOOL("PAGE_COMPRESSED", page_compressed, 0), + /* With this option user can set zip compression level for page + compression for this table*/ + HA_TOPTION_NUMBER("PAGE_COMPRESSION_LEVEL", page_compression_level, ULINT_UNDEFINED, 0, 9, 1), + /* With this option user can enable atomic writes feature for this table */ + HA_TOPTION_ENUM("ATOMIC_WRITES", atomic_writes, "DEFAULT,ON,OFF", 0), + HA_TOPTION_END +}; + + /*************************************************************//** Check whether valid argument given to innodb_ft_*_stopword_table. This function is registered as a callback with MySQL. 
@@ -536,7 +590,27 @@ static inline ulint innobase_map_isolation_level( /*=========================*/ - enum_tx_isolation iso); /*!< in: MySQL isolation level code */ + enum_tx_isolation iso); /*!< in: MySQL isolation level code + */ + +/*************************************************************//** +Check for a valid value of innobase_compression_algorithm. +@return 0 for valid innodb_compression_algorithm. */ +static +int +innodb_compression_algorithm_validate( +/*==================================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to system + variable */ + void* save, /*!< out: immediate result + for update function */ + struct st_mysql_value* value); /*!< in: incoming string */ + +static ibool innodb_have_lzo=IF_LZO(1, 0); +static ibool innodb_have_lz4=IF_LZ4(1, 0); +static ibool innodb_have_lzma=IF_LZMA(1, 0); +static ibool innodb_have_bzip2=IF_BZIP2(1, 0); static const char innobase_hton_name[]= "InnoDB"; @@ -689,6 +763,68 @@ static SHOW_VAR innodb_status_variables[]= { {"purge_view_trx_id_age", (char*) &export_vars.innodb_purge_view_trx_id_age, SHOW_LONG}, #endif /* UNIV_DEBUG */ + /* Status variables for page compression */ + {"page_compression_saved", + (char*) &export_vars.innodb_page_compression_saved, SHOW_LONGLONG}, + {"page_compression_trim_sect512", + (char*) &export_vars.innodb_page_compression_trim_sect512, SHOW_LONGLONG}, + {"page_compression_trim_sect1024", + (char*) &export_vars.innodb_page_compression_trim_sect1024, SHOW_LONGLONG}, + {"page_compression_trim_sect2048", + (char*) &export_vars.innodb_page_compression_trim_sect2048, SHOW_LONGLONG}, + {"page_compression_trim_sect4096", + (char*) &export_vars.innodb_page_compression_trim_sect4096, SHOW_LONGLONG}, + {"page_compression_trim_sect8192", + (char*) &export_vars.innodb_page_compression_trim_sect8192, SHOW_LONGLONG}, + {"page_compression_trim_sect16384", + (char*) &export_vars.innodb_page_compression_trim_sect16384, SHOW_LONGLONG}, + 
{"page_compression_trim_sect32768", + (char*) &export_vars.innodb_page_compression_trim_sect32768, SHOW_LONGLONG}, + {"num_index_pages_written", + (char*) &export_vars.innodb_index_pages_written, SHOW_LONGLONG}, + {"num_non_index_pages_written", + (char*) &export_vars.innodb_non_index_pages_written, SHOW_LONGLONG}, + {"num_pages_page_compressed", + (char*) &export_vars.innodb_pages_page_compressed, SHOW_LONGLONG}, + {"num_page_compressed_trim_op", + (char*) &export_vars.innodb_page_compressed_trim_op, SHOW_LONGLONG}, + {"num_page_compressed_trim_op_saved", + (char*) &export_vars.innodb_page_compressed_trim_op_saved, SHOW_LONGLONG}, + {"num_pages_page_decompressed", + (char*) &export_vars.innodb_pages_page_decompressed, SHOW_LONGLONG}, + {"have_lz4", + (char*) &innodb_have_lz4, SHOW_BOOL}, + {"have_lzo", + (char*) &innodb_have_lzo, SHOW_BOOL}, + {"have_lzma", + (char*) &innodb_have_lzma, SHOW_BOOL}, + {"have_bzip2", + (char*) &innodb_have_bzip2, SHOW_BOOL}, + + /* Defragmentation */ + {"defragment_compression_failures", + (char*) &export_vars.innodb_defragment_compression_failures, SHOW_LONG}, + {"defragment_failures", + (char*) &export_vars.innodb_defragment_failures, SHOW_LONG}, + {"defragment_count", + (char*) &export_vars.innodb_defragment_count, SHOW_LONG}, + + /* Online alter table status variables */ + {"onlineddl_rowlog_rows", + (char*) &export_vars.innodb_onlineddl_rowlog_rows, SHOW_LONG}, + {"onlineddl_rowlog_pct_used", + (char*) &export_vars.innodb_onlineddl_rowlog_pct_used, SHOW_LONG}, + {"onlineddl_pct_progress", + (char*) &export_vars.innodb_onlineddl_pct_progress, SHOW_LONG}, + + /* Times secondary index lookup triggered cluster lookup and + times prefix optimization avoided triggering cluster lookup */ + {"secondary_index_triggered_cluster_reads", + (char*) &export_vars.innodb_sec_rec_cluster_reads, SHOW_LONG}, + {"secondary_index_triggered_cluster_reads_avoided", + (char*) &export_vars.innodb_sec_rec_cluster_reads_avoided, SHOW_LONG}, + + {NullS, 
NullS, SHOW_LONG} }; @@ -1190,6 +1326,10 @@ innobase_srv_conc_enter_innodb( /*===========================*/ trx_t* trx) /*!< in: transaction handle */ { +#ifdef WITH_WSREP + if (wsrep_on(trx->mysql_thd) && + wsrep_thd_is_BF(trx->mysql_thd, FALSE)) return; +#endif /* WITH_WSREP */ if (srv_thread_concurrency) { if (trx->n_tickets_to_enter_innodb > 0) { @@ -1224,6 +1364,10 @@ innobase_srv_conc_exit_innodb( #ifdef UNIV_SYNC_DEBUG ut_ad(!sync_thread_levels_nonempty_trx(trx->has_search_latch)); #endif /* UNIV_SYNC_DEBUG */ +#ifdef WITH_WSREP + if (wsrep_on(trx->mysql_thd) && + wsrep_thd_is_BF(trx->mysql_thd, FALSE)) return; +#endif /* WITH_WSREP */ /* This is to avoid making an unnecessary function call. */ if (trx->declared_to_be_inside_innodb @@ -1344,6 +1488,15 @@ thd_to_trx( { return(*(trx_t**) thd_ha_data(thd, innodb_hton_ptr)); } +#ifdef WITH_WSREP +ulonglong +thd_to_trx_id( +/*=======*/ + THD* thd) /*!< in: MySQL thread */ +{ + return(thd_to_trx(thd)->id); +} +#endif /* WITH_WSREP */ /********************************************************************//** Call this function when mysqld passes control to the client. That is to @@ -1829,6 +1982,9 @@ int innobase_mysql_tmpfile(void) /*========================*/ { +#ifdef WITH_INNODB_DISALLOW_WRITES + os_event_wait(srv_allow_writes_event); +#endif /* WITH_INNODB_DISALLOW_WRITES */ int fd2 = -1; File fd; @@ -2285,9 +2441,11 @@ ha_innobase::ha_innobase( HA_BINLOG_ROW_CAPABLE | HA_CAN_GEOMETRY | HA_PARTIAL_COLUMN_READ | HA_TABLE_SCAN_ON_INDEX | HA_CAN_FULLTEXT | + (srv_force_primary_key ? 
HA_REQUIRE_PRIMARY_KEY : 0 ) | HA_CAN_FULLTEXT_EXT | HA_CAN_EXPORT), start_of_scan(0), - num_write_row(0) + num_write_row(0), + ha_partition_stats(NULL) {} /*********************************************************************//** @@ -2907,11 +3065,19 @@ innobase_init( innobase_hton->release_temporary_latches = innobase_release_temporary_latches; +#ifdef WITH_WSREP + innobase_hton->abort_transaction=wsrep_abort_transaction; + innobase_hton->set_checkpoint=innobase_wsrep_set_checkpoint; + innobase_hton->get_checkpoint=innobase_wsrep_get_checkpoint; + innobase_hton->fake_trx_id=wsrep_fake_trx_id; +#endif /* WITH_WSREP */ innobase_hton->kill_query = innobase_kill_query; if (srv_file_per_table) innobase_hton->tablefile_extensions = ha_innobase_exts; + innobase_hton->table_options = innodb_table_option_list; + ut_a(DATA_MYSQL_TRUE_VARCHAR == (ulint)MYSQL_TYPE_VARCHAR); #ifndef DBUG_OFF @@ -2946,6 +3112,58 @@ innobase_init( } } + if (UNIV_PAGE_SIZE != UNIV_PAGE_SIZE_DEF) { + fprintf(stderr, + "InnoDB: Warning: innodb_page_size has been " + "changed from default value %d to %ldd. (###EXPERIMENTAL### " + "operation)\n", UNIV_PAGE_SIZE_DEF, UNIV_PAGE_SIZE); + + /* There is hang on buffer pool when trying to get a new + page if buffer pool size is too small for large page sizes */ + if (innobase_buffer_pool_size < (24 * 1024 * 1024)) { + fprintf(stderr, "InnoDB: Error: innobase_page_size %lu requires " + "innodb_buffer_pool_size > 24M current %lld", + UNIV_PAGE_SIZE, innobase_buffer_pool_size); + goto error; + } + } + +#ifndef HAVE_LZ4 + if (innodb_compression_algorithm == PAGE_LZ4_ALGORITHM) { + sql_print_error("InnoDB: innodb_compression_algorithm = %lu unsupported.\n" + "InnoDB: liblz4 is not installed. \n", + innodb_compression_algorithm); + goto error; + } +#endif + +#ifndef HAVE_LZO + if (innodb_compression_algorithm == PAGE_LZO_ALGORITHM) { + sql_print_error("InnoDB: innodb_compression_algorithm = %lu unsupported.\n" + "InnoDB: liblzo is not installed. 
\n", + innodb_compression_algorithm); + goto error; + } +#endif + +#ifndef HAVE_LZMA + if (innodb_compression_algorithm == PAGE_LZMA_ALGORITHM) { + sql_print_error("InnoDB: innodb_compression_algorithm = %lu unsupported.\n" + "InnoDB: liblzma is not installed. \n", + innodb_compression_algorithm); + goto error; + } +#endif + +#ifndef HAVE_BZIP2 + if (innodb_compression_algorithm == PAGE_BZIP2_ALGORITHM) { + sql_print_error("InnoDB: innodb_compression_algorithm = %lu unsupported.\n" + "InnoDB: libbz2 is not installed. \n", + innodb_compression_algorithm); + goto error; + } +#endif + os_innodb_umask = (ulint) my_umask; /* First calculate the default path for innodb_data_home_dir etc., @@ -3519,10 +3737,30 @@ innobase_commit_low( /*================*/ trx_t* trx) /*!< in: transaction handle */ { +#ifdef WITH_WSREP + THD* thd = (THD*)trx->mysql_thd; + const char* tmp = 0; + if (wsrep_on(thd)) { +#ifdef WSREP_PROC_INFO + char info[64]; + info[sizeof(info) - 1] = '\0'; + snprintf(info, sizeof(info) - 1, + "innobase_commit_low():trx_commit_for_mysql(%lld)", + (long long) wsrep_thd_trx_seqno(thd)); + tmp = thd_proc_info(thd, info); + +#else + tmp = thd_proc_info(thd, "innobase_commit_low()"); +#endif /* WSREP_PROC_INFO */ + } +#endif /* WITH_WSREP */ if (trx_is_started(trx)) { trx_commit_for_mysql(trx); } +#ifdef WITH_WSREP + if (wsrep_on(thd)) { thd_proc_info(thd, tmp); } +#endif /* WITH_WSREP */ } /*****************************************************************//** @@ -4244,6 +4482,20 @@ innobase_kill_query( DBUG_ENTER("innobase_kill_query"); DBUG_ASSERT(hton == innodb_hton_ptr); +#ifdef WITH_WSREP + wsrep_thd_LOCK(thd); + if (wsrep_thd_get_conflict_state(thd) != NO_CONFLICT) { + /* if victim has been signaled by BF thread and/or aborting + is already progressing, following query aborting is not necessary + any more. 
+ Also, BF thread should own trx mutex for the victim, which would + conflict with trx_mutex_enter() below + */ + wsrep_thd_UNLOCK(thd); + DBUG_VOID_RETURN; + } + wsrep_thd_UNLOCK(thd); +#endif /* WITH_WSREP */ trx = thd_to_trx(thd); if (trx) { @@ -4251,7 +4503,7 @@ innobase_kill_query( THD *owner = trx->current_lock_mutex_owner; /* Cancel a pending lock request. */ - if (owner != cur) { + if (!owner || owner != cur) { lock_mutex_enter(); } trx_mutex_enter(trx); @@ -4259,7 +4511,7 @@ innobase_kill_query( lock_cancel_waiting_and_release(trx->lock.wait_lock); } trx_mutex_exit(trx); - if (owner != cur) { + if (!owner || owner != cur) { lock_mutex_exit(); } } @@ -4418,7 +4670,11 @@ ha_innobase::max_supported_key_length() const case 8192: return(1536); default: +#ifdef WITH_WSREP + return(3500); +#else return(3500); +#endif } } @@ -5525,6 +5781,117 @@ get_field_offset( return((uint) (field->ptr - table->record[0])); } +#ifdef WITH_WSREP +UNIV_INTERN +int +wsrep_innobase_mysql_sort( +/*===============*/ + /* out: str contains sort string */ + int mysql_type, /* in: MySQL type */ + uint charset_number, /* in: number of the charset */ + unsigned char* str, /* in: data field */ + unsigned int str_length, /* in: data field length, + not UNIV_SQL_NULL */ + unsigned int buf_length) /* in: total str buffer length */ + +{ + CHARSET_INFO* charset; + enum_field_types mysql_tp; + int ret_length = str_length; + + DBUG_ASSERT(str_length != UNIV_SQL_NULL); + + mysql_tp = (enum_field_types) mysql_type; + + switch (mysql_tp) { + + case MYSQL_TYPE_BIT: + case MYSQL_TYPE_STRING: + case MYSQL_TYPE_VAR_STRING: + case MYSQL_TYPE_TINY_BLOB: + case MYSQL_TYPE_MEDIUM_BLOB: + case MYSQL_TYPE_BLOB: + case MYSQL_TYPE_LONG_BLOB: + case MYSQL_TYPE_VARCHAR: + { + uchar tmp_str[REC_VERSION_56_MAX_INDEX_COL_LEN] = {'\0'}; + uint tmp_length = REC_VERSION_56_MAX_INDEX_COL_LEN; + + /* Use the charset number to pick the right charset struct for + the comparison. 
Since the MySQL function get_charset may be + slow before Bar removes the mutex operation there, we first + look at 2 common charsets directly. */ + + if (charset_number == default_charset_info->number) { + charset = default_charset_info; + } else if (charset_number == my_charset_latin1.number) { + charset = &my_charset_latin1; + } else { + charset = get_charset(charset_number, MYF(MY_WME)); + + if (charset == NULL) { + sql_print_error("InnoDB needs charset %lu for doing " + "a comparison, but MySQL cannot " + "find that charset.", + (ulong) charset_number); + ut_a(0); + } + } + + ut_a(str_length <= tmp_length); + memcpy(tmp_str, str, str_length); + + tmp_length = charset->coll->strnxfrm(charset, str, str_length, + str_length, tmp_str, + tmp_length, 0); + DBUG_ASSERT(tmp_length <= str_length); + if (wsrep_protocol_version < 3) { + tmp_length = charset->coll->strnxfrm( + charset, str, str_length, + str_length, tmp_str, tmp_length, 0); + DBUG_ASSERT(tmp_length <= str_length); + } else { + /* strnxfrm will expand the destination string, + protocols < 3 truncated the sorted sring + protocols >= 3 gets full sorted sring + */ + tmp_length = charset->coll->strnxfrm( + charset, str, buf_length, + str_length, tmp_str, str_length, 0); + DBUG_ASSERT(tmp_length <= buf_length); + ret_length = tmp_length; + } + + break; + } + case MYSQL_TYPE_DECIMAL : + case MYSQL_TYPE_TINY : + case MYSQL_TYPE_SHORT : + case MYSQL_TYPE_LONG : + case MYSQL_TYPE_FLOAT : + case MYSQL_TYPE_DOUBLE : + case MYSQL_TYPE_NULL : + case MYSQL_TYPE_TIMESTAMP : + case MYSQL_TYPE_LONGLONG : + case MYSQL_TYPE_INT24 : + case MYSQL_TYPE_DATE : + case MYSQL_TYPE_TIME : + case MYSQL_TYPE_DATETIME : + case MYSQL_TYPE_YEAR : + case MYSQL_TYPE_NEWDATE : + case MYSQL_TYPE_NEWDECIMAL : + case MYSQL_TYPE_ENUM : + case MYSQL_TYPE_SET : + case MYSQL_TYPE_GEOMETRY : + break; + default: + break; + } + + return ret_length; +} +#endif /* WITH_WSREP */ + /*************************************************************//** InnoDB 
uses this function to compare two data fields for which the data type is such that we must use MySQL code to compare them. NOTE that the prototype @@ -6025,11 +6392,313 @@ innobase_read_from_2_little_endian( return((uint) ((ulint)(buf[0]) + 256 * ((ulint)(buf[1])))); } +#ifdef WITH_WSREP /*******************************************************************//** Stores a key value for a row to a buffer. @return key value length as stored in buff */ UNIV_INTERN uint +wsrep_store_key_val_for_row( +/*===============================*/ + THD* thd, + TABLE* table, + uint keynr, /*!< in: key number */ + char* buff, /*!< in/out: buffer for the key value (in MySQL + format) */ + uint buff_len,/*!< in: buffer length */ + const uchar* record, + ibool* key_is_null)/*!< out: full key was null */ +{ + KEY* key_info = table->key_info + keynr; + KEY_PART_INFO* key_part = key_info->key_part; + KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts; + char* buff_start = buff; + enum_field_types mysql_type; + Field* field; + uint buff_space = buff_len; + + DBUG_ENTER("wsrep_store_key_val_for_row"); + + memset(buff, 0, buff_len); + *key_is_null = TRUE; + + for (; key_part != end; key_part++) { + + uchar sorted[REC_VERSION_56_MAX_INDEX_COL_LEN] = {'\0'}; + ibool part_is_null = FALSE; + + if (key_part->null_bit) { + if (buff_space > 0) { + if (record[key_part->null_offset] + & key_part->null_bit) { + *buff = 1; + part_is_null = TRUE; + } else { + *buff = 0; + } + buff++; + buff_space--; + } else { + fprintf (stderr, "WSREP: key truncated: %s\n", + wsrep_thd_query(thd)); + } + } + if (!part_is_null) *key_is_null = FALSE; + + field = key_part->field; + mysql_type = field->type(); + + if (mysql_type == MYSQL_TYPE_VARCHAR) { + /* >= 5.0.3 true VARCHAR */ + ulint lenlen; + ulint len; + const byte* data; + ulint key_len; + ulint true_len; + const CHARSET_INFO* cs; + int error=0; + + key_len = key_part->length; + + if (part_is_null) { + true_len = key_len + 2; + if (true_len > 
buff_space) { + fprintf (stderr, + "WSREP: key truncated: %s\n", + wsrep_thd_query(thd)); + true_len = buff_space; + } + buff += true_len; + buff_space -= true_len; + continue; + } + cs = field->charset(); + + lenlen = (ulint) + (((Field_varstring*)field)->length_bytes); + + data = row_mysql_read_true_varchar(&len, + (byte*) (record + + (ulint)get_field_offset(table, field)), + lenlen); + + true_len = len; + + /* For multi byte character sets we need to calculate + the true length of the key */ + + if (len > 0 && cs->mbmaxlen > 1) { + true_len = (ulint) cs->cset->well_formed_len(cs, + (const char *) data, + (const char *) data + len, + (uint) (key_len / + cs->mbmaxlen), + &error); + } + + /* In a column prefix index, we may need to truncate + the stored value: */ + + if (true_len > key_len) { + true_len = key_len; + } + + memcpy(sorted, data, true_len); + true_len = wsrep_innobase_mysql_sort( + mysql_type, cs->number, sorted, true_len, + REC_VERSION_56_MAX_INDEX_COL_LEN); + + if (wsrep_protocol_version > 1) { + /* Note that we always reserve the maximum possible + length of the true VARCHAR in the key value, though + only len first bytes after the 2 length bytes contain + actual data. The rest of the space was reset to zero + in the bzero() call above. */ + if (true_len > buff_space) { + fprintf (stderr, + "WSREP: key truncated: %s\n", + wsrep_thd_query(thd)); + true_len = buff_space; + } + memcpy(buff, sorted, true_len); + buff += true_len; + buff_space -= true_len; + } else { + buff += key_len; + } + } else if (mysql_type == MYSQL_TYPE_TINY_BLOB + || mysql_type == MYSQL_TYPE_MEDIUM_BLOB + || mysql_type == MYSQL_TYPE_BLOB + || mysql_type == MYSQL_TYPE_LONG_BLOB + /* MYSQL_TYPE_GEOMETRY data is treated + as BLOB data in innodb. 
*/ + || mysql_type == MYSQL_TYPE_GEOMETRY) { + + const CHARSET_INFO* cs; + ulint key_len; + ulint true_len; + int error=0; + ulint blob_len; + const byte* blob_data; + + ut_a(key_part->key_part_flag & HA_PART_KEY_SEG); + + key_len = key_part->length; + + if (part_is_null) { + true_len = key_len + 2; + if (true_len > buff_space) { + fprintf (stderr, + "WSREP: key truncated: %s\n", + wsrep_thd_query(thd)); + true_len = buff_space; + } + buff += true_len; + buff_space -= true_len; + + continue; + } + + cs = field->charset(); + + blob_data = row_mysql_read_blob_ref(&blob_len, + (byte*) (record + + (ulint)get_field_offset(table, field)), + (ulint) field->pack_length()); + + true_len = blob_len; + + ut_a(get_field_offset(table, field) + == key_part->offset); + + /* For multi byte character sets we need to calculate + the true length of the key */ + + if (blob_len > 0 && cs->mbmaxlen > 1) { + true_len = (ulint) cs->cset->well_formed_len(cs, + (const char *) blob_data, + (const char *) blob_data + + blob_len, + (uint) (key_len / + cs->mbmaxlen), + &error); + } + + /* All indexes on BLOB and TEXT are column prefix + indexes, and we may need to truncate the data to be + stored in the key value: */ + + if (true_len > key_len) { + true_len = key_len; + } + + memcpy(sorted, blob_data, true_len); + true_len = wsrep_innobase_mysql_sort( + mysql_type, cs->number, sorted, true_len, + REC_VERSION_56_MAX_INDEX_COL_LEN); + + + /* Note that we always reserve the maximum possible + length of the BLOB prefix in the key value. */ + if (wsrep_protocol_version > 1) { + if (true_len > buff_space) { + fprintf (stderr, + "WSREP: key truncated: %s\n", + wsrep_thd_query(thd)); + true_len = buff_space; + } + buff += true_len; + buff_space -= true_len; + } else { + buff += key_len; + } + memcpy(buff, sorted, true_len); + } else { + /* Here we handle all other data types except the + true VARCHAR, BLOB and TEXT. Note that the column + value we store may be also in a column prefix + index. 
*/ + + const CHARSET_INFO* cs = NULL; + ulint true_len; + ulint key_len; + const uchar* src_start; + int error=0; + enum_field_types real_type; + + key_len = key_part->length; + + if (part_is_null) { + true_len = key_len; + if (true_len > buff_space) { + fprintf (stderr, + "WSREP: key truncated: %s\n", + wsrep_thd_query(thd)); + true_len = buff_space; + } + buff += true_len; + buff_space -= true_len; + + continue; + } + + src_start = record + key_part->offset; + real_type = field->real_type(); + true_len = key_len; + + /* Character set for the field is defined only + to fields whose type is string and real field + type is not enum or set. For these fields check + if character set is multi byte. */ + + if (real_type != MYSQL_TYPE_ENUM + && real_type != MYSQL_TYPE_SET + && ( mysql_type == MYSQL_TYPE_VAR_STRING + || mysql_type == MYSQL_TYPE_STRING)) { + + cs = field->charset(); + + /* For multi byte character sets we need to + calculate the true length of the key */ + + if (key_len > 0 && cs->mbmaxlen > 1) { + + true_len = (ulint) + cs->cset->well_formed_len(cs, + (const char *)src_start, + (const char *)src_start + + key_len, + (uint) (key_len / + cs->mbmaxlen), + &error); + } + memcpy(sorted, src_start, true_len); + true_len = wsrep_innobase_mysql_sort( + mysql_type, cs->number, sorted, true_len, + REC_VERSION_56_MAX_INDEX_COL_LEN); + + if (true_len > buff_space) { + fprintf (stderr, + "WSREP: key truncated: %s\n", + wsrep_thd_query(thd)); + true_len = buff_space; + } + memcpy(buff, sorted, true_len); + } else { + memcpy(buff, src_start, true_len); + } + buff += true_len; + buff_space -= true_len; + } + } + + ut_a(buff <= buff_start + buff_len); + + DBUG_RETURN((uint)(buff - buff_start)); +} +#endif /* WITH_WSREP */ +UNIV_INTERN +uint ha_innobase::store_key_val_for_row( /*===============================*/ uint keynr, /*!< in: key number */ @@ -6411,11 +7080,20 @@ build_template_field( templ->col_no = i; templ->clust_rec_field_no = dict_col_get_clust_pos(col, 
clust_index); ut_a(templ->clust_rec_field_no != ULINT_UNDEFINED); + templ->rec_field_is_prefix = FALSE; if (dict_index_is_clust(index)) { templ->rec_field_no = templ->clust_rec_field_no; + templ->rec_prefix_field_no = ULINT_UNDEFINED; } else { - templ->rec_field_no = dict_index_get_nth_col_pos(index, i); + /* If we're in a secondary index, keep track + * of the original index position even if this + * is just a prefix index; we will use this + * later to avoid a cluster index lookup in + * some cases.*/ + + templ->rec_field_no = dict_index_get_nth_col_pos(index, i, + &templ->rec_prefix_field_no); } if (field->real_maybe_null()) { @@ -6446,6 +7124,13 @@ build_template_field( if (!dict_index_is_clust(index) && templ->rec_field_no == ULINT_UNDEFINED) { prebuilt->need_to_access_clustered = TRUE; + + if (templ->rec_prefix_field_no != ULINT_UNDEFINED) { + dict_field_t* field = dict_index_get_nth_field( + index, + templ->rec_prefix_field_no); + templ->rec_field_is_prefix = (field->prefix_len != 0); + } } if (prebuilt->mysql_prefix_len < templ->mysql_col_offset @@ -6607,7 +7292,8 @@ ha_innobase::build_template( } else { templ->icp_rec_field_no = dict_index_get_nth_col_pos( - prebuilt->index, i); + prebuilt->index, i, + NULL); } if (dict_index_is_clust(prebuilt->index)) { @@ -6637,7 +7323,7 @@ ha_innobase::build_template( templ->icp_rec_field_no = dict_index_get_nth_col_or_prefix_pos( - prebuilt->index, i, TRUE); + prebuilt->index, i, TRUE, NULL); ut_ad(templ->icp_rec_field_no != ULINT_UNDEFINED); @@ -6870,6 +7556,9 @@ ha_innobase::write_row( dberr_t error; int error_result= 0; ibool auto_inc_used= FALSE; +#ifdef WITH_WSREP + ibool auto_inc_inserted= FALSE; /* if NULL was inserted */ +#endif ulint sql_command; trx_t* trx = thd_to_trx(user_thd); @@ -6903,8 +7592,20 @@ ha_innobase::write_row( if ((sql_command == SQLCOM_ALTER_TABLE || sql_command == SQLCOM_OPTIMIZE || sql_command == SQLCOM_CREATE_INDEX +#ifdef WITH_WSREP + || (wsrep_on(user_thd) && wsrep_load_data_splitting && 
+ sql_command == SQLCOM_LOAD && + !thd_test_options( + user_thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) +#endif /* WITH_WSREP */ || sql_command == SQLCOM_DROP_INDEX) && num_write_row >= 10000) { +#ifdef WITH_WSREP + if (wsrep_on(user_thd) && sql_command == SQLCOM_LOAD) { + WSREP_DEBUG("forced trx split for LOAD: %s", + wsrep_thd_query(user_thd)); + } +#endif /* WITH_WSREP */ /* ALTER TABLE is COMMITted at every 10000 copied rows. The IX table lock for the original table has to be re-issued. As this method will be called on a temporary table where the @@ -6938,6 +7639,21 @@ no_commit: */ ; } else if (src_table == prebuilt->table) { +#ifdef WITH_WSREP + switch (wsrep_run_wsrep_commit(user_thd, 0, 1)) + { + case WSREP_TRX_OK: + break; + case WSREP_TRX_SIZE_EXCEEDED: + case WSREP_TRX_CERT_FAIL: + case WSREP_TRX_ERROR: + DBUG_RETURN(1); + } + + if (binlog_hton->commit(binlog_hton, user_thd, 1)) + DBUG_RETURN(1); + wsrep_post_commit(user_thd, TRUE); +#endif /* WITH_WSREP */ /* Source table is not in InnoDB format: no need to re-acquire locks on it. */ @@ -6948,6 +7664,21 @@ no_commit: /* We will need an IX lock on the destination table. */ prebuilt->sql_stat_start = TRUE; } else { +#ifdef WITH_WSREP + switch (wsrep_run_wsrep_commit(user_thd, 0, 1)) + { + case WSREP_TRX_OK: + break; + case WSREP_TRX_SIZE_EXCEEDED: + case WSREP_TRX_CERT_FAIL: + case WSREP_TRX_ERROR: + DBUG_RETURN(1); + } + + if (binlog_hton->commit(binlog_hton, user_thd, 1)) + DBUG_RETURN(1); + wsrep_post_commit(user_thd, TRUE); +#endif /* WITH_WSREP */ /* Ensure that there are no other table locks than LOCK_IX and LOCK_AUTO_INC on the destination table. */ @@ -6977,6 +7708,10 @@ no_commit: innobase_get_auto_increment(). */ prebuilt->autoinc_error = DB_SUCCESS; +#ifdef WITH_WSREP + auto_inc_inserted= (table->next_number_field->val_int() == 0); +#endif + if ((error_result = update_auto_increment())) { /* We don't want to mask autoinc overflow errors. 
*/ @@ -7055,6 +7790,33 @@ no_commit: case SQLCOM_REPLACE_SELECT: goto set_max_autoinc; +#ifdef WITH_WSREP + /* workaround for LP bug #355000, retrying the insert */ + case SQLCOM_INSERT: + if (wsrep_on(current_thd) && + auto_inc_inserted && + wsrep_drupal_282555_workaround && + wsrep_thd_retry_counter(current_thd) == 0 && + !thd_test_options(current_thd, + OPTION_NOT_AUTOCOMMIT | + OPTION_BEGIN)) { + WSREP_DEBUG( + "retrying insert: %s", + (*wsrep_thd_query(current_thd)) ? + wsrep_thd_query(current_thd) : + (char *)"void"); + error= DB_SUCCESS; + wsrep_thd_set_conflict_state( + current_thd, MUST_ABORT); + innobase_srv_conc_exit_innodb( + prebuilt->trx); + /* jump straight to func exit over + * later wsrep hooks */ + goto func_exit; + } + break; +#endif /* WITH_WSREP */ + default: break; } @@ -7114,6 +7876,21 @@ report_error: prebuilt->table->flags, user_thd); +#ifdef WITH_WSREP + if (!error_result && wsrep_thd_exec_mode(user_thd) == LOCAL_STATE && + wsrep_on(user_thd) && !wsrep_consistency_check(user_thd) && + (sql_command != SQLCOM_LOAD || + thd_binlog_format(user_thd) == BINLOG_FORMAT_ROW)) { + + if (wsrep_append_keys(user_thd, false, record, NULL)) { + DBUG_PRINT("wsrep", ("row key failed")); + error_result = HA_ERR_INTERNAL_ERROR; + goto wsrep_error; + } + } +wsrep_error: +#endif /* WITH_WSREP */ + if (error_result == HA_FTS_INVALID_DOCID) { my_error(HA_FTS_INVALID_DOCID, MYF(0)); } @@ -7401,6 +8178,88 @@ calc_row_difference( return(DB_SUCCESS); } +#ifdef WITH_WSREP +static +int +wsrep_calc_row_hash( +/*================*/ + byte* digest, /*!< in/out: md5 sum */ + const uchar* row, /*!< in: row in MySQL format */ + TABLE* table, /*!< in: table in MySQL data + dictionary */ + row_prebuilt_t* prebuilt, /*!< in: InnoDB prebuilt struct */ + THD* thd) /*!< in: user thread */ +{ + Field* field; + enum_field_types field_mysql_type; + uint n_fields; + ulint len; + const byte* ptr; + ulint col_type; + uint i; + + void *ctx = alloca(my_md5_context_size()); + 
my_md5_init(ctx); + + n_fields = table->s->fields; + + for (i = 0; i < n_fields; i++) { + byte null_byte=0; + byte true_byte=1; + + field = table->field[i]; + + ptr = (const byte*) row + get_field_offset(table, field); + len = field->pack_length(); + + field_mysql_type = field->type(); + + col_type = prebuilt->table->cols[i].mtype; + + switch (col_type) { + + case DATA_BLOB: + ptr = row_mysql_read_blob_ref(&len, ptr, len); + + break; + + case DATA_VARCHAR: + case DATA_BINARY: + case DATA_VARMYSQL: + if (field_mysql_type == MYSQL_TYPE_VARCHAR) { + /* This is a >= 5.0.3 type true VARCHAR where + the real payload data length is stored in + 1 or 2 bytes */ + + ptr = row_mysql_read_true_varchar( + &len, ptr, + (ulint) + (((Field_varstring*)field)->length_bytes)); + + } + + break; + default: + ; + } + /* + if (field->null_ptr && + field_in_record_is_null(table, field, (char*) row)) { + */ + + if (field->is_null_in_record(row)) { + my_md5_input(ctx, &null_byte, 1); + } else { + my_md5_input(ctx, &true_byte, 1); + my_md5_input(ctx, ptr, len); + } + } + + my_md5_result(ctx, digest); + + return(0); +} +#endif /* WITH_WSREP */ /**********************************************************************//** Updates a row given as a parameter to a new value. 
Note that we are given whole rows, not just the fields which are updated: this incurs some @@ -7538,6 +8397,24 @@ func_exit: innobase_active_small(); +#ifdef WITH_WSREP + if (error == DB_SUCCESS && + wsrep_thd_exec_mode(user_thd) == LOCAL_STATE && + wsrep_on(user_thd)) { + + DBUG_PRINT("wsrep", ("update row key")); + + if (wsrep_append_keys(user_thd, false, old_row, new_row)) { + WSREP_DEBUG("WSREP: UPDATE_ROW_KEY FAILED"); + DBUG_PRINT("wsrep", ("row key failed")); + err = HA_ERR_INTERNAL_ERROR; + goto wsrep_error; + } + } +wsrep_error: +#endif /* WITH_WSREP */ + + DBUG_RETURN(err); } @@ -7585,6 +8462,19 @@ ha_innobase::delete_row( innobase_active_small(); +#ifdef WITH_WSREP + if (error == DB_SUCCESS && + wsrep_thd_exec_mode(user_thd) == LOCAL_STATE && + wsrep_on(user_thd)) { + + if (wsrep_append_keys(user_thd, false, record, NULL)) { + DBUG_PRINT("wsrep", ("delete fail")); + error = (dberr_t)HA_ERR_INTERNAL_ERROR; + goto wsrep_error; + } + } +wsrep_error: +#endif DBUG_RETURN(convert_error_code_to_mysql( error, prebuilt->table->flags, user_thd)); } @@ -8781,6 +9671,396 @@ ha_innobase::ft_end() rnd_end(); } +#ifdef WITH_WSREP +extern dict_index_t* +wsrep_dict_foreign_find_index( + dict_table_t* table, + const char** col_names, + const char** columns, + ulint n_cols, + dict_index_t* types_idx, + ibool check_charsets, + ulint check_null); + + +extern dberr_t +wsrep_append_foreign_key( +/*===========================*/ + trx_t* trx, /*!< in: trx */ + dict_foreign_t* foreign, /*!< in: foreign key constraint */ + const rec_t* rec, /*!<in: clustered index record */ + dict_index_t* index, /*!<in: clustered index */ + ibool referenced, /*!<in: is check for referenced table */ + ibool shared) /*!<in: is shared access */ +{ + ut_a(trx); + THD* thd = (THD*)trx->mysql_thd; + ulint rcode = DB_SUCCESS; + char cache_key[513] = {'\0'}; + int cache_key_len; + bool const copy = true; + + if (!wsrep_on(trx->mysql_thd) || + wsrep_thd_exec_mode(thd) != LOCAL_STATE) + return DB_SUCCESS; 
+ + if (!thd || !foreign || + (!foreign->referenced_table && !foreign->foreign_table)) + { + WSREP_INFO("FK: %s missing in: %s", + (!thd) ? "thread" : + ((!foreign) ? "constraint" : + ((!foreign->referenced_table) ? + "referenced table" : "foreign table")), + (thd && wsrep_thd_query(thd)) ? + wsrep_thd_query(thd) : "void"); + return DB_ERROR; + } + + if ( !((referenced) ? + foreign->referenced_table : foreign->foreign_table)) + { + WSREP_DEBUG("pulling %s table into cache", + (referenced) ? "referenced" : "foreign"); + mutex_enter(&(dict_sys->mutex)); + if (referenced) + { + foreign->referenced_table = + dict_table_get_low( + foreign->referenced_table_name_lookup); + if (foreign->referenced_table) + { + foreign->referenced_index = + wsrep_dict_foreign_find_index( + foreign->referenced_table, NULL, + foreign->referenced_col_names, + foreign->n_fields, + foreign->foreign_index, + TRUE, FALSE); + } + } + else + { + foreign->foreign_table = + dict_table_get_low( + foreign->foreign_table_name_lookup); + if (foreign->foreign_table) + { + foreign->foreign_index = + wsrep_dict_foreign_find_index( + foreign->foreign_table, NULL, + foreign->foreign_col_names, + foreign->n_fields, + foreign->referenced_index, + TRUE, FALSE); + } + } + mutex_exit(&(dict_sys->mutex)); + } + + if ( !((referenced) ? + foreign->referenced_table : foreign->foreign_table)) + { + WSREP_WARN("FK: %s missing in query: %s", + (!foreign->referenced_table) ? + "referenced table" : "foreign table", + (wsrep_thd_query(thd)) ? + wsrep_thd_query(thd) : "void"); + return DB_ERROR; + } + byte key[WSREP_MAX_SUPPORTED_KEY_LENGTH+1] = {'\0'}; + ulint len = WSREP_MAX_SUPPORTED_KEY_LENGTH; + + dict_index_t *idx_target = (referenced) ? + foreign->referenced_index : index; + dict_index_t *idx = (referenced) ? 
+ UT_LIST_GET_FIRST(foreign->referenced_table->indexes) : + UT_LIST_GET_FIRST(foreign->foreign_table->indexes); + int i = 0; + while (idx != NULL && idx != idx_target) { + if (innobase_strcasecmp (idx->name, innobase_index_reserve_name) != 0) { + i++; + } + idx = UT_LIST_GET_NEXT(indexes, idx); + } + ut_a(idx); + key[0] = (char)i; + + rcode = wsrep_rec_get_foreign_key( + &key[1], &len, rec, index, idx, + wsrep_protocol_version > 1); + if (rcode != DB_SUCCESS) { + WSREP_ERROR( + "FK key set failed: %lu (%lu %lu), index: %s %s, %s", + rcode, referenced, shared, + (index && index->name) ? index->name : + "void index", + (index && index->table_name) ? index->table_name : + "void table", + wsrep_thd_query(thd)); + return DB_ERROR; + } + strncpy(cache_key, + (wsrep_protocol_version > 1) ? + ((referenced) ? + foreign->referenced_table->name : + foreign->foreign_table->name) : + foreign->foreign_table->name, sizeof(cache_key) - 1); + cache_key_len = strlen(cache_key); +#ifdef WSREP_DEBUG_PRINT + ulint j; + fprintf(stderr, "FK parent key, table: %s %s len: %lu ", + cache_key, (shared) ? "shared" : "exclusive", len+1); + for (j=0; j<len+1; j++) { + fprintf(stderr, " %hhX, ", key[j]); + } + fprintf(stderr, "\n"); +#endif + char *p = strchr(cache_key, '/'); + if (p) { + *p = '\0'; + } else { + WSREP_WARN("unexpected foreign key table %s %s", + foreign->referenced_table->name, + foreign->foreign_table->name); + } + + wsrep_buf_t wkey_part[3]; + wsrep_key_t wkey = {wkey_part, 3}; + if (!wsrep_prepare_key( + (const uchar*)cache_key, + cache_key_len + 1, + (const uchar*)key, len+1, + wkey_part, + (size_t*)&wkey.key_parts_num)) { + WSREP_WARN("key prepare failed for cascaded FK: %s", + (wsrep_thd_query(thd)) ? + wsrep_thd_query(thd) : "void"); + return DB_ERROR; + } + wsrep_t *wsrep= get_wsrep(); + rcode = (int)wsrep->append_key( + wsrep, + wsrep_ws_handle(thd, trx), + &wkey, + 1, + shared ? 
WSREP_KEY_SHARED : WSREP_KEY_EXCLUSIVE, + copy); + if (rcode) { + DBUG_PRINT("wsrep", ("row key failed: %lu", rcode)); + WSREP_ERROR("Appending cascaded fk row key failed: %s, %lu", + (wsrep_thd_query(thd)) ? + wsrep_thd_query(thd) : "void", rcode); + return DB_ERROR; + } + + return DB_SUCCESS; +} + +static int +wsrep_append_key( +/*==================*/ + THD *thd, + trx_t *trx, + TABLE_SHARE *table_share, + TABLE *table, + const char* key, + uint16_t key_len, + bool shared +) +{ + DBUG_ENTER("wsrep_append_key"); + bool const copy = true; +#ifdef WSREP_DEBUG_PRINT + fprintf(stderr, "%s conn %ld, trx %llu, keylen %d, table %s ", + (shared) ? "Shared" : "Exclusive", + thd_get_thread_id(thd), (long long)trx->id, key_len, + table_share->table_name.str); + for (int i=0; i<key_len; i++) { + fprintf(stderr, "%hhX, ", key[i]); + } + fprintf(stderr, "\n"); +#endif + wsrep_buf_t wkey_part[3]; + wsrep_key_t wkey = {wkey_part, 3}; + if (!wsrep_prepare_key( + (const uchar*)table_share->table_cache_key.str, + table_share->table_cache_key.length, + (const uchar*)key, key_len, + wkey_part, + (size_t*)&wkey.key_parts_num)) { + WSREP_WARN("key prepare failed for: %s", + (wsrep_thd_query(thd)) ? + wsrep_thd_query(thd) : "void"); + DBUG_RETURN(HA_ERR_INTERNAL_ERROR); + } + + wsrep_t *wsrep= get_wsrep(); + int rcode = (int)wsrep->append_key( + wsrep, + wsrep_ws_handle(thd, trx), + &wkey, + 1, + shared ? WSREP_KEY_SHARED : WSREP_KEY_EXCLUSIVE, + copy); + if (rcode) { + DBUG_PRINT("wsrep", ("row key failed: %d", rcode)); + WSREP_WARN("Appending row key failed: %s, %d", + (wsrep_thd_query(thd)) ? 
+ wsrep_thd_query(thd) : "void", rcode); + DBUG_RETURN(HA_ERR_INTERNAL_ERROR); + } + DBUG_RETURN(0); +} + +extern void compute_md5_hash(char *digest, const char *buf, int len); +#define MD5_HASH compute_md5_hash + +int +ha_innobase::wsrep_append_keys( +/*==================*/ + THD *thd, + bool shared, + const uchar* record0, /* in: row in MySQL format */ + const uchar* record1) /* in: row in MySQL format */ +{ + int rcode; + DBUG_ENTER("wsrep_append_keys"); + + bool key_appended = false; + trx_t *trx = thd_to_trx(thd); + + if (table_share && table_share->tmp_table != NO_TMP_TABLE) { + WSREP_DEBUG("skipping tmp table DML: THD: %lu tmp: %d SQL: %s", + thd_get_thread_id(thd), + table_share->tmp_table, + (wsrep_thd_query(thd)) ? + wsrep_thd_query(thd) : "void"); + DBUG_RETURN(0); + } + + if (wsrep_protocol_version == 0) { + uint len; + char keyval[WSREP_MAX_SUPPORTED_KEY_LENGTH+1] = {'\0'}; + char *key = &keyval[0]; + ibool is_null; + + len = wsrep_store_key_val_for_row( + thd, table, 0, key, WSREP_MAX_SUPPORTED_KEY_LENGTH, + record0, &is_null); + + if (!is_null) { + rcode = wsrep_append_key( + thd, trx, table_share, table, keyval, + len, shared); + if (rcode) DBUG_RETURN(rcode); + } + else + { + WSREP_DEBUG("NULL key skipped (proto 0): %s", + wsrep_thd_query(thd)); + } + } else { + ut_a(table->s->keys <= 256); + uint i; + bool hasPK= false; + + for (i=0; i<table->s->keys; ++i) { + KEY* key_info = table->key_info + i; + if (key_info->flags & HA_NOSAME) { + hasPK = true; + } + } + + for (i=0; i<table->s->keys; ++i) { + uint len; + char keyval0[WSREP_MAX_SUPPORTED_KEY_LENGTH+1] = {'\0'}; + char keyval1[WSREP_MAX_SUPPORTED_KEY_LENGTH+1] = {'\0'}; + char* key0 = &keyval0[1]; + char* key1 = &keyval1[1]; + KEY* key_info = table->key_info + i; + ibool is_null; + + dict_index_t* idx = innobase_get_index(i); + dict_table_t* tab = (idx) ? 
idx->table : NULL; + + keyval0[0] = (char)i; + keyval1[0] = (char)i; + + if (!tab) { + WSREP_WARN("MySQL-InnoDB key mismatch %s %s", + table->s->table_name.str, + key_info->name); + } + /* !hasPK == table with no PK, must append all non-unique keys */ + if (!hasPK || key_info->flags & HA_NOSAME || + ((tab && + dict_table_get_referenced_constraint(tab, idx)) || + (!tab && referenced_by_foreign_key()))) { + + len = wsrep_store_key_val_for_row( + thd, table, i, key0, + WSREP_MAX_SUPPORTED_KEY_LENGTH, + record0, &is_null); + if (!is_null) { + rcode = wsrep_append_key( + thd, trx, table_share, table, + keyval0, len+1, shared); + if (rcode) DBUG_RETURN(rcode); + + if (key_info->flags & HA_NOSAME || shared) + key_appended = true; + } + else + { + WSREP_DEBUG("NULL key skipped: %s", + wsrep_thd_query(thd)); + } + if (record1) { + len = wsrep_store_key_val_for_row( + thd, table, i, key1, + WSREP_MAX_SUPPORTED_KEY_LENGTH, + record1, &is_null); + if (!is_null && memcmp(key0, key1, len)) { + rcode = wsrep_append_key( + thd, trx, table_share, + table, + keyval1, len+1, shared); + if (rcode) DBUG_RETURN(rcode); + } + } + } + } + } + + /* if no PK, calculate hash of full row, to be the key value */ + if (!key_appended && wsrep_certify_nonPK) { + uchar digest[16]; + int rcode; + + wsrep_calc_row_hash(digest, record0, table, prebuilt, thd); + if ((rcode = wsrep_append_key(thd, trx, table_share, table, + (const char*) digest, 16, + shared))) { + DBUG_RETURN(rcode); + } + + if (record1) { + wsrep_calc_row_hash( + digest, record1, table, prebuilt, thd); + if ((rcode = wsrep_append_key(thd, trx, table_share, + table, + (const char*) digest, + 16, shared))) { + DBUG_RETURN(rcode); + } + } + DBUG_RETURN(0); + } + + DBUG_RETURN(0); +} +#endif /* WITH_WSREP */ /*********************************************************************//** Stores a reference to the current row to 'ref' field of the handle. 
Note @@ -9655,11 +10935,16 @@ innobase_table_flags( enum row_type row_format; rec_format_t innodb_row_format = REC_FORMAT_COMPACT; bool use_data_dir; + ha_table_option_struct *options= form->s->option_struct; /* Cache the value of innodb_file_format, in case it is modified by another thread while the table is being created. */ const ulint file_format_allowed = srv_file_format; + /* Cache the value of innobase_compression_level, in case it is + modified by another thread while the table is being created. */ + const ulint default_compression_level = page_zip_level; + *flags = 0; *flags2 = 0; @@ -9713,6 +10998,8 @@ index_bad: } } + row_format = form->s->row_type; + if (create_info->key_block_size) { /* The requested compressed page size (key_block_size) is given in kilobytes. If it is a valid number, store @@ -9722,7 +11009,7 @@ index_bad: ulint kbsize; /* Key Block Size */ for (zssize = kbsize = 1; zssize <= ut_min(UNIV_PAGE_SSIZE_MAX, - PAGE_ZIP_SSIZE_MAX); + PAGE_ZIP_SSIZE_MAX); zssize++, kbsize <<= 1) { if (kbsize == create_info->key_block_size) { zip_ssize = zssize; @@ -9750,8 +11037,8 @@ index_bad: } if (!zip_allowed - || zssize > ut_min(UNIV_PAGE_SSIZE_MAX, - PAGE_ZIP_SSIZE_MAX)) { + || zssize > ut_min(UNIV_PAGE_SSIZE_MAX, + PAGE_ZIP_SSIZE_MAX)) { push_warning_printf( thd, Sql_condition::WARN_LEVEL_WARN, ER_ILLEGAL_HA_CREATE_OPTION, @@ -9760,8 +11047,6 @@ index_bad: } } - row_format = form->s->row_type; - if (zip_ssize && zip_allowed) { /* if ROW_FORMAT is set to default, automatically change it to COMPRESSED.*/ @@ -9798,7 +11083,6 @@ index_bad: case ROW_TYPE_REDUNDANT: innodb_row_format = REC_FORMAT_REDUNDANT; break; - case ROW_TYPE_COMPRESSED: case ROW_TYPE_DYNAMIC: if (!use_tablespace) { @@ -9816,10 +11100,18 @@ index_bad: " innodb_file_format > Antelope.", get_row_format_name(row_format)); } else { - innodb_row_format = (row_format == ROW_TYPE_DYNAMIC - ? 
REC_FORMAT_DYNAMIC - : REC_FORMAT_COMPRESSED); - break; + switch(row_format) { + case ROW_TYPE_COMPRESSED: + innodb_row_format = REC_FORMAT_COMPRESSED; + break; + case ROW_TYPE_DYNAMIC: + innodb_row_format = REC_FORMAT_DYNAMIC; + break; + default: + /* Not possible, avoid compiler warning */ + break; + } + break; /* Correct row_format */ } zip_allowed = FALSE; /* fall through to set row_format = COMPACT */ @@ -9846,7 +11138,15 @@ index_bad: && ((create_info->data_file_name != NULL) && !(create_info->options & HA_LEX_CREATE_TMP_TABLE)); - dict_tf_set(flags, innodb_row_format, zip_ssize, use_data_dir); + /* Set up table dictionary flags */ + dict_tf_set(flags, + innodb_row_format, + zip_ssize, + use_data_dir, + options->page_compressed, + (ulint)options->page_compression_level == ULINT_UNDEFINED ? + default_compression_level : options->page_compression_level, + options->atomic_writes); if (create_info->options & HA_LEX_CREATE_TMP_TABLE) { *flags2 |= DICT_TF2_TEMPORARY; @@ -9864,6 +11164,114 @@ index_bad: DBUG_RETURN(true); } + +/*****************************************************************//** +Check engine specific table options not handled by SQL-parser. 
+@return NULL if valid, string if not */ +UNIV_INTERN +const char* +ha_innobase::check_table_options( + THD *thd, /*!< in: thread handle */ + TABLE* table, /*!< in: information on table + columns and indexes */ + HA_CREATE_INFO* create_info, /*!< in: more information of the + created table, contains also the + create statement string */ + const bool use_tablespace, /*!< in: use file par table */ + const ulint file_format) +{ + enum row_type row_format = table->s->row_type;; + ha_table_option_struct *options= table->s->option_struct; + atomic_writes_t awrites = (atomic_writes_t)options->atomic_writes; + + /* Check page compression requirements */ + if (options->page_compressed) { + + if (row_format == ROW_TYPE_COMPRESSED) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED table can't have" + " ROW_TYPE=COMPRESSED"); + return "PAGE_COMPRESSED"; + } + + if (row_format == ROW_TYPE_REDUNDANT) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED table can't have" + " ROW_TYPE=REDUNDANT"); + return "PAGE_COMPRESSED"; + } + + if (!use_tablespace) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED requires" + " innodb_file_per_table."); + return "PAGE_COMPRESSED"; + } + + if (file_format < UNIV_FORMAT_B) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED requires" + " innodb_file_format > Antelope."); + return "PAGE_COMPRESSED"; + } + + if (create_info->key_block_size) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSED table can't have" + " key_block_size"); + return "PAGE_COMPRESSED"; + } + } + + /* Check page compression level requirements, some of them are + already checked above */ + if ((ulint)options->page_compression_level != ULINT_UNDEFINED) { + if (options->page_compressed == 
false) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: PAGE_COMPRESSION_LEVEL requires" + " PAGE_COMPRESSED"); + return "PAGE_COMPRESSION_LEVEL"; + } + + if (options->page_compression_level < 0 || options->page_compression_level > 9) { + push_warning_printf( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: invalid PAGE_COMPRESSION_LEVEL = %lu." + " Valid values are [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]", + options->page_compression_level); + return "PAGE_COMPRESSION_LEVEL"; + } + } + + /* Check atomic writes requirements */ + if (awrites == ATOMIC_WRITES_ON || + (awrites == ATOMIC_WRITES_DEFAULT && srv_use_atomic_writes)) { + if (!use_tablespace) { + push_warning( + thd, Sql_condition::WARN_LEVEL_WARN, + HA_WRONG_CREATE_OPTION, + "InnoDB: ATOMIC_WRITES requires" + " innodb_file_per_table."); + return "ATOMIC_WRITES"; + } + } + + return 0; +} + /*****************************************************************//** Creates a new table to an InnoDB database. @return error number */ @@ -9895,6 +11303,7 @@ ha_innobase::create( while creating the table. So we read the current value here and make all further decisions based on this. */ bool use_tablespace = srv_file_per_table; + const ulint file_format = srv_file_format; /* Zip Shift Size - log2 - 9 of compressed page size, zero for uncompressed */ @@ -9918,6 +11327,12 @@ ha_innobase::create( /* Create the table definition in InnoDB */ + /* Validate table options not handled by the SQL-parser */ + if(check_table_options(thd, form, create_info, use_tablespace, + file_format)) { + DBUG_RETURN(HA_WRONG_CREATE_OPTION); + } + /* Validate create options if innodb_strict_mode is set. */ if (create_options_are_invalid( thd, form, create_info, use_tablespace)) { @@ -10488,6 +11903,71 @@ ha_innobase::delete_table( DBUG_RETURN(convert_error_code_to_mysql(err, 0, NULL)); } +/*****************************************************************//** +Defragment table. 
+@return error number */ +UNIV_INTERN +int +ha_innobase::defragment_table( +/*==========================*/ + const char* name, /*!< in: table name */ + const char* index_name, /*!< in: index name */ + bool async) /*!< in: whether to wait until finish */ +{ + char norm_name[FN_REFLEN]; + dict_table_t* table; + dict_index_t* index; + ibool one_index = (index_name != 0); + int ret = 0; + if (!srv_defragment) { + return ER_FEATURE_DISABLED; + } + normalize_table_name(norm_name, name); + table = dict_table_open_on_name(norm_name, FALSE, + FALSE, DICT_ERR_IGNORE_NONE); + for (index = dict_table_get_first_index(table); index; + index = dict_table_get_next_index(index)) { + if (one_index && strcasecmp(index_name, index->name) != 0) + continue; + if (btr_defragment_find_index(index)) { + // We borrow this error code. When the same index is + // already in the defragmentation queue, issue another + // defragmentation only introduces overhead. We return + // an error here to let the user know this is not + // necessary. Note that this will fail a query that's + // trying to defragment a full table if one of the + // indicies in that table is already in defragmentation. + // We choose this behavior so user is aware of this + // rather than silently defragment other indicies of + // that table. + ret = ER_SP_ALREADY_EXISTS; + break; + } + os_event_t event = btr_defragment_add_index(index, async); + if (!async && event) { + while(os_event_wait_time(event, 1000000)) { + if (thd_killed(current_thd)) { + btr_defragment_remove_index(index); + ret = ER_QUERY_INTERRUPTED; + break; + } + } + os_event_free(event); + } + if (ret) { + break; + } + if (one_index) { + one_index = FALSE; + break; + } + } + dict_table_close(table, FALSE, FALSE); + if (ret == 0 && one_index) { + ret = ER_NO_SUCH_INDEX; + } + return ret; +} /*****************************************************************//** Removes all tables in the named database inside InnoDB. 
*/ @@ -11646,6 +13126,27 @@ ha_innobase::optimize( This works OK otherwise, but MySQL locks the entire table during calls to OPTIMIZE, which is undesirable. */ + if (srv_defragment) { + int err; + + err = defragment_table(prebuilt->table->name, NULL, false); + + if (err == 0) { + return (HA_ADMIN_OK); + } else { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + err, + "InnoDB: Cannot defragment table %s: returned error code %d\n", + prebuilt->table->name, err); + + if(err == ER_SP_ALREADY_EXISTS) { + return (HA_ADMIN_OK); + } else { + return (HA_ADMIN_TRY_ALTER); + } + } + } + if (innodb_optimize_fulltext_only) { if (prebuilt->table->fts && prebuilt->table->fts->cache && !dict_table_is_discarded(prebuilt->table)) { @@ -11745,7 +13246,7 @@ ha_innobase::check( CHECK TABLE. */ os_increment_counter_by_amount( server_mutex, - srv_fatal_semaphore_wait_threshold, + srv_fatal_semaphore_wait_threshold, SRV_SEMAPHORE_WAIT_EXTENSION); bool valid = btr_validate_index(index, prebuilt->trx); @@ -11753,7 +13254,7 @@ ha_innobase::check( CHECK TABLE. 
*/ os_decrement_counter_by_amount( server_mutex, - srv_fatal_semaphore_wait_threshold, + srv_fatal_semaphore_wait_threshold, SRV_SEMAPHORE_WAIT_EXTENSION); if (!valid) { @@ -12537,11 +14038,18 @@ ha_innobase::external_lock( /* used by test case */ DBUG_EXECUTE_IF("no_innodb_binlog_errors", skip = true;); if (!skip) { +#ifdef WITH_WSREP + if (!wsrep_on(thd) || wsrep_thd_exec_mode(thd) == LOCAL_STATE) + { +#endif /* WITH_WSREP */ my_error(ER_BINLOG_STMT_MODE_AND_ROW_ENGINE, MYF(0), " InnoDB is limited to row-logging when " "transaction isolation level is " "READ COMMITTED or READ UNCOMMITTED."); DBUG_RETURN(HA_ERR_LOGGING_IMPOSSIBLE); +#ifdef WITH_WSREP + } +#endif /* WITH_WSREP */ } } @@ -13995,6 +15503,9 @@ innobase_xa_prepare( to the session variable take effect only in the next transaction */ if (!trx->support_xa) { +#ifdef WITH_WSREP + thd_get_xid(thd, (MYSQL_XID*) &trx->xid); +#endif // WITH_WSREP return(0); } @@ -14182,6 +15693,12 @@ ha_innobase::check_if_incompatible_data( HA_CREATE_INFO* info, uint table_changes) { + ha_table_option_struct *param_old, *param_new; + + /* Cache engine specific options */ + param_new = info->option_struct; + param_old = table->s->option_struct; + innobase_copy_frm_flags_from_create_info(prebuilt->table, info); if (table_changes != IS_EQUAL_YES) { @@ -14208,6 +15725,13 @@ ha_innobase::check_if_incompatible_data( return(COMPATIBLE_DATA_NO); } + /* Changes on engine specific table options requests a rebuild of the table. 
*/ + if (param_new->page_compressed != param_old->page_compressed || + param_new->page_compression_level != param_old->page_compression_level || + param_new->atomic_writes != param_old->atomic_writes) { + return(COMPATIBLE_DATA_NO); + } + return(COMPATIBLE_DATA_YES); } @@ -14347,6 +15871,13 @@ innodb_max_dirty_pages_pct_lwm_update( srv_max_dirty_pages_pct_lwm = in_val; } +UNIV_INTERN +void +ha_innobase::set_partition_owner_stats(ha_statistics *stats) +{ + ha_partition_stats= stats; +} + /************************************************************//** Validate the file format name and return its corresponding id. @return valid file format id */ @@ -15600,6 +17131,23 @@ innodb_reset_all_monitor_update( TRUE); } +static +void +innodb_defragment_frequency_update( +/*===============================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to + system variable */ + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + srv_defragment_frequency = (*static_cast<const uint*>(save)); + srv_defragment_interval = ut_microseconds_to_timer( + 1000000.0 / srv_defragment_frequency); +} + /****************************************************************//** Parse and enable InnoDB monitor counters during server startup. User can list the monitor counters/groups to be enable by specifying @@ -16051,6 +17599,290 @@ static SHOW_VAR innodb_status_variables_export[]= { static struct st_mysql_storage_engine innobase_storage_engine= { MYSQL_HANDLERTON_INTERFACE_VERSION }; +#ifdef WITH_WSREP +void +wsrep_abort_slave_trx(wsrep_seqno_t bf_seqno, wsrep_seqno_t victim_seqno) +{ + WSREP_ERROR("Trx %lld tries to abort slave trx %lld. This could be " + "caused by:\n\t" + "1) unsupported configuration options combination, please check documentation.\n\t" + "2) a bug in the code.\n\t" + "3) a database corruption.\n Node consistency compromized, " + "need to abort. 
Restart the node to resync with cluster.", + (long long)bf_seqno, (long long)victim_seqno); + abort(); +} +/*******************************************************************//** +This function is used to kill one transaction in BF. */ + +int +wsrep_innobase_kill_one_trx(void * const bf_thd_ptr, + const trx_t * const bf_trx, + trx_t *victim_trx, ibool signal) +{ + ut_ad(lock_mutex_own()); + ut_ad(trx_mutex_own(victim_trx)); + ut_ad(bf_thd_ptr); + ut_ad(victim_trx); + + DBUG_ENTER("wsrep_innobase_kill_one_trx"); + THD *bf_thd = bf_thd_ptr ? (THD*) bf_thd_ptr : NULL; + THD *thd = (THD *) victim_trx->mysql_thd; + int64_t bf_seqno = (bf_thd) ? wsrep_thd_trx_seqno(bf_thd) : 0; + + if (!thd) { + DBUG_PRINT("wsrep", ("no thd for conflicting lock")); + WSREP_WARN("no THD for trx: %lu", victim_trx->id); + DBUG_RETURN(1); + } + if (!bf_thd) { + DBUG_PRINT("wsrep", ("no BF thd for conflicting lock")); + WSREP_WARN("no BF THD for trx: %lu", (bf_trx) ? bf_trx->id : 0); + DBUG_RETURN(1); + } + + WSREP_LOG_CONFLICT(bf_thd, thd, TRUE); + + WSREP_DEBUG("BF kill (%lu, seqno: %lld), victim: (%lu) trx: %lu", + signal, (long long)bf_seqno, + thd_get_thread_id(thd), + victim_trx->id); + + WSREP_DEBUG("Aborting query: %s", + (thd && wsrep_thd_query(thd)) ? 
wsrep_thd_query(thd) : "void"); + + wsrep_thd_LOCK(thd); + + if (wsrep_thd_query_state(thd) == QUERY_EXITING) { + WSREP_DEBUG("kill trx EXITING for %lu", victim_trx->id); + wsrep_thd_UNLOCK(thd); + DBUG_RETURN(0); + } + if(wsrep_thd_exec_mode(thd) != LOCAL_STATE) { + WSREP_DEBUG("withdraw for BF trx: %lu, state: %d", + victim_trx->id, + wsrep_thd_get_conflict_state(thd)); + } + + switch (wsrep_thd_get_conflict_state(thd)) { + case NO_CONFLICT: + wsrep_thd_set_conflict_state(thd, MUST_ABORT); + break; + case MUST_ABORT: + WSREP_DEBUG("victim %lu in MUST ABORT state", + victim_trx->id); + wsrep_thd_UNLOCK(thd); + wsrep_thd_awake(thd, signal); + DBUG_RETURN(0); + break; + case ABORTED: + case ABORTING: // fall through + default: + WSREP_DEBUG("victim %lu in state %d", + victim_trx->id, wsrep_thd_get_conflict_state(thd)); + wsrep_thd_UNLOCK(thd); + DBUG_RETURN(0); + break; + } + + switch (wsrep_thd_query_state(thd)) { + case QUERY_COMMITTING: + enum wsrep_status rcode; + + WSREP_DEBUG("kill query for: %ld", + thd_get_thread_id(thd)); + WSREP_DEBUG("kill trx QUERY_COMMITTING for %lu", + victim_trx->id); + + if (wsrep_thd_exec_mode(thd) == REPL_RECV) { + wsrep_abort_slave_trx(bf_seqno, + wsrep_thd_trx_seqno(thd)); + } else { + wsrep_t *wsrep= get_wsrep(); + rcode = wsrep->abort_pre_commit( + wsrep, bf_seqno, + (wsrep_trx_id_t)victim_trx->id + ); + + switch (rcode) { + case WSREP_WARNING: + WSREP_DEBUG("cancel commit warning: %lu", + victim_trx->id); + wsrep_thd_UNLOCK(thd); + wsrep_thd_awake(thd, signal); + DBUG_RETURN(1); + break; + case WSREP_OK: + break; + default: + WSREP_ERROR( + "cancel commit bad exit: %d %lu", + rcode, + victim_trx->id); + /* unable to interrupt, must abort */ + /* note: kill_mysql() will block, if we cannot. + * kill the lock holder first. + */ + abort(); + break; + } + } + wsrep_thd_UNLOCK(thd); + wsrep_thd_awake(thd, signal); + break; + case QUERY_EXEC: + /* it is possible that victim trx is itself waiting for some + * other lock. 
We need to cancel this waiting + */ + WSREP_DEBUG("kill trx QUERY_EXEC for %lu", victim_trx->id); + + victim_trx->lock.was_chosen_as_deadlock_victim= TRUE; + if (victim_trx->lock.wait_lock) { + WSREP_DEBUG("victim has wait flag: %ld", + thd_get_thread_id(thd)); + lock_t* wait_lock = victim_trx->lock.wait_lock; + if (wait_lock) { + WSREP_DEBUG("canceling wait lock"); + victim_trx->lock.was_chosen_as_deadlock_victim= TRUE; + lock_cancel_waiting_and_release(wait_lock); + } + + wsrep_thd_UNLOCK(thd); + wsrep_thd_awake(thd, signal); + } else { + /* abort currently executing query */ + DBUG_PRINT("wsrep",("sending KILL_QUERY to: %ld", + thd_get_thread_id(thd))); + WSREP_DEBUG("kill query for: %ld", + thd_get_thread_id(thd)); + /* Note that innobase_kill_connection will take lock_mutex + and trx_mutex */ + wsrep_thd_UNLOCK(thd); + wsrep_thd_awake(thd, signal); + + /* for BF thd, we need to prevent him from committing */ + if (wsrep_thd_exec_mode(thd) == REPL_RECV) { + wsrep_abort_slave_trx(bf_seqno, + wsrep_thd_trx_seqno(thd)); + } + } + break; + case QUERY_IDLE: + { + WSREP_DEBUG("kill IDLE for %lu", victim_trx->id); + + if (wsrep_thd_exec_mode(thd) == REPL_RECV) { + WSREP_DEBUG("kill BF IDLE, seqno: %lld", + (long long)wsrep_thd_trx_seqno(thd)); + wsrep_thd_UNLOCK(thd); + wsrep_abort_slave_trx(bf_seqno, + wsrep_thd_trx_seqno(thd)); + DBUG_RETURN(0); + } + /* This will lock thd from proceeding after net_read() */ + wsrep_thd_set_conflict_state(thd, ABORTING); + + wsrep_lock_rollback(); + + if (wsrep_aborting_thd_contains(thd)) { + WSREP_WARN("duplicate thd aborter %lu", + thd_get_thread_id(thd)); + } else { + wsrep_aborting_thd_enqueue(thd); + DBUG_PRINT("wsrep",("enqueuing trx abort for %lu", + thd_get_thread_id(thd))); + WSREP_DEBUG("enqueuing trx abort for (%lu)", + thd_get_thread_id(thd)); + } + + DBUG_PRINT("wsrep",("signalling wsrep rollbacker")); + WSREP_DEBUG("signaling aborter"); + wsrep_unlock_rollback(); + wsrep_thd_UNLOCK(thd); + + break; + } + default: + 
WSREP_WARN("bad wsrep query state: %d", + wsrep_thd_query_state(thd)); + wsrep_thd_UNLOCK(thd); + break; + } + + DBUG_RETURN(0); +} + +static +int +wsrep_abort_transaction(handlerton* hton, THD *bf_thd, THD *victim_thd, + my_bool signal) +{ + DBUG_ENTER("wsrep_innobase_abort_thd"); + trx_t* victim_trx = thd_to_trx(victim_thd); + trx_t* bf_trx = (bf_thd) ? thd_to_trx(bf_thd) : NULL; + WSREP_DEBUG("abort transaction: BF: %s victim: %s", + wsrep_thd_query(bf_thd), + wsrep_thd_query(victim_thd)); + + if (victim_trx) + { + lock_mutex_enter(); + trx_mutex_enter(victim_trx); + int rcode = wsrep_innobase_kill_one_trx(bf_thd, bf_trx, + victim_trx, signal); + trx_mutex_exit(victim_trx); + lock_mutex_exit(); + wsrep_srv_conc_cancel_wait(victim_trx); + + DBUG_RETURN(rcode); + } else { + WSREP_DEBUG("victim does not have transaction"); + wsrep_thd_LOCK(victim_thd); + wsrep_thd_set_conflict_state(victim_thd, MUST_ABORT); + wsrep_thd_UNLOCK(victim_thd); + wsrep_thd_awake(victim_thd, signal); + } + DBUG_RETURN(-1); +} + +static int innobase_wsrep_set_checkpoint(handlerton* hton, const XID* xid) +{ + DBUG_ASSERT(hton == innodb_hton_ptr); + if (wsrep_is_wsrep_xid(xid)) { + mtr_t mtr; + mtr_start(&mtr); + trx_sysf_t* sys_header = trx_sysf_get(&mtr); + trx_sys_update_wsrep_checkpoint(xid, sys_header, &mtr); + mtr_commit(&mtr); + innobase_flush_logs(hton); + return 0; + } else { + return 1; + } +} + +static int innobase_wsrep_get_checkpoint(handlerton* hton, XID* xid) +{ + DBUG_ASSERT(hton == innodb_hton_ptr); + trx_sys_read_wsrep_checkpoint(xid); + return 0; +} + +static void +wsrep_fake_trx_id( +/*==================*/ + handlerton *hton, + THD *thd) /*!< in: user thread handle */ +{ + mutex_enter(&trx_sys->mutex); + trx_id_t trx_id = trx_sys_get_new_trx_id(); + mutex_exit(&trx_sys->mutex); + + (void *)wsrep_ws_handle_for_trx(wsrep_thd_ws_handle(thd), trx_id); +} + +#endif /* WITH_WSREP */ + /* plugin options */ static MYSQL_SYSVAR_ENUM(checksum_algorithm, srv_checksum_algorithm, @@ 
-16122,6 +17954,13 @@ static MYSQL_SYSVAR_ULONG(io_capacity_max, srv_max_io_capacity, SRV_MAX_IO_CAPACITY_DUMMY_DEFAULT, 100, SRV_MAX_IO_CAPACITY_LIMIT, 0); +static MYSQL_SYSVAR_ULONG(idle_flush_pct, + srv_idle_flush_pct, + PLUGIN_VAR_RQCMDARG, + "Up to what percentage of dirty pages should be flushed when innodb " + "finds it has spare resources to do so.", + NULL, NULL, 100, 0, 100, 0); + #ifdef UNIV_DEBUG static MYSQL_SYSVAR_BOOL(purge_run_now, innodb_purge_run_now, PLUGIN_VAR_OPCMDARG, @@ -16381,7 +18220,7 @@ static MYSQL_SYSVAR_ULONG(replication_delay, srv_replication_delay, static MYSQL_SYSVAR_UINT(compression_level, page_zip_level, PLUGIN_VAR_RQCMDARG, - "Compression level used for compressed row format. 0 is no compression" + "Compression level used for zlib compression. 0 is no compression" ", 1 is fastest, 9 is best compression and default is 6.", NULL, NULL, DEFAULT_COMPRESSION_LEVEL, 0, 9, 0); @@ -16392,7 +18231,7 @@ static MYSQL_SYSVAR_BOOL(log_compressed_pages, page_zip_log_pages, " the zlib compression algorithm changes." " When turned OFF, InnoDB will assume that the zlib" " compression algorithm doesn't change.", - NULL, NULL, TRUE); + NULL, NULL, FALSE); static MYSQL_SYSVAR_LONG(additional_mem_pool_size, innobase_additional_mem_pool_size, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, @@ -16467,6 +18306,60 @@ static MYSQL_SYSVAR_BOOL(buffer_pool_load_at_startup, srv_buffer_pool_load_at_st "Load the buffer pool from a file named @@innodb_buffer_pool_filename", NULL, NULL, FALSE); +static MYSQL_SYSVAR_BOOL(defragment, srv_defragment, + PLUGIN_VAR_RQCMDARG, + "Enable/disable InnoDB defragmentation (default FALSE). When set to FALSE, all existing " + "defragmentation will be paused. And new defragmentation command will fail." 
+ "Paused defragmentation commands will resume when this variable is set to " + "true again.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_UINT(defragment_n_pages, srv_defragment_n_pages, + PLUGIN_VAR_RQCMDARG, + "Number of pages considered at once when merging multiple pages to " + "defragment", + NULL, NULL, 7, 2, 32, 0); + +static MYSQL_SYSVAR_UINT(defragment_stats_accuracy, + srv_defragment_stats_accuracy, + PLUGIN_VAR_RQCMDARG, + "How many defragment stats changes there are before the stats " + "are written to persistent storage. Set to 0 meaning disable " + "defragment stats tracking.", + NULL, NULL, 0, 0, ~0U, 0); + +static MYSQL_SYSVAR_UINT(defragment_fill_factor_n_recs, + srv_defragment_fill_factor_n_recs, + PLUGIN_VAR_RQCMDARG, + "How many records of space defragmentation should leave on the page. " + "This variable, together with innodb_defragment_fill_factor, is introduced " + "so defragmentation won't pack the page too full and cause page split on " + "the next insert on every page. The variable indicating more defragmentation" + " gain is the one effective.", + NULL, NULL, 20, 1, 100, 0); + +static MYSQL_SYSVAR_DOUBLE(defragment_fill_factor, srv_defragment_fill_factor, + PLUGIN_VAR_RQCMDARG, + "A number between [0.7, 1] that tells defragmentation how full it should " + "fill a page. Default is 0.9. Number below 0.7 won't make much sense." + "This variable, together with innodb_defragment_fill_factor_n_recs, is " + "introduced so defragmentation won't pack the page too full and cause " + "page split on the next insert on every page. The variable indicating more " + "defragmentation gain is the one effective.", + NULL, NULL, 0.9, 0.7, 1, 0); + +static MYSQL_SYSVAR_UINT(defragment_frequency, srv_defragment_frequency, + PLUGIN_VAR_RQCMDARG, + "Do not defragment a single index more than this number of time per second." + "This controls the number of time defragmentation thread can request X_LOCK " + "on an index. 
Defragmentation thread will check whether " + "1/defragment_frequency (s) has passed since it worked on this index last " + "time, and put the index back to the queue if not enough time has passed. " + "The actual frequency can only be lower than this given number.", + NULL, innodb_defragment_frequency_update, + SRV_DEFRAGMENT_FREQUENCY_DEFAULT, 1, 1000, 0); + + static MYSQL_SYSVAR_ULONG(lru_scan_depth, srv_LRU_scan_depth, PLUGIN_VAR_RQCMDARG, "How deep to scan LRU to keep it clean", @@ -16655,6 +18548,12 @@ static MYSQL_SYSVAR_ULONG( 1000000, 0); /* Maximum value */ #endif /* HAVE_ATOMIC_BUILTINS */ +static MYSQL_SYSVAR_BOOL(prefix_index_cluster_optimization, + srv_prefix_index_cluster_optimization, + PLUGIN_VAR_OPCMDARG, + "Enable prefix optimization to sometimes avoid cluster index lookups.", + NULL, NULL, FALSE); + static MYSQL_SYSVAR_ULONG(thread_sleep_delay, srv_thread_sleep_delay, PLUGIN_VAR_RQCMDARG, "Time of innodb thread sleeping before joining InnoDB queue (usec). " @@ -16793,6 +18692,40 @@ static MYSQL_SYSVAR_BOOL(disable_background_merge, NULL, NULL, FALSE); #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ +#ifdef WITH_INNODB_DISALLOW_WRITES +/******************************************************* + * innobase_disallow_writes variable definition * + *******************************************************/ + +/* Must always init to FALSE. */ +static my_bool innobase_disallow_writes = FALSE; + +/************************************************************************** +An "update" method for innobase_disallow_writes variable. 
*/ +static +void +innobase_disallow_writes_update( +/*============================*/ + THD* thd, /* in: thread handle */ + st_mysql_sys_var* var, /* in: pointer to system + variable */ + void* var_ptr, /* out: pointer to dynamic + variable */ + const void* save) /* in: temporary storage */ +{ + *(my_bool*)var_ptr = *(my_bool*)save; + ut_a(srv_allow_writes_event); + if (*(my_bool*)var_ptr) + os_event_reset(srv_allow_writes_event); + else + os_event_set(srv_allow_writes_event); +} + +static MYSQL_SYSVAR_BOOL(disallow_writes, innobase_disallow_writes, + PLUGIN_VAR_NOCMDOPT, + "Tell InnoDB to stop any writes to disk", + NULL, innobase_disallow_writes_update, FALSE); +#endif /* WITH_INNODB_DISALLOW_WRITES */ static MYSQL_SYSVAR_BOOL(random_read_ahead, srv_random_read_ahead, PLUGIN_VAR_NOCMDARG, "Whether to use read ahead for random access within an extent.", @@ -16900,6 +18833,56 @@ static MYSQL_SYSVAR_UINT(simulate_comp_failures, srv_simulate_comp_failures, "Simulate compression failures.", NULL, NULL, 0, 0, 99, 0); +static MYSQL_SYSVAR_BOOL(force_primary_key, + srv_force_primary_key, + PLUGIN_VAR_OPCMDARG, + "Do not allow to create table without primary key (off by default)", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_BOOL(use_trim, srv_use_trim, + PLUGIN_VAR_OPCMDARG, + "Use trim. Default FALSE.", + NULL, NULL, FALSE); + +static const char *page_compression_algorithms[]= { "none", "zlib", "lz4", "lzo", "lzma", "bzip2", 0 }; +static TYPELIB page_compression_algorithms_typelib= +{ + array_elements(page_compression_algorithms) - 1, 0, + page_compression_algorithms, 0 +}; +static MYSQL_SYSVAR_ENUM(compression_algorithm, innodb_compression_algorithm, + PLUGIN_VAR_OPCMDARG, + "Compression algorithm used on page compression. One of: none, zlib, lz4, lzo, lzma, or bzip2", + innodb_compression_algorithm_validate, NULL, + /* We use here the largest number of supported compression method to + enable all those methods that are available. 
Availability of compression + method is verified on innodb_compression_algorithm_validate function. */ + PAGE_UNCOMPRESSED, + &page_compression_algorithms_typelib); + +static MYSQL_SYSVAR_LONG(mtflush_threads, srv_mtflush_threads, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Number of multi-threaded flush threads", + NULL, NULL, + MTFLUSH_DEFAULT_WORKER, /* Default setting */ + 1, /* Minimum setting */ + MTFLUSH_MAX_WORKER, /* Max setting */ + 0); + +static MYSQL_SYSVAR_BOOL(use_mtflush, srv_use_mtflush, + PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY, + "Use multi-threaded flush. Default FALSE.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_ULONG(fatal_semaphore_wait_threshold, srv_fatal_semaphore_wait_threshold, + PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, + "Maximum number of seconds that semaphore times out in InnoDB.", + NULL, NULL, + DEFAULT_SRV_FATAL_SEMAPHORE_TIMEOUT, /* Default setting */ + 1, /* Minimum setting */ + UINT_MAX32, /* Maximum setting */ + 0); + static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(additional_mem_pool_size), MYSQL_SYSVAR(api_trx_level), @@ -16916,6 +18899,12 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(buffer_pool_load_now), MYSQL_SYSVAR(buffer_pool_load_abort), MYSQL_SYSVAR(buffer_pool_load_at_startup), + MYSQL_SYSVAR(defragment), + MYSQL_SYSVAR(defragment_n_pages), + MYSQL_SYSVAR(defragment_stats_accuracy), + MYSQL_SYSVAR(defragment_fill_factor), + MYSQL_SYSVAR(defragment_fill_factor_n_recs), + MYSQL_SYSVAR(defragment_frequency), MYSQL_SYSVAR(lru_scan_depth), MYSQL_SYSVAR(flush_neighbors), MYSQL_SYSVAR(checksum_algorithm), @@ -17009,6 +18998,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { #ifdef HAVE_ATOMIC_BUILTINS MYSQL_SYSVAR(adaptive_max_sleep_delay), #endif /* HAVE_ATOMIC_BUILTINS */ + MYSQL_SYSVAR(prefix_index_cluster_optimization), MYSQL_SYSVAR(thread_sleep_delay), MYSQL_SYSVAR(autoinc_lock_mode), MYSQL_SYSVAR(version), @@ -17020,11 +19010,15 
@@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(change_buffering_debug), MYSQL_SYSVAR(disable_background_merge), #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ +#ifdef WITH_INNODB_DISALLOW_WRITES + MYSQL_SYSVAR(disallow_writes), +#endif /* WITH_INNODB_DISALLOW_WRITES */ MYSQL_SYSVAR(random_read_ahead), MYSQL_SYSVAR(read_ahead_threshold), MYSQL_SYSVAR(read_only), MYSQL_SYSVAR(io_capacity), MYSQL_SYSVAR(io_capacity_max), + MYSQL_SYSVAR(idle_flush_pct), MYSQL_SYSVAR(monitor_enable), MYSQL_SYSVAR(monitor_disable), MYSQL_SYSVAR(monitor_reset), @@ -17060,6 +19054,13 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(saved_page_number_debug), #endif /* UNIV_DEBUG */ MYSQL_SYSVAR(simulate_comp_failures), + MYSQL_SYSVAR(force_primary_key), + MYSQL_SYSVAR(use_trim), + MYSQL_SYSVAR(compression_algorithm), + MYSQL_SYSVAR(mtflush_threads), + MYSQL_SYSVAR(use_mtflush), + + MYSQL_SYSVAR(fatal_semaphore_wait_threshold), NULL }; @@ -17402,6 +19403,9 @@ ib_senderrf( case IB_LOG_LEVEL_FATAL: l = 0; break; + default: + l = 0; + break; } my_printv_error(code, format, MYF(l), args); @@ -17559,3 +19563,94 @@ innobase_convert_to_system_charset( static_cast<uint>(len), errors)); } +/*************************************************************//** +Check for a valid value of innobase_compression_algorithm. +@return 0 for valid innodb_compression_algorithm. 
*/ +static +int +innodb_compression_algorithm_validate( +/*==================================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to system + variable */ + void* save, /*!< out: immediate result + for update function */ + struct st_mysql_value* value) /*!< in: incoming string */ +{ + long compression_algorithm; + DBUG_ENTER("innobase_compression_algorithm_validate"); + + if (value->value_type(value) == MYSQL_VALUE_TYPE_STRING) { + char buff[STRING_BUFFER_USUAL_SIZE]; + const char *str; + int length= sizeof(buff); + + if (!(str= value->val_str(value, buff, &length))) { + DBUG_RETURN(1); + } + + if ((compression_algorithm= (long)find_type(str, &page_compression_algorithms_typelib, 0) - 1) < 0) { + DBUG_RETURN(1); + } + } else { + long long tmp; + + if (value->val_int(value, &tmp)) { + DBUG_RETURN(1); + } + + if (tmp < 0 || tmp >= page_compression_algorithms_typelib.count) { + DBUG_RETURN(1); + } + + compression_algorithm= (long) tmp; + } + + *reinterpret_cast<ulong*>(save) = compression_algorithm; + +#ifndef HAVE_LZ4 + if (compression_algorithm == PAGE_LZ4_ALGORITHM) { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_UNSUPPORTED, + "InnoDB: innodb_compression_algorithm = %lu unsupported.\n" + "InnoDB: liblz4 is not installed. \n", + compression_algorithm); + DBUG_RETURN(1); + } +#endif + +#ifndef HAVE_LZO + if (compression_algorithm == PAGE_LZO_ALGORITHM) { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_UNSUPPORTED, + "InnoDB: innodb_compression_algorithm = %lu unsupported.\n" + "InnoDB: liblzo is not installed. \n", + compression_algorithm); + DBUG_RETURN(1); + } +#endif + +#ifndef HAVE_LZMA + if (compression_algorithm == PAGE_LZMA_ALGORITHM) { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_UNSUPPORTED, + "InnoDB: innodb_compression_algorithm = %lu unsupported.\n" + "InnoDB: liblzma is not installed. 
\n", + compression_algorithm); + DBUG_RETURN(1); + } +#endif + +#ifndef HAVE_BZIP2 + if (compression_algorithm == PAGE_BZIP2_ALGORITHM) { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + HA_ERR_UNSUPPORTED, + "InnoDB: innodb_compression_algorithm = %lu unsupported.\n" + "InnoDB: libbz2 is not installed. \n", + compression_algorithm); + DBUG_RETURN(1); + } +#endif + + DBUG_RETURN(0); +} diff --git a/storage/innobase/handler/ha_innodb.h b/storage/innobase/handler/ha_innodb.h index 5cebc425769..6da31c8ecc6 100644 --- a/storage/innobase/handler/ha_innodb.h +++ b/storage/innobase/handler/ha_innodb.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2000, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -56,6 +57,22 @@ typedef struct st_innobase_share { /** Prebuilt structures in an InnoDB table handle used within MySQL */ struct row_prebuilt_t; +/** Engine specific table options are definined using this struct */ +struct ha_table_option_struct +{ + bool page_compressed; /*!< Table is using page compression + if this option is true. */ + int page_compression_level; /*!< Table page compression level + or UNIV_UNSPECIFIED. */ + uint atomic_writes; /*!< Use atomic writes for this + table if this options is ON or + in DEFAULT if + srv_use_atomic_writes=1. 
+ Atomic writes are not used if + value OFF.*/ +}; + + /** The class defining a handle to an Innodb table */ class ha_innobase: public handler { @@ -81,6 +98,8 @@ class ha_innobase: public handler or undefined */ uint num_write_row; /*!< number of write_row() calls */ + ha_statistics* ha_partition_stats; /*!< stats of the partition owner + handler (if there is one) */ uint store_key_val_for_row(uint keynr, char* buff, uint buff_len, const uchar* record); inline void update_thd(THD* thd); @@ -95,6 +114,10 @@ class ha_innobase: public handler void innobase_initialize_autoinc(); dict_index_t* innobase_get_index(uint keynr); +#ifdef WITH_WSREP + int wsrep_append_keys(THD *thd, bool shared, + const uchar* record0, const uchar* record1); +#endif /* Init values for the class: */ public: ha_innobase(handlerton *hton, TABLE_SHARE *table_arg); @@ -175,11 +198,15 @@ class ha_innobase: public handler char* norm_name, char* temp_path, char* remote_path); + const char* check_table_options(THD *thd, TABLE* table, + HA_CREATE_INFO* create_info, const bool use_tablespace, const ulint file_format); int create(const char *name, register TABLE *form, HA_CREATE_INFO *create_info); int truncate(); int delete_table(const char *name); int rename_table(const char* from, const char* to); + int defragment_table(const char* name, const char* index_name, + bool async); int check(THD* thd, HA_CHECK_OPT* check_opt); char* update_table_comment(const char* comment); char* get_foreign_key_create_info(); @@ -283,6 +310,7 @@ class ha_innobase: public handler Alter_inplace_info* ha_alter_info, bool commit); /** @} */ + void set_partition_owner_stats(ha_statistics *stats); bool check_if_incompatible_data(HA_CREATE_INFO *info, uint table_changes); private: @@ -440,7 +468,9 @@ __attribute__((nonnull)); */ extern void mysql_bin_log_commit_pos(THD *thd, ulonglong *out_pos, const char **out_file); -struct trx_t; +#ifdef WITH_WSREP +#include <mysql/service_wsrep.h> +#endif extern const struct _ft_vft 
ft_vft_result; @@ -478,6 +508,9 @@ innobase_index_name_is_reserved( __attribute__((nonnull, warn_unused_result)); /*****************************************************************//** +#ifdef WITH_WSREP +extern "C" int wsrep_trx_is_aborting(void *thd_ptr); +#endif Determines InnoDB table flags. @retval true if successful, false if error */ UNIV_INTERN diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc index a04b34fe027..d08fe25d377 100644 --- a/storage/innobase/handler/handler0alter.cc +++ b/storage/innobase/handler/handler0alter.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2005, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -262,6 +263,22 @@ ha_innobase::check_if_supported_inplace_alter( update_thd(); trx_search_latch_release_if_reserved(prebuilt->trx); + /* Change on engine specific table options require rebuild of the + table */ + if (ha_alter_info->handler_flags + == Alter_inplace_info::CHANGE_CREATE_OPTION) { + ha_table_option_struct *new_options= ha_alter_info->create_info->option_struct; + ha_table_option_struct *old_options= table->s->option_struct; + + if (new_options->page_compressed != old_options->page_compressed || + new_options->page_compression_level != old_options->page_compression_level || + new_options->atomic_writes != old_options->atomic_writes) { + ha_alter_info->unsupported_reason = innobase_get_err_msg( + ER_ALTER_OPERATION_NOT_SUPPORTED_REASON); + DBUG_RETURN(HA_ALTER_INPLACE_NOT_SUPPORTED); + } + } + if (ha_alter_info->handler_flags & ~(INNOBASE_INPLACE_IGNORE | INNOBASE_ALTER_NOREBUILD @@ -1178,7 +1195,8 @@ innobase_rec_to_mysql( field->reset(); - ipos = dict_index_get_nth_col_or_prefix_pos(index, i, 
TRUE); + ipos = dict_index_get_nth_col_or_prefix_pos(index, i, TRUE, + NULL); if (ipos == ULINT_UNDEFINED || rec_offs_nth_extern(offsets, ipos)) { @@ -1230,7 +1248,8 @@ innobase_fields_to_mysql( field->reset(); - ipos = dict_index_get_nth_col_or_prefix_pos(index, i, TRUE); + ipos = dict_index_get_nth_col_or_prefix_pos(index, i, TRUE, + NULL); if (ipos == ULINT_UNDEFINED || dfield_is_ext(&fields[ipos]) @@ -3371,6 +3390,11 @@ ha_innobase::prepare_inplace_alter_table( DBUG_ASSERT(ha_alter_info->create_info); DBUG_ASSERT(!srv_read_only_mode); + /* Init online ddl status variables */ + onlineddl_rowlog_rows = 0; + onlineddl_rowlog_pct_used = 0; + onlineddl_pct_progress = 0; + MONITOR_ATOMIC_INC(MONITOR_PENDING_ALTER_TABLE); #ifdef UNIV_DEBUG @@ -3393,6 +3417,17 @@ ha_innobase::prepare_inplace_alter_table( if (ha_alter_info->handler_flags & Alter_inplace_info::CHANGE_CREATE_OPTION) { + /* Check engine specific table options */ + if (const char* invalid_tbopt = check_table_options( + user_thd, altered_table, + ha_alter_info->create_info, + prebuilt->table->space != 0, + srv_file_format)) { + my_error(ER_ILLEGAL_HA_CREATE_OPTION, MYF(0), + table_type(), invalid_tbopt); + goto err_exit_no_heap; + } + if (const char* invalid_opt = create_options_are_invalid( user_thd, altered_table, ha_alter_info->create_info, @@ -4011,6 +4046,11 @@ oom: ctx->thr, prebuilt->table, altered_table); } + /* Init online ddl status variables */ + onlineddl_rowlog_rows = 0; + onlineddl_rowlog_pct_used = 0; + onlineddl_pct_progress = 0; + DEBUG_SYNC_C("inplace_after_index_build"); DBUG_EXECUTE_IF("create_index_fail", diff --git a/storage/innobase/handler/i_s.cc b/storage/innobase/handler/i_s.cc index ca504acf64b..f6b3dbd2d5d 100644 --- a/storage/innobase/handler/i_s.cc +++ b/storage/innobase/handler/i_s.cc @@ -92,6 +92,7 @@ static buf_page_desc_t i_s_page_type[] = { {"COMPRESSED_BLOB", FIL_PAGE_TYPE_ZBLOB}, {"COMPRESSED_BLOB2", FIL_PAGE_TYPE_ZBLOB2}, {"IBUF_INDEX", I_S_PAGE_TYPE_IBUF}, + {"PAGE 
COMPRESSED", FIL_PAGE_PAGE_COMPRESSED}, {"UNKNOWN", I_S_PAGE_TYPE_UNKNOWN} }; @@ -2885,7 +2886,7 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_ft_default_stopword = /* general descriptive text (for SHOW PLUGINS) */ /* const char* */ - STRUCT_FLD(descr, "Default stopword list for InnDB Full Text Search"), + STRUCT_FLD(descr, "Default stopword list for InnoDB Full Text Search"), /* the plugin license (PLUGIN_LICENSE_XXX) */ /* int */ diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h index 305acf7e322..b6f8a685ae9 100644 --- a/storage/innobase/include/btr0btr.h +++ b/storage/innobase/include/btr0btr.h @@ -2,6 +2,7 @@ Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2014, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -671,6 +672,21 @@ btr_get_size( is s-latched */ __attribute__((nonnull, warn_unused_result)); /**************************************************************//** +Gets the number of reserved and used pages in a B-tree. +@return number of pages reserved, or ULINT_UNDEFINED if the index +is unavailable */ +UNIV_INTERN +ulint +btr_get_size_and_reserved( +/*======================*/ + dict_index_t* index, /*!< in: index */ + ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */ + ulint* used, /*!< out: number of pages used (<= reserved) */ + mtr_t* mtr) /*!< in/out: mini-transaction where index + is s-latched */ + __attribute__((nonnull)); + +/**************************************************************//** Allocates a new file page to be used in an index tree. NOTE: we assume that the caller has made the reservation for free extents! 
@retval NULL if no page could be allocated @@ -717,6 +733,33 @@ btr_page_free_low( ulint level, /*!< in: page level */ mtr_t* mtr) /*!< in: mtr */ __attribute__((nonnull)); +/*************************************************************//** +Reorganizes an index page. + +IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. This has to +be done either within the same mini-transaction, or by invoking +ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages, +IBUF_BITMAP_FREE is unaffected by reorganization. + +@retval true if the operation was successful +@retval false if it is a compressed page, and recompression failed */ +UNIV_INTERN +bool +btr_page_reorganize_block( +/*======================*/ + bool recovery,/*!< in: true if called in recovery: + locks should not be updated, i.e., + there cannot exist locks on the + page, and a hash index should not be + dropped: it cannot exist */ + ulint z_level,/*!< in: compression level to be used + if dealing with compressed page */ + buf_block_t* block, /*!< in/out: B-tree page */ + dict_index_t* index, /*!< in: the index tree of the page */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); + #ifdef UNIV_BTR_PRINT /*************************************************************//** Prints size info of a B-tree. */ @@ -762,6 +805,60 @@ btr_validate_index( const trx_t* trx) /*!< in: transaction or 0 */ __attribute__((nonnull(1), warn_unused_result)); +#ifdef UNIV_SYNC_DEBUG +/*************************************************************//** +Removes a page from the level list of pages. 
+@param space in: space where removed +@param zip_size in: compressed page size in bytes, or 0 for uncompressed +@param page in/out: page to remove +@param index in: index tree +@param mtr in/out: mini-transaction */ +# define btr_level_list_remove(space,zip_size,page,index,mtr) \ + btr_level_list_remove_func(space,zip_size,page,index,mtr) +#else /* UNIV_SYNC_DEBUG */ +/*************************************************************//** +Removes a page from the level list of pages. +@param space in: space where removed +@param zip_size in: compressed page size in bytes, or 0 for uncompressed +@param page in/out: page to remove +@param index in: index tree +@param mtr in/out: mini-transaction */ +# define btr_level_list_remove(space,zip_size,page,index,mtr) \ + btr_level_list_remove_func(space,zip_size,page,mtr) +#endif /* UNIV_SYNC_DEBUG */ + +/*************************************************************//** +Removes a page from the level list of pages. */ +UNIV_INTERN +void +btr_level_list_remove_func( +/*=======================*/ + ulint space, /*!< in: space where removed */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + page_t* page, /*!< in/out: page to remove */ +#ifdef UNIV_SYNC_DEBUG + const dict_index_t* index, /*!< in: index tree */ +#endif /* UNIV_SYNC_DEBUG */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); + +/*************************************************************//** +If page is the only on its level, this function moves its records to the +father page, thus reducing the tree height. 
+@return father block */ +UNIV_INTERN +buf_block_t* +btr_lift_page_up( +/*=============*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: page which is the only on its level; + must not be empty: use + btr_discard_only_page_on_level if the last + record from the page should be removed */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); + #define BTR_N_LEAF_PAGES 1 #define BTR_TOTAL_SIZE 2 #endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/include/btr0btr.ic b/storage/innobase/include/btr0btr.ic index 00f50b5dcaf..40b468b200a 100644 --- a/storage/innobase/include/btr0btr.ic +++ b/storage/innobase/include/btr0btr.ic @@ -163,9 +163,10 @@ btr_page_get_next( /*!< in: mini-transaction handle */ { ut_ad(page && mtr); +#ifndef UNIV_INNOCHECKSUM ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX) || mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_S_FIX)); - +#endif /* UNIV_INNOCHECKSUM */ return(mach_read_from_4(page + FIL_PAGE_NEXT)); } diff --git a/storage/innobase/include/btr0defragment.h b/storage/innobase/include/btr0defragment.h new file mode 100644 index 00000000000..8fef3c6519a --- /dev/null +++ b/storage/innobase/include/btr0defragment.h @@ -0,0 +1,101 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014 Facebook, Inc. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +#ifndef btr0defragment_h +#define btr0defragment_h + +#include "univ.i" + +#ifndef UNIV_HOTBACKUP + +#include "btr0pcur.h" + +/* Max number of pages to consider at once during defragmentation. */ +#define BTR_DEFRAGMENT_MAX_N_PAGES 32 + +/** stats in btr_defragment */ +extern ulint btr_defragment_compression_failures; +extern ulint btr_defragment_failures; +extern ulint btr_defragment_count; + +/** Item in the work queue for btr_degrament_thread. */ +struct btr_defragment_item_t +{ + btr_pcur_t* pcur; /* persistent cursor where + btr_defragment_n_pages should start */ + os_event_t event; /* if not null, signal after work + is done */ + bool removed; /* Mark an item as removed */ + ulonglong last_processed; /* timestamp of last time this index + is processed by defragment thread */ + + btr_defragment_item_t(btr_pcur_t* pcur, os_event_t event); + ~btr_defragment_item_t(); +}; + +/******************************************************************//** +Initialize defragmentation. */ +void +btr_defragment_init(void); +/******************************************************************//** +Shutdown defragmentation. */ +void +btr_defragment_shutdown(); +/******************************************************************//** +Check whether the given index is in btr_defragment_wq. */ +bool +btr_defragment_find_index( + dict_index_t* index); /*!< Index to find. */ +/******************************************************************//** +Add an index to btr_defragment_wq. Return a pointer to os_event if this +is a synchronized defragmentation. 
*/ +os_event_t +btr_defragment_add_index( + dict_index_t* index, /*!< index to be added */ + bool async); /*!< whether this is an async defragmentation */ +/******************************************************************//** +When table is dropped, this function is called to mark a table as removed in +btr_efragment_wq. The difference between this function and the remove_index +function is this will not NULL the event. */ +void +btr_defragment_remove_table( + dict_table_t* table); /*!< Index to be removed. */ +/******************************************************************//** +Mark an index as removed from btr_defragment_wq. */ +void +btr_defragment_remove_index( + dict_index_t* index); /*!< Index to be removed. */ +/*********************************************************************//** +Check whether we should save defragmentation statistics to persistent storage.*/ +UNIV_INTERN +void +btr_defragment_save_defrag_stats_if_needed( + dict_index_t* index); /*!< in: index */ +/******************************************************************//** +Thread that merges consecutive b-tree pages into fewer pages to defragment +the index. */ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(btr_defragment_thread)( +/*==========================================*/ + void* arg); /*!< in: a dummy parameter required by + os_thread_create */ + + +#endif /* !UNIV_HOTBACKUP */ +#endif diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 31ec6b9ef8b..7ea29169a48 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -1198,7 +1199,9 @@ UNIV_INTERN bool buf_page_io_complete( /*=================*/ - buf_page_t* bpage); /*!< in: pointer to the block in question */ + buf_page_t* bpage, /*!< in: pointer to the block in question */ + bool evict = false);/*!< in: whether or not to evict + the page from LRU list. */ /********************************************************************//** Calculates a folded value of a file page address to use in the page hash table. @@ -1498,6 +1501,11 @@ struct buf_page_t{ state == BUF_BLOCK_ZIP_PAGE and zip.data == NULL means an active buf_pool->watch */ + + ulint write_size; /* Write size is set when this + page is first time written and then + if written again we check is TRIM + operation needed. */ #ifndef UNIV_HOTBACKUP buf_page_t* hash; /*!< node used in chaining to buf_pool->page_hash or @@ -1756,6 +1764,133 @@ Compute the hash fold value for blocks in buf_pool->zip_hash. */ #define BUF_POOL_ZIP_FOLD_BPAGE(b) BUF_POOL_ZIP_FOLD((buf_block_t*) (b)) /* @} */ +/** A "Hazard Pointer" class used to iterate over page lists +inside the buffer pool. A hazard pointer is a buf_page_t pointer +which we intend to iterate over next and we want it remain valid +even after we release the buffer pool mutex. */ +class HazardPointer { + +public: + /** Constructor + @param buf_pool buffer pool instance + @param mutex mutex that is protecting the hp. 
*/ + HazardPointer(const buf_pool_t* buf_pool, const ib_mutex_t* mutex) + : + m_buf_pool(buf_pool) +#ifdef UNIV_DEBUG + , m_mutex(mutex) +#endif /* UNIV_DEBUG */ + , m_hp() {} + + /** Destructor */ + virtual ~HazardPointer() {} + + /** Get current value */ + buf_page_t* get() + { + ut_ad(mutex_own(m_mutex)); + return(m_hp); + } + + /** Set current value + @param bpage buffer block to be set as hp */ + void set(buf_page_t* bpage); + + /** Checks if a bpage is the hp + @param bpage buffer block to be compared + @return true if it is hp */ + bool is_hp(const buf_page_t* bpage); + + /** Adjust the value of hp. This happens when some + other thread working on the same list attempts to + remove the hp from the list. Must be implemented + by the derived classes. + @param bpage buffer block to be compared */ + virtual void adjust(const buf_page_t*) = 0; + +protected: + /** Disable copying */ + HazardPointer(const HazardPointer&); + HazardPointer& operator=(const HazardPointer&); + + /** Buffer pool instance */ + const buf_pool_t* m_buf_pool; + +#if UNIV_DEBUG + /** mutex that protects access to the m_hp. */ + const ib_mutex_t* m_mutex; +#endif /* UNIV_DEBUG */ + + /** hazard pointer. */ + buf_page_t* m_hp; +}; + +/** Class implementing buf_pool->flush_list hazard pointer */ +class FlushHp: public HazardPointer { + +public: + /** Constructor + @param buf_pool buffer pool instance + @param mutex mutex that is protecting the hp. */ + FlushHp(const buf_pool_t* buf_pool, const ib_mutex_t* mutex) + : + HazardPointer(buf_pool, mutex) {} + + /** Destructor */ + virtual ~FlushHp() {} + + /** Adjust the value of hp. This happens when some + other thread working on the same list attempts to + remove the hp from the list. 
+ @param bpage buffer block to be compared */ + void adjust(const buf_page_t* bpage); +}; + +/** Class implementing buf_pool->LRU hazard pointer */ +class LRUHp: public HazardPointer { + +public: + /** Constructor + @param buf_pool buffer pool instance + @param mutex mutex that is protecting the hp. */ + LRUHp(const buf_pool_t* buf_pool, const ib_mutex_t* mutex) + : + HazardPointer(buf_pool, mutex) {} + + /** Destructor */ + virtual ~LRUHp() {} + + /** Adjust the value of hp. This happens when some + other thread working on the same list attempts to + remove the hp from the list. + @param bpage buffer block to be compared */ + void adjust(const buf_page_t* bpage); +}; + +/** Special purpose iterators to be used when scanning the LRU list. +The idea is that when one thread finishes the scan it leaves the +itr in that position and the other thread can start scan from +there */ +class LRUItr: public LRUHp { + +public: + /** Constructor + @param buf_pool buffer pool instance + @param mutex mutex that is protecting the hp. */ + LRUItr(const buf_pool_t* buf_pool, const ib_mutex_t* mutex) + : + LRUHp(buf_pool, mutex) {} + + /** Destructor */ + virtual ~LRUItr() {} + + /** Selects from where to start a scan. If we have scanned + too deep into the LRU list it resets the value to the tail + of the LRU list. + @return buf_page_t from where to start scan. */ + buf_page_t* start(); +}; + /** Struct that is embedded in the free zip blocks */ struct buf_buddy_free_t { union { @@ -1888,7 +2023,7 @@ struct buf_pool_t{ also protects writes to bpage::oldest_modification and flush_list_hp */ - const buf_page_t* flush_list_hp;/*!< "hazard pointer" + FlushHp flush_hp;/*!< "hazard pointer" used during scan of flush_list while doing flush list batch. Protected by flush_list_mutex */ @@ -1946,6 +2081,19 @@ struct buf_pool_t{ UT_LIST_BASE_NODE_T(buf_page_t) free; /*!< base node of the free block list */ + + /** "hazard pointer" used during scan of LRU while doing + LRU list batch. 
Protected by buf_pool::mutex */ + LRUHp lru_hp; + + /** Iterator used to scan the LRU list when searching for + replacable victim. Protected by buf_pool::mutex. */ + LRUItr lru_scan_itr; + + /** Iterator used to scan the LRU list when searching for + single page flushing victim. Protected by buf_pool::mutex. */ + LRUItr single_scan_itr; + UT_LIST_BASE_NODE_T(buf_page_t) LRU; /*!< base node of the LRU list */ buf_page_t* LRU_old; /*!< pointer to the about diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h index f116720574b..3ab3f7c308a 100644 --- a/storage/innobase/include/buf0flu.h +++ b/storage/innobase/include/buf0flu.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2014, 2014, SkySQL Ab. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -36,6 +37,17 @@ Created 11/5/1995 Heikki Tuuri /** Flag indicating if the page_cleaner is in active state. */ extern ibool buf_page_cleaner_is_active; +/** Event to synchronise with the flushing. */ +extern os_event_t buf_flush_event; + +/** Handled page counters for a single flush */ +struct flush_counters_t { + ulint flushed; /*!< number of dirty pages flushed */ + ulint evicted; /*!< number of clean pages evicted */ + ulint unzip_LRU_evicted;/*!< number of uncompressed page images + evicted */ +}; + /********************************************************************//** Remove a block from the flush list of modified blocks. */ UNIV_INTERN @@ -110,12 +122,12 @@ buf_flush_list( which were processed is passed back to caller. 
Ignored if NULL */ /******************************************************************//** -This function picks up a single dirty page from the tail of the LRU -list, flushes it, removes it from page_hash and LRU list and puts -it on the free list. It is called from user threads when they are -unable to find a replacable page at the tail of the LRU list i.e.: -when the background LRU flushing in the page_cleaner thread is not -fast enough to keep pace with the workload. +This function picks up a single page from the tail of the LRU +list, flushes it (if it is dirty), removes it from page_hash and LRU +list and puts it on the free list. It is called from user threads when +they are unable to find a replaceable page at the tail of the LRU +list i.e.: when the background LRU flushing in the page_cleaner thread +is not fast enough to keep pace with the workload. @return TRUE if success. */ UNIV_INTERN ibool @@ -279,6 +291,57 @@ buf_flush_get_dirty_pages_count( #endif /* !UNIV_HOTBACKUP */ +/******************************************************************//** +Start a buffer flush batch for LRU or flush list */ +ibool +buf_flush_start( +/*============*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + buf_flush_t flush_type); /*!< in: BUF_FLUSH_LRU + or BUF_FLUSH_LIST */ +/******************************************************************//** +End a buffer flush batch for LRU or flush list */ +void +buf_flush_end( +/*==========*/ + buf_pool_t* buf_pool, /*!< buffer pool instance */ + buf_flush_t flush_type); /*!< in: BUF_FLUSH_LRU + or BUF_FLUSH_LIST */ +/******************************************************************//** +Gather the aggregated stats for both flush list and LRU list flushing */ +void +buf_flush_common( +/*=============*/ + buf_flush_t flush_type, /*!< in: type of flush */ + ulint page_count); /*!< in: number of pages flushed */ + +/*******************************************************************//** +This utility flushes dirty blocks from 
the end of the LRU list or flush_list. +NOTE 1: in the case of an LRU flush the calling thread may own latches to +pages: to avoid deadlocks, this function must be written so that it cannot +end up waiting for these latches! NOTE 2: in the case of a flush list flush, +the calling thread is not allowed to own any latches on pages! */ +__attribute__((nonnull)) +void +buf_flush_batch( +/*============*/ + buf_pool_t* buf_pool, /*!< in: buffer pool instance */ + buf_flush_t flush_type, /*!< in: BUF_FLUSH_LRU or + BUF_FLUSH_LIST; if BUF_FLUSH_LIST, + then the caller must not own any + latches on pages */ + ulint min_n, /*!< in: wished minimum mumber of blocks + flushed (it is not guaranteed that the + actual number is that big, though) */ + lsn_t lsn_limit, /*!< in: in the case of BUF_FLUSH_LIST + all blocks whose oldest_modification is + smaller than this should be flushed + (if their number does not exceed + min_n), otherwise ignored */ + flush_counters_t* n); /*!< out: flushed/evicted page + counts */ + + #ifndef UNIV_NONINL #include "buf0flu.ic" #endif diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h index ecdaef685a1..f1f6abd2d68 100644 --- a/storage/innobase/include/buf0lru.h +++ b/storage/innobase/include/buf0lru.h @@ -117,7 +117,7 @@ buf_LRU_get_free_only( buf_pool_t* buf_pool); /*!< buffer pool instance */ /******************************************************************//** Returns a free block from the buf_pool. The block is taken off the -free list. If it is empty, blocks are moved from the end of the +free list. If free list is empty, blocks are moved from the end of the LRU list to the free list. This function is called from a user thread when it needs a clean block to read in a page. Note that we only ever get a block from @@ -125,8 +125,6 @@ the free list. Even when we flush a page or find a page in LRU scan we put it to free list to be used. 
* iteration 0: * get a block from free list, success:done - * if there is an LRU flush batch in progress: - * wait for batch to end: retry free list * if buf_pool->try_LRU_scan is set * scan LRU up to srv_LRU_scan_depth to find a clean block * the above will put the block on free list @@ -139,7 +137,7 @@ we put it to free list to be used. * scan whole LRU list * scan LRU list even if buf_pool->try_LRU_scan is not set * iteration > 1: - * same as iteration 1 but sleep 100ms + * same as iteration 1 but sleep 10ms @return the free control block, in state BUF_BLOCK_READY_FOR_USE */ UNIV_INTERN buf_block_t* @@ -231,6 +229,15 @@ buf_LRU_free_one_page( may or may not be a hash index to the page */ __attribute__((nonnull)); +/******************************************************************//** +Adjust LRU hazard pointers if needed. */ + +void +buf_LRU_adjust_hp( +/*==============*/ + buf_pool_t* buf_pool,/*!< in: buffer pool instance */ + const buf_page_t* bpage); /*!< in: control block */ + #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG /**********************************************************************//** Validates the LRU list. diff --git a/storage/innobase/include/buf0mtflu.h b/storage/innobase/include/buf0mtflu.h new file mode 100644 index 00000000000..0475335bbf5 --- /dev/null +++ b/storage/innobase/include/buf0mtflu.h @@ -0,0 +1,95 @@ +/***************************************************************************** + +Copyright (C) 2014 SkySQL Ab. All Rights Reserved. +Copyright (C) 2014 Fusion-io. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/buf0mtflu.h +Multi-threadef flush method interface function prototypes + +Created 06/02/2014 Jan Lindström jan.lindstrom@skysql.com + Dhananjoy Das DDas@fusionio.com +***********************************************************************/ + +#ifndef buf0mtflu_h +#define buf0mtflu_h + +/******************************************************************//** +Add exit work item to work queue to signal multi-threded flush +threads that they should exit. +*/ +void +buf_mtflu_io_thread_exit(void); +/*===========================*/ + +/******************************************************************//** +Initialize multi-threaded flush thread syncronization data. +@return Initialized multi-threaded flush thread syncroniztion data. */ +void* +buf_mtflu_handler_init( +/*===================*/ + ulint n_threads, /*!< in: Number of threads to create */ + ulint wrk_cnt); /*!< in: Number of work items */ + +/******************************************************************//** +Return true if multi-threaded flush is initialized +@return true if initialized, false if not */ +bool +buf_mtflu_init_done(void); +/*======================*/ + +/*********************************************************************//** +Clears up tail of the LRU lists: +* Put replaceable pages at the tail of LRU to the free list +* Flush dirty pages at the tail of LRU to the disk +The depth to which we scan each buffer pool is controlled by dynamic +config parameter innodb_LRU_scan_depth. 
+@return total pages flushed */ +UNIV_INTERN +ulint +buf_mtflu_flush_LRU_tail(void); +/*===========================*/ + +/*******************************************************************//** +Multi-threaded version of buf_flush_list +*/ +bool +buf_mtflu_flush_list( +/*=================*/ + ulint min_n, /*!< in: wished minimum mumber of blocks + flushed (it is not guaranteed that the + actual number is that big, though) */ + lsn_t lsn_limit, /*!< in the case BUF_FLUSH_LIST all + blocks whose oldest_modification is + smaller than this should be flushed + (if their number does not exceed + min_n), otherwise ignored */ + ulint* n_processed); /*!< out: the number of pages + which were processed is passed + back to caller. Ignored if NULL */ + +/*********************************************************************//** +Set correct thread identifiers to io thread array based on +information we have. */ +void +buf_mtflu_set_thread_ids( +/*=====================*/ + ulint n_threads, /*!<in: Number of threads to fill */ + void* ctx, /*!<in: thread context */ + os_thread_id_t* thread_ids); /*!<in: thread id array */ + +#endif diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h index 026187b2000..c7161987b78 100644 --- a/storage/innobase/include/dict0dict.h +++ b/storage/innobase/include/dict0dict.h @@ -2,6 +2,7 @@ Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -42,6 +43,8 @@ Created 1/8/1996 Heikki Tuuri #include "ut0byte.h" #include "trx0types.h" #include "row0types.h" +#include "fsp0fsp.h" +#include "dict0pagecompress.h" extern bool innodb_table_stats_not_found; extern bool innodb_index_stats_not_found; @@ -120,7 +123,9 @@ enum dict_table_op_t { DICT_TABLE_OP_DROP_ORPHAN, /** Silently load the tablespace if it does not exist, and do not load the definitions of incomplete indexes. */ - DICT_TABLE_OP_LOAD_TABLESPACE + DICT_TABLE_OP_LOAD_TABLESPACE, + /** Open the table only if it's in table cache. */ + DICT_TABLE_OP_OPEN_ONLY_IF_CACHED }; /**********************************************************************//** @@ -907,7 +912,14 @@ dict_tf_set( ulint* flags, /*!< in/out: table */ rec_format_t format, /*!< in: file format */ ulint zip_ssize, /*!< in: zip shift size */ - bool remote_path) /*!< in: table uses DATA DIRECTORY */ + bool remote_path, /*!< in: table uses DATA DIRECTORY + */ + bool page_compressed,/*!< in: table uses page compressed + pages */ + ulint page_compression_level, /*!< in: table page compression + level */ + ulint atomic_writes) /*!< in: table atomic + writes option value*/ __attribute__((nonnull)); /********************************************************************//** Convert a 32 bit integer table flags to the 32 bit integer that is @@ -935,6 +947,7 @@ dict_tf_get_zip_size( /*=================*/ ulint flags) /*!< in: flags */ __attribute__((const)); + /********************************************************************//** Check whether the table uses the compressed compact page format. 
@return compressed page size, or 0 if not compressed */ @@ -1146,8 +1159,9 @@ ulint dict_index_get_nth_col_pos( /*=======================*/ const dict_index_t* index, /*!< in: index */ - ulint n) /*!< in: column number */ - __attribute__((nonnull, warn_unused_result)); + ulint n, /*!< in: column number */ + ulint* prefix_col_pos) /*!< out: col num if prefix */ + __attribute__((nonnull(1), warn_unused_result)); /********************************************************************//** Looks for column n in an index. @return position in internal representation of the index; @@ -1158,9 +1172,11 @@ dict_index_get_nth_col_or_prefix_pos( /*=================================*/ const dict_index_t* index, /*!< in: index */ ulint n, /*!< in: column number */ - ibool inc_prefix) /*!< in: TRUE=consider + ibool inc_prefix, /*!< in: TRUE=consider column prefixes too */ - __attribute__((nonnull, warn_unused_result)); + ulint* prefix_col_pos) /*!< out: col num if prefix */ + + __attribute__((nonnull(1), warn_unused_result)); /********************************************************************//** Returns TRUE if the index contains a column or a prefix of that column. @return TRUE if contains the column or its prefix */ @@ -1510,6 +1526,16 @@ dict_table_get_index_on_name( const char* name) /*!< in: name of the index to find */ __attribute__((nonnull, warn_unused_result)); /**********************************************************************//** +Looks for an index with the given id given a table instance. +@return index or NULL */ +UNIV_INTERN +dict_index_t* +dict_table_find_index_on_id( +/*========================*/ + const dict_table_t* table, /*!< in: table instance */ + index_id_t id) /*!< in: index id */ + __attribute__((nonnull, warn_unused_result)); +/**********************************************************************//** In case there is more than one index with the same name return the index with the min(id). 
@return index, NULL if does not exist */ @@ -1837,6 +1863,7 @@ dict_table_get_index_on_first_col( #endif /* !UNIV_HOTBACKUP */ + #ifndef UNIV_NONINL #include "dict0dict.ic" #endif diff --git a/storage/innobase/include/dict0dict.ic b/storage/innobase/include/dict0dict.ic index 066ffe47e4a..43bd42ae025 100644 --- a/storage/innobase/include/dict0dict.ic +++ b/storage/innobase/include/dict0dict.ic @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -537,9 +538,25 @@ dict_tf_is_valid( ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(flags); ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(flags); ulint unused = DICT_TF_GET_UNUSED(flags); + ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(flags); + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags); + ulint data_dir = DICT_TF_HAS_DATA_DIR(flags); + ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(flags); /* Make sure there are no bits that we do not know about. */ if (unused != 0) { + fprintf(stderr, + "InnoDB: Error: table unused flags are %ld" + " in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + unused, + compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + ); return(false); @@ -550,12 +567,34 @@ dict_tf_is_valid( data stored off-page in the clustered index. 
*/ if (!compact) { + fprintf(stderr, + "InnoDB: Error: table compact flags are %ld" + " in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + compact, compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + ); return(false); } } else if (zip_ssize) { /* Antelope does not support COMPRESSED row format. */ + fprintf(stderr, + "InnoDB: Error: table flags are %ld" + " in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + flags, compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + ); return(false); } @@ -568,6 +607,58 @@ dict_tf_is_valid( || !atomic_blobs || zip_ssize > PAGE_ZIP_SSIZE_MAX) { + fprintf(stderr, + "InnoDB: Error: table compact flags are %ld in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + flags, + compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + + ); + return(false); + } + } + + if (page_compression || page_compression_level) { + /* Page compression format must have compact and + atomic_blobs and page_compression_level requires + page_compression */ + if (!compact + || !page_compression + || !atomic_blobs) { + + fprintf(stderr, + "InnoDB: Error: table flags are %ld in the data dictionary and are 
corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + flags, compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + ); + return(false); + } + } + + if (atomic_writes) { + + if(atomic_writes > ATOMIC_WRITES_OFF) { + + fprintf(stderr, + "InnoDB: Error: table flags are %ld in the data dictionary and are corrupted\n" + "InnoDB: Error: data dictionary flags are\n" + "InnoDB: compact %ld atomic_blobs %ld\n" + "InnoDB: unused %ld data_dir %ld zip_ssize %ld\n" + "InnoDB: page_compression %ld page_compression_level %ld\n" + "InnoDB: atomic_writes %ld\n", + flags, compact, atomic_blobs, unused, data_dir, zip_ssize, + page_compression, page_compression_level, atomic_writes + ); return(false); } } @@ -594,6 +685,11 @@ dict_sys_tables_type_validate( ulint zip_ssize = DICT_TF_GET_ZIP_SSIZE(type); ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(type); ulint unused = DICT_TF_GET_UNUSED(type); + ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(type); + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(type); + ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(type); + + ut_a(atomic_writes <= ATOMIC_WRITES_OFF); /* The low order bit of SYS_TABLES.TYPE is always set to 1. If the format is UNIV_FORMAT_B or higher, this field is the same @@ -604,12 +700,16 @@ dict_sys_tables_type_validate( if (redundant) { if (zip_ssize || atomic_blobs) { + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=Redundant, zip_ssize %lu atomic_blobs %lu\n", + zip_ssize, atomic_blobs); return(ULINT_UNDEFINED); } } /* Make sure there are no bits that we do not know about. 
*/ if (unused) { + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, unused %lu\n", + type, unused); return(ULINT_UNDEFINED); } @@ -624,6 +724,8 @@ dict_sys_tables_type_validate( } else if (zip_ssize) { /* Antelope does not support COMPRESSED format. */ + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu\n", + type, zip_ssize); return(ULINT_UNDEFINED); } @@ -633,11 +735,15 @@ dict_sys_tables_type_validate( should be in N_COLS, but we already know about the low_order_bit and DICT_N_COLS_COMPACT flags. */ if (!atomic_blobs) { + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu atomic_blobs %lu\n", + type, zip_ssize, atomic_blobs); return(ULINT_UNDEFINED); } /* Validate that the number is within allowed range. */ if (zip_ssize > PAGE_ZIP_SSIZE_MAX) { + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, zip_ssize %lu max %d\n", + type, zip_ssize, PAGE_ZIP_SSIZE_MAX); return(ULINT_UNDEFINED); } } @@ -647,6 +753,27 @@ dict_sys_tables_type_validate( format, so the DATA_DIR flag is compatible with any other table flags. However, it is not used with TEMPORARY tables.*/ + if (page_compression || page_compression_level) { + /* page compressed row format must have low_order_bit and + atomic_blobs bits set and the DICT_N_COLS_COMPACT flag + should be in N_COLS, but we already know about the + low_order_bit and DICT_N_COLS_COMPACT flags. */ + + if (!atomic_blobs || !page_compression) { + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, page_compression %lu page_compression_level %lu\n" + "InnoDB: Error: atomic_blobs %lu\n", + type, page_compression, page_compression_level, atomic_blobs); + return(ULINT_UNDEFINED); + } + } + + /* Validate that the atomic writes number is within allowed range. */ + if (atomic_writes > ATOMIC_WRITES_OFF) { + fprintf(stderr, "InnoDB: Error: SYS_TABLES::TYPE=%lu, atomic_writes %lu\n", + type, atomic_writes); + return(ULINT_UNDEFINED); + } + /* Return the validated SYS_TABLES.TYPE. 
*/ return(type); } @@ -719,8 +846,16 @@ dict_tf_set( ulint* flags, /*!< in/out: table flags */ rec_format_t format, /*!< in: file format */ ulint zip_ssize, /*!< in: zip shift size */ - bool use_data_dir) /*!< in: table uses DATA DIRECTORY */ + bool use_data_dir, /*!< in: table uses DATA DIRECTORY + */ + bool page_compressed,/*!< in: table uses page compressed + pages */ + ulint page_compression_level, /*!< in: table page compression + level */ + ulint atomic_writes) /*!< in: table atomic writes setup */ { + atomic_writes_t awrites = (atomic_writes_t)atomic_writes; + switch (format) { case REC_FORMAT_REDUNDANT: *flags = 0; @@ -742,6 +877,19 @@ dict_tf_set( break; } + if (page_compressed) { + *flags |= (1 << DICT_TF_POS_ATOMIC_BLOBS) + | (1 << DICT_TF_POS_PAGE_COMPRESSION) + | (page_compression_level << DICT_TF_POS_PAGE_COMPRESSION_LEVEL); + + ut_ad(zip_ssize == 0); + ut_ad(dict_tf_get_page_compression(*flags) == TRUE); + ut_ad(dict_tf_get_page_compression_level(*flags) == page_compression_level); + } + + *flags |= (atomic_writes << DICT_TF_POS_ATOMIC_WRITES); + ut_a(dict_tf_get_atomic_writes(*flags) == awrites); + if (use_data_dir) { *flags |= (1 << DICT_TF_POS_DATA_DIR); } @@ -765,6 +913,9 @@ dict_tf_to_fsp_flags( ulint table_flags) /*!< in: dict_table_t::flags */ { ulint fsp_flags; + ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(table_flags); + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(table_flags); + ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(table_flags); DBUG_EXECUTE_IF("dict_tf_to_fsp_flags_failure", return(ULINT_UNDEFINED);); @@ -783,7 +934,20 @@ dict_tf_to_fsp_flags( fsp_flags |= DICT_TF_HAS_DATA_DIR(table_flags) ? FSP_FLAGS_MASK_DATA_DIR : 0; + /* In addition, tablespace flags also contain if the page + compression is used for this table. 
*/ + fsp_flags |= FSP_FLAGS_SET_PAGE_COMPRESSION(fsp_flags, page_compression); + + /* In addition, tablespace flags also contain page compression level + if page compression is used for this table. */ + fsp_flags |= FSP_FLAGS_SET_PAGE_COMPRESSION_LEVEL(fsp_flags, page_compression_level); + + /* In addition, tablespace flags also contain flag if atomic writes + is used for this table */ + fsp_flags |= FSP_FLAGS_SET_ATOMIC_WRITES(fsp_flags, atomic_writes); + ut_a(fsp_flags_is_valid(fsp_flags)); + ut_a(dict_tf_verify_flags(table_flags, fsp_flags)); return(fsp_flags); } @@ -811,10 +975,15 @@ dict_sys_tables_type_to_tf( /* Adjust bit zero. */ flags = redundant ? 0 : 1; - /* ZIP_SSIZE, ATOMIC_BLOBS & DATA_DIR are the same. */ + /* ZIP_SSIZE, ATOMIC_BLOBS, DATA_DIR, PAGE_COMPRESSION, + PAGE_COMPRESSION_LEVEL, ATOMIC_WRITES are the same. */ flags |= type & (DICT_TF_MASK_ZIP_SSIZE | DICT_TF_MASK_ATOMIC_BLOBS - | DICT_TF_MASK_DATA_DIR); + | DICT_TF_MASK_DATA_DIR + | DICT_TF_MASK_PAGE_COMPRESSION + | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL + | DICT_TF_MASK_ATOMIC_WRITES + ); return(flags); } @@ -842,10 +1011,14 @@ dict_tf_to_sys_tables_type( /* Adjust bit zero. It is always 1 in SYS_TABLES.TYPE */ type = 1; - /* ZIP_SSIZE, ATOMIC_BLOBS & DATA_DIR are the same. */ + /* ZIP_SSIZE, ATOMIC_BLOBS, DATA_DIR, PAGE_COMPRESSION, + PAGE_COMPRESSION_LEVEL, ATOMIC_WRITES are the same. 
*/ type |= flags & (DICT_TF_MASK_ZIP_SSIZE | DICT_TF_MASK_ATOMIC_BLOBS - | DICT_TF_MASK_DATA_DIR); + | DICT_TF_MASK_DATA_DIR + | DICT_TF_MASK_PAGE_COMPRESSION + | DICT_TF_MASK_PAGE_COMPRESSION_LEVEL + | DICT_TF_MASK_ATOMIC_WRITES); return(type); } @@ -1048,7 +1221,8 @@ dict_index_get_sys_col_pos( } return(dict_index_get_nth_col_pos( - index, dict_table_get_sys_col_no(index->table, type))); + index, dict_table_get_sys_col_no(index->table, type), + NULL)); } /*********************************************************************//** @@ -1100,9 +1274,11 @@ ulint dict_index_get_nth_col_pos( /*=======================*/ const dict_index_t* index, /*!< in: index */ - ulint n) /*!< in: column number */ + ulint n, /*!< in: column number */ + ulint* prefix_col_pos) /*!< out: col num if prefix */ { - return(dict_index_get_nth_col_or_prefix_pos(index, n, FALSE)); + return(dict_index_get_nth_col_or_prefix_pos(index, n, FALSE, + prefix_col_pos)); } #ifndef UNIV_HOTBACKUP diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h index be0ef395ba8..1d59bc09f6d 100644 --- a/storage/innobase/include/dict0mem.h +++ b/storage/innobase/include/dict0mem.h @@ -2,6 +2,7 @@ Copyright (c) 1996, 2014, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -124,11 +125,26 @@ This flag prevents older engines from attempting to open the table and allows InnoDB to update_create_info() accordingly. 
*/ #define DICT_TF_WIDTH_DATA_DIR 1 +/** +Width of the page compression flag +*/ +#define DICT_TF_WIDTH_PAGE_COMPRESSION 1 +#define DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL 4 + +/** +Width of atomic writes flag +DEFAULT=0, ON = 1, OFF = 2 +*/ +#define DICT_TF_WIDTH_ATOMIC_WRITES 2 + /** Width of all the currently known table flags */ #define DICT_TF_BITS (DICT_TF_WIDTH_COMPACT \ + DICT_TF_WIDTH_ZIP_SSIZE \ + DICT_TF_WIDTH_ATOMIC_BLOBS \ - + DICT_TF_WIDTH_DATA_DIR) + + DICT_TF_WIDTH_DATA_DIR \ + + DICT_TF_WIDTH_PAGE_COMPRESSION \ + + DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL \ + + DICT_TF_WIDTH_ATOMIC_WRITES) /** A mask of all the known/used bits in table flags */ #define DICT_TF_BIT_MASK (~(~0 << DICT_TF_BITS)) @@ -144,9 +160,19 @@ allows InnoDB to update_create_info() accordingly. */ /** Zero relative shift position of the DATA_DIR field */ #define DICT_TF_POS_DATA_DIR (DICT_TF_POS_ATOMIC_BLOBS \ + DICT_TF_WIDTH_ATOMIC_BLOBS) +/** Zero relative shift position of the PAGE_COMPRESSION field */ +#define DICT_TF_POS_PAGE_COMPRESSION (DICT_TF_POS_DATA_DIR \ + + DICT_TF_WIDTH_DATA_DIR) +/** Zero relative shift position of the PAGE_COMPRESSION_LEVEL field */ +#define DICT_TF_POS_PAGE_COMPRESSION_LEVEL (DICT_TF_POS_PAGE_COMPRESSION \ + + DICT_TF_WIDTH_PAGE_COMPRESSION) +/** Zero relative shift position of the ATOMIC_WRITES field */ +#define DICT_TF_POS_ATOMIC_WRITES (DICT_TF_POS_PAGE_COMPRESSION_LEVEL \ + + DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL) + /** Zero relative shift position of the start of the UNUSED bits */ -#define DICT_TF_POS_UNUSED (DICT_TF_POS_DATA_DIR \ - + DICT_TF_WIDTH_DATA_DIR) +#define DICT_TF_POS_UNUSED (DICT_TF_POS_ATOMIC_WRITES \ + + DICT_TF_WIDTH_ATOMIC_WRITES) /** Bit mask of the COMPACT field */ #define DICT_TF_MASK_COMPACT \ @@ -164,6 +190,18 @@ allows InnoDB to update_create_info() accordingly. 
*/ #define DICT_TF_MASK_DATA_DIR \ ((~(~0 << DICT_TF_WIDTH_DATA_DIR)) \ << DICT_TF_POS_DATA_DIR) +/** Bit mask of the PAGE_COMPRESSION field */ +#define DICT_TF_MASK_PAGE_COMPRESSION \ + ((~(~0 << DICT_TF_WIDTH_PAGE_COMPRESSION)) \ + << DICT_TF_POS_PAGE_COMPRESSION) +/** Bit mask of the PAGE_COMPRESSION_LEVEL field */ +#define DICT_TF_MASK_PAGE_COMPRESSION_LEVEL \ + ((~(~0 << DICT_TF_WIDTH_PAGE_COMPRESSION_LEVEL)) \ + << DICT_TF_POS_PAGE_COMPRESSION_LEVEL) +/** Bit mask of the ATOMIC_WRITES field */ +#define DICT_TF_MASK_ATOMIC_WRITES \ + ((~(~0 << DICT_TF_WIDTH_ATOMIC_WRITES)) \ + << DICT_TF_POS_ATOMIC_WRITES) /** Return the value of the COMPACT field */ #define DICT_TF_GET_COMPACT(flags) \ @@ -181,6 +219,19 @@ allows InnoDB to update_create_info() accordingly. */ #define DICT_TF_HAS_DATA_DIR(flags) \ ((flags & DICT_TF_MASK_DATA_DIR) \ >> DICT_TF_POS_DATA_DIR) +/** Return the value of the PAGE_COMPRESSION field */ +#define DICT_TF_GET_PAGE_COMPRESSION(flags) \ + ((flags & DICT_TF_MASK_PAGE_COMPRESSION) \ + >> DICT_TF_POS_PAGE_COMPRESSION) +/** Return the value of the PAGE_COMPRESSION_LEVEL field */ +#define DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags) \ + ((flags & DICT_TF_MASK_PAGE_COMPRESSION_LEVEL) \ + >> DICT_TF_POS_PAGE_COMPRESSION_LEVEL) +/** Return the value of the ATOMIC_WRITES field */ +#define DICT_TF_GET_ATOMIC_WRITES(flags) \ + ((flags & DICT_TF_MASK_ATOMIC_WRITES) \ + >> DICT_TF_POS_ATOMIC_WRITES) + /** Return the contents of the UNUSED bits */ #define DICT_TF_GET_UNUSED(flags) \ (flags >> DICT_TF_POS_UNUSED) @@ -492,6 +543,9 @@ be REC_VERSION_56_MAX_INDEX_COL_LEN (3072) bytes */ /** Defines the maximum fixed length column size */ #define DICT_MAX_FIXED_COL_LEN DICT_ANTELOPE_MAX_INDEX_COL_LEN +#ifdef WITH_WSREP +#define WSREP_MAX_SUPPORTED_KEY_LENGTH 3500 +#endif /* WITH_WSREP */ /** Data structure for a field in an index */ struct dict_field_t{ @@ -562,6 +616,10 @@ struct zip_pad_info_t { rounds */ }; +/** Number of samples of data size kept when page 
compression fails for +a certain index.*/ +#define STAT_DEFRAG_DATA_SIZE_N_SAMPLE 10 + /** Data structure for an index. Most fields will be initialized to 0, NULL or FALSE in dict_mem_index_create(). */ struct dict_index_t{ @@ -653,6 +711,23 @@ struct dict_index_t{ /*!< has persistent statistics error printed for this index ? */ /* @} */ + /** Statistics for defragmentation, these numbers are estimations and + could be very inaccurate at certain times, e.g. right after restart, + during defragmentation, etc. */ + /* @{ */ + ulint stat_defrag_modified_counter; + ulint stat_defrag_n_pages_freed; + /* number of pages freed by defragmentation. */ + ulint stat_defrag_n_page_split; + /* number of page splits since last full index + defragmentation. */ + ulint stat_defrag_data_size_sample[STAT_DEFRAG_DATA_SIZE_N_SAMPLE]; + /* data size when compression failure happened + the most recent 10 times. */ + ulint stat_defrag_sample_next_slot; + /* in which slot the next sample should be + saved. */ + /* @} */ rw_lock_t lock; /*!< read-write lock protecting the upper levels of the index tree */ trx_id_t trx_id; /*!< id of the transaction that created this diff --git a/storage/innobase/include/dict0pagecompress.h b/storage/innobase/include/dict0pagecompress.h new file mode 100644 index 00000000000..19a2a6c52f3 --- /dev/null +++ b/storage/innobase/include/dict0pagecompress.h @@ -0,0 +1,94 @@ +/***************************************************************************** + +Copyright (C) 2013 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/dict0pagecompress.h +Helper functions for extracting/storing page compression information +to dictionary. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#ifndef dict0pagecompress_h +#define dict0pagecompress_h + +/********************************************************************//** +Extract the page compression level from table flags. +@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_tf_get_page_compression_level( +/*===============================*/ + ulint flags) /*!< in: flags */ + __attribute__((const)); +/********************************************************************//** +Extract the page compression flag from table flags +@return page compression flag, or false if not compressed */ +UNIV_INLINE +ibool +dict_tf_get_page_compression( +/*==========================*/ + ulint flags) /*!< in: flags */ + __attribute__((const)); + +/********************************************************************//** +Check whether the table uses the page compressed page format. 
+@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_table_page_compression_level( +/*==============================*/ + const dict_table_t* table) /*!< in: table */ + __attribute__((const)); + +/********************************************************************//** +Verify that dictionary flags match tablespace flags +@return true if flags match, false if not */ +UNIV_INLINE +ibool +dict_tf_verify_flags( +/*=================*/ + ulint table_flags, /*!< in: dict_table_t::flags */ + ulint fsp_flags) /*!< in: fil_space_t::flags */ + __attribute__((const)); + +/********************************************************************//** +Extract the atomic writes flag from table flags. +@return true if atomic writes are used, false if not used */ +UNIV_INLINE +atomic_writes_t +dict_tf_get_atomic_writes( +/*======================*/ + ulint flags) /*!< in: flags */ + __attribute__((const)); + +/********************************************************************//** +Check whether the table uses the atomic writes. +@return true if atomic writes is used, false if not */ +UNIV_INLINE +atomic_writes_t +dict_table_get_atomic_writes( +/*=========================*/ + const dict_table_t* table); /*!< in: table */ + + +#ifndef UNIV_NONINL +#include "dict0pagecompress.ic" +#endif + +#endif diff --git a/storage/innobase/include/dict0pagecompress.ic b/storage/innobase/include/dict0pagecompress.ic new file mode 100644 index 00000000000..811976434a8 --- /dev/null +++ b/storage/innobase/include/dict0pagecompress.ic @@ -0,0 +1,191 @@ +/***************************************************************************** + +Copyright (C) 2013 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. 
+ +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/dict0pagecompress.ic +Inline implementation for helper functions for extracting/storing +page compression and atomic writes information to dictionary. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +/********************************************************************//** +Verify that dictionary flags match tablespace flags +@return true if flags match, false if not */ +UNIV_INLINE +ibool +dict_tf_verify_flags( +/*=================*/ + ulint table_flags, /*!< in: dict_table_t::flags */ + ulint fsp_flags) /*!< in: fil_space_t::flags */ +{ + ulint table_unused = DICT_TF_GET_UNUSED(table_flags); + ulint compact = DICT_TF_GET_COMPACT(table_flags); + ulint ssize = DICT_TF_GET_ZIP_SSIZE(table_flags); + ulint atomic_blobs = DICT_TF_HAS_ATOMIC_BLOBS(table_flags); + ulint data_dir = DICT_TF_HAS_DATA_DIR(table_flags); + ulint page_compression = DICT_TF_GET_PAGE_COMPRESSION(table_flags); + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(table_flags); + ulint atomic_writes = DICT_TF_GET_ATOMIC_WRITES(table_flags); + ulint post_antelope = FSP_FLAGS_GET_POST_ANTELOPE(fsp_flags); + ulint zip_ssize = FSP_FLAGS_GET_ZIP_SSIZE(fsp_flags); + ulint fsp_atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(fsp_flags); + ulint page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(fsp_flags); + ulint 
fsp_unused = FSP_FLAGS_GET_UNUSED(fsp_flags); + ulint fsp_page_compression = FSP_FLAGS_GET_PAGE_COMPRESSION(fsp_flags); + ulint fsp_page_compression_level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(fsp_flags); + ulint fsp_atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(fsp_flags); + + DBUG_EXECUTE_IF("dict_tf_verify_flags_failure", + return(ULINT_UNDEFINED);); + + ut_a(!table_unused); + ut_a(!fsp_unused); + ut_a(page_ssize == 0 || page_ssize != 0); /* silence compiler */ + ut_a(compact == 0 || compact == 1); /* silence compiler */ + ut_a(data_dir == 0 || data_dir == 1); /* silence compiler */ + ut_a(post_antelope == 0 || post_antelope == 1); /* silence compiler */ + + if (ssize != zip_ssize) { + fprintf(stderr, + "InnoDB: Error: table flags has zip_ssize %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file has zip_ssize %ld\n", + ssize, zip_ssize); + return (FALSE); + } + if (atomic_blobs != fsp_atomic_blobs) { + fprintf(stderr, + "InnoDB: Error: table flags has atomic_blobs %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file has atomic_blobs %ld\n", + atomic_blobs, fsp_atomic_blobs); + + return (FALSE); + } + if (page_compression != fsp_page_compression) { + fprintf(stderr, + "InnoDB: Error: table flags has page_compression %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file has page_compression %ld\n", + page_compression, fsp_page_compression); + + return (FALSE); + } + if (page_compression_level != fsp_page_compression_level) { + fprintf(stderr, + "InnoDB: Error: table flags has page_compression_level %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file has page_compression_level %ld\n", + page_compression_level, fsp_page_compression_level); + + return (FALSE); + } + + if (atomic_writes != fsp_atomic_writes) { + fprintf(stderr, + "InnoDB: Error: table flags has atomic writes %ld" + " in the data dictionary\n" + "InnoDB: but the flags in file has atomic_writes %ld\n", + atomic_writes, fsp_atomic_writes); + 
return (FALSE); + } + + return(TRUE); +} + +/********************************************************************//** +Extract the page compression level from dict_table_t::flags. +These flags are in memory, so assert that they are valid. +@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_tf_get_page_compression_level( +/*===============================*/ + ulint flags) /*!< in: flags */ +{ + ulint page_compression_level = DICT_TF_GET_PAGE_COMPRESSION_LEVEL(flags); + + ut_ad(page_compression_level <= 9); + + return(page_compression_level); +} + +/********************************************************************//** +Check whether the table uses the page compression page format. +@return page compression level, or 0 if not compressed */ +UNIV_INLINE +ulint +dict_table_page_compression_level( +/*==============================*/ + const dict_table_t* table) /*!< in: table */ +{ + ut_ad(table); + ut_ad(dict_tf_get_page_compression(table->flags)); + + return(dict_tf_get_page_compression_level(table->flags)); +} + +/********************************************************************//** +Check whether the table uses the page compression page format. +@return true if page compressed, false if not */ +UNIV_INLINE +ibool +dict_tf_get_page_compression( +/*=========================*/ + ulint flags) /*!< in: flags */ +{ + return(DICT_TF_GET_PAGE_COMPRESSION(flags)); +} + +/********************************************************************//** +Check whether the table uses the page compression page format. +@return true if page compressed, false if not */ +UNIV_INLINE +ibool +dict_table_is_page_compressed( +/*==========================*/ + const dict_table_t* table) /*!< in: table */ +{ + return (dict_tf_get_page_compression(table->flags)); +} + +/********************************************************************//** +Extract the atomic writes flag from table flags. 
+@return enumerated value of atomic writes */ +UNIV_INLINE +atomic_writes_t +dict_tf_get_atomic_writes( +/*======================*/ + ulint flags) /*!< in: flags */ +{ + return((atomic_writes_t)DICT_TF_GET_ATOMIC_WRITES(flags)); +} + +/********************************************************************//** +Check whether the table uses the atomic writes. +@return enumerated value of atomic writes */ +UNIV_INLINE +atomic_writes_t +dict_table_get_atomic_writes( +/*=========================*/ + const dict_table_t* table) /*!< in: table */ +{ + return ((atomic_writes_t)dict_tf_get_atomic_writes(table->flags)); +} diff --git a/storage/innobase/include/dict0priv.h b/storage/innobase/include/dict0priv.h index 9a3c8e22992..e034662aba0 100644 --- a/storage/innobase/include/dict0priv.h +++ b/storage/innobase/include/dict0priv.h @@ -53,8 +53,9 @@ dict_table_t* dict_table_open_on_id_low( /*=====================*/ table_id_t table_id, /*!< in: table id */ - dict_err_ignore_t ignore_err); /*!< in: errors to ignore + dict_err_ignore_t ignore_err, /*!< in: errors to ignore when loading the table */ + ibool open_only_if_in_cache); #ifndef UNIV_NONINL #include "dict0priv.ic" diff --git a/storage/innobase/include/dict0priv.ic b/storage/innobase/include/dict0priv.ic index 30ba8fb60aa..983218af78a 100644 --- a/storage/innobase/include/dict0priv.ic +++ b/storage/innobase/include/dict0priv.ic @@ -74,8 +74,9 @@ dict_table_t* dict_table_open_on_id_low( /*======================*/ table_id_t table_id, /*!< in: table id */ - dict_err_ignore_t ignore_err) /*!< in: errors to ignore + dict_err_ignore_t ignore_err, /*!< in: errors to ignore when loading the table */ + ibool open_only_if_in_cache) { dict_table_t* table; ulint fold; @@ -88,7 +89,7 @@ dict_table_open_on_id_low( HASH_SEARCH(id_hash, dict_sys->table_id_hash, fold, dict_table_t*, table, ut_ad(table->cached), table->id == table_id); - if (table == NULL) { + if (table == NULL && !open_only_if_in_cache) { table = 
dict_load_table_on_id(table_id, ignore_err); } diff --git a/storage/innobase/include/dict0stats.h b/storage/innobase/include/dict0stats.h index 186f90e3694..abf56b2f0c7 100644 --- a/storage/innobase/include/dict0stats.h +++ b/storage/innobase/include/dict0stats.h @@ -195,6 +195,39 @@ dict_stats_rename_table( is returned */ size_t errstr_sz); /*!< in: errstr size */ +/*********************************************************************//** +Save defragmentation result. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_stats_save_defrag_summary( + dict_index_t* index); /*!< in: index */ + +/*********************************************************************//** +Save defragmentation stats for a given index. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_stats_save_defrag_stats( + dict_index_t* index); /*!< in: index */ + +/**********************************************************************//** +Clear defragmentation summary. */ +UNIV_INTERN +void +dict_stats_empty_defrag_summary( +/*==================*/ + dict_index_t* index); /*!< in: index to clear defragmentation stats */ + +/**********************************************************************//** +Clear defragmentation related index stats. */ +UNIV_INTERN +void +dict_stats_empty_defrag_stats( +/*==================*/ + dict_index_t* index); /*!< in: index to clear defragmentation stats */ + + #ifndef UNIV_NONINL #include "dict0stats.ic" #endif diff --git a/storage/innobase/include/dict0stats_bg.h b/storage/innobase/include/dict0stats_bg.h index e866ab419fe..32fac3015e8 100644 --- a/storage/innobase/include/dict0stats_bg.h +++ b/storage/innobase/include/dict0stats_bg.h @@ -56,6 +56,28 @@ dict_stats_recalc_pool_del( /*=======================*/ const dict_table_t* table); /*!< in: table to remove */ +/*****************************************************************//** +Add an index in a table to the defrag pool, which is processed by the +background stats gathering thread. 
Only the table id and index id are +added to the list, so the table can be closed after being enqueued and +it will be opened when needed. If the table or index does not exist later +(has been DROPped), then it will be removed from the pool and skipped. */ +UNIV_INTERN +void +dict_stats_defrag_pool_add( +/*=======================*/ + const dict_index_t* index); /*!< in: table to add */ + +/*****************************************************************//** +Delete a given index from the auto defrag pool. */ +UNIV_INTERN +void +dict_stats_defrag_pool_del( +/*=======================*/ + const dict_table_t* table, /*!<in: if given, remove + all entries for the table */ + const dict_index_t* index); /*!< in: index to remove */ + /** Yield the data dictionary latch when waiting for the background thread to stop accessing a table. @param trx transaction holding the data dictionary locks */ diff --git a/storage/innobase/include/dict0types.h b/storage/innobase/include/dict0types.h index d34b6f7eab3..35430e8ea62 100644 --- a/storage/innobase/include/dict0types.h +++ b/storage/innobase/include/dict0types.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1996, 2013, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -75,6 +76,13 @@ enum ib_quiesce_t { QUIESCE_COMPLETE /*!< All done */ }; +/** Enum values for atomic_writes table option */ +typedef enum { + ATOMIC_WRITES_DEFAULT = 0, + ATOMIC_WRITES_ON = 1, + ATOMIC_WRITES_OFF = 2 +} atomic_writes_t; + /** Prefix for tmp tables, adopted from sql/table.h */ #define tmp_file_prefix "#sql" #define tmp_file_prefix_length 4 diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index 798423eeddd..9c453d3f4ca 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -126,11 +127,33 @@ extern fil_addr_t fil_addr_null; data file (ibdata*, not *.ibd): the file has been flushed to disk at least up to this lsn */ +/** If page type is FIL_PAGE_COMPRESSED then the 8 bytes starting at +FIL_PAGE_FILE_FLUSH_LSN are broken down as follows: */ + +/** Control information version format (u8) */ +static const ulint FIL_PAGE_VERSION = FIL_PAGE_FILE_FLUSH_LSN; + +/** Compression algorithm (u8) */ +static const ulint FIL_PAGE_ALGORITHM_V1 = FIL_PAGE_VERSION + 1; + +/** Original page type (u16) */ +static const ulint FIL_PAGE_ORIGINAL_TYPE_V1 = FIL_PAGE_ALGORITHM_V1 + 1; + +/** Original data size in bytes (u16)*/ +static const ulint FIL_PAGE_ORIGINAL_SIZE_V1 = FIL_PAGE_ORIGINAL_TYPE_V1 + 2; + +/** Size after compression (u16)*/ +static const ulint FIL_PAGE_COMPRESS_SIZE_V1 = FIL_PAGE_ORIGINAL_SIZE_V1 + 2; + #define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID 34 /*!< 
starting from 4.1.x this contains the space id of the page */ #define FIL_PAGE_SPACE_ID FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID #define FIL_PAGE_DATA 38 /*!< start of the data on the page */ +/* Following are used when page compression is used */ +#define FIL_PAGE_COMPRESSED_SIZE 2 /*!< Number of bytes used to store + actual payload data size on + compressed pages. */ /* @} */ /** File page trailer @{ */ #define FIL_PAGE_END_LSN_OLD_CHKSUM 8 /*!< the low 4 bytes of this are used @@ -143,6 +166,7 @@ extern fil_addr_t fil_addr_null; #ifndef UNIV_INNOCHECKSUM /** File page types (values of FIL_PAGE_TYPE) @{ */ +#define FIL_PAGE_PAGE_COMPRESSED 34354 /*!< page compressed page */ #define FIL_PAGE_INDEX 17855 /*!< B-tree node */ #define FIL_PAGE_UNDO_LOG 2 /*!< Undo log page */ #define FIL_PAGE_INODE 3 /*!< Index node */ @@ -157,7 +181,8 @@ extern fil_addr_t fil_addr_null; #define FIL_PAGE_TYPE_BLOB 10 /*!< Uncompressed BLOB page */ #define FIL_PAGE_TYPE_ZBLOB 11 /*!< First compressed BLOB page */ #define FIL_PAGE_TYPE_ZBLOB2 12 /*!< Subsequent compressed BLOB page */ -#define FIL_PAGE_TYPE_LAST FIL_PAGE_TYPE_ZBLOB2 +#define FIL_PAGE_TYPE_COMPRESSED 13 /*!< Compressed page */ +#define FIL_PAGE_TYPE_LAST FIL_PAGE_TYPE_COMPRESSED /*!< Last page type */ /* @} */ @@ -223,6 +248,7 @@ struct fil_node_t { ib_int64_t flush_counter;/*!< up to what modification_counter value we have flushed the modifications to disk */ + ulint file_block_size;/*!< file system block size */ UT_LIST_NODE_T(fil_node_t) chain; /*!< link field for the file chain */ UT_LIST_NODE_T(fil_node_t) LRU; @@ -396,6 +422,7 @@ ulint fil_space_get_type( /*===============*/ ulint id); /*!< in: space id */ + #endif /* !UNIV_HOTBACKUP */ /*******************************************************************//** Appends a new file to the chain of files of a space. File must be closed. 
@@ -575,8 +602,10 @@ fil_read_first_page( #endif /* UNIV_LOG_ARCHIVE */ lsn_t* min_flushed_lsn, /*!< out: min of flushed lsn values in data files */ - lsn_t* max_flushed_lsn) /*!< out: max of flushed + lsn_t* max_flushed_lsn, /*!< out: max of flushed lsn values in data files */ + ulint orig_space_id) /*!< in: file space id or + ULINT_UNDEFINED */ __attribute__((warn_unused_result)); /*******************************************************************//** Increments the count of pending operation, if space is not being deleted. @@ -939,8 +968,13 @@ fil_io( void* buf, /*!< in/out: buffer where to store read data or from where to write; in aio this must be appropriately aligned */ - void* message) /*!< in: message for aio handler if non-sync + void* message, /*!< in: message for aio handler if non-sync aio used, else ignored */ + ulint* write_size) /*!< in/out: Actual write size initialized + after first successful trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ __attribute__((nonnull(8))); /**********************************************************************//** Waits for an aio operation to complete. This function is used to write the @@ -1192,4 +1226,38 @@ fil_user_tablespace_restore_page( write buffer */ #endif /* !UNIV_INNOCHECKSUM */ + +/****************************************************************//** +Acquire fil_system mutex */ +void +fil_system_enter(void); +/*==================*/ +/****************************************************************//** +Release fil_system mutex */ +void +fil_system_exit(void); +/*==================*/ + +#ifndef UNIV_INNOCHECKSUM +/*******************************************************************//** +Returns the table space by a given id, NULL if not found. 
*/ +fil_space_t* +fil_space_get_by_id( +/*================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Return space name */ +char* +fil_space_name( +/*===========*/ + fil_space_t* space); /*!< in: space */ +#endif + +/*******************************************************************//** +Return page type name */ +const char* +fil_get_page_type_name( +/*===================*/ + ulint page_type); /*!< in: FIL_PAGE_TYPE */ + #endif /* fil0fil_h */ diff --git a/storage/innobase/include/fil0pagecompress.h b/storage/innobase/include/fil0pagecompress.h new file mode 100644 index 00000000000..fb97af87460 --- /dev/null +++ b/storage/innobase/include/fil0pagecompress.h @@ -0,0 +1,138 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +#ifndef fil0pagecompress_h +#define fil0pagecompress_h + +#include "fsp0fsp.h" +#include "fsp0pagecompress.h" + +/******************************************************************//** +@file include/fil0pagecompress.h +Helper functions for extracting/storing page compression and +atomic writes information to table space. 
+ +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +/*******************************************************************//** +Returns the page compression level flag of the space, or 0 if the space +is not compressed. The tablespace must be cached in the memory cache. +@return page compression level if page compressed, ULINT_UNDEFINED if space not found */ +ulint +fil_space_get_page_compression_level( +/*=================================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Returns the page compression flag of the space, or false if the space +is not compressed. The tablespace must be cached in the memory cache. +@return true if page compressed, false if not or space not found */ +ibool +fil_space_is_page_compressed( +/*=========================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Returns the page compression flag of the space, or false if the space +is not compressed. The tablespace must be cached in the memory cache. +@return true if page compressed, false if not or space not found */ +ibool +fil_space_get_page_compressed( +/*=========================*/ + fil_space_t* space); /*!< in: space id */ +/*******************************************************************//** +Returns the atomic writes flag of the space, or false if the space +is not using atomic writes. The tablespace must be cached in the memory cache. 
+@return atomic write table option value +atomic_writes_t +fil_space_get_atomic_writes( +/*=========================*/ + ulint id); /*!< in: space id */ +/*******************************************************************//** +Find out whether the page is index page or not +@return true if page type index page, false if not */ +ibool +fil_page_is_index_page( +/*===================*/ + byte *buf); /*!< in: page */ + +/****************************************************************//** +Get the name of the compression algorithm used for page +compression. +@return compression algorithm name or "UNKNOWN" if not known*/ +const char* +fil_get_compression_alg_name( +/*=========================*/ + ulint comp_alg); /*!<in: compression algorithm number */ + +/****************************************************************//** +For page compressed pages compress the page before actual write +operation. +@return compressed page to be written*/ +byte* +fil_compress_page( +/*==============*/ + ulint space_id, /*!< in: tablespace id of the + table. */ + byte* buf, /*!< in: buffer from which to write; in aio + this must be appropriately aligned */ + byte* out_buf, /*!< out: compressed buffer */ + ulint len, /*!< in: length of input buffer.*/ + ulint compression_level, /*!< in: compression level */ + ulint block_size, /*!< in: block size */ + ulint* out_len, /*!< out: actual length of compressed + page */ + byte* lzo_mem); /*!< in: temporary memory used by LZO */ + +/****************************************************************//** +For page compressed pages decompress the page after actual read +operation. +@return uncompressed page */ +void +fil_decompress_page( +/*================*/ + byte* page_buf, /*!< in: preallocated buffer or NULL */ + byte* buf, /*!< out: buffer from which to read; in aio + this must be appropriately aligned */ + ulong len, /*!< in: length of output buffer.*/ + ulint* write_size); /*!< in/out: Actual payload size of + the compressed data. 
*/ + +/****************************************************************//** +Get space id from fil node +@return space id*/ +ulint +fil_node_get_space_id( +/*==================*/ + fil_node_t* node); /*!< in: Node where to get space id*/ + +/****************************************************************//** +Get block size from fil node +@return block size*/ +ulint +fil_node_get_block_size( + fil_node_t* node); /*!< in: Node where to get block + size */ +/*******************************************************************//** +Find out whether the page is page compressed +@return true if page is page compressed*/ +ibool +fil_page_is_compressed( +/*===================*/ + byte *buf); /*!< in: page */ + +#endif diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h index a587ccc9f20..87f1f5a636d 100644 --- a/storage/innobase/include/fsp0fsp.h +++ b/storage/innobase/include/fsp0fsp.h @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -53,12 +54,21 @@ to the two Barracuda row formats COMPRESSED and DYNAMIC. */ /** Width of the DATA_DIR flag. This flag indicates that the tablespace is found in a remote location, not the default data directory. 
*/ #define FSP_FLAGS_WIDTH_DATA_DIR 1 +/** Number of flag bits used to indicate the page compression and compression level */ +#define FSP_FLAGS_WIDTH_PAGE_COMPRESSION 1 +#define FSP_FLAGS_WIDTH_PAGE_COMPRESSION_LEVEL 4 +/** Number of flag bits used to indicate atomic writes for this tablespace */ +#define FSP_FLAGS_WIDTH_ATOMIC_WRITES 2 + /** Width of all the currently known tablespace flags */ #define FSP_FLAGS_WIDTH (FSP_FLAGS_WIDTH_POST_ANTELOPE \ + FSP_FLAGS_WIDTH_ZIP_SSIZE \ + FSP_FLAGS_WIDTH_ATOMIC_BLOBS \ + FSP_FLAGS_WIDTH_PAGE_SSIZE \ - + FSP_FLAGS_WIDTH_DATA_DIR) + + FSP_FLAGS_WIDTH_DATA_DIR \ + + FSP_FLAGS_WIDTH_PAGE_COMPRESSION \ + + FSP_FLAGS_WIDTH_PAGE_COMPRESSION_LEVEL \ + + FSP_FLAGS_WIDTH_ATOMIC_WRITES) /** A mask of all the known/used bits in tablespace flags */ #define FSP_FLAGS_MASK (~(~0 << FSP_FLAGS_WIDTH)) @@ -71,9 +81,20 @@ is found in a remote location, not the default data directory. */ /** Zero relative shift position of the ATOMIC_BLOBS field */ #define FSP_FLAGS_POS_ATOMIC_BLOBS (FSP_FLAGS_POS_ZIP_SSIZE \ + FSP_FLAGS_WIDTH_ZIP_SSIZE) -/** Zero relative shift position of the PAGE_SSIZE field */ -#define FSP_FLAGS_POS_PAGE_SSIZE (FSP_FLAGS_POS_ATOMIC_BLOBS \ +/** Note that these need to be before the page size to be compatible with +dictionary */ +/** Zero relative shift position of the PAGE_COMPRESSION field */ +#define FSP_FLAGS_POS_PAGE_COMPRESSION (FSP_FLAGS_POS_ATOMIC_BLOBS \ + FSP_FLAGS_WIDTH_ATOMIC_BLOBS) +/** Zero relative shift position of the PAGE_COMPRESSION_LEVEL field */ +#define FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL (FSP_FLAGS_POS_PAGE_COMPRESSION \ + + FSP_FLAGS_WIDTH_PAGE_COMPRESSION) +/** Zero relative shift position of the ATOMIC_WRITES field */ +#define FSP_FLAGS_POS_ATOMIC_WRITES (FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL \ + + FSP_FLAGS_WIDTH_PAGE_COMPRESSION_LEVEL) + /** Zero relative shift position of the PAGE_SSIZE field */ +#define FSP_FLAGS_POS_PAGE_SSIZE (FSP_FLAGS_POS_ATOMIC_WRITES \ + + 
FSP_FLAGS_WIDTH_ATOMIC_WRITES) /** Zero relative shift position of the start of the UNUSED bits */ #define FSP_FLAGS_POS_DATA_DIR (FSP_FLAGS_POS_PAGE_SSIZE \ + FSP_FLAGS_WIDTH_PAGE_SSIZE) @@ -101,6 +122,18 @@ is found in a remote location, not the default data directory. */ #define FSP_FLAGS_MASK_DATA_DIR \ ((~(~0 << FSP_FLAGS_WIDTH_DATA_DIR)) \ << FSP_FLAGS_POS_DATA_DIR) +/** Bit mask of the PAGE_COMPRESSION field */ +#define FSP_FLAGS_MASK_PAGE_COMPRESSION \ + ((~(~0 << FSP_FLAGS_WIDTH_PAGE_COMPRESSION)) \ + << FSP_FLAGS_POS_PAGE_COMPRESSION) +/** Bit mask of the PAGE_COMPRESSION_LEVEL field */ +#define FSP_FLAGS_MASK_PAGE_COMPRESSION_LEVEL \ + ((~(~0 << FSP_FLAGS_WIDTH_PAGE_COMPRESSION_LEVEL)) \ + << FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL) +/** Bit mask of the ATOMIC_WRITES field */ +#define FSP_FLAGS_MASK_ATOMIC_WRITES \ + ((~(~0 << FSP_FLAGS_WIDTH_ATOMIC_WRITES)) \ + << FSP_FLAGS_POS_ATOMIC_WRITES) /** Return the value of the POST_ANTELOPE field */ #define FSP_FLAGS_GET_POST_ANTELOPE(flags) \ @@ -126,11 +159,38 @@ is found in a remote location, not the default data directory. */ #define FSP_FLAGS_GET_UNUSED(flags) \ (flags >> FSP_FLAGS_POS_UNUSED) +/** Return the value of the PAGE_COMPRESSION field */ +#define FSP_FLAGS_GET_PAGE_COMPRESSION(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION) \ + >> FSP_FLAGS_POS_PAGE_COMPRESSION) +/** Return the value of the PAGE_COMPRESSION_LEVEL field */ +#define FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags) \ + ((flags & FSP_FLAGS_MASK_PAGE_COMPRESSION_LEVEL) \ + >> FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL) +/** Return the value of the ATOMIC_WRITES field */ +#define FSP_FLAGS_GET_ATOMIC_WRITES(flags) \ + ((flags & FSP_FLAGS_MASK_ATOMIC_WRITES) \ + >> FSP_FLAGS_POS_ATOMIC_WRITES) + /** Set a PAGE_SSIZE into the correct bits in a given tablespace flags. */ #define FSP_FLAGS_SET_PAGE_SSIZE(flags, ssize) \ (flags | (ssize << FSP_FLAGS_POS_PAGE_SSIZE)) +/** Set a PAGE_COMPRESSION into the correct bits in a given +tablespace flags. 
*/ +#define FSP_FLAGS_SET_PAGE_COMPRESSION(flags, compression) \ + (flags | (compression << FSP_FLAGS_POS_PAGE_COMPRESSION)) + +/** Set a PAGE_COMPRESSION_LEVEL into the correct bits in a given +tablespace flags. */ +#define FSP_FLAGS_SET_PAGE_COMPRESSION_LEVEL(flags, level) \ + (flags | (level << FSP_FLAGS_POS_PAGE_COMPRESSION_LEVEL)) +/** Set a ATOMIC_WRITES into the correct bits in a given +tablespace flags. */ +#define FSP_FLAGS_SET_ATOMIC_WRITES(flags, atomics) \ + (flags | (atomics << FSP_FLAGS_POS_ATOMIC_WRITES)) + /* @} */ /* @defgroup Tablespace Header Constants (moved from fsp0fsp.c) @{ */ diff --git a/storage/innobase/include/fsp0fsp.ic b/storage/innobase/include/fsp0fsp.ic index 0d81e817cc9..3a3eb21a61a 100644 --- a/storage/innobase/include/fsp0fsp.ic +++ b/storage/innobase/include/fsp0fsp.ic @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2012, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -63,12 +64,17 @@ fsp_flags_is_valid( ulint atomic_blobs = FSP_FLAGS_HAS_ATOMIC_BLOBS(flags); ulint page_ssize = FSP_FLAGS_GET_PAGE_SSIZE(flags); ulint unused = FSP_FLAGS_GET_UNUSED(flags); + ulint page_compression = FSP_FLAGS_GET_PAGE_COMPRESSION(flags); + ulint page_compression_level = FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags); + ulint atomic_writes = FSP_FLAGS_GET_ATOMIC_WRITES(flags); DBUG_EXECUTE_IF("fsp_flags_is_valid_failure", return(false);); /* fsp_flags is zero unless atomic_blobs is set. */ /* Make sure there are no bits that we do not know about. 
*/ if (unused != 0 || flags == 1) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted unused %lu\n", + flags, unused); return(false); } else if (post_antelope) { /* The Antelope row formats REDUNDANT and COMPACT did @@ -76,6 +82,8 @@ fsp_flags_is_valid( 4-byte field is zero for Antelope row formats. */ if (!atomic_blobs) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted atomic_blobs %lu\n", + flags, atomic_blobs); return(false); } } @@ -87,10 +95,14 @@ fsp_flags_is_valid( externally stored parts. */ if (post_antelope || zip_ssize != 0) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted zip_ssize %lu atomic_blobs %lu\n", + flags, zip_ssize, atomic_blobs); return(false); } } else if (!post_antelope || zip_ssize > PAGE_ZIP_SSIZE_MAX) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted zip_ssize %lu max %d\n", + flags, zip_ssize, PAGE_ZIP_SSIZE_MAX); return(false); } else if (page_ssize > UNIV_PAGE_SSIZE_MAX) { @@ -98,12 +110,33 @@ fsp_flags_is_valid( be zero for an original 16k page size. Validate the page shift size is within allowed range. 
*/ + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted page_ssize %lu max %lu\n", + flags, page_ssize, UNIV_PAGE_SSIZE_MAX); return(false); } else if (UNIV_PAGE_SIZE != UNIV_PAGE_SIZE_ORIG && !page_ssize) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted page_ssize %lu max %lu:%d\n", + flags, page_ssize, UNIV_PAGE_SIZE, UNIV_PAGE_SIZE_ORIG); return(false); } + /* Page compression level requires page compression and atomic blobs + to be set */ + if (page_compression_level || page_compression) { + if (!page_compression || !atomic_blobs) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted page_compression %lu\n" + "InnoDB: Error: page_compression_level %lu atomic_blobs %lu\n", + flags, page_compression, page_compression_level, atomic_blobs); + return(false); + } + } + + if (atomic_writes > ATOMIC_WRITES_OFF) { + fprintf(stderr, "InnoDB: Error: Tablespace flags %lu corrupted atomic_writes %lu\n", + flags, atomic_writes); + return (false); + } + #if UNIV_FORMAT_MAX != UNIV_FORMAT_B # error "UNIV_FORMAT_MAX != UNIV_FORMAT_B, Add more validations." #endif @@ -312,3 +345,4 @@ xdes_calc_descriptor_page( } #endif /* !UNIV_INNOCHECKSUM */ + diff --git a/storage/innobase/include/fsp0pagecompress.h b/storage/innobase/include/fsp0pagecompress.h new file mode 100644 index 00000000000..15212227829 --- /dev/null +++ b/storage/innobase/include/fsp0pagecompress.h @@ -0,0 +1,83 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fsp0pagecompress.h +Helper functions for extracting/storing page compression and +atomic writes information to file space. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +#ifndef fsp0pagecompress_h +#define fsp0pagecompress_h + +/* Supported page compression methods */ + +#define PAGE_UNCOMPRESSED 0 +#define PAGE_ZLIB_ALGORITHM 1 +#define PAGE_LZ4_ALGORITHM 2 +#define PAGE_LZO_ALGORITHM 3 +#define PAGE_LZMA_ALGORITHM 4 +#define PAGE_BZIP2_ALGORITHM 5 +#define PAGE_ALGORITHM_LAST PAGE_BZIP2_ALGORITHM + +/**********************************************************************//** +Reads the page compression level from the first page of a tablespace. +@return page compression level, or 0 if uncompressed */ +UNIV_INTERN +ulint +fsp_header_get_compression_level( +/*=============================*/ + const page_t* page); /*!< in: first page of a tablespace */ + +/********************************************************************//** +Determine if the tablespace is page compressed from dict_table_t::flags. +@return TRUE if page compressed, FALSE if not compressed */ +UNIV_INLINE +ibool +fsp_flags_is_page_compressed( +/*=========================*/ + ulint flags); /*!< in: tablespace flags */ + +/********************************************************************//** +Extract the page compression level from tablespace flags. +A tablespace has only one physical page compression level +whether that page is compressed or not. 
+@return page compression level of the file-per-table tablespace, +or zero if the table is not compressed. */ +UNIV_INLINE +ulint +fsp_flags_get_page_compression_level( +/*=================================*/ + ulint flags); /*!< in: tablespace flags */ + +/********************************************************************//** +Determine the tablespace is using atomic writes from dict_table_t::flags. +@return true if atomic writes is used, false if not */ +UNIV_INLINE +atomic_writes_t +fsp_flags_get_atomic_writes( +/*========================*/ + ulint flags); /*!< in: tablespace flags */ + +#ifndef UNIV_NONINL +#include "fsp0pagecompress.ic" +#endif + +#endif diff --git a/storage/innobase/include/fsp0pagecompress.ic b/storage/innobase/include/fsp0pagecompress.ic new file mode 100644 index 00000000000..1ba3b7835c9 --- /dev/null +++ b/storage/innobase/include/fsp0pagecompress.ic @@ -0,0 +1,184 @@ +/***************************************************************************** + +Copyright (C) 2013,2014 SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + +*****************************************************************************/ + +/******************************************************************//** +@file include/fsp0pagecompress.ic +Implementation for helper functions for extracting/storing page +compression and atomic writes information to file space. + +Created 11/12/2013 Jan Lindström jan.lindstrom@skysql.com +***********************************************************************/ + +/********************************************************************//** +Determine if the tablespace is page compressed from dict_table_t::flags. +@return TRUE if page compressed, FALSE if not page compressed */ +UNIV_INLINE +ibool +fsp_flags_is_page_compressed( +/*=========================*/ + ulint flags) /*!< in: tablespace flags */ +{ + return(FSP_FLAGS_GET_PAGE_COMPRESSION(flags)); +} + +/********************************************************************//** +Determine the tablespace is page compression level from dict_table_t::flags. +@return page compression level or 0 if not compressed*/ +UNIV_INLINE +ulint +fsp_flags_get_page_compression_level( +/*=================================*/ + ulint flags) /*!< in: tablespace flags */ +{ + return(FSP_FLAGS_GET_PAGE_COMPRESSION_LEVEL(flags)); +} + +/********************************************************************//** +Determine the tablespace is using atomic writes from dict_table_t::flags. 
+@return true if atomic writes is used, false if not */ +UNIV_INLINE +atomic_writes_t +fsp_flags_get_atomic_writes( +/*========================*/ + ulint flags) /*!< in: tablespace flags */ +{ + return((atomic_writes_t)FSP_FLAGS_GET_ATOMIC_WRITES(flags)); +} + +/*******************************************************************//** +Find out wheather the page is index page or not +@return true if page type index page, false if not */ +UNIV_INLINE +ibool +fil_page_is_index_page( +/*===================*/ + byte *buf) /*!< in: page */ +{ + return(mach_read_from_2(buf+FIL_PAGE_TYPE) == FIL_PAGE_INDEX); +} + +/*******************************************************************//** +Find out wheather the page is page compressed +@return true if page is page compressed, false if not */ +UNIV_INLINE +ibool +fil_page_is_compressed( +/*===================*/ + byte *buf) /*!< in: page */ +{ + return(mach_read_from_2(buf+FIL_PAGE_TYPE) == FIL_PAGE_PAGE_COMPRESSED); +} + +/*******************************************************************//** +Returns the page compression level of the space, or 0 if the space +is not compressed. The tablespace must be cached in the memory cache. +@return page compression level, ULINT_UNDEFINED if space not found */ +UNIV_INLINE +ulint +fil_space_get_page_compression_level( +/*=================================*/ + ulint id) /*!< in: space id */ +{ + ulint flags; + + flags = fil_space_get_flags(id); + + if (flags && flags != ULINT_UNDEFINED) { + + return(fsp_flags_get_page_compression_level(flags)); + } + + return(flags); +} + +/*******************************************************************//** +Extract the page compression from space. +@return true if space is page compressed, false if space is not found +or space is not page compressed. 
*/ +UNIV_INLINE +ibool +fil_space_is_page_compressed( +/*=========================*/ + ulint id) /*!< in: space id */ +{ + ulint flags; + + flags = fil_space_get_flags(id); + + if (flags && flags != ULINT_UNDEFINED) { + + return(fsp_flags_is_page_compressed(flags)); + } + + return(flags); +} + +/****************************************************************//** +Get the name of the compression algorithm used for page +compression. +@return compression algorithm name or "UNKNOWN" if not known*/ +UNIV_INLINE +const char* +fil_get_compression_alg_name( +/*=========================*/ + ulint comp_alg) /*!<in: compression algorithm number */ +{ + switch(comp_alg) { + case PAGE_UNCOMPRESSED: + return ("uncompressed"); + break; + case PAGE_ZLIB_ALGORITHM: + return ("ZLIB"); + break; + case PAGE_LZ4_ALGORITHM: + return ("LZ4"); + break; + case PAGE_LZO_ALGORITHM: + return ("LZO"); + break; + case PAGE_LZMA_ALGORITHM: + return ("LZMA"); + break; + default: + return("UNKNOWN"); + ut_error; + break; + } +} + +/*******************************************************************//** +Returns the atomic writes flag of the space, or false if the space +is not using atomic writes. The tablespace must be cached in the memory cache. 
+@return atomic writes table option value */ +UNIV_INLINE +atomic_writes_t +fil_space_get_atomic_writes( +/*========================*/ + ulint id) /*!< in: space id */ +{ + ulint flags; + + flags = fil_space_get_flags(id); + + if (flags && flags != ULINT_UNDEFINED) { + + return((atomic_writes_t)fsp_flags_get_atomic_writes(flags)); + } + + return((atomic_writes_t)0); +} diff --git a/storage/innobase/include/fsp0types.h b/storage/innobase/include/fsp0types.h index 94fd908ab0c..a6797cd66de 100644 --- a/storage/innobase/include/fsp0types.h +++ b/storage/innobase/include/fsp0types.h @@ -29,6 +29,7 @@ Created May 26, 2009 Vasil Dimov #include "univ.i" #include "fil0fil.h" /* for FIL_PAGE_DATA */ +#include "ut0byte.h" /** @name Flags for inserting records in order If records are inserted in order, there are the following @@ -41,14 +42,17 @@ fseg_alloc_free_page) */ #define FSP_NO_DIR ((byte)113) /*!< no order */ /* @} */ -/** File space extent size (one megabyte) in pages */ -#define FSP_EXTENT_SIZE (1048576U / UNIV_PAGE_SIZE) +/** File space extent size (one megabyte if default two or four if not) in pages */ +#define FSP_EXTENT_SIZE ((UNIV_PAGE_SIZE <= (1 << 14) ? \ + (1048576U / UNIV_PAGE_SIZE) : \ + ((UNIV_PAGE_SIZE <= 1 << 15) ? 
\ + (2097152U / UNIV_PAGE_SIZE) : (4194304U / UNIV_PAGE_SIZE)))) -/** File space extent size (one megabyte) in pages for MAX page size */ -#define FSP_EXTENT_SIZE_MAX (1048576 / UNIV_PAGE_SIZE_MAX) +/** File space extent size (four megabytes) in pages for MAX page size */ +#define FSP_EXTENT_SIZE_MAX (4194304U / UNIV_PAGE_SIZE_MAX) /** File space extent size (one megabyte) in pages for MIN page size */ -#define FSP_EXTENT_SIZE_MIN (1048576 / UNIV_PAGE_SIZE_MIN) +#define FSP_EXTENT_SIZE_MIN (1048576U / UNIV_PAGE_SIZE_MIN) /** On a page of any file segment, data may be put starting from this offset */ diff --git a/storage/innobase/include/ha_prototypes.h b/storage/innobase/include/ha_prototypes.h index a02b8f1893a..212df1a1283 100644 --- a/storage/innobase/include/ha_prototypes.h +++ b/storage/innobase/include/ha_prototypes.h @@ -286,6 +286,16 @@ innobase_casedn_str( /*================*/ char* a); /*!< in/out: string to put in lower case */ +#ifdef WITH_WSREP +UNIV_INTERN +int +wsrep_innobase_kill_one_trx(void *thd_ptr, + const trx_t *bf_trx, trx_t *victim_trx, ibool signal); +int wsrep_innobase_mysql_sort(int mysql_type, uint charset_number, + unsigned char* str, unsigned int str_length, + unsigned int buf_length); +#endif /* WITH_WSREP */ + /**********************************************************************//** Determines the connection character set. @return connection character set */ diff --git a/storage/innobase/include/hash0hash.h b/storage/innobase/include/hash0hash.h index 6f9a628df5d..9a4077befb1 100644 --- a/storage/innobase/include/hash0hash.h +++ b/storage/innobase/include/hash0hash.h @@ -144,6 +144,33 @@ do {\ }\ } while (0) +#ifdef WITH_WSREP +/*******************************************************************//** +Inserts a struct to the head of hash table. 
*/ + +#define HASH_PREPEND(TYPE, NAME, TABLE, FOLD, DATA) \ +do { \ + hash_cell_t* cell3333; \ + TYPE* struct3333; \ + \ + HASH_ASSERT_OWN(TABLE, FOLD) \ + \ + (DATA)->NAME = NULL; \ + \ + cell3333 = hash_get_nth_cell(TABLE, hash_calc_hash(FOLD, TABLE));\ + \ + if (cell3333->node == NULL) { \ + cell3333->node = DATA; \ + DATA->NAME = NULL; \ + } else { \ + struct3333 = (TYPE*) cell3333->node; \ + \ + DATA->NAME = struct3333; \ + \ + cell3333->node = DATA; \ + } \ +} while (0) +#endif /*WITH_WSREP */ #ifdef UNIV_HASH_DEBUG # define HASH_ASSERT_VALID(DATA) ut_a((void*) (DATA) != (void*) -1) # define HASH_INVALIDATE(DATA, NAME) *(void**) (&DATA->NAME) = (void*) -1 diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h index bf4a4ae1c35..d96fdfa9d89 100644 --- a/storage/innobase/include/lock0lock.h +++ b/storage/innobase/include/lock0lock.h @@ -181,6 +181,16 @@ lock_update_merge_left( const buf_block_t* right_block); /*!< in: merged index page which will be discarded */ /*************************************************************//** +Updates the lock table when a page is splited and merged to +two pages. */ +UNIV_INTERN +void +lock_update_split_and_merge( + const buf_block_t* left_block, /*!< in: left page to which merged */ + const rec_t* orig_pred, /*!< in: original predecessor of + supremum on the left page before merge*/ + const buf_block_t* right_block);/*!< in: right page from which merged */ +/*************************************************************//** Resets the original locks on heir and replaces them with gap type locks inherited from rec. */ UNIV_INTERN @@ -972,6 +982,16 @@ extern lock_sys_t* lock_sys; mutex_exit(&lock_sys->wait_mutex); \ } while (0) +#ifdef WITH_WSREP +/*********************************************************************//** +Cancels a waiting lock request and releases possible other transactions +waiting behind it. 
*/ +UNIV_INTERN +void +lock_cancel_waiting_and_release( +/*============================*/ + lock_t* lock); /*!< in/out: waiting lock request */ +#endif /* WITH_WSREP */ #ifndef UNIV_NONINL #include "lock0lock.ic" #endif diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index 74d3c6bbc7c..8f8aef4f45c 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -2,6 +2,7 @@ Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Percona Inc.. Those modifications are @@ -151,10 +152,9 @@ enum os_file_create_t { #define OS_FILE_INSUFFICIENT_RESOURCE 78 #define OS_FILE_AIO_INTERRUPTED 79 #define OS_FILE_OPERATION_ABORTED 80 - #define OS_FILE_ACCESS_VIOLATION 81 - -#define OS_FILE_ERROR_MAX 100 +#define OS_FILE_OPERATION_NOT_SUPPORTED 125 +#define OS_FILE_ERROR_MAX 200 /* @} */ /** Types for aio operations @{ */ @@ -295,33 +295,35 @@ os_file_write The wrapper functions have the prefix of "innodb_". 
*/ #ifdef UNIV_PFS_IO -# define os_file_create(key, name, create, purpose, type, success) \ +# define os_file_create(key, name, create, purpose, type, success, atomic_writes) \ pfs_os_file_create_func(key, name, create, purpose, type, \ - success, __FILE__, __LINE__) + success, atomic_writes, __FILE__, __LINE__) # define os_file_create_simple(key, name, create, access, success) \ pfs_os_file_create_simple_func(key, name, create, access, \ success, __FILE__, __LINE__) # define os_file_create_simple_no_error_handling( \ - key, name, create_mode, access, success) \ + key, name, create_mode, access, success, atomic_writes) \ pfs_os_file_create_simple_no_error_handling_func( \ - key, name, create_mode, access, success, __FILE__, __LINE__) + key, name, create_mode, access, success, atomic_writes, __FILE__, __LINE__) # define os_file_close(file) \ pfs_os_file_close_func(file, __FILE__, __LINE__) # define os_aio(type, mode, name, file, buf, offset, \ - n, message1, message2) \ + n, message1, message2, write_size, \ + page_compression, page_compression_level) \ pfs_os_aio_func(type, mode, name, file, buf, offset, \ - n, message1, message2, __FILE__, __LINE__) + n, message1, message2, write_size, \ + page_compression, page_compression_level, __FILE__, __LINE__) -# define os_file_read(file, buf, offset, n) \ - pfs_os_file_read_func(file, buf, offset, n, __FILE__, __LINE__) +# define os_file_read(file, buf, offset, n, compressed) \ + pfs_os_file_read_func(file, buf, offset, n, compressed, __FILE__, __LINE__) -# define os_file_read_no_error_handling(file, buf, offset, n) \ +# define os_file_read_no_error_handling(file, buf, offset, n, compressed) \ pfs_os_file_read_no_error_handling_func(file, buf, offset, n, \ - __FILE__, __LINE__) + compressed, __FILE__, __LINE__) # define os_file_write(name, file, buf, offset, n) \ pfs_os_file_write_func(name, file, buf, offset, \ @@ -342,28 +344,28 @@ The wrapper functions have the prefix of "innodb_". 
*/ /* If UNIV_PFS_IO is not defined, these I/O APIs point to original un-instrumented file I/O APIs */ -# define os_file_create(key, name, create, purpose, type, success) \ - os_file_create_func(name, create, purpose, type, success) +# define os_file_create(key, name, create, purpose, type, success, atomic_writes) \ + os_file_create_func(name, create, purpose, type, success, atomic_writes) -# define os_file_create_simple(key, name, create_mode, access, success) \ +# define os_file_create_simple(key, name, create_mode, access, success) \ os_file_create_simple_func(name, create_mode, access, success) # define os_file_create_simple_no_error_handling( \ - key, name, create_mode, access, success) \ - os_file_create_simple_no_error_handling_func( \ - name, create_mode, access, success) + key, name, create_mode, access, success, atomic_writes) \ + os_file_create_simple_no_error_handling_func( \ + name, create_mode, access, success, atomic_writes) # define os_file_close(file) os_file_close_func(file) -# define os_aio(type, mode, name, file, buf, offset, n, message1, message2) \ +# define os_aio(type, mode, name, file, buf, offset, n, message1, message2, write_size, page_compression, page_compression_level) \ os_aio_func(type, mode, name, file, buf, offset, n, \ - message1, message2) + message1, message2, write_size, page_compression, page_compression_level) -# define os_file_read(file, buf, offset, n) \ - os_file_read_func(file, buf, offset, n) +# define os_file_read(file, buf, offset, n, compressed) \ + os_file_read_func(file, buf, offset, n, compressed) -# define os_file_read_no_error_handling(file, buf, offset, n) \ - os_file_read_no_error_handling_func(file, buf, offset, n) +# define os_file_read_no_error_handling(file, buf, offset, n, compressed) \ + os_file_read_no_error_handling_func(file, buf, offset, n, compressed) # define os_file_write(name, file, buf, offset, n) \ os_file_write_func(name, file, buf, offset, n) @@ -524,7 +526,9 @@ 
os_file_create_simple_no_error_handling_func( OS_FILE_READ_WRITE, or OS_FILE_READ_ALLOW_DELETE; the last option is used by a backup program reading the file */ - ibool* success)/*!< out: TRUE if succeed, FALSE if error */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + ulint atomic_writes)/*!< in: atomic writes table option + value */ __attribute__((nonnull, warn_unused_result)); /****************************************************************//** Tries to disable OS caching on an opened file descriptor. */ @@ -558,7 +562,9 @@ os_file_create_func( async i/o or unbuffered i/o: look in the function source code for the exact rules */ ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */ - ibool* success)/*!< out: TRUE if succeed, FALSE if error */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + ulint atomic_writes)/*!< in: atomic writes table option + value */ __attribute__((nonnull, warn_unused_result)); /***********************************************************************//** Deletes a file. The file has to be closed before calling this. 
@@ -648,6 +654,8 @@ pfs_os_file_create_simple_no_error_handling_func( OS_FILE_READ_ALLOW_DELETE; the last option is used by a backup program reading the file */ ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + ulint atomic_writes,/*!< in: atomic writes table option + value */ const char* src_file,/*!< in: file name where func invoked */ ulint src_line)/*!< in: line where the func invoked */ __attribute__((nonnull, warn_unused_result)); @@ -676,6 +684,8 @@ pfs_os_file_create_func( function source code for the exact rules */ ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */ ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + ulint atomic_writes,/*!< in: atomic writes table option + value*/ const char* src_file,/*!< in: file name where func invoked */ ulint src_line)/*!< in: line where the func invoked */ __attribute__((nonnull, warn_unused_result)); @@ -706,6 +716,8 @@ pfs_os_file_read_func( void* buf, /*!< in: buffer where to read */ os_offset_t offset, /*!< in: file offset where to read */ ulint n, /*!< in: number of bytes to read */ + ibool compressed, /*!< in: is this file space + compressed ? */ const char* src_file,/*!< in: file name where func invoked */ ulint src_line);/*!< in: line where the func invoked */ @@ -724,6 +736,8 @@ pfs_os_file_read_no_error_handling_func( void* buf, /*!< in: buffer where to read */ os_offset_t offset, /*!< in: file offset where to read */ ulint n, /*!< in: number of bytes to read */ + ibool compressed, /*!< in: is this file space + compressed ? */ const char* src_file,/*!< in: file name where func invoked */ ulint src_line);/*!< in: line where the func invoked */ @@ -754,6 +768,15 @@ pfs_os_aio_func( (can be used to identify a completed aio operation); ignored if mode is OS_AIO_SYNC */ + ulint* write_size,/*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. 
*/ + ibool page_compression, /*!< in: is page compression used + on this file space */ + ulint page_compression_level, /*!< page compression + level to be used */ const char* src_file,/*!< in: file name where func invoked */ ulint src_line);/*!< in: line where the func invoked */ /*******************************************************************//** @@ -910,7 +933,9 @@ os_file_read_func( os_file_t file, /*!< in: handle to a file */ void* buf, /*!< in: buffer where to read */ os_offset_t offset, /*!< in: file offset where to read */ - ulint n); /*!< in: number of bytes to read */ + ulint n, /*!< in: number of bytes to read */ + ibool compressed); /*!< in: is this file space + compressed ? */ /*******************************************************************//** Rewind file to its start, read at most size - 1 bytes from it to str, and NUL-terminate str. All errors are silently ignored. This function is @@ -935,7 +960,9 @@ os_file_read_no_error_handling_func( os_file_t file, /*!< in: handle to a file */ void* buf, /*!< in: buffer where to read */ os_offset_t offset, /*!< in: file offset where to read */ - ulint n); /*!< in: number of bytes to read */ + ulint n, /*!< in: number of bytes to read */ + ibool compressed); /*!< in: is this file space + compressed ? */ /*******************************************************************//** NOTE! Use the corresponding macro os_file_write(), not directly this @@ -952,6 +979,7 @@ os_file_write_func( const void* buf, /*!< in: buffer from which to write */ os_offset_t offset, /*!< in: file offset where to write */ ulint n); /*!< in: number of bytes to write */ + /*******************************************************************//** Check the existence and type of the given file. 
@return TRUE if call succeeded */ @@ -1114,10 +1142,20 @@ os_aio_func( (can be used to identify a completed aio operation); ignored if mode is OS_AIO_SYNC */ - void* message2);/*!< in: message for the aio handler + void* message2,/*!< in: message for the aio handler (can be used to identify a completed aio operation); ignored if mode is OS_AIO_SYNC */ + ulint* write_size,/*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ + ibool page_compression, /*!< in: is page compression used + on this file space */ + ulint page_compression_level); /*!< page compression + level to be used */ + /************************************************************************//** Wakes up all async i/o threads so that they know to exit themselves in shutdown. */ @@ -1291,8 +1329,20 @@ os_file_handle_error_no_exit( /*=========================*/ const char* name, /*!< in: name of a file or NULL */ const char* operation, /*!< in: operation */ - ibool on_error_silent);/*!< in: if TRUE then don't print + ibool on_error_silent,/*!< in: if TRUE then don't print any message to the log. */ + const char* file, /*!< in: file name */ + const ulint line); /*!< in: line */ + +/***********************************************************************//** +Try to get number of bytes per sector from file system. 
+@return file block size */ +UNIV_INTERN +ulint +os_file_get_block_size( +/*===================*/ + os_file_t file, /*!< in: handle to a file */ + const char* name); /*!< in: file name */ #ifndef UNIV_NONINL #include "os0file.ic" diff --git a/storage/innobase/include/os0file.ic b/storage/innobase/include/os0file.ic index defd8204ba3..8e1cea585e6 100644 --- a/storage/innobase/include/os0file.ic +++ b/storage/innobase/include/os0file.ic @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 2010, 2011, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -88,6 +89,8 @@ pfs_os_file_create_simple_no_error_handling_func( OS_FILE_READ_ALLOW_DELETE; the last option is used by a backup program reading the file */ ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + ulint atomic_writes,/*!< in: atomic writes table option + value */ const char* src_file,/*!< in: file name where func invoked */ ulint src_line)/*!< in: line where the func invoked */ { @@ -103,7 +106,7 @@ pfs_os_file_create_simple_no_error_handling_func( name, src_file, src_line); file = os_file_create_simple_no_error_handling_func( - name, create_mode, access_type, success); + name, create_mode, access_type, success, atomic_writes); register_pfs_file_open_end(locker, file); @@ -134,6 +137,8 @@ pfs_os_file_create_func( function source code for the exact rules */ ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */ ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + ulint atomic_writes, /*!< in: atomic writes table option + value */ const char* src_file,/*!< in: file name where func invoked */ ulint src_line)/*!< in: line where the func invoked */ { @@ -148,7 +153,7 @@ pfs_os_file_create_func( : PSI_FILE_OPEN), name, src_file, 
src_line); - file = os_file_create_func(name, create_mode, purpose, type, success); + file = os_file_create_func(name, create_mode, purpose, type, success, atomic_writes); register_pfs_file_open_end(locker, file); @@ -210,6 +215,15 @@ pfs_os_aio_func( (can be used to identify a completed aio operation); ignored if mode is OS_AIO_SYNC */ + ulint* write_size,/*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ + ibool page_compression, /*!< in: is page compression used + on this file space */ + ulint page_compression_level, /*!< page compression + level to be used */ const char* src_file,/*!< in: file name where func invoked */ ulint src_line)/*!< in: line where the func invoked */ { @@ -225,7 +239,8 @@ pfs_os_aio_func( src_file, src_line); result = os_aio_func(type, mode, name, file, buf, offset, - n, message1, message2); + n, message1, message2, write_size, + page_compression, page_compression_level); register_pfs_file_io_end(locker, n); @@ -246,6 +261,8 @@ pfs_os_file_read_func( void* buf, /*!< in: buffer where to read */ os_offset_t offset, /*!< in: file offset where to read */ ulint n, /*!< in: number of bytes to read */ + ibool compressed, /*!< in: is this file space + compressed ? 
*/ const char* src_file,/*!< in: file name where func invoked */ ulint src_line)/*!< in: line where the func invoked */ { @@ -256,7 +273,7 @@ pfs_os_file_read_func( register_pfs_file_io_begin(&state, locker, file, n, PSI_FILE_READ, src_file, src_line); - result = os_file_read_func(file, buf, offset, n); + result = os_file_read_func(file, buf, offset, n, compressed); register_pfs_file_io_end(locker, n); @@ -279,6 +296,8 @@ pfs_os_file_read_no_error_handling_func( void* buf, /*!< in: buffer where to read */ os_offset_t offset, /*!< in: file offset where to read */ ulint n, /*!< in: number of bytes to read */ + ibool compressed, /*!< in: is this file space + compressed ? */ const char* src_file,/*!< in: file name where func invoked */ ulint src_line)/*!< in: line where the func invoked */ { @@ -289,7 +308,7 @@ pfs_os_file_read_no_error_handling_func( register_pfs_file_io_begin(&state, locker, file, n, PSI_FILE_READ, src_file, src_line); - result = os_file_read_no_error_handling_func(file, buf, offset, n); + result = os_file_read_no_error_handling_func(file, buf, offset, n, compressed); register_pfs_file_io_end(locker, n); diff --git a/storage/innobase/include/rem0rec.h b/storage/innobase/include/rem0rec.h index 8e7d5ff2d48..238cb04e1f8 100644 --- a/storage/innobase/include/rem0rec.h +++ b/storage/innobase/include/rem0rec.h @@ -981,6 +981,15 @@ are given in one byte (resp. two byte) format. 
*/ two upmost bits in a two byte offset for special purposes */ #define REC_MAX_DATA_SIZE (16 * 1024) +#ifdef WITH_WSREP +int wsrep_rec_get_foreign_key( + byte *buf, /* out: extracted key */ + ulint *buf_len, /* in/out: length of buf */ + const rec_t* rec, /* in: physical record */ + dict_index_t* index_for, /* in: index for foreign table */ + dict_index_t* index_ref, /* in: index for referenced table */ + ibool new_protocol); /* in: protocol > 1 */ +#endif /* WITH_WSREP */ #ifndef UNIV_NONINL #include "rem0rec.ic" #endif diff --git a/storage/innobase/include/row0log.h b/storage/innobase/include/row0log.h index 62715fe8808..f105838eece 100644 --- a/storage/innobase/include/row0log.h +++ b/storage/innobase/include/row0log.h @@ -35,6 +35,10 @@ Created 2011-05-26 Marko Makela #include "trx0types.h" #include "que0types.h" +extern ulint onlineddl_rowlog_rows; +extern ulint onlineddl_rowlog_pct_used; +extern ulint onlineddl_pct_progress; + /******************************************************//** Allocate the row log for an index and flag the index for online creation. 
diff --git a/storage/innobase/include/row0merge.h b/storage/innobase/include/row0merge.h index 390c0ce038b..31a9ac6f45e 100644 --- a/storage/innobase/include/row0merge.h +++ b/storage/innobase/include/row0merge.h @@ -40,6 +40,18 @@ Created 13/06/2005 Jan Lindstrom #include "lock0types.h" #include "srv0srv.h" +/* Cluster index read task is mandatory */ +#define COST_READ_CLUSTERED_INDEX 1.0 + +/* Basic fixed cost to build all type of index */ +#define COST_BUILD_INDEX_STATIC 0.5 +/* Dynamic cost to build all type of index, dynamic cost will be re-distributed based on page count ratio of each index */ +#define COST_BUILD_INDEX_DYNAMIC 0.5 + +/* Sum of below two must be 1.0 */ +#define PCT_COST_MERGESORT_INDEX 0.4 +#define PCT_COST_INSERT_INDEX 0.6 + // Forward declaration struct ib_sequence_t; @@ -370,7 +382,10 @@ row_merge_sort( merge_file_t* file, /*!< in/out: file containing index entries */ row_merge_block_t* block, /*!< in/out: 3 buffers */ - int* tmpfd) /*!< in/out: temporary file handle */ + int* tmpfd, /*!< in/out: temporary file handle */ + const bool update_progress, /*!< in: update progress status variable or not */ + const float pct_progress, /*!< in: total progress percent until now */ + const float pct_cost) /*!< in: current progress percent */ __attribute__((nonnull)); /*********************************************************************//** Allocate a sort buffer. diff --git a/storage/innobase/include/row0mysql.h b/storage/innobase/include/row0mysql.h index 06c07002c2b..440001410f0 100644 --- a/storage/innobase/include/row0mysql.h +++ b/storage/innobase/include/row0mysql.h @@ -606,6 +606,12 @@ struct mysql_row_templ_t { Innobase record in the current index; not defined if template_type is ROW_MYSQL_WHOLE_ROW */ + ibool rec_field_is_prefix; /* is this field in a prefix index? 
*/ + ulint rec_prefix_field_no; /* record field, even if just a + prefix; same as rec_field_no when not a + prefix, otherwise rec_field_no is + ULINT_UNDEFINED but this is the true + field number*/ ulint clust_rec_field_no; /*!< field number of the column in an Innobase record in the clustered index; not defined if template_type is @@ -707,7 +713,9 @@ struct row_prebuilt_t { columns through a secondary index and at least one column is not in the secondary index, then this is - set to TRUE */ + set to TRUE; note that sometimes this + is set but we later optimize out the + clustered index lookup */ unsigned templ_contains_blob:1;/*!< TRUE if the template contains a column with DATA_BLOB == get_innobase_type_from_mysql_type(); diff --git a/storage/innobase/include/srv0mon.h b/storage/innobase/include/srv0mon.h index 2d90f47eefe..0a47d514e1b 100644 --- a/storage/innobase/include/srv0mon.h +++ b/storage/innobase/include/srv0mon.h @@ -2,6 +2,7 @@ Copyright (c) 2010, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the @@ -163,7 +164,11 @@ enum monitor_id_t { MONITOR_OVLD_BUF_POOL_PAGES_FREE, MONITOR_OVLD_PAGE_CREATED, MONITOR_OVLD_PAGES_WRITTEN, + MONITOR_OVLD_INDEX_PAGES_WRITTEN, + MONITOR_OVLD_NON_INDEX_PAGES_WRITTEN, MONITOR_OVLD_PAGES_READ, + MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS, + MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS_AVOIDED, MONITOR_OVLD_BYTE_READ, MONITOR_OVLD_BYTE_WRITTEN, MONITOR_FLUSH_BATCH_SCANNED, @@ -194,9 +199,12 @@ enum monitor_id_t { MONITOR_LRU_BATCH_SCANNED, MONITOR_LRU_BATCH_SCANNED_NUM_CALL, MONITOR_LRU_BATCH_SCANNED_PER_CALL, - MONITOR_LRU_BATCH_TOTAL_PAGE, - MONITOR_LRU_BATCH_COUNT, - MONITOR_LRU_BATCH_PAGES, + MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE, + MONITOR_LRU_BATCH_FLUSH_COUNT, + MONITOR_LRU_BATCH_FLUSH_PAGES, + MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE, + MONITOR_LRU_BATCH_EVICT_COUNT, + MONITOR_LRU_BATCH_EVICT_PAGES, MONITOR_LRU_SINGLE_FLUSH_SCANNED, MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL, MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL, @@ -304,6 +312,20 @@ enum monitor_id_t { MONITOR_PAGE_DECOMPRESS, MONITOR_PAD_INCREMENTS, MONITOR_PAD_DECREMENTS, + /* New monitor variables for page compression */ + MONITOR_OVLD_PAGE_COMPRESS_SAVED, + MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT512, + MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT1024, + MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT2048, + MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT4096, + MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT8192, + MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT16384, + MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT32768, + MONITOR_OVLD_PAGES_PAGE_COMPRESSED, + MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP, + MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP_SAVED, + MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED, + MONITOR_OVLD_PAGES_PAGE_COMPRESSION_ERROR, /* Index related counters */ MONITOR_MODULE_INDEX, diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index a3e6a17a6e2..52f2f22b372 100644 --- 
a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -103,6 +103,37 @@ struct srv_stats_t { a disk page */ ulint_ctr_1_t buf_pool_reads; + /** Number of bytes saved by page compression */ + ulint_ctr_64_t page_compression_saved; + /** Number of 512Byte TRIM by page compression */ + ulint_ctr_64_t page_compression_trim_sect512; + /** Number of 1K TRIM by page compression */ + ulint_ctr_64_t page_compression_trim_sect1024; + /** Number of 2K TRIM by page compression */ + ulint_ctr_64_t page_compression_trim_sect2048; + /** Number of 4K TRIM by page compression */ + ulint_ctr_64_t page_compression_trim_sect4096; + /** Number of 8K TRIM by page compression */ + ulint_ctr_64_t page_compression_trim_sect8192; + /** Number of 16K TRIM by page compression */ + ulint_ctr_64_t page_compression_trim_sect16384; + /** Number of 32K TRIM by page compression */ + ulint_ctr_64_t page_compression_trim_sect32768; + /* Number of index pages written */ + ulint_ctr_64_t index_pages_written; + /* Number of non index pages written */ + ulint_ctr_64_t non_index_pages_written; + /* Number of pages compressed with page compression */ + ulint_ctr_64_t pages_page_compressed; + /* Number of TRIM operations induced by page compression */ + ulint_ctr_64_t page_compressed_trim_op; + /* Number of TRIM operations saved by using actual write size knowledge */ + ulint_ctr_64_t page_compressed_trim_op_saved; + /* Number of pages decompressed with page compression */ + ulint_ctr_64_t pages_page_decompressed; + /* Number of page compression errors */ + ulint_ctr_64_t pages_page_compression_error; + /** Number of data read in total (in bytes) */ ulint_ctr_1_t data_read; @@ -138,6 +169,12 @@ struct srv_stats_t { /** Number of system rows inserted */ ulint_ctr_64_t n_system_rows_inserted; + + /** Number of times secondary index lookup triggered cluster lookup */ + ulint_ctr_64_t n_sec_rec_cluster_reads; + + /** Number of times prefix optimization avoided triggering cluster 
lookup */ + ulint_ctr_64_t n_sec_rec_cluster_reads_avoided; }; extern const char* srv_main_thread_op_info; @@ -230,6 +267,31 @@ OS (provided we compiled Innobase with it in), otherwise we will use simulated aio we build below with threads. Currently we support native aio on windows and linux */ extern my_bool srv_use_native_aio; + +/* Use trim operation */ +extern my_bool srv_use_trim; + +/* Use posix fallocate */ +#ifdef HAVE_POSIX_FALLOCATE +extern my_bool srv_use_posix_fallocate; +#endif + +/* Use atomic writes i.e disable doublewrite buffer */ +extern my_bool srv_use_atomic_writes; + +/* Compression algorithm*/ +extern ulong innodb_compression_algorithm; + +/* Number of flush threads */ +#define MTFLUSH_MAX_WORKER 64 +#define MTFLUSH_DEFAULT_WORKER 8 + +/* Number of threads used for multi-threaded flush */ +extern long srv_mtflush_threads; + +/* If this flag is TRUE, then we will use multi threaded flush. */ +extern my_bool srv_use_mtflush; + #ifdef __WIN__ extern ibool srv_use_native_conditions; #endif /* __WIN__ */ @@ -260,6 +322,10 @@ extern ulong srv_auto_extend_increment; extern ibool srv_created_new_raw; +/* Optimize prefix index queries to skip cluster index lookup when possible */ +/* Enables or disables this prefix optimization. Disabled by default. */ +extern my_bool srv_prefix_index_cluster_optimization; + /** Maximum number of srv_n_log_files, or innodb_log_files_in_group */ #define SRV_N_LOG_FILES_MAX 100 extern ulong srv_n_log_files; @@ -270,6 +336,10 @@ extern ulong srv_flush_log_at_trx_commit; extern uint srv_flush_log_at_timeout; extern char srv_adaptive_flushing; +#ifdef WITH_INNODB_DISALLOW_WRITES +/* When this event is reset we do not allow any file writes to take place. */ +extern os_event_t srv_allow_writes_event; +#endif /* WITH_INNODB_DISALLOW_WRITES */ /* If this flag is TRUE, then we will load the indexes' (and tables') metadata even if they are marked as "corrupted". 
Mostly it is for DBA to process corrupted index and table */ @@ -301,6 +371,17 @@ extern my_bool srv_random_read_ahead; extern ulong srv_read_ahead_threshold; extern ulint srv_n_read_io_threads; extern ulint srv_n_write_io_threads; +/* Defragmentation, Origianlly facebook default value is 100, but it's too high */ +#define SRV_DEFRAGMENT_FREQUENCY_DEFAULT 40 +extern my_bool srv_defragment; +extern uint srv_defragment_n_pages; +extern uint srv_defragment_stats_accuracy; +extern uint srv_defragment_fill_factor_n_recs; +extern double srv_defragment_fill_factor; +extern uint srv_defragment_frequency; +extern ulonglong srv_defragment_interval; + +extern ulong srv_idle_flush_pct; /* Number of IO operations per second the server can do */ extern ulong srv_io_capacity; @@ -363,10 +444,7 @@ extern ibool srv_use_doublewrite_buf; extern ulong srv_doublewrite_batch_size; extern ulong srv_checksum_algorithm; -extern ibool srv_use_atomic_writes; -#ifdef HAVE_POSIX_FALLOCATE -extern ibool srv_use_posix_fallocate; -#endif +extern my_bool srv_force_primary_key; extern double srv_max_buf_pool_modified_pct; extern ulong srv_max_purge_lag; @@ -428,7 +506,6 @@ extern my_bool srv_ibuf_disable_background_merge; extern my_bool srv_purge_view_update_only_debug; #endif /* UNIV_DEBUG */ -extern ulint srv_fatal_semaphore_wait_threshold; #define SRV_SEMAPHORE_WAIT_EXTENSION 7200 extern ulint srv_dml_needed_delay; @@ -467,6 +544,11 @@ extern srv_stats_t srv_stats; /** Simulate compression failures. 
*/ extern uint srv_simulate_comp_failures; +/** Fatal semaphore wait threshold = maximum number of seconds +that semaphore times out in InnoDB */ +#define DEFAULT_SRV_FATAL_SEMAPHORE_TIMEOUT 600 +extern ulong srv_fatal_semaphore_wait_threshold; + # ifdef UNIV_PFS_THREAD /* Keys to register InnoDB threads with performance schema */ extern mysql_pfs_key_t buf_page_cleaner_thread_key; @@ -866,12 +948,62 @@ struct export_var_t{ ulint innodb_system_rows_deleted; /*!< srv_n_system_rows_deleted*/ ulint innodb_num_open_files; /*!< fil_n_file_opened */ ulint innodb_truncated_status_writes; /*!< srv_truncated_status_writes */ - ulint innodb_available_undo_logs; /*!< srv_available_undo_logs */ + ulint innodb_available_undo_logs; /*!< srv_available_undo_logs + */ + ulint innodb_defragment_compression_failures; /*!< Number of + defragment re-compression + failures */ + + ulint innodb_defragment_failures; /*!< Number of defragment + failures*/ + ulint innodb_defragment_count; /*!< Number of defragment + operations*/ + + ulint innodb_onlineddl_rowlog_rows; /*!< Online alter rows */ + ulint innodb_onlineddl_rowlog_pct_used; /*!< Online alter percentage + of used row log buffer */ + ulint innodb_onlineddl_pct_progress; /*!< Online alter progress */ + #ifdef UNIV_DEBUG ulint innodb_purge_trx_id_age; /*!< rw_max_trx_id - purged trx_id */ ulint innodb_purge_view_trx_id_age; /*!< rw_max_trx_id - purged view's min trx_id */ #endif /* UNIV_DEBUG */ + + ib_int64_t innodb_page_compression_saved;/*!< Number of bytes saved + by page compression */ + ib_int64_t innodb_page_compression_trim_sect512;/*!< Number of 512b TRIM + by page compression */ + ib_int64_t innodb_page_compression_trim_sect1024;/*!< Number of 1K TRIM + by page compression */ + ib_int64_t innodb_page_compression_trim_sect2048;/*!< Number of 2K TRIM + by page compression */ + ib_int64_t innodb_page_compression_trim_sect4096;/*!< Number of 4K byte TRIM + by page compression */ + ib_int64_t 
innodb_page_compression_trim_sect8192;/*!< Number of 8K TRIM + by page compression */ + ib_int64_t innodb_page_compression_trim_sect16384;/*!< Number of 16K TRIM + by page compression */ + ib_int64_t innodb_page_compression_trim_sect32768;/*!< Number of 32K TRIM + by page compression */ + ib_int64_t innodb_index_pages_written; /*!< Number of index pages + written */ + ib_int64_t innodb_non_index_pages_written; /*!< Number of non index pages + written */ + ib_int64_t innodb_pages_page_compressed;/*!< Number of pages + compressed by page compression */ + ib_int64_t innodb_page_compressed_trim_op;/*!< Number of TRIM operations + induced by page compression */ + ib_int64_t innodb_page_compressed_trim_op_saved;/*!< Number of TRIM operations + saved by page compression */ + ib_int64_t innodb_pages_page_decompressed;/*!< Number of pages + decompressed by page + compression */ + ib_int64_t innodb_pages_page_compression_error;/*!< Number of page + compression errors */ + + ulint innodb_sec_rec_cluster_reads; /*!< srv_sec_rec_cluster_reads */ + ulint innodb_sec_rec_cluster_reads_avoided; /*!< srv_sec_rec_cluster_reads_avoided */ }; /** Thread slot in the thread table. */ @@ -911,5 +1043,13 @@ struct srv_slot_t{ # define srv_start_raw_disk_in_use 0 # define srv_file_per_table 1 #endif /* !UNIV_HOTBACKUP */ +#ifdef WITH_WSREP +UNIV_INTERN +void +wsrep_srv_conc_cancel_wait( +/*==================*/ + trx_t* trx); /*!< in: transaction object associated with the + thread */ +#endif /* WITH_WSREP */ #endif diff --git a/storage/innobase/include/srv0start.h b/storage/innobase/include/srv0start.h index 40d502f4459..e1c19982ba5 100644 --- a/storage/innobase/include/srv0start.h +++ b/storage/innobase/include/srv0start.h @@ -37,7 +37,8 @@ Created 10/10/1995 Heikki Tuuri #endif /*********************************************************************//** -Normalizes a directory path for Windows: converts slashes to backslashes. 
*/ +Normalizes a directory path for Windows: converts slashes to backslashes. +*/ UNIV_INTERN void srv_normalize_path_for_win( diff --git a/storage/innobase/include/sync0sync.h b/storage/innobase/include/sync0sync.h index 7b00e16476b..f26e66f1a87 100644 --- a/storage/innobase/include/sync0sync.h +++ b/storage/innobase/include/sync0sync.h @@ -687,6 +687,7 @@ or row lock! */ #define SYNC_EXTERN_STORAGE 500 #define SYNC_FSP 400 #define SYNC_FSP_PAGE 395 +#define SYNC_STATS_DEFRAG 390 /*------------------------------------- Change buffer headers */ #define SYNC_IBUF_MUTEX 370 /* ibuf_mutex */ /*------------------------------------- Change buffer tree */ diff --git a/storage/innobase/include/sync0sync.ic b/storage/innobase/include/sync0sync.ic index 97ec63c0dd2..a5887b1fd6f 100644 --- a/storage/innobase/include/sync0sync.ic +++ b/storage/innobase/include/sync0sync.ic @@ -204,7 +204,10 @@ mutex_enter_func( ulint line) /*!< in: line where locked */ { ut_ad(mutex_validate(mutex)); +#ifndef WITH_WSREP + /* this cannot be be granted when BF trx kills a trx in lock wait state */ ut_ad(!mutex_own(mutex)); +#endif /* WITH_WSREP */ /* Note that we do not peek at the value of lock_word before trying the atomic test_and_set; we could peek, and possibly save time. 
*/ diff --git a/storage/innobase/include/trx0sys.h b/storage/innobase/include/trx0sys.h index 70f214d1ac7..9ffc8d99a7f 100644 --- a/storage/innobase/include/trx0sys.h +++ b/storage/innobase/include/trx0sys.h @@ -42,6 +42,9 @@ Created 3/26/1996 Heikki Tuuri #include "read0types.h" #include "page0types.h" #include "ut0bh.h" +#ifdef WITH_WSREP +#include "trx0xa.h" +#endif /* WITH_WSREP */ typedef UT_LIST_BASE_NODE_T(trx_t) trx_list_t; @@ -293,6 +296,9 @@ trx_sys_update_mysql_binlog_offset( ib_int64_t offset, /*!< in: position in that log file */ ulint field, /*!< in: offset of the MySQL log info field in the trx sys header */ +#ifdef WITH_WSREP + trx_sysf_t* sys_header, /*!< in: trx sys header */ +#endif /* WITH_WSREP */ mtr_t* mtr); /*!< in: mtr */ /*****************************************************************//** Prints to stderr the MySQL binlog offset info in the trx system header if @@ -301,6 +307,19 @@ UNIV_INTERN void trx_sys_print_mysql_binlog_offset(void); /*===================================*/ +#ifdef WITH_WSREP +/** Update WSREP checkpoint XID in sys header. */ +void +trx_sys_update_wsrep_checkpoint( + const XID* xid, /*!< in: WSREP XID */ + trx_sysf_t* sys_header, /*!< in: sys_header */ + mtr_t* mtr); /*!< in: mtr */ + +void +/** Read WSREP checkpoint XID from sys header. */ +trx_sys_read_wsrep_checkpoint( + XID* xid); /*!< out: WSREP XID */ +#endif /* WITH_WSREP */ /*****************************************************************//** Prints to stderr the MySQL master log offset info in the trx system header if the magic number shows it valid. 
*/ @@ -529,6 +548,20 @@ this contains the same fields as TRX_SYS_MYSQL_LOG_INFO below */ within that file */ #define TRX_SYS_MYSQL_LOG_NAME 12 /*!< MySQL log file name */ +#ifdef WITH_WSREP +/* The offset to WSREP XID headers */ +#define TRX_SYS_WSREP_XID_INFO (UNIV_PAGE_SIZE - 3500) +#define TRX_SYS_WSREP_XID_MAGIC_N_FLD 0 +#define TRX_SYS_WSREP_XID_MAGIC_N 0x77737265 + +/* XID field: formatID, gtrid_len, bqual_len, xid_data */ +#define TRX_SYS_WSREP_XID_LEN (4 + 4 + 4 + XIDDATASIZE) +#define TRX_SYS_WSREP_XID_FORMAT 4 +#define TRX_SYS_WSREP_XID_GTRID_LEN 8 +#define TRX_SYS_WSREP_XID_BQUAL_LEN 12 +#define TRX_SYS_WSREP_XID_DATA 16 +#endif /* WITH_WSREP*/ + /** Doublewrite buffer */ /* @{ */ /** The offset of the doublewrite buffer header on the trx system header page */ diff --git a/storage/innobase/include/trx0sys.ic b/storage/innobase/include/trx0sys.ic index e097e29b551..7265a97ae25 100644 --- a/storage/innobase/include/trx0sys.ic +++ b/storage/innobase/include/trx0sys.ic @@ -445,7 +445,10 @@ trx_id_t trx_sys_get_new_trx_id(void) /*========================*/ { +#ifndef WITH_WSREP + /* wsrep_fake_trx_id violates this assert */ ut_ad(mutex_own(&trx_sys->mutex)); +#endif /* WITH_WSREP */ /* VERY important: after the database is started, max_trx_id value is divisible by TRX_SYS_TRX_ID_WRITE_MARGIN, and the following if diff --git a/storage/innobase/include/trx0trx.h b/storage/innobase/include/trx0trx.h index fcc9ed05081..7c92445b796 100644 --- a/storage/innobase/include/trx0trx.h +++ b/storage/innobase/include/trx0trx.h @@ -1009,6 +1009,9 @@ struct trx_t{ /*------------------------------*/ char detailed_error[256]; /*!< detailed error message for last error, or empty. 
*/ +#ifdef WITH_WSREP + os_event_t wsrep_event; /* event waited for in srv_conc_slot */ +#endif /* WITH_WSREP */ }; /* Transaction isolation levels (trx->isolation_level) */ diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i index eeeaca166a8..a4c401134f9 100644 --- a/storage/innobase/include/univ.i +++ b/storage/innobase/include/univ.i @@ -2,6 +2,7 @@ Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2008, Google Inc. +Copyright (c) 2013, 2014 SkySQL Ab. Portions of this file contain modifications contributed and copyrighted by Google, Inc. Those modifications are gratefully acknowledged and are described @@ -331,6 +332,30 @@ typedef enum innodb_file_formats_enum innodb_file_formats_t; /** The 2-logarithm of UNIV_PAGE_SIZE: */ #define UNIV_PAGE_SIZE_SHIFT srv_page_size_shift +#ifdef HAVE_LZO +#define IF_LZO(A,B) A +#else +#define IF_LZO(A,B) B +#endif + +#ifdef HAVE_LZ4 +#define IF_LZ4(A,B) A +#else +#define IF_LZ4(A,B) B +#endif + +#ifdef HAVE_LZMA +#define IF_LZMA(A,B) A +#else +#define IF_LZMA(A,B) B +#endif + +#ifdef HAVE_BZIP2 +#define IF_BZIP2(A,B) A +#else +#define IF_BZIP2(A,B) B +#endif + /** The universal page size of the database */ #define UNIV_PAGE_SIZE ((ulint) srv_page_size) @@ -344,13 +369,15 @@ and 2 bits for flags. This limits the uncompressed page size to 16k. Even though a 16k uncompressed page can theoretically be compressed into a larger compressed page, it is not a useful feature so we will limit both with this same constant. */ -#define UNIV_ZIP_SIZE_SHIFT_MAX 14 +#define UNIV_ZIP_SIZE_SHIFT_MAX 15 /* Define the Min, Max, Default page sizes. */ /** Minimum Page Size Shift (power of 2) */ #define UNIV_PAGE_SIZE_SHIFT_MIN 12 +/** log2 of largest page size (1<<16 == 64436 bytes). */ /** Maximum Page Size Shift (power of 2) */ -#define UNIV_PAGE_SIZE_SHIFT_MAX 14 +#define UNIV_PAGE_SIZE_SHIFT_MAX 16 +/** log2 of default page size (1<<14 == 16384 bytes). 
*/ /** Default Page Size Shift (power of 2) */ #define UNIV_PAGE_SIZE_SHIFT_DEF 14 /** Original 16k InnoDB Page Size Shift, in case the default changes */ diff --git a/storage/innobase/include/ut0list.h b/storage/innobase/include/ut0list.h index 29fc8669ce4..796a272db59 100644 --- a/storage/innobase/include/ut0list.h +++ b/storage/innobase/include/ut0list.h @@ -150,6 +150,15 @@ ib_list_is_empty( /* out: TRUE if empty else */ const ib_list_t* list); /* in: list */ +/******************************************************************** +Get number of items on list. +@return number of items on list */ +UNIV_INLINE +ulint +ib_list_len( +/*========*/ + const ib_list_t* list); /*<! in: list */ + /* List. */ struct ib_list_t { ib_list_node_t* first; /*!< first node */ diff --git a/storage/innobase/include/ut0list.ic b/storage/innobase/include/ut0list.ic index d9dcb2eac99..7a7f53adb2f 100644 --- a/storage/innobase/include/ut0list.ic +++ b/storage/innobase/include/ut0list.ic @@ -58,3 +58,23 @@ ib_list_is_empty( { return(!(list->first || list->last)); } + +/******************************************************************** +Get number of items on list. +@return number of items on list */ +UNIV_INLINE +ulint +ib_list_len( +/*========*/ + const ib_list_t* list) /*<! in: list */ +{ + ulint len = 0; + ib_list_node_t* node = list->first; + + while(node) { + len++; + node = node->next; + } + + return (len); +} diff --git a/storage/innobase/include/ut0timer.h b/storage/innobase/include/ut0timer.h new file mode 100644 index 00000000000..f361ae79bf5 --- /dev/null +++ b/storage/innobase/include/ut0timer.h @@ -0,0 +1,104 @@ +/***************************************************************************** + +Copyright (c) 2013, 2014, Facebook, Inc. All Rights Reserved. +Copyright (c) 2014, SkySQL Ab. All Rights Reserved. 
+ +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/ut0timer.h +Timer rountines + +Created 30/07/2014 Jan Lindström jan.lindstrom@skysql.com +modified from https://github.com/facebook/mysql-5.6/commit/c75a413edeb96eb99bf11d7269bdfea06f96d6b6 +*************************************************************************/ +#ifndef ut0timer_h +#define ut0timer_h + +#include "univ.i" +#include "data0type.h" +#include <my_rdtsc.h> + +/* Current timer stats */ +extern struct my_timer_unit_info ut_timer; + +/**************************************************************//** +Function pointer to point selected timer function. +@return timer current value */ +extern ulonglong (*ut_timer_now)(void); + +/**************************************************************//** +Sets up the data required for use of my_timer_* functions. +Selects the best timer by high frequency, and tight resolution. +Points my_timer_now() to the selected timer function. +Initializes my_timer struct to contain the info for selected timer.*/ +UNIV_INTERN +void ut_init_timer(void); + +/**************************************************************//** +Return time passed since time then, automatically adjusted +for the estimated timer overhead. 
+@return time passed since "then" */ +UNIV_INLINE +ulonglong +ut_timer_since( +/*===========*/ + ulonglong then); /*!< in: time where to calculate */ +/**************************************************************//** +Get time passed since "then", and update then to now +@return time passed sinche "then" */ +UNIV_INLINE +ulonglong +ut_timer_since_and_update( +/*======================*/ + ulonglong *then); /*!< in: time where to calculate */ +/**************************************************************//** +Convert native timer units in a ulonglong into seconds in a double +@return time in a seconds */ +UNIV_INLINE +double +ut_timer_to_seconds( +/*=================*/ + ulonglong when); /*!< in: time where to calculate */ +/**************************************************************//** +Convert native timer units in a ulonglong into milliseconds in a double +@return time in milliseconds */ +UNIV_INLINE +double +ut_timer_to_milliseconds( +/*=====================*/ + ulonglong when); /*!< in: time where to calculate */ +/**************************************************************//** +Convert native timer units in a ulonglong into microseconds in a double +@return time in microseconds */ +UNIV_INLINE +double +ut_timer_to_microseconds( +/*=====================*/ + ulonglong when); /*!< in: time where to calculate */ +/**************************************************************//** +Convert microseconds in a double to native timer units in a ulonglong +@return time in microseconds */ +UNIV_INLINE +ulonglong +ut_microseconds_to_timer( +/*=====================*/ + ulonglong when); /*!< in: time where to calculate */ + +#ifndef UNIV_NONINL +#include "ut0timer.ic" +#endif + +#endif diff --git a/storage/innobase/include/ut0timer.ic b/storage/innobase/include/ut0timer.ic new file mode 100644 index 00000000000..027e89c6279 --- /dev/null +++ b/storage/innobase/include/ut0timer.ic @@ -0,0 +1,113 @@ 
+/***************************************************************************** + +Copyright (c) 2013, 2014, Facebook, Inc. All Rights Reserved. +Copyright (c) 2014, SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/ut0timer.ic +Timer rountines + +Created 30/07/2014 Jan Lindström jan.lindstrom@skysql.com +modified from https://github.com/facebook/mysql-5.6/commit/c75a413edeb96eb99bf11d7269bdfea06f96d6b6 +*************************************************************************/ + +/**************************************************************//** +Return time passed since time then, automatically adjusted +for the estimated timer overhead. 
+@return time passed since "then" */ +UNIV_INLINE +ulonglong +ut_timer_since( +/*===========*/ + ulonglong then) /*!< in: time where to calculate */ +{ + return (ut_timer_now() - then) - ut_timer.overhead; +} + +/**************************************************************//** +Get time passed since "then", and update then to now +@return time passed sinche "then" */ +UNIV_INLINE +ulonglong +ut_timer_since_and_update( +/*======================*/ + ulonglong *then) /*!< in: time where to calculate */ +{ + ulonglong now = ut_timer_now(); + ulonglong ret = (now - (*then)) - ut_timer.overhead; + *then = now; + return ret; +} + +/**************************************************************//** +Convert native timer units in a ulonglong into seconds in a double +@return time in a seconds */ +UNIV_INLINE +double +ut_timer_to_seconds( +/*=================*/ + ulonglong when) /*!< in: time where to calculate */ +{ + double ret = (double)(when); + ret /= (double)(ut_timer.frequency); + return ret; +} + +/**************************************************************//** +Convert native timer units in a ulonglong into milliseconds in a double +@return time in milliseconds */ +UNIV_INLINE +double +ut_timer_to_milliseconds( +/*=====================*/ + ulonglong when) /*!< in: time where to calculate */ +{ + double ret = (double)(when); + ret *= 1000.0; + ret /= (double)(ut_timer.frequency); + return ret; +} + +/**************************************************************//** +Convert native timer units in a ulonglong into microseconds in a double +@return time in microseconds */ +UNIV_INLINE +double +ut_timer_to_microseconds( +/*=====================*/ + ulonglong when) /*!< in: time where to calculate */ +{ + double ret = (double)(when); + ret *= 1000000.0; + ret /= (double)(ut_timer.frequency); + return ret; +} + +/**************************************************************//** +Convert microseconds in a double to native timer units in a ulonglong +@return time in 
microseconds */ +UNIV_INLINE +ulonglong +ut_microseconds_to_timer( +/*=====================*/ + ulonglong when) /*!< in: time where to calculate */ +{ + double ret = when; + ret *= (double)(ut_timer.frequency); + ret /= 1000000.0; + return (ulonglong)ret; +} diff --git a/storage/innobase/include/ut0wqueue.h b/storage/innobase/include/ut0wqueue.h index 33385ddf2d4..9906e299808 100644 --- a/storage/innobase/include/ut0wqueue.h +++ b/storage/innobase/include/ut0wqueue.h @@ -95,6 +95,23 @@ ib_wqueue_timedwait( ib_wqueue_t* wq, /* in: work queue */ ib_time_t wait_in_usecs); /* in: wait time in micro seconds */ +/******************************************************************** +Return first item on work queue or NULL if queue is empty +@return work item or NULL */ +void* +ib_wqueue_nowait( +/*=============*/ + ib_wqueue_t* wq); /*<! in: work queue */ + +/******************************************************************** +Get number of items on queue. +@return number of items on queue */ +ulint +ib_wqueue_len( +/*==========*/ + ib_wqueue_t* wq); /*<! in: work queue */ + + /* Work queue. 
*/ struct ib_wqueue_t { ib_mutex_t mutex; /*!< mutex protecting everything */ diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index eac18a02d16..42719fcc3cd 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -51,6 +51,8 @@ Created 5/7/1996 Heikki Tuuri #include <set> #include "mysql/plugin.h" +#include <mysql/service_wsrep.h> + /* Restricts the length of search we will do in the waits-for graph of transactions */ #define LOCK_MAX_N_STEPS_IN_DEADLOCK_CHECK 1000000 @@ -959,6 +961,9 @@ UNIV_INLINE ibool lock_rec_has_to_wait( /*=================*/ +#ifdef WITH_WSREP + ibool for_locking, /*!< is caller locking or releasing */ +#endif /* WITH_WSREP */ const trx_t* trx, /*!< in: trx of new lock */ ulint type_mode,/*!< in: precise mode of the new lock to set: LOCK_S or LOCK_X, possibly @@ -1055,6 +1060,50 @@ lock_rec_has_to_wait( return (FALSE); } +#ifdef WITH_WSREP + /* if BF thread is locking and has conflict with another BF + thread, we need to look at trx ordering and lock types */ + if (for_locking && + wsrep_thd_is_BF(trx->mysql_thd, FALSE) && + wsrep_thd_is_BF(lock2->trx->mysql_thd, TRUE)) { + + if (wsrep_debug) { + fprintf(stderr, "\n BF-BF lock conflict \n"); + lock_rec_print(stderr, lock2); + } + + if (wsrep_trx_order_before(trx->mysql_thd, + lock2->trx->mysql_thd) && + (type_mode & LOCK_MODE_MASK) == LOCK_X && + (lock2->type_mode & LOCK_MODE_MASK) == LOCK_X) + { + /* exclusive lock conflicts are not accepted */ + fprintf(stderr, "BF-BF X lock conflict," + "type_mode: %lu supremum: %lu\n", + type_mode, lock_is_on_supremum); + fprintf(stderr, "conflicts states: my %d locked %d\n", + wsrep_thd_conflict_state(trx->mysql_thd, FALSE), + wsrep_thd_conflict_state(lock2->trx->mysql_thd, FALSE) ); + lock_rec_print(stderr, lock2); + return FALSE; + //abort(); + } else { + /* if lock2->index->n_uniq <= + lock2->index->n_user_defined_cols + operation is on uniq index + */ + if (wsrep_debug) 
fprintf(stderr, + "BF conflict, modes: %lu %lu, " + "idx: %s-%s n_uniq %u n_user %u\n", + type_mode, lock2->type_mode, + lock2->index->name, + lock2->index->table_name, + lock2->index->n_uniq, + lock2->index->n_user_defined_cols); + return FALSE; + } + } +#endif /* WITH_WSREP */ return(TRUE); } @@ -1085,7 +1134,11 @@ lock_has_to_wait( /* If this lock request is for a supremum record then the second bit on the lock bitmap is set */ +#ifdef WITH_WSREP + return(lock_rec_has_to_wait(FALSE, lock1->trx, +#else return(lock_rec_has_to_wait(lock1->trx, +#endif /* WITH_WSREP */ lock1->type_mode, lock2, lock_rec_get_nth_bit( lock1, 1))); @@ -1554,6 +1607,11 @@ lock_rec_has_expl( return(NULL); } +#ifdef WITH_WSREP +static +void +lock_rec_discard(lock_t* in_lock); +#endif #ifdef UNIV_DEBUG /*********************************************************************//** Checks if some other transaction has a lock request in the queue. @@ -1602,6 +1660,69 @@ lock_rec_other_has_expl_req( } #endif /* UNIV_DEBUG */ +#ifdef WITH_WSREP +static +void +wsrep_kill_victim( + const trx_t * const trx, + const lock_t *lock) +{ + ut_ad(lock_mutex_own()); + ut_ad(trx_mutex_own(lock->trx)); + my_bool bf_this = wsrep_thd_is_BF(trx->mysql_thd, FALSE); + my_bool bf_other = wsrep_thd_is_BF(lock->trx->mysql_thd, TRUE); + + if ((bf_this && !bf_other) || + (bf_this && bf_other && wsrep_trx_order_before( + trx->mysql_thd, lock->trx->mysql_thd))) { + + if (lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) { + if (wsrep_debug) { + fprintf(stderr, "WSREP: BF victim waiting\n"); + } + /* cannot release lock, until our lock + is in the queue*/ + } else if (lock->trx != trx) { + if (wsrep_log_conflicts) { + mutex_enter(&trx_sys->mutex); + if (bf_this) { + fputs("\n*** Priority TRANSACTION:\n", + stderr); + } else { + fputs("\n*** Victim TRANSACTION:\n", + stderr); + } + + trx_print_latched(stderr, trx, 3000); + + if (bf_other) { + fputs("\n*** Priority TRANSACTION:\n", + stderr); + } else { + fputs("\n*** Victim 
TRANSACTION:\n", + stderr); + } + + trx_print_latched(stderr, lock->trx, 3000); + + mutex_exit(&trx_sys->mutex); + + fputs("*** WAITING FOR THIS LOCK TO BE GRANTED:\n", + stderr); + + if (lock_get_type(lock) == LOCK_REC) { + lock_rec_print(stderr, lock); + } else { + lock_table_print(stderr, lock); + } + } + + wsrep_innobase_kill_one_trx(trx->mysql_thd, + (const trx_t*) trx, lock->trx, TRUE); + } + } +} +#endif /*********************************************************************//** Checks if some other transaction has a conflicting explicit lock request in the queue, so that we have to wait. @@ -1630,7 +1751,15 @@ lock_rec_other_has_conflicting( lock != NULL; lock = lock_rec_get_next_const(heap_no, lock)) { - if (lock_rec_has_to_wait(trx, mode, lock, is_supremum)) { +#ifdef WITH_WSREP + if (lock_rec_has_to_wait(TRUE, trx, mode, lock, is_supremum)) { + trx_mutex_enter(lock->trx); + wsrep_kill_victim(trx, lock); + trx_mutex_exit(lock->trx); +#else + if (lock_rec_has_to_wait(trx, mode, lock, is_supremum)) { +#endif /* WITH_WSREP */ + return(lock); } } @@ -1811,6 +1940,28 @@ lock_number_of_rows_locked( /*============== RECORD LOCK CREATION AND QUEUE MANAGEMENT =============*/ +#ifdef WITH_WSREP +static +void +wsrep_print_wait_locks( +/*============*/ + lock_t* c_lock) /* conflicting lock to print */ +{ + if (wsrep_debug && c_lock->trx->lock.wait_lock != c_lock) { + fprintf(stderr, "WSREP: c_lock != wait lock\n"); + if (lock_get_type_low(c_lock) & LOCK_TABLE) + lock_table_print(stderr, c_lock); + else + lock_rec_print(stderr, c_lock); + + if (lock_get_type_low(c_lock->trx->lock.wait_lock) & LOCK_TABLE) + lock_table_print(stderr, c_lock->trx->lock.wait_lock); + else + lock_rec_print(stderr, c_lock->trx->lock.wait_lock); + } +} +#endif /* WITH_WSREP */ + /*********************************************************************//** Creates a new record lock and inserts it to the lock queue. Does NOT check for deadlocks or lock compatibility! 
@@ -1819,6 +1970,10 @@ static lock_t* lock_rec_create( /*============*/ +#ifdef WITH_WSREP + lock_t* const c_lock, /* conflicting lock */ + que_thr_t* thr, +#endif ulint type_mode,/*!< in: lock mode and wait flag, type is ignored and replaced by LOCK_REC */ @@ -1890,8 +2045,88 @@ lock_rec_create( ut_ad(index->table->n_ref_count > 0 || !index->table->can_be_evicted); +#ifdef WITH_WSREP + if (c_lock && wsrep_thd_is_BF(trx->mysql_thd, FALSE)) { + lock_t *hash = (lock_t *)c_lock->hash; + lock_t *prev = NULL; + + while (hash && + wsrep_thd_is_BF(((lock_t *)hash)->trx->mysql_thd, TRUE) && + wsrep_trx_order_before( + ((lock_t *)hash)->trx->mysql_thd, + trx->mysql_thd)) { + prev = hash; + hash = (lock_t *)hash->hash; + } + lock->hash = hash; + if (prev) { + prev->hash = lock; + } else { + c_lock->hash = lock; + } + /* + * delayed conflict resolution '...kill_one_trx' was not called, + * if victim was waiting for some other lock + */ + trx_mutex_enter(c_lock->trx); + if (c_lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) { + + c_lock->trx->lock.was_chosen_as_deadlock_victim = TRUE; + + if (wsrep_debug) { + wsrep_print_wait_locks(c_lock); + } + + trx->lock.que_state = TRX_QUE_LOCK_WAIT; + lock_set_lock_and_trx_wait(lock, trx); + UT_LIST_ADD_LAST(trx_locks, trx->lock.trx_locks, lock); + + ut_ad(thr != NULL); + trx->lock.wait_thr = thr; + thr->state = QUE_THR_LOCK_WAIT; + + /* have to release trx mutex for the duration of + victim lock release. 
This will eventually call + lock_grant, which wants to grant trx mutex again + */ + if (caller_owns_trx_mutex) { + trx_mutex_exit(trx); + } + lock_cancel_waiting_and_release( + c_lock->trx->lock.wait_lock); + + if (caller_owns_trx_mutex) { + trx_mutex_enter(trx); + } + + /* trx might not wait for c_lock, but some other lock + does not matter if wait_lock was released above + */ + if (c_lock->trx->lock.wait_lock == c_lock) { + lock_reset_lock_and_trx_wait(lock); + } + + trx_mutex_exit(c_lock->trx); + + if (wsrep_debug) { + fprintf( + stderr, + "WSREP: c_lock canceled %llu\n", + (ulonglong) c_lock->trx->id); + } + + /* have to bail out here to avoid lock_set_lock... */ + return(lock); + } + trx_mutex_exit(c_lock->trx); + } else { + HASH_INSERT(lock_t, hash, lock_sys->rec_hash, + lock_rec_fold(space, page_no), lock); + } +#else HASH_INSERT(lock_t, hash, lock_sys->rec_hash, lock_rec_fold(space, page_no), lock); +#endif /* WITH_WSREP */ if (!caller_owns_trx_mutex) { trx_mutex_enter(trx); @@ -1899,7 +2134,6 @@ lock_rec_create( ut_ad(trx_mutex_own(trx)); if (type_mode & LOCK_WAIT) { - lock_set_lock_and_trx_wait(lock, trx); } @@ -1911,7 +2145,6 @@ lock_rec_create( MONITOR_INC(MONITOR_RECLOCK_CREATED); MONITOR_INC(MONITOR_NUM_RECLOCK); - return(lock); } @@ -1926,6 +2159,9 @@ static dberr_t lock_rec_enqueue_waiting( /*=====================*/ +#ifdef WITH_WSREP + lock_t* c_lock, /* conflicting lock */ +#endif ulint type_mode,/*!< in: lock mode this transaction is requesting: LOCK_S or LOCK_X, possibly @@ -1983,6 +2219,9 @@ lock_rec_enqueue_waiting( /* Enqueue the lock request that will wait to be granted, note that we already own the trx mutex. */ lock = lock_rec_create( +#ifdef WITH_WSREP + c_lock, thr, +#endif /* WITH_WSREP */ type_mode | LOCK_WAIT, block, heap_no, index, trx, TRUE); /* Release the mutex to obey the latching order. 
@@ -2083,7 +2322,19 @@ lock_rec_add_to_queue( const lock_t* other_lock = lock_rec_other_has_expl_req(mode, 0, LOCK_WAIT, block, heap_no, trx); +#ifdef WITH_WSREP + /* this can potentionally assert with wsrep */ + if (wsrep_thd_is_wsrep(trx->mysql_thd)) { + if (wsrep_debug && other_lock) { + fprintf(stderr, + "WSREP: InnoDB assert ignored\n"); + } + } else { + ut_a(!other_lock); + } +#else ut_a(!other_lock); +#endif /* WITH_WSREP */ } #endif /* UNIV_DEBUG */ @@ -2111,7 +2362,16 @@ lock_rec_add_to_queue( if (lock_get_wait(lock) && lock_rec_get_nth_bit(lock, heap_no)) { - +#ifdef WITH_WSREP + if (wsrep_thd_is_BF(trx->mysql_thd, FALSE)) { + if (wsrep_debug) { + fprintf(stderr, + "BF skipping wait: %lu\n", + trx->id); + lock_rec_print(stderr, lock); + } + } else +#endif goto somebody_waits; } } @@ -2134,9 +2394,15 @@ lock_rec_add_to_queue( } somebody_waits: - return(lock_rec_create( +#ifdef WITH_WSREP + return(lock_rec_create(NULL, NULL, type_mode, block, heap_no, index, trx, caller_owns_trx_mutex)); +#else + return(lock_rec_create( + type_mode, block, heap_no, index, trx, + caller_owns_trx_mutex)); +#endif /* WITH_WSREP */ } /** Record locking request status */ @@ -2199,9 +2465,13 @@ lock_rec_lock_fast( if (lock == NULL) { if (!impl) { /* Note that we don't own the trx mutex. 
*/ +#ifdef WITH_WSREP + lock = lock_rec_create(NULL, thr, + mode, block, heap_no, index, trx, FALSE); +#else lock = lock_rec_create( mode, block, heap_no, index, trx, FALSE); - +#endif /* WITH_WSREP */ } status = LOCK_REC_SUCCESS_CREATED; } else { @@ -2254,6 +2524,9 @@ lock_rec_lock_slow( que_thr_t* thr) /*!< in: query thread */ { trx_t* trx; +#ifdef WITH_WSREP + lock_t* c_lock(NULL); +#endif dberr_t err = DB_SUCCESS; ut_ad(lock_mutex_own()); @@ -2277,18 +2550,31 @@ lock_rec_lock_slow( /* The trx already has a strong enough lock on rec: do nothing */ - +#ifdef WITH_WSREP + } else if ((c_lock = (ib_lock_t*)lock_rec_other_has_conflicting( + static_cast<enum lock_mode>(mode), + block, heap_no, trx))) { +#else } else if (lock_rec_other_has_conflicting( static_cast<enum lock_mode>(mode), block, heap_no, trx)) { +#endif /* WITH_WSREP */ /* If another transaction has a non-gap conflicting request in the queue, as this transaction does not have a lock strong enough already granted on the record, we have to wait. */ +#ifdef WITH_WSREP + /* c_lock is NULL here if jump to enqueue_waiting happened + but it's ok because lock is not NULL in that case and c_lock + is not used. */ + err = lock_rec_enqueue_waiting(c_lock, + mode, block, heap_no, index, thr); +#else err = lock_rec_enqueue_waiting( mode, block, heap_no, index, thr); +#endif /* WITH_WSREP */ } else if (!impl) { /* Set the requested lock on the record, note that @@ -2394,7 +2680,13 @@ lock_rec_has_to_wait_in_queue( if (heap_no < lock_rec_get_n_bits(lock) && (p[bit_offset] & bit_mask) && lock_has_to_wait(wait_lock, lock)) { - +#ifdef WITH_WSREP + if (wsrep_thd_is_BF(wait_lock->trx->mysql_thd, FALSE) && + wsrep_thd_is_BF(lock->trx->mysql_thd, TRUE)) { + /* don't wait for another BF lock */ + continue; + } +#endif return(lock); } } @@ -3308,6 +3600,47 @@ lock_update_merge_left( } /*************************************************************//** +Updates the lock table when a page is split and merged to +two pages. 
*/ +UNIV_INTERN +void +lock_update_split_and_merge( + const buf_block_t* left_block, /*!< in: left page to which merged */ + const rec_t* orig_pred, /*!< in: original predecessor of + supremum on the left page before merge*/ + const buf_block_t* right_block) /*!< in: right page from which merged */ +{ + const rec_t* left_next_rec; + + ut_a(left_block && right_block); + ut_a(orig_pred); + + lock_mutex_enter(); + + left_next_rec = page_rec_get_next_const(orig_pred); + + /* Inherit the locks on the supremum of the left page to the + first record which was moved from the right page */ + lock_rec_inherit_to_gap( + left_block, left_block, + page_rec_get_heap_no(left_next_rec), + PAGE_HEAP_NO_SUPREMUM); + + /* Reset the locks on the supremum of the left page, + releasing waiting transactions */ + lock_rec_reset_and_release_wait(left_block, + PAGE_HEAP_NO_SUPREMUM); + + /* Inherit the locks to the supremum of the left page from the + successor of the infimum on the right page */ + lock_rec_inherit_to_gap(left_block, right_block, + PAGE_HEAP_NO_SUPREMUM, + lock_get_min_heap_no(right_block)); + + lock_mutex_exit(); +} + +/*************************************************************//** Resets the original locks on heir and replaces them with gap type locks inherited from rec. */ UNIV_INTERN @@ -3778,10 +4111,22 @@ lock_deadlock_select_victim( /* The joining transaction is 'smaller', choose it as the victim and roll it back. 
*/ - return(ctx->start); +#ifdef WITH_WSREP + if (wsrep_thd_is_BF(ctx->start->mysql_thd, TRUE)) { + return(ctx->wait_lock->trx); + } + else +#endif /* WITH_WSREP */ + return(ctx->start); } - return(ctx->wait_lock->trx); +#ifdef WITH_WSREP + if (wsrep_thd_is_BF(ctx->wait_lock->trx->mysql_thd, TRUE)) { + return(ctx->start); + } + else +#endif /* WITH_WSREP */ + return(ctx->wait_lock->trx); } /********************************************************************//** @@ -3911,8 +4256,14 @@ lock_deadlock_search( ctx->too_deep = TRUE; +#ifdef WITH_WSREP + if (wsrep_thd_is_BF(ctx->start->mysql_thd, TRUE)) { + return(ctx->wait_lock->trx->id); + } + else +#endif /* WITH_WSREP */ /* Select the joining transaction as the victim. */ - return(ctx->start->id); + return(ctx->start->id); } else { /* We do not need to report autoinc locks to the upper @@ -3953,6 +4304,11 @@ lock_deadlock_search( size not big enough. */ ctx->too_deep = TRUE; +#ifdef WITH_WSREP + if (wsrep_thd_is_BF(ctx->start->mysql_thd, TRUE)) + return(lock->trx->id); + else +#endif /* WITH_WSREP */ return(ctx->start->id); } @@ -4137,9 +4493,18 @@ lock_deadlock_check_and_resolve( ut_a(trx == ctx.start); ut_a(victim_trx_id == trx->id); - if (!srv_read_only_mode) { - lock_deadlock_joining_trx_print(trx, lock); +#ifdef WITH_WSREP + if (!wsrep_thd_is_BF(ctx.start->mysql_thd, TRUE)) + { +#endif /* WITH_WSREP */ + if (!srv_read_only_mode) { + lock_deadlock_joining_trx_print(trx, lock); + } +#ifdef WITH_WSREP + } else { + /* BF processor */; } +#endif /* WITH_WSREP */ MONITOR_INC(MONITOR_DEADLOCK); @@ -4177,6 +4542,9 @@ UNIV_INLINE lock_t* lock_table_create( /*==============*/ +#ifdef WITH_WSREP + lock_t* c_lock, /*!< in: conflicting lock */ +#endif dict_table_t* table, /*!< in/out: database table in dictionary cache */ ulint type_mode,/*!< in: lock mode possibly ORed with @@ -4220,7 +4588,59 @@ lock_table_create( ut_ad(table->n_ref_count > 0 || !table->can_be_evicted); UT_LIST_ADD_LAST(trx_locks, trx->lock.trx_locks, 
lock); + +#ifdef WITH_WSREP + if (wsrep_thd_is_wsrep(trx->mysql_thd)) { + if (c_lock && wsrep_thd_is_BF(trx->mysql_thd, FALSE)) { + UT_LIST_INSERT_AFTER( + un_member.tab_lock.locks, table->locks, c_lock, lock); + } else { + UT_LIST_ADD_LAST(un_member.tab_lock.locks, table->locks, lock); + } + + if (c_lock) { + trx_mutex_enter(c_lock->trx); + } + + if (c_lock && c_lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) { + + c_lock->trx->lock.was_chosen_as_deadlock_victim = TRUE; + + if (wsrep_debug) { + wsrep_print_wait_locks(c_lock); + wsrep_print_wait_locks(c_lock->trx->lock.wait_lock); + } + + /* have to release trx mutex for the duration of + victim lock release. This will eventually call + lock_grant, which wants to grant trx mutex again + */ + /* caller has trx_mutex, have to release for lock cancel */ + trx_mutex_exit(trx); + lock_cancel_waiting_and_release(c_lock->trx->lock.wait_lock); + trx_mutex_enter(trx); + + /* trx might not wait for c_lock, but some other lock + does not matter if wait_lock was released above + */ + if (c_lock->trx->lock.wait_lock == c_lock) { + lock_reset_lock_and_trx_wait(lock); + } + + if (wsrep_debug) { + fprintf(stderr, "WSREP: c_lock canceled %llu\n", + (ulonglong) c_lock->trx->id); + } + } + if (c_lock) { + trx_mutex_exit(c_lock->trx); + } + } else { + UT_LIST_ADD_LAST(un_member.tab_lock.locks, table->locks, lock); + } +#else UT_LIST_ADD_LAST(un_member.tab_lock.locks, table->locks, lock); +#endif /* WITH_WSREP */ if (UNIV_UNLIKELY(type_mode & LOCK_WAIT)) { @@ -4377,6 +4797,9 @@ static dberr_t lock_table_enqueue_waiting( /*=======================*/ +#ifdef WITH_WSREP + lock_t* c_lock, /*!< in: conflicting lock */ +#endif ulint mode, /*!< in: lock mode this transaction is requesting */ dict_table_t* table, /*!< in/out: table */ @@ -4421,7 +4844,14 @@ lock_table_enqueue_waiting( /* Enqueue the lock request that will wait to be granted */ - lock = lock_table_create(table, mode | LOCK_WAIT, trx); +#ifdef WITH_WSREP + if 
(trx->lock.was_chosen_as_deadlock_victim) { + return(DB_DEADLOCK); + } + lock = lock_table_create(c_lock, table, mode | LOCK_WAIT, trx); +#else + lock = lock_table_create(table, mode | LOCK_WAIT, trx); +#endif /* WITH_WSREP */ /* Release the mutex to obey the latching order. This is safe, because lock_deadlock_check_and_resolve() @@ -4493,6 +4923,18 @@ lock_table_other_has_incompatible( && !lock_mode_compatible(lock_get_mode(lock), mode) && (wait || !lock_get_wait(lock))) { +#ifdef WITH_WSREP + if(wsrep_thd_is_wsrep(trx->mysql_thd)) { + if (wsrep_debug) { + fprintf(stderr, "WSREP: trx %ld table lock abort\n", + trx->id); + } + trx_mutex_enter(lock->trx); + wsrep_kill_victim((trx_t *)trx, (lock_t *)lock); + trx_mutex_exit(lock->trx); + } +#endif + return(lock); } } @@ -4515,6 +4957,9 @@ lock_table( enum lock_mode mode, /*!< in: lock mode */ que_thr_t* thr) /*!< in: query thread */ { +#ifdef WITH_WSREP + lock_t *c_lock = NULL; +#endif trx_t* trx; dberr_t err; const lock_t* wait_for; @@ -4542,11 +4987,19 @@ lock_table( lock_mutex_enter(); + DBUG_EXECUTE_IF("fatal-semaphore-timeout", + { os_thread_sleep(3600000000); }); + /* We have to check if the new lock is compatible with any locks other transactions have in the table lock queue. 
*/ +#ifdef WITH_WSREP + wait_for = lock_table_other_has_incompatible( + trx, LOCK_WAIT, table, mode); +#else wait_for = lock_table_other_has_incompatible( trx, LOCK_WAIT, table, mode); +#endif trx_mutex_enter(trx); @@ -4554,9 +5007,17 @@ lock_table( mode: this trx may have to wait */ if (wait_for != NULL) { +#ifdef WITH_WSREP + err = lock_table_enqueue_waiting((ib_lock_t*)wait_for, mode | flags, table, thr); +#else err = lock_table_enqueue_waiting(mode | flags, table, thr); +#endif } else { +#ifdef WITH_WSREP + lock_table_create(c_lock, table, mode | flags, trx); +#else lock_table_create(table, mode | flags, trx); +#endif ut_a(!flags || mode == LOCK_S || mode == LOCK_X); @@ -4594,7 +5055,11 @@ lock_table_ix_resurrect( trx, LOCK_WAIT, table, LOCK_IX)); trx_mutex_enter(trx); +#ifdef WITH_WSREP + lock_table_create(NULL, table, LOCK_IX, trx); +#else lock_table_create(table, LOCK_IX, trx); +#endif lock_mutex_exit(); trx_mutex_exit(trx); } @@ -5725,6 +6190,7 @@ lock_rec_queue_validate( if (!lock_rec_get_gap(lock) && !lock_get_wait(lock)) { +#ifndef WITH_WSREP enum lock_mode mode; if (lock_get_mode(lock) == LOCK_S) { @@ -5733,7 +6199,8 @@ lock_rec_queue_validate( mode = LOCK_S; } ut_a(!lock_rec_other_has_expl_req( - mode, 0, 0, block, heap_no, lock->trx)); + mode, 0, 0, block, heap_no, lock->trx)); +#endif /* WITH_WSREP */ } else if (lock_get_wait(lock) && !lock_rec_get_gap(lock)) { @@ -6038,6 +6505,9 @@ lock_rec_insert_check_and_lock( dberr_t err; ulint next_rec_heap_no; ibool inherit_in = *inherit; +#ifdef WITH_WSREP + lock_t* c_lock=NULL; +#endif ut_ad(block->frame == page_align(rec)); ut_ad(!dict_index_is_online_ddl(index) @@ -6094,17 +6564,30 @@ lock_rec_insert_check_and_lock( had to wait for their insert. Both had waiting gap type lock requests on the successor, which produced an unnecessary deadlock. 
*/ +#ifdef WITH_WSREP + if ((c_lock = (ib_lock_t*)lock_rec_other_has_conflicting( + static_cast<enum lock_mode>( + LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION), + block, next_rec_heap_no, trx))) { +#else if (lock_rec_other_has_conflicting( static_cast<enum lock_mode>( LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION), block, next_rec_heap_no, trx)) { +#endif /* WITH_WSREP */ /* Note that we may get DB_SUCCESS also here! */ trx_mutex_enter(trx); +#ifdef WITH_WSREP + err = lock_rec_enqueue_waiting(c_lock, + LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION, + block, next_rec_heap_no, index, thr); +#else err = lock_rec_enqueue_waiting( LOCK_X | LOCK_GAP | LOCK_INSERT_INTENTION, block, next_rec_heap_no, index, thr); +#endif /* WITH_WSREP */ trx_mutex_exit(trx); } else { diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc index 53794a0d773..d1418dcaab5 100644 --- a/storage/innobase/log/log0log.cc +++ b/storage/innobase/log/log0log.cc @@ -2,6 +2,7 @@ Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2009, Google Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Google, Inc. 
Those modifications are gratefully acknowledged and are described @@ -1272,7 +1273,7 @@ log_group_file_header_flush( (ulint) (dest_offset / UNIV_PAGE_SIZE), (ulint) (dest_offset % UNIV_PAGE_SIZE), OS_FILE_LOG_BLOCK_SIZE, - buf, group); + buf, group, 0); srv_stats.os_log_pending_writes.dec(); } @@ -1400,7 +1401,7 @@ loop: fil_io(OS_FILE_WRITE | OS_FILE_LOG, true, group->space_id, 0, (ulint) (next_offset / UNIV_PAGE_SIZE), (ulint) (next_offset % UNIV_PAGE_SIZE), write_len, buf, - group); + group, 0); srv_stats.os_log_pending_writes.dec(); @@ -1966,7 +1967,7 @@ log_group_checkpoint( write_offset / UNIV_PAGE_SIZE, write_offset % UNIV_PAGE_SIZE, OS_FILE_LOG_BLOCK_SIZE, - buf, ((byte*) group + 1)); + buf, ((byte*) group + 1), 0); ut_ad(((ulint) group & 0x1UL) == 0); } @@ -2046,7 +2047,7 @@ log_group_read_checkpoint_info( fil_io(OS_FILE_READ | OS_FILE_LOG, true, group->space_id, 0, field / UNIV_PAGE_SIZE, field % UNIV_PAGE_SIZE, - OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL); + OS_FILE_LOG_BLOCK_SIZE, log_sys->checkpoint_buf, NULL, 0); } /******************************************************//** @@ -2340,7 +2341,7 @@ loop: fil_io(OS_FILE_READ | OS_FILE_LOG, sync, group->space_id, 0, (ulint) (source_offset / UNIV_PAGE_SIZE), (ulint) (source_offset % UNIV_PAGE_SIZE), - len, buf, NULL); + len, buf, NULL, 0); start_lsn += len; buf += len; @@ -2405,7 +2406,7 @@ log_group_archive_file_header_write( dest_offset / UNIV_PAGE_SIZE, dest_offset % UNIV_PAGE_SIZE, 2 * OS_FILE_LOG_BLOCK_SIZE, - buf, &log_archive_io); + buf, &log_archive_io, 0); } /******************************************************//** @@ -2441,7 +2442,7 @@ log_group_archive_completed_header_write( dest_offset % UNIV_PAGE_SIZE, OS_FILE_LOG_BLOCK_SIZE, buf + LOG_FILE_ARCH_COMPLETED, - &log_archive_io); + &log_archive_io, 0); } /******************************************************//** @@ -2569,7 +2570,7 @@ loop: (ulint) (next_offset / UNIV_PAGE_SIZE), (ulint) (next_offset % UNIV_PAGE_SIZE), 
ut_calc_align(len, OS_FILE_LOG_BLOCK_SIZE), buf, - &log_archive_io); + &log_archive_io, 0); start_lsn += len; next_offset += len; diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index 4fe9620ccaa..48a204ff327 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -2,6 +2,7 @@ Copyright (c) 1997, 2014, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -347,7 +348,10 @@ DECLARE_THREAD(recv_writer_thread)( while (srv_shutdown_state == SRV_SHUTDOWN_NONE) { - os_thread_sleep(100000); + /* Wait till we get a signal to clean the LRU list. + Bounded by max wait time of 100ms. */ + ib_int64_t sig_count = os_event_reset(buf_flush_event); + os_event_wait_time_low(buf_flush_event, 100000, sig_count); mutex_enter(&recv_sys->writer_mutex); @@ -2078,7 +2082,7 @@ recv_apply_log_recs_for_backup(void) error = fil_io(OS_FILE_READ, true, recv_addr->space, zip_size, recv_addr->page_no, 0, zip_size, - block->page.zip.data, NULL); + block->page.zip.data, NULL, 0); if (error == DB_SUCCESS && !buf_zip_decompress(block, TRUE)) { exit(1); @@ -2088,7 +2092,7 @@ recv_apply_log_recs_for_backup(void) recv_addr->space, 0, recv_addr->page_no, 0, UNIV_PAGE_SIZE, - block->frame, NULL); + block->frame, NULL, 0); } if (error != DB_SUCCESS) { @@ -2117,13 +2121,13 @@ recv_apply_log_recs_for_backup(void) recv_addr->space, zip_size, recv_addr->page_no, 0, zip_size, - block->page.zip.data, NULL); + block->page.zip.data, NULL, 0); } else { error = fil_io(OS_FILE_WRITE, true, recv_addr->space, 0, recv_addr->page_no, 0, UNIV_PAGE_SIZE, - block->frame, NULL); + block->frame, NULL, 0); } skip_this_recv_addr: recv_addr = HASH_GET_NEXT(addr_hash, recv_addr); @@ -3082,7 +3086,7 @@ 
recv_recovery_from_checkpoint_start_func( fil_io(OS_FILE_READ | OS_FILE_LOG, true, max_cp_group->space_id, 0, 0, 0, LOG_FILE_HDR_SIZE, - log_hdr_buf, max_cp_group); + log_hdr_buf, max_cp_group, 0); if (0 == ut_memcmp(log_hdr_buf + LOG_FILE_WAS_CREATED_BY_HOT_BACKUP, (byte*)"ibbackup", (sizeof "ibbackup") - 1)) { @@ -3113,7 +3117,7 @@ recv_recovery_from_checkpoint_start_func( fil_io(OS_FILE_WRITE | OS_FILE_LOG, true, max_cp_group->space_id, 0, 0, 0, OS_FILE_LOG_BLOCK_SIZE, - log_hdr_buf, max_cp_group); + log_hdr_buf, max_cp_group, 0); } #ifdef UNIV_LOG_ARCHIVE @@ -3742,7 +3746,7 @@ ask_again: /* Read the archive file header */ fil_io(OS_FILE_READ | OS_FILE_LOG, true, group->archive_space_id, 0, 0, - LOG_FILE_HDR_SIZE, buf, NULL); + LOG_FILE_HDR_SIZE, buf, NULL, 0); /* Check if the archive file header is consistent */ @@ -3815,7 +3819,7 @@ ask_again: fil_io(OS_FILE_READ | OS_FILE_LOG, true, group->archive_space_id, read_offset / UNIV_PAGE_SIZE, - read_offset % UNIV_PAGE_SIZE, len, buf, NULL); + read_offset % UNIV_PAGE_SIZE, len, buf, NULL, 0); ret = recv_scan_log_recs( (buf_pool_get_n_pages() diff --git a/storage/innobase/mysql-test/storage_engine/type_char_indexes.rdiff b/storage/innobase/mysql-test/storage_engine/type_char_indexes.rdiff index 7a388552c57..98e17f3c825 100644 --- a/storage/innobase/mysql-test/storage_engine/type_char_indexes.rdiff +++ b/storage/innobase/mysql-test/storage_engine/type_char_indexes.rdiff @@ -1,6 +1,6 @@ ---- suite/storage_engine/type_char_indexes.result 2012-07-12 19:27:42.191013570 +0400 -+++ suite/storage_engine/type_char_indexes.reject 2012-07-15 17:51:55.810034331 +0400 -@@ -135,7 +135,7 @@ +--- suite/storage_engine/type_char_indexes.result 2014-10-12 14:22:11.000000000 +0400 ++++ suite/storage_engine/type_char_indexes.reject 2014-10-12 14:23:28.000000000 +0400 +@@ -137,7 +137,7 @@ r3a EXPLAIN SELECT c,c20,v16,v128 FROM t1 WHERE v16 = 'varchar1a' OR v16 = 'varchar3a' ORDER BY v16; id select_type table type possible_keys key key_len 
ref rows Extra diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index f94d6353431..89c8bf373f7 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -2,6 +2,7 @@ Copyright (c) 1995, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, 2014, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Percona Inc.. Those modifications are @@ -42,8 +43,13 @@ Created 10/21/1995 Heikki Tuuri #include "srv0srv.h" #include "srv0start.h" #include "fil0fil.h" +#include "fil0pagecompress.h" #include "buf0buf.h" #include "srv0mon.h" +#include "srv0srv.h" +#ifdef HAVE_POSIX_FALLOCATE +#include "fcntl.h" +#endif #ifndef UNIV_HOTBACKUP # include "os0sync.h" # include "os0thread.h" @@ -60,6 +66,21 @@ Created 10/21/1995 Heikki Tuuri #include <libaio.h> #endif +#if defined(UNIV_LINUX) && defined(HAVE_SYS_IOCTL_H) +# include <sys/ioctl.h> +# ifndef DFS_IOCTL_ATOMIC_WRITE_SET +# define DFS_IOCTL_ATOMIC_WRITE_SET _IOW(0x95, 2, uint) +# endif +#endif + +#if defined(UNIV_LINUX) && defined(HAVE_SYS_STATVFS_H) +#include <sys/statvfs.h> +#endif + +#ifdef HAVE_LZO +#include "lzo/lzo1x.h" +#endif + /** Insert buffer segment id */ static const ulint IO_IBUF_SEGMENT = 0; @@ -87,6 +108,12 @@ UNIV_INTERN os_ib_mutex_t os_file_seek_mutexes[OS_FILE_N_SEEK_MUTEXES]; /* In simulated aio, merge at most this many consecutive i/os */ #define OS_AIO_MERGE_N_CONSECUTIVE 64 +#ifdef WITH_INNODB_DISALLOW_WRITES +#define WAIT_ALLOW_WRITES() os_event_wait(srv_allow_writes_event) +#else +#define WAIT_ALLOW_WRITES() do { } while (0) +#endif /* WITH_INNODB_DISALLOW_WRITES */ + /********************************************************************** InnoDB AIO Implementation: @@ -175,6 +202,32 @@ struct os_aio_slot_t{ and which can be used to identify which pending aio operation was completed */ + ulint bitmap; + + byte* page_compression_page; /*!< Memory allocated 
for + page compressed page and + freed after the write + has been completed */ + + ibool page_compression; + ulint page_compression_level; + + ulint* write_size; /*!< Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ + + byte* page_buf; /*!< Actual page buffer for + page compressed pages, do not + free this */ + + ibool page_compress_success; + /*!< TRUE if page compression was + successfull, false if not */ + + ulint file_block_size;/*!< file block size */ + #ifdef WIN_ASYNC_IO HANDLE handle; /*!< handle object we need in the OVERLAPPED struct */ @@ -185,6 +238,7 @@ struct os_aio_slot_t{ int n_bytes; /* bytes written/read. */ int ret; /* AIO return code */ #endif /* WIN_ASYNC_IO */ + byte *lzo_mem; /* Temporal memory used by LZO */ }; /** The asynchronous i/o array structure */ @@ -294,6 +348,88 @@ UNIV_INTERN ulint os_n_pending_writes = 0; /** Number of pending read operations */ UNIV_INTERN ulint os_n_pending_reads = 0; +/** After first fallocate failure we will disable os_file_trim */ +UNIV_INTERN ibool os_fallocate_failed = FALSE; + +/**********************************************************************//** +Directly manipulate the allocated disk space by deallocating for the file referred to +by fd for the byte range starting at offset and continuing for len bytes. +Within the specified range, partial file system blocks are zeroed, and whole +file system blocks are removed from the file. After a successful call, +subsequent reads from this range will return zeroes. +@return true if success, false if error */ +UNIV_INTERN +ibool +os_file_trim( +/*=========*/ + os_aio_slot_t* slot); /*!< in: slot structure */ + +/**********************************************************************//** +Allocate memory for temporal buffer used for page compression. This +buffer is freed later. 
*/ +UNIV_INTERN +void +os_slot_alloc_page_buf( +/*===================*/ + os_aio_slot_t* slot); /*!< in: slot structure */ + +#ifdef HAVE_LZO +/**********************************************************************//** +Allocate memory for temporal memory used for page compression when +LZO compression method is used */ +UNIV_INTERN +void +os_slot_alloc_lzo_mem( +/*===================*/ + os_aio_slot_t* slot); /*!< in: slot structure */ +#endif + +/****************************************************************//** +Does error handling when a file operation fails. +@return TRUE if we should retry the operation */ +ibool +os_file_handle_error_no_exit( +/*=========================*/ + const char* name, /*!< in: name of a file or NULL */ + const char* operation, /*!< in: operation */ + ibool on_error_silent,/*!< in: if TRUE then don't print + any message to the log. */ + const char* file, /*!< in: file name */ + const ulint line); /*!< in: line */ + +/****************************************************************//** +Tries to enable the atomic write feature, if available, for the specified file +handle. 
+@return TRUE if success */ +static __attribute__((warn_unused_result)) +ibool +os_file_set_atomic_writes( +/*======================*/ + const char* name /*!< in: name of the file */ + __attribute__((unused)), + os_file_t file /*!< in: handle to the file */ + __attribute__((unused))) +{ +#ifdef DFS_IOCTL_ATOMIC_WRITE_SET + int atomic_option = 1; + + if (ioctl(file, DFS_IOCTL_ATOMIC_WRITE_SET, &atomic_option)) { + + fprintf(stderr, "InnoDB: Warning:Trying to enable atomic writes on " + "file %s on non-supported platform!\n", name); + os_file_handle_error_no_exit(name, "ioctl", FALSE, __FILE__, __LINE__); + return(FALSE); + } + + return(TRUE); +#else + fprintf(stderr, "InnoDB: Error: trying to enable atomic writes on " + "file %s on non-supported platform!\n", name); + return(FALSE); +#endif +} + + #ifdef UNIV_DEBUG # ifndef UNIV_HOTBACKUP /**********************************************************************//** @@ -439,6 +575,19 @@ os_file_get_last_error_low( "InnoDB: because of either a thread exit" " or an application request.\n" "InnoDB: Retry attempt is made.\n"); + } else if (err == ECANCELED || err == ENOTTY) { + if (strerror(err) != NULL) { + fprintf(stderr, + "InnoDB: Error number %d" + " means '%s'.\n", + err, strerror(err)); + } + + if(srv_use_atomic_writes) { + fprintf(stderr, + "InnoDB: Error trying to enable atomic writes on " + "non-supported destination!\n"); + } } else { fprintf(stderr, "InnoDB: Some operating system error numbers" @@ -503,6 +652,19 @@ os_file_get_last_error_low( "InnoDB: The error means mysqld does not have" " the access rights to\n" "InnoDB: the directory.\n"); + } else if (err == ECANCELED || err == ENOTTY) { + if (strerror(err) != NULL) { + fprintf(stderr, + "InnoDB: Error number %d" + " means '%s'.\n", + err, strerror(err)); + } + + if(srv_use_atomic_writes) { + fprintf(stderr, + "InnoDB: Error trying to enable atomic writes on " + "non-supported destination!\n"); + } } else { if (strerror(err) != NULL) { fprintf(stderr, @@ 
-536,6 +698,9 @@ os_file_get_last_error_low( case ENOTDIR: case EISDIR: return(OS_FILE_PATH_ERROR); + case ECANCELED: + case ENOTTY: + return(OS_FILE_OPERATION_NOT_SUPPORTED); case EAGAIN: if (srv_use_native_aio) { return(OS_FILE_AIO_RESOURCES_RESERVED); @@ -582,9 +747,11 @@ os_file_handle_error_cond_exit( const char* operation, /*!< in: operation */ ibool should_exit, /*!< in: call exit(3) if unknown error and this parameter is TRUE */ - ibool on_error_silent)/*!< in: if TRUE then don't print + ibool on_error_silent,/*!< in: if TRUE then don't print any message to the log iff it is an unknown non-fatal error */ + const char* file, /*!< in: file name */ + const ulint line) /*!< in: line */ { ulint err; @@ -614,6 +781,9 @@ os_file_handle_error_cond_exit( " InnoDB: Disk is full. Try to clean the disk" " to free space.\n"); + fprintf(stderr, + " InnoDB: at file %s and at line %ld\n", file, line); + os_has_said_disk_full = TRUE; fflush(stderr); @@ -649,6 +819,12 @@ os_file_handle_error_cond_exit( to the log. */ if (should_exit || !on_error_silent) { + fprintf(stderr, + " InnoDB: Operation %s to file %s and at line %ld\n", + operation, file, line); + } + + if (should_exit || !on_error_silent) { ib_logf(IB_LOG_LEVEL_ERROR, "File %s: '%s' returned OS " "error " ULINTPF ".%s", name ? 
name : "(unknown)", operation, err, should_exit @@ -671,10 +847,12 @@ ibool os_file_handle_error( /*=================*/ const char* name, /*!< in: name of a file or NULL */ - const char* operation) /*!< in: operation */ + const char* operation, /*!< in: operation */ + const char* file, /*!< in: file name */ + const ulint line) /*!< in: line */ { /* exit in case of unknown error */ - return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE)); + return(os_file_handle_error_cond_exit(name, operation, TRUE, FALSE, file, line)); } /****************************************************************//** @@ -685,12 +863,14 @@ os_file_handle_error_no_exit( /*=========================*/ const char* name, /*!< in: name of a file or NULL */ const char* operation, /*!< in: operation */ - ibool on_error_silent)/*!< in: if TRUE then don't print + ibool on_error_silent,/*!< in: if TRUE then don't print any message to the log. */ + const char* file, /*!< in: file name */ + const ulint line) /*!< in: line */ { /* don't exit in case of unknown error */ return(os_file_handle_error_cond_exit( - name, operation, FALSE, on_error_silent)); + name, operation, FALSE, on_error_silent, file, line)); } #undef USE_FILE_LOCK @@ -766,7 +946,9 @@ os_file_create_tmpfile(void) /*========================*/ { FILE* file = NULL; - int fd = innobase_mysql_tmpfile(); + int fd; + WAIT_ALLOW_WRITES(); + fd = innobase_mysql_tmpfile(); ut_ad(!srv_read_only_mode); @@ -830,7 +1012,7 @@ os_file_opendir( if (dir == INVALID_HANDLE_VALUE) { if (error_is_fatal) { - os_file_handle_error(dirname, "opendir"); + os_file_handle_error(dirname, "opendir", __FILE__, __LINE__); } return(NULL); @@ -841,7 +1023,7 @@ os_file_opendir( dir = opendir(dirname); if (dir == NULL && error_is_fatal) { - os_file_handle_error(dirname, "opendir"); + os_file_handle_error(dirname, "opendir", __FILE__, __LINE__); } return(dir); @@ -863,7 +1045,7 @@ os_file_closedir( ret = FindClose(dir); if (!ret) { - 
os_file_handle_error_no_exit(NULL, "closedir", FALSE); + os_file_handle_error_no_exit(NULL, "closedir", FALSE, __FILE__, __LINE__); return(-1); } @@ -875,7 +1057,7 @@ os_file_closedir( ret = closedir(dir); if (ret) { - os_file_handle_error_no_exit(NULL, "closedir", FALSE); + os_file_handle_error_no_exit(NULL, "closedir", FALSE, __FILE__, __LINE__); } return(ret); @@ -947,7 +1129,7 @@ next_file: return(1); } else { - os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE); + os_file_handle_error_no_exit(NULL, "readdir_next_file", FALSE, __FILE__, __LINE__); return(-1); } #else @@ -1033,7 +1215,7 @@ next_file: goto next_file; } - os_file_handle_error_no_exit(full_path, "stat", FALSE); + os_file_handle_error_no_exit(full_path, "stat", FALSE, __FILE__, __LINE__); ut_free(full_path); @@ -1084,7 +1266,7 @@ os_file_create_directory( && !fail_if_exists))) { os_file_handle_error_no_exit( - pathname, "CreateDirectory", FALSE); + pathname, "CreateDirectory", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -1092,12 +1274,13 @@ os_file_create_directory( return(TRUE); #else int rcode; + WAIT_ALLOW_WRITES(); rcode = mkdir(pathname, 0770); if (!(rcode == 0 || (errno == EEXIST && !fail_if_exists))) { /* failure */ - os_file_handle_error_no_exit(pathname, "mkdir", FALSE); + os_file_handle_error_no_exit(pathname, "mkdir", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -1207,7 +1390,7 @@ os_file_create_simple_func( retry = os_file_handle_error( name, create_mode == OS_FILE_OPEN ? - "open" : "create"); + "open" : "create", __FILE__, __LINE__); } else { *success = TRUE; @@ -1218,6 +1401,8 @@ os_file_create_simple_func( #else /* __WIN__ */ int create_flag; + if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) + WAIT_ALLOW_WRITES(); ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); @@ -1275,7 +1460,7 @@ os_file_create_simple_func( retry = os_file_handle_error( name, create_mode == OS_FILE_OPEN - ? 
"open" : "create"); + ? "open" : "create", __FILE__, __LINE__); } else { *success = TRUE; retry = false; @@ -1317,9 +1502,12 @@ os_file_create_simple_no_error_handling_func( OS_FILE_READ_WRITE, or OS_FILE_READ_ALLOW_DELETE; the last option is used by a backup program reading the file */ - ibool* success)/*!< out: TRUE if succeed, FALSE if error */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + ulint atomic_writes) /*! in: atomic writes table option + value */ { os_file_t file; + atomic_writes_t awrites = (atomic_writes_t) atomic_writes; *success = FALSE; #ifdef __WIN__ @@ -1380,11 +1568,30 @@ os_file_create_simple_no_error_handling_func( attributes, NULL); // No template file + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ + if (file != INVALID_HANDLE_VALUE + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) + && !os_file_set_atomic_writes(name, file)) { + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + CloseHandle(file); + os_file_delete_if_exists_func(name); + *success = FALSE; + file = INVALID_HANDLE_VALUE; + } + } + *success = (file != INVALID_HANDLE_VALUE); #else /* __WIN__ */ int create_flag; ut_a(name); + if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) + WAIT_ALLOW_WRITES(); ut_a(!(create_mode & OS_FILE_ON_ERROR_SILENT)); ut_a(!(create_mode & OS_FILE_ON_ERROR_NO_EXIT)); @@ -1440,6 +1647,24 @@ os_file_create_simple_no_error_handling_func( } #endif /* USE_FILE_LOCK */ + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. 
If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ + if (file != -1 + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) + && !os_file_set_atomic_writes(name, file)) { + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + close(file); + os_file_delete_if_exists_func(name); + *success = FALSE; + file = -1; + } + } + + #endif /* __WIN__ */ return(file); @@ -1524,12 +1749,15 @@ os_file_create_func( async i/o or unbuffered i/o: look in the function source code for the exact rules */ ulint type, /*!< in: OS_DATA_FILE or OS_LOG_FILE */ - ibool* success)/*!< out: TRUE if succeed, FALSE if error */ + ibool* success,/*!< out: TRUE if succeed, FALSE if error */ + ulint atomic_writes) /*! in: atomic writes table option + value */ { os_file_t file; ibool retry; ibool on_error_no_exit; ibool on_error_silent; + atomic_writes_t awrites = (atomic_writes_t) atomic_writes; #ifdef __WIN__ DBUG_EXECUTE_IF( @@ -1662,9 +1890,9 @@ os_file_create_func( if (on_error_no_exit) { retry = os_file_handle_error_no_exit( - name, operation, on_error_silent); + name, operation, on_error_silent, __FILE__, __LINE__); } else { - retry = os_file_handle_error(name, operation); + retry = os_file_handle_error(name, operation, __FILE__, __LINE__); } } else { *success = TRUE; @@ -1673,9 +1901,27 @@ os_file_create_func( } while (retry); + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. 
If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ + if (file != INVALID_HANDLE_VALUE + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) + && !os_file_set_atomic_writes(name, file)) { + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + CloseHandle(file); + os_file_delete_if_exists_func(name); + *success = FALSE; + file = INVALID_HANDLE_VALUE; + } + } #else /* __WIN__ */ int create_flag; const char* mode_str = NULL; + if (create_mode != OS_FILE_OPEN && create_mode != OS_FILE_OPEN_RAW) + WAIT_ALLOW_WRITES(); on_error_no_exit = create_mode & OS_FILE_ON_ERROR_NO_EXIT ? TRUE : FALSE; @@ -1747,9 +1993,9 @@ os_file_create_func( if (on_error_no_exit) { retry = os_file_handle_error_no_exit( - name, operation, on_error_silent); + name, operation, on_error_silent, __FILE__, __LINE__); } else { - retry = os_file_handle_error(name, operation); + retry = os_file_handle_error(name, operation, __FILE__, __LINE__); } } else { *success = TRUE; @@ -1801,6 +2047,22 @@ os_file_create_func( } #endif /* USE_FILE_LOCK */ + /* If we have proper file handle and atomic writes should be used, + try to set atomic writes and if that fails when creating a new + table, produce a error. 
If atomic writes are used on existing + file, ignore error and use traditional writes for that file */ + if (file != -1 + && (awrites == ATOMIC_WRITES_ON || + (srv_use_atomic_writes && awrites == ATOMIC_WRITES_DEFAULT)) + && !os_file_set_atomic_writes(name, file)) { + if (create_mode == OS_FILE_CREATE) { + fprintf(stderr, "InnoDB: Error: Can't create file using atomic writes\n"); + close(file); + os_file_delete_if_exists_func(name); + *success = FALSE; + file = -1; + } + } #endif /* __WIN__ */ return(file); @@ -1855,11 +2117,12 @@ loop: goto loop; #else int ret; + WAIT_ALLOW_WRITES(); ret = unlink(name); if (ret != 0 && errno != ENOENT) { - os_file_handle_error_no_exit(name, "delete", FALSE); + os_file_handle_error_no_exit(name, "delete", FALSE, __FILE__, __LINE__); return(false); } @@ -1919,11 +2182,12 @@ loop: goto loop; #else int ret; + WAIT_ALLOW_WRITES(); ret = unlink(name); if (ret != 0) { - os_file_handle_error_no_exit(name, "delete", FALSE); + os_file_handle_error_no_exit(name, "delete", FALSE, __FILE__, __LINE__); return(false); } @@ -1967,16 +2231,17 @@ os_file_rename_func( return(TRUE); } - os_file_handle_error_no_exit(oldpath, "rename", FALSE); + os_file_handle_error_no_exit(oldpath, "rename", FALSE, __FILE__, __LINE__); return(FALSE); #else int ret; + WAIT_ALLOW_WRITES(); ret = rename(oldpath, newpath); if (ret != 0) { - os_file_handle_error_no_exit(oldpath, "rename", FALSE); + os_file_handle_error_no_exit(oldpath, "rename", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -2007,7 +2272,7 @@ os_file_close_func( return(TRUE); } - os_file_handle_error(NULL, "close"); + os_file_handle_error(NULL, "close", __FILE__, __LINE__); return(FALSE); #else @@ -2016,7 +2281,7 @@ os_file_close_func( ret = close(file); if (ret == -1) { - os_file_handle_error(NULL, "close"); + os_file_handle_error(NULL, "close", __FILE__, __LINE__); return(FALSE); } @@ -2118,15 +2383,15 @@ os_file_set_size( fprintf(stderr, "InnoDB: Error: preallocating file " "space for file \'%s\' 
failed. Current size " "%lu, desired size %lu\n", - name, (long unsigned) current_size, (long unsigned) size); - os_file_handle_error_no_exit(name, "posix_fallocate", FALSE); + name, current_size, size); + os_file_handle_error_no_exit(name, "posix_fallocate", FALSE, __FILE__, __LINE__); + return(FALSE); } return(TRUE); } #endif - /* Write up to 1 megabyte at a time. */ buf_size = ut_min(64, (ulint) (size / UNIV_PAGE_SIZE)) * UNIV_PAGE_SIZE; @@ -2153,6 +2418,7 @@ os_file_set_size( } ret = os_file_write(name, file, buf, current_size, n_bytes); + if (!ret) { ut_free(buf2); goto error_handling; @@ -2200,6 +2466,7 @@ os_file_set_eof( HANDLE h = (HANDLE) _get_osfhandle(fileno(file)); return(SetEndOfFile(h)); #else /* __WIN__ */ + WAIT_ALLOW_WRITES(); return(!ftruncate(fileno(file), ftell(file))); #endif /* __WIN__ */ } @@ -2285,7 +2552,7 @@ os_file_flush_func( return(TRUE); } - os_file_handle_error(NULL, "flush"); + os_file_handle_error(NULL, "flush", __FILE__, __LINE__); /* It is a fatal error if a file flush does not succeed, because then the database can get corrupt on disk */ @@ -2294,6 +2561,7 @@ os_file_flush_func( return(FALSE); #else int ret; + WAIT_ALLOW_WRITES(); #if defined(HAVE_DARWIN_THREADS) # ifndef F_FULLFSYNC @@ -2339,7 +2607,7 @@ os_file_flush_func( ib_logf(IB_LOG_LEVEL_ERROR, "The OS said file flush did not succeed"); - os_file_handle_error(NULL, "flush"); + os_file_handle_error(NULL, "flush", __FILE__, __LINE__); /* It is a fatal error if a file flush does not succeed, because then the database can get corrupt on disk */ @@ -2577,7 +2845,9 @@ os_file_read_func( os_file_t file, /*!< in: handle to a file */ void* buf, /*!< in: buffer where to read */ os_offset_t offset, /*!< in: file offset where to read */ - ulint n) /*!< in: number of bytes to read */ + ulint n, /*!< in: number of bytes to read */ + ibool compressed) /*!< in: is this file space + compressed ? 
*/ { #ifdef __WIN__ BOOL ret; @@ -2646,6 +2916,14 @@ try_again: os_mutex_exit(os_file_count_mutex); if (ret && len == n) { + /* Note that InnoDB writes files that are not formated + as file spaces and they do not have FIL_PAGE_TYPE + field, thus we must use here information is the actual + file space compressed. */ + if (compressed && fil_page_is_compressed((byte *)buf)) { + fil_decompress_page(NULL, (byte *)buf, len, NULL); + } + return(TRUE); } #else /* __WIN__ */ @@ -2658,6 +2936,13 @@ try_again: ret = os_file_pread(file, buf, n, offset); if ((ulint) ret == n) { + /* Note that InnoDB writes files that are not formated + as file spaces and they do not have FIL_PAGE_TYPE + field, thus we must use here information is the actual + file space compressed. */ + if (compressed && fil_page_is_compressed((byte *)buf)) { + fil_decompress_page(NULL, (byte *)buf, n, NULL); + } return(TRUE); } @@ -2669,7 +2954,7 @@ try_again: #ifdef __WIN__ error_handling: #endif - retry = os_file_handle_error(NULL, "read"); + retry = os_file_handle_error(NULL, "read", __FILE__, __LINE__); if (retry) { goto try_again; @@ -2704,7 +2989,9 @@ os_file_read_no_error_handling_func( os_file_t file, /*!< in: handle to a file */ void* buf, /*!< in: buffer where to read */ os_offset_t offset, /*!< in: file offset where to read */ - ulint n) /*!< in: number of bytes to read */ + ulint n, /*!< in: number of bytes to read */ + ibool compressed) /*!< in: is this file space + compressed ? */ { #ifdef __WIN__ BOOL ret; @@ -2773,6 +3060,15 @@ try_again: os_mutex_exit(os_file_count_mutex); if (ret && len == n) { + + /* Note that InnoDB writes files that are not formated + as file spaces and they do not have FIL_PAGE_TYPE + field, thus we must use here information is the actual + file space compressed. 
*/ + if (compressed && fil_page_is_compressed((byte *)buf)) { + fil_decompress_page(NULL, (byte *)buf, n, NULL); + } + return(TRUE); } #else /* __WIN__ */ @@ -2785,6 +3081,13 @@ try_again: ret = os_file_pread(file, buf, n, offset); if ((ulint) ret == n) { + /* Note that InnoDB writes files that are not formated + as file spaces and they do not have FIL_PAGE_TYPE + field, thus we must use here information is the actual + file space compressed. */ + if (compressed && fil_page_is_compressed((byte *)buf)) { + fil_decompress_page(NULL, (byte *)buf, n, NULL); + } return(TRUE); } @@ -2792,7 +3095,7 @@ try_again: #ifdef __WIN__ error_handling: #endif - retry = os_file_handle_error_no_exit(NULL, "read", FALSE); + retry = os_file_handle_error_no_exit(NULL, "read", FALSE, __FILE__, __LINE__); if (retry) { goto try_again; @@ -2864,6 +3167,7 @@ os_file_write_func( ut_ad(file); ut_ad(buf); ut_ad(n > 0); + retry: low = (DWORD) offset & 0xFFFFFFFF; high = (DWORD) (offset >> 32); @@ -2995,6 +3299,7 @@ retry: return(FALSE); #else ssize_t ret; + WAIT_ALLOW_WRITES(); ret = os_file_pwrite(file, buf, n, offset); @@ -3060,7 +3365,7 @@ os_file_status( } else if (ret) { /* file exists, but stat call failed */ - os_file_handle_error_no_exit(path, "stat", FALSE); + os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -3088,7 +3393,7 @@ os_file_status( } else if (ret) { /* file exists, but stat call failed */ - os_file_handle_error_no_exit(path, "stat", FALSE); + os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); return(FALSE); } @@ -3137,7 +3442,7 @@ os_file_get_status( } else if (ret) { /* file exists, but stat call failed */ - os_file_handle_error_no_exit(path, "stat", FALSE); + os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); return(DB_FAIL); @@ -3190,7 +3495,7 @@ os_file_get_status( } else if (ret) { /* file exists, but stat call failed */ - os_file_handle_error_no_exit(path, "stat", FALSE); + 
os_file_handle_error_no_exit(path, "stat", FALSE, __FILE__, __LINE__); return(DB_FAIL); @@ -3756,7 +4061,8 @@ os_aio_array_create( array->slots = static_cast<os_aio_slot_t*>( ut_malloc(n * sizeof(*array->slots))); - memset(array->slots, 0x0, sizeof(n * sizeof(*array->slots))); + memset(array->slots, 0x0, n * sizeof(*array->slots)); + #ifdef __WIN__ array->handles = static_cast<HANDLE*>(ut_malloc(n * sizeof(HANDLE))); #endif /* __WIN__ */ @@ -3844,8 +4150,8 @@ os_aio_array_free( /*==============*/ os_aio_array_t*& array) /*!< in, own: array to free */ { -#ifdef WIN_ASYNC_IO ulint i; +#ifdef WIN_ASYNC_IO for (i = 0; i < array->n_slots; i++) { os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i); @@ -3867,6 +4173,19 @@ os_aio_array_free( } #endif /* LINUX_NATIVE_AIO */ + for (i = 0; i < array->n_slots; i++) { + os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i); + if (slot->page_compression_page) { + ut_free(slot->page_compression_page); + slot->page_compression_page = NULL; + } + + if (slot->lzo_mem) { + ut_free(slot->lzo_mem); + slot->lzo_mem = NULL; + } + } + ut_free(array->slots); ut_free(array); @@ -4200,7 +4519,16 @@ os_aio_array_reserve_slot( void* buf, /*!< in: buffer where to read or from which to write */ os_offset_t offset, /*!< in: file offset */ - ulint len) /*!< in: length of the block to read or write */ + ulint len, /*!< in: length of the block to read or write */ + ulint* write_size,/*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. 
*/ + ibool page_compression, /*!< in: is page compression used + on this file space */ + ulint page_compression_level) /*!< page compression + level to be used */ { os_aio_slot_t* slot = NULL; #ifdef WIN_ASYNC_IO @@ -4290,6 +4618,63 @@ found: slot->buf = static_cast<byte*>(buf); slot->offset = offset; slot->io_already_done = FALSE; + slot->page_compress_success = FALSE; + slot->write_size = write_size; + slot->page_compression_level = page_compression_level; + slot->page_compression = page_compression; + + if (message1) { + slot->file_block_size = fil_node_get_block_size(message1); + } + + /* If the space is page compressed and this is write operation + then we compress the page */ + if (message1 && type == OS_FILE_WRITE && page_compression ) { + ulint real_len = len; + byte* tmp = NULL; + + /* Release the array mutex while compressing */ + os_mutex_exit(array->mutex); + + // We allocate memory for page compressed buffer if and only + // if it is not yet allocated. + if (slot->page_buf == NULL) { + os_slot_alloc_page_buf(slot); + } + +#ifdef HAVE_LZO + if (innodb_compression_algorithm == 3 && slot->lzo_mem == NULL) { + os_slot_alloc_lzo_mem(slot); + } +#endif + + /* Call page compression */ + tmp = fil_compress_page(fil_node_get_space_id(slot->message1), + (byte *)buf, + slot->page_buf, + len, + page_compression_level, + fil_node_get_block_size(slot->message1), + &real_len, + slot->lzo_mem + ); + + /* If compression succeeded, set up the length and buffer */ + if (tmp != buf) { + len = real_len; + buf = slot->page_buf; + slot->len = real_len; + slot->page_compress_success = TRUE; + } else { + slot->page_compress_success = FALSE; + } + + /* Take array mutex back, not sure if this is really needed + below */ + os_mutex_enter(array->mutex); + + } + #ifdef WIN_ASYNC_IO control = &slot->control; @@ -4564,10 +4949,19 @@ os_aio_func( (can be used to identify a completed aio operation); ignored if mode is OS_AIO_SYNC */ - void* message2)/*!< in: message for the aio handler 
+ void* message2,/*!< in: message for the aio handler (can be used to identify a completed aio operation); ignored if mode is OS_AIO_SYNC */ + ulint* write_size,/*!< in/out: Actual write size initialized + after fist successfull trim + operation for this page and if + initialized we do not trim again if + actual page size does not decrease. */ + ibool page_compression, /*!< in: is page compression used + on this file space */ + ulint page_compression_level) /*!< page compression + level to be used */ { os_aio_array_t* array; os_aio_slot_t* slot; @@ -4618,7 +5012,8 @@ os_aio_func( and os_file_write_func() */ if (type == OS_FILE_READ) { - ret = os_file_read_func(file, buf, offset, n); + ret = os_file_read_func(file, buf, offset, n, + page_compression); } else { ut_ad(!srv_read_only_mode); @@ -4627,12 +5022,12 @@ os_aio_func( ret = os_file_write_func(name, file, buf, offset, n); } - DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28", - os_has_said_disk_full = FALSE;); - DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28", - ret = 0;); - DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28", - errno = 28;); + if (type == OS_FILE_WRITE) { + DBUG_EXECUTE_IF("ib_os_aio_func_io_failure_28", + os_has_said_disk_full = FALSE; + ret = 0; + errno = 28;); + } return ret; } @@ -4680,7 +5075,8 @@ try_again: } slot = os_aio_array_reserve_slot(type, array, message1, message2, file, - name, buf, offset, n); + name, buf, offset, n, write_size, page_compression, page_compression_level); + if (type == OS_FILE_READ) { if (srv_use_native_aio) { os_n_file_reads++; @@ -4760,7 +5156,7 @@ err_exit: os_aio_array_free_slot(array, slot); if (os_file_handle_error( - name,type == OS_FILE_READ ? "aio read" : "aio write")) { + name,type == OS_FILE_READ ? 
"aio read" : "aio write", __FILE__, __LINE__)) { goto try_again; } @@ -4873,9 +5269,17 @@ os_aio_windows_handle( if (ret && len == slot->len) { ret_val = TRUE; - } else if (os_file_handle_error(slot->name, "Windows aio")) { + } else if (!ret || (len != slot->len)) { - retry = TRUE; + if (!ret) { + if (os_file_handle_error(slot->name, "Windows aio", __FILE__, __LINE__)) { + retry = TRUE; + } else { + ret_val = FALSE; + } + } else { + retry = TRUE; + } } else { ret_val = FALSE; @@ -4903,9 +5307,17 @@ os_aio_windows_handle( switch (slot->type) { case OS_FILE_WRITE: - ret = WriteFile(slot->file, slot->buf, + if (slot->message1 && + slot->page_compression && + slot->page_buf) { + ret = WriteFile(slot->file, slot->page_buf, (DWORD) slot->len, &len, &(slot->control)); + } else { + ret = WriteFile(slot->file, slot->buf, + (DWORD) slot->len, &len, + &(slot->control)); + } break; case OS_FILE_READ: @@ -4937,6 +5349,30 @@ os_aio_windows_handle( ret_val = ret && len == slot->len; } + if (slot->message1 && slot->page_compression) { + // We allocate memory for page compressed buffer if and only + // if it is not yet allocated. + if (slot->page_buf == NULL) { + os_slot_alloc_page_buf(slot); + } +#ifdef HAVE_LZO + if (innodb_compression_algorithm == 3 && slot->lzo_mem == NULL) { + os_slot_alloc_lzo_mem(slot); + } +#endif + + if (slot->type == OS_FILE_READ) { + fil_decompress_page(slot->page_buf, slot->buf, slot->len, slot->write_size); + } else { + if (slot->page_compress_success && fil_page_is_compressed(slot->page_buf)) { + if (srv_use_trim && os_fallocate_failed == FALSE) { + // Deallocate unused blocks from file system + os_file_trim(slot); + } + } + } + } + os_aio_array_free_slot(array, slot); return(ret_val); @@ -5026,6 +5462,36 @@ retry: /* We have not overstepped to next segment. */ ut_a(slot->pos < end_pos); + /* If the table is page compressed and this is read, + we decompress before we annouce the read is + complete. For writes, we free the compressed page. 
*/ + if (slot->message1 && slot->page_compression) { + // We allocate memory for page compressed buffer if and only + // if it is not yet allocated. + if (slot->page_buf == NULL) { + os_slot_alloc_page_buf(slot); + } + +#ifdef HAVE_LZO + if (innodb_compression_algorithm == 3 && slot->lzo_mem == NULL) { + os_slot_alloc_lzo_mem(slot); + } +#endif + + if (slot->type == OS_FILE_READ) { + fil_decompress_page(slot->page_buf, slot->buf, slot->len, slot->write_size); + } else { + if (slot->page_compress_success && + fil_page_is_compressed(slot->page_buf)) { + ut_ad(slot->page_compression_page); + if (srv_use_trim && os_fallocate_failed == FALSE) { + // Deallocate unused blocks from file system + os_file_trim(slot); + } + } + } + } + /* Mark this request as completed. The error handling will be done in the calling function. */ os_mutex_enter(array->mutex); @@ -5169,6 +5635,13 @@ found: } else { errno = -slot->ret; + if (slot->ret == 0) { + fprintf(stderr, + "InnoDB: Number of bytes after aio %d requested %lu\n" + "InnoDB: from file %s\n", + slot->n_bytes, slot->len, slot->name); + } + /* os_file_handle_error does tell us if we should retry this IO. As it stands now, we don't do this retry when reaping requests from a different context than @@ -5176,7 +5649,7 @@ found: windows and linux native AIO. We should probably look into this to transparently re-submit the IO. 
*/ - os_file_handle_error(slot->name, "Linux aio"); + os_file_handle_error(slot->name, "Linux aio", __FILE__, __LINE__); ret = FALSE; } @@ -5456,7 +5929,8 @@ consecutive_loop: } else { ret = os_file_read( aio_slot->file, combined_buf, - aio_slot->offset, total_len); + aio_slot->offset, total_len, + aio_slot->page_compression); } if (aio_slot->type == OS_FILE_WRITE) { @@ -5853,4 +6327,282 @@ os_aio_all_slots_free(void) } #endif /* UNIV_DEBUG */ +#ifdef _WIN32 +#include <winioctl.h> +#ifndef FSCTL_FILE_LEVEL_TRIM +#define FSCTL_FILE_LEVEL_TRIM CTL_CODE(FILE_DEVICE_FILE_SYSTEM, 130, METHOD_BUFFERED, FILE_WRITE_DATA) +typedef struct _FILE_LEVEL_TRIM_RANGE { + DWORDLONG Offset; + DWORDLONG Length; +} FILE_LEVEL_TRIM_RANGE, *PFILE_LEVEL_TRIM_RANGE; + +typedef struct _FILE_LEVEL_TRIM { + DWORD Key; + DWORD NumRanges; + FILE_LEVEL_TRIM_RANGE Ranges[1]; +} FILE_LEVEL_TRIM, *PFILE_LEVEL_TRIM; +#endif +#endif + +/**********************************************************************//** +Directly manipulate the allocated disk space by deallocating for the file referred to +by fd for the byte range starting at offset and continuing for len bytes. +Within the specified range, partial file system blocks are zeroed, and whole +file system blocks are removed from the file. After a successful call, +subsequent reads from this range will return zeroes. 
+@return true if success, false if error */ +UNIV_INTERN +ibool +os_file_trim( +/*=========*/ + os_aio_slot_t* slot) /*!< in: slot structure */ +{ + + size_t len = slot->len; + size_t trim_len = UNIV_PAGE_SIZE - len; + os_offset_t off = slot->offset + len; + size_t bsize = slot->file_block_size; + + // len here should be alligned to sector size + ut_ad((trim_len % bsize) == 0); + ut_ad((len % bsize) == 0); + ut_ad(bsize != 0); + ut_ad((off % bsize) == 0); + +#ifdef UNIV_TRIM_DEBUG + fprintf(stderr, "Note: TRIM: write_size %lu trim_len %lu len %lu off %lu bz %lu\n", + *slot->write_size, trim_len, len, off, bsize); +#endif + + // Nothing to do if trim length is zero or if actual write + // size is initialized and it is smaller than current write size. + // In first write if we trim we set write_size to actual bytes + // written and rest of the page is trimmed. In following writes + // there is no need to trim again if write_size only increases + // because rest of the page is already trimmed. If actual write + // size decreases we need to trim again. 
+ if (trim_len == 0 || + (slot->write_size && + *slot->write_size > 0 && + len >= *slot->write_size)) { + +#ifdef UNIV_PAGECOMPRESS_DEBUG + fprintf(stderr, "Note: TRIM: write_size %lu trim_len %lu len %lu\n", + slot->write_size ? *slot->write_size : (ulint)0, trim_len, len); +#endif + + if (slot->write_size && *slot->write_size > 0 && len >= *slot->write_size) { + srv_stats.page_compressed_trim_op_saved.inc(); + } + + if (slot->write_size) { *slot->write_size = len; } + + return (TRUE); + } + +#ifdef __linux__ +#if defined(FALLOC_FL_PUNCH_HOLE) && defined (FALLOC_FL_KEEP_SIZE) + int ret = fallocate(slot->file, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, trim_len); + + if (ret) { + /* After first failure do not try to trim again */ + os_fallocate_failed = TRUE; + srv_use_trim = FALSE; + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: [Warning] fallocate call failed with error code %d.\n" + " InnoDB: start: %lu len: %lu payload: %lu\n" + " InnoDB: Disabling fallocate for now.\n", errno, off, trim_len, len); + + os_file_handle_error_no_exit(slot->name, + " fallocate(FALLOC_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE) ", + FALSE, __FILE__, __LINE__); + + if (slot->write_size) { + *slot->write_size = 0; + } + + return (FALSE); + } else { + if (slot->write_size) { + *slot->write_size = len; + } + } +#else + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: [Warning] fallocate not supported on this installation." + " InnoDB: Disabling fallocate for now."); + os_fallocate_failed = TRUE; + srv_use_trim = FALSE; + if (slot->write_size) { + *slot->write_size = 0; + } + +#endif /* HAVE_FALLOCATE ... 
 */ +#elif defined(_WIN32) + FILE_LEVEL_TRIM flt; + flt.Key = 0; + flt.NumRanges = 1; + flt.Ranges[0].Offset = off; + flt.Ranges[0].Length = trim_len; + + BOOL ret = DeviceIoControl(slot->file, FSCTL_FILE_LEVEL_TRIM, + &flt, sizeof(flt), NULL, NULL, NULL, NULL); + + if (!ret) { + /* After first failure do not try to trim again */ + os_fallocate_failed = TRUE; + srv_use_trim=FALSE; + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: [Warning] fallocate call failed with error.\n" + " InnoDB: start: %lu len: %lu payload: %lu\n" + " InnoDB: Disabling fallocate for now.\n", off, trim_len, len); + + os_file_handle_error_no_exit(slot->name, + " DeviceIOControl(FSCTL_FILE_LEVEL_TRIM) ", + FALSE, __FILE__, __LINE__); + + if (slot->write_size) { + *slot->write_size = 0; + } + return (FALSE); + } else { + if (slot->write_size) { + *slot->write_size = len; + } + } +#endif + + switch(bsize) { + case 512: + srv_stats.page_compression_trim_sect512.add((trim_len / bsize)); + break; + case 1024: + srv_stats.page_compression_trim_sect1024.add((trim_len / bsize)); + break; + case 2048: + srv_stats.page_compression_trim_sect2048.add((trim_len / bsize)); + break; + case 4096: + srv_stats.page_compression_trim_sect4096.add((trim_len / bsize)); + break; + case 8192: + srv_stats.page_compression_trim_sect8192.add((trim_len / bsize)); + break; + case 16384: + srv_stats.page_compression_trim_sect16384.add((trim_len / bsize)); + break; + case 32768: + srv_stats.page_compression_trim_sect32768.add((trim_len / bsize)); + break; + default: + break; + } + + srv_stats.page_compressed_trim_op.inc(); + + return (TRUE); + +} #endif /* !UNIV_HOTBACKUP */ + +/**********************************************************************//** +Allocate memory for temporal buffer used for page compression. This +buffer is freed later. 
*/ +UNIV_INTERN +void +os_slot_alloc_page_buf( +/*===================*/ + os_aio_slot_t* slot) /*!< in: slot structure */ +{ + byte* cbuf2; + byte* cbuf; + + ut_a(slot != NULL); + /* We allocate extra to avoid memory overwrite on compression */ + cbuf2 = static_cast<byte *>(ut_malloc(UNIV_PAGE_SIZE*2)); + cbuf = static_cast<byte *>(ut_align(cbuf2, UNIV_PAGE_SIZE)); + slot->page_compression_page = static_cast<byte *>(cbuf2); + slot->page_buf = static_cast<byte *>(cbuf); + ut_a(slot->page_buf != NULL); +} + +#ifdef HAVE_LZO +/**********************************************************************//** +Allocate memory for temporal memory used for page compression when +LZO compression method is used */ +UNIV_INTERN +void +os_slot_alloc_lzo_mem( +/*===================*/ + os_aio_slot_t* slot) /*!< in: slot structure */ +{ + ut_a(slot != NULL); + slot->lzo_mem = static_cast<byte *>(ut_malloc(LZO1X_1_15_MEM_COMPRESS)); + ut_a(slot->lzo_mem != NULL); +} +#endif + +/***********************************************************************//** +Try to get number of bytes per sector from file system. 
+@return file block size */ +UNIV_INTERN +ulint +os_file_get_block_size( +/*===================*/ + os_file_t file, /*!< in: handle to a file */ + const char* name) /*!< in: file name */ +{ + ulint fblock_size = 512; + +#if defined(UNIV_LINUX) && defined(HAVE_SYS_STATVFS_H) + struct statvfs fstat; + int err; + + err = fstatvfs(file, &fstat); + + if (err != 0) { + fprintf(stderr, "InnoDB: Warning: fstatvfs() failed on file %s\n", name); + os_file_handle_error_no_exit(name, "fstatvfs()", FALSE, __FILE__, __LINE__); + } else { + fblock_size = fstat.f_bsize; + } +#endif /* UNIV_LINUX */ +#ifdef __WIN__ + { + DWORD SectorsPerCluster = 0; + DWORD BytesPerSector = 0; + DWORD NumberOfFreeClusters = 0; + DWORD TotalNumberOfClusters = 0; + + /* + if (GetFreeSpace((LPCTSTR)name, &SectorsPerCluster, &BytesPerSector, &NumberOfFreeClusters, &TotalNumberOfClusters)) { + fblock_size = BytesPerSector; + } else { + fprintf(stderr, "InnoDB: Warning: GetFreeSpace() failed on file %s\n", name); + os_file_handle_error_no_exit(name, "GetFreeSpace()", FALSE, __FILE__, __LINE__); + } + */ + } +#endif /* __WIN__*/ + + if (fblock_size > UNIV_PAGE_SIZE/2 || fblock_size < 512) { + fprintf(stderr, "InnoDB: Note: File system for file %s has " + "file block size %lu not supported for page_size %lu\n", + name, fblock_size, UNIV_PAGE_SIZE); + + if (fblock_size < 512) { + fblock_size = 512; + } else { + fblock_size = UNIV_PAGE_SIZE/2; + } + + fprintf(stderr, "InnoDB: Note: Using file block size %ld for file %s\n", + fblock_size, name); + } + + return fblock_size; +} diff --git a/storage/innobase/page/page0cur.cc b/storage/innobase/page/page0cur.cc index f5f7e1299ce..97405261392 100644 --- a/storage/innobase/page/page0cur.cc +++ b/storage/innobase/page/page0cur.cc @@ -1349,6 +1349,21 @@ page_cur_insert_rec_zip( return(insert_rec); } + /* Page compress failed. If this happened on a + leaf page, put the data size into the sample + buffer. 
*/ + if (page_is_leaf(page)) { + ulint occupied = page_get_data_size(page) + + page_dir_calc_reserved_space( + page_get_n_recs(page)); + index->stat_defrag_data_size_sample[ + index->stat_defrag_sample_next_slot] = + occupied; + index->stat_defrag_sample_next_slot = + (index->stat_defrag_sample_next_slot + + 1) % STAT_DEFRAG_DATA_SIZE_N_SAMPLE; + } + ut_ad(cursor->rec == (pos > 1 ? page_rec_get_nth( diff --git a/storage/innobase/page/page0zip.cc b/storage/innobase/page/page0zip.cc index 6989953cb0c..b0cb9407e72 100644 --- a/storage/innobase/page/page0zip.cc +++ b/storage/innobase/page/page0zip.cc @@ -76,7 +76,7 @@ UNIV_INTERN uint page_zip_level = DEFAULT_COMPRESSION_LEVEL; /* Whether or not to log compressed page images to avoid possible compression algorithm changes in zlib. */ -UNIV_INTERN my_bool page_zip_log_pages = true; +UNIV_INTERN my_bool page_zip_log_pages = false; /* Please refer to ../include/page0zip.ic for a description of the compressed page format. */ @@ -658,7 +658,7 @@ page_zip_dir_encode( #if PAGE_ZIP_DIR_SLOT_MASK & (PAGE_ZIP_DIR_SLOT_MASK + 1) # error "PAGE_ZIP_DIR_SLOT_MASK is not 1 less than a power of 2" #endif -#if PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE_MAX - 1 +#if PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE_DEF - 1 # error "PAGE_ZIP_DIR_SLOT_MASK < UNIV_PAGE_SIZE_MAX - 1" #endif if (UNIV_UNLIKELY(rec_get_n_owned_new(rec))) { diff --git a/storage/innobase/pars/pars0opt.cc b/storage/innobase/pars/pars0opt.cc index cbed2b39eeb..5a7e1861d74 100644 --- a/storage/innobase/pars/pars0opt.cc +++ b/storage/innobase/pars/pars0opt.cc @@ -948,12 +948,14 @@ opt_find_all_cols( /* Fill in the field_no fields in sym_node */ sym_node->field_nos[SYM_CLUST_FIELD_NO] = dict_index_get_nth_col_pos( - dict_table_get_first_index(index->table), sym_node->col_no); + dict_table_get_first_index(index->table), sym_node->col_no, + NULL); if (!dict_index_is_clust(index)) { ut_a(plan); - col_pos = dict_index_get_nth_col_pos(index, sym_node->col_no); + col_pos = 
dict_index_get_nth_col_pos(index, sym_node->col_no, + NULL); if (col_pos == ULINT_UNDEFINED) { diff --git a/storage/innobase/pars/pars0pars.cc b/storage/innobase/pars/pars0pars.cc index 655e5ba1324..c87e1f8e247 100644 --- a/storage/innobase/pars/pars0pars.cc +++ b/storage/innobase/pars/pars0pars.cc @@ -1232,7 +1232,8 @@ pars_process_assign_list( col_sym = assign_node->col; upd_field_set_field_no(upd_field, dict_index_get_nth_col_pos( - clust_index, col_sym->col_no), + clust_index, col_sym->col_no, + NULL), clust_index, NULL); upd_field->exp = assign_node->val; diff --git a/storage/innobase/rem/rem0rec.cc b/storage/innobase/rem/rem0rec.cc index 0d7b7c16785..3ff71d5c59e 100644 --- a/storage/innobase/rem/rem0rec.cc +++ b/storage/innobase/rem/rem0rec.cc @@ -33,6 +33,9 @@ Created 5/30/1994 Heikki Tuuri #include "mtr0mtr.h" #include "mtr0log.h" #include "fts0fts.h" +#ifdef WITH_WSREP +#include <ha_prototypes.h> +#endif /* WITH_WSREP */ /* PHYSICAL RECORD (OLD STYLE) =========================== @@ -1961,3 +1964,134 @@ rec_get_trx_id( } # endif /* UNIV_DEBUG */ #endif /* !UNIV_HOTBACKUP */ + +#ifdef WITH_WSREP +int +wsrep_rec_get_foreign_key( + byte *buf, /* out: extracted key */ + ulint *buf_len, /* in/out: length of buf */ + const rec_t* rec, /* in: physical record */ + dict_index_t* index_for, /* in: index in foreign table */ + dict_index_t* index_ref, /* in: index in referenced table */ + ibool new_protocol) /* in: protocol > 1 */ +{ + const byte* data; + ulint len; + ulint key_len = 0; + ulint i; + uint key_parts; + mem_heap_t* heap = NULL; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + const ulint* offsets; + + ut_ad(index_for); + ut_ad(index_ref); + + rec_offs_init(offsets_); + offsets = rec_get_offsets(rec, index_for, offsets_, + ULINT_UNDEFINED, &heap); + + ut_ad(rec_offs_validate(rec, NULL, offsets)); + + ut_ad(rec); + + key_parts = dict_index_get_n_unique_in_tree(index_for); + for (i = 0; + i < key_parts && + (index_for->type & DICT_CLUSTERED || i < key_parts - 1); + 
i++) { + dict_field_t* field_f = + dict_index_get_nth_field(index_for, i); + const dict_col_t* col_f = dict_field_get_col(field_f); + dict_field_t* field_r = + dict_index_get_nth_field(index_ref, i); + const dict_col_t* col_r = dict_field_get_col(field_r); + + data = rec_get_nth_field(rec, offsets, i, &len); + if (key_len + ((len != UNIV_SQL_NULL) ? len + 1 : 1) > + *buf_len) { + fprintf (stderr, + "WSREP: FK key len exceeded %lu %lu %lu\n", + key_len, len, *buf_len); + goto err_out; + } + + if (len == UNIV_SQL_NULL) { + ut_a(!(col_f->prtype & DATA_NOT_NULL)); + *buf++ = 1; + key_len++; + } else if (!new_protocol) { + if (!(col_r->prtype & DATA_NOT_NULL)) { + *buf++ = 0; + key_len++; + } + memcpy(buf, data, len); + *buf_len = wsrep_innobase_mysql_sort( + (int)(col_f->prtype & DATA_MYSQL_TYPE_MASK), + (uint)dtype_get_charset_coll(col_f->prtype), + buf, len, *buf_len); + } else { /* new protocol */ + if (!(col_r->prtype & DATA_NOT_NULL)) { + *buf++ = 0; + key_len++; + } + switch (col_f->mtype) { + case DATA_INT: { + byte* ptr = buf+len; + for (;;) { + ptr--; + *ptr = *data; + if (ptr == buf) { + break; + } + data++; + } + + if (!(col_f->prtype & DATA_UNSIGNED)) { + buf[len-1] = (byte) (buf[len-1] ^ 128); + } + + break; + } + case DATA_VARCHAR: + case DATA_VARMYSQL: + case DATA_CHAR: + case DATA_MYSQL: + /* Copy the actual data */ + ut_memcpy(buf, data, len); + len = wsrep_innobase_mysql_sort( + (int) + (col_f->prtype & DATA_MYSQL_TYPE_MASK), + (uint) + dtype_get_charset_coll(col_f->prtype), + buf, len, *buf_len); + break; + case DATA_BLOB: + case DATA_BINARY: + memcpy(buf, data, len); + break; + default: + break; + } + + key_len += len; + buf += len; + } + } + + rec_validate(rec, offsets); + + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + + *buf_len = key_len; + return DB_SUCCESS; + + err_out: + if (UNIV_LIKELY_NULL(heap)) { + mem_heap_free(heap); + } + return DB_ERROR; +} +#endif // WITH_WSREP diff --git a/storage/innobase/row/row0ftsort.cc 
b/storage/innobase/row/row0ftsort.cc index b11a9f0d85a..eb032246416 100644 --- a/storage/innobase/row/row0ftsort.cc +++ b/storage/innobase/row/row0ftsort.cc @@ -848,7 +848,7 @@ exit: error = row_merge_sort(psort_info->psort_common->trx, psort_info->psort_common->dup, - merge_file[i], block[i], &tmpfd[i]); + merge_file[i], block[i], &tmpfd[i], false, 0.0/* pct_progress */, 0.0/* pct_cost */); if (error != DB_SUCCESS) { close(tmpfd[i]); goto func_exit; @@ -1409,8 +1409,9 @@ row_fts_merge_insert( fd[i] = psort_info[i].merge_file[id]->fd; foffs[i] = 0; - buf[i] = static_cast<unsigned char (*)[16384]>( + buf[i] = static_cast<unsigned char (*)[65536]>( mem_heap_alloc(heap, sizeof *buf[i])); + count_diag += (int) psort_info[i].merge_file[id]->n_rec; } diff --git a/storage/innobase/row/row0ins.cc b/storage/innobase/row/row0ins.cc index ac9ca7b44eb..44c9ac32d16 100644 --- a/storage/innobase/row/row0ins.cc +++ b/storage/innobase/row/row0ins.cc @@ -920,6 +920,14 @@ row_ins_invalidate_query_cache( innobase_invalidate_query_cache(thr_get_trx(thr), buf, len); mem_free(buf); } +#ifdef WITH_WSREP +dberr_t wsrep_append_foreign_key(trx_t *trx, + dict_foreign_t* foreign, + const rec_t* clust_rec, + dict_index_t* clust_index, + ibool referenced, + ibool shared); +#endif /* WITH_WSREP */ /*********************************************************************//** Perform referential actions or checks when a parent row is deleted or updated @@ -1271,7 +1279,19 @@ row_ins_foreign_check_on_constraint( cascade->state = UPD_NODE_UPDATE_CLUSTERED; - err = row_update_cascade_for_mysql(thr, cascade, +#ifdef WITH_WSREP + err = wsrep_append_foreign_key( + thr_get_trx(thr), + foreign, + clust_rec, + clust_index, + FALSE, FALSE); + if (err != DB_SUCCESS) { + fprintf(stderr, + "WSREP: foreign key append failed: %d\n", err); + } else +#endif /* WITH_WSREP */ + err = row_update_cascade_for_mysql(thr, cascade, foreign->foreign_table); if (foreign->foreign_table->n_foreign_key_checks_running == 0) { @@ 
-1603,7 +1623,14 @@ run_again: if (check_ref) { err = DB_SUCCESS; - +#ifdef WITH_WSREP + err = wsrep_append_foreign_key( + thr_get_trx(thr), + foreign, + rec, + check_index, + check_ref, TRUE); +#endif /* WITH_WSREP */ goto end_scan; } else if (foreign->type != 0) { /* There is an ON UPDATE or ON DELETE diff --git a/storage/innobase/row/row0log.cc b/storage/innobase/row/row0log.cc index fd0c54d889b..caed087b439 100644 --- a/storage/innobase/row/row0log.cc +++ b/storage/innobase/row/row0log.cc @@ -40,6 +40,10 @@ Created 2011-05-26 Marko Makela #include<map> +ulint onlineddl_rowlog_rows; +ulint onlineddl_rowlog_pct_used; +ulint onlineddl_pct_progress; + /** Table row modification operations during online table rebuild. Delete-marked records are not copied to the rebuilt table. */ enum row_tab_op { @@ -470,6 +474,10 @@ write_failed: log->tail.total += size; UNIV_MEM_INVALID(log->tail.buf, sizeof log->tail.buf); mutex_exit(&log->mutex); + + os_atomic_increment_ulint(&onlineddl_rowlog_rows, 1); + /* 10000 means 100.00%, 4525 means 45.25% */ + onlineddl_rowlog_pct_used = (log->tail.total * 10000) / srv_online_max_size; } #ifdef UNIV_DEBUG @@ -2546,7 +2554,7 @@ all_done: success = os_file_read_no_error_handling( OS_FILE_FROM_FD(index->online_log->fd), index->online_log->head.block, ofs, - srv_sort_buf_size); + srv_sort_buf_size, FALSE); if (!success) { fprintf(stderr, "InnoDB: unable to read temporary file" @@ -3377,7 +3385,7 @@ all_done: success = os_file_read_no_error_handling( OS_FILE_FROM_FD(index->online_log->fd), index->online_log->head.block, ofs, - srv_sort_buf_size); + srv_sort_buf_size, FALSE); if (!success) { fprintf(stderr, "InnoDB: unable to read temporary file" diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc index e9d8bd50d6a..c79bd6c62ec 100644 --- a/storage/innobase/row/row0merge.cc +++ b/storage/innobase/row/row0merge.cc @@ -23,6 +23,8 @@ New index creation routines using a merge sort Created 12/4/2005 Jan Lindstrom 
Completed by Sunny Bains and Marko Makela *******************************************************/ +#include <my_config.h> +#include <log.h> #include "row0merge.h" #include "row0ext.h" @@ -38,6 +40,13 @@ Completed by Sunny Bains and Marko Makela #include "row0import.h" #include "handler0alter.h" #include "ha_prototypes.h" +#include "math.h" /* log() */ + +float my_log2f(float n) +{ + /* log(n) / log(2) is log2. */ + return (float)(log((double)n) / log((double)2)); +} /* Ignore posix_fadvise() on those platforms where it does not exist */ #if defined __WIN__ @@ -777,7 +786,8 @@ row_merge_read( #endif /* UNIV_DEBUG */ success = os_file_read_no_error_handling(OS_FILE_FROM_FD(fd), buf, - ofs, srv_sort_buf_size); + ofs, srv_sort_buf_size, FALSE); + #ifdef POSIX_FADV_DONTNEED /* Each block is read exactly once. Free up the file cache. */ posix_fadvise(fd, ofs, srv_sort_buf_size, POSIX_FADV_DONTNEED); @@ -1188,7 +1198,8 @@ row_merge_read_clustered_index( AUTO_INCREMENT column, or ULINT_UNDEFINED if none is added */ ib_sequence_t& sequence,/*!< in/out: autoinc sequence */ - row_merge_block_t* block) /*!< in/out: file buffer */ + row_merge_block_t* block, /*!< in/out: file buffer */ + float pct_cost) /*!< in: percent of task weight out of total alter job */ { dict_index_t* clust_index; /* Clustered index */ mem_heap_t* row_heap; /* Heap memory to create @@ -1208,11 +1219,21 @@ row_merge_read_clustered_index( os_event_t fts_parallel_sort_event = NULL; ibool fts_pll_sort = FALSE; ib_int64_t sig_count = 0; + + float curr_progress; + ib_int64_t read_rows = 0; + ib_int64_t table_total_rows; DBUG_ENTER("row_merge_read_clustered_index"); ut_ad((old_table == new_table) == !col_map); ut_ad(!add_cols || col_map); + table_total_rows = dict_table_get_n_rows(old_table); + if(table_total_rows == 0) { + /* We don't know total row count */ + table_total_rows = 1; + } + trx->op_info = "reading clustered index"; #ifdef FTS_INTERNAL_DIAG_PRINT @@ -1710,6 +1731,17 @@ write_buffers: } 
mem_heap_empty(row_heap); + + /* Increment innodb_onlineddl_pct_progress status variable */ + read_rows++; + if(read_rows % 1000 == 0) { + /* Update progress for each 1000 rows */ + curr_progress = (read_rows >= table_total_rows) ? + pct_cost : + ((pct_cost * read_rows) / table_total_rows); + /* presenting 10.12% as 1012 integer */ + onlineddl_pct_progress = curr_progress * 100; + } } func_exit: @@ -2099,6 +2131,7 @@ row_merge( /* Copy the last blocks, if there are any. */ while (foffs0 < ihalf) { + if (UNIV_UNLIKELY(trx_is_interrupted(trx))) { return(DB_INTERRUPTED); } @@ -2115,6 +2148,7 @@ row_merge( ut_ad(foffs0 == ihalf); while (foffs1 < file->offset) { + if (trx_is_interrupted(trx)) { return(DB_INTERRUPTED); } @@ -2170,17 +2204,37 @@ row_merge_sort( merge_file_t* file, /*!< in/out: file containing index entries */ row_merge_block_t* block, /*!< in/out: 3 buffers */ - int* tmpfd) /*!< in/out: temporary file handle */ + int* tmpfd, /*!< in/out: temporary file handle + */ + const bool update_progress, + /*!< in: update progress + status variable or not */ + const float pct_progress, + /*!< in: total progress percent + until now */ + const float pct_cost) /*!< in: current progress percent */ { const ulint half = file->offset / 2; ulint num_runs; + ulint cur_run = 0; ulint* run_offset; dberr_t error = DB_SUCCESS; + ulint merge_count = 0; + ulint total_merge_sort_count; + float curr_progress = 0; + DBUG_ENTER("row_merge_sort"); /* Record the number of merge runs we need to perform */ num_runs = file->offset; + /* Find the number N which 2^N is greater or equal than num_runs */ + /* N is merge sort running count */ + total_merge_sort_count = ceil(my_log2f(num_runs)); + if(total_merge_sort_count <= 0) { + total_merge_sort_count=1; + } + /* If num_runs are less than 1, nothing to merge */ if (num_runs <= 1) { DBUG_RETURN(error); @@ -2197,11 +2251,30 @@ row_merge_sort( of file marker). Thus, it must be at least one block. 
*/ ut_ad(file->offset > 0); + thd_progress_init(trx->mysql_thd, num_runs); + sql_print_information("InnoDB: Online DDL : merge-sorting has estimated %lu runs", num_runs); + /* Merge the runs until we have one big run */ do { + cur_run++; + + /* Report progress of merge sort to MySQL for + show processlist progress field */ + thd_progress_report(trx->mysql_thd, cur_run, num_runs); + sql_print_information("InnoDB: Online DDL : merge-sorting current run %lu estimated %lu runs", cur_run, num_runs); + error = row_merge(trx, dup, file, block, tmpfd, &num_runs, run_offset); + if(update_progress) { + merge_count++; + curr_progress = (merge_count >= total_merge_sort_count) ? + pct_cost : + ((pct_cost * merge_count) / total_merge_sort_count); + /* presenting 10.12% as 1012 integer */; + onlineddl_pct_progress = (pct_progress + curr_progress) * 100; + } + if (error != DB_SUCCESS) { break; } @@ -2211,6 +2284,8 @@ row_merge_sort( mem_free(run_offset); + thd_progress_end(trx->mysql_thd); + DBUG_RETURN(error); } @@ -2269,7 +2344,10 @@ row_merge_insert_index_tuples( dict_index_t* index, /*!< in: index */ const dict_table_t* old_table,/*!< in: old table */ int fd, /*!< in: file descriptor */ - row_merge_block_t* block) /*!< in/out: file buffer */ + row_merge_block_t* block, /*!< in/out: file buffer */ + const ib_int64_t table_total_rows, /*!< in: total rows of old table */ + const float pct_progress, /*!< in: total progress percent until now */ + const float pct_cost) /*!< in: current progress percent */ { const byte* b; mem_heap_t* heap; @@ -2279,6 +2357,8 @@ row_merge_insert_index_tuples( ulint foffs = 0; ulint* offsets; mrec_buf_t* buf; + ib_int64_t inserted_rows = 0; + float curr_progress; DBUG_ENTER("row_merge_insert_index_tuples"); ut_ad(!srv_read_only_mode); @@ -2455,6 +2535,19 @@ row_merge_insert_index_tuples( mem_heap_empty(tuple_heap); mem_heap_empty(ins_heap); + + /* Increment innodb_onlineddl_pct_progress status variable */ + inserted_rows++; + if(inserted_rows % 1000 
== 0) { + /* Update progress for each 1000 rows */ + curr_progress = (inserted_rows >= table_total_rows || + table_total_rows <= 0) ? + pct_cost : + ((pct_cost * inserted_rows) / table_total_rows); + + /* presenting 10.12% as 1012 integer */; + onlineddl_pct_progress = (pct_progress + curr_progress) * 100; + } } } @@ -3450,6 +3543,13 @@ row_merge_build_indexes( fts_psort_t* merge_info = NULL; ib_int64_t sig_count = 0; bool fts_psort_initiated = false; + + float total_static_cost = 0; + float total_dynamic_cost = 0; + uint total_index_blocks = 0; + float pct_cost=0; + float pct_progress=0; + DBUG_ENTER("row_merge_build_indexes"); ut_ad(!srv_read_only_mode); @@ -3480,6 +3580,9 @@ row_merge_build_indexes( merge_files[i].fd = -1; } + total_static_cost = COST_BUILD_INDEX_STATIC * n_indexes + COST_READ_CLUSTERED_INDEX; + total_dynamic_cost = COST_BUILD_INDEX_DYNAMIC * n_indexes; + for (i = 0; i < n_indexes; i++) { if (row_merge_file_create(&merge_files[i]) < 0) { error = DB_OUT_OF_MEMORY; @@ -3524,6 +3627,12 @@ row_merge_build_indexes( duplicate keys. 
*/ innobase_rec_reset(table); + sql_print_information("InnoDB: Online DDL : Start"); + sql_print_information("InnoDB: Online DDL : Start reading clustered " + "index of the table and create temporary files"); + + pct_cost = COST_READ_CLUSTERED_INDEX * 100 / (total_static_cost + total_dynamic_cost); + /* Read clustered index of the table and create files for secondary index entries for merge sort */ @@ -3531,10 +3640,18 @@ row_merge_build_indexes( trx, table, old_table, new_table, online, indexes, fts_sort_idx, psort_info, merge_files, key_numbers, n_indexes, add_cols, col_map, - add_autoinc, sequence, block); + add_autoinc, sequence, block, pct_cost); - if (error != DB_SUCCESS) { + pct_progress += pct_cost; + + sql_print_information("InnoDB: Online DDL : End of reading " + "clustered index of the table and create temporary files"); + for (i = 0; i < n_indexes; i++) { + total_index_blocks += merge_files[i].offset; + } + + if (error != DB_SUCCESS) { goto func_exit; } @@ -3616,14 +3733,47 @@ wait_again: row_merge_dup_t dup = { sort_idx, table, col_map, 0}; + pct_cost = (COST_BUILD_INDEX_STATIC + + (total_dynamic_cost * merge_files[i].offset / + total_index_blocks)) / + (total_static_cost + total_dynamic_cost) + * PCT_COST_MERGESORT_INDEX * 100; + + sql_print_information("InnoDB: Online DDL : Start merge-sorting" + " index %s (%lu / %lu), estimated cost : %2.4f", + indexes[i]->name, (i+1), n_indexes, pct_cost); + error = row_merge_sort( trx, &dup, &merge_files[i], - block, &tmpfd); + block, &tmpfd, true, pct_progress, pct_cost); + + pct_progress += pct_cost; + + sql_print_information("InnoDB: Online DDL : End of " + " merge-sorting index %s (%lu / %lu)", + indexes[i]->name, (i+1), n_indexes); if (error == DB_SUCCESS) { + pct_cost = (COST_BUILD_INDEX_STATIC + + (total_dynamic_cost * merge_files[i].offset / + total_index_blocks)) / + (total_static_cost + total_dynamic_cost) * + PCT_COST_INSERT_INDEX * 100; + + sql_print_information("InnoDB: Online DDL : Start " + 
"building index %s (%lu / %lu), estimated " + "cost : %2.4f", indexes[i]->name, (i+1), + n_indexes, pct_cost); + error = row_merge_insert_index_tuples( trx->id, sort_idx, old_table, - merge_files[i].fd, block); + merge_files[i].fd, block, + merge_files[i].n_rec, pct_progress, pct_cost); + pct_progress += pct_cost; + + sql_print_information("InnoDB: Online DDL : " + "End of building index %s (%lu / %lu)", + indexes[i]->name, (i+1), n_indexes); } } @@ -3640,11 +3790,15 @@ wait_again: ut_ad(sort_idx->online_status == ONLINE_INDEX_COMPLETE); } else { + sql_print_information("InnoDB: Online DDL : Start applying row log"); DEBUG_SYNC_C("row_log_apply_before"); error = row_log_apply(trx, sort_idx, table); DEBUG_SYNC_C("row_log_apply_after"); + sql_print_information("InnoDB: Online DDL : End of applying row log"); } + sql_print_information("InnoDB: Online DDL : Completed"); + if (error != DB_SUCCESS) { trx->error_key_num = key_numbers[i]; goto func_exit; diff --git a/storage/innobase/row/row0mysql.cc b/storage/innobase/row/row0mysql.cc index 1138aa410cc..86248b87c66 100644 --- a/storage/innobase/row/row0mysql.cc +++ b/storage/innobase/row/row0mysql.cc @@ -55,6 +55,7 @@ Created 9/17/2000 Heikki Tuuri #include "rem0cmp.h" #include "log0log.h" #include "btr0sea.h" +#include "btr0defragment.h" #include "fil0fil.h" #include "ibuf0ibuf.h" #include "fts0fts.h" @@ -3931,6 +3932,8 @@ row_drop_table_for_mysql( if (!dict_table_is_temporary(table)) { dict_stats_recalc_pool_del(table); + dict_stats_defrag_pool_del(table, NULL); + btr_defragment_remove_table(table); /* Remove stats for this table and all of its indexes from the persistent storage if it exists and if there are stats for this @@ -5219,18 +5222,6 @@ end: trx->error_state = DB_SUCCESS; trx_rollback_to_savepoint(trx, NULL); trx->error_state = DB_SUCCESS; - } else { - if (old_is_tmp && !new_is_tmp) { - /* After ALTER TABLE the table statistics - needs to be rebuilt. 
Even if we close - table below there could be other - transactions using this table (e.g. - SELECT * FROM INFORMATION_SCHEMA.`TABLE_CONSTRAINTS`), - thus we can't remove table from dictionary cache - here. Therefore, we initialize the - transient statistics here. */ - dict_stats_update_transient(table); - } } } diff --git a/storage/innobase/row/row0sel.cc b/storage/innobase/row/row0sel.cc index 69c8498839e..b0e0c89b778 100644 --- a/storage/innobase/row/row0sel.cc +++ b/storage/innobase/row/row0sel.cc @@ -56,6 +56,7 @@ Created 12/19/1997 Heikki Tuuri #include "row0mysql.h" #include "read0read.h" #include "buf0lru.h" +#include "srv0srv.h" #include "ha_prototypes.h" #include "m_string.h" /* for my_sys.h */ #include "my_sys.h" /* DEBUG_SYNC_C */ @@ -2933,9 +2934,14 @@ row_sel_store_mysql_rec( : templ->rec_field_no; /* We should never deliver column prefixes to MySQL, except for evaluating innobase_index_cond(). */ + /* ...actually, we do want to do this in order to + support the prefix query optimization. + ut_ad(dict_index_get_nth_field(index, field_no)->prefix_len == 0); + ...so we disable this assert. */ + if (!row_sel_store_mysql_field(mysql_rec, prebuilt, rec, index, offsets, field_no, templ)) { @@ -3028,6 +3034,8 @@ row_sel_get_clust_rec_for_mysql( dberr_t err; trx_t* trx; + srv_stats.n_sec_rec_cluster_reads.inc(); + *out_rec = NULL; trx = thr_get_trx(thr); @@ -3683,6 +3691,7 @@ row_search_for_mysql( ulint* offsets = offsets_; ibool table_lock_waited = FALSE; byte* next_buf = 0; + ibool use_clustered_index = FALSE; rec_offs_init(offsets_); @@ -4706,10 +4715,68 @@ locks_ok: } /* Get the clustered index record if needed, if we did not do the - search using the clustered index. */ - - if (index != clust_index && prebuilt->need_to_access_clustered) { + search using the clustered index... 
*/ + + use_clustered_index = + (index != clust_index && prebuilt->need_to_access_clustered); + + if (use_clustered_index && srv_prefix_index_cluster_optimization + && prebuilt->n_template <= index->n_fields) { + /* ...but, perhaps avoid the clustered index lookup if + all of the following are true: + 1) all columns are in the secondary index + 2) all values for columns that are prefix-only + indexes are shorter than the prefix size + This optimization can avoid many IOs for certain schemas. + */ + ibool row_contains_all_values = TRUE; + int i; + for (i = 0; i < prebuilt->n_template; i++) { + /* Condition (1) from above: is the field in the + index (prefix or not)? */ + mysql_row_templ_t* templ = + prebuilt->mysql_template + i; + ulint secondary_index_field_no = + templ->rec_prefix_field_no; + if (secondary_index_field_no == ULINT_UNDEFINED) { + row_contains_all_values = FALSE; + break; + } + /* Condition (2) from above: if this is a + prefix, is this row's value size shorter + than the prefix? */ + if (templ->rec_field_is_prefix) { + ulint record_size = rec_offs_nth_size( + offsets, + secondary_index_field_no); + const dict_field_t *field = + dict_index_get_nth_field( + index, + secondary_index_field_no); + ut_a(field->prefix_len > 0); + if (record_size >= field->prefix_len) { + row_contains_all_values = FALSE; + break; + } + } + } + /* If (1) and (2) were true for all columns above, use + rec_prefix_field_no instead of rec_field_no, and skip + the clustered lookup below. 
*/ + if (row_contains_all_values) { + for (i = 0; i < prebuilt->n_template; i++) { + mysql_row_templ_t* templ = + prebuilt->mysql_template + i; + templ->rec_field_no = + templ->rec_prefix_field_no; + ut_a(templ->rec_field_no != ULINT_UNDEFINED); + } + use_clustered_index = FALSE; + srv_stats.n_sec_rec_cluster_reads_avoided.inc(); + } + } + if (use_clustered_index) { requires_clust_rec: ut_ad(index != clust_index); /* We use a 'goto' to the preceding label if a consistent diff --git a/storage/innobase/row/row0upd.cc b/storage/innobase/row/row0upd.cc index a8c2eaa6683..0ea4865d15f 100644 --- a/storage/innobase/row/row0upd.cc +++ b/storage/innobase/row/row0upd.cc @@ -53,6 +53,9 @@ Created 12/27/1996 Heikki Tuuri #include "buf0lru.h" #include <algorithm> +#include <mysql/plugin.h> +#include <mysql/service_wsrep.h> + /* What kind of latch and lock can we assume when the control comes to ------------------------------------------------------------------- an update node? @@ -162,6 +165,52 @@ row_upd_index_is_referenced( return(is_referenced); } +#ifdef WITH_WSREP +static +ibool +wsrep_row_upd_index_is_foreign( +/*========================*/ + dict_index_t* index, /*!< in: index */ + trx_t* trx) /*!< in: transaction */ +{ + dict_table_t* table = index->table; + dict_foreign_t* foreign; + ibool froze_data_dict = FALSE; + ibool is_referenced = FALSE; + + if (table->foreign_set.empty()) { + + return(FALSE); + } + + if (trx->dict_operation_lock_mode == 0) { + row_mysql_freeze_data_dictionary(trx); + froze_data_dict = TRUE; + } + + for (dict_foreign_set::iterator it= table->foreign_set.begin(); + it != table->foreign_set.end(); + ++ it) + { + foreign= *it; + + if (foreign->foreign_index == index) { + + is_referenced = TRUE; + goto func_exit; + } + + } + +func_exit: + if (froze_data_dict) { + row_mysql_unfreeze_data_dictionary(trx); + } + + return(is_referenced); +} +#endif /* WITH_WSREP */ + /*********************************************************************//** Checks if 
possible foreign key constraints hold after a delete of the record under pcur. @@ -281,7 +330,125 @@ run_again: } err = DB_SUCCESS; +func_exit: + if (got_s_lock) { + row_mysql_unfreeze_data_dictionary(trx); + } + + mem_heap_free(heap); + + return(err); +} +#ifdef WITH_WSREP +static +dberr_t +wsrep_row_upd_check_foreign_constraints( +/*=================================*/ + upd_node_t* node, /*!< in: row update node */ + btr_pcur_t* pcur, /*!< in: cursor positioned on a record; NOTE: the + cursor position is lost in this function! */ + dict_table_t* table, /*!< in: table in question */ + dict_index_t* index, /*!< in: index of the cursor */ + ulint* offsets,/*!< in/out: rec_get_offsets(pcur.rec, index) */ + que_thr_t* thr, /*!< in: query thread */ + mtr_t* mtr) /*!< in: mtr */ +{ + dict_foreign_t* foreign; + mem_heap_t* heap; + dtuple_t* entry; + trx_t* trx; + const rec_t* rec; + ulint n_ext; + dberr_t err; + ibool got_s_lock = FALSE; + ibool opened = FALSE; + + if (table->foreign_set.empty()) { + + return(DB_SUCCESS); + } + trx = thr_get_trx(thr); + + /* TODO: make native slave thread bail out here */ + + rec = btr_pcur_get_rec(pcur); + ut_ad(rec_offs_validate(rec, index, offsets)); + + heap = mem_heap_create(500); + + entry = row_rec_to_index_entry(rec, index, offsets, + &n_ext, heap); + + mtr_commit(mtr); + + mtr_start(mtr); + + if (trx->dict_operation_lock_mode == 0) { + got_s_lock = TRUE; + + row_mysql_freeze_data_dictionary(trx); + } + + for (dict_foreign_set::iterator it= table->foreign_set.begin(); + it != table->foreign_set.end(); + ++ it) + { + foreign= *it; + + /* Note that we may have an update which updates the index + record, but does NOT update the first fields which are + referenced in a foreign key constraint. Then the update does + NOT break the constraint. 
*/ + + if (foreign->foreign_index == index + && (node->is_delete + || row_upd_changes_first_fields_binary( + entry, index, node->update, + foreign->n_fields))) { + + if (foreign->referenced_table == NULL) { + foreign->referenced_table = + dict_table_open_on_name( + foreign->referenced_table_name_lookup, + FALSE, FALSE, DICT_ERR_IGNORE_NONE); + opened = TRUE; + } + + if (foreign->referenced_table) { + os_inc_counter(dict_sys->mutex, + foreign->referenced_table + ->n_foreign_key_checks_running); + } + + /* NOTE that if the thread ends up waiting for a lock + we will release dict_operation_lock temporarily! + But the counter on the table protects 'foreign' from + being dropped while the check is running. */ + + err = row_ins_check_foreign_constraint( + TRUE, foreign, table, entry, thr); + + if (foreign->referenced_table) { + os_dec_counter(dict_sys->mutex, + foreign->referenced_table + ->n_foreign_key_checks_running); + + if (opened == TRUE) { + dict_table_close(foreign->referenced_table, TRUE, FALSE); + opened = FALSE; + } + } + + if (err != DB_SUCCESS) { + + goto func_exit; + } + } + + } + + err = DB_SUCCESS; func_exit: if (got_s_lock) { row_mysql_unfreeze_data_dictionary(trx); @@ -293,6 +460,7 @@ func_exit: return(err); } +#endif /* WITH_WSREP */ /*********************************************************************//** Creates an update node for a query graph. 
@@ -1667,6 +1835,9 @@ row_upd_sec_index_entry( index = node->index; referenced = row_upd_index_is_referenced(index, trx); +#ifdef WITH_WSREP + ibool foreign = wsrep_row_upd_index_is_foreign(index, trx); +#endif /* WITH_WSREP */ heap = mem_heap_create(1024); @@ -1794,6 +1965,9 @@ row_upd_sec_index_entry( row_ins_sec_index_entry() below */ if (!rec_get_deleted_flag( rec, dict_table_is_comp(index->table))) { +#ifdef WITH_WSREP + que_node_t *parent = que_node_get_parent(node); +#endif /* WITH_WSREP */ err = btr_cur_del_mark_set_sec_rec( 0, btr_cur, TRUE, thr, &mtr); @@ -1811,6 +1985,37 @@ row_upd_sec_index_entry( node, &pcur, index->table, index, offsets, thr, &mtr); } +#ifdef WITH_WSREP + if (err == DB_SUCCESS && !referenced && + !(parent && que_node_get_type(parent) == + QUE_NODE_UPDATE && + ((upd_node_t*)parent)->cascade_node == node) && + foreign + ) { + ulint* offsets = + rec_get_offsets( + rec, index, NULL, ULINT_UNDEFINED, + &heap); + err = wsrep_row_upd_check_foreign_constraints( + node, &pcur, index->table, + index, offsets, thr, &mtr); + switch (err) { + case DB_SUCCESS: + case DB_NO_REFERENCED_ROW: + err = DB_SUCCESS; + break; + case DB_DEADLOCK: + if (wsrep_debug) fprintf (stderr, + "WSREP: sec index FK check fail for deadlock"); + break; + default: + fprintf (stderr, + "WSREP: referenced FK check fail: %d", + (int)err); + break; + } + } +#endif /* WITH_WSREP */ } break; } @@ -1965,6 +2170,9 @@ row_upd_clust_rec_by_insert( que_thr_t* thr, /*!< in: query thread */ ibool referenced,/*!< in: TRUE if index may be referenced in a foreign key constraint */ +#ifdef WITH_WSREP + ibool foreign, /*!< in: TRUE if index is foreign key index */ +#endif /* WITH_WSREP */ mtr_t* mtr) /*!< in/out: mtr; gets committed here */ { mem_heap_t* heap; @@ -1978,6 +2186,9 @@ row_upd_clust_rec_by_insert( rec_t* rec; ulint* offsets = NULL; +#ifdef WITH_WSREP + que_node_t *parent = que_node_get_parent(node); +#endif /* WITH_WSREP */ ut_ad(node); ut_ad(dict_index_is_clust(index)); @@ 
-2060,6 +2271,34 @@ err_exit: goto err_exit; } } +#ifdef WITH_WSREP + if (!referenced && + !(parent && que_node_get_type(parent) == QUE_NODE_UPDATE && + ((upd_node_t*)parent)->cascade_node == node) && + foreign + ) { + err = wsrep_row_upd_check_foreign_constraints( + node, pcur, table, index, offsets, thr, mtr); + switch (err) { + case DB_SUCCESS: + case DB_NO_REFERENCED_ROW: + err = DB_SUCCESS; + break; + case DB_DEADLOCK: + if (wsrep_debug) fprintf (stderr, + "WSREP: insert FK check fail for deadlock"); + break; + default: + fprintf (stderr, + "WSREP: referenced FK check fail: %d", + (int)err); + break; + } + if (err != DB_SUCCESS) { + goto err_exit; + } + } +#endif /* WITH_WSREP */ } mtr_commit(mtr); @@ -2252,11 +2491,18 @@ row_upd_del_mark_clust_rec( ibool referenced, /*!< in: TRUE if index may be referenced in a foreign key constraint */ +#ifdef WITH_WSREP + ibool foreign,/*!< in: TRUE if index is foreign key index */ +#endif /* WITH_WSREP */ mtr_t* mtr) /*!< in: mtr; gets committed here */ { btr_pcur_t* pcur; btr_cur_t* btr_cur; dberr_t err; +#ifdef WITH_WSREP + rec_t* rec; + que_node_t *parent = que_node_get_parent(node); +#endif /* WITH_WSREP */ ut_ad(node); ut_ad(dict_index_is_clust(index)); @@ -2273,8 +2519,16 @@ row_upd_del_mark_clust_rec( /* Mark the clustered index record deleted; we do not have to check locks, because we assume that we have an x-lock on the record */ +#ifdef WITH_WSREP + rec = btr_cur_get_rec(btr_cur); +#endif /* WITH_WSREP */ + err = btr_cur_del_mark_set_clust_rec( +#ifdef WITH_WSREP + btr_cur_get_block(btr_cur), rec, +#else btr_cur_get_block(btr_cur), btr_cur_get_rec(btr_cur), +#endif /* WITH_WSREP */ index, offsets, thr, mtr); if (err == DB_SUCCESS && referenced) { /* NOTE that the following call loses the position of pcur ! 
*/ @@ -2282,6 +2536,32 @@ row_upd_del_mark_clust_rec( err = row_upd_check_references_constraints( node, pcur, index->table, index, offsets, thr, mtr); } +#ifdef WITH_WSREP + if (err == DB_SUCCESS && !referenced && + !(parent && que_node_get_type(parent) == QUE_NODE_UPDATE && + ((upd_node_t*)parent)->cascade_node == node) && + thr_get_trx(thr) && + foreign + ) { + err = wsrep_row_upd_check_foreign_constraints( + node, pcur, index->table, index, offsets, thr, mtr); + switch (err) { + case DB_SUCCESS: + case DB_NO_REFERENCED_ROW: + err = DB_SUCCESS; + break; + case DB_DEADLOCK: + if (wsrep_debug) fprintf (stderr, + "WSREP: clust rec FK check fail for deadlock"); + break; + default: + fprintf (stderr, + "WSREP: clust rec referenced FK check fail: %d", + (int)err); + break; + } + } +#endif /* WITH_WSREP */ mtr_commit(mtr); @@ -2314,6 +2594,10 @@ row_upd_clust_step( index = dict_table_get_first_index(node->table); referenced = row_upd_index_is_referenced(index, thr_get_trx(thr)); +#ifdef WITH_WSREP + ibool foreign = wsrep_row_upd_index_is_foreign( + index, thr_get_trx(thr)); +#endif /* WITH_WSREP */ pcur = node->pcur; @@ -2408,7 +2692,11 @@ row_upd_clust_step( if (node->is_delete) { err = row_upd_del_mark_clust_rec( +#ifdef WITH_WSREP + node, index, offsets, thr, referenced, foreign, &mtr); +#else node, index, offsets, thr, referenced, &mtr); +#endif /* WITH_WSREP */ if (err == DB_SUCCESS) { node->state = UPD_NODE_UPDATE_ALL_SEC; @@ -2453,7 +2741,11 @@ row_upd_clust_step( externally! 
*/ err = row_upd_clust_rec_by_insert( +#ifdef WITH_WSREP + node, index, thr, referenced, foreign, &mtr); +#else node, index, thr, referenced, &mtr); +#endif /* WITH_WSREP */ if (err != DB_SUCCESS) { diff --git a/storage/innobase/srv/srv0conc.cc b/storage/innobase/srv/srv0conc.cc index dc3c0b1dd88..8942eb20080 100644 --- a/storage/innobase/srv/srv0conc.cc +++ b/storage/innobase/srv/srv0conc.cc @@ -41,7 +41,8 @@ Created 2011/04/18 Sunny Bains #include "sync0sync.h" #include "trx0trx.h" -#include "mysql/plugin.h" +#include <mysql/plugin.h> +#include <mysql/service_wsrep.h> /** Number of times a thread is allowed to enter InnoDB within the same SQL query after it has once got the ticket. */ @@ -86,6 +87,9 @@ struct srv_conc_slot_t{ reserved may still be TRUE at that point */ srv_conc_node_t srv_conc_queue; /*!< queue node */ +#ifdef WITH_WSREP + void *thd; /*!< to see priority */ +#endif }; /** Queue of threads waiting to get in */ @@ -145,6 +149,9 @@ srv_conc_init(void) conc_slot->event = os_event_create(); ut_a(conc_slot->event); +#ifdef WITH_WSREP + conc_slot->thd = NULL; +#endif /* WITH_WSREP */ } #endif /* !HAVE_ATOMIC_BUILTINS */ } @@ -202,6 +209,16 @@ srv_conc_enter_innodb_with_atomics( for (;;) { ulint sleep_in_us; +#ifdef WITH_WSREP + if (wsrep_on(trx->mysql_thd) && + wsrep_trx_is_aborting(trx->mysql_thd)) { + if (wsrep_debug) + fprintf(stderr, + "srv_conc_enter due to MUST_ABORT"); + srv_conc_force_enter_innodb(trx); + return; + } +#endif /* WITH_WSREP */ if (srv_conc.n_active < (lint) srv_thread_concurrency) { ulint n_active; @@ -319,6 +336,9 @@ srv_conc_exit_innodb_without_atomics( slot = NULL; if (srv_conc.n_active < (lint) srv_thread_concurrency) { +#ifdef WITH_WSREP + srv_conc_slot_t* wsrep_slot; +#endif /* Look for a slot where a thread is waiting and no other thread has yet released the thread */ @@ -329,6 +349,19 @@ srv_conc_exit_innodb_without_atomics( /* No op */ } +#ifdef WITH_WSREP + /* look for aborting trx, they must be released asap */ + 
wsrep_slot= slot; + while (wsrep_slot && (wsrep_slot->wait_ended == TRUE || + !wsrep_trx_is_aborting(wsrep_slot->thd))) { + wsrep_slot = UT_LIST_GET_NEXT(srv_conc_queue, wsrep_slot); + } + if (wsrep_slot) { + slot = wsrep_slot; + if (wsrep_debug) + fprintf(stderr, "WSREP: releasing aborting thd\n"); + } +#endif if (slot != NULL) { slot->wait_ended = TRUE; @@ -384,6 +417,13 @@ retry: return; } +#ifdef WITH_WSREP + if (wsrep_on(trx->mysql_thd) && + wsrep_thd_is_brute_force(trx->mysql_thd)) { + srv_conc_force_enter_innodb(trx); + return; + } +#endif /* If the transaction is not holding resources, let it sleep for srv_thread_sleep_delay microseconds, and try again then */ @@ -450,6 +490,9 @@ retry: /* Add to the queue */ slot->reserved = TRUE; slot->wait_ended = FALSE; +#ifdef WITH_WSREP + slot->thd = trx->mysql_thd; +#endif UT_LIST_ADD_LAST(srv_conc_queue, srv_conc_queue, slot); @@ -457,6 +500,18 @@ retry: srv_conc.n_waiting++; +#ifdef WITH_WSREP + if (wsrep_on(trx->mysql_thd) && + wsrep_trx_is_aborting(trx->mysql_thd)) { + os_fast_mutex_unlock(&srv_conc_mutex); + if (wsrep_debug) + fprintf(stderr, "srv_conc_enter due to MUST_ABORT"); + trx->declared_to_be_inside_innodb = TRUE; + trx->n_tickets_to_enter_innodb = srv_n_free_tickets_to_enter; + return; + } + trx->wsrep_event = slot->event; +#endif /* WITH_WSREP */ os_fast_mutex_unlock(&srv_conc_mutex); /* Go to wait for the event; when a thread leaves InnoDB it will @@ -472,6 +527,9 @@ retry: os_event_wait(slot->event); thd_wait_end(trx->mysql_thd); +#ifdef WITH_WSREP + trx->wsrep_event = NULL; +#endif /* WITH_WSREP */ trx->op_info = ""; @@ -483,6 +541,9 @@ retry: incremented the thread counter on behalf of this thread */ slot->reserved = FALSE; +#ifdef WITH_WSREP + slot->thd = NULL; +#endif UT_LIST_REMOVE(srv_conc_queue, srv_conc_queue, slot); @@ -593,5 +654,32 @@ srv_conc_get_active_threads(void) /*==============================*/ { return(srv_conc.n_active); - } +} + +#ifdef WITH_WSREP +UNIV_INTERN +void 
+wsrep_srv_conc_cancel_wait( +/*==================*/ + trx_t* trx) /*!< in: transaction object associated with the + thread */ +{ +#ifdef HAVE_ATOMIC_BUILTINS + /* aborting transactions will enter innodb by force in + srv_conc_enter_innodb_with_atomics(). No need to cancel here, + thr will wake up after os_sleep and let to enter innodb + */ + if (wsrep_debug) + fprintf(stderr, "WSREP: conc slot cancel, no atomics\n"); +#else + os_fast_mutex_lock(&srv_conc_mutex); + if (trx->wsrep_event) { + if (wsrep_debug) + fprintf(stderr, "WSREP: conc slot cancel\n"); + os_event_set(trx->wsrep_event); + } + os_fast_mutex_unlock(&srv_conc_mutex); +#endif +} +#endif /* WITH_WSREP */ diff --git a/storage/innobase/srv/srv0mon.cc b/storage/innobase/srv/srv0mon.cc index f29621bc90a..24cf403c0af 100644 --- a/storage/innobase/srv/srv0mon.cc +++ b/storage/innobase/srv/srv0mon.cc @@ -2,6 +2,7 @@ Copyright (c) 2010, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2013, 2014, MariaDB Corporation This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -290,12 +291,36 @@ static monitor_info_t innodb_counter_info[] = MONITOR_EXISTING | MONITOR_DEFAULT_ON), MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_WRITTEN}, + {"buffer_index_pages_written", "buffer", + "Number of index pages written (innodb_index_pages_written)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_INDEX_PAGES_WRITTEN}, + + {"buffer_non_index_pages_written", "buffer", + "Number of non index pages written (innodb_non_index_pages_written)", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_NON_INDEX_PAGES_WRITTEN}, + {"buffer_pages_read", "buffer", "Number of pages read (innodb_pages_read)", static_cast<monitor_type_t>( MONITOR_EXISTING | 
MONITOR_DEFAULT_ON), MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_READ}, + {"buffer_index_sec_rec_cluster_reads", "buffer", + "Number of secondary record reads triggered cluster read", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS}, + + {"buffer_index_sec_rec_cluster_reads_avoided", "buffer", + "Number of secondary record reads avoided triggering cluster read", + static_cast<monitor_type_t>( + MONITOR_EXISTING | MONITOR_DEFAULT_ON), + MONITOR_DEFAULT_START, MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS_AVOIDED}, + {"buffer_data_reads", "buffer", "Amount of data read in bytes (innodb_data_reads)", static_cast<monitor_type_t>( @@ -457,20 +482,36 @@ static monitor_info_t innodb_counter_info[] = MONITOR_LRU_BATCH_SCANNED_PER_CALL}, /* Cumulative counter for LRU batch pages flushed */ - {"buffer_LRU_batch_total_pages", "buffer", + {"buffer_LRU_batch_flush_total_pages", "buffer", "Total pages flushed as part of LRU batches", - MONITOR_SET_OWNER, MONITOR_LRU_BATCH_COUNT, - MONITOR_LRU_BATCH_TOTAL_PAGE}, + MONITOR_SET_OWNER, MONITOR_LRU_BATCH_FLUSH_COUNT, + MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE}, - {"buffer_LRU_batches", "buffer", + {"buffer_LRU_batches_flush", "buffer", "Number of LRU batches", - MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_TOTAL_PAGE, - MONITOR_LRU_BATCH_COUNT}, + MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE, + MONITOR_LRU_BATCH_FLUSH_COUNT}, - {"buffer_LRU_batch_pages", "buffer", + {"buffer_LRU_batch_flush_pages", "buffer", "Pages queued as an LRU batch", - MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_TOTAL_PAGE, - MONITOR_LRU_BATCH_PAGES}, + MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE, + MONITOR_LRU_BATCH_FLUSH_PAGES}, + + /* Cumulative counter for LRU batch pages flushed */ + {"buffer_LRU_batch_evict_total_pages", "buffer", + "Total pages evicted as part of LRU batches", + MONITOR_SET_OWNER, MONITOR_LRU_BATCH_EVICT_COUNT, + MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE}, + + 
{"buffer_LRU_batches_evict", "buffer", + "Number of LRU batches", + MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE, + MONITOR_LRU_BATCH_EVICT_COUNT}, + + {"buffer_LRU_batch_evict_pages", "buffer", + "Pages queued as an LRU batch", + MONITOR_SET_MEMBER, MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE, + MONITOR_LRU_BATCH_EVICT_PAGES}, /* Cumulative counter for single page LRU scans */ {"buffer_LRU_single_flush_scanned", "buffer", @@ -879,6 +920,71 @@ static monitor_info_t innodb_counter_info[] = MONITOR_NONE, MONITOR_DEFAULT_START, MONITOR_PAD_DECREMENTS}, + {"compress_saved", "compression", + "Number of bytes saved by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_SAVED}, + + {"compress_trim_sect512", "compression", + "Number of sect-512 TRIMed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT512}, + + {"compress_trim_sect1024", "compression", + "Number of sect-1024 TRIMed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT1024}, + + {"compress_trim_sect2048", "compression", + "Number of sect-2048 TRIMed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT2048}, + + {"compress_trim_sect4096", "compression", + "Number of sect-4K TRIMed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT4096}, + + {"compress_trim_sect8192", "compression", + "Number of sect-8K TRIMed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT8192}, + + {"compress_trim_sect16384", "compression", + "Number of sect-16K TRIMed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT16384}, + + {"compress_trim_sect32768", "compression", + "Number of sect-32K TRIMed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT32768}, + + 
{"compress_pages_page_compressed", "compression", + "Number of pages compressed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_PAGE_COMPRESSED}, + + {"compress_page_compressed_trim_op", "compression", + "Number of TRIM operation performed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP}, + + {"compress_page_compressed_trim_op_saved", "compression", + "Number of TRIM operation saved by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP_SAVED}, + + {"compress_pages_page_decompressed", "compression", + "Number of pages decompressed by page compression", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED}, + + {"compress_pages_page_compression_error", "compression", + "Number of page compression errors", + MONITOR_NONE, + MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_PAGE_COMPRESSION_ERROR}, + /* ========== Counters for Index ========== */ {"module_index", "index", "Index Manager", MONITOR_MODULE, @@ -1572,12 +1678,32 @@ srv_mon_process_existing_counter( value = stat.n_pages_written; break; + /* innodb_index_pages_written, the number of index pages written */ + case MONITOR_OVLD_INDEX_PAGES_WRITTEN: + value = srv_stats.index_pages_written; + break; + + /* innodb_non_index_pages_written, the number of non index pages written */ + case MONITOR_OVLD_NON_INDEX_PAGES_WRITTEN: + value = srv_stats.non_index_pages_written; + break; + /* innodb_pages_read */ case MONITOR_OVLD_PAGES_READ: buf_get_total_stat(&stat); value = stat.n_pages_read; break; + /* Number of times secondary index lookup triggered cluster lookup */ + case MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS: + value = srv_stats.n_sec_rec_cluster_reads; + break; + /* Number of times prefix optimization avoided triggering cluster + lookup */ + case MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS_AVOIDED: + value = srv_stats.n_sec_rec_cluster_reads_avoided; + break; + /* 
innodb_data_reads, the total number of data reads */ case MONITOR_OVLD_BYTE_READ: value = srv_stats.data_read; @@ -1833,6 +1959,46 @@ srv_mon_process_existing_counter( value = btr_cur_n_non_sea; break; + case MONITOR_OVLD_PAGE_COMPRESS_SAVED: + value = srv_stats.page_compression_saved; + break; + case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT512: + value = srv_stats.page_compression_trim_sect512; + break; + case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT1024: + value = srv_stats.page_compression_trim_sect1024; + break; + case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT2048: + value = srv_stats.page_compression_trim_sect2048; + break; + case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT4096: + value = srv_stats.page_compression_trim_sect4096; + break; + case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT8192: + value = srv_stats.page_compression_trim_sect8192; + break; + case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT16384: + value = srv_stats.page_compression_trim_sect16384; + break; + case MONITOR_OVLD_PAGE_COMPRESS_TRIM_SECT32768: + value = srv_stats.page_compression_trim_sect32768; + break; + case MONITOR_OVLD_PAGES_PAGE_COMPRESSED: + value = srv_stats.pages_page_compressed; + break; + case MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP: + value = srv_stats.page_compressed_trim_op; + break; + case MONITOR_OVLD_PAGE_COMPRESSED_TRIM_OP_SAVED: + value = srv_stats.page_compressed_trim_op_saved; + break; + case MONITOR_OVLD_PAGES_PAGE_DECOMPRESSED: + value = srv_stats.pages_page_decompressed; + break; + case MONITOR_OVLD_PAGES_PAGE_COMPRESSION_ERROR: + value = srv_stats.pages_page_compression_error; + break; + default: ut_error; } diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index 14b2bdbe03c..bcbce3cd53c 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -63,17 +63,24 @@ Created 10/8/1995 Heikki Tuuri #include "dict0stats_bg.h" /* dict_stats_event */ #include "srv0start.h" #include "row0mysql.h" +#include "row0log.h" #include "ha_prototypes.h" 
#include "trx0i_s.h" #include "os0sync.h" /* for HAVE_ATOMIC_BUILTINS */ #include "srv0mon.h" #include "ut0crc32.h" +#include "btr0defragment.h" #include "mysql/plugin.h" #include "mysql/service_thd_wait.h" +#include "fil0pagecompress.h" +#ifdef WITH_WSREP +extern int wsrep_debug; +extern int wsrep_trx_is_aborting(void *thd_ptr); +#endif /* The following is the maximum allowed duration of a lock wait. */ -UNIV_INTERN ulint srv_fatal_semaphore_wait_threshold = 600; +UNIV_INTERN ulong srv_fatal_semaphore_wait_threshold = DEFAULT_SRV_FATAL_SEMAPHORE_TIMEOUT; /* How much data manipulation language (DML) statements need to be delayed, in microseconds, in order to reduce the lagging of the purge thread. */ @@ -146,6 +153,20 @@ use simulated aio we build below with threads. Currently we support native aio on windows and linux */ UNIV_INTERN my_bool srv_use_native_aio = TRUE; +/* If this flag is TRUE, then we will use fallocate(PUNCH_HOLE) +to the pages */ +UNIV_INTERN my_bool srv_use_trim = FALSE; +/* If this flag is TRUE, then we will use posix fallocate for file extension */ +UNIV_INTERN my_bool srv_use_posix_fallocate = FALSE; +/* If this flag is TRUE, then we disable doublewrite buffer */ +UNIV_INTERN my_bool srv_use_atomic_writes = FALSE; +/* If this flag is TRUE, then we use this algorithm for page compressing the pages */ +UNIV_INTERN ulong innodb_compression_algorithm = PAGE_ZLIB_ALGORITHM; +/* Number of threads used for multi-threaded flush */ +UNIV_INTERN long srv_mtflush_threads = MTFLUSH_DEFAULT_WORKER; +/* If this flag is TRUE, then we will use multi threaded flush. */ +UNIV_INTERN my_bool srv_use_mtflush = FALSE; + #ifdef __WIN__ /* Windows native condition variables. We use runtime loading / function pointers, because they are not available on Windows Server 2003 and @@ -208,6 +229,10 @@ srv_printf_innodb_monitor() will request mutex acquisition with mutex_enter(), which will wait until it gets the mutex. 
*/ #define MUTEX_NOWAIT(mutex_skipped) ((mutex_skipped) < MAX_MUTEX_NOWAIT) +#ifdef WITH_INNODB_DISALLOW_WRITES +UNIV_INTERN os_event_t srv_allow_writes_event; +#endif /* WITH_INNODB_DISALLOW_WRITES */ + /** The sort order table of the MySQL latin1_swedish_ci character set collation */ UNIV_INTERN const byte* srv_latin1_ordering; @@ -232,6 +257,8 @@ UNIV_INTERN ulint srv_buf_pool_curr_size = 0; UNIV_INTERN ulint srv_mem_pool_size = ULINT_MAX; UNIV_INTERN ulint srv_lock_table_size = ULINT_MAX; +UNIV_INTERN ulong srv_idle_flush_pct = 100; + /* This parameter is deprecated. Use srv_n_io_[read|write]_threads instead. */ UNIV_INTERN ulint srv_n_file_io_threads = ULINT_MAX; @@ -329,6 +356,10 @@ UNIV_INTERN ulint srv_fast_shutdown = 0; /* Generate a innodb_status.<pid> file */ UNIV_INTERN ibool srv_innodb_status = FALSE; +/* Optimize prefix index queries to skip cluster index lookup when possible */ +/* Enables or disables this prefix optimization. Disabled by default. */ +UNIV_INTERN my_bool srv_prefix_index_cluster_optimization = 0; + /* When estimating number of different key values in an index, sample this many index pages, there are 2 ways to calculate statistics: * persistent stats that are calculated by ANALYZE TABLE and saved @@ -356,11 +387,6 @@ batch flushing i.e.: LRU flushing and flush_list flushing. The rest of the pages are used for single page flushing. 
*/ UNIV_INTERN ulong srv_doublewrite_batch_size = 120; -UNIV_INTERN ibool srv_use_atomic_writes = FALSE; -#ifdef HAVE_POSIX_FALLOCATE -UNIV_INTERN ibool srv_use_posix_fallocate = TRUE; -#endif - UNIV_INTERN ulong srv_replication_delay = 0; /*-------------------------------------------*/ @@ -393,6 +419,26 @@ static ulint srv_n_system_rows_read_old = 0; UNIV_INTERN ulint srv_truncated_status_writes = 0; UNIV_INTERN ulint srv_available_undo_logs = 0; +UNIV_INTERN ib_uint64_t srv_page_compression_saved = 0; +UNIV_INTERN ib_uint64_t srv_page_compression_trim_sect512 = 0; +UNIV_INTERN ib_uint64_t srv_page_compression_trim_sect4096 = 0; +UNIV_INTERN ib_uint64_t srv_index_pages_written = 0; +UNIV_INTERN ib_uint64_t srv_non_index_pages_written = 0; +UNIV_INTERN ib_uint64_t srv_pages_page_compressed = 0; +UNIV_INTERN ib_uint64_t srv_page_compressed_trim_op = 0; +UNIV_INTERN ib_uint64_t srv_page_compressed_trim_op_saved = 0; +UNIV_INTERN ib_uint64_t srv_index_page_decompressed = 0; + +/* Defragmentation */ +UNIV_INTERN my_bool srv_defragment = FALSE; +UNIV_INTERN uint srv_defragment_n_pages = 7; +UNIV_INTERN uint srv_defragment_stats_accuracy = 0; +UNIV_INTERN uint srv_defragment_fill_factor_n_recs = 20; +UNIV_INTERN double srv_defragment_fill_factor = 0.9; +UNIV_INTERN uint srv_defragment_frequency = + SRV_DEFRAGMENT_FREQUENCY_DEFAULT; +UNIV_INTERN ulonglong srv_defragment_interval = 0; + /* Set the following to 0 if you want InnoDB to write messages on stderr on startup/shutdown. 
*/ UNIV_INTERN ibool srv_print_verbose_log = TRUE; @@ -401,6 +447,9 @@ UNIV_INTERN my_bool srv_print_innodb_lock_monitor = FALSE; UNIV_INTERN ibool srv_print_innodb_tablespace_monitor = FALSE; UNIV_INTERN ibool srv_print_innodb_table_monitor = FALSE; +/** If this flag is set tables without primary key are not allowed */ +UNIV_INTERN my_bool srv_force_primary_key = FALSE; + /* Array of English strings describing the current state of an i/o handler thread */ @@ -1000,6 +1049,14 @@ srv_init(void) dict_ind_init(); srv_conc_init(); +#ifdef WITH_INNODB_DISALLOW_WRITES + /* Writes have to be enabled on init or else we hang. Thus, we + always set the event here regardless of innobase_disallow_writes. + That flag will always be 0 at this point because it isn't settable + via my.cnf or command line arg. */ + srv_allow_writes_event = os_event_create(); + os_event_set(srv_allow_writes_event); +#endif /* WITH_INNODB_DISALLOW_WRITES */ /* Initialize some INFORMATION SCHEMA internal structures */ trx_i_s_cache_init(trx_i_s_cache); @@ -1518,6 +1575,24 @@ srv_export_innodb_status(void) srv_truncated_status_writes; export_vars.innodb_available_undo_logs = srv_available_undo_logs; + export_vars.innodb_page_compression_saved = srv_stats.page_compression_saved; + export_vars.innodb_page_compression_trim_sect512 = srv_stats.page_compression_trim_sect512; + export_vars.innodb_page_compression_trim_sect4096 = srv_stats.page_compression_trim_sect4096; + export_vars.innodb_index_pages_written = srv_stats.index_pages_written; + export_vars.innodb_non_index_pages_written = srv_stats.non_index_pages_written; + export_vars.innodb_pages_page_compressed = srv_stats.pages_page_compressed; + export_vars.innodb_page_compressed_trim_op = srv_stats.page_compressed_trim_op; + export_vars.innodb_page_compressed_trim_op_saved = srv_stats.page_compressed_trim_op_saved; + export_vars.innodb_pages_page_decompressed = srv_stats.pages_page_decompressed; + + export_vars.innodb_defragment_compression_failures = 
+ btr_defragment_compression_failures; + export_vars.innodb_defragment_failures = btr_defragment_failures; + export_vars.innodb_defragment_count = btr_defragment_count; + + export_vars.innodb_onlineddl_rowlog_rows = onlineddl_rowlog_rows; + export_vars.innodb_onlineddl_rowlog_pct_used = onlineddl_rowlog_pct_used; + export_vars.innodb_onlineddl_pct_progress = onlineddl_pct_progress; #ifdef UNIV_DEBUG rw_lock_s_lock(&purge_sys->latch); @@ -1547,6 +1622,11 @@ srv_export_innodb_status(void) } #endif /* UNIV_DEBUG */ + export_vars.innodb_sec_rec_cluster_reads = + srv_stats.n_sec_rec_cluster_reads; + export_vars.innodb_sec_rec_cluster_reads_avoided = + srv_stats.n_sec_rec_cluster_reads_avoided; + mutex_exit(&srv_innodb_monitor_mutex); } @@ -1803,7 +1883,20 @@ loop: if (sync_array_print_long_waits(&waiter, &sema) && sema == old_sema && os_thread_eq(waiter, old_waiter)) { +#if defined(WITH_WSREP) && defined(WITH_INNODB_DISALLOW_WRITES) + if (srv_allow_writes_event->is_set) { +#endif /* WITH_WSREP */ fatal_cnt++; +#if defined(WITH_WSREP) && defined(WITH_INNODB_DISALLOW_WRITES) + } else { + fprintf(stderr, + "WSREP: avoiding InnoDB self crash due to long " + "semaphore wait of > %lu seconds\n" + "Server is processing SST donor operation, " + "fatal_cnt now: %lu", + (ulong) srv_fatal_semaphore_wait_threshold, fatal_cnt); + } +#endif /* WITH_WSREP */ if (fatal_cnt > 10) { fprintf(stderr, diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index d1be5be9238..2692636dcb5 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -3,6 +3,7 @@ Copyright (c) 1996, 2014, Oracle and/or its affiliates. All rights reserved. Copyright (c) 2008, Google Inc. Copyright (c) 2009, Percona Inc. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. Portions of this file contain modifications contributed and copyrighted by Google, Inc. 
Those modifications are gratefully acknowledged and are described @@ -42,6 +43,7 @@ Created 2/16/1996 Heikki Tuuri #include "pars0pars.h" #include "row0ftsort.h" #include "ut0mem.h" +#include "ut0timer.h" #include "mem0mem.h" #include "data0data.h" #include "data0type.h" @@ -66,12 +68,15 @@ Created 2/16/1996 Heikki Tuuri #include "ibuf0ibuf.h" #include "srv0start.h" #include "srv0srv.h" +#include "btr0defragment.h" + #ifndef UNIV_HOTBACKUP # include "trx0rseg.h" # include "os0proc.h" # include "sync0sync.h" # include "buf0flu.h" # include "buf0rea.h" +# include "buf0mtflu.h" # include "dict0boot.h" # include "dict0load.h" # include "dict0stats_bg.h" @@ -129,7 +134,11 @@ static os_file_t files[1000]; /** io_handler_thread parameters for thread identification */ static ulint n[SRV_MAX_N_IO_THREADS + 6]; /** io_handler_thread identifiers, 32 is the maximum number of purge threads */ -static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6 + 32]; +/** 6 is the ? */ +#define START_OLD_THREAD_CNT (SRV_MAX_N_IO_THREADS + 6 + 32) +static os_thread_id_t thread_ids[SRV_MAX_N_IO_THREADS + 6 + 32 + MTFLUSH_MAX_WORKER]; +/* Thread contex data for multi-threaded flush */ +void *mtflush_ctx=NULL; /** We use this mutex to test the return value of pthread_mutex_trylock on successful locking. HP-UX does NOT return 0, though Linux et al do. 
*/ @@ -531,7 +540,7 @@ create_log_file( *file = os_file_create( innodb_file_log_key, name, OS_FILE_CREATE|OS_FILE_ON_ERROR_NO_EXIT, OS_FILE_NORMAL, - OS_LOG_FILE, &ret); + OS_LOG_FILE, &ret, FALSE); if (!ret) { ib_logf(IB_LOG_LEVEL_ERROR, "Cannot create %s", name); @@ -738,7 +747,7 @@ open_log_file( *file = os_file_create(innodb_file_log_key, name, OS_FILE_OPEN, OS_FILE_AIO, - OS_LOG_FILE, &ret); + OS_LOG_FILE, &ret, FALSE); if (!ret) { ib_logf(IB_LOG_LEVEL_ERROR, "Unable to open '%s'", name); return(DB_ERROR); @@ -829,7 +838,7 @@ open_or_create_data_files( files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_CREATE, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); if (srv_read_only_mode) { @@ -872,7 +881,7 @@ open_or_create_data_files( files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_OPEN_RAW, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); if (!ret) { ib_logf(IB_LOG_LEVEL_ERROR, @@ -905,17 +914,17 @@ open_or_create_data_files( files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_OPEN_RAW, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); } else if (i == 0) { files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_OPEN_RETRY, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); } else { files[i] = os_file_create( innodb_file_data_key, name, OS_FILE_OPEN, OS_FILE_NORMAL, - OS_DATA_FILE, &ret); + OS_DATA_FILE, &ret, FALSE); } if (!ret) { @@ -1000,7 +1009,8 @@ check_first_page: #ifdef UNIV_LOG_ARCHIVE min_arch_log_no, max_arch_log_no, #endif /* UNIV_LOG_ARCHIVE */ - min_flushed_lsn, max_flushed_lsn); + min_flushed_lsn, max_flushed_lsn, + ULINT_UNDEFINED); if (check_msg) { @@ -1135,7 +1145,7 @@ srv_undo_tablespace_create( innodb_file_data_key, name, srv_read_only_mode ? 
OS_FILE_OPEN : OS_FILE_CREATE, - OS_FILE_NORMAL, OS_DATA_FILE, &ret); + OS_FILE_NORMAL, OS_DATA_FILE, &ret, FALSE); if (srv_read_only_mode && ret) { ib_logf(IB_LOG_LEVEL_INFO, @@ -1222,7 +1232,8 @@ srv_undo_tablespace_open( | OS_FILE_ON_ERROR_SILENT, OS_FILE_NORMAL, OS_DATA_FILE, - &ret); + &ret, + FALSE); /* If the file open was successful then load the tablespace. */ @@ -1524,6 +1535,9 @@ innobase_start_or_create_for_mysql(void) size_t dirnamelen; bool sys_datafiles_created = false; + /* This should be initialized early */ + ut_init_timer(); + if (srv_force_recovery > SRV_FORCE_NO_TRX_UNDO) { srv_read_only_mode = true; } @@ -2715,6 +2729,25 @@ files_checked: } if (!srv_read_only_mode) { + + if (srv_use_mtflush) { + /* Start multi-threaded flush threads */ + mtflush_ctx = buf_mtflu_handler_init( + srv_mtflush_threads, + srv_buf_pool_instances); + + /* Set up the thread ids */ + buf_mtflu_set_thread_ids( + srv_mtflush_threads, + mtflush_ctx, + (thread_ids + 6 + 32)); + +#if UNIV_DEBUG + fprintf(stderr, "InnoDB: Note: %s:%d buf-pool-instances:%lu mtflush_threads %lu\n", + __FILE__, __LINE__, srv_buf_pool_instances, srv_mtflush_threads); +#endif + } + os_thread_create(buf_flush_page_cleaner_thread, NULL, NULL); } @@ -2869,6 +2902,9 @@ files_checked: fts_optimize_init(); } + /* Initialize online defragmentation. */ + btr_defragment_init(); + srv_was_started = TRUE; return(DB_SUCCESS); @@ -2979,6 +3015,13 @@ innobase_shutdown_for_mysql(void) logs_empty_and_mark_files_at_shutdown() and should have already quit or is quitting right now. */ + + if (srv_use_mtflush) { + /* g. 
Exit the multi threaded flush threads */ + + buf_mtflu_io_thread_exit(); + } + os_mutex_enter(os_sync_mutex); if (os_thread_count == 0) { diff --git a/storage/innobase/sync/sync0sync.cc b/storage/innobase/sync/sync0sync.cc index fb559f26bd4..aa2b5fa29db 100644 --- a/storage/innobase/sync/sync0sync.cc +++ b/storage/innobase/sync/sync0sync.cc @@ -1172,6 +1172,7 @@ sync_thread_add_level( case SYNC_IBUF_MUTEX: case SYNC_INDEX_ONLINE_LOG: case SYNC_STATS_AUTO_RECALC: + case SYNC_STATS_DEFRAG: if (!sync_thread_levels_g(array, level, TRUE)) { fprintf(stderr, "InnoDB: sync_thread_levels_g(array, %lu)" diff --git a/storage/innobase/trx/trx0rec.cc b/storage/innobase/trx/trx0rec.cc index 11ad7fe4afd..fa3fe0904b8 100644 --- a/storage/innobase/trx/trx0rec.cc +++ b/storage/innobase/trx/trx0rec.cc @@ -781,7 +781,8 @@ trx_undo_page_report_modify( } pos = dict_index_get_nth_col_pos(index, - col_no); + col_no, + NULL); ptr += mach_write_compressed(ptr, pos); /* Save the old value of field */ diff --git a/storage/innobase/trx/trx0sys.cc b/storage/innobase/trx/trx0sys.cc index 52830a77b12..2c31af9442c 100644 --- a/storage/innobase/trx/trx0sys.cc +++ b/storage/innobase/trx/trx0sys.cc @@ -44,6 +44,8 @@ Created 3/26/1996 Heikki Tuuri #include "os0file.h" #include "read0read.h" +#include <mysql/service_wsrep.h> + /** The file format tag structure with id and name. 
*/ struct file_format_t { ulint id; /*!< id of the file format */ @@ -174,7 +176,12 @@ trx_sys_flush_max_trx_id(void) mtr_t mtr; trx_sysf_t* sys_header; +#ifndef WITH_WSREP + /* wsrep_fake_trx_id violates this assert + * Copied from trx_sys_get_new_trx_id + */ ut_ad(mutex_own(&trx_sys->mutex)); +#endif /* WITH_WSREP */ if (!srv_read_only_mode) { mtr_start(&mtr); @@ -202,9 +209,14 @@ trx_sys_update_mysql_binlog_offset( ib_int64_t offset, /*!< in: position in that log file */ ulint field, /*!< in: offset of the MySQL log info field in the trx sys header */ +#ifdef WITH_WSREP + trx_sysf_t* sys_header, /*!< in: trx sys header */ +#endif /* WITH_WSREP */ mtr_t* mtr) /*!< in: mtr */ { +#ifndef WITH_WSREP trx_sysf_t* sys_header; +#endif /* !WITH_WSREP */ if (ut_strlen(file_name) >= TRX_SYS_MYSQL_LOG_NAME_LEN) { @@ -213,7 +225,9 @@ trx_sys_update_mysql_binlog_offset( return; } +#ifndef WITH_WSREP sys_header = trx_sysf_get(mtr); +#endif /* !WITH_WSREP */ if (mach_read_from_4(sys_header + field + TRX_SYS_MYSQL_LOG_MAGIC_N_FLD) @@ -300,6 +314,124 @@ trx_sys_print_mysql_binlog_offset(void) mtr_commit(&mtr); } +#ifdef WITH_WSREP + +#ifdef UNIV_DEBUG +static long long trx_sys_cur_xid_seqno = -1; +static unsigned char trx_sys_cur_xid_uuid[16]; + +long long read_wsrep_xid_seqno(const XID* xid) +{ + long long seqno; + memcpy(&seqno, xid->data + 24, sizeof(long long)); + return seqno; +} + +void read_wsrep_xid_uuid(const XID* xid, unsigned char* buf) +{ + memcpy(buf, xid->data + 8, 16); +} + +#endif /* UNIV_DEBUG */ + +void +trx_sys_update_wsrep_checkpoint( + const XID* xid, /*!< in: transaction XID */ + trx_sysf_t* sys_header, /*!< in: sys_header */ + mtr_t* mtr) /*!< in: mtr */ +{ +#ifdef UNIV_DEBUG + { + /* Check that seqno is monotonically increasing */ + unsigned char xid_uuid[16]; + long long xid_seqno = read_wsrep_xid_seqno(xid); + read_wsrep_xid_uuid(xid, xid_uuid); + if (!memcmp(xid_uuid, trx_sys_cur_xid_uuid, 8)) + { + ut_ad(xid_seqno > trx_sys_cur_xid_seqno); + 
trx_sys_cur_xid_seqno = xid_seqno; + } + else + { + memcpy(trx_sys_cur_xid_uuid, xid_uuid, 16); + } + trx_sys_cur_xid_seqno = xid_seqno; + } +#endif /* UNIV_DEBUG */ + + ut_ad(xid && mtr); + ut_a(xid->formatID == -1 || wsrep_is_wsrep_xid(xid)); + + if (mach_read_from_4(sys_header + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_MAGIC_N_FLD) + != TRX_SYS_WSREP_XID_MAGIC_N) { + mlog_write_ulint(sys_header + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_MAGIC_N_FLD, + TRX_SYS_WSREP_XID_MAGIC_N, + MLOG_4BYTES, mtr); + } + + mlog_write_ulint(sys_header + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_FORMAT, + (int)xid->formatID, + MLOG_4BYTES, mtr); + mlog_write_ulint(sys_header + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_GTRID_LEN, + (int)xid->gtrid_length, + MLOG_4BYTES, mtr); + mlog_write_ulint(sys_header + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_BQUAL_LEN, + (int)xid->bqual_length, + MLOG_4BYTES, mtr); + mlog_write_string(sys_header + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_DATA, + (const unsigned char*) xid->data, + XIDDATASIZE, mtr); + +} + +void +trx_sys_read_wsrep_checkpoint(XID* xid) +/*===================================*/ +{ + trx_sysf_t* sys_header; + mtr_t mtr; + ulint magic; + + ut_ad(xid); + + mtr_start(&mtr); + + sys_header = trx_sysf_get(&mtr); + + if ((magic = mach_read_from_4(sys_header + TRX_SYS_WSREP_XID_INFO + + TRX_SYS_WSREP_XID_MAGIC_N_FLD)) + != TRX_SYS_WSREP_XID_MAGIC_N) { + memset(xid, 0, sizeof(*xid)); + xid->formatID = -1; + trx_sys_update_wsrep_checkpoint(xid, sys_header, &mtr); + mtr_commit(&mtr); + return; + } + + xid->formatID = (int)mach_read_from_4( + sys_header + + TRX_SYS_WSREP_XID_INFO + TRX_SYS_WSREP_XID_FORMAT); + xid->gtrid_length = (int)mach_read_from_4( + sys_header + + TRX_SYS_WSREP_XID_INFO + TRX_SYS_WSREP_XID_GTRID_LEN); + xid->bqual_length = (int)mach_read_from_4( + sys_header + + TRX_SYS_WSREP_XID_INFO + TRX_SYS_WSREP_XID_BQUAL_LEN); + ut_memcpy(xid->data, + sys_header + TRX_SYS_WSREP_XID_INFO + 
TRX_SYS_WSREP_XID_DATA, + XIDDATASIZE); + + mtr_commit(&mtr); +} + +#endif /* WITH_WSREP */ + /*****************************************************************//** Prints to stderr the MySQL master log offset info in the trx system header if the magic number shows it valid. */ diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc index 405d4ef958f..5410bb98190 100644 --- a/storage/innobase/trx/trx0trx.cc +++ b/storage/innobase/trx/trx0trx.cc @@ -29,6 +29,8 @@ Created 3/26/1996 Heikki Tuuri #include "trx0trx.ic" #endif +#include <mysql/service_wsrep.h> + #include "trx0undo.h" #include "trx0rseg.h" #include "log0log.h" @@ -162,6 +164,9 @@ trx_create(void) trx->lock.table_locks = ib_vector_create( heap_alloc, sizeof(void**), 32); +#ifdef WITH_WSREP + trx->wsrep_event = NULL; +#endif /* WITH_WSREP */ return(trx); } @@ -857,6 +862,11 @@ trx_start_low( srv_undo_logs, srv_undo_tablespaces); } +#ifdef WITH_WSREP + memset(&trx->xid, 0, sizeof(trx->xid)); + trx->xid.formatID = -1; +#endif /* WITH_WSREP */ + /* The initial value for trx->no: TRX_ID_MAX is used in read_view_open_now: */ @@ -971,6 +981,9 @@ trx_write_serialisation_history( trx_t* trx, /*!< in/out: transaction */ mtr_t* mtr) /*!< in/out: mini-transaction */ { +#ifdef WITH_WSREP + trx_sysf_t* sys_header; +#endif /* WITH_WSREP */ trx_rseg_t* rseg; rseg = trx->rseg; @@ -1017,6 +1030,15 @@ trx_write_serialisation_history( MONITOR_INC(MONITOR_TRX_COMMIT_UNDO); +#ifdef WITH_WSREP + sys_header = trx_sysf_get(mtr); + /* Update latest MySQL wsrep XID in trx sys header. 
*/ + if (wsrep_is_wsrep_xid(&trx->xid)) + { + trx_sys_update_wsrep_checkpoint(&trx->xid, sys_header, mtr); + } +#endif /* WITH_WSREP */ + /* Update the latest MySQL binlog name and offset info in trx sys header if MySQL binlogging is on or the database server is a MySQL replication slave */ @@ -1027,7 +1049,11 @@ trx_write_serialisation_history( trx_sys_update_mysql_binlog_offset( trx->mysql_log_file_name, trx->mysql_log_offset, - TRX_SYS_MYSQL_LOG_INFO, mtr); + TRX_SYS_MYSQL_LOG_INFO, +#ifdef WITH_WSREP + sys_header, +#endif /* WITH_WSREP */ + mtr); trx->mysql_log_file_name = NULL; } @@ -1321,6 +1347,11 @@ trx_commit_in_memory( ut_ad(!trx->in_ro_trx_list); ut_ad(!trx->in_rw_trx_list); +#ifdef WITH_WSREP + if (wsrep_on(trx->mysql_thd)) { + trx->lock.was_chosen_as_deadlock_victim = FALSE; + } +#endif trx->dict_operation = TRX_DICT_OP_NONE; trx->error_state = DB_SUCCESS; @@ -1505,6 +1536,10 @@ trx_commit_or_rollback_prepare( switch (trx->state) { case TRX_STATE_NOT_STARTED: +#ifdef WITH_WSREP + ut_d(trx->start_file = __FILE__); + ut_d(trx->start_line = __LINE__); +#endif /* WITH_WSREP */ trx_start_low(trx); /* fall through */ case TRX_STATE_ACTIVE: diff --git a/storage/innobase/ut/ut0timer.cc b/storage/innobase/ut/ut0timer.cc new file mode 100644 index 00000000000..85292cce28c --- /dev/null +++ b/storage/innobase/ut/ut0timer.cc @@ -0,0 +1,92 @@ +/***************************************************************************** + +Copyright (c) 2013, 2014, Facebook, Inc. All Rights Reserved. +Copyright (c) 2014, SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file ut/ut0timer.cc +Timer rountines + +Created 30/07/2014 Jan Lindström jan.lindstrom@skysql.com +modified from https://github.com/facebook/mysql-5.6/commit/c75a413edeb96eb99bf11d7269bdfea06f96d6b6 +*************************************************************************/ + +#include "data0type.h" +#include <my_rdtsc.h> +#include <ut0timer.h> + +/**************************************************************//** +Initial timer definition +@return 0 */ +static +ulonglong +ut_timer_none(void) +/*===============*/ +{ + return 0; +} + +/**************************************************************//** +Function pointer to point selected timer function. +@return timer current value */ +ulonglong (*ut_timer_now)(void) = &ut_timer_none; + +struct my_timer_unit_info ut_timer; + +/**************************************************************//** +Sets up the data required for use of my_timer_* functions. +Selects the best timer by high frequency, and tight resolution. +Points my_timer_now() to the selected timer function. 
+Initializes my_timer struct to contain the info for selected timer.*/ +UNIV_INTERN +void +ut_init_timer(void) +/*===============*/ +{ + MY_TIMER_INFO all_timer_info; + my_timer_init(&all_timer_info); + + if (all_timer_info.cycles.frequency > 1000000 && + all_timer_info.cycles.resolution == 1) { + ut_timer = all_timer_info.cycles; + ut_timer_now = &my_timer_cycles; + } else if (all_timer_info.nanoseconds.frequency > 1000000 && + all_timer_info.nanoseconds.resolution == 1) { + ut_timer = all_timer_info.nanoseconds; + ut_timer_now = &my_timer_nanoseconds; + } else if (all_timer_info.microseconds.frequency >= 1000000 && + all_timer_info.microseconds.resolution == 1) { + ut_timer = all_timer_info.microseconds; + ut_timer_now = &my_timer_microseconds; + + } else if (all_timer_info.milliseconds.frequency >= 1000 && + all_timer_info.milliseconds.resolution == 1) { + ut_timer = all_timer_info.milliseconds; + ut_timer_now = &my_timer_milliseconds; + } else if (all_timer_info.ticks.frequency >= 1000 && + /* Will probably be false */ + all_timer_info.ticks.resolution == 1) { + ut_timer = all_timer_info.ticks; + ut_timer_now = &my_timer_ticks; + } else { + /* None are acceptable, so leave it as "None", and fill in struct */ + ut_timer.frequency = 1; /* Avoid div-by-zero */ + ut_timer.overhead = 0; /* Since it doesn't do anything */ + ut_timer.resolution = 10; /* Another sign it's bad */ + ut_timer.routine = 0; /* None */ + } +} diff --git a/storage/innobase/ut/ut0wqueue.cc b/storage/innobase/ut/ut0wqueue.cc index d1ba36b3b00..1607e535a94 100644 --- a/storage/innobase/ut/ut0wqueue.cc +++ b/storage/innobase/ut/ut0wqueue.cc @@ -162,6 +162,38 @@ ib_wqueue_timedwait( } /******************************************************************** +Return first item on work queue or NULL if queue is empty +@return work item or NULL */ +void* +ib_wqueue_nowait( +/*=============*/ + ib_wqueue_t* wq) /*<! 
in: work queue */ +{ + ib_list_node_t* node = NULL; + + mutex_enter(&wq->mutex); + + if(!ib_list_is_empty(wq->items)) { + node = ib_list_get_first(wq->items); + + if (node) { + ib_list_remove(wq->items, node); + + } + } + + /* We must reset the event when the list + gets emptied. */ + if(ib_list_is_empty(wq->items)) { + os_event_reset(wq->event); + } + + mutex_exit(&wq->mutex); + + return (node ? node->data : NULL); +} + +/******************************************************************** Check if queue is empty. */ ibool @@ -173,3 +205,20 @@ ib_wqueue_is_empty( { return(ib_list_is_empty(wq->items)); } + +/******************************************************************** +Get number of items on queue. +@return number of items on queue */ +ulint +ib_wqueue_len( +/*==========*/ + ib_wqueue_t* wq) /*<! in: work queue */ +{ + ulint len = 0; + + mutex_enter(&wq->mutex); + len = ib_list_len(wq->items); + mutex_exit(&wq->mutex); + + return(len); +} |