264 files changed, 12898 insertions, 5199 deletions
diff --git a/storage/connect/JdbcInterface.java b/storage/connect/JdbcInterface.java
index f765052915d..34af8c4e013 100644
--- a/storage/connect/JdbcInterface.java
+++ b/storage/connect/JdbcInterface.java
@@ -340,6 +340,18 @@ public class JdbcInterface {
       return m;
     } // end of GetMaxValue
     
+    // Ask the driver metadata for the identifier quote string. On failure
+    // the SQL error is recorded via SetErrmsg and null is returned.
+    public String GetQuoteString() {
+      try {
+        return dbmd.getIdentifierQuoteString();
+      } catch(SQLException se) {
+        SetErrmsg(se);
+      } // end try/catch
+
+      return null;
+    } // end of GetQuoteString
+    
     public int GetColumns(String[] parms) {
       int ncol = -1;
       
@@ -680,11 +692,11 @@ public class JdbcInterface {
 	  return 0;  
 	} // end of TimestampField
     
-    public String ObjectField(int n, String name) {
+    public Object ObjectField(int n, String name) {
 	  if (rs == null) {
 		System.out.println("No result set");
 	  } else try {
-	    return (n > 0) ? rs.getObject(n).toString() : rs.getObject(name).toString();
+	    return (n > 0) ? rs.getObject(n) : rs.getObject(name);
 	  } catch (SQLException se) {
 		SetErrmsg(se);
 	  } //end try/catch
diff --git a/storage/connect/filamdbf.cpp b/storage/connect/filamdbf.cpp
index 327bcf376df..5e3dfd8fe60 100644
--- a/storage/connect/filamdbf.cpp
+++ b/storage/connect/filamdbf.cpp
@@ -383,7 +383,7 @@ DBFBASE::DBFBASE(DBFBASE *txfp)
 /*  and header length. Set Records, check that Reclen is equal to lrecl and */
 /*  return the header length or 0 in case of error.                         */
 /****************************************************************************/
-int DBFBASE::ScanHeader(PGLOBAL g, PSZ fname, int lrecl, char *defpath)
+int DBFBASE::ScanHeader(PGLOBAL g, PSZ fn, int lrecl, int *rln, char *defpath)
   {
   int       rc;
   char      filename[_MAX_PATH];
@@ -393,7 +393,7 @@ int DBFBASE::ScanHeader(PGLOBAL g, PSZ fname, int lrecl, char *defpath)
   /************************************************************************/
   /*  Open the input file.                                                */
   /************************************************************************/
-  PlugSetPath(filename, fname, defpath);
+  PlugSetPath(filename, fn, defpath);
 
   if (!(infile= global_fopen(g, MSGID_CANNOT_OPEN, filename, "rb")))
     return 0;              // Assume file does not exist
@@ -410,11 +410,7 @@ int DBFBASE::ScanHeader(PGLOBAL g, PSZ fname, int lrecl, char *defpath)
   } else if (rc == RC_FX)
     return -1;
 
-  if ((int)header.Reclen() != lrecl) {
-    sprintf(g->Message, MSG(BAD_LRECL), lrecl, header.Reclen());
-    return -1;
-    } // endif Lrecl
-
+	*rln = (int)header.Reclen();
   Records = (int)header.Records();
   return (int)header.Headlen();
   } // end of ScanHeader
@@ -431,9 +427,27 @@ int DBFFAM::Cardinality(PGLOBAL g)
   if (!g)
     return 1;
 
-  if (!Headlen)
-    if ((Headlen = ScanHeader(g, To_File, Lrecl, Tdbp->GetPath())) < 0)
-      return -1;                // Error in ScanHeader
+	if (!Headlen) {
+		int rln = 0;								// Record length in the file header
+
+		Headlen = ScanHeader(g, To_File, Lrecl, &rln, Tdbp->GetPath());
+
+		if (Headlen < 0)
+			return -1;                // Error in ScanHeader
+
+		if (rln && Lrecl != rln) {
+			// This happens always on some Linux platforms
+			sprintf(g->Message, MSG(BAD_LRECL), Lrecl, rln);
+
+			if (Accept) {
+				Lrecl = rln;
+				PushWarning(g, Tdbp);
+			} else
+				return -1;
+
+		} // endif rln
+
+	}	// endif Headlen
 
   // Set number of blocks for later use
   Block = (Records > 0) ? (Records + Nrec - 1) / Nrec : 0;
@@ -565,7 +579,13 @@ bool DBFFAM::AllocateBuffer(PGLOBAL g)
 
       if (Lrecl != reclen) {
         sprintf(g->Message, MSG(BAD_LRECL), Lrecl, reclen);
-        return true;
+
+				if (Accept) {
+					Lrecl = reclen;
+					PushWarning(g, Tdbp);
+				}	else
+					return true;
+
         } // endif Lrecl
 
       hlen = HEADLEN * (n + 1) + 2;
@@ -641,8 +661,14 @@ bool DBFFAM::AllocateBuffer(PGLOBAL g)
     if ((rc = dbfhead(g, Stream, Tdbp->GetFile(g), &header)) == RC_OK) {
       if (Lrecl != (int)header.Reclen()) {
         sprintf(g->Message, MSG(BAD_LRECL), Lrecl, header.Reclen());
-        return true;
-        } // endif Lrecl
+
+				if (Accept) {
+					Lrecl = header.Reclen();
+					PushWarning(g, Tdbp);
+				} else
+					return true;
+
+			} // endif Lrecl
 
       Records = (int)header.Records();
       Headlen = (int)header.Headlen();
@@ -916,9 +942,27 @@ int DBMFAM::Cardinality(PGLOBAL g)
   if (!g)
     return 1;
 
-  if (!Headlen)
-    if ((Headlen = ScanHeader(g, To_File, Lrecl, Tdbp->GetPath())) < 0)
-      return -1;                // Error in ScanHeader
+	if (!Headlen) {
+		int rln = 0;								// Record length in the file header
+
+		Headlen = ScanHeader(g, To_File, Lrecl, &rln, Tdbp->GetPath());
+
+		if (Headlen < 0)
+			return -1;                // Error in ScanHeader
+
+		if (rln && Lrecl != rln) {
+			// This happens always on some Linux platforms
+			sprintf(g->Message, MSG(BAD_LRECL), Lrecl, rln);
+
+			if (Accept) {
+				Lrecl = rln;
+				PushWarning(g, Tdbp);
+			} else
+				return -1;
+
+		} // endif rln
+
+	}	// endif Headlen
 
   // Set number of blocks for later use
   Block = (Records > 0) ? (Records + Nrec - 1) / Nrec : 0;
@@ -961,8 +1005,14 @@ bool DBMFAM::AllocateBuffer(PGLOBAL g)
 
     if (Lrecl != (int)hp->Reclen()) {
       sprintf(g->Message, MSG(BAD_LRECL), Lrecl, hp->Reclen());
-      return true;
-      } // endif Lrecl
+
+			if (Accept) {
+				Lrecl = hp->Reclen();
+				PushWarning(g, Tdbp);
+			} else
+				return true;
+
+		} // endif Lrecl
 
     Records = (int)hp->Records();
     Headlen = (int)hp->Headlen();
diff --git a/storage/connect/filamdbf.h b/storage/connect/filamdbf.h
index da84d7685a8..66458a10eaa 100644
--- a/storage/connect/filamdbf.h
+++ b/storage/connect/filamdbf.h
@@ -31,7 +31,7 @@ class DllExport DBFBASE {
   DBFBASE(PDBF txfp);
 
   // Implementation
-  int  ScanHeader(PGLOBAL g, PSZ fname, int lrecl, char *defpath);
+  int  ScanHeader(PGLOBAL g, PSZ fname, int lrecl, int *rlen, char *defpath);
 
  protected:
   // Default constructor, not to be used
diff --git a/storage/connect/ha_connect.cc b/storage/connect/ha_connect.cc
index a2ba6ffad13..419a33ed74e 100644
--- a/storage/connect/ha_connect.cc
+++ b/storage/connect/ha_connect.cc
@@ -224,6 +224,7 @@ uint    GetWorkSize(void);
 void    SetWorkSize(uint);
 extern "C" const char *msglang(void);
 
+static void PopUser(PCONNECT xp);
 static PCONNECT GetUser(THD *thd, PCONNECT xp);
 static PGLOBAL  GetPlug(THD *thd, PCONNECT& lxp);
 
@@ -831,34 +832,43 @@ ha_connect::~ha_connect(void)
                          table ? table->s->table_name.str : "<null>",
                          xp, xp ? xp->count : 0);
 
-  if (xp) {
-    PCONNECT p;
+	PopUser(xp);
+} // end of ha_connect destructor
 
-    xp->count--;
 
-    for (p= user_connect::to_users; p; p= p->next)
-      if (p == xp)
-        break;
+/****************************************************************************/
+/*  Check whether this user can be removed.                                 */
+/****************************************************************************/
+static void PopUser(PCONNECT xp)
+{
+	if (xp) {
+		xp->count--;
 
-    if (p && !p->count) {
-      if (p->next)
-        p->next->previous= p->previous;
+		if (!xp->count) {
+			PCONNECT p;
 
-      if (p->previous)
-        p->previous->next= p->next;
-      else
-        user_connect::to_users= p->next;
+			for (p= user_connect::to_users; p; p= p->next)
+			  if (p == xp)
+				  break;
 
-      } // endif p
+		  if (p) {
+			  if (p->next)
+				  p->next->previous= p->previous;
 
-    if (!xp->count) {
-      PlugCleanup(xp->g, true);
-      delete xp;
-      } // endif count
+			  if (p->previous)
+				  p->previous->next= p->next;
+			  else
+				  user_connect::to_users= p->next;
 
-    } // endif xp
+		  } // endif p
 
-} // end of ha_connect destructor
+			PlugCleanup(xp->g, true);
+			delete xp;
+		} // endif count
+
+	} // endif xp
+
+} // end of PopUser
 
 
 /****************************************************************************/
@@ -866,7 +876,7 @@ ha_connect::~ha_connect(void)
 /****************************************************************************/
 static PCONNECT GetUser(THD *thd, PCONNECT xp)
 {
-  if (!thd)
+	if (!thd)
     return NULL;
 
   if (xp && thd == xp->thdp)
@@ -890,7 +900,6 @@ static PCONNECT GetUser(THD *thd, PCONNECT xp)
   return xp;
 } // end of GetUser
 
-
 /****************************************************************************/
 /*  Get the global pointer of the user of this handler.                     */
 /****************************************************************************/
@@ -5260,7 +5269,18 @@ static int connect_assisted_discovery(handlerton *, THD* thd,
   if (!(shm= (char*)db))
     db= table_s->db.str;                   // Default value
 
-  // Check table type
+	// Save stack and allocation environment and prepare error return
+	if (g->jump_level == MAX_JUMP) {
+		strcpy(g->Message, MSG(TOO_MANY_JUMPS));
+		goto jer;
+	} // endif jump_level
+
+	if ((rc= setjmp(g->jumper[++g->jump_level])) != 0) {
+		my_message(ER_UNKNOWN_ERROR, g->Message, MYF(0));
+		goto err;
+	} // endif rc
+
+	// Check table type
   if (ttp == TAB_UNDEF) {
     topt->type= (src) ? "MYSQL" : (tab) ? "PROXY" : "DOS";
     ttp= GetTypeID(topt->type);
@@ -5269,20 +5289,9 @@ static int connect_assisted_discovery(handlerton *, THD* thd,
   } else if (ttp == TAB_NIY) {
     sprintf(g->Message, "Unsupported table type %s", topt->type);
     my_message(ER_UNKNOWN_ERROR, g->Message, MYF(0));
-    return HA_ERR_INTERNAL_ERROR;
+		goto err;
   } // endif ttp
 
-  // Save stack and allocation environment and prepare error return
-  if (g->jump_level == MAX_JUMP) {
-    strcpy(g->Message, MSG(TOO_MANY_JUMPS));
-    return HA_ERR_INTERNAL_ERROR;
-    } // endif jump_level
-
-  if ((rc= setjmp(g->jumper[++g->jump_level])) != 0) {
-    my_message(ER_UNKNOWN_ERROR, g->Message, MYF(0));
-    goto err;
-    } // endif rc
-
   if (!tab) {
     if (ttp == TAB_TBL) {
       // Make tab the first table of the list
@@ -5842,6 +5851,7 @@ static int connect_assisted_discovery(handlerton *, THD* thd,
       rc= init_table_share(thd, table_s, create_info, &sql);
 
     g->jump_level--;
+		PopUser(xp);
     return rc;
     } // endif ok
 
@@ -5849,7 +5859,9 @@ static int connect_assisted_discovery(handlerton *, THD* thd,
 
  err:
   g->jump_level--;
-  return HA_ERR_INTERNAL_ERROR;
+ jer:
+	PopUser(xp);
+	return HA_ERR_INTERNAL_ERROR;
 } // end of connect_assisted_discovery
 
 /**
diff --git a/storage/connect/jdbconn.cpp b/storage/connect/jdbconn.cpp
index 3b8de3e975b..dca9bd0eac4 100644
--- a/storage/connect/jdbconn.cpp
+++ b/storage/connect/jdbconn.cpp
@@ -498,145 +498,6 @@ PQRYRES JDBCDrivers(PGLOBAL g, int maxres, bool info)
 	return qrp;
 } // end of JDBCDrivers
 
-#if 0
-/*************************************************************************/
-/*  JDBCDataSources: constructs the result blocks containing all JDBC    */
-/*  data sources available on the local host.                            */
-/*  Called with info=true to have result column names.                   */
-/*************************************************************************/
-PQRYRES JDBCDataSources(PGLOBAL g, int maxres, bool info)
-{
-	int      buftyp[] ={ TYPE_STRING, TYPE_STRING };
-	XFLD     fldtyp[] ={ FLD_NAME, FLD_REM };
-	unsigned int length[] ={ 0, 256 };
-	bool     b[] ={ false, true };
-	int      i, n = 0, ncol = 2;
-	PCOLRES  crp;
-	PQRYRES  qrp;
-	JDBConn *jcp = NULL;
-
-	/************************************************************************/
-	/*  Do an evaluation of the result size.                                */
-	/************************************************************************/
-	if (!info) {
-		jcp = new(g)JDBConn(g, NULL);
-		n = jcp->GetMaxValue(SQL_MAX_DSN_LENGTH);
-		length[0] = (n) ? (n + 1) : 256;
-
-		if (!maxres)
-			maxres = 512;         // Estimated max number of data sources
-
-	} else {
-		length[0] = 256;
-		maxres = 0;
-	} // endif info
-
-	if (trace)
-		htrc("JDBCDataSources: max=%d len=%d\n", maxres, length[0]);
-
-	/************************************************************************/
-	/*  Allocate the structures used to refer to the result set.            */
-	/************************************************************************/
-	qrp = PlgAllocResult(g, ncol, maxres, IDS_DSRC,
-		buftyp, fldtyp, length, false, true);
-
-	for (i = 0, crp = qrp->Colresp; crp; i++, crp = crp->Next)
-		if (b[i])
-			crp->Kdata->SetNullable(true);
-
-	/************************************************************************/
-	/*  Now get the results into blocks.                                    */
-	/************************************************************************/
-	if (!info && qrp && jcp->GetDataSources(qrp))
-		qrp = NULL;
-
-	/************************************************************************/
-	/*  Return the result pointer for use by GetData routines.              */
-	/************************************************************************/
-	return qrp;
-} // end of JDBCDataSources
-
-/**************************************************************************/
-/*  PrimaryKeys: constructs the result blocks containing all the          */
-/*  JDBC catalog information concerning primary keys.                     */
-/**************************************************************************/
-PQRYRES JDBCPrimaryKeys(PGLOBAL g, JDBConn *op, char *dsn, char *table)
-{
-	static int buftyp[] ={ TYPE_STRING, TYPE_STRING, TYPE_STRING,
-		TYPE_STRING, TYPE_SHORT, TYPE_STRING };
-	static unsigned int length[] ={ 0, 0, 0, 0, 6, 128 };
-	int      n, ncol = 5;
-	int     maxres;
-	PQRYRES  qrp;
-	JCATPARM *cap;
-	JDBConn *jcp = op;
-
-	if (!op) {
-		/**********************************************************************/
-		/*  Open the connection with the JDBC data source.                    */
-		/**********************************************************************/
-		jcp = new(g)JDBConn(g, NULL);
-
-		if (jcp->Open(dsn, 2) < 1)        // 2 is openReadOnly
-			return NULL;
-
-	} // endif op
-
-	/************************************************************************/
-	/*  Do an evaluation of the result size.                                */
-	/************************************************************************/
-	n = jcp->GetMaxValue(SQL_MAX_COLUMNS_IN_TABLE);
-	maxres = (n) ? (int)n : 250;
-	n = jcp->GetMaxValue(SQL_MAX_CATALOG_NAME_LEN);
-	length[0] = (n) ? (n + 1) : 128;
-	n = jcp->GetMaxValue(SQL_MAX_SCHEMA_NAME_LEN);
-	length[1] = (n) ? (n + 1) : 128;
-	n = jcp->GetMaxValue(SQL_MAX_TABLE_NAME_LEN);
-	length[2] = (n) ? (n + 1) : 128;
-	n = jcp->GetMaxValue(SQL_MAX_COLUMN_NAME_LEN);
-	length[3] = (n) ? (n + 1) : 128;
-
-	if (trace)
-		htrc("JDBCPrimaryKeys: max=%d len=%d,%d,%d\n",
-		maxres, length[0], length[1], length[2]);
-
-	/************************************************************************/
-	/*  Allocate the structure used to refer to the result set.             */
-	/************************************************************************/
-	qrp = PlgAllocResult(g, ncol, maxres, IDS_PKEY,
-		buftyp, NULL, length, false, true);
-
-	if (trace)
-		htrc("Getting pkey results ncol=%d\n", qrp->Nbcol);
-
-	cap = AllocCatInfo(g, CAT_KEY, NULL, table, qrp);
-
-	/************************************************************************/
-	/*  Now get the results into blocks.                                    */
-	/************************************************************************/
-	if ((n = jcp->GetCatInfo(cap)) >= 0) {
-		qrp->Nblin = n;
-		//  ResetNullValues(cap);
-
-		if (trace)
-			htrc("PrimaryKeys: NBCOL=%d NBLIN=%d\n", qrp->Nbcol, qrp->Nblin);
-
-	} else
-		qrp = NULL;
-
-	/************************************************************************/
-	/*  Close any local connection.                                         */
-	/************************************************************************/
-	if (!op)
-		jcp->Close();
-
-	/************************************************************************/
-	/*  Return the result pointer for use by GetData routines.              */
-	/************************************************************************/
-	return qrp;
-} // end of JDBCPrimaryKeys
-#endif // 0
-
 /***********************************************************************/
 /*  JDBConn construction/destruction.                                  */
 /***********************************************************************/
@@ -651,7 +512,7 @@ JDBConn::JDBConn(PGLOBAL g, TDBJDBC *tdbp)
 	xqid = xuid = xid = grs = readid = fetchid = typid = errid = nullptr;
 	prepid = xpid = pcid = nullptr;
 	chrfldid = intfldid = dblfldid = fltfldid = bigfldid = nullptr;
-	datfldid = timfldid = tspfldid = nullptr;
+	objfldid = datfldid = timfldid = tspfldid = nullptr;
 	//m_LoginTimeout = DEFAULT_LOGIN_TIMEOUT;
 //m_QueryTimeout = DEFAULT_QUERY_TIMEOUT;
 //m_UpdateOptions = 0;
@@ -739,60 +600,6 @@ bool  JDBConn::gmID(PGLOBAL g, jmethodID& mid, const char *name, const char *sig
 
 } // end of gmID
 
-#if 0
-/***********************************************************************/
-/*  Utility routine.                                                   */
-/***********************************************************************/
-PSZ JDBConn::GetStringInfo(ushort infotype)
-{
-	//ASSERT(m_hdbc != SQL_NULL_HDBC);
-	char   *p, buffer[MAX_STRING_INFO];
-	SWORD   result;
-	RETCODE rc;
-
-	rc = SQLGetInfo(m_hdbc, infotype, buffer, sizeof(buffer), &result);
-
-	if (!Check(rc)) {
-		ThrowDJX(rc, "SQLGetInfo");  // Temporary
-		//  *buffer = '\0';
-	} // endif rc
-
-	p = PlugDup(m_G, buffer);
-	return p;
-} // end of GetStringInfo
-
-/***********************************************************************/
-/*  Utility routines.                                                  */
-/***********************************************************************/
-void JDBConn::OnSetOptions(HSTMT hstmt)
-{
-	RETCODE rc;
-	ASSERT(m_hdbc != SQL_NULL_HDBC);
-
-	if ((signed)m_QueryTimeout != -1) {
-		// Attempt to set query timeout.  Ignore failure
-		rc = SQLSetStmtOption(hstmt, SQL_QUERY_TIMEOUT, m_QueryTimeout);
-
-		if (!Check(rc))
-			// don't attempt it again
-			m_QueryTimeout = (DWORD)-1;
-
-	} // endif m_QueryTimeout
-
-	if (m_RowsetSize > 0) {
-		// Attempt to set rowset size.
-		// In case of failure reset it to 0 to use Fetch.
-		rc = SQLSetStmtOption(hstmt, SQL_ROWSET_SIZE, m_RowsetSize);
-
-		if (!Check(rc))
-			// don't attempt it again
-			m_RowsetSize = 0;
-
-	} // endif m_RowsetSize
-
-} // end of OnSetOptions
-#endif // 0
-
 /***********************************************************************/
 /*  Utility routine.                                                   */
 /***********************************************************************/
@@ -1007,7 +814,7 @@ int JDBConn::Open(PJPARM sop)
 #define N 1
 #endif
 
-		// Java source will be compiled as ajar file installed in the plugin dir
+		// Java source will be compiled as a jar file installed in the plugin dir
 		jpop->Append(sep);
 		jpop->Append(GetPluginDir());
 		jpop->Append("JdbcInterface.jar");
@@ -1204,6 +1011,21 @@ int JDBConn::Open(PJPARM sop)
 		return RC_FX;
 	}	// endif Msg
 
+	jmethodID qcid = nullptr;						// The GetQuoteString method ID
+	if (!gmID(g, qcid, "GetQuoteString", "()Ljava/lang/String;")) {
+		jstring s = (jstring)env->CallObjectMethod(job, qcid);
+		if (s != nullptr) {
+			// GetStringUTFChars can fail (NULL) and its buffer must be released
+			if (const char *qch = env->GetStringUTFChars(s, nullptr)) {
+				m_IDQuoteChar[0] = *qch;		// Keep only the first quote character
+				env->ReleaseStringUTFChars(s, qch);
+			}	// endif qch
+		} else {
+			s = (jstring)env->CallObjectMethod(job, errid);
+			Msg = (char*)env->GetStringUTFChars(s, (jboolean)false);
+		}	// endif s
+	}	// endif qcid
+
 	if (gmID(g, typid, "ColumnType", "(ILjava/lang/String;)I"))
 		return RC_FX;
 	else
@@ -1345,9 +1167,10 @@ void JDBConn::Close()
 /***********************************************************************/
 void JDBConn::SetColumnValue(int rank, PSZ name, PVAL val)
 {
-	PGLOBAL&   g = m_G;
-	jint       ctyp;
-	jstring    cn, jn = nullptr;
+	PGLOBAL& g = m_G;
+	jint     ctyp;
+	jstring  cn, jn = nullptr;
+	jobject  jb = nullptr;
 
 	if (rank == 0)
 		if (!name || (jn = env->NewStringUTF(name)) == nullptr) {
@@ -1363,21 +1186,32 @@ void JDBConn::SetColumnValue(int rank, PSZ name, PVAL val)
 		longjmp(g->jumper[g->jump_level], TYPE_AM_JDBC);
 	} // endif Check
 
+	if (val->GetNullable())		// Only nullable values are probed for NULL
+		if (!gmID(g, objfldid, "ObjectField", "(ILjava/lang/String;)Ljava/lang/Object;")) {
+			jb = env->CallObjectMethod(job, objfldid, (jint)rank, jn);
+
+			if (jb == nullptr) {		// No object returned: set the value to NULL
+				val->Reset();
+				val->SetNull(true);
+				goto chk;					// Skip the type switch, go to the final Check
+			}	// endif jb
+
+		}	// endif objfldid
+
 	switch (ctyp) {
 	case 12:          // VARCHAR
 	case -1:          // LONGVARCHAR
 	case 1:           // CHAR
-		if (!gmID(g, chrfldid, "StringField", "(ILjava/lang/String;)Ljava/lang/String;")) {
+		if (jb)
+			cn = (jstring)jb;
+		else if (!gmID(g, chrfldid, "StringField", "(ILjava/lang/String;)Ljava/lang/String;"))
 			cn = (jstring)env->CallObjectMethod(job, chrfldid, (jint)rank, jn);
+		else
+			cn = nullptr;
 
-			if (cn) {
-				const char *field = env->GetStringUTFChars(cn, (jboolean)false);
-				val->SetValue_psz((PSZ)field);
-			} else {
-				val->Reset();
-				val->SetNull(true);
-			} // endif cn
-
+		if (cn) {
+			const char *field = env->GetStringUTFChars(cn, (jboolean)false);
+			val->SetValue_psz((PSZ)field);
 		} else
 			val->Reset();
 
@@ -1449,6 +1283,7 @@ void JDBConn::SetColumnValue(int rank, PSZ name, PVAL val)
 		val->Reset();
 	} // endswitch Type
 
+ chk:
 	if (Check()) {
 		if (rank == 0)
 			env->DeleteLocalRef(jn);
diff --git a/storage/connect/jdbconn.h b/storage/connect/jdbconn.h
index 095b1565bd2..0a1c52d4576 100644
--- a/storage/connect/jdbconn.h
+++ b/storage/connect/jdbconn.h
@@ -165,6 +165,7 @@ protected:
 	jmethodID xpid;										  // The ExecutePrep method ID
 	jmethodID pcid;										  // The ClosePrepStmt method ID
 	jmethodID errid;										// The GetErrmsg method ID
+	jmethodID objfldid;									// The ObjectField method ID
 	jmethodID chrfldid;									// The StringField method ID
 	jmethodID intfldid;									// The IntField method ID
 	jmethodID dblfldid;									// The DoubleField method ID
diff --git a/storage/connect/reldef.cpp b/storage/connect/reldef.cpp
index 8a6174ea53b..56a7ea8c512 100644
--- a/storage/connect/reldef.cpp
+++ b/storage/connect/reldef.cpp
@@ -294,7 +294,7 @@ int TABDEF::GetColCatInfo(PGLOBAL g)
 				nlg+= nof;
       case TAB_DIR:
       case TAB_XML:
-        poff= loff + 1;
+				poff= loff + (pcf->Flags & U_VIRTUAL ? 0 : 1);
         break;
       case TAB_INI:
       case TAB_MAC:
@@ -440,7 +440,11 @@ int TABDEF::GetColCatInfo(PGLOBAL g)
       } // endswitch tc
 
 		// lrecl must be at least recln to avoid buffer overflow
-		recln= MY_MAX(recln, Hc->GetIntegerOption("Lrecl"));
+		if (trace)
+			htrc("Lrecl: Calculated=%d defined=%d\n", 
+			  recln, Hc->GetIntegerOption("Lrecl"));
+
+		recln = MY_MAX(recln, Hc->GetIntegerOption("Lrecl"));
 		Hc->SetIntegerOption("Lrecl", recln);
 		((PDOSDEF)this)->SetLrecl(recln);
 		} // endif Lrecl
diff --git a/storage/connect/tabjdbc.cpp b/storage/connect/tabjdbc.cpp
index 2726d2207dc..93ba3ca264d 100644
--- a/storage/connect/tabjdbc.cpp
+++ b/storage/connect/tabjdbc.cpp
@@ -686,6 +686,9 @@ bool TDBJDBC::MakeInsert(PGLOBAL g)
 	else
 		Prepared = true;
 
+	if (trace)
+		htrc("Insert=%s\n", Query->GetStr());
+
 	return false;
 } // end of MakeInsert
 
@@ -733,17 +736,18 @@ bool TDBJDBC::MakeCommand(PGLOBAL g)
 	// If so, it must be quoted in the original query
 	strlwr(strcat(strcat(strcpy(name, " "), Name), " "));
 
-	if (!strstr(" update delete low_priority ignore quick from ", name))
-		strlwr(strcpy(name, Name));     // Not a keyword
-	else
+	if (strstr(" update delete low_priority ignore quick from ", name)) {
 		strlwr(strcat(strcat(strcpy(name, qc), Name), qc));
+		k += 2;
+	} else
+		strlwr(strcpy(name, Name));     // Not a keyword
 
 	if ((p = strstr(qrystr, name))) {
 		for (i = 0; i < p - qrystr; i++)
 			stmt[i] = (Qrystr[i] == '`') ? *qc : Qrystr[i];
 
 		stmt[i] = 0;
-		k = i + (int)strlen(Name);
+		k += i + (int)strlen(Name);
 
 		if (qtd && *(p-1) == ' ')
 			strcat(strcat(strcat(stmt, qc), TableName), qc);
@@ -765,6 +769,9 @@ bool TDBJDBC::MakeCommand(PGLOBAL g)
 		return 1;
 	} // endif p
 
+	if (trace)
+		htrc("Command=%s\n", stmt);
+
 	Query = new(g)STRING(g, 0, stmt);
 	return (!Query->GetSize());
 } // end of MakeCommand
@@ -1214,6 +1221,10 @@ int TDBJDBC::WriteDB(PGLOBAL g)
 	} // endif oom
 
 	Query->RepLast(')');
+
+	if (trace > 1)
+		htrc("Inserting: %s\n", Query->GetStr());
+
 	rc = Jcp->ExecuteUpdate(Query->GetStr());
 	Query->Truncate(len);     // Restore query
 
diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc
index 31ba56d2993..2b16c6ca8bd 100644
--- a/storage/innobase/btr/btr0btr.cc
+++ b/storage/innobase/btr/btr0btr.cc
@@ -175,14 +175,16 @@ btr_root_block_get(
 
 
 	if (!block) {
-		index->table->is_encrypted = TRUE;
-		index->table->corrupted = FALSE;
-
-		ib_push_warning(index->table->thd, DB_DECRYPTION_FAILED,
-			"Table %s in tablespace %lu is encrypted but encryption service or"
-			" used key_id is not available. "
-			" Can't continue reading table.",
-			index->table->name, space);
+		if (index && index->table) {
+			index->table->is_encrypted = TRUE;
+			index->table->corrupted = FALSE;
+
+			ib_push_warning(index->table->thd, DB_DECRYPTION_FAILED,
+				"Table %s in tablespace %lu is encrypted but encryption service or"
+				" used key_id is not available. "
+				" Can't continue reading table.",
+				index->table->name, space);
+		}
 
 		return NULL;
 	}
@@ -1319,6 +1321,11 @@ leaf_loop:
 
 	page_t*	root = block->frame;
 
+	if (!root) {
+		mtr_commit(&mtr);
+		return;
+	}
+
 #ifdef UNIV_BTR_DEBUG
 	ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
 				    + root, block->page.id.space()));
@@ -1399,10 +1406,12 @@ btr_free(
 	buf_block_t*	block = buf_page_get(
 		page_id, page_size, RW_X_LATCH, &mtr);
 
-	ut_ad(page_is_root(block->frame));
+	if (block) {
+		ut_ad(page_is_root(block->frame));
 
-	btr_free_but_not_root(block, MTR_LOG_NO_REDO);
-	btr_free_root(block, &mtr);
+		btr_free_but_not_root(block, MTR_LOG_NO_REDO);
+		btr_free_root(block, &mtr);
+	}
 	mtr.commit();
 }
 #endif /* !UNIV_HOTBACKUP */
diff --git a/storage/innobase/btr/btr0scrub.cc b/storage/innobase/btr/btr0scrub.cc
index 8ed0117b36e..f361a9d8b1e 100644
--- a/storage/innobase/btr/btr0scrub.cc
+++ b/storage/innobase/btr/btr0scrub.cc
@@ -366,12 +366,17 @@ btr_optimistic_scrub(
 
 	/* We play safe and reset the free bits */
 	if (!dict_index_is_clust(index) &&
-	    page_is_leaf(buf_block_get_frame(block))) {
+	    block != NULL) {
+		buf_frame_t* frame = buf_block_get_frame(block);
+		if (frame &&
+		    page_is_leaf(frame)) {
 
 			ibuf_reset_free_bits(block);
+		}
 	}
 
 	scrub_data->scrub_stat.page_reorganizations++;
+
 	return DB_SUCCESS;
 }
 
@@ -486,9 +491,13 @@ btr_pessimistic_scrub(
 		/* We play safe and reset the free bits
 		* NOTE: need to call this prior to btr_page_split_and_insert */
 		if (!dict_index_is_clust(index) &&
-		    page_is_leaf(buf_block_get_frame(block))) {
+		    block != NULL) {
+			buf_frame_t* frame = buf_block_get_frame(block);
+			if (frame &&
+			    page_is_leaf(frame)) {
 
-			ibuf_reset_free_bits(block);
+				ibuf_reset_free_bits(block);
+			}
 		}
 
 		rec = btr_page_split_and_insert(
@@ -787,11 +796,8 @@ btr_scrub_page(
 		return BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE;
 	}
 
-	buf_frame_t* frame = NULL;
+	buf_frame_t* frame = buf_block_get_frame(block);
 
-	if (block) {
-		frame = buf_block_get_frame(block);
-	}
 	if (!frame || btr_page_get_index_id(frame) !=
 	    scrub_data->current_index->id) {
 		/* page has been reallocated to new index */
diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc
index 43683d63c97..ffd739bc686 100644
--- a/storage/innobase/buf/buf0buf.cc
+++ b/storage/innobase/buf/buf0buf.cc
@@ -130,6 +130,11 @@ struct set_numa_interleave_t
 #define NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE
 #endif /* HAVE_LIBNUMA && WITH_NUMA */
 
+/* Enable this for checksum error messages. */
+//#ifdef UNIV_DEBUG
+//#define UNIV_DEBUG_LEVEL2 1
+//#endif
+
 /*
 		IMPLEMENTATION OF THE BUFFER POOL
 		=================================
@@ -407,6 +412,9 @@ buf_pool_register_chunk(
 		chunk->blocks->frame, chunk));
 }
 
+/* prototypes for new functions added to ha_innodb.cc */
+trx_t* innobase_get_trx();
+
 /********************************************************************//**
 Check if page is maybe compressed, encrypted or both when we encounter
 corrupted page. Note that we can't be 100% sure if page is corrupted
@@ -641,19 +649,26 @@ buf_page_is_checksum_valid_crc32(
 #endif /* UNIV_INNOCHECKSUM */
 
 	if (checksum_field1 != checksum_field2) {
-		return(false);
+		goto invalid;
 	}
 
 	if (checksum_field1 == crc32) {
 		return(true);
-	}
-
-	const uint32_t	crc32_legacy = buf_calc_page_crc32(read_buf, true);
+	} else {
+		const uint32_t	crc32_legacy = buf_calc_page_crc32(read_buf, true);
 
-	if (checksum_field1 == crc32_legacy) {
-		return(true);
+		if (checksum_field1 == crc32_legacy) {
+			return(true);
+		}
 	}
 
+invalid:
+#ifdef UNIV_DEBUG_LEVEL2
+	ib::info() << "Page checksum crc32 not valid"
+		   << " field1 " << checksum_field1
+		   << " field2 " << checksum_field2
+		   << " crc32 " << crc32;
+#endif
 	return(false);
 }
 
@@ -725,6 +740,13 @@ buf_page_is_checksum_valid_innodb(
 
 	if (checksum_field2 != mach_read_from_4(read_buf + FIL_PAGE_LSN)
 	    && checksum_field2 != old_checksum) {
+#ifdef UNIV_DEBUG_LEVEL2
+		ib::info() << "Page checksum innodb not valid"	/* field2 is neither LSN nor old checksum */
+			   << " field1 " << checksum_field1
+			   << " field2 " << checksum_field2
+			   << " old checksum " << buf_calc_page_old_checksum(read_buf)
+			   << " lsn " << mach_read_from_4(read_buf + FIL_PAGE_LSN);
+#endif /* UNIV_DEBUG_LEVEL2 */
 		return(false);
 	}
 
@@ -734,6 +756,13 @@ buf_page_is_checksum_valid_innodb(
 	(always equal to 0), to FIL_PAGE_SPACE_OR_CHKSUM */
 
 	if (checksum_field1 != 0 && checksum_field1 != new_checksum) {
+#ifdef UNIV_DEBUG_LEVEL2
+		ib::info() << "Page checksum innodb not valid"	/* field1 is neither 0 nor new checksum */
+			   << " field1 " << checksum_field1
+			   << " field2 " << checksum_field2
+			   << " new checksum " << buf_calc_page_new_checksum(read_buf)
+			   << " lsn " << mach_read_from_4(read_buf + FIL_PAGE_LSN);
+#endif /* UNIV_DEBUG_LEVEL2 */
 		return(false);
 	}
 
@@ -763,6 +792,15 @@ buf_page_is_checksum_valid_none(
 #endif	/* UNIV_INNOCHECKSUM */
 	)
 {
+#ifdef UNIV_DEBUG_LEVEL2
+	if (!(checksum_field1 == checksum_field2 || checksum_field1 == BUF_NO_CHECKSUM_MAGIC)) {
+		ib::info() << "Page checksum none not valid"	/* fields differ and field1 is not the magic */
+			   << " field1 " << checksum_field1
+			   << " field2 " << checksum_field2
+			   << " magic " << BUF_NO_CHECKSUM_MAGIC
+			   << " lsn " << mach_read_from_4(read_buf + FIL_PAGE_LSN);
+	}
+#endif /* UNIV_DEBUG_LEVEL2 */
 
 #ifdef UNIV_INNOCHECKSUM
 	if (is_log_enabled
@@ -806,9 +844,24 @@ buf_page_is_corrupted(
 #endif /* UNIV_INNOCHECKSUM */
 )
 {
-	ulint		page_encrypted = (mach_read_from_4(read_buf+FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION) != 0);
 	ulint		checksum_field1;
 	ulint		checksum_field2;
+	bool page_encrypted = false;
+
+#ifndef UNIV_INNOCHECKSUM // FIXME see also encryption.innochecksum test
+	ulint 		space_id = mach_read_from_4(
+		read_buf + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+	fil_space_crypt_t* crypt_data = fil_space_get_crypt_data(space_id);
+
+	/* Page is encrypted if encryption information is found from
+	tablespace and page contains used key_version. This is true
+	also for pages first compressed and then encrypted. */
+	if (crypt_data &&
+	    crypt_data->type != CRYPT_SCHEME_UNENCRYPTED &&
+	    fil_page_is_encrypted(read_buf)) {
+		page_encrypted = true;
+	}
+#endif
 
 	DBUG_EXECUTE_IF("buf_page_is_corrupt_failure", return(TRUE); );
 
@@ -820,6 +873,13 @@ buf_page_is_corrupted(
 		/* Stored log sequence numbers at the start and the end
 		of page do not match */
 
+#ifndef UNIV_INNOCHECKSUM
+		ib::info() << "Log sequence number at the start "
+			   << mach_read_from_4(read_buf + FIL_PAGE_LSN + 4)
+			   << " and the end "
+			   << mach_read_from_4(read_buf + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)
+			   << " do not match";
+#endif
 		return(TRUE);
 	}
 
@@ -907,6 +967,10 @@ buf_page_is_corrupted(
 			     || i >= FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID)
 			    && read_buf[i] != 0) {
 
+#ifndef UNIV_INNOCHECKSUM
+				ib::info() << "Checksum fields zero but page is not empty.";
+#endif
+
 				break;
 			}
 		}
@@ -3411,6 +3475,11 @@ page_found:
 	buf_pool->watch[]. However, it is not in the critical code path
 	as this function will be called only by the purge thread. */
 
+/* Define this to get checksum error messages. Currently defined by
+default in UNIV_DEBUG builds to help diagnose encryption bugs. */
+#ifdef UNIV_DEBUG
+#define UNIV_DEBUG_LEVEL2 1
+#endif
 
 	/* To obey latching order first release the hash_lock. */
 	rw_lock_x_unlock(*hash_lock);
@@ -5794,16 +5863,16 @@ buf_page_check_corrupt(
 {
 	byte* dst_frame = (bpage->zip.data) ? bpage->zip.data :
 		((buf_block_t*) bpage)->frame;
-	unsigned key_version = bpage->key_version;
 	bool page_compressed = bpage->page_encrypted;
 	ulint stored_checksum = bpage->stored_checksum;
-	ulint calculated_checksum = bpage->stored_checksum;
+	ulint calculated_checksum = bpage->calculated_checksum;
 	bool page_compressed_encrypted = bpage->page_compressed;
 	ulint space_id = mach_read_from_4(
 		dst_frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
 	fil_space_crypt_t* crypt_data = fil_space_get_crypt_data(space_id);
 	fil_space_t* space = fil_space_found_by_id(space_id);
 	bool corrupted = true;
+	ulint key_version = bpage->key_version;
 
 	if (key_version != 0 || page_compressed_encrypted) {
 		bpage->encrypted = true;
@@ -5893,7 +5962,7 @@ buf_page_io_complete(
 	if (io_type == BUF_IO_READ) {
 		ulint	read_page_no;
 		ulint	read_space_id;
-		byte*	frame;
+		byte*	frame = NULL;
 		bool	compressed_page=false;
 
 		ut_ad(bpage->zip.data != NULL || ((buf_block_t*)bpage)->frame != NULL);
@@ -5905,7 +5974,15 @@ buf_page_io_complete(
 			} else {
 				frame = ((buf_block_t*) bpage)->frame;
 			}
-			goto corrupt;
+
+			ib::info() << "Page "
+				   << bpage->id
+				   << " in tablespace "
+				   << bpage->space
+				   << " encryption error key_version "
+				   << bpage->key_version;
+
+			goto database_corrupted;
 		}
 
 		if (bpage->size.is_compressed()) {
@@ -5918,7 +5995,14 @@ buf_page_io_complete(
 
 				buf_pool->n_pend_unzip--;
 				compressed_page = false;
-				goto corrupt;
+
+				ib::info() << "Page "
+					   << bpage->id
+					   << " in tablespace "
+					   << bpage->space
+					   << " zip_decompress failure.";
+
+				goto database_corrupted;
 			}
 			buf_pool->n_pend_unzip--;
 		} else {
@@ -6007,7 +6091,7 @@ buf_page_io_complete(
 				}
 				goto page_not_corrupt;
 				;);
-corrupt:
+database_corrupted:
 			bool corrupted = buf_page_check_corrupt(bpage);
 
 			/* Compressed and encrypted pages are basically gibberish avoid
@@ -6047,6 +6131,7 @@ corrupt:
 					return(false);
 				} else {
 					corrupted = buf_page_check_corrupt(bpage);
+					ulint key_version = bpage->key_version;
 
 					if (corrupted) {
 						ib::fatal()
@@ -6063,7 +6148,7 @@ corrupt:
 						"However key management plugin or used key_id %u is not found or"
 						" used encryption algorithm or method does not match."
 						" Can't continue opening the table.",
-						(ulint)bpage->space, bpage->key_version);
+						(ulint)bpage->space, key_version);
 
 					buf_page_print(frame, bpage->size, BUF_PAGE_PRINT_NO_CRASH);
 
@@ -7434,12 +7519,12 @@ buf_page_encrypt_before_write(
 		return src_frame;
 	}
 
-	if (crypt_data != NULL && crypt_data->encryption == FIL_SPACE_ENCRYPTION_OFF) {
+	if (crypt_data != NULL && crypt_data->not_encrypted()) {
 		/* Encryption is disabled */
 		encrypted = false;
 	}
 
-	if (!srv_encrypt_tables && (crypt_data == NULL || crypt_data->encryption == FIL_SPACE_ENCRYPTION_DEFAULT)) {
+	if (!srv_encrypt_tables && (crypt_data == NULL || crypt_data->is_default_encryption())) {
 		/* Encryption is disabled */
 		encrypted = false;
 	}
@@ -7544,6 +7629,35 @@ buf_page_decrypt_after_read(
 	bool page_compressed_encrypted = fil_page_is_compressed_encrypted(dst_frame);
 	buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
 	bool success = true;
+	ulint 		space_id = mach_read_from_4(
+		dst_frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+	fil_space_crypt_t* crypt_data = fil_space_get_crypt_data(space_id);
+
+	/* If the tablespace has no encryption information, or it is marked
+	unencrypted although key_version is nonzero, the key_version cannot
+	be taken at face value and is verified below. */
+	if (!crypt_data ||
+	    (crypt_data &&
+	     crypt_data->type == CRYPT_SCHEME_UNENCRYPTED &&
+	     key_version != 0)) {
+		byte*	frame = NULL;
+
+		if (bpage->size.is_compressed()) {
+			frame = bpage->zip.data;
+		} else {
+			frame = ((buf_block_t*) bpage)->frame;
+		}
+
+		/* If page is not corrupted at this point, page can't be
+		encrypted, thus set key_version to 0. If page is corrupted,
+		we assume at this point that it is encrypted as page
+		contained key_version != 0. Note that page could still be
+		really corrupted. This we will find out after decrypt by
+		checking page checksums. */
+		if (!buf_page_is_corrupted(false, frame, bpage->size, false)) {
+			key_version = 0;
+		}
+	}
 
 	/* If page is encrypted read post-encryption checksum */
 	if (!page_compressed_encrypted && key_version != 0) {
@@ -7648,4 +7762,3 @@ buf_page_decrypt_after_read(
 	return (success);
 }
 #endif /* !UNIV_INNOCHECKSUM */
-
diff --git a/storage/innobase/dict/dict0stats.cc b/storage/innobase/dict/dict0stats.cc
index ff5162a68c4..5cb6d9714c8 100644
--- a/storage/innobase/dict/dict0stats.cc
+++ b/storage/innobase/dict/dict0stats.cc
@@ -700,7 +700,10 @@ void
 dict_stats_copy(
 /*============*/
 	dict_table_t*		dst,	/*!< in/out: destination table */
-	const dict_table_t*	src)	/*!< in: source table */
+	const dict_table_t*	src,	/*!< in: source table */
+	bool reset_ignored_indexes)	/*!< in: if true, set ignored indexes
+                                             to have the same statistics as if
+                                             the table was empty */
 {
 	dst->stats_last_recalc = src->stats_last_recalc;
 	dst->stat_n_rows = src->stat_n_rows;
@@ -719,7 +722,16 @@ dict_stats_copy(
 	      && (src_idx = dict_table_get_next_index(src_idx)))) {
 
 		if (dict_stats_should_ignore_index(dst_idx)) {
-			continue;
+			if (reset_ignored_indexes) {
+				/* Reset index statistics for all ignored indexes,
+				unless they are FT indexes (these have no statistics)*/
+				if (dst_idx->type & DICT_FTS) {
+					continue;
+				}
+				dict_stats_empty_index(dst_idx, true);
+			} else {
+				continue;
+			}
 		}
 
 		ut_ad(!dict_index_is_ibuf(dst_idx));
@@ -818,7 +830,7 @@ dict_stats_snapshot_create(
 
 	t = dict_stats_table_clone_create(table);
 
-	dict_stats_copy(t, table);
+	dict_stats_copy(t, table, false);
 
 	t->stat_persistent = table->stat_persistent;
 	t->stats_auto_recalc = table->stats_auto_recalc;
@@ -3283,13 +3295,10 @@ dict_stats_update(
 
 			dict_table_stats_lock(table, RW_X_LATCH);
 
-			/* Initialize all stats to dummy values before
-			copying because dict_stats_table_clone_create() does
-			skip corrupted indexes so our dummy object 't' may
-			have less indexes than the real object 'table'. */
-			dict_stats_empty_table(table, true);
-
-			dict_stats_copy(table, t);
+			/* Pass reset_ignored_indexes=true as parameter
+			to dict_stats_copy. This will cause statistics
+			for corrupted indexes to be set to empty values */
+			dict_stats_copy(table, t, true);
 
 			dict_stats_assert_initialized(table);
 
diff --git a/storage/innobase/fil/fil0crypt.cc b/storage/innobase/fil/fil0crypt.cc
index f68eabba579..b4c273299f3 100644
--- a/storage/innobase/fil/fil0crypt.cc
+++ b/storage/innobase/fil/fil0crypt.cc
@@ -95,17 +95,6 @@ extern uint srv_background_scrub_data_check_interval;
 static fil_crypt_stat_t crypt_stat;
 static ib_mutex_t crypt_stat_mutex;
 
-#ifdef UNIV_PFS_MUTEX
-UNIV_INTERN mysql_pfs_key_t fil_crypt_stat_mutex_key;
-#endif
-
-/**
- * key for crypt data mutex
-*/
-#ifdef UNIV_PFS_MUTEX
-UNIV_INTERN mysql_pfs_key_t fil_crypt_data_mutex_key;
-#endif
-
 static bool
 fil_crypt_needs_rotation(
 /*=====================*/
@@ -142,6 +131,23 @@ fil_space_crypt_cleanup()
 	mutex_free(&crypt_stat_mutex);
 }
 
+/**
+Get latest key version from encryption plugin.
+@return key version or ENCRYPTION_KEY_VERSION_INVALID */
+uint
+fil_space_crypt_struct::key_get_latest_version(void)
+{
+	uint key_version = key_found;
+
+	if (is_key_found()) {
+		key_version = encryption_key_get_latest_version(key_id);
+		srv_stats.n_key_requests.inc();
+		key_found = key_version;
+	}
+
+	return key_version;
+}
+
 /******************************************************************
 Get the latest(key-version), waking the encrypt thread, if needed */
 static inline
@@ -150,20 +156,25 @@ fil_crypt_get_latest_key_version(
 /*=============================*/
 	fil_space_crypt_t* crypt_data) 	/*!< in: crypt data */
 {
-	uint rc = encryption_key_get_latest_version(crypt_data->key_id);
+	ut_ad(crypt_data != NULL);
 
-	if (fil_crypt_needs_rotation(crypt_data->encryption,
-					crypt_data->min_key_version,
-					rc, srv_fil_crypt_rotate_key_age)) {
-		os_event_set(fil_crypt_threads_event);
+	uint key_version = crypt_data->key_get_latest_version();
+
+	if (crypt_data->is_key_found()) {
+
+		if (fil_crypt_needs_rotation(crypt_data->encryption,
+				crypt_data->min_key_version,
+				key_version,
+				srv_fil_crypt_rotate_key_age)) {
+			os_event_set(fil_crypt_threads_event);
+		}
 	}
 
-	return rc;
+	return key_version;
 }
 
 /******************************************************************
 Mutex helper for crypt_data->scheme */
-static
 void
 crypt_data_scheme_locker(
 /*=====================*/
@@ -183,37 +194,47 @@ crypt_data_scheme_locker(
 /******************************************************************
 Create a fil_space_crypt_t object
 @return crypt object */
-UNIV_INTERN
+static
 fil_space_crypt_t*
 fil_space_create_crypt_data(
 /*========================*/
-	fil_encryption_t	encrypt_mode,	/*!< in: encryption mode */
-	uint			key_id)		/*!< in: encryption key id */
+	uint			type,
+	fil_encryption_t	encrypt_mode,
+	uint			min_key_version,
+	uint			key_id,
+	ulint			offset)
 {
 	const uint sz = sizeof(fil_space_crypt_t);
-	fil_space_crypt_t* crypt_data =
-		static_cast<fil_space_crypt_t*>(malloc(sz));
-
-	memset(crypt_data, 0, sz);
+	void* buf =  ut_zalloc_nokey(sz);
+	fil_space_crypt_t* crypt_data = NULL;
 
-	if (encrypt_mode == FIL_SPACE_ENCRYPTION_OFF ||
-		(!srv_encrypt_tables && encrypt_mode == FIL_SPACE_ENCRYPTION_DEFAULT)) {
-		crypt_data->type = CRYPT_SCHEME_UNENCRYPTED;
-	} else {
-		crypt_data->type = CRYPT_SCHEME_1;
-		crypt_data->min_key_version = encryption_key_get_latest_version(key_id);
+	if (buf) {
+		crypt_data = new(buf)
+			fil_space_crypt_struct(
+				type,
+				min_key_version,
+				key_id,
+				offset,
+				encrypt_mode);
 	}
 
-	mutex_create(LATCH_ID_FIL_CRYPT_DATA_MUTEX, &crypt_data->mutex);
-	crypt_data->locker = crypt_data_scheme_locker;
-	my_random_bytes(crypt_data->iv, sizeof(crypt_data->iv));
-	crypt_data->encryption = encrypt_mode;
-	crypt_data->inited = true;
-	crypt_data->key_id = key_id;
 	return crypt_data;
 }
 
 /******************************************************************
+Create a fil_space_crypt_t object
+@return crypt object */
+UNIV_INTERN
+fil_space_crypt_t*
+fil_space_create_crypt_data(
+/*========================*/
+	fil_encryption_t	encrypt_mode,	/*!< in: encryption mode */
+	uint			key_id)		/*!< in: encryption key id */
+{
+	return (fil_space_create_crypt_data(0, encrypt_mode, 0, key_id, 0));
+}
+
+/******************************************************************
 Merge fil_space_crypt_t object */
 UNIV_INTERN
 void
@@ -235,7 +256,7 @@ fil_space_merge_crypt_data(
 	dst->type = src->type;
 	dst->min_key_version = src->min_key_version;
 	dst->keyserver_requests += src->keyserver_requests;
-	dst->inited = src->inited;
+	dst->closing = src->closing;
 
 	mutex_exit(&dst->mutex);
 }
@@ -302,18 +323,12 @@ fil_space_read_crypt_data(
 	fil_encryption_t encryption = (fil_encryption_t)mach_read_from_1(
 		page + offset + MAGIC_SZ + 2 + iv_length + 8);
 
-	const uint sz = sizeof(fil_space_crypt_t) + iv_length;
-	crypt_data = static_cast<fil_space_crypt_t*>(malloc(sz));
-	memset(crypt_data, 0, sz);
-
+	crypt_data = fil_space_create_crypt_data(encryption, key_id);
+	/* Overwrite these members, because the function above has already
+	initialized them. */
 	crypt_data->type = type;
 	crypt_data->min_key_version = min_key_version;
-	crypt_data->key_id = key_id;
 	crypt_data->page0_offset = offset;
-	crypt_data->encryption = encryption;
-	mutex_create(LATCH_ID_FIL_CRYPT_DATA_MUTEX, &crypt_data->mutex);
-	crypt_data->locker = crypt_data_scheme_locker;
-	crypt_data->inited = true;
 	memcpy(crypt_data->iv, page + offset + MAGIC_SZ + 2, iv_length);
 
 	return crypt_data;
@@ -328,12 +343,10 @@ fil_space_destroy_crypt_data(
 	fil_space_crypt_t **crypt_data)	/*!< out: crypt data */
 {
 	if (crypt_data != NULL && (*crypt_data) != NULL) {
-		/* Make sure that this thread owns the crypt_data
-		and make it unawailable, this does not fully
-		avoid the race between drop table and crypt thread */
 		mutex_enter(&fil_crypt_threads_mutex);
-		mutex_free(&(*crypt_data)->mutex);
-		free(*crypt_data);
+		fil_space_crypt_t* c = *crypt_data;
+		c->~fil_space_crypt_struct();
+		ut_free(c);
 		*crypt_data = NULL;
 		mutex_exit(&fil_crypt_threads_mutex);
 	}
@@ -482,6 +495,7 @@ fil_parse_write_crypt_data(
 	}
 
 	fil_space_crypt_t* crypt_data = fil_space_create_crypt_data(encryption, key_id);
+	/* Need to overwrite these as above will initialize fields. */
 	crypt_data->page0_offset = offset;
 	crypt_data->min_key_version = min_key_version;
 	crypt_data->encryption = encryption;
@@ -635,7 +649,7 @@ fil_space_encrypt(
 		return src_frame;
 	}
 
-	ut_a(crypt_data != NULL && crypt_data->encryption != FIL_SPACE_ENCRYPTION_OFF);
+	ut_a(crypt_data != NULL && crypt_data->is_encrypted());
 
 	byte* tmp = fil_encrypt_buf(crypt_data, space, offset, lsn, src_frame, page_size, dst_frame);
 
@@ -714,7 +728,7 @@ fil_space_check_encryption_read(
 		return false;
 	}
 
-	if (crypt_data->encryption == FIL_SPACE_ENCRYPTION_OFF) {
+	if (crypt_data->not_encrypted()) {
 		return false;
 	}
 
@@ -767,7 +781,7 @@ fil_space_decrypt(
 		return false;
 	}
 
-	ut_a(crypt_data != NULL && crypt_data->encryption != FIL_SPACE_ENCRYPTION_OFF);
+	ut_a(crypt_data != NULL && crypt_data->is_encrypted());
 
 	/* read space & lsn */
 	ulint header_len = FIL_PAGE_DATA;
@@ -999,20 +1013,13 @@ Copy global key state */
 static void
 fil_crypt_get_key_state(
 /*====================*/
-	key_state_t *new_state)	/*!< out: key state */
+	key_state_t*		new_state,	/*!< out: key state */
+	fil_space_crypt_t*	crypt_data)	/*!< in, out: crypt_data */
 {
 	if (srv_encrypt_tables) {
-		new_state->key_version =
-			encryption_key_get_latest_version(new_state->key_id);
+		new_state->key_version = crypt_data->key_get_latest_version();
 		new_state->rotate_key_age = srv_fil_crypt_rotate_key_age;
 
-		if (new_state->key_version == ENCRYPTION_KEY_VERSION_INVALID) {
-			ib::error() << "Used key_id "
-				    << new_state->key_id
-				    << " can't be found from key file.";
-		}
-
-		ut_a(new_state->key_version != ENCRYPTION_KEY_VERSION_INVALID);
 		ut_a(new_state->key_version != ENCRYPTION_KEY_NOT_ENCRYPTED);
 	} else {
 		new_state->key_version = 0;
@@ -1072,9 +1079,7 @@ fil_crypt_is_closing(
 	fil_space_crypt_t *crypt_data = fil_space_get_crypt_data(space);
 
 	if (crypt_data) {
-		mutex_enter(&crypt_data->mutex);
-		closing = crypt_data->closing;
-		mutex_exit(&crypt_data->mutex);
+		closing = crypt_data->is_closing(false);
 	}
 
 	return closing;
@@ -1333,6 +1338,18 @@ fil_crypt_space_needs_rotation(
 			}
 			return false;
 		}
+
+		crypt_data->key_get_latest_version();
+
+		if (!crypt_data->is_key_found()) {
+			return false;
+		}
+	}
+
+	/* If the key_id in use is not found by the encryption plugin, we
+	cannot continue rotating the tablespace. */
+	if (!crypt_data->is_key_found()) {
+		return false;
 	}
 
 	mutex_enter(&crypt_data->mutex);
@@ -1346,7 +1363,7 @@ fil_crypt_space_needs_rotation(
 		}
 
 		/* prevent threads from starting to rotate space */
-		if (crypt_data->closing) {
+		if (crypt_data->is_closing(true)) {
 			break;
 		}
 
@@ -1355,13 +1372,13 @@ fil_crypt_space_needs_rotation(
 		}
 
 		/* No need to rotate space if encryption is disabled */
-		if (crypt_data->encryption == FIL_SPACE_ENCRYPTION_OFF) {
+		if (crypt_data->not_encrypted()) {
 			break;
 		}
 
 		if (crypt_data->key_id != key_state->key_id) {
 			key_state->key_id= crypt_data->key_id;
-			fil_crypt_get_key_state(key_state);
+			fil_crypt_get_key_state(key_state, crypt_data);
 		}
 
 		bool need_key_rotation = fil_crypt_needs_rotation(
@@ -1374,12 +1391,14 @@ fil_crypt_space_needs_rotation(
 
 		time_t diff = time(0) - crypt_data->rotate_state.scrubbing.
 			last_scrub_completed;
+
 		bool need_scrubbing =
 			crypt_data->rotate_state.scrubbing.is_active
                   && diff >= (time_t) srv_background_scrub_data_interval;
 
-		if (need_key_rotation == false && need_scrubbing == false)
+		if (need_key_rotation == false && need_scrubbing == false) {
 			break;
+		}
 
 		mutex_exit(&crypt_data->mutex);
 		/* NOTE! fil_decr_pending_ops is performed outside */
@@ -1595,8 +1614,9 @@ fil_crypt_find_space_to_rotate(
 		os_event_wait_time(fil_crypt_threads_event, 1000000);
 	}
 
-	if (state->should_shutdown())
+	if (state->should_shutdown()) {
 		return false;
+	}
 
 	if (state->first) {
 		state->first = false;
@@ -1658,7 +1678,7 @@ fil_crypt_start_rotate_space(
 		crypt_data->rotate_state.start_time = time(0);
 
 		if (crypt_data->type == CRYPT_SCHEME_UNENCRYPTED &&
-			crypt_data->encryption != FIL_SPACE_ENCRYPTION_OFF &&
+			crypt_data->is_encrypted() &&
 			key_state->key_version != 0) {
 			/* this is rotation unencrypted => encrypted */
 			crypt_data->type = CRYPT_SCHEME_1;
@@ -1695,7 +1715,7 @@ fil_crypt_find_page_to_rotate(
 		mutex_enter(&crypt_data->mutex);
 		ut_ad(key_state->key_id == crypt_data->key_id);
 
-		if (crypt_data->closing == false &&
+		if (!crypt_data->is_closing(true) &&
 			crypt_data->rotate_state.next_offset <
 			crypt_data->rotate_state.max_offset) {
 
@@ -1959,7 +1979,7 @@ fil_crypt_rotate_page(
 				/* statistics */
 				state->crypt_stat.pages_modified++;
 			} else {
-				if (crypt_data->encryption !=  FIL_SPACE_ENCRYPTION_OFF) {
+				if (crypt_data->is_encrypted()) {
 					ut_a(kv >= crypt_data->min_key_version ||
 						(kv == 0 && key_state->key_version == 0));
 
@@ -2172,7 +2192,7 @@ fil_crypt_complete_rotate_space(
 	fil_space_crypt_t *crypt_data = fil_space_get_crypt_data(space);
 
 	/* Space might already be dropped */
-	if (crypt_data != NULL && crypt_data->inited) {
+	if (crypt_data != NULL && !crypt_data->is_closing(false)) {
 		mutex_enter(&crypt_data->mutex);
 
 		/**
@@ -2469,7 +2489,8 @@ UNIV_INTERN
 void
 fil_space_crypt_mark_space_closing(
 /*===============================*/
-	ulint	space)	/*!< in: Space id */
+	ulint			space,		/*!< in: tablespace id */
+	fil_space_crypt_t*	crypt_data)	/*!< in: crypt_data or NULL */
 {
 	if (!fil_crypt_threads_inited) {
 		return;
@@ -2477,7 +2498,9 @@ fil_space_crypt_mark_space_closing(
 
 	mutex_enter(&fil_crypt_threads_mutex);
 
-	fil_space_crypt_t* crypt_data = fil_space_get_crypt_data(space);
+	if (!crypt_data) {
+		crypt_data = fil_space_get_crypt_data(space);
+	}
 
 	if (crypt_data == NULL) {
 		mutex_exit(&fil_crypt_threads_mutex);
@@ -2506,7 +2529,7 @@ fil_space_crypt_close_tablespace(
 
 	fil_space_crypt_t* crypt_data = fil_space_get_crypt_data(space);
 
-	if (crypt_data == NULL || !crypt_data->inited) {
+	if (crypt_data == NULL || crypt_data->is_closing(false)) {
 		mutex_exit(&fil_crypt_threads_mutex);
 		return;
 	}
@@ -2560,6 +2583,8 @@ fil_space_crypt_get_status(
 {
 	fil_space_crypt_t* crypt_data = fil_space_get_crypt_data(id);
 
+	memset(status, 0, sizeof(*status));
+
 	if (crypt_data != NULL) {
 		status->space = id;
 		status->scheme = crypt_data->type;
@@ -2580,6 +2605,7 @@ fil_space_crypt_get_status(
 		} else {
 			status->rotating = false;
 		}
+
 		mutex_exit(&crypt_data->mutex);
 
 		if (srv_encrypt_tables || crypt_data->min_key_version) {
@@ -2589,7 +2615,6 @@ fil_space_crypt_get_status(
 			status->current_key_version = 0;
 		}
 	} else {
-		memset(status, 0, sizeof(*status));
 		if (srv_encrypt_tables) {
 			os_event_set(fil_crypt_threads_event);
 		}
@@ -2622,6 +2647,7 @@ fil_space_get_scrub_status(
 	struct fil_space_scrub_status_t* status)	/*!< out: status  */
 {
 	fil_space_crypt_t* crypt_data = fil_space_get_crypt_data(id);
+
 	memset(status, 0, sizeof(*status));
 
 	if (crypt_data != NULL) {
@@ -2646,9 +2672,8 @@ fil_space_get_scrub_status(
 		} else {
 			status->scrubbing = false;
 		}
+
 		mutex_exit(&crypt_data->mutex);
-	} else {
-		memset(status, 0, sizeof(*status));
 	}
 
 	return crypt_data == NULL ? 1 : 0;
diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc
index ab7c6a87f5e..3bd3237d1aa 100644
--- a/storage/innobase/fil/fil0fil.cc
+++ b/storage/innobase/fil/fil0fil.cc
@@ -721,6 +721,7 @@ retry:
 		success = os_file_read(
 			request,
 			node->handle, page, 0, UNIV_PAGE_SIZE);
+		srv_stats.page0_read.add(1);
 
 		space_id = fsp_header_get_space_id(page);
 		flags = fsp_header_get_flags(page);
@@ -1046,8 +1047,13 @@ fil_mutex_enter_and_prepare_for_io(
 		space does not exist, we handle the situation in the function
 		which called this function. */
 
-		if (space == NULL || UT_LIST_GET_FIRST(space->chain)->is_open) {
+		if (!space) {
+			return;
+		}
 
+		fil_node_t*	node = UT_LIST_GET_FIRST(space->chain);
+
+		if (!node || node->is_open) {
 			return;
 		}
 
@@ -1270,7 +1276,8 @@ fil_space_create(
 	ulint		id,
 	ulint		flags,
 	fil_type_t	purpose,
-	fil_space_crypt_t* crypt_data)	/*!< in: crypt data */
+	fil_space_crypt_t* crypt_data, /*!< in: crypt data */
+	bool		create_table) /*!< in: true if create table */
 {
 	fil_space_t*	space;
 
@@ -1334,6 +1341,22 @@ fil_space_create(
 
 	space->magic_n = FIL_SPACE_MAGIC_N;
 
+	space->crypt_data = crypt_data;
+
+	/* In create table we write page 0 so we have already
+	"read" it and for system tablespaces we have read
+	crypt data at startup. */
+	if (create_table || crypt_data != NULL) {
+		space->page_0_crypt_read = true;
+	}
+
+#ifdef UNIV_DEBUG
+	ib::info() << "Created tablespace for space " << space->id
+		<< " name " << space->name
+		<< " key_id " << (space->crypt_data ? space->crypt_data->key_id : 0)
+		<< " encryption " << (space->crypt_data ? space->crypt_data->encryption : 0);
+#endif
+
 	space->encryption_type = Encryption::NONE;
 
 	rw_lock_create(fil_space_latch_key, &space->latch, SYNC_FSP);
@@ -1356,8 +1379,6 @@ fil_space_create(
 		fil_system->max_assigned_id = id;
 	}
 
-	space->crypt_data = crypt_data;
-
 	if (crypt_data) {
 		space->read_page0 = true;
 		/* If table could be encrypted print info */
@@ -3675,7 +3696,7 @@ fil_ibd_create(
 
 	space = fil_space_create(name, space_id, flags, is_temp
 		? FIL_TYPE_TEMPORARY : FIL_TYPE_TABLESPACE,
-		crypt_data);
+		crypt_data, true);
 
 	if (!fil_node_create_low(
 			path, size, space, false, punch_hole, atomic_write)) {
@@ -3833,6 +3854,7 @@ fil_ibd_open(
 		link_file_found = true;
 		if (table) {
 			table->crypt_data = df_remote.get_crypt_info();
+			table->page_0_read = true;
 		}
 	} else if (df_remote.filepath() != NULL) {
 		/* An ISL file was found but contained a bad filepath in it.
@@ -3855,6 +3877,7 @@ fil_ibd_open(
 
 				if (table) {
 					table->crypt_data = df_dict.get_crypt_info();
+					table->page_0_read = true;
 				}
 			}
 		}
@@ -3869,6 +3892,7 @@ fil_ibd_open(
 		++tablespaces_found;
 		if (table) {
 			table->crypt_data = df_default.get_crypt_info();
+			table->page_0_read = true;
 		}
 	}
 
@@ -4095,7 +4119,7 @@ skip_validate:
 			space_name, id, flags, purpose,
 			df_remote.is_open() ? df_remote.get_crypt_info() :
 			df_dict.is_open() ? df_dict.get_crypt_info() :
-			df_default.get_crypt_info());
+			df_default.get_crypt_info(), false);
 
 		/* We do not measure the size of the file, that is why
 		we pass the 0 below */
@@ -4577,7 +4601,7 @@ fil_ibd_load(
 	space = fil_space_create(
 		file.name(), space_id, file.flags(),
 		is_temp ? FIL_TYPE_TEMPORARY : FIL_TYPE_TABLESPACE,
-		file.get_crypt_info());
+		file.get_crypt_info(), false);
 
 	if (space == NULL) {
 		return(FIL_LOAD_INVALID);
@@ -6358,9 +6382,7 @@ fil_iterate(
 		bool encrypted = false;
 
 		/* Use additional crypt io buffer if tablespace is encrypted */
-		if ((iter.crypt_data != NULL && iter.crypt_data->encryption == FIL_SPACE_ENCRYPTION_ON) ||
-				(srv_encrypt_tables &&
-					iter.crypt_data && iter.crypt_data->encryption == FIL_SPACE_ENCRYPTION_DEFAULT)) {
+		if (iter.crypt_data != NULL && iter.crypt_data->should_encrypt()) {
 
 			encrypted = true;
 			readptr = iter.crypt_io_buffer;
@@ -7573,11 +7595,53 @@ fil_space_get_crypt_data(
 
 	space = fil_space_get_by_id(id);
 
+	mutex_exit(&fil_system->mutex);
+
 	if (space != NULL) {
+		/* If we have not yet read the page0
+		of this tablespace we will do it now. */
+		if (!space->crypt_data && !space->page_0_crypt_read) {
+			ulint space_id = space->id;
+			fil_node_t*	node;
+
+			ut_a(space->crypt_data == NULL);
+			node = UT_LIST_GET_FIRST(space->chain);
+
+			byte *buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE, PSI_INSTRUMENT_ME));
+			byte *page = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE));
+			fil_read(page_id_t(space_id, 0), univ_page_size, 0, univ_page_size.physical(),
+				 page);
+			ulint flags = fsp_header_get_flags(page);
+			ulint offset = fsp_header_get_crypt_offset(
+				page_size_t(flags), NULL);
+			space->crypt_data = fil_space_read_crypt_data(space_id, page, offset);
+			ut_free(buf);
+
+#ifdef UNIV_DEBUG
+			ib::info() << "Read page 0 from tablespace for"
+				<< "space " << space_id
+				<< " name " << space->name
+				<< " key_id " << (space->crypt_data ? space->crypt_data->key_id : 0)
+				<< " encryption " << (space->crypt_data ? space->crypt_data->encryption : 0)
+				<< " handle " << node->handle;
+#endif
+
+			ut_a(space->id == space_id);
+
+			space->page_0_crypt_read = true;
+		}
+
 		crypt_data = space->crypt_data;
-	}
 
-	mutex_exit(&fil_system->mutex);
+		if (!space->page_0_crypt_read) {
+			ib::warn() << "Space " << space->id << " name "
+				<< space->name << " contains encryption "
+				<< (space->crypt_data ? space->crypt_data->encryption : 0)
+				<< " information for key_id "
+				<< (space->crypt_data ? space->crypt_data->key_id : 0)
+				<< " but page0 is not read.";
+		}
+	}
 
 	return(crypt_data);
 }
diff --git a/storage/innobase/fsp/fsp0space.cc b/storage/innobase/fsp/fsp0space.cc
index 371baee627c..a26a7ae69b7 100644
--- a/storage/innobase/fsp/fsp0space.cc
+++ b/storage/innobase/fsp/fsp0space.cc
@@ -150,7 +150,8 @@ Tablespace::open_or_create(bool is_temp)
 			tablespace in the tablespace manager. */
 			space = fil_space_create(
 				m_name, m_space_id, flags, is_temp
-				? FIL_TYPE_TEMPORARY : FIL_TYPE_TABLESPACE, it->m_crypt_info);
+				? FIL_TYPE_TEMPORARY : FIL_TYPE_TABLESPACE, it->m_crypt_info,
+				false);
 		}
 
 		ut_a(fil_validate());
diff --git a/storage/innobase/fsp/fsp0sysspace.cc b/storage/innobase/fsp/fsp0sysspace.cc
index 66b5da15e8b..9125a26b912 100644
--- a/storage/innobase/fsp/fsp0sysspace.cc
+++ b/storage/innobase/fsp/fsp0sysspace.cc
@@ -966,7 +966,8 @@ SysTablespace::open_or_create(
 
 			space = fil_space_create(
 				name(), space_id(), flags(), is_temp
-				? FIL_TYPE_TEMPORARY : FIL_TYPE_TABLESPACE, m_crypt_info);
+				? FIL_TYPE_TEMPORARY : FIL_TYPE_TABLESPACE, m_crypt_info,
+				false);
 		}
 
 		ut_a(fil_validate());
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index 83dd0c17eaa..c0144647147 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -1123,6 +1123,8 @@ static SHOW_VAR innodb_status_variables[]= {
   (char*) &export_vars.innodb_pages_created,		  SHOW_LONG},
   {"pages_read",
   (char*) &export_vars.innodb_pages_read,		  SHOW_LONG},
+  {"pages0_read",
+  (char*) &export_vars.innodb_page0_read,		  SHOW_LONG},
   {"pages_written",
   (char*) &export_vars.innodb_pages_written,		  SHOW_LONG},
   {"row_lock_current_waits",
@@ -1271,6 +1273,8 @@ static SHOW_VAR innodb_status_variables[]= {
   {"scrub_background_page_split_failures_unknown",
    (char*) &export_vars.innodb_scrub_page_split_failures_unknown,
    SHOW_LONG},
+  {"encryption_num_key_requests",
+   (char*) &export_vars.innodb_encryption_key_requests, SHOW_LONGLONG},
 
   {NullS, NullS, SHOW_LONG}
 };
@@ -2563,7 +2567,7 @@ innobase_get_stmt_unsafe(
 	LEX_STRING* stmt;
 	const char* query=NULL;
 
-	stmt = thd_query_string(thd);
+	stmt =  thd ? thd_query_string(thd) : NULL;
 	// MySQL 5.7
 	//stmt = thd_query_unsafe(thd);
 
@@ -2596,7 +2600,7 @@ innobase_get_stmt_safe(
 
 	ut_ad(buflen > 1);
 
-	stmt = thd_query_string(thd);
+	stmt =  thd ? thd_query_string(thd) : NULL;
 
 	if (stmt && stmt->str) {
 		length = stmt->length > buflen ? buflen : stmt->length;
@@ -3218,6 +3222,20 @@ innodb_replace_trx_in_thd(
 }
 #endif /* MYSQL_REPLACE_TRX_IN_THD */
 
+/*************************************************************************
+Gets current trx. */
+trx_t*
+innobase_get_trx()
+{
+	THD *thd=current_thd;
+	if (likely(thd != 0)) {
+		trx_t*& trx = thd_to_trx(thd);
+		return(trx);
+	} else {
+		return(NULL);
+	}
+}
+
 /*********************************************************************//**
 Note that a transaction has been registered with MySQL.
 @return true if transaction is registered with MySQL 2PC coordinator */
@@ -4187,17 +4205,17 @@ innobase_init(
 	ut_new_boot();
 
 	if (UNIV_PAGE_SIZE != UNIV_PAGE_SIZE_DEF) {
-		fprintf(stderr,
-			"InnoDB: Warning: innodb_page_size has been "
-			"changed from default value %d to %ldd. (###EXPERIMENTAL### "
-			"operation)\n", UNIV_PAGE_SIZE_DEF, UNIV_PAGE_SIZE);
+		ib::info() << "innodb_page_size has been "
+			<< "changed from default value "
+			<< UNIV_PAGE_SIZE_DEF << " to " << UNIV_PAGE_SIZE;
 
 		/* There is hang on buffer pool when trying to get a new
 		page if buffer pool size is too small for large page sizes */
 		if (innobase_buffer_pool_size < (24 * 1024 * 1024)) {
-			fprintf(stderr, "InnoDB: Error: innobase_page_size %lu requires "
-				"innodb_buffer_pool_size > 24M current %lld",
-				UNIV_PAGE_SIZE, innobase_buffer_pool_size);
+			ib::info() << "innobase_page_size "
+				<< UNIV_PAGE_SIZE << " requires "
+				<< "innodb_buffer_pool_size > 24M current "
+				<< innobase_buffer_pool_size;
 			goto error;
 		}
 	}
@@ -7018,9 +7036,7 @@ ha_innobase::open(
 			bool warning_pushed = false;
 			fil_space_crypt_t* crypt_data = ib_table->crypt_data;
 
-			if ((crypt_data && crypt_data->encryption == FIL_SPACE_ENCRYPTION_ON) ||
-				(srv_encrypt_tables &&
-					crypt_data && crypt_data->encryption == FIL_SPACE_ENCRYPTION_DEFAULT)) {
+			if (crypt_data && crypt_data->should_encrypt()) {
 
 				if (!encryption_key_id_exists(crypt_data->key_id)) {
 					push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
@@ -8274,7 +8290,62 @@ build_template_field(
 		templ->col_no = i;
 		templ->clust_rec_field_no = dict_col_get_clust_pos(
 						col, clust_index);
-		ut_a(templ->clust_rec_field_no != ULINT_UNDEFINED);
+		/* If the clustered index record field is not found, let's print out
+		field names and all the rest to understand why it is not found. */
+		if (templ->clust_rec_field_no == ULINT_UNDEFINED) {
+			const char* tb_col_name = dict_table_get_col_name(clust_index->table, i);
+			dict_field_t* field=NULL;
+			size_t size = 0;
+
+			for(ulint j=0; j < clust_index->n_user_defined_cols; j++) {
+				dict_field_t* ifield = &(clust_index->fields[j]);
+				if (ifield && !memcmp(tb_col_name, ifield->name,
+						strlen(tb_col_name))) {
+					field = ifield;
+					break;
+				}
+			}
+
+			ib::info() << "Looking for field " << i << " name "
+				<< (tb_col_name ? tb_col_name : "NULL")
+				<< " from table " << clust_index->table->name;
+
+
+			for(ulint j=0; j < clust_index->n_user_defined_cols; j++) {
+				dict_field_t* ifield = &(clust_index->fields[j]);
+				ib::info() << "InnoDB Table "
+					<< clust_index->table->name
+					<< "field " << j << " name "
+					<< (ifield ? ifield->name() : "NULL");
+			}
+
+			for(ulint j=0; j < table->s->stored_fields; j++) {
+				ib::info() << "MySQL table "
+					<< table->s->table_name.str
+					<< " field " << j << " name "
+					<< table->field[j]->field_name;
+			}
+
+			ib::error() << "Clustered record field for column " << i
+				<< " not found table n_user_defined "
+				<< clust_index->n_user_defined_cols
+				<< " index n_user_defined "
+				<< clust_index->table->n_cols - DATA_N_SYS_COLS
+				<< " InnoDB table "
+				<< clust_index->table->name
+				<< " field name "
+				<< (field ? field->name() : "NULL")
+				<< " MySQL table "
+				<< table->s->table_name.str
+				<< " field name "
+				<< (tb_col_name ? tb_col_name : "NULL")
+				<< " n_fields "
+				<< table->s->stored_fields
+				<< " query "
+				<< innobase_get_stmt_unsafe(current_thd, &size);
+
+			ut_a(templ->clust_rec_field_no != ULINT_UNDEFINED);
+		}
 		templ->rec_field_is_prefix = FALSE;
 		templ->rec_prefix_field_no = ULINT_UNDEFINED;
 
@@ -16774,7 +16845,7 @@ ha_innobase::check(
 			if (!dict_index_is_clust(index)) {
 				m_prebuilt->index_usable = FALSE;
 				// row_mysql_lock_data_dictionary(m_prebuilt->trx);
-				dict_set_corrupted(index, m_prebuilt->trx, "dict_set_index_corrupted");;
+				dict_set_corrupted(index, m_prebuilt->trx, "dict_set_index_corrupted");
 				// row_mysql_unlock_data_dictionary(m_prebuilt->trx);
 			});
 
@@ -24325,20 +24396,22 @@ ib_push_warning(
 	const char	*format,/*!< in: warning message */
 	...)
 {
-	va_list args;
-	THD *thd = (THD *)trx->mysql_thd;
-	char *buf;
+	if (trx && trx->mysql_thd) {
+		THD *thd = (THD *)trx->mysql_thd;
+		va_list args;
+		char *buf;
 #define MAX_BUF_SIZE 4*1024
 
-	va_start(args, format);
-	buf = (char *)my_malloc(MAX_BUF_SIZE, MYF(MY_WME));
-	vsprintf(buf,format, args);
+		va_start(args, format);
+		buf = (char *)my_malloc(MAX_BUF_SIZE, MYF(MY_WME));
+		vsprintf(buf,format, args);
 
-	push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
-		convert_error_code_to_mysql((dberr_t)error, 0, thd),
-		buf);
-	my_free(buf);
-	va_end(args);
+		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+			convert_error_code_to_mysql((dberr_t)error, 0, thd),
+			buf);
+		my_free(buf);
+		va_end(args);
+	}
 }
 
 /********************************************************************//**
diff --git a/storage/innobase/ibuf/ibuf0ibuf.cc b/storage/innobase/ibuf/ibuf0ibuf.cc
index 4b1c3704123..6940bbee5c2 100644
--- a/storage/innobase/ibuf/ibuf0ibuf.cc
+++ b/storage/innobase/ibuf/ibuf0ibuf.cc
@@ -862,11 +862,17 @@ ibuf_set_free_bits_low(
 	mtr_t*			mtr)	/*!< in/out: mtr */
 {
 	page_t*	bitmap_page;
+	buf_frame_t* frame;
 
 	ut_ad(mtr->is_named_space(block->page.id.space()));
 
-	if (!page_is_leaf(buf_block_get_frame(block))) {
+	if (!block) {
+		return;
+	}
 
+	frame = buf_block_get_frame(block);
+
+	if (!frame || !page_is_leaf(frame)) {
 		return;
 	}
 
@@ -1040,7 +1046,10 @@ ibuf_update_free_bits_zip(
 	page_t*	bitmap_page;
 	ulint	after;
 
-	ut_a(page_is_leaf(buf_block_get_frame(block)));
+	ut_a(block);
+	buf_frame_t* frame = buf_block_get_frame(block);
+	ut_a(frame);
+	ut_a(page_is_leaf(frame));
 	ut_a(block->page.size.is_compressed());
 
 	bitmap_page = ibuf_bitmap_get_map_page(block->page.id,
diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h
index c177f23824f..48c5eb42724 100644
--- a/storage/innobase/include/btr0btr.h
+++ b/storage/innobase/include/btr0btr.h
@@ -242,9 +242,16 @@ btr_block_get_func(
 @param index index tree, may be NULL if not the insert buffer tree
 @param mtr mini-transaction handle
 @return the uncompressed page frame */
-# define btr_page_get(page_id, page_size, mode, index, mtr)	\
-	buf_block_get_frame(btr_block_get(page_id, page_size,	\
-					  mode, index, mtr))
+UNIV_INLINE
+page_t*
+btr_page_get(
+/*=========*/
+	const page_id_t&	page_id,
+	const page_size_t&	page_size,
+	ulint			mode,
+	dict_index_t*		index,
+	mtr_t*			mtr)
+	MY_ATTRIBUTE((warn_unused_result));
 #endif /* !UNIV_HOTBACKUP */
 /**************************************************************//**
 Gets the index id field of a page.
diff --git a/storage/innobase/include/btr0btr.ic b/storage/innobase/include/btr0btr.ic
index 58a0c6755b1..d01e19b5202 100644
--- a/storage/innobase/include/btr0btr.ic
+++ b/storage/innobase/include/btr0btr.ic
@@ -63,7 +63,9 @@ btr_block_get_func(
 		page_id, page_size, mode, NULL, BUF_GET, file, line, mtr, &err);
 
 	if (err == DB_DECRYPTION_FAILED) {
-		index->table->is_encrypted = true;
+		if (index && index->table) {
+			index->table->is_encrypted = true;
+		}
 	}
 
 	if (block) {
@@ -99,6 +101,37 @@ btr_page_set_index_id(
 		mlog_write_ull(page + (PAGE_HEADER + PAGE_INDEX_ID), id, mtr);
 	}
 }
+
+/** Gets a buffer page and declares its latching order level.
+@param page_id	page identifier (tablespace id and page number)
+@param page_size	page size
+@param mode	latch mode
+@param index	index tree, may be NULL if not the insert buffer tree
+@param mtr	mini-transaction handle
+@return the uncompressed page frame, or NULL if no block
+was returned by btr_block_get() */
+UNIV_INLINE
+page_t*
+btr_page_get(
+/*=========*/
+	const page_id_t&	page_id,
+	const page_size_t&	page_size,
+	ulint			mode,
+	dict_index_t*		index,
+	mtr_t*			mtr)
+{
+	buf_block_t* block=NULL;
+	buf_frame_t* frame=NULL;
+
+	block = btr_block_get(page_id, page_size, mode, index, mtr);
+
+	if (block) {
+		frame = buf_block_get_frame(block);
+	}
+
+	return ((page_t*)frame);
+}
+
 #endif /* !UNIV_HOTBACKUP */
 
 /**************************************************************//**
diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h
index c81d893ed5a..8cefdddad65 100644
--- a/storage/innobase/include/dict0mem.h
+++ b/storage/innobase/include/dict0mem.h
@@ -1372,6 +1372,8 @@ struct dict_table_t {
 	inline void acquire();
 
 	void*		thd;		/*!< thd */
+	bool		page_0_read; /*!< true if page 0 has
+				     been already read */
 	fil_space_crypt_t *crypt_data; /*!< crypt data if present */
 
 	/** Release the table handle. */
diff --git a/storage/innobase/include/fil0crypt.h b/storage/innobase/include/fil0crypt.h
index acac155ef3f..eb6eaa229b5 100644
--- a/storage/innobase/include/fil0crypt.h
+++ b/storage/innobase/include/fil0crypt.h
@@ -75,6 +75,17 @@ struct key_struct
                                                 (that is L in CRYPT_SCHEME_1) */
 };
 
+/** is encryption enabled */
+extern ulong	srv_encrypt_tables;
+
+/** Mutex helper for crypt_data->scheme
+@param[in, out]	scheme	encryption scheme
+@param[in]	exit	should we exit or enter mutex ? */
+void
+crypt_data_scheme_locker(
+	st_encryption_scheme*	scheme,
+	int			exit);
+
 struct fil_space_rotate_state_t
 {
 	time_t start_time;	/*!< time when rotation started */
@@ -96,13 +107,109 @@ struct fil_space_rotate_state_t
 
 struct fil_space_crypt_struct : st_encryption_scheme
 {
+ public:
+	/** Constructor. Does not initialize the members!
+	The object is expected to be placed in a buffer that
+	has been zero-initialized. */
+	fil_space_crypt_struct(
+		ulint new_type,
+		uint new_min_key_version,
+		uint new_key_id,
+		ulint offset,
+		fil_encryption_t new_encryption)
+		: st_encryption_scheme(),
+		min_key_version(new_min_key_version),
+		page0_offset(offset),
+		encryption(new_encryption),
+		closing(false),
+		key_found(),
+		rotate_state()
+	{
+		key_found = new_min_key_version;
+		key_id = new_key_id;
+		my_random_bytes(iv, sizeof(iv));
+		mutex_create(LATCH_ID_FIL_CRYPT_DATA_MUTEX, &mutex);
+		locker = crypt_data_scheme_locker;
+		type = new_type;
+
+		if (new_encryption == FIL_SPACE_ENCRYPTION_OFF ||
+			(!srv_encrypt_tables &&
+			 new_encryption == FIL_SPACE_ENCRYPTION_DEFAULT)) {
+			type = CRYPT_SCHEME_UNENCRYPTED;
+		} else {
+			type = CRYPT_SCHEME_1;
+			min_key_version = key_get_latest_version();
+		}
+	}
+
+	/** Destructor */
+	~fil_space_crypt_struct()
+	{
+		closing = true;
+		mutex_free(&mutex);
+	}
+
+	/** Get latest key version from encryption plugin
+	@retval key_version or
+	@retval ENCRYPTION_KEY_VERSION_INVALID if used key_id
+	is not found from encryption plugin. */
+	uint key_get_latest_version(void);
+
+	/** Returns true if key was found from encryption plugin
+	and false if not. */
+	bool is_key_found() const {
+		return key_found != ENCRYPTION_KEY_VERSION_INVALID;
+	}
+
+	/** Returns true if tablespace should be encrypted */
+	bool should_encrypt() const {
+		return ((encryption == FIL_SPACE_ENCRYPTION_ON) ||
+			(srv_encrypt_tables &&
+				encryption == FIL_SPACE_ENCRYPTION_DEFAULT));
+	}
+
+	/** Return true if tablespace is encrypted. */
+	bool is_encrypted() const {
+		return (encryption != FIL_SPACE_ENCRYPTION_OFF);
+	}
+
+	/** Return true if default tablespace encryption is used. */
+	bool is_default_encryption() const {
+		return (encryption == FIL_SPACE_ENCRYPTION_DEFAULT);
+	}
+
+	/** Return true if tablespace is not encrypted. */
+	bool not_encrypted() const {
+		return (encryption == FIL_SPACE_ENCRYPTION_OFF);
+	}
+
+	/** Is this tablespace closing. */
+	bool is_closing(bool is_fixed) {
+		bool closed;
+		if (!is_fixed) {
+			mutex_enter(&mutex);
+		}
+		closed = closing;
+		if (!is_fixed) {
+			mutex_exit(&mutex);
+		}
+		return closed;
+	}
+
 	uint min_key_version; // min key version for this space
 	ulint page0_offset;   // byte offset on page 0 for crypt data
 	fil_encryption_t encryption; // Encryption setup
 
 	ib_mutex_t mutex;   // mutex protecting following variables
 	bool closing;	    // is tablespace being closed
-	bool inited;
+
+	/** Return code from encryption_key_get_latest_version.
+        If ENCRYPTION_KEY_VERSION_INVALID encryption plugin
+	could not find the key and there is no need to call
+	get_latest_key_version again as keys are read only
+	at startup. */
+	uint key_found;
+
 	fil_space_rotate_state_t rotate_state;
 };
 
@@ -321,7 +428,8 @@ UNIV_INTERN
 void
 fil_space_crypt_mark_space_closing(
 /*===============================*/
-	ulint space);          /*!< in: tablespace id */
+	ulint			space,		/*!< in: tablespace id */
+	fil_space_crypt_t*	crypt_data);	/*!< in: crypt_data or NULL */
 
 /*********************************************************************
 Wait for crypt threads to stop accessing space */
diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h
index bd31ed58283..65f73448c6e 100644
--- a/storage/innobase/include/fil0fil.h
+++ b/storage/innobase/include/fil0fil.h
@@ -235,6 +235,9 @@ struct fil_space_t {
 	/** MariaDB encryption data */
         fil_space_crypt_t* crypt_data;
 
+	/** tablespace crypt data has been read */
+	bool		page_0_crypt_read;
+
 	/** Space file block size */
 	ulint		file_block_size;
 
@@ -751,7 +754,8 @@ fil_space_create(
 	ulint		id,
 	ulint		flags,
 	fil_type_t	purpose,	/*!< in: FIL_TABLESPACE, or FIL_LOG if log */
-	fil_space_crypt_t* crypt_data)	/*!< in: crypt data */
+	fil_space_crypt_t* crypt_data, /*!< in: crypt data */
+	bool		create_table)  /*!< in: true if create table */
 	MY_ATTRIBUTE((warn_unused_result));
 
 /*******************************************************************//**
diff --git a/storage/innobase/include/fil0pagecompress.h b/storage/innobase/include/fil0pagecompress.h
index 2948207133b..d4cc54c7b2a 100644
--- a/storage/innobase/include/fil0pagecompress.h
+++ b/storage/innobase/include/fil0pagecompress.h
@@ -124,5 +124,4 @@ fil_node_get_block_size(
 /*====================*/
 	fil_node_t*	node);	/*!< in: Node where to get block
 				size */
-
 #endif
diff --git a/storage/innobase/include/srv0mon.h b/storage/innobase/include/srv0mon.h
index 125401373ba..d5a305bdf68 100644
--- a/storage/innobase/include/srv0mon.h
+++ b/storage/innobase/include/srv0mon.h
@@ -180,6 +180,7 @@ enum monitor_id_t {
 	MONITOR_OVLD_INDEX_PAGES_WRITTEN,
 	MONITOR_OVLD_NON_INDEX_PAGES_WRITTEN,
 	MONITOR_OVLD_PAGES_READ,
+	MONITOR_OVLD_PAGES0_READ,
 	MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS,
 	MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS_AVOIDED,
 	MONITOR_OVLD_BYTE_READ,
diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h
index 4f9bee7019f..056a6267347 100644
--- a/storage/innobase/include/srv0srv.h
+++ b/storage/innobase/include/srv0srv.h
@@ -189,6 +189,12 @@ struct srv_stats_t {
 
 	/** Number of times prefix optimization avoided triggering cluster lookup */
 	ulint_ctr_64_t		n_sec_rec_cluster_reads_avoided;
+
+	/** Number of times page 0 is read from tablespace */
+	ulint_ctr_64_t		page0_read;
+
+	/** Number of encryption_get_latest_key_version calls */
+	ulint_ctr_64_t		n_key_requests;
 };
 
 extern const char*	srv_main_thread_op_info;
@@ -1028,7 +1034,8 @@ struct export_var_t{
 	ulint innodb_os_log_pending_fsyncs;	/*!< fil_n_pending_log_flushes */
 	ulint innodb_page_size;			/*!< UNIV_PAGE_SIZE */
 	ulint innodb_pages_created;		/*!< buf_pool->stat.n_pages_created */
-	ulint innodb_pages_read;		/*!< buf_pool->stat.n_pages_read */
+	ulint innodb_pages_read;		/*!< buf_pool->stat.n_pages_read*/
+	ulint innodb_page0_read;		/*!< srv_stats.page0_read */
 	ulint innodb_pages_written;		/*!< buf_pool->stat.n_pages_written */
 	ulint innodb_row_lock_waits;		/*!< srv_n_lock_wait_count */
 	ulint innodb_row_lock_current_waits;	/*!< srv_n_lock_wait_current_count */
@@ -1118,6 +1125,7 @@ struct export_var_t{
 	ulint innodb_encryption_rotation_pages_modified;
 	ulint innodb_encryption_rotation_pages_flushed;
 	ulint innodb_encryption_rotation_estimated_iops;
+	int64_t innodb_encryption_key_requests;
 
 	ulint innodb_scrub_page_reorganizations;
 	ulint innodb_scrub_page_splits;
diff --git a/storage/innobase/row/row0ftsort.cc b/storage/innobase/row/row0ftsort.cc
index 3d459e00506..3f2255d4644 100644
--- a/storage/innobase/row/row0ftsort.cc
+++ b/storage/innobase/row/row0ftsort.cc
@@ -227,10 +227,7 @@ row_fts_psort_info_init(
 	common_info->opt_doc_id_size = opt_doc_id_size;
 	crypt_data = fil_space_get_crypt_data(new_table->space);
 
-	if ((crypt_data && crypt_data->encryption == FIL_SPACE_ENCRYPTION_ON) ||
-		(srv_encrypt_tables &&
-			crypt_data && crypt_data->encryption == FIL_SPACE_ENCRYPTION_DEFAULT)) {
-
+	if (crypt_data && crypt_data->should_encrypt()) {
 		common_info->crypt_data = crypt_data;
 		encrypted = true;
 	} else {
diff --git a/storage/innobase/row/row0import.cc b/storage/innobase/row/row0import.cc
index 1ae11204f69..1bab51d89bd 100644
--- a/storage/innobase/row/row0import.cc
+++ b/storage/innobase/row/row0import.cc
@@ -1852,6 +1852,11 @@ PageConverter::update_index_page(
 		row_index_t*	index = find_index(id);
 
 		if (index == 0) {
+			ib::error() << "Page for tablespace " << m_space
+				<< " is index page with id " << id
+				<< " but that index is not found from"
+				<< " configuration file. Current index name "
+				<< m_index->m_name << " and id " <<  m_index->m_id;
 			m_index = 0;
 			return(DB_CORRUPTION);
 		}
diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc
index 7a8b74279f5..a5cd0064ddd 100644
--- a/storage/innobase/row/row0merge.cc
+++ b/storage/innobase/row/row0merge.cc
@@ -4701,10 +4701,7 @@ row_merge_build_indexes(
 
 	/* If tablespace is encrypted, allocate additional buffer for
 	encryption/decryption. */
-	if ((crypt_data && crypt_data->encryption == FIL_SPACE_ENCRYPTION_ON) ||
-		(srv_encrypt_tables &&
-			crypt_data && crypt_data->encryption == FIL_SPACE_ENCRYPTION_DEFAULT)) {
-
+	if (crypt_data && crypt_data->should_encrypt()) {
 		crypt_block = static_cast<row_merge_block_t*>(
 			alloc.allocate_large(3 * srv_sort_buf_size, &crypt_pfx));
 
diff --git a/storage/innobase/row/row0mysql.cc b/storage/innobase/row/row0mysql.cc
index 100e1bcb708..f7ad96191b1 100644
--- a/storage/innobase/row/row0mysql.cc
+++ b/storage/innobase/row/row0mysql.cc
@@ -3511,7 +3511,7 @@ fil_wait_crypt_bg_threads(
 	uint start = time(0);
 	uint last = start;
 	if (table->space != 0) {
-		fil_space_crypt_mark_space_closing(table->space);
+		fil_space_crypt_mark_space_closing(table->space, table->crypt_data);
 	}
 
 	while (table->get_ref_count()> 0) {
@@ -3986,6 +3986,13 @@ row_drop_table_for_mysql(
 	/* As we don't insert entries to SYSTEM TABLES for temp-tables
 	we need to avoid running removal of these entries. */
 	if (!dict_table_is_temporary(table)) {
+
+		/* If the table does not yet have crypt_data, try to read it to
+		make freeing the table easier. */
+		if (!table->crypt_data) {
+			table->crypt_data = fil_space_get_crypt_data(table->space);
+		}
+
 		/* We use the private SQL parser of Innobase to generate the
 		query graphs needed in deleting the dictionary data from system
 		tables in Innobase. Deleting a row from SYS_INDEXES table also
diff --git a/storage/innobase/srv/srv0mon.cc b/storage/innobase/srv/srv0mon.cc
index 6d213a2c761..b2d0fc852e5 100644
--- a/storage/innobase/srv/srv0mon.cc
+++ b/storage/innobase/srv/srv0mon.cc
@@ -303,6 +303,12 @@ static monitor_info_t	innodb_counter_info[] =
 	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
 	 MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_READ},
 
+	{"buffer_pages0_read", "buffer",
+	 "Number of page 0 read (innodb_pages0_read)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES0_READ},
+
 	{"buffer_index_sec_rec_cluster_reads", "buffer",
 	 "Number of secondary record reads triggered cluster read",
 	 static_cast<monitor_type_t>(
@@ -1780,6 +1786,11 @@ srv_mon_process_existing_counter(
 		value = stat.n_pages_read;
 		break;
 
+	/* innodb_pages0_read */
+	case MONITOR_OVLD_PAGES0_READ:
+		value = srv_stats.page0_read;
+		break;
+
 	/* Number of times secondary index lookup triggered cluster lookup */
 	case MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS:
 		value = srv_stats.n_sec_rec_cluster_reads;
diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc
index 51e898d98d8..d35b6421e5a 100644
--- a/storage/innobase/srv/srv0srv.cc
+++ b/storage/innobase/srv/srv0srv.cc
@@ -3,7 +3,7 @@
 Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2008, 2009 Google Inc.
 Copyright (c) 2009, Percona Inc.
-Copyright (c) 2013, 2016, MariaDB Corporation. All Rights Reserved.
+Copyright (c) 2013, 2016, MariaDB Corporation.
 
 Portions of this file contain modifications contributed and copyrighted by
 Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -1576,6 +1576,7 @@ srv_export_innodb_status(void)
 	export_vars.innodb_pages_created = stat.n_pages_created;
 
 	export_vars.innodb_pages_read = stat.n_pages_read;
+	export_vars.innodb_page0_read = srv_stats.page0_read;
 
 	export_vars.innodb_pages_written = stat.n_pages_written;
 
@@ -1691,6 +1692,8 @@ srv_export_innodb_status(void)
 		crypt_stat.pages_flushed;
 	export_vars.innodb_encryption_rotation_estimated_iops =
 		crypt_stat.estimated_iops;
+	export_vars.innodb_encryption_key_requests =
+		srv_stats.n_key_requests;
 
 	export_vars.innodb_scrub_page_reorganizations =
 		scrub_stat.page_reorganizations;
diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc
index 3eeb698d5f6..c549e458c7b 100644
--- a/storage/innobase/srv/srv0start.cc
+++ b/storage/innobase/srv/srv0start.cc
@@ -455,8 +455,8 @@ create_log_files(
 		"innodb_redo_log", SRV_LOG_SPACE_FIRST_ID,
 		fsp_flags_set_page_size(0, univ_page_size),
 		FIL_TYPE_LOG,
-		NULL /* No encryption yet */
-		);
+		NULL, /* No encryption yet */
+		true /* this is create */);
 	ut_a(fil_validate());
 	ut_a(log_space != NULL);
 
@@ -712,7 +712,7 @@ srv_undo_tablespace_open(
 		flags = fsp_flags_init(
 			univ_page_size, false, false, false, false, false, 0, ATOMIC_WRITES_DEFAULT);
 		space = fil_space_create(
-			undo_name, space_id, flags, FIL_TYPE_TABLESPACE, NULL);
+			undo_name, space_id, flags, FIL_TYPE_TABLESPACE, NULL, true);
 
 		ut_a(fil_validate());
 		ut_a(space);
@@ -2065,7 +2065,8 @@ innobase_start_or_create_for_mysql(void)
 			SRV_LOG_SPACE_FIRST_ID,
 			fsp_flags_set_page_size(0, univ_page_size),
 			FIL_TYPE_LOG,
-			NULL /* no encryption yet */);
+			NULL /* no encryption yet */,
+			true /* create */);
 
 		ut_a(fil_validate());
 		ut_a(log_space);
diff --git a/storage/innobase/trx/trx0trx.cc b/storage/innobase/trx/trx0trx.cc
index 7bead6e16c7..db9aa7fde96 100644
--- a/storage/innobase/trx/trx0trx.cc
+++ b/storage/innobase/trx/trx0trx.cc
@@ -1488,6 +1488,8 @@ trx_start_low(
 
 	ut_a(trx->error_state == DB_SUCCESS);
 
+	trx->start_time_micro = clock();
+
 	MONITOR_INC(MONITOR_TRX_ACTIVE);
 }
 
diff --git a/storage/maria/ha_maria.cc b/storage/maria/ha_maria.cc
index 59323a27ae2..5ed836ddb63 100644
--- a/storage/maria/ha_maria.cc
+++ b/storage/maria/ha_maria.cc
@@ -557,8 +557,7 @@ static int table2maria(TABLE *table_arg, data_file_type row_type,
       keydef[i].seg[j].type= (int) type;
       keydef[i].seg[j].start= pos->key_part[j].offset;
       keydef[i].seg[j].length= pos->key_part[j].length;
-      keydef[i].seg[j].bit_start= keydef[i].seg[j].bit_end=
-        keydef[i].seg[j].bit_length= 0;
+      keydef[i].seg[j].bit_start= keydef[i].seg[j].bit_length= 0;
       keydef[i].seg[j].bit_pos= 0;
       keydef[i].seg[j].language= field->charset()->number;
 
diff --git a/storage/maria/ma_check.c b/storage/maria/ma_check.c
index 89b1a733fc9..1a271c217a8 100644
--- a/storage/maria/ma_check.c
+++ b/storage/maria/ma_check.c
@@ -6103,7 +6103,7 @@ int maria_recreate_table(HA_CHECK *param, MARIA_HA **org_info, char *filename)
   create_info.data_file_length=file_length;
   create_info.auto_increment=share.state.auto_increment;
   create_info.language = (param->language ? param->language :
-			  share.state.header.language);
+			  share.base.language);
   create_info.key_file_length=  status_info.key_file_length;
   create_info.org_data_file_type= ((enum data_file_type)
                                    share.state.header.org_data_file_type);
diff --git a/storage/maria/ma_create.c b/storage/maria/ma_create.c
index f160499a94e..0680b5d568e 100644
--- a/storage/maria/ma_create.c
+++ b/storage/maria/ma_create.c
@@ -725,8 +725,7 @@ int maria_create(const char *name, enum data_file_type datafile_type,
   mi_int2store(share.state.header.base_pos,base_pos);
   share.state.header.data_file_type= share.data_file_type= datafile_type;
   share.state.header.org_data_file_type= org_datafile_type;
-  share.state.header.language= (ci->language ?
-				ci->language : default_charset_info->number);
+  share.state.header.not_used= 0;
 
   share.state.dellink = HA_OFFSET_ERROR;
   share.state.first_bitmap_with_space= 0;
@@ -739,6 +738,8 @@ int maria_create(const char *name, enum data_file_type datafile_type,
   share.options=options;
   share.base.rec_reflength=pointer;
   share.base.block_size= maria_block_size;
+  share.base.language= (ci->language ? ci->language :
+                        default_charset_info->number);
 
   /*
     Get estimate for index file length (this may be wrong for FT keys)
@@ -937,7 +938,6 @@ int maria_create(const char *name, enum data_file_type datafile_type,
       sseg.language= 7;                         /* Binary */
       sseg.null_bit=0;
       sseg.bit_start=0;
-      sseg.bit_end=0;
       sseg.bit_length= 0;
       sseg.bit_pos= 0;
       sseg.length=SPLEN;
diff --git a/storage/maria/ma_open.c b/storage/maria/ma_open.c
index 42861e92ed4..4e97c6b43b9 100644
--- a/storage/maria/ma_open.c
+++ b/storage/maria/ma_open.c
@@ -276,6 +276,7 @@ MARIA_HA *maria_open(const char *name, int mode, uint open_flags)
   uint i,j,len,errpos,head_length,base_pos,keys, realpath_err,
     key_parts,unique_key_parts,fulltext_keys,uniques;
   uint internal_table= MY_TEST(open_flags & HA_OPEN_INTERNAL_TABLE);
+  uint file_version;
   size_t info_length;
   char name_buff[FN_REFLEN], org_name[FN_REFLEN], index_name[FN_REFLEN],
        data_name[FN_REFLEN];
@@ -335,8 +336,8 @@ MARIA_HA *maria_open(const char *name, int mode, uint open_flags)
     }
     share->mode=open_mode;
     errpos= 1;
-    if (mysql_file_pread(kfile,share->state.header.file_version, head_length, 0,
-                 MYF(MY_NABP)))
+    if (mysql_file_pread(kfile,share->state.header.file_version, head_length,
+                         0, MYF(MY_NABP)))
     {
       my_errno= HA_ERR_NOT_A_TABLE;
       goto err;
@@ -429,6 +430,14 @@ MARIA_HA *maria_open(const char *name, int mode, uint open_flags)
 			    len,MARIA_BASE_INFO_SIZE));
     }
     disk_pos= _ma_base_info_read(disk_cache + base_pos, &share->base);
+    /*
+      Check if old version of Aria file. Version 0 has language
+      stored in header.not_used
+    */
+    file_version= (share->state.header.not_used == 0);
+    if (file_version == 0)
+      share->base.language= share->state.header.not_used;
+    
     share->state.state_length=base_pos;
     /* For newly opened tables we reset the error-has-been-printed flag */
     share->state.changed&= ~STATE_CRASHED_PRINTED;
@@ -1581,7 +1590,7 @@ uint _ma_base_info_write(File file, MARIA_BASE_INFO *base)
   mi_int2store(ptr,base->null_bytes);                   ptr+= 2;
   mi_int2store(ptr,base->original_null_bytes);	        ptr+= 2;
   mi_int2store(ptr,base->field_offsets);	        ptr+= 2;
-  mi_int2store(ptr,0);				        ptr+= 2; /* reserved */
+  mi_int2store(ptr,base->language);		        ptr+= 2;
   mi_int2store(ptr,base->block_size);	        	ptr+= 2;
   *ptr++= base->rec_reflength;
   *ptr++= base->key_reflength;
@@ -1624,7 +1633,7 @@ static uchar *_ma_base_info_read(uchar *ptr, MARIA_BASE_INFO *base)
   base->null_bytes= mi_uint2korr(ptr);			ptr+= 2;
   base->original_null_bytes= mi_uint2korr(ptr);		ptr+= 2;
   base->field_offsets= mi_uint2korr(ptr);		ptr+= 2;
-                                                        ptr+= 2;
+  base->language= mi_uint2korr(ptr);		        ptr+= 2;
   base->block_size= mi_uint2korr(ptr);			ptr+= 2;
 
   base->rec_reflength= *ptr++;
@@ -1689,10 +1698,10 @@ my_bool _ma_keyseg_write(File file, const HA_KEYSEG *keyseg)
   ulong pos;
 
   *ptr++= keyseg->type;
-  *ptr++= keyseg->language;
+  *ptr++= keyseg->language & 0xFF; /* Collation ID, low byte */
   *ptr++= keyseg->null_bit;
   *ptr++= keyseg->bit_start;
-  *ptr++= keyseg->bit_end;
+  *ptr++= keyseg->language >> 8; /* Collation ID, high byte */
   *ptr++= keyseg->bit_length;
   mi_int2store(ptr,keyseg->flag);	ptr+= 2;
   mi_int2store(ptr,keyseg->length);	ptr+= 2;
@@ -1711,7 +1720,7 @@ uchar *_ma_keyseg_read(uchar *ptr, HA_KEYSEG *keyseg)
    keyseg->language	= *ptr++;
    keyseg->null_bit	= *ptr++;
    keyseg->bit_start	= *ptr++;
-   keyseg->bit_end	= *ptr++;
+   keyseg->language	+= ((uint16) (*ptr++)) << 8;
    keyseg->bit_length   = *ptr++;
    keyseg->flag		= mi_uint2korr(ptr);  ptr+= 2;
    keyseg->length	= mi_uint2korr(ptr);  ptr+= 2;
diff --git a/storage/maria/ma_test2.c b/storage/maria/ma_test2.c
index e5e461261b5..8f8849785b8 100644
--- a/storage/maria/ma_test2.c
+++ b/storage/maria/ma_test2.c
@@ -27,9 +27,6 @@
 #define STANDARD_LENGTH 37
 #define MARIA_KEYS 6
 #define MAX_PARTS 4
-#if !defined(MSDOS) && !defined(labs)
-#define labs(a) abs(a)
-#endif
 
 static void get_options(int argc, char *argv[]);
 static uint rnd(uint max_value);
diff --git a/storage/maria/maria_chk.c b/storage/maria/maria_chk.c
index c9d38400bc4..0c1c56dfa94 100644
--- a/storage/maria/maria_chk.c
+++ b/storage/maria/maria_chk.c
@@ -1120,7 +1120,7 @@ static int maria_chk(HA_CHECK *param, char *filename)
        maria_test_if_almost_full(info) ||
        info->s->state.header.file_version[3] != maria_file_magic[3] ||
        (set_collation &&
-        set_collation->number != share->state.header.language)))
+        set_collation->number != share->base.language)))
   {
     if (set_collation)
       param->language= set_collation->number;
@@ -1507,8 +1507,8 @@ static void descript(HA_CHECK *param, register MARIA_HA *info, char *name)
   printf("Crashsafe:           %s\n",
          share->base.born_transactional ? "yes" : "no");
   printf("Character set:       %s (%d)\n",
-	 get_charset_name(share->state.header.language),
-	 share->state.header.language);
+	 get_charset_name(share->base.language),
+         (int) share->base.language);
 
   if (param->testflag & T_VERBOSE)
   {
diff --git a/storage/maria/maria_def.h b/storage/maria/maria_def.h
index a4c6e5295d2..19910f4288d 100644
--- a/storage/maria/maria_def.h
+++ b/storage/maria/maria_def.h
@@ -139,7 +139,7 @@ typedef struct st_maria_state_info
     uchar unique_key_parts[2];		/* Key parts + unique parts */
     uchar keys;				/* number of keys in file */
     uchar uniques;			/* number of UNIQUE definitions */
-    uchar language;			/* Language for indexes */
+    uchar not_used;			/* Was language in old files */
     uchar fulltext_keys;
     uchar data_file_type;
     /* Used by mariapack to store the original data_file_type */
@@ -209,6 +209,7 @@ typedef struct st_maria_state_info
 } MARIA_STATE_INFO;
 
 
+/* Number of bytes written by _ma_state_info_write_sub() */
 #define MARIA_STATE_INFO_SIZE	\
   (24 + 2 + LSN_STORE_SIZE*3 + 4 + 11*8 + 4*4 + 8 + 3*4 + 5*8)
 #define MARIA_FILE_OPEN_COUNT_OFFSET 0
@@ -291,6 +292,8 @@ typedef struct st_ma_base_info
   uint extra_rec_buff_size;
   /* Tuning flags that can be ignored by older Maria versions */
   uint extra_options;
+  /* default language, not really used but displayed by maria_chk */
+  uint language;
 
   /* The following are from the header */
   uint key_parts, all_key_parts;
@@ -916,7 +919,6 @@ extern mysql_mutex_t THR_LOCK_maria;
 #define MARIA_SMALL_BLOB_BUFFER 1024
 #define MARIA_MAX_CONTROL_FILE_LOCK_RETRY 30     /* Retry this many times */
 
-
 /* Some extern variables */
 extern LIST *maria_open_list;
 extern uchar maria_file_magic[], maria_pack_file_magic[];
diff --git a/storage/myisam/ft_boolean_search.c b/storage/myisam/ft_boolean_search.c
index 4480a67ebd7..a44e24c9db1 100644
--- a/storage/myisam/ft_boolean_search.c
+++ b/storage/myisam/ft_boolean_search.c
@@ -195,12 +195,7 @@ static int ftb_query_add_word(MYSQL_FTPARSER_PARAM *param,
   switch (info->type) {
     case FT_TOKEN_WORD:
       ftbw= (FTB_WORD *)alloc_root(&ftb_param->ftb->mem_root,
-                                   sizeof(FTB_WORD) +
-                                   (info->trunc ? HA_MAX_KEY_BUFF :
-                                    (word_len + 1) *
-                                    ftb_param->ftb->charset->mbmaxlen +
-                                    HA_FT_WLEN +
-                                    ftb_param->ftb->info->s->rec_reflength));
+                                   sizeof(FTB_WORD) + HA_MAX_KEY_BUFF);
       ftbw->len= word_len + 1;
       ftbw->flags= 0;
       ftbw->off= 0;
diff --git a/storage/myisam/ft_static.c b/storage/myisam/ft_static.c
index aa8adba88a2..92a0621fd9f 100644
--- a/storage/myisam/ft_static.c
+++ b/storage/myisam/ft_static.c
@@ -34,7 +34,7 @@ const HA_KEYSEG ft_keysegs[FT_SEGS]= {
   63,                                           /* language (will be overwritten) */
   HA_KEYTYPE_VARTEXT2,                          /* type */
   0,                                            /* null_bit */
-  2, 0, 0                                       /* bit_start, bit_end, bit_length */
+  2, 0                                          /* bit_start, bit_length */
 },
 {
   /*
@@ -42,7 +42,7 @@ const HA_KEYSEG ft_keysegs[FT_SEGS]= {
       be packed in any way, otherwise w_search() won't be able to
       update key entry 'in vivo'
     */
-  0, 0, 0, 0, HA_NO_SORT, HA_FT_WLEN, 63, HA_FT_WTYPE, 0, 0, 0, 0
+  0, 0, 0, 0, HA_NO_SORT, HA_FT_WLEN, 63, HA_FT_WTYPE, 0, 0, 0
 }
 };
 
diff --git a/storage/myisam/ha_myisam.cc b/storage/myisam/ha_myisam.cc
index 61f7fd37486..9e09853871d 100644
--- a/storage/myisam/ha_myisam.cc
+++ b/storage/myisam/ha_myisam.cc
@@ -279,8 +279,7 @@ int table2myisam(TABLE *table_arg, MI_KEYDEF **keydef_out,
       keydef[i].seg[j].type= (int) type;
       keydef[i].seg[j].start= pos->key_part[j].offset;
       keydef[i].seg[j].length= pos->key_part[j].length;
-      keydef[i].seg[j].bit_start= keydef[i].seg[j].bit_end=
-        keydef[i].seg[j].bit_length= 0;
+      keydef[i].seg[j].bit_start= keydef[i].seg[j].bit_length= 0;
       keydef[i].seg[j].bit_pos= 0;
       keydef[i].seg[j].language= field->charset_for_protocol()->number;
 
diff --git a/storage/myisam/mi_create.c b/storage/myisam/mi_create.c
index dbf2343c2a6..5cffd1a759f 100644
--- a/storage/myisam/mi_create.c
+++ b/storage/myisam/mi_create.c
@@ -730,7 +730,6 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs,
       sseg.language= 7;                         /* Binary */
       sseg.null_bit=0;
       sseg.bit_start=0;
-      sseg.bit_end=0;
       sseg.bit_length= 0;
       sseg.bit_pos= 0;
       sseg.length=SPLEN;
diff --git a/storage/myisam/mi_open.c b/storage/myisam/mi_open.c
index 776594a6409..01a13a6225c 100644
--- a/storage/myisam/mi_open.c
+++ b/storage/myisam/mi_open.c
@@ -1189,7 +1189,6 @@ uchar *mi_keyseg_read(uchar *ptr, HA_KEYSEG *keyseg)
    keyseg->length	= mi_uint2korr(ptr);  ptr +=2;
    keyseg->start	= mi_uint4korr(ptr);  ptr +=4;
    keyseg->null_pos	= mi_uint4korr(ptr);  ptr +=4;
-   keyseg->bit_end= 0;
    keyseg->charset=0;				/* Will be filled in later */
    if (keyseg->null_bit)
      /* We adjust bit_pos if null_bit is last in the byte */
diff --git a/storage/myisam/mi_test2.c b/storage/myisam/mi_test2.c
index be58b3c54d0..32dabca0ef5 100644
--- a/storage/myisam/mi_test2.c
+++ b/storage/myisam/mi_test2.c
@@ -26,9 +26,6 @@
 #define STANDARD_LENGTH 37
 #define MYISAM_KEYS 6
 #define MAX_PARTS 4
-#if !defined(labs)
-#define labs(a) abs(a)
-#endif
 
 static void get_options(int argc, char *argv[]);
 static uint rnd(uint max_value);
diff --git a/storage/oqgraph/graphcore.cc b/storage/oqgraph/graphcore.cc
index 4346b94805c..7c8ca53c096 100644
--- a/storage/oqgraph/graphcore.cc
+++ b/storage/oqgraph/graphcore.cc
@@ -485,7 +485,7 @@ namespace open_query
   optional<Vertex>
   oqgraph_share::find_vertex(VertexID id) const
   {
-    return ::boost::find_vertex(id, g);
+    return oqgraph3::find_vertex(id, g);
   }
 
 #if 0
diff --git a/storage/oqgraph/oqgraph_shim.h b/storage/oqgraph/oqgraph_shim.h
index af240b88ebd..004d7f0f7c5 100644
--- a/storage/oqgraph/oqgraph_shim.h
+++ b/storage/oqgraph/oqgraph_shim.h
@@ -274,6 +274,33 @@ namespace boost
   };
 #endif
 
+  template<>
+  struct property_map<oqgraph3::graph, edge_weight_t>
+  {
+    typedef void type;
+    typedef oqgraph3::edge_weight_property_map const_type;
+  };
+
+  template<>
+  struct property_map<oqgraph3::graph, vertex_index_t>
+  {
+    typedef void type;
+    typedef oqgraph3::vertex_index_property_map const_type;
+  };
+
+  template<>
+  struct property_map<oqgraph3::graph, edge_index_t>
+  {
+    typedef void type;
+    typedef oqgraph3::edge_index_property_map const_type;
+  };
+
+}
+
+namespace oqgraph3
+{
+  using namespace boost;
+
   inline graph_traits<oqgraph3::graph>::vertex_descriptor
   source(
       const graph_traits<oqgraph3::graph>::edge_descriptor& e,
@@ -401,27 +428,6 @@ namespace boost
     return count;
   }
 
-  template<>
-  struct property_map<oqgraph3::graph, edge_weight_t>
-  {
-    typedef void type;
-    typedef oqgraph3::edge_weight_property_map const_type;
-  };
-
-  template<>
-  struct property_map<oqgraph3::graph, vertex_index_t>
-  {
-    typedef void type;
-    typedef oqgraph3::vertex_index_property_map const_type;
-  };
-
-  template<>
-  struct property_map<oqgraph3::graph, edge_index_t>
-  {
-    typedef void type;
-    typedef oqgraph3::edge_index_property_map const_type;
-  };
-
   inline property_map<
       oqgraph3::graph,
       edge_weight_t>::const_type::reference
diff --git a/storage/perfschema/ha_perfschema.cc b/storage/perfschema/ha_perfschema.cc
index d703d5d594a..598fc1061d8 100644
--- a/storage/perfschema/ha_perfschema.cc
+++ b/storage/perfschema/ha_perfschema.cc
@@ -225,7 +225,7 @@ maria_declare_plugin(perfschema)
   0x0001,
   pfs_status_vars,
   NULL,
-  "5.6.32",
+  "5.6.33",
   MariaDB_PLUGIN_MATURITY_STABLE
 }
 maria_declare_plugin_end;
diff --git a/storage/sphinx/mysql-test/sphinx/disabled.def b/storage/sphinx/mysql-test/sphinx/disabled.def
new file mode 100644
index 00000000000..a85b8b71e52
--- /dev/null
+++ b/storage/sphinx/mysql-test/sphinx/disabled.def
@@ -0,0 +1,2 @@
+sphinx     : MDEV-10986, MDEV-10985
+union-5539 : MDEV-10986, MDEV-10985
diff --git a/storage/tokudb/CMakeLists.txt b/storage/tokudb/CMakeLists.txt
index fd3bf90f8c4..4cfb177e495 100644
--- a/storage/tokudb/CMakeLists.txt
+++ b/storage/tokudb/CMakeLists.txt
@@ -1,4 +1,4 @@
-SET(TOKUDB_VERSION 5.6.31-77.0)
+SET(TOKUDB_VERSION 5.6.34-79.1)
 # PerconaFT only supports x86-64 and cmake-2.8.9+
 IF(CMAKE_VERSION VERSION_LESS "2.8.9")
   MESSAGE(STATUS "CMake 2.8.9 or higher is required by TokuDB")
diff --git a/storage/tokudb/PerconaFT/buildheader/make_tdb.cc b/storage/tokudb/PerconaFT/buildheader/make_tdb.cc
index 4b62703480f..0145d631839 100644
--- a/storage/tokudb/PerconaFT/buildheader/make_tdb.cc
+++ b/storage/tokudb/PerconaFT/buildheader/make_tdb.cc
@@ -367,8 +367,8 @@ static void print_db_env_struct (void) {
                              "int (*checkpointing_get_period)             (DB_ENV*, uint32_t*) /* Retrieve the delay between automatic checkpoints.  0 means disabled. */",
                              "int (*cleaner_set_period)                   (DB_ENV*, uint32_t) /* Change the delay between automatic cleaner attempts.  0 means disabled. */",
                              "int (*cleaner_get_period)                   (DB_ENV*, uint32_t*) /* Retrieve the delay between automatic cleaner attempts.  0 means disabled. */",
-                             "int (*cleaner_set_iterations)               (DB_ENV*, uint32_t) /* Change the number of attempts on each cleaner invokation.  0 means disabled. */",
-                             "int (*cleaner_get_iterations)               (DB_ENV*, uint32_t*) /* Retrieve the number of attempts on each cleaner invokation.  0 means disabled. */",
+                             "int (*cleaner_set_iterations)               (DB_ENV*, uint32_t) /* Change the number of attempts on each cleaner invocation.  0 means disabled. */",
+                             "int (*cleaner_get_iterations)               (DB_ENV*, uint32_t*) /* Retrieve the number of attempts on each cleaner invocation.  0 means disabled. */",
                              "int (*evictor_set_enable_partial_eviction)  (DB_ENV*, bool) /* Enables or disabled partial eviction of nodes from cachetable. */",
                              "int (*evictor_get_enable_partial_eviction)  (DB_ENV*, bool*) /* Retrieve the status of partial eviction of nodes from cachetable. */",
                              "int (*checkpointing_postpone)               (DB_ENV*) /* Use for 'rename table' or any other operation that must be disjoint from a checkpoint */",
@@ -405,6 +405,7 @@ static void print_db_env_struct (void) {
                              "int (*set_lock_timeout)                     (DB_ENV *env, uint64_t default_lock_wait_time_msec, uint64_t (*get_lock_wait_time_cb)(uint64_t default_lock_wait_time))",
                              "int (*get_lock_timeout)                     (DB_ENV *env, uint64_t *lock_wait_time_msec)",
                              "int (*set_lock_timeout_callback)            (DB_ENV *env, lock_timeout_callback callback)",
+                             "int (*set_lock_wait_callback)               (DB_ENV *env, lock_wait_callback callback)",
                              "int (*txn_xa_recover)                       (DB_ENV*, TOKU_XA_XID list[/*count*/], long count, /*out*/ long *retp, uint32_t flags)",
                              "int (*get_txn_from_xid)                     (DB_ENV*, /*in*/ TOKU_XA_XID *, /*out*/ DB_TXN **)",
                              "DB* (*get_db_for_directory)                 (DB_ENV*)",
@@ -422,6 +423,10 @@ static void print_db_env_struct (void) {
                              "int (*set_checkpoint_pool_threads)(DB_ENV *, uint32_t)",
                              "void (*set_check_thp)(DB_ENV *, bool new_val)",
                              "bool (*get_check_thp)(DB_ENV *)",
+                             "bool (*set_dir_per_db)(DB_ENV *, bool new_val)",
+                             "bool (*get_dir_per_db)(DB_ENV *)",
+                             "const char *(*get_data_dir)(DB_ENV *env)",
+                             "void (*kill_waiter)(DB_ENV *, void *extra)",
                              NULL};
 
         sort_and_dump_fields("db_env", true, extra);
@@ -542,8 +547,8 @@ static void print_db_txn_struct (void) {
 	"int (*abort_with_progress)(DB_TXN*, TXN_PROGRESS_POLL_FUNCTION, void*)",
 	"int (*xa_prepare) (DB_TXN*, TOKU_XA_XID *, uint32_t flags)",
         "uint64_t (*id64) (DB_TXN*)",
-        "void (*set_client_id)(DB_TXN *, uint64_t client_id)",
-        "uint64_t (*get_client_id)(DB_TXN *)",
+        "void (*set_client_id)(DB_TXN *, uint64_t client_id, void *client_extra)",
+        "void (*get_client_id)(DB_TXN *, uint64_t *client_id, void **client_extra)",
         "bool (*is_prepared)(DB_TXN *)",
         "DB_TXN *(*get_child)(DB_TXN *)",
         "uint64_t (*get_start_time)(DB_TXN *)",
@@ -747,6 +752,7 @@ int main (int argc, char *const argv[] __attribute__((__unused__))) {
     printf("void toku_dbt_array_resize(DBT_ARRAY *dbts, uint32_t size) %s;\n", VISIBLE);
 
     printf("typedef void (*lock_timeout_callback)(DB *db, uint64_t requesting_txnid, const DBT *left_key, const DBT *right_key, uint64_t blocking_txnid);\n");
+    printf("typedef void (*lock_wait_callback)(void *arg, uint64_t requesting_txnid, uint64_t blocking_txnid);\n");
     printf("typedef int (*iterate_row_locks_callback)(DB **db, DBT *left_key, DBT *right_key, void *extra);\n");
     printf("typedef int (*iterate_transactions_callback)(DB_TXN *dbtxn, iterate_row_locks_callback cb, void *locks_extra, void *extra);\n");
     printf("typedef int (*iterate_requests_callback)(DB *db, uint64_t requesting_txnid, const DBT *left_key, const DBT *right_key, uint64_t blocking_txnid, uint64_t start_time, void *extra);\n");
diff --git a/storage/tokudb/PerconaFT/cmake_modules/TokuFeatureDetection.cmake b/storage/tokudb/PerconaFT/cmake_modules/TokuFeatureDetection.cmake
index e3c900fbadb..2f04a33558a 100644
--- a/storage/tokudb/PerconaFT/cmake_modules/TokuFeatureDetection.cmake
+++ b/storage/tokudb/PerconaFT/cmake_modules/TokuFeatureDetection.cmake
@@ -97,7 +97,7 @@ if (NOT HAVE_BACKTRACE_WITHOUT_EXECINFO)
   endif ()
 endif ()
 
-if(HAVE_CLOCK_REALTIME)
+if(HAVE_CLOCK_REALTIME AND (NOT APPLE))
   list(APPEND EXTRA_SYSTEM_LIBS rt)
 else()
   list(APPEND EXTRA_SYSTEM_LIBS System)
@@ -109,6 +109,8 @@ check_function_exists(pthread_rwlockattr_setkind_np HAVE_PTHREAD_RWLOCKATTR_SETK
 ## check for the right way to yield using pthreads
 check_function_exists(pthread_yield HAVE_PTHREAD_YIELD)
 check_function_exists(pthread_yield_np HAVE_PTHREAD_YIELD_NP)
+## check if we have pthread_threadid_np() (i.e. osx)
+check_function_exists(pthread_threadid_np HAVE_PTHREAD_THREADID_NP)
 ## check if we have pthread_getthreadid_np() (i.e. freebsd)
 check_function_exists(pthread_getthreadid_np HAVE_PTHREAD_GETTHREADID_NP)
 check_function_exists(sched_getcpu HAVE_SCHED_GETCPU)
diff --git a/storage/tokudb/PerconaFT/cmake_modules/TokuSetupCompiler.cmake b/storage/tokudb/PerconaFT/cmake_modules/TokuSetupCompiler.cmake
index 77f6d8f67b7..f7e7f76e96e 100644
--- a/storage/tokudb/PerconaFT/cmake_modules/TokuSetupCompiler.cmake
+++ b/storage/tokudb/PerconaFT/cmake_modules/TokuSetupCompiler.cmake
@@ -66,11 +66,10 @@ set_cflags_if_supported(
   -Wno-error=address-of-array-temporary
   -Wno-error=tautological-constant-out-of-range-compare
   -Wno-error=maybe-uninitialized
-  -Wno-ignored-attributes
   -Wno-error=extern-c-compat
-  -Wno-pointer-bool-conversion
   -fno-rtti
   -fno-exceptions
+  -Wno-error=nonnull-compare
   )
 ## set_cflags_if_supported_named("-Weffc++" -Weffcpp)
 
diff --git a/storage/tokudb/PerconaFT/ft/CMakeLists.txt b/storage/tokudb/PerconaFT/ft/CMakeLists.txt
index 11091073ac2..6696c26ecc0 100644
--- a/storage/tokudb/PerconaFT/ft/CMakeLists.txt
+++ b/storage/tokudb/PerconaFT/ft/CMakeLists.txt
@@ -55,8 +55,8 @@ set(FT_SOURCES
   msg_buffer
   node
   pivotkeys
+  serialize/rbtree_mhs
   serialize/block_allocator
-  serialize/block_allocator_strategy
   serialize/block_table
   serialize/compress
   serialize/ft_node-serialize
diff --git a/storage/tokudb/PerconaFT/ft/cachetable/cachetable-internal.h b/storage/tokudb/PerconaFT/ft/cachetable/cachetable-internal.h
index dc6aec9226d..05fb771de08 100644
--- a/storage/tokudb/PerconaFT/ft/cachetable/cachetable-internal.h
+++ b/storage/tokudb/PerconaFT/ft/cachetable/cachetable-internal.h
@@ -138,6 +138,8 @@ struct cachefile {
     // nor attempt to open any cachefile with the same fname (dname)
     // until this cachefile has been fully closed and unlinked.
     bool unlink_on_close;
+    // If set, fclose will not be logged in the recovery log.
+    bool skip_log_recover_on_close;
     int fd;       /* Bug: If a file is opened read-only, then it is stuck in read-only.  If it is opened read-write, then subsequent writers can write to it too. */
     CACHETABLE cachetable;
     struct fileid fileid;
diff --git a/storage/tokudb/PerconaFT/ft/cachetable/cachetable.cc b/storage/tokudb/PerconaFT/ft/cachetable/cachetable.cc
index 4495694e06f..4505a236e13 100644
--- a/storage/tokudb/PerconaFT/ft/cachetable/cachetable.cc
+++ b/storage/tokudb/PerconaFT/ft/cachetable/cachetable.cc
@@ -468,6 +468,10 @@ toku_cachefile_fname_in_env (CACHEFILE cf) {
     return cf->fname_in_env;
 }
 
+void toku_cachefile_set_fname_in_env(CACHEFILE cf, char *new_fname_in_env) {
+    cf->fname_in_env = new_fname_in_env;
+}
+
 int 
 toku_cachefile_get_fd (CACHEFILE cf) {
     return cf->fd;
@@ -2904,6 +2908,18 @@ bool toku_cachefile_is_unlink_on_close(CACHEFILE cf) {
     return cf->unlink_on_close;
 }
 
+void toku_cachefile_skip_log_recover_on_close(CACHEFILE cf) {
+    cf->skip_log_recover_on_close = true;
+}
+
+void toku_cachefile_do_log_recover_on_close(CACHEFILE cf) {
+    cf->skip_log_recover_on_close = false;
+}
+
+bool toku_cachefile_is_skip_log_recover_on_close(CACHEFILE cf) {
+    return cf->skip_log_recover_on_close;
+}
+
 uint64_t toku_cachefile_size(CACHEFILE cf) {
     int64_t file_size;
     int fd = toku_cachefile_get_fd(cf);
diff --git a/storage/tokudb/PerconaFT/ft/cachetable/cachetable.h b/storage/tokudb/PerconaFT/ft/cachetable/cachetable.h
index 148326562ab..3b3cb0a2d46 100644
--- a/storage/tokudb/PerconaFT/ft/cachetable/cachetable.h
+++ b/storage/tokudb/PerconaFT/ft/cachetable/cachetable.h
@@ -500,12 +500,18 @@ int toku_cachefile_get_fd (CACHEFILE);
 // Return the filename
 char * toku_cachefile_fname_in_env (CACHEFILE cf);
 
+void toku_cachefile_set_fname_in_env(CACHEFILE cf, char *new_fname_in_env);
+
 // Make it so when the cachefile closes, the underlying file is unlinked
 void toku_cachefile_unlink_on_close(CACHEFILE cf);
 
 // is this cachefile marked as unlink on close?
 bool toku_cachefile_is_unlink_on_close(CACHEFILE cf);
 
+void toku_cachefile_skip_log_recover_on_close(CACHEFILE cf);
+void toku_cachefile_do_log_recover_on_close(CACHEFILE cf);
+bool toku_cachefile_is_skip_log_recover_on_close(CACHEFILE cf);
+
 // Return the logger associated with the cachefile
 struct tokulogger *toku_cachefile_logger(CACHEFILE cf);
 
diff --git a/storage/tokudb/PerconaFT/ft/ft-flusher.cc b/storage/tokudb/PerconaFT/ft/ft-flusher.cc
index 6c682f3c215..3fd59f388ce 100644
--- a/storage/tokudb/PerconaFT/ft/ft-flusher.cc
+++ b/storage/tokudb/PerconaFT/ft/ft-flusher.cc
@@ -1471,7 +1471,7 @@ void toku_ft_flush_some_child(FT ft, FTNODE parent, struct flusher_advice *fa)
     // It is possible after reading in the entire child,
     // that we now know that the child is not reactive
     // if so, we can unpin parent right now
-    // we wont be splitting/merging child
+    // we won't be splitting/merging child
     // and we have already replaced the bnc
     // for the root with a fresh one
     enum reactivity child_re = toku_ftnode_get_reactivity(ft, child);
diff --git a/storage/tokudb/PerconaFT/ft/ft-ops.cc b/storage/tokudb/PerconaFT/ft/ft-ops.cc
index 02fa8bda0e5..238290df949 100644
--- a/storage/tokudb/PerconaFT/ft/ft-ops.cc
+++ b/storage/tokudb/PerconaFT/ft/ft-ops.cc
@@ -150,22 +150,23 @@ basement nodes, bulk fetch,  and partial fetch:
 #include <my_global.h>
 #include "ft/cachetable/checkpoint.h"
 #include "ft/cursor.h"
-#include "ft/ft.h"
 #include "ft/ft-cachetable-wrappers.h"
 #include "ft/ft-flusher.h"
 #include "ft/ft-internal.h"
-#include "ft/msg.h"
+#include "ft/ft.h"
 #include "ft/leafentry.h"
 #include "ft/logger/log-internal.h"
+#include "ft/msg.h"
 #include "ft/node.h"
 #include "ft/serialize/block_table.h"
-#include "ft/serialize/sub_block.h"
 #include "ft/serialize/ft-serialize.h"
 #include "ft/serialize/ft_layout_version.h"
 #include "ft/serialize/ft_node-serialize.h"
+#include "ft/serialize/sub_block.h"
 #include "ft/txn/txn_manager.h"
-#include "ft/ule.h"
 #include "ft/txn/xids.h"
+#include "ft/ule.h"
+#include "src/ydb-internal.h"
 
 #include <toku_race_tools.h>
 
@@ -180,6 +181,7 @@ basement nodes, bulk fetch,  and partial fetch:
 
 #include <stdint.h>
 
+#include <memory>
 /* Status is intended for display to humans to help understand system behavior.
  * It does not need to be perfectly thread-safe.
  */
@@ -599,15 +601,12 @@ void toku_ftnode_checkpoint_complete_callback(void *value_data) {
     }
 }
 
-void toku_ftnode_clone_callback(
-    void* value_data,
-    void** cloned_value_data,
-    long* clone_size,
-    PAIR_ATTR* new_attr,
-    bool for_checkpoint,
-    void* write_extraargs
-    )
-{
+void toku_ftnode_clone_callback(void *value_data,
+                                void **cloned_value_data,
+                                long *clone_size,
+                                PAIR_ATTR *new_attr,
+                                bool for_checkpoint,
+                                void *write_extraargs) {
     FTNODE node = static_cast<FTNODE>(value_data);
     toku_ftnode_assert_fully_in_memory(node);
     FT ft = static_cast<FT>(write_extraargs);
@@ -619,13 +618,16 @@ void toku_ftnode_clone_callback(
         toku_ftnode_leaf_rebalance(node, ft->h->basementnodesize);
     }
 
-    cloned_node->oldest_referenced_xid_known = node->oldest_referenced_xid_known;
-    cloned_node->max_msn_applied_to_node_on_disk = node->max_msn_applied_to_node_on_disk;
+    cloned_node->oldest_referenced_xid_known =
+        node->oldest_referenced_xid_known;
+    cloned_node->max_msn_applied_to_node_on_disk =
+        node->max_msn_applied_to_node_on_disk;
     cloned_node->flags = node->flags;
     cloned_node->blocknum = node->blocknum;
     cloned_node->layout_version = node->layout_version;
     cloned_node->layout_version_original = node->layout_version_original;
-    cloned_node->layout_version_read_from_disk = node->layout_version_read_from_disk;
+    cloned_node->layout_version_read_from_disk =
+        node->layout_version_read_from_disk;
     cloned_node->build_id = node->build_id;
     cloned_node->height = node->height;
     cloned_node->dirty = node->dirty;
@@ -650,38 +652,39 @@ void toku_ftnode_clone_callback(
     // set new pair attr if necessary
     if (node->height == 0) {
         *new_attr = make_ftnode_pair_attr(node);
-    }
-    else {
+        for (int i = 0; i < node->n_children; i++) {
+            BLB(node, i)->logical_rows_delta = 0;
+            BLB(cloned_node, i)->logical_rows_delta = 0;
+        }
+    } else {
         new_attr->is_valid = false;
     }
     *clone_size = ftnode_memory_size(cloned_node);
     *cloned_value_data = cloned_node;
 }
 
-void toku_ftnode_flush_callback(
-    CACHEFILE UU(cachefile),
-    int fd,
-    BLOCKNUM blocknum,
-    void *ftnode_v,
-    void** disk_data,
-    void *extraargs,
-    PAIR_ATTR size __attribute__((unused)),
-    PAIR_ATTR* new_size,
-    bool write_me,
-    bool keep_me,
-    bool for_checkpoint,
-    bool is_clone
-    )
-{
-    FT ft = (FT) extraargs;
-    FTNODE ftnode = (FTNODE) ftnode_v;
-    FTNODE_DISK_DATA* ndd = (FTNODE_DISK_DATA*)disk_data;
+void toku_ftnode_flush_callback(CACHEFILE UU(cachefile),
+                                int fd,
+                                BLOCKNUM blocknum,
+                                void *ftnode_v,
+                                void **disk_data,
+                                void *extraargs,
+                                PAIR_ATTR size __attribute__((unused)),
+                                PAIR_ATTR *new_size,
+                                bool write_me,
+                                bool keep_me,
+                                bool for_checkpoint,
+                                bool is_clone) {
+    FT ft = (FT)extraargs;
+    FTNODE ftnode = (FTNODE)ftnode_v;
+    FTNODE_DISK_DATA *ndd = (FTNODE_DISK_DATA *)disk_data;
     assert(ftnode->blocknum.b == blocknum.b);
     int height = ftnode->height;
     if (write_me) {
         toku_ftnode_assert_fully_in_memory(ftnode);
         if (height > 0 && !is_clone) {
-            // cloned nodes already had their stale messages moved, see toku_ftnode_clone_callback()
+            // cloned nodes already had their stale messages moved, see
+            // toku_ftnode_clone_callback()
             toku_move_ftnode_messages_to_stale(ft, ftnode);
         } else if (height == 0) {
             toku_ftnode_leaf_run_gc(ft, ftnode);
@@ -689,7 +692,8 @@ void toku_ftnode_flush_callback(
                 toku_ftnode_update_disk_stats(ftnode, ft, for_checkpoint);
             }
         }
-        int r = toku_serialize_ftnode_to(fd, ftnode->blocknum, ftnode, ndd, !is_clone, ft, for_checkpoint);
+        int r = toku_serialize_ftnode_to(
+            fd, ftnode->blocknum, ftnode, ndd, !is_clone, ft, for_checkpoint);
         assert_zero(r);
         ftnode->layout_version_read_from_disk = FT_LAYOUT_VERSION;
     }
@@ -704,20 +708,22 @@ void toku_ftnode_flush_callback(
                 FT_STATUS_INC(FT_FULL_EVICTIONS_NONLEAF_BYTES, node_size);
             }
             toku_free(*disk_data);
-        }
-        else {
+        } else {
             if (ftnode->height == 0) {
                 for (int i = 0; i < ftnode->n_children; i++) {
-                    if (BP_STATE(ftnode,i) == PT_AVAIL) {
+                    if (BP_STATE(ftnode, i) == PT_AVAIL) {
                         BASEMENTNODE bn = BLB(ftnode, i);
-                        toku_ft_decrease_stats(&ft->in_memory_stats, bn->stat64_delta);
+                        toku_ft_decrease_stats(&ft->in_memory_stats,
+                                               bn->stat64_delta);
+                        if (!ftnode->dirty)
+                            toku_ft_adjust_logical_row_count(
+                                ft, -bn->logical_rows_delta);
                     }
                 }
             }
         }
         toku_ftnode_free(&ftnode);
-    }
-    else {
+    } else {
         *new_size = make_ftnode_pair_attr(ftnode);
     }
 }
@@ -846,10 +852,13 @@ static void compress_internal_node_partition(FTNODE node, int i, enum toku_compr
 }
 
 // callback for partially evicting a node
-int toku_ftnode_pe_callback(void *ftnode_pv, PAIR_ATTR old_attr, void *write_extraargs,
-                            void (*finalize)(PAIR_ATTR new_attr, void *extra), void *finalize_extra) {
-    FTNODE node = (FTNODE) ftnode_pv;
-    FT ft = (FT) write_extraargs;
+int toku_ftnode_pe_callback(void *ftnode_pv,
+                            PAIR_ATTR old_attr,
+                            void *write_extraargs,
+                            void (*finalize)(PAIR_ATTR new_attr, void *extra),
+                            void *finalize_extra) {
+    FTNODE node = (FTNODE)ftnode_pv;
+    FT ft = (FT)write_extraargs;
     int num_partial_evictions = 0;
 
     // Hold things we intend to destroy here.
@@ -867,7 +876,8 @@ int toku_ftnode_pe_callback(void *ftnode_pv, PAIR_ATTR old_attr, void *write_ext
     }
     // Don't partially evict nodes whose partitions can't be read back
     // from disk individually
-    if (node->layout_version_read_from_disk < FT_FIRST_LAYOUT_VERSION_WITH_BASEMENT_NODES) {
+    if (node->layout_version_read_from_disk <
+        FT_FIRST_LAYOUT_VERSION_WITH_BASEMENT_NODES) {
         goto exit;
     }
     //
@@ -875,77 +885,77 @@ int toku_ftnode_pe_callback(void *ftnode_pv, PAIR_ATTR old_attr, void *write_ext
     //
     if (node->height > 0) {
         for (int i = 0; i < node->n_children; i++) {
-            if (BP_STATE(node,i) == PT_AVAIL) {
-                if (BP_SHOULD_EVICT(node,i)) {
+            if (BP_STATE(node, i) == PT_AVAIL) {
+                if (BP_SHOULD_EVICT(node, i)) {
                     NONLEAF_CHILDINFO bnc = BNC(node, i);
                     if (ft_compress_buffers_before_eviction &&
-                        // We may not serialize and compress a partition in memory if its
-                        // in memory layout version is different than what's on disk (and
-                        // therefore requires upgrade).
+                        // We may not serialize and compress a partition in
+                        // memory if its in memory layout version is different
+                        // than what's on disk (and therefore requires upgrade).
                         //
-                        // Auto-upgrade code assumes that if a node's layout version read
-                        // from disk is not current, it MUST require upgrade. Breaking
-                        // this rule would cause upgrade code to upgrade this partition
-                        // again after we serialize it as the current version, which is bad.
-                        node->layout_version == node->layout_version_read_from_disk) {
+                        // Auto-upgrade code assumes that if a node's layout
+                        // version read from disk is not current, it MUST
+                        // require upgrade.
+                        // Breaking this rule would cause upgrade code to
+                        // upgrade this partition again after we serialize it as
+                        // the current version, which is bad.
+                        node->layout_version ==
+                            node->layout_version_read_from_disk) {
                         toku_ft_bnc_move_messages_to_stale(ft, bnc);
                         compress_internal_node_partition(
                             node,
                             i,
                             // Always compress with quicklz
-                            TOKU_QUICKLZ_METHOD
-                            );
+                            TOKU_QUICKLZ_METHOD);
                     } else {
                         // We're not compressing buffers before eviction. Simply
-                        // detach the buffer and set the child's state to on-disk.
+                        // detach the buffer and set the child's state to
+                        // on-disk.
                         set_BNULL(node, i);
                         BP_STATE(node, i) = PT_ON_DISK;
                     }
                     buffers_to_destroy[num_buffers_to_destroy++] = bnc;
                     num_partial_evictions++;
+                } else {
+                    BP_SWEEP_CLOCK(node, i);
                 }
-                else {
-                    BP_SWEEP_CLOCK(node,i);
-                }
-            }
-            else {
+            } else {
                 continue;
             }
         }
-    }
-    //
-    // partial eviction strategy for basement nodes:
-    //  if the bn is compressed, evict it
-    //  else: check if it requires eviction, if it does, evict it, if not, sweep the clock count
-    //
-    else {
+    } else {
+        //
+        // partial eviction strategy for basement nodes:
+        //  if the bn is compressed, evict it
+        //  else: check if it requires eviction, if it does, evict it, if not,
+        //  sweep the clock count
+        //
         for (int i = 0; i < node->n_children; i++) {
             // Get rid of compressed stuff no matter what.
-            if (BP_STATE(node,i) == PT_COMPRESSED) {
+            if (BP_STATE(node, i) == PT_COMPRESSED) {
                 SUB_BLOCK sb = BSB(node, i);
                 pointers_to_free[num_pointers_to_free++] = sb->compressed_ptr;
                 pointers_to_free[num_pointers_to_free++] = sb;
                 set_BNULL(node, i);
-                BP_STATE(node,i) = PT_ON_DISK;
+                BP_STATE(node, i) = PT_ON_DISK;
                 num_partial_evictions++;
-            }
-            else if (BP_STATE(node,i) == PT_AVAIL) {
-                if (BP_SHOULD_EVICT(node,i)) {
+            } else if (BP_STATE(node, i) == PT_AVAIL) {
+                if (BP_SHOULD_EVICT(node, i)) {
                     BASEMENTNODE bn = BLB(node, i);
                     basements_to_destroy[num_basements_to_destroy++] = bn;
-                    toku_ft_decrease_stats(&ft->in_memory_stats, bn->stat64_delta);
+                    toku_ft_decrease_stats(&ft->in_memory_stats,
+                                           bn->stat64_delta);
+                    toku_ft_adjust_logical_row_count(ft,
+                                                     -bn->logical_rows_delta);
                     set_BNULL(node, i);
                     BP_STATE(node, i) = PT_ON_DISK;
                     num_partial_evictions++;
+                } else {
+                    BP_SWEEP_CLOCK(node, i);
                 }
-                else {
-                    BP_SWEEP_CLOCK(node,i);
-                }
-            }
-            else if (BP_STATE(node,i) == PT_ON_DISK) {
+            } else if (BP_STATE(node, i) == PT_ON_DISK) {
                 continue;
-            }
-            else {
+            } else {
                 abort();
             }
         }
@@ -2379,12 +2389,16 @@ ft_send_update_msg(FT_HANDLE ft_h, const ft_msg &msg, TOKUTXN txn) {
     toku_ft_root_put_msg(ft_h->ft, msg, &gc_info);
 }
 
-void toku_ft_maybe_update(FT_HANDLE ft_h, const DBT *key, const DBT *update_function_extra,
-                      TOKUTXN txn, bool oplsn_valid, LSN oplsn,
-                      bool do_logging) {
+void toku_ft_maybe_update(FT_HANDLE ft_h,
+                          const DBT *key,
+                          const DBT *update_function_extra,
+                          TOKUTXN txn,
+                          bool oplsn_valid,
+                          LSN oplsn,
+                          bool do_logging) {
     TXNID_PAIR xid = toku_txn_get_txnid(txn);
     if (txn) {
-        BYTESTRING keybs = { key->size, (char *) key->data };
+        BYTESTRING keybs = {key->size, (char *)key->data};
         toku_logger_save_rollback_cmdupdate(
             txn, toku_cachefile_filenum(ft_h->ft->cf), &keybs);
         toku_txn_maybe_note_ft(txn, ft_h->ft);
@@ -2393,22 +2407,33 @@ void toku_ft_maybe_update(FT_HANDLE ft_h, const DBT *key, const DBT *update_func
     TOKULOGGER logger;
     logger = toku_txn_logger(txn);
     if (do_logging && logger) {
-        BYTESTRING keybs = {.len=key->size, .data=(char *) key->data};
-        BYTESTRING extrabs = {.len=update_function_extra->size,
-                              .data = (char *) update_function_extra->data};
-        toku_log_enq_update(logger, NULL, 0, txn,
-                                toku_cachefile_filenum(ft_h->ft->cf),
-                                xid, keybs, extrabs);
+        BYTESTRING keybs = {.len = key->size, .data = (char *)key->data};
+        BYTESTRING extrabs = {.len = update_function_extra->size,
+                              .data = (char *)update_function_extra->data};
+        toku_log_enq_update(logger,
+                            NULL,
+                            0,
+                            txn,
+                            toku_cachefile_filenum(ft_h->ft->cf),
+                            xid,
+                            keybs,
+                            extrabs);
     }
 
     LSN treelsn;
-    if (oplsn_valid && oplsn.lsn <= (treelsn = toku_ft_checkpoint_lsn(ft_h->ft)).lsn) {
+    if (oplsn_valid &&
+        oplsn.lsn <= (treelsn = toku_ft_checkpoint_lsn(ft_h->ft)).lsn) {
         // do nothing
     } else {
-        XIDS message_xids = txn ? toku_txn_get_xids(txn) : toku_xids_get_root_xids();
-        ft_msg msg(key, update_function_extra, FT_UPDATE, ZERO_MSN, message_xids);
+        XIDS message_xids =
+            txn ? toku_txn_get_xids(txn) : toku_xids_get_root_xids();
+        ft_msg msg(
+            key, update_function_extra, FT_UPDATE, ZERO_MSN, message_xids);
         ft_send_update_msg(ft_h, msg, txn);
     }
+    // updates get converted to insert messages, which should do a -1 on the
+    // logical row count when the messages are permanently applied
+    toku_ft_adjust_logical_row_count(ft_h->ft, 1);
 }
 
 void toku_ft_maybe_update_broadcast(FT_HANDLE ft_h, const DBT *update_function_extra,
@@ -2571,12 +2596,104 @@ static inline int ft_open_maybe_direct(const char *filename, int oflag, int mode
 
 static const mode_t file_mode = S_IRUSR+S_IWUSR+S_IRGRP+S_IWGRP+S_IROTH+S_IWOTH;
 
+inline bool toku_file_is_root(const char *path, const char *last_slash) {
+    return last_slash == path;
+}
+
+static std::unique_ptr<char[], decltype(&toku_free)> toku_file_get_parent_dir(
+    const char *path) {
+    std::unique_ptr<char[], decltype(&toku_free)> result(nullptr, &toku_free);
+
+    bool has_trailing_slash = false;
+
+    /* Find the offset of the last slash */
+    const char *last_slash = strrchr(path, OS_PATH_SEPARATOR);
+
+    if (!last_slash) {
+        /* No slash in the path, return NULL */
+        return result;
+    }
+
+    /* Ok, there is a slash. Is there anything after it? */
+    if (static_cast<size_t>(last_slash - path + 1) == strlen(path)) {
+        has_trailing_slash = true;
+    }
+
+    /* Reduce repetitive slashes. */
+    while (last_slash > path && last_slash[-1] == OS_PATH_SEPARATOR) {
+        last_slash--;
+    }
+
+    /* Check for the root of a drive. */
+    if (toku_file_is_root(path, last_slash)) {
+        return result;
+    }
+
+    /* If a trailing slash prevented the first strrchr() from trimming
+    the last component of the path, trim that component now. */
+    if (has_trailing_slash) {
+        /* Back up to the previous slash. */
+        last_slash--;
+        while (last_slash > path && last_slash[0] != OS_PATH_SEPARATOR) {
+            last_slash--;
+        }
+
+        /* Reduce repetitive slashes. */
+        while (last_slash > path && last_slash[-1] == OS_PATH_SEPARATOR) {
+            last_slash--;
+        }
+    }
+
+    /* Check for the root of a drive. */
+    if (toku_file_is_root(path, last_slash)) {
+        return result;
+    }
+
+    result.reset(toku_strndup(path, last_slash - path));
+    return result;
+}
+
+static bool toku_create_subdirs_if_needed(const char *path) {
+    static const mode_t dir_mode = S_IRUSR | S_IWUSR | S_IXUSR | S_IRGRP |
+                                   S_IWGRP | S_IXGRP | S_IROTH | S_IXOTH;
+
+    toku_struct_stat stat;
+    bool subdir_exists = true;
+    auto subdir = toku_file_get_parent_dir(path);
+
+    if (!subdir.get())
+        return true;
+
+    if (toku_stat(subdir.get(), &stat) == -1) {
+        if (ENOENT == get_error_errno())
+            subdir_exists = false;
+        else
+            return false;
+    }
+
+    if (subdir_exists) {
+        if (!S_ISDIR(stat.st_mode))
+            return false;
+        return true;
+    }
+
+    if (!toku_create_subdirs_if_needed(subdir.get()))
+        return false;
+
+    if (toku_os_mkdir(subdir.get(), dir_mode))
+        return false;
+
+    return true;
+}
+
 // open a file for use by the ft
 // Requires:  File does not exist.
 static int ft_create_file(FT_HANDLE UU(ft_handle), const char *fname, int *fdp) {
     int r;
     int fd;
     int er;
+    if (!toku_create_subdirs_if_needed(fname))
+        return get_error_errno();
     fd = ft_open_maybe_direct(fname, O_RDWR | O_BINARY, file_mode);
     assert(fd==-1);
     if ((er = get_maybe_error_errno()) != ENOENT) {
@@ -4405,6 +4522,55 @@ void toku_ft_unlink(FT_HANDLE handle) {
     toku_cachefile_unlink_on_close(cf);
 }
 
+int toku_ft_rename_iname(DB_TXN *txn,
+                         const char *data_dir,
+                         const char *old_iname,
+                         const char *new_iname,
+                         CACHETABLE ct) {
+    int r = 0;
+
+    std::unique_ptr<char[], decltype(&toku_free)> new_iname_full(nullptr,
+                                                                 &toku_free);
+    std::unique_ptr<char[], decltype(&toku_free)> old_iname_full(nullptr,
+                                                                 &toku_free);
+
+    new_iname_full.reset(toku_construct_full_name(2, data_dir, new_iname));
+    old_iname_full.reset(toku_construct_full_name(2, data_dir, old_iname));
+
+    if (txn) {
+        BYTESTRING bs_old_name = {static_cast<uint32_t>(strlen(old_iname) + 1),
+                                  const_cast<char *>(old_iname)};
+        BYTESTRING bs_new_name = {static_cast<uint32_t>(strlen(new_iname) + 1),
+                                  const_cast<char *>(new_iname)};
+        FILENUM filenum = FILENUM_NONE;
+        {
+            CACHEFILE cf;
+            r = toku_cachefile_of_iname_in_env(ct, old_iname, &cf);
+            if (r != ENOENT) {
+                char *old_fname_in_cf = toku_cachefile_fname_in_env(cf);
+                toku_cachefile_set_fname_in_env(cf, toku_xstrdup(new_iname));
+                toku_free(old_fname_in_cf);
+                filenum = toku_cachefile_filenum(cf);
+            }
+        }
+        toku_logger_save_rollback_frename(
+            db_txn_struct_i(txn)->tokutxn, &bs_old_name, &bs_new_name);
+        toku_log_frename(db_txn_struct_i(txn)->tokutxn->logger,
+                         (LSN *)0,
+                         0,
+                         toku_txn_get_txnid(db_txn_struct_i(txn)->tokutxn),
+                         bs_old_name,
+                         filenum,
+                         bs_new_name);
+    }
+
+    r = toku_os_rename(old_iname_full.get(), new_iname_full.get());
+    if (r != 0)
+        return r;
+    r = toku_fsync_directory(new_iname_full.get());
+    return r;
+}
+
 int toku_ft_get_fragmentation(FT_HANDLE ft_handle, TOKU_DB_FRAGMENTATION report) {
     int fd = toku_cachefile_get_fd(ft_handle->ft->cf);
     toku_ft_lock(ft_handle->ft);
diff --git a/storage/tokudb/PerconaFT/ft/ft-ops.h b/storage/tokudb/PerconaFT/ft/ft-ops.h
index 313a74628ea..70cf045d43c 100644
--- a/storage/tokudb/PerconaFT/ft/ft-ops.h
+++ b/storage/tokudb/PerconaFT/ft/ft-ops.h
@@ -48,6 +48,8 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 #include "ft/msg.h"
 #include "util/dbt.h"
 
+#define OS_PATH_SEPARATOR '/'
+
 typedef struct ft_handle *FT_HANDLE;
 
 int toku_open_ft_handle (const char *fname, int is_create, FT_HANDLE *, int nodesize, int basementnodesize, enum toku_compression_method compression_method, CACHETABLE, TOKUTXN, int(*)(DB *,const DBT*,const DBT*)) __attribute__ ((warn_unused_result));
diff --git a/storage/tokudb/PerconaFT/ft/ft-recount-rows.cc b/storage/tokudb/PerconaFT/ft/ft-recount-rows.cc
index adac96f4882..e31d80772d5 100644
--- a/storage/tokudb/PerconaFT/ft/ft-recount-rows.cc
+++ b/storage/tokudb/PerconaFT/ft/ft-recount-rows.cc
@@ -73,30 +73,20 @@ static bool recount_rows_interrupt(void* extra, uint64_t deleted_rows) {
     return rre->_cancelled =
         rre->_progress_callback(rre->_keys, deleted_rows, rre->_progress_extra);
 }
-int toku_ft_recount_rows(
-    FT_HANDLE ft,
-    int (*progress_callback)(
-        uint64_t count,
-        uint64_t deleted,
-        void* progress_extra),
-    void* progress_extra) {
-
+int toku_ft_recount_rows(FT_HANDLE ft,
+                         int (*progress_callback)(uint64_t count,
+                                                  uint64_t deleted,
+                                                  void* progress_extra),
+                         void* progress_extra) {
     int ret = 0;
-    recount_rows_extra_t rre = {
-        progress_callback,
-        progress_extra,
-        0,
-        false
-        };
+    recount_rows_extra_t rre = {progress_callback, progress_extra, 0, false};
 
     ft_cursor c;
     ret = toku_ft_cursor_create(ft, &c, nullptr, C_READ_ANY, false, false);
-    if (ret) return ret;
+    if (ret)
+        return ret;
 
-    toku_ft_cursor_set_check_interrupt_cb(
-        &c,
-        recount_rows_interrupt,
-        &rre);
+    toku_ft_cursor_set_check_interrupt_cb(&c, recount_rows_interrupt, &rre);
 
     ret = toku_ft_cursor_first(&c, recount_rows_found, &rre);
     while (FT_LIKELY(ret == 0)) {
@@ -108,6 +98,7 @@ int toku_ft_recount_rows(
     if (rre._cancelled == false) {
         // update ft count
         toku_unsafe_set(&ft->ft->in_memory_logical_rows, rre._keys);
+        ft->ft->h->dirty = 1;
         ret = 0;
     }
 
diff --git a/storage/tokudb/PerconaFT/ft/ft.cc b/storage/tokudb/PerconaFT/ft/ft.cc
index c9a11182b74..cec54e38a47 100644
--- a/storage/tokudb/PerconaFT/ft/ft.cc
+++ b/storage/tokudb/PerconaFT/ft/ft.cc
@@ -254,7 +254,19 @@ static void ft_close(CACHEFILE cachefile, int fd, void *header_v, bool oplsn_val
             char* fname_in_env = toku_cachefile_fname_in_env(cachefile);
             assert(fname_in_env);
             BYTESTRING bs = {.len=(uint32_t) strlen(fname_in_env), .data=fname_in_env};
-            toku_log_fclose(logger, &lsn, ft->h->dirty, bs, toku_cachefile_filenum(cachefile)); // flush the log on close (if new header is being written), otherwise it might not make it out.
+            if (!toku_cachefile_is_skip_log_recover_on_close(cachefile)) {
+                toku_log_fclose(
+                    logger,
+                    &lsn,
+                    ft->h->dirty,
+                    bs,
+                    toku_cachefile_filenum(cachefile));  // flush the log on
+                                                         // close (if new header
+                                                         // is being written),
+                                                         // otherwise it might
+                                                         // not make it out.
+                toku_cachefile_do_log_recover_on_close(cachefile);
+            }
         }
     }
     if (ft->h->dirty) {               // this is the only place this bit is tested (in currentheader)
@@ -904,6 +916,9 @@ void toku_ft_adjust_logical_row_count(FT ft, int64_t delta) {
     // must be returned in toku_ft_stat64.
     if (delta != 0 && ft->in_memory_logical_rows != (uint64_t)-1) {
         toku_sync_fetch_and_add(&(ft->in_memory_logical_rows), delta);
+        if (ft->in_memory_logical_rows == (uint64_t)-1) {
+            toku_sync_fetch_and_add(&(ft->in_memory_logical_rows), 1);
+        }
     }
 }
 
diff --git a/storage/tokudb/PerconaFT/ft/ft.h b/storage/tokudb/PerconaFT/ft/ft.h
index d600e093bdc..7a3c4fa783c 100644
--- a/storage/tokudb/PerconaFT/ft/ft.h
+++ b/storage/tokudb/PerconaFT/ft/ft.h
@@ -53,6 +53,12 @@ typedef struct ft_options *FT_OPTIONS;
 void toku_ft_unlink(FT_HANDLE handle);
 void toku_ft_unlink_on_commit(FT_HANDLE handle, TOKUTXN txn);
 
+int toku_ft_rename_iname(DB_TXN *txn,
+                         const char *data_dir,
+                         const char *old_iname,
+                         const char *new_iname,
+                         CACHETABLE ct);
+
 void toku_ft_init_reflock(FT ft);
 void toku_ft_destroy_reflock(FT ft);
 void toku_ft_grab_reflock(FT ft);
diff --git a/storage/tokudb/PerconaFT/ft/loader/loader-internal.h b/storage/tokudb/PerconaFT/ft/loader/loader-internal.h
index dd070373e26..1aa2c203831 100644
--- a/storage/tokudb/PerconaFT/ft/loader/loader-internal.h
+++ b/storage/tokudb/PerconaFT/ft/loader/loader-internal.h
@@ -301,7 +301,7 @@ int toku_ft_loader_internal_init (/* out */ FTLOADER *blp,
 
 void toku_ft_loader_internal_destroy (FTLOADER bl, bool is_error);
 
-// For test purposes only.  (In production, the rowset size is determined by negotation with the cachetable for some memory.  See #2613.)
+// For test purposes only.  (In production, the rowset size is determined by negotiation with the cachetable for some memory.  See #2613.)
 uint64_t toku_ft_loader_get_rowset_budget_for_testing (void);
 
 int toku_ft_loader_finish_extractor(FTLOADER bl);
diff --git a/storage/tokudb/PerconaFT/ft/loader/loader.cc b/storage/tokudb/PerconaFT/ft/loader/loader.cc
index 3028aa3d524..f867639b953 100644
--- a/storage/tokudb/PerconaFT/ft/loader/loader.cc
+++ b/storage/tokudb/PerconaFT/ft/loader/loader.cc
@@ -92,7 +92,7 @@ toku_ft_loader_set_size_factor(uint32_t factor) {
 
 uint64_t
 toku_ft_loader_get_rowset_budget_for_testing (void)
-// For test purposes only.  In production, the rowset size is determined by negotation with the cachetable for some memory.  (See #2613).
+// For test purposes only.  In production, the rowset size is determined by negotiation with the cachetable for some memory.  (See #2613).
 {
     return 16ULL*size_factor*1024ULL;
 }
diff --git a/storage/tokudb/PerconaFT/ft/logger/logformat.cc b/storage/tokudb/PerconaFT/ft/logger/logformat.cc
index 6f3baa81c86..49b61138803 100644
--- a/storage/tokudb/PerconaFT/ft/logger/logformat.cc
+++ b/storage/tokudb/PerconaFT/ft/logger/logformat.cc
@@ -90,6 +90,10 @@ const struct logtype rollbacks[] = {
     {"fcreate", 'F', FA{{"FILENUM", "filenum", 0},
                         {"BYTESTRING", "iname", 0},
                         NULLFIELD}, LOG_BEGIN_ACTION_NA},
+    //rename file
+    {"frename", 'n', FA{{"BYTESTRING",    "old_iname", 0},
+                        {"BYTESTRING",    "new_iname", 0},
+                        NULLFIELD}, LOG_BEGIN_ACTION_NA},
     // cmdinsert is used to insert a key-value pair into a DB.  For rollback we don't need the data.
     {"cmdinsert", 'i', FA{
                           {"FILENUM", "filenum", 0},
@@ -195,6 +199,11 @@ const struct logtype logtypes[] = {
     {"fdelete", 'U', FA{{"TXNID_PAIR",      "xid", 0},
                         {"FILENUM", "filenum", 0},
                         NULLFIELD}, SHOULD_LOG_BEGIN},
+    {"frename", 'n', FA{{"TXNID_PAIR",      "xid", 0},
+                        {"BYTESTRING",    "old_iname", 0},
+                        {"FILENUM",       "old_filenum",   0},
+                        {"BYTESTRING",    "new_iname", 0},
+                        NULLFIELD}, IGNORE_LOG_BEGIN},
     {"enq_insert", 'I', FA{{"FILENUM",    "filenum", 0},
                            {"TXNID_PAIR",      "xid", 0},
                            {"BYTESTRING", "key", 0},
diff --git a/storage/tokudb/PerconaFT/ft/logger/recover.cc b/storage/tokudb/PerconaFT/ft/logger/recover.cc
index 77eee8630a9..a9c30c0e37a 100644
--- a/storage/tokudb/PerconaFT/ft/logger/recover.cc
+++ b/storage/tokudb/PerconaFT/ft/logger/recover.cc
@@ -36,7 +36,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 
 #ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
 
-#include <my_global.h>
+#include <memory>
 #include "ft/cachetable/cachetable.h"
 #include "ft/cachetable/checkpoint.h"
 #include "ft/ft.h"
@@ -936,6 +936,83 @@ static int toku_recover_backward_fdelete (struct logtype_fdelete *UU(l), RECOVER
     return 0;
 }
 
+static int toku_recover_frename(struct logtype_frename *l, RECOVER_ENV renv) {
+    assert(renv);
+    assert(renv->env);
+
+    toku_struct_stat stat;
+    const char *data_dir = renv->env->get_data_dir(renv->env);
+    bool old_exist = true;
+    bool new_exist = true;
+
+    assert(data_dir);
+
+    struct file_map_tuple *tuple;
+
+    std::unique_ptr<char[], decltype(&toku_free)> old_iname_full(
+        toku_construct_full_name(2, data_dir, l->old_iname.data), &toku_free);
+    std::unique_ptr<char[], decltype(&toku_free)> new_iname_full(
+        toku_construct_full_name(2, data_dir, l->new_iname.data), &toku_free);
+
+    if (toku_stat(old_iname_full.get(), &stat) == -1) {
+        if (ENOENT == errno)
+            old_exist = false;
+        else
+            return 1;
+    }
+
+    if (toku_stat(new_iname_full.get(), &stat) == -1) {
+        if (ENOENT == errno)
+            new_exist = false;
+        else
+            return 1;
+    }
+
+    // Both old and new files can exist if:
+    // - rename() is not completed
+    // - fcreate was replayed during recovery
+    // The 'stalled cachefiles' container cachefile_list::m_stale_fileid holds
+    // closed but not yet evicted cachefiles; its key is the fs-dependent file
+    // id, i.e. the (device id, inode number) pair. Since the new file is
+    // assumed not to have been created yet during recovery, the 'stalled
+    // cachefiles' container can contain only the cachefile of the old file.
+    // To preserve the old cachefile's file id and keep it in the
+    // 'stalled cachefiles' container, the new file is removed
+    // and the old file is renamed.
+    if (old_exist && new_exist &&
+        (toku_os_unlink(new_iname_full.get()) == -1 ||
+         toku_os_rename(old_iname_full.get(), new_iname_full.get()) == -1 ||
+         toku_fsync_directory(old_iname_full.get()) == -1 ||
+         toku_fsync_directory(new_iname_full.get()) == -1))
+        return 1;
+
+    if (old_exist && !new_exist &&
+        (toku_os_rename(old_iname_full.get(), new_iname_full.get()) == -1 ||
+         toku_fsync_directory(old_iname_full.get()) == -1 ||
+         toku_fsync_directory(new_iname_full.get()) == -1))
+        return 1;
+
+    if (file_map_find(&renv->fmap, l->old_filenum, &tuple) != DB_NOTFOUND) {
+        if (tuple->iname)
+            toku_free(tuple->iname);
+        tuple->iname = toku_xstrdup(l->new_iname.data);
+    }
+
+    TOKUTXN txn = NULL;
+    toku_txnid2txn(renv->logger, l->xid, &txn);
+
+    if (txn)
+        toku_logger_save_rollback_frename(txn, &l->old_iname, &l->new_iname);
+
+    return 0;
+}
+
+static int toku_recover_backward_frename(struct logtype_frename *UU(l),
+                                         RECOVER_ENV UU(renv)) {
+    // nothing
+    return 0;
+}
+
 static int toku_recover_enq_insert (struct logtype_enq_insert *l, RECOVER_ENV renv) {
     int r;
     TOKUTXN txn = NULL;
diff --git a/storage/tokudb/PerconaFT/ft/node.cc b/storage/tokudb/PerconaFT/ft/node.cc
index 928b046bce1..7ddf0f3a1b0 100644
--- a/storage/tokudb/PerconaFT/ft/node.cc
+++ b/storage/tokudb/PerconaFT/ft/node.cc
@@ -374,52 +374,48 @@ find_bounds_within_message_tree(
     }
 }
 
-/**
- * For each message in the ancestor's buffer (determined by childnum) that
- * is key-wise between lower_bound_exclusive and upper_bound_inclusive,
- * apply the message to the basement node.  We treat the bounds as minus
- * or plus infinity respectively if they are NULL.  Do not mark the node
- * as dirty (preserve previous state of 'dirty' bit).
- */
+// For each message in the ancestor's buffer (determined by childnum) that
+// is key-wise between lower_bound_exclusive and upper_bound_inclusive,
+// apply the message to the basement node.  We treat the bounds as minus
+// or plus infinity respectively if they are NULL.  Do not mark the node
+// as dirty (preserve previous state of 'dirty' bit).
 static void bnc_apply_messages_to_basement_node(
-    FT_HANDLE t,             // used for comparison function
-    BASEMENTNODE bn,   // where to apply messages
+    FT_HANDLE t,      // used for comparison function
+    BASEMENTNODE bn,  // where to apply messages
     FTNODE ancestor,  // the ancestor node where we can find messages to apply
-    int childnum,      // which child buffer of ancestor contains messages we want
-    const pivot_bounds &bounds,  // contains pivot key bounds of this basement node
-    txn_gc_info* gc_info,
-    bool* msgs_applied) {
-
+    int childnum,  // which child buffer of ancestor contains messages we want
+    const pivot_bounds &
+        bounds,  // contains pivot key bounds of this basement node
+    txn_gc_info *gc_info,
+    bool *msgs_applied) {
     int r;
     NONLEAF_CHILDINFO bnc = BNC(ancestor, childnum);
 
     // Determine the offsets in the message trees between which we need to
     // apply messages from this buffer
-    STAT64INFO_S stats_delta = {0,0};
+    STAT64INFO_S stats_delta = {0, 0};
     uint64_t workdone_this_ancestor = 0;
     int64_t logical_rows_delta = 0;
 
     uint32_t stale_lbi, stale_ube;
     if (!bn->stale_ancestor_messages_applied) {
-        find_bounds_within_message_tree(
-            t->ft->cmp,
-            bnc->stale_message_tree,
-            &bnc->msg_buffer,
-            bounds,
-            &stale_lbi,
-            &stale_ube);
+        find_bounds_within_message_tree(t->ft->cmp,
+                                        bnc->stale_message_tree,
+                                        &bnc->msg_buffer,
+                                        bounds,
+                                        &stale_lbi,
+                                        &stale_ube);
     } else {
         stale_lbi = 0;
         stale_ube = 0;
     }
     uint32_t fresh_lbi, fresh_ube;
-    find_bounds_within_message_tree(
-        t->ft->cmp,
-        bnc->fresh_message_tree,
-        &bnc->msg_buffer,
-        bounds,
-        &fresh_lbi,
-        &fresh_ube);
+    find_bounds_within_message_tree(t->ft->cmp,
+                                    bnc->fresh_message_tree,
+                                    &bnc->msg_buffer,
+                                    bounds,
+                                    &fresh_lbi,
+                                    &fresh_ube);
 
     // We now know where all the messages we must apply are, so one of the
     // following 4 cases will do the application, depending on which of
@@ -433,44 +429,53 @@ static void bnc_apply_messages_to_basement_node(
         // We have messages in multiple trees, so we grab all
         // the relevant messages' offsets and sort them by MSN, then apply
         // them in MSN order.
-        const int buffer_size = ((stale_ube - stale_lbi) +
-                                 (fresh_ube - fresh_lbi) +
-                                 bnc->broadcast_list.size());
+        const int buffer_size =
+            ((stale_ube - stale_lbi) + (fresh_ube - fresh_lbi) +
+             bnc->broadcast_list.size());
         toku::scoped_malloc offsets_buf(buffer_size * sizeof(int32_t));
         int32_t *offsets = reinterpret_cast<int32_t *>(offsets_buf.get());
-        struct store_msg_buffer_offset_extra sfo_extra = { .offsets = offsets, .i = 0 };
+        struct store_msg_buffer_offset_extra sfo_extra = {.offsets = offsets,
+                                                          .i = 0};
 
         // Populate offsets array with offsets to stale messages
-        r = bnc->stale_message_tree.iterate_on_range<struct store_msg_buffer_offset_extra, store_msg_buffer_offset>(stale_lbi, stale_ube, &sfo_extra);
+        r = bnc->stale_message_tree
+                .iterate_on_range<struct store_msg_buffer_offset_extra,
+                                  store_msg_buffer_offset>(
+                    stale_lbi, stale_ube, &sfo_extra);
         assert_zero(r);
 
         // Then store fresh offsets, and mark them to be moved to stale later.
-        r = bnc->fresh_message_tree.iterate_and_mark_range<struct store_msg_buffer_offset_extra, store_msg_buffer_offset>(fresh_lbi, fresh_ube, &sfo_extra);
+        r = bnc->fresh_message_tree
+                .iterate_and_mark_range<struct store_msg_buffer_offset_extra,
+                                        store_msg_buffer_offset>(
+                    fresh_lbi, fresh_ube, &sfo_extra);
         assert_zero(r);
 
         // Store offsets of all broadcast messages.
-        r = bnc->broadcast_list.iterate<struct store_msg_buffer_offset_extra, store_msg_buffer_offset>(&sfo_extra);
+        r = bnc->broadcast_list.iterate<struct store_msg_buffer_offset_extra,
+                                        store_msg_buffer_offset>(&sfo_extra);
         assert_zero(r);
         invariant(sfo_extra.i == buffer_size);
 
         // Sort by MSN.
-        toku::sort<int32_t, message_buffer, msg_buffer_offset_msn_cmp>::mergesort_r(offsets, buffer_size, bnc->msg_buffer);
+        toku::sort<int32_t, message_buffer, msg_buffer_offset_msn_cmp>::
+            mergesort_r(offsets, buffer_size, bnc->msg_buffer);
 
         // Apply the messages in MSN order.
         for (int i = 0; i < buffer_size; ++i) {
             *msgs_applied = true;
-            do_bn_apply_msg(
-                t,
-                bn,
-                &bnc->msg_buffer,
-                offsets[i],
-                gc_info,
-                &workdone_this_ancestor,
-                &stats_delta,
-                &logical_rows_delta);
+            do_bn_apply_msg(t,
+                            bn,
+                            &bnc->msg_buffer,
+                            offsets[i],
+                            gc_info,
+                            &workdone_this_ancestor,
+                            &stats_delta,
+                            &logical_rows_delta);
         }
     } else if (stale_lbi == stale_ube) {
-        // No stale messages to apply, we just apply fresh messages, and mark them to be moved to stale later.
+        // No stale messages to apply, we just apply fresh messages, and mark
+        // them to be moved to stale later.
         struct iterate_do_bn_apply_msg_extra iter_extra = {
             .t = t,
             .bn = bn,
@@ -478,16 +483,20 @@ static void bnc_apply_messages_to_basement_node(
             .gc_info = gc_info,
             .workdone = &workdone_this_ancestor,
             .stats_to_update = &stats_delta,
-            .logical_rows_delta = &logical_rows_delta
-        };
-        if (fresh_ube - fresh_lbi > 0) *msgs_applied = true;
-        r = bnc->fresh_message_tree.iterate_and_mark_range<struct iterate_do_bn_apply_msg_extra, iterate_do_bn_apply_msg>(fresh_lbi, fresh_ube, &iter_extra);
+            .logical_rows_delta = &logical_rows_delta};
+        if (fresh_ube - fresh_lbi > 0)
+            *msgs_applied = true;
+        r = bnc->fresh_message_tree
+                .iterate_and_mark_range<struct iterate_do_bn_apply_msg_extra,
+                                        iterate_do_bn_apply_msg>(
+                    fresh_lbi, fresh_ube, &iter_extra);
         assert_zero(r);
     } else {
         invariant(fresh_lbi == fresh_ube);
         // No fresh messages to apply, we just apply stale messages.
 
-        if (stale_ube - stale_lbi > 0) *msgs_applied = true;
+        if (stale_ube - stale_lbi > 0)
+            *msgs_applied = true;
         struct iterate_do_bn_apply_msg_extra iter_extra = {
             .t = t,
             .bn = bn,
@@ -495,22 +504,26 @@ static void bnc_apply_messages_to_basement_node(
             .gc_info = gc_info,
             .workdone = &workdone_this_ancestor,
             .stats_to_update = &stats_delta,
-            .logical_rows_delta = &logical_rows_delta
-        };
+            .logical_rows_delta = &logical_rows_delta};
 
-        r = bnc->stale_message_tree.iterate_on_range<struct iterate_do_bn_apply_msg_extra, iterate_do_bn_apply_msg>(stale_lbi, stale_ube, &iter_extra);
+        r = bnc->stale_message_tree
+                .iterate_on_range<struct iterate_do_bn_apply_msg_extra,
+                                  iterate_do_bn_apply_msg>(
+                    stale_lbi, stale_ube, &iter_extra);
         assert_zero(r);
     }
     //
     // update stats
     //
     if (workdone_this_ancestor > 0) {
-        (void) toku_sync_fetch_and_add(&BP_WORKDONE(ancestor, childnum), workdone_this_ancestor);
+        (void)toku_sync_fetch_and_add(&BP_WORKDONE(ancestor, childnum),
+                                      workdone_this_ancestor);
     }
     if (stats_delta.numbytes || stats_delta.numrows) {
         toku_ft_update_stats(&t->ft->in_memory_stats, stats_delta);
     }
     toku_ft_adjust_logical_row_count(t->ft, logical_rows_delta);
+    bn->logical_rows_delta += logical_rows_delta;
 }
 
 static void
diff --git a/storage/tokudb/PerconaFT/ft/node.h b/storage/tokudb/PerconaFT/ft/node.h
index ad0298e81c5..52eefec0936 100644
--- a/storage/tokudb/PerconaFT/ft/node.h
+++ b/storage/tokudb/PerconaFT/ft/node.h
@@ -199,6 +199,7 @@ struct ftnode_leaf_basement_node {
     MSN max_msn_applied;            // max message sequence number applied
     bool stale_ancestor_messages_applied;
     STAT64INFO_S stat64_delta;      // change in stat64 counters since basement was last written to disk
+    int64_t logical_rows_delta;
 };
 typedef struct ftnode_leaf_basement_node *BASEMENTNODE;
 
diff --git a/storage/tokudb/PerconaFT/ft/serialize/block_allocator.cc b/storage/tokudb/PerconaFT/ft/serialize/block_allocator.cc
index 1355f3739ee..19811373d16 100644
--- a/storage/tokudb/PerconaFT/ft/serialize/block_allocator.cc
+++ b/storage/tokudb/PerconaFT/ft/serialize/block_allocator.cc
@@ -46,415 +46,214 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 #include "portability/toku_stdlib.h"
 
 #include "ft/serialize/block_allocator.h"
-#include "ft/serialize/block_allocator_strategy.h"
+#include "ft/serialize/rbtree_mhs.h"
 
 #if TOKU_DEBUG_PARANOID
-#define VALIDATE() validate()
+#define VALIDATE() Validate()
 #else
 #define VALIDATE()
 #endif
 
-static FILE *ba_trace_file = nullptr;
-
-void block_allocator::maybe_initialize_trace(void) {
-    const char *ba_trace_path = getenv("TOKU_BA_TRACE_PATH");        
-    if (ba_trace_path != nullptr) {
-        ba_trace_file = toku_os_fopen(ba_trace_path, "w");
-        if (ba_trace_file == nullptr) {
-            fprintf(stderr, "tokuft: error: block allocator trace path found in environment (%s), "
-                            "but it could not be opened for writing (errno %d)\n",
-                            ba_trace_path, get_maybe_error_errno());
-        } else {
-            fprintf(stderr, "tokuft: block allocator tracing enabled, path: %s\n", ba_trace_path);
-        }
-    }
-}
-
-void block_allocator::maybe_close_trace() {
-    if (ba_trace_file != nullptr) {
-        int r = toku_os_fclose(ba_trace_file);
-        if (r != 0) {
-            fprintf(stderr, "tokuft: error: block allocator trace file did not close properly (r %d, errno %d)\n",
-                            r, get_maybe_error_errno());
-        } else {
-            fprintf(stderr, "tokuft: block allocator tracing finished, file closed successfully\n");
-        }
-    }
-}
-
-void block_allocator::_create_internal(uint64_t reserve_at_beginning, uint64_t alignment) {
-    // the alignment must be at least 512 and aligned with 512 to work with direct I/O
-    assert(alignment >= 512 && (alignment % 512) == 0);
+void BlockAllocator::CreateInternal(uint64_t reserve_at_beginning,
+                                    uint64_t alignment) {
+    // the alignment must be at least 512 and aligned with 512 to work with
+    // direct I/O
+    invariant(alignment >= 512 && (alignment % 512) == 0);
 
     _reserve_at_beginning = reserve_at_beginning;
     _alignment = alignment;
     _n_blocks = 0;
-    _blocks_array_size = 1;
-    XMALLOC_N(_blocks_array_size, _blocks_array);
     _n_bytes_in_use = reserve_at_beginning;
-    _strategy = BA_STRATEGY_FIRST_FIT;
-
-    memset(&_trace_lock, 0, sizeof(toku_mutex_t));
-    toku_mutex_init(&_trace_lock, nullptr);
+    _tree = new MhsRbTree::Tree(alignment);
+}
 
+void BlockAllocator::Create(uint64_t reserve_at_beginning, uint64_t alignment) {
+    CreateInternal(reserve_at_beginning, alignment);
+    _tree->Insert({reserve_at_beginning, MAX_BYTE});
     VALIDATE();
 }
 
-void block_allocator::create(uint64_t reserve_at_beginning, uint64_t alignment) {
-    _create_internal(reserve_at_beginning, alignment);
-    _trace_create();
+void BlockAllocator::Destroy() {
+    delete _tree;
 }
 
-void block_allocator::destroy() {
-    toku_free(_blocks_array);
-    _trace_destroy();
-    toku_mutex_destroy(&_trace_lock);
-}
+void BlockAllocator::CreateFromBlockPairs(uint64_t reserve_at_beginning,
+                                          uint64_t alignment,
+                                          struct BlockPair *translation_pairs,
+                                          uint64_t n_blocks) {
+    CreateInternal(reserve_at_beginning, alignment);
+    _n_blocks = n_blocks;
 
-void block_allocator::set_strategy(enum allocation_strategy strategy) {
-    _strategy = strategy;
-}
+    struct BlockPair *XMALLOC_N(n_blocks, pairs);
+    memcpy(pairs, translation_pairs, n_blocks * sizeof(struct BlockPair));
+    std::sort(pairs, pairs + n_blocks);
 
-void block_allocator::grow_blocks_array_by(uint64_t n_to_add) {
-    if (_n_blocks + n_to_add > _blocks_array_size) {
-        uint64_t new_size = _n_blocks + n_to_add;
-        uint64_t at_least = _blocks_array_size * 2;
-        if (at_least > new_size) {
-            new_size = at_least;
-        }
-        _blocks_array_size = new_size;
-        XREALLOC_N(_blocks_array_size, _blocks_array);
+    if (pairs[0]._offset > reserve_at_beginning) {
+        _tree->Insert(
+            {reserve_at_beginning, pairs[0]._offset - reserve_at_beginning});
     }
-}
-
-void block_allocator::grow_blocks_array() {
-    grow_blocks_array_by(1);
-}
-
-void block_allocator::create_from_blockpairs(uint64_t reserve_at_beginning, uint64_t alignment,
-                                             struct blockpair *pairs, uint64_t n_blocks) {
-    _create_internal(reserve_at_beginning, alignment);
-
-    _n_blocks = n_blocks;
-    grow_blocks_array_by(_n_blocks);
-    memcpy(_blocks_array, pairs, _n_blocks * sizeof(struct blockpair));
-    std::sort(_blocks_array, _blocks_array + _n_blocks);
     for (uint64_t i = 0; i < _n_blocks; i++) {
-        // Allocator does not support size 0 blocks. See block_allocator_free_block.
-        invariant(_blocks_array[i].size > 0);
-        invariant(_blocks_array[i].offset >= _reserve_at_beginning);
-        invariant(_blocks_array[i].offset % _alignment == 0);
-
-        _n_bytes_in_use += _blocks_array[i].size;
+        // Allocator does not support size 0 blocks. See
+        // BlockAllocator::FreeBlock.
+        invariant(pairs[i]._size > 0);
+        invariant(pairs[i]._offset >= _reserve_at_beginning);
+        invariant(pairs[i]._offset % _alignment == 0);
+
+        _n_bytes_in_use += pairs[i]._size;
+
+        MhsRbTree::OUUInt64 free_size(MAX_BYTE);
+        MhsRbTree::OUUInt64 free_offset(pairs[i]._offset + pairs[i]._size);
+        if (i < n_blocks - 1) {
+            MhsRbTree::OUUInt64 next_offset(pairs[i + 1]._offset);
+            invariant(next_offset >= free_offset);
+            free_size = next_offset - free_offset;
+            if (free_size == 0)
+                continue;
+        }
+        _tree->Insert({free_offset, free_size});
     }
-
+    toku_free(pairs);
     VALIDATE();
-
-    _trace_create_from_blockpairs();
 }
 
 // Effect: align a value by rounding up.
-static inline uint64_t align(uint64_t value, uint64_t ba_alignment) {
+static inline uint64_t Align(uint64_t value, uint64_t ba_alignment) {
     return ((value + ba_alignment - 1) / ba_alignment) * ba_alignment;
 }
 
-struct block_allocator::blockpair *
-block_allocator::choose_block_to_alloc_after(size_t size, uint64_t heat) {
-    switch (_strategy) {
-    case BA_STRATEGY_FIRST_FIT:
-        return block_allocator_strategy::first_fit(_blocks_array, _n_blocks, size, _alignment);
-    case BA_STRATEGY_BEST_FIT:
-        return block_allocator_strategy::best_fit(_blocks_array, _n_blocks, size, _alignment);
-    case BA_STRATEGY_HEAT_ZONE:
-        return block_allocator_strategy::heat_zone(_blocks_array, _n_blocks, size, _alignment, heat);
-    case BA_STRATEGY_PADDED_FIT:
-        return block_allocator_strategy::padded_fit(_blocks_array, _n_blocks, size, _alignment);
-    default:
-        abort();
-    }
-}
-
-// Effect: Allocate a block. The resulting block must be aligned on the ba->alignment (which to make direct_io happy must be a positive multiple of 512).
-void block_allocator::alloc_block(uint64_t size, uint64_t heat, uint64_t *offset) {
-    struct blockpair *bp;
-
+// Effect: Allocate a block. The resulting block must be aligned on the
+// ba->alignment (which to make direct_io happy must be a positive multiple of
+// 512).
+void BlockAllocator::AllocBlock(uint64_t size,
+                                uint64_t *offset) {
     // Allocator does not support size 0 blocks. See block_allocator_free_block.
     invariant(size > 0);
 
-    grow_blocks_array();
     _n_bytes_in_use += size;
+    *offset = _tree->Remove(size);
 
-    uint64_t end_of_reserve = align(_reserve_at_beginning, _alignment);
-
-    if (_n_blocks == 0) {
-        // First and only block
-        assert(_n_bytes_in_use == _reserve_at_beginning + size); // we know exactly how many are in use
-        _blocks_array[0].offset = align(_reserve_at_beginning, _alignment);
-        _blocks_array[0].size = size;
-        *offset = _blocks_array[0].offset;
-        goto done;
-    } else if (end_of_reserve + size <= _blocks_array[0].offset ) {
-        // Check to see if the space immediately after the reserve is big enough to hold the new block.
-        bp = &_blocks_array[0];
-        memmove(bp + 1, bp, _n_blocks * sizeof(*bp));
-        bp[0].offset = end_of_reserve;
-        bp[0].size = size;
-        *offset = end_of_reserve;
-        goto done;
-    }
-
-    bp = choose_block_to_alloc_after(size, heat);
-    if (bp != nullptr) {
-        // our allocation strategy chose the space after `bp' to fit the new block
-        uint64_t answer_offset = align(bp->offset + bp->size, _alignment);
-        uint64_t blocknum = bp - _blocks_array;
-        invariant(&_blocks_array[blocknum] == bp);
-        invariant(blocknum < _n_blocks);
-        memmove(bp + 2, bp + 1, (_n_blocks - blocknum - 1) * sizeof(*bp));
-        bp[1].offset = answer_offset;
-        bp[1].size = size;
-        *offset = answer_offset;
-    } else {
-        // It didn't fit anywhere, so fit it on the end.
-        assert(_n_blocks < _blocks_array_size);
-        bp = &_blocks_array[_n_blocks];
-        uint64_t answer_offset = align(bp[-1].offset + bp[-1].size, _alignment);
-        bp->offset = answer_offset;
-        bp->size = size;
-        *offset = answer_offset;
-    }
-
-done:
     _n_blocks++;
     VALIDATE();
-
-    _trace_alloc(size, heat, *offset);
-}
-
-// Find the index in the blocks array that has a particular offset.  Requires that the block exist.
-// Use binary search so it runs fast.
-int64_t block_allocator::find_block(uint64_t offset) {
-    VALIDATE();
-    if (_n_blocks == 1) {
-        assert(_blocks_array[0].offset == offset);
-        return 0;
-    }
-
-    uint64_t lo = 0;
-    uint64_t hi = _n_blocks;
-    while (1) {
-        assert(lo < hi); // otherwise no such block exists.
-        uint64_t mid = (lo + hi) / 2;
-        uint64_t thisoff = _blocks_array[mid].offset;
-        if (thisoff < offset) {
-            lo = mid + 1;
-        } else if (thisoff > offset) {
-            hi = mid;
-        } else {
-            return mid;
-        }
-    }
 }
 
-// To support 0-sized blocks, we need to include size as an input to this function.
+// To support 0-sized blocks, we need to include size as an input to this
+// function.
 // All 0-sized blocks at the same offset can be considered identical, but
 // a 0-sized block can share offset with a non-zero sized block.
-// The non-zero sized block is not exchangable with a zero sized block (or vice versa),
-// so inserting 0-sized blocks can cause corruption here.
-void block_allocator::free_block(uint64_t offset) {
+// The non-zero sized block is not exchangeable with a zero sized block (or
+// vice versa), so inserting 0-sized blocks can cause corruption here.
+void BlockAllocator::FreeBlock(uint64_t offset, uint64_t size) {
     VALIDATE();
-    int64_t bn = find_block(offset);
-    assert(bn >= 0); // we require that there is a block with that offset.
-    _n_bytes_in_use -= _blocks_array[bn].size;
-    memmove(&_blocks_array[bn], &_blocks_array[bn + 1],
-            (_n_blocks - bn - 1) * sizeof(struct blockpair));
+    _n_bytes_in_use -= size;
+    _tree->Insert({offset, size});
     _n_blocks--;
     VALIDATE();
-    
-    _trace_free(offset);
-}
-
-uint64_t block_allocator::block_size(uint64_t offset) {
-    int64_t bn = find_block(offset);
-    assert(bn >=0); // we require that there is a block with that offset.
-    return _blocks_array[bn].size;
 }
 
-uint64_t block_allocator::allocated_limit() const {
-    if (_n_blocks == 0) {
-        return _reserve_at_beginning;
-    } else {
-        struct blockpair *last = &_blocks_array[_n_blocks - 1];
-        return last->offset + last->size;
-    }
+uint64_t BlockAllocator::AllocatedLimit() const {
+    MhsRbTree::Node *max_node = _tree->MaxNode();
+    return rbn_offset(max_node).ToInt();
 }
 
-// Effect: Consider the blocks in sorted order.  The reserved block at the beginning is number 0.  The next one is number 1 and so forth.
+// Effect: Consider the blocks in sorted order.  The reserved block at the
+// beginning is number 0.  The next one is number 1 and so forth.
 // Return the offset and size of the block with that number.
 // Return 0 if there is a block that big, return nonzero if b is too big.
-int block_allocator::get_nth_block_in_layout_order(uint64_t b, uint64_t *offset, uint64_t *size) {
-    if (b ==0 ) {
+int BlockAllocator::NthBlockInLayoutOrder(uint64_t b,
+                                          uint64_t *offset,
+                                          uint64_t *size) {
+    MhsRbTree::Node *x, *y;
+    if (b == 0) {
         *offset = 0;
         *size = _reserve_at_beginning;
-        return  0;
+        return 0;
     } else if (b > _n_blocks) {
         return -1;
     } else {
-        *offset =_blocks_array[b - 1].offset;
-        *size =_blocks_array[b - 1].size;
+        x = _tree->MinNode();
+        for (uint64_t i = 1; i <= b; i++) {
+            y = x;
+            x = _tree->Successor(x);
+        }
+        *size = (rbn_offset(x) - (rbn_offset(y) + rbn_size(y))).ToInt();
+        *offset = (rbn_offset(y) + rbn_size(y)).ToInt();
         return 0;
     }
 }
 
+struct VisUnusedExtra {
+    TOKU_DB_FRAGMENTATION _report;
+    uint64_t _align;
+};
+
+static void VisUnusedCollector(void *extra,
+                               MhsRbTree::Node *node,
+                               uint64_t UU(depth)) {
+    struct VisUnusedExtra *v_e = (struct VisUnusedExtra *)extra;
+    TOKU_DB_FRAGMENTATION report = v_e->_report;
+    uint64_t alignm = v_e->_align;
+
+    MhsRbTree::OUUInt64 offset = rbn_offset(node);
+    MhsRbTree::OUUInt64 size = rbn_size(node);
+    MhsRbTree::OUUInt64 answer_offset(Align(offset.ToInt(), alignm));
+    uint64_t free_space = (offset + size - answer_offset).ToInt();
+    if (free_space > 0) {
+        report->unused_bytes += free_space;
+        report->unused_blocks++;
+        if (free_space > report->largest_unused_block) {
+            report->largest_unused_block = free_space;
+        }
+    }
+}
 // Requires: report->file_size_bytes is filled in
 // Requires: report->data_bytes is filled in
 // Requires: report->checkpoint_bytes_additional is filled in
-void block_allocator::get_unused_statistics(TOKU_DB_FRAGMENTATION report) {
-    assert(_n_bytes_in_use == report->data_bytes + report->checkpoint_bytes_additional);
+void BlockAllocator::UnusedStatistics(TOKU_DB_FRAGMENTATION report) {
+    invariant(_n_bytes_in_use ==
+              report->data_bytes + report->checkpoint_bytes_additional);
 
     report->unused_bytes = 0;
     report->unused_blocks = 0;
     report->largest_unused_block = 0;
-    if (_n_blocks > 0) {
-        //Deal with space before block 0 and after reserve:
-        {
-            struct blockpair *bp = &_blocks_array[0];
-            assert(bp->offset >= align(_reserve_at_beginning, _alignment));
-            uint64_t free_space = bp->offset - align(_reserve_at_beginning, _alignment);
-            if (free_space > 0) {
-                report->unused_bytes += free_space;
-                report->unused_blocks++;
-                if (free_space > report->largest_unused_block) {
-                    report->largest_unused_block = free_space;
-                }
-            }
-        }
-
-        //Deal with space between blocks:
-        for (uint64_t blocknum = 0; blocknum +1 < _n_blocks; blocknum ++) {
-            // Consider the space after blocknum
-            struct blockpair *bp = &_blocks_array[blocknum];
-            uint64_t this_offset = bp[0].offset;
-            uint64_t this_size   = bp[0].size;
-            uint64_t end_of_this_block = align(this_offset+this_size, _alignment);
-            uint64_t next_offset = bp[1].offset;
-            uint64_t free_space  = next_offset - end_of_this_block;
-            if (free_space > 0) {
-                report->unused_bytes += free_space;
-                report->unused_blocks++;
-                if (free_space > report->largest_unused_block) {
-                    report->largest_unused_block = free_space;
-                }
-            }
-        }
-
-        //Deal with space after last block
-        {
-            struct blockpair *bp = &_blocks_array[_n_blocks-1];
-            uint64_t this_offset = bp[0].offset;
-            uint64_t this_size   = bp[0].size;
-            uint64_t end_of_this_block = align(this_offset+this_size, _alignment);
-            if (end_of_this_block < report->file_size_bytes) {
-                uint64_t free_space  = report->file_size_bytes - end_of_this_block;
-                assert(free_space > 0);
-                report->unused_bytes += free_space;
-                report->unused_blocks++;
-                if (free_space > report->largest_unused_block) {
-                    report->largest_unused_block = free_space;
-                }
-            }
-        }
-    } else {
-        // No blocks.  Just the reserve.
-        uint64_t end_of_this_block = align(_reserve_at_beginning, _alignment);
-        if (end_of_this_block < report->file_size_bytes) {
-            uint64_t free_space  = report->file_size_bytes - end_of_this_block;
-            assert(free_space > 0);
-            report->unused_bytes += free_space;
-            report->unused_blocks++;
-            if (free_space > report->largest_unused_block) {
-                report->largest_unused_block = free_space;
-            }
-        }
-    }
+    struct VisUnusedExtra extra = {report, _alignment};
+    _tree->InOrderVisitor(VisUnusedCollector, &extra);
 }
 
-void block_allocator::get_statistics(TOKU_DB_FRAGMENTATION report) {
-    report->data_bytes = _n_bytes_in_use; 
-    report->data_blocks = _n_blocks; 
+void BlockAllocator::Statistics(TOKU_DB_FRAGMENTATION report) {
+    report->data_bytes = _n_bytes_in_use;
+    report->data_blocks = _n_blocks;
     report->file_size_bytes = 0;
     report->checkpoint_bytes_additional = 0;
-    get_unused_statistics(report);
+    UnusedStatistics(report);
 }
 
-void block_allocator::validate() const {
-    uint64_t n_bytes_in_use = _reserve_at_beginning;
-    for (uint64_t i = 0; i < _n_blocks; i++) {
-        n_bytes_in_use += _blocks_array[i].size;
-        if (i > 0) {
-            assert(_blocks_array[i].offset >  _blocks_array[i - 1].offset);
-            assert(_blocks_array[i].offset >= _blocks_array[i - 1].offset + _blocks_array[i - 1].size );
-        }
-    }
-    assert(n_bytes_in_use == _n_bytes_in_use);
-}
-
-// Tracing
-
-void block_allocator::_trace_create(void) {
-    if (ba_trace_file != nullptr) {
-        toku_mutex_lock(&_trace_lock);
-        fprintf(ba_trace_file, "ba_trace_create %p %" PRIu64 " %" PRIu64 "\n",
-                this, _reserve_at_beginning, _alignment);
-        toku_mutex_unlock(&_trace_lock);
-
-        fflush(ba_trace_file);
-    }
-}
-
-void block_allocator::_trace_create_from_blockpairs(void) {
-    if (ba_trace_file != nullptr) {
-        toku_mutex_lock(&_trace_lock);
-        fprintf(ba_trace_file, "ba_trace_create_from_blockpairs %p %" PRIu64 " %" PRIu64 " ",
-                this, _reserve_at_beginning, _alignment);
-        for (uint64_t i = 0; i < _n_blocks; i++) {
-            fprintf(ba_trace_file, "[%" PRIu64 " %" PRIu64 "] ",
-                    _blocks_array[i].offset, _blocks_array[i].size);
-        }
-        fprintf(ba_trace_file, "\n");
-        toku_mutex_unlock(&_trace_lock);
-
-        fflush(ba_trace_file);
-    }
-}
-
-void block_allocator::_trace_destroy(void) {
-    if (ba_trace_file != nullptr) {
-        toku_mutex_lock(&_trace_lock);
-        fprintf(ba_trace_file, "ba_trace_destroy %p\n", this);
-        toku_mutex_unlock(&_trace_lock);
-
-        fflush(ba_trace_file);
-    }
-}
-
-void block_allocator::_trace_alloc(uint64_t size, uint64_t heat, uint64_t offset) {
-    if (ba_trace_file != nullptr) {
-        toku_mutex_lock(&_trace_lock);
-        fprintf(ba_trace_file, "ba_trace_alloc %p %" PRIu64 " %" PRIu64 " %" PRIu64 "\n",
-                this, size, heat, offset);
-        toku_mutex_unlock(&_trace_lock);
-
-        fflush(ba_trace_file);
+struct ValidateExtra {
+    uint64_t _bytes;
+    MhsRbTree::Node *_pre_node;
+};
+static void VisUsedBlocksInOrder(void *extra,
+                                 MhsRbTree::Node *cur_node,
+                                 uint64_t UU(depth)) {
+    struct ValidateExtra *v_e = (struct ValidateExtra *)extra;
+    MhsRbTree::Node *pre_node = v_e->_pre_node;
+    // verify no overlaps
+    if (pre_node) {
+        invariant(rbn_size(pre_node) > 0);
+        invariant(rbn_offset(cur_node) >
+                  rbn_offset(pre_node) + rbn_size(pre_node));
+        MhsRbTree::OUUInt64 used_space =
+            rbn_offset(cur_node) - (rbn_offset(pre_node) + rbn_size(pre_node));
+        v_e->_bytes += used_space.ToInt();
+    } else {
+        v_e->_bytes += rbn_offset(cur_node).ToInt();
     }
+    v_e->_pre_node = cur_node;
 }
 
-void block_allocator::_trace_free(uint64_t offset) {
-    if (ba_trace_file != nullptr) {
-        toku_mutex_lock(&_trace_lock);
-        fprintf(ba_trace_file, "ba_trace_free %p %" PRIu64 "\n", this, offset);
-        toku_mutex_unlock(&_trace_lock);
-
-        fflush(ba_trace_file);
-    }
+void BlockAllocator::Validate() const {
+    _tree->ValidateBalance();
+    _tree->ValidateMhs();
+    struct ValidateExtra extra = {0, nullptr};
+    _tree->InOrderVisitor(VisUsedBlocksInOrder, &extra);
+    invariant(extra._bytes == _n_bytes_in_use);
 }
diff --git a/storage/tokudb/PerconaFT/ft/serialize/block_allocator.h b/storage/tokudb/PerconaFT/ft/serialize/block_allocator.h
index 9b2c1553e7f..648ea9a9ef2 100644
--- a/storage/tokudb/PerconaFT/ft/serialize/block_allocator.h
+++ b/storage/tokudb/PerconaFT/ft/serialize/block_allocator.h
@@ -43,6 +43,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 #include "portability/toku_pthread.h"
 #include "portability/toku_stdint.h"
 #include "portability/toku_stdlib.h"
+#include "ft/serialize/rbtree_mhs.h"
 
 // Block allocator.
 //
@@ -51,151 +52,128 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 // The allocation of block numbers is handled elsewhere.
 //
 // When creating a block allocator we also specify a certain-sized
-// block at the beginning that is preallocated (and cannot be allocated or freed)
+// block at the beginning that is preallocated (and cannot be allocated or
+// freed)
 //
 // We can allocate blocks of a particular size at a particular location.
-// We can allocate blocks of a particular size at a location chosen by the allocator.
 // We can free blocks.
 // We can determine the size of a block.
-
-class block_allocator {
-public:
+#define MAX_BYTE 0xffffffffffffffff
+class BlockAllocator {
+   public:
     static const size_t BLOCK_ALLOCATOR_ALIGNMENT = 4096;
 
     // How much must be reserved at the beginning for the block?
-    //  The actual header is 8+4+4+8+8_4+8+ the length of the db names + 1 pointer for each root.
+    //  The actual header is 8+4+4+8+8_4+8+ the length of the db names + 1
+    //  pointer for each root.
     //  So 4096 should be enough.
     static const size_t BLOCK_ALLOCATOR_HEADER_RESERVE = 4096;
-    
-    static_assert(BLOCK_ALLOCATOR_HEADER_RESERVE % BLOCK_ALLOCATOR_ALIGNMENT == 0,
+
+    static_assert(BLOCK_ALLOCATOR_HEADER_RESERVE % BLOCK_ALLOCATOR_ALIGNMENT ==
+                      0,
                   "block allocator header must have proper alignment");
 
-    static const size_t BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE = BLOCK_ALLOCATOR_HEADER_RESERVE * 2;
+    static const size_t BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE =
+        BLOCK_ALLOCATOR_HEADER_RESERVE * 2;
 
-    enum allocation_strategy {
-        BA_STRATEGY_FIRST_FIT = 1,
-        BA_STRATEGY_BEST_FIT,
-        BA_STRATEGY_PADDED_FIT,
-        BA_STRATEGY_HEAT_ZONE
-    };
-
-    struct blockpair {
-        uint64_t offset;
-        uint64_t size;
-        blockpair(uint64_t o, uint64_t s) :
-            offset(o), size(s) {
-        }
-        int operator<(const struct blockpair &rhs) const {
-            return offset < rhs.offset;
-        }
-        int operator<(const uint64_t &o) const {
-            return offset < o;
+    struct BlockPair {
+        uint64_t _offset;
+        uint64_t _size;
+        BlockPair(uint64_t o, uint64_t s) : _offset(o), _size(s) {}
+        int operator<(const struct BlockPair &rhs) const {
+            return _offset < rhs._offset;
         }
+        int operator<(const uint64_t &o) const { return _offset < o; }
     };
 
-    // Effect: Create a block allocator, in which the first RESERVE_AT_BEGINNING bytes are not put into a block.
-    //         The default allocation strategy is first fit (BA_STRATEGY_FIRST_FIT)
+    // Effect: Create a block allocator, in which the first RESERVE_AT_BEGINNING
+    // bytes are not put into a block.
+    //         Everything past the reserve starts out as a single free
+    //         range (Create inserts {reserve_at_beginning, MAX_BYTE}).
     //  All blocks be start on a multiple of ALIGNMENT.
     //  Aborts if we run out of memory.
     // Parameters
-    //  reserve_at_beginning (IN)        Size of reserved block at beginning.  This size does not have to be aligned.
+    //  reserve_at_beginning (IN)        Size of reserved block at beginning.
+    //  This size does not have to be aligned.
     //  alignment (IN)                   Block alignment.
-    void create(uint64_t reserve_at_beginning, uint64_t alignment);
+    void Create(uint64_t reserve_at_beginning, uint64_t alignment);
 
-    // Effect: Create a block allocator, in which the first RESERVE_AT_BEGINNING bytes are not put into a block.
-    //         The default allocation strategy is first fit (BA_STRATEGY_FIRST_FIT)
-    //         The allocator is initialized to contain `n_blocks' of blockpairs, taken from `pairs'
+    // Effect: Create a block allocator, in which the first RESERVE_AT_BEGINNING
+    // bytes are not put into a block.
+    //         The allocator is initialized to contain `n_blocks' of BlockPairs,
+    //         taken from `pairs'
     //  All blocks be start on a multiple of ALIGNMENT.
     //  Aborts if we run out of memory.
     // Parameters
     //  pairs,                           unowned array of pairs to copy
     //  n_blocks,                        Size of pairs array
-    //  reserve_at_beginning (IN)        Size of reserved block at beginning.  This size does not have to be aligned.
+    //  reserve_at_beginning (IN)        Size of reserved block at beginning.
+    //  This size does not have to be aligned.
     //  alignment (IN)                   Block alignment.
-    void create_from_blockpairs(uint64_t reserve_at_beginning, uint64_t alignment,
-                                struct blockpair *pairs, uint64_t n_blocks);
+    void CreateFromBlockPairs(uint64_t reserve_at_beginning,
+                              uint64_t alignment,
+                              struct BlockPair *pairs,
+                              uint64_t n_blocks);
 
     // Effect: Destroy this block allocator
-    void destroy();
-
-    // Effect: Set the allocation strategy that the allocator should use
-    // Requires: No other threads are operating on this block allocator
-    void set_strategy(enum allocation_strategy strategy);
+    void Destroy();
 
-    // Effect: Allocate a block of the specified size at an address chosen by the allocator.
+    // Effect: Allocate a block of the specified size at an address chosen by
+    // the allocator.
     //  Aborts if anything goes wrong.
     //  The block address will be a multiple of the alignment.
     // Parameters:
-    //  size (IN):    The size of the block.  (The size does not have to be aligned.)
+    //  size (IN):    The size of the block.  (The size does not have to be
+    //  aligned.)
     //  offset (OUT): The location of the block.
-    //  heat (IN):    A higher heat means we should be prepared to free this block soon (perhaps in the next checkpoint)
-    //                Heat values are lexiographically ordered (like integers), but their specific values are arbitrary
-    void alloc_block(uint64_t size, uint64_t heat, uint64_t *offset);
+    // Requires: size > 0 (enforced by an invariant in the implementation).
+    //  On return the allocator counts one more block and `size' more
+    //  bytes in use.
+    void AllocBlock(uint64_t size, uint64_t *offset);
 
     // Effect: Free the block at offset.
     // Requires: There must be a block currently allocated at that offset.
     // Parameters:
     //  offset (IN): The offset of the block.
-    void free_block(uint64_t offset);
+    void FreeBlock(uint64_t offset, uint64_t size);
 
-    // Effect: Return the size of the block that starts at offset.
-    // Requires: There must be a block currently allocated at that offset.
-    // Parameters:
-    //  offset (IN): The offset of the block.
-    uint64_t block_size(uint64_t offset);
-
-    // Effect: Check to see if the block allocator is OK.  This may take a long time.
+    // Effect: Check to see if the block allocator is OK.  This may take a long
+    // time.
     // Usage Hints: Probably only use this for unit tests.
     // TODO: Private?
-    void validate() const;
+    void Validate() const;
 
     // Effect: Return the unallocated block address of "infinite" size.
-    //  That is, return the smallest address that is above all the allocated blocks.
-    uint64_t allocated_limit() const;
+    //  That is, return the smallest address that is above all the allocated
+    //  blocks.
+    uint64_t AllocatedLimit() const;
 
-    // Effect: Consider the blocks in sorted order.  The reserved block at the beginning is number 0.  The next one is number 1 and so forth.
+    // Effect: Consider the blocks in sorted order.  The reserved block at the
+    // beginning is number 0.  The next one is number 1 and so forth.
     //  Return the offset and size of the block with that number.
     //  Return 0 if there is a block that big, return nonzero if b is too big.
     // Rationale: This is probably useful only for tests.
-    int get_nth_block_in_layout_order(uint64_t b, uint64_t *offset, uint64_t *size);
+    int NthBlockInLayoutOrder(uint64_t b, uint64_t *offset, uint64_t *size);
 
     // Effect:  Fill in report to indicate how the file is used.
-    // Requires: 
+    // Requires:
     //  report->file_size_bytes is filled in
     //  report->data_bytes is filled in
     //  report->checkpoint_bytes_additional is filled in
-    void get_unused_statistics(TOKU_DB_FRAGMENTATION report);
+    void UnusedStatistics(TOKU_DB_FRAGMENTATION report);
 
     // Effect: Fill in report->data_bytes with the number of bytes in use
-    //         Fill in report->data_blocks with the number of blockpairs in use
+    //         Fill in report->data_blocks with the number of BlockPairs in use
     //         Fill in unused statistics using this->get_unused_statistics()
     // Requires:
     //  report->file_size is ignored on return
     //  report->checkpoint_bytes_additional is ignored on return
-    void get_statistics(TOKU_DB_FRAGMENTATION report);
-
-    // Block allocator tracing.
-    // - Enabled by setting TOKU_BA_TRACE_PATH to the file that the trace file
-    //   should be written to.
-    // - Trace may be replayed by ba_trace_replay tool in tools/ directory
-    //   eg: "cat mytracefile | ba_trace_replay"
-    static void maybe_initialize_trace();
-    static void maybe_close_trace();
-
-private:
-    void _create_internal(uint64_t reserve_at_beginning, uint64_t alignment);
-    void grow_blocks_array_by(uint64_t n_to_add);
-    void grow_blocks_array();
-    int64_t find_block(uint64_t offset);
-    struct blockpair *choose_block_to_alloc_after(size_t size, uint64_t heat);
-
-    // Tracing
-    toku_mutex_t _trace_lock;
-    void _trace_create(void);
-    void _trace_create_from_blockpairs(void);
-    void _trace_destroy(void);
-    void _trace_alloc(uint64_t size, uint64_t heat, uint64_t offset);
-    void _trace_free(uint64_t offset);
+    void Statistics(TOKU_DB_FRAGMENTATION report);
+
+    virtual ~BlockAllocator(){};
+
+   private:
+    void CreateInternal(uint64_t reserve_at_beginning, uint64_t alignment);
 
     // How much to reserve at the beginning
     uint64_t _reserve_at_beginning;
@@ -203,12 +181,8 @@ private:
     uint64_t _alignment;
     // How many blocks
     uint64_t _n_blocks;
-    // How big is the blocks_array.  Must be >= n_blocks.
-    uint64_t _blocks_array_size;
-    // These blocks are sorted by address.
-    struct blockpair *_blocks_array;
-    // Including the reserve_at_beginning
     uint64_t _n_bytes_in_use;
-    // The allocation strategy are we using
-    enum allocation_strategy _strategy;
+
+    // These blocks are sorted by address.
+    MhsRbTree::Tree *_tree;
 };
diff --git a/storage/tokudb/PerconaFT/ft/serialize/block_allocator_strategy.cc b/storage/tokudb/PerconaFT/ft/serialize/block_allocator_strategy.cc
deleted file mode 100644
index 62bb8fc4a87..00000000000
--- a/storage/tokudb/PerconaFT/ft/serialize/block_allocator_strategy.cc
+++ /dev/null
@@ -1,224 +0,0 @@
-/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
-// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
-#ident "$Id$"
-/*======
-This file is part of PerconaFT.
-
-
-Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
-
-    PerconaFT is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License, version 2,
-    as published by the Free Software Foundation.
-
-    PerconaFT is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
-
-----------------------------------------
-
-    PerconaFT is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Affero General Public License, version 3,
-    as published by the Free Software Foundation.
-
-    PerconaFT is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Affero General Public License for more details.
-
-    You should have received a copy of the GNU Affero General Public License
-    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
-======= */
-
-#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
-
-#include <algorithm>
-
-#include <string.h>
-
-#include "portability/toku_assert.h"
-
-#include "ft/serialize/block_allocator_strategy.h"
-
-static uint64_t _align(uint64_t value, uint64_t ba_alignment) {
-    return ((value + ba_alignment - 1) / ba_alignment) * ba_alignment;
-}
-
-static uint64_t _roundup_to_power_of_two(uint64_t value) {
-    uint64_t r = 4096;
-    while (r < value) {
-        r *= 2;
-        invariant(r > 0);
-    }
-    return r;
-}
-
-// First fit block allocation
-static struct block_allocator::blockpair *
-_first_fit(struct block_allocator::blockpair *blocks_array,
-           uint64_t n_blocks, uint64_t size, uint64_t alignment,
-           uint64_t max_padding) {
-    if (n_blocks == 1) {
-        // won't enter loop, can't underflow the direction < 0 case
-        return nullptr;
-    }
-
-    struct block_allocator::blockpair *bp = &blocks_array[0];
-    for (uint64_t n_spaces_to_check = n_blocks - 1; n_spaces_to_check > 0;
-         n_spaces_to_check--, bp++) {
-        // Consider the space after bp
-        uint64_t padded_alignment = max_padding != 0 ? _align(max_padding, alignment) : alignment;
-        uint64_t possible_offset = _align(bp->offset + bp->size, padded_alignment);
-        if (possible_offset + size <= bp[1].offset) { // bp[1] is always valid since bp < &blocks_array[n_blocks-1]
-            invariant(bp - blocks_array < (int64_t) n_blocks);
-            return bp;
-        }
-    }
-    return nullptr;
-}
-
-static struct block_allocator::blockpair *
-_first_fit_bw(struct block_allocator::blockpair *blocks_array,
-           uint64_t n_blocks, uint64_t size, uint64_t alignment,
-           uint64_t max_padding, struct block_allocator::blockpair *blocks_array_limit) {
-    if (n_blocks == 1) {
-        // won't enter loop, can't underflow the direction < 0 case
-        return nullptr;
-    }
-
-    struct block_allocator::blockpair *bp = &blocks_array[-1];
-    for (uint64_t n_spaces_to_check = n_blocks - 1; n_spaces_to_check > 0;
-         n_spaces_to_check--, bp--) {
-        // Consider the space after bp
-        uint64_t padded_alignment = max_padding != 0 ? _align(max_padding, alignment) : alignment;
-        uint64_t possible_offset = _align(bp->offset + bp->size, padded_alignment);
-        if (&bp[1] < blocks_array_limit && possible_offset + size <= bp[1].offset) {
-            invariant(blocks_array - bp < (int64_t) n_blocks);
-            return bp;
-        }
-    }
-    return nullptr;
-}
-
-struct block_allocator::blockpair *
-block_allocator_strategy::first_fit(struct block_allocator::blockpair *blocks_array,
-                                    uint64_t n_blocks, uint64_t size, uint64_t alignment) {
-    return _first_fit(blocks_array, n_blocks, size, alignment, 0);
-}
-
-// Best fit block allocation
-struct block_allocator::blockpair *
-block_allocator_strategy::best_fit(struct block_allocator::blockpair *blocks_array,
-                                   uint64_t n_blocks, uint64_t size, uint64_t alignment) {
-    struct block_allocator::blockpair *best_bp = nullptr;
-    uint64_t best_hole_size = 0;
-    for (uint64_t blocknum = 0; blocknum + 1 < n_blocks; blocknum++) {
-        // Consider the space after blocknum
-        struct block_allocator::blockpair *bp = &blocks_array[blocknum];
-        uint64_t possible_offset = _align(bp->offset + bp->size, alignment);
-        uint64_t possible_end_offset = possible_offset + size;
-        if (possible_end_offset <= bp[1].offset) {
-            // It fits here. Is it the best fit?
-            uint64_t hole_size = bp[1].offset - possible_end_offset;
-            if (best_bp == nullptr || hole_size < best_hole_size) {
-                best_hole_size = hole_size;
-                best_bp = bp;
-            }
-        }
-    }
-    return best_bp;
-}
-
-static uint64_t padded_fit_alignment = 4096;
-
-// TODO: These compiler specific directives should be abstracted in a portability header
-//       portability/toku_compiler.h?
-__attribute__((__constructor__))
-static void determine_padded_fit_alignment_from_env(void) {
-    // TODO: Should be in portability as 'toku_os_getenv()?'
-    const char *s = getenv("TOKU_BA_PADDED_FIT_ALIGNMENT");
-    if (s != nullptr && strlen(s) > 0) {
-        const int64_t alignment = strtoll(s, nullptr, 10);
-        if (alignment <= 0) {
-            fprintf(stderr, "tokuft: error: block allocator padded fit alignment found in environment (%s), "
-                            "but it's out of range (should be an integer > 0). defaulting to %" PRIu64 "\n",
-                            s, padded_fit_alignment);
-        } else {
-            padded_fit_alignment = _roundup_to_power_of_two(alignment);
-            fprintf(stderr, "tokuft: setting block allocator padded fit alignment to %" PRIu64 "\n",
-                    padded_fit_alignment);
-        }
-    }
-}
-
-// First fit into a block that is oversized by up to max_padding.
-// The hope is that if we purposefully waste a bit of space at allocation
-// time we'll be more likely to reuse this block later.
-struct block_allocator::blockpair *
-block_allocator_strategy::padded_fit(struct block_allocator::blockpair *blocks_array,
-                                     uint64_t n_blocks, uint64_t size, uint64_t alignment) {
-    return _first_fit(blocks_array, n_blocks, size, alignment, padded_fit_alignment);
-}
-
-static double hot_zone_threshold = 0.85;
-
-// TODO: These compiler specific directives should be abstracted in a portability header
-//       portability/toku_compiler.h?
-__attribute__((__constructor__))
-static void determine_hot_zone_threshold_from_env(void) {
-    // TODO: Should be in portability as 'toku_os_getenv()?'
-    const char *s = getenv("TOKU_BA_HOT_ZONE_THRESHOLD");
-    if (s != nullptr && strlen(s) > 0) {
-        const double hot_zone = strtod(s, nullptr);
-        if (hot_zone < 1 || hot_zone > 99) {
-            fprintf(stderr, "tokuft: error: block allocator hot zone threshold found in environment (%s), "
-                            "but it's out of range (should be an integer 1 through 99). defaulting to 85\n", s);
-            hot_zone_threshold = 85 / 100;
-        } else {
-            fprintf(stderr, "tokuft: setting block allocator hot zone threshold to %s\n", s);
-            hot_zone_threshold = hot_zone / 100;
-        }
-    }
-}
-
-struct block_allocator::blockpair *
-block_allocator_strategy::heat_zone(struct block_allocator::blockpair *blocks_array,
-                                    uint64_t n_blocks, uint64_t size, uint64_t alignment,
-                                    uint64_t heat) {
-    if (heat > 0) {
-        struct block_allocator::blockpair *bp, *boundary_bp;
-
-        // Hot allocation. Find the beginning of the hot zone.
-        boundary_bp = &blocks_array[n_blocks - 1];
-        uint64_t highest_offset = _align(boundary_bp->offset + boundary_bp->size, alignment);
-        uint64_t hot_zone_offset = static_cast<uint64_t>(hot_zone_threshold * highest_offset);
-
-        boundary_bp = std::lower_bound(blocks_array, blocks_array + n_blocks, hot_zone_offset);
-        uint64_t blocks_in_zone = (blocks_array + n_blocks) - boundary_bp;
-        uint64_t blocks_outside_zone = boundary_bp - blocks_array;
-        invariant(blocks_in_zone + blocks_outside_zone == n_blocks);
-
-        if (blocks_in_zone > 0) {
-            // Find the first fit in the hot zone, going forward.
-            bp = _first_fit(boundary_bp, blocks_in_zone, size, alignment, 0);
-            if (bp != nullptr) {
-                return bp;
-            }
-        }
-        if (blocks_outside_zone > 0) {
-            // Find the first fit in the cold zone, going backwards.
-            bp = _first_fit_bw(boundary_bp, blocks_outside_zone, size, alignment, 0, &blocks_array[n_blocks]);
-            if (bp != nullptr) {
-                return bp;
-            }
-        }
-    } else {
-        // Cold allocations are simply first-fit from the beginning.
-        return _first_fit(blocks_array, n_blocks, size, alignment, 0);
-    }
-    return nullptr;
-}
diff --git a/storage/tokudb/PerconaFT/ft/serialize/block_table.cc b/storage/tokudb/PerconaFT/ft/serialize/block_table.cc
index ec5a1140310..12700d9d83e 100644
--- a/storage/tokudb/PerconaFT/ft/serialize/block_table.cc
+++ b/storage/tokudb/PerconaFT/ft/serialize/block_table.cc
@@ -47,31 +47,27 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 #include "ft/ft-internal.h"
 
 // TODO: reorganize this dependency (FT-303)
-#include "ft/ft-ops.h" // for toku_maybe_truncate_file
+#include "ft/ft-ops.h"  // for toku_maybe_truncate_file
 #include "ft/serialize/block_table.h"
 #include "ft/serialize/rbuf.h"
 #include "ft/serialize/wbuf.h"
 #include "ft/serialize/block_allocator.h"
-
 #include "util/nb_mutex.h"
 #include "util/scoped_malloc.h"
 
 // indicates the end of a freelist
-static const BLOCKNUM freelist_null = { -1 };
+static const BLOCKNUM freelist_null = {-1};
 
 // value of block_translation_pair.size if blocknum is unused
-static const DISKOFF size_is_free = (DISKOFF) -1;
+static const DISKOFF size_is_free = (DISKOFF)-1;
 
-// value of block_translation_pair.u.diskoff if blocknum is used but does not yet have a diskblock
-static const DISKOFF diskoff_unused = (DISKOFF) -2;
+// value of block_translation_pair.u.diskoff if blocknum is used but does not
+// yet have a diskblock
+static const DISKOFF diskoff_unused = (DISKOFF)-2;
 
-void block_table::_mutex_lock() {
-    toku_mutex_lock(&_mutex);
-}
+void block_table::_mutex_lock() { toku_mutex_lock(&_mutex); }
 
-void block_table::_mutex_unlock() {
-    toku_mutex_unlock(&_mutex);
-}
+void block_table::_mutex_unlock() { toku_mutex_unlock(&_mutex); }
 
 // TODO: Move lock to FT
 void toku_ft_lock(FT ft) {
@@ -86,13 +82,16 @@ void toku_ft_unlock(FT ft) {
     bt->_mutex_unlock();
 }
 
-// There are two headers: the reserve must fit them both and be suitably aligned.
-static_assert(block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE %
-              block_allocator::BLOCK_ALLOCATOR_ALIGNMENT == 0,
+// There are two headers: the reserve must fit them both and be suitably
+// aligned.
+static_assert(BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE %
+                      BlockAllocator::BLOCK_ALLOCATOR_ALIGNMENT ==
+                  0,
               "Block allocator's header reserve must be suitibly aligned");
-static_assert(block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE * 2 ==
-              block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE,
-              "Block allocator's total header reserve must exactly fit two headers");
+static_assert(
+    BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE * 2 ==
+        BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE,
+    "Block allocator's total header reserve must exactly fit two headers");
 
 // does NOT initialize the block allocator: the caller is responsible
 void block_table::_create_internal() {
@@ -100,25 +99,30 @@ void block_table::_create_internal() {
     memset(&_inprogress, 0, sizeof(struct translation));
     memset(&_checkpointed, 0, sizeof(struct translation));
     memset(&_mutex, 0, sizeof(_mutex));
+    _bt_block_allocator = new BlockAllocator();
     toku_mutex_init(&_mutex, nullptr);
     nb_mutex_init(&_safe_file_size_lock);
 }
 
-// Fill in the checkpointed translation from buffer, and copy checkpointed to current.
-// The one read from disk is the last known checkpointed one, so we are keeping it in 
-// place and then setting current (which is never stored on disk) for current use.
-// The translation_buffer has translation only, we create the rest of the block_table.
-int block_table::create_from_buffer(int fd,
-                                    DISKOFF location_on_disk, //Location of translation_buffer
-                                    DISKOFF size_on_disk,
-                                    unsigned char *translation_buffer) {
+// Fill in the checkpointed translation from buffer, and copy checkpointed to
+// current.
+//
+// The one read from disk is the last known checkpointed one, so we keep it in
+// place and then set current (which is never stored on disk) for current use.
+//
+// The translation_buffer has the translation only; we create the rest of the
+// block_table.
+int block_table::create_from_buffer(
+    int fd,
+    DISKOFF location_on_disk,  // Location of translation_buffer
+    DISKOFF size_on_disk,
+    unsigned char *translation_buffer) {
     // Does not initialize the block allocator
     _create_internal();
 
     // Deserialize the translation and copy it to current
-    int r = _translation_deserialize_from_buffer(&_checkpointed,
-                                                 location_on_disk, size_on_disk,
-                                                 translation_buffer);
+    int r = _translation_deserialize_from_buffer(
+        &_checkpointed, location_on_disk, size_on_disk, translation_buffer);
     if (r != 0) {
         return r;
     }
@@ -131,22 +135,26 @@ int block_table::create_from_buffer(int fd,
     invariant(file_size >= 0);
     _safe_file_size = file_size;
 
-    // Gather the non-empty translations and use them to create the block allocator
+    // Gather the non-empty translations and use them to create the block
+    // allocator
     toku::scoped_malloc pairs_buf(_checkpointed.smallest_never_used_blocknum.b *
-                                  sizeof(struct block_allocator::blockpair));
-    struct block_allocator::blockpair *CAST_FROM_VOIDP(pairs, pairs_buf.get());
+                                  sizeof(struct BlockAllocator::BlockPair));
+    struct BlockAllocator::BlockPair *CAST_FROM_VOIDP(pairs, pairs_buf.get());
     uint64_t n_pairs = 0;
     for (int64_t i = 0; i < _checkpointed.smallest_never_used_blocknum.b; i++) {
         struct block_translation_pair pair = _checkpointed.block_translation[i];
         if (pair.size > 0) {
             invariant(pair.u.diskoff != diskoff_unused);
-            pairs[n_pairs++] = block_allocator::blockpair(pair.u.diskoff, pair.size);
+            pairs[n_pairs++] =
+                BlockAllocator::BlockPair(pair.u.diskoff, pair.size);
         }
     }
 
-    _bt_block_allocator.create_from_blockpairs(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE,
-                                               block_allocator::BLOCK_ALLOCATOR_ALIGNMENT,
-                                               pairs, n_pairs);
+    _bt_block_allocator->CreateFromBlockPairs(
+        BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE,
+        BlockAllocator::BLOCK_ALLOCATOR_ALIGNMENT,
+        pairs,
+        n_pairs);
 
     return 0;
 }
@@ -156,8 +164,10 @@ void block_table::create() {
     _create_internal();
 
     _checkpointed.type = TRANSLATION_CHECKPOINTED;
-    _checkpointed.smallest_never_used_blocknum = make_blocknum(RESERVED_BLOCKNUMS);
-    _checkpointed.length_of_array = _checkpointed.smallest_never_used_blocknum.b;
+    _checkpointed.smallest_never_used_blocknum =
+        make_blocknum(RESERVED_BLOCKNUMS);
+    _checkpointed.length_of_array =
+        _checkpointed.smallest_never_used_blocknum.b;
     _checkpointed.blocknum_freelist_head = freelist_null;
     XMALLOC_N(_checkpointed.length_of_array, _checkpointed.block_translation);
     for (int64_t i = 0; i < _checkpointed.length_of_array; i++) {
@@ -165,12 +175,13 @@ void block_table::create() {
         _checkpointed.block_translation[i].u.diskoff = diskoff_unused;
     }
 
-    // we just created a default checkpointed, now copy it to current.  
+    // we just created a default checkpointed, now copy it to current.
     _copy_translation(&_current, &_checkpointed, TRANSLATION_CURRENT);
 
     // Create an empty block allocator.
-    _bt_block_allocator.create(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE,
-                               block_allocator::BLOCK_ALLOCATOR_ALIGNMENT);
+    _bt_block_allocator->Create(
+        BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE,
+        BlockAllocator::BLOCK_ALLOCATOR_ALIGNMENT);
 }
 
 // TODO: Refactor with FT-303
@@ -186,20 +197,24 @@ static void ft_set_dirty(FT ft, bool for_checkpoint) {
 
 void block_table::_maybe_truncate_file(int fd, uint64_t size_needed_before) {
     toku_mutex_assert_locked(&_mutex);
-    uint64_t new_size_needed = _bt_block_allocator.allocated_limit();
-    //Save a call to toku_os_get_file_size (kernel call) if unlikely to be useful.
-    if (new_size_needed < size_needed_before && new_size_needed < _safe_file_size) {
+    uint64_t new_size_needed = _bt_block_allocator->AllocatedLimit();
+    // Save a call to toku_os_get_file_size (kernel call) if unlikely to be
+    // useful.
+    if (new_size_needed < size_needed_before &&
+        new_size_needed < _safe_file_size) {
         nb_mutex_lock(&_safe_file_size_lock, &_mutex);
 
         // Must hold _safe_file_size_lock to change _safe_file_size.
         if (new_size_needed < _safe_file_size) {
             int64_t safe_file_size_before = _safe_file_size;
-            // Not safe to use the 'to-be-truncated' portion until truncate is done.
+            // Not safe to use the 'to-be-truncated' portion until truncate is
+            // done.
             _safe_file_size = new_size_needed;
             _mutex_unlock();
 
             uint64_t size_after;
-            toku_maybe_truncate_file(fd, new_size_needed, safe_file_size_before, &size_after);
+            toku_maybe_truncate_file(
+                fd, new_size_needed, safe_file_size_before, &size_after);
             _mutex_lock();
 
             _safe_file_size = size_after;
@@ -214,26 +229,35 @@ void block_table::maybe_truncate_file_on_open(int fd) {
     _mutex_unlock();
 }
 
-void block_table::_copy_translation(struct translation *dst, struct translation *src, enum translation_type newtype) {
-    // We intend to malloc a fresh block, so the incoming translation should be empty
+void block_table::_copy_translation(struct translation *dst,
+                                    struct translation *src,
+                                    enum translation_type newtype) {
+    // We intend to malloc a fresh block, so the incoming translation should be
+    // empty
     invariant_null(dst->block_translation);
 
     invariant(src->length_of_array >= src->smallest_never_used_blocknum.b);
     invariant(newtype == TRANSLATION_DEBUG ||
-              (src->type == TRANSLATION_CURRENT && newtype == TRANSLATION_INPROGRESS) ||
-              (src->type == TRANSLATION_CHECKPOINTED && newtype == TRANSLATION_CURRENT));
+              (src->type == TRANSLATION_CURRENT &&
+               newtype == TRANSLATION_INPROGRESS) ||
+              (src->type == TRANSLATION_CHECKPOINTED &&
+               newtype == TRANSLATION_CURRENT));
     dst->type = newtype;
     dst->smallest_never_used_blocknum = src->smallest_never_used_blocknum;
-    dst->blocknum_freelist_head = src->blocknum_freelist_head; 
+    dst->blocknum_freelist_head = src->blocknum_freelist_head;
 
-    // destination btt is of fixed size. Allocate + memcpy the exact length necessary.
+    // destination btt is of fixed size. Allocate + memcpy the exact length
+    // necessary.
     dst->length_of_array = dst->smallest_never_used_blocknum.b;
     XMALLOC_N(dst->length_of_array, dst->block_translation);
-    memcpy(dst->block_translation, src->block_translation, dst->length_of_array * sizeof(*dst->block_translation));
+    memcpy(dst->block_translation,
+           src->block_translation,
+           dst->length_of_array * sizeof(*dst->block_translation));
 
     // New version of btt is not yet stored on disk.
     dst->block_translation[RESERVED_BLOCKNUM_TRANSLATION].size = 0;
-    dst->block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff = diskoff_unused;
+    dst->block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff =
+        diskoff_unused;
 }
 
 int64_t block_table::get_blocks_in_use_unlocked() {
@@ -241,8 +265,9 @@ int64_t block_table::get_blocks_in_use_unlocked() {
     struct translation *t = &_current;
     int64_t num_blocks = 0;
     {
-        //Reserved blocknums do not get upgraded; They are part of the header.
-        for (b.b = RESERVED_BLOCKNUMS; b.b < t->smallest_never_used_blocknum.b; b.b++) {
+        // Reserved blocknums do not get upgraded; They are part of the header.
+        for (b.b = RESERVED_BLOCKNUMS; b.b < t->smallest_never_used_blocknum.b;
+             b.b++) {
             if (t->block_translation[b.b].size != size_is_free) {
                 num_blocks++;
             }
@@ -252,38 +277,43 @@ int64_t block_table::get_blocks_in_use_unlocked() {
 }
 
 void block_table::_maybe_optimize_translation(struct translation *t) {
-    //Reduce 'smallest_never_used_blocknum.b' (completely free blocknums instead of just
-    //on a free list.  Doing so requires us to regenerate the free list.
-    //This is O(n) work, so do it only if you're already doing that.
+    // Reduce 'smallest_never_used_blocknum.b' (completely free blocknums
+    // instead of just keeping them on a free list).  Doing so requires us to
+    // regenerate the free list.  This is O(n) work, so do it only if you're
+    // already doing that.
 
     BLOCKNUM b;
     paranoid_invariant(t->smallest_never_used_blocknum.b >= RESERVED_BLOCKNUMS);
-    //Calculate how large the free suffix is.
+    // Calculate how large the free suffix is.
     int64_t freed;
     {
-        for (b.b = t->smallest_never_used_blocknum.b; b.b > RESERVED_BLOCKNUMS; b.b--) {
-            if (t->block_translation[b.b-1].size != size_is_free) {
+        for (b.b = t->smallest_never_used_blocknum.b; b.b > RESERVED_BLOCKNUMS;
+             b.b--) {
+            if (t->block_translation[b.b - 1].size != size_is_free) {
                 break;
             }
         }
         freed = t->smallest_never_used_blocknum.b - b.b;
     }
-    if (freed>0) {
+    if (freed > 0) {
         t->smallest_never_used_blocknum.b = b.b;
-        if (t->length_of_array/4 > t->smallest_never_used_blocknum.b) {
-            //We're using more memory than necessary to represent this now.  Reduce.
+        if (t->length_of_array / 4 > t->smallest_never_used_blocknum.b) {
+            // We're using more memory than necessary to represent this now.
+            // Reduce.
             uint64_t new_length = t->smallest_never_used_blocknum.b * 2;
             XREALLOC_N(new_length, t->block_translation);
             t->length_of_array = new_length;
-            //No need to zero anything out. 
+            // No need to zero anything out.
         }
 
-        //Regenerate free list.
+        // Regenerate free list.
         t->blocknum_freelist_head.b = freelist_null.b;
-        for (b.b = RESERVED_BLOCKNUMS; b.b < t->smallest_never_used_blocknum.b; b.b++) {
+        for (b.b = RESERVED_BLOCKNUMS; b.b < t->smallest_never_used_blocknum.b;
+             b.b++) {
             if (t->block_translation[b.b].size == size_is_free) {
-                t->block_translation[b.b].u.next_free_blocknum = t->blocknum_freelist_head;
-                t->blocknum_freelist_head                      = b;
+                t->block_translation[b.b].u.next_free_blocknum =
+                    t->blocknum_freelist_head;
+                t->blocknum_freelist_head = b;
             }
         }
     }
@@ -304,14 +334,16 @@ void block_table::note_start_checkpoint_unlocked() {
 }
 
 void block_table::note_skipped_checkpoint() {
-    //Purpose, alert block translation that the checkpoint was skipped, e.x. for a non-dirty header
+    // Purpose: alert block translation that the checkpoint was skipped, e.g.
+    // for a non-dirty header
     _mutex_lock();
     paranoid_invariant_notnull(_inprogress.block_translation);
     _checkpoint_skipped = true;
     _mutex_unlock();
 }
 
-// Purpose: free any disk space used by previous checkpoint that isn't in use by either
+// Purpose: free any disk space used by the previous checkpoint that isn't in
+// use by either
 //           - current state
 //           - in-progress checkpoint
 //          capture inprogress as new checkpointed.
@@ -324,7 +356,7 @@ void block_table::note_skipped_checkpoint() {
 void block_table::note_end_checkpoint(int fd) {
     // Free unused blocks
     _mutex_lock();
-    uint64_t allocated_limit_at_start = _bt_block_allocator.allocated_limit();
+    uint64_t allocated_limit_at_start = _bt_block_allocator->AllocatedLimit();
     paranoid_invariant_notnull(_inprogress.block_translation);
     if (_checkpoint_skipped) {
         toku_free(_inprogress.block_translation);
@@ -332,17 +364,23 @@ void block_table::note_end_checkpoint(int fd) {
         goto end;
     }
 
-    //Make certain inprogress was allocated space on disk
-    assert(_inprogress.block_translation[RESERVED_BLOCKNUM_TRANSLATION].size > 0);
-    assert(_inprogress.block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff > 0);
+    // Make certain inprogress was allocated space on disk
+    invariant(
+        _inprogress.block_translation[RESERVED_BLOCKNUM_TRANSLATION].size > 0);
+    invariant(
+        _inprogress.block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff >
+        0);
 
     {
         struct translation *t = &_checkpointed;
         for (int64_t i = 0; i < t->length_of_array; i++) {
             struct block_translation_pair *pair = &t->block_translation[i];
-            if (pair->size > 0 && !_translation_prevents_freeing(&_inprogress, make_blocknum(i), pair)) {
-                assert(!_translation_prevents_freeing(&_current, make_blocknum(i), pair));
-                _bt_block_allocator.free_block(pair->u.diskoff);
+            if (pair->size > 0 &&
+                !_translation_prevents_freeing(
+                    &_inprogress, make_blocknum(i), pair)) {
+                invariant(!_translation_prevents_freeing(
+                              &_current, make_blocknum(i), pair));
+                _bt_block_allocator->FreeBlock(pair->u.diskoff, pair->size);
             }
         }
         toku_free(_checkpointed.block_translation);
@@ -360,53 +398,65 @@ bool block_table::_is_valid_blocknum(struct translation *t, BLOCKNUM b) {
     return b.b >= 0 && b.b < t->smallest_never_used_blocknum.b;
 }
 
-void block_table::_verify_valid_blocknum(struct translation *UU(t), BLOCKNUM UU(b)) {
+void block_table::_verify_valid_blocknum(struct translation *UU(t),
+                                         BLOCKNUM UU(b)) {
     invariant(_is_valid_blocknum(t, b));
 }
 
-bool block_table::_is_valid_freeable_blocknum(struct translation *t, BLOCKNUM b) {
+bool block_table::_is_valid_freeable_blocknum(struct translation *t,
+                                              BLOCKNUM b) {
     invariant(t->length_of_array >= t->smallest_never_used_blocknum.b);
     return b.b >= RESERVED_BLOCKNUMS && b.b < t->smallest_never_used_blocknum.b;
 }
 
 // should be freeable
-void block_table::_verify_valid_freeable_blocknum(struct translation *UU(t), BLOCKNUM UU(b)) {
+void block_table::_verify_valid_freeable_blocknum(struct translation *UU(t),
+                                                  BLOCKNUM UU(b)) {
     invariant(_is_valid_freeable_blocknum(t, b));
 }
 
 // Also used only in ft-serialize-test.
-void block_table::block_free(uint64_t offset) {
+void block_table::block_free(uint64_t offset, uint64_t size) {
     _mutex_lock();
-    _bt_block_allocator.free_block(offset);
+    _bt_block_allocator->FreeBlock(offset, size);
     _mutex_unlock();
 }
 
 int64_t block_table::_calculate_size_on_disk(struct translation *t) {
-    return 8 + // smallest_never_used_blocknum
-           8 + // blocknum_freelist_head
-           t->smallest_never_used_blocknum.b * 16 + // Array
-           4; // 4 for checksum
+    return 8 +  // smallest_never_used_blocknum
+           8 +  // blocknum_freelist_head
+           t->smallest_never_used_blocknum.b * 16 +  // Array
+           4;                                        // 4 for checksum
 }
 
-// We cannot free the disk space allocated to this blocknum if it is still in use by the given translation table.
-bool block_table::_translation_prevents_freeing(struct translation *t, BLOCKNUM b, struct block_translation_pair *old_pair) {
-    return t->block_translation &&
-           b.b < t->smallest_never_used_blocknum.b &&
+// We cannot free the disk space allocated to this blocknum if it is still in
+// use by the given translation table.
+bool block_table::_translation_prevents_freeing(
+    struct translation *t,
+    BLOCKNUM b,
+    struct block_translation_pair *old_pair) {
+    return t->block_translation && b.b < t->smallest_never_used_blocknum.b &&
            old_pair->u.diskoff == t->block_translation[b.b].u.diskoff;
 }
 
-void block_table::_realloc_on_disk_internal(BLOCKNUM b, DISKOFF size, DISKOFF *offset, FT ft, bool for_checkpoint, uint64_t heat) {
+void block_table::_realloc_on_disk_internal(BLOCKNUM b,
+                                            DISKOFF size,
+                                            DISKOFF *offset,
+                                            FT ft,
+                                            bool for_checkpoint) {
     toku_mutex_assert_locked(&_mutex);
     ft_set_dirty(ft, for_checkpoint);
 
     struct translation *t = &_current;
     struct block_translation_pair old_pair = t->block_translation[b.b];
-    //Free the old block if it is not still in use by the checkpoint in progress or the previous checkpoint
-    bool cannot_free = (bool)
-        ((!for_checkpoint && _translation_prevents_freeing(&_inprogress,   b, &old_pair)) ||
-         _translation_prevents_freeing(&_checkpointed, b, &old_pair));
-    if (!cannot_free && old_pair.u.diskoff!=diskoff_unused) {
-        _bt_block_allocator.free_block(old_pair.u.diskoff);
+    // Free the old block if it is not still in use by the checkpoint in
+    // progress or the previous checkpoint
+    bool cannot_free =
+        (!for_checkpoint &&
+         _translation_prevents_freeing(&_inprogress, b, &old_pair)) ||
+        _translation_prevents_freeing(&_checkpointed, b, &old_pair);
+    if (!cannot_free && old_pair.u.diskoff != diskoff_unused) {
+        _bt_block_allocator->FreeBlock(old_pair.u.diskoff, old_pair.size);
     }
 
     uint64_t allocator_offset = diskoff_unused;
@@ -414,19 +464,22 @@ void block_table::_realloc_on_disk_internal(BLOCKNUM b, DISKOFF size, DISKOFF *o
     if (size > 0) {
         // Allocate a new block if the size is greater than 0,
         // if the size is just 0, offset will be set to diskoff_unused
-        _bt_block_allocator.alloc_block(size, heat, &allocator_offset);
+        _bt_block_allocator->AllocBlock(size, &allocator_offset);
     }
     t->block_translation[b.b].u.diskoff = allocator_offset;
     *offset = allocator_offset;
 
-    //Update inprogress btt if appropriate (if called because Pending bit is set).
+    // Update inprogress btt if appropriate (if called because Pending bit is
+    // set).
     if (for_checkpoint) {
         paranoid_invariant(b.b < _inprogress.length_of_array);
         _inprogress.block_translation[b.b] = t->block_translation[b.b];
     }
 }
 
-void block_table::_ensure_safe_write_unlocked(int fd, DISKOFF block_size, DISKOFF block_offset) {
+void block_table::_ensure_safe_write_unlocked(int fd,
+                                              DISKOFF block_size,
+                                              DISKOFF block_offset) {
     // Requires: holding _mutex
     uint64_t size_needed = block_size + block_offset;
     if (size_needed > _safe_file_size) {
@@ -436,7 +489,8 @@ void block_table::_ensure_safe_write_unlocked(int fd, DISKOFF block_size, DISKOF
             _mutex_unlock();
 
             int64_t size_after;
-            toku_maybe_preallocate_in_file(fd, size_needed, _safe_file_size, &size_after);
+            toku_maybe_preallocate_in_file(
+                fd, size_needed, _safe_file_size, &size_after);
 
             _mutex_lock();
             _safe_file_size = size_after;
@@ -445,11 +499,16 @@ void block_table::_ensure_safe_write_unlocked(int fd, DISKOFF block_size, DISKOF
     }
 }
 
-void block_table::realloc_on_disk(BLOCKNUM b, DISKOFF size, DISKOFF *offset, FT ft, int fd, bool for_checkpoint, uint64_t heat) {
+void block_table::realloc_on_disk(BLOCKNUM b,
+                                  DISKOFF size,
+                                  DISKOFF *offset,
+                                  FT ft,
+                                  int fd,
+                                  bool for_checkpoint) {
     _mutex_lock();
     struct translation *t = &_current;
     _verify_valid_freeable_blocknum(t, b);
-    _realloc_on_disk_internal(b, size, offset, ft, for_checkpoint, heat);
+    _realloc_on_disk_internal(b, size, offset, ft, for_checkpoint);
 
     _ensure_safe_write_unlocked(fd, size, *offset);
     _mutex_unlock();
@@ -459,70 +518,97 @@ bool block_table::_pair_is_unallocated(struct block_translation_pair *pair) {
     return pair->size == 0 && pair->u.diskoff == diskoff_unused;
 }
 
-// Effect: figure out where to put the inprogress btt on disk, allocate space for it there.
-//   The space must be 512-byte aligned (both the starting address and the size).
-//   As a result, the allcoated space may be a little bit bigger (up to the next 512-byte boundary) than the actual btt.
+// Effect: figure out where to put the inprogress btt on disk, allocate space
+// for it there.
+//   The space must be 512-byte aligned (both the starting address and the
+//   size).
+//   As a result, the allocated space may be a little bit bigger (up to the next
+//   512-byte boundary) than the actual btt.
 void block_table::_alloc_inprogress_translation_on_disk_unlocked() {
     toku_mutex_assert_locked(&_mutex);
 
     struct translation *t = &_inprogress;
     paranoid_invariant_notnull(t->block_translation);
     BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_TRANSLATION);
-    //Each inprogress is allocated only once
+    // Each inprogress is allocated only once
     paranoid_invariant(_pair_is_unallocated(&t->block_translation[b.b]));
 
-    //Allocate a new block
+    // Allocate a new block
     int64_t size = _calculate_size_on_disk(t);
     uint64_t offset;
-    _bt_block_allocator.alloc_block(size, 0, &offset);
+    _bt_block_allocator->AllocBlock(size, &offset);
     t->block_translation[b.b].u.diskoff = offset;
-    t->block_translation[b.b].size      = size;
+    t->block_translation[b.b].size = size;
 }
 
 // Effect: Serializes the blocktable to a wbuf (which starts uninitialized)
-//   A clean shutdown runs checkpoint start so that current and inprogress are copies.
-//   The resulting wbuf buffer is guaranteed to be be 512-byte aligned and the total length is a multiple of 512 (so we pad with zeros at the end if needd)
-//   The address is guaranteed to be 512-byte aligned, but the size is not guaranteed.
-//   It *is* guaranteed that we can read up to the next 512-byte boundary, however
-void block_table::serialize_translation_to_wbuf(int fd, struct wbuf *w,
-                                                int64_t *address, int64_t *size) {
+//   A clean shutdown runs checkpoint start so that current and inprogress are
+//   copies.
+//   The resulting wbuf buffer is guaranteed to be 512-byte aligned and the
+//   total length is a multiple of 512 (so we pad with zeros at the end if
+//   needed)
+//   The address is guaranteed to be 512-byte aligned, but the size is not
+//   guaranteed.
+//   It *is* guaranteed that we can read up to the next 512-byte boundary,
+//   however
+void block_table::serialize_translation_to_wbuf(int fd,
+                                                struct wbuf *w,
+                                                int64_t *address,
+                                                int64_t *size) {
     _mutex_lock();
     struct translation *t = &_inprogress;
 
     BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_TRANSLATION);
-    _alloc_inprogress_translation_on_disk_unlocked(); // The allocated block must be 512-byte aligned to make O_DIRECT happy.
+    _alloc_inprogress_translation_on_disk_unlocked();  // The allocated block
+                                                       // must be 512-byte
+                                                       // aligned to make
+                                                       // O_DIRECT happy.
     uint64_t size_translation = _calculate_size_on_disk(t);
-    uint64_t size_aligned     = roundup_to_multiple(512, size_translation);
-    assert((int64_t)size_translation==t->block_translation[b.b].size);
+    uint64_t size_aligned = roundup_to_multiple(512, size_translation);
+    invariant((int64_t)size_translation == t->block_translation[b.b].size);
     {
-        //Init wbuf
+        // Init wbuf
         if (0)
-            printf("%s:%d writing translation table of size_translation %" PRIu64 " at %" PRId64 "\n", __FILE__, __LINE__, size_translation, t->block_translation[b.b].u.diskoff);
+            printf(
+                "%s:%d writing translation table of size_translation %" PRIu64
+                " at %" PRId64 "\n",
+                __FILE__,
+                __LINE__,
+                size_translation,
+                t->block_translation[b.b].u.diskoff);
         char *XMALLOC_N_ALIGNED(512, size_aligned, buf);
-        for (uint64_t i=size_translation; i<size_aligned; i++) buf[i]=0; // fill in the end of the buffer with zeros.
+        for (uint64_t i = size_translation; i < size_aligned; i++)
+            buf[i] = 0;  // fill in the end of the buffer with zeros.
         wbuf_init(w, buf, size_aligned);
     }
-    wbuf_BLOCKNUM(w, t->smallest_never_used_blocknum); 
-    wbuf_BLOCKNUM(w, t->blocknum_freelist_head); 
+    wbuf_BLOCKNUM(w, t->smallest_never_used_blocknum);
+    wbuf_BLOCKNUM(w, t->blocknum_freelist_head);
     int64_t i;
-    for (i=0; i<t->smallest_never_used_blocknum.b; i++) {
+    for (i = 0; i < t->smallest_never_used_blocknum.b; i++) {
         if (0)
-            printf("%s:%d %" PRId64 ",%" PRId64 "\n", __FILE__, __LINE__, t->block_translation[i].u.diskoff, t->block_translation[i].size);
+            printf("%s:%d %" PRId64 ",%" PRId64 "\n",
+                   __FILE__,
+                   __LINE__,
+                   t->block_translation[i].u.diskoff,
+                   t->block_translation[i].size);
         wbuf_DISKOFF(w, t->block_translation[i].u.diskoff);
         wbuf_DISKOFF(w, t->block_translation[i].size);
     }
     uint32_t checksum = toku_x1764_finish(&w->checksum);
     wbuf_int(w, checksum);
     *address = t->block_translation[b.b].u.diskoff;
-    *size    = size_translation;
-    assert((*address)%512 == 0);
+    *size = size_translation;
+    invariant((*address) % 512 == 0);
 
     _ensure_safe_write_unlocked(fd, size_aligned, *address);
     _mutex_unlock();
 }
 
-// Perhaps rename: purpose is get disk address of a block, given its blocknum (blockid?)
-void block_table::_translate_blocknum_to_offset_size_unlocked(BLOCKNUM b, DISKOFF *offset, DISKOFF *size) {
+// Perhaps rename: the purpose is to get the disk address of a block, given its
+// blocknum (blockid?)
+void block_table::_translate_blocknum_to_offset_size_unlocked(BLOCKNUM b,
+                                                              DISKOFF *offset,
+                                                              DISKOFF *size) {
     struct translation *t = &_current;
     _verify_valid_blocknum(t, b);
     if (offset) {
@@ -533,8 +619,11 @@ void block_table::_translate_blocknum_to_offset_size_unlocked(BLOCKNUM b, DISKOF
     }
 }
 
-// Perhaps rename: purpose is get disk address of a block, given its blocknum (blockid?)
-void block_table::translate_blocknum_to_offset_size(BLOCKNUM b, DISKOFF *offset, DISKOFF *size) {
+// Perhaps rename: the purpose is to get the disk address of a block, given its
+// blocknum (blockid?)
+void block_table::translate_blocknum_to_offset_size(BLOCKNUM b,
+                                                    DISKOFF *offset,
+                                                    DISKOFF *size) {
     _mutex_lock();
     _translate_blocknum_to_offset_size_unlocked(b, offset, size);
     _mutex_unlock();
@@ -545,13 +634,13 @@ void block_table::translate_blocknum_to_offset_size(BLOCKNUM b, DISKOFF *offset,
 // given that one more never-used blocknum will soon be used.
 void block_table::_maybe_expand_translation(struct translation *t) {
     if (t->length_of_array <= t->smallest_never_used_blocknum.b) {
-        //expansion is necessary
+        // expansion is necessary
         uint64_t new_length = t->smallest_never_used_blocknum.b * 2;
         XREALLOC_N(new_length, t->block_translation);
         uint64_t i;
         for (i = t->length_of_array; i < new_length; i++) {
             t->block_translation[i].u.next_free_blocknum = freelist_null;
-            t->block_translation[i].size                 = size_is_free;
+            t->block_translation[i].size = size_is_free;
         }
         t->length_of_array = new_length;
     }
@@ -564,7 +653,8 @@ void block_table::_allocate_blocknum_unlocked(BLOCKNUM *res, FT ft) {
     if (t->blocknum_freelist_head.b == freelist_null.b) {
         // no previously used blocknums are available
         // use a never used blocknum
-        _maybe_expand_translation(t); //Ensure a never used blocknums is available
+        _maybe_expand_translation(
+            t);  // Ensure a never used blocknums is available
         result = t->smallest_never_used_blocknum;
         t->smallest_never_used_blocknum.b++;
     } else {  // reuse a previously used blocknum
@@ -572,11 +662,11 @@ void block_table::_allocate_blocknum_unlocked(BLOCKNUM *res, FT ft) {
         BLOCKNUM next = t->block_translation[result.b].u.next_free_blocknum;
         t->blocknum_freelist_head = next;
     }
-    //Verify the blocknum is free
+    // Verify the blocknum is free
     paranoid_invariant(t->block_translation[result.b].size == size_is_free);
-    //blocknum is not free anymore
+    // blocknum is not free anymore
     t->block_translation[result.b].u.diskoff = diskoff_unused;
-    t->block_translation[result.b].size    = 0;
+    t->block_translation[result.b].size = 0;
     _verify_valid_freeable_blocknum(t, result);
     *res = result;
     ft_set_dirty(ft, false);
@@ -588,42 +678,46 @@ void block_table::allocate_blocknum(BLOCKNUM *res, FT ft) {
     _mutex_unlock();
 }
 
-void block_table::_free_blocknum_in_translation(struct translation *t, BLOCKNUM b) {
+void block_table::_free_blocknum_in_translation(struct translation *t,
+                                                BLOCKNUM b) {
     _verify_valid_freeable_blocknum(t, b);
     paranoid_invariant(t->block_translation[b.b].size != size_is_free);
 
-    t->block_translation[b.b].size                 = size_is_free;
+    t->block_translation[b.b].size = size_is_free;
     t->block_translation[b.b].u.next_free_blocknum = t->blocknum_freelist_head;
-    t->blocknum_freelist_head                      = b;
+    t->blocknum_freelist_head = b;
 }
 
 // Effect: Free a blocknum.
 // If the blocknum holds the only reference to a block on disk, free that block
-void block_table::_free_blocknum_unlocked(BLOCKNUM *bp, FT ft, bool for_checkpoint) {
+void block_table::_free_blocknum_unlocked(BLOCKNUM *bp,
+                                          FT ft,
+                                          bool for_checkpoint) {
     toku_mutex_assert_locked(&_mutex);
     BLOCKNUM b = *bp;
-    bp->b = 0; //Remove caller's reference.
+    bp->b = 0;  // Remove caller's reference.
 
     struct block_translation_pair old_pair = _current.block_translation[b.b];
 
     _free_blocknum_in_translation(&_current, b);
     if (for_checkpoint) {
-        paranoid_invariant(ft->checkpoint_header->type == FT_CHECKPOINT_INPROGRESS);
+        paranoid_invariant(ft->checkpoint_header->type ==
+                           FT_CHECKPOINT_INPROGRESS);
         _free_blocknum_in_translation(&_inprogress, b);
     }
 
-    //If the size is 0, no disk block has ever been assigned to this blocknum.
+    // If the size is 0, no disk block has ever been assigned to this blocknum.
     if (old_pair.size > 0) {
-        //Free the old block if it is not still in use by the checkpoint in progress or the previous checkpoint
-        bool cannot_free = (bool)
-            (_translation_prevents_freeing(&_inprogress,   b, &old_pair) ||
-             _translation_prevents_freeing(&_checkpointed, b, &old_pair));
+        // Free the old block if it is not still in use by the checkpoint in
+        // progress or the previous checkpoint
+        bool cannot_free =
+            _translation_prevents_freeing(&_inprogress, b, &old_pair) ||
+            _translation_prevents_freeing(&_checkpointed, b, &old_pair);
         if (!cannot_free) {
-            _bt_block_allocator.free_block(old_pair.u.diskoff);
+            _bt_block_allocator->FreeBlock(old_pair.u.diskoff, old_pair.size);
         }
-    }
-    else {
-        paranoid_invariant(old_pair.size==0);
+    } else {
+        paranoid_invariant(old_pair.size == 0);
         paranoid_invariant(old_pair.u.diskoff == diskoff_unused);
     }
     ft_set_dirty(ft, for_checkpoint);
@@ -645,13 +739,14 @@ void block_table::verify_no_free_blocknums() {
 void block_table::free_unused_blocknums(BLOCKNUM root) {
     _mutex_lock();
     int64_t smallest = _current.smallest_never_used_blocknum.b;
-    for (int64_t i=RESERVED_BLOCKNUMS; i < smallest; i++) {
+    for (int64_t i = RESERVED_BLOCKNUMS; i < smallest; i++) {
         if (i == root.b) {
             continue;
         }
         BLOCKNUM b = make_blocknum(i);
         if (_current.block_translation[b.b].size == 0) {
-            invariant(_current.block_translation[b.b].u.diskoff == diskoff_unused);
+            invariant(_current.block_translation[b.b].u.diskoff ==
+                      diskoff_unused);
             _free_blocknum_in_translation(&_current, b);
         }
     }
@@ -676,13 +771,14 @@ bool block_table::_no_data_blocks_except_root(BLOCKNUM root) {
             goto cleanup;
         }
     }
- cleanup:
+cleanup:
     _mutex_unlock();
     return ok;
 }
 
 // Verify there are no data blocks except root.
-// TODO(leif): This actually takes a lock, but I don't want to fix all the callers right now.
+// TODO(leif): This actually takes a lock, but I don't want to fix all the
+// callers right now.
 void block_table::verify_no_data_blocks_except_root(BLOCKNUM UU(root)) {
     paranoid_invariant(_no_data_blocks_except_root(root));
 }
@@ -706,13 +802,24 @@ void block_table::_dump_translation_internal(FILE *f, struct translation *t) {
     if (t->block_translation) {
         BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_TRANSLATION);
         fprintf(f, " length_of_array[%" PRId64 "]", t->length_of_array);
-        fprintf(f, " smallest_never_used_blocknum[%" PRId64 "]", t->smallest_never_used_blocknum.b);
-        fprintf(f, " blocknum_free_list_head[%" PRId64 "]", t->blocknum_freelist_head.b);
-        fprintf(f, " size_on_disk[%" PRId64 "]", t->block_translation[b.b].size);
-        fprintf(f, " location_on_disk[%" PRId64 "]\n", t->block_translation[b.b].u.diskoff);
+        fprintf(f,
+                " smallest_never_used_blocknum[%" PRId64 "]",
+                t->smallest_never_used_blocknum.b);
+        fprintf(f,
+                " blocknum_free_list_head[%" PRId64 "]",
+                t->blocknum_freelist_head.b);
+        fprintf(
+            f, " size_on_disk[%" PRId64 "]", t->block_translation[b.b].size);
+        fprintf(f,
+                " location_on_disk[%" PRId64 "]\n",
+                t->block_translation[b.b].u.diskoff);
         int64_t i;
-        for (i=0; i<t->length_of_array; i++) {
-            fprintf(f, " %" PRId64 ": %" PRId64 " %" PRId64 "\n", i, t->block_translation[i].u.diskoff, t->block_translation[i].size);
+        for (i = 0; i < t->length_of_array; i++) {
+            fprintf(f,
+                    " %" PRId64 ": %" PRId64 " %" PRId64 "\n",
+                    i,
+                    t->block_translation[i].u.diskoff,
+                    t->block_translation[i].size);
         }
         fprintf(f, "\n");
     } else {
@@ -725,9 +832,13 @@ void block_table::_dump_translation_internal(FILE *f, struct translation *t) {
 void block_table::dump_translation_table_pretty(FILE *f) {
     _mutex_lock();
     struct translation *t = &_checkpointed;
-    assert(t->block_translation != nullptr);
+    invariant(t->block_translation != nullptr);
     for (int64_t i = 0; i < t->length_of_array; ++i) {
-        fprintf(f, "%" PRId64 "\t%" PRId64 "\t%" PRId64 "\n", i, t->block_translation[i].u.diskoff, t->block_translation[i].size);
+        fprintf(f,
+                "%" PRId64 "\t%" PRId64 "\t%" PRId64 "\n",
+                i,
+                t->block_translation[i].u.diskoff,
+                t->block_translation[i].size);
     }
     _mutex_unlock();
 }
@@ -751,7 +862,10 @@ void block_table::blocknum_dump_translation(BLOCKNUM b) {
     struct translation *t = &_current;
     if (b.b < t->length_of_array) {
         struct block_translation_pair *bx = &t->block_translation[b.b];
-        printf("%" PRId64 ": %" PRId64 " %" PRId64 "\n", b.b, bx->u.diskoff, bx->size);
+        printf("%" PRId64 ": %" PRId64 " %" PRId64 "\n",
+               b.b,
+               bx->u.diskoff,
+               bx->size);
     }
     _mutex_unlock();
 }
@@ -764,26 +878,31 @@ void block_table::destroy(void) {
     toku_free(_inprogress.block_translation);
     toku_free(_checkpointed.block_translation);
 
-    _bt_block_allocator.destroy();
+    _bt_block_allocator->Destroy();
+    delete _bt_block_allocator;
     toku_mutex_destroy(&_mutex);
     nb_mutex_destroy(&_safe_file_size_lock);
 }
 
-int block_table::_translation_deserialize_from_buffer(struct translation *t,
-                                                      DISKOFF location_on_disk,
-                                                      uint64_t size_on_disk,
-                                                      // out: buffer with serialized translation
-                                                      unsigned char *translation_buffer) {
+int block_table::_translation_deserialize_from_buffer(
+    struct translation *t,
+    DISKOFF location_on_disk,
+    uint64_t size_on_disk,
+    // out: buffer with serialized translation
+    unsigned char *translation_buffer) {
     int r = 0;
-    assert(location_on_disk != 0);
+    invariant(location_on_disk != 0);
     t->type = TRANSLATION_CHECKPOINTED;
 
     // check the checksum
     uint32_t x1764 = toku_x1764_memory(translation_buffer, size_on_disk - 4);
     uint64_t offset = size_on_disk - 4;
-    uint32_t stored_x1764 = toku_dtoh32(*(int*)(translation_buffer + offset));
+    uint32_t stored_x1764 = toku_dtoh32(*(int *)(translation_buffer + offset));
     if (x1764 != stored_x1764) {
-        fprintf(stderr, "Translation table checksum failure: calc=0x%08x read=0x%08x\n", x1764, stored_x1764);
+        fprintf(stderr,
+                "Translation table checksum failure: calc=0x%08x read=0x%08x\n",
+                x1764,
+                stored_x1764);
         r = TOKUDB_BAD_CHECKSUM;
         goto exit;
     }
@@ -791,42 +910,47 @@ int block_table::_translation_deserialize_from_buffer(struct translation *t,
     struct rbuf rb;
     rb.buf = translation_buffer;
     rb.ndone = 0;
-    rb.size = size_on_disk-4;//4==checksum
+    rb.size = size_on_disk - 4;  // 4==checksum
 
-    t->smallest_never_used_blocknum = rbuf_blocknum(&rb); 
+    t->smallest_never_used_blocknum = rbuf_blocknum(&rb);
     t->length_of_array = t->smallest_never_used_blocknum.b;
     invariant(t->smallest_never_used_blocknum.b >= RESERVED_BLOCKNUMS);
-    t->blocknum_freelist_head = rbuf_blocknum(&rb); 
+    t->blocknum_freelist_head = rbuf_blocknum(&rb);
     XMALLOC_N(t->length_of_array, t->block_translation);
     for (int64_t i = 0; i < t->length_of_array; i++) {
         t->block_translation[i].u.diskoff = rbuf_DISKOFF(&rb);
         t->block_translation[i].size = rbuf_DISKOFF(&rb);
     }
-    invariant(_calculate_size_on_disk(t) == (int64_t) size_on_disk);
-    invariant(t->block_translation[RESERVED_BLOCKNUM_TRANSLATION].size == (int64_t) size_on_disk);
-    invariant(t->block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff == location_on_disk);
+    invariant(_calculate_size_on_disk(t) == (int64_t)size_on_disk);
+    invariant(t->block_translation[RESERVED_BLOCKNUM_TRANSLATION].size ==
+              (int64_t)size_on_disk);
+    invariant(t->block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff ==
+              location_on_disk);
 
 exit:
     return r;
 }
 
 int block_table::iterate(enum translation_type type,
-                         BLOCKTABLE_CALLBACK f, void *extra, bool data_only, bool used_only) {
+                         BLOCKTABLE_CALLBACK f,
+                         void *extra,
+                         bool data_only,
+                         bool used_only) {
     struct translation *src;
-    
+
     int r = 0;
     switch (type) {
-    case TRANSLATION_CURRENT:
-        src = &_current;
-        break;
-    case TRANSLATION_INPROGRESS:
-        src = &_inprogress;
-        break;
-    case TRANSLATION_CHECKPOINTED:
-        src = &_checkpointed;
-        break;
-    default:
-        r = EINVAL;
+        case TRANSLATION_CURRENT:
+            src = &_current;
+            break;
+        case TRANSLATION_INPROGRESS:
+            src = &_inprogress;
+            break;
+        case TRANSLATION_CHECKPOINTED:
+            src = &_checkpointed;
+            break;
+        default:
+            r = EINVAL;
     }
 
     struct translation fakecurrent;
@@ -840,12 +964,15 @@ int block_table::iterate(enum translation_type type,
             src->block_translation[RESERVED_BLOCKNUM_TRANSLATION];
         _mutex_unlock();
         int64_t i;
-        for (i=0; i<t->smallest_never_used_blocknum.b; i++) {
+        for (i = 0; i < t->smallest_never_used_blocknum.b; i++) {
             struct block_translation_pair pair = t->block_translation[i];
-            if (data_only && i< RESERVED_BLOCKNUMS) continue;
-            if (used_only && pair.size <= 0) continue;
+            if (data_only && i < RESERVED_BLOCKNUMS)
+                continue;
+            if (used_only && pair.size <= 0)
+                continue;
             r = f(make_blocknum(i), pair.size, pair.u.diskoff, extra);
-            if (r!=0) break;
+            if (r != 0)
+                break;
         }
         toku_free(t->block_translation);
     }
@@ -857,8 +984,11 @@ typedef struct {
     int64_t total_space;
 } frag_extra;
 
-static int frag_helper(BLOCKNUM UU(b), int64_t size, int64_t address, void *extra) {
-    frag_extra *info = (frag_extra *) extra;
+static int frag_helper(BLOCKNUM UU(b),
+                       int64_t size,
+                       int64_t address,
+                       void *extra) {
+    frag_extra *info = (frag_extra *)extra;
 
     if (size + address > info->total_space)
         info->total_space = size + address;
@@ -866,22 +996,30 @@ static int frag_helper(BLOCKNUM UU(b), int64_t size, int64_t address, void *extr
     return 0;
 }
 
-void block_table::internal_fragmentation(int64_t *total_sizep, int64_t *used_sizep) {
-    frag_extra info = { 0, 0 };
+void block_table::internal_fragmentation(int64_t *total_sizep,
+                                         int64_t *used_sizep) {
+    frag_extra info = {0, 0};
     int r = iterate(TRANSLATION_CHECKPOINTED, frag_helper, &info, false, true);
-    assert_zero(r);
+    invariant_zero(r);
 
-    if (total_sizep) *total_sizep = info.total_space;
-    if (used_sizep)  *used_sizep  = info.used_space;
+    if (total_sizep)
+        *total_sizep = info.total_space;
+    if (used_sizep)
+        *used_sizep = info.used_space;
 }
 
-void block_table::_realloc_descriptor_on_disk_unlocked(DISKOFF size, DISKOFF *offset, FT ft) {
+void block_table::_realloc_descriptor_on_disk_unlocked(DISKOFF size,
+                                                       DISKOFF *offset,
+                                                       FT ft) {
     toku_mutex_assert_locked(&_mutex);
     BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_DESCRIPTOR);
-    _realloc_on_disk_internal(b, size, offset, ft, false, 0);
+    _realloc_on_disk_internal(b, size, offset, ft, false);
 }
 
-void block_table::realloc_descriptor_on_disk(DISKOFF size, DISKOFF *offset, FT ft, int fd) {
+void block_table::realloc_descriptor_on_disk(DISKOFF size,
+                                             DISKOFF *offset,
+                                             FT ft,
+                                             int fd) {
     _mutex_lock();
     _realloc_descriptor_on_disk_unlocked(size, offset, ft);
     _ensure_safe_write_unlocked(fd, size, *offset);
@@ -898,11 +1036,12 @@ void block_table::get_descriptor_offset_size(DISKOFF *offset, DISKOFF *size) {
 void block_table::get_fragmentation_unlocked(TOKU_DB_FRAGMENTATION report) {
     // Requires:  blocktable lock is held.
     // Requires:  report->file_size_bytes is already filled in.
-    
+
     // Count the headers.
-    report->data_bytes = block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE;
+    report->data_bytes = BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE;
     report->data_blocks = 1;
-    report->checkpoint_bytes_additional = block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE;
+    report->checkpoint_bytes_additional =
+        BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE;
     report->checkpoint_blocks_additional = 1;
 
     struct translation *current = &_current;
@@ -916,30 +1055,34 @@ void block_table::get_fragmentation_unlocked(TOKU_DB_FRAGMENTATION report) {
 
     struct translation *checkpointed = &_checkpointed;
     for (int64_t i = 0; i < checkpointed->length_of_array; i++) {
-        struct block_translation_pair *pair = &checkpointed->block_translation[i];
-        if (pair->size > 0 && !(i < current->length_of_array &&
-                                current->block_translation[i].size > 0 &&
-                                current->block_translation[i].u.diskoff == pair->u.diskoff)) {
-                report->checkpoint_bytes_additional += pair->size;
-                report->checkpoint_blocks_additional++;
+        struct block_translation_pair *pair =
+            &checkpointed->block_translation[i];
+        if (pair->size > 0 &&
+            !(i < current->length_of_array &&
+              current->block_translation[i].size > 0 &&
+              current->block_translation[i].u.diskoff == pair->u.diskoff)) {
+            report->checkpoint_bytes_additional += pair->size;
+            report->checkpoint_blocks_additional++;
         }
     }
 
     struct translation *inprogress = &_inprogress;
     for (int64_t i = 0; i < inprogress->length_of_array; i++) {
         struct block_translation_pair *pair = &inprogress->block_translation[i];
-        if (pair->size > 0 && !(i < current->length_of_array &&
-                                current->block_translation[i].size > 0 &&
-                                current->block_translation[i].u.diskoff == pair->u.diskoff) &&
-                              !(i < checkpointed->length_of_array &&
-                                checkpointed->block_translation[i].size > 0 &&
-                                checkpointed->block_translation[i].u.diskoff == pair->u.diskoff)) {
+        if (pair->size > 0 &&
+            !(i < current->length_of_array &&
+              current->block_translation[i].size > 0 &&
+              current->block_translation[i].u.diskoff == pair->u.diskoff) &&
+            !(i < checkpointed->length_of_array &&
+              checkpointed->block_translation[i].size > 0 &&
+              checkpointed->block_translation[i].u.diskoff ==
+                  pair->u.diskoff)) {
             report->checkpoint_bytes_additional += pair->size;
             report->checkpoint_blocks_additional++;
         }
     }
 
-    _bt_block_allocator.get_unused_statistics(report);
+    _bt_block_allocator->UnusedStatistics(report);
 }
 
 void block_table::get_info64(struct ftinfo64 *s) {
@@ -968,25 +1111,38 @@ void block_table::get_info64(struct ftinfo64 *s) {
     _mutex_unlock();
 }
 
-int block_table::iterate_translation_tables(uint64_t checkpoint_count,
-                                            int (*iter)(uint64_t checkpoint_count,
-                                                        int64_t total_num_rows,
-                                                        int64_t blocknum,
-                                                        int64_t diskoff,
-                                                        int64_t size,
-                                                        void *extra),
-                                            void *iter_extra) {
+int block_table::iterate_translation_tables(
+    uint64_t checkpoint_count,
+    int (*iter)(uint64_t checkpoint_count,
+                int64_t total_num_rows,
+                int64_t blocknum,
+                int64_t diskoff,
+                int64_t size,
+                void *extra),
+    void *iter_extra) {
     int error = 0;
     _mutex_lock();
 
-    int64_t total_num_rows = _current.length_of_array + _checkpointed.length_of_array;
+    int64_t total_num_rows =
+        _current.length_of_array + _checkpointed.length_of_array;
     for (int64_t i = 0; error == 0 && i < _current.length_of_array; ++i) {
         struct block_translation_pair *block = &_current.block_translation[i];
-        error = iter(checkpoint_count, total_num_rows, i, block->u.diskoff, block->size, iter_extra);
+        error = iter(checkpoint_count,
+                     total_num_rows,
+                     i,
+                     block->u.diskoff,
+                     block->size,
+                     iter_extra);
     }
     for (int64_t i = 0; error == 0 && i < _checkpointed.length_of_array; ++i) {
-        struct block_translation_pair *block = &_checkpointed.block_translation[i];
-        error = iter(checkpoint_count - 1, total_num_rows, i, block->u.diskoff, block->size, iter_extra);
+        struct block_translation_pair *block =
+            &_checkpointed.block_translation[i];
+        error = iter(checkpoint_count - 1,
+                     total_num_rows,
+                     i,
+                     block->u.diskoff,
+                     block->size,
+                     iter_extra);
     }
 
     _mutex_unlock();
diff --git a/storage/tokudb/PerconaFT/ft/serialize/block_table.h b/storage/tokudb/PerconaFT/ft/serialize/block_table.h
index 8d391674540..dd732d4f372 100644
--- a/storage/tokudb/PerconaFT/ft/serialize/block_table.h
+++ b/storage/tokudb/PerconaFT/ft/serialize/block_table.h
@@ -62,13 +62,16 @@ enum {
     RESERVED_BLOCKNUMS
 };
 
-typedef int (*BLOCKTABLE_CALLBACK)(BLOCKNUM b, int64_t size, int64_t address, void *extra);
+typedef int (*BLOCKTABLE_CALLBACK)(BLOCKNUM b,
+                                   int64_t size,
+                                   int64_t address,
+                                   void *extra);
 
 static inline BLOCKNUM make_blocknum(int64_t b) {
-    BLOCKNUM result = { .b = b };
+    BLOCKNUM result = {.b = b};
     return result;
 }
-static const BLOCKNUM ROLLBACK_NONE = { .b = 0 };
+static const BLOCKNUM ROLLBACK_NONE = {.b = 0};
 
 /**
  *  There are three copies of the translation table (btt) in the block table:
@@ -80,18 +83,20 @@ static const BLOCKNUM ROLLBACK_NONE = { .b = 0 };
  *
  *    inprogress     Is only filled by copying from current,
  *                   and is the only version ever serialized to disk.
- *                   (It is serialized to disk on checkpoint and clean shutdown.)
+ *                   (It is serialized to disk on checkpoint and clean
+ *                   shutdown.)
  *                   At end of checkpoint it replaces 'checkpointed'.
  *                   During a checkpoint, any 'pending' dirty writes will update
  *                   inprogress.
  *
  *    current        Is initialized by copying from checkpointed,
- *                   is the only version ever modified while the database is in use, 
+ *                   is the only version ever modified while the database is in
+ *                   use,
  *                   and is the only version ever copied to inprogress.
  *                   It is never stored on disk.
  */
 class block_table {
-public:
+   public:
     enum translation_type {
         TRANSLATION_NONE = 0,
         TRANSLATION_CURRENT,
@@ -102,7 +107,10 @@ public:
 
     void create();
 
-    int create_from_buffer(int fd, DISKOFF location_on_disk, DISKOFF size_on_disk, unsigned char *translation_buffer);
+    int create_from_buffer(int fd,
+                           DISKOFF location_on_disk,
+                           DISKOFF size_on_disk,
+                           unsigned char *translation_buffer);
 
     void destroy();
 
@@ -114,11 +122,21 @@ public:
 
     // Blocknums
     void allocate_blocknum(BLOCKNUM *res, struct ft *ft);
-    void realloc_on_disk(BLOCKNUM b, DISKOFF size, DISKOFF *offset, struct ft *ft, int fd, bool for_checkpoint, uint64_t heat);
+    void realloc_on_disk(BLOCKNUM b,
+                         DISKOFF size,
+                         DISKOFF *offset,
+                         struct ft *ft,
+                         int fd,
+                         bool for_checkpoint);
     void free_blocknum(BLOCKNUM *b, struct ft *ft, bool for_checkpoint);
-    void translate_blocknum_to_offset_size(BLOCKNUM b, DISKOFF *offset, DISKOFF *size);
+    void translate_blocknum_to_offset_size(BLOCKNUM b,
+                                           DISKOFF *offset,
+                                           DISKOFF *size);
     void free_unused_blocknums(BLOCKNUM root);
-    void realloc_descriptor_on_disk(DISKOFF size, DISKOFF *offset, struct ft *ft, int fd);
+    void realloc_descriptor_on_disk(DISKOFF size,
+                                    DISKOFF *offset,
+                                    struct ft *ft,
+                                    int fd);
     void get_descriptor_offset_size(DISKOFF *offset, DISKOFF *size);
 
     // External verfication
@@ -127,15 +145,22 @@ public:
     void verify_no_free_blocknums();
 
     // Serialization
-    void serialize_translation_to_wbuf(int fd, struct wbuf *w, int64_t *address, int64_t *size);
+    void serialize_translation_to_wbuf(int fd,
+                                       struct wbuf *w,
+                                       int64_t *address,
+                                       int64_t *size);
 
     // DEBUG ONLY (ftdump included), tests included
     void blocknum_dump_translation(BLOCKNUM b);
     void dump_translation_table_pretty(FILE *f);
     void dump_translation_table(FILE *f);
-    void block_free(uint64_t offset);
+    void block_free(uint64_t offset, uint64_t size);
 
-    int iterate(enum translation_type type, BLOCKTABLE_CALLBACK f, void *extra, bool data_only, bool used_only); 
+    int iterate(enum translation_type type,
+                BLOCKTABLE_CALLBACK f,
+                void *extra,
+                bool data_only,
+                bool used_only);
     void internal_fragmentation(int64_t *total_sizep, int64_t *used_sizep);
 
     // Requires: blocktable lock is held.
@@ -146,13 +171,16 @@ public:
 
     void get_info64(struct ftinfo64 *);
 
-    int iterate_translation_tables(uint64_t, int (*)(uint64_t, int64_t, int64_t, int64_t, int64_t, void *), void *);
+    int iterate_translation_tables(
+        uint64_t,
+        int (*)(uint64_t, int64_t, int64_t, int64_t, int64_t, void *),
+        void *);
 
-private:
+   private:
     struct block_translation_pair {
         // If in the freelist, use next_free_blocknum, otherwise diskoff.
         union {
-            DISKOFF  diskoff; 
+            DISKOFF diskoff;
             BLOCKNUM next_free_blocknum;
         } u;
 
@@ -173,7 +201,8 @@ private:
     struct translation {
         enum translation_type type;
 
-        // Number of elements in array (block_translation).  always >= smallest_never_used_blocknum
+        // Number of elements in array (block_translation).  always >=
+        // smallest_never_used_blocknum
         int64_t length_of_array;
         BLOCKNUM smallest_never_used_blocknum;
 
@@ -181,20 +210,28 @@ private:
         BLOCKNUM blocknum_freelist_head;
         struct block_translation_pair *block_translation;
 
-        // size_on_disk is stored in block_translation[RESERVED_BLOCKNUM_TRANSLATION].size
-        // location_on is stored in block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff
+        // size_on_disk is stored in
+        // block_translation[RESERVED_BLOCKNUM_TRANSLATION].size
+        // location_on is stored in
+        // block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff
     };
 
     void _create_internal();
-    int _translation_deserialize_from_buffer(struct translation *t,    // destination into which to deserialize
-                                             DISKOFF location_on_disk, // location of translation_buffer
-                                             uint64_t size_on_disk,
-                                             unsigned char * translation_buffer);   // buffer with serialized translation
-
-    void _copy_translation(struct translation *dst, struct translation *src, enum translation_type newtype);
+    int _translation_deserialize_from_buffer(
+        struct translation *t,     // destination into which to deserialize
+        DISKOFF location_on_disk,  // location of translation_buffer
+        uint64_t size_on_disk,
+        unsigned char *
+            translation_buffer);  // buffer with serialized translation
+
+    void _copy_translation(struct translation *dst,
+                           struct translation *src,
+                           enum translation_type newtype);
     void _maybe_optimize_translation(struct translation *t);
     void _maybe_expand_translation(struct translation *t);
-    bool _translation_prevents_freeing(struct translation *t, BLOCKNUM b, struct block_translation_pair *old_pair);
+    bool _translation_prevents_freeing(struct translation *t,
+                                       BLOCKNUM b,
+                                       struct block_translation_pair *old_pair);
     void _free_blocknum_in_translation(struct translation *t, BLOCKNUM b);
     int64_t _calculate_size_on_disk(struct translation *t);
     bool _pair_is_unallocated(struct block_translation_pair *pair);
@@ -203,14 +240,26 @@ private:
 
     // Blocknum management
     void _allocate_blocknum_unlocked(BLOCKNUM *res, struct ft *ft);
-    void _free_blocknum_unlocked(BLOCKNUM *bp, struct ft *ft, bool for_checkpoint);
-    void _realloc_descriptor_on_disk_unlocked(DISKOFF size, DISKOFF *offset, struct ft *ft);
-    void _realloc_on_disk_internal(BLOCKNUM b, DISKOFF size, DISKOFF *offset, struct ft *ft, bool for_checkpoint, uint64_t heat);
-    void _translate_blocknum_to_offset_size_unlocked(BLOCKNUM b, DISKOFF *offset, DISKOFF *size);
+    void _free_blocknum_unlocked(BLOCKNUM *bp,
+                                 struct ft *ft,
+                                 bool for_checkpoint);
+    void _realloc_descriptor_on_disk_unlocked(DISKOFF size,
+                                              DISKOFF *offset,
+                                              struct ft *ft);
+    void _realloc_on_disk_internal(BLOCKNUM b,
+                                   DISKOFF size,
+                                   DISKOFF *offset,
+                                   struct ft *ft,
+                                   bool for_checkpoint);
+    void _translate_blocknum_to_offset_size_unlocked(BLOCKNUM b,
+                                                     DISKOFF *offset,
+                                                     DISKOFF *size);
 
     // File management
     void _maybe_truncate_file(int fd, uint64_t size_needed_before);
-    void _ensure_safe_write_unlocked(int fd, DISKOFF block_size, DISKOFF block_offset);
+    void _ensure_safe_write_unlocked(int fd,
+                                     DISKOFF block_size,
+                                     DISKOFF block_offset);
 
     // Verification
     bool _is_valid_blocknum(struct translation *t, BLOCKNUM b);
@@ -220,29 +269,33 @@ private:
     bool _no_data_blocks_except_root(BLOCKNUM root);
     bool _blocknum_allocated(BLOCKNUM b);
 
-    // Locking 
+    // Locking
     //
     // TODO: Move the lock to the FT
     void _mutex_lock();
     void _mutex_unlock();
 
-    // The current translation is the one used by client threads. 
+    // The current translation is the one used by client threads.
     // It is not represented on disk.
     struct translation _current;
 
-    // The translation used by the checkpoint currently in progress. 
-    // If the checkpoint thread allocates a block, it must also update the current translation.
+    // The translation used by the checkpoint currently in progress.
+    // If the checkpoint thread allocates a block, it must also update the
+    // current translation.
     struct translation _inprogress;
 
-    // The translation for the data that shall remain inviolate on disk until the next checkpoint finishes,
+    // The translation for the data that shall remain inviolate on disk until
+    // the next checkpoint finishes,
     // after which any blocks used only in this translation can be freed.
     struct translation _checkpointed;
 
-    // The in-memory data structure for block allocation. 
+    // The in-memory data structure for block allocation.
     // There is no on-disk data structure for block allocation.
-    // Note: This is *allocation* not *translation* - the block allocator is unaware of which
-    //       blocks are used for which translation, but simply allocates and deallocates blocks.
-    block_allocator _bt_block_allocator;
+    // Note: This is *allocation* not *translation* - the block
+    //       allocator is unaware of which blocks are used for which
+    //       translation, but simply allocates and deallocates
+    //       blocks.
+    BlockAllocator *_bt_block_allocator;
     toku_mutex_t _mutex;
     struct nb_mutex _safe_file_size_lock;
     bool _checkpoint_skipped;
@@ -257,16 +310,16 @@ private:
 
 #include "ft/serialize/wbuf.h"
 
-static inline void wbuf_BLOCKNUM (struct wbuf *w, BLOCKNUM b) {
+static inline void wbuf_BLOCKNUM(struct wbuf *w, BLOCKNUM b) {
     wbuf_ulonglong(w, b.b);
 }
 
-static inline void wbuf_nocrc_BLOCKNUM (struct wbuf *w, BLOCKNUM b) {
+static inline void wbuf_nocrc_BLOCKNUM(struct wbuf *w, BLOCKNUM b) {
     wbuf_nocrc_ulonglong(w, b.b);
 }
 
 static inline void wbuf_DISKOFF(struct wbuf *wb, DISKOFF off) {
-    wbuf_ulonglong(wb, (uint64_t) off);
+    wbuf_ulonglong(wb, (uint64_t)off);
 }
 
 #include "ft/serialize/rbuf.h"
@@ -280,6 +333,8 @@ static inline BLOCKNUM rbuf_blocknum(struct rbuf *rb) {
     return result;
 }
 
-static inline void rbuf_ma_BLOCKNUM(struct rbuf *rb, memarena *UU(ma), BLOCKNUM *blocknum) {
+static inline void rbuf_ma_BLOCKNUM(struct rbuf *rb,
+                                    memarena *UU(ma),
+                                    BLOCKNUM *blocknum) {
     *blocknum = rbuf_blocknum(rb);
 }
diff --git a/storage/tokudb/PerconaFT/ft/serialize/compress.cc b/storage/tokudb/PerconaFT/ft/serialize/compress.cc
index 113a8763510..584faa5c3be 100644
--- a/storage/tokudb/PerconaFT/ft/serialize/compress.cc
+++ b/storage/tokudb/PerconaFT/ft/serialize/compress.cc
@@ -237,7 +237,7 @@ void toku_decompress (Bytef       *dest,   uLongf destLen,
         strm.zalloc = Z_NULL;
         strm.zfree = Z_NULL;
         strm.opaque = Z_NULL;
-        char windowBits = source[1];
+        int8_t windowBits = source[1];
         int r = inflateInit2(&strm, windowBits);
         lazy_assert(r == Z_OK);
         strm.next_out = dest;
diff --git a/storage/tokudb/PerconaFT/ft/serialize/ft-serialize.cc b/storage/tokudb/PerconaFT/ft/serialize/ft-serialize.cc
index 49d4368a3ab..8fcb5293412 100644
--- a/storage/tokudb/PerconaFT/ft/serialize/ft-serialize.cc
+++ b/storage/tokudb/PerconaFT/ft/serialize/ft-serialize.cc
@@ -217,8 +217,8 @@ int deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ftp, uint32_t version)
             // translation table itself won't fit in main memory.
             ssize_t readsz = toku_os_pread(fd, tbuf, size_to_read,
                                            translation_address_on_disk);
-            assert(readsz >= translation_size_on_disk);
-            assert(readsz <= (ssize_t)size_to_read);
+            invariant(readsz >= translation_size_on_disk);
+            invariant(readsz <= (ssize_t)size_to_read);
         }
         // Create table and read in data.
         r = ft->blocktable.create_from_buffer(fd,
@@ -411,73 +411,90 @@ exit:
     return r;
 }
 
-static size_t
-serialize_ft_min_size (uint32_t version) {
+static size_t serialize_ft_min_size(uint32_t version) {
     size_t size = 0;
 
-    switch(version) {
-    case FT_LAYOUT_VERSION_29:
-        size += sizeof(uint64_t); // logrows in ft
-    case FT_LAYOUT_VERSION_28:
-        size += sizeof(uint32_t); // fanout in ft
-    case FT_LAYOUT_VERSION_27:
-    case FT_LAYOUT_VERSION_26:
-    case FT_LAYOUT_VERSION_25:
-    case FT_LAYOUT_VERSION_24:
-    case FT_LAYOUT_VERSION_23:
-    case FT_LAYOUT_VERSION_22:
-    case FT_LAYOUT_VERSION_21:
-        size += sizeof(MSN);       // max_msn_in_ft
-    case FT_LAYOUT_VERSION_20:
-    case FT_LAYOUT_VERSION_19:
-        size += 1; // compression method
-        size += sizeof(MSN);       // highest_unused_msn_for_upgrade
-    case FT_LAYOUT_VERSION_18:
-        size += sizeof(uint64_t);  // time_of_last_optimize_begin
-        size += sizeof(uint64_t);  // time_of_last_optimize_end
-        size += sizeof(uint32_t);  // count_of_optimize_in_progress
-        size += sizeof(MSN);       // msn_at_start_of_last_completed_optimize
-        size -= 8;                 // removed num_blocks_to_upgrade_14
-        size -= 8;                 // removed num_blocks_to_upgrade_13
-    case FT_LAYOUT_VERSION_17:
-        size += 16;
-        invariant(sizeof(STAT64INFO_S) == 16);
-    case FT_LAYOUT_VERSION_16:
-    case FT_LAYOUT_VERSION_15:
-        size += 4;  // basement node size
-        size += 8;  // num_blocks_to_upgrade_14 (previously num_blocks_to_upgrade, now one int each for upgrade from 13, 14
-        size += 8;  // time of last verification
-    case FT_LAYOUT_VERSION_14:
-        size += 8;  //TXNID that created
-    case FT_LAYOUT_VERSION_13:
-        size += ( 4 // build_id
-                  +4 // build_id_original
-                  +8 // time_of_creation
-                  +8 // time_of_last_modification
-            );
+    switch (version) {
+        case FT_LAYOUT_VERSION_29:
+            size += sizeof(uint64_t);  // logrows in ft
+        case FT_LAYOUT_VERSION_28:
+            size += sizeof(uint32_t);  // fanout in ft
+        case FT_LAYOUT_VERSION_27:
+        case FT_LAYOUT_VERSION_26:
+        case FT_LAYOUT_VERSION_25:
+        case FT_LAYOUT_VERSION_24:
+        case FT_LAYOUT_VERSION_23:
+        case FT_LAYOUT_VERSION_22:
+        case FT_LAYOUT_VERSION_21:
+            size += sizeof(MSN);  // max_msn_in_ft
+        case FT_LAYOUT_VERSION_20:
+        case FT_LAYOUT_VERSION_19:
+            size += 1;            // compression method
+            size += sizeof(MSN);  // highest_unused_msn_for_upgrade
+        case FT_LAYOUT_VERSION_18:
+            size += sizeof(uint64_t);  // time_of_last_optimize_begin
+            size += sizeof(uint64_t);  // time_of_last_optimize_end
+            size += sizeof(uint32_t);  // count_of_optimize_in_progress
+            size += sizeof(MSN);  // msn_at_start_of_last_completed_optimize
+            size -= 8;            // removed num_blocks_to_upgrade_14
+            size -= 8;            // removed num_blocks_to_upgrade_13
+        case FT_LAYOUT_VERSION_17:
+            size += 16;
+            invariant(sizeof(STAT64INFO_S) == 16);
+        case FT_LAYOUT_VERSION_16:
+        case FT_LAYOUT_VERSION_15:
+            size += 4;  // basement node size
+            size += 8;  // num_blocks_to_upgrade_14 (previously
+                        // num_blocks_to_upgrade, now one int each for upgrade
+                        // from 13, 14)
+            size += 8;  // time of last verification
+        case FT_LAYOUT_VERSION_14:
+            size += 8;  // TXNID that created
+        case FT_LAYOUT_VERSION_13:
+            size += (4  // build_id
+                     +
+                     4  // build_id_original
+                     +
+                     8  // time_of_creation
+                     +
+                     8  // time_of_last_modification
+                     );
         // fall through
-    case FT_LAYOUT_VERSION_12:
-        size += (+8 // "tokudata"
-                 +4 // version
-                 +4 // original_version
-                 +4 // size
-                 +8 // byte order verification
-                 +8 // checkpoint_count
-                 +8 // checkpoint_lsn
-                 +4 // tree's nodesize
-                 +8 // translation_size_on_disk
-                 +8 // translation_address_on_disk
-                 +4 // checksum
-                 +8 // Number of blocks in old version.
-                 +8 // diskoff
-                 +4 // flags
-            );
-        break;
-    default:
-        abort();
-    }
-
-    lazy_assert(size <= block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE);
+        case FT_LAYOUT_VERSION_12:
+            size += (+8  // "tokudata"
+                     +
+                     4  // version
+                     +
+                     4  // original_version
+                     +
+                     4  // size
+                     +
+                     8  // byte order verification
+                     +
+                     8  // checkpoint_count
+                     +
+                     8  // checkpoint_lsn
+                     +
+                     4  // tree's nodesize
+                     +
+                     8  // translation_size_on_disk
+                     +
+                     8  // translation_address_on_disk
+                     +
+                     4  // checksum
+                     +
+                     8  // Number of blocks in old version.
+                     +
+                     8  // diskoff
+                     +
+                     4  // flags
+                     );
+            break;
+        default:
+            abort();
+    }
+
+    lazy_assert(size <= BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE);
     return size;
 }
 
@@ -486,7 +503,7 @@ int deserialize_ft_from_fd_into_rbuf(int fd,
                                      struct rbuf *rb,
                                      uint64_t *checkpoint_count,
                                      LSN *checkpoint_lsn,
-                                     uint32_t * version_p)
+                                     uint32_t *version_p)
 // Effect: Read and parse the header of a fractalal tree
 //
 //  Simply reading the raw bytes of the header into an rbuf is insensitive
@@ -496,18 +513,18 @@ int deserialize_ft_from_fd_into_rbuf(int fd,
 //  file AND the header is useless
 {
     int r = 0;
-    const int64_t prefix_size = 8 + // magic ("tokudata")
-                                4 + // version
-                                4 + // build_id
-                                4;  // size
+    const int64_t prefix_size = 8 +  // magic ("tokudata")
+                                4 +  // version
+                                4 +  // build_id
+                                4;   // size
     const int64_t read_size = roundup_to_multiple(512, prefix_size);
     unsigned char *XMALLOC_N_ALIGNED(512, read_size, prefix);
     rb->buf = NULL;
     int64_t n = toku_os_pread(fd, prefix, read_size, offset_of_header);
     if (n != read_size) {
-        if (n==0) {
+        if (n == 0) {
             r = TOKUDB_DICTIONARY_NO_HEADER;
-        } else if (n<0) {
+        } else if (n < 0) {
             r = get_error_errno();
         } else {
             r = EINVAL;
@@ -518,95 +535,102 @@ int deserialize_ft_from_fd_into_rbuf(int fd,
 
     rbuf_init(rb, prefix, prefix_size);
 
-    //Check magic number
+    // Check magic number
     const void *magic;
     rbuf_literal_bytes(rb, &magic, 8);
-    if (memcmp(magic,"tokudata",8)!=0) {
-        if ((*(uint64_t*)magic) == 0) {
+    if (memcmp(magic, "tokudata", 8) != 0) {
+        if ((*(uint64_t *)magic) == 0) {
             r = TOKUDB_DICTIONARY_NO_HEADER;
         } else {
-            r = EINVAL; //Not a tokudb file! Do not use.
+            r = EINVAL;  // Not a tokudb file! Do not use.
         }
         goto exit;
     }
 
-    //Version MUST be in network order regardless of disk order.
+    // Version MUST be in network order regardless of disk order.
     uint32_t version;
     version = rbuf_network_int(rb);
     *version_p = version;
     if (version < FT_LAYOUT_MIN_SUPPORTED_VERSION) {
-        r = TOKUDB_DICTIONARY_TOO_OLD; //Cannot use
+        r = TOKUDB_DICTIONARY_TOO_OLD;  // Cannot use
         goto exit;
     } else if (version > FT_LAYOUT_VERSION) {
-        r = TOKUDB_DICTIONARY_TOO_NEW; //Cannot use
+        r = TOKUDB_DICTIONARY_TOO_NEW;  // Cannot use
         goto exit;
     }
 
-    //build_id MUST be in network order regardless of disk order.
+    // build_id MUST be in network order regardless of disk order.
     uint32_t build_id __attribute__((__unused__));
     build_id = rbuf_network_int(rb);
     int64_t min_header_size;
     min_header_size = serialize_ft_min_size(version);
 
-    //Size MUST be in network order regardless of disk order.
+    // Size MUST be in network order regardless of disk order.
     uint32_t size;
     size = rbuf_network_int(rb);
-    //If too big, it is corrupt.  We would probably notice during checksum
-    //but may have to do a multi-gigabyte malloc+read to find out.
-    //If its too small reading rbuf would crash, so verify.
-    if (size > block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE || size < min_header_size) {
+    // If too big, it is corrupt.  We would probably notice during checksum
+    // but may have to do a multi-gigabyte malloc+read to find out.
+    // If it's too small, reading the rbuf would crash, so verify.
+    if (size > BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE ||
+        size < min_header_size) {
         r = TOKUDB_DICTIONARY_NO_HEADER;
         goto exit;
     }
 
-    lazy_assert(rb->ndone==prefix_size);
+    lazy_assert(rb->ndone == prefix_size);
     rb->size = size;
     {
         toku_free(rb->buf);
         uint32_t size_to_read = roundup_to_multiple(512, size);
         XMALLOC_N_ALIGNED(512, size_to_read, rb->buf);
 
-        assert(offset_of_header%512==0);
+        invariant(offset_of_header % 512 == 0);
         n = toku_os_pread(fd, rb->buf, size_to_read, offset_of_header);
         if (n != size_to_read) {
             if (n < 0) {
                 r = get_error_errno();
             } else {
-                r = EINVAL; //Header might be useless (wrong size) or could be a disk read error.
+                r = EINVAL;  // Header might be useless (wrong size) or could be
+                             // a disk read error.
             }
             goto exit;
         }
     }
-    //It's version 14 or later.  Magic looks OK.
-    //We have an rbuf that represents the header.
-    //Size is within acceptable bounds.
+    // It's version 14 or later.  Magic looks OK.
+    // We have an rbuf that represents the header.
+    // Size is within acceptable bounds.
 
-    //Verify checksum (FT_LAYOUT_VERSION_13 or later, when checksum function changed)
+    // Verify checksum (FT_LAYOUT_VERSION_13 or later, when checksum function
+    // changed)
     uint32_t calculated_x1764;
-    calculated_x1764 = toku_x1764_memory(rb->buf, rb->size-4);
+    calculated_x1764 = toku_x1764_memory(rb->buf, rb->size - 4);
     uint32_t stored_x1764;
-    stored_x1764 = toku_dtoh32(*(int*)(rb->buf+rb->size-4));
+    stored_x1764 = toku_dtoh32(*(int *)(rb->buf + rb->size - 4));
     if (calculated_x1764 != stored_x1764) {
-        r = TOKUDB_BAD_CHECKSUM; //Header useless
-        fprintf(stderr, "Header checksum failure: calc=0x%08x read=0x%08x\n", calculated_x1764, stored_x1764);
+        r = TOKUDB_BAD_CHECKSUM;  // Header useless
+        fprintf(stderr,
+                "Header checksum failure: calc=0x%08x read=0x%08x\n",
+                calculated_x1764,
+                stored_x1764);
         goto exit;
     }
 
-    //Verify byte order
+    // Verify byte order
     const void *tmp_byte_order_check;
     lazy_assert((sizeof toku_byte_order_host) == 8);
-    rbuf_literal_bytes(rb, &tmp_byte_order_check, 8); //Must not translate byte order
+    rbuf_literal_bytes(
+        rb, &tmp_byte_order_check, 8);  // Must not translate byte order
     int64_t byte_order_stored;
-    byte_order_stored = *(int64_t*)tmp_byte_order_check;
+    byte_order_stored = *(int64_t *)tmp_byte_order_check;
     if (byte_order_stored != toku_byte_order_host) {
-        r = TOKUDB_DICTIONARY_NO_HEADER; //Cannot use dictionary
+        r = TOKUDB_DICTIONARY_NO_HEADER;  // Cannot use dictionary
         goto exit;
     }
 
-    //Load checkpoint count
+    // Load checkpoint count
     *checkpoint_count = rbuf_ulonglong(rb);
     *checkpoint_lsn = rbuf_LSN(rb);
-    //Restart at beginning during regular deserialization
+    // Restart at beginning during regular deserialization
     rb->ndone = 0;
 
 exit:
@@ -620,11 +644,7 @@ exit:
 // Read ft from file into struct.  Read both headers and use one.
 // We want the latest acceptable header whose checkpoint_lsn is no later
 // than max_acceptable_lsn.
-int
-toku_deserialize_ft_from(int fd,
-                         LSN max_acceptable_lsn,
-                         FT *ft)
-{
+int toku_deserialize_ft_from(int fd, LSN max_acceptable_lsn, FT *ft) {
     struct rbuf rb_0;
     struct rbuf rb_1;
     uint64_t checkpoint_count_0 = 0;
@@ -638,13 +658,23 @@ toku_deserialize_ft_from(int fd,
     int r0, r1, r;
 
     toku_off_t header_0_off = 0;
-    r0 = deserialize_ft_from_fd_into_rbuf(fd, header_0_off, &rb_0, &checkpoint_count_0, &checkpoint_lsn_0, &version_0);
+    r0 = deserialize_ft_from_fd_into_rbuf(fd,
+                                          header_0_off,
+                                          &rb_0,
+                                          &checkpoint_count_0,
+                                          &checkpoint_lsn_0,
+                                          &version_0);
     if (r0 == 0 && checkpoint_lsn_0.lsn <= max_acceptable_lsn.lsn) {
         h0_acceptable = true;
     }
 
-    toku_off_t header_1_off = block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE;
-    r1 = deserialize_ft_from_fd_into_rbuf(fd, header_1_off, &rb_1, &checkpoint_count_1, &checkpoint_lsn_1, &version_1);
+    toku_off_t header_1_off = BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE;
+    r1 = deserialize_ft_from_fd_into_rbuf(fd,
+                                          header_1_off,
+                                          &rb_1,
+                                          &checkpoint_count_1,
+                                          &checkpoint_lsn_1,
+                                          &version_1);
     if (r1 == 0 && checkpoint_lsn_1.lsn <= max_acceptable_lsn.lsn) {
         h1_acceptable = true;
     }
@@ -655,24 +685,29 @@ toku_deserialize_ft_from(int fd,
         // We were unable to read either header or at least one is too
         // new.  Certain errors are higher priority than others. Order of
         // these if/else if is important.
-        if (r0 == TOKUDB_DICTIONARY_TOO_NEW || r1 == TOKUDB_DICTIONARY_TOO_NEW) {
+        if (r0 == TOKUDB_DICTIONARY_TOO_NEW ||
+            r1 == TOKUDB_DICTIONARY_TOO_NEW) {
             r = TOKUDB_DICTIONARY_TOO_NEW;
-        } else if (r0 == TOKUDB_DICTIONARY_TOO_OLD || r1 == TOKUDB_DICTIONARY_TOO_OLD) {
+        } else if (r0 == TOKUDB_DICTIONARY_TOO_OLD ||
+                   r1 == TOKUDB_DICTIONARY_TOO_OLD) {
             r = TOKUDB_DICTIONARY_TOO_OLD;
         } else if (r0 == TOKUDB_BAD_CHECKSUM && r1 == TOKUDB_BAD_CHECKSUM) {
             fprintf(stderr, "Both header checksums failed.\n");
             r = TOKUDB_BAD_CHECKSUM;
-        } else if (r0 == TOKUDB_DICTIONARY_NO_HEADER || r1 == TOKUDB_DICTIONARY_NO_HEADER) {
+        } else if (r0 == TOKUDB_DICTIONARY_NO_HEADER ||
+                   r1 == TOKUDB_DICTIONARY_NO_HEADER) {
             r = TOKUDB_DICTIONARY_NO_HEADER;
         } else {
-            r = r0 ? r0 : r1; //Arbitrarily report the error from the
-                              //first header, unless it's readable
+            r = r0 ? r0 : r1;  // Arbitrarily report the error from the
+                               // first header, unless it's readable
         }
 
-        // it should not be possible for both headers to be later than the max_acceptable_lsn
-        invariant(!((r0==0 && checkpoint_lsn_0.lsn > max_acceptable_lsn.lsn) &&
-                    (r1==0 && checkpoint_lsn_1.lsn > max_acceptable_lsn.lsn)));
-        invariant(r!=0);
+        // it should not be possible for both headers to be later than the
+        // max_acceptable_lsn
+        invariant(
+            !((r0 == 0 && checkpoint_lsn_0.lsn > max_acceptable_lsn.lsn) &&
+              (r1 == 0 && checkpoint_lsn_1.lsn > max_acceptable_lsn.lsn)));
+        invariant(r != 0);
         goto exit;
     }
 
@@ -682,8 +717,7 @@ toku_deserialize_ft_from(int fd,
             invariant(version_0 >= version_1);
             rb = &rb_0;
             version = version_0;
-        }
-        else {
+        } else {
             invariant(checkpoint_count_1 == checkpoint_count_0 + 1);
             invariant(version_1 >= version_0);
             rb = &rb_1;
@@ -692,14 +726,18 @@ toku_deserialize_ft_from(int fd,
     } else if (h0_acceptable) {
         if (r1 == TOKUDB_BAD_CHECKSUM) {
             // print something reassuring
-            fprintf(stderr, "Header 2 checksum failed, but header 1 ok.  Proceeding.\n");
+            fprintf(
+                stderr,
+                "Header 2 checksum failed, but header 1 ok.  Proceeding.\n");
         }
         rb = &rb_0;
         version = version_0;
     } else if (h1_acceptable) {
         if (r0 == TOKUDB_BAD_CHECKSUM) {
             // print something reassuring
-            fprintf(stderr, "Header 1 checksum failed, but header 2 ok.  Proceeding.\n");
+            fprintf(
+                stderr,
+                "Header 1 checksum failed, but header 2 ok.  Proceeding.\n");
         }
         rb = &rb_1;
         version = version_1;
@@ -718,15 +756,13 @@ exit:
     return r;
 }
 
-
-size_t toku_serialize_ft_size (FT_HEADER h) {
+size_t toku_serialize_ft_size(FT_HEADER h) {
     size_t size = serialize_ft_min_size(h->layout_version);
-    //There is no dynamic data.
-    lazy_assert(size <= block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE);
+    // There is no dynamic data.
+    lazy_assert(size <= BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE);
     return size;
 }
 
-
 void toku_serialize_ft_to_wbuf (
     struct wbuf *wbuf, 
     FT_HEADER h, 
@@ -771,52 +807,60 @@ void toku_serialize_ft_to_wbuf (
 }
 
 void toku_serialize_ft_to(int fd, FT_HEADER h, block_table *bt, CACHEFILE cf) {
-    lazy_assert(h->type==FT_CHECKPOINT_INPROGRESS);
+    lazy_assert(h->type == FT_CHECKPOINT_INPROGRESS);
     struct wbuf w_translation;
     int64_t size_translation;
     int64_t address_translation;
 
     // Must serialize translation first, to get address,size for header.
-    bt->serialize_translation_to_wbuf(fd, &w_translation,
-                                      &address_translation,
-                                      &size_translation);
-    assert(size_translation == w_translation.ndone);
+    bt->serialize_translation_to_wbuf(
+        fd, &w_translation, &address_translation, &size_translation);
+    invariant(size_translation == w_translation.ndone);
 
-    // the number of bytes available in the buffer is 0 mod 512, and those last bytes are all initialized.
-    assert(w_translation.size % 512 == 0);
+    // the number of bytes available in the buffer is 0 mod 512, and those last
+    // bytes are all initialized.
+    invariant(w_translation.size % 512 == 0);
 
     struct wbuf w_main;
-    size_t size_main       = toku_serialize_ft_size(h);
+    size_t size_main = toku_serialize_ft_size(h);
     size_t size_main_aligned = roundup_to_multiple(512, size_main);
-    assert(size_main_aligned<block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE);
+    invariant(size_main_aligned <
+              BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE);
     char *XMALLOC_N_ALIGNED(512, size_main_aligned, mainbuf);
-    for (size_t i=size_main; i<size_main_aligned; i++) mainbuf[i]=0; // initialize the end of the buffer with zeros
+    for (size_t i = size_main; i < size_main_aligned; i++)
+        mainbuf[i] = 0;  // initialize the end of the buffer with zeros
     wbuf_init(&w_main, mainbuf, size_main);
-    toku_serialize_ft_to_wbuf(&w_main, h, address_translation, size_translation);
+    toku_serialize_ft_to_wbuf(
+        &w_main, h, address_translation, size_translation);
     lazy_assert(w_main.ndone == size_main);
 
     // Actually write translation table
-    // This write is guaranteed to read good data at the end of the buffer, since the
+    // This write is guaranteed to read good data at the end of the buffer,
+    // since the
     // w_translation.buf is padded with zeros to a 512-byte boundary.
-    toku_os_full_pwrite(fd, w_translation.buf, roundup_to_multiple(512, size_translation), address_translation);
-
-    //Everything but the header MUST be on disk before header starts.
-    //Otherwise we will think the header is good and some blocks might not
-    //yet be on disk.
-    //If the header has a cachefile we need to do cachefile fsync (to
-    //prevent crash if we redirected to dev null)
-    //If there is no cachefile we still need to do an fsync.
+    toku_os_full_pwrite(fd,
+                        w_translation.buf,
+                        roundup_to_multiple(512, size_translation),
+                        address_translation);
+
+    // Everything but the header MUST be on disk before header starts.
+    // Otherwise we will think the header is good and some blocks might not
+    // yet be on disk.
+    // If the header has a cachefile we need to do cachefile fsync (to
+    // prevent crash if we redirected to dev null)
+    // If there is no cachefile we still need to do an fsync.
     if (cf) {
         toku_cachefile_fsync(cf);
-    }
-    else {
+    } else {
         toku_file_fsync(fd);
     }
 
-    //Alternate writing header to two locations:
+    // Alternate writing header to two locations:
     //   Beginning (0) or BLOCK_ALLOCATOR_HEADER_RESERVE
     toku_off_t main_offset;
-    main_offset = (h->checkpoint_count & 0x1) ? 0 : block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE;
+    main_offset = (h->checkpoint_count & 0x1)
+                      ? 0
+                      : BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE;
     toku_os_full_pwrite(fd, w_main.buf, size_main_aligned, main_offset);
     toku_free(w_main.buf);
     toku_free(w_translation.buf);
diff --git a/storage/tokudb/PerconaFT/ft/serialize/ft_node-serialize.cc b/storage/tokudb/PerconaFT/ft/serialize/ft_node-serialize.cc
index c4f4886b6a0..5914f8a1050 100644
--- a/storage/tokudb/PerconaFT/ft/serialize/ft_node-serialize.cc
+++ b/storage/tokudb/PerconaFT/ft/serialize/ft_node-serialize.cc
@@ -99,13 +99,11 @@ void toku_ft_serialize_layer_init(void) {
     num_cores = toku_os_get_number_active_processors();
     int r = toku_thread_pool_create(&ft_pool, num_cores);
     lazy_assert_zero(r);
-    block_allocator::maybe_initialize_trace();
     toku_serialize_in_parallel = false;
 }
 
 void toku_ft_serialize_layer_destroy(void) {
     toku_thread_pool_destroy(&ft_pool);
-    block_allocator::maybe_close_trace();
 }
 
 enum { FILE_CHANGE_INCREMENT = (16 << 20) };
@@ -773,19 +771,23 @@ int toku_serialize_ftnode_to_memory(FTNODE node,
     return 0;
 }
 
-int
-toku_serialize_ftnode_to (int fd, BLOCKNUM blocknum, FTNODE node, FTNODE_DISK_DATA* ndd, bool do_rebalancing, FT ft, bool for_checkpoint) {
-
+int toku_serialize_ftnode_to(int fd,
+                             BLOCKNUM blocknum,
+                             FTNODE node,
+                             FTNODE_DISK_DATA *ndd,
+                             bool do_rebalancing,
+                             FT ft,
+                             bool for_checkpoint) {
     size_t n_to_write;
     size_t n_uncompressed_bytes;
     char *compressed_buf = nullptr;
 
-    // because toku_serialize_ftnode_to is only called for 
+    // because toku_serialize_ftnode_to is only called for
     // in toku_ftnode_flush_callback, we pass false
     // for in_parallel. The reasoning is that when we write
-    // nodes to disk via toku_ftnode_flush_callback, we 
+    // nodes to disk via toku_ftnode_flush_callback, we
     // assume that it is being done on a non-critical
-    // background thread (probably for checkpointing), and therefore 
+    // background thread (probably for checkpointing), and therefore
     // should not hog CPU,
     //
     // Should the above facts change, we may want to revisit
@@ -802,32 +804,32 @@ toku_serialize_ftnode_to (int fd, BLOCKNUM blocknum, FTNODE node, FTNODE_DISK_DA
         toku_unsafe_fetch(&toku_serialize_in_parallel),
         &n_to_write,
         &n_uncompressed_bytes,
-        &compressed_buf
-        );
+        &compressed_buf);
     if (r != 0) {
         return r;
     }
 
-    // If the node has never been written, then write the whole buffer, including the zeros
-    invariant(blocknum.b>=0);
+    // If the node has never been written, then write the whole buffer,
+    // including the zeros
+    invariant(blocknum.b >= 0);
     DISKOFF offset;
 
     // Dirties the ft
-    ft->blocktable.realloc_on_disk(blocknum, n_to_write, &offset,
-                                   ft, fd, for_checkpoint,
-                                   // Allocations for nodes high in the tree are considered 'hot',
-                                   // as they are likely to move again in the next checkpoint.
-                                   node->height);
+    ft->blocktable.realloc_on_disk(
+        blocknum, n_to_write, &offset, ft, fd, for_checkpoint);
 
     tokutime_t t0 = toku_time_now();
     toku_os_full_pwrite(fd, compressed_buf, n_to_write, offset);
     tokutime_t t1 = toku_time_now();
 
     tokutime_t io_time = t1 - t0;
-    toku_ft_status_update_flush_reason(node, n_uncompressed_bytes, n_to_write, io_time, for_checkpoint);
+    toku_ft_status_update_flush_reason(
+        node, n_uncompressed_bytes, n_to_write, io_time, for_checkpoint);
 
     toku_free(compressed_buf);
-    node->dirty = 0;  // See #1957.   Must set the node to be clean after serializing it so that it doesn't get written again on the next checkpoint or eviction.
+    node->dirty = 0;  // See #1957.   Must set the node to be clean after
+                      // serializing it so that it doesn't get written again on
+                      // the next checkpoint or eviction.
     return 0;
 }
 
@@ -994,6 +996,7 @@ BASEMENTNODE toku_clone_bn(BASEMENTNODE orig_bn) {
     bn->seqinsert = orig_bn->seqinsert;
     bn->stale_ancestor_messages_applied = orig_bn->stale_ancestor_messages_applied;
     bn->stat64_delta = orig_bn->stat64_delta;
+    bn->logical_rows_delta = orig_bn->logical_rows_delta;
     bn->data_buffer.clone(&orig_bn->data_buffer);
     return bn;
 }
@@ -1004,6 +1007,7 @@ BASEMENTNODE toku_create_empty_bn_no_buffer(void) {
     bn->seqinsert = 0;
     bn->stale_ancestor_messages_applied = false;
     bn->stat64_delta = ZEROSTATS;
+    bn->logical_rows_delta = 0;
     bn->data_buffer.init_zero();
     return bn;
 }
@@ -1897,7 +1901,7 @@ read_and_decompress_block_from_fd_into_rbuf(int fd, BLOCKNUM blocknum,
                                             /* out */ int *layout_version_p);
 
 // This function upgrades a version 14 or 13 ftnode to the current
-// verison. NOTE: This code assumes the first field of the rbuf has
+// version. NOTE: This code assumes the first field of the rbuf has
 // already been read from the buffer (namely the layout_version of the
 // ftnode.)
 static int
@@ -2488,9 +2492,12 @@ toku_serialize_rollback_log_to_memory_uncompressed(ROLLBACK_LOG_NODE log, SERIAL
     serialized->blocknum = log->blocknum;
 }
 
-int
-toku_serialize_rollback_log_to (int fd, ROLLBACK_LOG_NODE log, SERIALIZED_ROLLBACK_LOG_NODE serialized_log, bool is_serialized,
-                                FT ft, bool for_checkpoint) {
+int toku_serialize_rollback_log_to(int fd,
+                                   ROLLBACK_LOG_NODE log,
+                                   SERIALIZED_ROLLBACK_LOG_NODE serialized_log,
+                                   bool is_serialized,
+                                   FT ft,
+                                   bool for_checkpoint) {
     size_t n_to_write;
     char *compressed_buf;
     struct serialized_rollback_log_node serialized_local;
@@ -2511,21 +2518,21 @@ toku_serialize_rollback_log_to (int fd, ROLLBACK_LOG_NODE log, SERIALIZED_ROLLBA
                                            serialized_log->n_sub_blocks,
                                            serialized_log->sub_block,
                                            ft->h->compression_method,
-                                           &n_to_write, &compressed_buf);
+                                           &n_to_write,
+                                           &compressed_buf);
 
     // Dirties the ft
     DISKOFF offset;
-    ft->blocktable.realloc_on_disk(blocknum, n_to_write, &offset,
-                                   ft, fd, for_checkpoint,
-                                   // We consider rollback log flushing the hottest possible allocation,
-                                   // since rollback logs are short-lived compared to FT nodes.
-                                   INT_MAX);
+    ft->blocktable.realloc_on_disk(
+        blocknum, n_to_write, &offset, ft, fd, for_checkpoint);
 
     toku_os_full_pwrite(fd, compressed_buf, n_to_write, offset);
     toku_free(compressed_buf);
     if (!is_serialized) {
         toku_static_serialized_rollback_log_destroy(&serialized_local);
-        log->dirty = 0;  // See #1957.   Must set the node to be clean after serializing it so that it doesn't get written again on the next checkpoint or eviction.
+        log->dirty = 0;  // See #1957.   Must set the node to be clean after
+                         // serializing it so that it doesn't get written again
+                         // on the next checkpoint or eviction.
     }
     return 0;
 }
@@ -2704,7 +2711,7 @@ exit:
 }
 
 static int decompress_from_raw_block_into_rbuf_versioned(uint32_t version, uint8_t *raw_block, size_t raw_block_size, struct rbuf *rb, BLOCKNUM blocknum) {
-    // This function exists solely to accomodate future changes in compression.
+    // This function exists solely to accommodate future changes in compression.
     int r = 0;
     if ((version == FT_LAYOUT_VERSION_13 || version == FT_LAYOUT_VERSION_14) ||
         (FT_LAYOUT_VERSION_25 <= version && version <= FT_LAYOUT_VERSION_27) ||
diff --git a/storage/tokudb/PerconaFT/ft/serialize/rbtree_mhs.cc b/storage/tokudb/PerconaFT/ft/serialize/rbtree_mhs.cc
new file mode 100644
index 00000000000..922850fb3e0
--- /dev/null
+++ b/storage/tokudb/PerconaFT/ft/serialize/rbtree_mhs.cc
@@ -0,0 +1,833 @@
+/*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "ft/serialize/rbtree_mhs.h"
+#include "portability/toku_assert.h"
+#include "portability/toku_portability.h"
+#include <algorithm>
+
+namespace MhsRbTree {
+
+    Tree::Tree() : _root(nullptr), _align(1) {}  // empty tree, alignment 1
+
+    Tree::Tree(uint64_t align) : _root(nullptr), _align(align) {}  // empty tree
+
+    Tree::~Tree() { Destroy(); }  // all teardown is delegated to Destroy()
+
+    // Debug dump to stderr: node offset first, then left and right subtrees.
+    void Tree::PreOrder(Node *tree) const {
+        if (!tree) return;
+        fprintf(stderr, "%" PRIu64 " ", rbn_offset(tree).ToInt());
+        PreOrder(tree->_left);
+        PreOrder(tree->_right);
+    }
+
+    void Tree::PreOrder() { PreOrder(_root); }  // pre-order dump from the root
+
+    // Debug dump to stderr: left subtree, then node offset, then right subtree.
+    void Tree::InOrder(Node *tree) const {
+        if (!tree) return;
+        InOrder(tree->_left);
+        fprintf(stderr, "%" PRIu64 " ", rbn_offset(tree).ToInt());
+        InOrder(tree->_right);
+    }
+
+    // yeah, i only care about in order visitor. -Jun
+    void Tree::InOrderVisitor(Node *tree,
+                              void (*f)(void *, Node *, uint64_t),
+                              void *extra,
+                              uint64_t depth) {
+        if (!tree) return;  // empty subtree: nothing to visit
+        const uint64_t child_depth = depth + 1;
+        InOrderVisitor(tree->_left, f, extra, child_depth);
+        f(extra, tree, depth);  // callback runs between the two subtrees
+        InOrderVisitor(tree->_right, f, extra, child_depth);
+    }
+
+    void Tree::InOrderVisitor(void (*f)(void *, Node *, uint64_t),
+                              void *extra) {
+        InOrderVisitor(_root, f, extra, 0);  // the root is reported at depth 0
+    }
+
+    void Tree::InOrder() { InOrder(_root); }  // in-order dump from the root
+
+    // Debug dump to stderr: both subtrees first, then the node's own offset.
+    void Tree::PostOrder(Node *tree) const {
+        if (!tree) return;
+        PostOrder(tree->_left);
+        PostOrder(tree->_right);
+        fprintf(stderr, "%" PRIu64 " ", rbn_offset(tree).ToInt());
+    }
+
+    void Tree::PostOrder() { PostOrder(_root); }  // post-order dump from the root
+
+    // Binary search by exact offset; yields NULL when no node starts there.
+    Node *Tree::SearchByOffset(uint64_t offset) {
+        Node *cur = _root;
+        while (cur != NULL) {
+            const auto cur_offset = rbn_offset(cur).ToInt();
+            if (cur_offset == offset)
+                break;
+            cur = (offset < cur_offset) ? cur->_left : cur->_right;
+        }
+        return cur;
+    }
+
+    // mostly for testing
+    Node *Tree::SearchFirstFitBySize(uint64_t size) {
+        // Something fits iff the root's effective size, or the max-hole-size
+        // label of either of its subtrees, reaches the requested size.
+        const bool fits = EffectiveSize(_root) >= size ||
+                          rbn_left_mhs(_root) >= size ||
+                          rbn_right_mhs(_root) >= size;
+        return fits ? SearchFirstFitBySizeHelper(_root, size) : nullptr;
+    }
+
+    Node *Tree::SearchFirstFitBySizeHelper(Node *x, uint64_t size) {
+        // The tree is ordered by offset, so candidates are tried in the
+        // order: left subtree, x itself, right subtree.
+        const bool self_fits = EffectiveSize(x) >= size;
+        if (rbn_left_mhs(x) >= size) {
+            // a hole further left fits, whether or not x itself does
+            return SearchFirstFitBySizeHelper(x->_left, size);
+        }
+        if (self_fits) {
+            return x;
+        }
+        if (rbn_right_mhs(x) >= size) {
+            return SearchFirstFitBySizeHelper(x->_right, size);
+        }
+        // this is an invalid state
+        Dump();
+        ValidateBalance();
+        ValidateMhs();
+        invariant(0);
+        return NULL;
+    }
+
+    // Leftmost node of the given subtree, or NULL when the subtree is empty.
+    Node *Tree::MinNode(Node *tree) {
+        Node *leftmost = tree;
+        while (leftmost != NULL && leftmost->_left != NULL) {
+            leftmost = leftmost->_left;
+        }
+        return leftmost;
+    }
+
+    Node *Tree::MinNode() { return MinNode(_root); }  // leftmost node of the tree
+
+    // Rightmost node of the given subtree, or NULL when the subtree is empty.
+    Node *Tree::MaxNode(Node *tree) {
+        Node *rightmost = tree;
+        while (rightmost != NULL && rightmost->_right != NULL) {
+            rightmost = rightmost->_right;
+        }
+        return rightmost;
+    }
+
+    Node *Tree::MaxNode() { return MaxNode(_root); }  // rightmost node of the tree
+
+    // Climb from x (y is its parent) for as long as x is a right child.
+    Node *Tree::SuccessorHelper(Node *y, Node *x) {
+        for (; y != NULL && x == y->_right; y = y->_parent) {
+            x = y;
+        }
+        return y;
+    }
+    // In-order successor of x; NULL when x is the last node.
+    Node *Tree::Successor(Node *x) {
+        // with a right subtree the answer is its leftmost node; otherwise
+        // it is the ancestor SuccessorHelper stops at
+        return (x->_right != NULL) ? MinNode(x->_right)
+                                   : SuccessorHelper(x->_parent, x);
+    }
+
+    // Climb from x (y is its parent) for as long as x is a left child;
+    // returns the ancestor where the climb stops, or NULL past the root.
+    Node *Tree::PredecessorHelper(Node *y, Node *x) {
+        for (; y != NULL && x == y->_left; y = y->_parent) {
+            x = y;
+        }
+        return y;
+    }
+    // In-order predecessor of x; NULL when x is the first node.
+    Node *Tree::Predecessor(Node *x) {
+        if (x->_left != NULL)
+            return MaxNode(x->_left);
+        // no left subtree: climb while x is a left child (not a right one)
+        return PredecessorHelper(x->_parent, x);
+    }
+
+    /*
+    *      px                              px
+    *     /                               /
+    *    x                               y
+    *   /  \      --(left rotation)-->  / \               #
+    *  lx   y                          x  ry
+    *     /   \                       /  \
+    *    ly   ry                      lx  ly
+    *  max_hole_size updates are pretty local
+    */
+
+    void Tree::LeftRotate(Node *&root, Node *x) {  // y = x->_right rises
+        Node *y = x->_right;
+
+        x->_right = y->_left;  // ly becomes x's right subtree...
+        rbn_right_mhs(x) = rbn_left_mhs(y);  // ...and its mhs label moves too
+
+        if (y->_left != NULL)
+            y->_left->_parent = x;
+
+        y->_parent = x->_parent;
+
+        if (x->_parent == NULL) {
+            root = y;  // x had no parent, so y is the new root
+        } else {
+            if (x->_parent->_left == x) {
+                x->_parent->_left = y;
+            } else {
+                x->_parent->_right = y;
+            }
+        }
+        y->_left = x;
+        rbn_left_mhs(y) = mhs_of_subtree(x);  // x's own labels are final here
+
+        x->_parent = y;
+    }
+
+    /*            py                               py
+     *           /                                /
+     *          y                                x
+     *         /  \      --(right rotate)-->    /  \                     #
+     *        x   ry                           lx   y
+     *       / \                                   / \                   #
+     *      lx  rx                                rx  ry
+     *
+     */
+
+    void Tree::RightRotate(Node *&root, Node *y) {  // x = y->_left rises
+        Node *x = y->_left;
+
+        y->_left = x->_right;  // rx becomes y's left subtree...
+        rbn_left_mhs(y) = rbn_right_mhs(x);  // ...and its mhs label moves too
+
+        if (x->_right != NULL)
+            x->_right->_parent = y;
+
+        x->_parent = y->_parent;
+
+        if (y->_parent == NULL) {
+            root = x;  // y had no parent, so x is the new root
+        } else {
+            if (y == y->_parent->_right)
+                y->_parent->_right = x;
+            else
+                y->_parent->_left = x;
+        }
+
+        x->_right = y;
+        rbn_right_mhs(x) = mhs_of_subtree(y);  // y's own labels are final here
+        y->_parent = x;
+    }
+
+    // Walk from this node up towards the root to update the mhs info.
+    // Whenever a left/right mhs or a size changes we should recalculate.
+    // Prerequisite: the mhs labels of the node's children are up to date.
+    void Tree::RecalculateMhs(Node *node) {
+        // Iterative climb: stops at the root, at a node its parent does not
+        // link to, or once a parent's label is already up to date.
+        for (Node *parent = node->_parent; parent != NULL;
+             node = parent, parent = node->_parent) {
+            const uint64_t subtree_mhs = mhs_of_subtree(node);
+            uint64_t *slot;
+            if (node == parent->_left) {
+                slot = &rbn_left_mhs(parent);
+            } else if (node == parent->_right) {
+                slot = &rbn_right_mhs(parent);
+            } else {
+                return;
+            }
+            if (*slot == subtree_mhs) {
+                return;
+            }
+            *slot = subtree_mhs;
+        }
+    }
+
+    void Tree::IsNewNodeMergable(Node *pred,
+                                 Node *succ,
+                                 Node::BlockPair pair,
+                                 bool *left_merge,
+                                 bool *right_merge) {
+        if (pred) {
+            OUUInt64 end_of_pred = rbn_size(pred) + rbn_offset(pred);
+            // pred ends before the pair (no merge) or exactly at its start
+            const bool touches_pred = !(end_of_pred < pair._offset);
+            if (touches_pred) {
+                invariant(end_of_pred == pair._offset);
+            }
+            *left_merge = touches_pred;
+        }
+        if (succ) {
+            OUUInt64 begin_of_succ = rbn_offset(succ);
+            OUUInt64 end_of_node = pair._offset + pair._size;
+            // the pair ends before succ (no merge) or exactly at its start
+            const bool touches_succ = !(end_of_node < begin_of_succ);
+            if (touches_succ) {
+                invariant(end_of_node == begin_of_succ);
+            }
+            *right_merge = touches_succ;
+        }
+    }
+
+    void Tree::AbsorbNewNode(Node *pred,
+                             Node *succ,
+                             Node::BlockPair pair,
+                             bool left_merge,
+                             bool right_merge,
+                             bool is_right_child) {
+        invariant(left_merge || right_merge);
+        if (left_merge && right_merge) {
+            // pair bridges pred and succ: fold all three into one node
+            if (!is_right_child) {
+                rbn_size(succ) += pair._size;  // succ first swallows the pair
+                rbn_offset(succ) = pair._offset;
+                // merge to the pred
+                rbn_size(pred) += rbn_size(succ);
+                // to keep the invariant of the tree -no overlapping holes
+                rbn_offset(succ) += rbn_size(succ);
+                rbn_size(succ) = 0;  // succ is now an empty hole at its old end
+                RecalculateMhs(succ);
+                RecalculateMhs(pred);
+                // pred now covers the whole merged range; the labels on
+                // its path were refreshed by the call above, so the empty
+                // succ can simply be removed
+                RawRemove(_root, succ);
+            } else {
+                rbn_size(pred) += pair._size;  // pred first swallows the pair
+                rbn_offset(succ) = rbn_offset(pred);  // succ takes over pred's range
+                rbn_size(succ) += rbn_size(pred);
+                rbn_offset(pred) += rbn_size(pred);
+                rbn_size(pred) = 0;  // pred is now an empty hole at its old end
+                RecalculateMhs(pred);
+                RecalculateMhs(succ);
+                // now remove pred
+                RawRemove(_root, pred);
+            }
+        } else if (left_merge) {
+            rbn_size(pred) += pair._size;  // extend pred to the right
+            RecalculateMhs(pred);
+        } else if (right_merge) {
+            rbn_offset(succ) -= pair._size;  // extend succ to the left
+            rbn_size(succ) += pair._size;
+            RecalculateMhs(succ);
+        }
+    }
+    // this is the most tedious part, but not complicated:
+    // 1. find where to insert the pair.
+    // 2. if both the pred and the succ can merge with the pair, merge all
+    //    three; either the pred or the succ node is then removed from the
+    //    tree.
+    // 3. if only left-mergeable or only right-mergeable, just merge.
+    // 4. non-mergeable case: insert the node and run the fixup.
+
+    int Tree::Insert(Node *&root, Node::BlockPair pair) {
+        Node *x = _root;  // NOTE(review): descends from _root, not `root`
+        Node *y = NULL;   // trails x; ends up as the would-be parent
+        bool left_merge = false;
+        bool right_merge = false;
+        Node *node = NULL;  // set only when a brand-new node is linked in
+
+        while (x != NULL) {
+            y = x;
+            if (pair._offset < rbn_key(x))
+                x = x->_left;
+            else
+                x = x->_right;
+        }
+
+        // y is now the would-be parent (NULL for an empty tree). Work out
+        // the in-order neighbours of the new pair; if it abuts one of them
+        // it is absorbed into that node instead of being linked in as a
+        // new child of y.
+        Node *pred, *succ;
+        if (y != NULL) {
+            if (pair._offset < rbn_key(y)) {
+                // as the left child
+                pred = PredecessorHelper(y->_parent, y);
+                succ = y;
+                IsNewNodeMergable(pred, succ, pair, &left_merge, &right_merge);
+                if (left_merge || right_merge) {
+                    AbsorbNewNode(
+                        pred, succ, pair, left_merge, right_merge, false);
+                } else {
+                    // construct the node
+                    Node::Pair mhsp {0, 0};
+                    node =
+                        new Node(EColor::BLACK, pair, mhsp, nullptr, nullptr, nullptr);
+                    if (!node)  // NOTE(review): plain new throws; looks unreachable
+                        return -1;
+                    y->_left = node;
+                    node->_parent = y;
+                    RecalculateMhs(node);
+                }
+
+            } else {
+                // as the right child
+                pred = y;
+                succ = SuccessorHelper(y->_parent, y);
+                IsNewNodeMergable(pred, succ, pair, &left_merge, &right_merge);
+                if (left_merge || right_merge) {
+                    AbsorbNewNode(
+                        pred, succ, pair, left_merge, right_merge, true);
+                } else {
+                    // construct the node
+                    Node::Pair mhsp {0, 0};
+                    node =
+                        new Node(EColor::BLACK, pair, mhsp, nullptr, nullptr, nullptr);
+                    if (!node)  // NOTE(review): plain new throws; looks unreachable
+                        return -1;
+                    y->_right = node;
+                    node->_parent = y;
+                    RecalculateMhs(node);
+                }
+            }
+        } else {
+            Node::Pair mhsp {0, 0};  // empty tree: the new node becomes the root
+            node = new Node(EColor::BLACK, pair, mhsp, nullptr, nullptr, nullptr);
+            if (!node)  // NOTE(review): plain new throws; looks unreachable
+                return -1;
+            root = node;
+        }
+        if (!left_merge && !right_merge) {
+            invariant_notnull(node);
+            node->_color = EColor::RED;  // recoloured red before the fixup pass
+            return InsertFixup(root, node);
+        }
+        return 0;  // pair was absorbed into a neighbour; no new node was linked
+    }
+
+    int Tree::InsertFixup(Node *&root, Node *node) {  // always returns 0
+        Node *parent, *gparent;
+        while ((parent = rbn_parent(node)) && rbn_is_red(parent)) {
+            gparent = rbn_parent(parent);  // used unchecked: assumes a red parent is never the root
+            if (parent == gparent->_left) {
+                {
+                    Node *uncle = gparent->_right;
+                    if (uncle && rbn_is_red(uncle)) {
+                        rbn_set_black(uncle);  // red uncle: recolour only
+                        rbn_set_black(parent);
+                        rbn_set_red(gparent);
+                        node = gparent;  // and re-examine from the grandparent
+                        continue;
+                    }
+                }
+
+                if (parent->_right == node) {
+                    Node *tmp;
+                    LeftRotate(root, parent);  // node is an inner child: rotate it outward
+                    tmp = parent;
+                    parent = node;  // node and parent swap roles after the rotation
+                    node = tmp;
+                }
+
+                rbn_set_black(parent);
+                rbn_set_red(gparent);
+                RightRotate(root, gparent);  // parent takes gparent's place
+            } else {
+                {
+                    Node *uncle = gparent->_left;  // mirror image of the branch above
+                    if (uncle && rbn_is_red(uncle)) {
+                        rbn_set_black(uncle);
+                        rbn_set_black(parent);
+                        rbn_set_red(gparent);
+                        node = gparent;
+                        continue;
+                    }
+                }
+
+                if (parent->_left == node) {
+                    Node *tmp;
+                    RightRotate(root, parent);
+                    tmp = parent;
+                    parent = node;
+                    node = tmp;
+                }
+                rbn_set_black(parent);
+                rbn_set_red(gparent);
+                LeftRotate(root, gparent);
+            }
+        }
+        rbn_set_black(root);  // the root is black on exit, whatever happened above
+        return 0;
+    }
+
+    int Tree::Insert(Node::BlockPair pair) { return Insert(_root, pair); }  // insert into this tree
+
+    uint64_t Tree::Remove(size_t size) {  // returns the offset that was carved out
+        Node *node = SearchFirstFitBySize(size);  // nullptr when nothing fits
+        return Remove(_root, node, size);  // NOTE(review): node is not checked here
+    }
+
+    void Tree::RawRemove(Node *&root, Node *node) {  // unlinks and deletes node
+        Node *child, *parent;
+        EColor color;
+
+        if ((node->_left != NULL) && (node->_right != NULL)) {
+            Node *replace = node;  // will be the leftmost node of node's right subtree
+            replace = replace->_right;
+            while (replace->_left != NULL)
+                replace = replace->_left;
+
+            if (rbn_parent(node)) {
+                if (rbn_parent(node)->_left == node)
+                    rbn_parent(node)->_left = replace;
+                else
+                    rbn_parent(node)->_right = replace;
+            } else {
+                root = replace;
+            }
+            child = replace->_right;  // replace has no left child by construction
+            parent = rbn_parent(replace);
+            color = rbn_color(replace);  // colour leaving replace's old position
+
+            if (parent == node) {
+                parent = replace;  // replace is node's direct right child
+            } else {
+                if (child)
+                    rbn_parent(child) = parent;
+
+                parent->_left = child;  // splice replace out of its old position
+                rbn_left_mhs(parent) = rbn_right_mhs(replace);
+                RecalculateMhs(parent);  // node's parent no longer links to node, so the climb ends at node
+                replace->_right = node->_right;
+                rbn_set_parent(node->_right, replace);
+                rbn_right_mhs(replace) = rbn_right_mhs(node);  // read after the recalculation above
+            }
+
+            replace->_parent = node->_parent;  // replace takes over node's position...
+            replace->_color = node->_color;    // ...its colour...
+            replace->_left = node->_left;      // ...and its left subtree with label
+            rbn_left_mhs(replace) = rbn_left_mhs(node);
+            node->_left->_parent = replace;
+            RecalculateMhs(replace);
+            if (color == EColor::BLACK)
+                RawRemoveFixup(root, child, parent);
+            delete node;
+            return;
+        }
+
+        if (node->_left != NULL)  // at most one child from here on
+            child = node->_left;
+        else
+            child = node->_right;
+
+        parent = node->_parent;
+        color = node->_color;
+
+        if (child)
+            child->_parent = parent;
+
+        if (parent) {
+            if (parent->_left == node) {
+                parent->_left = child;
+                rbn_left_mhs(parent) = child ? mhs_of_subtree(child) : 0;  // empty side: 0
+            } else {
+                parent->_right = child;
+                rbn_right_mhs(parent) = child ? mhs_of_subtree(child) : 0;  // empty side: 0
+            }
+            RecalculateMhs(parent);
+        } else
+            root = child;  // node was the root
+        if (color == EColor::BLACK)
+            RawRemoveFixup(root, child, parent);
+        delete node;
+    }
+
+    // Unlinks and frees the hole found at `offset` (lookup result unchecked).
+    void Tree::RawRemove(uint64_t offset) {
+        RawRemove(_root, SearchByOffset(offset));
+    }
+    static inline uint64_t align(uint64_t value, uint64_t ba_alignment) {
+        return value + (ba_alignment - value % ba_alignment) % ba_alignment;
+    }  // rounds up to a multiple of ba_alignment (!= 0); aligned input is kept
+    // Carves `size` bytes out of the hole held by `node`, starting at the
+    // first _align-aligned offset inside that hole, and returns that offset.
+    uint64_t Tree::Remove(Node *&root, Node *node, size_t size) {
+        const OUUInt64 hole_begin = rbn_offset(node);
+        const OUUInt64 answer_offset(align(hole_begin.ToInt(), _align));
+        const OUUInt64 answer_end = answer_offset + size;
+        const OUUInt64 hole_end = hole_begin + rbn_size(node);
+        invariant(answer_end <= hole_end);
+        if (answer_offset == hole_begin) {
+            // taken from the front: the hole shrinks from the left and is
+            // dropped from the tree once nothing of it remains
+            rbn_offset(node) += size;
+            rbn_size(node) -= size;
+            RecalculateMhs(node);
+            if (rbn_size(node) == 0)
+                RawRemove(root, node);
+        } else if (answer_end == hole_end) {
+            // taken from the back: the start stays, only the size shrinks
+            rbn_size(node) -= size;
+            RecalculateMhs(node);
+        } else {
+            // taken from the middle: keep the left remainder here and insert
+            // the right one as a new hole (via _root, not the `root` argument)
+            rbn_size(node) = answer_offset - hole_begin;
+            RecalculateMhs(node);
+            Insert(_root, {answer_end, hole_end - answer_end});
+        }
+        return answer_offset.ToInt();
+    }
+
+    // Restores the red-black colouring after a black node was unlinked.
+    // `node` is the child that took its place (may be NULL) and `parent` is
+    // that child's parent; `other` below is the sibling ("w") of `node`.
+    void Tree::RawRemoveFixup(Node *&root, Node *node, Node *parent) {
+        Node *other;
+        while ((!node || rbn_is_black(node)) && node != root) {
+            if (parent->_left == node) {
+                other = parent->_right;
+                if (rbn_is_red(other)) {
+                    // Case 1: the sibling of node, w, is red
+                    rbn_set_black(other);
+                    rbn_set_red(parent);
+                    LeftRotate(root, parent);
+                    other = parent->_right;
+                }
+                if ((!other->_left || rbn_is_black(other->_left)) &&
+                    (!other->_right || rbn_is_black(other->_right))) {
+                    // Case 2: w is black and both of w's children are black
+                    rbn_set_red(other);
+                    node = parent;
+                    parent = rbn_parent(node);
+                } else {
+                    if (!other->_right || rbn_is_black(other->_right)) {
+                        // Case 3: w is black, its left child is red and
+                        // its right child is black
+                        rbn_set_black(other->_left);
+                        rbn_set_red(other);
+                        RightRotate(root, other);
+                        other = parent->_right;
+                    }
+                    // Case 4: w is black and its right child is red,
+                    // regardless of the left child's color
+                    rbn_set_color(other, rbn_color(parent));
+                    rbn_set_black(parent);
+                    rbn_set_black(other->_right);
+                    LeftRotate(root, parent);
+                    node = root;
+                    break;
+                }
+            } else {
+                other = parent->_left;
+                if (rbn_is_red(other)) {
+                    // Case 1: w is red
+                    rbn_set_black(other);
+                    rbn_set_red(parent);
+                    RightRotate(root, parent);
+                    other = parent->_left;
+                }
+                if ((!other->_left || rbn_is_black(other->_left)) &&
+                    (!other->_right || rbn_is_black(other->_right))) {
+                    // Case 2: w is black and both children are black
+                    rbn_set_red(other);
+                    node = parent;
+                    parent = rbn_parent(node);
+                } else {
+                    if (!other->_left || rbn_is_black(other->_left)) {
+                        // Case 3 (mirror): w is black, its right child is
+                        // red and its left child is black
+                        rbn_set_black(other->_right);
+                        rbn_set_red(other);
+                        LeftRotate(root, other);
+                        other = parent->_left;
+                    }
+                    // Case 4 (mirror): w is black and its left child is
+                    // red, regardless of the right child's color
+                    rbn_set_color(other, rbn_color(parent));
+                    rbn_set_black(parent);
+                    rbn_set_black(other->_left);
+                    RightRotate(root, parent);
+                    node = root;
+                    break;
+                }
+            }
+        }
+        if (node)
+            rbn_set_black(node);
+    }
+
+    // Post-order teardown of the subtree rooted at `tree`: both children are
+    // freed before the node itself, and the caller's pointer is reset to NULL
+    // afterwards so that it cannot dangle.
+    void Tree::Destroy(Node *&tree) {
+        if (!tree)
+            return;
+        // a NULL child is a no-op thanks to the guard above
+        Destroy(tree->_left);
+        Destroy(tree->_right);
+        delete tree;
+        tree = NULL;
+    }
+
+    void Tree::Destroy() { Destroy(_root); }  // frees every node of the tree
+
+    // Writes the subtree under `tree` to stderr in pre-order, one node per
+    // line. `pair` is the parent's hole and `dir` tells which child of that
+    // parent `tree` is (NONE marks the root of the dump).
+    void Tree::Dump(Node *tree, Node::BlockPair pair, EDirection dir) {
+        if (tree == NULL)
+            return;
+        if (dir != EDirection::NONE) {
+            const char *side =
+                dir == EDirection::RIGHT ? "right child" : "left child";
+            fprintf(stderr,
+                    "(%" PRIu64 ",%" PRIu64 ",mhs:(%" PRIu64 ",%" PRIu64
+                    "))(%c) is %" PRIu64 "'s %s\n",
+                    rbn_offset(tree).ToInt(), rbn_size(tree).ToInt(),
+                    rbn_left_mhs(tree), rbn_right_mhs(tree),
+                    rbn_is_red(tree) ? 'R' : 'B',
+                    pair._offset.ToInt(), side);
+        } else {
+            fprintf(stderr,
+                    "(%" PRIu64 ",%" PRIu64 ", mhs:(%" PRIu64 ",%" PRIu64
+                    "))(B) is root\n",
+                    rbn_offset(tree).ToInt(), rbn_size(tree).ToInt(),
+                    rbn_left_mhs(tree), rbn_right_mhs(tree));
+        }
+        Dump(tree->_left, tree->_hole, EDirection::LEFT);
+        Dump(tree->_right, tree->_hole, EDirection::RIGHT);
+    }
+
+    // Usable bytes of the hole in `node`: the part from its first
+    // _align-aligned offset up to its end, or 0 when that aligned offset
+    // already lies past the end of the hole.
+    uint64_t Tree::EffectiveSize(Node *node) {
+        const OUUInt64 hole_end = rbn_offset(node) + rbn_size(node);
+        const OUUInt64 first_usable(align(rbn_offset(node).ToInt(), _align));
+        if (first_usable <= hole_end)
+            return (hole_end - first_usable).ToInt();
+        return 0;
+    }
+
+    // Dumps the whole tree to stderr, starting at the root; no-op when empty.
+    void Tree::Dump() {
+        if (_root) Dump(_root, _root->_hole, EDirection::NONE);
+    }
+
+    // InOrderVisitor callback for ValidateBalance: `extra` holds two
+    // uint64_t pointers, [0] the minimum and [1] the maximum depth seen among
+    // nodes lacking a child. Also checks each child's parent back-link.
+    static void vis_bal_f(void *extra, Node *node, uint64_t depth) {
+        uint64_t **p = (uint64_t **)extra;
+        if (node->_left) {
+            invariant(node == node->_left->_parent);
+        }
+        if (node->_right) {
+            invariant(node == node->_right->_parent);
+        }
+        if (!node->_left || !node->_right) {
+            // The two bounds are independent: the first such node visited
+            // has to update both, so this must not be an if/else chain.
+            if (*p[0] > depth) {
+                *p[0] = depth;
+            }
+            if (*p[1] < depth) {
+                *p[1] = depth;
+            }
+        }
+    }
+
+    // Asserts (min + 1) * 2 >= max + 1 over the depths gathered by vis_bal_f.
+    void Tree::ValidateBalance() {
+        if (_root == NULL)
+            return;
+        uint64_t min_depth = 0xffffffffffffffff;
+        uint64_t max_depth = 0;
+        uint64_t *bounds[2] = {&min_depth, &max_depth};
+        InOrderVisitor(vis_bal_f, (void *)bounds);
+        invariant((min_depth + 1) * 2 >= max_depth + 1);
+    }
+
+    // InOrderVisitor callback for ValidateInOrder: checks this node's offset
+    // against the pair under the cursor in `extra`, then advances the cursor.
+    static void vis_cmp_f(void *extra, Node *node, uint64_t UU(depth)) {
+        Node::BlockPair *&cursor = *(Node::BlockPair **)extra;
+        invariant_notnull(cursor);
+        invariant(cursor->_offset == node->_hole._offset);
+        ++cursor;
+    }
+
+    // Asserts that pairs[] lists the node offsets in InOrderVisitor order.
+    void Tree::ValidateInOrder(Node::BlockPair *pairs) {
+        InOrderVisitor(vis_cmp_f, &pairs);  // vis_cmp_f advances `pairs`
+    }
+
+    // Recomputes the max hole size of the subtree under `node` bottom-up and
+    // asserts that the labels cached in each node agree; a mismatching
+    // subtree is dumped before the invariant fires. Returns 0 for NULL.
+    uint64_t Tree::ValidateMhs(Node *node) {
+        if (node == NULL)
+            return 0;
+        const uint64_t mhs_left = ValidateMhs(node->_left);
+        const uint64_t mhs_right = ValidateMhs(node->_right);
+        if (mhs_left != rbn_left_mhs(node)) {
+            printf("assert failure: mhs_left = %" PRIu64 "\n", mhs_left);
+            Dump(node, node->_hole, EDirection::NONE);
+        }
+        invariant(mhs_left == rbn_left_mhs(node));
+        if (mhs_right != rbn_right_mhs(node)) {
+            printf("assert failure: mhs_right = %" PRIu64 "\n", mhs_right);
+            Dump(node, node->_hole, EDirection::NONE);
+        }
+        invariant(mhs_right == rbn_right_mhs(node));
+        return std::max(EffectiveSize(node), std::max(mhs_left, mhs_right));
+    }
+
+    // Validates the cached max-hole-size labels of the whole tree.
+    void Tree::ValidateMhs() {
+        if (_root == NULL) return;
+        const uint64_t left_max = ValidateMhs(_root->_left);
+        const uint64_t right_max = ValidateMhs(_root->_right);
+        invariant(left_max == rbn_left_mhs(_root));
+        invariant(right_max == rbn_right_mhs(_root));
+    }
+
+}  // namespace MhsRbTree
diff --git a/storage/tokudb/PerconaFT/ft/serialize/rbtree_mhs.h b/storage/tokudb/PerconaFT/ft/serialize/rbtree_mhs.h
new file mode 100644
index 00000000000..eb8c953b08c
--- /dev/null
+++ b/storage/tokudb/PerconaFT/ft/serialize/rbtree_mhs.h
@@ -0,0 +1,355 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#pragma once
+
+#include <db.h>
+
+#include "portability/toku_pthread.h"
+#include "portability/toku_stdint.h"
+#include "portability/toku_stdlib.h"
+
+// RBTree(Red-black tree) with max hole sizes for subtrees.
+
+// This is a tentative data structure that improves the time complexity of
+// block allocation from linear to logarithmic. Please note that it only
+// supports first-fit for now. Best-fit would actually be the easier one to
+// implement: the tree would simply be sorted by hole size instead of by
+// offset.
+
+// A red-black tree is a classic data structure with O(log(n)) insertion,
+// deletion and search, and its efficiency is proven by decades of use.
+
+// A *hole* represents a BlockPair that is available for allocation. It is
+// written as (start_address, size) or (offset, size) interchangeably.
+
+// each node has a *label* to indicate a pair of the max hole sizes for its
+// subtree.
+
+// We are implementing an RBTree with max hole sizes for subtrees. It is a
+// red-black tree that is sorted by start_address and additionally labelled
+// with the max hole sizes of the subtrees.
+
+//        [(6,3)]  -> [(offset, size)], the hole
+//        [{2,5}]  -> [{mhs_of_left, mhs_of_right}], the label
+/*        /     \           */
+// [(0, 1)]    [(10,  5)]
+// [{0, 2}]    [{0,   0}]
+/*        \                 */
+//       [(3,  2)]
+//       [{0,  0}]
+// request of allocation size=2 goes from root to [(3,2)].
+
+// above example shows a simplified RBTree_max_holes.
+// it is easier to tell the search time is O(log(n)) as we can make a decision
+// on each descent until we get to the target.
+
+// The only question is whether we can keep the maintenance cost low -- and I
+// think it is not a problem, because an insertion/deletion only updates the
+// max_hole_sizes of the nodes along the path from the root to the node being
+// deleted/inserted. The path can be cached and the search is O(log(n)) anyway.
+
+// Unlike a typical rbtree, Tree has to handle inserts and deletes with more
+// care: an allocation that triggers a delete might leave some unused space,
+// for which we can simply update the start_addr and size without worrying
+// about overlap. A free might mean not only an insertion but also *merging*
+// with the adjacent holes.
+
+namespace MhsRbTree {
+
+#define offset_t uint64_t  // offset type used by the private search helper
+    enum class EColor { RED, BLACK };  // red-black node color
+    enum class EDirection { NONE = 0, LEFT, RIGHT };  // child side; NONE = root
+
+    // Unsigned 64-bit integer that guards against overflow and underflow.
+    // Its largest value, MHS_MAX_VAL, behaves like infinity: adding anything
+    // to it, or subtracting a finite value from it, leaves it unchanged.
+    // Finite arithmetic that would overflow or underflow trips an invariant
+    // instead of silently wrapping around.
+    class OUUInt64 {
+       public:
+        static const uint64_t MHS_MAX_VAL = 0xffffffffffffffff;
+        OUUInt64() : _value(0) {}
+        // Deliberately implicit so that plain integers mix with OUUInt64.
+        OUUInt64(uint64_t s) : _value(s) {}
+        OUUInt64(const OUUInt64 &o) = default;
+        OUUInt64 &operator=(const OUUInt64 &r) = default;
+
+        // Ordering. Two infinities cannot be ordered; asking is a bug.
+        bool operator<(const OUUInt64 &r) const {
+            invariant(!(_value == MHS_MAX_VAL && r.ToInt() == MHS_MAX_VAL));
+            return _value < r.ToInt();
+        }
+        bool operator>(const OUUInt64 &r) const {
+            invariant(!(_value == MHS_MAX_VAL && r.ToInt() == MHS_MAX_VAL));
+            return _value > r.ToInt();
+        }
+        bool operator<=(const OUUInt64 &r) const {
+            invariant(!(_value == MHS_MAX_VAL && r.ToInt() == MHS_MAX_VAL));
+            return _value <= r.ToInt();
+        }
+        bool operator>=(const OUUInt64 &r) const {
+            invariant(!(_value == MHS_MAX_VAL && r.ToInt() == MHS_MAX_VAL));
+            return _value >= r.ToInt();
+        }
+        // Equality has no such restriction.
+        bool operator==(const OUUInt64 &r) const {
+            return _value == r.ToInt();
+        }
+        bool operator!=(const OUUInt64 &r) const {
+            return _value != r.ToInt();
+        }
+
+        // Checked addition; infinity on either side yields infinity.
+        OUUInt64 operator+(const OUUInt64 &r) const {
+            OUUInt64 sum(*this);
+            sum += r;
+            return sum;
+        }
+        // Checked subtraction. Subtracting infinity is always a bug here,
+        // even from infinity; infinity minus a finite value stays infinity.
+        OUUInt64 operator-(const OUUInt64 &r) const {
+            invariant(r.ToInt() != MHS_MAX_VAL);
+            if (_value == MHS_MAX_VAL) {
+                return *this;
+            }
+            invariant(_value >= r.ToInt());  // no underflow
+            return OUUInt64(_value - r.ToInt());
+        }
+
+        // The compound forms return *this by reference, so the result of a
+        // chained update is this object and not a temporary copy of it. An
+        // infinite left-hand side is left untouched without inspecting r.
+        OUUInt64 &operator-=(const OUUInt64 &r) {
+            if (_value != MHS_MAX_VAL) {
+                invariant(r.ToInt() != MHS_MAX_VAL);
+                invariant(_value >= r.ToInt());
+                _value -= r.ToInt();
+            }
+            return *this;
+        }
+        OUUInt64 &operator+=(const OUUInt64 &r) {
+            if (_value != MHS_MAX_VAL) {
+                if (r.ToInt() == MHS_MAX_VAL) {
+                    _value = MHS_MAX_VAL;
+                } else {
+                    // detecting overflow
+                    invariant((MHS_MAX_VAL - _value) >= r.ToInt());
+                    _value += r.ToInt();
+                }
+            }
+            return *this;
+        }
+        uint64_t ToInt() const { return _value; }
+
+       private:
+        uint64_t _value;
+    };
+
+    class Node {  // one hole plus its red-black links and subtree label
+       public:
+        class BlockPair {  // an (offset, size) extent
+           public:
+            OUUInt64 _offset;
+            OUUInt64 _size;
+
+            BlockPair() : _offset(0), _size(0) {}
+            BlockPair(uint64_t o, uint64_t s) : _offset(o), _size(s) {}
+            BlockPair(OUUInt64 o, OUUInt64 s) : _offset(o), _size(s) {}
+            BlockPair(const BlockPair &o)
+                : _offset(o._offset), _size(o._size) {}
+            // ordering compares offsets only; the size is ignored
+            int operator<(const BlockPair &rhs) const {
+                return _offset < rhs._offset;
+            }
+            int operator<(const uint64_t &o) const { return _offset < o; }
+        };
+        // max hole size of the left and of the right subtree (the "label")
+        struct Pair {
+            uint64_t _left;
+            uint64_t _right;
+            Pair(uint64_t l, uint64_t r) : _left(l), _right(r) {}
+        };
+
+        EColor _color;
+        BlockPair _hole;  // the free extent this node represents
+        Pair _label;
+        Node *_left;  // child and parent links are NULL where absent
+        Node *_right;
+        Node *_parent;
+        // stores the arguments member-wise; nothing is derived or validated
+        Node(EColor c,
+             Node::BlockPair h,
+             Pair lb,
+             Node *l,
+             Node *r,
+             Node *p)
+            : _color(c),
+              _hole(h),
+              _label(lb),
+              _left(l),
+              _right(r),
+              _parent(p) {}
+    };
+
+    // Red-black tree of holes ordered by offset. Every node is labelled with
+    // the max hole size of its left and right subtree (see top of file).
+    class Tree {
+       private:
+        Node *_root;      // NULL while the tree is empty
+        uint64_t _align;  // alignment applied to offsets handed out by Remove
+       public:
+        Tree();
+        Tree(uint64_t);
+        ~Tree();
+
+        void PreOrder();
+        void InOrder();
+        void PostOrder();
+        // immutable operations
+        Node *SearchByOffset(uint64_t addr);
+        Node *SearchFirstFitBySize(uint64_t size);
+
+        Node *MinNode();
+        Node *MaxNode();
+
+        Node *Successor(Node *);
+        Node *Predecessor(Node *);
+
+        // mapped from tree_allocator::free_block
+        int Insert(Node::BlockPair pair);
+        // mapped from tree_allocator::alloc_block
+        uint64_t Remove(size_t size);
+        // mapped from tree_allocator::alloc_block_after
+
+        void RawRemove(uint64_t offset);
+        void Destroy();
+        // print the tree to stderr
+        void Dump();
+        // validation: the Validate* methods trip an invariant on failure
+        // (balance, in-order offsets, max-hole-size labels)
+        void ValidateBalance();
+        void ValidateInOrder(Node::BlockPair *);
+        void InOrderVisitor(void (*f)(void *, Node *, uint64_t), void *);
+        void ValidateMhs();
+
+       private:
+        void PreOrder(Node *node) const;
+        void InOrder(Node *node) const;
+        void PostOrder(Node *node) const;
+        Node *SearchByOffset(Node *node, offset_t addr) const;
+        Node *SearchFirstFitBySize(Node *node, size_t size) const;
+
+        Node *MinNode(Node *node);
+        Node *MaxNode(Node *node);
+
+        // rotations to fix up. we will have to update the labels too.
+        void LeftRotate(Node *&root, Node *x);
+        void RightRotate(Node *&root, Node *y);
+
+        int Insert(Node *&root, Node::BlockPair pair);
+        int InsertFixup(Node *&root, Node *node);
+        // RawRemove unlinks a whole node; Remove carves `size` bytes out of one
+        void RawRemove(Node *&root, Node *node);
+        uint64_t Remove(Node *&root, Node *node, size_t size);
+        void RawRemoveFixup(Node *&root, Node *node, Node *parent);
+
+        void Destroy(Node *&tree);
+        void Dump(Node *tree, Node::BlockPair pair, EDirection dir);
+        void RecalculateMhs(Node *node);
+        void IsNewNodeMergable(Node *, Node *, Node::BlockPair, bool *, bool *);
+        void AbsorbNewNode(Node *, Node *, Node::BlockPair, bool, bool, bool);
+        Node *SearchFirstFitBySizeHelper(Node *x, uint64_t size);
+
+        Node *SuccessorHelper(Node *y, Node *x);
+        Node *PredecessorHelper(Node *y, Node *x);
+        void InOrderVisitor(Node *,
+                            void (*f)(void *, Node *, uint64_t),
+                            void *,
+                            uint64_t);
+        uint64_t ValidateMhs(Node *);
+        // bytes of a hole that remain usable once its start is aligned
+        uint64_t EffectiveSize(Node *);
+// Node field accessors; as macros they are visible outside this class too.
+#define rbn_parent(r) ((r)->_parent)
+#define rbn_color(r) ((r)->_color)
+#define rbn_is_red(r) ((r)->_color == EColor::RED)
+#define rbn_is_black(r) ((r)->_color == EColor::BLACK)
+#define rbn_set_black(r)     \
+    do {                     \
+        (r)->_color = EColor::BLACK; \
+    } while (0)
+#define rbn_set_red(r)     \
+    do {                   \
+        (r)->_color = EColor::RED; \
+    } while (0)
+#define rbn_set_parent(r, p) \
+    do {                     \
+        (r)->_parent = (p);  \
+    } while (0)
+#define rbn_set_color(r, c) \
+    do {                    \
+        (r)->_color = (c);  \
+    } while (0)
+// takes the new offset as `c`, like the other two-argument setters
+#define rbn_set_offset(r, c)      \
+    do {                          \
+        (r)->_hole._offset = (c); \
+    } while (0)
+#define rbn_set_size(r, c)      \
+    do {                        \
+        (r)->_hole._size = (c); \
+    } while (0)
+#define rbn_set_left_mhs(r, c)   \
+    do {                         \
+        (r)->_label._left = (c); \
+    } while (0)
+#define rbn_set_right_mhs(r, c)   \
+    do {                          \
+        (r)->_label._right = (c); \
+    } while (0)
+#define rbn_size(r) ((r)->_hole._size)
+#define rbn_offset(r) ((r)->_hole._offset)
+#define rbn_key(r) ((r)->_hole._offset)
+#define rbn_left_mhs(r) ((r)->_label._left)
+#define rbn_right_mhs(r) ((r)->_label._right)
+#define mhs_of_subtree(y) \
+    (std::max(std::max(rbn_left_mhs(y), rbn_right_mhs(y)), EffectiveSize(y)))
+    };
+
+}  // namespace MhsRbTree
diff --git a/storage/tokudb/PerconaFT/ft/tests/block_allocator_strategy_test.cc b/storage/tokudb/PerconaFT/ft/tests/block_allocator_strategy_test.cc
deleted file mode 100644
index 3670ef81cc2..00000000000
--- a/storage/tokudb/PerconaFT/ft/tests/block_allocator_strategy_test.cc
+++ /dev/null
@@ -1,126 +0,0 @@
-/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
-// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
-#ident "$Id$"
-/*======
-This file is part of PerconaFT.
-
-
-Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
-
-    PerconaFT is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License, version 2,
-    as published by the Free Software Foundation.
-
-    PerconaFT is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
-
-----------------------------------------
-
-    PerconaFT is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Affero General Public License, version 3,
-    as published by the Free Software Foundation.
-
-    PerconaFT is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Affero General Public License for more details.
-
-    You should have received a copy of the GNU Affero General Public License
-    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
-======= */
-
-#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
-
-#include "ft/tests/test.h"
-
-#include "ft/serialize/block_allocator_strategy.h"
-
-static const uint64_t alignment = 4096;
-
-static void test_first_vs_best_fit(void) {
-    struct block_allocator::blockpair pairs[] = {
-        block_allocator::blockpair(1 * alignment, 6 * alignment),
-        // hole between 7x align -> 8x align
-        block_allocator::blockpair(8 * alignment, 4 * alignment),
-        // hole between 12x align -> 16x align
-        block_allocator::blockpair(16 * alignment, 1 * alignment),
-        block_allocator::blockpair(17 * alignment, 2 * alignment),
-        // hole between 19 align -> 21x align
-        block_allocator::blockpair(21 * alignment, 2 * alignment),
-    };
-    const uint64_t n_blocks = sizeof(pairs) / sizeof(pairs[0]);
-    
-    block_allocator::blockpair *bp;
-
-    // first fit
-    bp = block_allocator_strategy::first_fit(pairs, n_blocks, 100, alignment);
-    assert(bp == &pairs[0]);
-    bp = block_allocator_strategy::first_fit(pairs, n_blocks, 4096, alignment);
-    assert(bp == &pairs[0]);
-    bp = block_allocator_strategy::first_fit(pairs, n_blocks, 3 * 4096, alignment);
-    assert(bp == &pairs[1]);
-    bp = block_allocator_strategy::first_fit(pairs, n_blocks, 5 * 4096, alignment);
-    assert(bp == nullptr);
-
-    // best fit
-    bp = block_allocator_strategy::best_fit(pairs, n_blocks, 100, alignment);
-    assert(bp == &pairs[0]);
-    bp = block_allocator_strategy::best_fit(pairs, n_blocks, 4100, alignment);
-    assert(bp == &pairs[3]);
-    bp = block_allocator_strategy::best_fit(pairs, n_blocks, 3 * 4096, alignment);
-    assert(bp == &pairs[1]);
-    bp = block_allocator_strategy::best_fit(pairs, n_blocks, 5 * 4096, alignment);
-    assert(bp == nullptr);
-}
-
-static void test_padded_fit(void) {
-    struct block_allocator::blockpair pairs[] = {
-        block_allocator::blockpair(1 * alignment, 1 * alignment),
-        // 4096 byte hole after bp[0]
-        block_allocator::blockpair(3 * alignment, 1 * alignment),
-        // 8192 byte hole after bp[1]
-        block_allocator::blockpair(6 * alignment, 1 * alignment),
-        // 16384 byte hole after bp[2]
-        block_allocator::blockpair(11 * alignment, 1 * alignment),
-        // 32768 byte hole after bp[3]
-        block_allocator::blockpair(17 * alignment, 1 * alignment),
-        // 116kb hole after bp[4]
-        block_allocator::blockpair(113 * alignment, 1 * alignment),
-        // 256kb hole after bp[5]
-        block_allocator::blockpair(371 * alignment, 1 * alignment),
-    };
-    const uint64_t n_blocks = sizeof(pairs) / sizeof(pairs[0]);
-    
-    block_allocator::blockpair *bp;
-
-    // padding for a 100 byte allocation will be < than standard alignment,
-    // so it should fit in the first 4096 byte hole.
-    bp = block_allocator_strategy::padded_fit(pairs, n_blocks, 4000, alignment);
-    assert(bp == &pairs[0]);
-
-    // Even padded, a 12kb alloc will fit in a 16kb hole
-    bp = block_allocator_strategy::padded_fit(pairs, n_blocks, 3 * alignment, alignment);
-    assert(bp == &pairs[2]);
-
-    // would normally fit in the 116kb hole but the padding will bring it over
-    bp = block_allocator_strategy::padded_fit(pairs, n_blocks, 116 * alignment, alignment);
-    assert(bp == &pairs[5]);
-
-    bp = block_allocator_strategy::padded_fit(pairs, n_blocks, 127 * alignment, alignment);
-    assert(bp == &pairs[5]);
-}
-
-int test_main(int argc, const char *argv[]) {
-    (void) argc;
-    (void) argv;
-
-    test_first_vs_best_fit();
-    test_padded_fit();
-
-    return 0;
-}
diff --git a/storage/tokudb/PerconaFT/ft/tests/block_allocator_test.cc b/storage/tokudb/PerconaFT/ft/tests/block_allocator_test.cc
index d80ee83cbc9..3eff52b915d 100644
--- a/storage/tokudb/PerconaFT/ft/tests/block_allocator_test.cc
+++ b/storage/tokudb/PerconaFT/ft/tests/block_allocator_test.cc
@@ -38,253 +38,243 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 
 #include "test.h"
 
-static void ba_alloc(block_allocator *ba, uint64_t size, uint64_t *answer) {
-    ba->validate();
+static void ba_alloc(BlockAllocator *ba, uint64_t size, uint64_t *answer) {
+    ba->Validate();
     uint64_t actual_answer;
-    const uint64_t heat = random() % 2;
-    ba->alloc_block(512 * size, heat, &actual_answer);
-    ba->validate();
+    ba->AllocBlock(512 * size, &actual_answer);
+    ba->Validate();
 
-    assert(actual_answer%512==0);
-    *answer = actual_answer/512;
+    invariant(actual_answer % 512 == 0);
+    *answer = actual_answer / 512;
 }
 
-static void ba_free(block_allocator *ba, uint64_t offset) {
-    ba->validate();
-    ba->free_block(offset * 512);
-    ba->validate();
+static void ba_free(BlockAllocator *ba, uint64_t offset, uint64_t size) {
+    ba->Validate();
+    ba->FreeBlock(offset * 512, 512 * size);
+    ba->Validate();
 }
 
-static void ba_check_l(block_allocator *ba, uint64_t blocknum_in_layout_order,
-                       uint64_t expected_offset, uint64_t expected_size) {
+static void ba_check_l(BlockAllocator *ba,
+                       uint64_t blocknum_in_layout_order,
+                       uint64_t expected_offset,
+                       uint64_t expected_size) {
     uint64_t actual_offset, actual_size;
-    int r = ba->get_nth_block_in_layout_order(blocknum_in_layout_order, &actual_offset, &actual_size);
-    assert(r==0);
-    assert(expected_offset*512 == actual_offset);
-    assert(expected_size  *512 == actual_size);
+    int r = ba->NthBlockInLayoutOrder(
+        blocknum_in_layout_order, &actual_offset, &actual_size);
+    invariant(r == 0);
+    invariant(expected_offset * 512 == actual_offset);
+    invariant(expected_size * 512 == actual_size);
 }
 
-static void ba_check_none(block_allocator *ba, uint64_t blocknum_in_layout_order) {
+static void ba_check_none(BlockAllocator *ba,
+                          uint64_t blocknum_in_layout_order) {
     uint64_t actual_offset, actual_size;
-    int r = ba->get_nth_block_in_layout_order(blocknum_in_layout_order, &actual_offset, &actual_size);
-    assert(r==-1);
+    int r = ba->NthBlockInLayoutOrder(
+        blocknum_in_layout_order, &actual_offset, &actual_size);
+    invariant(r == -1);
 }
 
-
 // Simple block allocator test
-static void test_ba0(block_allocator::allocation_strategy strategy) {
-    block_allocator allocator;
-    block_allocator *ba = &allocator;
-    ba->create(100*512, 1*512);
-    ba->set_strategy(strategy);
-    assert(ba->allocated_limit()==100*512);
+static void test_ba0() {
+    BlockAllocator allocator;
+    BlockAllocator *ba = &allocator;
+    ba->Create(100 * 512, 1 * 512);
+    invariant(ba->AllocatedLimit() == 100 * 512);
 
     uint64_t b2, b3, b4, b5, b6, b7;
-    ba_alloc(ba, 100, &b2);     
-    ba_alloc(ba, 100, &b3);     
-    ba_alloc(ba, 100, &b4);     
-    ba_alloc(ba, 100, &b5);     
-    ba_alloc(ba, 100, &b6);     
-    ba_alloc(ba, 100, &b7);     
-    ba_free(ba, b2);
-    ba_alloc(ba, 100, &b2);  
-    ba_free(ba, b4);         
-    ba_free(ba, b6);         
+    ba_alloc(ba, 100, &b2);
+    ba_alloc(ba, 100, &b3);
+    ba_alloc(ba, 100, &b4);
+    ba_alloc(ba, 100, &b5);
+    ba_alloc(ba, 100, &b6);
+    ba_alloc(ba, 100, &b7);
+    ba_free(ba, b2, 100);
+    ba_alloc(ba, 100, &b2);
+    ba_free(ba, b4, 100);
+    ba_free(ba, b6, 100);
     uint64_t b8, b9;
-    ba_alloc(ba, 100, &b4);    
-    ba_free(ba, b2);           
-    ba_alloc(ba, 100, &b6);    
-    ba_alloc(ba, 100, &b8);    
-    ba_alloc(ba, 100, &b9);    
-    ba_free(ba, b6);           
-    ba_free(ba, b7);           
-    ba_free(ba, b8);           
-    ba_alloc(ba, 100, &b6);    
-    ba_alloc(ba, 100, &b7);    
-    ba_free(ba, b4);           
-    ba_alloc(ba, 100, &b4);    
-
-    ba->destroy();
+    ba_alloc(ba, 100, &b4);
+    ba_free(ba, b2, 100);
+    ba_alloc(ba, 100, &b6);
+    ba_alloc(ba, 100, &b8);
+    ba_alloc(ba, 100, &b9);
+    ba_free(ba, b6, 100);
+    ba_free(ba, b7, 100);
+    ba_free(ba, b8, 100);
+    ba_alloc(ba, 100, &b6);
+    ba_alloc(ba, 100, &b7);
+    ba_free(ba, b4, 100);
+    ba_alloc(ba, 100, &b4);
+
+    ba->Destroy();
 }
 
 // Manually to get coverage of all the code in the block allocator.
-static void
-test_ba1(block_allocator::allocation_strategy strategy, int n_initial) {
-    block_allocator allocator;
-    block_allocator *ba = &allocator;
-    ba->create(0*512, 1*512);
-    ba->set_strategy(strategy);
-
-    int n_blocks=0;
+static void test_ba1(int n_initial) {
+    BlockAllocator allocator;
+    BlockAllocator *ba = &allocator;
+    ba->Create(0 * 512, 1 * 512);
+
+    int n_blocks = 0;
     uint64_t blocks[1000];
     for (int i = 0; i < 1000; i++) {
-	if (i < n_initial || random() % 2 == 0) {
-	    if (n_blocks < 1000) {
-		ba_alloc(ba, 1, &blocks[n_blocks]);
-		//printf("A[%d]=%ld\n", n_blocks, blocks[n_blocks]);
-		n_blocks++;
-	    } 
-	} else {
-	    if (n_blocks > 0) {
-		int blocknum = random()%n_blocks;
-		//printf("F[%d]%ld\n", blocknum, blocks[blocknum]);
-		ba_free(ba, blocks[blocknum]);
-		blocks[blocknum]=blocks[n_blocks-1];
-		n_blocks--;
-	    }
-	}
+        if (i < n_initial || random() % 2 == 0) {
+            if (n_blocks < 1000) {
+                ba_alloc(ba, 1, &blocks[n_blocks]);
+                // printf("A[%d]=%ld\n", n_blocks, blocks[n_blocks]);
+                n_blocks++;
+            }
+        } else {
+            if (n_blocks > 0) {
+                int blocknum = random() % n_blocks;
+                // printf("F[%d]=%ld\n", blocknum, blocks[blocknum]);
+                ba_free(ba, blocks[blocknum], 1);
+                blocks[blocknum] = blocks[n_blocks - 1];
+                n_blocks--;
+            }
+        }
     }
-    
-    ba->destroy();
+
+    ba->Destroy();
 }
-    
+
 // Check to see if it is first fit or best fit.
-static void
-test_ba2 (void)
-{
-    block_allocator allocator;
-    block_allocator *ba = &allocator;
+static void test_ba2(void) {
+    BlockAllocator allocator;
+    BlockAllocator *ba = &allocator;
     uint64_t b[6];
     enum { BSIZE = 1024 };
-    ba->create(100*512, BSIZE*512);
-    ba->set_strategy(block_allocator::BA_STRATEGY_FIRST_FIT);
-    assert(ba->allocated_limit()==100*512);
-
-    ba_check_l    (ba, 0, 0, 100);
-    ba_check_none (ba, 1);
-
-    ba_alloc (ba, 100, &b[0]);
-    ba_check_l    (ba, 0, 0, 100);
-    ba_check_l    (ba, 1, BSIZE, 100);
-    ba_check_none (ba, 2);
-
-    ba_alloc (ba, BSIZE + 100, &b[1]);
-    ba_check_l    (ba, 0, 0, 100);
-    ba_check_l    (ba, 1,   BSIZE,       100);
-    ba_check_l    (ba, 2, 2*BSIZE, BSIZE + 100);
-    ba_check_none (ba, 3);
-
-    ba_alloc (ba, 100, &b[2]);
-    ba_check_l    (ba, 0, 0, 100);
-    ba_check_l    (ba, 1,   BSIZE,       100);
-    ba_check_l    (ba, 2, 2*BSIZE, BSIZE + 100);
-    ba_check_l    (ba, 3, 4*BSIZE,       100);
-    ba_check_none (ba, 4);
-
-    ba_alloc (ba, 100, &b[3]);
-    ba_alloc (ba, 100, &b[4]);
-    ba_alloc (ba, 100, &b[5]);
-    ba_check_l    (ba, 0, 0, 100);
-    ba_check_l    (ba, 1,   BSIZE,       100);
-    ba_check_l    (ba, 2, 2*BSIZE, BSIZE + 100);
-    ba_check_l    (ba, 3, 4*BSIZE,       100);
-    ba_check_l    (ba, 4, 5*BSIZE,       100);
-    ba_check_l    (ba, 5, 6*BSIZE,       100);
-    ba_check_l    (ba, 6, 7*BSIZE,       100);
-    ba_check_none (ba, 7);
-   
-    ba_free (ba, 4*BSIZE);
-    ba_check_l    (ba, 0, 0, 100);
-    ba_check_l    (ba, 1,   BSIZE,       100);
-    ba_check_l    (ba, 2, 2*BSIZE, BSIZE + 100);
-    ba_check_l    (ba, 3, 5*BSIZE,       100);
-    ba_check_l    (ba, 4, 6*BSIZE,       100);
-    ba_check_l    (ba, 5, 7*BSIZE,       100);
-    ba_check_none (ba, 6);
+    ba->Create(100 * 512, BSIZE * 512);
+    invariant(ba->AllocatedLimit() == 100 * 512);
+
+    ba_check_l(ba, 0, 0, 100);
+    ba_check_none(ba, 1);
+
+    ba_alloc(ba, 100, &b[0]);
+    ba_check_l(ba, 0, 0, 100);
+    ba_check_l(ba, 1, BSIZE, 100);
+    ba_check_none(ba, 2);
+
+    ba_alloc(ba, BSIZE + 100, &b[1]);
+    ba_check_l(ba, 0, 0, 100);
+    ba_check_l(ba, 1, BSIZE, 100);
+    ba_check_l(ba, 2, 2 * BSIZE, BSIZE + 100);
+    ba_check_none(ba, 3);
+
+    ba_alloc(ba, 100, &b[2]);
+    ba_check_l(ba, 0, 0, 100);
+    ba_check_l(ba, 1, BSIZE, 100);
+    ba_check_l(ba, 2, 2 * BSIZE, BSIZE + 100);
+    ba_check_l(ba, 3, 4 * BSIZE, 100);
+    ba_check_none(ba, 4);
+
+    ba_alloc(ba, 100, &b[3]);
+    ba_alloc(ba, 100, &b[4]);
+    ba_alloc(ba, 100, &b[5]);
+    ba_check_l(ba, 0, 0, 100);
+    ba_check_l(ba, 1, BSIZE, 100);
+    ba_check_l(ba, 2, 2 * BSIZE, BSIZE + 100);
+    ba_check_l(ba, 3, 4 * BSIZE, 100);
+    ba_check_l(ba, 4, 5 * BSIZE, 100);
+    ba_check_l(ba, 5, 6 * BSIZE, 100);
+    ba_check_l(ba, 6, 7 * BSIZE, 100);
+    ba_check_none(ba, 7);
+
+    ba_free(ba, 4 * BSIZE, 100);
+    ba_check_l(ba, 0, 0, 100);
+    ba_check_l(ba, 1, BSIZE, 100);
+    ba_check_l(ba, 2, 2 * BSIZE, BSIZE + 100);
+    ba_check_l(ba, 3, 5 * BSIZE, 100);
+    ba_check_l(ba, 4, 6 * BSIZE, 100);
+    ba_check_l(ba, 5, 7 * BSIZE, 100);
+    ba_check_none(ba, 6);
 
     uint64_t b2;
     ba_alloc(ba, 100, &b2);
-    assert(b2==4*BSIZE);
-    ba_check_l    (ba, 0, 0, 100);
-    ba_check_l    (ba, 1,   BSIZE,       100);
-    ba_check_l    (ba, 2, 2*BSIZE, BSIZE + 100);
-    ba_check_l    (ba, 3, 4*BSIZE,       100);
-    ba_check_l    (ba, 4, 5*BSIZE,       100);
-    ba_check_l    (ba, 5, 6*BSIZE,       100);
-    ba_check_l    (ba, 6, 7*BSIZE,       100);
-    ba_check_none (ba, 7);
-
-    ba_free (ba,   BSIZE);
-    ba_free (ba, 5*BSIZE);
-    ba_check_l    (ba, 0, 0, 100);
-    ba_check_l    (ba, 1, 2*BSIZE, BSIZE + 100);
-    ba_check_l    (ba, 2, 4*BSIZE,       100);
-    ba_check_l    (ba, 3, 6*BSIZE,       100);
-    ba_check_l    (ba, 4, 7*BSIZE,       100);
-    ba_check_none (ba, 5);
-
-    // This alloc will allocate the first block after the reserve space in the case of first fit.
+    invariant(b2 == 4 * BSIZE);
+    ba_check_l(ba, 0, 0, 100);
+    ba_check_l(ba, 1, BSIZE, 100);
+    ba_check_l(ba, 2, 2 * BSIZE, BSIZE + 100);
+    ba_check_l(ba, 3, 4 * BSIZE, 100);
+    ba_check_l(ba, 4, 5 * BSIZE, 100);
+    ba_check_l(ba, 5, 6 * BSIZE, 100);
+    ba_check_l(ba, 6, 7 * BSIZE, 100);
+    ba_check_none(ba, 7);
+
+    ba_free(ba, BSIZE, 100);
+    ba_free(ba, 5 * BSIZE, 100);
+    ba_check_l(ba, 0, 0, 100);
+    ba_check_l(ba, 1, 2 * BSIZE, BSIZE + 100);
+    ba_check_l(ba, 2, 4 * BSIZE, 100);
+    ba_check_l(ba, 3, 6 * BSIZE, 100);
+    ba_check_l(ba, 4, 7 * BSIZE, 100);
+    ba_check_none(ba, 5);
+
+    // This alloc will allocate the first block after the reserve space in the
+    // case of first fit.
     uint64_t b3;
     ba_alloc(ba, 100, &b3);
-    assert(b3==  BSIZE);      // First fit.
+    invariant(b3 == BSIZE);  // First fit.
     // if (b3==5*BSIZE) then it is next fit.
 
     // Now 5*BSIZE is free
     uint64_t b5;
     ba_alloc(ba, 100, &b5);
-    assert(b5==5*BSIZE);
-    ba_check_l    (ba, 0, 0, 100);
-    ba_check_l    (ba, 1,   BSIZE,       100);
-    ba_check_l    (ba, 2, 2*BSIZE, BSIZE + 100);
-    ba_check_l    (ba, 3, 4*BSIZE,       100);
-    ba_check_l    (ba, 4, 5*BSIZE,       100);
-    ba_check_l    (ba, 5, 6*BSIZE,       100);
-    ba_check_l    (ba, 6, 7*BSIZE,       100);
-    ba_check_none (ba, 7);
+    invariant(b5 == 5 * BSIZE);
+    ba_check_l(ba, 0, 0, 100);
+    ba_check_l(ba, 1, BSIZE, 100);
+    ba_check_l(ba, 2, 2 * BSIZE, BSIZE + 100);
+    ba_check_l(ba, 3, 4 * BSIZE, 100);
+    ba_check_l(ba, 4, 5 * BSIZE, 100);
+    ba_check_l(ba, 5, 6 * BSIZE, 100);
+    ba_check_l(ba, 6, 7 * BSIZE, 100);
+    ba_check_none(ba, 7);
 
     // Now all blocks are busy
     uint64_t b6, b7, b8;
     ba_alloc(ba, 100, &b6);
     ba_alloc(ba, 100, &b7);
     ba_alloc(ba, 100, &b8);
-    assert(b6==8*BSIZE);
-    assert(b7==9*BSIZE);
-    assert(b8==10*BSIZE);
-    ba_check_l    (ba, 0, 0, 100);
-    ba_check_l    (ba, 1,   BSIZE,       100);
-    ba_check_l    (ba, 2, 2*BSIZE, BSIZE + 100);
-    ba_check_l    (ba, 3, 4*BSIZE,       100);
-    ba_check_l    (ba, 4, 5*BSIZE,       100);
-    ba_check_l    (ba, 5, 6*BSIZE,       100);
-    ba_check_l    (ba, 6, 7*BSIZE,       100);
-    ba_check_l    (ba, 7, 8*BSIZE,       100);
-    ba_check_l    (ba, 8, 9*BSIZE,       100);
-    ba_check_l    (ba, 9, 10*BSIZE,       100);
-    ba_check_none (ba, 10);
-    
-    ba_free(ba, 9*BSIZE);
-    ba_free(ba, 7*BSIZE);
+    invariant(b6 == 8 * BSIZE);
+    invariant(b7 == 9 * BSIZE);
+    invariant(b8 == 10 * BSIZE);
+    ba_check_l(ba, 0, 0, 100);
+    ba_check_l(ba, 1, BSIZE, 100);
+    ba_check_l(ba, 2, 2 * BSIZE, BSIZE + 100);
+    ba_check_l(ba, 3, 4 * BSIZE, 100);
+    ba_check_l(ba, 4, 5 * BSIZE, 100);
+    ba_check_l(ba, 5, 6 * BSIZE, 100);
+    ba_check_l(ba, 6, 7 * BSIZE, 100);
+    ba_check_l(ba, 7, 8 * BSIZE, 100);
+    ba_check_l(ba, 8, 9 * BSIZE, 100);
+    ba_check_l(ba, 9, 10 * BSIZE, 100);
+    ba_check_none(ba, 10);
+
+    ba_free(ba, 9 * BSIZE, 100);
+    ba_free(ba, 7 * BSIZE, 100);
     uint64_t b9;
     ba_alloc(ba, 100, &b9);
-    assert(b9==7*BSIZE);
+    invariant(b9 == 7 * BSIZE);
 
-    ba_free(ba, 5*BSIZE);
-    ba_free(ba, 2*BSIZE);
+    ba_free(ba, 5 * BSIZE, 100);
+    ba_free(ba, 2 * BSIZE, BSIZE + 100);
     uint64_t b10, b11;
     ba_alloc(ba, 100, &b10);
-    assert(b10==2*BSIZE);
+    invariant(b10 == 2 * BSIZE);
     ba_alloc(ba, 100, &b11);
-    assert(b11==3*BSIZE);
+    invariant(b11 == 3 * BSIZE);
     ba_alloc(ba, 100, &b11);
-    assert(b11==5*BSIZE);
+    invariant(b11 == 5 * BSIZE);
 
-    ba->destroy();
+    ba->Destroy();
 }
 
-int
-test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) {
-    enum block_allocator::allocation_strategy strategies[] = {
-        block_allocator::BA_STRATEGY_FIRST_FIT,
-        block_allocator::BA_STRATEGY_BEST_FIT,
-        block_allocator::BA_STRATEGY_PADDED_FIT,
-        block_allocator::BA_STRATEGY_HEAT_ZONE,
-    };
-    for (size_t i = 0; i < sizeof(strategies) / sizeof(strategies[0]); i++) {
-        test_ba0(strategies[i]);
-        test_ba1(strategies[i], 0);
-        test_ba1(strategies[i], 10);
-        test_ba1(strategies[i], 20);
-    }
+int test_main(int argc __attribute__((__unused__)),
+              const char *argv[] __attribute__((__unused__))) {
+    test_ba0();
+    test_ba1(0);
+    test_ba1(10);
+    test_ba1(20);
     test_ba2();
     return 0;
 }
diff --git a/storage/tokudb/PerconaFT/ft/tests/cachetable-5978.cc b/storage/tokudb/PerconaFT/ft/tests/cachetable-5978.cc
index a7c48ef709a..ee68ab3ef0b 100644
--- a/storage/tokudb/PerconaFT/ft/tests/cachetable-5978.cc
+++ b/storage/tokudb/PerconaFT/ft/tests/cachetable-5978.cc
@@ -45,7 +45,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 // #5978 is fixed. Here is what we do. We have four pairs with
 // blocknums and fullhashes of 1,2,3,4. The cachetable has only
 // two bucket mutexes, so 1 and 3 share a pair mutex, as do 2 and 4.
-// We pin all four with expensive write locks. Then, on backgroud threads,
+// We pin all four with expensive write locks. Then, on background threads,
 // we call get_and_pin_nonblocking on 3, where the unlockers unpins 2, and
 // we call get_and_pin_nonblocking on 4, where the unlockers unpins 1. Run this
 // enough times, and we should see a deadlock before the fix, and no deadlock
diff --git a/storage/tokudb/PerconaFT/ft/tests/cachetable-simple-clone2.cc b/storage/tokudb/PerconaFT/ft/tests/cachetable-simple-clone2.cc
index be4bae898be..51cf70c3e76 100644
--- a/storage/tokudb/PerconaFT/ft/tests/cachetable-simple-clone2.cc
+++ b/storage/tokudb/PerconaFT/ft/tests/cachetable-simple-clone2.cc
@@ -77,7 +77,7 @@ flush (
 
 //
 // test the following things for simple cloning:
-//  - verifies that after teh checkpoint ends, the PAIR is properly 
+//  - verifies that after the checkpoint ends, the PAIR is properly
 //     dirty or clean based on the second unpin
 //
 static void
diff --git a/storage/tokudb/PerconaFT/ft/tests/ft-bfe-query.cc b/storage/tokudb/PerconaFT/ft/tests/ft-bfe-query.cc
index cb03a23e0fc..7abd2267a7e 100644
--- a/storage/tokudb/PerconaFT/ft/tests/ft-bfe-query.cc
+++ b/storage/tokudb/PerconaFT/ft/tests/ft-bfe-query.cc
@@ -38,69 +38,72 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 
 #include "test.h"
 
-static  int
-int64_key_cmp (DB *db UU(), const DBT *a, const DBT *b) {
-    int64_t x = *(int64_t *) a->data;
-    int64_t y = *(int64_t *) b->data;
-
-    if (x<y) return -1;
-    if (x>y) return 1;
+static int int64_key_cmp(DB *db UU(), const DBT *a, const DBT *b) {
+    int64_t x = *(int64_t *)a->data;
+    int64_t y = *(int64_t *)b->data;
+
+    if (x < y)
+        return -1;
+    if (x > y)
+        return 1;
     return 0;
 }
 
-static void
-test_prefetch_read(int fd, FT_HANDLE UU(ft), FT ft_h) {
+static void test_prefetch_read(int fd, FT_HANDLE UU(ft), FT ft_h) {
     int r;
     FT_CURSOR XMALLOC(cursor);
     FTNODE dn = NULL;
     PAIR_ATTR attr;
-    
+
     // first test that prefetching everything should work
-    memset(&cursor->range_lock_left_key, 0 , sizeof(DBT));
-    memset(&cursor->range_lock_right_key, 0 , sizeof(DBT));
+    memset(&cursor->range_lock_left_key, 0, sizeof(DBT));
+    memset(&cursor->range_lock_right_key, 0, sizeof(DBT));
     cursor->left_is_neg_infty = true;
     cursor->right_is_pos_infty = true;
     cursor->disable_prefetching = false;
-    
+
     ftnode_fetch_extra bfe;
 
     // quick test to see that we have the right behavior when we set
     // disable_prefetching to true
     cursor->disable_prefetching = true;
-    bfe.create_for_prefetch( ft_h, cursor);
+    bfe.create_for_prefetch(ft_h, cursor);
     FTNODE_DISK_DATA ndd = NULL;
-    r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe);
-    assert(r==0);
-    assert(dn->n_children == 3);
-    assert(BP_STATE(dn,0) == PT_ON_DISK);
-    assert(BP_STATE(dn,1) == PT_ON_DISK);
-    assert(BP_STATE(dn,2) == PT_ON_DISK);
+    r = toku_deserialize_ftnode_from(
+        fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe);
+    invariant(r == 0);
+    invariant(dn->n_children == 3);
+    invariant(BP_STATE(dn, 0) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 1) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 2) == PT_ON_DISK);
     r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr);
-    assert(BP_STATE(dn,0) == PT_ON_DISK);
-    assert(BP_STATE(dn,1) == PT_ON_DISK);
-    assert(BP_STATE(dn,2) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 0) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 1) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 2) == PT_ON_DISK);
     bfe.destroy();
     toku_ftnode_free(&dn);
     toku_free(ndd);
 
     // now enable prefetching again
     cursor->disable_prefetching = false;
-    
-    bfe.create_for_prefetch( ft_h, cursor);
-    r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe);
-    assert(r==0);
-    assert(dn->n_children == 3);
-    assert(BP_STATE(dn,0) == PT_AVAIL);
-    assert(BP_STATE(dn,1) == PT_AVAIL);
-    assert(BP_STATE(dn,2) == PT_AVAIL);
-    toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
-    assert(BP_STATE(dn,0) == PT_COMPRESSED);
-    assert(BP_STATE(dn,1) == PT_COMPRESSED);
-    assert(BP_STATE(dn,2) == PT_COMPRESSED);
+
+    bfe.create_for_prefetch(ft_h, cursor);
+    r = toku_deserialize_ftnode_from(
+        fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe);
+    invariant(r == 0);
+    invariant(dn->n_children == 3);
+    invariant(BP_STATE(dn, 0) == PT_AVAIL);
+    invariant(BP_STATE(dn, 1) == PT_AVAIL);
+    invariant(BP_STATE(dn, 2) == PT_AVAIL);
+    toku_ftnode_pe_callback(
+        dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
+    invariant(BP_STATE(dn, 0) == PT_COMPRESSED);
+    invariant(BP_STATE(dn, 1) == PT_COMPRESSED);
+    invariant(BP_STATE(dn, 2) == PT_COMPRESSED);
     r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr);
-    assert(BP_STATE(dn,0) == PT_AVAIL);
-    assert(BP_STATE(dn,1) == PT_AVAIL);
-    assert(BP_STATE(dn,2) == PT_AVAIL);
+    invariant(BP_STATE(dn, 0) == PT_AVAIL);
+    invariant(BP_STATE(dn, 1) == PT_AVAIL);
+    invariant(BP_STATE(dn, 2) == PT_AVAIL);
     bfe.destroy();
     toku_ftnode_free(&dn);
     toku_free(ndd);
@@ -108,21 +111,23 @@ test_prefetch_read(int fd, FT_HANDLE UU(ft), FT ft_h) {
     uint64_t left_key = 150;
     toku_fill_dbt(&cursor->range_lock_left_key, &left_key, sizeof(uint64_t));
     cursor->left_is_neg_infty = false;
-    bfe.create_for_prefetch( ft_h, cursor);
-    r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe);
-    assert(r==0);
-    assert(dn->n_children == 3);
-    assert(BP_STATE(dn,0) == PT_ON_DISK);
-    assert(BP_STATE(dn,1) == PT_AVAIL);
-    assert(BP_STATE(dn,2) == PT_AVAIL);
-    toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
-    assert(BP_STATE(dn,0) == PT_ON_DISK);
-    assert(BP_STATE(dn,1) == PT_COMPRESSED);
-    assert(BP_STATE(dn,2) == PT_COMPRESSED);
+    bfe.create_for_prefetch(ft_h, cursor);
+    r = toku_deserialize_ftnode_from(
+        fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe);
+    invariant(r == 0);
+    invariant(dn->n_children == 3);
+    invariant(BP_STATE(dn, 0) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 1) == PT_AVAIL);
+    invariant(BP_STATE(dn, 2) == PT_AVAIL);
+    toku_ftnode_pe_callback(
+        dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
+    invariant(BP_STATE(dn, 0) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 1) == PT_COMPRESSED);
+    invariant(BP_STATE(dn, 2) == PT_COMPRESSED);
     r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr);
-    assert(BP_STATE(dn,0) == PT_ON_DISK);
-    assert(BP_STATE(dn,1) == PT_AVAIL);
-    assert(BP_STATE(dn,2) == PT_AVAIL);
+    invariant(BP_STATE(dn, 0) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 1) == PT_AVAIL);
+    invariant(BP_STATE(dn, 2) == PT_AVAIL);
     bfe.destroy();
     toku_ftnode_free(&dn);
     toku_free(ndd);
@@ -130,63 +135,69 @@ test_prefetch_read(int fd, FT_HANDLE UU(ft), FT ft_h) {
     uint64_t right_key = 151;
     toku_fill_dbt(&cursor->range_lock_right_key, &right_key, sizeof(uint64_t));
     cursor->right_is_pos_infty = false;
-    bfe.create_for_prefetch( ft_h, cursor);
-    r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe);
-    assert(r==0);
-    assert(dn->n_children == 3);
-    assert(BP_STATE(dn,0) == PT_ON_DISK);
-    assert(BP_STATE(dn,1) == PT_AVAIL);
-    assert(BP_STATE(dn,2) == PT_ON_DISK);
-    toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
-    assert(BP_STATE(dn,0) == PT_ON_DISK);
-    assert(BP_STATE(dn,1) == PT_COMPRESSED);
-    assert(BP_STATE(dn,2) == PT_ON_DISK);
+    bfe.create_for_prefetch(ft_h, cursor);
+    r = toku_deserialize_ftnode_from(
+        fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe);
+    invariant(r == 0);
+    invariant(dn->n_children == 3);
+    invariant(BP_STATE(dn, 0) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 1) == PT_AVAIL);
+    invariant(BP_STATE(dn, 2) == PT_ON_DISK);
+    toku_ftnode_pe_callback(
+        dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
+    invariant(BP_STATE(dn, 0) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 1) == PT_COMPRESSED);
+    invariant(BP_STATE(dn, 2) == PT_ON_DISK);
     r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr);
-    assert(BP_STATE(dn,0) == PT_ON_DISK);
-    assert(BP_STATE(dn,1) == PT_AVAIL);
-    assert(BP_STATE(dn,2) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 0) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 1) == PT_AVAIL);
+    invariant(BP_STATE(dn, 2) == PT_ON_DISK);
     bfe.destroy();
     toku_ftnode_free(&dn);
     toku_free(ndd);
 
     left_key = 100000;
     right_key = 100000;
-    bfe.create_for_prefetch( ft_h, cursor);
-    r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe);
-    assert(r==0);
-    assert(dn->n_children == 3);
-    assert(BP_STATE(dn,0) == PT_ON_DISK);
-    assert(BP_STATE(dn,1) == PT_ON_DISK);
-    assert(BP_STATE(dn,2) == PT_AVAIL);
-    toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
-    assert(BP_STATE(dn,0) == PT_ON_DISK);
-    assert(BP_STATE(dn,1) == PT_ON_DISK);
-    assert(BP_STATE(dn,2) == PT_COMPRESSED);
+    bfe.create_for_prefetch(ft_h, cursor);
+    r = toku_deserialize_ftnode_from(
+        fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe);
+    invariant(r == 0);
+    invariant(dn->n_children == 3);
+    invariant(BP_STATE(dn, 0) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 1) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 2) == PT_AVAIL);
+    toku_ftnode_pe_callback(
+        dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
+    invariant(BP_STATE(dn, 0) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 1) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 2) == PT_COMPRESSED);
     r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr);
-    assert(BP_STATE(dn,0) == PT_ON_DISK);
-    assert(BP_STATE(dn,1) == PT_ON_DISK);
-    assert(BP_STATE(dn,2) == PT_AVAIL);
+    invariant(BP_STATE(dn, 0) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 1) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 2) == PT_AVAIL);
     bfe.destroy();
     toku_free(ndd);
     toku_ftnode_free(&dn);
 
     left_key = 100;
     right_key = 100;
-    bfe.create_for_prefetch( ft_h, cursor);
-    r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe);
-    assert(r==0);
-    assert(dn->n_children == 3);
-    assert(BP_STATE(dn,0) == PT_AVAIL);
-    assert(BP_STATE(dn,1) == PT_ON_DISK);
-    assert(BP_STATE(dn,2) == PT_ON_DISK);
-    toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
-    assert(BP_STATE(dn,0) == PT_COMPRESSED);
-    assert(BP_STATE(dn,1) == PT_ON_DISK);
-    assert(BP_STATE(dn,2) == PT_ON_DISK);
+    bfe.create_for_prefetch(ft_h, cursor);
+    r = toku_deserialize_ftnode_from(
+        fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe);
+    invariant(r == 0);
+    invariant(dn->n_children == 3);
+    invariant(BP_STATE(dn, 0) == PT_AVAIL);
+    invariant(BP_STATE(dn, 1) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 2) == PT_ON_DISK);
+    toku_ftnode_pe_callback(
+        dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
+    invariant(BP_STATE(dn, 0) == PT_COMPRESSED);
+    invariant(BP_STATE(dn, 1) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 2) == PT_ON_DISK);
     r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr);
-    assert(BP_STATE(dn,0) == PT_AVAIL);
-    assert(BP_STATE(dn,1) == PT_ON_DISK);
-    assert(BP_STATE(dn,2) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 0) == PT_AVAIL);
+    invariant(BP_STATE(dn, 1) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 2) == PT_ON_DISK);
     bfe.destroy();
     toku_ftnode_free(&dn);
     toku_free(ndd);
@@ -194,20 +205,19 @@ test_prefetch_read(int fd, FT_HANDLE UU(ft), FT ft_h) {
     toku_free(cursor);
 }
 
-static void
-test_subset_read(int fd, FT_HANDLE UU(ft), FT ft_h) {
+static void test_subset_read(int fd, FT_HANDLE UU(ft), FT ft_h) {
     int r;
     FT_CURSOR XMALLOC(cursor);
     FTNODE dn = NULL;
     FTNODE_DISK_DATA ndd = NULL;
     PAIR_ATTR attr;
-    
+
     // first test that prefetching everything should work
-    memset(&cursor->range_lock_left_key, 0 , sizeof(DBT));
-    memset(&cursor->range_lock_right_key, 0 , sizeof(DBT));
+    memset(&cursor->range_lock_left_key, 0, sizeof(DBT));
+    memset(&cursor->range_lock_right_key, 0, sizeof(DBT));
     cursor->left_is_neg_infty = true;
     cursor->right_is_pos_infty = true;
-    
+
     uint64_t left_key = 150;
     uint64_t right_key = 151;
     DBT left, right;
@@ -216,101 +226,106 @@ test_subset_read(int fd, FT_HANDLE UU(ft), FT ft_h) {
 
     ftnode_fetch_extra bfe;
     bfe.create_for_subset_read(
-        ft_h,
-        NULL, 
-        &left,
-        &right,
-        false,
-        false,
-        false,
-        false
-        );
-    
+        ft_h, NULL, &left, &right, false, false, false, false);
+
     // fake the childnum to read
     // set disable_prefetching ON
     bfe.child_to_read = 2;
     bfe.disable_prefetching = true;
-    r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe);
-    assert(r==0);
-    assert(dn->n_children == 3);
-    assert(BP_STATE(dn,0) == PT_ON_DISK);
-    assert(BP_STATE(dn,1) == PT_ON_DISK);
-    assert(BP_STATE(dn,2) == PT_AVAIL);
-    // need to call this twice because we had a subset read before, that touched the clock
-    toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
-    assert(BP_STATE(dn,0) == PT_ON_DISK);
-    assert(BP_STATE(dn,1) == PT_ON_DISK);
-    assert(BP_STATE(dn,2) == PT_AVAIL);
-    toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
-    assert(BP_STATE(dn,0) == PT_ON_DISK);
-    assert(BP_STATE(dn,1) == PT_ON_DISK);
-    assert(BP_STATE(dn,2) == PT_COMPRESSED);
+    r = toku_deserialize_ftnode_from(
+        fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe);
+    invariant(r == 0);
+    invariant(dn->n_children == 3);
+    invariant(BP_STATE(dn, 0) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 1) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 2) == PT_AVAIL);
+    // need to call this twice because we had a subset read before, that touched
+    // the clock
+    toku_ftnode_pe_callback(
+        dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
+    invariant(BP_STATE(dn, 0) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 1) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 2) == PT_AVAIL);
+    toku_ftnode_pe_callback(
+        dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
+    invariant(BP_STATE(dn, 0) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 1) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 2) == PT_COMPRESSED);
     r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr);
-    assert(BP_STATE(dn,0) == PT_ON_DISK);
-    assert(BP_STATE(dn,1) == PT_ON_DISK);
-    assert(BP_STATE(dn,2) == PT_AVAIL);
+    invariant(BP_STATE(dn, 0) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 1) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 2) == PT_AVAIL);
     toku_ftnode_free(&dn);
     toku_free(ndd);
 
     // fake the childnum to read
     bfe.child_to_read = 2;
     bfe.disable_prefetching = false;
-    r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe);
-    assert(r==0);
-    assert(dn->n_children == 3);
-    assert(BP_STATE(dn,0) == PT_ON_DISK);
-    assert(BP_STATE(dn,1) == PT_AVAIL);
-    assert(BP_STATE(dn,2) == PT_AVAIL);
-    // need to call this twice because we had a subset read before, that touched the clock
-    toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
-    assert(BP_STATE(dn,0) == PT_ON_DISK);
-    assert(BP_STATE(dn,1) == PT_COMPRESSED);
-    assert(BP_STATE(dn,2) == PT_AVAIL);
-    toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
-    assert(BP_STATE(dn,0) == PT_ON_DISK);
-    assert(BP_STATE(dn,1) == PT_COMPRESSED);
-    assert(BP_STATE(dn,2) == PT_COMPRESSED);
+    r = toku_deserialize_ftnode_from(
+        fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe);
+    invariant(r == 0);
+    invariant(dn->n_children == 3);
+    invariant(BP_STATE(dn, 0) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 1) == PT_AVAIL);
+    invariant(BP_STATE(dn, 2) == PT_AVAIL);
+    // need to call this twice because we had a subset read before, that touched
+    // the clock
+    toku_ftnode_pe_callback(
+        dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
+    invariant(BP_STATE(dn, 0) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 1) == PT_COMPRESSED);
+    invariant(BP_STATE(dn, 2) == PT_AVAIL);
+    toku_ftnode_pe_callback(
+        dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
+    invariant(BP_STATE(dn, 0) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 1) == PT_COMPRESSED);
+    invariant(BP_STATE(dn, 2) == PT_COMPRESSED);
     r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr);
-    assert(BP_STATE(dn,0) == PT_ON_DISK);
-    assert(BP_STATE(dn,1) == PT_AVAIL);
-    assert(BP_STATE(dn,2) == PT_AVAIL);
+    invariant(BP_STATE(dn, 0) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 1) == PT_AVAIL);
+    invariant(BP_STATE(dn, 2) == PT_AVAIL);
     toku_ftnode_free(&dn);
     toku_free(ndd);
 
     // fake the childnum to read
     bfe.child_to_read = 0;
-    r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe);
-    assert(r==0);
-    assert(dn->n_children == 3);
-    assert(BP_STATE(dn,0) == PT_AVAIL);
-    assert(BP_STATE(dn,1) == PT_AVAIL);
-    assert(BP_STATE(dn,2) == PT_ON_DISK);
-    // need to call this twice because we had a subset read before, that touched the clock
-    toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
-    assert(BP_STATE(dn,0) == PT_AVAIL);
-    assert(BP_STATE(dn,1) == PT_COMPRESSED);
-    assert(BP_STATE(dn,2) == PT_ON_DISK);
-    toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
-    assert(BP_STATE(dn,0) == PT_COMPRESSED);
-    assert(BP_STATE(dn,1) == PT_COMPRESSED);
-    assert(BP_STATE(dn,2) == PT_ON_DISK);
+    r = toku_deserialize_ftnode_from(
+        fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe);
+    invariant(r == 0);
+    invariant(dn->n_children == 3);
+    invariant(BP_STATE(dn, 0) == PT_AVAIL);
+    invariant(BP_STATE(dn, 1) == PT_AVAIL);
+    invariant(BP_STATE(dn, 2) == PT_ON_DISK);
+    // need to call this twice because we had a subset read before, that touched
+    // the clock
+    toku_ftnode_pe_callback(
+        dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
+    invariant(BP_STATE(dn, 0) == PT_AVAIL);
+    invariant(BP_STATE(dn, 1) == PT_COMPRESSED);
+    invariant(BP_STATE(dn, 2) == PT_ON_DISK);
+    toku_ftnode_pe_callback(
+        dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
+    invariant(BP_STATE(dn, 0) == PT_COMPRESSED);
+    invariant(BP_STATE(dn, 1) == PT_COMPRESSED);
+    invariant(BP_STATE(dn, 2) == PT_ON_DISK);
     r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr);
-    assert(BP_STATE(dn,0) == PT_AVAIL);
-    assert(BP_STATE(dn,1) == PT_AVAIL);
-    assert(BP_STATE(dn,2) == PT_ON_DISK);
+    invariant(BP_STATE(dn, 0) == PT_AVAIL);
+    invariant(BP_STATE(dn, 1) == PT_AVAIL);
+    invariant(BP_STATE(dn, 2) == PT_ON_DISK);
     toku_ftnode_free(&dn);
     toku_free(ndd);
 
     toku_free(cursor);
 }
 
-
-static void
-test_prefetching(void) {
+static void test_prefetching(void) {
     //    struct ft_handle source_ft;
     struct ftnode sn;
 
-    int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
+    int fd = open(TOKU_TEST_FILENAME,
+                  O_RDWR | O_CREAT | O_BINARY,
+                  S_IRWXU | S_IRWXG | S_IRWXO);
+    invariant(fd >= 0);
 
     int r;
 
@@ -327,7 +342,7 @@ test_prefetching(void) {
 
     uint64_t key1 = 100;
     uint64_t key2 = 200;
-    
+
     MALLOC_N(sn.n_children, sn.bp);
     DBT pivotkeys[2];
     toku_fill_dbt(&pivotkeys[0], &key1, sizeof(key1));
@@ -336,13 +351,13 @@ test_prefetching(void) {
     BP_BLOCKNUM(&sn, 0).b = 30;
     BP_BLOCKNUM(&sn, 1).b = 35;
     BP_BLOCKNUM(&sn, 2).b = 40;
-    BP_STATE(&sn,0) = PT_AVAIL;
-    BP_STATE(&sn,1) = PT_AVAIL;
-    BP_STATE(&sn,2) = PT_AVAIL;
+    BP_STATE(&sn, 0) = PT_AVAIL;
+    BP_STATE(&sn, 1) = PT_AVAIL;
+    BP_STATE(&sn, 2) = PT_AVAIL;
     set_BNC(&sn, 0, toku_create_empty_nl());
     set_BNC(&sn, 1, toku_create_empty_nl());
     set_BNC(&sn, 2, toku_create_empty_nl());
-    //Create XIDS
+    // Create XIDS
     XIDS xids_0 = toku_xids_get_root_xids();
     XIDS xids_123;
     XIDS xids_234;
@@ -352,7 +367,7 @@ test_prefetching(void) {
     CKERR(r);
 
     // data in the buffers does not matter in this test
-    //Cleanup:
+    // Cleanup:
     toku_xids_destroy(&xids_0);
     toku_xids_destroy(&xids_123);
     toku_xids_destroy(&xids_234);
@@ -363,41 +378,48 @@ test_prefetching(void) {
                  make_blocknum(0),
                  ZERO_LSN,
                  TXNID_NONE,
-                 4*1024*1024,
-                 128*1024,
+                 4 * 1024 * 1024,
+                 128 * 1024,
                  TOKU_DEFAULT_COMPRESSION_METHOD,
                  16);
     ft_h->cmp.create(int64_key_cmp, nullptr);
     ft->ft = ft_h;
     ft_h->blocktable.create();
-    { int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); }
-    //Want to use block #20
+    {
+        int r_truncate = ftruncate(fd, 0);
+        CKERR(r_truncate);
+    }
+    // Want to use block #20
     BLOCKNUM b = make_blocknum(0);
     while (b.b < 20) {
         ft_h->blocktable.allocate_blocknum(&b, ft_h);
     }
-    assert(b.b == 20);
+    invariant(b.b == 20);
 
     {
         DISKOFF offset;
         DISKOFF size;
-        ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0);
-        assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+        ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false);
+        invariant(offset ==
+               (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
 
         ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size);
-        assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
-        assert(size   == 100);
+        invariant(offset ==
+               (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+        invariant(size == 100);
     }
     FTNODE_DISK_DATA ndd = NULL;
-    r = toku_serialize_ftnode_to(fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false);
-    assert(r==0);
+    r = toku_serialize_ftnode_to(
+        fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false);
+    invariant(r == 0);
 
-    test_prefetch_read(fd, ft, ft_h);    
+    test_prefetch_read(fd, ft, ft_h);
     test_subset_read(fd, ft, ft_h);
 
     toku_destroy_ftnode_internals(&sn);
 
-    ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+    ft_h->blocktable.block_free(
+        BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100);
     ft_h->blocktable.destroy();
     ft_h->cmp.destroy();
     toku_free(ft_h->h);
@@ -405,11 +427,12 @@ test_prefetching(void) {
     toku_free(ft);
     toku_free(ndd);
 
-    r = close(fd); assert(r != -1);
+    r = close(fd);
+    invariant(r != -1);
 }
 
-int
-test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) {
+int test_main(int argc __attribute__((__unused__)),
+              const char *argv[] __attribute__((__unused__))) {
     test_prefetching();
 
     return 0;
diff --git a/storage/tokudb/PerconaFT/ft/tests/ft-clock-test.cc b/storage/tokudb/PerconaFT/ft/tests/ft-clock-test.cc
index ceef3772e2a..26a3dae673c 100644
--- a/storage/tokudb/PerconaFT/ft/tests/ft-clock-test.cc
+++ b/storage/tokudb/PerconaFT/ft/tests/ft-clock-test.cc
@@ -40,38 +40,28 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 
 #include "ft/cursor.h"
 
-enum ftnode_verify_type {
-    read_all=1,
-    read_compressed,
-    read_none
-};
+enum ftnode_verify_type { read_all = 1, read_compressed, read_none };
 
 #ifndef MIN
 #define MIN(x, y) (((x) < (y)) ? (x) : (y))
 #endif
 
-static int
-string_key_cmp(DB *UU(e), const DBT *a, const DBT *b)
-{
+static int string_key_cmp(DB *UU(e), const DBT *a, const DBT *b) {
     char *CAST_FROM_VOIDP(s, a->data);
     char *CAST_FROM_VOIDP(t, b->data);
     return strcmp(s, t);
 }
 
-static void
-le_add_to_bn(bn_data* bn, uint32_t idx, const  char *key, int keylen, const char *val, int vallen)
-{
+static void le_add_to_bn(bn_data *bn,
+                         uint32_t idx,
+                         const char *key,
+                         int keylen,
+                         const char *val,
+                         int vallen) {
     LEAFENTRY r = NULL;
     uint32_t size_needed = LE_CLEAN_MEMSIZE(vallen);
     void *maybe_free = nullptr;
-    bn->get_space_for_insert(
-        idx, 
-        key,
-        keylen,
-        size_needed,
-        &r,
-        &maybe_free
-        );
+    bn->get_space_for_insert(idx, key, keylen, size_needed, &r, &maybe_free);
     if (maybe_free) {
         toku_free(maybe_free);
     }
@@ -81,70 +71,67 @@ le_add_to_bn(bn_data* bn, uint32_t idx, const  char *key, int keylen, const char
     memcpy(r->u.clean.val, val, vallen);
 }
 
-
-static void
-le_malloc(bn_data* bn, uint32_t idx, const char *key, const char *val)
-{
+static void le_malloc(bn_data *bn,
+                      uint32_t idx,
+                      const char *key,
+                      const char *val) {
     int keylen = strlen(key) + 1;
     int vallen = strlen(val) + 1;
     le_add_to_bn(bn, idx, key, keylen, val, vallen);
 }
 
-
-static void
-test1(int fd, FT ft_h, FTNODE *dn) {
+static void test1(int fd, FT ft_h, FTNODE *dn) {
     int r;
     ftnode_fetch_extra bfe_all;
     bfe_all.create_for_full_read(ft_h);
     FTNODE_DISK_DATA ndd = NULL;
-    r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, &ndd, &bfe_all);
+    r = toku_deserialize_ftnode_from(
+        fd, make_blocknum(20), 0 /*pass zero for hash*/, dn, &ndd, &bfe_all);
     bool is_leaf = ((*dn)->height == 0);
-    assert(r==0);
+    invariant(r == 0);
     for (int i = 0; i < (*dn)->n_children; i++) {
-        assert(BP_STATE(*dn,i) == PT_AVAIL);
+        invariant(BP_STATE(*dn, i) == PT_AVAIL);
     }
     // should sweep and NOT get rid of anything
     PAIR_ATTR attr;
-    memset(&attr,0,sizeof(attr));
+    memset(&attr, 0, sizeof(attr));
     toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr);
     for (int i = 0; i < (*dn)->n_children; i++) {
-        assert(BP_STATE(*dn,i) == PT_AVAIL);
+        invariant(BP_STATE(*dn, i) == PT_AVAIL);
     }
     // should sweep and get compress all
     toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr);
     for (int i = 0; i < (*dn)->n_children; i++) {
         if (!is_leaf) {
-            assert(BP_STATE(*dn,i) == PT_COMPRESSED);
-        }
-        else {
-            assert(BP_STATE(*dn,i) == PT_ON_DISK);
+            invariant(BP_STATE(*dn, i) == PT_COMPRESSED);
+        } else {
+            invariant(BP_STATE(*dn, i) == PT_ON_DISK);
         }
     }
     PAIR_ATTR size;
     bool req = toku_ftnode_pf_req_callback(*dn, &bfe_all);
-    assert(req);
+    invariant(req);
     toku_ftnode_pf_callback(*dn, ndd, &bfe_all, fd, &size);
     toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr);
     for (int i = 0; i < (*dn)->n_children; i++) {
-        assert(BP_STATE(*dn,i) == PT_AVAIL);
+        invariant(BP_STATE(*dn, i) == PT_AVAIL);
     }
     // should sweep and get compress all
     toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr);
     for (int i = 0; i < (*dn)->n_children; i++) {
         if (!is_leaf) {
-            assert(BP_STATE(*dn,i) == PT_COMPRESSED);
-        }
-        else {
-            assert(BP_STATE(*dn,i) == PT_ON_DISK);
+            invariant(BP_STATE(*dn, i) == PT_COMPRESSED);
+        } else {
+            invariant(BP_STATE(*dn, i) == PT_ON_DISK);
         }
-    }    
+    }
 
     req = toku_ftnode_pf_req_callback(*dn, &bfe_all);
-    assert(req);
+    invariant(req);
     toku_ftnode_pf_callback(*dn, ndd, &bfe_all, fd, &size);
     toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr);
     for (int i = 0; i < (*dn)->n_children; i++) {
-        assert(BP_STATE(*dn,i) == PT_AVAIL);
+        invariant(BP_STATE(*dn, i) == PT_AVAIL);
     }
     (*dn)->dirty = 1;
     toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr);
@@ -152,101 +139,102 @@ test1(int fd, FT ft_h, FTNODE *dn) {
     toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr);
     toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr);
     for (int i = 0; i < (*dn)->n_children; i++) {
-        assert(BP_STATE(*dn,i) == PT_AVAIL);
+        invariant(BP_STATE(*dn, i) == PT_AVAIL);
     }
     toku_free(ndd);
     toku_ftnode_free(dn);
 }
 
-
-static int search_cmp(const struct ft_search& UU(so), const DBT* UU(key)) {
+static int search_cmp(const struct ft_search &UU(so), const DBT *UU(key)) {
     return 0;
 }
 
-static void
-test2(int fd, FT ft_h, FTNODE *dn) {
+static void test2(int fd, FT ft_h, FTNODE *dn) {
     DBT left, right;
     DB dummy_db;
     memset(&dummy_db, 0, sizeof(dummy_db));
     memset(&left, 0, sizeof(left));
     memset(&right, 0, sizeof(right));
     ft_search search;
-    
+
     ftnode_fetch_extra bfe_subset;
     bfe_subset.create_for_subset_read(
         ft_h,
-        ft_search_init(&search, search_cmp, FT_SEARCH_LEFT, nullptr, nullptr, nullptr),
+        ft_search_init(
+            &search, search_cmp, FT_SEARCH_LEFT, nullptr, nullptr, nullptr),
         &left,
         &right,
         true,
         true,
         false,
-        false
-        );
+        false);
 
     FTNODE_DISK_DATA ndd = NULL;
-    int r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, &ndd, &bfe_subset);
-    assert(r==0);
+    int r = toku_deserialize_ftnode_from(
+        fd, make_blocknum(20), 0 /*pass zero for hash*/, dn, &ndd, &bfe_subset);
+    invariant(r == 0);
     bool is_leaf = ((*dn)->height == 0);
-    // at this point, although both partitions are available, only the 
+    // at this point, although both partitions are available, only the
     // second basement node should have had its clock
     // touched
-    assert(BP_STATE(*dn, 0) == PT_AVAIL);
-    assert(BP_STATE(*dn, 1) == PT_AVAIL);
-    assert(BP_SHOULD_EVICT(*dn, 0));
-    assert(!BP_SHOULD_EVICT(*dn, 1));
+    invariant(BP_STATE(*dn, 0) == PT_AVAIL);
+    invariant(BP_STATE(*dn, 1) == PT_AVAIL);
+    invariant(BP_SHOULD_EVICT(*dn, 0));
+    invariant(!BP_SHOULD_EVICT(*dn, 1));
     PAIR_ATTR attr;
-    memset(&attr,0,sizeof(attr));
+    memset(&attr, 0, sizeof(attr));
     toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr);
-    assert(BP_STATE(*dn, 0) == (is_leaf) ? PT_ON_DISK : PT_COMPRESSED);
-    assert(BP_STATE(*dn, 1) == PT_AVAIL);
-    assert(BP_SHOULD_EVICT(*dn, 1));
+    invariant(BP_STATE(*dn, 0) == (is_leaf ? PT_ON_DISK : PT_COMPRESSED));  // '==' binds tighter than '?:'
+    invariant(BP_STATE(*dn, 1) == PT_AVAIL);
+    invariant(BP_SHOULD_EVICT(*dn, 1));
     toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr);
-    assert(BP_STATE(*dn, 1) == (is_leaf) ? PT_ON_DISK : PT_COMPRESSED);
+    invariant(BP_STATE(*dn, 1) == (is_leaf ? PT_ON_DISK : PT_COMPRESSED));  // '==' binds tighter than '?:'
 
     bool req = toku_ftnode_pf_req_callback(*dn, &bfe_subset);
-    assert(req);
+    invariant(req);
     toku_ftnode_pf_callback(*dn, ndd, &bfe_subset, fd, &attr);
-    assert(BP_STATE(*dn, 0) == PT_AVAIL);
-    assert(BP_STATE(*dn, 1) == PT_AVAIL);
-    assert(BP_SHOULD_EVICT(*dn, 0));
-    assert(!BP_SHOULD_EVICT(*dn, 1));
+    invariant(BP_STATE(*dn, 0) == PT_AVAIL);
+    invariant(BP_STATE(*dn, 1) == PT_AVAIL);
+    invariant(BP_SHOULD_EVICT(*dn, 0));
+    invariant(!BP_SHOULD_EVICT(*dn, 1));
 
     toku_free(ndd);
     toku_ftnode_free(dn);
 }
 
-static void
-test3_leaf(int fd, FT ft_h, FTNODE *dn) {
+static void test3_leaf(int fd, FT ft_h, FTNODE *dn) {
     DBT left, right;
     DB dummy_db;
     memset(&dummy_db, 0, sizeof(dummy_db));
     memset(&left, 0, sizeof(left));
     memset(&right, 0, sizeof(right));
-    
+
     ftnode_fetch_extra bfe_min;
     bfe_min.create_for_min_read(ft_h);
 
     FTNODE_DISK_DATA ndd = NULL;
-    int r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, &ndd, &bfe_min);
-    assert(r==0);
+    int r = toku_deserialize_ftnode_from(
+        fd, make_blocknum(20), 0 /*pass zero for hash*/, dn, &ndd, &bfe_min);
+    invariant(r == 0);
     //
     // make sure we have a leaf
     //
-    assert((*dn)->height == 0);
+    invariant((*dn)->height == 0);
     for (int i = 0; i < (*dn)->n_children; i++) {
-        assert(BP_STATE(*dn, i) == PT_ON_DISK);
+        invariant(BP_STATE(*dn, i) == PT_ON_DISK);
     }
     toku_ftnode_free(dn);
     toku_free(ndd);
 }
 
-static void
-test_serialize_nonleaf(void) {
+static void test_serialize_nonleaf(void) {
     //    struct ft_handle source_ft;
     struct ftnode sn, *dn;
 
-    int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
+    int fd = open(TOKU_TEST_FILENAME,
+                  O_RDWR | O_CREAT | O_BINARY,
+                  S_IRWXU | S_IRWXG | S_IRWXO);
+    invariant(fd >= 0);
 
     int r;
 
@@ -265,11 +253,11 @@ test_serialize_nonleaf(void) {
     sn.pivotkeys.create_from_dbts(toku_fill_dbt(&pivotkey, "hello", 6), 1);
     BP_BLOCKNUM(&sn, 0).b = 30;
     BP_BLOCKNUM(&sn, 1).b = 35;
-    BP_STATE(&sn,0) = PT_AVAIL;
-    BP_STATE(&sn,1) = PT_AVAIL;
+    BP_STATE(&sn, 0) = PT_AVAIL;
+    BP_STATE(&sn, 1) = PT_AVAIL;
     set_BNC(&sn, 0, toku_create_empty_nl());
     set_BNC(&sn, 1, toku_create_empty_nl());
-    //Create XIDS
+    // Create XIDS
     XIDS xids_0 = toku_xids_get_root_xids();
     XIDS xids_123;
     XIDS xids_234;
@@ -281,11 +269,38 @@ test_serialize_nonleaf(void) {
     toku::comparator cmp;
     cmp.create(string_key_cmp, nullptr);
 
-    toku_bnc_insert_msg(BNC(&sn, 0), "a", 2, "aval", 5, FT_NONE, next_dummymsn(), xids_0, true, cmp);
-    toku_bnc_insert_msg(BNC(&sn, 0), "b", 2, "bval", 5, FT_NONE, next_dummymsn(), xids_123, false, cmp);
-    toku_bnc_insert_msg(BNC(&sn, 1), "x", 2, "xval", 5, FT_NONE, next_dummymsn(), xids_234, true, cmp);
-
-    //Cleanup:
+    toku_bnc_insert_msg(BNC(&sn, 0),
+                        "a",
+                        2,
+                        "aval",
+                        5,
+                        FT_NONE,
+                        next_dummymsn(),
+                        xids_0,
+                        true,
+                        cmp);
+    toku_bnc_insert_msg(BNC(&sn, 0),
+                        "b",
+                        2,
+                        "bval",
+                        5,
+                        FT_NONE,
+                        next_dummymsn(),
+                        xids_123,
+                        false,
+                        cmp);
+    toku_bnc_insert_msg(BNC(&sn, 1),
+                        "x",
+                        2,
+                        "xval",
+                        5,
+                        FT_NONE,
+                        next_dummymsn(),
+                        xids_234,
+                        true,
+                        cmp);
+
+    // Cleanup:
     toku_xids_destroy(&xids_0);
     toku_xids_destroy(&xids_123);
     toku_xids_destroy(&xids_234);
@@ -297,35 +312,41 @@ test_serialize_nonleaf(void) {
                  make_blocknum(0),
                  ZERO_LSN,
                  TXNID_NONE,
-                 4*1024*1024,
-                 128*1024,
+                 4 * 1024 * 1024,
+                 128 * 1024,
                  TOKU_DEFAULT_COMPRESSION_METHOD,
                  16);
     ft_h->cmp.create(string_key_cmp, nullptr);
     ft->ft = ft_h;
-    
+
     ft_h->blocktable.create();
-    { int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); }
-    //Want to use block #20
+    {
+        int r_truncate = ftruncate(fd, 0);
+        CKERR(r_truncate);
+    }
+    // Want to use block #20
     BLOCKNUM b = make_blocknum(0);
     while (b.b < 20) {
         ft_h->blocktable.allocate_blocknum(&b, ft_h);
     }
-    assert(b.b == 20);
+    invariant(b.b == 20);
 
     {
         DISKOFF offset;
         DISKOFF size;
-        ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0);
-        assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+        ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false);
+        invariant(offset ==
+               (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
 
         ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size);
-        assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
-        assert(size   == 100);
+        invariant(offset ==
+               (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+        invariant(size == 100);
     }
     FTNODE_DISK_DATA ndd = NULL;
-    r = toku_serialize_ftnode_to(fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false);
-    assert(r==0);
+    r = toku_serialize_ftnode_to(
+        fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false);
+    invariant(r == 0);
 
     test1(fd, ft_h, &dn);
     test2(fd, ft_h, &dn);
@@ -333,22 +354,26 @@ test_serialize_nonleaf(void) {
     toku_destroy_ftnode_internals(&sn);
     toku_free(ndd);
 
-    ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+    ft_h->blocktable.block_free(
+        BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100);
     ft_h->blocktable.destroy();
     toku_free(ft_h->h);
     ft_h->cmp.destroy();
     toku_free(ft_h);
     toku_free(ft);
 
-    r = close(fd); assert(r != -1);
+    r = close(fd);
+    invariant(r != -1);
 }
 
-static void
-test_serialize_leaf(void) {
+static void test_serialize_leaf(void) {
     //    struct ft_handle source_ft;
     struct ftnode sn, *dn;
 
-    int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
+    int fd = open(TOKU_TEST_FILENAME,
+                  O_RDWR | O_CREAT | O_BINARY,
+                  S_IRWXU | S_IRWXG | S_IRWXO);
+    invariant(fd >= 0);
 
     int r;
 
@@ -364,8 +389,8 @@ test_serialize_leaf(void) {
     MALLOC_N(sn.n_children, sn.bp);
     DBT pivotkey;
     sn.pivotkeys.create_from_dbts(toku_fill_dbt(&pivotkey, "b", 2), 1);
-    BP_STATE(&sn,0) = PT_AVAIL;
-    BP_STATE(&sn,1) = PT_AVAIL;
+    BP_STATE(&sn, 0) = PT_AVAIL;
+    BP_STATE(&sn, 1) = PT_AVAIL;
     set_BLB(&sn, 0, toku_create_empty_bn());
     set_BLB(&sn, 1, toku_create_empty_bn());
     le_malloc(BLB_DATA(&sn, 0), 0, "a", "aval");
@@ -378,51 +403,59 @@ test_serialize_leaf(void) {
                  make_blocknum(0),
                  ZERO_LSN,
                  TXNID_NONE,
-                 4*1024*1024,
-                 128*1024,
+                 4 * 1024 * 1024,
+                 128 * 1024,
                  TOKU_DEFAULT_COMPRESSION_METHOD,
                  16);
     ft->ft = ft_h;
-    
+
     ft_h->blocktable.create();
-    { int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); }
-    //Want to use block #20
+    {
+        int r_truncate = ftruncate(fd, 0);
+        CKERR(r_truncate);
+    }
+    // Want to use block #20
     BLOCKNUM b = make_blocknum(0);
     while (b.b < 20) {
         ft_h->blocktable.allocate_blocknum(&b, ft_h);
     }
-    assert(b.b == 20);
+    invariant(b.b == 20);
 
     {
         DISKOFF offset;
         DISKOFF size;
-        ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0);
-        assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+        ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false);
+        invariant(offset ==
+               (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
 
         ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size);
-        assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
-        assert(size   == 100);
+        invariant(offset ==
+               (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+        invariant(size == 100);
     }
     FTNODE_DISK_DATA ndd = NULL;
-    r = toku_serialize_ftnode_to(fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false);
-    assert(r==0);
+    r = toku_serialize_ftnode_to(
+        fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false);
+    invariant(r == 0);
 
     test1(fd, ft_h, &dn);
-    test3_leaf(fd, ft_h,&dn);
+    test3_leaf(fd, ft_h, &dn);
 
     toku_destroy_ftnode_internals(&sn);
 
-    ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+    ft_h->blocktable.block_free(
+        BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100);
     ft_h->blocktable.destroy();
     toku_free(ft_h->h);
     toku_free(ft_h);
     toku_free(ft);
     toku_free(ndd);
-    r = close(fd); assert(r != -1);
+    r = close(fd);
+    invariant(r != -1);
 }
 
-int
-test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) {
+int test_main(int argc __attribute__((__unused__)),
+              const char *argv[] __attribute__((__unused__))) {
     initialize_dummymsn();
     test_serialize_nonleaf();
     test_serialize_leaf();
diff --git a/storage/tokudb/PerconaFT/ft/tests/ft-serialize-benchmark.cc b/storage/tokudb/PerconaFT/ft/tests/ft-serialize-benchmark.cc
index 9828f49513c..d50488ae197 100644
--- a/storage/tokudb/PerconaFT/ft/tests/ft-serialize-benchmark.cc
+++ b/storage/tokudb/PerconaFT/ft/tests/ft-serialize-benchmark.cc
@@ -41,27 +41,21 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 #include <sys/time.h>
 #include "test.h"
 
-
-
 #ifndef MIN
 #define MIN(x, y) (((x) < (y)) ? (x) : (y))
 #endif
 const double USECS_PER_SEC = 1000000.0;
 
-static void
-le_add_to_bn(bn_data* bn, uint32_t idx, char *key, int keylen, char *val, int vallen)
-{
+static void le_add_to_bn(bn_data *bn,
+                         uint32_t idx,
+                         char *key,
+                         int keylen,
+                         char *val,
+                         int vallen) {
     LEAFENTRY r = NULL;
     uint32_t size_needed = LE_CLEAN_MEMSIZE(vallen);
     void *maybe_free = nullptr;
-    bn->get_space_for_insert(
-        idx, 
-        key,
-        keylen,
-        size_needed,
-        &r,
-        &maybe_free
-        );
+    bn->get_space_for_insert(idx, key, keylen, size_needed, &r, &maybe_free);
     if (maybe_free) {
         toku_free(maybe_free);
     }
@@ -71,20 +65,24 @@ le_add_to_bn(bn_data* bn, uint32_t idx, char *key, int keylen, char *val, int va
     memcpy(r->u.clean.val, val, vallen);
 }
 
-static int
-long_key_cmp(DB *UU(e), const DBT *a, const DBT *b)
-{
+static int long_key_cmp(DB *UU(e), const DBT *a, const DBT *b) {
     const long *CAST_FROM_VOIDP(x, a->data);
     const long *CAST_FROM_VOIDP(y, b->data);
     return (*x > *y) - (*x < *y);
 }
 
-static void
-test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int deser_runs) {
+static void test_serialize_leaf(int valsize,
+                                int nelts,
+                                double entropy,
+                                int ser_runs,
+                                int deser_runs) {
     //    struct ft_handle source_ft;
     struct ftnode *sn, *dn;
 
-    int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
+    int fd = open(TOKU_TEST_FILENAME,
+                  O_RDWR | O_CREAT | O_BINARY,
+                  S_IRWXU | S_IRWXG | S_IRWXO);
+    invariant(fd >= 0);
 
     int r;
 
@@ -102,7 +100,7 @@ test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int de
     MALLOC_N(sn->n_children, sn->bp);
     sn->pivotkeys.create_empty();
     for (int i = 0; i < sn->n_children; ++i) {
-        BP_STATE(sn,i) = PT_AVAIL;
+        BP_STATE(sn, i) = PT_AVAIL;
         set_BLB(sn, i, toku_create_empty_bn());
     }
     int nperbn = nelts / sn->n_children;
@@ -112,24 +110,19 @@ test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int de
             k = ck * nperbn + i;
             char buf[valsize];
             int c;
-            for (c = 0; c < valsize * entropy; ) {
-                int *p = (int *) &buf[c];
+            for (c = 0; c < valsize * entropy;) {
+                int *p = (int *)&buf[c];
                 *p = rand();
                 c += sizeof(*p);
             }
             memset(&buf[c], 0, valsize - c);
             le_add_to_bn(
-                BLB_DATA(sn,ck),
-                i,
-                (char *)&k, 
-                sizeof k, 
-                buf, 
-                sizeof buf
-                );
+                BLB_DATA(sn, ck), i, (char *)&k, sizeof k, buf, sizeof buf);
         }
         if (ck < 7) {
             DBT pivotkey;
-            sn->pivotkeys.insert_at(toku_fill_dbt(&pivotkey, &k, sizeof(k)), ck);
+            sn->pivotkeys.insert_at(toku_fill_dbt(&pivotkey, &k, sizeof(k)),
+                                    ck);
         }
     }
 
@@ -139,31 +132,36 @@ test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int de
                  make_blocknum(0),
                  ZERO_LSN,
                  TXNID_NONE,
-                 4*1024*1024,
-                 128*1024,
+                 4 * 1024 * 1024,
+                 128 * 1024,
                  TOKU_DEFAULT_COMPRESSION_METHOD,
                  16);
     ft_h->cmp.create(long_key_cmp, nullptr);
     ft->ft = ft_h;
-    
+
     ft_h->blocktable.create();
-    { int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); }
-    //Want to use block #20
+    {
+        int r_truncate = ftruncate(fd, 0);
+        CKERR(r_truncate);
+    }
+    // Want to use block #20
     BLOCKNUM b = make_blocknum(0);
     while (b.b < 20) {
         ft_h->blocktable.allocate_blocknum(&b, ft_h);
     }
-    assert(b.b == 20);
+    invariant(b.b == 20);
 
     {
         DISKOFF offset;
         DISKOFF size;
-        ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0);
-        assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+        ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false);
+        invariant(offset ==
+               (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
 
         ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size);
-        assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
-        assert(size   == 100);
+        invariant(offset ==
+               (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+        invariant(size == 100);
     }
 
     struct timeval total_start;
@@ -176,8 +174,9 @@ test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int de
         gettimeofday(&t[0], NULL);
         ndd = NULL;
         sn->dirty = 1;
-        r = toku_serialize_ftnode_to(fd, make_blocknum(20), sn, &ndd, true, ft->ft, false);
-        assert(r==0);
+        r = toku_serialize_ftnode_to(
+            fd, make_blocknum(20), sn, &ndd, true, ft->ft, false);
+        invariant(r == 0);
         gettimeofday(&t[1], NULL);
         total_start.tv_sec += t[0].tv_sec;
         total_start.tv_usec += t[0].tv_usec;
@@ -186,12 +185,14 @@ test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int de
         toku_free(ndd);
     }
     double dt;
-    dt = (total_end.tv_sec - total_start.tv_sec) + ((total_end.tv_usec - total_start.tv_usec) / USECS_PER_SEC);
+    dt = (total_end.tv_sec - total_start.tv_sec) +
+         ((total_end.tv_usec - total_start.tv_usec) / USECS_PER_SEC);
     dt *= 1000;
     dt /= ser_runs;
-    printf("serialize leaf(ms):   %0.05lf (average of %d runs)\n", dt, ser_runs);
+    printf(
+        "serialize leaf(ms):   %0.05lf (average of %d runs)\n", dt, ser_runs);
 
-    //reset 
+    // reset
     total_start.tv_sec = total_start.tv_usec = 0;
     total_end.tv_sec = total_end.tv_usec = 0;
 
@@ -200,8 +201,9 @@ test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int de
         bfe.create_for_full_read(ft_h);
         gettimeofday(&t[0], NULL);
         FTNODE_DISK_DATA ndd2 = NULL;
-        r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd2, &bfe);
-        assert(r==0);
+        r = toku_deserialize_ftnode_from(
+            fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd2, &bfe);
+        invariant(r == 0);
         gettimeofday(&t[1], NULL);
 
         total_start.tv_sec += t[0].tv_sec;
@@ -212,35 +214,46 @@ test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int de
         toku_ftnode_free(&dn);
         toku_free(ndd2);
     }
-    dt = (total_end.tv_sec - total_start.tv_sec) + ((total_end.tv_usec - total_start.tv_usec) / USECS_PER_SEC);
+    dt = (total_end.tv_sec - total_start.tv_sec) +
+         ((total_end.tv_usec - total_start.tv_usec) / USECS_PER_SEC);
     dt *= 1000;
     dt /= deser_runs;
-    printf("deserialize leaf(ms): %0.05lf (average of %d runs)\n", dt, deser_runs);
-    printf("io time(ms) %lf decompress time(ms) %lf deserialize time(ms) %lf (average of %d runs)\n",
-           tokutime_to_seconds(bfe.io_time)*1000,
-           tokutime_to_seconds(bfe.decompress_time)*1000,
-           tokutime_to_seconds(bfe.deserialize_time)*1000,
-           deser_runs
-           );
+    printf(
+        "deserialize leaf(ms): %0.05lf (average of %d runs)\n", dt, deser_runs);
+    printf(
+        "io time(ms) %lf decompress time(ms) %lf deserialize time(ms) %lf "
+        "(average of %d runs)\n",
+        tokutime_to_seconds(bfe.io_time) * 1000,
+        tokutime_to_seconds(bfe.decompress_time) * 1000,
+        tokutime_to_seconds(bfe.deserialize_time) * 1000,
+        deser_runs);
 
     toku_ftnode_free(&sn);
 
-    ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+    ft_h->blocktable.block_free(
+        BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100);
     ft_h->blocktable.destroy();
     ft_h->cmp.destroy();
     toku_free(ft_h->h);
     toku_free(ft_h);
     toku_free(ft);
 
-    r = close(fd); assert(r != -1);
+    r = close(fd);
+    invariant(r != -1);
 }
 
-static void
-test_serialize_nonleaf(int valsize, int nelts, double entropy, int ser_runs, int deser_runs) {
+static void test_serialize_nonleaf(int valsize,
+                                   int nelts,
+                                   double entropy,
+                                   int ser_runs,
+                                   int deser_runs) {
     //    struct ft_handle source_ft;
     struct ftnode sn, *dn;
 
-    int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
+    int fd = open(TOKU_TEST_FILENAME,
+                  O_RDWR | O_CREAT | O_BINARY,
+                  S_IRWXU | S_IRWXG | S_IRWXO);
+    invariant(fd >= 0);
 
     int r;
 
@@ -257,11 +270,11 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy, int ser_runs, int
     MALLOC_N(sn.n_children, sn.bp);
     sn.pivotkeys.create_empty();
     for (int i = 0; i < sn.n_children; ++i) {
-        BP_BLOCKNUM(&sn, i).b = 30 + (i*5);
-        BP_STATE(&sn,i) = PT_AVAIL;
+        BP_BLOCKNUM(&sn, i).b = 30 + (i * 5);
+        BP_STATE(&sn, i) = PT_AVAIL;
         set_BNC(&sn, i, toku_create_empty_nl());
     }
-    //Create XIDS
+    // Create XIDS
     XIDS xids_0 = toku_xids_get_root_xids();
     XIDS xids_123;
     r = toku_xids_create_child(xids_0, &xids_123, (TXNID)123);
@@ -276,14 +289,23 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy, int ser_runs, int
             k = ck * nperchild + i;
             char buf[valsize];
             int c;
-            for (c = 0; c < valsize * entropy; ) {
-                int *p = (int *) &buf[c];
+            for (c = 0; c < valsize * entropy;) {
+                int *p = (int *)&buf[c];
                 *p = rand();
                 c += sizeof(*p);
             }
             memset(&buf[c], 0, valsize - c);
 
-            toku_bnc_insert_msg(bnc, &k, sizeof k, buf, valsize, FT_NONE, next_dummymsn(), xids_123, true, cmp);
+            toku_bnc_insert_msg(bnc,
+                                &k,
+                                sizeof k,
+                                buf,
+                                valsize,
+                                FT_NONE,
+                                next_dummymsn(),
+                                xids_123,
+                                true,
+                                cmp);
         }
         if (ck < 7) {
             DBT pivotkey;
@@ -291,7 +313,7 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy, int ser_runs, int
         }
     }
 
-    //Cleanup:
+    // Cleanup:
     toku_xids_destroy(&xids_0);
     toku_xids_destroy(&xids_123);
     cmp.destroy();
@@ -302,65 +324,78 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy, int ser_runs, int
                  make_blocknum(0),
                  ZERO_LSN,
                  TXNID_NONE,
-                 4*1024*1024,
-                 128*1024,
+                 4 * 1024 * 1024,
+                 128 * 1024,
                  TOKU_DEFAULT_COMPRESSION_METHOD,
                  16);
     ft_h->cmp.create(long_key_cmp, nullptr);
     ft->ft = ft_h;
-    
+
     ft_h->blocktable.create();
-    { int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); }
-    //Want to use block #20
+    {
+        int r_truncate = ftruncate(fd, 0);
+        CKERR(r_truncate);
+    }
+    // Want to use block #20
     BLOCKNUM b = make_blocknum(0);
     while (b.b < 20) {
         ft_h->blocktable.allocate_blocknum(&b, ft_h);
     }
-    assert(b.b == 20);
+    invariant(b.b == 20);
 
     {
         DISKOFF offset;
         DISKOFF size;
-        ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0);
-        assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+        ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false);
+        invariant(offset ==
+               (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
 
         ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size);
-        assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
-        assert(size   == 100);
+        invariant(offset ==
+               (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+        invariant(size == 100);
     }
 
     struct timeval t[2];
     gettimeofday(&t[0], NULL);
     FTNODE_DISK_DATA ndd = NULL;
-    r = toku_serialize_ftnode_to(fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false);
-    assert(r==0);
+    r = toku_serialize_ftnode_to(
+        fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false);
+    invariant(r == 0);
     gettimeofday(&t[1], NULL);
     double dt;
-    dt = (t[1].tv_sec - t[0].tv_sec) + ((t[1].tv_usec - t[0].tv_usec) / USECS_PER_SEC);
+    dt = (t[1].tv_sec - t[0].tv_sec) +
+         ((t[1].tv_usec - t[0].tv_usec) / USECS_PER_SEC);
     dt *= 1000;
-    printf("serialize nonleaf(ms):   %0.05lf (IGNORED RUNS=%d)\n", dt, ser_runs);
+    printf(
+        "serialize nonleaf(ms):   %0.05lf (IGNORED RUNS=%d)\n", dt, ser_runs);
 
     ftnode_fetch_extra bfe;
     bfe.create_for_full_read(ft_h);
     gettimeofday(&t[0], NULL);
     FTNODE_DISK_DATA ndd2 = NULL;
-    r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd2, &bfe);
-    assert(r==0);
+    r = toku_deserialize_ftnode_from(
+        fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd2, &bfe);
+    invariant(r == 0);
     gettimeofday(&t[1], NULL);
-    dt = (t[1].tv_sec - t[0].tv_sec) + ((t[1].tv_usec - t[0].tv_usec) / USECS_PER_SEC);
+    dt = (t[1].tv_sec - t[0].tv_sec) +
+         ((t[1].tv_usec - t[0].tv_usec) / USECS_PER_SEC);
     dt *= 1000;
-    printf("deserialize nonleaf(ms): %0.05lf (IGNORED RUNS=%d)\n", dt, deser_runs);
-    printf("io time(ms) %lf decompress time(ms) %lf deserialize time(ms) %lf (IGNORED RUNS=%d)\n",
-           tokutime_to_seconds(bfe.io_time)*1000,
-           tokutime_to_seconds(bfe.decompress_time)*1000,
-           tokutime_to_seconds(bfe.deserialize_time)*1000,
-           deser_runs
-           );
+    printf(
+        "deserialize nonleaf(ms): %0.05lf (IGNORED RUNS=%d)\n", dt, deser_runs);
+    printf(
+        "io time(ms) %lf decompress time(ms) %lf deserialize time(ms) %lf "
+        "(IGNORED RUNS=%d)\n",
+        tokutime_to_seconds(bfe.io_time) * 1000,
+        tokutime_to_seconds(bfe.decompress_time) * 1000,
+        tokutime_to_seconds(bfe.deserialize_time) * 1000,
+        deser_runs);
 
     toku_ftnode_free(&dn);
     toku_destroy_ftnode_internals(&sn);
 
-    ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+    ft_h->blocktable.block_free(
+        BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100);
     ft_h->blocktable.destroy();
     toku_free(ft_h->h);
     ft_h->cmp.destroy();
@@ -369,17 +404,21 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy, int ser_runs, int
     toku_free(ndd);
     toku_free(ndd2);
 
-    r = close(fd); assert(r != -1);
+    r = close(fd);
+    invariant(r != -1);
 }
 
-int
-test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) {
+int test_main(int argc __attribute__((__unused__)),
+              const char *argv[] __attribute__((__unused__))) {
     const int DEFAULT_RUNS = 5;
     long valsize, nelts, ser_runs = DEFAULT_RUNS, deser_runs = DEFAULT_RUNS;
     double entropy = 0.3;
 
     if (argc != 3 && argc != 5) {
-        fprintf(stderr, "Usage: %s <valsize> <nelts> [<serialize_runs> <deserialize_runs>]\n", argv[0]);
+        fprintf(stderr,
+                "Usage: %s <valsize> <nelts> [<serialize_runs> "
+                "<deserialize_runs>]\n",
+                argv[0]);
         fprintf(stderr, "Default (and min) runs is %d\n", DEFAULT_RUNS);
         return 2;
     }
diff --git a/storage/tokudb/PerconaFT/ft/tests/ft-serialize-test.cc b/storage/tokudb/PerconaFT/ft/tests/ft-serialize-test.cc
index 332aaa0c170..0cddaf19651 100644
--- a/storage/tokudb/PerconaFT/ft/tests/ft-serialize-test.cc
+++ b/storage/tokudb/PerconaFT/ft/tests/ft-serialize-test.cc
@@ -39,26 +39,20 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 #include "test.h"
 #include "bndata.h"
 
-
-
 #ifndef MIN
 #define MIN(x, y) (((x) < (y)) ? (x) : (y))
 #endif
 
-static size_t
-le_add_to_bn(bn_data* bn, uint32_t idx, const  char *key, int keysize, const char *val, int valsize)
-{
+static size_t le_add_to_bn(bn_data *bn,
+                           uint32_t idx,
+                           const char *key,
+                           int keysize,
+                           const char *val,
+                           int valsize) {
     LEAFENTRY r = NULL;
     uint32_t size_needed = LE_CLEAN_MEMSIZE(valsize);
     void *maybe_free = nullptr;
-    bn->get_space_for_insert(
-        idx,
-        key,
-        keysize,
-        size_needed,
-        &r,
-        &maybe_free
-        );
+    bn->get_space_for_insert(idx, key, keysize, size_needed, &r, &maybe_free);
     if (maybe_free) {
         toku_free(maybe_free);
     }
@@ -70,16 +64,19 @@ le_add_to_bn(bn_data* bn, uint32_t idx, const  char *key, int keysize, const cha
 }
 
 class test_key_le_pair {
-    public:
+   public:
     uint32_t keylen;
-    char* keyp;
+    char *keyp;
     LEAFENTRY le;
 
     test_key_le_pair() : keylen(), keyp(), le() {}
     void init(const char *_keyp, const char *_val) {
         init(_keyp, strlen(_keyp) + 1, _val, strlen(_val) + 1);
     }
-    void init(const char * _keyp, uint32_t _keylen, const char*_val, uint32_t _vallen) {
+    void init(const char *_keyp,
+              uint32_t _keylen,
+              const char *_val,
+              uint32_t _vallen) {
         keylen = _keylen;
 
         CAST_FROM_VOIDP(le, toku_malloc(LE_CLEAN_MEMSIZE(_vallen)));
@@ -95,126 +92,144 @@ class test_key_le_pair {
     }
 };
 
-enum ftnode_verify_type {
-    read_all=1,
-    read_compressed,
-    read_none
-};
+enum ftnode_verify_type { read_all = 1, read_compressed, read_none };
 
-static int
-string_key_cmp(DB *UU(e), const DBT *a, const DBT *b)
-{
+static int string_key_cmp(DB *UU(e), const DBT *a, const DBT *b) {
     char *CAST_FROM_VOIDP(s, a->data);
     char *CAST_FROM_VOIDP(t, b->data);
     return strcmp(s, t);
 }
 
-static void
-setup_dn(enum ftnode_verify_type bft, int fd, FT ft_h, FTNODE *dn, FTNODE_DISK_DATA* ndd) {
+static void setup_dn(enum ftnode_verify_type bft,
+                     int fd,
+                     FT ft_h,
+                     FTNODE *dn,
+                     FTNODE_DISK_DATA *ndd) {
     int r;
     if (bft == read_all) {
         ftnode_fetch_extra bfe;
         bfe.create_for_full_read(ft_h);
-        r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, ndd, &bfe);
-        assert(r==0);
-    }
-    else if (bft == read_compressed || bft == read_none) {
+        r = toku_deserialize_ftnode_from(
+            fd, make_blocknum(20), 0 /*pass zero for hash*/, dn, ndd, &bfe);
+        invariant(r == 0);
+    } else if (bft == read_compressed || bft == read_none) {
         ftnode_fetch_extra bfe;
         bfe.create_for_min_read(ft_h);
-        r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, ndd, &bfe);
-        assert(r==0);
-        // assert all bp's are compressed or on disk.
+        r = toku_deserialize_ftnode_from(
+            fd, make_blocknum(20), 0 /*pass zero for hash*/, dn, ndd, &bfe);
+        invariant(r == 0);
+        // invariant all bp's are compressed or on disk.
         for (int i = 0; i < (*dn)->n_children; i++) {
-            assert(BP_STATE(*dn,i) == PT_COMPRESSED || BP_STATE(*dn, i) == PT_ON_DISK);
+            invariant(BP_STATE(*dn, i) == PT_COMPRESSED ||
+                   BP_STATE(*dn, i) == PT_ON_DISK);
         }
         // if read_none, get rid of the compressed bp's
         if (bft == read_none) {
             if ((*dn)->height == 0) {
-                toku_ftnode_pe_callback(*dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
-                // assert all bp's are on disk
+                toku_ftnode_pe_callback(*dn,
+                                        make_pair_attr(0xffffffff),
+                                        ft_h,
+                                        def_pe_finalize_impl,
+                                        nullptr);
+                // invariant all bp's are on disk
                 for (int i = 0; i < (*dn)->n_children; i++) {
                     if ((*dn)->height == 0) {
-                        assert(BP_STATE(*dn,i) == PT_ON_DISK);
-                        assert(is_BNULL(*dn, i));
-                    }
-                    else {
-                        assert(BP_STATE(*dn,i) == PT_COMPRESSED);
+                        invariant(BP_STATE(*dn, i) == PT_ON_DISK);
+                        invariant(is_BNULL(*dn, i));
+                    } else {
+                        invariant(BP_STATE(*dn, i) == PT_COMPRESSED);
                     }
                 }
-            }
-            else {
+            } else {
                 // first decompress everything, and make sure
                 // that it is available
                 // then run partial eviction to get it compressed
                 PAIR_ATTR attr;
                 bfe.create_for_full_read(ft_h);
-                assert(toku_ftnode_pf_req_callback(*dn, &bfe));
+                invariant(toku_ftnode_pf_req_callback(*dn, &bfe));
                 r = toku_ftnode_pf_callback(*dn, *ndd, &bfe, fd, &attr);
-                assert(r==0);
-                // assert all bp's are available
+                invariant(r == 0);
+                // invariant all bp's are available
                 for (int i = 0; i < (*dn)->n_children; i++) {
-                    assert(BP_STATE(*dn,i) == PT_AVAIL);
+                    invariant(BP_STATE(*dn, i) == PT_AVAIL);
                 }
-                toku_ftnode_pe_callback(*dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
+                toku_ftnode_pe_callback(*dn,
+                                        make_pair_attr(0xffffffff),
+                                        ft_h,
+                                        def_pe_finalize_impl,
+                                        nullptr);
                 for (int i = 0; i < (*dn)->n_children; i++) {
-                    // assert all bp's are still available, because we touched the clock
-                    assert(BP_STATE(*dn,i) == PT_AVAIL);
-                    // now assert all should be evicted
-                    assert(BP_SHOULD_EVICT(*dn, i));
+                    // invariant all bp's are still available, because we touched
+                    // the clock
+                    invariant(BP_STATE(*dn, i) == PT_AVAIL);
+                    // now invariant all should be evicted
+                    invariant(BP_SHOULD_EVICT(*dn, i));
                 }
-                toku_ftnode_pe_callback(*dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr);
+                toku_ftnode_pe_callback(*dn,
+                                        make_pair_attr(0xffffffff),
+                                        ft_h,
+                                        def_pe_finalize_impl,
+                                        nullptr);
                 for (int i = 0; i < (*dn)->n_children; i++) {
-                    assert(BP_STATE(*dn,i) == PT_COMPRESSED);
+                    invariant(BP_STATE(*dn, i) == PT_COMPRESSED);
                 }
             }
         }
         // now decompress them
         bfe.create_for_full_read(ft_h);
-        assert(toku_ftnode_pf_req_callback(*dn, &bfe));
+        invariant(toku_ftnode_pf_req_callback(*dn, &bfe));
         PAIR_ATTR attr;
         r = toku_ftnode_pf_callback(*dn, *ndd, &bfe, fd, &attr);
-        assert(r==0);
-        // assert all bp's are available
+        invariant(r == 0);
+        // invariant all bp's are available
         for (int i = 0; i < (*dn)->n_children; i++) {
-            assert(BP_STATE(*dn,i) == PT_AVAIL);
+            invariant(BP_STATE(*dn, i) == PT_AVAIL);
         }
         // continue on with test
-    }
-    else {
+    } else {
         // if we get here, this is a test bug, NOT a bug in development code
-        assert(false);
+        invariant(false);
     }
 }
 
-static void write_sn_to_disk(int fd, FT_HANDLE ft, FTNODE sn, FTNODE_DISK_DATA* src_ndd, bool do_clone) {
+static void write_sn_to_disk(int fd,
+                             FT_HANDLE ft,
+                             FTNODE sn,
+                             FTNODE_DISK_DATA *src_ndd,
+                             bool do_clone) {
     int r;
     if (do_clone) {
-        void* cloned_node_v = NULL;
+        void *cloned_node_v = NULL;
         PAIR_ATTR attr;
         long clone_size;
-        toku_ftnode_clone_callback(sn, &cloned_node_v, &clone_size, &attr, false, ft->ft);
+        toku_ftnode_clone_callback(
+            sn, &cloned_node_v, &clone_size, &attr, false, ft->ft);
         FTNODE CAST_FROM_VOIDP(cloned_node, cloned_node_v);
-        r = toku_serialize_ftnode_to(fd, make_blocknum(20), cloned_node, src_ndd, false, ft->ft, false);
-        assert(r==0);        
+        r = toku_serialize_ftnode_to(
+            fd, make_blocknum(20), cloned_node, src_ndd, false, ft->ft, false);
+        invariant(r == 0);
         toku_ftnode_free(&cloned_node);
-    }
-    else {
-        r = toku_serialize_ftnode_to(fd, make_blocknum(20), sn, src_ndd, true, ft->ft, false);
-        assert(r==0);
+    } else {
+        r = toku_serialize_ftnode_to(
+            fd, make_blocknum(20), sn, src_ndd, true, ft->ft, false);
+        invariant(r == 0);
     }
 }
 
-static void
-test_serialize_leaf_check_msn(enum ftnode_verify_type bft, bool do_clone) {
+static void test_serialize_leaf_check_msn(enum ftnode_verify_type bft,
+                                          bool do_clone) {
     //    struct ft_handle source_ft;
     struct ftnode sn, *dn;
 
-    int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
+    int fd = open(TOKU_TEST_FILENAME,
+                  O_RDWR | O_CREAT | O_BINARY,
+                  S_IRWXU | S_IRWXG | S_IRWXO);
+    invariant(fd >= 0);
 
     int r;
 
-#define PRESERIALIZE_MSN_ON_DISK ((MSN) { MIN_MSN.msn + 42 })
-#define POSTSERIALIZE_MSN_ON_DISK ((MSN) { MIN_MSN.msn + 84 })
+#define PRESERIALIZE_MSN_ON_DISK ((MSN){MIN_MSN.msn + 42})
+#define POSTSERIALIZE_MSN_ON_DISK ((MSN){MIN_MSN.msn + 84})
 
     sn.max_msn_applied_to_node_on_disk = PRESERIALIZE_MSN_ON_DISK;
     sn.flags = 0x11223344;
@@ -228,14 +243,14 @@ test_serialize_leaf_check_msn(enum ftnode_verify_type bft, bool do_clone) {
     MALLOC_N(sn.n_children, sn.bp);
     DBT pivotkey;
     sn.pivotkeys.create_from_dbts(toku_fill_dbt(&pivotkey, "b", 2), 1);
-    BP_STATE(&sn,0) = PT_AVAIL;
-    BP_STATE(&sn,1) = PT_AVAIL;
+    BP_STATE(&sn, 0) = PT_AVAIL;
+    BP_STATE(&sn, 1) = PT_AVAIL;
     set_BLB(&sn, 0, toku_create_empty_bn());
     set_BLB(&sn, 1, toku_create_empty_bn());
     le_add_to_bn(BLB_DATA(&sn, 0), 0, "a", 2, "aval", 5);
     le_add_to_bn(BLB_DATA(&sn, 0), 1, "b", 2, "bval", 5);
     le_add_to_bn(BLB_DATA(&sn, 1), 0, "x", 2, "xval", 5);
-    BLB_MAX_MSN_APPLIED(&sn, 0) = ((MSN) { MIN_MSN.msn + 73 });
+    BLB_MAX_MSN_APPLIED(&sn, 0) = ((MSN){MIN_MSN.msn + 73});
     BLB_MAX_MSN_APPLIED(&sn, 1) = POSTSERIALIZE_MSN_ON_DISK;
 
     FT_HANDLE XMALLOC(ft);
@@ -244,30 +259,35 @@ test_serialize_leaf_check_msn(enum ftnode_verify_type bft, bool do_clone) {
                  make_blocknum(0),
                  ZERO_LSN,
                  TXNID_NONE,
-                 4*1024*1024,
-                 128*1024,
+                 4 * 1024 * 1024,
+                 128 * 1024,
                  TOKU_DEFAULT_COMPRESSION_METHOD,
                  16);
     ft->ft = ft_h;
     ft_h->blocktable.create();
-    { int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); }
+    {
+        int r_truncate = ftruncate(fd, 0);
+        CKERR(r_truncate);
+    }
 
-    //Want to use block #20
+    // Want to use block #20
     BLOCKNUM b = make_blocknum(0);
     while (b.b < 20) {
         ft_h->blocktable.allocate_blocknum(&b, ft_h);
     }
-    assert(b.b == 20);
+    invariant(b.b == 20);
 
     {
         DISKOFF offset;
         DISKOFF size;
-        ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0);
-        assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+        ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false);
+        invariant(offset ==
+               (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
 
         ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size);
-        assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
-        assert(size   == 100);
+        invariant(offset ==
+               (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+        invariant(size == 100);
     }
     FTNODE_DISK_DATA src_ndd = NULL;
     FTNODE_DISK_DATA dest_ndd = NULL;
@@ -276,16 +296,18 @@ test_serialize_leaf_check_msn(enum ftnode_verify_type bft, bool do_clone) {
 
     setup_dn(bft, fd, ft_h, &dn, &dest_ndd);
 
-    assert(dn->blocknum.b==20);
+    invariant(dn->blocknum.b == 20);
 
-    assert(dn->layout_version ==FT_LAYOUT_VERSION);
-    assert(dn->layout_version_original ==FT_LAYOUT_VERSION);
-    assert(dn->layout_version_read_from_disk ==FT_LAYOUT_VERSION);
-    assert(dn->height == 0);
-    assert(dn->n_children>=1);
-    assert(dn->max_msn_applied_to_node_on_disk.msn == POSTSERIALIZE_MSN_ON_DISK.msn);
+    invariant(dn->layout_version == FT_LAYOUT_VERSION);
+    invariant(dn->layout_version_original == FT_LAYOUT_VERSION);
+    invariant(dn->layout_version_read_from_disk == FT_LAYOUT_VERSION);
+    invariant(dn->height == 0);
+    invariant(dn->n_children >= 1);
+    invariant(dn->max_msn_applied_to_node_on_disk.msn ==
+           POSTSERIALIZE_MSN_ON_DISK.msn);
     {
-        // Man, this is way too ugly.  This entire test suite needs to be refactored.
+        // Man, this is way too ugly.  This entire test suite needs to be
+        // refactored.
         // Create a dummy mempool and put the leaves there.  Ugh.
         test_key_le_pair elts[3];
         elts[0].init("a", "aval");
@@ -294,34 +316,41 @@ test_serialize_leaf_check_msn(enum ftnode_verify_type bft, bool do_clone) {
         const uint32_t npartitions = dn->n_children;
         uint32_t last_i = 0;
         for (uint32_t bn = 0; bn < npartitions; ++bn) {
-            assert(BLB_MAX_MSN_APPLIED(dn, bn).msn == POSTSERIALIZE_MSN_ON_DISK.msn);
-            assert(dest_ndd[bn].start > 0);
-            assert(dest_ndd[bn].size  > 0);
+            invariant(BLB_MAX_MSN_APPLIED(dn, bn).msn ==
+                   POSTSERIALIZE_MSN_ON_DISK.msn);
+            invariant(dest_ndd[bn].start > 0);
+            invariant(dest_ndd[bn].size > 0);
             if (bn > 0) {
-                assert(dest_ndd[bn].start >= dest_ndd[bn-1].start + dest_ndd[bn-1].size);
+                invariant(dest_ndd[bn].start >=
+                       dest_ndd[bn - 1].start + dest_ndd[bn - 1].size);
             }
             for (uint32_t i = 0; i < BLB_DATA(dn, bn)->num_klpairs(); i++) {
                 LEAFENTRY curr_le;
                 uint32_t curr_keylen;
-                void* curr_key;
-                BLB_DATA(dn, bn)->fetch_klpair(i, &curr_le, &curr_keylen, &curr_key);
-                assert(leafentry_memsize(curr_le) == leafentry_memsize(elts[last_i].le));
-                assert(memcmp(curr_le, elts[last_i].le, leafentry_memsize(curr_le)) == 0);
-                if (bn < npartitions-1) {
-                    assert(strcmp((char*)dn->pivotkeys.get_pivot(bn).data, elts[last_i].keyp) <= 0);
+                void *curr_key;
+                BLB_DATA(dn, bn)
+                    ->fetch_klpair(i, &curr_le, &curr_keylen, &curr_key);
+                invariant(leafentry_memsize(curr_le) ==
+                       leafentry_memsize(elts[last_i].le));
+                invariant(memcmp(curr_le,
+                              elts[last_i].le,
+                              leafentry_memsize(curr_le)) == 0);
+                if (bn < npartitions - 1) {
+                    invariant(strcmp((char *)dn->pivotkeys.get_pivot(bn).data,
+                                  elts[last_i].keyp) <= 0);
                 }
                 // TODO for later, get a key comparison here as well
                 last_i++;
             }
-
         }
-        assert(last_i == 3);
+        invariant(last_i == 3);
     }
 
     toku_ftnode_free(&dn);
     toku_destroy_ftnode_internals(&sn);
 
-    ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+    ft_h->blocktable.block_free(
+        BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100);
     ft_h->blocktable.destroy();
     toku_free(ft_h->h);
     toku_free(ft_h);
@@ -329,17 +358,21 @@ test_serialize_leaf_check_msn(enum ftnode_verify_type bft, bool do_clone) {
     toku_free(src_ndd);
     toku_free(dest_ndd);
 
-    r = close(fd); assert(r != -1);
+    r = close(fd);
+    invariant(r != -1);
 }
 
-static void
-test_serialize_leaf_with_large_pivots(enum ftnode_verify_type bft, bool do_clone) {
+static void test_serialize_leaf_with_large_pivots(enum ftnode_verify_type bft,
+                                                  bool do_clone) {
     int r;
     struct ftnode sn, *dn;
-    const int keylens = 256*1024, vallens = 0;
+    const int keylens = 256 * 1024, vallens = 0;
     const uint32_t nrows = 8;
-    // assert(val_size > BN_MAX_SIZE);  // BN_MAX_SIZE isn't visible
-    int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
+    // invariant(val_size > BN_MAX_SIZE);  // BN_MAX_SIZE isn't visible
+    int fd = open(TOKU_TEST_FILENAME,
+                  O_RDWR | O_CREAT | O_BINARY,
+                  S_IRWXU | S_IRWXG | S_IRWXO);
+    invariant(fd >= 0);
 
     sn.max_msn_applied_to_node_on_disk.msn = 0;
     sn.flags = 0x11223344;
@@ -354,21 +387,27 @@ test_serialize_leaf_with_large_pivots(enum ftnode_verify_type bft, bool do_clone
     MALLOC_N(sn.n_children, sn.bp);
     sn.pivotkeys.create_empty();
     for (int i = 0; i < sn.n_children; ++i) {
-        BP_STATE(&sn,i) = PT_AVAIL;
+        BP_STATE(&sn, i) = PT_AVAIL;
         set_BLB(&sn, i, toku_create_empty_bn());
     }
     for (uint32_t i = 0; i < nrows; ++i) {  // one basement per row
         char key[keylens], val[vallens];
-        key[keylens-1] = '\0';
+        key[keylens - 1] = '\0';
         char c = 'a' + i;
-        memset(key, c, keylens-1);
-        le_add_to_bn(BLB_DATA(&sn, i), 0, (char *) &key, sizeof(key), (char *) &val, sizeof(val));
-        if (i < nrows-1) {
+        memset(key, c, keylens - 1);
+        le_add_to_bn(BLB_DATA(&sn, i),
+                     0,
+                     (char *)&key,
+                     sizeof(key),
+                     (char *)&val,
+                     sizeof(val));
+        if (i < nrows - 1) {
             uint32_t keylen;
-            void* curr_key;
+            void *curr_key;
             BLB_DATA(&sn, i)->fetch_key_and_len(0, &keylen, &curr_key);
             DBT pivotkey;
-            sn.pivotkeys.insert_at(toku_fill_dbt(&pivotkey, curr_key, keylen), i);
+            sn.pivotkeys.insert_at(toku_fill_dbt(&pivotkey, curr_key, keylen),
+                                   i);
         }
     }
 
@@ -378,29 +417,34 @@ test_serialize_leaf_with_large_pivots(enum ftnode_verify_type bft, bool do_clone
                  make_blocknum(0),
                  ZERO_LSN,
                  TXNID_NONE,
-                 4*1024*1024,
-                 128*1024,
+                 4 * 1024 * 1024,
+                 128 * 1024,
                  TOKU_DEFAULT_COMPRESSION_METHOD,
                  16);
     ft->ft = ft_h;
     ft_h->blocktable.create();
-    { int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); }
-    //Want to use block #20
+    {
+        int r_truncate = ftruncate(fd, 0);
+        CKERR(r_truncate);
+    }
+    // Want to use block #20
     BLOCKNUM b = make_blocknum(0);
     while (b.b < 20) {
         ft_h->blocktable.allocate_blocknum(&b, ft_h);
     }
-    assert(b.b == 20);
+    invariant(b.b == 20);
 
     {
         DISKOFF offset;
         DISKOFF size;
-        ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0);
-        assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+        ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false);
+        invariant(offset ==
+               (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
 
         ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size);
-        assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
-        assert(size   == 100);
+        invariant(offset ==
+               (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+        invariant(size == 100);
     }
     FTNODE_DISK_DATA src_ndd = NULL;
     FTNODE_DISK_DATA dest_ndd = NULL;
@@ -408,55 +452,64 @@ test_serialize_leaf_with_large_pivots(enum ftnode_verify_type bft, bool do_clone
     write_sn_to_disk(fd, ft, &sn, &src_ndd, do_clone);
 
     setup_dn(bft, fd, ft_h, &dn, &dest_ndd);
-    
-    assert(dn->blocknum.b==20);
 
-    assert(dn->layout_version ==FT_LAYOUT_VERSION);
-    assert(dn->layout_version_original ==FT_LAYOUT_VERSION);
+    invariant(dn->blocknum.b == 20);
+
+    invariant(dn->layout_version == FT_LAYOUT_VERSION);
+    invariant(dn->layout_version_original == FT_LAYOUT_VERSION);
     {
-        // Man, this is way too ugly.  This entire test suite needs to be refactored.
+        // Man, this is way too ugly.  This entire test suite needs to be
+        // refactored.
         // Create a dummy mempool and put the leaves there.  Ugh.
         test_key_le_pair *les = new test_key_le_pair[nrows];
         {
             char key[keylens], val[vallens];
-            key[keylens-1] = '\0';
+            key[keylens - 1] = '\0';
             for (uint32_t i = 0; i < nrows; ++i) {
                 char c = 'a' + i;
-                memset(key, c, keylens-1);
-                les[i].init((char *) &key, sizeof(key), (char *) &val, sizeof(val));
+                memset(key, c, keylens - 1);
+                les[i].init(
+                    (char *)&key, sizeof(key), (char *)&val, sizeof(val));
             }
         }
         const uint32_t npartitions = dn->n_children;
         uint32_t last_i = 0;
         for (uint32_t bn = 0; bn < npartitions; ++bn) {
-            assert(dest_ndd[bn].start > 0);
-            assert(dest_ndd[bn].size  > 0);
+            invariant(dest_ndd[bn].start > 0);
+            invariant(dest_ndd[bn].size > 0);
             if (bn > 0) {
-                assert(dest_ndd[bn].start >= dest_ndd[bn-1].start + dest_ndd[bn-1].size);
+                invariant(dest_ndd[bn].start >=
+                       dest_ndd[bn - 1].start + dest_ndd[bn - 1].size);
             }
-            assert(BLB_DATA(dn, bn)->num_klpairs() > 0);
+            invariant(BLB_DATA(dn, bn)->num_klpairs() > 0);
             for (uint32_t i = 0; i < BLB_DATA(dn, bn)->num_klpairs(); i++) {
                 LEAFENTRY curr_le;
                 uint32_t curr_keylen;
-                void* curr_key;
-                BLB_DATA(dn, bn)->fetch_klpair(i, &curr_le, &curr_keylen, &curr_key);
-                assert(leafentry_memsize(curr_le) == leafentry_memsize(les[last_i].le));
-                assert(memcmp(curr_le, les[last_i].le, leafentry_memsize(curr_le)) == 0);
-                if (bn < npartitions-1) {
-                    assert(strcmp((char*)dn->pivotkeys.get_pivot(bn).data, les[last_i].keyp) <= 0);
+                void *curr_key;
+                BLB_DATA(dn, bn)
+                    ->fetch_klpair(i, &curr_le, &curr_keylen, &curr_key);
+                invariant(leafentry_memsize(curr_le) ==
+                       leafentry_memsize(les[last_i].le));
+                invariant(memcmp(curr_le,
+                              les[last_i].le,
+                              leafentry_memsize(curr_le)) == 0);
+                if (bn < npartitions - 1) {
+                    invariant(strcmp((char *)dn->pivotkeys.get_pivot(bn).data,
+                                  les[last_i].keyp) <= 0);
                 }
                 // TODO for later, get a key comparison here as well
                 last_i++;
             }
         }
-        assert(last_i == nrows);
+        invariant(last_i == nrows);
         delete[] les;
     }
 
     toku_ftnode_free(&dn);
     toku_destroy_ftnode_internals(&sn);
 
-    ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+    ft_h->blocktable.block_free(
+        BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100);
     ft_h->blocktable.destroy();
     toku_free(ft_h->h);
     toku_free(ft_h);
@@ -464,15 +517,19 @@ test_serialize_leaf_with_large_pivots(enum ftnode_verify_type bft, bool do_clone
     toku_free(src_ndd);
     toku_free(dest_ndd);
 
-    r = close(fd); assert(r != -1);
+    r = close(fd);
+    invariant(r != -1);
 }
 
-static void
-test_serialize_leaf_with_many_rows(enum ftnode_verify_type bft, bool do_clone) {
+static void test_serialize_leaf_with_many_rows(enum ftnode_verify_type bft,
+                                               bool do_clone) {
     int r;
     struct ftnode sn, *dn;
-    const uint32_t nrows = 196*1024;
-    int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
+    const uint32_t nrows = 196 * 1024;
+    int fd = open(TOKU_TEST_FILENAME,
+                  O_RDWR | O_CREAT | O_BINARY,
+                  S_IRWXU | S_IRWXG | S_IRWXO);
+    invariant(fd >= 0);
 
     sn.max_msn_applied_to_node_on_disk.msn = 0;
     sn.flags = 0x11223344;
@@ -487,14 +544,19 @@ test_serialize_leaf_with_many_rows(enum ftnode_verify_type bft, bool do_clone) {
     XMALLOC_N(sn.n_children, sn.bp);
     sn.pivotkeys.create_empty();
     for (int i = 0; i < sn.n_children; ++i) {
-        BP_STATE(&sn,i) = PT_AVAIL;
-        set_BLB(&sn, i, toku_create_empty_bn()); 
+        BP_STATE(&sn, i) = PT_AVAIL;
+        set_BLB(&sn, i, toku_create_empty_bn());
     }
     size_t total_size = 0;
     for (uint32_t i = 0; i < nrows; ++i) {
         uint32_t key = i;
         uint32_t val = i;
-        total_size += le_add_to_bn(BLB_DATA(&sn, 0), i, (char *) &key, sizeof(key), (char *) &val, sizeof(val));
+        total_size += le_add_to_bn(BLB_DATA(&sn, 0),
+                                   i,
+                                   (char *)&key,
+                                   sizeof(key),
+                                   (char *)&val,
+                                   sizeof(val));
     }
 
     FT_HANDLE XMALLOC(ft);
@@ -503,30 +565,35 @@ test_serialize_leaf_with_many_rows(enum ftnode_verify_type bft, bool do_clone) {
                  make_blocknum(0),
                  ZERO_LSN,
                  TXNID_NONE,
-                 4*1024*1024,
-                 128*1024,
+                 4 * 1024 * 1024,
+                 128 * 1024,
                  TOKU_DEFAULT_COMPRESSION_METHOD,
                  16);
     ft->ft = ft_h;
-    
+
     ft_h->blocktable.create();
-    { int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); }
-    //Want to use block #20
+    {
+        int r_truncate = ftruncate(fd, 0);
+        CKERR(r_truncate);
+    }
+    // Want to use block #20
     BLOCKNUM b = make_blocknum(0);
     while (b.b < 20) {
         ft_h->blocktable.allocate_blocknum(&b, ft_h);
     }
-    assert(b.b == 20);
+    invariant(b.b == 20);
 
     {
         DISKOFF offset;
         DISKOFF size;
-        ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0);
-        assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+        ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false);
+        invariant(offset ==
+               (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
 
         ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size);
-        assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
-        assert(size   == 100);
+        invariant(offset ==
+               (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+        invariant(size == 100);
     }
 
     FTNODE_DISK_DATA src_ndd = NULL;
@@ -535,56 +602,66 @@ test_serialize_leaf_with_many_rows(enum ftnode_verify_type bft, bool do_clone) {
 
     setup_dn(bft, fd, ft_h, &dn, &dest_ndd);
 
-    assert(dn->blocknum.b==20);
+    invariant(dn->blocknum.b == 20);
 
-    assert(dn->layout_version ==FT_LAYOUT_VERSION);
-    assert(dn->layout_version_original ==FT_LAYOUT_VERSION);
+    invariant(dn->layout_version == FT_LAYOUT_VERSION);
+    invariant(dn->layout_version_original == FT_LAYOUT_VERSION);
     {
-        // Man, this is way too ugly.  This entire test suite needs to be refactored.
+        // Man, this is way too ugly.  This entire test suite needs to be
+        // refactored.
         // Create a dummy mempool and put the leaves there.  Ugh.
         test_key_le_pair *les = new test_key_le_pair[nrows];
         {
             int key = 0, val = 0;
             for (uint32_t i = 0; i < nrows; ++i, key++, val++) {
-                les[i].init((char *) &key, sizeof(key), (char *) &val, sizeof(val));
+                les[i].init(
+                    (char *)&key, sizeof(key), (char *)&val, sizeof(val));
             }
         }
         const uint32_t npartitions = dn->n_children;
         uint32_t last_i = 0;
         for (uint32_t bn = 0; bn < npartitions; ++bn) {
-            assert(dest_ndd[bn].start > 0);
-            assert(dest_ndd[bn].size  > 0);
+            invariant(dest_ndd[bn].start > 0);
+            invariant(dest_ndd[bn].size > 0);
             if (bn > 0) {
-                assert(dest_ndd[bn].start >= dest_ndd[bn-1].start + dest_ndd[bn-1].size);
+                invariant(dest_ndd[bn].start >=
+                       dest_ndd[bn - 1].start + dest_ndd[bn - 1].size);
             }
-            assert(BLB_DATA(dn, bn)->num_klpairs() > 0);
+            invariant(BLB_DATA(dn, bn)->num_klpairs() > 0);
             for (uint32_t i = 0; i < BLB_DATA(dn, bn)->num_klpairs(); i++) {
                 LEAFENTRY curr_le;
                 uint32_t curr_keylen;
-                void* curr_key;
-                BLB_DATA(dn, bn)->fetch_klpair(i, &curr_le, &curr_keylen, &curr_key);
-                assert(leafentry_memsize(curr_le) == leafentry_memsize(les[last_i].le));
-                assert(memcmp(curr_le, les[last_i].le, leafentry_memsize(curr_le)) == 0);
-                if (bn < npartitions-1) {
-                    uint32_t *CAST_FROM_VOIDP(pivot, dn->pivotkeys.get_pivot(bn).data);
-                    void* tmp = les[last_i].keyp;
+                void *curr_key;
+                BLB_DATA(dn, bn)
+                    ->fetch_klpair(i, &curr_le, &curr_keylen, &curr_key);
+                invariant(leafentry_memsize(curr_le) ==
+                       leafentry_memsize(les[last_i].le));
+                invariant(memcmp(curr_le,
+                              les[last_i].le,
+                              leafentry_memsize(curr_le)) == 0);
+                if (bn < npartitions - 1) {
+                    uint32_t *CAST_FROM_VOIDP(pivot,
+                                              dn->pivotkeys.get_pivot(bn).data);
+                    void *tmp = les[last_i].keyp;
                     uint32_t *CAST_FROM_VOIDP(item, tmp);
-                    assert(*pivot >= *item);
+                    invariant(*pivot >= *item);
                 }
                 // TODO for later, get a key comparison here as well
                 last_i++;
             }
             // don't check soft_copy_is_up_to_date or seqinsert
-            assert(BLB_DATA(dn, bn)->get_disk_size() < 128*1024);  // BN_MAX_SIZE, apt to change
+            invariant(BLB_DATA(dn, bn)->get_disk_size() <
+                   128 * 1024);  // BN_MAX_SIZE, apt to change
         }
-        assert(last_i == nrows);
+        invariant(last_i == nrows);
         delete[] les;
     }
 
     toku_ftnode_free(&dn);
     toku_destroy_ftnode_internals(&sn);
 
-    ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+    ft_h->blocktable.block_free(
+        BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100);
     ft_h->blocktable.destroy();
     toku_free(ft_h->h);
     toku_free(ft_h);
@@ -592,19 +669,22 @@ test_serialize_leaf_with_many_rows(enum ftnode_verify_type bft, bool do_clone) {
     toku_free(src_ndd);
     toku_free(dest_ndd);
 
-    r = close(fd); assert(r != -1);
+    r = close(fd);
+    invariant(r != -1);
 }
 
-
-static void
-test_serialize_leaf_with_large_rows(enum ftnode_verify_type bft, bool do_clone) {
+static void test_serialize_leaf_with_large_rows(enum ftnode_verify_type bft,
+                                                bool do_clone) {
     int r;
     struct ftnode sn, *dn;
     const uint32_t nrows = 7;
     const size_t key_size = 8;
-    const size_t val_size = 512*1024;
-    // assert(val_size > BN_MAX_SIZE);  // BN_MAX_SIZE isn't visible
-    int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
+    const size_t val_size = 512 * 1024;
+    // invariant(val_size > BN_MAX_SIZE);  // BN_MAX_SIZE isn't visible
+    int fd = open(TOKU_TEST_FILENAME,
+                  O_RDWR | O_CREAT | O_BINARY,
+                  S_IRWXU | S_IRWXG | S_IRWXO);
+    invariant(fd >= 0);
 
     sn.max_msn_applied_to_node_on_disk.msn = 0;
     sn.flags = 0x11223344;
@@ -615,21 +695,21 @@ test_serialize_leaf_with_large_rows(enum ftnode_verify_type bft, bool do_clone)
     sn.n_children = 1;
     sn.dirty = 1;
     sn.oldest_referenced_xid_known = TXNID_NONE;
-    
+
     MALLOC_N(sn.n_children, sn.bp);
     sn.pivotkeys.create_empty();
     for (int i = 0; i < sn.n_children; ++i) {
-        BP_STATE(&sn,i) = PT_AVAIL;
+        BP_STATE(&sn, i) = PT_AVAIL;
         set_BLB(&sn, i, toku_create_empty_bn());
     }
     for (uint32_t i = 0; i < nrows; ++i) {
         char key[key_size], val[val_size];
-        key[key_size-1] = '\0';
-        val[val_size-1] = '\0';
+        key[key_size - 1] = '\0';
+        val[val_size - 1] = '\0';
         char c = 'a' + i;
-        memset(key, c, key_size-1);
-        memset(val, c, val_size-1);
-        le_add_to_bn(BLB_DATA(&sn, 0), i,key, 8, val, val_size);
+        memset(key, c, key_size - 1);
+        memset(val, c, val_size - 1);
+        le_add_to_bn(BLB_DATA(&sn, 0), i, key, 8, val, val_size);
     }
 
     FT_HANDLE XMALLOC(ft);
@@ -638,30 +718,35 @@ test_serialize_leaf_with_large_rows(enum ftnode_verify_type bft, bool do_clone)
                  make_blocknum(0),
                  ZERO_LSN,
                  TXNID_NONE,
-                 4*1024*1024,
-                 128*1024,
+                 4 * 1024 * 1024,
+                 128 * 1024,
                  TOKU_DEFAULT_COMPRESSION_METHOD,
                  16);
     ft->ft = ft_h;
-    
+
     ft_h->blocktable.create();
-    { int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); }
-    //Want to use block #20
+    {
+        int r_truncate = ftruncate(fd, 0);
+        CKERR(r_truncate);
+    }
+    // Want to use block #20
     BLOCKNUM b = make_blocknum(0);
     while (b.b < 20) {
         ft_h->blocktable.allocate_blocknum(&b, ft_h);
     }
-    assert(b.b == 20);
+    invariant(b.b == 20);
 
     {
         DISKOFF offset;
         DISKOFF size;
-        ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0);
-        assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+        ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false);
+        invariant(offset ==
+               (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
 
         ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size);
-        assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
-        assert(size   == 100);
+        invariant(offset ==
+               (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+        invariant(size == 100);
     }
 
     FTNODE_DISK_DATA src_ndd = NULL;
@@ -670,58 +755,66 @@ test_serialize_leaf_with_large_rows(enum ftnode_verify_type bft, bool do_clone)
 
     setup_dn(bft, fd, ft_h, &dn, &dest_ndd);
 
-    assert(dn->blocknum.b==20);
+    invariant(dn->blocknum.b == 20);
 
-    assert(dn->layout_version ==FT_LAYOUT_VERSION);
-    assert(dn->layout_version_original ==FT_LAYOUT_VERSION);
+    invariant(dn->layout_version == FT_LAYOUT_VERSION);
+    invariant(dn->layout_version_original == FT_LAYOUT_VERSION);
     {
-        // Man, this is way too ugly.  This entire test suite needs to be refactored.
+        // Man, this is way too ugly.  This entire test suite needs to be
+        // refactored.
         // Create a dummy mempool and put the leaves there.  Ugh.
         test_key_le_pair *les = new test_key_le_pair[nrows];
         {
             char key[key_size], val[val_size];
-            key[key_size-1] = '\0';
-            val[val_size-1] = '\0';
+            key[key_size - 1] = '\0';
+            val[val_size - 1] = '\0';
             for (uint32_t i = 0; i < nrows; ++i) {
                 char c = 'a' + i;
-                memset(key, c, key_size-1);
-                memset(val, c, val_size-1);
+                memset(key, c, key_size - 1);
+                memset(val, c, val_size - 1);
                 les[i].init(key, key_size, val, val_size);
             }
         }
         const uint32_t npartitions = dn->n_children;
-        assert(npartitions == nrows);
+        invariant(npartitions == nrows);
         uint32_t last_i = 0;
         for (uint32_t bn = 0; bn < npartitions; ++bn) {
-            assert(dest_ndd[bn].start > 0);
-            assert(dest_ndd[bn].size  > 0);
+            invariant(dest_ndd[bn].start > 0);
+            invariant(dest_ndd[bn].size > 0);
             if (bn > 0) {
-                assert(dest_ndd[bn].start >= dest_ndd[bn-1].start + dest_ndd[bn-1].size);
+                invariant(dest_ndd[bn].start >=
+                       dest_ndd[bn - 1].start + dest_ndd[bn - 1].size);
             }
-            assert(BLB_DATA(dn, bn)->num_klpairs() > 0);
+            invariant(BLB_DATA(dn, bn)->num_klpairs() > 0);
             for (uint32_t i = 0; i < BLB_DATA(dn, bn)->num_klpairs(); i++) {
                 LEAFENTRY curr_le;
                 uint32_t curr_keylen;
-                void* curr_key;
-                BLB_DATA(dn, bn)->fetch_klpair(i, &curr_le, &curr_keylen, &curr_key);
-                assert(leafentry_memsize(curr_le) == leafentry_memsize(les[last_i].le));
-                assert(memcmp(curr_le, les[last_i].le, leafentry_memsize(curr_le)) == 0);
-                if (bn < npartitions-1) {
-                    assert(strcmp((char*)dn->pivotkeys.get_pivot(bn).data, (char*)(les[last_i].keyp)) <= 0);
+                void *curr_key;
+                BLB_DATA(dn, bn)
+                    ->fetch_klpair(i, &curr_le, &curr_keylen, &curr_key);
+                invariant(leafentry_memsize(curr_le) ==
+                       leafentry_memsize(les[last_i].le));
+                invariant(memcmp(curr_le,
+                              les[last_i].le,
+                              leafentry_memsize(curr_le)) == 0);
+                if (bn < npartitions - 1) {
+                    invariant(strcmp((char *)dn->pivotkeys.get_pivot(bn).data,
+                                  (char *)(les[last_i].keyp)) <= 0);
                 }
                 // TODO for later, get a key comparison here as well
                 last_i++;
             }
             // don't check soft_copy_is_up_to_date or seqinsert
         }
-        assert(last_i == 7);
+        invariant(last_i == 7);
         delete[] les;
     }
 
     toku_ftnode_free(&dn);
     toku_destroy_ftnode_internals(&sn);
 
-    ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+    ft_h->blocktable.block_free(
+        BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100);
     ft_h->blocktable.destroy();
     toku_free(ft_h->h);
     toku_free(ft_h);
@@ -729,15 +822,19 @@ test_serialize_leaf_with_large_rows(enum ftnode_verify_type bft, bool do_clone)
     toku_free(src_ndd);
     toku_free(dest_ndd);
 
-    r = close(fd); assert(r != -1);
+    r = close(fd);
+    invariant(r != -1);
 }
 
-
-static void
-test_serialize_leaf_with_empty_basement_nodes(enum ftnode_verify_type bft, bool do_clone) {
+static void test_serialize_leaf_with_empty_basement_nodes(
+    enum ftnode_verify_type bft,
+    bool do_clone) {
     struct ftnode sn, *dn;
 
-    int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
+    int fd = open(TOKU_TEST_FILENAME,
+                  O_RDWR | O_CREAT | O_BINARY,
+                  S_IRWXU | S_IRWXG | S_IRWXO);
+    invariant(fd >= 0);
 
     int r;
 
@@ -760,7 +857,7 @@ test_serialize_leaf_with_empty_basement_nodes(enum ftnode_verify_type bft, bool
     toku_fill_dbt(&pivotkeys[5], "x", 2);
     sn.pivotkeys.create_from_dbts(pivotkeys, 6);
     for (int i = 0; i < sn.n_children; ++i) {
-        BP_STATE(&sn,i) = PT_AVAIL;
+        BP_STATE(&sn, i) = PT_AVAIL;
         set_BLB(&sn, i, toku_create_empty_bn());
         BLB_SEQINSERT(&sn, i) = 0;
     }
@@ -774,30 +871,35 @@ test_serialize_leaf_with_empty_basement_nodes(enum ftnode_verify_type bft, bool
                  make_blocknum(0),
                  ZERO_LSN,
                  TXNID_NONE,
-                 4*1024*1024,
-                 128*1024,
+                 4 * 1024 * 1024,
+                 128 * 1024,
                  TOKU_DEFAULT_COMPRESSION_METHOD,
                  16);
     ft->ft = ft_h;
-    
+
     ft_h->blocktable.create();
-    { int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); }
-    //Want to use block #20
+    {
+        int r_truncate = ftruncate(fd, 0);
+        CKERR(r_truncate);
+    }
+    // Want to use block #20
     BLOCKNUM b = make_blocknum(0);
     while (b.b < 20) {
         ft_h->blocktable.allocate_blocknum(&b, ft_h);
     }
-    assert(b.b == 20);
+    invariant(b.b == 20);
 
     {
         DISKOFF offset;
         DISKOFF size;
-        ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0);
-        assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+        ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false);
+        invariant(offset ==
+               (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
 
         ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size);
-        assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
-        assert(size   == 100);
+        invariant(offset ==
+               (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+        invariant(size == 100);
     }
     FTNODE_DISK_DATA src_ndd = NULL;
     FTNODE_DISK_DATA dest_ndd = NULL;
@@ -805,17 +907,18 @@ test_serialize_leaf_with_empty_basement_nodes(enum ftnode_verify_type bft, bool
 
     setup_dn(bft, fd, ft_h, &dn, &dest_ndd);
 
-    assert(dn->blocknum.b==20);
+    invariant(dn->blocknum.b == 20);
 
-    assert(dn->layout_version ==FT_LAYOUT_VERSION);
-    assert(dn->layout_version_original ==FT_LAYOUT_VERSION);
-    assert(dn->layout_version_read_from_disk ==FT_LAYOUT_VERSION);
-    assert(dn->height == 0);
-    assert(dn->n_children>0);
+    invariant(dn->layout_version == FT_LAYOUT_VERSION);
+    invariant(dn->layout_version_original == FT_LAYOUT_VERSION);
+    invariant(dn->layout_version_read_from_disk == FT_LAYOUT_VERSION);
+    invariant(dn->height == 0);
+    invariant(dn->n_children > 0);
     {
         test_key_le_pair elts[3];
 
-        // Man, this is way too ugly.  This entire test suite needs to be refactored.
+        // Man, this is way too ugly.  This entire test suite needs to be
+        // refactored.
         // Create a dummy mempool and put the leaves there.  Ugh.
         elts[0].init("a", "aval");
         elts[1].init("b", "bval");
@@ -823,33 +926,39 @@ test_serialize_leaf_with_empty_basement_nodes(enum ftnode_verify_type bft, bool
         const uint32_t npartitions = dn->n_children;
         uint32_t last_i = 0;
         for (uint32_t bn = 0; bn < npartitions; ++bn) {
-            assert(dest_ndd[bn].start > 0);
-            assert(dest_ndd[bn].size  > 0);
+            invariant(dest_ndd[bn].start > 0);
+            invariant(dest_ndd[bn].size > 0);
             if (bn > 0) {
-                assert(dest_ndd[bn].start >= dest_ndd[bn-1].start + dest_ndd[bn-1].size);
+                invariant(dest_ndd[bn].start >=
+                       dest_ndd[bn - 1].start + dest_ndd[bn - 1].size);
             }
             for (uint32_t i = 0; i < BLB_DATA(dn, bn)->num_klpairs(); i++) {
                 LEAFENTRY curr_le;
                 uint32_t curr_keylen;
-                void* curr_key;
-                BLB_DATA(dn, bn)->fetch_klpair(i, &curr_le, &curr_keylen, &curr_key);
-                assert(leafentry_memsize(curr_le) == leafentry_memsize(elts[last_i].le));
-                assert(memcmp(curr_le, elts[last_i].le, leafentry_memsize(curr_le)) == 0);
-                if (bn < npartitions-1) {
-                    assert(strcmp((char*)dn->pivotkeys.get_pivot(bn).data, (char*)(elts[last_i].keyp)) <= 0);
+                void *curr_key;
+                BLB_DATA(dn, bn)
+                    ->fetch_klpair(i, &curr_le, &curr_keylen, &curr_key);
+                invariant(leafentry_memsize(curr_le) ==
+                       leafentry_memsize(elts[last_i].le));
+                invariant(memcmp(curr_le,
+                              elts[last_i].le,
+                              leafentry_memsize(curr_le)) == 0);
+                if (bn < npartitions - 1) {
+                    invariant(strcmp((char *)dn->pivotkeys.get_pivot(bn).data,
+                                  (char *)(elts[last_i].keyp)) <= 0);
                 }
                 // TODO for later, get a key comparison here as well
                 last_i++;
             }
-
         }
-        assert(last_i == 3);
+        invariant(last_i == 3);
     }
 
     toku_ftnode_free(&dn);
     toku_destroy_ftnode_internals(&sn);
 
-    ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+    ft_h->blocktable.block_free(
+        BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100);
     ft_h->blocktable.destroy();
     toku_free(ft_h->h);
     toku_free(ft_h);
@@ -857,14 +966,19 @@ test_serialize_leaf_with_empty_basement_nodes(enum ftnode_verify_type bft, bool
     toku_free(src_ndd);
     toku_free(dest_ndd);
 
-    r = close(fd); assert(r != -1);
+    r = close(fd);
+    invariant(r != -1);
 }
 
-static void
-test_serialize_leaf_with_multiple_empty_basement_nodes(enum ftnode_verify_type bft, bool do_clone) {
+static void test_serialize_leaf_with_multiple_empty_basement_nodes(
+    enum ftnode_verify_type bft,
+    bool do_clone) {
     struct ftnode sn, *dn;
 
-    int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
+    int fd = open(TOKU_TEST_FILENAME,
+                  O_RDWR | O_CREAT | O_BINARY,
+                  S_IRWXU | S_IRWXG | S_IRWXO);
+    invariant(fd >= 0);
 
     int r;
 
@@ -884,7 +998,7 @@ test_serialize_leaf_with_multiple_empty_basement_nodes(enum ftnode_verify_type b
     toku_fill_dbt(&pivotkeys[2], "A", 2);
     sn.pivotkeys.create_from_dbts(pivotkeys, 3);
     for (int i = 0; i < sn.n_children; ++i) {
-        BP_STATE(&sn,i) = PT_AVAIL;
+        BP_STATE(&sn, i) = PT_AVAIL;
         set_BLB(&sn, i, toku_create_empty_bn());
     }
 
@@ -894,30 +1008,35 @@ test_serialize_leaf_with_multiple_empty_basement_nodes(enum ftnode_verify_type b
                  make_blocknum(0),
                  ZERO_LSN,
                  TXNID_NONE,
-                 4*1024*1024,
-                 128*1024,
+                 4 * 1024 * 1024,
+                 128 * 1024,
                  TOKU_DEFAULT_COMPRESSION_METHOD,
                  16);
     ft->ft = ft_h;
-    
+
     ft_h->blocktable.create();
-    { int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); }
-    //Want to use block #20
+    {
+        int r_truncate = ftruncate(fd, 0);
+        CKERR(r_truncate);
+    }
+    // Want to use block #20
     BLOCKNUM b = make_blocknum(0);
     while (b.b < 20) {
         ft_h->blocktable.allocate_blocknum(&b, ft_h);
     }
-    assert(b.b == 20);
+    invariant(b.b == 20);
 
     {
         DISKOFF offset;
         DISKOFF size;
-        ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0);
-        assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+        ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false);
+        invariant(offset ==
+               (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
 
         ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size);
-        assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
-        assert(size   == 100);
+        invariant(offset ==
+               (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+        invariant(size == 100);
     }
 
     FTNODE_DISK_DATA src_ndd = NULL;
@@ -926,29 +1045,31 @@ test_serialize_leaf_with_multiple_empty_basement_nodes(enum ftnode_verify_type b
 
     setup_dn(bft, fd, ft_h, &dn, &dest_ndd);
 
-    assert(dn->blocknum.b==20);
+    invariant(dn->blocknum.b == 20);
 
-    assert(dn->layout_version ==FT_LAYOUT_VERSION);
-    assert(dn->layout_version_original ==FT_LAYOUT_VERSION);
-    assert(dn->layout_version_read_from_disk ==FT_LAYOUT_VERSION);
-    assert(dn->height == 0);
-    assert(dn->n_children == 1);
+    invariant(dn->layout_version == FT_LAYOUT_VERSION);
+    invariant(dn->layout_version_original == FT_LAYOUT_VERSION);
+    invariant(dn->layout_version_read_from_disk == FT_LAYOUT_VERSION);
+    invariant(dn->height == 0);
+    invariant(dn->n_children == 1);
     {
         const uint32_t npartitions = dn->n_children;
         for (uint32_t i = 0; i < npartitions; ++i) {
-            assert(dest_ndd[i].start > 0);
-            assert(dest_ndd[i].size  > 0);
+            invariant(dest_ndd[i].start > 0);
+            invariant(dest_ndd[i].size > 0);
             if (i > 0) {
-                assert(dest_ndd[i].start >= dest_ndd[i-1].start + dest_ndd[i-1].size);
+                invariant(dest_ndd[i].start >=
+                       dest_ndd[i - 1].start + dest_ndd[i - 1].size);
             }
-            assert(BLB_DATA(dn, i)->num_klpairs() == 0);
+            invariant(BLB_DATA(dn, i)->num_klpairs() == 0);
         }
     }
-    
+
     toku_ftnode_free(&dn);
     toku_destroy_ftnode_internals(&sn);
 
-    ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+    ft_h->blocktable.block_free(
+        BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100);
     ft_h->blocktable.destroy();
     toku_free(ft_h->h);
     toku_free(ft_h);
@@ -956,16 +1077,18 @@ test_serialize_leaf_with_multiple_empty_basement_nodes(enum ftnode_verify_type b
     toku_free(src_ndd);
     toku_free(dest_ndd);
 
-    r = close(fd); assert(r != -1);
+    r = close(fd);
+    invariant(r != -1);
 }
 
-
-static void
-test_serialize_nonleaf(enum ftnode_verify_type bft, bool do_clone) {
+static void test_serialize_nonleaf(enum ftnode_verify_type bft, bool do_clone) {
     //    struct ft_handle source_ft;
     struct ftnode sn, *dn;
 
-    int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0);
+    int fd = open(TOKU_TEST_FILENAME,
+                  O_RDWR | O_CREAT | O_BINARY,
+                  S_IRWXU | S_IRWXG | S_IRWXO);
+    invariant(fd >= 0);
 
     int r;
 
@@ -984,11 +1107,11 @@ test_serialize_nonleaf(enum ftnode_verify_type bft, bool do_clone) {
     sn.pivotkeys.create_from_dbts(toku_fill_dbt(&pivotkey, "hello", 6), 1);
     BP_BLOCKNUM(&sn, 0).b = 30;
     BP_BLOCKNUM(&sn, 1).b = 35;
-    BP_STATE(&sn,0) = PT_AVAIL;
-    BP_STATE(&sn,1) = PT_AVAIL;
+    BP_STATE(&sn, 0) = PT_AVAIL;
+    BP_STATE(&sn, 1) = PT_AVAIL;
     set_BNC(&sn, 0, toku_create_empty_nl());
     set_BNC(&sn, 1, toku_create_empty_nl());
-    //Create XIDS
+    // Create XIDS
     XIDS xids_0 = toku_xids_get_root_xids();
     XIDS xids_123;
     XIDS xids_234;
@@ -1000,11 +1123,38 @@ test_serialize_nonleaf(enum ftnode_verify_type bft, bool do_clone) {
     toku::comparator cmp;
     cmp.create(string_key_cmp, nullptr);
 
-    toku_bnc_insert_msg(BNC(&sn, 0), "a", 2, "aval", 5, FT_NONE, next_dummymsn(), xids_0, true, cmp);
-    toku_bnc_insert_msg(BNC(&sn, 0), "b", 2, "bval", 5, FT_NONE, next_dummymsn(), xids_123, false, cmp);
-    toku_bnc_insert_msg(BNC(&sn, 1), "x", 2, "xval", 5, FT_NONE, next_dummymsn(), xids_234, true, cmp);
-
-    //Cleanup:
+    toku_bnc_insert_msg(BNC(&sn, 0),
+                        "a",
+                        2,
+                        "aval",
+                        5,
+                        FT_NONE,
+                        next_dummymsn(),
+                        xids_0,
+                        true,
+                        cmp);
+    toku_bnc_insert_msg(BNC(&sn, 0),
+                        "b",
+                        2,
+                        "bval",
+                        5,
+                        FT_NONE,
+                        next_dummymsn(),
+                        xids_123,
+                        false,
+                        cmp);
+    toku_bnc_insert_msg(BNC(&sn, 1),
+                        "x",
+                        2,
+                        "xval",
+                        5,
+                        FT_NONE,
+                        next_dummymsn(),
+                        xids_234,
+                        true,
+                        cmp);
+
+    // Cleanup:
     toku_xids_destroy(&xids_0);
     toku_xids_destroy(&xids_123);
     toku_xids_destroy(&xids_234);
@@ -1016,31 +1166,36 @@ test_serialize_nonleaf(enum ftnode_verify_type bft, bool do_clone) {
                  make_blocknum(0),
                  ZERO_LSN,
                  TXNID_NONE,
-                 4*1024*1024,
-                 128*1024,
+                 4 * 1024 * 1024,
+                 128 * 1024,
                  TOKU_DEFAULT_COMPRESSION_METHOD,
                  16);
     ft_h->cmp.create(string_key_cmp, nullptr);
     ft->ft = ft_h;
-    
+
     ft_h->blocktable.create();
-    { int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); }
-    //Want to use block #20
+    {
+        int r_truncate = ftruncate(fd, 0);
+        CKERR(r_truncate);
+    }
+    // Want to use block #20
     BLOCKNUM b = make_blocknum(0);
     while (b.b < 20) {
         ft_h->blocktable.allocate_blocknum(&b, ft_h);
     }
-    assert(b.b == 20);
+    invariant(b.b == 20);
 
     {
         DISKOFF offset;
         DISKOFF size;
-        ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0);
-        assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+        ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false);
+        invariant(offset ==
+               (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
 
         ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size);
-        assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
-        assert(size   == 100);
+        invariant(offset ==
+               (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+        invariant(size == 100);
     }
     FTNODE_DISK_DATA src_ndd = NULL;
     FTNODE_DISK_DATA dest_ndd = NULL;
@@ -1048,30 +1203,31 @@ test_serialize_nonleaf(enum ftnode_verify_type bft, bool do_clone) {
 
     setup_dn(bft, fd, ft_h, &dn, &dest_ndd);
 
-    assert(dn->blocknum.b==20);
+    invariant(dn->blocknum.b == 20);
 
-    assert(dn->layout_version ==FT_LAYOUT_VERSION);
-    assert(dn->layout_version_original ==FT_LAYOUT_VERSION);
-    assert(dn->layout_version_read_from_disk ==FT_LAYOUT_VERSION);
-    assert(dn->height == 1);
-    assert(dn->n_children==2);
-    assert(strcmp((char*)dn->pivotkeys.get_pivot(0).data, "hello")==0);
-    assert(dn->pivotkeys.get_pivot(0).size==6);
-    assert(BP_BLOCKNUM(dn,0).b==30);
-    assert(BP_BLOCKNUM(dn,1).b==35);
+    invariant(dn->layout_version == FT_LAYOUT_VERSION);
+    invariant(dn->layout_version_original == FT_LAYOUT_VERSION);
+    invariant(dn->layout_version_read_from_disk == FT_LAYOUT_VERSION);
+    invariant(dn->height == 1);
+    invariant(dn->n_children == 2);
+    invariant(strcmp((char *)dn->pivotkeys.get_pivot(0).data, "hello") == 0);
+    invariant(dn->pivotkeys.get_pivot(0).size == 6);
+    invariant(BP_BLOCKNUM(dn, 0).b == 30);
+    invariant(BP_BLOCKNUM(dn, 1).b == 35);
 
     message_buffer *src_msg_buffer1 = &BNC(&sn, 0)->msg_buffer;
     message_buffer *src_msg_buffer2 = &BNC(&sn, 1)->msg_buffer;
     message_buffer *dest_msg_buffer1 = &BNC(dn, 0)->msg_buffer;
     message_buffer *dest_msg_buffer2 = &BNC(dn, 1)->msg_buffer;
 
-    assert(src_msg_buffer1->equals(dest_msg_buffer1));
-    assert(src_msg_buffer2->equals(dest_msg_buffer2));
+    invariant(src_msg_buffer1->equals(dest_msg_buffer1));
+    invariant(src_msg_buffer2->equals(dest_msg_buffer2));
 
     toku_ftnode_free(&dn);
     toku_destroy_ftnode_internals(&sn);
 
-    ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE);
+    ft_h->blocktable.block_free(
+        BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100);
     ft_h->blocktable.destroy();
     ft_h->cmp.destroy();
     toku_free(ft_h->h);
@@ -1080,11 +1236,12 @@ test_serialize_nonleaf(enum ftnode_verify_type bft, bool do_clone) {
     toku_free(src_ndd);
     toku_free(dest_ndd);
 
-    r = close(fd); assert(r != -1);
+    r = close(fd);
+    invariant(r != -1);
 }
 
-int
-test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) {
+int test_main(int argc __attribute__((__unused__)),
+              const char *argv[] __attribute__((__unused__))) {
     initialize_dummymsn();
 
     test_serialize_nonleaf(read_none, false);
@@ -1103,10 +1260,12 @@ test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute_
 
     test_serialize_leaf_with_multiple_empty_basement_nodes(read_none, false);
     test_serialize_leaf_with_multiple_empty_basement_nodes(read_all, false);
-    test_serialize_leaf_with_multiple_empty_basement_nodes(read_compressed, false);
+    test_serialize_leaf_with_multiple_empty_basement_nodes(read_compressed,
+                                                           false);
     test_serialize_leaf_with_multiple_empty_basement_nodes(read_none, true);
     test_serialize_leaf_with_multiple_empty_basement_nodes(read_all, true);
-    test_serialize_leaf_with_multiple_empty_basement_nodes(read_compressed, true);
+    test_serialize_leaf_with_multiple_empty_basement_nodes(read_compressed,
+                                                           true);
 
     test_serialize_leaf_with_empty_basement_nodes(read_none, false);
     test_serialize_leaf_with_empty_basement_nodes(read_all, false);
diff --git a/storage/tokudb/PerconaFT/ft/tests/ft-test.cc b/storage/tokudb/PerconaFT/ft/tests/ft-test.cc
index 598a1cc7085..706bd94fbc3 100644
--- a/storage/tokudb/PerconaFT/ft/tests/ft-test.cc
+++ b/storage/tokudb/PerconaFT/ft/tests/ft-test.cc
@@ -164,17 +164,16 @@ static void  test_read_what_was_written (void) {
     int r;
     const int NVALS=10000;
 
-    if (verbose) printf("test_read_what_was_written(): "); fflush(stdout);
+    if (verbose) {
+        printf("test_read_what_was_written(): "); fflush(stdout);
+    }
 
     unlink(fname);
-    
 
     toku_cachetable_create(&ct, 0, ZERO_LSN, nullptr);
     r = toku_open_ft_handle(fname, 1, &ft, 1<<12, 1<<9, TOKU_DEFAULT_COMPRESSION_METHOD, ct, null_txn, toku_builtin_compare_fun);  assert(r==0);
     r = toku_close_ft_handle_nolsn(ft, 0); assert(r==0);
-    toku_cachetable_close(&ct);
-
-    
+    toku_cachetable_close(&ct);    
 
     /* Now see if we can read an empty tree in. */
     toku_cachetable_create(&ct, 0, ZERO_LSN, nullptr);
@@ -189,8 +188,6 @@ static void  test_read_what_was_written (void) {
     r = toku_close_ft_handle_nolsn(ft, 0); assert(r==0);
     toku_cachetable_close(&ct);
 
-    
-
     /* Now see if we can read it in and get the value. */
     toku_cachetable_create(&ct, 0, ZERO_LSN, nullptr);
     r = toku_open_ft_handle(fname, 0, &ft, 1<<12, 1<<9, TOKU_DEFAULT_COMPRESSION_METHOD, ct, null_txn, toku_builtin_compare_fun); assert(r==0);
diff --git a/storage/tokudb/PerconaFT/ft/tests/pqueue-test.cc b/storage/tokudb/PerconaFT/ft/tests/pqueue-test.cc
index 53973794eae..aeb5a897c48 100644
--- a/storage/tokudb/PerconaFT/ft/tests/pqueue-test.cc
+++ b/storage/tokudb/PerconaFT/ft/tests/pqueue-test.cc
@@ -109,7 +109,9 @@ static int run_test(void)
         r = pqueue_pop(pq, &node);   assert(r==0);
         if (verbose) printf("%d : %d\n", i, *(int*)(node->key->data));
         if ( *(int*)(node->key->data) != i ) { 
-            if (verbose) printf("FAIL\n"); return -1; 
+            if (verbose)
+                printf("FAIL\n");
+            return -1;
         }
     }
     pqueue_free(pq);
diff --git a/storage/tokudb/PerconaFT/ft/tests/test-leafentry-nested.cc b/storage/tokudb/PerconaFT/ft/tests/test-leafentry-nested.cc
index a78f787cdf2..f2004964862 100644
--- a/storage/tokudb/PerconaFT/ft/tests/test-leafentry-nested.cc
+++ b/storage/tokudb/PerconaFT/ft/tests/test-leafentry-nested.cc
@@ -793,7 +793,7 @@ static void test_le_garbage_collection_birdie(void) {
     do_garbage_collect = ule_worth_running_garbage_collection(&ule, 200);
     invariant(do_garbage_collect);
 
-    // It is definately worth doing when the above case is true
+    // It is definitely worth doing when the above case is true
     // and there is more than one provisional entry.
     ule.num_cuxrs = 1;
     ule.num_puxrs = 2;
diff --git a/storage/tokudb/PerconaFT/ft/tests/test-oldest-referenced-xid-flush.cc b/storage/tokudb/PerconaFT/ft/tests/test-oldest-referenced-xid-flush.cc
index 419af550545..71357a1e16a 100644
--- a/storage/tokudb/PerconaFT/ft/tests/test-oldest-referenced-xid-flush.cc
+++ b/storage/tokudb/PerconaFT/ft/tests/test-oldest-referenced-xid-flush.cc
@@ -72,7 +72,7 @@ static void dummy_update_status(FTNODE UU(child), int UU(dirtied), void* UU(extr
 
 enum { NODESIZE = 1024, KSIZE=NODESIZE-100, TOKU_PSIZE=20 };
 
-static void test_oldest_referenced_xid_gets_propogated(void) {
+static void test_oldest_referenced_xid_gets_propagated(void) {
     int r;
     CACHETABLE ct;
     FT_HANDLE t;
@@ -166,7 +166,7 @@ static void test_oldest_referenced_xid_gets_propogated(void) {
     toku_ft_flush_some_child(t->ft, node, &fa);
 
     // pin the child, verify that oldest referenced xid was
-    // propogated from parent to child during the flush
+    // propagated from parent to child during the flush
     toku_pin_ftnode(
         t->ft, 
         child_nonleaf_blocknum,
@@ -185,6 +185,6 @@ static void test_oldest_referenced_xid_gets_propogated(void) {
 
 int test_main(int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) {
     default_parse_args(argc, argv);
-    test_oldest_referenced_xid_gets_propogated();
+    test_oldest_referenced_xid_gets_propagated();
     return 0;
 }
diff --git a/storage/tokudb/PerconaFT/ft/serialize/block_allocator_strategy.h b/storage/tokudb/PerconaFT/ft/tests/test-rbtree-insert-remove-with-mhs.cc
index 8aded3898c1..ea4f9374dc3 100644
--- a/storage/tokudb/PerconaFT/ft/serialize/block_allocator_strategy.h
+++ b/storage/tokudb/PerconaFT/ft/tests/test-rbtree-insert-remove-with-mhs.cc
@@ -36,30 +36,62 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 
 #ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
 
-#pragma once
-
-#include <db.h>
-
-#include "ft/serialize/block_allocator.h"
-
-// Block allocation strategy implementations
-
-class block_allocator_strategy {
-public:
-    static struct block_allocator::blockpair *
-    first_fit(struct block_allocator::blockpair *blocks_array,
-              uint64_t n_blocks, uint64_t size, uint64_t alignment);
-
-    static struct block_allocator::blockpair *
-    best_fit(struct block_allocator::blockpair *blocks_array,
-             uint64_t n_blocks, uint64_t size, uint64_t alignment);
-
-    static struct block_allocator::blockpair *
-    padded_fit(struct block_allocator::blockpair *blocks_array,
-               uint64_t n_blocks, uint64_t size, uint64_t alignment);
-
-    static struct block_allocator::blockpair *
-    heat_zone(struct block_allocator::blockpair *blocks_array,
-              uint64_t n_blocks, uint64_t size, uint64_t alignment,
-              uint64_t heat);
-};
+#include "ft/serialize/rbtree_mhs.h"
+#include "test.h"
+#include <algorithm>
+#include <vector>
+#include <ctime>
+#include <cstdlib>
+
+static void test_insert_remove(void) {
+    uint64_t i;
+    MhsRbTree::Tree *tree = new MhsRbTree::Tree();
+    verbose = 0;
+
+    tree->Insert({0, 100});
+
+    for (i = 0; i < 10; i++) {
+        tree->Remove(3);
+        tree->Remove(2);
+    }
+    tree->ValidateBalance();
+    tree->ValidateMhs();
+
+    for (i = 0; i < 10; i++) {
+        tree->Insert({5 * i, 3});
+    }
+    tree->ValidateBalance();
+    tree->ValidateMhs();
+
+    uint64_t offset = tree->Remove(2);
+    invariant(offset == 0);
+    offset = tree->Remove(10);
+    invariant(offset == 50);
+    offset = tree->Remove(3);
+    invariant(offset == 5);
+    tree->ValidateBalance();
+    tree->ValidateMhs();
+
+    tree->Insert({48, 2});
+    tree->Insert({50, 10});
+
+    tree->ValidateBalance();
+    tree->ValidateMhs();
+
+    tree->Insert({3, 7});
+    offset = tree->Remove(10);
+    invariant(offset == 2);
+    tree->ValidateBalance();
+    tree->ValidateMhs();
+    tree->Dump();
+    delete tree;
+}
+
+int test_main(int argc, const char *argv[]) {
+    default_parse_args(argc, argv);
+
+    test_insert_remove();
+    if (verbose)
+        printf("test ok\n");
+    return 0;
+}
diff --git a/storage/tokudb/PerconaFT/ft/tests/test-rbtree-insert-remove-without-mhs.cc b/storage/tokudb/PerconaFT/ft/tests/test-rbtree-insert-remove-without-mhs.cc
new file mode 100644
index 00000000000..cefe66335a6
--- /dev/null
+++ b/storage/tokudb/PerconaFT/ft/tests/test-rbtree-insert-remove-without-mhs.cc
@@ -0,0 +1,103 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include "ft/serialize/rbtree_mhs.h"
+#include "test.h"
+#include <algorithm>
+#include <vector>
+#include <ctime>
+#include <cstdlib>
+
+#define N 1000000
+std::vector<MhsRbTree::Node::BlockPair> input_vector;
+MhsRbTree::Node::BlockPair old_vector[N];
+
+static int myrandom(int i) { return std::rand() % i; }
+
+static void generate_random_input() {
+    std::srand(unsigned(std::time(0)));
+
+    // set some values:
+    for (uint64_t i = 0; i < N; ++i) {
+        MhsRbTree::Node::BlockPair bp = {i+1, 0};
+        input_vector.push_back(bp);
+        old_vector[i] = bp;
+    }
+    // using built-in random generator:
+    std::random_shuffle(input_vector.begin(), input_vector.end(), myrandom);
+}
+
+static void test_insert_remove(void) {
+    int i;
+    MhsRbTree::Tree *tree = new MhsRbTree::Tree();
+    verbose = 0;
+    generate_random_input();
+    if (verbose) {
+        printf("\n we are going to insert the following block offsets\n");
+        for (i = 0; i < N; i++)
+            printf("%" PRIu64 "\t", input_vector[i]._offset.ToInt());
+    }
+    for (i = 0; i < N; i++) {
+        tree->Insert(input_vector[i]);
+        // tree->ValidateBalance();
+    }
+    tree->ValidateBalance();
+    MhsRbTree::Node::BlockPair *p_bps = &old_vector[0];
+    tree->ValidateInOrder(p_bps);
+    printf("min node of the tree:%" PRIu64 "\n",
+           rbn_offset(tree->MinNode()).ToInt());
+    printf("max node of the tree:%" PRIu64 "\n",
+           rbn_offset(tree->MaxNode()).ToInt());
+
+    for (i = 0; i < N; i++) {
+        // tree->ValidateBalance();
+        tree->RawRemove(input_vector[i]._offset.ToInt());
+    }
+
+    tree->Destroy();
+    delete tree;
+}
+
+int test_main(int argc, const char *argv[]) {
+    default_parse_args(argc, argv);
+
+    test_insert_remove();
+    if (verbose)
+        printf("test ok\n");
+    return 0;
+}
diff --git a/storage/tokudb/PerconaFT/ft/txn/roll.cc b/storage/tokudb/PerconaFT/ft/txn/roll.cc
index 407116b983c..9f3977743a0 100644
--- a/storage/tokudb/PerconaFT/ft/txn/roll.cc
+++ b/storage/tokudb/PerconaFT/ft/txn/roll.cc
@@ -38,18 +38,18 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 
 /* rollback and rollforward routines. */
 
-
-#include "ft/ft.h"
+#include <memory>
 #include "ft/ft-ops.h"
+#include "ft/ft.h"
 #include "ft/log_header.h"
 #include "ft/logger/log-internal.h"
-#include "ft/txn/xids.h"
 #include "ft/txn/rollback-apply.h"
+#include "ft/txn/xids.h"
 
 // functionality provided by roll.c is exposed by an autogenerated
 // header file, logheader.h
 //
-// this (poorly) explains the absense of "roll.h"
+// this (poorly) explains the absence of "roll.h"
 
 // these flags control whether or not we send commit messages for
 // various operations
@@ -162,10 +162,122 @@ toku_rollback_fcreate (FILENUM    filenum,
     // directory row lock for its dname) and we would not get this
     // far if there were other live handles.
     toku_cachefile_unlink_on_close(cf);
+    toku_cachefile_skip_log_recover_on_close(cf);
 done:
     return 0;
 }
 
+int toku_commit_frename(BYTESTRING /* old_name */,
+                        BYTESTRING /* new_iname */,
+                        TOKUTXN /* txn */,
+                        LSN UU(oplsn)) {
+    return 0;
+}
+
+int toku_rollback_frename(BYTESTRING old_iname,
+                          BYTESTRING new_iname,
+                          TOKUTXN txn,
+                          LSN UU(oplsn)) {
+    assert(txn);
+    assert(txn->logger);
+    assert(txn->logger->ct);
+
+    CACHETABLE cachetable = txn->logger->ct;
+
+    toku_struct_stat stat;
+    bool old_exist = true;
+    bool new_exist = true;
+
+    std::unique_ptr<char[], decltype(&toku_free)> old_iname_full(
+        toku_cachetable_get_fname_in_cwd(cachetable, old_iname.data),
+        &toku_free);
+    std::unique_ptr<char[], decltype(&toku_free)> new_iname_full(
+        toku_cachetable_get_fname_in_cwd(cachetable, new_iname.data),
+        &toku_free);
+
+    if (toku_stat(old_iname_full.get(), &stat) == -1) {
+        if (ENOENT == errno)
+            old_exist = false;
+        else
+            return 1;
+    }
+
+    if (toku_stat(new_iname_full.get(), &stat) == -1) {
+        if (ENOENT == errno)
+            new_exist = false;
+        else
+            return 1;
+    }
+
+    // Both old and new files can exist if:
+    // - rename() is not completed
+    // - fcreate was replayed during recovery
+    // The 'stalled cachefiles' container, cachefile_list::m_stale_fileid,
+    // contains closed but not yet evicted cachefiles. The key of this
+    // container is the fs-dependent file id, i.e. the
+    // (device id, inode number) pair.
+    // To preserve the new cachefile file's id and keep it in the
+    // 'stalled cachefiles' container, the old file is removed and the new
+    // file is renamed.
+    if (old_exist && new_exist &&
+        (toku_os_unlink(old_iname_full.get()) == -1 ||
+         toku_os_rename(new_iname_full.get(), old_iname_full.get()) == -1 ||
+         toku_fsync_directory(new_iname_full.get()) == -1 ||
+         toku_fsync_directory(old_iname_full.get()) == -1))
+        return 1;
+
+    if (!old_exist && new_exist &&
+        (toku_os_rename(new_iname_full.get(), old_iname_full.get()) == -1 ||
+         toku_fsync_directory(new_iname_full.get()) == -1 ||
+         toku_fsync_directory(old_iname_full.get()) == -1))
+        return 1;
+
+    // it's ok if both files do not exist on recovery
+    if (!old_exist && !new_exist)
+        assert(txn->for_recovery);
+
+    CACHEFILE cf;
+    int r = toku_cachefile_of_iname_in_env(cachetable, new_iname.data, &cf);
+    if (r != ENOENT) {
+        char *old_fname_in_cf = toku_cachefile_fname_in_env(cf);
+        toku_cachefile_set_fname_in_env(cf, toku_xstrdup(old_iname.data));
+        toku_free(old_fname_in_cf);
+        // There is at least one case when fclose logging causes an error:
+        // 1) start transaction
+        // 2) create ft 'a'(write "fcreate" in recovery log)
+        // 3) rename ft 'a' to 'b'(write "frename" in recovery log)
+        // 4) abort transaction:
+        //    a) rollback rename ft (renames 'b' to 'a')
+        //    b) rollback create ft (removes 'a'):
+        //       invokes toku_cachefile_unlink_on_close - lazy unlink on file
+        //       close,
+        //       it just sets corresponding flag in cachefile object
+        //    c) write "unlink" for 'a' in recovery log
+        //       (when transaction is aborted all locks are released,
+        //       when file lock is released the file is closed and unlinked if
+        //       corresponding flag is set in cachefile object)
+        // 5) crash
+        //
+        // After this we have the following records in recovery log:
+        // - create ft 'a',
+        // - rename 'a' to 'b',
+        // - unlink 'a'
+        //
+        // On recovery:
+        // - create 'a'
+        // - rename 'a' to 'b'
+        // - unlink 'a' - as 'a' file does not exist we have crash on assert
+        // here
+        //
+        // There is no need to write "unlink" in recovery log in (4a) because
+        // 'a' will be removed
+        // on transaction rollback on recovery.
+        toku_cachefile_skip_log_recover_on_close(cf);
+    }
+
+    return 0;
+}
+
 int find_ft_from_filenum (const FT &ft, const FILENUM &filenum);
 int find_ft_from_filenum (const FT &ft, const FILENUM &filenum) {
     FILENUM thisfnum = toku_cachefile_filenum(ft->cf);
diff --git a/storage/tokudb/PerconaFT/ft/txn/rollback-apply.cc b/storage/tokudb/PerconaFT/ft/txn/rollback-apply.cc
index df830afd0df..c9464c3ed60 100644
--- a/storage/tokudb/PerconaFT/ft/txn/rollback-apply.cc
+++ b/storage/tokudb/PerconaFT/ft/txn/rollback-apply.cc
@@ -169,7 +169,7 @@ int toku_rollback_commit(TOKUTXN txn, LSN lsn) {
             txn->roll_info.spilled_rollback_head      = ROLLBACK_NONE; 
             txn->roll_info.spilled_rollback_tail      = ROLLBACK_NONE; 
         }
-        // if we're commiting a child rollback, put its entries into the parent
+        // if we're committing a child rollback, put its entries into the parent
         // by pinning both child and parent and then linking the child log entry
         // list to the end of the parent log entry list.
         if (txn_has_current_rollback_log(txn)) {
diff --git a/storage/tokudb/PerconaFT/ft/txn/rollback-ct-callbacks.cc b/storage/tokudb/PerconaFT/ft/txn/rollback-ct-callbacks.cc
index 68c94c2ad11..08d7c8874e5 100644
--- a/storage/tokudb/PerconaFT/ft/txn/rollback-ct-callbacks.cc
+++ b/storage/tokudb/PerconaFT/ft/txn/rollback-ct-callbacks.cc
@@ -59,21 +59,18 @@ rollback_log_destroy(ROLLBACK_LOG_NODE log) {
 
 // flush an ununused log to disk, by allocating a size 0 blocknum in
 // the blocktable
-static void
-toku_rollback_flush_unused_log(
-    ROLLBACK_LOG_NODE log,
-    BLOCKNUM logname,
-    int fd,
-    FT ft,
-    bool write_me,
-    bool keep_me,
-    bool for_checkpoint,
-    bool is_clone
-    )
-{
+static void toku_rollback_flush_unused_log(ROLLBACK_LOG_NODE log,
+                                           BLOCKNUM logname,
+                                           int fd,
+                                           FT ft,
+                                           bool write_me,
+                                           bool keep_me,
+                                           bool for_checkpoint,
+                                           bool is_clone) {
     if (write_me) {
         DISKOFF offset;
-        ft->blocktable.realloc_on_disk(logname, 0, &offset, ft, fd, for_checkpoint, INT_MAX);
+        ft->blocktable.realloc_on_disk(
+            logname, 0, &offset, ft, fd, for_checkpoint);
     }
     if (!keep_me && !is_clone) {
         toku_free(log);
diff --git a/storage/tokudb/PerconaFT/ft/txn/txn.cc b/storage/tokudb/PerconaFT/ft/txn/txn.cc
index dd03073a3ec..9e48d0d05dd 100644
--- a/storage/tokudb/PerconaFT/ft/txn/txn.cc
+++ b/storage/tokudb/PerconaFT/ft/txn/txn.cc
@@ -269,6 +269,7 @@ static txn_child_manager tcm;
         .state = TOKUTXN_LIVE,
         .num_pin = 0,
         .client_id = 0,
+        .client_extra = nullptr,
         .start_time = time(NULL),
     };
 
@@ -705,12 +706,14 @@ bool toku_txn_has_spilled_rollback(TOKUTXN txn) {
     return txn_has_spilled_rollback_logs(txn);
 }
 
-uint64_t toku_txn_get_client_id(TOKUTXN txn) {
-    return txn->client_id;
+void toku_txn_get_client_id(TOKUTXN txn, uint64_t *client_id, void **client_extra) {
+    *client_id = txn->client_id;
+    *client_extra = txn->client_extra;
 }
 
-void toku_txn_set_client_id(TOKUTXN txn, uint64_t client_id) {
+void toku_txn_set_client_id(TOKUTXN txn, uint64_t client_id, void *client_extra) {
     txn->client_id = client_id;
+    txn->client_extra = client_extra;
 }
 
 time_t toku_txn_get_start_time(struct tokutxn *txn) {
diff --git a/storage/tokudb/PerconaFT/ft/txn/txn.h b/storage/tokudb/PerconaFT/ft/txn/txn.h
index 51a46022150..34a76aa9cad 100644
--- a/storage/tokudb/PerconaFT/ft/txn/txn.h
+++ b/storage/tokudb/PerconaFT/ft/txn/txn.h
@@ -193,6 +193,7 @@ struct tokutxn {
     uint32_t num_pin; // number of threads (all hot indexes) that want this
                       // txn to not transition to commit or abort
     uint64_t client_id;
+    void *client_extra;
     time_t start_time;
 };
 typedef struct tokutxn *TOKUTXN;
@@ -293,8 +294,8 @@ void toku_txn_unpin_live_txn(struct tokutxn *txn);
 
 bool toku_txn_has_spilled_rollback(struct tokutxn *txn);
 
-uint64_t toku_txn_get_client_id(struct tokutxn *txn);
-void toku_txn_set_client_id(struct tokutxn *txn, uint64_t client_id);
+void toku_txn_get_client_id(struct tokutxn *txn, uint64_t *client_id, void **client_extra);
+void toku_txn_set_client_id(struct tokutxn *txn, uint64_t client_id, void *client_extra);
 
 time_t toku_txn_get_start_time(struct tokutxn *txn);
 
diff --git a/storage/tokudb/PerconaFT/ft/ule.cc b/storage/tokudb/PerconaFT/ft/ule.cc
index ec73e148c90..f43094b6070 100644
--- a/storage/tokudb/PerconaFT/ft/ule.cc
+++ b/storage/tokudb/PerconaFT/ft/ule.cc
@@ -588,8 +588,8 @@ bool toku_le_worth_running_garbage_collection(
 //                by new txns.
 //            2.) There is only one committed entry, but the outermost
 //                provisional entry is older than the oldest known referenced
-//                xid, so it must have commited. Therefor we can promote it to
-//                committed and get rid of the old commited entry.
+//                xid, so it must have committed. Therefore we can promote it to
+//                committed and get rid of the old committed entry.
     if (le->type != LE_MVCC) {
         return false;
     }
diff --git a/storage/tokudb/PerconaFT/ftcxx/db_env.hpp b/storage/tokudb/PerconaFT/ftcxx/db_env.hpp
index 071614b87e9..15b5ce55f72 100644
--- a/storage/tokudb/PerconaFT/ftcxx/db_env.hpp
+++ b/storage/tokudb/PerconaFT/ftcxx/db_env.hpp
@@ -202,6 +202,7 @@ namespace ftcxx {
         typedef uint64_t (*get_lock_wait_time_cb_func)(uint64_t);
         get_lock_wait_time_cb_func _get_lock_wait_time_cb;
         lock_timeout_callback _lock_timeout_callback;
+        lock_wait_callback _lock_wait_needed_callback;
         uint64_t (*_loader_memory_size_callback)(void);
 
         uint32_t _cachesize_gbytes;
@@ -231,6 +232,7 @@ namespace ftcxx {
               _lock_wait_time_msec(0),
               _get_lock_wait_time_cb(nullptr),
               _lock_timeout_callback(nullptr),
+              _lock_wait_needed_callback(nullptr),
               _loader_memory_size_callback(nullptr),
               _cachesize_gbytes(0),
               _cachesize_bytes(0),
@@ -296,6 +298,11 @@ namespace ftcxx {
                 handle_ft_retval(r);
             }
 
+            if (_lock_wait_needed_callback) {
+                r = env->set_lock_wait_callback(env, _lock_wait_needed_callback);
+                handle_ft_retval(r);
+            }
+
             if (_loader_memory_size_callback) {
                 env->set_loader_memory_size(env, _loader_memory_size_callback);
             }
@@ -419,6 +426,11 @@ namespace ftcxx {
             return *this;
         }
 
+        DBEnvBuilder& set_lock_wait_callback(lock_wait_callback callback) {
+            _lock_wait_needed_callback = callback;
+            return *this;
+        }
+
         DBEnvBuilder& set_loader_memory_size(uint64_t (*callback)(void)) {
             _loader_memory_size_callback = callback;
             return *this;
diff --git a/storage/tokudb/PerconaFT/locktree/lock_request.cc b/storage/tokudb/PerconaFT/locktree/lock_request.cc
index 22b6da9afc4..943362e1b9d 100644
--- a/storage/tokudb/PerconaFT/locktree/lock_request.cc
+++ b/storage/tokudb/PerconaFT/locktree/lock_request.cc
@@ -65,6 +65,7 @@ void lock_request::create(void) {
     toku_cond_init(&m_wait_cond, nullptr);
 
     m_start_test_callback = nullptr;
+    m_start_before_pending_test_callback = nullptr;
     m_retry_test_callback = nullptr;
 }
 
@@ -79,7 +80,7 @@ void lock_request::destroy(void) {
 }
 
 // set the lock request parameters. this API allows a lock request to be reused.
-void lock_request::set(locktree *lt, TXNID txnid, const DBT *left_key, const DBT *right_key, lock_request::type lock_type, bool big_txn) {
+void lock_request::set(locktree *lt, TXNID txnid, const DBT *left_key, const DBT *right_key, lock_request::type lock_type, bool big_txn, void *extra) {
     invariant(m_state != state::PENDING);
     m_lt = lt;
     m_txnid = txnid;
@@ -91,6 +92,7 @@ void lock_request::set(locktree *lt, TXNID txnid, const DBT *left_key, const DBT
     m_state = state::INITIALIZED;
     m_info = lt ? lt->get_lock_request_info() : nullptr;
     m_big_txn = big_txn;
+    m_extra = extra;
 }
 
 // get rid of any stored left and right key copies and
@@ -173,6 +175,7 @@ int lock_request::start(void) {
         m_state = state::PENDING;
         m_start_time = toku_current_time_microsec() / 1000;
         m_conflicting_txnid = conflicts.get(0);
+        if (m_start_before_pending_test_callback) m_start_before_pending_test_callback();
         toku_mutex_lock(&m_info->mutex);
         insert_into_lock_requests();
         if (deadlock_exists(conflicts)) {
@@ -196,14 +199,32 @@ int lock_request::wait(uint64_t wait_time_ms) {
     return wait(wait_time_ms, 0, nullptr);
 }
 
-int lock_request::wait(uint64_t wait_time_ms, uint64_t killed_time_ms, int (*killed_callback)(void)) {
+int lock_request::wait(uint64_t wait_time_ms, uint64_t killed_time_ms, int (*killed_callback)(void),
+                       void (*lock_wait_callback)(void *, TXNID, TXNID)) {
     uint64_t t_now = toku_current_time_microsec();
     uint64_t t_start = t_now;
     uint64_t t_end = t_start + wait_time_ms * 1000;
 
     toku_mutex_lock(&m_info->mutex);
 
+    // check again, this time locking out other retry calls
+    if (m_state == state::PENDING) {
+        GrowableArray<TXNID> conflicts_collector;
+        conflicts_collector.init();
+        retry(&conflicts_collector);
+        if (m_state == state::PENDING) {
+            report_waits(&conflicts_collector, lock_wait_callback);
+        }
+        conflicts_collector.deinit();
+    }
+
     while (m_state == state::PENDING) {
+        // check if this thread is killed
+        if (killed_callback && killed_callback()) {
+            remove_from_lock_requests();
+            complete(DB_LOCK_NOTGRANTED);
+            continue;
+        }
 
         // compute next wait time
         uint64_t t_wait;
@@ -221,7 +242,7 @@ int lock_request::wait(uint64_t wait_time_ms, uint64_t killed_time_ms, int (*kil
         invariant(r == 0 || r == ETIMEDOUT);
 
         t_now = toku_current_time_microsec();
-        if (m_state == state::PENDING && (t_now >= t_end || (killed_callback && killed_callback()))) {
+        if (m_state == state::PENDING && t_now >= t_end) {
             m_info->counters.timeout_count += 1;
             
             // if we're still pending and we timed out, then remove our
@@ -273,14 +294,16 @@ TXNID lock_request::get_conflicting_txnid(void) const {
     return m_conflicting_txnid;
 }
 
-int lock_request::retry(void) {
+int lock_request::retry(GrowableArray<TXNID> *conflicts_collector) {
+    invariant(m_state == state::PENDING);
     int r;
 
-    invariant(m_state == state::PENDING);
+    txnid_set conflicts;
+    conflicts.create();
     if (m_type == type::WRITE) {
-        r = m_lt->acquire_write_lock(m_txnid, m_left_key, m_right_key, nullptr, m_big_txn);
+        r = m_lt->acquire_write_lock(m_txnid, m_left_key, m_right_key, &conflicts, m_big_txn);
     } else {
-        r = m_lt->acquire_read_lock(m_txnid, m_left_key, m_right_key, nullptr, m_big_txn);
+        r = m_lt->acquire_read_lock(m_txnid, m_left_key, m_right_key, &conflicts, m_big_txn);
     }
 
     // if the acquisition succeeded then remove ourselves from the
@@ -290,59 +313,105 @@ int lock_request::retry(void) {
         complete(r);
         if (m_retry_test_callback) m_retry_test_callback(); // test callback
         toku_cond_broadcast(&m_wait_cond);
+    } else {
+        m_conflicting_txnid = conflicts.get(0);
+        add_conflicts_to_waits(&conflicts, conflicts_collector);
     }
+    conflicts.destroy();
 
     return r;
 }
 
-void lock_request::retry_all_lock_requests(locktree *lt) {
+void lock_request::retry_all_lock_requests(locktree *lt, void (*lock_wait_callback)(void *, TXNID, TXNID), void (*after_retry_all_test_callback)(void)) {
     lt_lock_request_info *info = lt->get_lock_request_info();
 
-    // if a thread reads this bit to be true, then it should go ahead and
-    // take the locktree mutex and retry lock requests. we use this bit
-    // to prevent every single thread from waiting on the locktree mutex
-    // in order to retry requests, especially when no requests actually exist.
-    //
-    // it is important to note that this bit only provides an optimization.
-    // it is not problematic for it to be true when it should be false,
-    // but it can be problematic for it to be false when it should be true.
-    // therefore, the lock request code must ensures that when lock requests
-    // are added to this locktree, the bit is set.
-    // see lock_request::insert_into_lock_requests()
-    if (!info->should_retry_lock_requests) {
+    info->retry_want++;
+
+    // if there are no pending lock requests then there is nothing to do
+    // the unlocked data race on pending_is_empty is OK since lock requests
+    // are retried after being added to the pending set.
+    if (info->pending_is_empty)
         return;
-    }
 
     toku_mutex_lock(&info->mutex);
 
-    // let other threads know that they need not retry lock requests at this time.
-    //
-    // the motivation here is that if a bunch of threads have already released
-    // their locks in the rangetree, then its probably okay for only one thread
-    // to iterate over the list of requests and retry them. otherwise, at high
-    // thread counts and a large number of pending lock requests, you could
-    // end up wasting a lot of cycles.
-    info->should_retry_lock_requests = false;
-
-    size_t i = 0;
-    while (i < info->pending_lock_requests.size()) {
-        lock_request *request;
-        int r = info->pending_lock_requests.fetch(i, &request);
-        invariant_zero(r);
-
-        // retry the lock request. if it didn't succeed,
-        // move on to the next lock request. otherwise
-        // the request is gone from the list so we may
-        // read the i'th entry for the next one.
-        r = request->retry();
-        if (r != 0) {
-            i++;
+    GrowableArray<TXNID> conflicts_collector;
+    conflicts_collector.init();
+
+    // here is the group retry algorithm.
+    // get the latest retry_want count and use it as the generation number of this retry operation.
+    // if this retry generation is > the last retry generation, then do the lock retries.  otherwise,
+    // no lock retries are needed.
+    unsigned long long retry_gen = info->retry_want.load();
+    if (retry_gen > info->retry_done) {
+
+        // retry all of the pending lock requests.
+        for (size_t i = 0; i < info->pending_lock_requests.size(); ) {
+            lock_request *request;
+            int r = info->pending_lock_requests.fetch(i, &request);
+            invariant_zero(r);
+
+            // retry this lock request. if it didn't succeed,
+            // move on to the next lock request. otherwise
+            // the request is gone from the list so we may
+            // read the i'th entry for the next one.
+            r = request->retry(&conflicts_collector);
+            if (r != 0) {
+                i++;
+            }
         }
+        if (after_retry_all_test_callback) after_retry_all_test_callback();
+        info->retry_done = retry_gen;
+    }
+
+    toku_mutex_unlock(&info->mutex);
+
+    report_waits(&conflicts_collector, lock_wait_callback);
+    conflicts_collector.deinit();
+}
+
+void lock_request::add_conflicts_to_waits(txnid_set *conflicts,
+                                          GrowableArray<TXNID> *wait_conflicts) {
+    size_t num_conflicts = conflicts->size();
+    for (size_t i = 0; i < num_conflicts; i++) {
+        wait_conflicts->push(m_txnid);
+        wait_conflicts->push(conflicts->get(i));
     }
+}
+
+void lock_request::report_waits(GrowableArray<TXNID> *wait_conflicts,
+                                void (*lock_wait_callback)(void *, TXNID, TXNID)) {
+    if (!lock_wait_callback)
+        return;
+    size_t num_conflicts = wait_conflicts->get_size();
+    for (size_t i = 0; i < num_conflicts; i += 2) {
+        TXNID blocked_txnid = wait_conflicts->fetch_unchecked(i);
+        TXNID blocking_txnid = wait_conflicts->fetch_unchecked(i+1);
+        (*lock_wait_callback)(nullptr, blocked_txnid, blocking_txnid);
+    }
+}
+
+void *lock_request::get_extra(void) const {
+    return m_extra;
+}
 
-    // future threads should only retry lock requests if some still exist
-    info->should_retry_lock_requests = info->pending_lock_requests.size() > 0;
+void lock_request::kill_waiter(void) {
+    remove_from_lock_requests();
+    complete(DB_LOCK_NOTGRANTED);
+    toku_cond_broadcast(&m_wait_cond);
+}
 
+void lock_request::kill_waiter(locktree *lt, void *extra) {
+    lt_lock_request_info *info = lt->get_lock_request_info();
+    toku_mutex_lock(&info->mutex);
+    for (size_t i = 0; i < info->pending_lock_requests.size(); i++) {
+        lock_request *request;
+        int r = info->pending_lock_requests.fetch(i, &request);
+        if (r == 0 && request->get_extra() == extra) {
+            request->kill_waiter();
+            break;
+        }
+    }
     toku_mutex_unlock(&info->mutex);
 }
 
@@ -364,9 +433,7 @@ void lock_request::insert_into_lock_requests(void) {
     invariant(r == DB_NOTFOUND);
     r = m_info->pending_lock_requests.insert_at(this, idx);
     invariant_zero(r);
-
-    // ensure that this bit is true, now that at least one lock request is in the set
-    m_info->should_retry_lock_requests = true;
+    m_info->pending_is_empty = false;
 }
 
 // remove this lock request from the locktree's set. must hold the mutex.
@@ -378,6 +445,8 @@ void lock_request::remove_from_lock_requests(void) {
     invariant(request == this);
     r = m_info->pending_lock_requests.delete_at(idx);
     invariant_zero(r);
+    if (m_info->pending_lock_requests.size() == 0)
+        m_info->pending_is_empty = true;
 }
 
 int lock_request::find_by_txnid(lock_request * const &request, const TXNID &txnid) {
@@ -395,6 +464,10 @@ void lock_request::set_start_test_callback(void (*f)(void)) {
     m_start_test_callback = f;
 }
 
+void lock_request::set_start_before_pending_test_callback(void (*f)(void)) {
+    m_start_before_pending_test_callback = f;
+}
+
 void lock_request::set_retry_test_callback(void (*f)(void)) {
     m_retry_test_callback = f;
 }
diff --git a/storage/tokudb/PerconaFT/locktree/lock_request.h b/storage/tokudb/PerconaFT/locktree/lock_request.h
index 48d1279cde2..1fa94ef5b96 100644
--- a/storage/tokudb/PerconaFT/locktree/lock_request.h
+++ b/storage/tokudb/PerconaFT/locktree/lock_request.h
@@ -78,7 +78,7 @@ public:
 
     // effect: Resets the lock request parameters, allowing it to be reused.
     // requires: Lock request was already created at some point
-    void set(locktree *lt, TXNID txnid, const DBT *left_key, const DBT *right_key, type lock_type, bool big_txn);
+    void set(locktree *lt, TXNID txnid, const DBT *left_key, const DBT *right_key, type lock_type, bool big_txn, void *extra = nullptr);
 
     // effect: Tries to acquire a lock described by this lock request.
     // returns: The return code of locktree::acquire_[write,read]_lock()
@@ -89,7 +89,8 @@ public:
     // returns: The return code of locktree::acquire_[write,read]_lock()
     //          or simply DB_LOCK_NOTGRANTED if the wait time expired.
     int wait(uint64_t wait_time_ms);
-    int wait(uint64_t wait_time_ms, uint64_t killed_time_ms, int (*killed_callback)(void));
+    int wait(uint64_t wait_time_ms, uint64_t killed_time_ms, int (*killed_callback)(void),
+             void (*lock_wait_callback)(void *, TXNID, TXNID) = nullptr);
 
     // return: left end-point of the lock range
     const DBT *get_left_key(void) const;
@@ -109,12 +110,18 @@ public:
     // effect: Retries all of the lock requests for the given locktree.
     //         Any lock requests successfully restarted is completed and woken up.
     //         The rest remain pending.
-    static void retry_all_lock_requests(locktree *lt);
+    static void retry_all_lock_requests(locktree *lt, void (*lock_wait_callback)(void *, TXNID, TXNID) = nullptr, void (*after_retry_test_callback)(void) = nullptr);
 
     void set_start_test_callback(void (*f)(void));
+    void set_start_before_pending_test_callback(void (*f)(void));
     void set_retry_test_callback(void (*f)(void));
-private:
 
+    void *get_extra(void) const;
+
+    void kill_waiter(void);
+    static void kill_waiter(locktree *lt, void *extra);
+
+private:
     enum state {
         UNINITIALIZED,
         INITIALIZED,
@@ -152,9 +159,11 @@ private:
     // locktree that this lock request is for.
     struct lt_lock_request_info *m_info;
 
+    void *m_extra;
+
     // effect: tries again to acquire the lock described by this lock request
     // returns: 0 if retrying the request succeeded and is now complete
-    int retry(void);
+    int retry(GrowableArray<TXNID> *conflict_collector);
 
     void complete(int complete_r);
 
@@ -186,7 +195,13 @@ private:
 
     static int find_by_txnid(lock_request * const &request, const TXNID &txnid);
 
+    // Report list of conflicts to lock wait callback.
+    static void report_waits(GrowableArray<TXNID> *wait_conflicts,
+                             void (*lock_wait_callback)(void *, TXNID, TXNID));
+    void add_conflicts_to_waits(txnid_set *conflicts, GrowableArray<TXNID> *wait_conflicts);
+
     void (*m_start_test_callback)(void);
+    void (*m_start_before_pending_test_callback)(void);
     void (*m_retry_test_callback)(void);
 
     friend class lock_request_unit_test;
diff --git a/storage/tokudb/PerconaFT/locktree/locktree.cc b/storage/tokudb/PerconaFT/locktree/locktree.cc
index d3596d47eeb..11f8a4e5ff7 100644
--- a/storage/tokudb/PerconaFT/locktree/locktree.cc
+++ b/storage/tokudb/PerconaFT/locktree/locktree.cc
@@ -81,20 +81,14 @@ void locktree::create(locktree_manager *mgr, DICTIONARY_ID dict_id, const compar
     m_sto_end_early_time = 0;
 
     m_lock_request_info.pending_lock_requests.create();
+    m_lock_request_info.pending_is_empty = true;
     ZERO_STRUCT(m_lock_request_info.mutex);
     toku_mutex_init(&m_lock_request_info.mutex, nullptr);
-    m_lock_request_info.should_retry_lock_requests = false;
+    m_lock_request_info.retry_want = m_lock_request_info.retry_done = 0;
     ZERO_STRUCT(m_lock_request_info.counters);
 
-    // Threads read the should retry bit without a lock
-    // for performance. It's ok to read the wrong value.
-    // - If you think you should but you shouldn't, you waste a little time.
-    // - If you think you shouldn't but you should, then some other thread
-    // will come around to do the work of retrying requests instead of you.
-    TOKU_VALGRIND_HG_DISABLE_CHECKING(
-            &m_lock_request_info.should_retry_lock_requests,
-            sizeof(m_lock_request_info.should_retry_lock_requests));
-    TOKU_DRD_IGNORE_VAR(m_lock_request_info.should_retry_lock_requests);
+    TOKU_VALGRIND_HG_DISABLE_CHECKING(&m_lock_request_info.pending_is_empty, sizeof(m_lock_request_info.pending_is_empty));
+    TOKU_DRD_IGNORE_VAR(m_lock_request_info.pending_is_empty);
 }
 
 void locktree::destroy(void) {
diff --git a/storage/tokudb/PerconaFT/locktree/locktree.h b/storage/tokudb/PerconaFT/locktree/locktree.h
index 710f9e7db06..64171c51b23 100644
--- a/storage/tokudb/PerconaFT/locktree/locktree.h
+++ b/storage/tokudb/PerconaFT/locktree/locktree.h
@@ -38,6 +38,8 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 
 #pragma once
 
+#include <atomic>
+
 #include <db.h>
 #include <toku_time.h>
 #include <toku_pthread.h>
@@ -80,9 +82,11 @@ namespace toku {
     // Lock request state for some locktree
     struct lt_lock_request_info {
         omt<lock_request *> pending_lock_requests;
+        std::atomic_bool pending_is_empty;
         toku_mutex_t mutex;
-        bool should_retry_lock_requests;
         lt_counters counters;
+        std::atomic_ullong retry_want;
+        unsigned long long retry_done;
     };
 
     // The locktree manager manages a set of locktrees, one for each open dictionary.
@@ -159,6 +163,8 @@ namespace toku {
         // Add time t to the escalator's wait time statistics
         void add_escalator_wait_time(uint64_t t);
 
+        void kill_waiter(void *extra);
+
     private:
         static const uint64_t DEFAULT_MAX_LOCK_MEMORY = 64L * 1024 * 1024;
 
diff --git a/storage/tokudb/PerconaFT/locktree/manager.cc b/storage/tokudb/PerconaFT/locktree/manager.cc
index 4708cdf4a5a..91ff7c5a007 100644
--- a/storage/tokudb/PerconaFT/locktree/manager.cc
+++ b/storage/tokudb/PerconaFT/locktree/manager.cc
@@ -483,4 +483,17 @@ void locktree_manager::get_status(LTM_STATUS statp) {
     *statp = ltm_status;
 }
 
+void locktree_manager::kill_waiter(void *extra) {
+    mutex_lock();
+    int r = 0;
+    size_t num_locktrees = m_locktree_map.size();
+    for (size_t i = 0; i < num_locktrees; i++) {
+        locktree *lt;
+        r = m_locktree_map.fetch(i, &lt);
+        invariant_zero(r);
+        lock_request::kill_waiter(lt, extra);
+    }
+    mutex_unlock();
+}
+
 } /* namespace toku */
diff --git a/storage/tokudb/PerconaFT/locktree/tests/kill_waiter.cc b/storage/tokudb/PerconaFT/locktree/tests/kill_waiter.cc
new file mode 100644
index 00000000000..8d93c0bbbab
--- /dev/null
+++ b/storage/tokudb/PerconaFT/locktree/tests/kill_waiter.cc
@@ -0,0 +1,100 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+
+// test the lock manager kill waiter function
+
+#include "locktree.h"
+#include "lock_request.h"
+#include "test.h"
+#include "locktree_unit_test.h"
+#include <thread>
+#include <atomic>
+
+namespace toku {
+
+const uint64_t my_lock_wait_time = 1000 * 1000;
+const uint64_t my_killed_time = 500 * 1000;
+const int n_locks = 4;
+
+static int my_killed_callback(void) {
+    if (1) fprintf(stderr, "%s:%u %s\n", __FILE__, __LINE__, __FUNCTION__);
+    return 0;
+}
+
+static void locktree_release_lock(locktree *lt, TXNID txn_id, const DBT *left, const DBT *right) {
+    range_buffer buffer;
+    buffer.create();
+    buffer.append(left, right);
+    lt->release_locks(txn_id, &buffer);
+    buffer.destroy();
+}
+
+static void wait_lock(lock_request *lr, std::atomic_int *done) {
+    int r = lr->wait(my_lock_wait_time, my_killed_time, my_killed_callback);
+    assert(r == DB_LOCK_NOTGRANTED);
+    *done = 1;
+}
+
+static void test_kill_waiter(void) {
+    int r;
+
+    locktree_manager mgr;
+    mgr.create(nullptr, nullptr, nullptr, nullptr);
+
+    DICTIONARY_ID dict_id = { 1 };
+    locktree *lt = mgr.get_lt(dict_id, dbt_comparator, nullptr);
+
+    const DBT *one = get_dbt(1);
+
+    lock_request locks[n_locks];
+    std::thread waiters[n_locks-1];
+    for (int i = 0; i < n_locks; i++) {
+        locks[i].create();
+        locks[i].set(lt, i+1, one, one, lock_request::type::WRITE, false, &waiters[i]);
+    }
+
+    // txn 'n_locks' grabs the lock
+    r = locks[n_locks-1].start();
+    assert_zero(r);
+
+    for (int i = 0; i < n_locks-1; i++) {
+        r = locks[i].start();
+        assert(r == DB_LOCK_NOTGRANTED);
+    }
+
+    std::atomic_int done[n_locks-1];
+    for (int i = 0; i < n_locks-1; i++) {
+        done[i] = 0;
+        waiters[i] = std::thread(wait_lock, &locks[i], &done[i]);
+    }
+
+    for (int i = 0; i < n_locks-1; i++) {
+        assert(!done[i]);
+    }
+
+    sleep(1);
+    for (int i = 0; i < n_locks-1; i++) {
+        mgr.kill_waiter(&waiters[i]);
+        while (!done[i]) sleep(1);
+        waiters[i].join();
+        for (int j = i+1; j < n_locks-1; j++)
+            assert(!done[j]);
+    }
+
+    locktree_release_lock(lt, n_locks, one, one);
+
+    for (int i = 0; i < n_locks; i++) {
+        locks[i].destroy();
+    }
+
+    mgr.release_lt(lt);
+    mgr.destroy();
+}
+
+} /* namespace toku */
+
+int main(void) {
+    toku::test_kill_waiter();
+    return 0;
+}
+
diff --git a/storage/tokudb/PerconaFT/locktree/tests/lock_request_killed.cc b/storage/tokudb/PerconaFT/locktree/tests/lock_request_killed.cc
index efd4092906b..ec464444271 100644
--- a/storage/tokudb/PerconaFT/locktree/tests/lock_request_killed.cc
+++ b/storage/tokudb/PerconaFT/locktree/tests/lock_request_killed.cc
@@ -51,8 +51,9 @@ static uint64_t t_do_kill;
 
 static int my_killed_callback(void) {
     uint64_t t_now = toku_current_time_microsec();
+    if (t_now == t_last_kill)
+        return 0;
     assert(t_now >= t_last_kill);
-    assert(t_now - t_last_kill >= my_killed_time * 1000 / 2); // div by 2 for valgrind which is not very accurate
     t_last_kill = t_now;
     killed_calls++;
     if (t_now >= t_do_kill)
diff --git a/storage/tokudb/PerconaFT/locktree/tests/lock_request_not_killed.cc b/storage/tokudb/PerconaFT/locktree/tests/lock_request_not_killed.cc
index 702e2e2626c..647b4d3c418 100644
--- a/storage/tokudb/PerconaFT/locktree/tests/lock_request_not_killed.cc
+++ b/storage/tokudb/PerconaFT/locktree/tests/lock_request_not_killed.cc
@@ -52,7 +52,6 @@ static uint64_t t_last_kill;
 static int my_killed_callback(void) {
     uint64_t t_now = toku_current_time_microsec();
     assert(t_now >= t_last_kill);
-    assert(t_now - t_last_kill >= my_killed_time * 1000 / 2); // div by 2 for valgrind which is not very accurate
     t_last_kill = t_now;
     killed_calls++;
     return 0;
diff --git a/storage/tokudb/PerconaFT/locktree/tests/lock_request_start_release_wait.cc b/storage/tokudb/PerconaFT/locktree/tests/lock_request_start_release_wait.cc
new file mode 100644
index 00000000000..eb19ceb70e5
--- /dev/null
+++ b/storage/tokudb/PerconaFT/locktree/tests/lock_request_start_release_wait.cc
@@ -0,0 +1,89 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+
+// test the race between start, release, and wait.  since start does not put its
+// lock request into the pending set, the blocking txn could release its lock before
+// the first txn waits.  this will block the first txn because its lock request is
+// not known when the lock is released.  the bug fix is to try again when lock retries
+// are locked out.
+
+#include "locktree.h"
+#include "lock_request.h"
+#include "test.h"
+#include "locktree_unit_test.h"
+#include <thread>
+#include <atomic>
+
+namespace toku {
+
+const uint64_t my_lock_wait_time = 1000 * 1000; // ms
+const uint64_t my_killed_time = 1 * 1000; // ms
+
+static uint64_t t_wait;
+
+static int my_killed_callback(void) {
+    uint64_t t_now = toku_current_time_microsec();
+    assert(t_now >= t_wait);
+    if (t_now - t_wait >= my_killed_time*1000)
+        abort();
+    return 0;
+}
+
+static void locktree_release_lock(locktree *lt, TXNID txn_id, const DBT *left, const DBT *right) {
+    range_buffer buffer;
+    buffer.create();
+    buffer.append(left, right);
+    lt->release_locks(txn_id, &buffer);
+    buffer.destroy();
+}
+
+static void test_start_release_wait(void) {
+    int r;
+
+    locktree_manager mgr;
+    mgr.create(nullptr, nullptr, nullptr, nullptr);
+
+    DICTIONARY_ID dict_id = { 1 };
+    locktree *lt = mgr.get_lt(dict_id, dbt_comparator, nullptr);
+
+    const DBT *one = get_dbt(1);
+
+    // a locks one
+    lock_request a;
+    a.create();
+    a.set(lt, 1, one, one, lock_request::type::WRITE, false);
+    r = a.start();
+    assert(r == 0);
+
+    // b tries to lock one, fails
+    lock_request b;
+    b.create();
+    b.set(lt, 2, one, one, lock_request::type::WRITE, false);
+    r = b.start();
+    assert(r == DB_LOCK_NOTGRANTED);
+
+    // a releases its lock
+    locktree_release_lock(lt, 1, one, one);
+
+    // b waits for one, gets the lock immediately
+    t_wait = toku_current_time_microsec();
+    r = b.wait(my_lock_wait_time, my_killed_time, my_killed_callback);
+    assert(r == 0);
+
+    // b releases its lock so we can exit cleanly
+    locktree_release_lock(lt, 2, one, one);
+
+    a.destroy();
+    b.destroy();
+
+    mgr.release_lt(lt);
+    mgr.destroy();
+}
+
+} /* namespace toku */
+
+int main(void) {
+    toku::test_start_release_wait();
+    return 0;
+}
+
diff --git a/storage/tokudb/PerconaFT/locktree/tests/lock_request_start_retry_race.cc b/storage/tokudb/PerconaFT/locktree/tests/lock_request_start_retry_race.cc
index 3b653e9c6ef..88493ec9ce0 100644
--- a/storage/tokudb/PerconaFT/locktree/tests/lock_request_start_retry_race.cc
+++ b/storage/tokudb/PerconaFT/locktree/tests/lock_request_start_retry_race.cc
@@ -37,6 +37,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 #ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
 
 #include <iostream>
+#include <thread>
 #include "test.h"
 #include "locktree.h"
 #include "lock_request.h"
@@ -47,15 +48,6 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 
 namespace toku {
 
-struct locker_arg {
-    locktree *_lt;
-    TXNID _id;
-    const DBT *_key;
-
-    locker_arg(locktree *lt, TXNID id, const DBT *key) : _lt(lt), _id(id), _key(key) {
-    }
-};
-
 static void locker_callback(void) {
     usleep(10000);
 }
@@ -97,20 +89,13 @@ static void run_locker(locktree *lt, TXNID txnid, const DBT *key) {
 
         toku_pthread_yield();
         if ((i % 10) == 0)
-            std::cout << toku_pthread_self() << " " << i << std::endl;
+            std::cout << std::this_thread::get_id() << " " << i << std::endl;
     }
 }
 
-static void *locker(void *v_arg) {
-    locker_arg *arg = static_cast<locker_arg *>(v_arg);
-    run_locker(arg->_lt, arg->_id, arg->_key);
-    return arg;
-}
-
 } /* namespace toku */
 
 int main(void) {
-    int r;
 
     toku::locktree lt;
     DICTIONARY_ID dict_id = { 1 };
@@ -119,18 +104,12 @@ int main(void) {
     const DBT *one = toku::get_dbt(1);
 
     const int n_workers = 2;
-    toku_pthread_t ids[n_workers];
+    std::thread worker[n_workers];
     for (int i = 0; i < n_workers; i++) {
-        toku::locker_arg *arg = new toku::locker_arg(&lt, i, one);
-        r = toku_pthread_create(&ids[i], nullptr, toku::locker, arg);
-        assert_zero(r);
+        worker[i] = std::thread(toku::run_locker, &lt, i, one);
     }
     for (int i = 0; i < n_workers; i++) {
-        void *ret;
-        r = toku_pthread_join(ids[i], &ret);
-        assert_zero(r);
-        toku::locker_arg *arg = static_cast<toku::locker_arg *>(ret);
-        delete arg;
+        worker[i].join();
     }
 
     lt.release_reference();
diff --git a/storage/tokudb/PerconaFT/locktree/tests/lock_request_start_retry_race_3.cc b/storage/tokudb/PerconaFT/locktree/tests/lock_request_start_retry_race_3.cc
new file mode 100644
index 00000000000..8f0d86c9f64
--- /dev/null
+++ b/storage/tokudb/PerconaFT/locktree/tests/lock_request_start_retry_race_3.cc
@@ -0,0 +1,127 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <iostream>
+#include <thread>
+#include <pthread.h>
+#include "test.h"
+#include "locktree.h"
+#include "lock_request.h"
+
+// Suppose that 3 threads are running a lock acquire, release, retry sequence.  There is
+// a race in the retry algorithm with 2 threads running lock retry simultaneously.  The
+// first thread to run retry sets a flag that will cause the second thread to skip the
+// lock retries.  If the first thread progressed past the contended lock, then the second
+// thread will HANG until its lock timer pops, even when the contended lock is no longer held.
+
+// This test exposes this problem as a test hang.  The group retry algorithm fixes the race
+// in the lock request retry algorithm and this test should no longer hang.
+
+namespace toku {
+
+// use 1000 when after_retry_all is implemented, otherwise use 100000
+static const int n_tests = 1000; // 100000;
+
+static void after_retry_all(void) {
+    usleep(10000);
+}
+
+static void run_locker(locktree *lt, TXNID txnid, const DBT *key, pthread_barrier_t *b) {
+    for (int i = 0; i < n_tests; i++) {
+        int r;
+        r = pthread_barrier_wait(b); assert(r == 0 || r == PTHREAD_BARRIER_SERIAL_THREAD);
+
+        lock_request request;
+        request.create();
+
+        request.set(lt, txnid, key, key, lock_request::type::WRITE, false);
+
+        // try to acquire the lock
+        r = request.start();
+        if (r == DB_LOCK_NOTGRANTED) {
+            // wait for the lock to be granted
+            r = request.wait(1000 * 1000);
+        }
+
+        if (r == 0) {
+            // release the lock
+            range_buffer buffer;
+            buffer.create();
+            buffer.append(key, key);
+            lt->release_locks(txnid, &buffer);
+            buffer.destroy();
+
+            // retry pending lock requests
+            lock_request::retry_all_lock_requests(lt, nullptr, after_retry_all);
+        }
+
+        request.destroy();
+        memset(&request, 0xab, sizeof request);
+
+        toku_pthread_yield();
+        if ((i % 10) == 0)
+            std::cout << std::this_thread::get_id() << " " << i << std::endl;
+    }
+}
+
+} /* namespace toku */
+
+int main(void) {
+
+    toku::locktree lt;
+    DICTIONARY_ID dict_id = { 1 };
+    lt.create(nullptr, dict_id, toku::dbt_comparator);
+
+    const DBT *one = toku::get_dbt(1);
+
+    const int n_workers = 3;
+    std::thread worker[n_workers];
+    pthread_barrier_t b;
+    int r = pthread_barrier_init(&b, nullptr, n_workers); assert(r == 0);
+    for (int i = 0; i < n_workers; i++) {
+        worker[i] = std::thread(toku::run_locker, &lt, i, one, &b);
+    }
+    for (int i = 0; i < n_workers; i++) {
+        worker[i].join();
+    }
+    r = pthread_barrier_destroy(&b); assert(r == 0);
+    lt.release_reference();
+    lt.destroy();
+    return 0;
+}
+
diff --git a/storage/tokudb/PerconaFT/locktree/tests/lock_request_start_retry_wait_race_2.cc b/storage/tokudb/PerconaFT/locktree/tests/lock_request_start_retry_wait_race_2.cc
new file mode 100644
index 00000000000..a2ceff99edb
--- /dev/null
+++ b/storage/tokudb/PerconaFT/locktree/tests/lock_request_start_retry_wait_race_2.cc
@@ -0,0 +1,128 @@
+/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
+#ident "$Id$"
+/*======
+This file is part of PerconaFT.
+
+
+Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU General Public License, version 2,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+
+----------------------------------------
+
+    PerconaFT is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License, version 3,
+    as published by the Free Software Foundation.
+
+    PerconaFT is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
+======= */
+
+#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
+
+#include <iostream>
+#include <thread>
+#include <pthread.h>
+#include "test.h"
+#include "locktree.h"
+#include "lock_request.h"
+
+// Suppose that 2 threads are running a lock acquire, release, retry sequence.  There is a
+// race between the acquire and the release with 2 threads.  If thread 1 acquires a lock,
+// and thread 2 tries to acquire the same lock and fails, thread 1 may release its lock and retry
+// pending lock requests BEFORE thread 2 adds itself to the pending lock requests.  If this
+// happens, then thread 2 will HANG until its lock timer expires even when the lock it is
+// waiting for is FREE.
+
+// This test exposes this problem as a test hang.  If the race is fixed, then the test runs to
+// completion.
+
+namespace toku {
+
+static void start_before_pending(void) {
+    usleep(10000);
+}
+
+static void run_locker(locktree *lt, TXNID txnid, const DBT *key, pthread_barrier_t *b) {
+    for (int i = 0; i < 100000; i++) {
+        int r;
+        r = pthread_barrier_wait(b); assert(r == 0 || r == PTHREAD_BARRIER_SERIAL_THREAD);
+
+        lock_request request;
+        request.create();
+        request.set(lt, txnid, key, key, lock_request::type::WRITE, false);
+
+        // if the callback is included, then the race is easy to reproduce.  Otherwise, several
+        // test runs may be required before the race happens.
+        if (1) request.set_start_before_pending_test_callback(start_before_pending);
+
+        // try to acquire the lock
+        r = request.start();
+        if (r == DB_LOCK_NOTGRANTED) {
+            // wait for the lock to be granted
+            r = request.wait(1000 * 1000);
+        }
+
+        if (r == 0) {
+            // release the lock
+            range_buffer buffer;
+            buffer.create();
+            buffer.append(key, key);
+            lt->release_locks(txnid, &buffer);
+            buffer.destroy();
+
+            // retry pending lock requests
+            lock_request::retry_all_lock_requests(lt);
+        }
+
+        request.destroy();
+        memset(&request, 0xab, sizeof request);
+
+        toku_pthread_yield();
+        if ((i % 10) == 0)
+            std::cout << std::this_thread::get_id() << " " << i << std::endl;
+    }
+}
+
+} /* namespace toku */
+
+int main(void) {
+
+    toku::locktree lt;
+    DICTIONARY_ID dict_id = { 1 };
+    lt.create(nullptr, dict_id, toku::dbt_comparator);
+
+    const DBT *one = toku::get_dbt(1);
+
+    const int n_workers = 2;
+    std::thread worker[n_workers];
+    pthread_barrier_t b;
+    int r = pthread_barrier_init(&b, nullptr, n_workers); assert(r == 0);
+    for (int i = 0; i < n_workers; i++) {
+        worker[i] = std::thread(toku::run_locker, &lt, i, one, &b);
+    }
+    for (int i = 0; i < n_workers; i++) {
+        worker[i].join();
+    }
+    r = pthread_barrier_destroy(&b); assert(r == 0);
+    lt.release_reference();
+    lt.destroy();
+    return 0;
+}
+
diff --git a/storage/tokudb/PerconaFT/portability/CMakeLists.txt b/storage/tokudb/PerconaFT/portability/CMakeLists.txt
index 9f84d9b03df..4793db63cc1 100644
--- a/storage/tokudb/PerconaFT/portability/CMakeLists.txt
+++ b/storage/tokudb/PerconaFT/portability/CMakeLists.txt
@@ -14,12 +14,11 @@ set(tokuportability_srcs
   )
 
 add_library(${LIBTOKUPORTABILITY} SHARED ${tokuportability_srcs})
-target_link_libraries(${LIBTOKUPORTABILITY} LINK_PRIVATE ${LIBJEMALLOC})
 target_link_libraries(${LIBTOKUPORTABILITY} LINK_PUBLIC ${CMAKE_THREAD_LIBS_INIT} ${EXTRA_SYSTEM_LIBS})
 
 add_library(tokuportability_static_conv STATIC ${tokuportability_srcs})
 set_target_properties(tokuportability_static_conv PROPERTIES POSITION_INDEPENDENT_CODE ON)
-set(tokuportability_source_libs tokuportability_static_conv ${LIBJEMALLOC} ${CMAKE_THREAD_LIBS_INIT} ${EXTRA_SYSTEM_LIBS})
+set(tokuportability_source_libs tokuportability_static_conv ${CMAKE_THREAD_LIBS_INIT} ${EXTRA_SYSTEM_LIBS})
 toku_merge_static_libs(${LIBTOKUPORTABILITY}_static ${LIBTOKUPORTABILITY}_static "${tokuportability_source_libs}")
 
 maybe_add_gcov_to_libraries(${LIBTOKUPORTABILITY} tokuportability_static_conv)
diff --git a/storage/tokudb/PerconaFT/portability/file.cc b/storage/tokudb/PerconaFT/portability/file.cc
index 5332a2dff55..0e3efc1a12a 100644
--- a/storage/tokudb/PerconaFT/portability/file.cc
+++ b/storage/tokudb/PerconaFT/portability/file.cc
@@ -356,6 +356,12 @@ toku_os_close(int fd) {  // if EINTR, retry until success
     return r;
 }
 
+int toku_os_rename(const char *old_name, const char *new_name) {
+    return rename(old_name, new_name);  // rename() result is passed through unchanged
+}
+// Forwards to unlink() and returns its result unchanged.
+int toku_os_unlink(const char *path) { return unlink(path); }
+
 ssize_t 
 toku_os_read(int fd, void *buf, size_t count) {
     ssize_t r;
diff --git a/storage/tokudb/PerconaFT/portability/memory.cc b/storage/tokudb/PerconaFT/portability/memory.cc
index 2de12699c61..5430ff84b70 100644
--- a/storage/tokudb/PerconaFT/portability/memory.cc
+++ b/storage/tokudb/PerconaFT/portability/memory.cc
@@ -313,6 +313,15 @@ toku_strdup(const char *s) {
     return (char *) toku_memdup(s, strlen(s)+1);
 }
 
+char *toku_strndup(const char *s, size_t n) {
+    size_t len = strnlen(s, n);  // bounded scan: never reads past s[n - 1]
+    char *result = (char *)toku_malloc(len + 1);
+    if (result == nullptr) return nullptr;  // propagate allocation failure
+    memcpy(result, s, len);
+    result[len] = 0;  // copy holds at most n chars and is always NUL-terminated
+    return result;
+}
+
 void
 toku_free(void *p) {
     if (p) {
diff --git a/storage/tokudb/PerconaFT/portability/memory.h b/storage/tokudb/PerconaFT/portability/memory.h
index 7780536f279..5ae652d39fc 100644
--- a/storage/tokudb/PerconaFT/portability/memory.h
+++ b/storage/tokudb/PerconaFT/portability/memory.h
@@ -125,7 +125,9 @@ size_t toku_malloc_usable_size(void *p) __attribute__((__visibility__("default")
 void *toku_memdup (const void *v, size_t len);
 /* Toku-version of strdup.  Use this so that it calls toku_malloc() */
 char *toku_strdup (const char *s)   __attribute__((__visibility__("default")));
-
+/* Toku-version of strndup.  Use this so that it calls toku_malloc() */
+char *toku_strndup(const char *s, size_t n)
+    __attribute__((__visibility__("default")));
 /* Copy memory.  Analogous to strdup() Crashes instead of returning NULL */
 void *toku_xmemdup (const void *v, size_t len) __attribute__((__visibility__("default")));
 /* Toku-version of strdup.  Use this so that it calls toku_xmalloc()  Crashes instead of returning NULL */
diff --git a/storage/tokudb/PerconaFT/portability/portability.cc b/storage/tokudb/PerconaFT/portability/portability.cc
index ba9f8d48ed5..19f445a85d7 100644
--- a/storage/tokudb/PerconaFT/portability/portability.cc
+++ b/storage/tokudb/PerconaFT/portability/portability.cc
@@ -63,6 +63,9 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 #if defined(HAVE_SYS_SYSCTL_H)
 # include <sys/sysctl.h>
 #endif
+#if defined(HAVE_PTHREAD_H)
+# include <pthread.h>
+#endif
 #if defined(HAVE_PTHREAD_NP_H)
 # include <pthread_np.h>
 #endif
@@ -102,7 +105,11 @@ toku_os_getpid(void) {
 
 int
 toku_os_gettid(void) {
-#if defined(__NR_gettid)
+#if defined(HAVE_PTHREAD_THREADID_NP)
+    uint64_t result;
+    pthread_threadid_np(NULL, &result);
+    return (int) result; // Used for instrumentation so overflow is ok here.
+#elif defined(__NR_gettid)
     return syscall(__NR_gettid);
 #elif defined(SYS_gettid)
     return syscall(SYS_gettid);
diff --git a/storage/tokudb/PerconaFT/portability/tests/test-max-data.cc b/storage/tokudb/PerconaFT/portability/tests/test-max-data.cc
index 880f9a3a9bb..dbbea974a49 100644
--- a/storage/tokudb/PerconaFT/portability/tests/test-max-data.cc
+++ b/storage/tokudb/PerconaFT/portability/tests/test-max-data.cc
@@ -64,7 +64,7 @@ int main(int argc, char *const argv[]) {
     if (verbose) printf("maxdata=%" PRIu64 " 0x%" PRIx64 "\n", maxdata, maxdata);
 
     // check the data size
-#if __x86_64__
+#if defined(__x86_64__) || defined(__aarch64__)
     assert(maxdata > (1ULL << 32));
 #elif __i386__
     assert(maxdata < (1ULL << 32));
diff --git a/storage/tokudb/PerconaFT/portability/tests/test-xid.cc b/storage/tokudb/PerconaFT/portability/tests/test-xid.cc
index 9ee68906bb3..71736f898ef 100644
--- a/storage/tokudb/PerconaFT/portability/tests/test-xid.cc
+++ b/storage/tokudb/PerconaFT/portability/tests/test-xid.cc
@@ -51,11 +51,18 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 #if defined(HAVE_PTHREAD_NP_H)
 # include <pthread_np.h>
 #endif
+#if defined(HAVE_PTHREAD_H)
+# include <pthread.h>
+#endif
 
 // since we implement the same thing here as in toku_os_gettid, this test
 // is pretty pointless
 static int gettid(void) {
-#if defined(__NR_gettid)
+#if defined(HAVE_PTHREAD_THREADID_NP)
+    uint64_t result;
+    pthread_threadid_np(NULL, &result);
+    return (int) result;
+#elif defined(__NR_gettid)
     return syscall(__NR_gettid);
 #elif defined(SYS_gettid)
     return syscall(SYS_gettid);
diff --git a/storage/tokudb/PerconaFT/portability/toku_config.h.in b/storage/tokudb/PerconaFT/portability/toku_config.h.in
index 9033f27fd25..714835c2581 100644
--- a/storage/tokudb/PerconaFT/portability/toku_config.h.in
+++ b/storage/tokudb/PerconaFT/portability/toku_config.h.in
@@ -42,7 +42,6 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 
 #cmakedefine TOKU_DEBUG_PARANOID 1
 #cmakedefine USE_VALGRIND 1
-
 #cmakedefine HAVE_ALLOCA_H 1
 #cmakedefine HAVE_ARPA_INET_H 1
 #cmakedefine HAVE_BYTESWAP_H 1
@@ -92,6 +91,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 #cmakedefine HAVE_PTHREAD_RWLOCKATTR_SETKIND_NP 1
 #cmakedefine HAVE_PTHREAD_YIELD 1
 #cmakedefine HAVE_PTHREAD_YIELD_NP 1
+#cmakedefine HAVE_PTHREAD_THREADID_NP 1
 #cmakedefine HAVE_PTHREAD_GETTHREADID_NP 1
 
 #cmakedefine PTHREAD_YIELD_RETURNS_INT 1
diff --git a/storage/tokudb/PerconaFT/portability/toku_portability.h b/storage/tokudb/PerconaFT/portability/toku_portability.h
index 459567552b1..1f1215def5b 100644
--- a/storage/tokudb/PerconaFT/portability/toku_portability.h
+++ b/storage/tokudb/PerconaFT/portability/toku_portability.h
@@ -248,6 +248,8 @@ int toku_os_open(const char *path, int oflag, int mode);
 int toku_os_open_direct(const char *path, int oflag, int mode);
 int toku_os_close(int fd);
 int toku_os_fclose(FILE * stream);
+int toku_os_rename(const char *old_name, const char *new_name);
+int toku_os_unlink(const char *path);
 ssize_t toku_os_read(int fd, void *buf, size_t count);
 ssize_t toku_os_pread(int fd, void *buf, size_t count, off_t offset);
 void toku_os_recursive_delete(const char *path);
diff --git a/storage/tokudb/PerconaFT/portability/toku_time.h b/storage/tokudb/PerconaFT/portability/toku_time.h
index 11a3f3aa2b9..a1278ef0337 100644
--- a/storage/tokudb/PerconaFT/portability/toku_time.h
+++ b/storage/tokudb/PerconaFT/portability/toku_time.h
@@ -98,9 +98,17 @@ double tokutime_to_seconds(tokutime_t)  __attribute__((__visibility__("default")
 
 // Get the value of tokutime for right now.  We want this to be fast, so we expose the implementation as RDTSC.
 static inline tokutime_t toku_time_now(void) {
+#if defined(__x86_64__) || defined(__i386__)
     uint32_t lo, hi;
     __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
     return (uint64_t)hi << 32 | lo;
+#elif defined (__aarch64__)
+    uint64_t result;
+    __asm __volatile__ ("mrs %[rt], cntvct_el0" : [rt] "=r" (result));
+    return result;
+#else
+#error No timer implementation for this platform
+#endif
 }
 
 static inline uint64_t toku_current_time_microsec(void) {
diff --git a/storage/tokudb/PerconaFT/src/indexer-internal.h b/storage/tokudb/PerconaFT/src/indexer-internal.h
index 48e62ee49b2..fdaa561e3d0 100644
--- a/storage/tokudb/PerconaFT/src/indexer-internal.h
+++ b/storage/tokudb/PerconaFT/src/indexer-internal.h
@@ -42,7 +42,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 #include <toku_pthread.h>
 
 // the indexer_commit_keys is an ordered set of keys described by a DBT in the keys array.
-// the array is a resizeable array with max size "max_keys" and current size "current_keys".
+// the array is a resizable array with max size "max_keys" and current size "current_keys".
 // the ordered set is used by the hotindex undo function to collect the commit keys.
 struct indexer_commit_keys {
     int max_keys;        // max number of keys
diff --git a/storage/tokudb/PerconaFT/src/indexer-undo-do.cc b/storage/tokudb/PerconaFT/src/indexer-undo-do.cc
index 8d0b080b9fe..4c7f5336161 100644
--- a/storage/tokudb/PerconaFT/src/indexer-undo-do.cc
+++ b/storage/tokudb/PerconaFT/src/indexer-undo-do.cc
@@ -528,7 +528,7 @@ indexer_find_prev_xr(DB_INDEXER *UU(indexer), ULEHANDLE ule, uint64_t xrindex, u
 }
 
 // inject "delete" message into ft with logging in recovery and rollback logs,
-// and making assocation between txn and ft
+// and making association between txn and ft
 static int 
 indexer_ft_delete_provisional(DB_INDEXER *indexer, DB *hotdb, DBT *hotkey, XIDS xids, TOKUTXN txn) {
     int result = 0;
@@ -577,7 +577,7 @@ indexer_ft_delete_committed(DB_INDEXER *indexer, DB *hotdb, DBT *hotkey, XIDS xi
 }
 
 // inject "insert" message into ft with logging in recovery and rollback logs,
-// and making assocation between txn and ft
+// and making association between txn and ft
 static int 
 indexer_ft_insert_provisional(DB_INDEXER *indexer, DB *hotdb, DBT *hotkey, DBT *hotval, XIDS xids, TOKUTXN txn) {
     int result = 0;
diff --git a/storage/tokudb/PerconaFT/src/tests/CMakeLists.txt b/storage/tokudb/PerconaFT/src/tests/CMakeLists.txt
index 47f6aa44a75..c01a8f0d628 100644
--- a/storage/tokudb/PerconaFT/src/tests/CMakeLists.txt
+++ b/storage/tokudb/PerconaFT/src/tests/CMakeLists.txt
@@ -108,11 +108,11 @@ if(BUILD_TESTING OR BUILD_SRC_TESTS)
   foreach(ov c d r)
 
     if (ov STREQUAL c)
-      set(gset 0)
       set(hset 0)
+      set(iset 0)
     else ()
-      set(gset 0 1 2 3 4 5)
-      set(hset 0 1)
+      set(hset 0 1 2 3 4 5)
+      set(iset 0 1)
     endif ()
 
     foreach(av 0 1)
@@ -130,25 +130,27 @@ if(BUILD_TESTING OR BUILD_SRC_TESTS)
           foreach(dv ${dset})
             foreach(ev ${eset})
               foreach(fv 0 1)
-                foreach(gv ${gset})
+                foreach(gv 0 1)
                   foreach(hv ${hset})
-
-                    if ((NOT ov STREQUAL c) AND (NOT cv) AND ((NOT bv) OR (NOT ev) OR (dv)))
-                      set(iset 0 1)
-                    else ()
-                      set(iset 0)
-                    endif ()
-
                     foreach(iv ${iset})
-                      set(testname "ydb/recovery_fileops_unit.${ov}${av}${bv}${cv}${dv}${ev}${fv}${gv}${hv}${iv}")
-                      set(envdir "recovery_fileops_unit_dir/${ov}${av}${bv}${cv}${dv}${ev}${fv}${gv}${hv}${iv}")
-                      set(errfile "recovery_fileops_unit_dir/${ov}${av}${bv}${cv}${dv}${ev}${fv}${gv}${hv}${iv}.ctest-errors")
-                      add_test(NAME ${testname}
-                        COMMAND run_recovery_fileops_unit.sh $<TARGET_FILE:recovery_fileops_unit.tdb> ${errfile} 137
-                        -O ${ov} -A ${av} -B ${bv} -C ${cv} -D ${dv} -E ${ev} -F ${fv} -G ${gv} -H ${hv} -I ${iv}
-                        )
-                      setup_toku_test_properties(${testname} ${envdir})
-                      set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES "${errfile}")
+
+                      if ((NOT ov STREQUAL c) AND (NOT cv) AND ((NOT bv) OR (NOT ev) OR (dv)))
+                        set(jset 0 1)
+                      else ()
+                        set(jset 0)
+                      endif ()
+
+                      foreach(jv ${jset})
+                        set(testname "ydb/recovery_fileops_unit.${ov}${av}${bv}${cv}${dv}${ev}${fv}${gv}${hv}${iv}${jv}")
+                        set(envdir "recovery_fileops_unit_dir/${ov}${av}${bv}${cv}${dv}${ev}${fv}${gv}${hv}${iv}${jv}")
+                        set(errfile "recovery_fileops_unit_dir/${ov}${av}${bv}${cv}${dv}${ev}${fv}${gv}${hv}${iv}${jv}.ctest-errors")
+                        add_test(NAME ${testname}
+                          COMMAND run_recovery_fileops_unit.sh $<TARGET_FILE:recovery_fileops_unit.tdb> ${errfile} 137
+                          -O ${ov} -A ${av} -B ${bv} -C ${cv} -D ${dv} -E ${ev} -F ${fv} -G ${gv} -H ${hv} -I ${iv} -J ${jv}
+                          )
+                        setup_toku_test_properties(${testname} ${envdir})
+                        set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_MAKE_CLEAN_FILES "${errfile}")
+                      endforeach(jv)
                     endforeach(iv)
                   endforeach(hv)
                 endforeach(gv)
diff --git a/storage/tokudb/PerconaFT/src/tests/hotindexer-undo-do-tests/commit.i0.test b/storage/tokudb/PerconaFT/src/tests/hotindexer-undo-do-tests/commit.i0.test
index 20df13923e6..7cce68e6ff8 100644
--- a/storage/tokudb/PerconaFT/src/tests/hotindexer-undo-do-tests/commit.i0.test
+++ b/storage/tokudb/PerconaFT/src/tests/hotindexer-undo-do-tests/commit.i0.test
@@ -1,3 +1,3 @@
-# commited insert
+# committed insert
 key k1
 insert committed 0 v100
diff --git a/storage/tokudb/PerconaFT/src/tests/recovery_fileops_unit.cc b/storage/tokudb/PerconaFT/src/tests/recovery_fileops_unit.cc
index a4dc0ea9236..cc99ab560d8 100644
--- a/storage/tokudb/PerconaFT/src/tests/recovery_fileops_unit.cc
+++ b/storage/tokudb/PerconaFT/src/tests/recovery_fileops_unit.cc
@@ -36,17 +36,17 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 
 #ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
 
-#include "test.h"
-#include "toku_pthread.h"
 #include <db.h>
-#include <sys/stat.h>
 #include <stdlib.h>
-
+#include <sys/stat.h>
+#include "ft/logger/logger.h"
+#include "test.h"
+#include "toku_pthread.h"
 
 static int do_recover;
 static int do_crash;
 static char fileop;
-static int choices['I'-'A'+1];
+static int choices['J' - 'A' + 1];
 const int num_choices = sizeof(choices)/sizeof(choices[0]);
 static DB_TXN *txn;
 const char *oldname = "oldfoo";
@@ -58,11 +58,14 @@ static char *cmd;
 
 static void
 usage(void) {
-    fprintf(stderr, "Usage:\n%s [-v|-q]* [-h] (-c|-r) -O fileop -A# -B# -C# -D# -E# -F# [-G# -H# -I#]\n"
-                    "  fileop = c/r/d (create/rename/delete)\n"
-                    "  Where # is a single digit number > 0.\n"
-                    "  A-F are required for fileop=create\n"
-                    "  A-I are required for fileop=delete, fileop=rename\n", cmd);
+    fprintf(stderr,
+            "Usage:\n%s [-v|-q]* [-h] (-c|-r) -O fileop -A# -B# -C# -D# -E# "
+            "-F# -G# [-H# -I# -J#]\n"
+            "  fileop = c/r/d (create/rename/delete)\n"
+            "  Where # is a single digit number >= 0.\n"
+            "  A-G are required for fileop=create\n"
+            "  A-J are required for fileop=delete, fileop=rename\n",
+            cmd);
     exit(1);
 }
 
@@ -129,19 +132,18 @@ get_choice_flush_log_before_crash(void) {
     return get_bool_choice('F');
 }
 
-static int
-get_choice_create_type(void) {
-    return get_x_choice('G', 6);
-}
+static int get_choice_dir_per_db(void) { return get_bool_choice('G'); }
+
+static int get_choice_create_type(void) { return get_x_choice('H', 6); }
 
 static int
 get_choice_txn_does_open_close_before_fileop(void) {
-    return get_bool_choice('H');
+    return get_bool_choice('I');
 }
 
 static int
 get_choice_lock_table_split_fcreate(void) {
-    int choice = get_bool_choice('I');
+    int choice = get_bool_choice('J');
     if (choice)
         assert(fileop_did_commit());
     return choice;
@@ -157,62 +159,64 @@ do_args(int argc, char * const argv[]) {
     }
 
     char c;
-    while ((c = getopt(argc, argv, "vqhcrO:A:B:C:D:E:F:G:H:I:X:")) != -1) {
-	switch(c) {
-        case 'v':
-	    verbose++;
-            break;
-        case 'q':
-            verbose--;
-	    if (verbose<0) verbose=0;
-            break;
-        case 'h':
-        case '?':
-            usage();
-            break;
-        case 'c':
-            do_crash = 1;
-            break;
-        case 'r':
-            do_recover = 1;
-            break;
-        case 'O':
-            if (fileop != '\0')
+    while ((c = getopt(argc, argv, "vqhcrO:A:B:C:D:E:F:G:H:I:J:X:")) != -1) {
+        switch (c) {
+            case 'v':
+                verbose++;
+                break;
+            case 'q':
+                verbose--;
+                if (verbose < 0)
+                    verbose = 0;
+                break;
+            case 'h':
+            case '?':
                 usage();
-            fileop = optarg[0];
-            switch (fileop) {
-                case 'c':
-                case 'r':
-                case 'd':
-                    break;
-                default:
+                break;
+            case 'c':
+                do_crash = 1;
+                break;
+            case 'r':
+                do_recover = 1;
+                break;
+            case 'O':
+                if (fileop != '\0')
                     usage();
-                    break;
-            }
-            break;
-        case 'A':
-        case 'B':
-        case 'C':
-        case 'D':
-        case 'E':
-        case 'F':
-        case 'G':
-        case 'H':
-        case 'I':
-            if (fileop == '\0')
-                usage();
-            int num;
-            num = atoi(optarg);
-            if (num < 0 || num > 9)
-                usage();
-            choices[c - 'A'] = num;
-            break;
-        case 'X':
-            if (strcmp(optarg, "novalgrind") == 0) {
-                // provide a way for the shell script runner to pass an
-                // arg that suppresses valgrind on this child process
+                fileop = optarg[0];
+                switch (fileop) {
+                    case 'c':
+                    case 'r':
+                    case 'd':
+                        break;
+                    default:
+                        usage();
+                        break;
+                }
+                break;
+            case 'A':
+            case 'B':
+            case 'C':
+            case 'D':
+            case 'E':
+            case 'F':
+            case 'G':
+            case 'H':
+            case 'I':
+            case 'J':
+                if (fileop == '\0')
+                    usage();
+                int num;
+                num = atoi(optarg);
+                if (num < 0 || num > 9)
+                    usage();
+                choices[c - 'A'] = num;
                 break;
-            }
+            case 'X':
+                if (strcmp(optarg, "novalgrind") == 0) {
+                    // provide a way for the shell script runner to pass an
+                    // arg that suppresses valgrind on this child process
+                    break;
+                }
             // otherwise, fall through to an error
 	default:
             usage();
@@ -222,7 +226,7 @@ do_args(int argc, char * const argv[]) {
     if (argc!=optind) { usage(); exit(1); }
 
     for (i = 0; i < num_choices; i++) {
-        if (i >= 'G' - 'A' && fileop == 'c')
+        if (i >= 'H' - 'A' && fileop == 'c')
             break;
         if (choices[i] == -1)
             usage();
@@ -261,6 +265,8 @@ static void env_startup(void) {
     int envflags = DB_INIT_LOCK | DB_INIT_LOG | DB_INIT_MPOOL | DB_INIT_TXN | DB_CREATE | DB_PRIVATE | recover_flag;
     r = db_env_create(&env, 0);
     CKERR(r);
+    // set_dir_per_db() hands back the previous setting, not an error code
+    (void)env->set_dir_per_db(env, get_choice_dir_per_db());
     env->set_errfile(env, stderr);
     r = env->open(env, TOKU_TEST_FILENAME, envflags, S_IRWXU+S_IRWXG+S_IRWXO);
     CKERR(r);
@@ -625,8 +631,11 @@ recover_and_verify(void) {
         else if (did_create_commit_early())
             expect_old_name = 1;
     }
-    verify_file_exists(oldname, expect_old_name);
-    verify_file_exists(newname, expect_new_name);
+    // File existence can only be verified if the log was flushed before the crash
+    if ((get_choice_flush_log_before_crash())) {
+        verify_file_exists(oldname, expect_old_name);
+        verify_file_exists(newname, expect_new_name);
+    }
     env_shutdown();
 }
 
diff --git a/storage/tokudb/PerconaFT/src/tests/stat64-root-changes.cc b/storage/tokudb/PerconaFT/src/tests/stat64-root-changes.cc
index a2b48e443cd..48843a0bd32 100644
--- a/storage/tokudb/PerconaFT/src/tests/stat64-root-changes.cc
+++ b/storage/tokudb/PerconaFT/src/tests/stat64-root-changes.cc
@@ -166,7 +166,7 @@ run_test (void) {
 
         DB_BTREE_STAT64 s;
         r = db->stat64(db, NULL, &s); CKERR(r);
-        assert(s.bt_nkeys == 0);
+        assert(s.bt_nkeys == 1);
 
         r = db->close(db, 0);     CKERR(r);
 
@@ -176,7 +176,7 @@ run_test (void) {
         r = txn->commit(txn, 0);    CKERR(r);
 
         r = db->stat64(db, NULL, &s); CKERR(r);
-        assert(s.bt_nkeys == 0);
+        assert(s.bt_nkeys == 1);
     }
 
     // verify update callback overwrites the row
diff --git a/storage/tokudb/PerconaFT/src/tests/test_insert_many_gc.cc b/storage/tokudb/PerconaFT/src/tests/test_insert_many_gc.cc
index 8e5109cd2a9..f6111d4b67c 100644
--- a/storage/tokudb/PerconaFT/src/tests/test_insert_many_gc.cc
+++ b/storage/tokudb/PerconaFT/src/tests/test_insert_many_gc.cc
@@ -78,7 +78,7 @@ static void test_insert_many_gc(void) {
     // from having an MVCC stack of size 'N'. At the time of this
     // writing, we run full GC on leaf-inject when the leaf is
     // 32mb or larger. A good invariant is that the max LE size
-    // never grew larger than 35mb and that the max commited xr stack
+    // never grew larger than 35mb and that the max committed xr stack
     // length never exceeded 35
     const uint64_t le_max_memsize = get_engine_status_val(env, "LE_MAX_MEMSIZE");
     const uint64_t le_max_committed_xr = get_engine_status_val(env, "LE_MAX_COMMITTED_XR");
diff --git a/storage/tokudb/PerconaFT/src/tests/test_iterate_live_transactions.cc b/storage/tokudb/PerconaFT/src/tests/test_iterate_live_transactions.cc
index c5561cdf90f..23c79620cd8 100644
--- a/storage/tokudb/PerconaFT/src/tests/test_iterate_live_transactions.cc
+++ b/storage/tokudb/PerconaFT/src/tests/test_iterate_live_transactions.cc
@@ -55,7 +55,8 @@ static int iterate_callback(DB_TXN *txn,
                             iterate_row_locks_callback iterate_locks,
                             void *locks_extra, void *extra) {
     uint64_t txnid = txn->id64(txn);
-    uint64_t client_id = txn->get_client_id(txn);
+    uint64_t client_id; void *client_extra;
+    txn->get_client_id(txn, &client_id, &client_extra);
     iterate_extra *info = reinterpret_cast<iterate_extra *>(extra);
     DB *db;
     DBT left_key, right_key;
@@ -93,13 +94,13 @@ int test_main(int UU(argc), char *const UU(argv[])) {
     r = env->open(env, TOKU_TEST_FILENAME, env_flags, 0755); CKERR(r);
 
     r = env->txn_begin(env, NULL, &txn1, 0); CKERR(r);
-    txn1->set_client_id(txn1, 0);
+    txn1->set_client_id(txn1, 0, NULL);
     txnid1 = txn1->id64(txn1);
     r = env->txn_begin(env, NULL, &txn2, 0); CKERR(r);
-    txn2->set_client_id(txn2, 1);
+    txn2->set_client_id(txn2, 1, NULL);
     txnid2 = txn2->id64(txn2);
     r = env->txn_begin(env, NULL, &txn3, 0); CKERR(r);
-    txn3->set_client_id(txn3, 2);
+    txn3->set_client_id(txn3, 2, NULL);
     txnid3 = txn3->id64(txn3);
 
     {
diff --git a/storage/tokudb/PerconaFT/src/tests/test_stress0.cc b/storage/tokudb/PerconaFT/src/tests/test_stress0.cc
index aaafe284906..037ffdd312d 100644
--- a/storage/tokudb/PerconaFT/src/tests/test_stress0.cc
+++ b/storage/tokudb/PerconaFT/src/tests/test_stress0.cc
@@ -53,7 +53,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 // This test is a micro stress test that does multithreaded updates on a fixed size table.
 // There is also a thread that scans the table with bulk fetch, ensuring the sum is zero.
 //
-// This test is targetted at stressing the locktree, hence the small table and many update threads.
+// This test is targeted at stressing the locktree, hence the small table and many update threads.
 //
 
 static int UU() lock_escalation_op(DB_TXN *UU(txn), ARG arg, void* operation_extra, void *UU(stats_extra)) {
@@ -93,7 +93,8 @@ static int iterate_txns(DB_TXN *txn,
                         iterate_row_locks_callback iterate_locks,
                         void *locks_extra, void *extra) {
     uint64_t txnid = txn->id64(txn);
-    uint64_t client_id = txn->get_client_id(txn);
+    uint64_t client_id; void *client_extra;
+    txn->get_client_id(txn, &client_id, &client_extra);
     invariant_null(extra);
     invariant(txnid > 0);
     invariant(client_id == 0);
diff --git a/storage/tokudb/PerconaFT/src/tests/test_txn_abort5a.cc b/storage/tokudb/PerconaFT/src/tests/test_txn_abort5a.cc
index fec454b8009..301eed1560e 100644
--- a/storage/tokudb/PerconaFT/src/tests/test_txn_abort5a.cc
+++ b/storage/tokudb/PerconaFT/src/tests/test_txn_abort5a.cc
@@ -123,7 +123,8 @@ test_main(int argc, char *const argv[]) {
             continue;
         }
     }
-    if (verbose>0) printf("%s", __FILE__); if (verbose>1) printf("\n");
+    if (verbose>0) printf("%s", __FILE__);
+    if (verbose>1) printf("\n");
     for (i=1; i<100; i++) 
         test_txn_abort(i);
     if (verbose>1) printf("%s OK\n", __FILE__);
diff --git a/storage/tokudb/PerconaFT/src/ydb-internal.h b/storage/tokudb/PerconaFT/src/ydb-internal.h
index 462a2a3d861..a1eb43a67c5 100644
--- a/storage/tokudb/PerconaFT/src/ydb-internal.h
+++ b/storage/tokudb/PerconaFT/src/ydb-internal.h
@@ -105,6 +105,7 @@ struct __toku_db_env_internal {
     TOKULOGGER logger;
     toku::locktree_manager ltm;
     lock_timeout_callback lock_wait_timeout_callback;   // Called when a lock request times out waiting for a lock.
+    lock_wait_callback lock_wait_needed_callback;       // Called when a lock request requires a wait.
 
     DB *directory;                                      // Maps dnames to inames
     DB *persistent_environment;                         // Stores environment settings, can be used for upgrade
@@ -114,7 +115,7 @@ struct __toku_db_env_internal {
 
     char *real_data_dir;                                // data dir used when the env is opened (relative to cwd, or absolute with leading /)
     char *real_log_dir;                                 // log dir used when the env is opened  (relative to cwd, or absolute with leading /)
-    char *real_tmp_dir;                                 // tmp dir used for temporary files (relative to cwd, or absoulte with leading /)
+    char *real_tmp_dir;                                 // tmp dir used for temporary files (relative to cwd, or absolute with leading /)
 
     fs_redzone_state fs_state;
     uint64_t fs_seq;                                    // how many times has fs_poller run?
@@ -132,7 +133,8 @@ struct __toku_db_env_internal {
     int datadir_lockfd;
     int logdir_lockfd;
     int tmpdir_lockfd;
-    bool check_thp;                                     // if set check if transparent huge pages are disables
+    bool check_thp;  // if set check if transparent huge pages are disabled
+    bool dir_per_db;  // if set, env_dbrename also renames the underlying ft file
     uint64_t (*get_loader_memory_size_callback)(void);
     uint64_t default_lock_timeout_msec;
     uint64_t (*get_lock_timeout_callback)(uint64_t default_lock_timeout_msec);
diff --git a/storage/tokudb/PerconaFT/src/ydb.cc b/storage/tokudb/PerconaFT/src/ydb.cc
index bde479a9ed6..45385ef9120 100644
--- a/storage/tokudb/PerconaFT/src/ydb.cc
+++ b/storage/tokudb/PerconaFT/src/ydb.cc
@@ -1299,6 +1299,22 @@ env_get_check_thp(DB_ENV * env) {
     return env->i->check_thp;
 }
 
+static bool env_set_dir_per_db(DB_ENV *env, bool new_val) {
+    HANDLE_PANICKED_ENV(env);
+    bool r = env->i->dir_per_db;  // remember the setting being replaced
+    env->i->dir_per_db = new_val;
+    return r;  // the previous setting, not a status code
+}
+
+static bool env_get_dir_per_db(DB_ENV *env) {
+    HANDLE_PANICKED_ENV(env);
+    return env->i->dir_per_db;
+}
+
+static const char *env_get_data_dir(DB_ENV *env) {
+    return env->i->real_data_dir;
+}
+
 static int env_dbremove(DB_ENV * env, DB_TXN *txn, const char *fname, const char *dbname, uint32_t flags);
 
 static int
@@ -1789,6 +1805,12 @@ env_set_lock_timeout_callback(DB_ENV *env, lock_timeout_callback callback) {
     return 0;
 }
 
+static int
+env_set_lock_wait_callback(DB_ENV *env, lock_wait_callback callback) {
+    env->i->lock_wait_needed_callback = callback;  // stored on the env; no validation
+    return 0;  // always reports success
+}
+
 static void
 format_time(const time_t *timer, char *buf) {
     ctime_r(timer, buf);
@@ -2605,6 +2627,10 @@ static void env_set_killed_callback(DB_ENV *env, uint64_t default_killed_time_ms
     env->i->killed_callback = killed_callback;
 }
 
+static void env_kill_waiter(DB_ENV *env, void *extra) {
+    env->i->ltm.kill_waiter(extra);  // forwarded unchanged to the env's locktree manager
+}
+
 static void env_do_backtrace(DB_ENV *env) {
     if (env->i->errcall) {
         db_env_do_backtrace_errfunc((toku_env_err_func) toku_env_err, (const void *) env);
@@ -2685,6 +2711,7 @@ toku_env_create(DB_ENV ** envp, uint32_t flags) {
     USENV(get_lock_timeout);
     USENV(set_lock_timeout);
     USENV(set_lock_timeout_callback);
+    USENV(set_lock_wait_callback);
     USENV(set_redzone);
     USENV(log_flush);
     USENV(log_archive);
@@ -2701,6 +2728,10 @@ toku_env_create(DB_ENV ** envp, uint32_t flags) {
     USENV(do_backtrace);
     USENV(set_check_thp);
     USENV(get_check_thp);
+    USENV(set_dir_per_db);
+    USENV(get_dir_per_db);
+    USENV(get_data_dir);
+    USENV(kill_waiter);
 #undef USENV
     
     // unlocked methods
@@ -3046,7 +3077,7 @@ env_dbrename(DB_ENV *env, DB_TXN *txn, const char *fname, const char *dbname, co
     if (env_is_db_with_dname_open(env, newname)) {
         return toku_ydb_do_error(env, EINVAL, "Cannot rename dictionary; Dictionary with target name has an open handle.\n");
     }
-    
+
     DBT old_dname_dbt;  
     DBT new_dname_dbt;  
     DBT iname_dbt;  
@@ -3066,10 +3097,35 @@ env_dbrename(DB_ENV *env, DB_TXN *txn, const char *fname, const char *dbname, co
             r = EEXIST;
         }
         else if (r == DB_NOTFOUND) {
+            DBT new_iname_dbt;
+            // Do not rename ft file if 'dir_per_db' option is not set
+            auto new_iname =
+                env->get_dir_per_db(env)
+                    ? generate_iname_for_rename_or_open(
+                          env, txn, newname, false)
+                    : std::unique_ptr<char[], decltype(&toku_free)>(
+                          toku_strdup(iname), &toku_free);
+            toku_fill_dbt(
+                &new_iname_dbt, new_iname.get(), strlen(new_iname.get()) + 1);
+
             // remove old (dname,iname) and insert (newname,iname) in directory
             r = toku_db_del(env->i->directory, txn, &old_dname_dbt, DB_DELETE_ANY, true);
             if (r != 0) { goto exit; }
-            r = toku_db_put(env->i->directory, txn, &new_dname_dbt, &iname_dbt, 0, true);
+
+            // Rename the ft file itself only when the 'dir_per_db' option is set
+            if (env->get_dir_per_db(env))
+                r = toku_ft_rename_iname(txn,
+                                         env->get_data_dir(env),
+                                         iname,
+                                         new_iname.get(),
+                                         env->i->cachetable);
+            if (r != 0) { goto exit; }  // a failed rename must not be masked by the put
+            r = toku_db_put(env->i->directory,
+                            txn,
+                            &new_dname_dbt,
+                            &new_iname_dbt,
+                            0,
+                            true);
             if (r != 0) { goto exit; }
 
             //Now that we have writelocks on both dnames, verify that there are still no handles open. (to prevent race conditions)
@@ -3092,7 +3148,7 @@ env_dbrename(DB_ENV *env, DB_TXN *txn, const char *fname, const char *dbname, co
             // otherwise, we're okay in marking this ft as remove on
             // commit. no new handles can open for this dictionary
             // because the txn has directory write locks on the dname
-            if (txn && !can_acquire_table_lock(env, txn, iname)) {
+            if (txn && !can_acquire_table_lock(env, txn, new_iname.get())) {
                 r = DB_LOCK_NOTGRANTED;
             }
             // We don't do anything at the ft or cachetable layer for rename.
diff --git a/storage/tokudb/PerconaFT/src/ydb_db.cc b/storage/tokudb/PerconaFT/src/ydb_db.cc
index 10e44e18b39..29d21ad0452 100644
--- a/storage/tokudb/PerconaFT/src/ydb_db.cc
+++ b/storage/tokudb/PerconaFT/src/ydb_db.cc
@@ -84,8 +84,7 @@ ydb_db_layer_get_status(YDB_DB_LAYER_STATUS statp) {
     *statp = ydb_db_layer_status;
 }
 
-static void
-create_iname_hint(const char *dname, char *hint) {
+void create_iname_hint(const char *dname, char *hint) {
     //Requires: size of hint array must be > strlen(dname)
     //Copy alphanumeric characters only.
     //Replace strings of non-alphanumeric characters with a single underscore.
@@ -106,11 +105,43 @@ create_iname_hint(const char *dname, char *hint) {
     *hint = '\0';
 }
 
+// Like create_iname_hint(), but keeps the database directory: after an
+// optional leading "./", alphanumerics and the first '/' (the separator
+// between database dir and table name) are copied and every other run of
+// characters becomes a single '_'.  Writes at most strlen(dname) + 1 bytes.
+void create_iname_hint_for_dbdir(const char *dname, char *hint) {
+    assert(dname);
+    if (*dname == '.')
+        ++dname;
+    if (*dname == '/')
+        ++dname;
+    bool underscored = false;
+    bool dbdir_is_parsed = false;
+    for (; *dname; ++dname) {
+        // isalnum() is undefined for negative char values; use unsigned char.
+        const unsigned char c = static_cast<unsigned char>(*dname);
+        if (isalnum(c) || (c == '/' && !dbdir_is_parsed)) {
+            *hint++ = *dname;
+            if (c == '/')
+                dbdir_is_parsed = true;
+            underscored = false;
+        } else if (!underscored) {
+            *hint++ = '_';
+            underscored = true;
+        }
+    }
+    *hint = '\0';
+}
+
 // n < 0  means to ignore mark and ignore n
 // n >= 0 means to include mark ("_B_" or "_P_") with hex value of n in iname
 // (intended for use by loader, which will create many inames using one txnid).
-static char *
-create_iname(DB_ENV *env, uint64_t id1, uint64_t id2, char *hint, const char *mark, int n) {
+char *create_iname(DB_ENV *env,
+                   uint64_t id1,
+                   uint64_t id2,
+                   char *hint,
+                   const char *mark,
+                   int n) {
     int bytes;
     char inamebase[strlen(hint) +
                    8 +  // hex file format version
@@ -139,6 +170,34 @@ create_iname(DB_ENV *env, uint64_t id1, uint64_t id2, char *hint, const char *ma
     return rval;
 }
 
+static uint64_t nontransactional_open_id = 0;
+
+// Builds a fresh iname for `dname`; the result is released with toku_free.
+// Embedded ids: the txn's parent/child txnids when `txn` is set; without a
+// txn an open takes the next nontransactional_open_id, else both ids stay 0.
+std::unique_ptr<char[], decltype(&toku_free)> generate_iname_for_rename_or_open(
+    DB_ENV *env,
+    DB_TXN *txn,
+    const char *dname,
+    bool is_open) {
+    char hint_buf[strlen(dname) + 1];
+    uint64_t first_id = 0;
+    uint64_t second_id = 0;
+    if (txn != nullptr) {
+        first_id = toku_txn_get_txnid(db_txn_struct_i(txn)->tokutxn).parent_id64;
+        second_id = toku_txn_get_txnid(db_txn_struct_i(txn)->tokutxn).child_id64;
+    } else if (is_open) {
+        first_id = toku_sync_fetch_and_add(&nontransactional_open_id, 1);
+    }
+    // With dir_per_db, a non-absolute dname keeps its database directory.
+    const bool keep_dbdir =
+        env->get_dir_per_db(env) && !toku_os_is_absolute_name(dname);
+    auto make_hint = keep_dbdir ? &create_iname_hint_for_dbdir : &create_iname_hint;
+    make_hint(dname, hint_buf);
+    return std::unique_ptr<char[], decltype(&toku_free)>(
+        create_iname(env, first_id, second_id, hint_buf, NULL, -1), &toku_free);
+}
+
 static int toku_db_open(DB * db, DB_TXN * txn, const char *fname, const char *dbname, DBTYPE dbtype, uint32_t flags, int mode);
 
 // Effect: Do the work required of DB->close().
@@ -228,8 +287,6 @@ db_open_subdb(DB * db, DB_TXN * txn, const char *fname, const char *dbname, DBTY
     return r;
 }
 
-static uint64_t nontransactional_open_id = 0;
-
 // inames are created here.
 // algorithm:
 //  begin txn
@@ -287,27 +344,15 @@ toku_db_open(DB * db, DB_TXN * txn, const char *fname, const char *dbname, DBTYP
     toku_fill_dbt(&dname_dbt, dname, strlen(dname)+1);
     toku_init_dbt_flags(&iname_dbt, DB_DBT_REALLOC);
     r = toku_db_get(db->dbenv->i->directory, txn, &dname_dbt, &iname_dbt, DB_SERIALIZABLE);  // allocates memory for iname
-    char *iname = (char *) iname_dbt.data;
+    std::unique_ptr<char[], decltype(&toku_free)> iname(
+        static_cast<char *>(iname_dbt.data), &toku_free);
     if (r == DB_NOTFOUND && !is_db_create) {
         r = ENOENT;
     } else if (r==0 && is_db_excl) {
         r = EEXIST;
     } else if (r == DB_NOTFOUND) {
-        char hint[strlen(dname) + 1];
-
-        // create iname and make entry in directory
-        uint64_t id1 = 0;
-        uint64_t id2 = 0;
-
-        if (txn) {
-            id1 = toku_txn_get_txnid(db_txn_struct_i(txn)->tokutxn).parent_id64;
-            id2 = toku_txn_get_txnid(db_txn_struct_i(txn)->tokutxn).child_id64;
-        } else {
-            id1 = toku_sync_fetch_and_add(&nontransactional_open_id, 1);
-        }
-        create_iname_hint(dname, hint);
-        iname = create_iname(db->dbenv, id1, id2, hint, NULL, -1);  // allocated memory for iname
-        toku_fill_dbt(&iname_dbt, iname, strlen(iname) + 1);
+        iname = generate_iname_for_rename_or_open(db->dbenv, txn, dname, true);
+        toku_fill_dbt(&iname_dbt, iname.get(), strlen(iname.get()) + 1);
         //
         // put_flags will be 0 for performance only, avoid unnecessary query
         // if we are creating a hot index, per #3166, we do not want the write lock  in directory grabbed.
@@ -319,16 +364,13 @@ toku_db_open(DB * db, DB_TXN * txn, const char *fname, const char *dbname, DBTYP
 
     // we now have an iname
     if (r == 0) {
-        r = toku_db_open_iname(db, txn, iname, flags, mode);
+        r = toku_db_open_iname(db, txn, iname.get(), flags, mode);
         if (r == 0) {
             db->i->dname = toku_xstrdup(dname);
             env_note_db_opened(db->dbenv, db);  // tell env that a new db handle is open (using dname)
         }
     }
 
-    if (iname) {
-        toku_free(iname);
-    }
     return r;
 }
 
@@ -1182,7 +1224,10 @@ load_inames(DB_ENV * env, DB_TXN * txn, int N, DB * dbs[/*N*/], const char * new
         toku_fill_dbt(&dname_dbt, dname, strlen(dname)+1);
         // now create new iname
         char hint[strlen(dname) + 1];
-        create_iname_hint(dname, hint);
+        if (env->get_dir_per_db(env) && !toku_os_is_absolute_name(dname))
+            create_iname_hint_for_dbdir(dname, hint);
+        else
+            create_iname_hint(dname, hint);
         const char *new_iname = create_iname(env, xid.parent_id64, xid.child_id64, hint, mark, i);               // allocates memory for iname_in_env
         new_inames_in_env[i] = new_iname;
         toku_fill_dbt(&iname_dbt, new_iname, strlen(new_iname) + 1);      // iname_in_env goes in directory
diff --git a/storage/tokudb/PerconaFT/src/ydb_db.h b/storage/tokudb/PerconaFT/src/ydb_db.h
index 8b92dd1c3cb..8be28857c14 100644
--- a/storage/tokudb/PerconaFT/src/ydb_db.h
+++ b/storage/tokudb/PerconaFT/src/ydb_db.h
@@ -43,6 +43,8 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 #include "ydb-internal.h"
 #include "ydb_txn.h"
 
+#include <memory>
+
 typedef enum {
     YDB_LAYER_DIRECTORY_WRITE_LOCKS = 0,        /* total directory write locks taken */
     YDB_LAYER_DIRECTORY_WRITE_LOCKS_FAIL,   /* total directory write locks unable to be taken */
@@ -119,3 +121,17 @@ toku_db_destruct_autotxn(DB_TXN *txn, int r, bool changed) {
     }
     return r; 
 }
+
+// iname helpers defined in ydb_db.cc.
+// Hint builders: the `hint` array must be larger than strlen(dname).
+void create_iname_hint_for_dbdir(const char *dname, char *hint);
+void create_iname_hint(const char *dname, char *hint);
+// n < 0 ignores `mark` and `n`; n >= 0 puts `mark` and hex n in the iname.
+char *create_iname(DB_ENV *env, uint64_t id1, uint64_t id2, char *hint,
+                   const char *mark, int n);
+// Returns a newly created iname for `dname`, released with toku_free.
+std::unique_ptr<char[], decltype(&toku_free)> generate_iname_for_rename_or_open(
+    DB_ENV *env,
+    DB_TXN *txn,
+    const char *dname,
+    bool is_open);
diff --git a/storage/tokudb/PerconaFT/src/ydb_row_lock.cc b/storage/tokudb/PerconaFT/src/ydb_row_lock.cc
index 913e1a44faf..597e6311eb8 100644
--- a/storage/tokudb/PerconaFT/src/ydb_row_lock.cc
+++ b/storage/tokudb/PerconaFT/src/ydb_row_lock.cc
@@ -193,7 +193,10 @@ int toku_db_start_range_lock(DB *db, DB_TXN *txn, const DBT *left_key, const DBT
         toku::lock_request::type lock_type, toku::lock_request *request) {
     DB_TXN *txn_anc = txn_oldest_ancester(txn);
     TXNID txn_anc_id = txn_anc->id64(txn_anc);
-    request->set(db->i->lt, txn_anc_id, left_key, right_key, lock_type, toku_is_big_txn(txn_anc));
+    uint64_t client_id;
+    void *client_extra;
+    txn->get_client_id(txn, &client_id, &client_extra);
+    request->set(db->i->lt, txn_anc_id, left_key, right_key, lock_type, toku_is_big_txn(txn_anc), client_extra);
 
     const int r = request->start();
     if (r == 0) {
@@ -221,7 +224,8 @@ int toku_db_wait_range_lock(DB *db, DB_TXN *txn, toku::lock_request *request) {
     uint64_t killed_time_msec = env->i->default_killed_time_msec;
     if (env->i->get_killed_time_callback)
         killed_time_msec = env->i->get_killed_time_callback(killed_time_msec);
-    const int r = request->wait(wait_time_msec, killed_time_msec, env->i->killed_callback);
+    const int r = request->wait(wait_time_msec, killed_time_msec, env->i->killed_callback,
+                                env->i->lock_wait_needed_callback);
     if (r == 0) {
         db_txn_note_row_lock(db, txn_anc, left_key, right_key);
     } else if (r == DB_LOCK_NOTGRANTED) {
@@ -248,7 +252,10 @@ void toku_db_grab_write_lock (DB *db, DBT *key, TOKUTXN tokutxn) {
     // This lock request must succeed, so we do not want to wait
     toku::lock_request request;
     request.create();
-    request.set(db->i->lt, txn_anc_id, key, key, toku::lock_request::type::WRITE, toku_is_big_txn(txn_anc));
+    uint64_t client_id;
+    void *client_extra;
+    txn->get_client_id(txn, &client_id, &client_extra);
+    request.set(db->i->lt, txn_anc_id, key, key, toku::lock_request::type::WRITE, toku_is_big_txn(txn_anc), client_extra);
     int r = request.start();
     invariant_zero(r);
     db_txn_note_row_lock(db, txn_anc, key, key);
@@ -268,7 +275,7 @@ void toku_db_release_lt_key_ranges(DB_TXN *txn, txn_lt_key_ranges *ranges) {
 
     // all of our locks have been released, so first try to wake up
     // pending lock requests, then release our reference on the lt
-    toku::lock_request::retry_all_lock_requests(lt);
+    toku::lock_request::retry_all_lock_requests(lt, txn->mgrp->i->lock_wait_needed_callback);
 
     // Release our reference on this locktree
     toku::locktree_manager *ltm = &txn->mgrp->i->ltm;
diff --git a/storage/tokudb/PerconaFT/src/ydb_txn.cc b/storage/tokudb/PerconaFT/src/ydb_txn.cc
index ae1f93011d1..40b479055f2 100644
--- a/storage/tokudb/PerconaFT/src/ydb_txn.cc
+++ b/storage/tokudb/PerconaFT/src/ydb_txn.cc
@@ -323,12 +323,12 @@ int locked_txn_abort(DB_TXN *txn) {
     return r;
 }
 
-static void locked_txn_set_client_id(DB_TXN *txn, uint64_t client_id) {
-    toku_txn_set_client_id(db_txn_struct_i(txn)->tokutxn, client_id);
+static void locked_txn_set_client_id(DB_TXN *txn, uint64_t client_id, void *client_extra) {
+    toku_txn_set_client_id(db_txn_struct_i(txn)->tokutxn, client_id, client_extra);
 }
 
-static uint64_t locked_txn_get_client_id(DB_TXN *txn) {
-    return toku_txn_get_client_id(db_txn_struct_i(txn)->tokutxn);
+static void locked_txn_get_client_id(DB_TXN *txn, uint64_t *client_id, void **client_extra) {
+    toku_txn_get_client_id(db_txn_struct_i(txn)->tokutxn, client_id, client_extra);
 }
 
 static int toku_txn_discard(DB_TXN *txn, uint32_t flags) {
diff --git a/storage/tokudb/PerconaFT/third_party/xz-4.999.9beta/build-aux/config.guess b/storage/tokudb/PerconaFT/third_party/xz-4.999.9beta/build-aux/config.guess
index da833146088..7501b1bee01 100644
--- a/storage/tokudb/PerconaFT/third_party/xz-4.999.9beta/build-aux/config.guess
+++ b/storage/tokudb/PerconaFT/third_party/xz-4.999.9beta/build-aux/config.guess
@@ -1,10 +1,10 @@
 #! /bin/sh
 # Attempt to guess a canonical system name.
 #   Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
-#   2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
-#   Free Software Foundation, Inc.
+#   2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
+#   2011, 2012 Free Software Foundation, Inc.
 
-timestamp='2009-04-27'
+timestamp='2016-06-22'
 
 # This file is free software; you can redistribute it and/or modify it
 # under the terms of the GNU General Public License as published by
@@ -17,9 +17,7 @@ timestamp='2009-04-27'
 # General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
-# along with this program; if not, write to the Free Software
-# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
-# 02110-1301, USA.
+# along with this program; if not, see <http://www.gnu.org/licenses/>.
 #
 # As a special exception to the GNU General Public License, if you
 # distribute this file as part of a program that contains a
@@ -27,16 +25,16 @@ timestamp='2009-04-27'
 # the same distribution terms that you use for the rest of that program.
 
 
-# Originally written by Per Bothner <per@bothner.com>.
-# Please send patches to <config-patches@gnu.org>.  Submit a context
-# diff and a properly formatted ChangeLog entry.
+# Originally written by Per Bothner.  Please send patches (context
+# diff format) to <config-patches@gnu.org> and include a ChangeLog
+# entry.
 #
 # This script attempts to guess a canonical system name similar to
 # config.sub.  If it succeeds, it prints the system name on stdout, and
 # exits with 0.  Otherwise, it exits with 1.
 #
-# The plan is that this can be called by configure scripts if you
-# don't specify an explicit build system type.
+# You can get the latest version of this script from:
+# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD
 
 me=`echo "$0" | sed -e 's,.*/,,'`
 
@@ -56,8 +54,9 @@ version="\
 GNU config.guess ($timestamp)
 
 Originally written by Per Bothner.
-Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001,
-2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc.
+Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000,
+2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012
+Free Software Foundation, Inc.
 
 This is free software; see the source for copying conditions.  There is NO
 warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE."
@@ -144,7 +143,7 @@ UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown
 case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
     *:NetBSD:*:*)
 	# NetBSD (nbsd) targets should (where applicable) match one or
-	# more of the tupples: *-*-netbsdelf*, *-*-netbsdaout*,
+	# more of the tuples: *-*-netbsdelf*, *-*-netbsdaout*,
 	# *-*-netbsdecoff* and *-*-netbsd*.  For targets that recently
 	# switched to ELF, *-*-netbsd* would select the old
 	# object file format.  This provides both forward
@@ -170,7 +169,7 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
 	    arm*|i386|m68k|ns32k|sh3*|sparc|vax)
 		eval $set_cc_for_build
 		if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \
-			| grep __ELF__ >/dev/null
+			| grep -q __ELF__
 		then
 		    # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout).
 		    # Return netbsd for either.  FIX?
@@ -180,7 +179,7 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
 		fi
 		;;
 	    *)
-	        os=netbsd
+		os=netbsd
 		;;
 	esac
 	# The OS release
@@ -223,7 +222,7 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
 		UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'`
 		;;
 	*5.*)
-	        UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'`
+		UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'`
 		;;
 	esac
 	# According to Compaq, /usr/sbin/psrinfo has been available on
@@ -269,7 +268,10 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
 	# A Xn.n version is an unreleased experimental baselevel.
 	# 1.2 uses "1.2" for uname -r.
 	echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
-	exit ;;
+	# Reset EXIT trap before exiting to avoid spurious non-zero exit code.
+	exitcode=$?
+	trap '' 0
+	exit $exitcode ;;
     Alpha\ *:Windows_NT*:*)
 	# How do we know it's Interix rather than the generic POSIX subsystem?
 	# Should we change UNAME_MACHINE based on the output of uname instead
@@ -295,7 +297,7 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
 	echo s390-ibm-zvmoe
 	exit ;;
     *:OS400:*:*)
-        echo powerpc-ibm-os400
+	echo powerpc-ibm-os400
 	exit ;;
     arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*)
 	echo arm-acorn-riscix${UNAME_RELEASE}
@@ -333,6 +335,9 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
     sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*)
 	echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'`
 	exit ;;
+    i86pc:AuroraUX:5.*:* | i86xen:AuroraUX:5.*:*)
+	echo i386-pc-auroraux${UNAME_RELEASE}
+	exit ;;
     i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*)
 	eval $set_cc_for_build
 	SUN_ARCH="i386"
@@ -391,23 +396,23 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in
     # MiNT.  But MiNT is downward compatible to TOS, so this should
     # be no problem.
     atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*)
-        echo m68k-atari-mint${UNAME_RELEASE}
+	echo m68k-atari-mint${UNAME_RELEASE}
 	exit ;;
     atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*)
 	echo m68k-atari-mint${UNAME_RELEASE}
-        exit ;;
+	exit ;;
     *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*)
-        echo m68k-atari-mint${UNAME_RELEASE}
+	echo m68k-atari-mint${UNAME_RELEASE}
 	exit ;;
     milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*)
-        echo m68k-milan-mint${UNAME_RELEASE}
-        exit ;;
+	echo m68k-milan-mint${UNAME_RELEASE}
+	exit ;;
     hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*)
-        echo m68k-hades-mint${UNAME_RELEASE}
-        exit ;;
+	echo m68k-hades-mint${UNAME_RELEASE}
+	exit ;;
     *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*)
-        echo m68k-unknown-mint${UNAME_RELEASE}
-        exit ;;
+	echo m68k-unknown-mint${UNAME_RELEASE}
+	exit ;;
     m68k:machten:*:*)
 	echo m68k-apple-machten${UNAME_RELEASE}
 	exit ;;
@@ -477,8 +482,8 @@ EOF
 	echo m88k-motorola-sysv3
 	exit ;;
     AViiON:dgux:*:*)
-        # DG/UX returns AViiON for all architectures
-        UNAME_PROCESSOR=`/usr/bin/uname -p`
+	# DG/UX returns AViiON for all architectures
+	UNAME_PROCESSOR=`/usr/bin/uname -p`
 	if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ]
 	then
 	    if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \
@@ -491,7 +496,7 @@ EOF
 	else
 	    echo i586-dg-dgux${UNAME_RELEASE}
 	fi
- 	exit ;;
+	exit ;;
     M88*:DolphinOS:*:*)	# DolphinOS (SVR3)
 	echo m88k-dolphin-sysv3
 	exit ;;
@@ -548,7 +553,7 @@ EOF
 		echo rs6000-ibm-aix3.2
 	fi
 	exit ;;
-    *:AIX:*:[456])
+    *:AIX:*:[4567])
 	IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'`
 	if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then
 		IBM_ARCH=rs6000
@@ -591,52 +596,52 @@ EOF
 	    9000/[678][0-9][0-9])
 		if [ -x /usr/bin/getconf ]; then
 		    sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null`
-                    sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null`
-                    case "${sc_cpu_version}" in
-                      523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0
-                      528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1
-                      532)                      # CPU_PA_RISC2_0
-                        case "${sc_kernel_bits}" in
-                          32) HP_ARCH="hppa2.0n" ;;
-                          64) HP_ARCH="hppa2.0w" ;;
+		    sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null`
+		    case "${sc_cpu_version}" in
+		      523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0
+		      528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1
+		      532)                      # CPU_PA_RISC2_0
+			case "${sc_kernel_bits}" in
+			  32) HP_ARCH="hppa2.0n" ;;
+			  64) HP_ARCH="hppa2.0w" ;;
 			  '') HP_ARCH="hppa2.0" ;;   # HP-UX 10.20
-                        esac ;;
-                    esac
+			esac ;;
+		    esac
 		fi
 		if [ "${HP_ARCH}" = "" ]; then
 		    eval $set_cc_for_build
-		    sed 's/^              //' << EOF >$dummy.c
+		    sed 's/^		//' << EOF >$dummy.c
 
-              #define _HPUX_SOURCE
-              #include <stdlib.h>
-              #include <unistd.h>
+		#define _HPUX_SOURCE
+		#include <stdlib.h>
+		#include <unistd.h>
 
-              int main ()
-              {
-              #if defined(_SC_KERNEL_BITS)
-                  long bits = sysconf(_SC_KERNEL_BITS);
-              #endif
-                  long cpu  = sysconf (_SC_CPU_VERSION);
+		int main ()
+		{
+		#if defined(_SC_KERNEL_BITS)
+		    long bits = sysconf(_SC_KERNEL_BITS);
+		#endif
+		    long cpu  = sysconf (_SC_CPU_VERSION);
 
-                  switch (cpu)
-              	{
-              	case CPU_PA_RISC1_0: puts ("hppa1.0"); break;
-              	case CPU_PA_RISC1_1: puts ("hppa1.1"); break;
-              	case CPU_PA_RISC2_0:
-              #if defined(_SC_KERNEL_BITS)
-              	    switch (bits)
-              		{
-              		case 64: puts ("hppa2.0w"); break;
-              		case 32: puts ("hppa2.0n"); break;
-              		default: puts ("hppa2.0"); break;
-              		} break;
-              #else  /* !defined(_SC_KERNEL_BITS) */
-              	    puts ("hppa2.0"); break;
-              #endif
-              	default: puts ("hppa1.0"); break;
-              	}
-                  exit (0);
-              }
+		    switch (cpu)
+			{
+			case CPU_PA_RISC1_0: puts ("hppa1.0"); break;
+			case CPU_PA_RISC1_1: puts ("hppa1.1"); break;
+			case CPU_PA_RISC2_0:
+		#if defined(_SC_KERNEL_BITS)
+			    switch (bits)
+				{
+				case 64: puts ("hppa2.0w"); break;
+				case 32: puts ("hppa2.0n"); break;
+				default: puts ("hppa2.0"); break;
+				} break;
+		#else  /* !defined(_SC_KERNEL_BITS) */
+			    puts ("hppa2.0"); break;
+		#endif
+			default: puts ("hppa1.0"); break;
+			}
+		    exit (0);
+		}
 EOF
 		    (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy`
 		    test -z "$HP_ARCH" && HP_ARCH=hppa
@@ -656,7 +661,7 @@ EOF
 	    # => hppa64-hp-hpux11.23
 
 	    if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) |
-		grep __LP64__ >/dev/null
+		grep -q __LP64__
 	    then
 		HP_ARCH="hppa2.0w"
 	    else
@@ -727,22 +732,22 @@ EOF
 	exit ;;
     C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*)
 	echo c1-convex-bsd
-        exit ;;
+	exit ;;
     C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*)
 	if getsysinfo -f scalar_acc
 	then echo c32-convex-bsd
 	else echo c2-convex-bsd
 	fi
-        exit ;;
+	exit ;;
     C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*)
 	echo c34-convex-bsd
-        exit ;;
+	exit ;;
     C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*)
 	echo c38-convex-bsd
-        exit ;;
+	exit ;;
     C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*)
 	echo c4-convex-bsd
-        exit ;;
+	exit ;;
     CRAY*Y-MP:*:*:*)
 	echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/'
 	exit ;;
@@ -766,14 +771,14 @@ EOF
 	exit ;;
     F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*)
 	FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'`
-        FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
-        FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'`
-        echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
-        exit ;;
+	FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
+	FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'`
+	echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
+	exit ;;
     5000:UNIX_System_V:4.*:*)
-        FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
-        FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'`
-        echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
+	FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'`
+	FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'`
+	echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}"
 	exit ;;
     i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*)
 	echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE}
@@ -785,13 +790,12 @@ EOF
 	echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE}
 	exit ;;
     *:FreeBSD:*:*)
-	case ${UNAME_MACHINE} in
-	    pc98)
-		echo i386-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
+	UNAME_PROCESSOR=`/usr/bin/uname -p`
+	case ${UNAME_PROCESSOR} in
 	    amd64)
 		echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
 	    *)
-		echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
+		echo ${UNAME_PROCESSOR}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;;
 	esac
 	exit ;;
     i*:CYGWIN*:*)
@@ -800,19 +804,22 @@ EOF
     *:MINGW*:*)
 	echo ${UNAME_MACHINE}-pc-mingw32
 	exit ;;
+    i*:MSYS*:*)
+	echo ${UNAME_MACHINE}-pc-msys
+	exit ;;
     i*:windows32*:*)
-    	# uname -m includes "-pc" on this system.
-    	echo ${UNAME_MACHINE}-mingw32
+	# uname -m includes "-pc" on this system.
+	echo ${UNAME_MACHINE}-mingw32
 	exit ;;
     i*:PW*:*)
 	echo ${UNAME_MACHINE}-pc-pw32
 	exit ;;
-    *:Interix*:[3456]*)
-    	case ${UNAME_MACHINE} in
+    *:Interix*:*)
+	case ${UNAME_MACHINE} in
 	    x86)
 		echo i586-pc-interix${UNAME_RELEASE}
 		exit ;;
-	    EM64T | authenticamd | genuineintel)
+	    authenticamd | genuineintel | EM64T)
 		echo x86_64-unknown-interix${UNAME_RELEASE}
 		exit ;;
 	    IA64)
@@ -822,6 +829,9 @@ EOF
     [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*)
 	echo i${UNAME_MACHINE}-pc-mks
 	exit ;;
+    8664:Windows_NT:*)
+	echo x86_64-pc-mks
+	exit ;;
     i*:Windows_NT*:* | Pentium*:Windows_NT*:*)
 	# How do we know it's Interix rather than the generic POSIX subsystem?
 	# It also conflicts with pre-2.0 versions of AT&T UWIN. Should we
@@ -851,6 +861,27 @@ EOF
     i*86:Minix:*:*)
 	echo ${UNAME_MACHINE}-pc-minix
 	exit ;;
+    aarch64:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    aarch64_be:Linux:*:*)
+	UNAME_MACHINE=aarch64_be
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    alpha:Linux:*:*)
+	case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in
+	  EV5)   UNAME_MACHINE=alphaev5 ;;
+	  EV56)  UNAME_MACHINE=alphaev56 ;;
+	  PCA56) UNAME_MACHINE=alphapca56 ;;
+	  PCA57) UNAME_MACHINE=alphapca56 ;;
+	  EV6)   UNAME_MACHINE=alphaev6 ;;
+	  EV67)  UNAME_MACHINE=alphaev67 ;;
+	  EV68*) UNAME_MACHINE=alphaev68 ;;
+	esac
+	objdump --private-headers /bin/sh | grep -q ld.so.1
+	if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi
+	echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC}
+	exit ;;
     arm*:Linux:*:*)
 	eval $set_cc_for_build
 	if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \
@@ -858,20 +889,40 @@ EOF
 	then
 	    echo ${UNAME_MACHINE}-unknown-linux-gnu
 	else
-	    echo ${UNAME_MACHINE}-unknown-linux-gnueabi
+	    if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \
+		| grep -q __ARM_PCS_VFP
+	    then
+		echo ${UNAME_MACHINE}-unknown-linux-gnueabi
+	    else
+		echo ${UNAME_MACHINE}-unknown-linux-gnueabihf
+	    fi
 	fi
 	exit ;;
     avr32*:Linux:*:*)
 	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
     cris:Linux:*:*)
-	echo cris-axis-linux-gnu
+	echo ${UNAME_MACHINE}-axis-linux-gnu
 	exit ;;
     crisv32:Linux:*:*)
-	echo crisv32-axis-linux-gnu
+	echo ${UNAME_MACHINE}-axis-linux-gnu
 	exit ;;
     frv:Linux:*:*)
-    	echo frv-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    hexagon:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
+    i*86:Linux:*:*)
+	LIBC=gnu
+	eval $set_cc_for_build
+	sed 's/^	//' << EOF >$dummy.c
+	#ifdef __dietlibc__
+	LIBC=dietlibc
+	#endif
+EOF
+	eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC'`
+	echo "${UNAME_MACHINE}-pc-linux-${LIBC}"
 	exit ;;
     ia64:Linux:*:*)
 	echo ${UNAME_MACHINE}-unknown-linux-gnu
@@ -882,78 +933,34 @@ EOF
     m68*:Linux:*:*)
 	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
-    mips:Linux:*:*)
+    mips:Linux:*:* | mips64:Linux:*:*)
 	eval $set_cc_for_build
 	sed 's/^	//' << EOF >$dummy.c
 	#undef CPU
-	#undef mips
-	#undef mipsel
+	#undef ${UNAME_MACHINE}
+	#undef ${UNAME_MACHINE}el
 	#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
-	CPU=mipsel
+	CPU=${UNAME_MACHINE}el
 	#else
 	#if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
-	CPU=mips
+	CPU=${UNAME_MACHINE}
 	#else
 	CPU=
 	#endif
 	#endif
 EOF
-	eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n '
-	    /^CPU/{
-		s: ::g
-		p
-	    }'`"
-	test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; }
-	;;
-    mips64:Linux:*:*)
-	eval $set_cc_for_build
-	sed 's/^	//' << EOF >$dummy.c
-	#undef CPU
-	#undef mips64
-	#undef mips64el
-	#if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL)
-	CPU=mips64el
-	#else
-	#if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB)
-	CPU=mips64
-	#else
-	CPU=
-	#endif
-	#endif
-EOF
-	eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n '
-	    /^CPU/{
-		s: ::g
-		p
-	    }'`"
+	eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'`
 	test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; }
 	;;
     or32:Linux:*:*)
-	echo or32-unknown-linux-gnu
-	exit ;;
-    ppc:Linux:*:*)
-	echo powerpc-unknown-linux-gnu
-	exit ;;
-    ppc64:Linux:*:*)
-	echo powerpc64-unknown-linux-gnu
-	exit ;;
-    alpha:Linux:*:*)
-	case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in
-	  EV5)   UNAME_MACHINE=alphaev5 ;;
-	  EV56)  UNAME_MACHINE=alphaev56 ;;
-	  PCA56) UNAME_MACHINE=alphapca56 ;;
-	  PCA57) UNAME_MACHINE=alphapca56 ;;
-	  EV6)   UNAME_MACHINE=alphaev6 ;;
-	  EV67)  UNAME_MACHINE=alphaev67 ;;
-	  EV68*) UNAME_MACHINE=alphaev68 ;;
-        esac
-	objdump --private-headers /bin/sh | grep ld.so.1 >/dev/null
-	if test "$?" = 0 ; then LIBC="libc1" ; else LIBC="" ; fi
-	echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC}
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
     padre:Linux:*:*)
 	echo sparc-unknown-linux-gnu
 	exit ;;
+    parisc64:Linux:*:* | hppa64:Linux:*:*)
+	echo hppa64-unknown-linux-gnu
+	exit ;;
     parisc:Linux:*:* | hppa:Linux:*:*)
 	# Look for CPU level
 	case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in
@@ -962,14 +969,17 @@ EOF
 	  *)    echo hppa-unknown-linux-gnu ;;
 	esac
 	exit ;;
-    parisc64:Linux:*:* | hppa64:Linux:*:*)
-	echo hppa64-unknown-linux-gnu
+    ppc64:Linux:*:*)
+	echo powerpc64-unknown-linux-gnu
+	exit ;;
+    ppc:Linux:*:*)
+	echo powerpc-unknown-linux-gnu
 	exit ;;
     s390:Linux:*:* | s390x:Linux:*:*)
 	echo ${UNAME_MACHINE}-ibm-linux
 	exit ;;
     sh64*:Linux:*:*)
-    	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
     sh*:Linux:*:*)
 	echo ${UNAME_MACHINE}-unknown-linux-gnu
@@ -977,75 +987,18 @@ EOF
     sparc:Linux:*:* | sparc64:Linux:*:*)
 	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
+    tile*:Linux:*:*)
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	exit ;;
     vax:Linux:*:*)
 	echo ${UNAME_MACHINE}-dec-linux-gnu
 	exit ;;
     x86_64:Linux:*:*)
-	echo x86_64-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
     xtensa*:Linux:*:*)
-    	echo ${UNAME_MACHINE}-unknown-linux-gnu
+	echo ${UNAME_MACHINE}-unknown-linux-gnu
 	exit ;;
-    i*86:Linux:*:*)
-	# The BFD linker knows what the default object file format is, so
-	# first see if it will tell us. cd to the root directory to prevent
-	# problems with other programs or directories called `ld' in the path.
-	# Set LC_ALL=C to ensure ld outputs messages in English.
-	ld_supported_targets=`cd /; LC_ALL=C ld --help 2>&1 \
-			 | sed -ne '/supported targets:/!d
-				    s/[ 	][ 	]*/ /g
-				    s/.*supported targets: *//
-				    s/ .*//
-				    p'`
-        case "$ld_supported_targets" in
-	  elf32-i386)
-		TENTATIVE="${UNAME_MACHINE}-pc-linux-gnu"
-		;;
-	  a.out-i386-linux)
-		echo "${UNAME_MACHINE}-pc-linux-gnuaout"
-		exit ;;
-	  "")
-		# Either a pre-BFD a.out linker (linux-gnuoldld) or
-		# one that does not give us useful --help.
-		echo "${UNAME_MACHINE}-pc-linux-gnuoldld"
-		exit ;;
-	esac
-	# Determine whether the default compiler is a.out or elf
-	eval $set_cc_for_build
-	sed 's/^	//' << EOF >$dummy.c
-	#include <features.h>
-	#ifdef __ELF__
-	# ifdef __GLIBC__
-	#  if __GLIBC__ >= 2
-	LIBC=gnu
-	#  else
-	LIBC=gnulibc1
-	#  endif
-	# else
-	LIBC=gnulibc1
-	# endif
-	#else
-	#if defined(__INTEL_COMPILER) || defined(__PGI) || defined(__SUNPRO_C) || defined(__SUNPRO_CC)
-	LIBC=gnu
-	#else
-	LIBC=gnuaout
-	#endif
-	#endif
-	#ifdef __dietlibc__
-	LIBC=dietlibc
-	#endif
-EOF
-	eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n '
-	    /^LIBC/{
-		s: ::g
-		p
-	    }'`"
-	test x"${LIBC}" != x && {
-		echo "${UNAME_MACHINE}-pc-linux-${LIBC}"
-		exit
-	}
-	test x"${TENTATIVE}" != x && { echo "${TENTATIVE}"; exit; }
-	;;
     i*86:DYNIX/ptx:4*:*)
 	# ptx 4.0 does uname -s correctly, with DYNIX/ptx in there.
 	# earlier versions are messed up and put the nodename in both
@@ -1053,11 +1006,11 @@ EOF
 	echo i386-sequent-sysv4
 	exit ;;
     i*86:UNIX_SV:4.2MP:2.*)
-        # Unixware is an offshoot of SVR4, but it has its own version
-        # number series starting with 2...
-        # I am not positive that other SVR4 systems won't match this,
+	# Unixware is an offshoot of SVR4, but it has its own version
+	# number series starting with 2...
+	# I am not positive that other SVR4 systems won't match this,
 	# I just have to hope.  -- rms.
-        # Use sysv4.2uw... so that sysv4* matches it.
+	# Use sysv4.2uw... so that sysv4* matches it.
 	echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION}
 	exit ;;
     i*86:OS/2:*:*)
@@ -1074,7 +1027,7 @@ EOF
     i*86:syllable:*:*)
 	echo ${UNAME_MACHINE}-pc-syllable
 	exit ;;
-    i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.0*:*)
+    i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.[02]*:*)
 	echo i386-unknown-lynxos${UNAME_RELEASE}
 	exit ;;
     i*86:*DOS:*:*)
@@ -1089,7 +1042,7 @@ EOF
 	fi
 	exit ;;
     i*86:*:5:[678]*)
-    	# UnixWare 7.x, OpenUNIX and OpenServer 6.
+	# UnixWare 7.x, OpenUNIX and OpenServer 6.
 	case `/bin/uname -X | grep "^Machine"` in
 	    *486*)	     UNAME_MACHINE=i486 ;;
 	    *Pentium)	     UNAME_MACHINE=i586 ;;
@@ -1117,13 +1070,13 @@ EOF
 	exit ;;
     pc:*:*:*)
 	# Left here for compatibility:
-        # uname -m prints for DJGPP always 'pc', but it prints nothing about
-        # the processor, so we play safe by assuming i586.
+	# uname -m prints for DJGPP always 'pc', but it prints nothing about
+	# the processor, so we play safe by assuming i586.
 	# Note: whatever this is, it MUST be the same as what config.sub
 	# prints for the "djgpp" host, or else GDB configury will decide that
 	# this is a cross-build.
 	echo i586-pc-msdosdjgpp
-        exit ;;
+	exit ;;
     Intel:Mach:3*:*)
 	echo i386-pc-mach3
 	exit ;;
@@ -1158,8 +1111,8 @@ EOF
 	/bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \
 	  && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;;
     3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*)
-        /bin/uname -p 2>/dev/null | grep 86 >/dev/null \
-          && { echo i486-ncr-sysv4; exit; } ;;
+	/bin/uname -p 2>/dev/null | grep 86 >/dev/null \
+	  && { echo i486-ncr-sysv4; exit; } ;;
     NCR*:*:4.2:* | MPRAS*:*:4.2:*)
 	OS_REL='.3'
 	test -r /etc/.relid \
@@ -1182,7 +1135,7 @@ EOF
     rs6000:LynxOS:2.*:*)
 	echo rs6000-unknown-lynxos${UNAME_RELEASE}
 	exit ;;
-    PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.0*:*)
+    PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.[02]*:*)
 	echo powerpc-unknown-lynxos${UNAME_RELEASE}
 	exit ;;
     SM[BE]S:UNIX_SV:*:*)
@@ -1202,10 +1155,10 @@ EOF
 		echo ns32k-sni-sysv
 	fi
 	exit ;;
-    PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort
-                      # says <Richard.M.Bartel@ccMail.Census.GOV>
-        echo i586-unisys-sysv4
-        exit ;;
+    PENTIUM:*:4.0*:*)	# Unisys `ClearPath HMP IX 4000' SVR4/MP effort
+			# says <Richard.M.Bartel@ccMail.Census.GOV>
+	echo i586-unisys-sysv4
+	exit ;;
     *:UNIX_System_V:4*:FTX*)
 	# From Gerald Hewes <hewes@openmarket.com>.
 	# How about differentiating between stratus architectures? -djm
@@ -1231,11 +1184,11 @@ EOF
 	exit ;;
     R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*)
 	if [ -d /usr/nec ]; then
-	        echo mips-nec-sysv${UNAME_RELEASE}
+		echo mips-nec-sysv${UNAME_RELEASE}
 	else
-	        echo mips-unknown-sysv${UNAME_RELEASE}
+		echo mips-unknown-sysv${UNAME_RELEASE}
 	fi
-        exit ;;
+	exit ;;
     BeBox:BeOS:*:*)	# BeOS running on hardware made by Be, PPC only.
 	echo powerpc-be-beos
 	exit ;;
@@ -1275,6 +1228,16 @@ EOF
     *:Darwin:*:*)
 	UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown
 	case $UNAME_PROCESSOR in
+	    i386)
+		eval $set_cc_for_build
+		if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then
+		  if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \
+		      (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \
+		      grep IS_64BIT_ARCH >/dev/null
+		  then
+		      UNAME_PROCESSOR="x86_64"
+		  fi
+		fi ;;
 	    unknown) UNAME_PROCESSOR=powerpc ;;
 	esac
 	echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE}
@@ -1290,6 +1253,9 @@ EOF
     *:QNX:*:4*)
 	echo i386-pc-qnx
 	exit ;;
+    NEO-?:NONSTOP_KERNEL:*:*)
+	echo neo-tandem-nsk${UNAME_RELEASE}
+	exit ;;
     NSE-?:NONSTOP_KERNEL:*:*)
 	echo nse-tandem-nsk${UNAME_RELEASE}
 	exit ;;
@@ -1335,13 +1301,13 @@ EOF
 	echo pdp10-unknown-its
 	exit ;;
     SEI:*:*:SEIUX)
-        echo mips-sei-seiux${UNAME_RELEASE}
+	echo mips-sei-seiux${UNAME_RELEASE}
 	exit ;;
     *:DragonFly:*:*)
 	echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'`
 	exit ;;
     *:*VMS:*:*)
-    	UNAME_MACHINE=`(uname -p) 2>/dev/null`
+	UNAME_MACHINE=`(uname -p) 2>/dev/null`
 	case "${UNAME_MACHINE}" in
 	    A*) echo alpha-dec-vms ; exit ;;
 	    I*) echo ia64-dec-vms ; exit ;;
@@ -1359,6 +1325,9 @@ EOF
     i*86:AROS:*:*)
 	echo ${UNAME_MACHINE}-pc-aros
 	exit ;;
+    x86_64:VMkernel:*:*)
+	echo ${UNAME_MACHINE}-unknown-esx
+	exit ;;
 esac
 
 #echo '(No uname command or uname output not recognized.)' 1>&2
@@ -1381,11 +1350,11 @@ main ()
 #include <sys/param.h>
   printf ("m68k-sony-newsos%s\n",
 #ifdef NEWSOS4
-          "4"
+	"4"
 #else
-	  ""
+	""
 #endif
-         ); exit (0);
+	); exit (0);
 #endif
 #endif
 
diff --git a/storage/tokudb/PerconaFT/tools/CMakeLists.txt b/storage/tokudb/PerconaFT/tools/CMakeLists.txt
index af82b4357d2..f11b9f350d7 100644
--- a/storage/tokudb/PerconaFT/tools/CMakeLists.txt
+++ b/storage/tokudb/PerconaFT/tools/CMakeLists.txt
@@ -1,6 +1,6 @@
 set_property(DIRECTORY APPEND PROPERTY COMPILE_DEFINITIONS _GNU_SOURCE DONT_DEPRECATE_ERRNO)
 
-set(tools tokudb_dump tokuftdump tokuft_logprint tdb-recover ftverify ba_replay)
+set(tools tokudb_dump tokuftdump tokuft_logprint tdb-recover ftverify)
 foreach(tool ${tools})
   add_executable(${tool} ${tool}.cc)
   add_dependencies(${tool} install_tdb_h)
@@ -14,4 +14,3 @@ target_link_libraries(ftverify m)
 
 install(TARGETS tokuftdump      DESTINATION ${INSTALL_BINDIR} COMPONENT Server)
 install(TARGETS tokuft_logprint DESTINATION ${INSTALL_BINDIR} COMPONENT Server)
-
diff --git a/storage/tokudb/PerconaFT/tools/ba_replay.cc b/storage/tokudb/PerconaFT/tools/ba_replay.cc
deleted file mode 100644
index cade7e5dfaf..00000000000
--- a/storage/tokudb/PerconaFT/tools/ba_replay.cc
+++ /dev/null
@@ -1,629 +0,0 @@
-/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */
-// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4:
-#ident "$Id$"
-/*======
-This file is part of PerconaFT.
-
-
-Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
-
-    PerconaFT is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License, version 2,
-    as published by the Free Software Foundation.
-
-    PerconaFT is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
-
-    You should have received a copy of the GNU General Public License
-    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
-
-----------------------------------------
-
-    PerconaFT is free software: you can redistribute it and/or modify
-    it under the terms of the GNU Affero General Public License, version 3,
-    as published by the Free Software Foundation.
-
-    PerconaFT is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU Affero General Public License for more details.
-
-    You should have received a copy of the GNU Affero General Public License
-    along with PerconaFT.  If not, see <http://www.gnu.org/licenses/>.
-======= */
-
-#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved."
-
-// Replay a block allocator trace against different strategies and compare
-// the results
-
-#include <db.h>
-
-#include <getopt.h>
-#include <math.h>
-#include <stdio.h>
-#include <string.h>
-
-#include <map>
-#include <set>
-#include <string>
-#include <sstream>
-#include <vector>
-
-#include <portability/memory.h>
-#include <portability/toku_assert.h>
-#include <portability/toku_stdlib.h>
-
-#include "ft/serialize/block_allocator.h"
-
-using std::map;
-using std::set;
-using std::string;
-using std::vector;
-
-static int verbose = false;
-
-static void ba_replay_assert(bool pred, const char *msg, const char *line, int line_num) {
-    if (!pred) {
-        fprintf(stderr, "%s, line (#%d): %s\n", msg, line_num, line);
-        abort();
-    }
-}
-
-static char *trim_whitespace(char *line) {
-    // skip leading whitespace
-    while (isspace(*line)) {
-        line++;
-    }
-    return line;
-}
-
-static int64_t parse_number(char **ptr, int line_num, int base) {
-    *ptr = trim_whitespace(*ptr);
-    char *line = *ptr;
-
-    char *new_ptr;
-    int64_t n = strtoll(line, &new_ptr, base);
-    ba_replay_assert(n >= 0, "malformed trace (bad numeric token)", line, line_num);
-    ba_replay_assert(new_ptr > *ptr, "malformed trace (missing numeric token)", line, line_num);
-    *ptr = new_ptr;
-    return n;
-}
-
-static uint64_t parse_uint64(char **ptr, int line_num) {
-    int64_t n = parse_number(ptr, line_num, 10);
-    // we happen to know that the uint64's we deal with will
-    // take less than 63 bits (they come from pointers)
-    return static_cast<uint64_t>(n);
-}
-
-static string parse_token(char **ptr, int line_num) {
-    *ptr = trim_whitespace(*ptr);
-    char *line = *ptr;
-
-    // parse the first token, which represents the traced function
-    char token[64];
-    int r = sscanf(*ptr, "%64s", token);
-    ba_replay_assert(r == 1, "malformed trace (missing string token)", line, line_num);
-    *ptr += strlen(token);
-    return string(token);
-}
-
-static block_allocator::blockpair parse_blockpair(char **ptr, int line_num) {
-    *ptr = trim_whitespace(*ptr);
-    char *line = *ptr;
-
-    uint64_t offset, size;
-    int bytes_read;
-    int r = sscanf(line, "[%" PRIu64 " %" PRIu64 "]%n", &offset, &size, &bytes_read);
-    ba_replay_assert(r == 2, "malformed trace (bad offset/size pair)", line, line_num);
-    *ptr += bytes_read;
-    return block_allocator::blockpair(offset, size);
-}
-
-static char *strip_newline(char *line, bool *found) {
-    char *ptr = strchr(line, '\n');
-    if (ptr != nullptr) {
-        if (found != nullptr) {
-            *found = true;
-        }
-        *ptr = '\0';
-    }
-    return line;
-}
-
-static char *read_trace_line(FILE *file) {
-    const int buf_size = 4096;
-    char buf[buf_size];
-    std::stringstream ss;
-    while (true) {
-        if (fgets(buf, buf_size, file) == nullptr) {
-            break;
-        }
-        bool has_newline = false;
-        ss << strip_newline(buf, &has_newline);
-        if (has_newline) {
-            // end of the line, we're done out
-            break;
-        }
-    }
-    std::string s = ss.str();
-    return s.size() ? toku_strdup(s.c_str()) : nullptr;
-}
-
-static vector<string> canonicalize_trace_from(FILE *file) {
-    // new trace, canonicalized from a raw trace
-    vector<string> canonicalized_trace;
-
-    // raw allocator id -> canonical allocator id
-    //
-    // keeps track of allocators that were created as part of the trace,
-    // and therefore will be part of the canonicalized trace.
-    uint64_t allocator_id_seq_num = 0;
-    map<uint64_t, uint64_t> allocator_ids;
-
-    // allocated offset -> allocation seq num
-    //
-    uint64_t allocation_seq_num = 0;
-    static const uint64_t ASN_NONE = (uint64_t) -1;
-    typedef map<uint64_t, uint64_t> offset_seq_map;
-
-    // raw allocator id -> offset_seq_map that tracks its allocations
-    map<uint64_t, offset_seq_map> offset_to_seq_num_maps;
-
-    int line_num = 0;
-    char *line;
-    while ((line = read_trace_line(file)) != nullptr) {
-        line_num++;
-        char *ptr = line;
-
-        string fn = parse_token(&ptr, line_num);
-        int64_t allocator_id = parse_number(&ptr, line_num, 16);
-
-        std::stringstream ss;
-        if (fn.find("ba_trace_create") != string::npos) {
-            ba_replay_assert(allocator_ids.count(allocator_id) == 0, "corrupted trace: double create", line, line_num);
-            ba_replay_assert(fn == "ba_trace_create" || fn == "ba_trace_create_from_blockpairs",
-                             "corrupted trace: bad fn", line, line_num);
-
-            // we only convert the allocator_id to an allocator_id_seq_num
-            // in the canonical trace and leave the rest of the line as-is.
-            allocator_ids[allocator_id] = allocator_id_seq_num;
-            ss << fn << ' ' << allocator_id_seq_num << ' ' << trim_whitespace(ptr) << std::endl;
-            allocator_id_seq_num++;
-
-            // First, read passed the reserve / alignment values.
-            (void) parse_uint64(&ptr, line_num);
-            (void) parse_uint64(&ptr, line_num);
-            if (fn == "ba_trace_create_from_blockpairs") {
-                // For each blockpair created by this traceline, add its offset to the offset seq map
-                // with asn ASN_NONE so that later canonicalizations of `free' know whether to write
-                // down the asn or the raw offset.
-                offset_seq_map *map = &offset_to_seq_num_maps[allocator_id];
-                while (*trim_whitespace(ptr) != '\0') {
-                    const block_allocator::blockpair bp = parse_blockpair(&ptr, line_num);
-                    (*map)[bp.offset] = ASN_NONE;
-                }
-            }
-        } else {
-            ba_replay_assert(allocator_ids.count(allocator_id) > 0, "corrupted trace: unknown allocator", line, line_num);
-            uint64_t canonical_allocator_id = allocator_ids[allocator_id];
-
-            // this is the map that tracks allocations for this allocator
-            offset_seq_map *map = &offset_to_seq_num_maps[allocator_id];
-
-            if (fn == "ba_trace_alloc") {
-                const uint64_t size = parse_uint64(&ptr, line_num);
-                const uint64_t heat = parse_uint64(&ptr, line_num);
-                const uint64_t offset = parse_uint64(&ptr, line_num);
-                ba_replay_assert(map->count(offset) == 0, "corrupted trace: double alloc", line, line_num);
-
-                // remember that an allocation at `offset' has the current alloc seq num
-                (*map)[offset] = allocation_seq_num;
-
-                // translate `offset = alloc(size)' to `asn = alloc(size)'
-                ss << fn << ' ' << canonical_allocator_id << ' ' << size << ' ' << heat << ' ' << allocation_seq_num << std::endl;
-                allocation_seq_num++;
-            } else if (fn == "ba_trace_free") {
-                const uint64_t offset = parse_uint64(&ptr, line_num);
-                ba_replay_assert(map->count(offset) != 0, "corrupted trace: invalid free", line, line_num);
-
-                // get the alloc seq num for an allcation that occurred at `offset'
-                const uint64_t asn = (*map)[offset];
-                map->erase(offset);
-
-                // if there's an asn, then a corresponding ba_trace_alloc occurred and we should
-                // write `free(asn)'. otherwise, the blockpair was initialized from create_from_blockpairs
-                // and we write the original offset.
-                if (asn != ASN_NONE) {
-                    ss << "ba_trace_free_asn" << ' ' << canonical_allocator_id << ' ' << asn << std::endl;
-                } else {
-                    ss << "ba_trace_free_offset" << ' ' << canonical_allocator_id << ' ' << offset << std::endl;
-                }
-            } else if (fn == "ba_trace_destroy") {
-                // Remove this allocator from both maps
-                allocator_ids.erase(allocator_id);
-                offset_to_seq_num_maps.erase(allocator_id);
-
-                // translate `destroy(ptr_id) to destroy(canonical_id)'
-                ss << fn << ' ' << canonical_allocator_id << ' ' << std::endl;
-            } else {
-                ba_replay_assert(false, "corrupted trace: bad fn", line, line_num);
-            }
-        }
-        canonicalized_trace.push_back(ss.str());
-
-        toku_free(line);
-    }
-
-    if (allocator_ids.size() != 0) {
-        fprintf(stderr, "warning: leaked allocators. this might be ok if the tracing process is still running");
-    }
-
-    return canonicalized_trace;
-}
-
-struct streaming_variance_calculator {
-    int64_t n_samples;
-    int64_t mean;
-    int64_t variance;
-
-    // math credit: AoCP, Donald Knuth, '62
-    void add_sample(int64_t x) {
-        n_samples++;
-        if (n_samples == 1) {
-            mean = x;
-            variance = 0;
-        } else {
-            int64_t old_mean = mean;
-            mean = old_mean + ((x - old_mean) / n_samples);
-            variance = (((n_samples - 1) * variance) +
-                        ((x - old_mean) * (x - mean))) / n_samples;
-        }
-    }
-};
-
-struct canonical_trace_stats {
-    uint64_t n_lines_replayed;
-
-    uint64_t n_create;
-    uint64_t n_create_from_blockpairs;
-    uint64_t n_alloc_hot;
-    uint64_t n_alloc_cold;
-    uint64_t n_free;
-    uint64_t n_destroy;
-
-    struct streaming_variance_calculator alloc_hot_bytes;
-    struct streaming_variance_calculator alloc_cold_bytes;
-
-    canonical_trace_stats() {
-        memset(this, 0, sizeof(*this));
-    }
-};
-
-struct fragmentation_report {
-    TOKU_DB_FRAGMENTATION_S beginning;
-    TOKU_DB_FRAGMENTATION_S end;
-    fragmentation_report() {
-        memset(this, 0, sizeof(*this));
-    }
-    void merge(const struct fragmentation_report &src_report) {
-        for (int i = 0; i < 2; i++) {
-            TOKU_DB_FRAGMENTATION_S *dst = i == 0 ? &beginning : &end;
-            const TOKU_DB_FRAGMENTATION_S *src = i == 0 ? &src_report.beginning : &src_report.end;
-            dst->file_size_bytes += src->file_size_bytes;
-            dst->data_bytes += src->data_bytes;
-            dst->data_blocks += src->data_blocks;
-            dst->checkpoint_bytes_additional += src->checkpoint_bytes_additional;
-            dst->checkpoint_blocks_additional += src->checkpoint_blocks_additional;
-            dst->unused_bytes += src->unused_bytes;
-            dst->unused_blocks += src->unused_blocks;
-            dst->largest_unused_block += src->largest_unused_block;
-        }
-    }
-};
-
-static void replay_canonicalized_trace(const vector<string> &canonicalized_trace,
-                                       block_allocator::allocation_strategy strategy,
-                                       map<uint64_t, struct fragmentation_report> *reports,
-                                       struct canonical_trace_stats *stats) {
-    // maps an allocator id to its block allocator
-    map<uint64_t, block_allocator *> allocator_map;
-
-    // maps allocation seq num to allocated offset
-    map<uint64_t, uint64_t> seq_num_to_offset;
-
-    for (vector<string>::const_iterator it = canonicalized_trace.begin();
-         it != canonicalized_trace.end(); it++) {
-        const int line_num = stats->n_lines_replayed++;
-
-        char *line = toku_strdup(it->c_str());
-        line = strip_newline(line, nullptr);
-
-        char *ptr = trim_whitespace(line);
-
-        // canonical allocator id is in base 10, not 16
-        string fn = parse_token(&ptr, line_num);
-        int64_t allocator_id = parse_number(&ptr, line_num, 10);
-
-        if (fn.find("ba_trace_create") != string::npos) {
-            const uint64_t reserve_at_beginning = parse_uint64(&ptr, line_num);
-            const uint64_t alignment = parse_uint64(&ptr, line_num);
-            ba_replay_assert(allocator_map.count(allocator_id) == 0,
-                             "corrupted canonical trace: double create", line, line_num);
-
-            block_allocator *ba = new block_allocator();
-            if (fn == "ba_trace_create") {
-                ba->create(reserve_at_beginning, alignment);
-                stats->n_create++;
-            } else {
-                ba_replay_assert(fn == "ba_trace_create_from_blockpairs",
-                                 "corrupted canonical trace: bad create fn", line, line_num);
-                vector<block_allocator::blockpair> pairs;
-                while (*trim_whitespace(ptr) != '\0') {
-                    const block_allocator::blockpair bp = parse_blockpair(&ptr, line_num);
-                    pairs.push_back(bp);
-                }
-                ba->create_from_blockpairs(reserve_at_beginning, alignment, &pairs[0], pairs.size());
-                stats->n_create_from_blockpairs++;
-            }
-            ba->set_strategy(strategy);
-
-            TOKU_DB_FRAGMENTATION_S report;
-            ba->get_statistics(&report);
-            (*reports)[allocator_id].beginning = report;
-            allocator_map[allocator_id] = ba;
-        } else {
-            ba_replay_assert(allocator_map.count(allocator_id) > 0,
-                             "corrupted canonical trace: no such allocator", line, line_num);
-
-            block_allocator *ba = allocator_map[allocator_id];
-            if (fn == "ba_trace_alloc") {
-                // replay an `alloc' whose result will be associated with a certain asn
-                const uint64_t size = parse_uint64(&ptr, line_num);
-                const uint64_t heat = parse_uint64(&ptr, line_num);
-                const uint64_t asn = parse_uint64(&ptr, line_num);
-                ba_replay_assert(seq_num_to_offset.count(asn) == 0,
-                                 "corrupted canonical trace: double alloc (asn in use)", line, line_num);
-
-                uint64_t offset;
-                ba->alloc_block(size, heat, &offset);
-                seq_num_to_offset[asn] = offset;
-                heat ? stats->n_alloc_hot++ : stats->n_alloc_cold++;
-                heat ? stats->alloc_hot_bytes.add_sample(size) : stats->alloc_cold_bytes.add_sample(size);
-            } else if (fn == "ba_trace_free_asn") {
-                // replay a `free' on a block whose offset is the result of an alloc with an asn
-                const uint64_t asn = parse_uint64(&ptr, line_num);
-                ba_replay_assert(seq_num_to_offset.count(asn) == 1,
-                                 "corrupted canonical trace: double free (asn unused)", line, line_num);
-
-                const uint64_t offset = seq_num_to_offset[asn];
-                ba->free_block(offset);
-                seq_num_to_offset.erase(asn);
-                stats->n_free++;
-            } else if (fn == "ba_trace_free_offset") {
-                // replay a `free' on a block whose offset was explicitly set during a create_from_blockpairs
-                const uint64_t offset = parse_uint64(&ptr, line_num);
-                ba->free_block(offset);
-                stats->n_free++;
-            } else if (fn == "ba_trace_destroy") {
-                TOKU_DB_FRAGMENTATION_S report;
-                ba->get_statistics(&report);
-                ba->destroy();
-                (*reports)[allocator_id].end = report;
-                allocator_map.erase(allocator_id);
-                stats->n_destroy++;
-            } else {
-                ba_replay_assert(false, "corrupted canonical trace: bad fn", line, line_num);
-            }
-        }
-
-        toku_free(line);
-    }
-}
-
-static const char *strategy_to_cstring(block_allocator::allocation_strategy strategy) {
-    switch (strategy) {
-    case block_allocator::allocation_strategy::BA_STRATEGY_FIRST_FIT:
-        return "first-fit";
-    case block_allocator::allocation_strategy::BA_STRATEGY_BEST_FIT:
-        return "best-fit";
-    case block_allocator::allocation_strategy::BA_STRATEGY_HEAT_ZONE:
-        return "heat-zone";
-    case block_allocator::allocation_strategy::BA_STRATEGY_PADDED_FIT:
-        return "padded-fit";
-    default:
-        abort();
-    }
-}
-
-static block_allocator::allocation_strategy cstring_to_strategy(const char *str) {
-    if (strcmp(str, "first-fit") == 0) {
-        return block_allocator::allocation_strategy::BA_STRATEGY_FIRST_FIT;
-    }
-    if (strcmp(str, "best-fit") == 0) {
-        return block_allocator::allocation_strategy::BA_STRATEGY_BEST_FIT;
-    }
-    if (strcmp(str, "heat-zone") == 0) {
-        return block_allocator::allocation_strategy::BA_STRATEGY_HEAT_ZONE;
-    }
-    if (strcmp(str, "padded-fit") != 0) {
-        fprintf(stderr, "bad strategy string: %s\n", str);
-        abort();
-    }
-    return block_allocator::allocation_strategy::BA_STRATEGY_PADDED_FIT;
-}
-
-static void print_result_verbose(uint64_t allocator_id,
-                                 block_allocator::allocation_strategy strategy,
-                                 const struct fragmentation_report &report) {
-    if (report.end.data_bytes + report.end.unused_bytes +
-        report.beginning.data_bytes + report.beginning.unused_bytes
-        < 32UL * 1024 * 1024) {
-        printf(" ...skipping allocator_id %" PRId64 " (total bytes < 32mb)\n", allocator_id);
-        return;
-    }
-
-    printf(" allocator_id:   %20" PRId64 "\n", allocator_id);
-    printf(" strategy:       %20s\n", strategy_to_cstring(strategy));
-
-    for (int i = 0; i < 2; i++) {
-        const TOKU_DB_FRAGMENTATION_S *r = i == 0 ? &report.beginning : &report.end;
-        printf("%s\n", i == 0 ? "BEFORE" : "AFTER");
-
-        uint64_t total_bytes = r->data_bytes + r->unused_bytes;
-        uint64_t total_blocks = r->data_blocks + r->unused_blocks;
-
-        // byte statistics
-        printf(" total bytes:    %20" PRId64 "\n", total_bytes);
-        printf(" used bytes:     %20" PRId64 " (%.3lf)\n", r->data_bytes,
-               static_cast<double>(r->data_bytes) / total_bytes);
-        printf(" unused bytes:   %20" PRId64 " (%.3lf)\n", r->unused_bytes,
-               static_cast<double>(r->unused_bytes) / total_bytes);
-
-        // block statistics
-        printf(" total blocks:   %20" PRId64 "\n", total_blocks);
-        printf(" used blocks:    %20" PRId64 " (%.3lf)\n", r->data_blocks,
-               static_cast<double>(r->data_blocks) / total_blocks);
-        printf(" unused blocks:  %20" PRId64 " (%.3lf)\n", r->unused_blocks,
-               static_cast<double>(r->unused_blocks) / total_blocks);
-
-        // misc
-        printf(" largest unused: %20" PRId64 "\n", r->largest_unused_block);
-    }
-}
-
-static void print_result(uint64_t allocator_id,
-                         block_allocator::allocation_strategy strategy,
-                         const struct fragmentation_report &report) {
-    const TOKU_DB_FRAGMENTATION_S *beginning = &report.beginning;
-    const TOKU_DB_FRAGMENTATION_S *end = &report.end;
-
-    uint64_t total_beginning_bytes = beginning->data_bytes + beginning->unused_bytes;
-    uint64_t total_end_bytes = end->data_bytes + end->unused_bytes;
-    if (total_end_bytes + total_beginning_bytes < 32UL * 1024 * 1024) {
-        if (verbose) {
-            printf("\n");
-            printf(" ...skipping allocator_id %" PRId64 " (total bytes < 32mb)\n", allocator_id);
-        }
-        return;
-    }
-    printf("\n");
-    if (verbose) {
-        print_result_verbose(allocator_id, strategy, report);
-    } else {
-        printf(" %-15s: allocator %" PRId64 ", %.3lf used bytes (%.3lf before)\n",
-               strategy_to_cstring(strategy), allocator_id,
-               static_cast<double>(report.end.data_bytes) / total_end_bytes,
-               static_cast<double>(report.beginning.data_bytes) / total_beginning_bytes);
-    }
-}
-
-static int only_aggregate_reports;
-
-static struct option getopt_options[] = {
-    { "verbose", no_argument, &verbose, 1 },
-    { "only-aggregate-reports", no_argument, &only_aggregate_reports, 1 },
-    { "include-strategy", required_argument, nullptr, 'i' },
-    { "exclude-strategy", required_argument, nullptr, 'x' },
-    { nullptr, 0, nullptr, 0 },
-};
-
-int main(int argc, char *argv[]) {
-    int opt;
-    set<block_allocator::allocation_strategy> candidate_strategies, excluded_strategies;
-    while ((opt = getopt_long(argc, argv, "", getopt_options, nullptr)) != -1) {
-        switch (opt) {
-        case 0:
-            break;
-        case 'i':
-            candidate_strategies.insert(cstring_to_strategy(optarg));
-            break;
-        case 'x':
-            excluded_strategies.insert(cstring_to_strategy(optarg));
-            break;
-        case '?':
-        default:
-            abort();
-        };
-    }
-    // Default to everything if nothing was explicitly included.
-    if (candidate_strategies.empty()) {
-        candidate_strategies.insert(block_allocator::allocation_strategy::BA_STRATEGY_FIRST_FIT);
-        candidate_strategies.insert(block_allocator::allocation_strategy::BA_STRATEGY_BEST_FIT);
-        candidate_strategies.insert(block_allocator::allocation_strategy::BA_STRATEGY_PADDED_FIT);
-        candidate_strategies.insert(block_allocator::allocation_strategy::BA_STRATEGY_HEAT_ZONE);
-    }
-    // ..but remove anything that was explicitly excluded
-    for (set<block_allocator::allocation_strategy>::const_iterator it = excluded_strategies.begin();
-         it != excluded_strategies.end(); it++) {
-        candidate_strategies.erase(*it);
-    }
-
-    // Run the real trace
-    //
-    // First, read the raw trace from stdin
-    vector<string> canonicalized_trace = canonicalize_trace_from(stdin);
-
-    if (!only_aggregate_reports) {
-        printf("\n");
-        printf("Individual reports, by allocator:\n");
-    }
-
-    struct canonical_trace_stats stats;
-    map<block_allocator::allocation_strategy, struct fragmentation_report> reports_by_strategy; 
-    for (set<block_allocator::allocation_strategy>::const_iterator it = candidate_strategies.begin();
-         it != candidate_strategies.end(); it++) {
-        const block_allocator::allocation_strategy strategy(*it);
-
-        // replay the canonicalized trace against the current strategy.
-        //
-        // we provided the allocator map so we can gather statistics later
-        struct canonical_trace_stats dummy_stats;
-        map<uint64_t, struct fragmentation_report> reports;
-        replay_canonicalized_trace(canonicalized_trace, strategy, &reports,
-                                   // Only need to gather canonical trace stats once
-                                   it == candidate_strategies.begin() ? &stats : &dummy_stats);
-
-        struct fragmentation_report aggregate_report;
-        memset(&aggregate_report, 0, sizeof(aggregate_report));
-        for (map<uint64_t, struct fragmentation_report>::iterator rp = reports.begin();
-             rp != reports.end(); rp++) {
-            const struct fragmentation_report &report = rp->second;
-            aggregate_report.merge(report);
-            if (!only_aggregate_reports) {
-                print_result(rp->first, strategy, report);
-            }
-        }
-        reports_by_strategy[strategy] = aggregate_report;
-    }
-
-    printf("\n");
-    printf("Aggregate reports, by strategy:\n");
-
-    for (map<block_allocator::allocation_strategy, struct fragmentation_report>::iterator it = reports_by_strategy.begin();
-         it != reports_by_strategy.end(); it++) {
-        print_result(0, it->first, it->second);
-    }
-
-    printf("\n");
-    printf("Overall trace stats:\n");
-    printf("\n");
-    printf(" n_lines_played:            %15" PRIu64 "\n", stats.n_lines_replayed);
-    printf(" n_create:                  %15" PRIu64 "\n", stats.n_create);
-    printf(" n_create_from_blockpairs:  %15" PRIu64 "\n", stats.n_create_from_blockpairs);
-    printf(" n_alloc_hot:               %15" PRIu64 "\n", stats.n_alloc_hot);
-    printf(" n_alloc_cold:              %15" PRIu64 "\n", stats.n_alloc_cold);
-    printf(" n_free:                    %15" PRIu64 "\n", stats.n_free);
-    printf(" n_destroy:                 %15" PRIu64 "\n", stats.n_destroy);
-    printf("\n");
-    printf(" avg_alloc_hot:             %15" PRIu64 "\n", stats.alloc_hot_bytes.mean);
-    printf(" stddev_alloc_hot:          %15" PRIu64 "\n", (uint64_t) sqrt(stats.alloc_hot_bytes.variance));
-    printf(" avg_alloc_cold:            %15" PRIu64 "\n", stats.alloc_cold_bytes.mean);
-    printf(" stddev_alloc_cold:         %15" PRIu64 "\n", (uint64_t) sqrt(stats.alloc_cold_bytes.variance));
-    printf("\n");
-
-    return 0;
-}
diff --git a/storage/tokudb/PerconaFT/tools/ftverify.cc b/storage/tokudb/PerconaFT/tools/ftverify.cc
index 5920be8deda..2324249ba00 100644
--- a/storage/tokudb/PerconaFT/tools/ftverify.cc
+++ b/storage/tokudb/PerconaFT/tools/ftverify.cc
@@ -148,7 +148,7 @@ deserialize_headers(int fd, struct ft **h1p, struct ft **h2p)
         }
     }
     {
-        toku_off_t header_1_off = block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE;
+        toku_off_t header_1_off = BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE;
         r1 = deserialize_ft_from_fd_into_rbuf(
             fd,
             header_1_off,
diff --git a/storage/tokudb/PerconaFT/tools/tokuftdump.cc b/storage/tokudb/PerconaFT/tools/tokuftdump.cc
index 23ef72218ac..f6d777b4161 100644
--- a/storage/tokudb/PerconaFT/tools/tokuftdump.cc
+++ b/storage/tokudb/PerconaFT/tools/tokuftdump.cc
@@ -192,6 +192,7 @@ static void dump_header(FT ft) {
     dump_descriptor(&ft->descriptor);
     printf(" estimated numrows=%" PRId64 "\n", ft->in_memory_stats.numrows);
     printf(" estimated numbytes=%" PRId64 "\n", ft->in_memory_stats.numbytes);
+    printf(" logical row count=%" PRId64 "\n", ft->in_memory_logical_rows);
 }
 
 static int64_t getRootNode(FT ft) {
diff --git a/storage/tokudb/PerconaFT/util/tests/x1764-test.cc b/storage/tokudb/PerconaFT/util/tests/x1764-test.cc
index 48ff28e89af..76b1d9c713e 100644
--- a/storage/tokudb/PerconaFT/util/tests/x1764-test.cc
+++ b/storage/tokudb/PerconaFT/util/tests/x1764-test.cc
@@ -110,7 +110,7 @@ test2 (void) {
 
 static void
 test3 (void)
-// Compare the simple version to the highly optimized verison.
+// Compare the simple version to the highly optimized version.
 {
     const int datalen = 1000;
     char data[datalen];
diff --git a/storage/tokudb/ha_tokudb.cc b/storage/tokudb/ha_tokudb.cc
index c10dfc0e76e..827c22ccd7d 100644
--- a/storage/tokudb/ha_tokudb.cc
+++ b/storage/tokudb/ha_tokudb.cc
@@ -382,17 +382,17 @@ void TOKUDB_SHARE::update_row_count(
         pct_of_rows_changed_to_trigger = ((_rows * auto_threshold) / 100);
         if (_row_delta_activity >= pct_of_rows_changed_to_trigger) {
             char msg[200];
-            snprintf(
-                msg,
-                sizeof(msg),
-                "TokuDB: Auto %s background analysis for %s, delta_activity "
-                "%llu is greater than %llu percent of %llu rows.",
-                tokudb::sysvars::analyze_in_background(thd) > 0 ?
-                    "scheduling" : "running",
-                full_table_name(),
-                _row_delta_activity,
-                auto_threshold,
-                (ulonglong)(_rows));
+            snprintf(msg,
+                     sizeof(msg),
+                     "TokuDB: Auto %s analysis for %s, delta_activity %llu is "
+                     "greater than %llu percent of %llu rows.",
+                     tokudb::sysvars::analyze_in_background(thd) > 0
+                         ? "scheduling background"
+                         : "running foreground",
+                     full_table_name(),
+                     _row_delta_activity,
+                     auto_threshold,
+                     (ulonglong)(_rows));
 
             // analyze_standard will unlock _mutex regardless of success/failure
             int ret = analyze_standard(thd, NULL);
@@ -533,7 +533,7 @@ typedef struct index_read_info {
 
 static int ai_poll_fun(void *extra, float progress) {
     LOADER_CONTEXT context = (LOADER_CONTEXT)extra;
-    if (thd_killed(context->thd)) {
+    if (thd_kill_level(context->thd)) {
         sprintf(context->write_status_msg, "The process has been killed, aborting add index.");
         return ER_ABORTING_CONNECTION;
     }
@@ -548,7 +548,7 @@ static int ai_poll_fun(void *extra, float progress) {
 
 static int loader_poll_fun(void *extra, float progress) {
     LOADER_CONTEXT context = (LOADER_CONTEXT)extra;
-    if (thd_killed(context->thd)) {
+    if (thd_kill_level(context->thd)) {
         sprintf(context->write_status_msg, "The process has been killed, aborting bulk load.");
         return ER_ABORTING_CONNECTION;
     }
@@ -3435,7 +3435,7 @@ int ha_tokudb::end_bulk_insert(bool abort) {
     ai_metadata_update_required = false;
     loader_error = 0;
     if (loader) {
-        if (!abort_loader && !thd_killed(thd)) {
+        if (!abort_loader && !thd_kill_level(thd)) {
             DBUG_EXECUTE_IF("tokudb_end_bulk_insert_sleep", {
                 const char *orig_proc_info = tokudb_thd_get_proc_info(thd);
                 thd_proc_info(thd, "DBUG sleep");
@@ -3445,7 +3445,7 @@ int ha_tokudb::end_bulk_insert(bool abort) {
             error = loader->close(loader);
             loader = NULL;
             if (error) { 
-                if (thd_killed(thd)) {
+                if (thd_kill_level(thd)) {
                     my_error(ER_QUERY_INTERRUPTED, MYF(0));
                 }
                 goto cleanup; 
@@ -3580,7 +3580,7 @@ int ha_tokudb::is_index_unique(bool* is_unique, DB_TXN* txn, DB* db, KEY* key_in
                 share->row_count(),
                 key_info->name);
             thd_proc_info(thd, status_msg);
-            if (thd_killed(thd)) {
+            if (thd_kill_level(thd)) {
                 my_error(ER_QUERY_INTERRUPTED, MYF(0));
                 error = ER_QUERY_INTERRUPTED;
                 goto cleanup;
@@ -3696,6 +3696,8 @@ int ha_tokudb::do_uniqueness_checks(uchar* record, DB_TXN* txn, THD* thd) {
     // first do uniqueness checks
     //
     if (share->has_unique_keys && do_unique_checks(thd, in_rpl_write_rows)) {
+        DBUG_EXECUTE_IF("tokudb_crash_if_rpl_does_uniqueness_check",
+                        DBUG_ASSERT(0););
         for (uint keynr = 0; keynr < table_share->keys; keynr++) {
             bool is_unique_key = (table->key_info[keynr].flags & HA_NOSAME) || (keynr == primary_key);
             bool is_unique = false;
@@ -4096,7 +4098,7 @@ int ha_tokudb::write_row(uchar * record) {
             goto cleanup; 
         }
         if (curr_num_DBs == 1) {
-            error = insert_row_to_main_dictionary(record,&prim_key, &row, txn);
+            error = insert_row_to_main_dictionary(record, &prim_key, &row, txn);
             if (error) { goto cleanup; }
         } else {
             error = insert_rows_to_dictionaries_mult(&prim_key, &row, txn, thd);
@@ -5239,7 +5241,7 @@ int ha_tokudb::fill_range_query_buf(
         // otherwise, if we simply see that the current key is no match,
         // we tell the cursor to continue and don't store
         // the key locally
-        if (result == ICP_OUT_OF_RANGE || thd_killed(thd)) {
+        if (result == ICP_OUT_OF_RANGE || thd_kill_level(thd)) {
             icp_went_out_of_range = true;
             error = 0;
             DEBUG_SYNC(ha_thd(), "tokudb_icp_asc_scan_out_of_range");
@@ -5607,7 +5609,7 @@ int ha_tokudb::get_next(
             static_cast<tokudb_trx_data*>(thd_get_ha_data(thd, tokudb_hton));
         trx->stmt_progress.queried++;
         track_progress(thd);
-        if (thd_killed(thd))
+        if (thd_kill_level(thd))
             error = ER_ABORTING_CONNECTION;
     }
 cleanup:
@@ -5901,6 +5903,7 @@ int ha_tokudb::rnd_pos(uchar * buf, uchar * pos) {
     // test rpl slave by inducing a delay before the point query
     THD *thd = ha_thd();
     if (thd->slave_thread && (in_rpl_delete_rows || in_rpl_update_rows)) {
+        DBUG_EXECUTE_IF("tokudb_crash_if_rpl_looks_up_row", DBUG_ASSERT(0););
         uint64_t delay_ms = tokudb::sysvars::rpl_lookup_rows_delay(thd);
         if (delay_ms)
             usleep(delay_ms * 1000);
@@ -6116,7 +6119,7 @@ int ha_tokudb::info(uint flag) {
             // we should always have a primary key
             assert_always(share->file != NULL);
 
-            error = estimate_num_rows(share->file,&num_rows, txn);
+            error = estimate_num_rows(share->file, &num_rows, txn);
             if (error == 0) {
                 share->set_row_count(num_rows, false);
                 stats.records = num_rows;
@@ -8337,7 +8340,7 @@ int ha_tokudb::tokudb_add_index(
                     (long long unsigned)share->row_count());
 #endif
 
-                if (thd_killed(thd)) {
+                if (thd_kill_level(thd)) {
                     error = ER_ABORTING_CONNECTION;
                     goto cleanup;
                 }
diff --git a/storage/tokudb/ha_tokudb_admin.cc b/storage/tokudb/ha_tokudb_admin.cc
index db3d6c112d4..e1443101bb6 100644
--- a/storage/tokudb/ha_tokudb_admin.cc
+++ b/storage/tokudb/ha_tokudb_admin.cc
@@ -7,7 +7,7 @@ This file is part of TokuDB
 
 Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 
-    TokuDBis is free software: you can redistribute it and/or modify
+    TokuDB is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License, version 2,
     as published by the Free Software Foundation.
 
@@ -43,13 +43,11 @@ public:
     virtual ~recount_rows_t();
 
     virtual const char* key();
-
-    virtual void status(
-        char* database,
-        char* table,
-        char* type,
-        char* params,
-        char* status);
+    virtual const char* database();
+    virtual const char* table();
+    virtual const char* type();
+    virtual const char* parameters();
+    virtual const char* status();
 
 protected:
     virtual void on_run();
@@ -64,6 +62,8 @@ private:
     ulonglong       _throttle;
 
     // for recount rows status reporting
+    char            _parameters[256];
+    char            _status[1024];
     int             _result;
     ulonglong       _recount_start; // in microseconds
     ulonglong       _total_elapsed_time; // in microseconds
@@ -78,7 +78,6 @@ private:
         uint64_t deleted,
         void* extra);
     int analyze_recount_rows_progress(uint64_t count, uint64_t deleted);
-    void get_analyze_status(char*);
 };
 
 void* recount_rows_t::operator new(size_t sz) {
@@ -114,10 +113,19 @@ recount_rows_t::recount_rows_t(
     }
 
     _throttle = tokudb::sysvars::analyze_throttle(thd);
+
+    snprintf(_parameters,
+             sizeof(_parameters),
+             "TOKUDB_ANALYZE_THROTTLE=%llu;",
+             _throttle);
+    _status[0] = '\0';
 }
 recount_rows_t::~recount_rows_t() {
 }
 void recount_rows_t::on_run() {
+    const char* orig_proc_info = NULL;
+    if (_thd)
+        orig_proc_info = tokudb_thd_get_proc_info(_thd);
     _recount_start = tokudb::time::microsec();
     _total_elapsed_time = 0;
 
@@ -171,6 +179,8 @@ void recount_rows_t::on_run() {
         _result,
         _share->row_count());
 error:
+    if(_thd)
+        tokudb_thd_set_proc_info(_thd, orig_proc_info);
     return;
 }
 void recount_rows_t::on_destroy() {
@@ -179,18 +189,21 @@ void recount_rows_t::on_destroy() {
 const char* recount_rows_t::key() {
     return _share->full_table_name();
 }
-void recount_rows_t::status(
-    char* database,
-    char* table,
-    char* type,
-    char* params,
-    char* status) {
-
-    strcpy(database, _share->database_name());
-    strcpy(table, _share->table_name());
-    strcpy(type, "TOKUDB_ANALYZE_MODE_RECOUNT_ROWS");
-    sprintf(params, "TOKUDB_ANALYZE_THROTTLE=%llu;", _throttle);
-    get_analyze_status(status);
+const char* recount_rows_t::database() {
+    return _share->database_name();
+}
+const char* recount_rows_t::table() {
+    return _share->table_name();
+}
+const char* recount_rows_t::type() {
+    static const char* type = "TOKUDB_ANALYZE_MODE_RECOUNT_ROWS";
+    return type;
+}
+const char* recount_rows_t::parameters() {
+    return _parameters;
+}
+const char* recount_rows_t::status() {
+    return _status;
 }
 int recount_rows_t::analyze_recount_rows_progress(
     uint64_t count,
@@ -212,17 +225,37 @@ int recount_rows_t::analyze_recount_rows_progress(
         _ticks = 0;
         uint64_t now = tokudb::time::microsec();
         _total_elapsed_time = now - _recount_start;
-        if ((_thd && thd_killed(_thd)) || cancelled()) {
+        if ((_thd && thd_kill_level(_thd)) || cancelled()) {
             // client killed
             return ER_ABORTING_CONNECTION;
         }
 
+        // Rebuild status.
+        // _status is shared: it is passed to tokudb_thd_set_proc_info below
+        // and it is also the source of the status column in
+        // i_s.background_job_status.
+        // Without synchronization, someone querying/building the i_s table
+        // at the exact moment the status is being rebuilt here could read a
+        // partially written (garbage) status.
+        // Taking the job manager lock around the rewrite is a little heavy
+        // handed but it works: it prevents us from changing the status while
+        // someone might be observing us, and it prevents someone from
+        // observing us while we change the status.
+        tokudb::background::_job_manager->lock();
+        snprintf(_status,
+                 sizeof(_status),
+                 "recount_rows %s.%s counted %llu rows and %llu deleted "
+                 "in %llu seconds.",
+                 _share->database_name(),
+                 _share->table_name(),
+                 _rows,
+                 _deleted_rows,
+                 _total_elapsed_time / tokudb::time::MICROSECONDS);
+        tokudb::background::_job_manager->unlock();
+
         // report
-        if (_thd) {
-            char status[256];
-            get_analyze_status(status);
-            thd_proc_info(_thd, status);
-        }
+        if (_thd)
+            tokudb_thd_set_proc_info(_thd, _status);
 
         // throttle
         // given the throttle value, lets calculate the maximum number of rows
@@ -238,18 +271,6 @@ int recount_rows_t::analyze_recount_rows_progress(
     }
     return 0;
 }
-void recount_rows_t::get_analyze_status(char* msg) {
-    sprintf(
-        msg,
-        "recount_rows %s.%s counted %llu rows and %llu deleted in %llu "
-        "seconds.",
-        _share->database_name(),
-        _share->table_name(),
-        _rows,
-        _deleted_rows,
-        _total_elapsed_time / tokudb::time::MICROSECONDS);
-}
-
 
 class standard_t : public tokudb::background::job_manager_t::job_t {
 public:
@@ -261,13 +282,11 @@ public:
     virtual ~standard_t();
 
     virtual const char* key(void);
-
-    virtual void status(
-        char* database,
-        char* table,
-        char* type,
-        char* params,
-        char* status);
+    virtual const char* database();
+    virtual const char* table();
+    virtual const char* type();
+    virtual const char* parameters();
+    virtual const char* status();
 
 protected:
     virtual void on_run();
@@ -284,6 +303,8 @@ private:
     double          _delete_fraction;
 
     // for analyze status reporting, may also use other state
+    char            _parameters[256];
+    char            _status[1024];
     int             _result;
     ulonglong       _analyze_start; // in microseconds
     ulonglong       _total_elapsed_time; // in microseconds
@@ -305,7 +326,6 @@ private:
         uint64_t deleted_rows);
     bool analyze_standard_cursor_callback(uint64_t deleted_rows);
 
-    void get_analyze_status(char*);
     int analyze_key_progress();
     int analyze_key(uint64_t* rec_per_key_part);
 };
@@ -351,6 +371,16 @@ standard_t::standard_t(
     _time_limit =
         tokudb::sysvars::analyze_time(thd) * tokudb::time::MICROSECONDS;
     _delete_fraction = tokudb::sysvars::analyze_delete_fraction(thd);
+
+    snprintf(_parameters,
+             sizeof(_parameters),
+             "TOKUDB_ANALYZE_DELETE_FRACTION=%f; "
+             "TOKUDB_ANALYZE_TIME=%llu; TOKUDB_ANALYZE_THROTTLE=%llu;",
+             _delete_fraction,
+             _time_limit / tokudb::time::MICROSECONDS,
+             _throttle);
+
+    _status[0] = '\0';
 }
 standard_t::~standard_t() {
 }
@@ -358,6 +388,10 @@ void standard_t::on_run() {
     DB_BTREE_STAT64 stat64;
     uint64_t rec_per_key_part[_share->_max_key_parts];
     uint64_t total_key_parts = 0;
+    const char* orig_proc_info = NULL;
+    if (_thd)
+        orig_proc_info = tokudb_thd_get_proc_info(_thd);
+
     _analyze_start = tokudb::time::microsec();
     _half_time = _time_limit > 0 ? _time_limit/2 : 0;
 
@@ -395,7 +429,7 @@ void standard_t::on_run() {
             _result = HA_ADMIN_FAILED;
         }
         if (_thd && (_result == HA_ADMIN_FAILED ||
-            (double)_deleted_rows >
+            static_cast<double>(_deleted_rows) >
                 _delete_fraction * (_rows + _deleted_rows))) {
 
             char name[256]; int namelen;
@@ -460,8 +494,9 @@ cleanup:
     }
 
 error:
+    if (_thd)
+        tokudb_thd_set_proc_info(_thd, orig_proc_info);
     return;
-
 }
 void standard_t::on_destroy() {
     _share->lock();
@@ -472,24 +507,21 @@ void standard_t::on_destroy() {
 const char* standard_t::key() {
     return _share->full_table_name();
 }
-void standard_t::status(
-    char* database,
-    char* table,
-    char* type,
-    char* params,
-    char* status) {
-
-    strcpy(database, _share->database_name());
-    strcpy(table, _share->table_name());
-    strcpy(type, "TOKUDB_ANALYZE_MODE_STANDARD");
-    sprintf(
-        params,
-        "TOKUDB_ANALYZE_DELETE_FRACTION=%f; "
-        "TOKUDB_ANALYZE_TIME=%llu; TOKUDB_ANALYZE_THROTTLE=%llu;",
-        _delete_fraction,
-        _time_limit / tokudb::time::MICROSECONDS,
-        _throttle);
-    get_analyze_status(status);
+const char* standard_t::database() {
+    return _share->database_name();
+}
+const char* standard_t::table() {
+    return _share->table_name();
+}
+const char* standard_t::type() {
+    static const char* type = "TOKUDB_ANALYZE_MODE_STANDARD";
+    return type;
+}
+const char* standard_t::parameters() {
+    return _parameters;
+}
+const char* standard_t::status() {
+    return _status;
 }
 bool standard_t::analyze_standard_cursor_callback(
     void* extra,
@@ -502,63 +534,81 @@ bool standard_t::analyze_standard_cursor_callback(uint64_t deleted_rows) {
     _ticks += deleted_rows;
     return analyze_key_progress() != 0;
 }
-void standard_t::get_analyze_status(char* msg) {
-    static const char* scan_direction_str[] = {
-        "not scanning",
-        "scanning forward",
-        "scanning backward",
-        "scan unknown"
-    };
-
-    const char* scan_direction = NULL;
-    switch (_scan_direction) {
-        case 0: scan_direction = scan_direction_str[0]; break;
-        case DB_NEXT: scan_direction = scan_direction_str[1]; break;
-        case DB_PREV: scan_direction = scan_direction_str[2]; break;
-        default: scan_direction = scan_direction_str[3]; break;
-    }
-
-    float progress_rows = 0.0;
-    if (_share->row_count() > 0)
-        progress_rows = (float) _rows / (float) _share->row_count();
-    float progress_time = 0.0;
-    if (_time_limit > 0)
-        progress_time = (float) _key_elapsed_time / (float) _time_limit;
-    sprintf(
-        msg,
-        "analyze table standard %s.%s.%s %llu of %u %.lf%% rows %.lf%% time, "
-        "%s",
-        _share->database_name(),
-        _share->table_name(),
-        _share->_key_descriptors[_current_key]._name,
-        _current_key,
-        _share->_keys,
-        progress_rows * 100.0,
-        progress_time * 100.0,
-        scan_direction);
-}
 int standard_t::analyze_key_progress(void) {
     if (_ticks > 1000) {
         _ticks = 0;
         uint64_t now = tokudb::time::microsec();
         _total_elapsed_time = now - _analyze_start;
         _key_elapsed_time = now - _analyze_key_start;
-        if ((_thd && thd_killed(_thd)) || cancelled()) {
+        if ((_thd && thd_kill_level(_thd)) || cancelled()) {
             // client killed
             return ER_ABORTING_CONNECTION;
-        } else if(_time_limit > 0 &&
-                  (uint64_t)_key_elapsed_time > _time_limit) {
+        } else if (_time_limit > 0 &&
+                   static_cast<uint64_t>(_key_elapsed_time) > _time_limit) {
             // time limit reached
             return ETIME;
         }
 
-        // report
-        if (_thd) {
-            char status[256];
-            get_analyze_status(status);
-            thd_proc_info(_thd, status);
+        // Rebuild status.
+        // _status is shared: it is passed to tokudb_thd_set_proc_info below
+        // and it is also the source of the status column in
+        // i_s.background_job_status.
+        // Without synchronization, someone querying/building the i_s table
+        // at the exact moment the status is being rebuilt here could read a
+        // partially written (garbage) status.
+        // Holding the job manager lock around the snprintf below is a little
+        // heavy handed but it works: it prevents us from changing the status
+        // while someone might be observing us, and it prevents someone from
+        // observing us while we change the status.
+        static const char* scan_direction_str[] = {"not scanning",
+                                                   "scanning forward",
+                                                   "scanning backward",
+                                                   "scan unknown"};
+
+        const char* scan_direction = NULL;
+        switch (_scan_direction) {
+            case 0:
+                scan_direction = scan_direction_str[0];
+                break;
+            case DB_NEXT:
+                scan_direction = scan_direction_str[1];
+                break;
+            case DB_PREV:
+                scan_direction = scan_direction_str[2];
+                break;
+            default:
+                scan_direction = scan_direction_str[3];
+                break;
         }
 
+        float progress_rows = 0.0;
+        if (_share->row_count() > 0)
+            progress_rows = static_cast<float>(_rows) /
+                            static_cast<float>(_share->row_count());
+        float progress_time = 0.0;
+        if (_time_limit > 0)
+            progress_time = static_cast<float>(_key_elapsed_time) /
+                            static_cast<float>(_time_limit);
+        tokudb::background::_job_manager->lock();
+        snprintf(
+            _status,
+            sizeof(_status),
+            "analyze table standard %s.%s.%s %llu of %u %.lf%% rows %.lf%% "
+            "time, %s",
+            _share->database_name(),
+            _share->table_name(),
+            _share->_key_descriptors[_current_key]._name,
+            _current_key,
+            _share->_keys,
+            progress_rows * 100.0,
+            progress_time * 100.0,
+            scan_direction);
+        tokudb::background::_job_manager->unlock();
+
+        // report
+        if (_thd)
+            tokudb_thd_set_proc_info(_thd, _status);
+
         // throttle
         // given the throttle value, lets calculate the maximum number of rows
         // we should have seen so far in a .1 sec resolution
@@ -694,6 +744,11 @@ int standard_t::analyze_key(uint64_t* rec_per_key_part) {
     assert_always(close_error == 0);
 
 done:
+    // in case we timed out (bunch of deleted records) without hitting a
+    // single row
+    if (_rows == 0)
+        _rows = 1;
+
     // return cardinality
     for (uint64_t i = 0; i < num_key_parts; i++) {
         rec_per_key_part[i] = _rows / unique_rows[i];
@@ -733,7 +788,6 @@ int TOKUDB_SHARE::analyze_recount_rows(THD* thd,DB_TXN* txn) {
 
     assert_always(thd != NULL);
 
-    const char *orig_proc_info = tokudb_thd_get_proc_info(thd);
     int result = HA_ADMIN_OK;
 
     tokudb::analyze::recount_rows_t* job
@@ -753,8 +807,6 @@ int TOKUDB_SHARE::analyze_recount_rows(THD* thd,DB_TXN* txn) {
         result = HA_ADMIN_FAILED;
     }
 
-    thd_proc_info(thd, orig_proc_info);
-
     TOKUDB_HANDLER_DBUG_RETURN(result);
 }
 
@@ -778,8 +830,6 @@ int TOKUDB_SHARE::analyze_standard(THD* thd, DB_TXN* txn) {
         TOKUDB_HANDLER_DBUG_RETURN(result);
     }
 
-    const char *orig_proc_info = tokudb_thd_get_proc_info(thd);
-
     tokudb::analyze::standard_t* job
         = new tokudb::analyze::standard_t(txn == NULL ? false : true, thd,
                                           this, txn);
@@ -808,8 +858,6 @@ int TOKUDB_SHARE::analyze_standard(THD* thd, DB_TXN* txn) {
 
     lock();
 
-    thd_proc_info(thd, orig_proc_info);
-
     TOKUDB_HANDLER_DBUG_RETURN(result);
 }
 
@@ -828,7 +876,7 @@ typedef struct hot_optimize_context {
 
 static int hot_optimize_progress_fun(void *extra, float progress) {
     HOT_OPTIMIZE_CONTEXT context = (HOT_OPTIMIZE_CONTEXT)extra;
-    if (thd_killed(context->thd)) {
+    if (thd_kill_level(context->thd)) {
         sprintf(
             context->write_status_msg,
             "The process has been killed, aborting hot optimize.");
@@ -955,7 +1003,7 @@ struct check_context {
 static int ha_tokudb_check_progress(void* extra, float progress) {
     struct check_context* context = (struct check_context*)extra;
     int result = 0;
-    if (thd_killed(context->thd))
+    if (thd_kill_level(context->thd))
         result = ER_ABORTING_CONNECTION;
     return result;
 }
diff --git a/storage/tokudb/hatoku_defines.h b/storage/tokudb/hatoku_defines.h
index 8c7ebfa62de..a03f365d724 100644
--- a/storage/tokudb/hatoku_defines.h
+++ b/storage/tokudb/hatoku_defines.h
@@ -7,7 +7,7 @@ This file is part of TokuDB
 
 Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 
-    TokuDBis is free software: you can redistribute it and/or modify
+    TokuDB is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License, version 2,
     as published by the Free Software Foundation.
 
@@ -234,9 +234,12 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 // mysql 5.6.15 removed the test macro, so we define our own
 #define tokudb_test(e) ((e) ? 1 : 0)
 
-inline const char* tokudb_thd_get_proc_info(const THD *thd) {
+inline const char* tokudb_thd_get_proc_info(const THD* thd) {
     return thd->proc_info;
 }
+inline void tokudb_thd_set_proc_info(THD* thd, const char* proc_info) {
+    thd_proc_info(thd, proc_info);
+}
 
 // uint3korr reads 4 bytes and valgrind reports an error, so we use this function instead
 inline uint tokudb_uint3korr(const uchar *a) {
diff --git a/storage/tokudb/hatoku_hton.cc b/storage/tokudb/hatoku_hton.cc
index 6a32f35479e..c3fb6afed73 100644
--- a/storage/tokudb/hatoku_hton.cc
+++ b/storage/tokudb/hatoku_hton.cc
@@ -55,6 +55,7 @@ static bool tokudb_show_status(
 static void tokudb_handle_fatal_signal(handlerton* hton, THD* thd, int sig);
 #endif
 static int tokudb_close_connection(handlerton* hton, THD* thd);
+static void tokudb_kill_query(handlerton *hton, THD *thd, enum thd_kill_levels level);
 static int tokudb_commit(handlerton* hton, THD* thd, bool all);
 static int tokudb_rollback(handlerton* hton, THD* thd, bool all);
 #if TOKU_INCLUDE_XA
@@ -147,6 +148,11 @@ static void tokudb_lock_timeout_callback(
     const DBT* right_key,
     uint64_t blocking_txnid);
 
+static void tokudb_lock_wait_needed_callback(
+    void* arg,
+    uint64_t requesting_txnid,
+    uint64_t blocking_txnid);
+
 #define ASSERT_MSGLEN 1024
 
 void toku_hton_assert_fail(
@@ -331,6 +337,7 @@ static int tokudb_init_func(void *p) {
 
     tokudb_hton->create = tokudb_create_handler;
     tokudb_hton->close_connection = tokudb_close_connection;
+    tokudb_hton->kill_query = tokudb_kill_query;
 
     tokudb_hton->savepoint_offset = sizeof(SP_INFO_T);
     tokudb_hton->savepoint_set = tokudb_savepoint;
@@ -531,6 +538,8 @@ static int tokudb_init_func(void *p) {
     db_env->change_fsync_log_period(db_env, tokudb::sysvars::fsync_log_period);
 
     db_env->set_lock_timeout_callback(db_env, tokudb_lock_timeout_callback);
+    db_env->set_dir_per_db(db_env, tokudb::sysvars::dir_per_db);
+    db_env->set_lock_wait_callback(db_env, tokudb_lock_wait_needed_callback);
 
     db_env->set_loader_memory_size(
         db_env,
@@ -753,6 +762,12 @@ static int tokudb_close_connection(handlerton* hton, THD* thd) {
     return error;
 }
 
+void tokudb_kill_query(handlerton *hton, THD *thd, enum thd_kill_levels level) {
+    TOKUDB_DBUG_ENTER("");
+    db_env->kill_waiter(db_env, thd);
+    DBUG_VOID_RETURN;
+}
+
 bool tokudb_flush_logs(handlerton * hton) {
     TOKUDB_DBUG_ENTER("");
     int error;
@@ -872,9 +887,9 @@ static int tokudb_commit(handlerton * hton, THD * thd, bool all) {
             tokudb_sync_on_commit(thd, trx, this_txn) ? 0 : DB_TXN_NOSYNC;
         TOKUDB_TRACE_FOR_FLAGS(
             TOKUDB_DEBUG_TXN,
-            "commit trx %u txn %p syncflag %u",
+            "commit trx %u txn %p %" PRIu64 " syncflag %u",
             all,
-            this_txn,
+            this_txn, this_txn->id64(this_txn),
             syncflag);
         // test hook to induce a crash on a debug build
         DBUG_EXECUTE_IF("tokudb_crash_commit_before", DBUG_SUICIDE(););
@@ -903,9 +918,9 @@ static int tokudb_rollback(handlerton * hton, THD * thd, bool all) {
     if (this_txn) {
         TOKUDB_TRACE_FOR_FLAGS(
             TOKUDB_DEBUG_TXN,
-            "rollback %u txn %p",
+            "rollback %u txn %p %" PRIu64,
             all,
-            this_txn);
+            this_txn, this_txn->id64(this_txn));
         tokudb_cleanup_handlers(trx, this_txn);
         abort_txn_with_progress(this_txn, thd);
         *txn = NULL;
@@ -951,9 +966,9 @@ static int tokudb_xa_prepare(handlerton* hton, THD* thd, bool all) {
         uint32_t syncflag = tokudb_sync_on_prepare() ? 0 : DB_TXN_NOSYNC;
         TOKUDB_TRACE_FOR_FLAGS(
             TOKUDB_DEBUG_XA,
-            "doing txn prepare:%d:%p",
+            "doing txn prepare:%d:%p %" PRIu64,
             all,
-            txn);
+            txn, txn->id64(txn));
         // a TOKU_XA_XID is identical to a MYSQL_XID
         TOKU_XA_XID thd_xid;
         thd_get_xid(thd, (MYSQL_XID*) &thd_xid);
@@ -1569,7 +1584,9 @@ static int tokudb_search_txn_callback(
     void* extra) {
 
     uint64_t txn_id = txn->id64(txn);
-    uint64_t client_id = txn->get_client_id(txn);
+    uint64_t client_id;
+    void *client_extra;
+    txn->get_client_id(txn, &client_id, &client_extra);
     struct tokudb_search_txn_extra* e =
         reinterpret_cast<struct tokudb_search_txn_extra*>(extra);
     if (e->match_txn_id == txn_id) {
@@ -1747,6 +1764,63 @@ static void tokudb_lock_timeout_callback(
     }
 }
 
+extern "C" int thd_rpl_deadlock_check(MYSQL_THD thd, MYSQL_THD other_thd);
+
+struct tokudb_search_txn_thd {
+    bool match_found;
+    uint64_t match_txn_id;
+    THD *match_client_thd;
+};
+
+static int tokudb_search_txn_thd_callback(
+    DB_TXN* txn,
+    iterate_row_locks_callback iterate_locks,
+    void* locks_extra,
+    void* extra) {
+
+    uint64_t txn_id = txn->id64(txn);
+    uint64_t client_id;
+    void *client_extra;
+    txn->get_client_id(txn, &client_id, &client_extra);
+    struct tokudb_search_txn_thd* e =
+        reinterpret_cast<struct tokudb_search_txn_thd*>(extra);
+    if (e->match_txn_id == txn_id) {
+        e->match_found = true;
+        e->match_client_thd = reinterpret_cast<THD *>(client_extra);
+        return 1;
+    }
+    return 0;
+}
+
+static bool tokudb_txn_id_to_thd(
+    uint64_t txnid,
+    THD **out_thd) {
+
+    struct tokudb_search_txn_thd e = {
+        false,
+        txnid,
+        0
+    };
+    db_env->iterate_live_transactions(db_env, tokudb_search_txn_thd_callback, &e);
+    if (e.match_found) {
+        *out_thd = e.match_client_thd;
+    }
+    return e.match_found;
+}
+
+static void tokudb_lock_wait_needed_callback(
+    void *arg,
+    uint64_t requesting_txnid,
+    uint64_t blocking_txnid) {
+
+    THD *requesting_thd;
+    THD *blocking_thd;
+    if (tokudb_txn_id_to_thd(requesting_txnid, &requesting_thd) &&
+        tokudb_txn_id_to_thd(blocking_txnid, &blocking_thd)) {
+        thd_rpl_deadlock_check (requesting_thd, blocking_thd);
+    }
+}
+
 // Retrieves variables for information_schema.global_status.
 // Names (columnname) are automatically converted to upper case,
 // and prefixed with "TOKUDB_"
diff --git a/storage/tokudb/hatoku_hton.h b/storage/tokudb/hatoku_hton.h
index ade7be128a5..d126ff4339f 100644
--- a/storage/tokudb/hatoku_hton.h
+++ b/storage/tokudb/hatoku_hton.h
@@ -172,12 +172,12 @@ inline uint64_t tokudb_get_killed_time_callback(uint64_t default_killed_time) {
 
 inline int tokudb_killed_callback(void) {
     THD *thd = current_thd;
-    return thd_killed(thd);
+    return thd_kill_level(thd);  // thd_kill_level() result for current_thd, returned as-is
 }
 
 inline bool tokudb_killed_thd_callback(void *extra, uint64_t deleted_rows) {
     THD *thd = static_cast<THD *>(extra);
-    return thd_killed(thd) != 0;
+    return thd_kill_level(thd) != 0;  // true for any nonzero kill level; deleted_rows is unused
 }
 
 
diff --git a/storage/tokudb/mysql-test/rpl/r/rpl_foreign_key_tokudb.result b/storage/tokudb/mysql-test/rpl/r/rpl_foreign_key_tokudb.result
deleted file mode 100644
index e35e29d8248..00000000000
--- a/storage/tokudb/mysql-test/rpl/r/rpl_foreign_key_tokudb.result
+++ /dev/null
@@ -1,59 +0,0 @@
-include/master-slave.inc
-[connection master]
-CREATE TABLE t1 (a INT AUTO_INCREMENT KEY) ENGINE=TokuDB;
-CREATE TABLE t2 (b INT AUTO_INCREMENT KEY, c INT, FOREIGN KEY(b) REFERENCES t1(a)) ENGINE=TokuDB;
-SET FOREIGN_KEY_CHECKS=0;
-INSERT INTO t1 VALUES (10);
-INSERT INTO t1 VALUES (NULL),(NULL),(NULL);
-INSERT INTO t2 VALUES (5,0);
-INSERT INTO t2 VALUES (NULL,LAST_INSERT_ID());
-SET FOREIGN_KEY_CHECKS=1;
-SELECT * FROM t1 ORDER BY a;
-a
-10
-11
-12
-13
-SELECT * FROM t2 ORDER BY b;
-b	c
-5	0
-6	11
-connection slave;
-SELECT * FROM t1 ORDER BY a;
-a
-10
-11
-12
-13
-SELECT * FROM t2 ORDER BY b;
-b	c
-5	0
-6	11
-connection master;
-SET TIMESTAMP=1000000000;
-CREATE TABLE t3 ( a INT UNIQUE );
-SET FOREIGN_KEY_CHECKS=0;
-INSERT INTO t3 VALUES (1),(1);
-ERROR 23000: Duplicate entry '1' for key 'a'
-connection slave;
-connection master;
-SET FOREIGN_KEY_CHECKS=0;
-DROP TABLE IF EXISTS t1,t2,t3;
-SET FOREIGN_KEY_CHECKS=1;
-connection slave;
-connection master;
-create table t1 (b int primary key) engine = TokuDB;
-create table t2 (a int primary key, b int, foreign key (b) references t1(b))
-engine = TokuDB;
-insert into t1 set b=1;
-insert into t2 set a=1, b=1;
-set foreign_key_checks=0;
-delete from t1;
-must sync w/o a problem (could not with the buggy code)
-connection slave;
-select count(*) from t1 /* must be zero */;
-count(*)
-0
-connection master;
-drop table t2,t1;
-include/rpl_end.inc
diff --git a/storage/tokudb/mysql-test/rpl/t/rpl_foreign_key_tokudb.test b/storage/tokudb/mysql-test/rpl/t/rpl_foreign_key_tokudb.test
deleted file mode 100644
index 120ad0d5c1e..00000000000
--- a/storage/tokudb/mysql-test/rpl/t/rpl_foreign_key_tokudb.test
+++ /dev/null
@@ -1,3 +0,0 @@
--- source include/have_tokudb.inc
-let $engine_type=TokuDB;
--- source extra/rpl_tests/rpl_foreign_key.test
diff --git a/storage/tokudb/mysql-test/tokudb/disabled.def b/storage/tokudb/mysql-test/tokudb/disabled.def
index c98a8aa622a..ddefceb432e 100644
--- a/storage/tokudb/mysql-test/tokudb/disabled.def
+++ b/storage/tokudb/mysql-test/tokudb/disabled.def
@@ -28,3 +28,4 @@ type_timestamp_explicit:
 cluster_key_part: engine options on partitioned tables
 i_s_tokudb_lock_waits_released: unstable, race conditions
 i_s_tokudb_locks_released: unstable, race conditions
+row_format: n/a
diff --git a/storage/tokudb/mysql-test/tokudb/include/table_files_replace_pattern.inc b/storage/tokudb/mysql-test/tokudb/include/table_files_replace_pattern.inc
new file mode 100644
index 00000000000..b10ad21dd95
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb/include/table_files_replace_pattern.inc
@@ -0,0 +1 @@
+--replace_regex  /[a-z0-9]+_[a-z0-9]+_[a-z0-9]+(_[BP]_[a-z0-9]+){0,1}\./id./ /sqlx_[a-z0-9]+_[a-z0-9]+_/sqlx_nnnn_nnnn_/ /sqlx-[a-z0-9]+_[a-z0-9]+/sqlx-nnnn_nnnn/ /#p#/#P#/ /#sp#/#SP#/ /#tmp#/#TMP#/
diff --git a/storage/tokudb/mysql-test/tokudb/r/background_job_manager.result b/storage/tokudb/mysql-test/tokudb/r/background_job_manager.result
index 9d813eca8e9..69b55582aa2 100644
--- a/storage/tokudb/mysql-test/tokudb/r/background_job_manager.result
+++ b/storage/tokudb/mysql-test/tokudb/r/background_job_manager.result
@@ -25,7 +25,7 @@ TokuDB_background_job_status	CREATE TEMPORARY TABLE `TokuDB_background_job_statu
   `scheduler` varchar(32) NOT NULL DEFAULT '',
   `scheduled_time` datetime NOT NULL DEFAULT '0000-00-00 00:00:00',
   `started_time` datetime DEFAULT NULL,
-  `status` varchar(256) DEFAULT NULL
+  `status` varchar(1024) DEFAULT NULL
 ) ENGINE=MEMORY DEFAULT CHARSET=utf8
 create table t1 (a int not null auto_increment, b int, c int, primary key(a), key kb(b), key kc(c), key kabc(a,b,c), key kab(a,b), key kbc(b,c));
 insert into t1(b,c) values(0,0), (1,1), (2,2), (3,3);
diff --git a/storage/tokudb/mysql-test/tokudb/r/cluster_2968-1.result b/storage/tokudb/mysql-test/tokudb/r/cluster_2968-1.result
index 6e89358756d..758d51be01c 100644
--- a/storage/tokudb/mysql-test/tokudb/r/cluster_2968-1.result
+++ b/storage/tokudb/mysql-test/tokudb/r/cluster_2968-1.result
@@ -1045,10 +1045,10 @@ id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t	ref	b	b	5	test.s.b	1	
 alter table s add key(b) clustering=yes;
 Warnings:
-Note	1831	Duplicate index 'b_2' defined on the table 'test.s'. This is deprecated and will be disallowed in a future release
+Note	1831	Duplicate index `b_2`. This is deprecated and will be disallowed in a future release
 alter table t add key(b) clustering=yes;
 Warnings:
-Note	1831	Duplicate index 'b_2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release
+Note	1831	Duplicate index `b_2`. This is deprecated and will be disallowed in a future release
 show create table s;
 Table	Create Table
 s	CREATE TABLE `s` (
@@ -1095,10 +1095,10 @@ id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t	ref	b_2	b_2	5	test.s.b	1	Using index
 alter table s add key(b);
 Warnings:
-Note	1831	Duplicate index 'b' defined on the table 'test.s'. This is deprecated and will be disallowed in a future release
+Note	1831	Duplicate index `b`. This is deprecated and will be disallowed in a future release
 alter table t add key(b);
 Warnings:
-Note	1831	Duplicate index 'b' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release
+Note	1831	Duplicate index `b`. This is deprecated and will be disallowed in a future release
 show create table s;
 Table	Create Table
 s	CREATE TABLE `s` (
diff --git a/storage/tokudb/mysql-test/tokudb/r/cluster_2968-2.result b/storage/tokudb/mysql-test/tokudb/r/cluster_2968-2.result
index c8c962ee3be..4c3f971770e 100644
--- a/storage/tokudb/mysql-test/tokudb/r/cluster_2968-2.result
+++ b/storage/tokudb/mysql-test/tokudb/r/cluster_2968-2.result
@@ -1069,10 +1069,10 @@ id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	t	ref	b,b_2	b_2	5	test.s.b	1	Using index
 alter table s add key(b) clustering=yes;
 Warnings:
-Note	1831	Duplicate index 'b_3' defined on the table 'test.s'. This is deprecated and will be disallowed in a future release
+Note	1831	Duplicate index `b_3`. This is deprecated and will be disallowed in a future release
 alter table t add key(b) clustering=yes;
 Warnings:
-Note	1831	Duplicate index 'b_3' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release
+Note	1831	Duplicate index `b_3`. This is deprecated and will be disallowed in a future release
 show create table s;
 Table	Create Table
 s	CREATE TABLE `s` (
diff --git a/storage/tokudb/mysql-test/tokudb/r/cluster_2968-3.result b/storage/tokudb/mysql-test/tokudb/r/cluster_2968-3.result
index 3ff67b27807..407ef9cc316 100644
--- a/storage/tokudb/mysql-test/tokudb/r/cluster_2968-3.result
+++ b/storage/tokudb/mysql-test/tokudb/r/cluster_2968-3.result
@@ -1066,13 +1066,13 @@ id	select_type	table	type	possible_keys	key	key_len	ref	rows	Extra
 1	SIMPLE	u	ref	c	c	5	test.s.c	1	
 alter table s add key (b) clustering=yes;
 Warnings:
-Note	1831	Duplicate index 'b_2' defined on the table 'test.s'. This is deprecated and will be disallowed in a future release
+Note	1831	Duplicate index `b_2`. This is deprecated and will be disallowed in a future release
 alter table t add key (b) clustering=yes;
 Warnings:
-Note	1831	Duplicate index 'b_2' defined on the table 'test.t'. This is deprecated and will be disallowed in a future release
+Note	1831	Duplicate index `b_2`. This is deprecated and will be disallowed in a future release
 alter table u add key (c) clustering=yes;
 Warnings:
-Note	1831	Duplicate index 'c_2' defined on the table 'test.u'. This is deprecated and will be disallowed in a future release
+Note	1831	Duplicate index `c_2`. This is deprecated and will be disallowed in a future release
 show create table s;
 Table	Create Table
 s	CREATE TABLE `s` (
diff --git a/storage/tokudb/mysql-test/tokudb/r/dir-per-db-with-custom-data-dir.result b/storage/tokudb/mysql-test/tokudb/r/dir-per-db-with-custom-data-dir.result
new file mode 100644
index 00000000000..a36dbcb28c0
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb/r/dir-per-db-with-custom-data-dir.result
@@ -0,0 +1,10 @@
+SELECT @@tokudb_dir_per_db;
+@@tokudb_dir_per_db
+1
+TOKUDB_DATA_DIR_CHANGED
+1
+CREATE DATABASE tokudb_test;
+USE tokudb_test;
+CREATE TABLE t (a INT UNSIGNED AUTO_INCREMENT PRIMARY KEY) ENGINE=tokudb;
+DROP TABLE t;
+DROP DATABASE tokudb_test;
diff --git a/storage/tokudb/mysql-test/tokudb/r/dir_per_db.result b/storage/tokudb/mysql-test/tokudb/r/dir_per_db.result
new file mode 100644
index 00000000000..371f97406c8
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb/r/dir_per_db.result
@@ -0,0 +1,180 @@
+########
+#  tokudb_dir_per_db = 1
+########
+SET GLOBAL tokudb_dir_per_db= 1;
+########
+#  CREATE
+########
+CREATE TABLE t1 (a INT UNSIGNED AUTO_INCREMENT PRIMARY KEY, b INT(10) UNSIGNED NOT NULL) ENGINE=tokudb;
+INSERT INTO t1 SET b = 10;
+INSERT INTO t1 SET b = 20;
+SELECT b FROM t1 ORDER BY a;
+b
+10
+20
+CREATE INDEX b ON t1 (b);
+CREATE INDEX ab ON t1 (a,b);
+## Looking for *.tokudb files in data_dir
+## Looking for *.tokudb files in data_dir/test
+t1_key_ab_id.tokudb
+t1_key_b_id.tokudb
+t1_main_id.tokudb
+t1_status_id.tokudb
+########
+#  RENAME
+########
+RENAME TABLE t1 TO t2;
+SELECT b FROM t2 ORDER BY a;
+b
+10
+20
+## Looking for *.tokudb files in data_dir
+## Looking for *.tokudb files in data_dir/test
+t2_key_ab_id.tokudb
+t2_key_b_id.tokudb
+t2_main_id.tokudb
+t2_status_id.tokudb
+########
+#  DROP
+########
+DROP TABLE t2;
+## Looking for *.tokudb files in data_dir
+## Looking for *.tokudb files in data_dir/test
+########
+#  tokudb_dir_per_db = 0
+########
+SET GLOBAL tokudb_dir_per_db= 0;
+########
+#  CREATE
+########
+CREATE TABLE t1 (a INT UNSIGNED AUTO_INCREMENT PRIMARY KEY, b INT(10) UNSIGNED NOT NULL) ENGINE=tokudb;
+INSERT INTO t1 SET b = 10;
+INSERT INTO t1 SET b = 20;
+SELECT b FROM t1 ORDER BY a;
+b
+10
+20
+CREATE INDEX b ON t1 (b);
+CREATE INDEX ab ON t1 (a,b);
+## Looking for *.tokudb files in data_dir
+_test_t1_key_ab_id.tokudb
+_test_t1_key_b_id.tokudb
+_test_t1_main_id.tokudb
+_test_t1_status_id.tokudb
+## Looking for *.tokudb files in data_dir/test
+########
+#  RENAME
+########
+RENAME TABLE t1 TO t2;
+SELECT b FROM t2 ORDER BY a;
+b
+10
+20
+## Looking for *.tokudb files in data_dir
+_test_t1_key_ab_id.tokudb
+_test_t1_key_b_id.tokudb
+_test_t1_main_id.tokudb
+_test_t1_status_id.tokudb
+## Looking for *.tokudb files in data_dir/test
+########
+#  DROP
+########
+DROP TABLE t2;
+## Looking for *.tokudb files in data_dir
+## Looking for *.tokudb files in data_dir/test
+########
+#  CREATE on tokudb_dir_per_db = 0 and RENAME on tokudb_dir_per_db = 1 and vice versa
+########
+########
+#  tokudb_dir_per_db = (1 - 1);
+########
+SET GLOBAL tokudb_dir_per_db= (1 - 1);;
+########
+#  CREATE
+########
+CREATE TABLE t1 (a INT UNSIGNED AUTO_INCREMENT PRIMARY KEY, b INT(10) UNSIGNED NOT NULL) ENGINE=tokudb;
+INSERT INTO t1 SET b = 10;
+INSERT INTO t1 SET b = 20;
+SELECT b FROM t1 ORDER BY a;
+b
+10
+20
+CREATE INDEX b ON t1 (b);
+CREATE INDEX ab ON t1 (a,b);
+## Looking for *.tokudb files in data_dir
+_test_t1_key_ab_id.tokudb
+_test_t1_key_b_id.tokudb
+_test_t1_main_id.tokudb
+_test_t1_status_id.tokudb
+## Looking for *.tokudb files in data_dir/test
+########
+#  tokudb_dir_per_db = 1
+########
+SET GLOBAL tokudb_dir_per_db= 1;
+########
+#  RENAME
+########
+RENAME TABLE t1 TO t2;
+SELECT b FROM t2 ORDER BY a;
+b
+10
+20
+## Looking for *.tokudb files in data_dir
+## Looking for *.tokudb files in data_dir/test
+t2_key_ab_id.tokudb
+t2_key_b_id.tokudb
+t2_main_id.tokudb
+t2_status_id.tokudb
+########
+#  DROP
+########
+DROP TABLE t2;
+## Looking for *.tokudb files in data_dir
+## Looking for *.tokudb files in data_dir/test
+########
+#  tokudb_dir_per_db = (1 - 0);
+########
+SET GLOBAL tokudb_dir_per_db= (1 - 0);;
+########
+#  CREATE
+########
+CREATE TABLE t1 (a INT UNSIGNED AUTO_INCREMENT PRIMARY KEY, b INT(10) UNSIGNED NOT NULL) ENGINE=tokudb;
+INSERT INTO t1 SET b = 10;
+INSERT INTO t1 SET b = 20;
+SELECT b FROM t1 ORDER BY a;
+b
+10
+20
+CREATE INDEX b ON t1 (b);
+CREATE INDEX ab ON t1 (a,b);
+## Looking for *.tokudb files in data_dir
+## Looking for *.tokudb files in data_dir/test
+t1_key_ab_id.tokudb
+t1_key_b_id.tokudb
+t1_main_id.tokudb
+t1_status_id.tokudb
+########
+#  tokudb_dir_per_db = 0
+########
+SET GLOBAL tokudb_dir_per_db= 0;
+########
+#  RENAME
+########
+RENAME TABLE t1 TO t2;
+SELECT b FROM t2 ORDER BY a;
+b
+10
+20
+## Looking for *.tokudb files in data_dir
+## Looking for *.tokudb files in data_dir/test
+t1_key_ab_id.tokudb
+t1_key_b_id.tokudb
+t1_main_id.tokudb
+t1_status_id.tokudb
+########
+#  DROP
+########
+DROP TABLE t2;
+## Looking for *.tokudb files in data_dir
+## Looking for *.tokudb files in data_dir/test
+SET GLOBAL tokudb_dir_per_db=default;
diff --git a/storage/tokudb/mysql-test/tokudb/r/i_s_tokudb_lock_waits_released.result b/storage/tokudb/mysql-test/tokudb/r/i_s_tokudb_lock_waits_released.result
index 6f9592ddc1f..ecd4d077206 100644
--- a/storage/tokudb/mysql-test/tokudb/r/i_s_tokudb_lock_waits_released.result
+++ b/storage/tokudb/mysql-test/tokudb/r/i_s_tokudb_lock_waits_released.result
@@ -2,6 +2,7 @@ set default_storage_engine='tokudb';
 set tokudb_prelock_empty=false;
 drop table if exists t;
 create table t (id int primary key);
+t should be empty
 select trx_id,trx_mysql_thread_id from information_schema.tokudb_trx;
 trx_id	trx_mysql_thread_id
 select * from information_schema.tokudb_locks;
@@ -15,17 +16,21 @@ insert into t values (1);
 set autocommit=0;
 set tokudb_lock_timeout=600000;
 insert into t values (1);
+should find the presence of a lock on 1st transaction
 select * from information_schema.tokudb_locks;
 locks_trx_id	locks_mysql_thread_id	locks_dname	locks_key_left	locks_key_right	locks_table_schema	locks_table_name	locks_table_dictionary_name
 TRX_ID	MYSQL_ID	./test/t-main	0001000000	0001000000	test	t	main
+should find the presence of a lock_wait on the 2nd transaction
 select * from information_schema.tokudb_lock_waits;
 requesting_trx_id	blocking_trx_id	lock_waits_dname	lock_waits_key_left	lock_waits_key_right	lock_waits_start_time	lock_waits_table_schema	lock_waits_table_name	lock_waits_table_dictionary_name
 REQUEST_TRX_ID	BLOCK_TRX_ID	./test/t-main	0001000000	0001000000	LOCK_WAITS_START_TIME	test	t	main
+should find the presence of two transactions
 select trx_id,trx_mysql_thread_id from information_schema.tokudb_trx;
 trx_id	trx_mysql_thread_id
 TRX_ID	MYSQL_ID
 TRX_ID	MYSQL_ID
 commit;
+verify that the lock on the 1st transaction is released and replaced by the lock for the 2nd transaction
 select * from information_schema.tokudb_locks;
 locks_trx_id	locks_mysql_thread_id	locks_dname	locks_key_left	locks_key_right	locks_table_schema	locks_table_name	locks_table_dictionary_name
 TRX_ID	MYSQL_ID	./test/t-main	0001000000	0001000000	test	t	main
@@ -33,6 +38,8 @@ select * from information_schema.tokudb_lock_waits;
 requesting_trx_id	blocking_trx_id	lock_waits_dname	lock_waits_key_left	lock_waits_key_right	lock_waits_start_time	lock_waits_table_schema	lock_waits_table_name	lock_waits_table_dictionary_name
 ERROR 23000: Duplicate entry '1' for key 'PRIMARY'
 commit;
+verify that txn_a replace (1) blocks txn_b replace (1) and txn_b eventually gets the lock on (1) and completes
+verify that the lock on the 2nd transaction has been released, should be be empty
 select trx_id,trx_mysql_thread_id from information_schema.tokudb_trx;
 trx_id	trx_mysql_thread_id
 select * from information_schema.tokudb_locks;
@@ -46,23 +53,28 @@ replace into t values (1);
 set autocommit=0;
 set tokudb_lock_timeout=600000;
 replace into t values (1);
+should find the presence of a lock on 1st transaction
 select * from information_schema.tokudb_locks;
 locks_trx_id	locks_mysql_thread_id	locks_dname	locks_key_left	locks_key_right	locks_table_schema	locks_table_name	locks_table_dictionary_name
 TRX_ID	MYSQL_ID	./test/t-main	0001000000	0001000000	test	t	main
+should find the presence of a lock_wait on the 2nd transaction
 select * from information_schema.tokudb_lock_waits;
 requesting_trx_id	blocking_trx_id	lock_waits_dname	lock_waits_key_left	lock_waits_key_right	lock_waits_start_time	lock_waits_table_schema	lock_waits_table_name	lock_waits_table_dictionary_name
 REQUEST_TRX_ID	BLOCK_TRX_ID	./test/t-main	0001000000	0001000000	LOCK_WAITS_START_TIME	test	t	main
+should find the presence of two transactions
 select trx_id,trx_mysql_thread_id from information_schema.tokudb_trx;
 trx_id	trx_mysql_thread_id
 TRX_ID	MYSQL_ID
 TRX_ID	MYSQL_ID
 commit;
+verify that the lock on the 1st transaction is released and replaced by the lock for the 2nd transaction
 select * from information_schema.tokudb_locks;
 locks_trx_id	locks_mysql_thread_id	locks_dname	locks_key_left	locks_key_right	locks_table_schema	locks_table_name	locks_table_dictionary_name
 TRX_ID	MYSQL_ID	./test/t-main	0001000000	0001000000	test	t	main
 select * from information_schema.tokudb_lock_waits;
 requesting_trx_id	blocking_trx_id	lock_waits_dname	lock_waits_key_left	lock_waits_key_right	lock_waits_start_time	lock_waits_table_schema	lock_waits_table_name	lock_waits_table_dictionary_name
 commit;
+verify that the lock on the 2nd transaction has been released, should be be empty
 select trx_id,trx_mysql_thread_id from information_schema.tokudb_trx;
 trx_id	trx_mysql_thread_id
 select * from information_schema.tokudb_locks;
diff --git a/storage/tokudb/mysql-test/tokudb/r/row_format.result b/storage/tokudb/mysql-test/tokudb/r/row_format.result
new file mode 100644
index 00000000000..cb669148445
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb/r/row_format.result
@@ -0,0 +1,51 @@
+CREATE TABLE tokudb_row_format_test_1 (a INT) ENGINE=TokuDB ROW_FORMAT=TOKUDB_DEFAULT;
+CREATE TABLE tokudb_row_format_test_2 (a INT) ENGINE=TokuDB ROW_FORMAT=TOKUDB_FAST;
+CREATE TABLE tokudb_row_format_test_3 (a INT) ENGINE=TokuDB ROW_FORMAT=TOKUDB_SMALL;
+CREATE TABLE tokudb_row_format_test_4 (a INT) ENGINE=TokuDB ROW_FORMAT=TOKUDB_UNCOMPRESSED;
+CREATE TABLE tokudb_row_format_test_5 (a INT) ENGINE=TokuDB ROW_FORMAT=TOKUDB_ZLIB;
+CREATE TABLE tokudb_row_format_test_6 (a INT) ENGINE=TokuDB ROW_FORMAT=TOKUDB_LZMA;
+CREATE TABLE tokudb_row_format_test_7 (a INT) ENGINE=TokuDB ROW_FORMAT=TOKUDB_QUICKLZ;
+CREATE TABLE tokudb_row_format_test_8 (a INT) ENGINE=TokuDB ROW_FORMAT=TOKUDB_SNAPPY;
+SELECT table_name, row_format, engine FROM information_schema.tables WHERE table_name like 'tokudb_row_format_test%' ORDER BY table_name;
+table_name	row_format	engine
+tokudb_row_format_test_1	tokudb_zlib	TokuDB
+tokudb_row_format_test_2	tokudb_quicklz	TokuDB
+tokudb_row_format_test_3	tokudb_lzma	TokuDB
+tokudb_row_format_test_4	tokudb_uncompressed	TokuDB
+tokudb_row_format_test_5	tokudb_zlib	TokuDB
+tokudb_row_format_test_6	tokudb_lzma	TokuDB
+tokudb_row_format_test_7	tokudb_quicklz	TokuDB
+tokudb_row_format_test_8	tokudb_snappy	TokuDB
+ALTER TABLE tokudb_row_format_test_1 ENGINE=TokuDB ROW_FORMAT=TOKUDB_FAST;
+SELECT table_name, row_format, engine FROM information_schema.tables WHERE table_name = 'tokudb_row_format_test_1';
+table_name	row_format	engine
+tokudb_row_format_test_1	tokudb_quicklz	TokuDB
+ALTER TABLE tokudb_row_format_test_1 ENGINE=TokuDB ROW_FORMAT=TOKUDB_SMALL;
+SELECT table_name, row_format, engine FROM information_schema.tables WHERE table_name = 'tokudb_row_format_test_1';
+table_name	row_format	engine
+tokudb_row_format_test_1	tokudb_lzma	TokuDB
+ALTER TABLE tokudb_row_format_test_1 ENGINE=TokuDB ROW_FORMAT=TOKUDB_UNCOMPRESSED;
+SELECT table_name, row_format, engine FROM information_schema.tables WHERE table_name = 'tokudb_row_format_test_1';
+table_name	row_format	engine
+tokudb_row_format_test_1	tokudb_uncompressed	TokuDB
+ALTER TABLE tokudb_row_format_test_1 ENGINE=TokuDB ROW_FORMAT=TOKUDB_ZLIB;
+SELECT table_name, row_format, engine FROM information_schema.tables WHERE table_name = 'tokudb_row_format_test_1';
+table_name	row_format	engine
+tokudb_row_format_test_1	tokudb_zlib	TokuDB
+ALTER TABLE tokudb_row_format_test_1 ENGINE=TokuDB ROW_FORMAT=TOKUDB_SNAPPY;
+SELECT table_name, row_format, engine FROM information_schema.tables WHERE table_name = 'tokudb_row_format_test_1';
+table_name	row_format	engine
+tokudb_row_format_test_1	tokudb_snappy	TokuDB
+ALTER TABLE tokudb_row_format_test_1 ENGINE=TokuDB ROW_FORMAT=TOKUDB_QUICKLZ;
+SELECT table_name, row_format, engine FROM information_schema.tables WHERE table_name = 'tokudb_row_format_test_1';
+table_name	row_format	engine
+tokudb_row_format_test_1	tokudb_quicklz	TokuDB
+ALTER TABLE tokudb_row_format_test_1 ENGINE=TokuDB ROW_FORMAT=TOKUDB_LZMA;
+SELECT table_name, row_format, engine FROM information_schema.tables WHERE table_name = 'tokudb_row_format_test_1';
+table_name	row_format	engine
+tokudb_row_format_test_1	tokudb_lzma	TokuDB
+ALTER TABLE tokudb_row_format_test_1 ENGINE=TokuDB ROW_FORMAT=TOKUDB_DEFAULT;
+SELECT table_name, row_format, engine FROM information_schema.tables WHERE table_name = 'tokudb_row_format_test_1';
+table_name	row_format	engine
+tokudb_row_format_test_1	tokudb_zlib	TokuDB
+DROP TABLE tokudb_row_format_test_1, tokudb_row_format_test_2, tokudb_row_format_test_3, tokudb_row_format_test_4, tokudb_row_format_test_5, tokudb_row_format_test_6, tokudb_row_format_test_7, tokudb_row_format_test_8;
diff --git a/storage/tokudb/mysql-test/tokudb/r/table_index_statistics.result b/storage/tokudb/mysql-test/tokudb/r/table_index_statistics.result
new file mode 100644
index 00000000000..d3184bfb07a
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb/r/table_index_statistics.result
@@ -0,0 +1,48 @@
+SET @default_storage_engine_old = @@session.default_storage_engine;
+SET SESSION default_storage_engine = TOKUDB;
+FLUSH INDEX_STATISTICS;
+FLUSH TABLE_STATISTICS;
+SET @userstat_old= @@userstat;
+SET GLOBAL userstat=ON;
+CREATE TABLE t1 (id int(10), PRIMARY KEY (id));
+INSERT INTO t1 VALUES (1),(2),(3),(4),(5),(6),(7),(8),(9),(10);
+SELECT COUNT(*) FROM t1;
+COUNT(*)
+10
+SELECT ROWS_READ FROM INFORMATION_SCHEMA.TABLE_STATISTICS WHERE TABLE_NAME='t1';
+ROWS_READ
+10
+SELECT ROWS_READ FROM INFORMATION_SCHEMA.INDEX_STATISTICS WHERE TABLE_NAME='t1';
+ROWS_READ
+10
+FLUSH TABLE_STATISTICS;
+SELECT ROWS_READ FROM INFORMATION_SCHEMA.TABLE_STATISTICS WHERE TABLE_NAME='t1';
+ROWS_READ
+SELECT ROWS_READ FROM INFORMATION_SCHEMA.INDEX_STATISTICS WHERE TABLE_NAME='t1';
+ROWS_READ
+10
+FLUSH INDEX_STATISTICS;
+SELECT ROWS_READ FROM INFORMATION_SCHEMA.INDEX_STATISTICS WHERE TABLE_NAME='t1';
+ROWS_READ
+SELECT COUNT(*) FROM t1;
+COUNT(*)
+10
+SELECT ROWS_READ FROM INFORMATION_SCHEMA.TABLE_STATISTICS WHERE TABLE_NAME='t1';
+ROWS_READ
+10
+SELECT ROWS_READ FROM INFORMATION_SCHEMA.INDEX_STATISTICS WHERE TABLE_NAME='t1';
+ROWS_READ
+10
+DROP TABLE t1;
+CREATE TABLE t2 (c1 INT UNSIGNED);
+ALTER TABLE t2 MODIFY c1 FLOAT;
+SELECT * FROM INFORMATION_SCHEMA.TABLE_STATISTICS WHERE TABLE_NAME='t2';
+TABLE_SCHEMA	TABLE_NAME	ROWS_READ	ROWS_CHANGED	ROWS_CHANGED_X_INDEXES
+DROP TABLE t2;
+CREATE TABLE t2 (c1 INT UNSIGNED);
+ALTER TABLE t2 MODIFY c1 FLOAT;
+SELECT * FROM INFORMATION_SCHEMA.TABLE_STATISTICS WHERE TABLE_NAME='t2';
+TABLE_SCHEMA	TABLE_NAME	ROWS_READ	ROWS_CHANGED	ROWS_CHANGED_X_INDEXES
+DROP TABLE t2;
+SET GLOBAL userstat= @userstat_old;
+SET SESSION default_storage_engine = @default_storage_engine_old;
diff --git a/storage/tokudb/mysql-test/tokudb/r/type_ranges.result b/storage/tokudb/mysql-test/tokudb/r/type_ranges.result
index 07803071fe8..1c9cd769a14 100644
--- a/storage/tokudb/mysql-test/tokudb/r/type_ranges.result
+++ b/storage/tokudb/mysql-test/tokudb/r/type_ranges.result
@@ -84,10 +84,10 @@ t1	1	options	2	flags	A	NA	NULL	NULL		BTREE
 CREATE UNIQUE INDEX test on t1 ( auto ) ;
 CREATE INDEX test2 on t1 ( ulonglong,ulong) ;
 Warnings:
-Note	1831	Duplicate index 'test2' defined on the table 'test.t1'. This is deprecated and will be disallowed in a future release
+Note	1831	Duplicate index `test2`. This is deprecated and will be disallowed in a future release
 CREATE INDEX test3 on t1 ( medium ) ;
 Warnings:
-Note	1831	Duplicate index 'test3' defined on the table 'test.t1'. This is deprecated and will be disallowed in a future release
+Note	1831	Duplicate index `test3`. This is deprecated and will be disallowed in a future release
 DROP INDEX test ON t1;
 insert into t1 values (10, 1,1,1,1,1,1,1,1,1,1,1,1,1,NULL,0,0,0,1,1,1,1,'one','one');
 insert into t1 values (NULL,2,2,2,2,2,2,2,2,2,2,2,2,2,NULL,NULL,NULL,NULL,NULL,NULL,2,2,'two','two,one');
@@ -306,7 +306,7 @@ const	int(1)	NULL	NO		NULL		#
 drop table t1,t2,t3;
 create table t1 ( myfield INT NOT NULL, UNIQUE INDEX (myfield), unique (myfield), index(myfield));
 Warnings:
-Note	1831	Duplicate index 'myfield_2' defined on the table 'test.t1'. This is deprecated and will be disallowed in a future release
+Note	1831	Duplicate index `myfield_2`. This is deprecated and will be disallowed in a future release
 drop table t1;
 create table t1 ( id integer unsigned not null primary key );
 create table t2 ( id integer unsigned not null primary key );
diff --git a/storage/tokudb/mysql-test/tokudb/t/dir-per-db-with-custom-data-dir-master.opt b/storage/tokudb/mysql-test/tokudb/t/dir-per-db-with-custom-data-dir-master.opt
new file mode 100644
index 00000000000..a9090f4d115
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb/t/dir-per-db-with-custom-data-dir-master.opt
@@ -0,0 +1 @@
+--loose-tokudb_data_dir="$MYSQL_TMP_DIR" --loose-tokudb-dir-per-db=1
diff --git a/storage/tokudb/mysql-test/tokudb/t/dir-per-db-with-custom-data-dir.test b/storage/tokudb/mysql-test/tokudb/t/dir-per-db-with-custom-data-dir.test
new file mode 100644
index 00000000000..7f415a72515
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb/t/dir-per-db-with-custom-data-dir.test
@@ -0,0 +1,16 @@
+--source include/have_tokudb.inc
+# Checks table creation with tokudb_dir_per_db=1 and tokudb_data_dir set to $MYSQL_TMP_DIR (see the -master.opt file).
+SELECT @@tokudb_dir_per_db;
+# Query log is off here so the eval'd statement text, which contains the expanded $MYSQL_TMP_DIR, is not echoed.
+--disable_query_log
+--eval SELECT STRCMP(@@tokudb_data_dir, '$MYSQL_TMP_DIR') = 0 AS TOKUDB_DATA_DIR_CHANGED
+--enable_query_log
+# Create a database and a TokuDB table inside it.
+CREATE DATABASE tokudb_test;
+USE tokudb_test;
+CREATE TABLE t (a INT UNSIGNED AUTO_INCREMENT PRIMARY KEY) ENGINE=tokudb;
+# A tokudb_test entry must now exist under $MYSQL_TMP_DIR.
+--file_exists $MYSQL_TMP_DIR/tokudb_test
+# Clean up.
+DROP TABLE t;
+DROP DATABASE tokudb_test;
diff --git a/storage/tokudb/mysql-test/tokudb/t/dir_per_db.test b/storage/tokudb/mysql-test/tokudb/t/dir_per_db.test
new file mode 100644
index 00000000000..b638b706d87
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb/t/dir_per_db.test
@@ -0,0 +1,76 @@
+source include/have_tokudb.inc;
+# CREATE / RENAME / DROP a TokuDB table under tokudb_dir_per_db = 1 and 0, listing the *.tokudb files after each step.
+--let $DB= test
+--let $DATADIR= `select @@datadir`
+--let $i= 2
+# First pass: $i takes the values 1, then 0; the setting stays fixed for the whole CREATE/RENAME/DROP cycle.
+while ($i) {
+  --dec $i
+  --echo ########
+  --echo #  tokudb_dir_per_db = $i
+  --echo ########
+  --eval SET GLOBAL tokudb_dir_per_db= $i
+  --echo ########
+  --echo #  CREATE
+  --echo ########
+  CREATE TABLE t1 (a INT UNSIGNED AUTO_INCREMENT PRIMARY KEY, b INT(10) UNSIGNED NOT NULL) ENGINE=tokudb;
+  INSERT INTO t1 SET b = 10;
+  INSERT INTO t1 SET b = 20;
+  SELECT b FROM t1 ORDER BY a;
+  CREATE INDEX b ON t1 (b);
+  CREATE INDEX ab ON t1 (a,b);
+  --source dir_per_db_show_table_files.inc
+  --echo ########
+  --echo #  RENAME
+  --echo ########
+  RENAME TABLE t1 TO t2;
+  SELECT b FROM t2 ORDER BY a;
+  --source dir_per_db_show_table_files.inc
+  --echo ########
+  --echo #  DROP
+  --echo ########
+  DROP TABLE t2;
+  --source dir_per_db_show_table_files.inc
+}
+
+--echo ########
+--echo #  CREATE on tokudb_dir_per_db = 0 and RENAME on tokudb_dir_per_db = 1 and vice versa
+--echo ########
+
+--let $i= 2
+# Second pass: CREATE under $inv_i, then switch to $i before RENAME. $inv_i is the unevaluated text "(1 - $i);" (see .result).
+while ($i) {
+  --dec $i
+  --let $inv_i= (1 - $i);
+  --echo ########
+  --echo #  tokudb_dir_per_db = $inv_i
+  --echo ########
+  --eval SET GLOBAL tokudb_dir_per_db= $inv_i
+  --echo ########
+  --echo #  CREATE
+  --echo ########
+  CREATE TABLE t1 (a INT UNSIGNED AUTO_INCREMENT PRIMARY KEY, b INT(10) UNSIGNED NOT NULL) ENGINE=tokudb;
+  INSERT INTO t1 SET b = 10;
+  INSERT INTO t1 SET b = 20;
+  SELECT b FROM t1 ORDER BY a;
+  CREATE INDEX b ON t1 (b);
+  CREATE INDEX ab ON t1 (a,b);
+  --source dir_per_db_show_table_files.inc
+  --echo ########
+  --echo #  tokudb_dir_per_db = $i
+  --echo ########
+  --eval SET GLOBAL tokudb_dir_per_db= $i
+  --echo ########
+  --echo #  RENAME
+  --echo ########
+  RENAME TABLE t1 TO t2;
+  SELECT b FROM t2 ORDER BY a;
+  --source dir_per_db_show_table_files.inc
+  --echo ########
+  --echo #  DROP
+  --echo ########
+  DROP TABLE t2;
+  --source dir_per_db_show_table_files.inc
+}
+# Restore the default setting.
+SET GLOBAL tokudb_dir_per_db=default;
diff --git a/storage/tokudb/mysql-test/tokudb/t/dir_per_db_show_table_files.inc b/storage/tokudb/mysql-test/tokudb/t/dir_per_db_show_table_files.inc
new file mode 100644
index 00000000000..bdf7d5b235f
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb/t/dir_per_db_show_table_files.inc
@@ -0,0 +1,9 @@
+--sorted_result
+# Lists *.tokudb files in $DATADIR and then in $DATADIR/$DB; the including test must set both variables.
+--echo ## Looking for *.tokudb files in data_dir
+--source include/table_files_replace_pattern.inc
+--list_files $DATADIR *.tokudb
+# The replace pattern is sourced again below; presumably it only applies to the next command - confirm in mysqltest docs.
+--echo ## Looking for *.tokudb files in data_dir/$DB
+--source include/table_files_replace_pattern.inc
+--list_files $DATADIR/$DB/ *.tokudb
diff --git a/storage/tokudb/mysql-test/tokudb/t/i_s_tokudb_lock_waits_released.test b/storage/tokudb/mysql-test/tokudb/t/i_s_tokudb_lock_waits_released.test
index d8ce18b3aa7..6534175d619 100644
--- a/storage/tokudb/mysql-test/tokudb/t/i_s_tokudb_lock_waits_released.test
+++ b/storage/tokudb/mysql-test/tokudb/t/i_s_tokudb_lock_waits_released.test
@@ -17,7 +17,7 @@ create table t (id int primary key);
 
 # verify that txn_a insert (1) blocks txn_b insert (1) and txn_b gets a duplicate key error
 
-# should be empty
+--echo t should be empty
 select trx_id,trx_mysql_thread_id from information_schema.tokudb_trx;
 select * from information_schema.tokudb_locks;
 select * from information_schema.tokudb_lock_waits;
@@ -33,7 +33,7 @@ set autocommit=0;
 set tokudb_lock_timeout=600000; # set lock wait timeout to 10 minutes
 send insert into t values (1);
 
-# should find the presence of a lock on 1st transaction
+--echo should find the presence of a lock on 1st transaction
 connection default;
 let $wait_condition= select count(*)=1 from information_schema.processlist where info='insert into t values (1)' and state='update';
 source include/wait_condition.inc;
@@ -42,17 +42,17 @@ real_sleep 1; # delay a little to shorten the update -> write row -> lock wait r
 replace_column 1 TRX_ID 2 MYSQL_ID; 
 select * from information_schema.tokudb_locks;
 
-# should find the presence of a lock_wait on the 2nd transaction
+--echo should find the presence of a lock_wait on the 2nd transaction
 replace_column 1 REQUEST_TRX_ID 2 BLOCK_TRX_ID 6 LOCK_WAITS_START_TIME;
 select * from information_schema.tokudb_lock_waits;
 
-# should find the presence of two transactions
+--echo should find the presence of two transactions
 replace_column 1 TRX_ID 2 MYSQL_ID;
 select trx_id,trx_mysql_thread_id from information_schema.tokudb_trx;
 
 connection conn_a;
 commit;
-# verify that the lock on the 1st transaction is released and replaced by the lock for the 2nd transaction
+--echo verify that the lock on the 1st transaction is released and replaced by the lock for the 2nd transaction
 let $wait_condition= select count(*)=1 from information_schema.tokudb_locks where locks_dname='./test/t-main';
 source include/wait_condition.inc;
 
@@ -69,10 +69,8 @@ connection default;
 disconnect conn_a;
 disconnect conn_b;
 
-# verify that txn_a replace (1) blocks txn_b replace (1) and txn_b eventually gets the lock on (1) and completes
-
-# verify that the lock on the 2nd transaction has been released
-# should be be empty
+--echo verify that txn_a replace (1) blocks txn_b replace (1) and txn_b eventually gets the lock on (1) and completes
+--echo verify that the lock on the 2nd transaction has been released, should be be empty
 select trx_id,trx_mysql_thread_id from information_schema.tokudb_trx;
 select * from information_schema.tokudb_locks;
 select * from information_schema.tokudb_lock_waits;
@@ -88,7 +86,7 @@ set autocommit=0;
 set tokudb_lock_timeout=600000; # set lock wait timeout to 10 minutes
 send replace into t values (1);
 
-# should find the presence of a lock on 1st transaction
+--echo should find the presence of a lock on 1st transaction
 connection default;
 let $wait_condition= select count(*)=1 from information_schema.processlist where info='replace into t values (1)' and state='update';
 source include/wait_condition.inc;
@@ -97,17 +95,19 @@ real_sleep 1; # delay a little to shorten the update -> write row -> lock wait r
 replace_column 1 TRX_ID 2 MYSQL_ID; 
 select * from information_schema.tokudb_locks;
 
-# should find the presence of a lock_wait on the 2nd transaction
+--echo should find the presence of a lock_wait on the 2nd transaction
 replace_column 1 REQUEST_TRX_ID 2 BLOCK_TRX_ID 6 LOCK_WAITS_START_TIME;
 select * from information_schema.tokudb_lock_waits;
 
-# should find the presence of two transactions
+--echo should find the presence of two transactions
 replace_column 1 TRX_ID 2 MYSQL_ID;
 select trx_id,trx_mysql_thread_id from information_schema.tokudb_trx;
 
 connection conn_a;
 commit;
-# verify that the lock on the 1st transaction is released and replaced by the lock for the 2nd transaction
+--echo verify that the lock on the 1st transaction is released and replaced by the lock for the 2nd transaction
+let $wait_condition= select count(*)=1 from information_schema.tokudb_locks where locks_dname='./test/t-main';
+source include/wait_condition.inc;
 replace_column 1 TRX_ID 2 MYSQL_ID;
 select * from information_schema.tokudb_locks;
 select * from information_schema.tokudb_lock_waits;
@@ -120,8 +120,7 @@ connection default;
 disconnect conn_a;
 disconnect conn_b;
 
-# verify that the lock on the 2nd transaction has been released
-# should be be empty
+--echo verify that the lock on the 2nd transaction has been released, should be be empty
 select trx_id,trx_mysql_thread_id from information_schema.tokudb_trx;
 select * from information_schema.tokudb_locks;
 select * from information_schema.tokudb_lock_waits;
diff --git a/storage/tokudb/mysql-test/tokudb/t/row_format.test b/storage/tokudb/mysql-test/tokudb/t/row_format.test
new file mode 100644
index 00000000000..6533f8c06be
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb/t/row_format.test
@@ -0,0 +1,41 @@
+#
+# Test TokuDB compression option additions to row_format
+#
+--source include/have_tokudb.inc
+
+CREATE TABLE tokudb_row_format_test_1 (a INT) ENGINE=TokuDB ROW_FORMAT=TOKUDB_DEFAULT;
+CREATE TABLE tokudb_row_format_test_2 (a INT) ENGINE=TokuDB ROW_FORMAT=TOKUDB_FAST;
+CREATE TABLE tokudb_row_format_test_3 (a INT) ENGINE=TokuDB ROW_FORMAT=TOKUDB_SMALL;
+CREATE TABLE tokudb_row_format_test_4 (a INT) ENGINE=TokuDB ROW_FORMAT=TOKUDB_UNCOMPRESSED;
+CREATE TABLE tokudb_row_format_test_5 (a INT) ENGINE=TokuDB ROW_FORMAT=TOKUDB_ZLIB;
+CREATE TABLE tokudb_row_format_test_6 (a INT) ENGINE=TokuDB ROW_FORMAT=TOKUDB_LZMA;
+CREATE TABLE tokudb_row_format_test_7 (a INT) ENGINE=TokuDB ROW_FORMAT=TOKUDB_QUICKLZ;
+CREATE TABLE tokudb_row_format_test_8 (a INT) ENGINE=TokuDB ROW_FORMAT=TOKUDB_SNAPPY;
+
+SELECT table_name, row_format, engine FROM information_schema.tables WHERE table_name like 'tokudb_row_format_test%' ORDER BY table_name;
+
+ALTER TABLE tokudb_row_format_test_1 ENGINE=TokuDB ROW_FORMAT=TOKUDB_FAST;
+SELECT table_name, row_format, engine FROM information_schema.tables WHERE table_name = 'tokudb_row_format_test_1';
+
+ALTER TABLE tokudb_row_format_test_1 ENGINE=TokuDB ROW_FORMAT=TOKUDB_SMALL;
+SELECT table_name, row_format, engine FROM information_schema.tables WHERE table_name = 'tokudb_row_format_test_1';
+
+ALTER TABLE tokudb_row_format_test_1 ENGINE=TokuDB ROW_FORMAT=TOKUDB_UNCOMPRESSED;
+SELECT table_name, row_format, engine FROM information_schema.tables WHERE table_name = 'tokudb_row_format_test_1';
+
+ALTER TABLE tokudb_row_format_test_1 ENGINE=TokuDB ROW_FORMAT=TOKUDB_ZLIB;
+SELECT table_name, row_format, engine FROM information_schema.tables WHERE table_name = 'tokudb_row_format_test_1';
+
+ALTER TABLE tokudb_row_format_test_1 ENGINE=TokuDB ROW_FORMAT=TOKUDB_SNAPPY;
+SELECT table_name, row_format, engine FROM information_schema.tables WHERE table_name = 'tokudb_row_format_test_1';
+
+ALTER TABLE tokudb_row_format_test_1 ENGINE=TokuDB ROW_FORMAT=TOKUDB_QUICKLZ;
+SELECT table_name, row_format, engine FROM information_schema.tables WHERE table_name = 'tokudb_row_format_test_1';
+
+ALTER TABLE tokudb_row_format_test_1 ENGINE=TokuDB ROW_FORMAT=TOKUDB_LZMA;
+SELECT table_name, row_format, engine FROM information_schema.tables WHERE table_name = 'tokudb_row_format_test_1';
+
+ALTER TABLE tokudb_row_format_test_1 ENGINE=TokuDB ROW_FORMAT=TOKUDB_DEFAULT;
+SELECT table_name, row_format, engine FROM information_schema.tables WHERE table_name = 'tokudb_row_format_test_1';
+
+DROP TABLE tokudb_row_format_test_1, tokudb_row_format_test_2, tokudb_row_format_test_3, tokudb_row_format_test_4, tokudb_row_format_test_5, tokudb_row_format_test_6, tokudb_row_format_test_7, tokudb_row_format_test_8;
diff --git a/storage/tokudb/mysql-test/tokudb/t/table_index_statistics.test b/storage/tokudb/mysql-test/tokudb/t/table_index_statistics.test
new file mode 100644
index 00000000000..9a3b28aab95
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb/t/table_index_statistics.test
@@ -0,0 +1,8 @@
+--source include/have_tokudb.inc
+
+SET @default_storage_engine_old = @@session.default_storage_engine;
+SET SESSION default_storage_engine = TOKUDB;
+
+--source extra/table_index_statistics.inc
+
+SET SESSION default_storage_engine = @default_storage_engine_old;
diff --git a/storage/tokudb/mysql-test/tokudb_alter_table/r/hcad_tmp_tables_56.result b/storage/tokudb/mysql-test/tokudb_alter_table/r/hcad_tmp_tables_56.result
index 0b6a3cf1378..932a6171781 100644
--- a/storage/tokudb/mysql-test/tokudb_alter_table/r/hcad_tmp_tables_56.result
+++ b/storage/tokudb/mysql-test/tokudb_alter_table/r/hcad_tmp_tables_56.result
@@ -5,7 +5,7 @@ create temporary table bar (a int, key(a))engine=TOkuDB;
 alter table bar add column c int default 0;
 create index blah on bar(a);
 Warnings:
-Note	1831	Duplicate index 'blah' defined on the table 'test.bar'. This is deprecated and will be disallowed in a future release
+Note	1831	Duplicate index `blah`. This is deprecated and will be disallowed in a future release
 drop index a on bar;
 set session tokudb_disable_slow_alter=OFF;
 insert into bar (a) values (1),(2),(3);
diff --git a/storage/tokudb/mysql-test/tokudb_bugs/r/db938.result b/storage/tokudb/mysql-test/tokudb_bugs/r/db938.result
index b64c240b1ea..fb332155563 100644
--- a/storage/tokudb/mysql-test/tokudb_bugs/r/db938.result
+++ b/storage/tokudb/mysql-test/tokudb_bugs/r/db938.result
@@ -28,6 +28,7 @@ set DEBUG_SYNC = 'now SIGNAL done';
 connection conn1;
 connection default;
 disconnect conn1;
+set DEBUG_SYNC = 'RESET';
 drop table t1;
 set session tokudb_auto_analyze = @orig_auto_analyze;
 set session tokudb_analyze_in_background = @orig_in_background;
@@ -37,4 +38,3 @@ set session tokudb_analyze_time = @orig_time;
 set global tokudb_cardinality_scale_percent = @orig_scale_percent;
 set session default_storage_engine = @orig_default_storage_engine;
 set global tokudb_debug_pause_background_job_manager = @orig_pause_background_job_manager;
-set DEBUG_SYNC='reset';
diff --git a/storage/tokudb/mysql-test/tokudb_bugs/t/db938.test b/storage/tokudb/mysql-test/tokudb_bugs/t/db938.test
index f56f93d1492..50434a79a00 100644
--- a/storage/tokudb/mysql-test/tokudb_bugs/t/db938.test
+++ b/storage/tokudb/mysql-test/tokudb_bugs/t/db938.test
@@ -40,6 +40,7 @@ insert into t1(b,c) values(0,0), (1,1), (2,2), (3,3);
 select database_name, table_name, job_type, job_params, scheduler from information_schema.tokudb_background_job_status;
 
 # lets flip to another connection
+--source include/count_sessions.inc
 connect(conn1, localhost, root);
 
 # set up the DEBUG_SYNC point
@@ -64,6 +65,7 @@ connection conn1;
 reap;
 connection default;
 disconnect conn1;
+set DEBUG_SYNC = 'RESET';
 drop table t1;
 
 set session tokudb_auto_analyze = @orig_auto_analyze;
@@ -74,4 +76,4 @@ set session tokudb_analyze_time = @orig_time;
 set global tokudb_cardinality_scale_percent = @orig_scale_percent;
 set session default_storage_engine = @orig_default_storage_engine;
 set global tokudb_debug_pause_background_job_manager = @orig_pause_background_job_manager;
-set DEBUG_SYNC='reset';
+--source include/wait_until_count_sessions.inc
diff --git a/storage/tokudb/mysql-test/tokudb_bugs/t/frm_store.test b/storage/tokudb/mysql-test/tokudb_bugs/t/frm_store.test
index 6100d9aeec2..8b6df4966f4 100644
--- a/storage/tokudb/mysql-test/tokudb_bugs/t/frm_store.test
+++ b/storage/tokudb/mysql-test/tokudb_bugs/t/frm_store.test
@@ -12,33 +12,11 @@ let $MYSQLD_DATADIR= `SELECT @@datadir`;
 create table foo (a int, b int);
 create table bar (a int, key(a));
 
-# Write file to make mysql-test-run.pl expect the "crash", but don't start
-# it until it's told to
---write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
-wait
-EOF
-
-# Send shutdown to the connected server and give
-# it 10 seconds to die before zapping it
-shutdown_server 10;
-
+--source include/shutdown_mysqld.inc
 remove_file $MYSQLD_DATADIR/test/foo.frm;
 copy_file $MYSQLD_DATADIR/test/bar.frm $MYSQLD_DATADIR/test/foo.frm;
 remove_file $MYSQLD_DATADIR/test/bar.frm;
-
-# Write file to make mysql-test-run.pl start up the server again
---append_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
-restart
-EOF
-
-# Turn on reconnect
---enable_reconnect
-
-# Call script that will poll the server waiting for it to be back online again
---source include/wait_until_connected_again.inc
-
-# Turn off reconnect again
---disable_reconnect
+--source include/start_mysqld.inc
 
 show create table foo;
 show create table bar;
diff --git a/storage/tokudb/mysql-test/tokudb_bugs/t/frm_store2.test b/storage/tokudb/mysql-test/tokudb_bugs/t/frm_store2.test
index e1acea13ed7..53c1037b051 100644
--- a/storage/tokudb/mysql-test/tokudb_bugs/t/frm_store2.test
+++ b/storage/tokudb/mysql-test/tokudb_bugs/t/frm_store2.test
@@ -15,33 +15,11 @@ create table bar (a int);
 alter table foo drop column a;
 alter table bar add column b int, add column c int;
 
-# Write file to make mysql-test-run.pl expect the "crash", but don't start
-# it until it's told to
---write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
-wait
-EOF
-
-# Send shutdown to the connected server and give
-# it 10 seconds to die before zapping it
-shutdown_server 10;
-
+--source include/shutdown_mysqld.inc
 remove_file $MYSQLD_DATADIR/test/foo.frm;
 copy_file $MYSQLD_DATADIR/test/bar.frm $MYSQLD_DATADIR/test/foo.frm;
 remove_file $MYSQLD_DATADIR/test/bar.frm;
-
-# Write file to make mysql-test-run.pl start up the server again
---append_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
-restart
-EOF
-
-# Turn on reconnect
---enable_reconnect
-
-# Call script that will poll the server waiting for it to be back online again
---source include/wait_until_connected_again.inc
-
-# Turn off reconnect again
---disable_reconnect
+--source include/start_mysqld.inc
 
 show create table foo;
 show create table bar;
diff --git a/storage/tokudb/mysql-test/tokudb_bugs/t/frm_store3.test b/storage/tokudb/mysql-test/tokudb_bugs/t/frm_store3.test
index 17a124249da..0421b8e9d26 100644
--- a/storage/tokudb/mysql-test/tokudb_bugs/t/frm_store3.test
+++ b/storage/tokudb/mysql-test/tokudb_bugs/t/frm_store3.test
@@ -14,33 +14,11 @@ create table bar (a bigint)engine=TokuDB;
 alter table foo drop index b;
 alter table bar add index (a);
 
-# Write file to make mysql-test-run.pl expect the "crash", but don't start
-# it until it's told to
---write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
-wait
-EOF
-
-# Send shutdown to the connected server and give
-# it 10 seconds to die before zapping it
-shutdown_server 10;
-
+--source include/shutdown_mysqld.inc
 remove_file $MYSQLD_DATADIR/test/foo.frm;
 copy_file $MYSQLD_DATADIR/test/bar.frm $MYSQLD_DATADIR/test/foo.frm;
 remove_file $MYSQLD_DATADIR/test/bar.frm;
-
-# Write file to make mysql-test-run.pl start up the server again
---append_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect
-restart
-EOF
-
-# Turn on reconnect
---enable_reconnect
-
-# Call script that will poll the server waiting for it to be back online again
---source include/wait_until_connected_again.inc
-
-# Turn off reconnect again
---disable_reconnect
+--source include/start_mysqld.inc
 
 show create table foo;
 show create table bar;
diff --git a/storage/tokudb/mysql-test/tokudb_bugs/t/tokudb_drop_part_table_668.test b/storage/tokudb/mysql-test/tokudb_bugs/t/tokudb_drop_part_table_668.test
index 42dbb30058a..4c40339be5a 100644
--- a/storage/tokudb/mysql-test/tokudb_bugs/t/tokudb_drop_part_table_668.test
+++ b/storage/tokudb/mysql-test/tokudb_bugs/t/tokudb_drop_part_table_668.test
@@ -7,17 +7,7 @@ set default_storage_engine='tokudb';
 # capture the datadir
 let $MYSQLD_DATADIR= `SELECT @@datadir`;
 
-# shutdown mysqld (code stolen from mysql_plugin.test)
-let $expect_file= $MYSQLTEST_VARDIR/tmp/mysqld.1.expect;
-# MTR will remove this file later, but this might be too late.
---error 0,1
---remove_file $expect_file
---write_file $expect_file
-wait
-EOF
---shutdown_server 10
---source include/wait_until_disconnected.inc
-
+--source include/shutdown_mysqld.inc
 # remove all tokudb file in the datadir
 system mkdir $MYSQLD_DATADIR/save;
 system mv $MYSQLD_DATADIR/*toku* $MYSQLD_DATADIR/test $MYSQLD_DATADIR/save;
@@ -25,13 +15,7 @@ system mkdir $MYSQLD_DATADIR/test;
 
 # install 6.6.8 tokudb test files
 system cp -r std_data/tokudb_drop_part_table_668/data/* $MYSQLD_DATADIR;
-
-# restart mysqld
---append_file $expect_file
-restart
-EOF
---enable_reconnect
---source include/wait_until_connected_again.inc
+--source include/start_mysqld.inc
 
 create table tc (a int, b int, c int, primary key(a), key(b)) engine=tokudb partition by hash(a) partitions 2;
 
@@ -45,26 +29,9 @@ select dictionary_name from information_schema.tokudb_file_map;
 # check that the test dir is empty
 list_files $MYSQLD_DATADIR/test *.frm;
 
-# shutdown mysqld (code stolen from mysql_plugin.test)
-let $expect_file= $MYSQLTEST_VARDIR/tmp/mysqld.1.expect;
-# MTR will remove this file later, but this might be too late.
---error 0,1
---remove_file $expect_file
---write_file $expect_file
-wait
-EOF
---shutdown_server 10
---source include/wait_until_disconnected.inc
-
+--source include/shutdown_mysqld.inc
 # restore saved datadir
 system rm -rf $MYSQLD_DATADIR/*toku* $MYSQLD_DATADIR/test;
 system mv $MYSQLD_DATADIR/save/* $MYSQLD_DATADIR;
 system rmdir $MYSQLD_DATADIR/save;
-
-# restart mysqld
---append_file $expect_file
-restart
-EOF
---enable_reconnect
---source include/wait_until_connected_again.inc
-
+--source include/start_mysqld.inc
diff --git a/storage/tokudb/mysql-test/tokudb_bugs/t/tokudb_drop_simple_table_668.test b/storage/tokudb/mysql-test/tokudb_bugs/t/tokudb_drop_simple_table_668.test
index 3903c2cef9f..0340b960fa5 100644
--- a/storage/tokudb/mysql-test/tokudb_bugs/t/tokudb_drop_simple_table_668.test
+++ b/storage/tokudb/mysql-test/tokudb_bugs/t/tokudb_drop_simple_table_668.test
@@ -6,17 +6,7 @@ set default_storage_engine='tokudb';
 # capture the datadir
 let $MYSQLD_DATADIR= `SELECT @@datadir`;
 
-# shutdown mysqld (code stolen from mysql_plugin.test)
-let $expect_file= $MYSQLTEST_VARDIR/tmp/mysqld.1.expect;
-# MTR will remove this file later, but this might be too late.
---error 0,1
---remove_file $expect_file
---write_file $expect_file
-wait
-EOF
---shutdown_server 10
---source include/wait_until_disconnected.inc
-
+--source include/shutdown_mysqld.inc
 # remove all tokudb file in the datadir
 system mkdir $MYSQLD_DATADIR/save;
 system mv $MYSQLD_DATADIR/*toku* $MYSQLD_DATADIR/test $MYSQLD_DATADIR/save;
@@ -24,13 +14,7 @@ system mkdir $MYSQLD_DATADIR/test;
 
 # install 6.6.8 tokudb test files
 system cp -r std_data/tokudb_drop_simple_table_668/data/* $MYSQLD_DATADIR;
-
-# restart mysqld
---append_file $expect_file
-restart
-EOF
---enable_reconnect
---source include/wait_until_connected_again.inc
+--source include/start_mysqld.inc
 
 create table tc (id int, x int, primary key(id), key(x));
 
@@ -46,26 +30,9 @@ select dictionary_name from information_schema.tokudb_file_map;
 # check that the test dir is empty
 list_files $MYSQLD_DATADIR/test *.frm;
 
-# shutdown mysqld (code stolen from mysql_plugin.test)
-let $expect_file= $MYSQLTEST_VARDIR/tmp/mysqld.1.expect;
-# MTR will remove this file later, but this might be too late.
---error 0,1
---remove_file $expect_file
---write_file $expect_file
-wait
-EOF
---shutdown_server 10
---source include/wait_until_disconnected.inc
-
+--source include/shutdown_mysqld.inc
 # restore saved datadir
 system rm -rf $MYSQLD_DATADIR/*toku* $MYSQLD_DATADIR/test;
 system mv $MYSQLD_DATADIR/save/* $MYSQLD_DATADIR;
 system rmdir $MYSQLD_DATADIR/save;
-
-# restart mysqld
---append_file $expect_file
-restart
-EOF
---enable_reconnect
---source include/wait_until_connected_again.inc
-
+--source include/start_mysqld.inc
diff --git a/storage/tokudb/mysql-test/tokudb_parts/include/table_files_replace_pattern.inc b/storage/tokudb/mysql-test/tokudb_parts/include/table_files_replace_pattern.inc
new file mode 100644
index 00000000000..b10ad21dd95
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb_parts/include/table_files_replace_pattern.inc
@@ -0,0 +1 @@
+--replace_regex  /[a-z0-9]+_[a-z0-9]+_[a-z0-9]+(_[BP]_[a-z0-9]+){0,1}\./id./ /sqlx_[a-z0-9]+_[a-z0-9]+_/sqlx_nnnn_nnnn_/ /sqlx-[a-z0-9]+_[a-z0-9]+/sqlx-nnnn_nnnn/ /#p#/#P#/ /#sp#/#SP#/ /#tmp#/#TMP#/
diff --git a/storage/tokudb/mysql-test/tokudb_parts/t/partition_debug_sync_tokudb.test b/storage/tokudb/mysql-test/tokudb_parts/t/partition_debug_sync_tokudb.test
index be14d8814f0..f97235a0a2d 100644
--- a/storage/tokudb/mysql-test/tokudb_parts/t/partition_debug_sync_tokudb.test
+++ b/storage/tokudb/mysql-test/tokudb_parts/t/partition_debug_sync_tokudb.test
@@ -56,7 +56,7 @@ partition by range (a)
 insert into t1 values (1), (11), (21), (33);
 SELECT * FROM t1;
 SHOW CREATE TABLE t1;
---replace_result #p# #P# #sp# #SP#
+--source include/table_files_replace_pattern.inc
 --list_files $MYSQLD_DATADIR/test
 
 SET DEBUG_SYNC='before_open_in_get_all_tables SIGNAL parked WAIT_FOR open';
@@ -82,7 +82,7 @@ ALTER TABLE t1 REORGANIZE PARTITION p0 INTO
 disconnect con1;
 connection default;
 --reap
---replace_result #p# #P# #sp# #SP#
+--source include/table_files_replace_pattern.inc
 --list_files $MYSQLD_DATADIR/test
 SHOW CREATE TABLE t1;
 SELECT * FROM t1;
diff --git a/storage/tokudb/mysql-test/tokudb_rpl/disabled.def b/storage/tokudb/mysql-test/tokudb_rpl/disabled.def
new file mode 100644
index 00000000000..12758473121
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb_rpl/disabled.def
@@ -0,0 +1 @@
+rpl_tokudb_rfr_partition_table : no read-free replication yet
diff --git a/storage/tokudb/mysql-test/tokudb_rpl/include/have_tokudb.inc b/storage/tokudb/mysql-test/tokudb_rpl/include/have_tokudb.inc
new file mode 100644
index 00000000000..12b29a22d2c
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb_rpl/include/have_tokudb.inc
@@ -0,0 +1 @@
+let $datadir=`select @@datadir`;
diff --git a/storage/tokudb/mysql-test/tokudb_rpl/r/rpl_parallel_optimistic.result b/storage/tokudb/mysql-test/tokudb_rpl/r/rpl_parallel_optimistic.result
new file mode 100644
index 00000000000..8f662f3db06
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb_rpl/r/rpl_parallel_optimistic.result
@@ -0,0 +1,494 @@
+include/master-slave.inc
+[connection master]
+ALTER TABLE mysql.gtid_slave_pos ENGINE=TokuDB;
+CREATE TABLE t1 (a int PRIMARY KEY, b INT, UNIQUE KEY (b)) ENGINE=TokuDB;
+SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
+include/stop_slave.inc
+SET GLOBAL slave_parallel_threads=10;
+CHANGE MASTER TO master_use_gtid=slave_pos;
+SET @old_parallel_mode=@@GLOBAL.slave_parallel_mode;
+SET GLOBAL slave_parallel_mode='optimistic';
+INSERT INTO t1 VALUES(1,1);
+BEGIN;
+INSERT INTO t1 VALUES(2,2);
+INSERT INTO t1 VALUES(3,3);
+COMMIT;
+DELETE FROM t1 WHERE a=2;
+INSERT INTO t1 VALUES (2,2);
+DELETE FROM t1 WHERE a=2;
+INSERT INTO t1 VALUES (2,6);
+DELETE FROM t1 WHERE a=2;
+INSERT INTO t1 VALUES (2,4);
+DELETE FROM t1 WHERE a=2;
+INSERT INTO t1 VALUES (2,5);
+DELETE FROM t1 WHERE a=3;
+INSERT INTO t1 VALUES(3,3);
+DELETE FROM t1 WHERE a=1;
+INSERT INTO t1 VALUES(1,4);
+DELETE FROM t1 WHERE a=3;
+INSERT INTO t1 VALUES(3,3);
+DELETE FROM t1 WHERE a=2;
+INSERT INTO t1 VALUES (2,6);
+include/save_master_gtid.inc
+SELECT * FROM t1 ORDER BY a;
+a	b
+1	4
+2	6
+3	3
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+SELECT * FROM t1 ORDER BY a;
+a	b
+1	4
+2	6
+3	3
+*** Test a bunch of non-transactional/DDL event groups. ***
+include/stop_slave.inc
+INSERT INTO t1 VALUES (4,8);
+INSERT INTO t1 VALUES (5,9);
+CREATE TABLE t2 (a INT PRIMARY KEY) ENGINE=TokuDB;
+INSERT INTO t2 VALUES (1);
+CREATE TABLE t3 (a INT PRIMARY KEY) ENGINE=MyISAM;
+ALTER TABLE t2 ADD b INT;
+INSERT INTO t2 VALUES (2,2);
+ALTER TABLE t2 DROP b;
+INSERT INTO t2 VALUES (3);
+ALTER TABLE t2 ADD c INT;
+INSERT INTO t2 VALUES (4,5);
+INSERT INTO t2 VALUES (5,5);
+INSERT INTO t3 VALUES (1);
+UPDATE t2 SET c=NULL WHERE a=4;
+ALTER TABLE t2 ADD UNIQUE (c);
+INSERT INTO t2 VALUES (6,6);
+UPDATE t2 SET c=c+100 WHERE a=2;
+INSERT INTO t3(a) VALUES (2);
+DELETE FROM t3 WHERE a=2;
+INSERT INTO t3(a) VALUES (2);
+DELETE FROM t3 WHERE a=2;
+ALTER TABLE t3 CHANGE a c INT NOT NULL;
+INSERT INTO t3(c) VALUES (2);
+DELETE FROM t3 WHERE c=2;
+INSERT INTO t3 SELECT a+200 FROM t2;
+DELETE FROM t3 WHERE c >= 200;
+INSERT INTO t3 SELECT a+200 FROM t2;
+include/save_master_gtid.inc
+SELECT * FROM t1 ORDER BY a;
+a	b
+1	4
+2	6
+3	3
+4	8
+5	9
+SELECT * FROM t2 ORDER BY a;
+a	c
+1	NULL
+2	NULL
+3	NULL
+4	NULL
+5	5
+6	6
+SELECT * FROM t3 ORDER BY c;
+c
+1
+201
+202
+203
+204
+205
+206
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+SELECT * FROM t1 ORDER BY a;
+a	b
+1	4
+2	6
+3	3
+4	8
+5	9
+SELECT * FROM t2 ORDER BY a;
+a	c
+1	NULL
+2	NULL
+3	NULL
+4	NULL
+5	5
+6	6
+SELECT * FROM t3 ORDER BY c;
+c
+1
+201
+202
+203
+204
+205
+206
+*** Test @@skip_parallel_replication. ***
+include/stop_slave.inc
+UPDATE t1 SET b=10 WHERE a=3;
+SET SESSION skip_parallel_replication=1;
+UPDATE t1 SET b=20 WHERE a=3;
+UPDATE t1 SET b=30 WHERE a=3;
+UPDATE t1 SET b=50 WHERE a=3;
+UPDATE t1 SET b=80 WHERE a=3;
+UPDATE t1 SET b=130 WHERE a=3;
+UPDATE t1 SET b=210 WHERE a=3;
+UPDATE t1 SET b=340 WHERE a=3;
+UPDATE t1 SET b=550 WHERE a=3;
+UPDATE t1 SET b=890 WHERE a=3;
+SET SESSION skip_parallel_replication=0;
+SELECT * FROM t1 ORDER BY a;
+a	b
+1	4
+2	6
+3	890
+4	8
+5	9
+include/save_master_gtid.inc
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+SELECT * FROM t1 ORDER BY a;
+a	b
+1	4
+2	6
+3	890
+4	8
+5	9
+status
+Ok, no retry
+*** Test that we do not replicate in parallel transactions that had row lock waits on the master ***
+include/stop_slave.inc
+BEGIN;
+UPDATE t1 SET b=b+1 WHERE a=3;
+SET debug_sync='thd_report_wait_for SIGNAL waiting1';
+UPDATE t1 SET b=1001 WHERE a=3;
+SET debug_sync='now WAIT_FOR waiting1';
+BEGIN;
+UPDATE t1 SET b=1002 WHERE a=5;
+SET debug_sync='thd_report_wait_for SIGNAL waiting2';
+UPDATE t1 SET b=102 WHERE a=3;
+SET debug_sync='now WAIT_FOR waiting2';
+UPDATE t1 SET b=1000 WHERE a=1;
+SET debug_sync='thd_report_wait_for SIGNAL waiting3';
+UPDATE t1 SET b=1003 WHERE a=5;
+SET debug_sync='now WAIT_FOR waiting3';
+SET debug_sync='thd_report_wait_for SIGNAL waiting4';
+UPDATE t1 SET b=1004 WHERE a=3;
+SET debug_sync='now WAIT_FOR waiting4';
+SET debug_sync='thd_report_wait_for SIGNAL waiting5';
+UPDATE t1 SET b=1005 WHERE a=5;
+SET debug_sync='now WAIT_FOR waiting5';
+SET debug_sync='thd_report_wait_for SIGNAL waiting6';
+UPDATE t1 SET b=1006 WHERE a=1;
+SET debug_sync='now WAIT_FOR waiting6';
+SET debug_sync='thd_report_wait_for SIGNAL waiting7';
+UPDATE t1 SET b=1007 WHERE a=5;
+SET debug_sync='now WAIT_FOR waiting7';
+SET debug_sync='thd_report_wait_for SIGNAL waiting8';
+UPDATE t1 SET b=1008 WHERE a=3;
+SET debug_sync='now WAIT_FOR waiting8';
+COMMIT;
+COMMIT;
+SET debug_sync='RESET';
+include/save_master_gtid.inc
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+SELECT IF(@master_value=@slave_value, "Slave data matches master", CONCAT("ERROR: Slave had different data '", @slave_value, "' than master's '", @master_value, "'!")) as check_result;
+check_result
+Slave data matches master
+status
+Ok, no retry
+*** Test that we replicate correctly when using READ COMMITTED and binlog_format=MIXED on the slave ***
+include/stop_slave.inc
+SET @old_format= @@GLOBAL.binlog_format;
+SET GLOBAL binlog_format= MIXED;
+SET @old_isolation= @@GLOBAL.tx_isolation;
+SET GLOBAL TRANSACTION ISOLATION LEVEL READ COMMITTED;
+SET GLOBAL slave_parallel_threads=0;
+SET GLOBAL slave_parallel_threads=10;
+DROP TABLE t1, t2;
+CREATE TABLE t1 (a int PRIMARY KEY, b INT) ENGINE=TokuDB;
+CREATE TABLE t2 (a int PRIMARY KEY, b INT) ENGINE=TokuDB;
+INSERT INTO t1 VALUES (1,0), (2,0), (3,0);
+INSERT INTO t2 VALUES (1,0), (2,0);
+INSERT INTO t1 SELECT 4, COUNT(*) FROM t2;
+INSERT INTO t2 SELECT 4, COUNT(*) FROM t1;
+INSERT INTO t1 SELECT 5, COUNT(*) FROM t2;
+INSERT INTO t2 SELECT 5, COUNT(*) FROM t1;
+INSERT INTO t2 SELECT 6, COUNT(*) FROM t1;
+INSERT INTO t1 SELECT 6, COUNT(*) FROM t2;
+INSERT INTO t1 SELECT 7, COUNT(*) FROM t2;
+INSERT INTO t2 SELECT 7, COUNT(*) FROM t1;
+INSERT INTO t2 SELECT 8, COUNT(*) FROM t1;
+INSERT INTO t1 SELECT 8, COUNT(*) FROM t2;
+INSERT INTO t2 SELECT 9, COUNT(*) FROM t1;
+INSERT INTO t1 SELECT 9, COUNT(*) FROM t2;
+INSERT INTO t1 SELECT 10, COUNT(*) FROM t2;
+INSERT INTO t2 SELECT 10, COUNT(*) FROM t1;
+SELECT * FROM t1 ORDER BY a;
+a	b
+1	0
+2	0
+3	0
+4	2
+5	3
+6	5
+7	5
+8	7
+9	8
+10	8
+SELECT * FROM t2 ORDER BY a;
+a	b
+1	0
+2	0
+4	4
+5	5
+6	5
+7	7
+8	7
+9	8
+10	10
+include/save_master_gtid.inc
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+SELECT * FROM t1 ORDER BY a;
+a	b
+1	0
+2	0
+3	0
+4	2
+5	3
+6	5
+7	5
+8	7
+9	8
+10	8
+SELECT * FROM t2 ORDER BY a;
+a	b
+1	0
+2	0
+4	4
+5	5
+6	5
+7	7
+8	7
+9	8
+10	10
+include/stop_slave.inc
+SET GLOBAL binlog_format= @old_format;
+SET GLOBAL tx_isolation= @old_isolation;
+include/start_slave.inc
+*** MDEV-7888: ANALYZE TABLE does wakeup_subsequent_commits(), causing wrong binlog order and parallel replication hang ***
+DROP TABLE t1, t2, t3;
+CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=TokuDB;
+CREATE TABLE t2 (a INT PRIMARY KEY, b INT) ENGINE=TokuDB;
+CREATE TABLE t3 (a INT PRIMARY KEY, b INT) ENGINE=MyISAM;
+INSERT INTO t2 VALUES (1,1), (2,1), (3,1), (4,1), (5,1);
+include/save_master_gtid.inc
+include/sync_with_master_gtid.inc
+include/stop_slave.inc
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug= '+d,inject_analyze_table_sleep';
+ALTER TABLE t2 COMMENT "123abc";
+ANALYZE TABLE t2;
+Table	Op	Msg_type	Msg_text
+test.t2	analyze	status	OK
+INSERT INTO t1 VALUES (1,2);
+INSERT INTO t1 VALUES (2,2);
+INSERT INTO t1 VALUES (3,2);
+INSERT INTO t1 VALUES (4,2);
+INSERT INTO t3 VALUES (1,3);
+ALTER TABLE t2 COMMENT "hello, world";
+BEGIN;
+INSERT INTO t1 VALUES (5,4);
+INSERT INTO t1 VALUES (6,4);
+INSERT INTO t1 VALUES (7,4);
+INSERT INTO t1 VALUES (8,4);
+INSERT INTO t1 VALUES (9,4);
+INSERT INTO t1 VALUES (10,4);
+INSERT INTO t1 VALUES (11,4);
+INSERT INTO t1 VALUES (12,4);
+INSERT INTO t1 VALUES (13,4);
+INSERT INTO t1 VALUES (14,4);
+INSERT INTO t1 VALUES (15,4);
+INSERT INTO t1 VALUES (16,4);
+INSERT INTO t1 VALUES (17,4);
+INSERT INTO t1 VALUES (18,4);
+INSERT INTO t1 VALUES (19,4);
+INSERT INTO t1 VALUES (20,4);
+COMMIT;
+INSERT INTO t1 VALUES (21,5);
+INSERT INTO t1 VALUES (22,5);
+SELECT * FROM t1 ORDER BY a;
+a	b
+1	2
+2	2
+3	2
+4	2
+5	4
+6	4
+7	4
+8	4
+9	4
+10	4
+11	4
+12	4
+13	4
+14	4
+15	4
+16	4
+17	4
+18	4
+19	4
+20	4
+21	5
+22	5
+SELECT * FROM t2 ORDER BY a;
+a	b
+1	1
+2	1
+3	1
+4	1
+5	1
+SELECT * FROM t3 ORDER BY a;
+a	b
+1	3
+include/save_master_gtid.inc
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+SELECT * FROM t1 ORDER BY a;
+a	b
+1	2
+2	2
+3	2
+4	2
+5	4
+6	4
+7	4
+8	4
+9	4
+10	4
+11	4
+12	4
+13	4
+14	4
+15	4
+16	4
+17	4
+18	4
+19	4
+20	4
+21	5
+22	5
+SELECT * FROM t2 ORDER BY a;
+a	b
+1	1
+2	1
+3	1
+4	1
+5	1
+SELECT * FROM t3 ORDER BY a;
+a	b
+1	3
+include/stop_slave.inc
+SET GLOBAL debug_dbug= @old_debug;
+include/start_slave.inc
+*** MDEV-7929: record_gtid() for non-transactional event group calls wakeup_subsequent_commits() too early, causing slave hang. ***
+include/stop_slave.inc
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug= '+d,inject_record_gtid_serverid_100_sleep';
+ALTER TABLE t3 COMMENT "DDL statement 1";
+INSERT INTO t1 VALUES (30,0);
+INSERT INTO t1 VALUES (31,0);
+INSERT INTO t1 VALUES (32,0);
+INSERT INTO t1 VALUES (33,0);
+INSERT INTO t1 VALUES (34,0);
+INSERT INTO t1 VALUES (35,0);
+INSERT INTO t1 VALUES (36,0);
+SET @old_server_id= @@SESSION.server_id;
+SET SESSION server_id= 100;
+ANALYZE TABLE t2;
+Table	Op	Msg_type	Msg_text
+test.t2	analyze	status	OK
+SET SESSION server_id= @old_server_id;
+INSERT INTO t1 VALUES (37,0);
+ALTER TABLE t3 COMMENT "DDL statement 2";
+INSERT INTO t1 VALUES (38,0);
+INSERT INTO t1 VALUES (39,0);
+ALTER TABLE t3 COMMENT "DDL statement 3";
+SELECT * FROM t1 WHERE a >= 30 ORDER BY a;
+a	b
+30	0
+31	0
+32	0
+33	0
+34	0
+35	0
+36	0
+37	0
+38	0
+39	0
+include/save_master_gtid.inc
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+SELECT * FROM t1 WHERE a >= 30 ORDER BY a;
+a	b
+30	0
+31	0
+32	0
+33	0
+34	0
+35	0
+36	0
+37	0
+38	0
+39	0
+include/stop_slave.inc
+SET GLOBAL debug_dbug= @old_debug;
+include/start_slave.inc
+*** MDEV-8113: ALTER TABLE causes slave hang in optimistic parallel replication ***
+include/stop_slave.inc
+ALTER TABLE t2 ADD c INT;
+INSERT INTO t2 (a,b) VALUES (50, 0);
+INSERT INTO t2 (a,b) VALUES (51, 1);
+INSERT INTO t2 (a,b) VALUES (52, 2);
+INSERT INTO t2 (a,b) VALUES (53, 3);
+INSERT INTO t2 (a,b) VALUES (54, 4);
+INSERT INTO t2 (a,b) VALUES (55, 5);
+INSERT INTO t2 (a,b) VALUES (56, 6);
+INSERT INTO t2 (a,b) VALUES (57, 7);
+INSERT INTO t2 (a,b) VALUES (58, 8);
+INSERT INTO t2 (a,b) VALUES (59, 9);
+ALTER TABLE t2 DROP COLUMN c;
+SELECT * FROM t2 WHERE a >= 50 ORDER BY a;
+a	b
+50	0
+51	1
+52	2
+53	3
+54	4
+55	5
+56	6
+57	7
+58	8
+59	9
+include/save_master_gtid.inc
+include/start_slave.inc
+include/sync_with_master_gtid.inc
+SELECT * FROM t2 WHERE a >= 50 ORDER BY a;
+a	b
+50	0
+51	1
+52	2
+53	3
+54	4
+55	5
+56	6
+57	7
+58	8
+59	9
+include/stop_slave.inc
+SET GLOBAL slave_parallel_mode=@old_parallel_mode;
+SET GLOBAL slave_parallel_threads=@old_parallel_threads;
+include/start_slave.inc
+DROP TABLE t1, t2, t3;
+include/rpl_end.inc
diff --git a/storage/tokudb/mysql-test/tokudb_rpl/r/rpl_rfr_disable_on_expl_pk_absence.result b/storage/tokudb/mysql-test/tokudb_rpl/r/rpl_rfr_disable_on_expl_pk_absence.result
index 981a833aea5..2977dc859f5 100644
--- a/storage/tokudb/mysql-test/tokudb_rpl/r/rpl_rfr_disable_on_expl_pk_absence.result
+++ b/storage/tokudb/mysql-test/tokudb_rpl/r/rpl_rfr_disable_on_expl_pk_absence.result
@@ -1,7 +1,4 @@
 include/master-slave.inc
-Warnings:
-Note	####	Sending passwords in plain text without SSL/TLS is extremely insecure.
-Note	####	Storing MySQL user name or password information in the master info repository is not secure and is therefore not recommended. Please consider using the USER and PASSWORD connection options for START SLAVE; see the 'START SLAVE Syntax' in the MySQL Manual for more information.
 [connection master]
 call mtr.add_suppression("read free replication is disabled for tokudb table");
 CREATE TABLE t (a int(11), b char(20)) ENGINE = TokuDB;
diff --git a/storage/tokudb/mysql-test/tokudb_rpl/r/rpl_tokudb_rfr_partition_table.result b/storage/tokudb/mysql-test/tokudb_rpl/r/rpl_tokudb_rfr_partition_table.result
new file mode 100644
index 00000000000..4594959c6d0
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb_rpl/r/rpl_tokudb_rfr_partition_table.result
@@ -0,0 +1,36 @@
+include/master-slave.inc
+Warnings:
+Note	####	Sending passwords in plain text without SSL/TLS is extremely insecure.
+Note	####	Storing MySQL user name or password information in the master info repository is not secure and is therefore not recommended. Please consider using the USER and PASSWORD connection options for START SLAVE; see the 'START SLAVE Syntax' in the MySQL Manual for more information.
+[connection master]
+call mtr.add_suppression(".*read free replication is disabled for TokuDB table.*continue with rows lookup");
+CREATE TABLE t1 (id int(11) NOT NULL, pid int(11), PRIMARY KEY (id)) ENGINE=TokuDB
+PARTITION BY RANGE (id)
+(PARTITION p_1 VALUES LESS THAN (10) ENGINE = TokuDB,
+PARTITION p_2 VALUES LESS THAN (20) ENGINE = TokuDB,
+PARTITION p_all VALUES LESS THAN MAXVALUE ENGINE = TokuDB);
+insert into t1 values (1, 1), (2, 2), (3, 3), (11, 11), (12, 12), (13, 13);
+CREATE TABLE t2 (id int(11) NOT NULL, pid int(11), key idx_1(id)) ENGINE=TokuDB
+PARTITION BY RANGE (id)
+(PARTITION p_1 VALUES LESS THAN (10) ENGINE = TokuDB,
+PARTITION p_2 VALUES LESS THAN (20) ENGINE = TokuDB,
+PARTITION p_all VALUES LESS THAN MAXVALUE ENGINE = TokuDB);
+insert into t2 values (1, 1), (2, 2), (3, 3), (11, 11), (12, 12), (13, 13);
+include/stop_slave.inc
+set global debug= "+d,tokudb_crash_if_rpl_looks_up_row,tokudb_crash_if_rpl_does_uniqueness_check";
+include/start_slave.inc
+insert into t1 values(21, 21);
+delete from t1 where id = 11;
+update t1 set pid = 2 where id = 1;
+include/diff_tables.inc [master:test.t1, slave:test.t1]
+insert into t2 values(21, 21);
+delete from t2 where id = 11;
+update t2 set pid = 2 where id = 1;
+include/diff_tables.inc [master:test.t2, slave:test.t2]
+drop table t1;
+drop table t2;
+include/stop_slave.inc
+set global debug= "-d,tokudb_crash_if_rpl_looks_up_row,tokudb_crash_if_rpl_does_uniqueness_check";
+set global debug= @saved_debug;
+include/start_slave.inc
+include/rpl_end.inc
diff --git a/storage/tokudb/mysql-test/tokudb_rpl/suite.opt b/storage/tokudb/mysql-test/tokudb_rpl/suite.opt
new file mode 100644
index 00000000000..f94d0f6d6dc
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb_rpl/suite.opt
@@ -0,0 +1 @@
+--tokudb --plugin-load-add=$HA_TOKUDB_SO --loose-tokudb-check-jemalloc=0 --sql-mode=NO_ENGINE_SUBSTITUTION
diff --git a/storage/tokudb/mysql-test/tokudb_rpl/t/rpl_parallel_optimistic.test b/storage/tokudb/mysql-test/tokudb_rpl/t/rpl_parallel_optimistic.test
new file mode 100644
index 00000000000..c7c7de9d902
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb_rpl/t/rpl_parallel_optimistic.test
@@ -0,0 +1,478 @@
+--source include/have_tokudb.inc
+--source include/have_debug.inc
+--source include/have_debug_sync.inc
+--source include/master-slave.inc
+
+--connection master
+ALTER TABLE mysql.gtid_slave_pos ENGINE=TokuDB;
+CREATE TABLE t1 (a int PRIMARY KEY, b INT, UNIQUE KEY (b)) ENGINE=TokuDB;
+--save_master_pos
+
+--connection slave
+--sync_with_master
+SET @old_parallel_threads=@@GLOBAL.slave_parallel_threads;
+--source include/stop_slave.inc
+SET GLOBAL slave_parallel_threads=10;
+CHANGE MASTER TO master_use_gtid=slave_pos;
+SET @old_parallel_mode=@@GLOBAL.slave_parallel_mode;
+SET GLOBAL slave_parallel_mode='optimistic';
+
+
+--connection master
+
+INSERT INTO t1 VALUES(1,1);
+BEGIN;
+INSERT INTO t1 VALUES(2,2);
+INSERT INTO t1 VALUES(3,3);
+COMMIT;
+
+# Do a bunch of INSERT/DELETE on the same rows, bound to conflict.
+# We will get a lot of rollbacks, probably, but they should be handled without
+# any visible errors.
+
+DELETE FROM t1 WHERE a=2;
+INSERT INTO t1 VALUES (2,2);
+DELETE FROM t1 WHERE a=2;
+INSERT INTO t1 VALUES (2,6);
+DELETE FROM t1 WHERE a=2;
+INSERT INTO t1 VALUES (2,4);
+DELETE FROM t1 WHERE a=2;
+INSERT INTO t1 VALUES (2,5);
+
+DELETE FROM t1 WHERE a=3;
+INSERT INTO t1 VALUES(3,3);
+DELETE FROM t1 WHERE a=1;
+INSERT INTO t1 VALUES(1,4);
+DELETE FROM t1 WHERE a=3;
+INSERT INTO t1 VALUES(3,3);
+
+DELETE FROM t1 WHERE a=2;
+INSERT INTO t1 VALUES (2,6);
+--source include/save_master_gtid.inc
+SELECT * FROM t1 ORDER BY a;
+
+--connection slave
+--source include/start_slave.inc
+--source include/sync_with_master_gtid.inc
+SELECT * FROM t1 ORDER BY a;
+#SHOW STATUS LIKE 'Slave_retried_transactions';
+
+
+--echo *** Test a bunch of non-transactional/DDL event groups. ***
+
+--connection slave
+--source include/stop_slave.inc
+
+--connection master
+
+INSERT INTO t1 VALUES (4,8);
+INSERT INTO t1 VALUES (5,9);
+CREATE TABLE t2 (a INT PRIMARY KEY) ENGINE=TokuDB;
+INSERT INTO t2 VALUES (1);
+CREATE TABLE t3 (a INT PRIMARY KEY) ENGINE=MyISAM;
+ALTER TABLE t2 ADD b INT;
+INSERT INTO t2 VALUES (2,2);
+ALTER TABLE t2 DROP b;
+INSERT INTO t2 VALUES (3);
+ALTER TABLE t2 ADD c INT;
+INSERT INTO t2 VALUES (4,5);
+INSERT INTO t2 VALUES (5,5);
+INSERT INTO t3 VALUES (1);
+UPDATE t2 SET c=NULL WHERE a=4;
+ALTER TABLE t2 ADD UNIQUE (c);
+INSERT INTO t2 VALUES (6,6);
+UPDATE t2 SET c=c+100 WHERE a=2;
+INSERT INTO t3(a) VALUES (2);
+DELETE FROM t3 WHERE a=2;
+INSERT INTO t3(a) VALUES (2);
+DELETE FROM t3 WHERE a=2;
+ALTER TABLE t3 CHANGE a c INT NOT NULL;
+INSERT INTO t3(c) VALUES (2);
+DELETE FROM t3 WHERE c=2;
+INSERT INTO t3 SELECT a+200 FROM t2;
+DELETE FROM t3 WHERE c >= 200;
+INSERT INTO t3 SELECT a+200 FROM t2;
+--source include/save_master_gtid.inc
+SELECT * FROM t1 ORDER BY a;
+SELECT * FROM t2 ORDER BY a;
+SELECT * FROM t3 ORDER BY c;
+
+--connection slave
+--source include/start_slave.inc
+--source include/sync_with_master_gtid.inc
+SELECT * FROM t1 ORDER BY a;
+SELECT * FROM t2 ORDER BY a;
+SELECT * FROM t3 ORDER BY c;
+#SHOW STATUS LIKE 'Slave_retried_transactions';
+
+
+--echo *** Test @@skip_parallel_replication. ***
+
+--connection slave
+--source include/stop_slave.inc
+--let $retry1= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1)
+
+--connection master
+# We do a bunch of conflicting transactions on the master with
+# skip_parallel_replication set to true, and check that we do not
+# get any retries on the slave.
+
+UPDATE t1 SET b=10 WHERE a=3;
+SET SESSION skip_parallel_replication=1;
+UPDATE t1 SET b=20 WHERE a=3;
+UPDATE t1 SET b=30 WHERE a=3;
+UPDATE t1 SET b=50 WHERE a=3;
+UPDATE t1 SET b=80 WHERE a=3;
+UPDATE t1 SET b=130 WHERE a=3;
+UPDATE t1 SET b=210 WHERE a=3;
+UPDATE t1 SET b=340 WHERE a=3;
+UPDATE t1 SET b=550 WHERE a=3;
+UPDATE t1 SET b=890 WHERE a=3;
+SET SESSION skip_parallel_replication=0;
+SELECT * FROM t1 ORDER BY a;
+--source include/save_master_gtid.inc
+
+--connection slave
+--source include/start_slave.inc
+--source include/sync_with_master_gtid.inc
+SELECT * FROM t1 ORDER BY a;
+--let $retry2= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1)
+--disable_query_log
+eval SELECT IF($retry1=$retry2, "Ok, no retry",
+       CONCAT("ERROR: ", $retry2-$retry1, " retries during replication (was ",
+              $retry1, " now ", $retry2, ")")) AS status;
+--enable_query_log
+
+
+--echo *** Test that we do not replicate in parallel transactions that had row lock waits on the master ***
+
+--connection slave
+--source include/stop_slave.inc
+--let $retry1= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1)
+
+--connection master
+# Set up a bunch of transactions that all needed to wait.
+--connect (m1,127.0.0.1,root,,test,$SERVER_MYPORT_1,)
+--connect (m2,127.0.0.1,root,,test,$SERVER_MYPORT_1,)
+--connect (m3,127.0.0.1,root,,test,$SERVER_MYPORT_1,)
+--connect (m4,127.0.0.1,root,,test,$SERVER_MYPORT_1,)
+--connect (m5,127.0.0.1,root,,test,$SERVER_MYPORT_1,)
+--connect (m6,127.0.0.1,root,,test,$SERVER_MYPORT_1,)
+--connect (m7,127.0.0.1,root,,test,$SERVER_MYPORT_1,)
+--connect (m8,127.0.0.1,root,,test,$SERVER_MYPORT_1,)
+
+--connection default
+BEGIN; UPDATE t1 SET b=b+1 WHERE a=3;
+
+--connection m1
+SET debug_sync='thd_report_wait_for SIGNAL waiting1';
+send UPDATE t1 SET b=1001 WHERE a=3;
+--connection default
+SET debug_sync='now WAIT_FOR waiting1';
+
+--connection m2
+BEGIN;
+UPDATE t1 SET b=1002 WHERE a=5;
+SET debug_sync='thd_report_wait_for SIGNAL waiting2';
+send UPDATE t1 SET b=102 WHERE a=3;
+--connection default
+SET debug_sync='now WAIT_FOR waiting2';
+
+UPDATE t1 SET b=1000 WHERE a=1;
+--connection m3
+SET debug_sync='thd_report_wait_for SIGNAL waiting3';
+send UPDATE t1 SET b=1003 WHERE a=5;
+--connection default
+SET debug_sync='now WAIT_FOR waiting3';
+
+--connection m4
+SET debug_sync='thd_report_wait_for SIGNAL waiting4';
+send UPDATE t1 SET b=1004 WHERE a=3;
+--connection default
+SET debug_sync='now WAIT_FOR waiting4';
+
+--connection m5
+SET debug_sync='thd_report_wait_for SIGNAL waiting5';
+send UPDATE t1 SET b=1005 WHERE a=5;
+--connection default
+SET debug_sync='now WAIT_FOR waiting5';
+
+--connection m6
+SET debug_sync='thd_report_wait_for SIGNAL waiting6';
+send UPDATE t1 SET b=1006 WHERE a=1;
+--connection default
+SET debug_sync='now WAIT_FOR waiting6';
+
+--connection m7
+SET debug_sync='thd_report_wait_for SIGNAL waiting7';
+send UPDATE t1 SET b=1007 WHERE a=5;
+--connection default
+SET debug_sync='now WAIT_FOR waiting7';
+
+--connection m8
+SET debug_sync='thd_report_wait_for SIGNAL waiting8';
+send UPDATE t1 SET b=1008 WHERE a=3;
+--connection default
+SET debug_sync='now WAIT_FOR waiting8';
+
+--connection default
+COMMIT;
+--connection m1
+REAP;
+--connection m2
+REAP;
+COMMIT;
+--connection m3
+REAP;
+--connection m4
+REAP;
+--connection m5
+REAP;
+--connection m6
+REAP;
+--connection m7
+REAP;
+--connection m8
+REAP;
+--connection default
+SET debug_sync='RESET';
+
+--source include/save_master_gtid.inc
+# It is not deterministic in which order the parallel conflicting
+# updates will run. E.g. for key a=5, we could get 1003, 1005, or
+# 1007. As long as we get the same on the slave, it is ok.
+--let $master_value= `SELECT GROUP_CONCAT(CONCAT("(", a, ",", b, ")") ORDER BY a, b SEPARATOR "/") FROM t1`
+
+--connection slave
+--source include/start_slave.inc
+--source include/sync_with_master_gtid.inc
+--let $slave_value= `SELECT GROUP_CONCAT(CONCAT("(", a, ",", b, ")") ORDER BY a, b SEPARATOR "/") FROM t1`
+--disable_query_log
+eval SET @master_value="$master_value";
+eval SET @slave_value="$slave_value";
+--enable_query_log
+SELECT IF(@master_value=@slave_value, "Slave data matches master", CONCAT("ERROR: Slave had different data '", @slave_value, "' than master's '", @master_value, "'!")) as check_result;
+--let $retry2= query_get_value(SHOW STATUS LIKE 'Slave_retried_transactions', Value, 1)
+--disable_query_log
+eval SELECT IF($retry1=$retry2, "Ok, no retry",
+       CONCAT("ERROR: ", $retry2-$retry1, " retries during replication (was ",
+              $retry1, " now ", $retry2, ")")) AS status;
+--enable_query_log
+
+
+--echo *** Test that we replicate correctly when using READ COMMITTED and binlog_format=MIXED on the slave ***
+
+--connection slave
+--source include/stop_slave.inc
+SET @old_format= @@GLOBAL.binlog_format;
+# Use MIXED format; we cannot binlog ROW events on slave in STATEMENT format.
+SET GLOBAL binlog_format= MIXED;
+SET @old_isolation= @@GLOBAL.tx_isolation;
+SET GLOBAL TRANSACTION ISOLATION LEVEL READ COMMITTED;
+# Reset the worker threads to make the new settings take effect.
+SET GLOBAL slave_parallel_threads=0;
+SET GLOBAL slave_parallel_threads=10;
+
+--connection master
+DROP TABLE t1, t2;
+CREATE TABLE t1 (a int PRIMARY KEY, b INT) ENGINE=TokuDB;
+CREATE TABLE t2 (a int PRIMARY KEY, b INT) ENGINE=TokuDB;
+INSERT INTO t1 VALUES (1,0), (2,0), (3,0);
+INSERT INTO t2 VALUES (1,0), (2,0);
+INSERT INTO t1 SELECT 4, COUNT(*) FROM t2;
+INSERT INTO t2 SELECT 4, COUNT(*) FROM t1;
+
+INSERT INTO t1 SELECT 5, COUNT(*) FROM t2;
+INSERT INTO t2 SELECT 5, COUNT(*) FROM t1;
+
+INSERT INTO t2 SELECT 6, COUNT(*) FROM t1;
+INSERT INTO t1 SELECT 6, COUNT(*) FROM t2;
+
+INSERT INTO t1 SELECT 7, COUNT(*) FROM t2;
+INSERT INTO t2 SELECT 7, COUNT(*) FROM t1;
+
+INSERT INTO t2 SELECT 8, COUNT(*) FROM t1;
+INSERT INTO t1 SELECT 8, COUNT(*) FROM t2;
+
+INSERT INTO t2 SELECT 9, COUNT(*) FROM t1;
+INSERT INTO t1 SELECT 9, COUNT(*) FROM t2;
+
+INSERT INTO t1 SELECT 10, COUNT(*) FROM t2;
+INSERT INTO t2 SELECT 10, COUNT(*) FROM t1;
+
+SELECT * FROM t1 ORDER BY a;
+SELECT * FROM t2 ORDER BY a;
+--source include/save_master_gtid.inc
+
+--connection slave
+--source include/start_slave.inc
+--source include/sync_with_master_gtid.inc
+SELECT * FROM t1 ORDER BY a;
+SELECT * FROM t2 ORDER BY a;
+
+--source include/stop_slave.inc
+SET GLOBAL binlog_format= @old_format;
+SET GLOBAL tx_isolation= @old_isolation;
+--source include/start_slave.inc
+
+
+--echo *** MDEV-7888: ANALYZE TABLE does wakeup_subsequent_commits(), causing wrong binlog order and parallel replication hang ***
+
+--connection master
+DROP TABLE t1, t2, t3;
+CREATE TABLE t1 (a INT PRIMARY KEY, b INT) ENGINE=TokuDB;
+CREATE TABLE t2 (a INT PRIMARY KEY, b INT) ENGINE=TokuDB;
+CREATE TABLE t3 (a INT PRIMARY KEY, b INT) ENGINE=MyISAM;
+INSERT INTO t2 VALUES (1,1), (2,1), (3,1), (4,1), (5,1);
+--source include/save_master_gtid.inc
+
+--connection slave
+--source include/sync_with_master_gtid.inc
+--source include/stop_slave.inc
+SET @old_dbug= @@GLOBAL.debug_dbug;
+SET GLOBAL debug_dbug= '+d,inject_analyze_table_sleep';
+
+--connection master
+# The bug was that ANALYZE TABLE would call
+# wakeup_subsequent_commits() too early, allowing the following
+# transaction in the same group to run ahead and binlog and free the
+# GCO. Then we get wrong binlog order and later access freed GCO,
+# which causes lost wakeup of following GCO and thus replication hang.
+# We injected a small sleep in ANALYZE to make the race easier to hit (this
+# can only cause false negatives in versions with the bug, not false positives,
+# so a sleep is ok here; in general it is not possible to reliably trigger
+# the race with debug_sync, since the bugfix makes the race impossible).
+
+ALTER TABLE t2 COMMENT "123abc";
+ANALYZE TABLE t2;
+INSERT INTO t1 VALUES (1,2);
+INSERT INTO t1 VALUES (2,2);
+INSERT INTO t1 VALUES (3,2);
+INSERT INTO t1 VALUES (4,2);
+INSERT INTO t3 VALUES (1,3);
+ALTER TABLE t2 COMMENT "hello, world";
+BEGIN;
+INSERT INTO t1 VALUES (5,4);
+INSERT INTO t1 VALUES (6,4);
+INSERT INTO t1 VALUES (7,4);
+INSERT INTO t1 VALUES (8,4);
+INSERT INTO t1 VALUES (9,4);
+INSERT INTO t1 VALUES (10,4);
+INSERT INTO t1 VALUES (11,4);
+INSERT INTO t1 VALUES (12,4);
+INSERT INTO t1 VALUES (13,4);
+INSERT INTO t1 VALUES (14,4);
+INSERT INTO t1 VALUES (15,4);
+INSERT INTO t1 VALUES (16,4);
+INSERT INTO t1 VALUES (17,4);
+INSERT INTO t1 VALUES (18,4);
+INSERT INTO t1 VALUES (19,4);
+INSERT INTO t1 VALUES (20,4);
+COMMIT;
+INSERT INTO t1 VALUES (21,5);
+INSERT INTO t1 VALUES (22,5);
+
+SELECT * FROM t1 ORDER BY a;
+SELECT * FROM t2 ORDER BY a;
+SELECT * FROM t3 ORDER BY a;
+--source include/save_master_gtid.inc
+
+--connection slave
+--source include/start_slave.inc
+--source include/sync_with_master_gtid.inc
+
+SELECT * FROM t1 ORDER BY a;
+SELECT * FROM t2 ORDER BY a;
+SELECT * FROM t3 ORDER BY a;
+
+--source include/stop_slave.inc
+SET GLOBAL debug_dbug= @old_debug;
+--source include/start_slave.inc
+
+--echo *** MDEV-7929: record_gtid() for non-transactional event group calls wakeup_subsequent_commits() too early, causing slave hang. ***
+
+--connection slave
+--source include/stop_slave.inc
+SET @old_dbug= @@GLOBAL.debug_dbug;
+# The bug was that record_gtid(), when there is no existing transaction from
+# a DML event being replicated, would commit its own transaction. This wrongly
+# caused wakeup_subsequent_commits(), with similar consequences as MDEV-7888
+# above. We simulate this condition with a small sleep in record_gtid() for
+# a specific ANALYZE that we binlog with server id 100.
+SET GLOBAL debug_dbug= '+d,inject_record_gtid_serverid_100_sleep';
+
+--connection master
+
+ALTER TABLE t3 COMMENT "DDL statement 1";
+INSERT INTO t1 VALUES (30,0);
+INSERT INTO t1 VALUES (31,0);
+INSERT INTO t1 VALUES (32,0);
+INSERT INTO t1 VALUES (33,0);
+INSERT INTO t1 VALUES (34,0);
+INSERT INTO t1 VALUES (35,0);
+INSERT INTO t1 VALUES (36,0);
+SET @old_server_id= @@SESSION.server_id;
+SET SESSION server_id= 100;
+ANALYZE TABLE t2;
+SET SESSION server_id= @old_server_id;
+INSERT INTO t1 VALUES (37,0);
+ALTER TABLE t3 COMMENT "DDL statement 2";
+INSERT INTO t1 VALUES (38,0);
+INSERT INTO t1 VALUES (39,0);
+ALTER TABLE t3 COMMENT "DDL statement 3";
+
+SELECT * FROM t1 WHERE a >= 30 ORDER BY a;
+
+--source include/save_master_gtid.inc
+
+
+--connection slave
+--source include/start_slave.inc
+--source include/sync_with_master_gtid.inc
+SELECT * FROM t1 WHERE a >= 30 ORDER BY a;
+
+
+--source include/stop_slave.inc
+SET GLOBAL debug_dbug= @old_debug;
+--source include/start_slave.inc
+
+
+--echo *** MDEV-8113: ALTER TABLE causes slave hang in optimistic parallel replication ***
+
+--connection slave
+--source include/stop_slave.inc
+
+--connection master
+ALTER TABLE t2 ADD c INT;
+INSERT INTO t2 (a,b) VALUES (50, 0);
+INSERT INTO t2 (a,b) VALUES (51, 1);
+INSERT INTO t2 (a,b) VALUES (52, 2);
+INSERT INTO t2 (a,b) VALUES (53, 3);
+INSERT INTO t2 (a,b) VALUES (54, 4);
+INSERT INTO t2 (a,b) VALUES (55, 5);
+INSERT INTO t2 (a,b) VALUES (56, 6);
+INSERT INTO t2 (a,b) VALUES (57, 7);
+INSERT INTO t2 (a,b) VALUES (58, 8);
+INSERT INTO t2 (a,b) VALUES (59, 9);
+ALTER TABLE t2 DROP COLUMN c;
+SELECT * FROM t2 WHERE a >= 50 ORDER BY a;
+--source include/save_master_gtid.inc
+
+--connection slave
+--source include/start_slave.inc
+--source include/sync_with_master_gtid.inc
+SELECT * FROM t2 WHERE a >= 50 ORDER BY a;
+
+
+# Clean up.
+
+--connection slave
+--source include/stop_slave.inc
+SET GLOBAL slave_parallel_mode=@old_parallel_mode;
+SET GLOBAL slave_parallel_threads=@old_parallel_threads;
+--source include/start_slave.inc
+
+--connection master
+DROP TABLE t1, t2, t3;
+
+--source include/rpl_end.inc
diff --git a/storage/tokudb/mysql-test/tokudb_rpl/t/rpl_tokudb_rfr_partition_table-slave.opt b/storage/tokudb/mysql-test/tokudb_rpl/t/rpl_tokudb_rfr_partition_table-slave.opt
new file mode 100644
index 00000000000..2a7ec2590cc
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb_rpl/t/rpl_tokudb_rfr_partition_table-slave.opt
@@ -0,0 +1 @@
+--read-only=ON --loose-tokudb-rpl-unique-checks=OFF  --loose-tokudb-rpl-lookup-rows=OFF
diff --git a/storage/tokudb/mysql-test/tokudb_rpl/t/rpl_tokudb_rfr_partition_table.test b/storage/tokudb/mysql-test/tokudb_rpl/t/rpl_tokudb_rfr_partition_table.test
new file mode 100644
index 00000000000..eadeda83cd5
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb_rpl/t/rpl_tokudb_rfr_partition_table.test
@@ -0,0 +1,75 @@
+# test tokudb read free replication feature with partition table
+
+--source include/have_debug.inc
+--source include/have_partition.inc
+--source include/have_tokudb.inc
+--source include/have_binlog_format_row.inc
+--source include/master-slave.inc
+
+call mtr.add_suppression(".*read free replication is disabled for TokuDB table.*continue with rows lookup");
+
+connection master;
+
+# partition table with explicit PK
+CREATE TABLE t1 (id int(11) NOT NULL, pid int(11), PRIMARY KEY (id)) ENGINE=TokuDB
+PARTITION BY RANGE (id)
+(PARTITION p_1 VALUES LESS THAN (10) ENGINE = TokuDB,
+ PARTITION p_2 VALUES LESS THAN (20) ENGINE = TokuDB,
+ PARTITION p_all VALUES LESS THAN MAXVALUE ENGINE = TokuDB);
+
+insert into t1 values (1, 1), (2, 2), (3, 3), (11, 11), (12, 12), (13, 13);
+
+# partition table without explicit PK
+CREATE TABLE t2 (id int(11) NOT NULL, pid int(11), key idx_1(id)) ENGINE=TokuDB
+PARTITION BY RANGE (id)
+(PARTITION p_1 VALUES LESS THAN (10) ENGINE = TokuDB,
+ PARTITION p_2 VALUES LESS THAN (20) ENGINE = TokuDB,
+ PARTITION p_all VALUES LESS THAN MAXVALUE ENGINE = TokuDB);
+
+insert into t2 values (1, 1), (2, 2), (3, 3), (11, 11), (12, 12), (13, 13);
+
+--sync_slave_with_master
+
+# set tokudb rfr crash/assert conditions if we enter lookup code
+# to make sure no unique checks or row lookups are invoked
+connection slave;
+--source include/stop_slave.inc
+let $saved_debug = `select @@debug`;
+set global debug= "+d,tokudb_crash_if_rpl_looks_up_row,tokudb_crash_if_rpl_does_uniqueness_check";
+--source include/start_slave.inc
+
+connection master;
+insert into t1 values(21, 21);
+delete from t1 where id = 11;
+update t1 set pid = 2 where id = 1;
+
+sync_slave_with_master;
+
+connection master;
+
+--let $diff_tables= master:test.t1, slave:test.t1
+--source include/diff_tables.inc
+
+# print rfr disabled warning in errlog
+connection master;
+insert into t2 values(21, 21);
+delete from t2 where id = 11;
+update t2 set pid = 2 where id = 1;
+
+sync_slave_with_master;
+
+--let $diff_tables= master:test.t2, slave:test.t2
+--source include/diff_tables.inc
+
+connection master;
+drop table t1;
+drop table t2;
+sync_slave_with_master;
+
+connection slave;
+--source include/stop_slave.inc
+set global debug= "-d,tokudb_crash_if_rpl_looks_up_row,tokudb_crash_if_rpl_does_uniqueness_check";
+set global debug= @saved_debug;
+--source include/start_slave.inc
+
+--source include/rpl_end.inc
diff --git a/storage/tokudb/mysql-test/tokudb_sys_vars/include/have_tokudb.inc b/storage/tokudb/mysql-test/tokudb_sys_vars/include/have_tokudb.inc
new file mode 100644
index 00000000000..12b29a22d2c
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb_sys_vars/include/have_tokudb.inc
@@ -0,0 +1 @@
+let $datadir=`select @@datadir`;
diff --git a/storage/tokudb/mysql-test/tokudb_sys_vars/r/tokudb_analyze_in_background_basic.result b/storage/tokudb/mysql-test/tokudb_sys_vars/r/tokudb_analyze_in_background_basic.result
index 53e96810eda..02b808fe6cb 100644
--- a/storage/tokudb/mysql-test/tokudb_sys_vars/r/tokudb_analyze_in_background_basic.result
+++ b/storage/tokudb/mysql-test/tokudb_sys_vars/r/tokudb_analyze_in_background_basic.result
@@ -19,29 +19,30 @@ SELECT @@global.tokudb_analyze_in_background;
 @@global.tokudb_analyze_in_background
 0
 SET GLOBAL tokudb_analyze_in_background = -6;
+ERROR 42000: Variable 'tokudb_analyze_in_background' can't be set to the value of '-6'
 SELECT @@global.tokudb_analyze_in_background;
 @@global.tokudb_analyze_in_background
-1
+0
 SET GLOBAL tokudb_analyze_in_background = 1.6;
 ERROR 42000: Incorrect argument type to variable 'tokudb_analyze_in_background'
 SELECT @@global.tokudb_analyze_in_background;
 @@global.tokudb_analyze_in_background
-1
+0
 SET GLOBAL tokudb_analyze_in_background = "T";
 ERROR 42000: Variable 'tokudb_analyze_in_background' can't be set to the value of 'T'
 SELECT @@global.tokudb_analyze_in_background;
 @@global.tokudb_analyze_in_background
-1
+0
 SET GLOBAL tokudb_analyze_in_background = "Y";
 ERROR 42000: Variable 'tokudb_analyze_in_background' can't be set to the value of 'Y'
 SELECT @@global.tokudb_analyze_in_background;
 @@global.tokudb_analyze_in_background
-1
+0
 SET GLOBAL tokudb_analyze_in_background = 'foobar';
 ERROR 42000: Variable 'tokudb_analyze_in_background' can't be set to the value of 'foobar'
 SELECT @@global.tokudb_analyze_in_background;
 @@global.tokudb_analyze_in_background
-1
+0
 SET SESSION tokudb_analyze_in_background = 0;
 SELECT @@session.tokudb_analyze_in_background;
 @@session.tokudb_analyze_in_background
@@ -53,31 +54,32 @@ SELECT @@session.tokudb_analyze_in_background;
 SET SESSION tokudb_analyze_in_background = DEFAULT;
 SELECT @@session.tokudb_analyze_in_background;
 @@session.tokudb_analyze_in_background
-1
+0
 SET SESSION tokudb_analyze_in_background = -6;
+ERROR 42000: Variable 'tokudb_analyze_in_background' can't be set to the value of '-6'
 SELECT @@session.tokudb_analyze_in_background;
 @@session.tokudb_analyze_in_background
-1
+0
 SET SESSION tokudb_analyze_in_background = 1.6;
 ERROR 42000: Incorrect argument type to variable 'tokudb_analyze_in_background'
 SELECT @@session.tokudb_analyze_in_background;
 @@session.tokudb_analyze_in_background
-1
+0
 SET SESSION tokudb_analyze_in_background = "T";
 ERROR 42000: Variable 'tokudb_analyze_in_background' can't be set to the value of 'T'
 SELECT @@session.tokudb_analyze_in_background;
 @@session.tokudb_analyze_in_background
-1
+0
 SET SESSION tokudb_analyze_in_background = "Y";
 ERROR 42000: Variable 'tokudb_analyze_in_background' can't be set to the value of 'Y'
 SELECT @@session.tokudb_analyze_in_background;
 @@session.tokudb_analyze_in_background
-1
+0
 SET SESSION tokudb_analyze_in_background = 'foobar';
 ERROR 42000: Variable 'tokudb_analyze_in_background' can't be set to the value of 'foobar'
 SELECT @@session.tokudb_analyze_in_background;
 @@session.tokudb_analyze_in_background
-1
+0
 SET GLOBAL tokudb_analyze_in_background = 0;
 SET SESSION tokudb_analyze_in_background = 1;
 SELECT @@global.tokudb_analyze_in_background;
diff --git a/storage/tokudb/mysql-test/tokudb_sys_vars/suite.opt b/storage/tokudb/mysql-test/tokudb_sys_vars/suite.opt
new file mode 100644
index 00000000000..f94d0f6d6dc
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb_sys_vars/suite.opt
@@ -0,0 +1 @@
+--tokudb --plugin-load-add=$HA_TOKUDB_SO --loose-tokudb-check-jemalloc=0 --sql-mode=NO_ENGINE_SUBSTITUTION
diff --git a/storage/tokudb/mysql-test/tokudb_sys_vars/suite.pm b/storage/tokudb/mysql-test/tokudb_sys_vars/suite.pm
new file mode 100644
index 00000000000..6c52d0110fe
--- /dev/null
+++ b/storage/tokudb/mysql-test/tokudb_sys_vars/suite.pm
@@ -0,0 +1,14 @@
+package My::Suite::TokuDB;
+use File::Basename;
+@ISA = qw(My::Suite);
+
+# Ensure we can run the TokuDB tests even if hugepages are enabled
+$ENV{TOKU_HUGE_PAGES_OK}=1;
+
+#return "Not run for embedded server" if $::opt_embedded_server;
+return "No TokuDB engine" unless $ENV{HA_TOKUDB_SO} or $::mysqld_variables{tokudb};
+
+sub is_default { not $::opt_embedded_server }
+
+bless { };
+
diff --git a/storage/tokudb/mysql-test/tokudb_sys_vars/t/tokudb_analyze_in_background_basic.test b/storage/tokudb/mysql-test/tokudb_sys_vars/t/tokudb_analyze_in_background_basic.test
index dfb2a0e416d..84b001b1962 100644
--- a/storage/tokudb/mysql-test/tokudb_sys_vars/t/tokudb_analyze_in_background_basic.test
+++ b/storage/tokudb/mysql-test/tokudb_sys_vars/t/tokudb_analyze_in_background_basic.test
@@ -17,6 +17,7 @@ SELECT @@global.tokudb_analyze_in_background;
 SET GLOBAL tokudb_analyze_in_background = DEFAULT;
 SELECT @@global.tokudb_analyze_in_background;
 
+-- error ER_WRONG_VALUE_FOR_VAR
 SET GLOBAL tokudb_analyze_in_background = -6;
 SELECT @@global.tokudb_analyze_in_background;
 
@@ -46,6 +47,7 @@ SELECT @@session.tokudb_analyze_in_background;
 SET SESSION tokudb_analyze_in_background = DEFAULT;
 SELECT @@session.tokudb_analyze_in_background;
 
+-- error ER_WRONG_VALUE_FOR_VAR
 SET SESSION tokudb_analyze_in_background = -6;
 SELECT @@session.tokudb_analyze_in_background;
 
diff --git a/storage/tokudb/tokudb_background.cc b/storage/tokudb/tokudb_background.cc
index d8ef54a5972..e019e41c788 100644
--- a/storage/tokudb/tokudb_background.cc
+++ b/storage/tokudb/tokudb_background.cc
@@ -8,7 +8,7 @@ This file is part of TokuDB
 
 Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 
-    TokuDBis is free software: you can redistribute it and/or modify
+    TokuDB is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License, version 2,
     as published by the Free Software Foundation.
 
@@ -68,7 +68,8 @@ void job_manager_t::destroy() {
     while (_background_jobs.size()) {
         _mutex.lock();
         job_t* job = _background_jobs.front();
-        cancel(job);
+        if (!job->cancelled())
+            cancel(job);
         _background_jobs.pop_front();
         delete job;
         _mutex.unlock();
@@ -148,11 +149,8 @@ bool job_manager_t::cancel_job(const char* key) {
          it != _background_jobs.end(); it++) {
         job_t* job = *it;
 
-        if (!job->cancelled() &&
-            strcmp(job->key(), key) == 0) {
-
+        if (!job->cancelled() && strcmp(job->key(), key) == 0) {
             cancel(job);
-
             ret = true;
         }
     }
@@ -162,8 +160,6 @@ bool job_manager_t::cancel_job(const char* key) {
 }
 void job_manager_t::iterate_jobs(pfn_iterate_t callback, void* extra) const {
 
-    char database[256], table[256], type[256], params[256], status[256];
-
     _mutex.lock();
 
     for (jobs_t::const_iterator it = _background_jobs.begin();
@@ -171,19 +167,7 @@ void job_manager_t::iterate_jobs(pfn_iterate_t callback, void* extra) const {
          it++) {
         job_t* job = *it;
         if (!job->cancelled()) {
-            database[0] = table[0] = type[0] = params[0] = status[0] = '\0';
-            job->status(database, table, type, params, status);
-            callback(
-                job->id(),
-                database,
-                table,
-                type,
-                params,
-                status,
-                job->user_scheduled(),
-                job->scheduled_time(),
-                job->started_time(),
-                extra);
+            callback(job, extra);
         }
     }
 
@@ -233,6 +217,7 @@ void job_manager_t::run(job_t* job) {
 }
 void job_manager_t::cancel(job_t* job) {
     assert_debug(_mutex.is_owned_by_me());
+    assert_always(!job->cancelled());
     job->cancel();
 }
 job_manager_t* _job_manager = NULL;
diff --git a/storage/tokudb/tokudb_background.h b/storage/tokudb/tokudb_background.h
index 3786701fd0f..29991ab325d 100644
--- a/storage/tokudb/tokudb_background.h
+++ b/storage/tokudb/tokudb_background.h
@@ -7,7 +7,7 @@ This file is part of TokuDB
 
 Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved.
 
-    TokuDBis is free software: you can redistribute it and/or modify
+    TokuDB is free software: you can redistribute it and/or modify
     it under the terms of the GNU General Public License, version 2,
     as published by the Free Software Foundation.
 
@@ -58,13 +58,20 @@ public:
         // (or jobs) usually used to find jobs to cancel
         virtual const char* key() = 0;
 
-        // method to get info for information schema, 255 chars per buffer
-        virtual void status(
-            char* database,
-            char* table,
-            char* type,
-            char* params,
-            char* status) = 0;
+        // method to obtain the database name the job is scheduled on
+        virtual const char* database() = 0;
+
+        // method to obtain the table name the job is scheduled on
+        virtual const char* table() = 0;
+
+        // method to obtain the type of job
+        virtual const char* type() = 0;
+
+        // method to obtain a stringized list of job parameters
+        virtual const char* parameters() = 0;
+
+        // method to obtain a string identifying the current status of the job
+        virtual const char* status() = 0;
 
         inline bool running() const;
 
@@ -99,17 +106,7 @@ public:
     };
 
     // pfn for iterate callback
-    typedef void (*pfn_iterate_t)(
-        uint64_t,
-        const char*,
-        const char*,
-        const char*,
-        const char*,
-        const char*,
-        bool,
-        time_t,
-        time_t,
-        void*);
+    typedef void (*pfn_iterate_t)(class job_t*, void*);
 
 public:
     void* operator new(size_t sz);
@@ -144,6 +141,11 @@ public:
     // data passed when the job was scheduled
     void iterate_jobs(pfn_iterate_t callback, void* extra) const;
 
+    // lock the bjm, this prevents anyone from running, cancelling or iterating
+    // jobs in the bjm.
+    inline void lock();
+    inline void unlock();
+
 private:
     static void* thread_func(void* v);
 
@@ -170,6 +172,15 @@ extern job_manager_t*    _job_manager;
 bool initialize();
 bool destroy();
 
+inline void job_manager_t::lock() {  // acquire _mutex for the caller
+    assert_debug(!_mutex.is_owned_by_me());  // caller must not already hold it
+    _mutex.lock();
+}
+inline void job_manager_t::unlock() {  // release _mutex; pairs with lock()
+    assert_debug(_mutex.is_owned_by_me());  // only the current holder may release
+    _mutex.unlock();
+}
+
 inline void job_manager_t::job_t::run() {
     if (!_cancelled) {
         _running = true;
diff --git a/storage/tokudb/tokudb_information_schema.cc b/storage/tokudb/tokudb_information_schema.cc
index e69a7899b45..2b4128cc03a 100644
--- a/storage/tokudb/tokudb_information_schema.cc
+++ b/storage/tokudb/tokudb_information_schema.cc
@@ -75,7 +75,9 @@ int trx_callback(
     void *extra) {
 
     uint64_t txn_id = txn->id64(txn);
-    uint64_t client_id = txn->get_client_id(txn);
+    uint64_t client_id;
+    void *client_extra;
+    txn->get_client_id(txn, &client_id, &client_extra);
     uint64_t start_time = txn->get_start_time(txn);
     trx_extra_t* e = reinterpret_cast<struct trx_extra_t*>(extra);
     THD* thd = e->thd;
@@ -85,7 +87,7 @@ int trx_callback(
     uint64_t tnow = (uint64_t) ::time(NULL);
     table->field[2]->store(tnow >= start_time ? tnow - start_time : 0, false);
     int error = schema_table_store_record(thd, table);
-    if (!error && thd_killed(thd))
+    if (!error && thd_kill_level(thd))
         error = ER_QUERY_INTERRUPTED;
     return error;
 }
@@ -219,7 +221,7 @@ int lock_waits_callback(
 
     int error = schema_table_store_record(thd, table);
 
-    if (!error && thd_killed(thd))
+    if (!error && thd_kill_level(thd))
         error = ER_QUERY_INTERRUPTED;
 
     return error;
@@ -314,7 +316,9 @@ int locks_callback(
     void* extra) {
 
     uint64_t txn_id = txn->id64(txn);
-    uint64_t client_id = txn->get_client_id(txn);
+    uint64_t client_id;
+    void *client_extra;
+    txn->get_client_id(txn, &client_id, &client_extra);
     locks_extra_t* e = reinterpret_cast<struct locks_extra_t*>(extra);
     THD* thd = e->thd;
     TABLE* table = e->table;
@@ -361,7 +365,7 @@ int locks_callback(
 
         error = schema_table_store_record(thd, table);
 
-        if (!error && thd_killed(thd))
+        if (!error && thd_kill_level(thd))
             error = ER_QUERY_INTERRUPTED;
     }
     return error;
@@ -493,7 +497,7 @@ int report_file_map(TABLE* table, THD* thd) {
 
             error = schema_table_store_record(thd, table);
         }
-        if (!error && thd_killed(thd))
+        if (!error && thd_kill_level(thd))
             error = ER_QUERY_INTERRUPTED;
     }
     if (error == DB_NOTFOUND) {
@@ -698,7 +702,7 @@ int report_fractal_tree_info(TABLE* table, THD* thd) {
             if (error)
                 error = 0; // ignore read uncommitted errors
         }
-        if (!error && thd_killed(thd))
+        if (!error && thd_kill_level(thd))
             error = ER_QUERY_INTERRUPTED;
     }
     if (error == DB_NOTFOUND) {
@@ -989,7 +993,7 @@ int report_fractal_tree_block_map(TABLE* table, THD* thd) {
                 table,
                 thd);
         }
-        if (!error && thd_killed(thd))
+        if (!error && thd_kill_level(thd))
             error = ER_QUERY_INTERRUPTED;
     }
     if (error == DB_NOTFOUND) {
@@ -1085,7 +1089,7 @@ ST_FIELD_INFO background_job_status_field_info[] = {
     {"scheduler", 32, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE },
     {"scheduled_time", 0, MYSQL_TYPE_DATETIME, 0, 0, NULL, SKIP_OPEN_TABLE },
     {"started_time", 0, MYSQL_TYPE_DATETIME, 0, MY_I_S_MAYBE_NULL, NULL, SKIP_OPEN_TABLE },
-    {"status", 256, MYSQL_TYPE_STRING, 0, MY_I_S_MAYBE_NULL, SKIP_OPEN_TABLE },
+    {"status", 1024, MYSQL_TYPE_STRING, 0, MY_I_S_MAYBE_NULL, NULL, SKIP_OPEN_TABLE }, // NULL added: 7 initializers, like the sibling rows
     {NULL, 0, MYSQL_TYPE_NULL, 0, 0, NULL, SKIP_OPEN_TABLE}
 };
 
@@ -1095,15 +1099,7 @@ struct background_job_status_extra {
 };
 
 void background_job_status_callback(
-    uint64_t id,
-    const char* database_name,
-    const char* table_name,
-    const char* type,
-    const char* params,
-    const char* status,
-    bool user_scheduled,
-    time_t scheduled_time,
-    time_t started_time,
+    tokudb::background::job_manager_t::job_t* job,
     void* extra) {
 
     background_job_status_extra* e =
@@ -1111,24 +1107,33 @@ void background_job_status_callback(
 
     THD* thd = e->thd;
     TABLE* table = e->table;
+    const char* tmp = NULL;
 
-    table->field[0]->store(id, false);
-    table->field[1]->store(
-        database_name,
-        strlen(database_name),
-        system_charset_info);
-    table->field[2]->store(table_name, strlen(table_name), system_charset_info);
-    table->field[3]->store(type, strlen(type), system_charset_info);
-    table->field[4]->store(params, strlen(params), system_charset_info);
-    if (user_scheduled)
+    table->field[0]->store(job->id(), false);
+
+    tmp = job->database();
+    table->field[1]->store(tmp, strlen(tmp),  system_charset_info);
+
+    tmp = job->table();
+    table->field[2]->store(tmp, strlen(tmp),  system_charset_info);
+
+    tmp = job->type();
+    table->field[3]->store(tmp, strlen(tmp),  system_charset_info);
+
+    tmp = job->parameters();
+    table->field[4]->store(tmp, strlen(tmp),  system_charset_info);
+
+    if (job->user_scheduled())
         table->field[5]->store("USER", strlen("USER"), system_charset_info);
     else
         table->field[5]->store("AUTO", strlen("AUTO"), system_charset_info);
 
-    field_store_time_t(table->field[6], scheduled_time);
-    field_store_time_t(table->field[7], started_time);
-    if (status[0] != '\0') {
-        table->field[8]->store(status, strlen(status), system_charset_info);
+    field_store_time_t(table->field[6], job->scheduled_time());
+    field_store_time_t(table->field[7], job->started_time());
+
+    tmp = job->status();
+    if (tmp && tmp[0] != '\0') {
+        table->field[8]->store(tmp, strlen(tmp), system_charset_info);
         table->field[8]->set_notnull();
     } else {
         table->field[8]->store(NULL, 0, system_charset_info);
diff --git a/storage/tokudb/tokudb_sysvars.cc b/storage/tokudb/tokudb_sysvars.cc
index 7cea749b4fb..b758929c10e 100644
--- a/storage/tokudb/tokudb_sysvars.cc
+++ b/storage/tokudb/tokudb_sysvars.cc
@@ -66,6 +66,7 @@ uint        read_status_frequency = 0;
 my_bool     strip_frm_data = FALSE;
 char*       tmp_dir = NULL;
 uint        write_status_frequency = 0;
+my_bool     dir_per_db = FALSE;
 char*       version = (char*) TOKUDB_VERSION_STR;
 
 // file system reserve as a percentage of total disk space
@@ -394,6 +395,18 @@ static MYSQL_SYSVAR_UINT(
     ~0U,
     0);
 
+static void tokudb_dir_per_db_update(THD* thd,
+                                     struct st_mysql_sys_var* sys_var,
+                                     void* var, const void* save) {
+    my_bool* const target = static_cast<my_bool*>(var);  // storage being updated
+    *target = *static_cast<const my_bool*>(save);        // copy in the new setting
+    db_env->set_dir_per_db(db_env, *target);             // push the value to db_env
+}
+
+static MYSQL_SYSVAR_BOOL(dir_per_db, dir_per_db,  // sysvar "dir_per_db", stored in the dir_per_db global
+    0, "TokuDB store ft files in db directories",
+    NULL, tokudb_dir_per_db_update, FALSE);  // presumably (check, update, default) - confirm against plugin.h
+
 #if TOKU_INCLUDE_HANDLERTON_HANDLE_FATAL_SIGNAL
 static MYSQL_SYSVAR_STR(
     gdb_path,
@@ -935,6 +948,7 @@ st_mysql_sys_var* system_variables[] = {
     MYSQL_SYSVAR(tmp_dir),
     MYSQL_SYSVAR(version),
     MYSQL_SYSVAR(write_status_frequency),
+    MYSQL_SYSVAR(dir_per_db),
 
 #if TOKU_INCLUDE_HANDLERTON_HANDLE_FATAL_SIGNAL
     MYSQL_SYSVAR(gdb_path),
diff --git a/storage/tokudb/tokudb_sysvars.h b/storage/tokudb/tokudb_sysvars.h
index 3bd96f7c68d..7701f211729 100644
--- a/storage/tokudb/tokudb_sysvars.h
+++ b/storage/tokudb/tokudb_sysvars.h
@@ -101,6 +101,7 @@ extern uint         read_status_frequency;
 extern my_bool      strip_frm_data;
 extern char*        tmp_dir;
 extern uint         write_status_frequency;
+extern my_bool      dir_per_db;
 extern char*        version;
 
 #if TOKU_INCLUDE_HANDLERTON_HANDLE_FATAL_SIGNAL
diff --git a/storage/tokudb/tokudb_txn.h b/storage/tokudb/tokudb_txn.h
index 67bf591d088..d0255415403 100644
--- a/storage/tokudb/tokudb_txn.h
+++ b/storage/tokudb/tokudb_txn.h
@@ -116,7 +116,7 @@ inline int txn_begin(
     int r = env->txn_begin(env, parent, txn, flags);
     if (r == 0 && thd) {
         DB_TXN* this_txn = *txn;
-        this_txn->set_client_id(this_txn, thd_get_thread_id(thd));
+        this_txn->set_client_id(this_txn, thd_get_thread_id(thd), thd);
     }
     TOKUDB_TRACE_FOR_FLAGS(
         TOKUDB_DEBUG_TXN,
diff --git a/storage/xtradb/btr/btr0btr.cc b/storage/xtradb/btr/btr0btr.cc
index 00a04e75c49..bce81f95ead 100644
--- a/storage/xtradb/btr/btr0btr.cc
+++ b/storage/xtradb/btr/btr0btr.cc
@@ -2,7 +2,7 @@
 
 Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2012, Facebook Inc.
-Copyright (c) 2014, 2015, MariaDB Corporation
+Copyright (c) 2014, 2016, MariaDB Corporation
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -80,7 +80,7 @@ btr_corruption_report(
 			       buf_block_get_zip_size(block),
 			       BUF_PAGE_PRINT_NO_CRASH);
 	}
-	buf_page_print(buf_block_get_frame_fast(block), 0, 0);
+	buf_page_print(buf_nonnull_block_get_frame(block), 0, 0);
 }
 
 #ifndef UNIV_HOTBACKUP
@@ -743,14 +743,16 @@ btr_root_block_get(
 	block = btr_block_get(space, zip_size, root_page_no, mode, (dict_index_t*)index, mtr);
 
 	if (!block) {
-		index->table->is_encrypted = TRUE;
-		index->table->corrupted = FALSE;
-
-		ib_push_warning(index->table->thd, DB_DECRYPTION_FAILED,
-			"Table %s in tablespace %lu is encrypted but encryption service or"
-			" used key_id is not available. "
-			" Can't continue reading table.",
-			index->table->name, space);
+		if (index && index->table) {
+			index->table->is_encrypted = TRUE;
+			index->table->corrupted = FALSE;
+
+			ib_push_warning(index->table->thd, DB_DECRYPTION_FAILED,
+				"Table %s in tablespace %lu is encrypted but encryption service or"
+				" used key_id is not available. "
+				" Can't continue reading table.",
+				index->table->name, space);
+		}
 
 		return NULL;
 	}
@@ -825,11 +827,12 @@ btr_height_get(
 
         /* S latches the page */
         root_block = btr_root_block_get(index, RW_S_LATCH, mtr);
+	ut_ad(root_block); // The index must not be corrupted
 
 	if (root_block) {
 
-		height = btr_page_get_level(buf_block_get_frame_fast(root_block), mtr);
-
+		height = btr_page_get_level(buf_nonnull_block_get_frame(root_block),
+					    mtr);
 		/* Release the S latch on the root page. */
 		mtr_memo_release(mtr, root_block, MTR_MEMO_PAGE_S_FIX);
 #ifdef UNIV_SYNC_DEBUG
@@ -1840,6 +1843,11 @@ leaf_loop:
 	root = btr_page_get(space, zip_size, root_page_no, RW_X_LATCH,
 			    NULL, &mtr);
 
+	if (!root) {
+		mtr_commit(&mtr);
+		return;
+	}
+
 	SRV_CORRUPT_TABLE_CHECK(root,
 	{
 		mtr_commit(&mtr);
@@ -1909,17 +1917,19 @@ btr_free_root(
 	block = btr_block_get(space, zip_size, root_page_no, RW_X_LATCH,
 			      NULL, mtr);
 
-	SRV_CORRUPT_TABLE_CHECK(block, return;);
+	if (block) {
+		SRV_CORRUPT_TABLE_CHECK(block, return;);
 
-	btr_search_drop_page_hash_index(block);
+		btr_search_drop_page_hash_index(block);
 
-	header = buf_block_get_frame(block) + PAGE_HEADER + PAGE_BTR_SEG_TOP;
+		header = buf_block_get_frame(block) + PAGE_HEADER + PAGE_BTR_SEG_TOP;
 #ifdef UNIV_BTR_DEBUG
-	ut_a(btr_root_fseg_validate(header, space));
+		ut_a(btr_root_fseg_validate(header, space));
 #endif /* UNIV_BTR_DEBUG */
 
-	while (!fseg_free_step(header, mtr)) {
-		/* Free the entire segment in small steps. */
+		while (!fseg_free_step(header, mtr)) {
+			/* Free the entire segment in small steps. */
+		}
 	}
 }
 #endif /* !UNIV_HOTBACKUP */
@@ -2903,7 +2913,7 @@ btr_attach_half_pages(
 	}
 
 	/* Get the level of the split pages */
-	level = btr_page_get_level(buf_block_get_frame_fast(block), mtr);
+	level = btr_page_get_level(buf_nonnull_block_get_frame(block), mtr);
 	ut_ad(level
 	      == btr_page_get_level(buf_block_get_frame(new_block), mtr));
 
@@ -4280,8 +4290,10 @@ btr_discard_page(
 
 	/* Decide the page which will inherit the locks */
 
-	left_page_no = btr_page_get_prev(buf_block_get_frame_fast(block), mtr);
-	right_page_no = btr_page_get_next(buf_block_get_frame_fast(block), mtr);
+	left_page_no = btr_page_get_prev(buf_nonnull_block_get_frame(block),
+					 mtr);
+	right_page_no = btr_page_get_next(buf_nonnull_block_get_frame(block),
+					  mtr);
 
 	if (left_page_no != FIL_NULL) {
 		merge_block = btr_block_get(space, zip_size, left_page_no,
diff --git a/storage/xtradb/btr/btr0cur.cc b/storage/xtradb/btr/btr0cur.cc
index 0dc6a7a3ee9..2acf5dfa6f7 100644
--- a/storage/xtradb/btr/btr0cur.cc
+++ b/storage/xtradb/btr/btr0cur.cc
@@ -2279,9 +2279,12 @@ func_exit:
 	if (page_zip
 	    && !(flags & BTR_KEEP_IBUF_BITMAP)
 	    && !dict_index_is_clust(index)
-	    && page_is_leaf(buf_block_get_frame(block))) {
-		/* Update the free bits in the insert buffer. */
-		ibuf_update_free_bits_zip(block, mtr);
+	    && block) {
+		buf_frame_t* frame = buf_block_get_frame(block);
+		if (frame && page_is_leaf(frame)) {
+			/* Update the free bits in the insert buffer. */
+			ibuf_update_free_bits_zip(block, mtr);
+		}
 	}
 
 	return(err);
diff --git a/storage/xtradb/btr/btr0scrub.cc b/storage/xtradb/btr/btr0scrub.cc
index e6acb7802f1..62a41d19768 100644
--- a/storage/xtradb/btr/btr0scrub.cc
+++ b/storage/xtradb/btr/btr0scrub.cc
@@ -368,12 +368,17 @@ btr_optimistic_scrub(
 
 	/* We play safe and reset the free bits */
 	if (!dict_index_is_clust(index) &&
-	    page_is_leaf(buf_block_get_frame(block))) {
+	    block != NULL) {
+		buf_frame_t* frame = buf_block_get_frame(block);
+		if (frame &&
+		    page_is_leaf(frame)) {
 
 			ibuf_reset_free_bits(block);
+		}
 	}
 
 	scrub_data->scrub_stat.page_reorganizations++;
+
 	return DB_SUCCESS;
 }
 
@@ -488,9 +493,13 @@ btr_pessimistic_scrub(
 		/* We play safe and reset the free bits
 		* NOTE: need to call this prior to btr_page_split_and_insert */
 		if (!dict_index_is_clust(index) &&
-		    page_is_leaf(buf_block_get_frame(block))) {
+		    block != NULL) {
+			buf_frame_t* frame = buf_block_get_frame(block);
+			if (frame &&
+			    page_is_leaf(frame)) {
 
-			ibuf_reset_free_bits(block);
+				ibuf_reset_free_bits(block);
+			}
 		}
 
 		rec = btr_page_split_and_insert(
@@ -788,11 +797,8 @@ btr_scrub_page(
 		return BTR_SCRUB_SKIP_PAGE_AND_CLOSE_TABLE;
 	}
 
-	buf_frame_t* frame = NULL;
+	buf_frame_t* frame = buf_block_get_frame(block);
 
-	if (block) {
-		frame = buf_block_get_frame(block);
-	}
 	if (!frame || btr_page_get_index_id(frame) !=
 	    scrub_data->current_index->id) {
 		/* page has been reallocated to new index */
diff --git a/storage/xtradb/buf/buf0buf.cc b/storage/xtradb/buf/buf0buf.cc
index 7d52701494f..84b64622c78 100644
--- a/storage/xtradb/buf/buf0buf.cc
+++ b/storage/xtradb/buf/buf0buf.cc
@@ -65,6 +65,10 @@ Created 11/5/1995 Heikki Tuuri
 #include "fil0pagecompress.h"
 #include "ha_prototypes.h"
 
+/* Enable this for checksum error messages. */
+//#ifdef UNIV_DEBUG
+//#define UNIV_DEBUG_LEVEL2 1
+//#endif
 
 /* prototypes for new functions added to ha_innodb.cc */
 trx_t* innobase_get_trx();
@@ -595,6 +599,14 @@ buf_page_is_checksum_valid_crc32(
 {
 	ib_uint32_t	crc32 = buf_calc_page_crc32(read_buf);
 
+#ifdef UNIV_DEBUG_LEVEL2
+	if (!(checksum_field1 == crc32 && checksum_field2 == crc32)) {
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"Page checksum crc32 not valid field1 %lu field2 %lu crc32 %lu.",
+			checksum_field1, checksum_field2, (ulint)crc32);
+	}
+#endif
+
 	return(checksum_field1 == crc32 && checksum_field2 == crc32);
 }
 
@@ -622,6 +634,13 @@ buf_page_is_checksum_valid_innodb(
 
 	if (checksum_field2 != mach_read_from_4(read_buf + FIL_PAGE_LSN)
 	    && checksum_field2 != buf_calc_page_old_checksum(read_buf)) {
+#ifdef UNIV_DEBUG_LEVEL2
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"Page checksum innodb not valid field1 %lu field2 %lu crc32 %lu lsn %lu.",
+			checksum_field1, checksum_field2, buf_calc_page_old_checksum(read_buf),
+			mach_read_from_4(read_buf + FIL_PAGE_LSN)
+		);
+#endif
 		return(false);
 	}
 
@@ -632,6 +651,13 @@ buf_page_is_checksum_valid_innodb(
 
 	if (checksum_field1 != 0
 	    && checksum_field1 != buf_calc_page_new_checksum(read_buf)) {
+#ifdef UNIV_DEBUG_LEVEL2
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"Page checksum innodb not valid field1 %lu field2 %lu crc32 %lu lsn %lu.",
+			checksum_field1, checksum_field2, buf_calc_page_new_checksum(read_buf),
+			mach_read_from_4(read_buf + FIL_PAGE_LSN)
+		);
+#endif
 		return(false);
 	}
 
@@ -650,6 +676,16 @@ buf_page_is_checksum_valid_none(
 	ulint		checksum_field1,
 	ulint		checksum_field2)
 {
+#ifdef UNIV_DEBUG_LEVEL2 /* log whenever the validity test returned below fails */
+	if (!(checksum_field1 == checksum_field2 && checksum_field1 == BUF_NO_CHECKSUM_MAGIC)) {
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"Page checksum none not valid field1 %lu field2 %lu crc32 %lu lsn %lu.",
+			checksum_field1, checksum_field2, BUF_NO_CHECKSUM_MAGIC,
+			mach_read_from_4(read_buf + FIL_PAGE_LSN)
+		);
+	}
+#endif
+
 	return(checksum_field1 == checksum_field2
 	       && checksum_field1 == BUF_NO_CHECKSUM_MAGIC);
 }
@@ -667,9 +703,21 @@ buf_page_is_corrupted(
 	ulint		zip_size)	/*!< in: size of compressed page;
 					0 for uncompressed pages */
 {
-	ulint		page_encrypted = fil_page_is_encrypted(read_buf);
 	ulint		checksum_field1;
 	ulint		checksum_field2;
+	ulint 		space_id = mach_read_from_4(
+		read_buf + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+	fil_space_crypt_t* crypt_data = fil_space_get_crypt_data(space_id);
+	bool page_encrypted = false;
+
+	/* Page is encrypted if encryption information is found from
+	tablespace and page contains used key_version. This is true
+	also for pages first compressed and then encrypted. */
+	if (crypt_data &&
+	    crypt_data->type != CRYPT_SCHEME_UNENCRYPTED &&
+	    fil_page_is_encrypted(read_buf)) {
+		page_encrypted = true;
+	}
 
 	if (!page_encrypted && !zip_size
 	    && memcmp(read_buf + FIL_PAGE_LSN + 4,
@@ -679,6 +727,11 @@ buf_page_is_corrupted(
 		/* Stored log sequence numbers at the start and the end
 		of page do not match */
 
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"Log sequence number at the start %lu and the end %lu do not match.",
+			mach_read_from_4(read_buf + FIL_PAGE_LSN + 4),
+			mach_read_from_4(read_buf + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM + 4));
+
 		return(TRUE);
 	}
 
@@ -724,6 +777,7 @@ buf_page_is_corrupted(
 	if (zip_size) {
 		return(!page_zip_verify_checksum(read_buf, zip_size));
 	}
+
 	if (page_encrypted) {
 		return (FALSE);
 	}
@@ -745,6 +799,9 @@ buf_page_is_corrupted(
 		/* make sure that the page is really empty */
 		for (ulint i = 0; i < UNIV_PAGE_SIZE; i++) {
 			if (read_buf[i] != 0) {
+				ib_logf(IB_LOG_LEVEL_INFO,
+					"Checksum fields zero but page is not empty.");
+
 				return(TRUE);
 			}
 		}
@@ -755,7 +812,7 @@ buf_page_is_corrupted(
 	DBUG_EXECUTE_IF("buf_page_is_corrupt_failure", return(TRUE); );
 
 	ulint	page_no = mach_read_from_4(read_buf + FIL_PAGE_OFFSET);
-	ulint	space_id = mach_read_from_4(read_buf + FIL_PAGE_SPACE_ID);
+
 	const srv_checksum_algorithm_t	curr_algo =
 		static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm);
 
@@ -4546,16 +4603,16 @@ buf_page_check_corrupt(
 	ulint zip_size = buf_page_get_zip_size(bpage);
 	byte* dst_frame = (zip_size) ? bpage->zip.data :
 		((buf_block_t*) bpage)->frame;
-	unsigned key_version = bpage->key_version;
 	bool page_compressed = bpage->page_encrypted;
 	ulint stored_checksum = bpage->stored_checksum;
-	ulint calculated_checksum = bpage->stored_checksum;
+	ulint calculated_checksum = bpage->calculated_checksum;
 	bool page_compressed_encrypted = bpage->page_compressed;
 	ulint space_id = mach_read_from_4(
 		dst_frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
 	fil_space_crypt_t* crypt_data = fil_space_get_crypt_data(space_id);
 	fil_space_t* space = fil_space_found_by_id(space_id);
 	bool corrupted = true;
+	ulint key_version = bpage->key_version;
 
 	if (key_version != 0 || page_compressed_encrypted) {
 		bpage->encrypted = true;
@@ -4585,7 +4642,7 @@ buf_page_check_corrupt(
 					stored_checksum, calculated_checksum);
 			}
 			ib_logf(IB_LOG_LEVEL_ERROR,
-				"Reason could be that key_version %u in page "
+				"Reason could be that key_version %lu in page "
 				"or in crypt_data %p could not be found.",
 				key_version, crypt_data);
 			ib_logf(IB_LOG_LEVEL_ERROR,
@@ -4599,7 +4656,7 @@ buf_page_check_corrupt(
 				"Block in space_id %lu in file %s encrypted.",
 				space_id, space ? space->name : "NULL");
 			ib_logf(IB_LOG_LEVEL_ERROR,
-				"However key management plugin or used key_id %u is not found or"
+				"However key management plugin or used key_id %lu is not found or"
 				" used encryption algorithm or method does not match.",
 				key_version);
 			ib_logf(IB_LOG_LEVEL_ERROR,
@@ -4651,7 +4708,12 @@ buf_page_io_complete(
 			} else {
 				frame = ((buf_block_t*) bpage)->frame;
 			}
-			goto corrupt;
+
+			ib_logf(IB_LOG_LEVEL_INFO,
+				"Page %u in tablespace %u encryption error key_version %u.",
+				bpage->offset, bpage->space, bpage->key_version);
+
+			goto database_corrupted;
 		}
 
 		if (buf_page_get_zip_size(bpage)) {
@@ -4663,7 +4725,12 @@ buf_page_io_complete(
 
 				os_atomic_decrement_ulint(
 					&buf_pool->n_pend_unzip, 1);
-				goto corrupt;
+
+				ib_logf(IB_LOG_LEVEL_INFO,
+					"Page %u in tablespace %u zip_decompress failure.",
+					bpage->offset, bpage->space);
+
+				goto database_corrupted;
 			}
 			os_atomic_decrement_ulint(&buf_pool->n_pend_unzip, 1);
 		} else {
@@ -4711,119 +4778,120 @@ buf_page_io_complete(
 
 		if (UNIV_LIKELY(!bpage->is_corrupt ||
 				!srv_pass_corrupt_table)) {
-		/* From version 3.23.38 up we store the page checksum
-		to the 4 first bytes of the page end lsn field */
-
-		if (buf_page_is_corrupted(true, frame,
-					  buf_page_get_zip_size(bpage))) {
-
-			/* Not a real corruption if it was triggered by
-			error injection */
-			DBUG_EXECUTE_IF("buf_page_is_corrupt_failure",
-				if (bpage->space > TRX_SYS_SPACE
-				    && buf_mark_space_corrupt(bpage)) {
-					ib_logf(IB_LOG_LEVEL_INFO,
-						"Simulated page corruption");
-					return(true);
-				}
-				goto page_not_corrupt;
-				;);
-corrupt:
+			/* From version 3.23.38 up we store the page checksum
+			to the 4 first bytes of the page end lsn field */
+
+			if (buf_page_is_corrupted(true, frame,
+					buf_page_get_zip_size(bpage))) {
+
+				/* Not a real corruption if it was triggered by
+				error injection */
+				DBUG_EXECUTE_IF("buf_page_is_corrupt_failure",
+					if (bpage->space > TRX_SYS_SPACE
+						&& buf_mark_space_corrupt(bpage)) {
+						ib_logf(IB_LOG_LEVEL_INFO,
+							"Simulated page corruption");
+						return(true);
+					}
+					goto page_not_corrupt;
+					;);
+database_corrupted:
 
-			bool corrupted = buf_page_check_corrupt(bpage);
+				bool corrupted = buf_page_check_corrupt(bpage);
 
-			if (corrupted) {
-				fil_system_enter();
-				space = fil_space_get_by_id(bpage->space);
-				fil_system_exit();
-				ib_logf(IB_LOG_LEVEL_ERROR,
-					"Database page corruption on disk"
-					" or a failed");
-				ib_logf(IB_LOG_LEVEL_ERROR,
-					"Space %u file %s read of page %u.",
-					bpage->space,
-					space ? space->name : "NULL",
-					bpage->offset);
-				ib_logf(IB_LOG_LEVEL_ERROR,
-					"You may have to recover"
-					" from a backup.");
+				if (corrupted) {
+					fil_system_enter();
+					space = fil_space_get_by_id(bpage->space);
+					fil_system_exit();
+					ib_logf(IB_LOG_LEVEL_ERROR,
+						"Database page corruption on disk"
+						" or a failed");
+					ib_logf(IB_LOG_LEVEL_ERROR,
+						"Space %u file %s read of page %u.",
+						bpage->space,
+						space ? space->name : "NULL",
+						bpage->offset);
+					ib_logf(IB_LOG_LEVEL_ERROR,
+						"You may have to recover"
+						" from a backup.");
 
 
-				buf_page_print(frame, buf_page_get_zip_size(bpage),
-					BUF_PAGE_PRINT_NO_CRASH);
+					buf_page_print(frame, buf_page_get_zip_size(bpage),
+						BUF_PAGE_PRINT_NO_CRASH);
 
-				ib_logf(IB_LOG_LEVEL_ERROR,
-					"It is also possible that your operating"
-					"system has corrupted its own file cache.");
-				ib_logf(IB_LOG_LEVEL_ERROR,
-					"and rebooting your computer removes the error.");
-				ib_logf(IB_LOG_LEVEL_ERROR,
-					"If the corrupt page is an index page you can also try to");
-				ib_logf(IB_LOG_LEVEL_ERROR,
-					"fix the corruption by dumping, dropping, and reimporting");
-				ib_logf(IB_LOG_LEVEL_ERROR,
-					"the corrupt table. You can use CHECK");
-				ib_logf(IB_LOG_LEVEL_ERROR,
-					"TABLE to scan your table for corruption.");
-				ib_logf(IB_LOG_LEVEL_ERROR,
-					"See also "
-					REFMAN "forcing-innodb-recovery.html"
-					" about forcing recovery.");
-			}
-
-			if (srv_pass_corrupt_table && bpage->space != 0
-			    && bpage->space < SRV_LOG_SPACE_FIRST_ID) {
-				trx_t*	trx;
-
-				fprintf(stderr,
-					"InnoDB: space %u will be treated as corrupt.\n",
-					bpage->space);
-				fil_space_set_corrupt(bpage->space);
-
-				trx = innobase_get_trx();
-				if (trx && trx->dict_operation_lock_mode == RW_X_LATCH) {
-					dict_table_set_corrupt_by_space(bpage->space, FALSE);
-				} else {
-					dict_table_set_corrupt_by_space(bpage->space, TRUE);
+					ib_logf(IB_LOG_LEVEL_ERROR, /* the two literals below form one message */
+						"It is also possible that your operating "
+						"system has corrupted its own file cache.");
+					ib_logf(IB_LOG_LEVEL_ERROR,
+						"and rebooting your computer removes the error.");
+					ib_logf(IB_LOG_LEVEL_ERROR,
+						"If the corrupt page is an index page you can also try to");
+					ib_logf(IB_LOG_LEVEL_ERROR,
+						"fix the corruption by dumping, dropping, and reimporting");
+					ib_logf(IB_LOG_LEVEL_ERROR,
+						"the corrupt table. You can use CHECK");
+					ib_logf(IB_LOG_LEVEL_ERROR,
+						"TABLE to scan your table for corruption.");
+					ib_logf(IB_LOG_LEVEL_ERROR,
+						"See also "
+						REFMAN "forcing-innodb-recovery.html"
+						" about forcing recovery.");
 				}
-				bpage->is_corrupt = TRUE;
-			}
 
-			if (srv_force_recovery < SRV_FORCE_IGNORE_CORRUPT) {
-				/* If page space id is larger than TRX_SYS_SPACE
-				(0), we will attempt to mark the corresponding
-				table as corrupted instead of crashing server */
-				if (bpage->space > TRX_SYS_SPACE
-				    && buf_mark_space_corrupt(bpage)) {
-					return(false);
-				} else {
-					corrupted = buf_page_check_corrupt(bpage);
+				if (srv_pass_corrupt_table && bpage->space != 0
+					&& bpage->space < SRV_LOG_SPACE_FIRST_ID) {
+					trx_t*	trx;
 
-					if (corrupted) {
-						ib_logf(IB_LOG_LEVEL_ERROR,
-							"Ending processing because of a corrupt database page.");
+					fprintf(stderr,
+						"InnoDB: space %u will be treated as corrupt.\n",
+						bpage->space);
+					fil_space_set_corrupt(bpage->space);
 
-						ut_error;
+					trx = innobase_get_trx();
+					if (trx && trx->dict_operation_lock_mode == RW_X_LATCH) {
+						dict_table_set_corrupt_by_space(bpage->space, FALSE);
+					} else {
+						dict_table_set_corrupt_by_space(bpage->space, TRUE);
 					}
+					bpage->is_corrupt = TRUE;
+				}
 
-					ib_push_warning(innobase_get_trx(), DB_DECRYPTION_FAILED,
-						"Table in tablespace %lu encrypted."
-						"However key management plugin or used key_id %u is not found or"
-						" used encryption algorithm or method does not match."
-						" Can't continue opening the table.",
-						(ulint)bpage->space, bpage->key_version);
+				if (srv_force_recovery < SRV_FORCE_IGNORE_CORRUPT) {
+					/* If page space id is larger than TRX_SYS_SPACE
+					(0), we will attempt to mark the corresponding
+					table as corrupted instead of crashing server */
+					if (bpage->space > TRX_SYS_SPACE
+						&& buf_mark_space_corrupt(bpage)) {
+						return(false);
+					} else {
+						corrupted = buf_page_check_corrupt(bpage);
+						ulint key_version = bpage->key_version;
 
-					if (bpage->space > TRX_SYS_SPACE) {
 						if (corrupted) {
-							buf_mark_space_corrupt(bpage);
+							ib_logf(IB_LOG_LEVEL_ERROR,
+								"Ending processing because of a corrupt database page.");
+
+							ut_error;
 						}
-					} else {
-						ut_error;
+
+						ib_push_warning(innobase_get_trx(), DB_DECRYPTION_FAILED,
+							"Table in tablespace %lu encrypted."
+							"However key management plugin or used key_id %lu is not found or"
+							" used encryption algorithm or method does not match."
+							" Can't continue opening the table.",
+							(ulint)bpage->space, key_version);
+
+						if (bpage->space > TRX_SYS_SPACE) {
+							if (corrupted) {
+								buf_mark_space_corrupt(bpage);
+							}
+						} else {
+							ut_error;
+						}
+						return(false);
 					}
-					return(false);
 				}
 			}
-		}
 		} /**/
 
 		DBUG_EXECUTE_IF("buf_page_is_corrupt_failure",
@@ -4835,7 +4903,9 @@ corrupt:
 			recv_recover_page(TRUE, (buf_block_t*) bpage);
 		}
 
-		if (uncompressed && !recv_no_ibuf_operations) {
+		if (uncompressed && !recv_no_ibuf_operations
+		    && fil_page_get_type(frame) == FIL_PAGE_INDEX
+		    && page_is_leaf(frame)) {
 
 			buf_block_t*	block;
 			ibool		update_ibuf_bitmap;
@@ -6260,12 +6330,12 @@ buf_page_encrypt_before_write(
 		return src_frame;
 	}
 
-	if (crypt_data != NULL && crypt_data->encryption == FIL_SPACE_ENCRYPTION_OFF) {
+	if (crypt_data != NULL && crypt_data->not_encrypted()) {
 		/* Encryption is disabled */
 		encrypted = false;
 	}
 
-	if (!srv_encrypt_tables && (crypt_data == NULL || crypt_data->encryption == FIL_SPACE_ENCRYPTION_DEFAULT)) {
+	if (!srv_encrypt_tables && (crypt_data == NULL || crypt_data->is_default_encryption())) {
 		/* Encryption is disabled */
 		encrypted = false;
 	}
@@ -6371,6 +6441,35 @@ buf_page_decrypt_after_read(
 	bool page_compressed_encrypted = fil_page_is_compressed_encrypted(dst_frame);
 	buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
 	bool success = true;
+	ulint 		space_id = mach_read_from_4(
+		dst_frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
+	fil_space_crypt_t* crypt_data = fil_space_get_crypt_data(space_id);
+
+	/* Page is encrypted if encryption information is found from
+	tablespace and page contains used key_version. This is true
+	also for pages first compressed and then encrypted. */
+	if (!crypt_data ||
+	    (crypt_data &&
+	     crypt_data->type == CRYPT_SCHEME_UNENCRYPTED &&
+	     key_version != 0)) {
+		byte*	frame = NULL;
+
+		if (buf_page_get_zip_size(bpage)) {
+			frame = bpage->zip.data;
+		} else {
+			frame = ((buf_block_t*) bpage)->frame;
+		}
+
+		/* If page is not corrupted at this point, page can't be
+		encrypted, thus set key_version to 0. If page is corrupted,
+		we assume at this point that it is encrypted as page
+		contained key_version != 0. Note that page could still be
+		really corrupted. This we will find out after decrypt by
+		checking page checksums. */
+		if (!buf_page_is_corrupted(false, frame, buf_page_get_zip_size(bpage))) {
+			key_version = 0;
+		}
+	}
 
 	/* If page is encrypted read post-encryption checksum */
 	if (!page_compressed_encrypted && key_version != 0) {
diff --git a/storage/xtradb/buf/buf0dblwr.cc b/storage/xtradb/buf/buf0dblwr.cc
index ff334003524..d8d85c25289 100644
--- a/storage/xtradb/buf/buf0dblwr.cc
+++ b/storage/xtradb/buf/buf0dblwr.cc
@@ -550,7 +550,7 @@ buf_dblwr_process()
 			} else if (buf_page_is_corrupted(true, read_buf, zip_size)) {
 
 				fprintf(stderr,
-					"InnoDB: Warning: database page"
+					"InnoDB: Database page"
 					" corruption or a failed\n"
 					"InnoDB: file read of"
 					" space %lu page %lu.\n"
diff --git a/storage/xtradb/buf/buf0flu.cc b/storage/xtradb/buf/buf0flu.cc
index 168f0a438a6..09f07bbd696 100644
--- a/storage/xtradb/buf/buf0flu.cc
+++ b/storage/xtradb/buf/buf0flu.cc
@@ -305,6 +305,8 @@ buf_flush_init_flush_rbt(void)
 
 		buf_flush_list_mutex_enter(buf_pool);
 
+		ut_ad(buf_pool->flush_rbt == NULL);
+
 		/* Create red black tree for speedy insertions in flush list. */
 		buf_pool->flush_rbt = rbt_create(
 			sizeof(buf_page_t*), buf_flush_block_cmp);
@@ -2220,7 +2222,7 @@ buf_flush_single_page_from_LRU(
 		if (ready) {
 			bool	evict_zip;
 
-			evict_zip = !buf_LRU_evict_from_unzip_LRU(buf_pool);;
+			evict_zip = !buf_LRU_evict_from_unzip_LRU(buf_pool);
 
 			freed = buf_LRU_free_page(bpage, evict_zip);
 
@@ -2627,6 +2629,11 @@ page_cleaner_sleep_if_needed(
 	ulint	next_loop_time)	/*!< in: time when next loop iteration
 				should start */
 {
+	/* No sleep if we are cleaning the buffer pool during the shutdown
+	with everything else finished */
+	if (srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE)
+		return;
+
 	ulint	cur_time = ut_time_ms();
 
 	if (next_loop_time > cur_time) {
diff --git a/storage/xtradb/dict/dict0boot.cc b/storage/xtradb/dict/dict0boot.cc
index 138d3131e09..7162a7f4c87 100644
--- a/storage/xtradb/dict/dict0boot.cc
+++ b/storage/xtradb/dict/dict0boot.cc
@@ -273,6 +273,10 @@ dict_boot(void)
 	ut_ad(DICT_NUM_FIELDS__SYS_FOREIGN_FOR_NAME == 2);
 	ut_ad(DICT_NUM_COLS__SYS_FOREIGN_COLS == 4);
 	ut_ad(DICT_NUM_FIELDS__SYS_FOREIGN_COLS == 6);
+	ut_ad(DICT_NUM_COLS__SYS_ZIP_DICT == 3);
+	ut_ad(DICT_NUM_FIELDS__SYS_ZIP_DICT == 5);
+	ut_ad(DICT_NUM_COLS__SYS_ZIP_DICT_COLS == 3);
+	ut_ad(DICT_NUM_FIELDS__SYS_ZIP_DICT_COLS == 5);
 
 	mtr_start(&mtr);
 
diff --git a/storage/xtradb/dict/dict0crea.cc b/storage/xtradb/dict/dict0crea.cc
index 870e452e5f2..4a7bd2e8a4e 100644
--- a/storage/xtradb/dict/dict0crea.cc
+++ b/storage/xtradb/dict/dict0crea.cc
@@ -38,6 +38,7 @@ Created 1/8/1996 Heikki Tuuri
 #include "que0que.h"
 #include "row0ins.h"
 #include "row0mysql.h"
+#include "row0sel.h"
 #include "pars0pars.h"
 #include "trx0roll.h"
 #include "usr0sess.h"
@@ -1935,6 +1936,135 @@ dict_create_or_check_sys_tablespace(void)
 	return(err);
 }
 
+/** Creates the SYS_ZIP_DICT and SYS_ZIP_DICT_COLS system tables inside
+InnoDB at server bootstrap or server start if they are not found or are
+not of the right form.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_create_or_check_sys_zip_dict(void)
+{
+	trx_t*		trx;
+	my_bool		srv_file_per_table_backup;
+	dberr_t		err;
+	dberr_t		sys_zip_dict_err;
+	dberr_t		sys_zip_dict_cols_err;
+
+	ut_a(srv_get_active_thread_type() == SRV_NONE);
+
+	/* Note: The master thread has not been started at this point. */
+
+	sys_zip_dict_err = dict_check_if_system_table_exists(
+		"SYS_ZIP_DICT", DICT_NUM_FIELDS__SYS_ZIP_DICT + 1, 2);
+	sys_zip_dict_cols_err = dict_check_if_system_table_exists(
+		"SYS_ZIP_DICT_COLS", DICT_NUM_FIELDS__SYS_ZIP_DICT_COLS + 1,
+		1);
+
+	if (sys_zip_dict_err == DB_SUCCESS &&
+		sys_zip_dict_cols_err == DB_SUCCESS)
+		return (DB_SUCCESS);
+
+	trx = trx_allocate_for_mysql();
+
+	trx_set_dict_operation(trx, TRX_DICT_OP_TABLE);
+
+	trx->op_info = "creating zip_dict and zip_dict_cols sys tables";
+
+	row_mysql_lock_data_dictionary(trx);
+
+	/* Check which incomplete table definition to drop. */
+
+	if (sys_zip_dict_err == DB_CORRUPTION) {
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"Dropping incompletely created "
+			"SYS_ZIP_DICT table.");
+		row_drop_table_for_mysql("SYS_ZIP_DICT", trx, TRUE, TRUE);
+	}
+	if (sys_zip_dict_cols_err == DB_CORRUPTION) {
+		ib_logf(IB_LOG_LEVEL_WARN,
+			"Dropping incompletely created "
+			"SYS_ZIP_DICT_COLS table.");
+		row_drop_table_for_mysql("SYS_ZIP_DICT_COLS", trx, TRUE, TRUE);
+	}
+
+	ib_logf(IB_LOG_LEVEL_INFO,
+		"Creating zip_dict and zip_dict_cols system tables.");
+
+	/* We always want SYSTEM tables to be created inside the system
+	tablespace. */
+	srv_file_per_table_backup = srv_file_per_table;
+	srv_file_per_table = 0;
+
+	err = que_eval_sql(
+		NULL,
+		"PROCEDURE CREATE_SYS_ZIP_DICT_PROC () IS\n"
+		"BEGIN\n"
+		"CREATE TABLE SYS_ZIP_DICT(\n"
+		"  ID INT UNSIGNED NOT NULL,\n"
+		"  NAME CHAR("
+		  STRINGIFY_ARG(ZIP_DICT_MAX_NAME_LENGTH)
+		") NOT NULL,\n"
+		"  DATA BLOB NOT NULL\n"
+		");\n"
+		"CREATE UNIQUE CLUSTERED INDEX SYS_ZIP_DICT_ID"
+		" ON SYS_ZIP_DICT (ID);\n"
+		"CREATE UNIQUE INDEX SYS_ZIP_DICT_NAME"
+		" ON SYS_ZIP_DICT (NAME);\n"
+		"CREATE TABLE SYS_ZIP_DICT_COLS(\n"
+		"  TABLE_ID INT UNSIGNED NOT NULL,\n"
+		"  COLUMN_POS INT UNSIGNED NOT NULL,\n"
+		"  DICT_ID INT UNSIGNED NOT NULL\n"
+		");\n"
+		"CREATE UNIQUE CLUSTERED INDEX SYS_ZIP_DICT_COLS_COMPOSITE"
+		" ON SYS_ZIP_DICT_COLS (TABLE_ID, COLUMN_POS);\n"
+		"END;\n",
+		FALSE, trx);
+
+	if (err != DB_SUCCESS) {
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Creation of SYS_ZIP_DICT and SYS_ZIP_DICT_COLS "
+			"has failed with error %lu. Tablespace is full. "
+			"Dropping incompletely created tables.",
+			(ulong) err);
+
+		ut_a(err == DB_OUT_OF_FILE_SPACE
+			|| err == DB_TOO_MANY_CONCURRENT_TRXS);
+
+		row_drop_table_for_mysql("SYS_ZIP_DICT", trx, TRUE, TRUE);
+		row_drop_table_for_mysql("SYS_ZIP_DICT_COLS", trx, TRUE, TRUE);
+
+		if (err == DB_OUT_OF_FILE_SPACE) {
+			err = DB_MUST_GET_MORE_FILE_SPACE;
+		}
+	}
+
+	trx_commit_for_mysql(trx);
+
+	row_mysql_unlock_data_dictionary(trx);
+
+	trx_free_for_mysql(trx);
+
+	srv_file_per_table = srv_file_per_table_backup;
+
+	if (err == DB_SUCCESS) {
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"zip_dict and zip_dict_cols system tables created.");
+	}
+
+	/* Note: The master thread has not been started at this point. */
+	/* Confirm and move to the non-LRU part of the table LRU list. */
+
+	sys_zip_dict_err = dict_check_if_system_table_exists(
+		"SYS_ZIP_DICT", DICT_NUM_FIELDS__SYS_ZIP_DICT + 1, 2);
+	ut_a(sys_zip_dict_err == DB_SUCCESS);
+	sys_zip_dict_cols_err = dict_check_if_system_table_exists(
+		"SYS_ZIP_DICT_COLS",
+		DICT_NUM_FIELDS__SYS_ZIP_DICT_COLS + 1, 1);
+	ut_a(sys_zip_dict_cols_err == DB_SUCCESS);
+
+	return(err);
+}
+
 /********************************************************************//**
 Add a single tablespace definition to the data dictionary tables in the
 database.
@@ -1988,3 +2118,456 @@ dict_create_add_tablespace_to_dictionary(
 
 	return(error);
 }
+
+/** Add a single compression dictionary definition to the SYS_ZIP_DICT
+InnoDB system table; the new row gets the highest existing ID plus one.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+dict_create_add_zip_dict(
+	const char*	name,		/*!< in: dict name */
+	ulint		name_len,	/*!< in: dict name length */
+	const char*	data,		/*!< in: dict data */
+	ulint		data_len,	/*!< in: dict data length */
+	trx_t*		trx)		/*!< in/out: transaction */
+{
+	ut_ad(name);
+	ut_ad(data);
+
+	pars_info_t* info = pars_info_create();
+
+	pars_info_add_literal(info, "name", name, name_len,
+		DATA_VARCHAR, DATA_ENGLISH);
+	pars_info_add_literal(info, "data", data, data_len,
+		DATA_BLOB, DATA_BINARY_TYPE | DATA_NOT_NULL);
+
+	dberr_t error = que_eval_sql(info,
+		"PROCEDURE P () IS\n"
+		"  max_id INT;\n"
+		"DECLARE CURSOR cur IS\n"
+		"  SELECT ID FROM SYS_ZIP_DICT\n"
+		"  ORDER BY ID DESC;\n"
+		"BEGIN\n"
+		"  max_id := 0;\n"
+		"  OPEN cur;\n"
+		"  FETCH cur INTO max_id;\n"
+		"  IF (cur % NOTFOUND) THEN\n"
+		"    max_id := 0;\n"
+		"  END IF;\n"
+		"  CLOSE cur;\n"
+		"  INSERT INTO SYS_ZIP_DICT VALUES"
+		"    (max_id + 1, :name, :data);\n"
+		"END;\n",
+		FALSE, trx);
+
+	return error;
+}
+
+/** Fetch callback, copies the raw 4-byte DATA_INT value of the first
+selected column into the caller-supplied buffer.
+@return always TRUE */
+static
+ibool
+dict_create_extract_int_aux(
+	void*	row,		/*!< in: sel_node_t* */
+	void*	user_arg)	/*!< out: 4-byte buffer for the value */
+{
+	sel_node_t*	node = static_cast<sel_node_t*>(row);
+	dfield_t*	dfield = que_node_get_val(node->select_list);
+	dtype_t*	type = dfield_get_type(dfield);
+	ulint		len = dfield_get_len(dfield);
+
+	ut_a(dtype_get_mtype(type) == DATA_INT);
+	ut_a(len == sizeof(ib_uint32_t));
+
+	memcpy(user_arg, dfield_get_data(dfield), sizeof(ib_uint32_t));
+
+	return(TRUE);
+}
+
+/** Add a single compression dictionary reference to the SYS_ZIP_DICT_COLS
+InnoDB system table as a (table id, column pos, dict id) row.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+dict_create_add_zip_dict_reference(
+	ulint		table_id,	/*!< in: table id */
+	ulint		column_pos,	/*!< in: column position */
+	ulint		dict_id,	/*!< in: dict id */
+	trx_t*		trx)		/*!< in/out: transaction */
+{
+	pars_info_t* info = pars_info_create();
+
+	pars_info_add_int4_literal(info, "table_id", table_id);
+	pars_info_add_int4_literal(info, "column_pos", column_pos);
+	pars_info_add_int4_literal(info, "dict_id", dict_id);
+
+	dberr_t error = que_eval_sql(info,
+		"PROCEDURE P () IS\n"
+		"BEGIN\n"
+		"  INSERT INTO SYS_ZIP_DICT_COLS VALUES"
+		"    (:table_id, :column_pos, :dict_id);\n"
+		"END;\n",
+		FALSE, trx);
+	return error;
+}
+
+/** Get a single compression dictionary id for the given
+(table id, column pos) pair.
+@return	DB_SUCCESS, DB_RECORD_NOT_FOUND if no id was fetched, or error code */
+UNIV_INTERN
+dberr_t
+dict_create_get_zip_dict_id_by_reference(
+	ulint	table_id,	/*!< in: table id */
+	ulint	column_pos,	/*!< in: column position */
+	ulint*	dict_id,	/*!< out: dict id, set on DB_SUCCESS */
+	trx_t*	trx)		/*!< in/out: transaction */
+{
+	ut_ad(dict_id);
+
+	pars_info_t* info = pars_info_create();
+
+	ib_uint32_t dict_id_buf;
+	mach_write_to_4(reinterpret_cast<byte*>(&dict_id_buf ),
+		ULINT32_UNDEFINED);
+
+	pars_info_add_int4_literal(info, "table_id", table_id);
+	pars_info_add_int4_literal(info, "column_pos", column_pos);
+	pars_info_bind_function(
+		info, "my_func", dict_create_extract_int_aux, &dict_id_buf);
+
+	dberr_t error = que_eval_sql(info,
+		"PROCEDURE P () IS\n"
+		"DECLARE FUNCTION my_func;\n"
+		"DECLARE CURSOR cur IS\n"
+		"  SELECT DICT_ID FROM SYS_ZIP_DICT_COLS\n"
+		"    WHERE TABLE_ID = :table_id AND\n"
+		"          COLUMN_POS = :column_pos;\n"
+		"BEGIN\n"
+		"  OPEN cur;\n"
+		"  FETCH cur INTO my_func();\n"
+		"  CLOSE cur;\n"
+		"END;\n",
+		FALSE, trx);
+	if (error == DB_SUCCESS) {
+		ib_uint32_t local_dict_id = mach_read_from_4(
+			reinterpret_cast<const byte*>(&dict_id_buf));
+		if (local_dict_id == ULINT32_UNDEFINED)
+			error = DB_RECORD_NOT_FOUND;
+		else
+			*dict_id = local_dict_id;
+	}
+	return error;
+}
+
+/** Get compression dictionary id for the given name.
+@return	DB_SUCCESS, DB_RECORD_NOT_FOUND if no id was fetched, or error code */
+UNIV_INTERN
+dberr_t
+dict_create_get_zip_dict_id_by_name(
+	const char*	dict_name,	/*!< in: dict name */
+	ulint		dict_name_len,	/*!< in: dict name length */
+	ulint*		dict_id,	/*!< out: dict id, set on DB_SUCCESS */
+	trx_t*		trx)		/*!< in/out: transaction */
+{
+	ut_ad(dict_name);
+	ut_ad(dict_name_len);
+	ut_ad(dict_id);
+
+	pars_info_t* info = pars_info_create();
+
+	pars_info_add_literal(info, "dict_name", dict_name, dict_name_len,
+		DATA_VARCHAR, DATA_ENGLISH);
+
+	ib_uint32_t dict_id_buf;
+	mach_write_to_4(reinterpret_cast<byte*>(&dict_id_buf),
+		ULINT32_UNDEFINED);
+	pars_info_bind_function(
+		info, "my_func", dict_create_extract_int_aux, &dict_id_buf);
+
+	dberr_t error = que_eval_sql(info,
+		"PROCEDURE P () IS\n"
+		"DECLARE FUNCTION my_func;\n"
+		"DECLARE CURSOR cur IS\n"
+		"  SELECT ID FROM SYS_ZIP_DICT\n"
+		"    WHERE NAME = :dict_name;\n"
+		"BEGIN\n"
+		"  OPEN cur;\n"
+		"  FETCH cur INTO my_func();\n"
+		"  CLOSE cur;\n"
+		"END;\n",
+		FALSE, trx);
+	if (error == DB_SUCCESS) {
+		ib_uint32_t local_dict_id = mach_read_from_4(
+			reinterpret_cast<const byte*>(&dict_id_buf));
+		if (local_dict_id == ULINT32_UNDEFINED)
+			error = DB_RECORD_NOT_FOUND;
+		else
+			*dict_id = local_dict_id;
+	}
+	return error;
+}
+
+/** Result codes of zip dict info extraction (see zip_dict_info_aux::code) */
+enum zip_dict_info_aux_code {
+	zip_dict_info_success,		/*!< success */
+	zip_dict_info_not_found,	/*!< zip dict record not found */
+	zip_dict_info_oom,		/*!< out of memory */
+	zip_dict_info_corrupted_name,	/*!< zip dict name is SQL NULL */
+	zip_dict_info_corrupted_data	/*!< zip dict data is SQL NULL */
+};
+
+/** Auxiliary struct used to return zip dict info along with result code */
+struct zip_dict_info_aux {
+	LEX_STRING	name;	/*!< zip dict name */
+	LEX_STRING	data;	/*!< zip dict data */
+	int		code;	/*!< zip_dict_info_aux_code (0 - success) */
+};
+
+/** Fetch callback, copies the NAME and DATA columns of the fetched
+SYS_ZIP_DICT row into my_malloc()-ed buffers of the zip_dict_info_aux.
+@return always returns TRUE */
+static
+ibool
+dict_create_get_zip_dict_info_by_id_aux(
+	void*	row,		/*!< in: sel_node_t* */
+	void*	user_arg)	/*!< in/out: zip_dict_info_aux* to fill */
+{
+	sel_node_t*		node = static_cast<sel_node_t*>(row);
+	zip_dict_info_aux*	result =
+		static_cast<zip_dict_info_aux*>(user_arg);
+
+	result->code = zip_dict_info_success;
+	result->name.str = 0;
+	result->name.length = 0;
+	result->data.str = 0;
+	result->data.length = 0;
+
+	/* NAME field */
+	que_node_t*	exp = node->select_list;
+	ut_a(exp != 0);
+
+	dfield_t*	dfield = que_node_get_val(exp);
+	dtype_t*	type = dfield_get_type(dfield);
+	ut_a(dtype_get_mtype(type) == DATA_VARCHAR);
+
+	ulint	len = dfield_get_len(dfield);
+	void*	data = dfield_get_data(dfield);
+
+	if (len == UNIV_SQL_NULL) {
+		result->code = zip_dict_info_corrupted_name;
+	}
+	else {
+		result->name.str =
+			static_cast<char*>(my_malloc(len + 1, MYF(0)));
+		if (result->name.str == 0) {
+			result->code = zip_dict_info_oom;
+		}
+		else {
+			memcpy(result->name.str, data, len);
+			result->name.str[len] = '\0';
+			result->name.length = len;
+		}
+	}
+
+	/* DATA field */
+	exp = que_node_get_next(exp);
+	ut_a(exp != 0);
+
+	dfield = que_node_get_val(exp);
+	type = dfield_get_type(dfield);
+	ut_a(dtype_get_mtype(type) == DATA_BLOB);
+
+	len = dfield_get_len(dfield);
+	data = dfield_get_data(dfield);
+
+	if (len == UNIV_SQL_NULL) {
+		result->code = zip_dict_info_corrupted_data;
+	}
+	else {
+		result->data.str =
+			static_cast<char*>(my_malloc(
+				len == 0 ? 1 : len, MYF(0)));
+		if (result->data.str == 0) {
+			result->code = zip_dict_info_oom;
+		}
+		else {
+			memcpy(result->data.str, data, len);
+			result->data.length = len;
+		}
+	}
+
+	ut_ad(que_node_get_next(exp) == 0);
+
+	/* On failure release any my_malloc()-ed buffer with my_free(). */
+	if (result->code != zip_dict_info_success) {
+		if (result->name.str != 0) {
+			my_free(result->name.str);
+			result->name.str = 0;
+			result->name.length = 0;
+		}
+		if (result->data.str != 0) {
+			my_free(result->data.str);
+			result->data.str = 0;
+			result->data.length = 0;
+		}
+	}
+
+	return TRUE;
+}
+
+/** Get compression dictionary info (name and data) for the given id.
+On success *name and *data are buffers the fetch callback allocated with
+my_malloc(); NOTE(review): release presumably via my_free() - confirm.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+dict_create_get_zip_dict_info_by_id(
+	ulint	dict_id,	/*!< in: dict id */
+	char**	name,		/*!< out: dict name */
+	ulint*	name_len,	/*!< out: dict name length*/
+	char**	data,		/*!< out: dict data */
+	ulint*	data_len,	/*!< out: dict data length*/
+	trx_t*	trx)		/*!< in/out: transaction */
+{
+	ut_ad(name);
+	ut_ad(data);
+
+	zip_dict_info_aux rec;
+	rec.code = zip_dict_info_not_found;
+	pars_info_t* info = pars_info_create();
+
+	pars_info_add_int4_literal(info, "id", dict_id);
+	pars_info_bind_function(
+		info, "my_func", dict_create_get_zip_dict_info_by_id_aux,
+		&rec);
+
+	dberr_t error = que_eval_sql(info,
+		"PROCEDURE P () IS\n"
+		"DECLARE FUNCTION my_func;\n"
+		"DECLARE CURSOR cur IS\n"
+		"  SELECT NAME, DATA FROM SYS_ZIP_DICT\n"
+		"    WHERE ID = :id;\n"
+		"BEGIN\n"
+		"  OPEN cur;\n"
+		"  FETCH cur INTO my_func();\n"
+		"  CLOSE cur;\n"
+		"END;\n",
+		FALSE, trx);
+	if (error == DB_SUCCESS) {
+		switch (rec.code) {
+			case zip_dict_info_success:
+				*name = rec.name.str;
+				*name_len = rec.name.length;
+				*data = rec.data.str;
+				*data_len = rec.data.length;
+				break;
+			case zip_dict_info_not_found:
+				error = DB_RECORD_NOT_FOUND;
+				break;
+			case zip_dict_info_oom:
+				error = DB_OUT_OF_MEMORY;
+				break;
+			case zip_dict_info_corrupted_name:
+			case zip_dict_info_corrupted_data:
+				error = DB_INVALID_NULL;
+				break;
+			default:
+				ut_error;
+		}
+	}
+	return error;
+}
+
+/** Remove a single compression dictionary from SYS_ZIP_DICT, unless
+SYS_ZIP_DICT_COLS still holds a reference to its id.
+@return	DB_SUCCESS, DB_RECORD_NOT_FOUND, DB_ROW_IS_REFERENCED or error code */
+UNIV_INTERN
+dberr_t
+dict_create_remove_zip_dict(
+	const char*	name,		/*!< in: dict name */
+	ulint		name_len,	/*!< in: dict name length */
+	trx_t*		trx)		/*!< in/out: transaction */
+{
+	ut_ad(name);
+
+	pars_info_t* info = pars_info_create();
+
+	ib_uint32_t dict_id_buf;
+	mach_write_to_4(reinterpret_cast<byte*>(&dict_id_buf),
+		ULINT32_UNDEFINED);
+	ib_uint32_t counter_buf;
+	mach_write_to_4(reinterpret_cast<byte*>(&counter_buf),
+		ULINT32_UNDEFINED);
+
+	pars_info_add_literal(info, "name", name, name_len,
+		DATA_VARCHAR, DATA_ENGLISH);
+	pars_info_bind_int4_literal(info, "dict_id", &dict_id_buf);
+	pars_info_bind_function(info, "find_dict_func",
+		dict_create_extract_int_aux, &dict_id_buf);
+	pars_info_bind_function(info, "count_func",
+		dict_create_extract_int_aux, &counter_buf);
+
+	dberr_t error = que_eval_sql(info,
+		"PROCEDURE P () IS\n"
+		"DECLARE FUNCTION find_dict_func;\n"
+		"DECLARE FUNCTION count_func;\n"
+		"DECLARE CURSOR dict_cur IS\n"
+		"  SELECT ID FROM SYS_ZIP_DICT\n"
+		"    WHERE NAME = :name\n"
+		"  FOR UPDATE;\n"
+		"DECLARE CURSOR ref_cur IS\n"
+		"  SELECT 1 FROM SYS_ZIP_DICT_COLS\n"
+		"    WHERE DICT_ID = :dict_id;\n"
+		"BEGIN\n"
+		"  OPEN dict_cur;\n"
+		"  FETCH dict_cur INTO find_dict_func();\n"
+		"  IF NOT (SQL % NOTFOUND) THEN\n"
+		"    OPEN ref_cur;\n"
+		"    FETCH ref_cur INTO count_func();\n"
+		"    IF SQL % NOTFOUND THEN\n"
+		"      DELETE FROM SYS_ZIP_DICT WHERE CURRENT OF dict_cur;\n"
+		"    END IF;\n"
+		"    CLOSE ref_cur;\n"
+		"  END IF;\n"
+		"  CLOSE dict_cur;\n"
+		"END;\n",
+		FALSE, trx);
+	if (error == DB_SUCCESS) {
+		ib_uint32_t local_dict_id = mach_read_from_4(
+			reinterpret_cast<const byte*>(&dict_id_buf));
+		if (local_dict_id == ULINT32_UNDEFINED) {
+			error = DB_RECORD_NOT_FOUND;
+		}
+		else {
+			ib_uint32_t local_counter = mach_read_from_4(
+				reinterpret_cast<const byte*>(&counter_buf));
+			if (local_counter != ULINT32_UNDEFINED)
+				error = DB_ROW_IS_REFERENCED;
+		}
+	}
+	return error;
+}
+
+/** Remove all compression dictionary references for the given table ID from
+the SYS_ZIP_DICT_COLS InnoDB system table.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+dict_create_remove_zip_dict_references_for_table(
+	ulint	table_id,	/*!< in: table id */
+	trx_t*	trx)		/*!< in/out: transaction */
+{
+	pars_info_t* info = pars_info_create();
+
+	pars_info_add_int4_literal(info, "table_id", table_id);
+
+	dberr_t error = que_eval_sql(info,
+		"PROCEDURE P () IS\n"
+		"BEGIN\n"
+		"  DELETE FROM SYS_ZIP_DICT_COLS\n"
+		"    WHERE TABLE_ID = :table_id;\n"
+		"END;\n",
+		FALSE, trx);
+	return error;
+}
diff --git a/storage/xtradb/dict/dict0dict.cc b/storage/xtradb/dict/dict0dict.cc
index 1b6194ca098..dc75996ac7b 100644
--- a/storage/xtradb/dict/dict0dict.cc
+++ b/storage/xtradb/dict/dict0dict.cc
@@ -7341,3 +7341,161 @@ dict_tf_to_row_format_string(
 	return(0);
 }
 #endif /* !UNIV_HOTBACKUP */
+
+/** Insert a record into SYS_ZIP_DICT.
+@retval	DB_SUCCESS	if OK
+@retval	dberr_t		if the insert failed */
+UNIV_INTERN
+dberr_t
+dict_create_zip_dict(
+	const char*	name,		/*!< in: zip_dict name */
+	ulint		name_len,	/*!< in: zip_dict name length*/
+	const char*	data,		/*!< in: zip_dict data */
+	ulint		data_len)	/*!< in: zip_dict data length */
+{
+	dberr_t		err = DB_SUCCESS;
+	trx_t*		trx;
+
+	ut_ad(name);
+	ut_ad(data);
+
+	rw_lock_x_lock(&dict_operation_lock);
+	dict_mutex_enter_for_mysql();
+
+	trx = trx_allocate_for_background();
+	trx->op_info = "insert zip_dict";
+	trx->dict_operation_lock_mode = RW_X_LATCH;
+	trx_start_if_not_started(trx);
+
+	err = dict_create_add_zip_dict(name, name_len, data, data_len, trx);
+
+	if (err == DB_SUCCESS) {
+		trx_commit_for_mysql(trx);
+	}
+	else {
+		trx->op_info = "rollback of internal trx on zip_dict table";
+		trx_rollback_to_savepoint(trx, NULL);
+		ut_a(trx->error_state == DB_SUCCESS);
+	}
+	trx->op_info = "";
+	trx->dict_operation_lock_mode = 0;
+	trx_free_for_background(trx);
+
+	dict_mutex_exit_for_mysql();
+	rw_lock_x_unlock(&dict_operation_lock);
+
+	return err;
+}
+/** Get single compression dictionary id for the given
+(table id, column pos) pair, using a short-lived background transaction.
+@retval	DB_SUCCESS		if OK
+@retval	DB_RECORD_NOT_FOUND	if not found */
+UNIV_INTERN
+dberr_t
+dict_get_dictionary_id_by_key(
+	ulint	table_id,	/*!< in: table id */
+	ulint	column_pos,	/*!< in: column position */
+	ulint*	dict_id)	/*!< out: zip_dict id */
+{
+	dberr_t		err = DB_SUCCESS;
+	trx_t*		trx;
+
+	rw_lock_s_lock(&dict_operation_lock);
+	dict_mutex_enter_for_mysql();
+
+	trx = trx_allocate_for_background();
+	trx->op_info = "get zip dict id by composite key";
+	trx->dict_operation_lock_mode = RW_S_LATCH;
+	trx_start_if_not_started(trx);
+
+	err = dict_create_get_zip_dict_id_by_reference(table_id, column_pos,
+		dict_id, trx);
+
+	trx_commit_for_mysql(trx);
+	trx->dict_operation_lock_mode = 0;
+	trx_free_for_background(trx);
+
+	dict_mutex_exit_for_mysql();
+	rw_lock_s_unlock(&dict_operation_lock);
+
+	return err;
+}
+/** Get compression dictionary info (name and data) for the given id.
+Allocates memory for *name and *data on success; the caller must free it
+(NOTE(review): buffers come from my_malloc(), so presumably my_free()).
+@retval	DB_SUCCESS		if OK
+@retval	DB_RECORD_NOT_FOUND	if not found */
+UNIV_INTERN
+dberr_t
+dict_get_dictionary_info_by_id(
+	ulint	dict_id,	/*!< in: dictionary id */
+	char**	name,		/*!< out: dictionary name */
+	ulint*	name_len,	/*!< out: dictionary name length*/
+	char**	data,		/*!< out: dictionary data */
+	ulint*	data_len)	/*!< out: dictionary data length*/
+{
+	dberr_t		err = DB_SUCCESS;
+	trx_t*		trx;
+
+	rw_lock_s_lock(&dict_operation_lock);
+	dict_mutex_enter_for_mysql();
+
+	trx = trx_allocate_for_background();
+	trx->op_info = "get zip dict name and data by id";
+	trx->dict_operation_lock_mode = RW_S_LATCH;
+	trx_start_if_not_started(trx);
+
+	err = dict_create_get_zip_dict_info_by_id(dict_id, name, name_len,
+		data, data_len, trx);
+
+	trx_commit_for_mysql(trx);
+	trx->dict_operation_lock_mode = 0;
+	trx_free_for_background(trx);
+
+	dict_mutex_exit_for_mysql();
+	rw_lock_s_unlock(&dict_operation_lock);
+
+	return err;
+}
+/** Delete the record in SYS_ZIP_DICT with the given name; rolls back on error.
+@retval	DB_SUCCESS		if OK
+@retval	DB_RECORD_NOT_FOUND	if not found
+@retval	DB_ROW_IS_REFERENCED	if in use */
+UNIV_INTERN
+dberr_t
+dict_drop_zip_dict(
+	const char*	name,		/*!< in: zip_dict name */
+	ulint		name_len)	/*!< in: zip_dict name length*/
+{
+	dberr_t		err = DB_SUCCESS;
+	trx_t*		trx;
+
+	ut_ad(name);
+
+	rw_lock_x_lock(&dict_operation_lock);
+	dict_mutex_enter_for_mysql();
+
+	trx = trx_allocate_for_background();
+	trx->op_info = "delete zip_dict";
+	trx->dict_operation_lock_mode = RW_X_LATCH;
+	trx_start_if_not_started(trx);
+
+	err = dict_create_remove_zip_dict(name, name_len, trx);
+
+	if (err == DB_SUCCESS) {
+		trx_commit_for_mysql(trx);
+	}
+	else {
+		trx->op_info = "rollback of internal trx on zip_dict table";
+		trx_rollback_to_savepoint(trx, NULL);
+		ut_a(trx->error_state == DB_SUCCESS);
+	}
+	trx->op_info = "";
+	trx->dict_operation_lock_mode = 0;
+	trx_free_for_background(trx);
+
+	dict_mutex_exit_for_mysql();
+	rw_lock_x_unlock(&dict_operation_lock);
+
+	return err;
+}
diff --git a/storage/xtradb/dict/dict0load.cc b/storage/xtradb/dict/dict0load.cc
index 45f314f8c67..b4ffb6ddf02 100644
--- a/storage/xtradb/dict/dict0load.cc
+++ b/storage/xtradb/dict/dict0load.cc
@@ -57,7 +57,9 @@ static const char* SYSTEM_TABLE_NAME[] = {
 	"SYS_FOREIGN",
 	"SYS_FOREIGN_COLS",
 	"SYS_TABLESPACES",
-	"SYS_DATAFILES"
+	"SYS_DATAFILES",
+	"SYS_ZIP_DICT",
+	"SYS_ZIP_DICT_COLS"
 };
 
 /* If this flag is TRUE, then we will load the cluster index's (and tables')
@@ -729,6 +731,161 @@ err_len:
 	return(NULL);
 }
 
+/** This function parses a SYS_ZIP_DICT record, extracts necessary
+information from the record and returns to caller.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_process_sys_zip_dict(
+	mem_heap_t*	heap,		/*!< in/out: heap memory */
+	ulint		zip_size,	/*!< in: nonzero=compressed BLOB page size */
+	const rec_t*	rec,		/*!< in: current SYS_ZIP_DICT rec */
+	ulint*		id,		/*!< out: dict id */
+	const char**	name,		/*!< out: dict name (copy in heap) */
+	const char**	data,		/*!< out: dict data (copy in heap) */
+	ulint*		data_len)	/*!< out: dict data length */
+{
+	ulint		len;
+	const byte*	field;
+
+	/* Initialize the output values */
+	*id = ULINT_UNDEFINED;
+	*name = NULL;
+	*data = NULL;
+	*data_len = 0;
+
+	if (UNIV_UNLIKELY(rec_get_deleted_flag(rec, 0))) {
+		return("delete-marked record in SYS_ZIP_DICT");
+	}
+
+	if (UNIV_UNLIKELY(
+		rec_get_n_fields_old(rec)!= DICT_NUM_FIELDS__SYS_ZIP_DICT)) {
+		return("wrong number of columns in SYS_ZIP_DICT record");
+	}
+
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_ZIP_DICT__ID, &len);
+	if (UNIV_UNLIKELY(len != DICT_FLD_LEN_SPACE)) {
+		goto err_len;
+	}
+	*id = mach_read_from_4(field);
+
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_ZIP_DICT__DB_TRX_ID, &len);
+	if (UNIV_UNLIKELY(len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL)) {
+		goto err_len;
+	}
+
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_ZIP_DICT__DB_ROLL_PTR, &len);
+	if (UNIV_UNLIKELY(len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL)) {
+		goto err_len;
+	}
+
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_ZIP_DICT__NAME, &len);
+	if (UNIV_UNLIKELY(len == 0 || len == UNIV_SQL_NULL)) {
+		goto err_len;
+	}
+	*name = mem_heap_strdupl(heap, (char*) field, len);
+
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_ZIP_DICT__DATA, &len);
+	if (UNIV_UNLIKELY(len == UNIV_SQL_NULL)) {
+		goto err_len;
+	}
+
+	if (rec_get_1byte_offs_flag(rec) == 0 &&
+		rec_2_is_field_extern(rec, DICT_FLD__SYS_ZIP_DICT__DATA)) {
+		ut_a(len >= BTR_EXTERN_FIELD_REF_SIZE);
+
+		if (UNIV_UNLIKELY
+			(!memcmp(field + len - BTR_EXTERN_FIELD_REF_SIZE,
+				field_ref_zero,
+				BTR_EXTERN_FIELD_REF_SIZE))) {
+			goto err_len;
+		}
+		*data = reinterpret_cast<char*>(
+			btr_copy_externally_stored_field(data_len, field,
+							zip_size, len, heap, 0));
+	}
+	else {
+		*data_len = len;
+		*data = static_cast<char*>(mem_heap_dup(heap, field, len));
+	}
+
+	return(NULL);
+
+err_len:
+	return("incorrect column length in SYS_ZIP_DICT");
+}
+
+/** This function parses a SYS_ZIP_DICT_COLS record, extracts necessary
+information from the record and returns to caller.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_process_sys_zip_dict_cols(
+	mem_heap_t*	heap,		/*!< in/out: heap memory (unused here) */
+	const rec_t*	rec,		/*!< in: current SYS_ZIP_DICT_COLS rec */
+	ulint*		table_id,	/*!< out: table id */
+	ulint*		column_pos,	/*!< out: column position */
+	ulint*		dict_id)	/*!< out: dict id */
+{
+	ulint		len;
+	const byte*	field;
+
+	/* Initialize the output values */
+	*table_id = ULINT_UNDEFINED;
+	*column_pos = ULINT_UNDEFINED;
+	*dict_id = ULINT_UNDEFINED;
+
+	if (UNIV_UNLIKELY(rec_get_deleted_flag(rec, 0))) {
+		return("delete-marked record in SYS_ZIP_DICT_COLS");
+	}
+
+	if (UNIV_UNLIKELY(rec_get_n_fields_old(rec) !=
+		DICT_NUM_FIELDS__SYS_ZIP_DICT_COLS)) {
+		return("wrong number of columns in SYS_ZIP_DICT_COLS"
+			" record");
+	}
+
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_ZIP_DICT_COLS__TABLE_ID, &len);
+	if (UNIV_UNLIKELY(len != DICT_FLD_LEN_SPACE)) {
+err_len:
+		return("incorrect column length in SYS_ZIP_DICT_COLS");
+	}
+	*table_id = mach_read_from_4(field);
+
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_ZIP_DICT_COLS__COLUMN_POS, &len);
+	if (UNIV_UNLIKELY(len != DICT_FLD_LEN_SPACE)) {
+		goto err_len;
+	}
+	*column_pos = mach_read_from_4(field);
+
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_ZIP_DICT_COLS__DB_TRX_ID, &len);
+	if (UNIV_UNLIKELY(len != DATA_TRX_ID_LEN && len != UNIV_SQL_NULL)) {
+		goto err_len;
+	}
+
+	rec_get_nth_field_offs_old(
+		rec, DICT_FLD__SYS_ZIP_DICT_COLS__DB_ROLL_PTR, &len);
+	if (UNIV_UNLIKELY(len != DATA_ROLL_PTR_LEN && len != UNIV_SQL_NULL)) {
+		goto err_len;
+	}
+
+	field = rec_get_nth_field_old(
+		rec, DICT_FLD__SYS_ZIP_DICT_COLS__DICT_ID, &len);
+	if (UNIV_UNLIKELY(len != DICT_FLD_LEN_SPACE)) {
+		goto err_len;
+	}
+	*dict_id = mach_read_from_4(field);
+
+	return(NULL);
+}
 /********************************************************************//**
 Determine the flags of a table as stored in SYS_TABLES.TYPE and N_COLS.
 @return  ULINT_UNDEFINED if error, else a valid dict_table_t::flags. */
@@ -1154,11 +1311,14 @@ loop:
 					space_id, name);
 			}
 
-			/* We need to read page 0 to get (optional) IV
-			regardless if encryptions is turned on or not,
-			since if it's off we should decrypt a potentially
-			already encrypted table */
-			bool read_page_0 = true;
+			/* We could read page 0 to get (optional) IV
+			if encryption is turned on, if it's off
+			we will read the page 0 later and find out
+			if we should decrypt a potentially
+			already encrypted table
+			bool read_page_0 = srv_encrypt_tables; */
+
+			bool read_page_0 = false;
 
 			/* We set the 2nd param (fix_dict = true)
 			here because we already have an x-lock on
diff --git a/storage/xtradb/dict/dict0stats.cc b/storage/xtradb/dict/dict0stats.cc
index 5c283f693d5..c13d4583fef 100644
--- a/storage/xtradb/dict/dict0stats.cc
+++ b/storage/xtradb/dict/dict0stats.cc
@@ -708,7 +708,10 @@ void
 dict_stats_copy(
 /*============*/
 	dict_table_t*		dst,	/*!< in/out: destination table */
-	const dict_table_t*	src)	/*!< in: source table */
+	const dict_table_t*	src,	/*!< in: source table */
+	bool reset_ignored_indexes)	/*!< in: if true, set ignored indexes
+                                             to have the same statistics as if 
+                                             the table was empty */
 {
 	dst->stats_last_recalc = src->stats_last_recalc;
 	dst->stat_n_rows = src->stat_n_rows;
@@ -727,7 +730,16 @@ dict_stats_copy(
 	      && (src_idx = dict_table_get_next_index(src_idx)))) {
 
 		if (dict_stats_should_ignore_index(dst_idx)) {
-			continue;
+			if (reset_ignored_indexes) {
+				/* Reset index statistics for all ignored indexes,
+				unless they are FT indexes (these have no statistics)*/
+				if (dst_idx->type & DICT_FTS) {
+					continue;
+				}
+				dict_stats_empty_index(dst_idx, true);
+			} else {
+				continue;
+			}
 		}
 
 		ut_ad(!dict_index_is_univ(dst_idx));
@@ -827,7 +839,7 @@ dict_stats_snapshot_create(
 
 	t = dict_stats_table_clone_create(table);
 
-	dict_stats_copy(t, table);
+	dict_stats_copy(t, table, false);
 
 	t->stat_persistent = table->stat_persistent;
 	t->stats_auto_recalc = table->stats_auto_recalc;
@@ -3319,13 +3331,10 @@ dict_stats_update(
 
 			dict_table_stats_lock(table, RW_X_LATCH);
 
-			/* Initialize all stats to dummy values before
-			copying because dict_stats_table_clone_create() does
-			skip corrupted indexes so our dummy object 't' may
-			have less indexes than the real object 'table'. */
-			dict_stats_empty_table(table, true);
-
-			dict_stats_copy(table, t);
+			/* Pass reset_ignored_indexes=true as parameter
+			to dict_stats_copy. This will cause statistics
+			for corrupted indexes to be set to empty values */
+			dict_stats_copy(table, t, true);
 
 			dict_stats_assert_initialized(table);
 
diff --git a/storage/xtradb/fil/fil0crypt.cc b/storage/xtradb/fil/fil0crypt.cc
index 4b7102d9fd6..e752f0d8650 100644
--- a/storage/xtradb/fil/fil0crypt.cc
+++ b/storage/xtradb/fil/fil0crypt.cc
@@ -142,6 +142,23 @@ fil_space_crypt_cleanup()
 	os_event_free(fil_crypt_throttle_sleep_event);
 }
 
+/**
+Get latest key version from encryption plugin and remember it in key_found.
+@return key version or ENCRYPTION_KEY_VERSION_INVALID */
+uint
+fil_space_crypt_struct::key_get_latest_version(void)
+{
+	uint key_version = key_found;
+
+	if (is_key_found()) {
+		key_version = encryption_key_get_latest_version(key_id);
+		srv_stats.n_key_requests.inc();
+		key_found = key_version;
+	}
+
+	return key_version;
+}
+
 /******************************************************************
 Get the latest(key-version), waking the encrypt thread, if needed */
 static inline
@@ -150,20 +167,25 @@ fil_crypt_get_latest_key_version(
 /*=============================*/
 	fil_space_crypt_t* crypt_data) 	/*!< in: crypt data */
 {
-	uint rc = encryption_key_get_latest_version(crypt_data->key_id);
+	ut_ad(crypt_data != NULL);
 
-	if (fil_crypt_needs_rotation(crypt_data->encryption,
-					crypt_data->min_key_version,
-					rc, srv_fil_crypt_rotate_key_age)) {
-		os_event_set(fil_crypt_threads_event);
+	uint key_version = crypt_data->key_get_latest_version();
+
+	if (crypt_data->is_key_found()) {
+
+		if (fil_crypt_needs_rotation(crypt_data->encryption,
+				crypt_data->min_key_version,
+				key_version,
+				srv_fil_crypt_rotate_key_age)) {
+			os_event_set(fil_crypt_threads_event);
+		}
 	}
 
-	return rc;
+	return key_version;
 }
 
 /******************************************************************
 Mutex helper for crypt_data->scheme */
-static
 void
 crypt_data_scheme_locker(
 /*=====================*/
@@ -183,38 +205,47 @@ crypt_data_scheme_locker(
 /******************************************************************
 Create a fil_space_crypt_t object
 @return crypt object */
-UNIV_INTERN
+static
 fil_space_crypt_t*
 fil_space_create_crypt_data(
 /*========================*/
-	fil_encryption_t	encrypt_mode,	/*!< in: encryption mode */
-	uint			key_id)		/*!< in: encryption key id */
+	uint			type,		/*!< in: crypt scheme type handed to the constructor */
+	fil_encryption_t	encrypt_mode,	/*!< in: encryption mode */
+	uint			min_key_version,/*!< in: min_key_version handed to the constructor */
+	uint			key_id,		/*!< in: encryption key id */
+	ulint			offset)		/*!< in: offset handed to the constructor; presumably page0_offset, confirm */
 {
 	const uint sz = sizeof(fil_space_crypt_t);
-	fil_space_crypt_t* crypt_data =
-		static_cast<fil_space_crypt_t*>(malloc(sz));
-
-	memset(crypt_data, 0, sz);
+	void* buf = mem_zalloc(sz);	/* raw storage for the placement new below */
+	fil_space_crypt_t* crypt_data = NULL;	/* stays NULL when buf is NULL */
 
-	if (encrypt_mode == FIL_SPACE_ENCRYPTION_OFF ||
-		(!srv_encrypt_tables && encrypt_mode == FIL_SPACE_ENCRYPTION_DEFAULT)) {
-		crypt_data->type = CRYPT_SCHEME_UNENCRYPTED;
-	} else {
-		crypt_data->type = CRYPT_SCHEME_1;
-		crypt_data->min_key_version = encryption_key_get_latest_version(key_id);
+	if (buf) {
+		crypt_data = new(buf)	/* construct the object inside buf */
+			fil_space_crypt_struct(
+				type,
+				min_key_version,
+				key_id,
+				offset,
+				encrypt_mode);
 	}
 
-	mutex_create(fil_crypt_data_mutex_key,
-		&crypt_data->mutex, SYNC_NO_ORDER_CHECK);
-	crypt_data->locker = crypt_data_scheme_locker;
-	my_random_bytes(crypt_data->iv, sizeof(crypt_data->iv));
-	crypt_data->encryption = encrypt_mode;
-	crypt_data->inited = true;
-	crypt_data->key_id = key_id;
 	return crypt_data;
 }
 
 /******************************************************************
+Create a fil_space_crypt_t object (type, min_key_version and offset all 0)
+@return crypt object, or NULL if the five-argument overload returns NULL */
+UNIV_INTERN
+fil_space_crypt_t*
+fil_space_create_crypt_data(
+/*========================*/
+	fil_encryption_t	encrypt_mode,	/*!< in: encryption mode */
+	uint			key_id)		/*!< in: encryption key id */
+{
+	return (fil_space_create_crypt_data(0, encrypt_mode, 0, key_id, 0));
+}
+
+/******************************************************************
 Merge fil_space_crypt_t object */
 UNIV_INTERN
 void
@@ -236,7 +267,7 @@ fil_space_merge_crypt_data(
 	dst->type = src->type;
 	dst->min_key_version = src->min_key_version;
 	dst->keyserver_requests += src->keyserver_requests;
-	dst->inited = src->inited;
+	dst->closing = src->closing;
 
 	mutex_exit(&dst->mutex);
 }
@@ -258,20 +289,6 @@ fil_space_read_crypt_data(
 	}
 
 	if (memcmp(page + offset, CRYPT_MAGIC, MAGIC_SZ) != 0) {
-#ifdef UNIV_DEBUG
-		ib_logf(IB_LOG_LEVEL_WARN,
-			"Found potentially bogus bytes on "
-			"page 0 offset %lu for space %lu : "
-			"[ %.2x %.2x %.2x %.2x %.2x %.2x ]. "
-			"Assuming space is not encrypted!.",
-			offset, space,
-			page[offset + 0],
-			page[offset + 1],
-			page[offset + 2],
-			page[offset + 3],
-			page[offset + 4],
-			page[offset + 5]);
-#endif
 		/* Crypt data is not stored. */
 		return NULL;
 	}
@@ -322,19 +339,12 @@ fil_space_read_crypt_data(
 	fil_encryption_t encryption = (fil_encryption_t)mach_read_from_1(
 		page + offset + MAGIC_SZ + 2 + iv_length + 8);
 
-	const uint sz = sizeof(fil_space_crypt_t) + iv_length;
-	crypt_data = static_cast<fil_space_crypt_t*>(malloc(sz));
-	memset(crypt_data, 0, sz);
-
+	crypt_data = fil_space_create_crypt_data(encryption, key_id);
+	/* We need to overwrite these as above function will initialize
+	members */
 	crypt_data->type = type;
 	crypt_data->min_key_version = min_key_version;
-	crypt_data->key_id = key_id;
 	crypt_data->page0_offset = offset;
-	crypt_data->encryption = encryption;
-	mutex_create(fil_crypt_data_mutex_key,
-		     &crypt_data->mutex, SYNC_NO_ORDER_CHECK);
-	crypt_data->locker = crypt_data_scheme_locker;
-	crypt_data->inited = true;
 	memcpy(crypt_data->iv, page + offset + MAGIC_SZ + 2, iv_length);
 
 	return crypt_data;
@@ -349,15 +359,9 @@ fil_space_destroy_crypt_data(
 	fil_space_crypt_t **crypt_data)	/*!< out: crypt data */
 {
 	if (crypt_data != NULL && (*crypt_data) != NULL) {
-		/* Make sure that this thread owns the crypt_data
-		and make it unawailable, this does not fully
-		avoid the race between drop table and crypt thread */
-		mutex_enter(&(*crypt_data)->mutex);
-		(*crypt_data)->inited = false;
-		mutex_exit(&(*crypt_data)->mutex);
-		mutex_free(& (*crypt_data)->mutex);
-		memset(*crypt_data, 0, sizeof(fil_space_crypt_t));
-		free(*crypt_data);
+		fil_space_crypt_t* c = *crypt_data;
+		c->~fil_space_crypt_struct();
+		mem_free(c);
 		(*crypt_data) = NULL;
 	}
 }
@@ -505,6 +509,7 @@ fil_parse_write_crypt_data(
 	}
 
 	fil_space_crypt_t* crypt_data = fil_space_create_crypt_data(encryption, key_id);
+	/* Need to overwrite these as above will initialize fields. */
 	crypt_data->page0_offset = offset;
 	crypt_data->min_key_version = min_key_version;
 	crypt_data->encryption = encryption;
@@ -662,10 +667,65 @@ fil_space_encrypt(
 		return src_frame;
 	}
 
-	ut_a(crypt_data != NULL && crypt_data->encryption != FIL_SPACE_ENCRYPTION_OFF);
+	ut_a(crypt_data != NULL && crypt_data->is_encrypted());
 
 	byte* tmp = fil_encrypt_buf(crypt_data, space, offset, lsn, src_frame, zip_size, dst_frame);
 
+#ifdef UNIV_DEBUG
+	if (tmp) {
+		/* Verify that encrypted buffer is not corrupted */
+		byte* tmp_mem = (byte *)malloc(UNIV_PAGE_SIZE);
+		dberr_t err = DB_SUCCESS;
+		byte* src = src_frame;
+		bool page_compressed_encrypted = (mach_read_from_2(tmp+FIL_PAGE_TYPE) == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED);
+		byte* comp_mem = NULL;
+		byte* uncomp_mem = NULL;
+		ulint size = (zip_size) ? zip_size : UNIV_PAGE_SIZE;
+
+		if (page_compressed_encrypted) {
+			comp_mem = (byte *)malloc(UNIV_PAGE_SIZE);
+			uncomp_mem = (byte *)malloc(UNIV_PAGE_SIZE);
+			memcpy(comp_mem, src_frame, UNIV_PAGE_SIZE);
+			fil_decompress_page(uncomp_mem, comp_mem, UNIV_PAGE_SIZE, NULL);
+			src = uncomp_mem;
+		}
+
+		bool corrupted1 = buf_page_is_corrupted(true, src, zip_size);
+		bool ok = fil_space_decrypt(crypt_data, tmp_mem, size, tmp, &err);
+
+		/* Need to decompress the page if it was also compressed */
+		if (page_compressed_encrypted) {
+			memcpy(comp_mem, tmp_mem, UNIV_PAGE_SIZE);
+			fil_decompress_page(tmp_mem, comp_mem, UNIV_PAGE_SIZE, NULL);
+		}
+
+		bool corrupted = buf_page_is_corrupted(true, tmp_mem, zip_size);
+		bool different = memcmp(src, tmp_mem, size);
+
+		if (!ok || corrupted || corrupted1 || err != DB_SUCCESS || different) {
+			fprintf(stderr, "JAN: ok %d corrupted %d corrupted1 %d err %d different %d\n", ok , corrupted, corrupted1, err, different);
+			fprintf(stderr, "JAN1: src_frame\n");
+			buf_page_print(src_frame, zip_size, BUF_PAGE_PRINT_NO_CRASH);
+			fprintf(stderr, "JAN2: encrypted_frame\n");
+			buf_page_print(tmp, zip_size, BUF_PAGE_PRINT_NO_CRASH);
+			fprintf(stderr, "JAN1: decrypted_frame\n");
+			buf_page_print(tmp_mem, zip_size, BUF_PAGE_PRINT_NO_CRASH);
+			ut_error;
+		}
+
+		free(tmp_mem);
+
+		if (comp_mem) {
+			free(comp_mem);
+		}
+
+		if (uncomp_mem) {
+			free(uncomp_mem);
+		}
+	}
+
+#endif /* UNIV_DEBUG */
+
 	return tmp;
 }
 
@@ -688,7 +748,7 @@ fil_space_check_encryption_read(
 		return false;
 	}
 
-	if (crypt_data->encryption == FIL_SPACE_ENCRYPTION_OFF) {
+	if (crypt_data->not_encrypted()) {
 		return false;
 	}
 
@@ -740,7 +800,7 @@ fil_space_decrypt(
 		return false;
 	}
 
-	ut_a(crypt_data != NULL && crypt_data->encryption != FIL_SPACE_ENCRYPTION_OFF);
+	ut_a(crypt_data != NULL && crypt_data->is_encrypted());
 
 	/* read space & lsn */
 	ulint header_len = FIL_PAGE_DATA;
@@ -978,20 +1038,13 @@ Copy global key state */
 static void
 fil_crypt_get_key_state(
 /*====================*/
-	key_state_t *new_state)	/*!< out: key state */
+	key_state_t*		new_state,	/*!< out: key state */
+	fil_space_crypt_t*	crypt_data)	/*!< in, out: crypt_data */
 {
 	if (srv_encrypt_tables) {
-		new_state->key_version =
-			encryption_key_get_latest_version(new_state->key_id);
+		new_state->key_version = crypt_data->key_get_latest_version();
 		new_state->rotate_key_age = srv_fil_crypt_rotate_key_age;
 
-		if (new_state->key_version == ENCRYPTION_KEY_VERSION_INVALID) {
-			ib_logf(IB_LOG_LEVEL_ERROR,
-				"Used key_id %u can't be found from key file.",
-				new_state->key_id);
-		}
-
-		ut_a(new_state->key_version != ENCRYPTION_KEY_VERSION_INVALID);
 		ut_a(new_state->key_version != ENCRYPTION_KEY_NOT_ENCRYPTED);
 	} else {
 		new_state->key_version = 0;
@@ -1051,9 +1104,7 @@ fil_crypt_is_closing(
 	fil_space_crypt_t *crypt_data = fil_space_get_crypt_data(space);
 
 	if (crypt_data) {
-		mutex_enter(&crypt_data->mutex);
-		closing = crypt_data->closing;
-		mutex_exit(&crypt_data->mutex);
+		closing = crypt_data->is_closing(false);
 	}
 
 	return closing;
@@ -1308,6 +1359,18 @@ fil_crypt_space_needs_rotation(
 			}
 			return false;
 		}
+
+		crypt_data->key_get_latest_version();
+
+		if (!crypt_data->is_key_found()) {
+			return false;
+		}
+	}
+
+	/* If used key_id is not found from encryption plugin we can't
+	continue to rotate the tablespace */
+	if (!crypt_data->is_key_found()) {
+		return false;
 	}
 
 	mutex_enter(&crypt_data->mutex);
@@ -1321,7 +1384,7 @@ fil_crypt_space_needs_rotation(
 		}
 
 		/* prevent threads from starting to rotate space */
-		if (crypt_data->closing) {
+		if (crypt_data->is_closing(true)) {
 			break;
 		}
 
@@ -1330,13 +1393,13 @@ fil_crypt_space_needs_rotation(
 		}
 
 		/* No need to rotate space if encryption is disabled */
-		if (crypt_data->encryption == FIL_SPACE_ENCRYPTION_OFF) {
+		if (crypt_data->not_encrypted()) {
 			break;
 		}
 
 		if (crypt_data->key_id != key_state->key_id) {
 			key_state->key_id= crypt_data->key_id;
-			fil_crypt_get_key_state(key_state);
+			fil_crypt_get_key_state(key_state, crypt_data);
 		}
 
 		bool need_key_rotation = fil_crypt_needs_rotation(
@@ -1349,12 +1412,14 @@ fil_crypt_space_needs_rotation(
 
 		time_t diff = time(0) - crypt_data->rotate_state.scrubbing.
 			last_scrub_completed;
+
 		bool need_scrubbing =
 			crypt_data->rotate_state.scrubbing.is_active
                         && diff >= (time_t) srv_background_scrub_data_interval;
 
-		if (need_key_rotation == false && need_scrubbing == false)
+		if (need_key_rotation == false && need_scrubbing == false) {
 			break;
+		}
 
 		mutex_exit(&crypt_data->mutex);
 		/* NOTE! fil_decr_pending_ops is performed outside */
@@ -1570,8 +1635,9 @@ fil_crypt_find_space_to_rotate(
 		os_event_wait_time(fil_crypt_threads_event, 1000000);
 	}
 
-	if (state->should_shutdown())
+	if (state->should_shutdown()) {
 		return false;
+	}
 
 	if (state->first) {
 		state->first = false;
@@ -1633,7 +1699,7 @@ fil_crypt_start_rotate_space(
 		crypt_data->rotate_state.start_time = time(0);
 
 		if (crypt_data->type == CRYPT_SCHEME_UNENCRYPTED &&
-			crypt_data->encryption != FIL_SPACE_ENCRYPTION_OFF &&
+			crypt_data->is_encrypted() &&
 			key_state->key_version != 0) {
 			/* this is rotation unencrypted => encrypted */
 			crypt_data->type = CRYPT_SCHEME_1;
@@ -1670,7 +1736,7 @@ fil_crypt_find_page_to_rotate(
 		mutex_enter(&crypt_data->mutex);
 		ut_ad(key_state->key_id == crypt_data->key_id);
 
-		if (crypt_data->closing == false &&
+		if (!crypt_data->is_closing(true) &&
 			crypt_data->rotate_state.next_offset <
 			crypt_data->rotate_state.max_offset) {
 
@@ -1927,7 +1993,7 @@ fil_crypt_rotate_page(
 				/* statistics */
 				state->crypt_stat.pages_modified++;
 			} else {
-				if (crypt_data->encryption !=  FIL_SPACE_ENCRYPTION_OFF) {
+				if (crypt_data->is_encrypted()) {
 					ut_a(kv >= crypt_data->min_key_version ||
 						(kv == 0 && key_state->key_version == 0));
 
@@ -2130,7 +2196,7 @@ fil_crypt_complete_rotate_space(
 	fil_space_crypt_t *crypt_data = fil_space_get_crypt_data(space);
 
 	/* Space might already be dropped */
-	if (crypt_data != NULL && crypt_data->inited) {
+	if (crypt_data != NULL && !crypt_data->is_closing(false)) {
 		mutex_enter(&crypt_data->mutex);
 
 		/**
@@ -2426,7 +2492,8 @@ UNIV_INTERN
 void
 fil_space_crypt_mark_space_closing(
 /*===============================*/
-	ulint	space)	/*!< in: Space id */
+	ulint			space,		/*!< in: tablespace id */
+	fil_space_crypt_t*	crypt_data)	/*!< in: crypt_data or NULL */
 {
 	if (!fil_crypt_threads_inited) {
 		return;
@@ -2434,7 +2501,9 @@ fil_space_crypt_mark_space_closing(
 
 	mutex_enter(&fil_crypt_threads_mutex);
 
-	fil_space_crypt_t* crypt_data = fil_space_get_crypt_data(space);
+	if (!crypt_data) {
+		crypt_data = fil_space_get_crypt_data(space);
+	}
 
 	if (crypt_data == NULL) {
 		mutex_exit(&fil_crypt_threads_mutex);
@@ -2463,7 +2532,7 @@ fil_space_crypt_close_tablespace(
 
 	fil_space_crypt_t* crypt_data = fil_space_get_crypt_data(space);
 
-	if (crypt_data == NULL || !crypt_data->inited) {
+	if (crypt_data == NULL || crypt_data->is_closing(false)) {
 		mutex_exit(&fil_crypt_threads_mutex);
 		return;
 	}
@@ -2516,6 +2585,8 @@ fil_space_crypt_get_status(
 {
 	fil_space_crypt_t* crypt_data = fil_space_get_crypt_data(id);
 
+	memset(status, 0, sizeof(*status));
+
 	if (crypt_data != NULL) {
 		status->space = id;
 		status->scheme = crypt_data->type;
@@ -2536,6 +2607,7 @@ fil_space_crypt_get_status(
 		} else {
 			status->rotating = false;
 		}
+
 		mutex_exit(&crypt_data->mutex);
 
 		if (srv_encrypt_tables || crypt_data->min_key_version) {
@@ -2545,7 +2617,6 @@ fil_space_crypt_get_status(
 			status->current_key_version = 0;
 		}
 	} else {
-		memset(status, 0, sizeof(*status));
 		if (srv_encrypt_tables) {
 			os_event_set(fil_crypt_threads_event);
 		}
@@ -2578,6 +2649,7 @@ fil_space_get_scrub_status(
 	struct fil_space_scrub_status_t* status)	/*!< out: status  */
 {
 	fil_space_crypt_t* crypt_data = fil_space_get_crypt_data(id);
+
 	memset(status, 0, sizeof(*status));
 
 	if (crypt_data != NULL) {
@@ -2600,9 +2672,8 @@ fil_space_get_scrub_status(
 		} else {
 			status->scrubbing = false;
 		}
+
 		mutex_exit(&crypt_data->mutex);
-	} else {
-		memset(status, 0, sizeof(*status));
 	}
 
 	return crypt_data == NULL ? 1 : 0;
diff --git a/storage/xtradb/fil/fil0fil.cc b/storage/xtradb/fil/fil0fil.cc
index 58e08f11778..28f262b50c7 100644
--- a/storage/xtradb/fil/fil0fil.cc
+++ b/storage/xtradb/fil/fil0fil.cc
@@ -341,6 +341,8 @@ fil_space_get_by_id(
 		    ut_ad(space->magic_n == FIL_SPACE_MAGIC_N),
 		    space->id == id);
 
+	/* The system tablespace must always be found */
+	ut_ad(space || id != 0 || srv_is_being_started);
 	return(space);
 }
 
@@ -672,6 +674,7 @@ fil_node_open_file(
 		page = static_cast<byte*>(ut_align(buf2, UNIV_PAGE_SIZE));
 
 		success = os_file_read(node->handle, page, 0, UNIV_PAGE_SIZE);
+		srv_stats.page0_read.add(1);
 
 		space_id = fsp_header_get_space_id(page);
 		flags = fsp_header_get_flags(page);
@@ -1001,8 +1004,13 @@ retry:
 	/* If the file is already open, no need to do anything; if the space
 	does not exist, we handle the situation in the function which called
 	this function */
+	if (!space) {
+		return;
+	}
 
-	if (!space || UT_LIST_GET_FIRST(space->chain)->open) {
+	fil_node_t*	node = UT_LIST_GET_FIRST(space->chain);
+
+	if (!node || node->open) {
 
 		return;
 	}
@@ -1191,7 +1199,8 @@ fil_space_create(
 	ulint		id,	/*!< in: space id */
 	ulint		flags,	/*!< in: tablespace flags */
 	ulint		purpose,/*!< in: FIL_TABLESPACE, or FIL_LOG if log */
-	fil_space_crypt_t* crypt_data) /*!< in: crypt data */
+	fil_space_crypt_t* crypt_data, /*!< in: crypt data */
+	bool		create_table) /*!< in: true if create table */
 {
 	fil_space_t*	space;
 
@@ -1285,10 +1294,25 @@ fil_space_create(
 	space->is_in_unflushed_spaces = false;
 
 	space->is_corrupt = FALSE;
+	space->crypt_data = crypt_data;
 
-	UT_LIST_ADD_LAST(space_list, fil_system->space_list, space);
+	/* In create table we write page 0 so we have already
+	"read" it and for system tablespaces we have read
+	crypt data at startup. */
+	if (create_table || crypt_data != NULL) {
+		space->page_0_crypt_read = true;
+	}
 
-	space->crypt_data = crypt_data;
+#ifdef UNIV_DEBUG
+	ib_logf(IB_LOG_LEVEL_INFO,
+		"Created tablespace for space %lu name %s key_id %u encryption %d.",
+		space->id,
+		space->name,
+		space->crypt_data ? space->crypt_data->key_id : 0,
+		space->crypt_data ? space->crypt_data->encryption : 0);
+#endif
+
+	UT_LIST_ADD_LAST(space_list, fil_system->space_list, space);
 
 	mutex_exit(&fil_system->mutex);
 
@@ -1770,6 +1794,9 @@ fil_close_all_files(void)
 {
 	fil_space_t*	space;
 
+	// Must check both flags as it's possible for this to be called during
+	// server startup with srv_track_changed_pages == true but
+	// srv_redo_log_thread_started == false
 	if (srv_track_changed_pages && srv_redo_log_thread_started)
 		os_event_wait(srv_redo_log_tracked_event);
 
@@ -1809,6 +1836,9 @@ fil_close_log_files(
 {
 	fil_space_t*	space;
 
+	// Must check both flags as it's possible for this to be called during
+	// server startup with srv_track_changed_pages == true but
+	// srv_redo_log_thread_started == false
 	if (srv_track_changed_pages && srv_redo_log_thread_started)
 		os_event_wait(srv_redo_log_tracked_event);
 
@@ -2057,6 +2087,8 @@ fil_read_first_page(
 
 	os_file_read(data_file, page, 0, UNIV_PAGE_SIZE);
 
+	srv_stats.page0_read.add(1);
+
 	/* The FSP_HEADER on page 0 is only valid for the first file
 	in a tablespace.  So if this is not the first datafile, leave
 	*flags and *space_id as they were read from the first file and
@@ -2077,6 +2109,7 @@ fil_read_first_page(
 	ulint space = fsp_header_get_space_id(page);
 	ulint offset = fsp_header_get_crypt_offset(
 		fsp_flags_get_zip_size(*flags), NULL);
+
 	cdata = fil_space_read_crypt_data(space, page, offset);
 
 	if (crypt_data) {
@@ -2085,9 +2118,7 @@ fil_read_first_page(
 
 	/* If file space is encrypted we need to have at least some
 	encryption service available where to get keys */
-	if ((cdata && cdata->encryption == FIL_SPACE_ENCRYPTION_ON) ||
-		(srv_encrypt_tables &&
-			cdata && cdata->encryption == FIL_SPACE_ENCRYPTION_DEFAULT)) {
+	if (cdata && cdata->should_encrypt()) {
 
 		if (!encryption_key_id_exists(cdata->key_id)) {
 			ib_logf(IB_LOG_LEVEL_ERROR,
@@ -3627,7 +3658,7 @@ fil_create_new_single_table_tablespace(
 	}
 
 	success = fil_space_create(tablename, space_id, flags, FIL_TABLESPACE,
-				   crypt_data);
+				   crypt_data, true);
 
 	if (!success || !fil_node_create(path, size, space_id, FALSE)) {
 		err = DB_ERROR;
@@ -3861,6 +3892,7 @@ fil_open_single_table_tablespace(
 
 		if (table) {
 			table->crypt_data = def.crypt_data;
+			table->page_0_read = true;
 		}
 
 		/* Validate this single-table-tablespace with SYS_TABLES,
@@ -3897,6 +3929,7 @@ fil_open_single_table_tablespace(
 
 		if (table) {
 			table->crypt_data = remote.crypt_data;
+			table->page_0_read = true;
 		}
 
 		/* Validate this single-table-tablespace with SYS_TABLES,
@@ -3933,6 +3966,7 @@ fil_open_single_table_tablespace(
 
 		if (table) {
 			table->crypt_data = dict.crypt_data;
+			table->page_0_read = true;
 		}
 
 		/* Validate this single-table-tablespace with SYS_TABLES,
@@ -4104,7 +4138,7 @@ skip_validate:
 	if (err != DB_SUCCESS) {
 		; // Don't load the tablespace into the cache
 	} else if (!fil_space_create(tablename, id, flags, FIL_TABLESPACE,
-				     crypt_data)) {
+				     crypt_data, false)) {
 		err = DB_ERROR;
 	} else {
 		/* We do not measure the size of the file, that is why
@@ -4710,7 +4744,7 @@ will_not_choose:
 #endif /* UNIV_HOTBACKUP */
 	ibool file_space_create_success = fil_space_create(
 		tablename, fsp->id, fsp->flags, FIL_TABLESPACE,
-		fsp->crypt_data);
+		fsp->crypt_data, false);
 
 	if (!file_space_create_success) {
 		if (srv_force_recovery > 0) {
@@ -6591,10 +6625,7 @@ fil_iterate(
 		bool encrypted = false;
 
 		/* Use additional crypt io buffer if tablespace is encrypted */
-		if ((iter.crypt_data != NULL && iter.crypt_data->encryption == FIL_SPACE_ENCRYPTION_ON) ||
-				(srv_encrypt_tables &&
-					iter.crypt_data && iter.crypt_data->encryption == FIL_SPACE_ENCRYPTION_DEFAULT)) {
-
+		if (iter.crypt_data != NULL && iter.crypt_data->should_encrypt()) {
 			encrypted = true;
 			readptr = iter.crypt_io_buffer;
 			writeptr = iter.crypt_io_buffer;
@@ -7322,11 +7353,54 @@ fil_space_get_crypt_data(
 
 	space = fil_space_get_by_id(id);
 
+	mutex_exit(&fil_system->mutex);
+
 	if (space != NULL) {
+		/* If we have not yet read the page0
+		of this tablespace we will do it now. */
+		if (!space->crypt_data && !space->page_0_crypt_read) {
+			ulint space_id = space->id;
+			fil_node_t*	node;
+
+			ut_a(space->crypt_data == NULL);
+			node = UT_LIST_GET_FIRST(space->chain);
+
+			byte *buf = static_cast<byte*>(ut_malloc(2 * UNIV_PAGE_SIZE));
+			byte *page = static_cast<byte*>(ut_align(buf, UNIV_PAGE_SIZE));
+			fil_read(true, space_id, 0, 0, 0, UNIV_PAGE_SIZE, page,
+				NULL, NULL);
+			ulint flags = fsp_header_get_flags(page);
+			ulint offset = fsp_header_get_crypt_offset(
+				fsp_flags_get_zip_size(flags), NULL);
+			space->crypt_data = fil_space_read_crypt_data(space_id, page, offset);
+			ut_free(buf);
+
+#ifdef UNIV_DEBUG
+			ib_logf(IB_LOG_LEVEL_INFO,
+				"Read page 0 from tablespace for space %lu name %s key_id %u encryption %d handle %d.",
+				space_id,
+				space->name,
+				space->crypt_data ? space->crypt_data->key_id : 0,
+				space->crypt_data ? space->crypt_data->encryption : 0,
+				node->handle);
+#endif
+
+			ut_a(space->id == space_id);
+
+			space->page_0_crypt_read = true;
+		}
+
 		crypt_data = space->crypt_data;
-	}
 
-	mutex_exit(&fil_system->mutex);
+		if (!space->page_0_crypt_read) {
+			ib_logf(IB_LOG_LEVEL_WARN,
+				"Space %lu name %s contains encryption %d information for key_id %u but page0 is not read.",
+				space->id,
+				space->name,
+				space->crypt_data ? space->crypt_data->encryption : 0,
+				space->crypt_data ? space->crypt_data->key_id : 0);
+		}
+	}
 
 	return(crypt_data);
 }
diff --git a/storage/xtradb/fts/fts0fts.cc b/storage/xtradb/fts/fts0fts.cc
index ea532c181ae..a9c4d175715 100644
--- a/storage/xtradb/fts/fts0fts.cc
+++ b/storage/xtradb/fts/fts0fts.cc
@@ -109,6 +109,7 @@ UNIV_INTERN mysql_pfs_key_t	fts_pll_tokenize_mutex_key;
 /** variable to record innodb_fts_internal_tbl_name for information
 schema table INNODB_FTS_INSERTED etc. */
 UNIV_INTERN char* fts_internal_tbl_name		= NULL;
+UNIV_INTERN char* fts_internal_tbl_name2	= NULL;
 
 /** InnoDB default stopword list:
 There are different versions of stopwords, the stop words listed
@@ -266,13 +267,15 @@ FTS auxiliary INDEX table and clear the cache at the end.
 @param[in,out]	sync		sync state
 @param[in]	unlock_cache	whether unlock cache lock when write node
 @param[in]	wait		whether wait when a sync is in progress
+@param[in]      has_dict        whether has dict operation lock
 @return DB_SUCCESS if all OK */
 static
 dberr_t
 fts_sync(
 	fts_sync_t*	sync,
 	bool		unlock_cache,
-	bool		wait);
+	bool		wait,
+	bool		has_dict);
 
 /****************************************************************//**
 Release all resources help by the words rb tree e.g., the node ilist. */
@@ -3569,7 +3572,7 @@ fts_add_doc_by_id(
 
 				DBUG_EXECUTE_IF(
 					"fts_instrument_sync_debug",
-					fts_sync(cache->sync, true, true);
+					fts_sync(cache->sync, true, true, false);
 				);
 
 				DEBUG_SYNC_C("fts_instrument_sync_request");
@@ -4381,13 +4384,11 @@ fts_sync_index(
 }
 
 /** Check if index cache has been synced completely
-@param[in,out]	sync		sync state
 @param[in,out]	index_cache	index cache
 @return true if index is synced, otherwise false. */
 static
 bool
 fts_sync_index_check(
-	fts_sync_t*		sync,
 	fts_index_cache_t*	index_cache)
 {
 	const ib_rbt_node_t*	rbt_node;
@@ -4410,14 +4411,36 @@ fts_sync_index_check(
 	return(true);
 }
 
-/*********************************************************************//**
-Commit the SYNC, change state of processed doc ids etc.
+/** Reset the synced flag of each word's last node in the index cache on rollback
+@param[in,out]	index_cache	index cache whose words tree is walked */
+static
+void
+fts_sync_index_reset(
+	fts_index_cache_t*	index_cache)
+{
+	const ib_rbt_node_t*	rbt_node;
+	/* Walk every word stored in index_cache->words. */
+	for (rbt_node = rbt_first(index_cache->words);
+	     rbt_node != NULL;
+	     rbt_node = rbt_next(index_cache->words, rbt_node)) {
+
+		fts_tokenizer_word_t*	word;
+		word = rbt_value(fts_tokenizer_word_t, rbt_node);
+		/* Only the last node of the word is reset. NOTE(review): assumes word->nodes is never empty; confirm. */
+		fts_node_t*	fts_node;
+		fts_node = static_cast<fts_node_t*>(ib_vector_last(word->nodes));
+
+		fts_node->synced = false;
+	}
+}
+
+/** Commit the SYNC, change state of processed doc ids etc.
+@param[in,out]	sync	sync state
 @return DB_SUCCESS if all OK */
 static  MY_ATTRIBUTE((nonnull, warn_unused_result))
 dberr_t
 fts_sync_commit(
-/*============*/
-	fts_sync_t*	sync)			/*!< in: sync state */
+	fts_sync_t*	sync)
 {
 	dberr_t		error;
 	trx_t*		trx = sync->trx;
@@ -4470,6 +4493,8 @@ fts_sync_commit(
 			(double) n_nodes/ (double) elapsed_time);
 	}
 
+	/* Avoid assertion in trx_free(). */
+	trx->dict_operation_lock_mode = 0;
 	trx_free_for_background(trx);
 
 	return(error);
@@ -4492,6 +4517,10 @@ fts_sync_rollback(
 		index_cache = static_cast<fts_index_cache_t*>(
 			ib_vector_get(cache->indexes, i));
 
+		/* Reset synced flag so nodes will not be skipped
+		in the next sync, see fts_sync_write_words(). */
+		fts_sync_index_reset(index_cache);
+
 		for (j = 0; fts_index_selector[j].value; ++j) {
 
 			if (index_cache->ins_graph[j] != NULL) {
@@ -4517,6 +4546,9 @@ fts_sync_rollback(
 	rw_lock_x_unlock(&cache->lock);
 
 	fts_sql_rollback(trx);
+
+	/* Avoid assertion in trx_free(). */
+	trx->dict_operation_lock_mode = 0;
 	trx_free_for_background(trx);
 }
 
@@ -4525,13 +4557,15 @@ FTS auxiliary INDEX table and clear the cache at the end.
 @param[in,out]	sync		sync state
 @param[in]	unlock_cache	whether unlock cache lock when write node
 @param[in]	wait		whether wait when a sync is in progress
+@param[in]      has_dict        whether has dict operation lock
 @return DB_SUCCESS if all OK */
 static
 dberr_t
 fts_sync(
 	fts_sync_t*	sync,
 	bool		unlock_cache,
-	bool		wait)
+	bool		wait,
+	bool		has_dict)
 {
 	ulint		i;
 	dberr_t		error = DB_SUCCESS;
@@ -4560,6 +4594,12 @@ fts_sync(
 	DEBUG_SYNC_C("fts_sync_begin");
 	fts_sync_begin(sync);
 
+	/* When sync in background, we hold dict operation lock
+	to prevent DDL like DROP INDEX, etc. */
+	if (has_dict) {
+		sync->trx->dict_operation_lock_mode = RW_S_LATCH;
+	}
+
 begin_sync:
 	if (cache->total_size > fts_max_cache_size) {
 		/* Avoid the case: sync never finish when
@@ -4600,7 +4640,7 @@ begin_sync:
 			ib_vector_get(cache->indexes, i));
 
 		if (index_cache->index->to_be_dropped
-		    || fts_sync_index_check(sync, index_cache)) {
+		    || fts_sync_index_check(index_cache)) {
 			continue;
 		}
 
@@ -4615,6 +4655,7 @@ end_sync:
 	}
 
 	rw_lock_x_lock(&cache->lock);
+	sync->interrupted = false;
 	sync->in_progress = false;
 	os_event_set(sync->event);
 	rw_lock_x_unlock(&cache->lock);
@@ -4638,20 +4679,23 @@ FTS auxiliary INDEX table and clear the cache at the end.
 @param[in,out]	table		fts table
 @param[in]	unlock_cache	whether unlock cache when write node
 @param[in]	wait		whether wait for existing sync to finish
+@param[in]	has_dict	whether has dict operation lock
 @return DB_SUCCESS on success, error code on failure. */
 UNIV_INTERN
 dberr_t
 fts_sync_table(
 	dict_table_t*	table,
 	bool		unlock_cache,
-	bool		wait)
+	bool		wait,
+	bool		has_dict)
 {
 	dberr_t	err = DB_SUCCESS;
 
 	ut_ad(table->fts);
 
 	if (!dict_table_is_discarded(table) && table->fts->cache) {
-		err = fts_sync(table->fts->cache->sync, unlock_cache, wait);
+		err = fts_sync(table->fts->cache->sync,
+			       unlock_cache, wait, has_dict);
 	}
 
 	return(err);
@@ -6529,6 +6573,36 @@ fts_check_corrupt_index(
 	return(0);
 }
 
+/** Get a copy of the parent table name if the given name is an FTS aux table
+@param[in]	aux_table_name	aux table name
+@param[in]	aux_table_len	length of aux_table_name
+@return mem_strdupl() copy of the parent table name, or NULL */
+char*
+fts_get_parent_table_name(
+	const char*	aux_table_name,
+	ulint		aux_table_len)
+{
+	fts_aux_table_t	aux_table;
+	char*		parent_table_name = NULL;
+
+	if (fts_is_aux_table_name(&aux_table, aux_table_name, aux_table_len)) {
+		dict_table_t*	parent_table;
+		/* Open the parent table by aux_table.parent_id. */
+		parent_table = dict_table_open_on_id(
+			aux_table.parent_id, TRUE, DICT_TABLE_OP_NORMAL);
+
+		if (parent_table != NULL) {
+			parent_table_name = mem_strdupl(
+				parent_table->name,
+				strlen(parent_table->name));
+			/* The name has been copied, so the table can be closed again. */
+			dict_table_close(parent_table, TRUE, FALSE);
+		}
+	}
+	/* NOTE(review): the caller presumably owns (and must free) the copy; confirm. */
+	return(parent_table_name);
+}
+
 /** Check the validity of the parent table.
 @param[in]	aux_table	auxiliary table
 @return true if it is a valid table or false if it is not */
diff --git a/storage/xtradb/fts/fts0opt.cc b/storage/xtradb/fts/fts0opt.cc
index 84f0563e038..ed882d33548 100644
--- a/storage/xtradb/fts/fts0opt.cc
+++ b/storage/xtradb/fts/fts0opt.cc
@@ -2987,7 +2987,7 @@ fts_optimize_sync_table(
 
 	if (table) {
 		if (dict_table_has_fts_index(table) && table->fts->cache) {
-			fts_sync_table(table, true, false);
+			fts_sync_table(table, true, false, true);
 		}
 
 		dict_table_close(table, FALSE, FALSE);
diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc
index 6d956f5574b..984d508bd04 100644
--- a/storage/xtradb/handler/ha_innodb.cc
+++ b/storage/xtradb/handler/ha_innodb.cc
@@ -113,6 +113,14 @@ this program; if not, write to the Free Software Foundation, Inc.,
 
 #define thd_get_trx_isolation(X) ((enum_tx_isolation)thd_tx_isolation(X))
 
+#ifndef HAVE_PERCONA_COMPRESSED_COLUMNS
+#define COLUMN_FORMAT_TYPE_COMPRESSED                   0xBADF00D
+#define SQLCOM_CREATE_COMPRESSION_DICTIONARY            0xDECAF
+#define SQLCOM_DROP_COMPRESSION_DICTIONARY              0xC0FFEE
+#define ER_COMPRESSION_DICTIONARY_DOES_NOT_EXIST        0xDEADFACE
+const static LEX_CSTRING null_lex_cstr={0,0};
+#endif
+
 #ifdef MYSQL_DYNAMIC_PLUGIN
 #define tc_size 400
 #define tdc_size 400
@@ -194,6 +202,8 @@ static long innobase_buffer_pool_instances = 1;
 static ulong innobase_log_block_size;
 
 static long long innobase_buffer_pool_size, innobase_log_file_size;
+/** Deprecated option that has no effect. */
+static my_bool innodb_buffer_pool_populate;
 
 /** Percentage of the buffer pool to reserve for 'old' blocks.
 Connected to buf_LRU_old_ratio. */
@@ -348,6 +358,23 @@ static TYPELIB innodb_empty_free_list_algorithm_typelib = {
 	NULL
 };
 
+/** Possible values of the parameter innodb_lock_schedule_algorithm */
+static const char* innodb_lock_schedule_algorithm_names[] = {
+	"fcfs",
+	"vats",
+	NullS
+};
+
+/** Used to define an enumerate type of the system variable
+innodb_lock_schedule_algorithm. */
+static TYPELIB innodb_lock_schedule_algorithm_typelib = {
+	array_elements(innodb_lock_schedule_algorithm_names) - 1,
+	"innodb_lock_schedule_algorithm_typelib",
+	innodb_lock_schedule_algorithm_names,
+	NULL
+};
+
+
 /* The following counter is used to convey information to InnoDB
 about server activity: in case of normal DML ops it is not
 sensible to call srv_active_wake_master_thread after each
@@ -620,11 +647,6 @@ ib_cb_t innodb_api_cb[] = {
 static void innodb_remember_check_sysvar_funcs();
 mysql_var_check_func check_sysvar_enum;
 
-// should page compression be used by default for new tables
-static MYSQL_THDVAR_BOOL(compression_default, PLUGIN_VAR_OPCMDARG,
-  "Is compression the default for new tables", 
-  NULL, NULL, FALSE);
-
 static MYSQL_THDVAR_UINT(default_encryption_key_id, PLUGIN_VAR_RQCMDARG,
 			 "Default encryption key id used for table encryption.",
 			 NULL, NULL,
@@ -642,7 +664,7 @@ ha_create_table_option innodb_table_option_list[]=
 {
   /* With this option user can enable page compression feature for the
   table */
-  HA_TOPTION_BOOL("PAGE_COMPRESSED", page_compressed, compression_default),
+  HA_TOPTION_BOOL("PAGE_COMPRESSED", page_compressed, 0),
   /* With this option user can set zip compression level for page
   compression for this table*/
   HA_TOPTION_NUMBER("PAGE_COMPRESSION_LEVEL", page_compression_level, 0, 1, 9, 1),
@@ -869,6 +891,19 @@ innobase_is_fake_change(
 	THD*		thd) __attribute__((unused));	/*!< in: MySQL thread handle of the user for
 				  whom the transaction is being committed */
 
+/** Forward declaration: get the list of foreign keys referencing a
+specified table.
+@param thd		The thread handle
+@param path		Path to the table
+@param[out] f_key_list	The list of foreign keys
+
+@return error code or zero for success */
+static
+int
+innobase_get_parent_fk_list(
+	THD*			thd,
+	const char*		path,
+	List<FOREIGN_KEY_INFO>*	f_key_list) __attribute__((unused));
 
 /******************************************************************//**
 Maps a MySQL trx isolation level code to the InnoDB isolation level code
@@ -1121,6 +1156,8 @@ static SHOW_VAR innodb_status_variables[]= {
   (char*) &export_vars.innodb_pages_created,		  SHOW_LONG},
   {"pages_read",
   (char*) &export_vars.innodb_pages_read,		  SHOW_LONG},
+  {"pages0_read",
+  (char*) &export_vars.innodb_page0_read,		  SHOW_LONG},
   {"pages_written",
   (char*) &export_vars.innodb_pages_written,		  SHOW_LONG},
   {"purge_trx_id",
@@ -1283,6 +1320,8 @@ static SHOW_VAR innodb_status_variables[]= {
   {"scrub_background_page_split_failures_unknown",
    (char*) &export_vars.innodb_scrub_page_split_failures_unknown,
    SHOW_LONG},
+  {"encryption_num_key_requests",
+   (char*) &export_vars.innodb_encryption_key_requests, SHOW_LONGLONG},
 
   {NullS, NullS, SHOW_LONG}
 };
@@ -1664,6 +1703,30 @@ normalize_table_name_low(
 	ibool           set_lower_case); /* in: TRUE if we want to set
 					 name to lower case */
 
+#ifdef HAVE_PERCONA_COMPRESSED_COLUMNS
+/** Forward declaration: creates a new compression dictionary. */
+static
+handler_create_zip_dict_result
+innobase_create_zip_dict(
+	handlerton*	hton,	/*!< in: innobase handlerton */
+	THD*		thd,	/*!< in: handle to the MySQL thread */
+	const char*	name,	/*!< in: zip dictionary name */
+	ulint*		name_len,
+				/*!< in/out: zip dictionary name length */
+	const char*	data,	/*!< in: zip dictionary data */
+	ulint*		data_len);
+				/*!< in/out: zip dictionary data length */
+
+/** Forward declaration: drops an existing compression dictionary. */
+static
+handler_drop_zip_dict_result
+innobase_drop_zip_dict(
+	handlerton*	hton,	/*!< in: innobase handlerton */
+	THD*		thd,	/*!< in: handle to the MySQL thread */
+	const char*	name,	/*!< in: zip dictionary name */
+	ulint*		name_len);
+				/*!< in/out: zip dictionary name length */
+#endif
 /*************************************************************//**
 Checks if buffer pool is big enough to enable backoff algorithm.
 InnoDB empty free list algorithm backoff requires free pages
@@ -1799,7 +1862,7 @@ thd_is_replication_slave_thread(
 /*============================*/
 	THD*	thd)	/*!< in: thread handle */
 {
-	return((ibool) thd_slave_thread(thd));
+	return thd && ((ibool) thd_slave_thread(thd));
 }
 
 /******************************************************************//**
@@ -2429,7 +2492,7 @@ innobase_check_identifier_length(
 	CHARSET_INFO	*cs = system_charset_info;
 	DBUG_ENTER("innobase_check_identifier_length");
 
-	size_t len = my_well_formed_length(
+	size_t len = cs->cset->well_formed_len(
 		cs, id, id + strlen(id),
 		NAME_CHAR_LEN, &well_formed_error);
 
@@ -2540,11 +2603,16 @@ innobase_get_stmt(
 	THD*	thd,		/*!< in: MySQL thread handle */
 	size_t*	length)		/*!< out: length of the SQL statement */
 {
-	LEX_STRING* stmt;
-
-	stmt = thd_query_string(thd);
-	*length = stmt->length;
-	return(stmt->str);
+	/* Give the out parameter a defined value on every path, including
+	the ones where no thread handle or no query string is available. */
+	*length = 0;
+
+	LEX_STRING*	stmt = thd ? thd_query_string(thd) : NULL;
+	if (stmt == NULL) {
+		return(NULL);
+	}
+	*length = stmt->length;
+	return(stmt->str);
 }
 
 /**********************************************************************//**
@@ -3786,6 +3854,10 @@ innobase_init(
 
 	innodb_remember_check_sysvar_funcs();
 
+#ifdef HAVE_PERCONA_COMPRESSED_COLUMNS
+	innobase_hton->create_zip_dict = innobase_create_zip_dict;
+	innobase_hton->drop_zip_dict = innobase_drop_zip_dict;
+#endif
 	ut_a(DATA_MYSQL_TRUE_VARCHAR == (ulint)MYSQL_TYPE_VARCHAR);
 
 #ifndef DBUG_OFF
@@ -3832,17 +3904,19 @@ innobase_init(
 	}
 
 	if (UNIV_PAGE_SIZE != UNIV_PAGE_SIZE_DEF) {
-		fprintf(stderr,
-			"InnoDB: Warning: innodb_page_size has been "
-			"changed from default value %d to %ldd. (###EXPERIMENTAL### "
-			"operation)\n", UNIV_PAGE_SIZE_DEF, UNIV_PAGE_SIZE);
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"innodb_page_size has been "
+			"changed from default value %d to %ldd.",
+			UNIV_PAGE_SIZE_DEF, UNIV_PAGE_SIZE);
 
 		/* There is hang on buffer pool when trying to get a new
 		page if buffer pool size is too small for large page sizes */
 		if (innobase_buffer_pool_size < (24 * 1024 * 1024)) {
-			fprintf(stderr, "InnoDB: Error: innobase_page_size %lu requires "
+			ib_logf(IB_LOG_LEVEL_INFO,
+				"innobase_page_size %lu requires "
 				"innodb_buffer_pool_size > 24M current %lld",
 				UNIV_PAGE_SIZE, innobase_buffer_pool_size);
+
 			goto error;
 		}
 	}
@@ -4217,6 +4291,15 @@ innobase_change_buffering_inited_ok:
 			"allocator.\n");
 	}
 
+	if (innodb_buffer_pool_populate) {
+		ut_print_timestamp(stderr);
+		fprintf(stderr,
+			" InnoDB: Warning: Setting "
+			"innodb_buffer_pool_populate is DEPRECATED"
+			" and has no effect. "
+			"This option will be removed in MariaDB 10.2.3.\n");
+	}
+
 	srv_n_file_io_threads = (ulint) innobase_file_io_threads;
 	srv_n_read_io_threads = (ulint) innobase_read_io_threads;
 	srv_n_write_io_threads = (ulint) innobase_write_io_threads;
@@ -4547,6 +4630,90 @@ innobase_purge_changed_page_bitmaps(
 	return (my_bool)log_online_purge_changed_page_bitmaps(lsn);
 }
 
+#ifdef HAVE_PERCONA_COMPRESSED_COLUMNS
+/** Creates a new compression dictionary.
+@return one of the HA_CREATE_ZIP_DICT_* result codes */
+static
+handler_create_zip_dict_result
+innobase_create_zip_dict(
+	handlerton*	hton,	/*!< in: innobase handlerton */
+	THD*		thd,	/*!< in: handle to the MySQL thread */
+	const char*	name,	/*!< in: zip dictionary name */
+	ulint*		name_len,
+				/*!< in/out: zip dictionary name length */
+	const char*	data,	/*!< in: zip dictionary data */
+	ulint*		data_len)
+				/*!< in/out: zip dictionary data length */
+{
+	DBUG_ENTER("innobase_create_zip_dict");
+	DBUG_ASSERT(hton == innodb_hton_ptr);
+
+	/* Bail out early when high_level_read_only is set. */
+	if (UNIV_UNLIKELY(high_level_read_only)) {
+		DBUG_RETURN(HA_CREATE_ZIP_DICT_READ_ONLY);
+	}
+
+	/* On overflow, hand the permitted maximum back to the caller. */
+	if (UNIV_UNLIKELY(*name_len > ZIP_DICT_MAX_NAME_LENGTH)) {
+		*name_len = ZIP_DICT_MAX_NAME_LENGTH;
+		DBUG_RETURN(HA_CREATE_ZIP_DICT_NAME_TOO_LONG);
+	}
+
+	if (UNIV_UNLIKELY(*data_len > ZIP_DICT_MAX_DATA_LENGTH)) {
+		*data_len = ZIP_DICT_MAX_DATA_LENGTH;
+		DBUG_RETURN(HA_CREATE_ZIP_DICT_DATA_TOO_LONG);
+	}
+
+	switch (dict_create_zip_dict(name, *name_len, data, *data_len)) {
+	case DB_SUCCESS:
+		DBUG_RETURN(HA_CREATE_ZIP_DICT_OK);
+	case DB_DUPLICATE_KEY:
+		DBUG_RETURN(HA_CREATE_ZIP_DICT_ALREADY_EXISTS);
+	default:
+		break;
+	}
+
+	/* Any other result is unexpected: debug assertion, then report. */
+	ut_ad(0);
+	DBUG_RETURN(HA_CREATE_ZIP_DICT_UNKNOWN_ERROR);
+}
+
+/** Drops an existing compression dictionary.
+@return one of the HA_DROP_ZIP_DICT_* result codes */
+static
+handler_drop_zip_dict_result
+innobase_drop_zip_dict(
+	handlerton*	hton,	/*!< in: innobase handlerton */
+	THD*		thd,	/*!< in: handle to the MySQL thread */
+	const char*	name,	/*!< in: zip dictionary name */
+	ulint*		name_len)
+				/*!< in: zip dictionary name length (only read) */
+{
+	DBUG_ENTER("innobase_drop_zip_dict");
+	DBUG_ASSERT(hton == innodb_hton_ptr);
+
+	/* Bail out early when high_level_read_only is set. */
+	if (UNIV_UNLIKELY(high_level_read_only)) {
+		DBUG_RETURN(HA_DROP_ZIP_DICT_READ_ONLY);
+	}
+
+	/* Map the result of dict_drop_zip_dict() to the handler code. */
+	switch (dict_drop_zip_dict(name, *name_len)) {
+	case DB_SUCCESS:
+		DBUG_RETURN(HA_DROP_ZIP_DICT_OK);
+	case DB_RECORD_NOT_FOUND:
+		DBUG_RETURN(HA_DROP_ZIP_DICT_DOES_NOT_EXIST);
+	case DB_ROW_IS_REFERENCED:
+		DBUG_RETURN(HA_DROP_ZIP_DICT_IS_REFERENCED);
+	default:
+		break;
+	}
+
+	/* Any other result is unexpected: debug assertion, then report. */
+	ut_ad(0);
+	DBUG_RETURN(HA_DROP_ZIP_DICT_UNKNOWN_ERROR);
+}
+#endif
 /*****************************************************************//**
 Check whether this is a fake change transaction.
 @return TRUE if a fake change transaction */
@@ -5445,7 +5612,8 @@ innobase_kill_connection(
 			wsrep_thd_is_BF(current_thd, FALSE),
 			lock_get_info(trx->lock.wait_lock).c_str());
 
-		if (!wsrep_thd_is_BF(trx->mysql_thd, FALSE)) {
+		if (!wsrep_thd_is_BF(trx->mysql_thd, FALSE) &&
+		    trx->abort_type == TRX_SERVER_ABORT) {
 			ut_ad(!lock_mutex_own());
 			lock_mutex_enter();
 		}
@@ -5465,7 +5633,8 @@ innobase_kill_connection(
 			trx_mutex_exit(trx);
 		}
 
-		if (!wsrep_thd_is_BF(trx->mysql_thd, FALSE)) {
+		if (!wsrep_thd_is_BF(trx->mysql_thd, FALSE) &&
+		    trx->abort_type == TRX_SERVER_ABORT) {
 			lock_mutex_exit();
 		}
 	}
@@ -6082,6 +6251,88 @@ func_exit:
 	DBUG_RETURN(ret);
 }
 
+/** Verifies that every compression dictionary referenced by a column of
+the given table can be resolved by name, collecting the dictionary ids.
+@return true if all referenced dictionaries exist */
+UNIV_INTERN
+bool
+innobase_check_zip_dicts(
+	const TABLE*	table,		/*!< in: table in MySQL data
+					dictionary */
+	ulint*		dict_ids,	/*!< out: identified zip dict ids
+					(at least n_fields long) */
+	trx_t*		trx,		/*!< in: transaction */
+	const char**	err_dict_name)	/*!< out: the name of the
+					zip_dict which does not exist. */
+{
+	DBUG_ENTER("innobase_check_zip_dicts");
+
+#ifdef HAVE_PERCONA_COMPRESSED_COLUMNS
+	const size_t	field_count = table->s->fields;
+
+	for (size_t i = 0; i < field_count; ++i) {
+		Field*	column = table->field[i];
+
+		if (!column->has_associated_compression_dictionary()) {
+			/* No dictionary is attached to this column. */
+			dict_ids[i] = ULINT_UNDEFINED;
+			continue;
+		}
+
+		const dberr_t	lookup = dict_create_get_zip_dict_id_by_name(
+			column->zip_dict_name.str,
+			column->zip_dict_name.length,
+			&dict_ids[i],
+			trx);
+
+		/* A missing dictionary is the only tolerated failure. */
+		ut_a(lookup == DB_SUCCESS || lookup == DB_RECORD_NOT_FOUND);
+
+		if (lookup != DB_SUCCESS) {
+			/* Stop at the first dictionary that is missing and
+			hand its name back to the caller. */
+			*err_dict_name = column->zip_dict_name.str;
+			DBUG_RETURN(false);
+		}
+	}
+
+#endif
+	DBUG_RETURN(true);
+}
+
+/** Adds, for the InnoDB table identified by ib_table_id, one compression
+dictionary reference per column of the MySQL table whose entry in
+zip_dict_ids is set. Entries equal to ULINT_UNDEFINED are skipped. */
+UNIV_INTERN
+void
+innobase_create_zip_dict_references(
+	const TABLE*	table,		/*!< in: table in MySQL data
+					dictionary */
+	table_id_t	ib_table_id,	/*!< in: table ID in Innodb data
+					dictionary */
+	ulint*		zip_dict_ids,	/*!< in: zip dict ids
+					(at least n_fields long) */
+	trx_t*		trx)		/*!< in: transaction */
+{
+	DBUG_ENTER("innobase_create_zip_dict_references");
+
+	const size_t	field_count = table->s->fields;
+
+	for (size_t i = 0; i < field_count; ++i) {
+		const ulint	dict_id = zip_dict_ids[i];
+		if (dict_id == ULINT_UNDEFINED) {
+			continue;
+		}
+
+		const dberr_t	err = dict_create_add_zip_dict_reference(
+			ib_table_id, table->field[i]->field_index,
+			dict_id, trx);
+		ut_a(err == DB_SUCCESS);
+	}
+
+	DBUG_VOID_RETURN;
+}
+
 /*******************************************************************//**
 This function uses index translation table to quickly locate the
 requested index structure.
@@ -6465,9 +6716,8 @@ table_opened:
 		or used key_id is not available. */
 		if (ib_table) {
 			fil_space_crypt_t* crypt_data = ib_table->crypt_data;
-			if ((crypt_data && crypt_data->encryption == FIL_SPACE_ENCRYPTION_ON) ||
-				(srv_encrypt_tables &&
-					crypt_data && crypt_data->encryption == FIL_SPACE_ENCRYPTION_DEFAULT)) {
+
+			if (crypt_data && crypt_data->should_encrypt()) {
 
 				if (!encryption_key_id_exists(crypt_data->key_id)) {
 					push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
@@ -6905,7 +7155,7 @@ innobase_mysql_cmp(
 		having indexes on such data need to rebuild their tables! */
 
 		ret = charset->coll->strnncollsp(
-			charset, a, a_length, b, b_length);
+			charset, a, a_length, b, b_length, 0);
 
 		if (ret < 0) {
 			return(-1);
@@ -6999,7 +7249,7 @@ innobase_mysql_cmp_prefix(
 	charset = innobase_get_fts_charset(mysql_type, charset_number);
 
 	result = ha_compare_text(charset, (uchar*) a, a_length,
-				 (uchar*) b, b_length, 1);
+				 (uchar*) b, b_length, 1, 0);
 
 	return(result);
 }
@@ -7019,7 +7269,7 @@ innobase_fts_text_cmp(
 
 	return(ha_compare_text(
 		charset, s1->f_str, static_cast<uint>(s1->f_len),
-		s2->f_str, static_cast<uint>(s2->f_len), 0));
+		s2->f_str, static_cast<uint>(s2->f_len), 0, 0));
 }
 /******************************************************************//**
 compare two character string case insensitively according to their charset. */
@@ -7042,7 +7292,7 @@ innobase_fts_text_case_cmp(
 
 	return(ha_compare_text(
 		charset, s1->f_str, static_cast<uint>(s1->f_len),
-		s2->f_str, static_cast<uint>(newlen), 0));
+		s2->f_str, static_cast<uint>(newlen), 0, 0));
 }
 /******************************************************************//**
 Get the first character's code position for FTS index partition. */
@@ -7090,7 +7340,7 @@ innobase_fts_text_cmp_prefix(
 
 	result = ha_compare_text(
 		charset, s2->f_str, static_cast<uint>(s2->f_len),
-		s1->f_str, static_cast<uint>(s1->f_len), 1);
+		s1->f_str, static_cast<uint>(s1->f_len), 1, 0);
 
 	/* We switched s1, s2 position in ha_compare_text. So we need
 	to negate the result */
@@ -7353,6 +7603,7 @@ wsrep_store_key_val_for_row(
 				format) */
 	uint		buff_len,/*!< in: buffer length */
 	const uchar*	record,
+	row_prebuilt_t*	prebuilt,	/*!< in: InnoDB prebuilt struct */
 	ibool*          key_is_null)/*!< out: full key was null */
 {
 	KEY*		key_info	= table->key_info + keynr;
@@ -7434,7 +7685,7 @@ wsrep_store_key_val_for_row(
 			the true length of the key */
 
 			if (len > 0 && cs->mbmaxlen > 1) {
-				true_len = (ulint) my_well_formed_length(cs,
+				true_len = (ulint) cs->cset->well_formed_len(cs,
 						(const char *) data,
 						(const char *) data + len,
                                                 (uint) (key_len /
@@ -7509,8 +7760,17 @@ wsrep_store_key_val_for_row(
 
 			blob_data = row_mysql_read_blob_ref(&blob_len,
 				(byte*) (record
-				+ (ulint)get_field_offset(table, field)),
-					(ulint) field->pack_length());
+				+ (ulint) get_field_offset(table, field)),
+				(ulint) field->pack_length(),
+#ifdef HAVE_PERCONA_COMPRESSED_COLUMNS
+				field->column_format() ==
+					COLUMN_FORMAT_TYPE_COMPRESSED,
+				reinterpret_cast<const byte*>(
+					field->zip_dict_data.str),
+				field->zip_dict_data.length, prebuilt);
+#else
+                                0, 0, 0, prebuilt);
+#endif
 
 			true_len = blob_len;
 
@@ -7521,7 +7781,7 @@ wsrep_store_key_val_for_row(
 			the true length of the key */
 
 			if (blob_len > 0 && cs->mbmaxlen > 1) {
-				true_len = (ulint) my_well_formed_length(cs,
+				true_len = (ulint) cs->cset->well_formed_len(cs,
 						(const char *) blob_data,
 						(const char *) blob_data
 							+ blob_len,
@@ -7610,7 +7870,7 @@ wsrep_store_key_val_for_row(
 				if (key_len > 0 && cs->mbmaxlen > 1) {
 
 					true_len = (ulint)
-						my_well_formed_length(cs,
+						cs->cset->well_formed_len(cs,
 							(const char *)src_start,
 							(const char *)src_start
 								+ key_len,
@@ -7745,7 +8005,7 @@ ha_innobase::store_key_val_for_row(
 			the true length of the key */
 
 			if (len > 0 && cs->mbmaxlen > 1) {
-				true_len = (ulint) my_well_formed_length(cs,
+				true_len = (ulint) cs->cset->well_formed_len(cs,
 						(const char*) data,
 						(const char*) data + len,
 						(uint) (key_len / cs->mbmaxlen),
@@ -7805,7 +8065,16 @@ ha_innobase::store_key_val_for_row(
 			blob_data = row_mysql_read_blob_ref(&blob_len,
 				(byte*) (record
 				+ (ulint) get_field_offset(table, field)),
-					(ulint) field->pack_length());
+				(ulint) field->pack_length(),
+#ifdef HAVE_PERCONA_COMPRESSED_COLUMNS
+				field->column_format() ==
+					COLUMN_FORMAT_TYPE_COMPRESSED,
+				reinterpret_cast<const byte*>(
+					field->zip_dict_data.str),
+				field->zip_dict_data.length, prebuilt);
+#else
+                                0, 0, 0, prebuilt);
+#endif
 
 			true_len = blob_len;
 
@@ -7816,7 +8085,7 @@ ha_innobase::store_key_val_for_row(
 			the true length of the key */
 
 			if (blob_len > 0 && cs->mbmaxlen > 1) {
-				true_len = (ulint) my_well_formed_length(cs,
+				true_len = (ulint) cs->cset->well_formed_len(cs,
 						(const char*) blob_data,
 						(const char*) blob_data
 							+ blob_len,
@@ -7888,7 +8157,7 @@ ha_innobase::store_key_val_for_row(
 				if (key_len > 0 && cs->mbmaxlen > 1) {
 
 					true_len = (ulint)
-						my_well_formed_length(cs,
+						cs->cset->well_formed_len(cs,
 							(const char*) src_start,
 							(const char*) src_start
 								+ key_len,
@@ -8029,7 +8298,66 @@ build_template_field(
 	UNIV_MEM_INVALID(templ, sizeof *templ);
 	templ->col_no = i;
 	templ->clust_rec_field_no = dict_col_get_clust_pos(col, clust_index);
-	ut_a(templ->clust_rec_field_no != ULINT_UNDEFINED);
+
+	/* If clustered index record field is not found, lets print out
+	field names and all the rest to understand why field is not found. */
+	if (templ->clust_rec_field_no == ULINT_UNDEFINED) {
+		const char* tb_col_name = dict_table_get_col_name(clust_index->table, i);
+		dict_field_t* field=NULL;
+		size_t size = 0;
+
+		for(ulint j=0; j < clust_index->n_user_defined_cols; j++) {
+			dict_field_t* ifield = &(clust_index->fields[j]);
+			if (ifield && !memcmp(tb_col_name, ifield->name,
+					strlen(tb_col_name))) {
+				field = ifield;
+				break;
+			}
+		}
+
+		ib_logf(IB_LOG_LEVEL_INFO,
+			"Looking for field %lu name %s from table %s",
+			i,
+			(tb_col_name ? tb_col_name : "NULL"),
+			clust_index->table->name);
+
+
+		for(ulint j=0; j < clust_index->n_user_defined_cols; j++) {
+			dict_field_t* ifield = &(clust_index->fields[j]);
+			ib_logf(IB_LOG_LEVEL_INFO,
+				"InnoDB Table %s field %lu name %s",
+				clust_index->table->name,
+				j,
+				(ifield ? ifield->name : "NULL"));
+		}
+
+		for(ulint j=0; j < table->s->stored_fields; j++) {
+			ib_logf(IB_LOG_LEVEL_INFO,
+				"MySQL table %s field %lu name %s",
+				table->s->table_name.str,
+				j,
+				table->field[j]->field_name);
+		}
+
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"Clustered record field for column %lu"
+			" not found table n_user_defined %d"
+			" index n_user_defined %d"
+			" InnoDB table %s field name %s"
+			" MySQL table %s field name %s n_fields %d"
+			" query %s",
+			i,
+			clust_index->n_user_defined_cols,
+			clust_index->table->n_cols - DATA_N_SYS_COLS,
+			clust_index->table->name,
+			(field ? field->name : "NULL"),
+			table->s->table_name.str,
+			(tb_col_name ? tb_col_name : "NULL"),
+			table->s->stored_fields,
+			innobase_get_stmt(current_thd, &size));
+
+		ut_a(templ->clust_rec_field_no != ULINT_UNDEFINED);
+	}
 	templ->rec_field_is_prefix = FALSE;
 
 	if (dict_index_is_clust(index)) {
@@ -8070,6 +8398,14 @@ build_template_field(
 	templ->mbminlen = dict_col_get_mbminlen(col);
 	templ->mbmaxlen = dict_col_get_mbmaxlen(col);
 	templ->is_unsigned = col->prtype & DATA_UNSIGNED;
+#ifdef HAVE_PERCONA_COMPRESSED_COLUMNS
+	templ->compressed = (field->column_format()
+				== COLUMN_FORMAT_TYPE_COMPRESSED);
+	templ->zip_dict_data = field->zip_dict_data;
+#else
+	templ->compressed = 0;
+	templ->zip_dict_data = null_lex_cstr;
+#endif
 
 	if (!dict_index_is_clust(index)
 	    && templ->rec_field_no == ULINT_UNDEFINED) {
@@ -8188,7 +8524,7 @@ ha_innobase::build_template(
 		/* Push down an index condition or an end_range check. */
 		for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) {
 
-			while (!table->field[sql_idx]->stored_in_db()) {
+			while (!table->field[sql_idx]->stored_in_db) {
 				sql_idx++;
 			}
 
@@ -8307,7 +8643,7 @@ ha_innobase::build_template(
 		pushdown. */
 		for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) {
 
-			while (!table->field[sql_idx]->stored_in_db()) {
+			while (!table->field[sql_idx]->stored_in_db) {
 				sql_idx++;
 			}
 
@@ -8347,7 +8683,7 @@ ha_innobase::build_template(
                 for (i = 0, sql_idx = 0; i < n_stored_fields; i++, sql_idx++) {
 			const Field*	field;
 
-			while (!table->field[sql_idx]->stored_in_db()) {
+			while (!table->field[sql_idx]->stored_in_db) {
 				sql_idx++;
 			}
 
@@ -8396,6 +8732,7 @@ dberr_t
 ha_innobase::innobase_lock_autoinc(void)
 /*====================================*/
 {
+	DBUG_ENTER("ha_innobase::innobase_lock_autoinc");
 	dberr_t		error = DB_SUCCESS;
 
 	ut_ad(!srv_read_only_mode);
@@ -8435,6 +8772,8 @@ ha_innobase::innobase_lock_autoinc(void)
 		/* Fall through to old style locking. */
 
 	case AUTOINC_OLD_STYLE_LOCKING:
+		DBUG_EXECUTE_IF("die_if_autoinc_old_lock_style_used",
+				ut_ad(0););
 		error = row_lock_table_autoinc_for_mysql(prebuilt);
 
 		if (error == DB_SUCCESS) {
@@ -8448,7 +8787,7 @@ ha_innobase::innobase_lock_autoinc(void)
 		ut_error;
 	}
 
-	return(error);
+	DBUG_RETURN(error);
 }
 
 /********************************************************************//**
@@ -8540,6 +8879,8 @@ ha_innobase::write_row(
 		++trx->will_lock;
 	}
 
+	ha_statistic_increment(&SSV::ha_write_count);
+
 	if (share->ib_table != prebuilt->table) {
 		fprintf(stderr,
 			"InnoDB: Warning: share->ib_table %p prebuilt->table %p table %s is_corrupt %lu.",
@@ -8728,12 +9069,6 @@ no_commit:
 	error = row_insert_for_mysql((byte*) record, prebuilt);
 	DEBUG_SYNC(user_thd, "ib_after_row_insert");
 
-#ifdef EXTENDED_FOR_USERSTAT
-	if (UNIV_LIKELY(error == DB_SUCCESS && !trx->fake_changes)) {
-		rows_changed++;
-	}
-#endif
-
 	/* Handle duplicate key errors */
 	if (auto_inc_used) {
 		ulonglong	auto_inc;
@@ -8956,7 +9291,7 @@ calc_row_difference(
 
 	for (sql_idx = 0; sql_idx < n_fields; sql_idx++) {
 		field = table->field[sql_idx];
-		if (!field->stored_in_db())
+		if (!field->stored_in_db)
 		  continue;
 
 		o_ptr = (const byte*) old_row + get_field_offset(table, field);
@@ -8980,8 +9315,11 @@ calc_row_difference(
 		switch (col_type) {
 
 		case DATA_BLOB:
-			o_ptr = row_mysql_read_blob_ref(&o_len, o_ptr, o_len);
-			n_ptr = row_mysql_read_blob_ref(&n_len, n_ptr, n_len);
+			/* Do not compress blob column while comparing*/
+			o_ptr = row_mysql_read_blob_ref(&o_len, o_ptr, o_len,
+				false, 0, 0, prebuilt);
+			n_ptr = row_mysql_read_blob_ref(&n_len, n_ptr, n_len,
+				false, 0, 0, prebuilt);
 
 			break;
 
@@ -9051,7 +9389,17 @@ calc_row_difference(
 					TRUE,
 					new_mysql_row_col,
 					col_pack_len,
-					dict_table_is_comp(prebuilt->table));
+					dict_table_is_comp(prebuilt->table),
+#ifdef HAVE_PERCONA_COMPRESSED_COLUMNS
+					field->column_format() ==
+						COLUMN_FORMAT_TYPE_COMPRESSED,
+					reinterpret_cast<const byte*>(
+						field->zip_dict_data.str),
+					field->zip_dict_data.length,
+#else
+                                        0, 0, 0,
+#endif
+					prebuilt);
 				dfield_copy(&ufield->new_val, &dfield);
 			} else {
 				dfield_set_null(&ufield->new_val);
@@ -9094,7 +9442,7 @@ calc_row_difference(
 				}
 			}
 		}
-		if (field->stored_in_db())
+		if (field->stored_in_db)
 			innodb_idx++;
 	}
 
@@ -9223,7 +9571,8 @@ wsrep_calc_row_hash(
 		switch (col_type) {
 
 		case DATA_BLOB:
-			ptr = row_mysql_read_blob_ref(&len, ptr, len);
+			ptr = row_mysql_read_blob_ref(&len, ptr, len,
+				false, 0, 0, prebuilt);
 
 			break;
 
@@ -9311,6 +9660,8 @@ ha_innobase::update_row(
 		}
 	}
 
+	ha_statistic_increment(&SSV::ha_update_count);
+
 	if (share->ib_table != prebuilt->table) {
 		fprintf(stderr,
 			"InnoDB: Warning: share->ib_table %p prebuilt->table %p table %s is_corrupt %lu.",
@@ -9385,12 +9736,6 @@ ha_innobase::update_row(
 		}
 	}
 
-#ifdef EXTENDED_FOR_USERSTAT
-	if (UNIV_LIKELY(error == DB_SUCCESS && !trx->fake_changes)) {
-		rows_changed++;
-	}
-#endif
-
 	innobase_srv_conc_exit_innodb(trx);
 
 func_exit:
@@ -9468,6 +9813,8 @@ ha_innobase::delete_row(
 		++trx->will_lock;
 	}
 
+	ha_statistic_increment(&SSV::ha_delete_count);
+
 	if (UNIV_UNLIKELY(share && share->ib_table
 			  && share->ib_table->is_corrupt)) {
 		DBUG_RETURN(HA_ERR_CRASHED);
@@ -9485,12 +9832,6 @@ ha_innobase::delete_row(
 
 	error = row_update_for_mysql((byte*) record, prebuilt);
 
-#ifdef EXTENDED_FOR_USERSTAT
-	if (UNIV_LIKELY(error == DB_SUCCESS && !trx->fake_changes)) {
-		rows_changed++;
-	}
-#endif
-
 	innobase_srv_conc_exit_innodb(trx);
 
 	/* Tell the InnoDB server that there might be work for
@@ -9766,6 +10107,8 @@ ha_innobase::index_read(
 	ut_a(prebuilt->trx == thd_to_trx(user_thd));
 	ut_ad(key_len != 0 || find_flag != HA_READ_KEY_EXACT);
 
+	ha_statistic_increment(&SSV::ha_read_key_count);
+
 	if (UNIV_UNLIKELY(srv_pass_corrupt_table <= 1 && share
 			  && share->ib_table && share->ib_table->is_corrupt)) {
 		DBUG_RETURN(HA_ERR_CRASHED);
@@ -9859,11 +10202,6 @@ ha_innobase::index_read(
 			srv_stats.n_rows_read.add(
 				(size_t) prebuilt->trx->id, 1);
 		}
-#ifdef EXTENDED_FOR_USERSTAT
-		rows_read++;
-		if (active_index < MAX_KEY)
-			index_rows_read[active_index]++;
-#endif
 		break;
 	case DB_RECORD_NOT_FOUND:
 		error = HA_ERR_KEY_NOT_FOUND;
@@ -10160,11 +10498,6 @@ ha_innobase::general_fetch(
 		error = 0;
 		table->status = 0;
 		srv_stats.n_rows_read.add((size_t) prebuilt->trx->id, 1);
-#ifdef EXTENDED_FOR_USERSTAT
-		rows_read++;
-		if (active_index < MAX_KEY)
-			index_rows_read[active_index]++;
-#endif
 		break;
 	case DB_RECORD_NOT_FOUND:
 		error = HA_ERR_END_OF_FILE;
@@ -11029,7 +11362,7 @@ ha_innobase::wsrep_append_keys(
 
 		len = wsrep_store_key_val_for_row(
 			thd, table, 0, key, WSREP_MAX_SUPPORTED_KEY_LENGTH,
-			record0, &is_null);
+			record0, prebuilt, &is_null);
 
 		if (!is_null) {
 			rcode = wsrep_append_key(
@@ -11083,7 +11416,7 @@ ha_innobase::wsrep_append_keys(
 				len = wsrep_store_key_val_for_row(
 					thd, table, i, key0, 
 					WSREP_MAX_SUPPORTED_KEY_LENGTH, 
-					record0, &is_null);
+					record0, prebuilt, &is_null);
 				if (!is_null) {
 					rcode = wsrep_append_key(
 						thd, trx, table_share, table, 
@@ -11102,7 +11435,7 @@ ha_innobase::wsrep_append_keys(
 					len = wsrep_store_key_val_for_row(
 						thd, table, i, key1, 
 						WSREP_MAX_SUPPORTED_KEY_LENGTH,
-						record1, &is_null);
+						record1, prebuilt, &is_null);
 					if (!is_null && memcmp(key0, key1, len)) {
 						rcode = wsrep_append_key(
 							thd, trx, table_share, 
@@ -11279,6 +11612,7 @@ create_table_def(
 	ulint		unsigned_type;
 	ulint		binary_type;
 	ulint		long_true_varchar;
+	ulint		compressed;
 	ulint		charset_no;
 	ulint		i;
 	ulint		doc_id_col = 0;
@@ -11365,7 +11699,7 @@ create_table_def(
 
 	for (i = 0; i < n_cols; i++) {
 		Field*	field = form->field[i];
-		if (!field->stored_in_db())
+		if (!field->stored_in_db)
 			continue;
 
 		col_type = get_innobase_type_from_mysql_type(&unsigned_type,
@@ -11428,6 +11762,13 @@ create_table_def(
 			}
 		}
 
+		/* Check if the the field has COMPRESSED attribute */
+		compressed = 0;
+		if (field->column_format() ==
+			COLUMN_FORMAT_TYPE_COMPRESSED) {
+			compressed = DATA_COMPRESSED;
+		}
+
 		/* First check whether the column to be added has a
 		system reserved name. */
 		if (dict_col_name_is_reserved(field->field_name)){
@@ -11448,7 +11789,8 @@ err_col:
 			dtype_form_prtype(
 				(ulint) field->type()
 				| nulls_allowed | unsigned_type
-				| binary_type | long_true_varchar,
+				| binary_type | long_true_varchar
+				| compressed,
 				charset_no),
 			col_len);
 	}
@@ -12498,6 +12840,9 @@ ha_innobase::create(
 	fil_encryption_t encrypt = (fil_encryption_t)options->encryption;
 	uint		key_id = (uint)options->encryption_key_id;
 
+	mem_heap_t*	heap = 0;
+	ulint*		zip_dict_ids = 0;
+
 	DBUG_ENTER("ha_innobase::create");
 
 	DBUG_ASSERT(thd != NULL);
@@ -12594,6 +12939,25 @@ ha_innobase::create(
 
 	row_mysql_lock_data_dictionary(trx);
 
+	heap = mem_heap_create(form->s->fields * sizeof(ulint));
+	zip_dict_ids = static_cast<ulint*>(
+		mem_heap_alloc(heap, form->s->fields * sizeof(ulint)));
+
+	/* This is currently required for valgrind because MariaDB does
+	not currently support compressed columns. */
+	for (size_t field_idx = 0; field_idx < form->s->fields; ++field_idx) {
+		zip_dict_ids[field_idx] = ULINT_UNDEFINED;
+	}
+
+	const char*	err_zip_dict_name = 0;
+	if (!innobase_check_zip_dicts(form, zip_dict_ids,
+		trx, &err_zip_dict_name)) {
+		error = -1;
+		my_error(ER_COMPRESSION_DICTIONARY_DOES_NOT_EXIST,
+			MYF(0), err_zip_dict_name);
+		goto cleanup;
+	}
+
 	error = create_table_def(trx, form, norm_name, temp_path,
 			remote_path, flags, flags2, encrypt, key_id);
 	if (error) {
@@ -12701,6 +13065,22 @@ ha_innobase::create(
 		dict_table_get_all_fts_indexes(innobase_table, fts->indexes);
 	}
 
+	/*
+	Adding compression dictionary <-> compressed table column links
+	to the SYS_ZIP_DICT_COLS table.
+	*/
+	ut_a(zip_dict_ids != 0);
+	{
+		dict_table_t*	local_table = dict_table_open_on_name(
+			norm_name, TRUE, FALSE, DICT_ERR_IGNORE_NONE);
+
+		ut_a(local_table);
+		table_id_t table_id = local_table->id;
+		dict_table_close(local_table, TRUE, FALSE);
+		innobase_create_zip_dict_references(form,
+			table_id, zip_dict_ids, trx);
+	}
+
 	stmt = innobase_get_stmt(thd, &stmt_len);
 
 	if (stmt) {
@@ -12817,6 +13197,9 @@ ha_innobase::create(
 
 	trx_free_for_mysql(trx);
 
+	if (heap != 0)
+		mem_heap_free(heap);
+
 	DBUG_RETURN(0);
 
 cleanup:
@@ -12826,6 +13209,9 @@ cleanup:
 
 	trx_free_for_mysql(trx);
 
+	if (heap != 0)
+		mem_heap_free(heap);
+
 	DBUG_RETURN(error);
 }
 
@@ -14009,6 +14395,14 @@ ha_innobase::info_low(
 			if (dict_stats_is_persistent_enabled(ib_table)) {
 
 				if (is_analyze) {
+
+					/* If this table is already queued for
+					background analyze, remove it from the
+					queue as we are about to do the same */
+					dict_mutex_enter_for_mysql();
+					dict_stats_recalc_pool_del(ib_table);
+					dict_mutex_exit_for_mysql();
+
 					opt = DICT_STATS_RECALC_PERSISTENT;
 				} else {
 					/* This is e.g. 'SHOW INDEXES', fetch
@@ -14459,7 +14853,7 @@ ha_innobase::optimize(
 	if (innodb_optimize_fulltext_only) {
 		if (prebuilt->table->fts && prebuilt->table->fts->cache
 		    && !dict_table_is_discarded(prebuilt->table)) {
-			fts_sync_table(prebuilt->table, false, true);
+			fts_sync_table(prebuilt->table, false, true, false);
 			fts_optimize_table(prebuilt->table);
 		}
 		return(HA_ADMIN_OK);
@@ -14637,7 +15031,7 @@ ha_innobase::check(
 			if (!dict_index_is_clust(index)) {
 				prebuilt->index_usable = FALSE;
 				row_mysql_lock_data_dictionary(prebuilt->trx);
-                                dict_set_corrupted(index, prebuilt->trx, "dict_set_index_corrupted");;
+                                dict_set_corrupted(index, prebuilt->trx, "dict_set_index_corrupted");
 				row_mysql_unlock_data_dictionary(prebuilt->trx);
 			});
 
@@ -14676,7 +15070,14 @@ ha_innobase::check(
 
 		prebuilt->select_lock_type = LOCK_NONE;
 
-		if (!row_check_index_for_mysql(prebuilt, index, &n_rows)) {
+		bool check_result
+			= row_check_index_for_mysql(prebuilt, index, &n_rows);
+		DBUG_EXECUTE_IF(
+				"dict_set_index_corrupted",
+				if (!(index->type & DICT_CLUSTERED)) {
+					check_result = false;
+				});
+		if (!check_result) {
 			innobase_format_name(
 				index_name, sizeof index_name,
 				index->name, TRUE);
@@ -15003,6 +15404,75 @@ get_foreign_key_info(
 	return(pf_key_info);
 }
 
+/** Append to a list the foreign keys that reference the given table.
+The caller must hold dict_sys->mutex.
+@param thd		The thread handle
+@param table		Table whose referenced_set is traversed
+@param f_key_list[out]	The list of foreign keys */
+static
+void
+fill_foreign_key_list(THD* thd,
+		      const dict_table_t* table,
+		      List<FOREIGN_KEY_INFO>* f_key_list)
+{
+	ut_ad(mutex_own(&dict_sys->mutex));
+
+	dict_foreign_set::iterator	fk_it = table->referenced_set.begin();
+
+	while (fk_it != table->referenced_set.end()) {
+		/* NULL results are skipped rather than pushed. */
+		FOREIGN_KEY_INFO*	fk_info
+			= get_foreign_key_info(thd, *fk_it);
+		if (fk_info != NULL) {
+			f_key_list->push_back(fk_info);
+		}
+		++fk_it;
+	}
+}
+
+/** Get the list of foreign keys referencing a specified table.
+The table is opened by its normalized name under dict_sys->mutex.
+@param thd		The thread handle
+@param path		Path to the table
+@param f_key_list[out]	The list of foreign keys
+@return 0 on success, HA_ERR_NO_SUCH_TABLE if the table cannot be opened */
+static
+int
+innobase_get_parent_fk_list(
+	THD*			thd,
+	const char*		path,
+	List<FOREIGN_KEY_INFO>*	f_key_list)
+{
+	ut_a(strlen(path) <= FN_REFLEN);
+	char	norm_name[FN_REFLEN + 1];
+	normalize_table_name(norm_name, path);
+
+	trx_t*	parent_trx = check_trx_exists(thd);
+	parent_trx->op_info = "getting list of referencing foreign keys";
+	trx_search_latch_release_if_reserved(parent_trx);
+
+	mutex_enter(&dict_sys->mutex);
+
+	dict_table_t*	table
+		= dict_table_open_on_name(norm_name, TRUE, FALSE,
+					  static_cast<dict_err_ignore_t>(
+						  DICT_ERR_IGNORE_INDEX_ROOT
+						  | DICT_ERR_IGNORE_CORRUPT));
+	if (!table) {
+		mutex_exit(&dict_sys->mutex);
+		/* Do not leave a stale op_info behind on failure. */
+		parent_trx->op_info = "";
+		return(HA_ERR_NO_SUCH_TABLE);
+	}
+
+	fill_foreign_key_list(thd, table, f_key_list);
+
+	dict_table_close(table, TRUE, FALSE);
+
+	mutex_exit(&dict_sys->mutex);
+	parent_trx->op_info = "";
+	return(0);
+}
+
 /*******************************************************************//**
 Gets the list of foreign keys in this table.
 @return always 0, that is, always succeeds */
@@ -15055,9 +15525,6 @@ ha_innobase::get_parent_foreign_key_list(
 	THD*			thd,		/*!< in: user thread handle */
 	List<FOREIGN_KEY_INFO>*	f_key_list)	/*!< out: foreign key list */
 {
-	FOREIGN_KEY_INFO*	pf_key_info;
-	dict_foreign_t*		foreign;
-
 	ut_a(prebuilt != NULL);
 	update_thd(ha_thd());
 
@@ -15066,20 +15533,7 @@ ha_innobase::get_parent_foreign_key_list(
 	trx_search_latch_release_if_reserved(prebuilt->trx);
 
 	mutex_enter(&(dict_sys->mutex));
-
-	for (dict_foreign_set::iterator it
-		= prebuilt->table->referenced_set.begin();
-	     it != prebuilt->table->referenced_set.end();
-	     ++it) {
-
-		foreign = *it;
-
-		pf_key_info = get_foreign_key_info(thd, foreign);
-		if (pf_key_info) {
-			f_key_list->push_back(pf_key_info);
-		}
-	}
-
+	fill_foreign_key_list(thd, prebuilt->table, f_key_list);
 	mutex_exit(&(dict_sys->mutex));
 
 	prebuilt->trx->op_info = "";
@@ -15169,6 +15623,11 @@ ha_innobase::extra(
 		if (prebuilt->blob_heap) {
 			row_mysql_prebuilt_free_blob_heap(prebuilt);
 		}
+
+		if (prebuilt->compress_heap) {
+			row_mysql_prebuilt_free_compress_heap(prebuilt);
+		}
+
 		break;
 	case HA_EXTRA_RESET_STATE:
 		reset_template();
@@ -15220,6 +15679,10 @@ ha_innobase::reset()
 		row_mysql_prebuilt_free_blob_heap(prebuilt);
 	}
 
+	if (prebuilt->compress_heap) {
+		row_mysql_prebuilt_free_compress_heap(prebuilt);
+	}
+
 	reset_template();
 	ds_mrr.dsmrr_close();
 
@@ -15426,7 +15889,11 @@ ha_innobase::external_lock(
 		    && lock_type == F_WRLCK)
 		|| thd_sql_command(thd) == SQLCOM_CREATE_INDEX
 		|| thd_sql_command(thd) == SQLCOM_DROP_INDEX
-		|| thd_sql_command(thd) == SQLCOM_DELETE)) {
+		|| thd_sql_command(thd) == SQLCOM_DELETE
+		|| thd_sql_command(thd) ==
+			SQLCOM_CREATE_COMPRESSION_DICTIONARY
+		|| thd_sql_command(thd) ==
+			SQLCOM_DROP_COMPRESSION_DICTIONARY)) {
 
 		if (thd_sql_command(thd) == SQLCOM_CREATE_TABLE)
 		{
@@ -16194,7 +16661,9 @@ ha_innobase::store_lock(
 			 && lock_type <= TL_WRITE))
 		|| sql_command == SQLCOM_CREATE_INDEX
 		|| sql_command == SQLCOM_DROP_INDEX
-		|| sql_command == SQLCOM_DELETE)) {
+		|| sql_command == SQLCOM_DELETE
+		|| sql_command == SQLCOM_CREATE_COMPRESSION_DICTIONARY
+		|| sql_command == SQLCOM_DROP_COMPRESSION_DICTIONARY)) {
 
 		ib_senderrf(trx->mysql_thd,
 			    IB_LOG_LEVEL_WARN, ER_READ_ONLY_MODE);
@@ -16258,7 +16727,7 @@ ha_innobase::store_lock(
 		/* Use consistent read for checksum table */
 
 		if (sql_command == SQLCOM_CHECKSUM
-                    || (sql_command == SQLCOM_ANALYZE && lock_type == TL_READ)
+		    || sql_command == SQLCOM_CHECKSUM
 		    || ((srv_locks_unsafe_for_binlog
 			|| trx->isolation_level <= TRX_ISO_READ_COMMITTED)
 			&& trx->isolation_level != TRX_ISO_SERIALIZABLE
@@ -17165,6 +17634,84 @@ ha_innobase::check_if_incompatible_data(
 	return(COMPATIBLE_DATA_YES);
 }
 
+/** This function reads zip dict-related info from SYS_ZIP_DICT
+and SYS_ZIP_DICT_COLS for all columns marked with
+COLUMN_FORMAT_TYPE_COMPRESSED flag and updates
+zip_dict_name / zip_dict_data for those which have associated
+compression dictionaries. It does nothing, and opens no table,
+unless HAVE_PERCONA_COMPRESSED_COLUMNS is defined. */
+UNIV_INTERN
+void
+ha_innobase::update_field_defs_with_zip_dict_info()
+{
+	DBUG_ENTER("update_field_defs_with_zip_dict_info");
+	ut_ad(!mutex_own(&dict_sys->mutex));
+
+#ifdef HAVE_PERCONA_COMPRESSED_COLUMNS
+	char norm_name[FN_REFLEN];
+	normalize_table_name(norm_name, table_share->normalized_path.str);
+
+	dict_table_t* ib_table = dict_table_open_on_name(
+		norm_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE);
+
+	/* if dict_table_open_on_name() returns NULL, then it means that
+	TABLE_SHARE is populated for a table being created and we can
+	skip filling zip dict info here */
+	if (ib_table == 0)
+		DBUG_VOID_RETURN;
+
+	table_id_t ib_table_id = ib_table->id;
+	dict_table_close(ib_table, FALSE, FALSE);
+	Field* field;
+	for (uint i = 0; i < table_share->fields; ++i) {
+		field = table_share->field[i];
+		if (field->column_format() ==
+		    COLUMN_FORMAT_TYPE_COMPRESSED) {
+			bool reference_found = false;
+			ulint dict_id = 0;
+			switch (dict_get_dictionary_id_by_key(ib_table_id, i,
+				&dict_id)) {
+				case DB_SUCCESS:
+					reference_found = true;
+					break;
+				case DB_RECORD_NOT_FOUND:
+					reference_found = false;
+					break;
+				default:
+					ut_error;
+			}
+			if (reference_found) {
+				char* local_name = 0;
+				ulint local_name_len = 0;
+				char* local_data = 0;
+				ulint local_data_len = 0;
+				if (dict_get_dictionary_info_by_id(dict_id,
+					&local_name, &local_name_len,
+					&local_data, &local_data_len) !=
+					DB_SUCCESS) {
+					ut_error;
+				}
+				else {
+					field->zip_dict_name.str =
+						local_name;
+					field->zip_dict_name.length =
+						local_name_len;
+					field->zip_dict_data.str =
+						local_data;
+					field->zip_dict_data.length =
+						local_data_len;
+				}
+			}
+			else {
+				field->zip_dict_name = null_lex_cstr;
+				field->zip_dict_data = null_lex_cstr;
+			}
+		}
+	}
+#endif
+	DBUG_VOID_RETURN;
+}
+
 /****************************************************************//**
 Update the system variable innodb_io_capacity_max using the "saved"
 value. This function is registered as a callback with MySQL. */
@@ -17733,7 +18280,12 @@ innodb_internal_table_update(
 		my_free(old);
 	}
 
-	fts_internal_tbl_name = *(char**) var_ptr;
+	fts_internal_tbl_name2 = *(char**) var_ptr;
+	if (fts_internal_tbl_name2 == NULL) {
+		fts_internal_tbl_name = const_cast<char*>("default");
+	} else {
+		fts_internal_tbl_name = fts_internal_tbl_name2;
+	}
 }
 
 /****************************************************************//**
@@ -18628,7 +19180,7 @@ innodb_defragment_frequency_update(
 {
 	srv_defragment_frequency = (*static_cast<const uint*>(save));
 	srv_defragment_interval = ut_microseconds_to_timer(
-		(ulonglong) (1000000.0 / srv_defragment_frequency));
+		1000000.0 / srv_defragment_frequency);
 }
 
 /****************************************************************//**
@@ -18882,7 +19434,6 @@ innodb_track_changed_pages_validate(
 						for update function */
 	struct st_mysql_value*		value)	/*!< in: incoming bool */
 {
-	static bool     enabled_on_startup = false;
 	long long	intbuf = 0;
 
 	if (value->val_int(value, &intbuf)) {
@@ -18890,8 +19441,7 @@ innodb_track_changed_pages_validate(
 		return 1;
 	}
 
-	if (srv_track_changed_pages || enabled_on_startup) {
-		enabled_on_startup = true;
+	if (srv_redo_log_thread_started) {
 		*reinterpret_cast<ulong*>(save)
 			= static_cast<ulong>(intbuf);
 		return 0;
@@ -19491,22 +20041,22 @@ wsrep_innobase_kill_one_trx(
 
 	if (!thd) {
 		DBUG_PRINT("wsrep", ("no thd for conflicting lock"));
-		WSREP_WARN("no THD for trx: %lu", (ulong) victim_trx->id);
+		WSREP_WARN("no THD for trx: %lu", victim_trx->id);
 		DBUG_RETURN(1);
 	}
 
 	if (!bf_thd) {
 		DBUG_PRINT("wsrep", ("no BF thd for conflicting lock"));
-		WSREP_WARN("no BF THD for trx: %lu", (bf_trx) ? (ulong) bf_trx->id : (ulong) 0);
+		WSREP_WARN("no BF THD for trx: %lu", (bf_trx) ? bf_trx->id : 0);
 		DBUG_RETURN(1);
 	}
 
 	WSREP_LOG_CONFLICT(bf_thd, thd, TRUE);
 
-	WSREP_DEBUG("BF kill (%lu, seqno: %lld), victim: (%lu) trx: %llu",
+	WSREP_DEBUG("BF kill (%lu, seqno: %lld), victim: (%lu) trx: %lu",
  		    signal, (long long)bf_seqno,
 		    thd_get_thread_id(thd),
-		    (ulonglong) victim_trx->id);
+		    victim_trx->id);
 
 	WSREP_DEBUG("Aborting query: %s",
 		  (thd && wsrep_thd_query(thd)) ? wsrep_thd_query(thd) : "void");
@@ -19523,15 +20073,14 @@ wsrep_innobase_kill_one_trx(
 
 
 	if (wsrep_thd_query_state(thd) == QUERY_EXITING) {
-                WSREP_DEBUG("kill trx EXITING for %llu",
-                            (ulonglong) victim_trx->id);
+		WSREP_DEBUG("kill trx EXITING for %lu", victim_trx->id);
 		wsrep_thd_UNLOCK(thd);
 		DBUG_RETURN(0);
 	}
 
 	if(wsrep_thd_exec_mode(thd) != LOCAL_STATE) {
-		WSREP_DEBUG("withdraw for BF trx: %llu, state: %d",
-			    (longlong) victim_trx->id,
+		WSREP_DEBUG("withdraw for BF trx: %lu, state: %d",
+			    victim_trx->id,
 		wsrep_thd_get_conflict_state(thd));
 	}
 
@@ -19540,8 +20089,8 @@ wsrep_innobase_kill_one_trx(
 		wsrep_thd_set_conflict_state(thd, MUST_ABORT);
 		break;
         case MUST_ABORT:
-		WSREP_DEBUG("victim %llu in MUST ABORT state",
-			    (longlong) victim_trx->id);
+		WSREP_DEBUG("victim %lu in MUST ABORT state",
+			    victim_trx->id);
 		wsrep_thd_UNLOCK(thd);
 		wsrep_thd_awake(thd, signal);
 		DBUG_RETURN(0);
@@ -19549,9 +20098,8 @@ wsrep_innobase_kill_one_trx(
 	case ABORTED:
 	case ABORTING: // fall through
 	default:
-		WSREP_DEBUG("victim %llu in state %d",
-			    (longlong) victim_trx->id,
-                            wsrep_thd_get_conflict_state(thd));
+		WSREP_DEBUG("victim %lu in state %d",
+			    victim_trx->id, wsrep_thd_get_conflict_state(thd));
 		wsrep_thd_UNLOCK(thd);
 		DBUG_RETURN(0);
 		break;
@@ -19563,8 +20111,8 @@ wsrep_innobase_kill_one_trx(
 
 		WSREP_DEBUG("kill query for: %ld",
 			    thd_get_thread_id(thd));
-		WSREP_DEBUG("kill trx QUERY_COMMITTING for %llu",
-			    (longlong) victim_trx->id);
+		WSREP_DEBUG("kill trx QUERY_COMMITTING for %lu",
+			    victim_trx->id);
 
 		if (wsrep_thd_exec_mode(thd) == REPL_RECV) {
 			wsrep_abort_slave_trx(bf_seqno,
@@ -19578,8 +20126,8 @@ wsrep_innobase_kill_one_trx(
 
 			switch (rcode) {
 			case WSREP_WARNING:
-				WSREP_DEBUG("cancel commit warning: %llu",
-					    (ulonglong) victim_trx->id);
+				WSREP_DEBUG("cancel commit warning: %lu",
+					    victim_trx->id);
 				wsrep_thd_UNLOCK(thd);
 				wsrep_thd_awake(thd, signal);
 				DBUG_RETURN(1);
@@ -19588,9 +20136,9 @@ wsrep_innobase_kill_one_trx(
 				break;
 			default:
 				WSREP_ERROR(
-					"cancel commit bad exit: %d %llu",
+					"cancel commit bad exit: %d %lu",
 					rcode,
-					(ulonglong) victim_trx->id);
+					victim_trx->id);
 				/* unable to interrupt, must abort */
 				/* note: kill_mysql() will block, if we cannot.
 				 * kill the lock holder first.
@@ -19606,8 +20154,7 @@ wsrep_innobase_kill_one_trx(
 		/* it is possible that victim trx is itself waiting for some
 		 * other lock. We need to cancel this waiting
 		 */
-                WSREP_DEBUG("kill trx QUERY_EXEC for %llu",
-                            (ulonglong) victim_trx->id);
+		WSREP_DEBUG("kill trx QUERY_EXEC for %lu", victim_trx->id);
 
 		victim_trx->lock.was_chosen_as_deadlock_victim= TRUE;
 		if (victim_trx->lock.wait_lock) {
@@ -19642,7 +20189,7 @@ wsrep_innobase_kill_one_trx(
 		break;
 	case QUERY_IDLE:
 	{
-                WSREP_DEBUG("kill IDLE for %llu", (ulonglong) victim_trx->id);
+		WSREP_DEBUG("kill IDLE for %lu", victim_trx->id);
 
 		if (wsrep_thd_exec_mode(thd) == REPL_RECV) {
 			WSREP_DEBUG("kill BF IDLE, seqno: %lld",
@@ -19659,7 +20206,7 @@ wsrep_innobase_kill_one_trx(
 
 		if (wsrep_aborting_thd_contains(thd)) {
 			WSREP_WARN("duplicate thd aborter %lu",
-			           (ulong) thd_get_thread_id(thd));
+			           thd_get_thread_id(thd));
 		} else {
 			wsrep_aborting_thd_enqueue(thd);
 			DBUG_PRINT("wsrep",("enqueuing trx abort for %lu",
@@ -20262,6 +20809,12 @@ static MYSQL_SYSVAR_LONGLONG(buffer_pool_size, innobase_buffer_pool_size,
   "The size of the memory buffer InnoDB uses to cache data and indexes of its tables.",
   NULL, NULL, 128*1024*1024L, 5*1024*1024L, LONGLONG_MAX, 1024*1024L);
 
+static MYSQL_SYSVAR_BOOL(buffer_pool_populate, innodb_buffer_pool_populate,
+  PLUGIN_VAR_NOCMDARG | PLUGIN_VAR_READONLY,
+  "Deprecated. This option has no effect and "
+  "will be removed in MariaDB 10.2.3.",
+  NULL, NULL, FALSE);
+
 static MYSQL_SYSVAR_ENUM(foreground_preflush, srv_foreground_preflush,
   PLUGIN_VAR_OPCMDARG,
   "The algorithm InnoDB uses for the query threads at sync preflush.  "
@@ -20387,6 +20940,18 @@ static MYSQL_SYSVAR_ENUM(empty_free_list_algorithm,
   innodb_srv_empty_free_list_algorithm_validate, NULL, SRV_EMPTY_FREE_LIST_BACKOFF,
   &innodb_empty_free_list_algorithm_typelib);
 
+static MYSQL_SYSVAR_ENUM(lock_schedule_algorithm, innodb_lock_schedule_algorithm,
+  PLUGIN_VAR_RQCMDARG,
+  "The algorithm Innodb uses for deciding which locks to grant next when"
+  " a lock is released. Possible values are"
+  " FCFS"
+    " grant the locks in First-Come-First-Served order;"
+  " VATS"
+    " use the Variance-Aware-Transaction-Scheduling algorithm, which"
+    " uses an Eldest-Transaction-First heuristic.",
+  NULL, NULL, INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS,
+  &innodb_lock_schedule_algorithm_typelib);
+
 static MYSQL_SYSVAR_LONG(buffer_pool_instances, innobase_buffer_pool_instances,
   PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
   "Number of buffer pool instances, set to higher value on high-end machines to increase scalability",
@@ -20535,7 +21100,7 @@ static MYSQL_SYSVAR_BOOL(disable_sort_file_cache, srv_disable_sort_file_cache,
   "Whether to disable OS system file cache for sort I/O",
   NULL, NULL, FALSE);
 
-static MYSQL_SYSVAR_STR(ft_aux_table, fts_internal_tbl_name,
+static MYSQL_SYSVAR_STR(ft_aux_table, fts_internal_tbl_name2,
   PLUGIN_VAR_NOCMDARG,
   "FTS internal auxiliary table to be checked",
   innodb_internal_table_validate,
@@ -21035,7 +21600,7 @@ static	MYSQL_SYSVAR_ENUM(corrupt_table_action, srv_pass_corrupt_table,
   "Warn corruptions of user tables as 'corrupt table' instead of not crashing itself, "
   "when used with file_per_table. "
   "All file io for the datafile after detected as corrupt are disabled, "
-  "except for the deletion",
+  "except for the deletion.",
   NULL, NULL, 0, &corrupt_table_action_typelib);
 
 static MYSQL_SYSVAR_BOOL(locking_fake_changes, srv_fake_changes_locks,
@@ -21050,6 +21615,21 @@ static MYSQL_SYSVAR_BOOL(use_stacktrace, srv_use_stacktrace,
   "Print stacktrace on long semaphore wait (off by default supported only on linux)",
   NULL, NULL, FALSE);
 
+#ifdef HAVE_PERCONA_COMPRESSED_COLUMNS
+static MYSQL_SYSVAR_UINT(compressed_columns_zip_level,
+  srv_compressed_columns_zip_level,
+  PLUGIN_VAR_RQCMDARG,
+  "Compression level used for compressed columns.  0 is no compression"
+  ", 1 is fastest and 9 is best compression. Default is 6.",
+  NULL, NULL, DEFAULT_COMPRESSION_LEVEL, 0, 9, 0);
+
+static MYSQL_SYSVAR_ULONG(compressed_columns_threshold,
+  srv_compressed_columns_threshold,
+  PLUGIN_VAR_RQCMDARG,
+  "Compress column data if its length exceeds this value. Default is 96",
+  NULL, NULL, 96, 1, ~0UL, 0);
+#endif
+
 static MYSQL_SYSVAR_UINT(compression_level, page_zip_level,
   PLUGIN_VAR_RQCMDARG,
   "Compression level used for zlib compression.  0 is no compression"
@@ -21280,6 +21860,7 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
   MYSQL_SYSVAR(ft_sort_pll_degree),
   MYSQL_SYSVAR(large_prefix),
   MYSQL_SYSVAR(force_load_corrupted),
+  MYSQL_SYSVAR(lock_schedule_algorithm),
   MYSQL_SYSVAR(locks_unsafe_for_binlog),
   MYSQL_SYSVAR(lock_wait_timeout),
 #ifdef UNIV_LOG_ARCHIVE
@@ -21426,11 +22007,14 @@ static struct st_mysql_sys_var* innobase_system_variables[]= {
   MYSQL_SYSVAR(locking_fake_changes),
   MYSQL_SYSVAR(tmpdir),
   MYSQL_SYSVAR(use_stacktrace),
+#ifdef HAVE_PERCONA_COMPRESSED_COLUMNS
+  MYSQL_SYSVAR(compressed_columns_zip_level),
+  MYSQL_SYSVAR(compressed_columns_threshold),
+#endif
   MYSQL_SYSVAR(force_primary_key),
   MYSQL_SYSVAR(fatal_semaphore_wait_threshold),
   /* Table page compression feature */
   MYSQL_SYSVAR(use_trim),
-  MYSQL_SYSVAR(compression_default),
   MYSQL_SYSVAR(compression_algorithm),
   MYSQL_SYSVAR(mtflush_threads),
   MYSQL_SYSVAR(use_mtflush),
@@ -21476,6 +22060,10 @@ maria_declare_plugin(xtradb)
 i_s_xtradb_read_view,
 i_s_xtradb_internal_hash_tables,
 i_s_xtradb_rseg,
+#ifdef HAVE_PERCONA_COMPRESSED_COLUMNS
+i_s_xtradb_zip_dict,
+i_s_xtradb_zip_dict_cols,
+#endif
 i_s_innodb_trx,
 i_s_innodb_locks,
 i_s_innodb_lock_waits,
@@ -22125,20 +22713,22 @@ ib_push_warning(
 	const char	*format,/*!< in: warning message */
 	...)
 {
-	va_list args;
-	THD *thd = (THD *)trx->mysql_thd;
-	char *buf;
+	if (trx && trx->mysql_thd) {
+		THD *thd = (THD *)trx->mysql_thd;
+		va_list args;
+		char *buf;
 #define MAX_BUF_SIZE 4*1024
 
-	va_start(args, format);
-	buf = (char *)my_malloc(MAX_BUF_SIZE, MYF(MY_WME));
-	vsprintf(buf,format, args);
+		va_start(args, format);
+		buf = (char *)my_malloc(MAX_BUF_SIZE, MYF(MY_WME));
+		vsprintf(buf,format, args);
 
-	push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
-		convert_error_code_to_mysql((dberr_t)error, 0, thd),
-		buf);
-	my_free(buf);
-	va_end(args);
+		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+			convert_error_code_to_mysql((dberr_t)error, 0, thd),
+			buf);
+		my_free(buf);
+		va_end(args);
+	}
 }
 
 /********************************************************************//**
@@ -22160,15 +22750,17 @@ ib_push_warning(
 		thd = current_thd;
 	}
 
-	va_start(args, format);
-	buf = (char *)my_malloc(MAX_BUF_SIZE, MYF(MY_WME));
-	vsprintf(buf,format, args);
+	if (thd) {
+		va_start(args, format);
+		buf = (char *)my_malloc(MAX_BUF_SIZE, MYF(MY_WME));
+		vsprintf(buf,format, args);
 
-	push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
-		convert_error_code_to_mysql((dberr_t)error, 0, thd),
-		buf);
-	my_free(buf);
-	va_end(args);
+		push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN,
+			convert_error_code_to_mysql((dberr_t)error, 0, thd),
+			buf);
+		my_free(buf);
+		va_end(args);
+	}
 }
 
 /********************************************************************//**
diff --git a/storage/xtradb/handler/ha_innodb.h b/storage/xtradb/handler/ha_innodb.h
index 7b0d20aec72..e6026f81c99 100644
--- a/storage/xtradb/handler/ha_innodb.h
+++ b/storage/xtradb/handler/ha_innodb.h
@@ -315,8 +315,17 @@ class ha_innobase: public handler
 	void set_partition_owner_stats(ha_statistics *stats);
 	bool check_if_incompatible_data(HA_CREATE_INFO *info,
 					uint table_changes);
+
 	bool check_if_supported_virtual_columns(void) { return TRUE; }
 
+	/** This function reads zip dict-related info from SYS_ZIP_DICT
+	and SYS_ZIP_DICT_COLS for all columns marked with
+	COLUMN_FORMAT_TYPE_COMPRESSED flag and updates
+	zip_dict_name / zip_dict_data for those which have associated
+	compression dictionaries.
+	*/
+	virtual void update_field_defs_with_zip_dict_info();
+
 private:
 	/** Builds a 'template' to the prebuilt struct.
 
@@ -732,3 +741,31 @@ ib_push_frm_error(
 	TABLE*		table,		/*!< in: MySQL table */
 	ulint		n_keys,		/*!< in: InnoDB #keys */
 	bool		push_warning);	/*!< in: print warning ? */
+
+/** This function checks if all the compression dictionaries referenced
+in table->fields exist in SYS_ZIP_DICT InnoDB system table.
+@return true if all referenced dictionaries exist */
+UNIV_INTERN
+bool
+innobase_check_zip_dicts(
+	const TABLE*	table,		/*!< in: table in MySQL data
+					dictionary */
+	ulint*		dict_ids,	/*!< out: identified zip dict ids
+					(at least n_fields long) */
+	trx_t*		trx,		/*!< in: transaction */
+	const char**	err_dict_name);	/*!< out: the name of the
+					zip_dict which does not exist. */
+
+/** This function creates compression dictionary references in
+SYS_ZIP_DICT_COLS InnoDB system table for table_id based on info
+in table->fields and provided zip dict ids. */
+UNIV_INTERN
+void
+innobase_create_zip_dict_references(
+	const TABLE*	table,		/*!< in: table in MySQL data
+					dictionary */
+	table_id_t	ib_table_id,	/*!< in: table ID in Innodb data
+					dictionary */
+	ulint*		zip_dict_ids,	/*!< in: zip dict ids
+					(at least n_fields long) */
+	trx_t*		trx);		/*!< in: transaction */
diff --git a/storage/xtradb/handler/handler0alter.cc b/storage/xtradb/handler/handler0alter.cc
index 1ce5595865a..9b46918a9d5 100644
--- a/storage/xtradb/handler/handler0alter.cc
+++ b/storage/xtradb/handler/handler0alter.cc
@@ -22,6 +22,11 @@ this program; if not, write to the Free Software Foundation, Inc.,
 Smart ALTER TABLE
 *******************************************************/
 
+#ifndef HAVE_PERCONA_COMPRESSED_COLUMNS
+#define COLUMN_FORMAT_TYPE_COMPRESSED                   0xBADF00D
+#define ER_COMPRESSION_DICTIONARY_DOES_NOT_EXIST        0xDEADFACE
+#endif
+
 #include <my_global.h>
 #include <unireg.h>
 #include <mysqld_error.h>
@@ -214,7 +219,10 @@ innobase_need_rebuild(
 	const Alter_inplace_info*	ha_alter_info,
 	const TABLE*			altered_table)
 {
-	if (ha_alter_info->handler_flags
+	Alter_inplace_info::HA_ALTER_FLAGS alter_inplace_flags =
+		ha_alter_info->handler_flags & ~(INNOBASE_INPLACE_IGNORE);
+
+	if (alter_inplace_flags
 	    == Alter_inplace_info::CHANGE_CREATE_OPTION
 	    && !(ha_alter_info->create_info->used_fields
 		 & (HA_CREATE_USED_ROW_FORMAT
@@ -1205,6 +1213,15 @@ innobase_col_to_mysql(
 		field->reset();
 
 		if (field->type() == MYSQL_TYPE_VARCHAR) {
+			if (field->column_format() ==
+				COLUMN_FORMAT_TYPE_COMPRESSED) {
+				/* Skip compressed varchar column when
+				reporting an erroneous row
+				during index creation or table rebuild. */
+				field->set_null();
+				break;
+			}
+
 			/* This is a >= 5.0.3 type true VARCHAR. Store the
 			length of the data to the first byte or the first
 			two bytes of dest. */
@@ -2492,7 +2509,8 @@ innobase_build_col_map_add(
 	mem_heap_t*	heap,
 	dfield_t*	dfield,
 	const Field*	field,
-	ulint		comp)
+	ulint		comp,
+	row_prebuilt_t*	prebuilt)
 {
 	if (field->is_real_null()) {
 		dfield_set_null(dfield);
@@ -2504,7 +2522,14 @@ innobase_build_col_map_add(
 	byte*	buf	= static_cast<byte*>(mem_heap_alloc(heap, size));
 
 	row_mysql_store_col_in_innobase_format(
-		dfield, buf, TRUE, field->ptr, size, comp);
+		dfield, buf, TRUE, field->ptr, size, comp,
+#ifdef HAVE_PERCONA_COMPRESSED_COLUMNS
+		field->column_format() == COLUMN_FORMAT_TYPE_COMPRESSED,
+		reinterpret_cast<const byte*>(field->zip_dict_data.str),
+		field->zip_dict_data.length, prebuilt);
+#else
+		0,0,0, prebuilt);
+#endif
 }
 
 /** Construct the translation table for reordering, dropping or
@@ -2529,7 +2554,8 @@ innobase_build_col_map(
 	const dict_table_t*	new_table,
 	const dict_table_t*	old_table,
 	dtuple_t*		add_cols,
-	mem_heap_t*		heap)
+	mem_heap_t*		heap,
+	row_prebuilt_t*	prebuilt)
 {
         uint old_i, old_innobase_i;
 	DBUG_ENTER("innobase_build_col_map");
@@ -2580,7 +2606,7 @@ innobase_build_col_map(
 		innobase_build_col_map_add(
 			heap, dtuple_get_nth_field(add_cols, i),
 			altered_table->field[sql_idx],
-			dict_table_is_comp(new_table));
+			dict_table_is_comp(new_table), prebuilt);
 found_col:
 		i++;
                 sql_idx++;
@@ -2744,7 +2770,8 @@ prepare_inplace_alter_table_dict(
 	ulint			flags2,
 	ulint			fts_doc_id_col,
 	bool			add_fts_doc_id,
-	bool			add_fts_doc_id_idx)
+	bool			add_fts_doc_id_idx,
+	row_prebuilt_t* 	prebuilt)
 {
 	bool			dict_locked	= false;
 	ulint*			add_key_nums;	/* MySQL key numbers */
@@ -2756,6 +2783,7 @@ prepare_inplace_alter_table_dict(
 	ulint			num_fts_index;
 	ha_innobase_inplace_ctx*ctx;
         uint                    sql_idx;
+	ulint*			zip_dict_ids = 0;
 
 	DBUG_ENTER("prepare_inplace_alter_table_dict");
 
@@ -2902,6 +2930,26 @@ prepare_inplace_alter_table_dict(
 			mode = crypt_data->encryption;
 		}
 
+		zip_dict_ids = static_cast<ulint*>(
+			mem_heap_alloc(ctx->heap,
+				altered_table->s->fields * sizeof(ulint)));
+
+		/* This is currently required for valgrind because MariaDB does
+		not currently support compressed columns. */
+		for (size_t field_idx = 0;
+		     field_idx < altered_table->s->fields;
+		     ++field_idx) {
+			zip_dict_ids[field_idx] = ULINT_UNDEFINED;
+		}
+
+		const char*	err_zip_dict_name = 0;
+		if (!innobase_check_zip_dicts(altered_table, zip_dict_ids,
+			ctx->trx, &err_zip_dict_name)) {
+			my_error(ER_COMPRESSION_DICTIONARY_DOES_NOT_EXIST,
+				MYF(0), err_zip_dict_name);
+			goto new_clustered_failed;
+		}
+
 		if (innobase_check_foreigns(
 			    ha_alter_info, altered_table, old_table,
 			    user_table, ctx->drop_fk, ctx->num_to_drop_fk)) {
@@ -3008,6 +3056,12 @@ prepare_inplace_alter_table_dict(
 				}
 			}
 
+			if (field->column_format() ==
+				COLUMN_FORMAT_TYPE_COMPRESSED) {
+				field_type |= DATA_COMPRESSED;
+			}
+
+
 			if (dict_col_name_is_reserved(field->field_name)) {
 				dict_mem_table_free(ctx->new_table);
 				my_error(ER_WRONG_COLUMN_NAME, MYF(0),
@@ -3087,7 +3141,7 @@ prepare_inplace_alter_table_dict(
 		ctx->col_map = innobase_build_col_map(
 			ha_alter_info, altered_table, old_table,
 			ctx->new_table, user_table,
-			add_cols, ctx->heap);
+			add_cols, ctx->heap, prebuilt);
 		ctx->add_cols = add_cols;
 	} else {
 		DBUG_ASSERT(!innobase_need_rebuild(ha_alter_info, old_table));
@@ -3265,6 +3319,17 @@ op_ok:
 
 	DBUG_ASSERT(error == DB_SUCCESS);
 
+#ifdef HAVE_PERCONA_COMPRESSED_COLUMNS
+	/*
+	Adding compression dictionary <-> compressed table column links
+	to the SYS_ZIP_DICT_COLS table.
+	*/
+	if (zip_dict_ids != 0) {
+		innobase_create_zip_dict_references(altered_table,
+			ctx->trx->table_id, zip_dict_ids, ctx->trx);
+	}
+#endif
+
 	/* Commit the data dictionary transaction in order to release
 	the table locks on the system tables.  This means that if
 	MySQL crashes while creating a new primary key inside
@@ -3999,7 +4064,7 @@ err_exit:
 	}
 
 	if (!(ha_alter_info->handler_flags & INNOBASE_ALTER_DATA)
-	    || (ha_alter_info->handler_flags
+	    || ((ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE)
 		== Alter_inplace_info::CHANGE_CREATE_OPTION
 		&& !innobase_need_rebuild(ha_alter_info, table))) {
 
@@ -4133,7 +4198,7 @@ found_col:
 			    table_share->table_name.str,
 			    flags, flags2,
 			    fts_doc_col_no, add_fts_doc_id,
-			    add_fts_doc_id_idx));
+			    add_fts_doc_id_idx, prebuilt));
 }
 
 /** Alter the table structure in-place with operations
@@ -4173,7 +4238,7 @@ ok_exit:
 		DBUG_RETURN(false);
 	}
 
-	if (ha_alter_info->handler_flags
+	if ((ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE)
 	    == Alter_inplace_info::CHANGE_CREATE_OPTION
 	    && !innobase_need_rebuild(ha_alter_info, table)) {
 		goto ok_exit;
diff --git a/storage/xtradb/handler/i_s.cc b/storage/xtradb/handler/i_s.cc
index d0e26f1352c..420dff83a40 100644
--- a/storage/xtradb/handler/i_s.cc
+++ b/storage/xtradb/handler/i_s.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2007, 2016, Oracle and/or its affiliates.
 Copyrigth (c) 2014, 2016, MariaDB Corporation
 
 This program is free software; you can redistribute it and/or modify it under
@@ -2935,15 +2935,26 @@ i_s_fts_deleted_generic_fill(
 		DBUG_RETURN(0);
 	}
 
-	deleted = fts_doc_ids_create();
+	/* Prevent DDL to drop fts aux tables. */
+	rw_lock_s_lock(&dict_operation_lock);
 
 	user_table = dict_table_open_on_name(
 		fts_internal_tbl_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE);
 
 	if (!user_table) {
+		rw_lock_s_unlock(&dict_operation_lock);
+
+		DBUG_RETURN(0);
+	} else if (!dict_table_has_fts_index(user_table)) {
+		dict_table_close(user_table, FALSE, FALSE);
+
+		rw_lock_s_unlock(&dict_operation_lock);
+
 		DBUG_RETURN(0);
 	}
 
+	deleted = fts_doc_ids_create();
+
 	trx = trx_allocate_for_background();
 	trx->op_info = "Select for FTS DELETE TABLE";
 
@@ -2971,6 +2982,8 @@ i_s_fts_deleted_generic_fill(
 
 	dict_table_close(user_table, FALSE, FALSE);
 
+	rw_lock_s_unlock(&dict_operation_lock);
+
 	DBUG_RETURN(0);
 }
 
@@ -3342,6 +3355,12 @@ i_s_fts_index_cache_fill(
 		DBUG_RETURN(0);
 	}
 
+	if (user_table->fts == NULL || user_table->fts->cache == NULL) {
+		dict_table_close(user_table, FALSE, FALSE);
+
+		DBUG_RETURN(0);
+	}
+
 	cache = user_table->fts->cache;
 
 	ut_a(cache);
@@ -3775,10 +3794,15 @@ i_s_fts_index_table_fill(
 		DBUG_RETURN(0);
 	}
 
+	/* Prevent DDL to drop fts aux tables. */
+	rw_lock_s_lock(&dict_operation_lock);
+
 	user_table = dict_table_open_on_name(
 		fts_internal_tbl_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE);
 
 	if (!user_table) {
+		rw_lock_s_unlock(&dict_operation_lock);
+
 		DBUG_RETURN(0);
 	}
 
@@ -3791,6 +3815,8 @@ i_s_fts_index_table_fill(
 
 	dict_table_close(user_table, FALSE, FALSE);
 
+	rw_lock_s_unlock(&dict_operation_lock);
+
 	DBUG_RETURN(0);
 }
 
@@ -3923,16 +3949,25 @@ i_s_fts_config_fill(
 		DBUG_RETURN(0);
 	}
 
+	DEBUG_SYNC_C("i_s_fts_config_fille_check");
+
 	fields = table->field;
 
+	/* Prevent DDL to drop fts aux tables. */
+	rw_lock_s_lock(&dict_operation_lock);
+
 	user_table = dict_table_open_on_name(
 		fts_internal_tbl_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE);
 
 	if (!user_table) {
+		rw_lock_s_unlock(&dict_operation_lock);
+
 		DBUG_RETURN(0);
 	} else if (!dict_table_has_fts_index(user_table)) {
 		dict_table_close(user_table, FALSE, FALSE);
 
+		rw_lock_s_unlock(&dict_operation_lock);
+
 		DBUG_RETURN(0);
 	}
 
@@ -3988,6 +4023,8 @@ i_s_fts_config_fill(
 
 	dict_table_close(user_table, FALSE, FALSE);
 
+	rw_lock_s_unlock(&dict_operation_lock);
+
 	DBUG_RETURN(0);
 }
 
diff --git a/storage/xtradb/handler/xtradb_i_s.cc b/storage/xtradb/handler/xtradb_i_s.cc
index 96e31b94470..39f2efb90db 100644
--- a/storage/xtradb/handler/xtradb_i_s.cc
+++ b/storage/xtradb/handler/xtradb_i_s.cc
@@ -33,9 +33,11 @@ this program; if not, write to the Free Software Foundation, Inc.,
 #include <read0i_s.h>
 #include <trx0i_s.h>
 #include "srv0start.h"	/* for srv_was_started */
+#include <btr0pcur.h> /* btr_pcur_t */
 #include <btr0sea.h> /* btr_search_sys */
 #include <log0recv.h> /* recv_sys */
 #include <fil0fil.h>
+#include <dict0crea.h> /* for ZIP_DICT_MAX_* constants */
 
 /* for XTRADB_RSEG table */
 #include "trx0trx.h" /* for TRX_QUE_STATE_STR_MAX_LEN */
@@ -44,6 +46,30 @@ this program; if not, write to the Free Software Foundation, Inc.,
 
 #define PLUGIN_AUTHOR "Percona Inc."
 
+static int field_store_blob(Field*, const char*, uint) __attribute__((unused));
+/** Auxiliary function to store (char*, len) value in MYSQL_TYPE_BLOB
+field.
+@return	0 on success */
+static
+int
+field_store_blob(
+	Field*		field,		/*!< in/out: target field for storage */
+	const char*	data,		/*!< in: pointer to data, or NULL */
+	uint		data_len)	/*!< in: data length */
+{
+	int	ret;
+
+	if (data != NULL) {
+		ret = field->store(data, data_len, system_charset_info);
+		field->set_notnull();
+	} else {
+		ret = 0; /* success */
+		field->set_null();
+	}
+
+	return(ret);
+}
+
 static
 int
 i_s_common_deinit(
@@ -516,3 +542,331 @@ UNIV_INTERN struct st_mysql_plugin	i_s_xtradb_rseg =
 	STRUCT_FLD(version_info, INNODB_VERSION_STR),
         STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE),
 };
+
+
+#ifdef HAVE_PERCONA_COMPRESSED_COLUMNS
+/************************************************************************/
+enum zip_dict_field_type
+{
+	zip_dict_field_id,
+	zip_dict_field_name,
+	zip_dict_field_zip_dict
+};
+
+static ST_FIELD_INFO xtradb_sys_zip_dict_fields_info[] =
+{
+	{ STRUCT_FLD(field_name, "id"),
+	STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+	STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+	STRUCT_FLD(value, 0),
+	STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+	STRUCT_FLD(old_name, ""),
+	STRUCT_FLD(open_method, SKIP_OPEN_TABLE) },
+
+	{ STRUCT_FLD(field_name, "name"),
+	STRUCT_FLD(field_length, ZIP_DICT_MAX_NAME_LENGTH),
+	STRUCT_FLD(field_type, MYSQL_TYPE_STRING),
+	STRUCT_FLD(value, 0),
+	STRUCT_FLD(field_flags, 0),
+	STRUCT_FLD(old_name, ""),
+	STRUCT_FLD(open_method, SKIP_OPEN_TABLE) },
+
+	{ STRUCT_FLD(field_name, "zip_dict"),
+	STRUCT_FLD(field_length, ZIP_DICT_MAX_DATA_LENGTH),
+	STRUCT_FLD(field_type, MYSQL_TYPE_BLOB),
+	STRUCT_FLD(value, 0),
+	STRUCT_FLD(field_flags, 0),
+	STRUCT_FLD(old_name, ""),
+	STRUCT_FLD(open_method, SKIP_OPEN_TABLE) },
+
+	END_OF_ST_FIELD_INFO
+};
+
+/** Function to fill INFORMATION_SCHEMA.XTRADB_ZIP_DICT with information
+collected by scanning SYS_ZIP_DICT table.
+@return 0 on success */
+static
+int
+xtradb_i_s_dict_fill_sys_zip_dict(
+	THD*		thd,		/*!< in: thread */
+	ulint		id,		/*!< in: dict ID */
+	const char*	name,		/*!< in: dict name */
+	const char*	data,		/*!< in: dict data */
+	ulint		data_len,	/*!< in: dict data length */
+	TABLE*		table_to_fill)	/*!< in/out: fill this table */
+{
+	DBUG_ENTER("xtradb_i_s_dict_fill_sys_zip_dict");
+
+	Field**	fields = table_to_fill->field;
+
+	OK(field_store_ulint(fields[zip_dict_field_id], id));
+	OK(field_store_string(fields[zip_dict_field_name], name));
+	OK(field_store_blob(fields[zip_dict_field_zip_dict], data,
+		data_len));
+
+	OK(schema_table_store_record(thd, table_to_fill));
+
+	DBUG_RETURN(0);
+}
+
+/** Function to populate INFORMATION_SCHEMA.XTRADB_ZIP_DICT table.
+Loop through each record in SYS_ZIP_DICT, extract the dictionary id,
+name and data, and fill the INFORMATION_SCHEMA.XTRADB_ZIP_DICT table.
+@return 0 on success */
+static
+int
+xtradb_i_s_sys_zip_dict_fill_table(
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (not used) */
+{
+	btr_pcur_t	pcur;
+	const rec_t*	rec;
+	mem_heap_t*	heap;
+	mtr_t		mtr;
+
+	DBUG_ENTER("xtradb_i_s_sys_zip_dict_fill_table");
+	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+	/* deny access to user without SUPER_ACL privilege */
+	if (check_global_access(thd, SUPER_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	heap = mem_heap_create(1000);
+	mutex_enter(&dict_sys->mutex);
+	mtr_start(&mtr);
+
+	rec = dict_startscan_system(&pcur, &mtr, SYS_ZIP_DICT);
+	ulint zip_size = dict_table_zip_size(pcur.btr_cur.index->table);
+
+	while (rec) {
+		const char*	err_msg;
+		ulint		id;
+		const char*	name;
+		const char*	data;
+		ulint		data_len;
+
+		/* Extract necessary information from a SYS_ZIP_DICT row */
+		err_msg = dict_process_sys_zip_dict(
+			heap, zip_size, rec, &id, &name, &data, &data_len);
+
+		mtr_commit(&mtr);
+		mutex_exit(&dict_sys->mutex);
+
+		if (!err_msg) {
+			xtradb_i_s_dict_fill_sys_zip_dict(
+				thd, id, name, data, data_len,
+				tables->table);
+		} else {
+			push_warning_printf(thd,
+				Sql_condition::WARN_LEVEL_WARN,
+				ER_CANT_FIND_SYSTEM_REC, "%s", err_msg);
+		}
+
+		mem_heap_empty(heap);
+
+		/* Get the next record */
+		mutex_enter(&dict_sys->mutex);
+		mtr_start(&mtr);
+		rec = dict_getnext_system(&pcur, &mtr);
+	}
+
+	mtr_commit(&mtr);
+	mutex_exit(&dict_sys->mutex);
+	mem_heap_free(heap);
+
+	DBUG_RETURN(0);
+}
+
+static int i_s_xtradb_zip_dict_init(void* p)
+{
+	DBUG_ENTER("i_s_xtradb_zip_dict_init");
+
+	ST_SCHEMA_TABLE* schema = static_cast<ST_SCHEMA_TABLE*>(p);
+
+	schema->fields_info = xtradb_sys_zip_dict_fields_info;
+	schema->fill_table = xtradb_i_s_sys_zip_dict_fill_table;
+
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin	i_s_xtradb_zip_dict =
+{
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+	STRUCT_FLD(info, &i_s_info),
+	STRUCT_FLD(name, "XTRADB_ZIP_DICT"),
+	STRUCT_FLD(author, PLUGIN_AUTHOR),
+	STRUCT_FLD(descr, "InnoDB compression dictionaries information"),
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+	STRUCT_FLD(init, i_s_xtradb_zip_dict_init),
+	STRUCT_FLD(deinit, i_s_common_deinit),
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+	STRUCT_FLD(status_vars, NULL),
+	STRUCT_FLD(system_vars, NULL),
+	STRUCT_FLD(__reserved1, NULL),
+	STRUCT_FLD(flags, 0UL),
+};
+
+enum zip_dict_cols_field_type
+{
+	zip_dict_cols_field_table_id,
+	zip_dict_cols_field_column_pos,
+	zip_dict_cols_field_dict_id
+};
+
+static ST_FIELD_INFO xtradb_sys_zip_dict_cols_fields_info[] =
+{
+	{ STRUCT_FLD(field_name, "table_id"),
+	STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+	STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+	STRUCT_FLD(value, 0),
+	STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+	STRUCT_FLD(old_name, ""),
+	STRUCT_FLD(open_method, SKIP_OPEN_TABLE) },
+
+	{ STRUCT_FLD(field_name, "column_pos"),
+	STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+	STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+	STRUCT_FLD(value, 0),
+	STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+	STRUCT_FLD(old_name, ""),
+	STRUCT_FLD(open_method, SKIP_OPEN_TABLE) },
+
+	{ STRUCT_FLD(field_name, "dict_id"),
+	STRUCT_FLD(field_length, MY_INT64_NUM_DECIMAL_DIGITS),
+	STRUCT_FLD(field_type, MYSQL_TYPE_LONGLONG),
+	STRUCT_FLD(value, 0),
+	STRUCT_FLD(field_flags, MY_I_S_UNSIGNED),
+	STRUCT_FLD(old_name, ""),
+	STRUCT_FLD(open_method, SKIP_OPEN_TABLE) },
+
+	END_OF_ST_FIELD_INFO
+};
+
+/** Function to fill INFORMATION_SCHEMA.XTRADB_ZIP_DICT_COLS with information
+collected by scanning SYS_ZIP_DICT_COLS table.
+@return 0 on success */
+static
+int
+xtradb_i_s_dict_fill_sys_zip_dict_cols(
+	THD*		thd,		/*!< in: thread */
+	ulint		table_id,	/*!< in: table ID */
+	ulint		column_pos,	/*!< in: column position */
+	ulint		dict_id,	/*!< in: dict ID */
+	TABLE*		table_to_fill)	/*!< in/out: fill this table */
+{
+	DBUG_ENTER("xtradb_i_s_dict_fill_sys_zip_dict_cols");
+
+	Field**	fields = table_to_fill->field;
+
+	OK(field_store_ulint(fields[zip_dict_cols_field_table_id],
+		table_id));
+	OK(field_store_ulint(fields[zip_dict_cols_field_column_pos],
+		column_pos));
+	OK(field_store_ulint(fields[zip_dict_cols_field_dict_id],
+		dict_id));
+
+	OK(schema_table_store_record(thd, table_to_fill));
+
+	DBUG_RETURN(0);
+}
+
+/** Function to populate INFORMATION_SCHEMA.XTRADB_ZIP_DICT_COLS table.
+Loop through each record in SYS_ZIP_DICT_COLS, and extract the column
+information and fill the INFORMATION_SCHEMA.XTRADB_ZIP_DICT_COLS table.
+@return 0 on success */
+static
+int
+xtradb_i_s_sys_zip_dict_cols_fill_table(
+	THD*		thd,	/*!< in: thread */
+	TABLE_LIST*	tables,	/*!< in/out: tables to fill */
+	Item*		)	/*!< in: condition (not used) */
+{
+	btr_pcur_t	pcur;
+	const rec_t*	rec;
+	mem_heap_t*	heap;
+	mtr_t		mtr;
+
+	DBUG_ENTER("xtradb_i_s_sys_zip_dict_cols_fill_table");
+	RETURN_IF_INNODB_NOT_STARTED(tables->schema_table_name);
+
+	/* deny access to user without SUPER_ACL privilege */
+	if (check_global_access(thd, SUPER_ACL)) {
+		DBUG_RETURN(0);
+	}
+
+	heap = mem_heap_create(1000);
+	mutex_enter(&dict_sys->mutex);
+	mtr_start(&mtr);
+
+	rec = dict_startscan_system(&pcur, &mtr, SYS_ZIP_DICT_COLS);
+
+	while (rec) {
+		const char*	err_msg;
+		ulint table_id;
+		ulint column_pos;
+		ulint dict_id;
+
+		/* Extract necessary information from a SYS_ZIP_DICT_COLS
+		row */
+		err_msg = dict_process_sys_zip_dict_cols(
+			heap, rec, &table_id, &column_pos, &dict_id);
+
+		mtr_commit(&mtr);
+		mutex_exit(&dict_sys->mutex);
+
+		if (!err_msg) {
+			xtradb_i_s_dict_fill_sys_zip_dict_cols(
+				thd, table_id, column_pos, dict_id,
+				tables->table);
+		} else {
+			push_warning_printf(thd,
+				Sql_condition::WARN_LEVEL_WARN,
+				ER_CANT_FIND_SYSTEM_REC, "%s", err_msg);
+		}
+
+		mem_heap_empty(heap);
+
+		/* Get the next record */
+		mutex_enter(&dict_sys->mutex);
+		mtr_start(&mtr);
+		rec = dict_getnext_system(&pcur, &mtr);
+	}
+
+	mtr_commit(&mtr);
+	mutex_exit(&dict_sys->mutex);
+	mem_heap_free(heap);
+
+	DBUG_RETURN(0);
+}
+
+static int i_s_xtradb_zip_dict_cols_init(void* p)
+{
+	DBUG_ENTER("i_s_xtradb_zip_dict_cols_init");
+
+	ST_SCHEMA_TABLE* schema = static_cast<ST_SCHEMA_TABLE*>(p);
+
+	schema->fields_info = xtradb_sys_zip_dict_cols_fields_info;
+	schema->fill_table = xtradb_i_s_sys_zip_dict_cols_fill_table;
+
+	DBUG_RETURN(0);
+}
+
+UNIV_INTERN struct st_mysql_plugin	i_s_xtradb_zip_dict_cols =
+{
+	STRUCT_FLD(type, MYSQL_INFORMATION_SCHEMA_PLUGIN),
+	STRUCT_FLD(info, &i_s_info),
+	STRUCT_FLD(name, "XTRADB_ZIP_DICT_COLS"),
+	STRUCT_FLD(author, PLUGIN_AUTHOR),
+	STRUCT_FLD(descr, "InnoDB compressed columns information"),
+	STRUCT_FLD(license, PLUGIN_LICENSE_GPL),
+	STRUCT_FLD(init, i_s_xtradb_zip_dict_cols_init),
+	STRUCT_FLD(deinit, i_s_common_deinit),
+	STRUCT_FLD(version, INNODB_VERSION_SHORT),
+	STRUCT_FLD(status_vars, NULL),
+	STRUCT_FLD(system_vars, NULL),
+	STRUCT_FLD(__reserved1, NULL),
+	STRUCT_FLD(flags, 0UL),
+};
+#endif
diff --git a/storage/xtradb/handler/xtradb_i_s.h b/storage/xtradb/handler/xtradb_i_s.h
index 2f7552c565a..905d84587af 100644
--- a/storage/xtradb/handler/xtradb_i_s.h
+++ b/storage/xtradb/handler/xtradb_i_s.h
@@ -22,5 +22,7 @@ this program; if not, write to the Free Software Foundation, Inc.,
 extern struct st_mysql_plugin	i_s_xtradb_read_view;
 extern struct st_mysql_plugin	i_s_xtradb_internal_hash_tables;
 extern struct st_mysql_plugin	i_s_xtradb_rseg;
+extern struct st_mysql_plugin	i_s_xtradb_zip_dict;
+extern struct st_mysql_plugin	i_s_xtradb_zip_dict_cols;
 
 #endif /* XTRADB_I_S_H */
diff --git a/storage/xtradb/ibuf/ibuf0ibuf.cc b/storage/xtradb/ibuf/ibuf0ibuf.cc
index 4a496cb4190..c1d735eecdd 100644
--- a/storage/xtradb/ibuf/ibuf0ibuf.cc
+++ b/storage/xtradb/ibuf/ibuf0ibuf.cc
@@ -956,9 +956,15 @@ ibuf_set_free_bits_low(
 	page_t*	bitmap_page;
 	ulint	space;
 	ulint	page_no;
+	buf_frame_t* frame;
 
-	if (!page_is_leaf(buf_block_get_frame(block))) {
+	if (!block) {
+		return;
+	}
+
+	frame = buf_block_get_frame(block);
 
+	if (!frame || !page_is_leaf(frame)) {
 		return;
 	}
 
@@ -1132,7 +1138,11 @@ ibuf_update_free_bits_zip(
 	page_no = buf_block_get_page_no(block);
 	zip_size = buf_block_get_zip_size(block);
 
-	ut_a(page_is_leaf(buf_block_get_frame(block)));
+	ut_a(block);
+
+	buf_frame_t* frame = buf_block_get_frame(block);
+
+	ut_a(frame && page_is_leaf(frame));
 	ut_a(zip_size);
 
 	bitmap_page = ibuf_bitmap_get_map_page(space, page_no, zip_size, mtr);
diff --git a/storage/xtradb/include/btr0btr.h b/storage/xtradb/include/btr0btr.h
index 5047d1b2d4e..9ab62f7739f 100644
--- a/storage/xtradb/include/btr0btr.h
+++ b/storage/xtradb/include/btr0btr.h
@@ -298,9 +298,17 @@ btr_block_get_func(
 @param idx	index tree, may be NULL if not the insert buffer tree
 @param mtr	mini-transaction handle
 @return the uncompressed page frame */
-# define btr_page_get(space,zip_size,page_no,mode,idx,mtr)		\
-	buf_block_get_frame(btr_block_get(space,zip_size,page_no, \
-			mode,idx,mtr))
+UNIV_INLINE
+page_t*
+btr_page_get(
+/*=========*/
+	ulint		space,
+	ulint		zip_size,
+	ulint		root_page_no,
+	ulint		mode,
+	dict_index_t*	index,
+	mtr_t*		mtr)
+	MY_ATTRIBUTE((warn_unused_result));
 #endif /* !UNIV_HOTBACKUP */
 /**************************************************************//**
 Gets the index id field of a page.
diff --git a/storage/xtradb/include/btr0btr.ic b/storage/xtradb/include/btr0btr.ic
index 8c9c3bead09..62a24873482 100644
--- a/storage/xtradb/include/btr0btr.ic
+++ b/storage/xtradb/include/btr0btr.ic
@@ -60,7 +60,9 @@ btr_block_get_func(
 		NULL, BUF_GET, file, line, mtr, &err);
 
 	if (err == DB_DECRYPTION_FAILED) {
-		index->table->is_encrypted = true;
+		if (index && index->table) {
+			index->table->is_encrypted = true;
+		}
 	}
 
 	if (block) {
@@ -96,6 +98,38 @@ btr_page_set_index_id(
 		mlog_write_ull(page + (PAGE_HEADER + PAGE_INDEX_ID), id, mtr);
 	}
 }
+
+/** Gets a buffer page and declares its latching order level.
+@param space	tablespace identifier
+@param zip_size	compressed page size in bytes or 0 for uncompressed pages
+@param root_page_no	page number
+@param mode	latch mode
+@param index	index tree, may be NULL if not the insert buffer tree
+@param mtr	mini-transaction handle
+@return the uncompressed page frame, or NULL if no block was obtained */
+UNIV_INLINE
+page_t*
+btr_page_get(
+/*=========*/
+	ulint		space,
+	ulint		zip_size,
+	ulint		root_page_no,
+	ulint		mode,
+	dict_index_t*	index,
+	mtr_t*		mtr)
+{
+	buf_block_t* block=NULL;
+	buf_frame_t* frame=NULL;
+
+	block = btr_block_get(space, zip_size, root_page_no, mode, index, mtr);
+
+	if (block) {
+		frame = buf_block_get_frame(block);
+	}
+
+	return ((page_t*)frame);
+}
+
 #endif /* !UNIV_HOTBACKUP */
 
 /**************************************************************//**
diff --git a/storage/xtradb/include/buf0buf.h b/storage/xtradb/include/buf0buf.h
index f599997be02..6924481af49 100644
--- a/storage/xtradb/include/buf0buf.h
+++ b/storage/xtradb/include/buf0buf.h
@@ -1084,10 +1084,20 @@ buf_block_get_frame(
 /*================*/
 	const buf_block_t*	block)	/*!< in: pointer to the control block */
 	MY_ATTRIBUTE((pure));
-# define buf_block_get_frame_fast(block) buf_block_get_frame(block)
+
+/*********************************************************************//**
+Gets a pointer to the memory frame of a block, where block is known not to be
+NULL.
+@return	pointer to the frame */
+UNIV_INLINE
+buf_frame_t*
+buf_nonnull_block_get_frame(
+	const buf_block_t*	block)	/*!< in: pointer to the control block */
+	MY_ATTRIBUTE((pure));
+
 #else /* UNIV_DEBUG */
 # define buf_block_get_frame(block) (block ? (block)->frame : 0)
-# define buf_block_get_frame_fast(block) (block)->frame
+# define buf_nonnull_block_get_frame(block) ((block)->frame)
 #endif /* UNIV_DEBUG */
 /*********************************************************************//**
 Gets the space id of a block.
diff --git a/storage/xtradb/include/buf0buf.ic b/storage/xtradb/include/buf0buf.ic
index 7b1c66f2a05..20721b28ef2 100644
--- a/storage/xtradb/include/buf0buf.ic
+++ b/storage/xtradb/include/buf0buf.ic
@@ -744,6 +744,19 @@ buf_block_get_frame(
 
 	SRV_CORRUPT_TABLE_CHECK(block, return(0););
 
+	return(buf_nonnull_block_get_frame(block));
+}
+
+/*********************************************************************//**
+Gets a pointer to the memory frame of a block, where block is known not to be
+NULL.
+@return	pointer to the frame */
+UNIV_INLINE
+buf_frame_t*
+buf_nonnull_block_get_frame(
+/*========================*/
+	const buf_block_t*	block)	/*!< in: pointer to the control block */
+{
 	switch (buf_block_get_state(block)) {
 	case BUF_BLOCK_POOL_WATCH:
 	case BUF_BLOCK_ZIP_PAGE:
@@ -768,6 +781,7 @@ buf_block_get_frame(
 ok:
 	return((buf_frame_t*) block->frame);
 }
+
 #endif /* UNIV_DEBUG */
 
 /*********************************************************************//**
diff --git a/storage/xtradb/include/data0type.h b/storage/xtradb/include/data0type.h
index 111664b0b52..f269c266efb 100644
--- a/storage/xtradb/include/data0type.h
+++ b/storage/xtradb/include/data0type.h
@@ -170,6 +170,9 @@ be less than 256 */
 				type when the column is true VARCHAR where
 				MySQL uses 2 bytes to store the data len;
 				for shorter VARCHARs MySQL uses only 1 byte */
+#define	DATA_COMPRESSED	16384	/* this is ORed to the precise data
+				type when the column has COLUMN_FORMAT =
+				COMPRESSED attribute*/
 /*-------------------------------------------*/
 
 /* This many bytes we need to store the type information affecting the
@@ -500,6 +503,17 @@ dtype_print(
 /*========*/
 	const dtype_t*	type);	/*!< in: type */
 
+/**
+Calculates the number of extra bytes needed for compression header
+depending on precise column type.
+@retval 0 if prtype does not include DATA_COMPRESSED flag
+@retval ZIP_COLUMN_HEADER_LENGTH if prtype includes DATA_COMPRESSED flag
+*/
+UNIV_INLINE
+ulint
+prtype_get_compression_extra(
+	ulint		prtype);	/*!< in: precise type */
+
 /* Structure for an SQL data type.
 If you add fields to this structure, be sure to initialize them everywhere.
 This structure is initialized in the following functions:
diff --git a/storage/xtradb/include/data0type.ic b/storage/xtradb/include/data0type.ic
index d489bef89a8..29dc480a19c 100644
--- a/storage/xtradb/include/data0type.ic
+++ b/storage/xtradb/include/data0type.ic
@@ -26,6 +26,7 @@ Created 1/16/1996 Heikki Tuuri
 #include <string.h> /* strlen() */
 
 #include "mach0data.h"
+#include "rem0types.h" /* ZIP_COLUMN_HEADER_LENGTH */
 #ifndef UNIV_HOTBACKUP
 # include "ha_prototypes.h"
 
@@ -709,3 +710,18 @@ dtype_get_sql_null_size(
 					0, 0));
 #endif /* !UNIV_HOTBACKUP */
 }
+
+/**
+Calculates the number of extra bytes needed for compression header
+depending on precise column type.
+@retval 0 if prtype does not include DATA_COMPRESSED flag
+@retval ZIP_COLUMN_HEADER_LENGTH if prtype includes DATA_COMPRESSED flag
+*/
+UNIV_INLINE
+ulint
+prtype_get_compression_extra(
+	ulint		prtype)	/*!< in: precise type */
+{
+	return (prtype & DATA_COMPRESSED) != 0 ?
+		ZIP_COLUMN_HEADER_LENGTH : 0;
+}
diff --git a/storage/xtradb/include/dict0boot.h b/storage/xtradb/include/dict0boot.h
index 477e1150f43..d5bee886cbf 100644
--- a/storage/xtradb/include/dict0boot.h
+++ b/storage/xtradb/include/dict0boot.h
@@ -324,6 +324,38 @@ enum dict_fld_sys_datafiles_enum {
 	DICT_FLD__SYS_DATAFILES__PATH			= 3,
 	DICT_NUM_FIELDS__SYS_DATAFILES			= 4
 };
+/* The columns in SYS_ZIP_DICT */
+enum dict_col_sys_zip_dict_enum {
+	DICT_COL__SYS_ZIP_DICT__ID			= 0,
+	DICT_COL__SYS_ZIP_DICT__NAME			= 1,
+	DICT_COL__SYS_ZIP_DICT__DATA			= 2,
+	DICT_NUM_COLS__SYS_ZIP_DICT			= 3
+};
+/* The field numbers in the SYS_ZIP_DICT clustered index */
+enum dict_fld_sys_zip_dict_enum {
+	DICT_FLD__SYS_ZIP_DICT__ID			= 0,
+	DICT_FLD__SYS_ZIP_DICT__DB_TRX_ID		= 1,
+	DICT_FLD__SYS_ZIP_DICT__DB_ROLL_PTR		= 2,
+	DICT_FLD__SYS_ZIP_DICT__NAME			= 3,
+	DICT_FLD__SYS_ZIP_DICT__DATA			= 4,
+	DICT_NUM_FIELDS__SYS_ZIP_DICT			= 5
+};
+/* The columns in SYS_ZIP_DICT_COLS */
+enum dict_col_sys_zip_dict_cols_enum {
+	DICT_COL__SYS_ZIP_DICT_COLS__TABLE_ID		= 0,
+	DICT_COL__SYS_ZIP_DICT_COLS__COLUMN_POS		= 1,
+	DICT_COL__SYS_ZIP_DICT_COLS__DICT_ID		= 2,
+	DICT_NUM_COLS__SYS_ZIP_DICT_COLS		= 3
+};
+/* The field numbers in the SYS_ZIP_DICT_COLS clustered index */
+enum dict_fld_sys_zip_dict_cols_enum {
+	DICT_FLD__SYS_ZIP_DICT_COLS__TABLE_ID		= 0,
+	DICT_FLD__SYS_ZIP_DICT_COLS__COLUMN_POS		= 1,
+	DICT_FLD__SYS_ZIP_DICT_COLS__DB_TRX_ID		= 2,
+	DICT_FLD__SYS_ZIP_DICT_COLS__DB_ROLL_PTR	= 3,
+	DICT_FLD__SYS_ZIP_DICT_COLS__DICT_ID		= 4,
+	DICT_NUM_FIELDS__SYS_ZIP_DICT_COLS		= 5
+};
 
 /* A number of the columns above occur in multiple tables.  These are the
 length of thos fields. */
diff --git a/storage/xtradb/include/dict0crea.h b/storage/xtradb/include/dict0crea.h
index 77627c9bf67..6ea71ada83e 100644
--- a/storage/xtradb/include/dict0crea.h
+++ b/storage/xtradb/include/dict0crea.h
@@ -166,6 +166,19 @@ UNIV_INTERN
 dberr_t
 dict_create_or_check_sys_tablespace(void);
 /*=====================================*/
+
+#define ZIP_DICT_MAX_NAME_LENGTH 64
+/* Max window size (2^15) minus 262 */
+#define ZIP_DICT_MAX_DATA_LENGTH 32506
+
+/** Creates the zip_dict system table inside InnoDB
+at server bootstrap or server start if it is not found or is
+not of the right form.
+@return	DB_SUCCESS or error code */
+UNIV_INTERN
+dberr_t
+dict_create_or_check_sys_zip_dict(void);
+
 /********************************************************************//**
 Add a single tablespace definition to the data dictionary tables in the
 database.
@@ -181,6 +194,84 @@ dict_create_add_tablespace_to_dictionary(
 	trx_t*		trx,		/*!< in: transaction */
 	bool		commit);	/*!< in: if true then commit the
 					transaction */
+
+/** Add a single compression dictionary definition to the SYS_ZIP_DICT
+InnoDB system table.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+dict_create_add_zip_dict(
+	const char*	name,		/*!< in: dict name */
+	ulint		name_len,	/*!< in: dict name length */
+	const char*	data,		/*!< in: dict data */
+	ulint		data_len,	/*!< in: dict data length */
+	trx_t*		trx);		/*!< in/out: transaction */
+
+/** Add a single compression dictionary reference to the SYS_ZIP_DICT_COLS
+InnoDB system table.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+dict_create_add_zip_dict_reference(
+	ulint		table_id,	/*!< in: table id */
+	ulint		column_pos,	/*!< in: column position */
+	ulint		dict_id,	/*!< in: dict id */
+	trx_t*		trx);		/*!< in/out: transaction */
+
+/** Get a single compression dictionary id for the given
+(table id, column pos) pair.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+dict_create_get_zip_dict_id_by_reference(
+	ulint	table_id,	/*!< in: table id */
+	ulint	column_pos,	/*!< in: column position */
+	ulint*	dict_id,	/*!< out: dict id */
+	trx_t*	trx);		/*!< in/out: transaction */
+
+/** Get compression dictionary id for the given name.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+dict_create_get_zip_dict_id_by_name(
+	const char*	dict_name,	/*!< in: dict name */
+	ulint		dict_name_len,	/*!< in: dict name length */
+	ulint*		dict_id,	/*!< out: dict id */
+	trx_t*		trx);		/*!< in/out: transaction */
+
+/** Get compression dictionary info (name and data) for the given id.
+Allocates memory for name and data on success.
+Must be freed with mem_free().
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+dict_create_get_zip_dict_info_by_id(
+	ulint	dict_id,	/*!< in: dict id */
+	char**	name,		/*!< out: dict name */
+	ulint*	name_len,	/*!< out: dict name length */
+	char**	data,		/*!< out: dict data */
+	ulint*	data_len,	/*!< out: dict data length */
+	trx_t*	trx);		/*!< in/out: transaction */
+
+/** Remove a single compression dictionary from the data dictionary
+tables in the database.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+dict_create_remove_zip_dict(
+	const char*	name,		/*!< in: dict name */
+	ulint		name_len,	/*!< in: dict name length */
+	trx_t*		trx);		/*!< in/out: transaction */
+
+/** Remove all compression dictionary references for the given table ID from
+the data dictionary tables in the database.
+@return	error code or DB_SUCCESS */
+UNIV_INTERN
+dberr_t
+dict_create_remove_zip_dict_references_for_table(
+	ulint	table_id,	/*!< in: table id */
+	trx_t*	trx);		/*!< in/out: transaction */
+
 /********************************************************************//**
 Add a foreign key definition to the data dictionary tables.
 @return	error code or DB_SUCCESS */
diff --git a/storage/xtradb/include/dict0dict.h b/storage/xtradb/include/dict0dict.h
index e843d63e717..5aab595302a 100644
--- a/storage/xtradb/include/dict0dict.h
+++ b/storage/xtradb/include/dict0dict.h
@@ -1910,6 +1910,52 @@ dict_table_set_corrupt_by_space(
 	ulint	space_id,
 	ibool	need_mutex);
 
+/** Insert a record into SYS_ZIP_DICT.
+@retval	DB_SUCCESS	if OK
+@retval	dberr_t		if the insert failed */
+UNIV_INTERN
+dberr_t
+dict_create_zip_dict(
+	const char*	name,		/*!< in: zip_dict name */
+	ulint		name_len,	/*!< in: zip_dict name length*/
+	const char*	data,		/*!< in: zip_dict data */
+	ulint		data_len);	/*!< in: zip_dict data length */
+
+/** Get single compression dictionary id for the given
+(table id, column pos) pair.
+@retval	DB_SUCCESS		if OK
+@retval	DB_RECORD_NOT_FOUND	if not found */
+UNIV_INTERN
+dberr_t
+dict_get_dictionary_id_by_key(
+	ulint	table_id,	/*!< in: table id */
+	ulint	column_pos,	/*!< in: column position */
+	ulint*	dict_id);	/*!< out: zip_dict id */
+
+/** Get compression dictionary info (name and data) for the given id.
+Allocates memory for name and data on success.
+Must be freed with mem_free().
+@retval	DB_SUCCESS		if OK
+@retval	DB_RECORD_NOT_FOUND	if not found */
+UNIV_INTERN
+dberr_t
+dict_get_dictionary_info_by_id(
+	ulint	dict_id,	/*!< in: dict id */
+	char**	name,		/*!< out: dictionary name */
+	ulint*	name_len,	/*!< out: dictionary name length*/
+	char**	data,		/*!< out: dictionary data */
+	ulint*	data_len);	/*!< out: dictionary data length*/
+
+/** Delete a record in SYS_ZIP_DICT with the given name.
+@retval	DB_SUCCESS		if OK
+@retval	DB_RECORD_NOT_FOUND	if not found
+@retval	DB_ROW_IS_REFERENCED	if in use */
+UNIV_INTERN
+dberr_t
+dict_drop_zip_dict(
+	const char*	name,		/*!< in: zip_dict name */
+	ulint		name_len);	/*!< in: zip_dict name length*/
+
 #ifndef UNIV_NONINL
 #include "dict0dict.ic"
 #endif
diff --git a/storage/xtradb/include/dict0load.h b/storage/xtradb/include/dict0load.h
index dcbc3de8e94..85e3e565637 100644
--- a/storage/xtradb/include/dict0load.h
+++ b/storage/xtradb/include/dict0load.h
@@ -44,6 +44,8 @@ enum dict_system_id_t {
 	SYS_FOREIGN_COLS,
 	SYS_TABLESPACES,
 	SYS_DATAFILES,
+	SYS_ZIP_DICT,
+	SYS_ZIP_DICT_COLS,
 
 	/* This must be last item. Defines the number of system tables. */
 	SYS_NUM_SYSTEM_TABLES
@@ -386,6 +388,33 @@ dict_process_sys_datafiles(
 	const rec_t*	rec,		/*!< in: current SYS_DATAFILES rec */
 	ulint*		space,		/*!< out: pace id */
 	const char**	path);		/*!< out: datafile path */
+
+/** This function parses a SYS_ZIP_DICT record, extracts necessary
+information from the record and returns to caller.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_process_sys_zip_dict(
+	mem_heap_t*	heap,		/*!< in/out: heap memory */
+	ulint		zip_size,	/*!< in: nonzero=compressed BLOB page size */
+	const rec_t*	rec,		/*!< in: current SYS_ZIP_DICT rec */
+	ulint*		id,		/*!< out: dict id */
+	const char**	name,		/*!< out: dict name */
+	const char**	data,		/*!< out: dict data */
+	ulint*		data_len);	/*!< out: dict data length */
+
+/** This function parses a SYS_ZIP_DICT_COLS record, extracts necessary
+information from the record and returns to caller.
+@return error message, or NULL on success */
+UNIV_INTERN
+const char*
+dict_process_sys_zip_dict_cols(
+	mem_heap_t*	heap,		/*!< in/out: heap memory */
+	const rec_t*	rec,		/*!< in: current SYS_ZIP_DICT_COLS rec */
+	ulint*		table_id,	/*!< out: table id */
+	ulint*		column_pos,	/*!< out: column position */
+	ulint*		dict_id);	/*!< out: dict id */
+
 /********************************************************************//**
 Get the filepath for a spaceid from SYS_DATAFILES. This function provides
 a temporary heap which is used for the table lookup, but not for the path.
diff --git a/storage/xtradb/include/dict0mem.h b/storage/xtradb/include/dict0mem.h
index 0ee1c7692fd..96c85cd8a99 100644
--- a/storage/xtradb/include/dict0mem.h
+++ b/storage/xtradb/include/dict0mem.h
@@ -1046,6 +1046,8 @@ struct dict_table_t{
 	mem_heap_t*	heap;	/*!< memory heap */
 	char*		name;	/*!< table name */
 	void*		thd;		/*!< thd */
+	bool		page_0_read; /*!< true if page 0 has
+				     been already read */
 	fil_space_crypt_t *crypt_data; /*!< crypt data if present */
 	const char*	dir_path_of_temp_table;/*!< NULL or the directory path
 				where a TEMPORARY table that was explicitly
diff --git a/storage/xtradb/include/fil0crypt.h b/storage/xtradb/include/fil0crypt.h
index 5deed1f001c..8bb0ce65a6b 100644
--- a/storage/xtradb/include/fil0crypt.h
+++ b/storage/xtradb/include/fil0crypt.h
@@ -75,6 +75,21 @@ struct key_struct
                                                 (that is L in CRYPT_SCHEME_1) */
 };
 
+/** is encryption enabled */
+extern ulong	srv_encrypt_tables;
+
+#ifdef UNIV_PFS_MUTEX
+extern mysql_pfs_key_t fil_crypt_data_mutex_key;
+#endif
+
+/** Mutex helper for crypt_data->scheme
+@param[in, out]	scheme	encryption scheme
+@param[in]	exit	should we exit or enter mutex ? */
+void
+crypt_data_scheme_locker(
+	st_encryption_scheme*	scheme,
+	int			exit);
+
 struct fil_space_rotate_state_t
 {
 	time_t start_time;	/*!< time when rotation started */
@@ -96,13 +111,110 @@ struct fil_space_rotate_state_t
 
 struct fil_space_crypt_struct : st_encryption_scheme
 {
+ public:
+	/** Constructor. Does not initialize the members!
+	The object is expected to be placed in a buffer that
+	has been zero-initialized. */
+	fil_space_crypt_struct(
+		ulint new_type,
+		uint new_min_key_version,
+		uint new_key_id,
+		ulint offset,
+		fil_encryption_t new_encryption)
+		: st_encryption_scheme(),
+		min_key_version(new_min_key_version),
+		page0_offset(offset),
+		encryption(new_encryption),
+		closing(false),
+		key_found(),
+		rotate_state()
+	{
+		key_found = new_min_key_version;
+		key_id = new_key_id;
+		my_random_bytes(iv, sizeof(iv));
+		mutex_create(fil_crypt_data_mutex_key,
+			&mutex, SYNC_NO_ORDER_CHECK);
+		locker = crypt_data_scheme_locker;
+		type = new_type;
+
+		if (new_encryption == FIL_SPACE_ENCRYPTION_OFF ||
+			(!srv_encrypt_tables &&
+			 new_encryption == FIL_SPACE_ENCRYPTION_DEFAULT)) {
+			type = CRYPT_SCHEME_UNENCRYPTED;
+		} else {
+			type = CRYPT_SCHEME_1;
+			min_key_version = key_get_latest_version();
+		}
+	}
+
+	/** Destructor */
+	~fil_space_crypt_struct()
+	{
+		closing = true;
+		mutex_free(&mutex);
+	}
+
+	/** Get latest key version from encryption plugin
+	@retval key_version or
+	@retval ENCRYPTION_KEY_VERSION_INVALID if used key_id
+	is not found from encryption plugin. */
+	uint key_get_latest_version(void);
+
+	/** Returns true if key was found from encryption plugin
+	and false if not. */
+	bool is_key_found() const {
+		return key_found != ENCRYPTION_KEY_VERSION_INVALID;
+	}
+
+	/** Returns true if tablespace should be encrypted */
+	bool should_encrypt() const {
+		return ((encryption == FIL_SPACE_ENCRYPTION_ON) ||
+			(srv_encrypt_tables &&
+				encryption == FIL_SPACE_ENCRYPTION_DEFAULT));
+	}
+
+	/** Return true if tablespace is encrypted. */
+	bool is_encrypted() const {
+		return (encryption != FIL_SPACE_ENCRYPTION_OFF);
+	}
+
+	/** Return true if default tablespace encryption is used. */
+	bool is_default_encryption() const {
+		return (encryption == FIL_SPACE_ENCRYPTION_DEFAULT);
+	}
+
+	/** Return true if tablespace is not encrypted. */
+	bool not_encrypted() const {
+		return (encryption == FIL_SPACE_ENCRYPTION_OFF);
+	}
+
+	/** Is this tablespace closing. */
+	bool is_closing(bool is_fixed) {
+		bool closed;
+		if (!is_fixed) {
+			mutex_enter(&mutex);
+		}
+		closed = closing;
+		if (!is_fixed) {
+			mutex_exit(&mutex);
+		}
+		return closed;
+	}
+
 	uint min_key_version; // min key version for this space
 	ulint page0_offset;   // byte offset on page 0 for crypt data
 	fil_encryption_t encryption; // Encryption setup
 
 	ib_mutex_t mutex;   // mutex protecting following variables
 	bool closing;	    // is tablespace being closed
-	bool inited;
+
+	/** Return code from encryption_key_get_latest_version.
+	If ENCRYPTION_KEY_VERSION_INVALID, the encryption plugin
+	could not find the key and there is no need to call
+	key_get_latest_version again as keys are read only
+	at startup. */
+	uint key_found;
+
 	fil_space_rotate_state_t rotate_state;
 };
 
@@ -316,7 +428,8 @@ UNIV_INTERN
 void
 fil_space_crypt_mark_space_closing(
 /*===============================*/
-	ulint space);          /*!< in: tablespace id */
+	ulint			space,		/*!< in: tablespace id */
+	fil_space_crypt_t*	crypt_data);	/*!< in: crypt_data or NULL */
 
 /*********************************************************************
 Wait for crypt threads to stop accessing space */
@@ -423,7 +536,6 @@ fil_crypt_calculate_checksum(
 	ulint	zip_size,	/*!< in: zip_size or 0 */
 	byte*	dst_frame);	/*!< in: page where to calculate */
 
-
 #ifndef UNIV_NONINL
 #include "fil0crypt.ic"
 #endif
diff --git a/storage/xtradb/include/fil0fil.h b/storage/xtradb/include/fil0fil.h
index 20e8303f764..95011ae6125 100644
--- a/storage/xtradb/include/fil0fil.h
+++ b/storage/xtradb/include/fil0fil.h
@@ -321,13 +321,21 @@ struct fil_space_t {
 				/*!< true if this space is currently in
 				unflushed_spaces */
 	ibool		is_corrupt;
+				/*!< true if tablespace corrupted */
 	bool		printed_compression_failure;
 				/*!< true if we have already printed
 				compression failure */
+	fil_space_crypt_t* crypt_data;
+				/*!< tablespace crypt data or NULL */
+	bool		page_0_crypt_read;
+				/*!< tablespace crypt data has been
+				read */
+	ulint		file_block_size;
+				/*!< file system block size */
+
 	UT_LIST_NODE_T(fil_space_t) space_list;
 				/*!< list of all spaces */
-        fil_space_crypt_t* crypt_data;
-	ulint		file_block_size;/*!< file system block size */
+
 	ulint		magic_n;/*!< FIL_SPACE_MAGIC_N */
 };
 
@@ -471,7 +479,8 @@ fil_space_create(
 	ulint		zip_size,/*!< in: compressed page size, or
 				0 for uncompressed tablespaces */
 	ulint		purpose, /*!< in: FIL_TABLESPACE, or FIL_LOG if log */
-	fil_space_crypt_t* crypt_data); /*!< in: crypt data */
+	fil_space_crypt_t* crypt_data, /*!< in: crypt data */
+	bool		create_table); /*!< in: true if create table */
 
 /*******************************************************************//**
 Assigns a new space id for a new single-table tablespace. This works simply by
diff --git a/storage/xtradb/include/fil0fil.ic b/storage/xtradb/include/fil0fil.ic
index 7ccc69b9561..23614a6567a 100644
--- a/storage/xtradb/include/fil0fil.ic
+++ b/storage/xtradb/include/fil0fil.ic
@@ -131,6 +131,7 @@ fil_page_type_validate(
 		page_type == FIL_PAGE_TYPE_XDES ||
 		page_type == FIL_PAGE_TYPE_BLOB ||
 		page_type == FIL_PAGE_TYPE_ZBLOB ||
+		page_type == FIL_PAGE_TYPE_ZBLOB2 ||
 		page_type == FIL_PAGE_TYPE_COMPRESSED))) {
 
 		uint key_version = mach_read_from_4(page + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
@@ -166,6 +167,7 @@ fil_page_type_validate(
 			page_type == FIL_PAGE_TYPE_XDES ||
 			page_type == FIL_PAGE_TYPE_BLOB ||
 			page_type == FIL_PAGE_TYPE_ZBLOB ||
+			page_type == FIL_PAGE_TYPE_ZBLOB2 ||
 			page_type == FIL_PAGE_TYPE_COMPRESSED);
 		return false;
 	}
diff --git a/storage/xtradb/include/fts0fts.h b/storage/xtradb/include/fts0fts.h
index 1b9b1250e9a..7aa7055640c 100644
--- a/storage/xtradb/include/fts0fts.h
+++ b/storage/xtradb/include/fts0fts.h
@@ -376,6 +376,7 @@ extern bool		fts_need_sync;
 /** Variable specifying the table that has Fulltext index to display its
 content through information schema table */
 extern char*		fts_internal_tbl_name;
+extern char*		fts_internal_tbl_name2;
 
 #define	fts_que_graph_free(graph)			\
 do {							\
@@ -824,6 +825,15 @@ void
 fts_drop_orphaned_tables(void);
 /*==========================*/
 
+/** Get the parent table name if the given table is an FTS auxiliary table.
+@param[in]	aux_table_name	aux table name
+@param[in]	aux_table_len	aux table name length
+@return parent table name, or NULL */
+char*
+fts_get_parent_table_name(
+	const char*	aux_table_name,
+	ulint		aux_table_len);
+
 /******************************************************************//**
 Since we do a horizontal split on the index table, we need to drop
 all the split tables.
@@ -841,13 +851,15 @@ FTS auxiliary INDEX table and clear the cache at the end.
 @param[in,out]	table		fts table
 @param[in]	unlock_cache	whether unlock cache when write node
 @param[in]	wait		whether wait for existing sync to finish
+@param[in]      has_dict        whether has dict operation lock
 @return DB_SUCCESS on success, error code on failure. */
 UNIV_INTERN
 dberr_t
 fts_sync_table(
 	dict_table_t*	table,
 	bool		unlock_cache,
-	bool		wait);
+	bool		wait,
+	bool		has_dict);
 
 /****************************************************************//**
 Free the query graph but check whether dict_sys->mutex is already
diff --git a/storage/xtradb/include/lock0lock.h b/storage/xtradb/include/lock0lock.h
index b6100d470cc..a12ca1d85e6 100644
--- a/storage/xtradb/include/lock0lock.h
+++ b/storage/xtradb/include/lock0lock.h
@@ -45,6 +45,15 @@ Created 5/7/1996 Heikki Tuuri
 extern ibool	lock_print_waits;
 #endif /* UNIV_DEBUG */
 
+/** Possible values of innodb_lock_schedule_algorithm, the setting that
+	selects how record lock requests are queued and granted. */
+enum innodb_lock_schedule_algorithm_t {
+	INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS,		/*!< First Come First Served */
+	INNODB_LOCK_SCHEDULE_ALGORITHM_VATS			/*!< Variance-Aware-Transaction-Scheduling */
+};
+
+extern ulong innodb_lock_schedule_algorithm;
+
 extern ulint	srv_n_lock_deadlock_count;
 
 /*********************************************************************//**
diff --git a/storage/xtradb/include/os0thread.h b/storage/xtradb/include/os0thread.h
index 815faf97319..671b9b7dc3f 100644
--- a/storage/xtradb/include/os0thread.h
+++ b/storage/xtradb/include/os0thread.h
@@ -131,14 +131,27 @@ os_thread_create_func(
 	os_thread_id_t*		thread_id);	/*!< out: id of the created
 						thread, or NULL */
 
+/**
+Waits until the specified thread completes and joins it. Its return value is
+ignored.
+
+@param	thread	thread to join */
+UNIV_INTERN
+void
+os_thread_join(
+	os_thread_t	thread);
+
 /*****************************************************************//**
 Exits the current thread. */
 UNIV_INTERN
 void
 os_thread_exit(
 /*===========*/
-	void*	exit_value)	/*!< in: exit value; in Windows this void*
+	void*	exit_value,	/*!< in: exit value; in Windows this void*
 				is cast as a DWORD */
+	bool	detach = true)	/*!< in: if true, the thread will be detached
+				right before exiting. If false, another thread
+				is responsible for joining this thread. */
 	UNIV_COLD MY_ATTRIBUTE((noreturn));
 /*****************************************************************//**
 Returns the thread identifier of current thread.
diff --git a/storage/xtradb/include/rem0types.h b/storage/xtradb/include/rem0types.h
index f8133f77466..5da96066f88 100644
--- a/storage/xtradb/include/rem0types.h
+++ b/storage/xtradb/include/rem0types.h
@@ -71,4 +71,7 @@ enum rec_format_enum {
 };
 typedef enum rec_format_enum rec_format_t;
 
+/** Compressed field header size in bytes */
+#define ZIP_COLUMN_HEADER_LENGTH	2
+
 #endif
diff --git a/storage/xtradb/include/row0mysql.h b/storage/xtradb/include/row0mysql.h
index 52e82da668d..b818791cf72 100644
--- a/storage/xtradb/include/row0mysql.h
+++ b/storage/xtradb/include/row0mysql.h
@@ -42,6 +42,9 @@ struct SysIndexCallback;
 
 extern ibool row_rollback_on_timeout;
 
+extern uint	srv_compressed_columns_zip_level;
+extern ulong	srv_compressed_columns_threshold;
+
 struct row_prebuilt_t;
 
 /*******************************************************************//**
@@ -52,6 +55,49 @@ row_mysql_prebuilt_free_blob_heap(
 /*==============================*/
 	row_prebuilt_t*	prebuilt);	/*!< in: prebuilt struct of a
 					ha_innobase:: table handle */
+
+/** Frees the compress heap in prebuilt when no longer needed. */
+UNIV_INTERN
+void
+row_mysql_prebuilt_free_compress_heap(
+	row_prebuilt_t*	prebuilt);	/*!< in: prebuilt struct of a
+					ha_innobase:: table handle */
+
+/** Uncompress blob/text/varchar column using zlib
+@return pointer to the uncompressed data */
+const byte*
+row_decompress_column(
+	const byte*	data,	/*!< in: data in innodb(compressed) format */
+	ulint		*len,	/*!< in: data length; out: length of
+				decompressed data*/
+	const byte*	dict_data,
+				/*!< in: optional dictionary data used for
+				decompression */
+	ulint		dict_data_len,
+				/*!< in: optional dictionary data length */
+	row_prebuilt_t*	prebuilt);
+				/*!< in: use prebuilt->compress_heap only
+				here*/
+
+/** Compress blob/text/varchar column using zlib
+@return pointer to the compressed data */
+byte*
+row_compress_column(
+	const byte*	data,	/*!< in: data in mysql(uncompressed)
+				format */
+	ulint		*len,	/*!< in: data length; out: length of
+				compressed data*/
+	ulint		lenlen,	/*!< in: bytes used to store the length of
+				data */
+	const byte*	dict_data,
+				/*!< in: optional dictionary data used for
+				compression */
+	ulint		dict_data_len,
+				/*!< in: optional dictionary data length */
+	row_prebuilt_t*	prebuilt);
+				/*!< in: use prebuilt->compress_heap only
+				here*/
+
 /*******************************************************************//**
 Stores a >= 5.0.3 format true VARCHAR length to dest, in the MySQL row
 format.
@@ -90,10 +136,21 @@ row_mysql_store_blob_ref(
 				to 4 bytes */
 	const void*	data,	/*!< in: BLOB data; if the value to store
 				is SQL NULL this should be NULL pointer */
-	ulint		len);	/*!< in: BLOB length; if the value to store
+	ulint		len,	/*!< in: BLOB length; if the value to store
 				is SQL NULL this should be 0; remember
 				also to set the NULL bit in the MySQL record
 				header! */
+	bool		need_decompression,
+				/*!< in: if the data need to be decompressed */
+	const byte*	dict_data,
+				/*!< in: optional compression dictionary
+				data */
+	ulint		dict_data_len,
+				/*!< in: optional compression dictionary data
+				length */
+	row_prebuilt_t*	prebuilt);
+				/*!< in: use prebuilt->compress_heap only
+				here */
 /*******************************************************************//**
 Reads a reference to a BLOB in the MySQL format.
 @return	pointer to BLOB data */
@@ -104,8 +161,17 @@ row_mysql_read_blob_ref(
 	ulint*		len,		/*!< out: BLOB length */
 	const byte*	ref,		/*!< in: BLOB reference in the
 					MySQL format */
-	ulint		col_len);	/*!< in: BLOB reference length
+	ulint		col_len,	/*!< in: BLOB reference length
 					(not BLOB length) */
+	bool		need_compression,
+					/*!< in: if the data need to be
+					compressed*/
+	const byte*	dict_data,	/*!< in: optional compression
+					dictionary data */
+	ulint		dict_data_len,	/*!< in: optional compression
+					dictionary data length */
+	row_prebuilt_t*	prebuilt);	/*!< in: use prebuilt->compress_heap
+					only here */
 /**************************************************************//**
 Pad a column with spaces. */
 UNIV_INTERN
@@ -153,7 +219,16 @@ row_mysql_store_col_in_innobase_format(
 					necessarily the length of the actual
 					payload data; if the column is a true
 					VARCHAR then this is irrelevant */
-	ulint		comp);		/*!< in: nonzero=compact format */
+	ulint		comp,		/*!< in: nonzero=compact format */
+	bool		need_compression,
+					/*!< in: if the data need to be
+					compressed */
+	const byte*	dict_data,	/*!< in: optional compression
+					dictionary data */
+	ulint		dict_data_len,	/*!< in: optional compression
+					dictionary data length */
+	row_prebuilt_t*	prebuilt);	/*!< in: use prebuilt->compress_heap
+					only here */
 /****************************************************************//**
 Handles user errors and lock waits detected by the database engine.
 @return true if it was a lock wait and we should continue running the
@@ -655,6 +730,8 @@ struct mysql_row_templ_t {
 	ulint	is_unsigned;		/*!< if a column type is an integer
 					type and this field is != 0, then
 					it is an unsigned integer type */
+	bool		compressed;	/*!< if column format is compressed */
+	LEX_CSTRING	zip_dict_data;	/*!< associated compression dictionary */
 };
 
 #define MYSQL_FETCH_CACHE_SIZE		8
@@ -854,6 +931,8 @@ struct row_prebuilt_t {
 					in fetch_cache */
 	mem_heap_t*	blob_heap;	/*!< in SELECTS BLOB fields are copied
 					to this heap */
+	mem_heap_t*	compress_heap;  /*!< memory heap used to compress
+					/decompress blob column*/
 	mem_heap_t*	old_vers_heap;	/*!< memory heap where a previous
 					version is built in consistent read */
 	bool		in_fts_query;	/*!< Whether we are in a FTS query */
diff --git a/storage/xtradb/include/srv0mon.h b/storage/xtradb/include/srv0mon.h
index 33ae7749ca5..3b030d56d29 100644
--- a/storage/xtradb/include/srv0mon.h
+++ b/storage/xtradb/include/srv0mon.h
@@ -167,6 +167,7 @@ enum monitor_id_t {
 	MONITOR_OVLD_INDEX_PAGES_WRITTEN,
 	MONITOR_OVLD_NON_INDEX_PAGES_WRITTEN,
 	MONITOR_OVLD_PAGES_READ,
+	MONITOR_OVLD_PAGES0_READ,
 	MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS,
 	MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS_AVOIDED,
 	MONITOR_OVLD_BYTE_READ,
diff --git a/storage/xtradb/include/srv0srv.h b/storage/xtradb/include/srv0srv.h
index f4557a08762..7e727d0917f 100644
--- a/storage/xtradb/include/srv0srv.h
+++ b/storage/xtradb/include/srv0srv.h
@@ -186,6 +186,12 @@ struct srv_stats_t {
 	/** Number of lock waits that have been up to max time (i.e.) lock
 	wait timeout */
 	ulint_ctr_1_t		n_lock_max_wait_time;
+
+	/** Number of times page 0 is read from tablespace */
+	ulint_ctr_64_t		page0_read;
+
+	/** Number of encryption_get_latest_key_version calls */
+	ulint_ctr_64_t		n_key_requests;
 };
 
 extern const char*	srv_main_thread_op_info;
@@ -222,8 +228,10 @@ extern os_event_t	srv_checkpoint_completed_event;
 log tracking iteration */
 extern os_event_t	srv_redo_log_tracked_event;
 
-/** srv_redo_log_follow_thread spawn flag */
-extern bool srv_redo_log_thread_started;
+/** Whether the redo log tracker thread has been started. Does not take into
+account whether the tracking is currently enabled (see srv_track_changed_pages
+for that) */
+extern bool		srv_redo_log_thread_started;
 
 /* If the last data file is auto-extended, we add this many pages to it
 at a time */
@@ -338,6 +346,10 @@ extern char**	srv_data_file_names;
 extern ulint*	srv_data_file_sizes;
 extern ulint*	srv_data_file_is_raw_partition;
 
+
+/** Whether the redo log tracking is currently enabled. Note that it is
+possible for the log tracker thread to be running and the tracking to be
+disabled */
 extern my_bool		srv_track_changed_pages;
 extern ulonglong	srv_max_bitmap_file_size;
 
@@ -585,6 +597,9 @@ extern ibool	srv_priority_boost;
 extern ulint	srv_truncated_status_writes;
 extern ulint	srv_available_undo_logs;
 
+extern ulint	srv_column_compressed;
+extern ulint	srv_column_decompressed;
+
 extern	ulint	srv_mem_pool_size;
 extern	ulint	srv_lock_table_size;
 
@@ -1158,7 +1173,8 @@ struct export_var_t{
 	ulint innodb_os_log_pending_fsyncs;	/*!< fil_n_pending_log_flushes */
 	ulint innodb_page_size;			/*!< UNIV_PAGE_SIZE */
 	ulint innodb_pages_created;		/*!< buf_pool->stat.n_pages_created */
-	ulint innodb_pages_read;		/*!< buf_pool->stat.n_pages_read */
+	ulint innodb_pages_read;		/*!< buf_pool->stat.n_pages_read*/
+	ulint innodb_page0_read;		/*!< srv_stats.page0_read */
 	ulint innodb_pages_written;		/*!< buf_pool->stat.n_pages_written */
 	ib_int64_t innodb_purge_trx_id;
 	ib_int64_t innodb_purge_undo_no;
@@ -1212,6 +1228,8 @@ struct export_var_t{
 	ulint innodb_purge_view_trx_id_age;	/*!< rw_max_trx_id
 						- purged view's min trx_id */
 #endif /* UNIV_DEBUG */
+	ulint innodb_column_compressed;		/*!< srv_column_compressed */
+	ulint innodb_column_decompressed;	/*!< srv_column_decompressed */
 
 	ib_int64_t innodb_page_compression_saved;/*!< Number of bytes saved
 						by page compression */
@@ -1257,6 +1275,7 @@ struct export_var_t{
 	ulint innodb_encryption_rotation_pages_modified;
 	ulint innodb_encryption_rotation_pages_flushed;
 	ulint innodb_encryption_rotation_estimated_iops;
+	ib_int64_t innodb_encryption_key_requests;
 
 	ulint innodb_scrub_page_reorganizations;
 	ulint innodb_scrub_page_splits;
diff --git a/storage/xtradb/include/trx0trx.h b/storage/xtradb/include/trx0trx.h
index 0733d830a9a..239ed0b273b 100644
--- a/storage/xtradb/include/trx0trx.h
+++ b/storage/xtradb/include/trx0trx.h
@@ -881,6 +881,8 @@ struct trx_t{
 
 	time_t		start_time;	/*!< time the trx state last time became
 					TRX_STATE_ACTIVE */
+	clock_t		start_time_micro;	/*!< start time of transaction in
+					microseconds */
 	trx_id_t	id;		/*!< transaction id */
 	XID		xid;		/*!< X/Open XA transaction
 					identification to identify a
diff --git a/storage/xtradb/include/univ.i b/storage/xtradb/include/univ.i
index 5320776c042..08236889556 100644
--- a/storage/xtradb/include/univ.i
+++ b/storage/xtradb/include/univ.i
@@ -45,10 +45,10 @@ Created 1/20/1994 Heikki Tuuri
 
 #define INNODB_VERSION_MAJOR	5
 #define INNODB_VERSION_MINOR	6
-#define INNODB_VERSION_BUGFIX	31
+#define INNODB_VERSION_BUGFIX	34
 
 #ifndef PERCONA_INNODB_VERSION
-#define PERCONA_INNODB_VERSION 77.0
+#define PERCONA_INNODB_VERSION 79.1
 #endif
 
 /* Enable UNIV_LOG_ARCHIVE in XtraDB */
diff --git a/storage/xtradb/lock/lock0lock.cc b/storage/xtradb/lock/lock0lock.cc
index 29f89dcbf4f..af2c823af64 100644
--- a/storage/xtradb/lock/lock0lock.cc
+++ b/storage/xtradb/lock/lock0lock.cc
@@ -76,6 +76,9 @@ bitmap */
 
 #define LOCK_PAGE_BITMAP_MARGIN		64
 
+/** Lock scheduling algorithm in use; initialized to FCFS (First Come First Served) */
+ulong innodb_lock_schedule_algorithm = INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS;
+
 /* An explicit record lock affects both the record and the gap before it.
 An implicit x-lock does not affect the gap, it only locks the index
 record from read or update.
@@ -380,11 +383,33 @@ struct lock_stack_t {
 	ulint		heap_no;		/*!< heap number if rec lock */
 };
 
-extern "C" void thd_rpl_deadlock_check(MYSQL_THD thd, MYSQL_THD other_thd);
-extern "C" int thd_need_wait_reports(const MYSQL_THD thd);
+/*********************************************************************//**
+Checks if a waiting record lock request still has to wait in a queue.
+@return lock that is causing the wait */
+static
+const lock_t*
+lock_rec_has_to_wait_in_queue(
+/*==========================*/
+  const lock_t*	wait_lock);	/*!< in: waiting record lock */
+
+/*************************************************************//**
+Grants a lock to a waiting lock request and releases the waiting transaction.
+The caller must hold lock_sys->mutex. */
+static
+void
+lock_grant(
+/*=======*/
+	lock_t*	lock,	/*!< in/out: waiting lock request */
+    bool    owns_trx_mutex);    /*!< in: whether lock->trx->mutex is owned */
+
+extern "C" void thd_report_wait_for(MYSQL_THD thd, MYSQL_THD other_thd);
+extern "C" int thd_need_wait_for(const MYSQL_THD thd);
 extern "C"
 int thd_need_ordering_with(const MYSQL_THD thd, const MYSQL_THD other_thd);
 
+extern "C"
+int thd_deadlock_victim_preference(const MYSQL_THD thd1, const MYSQL_THD thd2);
+
 /** Stack to use during DFS search. Currently only a single stack is required
 because there is no parallel deadlock check. This stack is protected by
 the lock_sys_t::mutex. */
@@ -406,7 +431,7 @@ UNIV_INTERN mysql_pfs_key_t	lock_sys_wait_mutex_key;
 struct thd_wait_reports {
 	struct thd_wait_reports *next;	/*!< List link */
 	ulint used;			/*!< How many elements in waitees[] */
-	trx_t *waitees[64];		/*!< Trxs for thd_rpl_deadlock_check() */
+	trx_t *waitees[64];		/*!< Trxs for thd_report_wait_for() */
 };
 
 
@@ -890,7 +915,8 @@ lock_reset_lock_and_trx_wait(
 	ut_ad(lock_get_wait(lock));
 	ut_ad(lock_mutex_own());
 
-	if (lock->trx->lock.wait_lock != lock) {
+	if (lock->trx->lock.wait_lock &&
+	    lock->trx->lock.wait_lock != lock) {
 		const char*	stmt=NULL;
 		const char*	stmt2=NULL;
 		size_t		stmt_len;
@@ -911,7 +937,7 @@ lock_reset_lock_and_trx_wait(
 			trx_id,
 			stmt2 ? stmt2 : "NULL",
 			lock->trx->lock.wait_lock);
-		ut_error;
+		ut_ad(lock->trx->lock.wait_lock == lock);
 	}
 
 	lock->trx->lock.wait_lock = NULL;
@@ -2029,6 +2055,145 @@ wsrep_print_wait_locks(
 #endif /* WITH_WSREP */
 
 /*********************************************************************//**
+Check if lock1 has higher priority than lock2.
+NULL has lowest priority (if both are NULL the result is false).
+If lock1 is not a wait lock, it has higher priority.
+If only lock1 is a wait lock, it has lower priority.
+Otherwise the one whose trx->start_time_micro is smaller wins; ties favor lock1.
+@return true if lock1 has higher priority, false otherwise. */
+bool
+has_higher_priority(
+	lock_t *lock1,	/*!< in: first lock, may be NULL */
+	lock_t *lock2)	/*!< in: second lock, may be NULL */
+{
+	if (lock1 == NULL) {
+		return false;
+	} else if (lock2 == NULL) {
+		return true;
+	}
+	// Neither is NULL. Compare them by wait mode and trx age.
+	if (!lock_get_wait(lock1)) {
+		return true;
+	} else if (!lock_get_wait(lock2)) {
+		return false;
+	}
+	return lock1->trx->start_time_micro <= lock2->trx->start_time_micro;
+}
+
+/*********************************************************************//**
+Insert a lock into the hash list according to its mode (whether it is a wait
+lock) and the age of the transaction it is associated with.
+A lock that is not a wait lock goes to the head of the hash list; a wait lock
+is placed among the other wait locks according to the age of its transaction.
+@return DB_SUCCESS_LOCKED_REC if a wait lock was granted here, else DB_SUCCESS */
+static
+dberr_t
+lock_rec_insert_by_trx_age(
+	lock_t	*in_lock) /*!< in: lock to be inserted */{
+	ulint				space;
+	ulint				page_no;
+	ulint				rec_fold;
+	lock_t*				node;
+	lock_t*				next;
+	hash_cell_t*		cell;
+
+	space = in_lock->un_member.rec_lock.space;
+	page_no = in_lock->un_member.rec_lock.page_no;
+	rec_fold = lock_rec_fold(space, page_no);
+	cell = hash_get_nth_cell(lock_sys->rec_hash,
+				 hash_calc_hash(rec_fold, lock_sys->rec_hash));
+
+	node = (lock_t *) cell->node;
+	// Head insert: empty chain, in_lock is not a wait lock, or it outranks the current head.
+	if (node == NULL || !lock_get_wait(in_lock) || has_higher_priority(in_lock, node)) {
+		cell->node = in_lock;
+		in_lock->hash = node;
+		if (lock_get_wait(in_lock)) {
+			lock_grant(in_lock, true);	/* true: lock_grant() does not take in_lock->trx->mutex itself */
+			return DB_SUCCESS_LOCKED_REC;
+		}
+		return DB_SUCCESS;
+	}
+	while (node != NULL && has_higher_priority((lock_t *) node->hash,
+						   in_lock)) {
+		node = (lock_t *) node->hash;
+	}	/* stops at the tail at the latest: has_higher_priority(NULL, in_lock) is false */
+	next = (lock_t *) node->hash;
+	node->hash = in_lock;	/* splice in_lock in right after node */
+	in_lock->hash = next;
+
+	if (lock_get_wait(in_lock) && !lock_rec_has_to_wait_in_queue(in_lock)) {
+		lock_grant(in_lock, true);
+		if (cell->node != in_lock) {
+			// Move it to the front of the queue
+			node->hash = in_lock->hash;	/* unlink in_lock from behind node */
+			next = (lock_t *) cell->node;
+			cell->node = in_lock;
+			in_lock->hash = next;
+		}
+		return DB_SUCCESS_LOCKED_REC;
+	}
+
+	return DB_SUCCESS;
+}
+/** Walks the hash cell chain that in_lock's page maps to and asserts (ut_ad) that no granted lock follows a waiting one. */
+static
+bool
+lock_queue_validate(
+	const lock_t	*in_lock) /*!< in: lock whose hash list is to be validated */
+{
+	ulint				space;
+	ulint				page_no;
+	ulint				rec_fold;
+	hash_cell_t*		cell;
+	lock_t*				next;
+	bool				wait_lock = false;	/* set once a waiting lock has been seen */
+
+	if (in_lock == NULL) {
+		return true;
+	}
+
+	space = in_lock->un_member.rec_lock.space;
+	page_no = in_lock->un_member.rec_lock.page_no;
+	rec_fold = lock_rec_fold(space, page_no);
+	cell = hash_get_nth_cell(lock_sys->rec_hash,
+			hash_calc_hash(rec_fold, lock_sys->rec_hash));
+	next = (lock_t *) cell->node;
+	while (next != NULL) {
+		// If this is a granted lock, check that there's no wait lock before it.
+		if (!lock_get_wait(next)) {
+			ut_ad(!wait_lock);
+		} else {
+			wait_lock = true;
+		}
+		next = (lock_t *) next->hash;
+	}
+	return true;	/* always true; a violation is only reported through ut_ad above */
+}
+/** Makes in_lock the first node of the hash cell chain selected by rec_fold; no-op if in_lock is NULL or already first. */
+static
+void
+lock_rec_insert_to_head(
+	lock_t *in_lock,   /*!< in: lock to be inserted */
+	ulint	rec_fold)  /*!< in: rec_fold of the page */
+{
+	hash_cell_t*		cell;
+	lock_t*				node;
+
+	if (in_lock == NULL) {
+		return;
+	}
+
+	cell = hash_get_nth_cell(lock_sys->rec_hash,
+			hash_calc_hash(rec_fold, lock_sys->rec_hash));
+	node = (lock_t *) cell->node;
+	if (node != in_lock) {
+		cell->node = in_lock;
+		in_lock->hash = node;	/* old head now follows in_lock; in_lock is not unlinked from any earlier position here */
+	}
+}
+
+/*********************************************************************//**
 Creates a new record lock and inserts it to the lock queue. Does NOT check
 for deadlocks or lock compatibility!
 @return	created lock */
@@ -2055,8 +2220,10 @@ lock_rec_create(
 	lock_t*		lock;
 	ulint		page_no;
 	ulint		space;
+	ulint		rec_fold;
 	ulint		n_bits;
 	ulint		n_bytes;
+	bool		wait_lock;
 	const page_t*	page;
 
 	ut_ad(lock_mutex_own());
@@ -2083,6 +2250,8 @@ lock_rec_create(
 		type_mode = type_mode & ~(LOCK_GAP | LOCK_REC_NOT_GAP);
 	}
 
+	wait_lock = type_mode & LOCK_WAIT;
+
 	/* Make lock bitmap bigger by a safety margin */
 	n_bits = page_dir_get_n_heap(page) + LOCK_PAGE_BITMAP_MARGIN;
 	n_bytes = 1 + n_bits / 8;
@@ -2098,6 +2267,7 @@ lock_rec_create(
 	lock->un_member.rec_lock.space = space;
 	lock->un_member.rec_lock.page_no = page_no;
 	lock->un_member.rec_lock.n_bits = n_bytes * 8;
+	rec_fold = lock_rec_fold(space, page_no);
 
 	/* Reset to zero the bitmap which resides immediately after the
 	lock struct */
@@ -2190,13 +2360,27 @@ lock_rec_create(
 			return(lock);
 		}
 		trx_mutex_exit(c_lock->trx);
+	} else if (innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS
+		&& !thd_is_replication_slave_thread(lock->trx->mysql_thd)) {
+		if (wait_lock) {
+			HASH_INSERT(lock_t, hash, lock_sys->rec_hash, rec_fold, lock);
+		} else {
+			lock_rec_insert_to_head(lock, rec_fold);
+		}
 	} else {
-		HASH_INSERT(lock_t, hash, lock_sys->rec_hash,
-			    lock_rec_fold(space, page_no), lock);
+		HASH_INSERT(lock_t, hash, lock_sys->rec_hash, rec_fold, lock);
 	}
 #else
-	HASH_INSERT(lock_t, hash, lock_sys->rec_hash,
-		    lock_rec_fold(space, page_no), lock);
+	if (innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS
+		&& !thd_is_replication_slave_thread(lock->trx->mysql_thd)) {
+		if (wait_lock) {
+			HASH_INSERT(lock_t, hash, lock_sys->rec_hash, rec_fold, lock);
+		} else {
+			lock_rec_insert_to_head(lock, rec_fold);
+		}
+	} else {
+		HASH_INSERT(lock_t, hash, lock_sys->rec_hash, rec_fold, lock);
+	}
 #endif /* WITH_WSREP */
 
 	lock_sys->rec_num++;
@@ -2255,6 +2439,9 @@ lock_rec_enqueue_waiting(
 	trx_id_t		victim_trx_id;
 	ulint			sec;
 	ulint			ms;
+	ulint			space;
+	ulint			page_no;
+	dberr_t			err;
 
 
 	ut_ad(lock_mutex_own());
@@ -2329,34 +2516,51 @@ lock_rec_enqueue_waiting(
 		transaction as a victim, it is possible that we
 		already have the lock now granted! */
 
-		return(DB_SUCCESS_LOCKED_REC);
-	}
-
-	trx->lock.que_state = TRX_QUE_LOCK_WAIT;
+		err = DB_SUCCESS_LOCKED_REC;
+	} else {
+		trx->lock.que_state = TRX_QUE_LOCK_WAIT;
 
-	trx->lock.was_chosen_as_deadlock_victim = FALSE;
-	trx->lock.wait_started = ut_time();
+		trx->lock.was_chosen_as_deadlock_victim = FALSE;
+		trx->lock.wait_started = ut_time();
 
-	if (UNIV_UNLIKELY(trx->take_stats)) {
-		ut_usectime(&sec, &ms);
-		trx->lock_que_wait_ustarted = (ib_uint64_t)sec * 1000000 + ms;
-	}
+		if (UNIV_UNLIKELY(trx->take_stats)) {
+			ut_usectime(&sec, &ms);
+			trx->lock_que_wait_ustarted = (ib_uint64_t)sec * 1000000 + ms;
+		}
 
-	ut_a(que_thr_stop(thr));
+		ut_a(que_thr_stop(thr));
 
 #ifdef UNIV_DEBUG
-	if (lock_print_waits) {
-		fprintf(stderr, "Lock wait for trx " TRX_ID_FMT " in index ",
-			trx->id);
-		ut_print_name(stderr, trx, FALSE, index->name);
-	}
+		if (lock_print_waits) {
+			fprintf(stderr, "Lock wait for trx " TRX_ID_FMT " in index ",
+				trx->id);
+			ut_print_name(stderr, trx, FALSE, index->name);
+		}
 #endif /* UNIV_DEBUG */
 
-	MONITOR_INC(MONITOR_LOCKREC_WAIT);
+		MONITOR_INC(MONITOR_LOCKREC_WAIT);
 
-	trx->n_rec_lock_waits++;
+		trx->n_rec_lock_waits++;
 
-	return(DB_LOCK_WAIT);
+		err = DB_LOCK_WAIT;
+	}
+
+	// Move it only when it does not cause a deadlock.
+	if (err != DB_DEADLOCK
+		&& innodb_lock_schedule_algorithm
+			== INNODB_LOCK_SCHEDULE_ALGORITHM_VATS
+		&& !thd_is_replication_slave_thread(lock->trx->mysql_thd)) {
+		space = buf_block_get_space(block);
+		page_no	= buf_block_get_page_no(block);
+		HASH_DELETE(lock_t, hash, lock_sys->rec_hash,
+			lock_rec_fold(space, page_no), lock);
+		dberr_t res = lock_rec_insert_by_trx_age(lock);
+		if (res != DB_SUCCESS) {
+			return res;
+		}
+	}
+
+	return err;
 }
 
 /*********************************************************************//**
@@ -2451,7 +2655,7 @@ lock_rec_add_to_queue(
 				if (wsrep_debug) {
 					fprintf(stderr,
 						"BF skipping wait: %lu\n",
-						(ulong) trx->id);
+						trx->id);
 					lock_rec_print(stderr, lock);
 				}
 		  } else
@@ -2788,13 +2992,16 @@ static
 void
 lock_grant(
 /*=======*/
-	lock_t*	lock)	/*!< in/out: waiting lock request */
+	lock_t*	lock,	/*!< in/out: waiting lock request */
+	bool	owns_trx_mutex)    /*!< in: whether lock->trx->mutex is owned */
 {
 	ut_ad(lock_mutex_own());
 
 	lock_reset_lock_and_trx_wait(lock);
 
-	trx_mutex_enter(lock->trx);
+	if (!owns_trx_mutex) {
+		trx_mutex_enter(lock->trx);
+	}
 
 	if (lock_get_mode(lock) == LOCK_AUTO_INC) {
 		dict_table_t*	table = lock->un_member.tab_lock.table;
@@ -2843,7 +3050,9 @@ lock_grant(
 
 	lock->wait_time = (ulint)difftime(ut_time(), lock->requested_time);
 
-	trx_mutex_exit(lock->trx);
+	if (!owns_trx_mutex) {
+		trx_mutex_exit(lock->trx);
+	}
 }
 
 /*************************************************************//**
@@ -2881,6 +3090,66 @@ lock_rec_cancel(
 	trx_mutex_exit(lock->trx);
 }
 
+static	/* Grants the waiting locks on a page that no longer have to wait and moves each one to the head of the hash chain */
+void
+lock_grant_and_move_on_page(
+	ulint			space,	/*!< in: tablespace id of the page */
+	ulint			page_no)	/*!< in: page number */
+{
+	lock_t*		lock;
+	lock_t*		next;
+	lock_t*		previous;
+	ulint		rec_fold = lock_rec_fold(space, page_no);
+
+	previous = (lock_t *) hash_get_nth_cell(lock_sys->rec_hash,
+							hash_calc_hash(rec_fold, lock_sys->rec_hash))->node;
+	if (previous == NULL) {
+		return;	/* empty hash cell: nothing to grant */
+	}
+	if (previous->un_member.rec_lock.space == space &&
+		previous->un_member.rec_lock.page_no == page_no) {
+		lock = previous;	/* the chain head already belongs to this page */
+	}
+	else {
+		next = (lock_t *) previous->hash;
+		while (next &&
+				(next->un_member.rec_lock.space != space ||
+				next->un_member.rec_lock.page_no != page_no)) {
+			previous = next;
+			next = (lock_t *) previous->hash;
+		}
+		lock = (lock_t *) previous->hash;	/* first lock on this page, or NULL if there is none */
+	}
+
+	ut_ad(previous->hash == lock || previous == lock);
+	/* Walk the rest of the chain. Grant each waiting lock on this page that
+	 has no conflicting lock ahead, and move it to the head of the list. */
+	for (;lock != NULL;) {
+		/* If the lock is a wait lock on this page, and it does not need to wait. */
+		if ((lock->un_member.rec_lock.space == space)
+			&& (lock->un_member.rec_lock.page_no == page_no)
+			&& lock_get_wait(lock)
+			&& !lock_rec_has_to_wait_in_queue(lock)) {
+			
+			lock_grant(lock, false);
+			
+			if (previous != NULL) {
+				/* Move the lock to the head of the list. */
+				HASH_GET_NEXT(hash, previous) = HASH_GET_NEXT(hash, lock);
+				lock_rec_insert_to_head(lock, rec_fold);
+			} else {
+				/* NOTE(review): previous looks never NULL here (early return above), so this branch seems unreachable - confirm. */
+				previous = lock;
+			}
+			/* Move on to the next lock. */
+			lock = static_cast<lock_t *>(HASH_GET_NEXT(hash, previous));
+		} else {
+			previous = lock;
+			lock = static_cast<lock_t *>(HASH_GET_NEXT(hash, lock));
+		}
+	}
+}
+
 /*************************************************************//**
 Removes a record lock request, waiting or granted, from the queue and
 grants locks to other transactions in the queue if they now are entitled
@@ -2921,21 +3190,27 @@ lock_rec_dequeue_from_page(
 	MONITOR_INC(MONITOR_RECLOCK_REMOVED);
 	MONITOR_DEC(MONITOR_NUM_RECLOCK);
 
-	/* Check if waiting locks in the queue can now be granted: grant
-	locks if there are no conflicting locks ahead. Stop at the first
-	X lock that is waiting or has been granted. */
+	if (innodb_lock_schedule_algorithm
+		== INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS ||
+		thd_is_replication_slave_thread(in_lock->trx->mysql_thd)) {
+		/* Check if waiting locks in the queue can now be granted: grant
+		locks if there are no conflicting locks ahead. Stop at the first
+		X lock that is waiting or has been granted. */
 
-	for (lock = lock_rec_get_first_on_page_addr(space, page_no);
-	     lock != NULL;
-	     lock = lock_rec_get_next_on_page(lock)) {
+		for (lock = lock_rec_get_first_on_page_addr(space, page_no);
+		     lock != NULL;
+		     lock = lock_rec_get_next_on_page(lock)) {
 
-		if (lock_get_wait(lock)
-		    && !lock_rec_has_to_wait_in_queue(lock)) {
+			if (lock_get_wait(lock)
+			    && !lock_rec_has_to_wait_in_queue(lock)) {
 
-			/* Grant the lock */
-			ut_ad(lock->trx != in_lock->trx);
-			lock_grant(lock);
+				/* Grant the lock */
+				ut_ad(lock->trx != in_lock->trx);
+				lock_grant(lock, false);
+			}
 		}
+	} else {
+		lock_grant_and_move_on_page(space, page_no);
 	}
 }
 
@@ -4141,7 +4416,8 @@ lock_get_first_lock(
 	}
 
 	ut_a(lock != NULL);
-	ut_a(lock != ctx->wait_lock);
+	ut_a(lock != ctx->wait_lock ||
+            innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_VATS);
 	ut_ad(lock_get_type_low(lock) == lock_get_type_low(ctx->wait_lock));
 
 	return(lock);
@@ -4512,7 +4788,14 @@ lock_report_waiters_to_mysql(
 			/*  There is no need to report waits to a trx already
 			selected as a victim. */
 			if (w_trx->id != victim_trx_id) {
-				thd_rpl_deadlock_check(mysql_thd, w_trx->mysql_thd);
+				/* If thd_report_wait_for() decides to kill the
+				transaction, then we will get a call back into
+				innobase_kill_query. We mark this by setting
+				abort_type to TRX_REPLICATION_ABORT, so we can
+				avoid trying to recursively take lock_sys->mutex. */
+				w_trx->abort_type = TRX_REPLICATION_ABORT;
+				thd_report_wait_for(mysql_thd, w_trx->mysql_thd);
+				w_trx->abort_type = TRX_SERVER_ABORT;
 			}
 			++i;
 		}
@@ -4551,7 +4834,7 @@ lock_deadlock_check_and_resolve(
 	assert_trx_in_list(trx);
 
 	start_mysql_thd = trx->mysql_thd;
-	if (start_mysql_thd && thd_need_wait_reports(start_mysql_thd)) {
+	if (start_mysql_thd && thd_need_wait_for(start_mysql_thd)) {
 		waitee_buf_ptr = &waitee_buf;
 	} else {
 		waitee_buf_ptr = NULL;
@@ -5033,7 +5316,7 @@ lock_table_other_has_incompatible(
 #ifdef WITH_WSREP
 			if(wsrep_thd_is_wsrep(trx->mysql_thd)) {
 				if (wsrep_debug) {
-					fprintf(stderr, "WSREP: trx " TRX_ID_FMT " table lock abort\n",
+					fprintf(stderr, "WSREP: trx %ld table lock abort\n",
 						trx->id);
 				}
 				trx_mutex_enter(lock->trx);
@@ -5239,12 +5522,71 @@ lock_table_dequeue(
 
 			/* Grant the lock */
 			ut_ad(in_lock->trx != lock->trx);
-			lock_grant(lock);
+			lock_grant(lock, false);
 		}
 	}
 }
 
 /*=========================== LOCK RELEASE ==============================*/
+static
+void
+lock_grant_and_move_on_rec(
+	lock_t*			first_lock,	/*!< in: lock from which the scan of the hash chain starts */
+	ulint			heap_no)	/*!< in: heap number; only locks with this bit set are considered */
+{
+	lock_t*		lock;		/* lock currently being examined */
+	lock_t*		previous;	/* node visited before lock (equals lock at the chain head) */
+	ulint		space;
+	ulint		page_no;
+	ulint		rec_fold;
+	/* Grant waiting locks on heap_no that need not wait; move them to list head. */
+	space = first_lock->un_member.rec_lock.space;
+	page_no = first_lock->un_member.rec_lock.page_no;
+	rec_fold = lock_rec_fold(space, page_no);
+	/* Fetch the node stored in the hash cell selected by rec_fold. */
+	previous = (lock_t *) hash_get_nth_cell(lock_sys->rec_hash,
+							hash_calc_hash(rec_fold, lock_sys->rec_hash))->node;
+	if (previous == NULL) {
+		return;
+	}
+	if (previous == first_lock) {
+		lock = previous;
+	} else {	/* walk the chain until previous->hash is first_lock, or the chain ends */
+		while (previous->hash &&
+				previous->hash != first_lock) {
+			previous = (lock_t *) previous->hash;
+	    }
+		lock = (lock_t *) previous->hash;
+	}
+	/* Grant locks if there are no conflicting locks ahead.
+	 Move granted locks to the head of the list. */
+	for (;lock != NULL;) {
+
+		/* Only a waiting lock on this page that covers heap_no and no longer has to wait is granted. */
+		if (lock->un_member.rec_lock.space == space
+			&& lock->un_member.rec_lock.page_no == page_no
+			&& lock_rec_get_nth_bit(lock, heap_no)
+			&& lock_get_wait(lock)
+			&& !lock_rec_has_to_wait_in_queue(lock)) {
+
+			lock_grant(lock, false);
+			/* NOTE(review): previous is never NULL here, so the else branch looks dead; at the chain head previous == lock -- confirm lock_rec_insert_to_head() copes. */
+			if (previous != NULL) {
+				/* Move the lock to the head of the list. */
+				HASH_GET_NEXT(hash, previous) = HASH_GET_NEXT(hash, lock);
+				lock_rec_insert_to_head(lock, rec_fold);
+			} else {
+				/* Already at the head of the list. */
+				previous = lock;
+			}
+			/* Move on to the next lock. */
+			lock = static_cast<lock_t *>(HASH_GET_NEXT(hash, previous));
+		} else {
+			previous = lock;
+			lock = static_cast<lock_t *>(HASH_GET_NEXT(hash, lock));
+		}
+	}
+}
 
 /*************************************************************//**
 Removes a granted record lock of a transaction from the queue and grants
@@ -5308,17 +5650,24 @@ released:
 	ut_a(!lock_get_wait(lock));
 	lock_rec_reset_nth_bit(lock, heap_no);
 
-	/* Check if we can now grant waiting lock requests */
+	if (innodb_lock_schedule_algorithm
+		== INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS ||
+		thd_is_replication_slave_thread(lock->trx->mysql_thd)) {
 
-	for (lock = first_lock; lock != NULL;
-	     lock = lock_rec_get_next(heap_no, lock)) {
-		if (lock_get_wait(lock)
-		    && !lock_rec_has_to_wait_in_queue(lock)) {
+		/* Check if we can now grant waiting lock requests */
 
-			/* Grant the lock */
-			ut_ad(trx != lock->trx);
-			lock_grant(lock);
+		for (lock = first_lock; lock != NULL;
+			 lock = lock_rec_get_next(heap_no, lock)) {
+			if (lock_get_wait(lock)
+				&& !lock_rec_has_to_wait_in_queue(lock)) {
+
+				/* Grant the lock */
+				ut_ad(trx != lock->trx);
+				lock_grant(lock, false);
+			}
 		}
+	} else {
+		lock_grant_and_move_on_rec(first_lock, heap_no);
 	}
 
 	lock_mutex_exit();
@@ -6353,7 +6702,6 @@ lock_rec_queue_validate(
 
 		if (!lock_rec_get_gap(lock) && !lock_get_wait(lock)) {
 
-#ifndef WITH_WSREP
 			enum lock_mode	mode;
 
 
@@ -6362,16 +6710,31 @@ lock_rec_queue_validate(
 			} else {
 				mode = LOCK_S;
 			}
-			ut_a(!lock_rec_other_has_expl_req(
-			        mode, 0, 0, block, heap_no, lock->trx->id));
-#endif /* WITH_WSREP */
 
-		} else if (lock_get_wait(lock) && !lock_rec_get_gap(lock)) {
+			const lock_t*	other_lock
+				= lock_rec_other_has_expl_req(
+					mode, 0, 0, block, heap_no,
+					lock->trx->id);
+#ifdef WITH_WSREP
+			ut_a(!other_lock
+			     || wsrep_thd_is_BF(lock->trx->mysql_thd, FALSE)
+			     || wsrep_thd_is_BF(other_lock->trx->mysql_thd, FALSE));
+
+#else
+			ut_a(!other_lock);
+#endif /* WITH_WSREP */
 
+		} else if (lock_get_wait(lock) && !lock_rec_get_gap(lock)
+				   && innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS) {
+			// If using VATS, it's possible that a wait lock is inserted to a place in the list
+			// such that it does not need to wait.
 			ut_a(lock_rec_has_to_wait_in_queue(lock));
 		}
 	}
 
+	ut_ad(innodb_lock_schedule_algorithm == INNODB_LOCK_SCHEDULE_ALGORITHM_FCFS ||
+		  lock_queue_validate(lock));
+
 func_exit:
 	if (!locked_lock_trx_sys) {
 		lock_mutex_exit();
@@ -6926,7 +7289,7 @@ lock_clust_rec_modify_check_and_lock(
 	lock_rec_convert_impl_to_expl(block, rec, index, offsets);
 
 	lock_mutex_enter();
-	trx_t*		trx __attribute__((unused))= thr_get_trx(thr);
+	trx_t*		trx = thr_get_trx(thr);
 
 	ut_ad(lock_table_has(trx, index->table, LOCK_IX));
 
@@ -6990,7 +7353,7 @@ lock_sec_rec_modify_check_and_lock(
 	index record, and this would not have been possible if another active
 	transaction had modified this secondary index record. */
 
-	trx_t* trx __attribute__((unused))= thr_get_trx(thr);
+	trx_t* trx = thr_get_trx(thr);
 	lock_mutex_enter();
 
 	ut_ad(lock_table_has(trx, index->table, LOCK_IX));
@@ -7099,7 +7462,7 @@ lock_sec_rec_read_check_and_lock(
 		lock_rec_convert_impl_to_expl(block, rec, index, offsets);
 	}
 
-	trx_t* trx __attribute__((unused))= thr_get_trx(thr);
+	trx_t* trx = thr_get_trx(thr);
 	lock_mutex_enter();
 
 	ut_ad(mode != LOCK_X
@@ -7182,7 +7545,7 @@ lock_clust_rec_read_check_and_lock(
 	}
 
 	lock_mutex_enter();
-	trx_t* trx __attribute__((unused))= thr_get_trx(thr);
+	trx_t* trx = thr_get_trx(thr);
 
 	ut_ad(mode != LOCK_X
 	      || lock_table_has(trx, index->table, LOCK_IX));
diff --git a/storage/xtradb/log/log0log.cc b/storage/xtradb/log/log0log.cc
index 0b5d27b8fd1..a01a2ed9570 100644
--- a/storage/xtradb/log/log0log.cc
+++ b/storage/xtradb/log/log0log.cc
@@ -1011,6 +1011,7 @@ log_init(void)
 
 	log_sys->next_checkpoint_no = 0;
 	log_sys->last_checkpoint_lsn = log_sys->lsn;
+	log_sys->next_checkpoint_lsn = log_sys->lsn;
 	log_sys->n_pending_checkpoint_writes = 0;
 
 
@@ -1944,6 +1945,7 @@ log_complete_checkpoint(void)
 
 	log_sys->next_checkpoint_no++;
 
+	ut_ad(log_sys->next_checkpoint_lsn >= log_sys->last_checkpoint_lsn);
 	log_sys->last_checkpoint_lsn = log_sys->next_checkpoint_lsn;
 	MONITOR_SET(MONITOR_LSN_CHECKPOINT_AGE,
 		    log_sys->lsn - log_sys->last_checkpoint_lsn);
@@ -2031,11 +2033,17 @@ log_group_checkpoint(
 	ulint		i;
 
 	ut_ad(!srv_read_only_mode);
+	ut_ad(srv_shutdown_state != SRV_SHUTDOWN_LAST_PHASE);
 	ut_ad(mutex_own(&(log_sys->mutex)));
 	ut_a(LOG_CHECKPOINT_SIZE <= OS_FILE_LOG_BLOCK_SIZE);
 
 	buf = group->checkpoint_buf;
 
+#ifdef UNIV_DEBUG
+	lsn_t		old_next_checkpoint_lsn
+		= mach_read_from_8(buf + LOG_CHECKPOINT_LSN);
+	ut_ad(old_next_checkpoint_lsn <= log_sys->next_checkpoint_lsn);
+#endif /* UNIV_DEBUG */
 	mach_write_to_8(buf + LOG_CHECKPOINT_NO, log_sys->next_checkpoint_no);
 	mach_write_to_8(buf + LOG_CHECKPOINT_LSN, log_sys->next_checkpoint_lsn);
 
@@ -2314,6 +2322,7 @@ log_checkpoint(
 		return(FALSE);
 	}
 
+	ut_ad(oldest_lsn >= log_sys->next_checkpoint_lsn);
 	log_sys->next_checkpoint_lsn = oldest_lsn;
 #ifdef UNIV_DEBUG
 	if (log_debug_writes) {
@@ -3670,13 +3679,15 @@ loop:
 	before proceeding further. */
 	srv_shutdown_state = SRV_SHUTDOWN_FLUSH_PHASE;
 	count = 0;
-	while (buf_page_cleaner_is_active) {
-		++count;
-		os_thread_sleep(100000);
-		if (srv_print_verbose_log && count > 600) {
+	while (buf_page_cleaner_is_active || buf_lru_manager_is_active) {
+		if (srv_print_verbose_log && count == 0) {
 			ib_logf(IB_LOG_LEVEL_INFO,
 				"Waiting for page_cleaner to "
 				"finish flushing of buffer pool");
+		}
+		++count;
+		os_thread_sleep(100000);
+		if (count > 600) {
 			count = 0;
 		}
 	}
@@ -3752,7 +3763,7 @@ loop:
 
 		/* Wake the log tracking thread which will then immediatelly
 		quit because of srv_shutdown_state value */
-		if (srv_track_changed_pages) {
+		if (srv_redo_log_thread_started) {
 			os_event_reset(srv_redo_log_tracked_event);
 			os_event_set(srv_checkpoint_completed_event);
 		}
@@ -3831,7 +3842,7 @@ loop:
 	srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE;
 
 	/* Signal the log following thread to quit */
-	if (srv_track_changed_pages) {
+	if (srv_redo_log_thread_started) {
 		os_event_reset(srv_redo_log_tracked_event);
 		os_event_set(srv_checkpoint_completed_event);
 	}
@@ -3844,6 +3855,7 @@ loop:
 	ut_a(freed);
 
 	ut_a(lsn == log_sys->lsn);
+	ut_ad(lsn == log_sys->last_checkpoint_lsn);
 
 	if (lsn < srv_start_lsn) {
 		ib_logf(IB_LOG_LEVEL_ERROR,
diff --git a/storage/xtradb/log/log0online.cc b/storage/xtradb/log/log0online.cc
index 63f1ef39568..2a1ac63dc5b 100644
--- a/storage/xtradb/log/log0online.cc
+++ b/storage/xtradb/log/log0online.cc
@@ -433,6 +433,7 @@ log_online_track_missing_on_startup(
 					current server startup */
 {
 	ut_ad(last_tracked_lsn != tracking_start_lsn);
+	ut_ad(srv_track_changed_pages);
 
 	ib_logf(IB_LOG_LEVEL_WARN, "last tracked LSN in \'%s\' is " LSN_PF
 		", but the last checkpoint LSN is " LSN_PF ".  This might be "
@@ -615,6 +616,8 @@ log_online_read_init(void)
 	compile_time_assert(MODIFIED_PAGE_BLOCK_BITMAP % 8 == 0);
 	compile_time_assert(MODIFIED_PAGE_BLOCK_BITMAP_LEN % 8 == 0);
 
+	ut_ad(srv_track_changed_pages);
+
 	log_bmp_sys = static_cast<log_bitmap_struct *>
 		(ut_malloc(sizeof(*log_bmp_sys)));
 	log_bmp_sys->read_buf_ptr = static_cast<byte *>
@@ -1089,10 +1092,15 @@ log_online_write_bitmap_page(
 {
 	ibool	success;
 
+	ut_ad(srv_track_changed_pages);
 	ut_ad(mutex_own(&log_bmp_sys->mutex));
 
 	/* Simulate a write error */
-	DBUG_EXECUTE_IF("bitmap_page_write_error", return FALSE;);
+	DBUG_EXECUTE_IF("bitmap_page_write_error",
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"simulating bitmap write error in "
+				"log_online_write_bitmap_page");
+			return FALSE;);
 
 	success = os_file_write(log_bmp_sys->out.name, log_bmp_sys->out.file,
 				block, log_bmp_sys->out.offset,
@@ -1182,7 +1190,9 @@ log_online_write_bitmap(void)
 			rbt_next(log_bmp_sys->modified_pages, bmp_tree_node);
 
 		DBUG_EXECUTE_IF("bitmap_page_2_write_error",
-				DBUG_SET("+d,bitmap_page_write_error"););
+				ut_ad(bmp_tree_node); /* 2nd page must exist */
+				DBUG_SET("+d,bitmap_page_write_error");
+				DBUG_SET("-d,bitmap_page_2_write_error"););
 	}
 
 	rbt_reset(log_bmp_sys->modified_pages);
@@ -1203,15 +1213,11 @@ log_online_follow_redo_log(void)
 	log_group_t*	group;
 	ibool		result;
 
-	mutex_enter(&log_bmp_sys->mutex);
-
-	if (!srv_track_changed_pages) {
-		mutex_exit(&log_bmp_sys->mutex);
-		return FALSE;
-	}
-
+	ut_ad(srv_track_changed_pages);
 	ut_ad(!srv_read_only_mode);
 
+	mutex_enter(&log_bmp_sys->mutex);
+
 	/* Grab the LSN of the last checkpoint, we will parse up to it */
 	mutex_enter(&(log_sys->mutex));
 	log_bmp_sys->end_lsn = log_sys->last_checkpoint_lsn;
@@ -1554,9 +1560,12 @@ log_online_diagnose_bitmap_eof(
 			/* It's a "Warning" here because it's not a fatal error
 			for the whole server */
 			ib_logf(IB_LOG_LEVEL_WARN,
-				"changed page bitmap file \'%s\' does not "
-				"contain a complete run at the end.",
-				bitmap_file->name);
+				"changed page bitmap file \'%s\', size "
+				UINT64PF " bytes, does not "
+				"contain a complete run at the next read "
+				"offset " UINT64PF,
+				bitmap_file->name, bitmap_file->size,
+				bitmap_file->offset);
 			return FALSE;
 		}
 	}
@@ -1788,20 +1797,20 @@ log_online_purge_changed_page_bitmaps(
 		lsn = LSN_MAX;
 	}
 
-	if (srv_track_changed_pages) {
+	if (srv_redo_log_thread_started) {
 		/* User requests might happen with both enabled and disabled
 		tracking */
 		mutex_enter(&log_bmp_sys->mutex);
 	}
 
 	if (!log_online_setup_bitmap_file_range(&bitmap_files, 0, LSN_MAX)) {
-		if (srv_track_changed_pages) {
+		if (srv_redo_log_thread_started) {
 			mutex_exit(&log_bmp_sys->mutex);
 		}
 		return TRUE;
 	}
 
-	if (srv_track_changed_pages && lsn > log_bmp_sys->end_lsn) {
+	if (srv_redo_log_thread_started && lsn > log_bmp_sys->end_lsn) {
 		/* If we have to delete the current output file, close it
 		first. */
 		os_file_close(log_bmp_sys->out.file);
@@ -1834,7 +1843,7 @@ log_online_purge_changed_page_bitmaps(
 		}
 	}
 
-	if (srv_track_changed_pages) {
+	if (srv_redo_log_thread_started) {
 		if (lsn > log_bmp_sys->end_lsn) {
 			lsn_t	new_file_lsn;
 			if (lsn == LSN_MAX) {
@@ -1845,9 +1854,7 @@ log_online_purge_changed_page_bitmaps(
 				new_file_lsn = log_bmp_sys->end_lsn;
 			}
 			if (!log_online_rotate_bitmap_file(new_file_lsn)) {
-				/* If file create failed, signal the log
-				tracking thread to quit next time it wakes
-				up.  */
+				/* If file create failed, stop log tracking */
 				srv_track_changed_pages = FALSE;
 			}
 		}
diff --git a/storage/xtradb/log/log0recv.cc b/storage/xtradb/log/log0recv.cc
index 759687e3fe5..092c2ed88dc 100644
--- a/storage/xtradb/log/log0recv.cc
+++ b/storage/xtradb/log/log0recv.cc
@@ -392,12 +392,6 @@ recv_sys_init(
 	}
 
 #ifndef UNIV_HOTBACKUP
-	/* Initialize red-black tree for fast insertions into the
-	flush_list during recovery process.
-	As this initialization is done while holding the buffer pool
-	mutex we perform it before acquiring recv_sys->mutex. */
-	buf_flush_init_flush_rbt();
-
 	mutex_enter(&(recv_sys->mutex));
 
 	recv_sys->heap = mem_heap_create_typed(256,
@@ -490,9 +484,6 @@ recv_sys_debug_free(void)
 	recv_sys->last_block_buf_start = NULL;
 
 	mutex_exit(&(recv_sys->mutex));
-
-	/* Free up the flush_rbt. */
-	buf_flush_free_flush_rbt();
 }
 # endif /* UNIV_LOG_DEBUG */
 
@@ -3140,6 +3131,11 @@ recv_recovery_from_checkpoint_start_func(
 	byte*		log_hdr_buf_base = reinterpret_cast<byte *>
 		(alloca(LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE));
 	dberr_t		err;
+
+	/* Initialize red-black tree for fast insertions into the
+	flush_list during recovery process. */
+	buf_flush_init_flush_rbt();
+
 	ut_when_dtor<recv_dblwr_t> tmp(recv_sys->dblwr);
 
 	log_hdr_buf = static_cast<byte *>
@@ -3568,6 +3564,9 @@ recv_recovery_from_checkpoint_finish(void)
 #ifndef UNIV_LOG_DEBUG
 	recv_sys_debug_free();
 #endif
+	/* Free up the flush_rbt. */
+	buf_flush_free_flush_rbt();
+
 	/* Roll back any recovered data dictionary transactions, so
 	that the data dictionary tables will be free of any locks.
 	The data dictionary latch should guarantee that there is at
diff --git a/storage/xtradb/mach/mach0data.cc b/storage/xtradb/mach/mach0data.cc
index df68aab8a18..206434dc5ab 100644
--- a/storage/xtradb/mach/mach0data.cc
+++ b/storage/xtradb/mach/mach0data.cc
@@ -56,7 +56,18 @@ mach_parse_compressed(
 		*val = flag;
 		return(ptr + 1);
 
-	} else if (flag < 0xC0UL) {
+	}
+
+	/* Work around GCC bug
+	https://gcc.gnu.org/bugzilla/show_bug.cgi?id=77673:
+	the compiler moves mach_read_from_4 right to the beginning of the
+	function, causing an out-of-bounds read if we are reading a short
+	integer close to the end of the buffer. */
+#if defined(__GNUC__) && (__GNUC__ >= 5) && !defined(__clang__)
+	asm volatile("": : :"memory");
+#endif
+
+	if (flag < 0xC0UL) {
 		if (end_ptr < ptr + 2) {
 			return(NULL);
 		}
diff --git a/storage/xtradb/os/os0thread.cc b/storage/xtradb/os/os0thread.cc
index aabdd06d76b..af826027efc 100644
--- a/storage/xtradb/os/os0thread.cc
+++ b/storage/xtradb/os/os0thread.cc
@@ -210,14 +210,42 @@ os_thread_create_func(
 #endif
 }
 
+/**
+Waits until the specified thread completes and joins it. The thread's exit
+value is discarded (NULL is passed to pthread_join).
+
+@param	thread	thread to join; unused on Windows, where this is a no-op */
+UNIV_INTERN
+void
+os_thread_join(
+	os_thread_t	thread)
+{
+	/* This function is currently only used to work around the glibc bug
+	described in http://bugs.mysql.com/bug.php?id=82886
+
+	On Windows, no workarounds are necessary, all threads
+	are "detached" upon thread exit (handle is closed), so we do
+	nothing.
+	*/
+#ifndef _WIN32
+	int ret	MY_ATTRIBUTE((unused)) = pthread_join(thread, NULL);
+
+	/* Joining a thread that has already quit (ESRCH) is allowed */
+	ut_ad(ret == 0 || ret == ESRCH);
+#endif
+}
+
 /*****************************************************************//**
 Exits the current thread. */
 UNIV_INTERN
 void
 os_thread_exit(
 /*===========*/
-	void*	exit_value)	/*!< in: exit value; in Windows this void*
+	void*	exit_value,	/*!< in: exit value; in Windows this void*
 				is cast as a DWORD */
+	bool	detach)		/*!< in: if true, the thread will be detached
+				right before exiting. If false, another thread
+				is responsible for joining this thread. */
 {
 #ifdef UNIV_DEBUG_THREAD_CREATION
 	fprintf(stderr, "Thread exits, id %lu\n",
@@ -233,7 +261,8 @@ os_thread_exit(
 #ifdef __WIN__
 	ExitThread((DWORD) exit_value);
 #else
-	pthread_detach(pthread_self());
+	if (detach)
+		pthread_detach(pthread_self());
 	pthread_exit(exit_value);
 #endif
 }
diff --git a/storage/xtradb/rem/rem0rec.cc b/storage/xtradb/rem/rem0rec.cc
index 80dc7557ec7..d1205608d47 100644
--- a/storage/xtradb/rem/rem0rec.cc
+++ b/storage/xtradb/rem/rem0rec.cc
@@ -323,7 +323,8 @@ rec_init_offsets_comp_ordinary(
 			stored in one byte for 0..127.  The length
 			will be encoded in two bytes when it is 128 or
 			more, or when the field is stored externally. */
-			if (UNIV_UNLIKELY(col->len > 255)
+			if (UNIV_UNLIKELY(col->len > 255 -
+			    prtype_get_compression_extra(col->prtype))
 			    || UNIV_UNLIKELY(col->mtype
 					     == DATA_BLOB)) {
 				if (len & 0x80) {
@@ -844,8 +845,12 @@ rec_get_converted_size_comp_prefix_low(
 			continue;
 		}
 
-		ut_ad(len <= col->len || col->mtype == DATA_BLOB
-			|| (col->len == 0 && col->mtype == DATA_VARCHAR));
+		ut_ad(len <= col->len || col->mtype == DATA_BLOB ||
+		  ((col->mtype == DATA_VARCHAR || col->mtype == DATA_BINARY
+		   || col->mtype == DATA_VARMYSQL)
+		   && (col->len == 0
+		       || len <= col->len +
+			  prtype_get_compression_extra(col->prtype))));
 
 		fixed_len = field->fixed_len;
 		if (temp && fixed_len
@@ -877,7 +882,9 @@ rec_get_converted_size_comp_prefix_low(
 			ut_ad(col->len >= 256 || col->mtype == DATA_BLOB);
 			extra_size += 2;
 		} else if (len < 128
-			   || (col->len < 256 && col->mtype != DATA_BLOB)) {
+			   || (col->len < 256 -
+			       prtype_get_compression_extra(col->prtype)
+			       && col->mtype != DATA_BLOB)) {
 			extra_size++;
 		} else {
 			/* For variable-length columns, we look up the
@@ -1272,12 +1279,16 @@ rec_convert_dtuple_to_rec_comp(
 			*lens-- = (byte) (len >> 8) | 0xc0;
 			*lens-- = (byte) len;
 		} else {
-			ut_ad(len <= dtype_get_len(type)
+			ut_ad(len <= dtype_get_len(type) +
+			      prtype_get_compression_extra(
+			        dtype_get_prtype(type))
 			      || dtype_get_mtype(type) == DATA_BLOB
 			      || !strcmp(index->name,
 					 FTS_INDEX_TABLE_IND_NAME));
 			if (len < 128
-			    || (dtype_get_len(type) < 256
+			    || (dtype_get_len(type) < 256 -
+			        prtype_get_compression_extra(
+				  dtype_get_prtype(type))
 				&& dtype_get_mtype(type) != DATA_BLOB)) {
 
 				*lens-- = (byte) len;
diff --git a/storage/xtradb/row/row0ftsort.cc b/storage/xtradb/row/row0ftsort.cc
index 032b0badcbd..9f182fc5e70 100644
--- a/storage/xtradb/row/row0ftsort.cc
+++ b/storage/xtradb/row/row0ftsort.cc
@@ -226,10 +226,7 @@ row_fts_psort_info_init(
 	common_info->opt_doc_id_size = opt_doc_id_size;
 	crypt_data = fil_space_get_crypt_data(new_table->space);
 
-	if ((crypt_data && crypt_data->encryption == FIL_SPACE_ENCRYPTION_ON) ||
-		(srv_encrypt_tables &&
-			crypt_data && crypt_data->encryption == FIL_SPACE_ENCRYPTION_DEFAULT)) {
-
+	if (crypt_data && crypt_data->should_encrypt()) {
 		common_info->crypt_data = crypt_data;
 		encrypted = true;
 	} else {
@@ -1023,7 +1020,7 @@ fts_parallel_merge(
 	CloseHandle(psort_info->thread_hdl);
 #endif /*__WIN__ */
 
-	os_thread_exit(NULL);
+	os_thread_exit(NULL, false);
 
 	OS_THREAD_DUMMY_RETURN;
 }
diff --git a/storage/xtradb/row/row0import.cc b/storage/xtradb/row/row0import.cc
index 4c7cb7d33b5..d45ce907304 100644
--- a/storage/xtradb/row/row0import.cc
+++ b/storage/xtradb/row/row0import.cc
@@ -1899,6 +1899,15 @@ PageConverter::update_index_page(
 		row_index_t*	index = find_index(id);
 
 		if (index == 0) {
+			ib_logf(IB_LOG_LEVEL_ERROR,
+				"Page for tablespace %lu is "
+				" index page with id %lu but that"
+				" index is not found from configuration file."
+				" Current index name %s and id %lu.",
+				m_space,
+				id,
+				m_index->m_name,
+				m_index->m_id);
 			m_index = 0;
 			return(DB_CORRUPTION);
 		}
diff --git a/storage/xtradb/row/row0log.cc b/storage/xtradb/row/row0log.cc
index 53e375023fb..a17ee405720 100644
--- a/storage/xtradb/row/row0log.cc
+++ b/storage/xtradb/row/row0log.cc
@@ -621,7 +621,7 @@ row_log_table_delete(
 		&old_pk_extra_size);
 	ut_ad(old_pk_extra_size < 0x100);
 
-	mrec_size = 4 + old_pk_size;
+	mrec_size = 6 + old_pk_size;
 
 	/* Log enough prefix of the BLOB unless both the
 	old and new table are in COMPACT or REDUNDANT format,
@@ -651,8 +651,8 @@ row_log_table_delete(
 		*b++ = static_cast<byte>(old_pk_extra_size);
 
 		/* Log the size of external prefix we saved */
-		mach_write_to_2(b, ext_size);
-		b += 2;
+		mach_write_to_4(b, ext_size);
+		b += 4;
 
 		rec_convert_dtuple_to_temp(
 			b + old_pk_extra_size, new_index,
@@ -2276,14 +2276,14 @@ row_log_table_apply_op(
 		break;
 
 	case ROW_T_DELETE:
-		/* 1 (extra_size) + 2 (ext_size) + at least 1 (payload) */
-		if (mrec + 4 >= mrec_end) {
+		/* 1 (extra_size) + 4 (ext_size) + at least 1 (payload) */
+		if (mrec + 6 >= mrec_end) {
 			return(NULL);
 		}
 
 		extra_size = *mrec++;
-		ext_size = mach_read_from_2(mrec);
-		mrec += 2;
+		ext_size = mach_read_from_4(mrec);
+		mrec += 4;
 		ut_ad(mrec < mrec_end);
 
 		/* We assume extra_size < 0x100 for the PRIMARY KEY prefix.
diff --git a/storage/xtradb/row/row0merge.cc b/storage/xtradb/row/row0merge.cc
index f46da173eaa..5daad1e0e4f 100644
--- a/storage/xtradb/row/row0merge.cc
+++ b/storage/xtradb/row/row0merge.cc
@@ -619,7 +619,12 @@ row_merge_buf_add(
 			dfield_set_len(field, len);
 		}
 
-		ut_ad(len <= col->len || col->mtype == DATA_BLOB);
+		ut_ad(len <= col->len || col->mtype == DATA_BLOB ||
+		  ((col->mtype == DATA_VARCHAR || col->mtype == DATA_BINARY
+		   || col->mtype == DATA_VARMYSQL)
+		   && (col->len == 0
+		       || len <= col->len +
+			  prtype_get_compression_extra(col->prtype))));
 
 		fixed_len = ifield->fixed_len;
 		if (fixed_len && !dict_table_is_comp(index->table)
@@ -648,7 +653,9 @@ row_merge_buf_add(
 		} else if (dfield_is_ext(field)) {
 			extra_size += 2;
 		} else if (len < 128
-			   || (col->len < 256 && col->mtype != DATA_BLOB)) {
+			   || (col->len < 256 -
+			       prtype_get_compression_extra(col->prtype)
+			       && col->mtype != DATA_BLOB)) {
 			extra_size++;
 		} else {
 			/* For variable-length columns, we look up the
@@ -2177,7 +2184,7 @@ wait_again:
 		/* Sync fts cache for other fts indexes to keep all
 		fts indexes consistent in sync_doc_id. */
 		err = fts_sync_table(const_cast<dict_table_t*>(new_table),
-				     false, true);
+				     false, true, false);
 
 		if (err == DB_SUCCESS) {
 			fts_update_next_doc_id(
@@ -3995,10 +4002,7 @@ row_merge_build_indexes(
 
 	/* If tablespace is encrypted, allocate additional buffer for
 	encryption/decryption. */
-	if ((crypt_data && crypt_data->encryption == FIL_SPACE_ENCRYPTION_ON) ||
-		(srv_encrypt_tables &&
-			crypt_data && crypt_data->encryption == FIL_SPACE_ENCRYPTION_DEFAULT)) {
-
+	if (crypt_data && crypt_data->should_encrypt()) {
 		crypt_block = static_cast<row_merge_block_t*>(
 			os_mem_alloc_large(&block_size));
 
@@ -4165,6 +4169,13 @@ wait_again:
 						" exited when creating FTS"
 						" index '%s'",
 						indexes[i]->name);
+				} else {
+					for (j = 0; j < FTS_NUM_AUX_INDEX;
+					     j++) {
+
+					    os_thread_join(merge_info[j]
+							   .thread_hdl);
+					}
 				}
 			} else {
 				/* This cannot report duplicates; an
diff --git a/storage/xtradb/row/row0mysql.cc b/storage/xtradb/row/row0mysql.cc
index ba2e0047fe9..0bdee1282f8 100644
--- a/storage/xtradb/row/row0mysql.cc
+++ b/storage/xtradb/row/row0mysql.cc
@@ -65,11 +65,54 @@ Created 9/17/2000 Heikki Tuuri
 #include "row0import.h"
 #include "m_string.h"
 #include "my_sys.h"
+#include "zlib.h"
 #include <algorithm>
 
 /** Provide optional 4.x backwards compatibility for 5.0 and above */
 UNIV_INTERN ibool	row_rollback_on_timeout	= FALSE;
 
+/**
+Z_NO_COMPRESSION = 0
+Z_BEST_SPEED = 1
+Z_BEST_COMPRESSION = 9
+Z_DEFAULT_COMPRESSION = -1
+Compression level to be used by zlib for compressed-blob columns.
+Settable by user.
+*/
+UNIV_INTERN uint	srv_compressed_columns_zip_level = DEFAULT_COMPRESSION_LEVEL;
+/**
+(Z_FILTERED | Z_HUFFMAN_ONLY | Z_RLE | Z_FIXED | Z_DEFAULT_STRATEGY)
+
+The strategy parameter is used to tune the compression algorithm. Use the
+value Z_DEFAULT_STRATEGY for normal data, Z_FILTERED for data produced by a
+filter (or predictor), Z_HUFFMAN_ONLY to force Huffman encoding only
+(no string match), or Z_RLE to limit match distances to one
+(run-length encoding). Filtered data consists mostly of small values with a
+somewhat random distribution. In this case, the compression algorithm is
+tuned to compress them better.
+The effect of Z_FILTERED is to force more Huffman coding and less string
+matching; it is somewhat intermediate between Z_DEFAULT_STRATEGY and
+Z_HUFFMAN_ONLY. Z_RLE is designed to be almost as fast as Z_HUFFMAN_ONLY,
+but give better compression for PNG image data. The strategy parameter only
+affects the compression ratio but not the correctness of the compressed
+output even if it is not set appropriately. Z_FIXED prevents the use of
+dynamic Huffman codes, allowing for a simpler decoder for special
+applications.
+*/
+const uint	srv_compressed_columns_zlib_strategy = Z_DEFAULT_STRATEGY;
+/** Compress the column if the data length exceeds this value. */
+UNIV_INTERN ulong	srv_compressed_columns_threshold = 96;
+/**
+Determines whether zlib needs to compute an adler32 value for the compressed
+data. This variable is similar to page_zip_zlib_wrap, but is only used by
+compressed blob columns.
+*/
+const bool	srv_compressed_columns_zlib_wrap = true;
+/**
+Determine if zlib will use custom memory allocation functions based on
+InnoDB memory heap routines (mem_heap_t*).
+*/
+const bool	srv_compressed_columns_zlib_use_heap = false;
 /** Chain node of the list of tables to drop in the background. */
 struct row_mysql_drop_t{
 	char*				table_name;	/*!< table name */
@@ -173,6 +216,17 @@ row_mysql_prebuilt_free_blob_heap(
 	prebuilt->blob_heap = NULL;
 }
 
+/** Frees the compress heap in prebuilt and resets the pointer to NULL. */
+UNIV_INTERN
+void
+row_mysql_prebuilt_free_compress_heap(
+	row_prebuilt_t*	prebuilt)	/*!< in/out: prebuilt struct of a
+					ha_innobase:: table handle */
+{
+	mem_heap_free(prebuilt->compress_heap);
+	prebuilt->compress_heap = NULL;	/* leave no dangling pointer behind */
+}
+
 /*******************************************************************//**
 Stores a >= 5.0.3 format true VARCHAR length to dest, in the MySQL row
 format.
@@ -229,6 +283,425 @@ row_mysql_read_true_varchar(
 	return(field + 1);
 }
 
+/**
+  Compressed BLOB header format:
+  ---------------------------------------------------------------
+  | reserved | wrap | algorithm | len-len | compressed | unused |
+  |      [1] |  [1] |       [5] |     [3] |        [1] |    [5] |
+  ---------------------------------------------------------------
+  | 0      0 | 1  1 | 2       6 | 7     9 | 10      10 | 11  15 |
+  ---------------------------------------------------------------
+  * 'reserved' bit is planned to be used in future versions of the BLOB
+  header. In this version it must always be
+  'default_zip_column_reserved_value' (0).
+  * 'wrap' identifies if compression algorithm calculated a checksum
+  (adler32 in case of zlib) and appended it to the compressed data.
+  * 'algorithm' identifies which algorithm was used to compress this BLOB.
+  Currently, the only value 'default_zip_column_algorithm_value' (0) is
+  supported.
+  * 'len-len' field stores the size in bytes, minus one, of the column length
+  data portion that follows this header (see below).
+  * If 'compressed' bit is set to 1, then this header is immediately followed
+  by 1..8 bytes (depending on the value of 'len-len' bitfield) which
+  determine original (uncompressed) block size. These 'len-len' bytes are
+  followed by compressed representation of the original data.
+  * If 'compressed' bit is set to 0, every other bitfield ('wrap',
+  'algorithm' and 'len-len') must be ignored. In this case the header is
+  immediately followed by uncompressed (original) data.
+*/
+
+/**
+  Currently the only supported value for the 'reserved' field is
+  false (0).
+*/
+static const bool default_zip_column_reserved_value = false;
+
+/**
+  Currently the only supported value for the 'algorithm' field is 0, which
+  means 'zlib'.
+*/
+static const uint default_zip_column_algorithm_value = 0;
+
+static const size_t zip_column_prefix_max_length =
+	ZIP_COLUMN_HEADER_LENGTH + 8;	/* header + up to 8 length bytes */
+static const size_t zip_column_header_length = ZIP_COLUMN_HEADER_LENGTH;
+
+/* 'reserved', bit 0 */
+static const uint zip_column_reserved = 0;
+/* 0000 0000 0000 0001 */
+static const uint zip_column_reserved_mask = 0x0001;
+
+/* 'wrap', bit 1 */
+static const uint zip_column_wrap = 1;
+/* 0000 0000 0000 0010 */
+static const uint zip_column_wrap_mask = 0x0002;
+
+/* 'algorithm', bit 2,3,4,5,6 */
+static const uint zip_column_algorithm = 2;
+/* 0000 0000 0111 1100 */
+static const uint zip_column_algorithm_mask = 0x007C;
+
+/* 'len-len', bit 7,8,9 */
+static const uint zip_column_data_length = 7;
+/* 0000 0011 1000 0000 */
+static const uint zip_column_data_length_mask = 0x0380;
+
+/* 'compressed', bit 10 */
+static const uint zip_column_compressed = 10;
+/* 0000 0100 0000 0000 */
+static const uint zip_column_compressed_mask = 0x0400;
+
+/** Packs the given fields into the 2-byte compressed column header at data. */
+static void
+column_set_compress_header(
+	byte*	data,		/*!< out: header is written here */
+	bool	compressed,	/*!< in: 'compressed' bit */
+	ulint	lenlen,		/*!< in: 'len-len' field, stored as given */
+	uint	alg,		/*!< in: 'algorithm' field */
+	bool	wrap,		/*!< in: 'wrap' bit */
+	bool	reserved)	/*!< in: 'reserved' bit */
+{
+	ulint header = 0;	/* fields are shifted in unmasked */
+	header |= (compressed << zip_column_compressed);
+	header |= (lenlen << zip_column_data_length);
+	header |= (alg << zip_column_algorithm);
+	header |= (wrap << zip_column_wrap);
+	header |= (reserved << zip_column_reserved);
+	mach_write_to_2(data, header);
+}
+
+/** Unpacks the 2-byte compressed column header at data into its fields. */
+static void
+column_get_compress_header(
+	const byte*	data,		/*!< in: start of the header */
+	bool*		compressed,	/*!< out: 'compressed' bit */
+	ulint*		lenlen,		/*!< out: raw 'len-len' field */
+	uint*		alg,		/*!< out: 'algorithm' field */
+	bool*		wrap,		/*!< out: 'wrap' bit */
+	bool*		reserved	/*!< out: 'reserved' bit */
+)
+{
+	ulint header = mach_read_from_2(data);
+	*compressed = ((header & zip_column_compressed_mask) >>
+		zip_column_compressed);
+	*lenlen = ((header & zip_column_data_length_mask) >>
+		zip_column_data_length);
+	*alg = ((header & zip_column_algorithm_mask) >>
+		zip_column_algorithm);
+	*wrap = ((header & zip_column_wrap_mask) >>
+		zip_column_wrap);
+	*reserved = ((header & zip_column_reserved_mask) >>
+		zip_column_reserved);
+}
+
+/** zlib alloc callback: returns items * size bytes from the opaque heap. */
+static
+void*
+column_zip_zalloc(
+	void*	opaque,	/*!< in/out: memory heap */
+	uInt	items,	/*!< in: number of items to allocate */
+	uInt	size)	/*!< in: size of an item in bytes */
+{
+	return(mem_heap_zalloc(static_cast<mem_heap_t*>(opaque),
+		items * size));
+}
+
+/** zlib free callback: a no-op; both arguments are ignored. */
+static
+void
+column_zip_free(
+	 void*	opaque MY_ATTRIBUTE((unused)),	/*!< in: memory heap */
+	 void*	address MY_ATTRIBUTE((unused)))	/*!< in: object to free */
+{
+}
+
+/** Set the zlib stream's allocator callbacks: heap-based ones, or none. */
+UNIV_INTERN
+void
+column_zip_set_alloc(
+	void*		stream,	/*!< in/out: zlib stream */
+	mem_heap_t*	heap)	/*!< in: heap, used only in heap mode */
+{
+	z_stream* strm = static_cast<z_stream*>(stream);
+
+	if (srv_compressed_columns_zlib_use_heap) {
+		strm->zalloc = column_zip_zalloc;
+		strm->zfree = column_zip_free;
+		strm->opaque = heap;
+	} else {	/* null callbacks: zlib uses its default allocator */
+		strm->zalloc = (alloc_func)0;
+		strm->zfree = (free_func)0;
+		strm->opaque = (voidpf)0;
+	}
+}
+
+/** Add the column header to a value, compressing it with zlib when that helps
+@return buffer allocated from prebuilt->compress_heap holding the result */
+byte*
+row_compress_column(
+	const byte*	data,	/*!< in: data in mysql(uncompressed)
+				format */
+	ulint		*len,	/*!< in: data length; out: length of the
+				returned data, header included */
+	ulint		lenlen,	/*!< in: bytes used to store the length of
+				data; 1..4 when compressing */
+	const byte*	dict_data,
+				/*!< in: optional dictionary data used for
+				compression */
+	ulint		dict_data_len,
+				/*!< in: optional dictionary data length */
+	row_prebuilt_t*	prebuilt)
+				/*!< in/out: only prebuilt->compress_heap is
+				used; it is created here if NULL */
+{
+	int err = 0;
+	ulint comp_len = *len;
+	ulint buf_len = *len + zip_column_prefix_max_length;
+	byte* buf;
+	byte* ptr;
+	z_stream c_stream;
+	bool wrap = srv_compressed_columns_zlib_wrap;
+
+	int window_bits = wrap ? MAX_WBITS : -MAX_WBITS;
+
+	if (!prebuilt->compress_heap) {
+		prebuilt->compress_heap =
+			mem_heap_create(max(UNIV_PAGE_SIZE, buf_len));
+	}
+
+	buf = static_cast<byte*>(mem_heap_zalloc(
+			prebuilt->compress_heap,buf_len));
+
+	if (*len < srv_compressed_columns_threshold ||
+		srv_compressed_columns_zip_level == Z_NO_COMPRESSION)
+		goto do_not_compress;
+
+	ptr = buf + zip_column_header_length + lenlen;
+
+	/* set up the deflate stream: input, output and allocator */
+	c_stream.next_in = const_cast<Bytef*>(data);
+	c_stream.avail_in = *len;
+	c_stream.next_out = ptr;
+	c_stream.avail_out = comp_len;
+
+	column_zip_set_alloc(&c_stream, prebuilt->compress_heap);
+
+	err = deflateInit2(&c_stream, srv_compressed_columns_zip_level,
+		Z_DEFLATED, window_bits, MAX_MEM_LEVEL,
+		srv_compressed_columns_zlib_strategy);
+	ut_a(err == Z_OK);
+
+	if (dict_data != 0 && dict_data_len != 0) {
+		err = deflateSetDictionary(&c_stream, dict_data,
+			dict_data_len);
+		ut_a(err == Z_OK);
+	}
+
+	err = deflate(&c_stream, Z_FINISH);
+	if (err != Z_STREAM_END) {
+		deflateEnd(&c_stream);
+		if (err == Z_OK)
+			err = Z_BUF_ERROR;
+	} else {
+		comp_len = c_stream.total_out;
+		err = deflateEnd(&c_stream);
+	}
+
+	switch (err) {
+	case Z_OK:
+		break;
+	case Z_BUF_ERROR:
+		/* output did not fit in *len bytes: store uncompressed */
+		break;
+	default:
+		ib_logf(IB_LOG_LEVEL_ERROR,
+			"failed to compress the column, error: %d\n", err);
+	}
+
+	/* use the compressed form only if, with the header and length
+	bytes added, it is shorter than the original data */
+	if (err == Z_OK &&
+		*len > (comp_len + zip_column_header_length + lenlen)) {
+		column_set_compress_header(buf, true, lenlen - 1,
+			default_zip_column_algorithm_value, wrap,
+			default_zip_column_reserved_value);
+		ptr = buf + zip_column_header_length;
+		/* store the uncompressed data length in lenlen bytes */
+		switch (lenlen) {
+		case 1:
+			mach_write_to_1(ptr, *len);
+			break;
+		case 2:
+			mach_write_to_2(ptr, *len);
+			break;
+		case 3:
+			mach_write_to_3(ptr, *len);
+			break;
+		case 4:
+			mach_write_to_4(ptr, *len);
+			break;
+		default:
+			ut_error;
+		}
+
+		*len = comp_len + zip_column_header_length + lenlen;
+		return buf;
+	}
+
+do_not_compress:	/* header with 'compressed' = 0, then the raw data */
+	ptr = buf;
+	column_set_compress_header(ptr, false, 0,
+		default_zip_column_algorithm_value, false,
+		default_zip_column_reserved_value);
+	ptr += zip_column_header_length;
+	memcpy(ptr, data, *len);
+	*len += zip_column_header_length;
+	return buf;
+}
+
+/** Strip the column header and, if the value is compressed, inflate it
+@return the payload: inside data if stored raw, else in compress_heap */
+const byte*
+row_decompress_column(
+	const byte*	data,	/*!< in: data in innodb(compressed) format */
+	ulint		*len,	/*!< in: data length, header included; out:
+				length of the returned data */
+	const byte*	dict_data,
+				/*!< in: optional dictionary data used for
+				decompression */
+	ulint		dict_data_len,
+				/*!< in: optional dictionary data length */
+	row_prebuilt_t*	prebuilt)
+				/*!< in/out: only prebuilt->compress_heap is
+				used; it is created here if NULL */
+{
+	ulint buf_len = 0;
+	byte* buf;
+	int err = 0;
+	int window_bits = 0;
+	z_stream d_stream;
+	bool is_compressed = false;
+	bool wrap = false;
+	bool reserved = false;
+	ulint lenlen = 0;
+	uint alg = 0;
+
+	ut_ad(*len != ULINT_UNDEFINED);
+	ut_ad(*len >= zip_column_header_length);
+
+	column_get_compress_header(data, &is_compressed, &lenlen, &alg,
+		&wrap, &reserved);
+
+	if (reserved != default_zip_column_reserved_value) {
+		ib_logf(IB_LOG_LEVEL_FATAL,
+			"unsupported compressed BLOB header format\n");
+	}
+
+	if (alg != default_zip_column_algorithm_value) {
+		ib_logf(IB_LOG_LEVEL_FATAL,
+			"unsupported 'algorithm' value in the"
+			" compressed BLOB header\n");
+	}
+
+	ut_a(lenlen < 4);	/* field holds (bytes - 1), so 0..3 */
+
+	data += zip_column_header_length;
+	if (!is_compressed) { /* column not compressed */
+		*len -= zip_column_header_length;
+		return data;
+	}
+
+	lenlen++;	/* stored value is (bytes - 1) */
+
+	ulint comp_len = *len - zip_column_header_length - lenlen;
+
+	ulint uncomp_len = 0;
+	switch (lenlen) {
+	case 1:
+		uncomp_len = mach_read_from_1(data);
+		break;
+	case 2:
+		uncomp_len = mach_read_from_2(data);
+		break;
+	case 3:
+		uncomp_len = mach_read_from_3(data);
+		break;
+	case 4:
+		uncomp_len = mach_read_from_4(data);
+		break;
+	default:
+		ut_error;
+	}
+
+	data += lenlen;
+
+	/* data is compressed: inflate it into a compress_heap buffer */
+	if (!prebuilt->compress_heap) {
+		prebuilt->compress_heap =
+			mem_heap_create(max(UNIV_PAGE_SIZE, uncomp_len));
+	}
+
+	buf_len = uncomp_len;
+	buf = static_cast<byte*>(mem_heap_zalloc(
+				 prebuilt->compress_heap, buf_len));
+
+	/* set up the inflate stream: input, output and allocator */
+	d_stream.next_in = const_cast<Bytef*>(data);
+	d_stream.avail_in = comp_len;
+	d_stream.next_out = buf;
+	d_stream.avail_out = buf_len;
+
+	column_zip_set_alloc(&d_stream, prebuilt->compress_heap);
+
+	window_bits = wrap ? MAX_WBITS : -MAX_WBITS;
+	err = inflateInit2(&d_stream, window_bits);
+	ut_a(err == Z_OK);
+
+	err = inflate(&d_stream, Z_FINISH);
+	if (err == Z_NEED_DICT) {
+		ut_a(dict_data != 0 && dict_data_len != 0);
+		err = inflateSetDictionary(&d_stream, dict_data,
+			dict_data_len);
+		ut_a(err == Z_OK);
+		err = inflate(&d_stream, Z_FINISH);
+	}
+
+	if (err != Z_STREAM_END) {
+		inflateEnd(&d_stream);
+		if (err == Z_BUF_ERROR && d_stream.avail_in == 0)
+			err = Z_DATA_ERROR;
+	} else {
+		buf_len = d_stream.total_out;
+		err = inflateEnd(&d_stream);
+	}
+
+	switch (err) {
+	case Z_OK:
+		break;
+	case Z_BUF_ERROR:
+		ib_logf(IB_LOG_LEVEL_FATAL,
+			"zlib buf error, this shouldn't happen\n");
+		break;
+	default:
+		ib_logf(IB_LOG_LEVEL_FATAL,
+			"failed to decompress column, error: %d\n", err);
+	}
+
+	if (err == Z_OK) {
+		if (buf_len != uncomp_len) {
+			ib_logf(IB_LOG_LEVEL_FATAL,
+				"failed to decompress blob column, may"
+				" be corrupted\n");
+		}
+		*len = buf_len;
+		return buf;
+	}
+
+	*len -= (zip_column_header_length + lenlen);	/* err != Z_OK */
+	return data;
+}
+
+
 /*******************************************************************//**
 Stores a reference to a BLOB in the MySQL format. */
 UNIV_INTERN
@@ -242,10 +715,21 @@ row_mysql_store_blob_ref(
 				to 4 bytes */
 	const void*	data,	/*!< in: BLOB data; if the value to store
 				is SQL NULL this should be NULL pointer */
-	ulint		len)	/*!< in: BLOB length; if the value to store
+	ulint		len,	/*!< in: BLOB length; if the value to store
 				is SQL NULL this should be 0; remember
 				also to set the NULL bit in the MySQL record
 				header! */
+	bool		need_decompression,
+				/*!< in: if the data needs to be decompressed */
+	const byte*	dict_data,
+				/*!< in: optional compression dictionary
+				data */
+	ulint		dict_data_len,
+				/*!< in: optional compression dictionary data
+				length */
+	row_prebuilt_t*	prebuilt)
+				/*!< in: use prebuilt->compress_heap only
+				here */
 {
 	/* MySQL might assume the field is set to zero except the length and
 	the pointer fields */
@@ -257,13 +741,28 @@ row_mysql_store_blob_ref(
 	In 32-bit architectures we only use the first 4 bytes of the pointer
 	slot. */
 
-	ut_a(col_len - 8 > 1 || len < 256);
-	ut_a(col_len - 8 > 2 || len < 256 * 256);
-	ut_a(col_len - 8 > 3 || len < 256 * 256 * 256);
+	ut_a(col_len - 8 > 1 ||
+		len < 256 +
+		(need_decompression ? ZIP_COLUMN_HEADER_LENGTH : 0));
+	ut_a(col_len - 8 > 2 ||
+		len < 256 * 256 +
+		(need_decompression ? ZIP_COLUMN_HEADER_LENGTH : 0));
+	ut_a(col_len - 8 > 3 ||
+		len < 256 * 256 * 256 +
+		(need_decompression ? ZIP_COLUMN_HEADER_LENGTH : 0));
 
-	mach_write_to_n_little_endian(dest, col_len - 8, len);
+	const byte *ptr = NULL;
+
+	if (need_decompression)
+		ptr = row_decompress_column((const byte*)data, &len,
+			dict_data, dict_data_len, prebuilt);
 
-	memcpy(dest + col_len - 8, &data, sizeof data);
+	if (ptr)
+		memcpy(dest + col_len - 8, &ptr, sizeof ptr);
+	else
+		memcpy(dest + col_len - 8, &data, sizeof data);
+
+	mach_write_to_n_little_endian(dest, col_len - 8, len);
 }
 
 /*******************************************************************//**
@@ -276,15 +775,32 @@ row_mysql_read_blob_ref(
 	ulint*		len,		/*!< out: BLOB length */
 	const byte*	ref,		/*!< in: BLOB reference in the
 					MySQL format */
-	ulint		col_len)	/*!< in: BLOB reference length
+	ulint		col_len,	/*!< in: BLOB reference length
 					(not BLOB length) */
+	bool		need_compression,
+					/*!< in: if the data need to be
+					compressed*/
+	const byte*	dict_data,	/*!< in: optional compression
+					dictionary data */
+	ulint		dict_data_len,	/*!< in: optional compression
+					dictionary data length */
+	row_prebuilt_t*	prebuilt)	/*!< in: use prebuilt->compress_heap
+					only here */
 {
-	byte*	data;
+	byte*	data = NULL;
+	byte*	ptr = NULL;
 
 	*len = mach_read_from_n_little_endian(ref, col_len - 8);
 
 	memcpy(&data, ref + col_len - 8, sizeof data);
 
+	if (need_compression) {
+		ptr = row_compress_column(data, len, col_len - 8, dict_data,
+			dict_data_len, prebuilt);
+		if (ptr)
+			data = ptr;
+	}
+
 	return(data);
 }
 
@@ -367,7 +883,16 @@ row_mysql_store_col_in_innobase_format(
 					necessarily the length of the actual
 					payload data; if the column is a true
 					VARCHAR then this is irrelevant */
-	ulint		comp)		/*!< in: nonzero=compact format */
+	ulint		comp,		/*!< in: nonzero=compact format */
+	bool		need_compression,
+					/*!< in: if the data need to be
+					compressed*/
+	const byte*	dict_data,	/*!< in: optional compression
+					dictionary data */
+	ulint		dict_data_len,	/*!< in: optional compression
+					dictionary data length */
+	row_prebuilt_t*	prebuilt)	/*!< in: use prebuilt->compress_heap
+					only here */
 {
 	const byte*	ptr	= mysql_data;
 	const dtype_t*	dtype;
@@ -420,8 +945,14 @@ row_mysql_store_col_in_innobase_format(
 				lenlen = 2;
 			}
 
-			ptr = row_mysql_read_true_varchar(&col_len, mysql_data,
-							  lenlen);
+			const byte* tmp_ptr = row_mysql_read_true_varchar(
+				&col_len, mysql_data, lenlen);
+			if (need_compression)
+				ptr = row_compress_column(tmp_ptr, &col_len,
+					lenlen, dict_data, dict_data_len,
+					prebuilt);
+			else
+				ptr = tmp_ptr;
 		} else {
 			/* Remove trailing spaces from old style VARCHAR
 			columns. */
@@ -503,7 +1034,9 @@ row_mysql_store_col_in_innobase_format(
 		}
 	} else if (type == DATA_BLOB && row_format_col) {
 
-		ptr = row_mysql_read_blob_ref(&col_len, mysql_data, col_len);
+		ptr = row_mysql_read_blob_ref(&col_len, mysql_data, col_len,
+			need_compression, dict_data, dict_data_len,
+			prebuilt);
 	}
 
 	dfield_set_data(dfield, ptr, col_len);
@@ -561,7 +1094,11 @@ row_mysql_convert_row_to_innobase(
 			TRUE, /* MySQL row format data */
 			mysql_rec + templ->mysql_col_offset,
 			templ->mysql_col_len,
-			dict_table_is_comp(prebuilt->table));
+			dict_table_is_comp(prebuilt->table),
+			templ->compressed,
+			reinterpret_cast<const byte*>(
+				templ->zip_dict_data.str),
+			templ->zip_dict_data.length, prebuilt);
 next_column:
 		;
 	}
@@ -909,6 +1446,10 @@ row_prebuilt_free(
 		mem_heap_free(prebuilt->blob_heap);
 	}
 
+	if (prebuilt->compress_heap) {
+		mem_heap_free(prebuilt->compress_heap);
+	}
+
 	if (prebuilt->old_vers_heap) {
 		mem_heap_free(prebuilt->old_vers_heap);
 	}
@@ -1344,6 +1885,9 @@ row_insert_for_mysql(
 		return(DB_READ_ONLY);
 	}
 
+	if (UNIV_LIKELY_NULL(prebuilt->compress_heap))
+		mem_heap_empty(prebuilt->compress_heap);
+
 	trx->op_info = "inserting";
 
 	row_mysql_delay_if_needed();
@@ -2748,6 +3292,10 @@ loop:
 		return(n_tables + n_tables_dropped);
 	}
 
+	DBUG_EXECUTE_IF("row_drop_tables_in_background_sleep",
+		os_thread_sleep(5000000);
+	);
+
 	table = dict_table_open_on_name(drop->table_name, FALSE, FALSE,
 					DICT_ERR_IGNORE_NONE);
 
@@ -2758,6 +3306,16 @@ loop:
 		goto already_dropped;
 	}
 
+	if (!table->to_be_dropped) {
+		/* There is a scenario: the old table is dropped
+		just after it's added into drop list, and new
+		table with the same name is created, then we try
+		to drop the new table in background. */
+		dict_table_close(table, FALSE, FALSE);
+
+		goto already_dropped;
+	}
+
 	ut_a(!table->can_be_evicted);
 
 	dict_table_close(table, FALSE, FALSE);
@@ -2888,6 +3446,12 @@ row_mysql_table_id_reassign(
 	pars_info_add_ull_literal(info, "old_id", table->id);
 	pars_info_add_ull_literal(info, "new_id", *new_id);
 
+	/* As micro-SQL does not support int4 == int8 comparisons,
+	old and new IDs are added again under different names as
+	int4 values*/
+	pars_info_add_int4_literal(info, "old_id_narrow", table->id);
+	pars_info_add_int4_literal(info, "new_id_narrow", *new_id);
+
 	err = que_eval_sql(
 		info,
 		"PROCEDURE RENUMBER_TABLE_PROC () IS\n"
@@ -2898,6 +3462,8 @@ row_mysql_table_id_reassign(
 		" WHERE TABLE_ID = :old_id;\n"
 		"UPDATE SYS_INDEXES SET TABLE_ID = :new_id\n"
 		" WHERE TABLE_ID = :old_id;\n"
+		"UPDATE SYS_ZIP_DICT_COLS SET TABLE_ID = :new_id_narrow\n"
+		" WHERE TABLE_ID = :old_id_narrow;\n"
 		"END;\n", FALSE, trx);
 
 	return(err);
@@ -3293,7 +3859,7 @@ fil_wait_crypt_bg_threads(
 	uint last = start;
 
 	if (table->space != 0) {
-		fil_space_crypt_mark_space_closing(table->space);
+		fil_space_crypt_mark_space_closing(table->space, table->crypt_data);
 	}
 
 	while (table->n_ref_count > 0) {
@@ -3713,6 +4279,12 @@ next_rec:
 	pars_info_add_ull_literal(info, "old_id", table->id);
 	pars_info_add_ull_literal(info, "new_id", new_id);
 
+	/* As micro-SQL does not support int4 == int8 comparisons,
+	old and new IDs are added again under different names as
+	int4 values*/
+	pars_info_add_int4_literal(info, "old_id_narrow", table->id);
+	pars_info_add_int4_literal(info, "new_id_narrow", new_id);
+
 	err = que_eval_sql(info,
 			   "PROCEDURE RENUMBER_TABLE_ID_PROC () IS\n"
 			   "BEGIN\n"
@@ -3724,6 +4296,9 @@ next_rec:
 			   "UPDATE SYS_INDEXES"
 			   " SET TABLE_ID = :new_id, SPACE = :new_space\n"
 			   " WHERE TABLE_ID = :old_id;\n"
+			   "UPDATE SYS_ZIP_DICT_COLS\n"
+			   " SET TABLE_ID = :new_id_narrow\n"
+			   " WHERE TABLE_ID = :old_id_narrow;\n"
 			   "END;\n"
 			   , FALSE, trx);
 
@@ -4089,6 +4664,13 @@ row_drop_table_for_mysql(
 		}
 	}
 
+
+	DBUG_EXECUTE_IF("row_drop_table_add_to_background",
+		row_add_table_to_background_drop_list(table->name);
+		err = DB_SUCCESS;
+		goto funct_exit;
+	);
+
 	/* TODO: could we replace the counter n_foreign_key_checks_running
 	with lock checks on the table? Acquire here an exclusive lock on the
 	table, and rewrite lock0lock.cc and the lock wait in srv0srv.cc so that
@@ -4225,6 +4807,12 @@ row_drop_table_for_mysql(
 		rw_lock_x_unlock(dict_index_get_lock(index));
 	}
 
+	/* If the table does not yet have crypt_data, try to read it
+	to make freeing the table easier. */
+	if (!table->crypt_data) {
+		table->crypt_data = fil_space_get_crypt_data(table->space);
+	}
+
 	/* We use the private SQL parser of Innobase to generate the
 	query graphs needed in deleting the dictionary data from system
 	tables in Innobase. Deleting a row from SYS_INDEXES table also
@@ -4362,6 +4950,19 @@ row_drop_table_for_mysql(
 			filepath = fil_make_ibd_name(tablename, false);
 		}
 
+		/* Remove all compression dictionary references for the
+		table */
+		err = dict_create_remove_zip_dict_references_for_table(
+			table->id, trx);
+		if (err != DB_SUCCESS) {
+			ib_logf(IB_LOG_LEVEL_ERROR, "Error: (%s) not "
+				"able to remove compression dictionary "
+				"references for table %s", ut_strerr(err),
+				tablename);
+
+			goto funct_exit;
+		}
+
 		if (dict_table_has_fts_index(table)
 		    || DICT_TF2_FLAG_IS_SET(table, DICT_TF2_FTS_HAS_DOC_ID)) {
 			ut_ad(table->n_ref_count == 0);
@@ -4709,6 +5310,19 @@ loop:
 	row_mysql_lock_data_dictionary(trx);
 
 	while ((table_name = dict_get_first_table_name_in_db(name))) {
+		/* Drop the parent table if this is an fts aux table, to
+		avoid accessing dropped fts aux tables in information
+		schema when the parent table still exists.
+		Note: dropping the parent table drops its fts aux tables. */
+		char*	parent_table_name;
+		parent_table_name = fts_get_parent_table_name(
+				table_name, strlen(table_name));
+
+		if (parent_table_name != NULL) {
+			mem_free(table_name);
+			table_name = parent_table_name;
+		}
+
 		ut_a(memcmp(table_name, name, namelen) == 0);
 
 		table = dict_table_open_on_name(
diff --git a/storage/xtradb/row/row0sel.cc b/storage/xtradb/row/row0sel.cc
index 9732a5f9400..9bdb8d1bb98 100644
--- a/storage/xtradb/row/row0sel.cc
+++ b/storage/xtradb/row/row0sel.cc
@@ -2458,9 +2458,11 @@ row_sel_convert_mysql_key_to_innobase(
 		if (UNIV_LIKELY(!is_null)) {
 			buf = row_mysql_store_col_in_innobase_format(
 					dfield, buf,
-					FALSE, /* MySQL key value format col */
+					/* MySQL key value format col */
+					FALSE,
 					key_ptr + data_offset, data_len,
-					dict_table_is_comp(index->table));
+					dict_table_is_comp(index->table),
+					false, 0, 0 ,0);
 			ut_a(buf <= original_buf + buf_len);
 		}
 
@@ -2553,12 +2555,16 @@ row_sel_store_row_id_to_prebuilt(
 
 #ifdef UNIV_DEBUG
 /** Convert a non-SQL-NULL field from Innobase format to MySQL format. */
-# define row_sel_field_store_in_mysql_format(dest,templ,idx,field,src,len) \
-	row_sel_field_store_in_mysql_format_func(dest,templ,idx,field,src,len)
+# define row_sel_field_store_in_mysql_format( \
+	dest,templ,idx,field,src,len,prebuilt) \
+	row_sel_field_store_in_mysql_format_func \
+	(dest,templ,idx,field,src,len, prebuilt)
 #else /* UNIV_DEBUG */
 /** Convert a non-SQL-NULL field from Innobase format to MySQL format. */
-# define row_sel_field_store_in_mysql_format(dest,templ,idx,field,src,len) \
-	row_sel_field_store_in_mysql_format_func(dest,templ,src,len)
+# define row_sel_field_store_in_mysql_format( \
+	dest,templ,idx,field,src,len,prebuilt) \
+	row_sel_field_store_in_mysql_format_func \
+	(dest,templ,src,len, prebuilt)
 #endif /* UNIV_DEBUG */
 
 /**************************************************************//**
@@ -2588,7 +2594,10 @@ row_sel_field_store_in_mysql_format_func(
 				templ->icp_rec_field_no */
 #endif /* UNIV_DEBUG */
 	const byte*	data,	/*!< in: data to store */
-	ulint		len)	/*!< in: length of the data */
+	ulint		len,	/*!< in: length of the data */
+	row_prebuilt_t* prebuilt)
+				/*!< in: use prebuilt->compress_heap
+				only here */
 {
 	byte*			ptr;
 #ifdef UNIV_DEBUG
@@ -2632,6 +2641,15 @@ row_sel_field_store_in_mysql_format_func(
 		field_end = dest + templ->mysql_col_len;
 
 		if (templ->mysql_type == DATA_MYSQL_TRUE_VARCHAR) {
+			/* If this is a compressed column,
+			decompress it first */
+			if (templ->compressed)
+				data = row_decompress_column(data, &len,
+					reinterpret_cast<const byte*>(
+						templ->zip_dict_data.str),
+					templ->zip_dict_data.length,
+					prebuilt);
+
 			/* This is a >= 5.0.3 type true VARCHAR. Store the
 			length of the data to the first byte or the first
 			two bytes of dest. */
@@ -2682,7 +2700,11 @@ row_sel_field_store_in_mysql_format_func(
 		already copied to the buffer in row_sel_store_mysql_rec */
 
 		row_mysql_store_blob_ref(dest, templ->mysql_col_len, data,
-					 len);
+					len, templ->compressed,
+					reinterpret_cast<const byte*>(
+						templ->zip_dict_data.str),
+					templ->zip_dict_data.length,
+					prebuilt);
 		break;
 
 	case DATA_MYSQL:
@@ -2835,7 +2857,7 @@ row_sel_store_mysql_field_func(
 
 		row_sel_field_store_in_mysql_format(
 			mysql_rec + templ->mysql_col_offset,
-			templ, index, field_no, data, len);
+			templ, index, field_no, data, len, prebuilt);
 
 		if (heap != prebuilt->blob_heap) {
 			mem_heap_free(heap);
@@ -2885,7 +2907,7 @@ row_sel_store_mysql_field_func(
 
 		row_sel_field_store_in_mysql_format(
 			mysql_rec + templ->mysql_col_offset,
-			templ, index, field_no, data, len);
+			templ, index, field_no, data, len, prebuilt);
 	}
 
 	ut_ad(len != UNIV_SQL_NULL);
@@ -2933,6 +2955,9 @@ row_sel_store_mysql_rec(
 		prebuilt->blob_heap = NULL;
 	}
 
+	if (UNIV_LIKELY_NULL(prebuilt->compress_heap))
+		mem_heap_empty(prebuilt->compress_heap);
+
 	for (i = 0; i < prebuilt->n_template; i++) {
 		const mysql_row_templ_t*templ = &prebuilt->mysql_template[i];
 		const ulint		field_no
diff --git a/storage/xtradb/srv/srv0mon.cc b/storage/xtradb/srv/srv0mon.cc
index e72868c6450..7c2e549e188 100644
--- a/storage/xtradb/srv/srv0mon.cc
+++ b/storage/xtradb/srv/srv0mon.cc
@@ -1,6 +1,6 @@
 /*****************************************************************************
 
-Copyright (c) 2010, 2014, Oracle and/or its affiliates. All Rights Reserved.
+Copyright (c) 2010, 2016, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2012, Facebook Inc.
 Copyright (c) 2013, 2016, MariaDB Corporation.
 
@@ -309,6 +309,12 @@ static monitor_info_t	innodb_counter_info[] =
 	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
 	 MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES_READ},
 
+	{"buffer_pages0_read", "buffer",
+	 "Number of page 0 read (innodb_pages0_read)",
+	 static_cast<monitor_type_t>(
+	 MONITOR_EXISTING | MONITOR_DEFAULT_ON),
+	 MONITOR_DEFAULT_START, MONITOR_OVLD_PAGES0_READ},
+
 	{"buffer_index_sec_rec_cluster_reads", "buffer",
 	 "Number of secondary record reads triggered cluster read",
 	 static_cast<monitor_type_t>(
@@ -1493,7 +1499,10 @@ srv_mon_set_module_control(
 				module */
 				set_current_module = FALSE;
 			} else if (module_id == MONITOR_ALL_COUNTER) {
-				continue;
+				if (!(innodb_counter_info[ix].monitor_type
+				      & MONITOR_GROUP_MODULE)) {
+					continue;
+				}
 			} else {
 				/* Hitting the next module, stop */
 				break;
@@ -1715,6 +1724,11 @@ srv_mon_process_existing_counter(
 		value = stat.n_pages_read;
 		break;
 
+	/* innodb_pages0_read */
+	case MONITOR_OVLD_PAGES0_READ:
+		value = srv_stats.page0_read;
+		break;
+
 	/* Number of times secondary index lookup triggered cluster lookup */
 	case MONITOR_OVLD_INDEX_SEC_REC_CLUSTER_READS:
 		value = srv_stats.n_sec_rec_cluster_reads;
diff --git a/storage/xtradb/srv/srv0srv.cc b/storage/xtradb/srv/srv0srv.cc
index 50565de69b0..426a1c57e7c 100644
--- a/storage/xtradb/srv/srv0srv.cc
+++ b/storage/xtradb/srv/srv0srv.cc
@@ -3,7 +3,7 @@
 Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
 Copyright (c) 2008, 2009 Google Inc.
 Copyright (c) 2009, Percona Inc.
-Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved.
+Copyright (c) 2013, 2016, MariaDB Corporation.
 
 Portions of this file contain modifications contributed and copyrighted by
 Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -219,6 +219,9 @@ UNIV_INTERN char**	srv_data_file_names = NULL;
 /* size in database pages */
 UNIV_INTERN ulint*	srv_data_file_sizes = NULL;
 
+/** Whether the redo log tracking is currently enabled. Note that it is
+possible for the log tracker thread to be running and the tracking to be
+disabled */
 UNIV_INTERN my_bool	srv_track_changed_pages = FALSE;
 
 UNIV_INTERN ulonglong	srv_max_bitmap_file_size = 100 * 1024 * 1024;
@@ -843,6 +846,9 @@ UNIV_INTERN os_event_t	srv_checkpoint_completed_event;
 
 UNIV_INTERN os_event_t	srv_redo_log_tracked_event;
 
+/** Whether the redo log tracker thread has been started. Does not take into
+account whether the tracking is currently enabled (see srv_track_changed_pages
+for that) */
 UNIV_INTERN bool	srv_redo_log_thread_started = false;
 
 /*********************************************************************//**
@@ -1946,6 +1952,7 @@ srv_export_innodb_status(void)
 	export_vars.innodb_pages_created = stat.n_pages_created;
 
 	export_vars.innodb_pages_read = stat.n_pages_read;
+	export_vars.innodb_page0_read = srv_stats.page0_read;
 
 	export_vars.innodb_pages_written = stat.n_pages_written;
 
@@ -2064,6 +2071,8 @@ srv_export_innodb_status(void)
 		crypt_stat.pages_flushed;
 	export_vars.innodb_encryption_rotation_estimated_iops =
 		crypt_stat.estimated_iops;
+	export_vars.innodb_encryption_key_requests =
+		srv_stats.n_key_requests;
 
 	export_vars.innodb_scrub_page_reorganizations =
 		scrub_stat.page_reorganizations;
@@ -2540,13 +2549,8 @@ DECLARE_THREAD(srv_redo_log_follow_thread)(
 		os_event_wait(srv_checkpoint_completed_event);
 		os_event_reset(srv_checkpoint_completed_event);
 
-#ifdef UNIV_DEBUG
-		if (!srv_track_changed_pages) {
-			continue;
-		}
-#endif
-
-		if (srv_shutdown_state < SRV_SHUTDOWN_LAST_PHASE) {
+		if (srv_track_changed_pages
+		    && srv_shutdown_state < SRV_SHUTDOWN_LAST_PHASE) {
 			if (!log_online_follow_redo_log()) {
 				/* TODO: sync with I_S log tracking status? */
 				ib_logf(IB_LOG_LEVEL_ERROR,
diff --git a/storage/xtradb/srv/srv0start.cc b/storage/xtradb/srv/srv0start.cc
index e01537e8f73..bcd4bfbf495 100644
--- a/storage/xtradb/srv/srv0start.cc
+++ b/storage/xtradb/srv/srv0start.cc
@@ -705,7 +705,8 @@ create_log_files(
 		logfilename, SRV_LOG_SPACE_FIRST_ID,
 		fsp_flags_set_page_size(0, UNIV_PAGE_SIZE),
 		FIL_LOG,
-		NULL /* no encryption yet */);
+		NULL /* no encryption yet */,
+		true /* this is create */);
 	ut_a(fil_validate());
 
 	logfile0 = fil_node_create(
@@ -727,7 +728,7 @@ create_log_files(
 #ifdef UNIV_LOG_ARCHIVE
 	/* Create the file space object for archived logs. */
 	fil_space_create("arch_log_space", SRV_LOG_SPACE_FIRST_ID + 1,
-		0, FIL_LOG, NULL /* no encryption yet */);
+		0, FIL_LOG, NULL /* no encryption yet */, true /* create */);
 #endif
 	log_group_init(0, srv_n_log_files,
 		       srv_log_file_size * UNIV_PAGE_SIZE,
@@ -853,7 +854,7 @@ open_or_create_data_files(
 	ulint		space;
 	ulint		rounded_size_pages;
 	char		name[10000];
-	fil_space_crypt_t*    crypt_data;
+	fil_space_crypt_t*    crypt_data=NULL;
 
 	if (srv_n_data_files >= 1000) {
 
@@ -1184,18 +1185,20 @@ check_first_page:
 			}
 
 			*sum_of_new_sizes += srv_data_file_sizes[i];
-
-			crypt_data = fil_space_create_crypt_data(FIL_SPACE_ENCRYPTION_DEFAULT, FIL_DEFAULT_ENCRYPTION_KEY);
 		}
 
 		ret = os_file_close(files[i]);
 		ut_a(ret);
 
 		if (i == 0) {
+			if (!crypt_data) {
+				crypt_data = fil_space_create_crypt_data(FIL_SPACE_ENCRYPTION_DEFAULT, FIL_DEFAULT_ENCRYPTION_KEY);
+			}
+
 			flags = fsp_flags_set_page_size(0, UNIV_PAGE_SIZE);
+
 			fil_space_create(name, 0, flags, FIL_TABLESPACE,
-					 crypt_data);
-			crypt_data = NULL;
+					crypt_data, (*create_new_db) == true);
 		}
 
 		ut_a(fil_validate());
@@ -1342,7 +1345,8 @@ srv_undo_tablespace_open(
 		/* Set the compressed page size to 0 (non-compressed) */
 		flags = fsp_flags_set_page_size(0, UNIV_PAGE_SIZE);
 		fil_space_create(name, space, flags, FIL_TABLESPACE,
-				 NULL /* no encryption */);
+				NULL /* no encryption */,
+				true /* create */);
 
 		ut_a(fil_validate());
 
@@ -2340,7 +2344,8 @@ innobase_start_or_create_for_mysql(void)
 				 SRV_LOG_SPACE_FIRST_ID,
 				 fsp_flags_set_page_size(0, UNIV_PAGE_SIZE),
 				 FIL_LOG,
-				 NULL /* no encryption yet */);
+				 NULL /* no encryption yet */,
+				 true /* create */);
 
 		ut_a(fil_validate());
 
@@ -2362,7 +2367,8 @@ innobase_start_or_create_for_mysql(void)
 		/* Create the file space object for archived logs. Under
 		MySQL, no archiving ever done. */
 		fil_space_create("arch_log_space", SRV_LOG_SPACE_FIRST_ID + 1,
-			0, FIL_LOG, NULL /* no encryption yet */);
+			0, FIL_LOG, NULL /* no encryption yet */,
+			true /* create */);
 #endif /* UNIV_LOG_ARCHIVE */
 		log_group_init(0, i, srv_log_file_size * UNIV_PAGE_SIZE,
 			       SRV_LOG_SPACE_FIRST_ID,
@@ -2772,6 +2778,12 @@ files_checked:
 		}
 	}
 
+	/* Create the SYS_ZIP_DICT system table */
+	err = dict_create_or_check_sys_zip_dict();
+	if (err != DB_SUCCESS) {
+		return(err);
+	}
+
 	srv_is_being_started = FALSE;
 
 	ut_a(trx_purge_state() == PURGE_STATE_INIT);
diff --git a/storage/xtradb/trx/trx0trx.cc b/storage/xtradb/trx/trx0trx.cc
index 41f8c166190..ec57a8e5c54 100644
--- a/storage/xtradb/trx/trx0trx.cc
+++ b/storage/xtradb/trx/trx0trx.cc
@@ -1117,6 +1117,8 @@ trx_start_low(
 
 	trx->start_time = ut_time();
 
+	trx->start_time_micro = clock();
+
 	MONITOR_INC(MONITOR_TRX_ACTIVE);
 }