4 files changed, 170 insertions, 127 deletions
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc
index 9bb1235cbbc..57d8f23b662 100644
--- a/storage/innobase/handler/ha_innodb.cc
+++ b/storage/innobase/handler/ha_innodb.cc
@@ -18873,23 +18873,28 @@ static void bg_wsrep_kill_trx(
 
 	if (thd) {
 		wsrep_thd_LOCK(thd);
-		victim_trx = thd_to_trx(thd);
-		lock_mutex_enter();
-		trx_mutex_enter(victim_trx);
-		wsrep_thd_UNLOCK(thd);
-		if (victim_trx->id != arg->trx_id)
-		{
-			trx_mutex_exit(victim_trx);
-			lock_mutex_exit();
-			victim_trx = NULL;
+		victim_trx= thd_to_trx(thd);
+		/* Victim trx might not exist e.g. on MDL-conflict. */
+		if (victim_trx) {
+			lock_mutex_enter();
+			trx_mutex_enter(victim_trx);
+			if (victim_trx->id != arg->trx_id ||
+			    victim_trx->state == TRX_STATE_COMMITTED_IN_MEMORY)
+			{
+				/* Victim was meanwhile rolled back or
+				committed */
+				lock_mutex_exit();
+				trx_mutex_exit(victim_trx);
+				goto no_victim;
+			}
+		} else {
+no_victim:
+			wsrep_thd_UNLOCK(thd);
+			/* find_thread_by_id() acquired THD::LOCK_kill_data */
 			wsrep_thd_kill_UNLOCK(thd);
+			goto ret;
 		}
-	}
-
-	if (!victim_trx) {
-		/* it can happen that trx_id was meanwhile rolled back */
-		DBUG_PRINT("wsrep", ("no thd for conflicting lock"));
-		goto ret;
+		wsrep_thd_UNLOCK(thd);
 	}
 
 	WSREP_DEBUG("BF kill (" ULINTPF ", seqno: " INT64PF
@@ -19044,7 +19049,7 @@ static void bg_wsrep_kill_trx(
 	}
 
 ret_awake:
-	awake = true;
+	awake= true;
 
 ret_unlock:
 	trx_mutex_exit(victim_trx);
diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc
index ee57a493119..bd09f3b81ca 100644
--- a/storage/innobase/lock/lock0lock.cc
+++ b/storage/innobase/lock/lock0lock.cc
@@ -637,71 +637,81 @@ lock_rec_get_insert_intention(
 	return(lock->type_mode & LOCK_INSERT_INTENTION);
 }
 
+#ifdef UNIV_DEBUG
 #ifdef WITH_WSREP
-/** Check if both conflicting lock and other record lock are brute force
-(BF). This case is a bug so report lock information and wsrep state.
-@param[in]	lock_rec1	conflicting waiting record lock or NULL
-@param[in]	lock_rec2	other waiting record lock
-@param[in]	trx1		lock_rec1 can be NULL, trx
+/** Check whether both the transaction holding the conflicting lock and
+the transaction requesting the record lock are brute force (BF). If they
+are, check whether this BF-BF wait is correct; if not, report it and assert.
+
+@param[in]	lock		other waiting record lock
+@param[in]	trx		trx requesting conflicting record lock
 */
-static void wsrep_assert_no_bf_bf_wait(
-	const lock_t* lock_rec1,
-	const lock_t* lock_rec2,
-	const trx_t* trx1)
+static void wsrep_assert_no_bf_bf_wait(const lock_t *lock, const trx_t *trx)
 {
-	ut_ad(!lock_rec1 || lock_get_type_low(lock_rec1) == LOCK_REC);
-	ut_ad(lock_get_type_low(lock_rec2) == LOCK_REC);
+	ut_ad(lock_get_type_low(lock) == LOCK_REC);
 	ut_ad(lock_mutex_own());
+	trx_t* lock_trx= lock->trx;
 
 	/* Note that we are holding lock_sys->mutex, thus we should
 	not acquire THD::LOCK_thd_data mutex below to avoid mutexing
 	order violation. */
 
-	if (!trx1->is_wsrep() || !lock_rec2->trx->is_wsrep())
+	if (!trx->is_wsrep() || !lock_trx->is_wsrep())
 		return;
-	if (UNIV_LIKELY(!wsrep_thd_is_BF(trx1->mysql_thd, FALSE)))
-		return;
-	if (UNIV_LIKELY(!wsrep_thd_is_BF(lock_rec2->trx->mysql_thd, FALSE)))
+	if (UNIV_LIKELY(!wsrep_thd_is_BF(trx->mysql_thd, FALSE))
+	    || UNIV_LIKELY(!wsrep_thd_is_BF(lock_trx->mysql_thd, FALSE)))
 		return;
 
-	/* if BF - BF order is honored, we can keep trx1 waiting for the lock */
-	if (wsrep_trx_order_before(trx1->mysql_thd, lock_rec2->trx->mysql_thd))
+	ut_ad(trx->state == TRX_STATE_ACTIVE);
+
+	trx_mutex_enter(lock_trx);
+	const trx_state_t trx2_state= lock_trx->state;
+	trx_mutex_exit(lock_trx);
+
+	/* If the lock holder is already committed in memory or
+	prepared, we should wait. A transaction is committed in
+	memory while holding trx mutex, but not lock_sys->mutex.
+	Therefore, we could end up here before the transaction has
+	done lock_release(), which is protected by lock_sys->mutex. */
+	switch (trx2_state) {
+	case TRX_STATE_COMMITTED_IN_MEMORY:
+	case TRX_STATE_PREPARED:
 		return;
+	case TRX_STATE_ACTIVE:
+		break;
+	default:
+		ut_ad("invalid state" == 0);
+	}
 
-	/* avoiding BF-BF conflict assert, if victim is already aborting
-	   or rolling back for replaying
-	*/
-	if (wsrep_trx_is_aborting(lock_rec2->trx->mysql_thd))
+	/* If BF-BF order is honored, i.e. the trx already holding
+	the record lock is ordered before this new lock request,
+	we can keep trx waiting for the lock. If the conflicting
+	transaction is already aborting or rolling back for replaying,
+	we can also let the new transaction wait. */
+	if (wsrep_trx_order_before(lock_trx->mysql_thd, trx->mysql_thd)
+	    || wsrep_trx_is_aborting(lock_trx->mysql_thd))
 		return;
 
 	mtr_t mtr;
 
-	if (lock_rec1) {
-		ib::error() << "Waiting lock on table: "
-			    << lock_rec1->index->table->name
-			    << " index: "
-			    << lock_rec1->index->name()
-			    << " that has conflicting lock ";
-		lock_rec_print(stderr, lock_rec1, mtr);
-	}
-
 	ib::error() << "Conflicting lock on table: "
-		    << lock_rec2->index->table->name
+		    << lock->index->table->name
 		    << " index: "
-		    << lock_rec2->index->name()
+		    << lock->index->name()
 		    << " that has lock ";
-	lock_rec_print(stderr, lock_rec2, mtr);
+	lock_rec_print(stderr, lock, mtr);
 
 	ib::error() << "WSREP state: ";
 
-	wsrep_report_bf_lock_wait(trx1->mysql_thd,
-				  trx1->id);
-	wsrep_report_bf_lock_wait(lock_rec2->trx->mysql_thd,
-				  lock_rec2->trx->id);
+	wsrep_report_bf_lock_wait(trx->mysql_thd,
+				  trx->id);
+	wsrep_report_bf_lock_wait(lock_trx->mysql_thd,
+				  lock_trx->id);
 	/* BF-BF wait is a bug */
 	ut_error;
 }
 #endif /* WITH_WSREP */
+#endif /* UNIV_DEBUG */
 
 /*********************************************************************//**
 Checks if a lock request for a new lock has to wait for request lock2.
@@ -824,9 +834,11 @@ lock_rec_has_to_wait(
 			return false;
 		}
 
-		/* There should not be two conflicting locks that are
-		brute force. If there is it is a bug. */
-		wsrep_assert_no_bf_bf_wait(NULL, lock2, trx);
+		/* We can very well let BF wait normally, as the other
+		BF will be replayed in case of conflict. For debug
+		builds we do additional sanity checks to catch
+		an unsupported BF wait, if any. */
+		ut_d(wsrep_assert_no_bf_bf_wait(lock2, trx));
 #endif /* WITH_WSREP */
 
 	return true;
@@ -1095,66 +1107,35 @@ lock_rec_other_has_expl_req(
 #endif /* UNIV_DEBUG */
 
 #ifdef WITH_WSREP
-static
-void
-wsrep_kill_victim(
-/*==============*/
-	const trx_t * const trx,
-	const lock_t *lock)
+static void wsrep_kill_victim(const trx_t * const trx, const lock_t *lock)
 {
 	ut_ad(lock_mutex_own());
-	ut_ad(trx_mutex_own(lock->trx));
+	ut_ad(trx->is_wsrep());
+	trx_t* lock_trx = lock->trx;
+	ut_ad(trx_mutex_own(lock_trx));
+	ut_ad(lock_trx != trx);
 
-	/* quit for native mysql */
-	if (!trx->is_wsrep()) return;
+	if (!wsrep_thd_is_BF(trx->mysql_thd, FALSE))
+		return;
 
-	my_bool bf_this  = wsrep_thd_is_BF(trx->mysql_thd, FALSE);
-	my_bool bf_other = wsrep_thd_is_BF(lock->trx->mysql_thd, FALSE);
-	mtr_t mtr;
+	if (lock_trx->state == TRX_STATE_COMMITTED_IN_MEMORY
+	    || lock_trx->lock.was_chosen_as_deadlock_victim)
+              return;
 
-	if ((bf_this && !bf_other) ||
-		(bf_this && bf_other && wsrep_trx_order_before(
-			trx->mysql_thd, lock->trx->mysql_thd))) {
+	my_bool bf_other = wsrep_thd_is_BF(lock_trx->mysql_thd, FALSE);
 
-		if (lock->trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
-			if (UNIV_UNLIKELY(wsrep_debug)) {
-				ib::info() << "WSREP: BF victim waiting\n";
-			}
+	if (!bf_other
+	    || wsrep_trx_order_before(trx->mysql_thd,
+				      lock_trx->mysql_thd)) {
+
+		if (lock_trx->lock.que_state == TRX_QUE_LOCK_WAIT) {
+			if (UNIV_UNLIKELY(wsrep_debug))
+				WSREP_INFO("BF victim waiting");
 			/* cannot release lock, until our lock
 			is in the queue*/
-		} else if (lock->trx != trx) {
-			if (wsrep_log_conflicts) {
-				if (bf_this) {
-					ib::info() << "*** Priority TRANSACTION:";
-				} else {
-					ib::info() << "*** Victim TRANSACTION:";
-				}
-
-				trx_print_latched(stderr, trx, 3000);
-
-				if (bf_other) {
-					ib::info() << "*** Priority TRANSACTION:";
-				} else {
-					ib::info() << "*** Victim TRANSACTION:";
-				}
-                                trx_print_latched(stderr, lock->trx, 3000);
-
-				ib::info() << "*** WAITING FOR THIS LOCK TO BE GRANTED:";
-
-				if (lock_get_type(lock) == LOCK_REC) {
-					lock_rec_print(stderr, lock, mtr);
-				} else {
-					lock_table_print(stderr, lock);
-				}
-
-				ib::info() << " SQL1: "
-					   << wsrep_thd_query(trx->mysql_thd);
-				ib::info() << " SQL2: "
-					   << wsrep_thd_query(lock->trx->mysql_thd);
-			}
-
-			wsrep_innobase_kill_one_trx(trx->mysql_thd,
-						    trx, lock->trx, TRUE);
+		} else {
+			wsrep_innobase_kill_one_trx(trx->mysql_thd, trx,
+						    lock_trx, true);
 		}
 	}
 }
@@ -2248,10 +2229,6 @@ static void lock_rec_dequeue_from_page(lock_t* in_lock)
 				/* Grant the lock */
 				ut_ad(lock->trx != in_lock->trx);
 				lock_grant(lock);
-#ifdef WITH_WSREP
-			} else {
-				wsrep_assert_no_bf_bf_wait(c, lock, c->trx);
-#endif /* WITH_WSREP */
 			}
 		}
 	} else {
@@ -4204,10 +4181,6 @@ released:
 				/* Grant the lock */
 				ut_ad(trx != lock->trx);
 				lock_grant(lock);
-#ifdef WITH_WSREP
-			} else {
-				wsrep_assert_no_bf_bf_wait(c, lock, c->trx);
-#endif /* WITH_WSREP */
 			}
 		}
 	} else {
diff --git a/storage/innobase/row/row0import.cc b/storage/innobase/row/row0import.cc
index 7b888cdecc1..0f9e309accb 100644
--- a/storage/innobase/row/row0import.cc
+++ b/storage/innobase/row/row0import.cc
@@ -1,7 +1,7 @@
 /*****************************************************************************
 
 Copyright (c) 2012, 2016, Oracle and/or its affiliates. All Rights Reserved.
-Copyright (c) 2015, 2020, MariaDB Corporation.
+Copyright (c) 2015, 2021, MariaDB Corporation.
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -3370,6 +3370,57 @@ struct fil_iterator_t {
 	byte*           crypt_io_buffer;        /*!< IO buffer when encrypted */
 };
 
+
+/** Write a range of a page compressed tablespace page by page, after
+punching a hole over the whole range. Writing only the used part of
+each page helps to save disk space when punch hole is enabled.
+@param iter		Tablespace iterator
+@param write_request	Request to write into the file
+@param offset		offset of the file to be written
+@param writeptr	buffer to be written
+@param n_bytes	number of bytes to be written
+@param try_punch_only	Try the range punch only because the
+			current range is full of empty pages
+@return DB_SUCCESS, or the error from punching the hole or writing */
+static
+dberr_t fil_import_compress_fwrite(const fil_iterator_t &iter,
+                                   const IORequest &write_request,
+                                   os_offset_t offset,
+                                   const byte *writeptr,
+                                   ulint n_bytes,
+                                   bool try_punch_only=false)
+{
+  dberr_t err= os_file_punch_hole(iter.file, offset, n_bytes);
+  if (err != DB_SUCCESS || try_punch_only)
+    return err;
+
+  for (ulint j= 0; j < n_bytes; j+= srv_page_size)
+  {
+    /* The page at file offset 0 (j == 0 and offset == 0) is written in
+    full. For any other page, read the 2-byte length stored at
+    FIL_PAGE_DATA; it is safe to read because it is not encrypted. */
+    ulint n_write_bytes= srv_page_size;
+    if (j || offset)
+    {
+      n_write_bytes= mach_read_from_2(writeptr + j + FIL_PAGE_DATA);
+      const unsigned  ptype= mach_read_from_2(writeptr + j + FIL_PAGE_TYPE);
+      /* Skip an empty page: page type 0 and zero stored length */
+      if (ptype == 0 && n_write_bytes == 0)
+        continue;
+      n_write_bytes+= FIL_PAGE_DATA + FIL_PAGE_COMPRESSED_SIZE;
+      if (ptype == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED)
+        n_write_bytes+= FIL_PAGE_COMPRESSION_METHOD_SIZE;
+    }
+
+    err= os_file_write(write_request, iter.filepath, iter.file,
+                       writeptr + j, offset + j, n_write_bytes);
+    if (err != DB_SUCCESS)
+      break;
+  }
+
+  return err;
+}
+
 /********************************************************************//**
 TODO: This can be made parallel trivially by chunking up the file and creating
 a callback per thread. . Main benefit will be to use multiple CPUs for
@@ -3411,7 +3462,10 @@ fil_iterate(
 	/* TODO: For ROW_FORMAT=COMPRESSED tables we do a lot of useless
 	copying for non-index pages. Unfortunately, it is
 	required by buf_zip_decompress() */
-	dberr_t err = DB_SUCCESS;
+	dberr_t		err = DB_SUCCESS;
+	bool		page_compressed = false;
+	bool		punch_hole = true;
+	IORequest	write_request(IORequest::WRITE);
 
 	for (offset = iter.start; offset < iter.end; offset += n_bytes) {
 		if (callback.is_interrupted()) {
@@ -3489,9 +3543,8 @@ page_corrupted:
 				goto func_exit;
 			}
 
-			const bool page_compressed
-				= fil_page_is_compressed_encrypted(src)
-				|| fil_page_is_compressed(src);
+			page_compressed= fil_page_is_compressed_encrypted(src)
+					 || fil_page_is_compressed(src);
 
 			if (page_compressed && block->page.zip.data) {
 				goto page_corrupted;
@@ -3646,13 +3699,23 @@ not_encrypted:
 			}
 		}
 
-		/* A page was updated in the set, write back to disk. */
-		if (updated) {
-			IORequest       write_request(IORequest::WRITE);
+		if (page_compressed && punch_hole) {
+			err = fil_import_compress_fwrite(
+				iter, write_request, offset, writeptr, n_bytes,
+				!updated);
 
-			err = os_file_write(write_request,
-					    iter.filepath, iter.file,
-					    writeptr, offset, n_bytes);
+			if (err != DB_SUCCESS) {
+				punch_hole = false;
+				if (updated) {
+					goto normal_write;
+				}
+			}
+		} else if (updated) {
+			/* A page was updated in the set, write back to disk. */
+normal_write:
+			err = os_file_write(
+				write_request, iter.filepath, iter.file,
+				writeptr, offset, n_bytes);
 
 			if (err != DB_SUCCESS) {
 				goto func_exit;
diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc
index 7a9c2cd9240..0638465527c 100644
--- a/storage/innobase/row/row0merge.cc
+++ b/storage/innobase/row/row0merge.cc
@@ -3900,6 +3900,8 @@ row_merge_drop_indexes(
 					ut_ad(prev);
 					ut_a(table->fts);
 					fts_drop_index(table, index, trx);
+					row_merge_drop_index_dict(
+						trx, index->id);
 					/* We can remove a DICT_FTS
 					index from the cache, because
 					we do not allow ADD FULLTEXT INDEX