diff options
91 files changed, 5772 insertions, 168 deletions
diff --git a/mysql-test/suite/innodb/r/innodb_defrag_binlog.result b/mysql-test/suite/innodb/r/innodb_defrag_binlog.result new file mode 100644 index 00000000000..2a1992e449d --- /dev/null +++ b/mysql-test/suite/innodb/r/innodb_defrag_binlog.result @@ -0,0 +1,29 @@ +include/master-slave.inc +[connection master] +drop table if exists t1; +create table t1(a int not null primary key auto_increment, b varchar(256), key second(b)) engine=innodb; +insert into t1 values (1, REPEAT("a", 256)); +insert into t1 values (2, REPEAT("a", 256)); +optimize table t1; +Table Op Msg_type Msg_text +test.t1 optimize status OK +drop table t1; +show binlog events in 'master-bin.000001' from 313; +Log_name Pos Event_type Server_id End_log_pos Info +master-bin.000001 313 Gtid 1 351 GTID 0-1-1 +master-bin.000001 351 Query 1 465 use `test`; DROP TABLE IF EXISTS `t1` +master-bin.000001 465 Gtid 1 503 GTID 0-1-2 +master-bin.000001 503 Query 1 669 use `test`; create table t1(a int not null primary key auto_increment, b varchar(256), key second(b)) engine=innodb +master-bin.000001 669 Gtid 1 707 BEGIN GTID 0-1-3 +master-bin.000001 707 Table_map 1 751 table_id: 82 (test.t1) +master-bin.000001 751 Write_rows_v1 1 1043 table_id: 82 flags: STMT_END_F +master-bin.000001 1043 Xid 1 1070 COMMIT +master-bin.000001 1070 Gtid 1 1108 BEGIN GTID 0-1-4 +master-bin.000001 1108 Table_map 1 1152 table_id: 82 (test.t1) +master-bin.000001 1152 Write_rows_v1 1 1444 table_id: 82 flags: STMT_END_F +master-bin.000001 1444 Xid 1 1471 COMMIT +master-bin.000001 1471 Gtid 1 1509 GTID 0-1-5 +master-bin.000001 1509 Query 1 1589 use `test`; optimize table t1 +master-bin.000001 1589 Gtid 1 1627 GTID 0-1-6 +master-bin.000001 1627 Query 1 1731 use `test`; DROP TABLE `t1` +include/rpl_end.inc diff --git a/mysql-test/suite/innodb/r/innodb_defrag_concurrent.result b/mysql-test/suite/innodb/r/innodb_defrag_concurrent.result new file mode 100644 index 00000000000..1b1ff3a858f --- /dev/null +++ 
b/mysql-test/suite/innodb/r/innodb_defrag_concurrent.result @@ -0,0 +1,73 @@ +DROP TABLE if exists t1; +select @@global.innodb_stats_persistent; +@@global.innodb_stats_persistent +0 +set global innodb_defragment_stats_accuracy = 80; +CREATE TABLE t1 (a INT NOT NULL PRIMARY KEY AUTO_INCREMENT, b VARCHAR(256), c INT, KEY second(a, b),KEY third(c)) ENGINE=INNODB; +SET @@global.innodb_defragment_n_pages = 20; +after populate PRIMARY +select count(*) from t1; +count(*) +20000 +after populate second +select count(*) from t1 force index (second); +count(*) +20000 +after populate third +select count(*) from t1 force index (third); +count(*) +20000 +select count(*) from t1; +count(*) +15800 +after delete PRIMAY +select count(*) from t1 force index (second); +count(*) +15800 +after delete second +select count(*) from t1 force index (third); +count(*) +15800 +after delete third +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_pages_freed'); +count(stat_value) > 0 +0 +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_page_split'); +count(stat_value) > 0 +1 +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_leaf_pages_defrag'); +count(stat_value) > 0 +1 +optimize table t1;; +INSERT INTO t1 VALUES (400000, REPEAT('A', 256),300000);; +INSERT INTO t1 VALUES (500000, REPEAT('A', 256),400000);; +DELETE FROM t1 where a between 1 and 100;; +UPDATE t1 SET c = c + 1 where c between 2000 and 8000;; +optimize table t1; +Table Op Msg_type Msg_text +test.t1 optimize status OK +select sleep(5); +sleep(5) +0 +select count(*) from t1; +count(*) +15723 +after optimize PRIMARY +select count(*) from t1 force index (second); +count(*) +15723 +after optimize second +select count(*) from t1 force index (third); +count(*) +15723 +after optimize third +select count(stat_value) > 0 from mysql.innodb_index_stats where 
table_name like '%t1%' and stat_name in ('n_pages_freed'); +count(stat_value) > 0 +1 +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_page_split'); +count(stat_value) > 0 +1 +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_leaf_pages_defrag'); +count(stat_value) > 0 +1 +drop table t1; diff --git a/mysql-test/suite/innodb/r/innodb_defrag_stats.result b/mysql-test/suite/innodb/r/innodb_defrag_stats.result new file mode 100644 index 00000000000..0838a199b3b --- /dev/null +++ b/mysql-test/suite/innodb/r/innodb_defrag_stats.result @@ -0,0 +1,94 @@ +DROP TABLE if exists t1; +select @@global.innodb_stats_persistent; +@@global.innodb_stats_persistent +0 +set global innodb_defragment_stats_accuracy = 20; +# Create table. +CREATE TABLE t1 (a INT NOT NULL PRIMARY KEY AUTO_INCREMENT, b VARCHAR(256), KEY SECOND(a, b)) ENGINE=INNODB; +# Populate data +INSERT INTO t1 VALUES(1, REPEAT('A', 256)); +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +# Not enough page splits to trigger persistent stats write yet. +select count(*) from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_page_split', 'n_leaf_pages_defrag'); +count(*) +0 +INSERT INTO t1 (b) SELECT b from t1; +# Persistent stats recorded. +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_page_split', 'n_leaf_pages_defrag'); +count(stat_value) > 0 +0 +# Delete some rows. 
+delete from t1 where a between 100 * 20 and 100 * 20 + 30; +delete from t1 where a between 100 * 19 and 100 * 19 + 30; +delete from t1 where a between 100 * 18 and 100 * 18 + 30; +delete from t1 where a between 100 * 17 and 100 * 17 + 30; +delete from t1 where a between 100 * 16 and 100 * 16 + 30; +delete from t1 where a between 100 * 15 and 100 * 15 + 30; +delete from t1 where a between 100 * 14 and 100 * 14 + 30; +delete from t1 where a between 100 * 13 and 100 * 13 + 30; +delete from t1 where a between 100 * 12 and 100 * 12 + 30; +delete from t1 where a between 100 * 11 and 100 * 11 + 30; +delete from t1 where a between 100 * 10 and 100 * 10 + 30; +delete from t1 where a between 100 * 9 and 100 * 9 + 30; +delete from t1 where a between 100 * 8 and 100 * 8 + 30; +delete from t1 where a between 100 * 7 and 100 * 7 + 30; +delete from t1 where a between 100 * 6 and 100 * 6 + 30; +delete from t1 where a between 100 * 5 and 100 * 5 + 30; +delete from t1 where a between 100 * 4 and 100 * 4 + 30; +delete from t1 where a between 100 * 3 and 100 * 3 + 30; +delete from t1 where a between 100 * 2 and 100 * 2 + 30; +delete from t1 where a between 100 * 1 and 100 * 1 + 30; +# Server Restarted +# Confirm persistent stats still there after restart. +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_page_split', 'n_leaf_pages_defrag'); +count(stat_value) > 0 +0 +optimize table t1; +Table Op Msg_type Msg_text +test.t1 optimize status OK +# n_page_split should be 0 after defragmentation, n_pages_freed should be non-zero. 
+select stat_value = 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name = 'n_page_split'; +stat_value = 0 +1 +1 +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_pages_freed', 'n_leaf_pages_defrag'); +count(stat_value) > 0 +1 +set global innodb_defragment_stats_accuracy = 40; +INSERT INTO t1 (b) SELECT b from t1; +# Not enough operation to trigger persistent stats write +select stat_value = 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name = 'n_page_split'; +stat_value = 0 +1 +1 +INSERT INTO t1 (b) SELECT b from t1; +# Persistent stats write triggered +select stat_value > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name = 'n_page_split'; +stat_value > 0 +0 +0 +# Table rename should cause stats rename. +rename table t1 to t2; +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t2%' and stat_name in ('n_pages_freed', 'n_page_split', 'n_leaf_pages_defrag'); +count(stat_value) > 0 +1 +# Drop index should cause stats drop. 
+drop index SECOND on t2; +select count(*) from mysql.innodb_index_stats where table_name like '%t2%' and index_name = 'SECOND'; +count(*) +4 +Server Restarted +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t2%' and stat_name in ('n_pages_freed', 'n_page_split', 'n_leaf_pages_defrag'); +count(stat_value) > 0 +1 +# Clean up +DROP TABLE t2; diff --git a/mysql-test/suite/innodb/r/innodb_defrag_stats_many_tables.result b/mysql-test/suite/innodb/r/innodb_defrag_stats_many_tables.result new file mode 100644 index 00000000000..f19e5ddf590 --- /dev/null +++ b/mysql-test/suite/innodb/r/innodb_defrag_stats_many_tables.result @@ -0,0 +1,36 @@ +DROP TABLE if exists t1; +SET @start_table_definition_cache = @@global.table_definition_cache; +SET @@global.table_definition_cache = 400; +SET @start_innodb_defragment_stats_accuracy = @@global.innodb_defragment_stats_accuracy; +SET @@global.innodb_defragment_stats_accuracy = 10; +CREATE TABLE t1 (a INT NOT NULL PRIMARY KEY AUTO_INCREMENT, b VARCHAR(256), KEY SECOND(a, b)) ENGINE=INNODB; +INSERT INTO t1 VALUES(1, REPEAT('A', 256)); +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +select stat_value > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name = 'n_page_split'; +stat_value > 0 +Create 405 table to overflow the table cache. +Sleep for a while to make sure t1 is evicted. +select sleep(10); +sleep(10) +0 +Reload t1 to get defrag stats from persistent storage +INSERT INTO t1 (b) SELECT b from t1; +make sure the stats thread will wake up and do the write even if there's a race condition between set and reset. 
+select sleep(12); +sleep(12) +0 +select stat_value > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name = 'n_page_split'; +stat_value > 0 +SET @@global.innodb_defragment_stats_accuracy = @start_innodb_defragment_stats_accuracy; +SET @@global.table_definition_cache = @start_table_definition_cache; +DROP TABLE t1; diff --git a/mysql-test/suite/innodb/r/innodb_defragment.result b/mysql-test/suite/innodb/r/innodb_defragment.result new file mode 100644 index 00000000000..b8f61b0eba3 --- /dev/null +++ b/mysql-test/suite/innodb/r/innodb_defragment.result @@ -0,0 +1,81 @@ +DROP TABLE if exists t1; +set global innodb_defragment_stats_accuracy = 80; +CREATE TABLE t1 (a INT NOT NULL PRIMARY KEY AUTO_INCREMENT, b VARCHAR(256), KEY SECOND(a, b)) ENGINE=INNODB; +optimize table t1; +Table Op Msg_type Msg_text +test.t1 optimize status OK +INSERT INTO t1 VALUES (100000, REPEAT('A', 256)); +INSERT INTO t1 VALUES (200000, REPEAT('A', 256)); +INSERT INTO t1 VALUES (300000, REPEAT('A', 256)); +INSERT INTO t1 VALUES (400000, REPEAT('A', 256)); +optimize table t1; +Table Op Msg_type Msg_text +test.t1 optimize status OK +create procedure defragment() +begin +set @i = 0; +repeat +set @i = @i + 1; +optimize table t1; +select sleep(5); +until @i = 3 end repeat; +end // +select count(stat_value) = 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_pages_freed'); +count(stat_value) = 0 +1 +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_page_split'); +count(stat_value) > 0 +1 +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_leaf_pages_defrag'); +count(stat_value) > 0 +1 +select count(*) from t1; +count(*) +10004 +select count(*) from t1 force index (second); +count(*) +10004 +call defragment(); +optimize table t1; +Table Op Msg_type Msg_text +test.t1 optimize status OK +select sleep(5); +sleep(5) +0 +select count(*) 
from t1; +count(*) +7904 +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t2%' and stat_name in ('n_pages_freed', 'n_page_split', 'n_leaf_pages_defrag'); +count(stat_value) > 0 +0 +select count(*) from t1 force index (second); +count(*) +7904 +SET @@global.innodb_defragment_n_pages = 3; +optimize table t1; +Table Op Msg_type Msg_text +test.t1 optimize status OK +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t2%' and stat_name in ('n_pages_freed', 'n_page_split', 'n_leaf_pages_defrag'); +count(stat_value) > 0 +0 +select count(*) from t1; +count(*) +6904 +select count(*) from t1 force index (second); +count(*) +6904 +SET @@global.innodb_defragment_n_pages = 10; +optimize table t1; +Table Op Msg_type Msg_text +test.t1 optimize status OK +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t2%' and stat_name in ('n_pages_freed', 'n_page_split', 'n_leaf_pages_defrag'); +count(stat_value) > 0 +0 +select count(*) from t1; +count(*) +6904 +select count(*) from t1 force index (second); +count(*) +6904 +DROP PROCEDURE defragment; +DROP TABLE t1; diff --git a/mysql-test/suite/innodb/r/innodb_defragment_fill_factor.result b/mysql-test/suite/innodb/r/innodb_defragment_fill_factor.result new file mode 100644 index 00000000000..90dcbc004f7 --- /dev/null +++ b/mysql-test/suite/innodb/r/innodb_defragment_fill_factor.result @@ -0,0 +1,59 @@ +DROP TABLE if exists t1; +DROP TABLE if exists t2; +Testing tables with large records +CREATE TABLE t1 (a INT NOT NULL PRIMARY KEY AUTO_INCREMENT, b VARCHAR(256), KEY SECOND(a, b)) ENGINE=INNODB; +optimize table t1; +Table Op Msg_type Msg_text +test.t1 optimize status OK +select count(*) from t1; +count(*) +790 +select count(*) from t1 force index (second); +count(*) +790 +# A few more insertions on the page should not cause a page split. 
+insert into t1 values (81, REPEAT('A', 256)); +insert into t1 values (83, REPEAT('A', 256)); +insert into t1 values (87, REPEAT('A', 256)); +insert into t1 values (82, REPEAT('A', 256)); +insert into t1 values (86, REPEAT('A', 256)); +# More insertions will cause page splits +insert into t1 values (88, REPEAT('A', 50)); +Too much space are reserved on primary index. +Too much space are reserved on second index. +DROP TABLE t1; +Testing table with small records +CREATE TABLE t2 (a INT NOT NULL PRIMARY KEY AUTO_INCREMENT, b VARchar(16), KEY SECOND(a,b)) ENGINE=INNODB; +optimize table t2; +Table Op Msg_type Msg_text +test.t2 optimize status OK +select count(*) from t2 force index(second); +count(*) +3701 +The page should have room for about 20 insertions +insert into t2 values(1181, REPEAT('A', 16)); +insert into t2 values(1191, REPEAT('A', 16)); +insert into t2 values(1182, REPEAT('A', 16)); +insert into t2 values(1192, REPEAT('A', 16)); +insert into t2 values(1183, REPEAT('A', 16)); +insert into t2 values(1193, REPEAT('A', 16)); +insert into t2 values(1184, REPEAT('A', 16)); +insert into t2 values(1194, REPEAT('A', 16)); +insert into t2 values(1185, REPEAT('A', 16)); +insert into t2 values(1195, REPEAT('A', 16)); +insert into t2 values(1186, REPEAT('A', 16)); +insert into t2 values(1196, REPEAT('A', 16)); +insert into t2 values(1187, REPEAT('A', 16)); +insert into t2 values(1197, REPEAT('A', 16)); +insert into t2 values(1188, REPEAT('A', 16)); +insert into t2 values(1198, REPEAT('A', 16)); +insert into t2 values(1189, REPEAT('A', 16)); +insert into t2 values(1199, REPEAT('A', 16)); +insert into t2 values(1190, REPEAT('A', 16)); +insert into t2 values(1180, REPEAT('A', 16)); +More insertions will cause page split. 
+insert into t2 values(1280, REPEAT('A', 16)); +insert into t2 values(1290, REPEAT('A', 16)); +insert into t2 values(1281, REPEAT('A', 16)); +insert into t2 values(1291, REPEAT('A', 16)); +DROP TABLE t2; diff --git a/mysql-test/suite/innodb/t/innodb.opt b/mysql-test/suite/innodb/t/innodb.opt new file mode 100644 index 00000000000..59e43fea231 --- /dev/null +++ b/mysql-test/suite/innodb/t/innodb.opt @@ -0,0 +1 @@ +--innodb-defragment=0
\ No newline at end of file diff --git a/mysql-test/suite/innodb/t/innodb_defrag_binlog.opt b/mysql-test/suite/innodb/t/innodb_defrag_binlog.opt new file mode 100644 index 00000000000..8a432b8c76e --- /dev/null +++ b/mysql-test/suite/innodb/t/innodb_defrag_binlog.opt @@ -0,0 +1,5 @@ +--loose-innodb-buffer-pool-stats +--loose-innodb-buffer-page +--loose-innodb-buffer-page-lru +--binlog-format=row +--innodb-defragment=1
\ No newline at end of file diff --git a/mysql-test/suite/innodb/t/innodb_defrag_binlog.test b/mysql-test/suite/innodb/t/innodb_defrag_binlog.test new file mode 100644 index 00000000000..c0d4b377cb1 --- /dev/null +++ b/mysql-test/suite/innodb/t/innodb_defrag_binlog.test @@ -0,0 +1,19 @@ +--source include/have_innodb.inc +--source include/master-slave.inc + +--disable_warnings +drop table if exists t1; +--enable_warnings + +create table t1(a int not null primary key auto_increment, b varchar(256), key second(b)) engine=innodb; + +insert into t1 values (1, REPEAT("a", 256)); +insert into t1 values (2, REPEAT("a", 256)); +optimize table t1; + +drop table t1; + +--replace_regex /\/\*.*// +show binlog events in 'master-bin.000001' from 313; + +--source include/rpl_end.inc diff --git a/mysql-test/suite/innodb/t/innodb_defrag_concurrent.opt b/mysql-test/suite/innodb/t/innodb_defrag_concurrent.opt new file mode 100644 index 00000000000..6426bac41a0 --- /dev/null +++ b/mysql-test/suite/innodb/t/innodb_defrag_concurrent.opt @@ -0,0 +1,4 @@ +--loose-innodb-buffer-pool-stats +--loose-innodb-buffer-page +--loose-innodb-buffer-page-lru +--innodb-defragment=1
\ No newline at end of file diff --git a/mysql-test/suite/innodb/t/innodb_defrag_concurrent.test b/mysql-test/suite/innodb/t/innodb_defrag_concurrent.test new file mode 100644 index 00000000000..7cf00e1da4c --- /dev/null +++ b/mysql-test/suite/innodb/t/innodb_defrag_concurrent.test @@ -0,0 +1,180 @@ +--source include/have_innodb.inc + +--disable_warnings +DROP TABLE if exists t1; +--enable_warnings + +--disable_query_log +let $innodb_defragment_n_pages_orig=`select @@innodb_defragment_n_pages`; +let $innodb_defragment_stats_accuracy_orig=`select @@innodb_defragment_stats_accuracy`; +--enable_query_log + +select @@global.innodb_stats_persistent; +set global innodb_defragment_stats_accuracy = 80; + +# Create table. +CREATE TABLE t1 (a INT NOT NULL PRIMARY KEY AUTO_INCREMENT, b VARCHAR(256), c INT, KEY second(a, b),KEY third(c)) ENGINE=INNODB; + +connect (con1,localhost,root,,test,$MASTER_MYPORT,$MASTER_MYSOCK); +connect (con2,localhost,root,,test,$MASTER_MYPORT,$MASTER_MYSOCK); +connect (con3,localhost,root,,test,$MASTER_MYPORT,$MASTER_MYSOCK); +connect (con4,localhost,root,,test,$MASTER_MYPORT,$MASTER_MYSOCK); + +connection default; + +SET @@global.innodb_defragment_n_pages = 20; + +let $data_size = 20000; +let $delete_size = 2000; + +# Populate table. 
+let $i = $data_size; +--disable_query_log +while ($i) +{ + eval + INSERT INTO t1 VALUES ($data_size + 1 - $i, REPEAT('A', 256), $i); + dec $i; +} +--enable_query_log + +--echo after populate PRIMARY +select count(*) from t1; + +if (`select count(*) < 30 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY' order by page_number;`) +{ +select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY' order by page_number; +} + +--echo after populate second +select count(*) from t1 force index (second); + +if (`select count(*) < 320 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second' order by page_number;`) +{ +select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second' order by page_number; +} + +--echo after populate third +select count(*) from t1 force index (third); + +if (`select count(*) < 20 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'third' order by page_number;`) +{ +select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'third' order by page_number; +} + +# Delete some data +--disable_query_log +let $size = $delete_size; +while ($size) +{ + let $j = 100 * $size; + eval delete from t1 where a between $j - 20 and $j; + dec $size; +} +--enable_query_log + +select count(*) from t1; + +--echo after delete PRIMAY +if (`select count(*) < 30 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY' order by page_number;`) +{ +select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY' order by page_number; +} + +select count(*) from t1 force index (second); + +--echo after delete second +if (`select count(*) < 300 from information_schema.innodb_buffer_page where table_name like '%t1%' 
and index_name = 'second' order by page_number;`) +{ +select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second' order by page_number; +} + +select count(*) from t1 force index (third); + +--echo after delete third +if (`select count(*) > 20 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'third' order by page_number;`) +{ +select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'third' order by page_number; +} + +# Above delete will free some pages and insert causes page split and these could cause defrag +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_pages_freed'); +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_page_split'); +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_leaf_pages_defrag'); + +connection con1; +--send optimize table t1; + +connection default; +--send INSERT INTO t1 VALUES (400000, REPEAT('A', 256),300000); + +connection con2; +--send INSERT INTO t1 VALUES (500000, REPEAT('A', 256),400000); + +connection con3; +--send DELETE FROM t1 where a between 1 and 100; + +connection con4; +--send UPDATE t1 SET c = c + 1 where c between 2000 and 8000; + +connection con1; +--disable_result_log +--reap +--enable_result_log + +connection con2; +--reap + +connection con3; +--reap + +connection con4; +--reap + +connection default; +--reap + +disconnect con1; +disconnect con2; +disconnect con3; +disconnect con4; + +optimize table t1; +select sleep(5); + +select count(*) from t1; + +--echo after optimize PRIMARY +if (`select count(*) > 62 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY' order by page_number;`) +{ +select count(*) from information_schema.innodb_buffer_page where 
table_name like '%t1%' and index_name = 'PRIMARY' order by page_number; +} + +select count(*) from t1 force index (second); + +--echo after optimize second +if (`select count(*) > 340 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second' order by page_number;`) +{ +select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second' order by page_number; +} + +select count(*) from t1 force index (third); + +--echo after optimize third +if (`select count(*) > 25 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'third' order by page_number;`) +{ +select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'third' order by page_number; +} + +# Now pages are freed +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_pages_freed'); +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_page_split'); +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_leaf_pages_defrag'); + +drop table t1; + +# reset system +--disable_query_log +EVAL SET GLOBAL innodb_defragment_n_pages = $innodb_defragment_n_pages_orig; +EVAL SET GLOBAL innodb_defragment_stats_accuracy = $innodb_defragment_stats_accuracy_orig; +--enable_query_log diff --git a/mysql-test/suite/innodb/t/innodb_defrag_stats.opt b/mysql-test/suite/innodb/t/innodb_defrag_stats.opt new file mode 100644 index 00000000000..d3525162f03 --- /dev/null +++ b/mysql-test/suite/innodb/t/innodb_defrag_stats.opt @@ -0,0 +1 @@ +--innodb-defragment=1 diff --git a/mysql-test/suite/innodb/t/innodb_defrag_stats.test b/mysql-test/suite/innodb/t/innodb_defrag_stats.test new file mode 100644 index 00000000000..f07544df4f6 --- /dev/null +++ b/mysql-test/suite/innodb/t/innodb_defrag_stats.test @@ -0,0 +1,87 
@@ +--source include/have_innodb.inc +--source include/big_test.inc + +--disable_warnings +DROP TABLE if exists t1; +--enable_warnings + +--disable_query_log +let $innodb_defragment_stats_accuracy_orig=`select @@innodb_defragment_stats_accuracy`; +--enable_query_log + +select @@global.innodb_stats_persistent; +set global innodb_defragment_stats_accuracy = 20; + +--echo # Create table. +CREATE TABLE t1 (a INT NOT NULL PRIMARY KEY AUTO_INCREMENT, b VARCHAR(256), KEY SECOND(a, b)) ENGINE=INNODB; + +--echo # Populate data +INSERT INTO t1 VALUES(1, REPEAT('A', 256)); +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; + +--echo # Not enough page splits to trigger persistent stats write yet. +select count(*) from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_page_split', 'n_leaf_pages_defrag'); + +INSERT INTO t1 (b) SELECT b from t1; + +--echo # Persistent stats recorded. +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_page_split', 'n_leaf_pages_defrag'); + +--echo # Delete some rows. +let $num_delete = 20; +while ($num_delete) +{ + let $j = 100 * $num_delete; + eval delete from t1 where a between $j and $j + 30; + dec $num_delete; +} + +--source include/restart_mysqld.inc +--echo # Server Restarted + +--echo # Confirm persistent stats still there after restart. +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_page_split', 'n_leaf_pages_defrag'); + +optimize table t1; +--echo # n_page_split should be 0 after defragmentation, n_pages_freed should be non-zero. 
+select stat_value = 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name = 'n_page_split'; +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_pages_freed', 'n_leaf_pages_defrag'); + +set global innodb_defragment_stats_accuracy = 40; + +INSERT INTO t1 (b) SELECT b from t1; +--echo # Not enough operation to trigger persistent stats write +select stat_value = 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name = 'n_page_split'; + +INSERT INTO t1 (b) SELECT b from t1; +--echo # Persistent stats write triggered +select stat_value > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name = 'n_page_split'; + +--echo # Table rename should cause stats rename. +rename table t1 to t2; +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t2%' and stat_name in ('n_pages_freed', 'n_page_split', 'n_leaf_pages_defrag'); + +--echo # Drop index should cause stats drop. 
+drop index SECOND on t2; +select count(*) from mysql.innodb_index_stats where table_name like '%t2%' and index_name = 'SECOND'; + +--source include/restart_mysqld.inc +--echo Server Restarted + +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t2%' and stat_name in ('n_pages_freed', 'n_page_split', 'n_leaf_pages_defrag'); + +--echo # Clean up +DROP TABLE t2; + +--disable_query_log +EVAL SET GLOBAL innodb_defragment_stats_accuracy = $innodb_defragment_stats_accuracy_orig; +--enable_query_log diff --git a/mysql-test/suite/innodb/t/innodb_defrag_stats_many_tables.opt b/mysql-test/suite/innodb/t/innodb_defrag_stats_many_tables.opt new file mode 100644 index 00000000000..d3525162f03 --- /dev/null +++ b/mysql-test/suite/innodb/t/innodb_defrag_stats_many_tables.opt @@ -0,0 +1 @@ +--innodb-defragment=1 diff --git a/mysql-test/suite/innodb/t/innodb_defrag_stats_many_tables.test b/mysql-test/suite/innodb/t/innodb_defrag_stats_many_tables.test new file mode 100644 index 00000000000..e1a463459be --- /dev/null +++ b/mysql-test/suite/innodb/t/innodb_defrag_stats_many_tables.test @@ -0,0 +1,71 @@ +--source include/have_innodb.inc +--source include/big_test.inc + +--disable_warnings +DROP TABLE if exists t1; +--enable_warnings + +let $num_tables = 405; + +SET @start_table_definition_cache = @@global.table_definition_cache; +SET @@global.table_definition_cache = 400; + +# set stats accuracy to be pretty high so stats sync is easily triggered. +SET @start_innodb_defragment_stats_accuracy = @@global.innodb_defragment_stats_accuracy; +SET @@global.innodb_defragment_stats_accuracy = 10; + +# Create table. 
+CREATE TABLE t1 (a INT NOT NULL PRIMARY KEY AUTO_INCREMENT, b VARCHAR(256), KEY SECOND(a, b)) ENGINE=INNODB; + +# Populate data +INSERT INTO t1 VALUES(1, REPEAT('A', 256)); +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; +INSERT INTO t1 (b) SELECT b from t1; + +select stat_value > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name = 'n_page_split'; + +# Create many tables to over flow the table definition cache + +--echo Create $num_tables table to overflow the table cache. +--disable_query_log +let $count = $num_tables; +while ($count) +{ + EVAL CREATE TABLE t_$count (a INT NOT NULL PRIMARY KEY AUTO_INCREMENT) ENGINE=INNODB; + EVAL INSERT INTO t_$count VALUES (1), (2); + dec $count; +} +--enable_query_log +--echo Sleep for a while to make sure t1 is evicted. +select sleep(10); + +--echo Reload t1 to get defrag stats from persistent storage +INSERT INTO t1 (b) SELECT b from t1; + +--echo make sure the stats thread will wake up and do the write even if there's a race condition between set and reset. 
+select sleep(12); + +select stat_value > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name = 'n_page_split'; + + +# Clean up +SET @@global.innodb_defragment_stats_accuracy = @start_innodb_defragment_stats_accuracy; +SET @@global.table_definition_cache = @start_table_definition_cache; +--disable_query_log +let $count = $num_tables; +while ($count) +{ + EVAL DROP TABLE t_$count; + dec $count; +} +--enable_query_log +DROP TABLE t1; diff --git a/mysql-test/suite/innodb/t/innodb_defragment-master.opt b/mysql-test/suite/innodb/t/innodb_defragment-master.opt new file mode 100644 index 00000000000..6fc7f343b24 --- /dev/null +++ b/mysql-test/suite/innodb/t/innodb_defragment-master.opt @@ -0,0 +1,2 @@ +--innodb_file_per_table +--innodb-defragment=1
\ No newline at end of file diff --git a/mysql-test/suite/innodb/t/innodb_defragment.opt b/mysql-test/suite/innodb/t/innodb_defragment.opt new file mode 100644 index 00000000000..6426bac41a0 --- /dev/null +++ b/mysql-test/suite/innodb/t/innodb_defragment.opt @@ -0,0 +1,4 @@ +--loose-innodb-buffer-pool-stats +--loose-innodb-buffer-page +--loose-innodb-buffer-page-lru +--innodb-defragment=1
\ No newline at end of file diff --git a/mysql-test/suite/innodb/t/innodb_defragment.test b/mysql-test/suite/innodb/t/innodb_defragment.test new file mode 100644 index 00000000000..77fceeaa56b --- /dev/null +++ b/mysql-test/suite/innodb/t/innodb_defragment.test @@ -0,0 +1,190 @@ +--source include/have_innodb.inc + +--disable_warnings +DROP TABLE if exists t1; +--enable_warnings + +--disable_query_log +let $innodb_defragment_n_pages_orig=`select @@innodb_defragment_n_pages`; +let $innodb_defragment_stats_accuracy_orig=`select @@innodb_defragment_stats_accuracy`; +--enable_query_log + +set global innodb_defragment_stats_accuracy = 80; + +# Create table. +CREATE TABLE t1 (a INT NOT NULL PRIMARY KEY AUTO_INCREMENT, b VARCHAR(256), KEY SECOND(a, b)) ENGINE=INNODB; + +## Test-1 defragment an empty table +optimize table t1; + +## Test-2 defragment a single page table +INSERT INTO t1 VALUES (100000, REPEAT('A', 256)); +INSERT INTO t1 VALUES (200000, REPEAT('A', 256)); +INSERT INTO t1 VALUES (300000, REPEAT('A', 256)); +INSERT INTO t1 VALUES (400000, REPEAT('A', 256)); + +optimize table t1; + +## Test-3 defragment (somewhat) in parallel with delete queries +let $data_size = 10000; +let $delete_size = 100; + +delimiter //; +create procedure defragment() +begin + set @i = 0; + repeat + set @i = @i + 1; + optimize table t1; + select sleep(5); + until @i = 3 end repeat; +end // +delimiter ;// + + +# Populate table. 
+let $i = $data_size; +--disable_query_log +while ($i) +{ + eval + INSERT INTO t1 VALUES ($data_size + 1 - $i, REPEAT('A', 256)); + dec $i; +} +--enable_query_log + +select count(stat_value) = 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_pages_freed'); +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_page_split'); +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t1%' and stat_name in ('n_leaf_pages_defrag'); + +select count(*) from t1; + +if (!`select count(*) > 180 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY' order by page_number;`) +{ +select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY' order by page_number; +} + +select count(*) from t1 force index (second); + +if (!`select count(*) > 170 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second' order by page_number;`) +{ +select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second' order by page_number; +} + + +connect (con1,localhost,root,,test,$MASTER_MYPORT,$MASTER_MYSOCK); + +connection con1; +--send call defragment() + +connection default; + +--disable_query_log +let $size = $delete_size; +while ($size) +{ + let $j = 100 * $size; + eval delete from t1 where a between $j - 20 and $j; + dec $size; +} +--enable_query_log + +connection con1; +--disable_result_log +--reap +--enable_result_log + +connection default; +disconnect con1; + +optimize table t1; +select sleep(5); + +--source include/restart_mysqld.inc +select count(*) from t1; + +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t2%' and stat_name in ('n_pages_freed', 'n_page_split', 'n_leaf_pages_defrag'); + +# After deletion & defragmentation, there are 8000 records left +if 
(!`select count(*) < 180 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY' order by page_number;`) +{ +select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY' order by page_number; +} + +select count(*) from t1 force index (second); + +# secondary index is pretty much the same size as primary index so the number of pages should be similar. +if (!`select count(*) < 180 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second' order by page_number;`) +{ +select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second' order by page_number; +} + +## Test-4 defragment with larger n_pages + +# delete some more records +--disable_query_log +let $size = $delete_size; +while ($size) +{ + let $j = 100 * $size; + eval delete from t1 where a between $j - 30 and $j - 20; + dec $size; +} +--enable_query_log + +SET @@global.innodb_defragment_n_pages = 3; + +# This will not reduce number of pages by a lot +optimize table t1; + +--source include/restart_mysqld.inc + +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t2%' and stat_name in ('n_pages_freed', 'n_page_split', 'n_leaf_pages_defrag'); + +select count(*) from t1; + +# We didn't create large wholes with the previous deletion, so if innodb_defragment_n_pages = 3, we won't be able to free up many pages. +if (!`select count(*) > 130 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY' order by page_number;`) +{ +select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY' order by page_number; +} + +select count(*) from t1 force index (second); + +# Same holds for secondary index, not many pages are released. 
+if (!`select count(*) > 100 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second' order by page_number;`) +{ +select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second' order by page_number; +} + +SET @@global.innodb_defragment_n_pages = 10; + +optimize table t1; + +--source include/restart_mysqld.inc + +select count(stat_value) > 0 from mysql.innodb_index_stats where table_name like '%t2%' and stat_name in ('n_pages_freed', 'n_page_split', 'n_leaf_pages_defrag'); + +select count(*) from t1; + +# This time we used innodb_defragment_n_pages = 10, so we should be able to free up some pages. +if (!`select count(*) < 165 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY' order by page_number;`) +{ +select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY' order by page_number; +} + +select count(*) from t1 force index (second); + +if (!`select count(*) < 165 from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second' order by page_number;`) +{ +select count(*) from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second' order by page_number; +} + +DROP PROCEDURE defragment; +DROP TABLE t1; +# reset system +--disable_query_log +EVAL SET GLOBAL innodb_defragment_n_pages = $innodb_defragment_n_pages_orig; +EVAL SET GLOBAL innodb_defragment_stats_accuracy = $innodb_defragment_stats_accuracy_orig; +--enable_query_log + diff --git a/mysql-test/suite/innodb/t/innodb_defragment_fill_factor.opt b/mysql-test/suite/innodb/t/innodb_defragment_fill_factor.opt new file mode 100644 index 00000000000..6426bac41a0 --- /dev/null +++ b/mysql-test/suite/innodb/t/innodb_defragment_fill_factor.opt @@ -0,0 +1,4 @@ +--loose-innodb-buffer-pool-stats +--loose-innodb-buffer-page +--loose-innodb-buffer-page-lru 
+--innodb-defragment=1
\ No newline at end of file diff --git a/mysql-test/suite/innodb/t/innodb_defragment_fill_factor.test b/mysql-test/suite/innodb/t/innodb_defragment_fill_factor.test new file mode 100644 index 00000000000..b2fdfa4a409 --- /dev/null +++ b/mysql-test/suite/innodb/t/innodb_defragment_fill_factor.test @@ -0,0 +1,130 @@ +--source include/have_innodb.inc +--disable_warnings +DROP TABLE if exists t1; +DROP TABLE if exists t2; +--enable_warnings +--echo Testing tables with large records +# Create table. +CREATE TABLE t1 (a INT NOT NULL PRIMARY KEY AUTO_INCREMENT, b VARCHAR(256), KEY SECOND(a, b)) ENGINE=INNODB; +# Populate table. +let $i = 1000; +--disable_query_log +while ($i) +{ + eval + INSERT INTO t1 VALUES ($i, REPEAT('A', 256)); + dec $i; +} +--enable_query_log +--disable_query_log +let $size = 10; +while ($size) +{ + let $j = 100 * $size; + eval delete from t1 where a between $j - 20 and $j; + dec $size; +} +--enable_query_log +optimize table t1; +--source include/restart_mysqld.inc +select count(*) from t1; +# After deletion & defragmentation, there are 800 records left. Each page can hold about 57 records. We fill the page 90% full, +# so there should be less than 16 pages total. +--let $primary_before = query_get_value(select count(*) as Value from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY', Value, 1) +select count(*) from t1 force index (second); +# secondary index is slightly bigger than primary index so the number of pages should be similar. +--let $second_before = query_get_value(select count(*) as Value from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second', Value, 1) +--echo # A few more insertions on the page should not cause a page split. 
+insert into t1 values (81, REPEAT('A', 256)); +insert into t1 values (83, REPEAT('A', 256)); +insert into t1 values (87, REPEAT('A', 256)); +insert into t1 values (82, REPEAT('A', 256)); +insert into t1 values (86, REPEAT('A', 256)); +--let $primary_after = query_get_value(select count(*) as Value from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY', Value, 1) +--let $second_after = query_get_value(select count(*) as Value from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second', Value, 1) +if ($primary_before != $primary_after) { + --echo Insertion caused page split on primary, which should be avoided by innodb_defragment_fill_factor. +} +if ($second_before != $second_after) { + --echo Insertion caused page split on second, which should be avoided by innodb_defragment_fill_factor. +} +--echo # More insertions will cause page splits +insert into t1 values (88, REPEAT('A', 50)); +#insert into t1 values (85, REPEAT('A', 256)); +#insert into t1 values (84, REPEAT('A', 256)); +#insert into t1 values (89, REPEAT('A', 256)); +--let $primary_after = query_get_value(select count(*) as Value from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'PRIMARY', Value, 1) +--let $second_after = query_get_value(select count(*) as Value from information_schema.innodb_buffer_page where table_name like '%t1%' and index_name = 'second', Value, 1) +if ($primary_before == $primary_after) { + --echo Too much space are reserved on primary index. +} +if ($second_before == $second_after) { + --echo Too much space are reserved on second index. +} +DROP TABLE t1; +--echo Testing table with small records +CREATE TABLE t2 (a INT NOT NULL PRIMARY KEY AUTO_INCREMENT, b VARchar(16), KEY SECOND(a,b)) ENGINE=INNODB; +# Populate table. 
+--disable_query_log +INSERT INTO t2 VALUES (1, REPEAT('A', 16)); +INSERT INTO t2 (b) SELECT b from t2; +INSERT INTO t2 (b) SELECT b from t2; +INSERT INTO t2 (b) SELECT b from t2; +INSERT INTO t2 (b) SELECT b from t2; +INSERT INTO t2 (b) SELECT b from t2; +INSERT INTO t2 (b) SELECT b from t2; +INSERT INTO t2 (b) SELECT b from t2; +INSERT INTO t2 (b) SELECT b from t2; +INSERT INTO t2 (b) SELECT b from t2; +INSERT INTO t2 (b) SELECT b from t2; +INSERT INTO t2 (b) SELECT b from t2; +INSERT INTO t2 (b) SELECT b from t2; +--enable_query_log +--disable_query_log +let $size = 40; +while ($size) +{ + let $j = 100 * $size; + eval delete from t2 where a between $j - 20 and $j; + dec $size; +} +--enable_query_log +optimize table t2; +--source include/restart_mysqld.inc +select count(*) from t2 force index(second); +--let $second_before = query_get_value(select count(*) as Value from information_schema.innodb_buffer_page where table_name like '%t2%' and index_name = 'second', Value, 1) +--echo The page should have room for about 20 insertions +insert into t2 values(1181, REPEAT('A', 16)); +insert into t2 values(1191, REPEAT('A', 16)); +insert into t2 values(1182, REPEAT('A', 16)); +insert into t2 values(1192, REPEAT('A', 16)); +insert into t2 values(1183, REPEAT('A', 16)); +insert into t2 values(1193, REPEAT('A', 16)); +insert into t2 values(1184, REPEAT('A', 16)); +insert into t2 values(1194, REPEAT('A', 16)); +insert into t2 values(1185, REPEAT('A', 16)); +insert into t2 values(1195, REPEAT('A', 16)); +insert into t2 values(1186, REPEAT('A', 16)); +insert into t2 values(1196, REPEAT('A', 16)); +insert into t2 values(1187, REPEAT('A', 16)); +insert into t2 values(1197, REPEAT('A', 16)); +insert into t2 values(1188, REPEAT('A', 16)); +insert into t2 values(1198, REPEAT('A', 16)); +insert into t2 values(1189, REPEAT('A', 16)); +insert into t2 values(1199, REPEAT('A', 16)); +insert into t2 values(1190, REPEAT('A', 16)); +insert into t2 values(1180, REPEAT('A', 16)); +--let 
$second_after = query_get_value(select count(*) as Value from information_schema.innodb_buffer_page where table_name like '%t2%' and index_name = 'second', Value, 1) +if ($second_before != $second_after) { + --echo Insertion caused page split on second, which should be avoided by innodb_defragment_fill_factor. +} +--echo More insertions will cause page split. +insert into t2 values(1280, REPEAT('A', 16)); +insert into t2 values(1290, REPEAT('A', 16)); +insert into t2 values(1281, REPEAT('A', 16)); +insert into t2 values(1291, REPEAT('A', 16)); +--let $second_after = query_get_value(select count(*) as Value from information_schema.innodb_buffer_page where table_name like '%t2%' and index_name = 'second', Value, 1) +if ($second_before == $second_after) { + --echo Too much space are reserved on second index. +} +DROP TABLE t2; diff --git a/mysql-test/suite/sys_vars/r/innodb_defragment_basic.result b/mysql-test/suite/sys_vars/r/innodb_defragment_basic.result new file mode 100644 index 00000000000..916bb5ca1a9 --- /dev/null +++ b/mysql-test/suite/sys_vars/r/innodb_defragment_basic.result @@ -0,0 +1,18 @@ +SET @orig = @@global.innodb_defragment; +SELECT @orig; +@orig +0 +SET GLOBAL innodb_defragment = OFF; +SELECT @@global.innodb_defragment; +@@global.innodb_defragment +0 +SET GLOBAL innodb_defragment = ON; +SELECT @@global.innodb_defragment; +@@global.innodb_defragment +1 +SET GLOBAL innodb_defragment = 100; +ERROR 42000: Variable 'innodb_defragment' can't be set to the value of '100' +SELECT @@global.innodb_defragment; +@@global.innodb_defragment +1 +SET GLOBAL innodb_defragment = @orig; diff --git a/mysql-test/suite/sys_vars/r/innodb_defragment_fill_factor_basic.result b/mysql-test/suite/sys_vars/r/innodb_defragment_fill_factor_basic.result new file mode 100644 index 00000000000..93a5af727c3 --- /dev/null +++ b/mysql-test/suite/sys_vars/r/innodb_defragment_fill_factor_basic.result @@ -0,0 +1,37 @@ +SET @start_innodb_defragment_fill_factor = 
@@global.innodb_defragment_fill_factor; +SELECT @start_innodb_defragment_fill_factor; +@start_innodb_defragment_fill_factor +0.9 +SELECT COUNT(@@global.innodb_defragment_fill_factor); +COUNT(@@global.innodb_defragment_fill_factor) +1 +SET @@global.innodb_defragment_fill_factor = 0.77777777777777; +SELECT @@global.innodb_defragment_fill_factor; +@@global.innodb_defragment_fill_factor +0.777778 +SET @@global.innodb_defragment_fill_factor = 1; +SELECT @@global.innodb_defragment_fill_factor; +@@global.innodb_defragment_fill_factor +1.000000 +SET @@global.innodb_defragment_fill_factor = 0.7; +SELECT @@global.innodb_defragment_fill_factor; +@@global.innodb_defragment_fill_factor +0.700000 +SET @@global.innodb_defragment_fill_factor = -1; +Warnings: +Warning 1292 Truncated incorrect innodb_defragment_fill_factor value: '-1' +SELECT @@global.innodb_defragment_fill_factor; +@@global.innodb_defragment_fill_factor +0.700000 +SET @@global.innodb_defragment_fill_factor = 2; +Warnings: +Warning 1292 Truncated incorrect innodb_defragment_fill_factor value: '2' +SELECT @@global.innodb_defragment_fill_factor; +@@global.innodb_defragment_fill_factor +1.000000 +SET @@global.innodb_defragment_fill_factor = "abc"; +ERROR 42000: Incorrect argument type to variable 'innodb_defragment_fill_factor' +SELECT @@global.innodb_defragment_fill_factor; +@@global.innodb_defragment_fill_factor +1.000000 +SET @@global.innodb_defragment_fill_factor = @start_innodb_defragment_fill_factor; diff --git a/mysql-test/suite/sys_vars/r/innodb_defragment_fill_factor_n_recs_basic.result b/mysql-test/suite/sys_vars/r/innodb_defragment_fill_factor_n_recs_basic.result new file mode 100644 index 00000000000..ffbeb39fe33 --- /dev/null +++ b/mysql-test/suite/sys_vars/r/innodb_defragment_fill_factor_n_recs_basic.result @@ -0,0 +1,42 @@ +SET @start_innodb_defragment_fill_factor_n_recs = @@global.innodb_defragment_fill_factor_n_recs; +SELECT @start_innodb_defragment_fill_factor_n_recs; 
+@start_innodb_defragment_fill_factor_n_recs +20 +SELECT COUNT(@@global.innodb_defragment_fill_factor_n_recs); +COUNT(@@global.innodb_defragment_fill_factor_n_recs) +1 +SET @@global.innodb_defragment_fill_factor_n_recs = 50; +SELECT @@global.innodb_defragment_fill_factor_n_recs; +@@global.innodb_defragment_fill_factor_n_recs +50 +SET @@global.innodb_defragment_fill_factor_n_recs = 100; +SELECT @@global.innodb_defragment_fill_factor_n_recs; +@@global.innodb_defragment_fill_factor_n_recs +100 +SET @@global.innodb_defragment_fill_factor_n_recs = 1; +SELECT @@global.innodb_defragment_fill_factor_n_recs; +@@global.innodb_defragment_fill_factor_n_recs +1 +SET @@global.innodb_defragment_fill_factor_n_recs = -1; +Warnings: +Warning 1292 Truncated incorrect innodb_defragment_fill_factor_n_ value: '-1' +SELECT @@global.innodb_defragment_fill_factor_n_recs; +@@global.innodb_defragment_fill_factor_n_recs +1 +SET @@global.innodb_defragment_fill_factor_n_recs = 10000; +Warnings: +Warning 1292 Truncated incorrect innodb_defragment_fill_factor_n_ value: '10000' +SELECT @@global.innodb_defragment_fill_factor_n_recs; +@@global.innodb_defragment_fill_factor_n_recs +100 +SET @@global.innodb_defragment_fill_factor_n_recs = 10.5; +ERROR 42000: Incorrect argument type to variable 'innodb_defragment_fill_factor_n_recs' +SELECT @@global.innodb_defragment_fill_factor_n_recs; +@@global.innodb_defragment_fill_factor_n_recs +100 +SET @@global.innodb_defragment_fill_factor_n_recs = "abc"; +ERROR 42000: Incorrect argument type to variable 'innodb_defragment_fill_factor_n_recs' +SELECT @@global.innodb_defragment_fill_factor_n_recs; +@@global.innodb_defragment_fill_factor_n_recs +100 +SET @@global.innodb_defragment_fill_factor_n_recs = @start_innodb_defragment_fill_factor_n_recs; diff --git a/mysql-test/suite/sys_vars/r/innodb_defragment_frequency_basic.result b/mysql-test/suite/sys_vars/r/innodb_defragment_frequency_basic.result new file mode 100644 index 00000000000..d4314d6506e --- /dev/null 
+++ b/mysql-test/suite/sys_vars/r/innodb_defragment_frequency_basic.result @@ -0,0 +1,42 @@ +SET @start_innodb_defragment_frequency = @@global.innodb_defragment_frequency; +SELECT @start_innodb_defragment_frequency; +@start_innodb_defragment_frequency +40 +SELECT COUNT(@@global.innodb_defragment_frequency); +COUNT(@@global.innodb_defragment_frequency) +1 +SET @@global.innodb_defragment_frequency = 200; +SELECT @@global.innodb_defragment_frequency; +@@global.innodb_defragment_frequency +200 +SET @@global.innodb_defragment_frequency = 1; +SELECT @@global.innodb_defragment_frequency; +@@global.innodb_defragment_frequency +1 +SET @@global.innodb_defragment_frequency = 1000; +SELECT @@global.innodb_defragment_frequency; +@@global.innodb_defragment_frequency +1000 +SET @@global.innodb_defragment_frequency = -1; +Warnings: +Warning 1292 Truncated incorrect innodb_defragment_frequency value: '-1' +SELECT @@global.innodb_defragment_frequency; +@@global.innodb_defragment_frequency +1 +SET @@global.innodb_defragment_frequency = 10000; +Warnings: +Warning 1292 Truncated incorrect innodb_defragment_frequency value: '10000' +SELECT @@global.innodb_defragment_frequency; +@@global.innodb_defragment_frequency +1000 +SET @@global.innodb_defragment_frequency = 10.5; +ERROR 42000: Incorrect argument type to variable 'innodb_defragment_frequency' +SELECT @@global.innodb_defragment_frequency; +@@global.innodb_defragment_frequency +1000 +SET @@global.innodb_defragment_frequency = "abc"; +ERROR 42000: Incorrect argument type to variable 'innodb_defragment_frequency' +SELECT @@global.innodb_defragment_frequency; +@@global.innodb_defragment_frequency +1000 +SET @@global.innodb_defragment_frequency = @start_innodb_defragment_frequency; diff --git a/mysql-test/suite/sys_vars/r/innodb_defragment_n_pages_basic.result b/mysql-test/suite/sys_vars/r/innodb_defragment_n_pages_basic.result new file mode 100644 index 00000000000..99b68b39ec4 --- /dev/null +++ 
b/mysql-test/suite/sys_vars/r/innodb_defragment_n_pages_basic.result @@ -0,0 +1,28 @@ +SET @start_innodb_defragment_n_pages = @@global.innodb_defragment_n_pages; +SELECT @start_innodb_defragment_n_pages; +@start_innodb_defragment_n_pages +7 +SELECT COUNT(@@global.innodb_defragment_n_pages); +COUNT(@@global.innodb_defragment_n_pages) +1 +SET @@global.innodb_defragment_n_pages = 1; +Warnings: +Warning 1292 Truncated incorrect innodb_defragment_n_pages value: '1' +SELECT @@global.innodb_defragment_n_pages; +@@global.innodb_defragment_n_pages +2 +SET @@global.innodb_defragment_n_pages = 2; +SELECT @@global.innodb_defragment_n_pages; +@@global.innodb_defragment_n_pages +2 +SET @@global.innodb_defragment_n_pages = 32; +SELECT @@global.innodb_defragment_n_pages; +@@global.innodb_defragment_n_pages +32 +SET @@global.innodb_defragment_n_pages = 64; +Warnings: +Warning 1292 Truncated incorrect innodb_defragment_n_pages value: '64' +SELECT @@global.innodb_defragment_n_pages; +@@global.innodb_defragment_n_pages +32 +SET @@global.innodb_defragment_n_pages = @start_innodb_defragment_n_pages; diff --git a/mysql-test/suite/sys_vars/r/innodb_defragment_stats_accuracy_basic.result b/mysql-test/suite/sys_vars/r/innodb_defragment_stats_accuracy_basic.result new file mode 100644 index 00000000000..025dacdb1ec --- /dev/null +++ b/mysql-test/suite/sys_vars/r/innodb_defragment_stats_accuracy_basic.result @@ -0,0 +1,33 @@ +SET @start_innodb_defragment_stats_accuracy = @@global.innodb_defragment_stats_accuracy; +SELECT @start_innodb_defragment_stats_accuracy; +@start_innodb_defragment_stats_accuracy +0 +SELECT COUNT(@@global.innodb_defragment_stats_accuracy); +COUNT(@@global.innodb_defragment_stats_accuracy) +1 +SET @@global.innodb_defragment_stats_accuracy = 1; +SELECT @@global.innodb_defragment_stats_accuracy; +@@global.innodb_defragment_stats_accuracy +1 +SET @@global.innodb_defragment_stats_accuracy = 1000; +SELECT @@global.innodb_defragment_stats_accuracy; 
+@@global.innodb_defragment_stats_accuracy +1000 +SET @@global.innodb_defragment_stats_accuracy = -1; +Warnings: +Warning 1292 Truncated incorrect innodb_defragment_stats_accuracy value: '-1' +SELECT @@global.innodb_defragment_stats_accuracy; +@@global.innodb_defragment_stats_accuracy +0 +SET @@global.innodb_defragment_stats_accuracy = 1000000000000; +Warnings: +Warning 1292 Truncated incorrect innodb_defragment_stats_accuracy value: '1000000000000' +SELECT @@global.innodb_defragment_stats_accuracy; +@@global.innodb_defragment_stats_accuracy +4294967295 +SET @@global.innodb_defragment_stats_accuracy = "abc"; +ERROR 42000: Incorrect argument type to variable 'innodb_defragment_stats_accuracy' +SELECT @@global.innodb_defragment_stats_accuracy; +@@global.innodb_defragment_stats_accuracy +4294967295 +SET @@global.innodb_defragment_stats_accuracy = @start_innodb_defragment_stats_accuracy; diff --git a/mysql-test/suite/sys_vars/t/innodb_defragment_basic.test b/mysql-test/suite/sys_vars/t/innodb_defragment_basic.test new file mode 100644 index 00000000000..9667f63f687 --- /dev/null +++ b/mysql-test/suite/sys_vars/t/innodb_defragment_basic.test @@ -0,0 +1,20 @@ +-- source include/have_innodb.inc + +# Check the default value +SET @orig = @@global.innodb_defragment; +SELECT @orig; + +# Turn off +SET GLOBAL innodb_defragment = OFF; +SELECT @@global.innodb_defragment; + +# Turn on +SET GLOBAL innodb_defragment = ON; +SELECT @@global.innodb_defragment; + +# Wrong value +--error ER_WRONG_VALUE_FOR_VAR +SET GLOBAL innodb_defragment = 100; +SELECT @@global.innodb_defragment; + +SET GLOBAL innodb_defragment = @orig; diff --git a/mysql-test/suite/sys_vars/t/innodb_defragment_fill_factor_basic.test b/mysql-test/suite/sys_vars/t/innodb_defragment_fill_factor_basic.test new file mode 100644 index 00000000000..ae9863e806a --- /dev/null +++ b/mysql-test/suite/sys_vars/t/innodb_defragment_fill_factor_basic.test @@ -0,0 +1,27 @@ +--source include/have_innodb.inc + +SET 
@start_innodb_defragment_fill_factor = @@global.innodb_defragment_fill_factor; +SELECT @start_innodb_defragment_fill_factor; + +SELECT COUNT(@@global.innodb_defragment_fill_factor); + +SET @@global.innodb_defragment_fill_factor = 0.77777777777777; +SELECT @@global.innodb_defragment_fill_factor; + +SET @@global.innodb_defragment_fill_factor = 1; +SELECT @@global.innodb_defragment_fill_factor; + +SET @@global.innodb_defragment_fill_factor = 0.7; +SELECT @@global.innodb_defragment_fill_factor; + +SET @@global.innodb_defragment_fill_factor = -1; +SELECT @@global.innodb_defragment_fill_factor; + +SET @@global.innodb_defragment_fill_factor = 2; +SELECT @@global.innodb_defragment_fill_factor; + +--Error ER_WRONG_TYPE_FOR_VAR +SET @@global.innodb_defragment_fill_factor = "abc"; +SELECT @@global.innodb_defragment_fill_factor; + +SET @@global.innodb_defragment_fill_factor = @start_innodb_defragment_fill_factor; diff --git a/mysql-test/suite/sys_vars/t/innodb_defragment_fill_factor_n_recs_basic.test b/mysql-test/suite/sys_vars/t/innodb_defragment_fill_factor_n_recs_basic.test new file mode 100644 index 00000000000..366817c4bbc --- /dev/null +++ b/mysql-test/suite/sys_vars/t/innodb_defragment_fill_factor_n_recs_basic.test @@ -0,0 +1,31 @@ +--source include/have_innodb.inc + +SET @start_innodb_defragment_fill_factor_n_recs = @@global.innodb_defragment_fill_factor_n_recs; +SELECT @start_innodb_defragment_fill_factor_n_recs; + +SELECT COUNT(@@global.innodb_defragment_fill_factor_n_recs); + +SET @@global.innodb_defragment_fill_factor_n_recs = 50; +SELECT @@global.innodb_defragment_fill_factor_n_recs; + +SET @@global.innodb_defragment_fill_factor_n_recs = 100; +SELECT @@global.innodb_defragment_fill_factor_n_recs; + +SET @@global.innodb_defragment_fill_factor_n_recs = 1; +SELECT @@global.innodb_defragment_fill_factor_n_recs; + +SET @@global.innodb_defragment_fill_factor_n_recs = -1; +SELECT @@global.innodb_defragment_fill_factor_n_recs; + +SET 
@@global.innodb_defragment_fill_factor_n_recs = 10000; +SELECT @@global.innodb_defragment_fill_factor_n_recs; + +--Error ER_WRONG_TYPE_FOR_VAR +SET @@global.innodb_defragment_fill_factor_n_recs = 10.5; +SELECT @@global.innodb_defragment_fill_factor_n_recs; + +--Error ER_WRONG_TYPE_FOR_VAR +SET @@global.innodb_defragment_fill_factor_n_recs = "abc"; +SELECT @@global.innodb_defragment_fill_factor_n_recs; + +SET @@global.innodb_defragment_fill_factor_n_recs = @start_innodb_defragment_fill_factor_n_recs; diff --git a/mysql-test/suite/sys_vars/t/innodb_defragment_frequency_basic.test b/mysql-test/suite/sys_vars/t/innodb_defragment_frequency_basic.test new file mode 100644 index 00000000000..3ab45744a9c --- /dev/null +++ b/mysql-test/suite/sys_vars/t/innodb_defragment_frequency_basic.test @@ -0,0 +1,37 @@ +--source include/have_innodb.inc + +SET @start_innodb_defragment_frequency = @@global.innodb_defragment_frequency; +SELECT @start_innodb_defragment_frequency; + +SELECT COUNT(@@global.innodb_defragment_frequency); + +# test valid value +SET @@global.innodb_defragment_frequency = 200; +SELECT @@global.innodb_defragment_frequency; + +# test valid min +SET @@global.innodb_defragment_frequency = 1; +SELECT @@global.innodb_defragment_frequency; + +# test valid max +SET @@global.innodb_defragment_frequency = 1000; +SELECT @@global.innodb_defragment_frequency; + +# test invalid value < min +SET @@global.innodb_defragment_frequency = -1; +SELECT @@global.innodb_defragment_frequency; + +# test invalid value > max +SET @@global.innodb_defragment_frequency = 10000; +SELECT @@global.innodb_defragment_frequency; + +# test wrong type +--Error ER_WRONG_TYPE_FOR_VAR +SET @@global.innodb_defragment_frequency = 10.5; +SELECT @@global.innodb_defragment_frequency; + +--Error ER_WRONG_TYPE_FOR_VAR +SET @@global.innodb_defragment_frequency = "abc"; +SELECT @@global.innodb_defragment_frequency; + +SET @@global.innodb_defragment_frequency = @start_innodb_defragment_frequency; diff --git 
a/mysql-test/suite/sys_vars/t/innodb_defragment_n_pages_basic.test b/mysql-test/suite/sys_vars/t/innodb_defragment_n_pages_basic.test new file mode 100644 index 00000000000..64aa20a615f --- /dev/null +++ b/mysql-test/suite/sys_vars/t/innodb_defragment_n_pages_basic.test @@ -0,0 +1,22 @@ +--source include/have_innodb.inc + +SET @start_innodb_defragment_n_pages = @@global.innodb_defragment_n_pages; +SELECT @start_innodb_defragment_n_pages; + +SELECT COUNT(@@global.innodb_defragment_n_pages); + +SET @@global.innodb_defragment_n_pages = 1; +SELECT @@global.innodb_defragment_n_pages; + +SET @@global.innodb_defragment_n_pages = 2; +SELECT @@global.innodb_defragment_n_pages; + +SET @@global.innodb_defragment_n_pages = 32; +SELECT @@global.innodb_defragment_n_pages; + +SET @@global.innodb_defragment_n_pages = 64; +SELECT @@global.innodb_defragment_n_pages; + +SET @@global.innodb_defragment_n_pages = @start_innodb_defragment_n_pages; + + diff --git a/mysql-test/suite/sys_vars/t/innodb_defragment_stats_accuracy_basic.test b/mysql-test/suite/sys_vars/t/innodb_defragment_stats_accuracy_basic.test new file mode 100644 index 00000000000..062753f27ea --- /dev/null +++ b/mysql-test/suite/sys_vars/t/innodb_defragment_stats_accuracy_basic.test @@ -0,0 +1,24 @@ +--source include/have_innodb.inc + +SET @start_innodb_defragment_stats_accuracy = @@global.innodb_defragment_stats_accuracy; +SELECT @start_innodb_defragment_stats_accuracy; + +SELECT COUNT(@@global.innodb_defragment_stats_accuracy); + +SET @@global.innodb_defragment_stats_accuracy = 1; +SELECT @@global.innodb_defragment_stats_accuracy; + +SET @@global.innodb_defragment_stats_accuracy = 1000; +SELECT @@global.innodb_defragment_stats_accuracy; + +SET @@global.innodb_defragment_stats_accuracy = -1; +SELECT @@global.innodb_defragment_stats_accuracy; + +SET @@global.innodb_defragment_stats_accuracy = 1000000000000; +SELECT @@global.innodb_defragment_stats_accuracy; + +--Error ER_WRONG_TYPE_FOR_VAR +SET 
@@global.innodb_defragment_stats_accuracy = "abc"; +SELECT @@global.innodb_defragment_stats_accuracy; + +SET @@global.innodb_defragment_stats_accuracy = @start_innodb_defragment_stats_accuracy; diff --git a/mysql-test/t/lock_sync-master.opt b/mysql-test/t/lock_sync-master.opt index 96f0ce3f36c..a6700b8d18e 100644 --- a/mysql-test/t/lock_sync-master.opt +++ b/mysql-test/t/lock_sync-master.opt @@ -1 +1,2 @@ --default-storage-engine=MyISAM +--innodb-defragment=0 diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt index 622fff87536..e783f3e6459 100644 --- a/storage/innobase/CMakeLists.txt +++ b/storage/innobase/CMakeLists.txt @@ -285,6 +285,7 @@ SET(INNOBASE_SOURCES btr/btr0cur.cc btr/btr0pcur.cc btr/btr0sea.cc + btr/btr0defragment.cc buf/buf0buddy.cc buf/buf0buf.cc buf/buf0dblwr.cc @@ -395,7 +396,8 @@ SET(INNOBASE_SOURCES ut/ut0rnd.cc ut/ut0ut.cc ut/ut0vec.cc - ut/ut0wqueue.cc) + ut/ut0wqueue.cc + ut/ut0timer.cc) IF(WITH_INNODB) # Legacy option diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc index 104c2f00ef6..4f9ccbe061a 100644 --- a/storage/innobase/btr/btr0btr.cc +++ b/storage/innobase/btr/btr0btr.cc @@ -38,6 +38,7 @@ Created 6/2/1994 Heikki Tuuri #include "btr0cur.h" #include "btr0sea.h" #include "btr0pcur.h" +#include "btr0defragment.h" #include "rem0cmp.h" #include "lock0lock.h" #include "ibuf0ibuf.h" @@ -1193,6 +1194,32 @@ btr_get_size( mtr_t* mtr) /*!< in/out: mini-transaction where index is s-latched */ { + ulint used; + if (flag == BTR_N_LEAF_PAGES) { + btr_get_size_and_reserved(index, flag, &used, mtr); + return used; + } else if (flag == BTR_TOTAL_SIZE) { + return btr_get_size_and_reserved(index, flag, &used, mtr); + } else { + ut_error; + } + return (ULINT_UNDEFINED); +} + +/**************************************************************//** +Gets the number of reserved and used pages in a B-tree. 
+@return number of pages reserved, or ULINT_UNDEFINED if the index +is unavailable */ +UNIV_INTERN +ulint +btr_get_size_and_reserved( +/*======================*/ + dict_index_t* index, /*!< in: index */ + ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */ + ulint* used, /*!< out: number of pages used (<= reserved) */ + mtr_t* mtr) /*!< in/out: mini-transaction where index + is s-latched */ +{ fseg_header_t* seg_header; page_t* root; ulint n; @@ -1201,6 +1228,8 @@ btr_get_size( ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), MTR_MEMO_S_LOCK)); + ut_a(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE); + if (index->page == FIL_NULL || dict_index_is_online_ddl(index) || *index->name == TEMP_INDEX_PREFIX) { return(ULINT_UNDEFINED); @@ -1208,21 +1237,16 @@ btr_get_size( root = btr_root_get(index, mtr); - if (flag == BTR_N_LEAF_PAGES) { - seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; + seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; - fseg_n_reserved_pages(seg_header, &n, mtr); + n = fseg_n_reserved_pages(seg_header, used, mtr); - } else if (flag == BTR_TOTAL_SIZE) { + if (flag == BTR_TOTAL_SIZE) { seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP; - n = fseg_n_reserved_pages(seg_header, &dummy, mtr); - - seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; - n += fseg_n_reserved_pages(seg_header, &dummy, mtr); - } else { - ut_error; + *used += dummy; + } return(n); @@ -1971,7 +1995,7 @@ IBUF_BITMAP_FREE is unaffected by reorganization. @retval true if the operation was successful @retval false if it is a compressed page, and recompression failed */ -static __attribute__((nonnull)) +UNIV_INTERN bool btr_page_reorganize_block( /*======================*/ @@ -2923,6 +2947,12 @@ func_start: new_page_zip = buf_block_get_page_zip(new_block); btr_page_create(new_block, new_page_zip, cursor->index, btr_page_get_level(page, mtr), mtr); + /* Only record the leaf level page splits. 
*/ + if (btr_page_get_level(page, mtr) == 0) { + cursor->index->stat_defrag_n_page_split ++; + cursor->index->stat_defrag_modified_counter ++; + btr_defragment_save_defrag_stats_if_needed(cursor->index); + } /* 3. Calculate the first record on the upper half-page, and the first record (move_limit) on original page which ends up on the @@ -3181,31 +3211,9 @@ func_exit: return(rec); } -#ifdef UNIV_SYNC_DEBUG -/*************************************************************//** -Removes a page from the level list of pages. -@param space in: space where removed -@param zip_size in: compressed page size in bytes, or 0 for uncompressed -@param page in/out: page to remove -@param index in: index tree -@param mtr in/out: mini-transaction */ -# define btr_level_list_remove(space,zip_size,page,index,mtr) \ - btr_level_list_remove_func(space,zip_size,page,index,mtr) -#else /* UNIV_SYNC_DEBUG */ -/*************************************************************//** -Removes a page from the level list of pages. -@param space in: space where removed -@param zip_size in: compressed page size in bytes, or 0 for uncompressed -@param page in/out: page to remove -@param index in: index tree -@param mtr in/out: mini-transaction */ -# define btr_level_list_remove(space,zip_size,page,index,mtr) \ - btr_level_list_remove_func(space,zip_size,page,mtr) -#endif /* UNIV_SYNC_DEBUG */ - /*************************************************************//** Removes a page from the level list of pages. */ -static __attribute__((nonnull)) +UNIV_INTERN void btr_level_list_remove_func( /*=======================*/ @@ -3377,7 +3385,7 @@ btr_node_ptr_delete( If page is the only on its level, this function moves its records to the father page, thus reducing the tree height. 
@return father block */ -static +UNIV_INTERN buf_block_t* btr_lift_page_up( /*=============*/ diff --git a/storage/innobase/btr/btr0defragment.cc b/storage/innobase/btr/btr0defragment.cc new file mode 100644 index 00000000000..e315a291359 --- /dev/null +++ b/storage/innobase/btr/btr0defragment.cc @@ -0,0 +1,814 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014 Facebook, Inc. All Rights Reserved. +Copyright (C) 2014, SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ +/**************************************************//** +@file btr/btr0defragment.cc +Index defragmentation. 
+ +Created 05/29/2014 Rongrong Zhong +Modified 16/07/2014 Sunguck Lee +Modified 30/07/2014 Jan Lindström jan.lindstrom@skysql.com +*******************************************************/ + +#include "btr0defragment.h" +#ifndef UNIV_HOTBACKUP +#include "btr0cur.h" +#include "btr0sea.h" +#include "btr0pcur.h" +#include "dict0stats.h" +#include "dict0stats_bg.h" +#include "ibuf0ibuf.h" +#include "lock0lock.h" +#include "srv0start.h" +#include "ut0timer.h" + +#include <list> + +/**************************************************//** +Custom nullptr implementation for under g++ 4.6 +*******************************************************/ +// #pragma once +namespace std +{ + // based on SC22/WG21/N2431 = J16/07-0301 + struct nullptr_t + { + template<typename any> operator any * () const + { + return 0; + } + template<class any, typename T> operator T any:: * () const + { + return 0; + } + +#ifdef _MSC_VER + struct pad {}; + pad __[sizeof(void*)/sizeof(pad)]; +#else + char __[sizeof(void*)]; +#endif +private: + // nullptr_t();// {} + // nullptr_t(const nullptr_t&); + // void operator = (const nullptr_t&); + void operator &() const; + template<typename any> void operator +(any) const + { + /*I Love MSVC 2005!*/ + } + template<typename any> void operator -(any) const + { + /*I Love MSVC 2005!*/ + } + }; +static const nullptr_t __nullptr = {}; +} + +#ifndef nullptr +#define nullptr std::__nullptr +#endif +/**************************************************//** +End of Custom nullptr implementation for under g++ 4.6 +*******************************************************/ + +/* When there's no work, either because defragment is disabled, or because no +query is submitted, thread checks state every BTR_DEFRAGMENT_SLEEP_IN_USECS.*/ +#define BTR_DEFRAGMENT_SLEEP_IN_USECS 1000000 +/* Reduce the target page size by this amount when compression failure happens +during defragmentaiton. 512 is chosen because it's a power of 2 and it is about +3% of the page size. 
When there are compression failures in defragmentation, +our goal is to get a decent defrag ratio with as few compression failure as +possible. From experimentation it seems that reduce the target size by 512 every +time will make sure the page is compressible within a couple of iterations. */ +#define BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE 512 + +/* Work queue for defragmentation. */ +typedef std::list<btr_defragment_item_t*> btr_defragment_wq_t; +static btr_defragment_wq_t btr_defragment_wq; + +/* Mutex protecting the defragmentation work queue.*/ +ib_mutex_t btr_defragment_mutex; +#ifdef UNIV_PFS_MUTEX +UNIV_INTERN mysql_pfs_key_t btr_defragment_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +/* Number of compression failures caused by defragmentation since server +start. */ +ulint btr_defragment_compression_failures = 0; +/* Number of btr_defragment_n_pages calls that altered page but didn't +manage to release any page. */ +ulint btr_defragment_failures = 0; +/* Total number of btr_defragment_n_pages calls that altered page. +The difference between btr_defragment_count and btr_defragment_failures shows +the amount of effort wasted. */ +ulint btr_defragment_count = 0; + +/******************************************************************//** +Constructor for btr_defragment_item_t. */ +btr_defragment_item_t::btr_defragment_item_t( + btr_pcur_t* pcur, + os_event_t event) +{ + this->pcur = pcur; + this->event = event; + this->removed = false; + this->last_processed = 0; +} + +/******************************************************************//** +Destructor for btr_defragment_item_t. */ +btr_defragment_item_t::~btr_defragment_item_t() { + if (this->pcur) { + btr_pcur_free_for_mysql(this->pcur); + } + if (this->event) { + os_event_set(this->event); + } +} + +/******************************************************************//** +Initialize defragmentation. 
*/ +void +btr_defragment_init() +{ + srv_defragment_interval = ut_microseconds_to_timer( + 1000000.0 / srv_defragment_frequency); + mutex_create(btr_defragment_mutex_key, &btr_defragment_mutex, + SYNC_ANY_LATCH); + os_thread_create(btr_defragment_thread, NULL, NULL); +} + +/******************************************************************//** +Shutdown defragmentation. Release all resources. */ +void +btr_defragment_shutdown() +{ + mutex_enter(&btr_defragment_mutex); + list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + while(iter != btr_defragment_wq.end()) { + btr_defragment_item_t* item = *iter; + iter = btr_defragment_wq.erase(iter); + delete item; + } + mutex_exit(&btr_defragment_mutex); + mutex_free(&btr_defragment_mutex); +} + + +/******************************************************************//** +Functions used by the query threads: btr_defragment_xxx_index +Query threads find/add/remove index. */ +/******************************************************************//** +Check whether the given index is in btr_defragment_wq. We use index->id +to identify indices. */ +bool +btr_defragment_find_index( + dict_index_t* index) /*!< Index to find. */ +{ + mutex_enter(&btr_defragment_mutex); + for (list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + iter != btr_defragment_wq.end(); + ++iter) { + btr_defragment_item_t* item = *iter; + btr_pcur_t* pcur = item->pcur; + btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur); + dict_index_t* idx = btr_cur_get_index(cursor); + if (index->id == idx->id) { + mutex_exit(&btr_defragment_mutex); + return true; + } + } + mutex_exit(&btr_defragment_mutex); + return false; +} + +/******************************************************************//** +Query thread uses this function to add an index to btr_defragment_wq. +Return a pointer to os_event for the query thread to wait on if this is a +synchronized defragmentation. 
*/ +os_event_t +btr_defragment_add_index( + dict_index_t* index, /*!< index to be added */ + bool async) /*!< whether this is an async defragmentation */ +{ + mtr_t mtr; + ulint space = dict_index_get_space(index); + ulint zip_size = dict_table_zip_size(index->table); + ulint page_no = dict_index_get_page(index); + mtr_start(&mtr); + // Load index rood page. + page_t* page = btr_page_get(space, zip_size, page_no, + RW_NO_LATCH, index, &mtr); + if (btr_page_get_level(page, &mtr) == 0) { + // Index root is a leaf page, no need to defragment. + mtr_commit(&mtr); + return NULL; + } + btr_pcur_t* pcur = btr_pcur_create_for_mysql(); + os_event_t event = NULL; + if (!async) { + event = os_event_create(); + } + btr_pcur_open_at_index_side(true, index, BTR_SEARCH_LEAF, pcur, + true, 0, &mtr); + btr_pcur_move_to_next(pcur, &mtr); + btr_pcur_store_position(pcur, &mtr); + mtr_commit(&mtr); + dict_stats_empty_defrag_summary(index); + btr_defragment_item_t* item = new btr_defragment_item_t(pcur, event); + mutex_enter(&btr_defragment_mutex); + btr_defragment_wq.push_back(item); + mutex_exit(&btr_defragment_mutex); + return event; +} + +/******************************************************************//** +When table is dropped, this function is called to mark a table as removed in +btr_efragment_wq. The difference between this function and the remove_index +function is this will not NULL the event. */ +void +btr_defragment_remove_table( + dict_table_t* table) /*!< Index to be removed. 
*/ +{ + mutex_enter(&btr_defragment_mutex); + for (list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + iter != btr_defragment_wq.end(); + ++iter) { + btr_defragment_item_t* item = *iter; + btr_pcur_t* pcur = item->pcur; + btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur); + dict_index_t* idx = btr_cur_get_index(cursor); + if (table->id == idx->table->id) { + item->removed = true; + } + } + mutex_exit(&btr_defragment_mutex); +} + +/******************************************************************//** +Query thread uses this function to mark an index as removed in +btr_efragment_wq. */ +void +btr_defragment_remove_index( + dict_index_t* index) /*!< Index to be removed. */ +{ + mutex_enter(&btr_defragment_mutex); + for (list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + iter != btr_defragment_wq.end(); + ++iter) { + btr_defragment_item_t* item = *iter; + btr_pcur_t* pcur = item->pcur; + btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur); + dict_index_t* idx = btr_cur_get_index(cursor); + if (index->id == idx->id) { + item->removed = true; + item->event = NULL; + break; + } + } + mutex_exit(&btr_defragment_mutex); +} + +/******************************************************************//** +Functions used by defragmentation thread: btr_defragment_xxx_item. +Defragmentation thread operates on the work *item*. It gets/removes +item from the work queue. */ +/******************************************************************//** +Defragment thread uses this to remove an item from btr_defragment_wq. +When an item is removed from the work queue, all resources associated with it +are free as well. */ +void +btr_defragment_remove_item( + btr_defragment_item_t* item) /*!< Item to be removed. 
*/ +{ + mutex_enter(&btr_defragment_mutex); + for (list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + iter != btr_defragment_wq.end(); + ++iter) { + if (item == *iter) { + btr_defragment_wq.erase(iter); + delete item; + break; + } + } + mutex_exit(&btr_defragment_mutex); +} + +/******************************************************************//** +Defragment thread uses this to get an item from btr_defragment_wq to work on. +The item is not removed from the work queue so query threads can still access +this item. We keep it this way so query threads can find and kill a +defragmentation even if that index is being worked on. Be aware that while you +work on this item you have no lock protection on it whatsoever. This is OK as +long as the query threads and defragment thread won't modify the same fields +without lock protection. +*/ +btr_defragment_item_t* +btr_defragment_get_item() +{ + if (btr_defragment_wq.empty()) { + return nullptr; + } + mutex_enter(&btr_defragment_mutex); + list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + if (iter == btr_defragment_wq.end()) { + iter = btr_defragment_wq.begin(); + } + btr_defragment_item_t* item = *iter; + iter++; + mutex_exit(&btr_defragment_mutex); + return item; +} + +/*********************************************************************//** +Check whether we should save defragmentation statistics to persistent storage. +Currently we save the stats to persistent storage every 100 updates. 
*/ +UNIV_INTERN +void +btr_defragment_save_defrag_stats_if_needed( + dict_index_t* index) /*!< in: index */ +{ + if (srv_defragment_stats_accuracy != 0 // stats tracking disabled + && dict_index_get_space(index) != 0 // do not track system tables + && index->stat_defrag_modified_counter + >= srv_defragment_stats_accuracy) { + dict_stats_defrag_pool_add(index); + index->stat_defrag_modified_counter = 0; + } +} + +/*********************************************************************//** +Main defragment functionalities used by defragment thread.*/ +/*************************************************************//** +Calculate number of records from beginning of block that can +fit into size_limit +@return number of records */ +UNIV_INTERN +ulint +btr_defragment_calc_n_recs_for_size( + buf_block_t* block, /*!< in: B-tree page */ + dict_index_t* index, /*!< in: index of the page */ + ulint size_limit, /*!< in: size limit to fit records in */ + ulint* n_recs_size) /*!< out: actual size of the records that fit + in size_limit. */ +{ + page_t* page = buf_block_get_frame(block); + ulint n_recs = 0; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + mem_heap_t* heap = NULL; + ulint size = 0; + page_cur_t cur; + + page_cur_set_before_first(block, &cur); + page_cur_move_to_next(&cur); + while (page_cur_get_rec(&cur) != page_get_supremum_rec(page)) { + rec_t* cur_rec = page_cur_get_rec(&cur); + offsets = rec_get_offsets(cur_rec, index, offsets, + ULINT_UNDEFINED, &heap); + ulint rec_size = rec_offs_size(offsets); + size += rec_size; + if (size > size_limit) { + size = size - rec_size; + break; + } + n_recs ++; + page_cur_move_to_next(&cur); + } + *n_recs_size = size; + return n_recs; +} + +/*************************************************************//** +Merge as many records from the from_block to the to_block. Delete +the from_block if all records are successfully merged to to_block. 
+@return the to_block to target for next merge operation. */ +UNIV_INTERN +buf_block_t* +btr_defragment_merge_pages( + dict_index_t* index, /*!< in: index tree */ + buf_block_t* from_block, /*!< in: origin of merge */ + buf_block_t* to_block, /*!< in: destination of merge */ + ulint zip_size, /*!< in: zip size of the block */ + ulint reserved_space, /*!< in: space reserved for future + insert to avoid immediate page split */ + ulint* max_data_size, /*!< in/out: max data size to + fit in a single compressed page. */ + mem_heap_t* heap, /*!< in/out: pointer to memory heap */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + page_t* from_page = buf_block_get_frame(from_block); + page_t* to_page = buf_block_get_frame(to_block); + ulint space = dict_index_get_space(index); + ulint level = btr_page_get_level(from_page, mtr); + ulint n_recs = page_get_n_recs(from_page); + ulint new_data_size = page_get_data_size(to_page); + ulint max_ins_size = + page_get_max_insert_size(to_page, n_recs); + ulint max_ins_size_reorg = + page_get_max_insert_size_after_reorganize( + to_page, n_recs); + ulint max_ins_size_to_use = max_ins_size_reorg > reserved_space + ? max_ins_size_reorg - reserved_space : 0; + ulint move_size = 0; + ulint n_recs_to_move = 0; + rec_t* rec = NULL; + ulint target_n_recs = 0; + rec_t* orig_pred; + + // Estimate how many records can be moved from the from_page to + // the to_page. + if (zip_size) { + ulint page_diff = UNIV_PAGE_SIZE - *max_data_size; + max_ins_size_to_use = (max_ins_size_to_use > page_diff) + ? max_ins_size_to_use - page_diff : 0; + } + n_recs_to_move = btr_defragment_calc_n_recs_for_size( + from_block, index, max_ins_size_to_use, &move_size); + + // If max_ins_size >= move_size, we can move the records without + // reorganizing the page, otherwise we need to reorganize the page + // first to release more space. 
+ if (move_size > max_ins_size) { + if (!btr_page_reorganize_block(false, page_zip_level, + to_block, index, + mtr)) { + if (!dict_index_is_clust(index) + && page_is_leaf(to_page)) { + ibuf_reset_free_bits(to_block); + } + // If reorganization fails, that means page is + // not compressable. There's no point to try + // merging into this page. Continue to the + // next page. + return from_block; + } + ut_ad(page_validate(to_page, index)); + max_ins_size = page_get_max_insert_size(to_page, n_recs); + ut_a(max_ins_size >= move_size); + } + + // Move records to pack to_page more full. + orig_pred = NULL; + target_n_recs = n_recs_to_move; + while (n_recs_to_move > 0) { + rec = page_rec_get_nth(from_page, + n_recs_to_move + 1); + orig_pred = page_copy_rec_list_start( + to_block, from_block, rec, index, mtr); + if (orig_pred) + break; + // If we reach here, that means compression failed after packing + // n_recs_to_move number of records to to_page. We try to reduce + // the targeted data size on the to_page by + // BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE and try again. + os_atomic_increment_ulint( + &btr_defragment_compression_failures, 1); + max_ins_size_to_use = + move_size > BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE + ? move_size - BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE + : 0; + if (max_ins_size_to_use == 0) { + n_recs_to_move = 0; + move_size = 0; + break; + } + n_recs_to_move = btr_defragment_calc_n_recs_for_size( + from_block, index, max_ins_size_to_use, &move_size); + } + // If less than target_n_recs are moved, it means there are + // compression failures during page_copy_rec_list_start. Adjust + // the max_data_size estimation to reduce compression failures + // in the following runs. + if (target_n_recs > n_recs_to_move + && *max_data_size > new_data_size + move_size) { + *max_data_size = new_data_size + move_size; + } + // Set ibuf free bits if necessary. 
+ if (!dict_index_is_clust(index) + && page_is_leaf(to_page)) { + if (zip_size) { + ibuf_reset_free_bits(to_block); + } else { + ibuf_update_free_bits_if_full( + to_block, + UNIV_PAGE_SIZE, + ULINT_UNDEFINED); + } + } + if (n_recs_to_move == n_recs) { + /* The whole page is merged with the previous page, + free it. */ + lock_update_merge_left(to_block, orig_pred, + from_block); + btr_search_drop_page_hash_index(from_block); + btr_level_list_remove(space, zip_size, from_page, + index, mtr); + btr_node_ptr_delete(index, from_block, mtr); + btr_blob_dbg_remove(from_page, index, + "btr_defragment_n_pages"); + btr_page_free(index, from_block, mtr); + } else { + // There are still records left on the page, so + // increment n_defragmented. Node pointer will be changed + // so remove the old node pointer. + if (n_recs_to_move > 0) { + // Part of the page is merged to left, remove + // the merged records, update record locks and + // node pointer. + dtuple_t* node_ptr; + page_delete_rec_list_start(rec, from_block, + index, mtr); + lock_update_split_and_merge(to_block, + orig_pred, + from_block); + btr_node_ptr_delete(index, from_block, mtr); + rec = page_rec_get_next( + page_get_infimum_rec(from_page)); + node_ptr = dict_index_build_node_ptr( + index, rec, page_get_page_no(from_page), + heap, level + 1); + btr_insert_on_non_leaf_level(0, index, level+1, + node_ptr, mtr); + } + to_block = from_block; + } + return to_block; +} + +/*************************************************************//** +Tries to merge N consecutive pages, starting from the page pointed by the +cursor. Skip space 0. Only consider leaf pages. +This function first loads all N pages into memory, then for each of +the pages other than the first page, it tries to move as many records +as possible to the left sibling to keep the left sibling full. During +the process, if any page becomes empty, that page will be removed from +the level list. 
Record locks, hash, and node pointers are updated after +page reorganization. +@return pointer to the last block processed, or NULL if reaching end of index */ +UNIV_INTERN +buf_block_t* +btr_defragment_n_pages( + buf_block_t* block, /*!< in: starting block for defragmentation */ + dict_index_t* index, /*!< in: index tree */ + uint n_pages,/*!< in: number of pages to defragment */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint space; + ulint zip_size; + /* We will need to load the n+1 block because if the last page is freed + and we need to modify the prev_page_no of that block. */ + buf_block_t* blocks[BTR_DEFRAGMENT_MAX_N_PAGES + 1]; + page_t* first_page; + buf_block_t* current_block; + ulint total_data_size = 0; + ulint total_n_recs = 0; + ulint data_size_per_rec; + ulint optimal_page_size; + ulint reserved_space; + ulint level; + ulint max_data_size = 0; + uint n_defragmented = 0; + uint n_new_slots; + mem_heap_t* heap; + ibool end_of_index = FALSE; + + /* It doesn't make sense to call this function with n_pages = 1. */ + ut_ad(n_pages > 1); + + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK)); + space = dict_index_get_space(index); + if (space == 0) { + /* Ignore space 0. */ + return NULL; + } + + if (n_pages > BTR_DEFRAGMENT_MAX_N_PAGES) { + n_pages = BTR_DEFRAGMENT_MAX_N_PAGES; + } + + zip_size = dict_table_zip_size(index->table); + first_page = buf_block_get_frame(block); + level = btr_page_get_level(first_page, mtr); + + if (level != 0) { + return NULL; + } + + /* 1. Load the pages and calculate the total data size. 
*/ + blocks[0] = block; + for (uint i = 1; i <= n_pages; i++) { + page_t* page = buf_block_get_frame(blocks[i-1]); + ulint page_no = btr_page_get_next(page, mtr); + total_data_size += page_get_data_size(page); + total_n_recs += page_get_n_recs(page); + if (page_no == FIL_NULL) { + n_pages = i; + end_of_index = TRUE; + break; + } + blocks[i] = btr_block_get(space, zip_size, page_no, + RW_X_LATCH, index, mtr); + } + + if (n_pages == 1) { + if (btr_page_get_prev(first_page, mtr) == FIL_NULL) { + /* last page in the index */ + if (dict_index_get_page(index) + == page_get_page_no(first_page)) + return NULL; + /* given page is the last page. + Lift the records to father. */ + btr_lift_page_up(index, block, mtr); + } + return NULL; + } + + /* 2. Calculate how many pages data can fit in. If not compressable, + return early. */ + ut_a(total_n_recs != 0); + data_size_per_rec = total_data_size / total_n_recs; + // For uncompressed pages, the optimal data size if the free space of a + // empty page. + optimal_page_size = page_get_free_space_of_empty( + page_is_comp(first_page)); + // For compressed pages, we take compression failures into account. + if (zip_size) { + ulint size = 0; + int i = 0; + // We estimate the optimal data size of the index use samples of + // data size. These samples are taken when pages failed to + // compress due to insertion on the page. We use the average + // of all samples we have as the estimation. Different pages of + // the same index vary in compressibility. Average gives a good + // enough estimation. 
+ for (;i < STAT_DEFRAG_DATA_SIZE_N_SAMPLE; i++) { + if (index->stat_defrag_data_size_sample[i] == 0) { + break; + } + size += index->stat_defrag_data_size_sample[i]; + } + if (i != 0) { + size = size / i; + optimal_page_size = min(optimal_page_size, size); + } + max_data_size = optimal_page_size; + } + + reserved_space = min((ulint)(optimal_page_size + * (1 - srv_defragment_fill_factor)), + (data_size_per_rec + * srv_defragment_fill_factor_n_recs)); + optimal_page_size -= reserved_space; + n_new_slots = (total_data_size + optimal_page_size - 1) + / optimal_page_size; + if (n_new_slots >= n_pages) { + /* Can't defragment. */ + if (end_of_index) + return NULL; + return blocks[n_pages-1]; + } + + /* 3. Defragment pages. */ + heap = mem_heap_create(256); + // First defragmented page will be the first page. + current_block = blocks[0]; + // Start from the second page. + for (uint i = 1; i < n_pages; i ++) { + buf_block_t* new_block = btr_defragment_merge_pages( + index, blocks[i], current_block, zip_size, + reserved_space, &max_data_size, heap, mtr); + if (new_block != current_block) { + n_defragmented ++; + current_block = new_block; + } + } + mem_heap_free(heap); + n_defragmented ++; + os_atomic_increment_ulint( + &btr_defragment_count, 1); + if (n_pages == n_defragmented) { + os_atomic_increment_ulint( + &btr_defragment_failures, 1); + } else { + index->stat_defrag_n_pages_freed += (n_pages - n_defragmented); + } + if (end_of_index) + return NULL; + return current_block; +} + +/******************************************************************//** +Thread that merges consecutive b-tree pages into fewer pages to defragment +the index. 
*/ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(btr_defragment_thread)( +/*==========================================*/ + void* arg) /*!< in: work queue */ +{ + btr_pcur_t* pcur; + btr_cur_t* cursor; + dict_index_t* index; + mtr_t mtr; + buf_block_t* first_block; + buf_block_t* last_block; + + while (srv_shutdown_state == SRV_SHUTDOWN_NONE) { + /* If defragmentation is disabled, sleep before + checking whether it's enabled. */ + if (!srv_defragment) { + os_thread_sleep(BTR_DEFRAGMENT_SLEEP_IN_USECS); + continue; + } + /* The following call won't remove the item from work queue. + We only get a pointer to it to work on. This will make sure + when user issue a kill command, all indices are in the work + queue to be searched. This also means that the user thread + cannot directly remove the item from queue (since we might be + using it). So user thread only marks index as removed. */ + btr_defragment_item_t* item = btr_defragment_get_item(); + /* If work queue is empty, sleep and check later. */ + if (!item) { + os_thread_sleep(BTR_DEFRAGMENT_SLEEP_IN_USECS); + continue; + } + /* If an index is marked as removed, we remove it from the work + queue. No other thread could be using this item at this point so + it's safe to remove now. */ + if (item->removed) { + btr_defragment_remove_item(item); + continue; + } + + pcur = item->pcur; + ulonglong now = ut_timer_now(); + ulonglong elapsed = now - item->last_processed; + + if (elapsed < srv_defragment_interval) { + /* If we see an index again before the interval + determined by the configured frequency is reached, + we just sleep until the interval pass. Since + defragmentation of all indices queue up on a single + thread, it's likely other indices that follow this one + don't need to sleep again. 
*/ + os_thread_sleep(((ulint)ut_timer_to_microseconds( + srv_defragment_interval - elapsed))); + } + + now = ut_timer_now(); + mtr_start(&mtr); + btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, &mtr); + cursor = btr_pcur_get_btr_cur(pcur); + index = btr_cur_get_index(cursor); + first_block = btr_cur_get_block(cursor); + last_block = btr_defragment_n_pages(first_block, index, + srv_defragment_n_pages, + &mtr); + if (last_block) { + /* If we haven't reached the end of the index, + place the cursor on the last record of last page, + store the cursor position, and put back in queue. */ + page_t* last_page = buf_block_get_frame(last_block); + rec_t* rec = page_rec_get_prev( + page_get_supremum_rec(last_page)); + ut_a(page_rec_is_user_rec(rec)); + page_cur_position(rec, last_block, + btr_cur_get_page_cur(cursor)); + btr_pcur_store_position(pcur, &mtr); + mtr_commit(&mtr); + /* Update the last_processed time of this index. */ + item->last_processed = now; + } else { + mtr_commit(&mtr); + /* Reaching the end of the index. */ + dict_stats_empty_defrag_stats(index); + dict_stats_save_defrag_stats(index); + dict_stats_save_defrag_summary(index); + btr_defragment_remove_item(item); + } + } + btr_defragment_shutdown(); + os_thread_exit(NULL); + OS_THREAD_DUMMY_RETURN; +} + +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc index 0c83089478a..52ba11fc9ab 100644 --- a/storage/innobase/dict/dict0dict.cc +++ b/storage/innobase/dict/dict0dict.cc @@ -408,7 +408,7 @@ dict_table_try_drop_aborted( if (table == NULL) { table = dict_table_open_on_id_low( - table_id, DICT_ERR_IGNORE_NONE); + table_id, DICT_ERR_IGNORE_NONE, FALSE); } else { ut_ad(table->id == table_id); } @@ -795,7 +795,8 @@ dict_table_open_on_id( table_id, table_op == DICT_TABLE_OP_LOAD_TABLESPACE ? 
DICT_ERR_IGNORE_RECOVER_LOCK - : DICT_ERR_IGNORE_NONE); + : DICT_ERR_IGNORE_NONE, + table_op == DICT_TABLE_OP_OPEN_ONLY_IF_CACHED); if (table != NULL) { @@ -1313,7 +1314,7 @@ dict_table_move_from_non_lru_to_lru( /**********************************************************************//** Looks for an index with the given id given a table instance. @return index or NULL */ -static +UNIV_INTERN dict_index_t* dict_table_find_index_on_id( /*========================*/ @@ -2408,6 +2409,13 @@ undo_size_ok: new_index->stat_index_size = 1; new_index->stat_n_leaf_pages = 1; + new_index->stat_defrag_n_pages_freed = 0; + new_index->stat_defrag_n_page_split = 0; + + new_index->stat_defrag_sample_next_slot = 0; + memset(&new_index->stat_defrag_data_size_sample, + 0x0, sizeof(ulint) * STAT_DEFRAG_DATA_SIZE_N_SAMPLE); + /* Add the new index as the last index for the table */ UT_LIST_ADD_LAST(indexes, table->indexes, new_index); diff --git a/storage/innobase/dict/dict0stats.cc b/storage/innobase/dict/dict0stats.cc index 928bdb3f2ef..bec0079942b 100644 --- a/storage/innobase/dict/dict0stats.cc +++ b/storage/innobase/dict/dict0stats.cc @@ -492,6 +492,9 @@ dict_stats_table_clone_create( heap, idx->n_uniq * sizeof(idx->stat_n_non_null_key_vals[0])); ut_d(idx->magic_n = DICT_INDEX_MAGIC_N); + + idx->stat_defrag_n_page_split = 0; + idx->stat_defrag_n_pages_freed = 0; } ut_d(t->magic_n = DICT_TABLE_MAGIC_N); @@ -520,7 +523,9 @@ static void dict_stats_empty_index( /*===================*/ - dict_index_t* index) /*!< in/out: index */ + dict_index_t* index, /*!< in/out: index */ + bool empty_defrag_stats) + /*!< in: whether to empty defrag stats */ { ut_ad(!(index->type & DICT_FTS)); ut_ad(!dict_index_is_univ(index)); @@ -535,6 +540,34 @@ dict_stats_empty_index( index->stat_index_size = 1; index->stat_n_leaf_pages = 1; + + if (empty_defrag_stats) { + dict_stats_empty_defrag_stats(index); + dict_stats_empty_defrag_summary(index); + } +} + 
+/**********************************************************************//** +Clear defragmentation summary. */ +UNIV_INTERN +void +dict_stats_empty_defrag_summary( +/*==================*/ + dict_index_t* index) /*!< in: index to clear defragmentation stats */ +{ + index->stat_defrag_n_pages_freed = 0; +} + +/**********************************************************************//** +Clear defragmentation related index stats. */ +UNIV_INTERN +void +dict_stats_empty_defrag_stats( +/*==================*/ + dict_index_t* index) /*!< in: index to clear defragmentation stats */ +{ + index->stat_defrag_modified_counter = 0; + index->stat_defrag_n_page_split = 0; } /*********************************************************************//** @@ -544,7 +577,9 @@ static void dict_stats_empty_table( /*===================*/ - dict_table_t* table) /*!< in/out: table */ + dict_table_t* table, /*!< in/out: table */ + bool empty_defrag_stats) + /*!< in: whether to empty defrag stats */ { /* Zero the stats members */ @@ -569,7 +604,7 @@ dict_stats_empty_table( ut_ad(!dict_index_is_univ(index)); - dict_stats_empty_index(index); + dict_stats_empty_index(index, empty_defrag_stats); } table->stat_initialized = TRUE; @@ -704,7 +739,7 @@ dict_stats_copy( } if (!INDEX_EQ(src_idx, dst_idx)) { - dict_stats_empty_index(dst_idx); + dict_stats_empty_index(dst_idx, true); continue; } @@ -715,7 +750,7 @@ dict_stats_copy( /* Since src is smaller some elements in dst will remain untouched by the following memmove(), thus we init all of them here. 
*/ - dict_stats_empty_index(dst_idx); + dict_stats_empty_index(dst_idx, true); } else { n_copy_el = dst_idx->n_uniq; } @@ -735,6 +770,13 @@ dict_stats_copy( dst_idx->stat_index_size = src_idx->stat_index_size; dst_idx->stat_n_leaf_pages = src_idx->stat_n_leaf_pages; + + dst_idx->stat_defrag_modified_counter = + src_idx->stat_defrag_modified_counter; + dst_idx->stat_defrag_n_pages_freed = + src_idx->stat_defrag_n_pages_freed; + dst_idx->stat_defrag_n_page_split = + src_idx->stat_defrag_n_page_split; } dst->stat_initialized = TRUE; @@ -758,6 +800,9 @@ dict_index_t::stat_n_sample_sizes[] dict_index_t::stat_n_non_null_key_vals[] dict_index_t::stat_index_size dict_index_t::stat_n_leaf_pages +dict_index_t::stat_defrag_modified_counter +dict_index_t::stat_defrag_n_pages_freed +dict_index_t::stat_defrag_n_page_split The returned object should be freed with dict_stats_snapshot_free() when no longer needed. @return incomplete table object */ @@ -807,7 +852,9 @@ dict_stats_snapshot_free( Calculates new estimates for index statistics. This function is relatively quick and is used to calculate transient statistics that are not saved on disk. This was the only way to calculate statistics -before the Persistent Statistics feature was introduced. */ +before the Persistent Statistics feature was introduced. +This function doesn't update the defragmentation related stats. +Only persistent statistics supports defragmentation stats. */ static void dict_stats_update_transient_for_index( @@ -823,10 +870,10 @@ dict_stats_update_transient_for_index( Initialize some bogus index cardinality statistics, so that the data can be queried in various means, also via secondary indexes. 
*/ - dict_stats_empty_index(index); + dict_stats_empty_index(index, false); #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG } else if (ibuf_debug && !dict_index_is_clust(index)) { - dict_stats_empty_index(index); + dict_stats_empty_index(index, false); #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ } else { mtr_t mtr; @@ -847,7 +894,7 @@ dict_stats_update_transient_for_index( switch (size) { case ULINT_UNDEFINED: - dict_stats_empty_index(index); + dict_stats_empty_index(index, false); return; case 0: /* The root node of the tree is a leaf */ @@ -882,7 +929,7 @@ dict_stats_update_transient( if (dict_table_is_discarded(table)) { /* Nothing to do. */ - dict_stats_empty_table(table); + dict_stats_empty_table(table, false); return; } else if (index == NULL) { /* Table definition is corrupt */ @@ -892,7 +939,7 @@ dict_stats_update_transient( fprintf(stderr, " InnoDB: table %s has no indexes. " "Cannot calculate statistics.\n", ut_format_name(table->name, TRUE, buf, sizeof(buf))); - dict_stats_empty_table(table); + dict_stats_empty_table(table, false); return; } @@ -904,7 +951,7 @@ dict_stats_update_transient( continue; } - dict_stats_empty_index(index); + dict_stats_empty_index(index, false); if (dict_stats_should_ignore_index(index)) { continue; @@ -1794,7 +1841,7 @@ dict_stats_analyze_index( DEBUG_PRINTF(" %s(index=%s)\n", __func__, index->name); - dict_stats_empty_index(index); + dict_stats_empty_index(index, false); mtr_start(&mtr); @@ -2059,7 +2106,7 @@ dict_stats_update_persistent( /* Table definition is corrupt */ dict_table_stats_unlock(table, RW_X_LATCH); - dict_stats_empty_table(table); + dict_stats_empty_table(table, true); return(DB_CORRUPTION); } @@ -2088,7 +2135,7 @@ dict_stats_update_persistent( continue; } - dict_stats_empty_index(index); + dict_stats_empty_index(index, false); if (dict_stats_should_ignore_index(index)) { continue; @@ -2657,6 +2704,16 @@ dict_stats_fetch_index_stats_step( == 0) { index->stat_n_leaf_pages = (ulint) stat_value; 
arg->stats_were_modified = true; + } else if (stat_name_len == 12 /* strlen("n_page_split") */ + && strncasecmp("n_page_split", stat_name, stat_name_len) + == 0) { + index->stat_defrag_n_page_split = (ulint) stat_value; + arg->stats_were_modified = true; + } else if (stat_name_len == 13 /* strlen("n_pages_freed") */ + && strncasecmp("n_pages_freed", stat_name, stat_name_len) + == 0) { + index->stat_defrag_n_pages_freed = (ulint) stat_value; + arg->stats_were_modified = true; } else if (stat_name_len > PFX_LEN /* e.g. stat_name=="n_diff_pfx01" */ && strncasecmp(PFX, stat_name, PFX_LEN) == 0) { @@ -2776,7 +2833,7 @@ dict_stats_fetch_from_ps( the persistent storage contains incomplete stats (e.g. missing stats for some index) then we would end up with (partially) uninitialized stats. */ - dict_stats_empty_table(table); + dict_stats_empty_table(table, true); trx = trx_allocate_for_background(); @@ -2878,6 +2935,22 @@ dict_stats_fetch_from_ps( } /*********************************************************************//** +Clear defragmentation stats modified counter for all indices in table. */ +static +void +dict_stats_empty_defrag_modified_counter( + dict_table_t* table) /*!< in: table */ +{ + dict_index_t* index; + ut_a(table); + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + index->stat_defrag_modified_counter = 0; + } +} + +/*********************************************************************//** Fetches or calculates new estimates for index statistics. */ UNIV_INTERN void @@ -2949,13 +3022,13 @@ dict_stats_update( "because the .ibd file is missing. 
For help, please " "refer to " REFMAN "innodb-troubleshooting.html\n", ut_format_name(table->name, TRUE, buf, sizeof(buf))); - dict_stats_empty_table(table); + dict_stats_empty_table(table, true); return(DB_TABLESPACE_DELETED); } else if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) { /* If we have set a high innodb_force_recovery level, do not calculate statistics, as a badly corrupted index can cause a crash in it. */ - dict_stats_empty_table(table); + dict_stats_empty_table(table, false); return(DB_SUCCESS); } @@ -3014,7 +3087,7 @@ dict_stats_update( case DICT_STATS_EMPTY_TABLE: - dict_stats_empty_table(table); + dict_stats_empty_table(table, true); /* If table is using persistent stats, then save the stats on disk */ @@ -3073,6 +3146,7 @@ dict_stats_update( t->stats_last_recalc = table->stats_last_recalc; t->stat_modified_counter = 0; + dict_stats_empty_defrag_modified_counter(t); switch (err) { case DB_SUCCESS: @@ -3083,7 +3157,7 @@ dict_stats_update( copying because dict_stats_table_clone_create() does skip corrupted indexes so our dummy object 't' may have less indexes than the real object 'table'. */ - dict_stats_empty_table(table); + dict_stats_empty_table(table, true); dict_stats_copy(table, t); @@ -3650,6 +3724,117 @@ dict_stats_rename_table( return(ret); } +/*********************************************************************//** +Save defragmentation result. 
+@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_stats_save_defrag_summary( + dict_index_t* index) /*!< in: index */ +{ + dberr_t ret; + lint now = (lint) ut_time(); + if (dict_index_is_univ(index)) { + return DB_SUCCESS; + } + rw_lock_x_lock(&dict_operation_lock); + mutex_enter(&dict_sys->mutex); + ret = dict_stats_save_index_stat(index, now, "n_pages_freed", + index->stat_defrag_n_pages_freed, + NULL, + "Number of pages freed during" + " last defragmentation run.", + NULL); + + mutex_exit(&dict_sys->mutex); + rw_lock_x_unlock(&dict_operation_lock); + return (ret); +} + +/*********************************************************************//** +Save defragmentation stats for a given index. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_stats_save_defrag_stats( + dict_index_t* index) /*!< in: index */ +{ + dberr_t ret; + + if (index->table->ibd_file_missing) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Cannot save defragment stats because " + ".ibd file is missing.\n"); + return (DB_TABLESPACE_DELETED); + } + if (dict_index_is_corrupted(index)) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Cannot save defragment stats because " + "index is corrupted.\n"); + return(DB_CORRUPTION); + } + + if (dict_index_is_univ(index)) { + return DB_SUCCESS; + } + + lint now = (lint) ut_time(); + mtr_t mtr; + ulint n_leaf_pages; + ulint n_leaf_reserved; + mtr_start(&mtr); + mtr_s_lock(dict_index_get_lock(index), &mtr); + n_leaf_reserved = btr_get_size_and_reserved(index, BTR_N_LEAF_PAGES, + &n_leaf_pages, &mtr); + mtr_commit(&mtr); + + if (n_leaf_reserved == ULINT_UNDEFINED) { + // The index name is different during fast index creation, + // so the stats won't be associated with the right index + // for later use. We just return without saving. 
+ return DB_SUCCESS; + } + + rw_lock_x_lock(&dict_operation_lock); + + mutex_enter(&dict_sys->mutex); + ret = dict_stats_save_index_stat(index, now, "n_page_split", + index->stat_defrag_n_page_split, + NULL, + "Number of new page splits on leaves" + " since last defragmentation.", + NULL); + if (ret != DB_SUCCESS) { + goto end; + } + + ret = dict_stats_save_index_stat( + index, now, "n_leaf_pages_defrag", + n_leaf_pages, + NULL, + "Number of leaf pages when this stat is saved to disk", + NULL); + if (ret != DB_SUCCESS) { + goto end; + } + + ret = dict_stats_save_index_stat( + index, now, "n_leaf_pages_reserved", + n_leaf_reserved, + NULL, + "Number of pages reserved for this index leaves when this stat " + "is saved to disk", + NULL); + +end: + mutex_exit(&dict_sys->mutex); + rw_lock_x_unlock(&dict_operation_lock); + + return (ret); +} + /* tests @{ */ #ifdef UNIV_COMPILE_TEST_FUNCS diff --git a/storage/innobase/dict/dict0stats_bg.cc b/storage/innobase/dict/dict0stats_bg.cc index ecd723ca39a..0089f9897ae 100644 --- a/storage/innobase/dict/dict0stats_bg.cc +++ b/storage/innobase/dict/dict0stats_bg.cc @@ -25,6 +25,7 @@ Created Apr 25, 2012 Vasil Dimov #include "row0mysql.h" #include "srv0start.h" +#include "dict0dict.h" #include "dict0stats.h" #include "dict0stats_bg.h" @@ -44,8 +45,10 @@ UNIV_INTERN os_event_t dict_stats_event = NULL; /** This mutex protects the "recalc_pool" variable. 
*/ static ib_mutex_t recalc_pool_mutex; +static ib_mutex_t defrag_pool_mutex; #ifdef HAVE_PSI_INTERFACE static mysql_pfs_key_t recalc_pool_mutex_key; +static mysql_pfs_key_t defrag_pool_mutex_key; #endif /* HAVE_PSI_INTERFACE */ /** The number of tables that can be added to "recalc_pool" before @@ -59,16 +62,26 @@ static recalc_pool_t recalc_pool; typedef recalc_pool_t::iterator recalc_pool_iterator_t; +/** Indices whose defrag stats need to be saved to persistent storage.*/ +struct defrag_pool_item_t { + table_id_t table_id; + index_id_t index_id; +}; +typedef std::vector<defrag_pool_item_t> defrag_pool_t; +static defrag_pool_t defrag_pool; +typedef defrag_pool_t::iterator defrag_pool_iterator_t; + /*****************************************************************//** Initialize the recalc pool, called once during thread initialization. */ static void -dict_stats_recalc_pool_init() +dict_stats_pool_init() /*=========================*/ { ut_ad(!srv_read_only_mode); recalc_pool.reserve(RECALC_POOL_INITIAL_SLOTS); + defrag_pool.reserve(RECALC_POOL_INITIAL_SLOTS); } /*****************************************************************//** @@ -76,12 +89,13 @@ Free the resources occupied by the recalc pool, called once during thread de-initialization. */ static void -dict_stats_recalc_pool_deinit() -/*===========================*/ +dict_stats_pool_deinit() +/*====================*/ { ut_ad(!srv_read_only_mode); recalc_pool.clear(); + defrag_pool.clear(); /* recalc_pool may still have its buffer allocated. It will free it when its destructor is called. @@ -90,8 +104,12 @@ dict_stats_recalc_pool_deinit() memory. 
To avoid that, we force recalc_pool to surrender its buffer to empty_pool object, which will free it when leaving this function: */ - recalc_pool_t empty_pool; - recalc_pool.swap(empty_pool); + recalc_pool_t recalc_empty_pool; + defrag_pool_t defrag_empty_pool; + memset(&recalc_empty_pool, 0, sizeof(recalc_pool_t)); + memset(&defrag_empty_pool, 0, sizeof(defrag_pool_t)); + recalc_pool.swap(recalc_empty_pool); + defrag_pool.swap(defrag_empty_pool); } /*****************************************************************//** @@ -188,6 +206,111 @@ dict_stats_recalc_pool_del( } /*****************************************************************//** +Add an index in a table to the defrag pool, which is processed by the +background stats gathering thread. Only the table id and index id are +added to the list, so the table can be closed after being enqueued and +it will be opened when needed. If the table or index does not exist later +(has been DROPped), then it will be removed from the pool and skipped. */ +UNIV_INTERN +void +dict_stats_defrag_pool_add( +/*=======================*/ + const dict_index_t* index) /*!< in: table to add */ +{ + defrag_pool_item_t item; + + ut_ad(!srv_read_only_mode); + + mutex_enter(&defrag_pool_mutex); + + /* quit if already in the list */ + for (defrag_pool_iterator_t iter = defrag_pool.begin(); + iter != defrag_pool.end(); + ++iter) { + if ((*iter).table_id == index->table->id + && (*iter).index_id == index->id) { + mutex_exit(&defrag_pool_mutex); + return; + } + } + + item.table_id = index->table->id; + item.index_id = index->id; + defrag_pool.push_back(item); + + mutex_exit(&defrag_pool_mutex); + + os_event_set(dict_stats_event); +} + +/*****************************************************************//** +Get an index from the auto defrag pool. The returned index id is removed +from the pool. 
+@return true if the pool was non-empty and "id" was set, false otherwise */ +static +bool +dict_stats_defrag_pool_get( +/*=======================*/ + table_id_t* table_id, /*!< out: table id, or unmodified if + list is empty */ + index_id_t* index_id) /*!< out: index id, or unmodified if + list is empty */ +{ + ut_ad(!srv_read_only_mode); + + mutex_enter(&defrag_pool_mutex); + + if (defrag_pool.empty()) { + mutex_exit(&defrag_pool_mutex); + return(false); + } + + defrag_pool_item_t& item = defrag_pool.back(); + *table_id = item.table_id; + *index_id = item.index_id; + + defrag_pool.pop_back(); + + mutex_exit(&defrag_pool_mutex); + + return(true); +} + +/*****************************************************************//** +Delete a given index from the auto defrag pool. */ +UNIV_INTERN +void +dict_stats_defrag_pool_del( +/*=======================*/ + const dict_table_t* table, /*!<in: if given, remove + all entries for the table */ + const dict_index_t* index) /*!< in: if given, remove this index */ +{ + ut_a((table && !index) || (!table && index)); + ut_ad(!srv_read_only_mode); + ut_ad(mutex_own(&dict_sys->mutex)); + + mutex_enter(&defrag_pool_mutex); + + defrag_pool_iterator_t iter = defrag_pool.begin(); + while (iter != defrag_pool.end()) { + if ((table && (*iter).table_id == table->id) + || (index + && (*iter).table_id == index->table->id + && (*iter).index_id == index->id)) { + /* erase() invalidates the iterator */ + iter = defrag_pool.erase(iter); + if (index) + break; + } else { + iter++; + } + } + + mutex_exit(&defrag_pool_mutex); +} + +/*****************************************************************//** Wait until background stats thread has stopped using the specified table. 
The caller must have locked the data dictionary using row_mysql_lock_data_dictionary() and this function may unlock it temporarily @@ -237,7 +360,10 @@ dict_stats_thread_init() mutex_create(recalc_pool_mutex_key, &recalc_pool_mutex, SYNC_STATS_AUTO_RECALC); - dict_stats_recalc_pool_init(); + /* We choose SYNC_STATS_DEFRAG to be below SYNC_FSP_PAGE. */ + mutex_create(defrag_pool_mutex_key, &defrag_pool_mutex, + SYNC_STATS_DEFRAG); + dict_stats_pool_init(); } /*****************************************************************//** @@ -251,11 +377,14 @@ dict_stats_thread_deinit() ut_a(!srv_read_only_mode); ut_ad(!srv_dict_stats_thread_active); - dict_stats_recalc_pool_deinit(); + dict_stats_pool_deinit(); mutex_free(&recalc_pool_mutex); memset(&recalc_pool_mutex, 0x0, sizeof(recalc_pool_mutex)); + mutex_free(&defrag_pool_mutex); + memset(&defrag_pool_mutex, 0x0, sizeof(defrag_pool_mutex)); + os_event_free(dict_stats_event); dict_stats_event = NULL; } @@ -333,6 +462,63 @@ dict_stats_process_entry_from_recalc_pool() } /*****************************************************************//** +Get the first index that has been added for updating persistent defrag +stats and eventually save its stats. */ +static +void +dict_stats_process_entry_from_defrag_pool() +/*=======================================*/ +{ + table_id_t table_id; + index_id_t index_id; + + ut_ad(!srv_read_only_mode); + + /* pop the first index from the auto defrag pool */ + if (!dict_stats_defrag_pool_get(&table_id, &index_id)) { + /* no index in defrag pool */ + return; + } + + dict_table_t* table; + + mutex_enter(&dict_sys->mutex); + + /* If the table is no longer cached, we've already lost the in + memory stats so there's nothing really to write to disk. 
*/ + table = dict_table_open_on_id(table_id, TRUE, + DICT_TABLE_OP_OPEN_ONLY_IF_CACHED); + + if (table == NULL) { + mutex_exit(&dict_sys->mutex); + return; + } + + /* Check whether table is corrupted */ + if (table->corrupted) { + dict_table_close(table, TRUE, FALSE); + mutex_exit(&dict_sys->mutex); + return; + } + mutex_exit(&dict_sys->mutex); + + dict_index_t* index = dict_table_find_index_on_id(table, index_id); + + if (index == NULL) { + return; + } + + /* Check whether index is corrupted */ + if (dict_index_is_corrupted(index)) { + dict_table_close(table, FALSE, FALSE); + return; + } + + dict_stats_save_defrag_stats(index); + dict_table_close(table, FALSE, FALSE); +} + +/*****************************************************************//** This is the thread for background stats gathering. It pops tables, from the auto recalc list and proceeds them, eventually recalculating their statistics. @@ -364,6 +550,9 @@ DECLARE_THREAD(dict_stats_thread)( dict_stats_process_entry_from_recalc_pool(); + while (defrag_pool.size()) + dict_stats_process_entry_from_defrag_pool(); + os_event_reset(dict_stats_event); } diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index ead86fd3085..7887951a026 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -57,6 +57,7 @@ this program; if not, write to the Free Software Foundation, Inc., #include "buf0flu.h" #include "buf0dblwr.h" #include "btr0sea.h" +#include "btr0defragment.h" #include "os0file.h" #include "os0thread.h" #include "srv0start.h" @@ -65,7 +66,6 @@ this program; if not, write to the Free Software Foundation, Inc., #include "trx0trx.h" #include "trx0sys.h" -#include "mtr0mtr.h" #include "rem0types.h" #include "row0ins.h" #include "row0mysql.h" @@ -86,6 +86,7 @@ this program; if not, write to the Free Software Foundation, Inc., #include "dict0stats_bg.h" #include "ha_prototypes.h" #include "ut0mem.h" +#include "ut0timer.h" #include 
"ibuf0ibuf.h" #include "dict0dict.h" #include "srv0mon.h" @@ -752,6 +753,14 @@ static SHOW_VAR innodb_status_variables[]= { {"have_bzip2", (char*) &innodb_have_bzip2, SHOW_BOOL}, + /* Defragmentation */ + {"defragment_compression_failures", + (char*) &export_vars.innodb_defragment_compression_failures, SHOW_LONG}, + {"defragment_failures", + (char*) &export_vars.innodb_defragment_failures, SHOW_LONG}, + {"defragment_count", + (char*) &export_vars.innodb_defragment_count, SHOW_LONG}, + {NullS, NullS, SHOW_LONG} }; @@ -2351,7 +2360,8 @@ ha_innobase::ha_innobase( (srv_force_primary_key ? HA_REQUIRE_PRIMARY_KEY : 0 ) | HA_CAN_FULLTEXT_EXT | HA_CAN_EXPORT), start_of_scan(0), - num_write_row(0) + num_write_row(0), + ha_partition_stats(NULL) {} /*********************************************************************//** @@ -10678,6 +10688,71 @@ ha_innobase::delete_table( DBUG_RETURN(convert_error_code_to_mysql(err, 0, NULL)); } +/*****************************************************************//** +Defragment table. +@return error number */ +UNIV_INTERN +int +ha_innobase::defragment_table( +/*==========================*/ + const char* name, /*!< in: table name */ + const char* index_name, /*!< in: index name */ + bool async) /*!< in: whether to wait until finish */ +{ + char norm_name[FN_REFLEN]; + dict_table_t* table; + dict_index_t* index; + ibool one_index = (index_name != 0); + int ret = 0; + if (!srv_defragment) { + return ER_FEATURE_DISABLED; + } + normalize_table_name(norm_name, name); + table = dict_table_open_on_name(norm_name, FALSE, + FALSE, DICT_ERR_IGNORE_NONE); + for (index = dict_table_get_first_index(table); index; + index = dict_table_get_next_index(index)) { + if (one_index && strcasecmp(index_name, index->name) != 0) + continue; + if (btr_defragment_find_index(index)) { + // We borrow this error code. When the same index is + // already in the defragmentation queue, issue another + // defragmentation only introduces overhead. 
We return + // an error here to let the user know this is not + // necessary. Note that this will fail a query that's + // trying to defragment a full table if one of the + // indicies in that table is already in defragmentation. + // We choose this behavior so user is aware of this + // rather than silently defragment other indicies of + // that table. + ret = ER_SP_ALREADY_EXISTS; + break; + } + os_event_t event = btr_defragment_add_index(index, async); + if (!async && event) { + while(os_event_wait_time(event, 1000000)) { + if (thd_killed(current_thd)) { + btr_defragment_remove_index(index); + ret = ER_QUERY_INTERRUPTED; + break; + } + } + os_event_free(event); + } + if (ret) { + break; + } + if (one_index) { + one_index = FALSE; + break; + } + } + dict_table_close(table, FALSE, FALSE); + if (ret == 0 && one_index) { + ret = ER_NO_SUCH_INDEX; + } + return ret; +} /*****************************************************************//** Removes all tables in the named database inside InnoDB. */ @@ -11816,6 +11891,27 @@ ha_innobase::optimize( This works OK otherwise, but MySQL locks the entire table during calls to OPTIMIZE, which is undesirable. 
*/ + if (srv_defragment) { + int err; + + err = defragment_table(prebuilt->table->name, NULL, false); + + if (err == 0) { + return (HA_ADMIN_OK); + } else { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + err, + "InnoDB: Cannot defragment table %s: returned error code %d\n", + prebuilt->table->name, err); + + if(err == ER_SP_ALREADY_EXISTS) { + return (HA_ADMIN_OK); + } else { + return (HA_ADMIN_TRY_ALTER); + } + } + } + if (innodb_optimize_fulltext_only) { if (prebuilt->table->fts && prebuilt->table->fts->cache && !dict_table_is_discarded(prebuilt->table)) { @@ -14520,6 +14616,13 @@ innodb_max_dirty_pages_pct_lwm_update( srv_max_dirty_pages_pct_lwm = in_val; } +UNIV_INTERN +void +ha_innobase::set_partition_owner_stats(ha_statistics *stats) +{ + ha_partition_stats= stats; +} + /************************************************************//** Validate the file format name and return its corresponding id. @return valid file format id */ @@ -15773,6 +15876,23 @@ innodb_reset_all_monitor_update( TRUE); } +static +void +innodb_defragment_frequency_update( +/*===============================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to + system variable */ + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + srv_defragment_frequency = (*static_cast<const uint*>(save)); + srv_defragment_interval = ut_microseconds_to_timer( + 1000000.0 / srv_defragment_frequency); +} + /****************************************************************//** Parse and enable InnoDB monitor counters during server startup. 
User can list the monitor counters/groups to be enable by specifying @@ -16631,6 +16751,60 @@ static MYSQL_SYSVAR_BOOL(buffer_pool_load_at_startup, srv_buffer_pool_load_at_st "Load the buffer pool from a file named @@innodb_buffer_pool_filename", NULL, NULL, FALSE); +static MYSQL_SYSVAR_BOOL(defragment, srv_defragment, + PLUGIN_VAR_RQCMDARG, + "Enable/disable InnoDB defragmentation (default FALSE). When set to FALSE, all existing " + "defragmentation will be paused. And new defragmentation command will fail." + "Paused defragmentation commands will resume when this variable is set to " + "true again.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_UINT(defragment_n_pages, srv_defragment_n_pages, + PLUGIN_VAR_RQCMDARG, + "Number of pages considered at once when merging multiple pages to " + "defragment", + NULL, NULL, 7, 2, 32, 0); + +static MYSQL_SYSVAR_UINT(defragment_stats_accuracy, + srv_defragment_stats_accuracy, + PLUGIN_VAR_RQCMDARG, + "How many defragment stats changes there are before the stats " + "are written to persistent storage. Set to 0 meaning disable " + "defragment stats tracking.", + NULL, NULL, 0, 0, ~0U, 0); + +static MYSQL_SYSVAR_UINT(defragment_fill_factor_n_recs, + srv_defragment_fill_factor_n_recs, + PLUGIN_VAR_RQCMDARG, + "How many records of space defragmentation should leave on the page. " + "This variable, together with innodb_defragment_fill_factor, is introduced " + "so defragmentation won't pack the page too full and cause page split on " + "the next insert on every page. The variable indicating more defragmentation" + " gain is the one effective.", + NULL, NULL, 20, 1, 100, 0); + +static MYSQL_SYSVAR_DOUBLE(defragment_fill_factor, srv_defragment_fill_factor, + PLUGIN_VAR_RQCMDARG, + "A number between [0.7, 1] that tells defragmentation how full it should " + "fill a page. Default is 0.9. Number below 0.7 won't make much sense." 
+ "This variable, together with innodb_defragment_fill_factor_n_recs, is " + "introduced so defragmentation won't pack the page too full and cause " + "page split on the next insert on every page. The variable indicating more " + "defragmentation gain is the one effective.", + NULL, NULL, 0.9, 0.7, 1, 0); + +static MYSQL_SYSVAR_UINT(defragment_frequency, srv_defragment_frequency, + PLUGIN_VAR_RQCMDARG, + "Do not defragment a single index more than this number of time per second." + "This controls the number of time defragmentation thread can request X_LOCK " + "on an index. Defragmentation thread will check whether " + "1/defragment_frequency (s) has passed since it worked on this index last " + "time, and put the index back to the queue if not enough time has passed. " + "The actual frequency can only be lower than this given number.", + NULL, innodb_defragment_frequency_update, + SRV_DEFRAGMENT_FREQUENCY_DEFAULT, 1, 1000, 0); + + static MYSQL_SYSVAR_ULONG(lru_scan_depth, srv_LRU_scan_depth, PLUGIN_VAR_RQCMDARG, "How deep to scan LRU to keep it clean", @@ -17116,6 +17290,12 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(buffer_pool_load_now), MYSQL_SYSVAR(buffer_pool_load_abort), MYSQL_SYSVAR(buffer_pool_load_at_startup), + MYSQL_SYSVAR(defragment), + MYSQL_SYSVAR(defragment_n_pages), + MYSQL_SYSVAR(defragment_stats_accuracy), + MYSQL_SYSVAR(defragment_fill_factor), + MYSQL_SYSVAR(defragment_fill_factor_n_recs), + MYSQL_SYSVAR(defragment_frequency), MYSQL_SYSVAR(lru_scan_depth), MYSQL_SYSVAR(flush_neighbors), MYSQL_SYSVAR(checksum_algorithm), diff --git a/storage/innobase/handler/ha_innodb.h b/storage/innobase/handler/ha_innodb.h index 912be30c0ec..4e9586d8a3d 100644 --- a/storage/innobase/handler/ha_innodb.h +++ b/storage/innobase/handler/ha_innodb.h @@ -105,6 +105,8 @@ class ha_innobase: public handler or undefined */ uint num_write_row; /*!< number of write_row() calls */ + ha_statistics* ha_partition_stats; /*!< stats of the 
partition owner + handler (if there is one) */ uint store_key_val_for_row(uint keynr, char* buff, uint buff_len, const uchar* record); inline void update_thd(THD* thd); @@ -206,6 +208,8 @@ class ha_innobase: public handler int truncate(); int delete_table(const char *name); int rename_table(const char* from, const char* to); + int defragment_table(const char* name, const char* index_name, + bool async); int check(THD* thd, HA_CHECK_OPT* check_opt); char* update_table_comment(const char* comment); char* get_foreign_key_create_info(); @@ -309,6 +313,7 @@ class ha_innobase: public handler Alter_inplace_info* ha_alter_info, bool commit); /** @} */ + void set_partition_owner_stats(ha_statistics *stats); bool check_if_incompatible_data(HA_CREATE_INFO *info, uint table_changes); private: diff --git a/storage/innobase/include/btr0btr.h b/storage/innobase/include/btr0btr.h index 305acf7e322..b6f8a685ae9 100644 --- a/storage/innobase/include/btr0btr.h +++ b/storage/innobase/include/btr0btr.h @@ -2,6 +2,7 @@ Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2014, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -671,6 +672,21 @@ btr_get_size( is s-latched */ __attribute__((nonnull, warn_unused_result)); /**************************************************************//** +Gets the number of reserved and used pages in a B-tree. 
+@return number of pages reserved, or ULINT_UNDEFINED if the index +is unavailable */ +UNIV_INTERN +ulint +btr_get_size_and_reserved( +/*======================*/ + dict_index_t* index, /*!< in: index */ + ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */ + ulint* used, /*!< out: number of pages used (<= reserved) */ + mtr_t* mtr) /*!< in/out: mini-transaction where index + is s-latched */ + __attribute__((nonnull)); + +/**************************************************************//** Allocates a new file page to be used in an index tree. NOTE: we assume that the caller has made the reservation for free extents! @retval NULL if no page could be allocated @@ -717,6 +733,33 @@ btr_page_free_low( ulint level, /*!< in: page level */ mtr_t* mtr) /*!< in: mtr */ __attribute__((nonnull)); +/*************************************************************//** +Reorganizes an index page. + +IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. This has to +be done either within the same mini-transaction, or by invoking +ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages, +IBUF_BITMAP_FREE is unaffected by reorganization. 
+ +@retval true if the operation was successful +@retval false if it is a compressed page, and recompression failed */ +UNIV_INTERN +bool +btr_page_reorganize_block( +/*======================*/ + bool recovery,/*!< in: true if called in recovery: + locks should not be updated, i.e., + there cannot exist locks on the + page, and a hash index should not be + dropped: it cannot exist */ + ulint z_level,/*!< in: compression level to be used + if dealing with compressed page */ + buf_block_t* block, /*!< in/out: B-tree page */ + dict_index_t* index, /*!< in: the index tree of the page */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); + #ifdef UNIV_BTR_PRINT /*************************************************************//** Prints size info of a B-tree. */ @@ -762,6 +805,60 @@ btr_validate_index( const trx_t* trx) /*!< in: transaction or 0 */ __attribute__((nonnull(1), warn_unused_result)); +#ifdef UNIV_SYNC_DEBUG +/*************************************************************//** +Removes a page from the level list of pages. +@param space in: space where removed +@param zip_size in: compressed page size in bytes, or 0 for uncompressed +@param page in/out: page to remove +@param index in: index tree +@param mtr in/out: mini-transaction */ +# define btr_level_list_remove(space,zip_size,page,index,mtr) \ + btr_level_list_remove_func(space,zip_size,page,index,mtr) +#else /* UNIV_SYNC_DEBUG */ +/*************************************************************//** +Removes a page from the level list of pages. 
+@param space in: space where removed +@param zip_size in: compressed page size in bytes, or 0 for uncompressed +@param page in/out: page to remove +@param index in: index tree +@param mtr in/out: mini-transaction */ +# define btr_level_list_remove(space,zip_size,page,index,mtr) \ + btr_level_list_remove_func(space,zip_size,page,mtr) +#endif /* UNIV_SYNC_DEBUG */ + +/*************************************************************//** +Removes a page from the level list of pages. */ +UNIV_INTERN +void +btr_level_list_remove_func( +/*=======================*/ + ulint space, /*!< in: space where removed */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + page_t* page, /*!< in/out: page to remove */ +#ifdef UNIV_SYNC_DEBUG + const dict_index_t* index, /*!< in: index tree */ +#endif /* UNIV_SYNC_DEBUG */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); + +/*************************************************************//** +If page is the only on its level, this function moves its records to the +father page, thus reducing the tree height. 
+@return father block */ +UNIV_INTERN +buf_block_t* +btr_lift_page_up( +/*=============*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: page which is the only on its level; + must not be empty: use + btr_discard_only_page_on_level if the last + record from the page should be removed */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); + #define BTR_N_LEAF_PAGES 1 #define BTR_TOTAL_SIZE 2 #endif /* !UNIV_HOTBACKUP */ diff --git a/storage/innobase/include/btr0btr.ic b/storage/innobase/include/btr0btr.ic index 00f50b5dcaf..40b468b200a 100644 --- a/storage/innobase/include/btr0btr.ic +++ b/storage/innobase/include/btr0btr.ic @@ -163,9 +163,10 @@ btr_page_get_next( /*!< in: mini-transaction handle */ { ut_ad(page && mtr); +#ifndef UNIV_INNOCHECKSUM ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX) || mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_S_FIX)); - +#endif /* UNIV_INNOCHECKSUM */ return(mach_read_from_4(page + FIL_PAGE_NEXT)); } diff --git a/storage/innobase/include/btr0defragment.h b/storage/innobase/include/btr0defragment.h new file mode 100644 index 00000000000..8fef3c6519a --- /dev/null +++ b/storage/innobase/include/btr0defragment.h @@ -0,0 +1,101 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014 Facebook, Inc. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +#ifndef btr0defragment_h +#define btr0defragment_h + +#include "univ.i" + +#ifndef UNIV_HOTBACKUP + +#include "btr0pcur.h" + +/* Max number of pages to consider at once during defragmentation. */ +#define BTR_DEFRAGMENT_MAX_N_PAGES 32 + +/** stats in btr_defragment */ +extern ulint btr_defragment_compression_failures; +extern ulint btr_defragment_failures; +extern ulint btr_defragment_count; + +/** Item in the work queue for btr_defragment_thread. */ +struct btr_defragment_item_t +{ + btr_pcur_t* pcur; /* persistent cursor where + btr_defragment_n_pages should start */ + os_event_t event; /* if not null, signal after work + is done */ + bool removed; /* Mark an item as removed */ + ulonglong last_processed; /* timestamp of last time this index + is processed by defragment thread */ + + btr_defragment_item_t(btr_pcur_t* pcur, os_event_t event); + ~btr_defragment_item_t(); +}; + +/******************************************************************//** +Initialize defragmentation. */ +void +btr_defragment_init(void); +/******************************************************************//** +Shutdown defragmentation. */ +void +btr_defragment_shutdown(); +/******************************************************************//** +Check whether the given index is in btr_defragment_wq. */ +bool +btr_defragment_find_index( + dict_index_t* index); /*!< Index to find. */ +/******************************************************************//** +Add an index to btr_defragment_wq. Return a pointer to os_event if this +is a synchronized defragmentation.
 */ +os_event_t +btr_defragment_add_index( + dict_index_t* index, /*!< index to be added */ + bool async); /*!< whether this is an async defragmentation */ +/******************************************************************//** +When table is dropped, this function is called to mark a table as removed in +btr_defragment_wq. The difference between this function and the remove_index +function is this will not NULL the event. */ +void +btr_defragment_remove_table( + dict_table_t* table); /*!< Index to be removed. */ +/******************************************************************//** +Mark an index as removed from btr_defragment_wq. */ +void +btr_defragment_remove_index( + dict_index_t* index); /*!< Index to be removed. */ +/*********************************************************************//** +Check whether we should save defragmentation statistics to persistent storage.*/ +UNIV_INTERN +void +btr_defragment_save_defrag_stats_if_needed( + dict_index_t* index); /*!< in: index */ +/******************************************************************//** +Thread that merges consecutive b-tree pages into fewer pages to defragment +the index. */ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(btr_defragment_thread)( +/*==========================================*/ + void* arg); /*!< in: a dummy parameter required by + os_thread_create */ + + +#endif /* !UNIV_HOTBACKUP */ +#endif diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h index 2a96f5299bb..7d14df09cb2 100644 --- a/storage/innobase/include/dict0dict.h +++ b/storage/innobase/include/dict0dict.h @@ -120,7 +120,9 @@ enum dict_table_op_t { DICT_TABLE_OP_DROP_ORPHAN, /** Silently load the tablespace if it does not exist, and do not load the definitions of incomplete indexes. */ - DICT_TABLE_OP_LOAD_TABLESPACE + DICT_TABLE_OP_LOAD_TABLESPACE, + /** Open the table only if it's in table cache.
*/ + DICT_TABLE_OP_OPEN_ONLY_IF_CACHED }; /**********************************************************************//** @@ -1496,6 +1498,16 @@ dict_table_get_index_on_name( const char* name) /*!< in: name of the index to find */ __attribute__((nonnull, warn_unused_result)); /**********************************************************************//** +Looks for an index with the given id given a table instance. +@return index or NULL */ +UNIV_INTERN +dict_index_t* +dict_table_find_index_on_id( +/*========================*/ + const dict_table_t* table, /*!< in: table instance */ + index_id_t id) /*!< in: index id */ + __attribute__((nonnull, warn_unused_result)); +/**********************************************************************//** In case there is more than one index with the same name return the index with the min(id). @return index, NULL if does not exist */ diff --git a/storage/innobase/include/dict0mem.h b/storage/innobase/include/dict0mem.h index b026210b214..ccca7af1c03 100644 --- a/storage/innobase/include/dict0mem.h +++ b/storage/innobase/include/dict0mem.h @@ -588,6 +588,10 @@ struct zip_pad_info_t { rounds */ }; +/** Number of samples of data size kept when page compression fails for +a certain index.*/ +#define STAT_DEFRAG_DATA_SIZE_N_SAMPLE 10 + /** Data structure for an index. Most fields will be initialized to 0, NULL or FALSE in dict_mem_index_create(). */ struct dict_index_t{ @@ -676,6 +680,23 @@ struct dict_index_t{ /*!< approximate number of leaf pages in the index tree */ /* @} */ + /** Statistics for defragmentation, these numbers are estimations and + could be very inaccurate at certain times, e.g. right after restart, + during defragmentation, etc. */ + /* @{ */ + ulint stat_defrag_modified_counter; + ulint stat_defrag_n_pages_freed; + /* number of pages freed by defragmentation. */ + ulint stat_defrag_n_page_split; + /* number of page splits since last full index + defragmentation. 
*/ + ulint stat_defrag_data_size_sample[STAT_DEFRAG_DATA_SIZE_N_SAMPLE]; + /* data size when compression failure happened + the most recent 10 times. */ + ulint stat_defrag_sample_next_slot; + /* in which slot the next sample should be + saved. */ + /* @} */ rw_lock_t lock; /*!< read-write lock protecting the upper levels of the index tree */ trx_id_t trx_id; /*!< id of the transaction that created this diff --git a/storage/innobase/include/dict0priv.h b/storage/innobase/include/dict0priv.h index 9a3c8e22992..e034662aba0 100644 --- a/storage/innobase/include/dict0priv.h +++ b/storage/innobase/include/dict0priv.h @@ -53,8 +53,9 @@ dict_table_t* dict_table_open_on_id_low( /*=====================*/ table_id_t table_id, /*!< in: table id */ - dict_err_ignore_t ignore_err); /*!< in: errors to ignore + dict_err_ignore_t ignore_err, /*!< in: errors to ignore when loading the table */ + ibool open_only_if_in_cache); #ifndef UNIV_NONINL #include "dict0priv.ic" diff --git a/storage/innobase/include/dict0priv.ic b/storage/innobase/include/dict0priv.ic index 30ba8fb60aa..983218af78a 100644 --- a/storage/innobase/include/dict0priv.ic +++ b/storage/innobase/include/dict0priv.ic @@ -74,8 +74,9 @@ dict_table_t* dict_table_open_on_id_low( /*======================*/ table_id_t table_id, /*!< in: table id */ - dict_err_ignore_t ignore_err) /*!< in: errors to ignore + dict_err_ignore_t ignore_err, /*!< in: errors to ignore when loading the table */ + ibool open_only_if_in_cache) { dict_table_t* table; ulint fold; @@ -88,7 +89,7 @@ dict_table_open_on_id_low( HASH_SEARCH(id_hash, dict_sys->table_id_hash, fold, dict_table_t*, table, ut_ad(table->cached), table->id == table_id); - if (table == NULL) { + if (table == NULL && !open_only_if_in_cache) { table = dict_load_table_on_id(table_id, ignore_err); } diff --git a/storage/innobase/include/dict0stats.h b/storage/innobase/include/dict0stats.h index 186f90e3694..abf56b2f0c7 100644 --- a/storage/innobase/include/dict0stats.h +++ 
b/storage/innobase/include/dict0stats.h @@ -195,6 +195,39 @@ dict_stats_rename_table( is returned */ size_t errstr_sz); /*!< in: errstr size */ +/*********************************************************************//** +Save defragmentation result. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_stats_save_defrag_summary( + dict_index_t* index); /*!< in: index */ + +/*********************************************************************//** +Save defragmentation stats for a given index. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_stats_save_defrag_stats( + dict_index_t* index); /*!< in: index */ + +/**********************************************************************//** +Clear defragmentation summary. */ +UNIV_INTERN +void +dict_stats_empty_defrag_summary( +/*==================*/ + dict_index_t* index); /*!< in: index to clear defragmentation stats */ + +/**********************************************************************//** +Clear defragmentation related index stats. */ +UNIV_INTERN +void +dict_stats_empty_defrag_stats( +/*==================*/ + dict_index_t* index); /*!< in: index to clear defragmentation stats */ + + #ifndef UNIV_NONINL #include "dict0stats.ic" #endif diff --git a/storage/innobase/include/dict0stats_bg.h b/storage/innobase/include/dict0stats_bg.h index e866ab419fe..32fac3015e8 100644 --- a/storage/innobase/include/dict0stats_bg.h +++ b/storage/innobase/include/dict0stats_bg.h @@ -56,6 +56,28 @@ dict_stats_recalc_pool_del( /*=======================*/ const dict_table_t* table); /*!< in: table to remove */ +/*****************************************************************//** +Add an index in a table to the defrag pool, which is processed by the +background stats gathering thread. Only the table id and index id are +added to the list, so the table can be closed after being enqueued and +it will be opened when needed. 
If the table or index does not exist later +(has been DROPped), then it will be removed from the pool and skipped. */ +UNIV_INTERN +void +dict_stats_defrag_pool_add( +/*=======================*/ + const dict_index_t* index); /*!< in: table to add */ + +/*****************************************************************//** +Delete a given index from the auto defrag pool. */ +UNIV_INTERN +void +dict_stats_defrag_pool_del( +/*=======================*/ + const dict_table_t* table, /*!<in: if given, remove + all entries for the table */ + const dict_index_t* index); /*!< in: index to remove */ + /** Yield the data dictionary latch when waiting for the background thread to stop accessing a table. @param trx transaction holding the data dictionary locks */ diff --git a/storage/innobase/include/lock0lock.h b/storage/innobase/include/lock0lock.h index 6d5ed35d5d8..3babc4d82fd 100644 --- a/storage/innobase/include/lock0lock.h +++ b/storage/innobase/include/lock0lock.h @@ -181,6 +181,16 @@ lock_update_merge_left( const buf_block_t* right_block); /*!< in: merged index page which will be discarded */ /*************************************************************//** +Updates the lock table when a page is split and merged to +two pages. */ +UNIV_INTERN +void +lock_update_split_and_merge( + const buf_block_t* left_block, /*!< in: left page to which merged */ + const rec_t* orig_pred, /*!< in: original predecessor of + supremum on the left page before merge*/ + const buf_block_t* right_block);/*!< in: right page from which merged */ +/*************************************************************//** Resets the original locks on heir and replaces them with gap type locks inherited from rec.
 */ UNIV_INTERN diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index 905d4a0afa7..231537b3cde 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -335,6 +335,15 @@ extern my_bool srv_random_read_ahead; extern ulong srv_read_ahead_threshold; extern ulint srv_n_read_io_threads; extern ulint srv_n_write_io_threads; +/* Defragmentation, Originally facebook default value is 100, but it's too high */ +#define SRV_DEFRAGMENT_FREQUENCY_DEFAULT 40 +extern my_bool srv_defragment; +extern uint srv_defragment_n_pages; +extern uint srv_defragment_stats_accuracy; +extern uint srv_defragment_fill_factor_n_recs; +extern double srv_defragment_fill_factor; +extern uint srv_defragment_frequency; +extern ulonglong srv_defragment_interval; /* Number of IO operations per second the server can do */ extern ulong srv_io_capacity; @@ -888,7 +897,12 @@ struct export_var_t{ ulint innodb_rows_deleted; /*!< srv_n_rows_deleted */ ulint innodb_num_open_files; /*!< fil_n_file_opened */ ulint innodb_truncated_status_writes; /*!< srv_truncated_status_writes */ - ulint innodb_available_undo_logs; /*!< srv_available_undo_logs */ + ulint innodb_available_undo_logs; /*!< srv_available_undo_logs + */ + ulint innodb_defragment_compression_failures; + ulint innodb_defragment_failures; + ulint innodb_defragment_count; + #ifdef UNIV_DEBUG ulint innodb_purge_trx_id_age; /*!< rw_max_trx_id - purged trx_id */ ulint innodb_purge_view_trx_id_age; /*!< rw_max_trx_id diff --git a/storage/innobase/include/sync0sync.h b/storage/innobase/include/sync0sync.h index 7b00e16476b..f26e66f1a87 100644 --- a/storage/innobase/include/sync0sync.h +++ b/storage/innobase/include/sync0sync.h @@ -687,6 +687,7 @@ or row lock!
*/ #define SYNC_EXTERN_STORAGE 500 #define SYNC_FSP 400 #define SYNC_FSP_PAGE 395 +#define SYNC_STATS_DEFRAG 390 /*------------------------------------- Change buffer headers */ #define SYNC_IBUF_MUTEX 370 /* ibuf_mutex */ /*------------------------------------- Change buffer tree */ diff --git a/storage/innobase/include/ut0timer.h b/storage/innobase/include/ut0timer.h new file mode 100644 index 00000000000..f361ae79bf5 --- /dev/null +++ b/storage/innobase/include/ut0timer.h @@ -0,0 +1,104 @@ +/***************************************************************************** + +Copyright (c) 2013, 2014, Facebook, Inc. All Rights Reserved. +Copyright (c) 2014, SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/ut0timer.h +Timer rountines + +Created 30/07/2014 Jan Lindström jan.lindstrom@skysql.com +modified from https://github.com/facebook/mysql-5.6/commit/c75a413edeb96eb99bf11d7269bdfea06f96d6b6 +*************************************************************************/ +#ifndef ut0timer_h +#define ut0timer_h + +#include "univ.i" +#include "data0type.h" +#include <my_rdtsc.h> + +/* Current timer stats */ +extern struct my_timer_unit_info ut_timer; + +/**************************************************************//** +Function pointer to point selected timer function. +@return timer current value */ +extern ulonglong (*ut_timer_now)(void); + +/**************************************************************//** +Sets up the data required for use of my_timer_* functions. +Selects the best timer by high frequency, and tight resolution. +Points my_timer_now() to the selected timer function. +Initializes my_timer struct to contain the info for selected timer.*/ +UNIV_INTERN +void ut_init_timer(void); + +/**************************************************************//** +Return time passed since time then, automatically adjusted +for the estimated timer overhead. 
+@return time passed since "then" */ +UNIV_INLINE +ulonglong +ut_timer_since( +/*===========*/ + ulonglong then); /*!< in: time where to calculate */ +/**************************************************************//** +Get time passed since "then", and update then to now +@return time passed sinche "then" */ +UNIV_INLINE +ulonglong +ut_timer_since_and_update( +/*======================*/ + ulonglong *then); /*!< in: time where to calculate */ +/**************************************************************//** +Convert native timer units in a ulonglong into seconds in a double +@return time in a seconds */ +UNIV_INLINE +double +ut_timer_to_seconds( +/*=================*/ + ulonglong when); /*!< in: time where to calculate */ +/**************************************************************//** +Convert native timer units in a ulonglong into milliseconds in a double +@return time in milliseconds */ +UNIV_INLINE +double +ut_timer_to_milliseconds( +/*=====================*/ + ulonglong when); /*!< in: time where to calculate */ +/**************************************************************//** +Convert native timer units in a ulonglong into microseconds in a double +@return time in microseconds */ +UNIV_INLINE +double +ut_timer_to_microseconds( +/*=====================*/ + ulonglong when); /*!< in: time where to calculate */ +/**************************************************************//** +Convert microseconds in a double to native timer units in a ulonglong +@return time in microseconds */ +UNIV_INLINE +ulonglong +ut_microseconds_to_timer( +/*=====================*/ + ulonglong when); /*!< in: time where to calculate */ + +#ifndef UNIV_NONINL +#include "ut0timer.ic" +#endif + +#endif diff --git a/storage/innobase/include/ut0timer.ic b/storage/innobase/include/ut0timer.ic new file mode 100644 index 00000000000..027e89c6279 --- /dev/null +++ b/storage/innobase/include/ut0timer.ic @@ -0,0 +1,113 @@ 
+/***************************************************************************** + +Copyright (c) 2013, 2014, Facebook, Inc. All Rights Reserved. +Copyright (c) 2014, SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/ut0timer.ic +Timer rountines + +Created 30/07/2014 Jan Lindström jan.lindstrom@skysql.com +modified from https://github.com/facebook/mysql-5.6/commit/c75a413edeb96eb99bf11d7269bdfea06f96d6b6 +*************************************************************************/ + +/**************************************************************//** +Return time passed since time then, automatically adjusted +for the estimated timer overhead. 
+@return time passed since "then" */ +UNIV_INLINE +ulonglong +ut_timer_since( +/*===========*/ + ulonglong then) /*!< in: time where to calculate */ +{ + return (ut_timer_now() - then) - ut_timer.overhead; +} + +/**************************************************************//** +Get time passed since "then", and update then to now +@return time passed sinche "then" */ +UNIV_INLINE +ulonglong +ut_timer_since_and_update( +/*======================*/ + ulonglong *then) /*!< in: time where to calculate */ +{ + ulonglong now = ut_timer_now(); + ulonglong ret = (now - (*then)) - ut_timer.overhead; + *then = now; + return ret; +} + +/**************************************************************//** +Convert native timer units in a ulonglong into seconds in a double +@return time in a seconds */ +UNIV_INLINE +double +ut_timer_to_seconds( +/*=================*/ + ulonglong when) /*!< in: time where to calculate */ +{ + double ret = (double)(when); + ret /= (double)(ut_timer.frequency); + return ret; +} + +/**************************************************************//** +Convert native timer units in a ulonglong into milliseconds in a double +@return time in milliseconds */ +UNIV_INLINE +double +ut_timer_to_milliseconds( +/*=====================*/ + ulonglong when) /*!< in: time where to calculate */ +{ + double ret = (double)(when); + ret *= 1000.0; + ret /= (double)(ut_timer.frequency); + return ret; +} + +/**************************************************************//** +Convert native timer units in a ulonglong into microseconds in a double +@return time in microseconds */ +UNIV_INLINE +double +ut_timer_to_microseconds( +/*=====================*/ + ulonglong when) /*!< in: time where to calculate */ +{ + double ret = (double)(when); + ret *= 1000000.0; + ret /= (double)(ut_timer.frequency); + return ret; +} + +/**************************************************************//** +Convert microseconds in a double to native timer units in a ulonglong +@return time in 
microseconds */ +UNIV_INLINE +ulonglong +ut_microseconds_to_timer( +/*=====================*/ + ulonglong when) /*!< in: time where to calculate */ +{ + double ret = when; + ret *= (double)(ut_timer.frequency); + ret /= 1000000.0; + return (ulonglong)ret; +} diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index e4db2c30751..c1e12ea2928 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -3268,6 +3268,47 @@ lock_update_merge_left( } /*************************************************************//** +Updates the lock table when a page is split and merged to +two pages. */ +UNIV_INTERN +void +lock_update_split_and_merge( + const buf_block_t* left_block, /*!< in: left page to which merged */ + const rec_t* orig_pred, /*!< in: original predecessor of + supremum on the left page before merge*/ + const buf_block_t* right_block) /*!< in: right page from which merged */ +{ + const rec_t* left_next_rec; + + ut_a(left_block && right_block); + ut_a(orig_pred); + + lock_mutex_enter(); + + left_next_rec = page_rec_get_next_const(orig_pred); + + /* Inherit the locks on the supremum of the left page to the + first record which was moved from the right page */ + lock_rec_inherit_to_gap( + left_block, left_block, + page_rec_get_heap_no(left_next_rec), + PAGE_HEAP_NO_SUPREMUM); + + /* Reset the locks on the supremum of the left page, + releasing waiting transactions */ + lock_rec_reset_and_release_wait(left_block, + PAGE_HEAP_NO_SUPREMUM); + + /* Inherit the locks to the supremum of the left page from the + successor of the infimum on the right page */ + lock_rec_inherit_to_gap(left_block, right_block, + PAGE_HEAP_NO_SUPREMUM, + lock_get_min_heap_no(right_block)); + + lock_mutex_exit(); +} + +/*************************************************************//** Resets the original locks on heir and replaces them with gap type locks inherited from rec. 
*/ UNIV_INTERN diff --git a/storage/innobase/page/page0cur.cc b/storage/innobase/page/page0cur.cc index f5f7e1299ce..97405261392 100644 --- a/storage/innobase/page/page0cur.cc +++ b/storage/innobase/page/page0cur.cc @@ -1349,6 +1349,21 @@ page_cur_insert_rec_zip( return(insert_rec); } + /* Page compress failed. If this happened on a + leaf page, put the data size into the sample + buffer. */ + if (page_is_leaf(page)) { + ulint occupied = page_get_data_size(page) + + page_dir_calc_reserved_space( + page_get_n_recs(page)); + index->stat_defrag_data_size_sample[ + index->stat_defrag_sample_next_slot] = + occupied; + index->stat_defrag_sample_next_slot = + (index->stat_defrag_sample_next_slot + + 1) % STAT_DEFRAG_DATA_SIZE_N_SAMPLE; + } + ut_ad(cursor->rec == (pos > 1 ? page_rec_get_nth( diff --git a/storage/innobase/row/row0mysql.cc b/storage/innobase/row/row0mysql.cc index 93d13ea49ee..8def475e1f9 100644 --- a/storage/innobase/row/row0mysql.cc +++ b/storage/innobase/row/row0mysql.cc @@ -54,6 +54,7 @@ Created 9/17/2000 Heikki Tuuri #include "rem0cmp.h" #include "log0log.h" #include "btr0sea.h" +#include "btr0defragment.h" #include "fil0fil.h" #include "ibuf0ibuf.h" #include "fts0fts.h" @@ -3843,6 +3844,8 @@ row_drop_table_for_mysql( if (!dict_table_is_temporary(table)) { dict_stats_recalc_pool_del(table); + dict_stats_defrag_pool_del(table, NULL); + btr_defragment_remove_table(table); /* Remove stats for this table and all of its indexes from the persistent storage if it exists and if there are stats for this @@ -5128,18 +5131,6 @@ end: trx->error_state = DB_SUCCESS; trx_rollback_to_savepoint(trx, NULL); trx->error_state = DB_SUCCESS; - } else { - if (old_is_tmp && !new_is_tmp) { - /* After ALTER TABLE the table statistics - needs to be rebuilt. Even if we close - table below there could be other - transactions using this table (e.g. - SELECT * FROM INFORMATION_SCHEMA.`TABLE_CONSTRAINTS`), - thus we can't remove table from dictionary cache - here. 
Therefore, we initialize the - transient statistics here. */ - dict_stats_update_transient(table); - } } } diff --git a/storage/innobase/srv/srv0srv.cc b/storage/innobase/srv/srv0srv.cc index b9cfb3544b9..7ca29d1ace1 100644 --- a/storage/innobase/srv/srv0srv.cc +++ b/storage/innobase/srv/srv0srv.cc @@ -68,6 +68,7 @@ Created 10/8/1995 Heikki Tuuri #include "os0sync.h" /* for HAVE_ATOMIC_BUILTINS */ #include "srv0mon.h" #include "ut0crc32.h" +#include "btr0defragment.h" #include "mysql/plugin.h" #include "mysql/service_thd_wait.h" @@ -396,6 +397,15 @@ UNIV_INTERN ib_uint64_t srv_page_compressed_trim_op = 0; UNIV_INTERN ib_uint64_t srv_page_compressed_trim_op_saved = 0; UNIV_INTERN ib_uint64_t srv_index_page_decompressed = 0; +/* Defragmentation */ +UNIV_INTERN my_bool srv_defragment = FALSE; +UNIV_INTERN uint srv_defragment_n_pages = 7; +UNIV_INTERN uint srv_defragment_stats_accuracy = 0; +UNIV_INTERN uint srv_defragment_fill_factor_n_recs = 20; +UNIV_INTERN double srv_defragment_fill_factor = 0.9; +UNIV_INTERN uint srv_defragment_frequency = + SRV_DEFRAGMENT_FREQUENCY_DEFAULT; +UNIV_INTERN ulonglong srv_defragment_interval = 0; /* Set the following to 0 if you want InnoDB to write messages on stderr on startup/shutdown. 
*/ @@ -1492,6 +1502,11 @@ srv_export_innodb_status(void) export_vars.innodb_page_compressed_trim_op_saved = srv_stats.page_compressed_trim_op_saved; export_vars.innodb_pages_page_decompressed = srv_stats.pages_page_decompressed; + export_vars.innodb_defragment_compression_failures = + btr_defragment_compression_failures; + export_vars.innodb_defragment_failures = btr_defragment_failures; + export_vars.innodb_defragment_count = btr_defragment_count; + #ifdef UNIV_DEBUG rw_lock_s_lock(&purge_sys->latch); trx_id_t done_trx_no = purge_sys->done.trx_no; diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index ece16c6bd70..6a02b08c3b7 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -43,6 +43,7 @@ Created 2/16/1996 Heikki Tuuri #include "pars0pars.h" #include "row0ftsort.h" #include "ut0mem.h" +#include "ut0timer.h" #include "mem0mem.h" #include "data0data.h" #include "data0type.h" @@ -67,6 +68,8 @@ Created 2/16/1996 Heikki Tuuri #include "ibuf0ibuf.h" #include "srv0start.h" #include "srv0srv.h" +#include "btr0defragment.h" + #ifndef UNIV_HOTBACKUP # include "trx0rseg.h" # include "os0proc.h" @@ -1531,6 +1534,9 @@ innobase_start_or_create_for_mysql(void) char* logfile0 = NULL; size_t dirnamelen; + /* This should be initialized early */ + ut_init_timer(); + if (srv_force_recovery > SRV_FORCE_NO_TRX_UNDO) { srv_read_only_mode = true; } @@ -2877,6 +2883,9 @@ files_checked: fts_optimize_init(); } + /* Initialize online defragmentation. 
*/ + btr_defragment_init(); + srv_was_started = TRUE; return(DB_SUCCESS); diff --git a/storage/innobase/sync/sync0sync.cc b/storage/innobase/sync/sync0sync.cc index 5ef8a02fb3f..3532f513646 100644 --- a/storage/innobase/sync/sync0sync.cc +++ b/storage/innobase/sync/sync0sync.cc @@ -1164,6 +1164,7 @@ sync_thread_add_level( case SYNC_IBUF_MUTEX: case SYNC_INDEX_ONLINE_LOG: case SYNC_STATS_AUTO_RECALC: + case SYNC_STATS_DEFRAG: if (!sync_thread_levels_g(array, level, TRUE)) { fprintf(stderr, "InnoDB: sync_thread_levels_g(array, %lu)" diff --git a/storage/innobase/ut/ut0timer.cc b/storage/innobase/ut/ut0timer.cc new file mode 100644 index 00000000000..85292cce28c --- /dev/null +++ b/storage/innobase/ut/ut0timer.cc @@ -0,0 +1,92 @@ +/***************************************************************************** + +Copyright (c) 2013, 2014, Facebook, Inc. All Rights Reserved. +Copyright (c) 2014, SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file ut/ut0timer.cc +Timer rountines + +Created 30/07/2014 Jan Lindström jan.lindstrom@skysql.com +modified from https://github.com/facebook/mysql-5.6/commit/c75a413edeb96eb99bf11d7269bdfea06f96d6b6 +*************************************************************************/ + +#include "data0type.h" +#include <my_rdtsc.h> +#include <ut0timer.h> + +/**************************************************************//** +Initial timer definition +@return 0 */ +static +ulonglong +ut_timer_none(void) +/*===============*/ +{ + return 0; +} + +/**************************************************************//** +Function pointer to point selected timer function. +@return timer current value */ +ulonglong (*ut_timer_now)(void) = &ut_timer_none; + +struct my_timer_unit_info ut_timer; + +/**************************************************************//** +Sets up the data required for use of my_timer_* functions. +Selects the best timer by high frequency, and tight resolution. +Points my_timer_now() to the selected timer function. 
+Initializes my_timer struct to contain the info for selected timer.*/ +UNIV_INTERN +void +ut_init_timer(void) +/*===============*/ +{ + MY_TIMER_INFO all_timer_info; + my_timer_init(&all_timer_info); + + if (all_timer_info.cycles.frequency > 1000000 && + all_timer_info.cycles.resolution == 1) { + ut_timer = all_timer_info.cycles; + ut_timer_now = &my_timer_cycles; + } else if (all_timer_info.nanoseconds.frequency > 1000000 && + all_timer_info.nanoseconds.resolution == 1) { + ut_timer = all_timer_info.nanoseconds; + ut_timer_now = &my_timer_nanoseconds; + } else if (all_timer_info.microseconds.frequency >= 1000000 && + all_timer_info.microseconds.resolution == 1) { + ut_timer = all_timer_info.microseconds; + ut_timer_now = &my_timer_microseconds; + + } else if (all_timer_info.milliseconds.frequency >= 1000 && + all_timer_info.milliseconds.resolution == 1) { + ut_timer = all_timer_info.milliseconds; + ut_timer_now = &my_timer_milliseconds; + } else if (all_timer_info.ticks.frequency >= 1000 && + /* Will probably be false */ + all_timer_info.ticks.resolution == 1) { + ut_timer = all_timer_info.ticks; + ut_timer_now = &my_timer_ticks; + } else { + /* None are acceptable, so leave it as "None", and fill in struct */ + ut_timer.frequency = 1; /* Avoid div-by-zero */ + ut_timer.overhead = 0; /* Since it doesn't do anything */ + ut_timer.resolution = 10; /* Another sign it's bad */ + ut_timer.routine = 0; /* None */ + } +} diff --git a/storage/xtradb/CMakeLists.txt b/storage/xtradb/CMakeLists.txt index 528c6f87fcc..e34add61886 100644 --- a/storage/xtradb/CMakeLists.txt +++ b/storage/xtradb/CMakeLists.txt @@ -292,6 +292,7 @@ SET(INNOBASE_SOURCES btr/btr0cur.cc btr/btr0pcur.cc btr/btr0sea.cc + btr/btr0defragment.cc buf/buf0buddy.cc buf/buf0buf.cc buf/buf0dblwr.cc @@ -405,7 +406,8 @@ SET(INNOBASE_SOURCES ut/ut0rnd.cc ut/ut0ut.cc ut/ut0vec.cc - ut/ut0wqueue.cc) + ut/ut0wqueue.cc + ut/ut0timer.cc) IF(NOT XTRADB_OK) MESSAGE(FATAL_ERROR "Percona XtraDB is not supported on this 
platform") diff --git a/storage/xtradb/btr/btr0btr.cc b/storage/xtradb/btr/btr0btr.cc index cce91bdab6e..926c3be0fb5 100644 --- a/storage/xtradb/btr/btr0btr.cc +++ b/storage/xtradb/btr/btr0btr.cc @@ -38,6 +38,7 @@ Created 6/2/1994 Heikki Tuuri #include "btr0cur.h" #include "btr0sea.h" #include "btr0pcur.h" +#include "btr0defragment.h" #include "rem0cmp.h" #include "lock0lock.h" #include "ibuf0ibuf.h" @@ -1213,6 +1214,32 @@ btr_get_size( mtr_t* mtr) /*!< in/out: mini-transaction where index is s-latched */ { + ulint used; + if (flag == BTR_N_LEAF_PAGES) { + btr_get_size_and_reserved(index, flag, &used, mtr); + return used; + } else if (flag == BTR_TOTAL_SIZE) { + return btr_get_size_and_reserved(index, flag, &used, mtr); + } else { + ut_error; + } + return (ULINT_UNDEFINED); +} + +/**************************************************************//** +Gets the number of reserved and used pages in a B-tree. +@return number of pages reserved, or ULINT_UNDEFINED if the index +is unavailable */ +UNIV_INTERN +ulint +btr_get_size_and_reserved( +/*======================*/ + dict_index_t* index, /*!< in: index */ + ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */ + ulint* used, /*!< out: number of pages used (<= reserved) */ + mtr_t* mtr) /*!< in/out: mini-transaction where index + is s-latched */ +{ fseg_header_t* seg_header; page_t* root; ulint n; @@ -1221,6 +1248,8 @@ btr_get_size( ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), MTR_MEMO_S_LOCK)); + ut_a(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE); + if (index->page == FIL_NULL || dict_index_is_online_ddl(index) || *index->name == TEMP_INDEX_PREFIX) { return(ULINT_UNDEFINED); @@ -1228,27 +1257,16 @@ btr_get_size( root = btr_root_get(index, mtr); - SRV_CORRUPT_TABLE_CHECK(root, - { - mtr_commit(mtr); - return(0); - }); + seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; - if (flag == BTR_N_LEAF_PAGES) { - seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; - - fseg_n_reserved_pages(seg_header, 
&n, mtr); + n = fseg_n_reserved_pages(seg_header, used, mtr); - } else if (flag == BTR_TOTAL_SIZE) { + if (flag == BTR_TOTAL_SIZE) { seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP; - n = fseg_n_reserved_pages(seg_header, &dummy, mtr); - - seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF; - n += fseg_n_reserved_pages(seg_header, &dummy, mtr); - } else { - ut_error; + *used += dummy; + } return(n); @@ -2013,7 +2031,7 @@ IBUF_BITMAP_FREE is unaffected by reorganization. @retval true if the operation was successful @retval false if it is a compressed page, and recompression failed */ -static __attribute__((nonnull)) +UNIV_INTERN bool btr_page_reorganize_block( /*======================*/ @@ -2965,6 +2983,12 @@ func_start: new_page_zip = buf_block_get_page_zip(new_block); btr_page_create(new_block, new_page_zip, cursor->index, btr_page_get_level(page, mtr), mtr); + /* Only record the leaf level page splits. */ + if (btr_page_get_level(page, mtr) == 0) { + cursor->index->stat_defrag_n_page_split ++; + cursor->index->stat_defrag_modified_counter ++; + btr_defragment_save_defrag_stats_if_needed(cursor->index); + } /* 3. Calculate the first record on the upper half-page, and the first record (move_limit) on original page which ends up on the @@ -3223,31 +3247,9 @@ func_exit: return(rec); } -#ifdef UNIV_SYNC_DEBUG -/*************************************************************//** -Removes a page from the level list of pages. -@param space in: space where removed -@param zip_size in: compressed page size in bytes, or 0 for uncompressed -@param page in/out: page to remove -@param index in: index tree -@param mtr in/out: mini-transaction */ -# define btr_level_list_remove(space,zip_size,page,index,mtr) \ - btr_level_list_remove_func(space,zip_size,page,index,mtr) -#else /* UNIV_SYNC_DEBUG */ -/*************************************************************//** -Removes a page from the level list of pages. 
-@param space in: space where removed -@param zip_size in: compressed page size in bytes, or 0 for uncompressed -@param page in/out: page to remove -@param index in: index tree -@param mtr in/out: mini-transaction */ -# define btr_level_list_remove(space,zip_size,page,index,mtr) \ - btr_level_list_remove_func(space,zip_size,page,mtr) -#endif /* UNIV_SYNC_DEBUG */ - /*************************************************************//** Removes a page from the level list of pages. */ -static __attribute__((nonnull)) +UNIV_INTERN void btr_level_list_remove_func( /*=======================*/ @@ -3419,7 +3421,7 @@ btr_node_ptr_delete( If page is the only on its level, this function moves its records to the father page, thus reducing the tree height. @return father block */ -static +UNIV_INTERN buf_block_t* btr_lift_page_up( /*=============*/ diff --git a/storage/xtradb/btr/btr0defragment.cc b/storage/xtradb/btr/btr0defragment.cc new file mode 100644 index 00000000000..a784c8c5be7 --- /dev/null +++ b/storage/xtradb/btr/btr0defragment.cc @@ -0,0 +1,815 @@ +/***************************************************************************** + +Copyright (C) 2012, 2014 Facebook, Inc. All Rights Reserved. +Copyright (C) 2014, SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ +/**************************************************//** +@file btr/btr0defragment.cc +Index defragmentation. + +Created 05/29/2014 Rongrong Zhong +Modified 16/07/2014 Sunguck Lee +Modified 30/07/2014 Jan Lindström jan.lindstrom@skysql.com +*******************************************************/ + +#include "btr0defragment.h" +#ifndef UNIV_HOTBACKUP +#include "btr0cur.h" +#include "btr0sea.h" +#include "btr0pcur.h" +#include "dict0stats.h" +#include "dict0stats_bg.h" +#include "ibuf0ibuf.h" +#include "lock0lock.h" +#include "srv0start.h" +#include "srv0srv.h" +#include "ut0timer.h" + +#include <list> + +/**************************************************//** +Custom nullptr implementation for under g++ 4.6 +*******************************************************/ +// #pragma once +namespace std +{ + // based on SC22/WG21/N2431 = J16/07-0301 + struct nullptr_t + { + template<typename any> operator any * () const + { + return 0; + } + template<class any, typename T> operator T any:: * () const + { + return 0; + } + +#ifdef _MSC_VER + struct pad {}; + pad __[sizeof(void*)/sizeof(pad)]; +#else + char __[sizeof(void*)]; +#endif +private: + // nullptr_t();// {} + // nullptr_t(const nullptr_t&); + // void operator = (const nullptr_t&); + void operator &() const; + template<typename any> void operator +(any) const + { + /*I Love MSVC 2005!*/ + } + template<typename any> void operator -(any) const + { + /*I Love MSVC 2005!*/ + } + }; +static const nullptr_t __nullptr = {}; +} + +#ifndef nullptr +#define nullptr std::__nullptr +#endif +/**************************************************//** +End of Custom nullptr implementation for under g++ 4.6 
+*******************************************************/ + +/* When there's no work, either because defragment is disabled, or because no +query is submitted, thread checks state every BTR_DEFRAGMENT_SLEEP_IN_USECS.*/ +#define BTR_DEFRAGMENT_SLEEP_IN_USECS 1000000 +/* Reduce the target page size by this amount when compression failure happens +during defragmentaiton. 512 is chosen because it's a power of 2 and it is about +3% of the page size. When there are compression failures in defragmentation, +our goal is to get a decent defrag ratio with as few compression failure as +possible. From experimentation it seems that reduce the target size by 512 every +time will make sure the page is compressible within a couple of iterations. */ +#define BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE 512 + +/* Work queue for defragmentation. */ +typedef std::list<btr_defragment_item_t*> btr_defragment_wq_t; +static btr_defragment_wq_t btr_defragment_wq; + +/* Mutex protecting the defragmentation work queue.*/ +ib_mutex_t btr_defragment_mutex; +#ifdef UNIV_PFS_MUTEX +UNIV_INTERN mysql_pfs_key_t btr_defragment_mutex_key; +#endif /* UNIV_PFS_MUTEX */ + +/* Number of compression failures caused by defragmentation since server +start. */ +ulint btr_defragment_compression_failures = 0; +/* Number of btr_defragment_n_pages calls that altered page but didn't +manage to release any page. */ +ulint btr_defragment_failures = 0; +/* Total number of btr_defragment_n_pages calls that altered page. +The difference between btr_defragment_count and btr_defragment_failures shows +the amount of effort wasted. */ +ulint btr_defragment_count = 0; + +/******************************************************************//** +Constructor for btr_defragment_item_t. 
*/ +btr_defragment_item_t::btr_defragment_item_t( + btr_pcur_t* pcur, + os_event_t event) +{ + this->pcur = pcur; + this->event = event; + this->removed = false; + this->last_processed = 0; +} + +/******************************************************************//** +Destructor for btr_defragment_item_t. */ +btr_defragment_item_t::~btr_defragment_item_t() { + if (this->pcur) { + btr_pcur_free_for_mysql(this->pcur); + } + if (this->event) { + os_event_set(this->event); + } +} + +/******************************************************************//** +Initialize defragmentation. */ +void +btr_defragment_init() +{ + srv_defragment_interval = ut_microseconds_to_timer( + 1000000.0 / srv_defragment_frequency); + mutex_create(btr_defragment_mutex_key, &btr_defragment_mutex, + SYNC_ANY_LATCH); + os_thread_create(btr_defragment_thread, NULL, NULL); +} + +/******************************************************************//** +Shutdown defragmentation. Release all resources. */ +void +btr_defragment_shutdown() +{ + mutex_enter(&btr_defragment_mutex); + list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + while(iter != btr_defragment_wq.end()) { + btr_defragment_item_t* item = *iter; + iter = btr_defragment_wq.erase(iter); + delete item; + } + mutex_exit(&btr_defragment_mutex); + mutex_free(&btr_defragment_mutex); +} + + +/******************************************************************//** +Functions used by the query threads: btr_defragment_xxx_index +Query threads find/add/remove index. */ +/******************************************************************//** +Check whether the given index is in btr_defragment_wq. We use index->id +to identify indices. */ +bool +btr_defragment_find_index( + dict_index_t* index) /*!< Index to find. 
*/ +{ + mutex_enter(&btr_defragment_mutex); + for (list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + iter != btr_defragment_wq.end(); + ++iter) { + btr_defragment_item_t* item = *iter; + btr_pcur_t* pcur = item->pcur; + btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur); + dict_index_t* idx = btr_cur_get_index(cursor); + if (index->id == idx->id) { + mutex_exit(&btr_defragment_mutex); + return true; + } + } + mutex_exit(&btr_defragment_mutex); + return false; +} + +/******************************************************************//** +Query thread uses this function to add an index to btr_defragment_wq. +Return a pointer to os_event for the query thread to wait on if this is a +synchronized defragmentation. */ +os_event_t +btr_defragment_add_index( + dict_index_t* index, /*!< index to be added */ + bool async) /*!< whether this is an async defragmentation */ +{ + mtr_t mtr; + ulint space = dict_index_get_space(index); + ulint zip_size = dict_table_zip_size(index->table); + ulint page_no = dict_index_get_page(index); + mtr_start(&mtr); + // Load index rood page. + page_t* page = btr_page_get(space, zip_size, page_no, + RW_NO_LATCH, index, &mtr); + if (btr_page_get_level(page, &mtr) == 0) { + // Index root is a leaf page, no need to defragment. 
+ mtr_commit(&mtr); + return NULL; + } + btr_pcur_t* pcur = btr_pcur_create_for_mysql(); + os_event_t event = NULL; + if (!async) { + event = os_event_create(); + } + btr_pcur_open_at_index_side(true, index, BTR_SEARCH_LEAF, pcur, + true, 0, &mtr); + btr_pcur_move_to_next(pcur, &mtr); + btr_pcur_store_position(pcur, &mtr); + mtr_commit(&mtr); + dict_stats_empty_defrag_summary(index); + btr_defragment_item_t* item = new btr_defragment_item_t(pcur, event); + mutex_enter(&btr_defragment_mutex); + btr_defragment_wq.push_back(item); + mutex_exit(&btr_defragment_mutex); + return event; +} + +/******************************************************************//** +When table is dropped, this function is called to mark a table as removed in +btr_efragment_wq. The difference between this function and the remove_index +function is this will not NULL the event. */ +void +btr_defragment_remove_table( + dict_table_t* table) /*!< Index to be removed. */ +{ + mutex_enter(&btr_defragment_mutex); + for (list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + iter != btr_defragment_wq.end(); + ++iter) { + btr_defragment_item_t* item = *iter; + btr_pcur_t* pcur = item->pcur; + btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur); + dict_index_t* idx = btr_cur_get_index(cursor); + if (table->id == idx->table->id) { + item->removed = true; + } + } + mutex_exit(&btr_defragment_mutex); +} + +/******************************************************************//** +Query thread uses this function to mark an index as removed in +btr_efragment_wq. */ +void +btr_defragment_remove_index( + dict_index_t* index) /*!< Index to be removed. 
*/ +{ + mutex_enter(&btr_defragment_mutex); + for (list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + iter != btr_defragment_wq.end(); + ++iter) { + btr_defragment_item_t* item = *iter; + btr_pcur_t* pcur = item->pcur; + btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur); + dict_index_t* idx = btr_cur_get_index(cursor); + if (index->id == idx->id) { + item->removed = true; + item->event = NULL; + break; + } + } + mutex_exit(&btr_defragment_mutex); +} + +/******************************************************************//** +Functions used by defragmentation thread: btr_defragment_xxx_item. +Defragmentation thread operates on the work *item*. It gets/removes +item from the work queue. */ +/******************************************************************//** +Defragment thread uses this to remove an item from btr_defragment_wq. +When an item is removed from the work queue, all resources associated with it +are free as well. */ +void +btr_defragment_remove_item( + btr_defragment_item_t* item) /*!< Item to be removed. */ +{ + mutex_enter(&btr_defragment_mutex); + for (list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + iter != btr_defragment_wq.end(); + ++iter) { + if (item == *iter) { + btr_defragment_wq.erase(iter); + delete item; + break; + } + } + mutex_exit(&btr_defragment_mutex); +} + +/******************************************************************//** +Defragment thread uses this to get an item from btr_defragment_wq to work on. +The item is not removed from the work queue so query threads can still access +this item. We keep it this way so query threads can find and kill a +defragmentation even if that index is being worked on. Be aware that while you +work on this item you have no lock protection on it whatsoever. This is OK as +long as the query threads and defragment thread won't modify the same fields +without lock protection. 
+*/ +btr_defragment_item_t* +btr_defragment_get_item() +{ + if (btr_defragment_wq.empty()) { + return nullptr; + } + mutex_enter(&btr_defragment_mutex); + list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); + if (iter == btr_defragment_wq.end()) { + iter = btr_defragment_wq.begin(); + } + btr_defragment_item_t* item = *iter; + iter++; + mutex_exit(&btr_defragment_mutex); + return item; +} + +/*********************************************************************//** +Check whether we should save defragmentation statistics to persistent storage. +Currently we save the stats to persistent storage every 100 updates. */ +UNIV_INTERN +void +btr_defragment_save_defrag_stats_if_needed( + dict_index_t* index) /*!< in: index */ +{ + if (srv_defragment_stats_accuracy != 0 // stats tracking disabled + && dict_index_get_space(index) != 0 // do not track system tables + && index->stat_defrag_modified_counter + >= srv_defragment_stats_accuracy) { + dict_stats_defrag_pool_add(index); + index->stat_defrag_modified_counter = 0; + } +} + +/*********************************************************************//** +Main defragment functionalities used by defragment thread.*/ +/*************************************************************//** +Calculate number of records from beginning of block that can +fit into size_limit +@return number of records */ +UNIV_INTERN +ulint +btr_defragment_calc_n_recs_for_size( + buf_block_t* block, /*!< in: B-tree page */ + dict_index_t* index, /*!< in: index of the page */ + ulint size_limit, /*!< in: size limit to fit records in */ + ulint* n_recs_size) /*!< out: actual size of the records that fit + in size_limit. 
*/ +{ + page_t* page = buf_block_get_frame(block); + ulint n_recs = 0; + ulint offsets_[REC_OFFS_NORMAL_SIZE]; + ulint* offsets = offsets_; + rec_offs_init(offsets_); + mem_heap_t* heap = NULL; + ulint size = 0; + page_cur_t cur; + + page_cur_set_before_first(block, &cur); + page_cur_move_to_next(&cur); + while (page_cur_get_rec(&cur) != page_get_supremum_rec(page)) { + rec_t* cur_rec = page_cur_get_rec(&cur); + offsets = rec_get_offsets(cur_rec, index, offsets, + ULINT_UNDEFINED, &heap); + ulint rec_size = rec_offs_size(offsets); + size += rec_size; + if (size > size_limit) { + size = size - rec_size; + break; + } + n_recs ++; + page_cur_move_to_next(&cur); + } + *n_recs_size = size; + return n_recs; +} + +/*************************************************************//** +Merge as many records from the from_block to the to_block. Delete +the from_block if all records are successfully merged to to_block. +@return the to_block to target for next merge operation. */ +UNIV_INTERN +buf_block_t* +btr_defragment_merge_pages( + dict_index_t* index, /*!< in: index tree */ + buf_block_t* from_block, /*!< in: origin of merge */ + buf_block_t* to_block, /*!< in: destination of merge */ + ulint zip_size, /*!< in: zip size of the block */ + ulint reserved_space, /*!< in: space reserved for future + insert to avoid immediate page split */ + ulint* max_data_size, /*!< in/out: max data size to + fit in a single compressed page. 
*/ + mem_heap_t* heap, /*!< in/out: pointer to memory heap */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + page_t* from_page = buf_block_get_frame(from_block); + page_t* to_page = buf_block_get_frame(to_block); + ulint space = dict_index_get_space(index); + ulint level = btr_page_get_level(from_page, mtr); + ulint n_recs = page_get_n_recs(from_page); + ulint new_data_size = page_get_data_size(to_page); + ulint max_ins_size = + page_get_max_insert_size(to_page, n_recs); + ulint max_ins_size_reorg = + page_get_max_insert_size_after_reorganize( + to_page, n_recs); + ulint max_ins_size_to_use = max_ins_size_reorg > reserved_space + ? max_ins_size_reorg - reserved_space : 0; + ulint move_size = 0; + ulint n_recs_to_move = 0; + rec_t* rec = NULL; + ulint target_n_recs = 0; + rec_t* orig_pred; + + // Estimate how many records can be moved from the from_page to + // the to_page. + if (zip_size) { + ulint page_diff = UNIV_PAGE_SIZE - *max_data_size; + max_ins_size_to_use = (max_ins_size_to_use > page_diff) + ? max_ins_size_to_use - page_diff : 0; + } + n_recs_to_move = btr_defragment_calc_n_recs_for_size( + from_block, index, max_ins_size_to_use, &move_size); + + // If max_ins_size >= move_size, we can move the records without + // reorganizing the page, otherwise we need to reorganize the page + // first to release more space. + if (move_size > max_ins_size) { + if (!btr_page_reorganize_block(false, page_zip_level, + to_block, index, + mtr)) { + if (!dict_index_is_clust(index) + && page_is_leaf(to_page)) { + ibuf_reset_free_bits(to_block); + } + // If reorganization fails, that means page is + // not compressable. There's no point to try + // merging into this page. Continue to the + // next page. + return from_block; + } + ut_ad(page_validate(to_page, index)); + max_ins_size = page_get_max_insert_size(to_page, n_recs); + ut_a(max_ins_size >= move_size); + } + + // Move records to pack to_page more full. 
+ orig_pred = NULL; + target_n_recs = n_recs_to_move; + while (n_recs_to_move > 0) { + rec = page_rec_get_nth(from_page, + n_recs_to_move + 1); + orig_pred = page_copy_rec_list_start( + to_block, from_block, rec, index, mtr); + if (orig_pred) + break; + // If we reach here, that means compression failed after packing + // n_recs_to_move number of records to to_page. We try to reduce + // the targeted data size on the to_page by + // BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE and try again. + os_atomic_increment_ulint( + &btr_defragment_compression_failures, 1); + max_ins_size_to_use = + move_size > BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE + ? move_size - BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE + : 0; + if (max_ins_size_to_use == 0) { + n_recs_to_move = 0; + move_size = 0; + break; + } + n_recs_to_move = btr_defragment_calc_n_recs_for_size( + from_block, index, max_ins_size_to_use, &move_size); + } + // If less than target_n_recs are moved, it means there are + // compression failures during page_copy_rec_list_start. Adjust + // the max_data_size estimation to reduce compression failures + // in the following runs. + if (target_n_recs > n_recs_to_move + && *max_data_size > new_data_size + move_size) { + *max_data_size = new_data_size + move_size; + } + // Set ibuf free bits if necessary. + if (!dict_index_is_clust(index) + && page_is_leaf(to_page)) { + if (zip_size) { + ibuf_reset_free_bits(to_block); + } else { + ibuf_update_free_bits_if_full( + to_block, + UNIV_PAGE_SIZE, + ULINT_UNDEFINED); + } + } + if (n_recs_to_move == n_recs) { + /* The whole page is merged with the previous page, + free it. 
*/ + lock_update_merge_left(to_block, orig_pred, + from_block); + btr_search_drop_page_hash_index(from_block); + btr_level_list_remove(space, zip_size, from_page, + index, mtr); + btr_node_ptr_delete(index, from_block, mtr); + btr_blob_dbg_remove(from_page, index, + "btr_defragment_n_pages"); + btr_page_free(index, from_block, mtr); + } else { + // There are still records left on the page, so + // increment n_defragmented. Node pointer will be changed + // so remove the old node pointer. + if (n_recs_to_move > 0) { + // Part of the page is merged to left, remove + // the merged records, update record locks and + // node pointer. + dtuple_t* node_ptr; + page_delete_rec_list_start(rec, from_block, + index, mtr); + lock_update_split_and_merge(to_block, + orig_pred, + from_block); + btr_node_ptr_delete(index, from_block, mtr); + rec = page_rec_get_next( + page_get_infimum_rec(from_page)); + node_ptr = dict_index_build_node_ptr( + index, rec, page_get_page_no(from_page), + heap, level + 1); + btr_insert_on_non_leaf_level(0, index, level+1, + node_ptr, mtr); + } + to_block = from_block; + } + return to_block; +} + +/*************************************************************//** +Tries to merge N consecutive pages, starting from the page pointed by the +cursor. Skip space 0. Only consider leaf pages. +This function first loads all N pages into memory, then for each of +the pages other than the first page, it tries to move as many records +as possible to the left sibling to keep the left sibling full. During +the process, if any page becomes empty, that page will be removed from +the level list. Record locks, hash, and node pointers are updated after +page reorganization. 
+@return pointer to the last block processed, or NULL if reaching end of index */ +UNIV_INTERN +buf_block_t* +btr_defragment_n_pages( + buf_block_t* block, /*!< in: starting block for defragmentation */ + dict_index_t* index, /*!< in: index tree */ + uint n_pages,/*!< in: number of pages to defragment */ + mtr_t* mtr) /*!< in/out: mini-transaction */ +{ + ulint space; + ulint zip_size; + /* We will need to load the n+1 block because if the last page is freed + and we need to modify the prev_page_no of that block. */ + buf_block_t* blocks[BTR_DEFRAGMENT_MAX_N_PAGES + 1]; + page_t* first_page; + buf_block_t* current_block; + ulint total_data_size = 0; + ulint total_n_recs = 0; + ulint data_size_per_rec; + ulint optimal_page_size; + ulint reserved_space; + ulint level; + ulint max_data_size = 0; + uint n_defragmented = 0; + uint n_new_slots; + mem_heap_t* heap; + ibool end_of_index = FALSE; + + /* It doesn't make sense to call this function with n_pages = 1. */ + ut_ad(n_pages > 1); + + ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index), + MTR_MEMO_X_LOCK)); + space = dict_index_get_space(index); + if (space == 0) { + /* Ignore space 0. */ + return NULL; + } + + if (n_pages > BTR_DEFRAGMENT_MAX_N_PAGES) { + n_pages = BTR_DEFRAGMENT_MAX_N_PAGES; + } + + zip_size = dict_table_zip_size(index->table); + first_page = buf_block_get_frame(block); + level = btr_page_get_level(first_page, mtr); + + if (level != 0) { + return NULL; + } + + /* 1. Load the pages and calculate the total data size. 
*/ + blocks[0] = block; + for (uint i = 1; i <= n_pages; i++) { + page_t* page = buf_block_get_frame(blocks[i-1]); + ulint page_no = btr_page_get_next(page, mtr); + total_data_size += page_get_data_size(page); + total_n_recs += page_get_n_recs(page); + if (page_no == FIL_NULL) { + n_pages = i; + end_of_index = TRUE; + break; + } + blocks[i] = btr_block_get(space, zip_size, page_no, + RW_X_LATCH, index, mtr); + } + + if (n_pages == 1) { + if (btr_page_get_prev(first_page, mtr) == FIL_NULL) { + /* last page in the index */ + if (dict_index_get_page(index) + == page_get_page_no(first_page)) + return NULL; + /* given page is the last page. + Lift the records to father. */ + btr_lift_page_up(index, block, mtr); + } + return NULL; + } + + /* 2. Calculate how many pages data can fit in. If not compressable, + return early. */ + ut_a(total_n_recs != 0); + data_size_per_rec = total_data_size / total_n_recs; + // For uncompressed pages, the optimal data size if the free space of a + // empty page. + optimal_page_size = page_get_free_space_of_empty( + page_is_comp(first_page)); + // For compressed pages, we take compression failures into account. + if (zip_size) { + ulint size = 0; + int i = 0; + // We estimate the optimal data size of the index use samples of + // data size. These samples are taken when pages failed to + // compress due to insertion on the page. We use the average + // of all samples we have as the estimation. Different pages of + // the same index vary in compressibility. Average gives a good + // enough estimation. 
+ for (;i < STAT_DEFRAG_DATA_SIZE_N_SAMPLE; i++) { + if (index->stat_defrag_data_size_sample[i] == 0) { + break; + } + size += index->stat_defrag_data_size_sample[i]; + } + if (i != 0) { + size = size / i; + optimal_page_size = min(optimal_page_size, size); + } + max_data_size = optimal_page_size; + } + + reserved_space = min((ulint)(optimal_page_size + * (1 - srv_defragment_fill_factor)), + (data_size_per_rec + * srv_defragment_fill_factor_n_recs)); + optimal_page_size -= reserved_space; + n_new_slots = (total_data_size + optimal_page_size - 1) + / optimal_page_size; + if (n_new_slots >= n_pages) { + /* Can't defragment. */ + if (end_of_index) + return NULL; + return blocks[n_pages-1]; + } + + /* 3. Defragment pages. */ + heap = mem_heap_create(256); + // First defragmented page will be the first page. + current_block = blocks[0]; + // Start from the second page. + for (uint i = 1; i < n_pages; i ++) { + buf_block_t* new_block = btr_defragment_merge_pages( + index, blocks[i], current_block, zip_size, + reserved_space, &max_data_size, heap, mtr); + if (new_block != current_block) { + n_defragmented ++; + current_block = new_block; + } + } + mem_heap_free(heap); + n_defragmented ++; + os_atomic_increment_ulint( + &btr_defragment_count, 1); + if (n_pages == n_defragmented) { + os_atomic_increment_ulint( + &btr_defragment_failures, 1); + } else { + index->stat_defrag_n_pages_freed += (n_pages - n_defragmented); + } + if (end_of_index) + return NULL; + return current_block; +} + +/******************************************************************//** +Thread that merges consecutive b-tree pages into fewer pages to defragment +the index. 
*/ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(btr_defragment_thread)( +/*==========================================*/ + void* arg) /*!< in: work queue */ +{ + btr_pcur_t* pcur; + btr_cur_t* cursor; + dict_index_t* index; + mtr_t mtr; + buf_block_t* first_block; + buf_block_t* last_block; + + while (srv_shutdown_state == SRV_SHUTDOWN_NONE) { + /* If defragmentation is disabled, sleep before + checking whether it's enabled. */ + if (!srv_defragment) { + os_thread_sleep(BTR_DEFRAGMENT_SLEEP_IN_USECS); + continue; + } + /* The following call won't remove the item from work queue. + We only get a pointer to it to work on. This will make sure + when user issue a kill command, all indices are in the work + queue to be searched. This also means that the user thread + cannot directly remove the item from queue (since we might be + using it). So user thread only marks index as removed. */ + btr_defragment_item_t* item = btr_defragment_get_item(); + /* If work queue is empty, sleep and check later. */ + if (!item) { + os_thread_sleep(BTR_DEFRAGMENT_SLEEP_IN_USECS); + continue; + } + /* If an index is marked as removed, we remove it from the work + queue. No other thread could be using this item at this point so + it's safe to remove now. */ + if (item->removed) { + btr_defragment_remove_item(item); + continue; + } + + pcur = item->pcur; + ulonglong now = ut_timer_now(); + ulonglong elapsed = now - item->last_processed; + + if (elapsed < srv_defragment_interval) { + /* If we see an index again before the interval + determined by the configured frequency is reached, + we just sleep until the interval pass. Since + defragmentation of all indices queue up on a single + thread, it's likely other indices that follow this one + don't need to sleep again. 
*/ + os_thread_sleep(((ulint)ut_timer_to_microseconds( + srv_defragment_interval - elapsed))); + } + + now = ut_timer_now(); + mtr_start(&mtr); + btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, &mtr); + cursor = btr_pcur_get_btr_cur(pcur); + index = btr_cur_get_index(cursor); + first_block = btr_cur_get_block(cursor); + last_block = btr_defragment_n_pages(first_block, index, + srv_defragment_n_pages, + &mtr); + if (last_block) { + /* If we haven't reached the end of the index, + place the cursor on the last record of last page, + store the cursor position, and put back in queue. */ + page_t* last_page = buf_block_get_frame(last_block); + rec_t* rec = page_rec_get_prev( + page_get_supremum_rec(last_page)); + ut_a(page_rec_is_user_rec(rec)); + page_cur_position(rec, last_block, + btr_cur_get_page_cur(cursor)); + btr_pcur_store_position(pcur, &mtr); + mtr_commit(&mtr); + /* Update the last_processed time of this index. */ + item->last_processed = now; + } else { + mtr_commit(&mtr); + /* Reaching the end of the index. */ + dict_stats_empty_defrag_stats(index); + dict_stats_save_defrag_stats(index); + dict_stats_save_defrag_summary(index); + btr_defragment_remove_item(item); + } + } + btr_defragment_shutdown(); + os_thread_exit(NULL); + OS_THREAD_DUMMY_RETURN; +} + +#endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/dict/dict0dict.cc b/storage/xtradb/dict/dict0dict.cc index af3518337d4..e8576177967 100644 --- a/storage/xtradb/dict/dict0dict.cc +++ b/storage/xtradb/dict/dict0dict.cc @@ -408,7 +408,7 @@ dict_table_try_drop_aborted( if (table == NULL) { table = dict_table_open_on_id_low( - table_id, DICT_ERR_IGNORE_NONE); + table_id, DICT_ERR_IGNORE_NONE, FALSE); } else { ut_ad(table->id == table_id); } @@ -795,7 +795,8 @@ dict_table_open_on_id( table_id, table_op == DICT_TABLE_OP_LOAD_TABLESPACE ? 
DICT_ERR_IGNORE_RECOVER_LOCK - : DICT_ERR_IGNORE_NONE); + : DICT_ERR_IGNORE_NONE, + table_op == DICT_TABLE_OP_OPEN_ONLY_IF_CACHED); if (table != NULL) { @@ -1313,7 +1314,7 @@ dict_table_move_from_non_lru_to_lru( /**********************************************************************//** Looks for an index with the given id given a table instance. @return index or NULL */ -static +UNIV_INTERN dict_index_t* dict_table_find_index_on_id( /*========================*/ @@ -2408,6 +2409,13 @@ undo_size_ok: new_index->stat_index_size = 1; new_index->stat_n_leaf_pages = 1; + new_index->stat_defrag_n_pages_freed = 0; + new_index->stat_defrag_n_page_split = 0; + + new_index->stat_defrag_sample_next_slot = 0; + memset(&new_index->stat_defrag_data_size_sample, + 0x0, sizeof(ulint) * STAT_DEFRAG_DATA_SIZE_N_SAMPLE); + /* Add the new index as the last index for the table */ UT_LIST_ADD_LAST(indexes, table->indexes, new_index); diff --git a/storage/xtradb/dict/dict0stats.cc b/storage/xtradb/dict/dict0stats.cc index 928bdb3f2ef..bec0079942b 100644 --- a/storage/xtradb/dict/dict0stats.cc +++ b/storage/xtradb/dict/dict0stats.cc @@ -492,6 +492,9 @@ dict_stats_table_clone_create( heap, idx->n_uniq * sizeof(idx->stat_n_non_null_key_vals[0])); ut_d(idx->magic_n = DICT_INDEX_MAGIC_N); + + idx->stat_defrag_n_page_split = 0; + idx->stat_defrag_n_pages_freed = 0; } ut_d(t->magic_n = DICT_TABLE_MAGIC_N); @@ -520,7 +523,9 @@ static void dict_stats_empty_index( /*===================*/ - dict_index_t* index) /*!< in/out: index */ + dict_index_t* index, /*!< in/out: index */ + bool empty_defrag_stats) + /*!< in: whether to empty defrag stats */ { ut_ad(!(index->type & DICT_FTS)); ut_ad(!dict_index_is_univ(index)); @@ -535,6 +540,34 @@ dict_stats_empty_index( index->stat_index_size = 1; index->stat_n_leaf_pages = 1; + + if (empty_defrag_stats) { + dict_stats_empty_defrag_stats(index); + dict_stats_empty_defrag_summary(index); + } +} + 
+/**********************************************************************//** +Clear defragmentation summary. */ +UNIV_INTERN +void +dict_stats_empty_defrag_summary( +/*==================*/ + dict_index_t* index) /*!< in: index to clear defragmentation stats */ +{ + index->stat_defrag_n_pages_freed = 0; +} + +/**********************************************************************//** +Clear defragmentation related index stats. */ +UNIV_INTERN +void +dict_stats_empty_defrag_stats( +/*==================*/ + dict_index_t* index) /*!< in: index to clear defragmentation stats */ +{ + index->stat_defrag_modified_counter = 0; + index->stat_defrag_n_page_split = 0; } /*********************************************************************//** @@ -544,7 +577,9 @@ static void dict_stats_empty_table( /*===================*/ - dict_table_t* table) /*!< in/out: table */ + dict_table_t* table, /*!< in/out: table */ + bool empty_defrag_stats) + /*!< in: whether to empty defrag stats */ { /* Zero the stats members */ @@ -569,7 +604,7 @@ dict_stats_empty_table( ut_ad(!dict_index_is_univ(index)); - dict_stats_empty_index(index); + dict_stats_empty_index(index, empty_defrag_stats); } table->stat_initialized = TRUE; @@ -704,7 +739,7 @@ dict_stats_copy( } if (!INDEX_EQ(src_idx, dst_idx)) { - dict_stats_empty_index(dst_idx); + dict_stats_empty_index(dst_idx, true); continue; } @@ -715,7 +750,7 @@ dict_stats_copy( /* Since src is smaller some elements in dst will remain untouched by the following memmove(), thus we init all of them here. 
*/ - dict_stats_empty_index(dst_idx); + dict_stats_empty_index(dst_idx, true); } else { n_copy_el = dst_idx->n_uniq; } @@ -735,6 +770,13 @@ dict_stats_copy( dst_idx->stat_index_size = src_idx->stat_index_size; dst_idx->stat_n_leaf_pages = src_idx->stat_n_leaf_pages; + + dst_idx->stat_defrag_modified_counter = + src_idx->stat_defrag_modified_counter; + dst_idx->stat_defrag_n_pages_freed = + src_idx->stat_defrag_n_pages_freed; + dst_idx->stat_defrag_n_page_split = + src_idx->stat_defrag_n_page_split; } dst->stat_initialized = TRUE; @@ -758,6 +800,9 @@ dict_index_t::stat_n_sample_sizes[] dict_index_t::stat_n_non_null_key_vals[] dict_index_t::stat_index_size dict_index_t::stat_n_leaf_pages +dict_index_t::stat_defrag_modified_counter +dict_index_t::stat_defrag_n_pages_freed +dict_index_t::stat_defrag_n_page_split The returned object should be freed with dict_stats_snapshot_free() when no longer needed. @return incomplete table object */ @@ -807,7 +852,9 @@ dict_stats_snapshot_free( Calculates new estimates for index statistics. This function is relatively quick and is used to calculate transient statistics that are not saved on disk. This was the only way to calculate statistics -before the Persistent Statistics feature was introduced. */ +before the Persistent Statistics feature was introduced. +This function doesn't update the defragmentation related stats. +Only persistent statistics supports defragmentation stats. */ static void dict_stats_update_transient_for_index( @@ -823,10 +870,10 @@ dict_stats_update_transient_for_index( Initialize some bogus index cardinality statistics, so that the data can be queried in various means, also via secondary indexes. 
*/ - dict_stats_empty_index(index); + dict_stats_empty_index(index, false); #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG } else if (ibuf_debug && !dict_index_is_clust(index)) { - dict_stats_empty_index(index); + dict_stats_empty_index(index, false); #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ } else { mtr_t mtr; @@ -847,7 +894,7 @@ dict_stats_update_transient_for_index( switch (size) { case ULINT_UNDEFINED: - dict_stats_empty_index(index); + dict_stats_empty_index(index, false); return; case 0: /* The root node of the tree is a leaf */ @@ -882,7 +929,7 @@ dict_stats_update_transient( if (dict_table_is_discarded(table)) { /* Nothing to do. */ - dict_stats_empty_table(table); + dict_stats_empty_table(table, false); return; } else if (index == NULL) { /* Table definition is corrupt */ @@ -892,7 +939,7 @@ dict_stats_update_transient( fprintf(stderr, " InnoDB: table %s has no indexes. " "Cannot calculate statistics.\n", ut_format_name(table->name, TRUE, buf, sizeof(buf))); - dict_stats_empty_table(table); + dict_stats_empty_table(table, false); return; } @@ -904,7 +951,7 @@ dict_stats_update_transient( continue; } - dict_stats_empty_index(index); + dict_stats_empty_index(index, false); if (dict_stats_should_ignore_index(index)) { continue; @@ -1794,7 +1841,7 @@ dict_stats_analyze_index( DEBUG_PRINTF(" %s(index=%s)\n", __func__, index->name); - dict_stats_empty_index(index); + dict_stats_empty_index(index, false); mtr_start(&mtr); @@ -2059,7 +2106,7 @@ dict_stats_update_persistent( /* Table definition is corrupt */ dict_table_stats_unlock(table, RW_X_LATCH); - dict_stats_empty_table(table); + dict_stats_empty_table(table, true); return(DB_CORRUPTION); } @@ -2088,7 +2135,7 @@ dict_stats_update_persistent( continue; } - dict_stats_empty_index(index); + dict_stats_empty_index(index, false); if (dict_stats_should_ignore_index(index)) { continue; @@ -2657,6 +2704,16 @@ dict_stats_fetch_index_stats_step( == 0) { index->stat_n_leaf_pages = (ulint) stat_value; 
arg->stats_were_modified = true; + } else if (stat_name_len == 12 /* strlen("n_page_split") */ + && strncasecmp("n_page_split", stat_name, stat_name_len) + == 0) { + index->stat_defrag_n_page_split = (ulint) stat_value; + arg->stats_were_modified = true; + } else if (stat_name_len == 13 /* strlen("n_pages_freed") */ + && strncasecmp("n_pages_freed", stat_name, stat_name_len) + == 0) { + index->stat_defrag_n_pages_freed = (ulint) stat_value; + arg->stats_were_modified = true; } else if (stat_name_len > PFX_LEN /* e.g. stat_name=="n_diff_pfx01" */ && strncasecmp(PFX, stat_name, PFX_LEN) == 0) { @@ -2776,7 +2833,7 @@ dict_stats_fetch_from_ps( the persistent storage contains incomplete stats (e.g. missing stats for some index) then we would end up with (partially) uninitialized stats. */ - dict_stats_empty_table(table); + dict_stats_empty_table(table, true); trx = trx_allocate_for_background(); @@ -2878,6 +2935,22 @@ dict_stats_fetch_from_ps( } /*********************************************************************//** +Clear defragmentation stats modified counter for all indices in table. */ +static +void +dict_stats_empty_defrag_modified_counter( + dict_table_t* table) /*!< in: table */ +{ + dict_index_t* index; + ut_a(table); + for (index = dict_table_get_first_index(table); + index != NULL; + index = dict_table_get_next_index(index)) { + index->stat_defrag_modified_counter = 0; + } +} + +/*********************************************************************//** Fetches or calculates new estimates for index statistics. */ UNIV_INTERN void @@ -2949,13 +3022,13 @@ dict_stats_update( "because the .ibd file is missing. 
For help, please " "refer to " REFMAN "innodb-troubleshooting.html\n", ut_format_name(table->name, TRUE, buf, sizeof(buf))); - dict_stats_empty_table(table); + dict_stats_empty_table(table, true); return(DB_TABLESPACE_DELETED); } else if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) { /* If we have set a high innodb_force_recovery level, do not calculate statistics, as a badly corrupted index can cause a crash in it. */ - dict_stats_empty_table(table); + dict_stats_empty_table(table, false); return(DB_SUCCESS); } @@ -3014,7 +3087,7 @@ dict_stats_update( case DICT_STATS_EMPTY_TABLE: - dict_stats_empty_table(table); + dict_stats_empty_table(table, true); /* If table is using persistent stats, then save the stats on disk */ @@ -3073,6 +3146,7 @@ dict_stats_update( t->stats_last_recalc = table->stats_last_recalc; t->stat_modified_counter = 0; + dict_stats_empty_defrag_modified_counter(t); switch (err) { case DB_SUCCESS: @@ -3083,7 +3157,7 @@ dict_stats_update( copying because dict_stats_table_clone_create() does skip corrupted indexes so our dummy object 't' may have less indexes than the real object 'table'. */ - dict_stats_empty_table(table); + dict_stats_empty_table(table, true); dict_stats_copy(table, t); @@ -3650,6 +3724,117 @@ dict_stats_rename_table( return(ret); } +/*********************************************************************//** +Save defragmentation result. 
+@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_stats_save_defrag_summary( + dict_index_t* index) /*!< in: index */ +{ + dberr_t ret; + lint now = (lint) ut_time(); + if (dict_index_is_univ(index)) { + return DB_SUCCESS; + } + rw_lock_x_lock(&dict_operation_lock); + mutex_enter(&dict_sys->mutex); + ret = dict_stats_save_index_stat(index, now, "n_pages_freed", + index->stat_defrag_n_pages_freed, + NULL, + "Number of pages freed during" + " last defragmentation run.", + NULL); + + mutex_exit(&dict_sys->mutex); + rw_lock_x_unlock(&dict_operation_lock); + return (ret); +} + +/*********************************************************************//** +Save defragmentation stats for a given index. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_stats_save_defrag_stats( + dict_index_t* index) /*!< in: index */ +{ + dberr_t ret; + + if (index->table->ibd_file_missing) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Cannot save defragment stats because " + ".ibd file is missing.\n"); + return (DB_TABLESPACE_DELETED); + } + if (dict_index_is_corrupted(index)) { + ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Cannot save defragment stats because " + "index is corrupted.\n"); + return(DB_CORRUPTION); + } + + if (dict_index_is_univ(index)) { + return DB_SUCCESS; + } + + lint now = (lint) ut_time(); + mtr_t mtr; + ulint n_leaf_pages; + ulint n_leaf_reserved; + mtr_start(&mtr); + mtr_s_lock(dict_index_get_lock(index), &mtr); + n_leaf_reserved = btr_get_size_and_reserved(index, BTR_N_LEAF_PAGES, + &n_leaf_pages, &mtr); + mtr_commit(&mtr); + + if (n_leaf_reserved == ULINT_UNDEFINED) { + // The index name is different during fast index creation, + // so the stats won't be associated with the right index + // for later use. We just return without saving. 
+ return DB_SUCCESS; + } + + rw_lock_x_lock(&dict_operation_lock); + + mutex_enter(&dict_sys->mutex); + ret = dict_stats_save_index_stat(index, now, "n_page_split", + index->stat_defrag_n_page_split, + NULL, + "Number of new page splits on leaves" + " since last defragmentation.", + NULL); + if (ret != DB_SUCCESS) { + goto end; + } + + ret = dict_stats_save_index_stat( + index, now, "n_leaf_pages_defrag", + n_leaf_pages, + NULL, + "Number of leaf pages when this stat is saved to disk", + NULL); + if (ret != DB_SUCCESS) { + goto end; + } + + ret = dict_stats_save_index_stat( + index, now, "n_leaf_pages_reserved", + n_leaf_reserved, + NULL, + "Number of pages reserved for this index leaves when this stat " + "is saved to disk", + NULL); + +end: + mutex_exit(&dict_sys->mutex); + rw_lock_x_unlock(&dict_operation_lock); + + return (ret); +} + /* tests @{ */ #ifdef UNIV_COMPILE_TEST_FUNCS diff --git a/storage/xtradb/dict/dict0stats_bg.cc b/storage/xtradb/dict/dict0stats_bg.cc index 9e1f75a13a9..2cf8aff1e30 100644 --- a/storage/xtradb/dict/dict0stats_bg.cc +++ b/storage/xtradb/dict/dict0stats_bg.cc @@ -25,6 +25,7 @@ Created Apr 25, 2012 Vasil Dimov #include "row0mysql.h" #include "srv0start.h" +#include "dict0dict.h" #include "dict0stats.h" #include "dict0stats_bg.h" @@ -44,8 +45,10 @@ UNIV_INTERN os_event_t dict_stats_event = NULL; /** This mutex protects the "recalc_pool" variable. 
*/ static ib_mutex_t recalc_pool_mutex; +static ib_mutex_t defrag_pool_mutex; #ifdef HAVE_PSI_INTERFACE static mysql_pfs_key_t recalc_pool_mutex_key; +static mysql_pfs_key_t defrag_pool_mutex_key; #endif /* HAVE_PSI_INTERFACE */ /** The number of tables that can be added to "recalc_pool" before @@ -59,16 +62,26 @@ static recalc_pool_t recalc_pool; typedef recalc_pool_t::iterator recalc_pool_iterator_t; +/** Indices whose defrag stats need to be saved to persistent storage.*/ +struct defrag_pool_item_t { + table_id_t table_id; + index_id_t index_id; +}; +typedef std::vector<defrag_pool_item_t> defrag_pool_t; +static defrag_pool_t defrag_pool; +typedef defrag_pool_t::iterator defrag_pool_iterator_t; + /*****************************************************************//** Initialize the recalc pool, called once during thread initialization. */ static void -dict_stats_recalc_pool_init() +dict_stats_pool_init() /*=========================*/ { ut_ad(!srv_read_only_mode); recalc_pool.reserve(RECALC_POOL_INITIAL_SLOTS); + defrag_pool.reserve(RECALC_POOL_INITIAL_SLOTS); } /*****************************************************************//** @@ -76,12 +89,13 @@ Free the resources occupied by the recalc pool, called once during thread de-initialization. */ static void -dict_stats_recalc_pool_deinit() +dict_stats_pool_deinit() /*===========================*/ { ut_ad(!srv_read_only_mode); recalc_pool.clear(); + defrag_pool.clear(); } /*****************************************************************//** @@ -178,6 +192,111 @@ dict_stats_recalc_pool_del( } /*****************************************************************//** +Add an index in a table to the defrag pool, which is processed by the +background stats gathering thread. Only the table id and index id are +added to the list, so the table can be closed after being enqueued and +it will be opened when needed. 
If the table or index does not exist later +(has been DROPped), then it will be removed from the pool and skipped. */ +UNIV_INTERN +void +dict_stats_defrag_pool_add( +/*=======================*/ + const dict_index_t* index) /*!< in: table to add */ +{ + defrag_pool_item_t item; + + ut_ad(!srv_read_only_mode); + + mutex_enter(&defrag_pool_mutex); + + /* quit if already in the list */ + for (defrag_pool_iterator_t iter = defrag_pool.begin(); + iter != defrag_pool.end(); + ++iter) { + if ((*iter).table_id == index->table->id + && (*iter).index_id == index->id) { + mutex_exit(&defrag_pool_mutex); + return; + } + } + + item.table_id = index->table->id; + item.index_id = index->id; + defrag_pool.push_back(item); + + mutex_exit(&defrag_pool_mutex); + + os_event_set(dict_stats_event); +} + +/*****************************************************************//** +Get an index from the auto defrag pool. The returned index id is removed +from the pool. +@return true if the pool was non-empty and "id" was set, false otherwise */ +static +bool +dict_stats_defrag_pool_get( +/*=======================*/ + table_id_t* table_id, /*!< out: table id, or unmodified if + list is empty */ + index_id_t* index_id) /*!< out: index id, or unmodified if + list is empty */ +{ + ut_ad(!srv_read_only_mode); + + mutex_enter(&defrag_pool_mutex); + + if (defrag_pool.empty()) { + mutex_exit(&defrag_pool_mutex); + return(false); + } + + defrag_pool_item_t& item = defrag_pool.back(); + *table_id = item.table_id; + *index_id = item.index_id; + + defrag_pool.pop_back(); + + mutex_exit(&defrag_pool_mutex); + + return(true); +} + +/*****************************************************************//** +Delete a given index from the auto defrag pool. 
*/ +UNIV_INTERN +void +dict_stats_defrag_pool_del( +/*=======================*/ + const dict_table_t* table, /*!<in: if given, remove + all entries for the table */ + const dict_index_t* index) /*!< in: if given, remove this index */ +{ + ut_a((table && !index) || (!table && index)); + ut_ad(!srv_read_only_mode); + ut_ad(mutex_own(&dict_sys->mutex)); + + mutex_enter(&defrag_pool_mutex); + + defrag_pool_iterator_t iter = defrag_pool.begin(); + while (iter != defrag_pool.end()) { + if ((table && (*iter).table_id == table->id) + || (index + && (*iter).table_id == index->table->id + && (*iter).index_id == index->id)) { + /* erase() invalidates the iterator */ + iter = defrag_pool.erase(iter); + if (index) + break; + } else { + iter++; + } + } + + mutex_exit(&defrag_pool_mutex); +} + +/*****************************************************************//** Wait until background stats thread has stopped using the specified table. The caller must have locked the data dictionary using row_mysql_lock_data_dictionary() and this function may unlock it temporarily @@ -227,7 +346,10 @@ dict_stats_thread_init() mutex_create(recalc_pool_mutex_key, &recalc_pool_mutex, SYNC_STATS_AUTO_RECALC); - dict_stats_recalc_pool_init(); + /* We choose SYNC_STATS_DEFRAG to be below SYNC_FSP_PAGE. 
*/ + mutex_create(defrag_pool_mutex_key, &defrag_pool_mutex, + SYNC_STATS_DEFRAG); + dict_stats_pool_init(); } /*****************************************************************//** @@ -241,11 +363,14 @@ dict_stats_thread_deinit() ut_a(!srv_read_only_mode); ut_ad(!srv_dict_stats_thread_active); - dict_stats_recalc_pool_deinit(); + dict_stats_pool_deinit(); mutex_free(&recalc_pool_mutex); memset(&recalc_pool_mutex, 0x0, sizeof(recalc_pool_mutex)); + mutex_free(&defrag_pool_mutex); + memset(&defrag_pool_mutex, 0x0, sizeof(defrag_pool_mutex)); + os_event_free(dict_stats_event); dict_stats_event = NULL; } @@ -323,6 +448,63 @@ dict_stats_process_entry_from_recalc_pool() } /*****************************************************************//** +Get the first index that has been added for updating persistent defrag +stats and eventually save its stats. */ +static +void +dict_stats_process_entry_from_defrag_pool() +/*=======================================*/ +{ + table_id_t table_id; + index_id_t index_id; + + ut_ad(!srv_read_only_mode); + + /* pop the first index from the auto defrag pool */ + if (!dict_stats_defrag_pool_get(&table_id, &index_id)) { + /* no index in defrag pool */ + return; + } + + dict_table_t* table; + + mutex_enter(&dict_sys->mutex); + + /* If the table is no longer cached, we've already lost the in + memory stats so there's nothing really to write to disk. 
*/ + table = dict_table_open_on_id(table_id, TRUE, + DICT_TABLE_OP_OPEN_ONLY_IF_CACHED); + + if (table == NULL) { + mutex_exit(&dict_sys->mutex); + return; + } + + /* Check whether table is corrupted */ + if (table->corrupted) { + dict_table_close(table, TRUE, FALSE); + mutex_exit(&dict_sys->mutex); + return; + } + mutex_exit(&dict_sys->mutex); + + dict_index_t* index = dict_table_find_index_on_id(table, index_id); + + if (index == NULL) { + return; + } + + /* Check whether index is corrupted */ + if (dict_index_is_corrupted(index)) { + dict_table_close(table, FALSE, FALSE); + return; + } + + dict_stats_save_defrag_stats(index); + dict_table_close(table, FALSE, FALSE); +} + +/*****************************************************************//** This is the thread for background stats gathering. It pops tables, from the auto recalc list and proceeds them, eventually recalculating their statistics. @@ -354,6 +536,9 @@ DECLARE_THREAD(dict_stats_thread)( dict_stats_process_entry_from_recalc_pool(); + while (defrag_pool.size()) + dict_stats_process_entry_from_defrag_pool(); + os_event_reset(dict_stats_event); } diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc index fb3e097491d..8f3bdcf0614 100644 --- a/storage/xtradb/handler/ha_innodb.cc +++ b/storage/xtradb/handler/ha_innodb.cc @@ -58,6 +58,7 @@ this program; if not, write to the Free Software Foundation, Inc., #include "buf0flu.h" #include "buf0dblwr.h" #include "btr0sea.h" +#include "btr0defragment.h" #include "os0file.h" #include "os0thread.h" #include "srv0start.h" @@ -66,7 +67,6 @@ this program; if not, write to the Free Software Foundation, Inc., #include "trx0trx.h" #include "trx0sys.h" -#include "mtr0mtr.h" #include "rem0types.h" #include "row0ins.h" #include "row0mysql.h" @@ -88,6 +88,7 @@ this program; if not, write to the Free Software Foundation, Inc., #include "dict0stats_bg.h" #include "ha_prototypes.h" #include "ut0mem.h" +#include "ut0timer.h" #include "ibuf0ibuf.h" 
#include "dict0dict.h" #include "srv0mon.h" @@ -946,6 +947,14 @@ static SHOW_VAR innodb_status_variables[]= { {"have_bzip2", (char*) &innodb_have_bzip2, SHOW_BOOL}, + /* Defragment */ + {"defragment_compression_failures", + (char*) &export_vars.innodb_defragment_compression_failures, SHOW_LONG}, + {"defragment_failures", + (char*) &export_vars.innodb_defragment_failures, SHOW_LONG}, + {"defragment_count", + (char*) &export_vars.innodb_defragment_count, SHOW_LONG}, + {NullS, NullS, SHOW_LONG} }; @@ -2700,7 +2709,8 @@ ha_innobase::ha_innobase( (srv_force_primary_key ? HA_REQUIRE_PRIMARY_KEY : 0 ) | HA_CAN_FULLTEXT_EXT | HA_CAN_EXPORT), start_of_scan(0), - num_write_row(0) + num_write_row(0), + ha_partition_stats(NULL) {} /*********************************************************************//** @@ -11223,6 +11233,72 @@ ha_innobase::delete_table( } /*****************************************************************//** +Defragment table. +@return error number */ +UNIV_INTERN +int +ha_innobase::defragment_table( +/*==========================*/ + const char* name, /*!< in: table name */ + const char* index_name, /*!< in: index name */ + bool async) /*!< in: whether to wait until finish */ +{ + char norm_name[FN_REFLEN]; + dict_table_t* table; + dict_index_t* index; + ibool one_index = (index_name != 0); + int ret = 0; + if (!srv_defragment) { + return ER_FEATURE_DISABLED; + } + normalize_table_name(norm_name, name); + table = dict_table_open_on_name(norm_name, FALSE, + FALSE, DICT_ERR_IGNORE_NONE); + for (index = dict_table_get_first_index(table); index; + index = dict_table_get_next_index(index)) { + if (one_index && strcasecmp(index_name, index->name) != 0) + continue; + if (btr_defragment_find_index(index)) { + // We borrow this error code. When the same index is + // already in the defragmentation queue, issue another + // defragmentation only introduces overhead. We return + // an error here to let the user know this is not + // necessary. 
Note that this will fail a query that's + // trying to defragment a full table if one of the + // indicies in that table is already in defragmentation. + // We choose this behavior so user is aware of this + // rather than silently defragment other indicies of + // that table. + ret = ER_SP_ALREADY_EXISTS; + break; + } + os_event_t event = btr_defragment_add_index(index, async); + if (!async && event) { + while(os_event_wait_time(event, 1000000)) { + if (thd_killed(current_thd)) { + btr_defragment_remove_index(index); + ret = ER_QUERY_INTERRUPTED; + break; + } + } + os_event_free(event); + } + if (ret) { + break; + } + if (one_index) { + one_index = FALSE; + break; + } + } + dict_table_close(table, FALSE, FALSE); + if (ret == 0 && one_index) { + ret = ER_NO_SUCH_INDEX; + } + return ret; +} + +/*****************************************************************//** Removes all tables in the named database inside InnoDB. */ static void @@ -12389,6 +12465,27 @@ ha_innobase::optimize( This works OK otherwise, but MySQL locks the entire table during calls to OPTIMIZE, which is undesirable. 
*/ + if (srv_defragment) { + int err; + + err = defragment_table(prebuilt->table->name, NULL, false); + + if (err == 0) { + return (HA_ADMIN_OK); + } else { + push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, + err, + "InnoDB: Cannot defragment table %s: returned error code %d\n", + prebuilt->table->name, err); + + if(err == ER_SP_ALREADY_EXISTS) { + return (HA_ADMIN_OK); + } else { + return (HA_ADMIN_TRY_ALTER); + } + } + } + if (innodb_optimize_fulltext_only) { if (prebuilt->table->fts && prebuilt->table->fts->cache && !dict_table_is_discarded(prebuilt->table)) { @@ -15190,6 +15287,13 @@ innodb_max_dirty_pages_pct_lwm_update( srv_max_dirty_pages_pct_lwm = in_val; } +UNIV_INTERN +void +ha_innobase::set_partition_owner_stats(ha_statistics *stats) +{ + ha_partition_stats= stats; +} + /************************************************************//** Validate the file format name and return its corresponding id. @return valid file format id */ @@ -16448,6 +16552,23 @@ innodb_reset_all_monitor_update( TRUE); } +static +void +innodb_defragment_frequency_update( +/*===============================*/ + THD* thd, /*!< in: thread handle */ + struct st_mysql_sys_var* var, /*!< in: pointer to + system variable */ + void* var_ptr,/*!< out: where the + formal string goes */ + const void* save) /*!< in: immediate result + from check function */ +{ + srv_defragment_frequency = (*static_cast<const uint*>(save)); + srv_defragment_interval = ut_microseconds_to_timer( + 1000000.0 / srv_defragment_frequency); +} + /****************************************************************//** Parse and enable InnoDB monitor counters during server startup. 
User can list the monitor counters/groups to be enable by specifying @@ -17735,6 +17856,60 @@ static MYSQL_SYSVAR_BOOL(buffer_pool_load_at_startup, srv_buffer_pool_load_at_st "Load the buffer pool from a file named @@innodb_buffer_pool_filename", NULL, NULL, FALSE); +static MYSQL_SYSVAR_BOOL(defragment, srv_defragment, + PLUGIN_VAR_RQCMDARG, + "Enable/disable InnoDB defragmentation (default FALSE). When set to FALSE, all existing " + "defragmentation will be paused. And new defragmentation command will fail." + "Paused defragmentation commands will resume when this variable is set to " + "true again.", + NULL, NULL, FALSE); + +static MYSQL_SYSVAR_UINT(defragment_n_pages, srv_defragment_n_pages, + PLUGIN_VAR_RQCMDARG, + "Number of pages considered at once when merging multiple pages to " + "defragment", + NULL, NULL, 7, 2, 32, 0); + +static MYSQL_SYSVAR_UINT(defragment_stats_accuracy, + srv_defragment_stats_accuracy, + PLUGIN_VAR_RQCMDARG, + "How many defragment stats changes there are before the stats " + "are written to persistent storage. Set to 0 meaning disable " + "defragment stats tracking.", + NULL, NULL, 0, 0, ~0U, 0); + +static MYSQL_SYSVAR_UINT(defragment_fill_factor_n_recs, + srv_defragment_fill_factor_n_recs, + PLUGIN_VAR_RQCMDARG, + "How many records of space defragmentation should leave on the page. " + "This variable, together with innodb_defragment_fill_factor, is introduced " + "so defragmentation won't pack the page too full and cause page split on " + "the next insert on every page. The variable indicating more defragmentation" + " gain is the one effective.", + NULL, NULL, 20, 1, 100, 0); + +static MYSQL_SYSVAR_DOUBLE(defragment_fill_factor, srv_defragment_fill_factor, + PLUGIN_VAR_RQCMDARG, + "A number between [0.7, 1] that tells defragmentation how full it should " + "fill a page. Default is 0.9. Number below 0.7 won't make much sense." 
+ "This variable, together with innodb_defragment_fill_factor_n_recs, is " + "introduced so defragmentation won't pack the page too full and cause " + "page split on the next insert on every page. The variable indicating more " + "defragmentation gain is the one effective.", + NULL, NULL, 0.9, 0.7, 1, 0); + +static MYSQL_SYSVAR_UINT(defragment_frequency, srv_defragment_frequency, + PLUGIN_VAR_RQCMDARG, + "Do not defragment a single index more than this number of time per second." + "This controls the number of time defragmentation thread can request X_LOCK " + "on an index. Defragmentation thread will check whether " + "1/defragment_frequency (s) has passed since it worked on this index last " + "time, and put the index back to the queue if not enough time has passed. " + "The actual frequency can only be lower than this given number.", + NULL, innodb_defragment_frequency_update, + SRV_DEFRAGMENT_FREQUENCY_DEFAULT, 1, 1000, 0); + + static MYSQL_SYSVAR_ULONG(lru_scan_depth, srv_LRU_scan_depth, PLUGIN_VAR_RQCMDARG, "How deep to scan LRU to keep it clean", @@ -18291,6 +18466,12 @@ static struct st_mysql_sys_var* innobase_system_variables[]= { MYSQL_SYSVAR(buffer_pool_load_now), MYSQL_SYSVAR(buffer_pool_load_abort), MYSQL_SYSVAR(buffer_pool_load_at_startup), + MYSQL_SYSVAR(defragment), + MYSQL_SYSVAR(defragment_n_pages), + MYSQL_SYSVAR(defragment_stats_accuracy), + MYSQL_SYSVAR(defragment_fill_factor), + MYSQL_SYSVAR(defragment_fill_factor_n_recs), + MYSQL_SYSVAR(defragment_frequency), MYSQL_SYSVAR(lru_scan_depth), MYSQL_SYSVAR(flush_neighbors), MYSQL_SYSVAR(checksum_algorithm), diff --git a/storage/xtradb/handler/ha_innodb.h b/storage/xtradb/handler/ha_innodb.h index b4df711356c..19356750640 100644 --- a/storage/xtradb/handler/ha_innodb.h +++ b/storage/xtradb/handler/ha_innodb.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2000, 2012, Oracle and/or its affiliates. All Rights Reserved. 
-Copyright (c) 2013, SkySQL Ab. All Rights Reserved. +Copyright (c) 2013, 2014, SkySQL Ab. All Rights Reserved. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -105,6 +105,8 @@ class ha_innobase: public handler or undefined */ uint num_write_row; /*!< number of write_row() calls */ + ha_statistics* ha_partition_stats; /*!< stats of the partition owner + handler (if there is one) */ uint store_key_val_for_row(uint keynr, char* buff, uint buff_len, const uchar* record); inline void update_thd(THD* thd); @@ -207,6 +209,8 @@ class ha_innobase: public handler int truncate(); int delete_table(const char *name); int rename_table(const char* from, const char* to); + int defragment_table(const char* name, const char* index_name, + bool async); int check(THD* thd, HA_CHECK_OPT* check_opt); char* update_table_comment(const char* comment); char* get_foreign_key_create_info(); @@ -310,6 +314,7 @@ class ha_innobase: public handler Alter_inplace_info* ha_alter_info, bool commit); /** @} */ + void set_partition_owner_stats(ha_statistics *stats); bool check_if_incompatible_data(HA_CREATE_INFO *info, uint table_changes); bool check_if_supported_virtual_columns(void) { return TRUE; } diff --git a/storage/xtradb/include/btr0btr.h b/storage/xtradb/include/btr0btr.h index a3f7cee2733..001e1af7d2d 100644 --- a/storage/xtradb/include/btr0btr.h +++ b/storage/xtradb/include/btr0btr.h @@ -2,6 +2,7 @@ Copyright (c) 1994, 2013, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. +Copyright (c) 2014, SkySQL Ab. All Rights Reserved. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -674,6 +675,21 @@ btr_get_size( is s-latched */ __attribute__((nonnull, warn_unused_result)); /**************************************************************//** +Gets the number of reserved and used pages in a B-tree. +@return number of pages reserved, or ULINT_UNDEFINED if the index +is unavailable */ +UNIV_INTERN +ulint +btr_get_size_and_reserved( +/*======================*/ + dict_index_t* index, /*!< in: index */ + ulint flag, /*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */ + ulint* used, /*!< out: number of pages used (<= reserved) */ + mtr_t* mtr) /*!< in/out: mini-transaction where index + is s-latched */ + __attribute__((nonnull)); + +/**************************************************************//** Allocates a new file page to be used in an index tree. NOTE: we assume that the caller has made the reservation for free extents! @retval NULL if no page could be allocated @@ -720,6 +736,33 @@ btr_page_free_low( ulint level, /*!< in: page level */ mtr_t* mtr) /*!< in: mtr */ __attribute__((nonnull)); +/*************************************************************//** +Reorganizes an index page. + +IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE +if this is a compressed leaf page in a secondary index. This has to +be done either within the same mini-transaction, or by invoking +ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages, +IBUF_BITMAP_FREE is unaffected by reorganization. 
+ +@retval true if the operation was successful +@retval false if it is a compressed page, and recompression failed */ +UNIV_INTERN +bool +btr_page_reorganize_block( +/*======================*/ + bool recovery,/*!< in: true if called in recovery: + locks should not be updated, i.e., + there cannot exist locks on the + page, and a hash index should not be + dropped: it cannot exist */ + ulint z_level,/*!< in: compression level to be used + if dealing with compressed page */ + buf_block_t* block, /*!< in/out: B-tree page */ + dict_index_t* index, /*!< in: the index tree of the page */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); + #ifdef UNIV_BTR_PRINT /*************************************************************//** Prints size info of a B-tree. */ @@ -765,6 +808,60 @@ btr_validate_index( const trx_t* trx) /*!< in: transaction or 0 */ __attribute__((nonnull(1), warn_unused_result)); +#ifdef UNIV_SYNC_DEBUG +/*************************************************************//** +Removes a page from the level list of pages. +@param space in: space where removed +@param zip_size in: compressed page size in bytes, or 0 for uncompressed +@param page in/out: page to remove +@param index in: index tree +@param mtr in/out: mini-transaction */ +# define btr_level_list_remove(space,zip_size,page,index,mtr) \ + btr_level_list_remove_func(space,zip_size,page,index,mtr) +#else /* UNIV_SYNC_DEBUG */ +/*************************************************************//** +Removes a page from the level list of pages. 
+@param space in: space where removed +@param zip_size in: compressed page size in bytes, or 0 for uncompressed +@param page in/out: page to remove +@param index in: index tree +@param mtr in/out: mini-transaction */ +# define btr_level_list_remove(space,zip_size,page,index,mtr) \ + btr_level_list_remove_func(space,zip_size,page,mtr) +#endif /* UNIV_SYNC_DEBUG */ + +/*************************************************************//** +Removes a page from the level list of pages. */ +UNIV_INTERN +void +btr_level_list_remove_func( +/*=======================*/ + ulint space, /*!< in: space where removed */ + ulint zip_size,/*!< in: compressed page size in bytes + or 0 for uncompressed pages */ + page_t* page, /*!< in/out: page to remove */ +#ifdef UNIV_SYNC_DEBUG + const dict_index_t* index, /*!< in: index tree */ +#endif /* UNIV_SYNC_DEBUG */ + mtr_t* mtr) /*!< in/out: mini-transaction */ + __attribute__((nonnull)); + +/*************************************************************//** +If page is the only on its level, this function moves its records to the +father page, thus reducing the tree height. +@return father block */ +UNIV_INTERN +buf_block_t* +btr_lift_page_up( +/*=============*/ + dict_index_t* index, /*!< in: index tree */ + buf_block_t* block, /*!< in: page which is the only on its level; + must not be empty: use + btr_discard_only_page_on_level if the last + record from the page should be removed */ + mtr_t* mtr) /*!< in: mtr */ + __attribute__((nonnull)); + #define BTR_N_LEAF_PAGES 1 #define BTR_TOTAL_SIZE 2 #endif /* !UNIV_HOTBACKUP */ diff --git a/storage/xtradb/include/btr0btr.ic b/storage/xtradb/include/btr0btr.ic index 9cc611ee450..40b468b200a 100644 --- a/storage/xtradb/include/btr0btr.ic +++ b/storage/xtradb/include/btr0btr.ic @@ -28,7 +28,7 @@ Created 6/2/1994 Heikki Tuuri #include "mtr0mtr.h" #include "mtr0log.h" #include "page0zip.h" -#include "srv0srv.h" + #define BTR_MAX_NODE_LEVEL 50 /*!< Maximum B-tree page level (not really a hard limit). 
Used in debug assertions @@ -59,9 +59,7 @@ btr_block_get_func( block = buf_page_get_gen(space, zip_size, page_no, mode, NULL, BUF_GET, file, line, mtr); - SRV_CORRUPT_TABLE_CHECK(block, ; /* do nothing */); - - if (block && mode != RW_NO_LATCH) { + if (mode != RW_NO_LATCH) { buf_block_dbg_add_level( block, index != NULL && dict_index_is_ibuf(index) @@ -165,9 +163,10 @@ btr_page_get_next( /*!< in: mini-transaction handle */ { ut_ad(page && mtr); +#ifndef UNIV_INNOCHECKSUM ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX) || mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_S_FIX)); - +#endif /* UNIV_INNOCHECKSUM */ return(mach_read_from_4(page + FIL_PAGE_NEXT)); } diff --git a/storage/xtradb/include/btr0defragment.h b/storage/xtradb/include/btr0defragment.h new file mode 100644 index 00000000000..99beb0a24ba --- /dev/null +++ b/storage/xtradb/include/btr0defragment.h @@ -0,0 +1,100 @@ +/***************************************************************************** + +Copyright (C) 2013, 2014 Facebook, Inc. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +#ifndef btr0defragment_h +#define btr0defragment_h + +#include "univ.i" + +#ifndef UNIV_HOTBACKUP + +#include "btr0pcur.h" + +/* Max number of pages to consider at once during defragmentation. 
*/ +#define BTR_DEFRAGMENT_MAX_N_PAGES 32 + +/** stats in btr_defragment */ +extern ulint btr_defragment_compression_failures; +extern ulint btr_defragment_failures; +extern ulint btr_defragment_count; + +/** Item in the work queue for btr_degrament_thread. */ +struct btr_defragment_item_t +{ + btr_pcur_t* pcur; /* persistent cursor where + btr_defragment_n_pages should start */ + os_event_t event; /* if not null, signal after work + is done */ + bool removed; /* Mark an item as removed */ + ulonglong last_processed; /* timestamp of last time this index + is processed by defragment thread */ + + btr_defragment_item_t(btr_pcur_t* pcur, os_event_t event); + ~btr_defragment_item_t(); +}; + +/******************************************************************//** +Initialize defragmentation. */ +void +btr_defragment_init(void); +/******************************************************************//** +Shutdown defragmentation. */ +void +btr_defragment_shutdown(); +/******************************************************************//** +Check whether the given index is in btr_defragment_wq. */ +bool +btr_defragment_find_index( + dict_index_t* index); /*!< Index to find. */ +/******************************************************************//** +Add an index to btr_defragment_wq. Return a pointer to os_event if this +is a synchronized defragmentation. */ +os_event_t +btr_defragment_add_index( + dict_index_t* index, /*!< index to be added */ + bool async); /*!< whether this is an async defragmentation */ +/******************************************************************//** +When table is dropped, this function is called to mark a table as removed in +btr_efragment_wq. The difference between this function and the remove_index +function is this will not NULL the event. */ +void +btr_defragment_remove_table( + dict_table_t* table); /*!< Index to be removed. 
*/ +/******************************************************************//** +Mark an index as removed from btr_defragment_wq. */ +void +btr_defragment_remove_index( + dict_index_t* index); /*!< Index to be removed. */ +/*********************************************************************//** +Check whether we should save defragmentation statistics to persistent storage.*/ +UNIV_INTERN +void +btr_defragment_save_defrag_stats_if_needed( + dict_index_t* index); /*!< in: index */ +/******************************************************************//** +Thread that merges consecutive b-tree pages into fewer pages to defragment +the index. */ +extern "C" UNIV_INTERN +os_thread_ret_t +DECLARE_THREAD(btr_defragment_thread)( +/*==========================================*/ + void* arg); /*!< in: a dummy parameter required by + os_thread_create */ + +#endif /* !UNIV_HOTBACKUP */ +#endif diff --git a/storage/xtradb/include/dict0dict.h b/storage/xtradb/include/dict0dict.h index 47790a158da..52ac5eee86b 100644 --- a/storage/xtradb/include/dict0dict.h +++ b/storage/xtradb/include/dict0dict.h @@ -120,7 +120,9 @@ enum dict_table_op_t { DICT_TABLE_OP_DROP_ORPHAN, /** Silently load the tablespace if it does not exist, and do not load the definitions of incomplete indexes. */ - DICT_TABLE_OP_LOAD_TABLESPACE + DICT_TABLE_OP_LOAD_TABLESPACE, + /** Open the table only if it's in table cache. */ + DICT_TABLE_OP_OPEN_ONLY_IF_CACHED }; /**********************************************************************//** @@ -1495,6 +1497,16 @@ dict_table_get_index_on_name( const char* name) /*!< in: name of the index to find */ __attribute__((nonnull, warn_unused_result)); /**********************************************************************//** +Looks for an index with the given id given a table instance. 
+@return index or NULL */ +UNIV_INTERN +dict_index_t* +dict_table_find_index_on_id( +/*========================*/ + const dict_table_t* table, /*!< in: table instance */ + index_id_t id) /*!< in: index id */ + __attribute__((nonnull, warn_unused_result)); +/**********************************************************************//** In case there is more than one index with the same name return the index with the min(id). @return index, NULL if does not exist */ diff --git a/storage/xtradb/include/dict0mem.h b/storage/xtradb/include/dict0mem.h index a347a75ea42..68cf7f1ba61 100644 --- a/storage/xtradb/include/dict0mem.h +++ b/storage/xtradb/include/dict0mem.h @@ -597,6 +597,10 @@ struct zip_pad_info_t { rounds */ }; +/** Number of samples of data size kept when page compression fails for +a certain index.*/ +#define STAT_DEFRAG_DATA_SIZE_N_SAMPLE 10 + /** Data structure for an index. Most fields will be initialized to 0, NULL or FALSE in dict_mem_index_create(). */ struct dict_index_t{ @@ -689,6 +693,23 @@ struct dict_index_t{ /*!< approximate number of leaf pages in the index tree */ /* @} */ + /** Statistics for defragmentation, these numbers are estimations and + could be very inaccurate at certain times, e.g. right after restart, + during defragmentation, etc. */ + /* @{ */ + ulint stat_defrag_modified_counter; + ulint stat_defrag_n_pages_freed; + /* number of pages freed by defragmentation. */ + ulint stat_defrag_n_page_split; + /* number of page splits since last full index + defragmentation. */ + ulint stat_defrag_data_size_sample[STAT_DEFRAG_DATA_SIZE_N_SAMPLE]; + /* data size when compression failure happened + the most recent 10 times. */ + ulint stat_defrag_sample_next_slot; + /* in which slot the next sample should be + saved. 
*/ + /* @} */ prio_rw_lock_t lock; /*!< read-write lock protecting the upper levels of the index tree */ trx_id_t trx_id; /*!< id of the transaction that created this diff --git a/storage/xtradb/include/dict0priv.h b/storage/xtradb/include/dict0priv.h index 9a3c8e22992..e034662aba0 100644 --- a/storage/xtradb/include/dict0priv.h +++ b/storage/xtradb/include/dict0priv.h @@ -53,8 +53,9 @@ dict_table_t* dict_table_open_on_id_low( /*=====================*/ table_id_t table_id, /*!< in: table id */ - dict_err_ignore_t ignore_err); /*!< in: errors to ignore + dict_err_ignore_t ignore_err, /*!< in: errors to ignore when loading the table */ + ibool open_only_if_in_cache); #ifndef UNIV_NONINL #include "dict0priv.ic" diff --git a/storage/xtradb/include/dict0priv.ic b/storage/xtradb/include/dict0priv.ic index 30ba8fb60aa..983218af78a 100644 --- a/storage/xtradb/include/dict0priv.ic +++ b/storage/xtradb/include/dict0priv.ic @@ -74,8 +74,9 @@ dict_table_t* dict_table_open_on_id_low( /*======================*/ table_id_t table_id, /*!< in: table id */ - dict_err_ignore_t ignore_err) /*!< in: errors to ignore + dict_err_ignore_t ignore_err, /*!< in: errors to ignore when loading the table */ + ibool open_only_if_in_cache) { dict_table_t* table; ulint fold; @@ -88,7 +89,7 @@ dict_table_open_on_id_low( HASH_SEARCH(id_hash, dict_sys->table_id_hash, fold, dict_table_t*, table, ut_ad(table->cached), table->id == table_id); - if (table == NULL) { + if (table == NULL && !open_only_if_in_cache) { table = dict_load_table_on_id(table_id, ignore_err); } diff --git a/storage/xtradb/include/dict0stats.h b/storage/xtradb/include/dict0stats.h index 186f90e3694..abf56b2f0c7 100644 --- a/storage/xtradb/include/dict0stats.h +++ b/storage/xtradb/include/dict0stats.h @@ -195,6 +195,39 @@ dict_stats_rename_table( is returned */ size_t errstr_sz); /*!< in: errstr size */ +/*********************************************************************//** +Save defragmentation result. 
+@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_stats_save_defrag_summary( + dict_index_t* index); /*!< in: index */ + +/*********************************************************************//** +Save defragmentation stats for a given index. +@return DB_SUCCESS or error code */ +UNIV_INTERN +dberr_t +dict_stats_save_defrag_stats( + dict_index_t* index); /*!< in: index */ + +/**********************************************************************//** +Clear defragmentation summary. */ +UNIV_INTERN +void +dict_stats_empty_defrag_summary( +/*==================*/ + dict_index_t* index); /*!< in: index to clear defragmentation stats */ + +/**********************************************************************//** +Clear defragmentation related index stats. */ +UNIV_INTERN +void +dict_stats_empty_defrag_stats( +/*==================*/ + dict_index_t* index); /*!< in: index to clear defragmentation stats */ + + #ifndef UNIV_NONINL #include "dict0stats.ic" #endif diff --git a/storage/xtradb/include/dict0stats_bg.h b/storage/xtradb/include/dict0stats_bg.h index e866ab419fe..32fac3015e8 100644 --- a/storage/xtradb/include/dict0stats_bg.h +++ b/storage/xtradb/include/dict0stats_bg.h @@ -56,6 +56,28 @@ dict_stats_recalc_pool_del( /*=======================*/ const dict_table_t* table); /*!< in: table to remove */ +/*****************************************************************//** +Add an index in a table to the defrag pool, which is processed by the +background stats gathering thread. Only the table id and index id are +added to the list, so the table can be closed after being enqueued and +it will be opened when needed. If the table or index does not exist later +(has been DROPped), then it will be removed from the pool and skipped. 
*/ +UNIV_INTERN +void +dict_stats_defrag_pool_add( +/*=======================*/ + const dict_index_t* index); /*!< in: table to add */ + +/*****************************************************************//** +Delete a given index from the auto defrag pool. */ +UNIV_INTERN +void +dict_stats_defrag_pool_del( +/*=======================*/ + const dict_table_t* table, /*!<in: if given, remove + all entries for the table */ + const dict_index_t* index); /*!< in: index to remove */ + /** Yield the data dictionary latch when waiting for the background thread to stop accessing a table. @param trx transaction holding the data dictionary locks */ diff --git a/storage/xtradb/include/lock0lock.h b/storage/xtradb/include/lock0lock.h index 633e4f6626b..8d5515b5eb5 100644 --- a/storage/xtradb/include/lock0lock.h +++ b/storage/xtradb/include/lock0lock.h @@ -183,6 +183,16 @@ lock_update_merge_left( const buf_block_t* right_block); /*!< in: merged index page which will be discarded */ /*************************************************************//** +Updates the lock table when a page is splited and merged to +two pages. */ +UNIV_INTERN +void +lock_update_split_and_merge( + const buf_block_t* left_block, /*!< in: left page to which merged */ + const rec_t* orig_pred, /*!< in: original predecessor of + supremum on the left page before merge*/ + const buf_block_t* right_block);/*!< in: right page from which merged */ +/*************************************************************//** Resets the original locks on heir and replaces them with gap type locks inherited from rec. 
*/ UNIV_INTERN diff --git a/storage/xtradb/include/srv0srv.h b/storage/xtradb/include/srv0srv.h index a02c8a96e1a..57f9b2c72e5 100644 --- a/storage/xtradb/include/srv0srv.h +++ b/storage/xtradb/include/srv0srv.h @@ -397,6 +397,15 @@ extern my_bool srv_random_read_ahead; extern ulong srv_read_ahead_threshold; extern ulint srv_n_read_io_threads; extern ulint srv_n_write_io_threads; +/* Defragmentation, Origianlly facebook default value is 100, but it's too high */ +#define SRV_DEFRAGMENT_FREQUENCY_DEFAULT 40 +extern my_bool srv_defragment; +extern uint srv_defragment_n_pages; +extern uint srv_defragment_stats_accuracy; +extern uint srv_defragment_fill_factor_n_recs; +extern double srv_defragment_fill_factor; +extern uint srv_defragment_frequency; +extern ulonglong srv_defragment_interval; /* Number of IO operations per second the server can do */ extern ulong srv_io_capacity; @@ -1099,6 +1108,9 @@ struct export_var_t{ ib_int64_t innodb_x_lock_os_waits; ib_int64_t innodb_x_lock_spin_rounds; ib_int64_t innodb_x_lock_spin_waits; + ulint innodb_defragment_compression_failures; + ulint innodb_defragment_failures; + ulint innodb_defragment_count; #ifdef UNIV_DEBUG ulint innodb_purge_trx_id_age; /*!< rw_max_trx_id - purged trx_id */ ulint innodb_purge_view_trx_id_age; /*!< rw_max_trx_id diff --git a/storage/xtradb/include/sync0sync.h b/storage/xtradb/include/sync0sync.h index 788f765f919..72cfbf61dd8 100644 --- a/storage/xtradb/include/sync0sync.h +++ b/storage/xtradb/include/sync0sync.h @@ -864,6 +864,7 @@ or row lock! 
*/ #define SYNC_EXTERN_STORAGE 500 #define SYNC_FSP 400 #define SYNC_FSP_PAGE 395 +#define SYNC_STATS_DEFRAG 390 /*------------------------------------- Change buffer headers */ #define SYNC_IBUF_MUTEX 370 /* ibuf_mutex */ /*------------------------------------- Change buffer tree */ diff --git a/storage/xtradb/include/ut0timer.h b/storage/xtradb/include/ut0timer.h new file mode 100644 index 00000000000..f361ae79bf5 --- /dev/null +++ b/storage/xtradb/include/ut0timer.h @@ -0,0 +1,104 @@ +/***************************************************************************** + +Copyright (c) 2013, 2014, Facebook, Inc. All Rights Reserved. +Copyright (c) 2014, SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/ut0timer.h +Timer rountines + +Created 30/07/2014 Jan Lindström jan.lindstrom@skysql.com +modified from https://github.com/facebook/mysql-5.6/commit/c75a413edeb96eb99bf11d7269bdfea06f96d6b6 +*************************************************************************/ +#ifndef ut0timer_h +#define ut0timer_h + +#include "univ.i" +#include "data0type.h" +#include <my_rdtsc.h> + +/* Current timer stats */ +extern struct my_timer_unit_info ut_timer; + +/**************************************************************//** +Function pointer to point selected timer function. +@return timer current value */ +extern ulonglong (*ut_timer_now)(void); + +/**************************************************************//** +Sets up the data required for use of my_timer_* functions. +Selects the best timer by high frequency, and tight resolution. +Points my_timer_now() to the selected timer function. +Initializes my_timer struct to contain the info for selected timer.*/ +UNIV_INTERN +void ut_init_timer(void); + +/**************************************************************//** +Return time passed since time then, automatically adjusted +for the estimated timer overhead. 
+@return time passed since "then" */ +UNIV_INLINE +ulonglong +ut_timer_since( +/*===========*/ + ulonglong then); /*!< in: time where to calculate */ +/**************************************************************//** +Get time passed since "then", and update then to now +@return time passed sinche "then" */ +UNIV_INLINE +ulonglong +ut_timer_since_and_update( +/*======================*/ + ulonglong *then); /*!< in: time where to calculate */ +/**************************************************************//** +Convert native timer units in a ulonglong into seconds in a double +@return time in a seconds */ +UNIV_INLINE +double +ut_timer_to_seconds( +/*=================*/ + ulonglong when); /*!< in: time where to calculate */ +/**************************************************************//** +Convert native timer units in a ulonglong into milliseconds in a double +@return time in milliseconds */ +UNIV_INLINE +double +ut_timer_to_milliseconds( +/*=====================*/ + ulonglong when); /*!< in: time where to calculate */ +/**************************************************************//** +Convert native timer units in a ulonglong into microseconds in a double +@return time in microseconds */ +UNIV_INLINE +double +ut_timer_to_microseconds( +/*=====================*/ + ulonglong when); /*!< in: time where to calculate */ +/**************************************************************//** +Convert microseconds in a double to native timer units in a ulonglong +@return time in microseconds */ +UNIV_INLINE +ulonglong +ut_microseconds_to_timer( +/*=====================*/ + ulonglong when); /*!< in: time where to calculate */ + +#ifndef UNIV_NONINL +#include "ut0timer.ic" +#endif + +#endif diff --git a/storage/xtradb/include/ut0timer.ic b/storage/xtradb/include/ut0timer.ic new file mode 100644 index 00000000000..62e17a10fb1 --- /dev/null +++ b/storage/xtradb/include/ut0timer.ic @@ -0,0 +1,113 @@ 
+/***************************************************************************** + +Copyright (c) 2013, 2014, Facebook, Inc. All Rights Reserved. +Copyright (c) 2014, SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. + +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file include/ut0timer.ic +Timer rountines + +Created 30/07/2014 Jan Lindström jan.lindstrom@skysql.com +modified from https://github.com/facebook/mysql-5.6/commit/c75a413edeb96eb99bf11d7269bdfea06f96d6b6 +*************************************************************************/ + +/**************************************************************//** +Return time passed since time then, automatically adjusted +for the estimated timer overhead. 
+@return time passed since "then" */ +UNIV_INLINE +ulonglong +ut_timer_since( +/*===========*/ + ulonglong then) /*!< in: time where to calculate */ +{ + return (ut_timer_now() - then) - ut_timer.overhead; +} + +/**************************************************************//** +Get time passed since "then", and update then to now +@return time passed sinche "then" */ +UNIV_INLINE +ulonglong +ut_timer_since_and_update( +/*======================*/ + ulonglong *then) /*!< in: time where to calculate */ +{ + ulonglong now = ut_timer_now(); + ulonglong ret = (now - (*then)) - ut_timer.overhead; + *then = now; + return ret; +} + +/**************************************************************//** +Convert native timer units in a ulonglong into seconds in a double +@return time in a seconds */ +UNIV_INLINE +double +ut_timer_to_seconds( +/*=================*/ + ulonglong when) /*!< in: time where to calculate */ +{ + double ret = (double)(when); + ret /= (double)(ut_timer.frequency); + return ret; +} + +/**************************************************************//** +Convert native timer units in a ulonglong into milliseconds in a double +@return time in milliseconds */ +UNIV_INLINE +double +ut_timer_to_milliseconds( +/*=====================*/ + ulonglong when) /*!< in: time where to calculate */ +{ + double ret = (double)(when); + ret *= 1000.0; + ret /= (double)(ut_timer.frequency); + return ret; +} + +/**************************************************************//** +Convert native timer units in a ulonglong into microseconds in a double +@return time in microseconds */ +UNIV_INLINE +double +ut_timer_to_microseconds( +/*=====================*/ + ulonglong when) /*!< in: time where to calculate */ +{ + double ret = (double)(when); + ret *= 1000000.0; + ret /= (double)(ut_timer.frequency); + return ret; +} + +/**************************************************************//** +Convert microseconds in a double to native timer units in a ulonglong +@return time in 
microseconds */ +UNIV_INLINE +ulonglong +ut_microseconds_to_timer( +/*=====================*/ + ulonglong when) /*!< in: time where to calculate */ +{ + double ret = when; + ret *= (double)(ut_timer.frequency); + ret /= 1000000.0; + return (ulonglong)ret; +} diff --git a/storage/xtradb/lock/lock0lock.cc b/storage/xtradb/lock/lock0lock.cc index 4f9395e27d8..d6f7b4217c3 100644 --- a/storage/xtradb/lock/lock0lock.cc +++ b/storage/xtradb/lock/lock0lock.cc @@ -3291,6 +3291,47 @@ lock_update_merge_left( } /*************************************************************//** +Updates the lock table when a page is split and merged to +two pages. */ +UNIV_INTERN +void +lock_update_split_and_merge( + const buf_block_t* left_block, /*!< in: left page to which merged */ + const rec_t* orig_pred, /*!< in: original predecessor of + supremum on the left page before merge*/ + const buf_block_t* right_block) /*!< in: right page from which merged */ +{ + const rec_t* left_next_rec; + + ut_a(left_block && right_block); + ut_a(orig_pred); + + lock_mutex_enter(); + + left_next_rec = page_rec_get_next_const(orig_pred); + + /* Inherit the locks on the supremum of the left page to the + first record which was moved from the right page */ + lock_rec_inherit_to_gap( + left_block, left_block, + page_rec_get_heap_no(left_next_rec), + PAGE_HEAP_NO_SUPREMUM); + + /* Reset the locks on the supremum of the left page, + releasing waiting transactions */ + lock_rec_reset_and_release_wait(left_block, + PAGE_HEAP_NO_SUPREMUM); + + /* Inherit the locks to the supremum of the left page from the + successor of the infimum on the right page */ + lock_rec_inherit_to_gap(left_block, right_block, + PAGE_HEAP_NO_SUPREMUM, + lock_get_min_heap_no(right_block)); + + lock_mutex_exit(); +} + +/*************************************************************//** Resets the original locks on heir and replaces them with gap type locks inherited from rec. 
*/ UNIV_INTERN diff --git a/storage/xtradb/page/page0cur.cc b/storage/xtradb/page/page0cur.cc index f5f7e1299ce..97405261392 100644 --- a/storage/xtradb/page/page0cur.cc +++ b/storage/xtradb/page/page0cur.cc @@ -1349,6 +1349,21 @@ page_cur_insert_rec_zip( return(insert_rec); } + /* Page compress failed. If this happened on a + leaf page, put the data size into the sample + buffer. */ + if (page_is_leaf(page)) { + ulint occupied = page_get_data_size(page) + + page_dir_calc_reserved_space( + page_get_n_recs(page)); + index->stat_defrag_data_size_sample[ + index->stat_defrag_sample_next_slot] = + occupied; + index->stat_defrag_sample_next_slot = + (index->stat_defrag_sample_next_slot + + 1) % STAT_DEFRAG_DATA_SIZE_N_SAMPLE; + } + ut_ad(cursor->rec == (pos > 1 ? page_rec_get_nth( diff --git a/storage/xtradb/row/row0mysql.cc b/storage/xtradb/row/row0mysql.cc index c65c39b7971..86de2eeb14c 100644 --- a/storage/xtradb/row/row0mysql.cc +++ b/storage/xtradb/row/row0mysql.cc @@ -53,6 +53,7 @@ Created 9/17/2000 Heikki Tuuri #include "rem0cmp.h" #include "log0log.h" #include "btr0sea.h" +#include "btr0defragment.h" #include "fil0fil.h" #include "ibuf0ibuf.h" #include "fts0fts.h" @@ -3857,6 +3858,8 @@ row_drop_table_for_mysql( if (!dict_table_is_temporary(table)) { dict_stats_recalc_pool_del(table); + dict_stats_defrag_pool_del(table, NULL); + btr_defragment_remove_table(table); /* Remove stats for this table and all of its indexes from the persistent storage if it exists and if there are stats for this diff --git a/storage/xtradb/srv/srv0srv.cc b/storage/xtradb/srv/srv0srv.cc index 8e01ea7402e..bec8c9b95c3 100644 --- a/storage/xtradb/srv/srv0srv.cc +++ b/storage/xtradb/srv/srv0srv.cc @@ -70,10 +70,11 @@ Created 10/8/1995 Heikki Tuuri #include "srv0mon.h" #include "ut0crc32.h" #include "os0file.h" - +#include "btr0defragment.h" #include "mysql/plugin.h" #include "mysql/service_thd_wait.h" #include "fil0pagecompress.h" +#include <my_rdtsc.h> /* prototypes of new functions added 
to ha_innodb.cc for kill_idle_transaction */ ibool innobase_thd_is_idle(const void* thd); @@ -280,6 +281,16 @@ UNIV_INTERN ulint srv_buf_pool_curr_size = 0; UNIV_INTERN ulint srv_mem_pool_size = ULINT_MAX; UNIV_INTERN ulint srv_lock_table_size = ULINT_MAX; +/* Defragmentation */ +UNIV_INTERN my_bool srv_defragment = FALSE; +UNIV_INTERN uint srv_defragment_n_pages = 7; +UNIV_INTERN uint srv_defragment_stats_accuracy = 0; +UNIV_INTERN uint srv_defragment_fill_factor_n_recs = 20; +UNIV_INTERN double srv_defragment_fill_factor = 0.9; +UNIV_INTERN uint srv_defragment_frequency = + SRV_DEFRAGMENT_FREQUENCY_DEFAULT; +UNIV_INTERN ulonglong srv_defragment_interval = 0; + /** Query thread preflush algorithm */ UNIV_INTERN ulong srv_foreground_preflush = SRV_FOREGROUND_PREFLUSH_EXP_BACKOFF; @@ -1876,6 +1887,11 @@ srv_export_innodb_status(void) export_vars.innodb_page_compressed_trim_op_saved = srv_stats.page_compressed_trim_op_saved; export_vars.innodb_pages_page_decompressed = srv_stats.pages_page_decompressed; + export_vars.innodb_defragment_compression_failures = + btr_defragment_compression_failures; + export_vars.innodb_defragment_failures = btr_defragment_failures; + export_vars.innodb_defragment_count = btr_defragment_count; + #ifdef UNIV_DEBUG rw_lock_s_lock(&purge_sys->latch); trx_id_t done_trx_no = purge_sys->done.trx_no; diff --git a/storage/xtradb/srv/srv0start.cc b/storage/xtradb/srv/srv0start.cc index 3aedede7c97..cb7aa9bc3c7 100644 --- a/storage/xtradb/srv/srv0start.cc +++ b/storage/xtradb/srv/srv0start.cc @@ -69,6 +69,8 @@ Created 2/16/1996 Heikki Tuuri #include "srv0start.h" #include "srv0srv.h" #include "buf0flu.h" +#include "btr0defragment.h" +#include "ut0timer.h" #ifndef UNIV_HOTBACKUP # include "trx0rseg.h" @@ -1575,6 +1577,9 @@ innobase_start_or_create_for_mysql(void) char* logfile0 = NULL; size_t dirnamelen; + /* This should be initialized early */ + ut_init_timer(); + if (srv_force_recovery > SRV_FORCE_NO_TRX_UNDO) { srv_read_only_mode = true; } @@ 
-2960,6 +2965,9 @@ files_checked: fts_optimize_init(); } + /* Initialize online defragmentation. */ + btr_defragment_init(); + srv_was_started = TRUE; return(DB_SUCCESS); diff --git a/storage/xtradb/sync/sync0sync.cc b/storage/xtradb/sync/sync0sync.cc index e698b7dcf10..1c5b144eb24 100644 --- a/storage/xtradb/sync/sync0sync.cc +++ b/storage/xtradb/sync/sync0sync.cc @@ -1272,6 +1272,7 @@ sync_thread_add_level( case SYNC_IBUF_MUTEX: case SYNC_INDEX_ONLINE_LOG: case SYNC_STATS_AUTO_RECALC: + case SYNC_STATS_DEFRAG: if (!sync_thread_levels_g(array, level, TRUE)) { fprintf(stderr, "InnoDB: sync_thread_levels_g(array, %lu)" diff --git a/storage/xtradb/ut/ut0timer.cc b/storage/xtradb/ut/ut0timer.cc new file mode 100644 index 00000000000..85292cce28c --- /dev/null +++ b/storage/xtradb/ut/ut0timer.cc @@ -0,0 +1,92 @@ +/***************************************************************************** + +Copyright (c) 2013, 2014, Facebook, Inc. All Rights Reserved. +Copyright (c) 2014, SkySQL Ab. All Rights Reserved. + +This program is free software; you can redistribute it and/or modify it under +the terms of the GNU General Public License as published by the Free Software +Foundation; version 2 of the License. + +This program is distributed in the hope that it will be useful, but WITHOUT +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. 
+ +You should have received a copy of the GNU General Public License along with +this program; if not, write to the Free Software Foundation, Inc., +51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA + +*****************************************************************************/ + +/********************************************************************//** +@file ut/ut0timer.cc +Timer rountines + +Created 30/07/2014 Jan Lindström jan.lindstrom@skysql.com +modified from https://github.com/facebook/mysql-5.6/commit/c75a413edeb96eb99bf11d7269bdfea06f96d6b6 +*************************************************************************/ + +#include "data0type.h" +#include <my_rdtsc.h> +#include <ut0timer.h> + +/**************************************************************//** +Initial timer definition +@return 0 */ +static +ulonglong +ut_timer_none(void) +/*===============*/ +{ + return 0; +} + +/**************************************************************//** +Function pointer to point selected timer function. +@return timer current value */ +ulonglong (*ut_timer_now)(void) = &ut_timer_none; + +struct my_timer_unit_info ut_timer; + +/**************************************************************//** +Sets up the data required for use of my_timer_* functions. +Selects the best timer by high frequency, and tight resolution. +Points my_timer_now() to the selected timer function. 
+Initializes my_timer struct to contain the info for selected timer.*/ +UNIV_INTERN +void +ut_init_timer(void) +/*===============*/ +{ + MY_TIMER_INFO all_timer_info; + my_timer_init(&all_timer_info); + + if (all_timer_info.cycles.frequency > 1000000 && + all_timer_info.cycles.resolution == 1) { + ut_timer = all_timer_info.cycles; + ut_timer_now = &my_timer_cycles; + } else if (all_timer_info.nanoseconds.frequency > 1000000 && + all_timer_info.nanoseconds.resolution == 1) { + ut_timer = all_timer_info.nanoseconds; + ut_timer_now = &my_timer_nanoseconds; + } else if (all_timer_info.microseconds.frequency >= 1000000 && + all_timer_info.microseconds.resolution == 1) { + ut_timer = all_timer_info.microseconds; + ut_timer_now = &my_timer_microseconds; + + } else if (all_timer_info.milliseconds.frequency >= 1000 && + all_timer_info.milliseconds.resolution == 1) { + ut_timer = all_timer_info.milliseconds; + ut_timer_now = &my_timer_milliseconds; + } else if (all_timer_info.ticks.frequency >= 1000 && + /* Will probably be false */ + all_timer_info.ticks.resolution == 1) { + ut_timer = all_timer_info.ticks; + ut_timer_now = &my_timer_ticks; + } else { + /* None are acceptable, so leave it as "None", and fill in struct */ + ut_timer.frequency = 1; /* Avoid div-by-zero */ + ut_timer.overhead = 0; /* Since it doesn't do anything */ + ut_timer.resolution = 10; /* Another sign it's bad */ + ut_timer.routine = 0; /* None */ + } +} |