| field | value | date |
|---|---|---|
| author | Sergei Golubchik <serg@mariadb.org> | 2016-09-28 17:55:28 +0200 |
| committer | Sergei Golubchik <serg@mariadb.org> | 2016-09-28 17:55:28 +0200 |
| commit | 66d9696596edbc20ad36bf3d5bffb5595e8235c3 (patch) | |
| tree | bbef37c9a90b63d25bee59386cac04298a13846f | |
| parent | 66a58f46e937cdc3d7e0529b52ad8b658d9b2cd4 (diff) | |
| parent | 23af6f5942e7235a7c14a36cb8dd0d2796b5ef37 (diff) | |
| download | mariadb-git-66d9696596edbc20ad36bf3d5bffb5595e8235c3.tar.gz | |
Merge branch '10.0' into 10.1
160 files changed, 4948 insertions, 4270 deletions
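
For browsing the full 160-file change, the same merge can be pulled up with plain git. A minimal sketch, assuming the public GitHub mirror of the MariaDB server tree (the clone URL is an assumption; only the commit and parent hashes are taken from the header above):

```sh
# Assumed public mirror; substitute whatever remote you normally use.
git clone https://github.com/MariaDB/server.git mariadb-server
cd mariadb-server

# Commit metadata plus the per-file diffstat of this merge.
git show --stat 66d9696596edbc20ad36bf3d5bffb5595e8235c3

# Full change relative to the first parent (the 10.1 head, listed first above).
git diff 66a58f46e937cdc3d7e0529b52ad8b658d9b2cd4 66d9696596edbc20ad36bf3d5bffb5595e8235c3
```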
@@ -10,6 +10,7 @@ Visma http://visma.com (2015 - 2016)
 Acronis http://acronis.com (2016)
 Nexedi https://www.nexedi.com (2016)
 Automattic https://automattic.com (2014 - 2016)
+Tencent Game DBA http://tencentdba.com/about (2016)
 Verkkokauppa.com https://www.verkkokauppa.com (2015 - 2016)
 Virtuozzo https://virtuozzo.com (2016)
diff --git a/cmake/cpack_rpm.cmake b/cmake/cpack_rpm.cmake
index 0e0a121dbb8..703e7424159 100644
--- a/cmake/cpack_rpm.cmake
+++ b/cmake/cpack_rpm.cmake
@@ -230,6 +230,9 @@ SETA(CPACK_RPM_test_PACKAGE_PROVIDES
   "perl(mtr_io.pl)"
   "perl(mtr_match)"
   "perl(mtr_misc.pl)"
+  "perl(mtr_gcov.pl)"
+  "perl(mtr_gprof.pl)"
+  "perl(mtr_process.pl)"
   "perl(mtr_report)"
   "perl(mtr_results)"
   "perl(mtr_unique)")
diff --git a/include/my_global.h b/include/my_global.h
index f5af8083cdc..bca03bfc4d6 100644
--- a/include/my_global.h
+++ b/include/my_global.h
@@ -880,8 +880,7 @@ typedef long long my_ptrdiff_t;
   and related routines are refactored.
 */
-#define my_offsetof(TYPE, MEMBER) \
-  ((size_t)((char *)&(((TYPE *)0x10)->MEMBER) - (char*)0x10))
+#define my_offsetof(TYPE, MEMBER) PTR_BYTE_DIFF(&((TYPE *)0x10)->MEMBER, 0x10)
 #define NullS (char *) 0
diff --git a/include/my_sys.h b/include/my_sys.h
index 36530eb94e9..a89480d3fcc 100644
--- a/include/my_sys.h
+++ b/include/my_sys.h
@@ -1,5 +1,5 @@
 /* Copyright (c) 2000, 2013, Oracle and/or its affiliates.
-   Copyright (c) 2010, 2013, Monty Program Ab.
+   Copyright (c) 2010, 2016, Monty Program Ab.
 
    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
@@ -279,7 +279,7 @@ extern my_bool my_use_symdir;
 extern ulong my_default_record_cache_size;
 extern my_bool my_disable_locking, my_disable_async_io,
        my_disable_flush_key_blocks, my_disable_symlinks;
-extern my_bool my_disable_sync;
+extern my_bool my_disable_sync, my_disable_copystat_in_redel;
 extern char wild_many,wild_one,wild_prefix;
 extern const char *charsets_dir;
 extern my_bool timed_mutexes;
diff --git a/mysql-test/extra/binlog_tests/database.test b/mysql-test/extra/binlog_tests/database.test
index 6b3da087f01..2e093aacb0d 100644
--- a/mysql-test/extra/binlog_tests/database.test
+++ b/mysql-test/extra/binlog_tests/database.test
@@ -52,7 +52,7 @@ eval SELECT 'hello' INTO OUTFILE 'fake_file.$prefix';
 # Use '/' instead of '\' in the error message. On windows platform, dir is
 # formed with '\'.
---replace_regex /\\testing_1\\*/\/testing_1\// /66/39/ /17/39/ /File exists/Directory not empty/
+--replace_regex /\\testing_1\\*/\/testing_1\// /66/39/ /17/39/ /247/39/ /File exists/Directory not empty/
 --error 1010
 DROP DATABASE testing_1;
 let $wait_binlog_event= DROP TABLE IF EXIST;
diff --git a/mysql-test/include/index_merge2.inc b/mysql-test/include/index_merge2.inc
index c50a45a9923..03afa49d323 100644
--- a/mysql-test/include/index_merge2.inc
+++ b/mysql-test/include/index_merge2.inc
@@ -341,6 +341,7 @@ while ($1)
 alter table t1 add index i2(key2);
 alter table t1 add index i3(key3);
 update t1 set key2=key1,key3=key1;
+analyze table t1;
 # to test the bug, the following must use "sort_union":
 --replace_column 9 REF
diff --git a/mysql-test/lib/My/CoreDump.pm b/mysql-test/lib/My/CoreDump.pm
index 0e90967ef95..f9f7b3d8d4b 100644
--- a/mysql-test/lib/My/CoreDump.pm
+++ b/mysql-test/lib/My/CoreDump.pm
@@ -261,11 +261,7 @@ sub show {
   # On Windows, rely on cdb to be there...
   if (IS_WINDOWS)
   {
-    # Starting cdb is unsafe when used with --parallel > 1 option
-    if ( $parallel < 2 )
-    {
-      _cdb($core_name);
-    }
+    _cdb($core_name);
     return;
   }
diff --git a/mysql-test/lib/mtr_cases.pm b/mysql-test/lib/mtr_cases.pm
index 2be903abf42..d758b81c1c7 100644
--- a/mysql-test/lib/mtr_cases.pm
+++ b/mysql-test/lib/mtr_cases.pm
@@ -58,8 +58,6 @@ use My::Test;
 use My::Find;
 use My::Suite;
-require "mtr_misc.pl";
-
 # locate plugin suites, depending on whether it's a build tree or installed
 my @plugin_suitedirs;
 my $plugin_suitedir_regex;
@@ -1096,7 +1094,7 @@ sub get_tags_from_file($$) {
   $file_to_tags{$file}= $tags;
   $file_to_master_opts{$file}= $master_opts;
   $file_to_slave_opts{$file}= $slave_opts;
-  $file_combinations{$file}= [ uniq(@combinations) ];
+  $file_combinations{$file}= [ ::uniq(@combinations) ];
   $file_in_overlay{$file} = 1 if $in_overlay;
   return @{$tags};
 }
diff --git a/mysql-test/lib/mtr_report.pm b/mysql-test/lib/mtr_report.pm
index 9ab82c454ed..97ace54f0fb 100644
--- a/mysql-test/lib/mtr_report.pm
+++ b/mysql-test/lib/mtr_report.pm
@@ -34,7 +34,6 @@ use mtr_match;
 use My::Platform;
 use POSIX qw[ _exit ];
 use IO::Handle qw[ flush ];
-require "mtr_io.pl";
 use mtr_results;
 
 my $tot_real_time= 0;
@@ -92,7 +91,7 @@ sub mtr_report_test_passed ($) {
   my $timer_str= "";
   if ( $timer and -f "$::opt_vardir/log/timer" )
   {
-    $timer_str= mtr_fromfile("$::opt_vardir/log/timer");
+    $timer_str= ::mtr_fromfile("$::opt_vardir/log/timer");
     $tinfo->{timer}= $timer_str;
     resfile_test_info('duration', $timer_str) if $::opt_resfile;
   }
diff --git a/mysql-test/mysql-test-run.pl b/mysql-test/mysql-test-run.pl
index c6a71b91f69..2bd89f5ae49 100755
--- a/mysql-test/mysql-test-run.pl
+++ b/mysql-test/mysql-test-run.pl
@@ -102,11 +102,11 @@ use mtr_results;
 use IO::Socket::INET;
 use IO::Select;
 
-require "lib/mtr_process.pl";
-require "lib/mtr_io.pl";
-require "lib/mtr_gcov.pl";
-require "lib/mtr_gprof.pl";
-require "lib/mtr_misc.pl";
+require "mtr_process.pl";
+require "mtr_io.pl";
+require "mtr_gcov.pl";
+require "mtr_gprof.pl";
+require "mtr_misc.pl";
 
 $SIG{INT}= sub { mtr_error("Got ^C signal"); };
 $SIG{HUP}= sub { mtr_error("Hangup detected on controlling terminal"); };
diff --git a/mysql-test/r/contributors.result b/mysql-test/r/contributors.result
index 918ceaa496f..f3f5e227d3a 100644
--- a/mysql-test/r/contributors.result
+++ b/mysql-test/r/contributors.result
@@ -9,6 +9,7 @@ Acronis http://www.acronis.com Silver Sponsor of the MariaDB Foundation
 Auttomattic https://automattic.com Bronze Sponsor of the MariaDB Foundation
 Verkkokauppa.com https://virtuozzo.com Bronze Sponsor of the MariaDB Foundation
 Virtuozzo https://virtuozzo.com/ Bronze Sponsor of the MariaDB Foundation
+Tencent Game DBA http://tencentdba.com/about/ Bronze Sponsor of the MariaDB Foundation
 Google USA Sponsoring encryption, parallel replication and GTID
 Facebook USA Sponsoring non-blocking API, LIMIT ROWS EXAMINED etc
 Ronald Bradford Brisbane, Australia EFF contribution for UC2006 Auction
diff --git a/mysql-test/r/ctype_utf32.result b/mysql-test/r/ctype_utf32.result
index a0a8072265c..5797a030e73 100644
--- a/mysql-test/r/ctype_utf32.result
+++ b/mysql-test/r/ctype_utf32.result
@@ -1662,6 +1662,11 @@ CHAR_LENGTH(TRIM(BOTH 0x61 FROM _utf32 0x00000061))
 SELECT CHAR_LENGTH(TRIM(BOTH 0x00 FROM _utf32 0x00000061));
 CHAR_LENGTH(TRIM(BOTH 0x00 FROM _utf32 0x00000061))
 1
+select hex(lower(cast(0xffff0000 as char character set utf32))) as c;
+c
+0000003F0000003F0000003F0000003F
+Warnings:
+Warning 1300 Invalid utf32 character string:
'\xFF\xFF\x00\x00' # # End of 5.5 tests # diff --git a/mysql-test/r/group_min_max_innodb.result b/mysql-test/r/group_min_max_innodb.result index 77c74fbc041..2803107b97e 100644 --- a/mysql-test/r/group_min_max_innodb.result +++ b/mysql-test/r/group_min_max_innodb.result @@ -286,3 +286,19 @@ F 28 28 F 29 29 F 30 30 DROP TABLE t0,t1,t2; +# +# MDEV-MariaDB daemon leaks memory with specific query +# +CREATE TABLE t1 (`voter_id` int(11) unsigned NOT NULL, +`language_id` int(11) unsigned NOT NULL DEFAULT '1' +) ENGINE=InnoDB DEFAULT CHARSET=utf8; +CREATE TABLE t2 (`voter_id` int(10) unsigned NOT NULL DEFAULT '0', +`serialized_c` mediumblob) ENGINE=InnoDB DEFAULT CHARSET=utf8; +insert into t2 values (1,repeat("a",1000)),(2,repeat("a",1000)),(3,repeat("b",1000)),(4,repeat("c",1000)),(4,repeat("b",1000)); +SELECT GROUP_CONCAT(t1.language_id SEPARATOR ',') AS `translation_resources`, `d`.`serialized_c` FROM t2 AS `d` LEFT JOIN t1 ON `d`.`voter_id` = t1.`voter_id` GROUP BY `d`.`voter_id` ORDER BY 10-d.voter_id+RAND()*0; +translation_resources serialized_c +NULL cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc +NULL bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb +NULL 
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +NULL aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa +drop table t1,t2; diff --git a/mysql-test/r/index_merge_innodb.result b/mysql-test/r/index_merge_innodb.result index 5202c79f3c7..5bf56e213ab 100644 --- a/mysql-test/r/index_merge_innodb.result +++ b/mysql-test/r/index_merge_innodb.result @@ -311,6 +311,9 @@ set @d=@d*2; alter table t1 add index i2(key2); alter table t1 add index i3(key3); update t1 set key2=key1,key3=key1; +analyze table t1; +Table Op Msg_type Msg_text +test.t1 analyze status OK explain select * from t1 where (key3 > 30 and key3<35) or (key2 >32 and key2 < 40); id select_type table type possible_keys key key_len ref rows Extra 1 SIMPLE t1 index_merge i2,i3 i3,i2 4,4 NULL REF Using sort_union(i3,i2); Using where diff --git a/mysql-test/r/index_merge_myisam.result b/mysql-test/r/index_merge_myisam.result index a857e2a21ff..b3f78c12af9 100644 --- a/mysql-test/r/index_merge_myisam.result +++ b/mysql-test/r/index_merge_myisam.result @@ -1146,6 +1146,9 @@ set @d=@d*2; alter table t1 add index i2(key2); alter table t1 add index i3(key3); update t1 set key2=key1,key3=key1; +analyze table t1; +Table Op Msg_type Msg_text +test.t1 analyze status OK explain select * from t1 where (key3 > 30 and key3<35) or (key2 >32 and key2 < 40); id select_type table type possible_keys key key_len ref rows Extra 1 SIMPLE t1 index_merge i2,i3 i3,i2 4,4 NULL REF Using sort_union(i3,i2); Using where diff --git a/mysql-test/r/merge.result b/mysql-test/r/merge.result index 804313af701..36e196497e5 100644 --- a/mysql-test/r/merge.result +++ b/mysql-test/r/merge.result @@ -3835,6 
+3835,23 @@ test.m1 repair error Corrupt # Clean-up. drop tables m1, t1, t4; drop view t3; +# +# MDEV-10424 - Assertion `ticket == __null' failed in +# MDL_request::set_type +# +CREATE TABLE t1 (f1 INT) ENGINE=MyISAM; +CREATE TABLE tmerge (f1 INT) ENGINE=MERGE UNION=(t1); +PREPARE stmt FROM "ANALYZE TABLE tmerge, t1"; +EXECUTE stmt; +Table Op Msg_type Msg_text +test.tmerge analyze note The storage engine for the table doesn't support analyze +test.t1 analyze status Table is already up to date +EXECUTE stmt; +Table Op Msg_type Msg_text +test.tmerge analyze note The storage engine for the table doesn't support analyze +test.t1 analyze status Table is already up to date +DEALLOCATE PREPARE stmt; +DROP TABLE t1, tmerge; End of 5.5 tests # # Additional coverage for refactoring which is made as part diff --git a/mysql-test/r/ps.result b/mysql-test/r/ps.result index ca6a5cf876a..6c21f4225a0 100644 --- a/mysql-test/r/ps.result +++ b/mysql-test/r/ps.result @@ -4077,4 +4077,35 @@ id value deallocate prepare stmt; SET SESSION sql_mode = @save_sql_mode; DROP TABLE t1,t2; -# End of 10.0 tests +# +# MDEV-8833: Crash of server on prepared statement with +# conversion to semi-join +# +CREATE TABLE t1 (column1 INT); +INSERT INTO t1 VALUES (3),(9); +CREATE TABLE t2 (column2 INT); +INSERT INTO t2 VALUES (1),(4); +CREATE TABLE t3 (column3 INT); +INSERT INTO t3 VALUES (6),(8); +CREATE TABLE t4 (column4 INT); +INSERT INTO t4 VALUES (2),(5); +PREPARE stmt FROM "SELECT ( SELECT MAX( table1.column1 ) AS field1 +FROM t1 AS table1 +WHERE table3.column3 IN ( SELECT table2.column2 AS field2 FROM t2 AS table2 ) +) AS sq +FROM t3 AS table3, t4 AS table4"; +EXECUTE stmt; +sq +NULL +NULL +NULL +NULL +EXECUTE stmt; +sq +NULL +NULL +NULL +NULL +deallocate prepare stmt; +drop table t1,t2,t3,t4; +# End of 5.5 tests diff --git a/mysql-test/r/type_uint.result b/mysql-test/r/type_uint.result index 10aa2f2f393..c970f2ff896 100644 --- a/mysql-test/r/type_uint.result +++ b/mysql-test/r/type_uint.result @@ -14,6 +14,25 @@ this 0 4294967295 drop table t1; +create table t1 (a bigint unsigned, b mediumint unsigned); +insert t1 values (1,2),(0xffffffffffffffff,0xffffff); +select coalesce(a,b), coalesce(b,a) from t1; +coalesce(a,b) coalesce(b,a) +1 2 +18446744073709551615 16777215 +create table t2 as select a from t1 union select b from t1; +show create table t2; +Table Create Table +t2 CREATE TABLE `t2` ( + `a` bigint(20) unsigned DEFAULT NULL +) ENGINE=MyISAM DEFAULT CHARSET=latin1 +select * from t2; +a +1 +18446744073709551615 +2 +16777215 +drop table t1, t2; # # Start of 10.0 tests # diff --git a/mysql-test/suite/innodb/r/innodb_bug54044.result b/mysql-test/suite/innodb/r/innodb_bug54044.result index d80c451c841..7d6133adb74 100644 --- a/mysql-test/suite/innodb/r/innodb_bug54044.result +++ b/mysql-test/suite/innodb/r/innodb_bug54044.result @@ -6,7 +6,8 @@ table_54044 CREATE TEMPORARY TABLE `table_54044` ( `IF(NULL IS NOT NULL, NULL, NULL)` binary(0) DEFAULT NULL ) ENGINE=InnoDB DEFAULT CHARSET=latin1 DROP TABLE table_54044; -CREATE TABLE tmp ENGINE = INNODB AS SELECT COALESCE(NULL, NULL, NULL), GREATEST(NULL, NULL), NULL; +CREATE TABLE tmp ENGINE = INNODB +AS SELECT COALESCE(NULL, NULL, NULL), GREATEST(NULL, NULL), NULL; SHOW CREATE TABLE tmp; Table Create Table tmp CREATE TABLE `tmp` ( diff --git a/mysql-test/suite/innodb/r/system_tables.result b/mysql-test/suite/innodb/r/system_tables.result new file mode 100644 index 00000000000..79a24f7e455 --- /dev/null +++ b/mysql-test/suite/innodb/r/system_tables.result @@ -0,0 +1,8 @@ +alter 
table mysql.time_zone_name engine=InnoDB; +create table envois3 (starttime datetime) engine=InnoDB; +insert envois3 values ('2008-08-11 22:43:00'); +select convert_tz(starttime,'UTC','Europe/Moscow') starttime from envois3; +starttime +2008-08-12 02:43:00 +drop table envois3; +alter table mysql.time_zone_name engine=MyISAM; diff --git a/mysql-test/suite/innodb/t/innodb_bug54044.test b/mysql-test/suite/innodb/t/innodb_bug54044.test index aa19c51018c..61a09375ae1 100644 --- a/mysql-test/suite/innodb/t/innodb_bug54044.test +++ b/mysql-test/suite/innodb/t/innodb_bug54044.test @@ -10,7 +10,10 @@ CREATE TEMPORARY TABLE table_54044 ENGINE = INNODB SHOW CREATE TABLE table_54044; DROP TABLE table_54044; -CREATE TABLE tmp ENGINE = INNODB AS SELECT COALESCE(NULL, NULL, NULL), GREATEST(NULL, NULL), NULL; +# This 'create table' should pass since it uses a Field_string of size 0. + +CREATE TABLE tmp ENGINE = INNODB + AS SELECT COALESCE(NULL, NULL, NULL), GREATEST(NULL, NULL), NULL; SHOW CREATE TABLE tmp; DROP TABLE tmp; @@ -23,4 +26,3 @@ FLUSH TABLES; --error 1005 CREATE TEMPORARY TABLE tmp ENGINE=InnoDB AS SELECT VALUES(a) FROM t1; DROP TABLE t1; - diff --git a/mysql-test/suite/innodb/t/system_tables.test b/mysql-test/suite/innodb/t/system_tables.test new file mode 100644 index 00000000000..90cb8c59fbd --- /dev/null +++ b/mysql-test/suite/innodb/t/system_tables.test @@ -0,0 +1,12 @@ +--source include/have_innodb.inc + +# +# MDEV-10775 System table in InnoDB format allowed in MariaDB could lead to crash +# +alter table mysql.time_zone_name engine=InnoDB; +create table envois3 (starttime datetime) engine=InnoDB; +insert envois3 values ('2008-08-11 22:43:00'); +--source include/restart_mysqld.inc +select convert_tz(starttime,'UTC','Europe/Moscow') starttime from envois3; +drop table envois3; +alter table mysql.time_zone_name engine=MyISAM; diff --git a/mysql-test/suite/perfschema/r/aggregate.result b/mysql-test/suite/perfschema/r/aggregate.result deleted file mode 100644 index c8fa1cc2b24..00000000000 --- a/mysql-test/suite/perfschema/r/aggregate.result +++ /dev/null @@ -1,121 +0,0 @@ -"General cleanup" -set @aria_checkpoint_interval_save= @@global.aria_checkpoint_interval; -set @@global.aria_checkpoint_interval= 0; -drop table if exists t1; -update performance_schema.setup_instruments set enabled = 'NO'; -update performance_schema.setup_consumers set enabled = 'NO'; -truncate table performance_schema.file_summary_by_event_name; -truncate table performance_schema.file_summary_by_instance; -truncate table performance_schema.socket_summary_by_event_name; -truncate table performance_schema.socket_summary_by_instance; -truncate table performance_schema.events_waits_summary_global_by_event_name; -truncate table performance_schema.events_waits_summary_by_instance; -truncate table performance_schema.events_waits_summary_by_thread_by_event_name; -update performance_schema.setup_consumers set enabled = 'YES'; -update performance_schema.setup_instruments -set enabled = 'YES', timed = 'YES'; -create table t1 ( -id INT PRIMARY KEY, -b CHAR(100) DEFAULT 'initial value') -ENGINE=MyISAM; -insert into t1 (id) values (1), (2), (3), (4), (5), (6), (7), (8); -update performance_schema.setup_instruments SET enabled = 'NO'; -update performance_schema.setup_consumers set enabled = 'NO'; -set @dump_all=FALSE; -"Verifying file aggregate consistency" -SELECT EVENT_NAME, e.COUNT_READ, SUM(i.COUNT_READ) -FROM performance_schema.file_summary_by_event_name AS e -JOIN performance_schema.file_summary_by_instance AS i USING (EVENT_NAME) 
-GROUP BY EVENT_NAME -HAVING (e.COUNT_READ <> SUM(i.COUNT_READ)) -OR @dump_all; -EVENT_NAME COUNT_READ SUM(i.COUNT_READ) -SELECT EVENT_NAME, e.COUNT_WRITE, SUM(i.COUNT_WRITE) -FROM performance_schema.file_summary_by_event_name AS e -JOIN performance_schema.file_summary_by_instance AS i USING (EVENT_NAME) -GROUP BY EVENT_NAME -HAVING (e.COUNT_WRITE <> SUM(i.COUNT_WRITE)) -OR @dump_all; -EVENT_NAME COUNT_WRITE SUM(i.COUNT_WRITE) -SELECT EVENT_NAME, e.COUNT_READ, SUM(i.COUNT_READ) -FROM performance_schema.socket_summary_by_event_name AS e -JOIN performance_schema.socket_summary_by_instance AS i USING (EVENT_NAME) -GROUP BY EVENT_NAME -HAVING (e.COUNT_READ <> SUM(i.COUNT_READ)) -OR @dump_all; -EVENT_NAME COUNT_READ SUM(i.COUNT_READ) -SELECT EVENT_NAME, e.COUNT_WRITE, SUM(i.COUNT_WRITE) -FROM performance_schema.socket_summary_by_event_name AS e -JOIN performance_schema.socket_summary_by_instance AS i USING (EVENT_NAME) -GROUP BY EVENT_NAME -HAVING (e.COUNT_WRITE <> SUM(i.COUNT_WRITE)) -OR @dump_all; -EVENT_NAME COUNT_WRITE SUM(i.COUNT_WRITE) -SELECT EVENT_NAME, e.SUM_NUMBER_OF_BYTES_READ, SUM(i.SUM_NUMBER_OF_BYTES_READ) -FROM performance_schema.file_summary_by_event_name AS e -JOIN performance_schema.file_summary_by_instance AS i USING (EVENT_NAME) -GROUP BY EVENT_NAME -HAVING (e.SUM_NUMBER_OF_BYTES_READ <> SUM(i.SUM_NUMBER_OF_BYTES_READ)) -OR @dump_all; -EVENT_NAME SUM_NUMBER_OF_BYTES_READ SUM(i.SUM_NUMBER_OF_BYTES_READ) -SELECT EVENT_NAME, e.SUM_NUMBER_OF_BYTES_WRITE, SUM(i.SUM_NUMBER_OF_BYTES_WRITE) -FROM performance_schema.file_summary_by_event_name AS e -JOIN performance_schema.file_summary_by_instance AS i USING (EVENT_NAME) -GROUP BY EVENT_NAME -HAVING (e.SUM_NUMBER_OF_BYTES_WRITE <> SUM(i.SUM_NUMBER_OF_BYTES_WRITE)) -OR @dump_all; -EVENT_NAME SUM_NUMBER_OF_BYTES_WRITE SUM(i.SUM_NUMBER_OF_BYTES_WRITE) -"Verifying waits aggregate consistency (instance)" -SELECT EVENT_NAME, e.SUM_TIMER_WAIT, SUM(i.SUM_TIMER_WAIT) -FROM performance_schema.events_waits_summary_global_by_event_name AS e -JOIN performance_schema.events_waits_summary_by_instance AS i USING (EVENT_NAME) -GROUP BY EVENT_NAME -HAVING (e.SUM_TIMER_WAIT < SUM(i.SUM_TIMER_WAIT)) -OR @dump_all; -EVENT_NAME SUM_TIMER_WAIT SUM(i.SUM_TIMER_WAIT) -SELECT EVENT_NAME, e.MIN_TIMER_WAIT, MIN(i.MIN_TIMER_WAIT) -FROM performance_schema.events_waits_summary_global_by_event_name AS e -JOIN performance_schema.events_waits_summary_by_instance AS i USING (EVENT_NAME) -GROUP BY EVENT_NAME -HAVING (e.MIN_TIMER_WAIT > MIN(i.MIN_TIMER_WAIT)) -AND (MIN(i.MIN_TIMER_WAIT) != 0) -OR @dump_all; -EVENT_NAME MIN_TIMER_WAIT MIN(i.MIN_TIMER_WAIT) -SELECT EVENT_NAME, e.MAX_TIMER_WAIT, MAX(i.MAX_TIMER_WAIT) -FROM performance_schema.events_waits_summary_global_by_event_name AS e -JOIN performance_schema.events_waits_summary_by_instance AS i USING (EVENT_NAME) -GROUP BY EVENT_NAME -HAVING (e.MAX_TIMER_WAIT < MAX(i.MAX_TIMER_WAIT)) -OR @dump_all; -EVENT_NAME MAX_TIMER_WAIT MAX(i.MAX_TIMER_WAIT) -"Verifying waits aggregate consistency (thread)" -SELECT EVENT_NAME, e.SUM_TIMER_WAIT, SUM(t.SUM_TIMER_WAIT) -FROM performance_schema.events_waits_summary_global_by_event_name AS e -JOIN performance_schema.events_waits_summary_by_thread_by_event_name AS t -USING (EVENT_NAME) -GROUP BY EVENT_NAME -HAVING (e.SUM_TIMER_WAIT < SUM(t.SUM_TIMER_WAIT)) -OR @dump_all; -EVENT_NAME SUM_TIMER_WAIT SUM(t.SUM_TIMER_WAIT) -SELECT EVENT_NAME, e.MIN_TIMER_WAIT, MIN(t.MIN_TIMER_WAIT) -FROM performance_schema.events_waits_summary_global_by_event_name AS e -JOIN 
performance_schema.events_waits_summary_by_thread_by_event_name AS t -USING (EVENT_NAME) -GROUP BY EVENT_NAME -HAVING (e.MIN_TIMER_WAIT > MIN(t.MIN_TIMER_WAIT)) -AND (MIN(t.MIN_TIMER_WAIT) != 0) -OR @dump_all; -EVENT_NAME MIN_TIMER_WAIT MIN(t.MIN_TIMER_WAIT) -SELECT EVENT_NAME, e.MAX_TIMER_WAIT, MAX(t.MAX_TIMER_WAIT) -FROM performance_schema.events_waits_summary_global_by_event_name AS e -JOIN performance_schema.events_waits_summary_by_thread_by_event_name AS t -USING (EVENT_NAME) -GROUP BY EVENT_NAME -HAVING (e.MAX_TIMER_WAIT < MAX(t.MAX_TIMER_WAIT)) -OR @dump_all; -EVENT_NAME MAX_TIMER_WAIT MAX(t.MAX_TIMER_WAIT) -update performance_schema.setup_consumers set enabled = 'YES'; -update performance_schema.setup_instruments -set enabled = 'YES', timed = 'YES'; -drop table test.t1; -set @@global.aria_checkpoint_interval= @aria_checkpoint_interval_save; diff --git a/mysql-test/suite/perfschema/t/aggregate.test b/mysql-test/suite/perfschema/t/aggregate.test deleted file mode 100644 index fe30a7b8697..00000000000 --- a/mysql-test/suite/perfschema/t/aggregate.test +++ /dev/null @@ -1,197 +0,0 @@ -# Tests for PERFORMANCE_SCHEMA -# Verify that statistics aggregated by different criteria are consistent. - ---source include/not_embedded.inc ---source include/have_perfschema.inc - ---echo "General cleanup" - -# MDEV-7187 - test fails sporadically in buildbot -set @aria_checkpoint_interval_save= @@global.aria_checkpoint_interval; -set @@global.aria_checkpoint_interval= 0; - ---disable_warnings -drop table if exists t1; ---enable_warnings - -update performance_schema.setup_instruments set enabled = 'NO'; -update performance_schema.setup_consumers set enabled = 'NO'; - -# Cleanup statistics -truncate table performance_schema.file_summary_by_event_name; -truncate table performance_schema.file_summary_by_instance; -truncate table performance_schema.socket_summary_by_event_name; -truncate table performance_schema.socket_summary_by_instance; -truncate table performance_schema.events_waits_summary_global_by_event_name; -truncate table performance_schema.events_waits_summary_by_instance; -truncate table performance_schema.events_waits_summary_by_thread_by_event_name; - -# Start recording data -update performance_schema.setup_consumers set enabled = 'YES'; -update performance_schema.setup_instruments - set enabled = 'YES', timed = 'YES'; - - -create table t1 ( - id INT PRIMARY KEY, - b CHAR(100) DEFAULT 'initial value') - ENGINE=MyISAM; - -insert into t1 (id) values (1), (2), (3), (4), (5), (6), (7), (8); - -# Stop recording data, so the select below don't add noise. -update performance_schema.setup_instruments SET enabled = 'NO'; -# Disable all consumers, for long standing waits -update performance_schema.setup_consumers set enabled = 'NO'; - -# Helper to debug -set @dump_all=FALSE; - -# Note that in general: -# - COUNT/SUM/MAX(file_summary_by_event_name) >= -# COUNT/SUM/MAX(file_summary_by_instance). -# - MIN(file_summary_by_event_name) <= -# MIN(file_summary_by_instance). -# There will be equality only when file instances are not removed, -# aka when a file is not deleted from the file system, -# because doing so removes a row in file_summary_by_instance. 
- -# Likewise: -# - COUNT/SUM/MAX(events_waits_summary_global_by_event_name) >= -# COUNT/SUM/MAX(events_waits_summary_by_instance) -# - MIN(events_waits_summary_global_by_event_name) <= -# MIN(events_waits_summary_by_instance) -# There will be equality only when an instrument instance -# is not removed, which is next to impossible to predictably guarantee -# in the server. -# For example, a MyISAM table removed from the table cache -# will cause a mysql_mutex_destroy on myisam/MYISAM_SHARE::intern_lock. -# Another example, a thread terminating will cause a mysql_mutex_destroy -# on sql/LOCK_delete -# Both cause a row to be deleted from events_waits_summary_by_instance. - -# Likewise: -# - COUNT/SUM/MAX(events_waits_summary_global_by_event_name) >= -# COUNT/SUM/MAX(events_waits_summary_by_thread_by_event_name) -# - MIN(events_waits_summary_global_by_event_name) <= -# MIN(events_waits_summary_by_thread_by_event_name) -# There will be equality only when no thread is removed, -# that is if no thread disconnects, or no sub thread (for example insert -# delayed) ever completes. -# A thread completing will cause rows in -# events_waits_summary_by_thread_by_event_name to be removed. - ---echo "Verifying file aggregate consistency" - -# Since the code generating the load in this test does: -# - create table -# - insert -# - does not cause temporary tables to be used -# we can test for equality here for file aggregates. - -# If any of these queries returns data, the test failed. - -SELECT EVENT_NAME, e.COUNT_READ, SUM(i.COUNT_READ) -FROM performance_schema.file_summary_by_event_name AS e -JOIN performance_schema.file_summary_by_instance AS i USING (EVENT_NAME) -GROUP BY EVENT_NAME -HAVING (e.COUNT_READ <> SUM(i.COUNT_READ)) -OR @dump_all; - -SELECT EVENT_NAME, e.COUNT_WRITE, SUM(i.COUNT_WRITE) -FROM performance_schema.file_summary_by_event_name AS e -JOIN performance_schema.file_summary_by_instance AS i USING (EVENT_NAME) -GROUP BY EVENT_NAME -HAVING (e.COUNT_WRITE <> SUM(i.COUNT_WRITE)) -OR @dump_all; - -SELECT EVENT_NAME, e.COUNT_READ, SUM(i.COUNT_READ) -FROM performance_schema.socket_summary_by_event_name AS e -JOIN performance_schema.socket_summary_by_instance AS i USING (EVENT_NAME) -GROUP BY EVENT_NAME -HAVING (e.COUNT_READ <> SUM(i.COUNT_READ)) -OR @dump_all; - -SELECT EVENT_NAME, e.COUNT_WRITE, SUM(i.COUNT_WRITE) -FROM performance_schema.socket_summary_by_event_name AS e -JOIN performance_schema.socket_summary_by_instance AS i USING (EVENT_NAME) -GROUP BY EVENT_NAME -HAVING (e.COUNT_WRITE <> SUM(i.COUNT_WRITE)) -OR @dump_all; - -SELECT EVENT_NAME, e.SUM_NUMBER_OF_BYTES_READ, SUM(i.SUM_NUMBER_OF_BYTES_READ) -FROM performance_schema.file_summary_by_event_name AS e -JOIN performance_schema.file_summary_by_instance AS i USING (EVENT_NAME) -GROUP BY EVENT_NAME -HAVING (e.SUM_NUMBER_OF_BYTES_READ <> SUM(i.SUM_NUMBER_OF_BYTES_READ)) -OR @dump_all; - -SELECT EVENT_NAME, e.SUM_NUMBER_OF_BYTES_WRITE, SUM(i.SUM_NUMBER_OF_BYTES_WRITE) -FROM performance_schema.file_summary_by_event_name AS e -JOIN performance_schema.file_summary_by_instance AS i USING (EVENT_NAME) -GROUP BY EVENT_NAME -HAVING (e.SUM_NUMBER_OF_BYTES_WRITE <> SUM(i.SUM_NUMBER_OF_BYTES_WRITE)) -OR @dump_all; - ---echo "Verifying waits aggregate consistency (instance)" - -SELECT EVENT_NAME, e.SUM_TIMER_WAIT, SUM(i.SUM_TIMER_WAIT) -FROM performance_schema.events_waits_summary_global_by_event_name AS e -JOIN performance_schema.events_waits_summary_by_instance AS i USING (EVENT_NAME) -GROUP BY EVENT_NAME -HAVING (e.SUM_TIMER_WAIT < 
SUM(i.SUM_TIMER_WAIT)) -OR @dump_all; - -SELECT EVENT_NAME, e.MIN_TIMER_WAIT, MIN(i.MIN_TIMER_WAIT) -FROM performance_schema.events_waits_summary_global_by_event_name AS e -JOIN performance_schema.events_waits_summary_by_instance AS i USING (EVENT_NAME) -GROUP BY EVENT_NAME -HAVING (e.MIN_TIMER_WAIT > MIN(i.MIN_TIMER_WAIT)) -AND (MIN(i.MIN_TIMER_WAIT) != 0) -OR @dump_all; - -SELECT EVENT_NAME, e.MAX_TIMER_WAIT, MAX(i.MAX_TIMER_WAIT) -FROM performance_schema.events_waits_summary_global_by_event_name AS e -JOIN performance_schema.events_waits_summary_by_instance AS i USING (EVENT_NAME) -GROUP BY EVENT_NAME -HAVING (e.MAX_TIMER_WAIT < MAX(i.MAX_TIMER_WAIT)) -OR @dump_all; - ---echo "Verifying waits aggregate consistency (thread)" - -SELECT EVENT_NAME, e.SUM_TIMER_WAIT, SUM(t.SUM_TIMER_WAIT) -FROM performance_schema.events_waits_summary_global_by_event_name AS e -JOIN performance_schema.events_waits_summary_by_thread_by_event_name AS t -USING (EVENT_NAME) -GROUP BY EVENT_NAME -HAVING (e.SUM_TIMER_WAIT < SUM(t.SUM_TIMER_WAIT)) -OR @dump_all; - -SELECT EVENT_NAME, e.MIN_TIMER_WAIT, MIN(t.MIN_TIMER_WAIT) -FROM performance_schema.events_waits_summary_global_by_event_name AS e -JOIN performance_schema.events_waits_summary_by_thread_by_event_name AS t -USING (EVENT_NAME) -GROUP BY EVENT_NAME -HAVING (e.MIN_TIMER_WAIT > MIN(t.MIN_TIMER_WAIT)) -AND (MIN(t.MIN_TIMER_WAIT) != 0) -OR @dump_all; - -SELECT EVENT_NAME, e.MAX_TIMER_WAIT, MAX(t.MAX_TIMER_WAIT) -FROM performance_schema.events_waits_summary_global_by_event_name AS e -JOIN performance_schema.events_waits_summary_by_thread_by_event_name AS t -USING (EVENT_NAME) -GROUP BY EVENT_NAME -HAVING (e.MAX_TIMER_WAIT < MAX(t.MAX_TIMER_WAIT)) -OR @dump_all; - - -# Cleanup - -update performance_schema.setup_consumers set enabled = 'YES'; -update performance_schema.setup_instruments - set enabled = 'YES', timed = 'YES'; - -drop table test.t1; - -set @@global.aria_checkpoint_interval= @aria_checkpoint_interval_save; - diff --git a/mysql-test/suite/plugins/r/server_audit.result b/mysql-test/suite/plugins/r/server_audit.result index 83b88ed0480..ceb75176b43 100644 --- a/mysql-test/suite/plugins/r/server_audit.result +++ b/mysql-test/suite/plugins/r/server_audit.result @@ -8,7 +8,6 @@ server_audit_file_rotate_now OFF server_audit_file_rotate_size 1000000 server_audit_file_rotations 9 server_audit_incl_users -server_audit_loc_info server_audit_logging OFF server_audit_mode 0 server_audit_output_type file @@ -72,7 +71,6 @@ server_audit_file_rotate_now OFF server_audit_file_rotate_size 1000000 server_audit_file_rotations 9 server_audit_incl_users odin, root, dva, tri -server_audit_loc_info server_audit_logging ON server_audit_mode 0 server_audit_output_type file @@ -218,7 +216,6 @@ server_audit_file_rotate_now OFF server_audit_file_rotate_size 1000000 server_audit_file_rotations 9 server_audit_incl_users odin, root, dva, tri -server_audit_loc_info server_audit_logging ON server_audit_mode 1 server_audit_output_type file diff --git a/mysql-test/suite/plugins/r/thread_pool_server_audit.result b/mysql-test/suite/plugins/r/thread_pool_server_audit.result index 83b88ed0480..ceb75176b43 100644 --- a/mysql-test/suite/plugins/r/thread_pool_server_audit.result +++ b/mysql-test/suite/plugins/r/thread_pool_server_audit.result @@ -8,7 +8,6 @@ server_audit_file_rotate_now OFF server_audit_file_rotate_size 1000000 server_audit_file_rotations 9 server_audit_incl_users -server_audit_loc_info server_audit_logging OFF server_audit_mode 0 server_audit_output_type file @@ -72,7 +71,6 
@@ server_audit_file_rotate_now OFF server_audit_file_rotate_size 1000000 server_audit_file_rotations 9 server_audit_incl_users odin, root, dva, tri -server_audit_loc_info server_audit_logging ON server_audit_mode 0 server_audit_output_type file @@ -218,7 +216,6 @@ server_audit_file_rotate_now OFF server_audit_file_rotate_size 1000000 server_audit_file_rotations 9 server_audit_incl_users odin, root, dva, tri -server_audit_loc_info server_audit_logging ON server_audit_mode 1 server_audit_output_type file diff --git a/mysql-test/suite/rpl/t/rpl_drop_db.test b/mysql-test/suite/rpl/t/rpl_drop_db.test index a67850a66dd..dae1651dc93 100644 --- a/mysql-test/suite/rpl/t/rpl_drop_db.test +++ b/mysql-test/suite/rpl/t/rpl_drop_db.test @@ -13,7 +13,7 @@ insert into mysqltest1.t1 values (1); select * from mysqltest1.t1 into outfile 'mysqltest1/f1.txt'; create table mysqltest1.t2 (n int); create table mysqltest1.t3 (n int); ---replace_result \\ / 66 39 17 39 "File exists" "Directory not empty" +--replace_result \\ / 66 39 17 39 247 39 "File exists" "Directory not empty" --error 1010 drop database mysqltest1; use mysqltest1; @@ -30,7 +30,7 @@ while ($1) } --enable_query_log ---replace_result \\ / 66 39 17 39 "File exists" "Directory not empty" +--replace_result \\ / 66 39 17 39 247 39 "File exists" "Directory not empty" --error 1010 drop database mysqltest1; use mysqltest1; diff --git a/mysql-test/t/ctype_utf32.test b/mysql-test/t/ctype_utf32.test index 4bff8a867f1..190b9f3fb50 100644 --- a/mysql-test/t/ctype_utf32.test +++ b/mysql-test/t/ctype_utf32.test @@ -889,6 +889,11 @@ SELECT CHAR_LENGTH(TRIM(BOTH 0x0001 FROM _utf32 0x00000061)); SELECT CHAR_LENGTH(TRIM(BOTH 0x61 FROM _utf32 0x00000061)); SELECT CHAR_LENGTH(TRIM(BOTH 0x00 FROM _utf32 0x00000061)); +# +# potential signedness issue +# +select hex(lower(cast(0xffff0000 as char character set utf32))) as c; + --echo # --echo # End of 5.5 tests --echo # diff --git a/mysql-test/t/group_min_max_innodb.test b/mysql-test/t/group_min_max_innodb.test index 6967f847147..91e0bd3279f 100644 --- a/mysql-test/t/group_min_max_innodb.test +++ b/mysql-test/t/group_min_max_innodb.test @@ -230,3 +230,16 @@ eval EXPLAIN $query; eval $query; DROP TABLE t0,t1,t2; + +--echo # +--echo # MDEV-MariaDB daemon leaks memory with specific query +--echo # + +CREATE TABLE t1 (`voter_id` int(11) unsigned NOT NULL, + `language_id` int(11) unsigned NOT NULL DEFAULT '1' +) ENGINE=InnoDB DEFAULT CHARSET=utf8; +CREATE TABLE t2 (`voter_id` int(10) unsigned NOT NULL DEFAULT '0', + `serialized_c` mediumblob) ENGINE=InnoDB DEFAULT CHARSET=utf8; +insert into t2 values (1,repeat("a",1000)),(2,repeat("a",1000)),(3,repeat("b",1000)),(4,repeat("c",1000)),(4,repeat("b",1000)); +SELECT GROUP_CONCAT(t1.language_id SEPARATOR ',') AS `translation_resources`, `d`.`serialized_c` FROM t2 AS `d` LEFT JOIN t1 ON `d`.`voter_id` = t1.`voter_id` GROUP BY `d`.`voter_id` ORDER BY 10-d.voter_id+RAND()*0; +drop table t1,t2; diff --git a/mysql-test/t/merge.test b/mysql-test/t/merge.test index 519094d6350..e9d69b446d5 100644 --- a/mysql-test/t/merge.test +++ b/mysql-test/t/merge.test @@ -2881,6 +2881,19 @@ drop tables m1, t1, t4; drop view t3; +--echo # +--echo # MDEV-10424 - Assertion `ticket == __null' failed in +--echo # MDL_request::set_type +--echo # +CREATE TABLE t1 (f1 INT) ENGINE=MyISAM; +CREATE TABLE tmerge (f1 INT) ENGINE=MERGE UNION=(t1); +PREPARE stmt FROM "ANALYZE TABLE tmerge, t1"; +EXECUTE stmt; +EXECUTE stmt; +DEALLOCATE PREPARE stmt; +DROP TABLE t1, tmerge; + + --echo End of 5.5 tests diff --git 
a/mysql-test/t/ps.test b/mysql-test/t/ps.test index 3881d522bbf..67f6f021434 100644 --- a/mysql-test/t/ps.test +++ b/mysql-test/t/ps.test @@ -3670,5 +3670,32 @@ deallocate prepare stmt; SET SESSION sql_mode = @save_sql_mode; DROP TABLE t1,t2; +--echo # +--echo # MDEV-8833: Crash of server on prepared statement with +--echo # conversion to semi-join +--echo # + +CREATE TABLE t1 (column1 INT); +INSERT INTO t1 VALUES (3),(9); + +CREATE TABLE t2 (column2 INT); +INSERT INTO t2 VALUES (1),(4); + +CREATE TABLE t3 (column3 INT); +INSERT INTO t3 VALUES (6),(8); + +CREATE TABLE t4 (column4 INT); +INSERT INTO t4 VALUES (2),(5); + +PREPARE stmt FROM "SELECT ( SELECT MAX( table1.column1 ) AS field1 +FROM t1 AS table1 +WHERE table3.column3 IN ( SELECT table2.column2 AS field2 FROM t2 AS table2 ) +) AS sq +FROM t3 AS table3, t4 AS table4"; +EXECUTE stmt; +EXECUTE stmt; +deallocate prepare stmt; +drop table t1,t2,t3,t4; + ---echo # End of 10.0 tests +--echo # End of 5.5 tests diff --git a/mysql-test/t/type_uint.test b/mysql-test/t/type_uint.test index 3a949c5c47a..84fca993d09 100644 --- a/mysql-test/t/type_uint.test +++ b/mysql-test/t/type_uint.test @@ -16,6 +16,13 @@ drop table t1; # End of 4.1 tests +create table t1 (a bigint unsigned, b mediumint unsigned); +insert t1 values (1,2),(0xffffffffffffffff,0xffffff); +select coalesce(a,b), coalesce(b,a) from t1; +create table t2 as select a from t1 union select b from t1; +show create table t2; +select * from t2; +drop table t1, t2; --echo # --echo # Start of 10.0 tests diff --git a/mysys/my_redel.c b/mysys/my_redel.c index 61e61b40791..976fc5a18c3 100644 --- a/mysys/my_redel.c +++ b/mysys/my_redel.c @@ -1,5 +1,5 @@ -/* - Copyright (c) 2000, 2010, Oracle and/or its affiliates +/* Copyright (c) 2000, 2010, Oracle and/or its affiliates + Copyright (c) 2009, 2016, MariaDB This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by @@ -49,7 +49,8 @@ int my_redel(const char *org_name, const char *tmp_name, DBUG_PRINT("my",("org_name: '%s' tmp_name: '%s' MyFlags: %lu", org_name,tmp_name,MyFlags)); - if (my_copystat(org_name,tmp_name,MyFlags) < 0) + if (!my_disable_copystat_in_redel && + my_copystat(org_name,tmp_name,MyFlags) < 0) goto end; if (MyFlags & MY_REDEL_MAKE_BACKUP) { diff --git a/mysys/my_static.c b/mysys/my_static.c index 4aca78e30a9..9236c1395fb 100644 --- a/mysys/my_static.c +++ b/mysys/my_static.c @@ -98,3 +98,4 @@ my_bool my_disable_sync=0; my_bool my_disable_async_io=0; my_bool my_disable_flush_key_blocks=0; my_bool my_disable_symlinks=0; +my_bool my_disable_copystat_in_redel=0; diff --git a/plugin/server_audit/server_audit.c b/plugin/server_audit/server_audit.c index b84f2b94806..d48b6c37728 100644 --- a/plugin/server_audit/server_audit.c +++ b/plugin/server_audit/server_audit.c @@ -427,9 +427,8 @@ static MYSQL_SYSVAR_UINT(query_log_limit, query_log_limit, char locinfo_ini_value[sizeof(struct connection_info)+4]; static MYSQL_THDVAR_STR(loc_info, - PLUGIN_VAR_READONLY | PLUGIN_VAR_MEMALLOC, - "Auxiliary info.", NULL, NULL, - locinfo_ini_value); + PLUGIN_VAR_NOSYSVAR | PLUGIN_VAR_NOCMDOPT | PLUGIN_VAR_MEMALLOC, + "Internal info", NULL, NULL, locinfo_ini_value); static const char *syslog_facility_names[]= { diff --git a/sql/contributors.h b/sql/contributors.h index f52d3243453..0359ec54022 100644 --- a/sql/contributors.h +++ b/sql/contributors.h @@ -46,6 +46,7 @@ struct show_table_contributors_st show_table_contributors[]= { {"Auttomattic", "https://automattic.com", 
"Bronze Sponsor of the MariaDB Foundation"}, {"Verkkokauppa.com", "https://virtuozzo.com", "Bronze Sponsor of the MariaDB Foundation"}, {"Virtuozzo", "https://virtuozzo.com/", "Bronze Sponsor of the MariaDB Foundation"}, + {"Tencent Game DBA", "http://tencentdba.com/about/", "Bronze Sponsor of the MariaDB Foundation"}, /* Sponsors of important features */ {"Google", "USA", "Sponsoring encryption, parallel replication and GTID"}, diff --git a/sql/field.cc b/sql/field.cc index ae815187019..b909d14ec8f 100644 --- a/sql/field.cc +++ b/sql/field.cc @@ -355,7 +355,7 @@ static enum_field_types field_types_merge_rules [FIELDTYPE_NUM][FIELDTYPE_NUM]= //MYSQL_TYPE_NULL MYSQL_TYPE_TIMESTAMP MYSQL_TYPE_LONGLONG, MYSQL_TYPE_VARCHAR, //MYSQL_TYPE_LONGLONG MYSQL_TYPE_INT24 - MYSQL_TYPE_LONGLONG, MYSQL_TYPE_LONG, + MYSQL_TYPE_LONGLONG, MYSQL_TYPE_LONGLONG, //MYSQL_TYPE_DATE MYSQL_TYPE_TIME MYSQL_TYPE_VARCHAR, MYSQL_TYPE_VARCHAR, //MYSQL_TYPE_DATETIME MYSQL_TYPE_YEAR diff --git a/sql/item.cc b/sql/item.cc index a9c17ef620c..47635b14f46 100644 --- a/sql/item.cc +++ b/sql/item.cc @@ -2673,9 +2673,28 @@ void Item_field::fix_after_pullout(st_select_lex *new_parent, Item **ref) if (context) { Name_resolution_context *ctx= new Name_resolution_context(); - ctx->outer_context= NULL; // We don't build a complete name resolver - ctx->table_list= NULL; // We rely on first_name_resolution_table instead + if (context->select_lex == new_parent) + { + /* + This field was pushed in then pulled out + (for example left part of IN) + */ + ctx->outer_context= context->outer_context; + } + else if (context->outer_context) + { + /* just pull to the upper context */ + ctx->outer_context= context->outer_context->outer_context; + } + else + { + /* No upper context (merging Derived/VIEW where context chain ends) */ + ctx->outer_context= NULL; + } + ctx->table_list= context->first_name_resolution_table; ctx->select_lex= new_parent; + if (context->select_lex == NULL) + ctx->select_lex= NULL; ctx->first_name_resolution_table= context->first_name_resolution_table; ctx->last_name_resolution_table= context->last_name_resolution_table; ctx->error_processor= context->error_processor; diff --git a/sql/log.cc b/sql/log.cc index be24bcd718a..45ab5c8827b 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -3102,7 +3102,7 @@ bool MYSQL_QUERY_LOG::write(THD *thd, time_t current_time, if (! write_error) { write_error= 1; - sql_print_error(ER_THD(thd, ER_ERROR_ON_WRITE), name, error); + sql_print_error(ER_THD(thd, ER_ERROR_ON_WRITE), name, tmp_errno); } } } diff --git a/sql/mysqld.cc b/sql/mysqld.cc index 41f6def8e08..8da8273083c 100644 --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@ -4096,6 +4096,7 @@ static int init_common_variables() max_system_variables.pseudo_thread_id= (ulong)~0; server_start_time= flush_status_time= my_time(0); + my_disable_copystat_in_redel= 1; global_rpl_filter= new Rpl_filter; binlog_filter= new Rpl_filter; diff --git a/sql/net_serv.cc b/sql/net_serv.cc index ef7a46a7109..da3c5646e84 100644 --- a/sql/net_serv.cc +++ b/sql/net_serv.cc @@ -1,5 +1,5 @@ -/* Copyright (c) 2000, 2013, Oracle and/or its affiliates. - Copyright (c) 2010, 2014, SkySQL Ab. +/* Copyright (c) 2000, 2016, Oracle and/or its affiliates. 
+ Copyright (c) 2012, 2016, MariaDB This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by diff --git a/sql/parse_file.h b/sql/parse_file.h index e4756e6c8af..87917dbd71b 100644 --- a/sql/parse_file.h +++ b/sql/parse_file.h @@ -42,9 +42,9 @@ enum file_opt_type { struct File_option { - LEX_STRING name; /**< Name of the option */ - int offset; /**< offset to base address of value */ - file_opt_type type; /**< Option type */ + LEX_STRING name; /**< Name of the option */ + my_ptrdiff_t offset; /**< offset to base address of value */ + file_opt_type type; /**< Option type */ }; diff --git a/sql/signal_handler.cc b/sql/signal_handler.cc index 9dd3e532d1e..f72eb676743 100644 --- a/sql/signal_handler.cc +++ b/sql/signal_handler.cc @@ -64,13 +64,13 @@ extern "C" sig_handler handle_fatal_signal(int sig) struct tm tm; #ifdef HAVE_STACKTRACE THD *thd; -#endif /* This flag remembers if the query pointer was found invalid. We will try and print the query at the end of the signal handler, in case we're wrong. */ bool print_invalid_query_pointer= false; +#endif if (segfaulted) { @@ -276,6 +276,7 @@ extern "C" sig_handler handle_fatal_signal(int sig) "\"mlockall\" bugs.\n"); } +#ifdef HAVE_STACKTRACE if (print_invalid_query_pointer) { my_safe_printf_stderr( @@ -285,6 +286,7 @@ extern "C" sig_handler handle_fatal_signal(int sig) my_write_stderr(thd->query(), MY_MIN(65536U, thd->query_length())); my_safe_printf_stderr("\n\n"); } +#endif #ifdef HAVE_WRITE_CORE if (test_flags & TEST_CORE_ON_SIGNAL) diff --git a/sql/sql_admin.cc b/sql/sql_admin.cc index b974075b442..1f4426f2043 100644 --- a/sql/sql_admin.cc +++ b/sql/sql_admin.cc @@ -466,7 +466,19 @@ static bool mysql_admin_table(THD* thd, TABLE_LIST* tables, } thd->prepare_derived_at_open= FALSE; - table->next_global= save_next_global; + /* + MERGE engine may adjust table->next_global chain, thus we have to + append save_next_global after merge children. 
+ */ + if (save_next_global) + { + TABLE_LIST *table_list_iterator= table; + while (table_list_iterator->next_global) + table_list_iterator= table_list_iterator->next_global; + table_list_iterator->next_global= save_next_global; + save_next_global->prev_global= &table_list_iterator->next_global; + } + table->next_local= save_next_local; thd->open_options&= ~extra_open_options; diff --git a/sql/sql_base.cc b/sql/sql_base.cc index 9ea5b20dce6..b4a3cc27d2c 100644 --- a/sql/sql_base.cc +++ b/sql/sql_base.cc @@ -9267,6 +9267,7 @@ open_system_tables_for_read(THD *thd, TABLE_LIST *table_list, */ lex->reset_n_backup_query_tables_list(&query_tables_list_backup); thd->reset_n_backup_open_tables_state(backup); + thd->lex->sql_command= SQLCOM_SELECT; if (open_and_lock_tables(thd, table_list, FALSE, MYSQL_OPEN_IGNORE_FLUSH | diff --git a/sql/sql_class.cc b/sql/sql_class.cc index 430191cee5d..4143d2cc419 100644 --- a/sql/sql_class.cc +++ b/sql/sql_class.cc @@ -5828,9 +5828,11 @@ int THD::decide_logging_format(TABLE_LIST *tables) { static const char *prelocked_mode_name[] = { "NON_PRELOCKED", + "LOCK_TABLES", "PRELOCKED", "PRELOCKED_UNDER_LOCK_TABLES", }; + compile_time_assert(array_elements(prelocked_mode_name) == LTM_always_last); DBUG_PRINT("debug", ("prelocked_mode: %s", prelocked_mode_name[locked_tables_mode])); } diff --git a/sql/sql_class.h b/sql/sql_class.h index b22dc8142d8..da885c3dbac 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -1262,7 +1262,8 @@ enum enum_locked_tables_mode LTM_NONE= 0, LTM_LOCK_TABLES, LTM_PRELOCKED, - LTM_PRELOCKED_UNDER_LOCK_TABLES + LTM_PRELOCKED_UNDER_LOCK_TABLES, + LTM_always_last }; @@ -4610,6 +4611,11 @@ public: save_copy_field_end= copy_field_end= NULL; } } + void free_copy_field_data() + { + for (Copy_field *ptr= copy_field ; ptr != copy_field_end ; ptr++) + ptr->tmp.free(); + } }; class select_union :public select_result_interceptor diff --git a/sql/sql_plugin.cc b/sql/sql_plugin.cc index 60248f3fef4..e7286960599 100644 --- a/sql/sql_plugin.cc +++ b/sql/sql_plugin.cc @@ -2849,6 +2849,22 @@ static st_bookmark *find_bookmark(const char *plugin, const char *name, } +static size_t var_storage_size(int flags) +{ + switch (flags & PLUGIN_VAR_TYPEMASK) { + case PLUGIN_VAR_BOOL: return sizeof(my_bool); + case PLUGIN_VAR_INT: return sizeof(int); + case PLUGIN_VAR_LONG: return sizeof(long); + case PLUGIN_VAR_ENUM: return sizeof(long); + case PLUGIN_VAR_LONGLONG: return sizeof(ulonglong); + case PLUGIN_VAR_SET: return sizeof(ulonglong); + case PLUGIN_VAR_STR: return sizeof(char*); + case PLUGIN_VAR_DOUBLE: return sizeof(double); + default: DBUG_ASSERT(0); return 0; + } +} + + /* returns a bookmark for thd-local variables, creating if neccessary. returns null for non thd-local variables. 
@@ -2857,39 +2873,13 @@ static st_bookmark *find_bookmark(const char *plugin, const char *name, static st_bookmark *register_var(const char *plugin, const char *name, int flags) { - uint length= strlen(plugin) + strlen(name) + 3, size= 0, offset, new_size; + uint length= strlen(plugin) + strlen(name) + 3, size, offset, new_size; st_bookmark *result; char *varname, *p; - if (!(flags & PLUGIN_VAR_THDLOCAL)) - return NULL; - - switch (flags & PLUGIN_VAR_TYPEMASK) { - case PLUGIN_VAR_BOOL: - size= sizeof(my_bool); - break; - case PLUGIN_VAR_INT: - size= sizeof(int); - break; - case PLUGIN_VAR_LONG: - case PLUGIN_VAR_ENUM: - size= sizeof(long); - break; - case PLUGIN_VAR_LONGLONG: - case PLUGIN_VAR_SET: - size= sizeof(ulonglong); - break; - case PLUGIN_VAR_STR: - size= sizeof(char*); - break; - case PLUGIN_VAR_DOUBLE: - size= sizeof(double); - break; - default: - DBUG_ASSERT(0); - return NULL; - }; + DBUG_ASSERT(flags & PLUGIN_VAR_THDLOCAL); + size= var_storage_size(flags); varname= ((char*) my_alloca(length)); strxmov(varname + 1, plugin, "_", name, NullS); for (p= varname + 1; *p; p++) @@ -2983,25 +2973,17 @@ void sync_dynamic_session_variables(THD* thd, bool global_lock) */ for (idx= 0; idx < bookmark_hash.records; idx++) { - sys_var_pluginvar *pi; - sys_var *var; st_bookmark *v= (st_bookmark*) my_hash_element(&bookmark_hash,idx); if (v->version <= thd->variables.dynamic_variables_version) continue; /* already in thd->variables */ - if (!(var= intern_find_sys_var(v->key + 1, v->name_len)) || - !(pi= var->cast_pluginvar()) || - v->key[0] != plugin_var_bookmark_key(pi->plugin_var->flags)) - continue; - /* Here we do anything special that may be required of the data types */ - if ((pi->plugin_var->flags & PLUGIN_VAR_TYPEMASK) == PLUGIN_VAR_STR && - pi->plugin_var->flags & PLUGIN_VAR_MEMALLOC) + if ((v->key[0] & PLUGIN_VAR_TYPEMASK) == PLUGIN_VAR_STR && + v->key[0] & BOOKMARK_MEMALLOC) { - int offset= ((thdvar_str_t *)(pi->plugin_var))->offset; - char **pp= (char**) (thd->variables.dynamic_variables_ptr + offset); + char **pp= (char**) (thd->variables.dynamic_variables_ptr + v->offset); if (*pp) *pp= my_strdup(*pp, MYF(MY_WME|MY_FAE)); } @@ -3448,69 +3430,58 @@ bool sys_var_pluginvar::session_update(THD *thd, set_var *var) return false; } -bool sys_var_pluginvar::global_update(THD *thd, set_var *var) +static const void *var_def_ptr(st_mysql_sys_var *pv) { - DBUG_ASSERT(!is_readonly()); - mysql_mutex_assert_owner(&LOCK_global_system_variables); - - void *tgt= real_value_ptr(thd, OPT_GLOBAL); - const void *src= &var->save_result; - - if (!var->value) - { - switch (plugin_var->flags & (PLUGIN_VAR_TYPEMASK | PLUGIN_VAR_THDLOCAL)) { + switch (pv->flags & (PLUGIN_VAR_TYPEMASK | PLUGIN_VAR_THDLOCAL)) { case PLUGIN_VAR_INT: - src= &((sysvar_uint_t*) plugin_var)->def_val; - break; + return &((sysvar_uint_t*) pv)->def_val; case PLUGIN_VAR_LONG: - src= &((sysvar_ulong_t*) plugin_var)->def_val; - break; + return &((sysvar_ulong_t*) pv)->def_val; case PLUGIN_VAR_LONGLONG: - src= &((sysvar_ulonglong_t*) plugin_var)->def_val; - break; + return &((sysvar_ulonglong_t*) pv)->def_val; case PLUGIN_VAR_ENUM: - src= &((sysvar_enum_t*) plugin_var)->def_val; - break; + return &((sysvar_enum_t*) pv)->def_val; case PLUGIN_VAR_SET: - src= &((sysvar_set_t*) plugin_var)->def_val; - break; + return &((sysvar_set_t*) pv)->def_val; case PLUGIN_VAR_BOOL: - src= &((sysvar_bool_t*) plugin_var)->def_val; - break; + return &((sysvar_bool_t*) pv)->def_val; case PLUGIN_VAR_STR: - src= &((sysvar_str_t*) plugin_var)->def_val; - break; 
+ return &((sysvar_str_t*) pv)->def_val; case PLUGIN_VAR_DOUBLE: - src= &((sysvar_double_t*) plugin_var)->def_val; - break; + return &((sysvar_double_t*) pv)->def_val; case PLUGIN_VAR_INT | PLUGIN_VAR_THDLOCAL: - src= &((thdvar_uint_t*) plugin_var)->def_val; - break; + return &((thdvar_uint_t*) pv)->def_val; case PLUGIN_VAR_LONG | PLUGIN_VAR_THDLOCAL: - src= &((thdvar_ulong_t*) plugin_var)->def_val; - break; + return &((thdvar_ulong_t*) pv)->def_val; case PLUGIN_VAR_LONGLONG | PLUGIN_VAR_THDLOCAL: - src= &((thdvar_ulonglong_t*) plugin_var)->def_val; - break; + return &((thdvar_ulonglong_t*) pv)->def_val; case PLUGIN_VAR_ENUM | PLUGIN_VAR_THDLOCAL: - src= &((thdvar_enum_t*) plugin_var)->def_val; - break; + return &((thdvar_enum_t*) pv)->def_val; case PLUGIN_VAR_SET | PLUGIN_VAR_THDLOCAL: - src= &((thdvar_set_t*) plugin_var)->def_val; - break; + return &((thdvar_set_t*) pv)->def_val; case PLUGIN_VAR_BOOL | PLUGIN_VAR_THDLOCAL: - src= &((thdvar_bool_t*) plugin_var)->def_val; - break; + return &((thdvar_bool_t*) pv)->def_val; case PLUGIN_VAR_STR | PLUGIN_VAR_THDLOCAL: - src= &((thdvar_str_t*) plugin_var)->def_val; - break; + return &((thdvar_str_t*) pv)->def_val; case PLUGIN_VAR_DOUBLE | PLUGIN_VAR_THDLOCAL: - src= &((thdvar_double_t*) plugin_var)->def_val; - break; + return &((thdvar_double_t*) pv)->def_val; default: DBUG_ASSERT(0); + return NULL; } - } +} + + +bool sys_var_pluginvar::global_update(THD *thd, set_var *var) +{ + DBUG_ASSERT(!is_readonly()); + mysql_mutex_assert_owner(&LOCK_global_system_variables); + + void *tgt= real_value_ptr(thd, OPT_GLOBAL); + const void *src= &var->save_result; + + if (!var->value) + src= var_def_ptr(plugin_var); plugin_var->update(thd, plugin_var, tgt, src); return false; @@ -3863,7 +3834,18 @@ static int construct_options(MEM_ROOT *mem_root, struct st_plugin_int *tmp, *(int*)(opt + 1)= offset= v->offset; if (opt->flags & PLUGIN_VAR_NOCMDOPT) + { + char *val= global_system_variables.dynamic_variables_ptr + offset; + if (((opt->flags & PLUGIN_VAR_TYPEMASK) == PLUGIN_VAR_STR) && + (opt->flags & PLUGIN_VAR_MEMALLOC)) + { + char *def_val= *(char**)var_def_ptr(opt); + *(char**)val= def_val ? my_strdup(def_val, MYF(0)) : NULL; + } + else + memcpy(val, var_def_ptr(opt), var_storage_size(opt->flags)); continue; + } optname= (char*) memdup_root(mem_root, v->key + 1, (optnamelen= v->name_len) + 1); diff --git a/sql/sql_select.cc b/sql/sql_select.cc index 5dc50c92104..239e5b6b5d2 100644 --- a/sql/sql_select.cc +++ b/sql/sql_select.cc @@ -9172,9 +9172,26 @@ JOIN::make_simple_join(JOIN *parent, TABLE *temp_table) We need to destruct the copy_field (allocated in create_tmp_table()) before setting it to 0 if the join is not "reusable". */ - if (!tmp_join || tmp_join != this) - tmp_table_param.cleanup(); - tmp_table_param.copy_field= tmp_table_param.copy_field_end=0; + if (!tmp_join || tmp_join != this) + tmp_table_param.cleanup(); + else + { + /* + Free data buffered in copy_fields, but keep data pointed by copy_field + around for next iteration (possibly stored in save_copy_fields). + + It would be logically simpler to not clear copy_field + below, but as we have loops that runs over copy_field to + copy_field_end that should not be done anymore, it's simpler to + just clear the pointers. + + Another option would be to just clear copy_field_end and not run + the loops if this is not set or to have tmp_table_param.cleanup() + to run cleanup on save_copy_field if copy_field is not set. 
+ */ + tmp_table_param.free_copy_field_data(); + tmp_table_param.copy_field= tmp_table_param.copy_field_end=0; + } first_record= sort_and_group=0; send_records= (ha_rows) 0; @@ -11890,7 +11907,7 @@ void JOIN::join_free() /** Free resources of given join. - @param fill true if we should free all resources, call with full==1 + @param full true if we should free all resources, call with full==1 should be last, before it this function can be called with full==0 @@ -12010,7 +12027,7 @@ void JOIN::cleanup(bool full) /* If we have tmp_join and 'this' JOIN is not tmp_join and tmp_table_param.copy_field's of them are equal then we have to remove - pointer to tmp_table_param.copy_field from tmp_join, because it qill + pointer to tmp_table_param.copy_field from tmp_join, because it will be removed in tmp_table_param.cleanup(). */ if (tmp_join && @@ -16114,6 +16131,7 @@ Field *create_tmp_field(THD *thd, TABLE *table,Item *item, Item::Type type, case Item::VARBIN_ITEM: case Item::CACHE_ITEM: case Item::EXPR_CACHE_ITEM: + case Item::PARAM_ITEM: if (make_copy_field) { DBUG_ASSERT(((Item_result_field*)item)->result_field); @@ -22917,7 +22935,7 @@ setup_copy_fields(THD *thd, TMP_TABLE_PARAM *param, err: if (copy) delete [] param->copy_field; // This is never 0 - param->copy_field=0; + param->copy_field= 0; err2: DBUG_RETURN(TRUE); } diff --git a/sql/table_cache.cc b/sql/table_cache.cc index 2dd368a1945..16a47b37417 100644 --- a/sql/table_cache.cc +++ b/sql/table_cache.cc @@ -778,6 +778,8 @@ void tdc_release_share(TABLE_SHARE *share) mysql_mutex_lock(&share->tdc->LOCK_table_share); if (--share->tdc->ref_count) { + if (!share->is_view) + mysql_cond_broadcast(&share->tdc->COND_release); mysql_mutex_unlock(&share->tdc->LOCK_table_share); mysql_mutex_unlock(&LOCK_unused_shares); DBUG_VOID_RETURN; diff --git a/storage/innobase/dict/dict0stats.cc b/storage/innobase/dict/dict0stats.cc index f21fd560235..c13d4583fef 100644 --- a/storage/innobase/dict/dict0stats.cc +++ b/storage/innobase/dict/dict0stats.cc @@ -736,7 +736,7 @@ dict_stats_copy( if (dst_idx->type & DICT_FTS) { continue; } - dict_stats_empty_index(dst_idx); + dict_stats_empty_index(dst_idx, true); } else { continue; } diff --git a/storage/innobase/fts/fts0fts.cc b/storage/innobase/fts/fts0fts.cc index 1457417d5dc..61b6f5408cf 100644 --- a/storage/innobase/fts/fts0fts.cc +++ b/storage/innobase/fts/fts0fts.cc @@ -108,6 +108,7 @@ UNIV_INTERN mysql_pfs_key_t fts_pll_tokenize_mutex_key; /** variable to record innodb_fts_internal_tbl_name for information schema table INNODB_FTS_INSERTED etc. 
*/ UNIV_INTERN char* fts_internal_tbl_name = NULL; +UNIV_INTERN char* fts_internal_tbl_name2 = NULL; /** InnoDB default stopword list: There are different versions of stopwords, the stop words listed @@ -6570,6 +6571,36 @@ fts_check_corrupt_index( return(0); } +/* Get parent table name if it's a fts aux table +@param[in] aux_table_name aux table name +@param[in] aux_table_len aux table length +@return parent table name, or NULL */ +char* +fts_get_parent_table_name( + const char* aux_table_name, + ulint aux_table_len) +{ + fts_aux_table_t aux_table; + char* parent_table_name = NULL; + + if (fts_is_aux_table_name(&aux_table, aux_table_name, aux_table_len)) { + dict_table_t* parent_table; + + parent_table = dict_table_open_on_id( + aux_table.parent_id, TRUE, DICT_TABLE_OP_NORMAL); + + if (parent_table != NULL) { + parent_table_name = mem_strdupl( + parent_table->name, + strlen(parent_table->name)); + + dict_table_close(parent_table, TRUE, FALSE); + } + } + + return(parent_table_name); +} + /** Check the validity of the parent table. @param[in] aux_table auxiliary table @return true if it is a valid table or false if it is not */ diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index fd15092d96c..7ba54a1c360 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -16916,7 +16916,12 @@ innodb_internal_table_update( my_free(old); } - fts_internal_tbl_name = *(char**) var_ptr; + fts_internal_tbl_name2 = *(char**) var_ptr; + if (fts_internal_tbl_name2 == NULL) { + fts_internal_tbl_name = const_cast<char*>("default"); + } else { + fts_internal_tbl_name = fts_internal_tbl_name2; + } } /****************************************************************//** @@ -19148,7 +19153,7 @@ static MYSQL_SYSVAR_BOOL(disable_sort_file_cache, srv_disable_sort_file_cache, "Whether to disable OS system file cache for sort I/O", NULL, NULL, FALSE); -static MYSQL_SYSVAR_STR(ft_aux_table, fts_internal_tbl_name, +static MYSQL_SYSVAR_STR(ft_aux_table, fts_internal_tbl_name2, PLUGIN_VAR_NOCMDARG, "FTS internal auxiliary table to be checked", innodb_internal_table_validate, diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc index 5385c7ab920..0ccc9bebf29 100644 --- a/storage/innobase/handler/handler0alter.cc +++ b/storage/innobase/handler/handler0alter.cc @@ -210,7 +210,10 @@ innobase_need_rebuild( const Alter_inplace_info* ha_alter_info, const TABLE* altered_table) { - if (ha_alter_info->handler_flags + Alter_inplace_info::HA_ALTER_FLAGS alter_inplace_flags = + ha_alter_info->handler_flags & ~(INNOBASE_INPLACE_IGNORE); + + if (alter_inplace_flags == Alter_inplace_info::CHANGE_CREATE_OPTION && !(ha_alter_info->create_info->used_fields & (HA_CREATE_USED_ROW_FORMAT @@ -3985,7 +3988,7 @@ err_exit: } if (!(ha_alter_info->handler_flags & INNOBASE_ALTER_DATA) - || (ha_alter_info->handler_flags + || ((ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE) == Alter_inplace_info::CHANGE_CREATE_OPTION && !innobase_need_rebuild(ha_alter_info, table))) { @@ -4159,7 +4162,7 @@ ok_exit: DBUG_RETURN(false); } - if (ha_alter_info->handler_flags + if ((ha_alter_info->handler_flags & ~INNOBASE_INPLACE_IGNORE) == Alter_inplace_info::CHANGE_CREATE_OPTION && !innobase_need_rebuild(ha_alter_info, table)) { goto ok_exit; diff --git a/storage/innobase/handler/i_s.cc b/storage/innobase/handler/i_s.cc index d1e6e3ed808..a73446440aa 100644 --- a/storage/innobase/handler/i_s.cc +++ b/storage/innobase/handler/i_s.cc @@ 
-3959,6 +3959,8 @@ i_s_fts_config_fill( DBUG_RETURN(0); } + DEBUG_SYNC_C("i_s_fts_config_fille_check"); + fields = table->field; /* Prevent DDL to drop fts aux tables. */ diff --git a/storage/innobase/include/fts0fts.h b/storage/innobase/include/fts0fts.h index 87b5787d416..3e2f359bbeb 100644 --- a/storage/innobase/include/fts0fts.h +++ b/storage/innobase/include/fts0fts.h @@ -375,6 +375,7 @@ extern bool fts_need_sync; /** Variable specifying the table that has Fulltext index to display its content through information schema table */ extern char* fts_internal_tbl_name; +extern char* fts_internal_tbl_name2; #define fts_que_graph_free(graph) \ do { \ @@ -823,6 +824,15 @@ void fts_drop_orphaned_tables(void); /*==========================*/ +/* Get parent table name if it's a fts aux table +@param[in] aux_table_name aux table name +@param[in] aux_table_len aux table length +@return parent table name, or NULL */ +char* +fts_get_parent_table_name( + const char* aux_table_name, + ulint aux_table_len); + /******************************************************************//** Since we do a horizontal split on the index table, we need to drop all the split tables. diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i index 8557f74f756..81190c3ad2e 100644 --- a/storage/innobase/include/univ.i +++ b/storage/innobase/include/univ.i @@ -45,7 +45,7 @@ Created 1/20/1994 Heikki Tuuri #define INNODB_VERSION_MAJOR 5 #define INNODB_VERSION_MINOR 6 -#define INNODB_VERSION_BUGFIX 32 +#define INNODB_VERSION_BUGFIX 33 /* The following is the InnoDB version as shown in SELECT plugin_version FROM information_schema.plugins; diff --git a/storage/innobase/row/row0log.cc b/storage/innobase/row/row0log.cc index 5e32663ad32..12d4a59da6b 100644 --- a/storage/innobase/row/row0log.cc +++ b/storage/innobase/row/row0log.cc @@ -621,7 +621,7 @@ row_log_table_delete( &old_pk_extra_size); ut_ad(old_pk_extra_size < 0x100); - mrec_size = 4 + old_pk_size; + mrec_size = 6 + old_pk_size; /* Log enough prefix of the BLOB unless both the old and new table are in COMPACT or REDUNDANT format, @@ -651,8 +651,8 @@ row_log_table_delete( *b++ = static_cast<byte>(old_pk_extra_size); /* Log the size of external prefix we saved */ - mach_write_to_2(b, ext_size); - b += 2; + mach_write_to_4(b, ext_size); + b += 4; rec_convert_dtuple_to_temp( b + old_pk_extra_size, new_index, @@ -2276,14 +2276,14 @@ row_log_table_apply_op( break; case ROW_T_DELETE: - /* 1 (extra_size) + 2 (ext_size) + at least 1 (payload) */ - if (mrec + 4 >= mrec_end) { + /* 1 (extra_size) + 4 (ext_size) + at least 1 (payload) */ + if (mrec + 6 >= mrec_end) { return(NULL); } extra_size = *mrec++; - ext_size = mach_read_from_2(mrec); - mrec += 2; + ext_size = mach_read_from_4(mrec); + mrec += 4; ut_ad(mrec < mrec_end); /* We assume extra_size < 0x100 for the PRIMARY KEY prefix. 
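The two row0log.cc hunks above have to stay in lockstep: row_log_table_delete now writes the saved external-prefix size with mach_write_to_4 where it used to use mach_write_to_2, and row_log_table_apply_op reads it back with mach_read_from_4, which is why the minimum ROW_T_DELETE record grows from 4 to 6 bytes (1 for extra_size, 4 for ext_size, at least 1 of payload). A minimal sketch of that agreed layout, not taken from the patch itself (the variable names simply mirror the hunks; mach_write_to_4/mach_read_from_4 are the existing InnoDB byte-order helpers):

    /* writer side (row_log_table_delete) */
    *b++ = static_cast<byte>(old_pk_extra_size);  /* 1 byte, asserted < 0x100   */
    mach_write_to_4(b, ext_size);                 /* widened from 2 to 4 bytes  */
    b += 4;
    /* ... old PK record follows (at least 1 byte) */

    /* reader side (row_log_table_apply_op, ROW_T_DELETE) */
    if (mrec + 6 >= mrec_end) {                   /* 1 + 4 + at least 1         */
        return(NULL);
    }
    extra_size = *mrec++;
    ext_size   = mach_read_from_4(mrec);
    mrec += 4;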
diff --git a/storage/innobase/row/row0mysql.cc b/storage/innobase/row/row0mysql.cc index b2c96a7ed7b..35011247105 100644 --- a/storage/innobase/row/row0mysql.cc +++ b/storage/innobase/row/row0mysql.cc @@ -2734,6 +2734,10 @@ loop: return(n_tables + n_tables_dropped); } + DBUG_EXECUTE_IF("row_drop_tables_in_background_sleep", + os_thread_sleep(5000000); + ); + table = dict_table_open_on_name(drop->table_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE); @@ -2744,6 +2748,16 @@ loop: goto already_dropped; } + if (!table->to_be_dropped) { + /* There is a scenario: the old table is dropped + just after it's added into drop list, and new + table with the same name is created, then we try + to drop the new table in background. */ + dict_table_close(table, FALSE, FALSE); + + goto already_dropped; + } + ut_a(!table->can_be_evicted); dict_table_close(table, FALSE, FALSE); @@ -4075,6 +4089,13 @@ row_drop_table_for_mysql( } } + + DBUG_EXECUTE_IF("row_drop_table_add_to_background", + row_add_table_to_background_drop_list(table->name); + err = DB_SUCCESS; + goto funct_exit; + ); + /* TODO: could we replace the counter n_foreign_key_checks_running with lock checks on the table? Acquire here an exclusive lock on the table, and rewrite lock0lock.cc and the lock wait in srv0srv.cc so that @@ -4700,6 +4721,19 @@ loop: row_mysql_lock_data_dictionary(trx); while ((table_name = dict_get_first_table_name_in_db(name))) { + /* Drop parent table if it is a fts aux table, to + avoid accessing dropped fts aux tables in information + scheam when parent table still exists. + Note: Drop parent table will drop fts aux tables. */ + char* parent_table_name; + parent_table_name = fts_get_parent_table_name( + table_name, strlen(table_name)); + + if (parent_table_name != NULL) { + mem_free(table_name); + table_name = parent_table_name; + } + ut_a(memcmp(table_name, name, namelen) == 0); table = dict_table_open_on_name( diff --git a/storage/perfschema/ha_perfschema.cc b/storage/perfschema/ha_perfschema.cc index 7c85431c57c..50d91842344 100644 --- a/storage/perfschema/ha_perfschema.cc +++ b/storage/perfschema/ha_perfschema.cc @@ -225,7 +225,7 @@ maria_declare_plugin(perfschema) 0x0001, pfs_status_vars, NULL, - "5.6.32", + "5.6.33", MariaDB_PLUGIN_MATURITY_STABLE } maria_declare_plugin_end; diff --git a/storage/tokudb/CMakeLists.txt b/storage/tokudb/CMakeLists.txt index 765e6733a98..53a4a675bbf 100644 --- a/storage/tokudb/CMakeLists.txt +++ b/storage/tokudb/CMakeLists.txt @@ -1,4 +1,4 @@ -SET(TOKUDB_VERSION 5.6.31-77.0) +SET(TOKUDB_VERSION 5.6.32-78.1) # PerconaFT only supports x86-64 and cmake-2.8.9+ IF(CMAKE_VERSION VERSION_LESS "2.8.9") MESSAGE(STATUS "CMake 2.8.9 or higher is required by TokuDB") diff --git a/storage/tokudb/PerconaFT/buildheader/make_tdb.cc b/storage/tokudb/PerconaFT/buildheader/make_tdb.cc index 4b62703480f..576f902f6ae 100644 --- a/storage/tokudb/PerconaFT/buildheader/make_tdb.cc +++ b/storage/tokudb/PerconaFT/buildheader/make_tdb.cc @@ -367,8 +367,8 @@ static void print_db_env_struct (void) { "int (*checkpointing_get_period) (DB_ENV*, uint32_t*) /* Retrieve the delay between automatic checkpoints. 0 means disabled. */", "int (*cleaner_set_period) (DB_ENV*, uint32_t) /* Change the delay between automatic cleaner attempts. 0 means disabled. */", "int (*cleaner_get_period) (DB_ENV*, uint32_t*) /* Retrieve the delay between automatic cleaner attempts. 0 means disabled. */", - "int (*cleaner_set_iterations) (DB_ENV*, uint32_t) /* Change the number of attempts on each cleaner invokation. 0 means disabled. 
*/", - "int (*cleaner_get_iterations) (DB_ENV*, uint32_t*) /* Retrieve the number of attempts on each cleaner invokation. 0 means disabled. */", + "int (*cleaner_set_iterations) (DB_ENV*, uint32_t) /* Change the number of attempts on each cleaner invocation. 0 means disabled. */", + "int (*cleaner_get_iterations) (DB_ENV*, uint32_t*) /* Retrieve the number of attempts on each cleaner invocation. 0 means disabled. */", "int (*evictor_set_enable_partial_eviction) (DB_ENV*, bool) /* Enables or disabled partial eviction of nodes from cachetable. */", "int (*evictor_get_enable_partial_eviction) (DB_ENV*, bool*) /* Retrieve the status of partial eviction of nodes from cachetable. */", "int (*checkpointing_postpone) (DB_ENV*) /* Use for 'rename table' or any other operation that must be disjoint from a checkpoint */", diff --git a/storage/tokudb/PerconaFT/cmake_modules/TokuSetupCompiler.cmake b/storage/tokudb/PerconaFT/cmake_modules/TokuSetupCompiler.cmake index 77f6d8f67b7..cce12d575bf 100644 --- a/storage/tokudb/PerconaFT/cmake_modules/TokuSetupCompiler.cmake +++ b/storage/tokudb/PerconaFT/cmake_modules/TokuSetupCompiler.cmake @@ -71,6 +71,7 @@ set_cflags_if_supported( -Wno-pointer-bool-conversion -fno-rtti -fno-exceptions + -Wno-error=nonnull-compare ) ## set_cflags_if_supported_named("-Weffc++" -Weffcpp) diff --git a/storage/tokudb/PerconaFT/ft/CMakeLists.txt b/storage/tokudb/PerconaFT/ft/CMakeLists.txt index 11091073ac2..6696c26ecc0 100644 --- a/storage/tokudb/PerconaFT/ft/CMakeLists.txt +++ b/storage/tokudb/PerconaFT/ft/CMakeLists.txt @@ -55,8 +55,8 @@ set(FT_SOURCES msg_buffer node pivotkeys + serialize/rbtree_mhs serialize/block_allocator - serialize/block_allocator_strategy serialize/block_table serialize/compress serialize/ft_node-serialize diff --git a/storage/tokudb/PerconaFT/ft/ft-flusher.cc b/storage/tokudb/PerconaFT/ft/ft-flusher.cc index fb456ea6a18..e6452f60cfc 100644 --- a/storage/tokudb/PerconaFT/ft/ft-flusher.cc +++ b/storage/tokudb/PerconaFT/ft/ft-flusher.cc @@ -496,7 +496,7 @@ handle_split_of_child( // We never set the rightmost blocknum to be the root. // Instead, we wait for the root to split and let promotion initialize the rightmost - // blocknum to be the first non-root leaf node on the right extreme to recieve an insert. + // blocknum to be the first non-root leaf node on the right extreme to receive an insert. 
BLOCKNUM rightmost_blocknum = toku_unsafe_fetch(&ft->rightmost_blocknum); invariant(ft->h->root_blocknum.b != rightmost_blocknum.b); if (childa->blocknum.b == rightmost_blocknum.b) { @@ -1470,7 +1470,7 @@ void toku_ft_flush_some_child(FT ft, FTNODE parent, struct flusher_advice *fa) // It is possible after reading in the entire child, // that we now know that the child is not reactive // if so, we can unpin parent right now - // we wont be splitting/merging child + // we won't be splitting/merging child // and we have already replaced the bnc // for the root with a fresh one enum reactivity child_re = toku_ftnode_get_reactivity(ft, child); diff --git a/storage/tokudb/PerconaFT/ft/ft-ops.cc b/storage/tokudb/PerconaFT/ft/ft-ops.cc index 8f61bc67339..f131668889e 100644 --- a/storage/tokudb/PerconaFT/ft/ft-ops.cc +++ b/storage/tokudb/PerconaFT/ft/ft-ops.cc @@ -598,15 +598,12 @@ void toku_ftnode_checkpoint_complete_callback(void *value_data) { } } -void toku_ftnode_clone_callback( - void* value_data, - void** cloned_value_data, - long* clone_size, - PAIR_ATTR* new_attr, - bool for_checkpoint, - void* write_extraargs - ) -{ +void toku_ftnode_clone_callback(void *value_data, + void **cloned_value_data, + long *clone_size, + PAIR_ATTR *new_attr, + bool for_checkpoint, + void *write_extraargs) { FTNODE node = static_cast<FTNODE>(value_data); toku_ftnode_assert_fully_in_memory(node); FT ft = static_cast<FT>(write_extraargs); @@ -618,13 +615,16 @@ void toku_ftnode_clone_callback( toku_ftnode_leaf_rebalance(node, ft->h->basementnodesize); } - cloned_node->oldest_referenced_xid_known = node->oldest_referenced_xid_known; - cloned_node->max_msn_applied_to_node_on_disk = node->max_msn_applied_to_node_on_disk; + cloned_node->oldest_referenced_xid_known = + node->oldest_referenced_xid_known; + cloned_node->max_msn_applied_to_node_on_disk = + node->max_msn_applied_to_node_on_disk; cloned_node->flags = node->flags; cloned_node->blocknum = node->blocknum; cloned_node->layout_version = node->layout_version; cloned_node->layout_version_original = node->layout_version_original; - cloned_node->layout_version_read_from_disk = node->layout_version_read_from_disk; + cloned_node->layout_version_read_from_disk = + node->layout_version_read_from_disk; cloned_node->build_id = node->build_id; cloned_node->height = node->height; cloned_node->dirty = node->dirty; @@ -649,38 +649,39 @@ void toku_ftnode_clone_callback( // set new pair attr if necessary if (node->height == 0) { *new_attr = make_ftnode_pair_attr(node); - } - else { + for (int i = 0; i < node->n_children; i++) { + BLB(node, i)->logical_rows_delta = 0; + BLB(cloned_node, i)->logical_rows_delta = 0; + } + } else { new_attr->is_valid = false; } *clone_size = ftnode_memory_size(cloned_node); *cloned_value_data = cloned_node; } -void toku_ftnode_flush_callback( - CACHEFILE UU(cachefile), - int fd, - BLOCKNUM blocknum, - void *ftnode_v, - void** disk_data, - void *extraargs, - PAIR_ATTR size __attribute__((unused)), - PAIR_ATTR* new_size, - bool write_me, - bool keep_me, - bool for_checkpoint, - bool is_clone - ) -{ - FT ft = (FT) extraargs; - FTNODE ftnode = (FTNODE) ftnode_v; - FTNODE_DISK_DATA* ndd = (FTNODE_DISK_DATA*)disk_data; +void toku_ftnode_flush_callback(CACHEFILE UU(cachefile), + int fd, + BLOCKNUM blocknum, + void *ftnode_v, + void **disk_data, + void *extraargs, + PAIR_ATTR size __attribute__((unused)), + PAIR_ATTR *new_size, + bool write_me, + bool keep_me, + bool for_checkpoint, + bool is_clone) { + FT ft = (FT)extraargs; + FTNODE ftnode = 
(FTNODE)ftnode_v; + FTNODE_DISK_DATA *ndd = (FTNODE_DISK_DATA *)disk_data; assert(ftnode->blocknum.b == blocknum.b); int height = ftnode->height; if (write_me) { toku_ftnode_assert_fully_in_memory(ftnode); if (height > 0 && !is_clone) { - // cloned nodes already had their stale messages moved, see toku_ftnode_clone_callback() + // cloned nodes already had their stale messages moved, see + // toku_ftnode_clone_callback() toku_move_ftnode_messages_to_stale(ft, ftnode); } else if (height == 0) { toku_ftnode_leaf_run_gc(ft, ftnode); @@ -688,7 +689,8 @@ void toku_ftnode_flush_callback( toku_ftnode_update_disk_stats(ftnode, ft, for_checkpoint); } } - int r = toku_serialize_ftnode_to(fd, ftnode->blocknum, ftnode, ndd, !is_clone, ft, for_checkpoint); + int r = toku_serialize_ftnode_to( + fd, ftnode->blocknum, ftnode, ndd, !is_clone, ft, for_checkpoint); assert_zero(r); ftnode->layout_version_read_from_disk = FT_LAYOUT_VERSION; } @@ -703,20 +705,22 @@ void toku_ftnode_flush_callback( FT_STATUS_INC(FT_FULL_EVICTIONS_NONLEAF_BYTES, node_size); } toku_free(*disk_data); - } - else { + } else { if (ftnode->height == 0) { for (int i = 0; i < ftnode->n_children; i++) { - if (BP_STATE(ftnode,i) == PT_AVAIL) { + if (BP_STATE(ftnode, i) == PT_AVAIL) { BASEMENTNODE bn = BLB(ftnode, i); - toku_ft_decrease_stats(&ft->in_memory_stats, bn->stat64_delta); + toku_ft_decrease_stats(&ft->in_memory_stats, + bn->stat64_delta); + if (!ftnode->dirty) + toku_ft_adjust_logical_row_count( + ft, -bn->logical_rows_delta); } } } } toku_ftnode_free(&ftnode); - } - else { + } else { *new_size = make_ftnode_pair_attr(ftnode); } } @@ -845,10 +849,13 @@ static void compress_internal_node_partition(FTNODE node, int i, enum toku_compr } // callback for partially evicting a node -int toku_ftnode_pe_callback(void *ftnode_pv, PAIR_ATTR old_attr, void *write_extraargs, - void (*finalize)(PAIR_ATTR new_attr, void *extra), void *finalize_extra) { - FTNODE node = (FTNODE) ftnode_pv; - FT ft = (FT) write_extraargs; +int toku_ftnode_pe_callback(void *ftnode_pv, + PAIR_ATTR old_attr, + void *write_extraargs, + void (*finalize)(PAIR_ATTR new_attr, void *extra), + void *finalize_extra) { + FTNODE node = (FTNODE)ftnode_pv; + FT ft = (FT)write_extraargs; int num_partial_evictions = 0; // Hold things we intend to destroy here. @@ -866,7 +873,8 @@ int toku_ftnode_pe_callback(void *ftnode_pv, PAIR_ATTR old_attr, void *write_ext } // Don't partially evict nodes whose partitions can't be read back // from disk individually - if (node->layout_version_read_from_disk < FT_FIRST_LAYOUT_VERSION_WITH_BASEMENT_NODES) { + if (node->layout_version_read_from_disk < + FT_FIRST_LAYOUT_VERSION_WITH_BASEMENT_NODES) { goto exit; } // @@ -874,77 +882,77 @@ int toku_ftnode_pe_callback(void *ftnode_pv, PAIR_ATTR old_attr, void *write_ext // if (node->height > 0) { for (int i = 0; i < node->n_children; i++) { - if (BP_STATE(node,i) == PT_AVAIL) { - if (BP_SHOULD_EVICT(node,i)) { + if (BP_STATE(node, i) == PT_AVAIL) { + if (BP_SHOULD_EVICT(node, i)) { NONLEAF_CHILDINFO bnc = BNC(node, i); if (ft_compress_buffers_before_eviction && - // We may not serialize and compress a partition in memory if its - // in memory layout version is different than what's on disk (and - // therefore requires upgrade). + // We may not serialize and compress a partition in + // memory if its in memory layout version is different + // than what's on disk (and therefore requires upgrade). 
// - // Auto-upgrade code assumes that if a node's layout version read - // from disk is not current, it MUST require upgrade. Breaking - // this rule would cause upgrade code to upgrade this partition - // again after we serialize it as the current version, which is bad. - node->layout_version == node->layout_version_read_from_disk) { + // Auto-upgrade code assumes that if a node's layout + // version read from disk is not current, it MUST + // require upgrade. + // Breaking this rule would cause upgrade code to + // upgrade this partition again after we serialize it as + // the current version, which is bad. + node->layout_version == + node->layout_version_read_from_disk) { toku_ft_bnc_move_messages_to_stale(ft, bnc); compress_internal_node_partition( node, i, // Always compress with quicklz - TOKU_QUICKLZ_METHOD - ); + TOKU_QUICKLZ_METHOD); } else { // We're not compressing buffers before eviction. Simply - // detach the buffer and set the child's state to on-disk. + // detach the buffer and set the child's state to + // on-disk. set_BNULL(node, i); BP_STATE(node, i) = PT_ON_DISK; } buffers_to_destroy[num_buffers_to_destroy++] = bnc; num_partial_evictions++; + } else { + BP_SWEEP_CLOCK(node, i); } - else { - BP_SWEEP_CLOCK(node,i); - } - } - else { + } else { continue; } } - } - // - // partial eviction strategy for basement nodes: - // if the bn is compressed, evict it - // else: check if it requires eviction, if it does, evict it, if not, sweep the clock count - // - else { + } else { + // + // partial eviction strategy for basement nodes: + // if the bn is compressed, evict it + // else: check if it requires eviction, if it does, evict it, if not, + // sweep the clock count + // for (int i = 0; i < node->n_children; i++) { // Get rid of compressed stuff no matter what. 
- if (BP_STATE(node,i) == PT_COMPRESSED) { + if (BP_STATE(node, i) == PT_COMPRESSED) { SUB_BLOCK sb = BSB(node, i); pointers_to_free[num_pointers_to_free++] = sb->compressed_ptr; pointers_to_free[num_pointers_to_free++] = sb; set_BNULL(node, i); - BP_STATE(node,i) = PT_ON_DISK; + BP_STATE(node, i) = PT_ON_DISK; num_partial_evictions++; - } - else if (BP_STATE(node,i) == PT_AVAIL) { - if (BP_SHOULD_EVICT(node,i)) { + } else if (BP_STATE(node, i) == PT_AVAIL) { + if (BP_SHOULD_EVICT(node, i)) { BASEMENTNODE bn = BLB(node, i); basements_to_destroy[num_basements_to_destroy++] = bn; - toku_ft_decrease_stats(&ft->in_memory_stats, bn->stat64_delta); + toku_ft_decrease_stats(&ft->in_memory_stats, + bn->stat64_delta); + toku_ft_adjust_logical_row_count(ft, + -bn->logical_rows_delta); set_BNULL(node, i); BP_STATE(node, i) = PT_ON_DISK; num_partial_evictions++; + } else { + BP_SWEEP_CLOCK(node, i); } - else { - BP_SWEEP_CLOCK(node,i); - } - } - else if (BP_STATE(node,i) == PT_ON_DISK) { + } else if (BP_STATE(node, i) == PT_ON_DISK) { continue; - } - else { + } else { abort(); } } @@ -2378,12 +2386,16 @@ ft_send_update_msg(FT_HANDLE ft_h, const ft_msg &msg, TOKUTXN txn) { toku_ft_root_put_msg(ft_h->ft, msg, &gc_info); } -void toku_ft_maybe_update(FT_HANDLE ft_h, const DBT *key, const DBT *update_function_extra, - TOKUTXN txn, bool oplsn_valid, LSN oplsn, - bool do_logging) { +void toku_ft_maybe_update(FT_HANDLE ft_h, + const DBT *key, + const DBT *update_function_extra, + TOKUTXN txn, + bool oplsn_valid, + LSN oplsn, + bool do_logging) { TXNID_PAIR xid = toku_txn_get_txnid(txn); if (txn) { - BYTESTRING keybs = { key->size, (char *) key->data }; + BYTESTRING keybs = {key->size, (char *)key->data}; toku_logger_save_rollback_cmdupdate( txn, toku_cachefile_filenum(ft_h->ft->cf), &keybs); toku_txn_maybe_note_ft(txn, ft_h->ft); @@ -2392,22 +2404,33 @@ void toku_ft_maybe_update(FT_HANDLE ft_h, const DBT *key, const DBT *update_func TOKULOGGER logger; logger = toku_txn_logger(txn); if (do_logging && logger) { - BYTESTRING keybs = {.len=key->size, .data=(char *) key->data}; - BYTESTRING extrabs = {.len=update_function_extra->size, - .data = (char *) update_function_extra->data}; - toku_log_enq_update(logger, NULL, 0, txn, - toku_cachefile_filenum(ft_h->ft->cf), - xid, keybs, extrabs); + BYTESTRING keybs = {.len = key->size, .data = (char *)key->data}; + BYTESTRING extrabs = {.len = update_function_extra->size, + .data = (char *)update_function_extra->data}; + toku_log_enq_update(logger, + NULL, + 0, + txn, + toku_cachefile_filenum(ft_h->ft->cf), + xid, + keybs, + extrabs); } LSN treelsn; - if (oplsn_valid && oplsn.lsn <= (treelsn = toku_ft_checkpoint_lsn(ft_h->ft)).lsn) { + if (oplsn_valid && + oplsn.lsn <= (treelsn = toku_ft_checkpoint_lsn(ft_h->ft)).lsn) { // do nothing } else { - XIDS message_xids = txn ? toku_txn_get_xids(txn) : toku_xids_get_root_xids(); - ft_msg msg(key, update_function_extra, FT_UPDATE, ZERO_MSN, message_xids); + XIDS message_xids = + txn ? 
toku_txn_get_xids(txn) : toku_xids_get_root_xids(); + ft_msg msg( + key, update_function_extra, FT_UPDATE, ZERO_MSN, message_xids); ft_send_update_msg(ft_h, msg, txn); } + // updates get converted to insert messages, which should do a -1 on the + // logical row count when the messages are permanently applied + toku_ft_adjust_logical_row_count(ft_h->ft, 1); } void toku_ft_maybe_update_broadcast(FT_HANDLE ft_h, const DBT *update_function_extra, diff --git a/storage/tokudb/PerconaFT/ft/ft-recount-rows.cc b/storage/tokudb/PerconaFT/ft/ft-recount-rows.cc index adac96f4882..e31d80772d5 100644 --- a/storage/tokudb/PerconaFT/ft/ft-recount-rows.cc +++ b/storage/tokudb/PerconaFT/ft/ft-recount-rows.cc @@ -73,30 +73,20 @@ static bool recount_rows_interrupt(void* extra, uint64_t deleted_rows) { return rre->_cancelled = rre->_progress_callback(rre->_keys, deleted_rows, rre->_progress_extra); } -int toku_ft_recount_rows( - FT_HANDLE ft, - int (*progress_callback)( - uint64_t count, - uint64_t deleted, - void* progress_extra), - void* progress_extra) { - +int toku_ft_recount_rows(FT_HANDLE ft, + int (*progress_callback)(uint64_t count, + uint64_t deleted, + void* progress_extra), + void* progress_extra) { int ret = 0; - recount_rows_extra_t rre = { - progress_callback, - progress_extra, - 0, - false - }; + recount_rows_extra_t rre = {progress_callback, progress_extra, 0, false}; ft_cursor c; ret = toku_ft_cursor_create(ft, &c, nullptr, C_READ_ANY, false, false); - if (ret) return ret; + if (ret) + return ret; - toku_ft_cursor_set_check_interrupt_cb( - &c, - recount_rows_interrupt, - &rre); + toku_ft_cursor_set_check_interrupt_cb(&c, recount_rows_interrupt, &rre); ret = toku_ft_cursor_first(&c, recount_rows_found, &rre); while (FT_LIKELY(ret == 0)) { @@ -108,6 +98,7 @@ int toku_ft_recount_rows( if (rre._cancelled == false) { // update ft count toku_unsafe_set(&ft->ft->in_memory_logical_rows, rre._keys); + ft->ft->h->dirty = 1; ret = 0; } diff --git a/storage/tokudb/PerconaFT/ft/ft.cc b/storage/tokudb/PerconaFT/ft/ft.cc index 93d21233bf7..699fcc57603 100644 --- a/storage/tokudb/PerconaFT/ft/ft.cc +++ b/storage/tokudb/PerconaFT/ft/ft.cc @@ -903,6 +903,9 @@ void toku_ft_adjust_logical_row_count(FT ft, int64_t delta) { // must be returned in toku_ft_stat64. if (delta != 0 && ft->in_memory_logical_rows != (uint64_t)-1) { toku_sync_fetch_and_add(&(ft->in_memory_logical_rows), delta); + if (ft->in_memory_logical_rows == (uint64_t)-1) { + toku_sync_fetch_and_add(&(ft->in_memory_logical_rows), 1); + } } } diff --git a/storage/tokudb/PerconaFT/ft/loader/loader-internal.h b/storage/tokudb/PerconaFT/ft/loader/loader-internal.h index dd070373e26..1aa2c203831 100644 --- a/storage/tokudb/PerconaFT/ft/loader/loader-internal.h +++ b/storage/tokudb/PerconaFT/ft/loader/loader-internal.h @@ -301,7 +301,7 @@ int toku_ft_loader_internal_init (/* out */ FTLOADER *blp, void toku_ft_loader_internal_destroy (FTLOADER bl, bool is_error); -// For test purposes only. (In production, the rowset size is determined by negotation with the cachetable for some memory. See #2613.) +// For test purposes only. (In production, the rowset size is determined by negotiation with the cachetable for some memory. See #2613.) 
uint64_t toku_ft_loader_get_rowset_budget_for_testing (void); int toku_ft_loader_finish_extractor(FTLOADER bl); diff --git a/storage/tokudb/PerconaFT/ft/loader/loader.cc b/storage/tokudb/PerconaFT/ft/loader/loader.cc index 20f9363da1e..528c86a8f79 100644 --- a/storage/tokudb/PerconaFT/ft/loader/loader.cc +++ b/storage/tokudb/PerconaFT/ft/loader/loader.cc @@ -91,7 +91,7 @@ toku_ft_loader_set_size_factor(uint32_t factor) { uint64_t toku_ft_loader_get_rowset_budget_for_testing (void) -// For test purposes only. In production, the rowset size is determined by negotation with the cachetable for some memory. (See #2613). +// For test purposes only. In production, the rowset size is determined by negotiation with the cachetable for some memory. (See #2613). { return 16ULL*size_factor*1024ULL; } diff --git a/storage/tokudb/PerconaFT/ft/node.cc b/storage/tokudb/PerconaFT/ft/node.cc index 58ba675eb7c..12e5fda226e 100644 --- a/storage/tokudb/PerconaFT/ft/node.cc +++ b/storage/tokudb/PerconaFT/ft/node.cc @@ -373,52 +373,48 @@ find_bounds_within_message_tree( } } -/** - * For each message in the ancestor's buffer (determined by childnum) that - * is key-wise between lower_bound_exclusive and upper_bound_inclusive, - * apply the message to the basement node. We treat the bounds as minus - * or plus infinity respectively if they are NULL. Do not mark the node - * as dirty (preserve previous state of 'dirty' bit). - */ +// For each message in the ancestor's buffer (determined by childnum) that +// is key-wise between lower_bound_exclusive and upper_bound_inclusive, +// apply the message to the basement node. We treat the bounds as minus +// or plus infinity respectively if they are NULL. Do not mark the node +// as dirty (preserve previous state of 'dirty' bit). static void bnc_apply_messages_to_basement_node( - FT_HANDLE t, // used for comparison function - BASEMENTNODE bn, // where to apply messages + FT_HANDLE t, // used for comparison function + BASEMENTNODE bn, // where to apply messages FTNODE ancestor, // the ancestor node where we can find messages to apply - int childnum, // which child buffer of ancestor contains messages we want - const pivot_bounds &bounds, // contains pivot key bounds of this basement node - txn_gc_info* gc_info, - bool* msgs_applied) { - + int childnum, // which child buffer of ancestor contains messages we want + const pivot_bounds & + bounds, // contains pivot key bounds of this basement node + txn_gc_info *gc_info, + bool *msgs_applied) { int r; NONLEAF_CHILDINFO bnc = BNC(ancestor, childnum); // Determine the offsets in the message trees between which we need to // apply messages from this buffer - STAT64INFO_S stats_delta = {0,0}; + STAT64INFO_S stats_delta = {0, 0}; uint64_t workdone_this_ancestor = 0; int64_t logical_rows_delta = 0; uint32_t stale_lbi, stale_ube; if (!bn->stale_ancestor_messages_applied) { - find_bounds_within_message_tree( - t->ft->cmp, - bnc->stale_message_tree, - &bnc->msg_buffer, - bounds, - &stale_lbi, - &stale_ube); + find_bounds_within_message_tree(t->ft->cmp, + bnc->stale_message_tree, + &bnc->msg_buffer, + bounds, + &stale_lbi, + &stale_ube); } else { stale_lbi = 0; stale_ube = 0; } uint32_t fresh_lbi, fresh_ube; - find_bounds_within_message_tree( - t->ft->cmp, - bnc->fresh_message_tree, - &bnc->msg_buffer, - bounds, - &fresh_lbi, - &fresh_ube); + find_bounds_within_message_tree(t->ft->cmp, + bnc->fresh_message_tree, + &bnc->msg_buffer, + bounds, + &fresh_lbi, + &fresh_ube); // We now know where all the messages we must apply are, so one of 
the // following 4 cases will do the application, depending on which of @@ -432,44 +428,53 @@ static void bnc_apply_messages_to_basement_node( // We have messages in multiple trees, so we grab all // the relevant messages' offsets and sort them by MSN, then apply // them in MSN order. - const int buffer_size = ((stale_ube - stale_lbi) + - (fresh_ube - fresh_lbi) + - bnc->broadcast_list.size()); + const int buffer_size = + ((stale_ube - stale_lbi) + (fresh_ube - fresh_lbi) + + bnc->broadcast_list.size()); toku::scoped_malloc offsets_buf(buffer_size * sizeof(int32_t)); int32_t *offsets = reinterpret_cast<int32_t *>(offsets_buf.get()); - struct store_msg_buffer_offset_extra sfo_extra = { .offsets = offsets, .i = 0 }; + struct store_msg_buffer_offset_extra sfo_extra = {.offsets = offsets, + .i = 0}; // Populate offsets array with offsets to stale messages - r = bnc->stale_message_tree.iterate_on_range<struct store_msg_buffer_offset_extra, store_msg_buffer_offset>(stale_lbi, stale_ube, &sfo_extra); + r = bnc->stale_message_tree + .iterate_on_range<struct store_msg_buffer_offset_extra, + store_msg_buffer_offset>( + stale_lbi, stale_ube, &sfo_extra); assert_zero(r); // Then store fresh offsets, and mark them to be moved to stale later. - r = bnc->fresh_message_tree.iterate_and_mark_range<struct store_msg_buffer_offset_extra, store_msg_buffer_offset>(fresh_lbi, fresh_ube, &sfo_extra); + r = bnc->fresh_message_tree + .iterate_and_mark_range<struct store_msg_buffer_offset_extra, + store_msg_buffer_offset>( + fresh_lbi, fresh_ube, &sfo_extra); assert_zero(r); // Store offsets of all broadcast messages. - r = bnc->broadcast_list.iterate<struct store_msg_buffer_offset_extra, store_msg_buffer_offset>(&sfo_extra); + r = bnc->broadcast_list.iterate<struct store_msg_buffer_offset_extra, + store_msg_buffer_offset>(&sfo_extra); assert_zero(r); invariant(sfo_extra.i == buffer_size); // Sort by MSN. - toku::sort<int32_t, message_buffer, msg_buffer_offset_msn_cmp>::mergesort_r(offsets, buffer_size, bnc->msg_buffer); + toku::sort<int32_t, message_buffer, msg_buffer_offset_msn_cmp>:: + mergesort_r(offsets, buffer_size, bnc->msg_buffer); // Apply the messages in MSN order. for (int i = 0; i < buffer_size; ++i) { *msgs_applied = true; - do_bn_apply_msg( - t, - bn, - &bnc->msg_buffer, - offsets[i], - gc_info, - &workdone_this_ancestor, - &stats_delta, - &logical_rows_delta); + do_bn_apply_msg(t, + bn, + &bnc->msg_buffer, + offsets[i], + gc_info, + &workdone_this_ancestor, + &stats_delta, + &logical_rows_delta); } } else if (stale_lbi == stale_ube) { - // No stale messages to apply, we just apply fresh messages, and mark them to be moved to stale later. + // No stale messages to apply, we just apply fresh messages, and mark + // them to be moved to stale later. 
struct iterate_do_bn_apply_msg_extra iter_extra = { .t = t, .bn = bn, @@ -477,16 +482,20 @@ static void bnc_apply_messages_to_basement_node( .gc_info = gc_info, .workdone = &workdone_this_ancestor, .stats_to_update = &stats_delta, - .logical_rows_delta = &logical_rows_delta - }; - if (fresh_ube - fresh_lbi > 0) *msgs_applied = true; - r = bnc->fresh_message_tree.iterate_and_mark_range<struct iterate_do_bn_apply_msg_extra, iterate_do_bn_apply_msg>(fresh_lbi, fresh_ube, &iter_extra); + .logical_rows_delta = &logical_rows_delta}; + if (fresh_ube - fresh_lbi > 0) + *msgs_applied = true; + r = bnc->fresh_message_tree + .iterate_and_mark_range<struct iterate_do_bn_apply_msg_extra, + iterate_do_bn_apply_msg>( + fresh_lbi, fresh_ube, &iter_extra); assert_zero(r); } else { invariant(fresh_lbi == fresh_ube); // No fresh messages to apply, we just apply stale messages. - if (stale_ube - stale_lbi > 0) *msgs_applied = true; + if (stale_ube - stale_lbi > 0) + *msgs_applied = true; struct iterate_do_bn_apply_msg_extra iter_extra = { .t = t, .bn = bn, @@ -494,22 +503,26 @@ static void bnc_apply_messages_to_basement_node( .gc_info = gc_info, .workdone = &workdone_this_ancestor, .stats_to_update = &stats_delta, - .logical_rows_delta = &logical_rows_delta - }; + .logical_rows_delta = &logical_rows_delta}; - r = bnc->stale_message_tree.iterate_on_range<struct iterate_do_bn_apply_msg_extra, iterate_do_bn_apply_msg>(stale_lbi, stale_ube, &iter_extra); + r = bnc->stale_message_tree + .iterate_on_range<struct iterate_do_bn_apply_msg_extra, + iterate_do_bn_apply_msg>( + stale_lbi, stale_ube, &iter_extra); assert_zero(r); } // // update stats // if (workdone_this_ancestor > 0) { - (void) toku_sync_fetch_and_add(&BP_WORKDONE(ancestor, childnum), workdone_this_ancestor); + (void)toku_sync_fetch_and_add(&BP_WORKDONE(ancestor, childnum), + workdone_this_ancestor); } if (stats_delta.numbytes || stats_delta.numrows) { toku_ft_update_stats(&t->ft->in_memory_stats, stats_delta); } toku_ft_adjust_logical_row_count(t->ft, logical_rows_delta); + bn->logical_rows_delta += logical_rows_delta; } static void diff --git a/storage/tokudb/PerconaFT/ft/node.h b/storage/tokudb/PerconaFT/ft/node.h index ad0298e81c5..52eefec0936 100644 --- a/storage/tokudb/PerconaFT/ft/node.h +++ b/storage/tokudb/PerconaFT/ft/node.h @@ -199,6 +199,7 @@ struct ftnode_leaf_basement_node { MSN max_msn_applied; // max message sequence number applied bool stale_ancestor_messages_applied; STAT64INFO_S stat64_delta; // change in stat64 counters since basement was last written to disk + int64_t logical_rows_delta; }; typedef struct ftnode_leaf_basement_node *BASEMENTNODE; diff --git a/storage/tokudb/PerconaFT/ft/serialize/block_allocator.cc b/storage/tokudb/PerconaFT/ft/serialize/block_allocator.cc index 1355f3739ee..19811373d16 100644 --- a/storage/tokudb/PerconaFT/ft/serialize/block_allocator.cc +++ b/storage/tokudb/PerconaFT/ft/serialize/block_allocator.cc @@ -46,415 +46,214 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. 
#include "portability/toku_stdlib.h" #include "ft/serialize/block_allocator.h" -#include "ft/serialize/block_allocator_strategy.h" +#include "ft/serialize/rbtree_mhs.h" #if TOKU_DEBUG_PARANOID -#define VALIDATE() validate() +#define VALIDATE() Validate() #else #define VALIDATE() #endif -static FILE *ba_trace_file = nullptr; - -void block_allocator::maybe_initialize_trace(void) { - const char *ba_trace_path = getenv("TOKU_BA_TRACE_PATH"); - if (ba_trace_path != nullptr) { - ba_trace_file = toku_os_fopen(ba_trace_path, "w"); - if (ba_trace_file == nullptr) { - fprintf(stderr, "tokuft: error: block allocator trace path found in environment (%s), " - "but it could not be opened for writing (errno %d)\n", - ba_trace_path, get_maybe_error_errno()); - } else { - fprintf(stderr, "tokuft: block allocator tracing enabled, path: %s\n", ba_trace_path); - } - } -} - -void block_allocator::maybe_close_trace() { - if (ba_trace_file != nullptr) { - int r = toku_os_fclose(ba_trace_file); - if (r != 0) { - fprintf(stderr, "tokuft: error: block allocator trace file did not close properly (r %d, errno %d)\n", - r, get_maybe_error_errno()); - } else { - fprintf(stderr, "tokuft: block allocator tracing finished, file closed successfully\n"); - } - } -} - -void block_allocator::_create_internal(uint64_t reserve_at_beginning, uint64_t alignment) { - // the alignment must be at least 512 and aligned with 512 to work with direct I/O - assert(alignment >= 512 && (alignment % 512) == 0); +void BlockAllocator::CreateInternal(uint64_t reserve_at_beginning, + uint64_t alignment) { + // the alignment must be at least 512 and aligned with 512 to work with + // direct I/O + invariant(alignment >= 512 && (alignment % 512) == 0); _reserve_at_beginning = reserve_at_beginning; _alignment = alignment; _n_blocks = 0; - _blocks_array_size = 1; - XMALLOC_N(_blocks_array_size, _blocks_array); _n_bytes_in_use = reserve_at_beginning; - _strategy = BA_STRATEGY_FIRST_FIT; - - memset(&_trace_lock, 0, sizeof(toku_mutex_t)); - toku_mutex_init(&_trace_lock, nullptr); + _tree = new MhsRbTree::Tree(alignment); +} +void BlockAllocator::Create(uint64_t reserve_at_beginning, uint64_t alignment) { + CreateInternal(reserve_at_beginning, alignment); + _tree->Insert({reserve_at_beginning, MAX_BYTE}); VALIDATE(); } -void block_allocator::create(uint64_t reserve_at_beginning, uint64_t alignment) { - _create_internal(reserve_at_beginning, alignment); - _trace_create(); +void BlockAllocator::Destroy() { + delete _tree; } -void block_allocator::destroy() { - toku_free(_blocks_array); - _trace_destroy(); - toku_mutex_destroy(&_trace_lock); -} +void BlockAllocator::CreateFromBlockPairs(uint64_t reserve_at_beginning, + uint64_t alignment, + struct BlockPair *translation_pairs, + uint64_t n_blocks) { + CreateInternal(reserve_at_beginning, alignment); + _n_blocks = n_blocks; -void block_allocator::set_strategy(enum allocation_strategy strategy) { - _strategy = strategy; -} + struct BlockPair *XMALLOC_N(n_blocks, pairs); + memcpy(pairs, translation_pairs, n_blocks * sizeof(struct BlockPair)); + std::sort(pairs, pairs + n_blocks); -void block_allocator::grow_blocks_array_by(uint64_t n_to_add) { - if (_n_blocks + n_to_add > _blocks_array_size) { - uint64_t new_size = _n_blocks + n_to_add; - uint64_t at_least = _blocks_array_size * 2; - if (at_least > new_size) { - new_size = at_least; - } - _blocks_array_size = new_size; - XREALLOC_N(_blocks_array_size, _blocks_array); + if (pairs[0]._offset > reserve_at_beginning) { + _tree->Insert( + {reserve_at_beginning, 
pairs[0]._offset - reserve_at_beginning}); } -} - -void block_allocator::grow_blocks_array() { - grow_blocks_array_by(1); -} - -void block_allocator::create_from_blockpairs(uint64_t reserve_at_beginning, uint64_t alignment, - struct blockpair *pairs, uint64_t n_blocks) { - _create_internal(reserve_at_beginning, alignment); - - _n_blocks = n_blocks; - grow_blocks_array_by(_n_blocks); - memcpy(_blocks_array, pairs, _n_blocks * sizeof(struct blockpair)); - std::sort(_blocks_array, _blocks_array + _n_blocks); for (uint64_t i = 0; i < _n_blocks; i++) { - // Allocator does not support size 0 blocks. See block_allocator_free_block. - invariant(_blocks_array[i].size > 0); - invariant(_blocks_array[i].offset >= _reserve_at_beginning); - invariant(_blocks_array[i].offset % _alignment == 0); - - _n_bytes_in_use += _blocks_array[i].size; + // Allocator does not support size 0 blocks. See + // block_allocator_free_block. + invariant(pairs[i]._size > 0); + invariant(pairs[i]._offset >= _reserve_at_beginning); + invariant(pairs[i]._offset % _alignment == 0); + + _n_bytes_in_use += pairs[i]._size; + + MhsRbTree::OUUInt64 free_size(MAX_BYTE); + MhsRbTree::OUUInt64 free_offset(pairs[i]._offset + pairs[i]._size); + if (i < n_blocks - 1) { + MhsRbTree::OUUInt64 next_offset(pairs[i + 1]._offset); + invariant(next_offset >= free_offset); + free_size = next_offset - free_offset; + if (free_size == 0) + continue; + } + _tree->Insert({free_offset, free_size}); } - + toku_free(pairs); VALIDATE(); - - _trace_create_from_blockpairs(); } // Effect: align a value by rounding up. -static inline uint64_t align(uint64_t value, uint64_t ba_alignment) { +static inline uint64_t Align(uint64_t value, uint64_t ba_alignment) { return ((value + ba_alignment - 1) / ba_alignment) * ba_alignment; } -struct block_allocator::blockpair * -block_allocator::choose_block_to_alloc_after(size_t size, uint64_t heat) { - switch (_strategy) { - case BA_STRATEGY_FIRST_FIT: - return block_allocator_strategy::first_fit(_blocks_array, _n_blocks, size, _alignment); - case BA_STRATEGY_BEST_FIT: - return block_allocator_strategy::best_fit(_blocks_array, _n_blocks, size, _alignment); - case BA_STRATEGY_HEAT_ZONE: - return block_allocator_strategy::heat_zone(_blocks_array, _n_blocks, size, _alignment, heat); - case BA_STRATEGY_PADDED_FIT: - return block_allocator_strategy::padded_fit(_blocks_array, _n_blocks, size, _alignment); - default: - abort(); - } -} - -// Effect: Allocate a block. The resulting block must be aligned on the ba->alignment (which to make direct_io happy must be a positive multiple of 512). -void block_allocator::alloc_block(uint64_t size, uint64_t heat, uint64_t *offset) { - struct blockpair *bp; - +// Effect: Allocate a block. The resulting block must be aligned on the +// ba->alignment (which to make direct_io happy must be a positive multiple of +// 512). +void BlockAllocator::AllocBlock(uint64_t size, + uint64_t *offset) { // Allocator does not support size 0 blocks. See block_allocator_free_block. 
invariant(size > 0); - grow_blocks_array(); _n_bytes_in_use += size; + *offset = _tree->Remove(size); - uint64_t end_of_reserve = align(_reserve_at_beginning, _alignment); - - if (_n_blocks == 0) { - // First and only block - assert(_n_bytes_in_use == _reserve_at_beginning + size); // we know exactly how many are in use - _blocks_array[0].offset = align(_reserve_at_beginning, _alignment); - _blocks_array[0].size = size; - *offset = _blocks_array[0].offset; - goto done; - } else if (end_of_reserve + size <= _blocks_array[0].offset ) { - // Check to see if the space immediately after the reserve is big enough to hold the new block. - bp = &_blocks_array[0]; - memmove(bp + 1, bp, _n_blocks * sizeof(*bp)); - bp[0].offset = end_of_reserve; - bp[0].size = size; - *offset = end_of_reserve; - goto done; - } - - bp = choose_block_to_alloc_after(size, heat); - if (bp != nullptr) { - // our allocation strategy chose the space after `bp' to fit the new block - uint64_t answer_offset = align(bp->offset + bp->size, _alignment); - uint64_t blocknum = bp - _blocks_array; - invariant(&_blocks_array[blocknum] == bp); - invariant(blocknum < _n_blocks); - memmove(bp + 2, bp + 1, (_n_blocks - blocknum - 1) * sizeof(*bp)); - bp[1].offset = answer_offset; - bp[1].size = size; - *offset = answer_offset; - } else { - // It didn't fit anywhere, so fit it on the end. - assert(_n_blocks < _blocks_array_size); - bp = &_blocks_array[_n_blocks]; - uint64_t answer_offset = align(bp[-1].offset + bp[-1].size, _alignment); - bp->offset = answer_offset; - bp->size = size; - *offset = answer_offset; - } - -done: _n_blocks++; VALIDATE(); - - _trace_alloc(size, heat, *offset); -} - -// Find the index in the blocks array that has a particular offset. Requires that the block exist. -// Use binary search so it runs fast. -int64_t block_allocator::find_block(uint64_t offset) { - VALIDATE(); - if (_n_blocks == 1) { - assert(_blocks_array[0].offset == offset); - return 0; - } - - uint64_t lo = 0; - uint64_t hi = _n_blocks; - while (1) { - assert(lo < hi); // otherwise no such block exists. - uint64_t mid = (lo + hi) / 2; - uint64_t thisoff = _blocks_array[mid].offset; - if (thisoff < offset) { - lo = mid + 1; - } else if (thisoff > offset) { - hi = mid; - } else { - return mid; - } - } } -// To support 0-sized blocks, we need to include size as an input to this function. +// To support 0-sized blocks, we need to include size as an input to this +// function. // All 0-sized blocks at the same offset can be considered identical, but // a 0-sized block can share offset with a non-zero sized block. -// The non-zero sized block is not exchangable with a zero sized block (or vice versa), -// so inserting 0-sized blocks can cause corruption here. -void block_allocator::free_block(uint64_t offset) { +// The non-zero sized block is not exchangable with a zero sized block (or vice +// versa), so inserting 0-sized blocks can cause corruption here. +void BlockAllocator::FreeBlock(uint64_t offset, uint64_t size) { VALIDATE(); - int64_t bn = find_block(offset); - assert(bn >= 0); // we require that there is a block with that offset. 
- _n_bytes_in_use -= _blocks_array[bn].size; - memmove(&_blocks_array[bn], &_blocks_array[bn + 1], - (_n_blocks - bn - 1) * sizeof(struct blockpair)); + _n_bytes_in_use -= size; + _tree->Insert({offset, size}); _n_blocks--; VALIDATE(); - - _trace_free(offset); -} - -uint64_t block_allocator::block_size(uint64_t offset) { - int64_t bn = find_block(offset); - assert(bn >=0); // we require that there is a block with that offset. - return _blocks_array[bn].size; } -uint64_t block_allocator::allocated_limit() const { - if (_n_blocks == 0) { - return _reserve_at_beginning; - } else { - struct blockpair *last = &_blocks_array[_n_blocks - 1]; - return last->offset + last->size; - } +uint64_t BlockAllocator::AllocatedLimit() const { + MhsRbTree::Node *max_node = _tree->MaxNode(); + return rbn_offset(max_node).ToInt(); } -// Effect: Consider the blocks in sorted order. The reserved block at the beginning is number 0. The next one is number 1 and so forth. +// Effect: Consider the blocks in sorted order. The reserved block at the +// beginning is number 0. The next one is number 1 and so forth. // Return the offset and size of the block with that number. // Return 0 if there is a block that big, return nonzero if b is too big. -int block_allocator::get_nth_block_in_layout_order(uint64_t b, uint64_t *offset, uint64_t *size) { - if (b ==0 ) { +int BlockAllocator::NthBlockInLayoutOrder(uint64_t b, + uint64_t *offset, + uint64_t *size) { + MhsRbTree::Node *x, *y; + if (b == 0) { *offset = 0; *size = _reserve_at_beginning; - return 0; + return 0; } else if (b > _n_blocks) { return -1; } else { - *offset =_blocks_array[b - 1].offset; - *size =_blocks_array[b - 1].size; + x = _tree->MinNode(); + for (uint64_t i = 1; i <= b; i++) { + y = x; + x = _tree->Successor(x); + } + *size = (rbn_offset(x) - (rbn_offset(y) + rbn_size(y))).ToInt(); + *offset = (rbn_offset(y) + rbn_size(y)).ToInt(); return 0; } } +struct VisUnusedExtra { + TOKU_DB_FRAGMENTATION _report; + uint64_t _align; +}; + +static void VisUnusedCollector(void *extra, + MhsRbTree::Node *node, + uint64_t UU(depth)) { + struct VisUnusedExtra *v_e = (struct VisUnusedExtra *)extra; + TOKU_DB_FRAGMENTATION report = v_e->_report; + uint64_t alignm = v_e->_align; + + MhsRbTree::OUUInt64 offset = rbn_offset(node); + MhsRbTree::OUUInt64 size = rbn_size(node); + MhsRbTree::OUUInt64 answer_offset(Align(offset.ToInt(), alignm)); + uint64_t free_space = (offset + size - answer_offset).ToInt(); + if (free_space > 0) { + report->unused_bytes += free_space; + report->unused_blocks++; + if (free_space > report->largest_unused_block) { + report->largest_unused_block = free_space; + } + } +} // Requires: report->file_size_bytes is filled in // Requires: report->data_bytes is filled in // Requires: report->checkpoint_bytes_additional is filled in -void block_allocator::get_unused_statistics(TOKU_DB_FRAGMENTATION report) { - assert(_n_bytes_in_use == report->data_bytes + report->checkpoint_bytes_additional); +void BlockAllocator::UnusedStatistics(TOKU_DB_FRAGMENTATION report) { + invariant(_n_bytes_in_use == + report->data_bytes + report->checkpoint_bytes_additional); report->unused_bytes = 0; report->unused_blocks = 0; report->largest_unused_block = 0; - if (_n_blocks > 0) { - //Deal with space before block 0 and after reserve: - { - struct blockpair *bp = &_blocks_array[0]; - assert(bp->offset >= align(_reserve_at_beginning, _alignment)); - uint64_t free_space = bp->offset - align(_reserve_at_beginning, _alignment); - if (free_space > 0) { - report->unused_bytes += 
free_space; - report->unused_blocks++; - if (free_space > report->largest_unused_block) { - report->largest_unused_block = free_space; - } - } - } - - //Deal with space between blocks: - for (uint64_t blocknum = 0; blocknum +1 < _n_blocks; blocknum ++) { - // Consider the space after blocknum - struct blockpair *bp = &_blocks_array[blocknum]; - uint64_t this_offset = bp[0].offset; - uint64_t this_size = bp[0].size; - uint64_t end_of_this_block = align(this_offset+this_size, _alignment); - uint64_t next_offset = bp[1].offset; - uint64_t free_space = next_offset - end_of_this_block; - if (free_space > 0) { - report->unused_bytes += free_space; - report->unused_blocks++; - if (free_space > report->largest_unused_block) { - report->largest_unused_block = free_space; - } - } - } - - //Deal with space after last block - { - struct blockpair *bp = &_blocks_array[_n_blocks-1]; - uint64_t this_offset = bp[0].offset; - uint64_t this_size = bp[0].size; - uint64_t end_of_this_block = align(this_offset+this_size, _alignment); - if (end_of_this_block < report->file_size_bytes) { - uint64_t free_space = report->file_size_bytes - end_of_this_block; - assert(free_space > 0); - report->unused_bytes += free_space; - report->unused_blocks++; - if (free_space > report->largest_unused_block) { - report->largest_unused_block = free_space; - } - } - } - } else { - // No blocks. Just the reserve. - uint64_t end_of_this_block = align(_reserve_at_beginning, _alignment); - if (end_of_this_block < report->file_size_bytes) { - uint64_t free_space = report->file_size_bytes - end_of_this_block; - assert(free_space > 0); - report->unused_bytes += free_space; - report->unused_blocks++; - if (free_space > report->largest_unused_block) { - report->largest_unused_block = free_space; - } - } - } + struct VisUnusedExtra extra = {report, _alignment}; + _tree->InOrderVisitor(VisUnusedCollector, &extra); } -void block_allocator::get_statistics(TOKU_DB_FRAGMENTATION report) { - report->data_bytes = _n_bytes_in_use; - report->data_blocks = _n_blocks; +void BlockAllocator::Statistics(TOKU_DB_FRAGMENTATION report) { + report->data_bytes = _n_bytes_in_use; + report->data_blocks = _n_blocks; report->file_size_bytes = 0; report->checkpoint_bytes_additional = 0; - get_unused_statistics(report); + UnusedStatistics(report); } -void block_allocator::validate() const { - uint64_t n_bytes_in_use = _reserve_at_beginning; - for (uint64_t i = 0; i < _n_blocks; i++) { - n_bytes_in_use += _blocks_array[i].size; - if (i > 0) { - assert(_blocks_array[i].offset > _blocks_array[i - 1].offset); - assert(_blocks_array[i].offset >= _blocks_array[i - 1].offset + _blocks_array[i - 1].size ); - } - } - assert(n_bytes_in_use == _n_bytes_in_use); -} - -// Tracing - -void block_allocator::_trace_create(void) { - if (ba_trace_file != nullptr) { - toku_mutex_lock(&_trace_lock); - fprintf(ba_trace_file, "ba_trace_create %p %" PRIu64 " %" PRIu64 "\n", - this, _reserve_at_beginning, _alignment); - toku_mutex_unlock(&_trace_lock); - - fflush(ba_trace_file); - } -} - -void block_allocator::_trace_create_from_blockpairs(void) { - if (ba_trace_file != nullptr) { - toku_mutex_lock(&_trace_lock); - fprintf(ba_trace_file, "ba_trace_create_from_blockpairs %p %" PRIu64 " %" PRIu64 " ", - this, _reserve_at_beginning, _alignment); - for (uint64_t i = 0; i < _n_blocks; i++) { - fprintf(ba_trace_file, "[%" PRIu64 " %" PRIu64 "] ", - _blocks_array[i].offset, _blocks_array[i].size); - } - fprintf(ba_trace_file, "\n"); - toku_mutex_unlock(&_trace_lock); - - fflush(ba_trace_file); - 
} -} - -void block_allocator::_trace_destroy(void) { - if (ba_trace_file != nullptr) { - toku_mutex_lock(&_trace_lock); - fprintf(ba_trace_file, "ba_trace_destroy %p\n", this); - toku_mutex_unlock(&_trace_lock); - - fflush(ba_trace_file); - } -} - -void block_allocator::_trace_alloc(uint64_t size, uint64_t heat, uint64_t offset) { - if (ba_trace_file != nullptr) { - toku_mutex_lock(&_trace_lock); - fprintf(ba_trace_file, "ba_trace_alloc %p %" PRIu64 " %" PRIu64 " %" PRIu64 "\n", - this, size, heat, offset); - toku_mutex_unlock(&_trace_lock); - - fflush(ba_trace_file); +struct ValidateExtra { + uint64_t _bytes; + MhsRbTree::Node *_pre_node; +}; +static void VisUsedBlocksInOrder(void *extra, + MhsRbTree::Node *cur_node, + uint64_t UU(depth)) { + struct ValidateExtra *v_e = (struct ValidateExtra *)extra; + MhsRbTree::Node *pre_node = v_e->_pre_node; + // verify no overlaps + if (pre_node) { + invariant(rbn_size(pre_node) > 0); + invariant(rbn_offset(cur_node) > + rbn_offset(pre_node) + rbn_size(pre_node)); + MhsRbTree::OUUInt64 used_space = + rbn_offset(cur_node) - (rbn_offset(pre_node) + rbn_size(pre_node)); + v_e->_bytes += used_space.ToInt(); + } else { + v_e->_bytes += rbn_offset(cur_node).ToInt(); } + v_e->_pre_node = cur_node; } -void block_allocator::_trace_free(uint64_t offset) { - if (ba_trace_file != nullptr) { - toku_mutex_lock(&_trace_lock); - fprintf(ba_trace_file, "ba_trace_free %p %" PRIu64 "\n", this, offset); - toku_mutex_unlock(&_trace_lock); - - fflush(ba_trace_file); - } +void BlockAllocator::Validate() const { + _tree->ValidateBalance(); + _tree->ValidateMhs(); + struct ValidateExtra extra = {0, nullptr}; + _tree->InOrderVisitor(VisUsedBlocksInOrder, &extra); + invariant(extra._bytes == _n_bytes_in_use); } diff --git a/storage/tokudb/PerconaFT/ft/serialize/block_allocator.h b/storage/tokudb/PerconaFT/ft/serialize/block_allocator.h index 9b2c1553e7f..648ea9a9ef2 100644 --- a/storage/tokudb/PerconaFT/ft/serialize/block_allocator.h +++ b/storage/tokudb/PerconaFT/ft/serialize/block_allocator.h @@ -43,6 +43,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. #include "portability/toku_pthread.h" #include "portability/toku_stdint.h" #include "portability/toku_stdlib.h" +#include "ft/serialize/rbtree_mhs.h" // Block allocator. // @@ -51,151 +52,128 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. // The allocation of block numbers is handled elsewhere. // // When creating a block allocator we also specify a certain-sized -// block at the beginning that is preallocated (and cannot be allocated or freed) +// block at the beginning that is preallocated (and cannot be allocated or +// freed) // // We can allocate blocks of a particular size at a particular location. -// We can allocate blocks of a particular size at a location chosen by the allocator. // We can free blocks. // We can determine the size of a block. - -class block_allocator { -public: +#define MAX_BYTE 0xffffffffffffffff +class BlockAllocator { + public: static const size_t BLOCK_ALLOCATOR_ALIGNMENT = 4096; // How much must be reserved at the beginning for the block? - // The actual header is 8+4+4+8+8_4+8+ the length of the db names + 1 pointer for each root. + // The actual header is 8+4+4+8+8_4+8+ the length of the db names + 1 + // pointer for each root. // So 4096 should be enough. 
static const size_t BLOCK_ALLOCATOR_HEADER_RESERVE = 4096; - - static_assert(BLOCK_ALLOCATOR_HEADER_RESERVE % BLOCK_ALLOCATOR_ALIGNMENT == 0, + + static_assert(BLOCK_ALLOCATOR_HEADER_RESERVE % BLOCK_ALLOCATOR_ALIGNMENT == + 0, "block allocator header must have proper alignment"); - static const size_t BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE = BLOCK_ALLOCATOR_HEADER_RESERVE * 2; + static const size_t BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE = + BLOCK_ALLOCATOR_HEADER_RESERVE * 2; - enum allocation_strategy { - BA_STRATEGY_FIRST_FIT = 1, - BA_STRATEGY_BEST_FIT, - BA_STRATEGY_PADDED_FIT, - BA_STRATEGY_HEAT_ZONE - }; - - struct blockpair { - uint64_t offset; - uint64_t size; - blockpair(uint64_t o, uint64_t s) : - offset(o), size(s) { - } - int operator<(const struct blockpair &rhs) const { - return offset < rhs.offset; - } - int operator<(const uint64_t &o) const { - return offset < o; + struct BlockPair { + uint64_t _offset; + uint64_t _size; + BlockPair(uint64_t o, uint64_t s) : _offset(o), _size(s) {} + int operator<(const struct BlockPair &rhs) const { + return _offset < rhs._offset; } + int operator<(const uint64_t &o) const { return _offset < o; } }; - // Effect: Create a block allocator, in which the first RESERVE_AT_BEGINNING bytes are not put into a block. - // The default allocation strategy is first fit (BA_STRATEGY_FIRST_FIT) + // Effect: Create a block allocator, in which the first RESERVE_AT_BEGINNING + // bytes are not put into a block. + // The default allocation strategy is first fit + // (BA_STRATEGY_FIRST_FIT) // All blocks be start on a multiple of ALIGNMENT. // Aborts if we run out of memory. // Parameters - // reserve_at_beginning (IN) Size of reserved block at beginning. This size does not have to be aligned. + // reserve_at_beginning (IN) Size of reserved block at beginning. + // This size does not have to be aligned. // alignment (IN) Block alignment. - void create(uint64_t reserve_at_beginning, uint64_t alignment); + void Create(uint64_t reserve_at_beginning, uint64_t alignment); - // Effect: Create a block allocator, in which the first RESERVE_AT_BEGINNING bytes are not put into a block. - // The default allocation strategy is first fit (BA_STRATEGY_FIRST_FIT) - // The allocator is initialized to contain `n_blocks' of blockpairs, taken from `pairs' + // Effect: Create a block allocator, in which the first RESERVE_AT_BEGINNING + // bytes are not put into a block. + // The allocator is initialized to contain `n_blocks' of BlockPairs, + // taken from `pairs' // All blocks be start on a multiple of ALIGNMENT. // Aborts if we run out of memory. // Parameters // pairs, unowned array of pairs to copy // n_blocks, Size of pairs array - // reserve_at_beginning (IN) Size of reserved block at beginning. This size does not have to be aligned. + // reserve_at_beginning (IN) Size of reserved block at beginning. + // This size does not have to be aligned. // alignment (IN) Block alignment. 
- void create_from_blockpairs(uint64_t reserve_at_beginning, uint64_t alignment, - struct blockpair *pairs, uint64_t n_blocks); + void CreateFromBlockPairs(uint64_t reserve_at_beginning, + uint64_t alignment, + struct BlockPair *pairs, + uint64_t n_blocks); // Effect: Destroy this block allocator - void destroy(); - - // Effect: Set the allocation strategy that the allocator should use - // Requires: No other threads are operating on this block allocator - void set_strategy(enum allocation_strategy strategy); + void Destroy(); - // Effect: Allocate a block of the specified size at an address chosen by the allocator. + // Effect: Allocate a block of the specified size at an address chosen by + // the allocator. // Aborts if anything goes wrong. // The block address will be a multiple of the alignment. // Parameters: - // size (IN): The size of the block. (The size does not have to be aligned.) + // size (IN): The size of the block. (The size does not have to be + // aligned.) // offset (OUT): The location of the block. - // heat (IN): A higher heat means we should be prepared to free this block soon (perhaps in the next checkpoint) - // Heat values are lexiographically ordered (like integers), but their specific values are arbitrary - void alloc_block(uint64_t size, uint64_t heat, uint64_t *offset); + // block soon (perhaps in the next checkpoint) + // Heat values are lexiographically ordered (like integers), + // but their specific values are arbitrary + void AllocBlock(uint64_t size, uint64_t *offset); // Effect: Free the block at offset. // Requires: There must be a block currently allocated at that offset. // Parameters: // offset (IN): The offset of the block. - void free_block(uint64_t offset); + void FreeBlock(uint64_t offset, uint64_t size); - // Effect: Return the size of the block that starts at offset. - // Requires: There must be a block currently allocated at that offset. - // Parameters: - // offset (IN): The offset of the block. - uint64_t block_size(uint64_t offset); - - // Effect: Check to see if the block allocator is OK. This may take a long time. + // Effect: Check to see if the block allocator is OK. This may take a long + // time. // Usage Hints: Probably only use this for unit tests. // TODO: Private? - void validate() const; + void Validate() const; // Effect: Return the unallocated block address of "infinite" size. - // That is, return the smallest address that is above all the allocated blocks. - uint64_t allocated_limit() const; + // That is, return the smallest address that is above all the allocated + // blocks. + uint64_t AllocatedLimit() const; - // Effect: Consider the blocks in sorted order. The reserved block at the beginning is number 0. The next one is number 1 and so forth. + // Effect: Consider the blocks in sorted order. The reserved block at the + // beginning is number 0. The next one is number 1 and so forth. // Return the offset and size of the block with that number. // Return 0 if there is a block that big, return nonzero if b is too big. // Rationale: This is probably useful only for tests. - int get_nth_block_in_layout_order(uint64_t b, uint64_t *offset, uint64_t *size); + int NthBlockInLayoutOrder(uint64_t b, uint64_t *offset, uint64_t *size); // Effect: Fill in report to indicate how the file is used. 
- // Requires: + // Requires: // report->file_size_bytes is filled in // report->data_bytes is filled in // report->checkpoint_bytes_additional is filled in - void get_unused_statistics(TOKU_DB_FRAGMENTATION report); + void UnusedStatistics(TOKU_DB_FRAGMENTATION report); // Effect: Fill in report->data_bytes with the number of bytes in use - // Fill in report->data_blocks with the number of blockpairs in use + // Fill in report->data_blocks with the number of BlockPairs in use // Fill in unused statistics using this->get_unused_statistics() // Requires: // report->file_size is ignored on return // report->checkpoint_bytes_additional is ignored on return - void get_statistics(TOKU_DB_FRAGMENTATION report); - - // Block allocator tracing. - // - Enabled by setting TOKU_BA_TRACE_PATH to the file that the trace file - // should be written to. - // - Trace may be replayed by ba_trace_replay tool in tools/ directory - // eg: "cat mytracefile | ba_trace_replay" - static void maybe_initialize_trace(); - static void maybe_close_trace(); - -private: - void _create_internal(uint64_t reserve_at_beginning, uint64_t alignment); - void grow_blocks_array_by(uint64_t n_to_add); - void grow_blocks_array(); - int64_t find_block(uint64_t offset); - struct blockpair *choose_block_to_alloc_after(size_t size, uint64_t heat); - - // Tracing - toku_mutex_t _trace_lock; - void _trace_create(void); - void _trace_create_from_blockpairs(void); - void _trace_destroy(void); - void _trace_alloc(uint64_t size, uint64_t heat, uint64_t offset); - void _trace_free(uint64_t offset); + void Statistics(TOKU_DB_FRAGMENTATION report); + + virtual ~BlockAllocator(){}; + + private: + void CreateInternal(uint64_t reserve_at_beginning, uint64_t alignment); // How much to reserve at the beginning uint64_t _reserve_at_beginning; @@ -203,12 +181,8 @@ private: uint64_t _alignment; // How many blocks uint64_t _n_blocks; - // How big is the blocks_array. Must be >= n_blocks. - uint64_t _blocks_array_size; - // These blocks are sorted by address. - struct blockpair *_blocks_array; - // Including the reserve_at_beginning uint64_t _n_bytes_in_use; - // The allocation strategy are we using - enum allocation_strategy _strategy; + + // These blocks are sorted by address. + MhsRbTree::Tree *_tree; }; diff --git a/storage/tokudb/PerconaFT/ft/serialize/block_allocator_strategy.cc b/storage/tokudb/PerconaFT/ft/serialize/block_allocator_strategy.cc deleted file mode 100644 index 62bb8fc4a87..00000000000 --- a/storage/tokudb/PerconaFT/ft/serialize/block_allocator_strategy.cc +++ /dev/null @@ -1,224 +0,0 @@ -/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ -// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: -#ident "$Id$" -/*====== -This file is part of PerconaFT. - - -Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. - - PerconaFT is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License, version 2, - as published by the Free Software Foundation. - - PerconaFT is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. 
- ----------------------------------------- - - PerconaFT is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License, version 3, - as published by the Free Software Foundation. - - PerconaFT is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. -======= */ - -#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." - -#include <algorithm> - -#include <string.h> - -#include "portability/toku_assert.h" - -#include "ft/serialize/block_allocator_strategy.h" - -static uint64_t _align(uint64_t value, uint64_t ba_alignment) { - return ((value + ba_alignment - 1) / ba_alignment) * ba_alignment; -} - -static uint64_t _roundup_to_power_of_two(uint64_t value) { - uint64_t r = 4096; - while (r < value) { - r *= 2; - invariant(r > 0); - } - return r; -} - -// First fit block allocation -static struct block_allocator::blockpair * -_first_fit(struct block_allocator::blockpair *blocks_array, - uint64_t n_blocks, uint64_t size, uint64_t alignment, - uint64_t max_padding) { - if (n_blocks == 1) { - // won't enter loop, can't underflow the direction < 0 case - return nullptr; - } - - struct block_allocator::blockpair *bp = &blocks_array[0]; - for (uint64_t n_spaces_to_check = n_blocks - 1; n_spaces_to_check > 0; - n_spaces_to_check--, bp++) { - // Consider the space after bp - uint64_t padded_alignment = max_padding != 0 ? _align(max_padding, alignment) : alignment; - uint64_t possible_offset = _align(bp->offset + bp->size, padded_alignment); - if (possible_offset + size <= bp[1].offset) { // bp[1] is always valid since bp < &blocks_array[n_blocks-1] - invariant(bp - blocks_array < (int64_t) n_blocks); - return bp; - } - } - return nullptr; -} - -static struct block_allocator::blockpair * -_first_fit_bw(struct block_allocator::blockpair *blocks_array, - uint64_t n_blocks, uint64_t size, uint64_t alignment, - uint64_t max_padding, struct block_allocator::blockpair *blocks_array_limit) { - if (n_blocks == 1) { - // won't enter loop, can't underflow the direction < 0 case - return nullptr; - } - - struct block_allocator::blockpair *bp = &blocks_array[-1]; - for (uint64_t n_spaces_to_check = n_blocks - 1; n_spaces_to_check > 0; - n_spaces_to_check--, bp--) { - // Consider the space after bp - uint64_t padded_alignment = max_padding != 0 ? 
_align(max_padding, alignment) : alignment; - uint64_t possible_offset = _align(bp->offset + bp->size, padded_alignment); - if (&bp[1] < blocks_array_limit && possible_offset + size <= bp[1].offset) { - invariant(blocks_array - bp < (int64_t) n_blocks); - return bp; - } - } - return nullptr; -} - -struct block_allocator::blockpair * -block_allocator_strategy::first_fit(struct block_allocator::blockpair *blocks_array, - uint64_t n_blocks, uint64_t size, uint64_t alignment) { - return _first_fit(blocks_array, n_blocks, size, alignment, 0); -} - -// Best fit block allocation -struct block_allocator::blockpair * -block_allocator_strategy::best_fit(struct block_allocator::blockpair *blocks_array, - uint64_t n_blocks, uint64_t size, uint64_t alignment) { - struct block_allocator::blockpair *best_bp = nullptr; - uint64_t best_hole_size = 0; - for (uint64_t blocknum = 0; blocknum + 1 < n_blocks; blocknum++) { - // Consider the space after blocknum - struct block_allocator::blockpair *bp = &blocks_array[blocknum]; - uint64_t possible_offset = _align(bp->offset + bp->size, alignment); - uint64_t possible_end_offset = possible_offset + size; - if (possible_end_offset <= bp[1].offset) { - // It fits here. Is it the best fit? - uint64_t hole_size = bp[1].offset - possible_end_offset; - if (best_bp == nullptr || hole_size < best_hole_size) { - best_hole_size = hole_size; - best_bp = bp; - } - } - } - return best_bp; -} - -static uint64_t padded_fit_alignment = 4096; - -// TODO: These compiler specific directives should be abstracted in a portability header -// portability/toku_compiler.h? -__attribute__((__constructor__)) -static void determine_padded_fit_alignment_from_env(void) { - // TODO: Should be in portability as 'toku_os_getenv()?' - const char *s = getenv("TOKU_BA_PADDED_FIT_ALIGNMENT"); - if (s != nullptr && strlen(s) > 0) { - const int64_t alignment = strtoll(s, nullptr, 10); - if (alignment <= 0) { - fprintf(stderr, "tokuft: error: block allocator padded fit alignment found in environment (%s), " - "but it's out of range (should be an integer > 0). defaulting to %" PRIu64 "\n", - s, padded_fit_alignment); - } else { - padded_fit_alignment = _roundup_to_power_of_two(alignment); - fprintf(stderr, "tokuft: setting block allocator padded fit alignment to %" PRIu64 "\n", - padded_fit_alignment); - } - } -} - -// First fit into a block that is oversized by up to max_padding. -// The hope is that if we purposefully waste a bit of space at allocation -// time we'll be more likely to reuse this block later. -struct block_allocator::blockpair * -block_allocator_strategy::padded_fit(struct block_allocator::blockpair *blocks_array, - uint64_t n_blocks, uint64_t size, uint64_t alignment) { - return _first_fit(blocks_array, n_blocks, size, alignment, padded_fit_alignment); -} - -static double hot_zone_threshold = 0.85; - -// TODO: These compiler specific directives should be abstracted in a portability header -// portability/toku_compiler.h? -__attribute__((__constructor__)) -static void determine_hot_zone_threshold_from_env(void) { - // TODO: Should be in portability as 'toku_os_getenv()?' - const char *s = getenv("TOKU_BA_HOT_ZONE_THRESHOLD"); - if (s != nullptr && strlen(s) > 0) { - const double hot_zone = strtod(s, nullptr); - if (hot_zone < 1 || hot_zone > 99) { - fprintf(stderr, "tokuft: error: block allocator hot zone threshold found in environment (%s), " - "but it's out of range (should be an integer 1 through 99). 
defaulting to 85\n", s); - hot_zone_threshold = 85 / 100; - } else { - fprintf(stderr, "tokuft: setting block allocator hot zone threshold to %s\n", s); - hot_zone_threshold = hot_zone / 100; - } - } -} - -struct block_allocator::blockpair * -block_allocator_strategy::heat_zone(struct block_allocator::blockpair *blocks_array, - uint64_t n_blocks, uint64_t size, uint64_t alignment, - uint64_t heat) { - if (heat > 0) { - struct block_allocator::blockpair *bp, *boundary_bp; - - // Hot allocation. Find the beginning of the hot zone. - boundary_bp = &blocks_array[n_blocks - 1]; - uint64_t highest_offset = _align(boundary_bp->offset + boundary_bp->size, alignment); - uint64_t hot_zone_offset = static_cast<uint64_t>(hot_zone_threshold * highest_offset); - - boundary_bp = std::lower_bound(blocks_array, blocks_array + n_blocks, hot_zone_offset); - uint64_t blocks_in_zone = (blocks_array + n_blocks) - boundary_bp; - uint64_t blocks_outside_zone = boundary_bp - blocks_array; - invariant(blocks_in_zone + blocks_outside_zone == n_blocks); - - if (blocks_in_zone > 0) { - // Find the first fit in the hot zone, going forward. - bp = _first_fit(boundary_bp, blocks_in_zone, size, alignment, 0); - if (bp != nullptr) { - return bp; - } - } - if (blocks_outside_zone > 0) { - // Find the first fit in the cold zone, going backwards. - bp = _first_fit_bw(boundary_bp, blocks_outside_zone, size, alignment, 0, &blocks_array[n_blocks]); - if (bp != nullptr) { - return bp; - } - } - } else { - // Cold allocations are simply first-fit from the beginning. - return _first_fit(blocks_array, n_blocks, size, alignment, 0); - } - return nullptr; -} diff --git a/storage/tokudb/PerconaFT/ft/serialize/block_table.cc b/storage/tokudb/PerconaFT/ft/serialize/block_table.cc index 7101ba9f58c..d2532134d96 100644 --- a/storage/tokudb/PerconaFT/ft/serialize/block_table.cc +++ b/storage/tokudb/PerconaFT/ft/serialize/block_table.cc @@ -46,31 +46,27 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. #include "ft/ft-internal.h" // TODO: reorganize this dependency (FT-303) -#include "ft/ft-ops.h" // for toku_maybe_truncate_file +#include "ft/ft-ops.h" // for toku_maybe_truncate_file #include "ft/serialize/block_table.h" #include "ft/serialize/rbuf.h" #include "ft/serialize/wbuf.h" #include "ft/serialize/block_allocator.h" - #include "util/nb_mutex.h" #include "util/scoped_malloc.h" // indicates the end of a freelist -static const BLOCKNUM freelist_null = { -1 }; +static const BLOCKNUM freelist_null = {-1}; // value of block_translation_pair.size if blocknum is unused -static const DISKOFF size_is_free = (DISKOFF) -1; +static const DISKOFF size_is_free = (DISKOFF)-1; -// value of block_translation_pair.u.diskoff if blocknum is used but does not yet have a diskblock -static const DISKOFF diskoff_unused = (DISKOFF) -2; +// value of block_translation_pair.u.diskoff if blocknum is used but does not +// yet have a diskblock +static const DISKOFF diskoff_unused = (DISKOFF)-2; -void block_table::_mutex_lock() { - toku_mutex_lock(&_mutex); -} +void block_table::_mutex_lock() { toku_mutex_lock(&_mutex); } -void block_table::_mutex_unlock() { - toku_mutex_unlock(&_mutex); -} +void block_table::_mutex_unlock() { toku_mutex_unlock(&_mutex); } // TODO: Move lock to FT void toku_ft_lock(FT ft) { @@ -85,13 +81,16 @@ void toku_ft_unlock(FT ft) { bt->_mutex_unlock(); } -// There are two headers: the reserve must fit them both and be suitably aligned. 
-static_assert(block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE % - block_allocator::BLOCK_ALLOCATOR_ALIGNMENT == 0, +// There are two headers: the reserve must fit them both and be suitably +// aligned. +static_assert(BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE % + BlockAllocator::BLOCK_ALLOCATOR_ALIGNMENT == + 0, "Block allocator's header reserve must be suitibly aligned"); -static_assert(block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE * 2 == - block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, - "Block allocator's total header reserve must exactly fit two headers"); +static_assert( + BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE * 2 == + BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, + "Block allocator's total header reserve must exactly fit two headers"); // does NOT initialize the block allocator: the caller is responsible void block_table::_create_internal() { @@ -99,25 +98,30 @@ void block_table::_create_internal() { memset(&_inprogress, 0, sizeof(struct translation)); memset(&_checkpointed, 0, sizeof(struct translation)); memset(&_mutex, 0, sizeof(_mutex)); + _bt_block_allocator = new BlockAllocator(); toku_mutex_init(&_mutex, nullptr); nb_mutex_init(&_safe_file_size_lock); } -// Fill in the checkpointed translation from buffer, and copy checkpointed to current. -// The one read from disk is the last known checkpointed one, so we are keeping it in -// place and then setting current (which is never stored on disk) for current use. -// The translation_buffer has translation only, we create the rest of the block_table. -int block_table::create_from_buffer(int fd, - DISKOFF location_on_disk, //Location of translation_buffer - DISKOFF size_on_disk, - unsigned char *translation_buffer) { +// Fill in the checkpointed translation from buffer, and copy checkpointed to +// current. +// The one read from disk is the last known checkpointed one, so we are keeping +// it in +// place and then setting current (which is never stored on disk) for current +// use. +// The translation_buffer has translation only, we create the rest of the +// block_table. 
+int block_table::create_from_buffer( + int fd, + DISKOFF location_on_disk, // Location of translation_buffer + DISKOFF size_on_disk, + unsigned char *translation_buffer) { // Does not initialize the block allocator _create_internal(); // Deserialize the translation and copy it to current - int r = _translation_deserialize_from_buffer(&_checkpointed, - location_on_disk, size_on_disk, - translation_buffer); + int r = _translation_deserialize_from_buffer( + &_checkpointed, location_on_disk, size_on_disk, translation_buffer); if (r != 0) { return r; } @@ -130,22 +134,26 @@ int block_table::create_from_buffer(int fd, invariant(file_size >= 0); _safe_file_size = file_size; - // Gather the non-empty translations and use them to create the block allocator + // Gather the non-empty translations and use them to create the block + // allocator toku::scoped_malloc pairs_buf(_checkpointed.smallest_never_used_blocknum.b * - sizeof(struct block_allocator::blockpair)); - struct block_allocator::blockpair *CAST_FROM_VOIDP(pairs, pairs_buf.get()); + sizeof(struct BlockAllocator::BlockPair)); + struct BlockAllocator::BlockPair *CAST_FROM_VOIDP(pairs, pairs_buf.get()); uint64_t n_pairs = 0; for (int64_t i = 0; i < _checkpointed.smallest_never_used_blocknum.b; i++) { struct block_translation_pair pair = _checkpointed.block_translation[i]; if (pair.size > 0) { invariant(pair.u.diskoff != diskoff_unused); - pairs[n_pairs++] = block_allocator::blockpair(pair.u.diskoff, pair.size); + pairs[n_pairs++] = + BlockAllocator::BlockPair(pair.u.diskoff, pair.size); } } - _bt_block_allocator.create_from_blockpairs(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, - block_allocator::BLOCK_ALLOCATOR_ALIGNMENT, - pairs, n_pairs); + _bt_block_allocator->CreateFromBlockPairs( + BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, + BlockAllocator::BLOCK_ALLOCATOR_ALIGNMENT, + pairs, + n_pairs); return 0; } @@ -155,8 +163,10 @@ void block_table::create() { _create_internal(); _checkpointed.type = TRANSLATION_CHECKPOINTED; - _checkpointed.smallest_never_used_blocknum = make_blocknum(RESERVED_BLOCKNUMS); - _checkpointed.length_of_array = _checkpointed.smallest_never_used_blocknum.b; + _checkpointed.smallest_never_used_blocknum = + make_blocknum(RESERVED_BLOCKNUMS); + _checkpointed.length_of_array = + _checkpointed.smallest_never_used_blocknum.b; _checkpointed.blocknum_freelist_head = freelist_null; XMALLOC_N(_checkpointed.length_of_array, _checkpointed.block_translation); for (int64_t i = 0; i < _checkpointed.length_of_array; i++) { @@ -164,12 +174,13 @@ void block_table::create() { _checkpointed.block_translation[i].u.diskoff = diskoff_unused; } - // we just created a default checkpointed, now copy it to current. + // we just created a default checkpointed, now copy it to current. _copy_translation(&_current, &_checkpointed, TRANSLATION_CURRENT); // Create an empty block allocator. - _bt_block_allocator.create(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, - block_allocator::BLOCK_ALLOCATOR_ALIGNMENT); + _bt_block_allocator->Create( + BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, + BlockAllocator::BLOCK_ALLOCATOR_ALIGNMENT); } // TODO: Refactor with FT-303 @@ -185,20 +196,24 @@ static void ft_set_dirty(FT ft, bool for_checkpoint) { void block_table::_maybe_truncate_file(int fd, uint64_t size_needed_before) { toku_mutex_assert_locked(&_mutex); - uint64_t new_size_needed = _bt_block_allocator.allocated_limit(); - //Save a call to toku_os_get_file_size (kernel call) if unlikely to be useful. 
- if (new_size_needed < size_needed_before && new_size_needed < _safe_file_size) { + uint64_t new_size_needed = _bt_block_allocator->AllocatedLimit(); + // Save a call to toku_os_get_file_size (kernel call) if unlikely to be + // useful. + if (new_size_needed < size_needed_before && + new_size_needed < _safe_file_size) { nb_mutex_lock(&_safe_file_size_lock, &_mutex); // Must hold _safe_file_size_lock to change _safe_file_size. if (new_size_needed < _safe_file_size) { int64_t safe_file_size_before = _safe_file_size; - // Not safe to use the 'to-be-truncated' portion until truncate is done. + // Not safe to use the 'to-be-truncated' portion until truncate is + // done. _safe_file_size = new_size_needed; _mutex_unlock(); uint64_t size_after; - toku_maybe_truncate_file(fd, new_size_needed, safe_file_size_before, &size_after); + toku_maybe_truncate_file( + fd, new_size_needed, safe_file_size_before, &size_after); _mutex_lock(); _safe_file_size = size_after; @@ -213,26 +228,35 @@ void block_table::maybe_truncate_file_on_open(int fd) { _mutex_unlock(); } -void block_table::_copy_translation(struct translation *dst, struct translation *src, enum translation_type newtype) { - // We intend to malloc a fresh block, so the incoming translation should be empty +void block_table::_copy_translation(struct translation *dst, + struct translation *src, + enum translation_type newtype) { + // We intend to malloc a fresh block, so the incoming translation should be + // empty invariant_null(dst->block_translation); invariant(src->length_of_array >= src->smallest_never_used_blocknum.b); invariant(newtype == TRANSLATION_DEBUG || - (src->type == TRANSLATION_CURRENT && newtype == TRANSLATION_INPROGRESS) || - (src->type == TRANSLATION_CHECKPOINTED && newtype == TRANSLATION_CURRENT)); + (src->type == TRANSLATION_CURRENT && + newtype == TRANSLATION_INPROGRESS) || + (src->type == TRANSLATION_CHECKPOINTED && + newtype == TRANSLATION_CURRENT)); dst->type = newtype; dst->smallest_never_used_blocknum = src->smallest_never_used_blocknum; - dst->blocknum_freelist_head = src->blocknum_freelist_head; + dst->blocknum_freelist_head = src->blocknum_freelist_head; - // destination btt is of fixed size. Allocate + memcpy the exact length necessary. + // destination btt is of fixed size. Allocate + memcpy the exact length + // necessary. dst->length_of_array = dst->smallest_never_used_blocknum.b; XMALLOC_N(dst->length_of_array, dst->block_translation); - memcpy(dst->block_translation, src->block_translation, dst->length_of_array * sizeof(*dst->block_translation)); + memcpy(dst->block_translation, + src->block_translation, + dst->length_of_array * sizeof(*dst->block_translation)); // New version of btt is not yet stored on disk. dst->block_translation[RESERVED_BLOCKNUM_TRANSLATION].size = 0; - dst->block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff = diskoff_unused; + dst->block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff = + diskoff_unused; } int64_t block_table::get_blocks_in_use_unlocked() { @@ -240,8 +264,9 @@ int64_t block_table::get_blocks_in_use_unlocked() { struct translation *t = &_current; int64_t num_blocks = 0; { - //Reserved blocknums do not get upgraded; They are part of the header. - for (b.b = RESERVED_BLOCKNUMS; b.b < t->smallest_never_used_blocknum.b; b.b++) { + // Reserved blocknums do not get upgraded; They are part of the header. 
+ for (b.b = RESERVED_BLOCKNUMS; b.b < t->smallest_never_used_blocknum.b; + b.b++) { if (t->block_translation[b.b].size != size_is_free) { num_blocks++; } @@ -251,38 +276,43 @@ int64_t block_table::get_blocks_in_use_unlocked() { } void block_table::_maybe_optimize_translation(struct translation *t) { - //Reduce 'smallest_never_used_blocknum.b' (completely free blocknums instead of just - //on a free list. Doing so requires us to regenerate the free list. - //This is O(n) work, so do it only if you're already doing that. + // Reduce 'smallest_never_used_blocknum.b' (completely free blocknums + // instead of just + // on a free list. Doing so requires us to regenerate the free list. + // This is O(n) work, so do it only if you're already doing that. BLOCKNUM b; paranoid_invariant(t->smallest_never_used_blocknum.b >= RESERVED_BLOCKNUMS); - //Calculate how large the free suffix is. + // Calculate how large the free suffix is. int64_t freed; { - for (b.b = t->smallest_never_used_blocknum.b; b.b > RESERVED_BLOCKNUMS; b.b--) { - if (t->block_translation[b.b-1].size != size_is_free) { + for (b.b = t->smallest_never_used_blocknum.b; b.b > RESERVED_BLOCKNUMS; + b.b--) { + if (t->block_translation[b.b - 1].size != size_is_free) { break; } } freed = t->smallest_never_used_blocknum.b - b.b; } - if (freed>0) { + if (freed > 0) { t->smallest_never_used_blocknum.b = b.b; - if (t->length_of_array/4 > t->smallest_never_used_blocknum.b) { - //We're using more memory than necessary to represent this now. Reduce. + if (t->length_of_array / 4 > t->smallest_never_used_blocknum.b) { + // We're using more memory than necessary to represent this now. + // Reduce. uint64_t new_length = t->smallest_never_used_blocknum.b * 2; XREALLOC_N(new_length, t->block_translation); t->length_of_array = new_length; - //No need to zero anything out. + // No need to zero anything out. } - //Regenerate free list. + // Regenerate free list. t->blocknum_freelist_head.b = freelist_null.b; - for (b.b = RESERVED_BLOCKNUMS; b.b < t->smallest_never_used_blocknum.b; b.b++) { + for (b.b = RESERVED_BLOCKNUMS; b.b < t->smallest_never_used_blocknum.b; + b.b++) { if (t->block_translation[b.b].size == size_is_free) { - t->block_translation[b.b].u.next_free_blocknum = t->blocknum_freelist_head; - t->blocknum_freelist_head = b; + t->block_translation[b.b].u.next_free_blocknum = + t->blocknum_freelist_head; + t->blocknum_freelist_head = b; } } } @@ -303,14 +333,16 @@ void block_table::note_start_checkpoint_unlocked() { } void block_table::note_skipped_checkpoint() { - //Purpose, alert block translation that the checkpoint was skipped, e.x. for a non-dirty header + // Purpose, alert block translation that the checkpoint was skipped, e.x. + // for a non-dirty header _mutex_lock(); paranoid_invariant_notnull(_inprogress.block_translation); _checkpoint_skipped = true; _mutex_unlock(); } -// Purpose: free any disk space used by previous checkpoint that isn't in use by either +// Purpose: free any disk space used by previous checkpoint that isn't in use by +// either // - current state // - in-progress checkpoint // capture inprogress as new checkpointed. 
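Between these two hunks, for orientation: a minimal, self-contained C++ sketch of the end-of-checkpoint cleanup pass that the next hunk converts to the new FreeBlock(offset, size) call. All names here (Pair, Translation, ToyAllocator, prevents_freeing) are illustrative stand-ins, not PerconaFT's actual types; the point is only that, with the rbtree-based allocator, the caller must now pass the block size because the allocator no longer keeps a per-offset size table.

    // Illustrative sketch only; simplified model of block_table::note_end_checkpoint's free pass.
    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <map>
    #include <vector>

    struct Pair { int64_t diskoff; int64_t size; };   // one translation entry
    using Translation = std::vector<Pair>;            // index = blocknum

    // Stand-in for BlockAllocator::FreeBlock(offset, size).
    struct ToyAllocator {
        std::map<int64_t, int64_t> free_list;         // offset -> size
        void FreeBlock(int64_t offset, int64_t size) { free_list[offset] = size; }
    };

    // True if translation t still points blocknum b at the same disk offset,
    // i.e. freeing that region would corrupt data t still references.
    static bool prevents_freeing(const Translation &t, size_t b, const Pair &old_pair) {
        return b < t.size() && t[b].diskoff == old_pair.diskoff;
    }

    // Free every region owned only by the old checkpointed translation; regions still
    // referenced by the in-progress translation are left alone, and (as in the real
    // code) the current translation is asserted not to need them either.
    static void end_checkpoint_cleanup(const Translation &checkpointed,
                                       const Translation &inprogress,
                                       const Translation &current,
                                       ToyAllocator &ba) {
        for (size_t i = 0; i < checkpointed.size(); i++) {
            const Pair &pair = checkpointed[i];
            if (pair.size > 0 && !prevents_freeing(inprogress, i, pair)) {
                assert(!prevents_freeing(current, i, pair));
                ba.FreeBlock(pair.diskoff, pair.size);
            }
        }
    }

    int main() {
        Translation checkpointed = {{4096, 512}, {8192, 1024}};
        Translation inprogress   = {{4096, 512}, {16384, 1024}};  // blocknum 1 was rewritten elsewhere
        Translation current      = inprogress;
        ToyAllocator ba;
        end_checkpoint_cleanup(checkpointed, inprogress, current, ba);
        printf("freed %zu region(s)\n", ba.free_list.size());     // expect 1 (the old copy at offset 8192)
        return 0;
    }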
@@ -323,7 +355,7 @@ void block_table::note_skipped_checkpoint() { void block_table::note_end_checkpoint(int fd) { // Free unused blocks _mutex_lock(); - uint64_t allocated_limit_at_start = _bt_block_allocator.allocated_limit(); + uint64_t allocated_limit_at_start = _bt_block_allocator->AllocatedLimit(); paranoid_invariant_notnull(_inprogress.block_translation); if (_checkpoint_skipped) { toku_free(_inprogress.block_translation); @@ -331,17 +363,23 @@ void block_table::note_end_checkpoint(int fd) { goto end; } - //Make certain inprogress was allocated space on disk - assert(_inprogress.block_translation[RESERVED_BLOCKNUM_TRANSLATION].size > 0); - assert(_inprogress.block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff > 0); + // Make certain inprogress was allocated space on disk + invariant( + _inprogress.block_translation[RESERVED_BLOCKNUM_TRANSLATION].size > 0); + invariant( + _inprogress.block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff > + 0); { struct translation *t = &_checkpointed; for (int64_t i = 0; i < t->length_of_array; i++) { struct block_translation_pair *pair = &t->block_translation[i]; - if (pair->size > 0 && !_translation_prevents_freeing(&_inprogress, make_blocknum(i), pair)) { - assert(!_translation_prevents_freeing(&_current, make_blocknum(i), pair)); - _bt_block_allocator.free_block(pair->u.diskoff); + if (pair->size > 0 && + !_translation_prevents_freeing( + &_inprogress, make_blocknum(i), pair)) { + invariant(!_translation_prevents_freeing( + &_current, make_blocknum(i), pair)); + _bt_block_allocator->FreeBlock(pair->u.diskoff, pair->size); } } toku_free(_checkpointed.block_translation); @@ -359,53 +397,65 @@ bool block_table::_is_valid_blocknum(struct translation *t, BLOCKNUM b) { return b.b >= 0 && b.b < t->smallest_never_used_blocknum.b; } -void block_table::_verify_valid_blocknum(struct translation *UU(t), BLOCKNUM UU(b)) { +void block_table::_verify_valid_blocknum(struct translation *UU(t), + BLOCKNUM UU(b)) { invariant(_is_valid_blocknum(t, b)); } -bool block_table::_is_valid_freeable_blocknum(struct translation *t, BLOCKNUM b) { +bool block_table::_is_valid_freeable_blocknum(struct translation *t, + BLOCKNUM b) { invariant(t->length_of_array >= t->smallest_never_used_blocknum.b); return b.b >= RESERVED_BLOCKNUMS && b.b < t->smallest_never_used_blocknum.b; } // should be freeable -void block_table::_verify_valid_freeable_blocknum(struct translation *UU(t), BLOCKNUM UU(b)) { +void block_table::_verify_valid_freeable_blocknum(struct translation *UU(t), + BLOCKNUM UU(b)) { invariant(_is_valid_freeable_blocknum(t, b)); } // Also used only in ft-serialize-test. -void block_table::block_free(uint64_t offset) { +void block_table::block_free(uint64_t offset, uint64_t size) { _mutex_lock(); - _bt_block_allocator.free_block(offset); + _bt_block_allocator->FreeBlock(offset, size); _mutex_unlock(); } int64_t block_table::_calculate_size_on_disk(struct translation *t) { - return 8 + // smallest_never_used_blocknum - 8 + // blocknum_freelist_head - t->smallest_never_used_blocknum.b * 16 + // Array - 4; // 4 for checksum + return 8 + // smallest_never_used_blocknum + 8 + // blocknum_freelist_head + t->smallest_never_used_blocknum.b * 16 + // Array + 4; // 4 for checksum } -// We cannot free the disk space allocated to this blocknum if it is still in use by the given translation table. 
-bool block_table::_translation_prevents_freeing(struct translation *t, BLOCKNUM b, struct block_translation_pair *old_pair) { - return t->block_translation && - b.b < t->smallest_never_used_blocknum.b && +// We cannot free the disk space allocated to this blocknum if it is still in +// use by the given translation table. +bool block_table::_translation_prevents_freeing( + struct translation *t, + BLOCKNUM b, + struct block_translation_pair *old_pair) { + return t->block_translation && b.b < t->smallest_never_used_blocknum.b && old_pair->u.diskoff == t->block_translation[b.b].u.diskoff; } -void block_table::_realloc_on_disk_internal(BLOCKNUM b, DISKOFF size, DISKOFF *offset, FT ft, bool for_checkpoint, uint64_t heat) { +void block_table::_realloc_on_disk_internal(BLOCKNUM b, + DISKOFF size, + DISKOFF *offset, + FT ft, + bool for_checkpoint) { toku_mutex_assert_locked(&_mutex); ft_set_dirty(ft, for_checkpoint); struct translation *t = &_current; struct block_translation_pair old_pair = t->block_translation[b.b]; - //Free the old block if it is not still in use by the checkpoint in progress or the previous checkpoint - bool cannot_free = (bool) - ((!for_checkpoint && _translation_prevents_freeing(&_inprogress, b, &old_pair)) || - _translation_prevents_freeing(&_checkpointed, b, &old_pair)); - if (!cannot_free && old_pair.u.diskoff!=diskoff_unused) { - _bt_block_allocator.free_block(old_pair.u.diskoff); + // Free the old block if it is not still in use by the checkpoint in + // progress or the previous checkpoint + bool cannot_free = + (!for_checkpoint && + _translation_prevents_freeing(&_inprogress, b, &old_pair)) || + _translation_prevents_freeing(&_checkpointed, b, &old_pair); + if (!cannot_free && old_pair.u.diskoff != diskoff_unused) { + _bt_block_allocator->FreeBlock(old_pair.u.diskoff, old_pair.size); } uint64_t allocator_offset = diskoff_unused; @@ -413,19 +463,22 @@ void block_table::_realloc_on_disk_internal(BLOCKNUM b, DISKOFF size, DISKOFF *o if (size > 0) { // Allocate a new block if the size is greater than 0, // if the size is just 0, offset will be set to diskoff_unused - _bt_block_allocator.alloc_block(size, heat, &allocator_offset); + _bt_block_allocator->AllocBlock(size, &allocator_offset); } t->block_translation[b.b].u.diskoff = allocator_offset; *offset = allocator_offset; - //Update inprogress btt if appropriate (if called because Pending bit is set). + // Update inprogress btt if appropriate (if called because Pending bit is + // set). 
if (for_checkpoint) { paranoid_invariant(b.b < _inprogress.length_of_array); _inprogress.block_translation[b.b] = t->block_translation[b.b]; } } -void block_table::_ensure_safe_write_unlocked(int fd, DISKOFF block_size, DISKOFF block_offset) { +void block_table::_ensure_safe_write_unlocked(int fd, + DISKOFF block_size, + DISKOFF block_offset) { // Requires: holding _mutex uint64_t size_needed = block_size + block_offset; if (size_needed > _safe_file_size) { @@ -435,7 +488,8 @@ void block_table::_ensure_safe_write_unlocked(int fd, DISKOFF block_size, DISKOF _mutex_unlock(); int64_t size_after; - toku_maybe_preallocate_in_file(fd, size_needed, _safe_file_size, &size_after); + toku_maybe_preallocate_in_file( + fd, size_needed, _safe_file_size, &size_after); _mutex_lock(); _safe_file_size = size_after; @@ -444,11 +498,16 @@ void block_table::_ensure_safe_write_unlocked(int fd, DISKOFF block_size, DISKOF } } -void block_table::realloc_on_disk(BLOCKNUM b, DISKOFF size, DISKOFF *offset, FT ft, int fd, bool for_checkpoint, uint64_t heat) { +void block_table::realloc_on_disk(BLOCKNUM b, + DISKOFF size, + DISKOFF *offset, + FT ft, + int fd, + bool for_checkpoint) { _mutex_lock(); struct translation *t = &_current; _verify_valid_freeable_blocknum(t, b); - _realloc_on_disk_internal(b, size, offset, ft, for_checkpoint, heat); + _realloc_on_disk_internal(b, size, offset, ft, for_checkpoint); _ensure_safe_write_unlocked(fd, size, *offset); _mutex_unlock(); @@ -458,70 +517,97 @@ bool block_table::_pair_is_unallocated(struct block_translation_pair *pair) { return pair->size == 0 && pair->u.diskoff == diskoff_unused; } -// Effect: figure out where to put the inprogress btt on disk, allocate space for it there. -// The space must be 512-byte aligned (both the starting address and the size). -// As a result, the allcoated space may be a little bit bigger (up to the next 512-byte boundary) than the actual btt. +// Effect: figure out where to put the inprogress btt on disk, allocate space +// for it there. +// The space must be 512-byte aligned (both the starting address and the +// size). +// As a result, the allcoated space may be a little bit bigger (up to the next +// 512-byte boundary) than the actual btt. void block_table::_alloc_inprogress_translation_on_disk_unlocked() { toku_mutex_assert_locked(&_mutex); struct translation *t = &_inprogress; paranoid_invariant_notnull(t->block_translation); BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_TRANSLATION); - //Each inprogress is allocated only once + // Each inprogress is allocated only once paranoid_invariant(_pair_is_unallocated(&t->block_translation[b.b])); - //Allocate a new block + // Allocate a new block int64_t size = _calculate_size_on_disk(t); uint64_t offset; - _bt_block_allocator.alloc_block(size, 0, &offset); + _bt_block_allocator->AllocBlock(size, &offset); t->block_translation[b.b].u.diskoff = offset; - t->block_translation[b.b].size = size; + t->block_translation[b.b].size = size; } // Effect: Serializes the blocktable to a wbuf (which starts uninitialized) -// A clean shutdown runs checkpoint start so that current and inprogress are copies. -// The resulting wbuf buffer is guaranteed to be be 512-byte aligned and the total length is a multiple of 512 (so we pad with zeros at the end if needd) -// The address is guaranteed to be 512-byte aligned, but the size is not guaranteed. 
-// It *is* guaranteed that we can read up to the next 512-byte boundary, however -void block_table::serialize_translation_to_wbuf(int fd, struct wbuf *w, - int64_t *address, int64_t *size) { +// A clean shutdown runs checkpoint start so that current and inprogress are +// copies. +// The resulting wbuf buffer is guaranteed to be be 512-byte aligned and the +// total length is a multiple of 512 (so we pad with zeros at the end if +// needd) +// The address is guaranteed to be 512-byte aligned, but the size is not +// guaranteed. +// It *is* guaranteed that we can read up to the next 512-byte boundary, +// however +void block_table::serialize_translation_to_wbuf(int fd, + struct wbuf *w, + int64_t *address, + int64_t *size) { _mutex_lock(); struct translation *t = &_inprogress; BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_TRANSLATION); - _alloc_inprogress_translation_on_disk_unlocked(); // The allocated block must be 512-byte aligned to make O_DIRECT happy. + _alloc_inprogress_translation_on_disk_unlocked(); // The allocated block + // must be 512-byte + // aligned to make + // O_DIRECT happy. uint64_t size_translation = _calculate_size_on_disk(t); - uint64_t size_aligned = roundup_to_multiple(512, size_translation); - assert((int64_t)size_translation==t->block_translation[b.b].size); + uint64_t size_aligned = roundup_to_multiple(512, size_translation); + invariant((int64_t)size_translation == t->block_translation[b.b].size); { - //Init wbuf + // Init wbuf if (0) - printf("%s:%d writing translation table of size_translation %" PRIu64 " at %" PRId64 "\n", __FILE__, __LINE__, size_translation, t->block_translation[b.b].u.diskoff); + printf( + "%s:%d writing translation table of size_translation %" PRIu64 + " at %" PRId64 "\n", + __FILE__, + __LINE__, + size_translation, + t->block_translation[b.b].u.diskoff); char *XMALLOC_N_ALIGNED(512, size_aligned, buf); - for (uint64_t i=size_translation; i<size_aligned; i++) buf[i]=0; // fill in the end of the buffer with zeros. + for (uint64_t i = size_translation; i < size_aligned; i++) + buf[i] = 0; // fill in the end of the buffer with zeros. wbuf_init(w, buf, size_aligned); } - wbuf_BLOCKNUM(w, t->smallest_never_used_blocknum); - wbuf_BLOCKNUM(w, t->blocknum_freelist_head); + wbuf_BLOCKNUM(w, t->smallest_never_used_blocknum); + wbuf_BLOCKNUM(w, t->blocknum_freelist_head); int64_t i; - for (i=0; i<t->smallest_never_used_blocknum.b; i++) { + for (i = 0; i < t->smallest_never_used_blocknum.b; i++) { if (0) - printf("%s:%d %" PRId64 ",%" PRId64 "\n", __FILE__, __LINE__, t->block_translation[i].u.diskoff, t->block_translation[i].size); + printf("%s:%d %" PRId64 ",%" PRId64 "\n", + __FILE__, + __LINE__, + t->block_translation[i].u.diskoff, + t->block_translation[i].size); wbuf_DISKOFF(w, t->block_translation[i].u.diskoff); wbuf_DISKOFF(w, t->block_translation[i].size); } uint32_t checksum = toku_x1764_finish(&w->checksum); wbuf_int(w, checksum); *address = t->block_translation[b.b].u.diskoff; - *size = size_translation; - assert((*address)%512 == 0); + *size = size_translation; + invariant((*address) % 512 == 0); _ensure_safe_write_unlocked(fd, size_aligned, *address); _mutex_unlock(); } -// Perhaps rename: purpose is get disk address of a block, given its blocknum (blockid?) -void block_table::_translate_blocknum_to_offset_size_unlocked(BLOCKNUM b, DISKOFF *offset, DISKOFF *size) { +// Perhaps rename: purpose is get disk address of a block, given its blocknum +// (blockid?) 
+void block_table::_translate_blocknum_to_offset_size_unlocked(BLOCKNUM b, + DISKOFF *offset, + DISKOFF *size) { struct translation *t = &_current; _verify_valid_blocknum(t, b); if (offset) { @@ -532,8 +618,11 @@ void block_table::_translate_blocknum_to_offset_size_unlocked(BLOCKNUM b, DISKOF } } -// Perhaps rename: purpose is get disk address of a block, given its blocknum (blockid?) -void block_table::translate_blocknum_to_offset_size(BLOCKNUM b, DISKOFF *offset, DISKOFF *size) { +// Perhaps rename: purpose is get disk address of a block, given its blocknum +// (blockid?) +void block_table::translate_blocknum_to_offset_size(BLOCKNUM b, + DISKOFF *offset, + DISKOFF *size) { _mutex_lock(); _translate_blocknum_to_offset_size_unlocked(b, offset, size); _mutex_unlock(); @@ -544,13 +633,13 @@ void block_table::translate_blocknum_to_offset_size(BLOCKNUM b, DISKOFF *offset, // given that one more never-used blocknum will soon be used. void block_table::_maybe_expand_translation(struct translation *t) { if (t->length_of_array <= t->smallest_never_used_blocknum.b) { - //expansion is necessary + // expansion is necessary uint64_t new_length = t->smallest_never_used_blocknum.b * 2; XREALLOC_N(new_length, t->block_translation); uint64_t i; for (i = t->length_of_array; i < new_length; i++) { t->block_translation[i].u.next_free_blocknum = freelist_null; - t->block_translation[i].size = size_is_free; + t->block_translation[i].size = size_is_free; } t->length_of_array = new_length; } @@ -563,7 +652,8 @@ void block_table::_allocate_blocknum_unlocked(BLOCKNUM *res, FT ft) { if (t->blocknum_freelist_head.b == freelist_null.b) { // no previously used blocknums are available // use a never used blocknum - _maybe_expand_translation(t); //Ensure a never used blocknums is available + _maybe_expand_translation( + t); // Ensure a never used blocknums is available result = t->smallest_never_used_blocknum; t->smallest_never_used_blocknum.b++; } else { // reuse a previously used blocknum @@ -571,11 +661,11 @@ void block_table::_allocate_blocknum_unlocked(BLOCKNUM *res, FT ft) { BLOCKNUM next = t->block_translation[result.b].u.next_free_blocknum; t->blocknum_freelist_head = next; } - //Verify the blocknum is free + // Verify the blocknum is free paranoid_invariant(t->block_translation[result.b].size == size_is_free); - //blocknum is not free anymore + // blocknum is not free anymore t->block_translation[result.b].u.diskoff = diskoff_unused; - t->block_translation[result.b].size = 0; + t->block_translation[result.b].size = 0; _verify_valid_freeable_blocknum(t, result); *res = result; ft_set_dirty(ft, false); @@ -587,42 +677,46 @@ void block_table::allocate_blocknum(BLOCKNUM *res, FT ft) { _mutex_unlock(); } -void block_table::_free_blocknum_in_translation(struct translation *t, BLOCKNUM b) { +void block_table::_free_blocknum_in_translation(struct translation *t, + BLOCKNUM b) { _verify_valid_freeable_blocknum(t, b); paranoid_invariant(t->block_translation[b.b].size != size_is_free); - t->block_translation[b.b].size = size_is_free; + t->block_translation[b.b].size = size_is_free; t->block_translation[b.b].u.next_free_blocknum = t->blocknum_freelist_head; - t->blocknum_freelist_head = b; + t->blocknum_freelist_head = b; } // Effect: Free a blocknum. 
// If the blocknum holds the only reference to a block on disk, free that block -void block_table::_free_blocknum_unlocked(BLOCKNUM *bp, FT ft, bool for_checkpoint) { +void block_table::_free_blocknum_unlocked(BLOCKNUM *bp, + FT ft, + bool for_checkpoint) { toku_mutex_assert_locked(&_mutex); BLOCKNUM b = *bp; - bp->b = 0; //Remove caller's reference. + bp->b = 0; // Remove caller's reference. struct block_translation_pair old_pair = _current.block_translation[b.b]; _free_blocknum_in_translation(&_current, b); if (for_checkpoint) { - paranoid_invariant(ft->checkpoint_header->type == FT_CHECKPOINT_INPROGRESS); + paranoid_invariant(ft->checkpoint_header->type == + FT_CHECKPOINT_INPROGRESS); _free_blocknum_in_translation(&_inprogress, b); } - //If the size is 0, no disk block has ever been assigned to this blocknum. + // If the size is 0, no disk block has ever been assigned to this blocknum. if (old_pair.size > 0) { - //Free the old block if it is not still in use by the checkpoint in progress or the previous checkpoint - bool cannot_free = (bool) - (_translation_prevents_freeing(&_inprogress, b, &old_pair) || - _translation_prevents_freeing(&_checkpointed, b, &old_pair)); + // Free the old block if it is not still in use by the checkpoint in + // progress or the previous checkpoint + bool cannot_free = + _translation_prevents_freeing(&_inprogress, b, &old_pair) || + _translation_prevents_freeing(&_checkpointed, b, &old_pair); if (!cannot_free) { - _bt_block_allocator.free_block(old_pair.u.diskoff); + _bt_block_allocator->FreeBlock(old_pair.u.diskoff, old_pair.size); } - } - else { - paranoid_invariant(old_pair.size==0); + } else { + paranoid_invariant(old_pair.size == 0); paranoid_invariant(old_pair.u.diskoff == diskoff_unused); } ft_set_dirty(ft, for_checkpoint); @@ -644,13 +738,14 @@ void block_table::verify_no_free_blocknums() { void block_table::free_unused_blocknums(BLOCKNUM root) { _mutex_lock(); int64_t smallest = _current.smallest_never_used_blocknum.b; - for (int64_t i=RESERVED_BLOCKNUMS; i < smallest; i++) { + for (int64_t i = RESERVED_BLOCKNUMS; i < smallest; i++) { if (i == root.b) { continue; } BLOCKNUM b = make_blocknum(i); if (_current.block_translation[b.b].size == 0) { - invariant(_current.block_translation[b.b].u.diskoff == diskoff_unused); + invariant(_current.block_translation[b.b].u.diskoff == + diskoff_unused); _free_blocknum_in_translation(&_current, b); } } @@ -675,13 +770,14 @@ bool block_table::_no_data_blocks_except_root(BLOCKNUM root) { goto cleanup; } } - cleanup: +cleanup: _mutex_unlock(); return ok; } // Verify there are no data blocks except root. -// TODO(leif): This actually takes a lock, but I don't want to fix all the callers right now. +// TODO(leif): This actually takes a lock, but I don't want to fix all the +// callers right now. 
void block_table::verify_no_data_blocks_except_root(BLOCKNUM UU(root)) { paranoid_invariant(_no_data_blocks_except_root(root)); } @@ -705,13 +801,24 @@ void block_table::_dump_translation_internal(FILE *f, struct translation *t) { if (t->block_translation) { BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_TRANSLATION); fprintf(f, " length_of_array[%" PRId64 "]", t->length_of_array); - fprintf(f, " smallest_never_used_blocknum[%" PRId64 "]", t->smallest_never_used_blocknum.b); - fprintf(f, " blocknum_free_list_head[%" PRId64 "]", t->blocknum_freelist_head.b); - fprintf(f, " size_on_disk[%" PRId64 "]", t->block_translation[b.b].size); - fprintf(f, " location_on_disk[%" PRId64 "]\n", t->block_translation[b.b].u.diskoff); + fprintf(f, + " smallest_never_used_blocknum[%" PRId64 "]", + t->smallest_never_used_blocknum.b); + fprintf(f, + " blocknum_free_list_head[%" PRId64 "]", + t->blocknum_freelist_head.b); + fprintf( + f, " size_on_disk[%" PRId64 "]", t->block_translation[b.b].size); + fprintf(f, + " location_on_disk[%" PRId64 "]\n", + t->block_translation[b.b].u.diskoff); int64_t i; - for (i=0; i<t->length_of_array; i++) { - fprintf(f, " %" PRId64 ": %" PRId64 " %" PRId64 "\n", i, t->block_translation[i].u.diskoff, t->block_translation[i].size); + for (i = 0; i < t->length_of_array; i++) { + fprintf(f, + " %" PRId64 ": %" PRId64 " %" PRId64 "\n", + i, + t->block_translation[i].u.diskoff, + t->block_translation[i].size); } fprintf(f, "\n"); } else { @@ -724,9 +831,13 @@ void block_table::_dump_translation_internal(FILE *f, struct translation *t) { void block_table::dump_translation_table_pretty(FILE *f) { _mutex_lock(); struct translation *t = &_checkpointed; - assert(t->block_translation != nullptr); + invariant(t->block_translation != nullptr); for (int64_t i = 0; i < t->length_of_array; ++i) { - fprintf(f, "%" PRId64 "\t%" PRId64 "\t%" PRId64 "\n", i, t->block_translation[i].u.diskoff, t->block_translation[i].size); + fprintf(f, + "%" PRId64 "\t%" PRId64 "\t%" PRId64 "\n", + i, + t->block_translation[i].u.diskoff, + t->block_translation[i].size); } _mutex_unlock(); } @@ -750,7 +861,10 @@ void block_table::blocknum_dump_translation(BLOCKNUM b) { struct translation *t = &_current; if (b.b < t->length_of_array) { struct block_translation_pair *bx = &t->block_translation[b.b]; - printf("%" PRId64 ": %" PRId64 " %" PRId64 "\n", b.b, bx->u.diskoff, bx->size); + printf("%" PRId64 ": %" PRId64 " %" PRId64 "\n", + b.b, + bx->u.diskoff, + bx->size); } _mutex_unlock(); } @@ -763,26 +877,31 @@ void block_table::destroy(void) { toku_free(_inprogress.block_translation); toku_free(_checkpointed.block_translation); - _bt_block_allocator.destroy(); + _bt_block_allocator->Destroy(); + delete _bt_block_allocator; toku_mutex_destroy(&_mutex); nb_mutex_destroy(&_safe_file_size_lock); } -int block_table::_translation_deserialize_from_buffer(struct translation *t, - DISKOFF location_on_disk, - uint64_t size_on_disk, - // out: buffer with serialized translation - unsigned char *translation_buffer) { +int block_table::_translation_deserialize_from_buffer( + struct translation *t, + DISKOFF location_on_disk, + uint64_t size_on_disk, + // out: buffer with serialized translation + unsigned char *translation_buffer) { int r = 0; - assert(location_on_disk != 0); + invariant(location_on_disk != 0); t->type = TRANSLATION_CHECKPOINTED; // check the checksum uint32_t x1764 = toku_x1764_memory(translation_buffer, size_on_disk - 4); uint64_t offset = size_on_disk - 4; - uint32_t stored_x1764 = 
toku_dtoh32(*(int*)(translation_buffer + offset)); + uint32_t stored_x1764 = toku_dtoh32(*(int *)(translation_buffer + offset)); if (x1764 != stored_x1764) { - fprintf(stderr, "Translation table checksum failure: calc=0x%08x read=0x%08x\n", x1764, stored_x1764); + fprintf(stderr, + "Translation table checksum failure: calc=0x%08x read=0x%08x\n", + x1764, + stored_x1764); r = TOKUDB_BAD_CHECKSUM; goto exit; } @@ -790,42 +909,47 @@ int block_table::_translation_deserialize_from_buffer(struct translation *t, struct rbuf rb; rb.buf = translation_buffer; rb.ndone = 0; - rb.size = size_on_disk-4;//4==checksum + rb.size = size_on_disk - 4; // 4==checksum - t->smallest_never_used_blocknum = rbuf_blocknum(&rb); + t->smallest_never_used_blocknum = rbuf_blocknum(&rb); t->length_of_array = t->smallest_never_used_blocknum.b; invariant(t->smallest_never_used_blocknum.b >= RESERVED_BLOCKNUMS); - t->blocknum_freelist_head = rbuf_blocknum(&rb); + t->blocknum_freelist_head = rbuf_blocknum(&rb); XMALLOC_N(t->length_of_array, t->block_translation); for (int64_t i = 0; i < t->length_of_array; i++) { t->block_translation[i].u.diskoff = rbuf_DISKOFF(&rb); t->block_translation[i].size = rbuf_DISKOFF(&rb); } - invariant(_calculate_size_on_disk(t) == (int64_t) size_on_disk); - invariant(t->block_translation[RESERVED_BLOCKNUM_TRANSLATION].size == (int64_t) size_on_disk); - invariant(t->block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff == location_on_disk); + invariant(_calculate_size_on_disk(t) == (int64_t)size_on_disk); + invariant(t->block_translation[RESERVED_BLOCKNUM_TRANSLATION].size == + (int64_t)size_on_disk); + invariant(t->block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff == + location_on_disk); exit: return r; } int block_table::iterate(enum translation_type type, - BLOCKTABLE_CALLBACK f, void *extra, bool data_only, bool used_only) { + BLOCKTABLE_CALLBACK f, + void *extra, + bool data_only, + bool used_only) { struct translation *src; - + int r = 0; switch (type) { - case TRANSLATION_CURRENT: - src = &_current; - break; - case TRANSLATION_INPROGRESS: - src = &_inprogress; - break; - case TRANSLATION_CHECKPOINTED: - src = &_checkpointed; - break; - default: - r = EINVAL; + case TRANSLATION_CURRENT: + src = &_current; + break; + case TRANSLATION_INPROGRESS: + src = &_inprogress; + break; + case TRANSLATION_CHECKPOINTED: + src = &_checkpointed; + break; + default: + r = EINVAL; } struct translation fakecurrent; @@ -839,12 +963,15 @@ int block_table::iterate(enum translation_type type, src->block_translation[RESERVED_BLOCKNUM_TRANSLATION]; _mutex_unlock(); int64_t i; - for (i=0; i<t->smallest_never_used_blocknum.b; i++) { + for (i = 0; i < t->smallest_never_used_blocknum.b; i++) { struct block_translation_pair pair = t->block_translation[i]; - if (data_only && i< RESERVED_BLOCKNUMS) continue; - if (used_only && pair.size <= 0) continue; + if (data_only && i < RESERVED_BLOCKNUMS) + continue; + if (used_only && pair.size <= 0) + continue; r = f(make_blocknum(i), pair.size, pair.u.diskoff, extra); - if (r!=0) break; + if (r != 0) + break; } toku_free(t->block_translation); } @@ -856,8 +983,11 @@ typedef struct { int64_t total_space; } frag_extra; -static int frag_helper(BLOCKNUM UU(b), int64_t size, int64_t address, void *extra) { - frag_extra *info = (frag_extra *) extra; +static int frag_helper(BLOCKNUM UU(b), + int64_t size, + int64_t address, + void *extra) { + frag_extra *info = (frag_extra *)extra; if (size + address > info->total_space) info->total_space = size + address; @@ -865,22 
+995,30 @@ static int frag_helper(BLOCKNUM UU(b), int64_t size, int64_t address, void *extr return 0; } -void block_table::internal_fragmentation(int64_t *total_sizep, int64_t *used_sizep) { - frag_extra info = { 0, 0 }; +void block_table::internal_fragmentation(int64_t *total_sizep, + int64_t *used_sizep) { + frag_extra info = {0, 0}; int r = iterate(TRANSLATION_CHECKPOINTED, frag_helper, &info, false, true); - assert_zero(r); + invariant_zero(r); - if (total_sizep) *total_sizep = info.total_space; - if (used_sizep) *used_sizep = info.used_space; + if (total_sizep) + *total_sizep = info.total_space; + if (used_sizep) + *used_sizep = info.used_space; } -void block_table::_realloc_descriptor_on_disk_unlocked(DISKOFF size, DISKOFF *offset, FT ft) { +void block_table::_realloc_descriptor_on_disk_unlocked(DISKOFF size, + DISKOFF *offset, + FT ft) { toku_mutex_assert_locked(&_mutex); BLOCKNUM b = make_blocknum(RESERVED_BLOCKNUM_DESCRIPTOR); - _realloc_on_disk_internal(b, size, offset, ft, false, 0); + _realloc_on_disk_internal(b, size, offset, ft, false); } -void block_table::realloc_descriptor_on_disk(DISKOFF size, DISKOFF *offset, FT ft, int fd) { +void block_table::realloc_descriptor_on_disk(DISKOFF size, + DISKOFF *offset, + FT ft, + int fd) { _mutex_lock(); _realloc_descriptor_on_disk_unlocked(size, offset, ft); _ensure_safe_write_unlocked(fd, size, *offset); @@ -897,11 +1035,12 @@ void block_table::get_descriptor_offset_size(DISKOFF *offset, DISKOFF *size) { void block_table::get_fragmentation_unlocked(TOKU_DB_FRAGMENTATION report) { // Requires: blocktable lock is held. // Requires: report->file_size_bytes is already filled in. - + // Count the headers. - report->data_bytes = block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE; + report->data_bytes = BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE; report->data_blocks = 1; - report->checkpoint_bytes_additional = block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE; + report->checkpoint_bytes_additional = + BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE; report->checkpoint_blocks_additional = 1; struct translation *current = &_current; @@ -915,30 +1054,34 @@ void block_table::get_fragmentation_unlocked(TOKU_DB_FRAGMENTATION report) { struct translation *checkpointed = &_checkpointed; for (int64_t i = 0; i < checkpointed->length_of_array; i++) { - struct block_translation_pair *pair = &checkpointed->block_translation[i]; - if (pair->size > 0 && !(i < current->length_of_array && - current->block_translation[i].size > 0 && - current->block_translation[i].u.diskoff == pair->u.diskoff)) { - report->checkpoint_bytes_additional += pair->size; - report->checkpoint_blocks_additional++; + struct block_translation_pair *pair = + &checkpointed->block_translation[i]; + if (pair->size > 0 && + !(i < current->length_of_array && + current->block_translation[i].size > 0 && + current->block_translation[i].u.diskoff == pair->u.diskoff)) { + report->checkpoint_bytes_additional += pair->size; + report->checkpoint_blocks_additional++; } } struct translation *inprogress = &_inprogress; for (int64_t i = 0; i < inprogress->length_of_array; i++) { struct block_translation_pair *pair = &inprogress->block_translation[i]; - if (pair->size > 0 && !(i < current->length_of_array && - current->block_translation[i].size > 0 && - current->block_translation[i].u.diskoff == pair->u.diskoff) && - !(i < checkpointed->length_of_array && - checkpointed->block_translation[i].size > 0 && - checkpointed->block_translation[i].u.diskoff == pair->u.diskoff)) { + if (pair->size > 0 && + !(i < 
current->length_of_array && + current->block_translation[i].size > 0 && + current->block_translation[i].u.diskoff == pair->u.diskoff) && + !(i < checkpointed->length_of_array && + checkpointed->block_translation[i].size > 0 && + checkpointed->block_translation[i].u.diskoff == + pair->u.diskoff)) { report->checkpoint_bytes_additional += pair->size; report->checkpoint_blocks_additional++; } } - _bt_block_allocator.get_unused_statistics(report); + _bt_block_allocator->UnusedStatistics(report); } void block_table::get_info64(struct ftinfo64 *s) { @@ -967,25 +1110,38 @@ void block_table::get_info64(struct ftinfo64 *s) { _mutex_unlock(); } -int block_table::iterate_translation_tables(uint64_t checkpoint_count, - int (*iter)(uint64_t checkpoint_count, - int64_t total_num_rows, - int64_t blocknum, - int64_t diskoff, - int64_t size, - void *extra), - void *iter_extra) { +int block_table::iterate_translation_tables( + uint64_t checkpoint_count, + int (*iter)(uint64_t checkpoint_count, + int64_t total_num_rows, + int64_t blocknum, + int64_t diskoff, + int64_t size, + void *extra), + void *iter_extra) { int error = 0; _mutex_lock(); - int64_t total_num_rows = _current.length_of_array + _checkpointed.length_of_array; + int64_t total_num_rows = + _current.length_of_array + _checkpointed.length_of_array; for (int64_t i = 0; error == 0 && i < _current.length_of_array; ++i) { struct block_translation_pair *block = &_current.block_translation[i]; - error = iter(checkpoint_count, total_num_rows, i, block->u.diskoff, block->size, iter_extra); + error = iter(checkpoint_count, + total_num_rows, + i, + block->u.diskoff, + block->size, + iter_extra); } for (int64_t i = 0; error == 0 && i < _checkpointed.length_of_array; ++i) { - struct block_translation_pair *block = &_checkpointed.block_translation[i]; - error = iter(checkpoint_count - 1, total_num_rows, i, block->u.diskoff, block->size, iter_extra); + struct block_translation_pair *block = + &_checkpointed.block_translation[i]; + error = iter(checkpoint_count - 1, + total_num_rows, + i, + block->u.diskoff, + block->size, + iter_extra); } _mutex_unlock(); diff --git a/storage/tokudb/PerconaFT/ft/serialize/block_table.h b/storage/tokudb/PerconaFT/ft/serialize/block_table.h index 8d391674540..dd732d4f372 100644 --- a/storage/tokudb/PerconaFT/ft/serialize/block_table.h +++ b/storage/tokudb/PerconaFT/ft/serialize/block_table.h @@ -62,13 +62,16 @@ enum { RESERVED_BLOCKNUMS }; -typedef int (*BLOCKTABLE_CALLBACK)(BLOCKNUM b, int64_t size, int64_t address, void *extra); +typedef int (*BLOCKTABLE_CALLBACK)(BLOCKNUM b, + int64_t size, + int64_t address, + void *extra); static inline BLOCKNUM make_blocknum(int64_t b) { - BLOCKNUM result = { .b = b }; + BLOCKNUM result = {.b = b}; return result; } -static const BLOCKNUM ROLLBACK_NONE = { .b = 0 }; +static const BLOCKNUM ROLLBACK_NONE = {.b = 0}; /** * There are three copies of the translation table (btt) in the block table: @@ -80,18 +83,20 @@ static const BLOCKNUM ROLLBACK_NONE = { .b = 0 }; * * inprogress Is only filled by copying from current, * and is the only version ever serialized to disk. - * (It is serialized to disk on checkpoint and clean shutdown.) + * (It is serialized to disk on checkpoint and clean + *shutdown.) * At end of checkpoint it replaces 'checkpointed'. * During a checkpoint, any 'pending' dirty writes will update * inprogress. 
* * current Is initialized by copying from checkpointed, - * is the only version ever modified while the database is in use, + * is the only version ever modified while the database is in + *use, * and is the only version ever copied to inprogress. * It is never stored on disk. */ class block_table { -public: + public: enum translation_type { TRANSLATION_NONE = 0, TRANSLATION_CURRENT, @@ -102,7 +107,10 @@ public: void create(); - int create_from_buffer(int fd, DISKOFF location_on_disk, DISKOFF size_on_disk, unsigned char *translation_buffer); + int create_from_buffer(int fd, + DISKOFF location_on_disk, + DISKOFF size_on_disk, + unsigned char *translation_buffer); void destroy(); @@ -114,11 +122,21 @@ public: // Blocknums void allocate_blocknum(BLOCKNUM *res, struct ft *ft); - void realloc_on_disk(BLOCKNUM b, DISKOFF size, DISKOFF *offset, struct ft *ft, int fd, bool for_checkpoint, uint64_t heat); + void realloc_on_disk(BLOCKNUM b, + DISKOFF size, + DISKOFF *offset, + struct ft *ft, + int fd, + bool for_checkpoint); void free_blocknum(BLOCKNUM *b, struct ft *ft, bool for_checkpoint); - void translate_blocknum_to_offset_size(BLOCKNUM b, DISKOFF *offset, DISKOFF *size); + void translate_blocknum_to_offset_size(BLOCKNUM b, + DISKOFF *offset, + DISKOFF *size); void free_unused_blocknums(BLOCKNUM root); - void realloc_descriptor_on_disk(DISKOFF size, DISKOFF *offset, struct ft *ft, int fd); + void realloc_descriptor_on_disk(DISKOFF size, + DISKOFF *offset, + struct ft *ft, + int fd); void get_descriptor_offset_size(DISKOFF *offset, DISKOFF *size); // External verfication @@ -127,15 +145,22 @@ public: void verify_no_free_blocknums(); // Serialization - void serialize_translation_to_wbuf(int fd, struct wbuf *w, int64_t *address, int64_t *size); + void serialize_translation_to_wbuf(int fd, + struct wbuf *w, + int64_t *address, + int64_t *size); // DEBUG ONLY (ftdump included), tests included void blocknum_dump_translation(BLOCKNUM b); void dump_translation_table_pretty(FILE *f); void dump_translation_table(FILE *f); - void block_free(uint64_t offset); + void block_free(uint64_t offset, uint64_t size); - int iterate(enum translation_type type, BLOCKTABLE_CALLBACK f, void *extra, bool data_only, bool used_only); + int iterate(enum translation_type type, + BLOCKTABLE_CALLBACK f, + void *extra, + bool data_only, + bool used_only); void internal_fragmentation(int64_t *total_sizep, int64_t *used_sizep); // Requires: blocktable lock is held. @@ -146,13 +171,16 @@ public: void get_info64(struct ftinfo64 *); - int iterate_translation_tables(uint64_t, int (*)(uint64_t, int64_t, int64_t, int64_t, int64_t, void *), void *); + int iterate_translation_tables( + uint64_t, + int (*)(uint64_t, int64_t, int64_t, int64_t, int64_t, void *), + void *); -private: + private: struct block_translation_pair { // If in the freelist, use next_free_blocknum, otherwise diskoff. union { - DISKOFF diskoff; + DISKOFF diskoff; BLOCKNUM next_free_blocknum; } u; @@ -173,7 +201,8 @@ private: struct translation { enum translation_type type; - // Number of elements in array (block_translation). always >= smallest_never_used_blocknum + // Number of elements in array (block_translation). 
always >= + // smallest_never_used_blocknum int64_t length_of_array; BLOCKNUM smallest_never_used_blocknum; @@ -181,20 +210,28 @@ private: BLOCKNUM blocknum_freelist_head; struct block_translation_pair *block_translation; - // size_on_disk is stored in block_translation[RESERVED_BLOCKNUM_TRANSLATION].size - // location_on is stored in block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff + // size_on_disk is stored in + // block_translation[RESERVED_BLOCKNUM_TRANSLATION].size + // location_on is stored in + // block_translation[RESERVED_BLOCKNUM_TRANSLATION].u.diskoff }; void _create_internal(); - int _translation_deserialize_from_buffer(struct translation *t, // destination into which to deserialize - DISKOFF location_on_disk, // location of translation_buffer - uint64_t size_on_disk, - unsigned char * translation_buffer); // buffer with serialized translation - - void _copy_translation(struct translation *dst, struct translation *src, enum translation_type newtype); + int _translation_deserialize_from_buffer( + struct translation *t, // destination into which to deserialize + DISKOFF location_on_disk, // location of translation_buffer + uint64_t size_on_disk, + unsigned char * + translation_buffer); // buffer with serialized translation + + void _copy_translation(struct translation *dst, + struct translation *src, + enum translation_type newtype); void _maybe_optimize_translation(struct translation *t); void _maybe_expand_translation(struct translation *t); - bool _translation_prevents_freeing(struct translation *t, BLOCKNUM b, struct block_translation_pair *old_pair); + bool _translation_prevents_freeing(struct translation *t, + BLOCKNUM b, + struct block_translation_pair *old_pair); void _free_blocknum_in_translation(struct translation *t, BLOCKNUM b); int64_t _calculate_size_on_disk(struct translation *t); bool _pair_is_unallocated(struct block_translation_pair *pair); @@ -203,14 +240,26 @@ private: // Blocknum management void _allocate_blocknum_unlocked(BLOCKNUM *res, struct ft *ft); - void _free_blocknum_unlocked(BLOCKNUM *bp, struct ft *ft, bool for_checkpoint); - void _realloc_descriptor_on_disk_unlocked(DISKOFF size, DISKOFF *offset, struct ft *ft); - void _realloc_on_disk_internal(BLOCKNUM b, DISKOFF size, DISKOFF *offset, struct ft *ft, bool for_checkpoint, uint64_t heat); - void _translate_blocknum_to_offset_size_unlocked(BLOCKNUM b, DISKOFF *offset, DISKOFF *size); + void _free_blocknum_unlocked(BLOCKNUM *bp, + struct ft *ft, + bool for_checkpoint); + void _realloc_descriptor_on_disk_unlocked(DISKOFF size, + DISKOFF *offset, + struct ft *ft); + void _realloc_on_disk_internal(BLOCKNUM b, + DISKOFF size, + DISKOFF *offset, + struct ft *ft, + bool for_checkpoint); + void _translate_blocknum_to_offset_size_unlocked(BLOCKNUM b, + DISKOFF *offset, + DISKOFF *size); // File management void _maybe_truncate_file(int fd, uint64_t size_needed_before); - void _ensure_safe_write_unlocked(int fd, DISKOFF block_size, DISKOFF block_offset); + void _ensure_safe_write_unlocked(int fd, + DISKOFF block_size, + DISKOFF block_offset); // Verification bool _is_valid_blocknum(struct translation *t, BLOCKNUM b); @@ -220,29 +269,33 @@ private: bool _no_data_blocks_except_root(BLOCKNUM root); bool _blocknum_allocated(BLOCKNUM b); - // Locking + // Locking // // TODO: Move the lock to the FT void _mutex_lock(); void _mutex_unlock(); - // The current translation is the one used by client threads. + // The current translation is the one used by client threads. // It is not represented on disk. 
struct translation _current; - // The translation used by the checkpoint currently in progress. - // If the checkpoint thread allocates a block, it must also update the current translation. + // The translation used by the checkpoint currently in progress. + // If the checkpoint thread allocates a block, it must also update the + // current translation. struct translation _inprogress; - // The translation for the data that shall remain inviolate on disk until the next checkpoint finishes, + // The translation for the data that shall remain inviolate on disk until + // the next checkpoint finishes, // after which any blocks used only in this translation can be freed. struct translation _checkpointed; - // The in-memory data structure for block allocation. + // The in-memory data structure for block allocation. // There is no on-disk data structure for block allocation. - // Note: This is *allocation* not *translation* - the block allocator is unaware of which - // blocks are used for which translation, but simply allocates and deallocates blocks. - block_allocator _bt_block_allocator; + // Note: This is *allocation* not *translation* - the block allocator is + // unaware of which + // blocks are used for which translation, but simply allocates and + // deallocates blocks. + BlockAllocator *_bt_block_allocator; toku_mutex_t _mutex; struct nb_mutex _safe_file_size_lock; bool _checkpoint_skipped; @@ -257,16 +310,16 @@ private: #include "ft/serialize/wbuf.h" -static inline void wbuf_BLOCKNUM (struct wbuf *w, BLOCKNUM b) { +static inline void wbuf_BLOCKNUM(struct wbuf *w, BLOCKNUM b) { wbuf_ulonglong(w, b.b); } -static inline void wbuf_nocrc_BLOCKNUM (struct wbuf *w, BLOCKNUM b) { +static inline void wbuf_nocrc_BLOCKNUM(struct wbuf *w, BLOCKNUM b) { wbuf_nocrc_ulonglong(w, b.b); } static inline void wbuf_DISKOFF(struct wbuf *wb, DISKOFF off) { - wbuf_ulonglong(wb, (uint64_t) off); + wbuf_ulonglong(wb, (uint64_t)off); } #include "ft/serialize/rbuf.h" @@ -280,6 +333,8 @@ static inline BLOCKNUM rbuf_blocknum(struct rbuf *rb) { return result; } -static inline void rbuf_ma_BLOCKNUM(struct rbuf *rb, memarena *UU(ma), BLOCKNUM *blocknum) { +static inline void rbuf_ma_BLOCKNUM(struct rbuf *rb, + memarena *UU(ma), + BLOCKNUM *blocknum) { *blocknum = rbuf_blocknum(rb); } diff --git a/storage/tokudb/PerconaFT/ft/serialize/compress.cc b/storage/tokudb/PerconaFT/ft/serialize/compress.cc index 1719b6b7cb5..c2f815c6cf2 100644 --- a/storage/tokudb/PerconaFT/ft/serialize/compress.cc +++ b/storage/tokudb/PerconaFT/ft/serialize/compress.cc @@ -235,7 +235,7 @@ void toku_decompress (Bytef *dest, uLongf destLen, strm.zalloc = Z_NULL; strm.zfree = Z_NULL; strm.opaque = Z_NULL; - char windowBits = source[1]; + int8_t windowBits = source[1]; int r = inflateInit2(&strm, windowBits); lazy_assert(r == Z_OK); strm.next_out = dest; diff --git a/storage/tokudb/PerconaFT/ft/serialize/ft-serialize.cc b/storage/tokudb/PerconaFT/ft/serialize/ft-serialize.cc index 49d4368a3ab..8fcb5293412 100644 --- a/storage/tokudb/PerconaFT/ft/serialize/ft-serialize.cc +++ b/storage/tokudb/PerconaFT/ft/serialize/ft-serialize.cc @@ -217,8 +217,8 @@ int deserialize_ft_versioned(int fd, struct rbuf *rb, FT *ftp, uint32_t version) // translation table itself won't fit in main memory. 
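  // size_to_read can exceed translation_size_on_disk, so the checks below only
  // require that the whole serialized translation arrived
  // (readsz >= translation_size_on_disk) and that the read stayed inside the
  // buffer (readsz <= size_to_read).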
ssize_t readsz = toku_os_pread(fd, tbuf, size_to_read, translation_address_on_disk); - assert(readsz >= translation_size_on_disk); - assert(readsz <= (ssize_t)size_to_read); + invariant(readsz >= translation_size_on_disk); + invariant(readsz <= (ssize_t)size_to_read); } // Create table and read in data. r = ft->blocktable.create_from_buffer(fd, @@ -411,73 +411,90 @@ exit: return r; } -static size_t -serialize_ft_min_size (uint32_t version) { +static size_t serialize_ft_min_size(uint32_t version) { size_t size = 0; - switch(version) { - case FT_LAYOUT_VERSION_29: - size += sizeof(uint64_t); // logrows in ft - case FT_LAYOUT_VERSION_28: - size += sizeof(uint32_t); // fanout in ft - case FT_LAYOUT_VERSION_27: - case FT_LAYOUT_VERSION_26: - case FT_LAYOUT_VERSION_25: - case FT_LAYOUT_VERSION_24: - case FT_LAYOUT_VERSION_23: - case FT_LAYOUT_VERSION_22: - case FT_LAYOUT_VERSION_21: - size += sizeof(MSN); // max_msn_in_ft - case FT_LAYOUT_VERSION_20: - case FT_LAYOUT_VERSION_19: - size += 1; // compression method - size += sizeof(MSN); // highest_unused_msn_for_upgrade - case FT_LAYOUT_VERSION_18: - size += sizeof(uint64_t); // time_of_last_optimize_begin - size += sizeof(uint64_t); // time_of_last_optimize_end - size += sizeof(uint32_t); // count_of_optimize_in_progress - size += sizeof(MSN); // msn_at_start_of_last_completed_optimize - size -= 8; // removed num_blocks_to_upgrade_14 - size -= 8; // removed num_blocks_to_upgrade_13 - case FT_LAYOUT_VERSION_17: - size += 16; - invariant(sizeof(STAT64INFO_S) == 16); - case FT_LAYOUT_VERSION_16: - case FT_LAYOUT_VERSION_15: - size += 4; // basement node size - size += 8; // num_blocks_to_upgrade_14 (previously num_blocks_to_upgrade, now one int each for upgrade from 13, 14 - size += 8; // time of last verification - case FT_LAYOUT_VERSION_14: - size += 8; //TXNID that created - case FT_LAYOUT_VERSION_13: - size += ( 4 // build_id - +4 // build_id_original - +8 // time_of_creation - +8 // time_of_last_modification - ); + switch (version) { + case FT_LAYOUT_VERSION_29: + size += sizeof(uint64_t); // logrows in ft + case FT_LAYOUT_VERSION_28: + size += sizeof(uint32_t); // fanout in ft + case FT_LAYOUT_VERSION_27: + case FT_LAYOUT_VERSION_26: + case FT_LAYOUT_VERSION_25: + case FT_LAYOUT_VERSION_24: + case FT_LAYOUT_VERSION_23: + case FT_LAYOUT_VERSION_22: + case FT_LAYOUT_VERSION_21: + size += sizeof(MSN); // max_msn_in_ft + case FT_LAYOUT_VERSION_20: + case FT_LAYOUT_VERSION_19: + size += 1; // compression method + size += sizeof(MSN); // highest_unused_msn_for_upgrade + case FT_LAYOUT_VERSION_18: + size += sizeof(uint64_t); // time_of_last_optimize_begin + size += sizeof(uint64_t); // time_of_last_optimize_end + size += sizeof(uint32_t); // count_of_optimize_in_progress + size += sizeof(MSN); // msn_at_start_of_last_completed_optimize + size -= 8; // removed num_blocks_to_upgrade_14 + size -= 8; // removed num_blocks_to_upgrade_13 + case FT_LAYOUT_VERSION_17: + size += 16; + invariant(sizeof(STAT64INFO_S) == 16); + case FT_LAYOUT_VERSION_16: + case FT_LAYOUT_VERSION_15: + size += 4; // basement node size + size += 8; // num_blocks_to_upgrade_14 (previously + // num_blocks_to_upgrade, now one int each for upgrade + // from 13, 14 + size += 8; // time of last verification + case FT_LAYOUT_VERSION_14: + size += 8; // TXNID that created + case FT_LAYOUT_VERSION_13: + size += (4 // build_id + + + 4 // build_id_original + + + 8 // time_of_creation + + + 8 // time_of_last_modification + ); // fall through - case FT_LAYOUT_VERSION_12: - size += (+8 // 
"tokudata" - +4 // version - +4 // original_version - +4 // size - +8 // byte order verification - +8 // checkpoint_count - +8 // checkpoint_lsn - +4 // tree's nodesize - +8 // translation_size_on_disk - +8 // translation_address_on_disk - +4 // checksum - +8 // Number of blocks in old version. - +8 // diskoff - +4 // flags - ); - break; - default: - abort(); - } - - lazy_assert(size <= block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE); + case FT_LAYOUT_VERSION_12: + size += (+8 // "tokudata" + + + 4 // version + + + 4 // original_version + + + 4 // size + + + 8 // byte order verification + + + 8 // checkpoint_count + + + 8 // checkpoint_lsn + + + 4 // tree's nodesize + + + 8 // translation_size_on_disk + + + 8 // translation_address_on_disk + + + 4 // checksum + + + 8 // Number of blocks in old version. + + + 8 // diskoff + + + 4 // flags + ); + break; + default: + abort(); + } + + lazy_assert(size <= BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE); return size; } @@ -486,7 +503,7 @@ int deserialize_ft_from_fd_into_rbuf(int fd, struct rbuf *rb, uint64_t *checkpoint_count, LSN *checkpoint_lsn, - uint32_t * version_p) + uint32_t *version_p) // Effect: Read and parse the header of a fractalal tree // // Simply reading the raw bytes of the header into an rbuf is insensitive @@ -496,18 +513,18 @@ int deserialize_ft_from_fd_into_rbuf(int fd, // file AND the header is useless { int r = 0; - const int64_t prefix_size = 8 + // magic ("tokudata") - 4 + // version - 4 + // build_id - 4; // size + const int64_t prefix_size = 8 + // magic ("tokudata") + 4 + // version + 4 + // build_id + 4; // size const int64_t read_size = roundup_to_multiple(512, prefix_size); unsigned char *XMALLOC_N_ALIGNED(512, read_size, prefix); rb->buf = NULL; int64_t n = toku_os_pread(fd, prefix, read_size, offset_of_header); if (n != read_size) { - if (n==0) { + if (n == 0) { r = TOKUDB_DICTIONARY_NO_HEADER; - } else if (n<0) { + } else if (n < 0) { r = get_error_errno(); } else { r = EINVAL; @@ -518,95 +535,102 @@ int deserialize_ft_from_fd_into_rbuf(int fd, rbuf_init(rb, prefix, prefix_size); - //Check magic number + // Check magic number const void *magic; rbuf_literal_bytes(rb, &magic, 8); - if (memcmp(magic,"tokudata",8)!=0) { - if ((*(uint64_t*)magic) == 0) { + if (memcmp(magic, "tokudata", 8) != 0) { + if ((*(uint64_t *)magic) == 0) { r = TOKUDB_DICTIONARY_NO_HEADER; } else { - r = EINVAL; //Not a tokudb file! Do not use. + r = EINVAL; // Not a tokudb file! Do not use. } goto exit; } - //Version MUST be in network order regardless of disk order. + // Version MUST be in network order regardless of disk order. uint32_t version; version = rbuf_network_int(rb); *version_p = version; if (version < FT_LAYOUT_MIN_SUPPORTED_VERSION) { - r = TOKUDB_DICTIONARY_TOO_OLD; //Cannot use + r = TOKUDB_DICTIONARY_TOO_OLD; // Cannot use goto exit; } else if (version > FT_LAYOUT_VERSION) { - r = TOKUDB_DICTIONARY_TOO_NEW; //Cannot use + r = TOKUDB_DICTIONARY_TOO_NEW; // Cannot use goto exit; } - //build_id MUST be in network order regardless of disk order. + // build_id MUST be in network order regardless of disk order. uint32_t build_id __attribute__((__unused__)); build_id = rbuf_network_int(rb); int64_t min_header_size; min_header_size = serialize_ft_min_size(version); - //Size MUST be in network order regardless of disk order. + // Size MUST be in network order regardless of disk order. uint32_t size; size = rbuf_network_int(rb); - //If too big, it is corrupt. 
We would probably notice during checksum - //but may have to do a multi-gigabyte malloc+read to find out. - //If its too small reading rbuf would crash, so verify. - if (size > block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE || size < min_header_size) { + // If too big, it is corrupt. We would probably notice during checksum + // but may have to do a multi-gigabyte malloc+read to find out. + // If its too small reading rbuf would crash, so verify. + if (size > BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE || + size < min_header_size) { r = TOKUDB_DICTIONARY_NO_HEADER; goto exit; } - lazy_assert(rb->ndone==prefix_size); + lazy_assert(rb->ndone == prefix_size); rb->size = size; { toku_free(rb->buf); uint32_t size_to_read = roundup_to_multiple(512, size); XMALLOC_N_ALIGNED(512, size_to_read, rb->buf); - assert(offset_of_header%512==0); + invariant(offset_of_header % 512 == 0); n = toku_os_pread(fd, rb->buf, size_to_read, offset_of_header); if (n != size_to_read) { if (n < 0) { r = get_error_errno(); } else { - r = EINVAL; //Header might be useless (wrong size) or could be a disk read error. + r = EINVAL; // Header might be useless (wrong size) or could be + // a disk read error. } goto exit; } } - //It's version 14 or later. Magic looks OK. - //We have an rbuf that represents the header. - //Size is within acceptable bounds. + // It's version 14 or later. Magic looks OK. + // We have an rbuf that represents the header. + // Size is within acceptable bounds. - //Verify checksum (FT_LAYOUT_VERSION_13 or later, when checksum function changed) + // Verify checksum (FT_LAYOUT_VERSION_13 or later, when checksum function + // changed) uint32_t calculated_x1764; - calculated_x1764 = toku_x1764_memory(rb->buf, rb->size-4); + calculated_x1764 = toku_x1764_memory(rb->buf, rb->size - 4); uint32_t stored_x1764; - stored_x1764 = toku_dtoh32(*(int*)(rb->buf+rb->size-4)); + stored_x1764 = toku_dtoh32(*(int *)(rb->buf + rb->size - 4)); if (calculated_x1764 != stored_x1764) { - r = TOKUDB_BAD_CHECKSUM; //Header useless - fprintf(stderr, "Header checksum failure: calc=0x%08x read=0x%08x\n", calculated_x1764, stored_x1764); + r = TOKUDB_BAD_CHECKSUM; // Header useless + fprintf(stderr, + "Header checksum failure: calc=0x%08x read=0x%08x\n", + calculated_x1764, + stored_x1764); goto exit; } - //Verify byte order + // Verify byte order const void *tmp_byte_order_check; lazy_assert((sizeof toku_byte_order_host) == 8); - rbuf_literal_bytes(rb, &tmp_byte_order_check, 8); //Must not translate byte order + rbuf_literal_bytes( + rb, &tmp_byte_order_check, 8); // Must not translate byte order int64_t byte_order_stored; - byte_order_stored = *(int64_t*)tmp_byte_order_check; + byte_order_stored = *(int64_t *)tmp_byte_order_check; if (byte_order_stored != toku_byte_order_host) { - r = TOKUDB_DICTIONARY_NO_HEADER; //Cannot use dictionary + r = TOKUDB_DICTIONARY_NO_HEADER; // Cannot use dictionary goto exit; } - //Load checkpoint count + // Load checkpoint count *checkpoint_count = rbuf_ulonglong(rb); *checkpoint_lsn = rbuf_LSN(rb); - //Restart at beginning during regular deserialization + // Restart at beginning during regular deserialization rb->ndone = 0; exit: @@ -620,11 +644,7 @@ exit: // Read ft from file into struct. Read both headers and use one. // We want the latest acceptable header whose checkpoint_lsn is no later // than max_acceptable_lsn. 
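// Concretely: header 0 is read from offset 0 and header 1 from
// BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE. A header is acceptable only
// if it deserializes cleanly and its checkpoint_lsn is <= max_acceptable_lsn;
// when both are acceptable, the one with the larger checkpoint_count wins
// (the two counts always differ by exactly one, as the invariants below check).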
-int -toku_deserialize_ft_from(int fd, - LSN max_acceptable_lsn, - FT *ft) -{ +int toku_deserialize_ft_from(int fd, LSN max_acceptable_lsn, FT *ft) { struct rbuf rb_0; struct rbuf rb_1; uint64_t checkpoint_count_0 = 0; @@ -638,13 +658,23 @@ toku_deserialize_ft_from(int fd, int r0, r1, r; toku_off_t header_0_off = 0; - r0 = deserialize_ft_from_fd_into_rbuf(fd, header_0_off, &rb_0, &checkpoint_count_0, &checkpoint_lsn_0, &version_0); + r0 = deserialize_ft_from_fd_into_rbuf(fd, + header_0_off, + &rb_0, + &checkpoint_count_0, + &checkpoint_lsn_0, + &version_0); if (r0 == 0 && checkpoint_lsn_0.lsn <= max_acceptable_lsn.lsn) { h0_acceptable = true; } - toku_off_t header_1_off = block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE; - r1 = deserialize_ft_from_fd_into_rbuf(fd, header_1_off, &rb_1, &checkpoint_count_1, &checkpoint_lsn_1, &version_1); + toku_off_t header_1_off = BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE; + r1 = deserialize_ft_from_fd_into_rbuf(fd, + header_1_off, + &rb_1, + &checkpoint_count_1, + &checkpoint_lsn_1, + &version_1); if (r1 == 0 && checkpoint_lsn_1.lsn <= max_acceptable_lsn.lsn) { h1_acceptable = true; } @@ -655,24 +685,29 @@ toku_deserialize_ft_from(int fd, // We were unable to read either header or at least one is too // new. Certain errors are higher priority than others. Order of // these if/else if is important. - if (r0 == TOKUDB_DICTIONARY_TOO_NEW || r1 == TOKUDB_DICTIONARY_TOO_NEW) { + if (r0 == TOKUDB_DICTIONARY_TOO_NEW || + r1 == TOKUDB_DICTIONARY_TOO_NEW) { r = TOKUDB_DICTIONARY_TOO_NEW; - } else if (r0 == TOKUDB_DICTIONARY_TOO_OLD || r1 == TOKUDB_DICTIONARY_TOO_OLD) { + } else if (r0 == TOKUDB_DICTIONARY_TOO_OLD || + r1 == TOKUDB_DICTIONARY_TOO_OLD) { r = TOKUDB_DICTIONARY_TOO_OLD; } else if (r0 == TOKUDB_BAD_CHECKSUM && r1 == TOKUDB_BAD_CHECKSUM) { fprintf(stderr, "Both header checksums failed.\n"); r = TOKUDB_BAD_CHECKSUM; - } else if (r0 == TOKUDB_DICTIONARY_NO_HEADER || r1 == TOKUDB_DICTIONARY_NO_HEADER) { + } else if (r0 == TOKUDB_DICTIONARY_NO_HEADER || + r1 == TOKUDB_DICTIONARY_NO_HEADER) { r = TOKUDB_DICTIONARY_NO_HEADER; } else { - r = r0 ? r0 : r1; //Arbitrarily report the error from the - //first header, unless it's readable + r = r0 ? r0 : r1; // Arbitrarily report the error from the + // first header, unless it's readable } - // it should not be possible for both headers to be later than the max_acceptable_lsn - invariant(!((r0==0 && checkpoint_lsn_0.lsn > max_acceptable_lsn.lsn) && - (r1==0 && checkpoint_lsn_1.lsn > max_acceptable_lsn.lsn))); - invariant(r!=0); + // it should not be possible for both headers to be later than the + // max_acceptable_lsn + invariant( + !((r0 == 0 && checkpoint_lsn_0.lsn > max_acceptable_lsn.lsn) && + (r1 == 0 && checkpoint_lsn_1.lsn > max_acceptable_lsn.lsn))); + invariant(r != 0); goto exit; } @@ -682,8 +717,7 @@ toku_deserialize_ft_from(int fd, invariant(version_0 >= version_1); rb = &rb_0; version = version_0; - } - else { + } else { invariant(checkpoint_count_1 == checkpoint_count_0 + 1); invariant(version_1 >= version_0); rb = &rb_1; @@ -692,14 +726,18 @@ toku_deserialize_ft_from(int fd, } else if (h0_acceptable) { if (r1 == TOKUDB_BAD_CHECKSUM) { // print something reassuring - fprintf(stderr, "Header 2 checksum failed, but header 1 ok. Proceeding.\n"); + fprintf( + stderr, + "Header 2 checksum failed, but header 1 ok. 
Proceeding.\n"); } rb = &rb_0; version = version_0; } else if (h1_acceptable) { if (r0 == TOKUDB_BAD_CHECKSUM) { // print something reassuring - fprintf(stderr, "Header 1 checksum failed, but header 2 ok. Proceeding.\n"); + fprintf( + stderr, + "Header 1 checksum failed, but header 2 ok. Proceeding.\n"); } rb = &rb_1; version = version_1; @@ -718,15 +756,13 @@ exit: return r; } - -size_t toku_serialize_ft_size (FT_HEADER h) { +size_t toku_serialize_ft_size(FT_HEADER h) { size_t size = serialize_ft_min_size(h->layout_version); - //There is no dynamic data. - lazy_assert(size <= block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE); + // There is no dynamic data. + lazy_assert(size <= BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE); return size; } - void toku_serialize_ft_to_wbuf ( struct wbuf *wbuf, FT_HEADER h, @@ -771,52 +807,60 @@ void toku_serialize_ft_to_wbuf ( } void toku_serialize_ft_to(int fd, FT_HEADER h, block_table *bt, CACHEFILE cf) { - lazy_assert(h->type==FT_CHECKPOINT_INPROGRESS); + lazy_assert(h->type == FT_CHECKPOINT_INPROGRESS); struct wbuf w_translation; int64_t size_translation; int64_t address_translation; // Must serialize translation first, to get address,size for header. - bt->serialize_translation_to_wbuf(fd, &w_translation, - &address_translation, - &size_translation); - assert(size_translation == w_translation.ndone); + bt->serialize_translation_to_wbuf( + fd, &w_translation, &address_translation, &size_translation); + invariant(size_translation == w_translation.ndone); - // the number of bytes available in the buffer is 0 mod 512, and those last bytes are all initialized. - assert(w_translation.size % 512 == 0); + // the number of bytes available in the buffer is 0 mod 512, and those last + // bytes are all initialized. + invariant(w_translation.size % 512 == 0); struct wbuf w_main; - size_t size_main = toku_serialize_ft_size(h); + size_t size_main = toku_serialize_ft_size(h); size_t size_main_aligned = roundup_to_multiple(512, size_main); - assert(size_main_aligned<block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE); + invariant(size_main_aligned < + BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE); char *XMALLOC_N_ALIGNED(512, size_main_aligned, mainbuf); - for (size_t i=size_main; i<size_main_aligned; i++) mainbuf[i]=0; // initialize the end of the buffer with zeros + for (size_t i = size_main; i < size_main_aligned; i++) + mainbuf[i] = 0; // initialize the end of the buffer with zeros wbuf_init(&w_main, mainbuf, size_main); - toku_serialize_ft_to_wbuf(&w_main, h, address_translation, size_translation); + toku_serialize_ft_to_wbuf( + &w_main, h, address_translation, size_translation); lazy_assert(w_main.ndone == size_main); // Actually write translation table - // This write is guaranteed to read good data at the end of the buffer, since the + // This write is guaranteed to read good data at the end of the buffer, + // since the // w_translation.buf is padded with zeros to a 512-byte boundary. - toku_os_full_pwrite(fd, w_translation.buf, roundup_to_multiple(512, size_translation), address_translation); - - //Everything but the header MUST be on disk before header starts. - //Otherwise we will think the header is good and some blocks might not - //yet be on disk. - //If the header has a cachefile we need to do cachefile fsync (to - //prevent crash if we redirected to dev null) - //If there is no cachefile we still need to do an fsync. 
+ toku_os_full_pwrite(fd, + w_translation.buf, + roundup_to_multiple(512, size_translation), + address_translation); + + // Everything but the header MUST be on disk before header starts. + // Otherwise we will think the header is good and some blocks might not + // yet be on disk. + // If the header has a cachefile we need to do cachefile fsync (to + // prevent crash if we redirected to dev null) + // If there is no cachefile we still need to do an fsync. if (cf) { toku_cachefile_fsync(cf); - } - else { + } else { toku_file_fsync(fd); } - //Alternate writing header to two locations: + // Alternate writing header to two locations: // Beginning (0) or BLOCK_ALLOCATOR_HEADER_RESERVE toku_off_t main_offset; - main_offset = (h->checkpoint_count & 0x1) ? 0 : block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE; + main_offset = (h->checkpoint_count & 0x1) + ? 0 + : BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE; toku_os_full_pwrite(fd, w_main.buf, size_main_aligned, main_offset); toku_free(w_main.buf); toku_free(w_translation.buf); diff --git a/storage/tokudb/PerconaFT/ft/serialize/ft_node-serialize.cc b/storage/tokudb/PerconaFT/ft/serialize/ft_node-serialize.cc index c4f4886b6a0..5914f8a1050 100644 --- a/storage/tokudb/PerconaFT/ft/serialize/ft_node-serialize.cc +++ b/storage/tokudb/PerconaFT/ft/serialize/ft_node-serialize.cc @@ -99,13 +99,11 @@ void toku_ft_serialize_layer_init(void) { num_cores = toku_os_get_number_active_processors(); int r = toku_thread_pool_create(&ft_pool, num_cores); lazy_assert_zero(r); - block_allocator::maybe_initialize_trace(); toku_serialize_in_parallel = false; } void toku_ft_serialize_layer_destroy(void) { toku_thread_pool_destroy(&ft_pool); - block_allocator::maybe_close_trace(); } enum { FILE_CHANGE_INCREMENT = (16 << 20) }; @@ -773,19 +771,23 @@ int toku_serialize_ftnode_to_memory(FTNODE node, return 0; } -int -toku_serialize_ftnode_to (int fd, BLOCKNUM blocknum, FTNODE node, FTNODE_DISK_DATA* ndd, bool do_rebalancing, FT ft, bool for_checkpoint) { - +int toku_serialize_ftnode_to(int fd, + BLOCKNUM blocknum, + FTNODE node, + FTNODE_DISK_DATA *ndd, + bool do_rebalancing, + FT ft, + bool for_checkpoint) { size_t n_to_write; size_t n_uncompressed_bytes; char *compressed_buf = nullptr; - // because toku_serialize_ftnode_to is only called for + // because toku_serialize_ftnode_to is only called for // in toku_ftnode_flush_callback, we pass false // for in_parallel. 
The reasoning is that when we write - // nodes to disk via toku_ftnode_flush_callback, we + // nodes to disk via toku_ftnode_flush_callback, we // assume that it is being done on a non-critical - // background thread (probably for checkpointing), and therefore + // background thread (probably for checkpointing), and therefore // should not hog CPU, // // Should the above facts change, we may want to revisit @@ -802,32 +804,32 @@ toku_serialize_ftnode_to (int fd, BLOCKNUM blocknum, FTNODE node, FTNODE_DISK_DA toku_unsafe_fetch(&toku_serialize_in_parallel), &n_to_write, &n_uncompressed_bytes, - &compressed_buf - ); + &compressed_buf); if (r != 0) { return r; } - // If the node has never been written, then write the whole buffer, including the zeros - invariant(blocknum.b>=0); + // If the node has never been written, then write the whole buffer, + // including the zeros + invariant(blocknum.b >= 0); DISKOFF offset; // Dirties the ft - ft->blocktable.realloc_on_disk(blocknum, n_to_write, &offset, - ft, fd, for_checkpoint, - // Allocations for nodes high in the tree are considered 'hot', - // as they are likely to move again in the next checkpoint. - node->height); + ft->blocktable.realloc_on_disk( + blocknum, n_to_write, &offset, ft, fd, for_checkpoint); tokutime_t t0 = toku_time_now(); toku_os_full_pwrite(fd, compressed_buf, n_to_write, offset); tokutime_t t1 = toku_time_now(); tokutime_t io_time = t1 - t0; - toku_ft_status_update_flush_reason(node, n_uncompressed_bytes, n_to_write, io_time, for_checkpoint); + toku_ft_status_update_flush_reason( + node, n_uncompressed_bytes, n_to_write, io_time, for_checkpoint); toku_free(compressed_buf); - node->dirty = 0; // See #1957. Must set the node to be clean after serializing it so that it doesn't get written again on the next checkpoint or eviction. + node->dirty = 0; // See #1957. Must set the node to be clean after + // serializing it so that it doesn't get written again on + // the next checkpoint or eviction. return 0; } @@ -994,6 +996,7 @@ BASEMENTNODE toku_clone_bn(BASEMENTNODE orig_bn) { bn->seqinsert = orig_bn->seqinsert; bn->stale_ancestor_messages_applied = orig_bn->stale_ancestor_messages_applied; bn->stat64_delta = orig_bn->stat64_delta; + bn->logical_rows_delta = orig_bn->logical_rows_delta; bn->data_buffer.clone(&orig_bn->data_buffer); return bn; } @@ -1004,6 +1007,7 @@ BASEMENTNODE toku_create_empty_bn_no_buffer(void) { bn->seqinsert = 0; bn->stale_ancestor_messages_applied = false; bn->stat64_delta = ZEROSTATS; + bn->logical_rows_delta = 0; bn->data_buffer.init_zero(); return bn; } @@ -1897,7 +1901,7 @@ read_and_decompress_block_from_fd_into_rbuf(int fd, BLOCKNUM blocknum, /* out */ int *layout_version_p); // This function upgrades a version 14 or 13 ftnode to the current -// verison. NOTE: This code assumes the first field of the rbuf has +// version. NOTE: This code assumes the first field of the rbuf has // already been read from the buffer (namely the layout_version of the // ftnode.) 
static int @@ -2488,9 +2492,12 @@ toku_serialize_rollback_log_to_memory_uncompressed(ROLLBACK_LOG_NODE log, SERIAL serialized->blocknum = log->blocknum; } -int -toku_serialize_rollback_log_to (int fd, ROLLBACK_LOG_NODE log, SERIALIZED_ROLLBACK_LOG_NODE serialized_log, bool is_serialized, - FT ft, bool for_checkpoint) { +int toku_serialize_rollback_log_to(int fd, + ROLLBACK_LOG_NODE log, + SERIALIZED_ROLLBACK_LOG_NODE serialized_log, + bool is_serialized, + FT ft, + bool for_checkpoint) { size_t n_to_write; char *compressed_buf; struct serialized_rollback_log_node serialized_local; @@ -2511,21 +2518,21 @@ toku_serialize_rollback_log_to (int fd, ROLLBACK_LOG_NODE log, SERIALIZED_ROLLBA serialized_log->n_sub_blocks, serialized_log->sub_block, ft->h->compression_method, - &n_to_write, &compressed_buf); + &n_to_write, + &compressed_buf); // Dirties the ft DISKOFF offset; - ft->blocktable.realloc_on_disk(blocknum, n_to_write, &offset, - ft, fd, for_checkpoint, - // We consider rollback log flushing the hottest possible allocation, - // since rollback logs are short-lived compared to FT nodes. - INT_MAX); + ft->blocktable.realloc_on_disk( + blocknum, n_to_write, &offset, ft, fd, for_checkpoint); toku_os_full_pwrite(fd, compressed_buf, n_to_write, offset); toku_free(compressed_buf); if (!is_serialized) { toku_static_serialized_rollback_log_destroy(&serialized_local); - log->dirty = 0; // See #1957. Must set the node to be clean after serializing it so that it doesn't get written again on the next checkpoint or eviction. + log->dirty = 0; // See #1957. Must set the node to be clean after + // serializing it so that it doesn't get written again + // on the next checkpoint or eviction. } return 0; } @@ -2704,7 +2711,7 @@ exit: } static int decompress_from_raw_block_into_rbuf_versioned(uint32_t version, uint8_t *raw_block, size_t raw_block_size, struct rbuf *rb, BLOCKNUM blocknum) { - // This function exists solely to accomodate future changes in compression. + // This function exists solely to accommodate future changes in compression. int r = 0; if ((version == FT_LAYOUT_VERSION_13 || version == FT_LAYOUT_VERSION_14) || (FT_LAYOUT_VERSION_25 <= version && version <= FT_LAYOUT_VERSION_27) || diff --git a/storage/tokudb/PerconaFT/ft/serialize/rbtree_mhs.cc b/storage/tokudb/PerconaFT/ft/serialize/rbtree_mhs.cc new file mode 100644 index 00000000000..922850fb3e0 --- /dev/null +++ b/storage/tokudb/PerconaFT/ft/serialize/rbtree_mhs.cc @@ -0,0 +1,833 @@ +/*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILIT or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. 
+ + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. +======= */ + +#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include "ft/serialize/rbtree_mhs.h" +#include "portability/toku_assert.h" +#include "portability/toku_portability.h" +#include <algorithm> + +namespace MhsRbTree { + + Tree::Tree() : _root(NULL), _align(1) {} + + Tree::Tree(uint64_t align) : _root(NULL), _align(align) {} + + Tree::~Tree() { Destroy(); } + + void Tree::PreOrder(Node *tree) const { + if (tree != NULL) { + fprintf(stderr, "%" PRIu64 " ", rbn_offset(tree).ToInt()); + PreOrder(tree->_left); + PreOrder(tree->_right); + } + } + + void Tree::PreOrder() { PreOrder(_root); } + + void Tree::InOrder(Node *tree) const { + if (tree != NULL) { + InOrder(tree->_left); + fprintf(stderr, "%" PRIu64 " ", rbn_offset(tree).ToInt()); + InOrder(tree->_right); + } + } + + // yeah, i only care about in order visitor. -Jun + void Tree::InOrderVisitor(Node *tree, + void (*f)(void *, Node *, uint64_t), + void *extra, + uint64_t depth) { + if (tree != NULL) { + InOrderVisitor(tree->_left, f, extra, depth + 1); + f(extra, tree, depth); + InOrderVisitor(tree->_right, f, extra, depth + 1); + } + } + + void Tree::InOrderVisitor(void (*f)(void *, Node *, uint64_t), + void *extra) { + InOrderVisitor(_root, f, extra, 0); + } + + void Tree::InOrder() { InOrder(_root); } + + void Tree::PostOrder(Node *tree) const { + if (tree != NULL) { + PostOrder(tree->_left); + PostOrder(tree->_right); + fprintf(stderr, "%" PRIu64 " ", rbn_offset(tree).ToInt()); + } + } + + void Tree::PostOrder() { PostOrder(_root); } + + Node *Tree::SearchByOffset(uint64_t offset) { + Node *x = _root; + while ((x != NULL) && (rbn_offset(x).ToInt() != offset)) { + if (offset < rbn_offset(x).ToInt()) + x = x->_left; + else + x = x->_right; + } + + return x; + } + + // mostly for testing + Node *Tree::SearchFirstFitBySize(uint64_t size) { + if (EffectiveSize(_root) < size && rbn_left_mhs(_root) < size && + rbn_right_mhs(_root) < size) { + return nullptr; + } else { + return SearchFirstFitBySizeHelper(_root, size); + } + } + + Node *Tree::SearchFirstFitBySizeHelper(Node *x, uint64_t size) { + if (EffectiveSize(x) >= size) { + // only possible to go left + if (rbn_left_mhs(x) >= size) + return SearchFirstFitBySizeHelper(x->_left, size); + else + return x; + } + if (rbn_left_mhs(x) >= size) + return SearchFirstFitBySizeHelper(x->_left, size); + + if (rbn_right_mhs(x) >= size) + return SearchFirstFitBySizeHelper(x->_right, size); + + // this is an invalid state + Dump(); + ValidateBalance(); + ValidateMhs(); + invariant(0); + return NULL; + } + + Node *Tree::MinNode(Node *tree) { + if (tree == NULL) + return NULL; + + while (tree->_left != NULL) + tree = tree->_left; + return tree; + } + + Node *Tree::MinNode() { return MinNode(_root); } + + Node *Tree::MaxNode(Node *tree) { + if (tree == NULL) + return NULL; + + while (tree->_right != NULL) + tree = tree->_right; + return tree; + } + + Node *Tree::MaxNode() { return MaxNode(_root); } + + Node *Tree::SuccessorHelper(Node *y, Node *x) { + while ((y != NULL) && (x == y->_right)) { + x = y; + y = y->_parent; + } + return y; + } + Node *Tree::Successor(Node *x) 
{ + if (x->_right != NULL) + return MinNode(x->_right); + + Node *y = x->_parent; + return SuccessorHelper(y, x); + } + + Node *Tree::PredecessorHelper(Node *y, Node *x) { + while ((y != NULL) && (x == y->_left)) { + x = y; + y = y->_parent; + } + + return y; + } + Node *Tree::Predecessor(Node *x) { + if (x->_left != NULL) + return MaxNode(x->_left); + + Node *y = x->_parent; + return SuccessorHelper(y, x); + } + + /* + * px px + * / / + * x y + * / \ --(left rotation)--> / \ # + * lx y x ry + * / \ / \ + * ly ry lx ly + * max_hole_size updates are pretty local + */ + + void Tree::LeftRotate(Node *&root, Node *x) { + Node *y = x->_right; + + x->_right = y->_left; + rbn_right_mhs(x) = rbn_left_mhs(y); + + if (y->_left != NULL) + y->_left->_parent = x; + + y->_parent = x->_parent; + + if (x->_parent == NULL) { + root = y; + } else { + if (x->_parent->_left == x) { + x->_parent->_left = y; + } else { + x->_parent->_right = y; + } + } + y->_left = x; + rbn_left_mhs(y) = mhs_of_subtree(x); + + x->_parent = y; + } + + /* py py + * / / + * y x + * / \ --(right rotate)--> / \ # + * x ry lx y + * / \ / \ # + * lx rx rx ry + * + */ + + void Tree::RightRotate(Node *&root, Node *y) { + Node *x = y->_left; + + y->_left = x->_right; + rbn_left_mhs(y) = rbn_right_mhs(x); + + if (x->_right != NULL) + x->_right->_parent = y; + + x->_parent = y->_parent; + + if (y->_parent == NULL) { + root = x; + } else { + if (y == y->_parent->_right) + y->_parent->_right = x; + else + y->_parent->_left = x; + } + + x->_right = y; + rbn_right_mhs(x) = mhs_of_subtree(y); + y->_parent = x; + } + + // walking from this node up to update the mhs info + // whenver there is change on left/right mhs or size we should recalculate. + // prerequisit: the children of the node are mhs up-to-date. + void Tree::RecalculateMhs(Node *node) { + uint64_t *p_node_mhs = 0; + Node *parent = node->_parent; + + if (!parent) + return; + + uint64_t max_mhs = mhs_of_subtree(node); + if (node == parent->_left) { + p_node_mhs = &rbn_left_mhs(parent); + } else if (node == parent->_right) { + p_node_mhs = &rbn_right_mhs(parent); + } else { + return; + } + if (*p_node_mhs != max_mhs) { + *p_node_mhs = max_mhs; + RecalculateMhs(parent); + } + } + + void Tree::IsNewNodeMergable(Node *pred, + Node *succ, + Node::BlockPair pair, + bool *left_merge, + bool *right_merge) { + if (pred) { + OUUInt64 end_of_pred = rbn_size(pred) + rbn_offset(pred); + if (end_of_pred < pair._offset) + *left_merge = false; + else { + invariant(end_of_pred == pair._offset); + *left_merge = true; + } + } + if (succ) { + OUUInt64 begin_of_succ = rbn_offset(succ); + OUUInt64 end_of_node = pair._offset + pair._size; + if (end_of_node < begin_of_succ) { + *right_merge = false; + } else { + invariant(end_of_node == begin_of_succ); + *right_merge = true; + } + } + } + + void Tree::AbsorbNewNode(Node *pred, + Node *succ, + Node::BlockPair pair, + bool left_merge, + bool right_merge, + bool is_right_child) { + invariant(left_merge || right_merge); + if (left_merge && right_merge) { + // merge to the succ + if (!is_right_child) { + rbn_size(succ) += pair._size; + rbn_offset(succ) = pair._offset; + // merge to the pred + rbn_size(pred) += rbn_size(succ); + // to keep the invariant of the tree -no overlapping holes + rbn_offset(succ) += rbn_size(succ); + rbn_size(succ) = 0; + RecalculateMhs(succ); + RecalculateMhs(pred); + // pred dominates succ. this is going to + // update the pred labels separately. 
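                // Example: pred=(0,4), new pair=(4,2), succ=(6,3): succ first
                // grows to (4,5), pred absorbs it to become (0,9), and succ is
                // left as the empty hole (9,0) so it can be removed below
                // without ever creating an overlap.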
+ // remove succ + RawRemove(_root, succ); + } else { + rbn_size(pred) += pair._size; + rbn_offset(succ) = rbn_offset(pred); + rbn_size(succ) += rbn_size(pred); + rbn_offset(pred) += rbn_size(pred); + rbn_size(pred) = 0; + RecalculateMhs(pred); + RecalculateMhs(succ); + // now remove pred + RawRemove(_root, pred); + } + } else if (left_merge) { + rbn_size(pred) += pair._size; + RecalculateMhs(pred); + } else if (right_merge) { + rbn_offset(succ) -= pair._size; + rbn_size(succ) += pair._size; + RecalculateMhs(succ); + } + } + // this is the most tedious part, but not complicated: + // 1.find where to insert the pair + // 2.if the pred and succ can merge with the pair. merge with them. either + // pred + // or succ can be removed. + // 3. if only left-mergable or right-mergeable, just merge + // 4. non-mergable case. insert the node and run the fixup. + + int Tree::Insert(Node *&root, Node::BlockPair pair) { + Node *x = _root; + Node *y = NULL; + bool left_merge = false; + bool right_merge = false; + Node *node = NULL; + + while (x != NULL) { + y = x; + if (pair._offset < rbn_key(x)) + x = x->_left; + else + x = x->_right; + } + + // we found where to insert, lets find out the pred and succ for + // possible + // merges. + // node->parent = y; + Node *pred, *succ; + if (y != NULL) { + if (pair._offset < rbn_key(y)) { + // as the left child + pred = PredecessorHelper(y->_parent, y); + succ = y; + IsNewNodeMergable(pred, succ, pair, &left_merge, &right_merge); + if (left_merge || right_merge) { + AbsorbNewNode( + pred, succ, pair, left_merge, right_merge, false); + } else { + // construct the node + Node::Pair mhsp {0, 0}; + node = + new Node(EColor::BLACK, pair, mhsp, nullptr, nullptr, nullptr); + if (!node) + return -1; + y->_left = node; + node->_parent = y; + RecalculateMhs(node); + } + + } else { + // as the right child + pred = y; + succ = SuccessorHelper(y->_parent, y); + IsNewNodeMergable(pred, succ, pair, &left_merge, &right_merge); + if (left_merge || right_merge) { + AbsorbNewNode( + pred, succ, pair, left_merge, right_merge, true); + } else { + // construct the node + Node::Pair mhsp {0, 0}; + node = + new Node(EColor::BLACK, pair, mhsp, nullptr, nullptr, nullptr); + if (!node) + return -1; + y->_right = node; + node->_parent = y; + RecalculateMhs(node); + } + } + } else { + Node::Pair mhsp {0, 0}; + node = new Node(EColor::BLACK, pair, mhsp, nullptr, nullptr, nullptr); + if (!node) + return -1; + root = node; + } + if (!left_merge && !right_merge) { + invariant_notnull(node); + node->_color = EColor::RED; + return InsertFixup(root, node); + } + return 0; + } + + int Tree::InsertFixup(Node *&root, Node *node) { + Node *parent, *gparent; + while ((parent = rbn_parent(node)) && rbn_is_red(parent)) { + gparent = rbn_parent(parent); + if (parent == gparent->_left) { + { + Node *uncle = gparent->_right; + if (uncle && rbn_is_red(uncle)) { + rbn_set_black(uncle); + rbn_set_black(parent); + rbn_set_red(gparent); + node = gparent; + continue; + } + } + + if (parent->_right == node) { + Node *tmp; + LeftRotate(root, parent); + tmp = parent; + parent = node; + node = tmp; + } + + rbn_set_black(parent); + rbn_set_red(gparent); + RightRotate(root, gparent); + } else { + { + Node *uncle = gparent->_left; + if (uncle && rbn_is_red(uncle)) { + rbn_set_black(uncle); + rbn_set_black(parent); + rbn_set_red(gparent); + node = gparent; + continue; + } + } + + if (parent->_left == node) { + Node *tmp; + RightRotate(root, parent); + tmp = parent; + parent = node; + node = tmp; + } + 
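                // Mirror case: parent is a right child with a black uncle; any
                // inner-child zig was rotated away above, so recoloring plus a
                // left rotation around the grandparent restores the red-black
                // invariants (LeftRotate also refreshes the mhs labels).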
rbn_set_black(parent); + rbn_set_red(gparent); + LeftRotate(root, gparent); + } + } + rbn_set_black(root); + return 0; + } + + int Tree::Insert(Node::BlockPair pair) { return Insert(_root, pair); } + + uint64_t Tree::Remove(size_t size) { + Node *node = SearchFirstFitBySize(size); + return Remove(_root, node, size); + } + + void Tree::RawRemove(Node *&root, Node *node) { + Node *child, *parent; + EColor color; + + if ((node->_left != NULL) && (node->_right != NULL)) { + Node *replace = node; + replace = replace->_right; + while (replace->_left != NULL) + replace = replace->_left; + + if (rbn_parent(node)) { + if (rbn_parent(node)->_left == node) + rbn_parent(node)->_left = replace; + else + rbn_parent(node)->_right = replace; + } else { + root = replace; + } + child = replace->_right; + parent = rbn_parent(replace); + color = rbn_color(replace); + + if (parent == node) { + parent = replace; + } else { + if (child) + rbn_parent(child) = parent; + + parent->_left = child; + rbn_left_mhs(parent) = rbn_right_mhs(replace); + RecalculateMhs(parent); + replace->_right = node->_right; + rbn_set_parent(node->_right, replace); + rbn_right_mhs(replace) = rbn_right_mhs(node); + } + + replace->_parent = node->_parent; + replace->_color = node->_color; + replace->_left = node->_left; + rbn_left_mhs(replace) = rbn_left_mhs(node); + node->_left->_parent = replace; + RecalculateMhs(replace); + if (color == EColor::BLACK) + RawRemoveFixup(root, child, parent); + delete node; + return; + } + + if (node->_left != NULL) + child = node->_left; + else + child = node->_right; + + parent = node->_parent; + color = node->_color; + + if (child) + child->_parent = parent; + + if (parent) { + if (parent->_left == node) { + parent->_left = child; + rbn_left_mhs(parent) = child ? mhs_of_subtree(child) : 0; + } else { + parent->_right = child; + rbn_right_mhs(parent) = child ? mhs_of_subtree(child) : 0; + } + RecalculateMhs(parent); + } else + root = child; + if (color == EColor::BLACK) + RawRemoveFixup(root, child, parent); + delete node; + } + + void Tree::RawRemove(uint64_t offset) { + Node *node = SearchByOffset(offset); + RawRemove(_root, node); + } + static inline uint64_t align(uint64_t value, uint64_t ba_alignment) { + return ((value + ba_alignment - 1) / ba_alignment) * ba_alignment; + } + uint64_t Tree::Remove(Node *&root, Node *node, size_t size) { + OUUInt64 n_offset = rbn_offset(node); + OUUInt64 n_size = rbn_size(node); + OUUInt64 answer_offset(align(rbn_offset(node).ToInt(), _align)); + + invariant((answer_offset + size) <= (n_offset + n_size)); + if (answer_offset == n_offset) { + rbn_offset(node) += size; + rbn_size(node) -= size; + RecalculateMhs(node); + if (rbn_size(node) == 0) { + RawRemove(root, node); + } + + } else { + if (answer_offset + size == n_offset + n_size) { + rbn_size(node) -= size; + RecalculateMhs(node); + } else { + // well, cut in the middle... 
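                // Example with _align=4096: hole=(4000,10000) and size=4096
                // give answer_offset=4096; the node keeps the head (4000,96)
                // and the tail (8192,5808) is re-inserted as its own hole.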
+ rbn_size(node) = answer_offset - n_offset; + RecalculateMhs(node); + Insert(_root, + {(answer_offset + size), + (n_offset + n_size) - (answer_offset + size)}); + } + } + return answer_offset.ToInt(); + } + + void Tree::RawRemoveFixup(Node *&root, Node *node, Node *parent) { + Node *other; + while ((!node || rbn_is_black(node)) && node != root) { + if (parent->_left == node) { + other = parent->_right; + if (rbn_is_red(other)) { + // Case 1: the brother of X, w, is read + rbn_set_black(other); + rbn_set_red(parent); + LeftRotate(root, parent); + other = parent->_right; + } + if ((!other->_left || rbn_is_black(other->_left)) && + (!other->_right || rbn_is_black(other->_right))) { + // Case 2: w is black and both of w's children are black + rbn_set_red(other); + node = parent; + parent = rbn_parent(node); + } else { + if (!other->_right || rbn_is_black(other->_right)) { + // Case 3: w is black and left child of w is red but + // right + // child is black + rbn_set_black(other->_left); + rbn_set_red(other); + RightRotate(root, other); + other = parent->_right; + } + // Case 4: w is black and right child of w is red, + // regardless of + // left child's color + rbn_set_color(other, rbn_color(parent)); + rbn_set_black(parent); + rbn_set_black(other->_right); + LeftRotate(root, parent); + node = root; + break; + } + } else { + other = parent->_left; + if (rbn_is_red(other)) { + // Case 1: w is red + rbn_set_black(other); + rbn_set_red(parent); + RightRotate(root, parent); + other = parent->_left; + } + if ((!other->_left || rbn_is_black(other->_left)) && + (!other->_right || rbn_is_black(other->_right))) { + // Case 2: w is black and both children are black + rbn_set_red(other); + node = parent; + parent = rbn_parent(node); + } else { + if (!other->_left || rbn_is_black(other->_left)) { + // Case 3: w is black and left child of w is red whereas + // right child is black + rbn_set_black(other->_right); + rbn_set_red(other); + LeftRotate(root, other); + other = parent->_left; + } + // Case 4:w is black and right child of w is red, regardless + // of + // the left child's color + rbn_set_color(other, rbn_color(parent)); + rbn_set_black(parent); + rbn_set_black(other->_left); + RightRotate(root, parent); + node = root; + break; + } + } + } + if (node) + rbn_set_black(node); + } + + void Tree::Destroy(Node *&tree) { + if (tree == NULL) + return; + + if (tree->_left != NULL) + Destroy(tree->_left); + if (tree->_right != NULL) + Destroy(tree->_right); + + delete tree; + tree = NULL; + } + + void Tree::Destroy() { Destroy(_root); } + + void Tree::Dump(Node *tree, Node::BlockPair pair, EDirection dir) { + if (tree != NULL) { + if (dir == EDirection::NONE) + fprintf(stderr, + "(%" PRIu64 ",%" PRIu64 ", mhs:(%" PRIu64 ",%" PRIu64 + "))(B) is root\n", + rbn_offset(tree).ToInt(), + rbn_size(tree).ToInt(), + rbn_left_mhs(tree), + rbn_right_mhs(tree)); + else + fprintf(stderr, + "(%" PRIu64 ",%" PRIu64 ",mhs:(%" PRIu64 ",%" PRIu64 + "))(%c) is %" PRIu64 "'s %s\n", + rbn_offset(tree).ToInt(), + rbn_size(tree).ToInt(), + rbn_left_mhs(tree), + rbn_right_mhs(tree), + rbn_is_red(tree) ? 'R' : 'B', + pair._offset.ToInt(), + dir == EDirection::RIGHT ? 
"right child" : "left child"); + + Dump(tree->_left, tree->_hole, EDirection::LEFT); + Dump(tree->_right, tree->_hole, EDirection::RIGHT); + } + } + + uint64_t Tree::EffectiveSize(Node *node) { + OUUInt64 offset = rbn_offset(node); + OUUInt64 size = rbn_size(node); + OUUInt64 end = offset + size; + OUUInt64 aligned_offset(align(offset.ToInt(), _align)); + if (aligned_offset > end) { + return 0; + } + return (end - aligned_offset).ToInt(); + } + + void Tree::Dump() { + if (_root != NULL) + Dump(_root, _root->_hole, (EDirection)0); + } + + static void vis_bal_f(void *extra, Node *node, uint64_t depth) { + uint64_t **p = (uint64_t **)extra; + uint64_t min = *p[0]; + uint64_t max = *p[1]; + if (node->_left) { + Node *left = node->_left; + invariant(node == left->_parent); + } + + if (node->_right) { + Node *right = node->_right; + invariant(node == right->_parent); + } + + if (!node->_left || !node->_right) { + if (min > depth) { + *p[0] = depth; + } else if (max < depth) { + *p[1] = depth; + } + } + } + + void Tree::ValidateBalance() { + uint64_t min_depth = 0xffffffffffffffff; + uint64_t max_depth = 0; + if (!_root) { + return; + } + uint64_t *p[2] = {&min_depth, &max_depth}; + InOrderVisitor(vis_bal_f, (void *)p); + invariant((min_depth + 1) * 2 >= max_depth + 1); + } + + static void vis_cmp_f(void *extra, Node *node, uint64_t UU(depth)) { + Node::BlockPair **p = (Node::BlockPair **)extra; + + invariant_notnull(*p); + invariant((*p)->_offset == node->_hole._offset); + + *p = *p + 1; + } + + // validate the input pairs matches with sorted pairs + void Tree::ValidateInOrder(Node::BlockPair *pairs) { + InOrderVisitor(vis_cmp_f, &pairs); + } + + uint64_t Tree::ValidateMhs(Node *node) { + if (!node) + return 0; + else { + uint64_t mhs_left = ValidateMhs(node->_left); + uint64_t mhs_right = ValidateMhs(node->_right); + if (mhs_left != rbn_left_mhs(node)) { + printf("assert failure: mhs_left = %" PRIu64 "\n", mhs_left); + Dump(node, node->_hole, (EDirection)0); + } + invariant(mhs_left == rbn_left_mhs(node)); + + if (mhs_right != rbn_right_mhs(node)) { + printf("assert failure: mhs_right = %" PRIu64 "\n", mhs_right); + Dump(node, node->_hole, (EDirection)0); + } + invariant(mhs_right == rbn_right_mhs(node)); + return std::max(EffectiveSize(node), std::max(mhs_left, mhs_right)); + } + } + + void Tree::ValidateMhs() { + if (!_root) + return; + uint64_t mhs_left = ValidateMhs(_root->_left); + uint64_t mhs_right = ValidateMhs(_root->_right); + invariant(mhs_left == rbn_left_mhs(_root)); + invariant(mhs_right == rbn_right_mhs(_root)); + } + +} // namespace MhsRbTree diff --git a/storage/tokudb/PerconaFT/ft/serialize/rbtree_mhs.h b/storage/tokudb/PerconaFT/ft/serialize/rbtree_mhs.h new file mode 100644 index 00000000000..92f1e278e1a --- /dev/null +++ b/storage/tokudb/PerconaFT/ft/serialize/rbtree_mhs.h @@ -0,0 +1,351 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. +======= */ + +#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#pragma once + +#include <db.h> + +#include "portability/toku_pthread.h" +#include "portability/toku_stdint.h" +#include "portability/toku_stdlib.h" + +// RBTree(Red-black tree) with max hole sizes for subtrees. + +// This is a tentative data struct to improve the block allocation time +// complexity from the linear time to the log time. Please be noted this DS only +// supports first-fit for now. It is actually easier to do it with +// best-fit.(just +// sort by size). + +// RBTree is a classic data struct with O(log(n)) for insertion, deletion and +// search. Many years have seen its efficiency. + +// a *hole* is the representation of an available BlockPair for allocation. +// defined as (start_address,size) or (offset, size) interchangably. + +// each node has a *label* to indicate a pair of the max hole sizes for its +// subtree. + +// We are implementing a RBTree with max hole sizes for subtree. It is a red +// black tree that is sorted by the start_address but also labeld with the max +// hole sizes of the subtrees. + +// [(6,3)] -> [(offset, size)], the hole +// [{2,5}] -> [{mhs_of_left, mhs_of_right}], the label +/* / \ */ +// [(0, 1)] [(10, 5)] +// [{0, 2}] [{0, 0}] +/* \ */ +// [(3, 2)] +// [{0, 0}] +// request of allocation size=2 goes from root to [(3,2)]. + +// above example shows a simplified RBTree_max_holes. +// it is easier to tell the search time is O(log(n)) as we can make a decision +// on each descent until we get to the target. + +// the only question is if we can keep the maintenance cost low -- and i think +// it is not a problem becoz an insertion/deletion is only going to update the +// max_hole_sizes of the nodes along the path from the root to the node to be +// deleted/inserted. The path can be cached and search is anyway O(log(n)). + +// unlike the typical rbtree, Tree has to handle the inserts and deletes +// with more care: an allocation that triggers the delete might leave some +// unused space which we can simply update the start_addr and size without +// worrying overlapping. An free might not only mean the insertion but also +// *merging* with the adjacent holes. + +namespace MhsRbTree { + +#define offset_t uint64_t + enum class EColor { RED, BLACK }; + enum class EDirection { NONE = 0, LEFT, RIGHT }; + + // I am a bit tired of fixing overflow/underflow, just quickly craft some + // int + // class that has an infinity-like max value and prevents overflow and + // underflow. If you got a file offset larger than MHS_MAX_VAL, it is not + // a problem here. 
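To make the design comment above concrete, here is a minimal, hypothetical sketch of how the per-node max-hole-size labels drive a first-fit descent; it is not code from the patch, and the MiniNode and first_fit names are illustrative. The idea: prefer the left subtree whenever its label promises a big-enough hole (the tree is ordered by offset, so left means lower offsets), otherwise take the current hole, otherwise descend right. The patch's OUUInt64 helper class follows just below.

    #include <cstdint>

    // Illustrative node: a free hole plus the max hole size in each subtree.
    struct MiniNode {
        uint64_t offset, size;        // the hole this node represents
        uint64_t left_mhs, right_mhs; // max hole size in left/right subtree
        MiniNode *left, *right;
    };

    // First-fit search in O(height): the labels tell us, without visiting a
    // subtree, whether it can possibly satisfy the request.
    static MiniNode *first_fit(MiniNode *n, uint64_t want) {
        if (n == nullptr)
            return nullptr;
        if (n->left_mhs >= want)             // a big-enough hole exists to the left,
            return first_fit(n->left, want); // at a lower offset: prefer it
        if (n->size >= want)                 // otherwise this hole itself may do
            return n;
        if (n->right_mhs >= want)            // otherwise the only hope is to the right
            return first_fit(n->right, want);
        return nullptr;                      // no hole of that size in this subtree
    }

On the example tree in the comment above, a size=2 request descends root (6,3), then (0,1), then (3,2), exactly as described there. The patch's own SearchFirstFitBySize presumably follows this shape, comparing against EffectiveSize, the hole size that remains after alignment, rather than the raw size.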
:-/ - JYM + class OUUInt64 { + public: + static const uint64_t MHS_MAX_VAL = 0xffffffffffffffff; + OUUInt64() : _value(0) {} + OUUInt64(uint64_t s) : _value(s) {} + bool operator<(const OUUInt64 &r) const { + invariant(!(_value == MHS_MAX_VAL && r.ToInt() == MHS_MAX_VAL)); + return _value < r.ToInt(); + } + bool operator>(const OUUInt64 &r) const { + invariant(!(_value == MHS_MAX_VAL && r.ToInt() == MHS_MAX_VAL)); + return _value > r.ToInt(); + } + bool operator<=(const OUUInt64 &r) const { + invariant(!(_value == MHS_MAX_VAL && r.ToInt() == MHS_MAX_VAL)); + return _value <= r.ToInt(); + } + bool operator>=(const OUUInt64 &r) const { + invariant(!(_value == MHS_MAX_VAL && r.ToInt() == MHS_MAX_VAL)); + return _value >= r.ToInt(); + } + OUUInt64 operator+(const OUUInt64 &r) const { + if (_value == MHS_MAX_VAL || r.ToInt() == MHS_MAX_VAL) { + OUUInt64 tmp(MHS_MAX_VAL); + return tmp; + } else { + // detecting overflow + invariant((MHS_MAX_VAL - _value) >= r.ToInt()); + uint64_t plus = _value + r.ToInt(); + OUUInt64 tmp(plus); + return tmp; + } + } + OUUInt64 operator-(const OUUInt64 &r) const { + invariant(r.ToInt() != MHS_MAX_VAL); + if (_value == MHS_MAX_VAL) { + return *this; + } else { + invariant(_value >= r.ToInt()); + uint64_t minus = _value - r.ToInt(); + OUUInt64 tmp(minus); + return tmp; + } + } + OUUInt64 operator-=(const OUUInt64 &r) { + if (_value != MHS_MAX_VAL) { + invariant(r.ToInt() != MHS_MAX_VAL); + invariant(_value >= r.ToInt()); + _value -= r.ToInt(); + } + return *this; + } + OUUInt64 operator+=(const OUUInt64 &r) { + if (_value != MHS_MAX_VAL) { + if (r.ToInt() == MHS_MAX_VAL) { + _value = MHS_MAX_VAL; + } else { + invariant((MHS_MAX_VAL - _value) >= r.ToInt()); + this->_value += r.ToInt(); + } + } + return *this; + } + bool operator==(const OUUInt64 &r) const { + return _value == r.ToInt(); + } + bool operator!=(const OUUInt64 &r) const { + return _value != r.ToInt(); + } + OUUInt64 operator=(const OUUInt64 &r) { + _value = r.ToInt(); + return *this; + } + uint64_t ToInt() const { return _value; } + + private: + uint64_t _value; + }; + + class Node { + public: + struct BlockPair { + OUUInt64 _offset; + OUUInt64 _size; + + BlockPair() : _offset(0), _size(0) {} + BlockPair(uint64_t o, uint64_t s) : _offset(o), _size(s) {} + + BlockPair(OUUInt64 o, OUUInt64 s) : _offset(o), _size(s) {} + int operator<(const struct BlockPair &rhs) const { + return _offset < rhs._offset; + } + int operator<(const uint64_t &o) const { return _offset < o; } + }; + + struct Pair { + uint64_t _left; + uint64_t _right; + Pair(uint64_t l, uint64_t r) : _left(l), _right(r) {} + }; + + EColor _color; + struct BlockPair _hole; + struct Pair _label; + Node *_left; + Node *_right; + Node *_parent; + + Node(EColor c, + Node::BlockPair h, + struct Pair lb, + Node *l, + Node *r, + Node *p) + : _color(c), + _hole(h), + _label(lb), + _left(l), + _right(r), + _parent(p) {} + }; + + class Tree { + private: + Node *_root; + uint64_t _align; + + public: + Tree(); + Tree(uint64_t); + ~Tree(); + + void PreOrder(); + void InOrder(); + void PostOrder(); + // immutable operations + Node *SearchByOffset(uint64_t addr); + Node *SearchFirstFitBySize(uint64_t size); + + Node *MinNode(); + Node *MaxNode(); + + Node *Successor(Node *); + Node *Predecessor(Node *); + + // mapped from tree_allocator::free_block + int Insert(Node::BlockPair pair); + // mapped from tree_allocator::alloc_block + uint64_t Remove(size_t size); + // mapped from tree_allocator::alloc_block_after + + void RawRemove(uint64_t offset); + void 
Destroy(); + // print the tree + void Dump(); + // validation + // balance + void ValidateBalance(); + void ValidateInOrder(Node::BlockPair *); + void InOrderVisitor(void (*f)(void *, Node *, uint64_t), void *); + void ValidateMhs(); + + private: + void PreOrder(Node *node) const; + void InOrder(Node *node) const; + void PostOrder(Node *node) const; + Node *SearchByOffset(Node *node, offset_t addr) const; + Node *SearchFirstFitBySize(Node *node, size_t size) const; + + Node *MinNode(Node *node); + Node *MaxNode(Node *node); + + // rotations to fix up. we will have to update the labels too. + void LeftRotate(Node *&root, Node *x); + void RightRotate(Node *&root, Node *y); + + int Insert(Node *&root, Node::BlockPair pair); + int InsertFixup(Node *&root, Node *node); + + void RawRemove(Node *&root, Node *node); + uint64_t Remove(Node *&root, Node *node, size_t size); + void RawRemoveFixup(Node *&root, Node *node, Node *parent); + + void Destroy(Node *&tree); + void Dump(Node *tree, Node::BlockPair pair, EDirection dir); + void RecalculateMhs(Node *node); + void IsNewNodeMergable(Node *, Node *, Node::BlockPair, bool *, bool *); + void AbsorbNewNode(Node *, Node *, Node::BlockPair, bool, bool, bool); + Node *SearchFirstFitBySizeHelper(Node *x, uint64_t size); + + Node *SuccessorHelper(Node *y, Node *x); + + Node *PredecessorHelper(Node *y, Node *x); + + void InOrderVisitor(Node *, + void (*f)(void *, Node *, uint64_t), + void *, + uint64_t); + uint64_t ValidateMhs(Node *); + + uint64_t EffectiveSize(Node *); +// mixed with some macros..... +#define rbn_parent(r) ((r)->_parent) +#define rbn_color(r) ((r)->_color) +#define rbn_is_red(r) ((r)->_color == EColor::RED) +#define rbn_is_black(r) ((r)->_color == EColor::BLACK) +#define rbn_set_black(r) \ + do { \ + (r)->_color = EColor::BLACK; \ + } while (0) +#define rbn_set_red(r) \ + do { \ + (r)->_color = EColor::RED; \ + } while (0) +#define rbn_set_parent(r, p) \ + do { \ + (r)->_parent = (p); \ + } while (0) +#define rbn_set_color(r, c) \ + do { \ + (r)->_color = (c); \ + } while (0) +#define rbn_set_offset(r) \ + do { \ + (r)->_hole._offset = (c); \ + } while (0) +#define rbn_set_size(r, c) \ + do { \ + (r)->_hole._size = (c); \ + } while (0) +#define rbn_set_left_mhs(r, c) \ + do { \ + (r)->_label._left = (c); \ + } while (0) +#define rbn_set_right_mhs(r, c) \ + do { \ + (r)->_label._right = (c); \ + } while (0) +#define rbn_size(r) ((r)->_hole._size) +#define rbn_offset(r) ((r)->_hole._offset) +#define rbn_key(r) ((r)->_hole._offset) +#define rbn_left_mhs(r) ((r)->_label._left) +#define rbn_right_mhs(r) ((r)->_label._right) +#define mhs_of_subtree(y) \ + (std::max(std::max(rbn_left_mhs(y), rbn_right_mhs(y)), EffectiveSize(y))) + }; + +} // namespace MhsRbTree diff --git a/storage/tokudb/PerconaFT/ft/tests/block_allocator_strategy_test.cc b/storage/tokudb/PerconaFT/ft/tests/block_allocator_strategy_test.cc deleted file mode 100644 index 3670ef81cc2..00000000000 --- a/storage/tokudb/PerconaFT/ft/tests/block_allocator_strategy_test.cc +++ /dev/null @@ -1,126 +0,0 @@ -/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ -// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: -#ident "$Id$" -/*====== -This file is part of PerconaFT. - - -Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. - - PerconaFT is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License, version 2, - as published by the Free Software Foundation. 
- - PerconaFT is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. - ----------------------------------------- - - PerconaFT is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License, version 3, - as published by the Free Software Foundation. - - PerconaFT is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. -======= */ - -#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." - -#include "ft/tests/test.h" - -#include "ft/serialize/block_allocator_strategy.h" - -static const uint64_t alignment = 4096; - -static void test_first_vs_best_fit(void) { - struct block_allocator::blockpair pairs[] = { - block_allocator::blockpair(1 * alignment, 6 * alignment), - // hole between 7x align -> 8x align - block_allocator::blockpair(8 * alignment, 4 * alignment), - // hole between 12x align -> 16x align - block_allocator::blockpair(16 * alignment, 1 * alignment), - block_allocator::blockpair(17 * alignment, 2 * alignment), - // hole between 19 align -> 21x align - block_allocator::blockpair(21 * alignment, 2 * alignment), - }; - const uint64_t n_blocks = sizeof(pairs) / sizeof(pairs[0]); - - block_allocator::blockpair *bp; - - // first fit - bp = block_allocator_strategy::first_fit(pairs, n_blocks, 100, alignment); - assert(bp == &pairs[0]); - bp = block_allocator_strategy::first_fit(pairs, n_blocks, 4096, alignment); - assert(bp == &pairs[0]); - bp = block_allocator_strategy::first_fit(pairs, n_blocks, 3 * 4096, alignment); - assert(bp == &pairs[1]); - bp = block_allocator_strategy::first_fit(pairs, n_blocks, 5 * 4096, alignment); - assert(bp == nullptr); - - // best fit - bp = block_allocator_strategy::best_fit(pairs, n_blocks, 100, alignment); - assert(bp == &pairs[0]); - bp = block_allocator_strategy::best_fit(pairs, n_blocks, 4100, alignment); - assert(bp == &pairs[3]); - bp = block_allocator_strategy::best_fit(pairs, n_blocks, 3 * 4096, alignment); - assert(bp == &pairs[1]); - bp = block_allocator_strategy::best_fit(pairs, n_blocks, 5 * 4096, alignment); - assert(bp == nullptr); -} - -static void test_padded_fit(void) { - struct block_allocator::blockpair pairs[] = { - block_allocator::blockpair(1 * alignment, 1 * alignment), - // 4096 byte hole after bp[0] - block_allocator::blockpair(3 * alignment, 1 * alignment), - // 8192 byte hole after bp[1] - block_allocator::blockpair(6 * alignment, 1 * alignment), - // 16384 byte hole after bp[2] - block_allocator::blockpair(11 * alignment, 1 * alignment), - // 32768 byte hole after bp[3] - block_allocator::blockpair(17 * alignment, 1 * alignment), - // 116kb hole after bp[4] - block_allocator::blockpair(113 * alignment, 1 * alignment), - // 256kb hole after bp[5] - block_allocator::blockpair(371 * alignment, 1 * alignment), - }; - const uint64_t n_blocks = sizeof(pairs) / sizeof(pairs[0]); - - block_allocator::blockpair *bp; - - // padding 
for a 100 byte allocation will be < than standard alignment, - // so it should fit in the first 4096 byte hole. - bp = block_allocator_strategy::padded_fit(pairs, n_blocks, 4000, alignment); - assert(bp == &pairs[0]); - - // Even padded, a 12kb alloc will fit in a 16kb hole - bp = block_allocator_strategy::padded_fit(pairs, n_blocks, 3 * alignment, alignment); - assert(bp == &pairs[2]); - - // would normally fit in the 116kb hole but the padding will bring it over - bp = block_allocator_strategy::padded_fit(pairs, n_blocks, 116 * alignment, alignment); - assert(bp == &pairs[5]); - - bp = block_allocator_strategy::padded_fit(pairs, n_blocks, 127 * alignment, alignment); - assert(bp == &pairs[5]); -} - -int test_main(int argc, const char *argv[]) { - (void) argc; - (void) argv; - - test_first_vs_best_fit(); - test_padded_fit(); - - return 0; -} diff --git a/storage/tokudb/PerconaFT/ft/tests/block_allocator_test.cc b/storage/tokudb/PerconaFT/ft/tests/block_allocator_test.cc index d80ee83cbc9..3eff52b915d 100644 --- a/storage/tokudb/PerconaFT/ft/tests/block_allocator_test.cc +++ b/storage/tokudb/PerconaFT/ft/tests/block_allocator_test.cc @@ -38,253 +38,243 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. #include "test.h" -static void ba_alloc(block_allocator *ba, uint64_t size, uint64_t *answer) { - ba->validate(); +static void ba_alloc(BlockAllocator *ba, uint64_t size, uint64_t *answer) { + ba->Validate(); uint64_t actual_answer; - const uint64_t heat = random() % 2; - ba->alloc_block(512 * size, heat, &actual_answer); - ba->validate(); + ba->AllocBlock(512 * size, &actual_answer); + ba->Validate(); - assert(actual_answer%512==0); - *answer = actual_answer/512; + invariant(actual_answer % 512 == 0); + *answer = actual_answer / 512; } -static void ba_free(block_allocator *ba, uint64_t offset) { - ba->validate(); - ba->free_block(offset * 512); - ba->validate(); +static void ba_free(BlockAllocator *ba, uint64_t offset, uint64_t size) { + ba->Validate(); + ba->FreeBlock(offset * 512, 512 * size); + ba->Validate(); } -static void ba_check_l(block_allocator *ba, uint64_t blocknum_in_layout_order, - uint64_t expected_offset, uint64_t expected_size) { +static void ba_check_l(BlockAllocator *ba, + uint64_t blocknum_in_layout_order, + uint64_t expected_offset, + uint64_t expected_size) { uint64_t actual_offset, actual_size; - int r = ba->get_nth_block_in_layout_order(blocknum_in_layout_order, &actual_offset, &actual_size); - assert(r==0); - assert(expected_offset*512 == actual_offset); - assert(expected_size *512 == actual_size); + int r = ba->NthBlockInLayoutOrder( + blocknum_in_layout_order, &actual_offset, &actual_size); + invariant(r == 0); + invariant(expected_offset * 512 == actual_offset); + invariant(expected_size * 512 == actual_size); } -static void ba_check_none(block_allocator *ba, uint64_t blocknum_in_layout_order) { +static void ba_check_none(BlockAllocator *ba, + uint64_t blocknum_in_layout_order) { uint64_t actual_offset, actual_size; - int r = ba->get_nth_block_in_layout_order(blocknum_in_layout_order, &actual_offset, &actual_size); - assert(r==-1); + int r = ba->NthBlockInLayoutOrder( + blocknum_in_layout_order, &actual_offset, &actual_size); + invariant(r == -1); } - // Simple block allocator test -static void test_ba0(block_allocator::allocation_strategy strategy) { - block_allocator allocator; - block_allocator *ba = &allocator; - ba->create(100*512, 1*512); - ba->set_strategy(strategy); - assert(ba->allocated_limit()==100*512); +static void 
test_ba0() { + BlockAllocator allocator; + BlockAllocator *ba = &allocator; + ba->Create(100 * 512, 1 * 512); + invariant(ba->AllocatedLimit() == 100 * 512); uint64_t b2, b3, b4, b5, b6, b7; - ba_alloc(ba, 100, &b2); - ba_alloc(ba, 100, &b3); - ba_alloc(ba, 100, &b4); - ba_alloc(ba, 100, &b5); - ba_alloc(ba, 100, &b6); - ba_alloc(ba, 100, &b7); - ba_free(ba, b2); - ba_alloc(ba, 100, &b2); - ba_free(ba, b4); - ba_free(ba, b6); + ba_alloc(ba, 100, &b2); + ba_alloc(ba, 100, &b3); + ba_alloc(ba, 100, &b4); + ba_alloc(ba, 100, &b5); + ba_alloc(ba, 100, &b6); + ba_alloc(ba, 100, &b7); + ba_free(ba, b2, 100); + ba_alloc(ba, 100, &b2); + ba_free(ba, b4, 100); + ba_free(ba, b6, 100); uint64_t b8, b9; - ba_alloc(ba, 100, &b4); - ba_free(ba, b2); - ba_alloc(ba, 100, &b6); - ba_alloc(ba, 100, &b8); - ba_alloc(ba, 100, &b9); - ba_free(ba, b6); - ba_free(ba, b7); - ba_free(ba, b8); - ba_alloc(ba, 100, &b6); - ba_alloc(ba, 100, &b7); - ba_free(ba, b4); - ba_alloc(ba, 100, &b4); - - ba->destroy(); + ba_alloc(ba, 100, &b4); + ba_free(ba, b2, 100); + ba_alloc(ba, 100, &b6); + ba_alloc(ba, 100, &b8); + ba_alloc(ba, 100, &b9); + ba_free(ba, b6, 100); + ba_free(ba, b7, 100); + ba_free(ba, b8, 100); + ba_alloc(ba, 100, &b6); + ba_alloc(ba, 100, &b7); + ba_free(ba, b4, 100); + ba_alloc(ba, 100, &b4); + + ba->Destroy(); } // Manually to get coverage of all the code in the block allocator. -static void -test_ba1(block_allocator::allocation_strategy strategy, int n_initial) { - block_allocator allocator; - block_allocator *ba = &allocator; - ba->create(0*512, 1*512); - ba->set_strategy(strategy); - - int n_blocks=0; +static void test_ba1(int n_initial) { + BlockAllocator allocator; + BlockAllocator *ba = &allocator; + ba->Create(0 * 512, 1 * 512); + + int n_blocks = 0; uint64_t blocks[1000]; for (int i = 0; i < 1000; i++) { - if (i < n_initial || random() % 2 == 0) { - if (n_blocks < 1000) { - ba_alloc(ba, 1, &blocks[n_blocks]); - //printf("A[%d]=%ld\n", n_blocks, blocks[n_blocks]); - n_blocks++; - } - } else { - if (n_blocks > 0) { - int blocknum = random()%n_blocks; - //printf("F[%d]%ld\n", blocknum, blocks[blocknum]); - ba_free(ba, blocks[blocknum]); - blocks[blocknum]=blocks[n_blocks-1]; - n_blocks--; - } - } + if (i < n_initial || random() % 2 == 0) { + if (n_blocks < 1000) { + ba_alloc(ba, 1, &blocks[n_blocks]); + // printf("A[%d]=%ld\n", n_blocks, blocks[n_blocks]); + n_blocks++; + } + } else { + if (n_blocks > 0) { + int blocknum = random() % n_blocks; + // printf("F[%d]=%ld\n", blocknum, blocks[blocknum]); + ba_free(ba, blocks[blocknum], 1); + blocks[blocknum] = blocks[n_blocks - 1]; + n_blocks--; + } + } } - - ba->destroy(); + + ba->Destroy(); } - + // Check to see if it is first fit or best fit. 
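Before the rewritten test_ba2 below, a quick reminder of the two policies it distinguishes, since the deleted block_allocator_strategy_test above exercised both while the new MhsRbTree allocator keeps only first fit. This is an illustrative, linear-scan sketch of the selection rules only, not the patch's implementation; Hole, first_fit_scan and best_fit_scan are hypothetical names.

    #include <cstddef>
    #include <cstdint>

    struct Hole { uint64_t offset, size; };

    // First fit: the lowest-offset hole that is big enough.
    static const Hole *first_fit_scan(const Hole *holes, size_t n, uint64_t want) {
        for (size_t i = 0; i < n; i++)
            if (holes[i].size >= want)
                return &holes[i];
        return nullptr;
    }

    // Best fit: the smallest hole that is still big enough (ties go to the earlier hole).
    static const Hole *best_fit_scan(const Hole *holes, size_t n, uint64_t want) {
        const Hole *best = nullptr;
        for (size_t i = 0; i < n; i++)
            if (holes[i].size >= want && (!best || holes[i].size < best->size))
                best = &holes[i];
        return best;
    }

First fit returns the lowest-offset hole that is large enough; best fit returns the smallest such hole. The assertions in test_ba2, for example b3 == BSIZE after freeing the hole at BSIZE, are what pin the allocator to first-fit behaviour.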
-static void -test_ba2 (void) -{ - block_allocator allocator; - block_allocator *ba = &allocator; +static void test_ba2(void) { + BlockAllocator allocator; + BlockAllocator *ba = &allocator; uint64_t b[6]; enum { BSIZE = 1024 }; - ba->create(100*512, BSIZE*512); - ba->set_strategy(block_allocator::BA_STRATEGY_FIRST_FIT); - assert(ba->allocated_limit()==100*512); - - ba_check_l (ba, 0, 0, 100); - ba_check_none (ba, 1); - - ba_alloc (ba, 100, &b[0]); - ba_check_l (ba, 0, 0, 100); - ba_check_l (ba, 1, BSIZE, 100); - ba_check_none (ba, 2); - - ba_alloc (ba, BSIZE + 100, &b[1]); - ba_check_l (ba, 0, 0, 100); - ba_check_l (ba, 1, BSIZE, 100); - ba_check_l (ba, 2, 2*BSIZE, BSIZE + 100); - ba_check_none (ba, 3); - - ba_alloc (ba, 100, &b[2]); - ba_check_l (ba, 0, 0, 100); - ba_check_l (ba, 1, BSIZE, 100); - ba_check_l (ba, 2, 2*BSIZE, BSIZE + 100); - ba_check_l (ba, 3, 4*BSIZE, 100); - ba_check_none (ba, 4); - - ba_alloc (ba, 100, &b[3]); - ba_alloc (ba, 100, &b[4]); - ba_alloc (ba, 100, &b[5]); - ba_check_l (ba, 0, 0, 100); - ba_check_l (ba, 1, BSIZE, 100); - ba_check_l (ba, 2, 2*BSIZE, BSIZE + 100); - ba_check_l (ba, 3, 4*BSIZE, 100); - ba_check_l (ba, 4, 5*BSIZE, 100); - ba_check_l (ba, 5, 6*BSIZE, 100); - ba_check_l (ba, 6, 7*BSIZE, 100); - ba_check_none (ba, 7); - - ba_free (ba, 4*BSIZE); - ba_check_l (ba, 0, 0, 100); - ba_check_l (ba, 1, BSIZE, 100); - ba_check_l (ba, 2, 2*BSIZE, BSIZE + 100); - ba_check_l (ba, 3, 5*BSIZE, 100); - ba_check_l (ba, 4, 6*BSIZE, 100); - ba_check_l (ba, 5, 7*BSIZE, 100); - ba_check_none (ba, 6); + ba->Create(100 * 512, BSIZE * 512); + invariant(ba->AllocatedLimit() == 100 * 512); + + ba_check_l(ba, 0, 0, 100); + ba_check_none(ba, 1); + + ba_alloc(ba, 100, &b[0]); + ba_check_l(ba, 0, 0, 100); + ba_check_l(ba, 1, BSIZE, 100); + ba_check_none(ba, 2); + + ba_alloc(ba, BSIZE + 100, &b[1]); + ba_check_l(ba, 0, 0, 100); + ba_check_l(ba, 1, BSIZE, 100); + ba_check_l(ba, 2, 2 * BSIZE, BSIZE + 100); + ba_check_none(ba, 3); + + ba_alloc(ba, 100, &b[2]); + ba_check_l(ba, 0, 0, 100); + ba_check_l(ba, 1, BSIZE, 100); + ba_check_l(ba, 2, 2 * BSIZE, BSIZE + 100); + ba_check_l(ba, 3, 4 * BSIZE, 100); + ba_check_none(ba, 4); + + ba_alloc(ba, 100, &b[3]); + ba_alloc(ba, 100, &b[4]); + ba_alloc(ba, 100, &b[5]); + ba_check_l(ba, 0, 0, 100); + ba_check_l(ba, 1, BSIZE, 100); + ba_check_l(ba, 2, 2 * BSIZE, BSIZE + 100); + ba_check_l(ba, 3, 4 * BSIZE, 100); + ba_check_l(ba, 4, 5 * BSIZE, 100); + ba_check_l(ba, 5, 6 * BSIZE, 100); + ba_check_l(ba, 6, 7 * BSIZE, 100); + ba_check_none(ba, 7); + + ba_free(ba, 4 * BSIZE, 100); + ba_check_l(ba, 0, 0, 100); + ba_check_l(ba, 1, BSIZE, 100); + ba_check_l(ba, 2, 2 * BSIZE, BSIZE + 100); + ba_check_l(ba, 3, 5 * BSIZE, 100); + ba_check_l(ba, 4, 6 * BSIZE, 100); + ba_check_l(ba, 5, 7 * BSIZE, 100); + ba_check_none(ba, 6); uint64_t b2; ba_alloc(ba, 100, &b2); - assert(b2==4*BSIZE); - ba_check_l (ba, 0, 0, 100); - ba_check_l (ba, 1, BSIZE, 100); - ba_check_l (ba, 2, 2*BSIZE, BSIZE + 100); - ba_check_l (ba, 3, 4*BSIZE, 100); - ba_check_l (ba, 4, 5*BSIZE, 100); - ba_check_l (ba, 5, 6*BSIZE, 100); - ba_check_l (ba, 6, 7*BSIZE, 100); - ba_check_none (ba, 7); - - ba_free (ba, BSIZE); - ba_free (ba, 5*BSIZE); - ba_check_l (ba, 0, 0, 100); - ba_check_l (ba, 1, 2*BSIZE, BSIZE + 100); - ba_check_l (ba, 2, 4*BSIZE, 100); - ba_check_l (ba, 3, 6*BSIZE, 100); - ba_check_l (ba, 4, 7*BSIZE, 100); - ba_check_none (ba, 5); - - // This alloc will allocate the first block after the reserve space in the case of first fit. 
+ invariant(b2 == 4 * BSIZE); + ba_check_l(ba, 0, 0, 100); + ba_check_l(ba, 1, BSIZE, 100); + ba_check_l(ba, 2, 2 * BSIZE, BSIZE + 100); + ba_check_l(ba, 3, 4 * BSIZE, 100); + ba_check_l(ba, 4, 5 * BSIZE, 100); + ba_check_l(ba, 5, 6 * BSIZE, 100); + ba_check_l(ba, 6, 7 * BSIZE, 100); + ba_check_none(ba, 7); + + ba_free(ba, BSIZE, 100); + ba_free(ba, 5 * BSIZE, 100); + ba_check_l(ba, 0, 0, 100); + ba_check_l(ba, 1, 2 * BSIZE, BSIZE + 100); + ba_check_l(ba, 2, 4 * BSIZE, 100); + ba_check_l(ba, 3, 6 * BSIZE, 100); + ba_check_l(ba, 4, 7 * BSIZE, 100); + ba_check_none(ba, 5); + + // This alloc will allocate the first block after the reserve space in the + // case of first fit. uint64_t b3; ba_alloc(ba, 100, &b3); - assert(b3== BSIZE); // First fit. + invariant(b3 == BSIZE); // First fit. // if (b3==5*BSIZE) then it is next fit. // Now 5*BSIZE is free uint64_t b5; ba_alloc(ba, 100, &b5); - assert(b5==5*BSIZE); - ba_check_l (ba, 0, 0, 100); - ba_check_l (ba, 1, BSIZE, 100); - ba_check_l (ba, 2, 2*BSIZE, BSIZE + 100); - ba_check_l (ba, 3, 4*BSIZE, 100); - ba_check_l (ba, 4, 5*BSIZE, 100); - ba_check_l (ba, 5, 6*BSIZE, 100); - ba_check_l (ba, 6, 7*BSIZE, 100); - ba_check_none (ba, 7); + invariant(b5 == 5 * BSIZE); + ba_check_l(ba, 0, 0, 100); + ba_check_l(ba, 1, BSIZE, 100); + ba_check_l(ba, 2, 2 * BSIZE, BSIZE + 100); + ba_check_l(ba, 3, 4 * BSIZE, 100); + ba_check_l(ba, 4, 5 * BSIZE, 100); + ba_check_l(ba, 5, 6 * BSIZE, 100); + ba_check_l(ba, 6, 7 * BSIZE, 100); + ba_check_none(ba, 7); // Now all blocks are busy uint64_t b6, b7, b8; ba_alloc(ba, 100, &b6); ba_alloc(ba, 100, &b7); ba_alloc(ba, 100, &b8); - assert(b6==8*BSIZE); - assert(b7==9*BSIZE); - assert(b8==10*BSIZE); - ba_check_l (ba, 0, 0, 100); - ba_check_l (ba, 1, BSIZE, 100); - ba_check_l (ba, 2, 2*BSIZE, BSIZE + 100); - ba_check_l (ba, 3, 4*BSIZE, 100); - ba_check_l (ba, 4, 5*BSIZE, 100); - ba_check_l (ba, 5, 6*BSIZE, 100); - ba_check_l (ba, 6, 7*BSIZE, 100); - ba_check_l (ba, 7, 8*BSIZE, 100); - ba_check_l (ba, 8, 9*BSIZE, 100); - ba_check_l (ba, 9, 10*BSIZE, 100); - ba_check_none (ba, 10); - - ba_free(ba, 9*BSIZE); - ba_free(ba, 7*BSIZE); + invariant(b6 == 8 * BSIZE); + invariant(b7 == 9 * BSIZE); + invariant(b8 == 10 * BSIZE); + ba_check_l(ba, 0, 0, 100); + ba_check_l(ba, 1, BSIZE, 100); + ba_check_l(ba, 2, 2 * BSIZE, BSIZE + 100); + ba_check_l(ba, 3, 4 * BSIZE, 100); + ba_check_l(ba, 4, 5 * BSIZE, 100); + ba_check_l(ba, 5, 6 * BSIZE, 100); + ba_check_l(ba, 6, 7 * BSIZE, 100); + ba_check_l(ba, 7, 8 * BSIZE, 100); + ba_check_l(ba, 8, 9 * BSIZE, 100); + ba_check_l(ba, 9, 10 * BSIZE, 100); + ba_check_none(ba, 10); + + ba_free(ba, 9 * BSIZE, 100); + ba_free(ba, 7 * BSIZE, 100); uint64_t b9; ba_alloc(ba, 100, &b9); - assert(b9==7*BSIZE); + invariant(b9 == 7 * BSIZE); - ba_free(ba, 5*BSIZE); - ba_free(ba, 2*BSIZE); + ba_free(ba, 5 * BSIZE, 100); + ba_free(ba, 2 * BSIZE, BSIZE + 100); uint64_t b10, b11; ba_alloc(ba, 100, &b10); - assert(b10==2*BSIZE); + invariant(b10 == 2 * BSIZE); ba_alloc(ba, 100, &b11); - assert(b11==3*BSIZE); + invariant(b11 == 3 * BSIZE); ba_alloc(ba, 100, &b11); - assert(b11==5*BSIZE); + invariant(b11 == 5 * BSIZE); - ba->destroy(); + ba->Destroy(); } -int -test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) { - enum block_allocator::allocation_strategy strategies[] = { - block_allocator::BA_STRATEGY_FIRST_FIT, - block_allocator::BA_STRATEGY_BEST_FIT, - block_allocator::BA_STRATEGY_PADDED_FIT, - block_allocator::BA_STRATEGY_HEAT_ZONE, - }; - for (size_t i = 0; i < 
sizeof(strategies) / sizeof(strategies[0]); i++) { - test_ba0(strategies[i]); - test_ba1(strategies[i], 0); - test_ba1(strategies[i], 10); - test_ba1(strategies[i], 20); - } +int test_main(int argc __attribute__((__unused__)), + const char *argv[] __attribute__((__unused__))) { + test_ba0(); + test_ba1(0); + test_ba1(10); + test_ba1(20); test_ba2(); return 0; } diff --git a/storage/tokudb/PerconaFT/ft/tests/cachetable-5978.cc b/storage/tokudb/PerconaFT/ft/tests/cachetable-5978.cc index a7c48ef709a..ee68ab3ef0b 100644 --- a/storage/tokudb/PerconaFT/ft/tests/cachetable-5978.cc +++ b/storage/tokudb/PerconaFT/ft/tests/cachetable-5978.cc @@ -45,7 +45,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. // #5978 is fixed. Here is what we do. We have four pairs with // blocknums and fullhashes of 1,2,3,4. The cachetable has only // two bucket mutexes, so 1 and 3 share a pair mutex, as do 2 and 4. -// We pin all four with expensive write locks. Then, on backgroud threads, +// We pin all four with expensive write locks. Then, on background threads, // we call get_and_pin_nonblocking on 3, where the unlockers unpins 2, and // we call get_and_pin_nonblocking on 4, where the unlockers unpins 1. Run this // enough times, and we should see a deadlock before the fix, and no deadlock diff --git a/storage/tokudb/PerconaFT/ft/tests/cachetable-simple-clone2.cc b/storage/tokudb/PerconaFT/ft/tests/cachetable-simple-clone2.cc index be4bae898be..51cf70c3e76 100644 --- a/storage/tokudb/PerconaFT/ft/tests/cachetable-simple-clone2.cc +++ b/storage/tokudb/PerconaFT/ft/tests/cachetable-simple-clone2.cc @@ -77,7 +77,7 @@ flush ( // // test the following things for simple cloning: -// - verifies that after teh checkpoint ends, the PAIR is properly +// - verifies that after the checkpoint ends, the PAIR is properly // dirty or clean based on the second unpin // static void diff --git a/storage/tokudb/PerconaFT/ft/tests/ft-bfe-query.cc b/storage/tokudb/PerconaFT/ft/tests/ft-bfe-query.cc index cb03a23e0fc..7abd2267a7e 100644 --- a/storage/tokudb/PerconaFT/ft/tests/ft-bfe-query.cc +++ b/storage/tokudb/PerconaFT/ft/tests/ft-bfe-query.cc @@ -38,69 +38,72 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. 
#include "test.h" -static int -int64_key_cmp (DB *db UU(), const DBT *a, const DBT *b) { - int64_t x = *(int64_t *) a->data; - int64_t y = *(int64_t *) b->data; - - if (x<y) return -1; - if (x>y) return 1; +static int int64_key_cmp(DB *db UU(), const DBT *a, const DBT *b) { + int64_t x = *(int64_t *)a->data; + int64_t y = *(int64_t *)b->data; + + if (x < y) + return -1; + if (x > y) + return 1; return 0; } -static void -test_prefetch_read(int fd, FT_HANDLE UU(ft), FT ft_h) { +static void test_prefetch_read(int fd, FT_HANDLE UU(ft), FT ft_h) { int r; FT_CURSOR XMALLOC(cursor); FTNODE dn = NULL; PAIR_ATTR attr; - + // first test that prefetching everything should work - memset(&cursor->range_lock_left_key, 0 , sizeof(DBT)); - memset(&cursor->range_lock_right_key, 0 , sizeof(DBT)); + memset(&cursor->range_lock_left_key, 0, sizeof(DBT)); + memset(&cursor->range_lock_right_key, 0, sizeof(DBT)); cursor->left_is_neg_infty = true; cursor->right_is_pos_infty = true; cursor->disable_prefetching = false; - + ftnode_fetch_extra bfe; // quick test to see that we have the right behavior when we set // disable_prefetching to true cursor->disable_prefetching = true; - bfe.create_for_prefetch( ft_h, cursor); + bfe.create_for_prefetch(ft_h, cursor); FTNODE_DISK_DATA ndd = NULL; - r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe); - assert(r==0); - assert(dn->n_children == 3); - assert(BP_STATE(dn,0) == PT_ON_DISK); - assert(BP_STATE(dn,1) == PT_ON_DISK); - assert(BP_STATE(dn,2) == PT_ON_DISK); + r = toku_deserialize_ftnode_from( + fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe); + invariant(r == 0); + invariant(dn->n_children == 3); + invariant(BP_STATE(dn, 0) == PT_ON_DISK); + invariant(BP_STATE(dn, 1) == PT_ON_DISK); + invariant(BP_STATE(dn, 2) == PT_ON_DISK); r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr); - assert(BP_STATE(dn,0) == PT_ON_DISK); - assert(BP_STATE(dn,1) == PT_ON_DISK); - assert(BP_STATE(dn,2) == PT_ON_DISK); + invariant(BP_STATE(dn, 0) == PT_ON_DISK); + invariant(BP_STATE(dn, 1) == PT_ON_DISK); + invariant(BP_STATE(dn, 2) == PT_ON_DISK); bfe.destroy(); toku_ftnode_free(&dn); toku_free(ndd); // now enable prefetching again cursor->disable_prefetching = false; - - bfe.create_for_prefetch( ft_h, cursor); - r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe); - assert(r==0); - assert(dn->n_children == 3); - assert(BP_STATE(dn,0) == PT_AVAIL); - assert(BP_STATE(dn,1) == PT_AVAIL); - assert(BP_STATE(dn,2) == PT_AVAIL); - toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr); - assert(BP_STATE(dn,0) == PT_COMPRESSED); - assert(BP_STATE(dn,1) == PT_COMPRESSED); - assert(BP_STATE(dn,2) == PT_COMPRESSED); + + bfe.create_for_prefetch(ft_h, cursor); + r = toku_deserialize_ftnode_from( + fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe); + invariant(r == 0); + invariant(dn->n_children == 3); + invariant(BP_STATE(dn, 0) == PT_AVAIL); + invariant(BP_STATE(dn, 1) == PT_AVAIL); + invariant(BP_STATE(dn, 2) == PT_AVAIL); + toku_ftnode_pe_callback( + dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr); + invariant(BP_STATE(dn, 0) == PT_COMPRESSED); + invariant(BP_STATE(dn, 1) == PT_COMPRESSED); + invariant(BP_STATE(dn, 2) == PT_COMPRESSED); r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr); - assert(BP_STATE(dn,0) == PT_AVAIL); - assert(BP_STATE(dn,1) == PT_AVAIL); - assert(BP_STATE(dn,2) == PT_AVAIL); + 
invariant(BP_STATE(dn, 0) == PT_AVAIL); + invariant(BP_STATE(dn, 1) == PT_AVAIL); + invariant(BP_STATE(dn, 2) == PT_AVAIL); bfe.destroy(); toku_ftnode_free(&dn); toku_free(ndd); @@ -108,21 +111,23 @@ test_prefetch_read(int fd, FT_HANDLE UU(ft), FT ft_h) { uint64_t left_key = 150; toku_fill_dbt(&cursor->range_lock_left_key, &left_key, sizeof(uint64_t)); cursor->left_is_neg_infty = false; - bfe.create_for_prefetch( ft_h, cursor); - r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe); - assert(r==0); - assert(dn->n_children == 3); - assert(BP_STATE(dn,0) == PT_ON_DISK); - assert(BP_STATE(dn,1) == PT_AVAIL); - assert(BP_STATE(dn,2) == PT_AVAIL); - toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr); - assert(BP_STATE(dn,0) == PT_ON_DISK); - assert(BP_STATE(dn,1) == PT_COMPRESSED); - assert(BP_STATE(dn,2) == PT_COMPRESSED); + bfe.create_for_prefetch(ft_h, cursor); + r = toku_deserialize_ftnode_from( + fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe); + invariant(r == 0); + invariant(dn->n_children == 3); + invariant(BP_STATE(dn, 0) == PT_ON_DISK); + invariant(BP_STATE(dn, 1) == PT_AVAIL); + invariant(BP_STATE(dn, 2) == PT_AVAIL); + toku_ftnode_pe_callback( + dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr); + invariant(BP_STATE(dn, 0) == PT_ON_DISK); + invariant(BP_STATE(dn, 1) == PT_COMPRESSED); + invariant(BP_STATE(dn, 2) == PT_COMPRESSED); r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr); - assert(BP_STATE(dn,0) == PT_ON_DISK); - assert(BP_STATE(dn,1) == PT_AVAIL); - assert(BP_STATE(dn,2) == PT_AVAIL); + invariant(BP_STATE(dn, 0) == PT_ON_DISK); + invariant(BP_STATE(dn, 1) == PT_AVAIL); + invariant(BP_STATE(dn, 2) == PT_AVAIL); bfe.destroy(); toku_ftnode_free(&dn); toku_free(ndd); @@ -130,63 +135,69 @@ test_prefetch_read(int fd, FT_HANDLE UU(ft), FT ft_h) { uint64_t right_key = 151; toku_fill_dbt(&cursor->range_lock_right_key, &right_key, sizeof(uint64_t)); cursor->right_is_pos_infty = false; - bfe.create_for_prefetch( ft_h, cursor); - r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe); - assert(r==0); - assert(dn->n_children == 3); - assert(BP_STATE(dn,0) == PT_ON_DISK); - assert(BP_STATE(dn,1) == PT_AVAIL); - assert(BP_STATE(dn,2) == PT_ON_DISK); - toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr); - assert(BP_STATE(dn,0) == PT_ON_DISK); - assert(BP_STATE(dn,1) == PT_COMPRESSED); - assert(BP_STATE(dn,2) == PT_ON_DISK); + bfe.create_for_prefetch(ft_h, cursor); + r = toku_deserialize_ftnode_from( + fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe); + invariant(r == 0); + invariant(dn->n_children == 3); + invariant(BP_STATE(dn, 0) == PT_ON_DISK); + invariant(BP_STATE(dn, 1) == PT_AVAIL); + invariant(BP_STATE(dn, 2) == PT_ON_DISK); + toku_ftnode_pe_callback( + dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr); + invariant(BP_STATE(dn, 0) == PT_ON_DISK); + invariant(BP_STATE(dn, 1) == PT_COMPRESSED); + invariant(BP_STATE(dn, 2) == PT_ON_DISK); r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr); - assert(BP_STATE(dn,0) == PT_ON_DISK); - assert(BP_STATE(dn,1) == PT_AVAIL); - assert(BP_STATE(dn,2) == PT_ON_DISK); + invariant(BP_STATE(dn, 0) == PT_ON_DISK); + invariant(BP_STATE(dn, 1) == PT_AVAIL); + invariant(BP_STATE(dn, 2) == PT_ON_DISK); bfe.destroy(); toku_ftnode_free(&dn); toku_free(ndd); left_key = 100000; right_key = 100000; - 
bfe.create_for_prefetch( ft_h, cursor); - r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe); - assert(r==0); - assert(dn->n_children == 3); - assert(BP_STATE(dn,0) == PT_ON_DISK); - assert(BP_STATE(dn,1) == PT_ON_DISK); - assert(BP_STATE(dn,2) == PT_AVAIL); - toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr); - assert(BP_STATE(dn,0) == PT_ON_DISK); - assert(BP_STATE(dn,1) == PT_ON_DISK); - assert(BP_STATE(dn,2) == PT_COMPRESSED); + bfe.create_for_prefetch(ft_h, cursor); + r = toku_deserialize_ftnode_from( + fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe); + invariant(r == 0); + invariant(dn->n_children == 3); + invariant(BP_STATE(dn, 0) == PT_ON_DISK); + invariant(BP_STATE(dn, 1) == PT_ON_DISK); + invariant(BP_STATE(dn, 2) == PT_AVAIL); + toku_ftnode_pe_callback( + dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr); + invariant(BP_STATE(dn, 0) == PT_ON_DISK); + invariant(BP_STATE(dn, 1) == PT_ON_DISK); + invariant(BP_STATE(dn, 2) == PT_COMPRESSED); r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr); - assert(BP_STATE(dn,0) == PT_ON_DISK); - assert(BP_STATE(dn,1) == PT_ON_DISK); - assert(BP_STATE(dn,2) == PT_AVAIL); + invariant(BP_STATE(dn, 0) == PT_ON_DISK); + invariant(BP_STATE(dn, 1) == PT_ON_DISK); + invariant(BP_STATE(dn, 2) == PT_AVAIL); bfe.destroy(); toku_free(ndd); toku_ftnode_free(&dn); left_key = 100; right_key = 100; - bfe.create_for_prefetch( ft_h, cursor); - r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe); - assert(r==0); - assert(dn->n_children == 3); - assert(BP_STATE(dn,0) == PT_AVAIL); - assert(BP_STATE(dn,1) == PT_ON_DISK); - assert(BP_STATE(dn,2) == PT_ON_DISK); - toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr); - assert(BP_STATE(dn,0) == PT_COMPRESSED); - assert(BP_STATE(dn,1) == PT_ON_DISK); - assert(BP_STATE(dn,2) == PT_ON_DISK); + bfe.create_for_prefetch(ft_h, cursor); + r = toku_deserialize_ftnode_from( + fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe); + invariant(r == 0); + invariant(dn->n_children == 3); + invariant(BP_STATE(dn, 0) == PT_AVAIL); + invariant(BP_STATE(dn, 1) == PT_ON_DISK); + invariant(BP_STATE(dn, 2) == PT_ON_DISK); + toku_ftnode_pe_callback( + dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr); + invariant(BP_STATE(dn, 0) == PT_COMPRESSED); + invariant(BP_STATE(dn, 1) == PT_ON_DISK); + invariant(BP_STATE(dn, 2) == PT_ON_DISK); r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr); - assert(BP_STATE(dn,0) == PT_AVAIL); - assert(BP_STATE(dn,1) == PT_ON_DISK); - assert(BP_STATE(dn,2) == PT_ON_DISK); + invariant(BP_STATE(dn, 0) == PT_AVAIL); + invariant(BP_STATE(dn, 1) == PT_ON_DISK); + invariant(BP_STATE(dn, 2) == PT_ON_DISK); bfe.destroy(); toku_ftnode_free(&dn); toku_free(ndd); @@ -194,20 +205,19 @@ test_prefetch_read(int fd, FT_HANDLE UU(ft), FT ft_h) { toku_free(cursor); } -static void -test_subset_read(int fd, FT_HANDLE UU(ft), FT ft_h) { +static void test_subset_read(int fd, FT_HANDLE UU(ft), FT ft_h) { int r; FT_CURSOR XMALLOC(cursor); FTNODE dn = NULL; FTNODE_DISK_DATA ndd = NULL; PAIR_ATTR attr; - + // first test that prefetching everything should work - memset(&cursor->range_lock_left_key, 0 , sizeof(DBT)); - memset(&cursor->range_lock_right_key, 0 , sizeof(DBT)); + memset(&cursor->range_lock_left_key, 0, sizeof(DBT)); + memset(&cursor->range_lock_right_key, 0, sizeof(DBT)); 
cursor->left_is_neg_infty = true; cursor->right_is_pos_infty = true; - + uint64_t left_key = 150; uint64_t right_key = 151; DBT left, right; @@ -216,101 +226,106 @@ test_subset_read(int fd, FT_HANDLE UU(ft), FT ft_h) { ftnode_fetch_extra bfe; bfe.create_for_subset_read( - ft_h, - NULL, - &left, - &right, - false, - false, - false, - false - ); - + ft_h, NULL, &left, &right, false, false, false, false); + // fake the childnum to read // set disable_prefetching ON bfe.child_to_read = 2; bfe.disable_prefetching = true; - r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe); - assert(r==0); - assert(dn->n_children == 3); - assert(BP_STATE(dn,0) == PT_ON_DISK); - assert(BP_STATE(dn,1) == PT_ON_DISK); - assert(BP_STATE(dn,2) == PT_AVAIL); - // need to call this twice because we had a subset read before, that touched the clock - toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr); - assert(BP_STATE(dn,0) == PT_ON_DISK); - assert(BP_STATE(dn,1) == PT_ON_DISK); - assert(BP_STATE(dn,2) == PT_AVAIL); - toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr); - assert(BP_STATE(dn,0) == PT_ON_DISK); - assert(BP_STATE(dn,1) == PT_ON_DISK); - assert(BP_STATE(dn,2) == PT_COMPRESSED); + r = toku_deserialize_ftnode_from( + fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe); + invariant(r == 0); + invariant(dn->n_children == 3); + invariant(BP_STATE(dn, 0) == PT_ON_DISK); + invariant(BP_STATE(dn, 1) == PT_ON_DISK); + invariant(BP_STATE(dn, 2) == PT_AVAIL); + // need to call this twice because we had a subset read before, that touched + // the clock + toku_ftnode_pe_callback( + dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr); + invariant(BP_STATE(dn, 0) == PT_ON_DISK); + invariant(BP_STATE(dn, 1) == PT_ON_DISK); + invariant(BP_STATE(dn, 2) == PT_AVAIL); + toku_ftnode_pe_callback( + dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr); + invariant(BP_STATE(dn, 0) == PT_ON_DISK); + invariant(BP_STATE(dn, 1) == PT_ON_DISK); + invariant(BP_STATE(dn, 2) == PT_COMPRESSED); r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr); - assert(BP_STATE(dn,0) == PT_ON_DISK); - assert(BP_STATE(dn,1) == PT_ON_DISK); - assert(BP_STATE(dn,2) == PT_AVAIL); + invariant(BP_STATE(dn, 0) == PT_ON_DISK); + invariant(BP_STATE(dn, 1) == PT_ON_DISK); + invariant(BP_STATE(dn, 2) == PT_AVAIL); toku_ftnode_free(&dn); toku_free(ndd); // fake the childnum to read bfe.child_to_read = 2; bfe.disable_prefetching = false; - r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe); - assert(r==0); - assert(dn->n_children == 3); - assert(BP_STATE(dn,0) == PT_ON_DISK); - assert(BP_STATE(dn,1) == PT_AVAIL); - assert(BP_STATE(dn,2) == PT_AVAIL); - // need to call this twice because we had a subset read before, that touched the clock - toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr); - assert(BP_STATE(dn,0) == PT_ON_DISK); - assert(BP_STATE(dn,1) == PT_COMPRESSED); - assert(BP_STATE(dn,2) == PT_AVAIL); - toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr); - assert(BP_STATE(dn,0) == PT_ON_DISK); - assert(BP_STATE(dn,1) == PT_COMPRESSED); - assert(BP_STATE(dn,2) == PT_COMPRESSED); + r = toku_deserialize_ftnode_from( + fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe); + invariant(r == 0); + invariant(dn->n_children == 3); + 
invariant(BP_STATE(dn, 0) == PT_ON_DISK); + invariant(BP_STATE(dn, 1) == PT_AVAIL); + invariant(BP_STATE(dn, 2) == PT_AVAIL); + // need to call this twice because we had a subset read before, that touched + // the clock + toku_ftnode_pe_callback( + dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr); + invariant(BP_STATE(dn, 0) == PT_ON_DISK); + invariant(BP_STATE(dn, 1) == PT_COMPRESSED); + invariant(BP_STATE(dn, 2) == PT_AVAIL); + toku_ftnode_pe_callback( + dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr); + invariant(BP_STATE(dn, 0) == PT_ON_DISK); + invariant(BP_STATE(dn, 1) == PT_COMPRESSED); + invariant(BP_STATE(dn, 2) == PT_COMPRESSED); r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr); - assert(BP_STATE(dn,0) == PT_ON_DISK); - assert(BP_STATE(dn,1) == PT_AVAIL); - assert(BP_STATE(dn,2) == PT_AVAIL); + invariant(BP_STATE(dn, 0) == PT_ON_DISK); + invariant(BP_STATE(dn, 1) == PT_AVAIL); + invariant(BP_STATE(dn, 2) == PT_AVAIL); toku_ftnode_free(&dn); toku_free(ndd); // fake the childnum to read bfe.child_to_read = 0; - r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd, &bfe); - assert(r==0); - assert(dn->n_children == 3); - assert(BP_STATE(dn,0) == PT_AVAIL); - assert(BP_STATE(dn,1) == PT_AVAIL); - assert(BP_STATE(dn,2) == PT_ON_DISK); - // need to call this twice because we had a subset read before, that touched the clock - toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr); - assert(BP_STATE(dn,0) == PT_AVAIL); - assert(BP_STATE(dn,1) == PT_COMPRESSED); - assert(BP_STATE(dn,2) == PT_ON_DISK); - toku_ftnode_pe_callback(dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr); - assert(BP_STATE(dn,0) == PT_COMPRESSED); - assert(BP_STATE(dn,1) == PT_COMPRESSED); - assert(BP_STATE(dn,2) == PT_ON_DISK); + r = toku_deserialize_ftnode_from( + fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd, &bfe); + invariant(r == 0); + invariant(dn->n_children == 3); + invariant(BP_STATE(dn, 0) == PT_AVAIL); + invariant(BP_STATE(dn, 1) == PT_AVAIL); + invariant(BP_STATE(dn, 2) == PT_ON_DISK); + // need to call this twice because we had a subset read before, that touched + // the clock + toku_ftnode_pe_callback( + dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr); + invariant(BP_STATE(dn, 0) == PT_AVAIL); + invariant(BP_STATE(dn, 1) == PT_COMPRESSED); + invariant(BP_STATE(dn, 2) == PT_ON_DISK); + toku_ftnode_pe_callback( + dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr); + invariant(BP_STATE(dn, 0) == PT_COMPRESSED); + invariant(BP_STATE(dn, 1) == PT_COMPRESSED); + invariant(BP_STATE(dn, 2) == PT_ON_DISK); r = toku_ftnode_pf_callback(dn, ndd, &bfe, fd, &attr); - assert(BP_STATE(dn,0) == PT_AVAIL); - assert(BP_STATE(dn,1) == PT_AVAIL); - assert(BP_STATE(dn,2) == PT_ON_DISK); + invariant(BP_STATE(dn, 0) == PT_AVAIL); + invariant(BP_STATE(dn, 1) == PT_AVAIL); + invariant(BP_STATE(dn, 2) == PT_ON_DISK); toku_ftnode_free(&dn); toku_free(ndd); toku_free(cursor); } - -static void -test_prefetching(void) { +static void test_prefetching(void) { // struct ft_handle source_ft; struct ftnode sn; - int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0); + int fd = open(TOKU_TEST_FILENAME, + O_RDWR | O_CREAT | O_BINARY, + S_IRWXU | S_IRWXG | S_IRWXO); + invariant(fd >= 0); int r; @@ -327,7 +342,7 @@ test_prefetching(void) { uint64_t key1 = 100; uint64_t key2 = 200; - + 
MALLOC_N(sn.n_children, sn.bp); DBT pivotkeys[2]; toku_fill_dbt(&pivotkeys[0], &key1, sizeof(key1)); @@ -336,13 +351,13 @@ test_prefetching(void) { BP_BLOCKNUM(&sn, 0).b = 30; BP_BLOCKNUM(&sn, 1).b = 35; BP_BLOCKNUM(&sn, 2).b = 40; - BP_STATE(&sn,0) = PT_AVAIL; - BP_STATE(&sn,1) = PT_AVAIL; - BP_STATE(&sn,2) = PT_AVAIL; + BP_STATE(&sn, 0) = PT_AVAIL; + BP_STATE(&sn, 1) = PT_AVAIL; + BP_STATE(&sn, 2) = PT_AVAIL; set_BNC(&sn, 0, toku_create_empty_nl()); set_BNC(&sn, 1, toku_create_empty_nl()); set_BNC(&sn, 2, toku_create_empty_nl()); - //Create XIDS + // Create XIDS XIDS xids_0 = toku_xids_get_root_xids(); XIDS xids_123; XIDS xids_234; @@ -352,7 +367,7 @@ test_prefetching(void) { CKERR(r); // data in the buffers does not matter in this test - //Cleanup: + // Cleanup: toku_xids_destroy(&xids_0); toku_xids_destroy(&xids_123); toku_xids_destroy(&xids_234); @@ -363,41 +378,48 @@ test_prefetching(void) { make_blocknum(0), ZERO_LSN, TXNID_NONE, - 4*1024*1024, - 128*1024, + 4 * 1024 * 1024, + 128 * 1024, TOKU_DEFAULT_COMPRESSION_METHOD, 16); ft_h->cmp.create(int64_key_cmp, nullptr); ft->ft = ft_h; ft_h->blocktable.create(); - { int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); } - //Want to use block #20 + { + int r_truncate = ftruncate(fd, 0); + CKERR(r_truncate); + } + // Want to use block #20 BLOCKNUM b = make_blocknum(0); while (b.b < 20) { ft_h->blocktable.allocate_blocknum(&b, ft_h); } - assert(b.b == 20); + invariant(b.b == 20); { DISKOFF offset; DISKOFF size; - ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0); - assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false); + invariant(offset == + (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size); - assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); - assert(size == 100); + invariant(offset == + (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + invariant(size == 100); } FTNODE_DISK_DATA ndd = NULL; - r = toku_serialize_ftnode_to(fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false); - assert(r==0); + r = toku_serialize_ftnode_to( + fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false); + invariant(r == 0); - test_prefetch_read(fd, ft, ft_h); + test_prefetch_read(fd, ft, ft_h); test_subset_read(fd, ft, ft_h); toku_destroy_ftnode_internals(&sn); - ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + ft_h->blocktable.block_free( + BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100); ft_h->blocktable.destroy(); ft_h->cmp.destroy(); toku_free(ft_h->h); @@ -405,11 +427,12 @@ test_prefetching(void) { toku_free(ft); toku_free(ndd); - r = close(fd); assert(r != -1); + r = close(fd); + invariant(r != -1); } -int -test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) { +int test_main(int argc __attribute__((__unused__)), + const char *argv[] __attribute__((__unused__))) { test_prefetching(); return 0; diff --git a/storage/tokudb/PerconaFT/ft/tests/ft-clock-test.cc b/storage/tokudb/PerconaFT/ft/tests/ft-clock-test.cc index ceef3772e2a..26a3dae673c 100644 --- a/storage/tokudb/PerconaFT/ft/tests/ft-clock-test.cc +++ b/storage/tokudb/PerconaFT/ft/tests/ft-clock-test.cc @@ -40,38 +40,28 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. 
#include "ft/cursor.h" -enum ftnode_verify_type { - read_all=1, - read_compressed, - read_none -}; +enum ftnode_verify_type { read_all = 1, read_compressed, read_none }; #ifndef MIN #define MIN(x, y) (((x) < (y)) ? (x) : (y)) #endif -static int -string_key_cmp(DB *UU(e), const DBT *a, const DBT *b) -{ +static int string_key_cmp(DB *UU(e), const DBT *a, const DBT *b) { char *CAST_FROM_VOIDP(s, a->data); char *CAST_FROM_VOIDP(t, b->data); return strcmp(s, t); } -static void -le_add_to_bn(bn_data* bn, uint32_t idx, const char *key, int keylen, const char *val, int vallen) -{ +static void le_add_to_bn(bn_data *bn, + uint32_t idx, + const char *key, + int keylen, + const char *val, + int vallen) { LEAFENTRY r = NULL; uint32_t size_needed = LE_CLEAN_MEMSIZE(vallen); void *maybe_free = nullptr; - bn->get_space_for_insert( - idx, - key, - keylen, - size_needed, - &r, - &maybe_free - ); + bn->get_space_for_insert(idx, key, keylen, size_needed, &r, &maybe_free); if (maybe_free) { toku_free(maybe_free); } @@ -81,70 +71,67 @@ le_add_to_bn(bn_data* bn, uint32_t idx, const char *key, int keylen, const char memcpy(r->u.clean.val, val, vallen); } - -static void -le_malloc(bn_data* bn, uint32_t idx, const char *key, const char *val) -{ +static void le_malloc(bn_data *bn, + uint32_t idx, + const char *key, + const char *val) { int keylen = strlen(key) + 1; int vallen = strlen(val) + 1; le_add_to_bn(bn, idx, key, keylen, val, vallen); } - -static void -test1(int fd, FT ft_h, FTNODE *dn) { +static void test1(int fd, FT ft_h, FTNODE *dn) { int r; ftnode_fetch_extra bfe_all; bfe_all.create_for_full_read(ft_h); FTNODE_DISK_DATA ndd = NULL; - r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, &ndd, &bfe_all); + r = toku_deserialize_ftnode_from( + fd, make_blocknum(20), 0 /*pass zero for hash*/, dn, &ndd, &bfe_all); bool is_leaf = ((*dn)->height == 0); - assert(r==0); + invariant(r == 0); for (int i = 0; i < (*dn)->n_children; i++) { - assert(BP_STATE(*dn,i) == PT_AVAIL); + invariant(BP_STATE(*dn, i) == PT_AVAIL); } // should sweep and NOT get rid of anything PAIR_ATTR attr; - memset(&attr,0,sizeof(attr)); + memset(&attr, 0, sizeof(attr)); toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr); for (int i = 0; i < (*dn)->n_children; i++) { - assert(BP_STATE(*dn,i) == PT_AVAIL); + invariant(BP_STATE(*dn, i) == PT_AVAIL); } // should sweep and get compress all toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr); for (int i = 0; i < (*dn)->n_children; i++) { if (!is_leaf) { - assert(BP_STATE(*dn,i) == PT_COMPRESSED); - } - else { - assert(BP_STATE(*dn,i) == PT_ON_DISK); + invariant(BP_STATE(*dn, i) == PT_COMPRESSED); + } else { + invariant(BP_STATE(*dn, i) == PT_ON_DISK); } } PAIR_ATTR size; bool req = toku_ftnode_pf_req_callback(*dn, &bfe_all); - assert(req); + invariant(req); toku_ftnode_pf_callback(*dn, ndd, &bfe_all, fd, &size); toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr); for (int i = 0; i < (*dn)->n_children; i++) { - assert(BP_STATE(*dn,i) == PT_AVAIL); + invariant(BP_STATE(*dn, i) == PT_AVAIL); } // should sweep and get compress all toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr); for (int i = 0; i < (*dn)->n_children; i++) { if (!is_leaf) { - assert(BP_STATE(*dn,i) == PT_COMPRESSED); - } - else { - assert(BP_STATE(*dn,i) == PT_ON_DISK); + invariant(BP_STATE(*dn, i) == PT_COMPRESSED); + } else { + invariant(BP_STATE(*dn, i) == PT_ON_DISK); } - } + } req = toku_ftnode_pf_req_callback(*dn, 
&bfe_all); - assert(req); + invariant(req); toku_ftnode_pf_callback(*dn, ndd, &bfe_all, fd, &size); toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr); for (int i = 0; i < (*dn)->n_children; i++) { - assert(BP_STATE(*dn,i) == PT_AVAIL); + invariant(BP_STATE(*dn, i) == PT_AVAIL); } (*dn)->dirty = 1; toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr); @@ -152,101 +139,102 @@ test1(int fd, FT ft_h, FTNODE *dn) { toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr); toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr); for (int i = 0; i < (*dn)->n_children; i++) { - assert(BP_STATE(*dn,i) == PT_AVAIL); + invariant(BP_STATE(*dn, i) == PT_AVAIL); } toku_free(ndd); toku_ftnode_free(dn); } - -static int search_cmp(const struct ft_search& UU(so), const DBT* UU(key)) { +static int search_cmp(const struct ft_search &UU(so), const DBT *UU(key)) { return 0; } -static void -test2(int fd, FT ft_h, FTNODE *dn) { +static void test2(int fd, FT ft_h, FTNODE *dn) { DBT left, right; DB dummy_db; memset(&dummy_db, 0, sizeof(dummy_db)); memset(&left, 0, sizeof(left)); memset(&right, 0, sizeof(right)); ft_search search; - + ftnode_fetch_extra bfe_subset; bfe_subset.create_for_subset_read( ft_h, - ft_search_init(&search, search_cmp, FT_SEARCH_LEFT, nullptr, nullptr, nullptr), + ft_search_init( + &search, search_cmp, FT_SEARCH_LEFT, nullptr, nullptr, nullptr), &left, &right, true, true, false, - false - ); + false); FTNODE_DISK_DATA ndd = NULL; - int r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, &ndd, &bfe_subset); - assert(r==0); + int r = toku_deserialize_ftnode_from( + fd, make_blocknum(20), 0 /*pass zero for hash*/, dn, &ndd, &bfe_subset); + invariant(r == 0); bool is_leaf = ((*dn)->height == 0); - // at this point, although both partitions are available, only the + // at this point, although both partitions are available, only the // second basement node should have had its clock // touched - assert(BP_STATE(*dn, 0) == PT_AVAIL); - assert(BP_STATE(*dn, 1) == PT_AVAIL); - assert(BP_SHOULD_EVICT(*dn, 0)); - assert(!BP_SHOULD_EVICT(*dn, 1)); + invariant(BP_STATE(*dn, 0) == PT_AVAIL); + invariant(BP_STATE(*dn, 1) == PT_AVAIL); + invariant(BP_SHOULD_EVICT(*dn, 0)); + invariant(!BP_SHOULD_EVICT(*dn, 1)); PAIR_ATTR attr; - memset(&attr,0,sizeof(attr)); + memset(&attr, 0, sizeof(attr)); toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr); - assert(BP_STATE(*dn, 0) == (is_leaf) ? PT_ON_DISK : PT_COMPRESSED); - assert(BP_STATE(*dn, 1) == PT_AVAIL); - assert(BP_SHOULD_EVICT(*dn, 1)); + invariant(BP_STATE(*dn, 0) == (is_leaf) ? PT_ON_DISK : PT_COMPRESSED); + invariant(BP_STATE(*dn, 1) == PT_AVAIL); + invariant(BP_SHOULD_EVICT(*dn, 1)); toku_ftnode_pe_callback(*dn, attr, ft_h, def_pe_finalize_impl, nullptr); - assert(BP_STATE(*dn, 1) == (is_leaf) ? PT_ON_DISK : PT_COMPRESSED); + invariant(BP_STATE(*dn, 1) == (is_leaf) ? 
PT_ON_DISK : PT_COMPRESSED); bool req = toku_ftnode_pf_req_callback(*dn, &bfe_subset); - assert(req); + invariant(req); toku_ftnode_pf_callback(*dn, ndd, &bfe_subset, fd, &attr); - assert(BP_STATE(*dn, 0) == PT_AVAIL); - assert(BP_STATE(*dn, 1) == PT_AVAIL); - assert(BP_SHOULD_EVICT(*dn, 0)); - assert(!BP_SHOULD_EVICT(*dn, 1)); + invariant(BP_STATE(*dn, 0) == PT_AVAIL); + invariant(BP_STATE(*dn, 1) == PT_AVAIL); + invariant(BP_SHOULD_EVICT(*dn, 0)); + invariant(!BP_SHOULD_EVICT(*dn, 1)); toku_free(ndd); toku_ftnode_free(dn); } -static void -test3_leaf(int fd, FT ft_h, FTNODE *dn) { +static void test3_leaf(int fd, FT ft_h, FTNODE *dn) { DBT left, right; DB dummy_db; memset(&dummy_db, 0, sizeof(dummy_db)); memset(&left, 0, sizeof(left)); memset(&right, 0, sizeof(right)); - + ftnode_fetch_extra bfe_min; bfe_min.create_for_min_read(ft_h); FTNODE_DISK_DATA ndd = NULL; - int r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, &ndd, &bfe_min); - assert(r==0); + int r = toku_deserialize_ftnode_from( + fd, make_blocknum(20), 0 /*pass zero for hash*/, dn, &ndd, &bfe_min); + invariant(r == 0); // // make sure we have a leaf // - assert((*dn)->height == 0); + invariant((*dn)->height == 0); for (int i = 0; i < (*dn)->n_children; i++) { - assert(BP_STATE(*dn, i) == PT_ON_DISK); + invariant(BP_STATE(*dn, i) == PT_ON_DISK); } toku_ftnode_free(dn); toku_free(ndd); } -static void -test_serialize_nonleaf(void) { +static void test_serialize_nonleaf(void) { // struct ft_handle source_ft; struct ftnode sn, *dn; - int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0); + int fd = open(TOKU_TEST_FILENAME, + O_RDWR | O_CREAT | O_BINARY, + S_IRWXU | S_IRWXG | S_IRWXO); + invariant(fd >= 0); int r; @@ -265,11 +253,11 @@ test_serialize_nonleaf(void) { sn.pivotkeys.create_from_dbts(toku_fill_dbt(&pivotkey, "hello", 6), 1); BP_BLOCKNUM(&sn, 0).b = 30; BP_BLOCKNUM(&sn, 1).b = 35; - BP_STATE(&sn,0) = PT_AVAIL; - BP_STATE(&sn,1) = PT_AVAIL; + BP_STATE(&sn, 0) = PT_AVAIL; + BP_STATE(&sn, 1) = PT_AVAIL; set_BNC(&sn, 0, toku_create_empty_nl()); set_BNC(&sn, 1, toku_create_empty_nl()); - //Create XIDS + // Create XIDS XIDS xids_0 = toku_xids_get_root_xids(); XIDS xids_123; XIDS xids_234; @@ -281,11 +269,38 @@ test_serialize_nonleaf(void) { toku::comparator cmp; cmp.create(string_key_cmp, nullptr); - toku_bnc_insert_msg(BNC(&sn, 0), "a", 2, "aval", 5, FT_NONE, next_dummymsn(), xids_0, true, cmp); - toku_bnc_insert_msg(BNC(&sn, 0), "b", 2, "bval", 5, FT_NONE, next_dummymsn(), xids_123, false, cmp); - toku_bnc_insert_msg(BNC(&sn, 1), "x", 2, "xval", 5, FT_NONE, next_dummymsn(), xids_234, true, cmp); - - //Cleanup: + toku_bnc_insert_msg(BNC(&sn, 0), + "a", + 2, + "aval", + 5, + FT_NONE, + next_dummymsn(), + xids_0, + true, + cmp); + toku_bnc_insert_msg(BNC(&sn, 0), + "b", + 2, + "bval", + 5, + FT_NONE, + next_dummymsn(), + xids_123, + false, + cmp); + toku_bnc_insert_msg(BNC(&sn, 1), + "x", + 2, + "xval", + 5, + FT_NONE, + next_dummymsn(), + xids_234, + true, + cmp); + + // Cleanup: toku_xids_destroy(&xids_0); toku_xids_destroy(&xids_123); toku_xids_destroy(&xids_234); @@ -297,35 +312,41 @@ test_serialize_nonleaf(void) { make_blocknum(0), ZERO_LSN, TXNID_NONE, - 4*1024*1024, - 128*1024, + 4 * 1024 * 1024, + 128 * 1024, TOKU_DEFAULT_COMPRESSION_METHOD, 16); ft_h->cmp.create(string_key_cmp, nullptr); ft->ft = ft_h; - + ft_h->blocktable.create(); - { int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); } - //Want to use block #20 + { + int r_truncate = 
ftruncate(fd, 0); + CKERR(r_truncate); + } + // Want to use block #20 BLOCKNUM b = make_blocknum(0); while (b.b < 20) { ft_h->blocktable.allocate_blocknum(&b, ft_h); } - assert(b.b == 20); + invariant(b.b == 20); { DISKOFF offset; DISKOFF size; - ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0); - assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false); + invariant(offset == + (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size); - assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); - assert(size == 100); + invariant(offset == + (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + invariant(size == 100); } FTNODE_DISK_DATA ndd = NULL; - r = toku_serialize_ftnode_to(fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false); - assert(r==0); + r = toku_serialize_ftnode_to( + fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false); + invariant(r == 0); test1(fd, ft_h, &dn); test2(fd, ft_h, &dn); @@ -333,22 +354,26 @@ test_serialize_nonleaf(void) { toku_destroy_ftnode_internals(&sn); toku_free(ndd); - ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + ft_h->blocktable.block_free( + BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100); ft_h->blocktable.destroy(); toku_free(ft_h->h); ft_h->cmp.destroy(); toku_free(ft_h); toku_free(ft); - r = close(fd); assert(r != -1); + r = close(fd); + invariant(r != -1); } -static void -test_serialize_leaf(void) { +static void test_serialize_leaf(void) { // struct ft_handle source_ft; struct ftnode sn, *dn; - int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0); + int fd = open(TOKU_TEST_FILENAME, + O_RDWR | O_CREAT | O_BINARY, + S_IRWXU | S_IRWXG | S_IRWXO); + invariant(fd >= 0); int r; @@ -364,8 +389,8 @@ test_serialize_leaf(void) { MALLOC_N(sn.n_children, sn.bp); DBT pivotkey; sn.pivotkeys.create_from_dbts(toku_fill_dbt(&pivotkey, "b", 2), 1); - BP_STATE(&sn,0) = PT_AVAIL; - BP_STATE(&sn,1) = PT_AVAIL; + BP_STATE(&sn, 0) = PT_AVAIL; + BP_STATE(&sn, 1) = PT_AVAIL; set_BLB(&sn, 0, toku_create_empty_bn()); set_BLB(&sn, 1, toku_create_empty_bn()); le_malloc(BLB_DATA(&sn, 0), 0, "a", "aval"); @@ -378,51 +403,59 @@ test_serialize_leaf(void) { make_blocknum(0), ZERO_LSN, TXNID_NONE, - 4*1024*1024, - 128*1024, + 4 * 1024 * 1024, + 128 * 1024, TOKU_DEFAULT_COMPRESSION_METHOD, 16); ft->ft = ft_h; - + ft_h->blocktable.create(); - { int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); } - //Want to use block #20 + { + int r_truncate = ftruncate(fd, 0); + CKERR(r_truncate); + } + // Want to use block #20 BLOCKNUM b = make_blocknum(0); while (b.b < 20) { ft_h->blocktable.allocate_blocknum(&b, ft_h); } - assert(b.b == 20); + invariant(b.b == 20); { DISKOFF offset; DISKOFF size; - ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0); - assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false); + invariant(offset == + (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size); - assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); - assert(size == 100); + invariant(offset == + (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + 
invariant(size == 100); } FTNODE_DISK_DATA ndd = NULL; - r = toku_serialize_ftnode_to(fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false); - assert(r==0); + r = toku_serialize_ftnode_to( + fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false); + invariant(r == 0); test1(fd, ft_h, &dn); - test3_leaf(fd, ft_h,&dn); + test3_leaf(fd, ft_h, &dn); toku_destroy_ftnode_internals(&sn); - ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + ft_h->blocktable.block_free( + BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100); ft_h->blocktable.destroy(); toku_free(ft_h->h); toku_free(ft_h); toku_free(ft); toku_free(ndd); - r = close(fd); assert(r != -1); + r = close(fd); + invariant(r != -1); } -int -test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) { +int test_main(int argc __attribute__((__unused__)), + const char *argv[] __attribute__((__unused__))) { initialize_dummymsn(); test_serialize_nonleaf(); test_serialize_leaf(); diff --git a/storage/tokudb/PerconaFT/ft/tests/ft-serialize-benchmark.cc b/storage/tokudb/PerconaFT/ft/tests/ft-serialize-benchmark.cc index 9828f49513c..d50488ae197 100644 --- a/storage/tokudb/PerconaFT/ft/tests/ft-serialize-benchmark.cc +++ b/storage/tokudb/PerconaFT/ft/tests/ft-serialize-benchmark.cc @@ -41,27 +41,21 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. #include <sys/time.h> #include "test.h" - - #ifndef MIN #define MIN(x, y) (((x) < (y)) ? (x) : (y)) #endif const double USECS_PER_SEC = 1000000.0; -static void -le_add_to_bn(bn_data* bn, uint32_t idx, char *key, int keylen, char *val, int vallen) -{ +static void le_add_to_bn(bn_data *bn, + uint32_t idx, + char *key, + int keylen, + char *val, + int vallen) { LEAFENTRY r = NULL; uint32_t size_needed = LE_CLEAN_MEMSIZE(vallen); void *maybe_free = nullptr; - bn->get_space_for_insert( - idx, - key, - keylen, - size_needed, - &r, - &maybe_free - ); + bn->get_space_for_insert(idx, key, keylen, size_needed, &r, &maybe_free); if (maybe_free) { toku_free(maybe_free); } @@ -71,20 +65,24 @@ le_add_to_bn(bn_data* bn, uint32_t idx, char *key, int keylen, char *val, int va memcpy(r->u.clean.val, val, vallen); } -static int -long_key_cmp(DB *UU(e), const DBT *a, const DBT *b) -{ +static int long_key_cmp(DB *UU(e), const DBT *a, const DBT *b) { const long *CAST_FROM_VOIDP(x, a->data); const long *CAST_FROM_VOIDP(y, b->data); return (*x > *y) - (*x < *y); } -static void -test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int deser_runs) { +static void test_serialize_leaf(int valsize, + int nelts, + double entropy, + int ser_runs, + int deser_runs) { // struct ft_handle source_ft; struct ftnode *sn, *dn; - int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0); + int fd = open(TOKU_TEST_FILENAME, + O_RDWR | O_CREAT | O_BINARY, + S_IRWXU | S_IRWXG | S_IRWXO); + invariant(fd >= 0); int r; @@ -102,7 +100,7 @@ test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int de MALLOC_N(sn->n_children, sn->bp); sn->pivotkeys.create_empty(); for (int i = 0; i < sn->n_children; ++i) { - BP_STATE(sn,i) = PT_AVAIL; + BP_STATE(sn, i) = PT_AVAIL; set_BLB(sn, i, toku_create_empty_bn()); } int nperbn = nelts / sn->n_children; @@ -112,24 +110,19 @@ test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int de k = ck * nperbn + i; char buf[valsize]; int c; - for (c = 0; c < valsize * entropy; ) { - int *p = (int *) &buf[c]; + for 
(c = 0; c < valsize * entropy;) { + int *p = (int *)&buf[c]; *p = rand(); c += sizeof(*p); } memset(&buf[c], 0, valsize - c); le_add_to_bn( - BLB_DATA(sn,ck), - i, - (char *)&k, - sizeof k, - buf, - sizeof buf - ); + BLB_DATA(sn, ck), i, (char *)&k, sizeof k, buf, sizeof buf); } if (ck < 7) { DBT pivotkey; - sn->pivotkeys.insert_at(toku_fill_dbt(&pivotkey, &k, sizeof(k)), ck); + sn->pivotkeys.insert_at(toku_fill_dbt(&pivotkey, &k, sizeof(k)), + ck); } } @@ -139,31 +132,36 @@ test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int de make_blocknum(0), ZERO_LSN, TXNID_NONE, - 4*1024*1024, - 128*1024, + 4 * 1024 * 1024, + 128 * 1024, TOKU_DEFAULT_COMPRESSION_METHOD, 16); ft_h->cmp.create(long_key_cmp, nullptr); ft->ft = ft_h; - + ft_h->blocktable.create(); - { int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); } - //Want to use block #20 + { + int r_truncate = ftruncate(fd, 0); + CKERR(r_truncate); + } + // Want to use block #20 BLOCKNUM b = make_blocknum(0); while (b.b < 20) { ft_h->blocktable.allocate_blocknum(&b, ft_h); } - assert(b.b == 20); + invariant(b.b == 20); { DISKOFF offset; DISKOFF size; - ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0); - assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false); + invariant(offset == + (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size); - assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); - assert(size == 100); + invariant(offset == + (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + invariant(size == 100); } struct timeval total_start; @@ -176,8 +174,9 @@ test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int de gettimeofday(&t[0], NULL); ndd = NULL; sn->dirty = 1; - r = toku_serialize_ftnode_to(fd, make_blocknum(20), sn, &ndd, true, ft->ft, false); - assert(r==0); + r = toku_serialize_ftnode_to( + fd, make_blocknum(20), sn, &ndd, true, ft->ft, false); + invariant(r == 0); gettimeofday(&t[1], NULL); total_start.tv_sec += t[0].tv_sec; total_start.tv_usec += t[0].tv_usec; @@ -186,12 +185,14 @@ test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int de toku_free(ndd); } double dt; - dt = (total_end.tv_sec - total_start.tv_sec) + ((total_end.tv_usec - total_start.tv_usec) / USECS_PER_SEC); + dt = (total_end.tv_sec - total_start.tv_sec) + + ((total_end.tv_usec - total_start.tv_usec) / USECS_PER_SEC); dt *= 1000; dt /= ser_runs; - printf("serialize leaf(ms): %0.05lf (average of %d runs)\n", dt, ser_runs); + printf( + "serialize leaf(ms): %0.05lf (average of %d runs)\n", dt, ser_runs); - //reset + // reset total_start.tv_sec = total_start.tv_usec = 0; total_end.tv_sec = total_end.tv_usec = 0; @@ -200,8 +201,9 @@ test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int de bfe.create_for_full_read(ft_h); gettimeofday(&t[0], NULL); FTNODE_DISK_DATA ndd2 = NULL; - r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd2, &bfe); - assert(r==0); + r = toku_deserialize_ftnode_from( + fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd2, &bfe); + invariant(r == 0); gettimeofday(&t[1], NULL); total_start.tv_sec += t[0].tv_sec; @@ -212,35 +214,46 @@ test_serialize_leaf(int valsize, int nelts, double entropy, int ser_runs, int de toku_ftnode_free(&dn); toku_free(ndd2); } - dt = 
(total_end.tv_sec - total_start.tv_sec) + ((total_end.tv_usec - total_start.tv_usec) / USECS_PER_SEC); + dt = (total_end.tv_sec - total_start.tv_sec) + + ((total_end.tv_usec - total_start.tv_usec) / USECS_PER_SEC); dt *= 1000; dt /= deser_runs; - printf("deserialize leaf(ms): %0.05lf (average of %d runs)\n", dt, deser_runs); - printf("io time(ms) %lf decompress time(ms) %lf deserialize time(ms) %lf (average of %d runs)\n", - tokutime_to_seconds(bfe.io_time)*1000, - tokutime_to_seconds(bfe.decompress_time)*1000, - tokutime_to_seconds(bfe.deserialize_time)*1000, - deser_runs - ); + printf( + "deserialize leaf(ms): %0.05lf (average of %d runs)\n", dt, deser_runs); + printf( + "io time(ms) %lf decompress time(ms) %lf deserialize time(ms) %lf " + "(average of %d runs)\n", + tokutime_to_seconds(bfe.io_time) * 1000, + tokutime_to_seconds(bfe.decompress_time) * 1000, + tokutime_to_seconds(bfe.deserialize_time) * 1000, + deser_runs); toku_ftnode_free(&sn); - ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + ft_h->blocktable.block_free( + BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100); ft_h->blocktable.destroy(); ft_h->cmp.destroy(); toku_free(ft_h->h); toku_free(ft_h); toku_free(ft); - r = close(fd); assert(r != -1); + r = close(fd); + invariant(r != -1); } -static void -test_serialize_nonleaf(int valsize, int nelts, double entropy, int ser_runs, int deser_runs) { +static void test_serialize_nonleaf(int valsize, + int nelts, + double entropy, + int ser_runs, + int deser_runs) { // struct ft_handle source_ft; struct ftnode sn, *dn; - int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0); + int fd = open(TOKU_TEST_FILENAME, + O_RDWR | O_CREAT | O_BINARY, + S_IRWXU | S_IRWXG | S_IRWXO); + invariant(fd >= 0); int r; @@ -257,11 +270,11 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy, int ser_runs, int MALLOC_N(sn.n_children, sn.bp); sn.pivotkeys.create_empty(); for (int i = 0; i < sn.n_children; ++i) { - BP_BLOCKNUM(&sn, i).b = 30 + (i*5); - BP_STATE(&sn,i) = PT_AVAIL; + BP_BLOCKNUM(&sn, i).b = 30 + (i * 5); + BP_STATE(&sn, i) = PT_AVAIL; set_BNC(&sn, i, toku_create_empty_nl()); } - //Create XIDS + // Create XIDS XIDS xids_0 = toku_xids_get_root_xids(); XIDS xids_123; r = toku_xids_create_child(xids_0, &xids_123, (TXNID)123); @@ -276,14 +289,23 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy, int ser_runs, int k = ck * nperchild + i; char buf[valsize]; int c; - for (c = 0; c < valsize * entropy; ) { - int *p = (int *) &buf[c]; + for (c = 0; c < valsize * entropy;) { + int *p = (int *)&buf[c]; *p = rand(); c += sizeof(*p); } memset(&buf[c], 0, valsize - c); - toku_bnc_insert_msg(bnc, &k, sizeof k, buf, valsize, FT_NONE, next_dummymsn(), xids_123, true, cmp); + toku_bnc_insert_msg(bnc, + &k, + sizeof k, + buf, + valsize, + FT_NONE, + next_dummymsn(), + xids_123, + true, + cmp); } if (ck < 7) { DBT pivotkey; @@ -291,7 +313,7 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy, int ser_runs, int } } - //Cleanup: + // Cleanup: toku_xids_destroy(&xids_0); toku_xids_destroy(&xids_123); cmp.destroy(); @@ -302,65 +324,78 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy, int ser_runs, int make_blocknum(0), ZERO_LSN, TXNID_NONE, - 4*1024*1024, - 128*1024, + 4 * 1024 * 1024, + 128 * 1024, TOKU_DEFAULT_COMPRESSION_METHOD, 16); ft_h->cmp.create(long_key_cmp, nullptr); ft->ft = ft_h; - + ft_h->blocktable.create(); - { int r_truncate = ftruncate(fd, 0); 
CKERR(r_truncate); } - //Want to use block #20 + { + int r_truncate = ftruncate(fd, 0); + CKERR(r_truncate); + } + // Want to use block #20 BLOCKNUM b = make_blocknum(0); while (b.b < 20) { ft_h->blocktable.allocate_blocknum(&b, ft_h); } - assert(b.b == 20); + invariant(b.b == 20); { DISKOFF offset; DISKOFF size; - ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0); - assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false); + invariant(offset == + (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size); - assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); - assert(size == 100); + invariant(offset == + (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + invariant(size == 100); } struct timeval t[2]; gettimeofday(&t[0], NULL); FTNODE_DISK_DATA ndd = NULL; - r = toku_serialize_ftnode_to(fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false); - assert(r==0); + r = toku_serialize_ftnode_to( + fd, make_blocknum(20), &sn, &ndd, true, ft->ft, false); + invariant(r == 0); gettimeofday(&t[1], NULL); double dt; - dt = (t[1].tv_sec - t[0].tv_sec) + ((t[1].tv_usec - t[0].tv_usec) / USECS_PER_SEC); + dt = (t[1].tv_sec - t[0].tv_sec) + + ((t[1].tv_usec - t[0].tv_usec) / USECS_PER_SEC); dt *= 1000; - printf("serialize nonleaf(ms): %0.05lf (IGNORED RUNS=%d)\n", dt, ser_runs); + printf( + "serialize nonleaf(ms): %0.05lf (IGNORED RUNS=%d)\n", dt, ser_runs); ftnode_fetch_extra bfe; bfe.create_for_full_read(ft_h); gettimeofday(&t[0], NULL); FTNODE_DISK_DATA ndd2 = NULL; - r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, &dn, &ndd2, &bfe); - assert(r==0); + r = toku_deserialize_ftnode_from( + fd, make_blocknum(20), 0 /*pass zero for hash*/, &dn, &ndd2, &bfe); + invariant(r == 0); gettimeofday(&t[1], NULL); - dt = (t[1].tv_sec - t[0].tv_sec) + ((t[1].tv_usec - t[0].tv_usec) / USECS_PER_SEC); + dt = (t[1].tv_sec - t[0].tv_sec) + + ((t[1].tv_usec - t[0].tv_usec) / USECS_PER_SEC); dt *= 1000; - printf("deserialize nonleaf(ms): %0.05lf (IGNORED RUNS=%d)\n", dt, deser_runs); - printf("io time(ms) %lf decompress time(ms) %lf deserialize time(ms) %lf (IGNORED RUNS=%d)\n", - tokutime_to_seconds(bfe.io_time)*1000, - tokutime_to_seconds(bfe.decompress_time)*1000, - tokutime_to_seconds(bfe.deserialize_time)*1000, - deser_runs - ); + printf( + "deserialize nonleaf(ms): %0.05lf (IGNORED RUNS=%d)\n", dt, deser_runs); + printf( + "io time(ms) %lf decompress time(ms) %lf deserialize time(ms) %lf " + "(IGNORED RUNS=%d)\n", + tokutime_to_seconds(bfe.io_time) * 1000, + tokutime_to_seconds(bfe.decompress_time) * 1000, + tokutime_to_seconds(bfe.deserialize_time) * 1000, + deser_runs); toku_ftnode_free(&dn); toku_destroy_ftnode_internals(&sn); - ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + ft_h->blocktable.block_free( + BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100); ft_h->blocktable.destroy(); toku_free(ft_h->h); ft_h->cmp.destroy(); @@ -369,17 +404,21 @@ test_serialize_nonleaf(int valsize, int nelts, double entropy, int ser_runs, int toku_free(ndd); toku_free(ndd2); - r = close(fd); assert(r != -1); + r = close(fd); + invariant(r != -1); } -int -test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) { +int test_main(int argc __attribute__((__unused__)), + const char *argv[] 
__attribute__((__unused__))) { const int DEFAULT_RUNS = 5; long valsize, nelts, ser_runs = DEFAULT_RUNS, deser_runs = DEFAULT_RUNS; double entropy = 0.3; if (argc != 3 && argc != 5) { - fprintf(stderr, "Usage: %s <valsize> <nelts> [<serialize_runs> <deserialize_runs>]\n", argv[0]); + fprintf(stderr, + "Usage: %s <valsize> <nelts> [<serialize_runs> " + "<deserialize_runs>]\n", + argv[0]); fprintf(stderr, "Default (and min) runs is %d\n", DEFAULT_RUNS); return 2; } diff --git a/storage/tokudb/PerconaFT/ft/tests/ft-serialize-test.cc b/storage/tokudb/PerconaFT/ft/tests/ft-serialize-test.cc index 332aaa0c170..0cddaf19651 100644 --- a/storage/tokudb/PerconaFT/ft/tests/ft-serialize-test.cc +++ b/storage/tokudb/PerconaFT/ft/tests/ft-serialize-test.cc @@ -39,26 +39,20 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. #include "test.h" #include "bndata.h" - - #ifndef MIN #define MIN(x, y) (((x) < (y)) ? (x) : (y)) #endif -static size_t -le_add_to_bn(bn_data* bn, uint32_t idx, const char *key, int keysize, const char *val, int valsize) -{ +static size_t le_add_to_bn(bn_data *bn, + uint32_t idx, + const char *key, + int keysize, + const char *val, + int valsize) { LEAFENTRY r = NULL; uint32_t size_needed = LE_CLEAN_MEMSIZE(valsize); void *maybe_free = nullptr; - bn->get_space_for_insert( - idx, - key, - keysize, - size_needed, - &r, - &maybe_free - ); + bn->get_space_for_insert(idx, key, keysize, size_needed, &r, &maybe_free); if (maybe_free) { toku_free(maybe_free); } @@ -70,16 +64,19 @@ le_add_to_bn(bn_data* bn, uint32_t idx, const char *key, int keysize, const cha } class test_key_le_pair { - public: + public: uint32_t keylen; - char* keyp; + char *keyp; LEAFENTRY le; test_key_le_pair() : keylen(), keyp(), le() {} void init(const char *_keyp, const char *_val) { init(_keyp, strlen(_keyp) + 1, _val, strlen(_val) + 1); } - void init(const char * _keyp, uint32_t _keylen, const char*_val, uint32_t _vallen) { + void init(const char *_keyp, + uint32_t _keylen, + const char *_val, + uint32_t _vallen) { keylen = _keylen; CAST_FROM_VOIDP(le, toku_malloc(LE_CLEAN_MEMSIZE(_vallen))); @@ -95,126 +92,144 @@ class test_key_le_pair { } }; -enum ftnode_verify_type { - read_all=1, - read_compressed, - read_none -}; +enum ftnode_verify_type { read_all = 1, read_compressed, read_none }; -static int -string_key_cmp(DB *UU(e), const DBT *a, const DBT *b) -{ +static int string_key_cmp(DB *UU(e), const DBT *a, const DBT *b) { char *CAST_FROM_VOIDP(s, a->data); char *CAST_FROM_VOIDP(t, b->data); return strcmp(s, t); } -static void -setup_dn(enum ftnode_verify_type bft, int fd, FT ft_h, FTNODE *dn, FTNODE_DISK_DATA* ndd) { +static void setup_dn(enum ftnode_verify_type bft, + int fd, + FT ft_h, + FTNODE *dn, + FTNODE_DISK_DATA *ndd) { int r; if (bft == read_all) { ftnode_fetch_extra bfe; bfe.create_for_full_read(ft_h); - r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, ndd, &bfe); - assert(r==0); - } - else if (bft == read_compressed || bft == read_none) { + r = toku_deserialize_ftnode_from( + fd, make_blocknum(20), 0 /*pass zero for hash*/, dn, ndd, &bfe); + invariant(r == 0); + } else if (bft == read_compressed || bft == read_none) { ftnode_fetch_extra bfe; bfe.create_for_min_read(ft_h); - r = toku_deserialize_ftnode_from(fd, make_blocknum(20), 0/*pass zero for hash*/, dn, ndd, &bfe); - assert(r==0); - // assert all bp's are compressed or on disk. 
+ r = toku_deserialize_ftnode_from( + fd, make_blocknum(20), 0 /*pass zero for hash*/, dn, ndd, &bfe); + invariant(r == 0); + // invariant all bp's are compressed or on disk. for (int i = 0; i < (*dn)->n_children; i++) { - assert(BP_STATE(*dn,i) == PT_COMPRESSED || BP_STATE(*dn, i) == PT_ON_DISK); + invariant(BP_STATE(*dn, i) == PT_COMPRESSED || + BP_STATE(*dn, i) == PT_ON_DISK); } // if read_none, get rid of the compressed bp's if (bft == read_none) { if ((*dn)->height == 0) { - toku_ftnode_pe_callback(*dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr); - // assert all bp's are on disk + toku_ftnode_pe_callback(*dn, + make_pair_attr(0xffffffff), + ft_h, + def_pe_finalize_impl, + nullptr); + // invariant all bp's are on disk for (int i = 0; i < (*dn)->n_children; i++) { if ((*dn)->height == 0) { - assert(BP_STATE(*dn,i) == PT_ON_DISK); - assert(is_BNULL(*dn, i)); - } - else { - assert(BP_STATE(*dn,i) == PT_COMPRESSED); + invariant(BP_STATE(*dn, i) == PT_ON_DISK); + invariant(is_BNULL(*dn, i)); + } else { + invariant(BP_STATE(*dn, i) == PT_COMPRESSED); } } - } - else { + } else { // first decompress everything, and make sure // that it is available // then run partial eviction to get it compressed PAIR_ATTR attr; bfe.create_for_full_read(ft_h); - assert(toku_ftnode_pf_req_callback(*dn, &bfe)); + invariant(toku_ftnode_pf_req_callback(*dn, &bfe)); r = toku_ftnode_pf_callback(*dn, *ndd, &bfe, fd, &attr); - assert(r==0); - // assert all bp's are available + invariant(r == 0); + // invariant all bp's are available for (int i = 0; i < (*dn)->n_children; i++) { - assert(BP_STATE(*dn,i) == PT_AVAIL); + invariant(BP_STATE(*dn, i) == PT_AVAIL); } - toku_ftnode_pe_callback(*dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr); + toku_ftnode_pe_callback(*dn, + make_pair_attr(0xffffffff), + ft_h, + def_pe_finalize_impl, + nullptr); for (int i = 0; i < (*dn)->n_children; i++) { - // assert all bp's are still available, because we touched the clock - assert(BP_STATE(*dn,i) == PT_AVAIL); - // now assert all should be evicted - assert(BP_SHOULD_EVICT(*dn, i)); + // invariant all bp's are still available, because we touched + // the clock + invariant(BP_STATE(*dn, i) == PT_AVAIL); + // now invariant all should be evicted + invariant(BP_SHOULD_EVICT(*dn, i)); } - toku_ftnode_pe_callback(*dn, make_pair_attr(0xffffffff), ft_h, def_pe_finalize_impl, nullptr); + toku_ftnode_pe_callback(*dn, + make_pair_attr(0xffffffff), + ft_h, + def_pe_finalize_impl, + nullptr); for (int i = 0; i < (*dn)->n_children; i++) { - assert(BP_STATE(*dn,i) == PT_COMPRESSED); + invariant(BP_STATE(*dn, i) == PT_COMPRESSED); } } } // now decompress them bfe.create_for_full_read(ft_h); - assert(toku_ftnode_pf_req_callback(*dn, &bfe)); + invariant(toku_ftnode_pf_req_callback(*dn, &bfe)); PAIR_ATTR attr; r = toku_ftnode_pf_callback(*dn, *ndd, &bfe, fd, &attr); - assert(r==0); - // assert all bp's are available + invariant(r == 0); + // invariant all bp's are available for (int i = 0; i < (*dn)->n_children; i++) { - assert(BP_STATE(*dn,i) == PT_AVAIL); + invariant(BP_STATE(*dn, i) == PT_AVAIL); } // continue on with test - } - else { + } else { // if we get here, this is a test bug, NOT a bug in development code - assert(false); + invariant(false); } } -static void write_sn_to_disk(int fd, FT_HANDLE ft, FTNODE sn, FTNODE_DISK_DATA* src_ndd, bool do_clone) { +static void write_sn_to_disk(int fd, + FT_HANDLE ft, + FTNODE sn, + FTNODE_DISK_DATA *src_ndd, + bool do_clone) { int r; if (do_clone) { - void* 
cloned_node_v = NULL; + void *cloned_node_v = NULL; PAIR_ATTR attr; long clone_size; - toku_ftnode_clone_callback(sn, &cloned_node_v, &clone_size, &attr, false, ft->ft); + toku_ftnode_clone_callback( + sn, &cloned_node_v, &clone_size, &attr, false, ft->ft); FTNODE CAST_FROM_VOIDP(cloned_node, cloned_node_v); - r = toku_serialize_ftnode_to(fd, make_blocknum(20), cloned_node, src_ndd, false, ft->ft, false); - assert(r==0); + r = toku_serialize_ftnode_to( + fd, make_blocknum(20), cloned_node, src_ndd, false, ft->ft, false); + invariant(r == 0); toku_ftnode_free(&cloned_node); - } - else { - r = toku_serialize_ftnode_to(fd, make_blocknum(20), sn, src_ndd, true, ft->ft, false); - assert(r==0); + } else { + r = toku_serialize_ftnode_to( + fd, make_blocknum(20), sn, src_ndd, true, ft->ft, false); + invariant(r == 0); } } -static void -test_serialize_leaf_check_msn(enum ftnode_verify_type bft, bool do_clone) { +static void test_serialize_leaf_check_msn(enum ftnode_verify_type bft, + bool do_clone) { // struct ft_handle source_ft; struct ftnode sn, *dn; - int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0); + int fd = open(TOKU_TEST_FILENAME, + O_RDWR | O_CREAT | O_BINARY, + S_IRWXU | S_IRWXG | S_IRWXO); + invariant(fd >= 0); int r; -#define PRESERIALIZE_MSN_ON_DISK ((MSN) { MIN_MSN.msn + 42 }) -#define POSTSERIALIZE_MSN_ON_DISK ((MSN) { MIN_MSN.msn + 84 }) +#define PRESERIALIZE_MSN_ON_DISK ((MSN){MIN_MSN.msn + 42}) +#define POSTSERIALIZE_MSN_ON_DISK ((MSN){MIN_MSN.msn + 84}) sn.max_msn_applied_to_node_on_disk = PRESERIALIZE_MSN_ON_DISK; sn.flags = 0x11223344; @@ -228,14 +243,14 @@ test_serialize_leaf_check_msn(enum ftnode_verify_type bft, bool do_clone) { MALLOC_N(sn.n_children, sn.bp); DBT pivotkey; sn.pivotkeys.create_from_dbts(toku_fill_dbt(&pivotkey, "b", 2), 1); - BP_STATE(&sn,0) = PT_AVAIL; - BP_STATE(&sn,1) = PT_AVAIL; + BP_STATE(&sn, 0) = PT_AVAIL; + BP_STATE(&sn, 1) = PT_AVAIL; set_BLB(&sn, 0, toku_create_empty_bn()); set_BLB(&sn, 1, toku_create_empty_bn()); le_add_to_bn(BLB_DATA(&sn, 0), 0, "a", 2, "aval", 5); le_add_to_bn(BLB_DATA(&sn, 0), 1, "b", 2, "bval", 5); le_add_to_bn(BLB_DATA(&sn, 1), 0, "x", 2, "xval", 5); - BLB_MAX_MSN_APPLIED(&sn, 0) = ((MSN) { MIN_MSN.msn + 73 }); + BLB_MAX_MSN_APPLIED(&sn, 0) = ((MSN){MIN_MSN.msn + 73}); BLB_MAX_MSN_APPLIED(&sn, 1) = POSTSERIALIZE_MSN_ON_DISK; FT_HANDLE XMALLOC(ft); @@ -244,30 +259,35 @@ test_serialize_leaf_check_msn(enum ftnode_verify_type bft, bool do_clone) { make_blocknum(0), ZERO_LSN, TXNID_NONE, - 4*1024*1024, - 128*1024, + 4 * 1024 * 1024, + 128 * 1024, TOKU_DEFAULT_COMPRESSION_METHOD, 16); ft->ft = ft_h; ft_h->blocktable.create(); - { int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); } + { + int r_truncate = ftruncate(fd, 0); + CKERR(r_truncate); + } - //Want to use block #20 + // Want to use block #20 BLOCKNUM b = make_blocknum(0); while (b.b < 20) { ft_h->blocktable.allocate_blocknum(&b, ft_h); } - assert(b.b == 20); + invariant(b.b == 20); { DISKOFF offset; DISKOFF size; - ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0); - assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false); + invariant(offset == + (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size); - assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); - assert(size == 100); + invariant(offset == + 
(DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + invariant(size == 100); } FTNODE_DISK_DATA src_ndd = NULL; FTNODE_DISK_DATA dest_ndd = NULL; @@ -276,16 +296,18 @@ test_serialize_leaf_check_msn(enum ftnode_verify_type bft, bool do_clone) { setup_dn(bft, fd, ft_h, &dn, &dest_ndd); - assert(dn->blocknum.b==20); + invariant(dn->blocknum.b == 20); - assert(dn->layout_version ==FT_LAYOUT_VERSION); - assert(dn->layout_version_original ==FT_LAYOUT_VERSION); - assert(dn->layout_version_read_from_disk ==FT_LAYOUT_VERSION); - assert(dn->height == 0); - assert(dn->n_children>=1); - assert(dn->max_msn_applied_to_node_on_disk.msn == POSTSERIALIZE_MSN_ON_DISK.msn); + invariant(dn->layout_version == FT_LAYOUT_VERSION); + invariant(dn->layout_version_original == FT_LAYOUT_VERSION); + invariant(dn->layout_version_read_from_disk == FT_LAYOUT_VERSION); + invariant(dn->height == 0); + invariant(dn->n_children >= 1); + invariant(dn->max_msn_applied_to_node_on_disk.msn == + POSTSERIALIZE_MSN_ON_DISK.msn); { - // Man, this is way too ugly. This entire test suite needs to be refactored. + // Man, this is way too ugly. This entire test suite needs to be + // refactored. // Create a dummy mempool and put the leaves there. Ugh. test_key_le_pair elts[3]; elts[0].init("a", "aval"); @@ -294,34 +316,41 @@ test_serialize_leaf_check_msn(enum ftnode_verify_type bft, bool do_clone) { const uint32_t npartitions = dn->n_children; uint32_t last_i = 0; for (uint32_t bn = 0; bn < npartitions; ++bn) { - assert(BLB_MAX_MSN_APPLIED(dn, bn).msn == POSTSERIALIZE_MSN_ON_DISK.msn); - assert(dest_ndd[bn].start > 0); - assert(dest_ndd[bn].size > 0); + invariant(BLB_MAX_MSN_APPLIED(dn, bn).msn == + POSTSERIALIZE_MSN_ON_DISK.msn); + invariant(dest_ndd[bn].start > 0); + invariant(dest_ndd[bn].size > 0); if (bn > 0) { - assert(dest_ndd[bn].start >= dest_ndd[bn-1].start + dest_ndd[bn-1].size); + invariant(dest_ndd[bn].start >= + dest_ndd[bn - 1].start + dest_ndd[bn - 1].size); } for (uint32_t i = 0; i < BLB_DATA(dn, bn)->num_klpairs(); i++) { LEAFENTRY curr_le; uint32_t curr_keylen; - void* curr_key; - BLB_DATA(dn, bn)->fetch_klpair(i, &curr_le, &curr_keylen, &curr_key); - assert(leafentry_memsize(curr_le) == leafentry_memsize(elts[last_i].le)); - assert(memcmp(curr_le, elts[last_i].le, leafentry_memsize(curr_le)) == 0); - if (bn < npartitions-1) { - assert(strcmp((char*)dn->pivotkeys.get_pivot(bn).data, elts[last_i].keyp) <= 0); + void *curr_key; + BLB_DATA(dn, bn) + ->fetch_klpair(i, &curr_le, &curr_keylen, &curr_key); + invariant(leafentry_memsize(curr_le) == + leafentry_memsize(elts[last_i].le)); + invariant(memcmp(curr_le, + elts[last_i].le, + leafentry_memsize(curr_le)) == 0); + if (bn < npartitions - 1) { + invariant(strcmp((char *)dn->pivotkeys.get_pivot(bn).data, + elts[last_i].keyp) <= 0); } // TODO for later, get a key comparison here as well last_i++; } - } - assert(last_i == 3); + invariant(last_i == 3); } toku_ftnode_free(&dn); toku_destroy_ftnode_internals(&sn); - ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + ft_h->blocktable.block_free( + BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100); ft_h->blocktable.destroy(); toku_free(ft_h->h); toku_free(ft_h); @@ -329,17 +358,21 @@ test_serialize_leaf_check_msn(enum ftnode_verify_type bft, bool do_clone) { toku_free(src_ndd); toku_free(dest_ndd); - r = close(fd); assert(r != -1); + r = close(fd); + invariant(r != -1); } -static void -test_serialize_leaf_with_large_pivots(enum ftnode_verify_type bft, bool do_clone) { 
+static void test_serialize_leaf_with_large_pivots(enum ftnode_verify_type bft, + bool do_clone) { int r; struct ftnode sn, *dn; - const int keylens = 256*1024, vallens = 0; + const int keylens = 256 * 1024, vallens = 0; const uint32_t nrows = 8; - // assert(val_size > BN_MAX_SIZE); // BN_MAX_SIZE isn't visible - int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0); + // invariant(val_size > BN_MAX_SIZE); // BN_MAX_SIZE isn't visible + int fd = open(TOKU_TEST_FILENAME, + O_RDWR | O_CREAT | O_BINARY, + S_IRWXU | S_IRWXG | S_IRWXO); + invariant(fd >= 0); sn.max_msn_applied_to_node_on_disk.msn = 0; sn.flags = 0x11223344; @@ -354,21 +387,27 @@ test_serialize_leaf_with_large_pivots(enum ftnode_verify_type bft, bool do_clone MALLOC_N(sn.n_children, sn.bp); sn.pivotkeys.create_empty(); for (int i = 0; i < sn.n_children; ++i) { - BP_STATE(&sn,i) = PT_AVAIL; + BP_STATE(&sn, i) = PT_AVAIL; set_BLB(&sn, i, toku_create_empty_bn()); } for (uint32_t i = 0; i < nrows; ++i) { // one basement per row char key[keylens], val[vallens]; - key[keylens-1] = '\0'; + key[keylens - 1] = '\0'; char c = 'a' + i; - memset(key, c, keylens-1); - le_add_to_bn(BLB_DATA(&sn, i), 0, (char *) &key, sizeof(key), (char *) &val, sizeof(val)); - if (i < nrows-1) { + memset(key, c, keylens - 1); + le_add_to_bn(BLB_DATA(&sn, i), + 0, + (char *)&key, + sizeof(key), + (char *)&val, + sizeof(val)); + if (i < nrows - 1) { uint32_t keylen; - void* curr_key; + void *curr_key; BLB_DATA(&sn, i)->fetch_key_and_len(0, &keylen, &curr_key); DBT pivotkey; - sn.pivotkeys.insert_at(toku_fill_dbt(&pivotkey, curr_key, keylen), i); + sn.pivotkeys.insert_at(toku_fill_dbt(&pivotkey, curr_key, keylen), + i); } } @@ -378,29 +417,34 @@ test_serialize_leaf_with_large_pivots(enum ftnode_verify_type bft, bool do_clone make_blocknum(0), ZERO_LSN, TXNID_NONE, - 4*1024*1024, - 128*1024, + 4 * 1024 * 1024, + 128 * 1024, TOKU_DEFAULT_COMPRESSION_METHOD, 16); ft->ft = ft_h; ft_h->blocktable.create(); - { int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); } - //Want to use block #20 + { + int r_truncate = ftruncate(fd, 0); + CKERR(r_truncate); + } + // Want to use block #20 BLOCKNUM b = make_blocknum(0); while (b.b < 20) { ft_h->blocktable.allocate_blocknum(&b, ft_h); } - assert(b.b == 20); + invariant(b.b == 20); { DISKOFF offset; DISKOFF size; - ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0); - assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false); + invariant(offset == + (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size); - assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); - assert(size == 100); + invariant(offset == + (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + invariant(size == 100); } FTNODE_DISK_DATA src_ndd = NULL; FTNODE_DISK_DATA dest_ndd = NULL; @@ -408,55 +452,64 @@ test_serialize_leaf_with_large_pivots(enum ftnode_verify_type bft, bool do_clone write_sn_to_disk(fd, ft, &sn, &src_ndd, do_clone); setup_dn(bft, fd, ft_h, &dn, &dest_ndd); - - assert(dn->blocknum.b==20); - assert(dn->layout_version ==FT_LAYOUT_VERSION); - assert(dn->layout_version_original ==FT_LAYOUT_VERSION); + invariant(dn->blocknum.b == 20); + + invariant(dn->layout_version == FT_LAYOUT_VERSION); + invariant(dn->layout_version_original == FT_LAYOUT_VERSION); { - // Man, this is way 
too ugly. This entire test suite needs to be refactored. + // Man, this is way too ugly. This entire test suite needs to be + // refactored. // Create a dummy mempool and put the leaves there. Ugh. test_key_le_pair *les = new test_key_le_pair[nrows]; { char key[keylens], val[vallens]; - key[keylens-1] = '\0'; + key[keylens - 1] = '\0'; for (uint32_t i = 0; i < nrows; ++i) { char c = 'a' + i; - memset(key, c, keylens-1); - les[i].init((char *) &key, sizeof(key), (char *) &val, sizeof(val)); + memset(key, c, keylens - 1); + les[i].init( + (char *)&key, sizeof(key), (char *)&val, sizeof(val)); } } const uint32_t npartitions = dn->n_children; uint32_t last_i = 0; for (uint32_t bn = 0; bn < npartitions; ++bn) { - assert(dest_ndd[bn].start > 0); - assert(dest_ndd[bn].size > 0); + invariant(dest_ndd[bn].start > 0); + invariant(dest_ndd[bn].size > 0); if (bn > 0) { - assert(dest_ndd[bn].start >= dest_ndd[bn-1].start + dest_ndd[bn-1].size); + invariant(dest_ndd[bn].start >= + dest_ndd[bn - 1].start + dest_ndd[bn - 1].size); } - assert(BLB_DATA(dn, bn)->num_klpairs() > 0); + invariant(BLB_DATA(dn, bn)->num_klpairs() > 0); for (uint32_t i = 0; i < BLB_DATA(dn, bn)->num_klpairs(); i++) { LEAFENTRY curr_le; uint32_t curr_keylen; - void* curr_key; - BLB_DATA(dn, bn)->fetch_klpair(i, &curr_le, &curr_keylen, &curr_key); - assert(leafentry_memsize(curr_le) == leafentry_memsize(les[last_i].le)); - assert(memcmp(curr_le, les[last_i].le, leafentry_memsize(curr_le)) == 0); - if (bn < npartitions-1) { - assert(strcmp((char*)dn->pivotkeys.get_pivot(bn).data, les[last_i].keyp) <= 0); + void *curr_key; + BLB_DATA(dn, bn) + ->fetch_klpair(i, &curr_le, &curr_keylen, &curr_key); + invariant(leafentry_memsize(curr_le) == + leafentry_memsize(les[last_i].le)); + invariant(memcmp(curr_le, + les[last_i].le, + leafentry_memsize(curr_le)) == 0); + if (bn < npartitions - 1) { + invariant(strcmp((char *)dn->pivotkeys.get_pivot(bn).data, + les[last_i].keyp) <= 0); } // TODO for later, get a key comparison here as well last_i++; } } - assert(last_i == nrows); + invariant(last_i == nrows); delete[] les; } toku_ftnode_free(&dn); toku_destroy_ftnode_internals(&sn); - ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + ft_h->blocktable.block_free( + BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100); ft_h->blocktable.destroy(); toku_free(ft_h->h); toku_free(ft_h); @@ -464,15 +517,19 @@ test_serialize_leaf_with_large_pivots(enum ftnode_verify_type bft, bool do_clone toku_free(src_ndd); toku_free(dest_ndd); - r = close(fd); assert(r != -1); + r = close(fd); + invariant(r != -1); } -static void -test_serialize_leaf_with_many_rows(enum ftnode_verify_type bft, bool do_clone) { +static void test_serialize_leaf_with_many_rows(enum ftnode_verify_type bft, + bool do_clone) { int r; struct ftnode sn, *dn; - const uint32_t nrows = 196*1024; - int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0); + const uint32_t nrows = 196 * 1024; + int fd = open(TOKU_TEST_FILENAME, + O_RDWR | O_CREAT | O_BINARY, + S_IRWXU | S_IRWXG | S_IRWXO); + invariant(fd >= 0); sn.max_msn_applied_to_node_on_disk.msn = 0; sn.flags = 0x11223344; @@ -487,14 +544,19 @@ test_serialize_leaf_with_many_rows(enum ftnode_verify_type bft, bool do_clone) { XMALLOC_N(sn.n_children, sn.bp); sn.pivotkeys.create_empty(); for (int i = 0; i < sn.n_children; ++i) { - BP_STATE(&sn,i) = PT_AVAIL; - set_BLB(&sn, i, toku_create_empty_bn()); + BP_STATE(&sn, i) = PT_AVAIL; + set_BLB(&sn, i, 
toku_create_empty_bn()); } size_t total_size = 0; for (uint32_t i = 0; i < nrows; ++i) { uint32_t key = i; uint32_t val = i; - total_size += le_add_to_bn(BLB_DATA(&sn, 0), i, (char *) &key, sizeof(key), (char *) &val, sizeof(val)); + total_size += le_add_to_bn(BLB_DATA(&sn, 0), + i, + (char *)&key, + sizeof(key), + (char *)&val, + sizeof(val)); } FT_HANDLE XMALLOC(ft); @@ -503,30 +565,35 @@ test_serialize_leaf_with_many_rows(enum ftnode_verify_type bft, bool do_clone) { make_blocknum(0), ZERO_LSN, TXNID_NONE, - 4*1024*1024, - 128*1024, + 4 * 1024 * 1024, + 128 * 1024, TOKU_DEFAULT_COMPRESSION_METHOD, 16); ft->ft = ft_h; - + ft_h->blocktable.create(); - { int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); } - //Want to use block #20 + { + int r_truncate = ftruncate(fd, 0); + CKERR(r_truncate); + } + // Want to use block #20 BLOCKNUM b = make_blocknum(0); while (b.b < 20) { ft_h->blocktable.allocate_blocknum(&b, ft_h); } - assert(b.b == 20); + invariant(b.b == 20); { DISKOFF offset; DISKOFF size; - ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0); - assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false); + invariant(offset == + (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size); - assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); - assert(size == 100); + invariant(offset == + (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + invariant(size == 100); } FTNODE_DISK_DATA src_ndd = NULL; @@ -535,56 +602,66 @@ test_serialize_leaf_with_many_rows(enum ftnode_verify_type bft, bool do_clone) { setup_dn(bft, fd, ft_h, &dn, &dest_ndd); - assert(dn->blocknum.b==20); + invariant(dn->blocknum.b == 20); - assert(dn->layout_version ==FT_LAYOUT_VERSION); - assert(dn->layout_version_original ==FT_LAYOUT_VERSION); + invariant(dn->layout_version == FT_LAYOUT_VERSION); + invariant(dn->layout_version_original == FT_LAYOUT_VERSION); { - // Man, this is way too ugly. This entire test suite needs to be refactored. + // Man, this is way too ugly. This entire test suite needs to be + // refactored. // Create a dummy mempool and put the leaves there. Ugh. 
test_key_le_pair *les = new test_key_le_pair[nrows]; { int key = 0, val = 0; for (uint32_t i = 0; i < nrows; ++i, key++, val++) { - les[i].init((char *) &key, sizeof(key), (char *) &val, sizeof(val)); + les[i].init( + (char *)&key, sizeof(key), (char *)&val, sizeof(val)); } } const uint32_t npartitions = dn->n_children; uint32_t last_i = 0; for (uint32_t bn = 0; bn < npartitions; ++bn) { - assert(dest_ndd[bn].start > 0); - assert(dest_ndd[bn].size > 0); + invariant(dest_ndd[bn].start > 0); + invariant(dest_ndd[bn].size > 0); if (bn > 0) { - assert(dest_ndd[bn].start >= dest_ndd[bn-1].start + dest_ndd[bn-1].size); + invariant(dest_ndd[bn].start >= + dest_ndd[bn - 1].start + dest_ndd[bn - 1].size); } - assert(BLB_DATA(dn, bn)->num_klpairs() > 0); + invariant(BLB_DATA(dn, bn)->num_klpairs() > 0); for (uint32_t i = 0; i < BLB_DATA(dn, bn)->num_klpairs(); i++) { LEAFENTRY curr_le; uint32_t curr_keylen; - void* curr_key; - BLB_DATA(dn, bn)->fetch_klpair(i, &curr_le, &curr_keylen, &curr_key); - assert(leafentry_memsize(curr_le) == leafentry_memsize(les[last_i].le)); - assert(memcmp(curr_le, les[last_i].le, leafentry_memsize(curr_le)) == 0); - if (bn < npartitions-1) { - uint32_t *CAST_FROM_VOIDP(pivot, dn->pivotkeys.get_pivot(bn).data); - void* tmp = les[last_i].keyp; + void *curr_key; + BLB_DATA(dn, bn) + ->fetch_klpair(i, &curr_le, &curr_keylen, &curr_key); + invariant(leafentry_memsize(curr_le) == + leafentry_memsize(les[last_i].le)); + invariant(memcmp(curr_le, + les[last_i].le, + leafentry_memsize(curr_le)) == 0); + if (bn < npartitions - 1) { + uint32_t *CAST_FROM_VOIDP(pivot, + dn->pivotkeys.get_pivot(bn).data); + void *tmp = les[last_i].keyp; uint32_t *CAST_FROM_VOIDP(item, tmp); - assert(*pivot >= *item); + invariant(*pivot >= *item); } // TODO for later, get a key comparison here as well last_i++; } // don't check soft_copy_is_up_to_date or seqinsert - assert(BLB_DATA(dn, bn)->get_disk_size() < 128*1024); // BN_MAX_SIZE, apt to change + invariant(BLB_DATA(dn, bn)->get_disk_size() < + 128 * 1024); // BN_MAX_SIZE, apt to change } - assert(last_i == nrows); + invariant(last_i == nrows); delete[] les; } toku_ftnode_free(&dn); toku_destroy_ftnode_internals(&sn); - ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + ft_h->blocktable.block_free( + BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100); ft_h->blocktable.destroy(); toku_free(ft_h->h); toku_free(ft_h); @@ -592,19 +669,22 @@ test_serialize_leaf_with_many_rows(enum ftnode_verify_type bft, bool do_clone) { toku_free(src_ndd); toku_free(dest_ndd); - r = close(fd); assert(r != -1); + r = close(fd); + invariant(r != -1); } - -static void -test_serialize_leaf_with_large_rows(enum ftnode_verify_type bft, bool do_clone) { +static void test_serialize_leaf_with_large_rows(enum ftnode_verify_type bft, + bool do_clone) { int r; struct ftnode sn, *dn; const uint32_t nrows = 7; const size_t key_size = 8; - const size_t val_size = 512*1024; - // assert(val_size > BN_MAX_SIZE); // BN_MAX_SIZE isn't visible - int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0); + const size_t val_size = 512 * 1024; + // invariant(val_size > BN_MAX_SIZE); // BN_MAX_SIZE isn't visible + int fd = open(TOKU_TEST_FILENAME, + O_RDWR | O_CREAT | O_BINARY, + S_IRWXU | S_IRWXG | S_IRWXO); + invariant(fd >= 0); sn.max_msn_applied_to_node_on_disk.msn = 0; sn.flags = 0x11223344; @@ -615,21 +695,21 @@ test_serialize_leaf_with_large_rows(enum ftnode_verify_type bft, bool do_clone) 
sn.n_children = 1; sn.dirty = 1; sn.oldest_referenced_xid_known = TXNID_NONE; - + MALLOC_N(sn.n_children, sn.bp); sn.pivotkeys.create_empty(); for (int i = 0; i < sn.n_children; ++i) { - BP_STATE(&sn,i) = PT_AVAIL; + BP_STATE(&sn, i) = PT_AVAIL; set_BLB(&sn, i, toku_create_empty_bn()); } for (uint32_t i = 0; i < nrows; ++i) { char key[key_size], val[val_size]; - key[key_size-1] = '\0'; - val[val_size-1] = '\0'; + key[key_size - 1] = '\0'; + val[val_size - 1] = '\0'; char c = 'a' + i; - memset(key, c, key_size-1); - memset(val, c, val_size-1); - le_add_to_bn(BLB_DATA(&sn, 0), i,key, 8, val, val_size); + memset(key, c, key_size - 1); + memset(val, c, val_size - 1); + le_add_to_bn(BLB_DATA(&sn, 0), i, key, 8, val, val_size); } FT_HANDLE XMALLOC(ft); @@ -638,30 +718,35 @@ test_serialize_leaf_with_large_rows(enum ftnode_verify_type bft, bool do_clone) make_blocknum(0), ZERO_LSN, TXNID_NONE, - 4*1024*1024, - 128*1024, + 4 * 1024 * 1024, + 128 * 1024, TOKU_DEFAULT_COMPRESSION_METHOD, 16); ft->ft = ft_h; - + ft_h->blocktable.create(); - { int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); } - //Want to use block #20 + { + int r_truncate = ftruncate(fd, 0); + CKERR(r_truncate); + } + // Want to use block #20 BLOCKNUM b = make_blocknum(0); while (b.b < 20) { ft_h->blocktable.allocate_blocknum(&b, ft_h); } - assert(b.b == 20); + invariant(b.b == 20); { DISKOFF offset; DISKOFF size; - ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0); - assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false); + invariant(offset == + (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size); - assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); - assert(size == 100); + invariant(offset == + (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + invariant(size == 100); } FTNODE_DISK_DATA src_ndd = NULL; @@ -670,58 +755,66 @@ test_serialize_leaf_with_large_rows(enum ftnode_verify_type bft, bool do_clone) setup_dn(bft, fd, ft_h, &dn, &dest_ndd); - assert(dn->blocknum.b==20); + invariant(dn->blocknum.b == 20); - assert(dn->layout_version ==FT_LAYOUT_VERSION); - assert(dn->layout_version_original ==FT_LAYOUT_VERSION); + invariant(dn->layout_version == FT_LAYOUT_VERSION); + invariant(dn->layout_version_original == FT_LAYOUT_VERSION); { - // Man, this is way too ugly. This entire test suite needs to be refactored. + // Man, this is way too ugly. This entire test suite needs to be + // refactored. // Create a dummy mempool and put the leaves there. Ugh. 
test_key_le_pair *les = new test_key_le_pair[nrows]; { char key[key_size], val[val_size]; - key[key_size-1] = '\0'; - val[val_size-1] = '\0'; + key[key_size - 1] = '\0'; + val[val_size - 1] = '\0'; for (uint32_t i = 0; i < nrows; ++i) { char c = 'a' + i; - memset(key, c, key_size-1); - memset(val, c, val_size-1); + memset(key, c, key_size - 1); + memset(val, c, val_size - 1); les[i].init(key, key_size, val, val_size); } } const uint32_t npartitions = dn->n_children; - assert(npartitions == nrows); + invariant(npartitions == nrows); uint32_t last_i = 0; for (uint32_t bn = 0; bn < npartitions; ++bn) { - assert(dest_ndd[bn].start > 0); - assert(dest_ndd[bn].size > 0); + invariant(dest_ndd[bn].start > 0); + invariant(dest_ndd[bn].size > 0); if (bn > 0) { - assert(dest_ndd[bn].start >= dest_ndd[bn-1].start + dest_ndd[bn-1].size); + invariant(dest_ndd[bn].start >= + dest_ndd[bn - 1].start + dest_ndd[bn - 1].size); } - assert(BLB_DATA(dn, bn)->num_klpairs() > 0); + invariant(BLB_DATA(dn, bn)->num_klpairs() > 0); for (uint32_t i = 0; i < BLB_DATA(dn, bn)->num_klpairs(); i++) { LEAFENTRY curr_le; uint32_t curr_keylen; - void* curr_key; - BLB_DATA(dn, bn)->fetch_klpair(i, &curr_le, &curr_keylen, &curr_key); - assert(leafentry_memsize(curr_le) == leafentry_memsize(les[last_i].le)); - assert(memcmp(curr_le, les[last_i].le, leafentry_memsize(curr_le)) == 0); - if (bn < npartitions-1) { - assert(strcmp((char*)dn->pivotkeys.get_pivot(bn).data, (char*)(les[last_i].keyp)) <= 0); + void *curr_key; + BLB_DATA(dn, bn) + ->fetch_klpair(i, &curr_le, &curr_keylen, &curr_key); + invariant(leafentry_memsize(curr_le) == + leafentry_memsize(les[last_i].le)); + invariant(memcmp(curr_le, + les[last_i].le, + leafentry_memsize(curr_le)) == 0); + if (bn < npartitions - 1) { + invariant(strcmp((char *)dn->pivotkeys.get_pivot(bn).data, + (char *)(les[last_i].keyp)) <= 0); } // TODO for later, get a key comparison here as well last_i++; } // don't check soft_copy_is_up_to_date or seqinsert } - assert(last_i == 7); + invariant(last_i == 7); delete[] les; } toku_ftnode_free(&dn); toku_destroy_ftnode_internals(&sn); - ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + ft_h->blocktable.block_free( + BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100); ft_h->blocktable.destroy(); toku_free(ft_h->h); toku_free(ft_h); @@ -729,15 +822,19 @@ test_serialize_leaf_with_large_rows(enum ftnode_verify_type bft, bool do_clone) toku_free(src_ndd); toku_free(dest_ndd); - r = close(fd); assert(r != -1); + r = close(fd); + invariant(r != -1); } - -static void -test_serialize_leaf_with_empty_basement_nodes(enum ftnode_verify_type bft, bool do_clone) { +static void test_serialize_leaf_with_empty_basement_nodes( + enum ftnode_verify_type bft, + bool do_clone) { struct ftnode sn, *dn; - int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0); + int fd = open(TOKU_TEST_FILENAME, + O_RDWR | O_CREAT | O_BINARY, + S_IRWXU | S_IRWXG | S_IRWXO); + invariant(fd >= 0); int r; @@ -760,7 +857,7 @@ test_serialize_leaf_with_empty_basement_nodes(enum ftnode_verify_type bft, bool toku_fill_dbt(&pivotkeys[5], "x", 2); sn.pivotkeys.create_from_dbts(pivotkeys, 6); for (int i = 0; i < sn.n_children; ++i) { - BP_STATE(&sn,i) = PT_AVAIL; + BP_STATE(&sn, i) = PT_AVAIL; set_BLB(&sn, i, toku_create_empty_bn()); BLB_SEQINSERT(&sn, i) = 0; } @@ -774,30 +871,35 @@ test_serialize_leaf_with_empty_basement_nodes(enum ftnode_verify_type bft, bool make_blocknum(0), ZERO_LSN, TXNID_NONE, - 
4*1024*1024, - 128*1024, + 4 * 1024 * 1024, + 128 * 1024, TOKU_DEFAULT_COMPRESSION_METHOD, 16); ft->ft = ft_h; - + ft_h->blocktable.create(); - { int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); } - //Want to use block #20 + { + int r_truncate = ftruncate(fd, 0); + CKERR(r_truncate); + } + // Want to use block #20 BLOCKNUM b = make_blocknum(0); while (b.b < 20) { ft_h->blocktable.allocate_blocknum(&b, ft_h); } - assert(b.b == 20); + invariant(b.b == 20); { DISKOFF offset; DISKOFF size; - ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0); - assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false); + invariant(offset == + (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size); - assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); - assert(size == 100); + invariant(offset == + (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + invariant(size == 100); } FTNODE_DISK_DATA src_ndd = NULL; FTNODE_DISK_DATA dest_ndd = NULL; @@ -805,17 +907,18 @@ test_serialize_leaf_with_empty_basement_nodes(enum ftnode_verify_type bft, bool setup_dn(bft, fd, ft_h, &dn, &dest_ndd); - assert(dn->blocknum.b==20); + invariant(dn->blocknum.b == 20); - assert(dn->layout_version ==FT_LAYOUT_VERSION); - assert(dn->layout_version_original ==FT_LAYOUT_VERSION); - assert(dn->layout_version_read_from_disk ==FT_LAYOUT_VERSION); - assert(dn->height == 0); - assert(dn->n_children>0); + invariant(dn->layout_version == FT_LAYOUT_VERSION); + invariant(dn->layout_version_original == FT_LAYOUT_VERSION); + invariant(dn->layout_version_read_from_disk == FT_LAYOUT_VERSION); + invariant(dn->height == 0); + invariant(dn->n_children > 0); { test_key_le_pair elts[3]; - // Man, this is way too ugly. This entire test suite needs to be refactored. + // Man, this is way too ugly. This entire test suite needs to be + // refactored. // Create a dummy mempool and put the leaves there. Ugh. 
elts[0].init("a", "aval"); elts[1].init("b", "bval"); @@ -823,33 +926,39 @@ test_serialize_leaf_with_empty_basement_nodes(enum ftnode_verify_type bft, bool const uint32_t npartitions = dn->n_children; uint32_t last_i = 0; for (uint32_t bn = 0; bn < npartitions; ++bn) { - assert(dest_ndd[bn].start > 0); - assert(dest_ndd[bn].size > 0); + invariant(dest_ndd[bn].start > 0); + invariant(dest_ndd[bn].size > 0); if (bn > 0) { - assert(dest_ndd[bn].start >= dest_ndd[bn-1].start + dest_ndd[bn-1].size); + invariant(dest_ndd[bn].start >= + dest_ndd[bn - 1].start + dest_ndd[bn - 1].size); } for (uint32_t i = 0; i < BLB_DATA(dn, bn)->num_klpairs(); i++) { LEAFENTRY curr_le; uint32_t curr_keylen; - void* curr_key; - BLB_DATA(dn, bn)->fetch_klpair(i, &curr_le, &curr_keylen, &curr_key); - assert(leafentry_memsize(curr_le) == leafentry_memsize(elts[last_i].le)); - assert(memcmp(curr_le, elts[last_i].le, leafentry_memsize(curr_le)) == 0); - if (bn < npartitions-1) { - assert(strcmp((char*)dn->pivotkeys.get_pivot(bn).data, (char*)(elts[last_i].keyp)) <= 0); + void *curr_key; + BLB_DATA(dn, bn) + ->fetch_klpair(i, &curr_le, &curr_keylen, &curr_key); + invariant(leafentry_memsize(curr_le) == + leafentry_memsize(elts[last_i].le)); + invariant(memcmp(curr_le, + elts[last_i].le, + leafentry_memsize(curr_le)) == 0); + if (bn < npartitions - 1) { + invariant(strcmp((char *)dn->pivotkeys.get_pivot(bn).data, + (char *)(elts[last_i].keyp)) <= 0); } // TODO for later, get a key comparison here as well last_i++; } - } - assert(last_i == 3); + invariant(last_i == 3); } toku_ftnode_free(&dn); toku_destroy_ftnode_internals(&sn); - ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + ft_h->blocktable.block_free( + BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100); ft_h->blocktable.destroy(); toku_free(ft_h->h); toku_free(ft_h); @@ -857,14 +966,19 @@ test_serialize_leaf_with_empty_basement_nodes(enum ftnode_verify_type bft, bool toku_free(src_ndd); toku_free(dest_ndd); - r = close(fd); assert(r != -1); + r = close(fd); + invariant(r != -1); } -static void -test_serialize_leaf_with_multiple_empty_basement_nodes(enum ftnode_verify_type bft, bool do_clone) { +static void test_serialize_leaf_with_multiple_empty_basement_nodes( + enum ftnode_verify_type bft, + bool do_clone) { struct ftnode sn, *dn; - int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0); + int fd = open(TOKU_TEST_FILENAME, + O_RDWR | O_CREAT | O_BINARY, + S_IRWXU | S_IRWXG | S_IRWXO); + invariant(fd >= 0); int r; @@ -884,7 +998,7 @@ test_serialize_leaf_with_multiple_empty_basement_nodes(enum ftnode_verify_type b toku_fill_dbt(&pivotkeys[2], "A", 2); sn.pivotkeys.create_from_dbts(pivotkeys, 3); for (int i = 0; i < sn.n_children; ++i) { - BP_STATE(&sn,i) = PT_AVAIL; + BP_STATE(&sn, i) = PT_AVAIL; set_BLB(&sn, i, toku_create_empty_bn()); } @@ -894,30 +1008,35 @@ test_serialize_leaf_with_multiple_empty_basement_nodes(enum ftnode_verify_type b make_blocknum(0), ZERO_LSN, TXNID_NONE, - 4*1024*1024, - 128*1024, + 4 * 1024 * 1024, + 128 * 1024, TOKU_DEFAULT_COMPRESSION_METHOD, 16); ft->ft = ft_h; - + ft_h->blocktable.create(); - { int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); } - //Want to use block #20 + { + int r_truncate = ftruncate(fd, 0); + CKERR(r_truncate); + } + // Want to use block #20 BLOCKNUM b = make_blocknum(0); while (b.b < 20) { ft_h->blocktable.allocate_blocknum(&b, ft_h); } - assert(b.b == 20); + invariant(b.b == 20); { DISKOFF offset; DISKOFF size; - 
ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0); - assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false); + invariant(offset == + (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size); - assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); - assert(size == 100); + invariant(offset == + (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + invariant(size == 100); } FTNODE_DISK_DATA src_ndd = NULL; @@ -926,29 +1045,31 @@ test_serialize_leaf_with_multiple_empty_basement_nodes(enum ftnode_verify_type b setup_dn(bft, fd, ft_h, &dn, &dest_ndd); - assert(dn->blocknum.b==20); + invariant(dn->blocknum.b == 20); - assert(dn->layout_version ==FT_LAYOUT_VERSION); - assert(dn->layout_version_original ==FT_LAYOUT_VERSION); - assert(dn->layout_version_read_from_disk ==FT_LAYOUT_VERSION); - assert(dn->height == 0); - assert(dn->n_children == 1); + invariant(dn->layout_version == FT_LAYOUT_VERSION); + invariant(dn->layout_version_original == FT_LAYOUT_VERSION); + invariant(dn->layout_version_read_from_disk == FT_LAYOUT_VERSION); + invariant(dn->height == 0); + invariant(dn->n_children == 1); { const uint32_t npartitions = dn->n_children; for (uint32_t i = 0; i < npartitions; ++i) { - assert(dest_ndd[i].start > 0); - assert(dest_ndd[i].size > 0); + invariant(dest_ndd[i].start > 0); + invariant(dest_ndd[i].size > 0); if (i > 0) { - assert(dest_ndd[i].start >= dest_ndd[i-1].start + dest_ndd[i-1].size); + invariant(dest_ndd[i].start >= + dest_ndd[i - 1].start + dest_ndd[i - 1].size); } - assert(BLB_DATA(dn, i)->num_klpairs() == 0); + invariant(BLB_DATA(dn, i)->num_klpairs() == 0); } } - + toku_ftnode_free(&dn); toku_destroy_ftnode_internals(&sn); - ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + ft_h->blocktable.block_free( + BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100); ft_h->blocktable.destroy(); toku_free(ft_h->h); toku_free(ft_h); @@ -956,16 +1077,18 @@ test_serialize_leaf_with_multiple_empty_basement_nodes(enum ftnode_verify_type b toku_free(src_ndd); toku_free(dest_ndd); - r = close(fd); assert(r != -1); + r = close(fd); + invariant(r != -1); } - -static void -test_serialize_nonleaf(enum ftnode_verify_type bft, bool do_clone) { +static void test_serialize_nonleaf(enum ftnode_verify_type bft, bool do_clone) { // struct ft_handle source_ft; struct ftnode sn, *dn; - int fd = open(TOKU_TEST_FILENAME, O_RDWR|O_CREAT|O_BINARY, S_IRWXU|S_IRWXG|S_IRWXO); assert(fd >= 0); + int fd = open(TOKU_TEST_FILENAME, + O_RDWR | O_CREAT | O_BINARY, + S_IRWXU | S_IRWXG | S_IRWXO); + invariant(fd >= 0); int r; @@ -984,11 +1107,11 @@ test_serialize_nonleaf(enum ftnode_verify_type bft, bool do_clone) { sn.pivotkeys.create_from_dbts(toku_fill_dbt(&pivotkey, "hello", 6), 1); BP_BLOCKNUM(&sn, 0).b = 30; BP_BLOCKNUM(&sn, 1).b = 35; - BP_STATE(&sn,0) = PT_AVAIL; - BP_STATE(&sn,1) = PT_AVAIL; + BP_STATE(&sn, 0) = PT_AVAIL; + BP_STATE(&sn, 1) = PT_AVAIL; set_BNC(&sn, 0, toku_create_empty_nl()); set_BNC(&sn, 1, toku_create_empty_nl()); - //Create XIDS + // Create XIDS XIDS xids_0 = toku_xids_get_root_xids(); XIDS xids_123; XIDS xids_234; @@ -1000,11 +1123,38 @@ test_serialize_nonleaf(enum ftnode_verify_type bft, bool do_clone) { toku::comparator cmp; cmp.create(string_key_cmp, nullptr); - toku_bnc_insert_msg(BNC(&sn, 0), "a", 2, "aval", 5, FT_NONE, 
next_dummymsn(), xids_0, true, cmp); - toku_bnc_insert_msg(BNC(&sn, 0), "b", 2, "bval", 5, FT_NONE, next_dummymsn(), xids_123, false, cmp); - toku_bnc_insert_msg(BNC(&sn, 1), "x", 2, "xval", 5, FT_NONE, next_dummymsn(), xids_234, true, cmp); - - //Cleanup: + toku_bnc_insert_msg(BNC(&sn, 0), + "a", + 2, + "aval", + 5, + FT_NONE, + next_dummymsn(), + xids_0, + true, + cmp); + toku_bnc_insert_msg(BNC(&sn, 0), + "b", + 2, + "bval", + 5, + FT_NONE, + next_dummymsn(), + xids_123, + false, + cmp); + toku_bnc_insert_msg(BNC(&sn, 1), + "x", + 2, + "xval", + 5, + FT_NONE, + next_dummymsn(), + xids_234, + true, + cmp); + + // Cleanup: toku_xids_destroy(&xids_0); toku_xids_destroy(&xids_123); toku_xids_destroy(&xids_234); @@ -1016,31 +1166,36 @@ test_serialize_nonleaf(enum ftnode_verify_type bft, bool do_clone) { make_blocknum(0), ZERO_LSN, TXNID_NONE, - 4*1024*1024, - 128*1024, + 4 * 1024 * 1024, + 128 * 1024, TOKU_DEFAULT_COMPRESSION_METHOD, 16); ft_h->cmp.create(string_key_cmp, nullptr); ft->ft = ft_h; - + ft_h->blocktable.create(); - { int r_truncate = ftruncate(fd, 0); CKERR(r_truncate); } - //Want to use block #20 + { + int r_truncate = ftruncate(fd, 0); + CKERR(r_truncate); + } + // Want to use block #20 BLOCKNUM b = make_blocknum(0); while (b.b < 20) { ft_h->blocktable.allocate_blocknum(&b, ft_h); } - assert(b.b == 20); + invariant(b.b == 20); { DISKOFF offset; DISKOFF size; - ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false, 0); - assert(offset==(DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + ft_h->blocktable.realloc_on_disk(b, 100, &offset, ft_h, fd, false); + invariant(offset == + (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); ft_h->blocktable.translate_blocknum_to_offset_size(b, &offset, &size); - assert(offset == (DISKOFF)block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); - assert(size == 100); + invariant(offset == + (DISKOFF)BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + invariant(size == 100); } FTNODE_DISK_DATA src_ndd = NULL; FTNODE_DISK_DATA dest_ndd = NULL; @@ -1048,30 +1203,31 @@ test_serialize_nonleaf(enum ftnode_verify_type bft, bool do_clone) { setup_dn(bft, fd, ft_h, &dn, &dest_ndd); - assert(dn->blocknum.b==20); + invariant(dn->blocknum.b == 20); - assert(dn->layout_version ==FT_LAYOUT_VERSION); - assert(dn->layout_version_original ==FT_LAYOUT_VERSION); - assert(dn->layout_version_read_from_disk ==FT_LAYOUT_VERSION); - assert(dn->height == 1); - assert(dn->n_children==2); - assert(strcmp((char*)dn->pivotkeys.get_pivot(0).data, "hello")==0); - assert(dn->pivotkeys.get_pivot(0).size==6); - assert(BP_BLOCKNUM(dn,0).b==30); - assert(BP_BLOCKNUM(dn,1).b==35); + invariant(dn->layout_version == FT_LAYOUT_VERSION); + invariant(dn->layout_version_original == FT_LAYOUT_VERSION); + invariant(dn->layout_version_read_from_disk == FT_LAYOUT_VERSION); + invariant(dn->height == 1); + invariant(dn->n_children == 2); + invariant(strcmp((char *)dn->pivotkeys.get_pivot(0).data, "hello") == 0); + invariant(dn->pivotkeys.get_pivot(0).size == 6); + invariant(BP_BLOCKNUM(dn, 0).b == 30); + invariant(BP_BLOCKNUM(dn, 1).b == 35); message_buffer *src_msg_buffer1 = &BNC(&sn, 0)->msg_buffer; message_buffer *src_msg_buffer2 = &BNC(&sn, 1)->msg_buffer; message_buffer *dest_msg_buffer1 = &BNC(dn, 0)->msg_buffer; message_buffer *dest_msg_buffer2 = &BNC(dn, 1)->msg_buffer; - assert(src_msg_buffer1->equals(dest_msg_buffer1)); - assert(src_msg_buffer2->equals(dest_msg_buffer2)); + invariant(src_msg_buffer1->equals(dest_msg_buffer1)); + 
invariant(src_msg_buffer2->equals(dest_msg_buffer2)); toku_ftnode_free(&dn); toku_destroy_ftnode_internals(&sn); - ft_h->blocktable.block_free(block_allocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE); + ft_h->blocktable.block_free( + BlockAllocator::BLOCK_ALLOCATOR_TOTAL_HEADER_RESERVE, 100); ft_h->blocktable.destroy(); ft_h->cmp.destroy(); toku_free(ft_h->h); @@ -1080,11 +1236,12 @@ test_serialize_nonleaf(enum ftnode_verify_type bft, bool do_clone) { toku_free(src_ndd); toku_free(dest_ndd); - r = close(fd); assert(r != -1); + r = close(fd); + invariant(r != -1); } -int -test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) { +int test_main(int argc __attribute__((__unused__)), + const char *argv[] __attribute__((__unused__))) { initialize_dummymsn(); test_serialize_nonleaf(read_none, false); @@ -1103,10 +1260,12 @@ test_main (int argc __attribute__((__unused__)), const char *argv[] __attribute_ test_serialize_leaf_with_multiple_empty_basement_nodes(read_none, false); test_serialize_leaf_with_multiple_empty_basement_nodes(read_all, false); - test_serialize_leaf_with_multiple_empty_basement_nodes(read_compressed, false); + test_serialize_leaf_with_multiple_empty_basement_nodes(read_compressed, + false); test_serialize_leaf_with_multiple_empty_basement_nodes(read_none, true); test_serialize_leaf_with_multiple_empty_basement_nodes(read_all, true); - test_serialize_leaf_with_multiple_empty_basement_nodes(read_compressed, true); + test_serialize_leaf_with_multiple_empty_basement_nodes(read_compressed, + true); test_serialize_leaf_with_empty_basement_nodes(read_none, false); test_serialize_leaf_with_empty_basement_nodes(read_all, false); diff --git a/storage/tokudb/PerconaFT/ft/tests/ft-test.cc b/storage/tokudb/PerconaFT/ft/tests/ft-test.cc index 598a1cc7085..706bd94fbc3 100644 --- a/storage/tokudb/PerconaFT/ft/tests/ft-test.cc +++ b/storage/tokudb/PerconaFT/ft/tests/ft-test.cc @@ -164,17 +164,16 @@ static void test_read_what_was_written (void) { int r; const int NVALS=10000; - if (verbose) printf("test_read_what_was_written(): "); fflush(stdout); + if (verbose) { + printf("test_read_what_was_written(): "); fflush(stdout); + } unlink(fname); - toku_cachetable_create(&ct, 0, ZERO_LSN, nullptr); r = toku_open_ft_handle(fname, 1, &ft, 1<<12, 1<<9, TOKU_DEFAULT_COMPRESSION_METHOD, ct, null_txn, toku_builtin_compare_fun); assert(r==0); r = toku_close_ft_handle_nolsn(ft, 0); assert(r==0); - toku_cachetable_close(&ct); - - + toku_cachetable_close(&ct); /* Now see if we can read an empty tree in. */ toku_cachetable_create(&ct, 0, ZERO_LSN, nullptr); @@ -189,8 +188,6 @@ static void test_read_what_was_written (void) { r = toku_close_ft_handle_nolsn(ft, 0); assert(r==0); toku_cachetable_close(&ct); - - /* Now see if we can read it in and get the value. 
*/ toku_cachetable_create(&ct, 0, ZERO_LSN, nullptr); r = toku_open_ft_handle(fname, 0, &ft, 1<<12, 1<<9, TOKU_DEFAULT_COMPRESSION_METHOD, ct, null_txn, toku_builtin_compare_fun); assert(r==0); diff --git a/storage/tokudb/PerconaFT/ft/tests/pqueue-test.cc b/storage/tokudb/PerconaFT/ft/tests/pqueue-test.cc index 53973794eae..aeb5a897c48 100644 --- a/storage/tokudb/PerconaFT/ft/tests/pqueue-test.cc +++ b/storage/tokudb/PerconaFT/ft/tests/pqueue-test.cc @@ -109,7 +109,9 @@ static int run_test(void) r = pqueue_pop(pq, &node); assert(r==0); if (verbose) printf("%d : %d\n", i, *(int*)(node->key->data)); if ( *(int*)(node->key->data) != i ) { - if (verbose) printf("FAIL\n"); return -1; + if (verbose) + printf("FAIL\n"); + return -1; } } pqueue_free(pq); diff --git a/storage/tokudb/PerconaFT/ft/tests/test-leafentry-nested.cc b/storage/tokudb/PerconaFT/ft/tests/test-leafentry-nested.cc index a78f787cdf2..f2004964862 100644 --- a/storage/tokudb/PerconaFT/ft/tests/test-leafentry-nested.cc +++ b/storage/tokudb/PerconaFT/ft/tests/test-leafentry-nested.cc @@ -793,7 +793,7 @@ static void test_le_garbage_collection_birdie(void) { do_garbage_collect = ule_worth_running_garbage_collection(&ule, 200); invariant(do_garbage_collect); - // It is definately worth doing when the above case is true + // It is definitely worth doing when the above case is true // and there is more than one provisional entry. ule.num_cuxrs = 1; ule.num_puxrs = 2; diff --git a/storage/tokudb/PerconaFT/ft/tests/test-oldest-referenced-xid-flush.cc b/storage/tokudb/PerconaFT/ft/tests/test-oldest-referenced-xid-flush.cc index 419af550545..71357a1e16a 100644 --- a/storage/tokudb/PerconaFT/ft/tests/test-oldest-referenced-xid-flush.cc +++ b/storage/tokudb/PerconaFT/ft/tests/test-oldest-referenced-xid-flush.cc @@ -72,7 +72,7 @@ static void dummy_update_status(FTNODE UU(child), int UU(dirtied), void* UU(extr enum { NODESIZE = 1024, KSIZE=NODESIZE-100, TOKU_PSIZE=20 }; -static void test_oldest_referenced_xid_gets_propogated(void) { +static void test_oldest_referenced_xid_gets_propagated(void) { int r; CACHETABLE ct; FT_HANDLE t; @@ -166,7 +166,7 @@ static void test_oldest_referenced_xid_gets_propogated(void) { toku_ft_flush_some_child(t->ft, node, &fa); // pin the child, verify that oldest referenced xid was - // propogated from parent to child during the flush + // propagated from parent to child during the flush toku_pin_ftnode( t->ft, child_nonleaf_blocknum, @@ -185,6 +185,6 @@ static void test_oldest_referenced_xid_gets_propogated(void) { int test_main(int argc __attribute__((__unused__)), const char *argv[] __attribute__((__unused__))) { default_parse_args(argc, argv); - test_oldest_referenced_xid_gets_propogated(); + test_oldest_referenced_xid_gets_propagated(); return 0; } diff --git a/storage/tokudb/PerconaFT/ft/serialize/block_allocator_strategy.h b/storage/tokudb/PerconaFT/ft/tests/test-rbtree-insert-remove-with-mhs.cc index 8aded3898c1..ea4f9374dc3 100644 --- a/storage/tokudb/PerconaFT/ft/serialize/block_allocator_strategy.h +++ b/storage/tokudb/PerconaFT/ft/tests/test-rbtree-insert-remove-with-mhs.cc @@ -36,30 +36,62 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. #ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." 
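
Aside on the pqueue-test.cc hunk above: it is a pure readability fix. In the original one-liner, `return -1` was never governed by `if (verbose)`; a braceless `if` controls only the single statement that follows it, and the layout made that easy to misread. A minimal standalone sketch of the pitfall and the clearer layout (hypothetical names, not PerconaFT code):

    #include <cstdio>

    // Looks like both statements depend on the flag, but only the first does:
    // the braceless 'if' governs a single statement, so 'return -1' always runs.
    static int check_misleading(bool verbose, int got, int want) {
        if (got != want) {
            if (verbose) printf("FAIL\n"); return -1;   // misleading one-liner
        }
        return 0;
    }

    // Same behavior, laid out so the control flow is obvious.
    static int check_clear(bool verbose, int got, int want) {
        if (got != want) {
            if (verbose)
                printf("FAIL\n");
            return -1;
        }
        return 0;
    }

    int main() {
        // Both calls return -1 here, whether or not verbose is set.
        printf("%d %d\n", check_misleading(false, 1, 2), check_clear(false, 1, 2));
        return 0;
    }
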
-#pragma once - -#include <db.h> - -#include "ft/serialize/block_allocator.h" - -// Block allocation strategy implementations - -class block_allocator_strategy { -public: - static struct block_allocator::blockpair * - first_fit(struct block_allocator::blockpair *blocks_array, - uint64_t n_blocks, uint64_t size, uint64_t alignment); - - static struct block_allocator::blockpair * - best_fit(struct block_allocator::blockpair *blocks_array, - uint64_t n_blocks, uint64_t size, uint64_t alignment); - - static struct block_allocator::blockpair * - padded_fit(struct block_allocator::blockpair *blocks_array, - uint64_t n_blocks, uint64_t size, uint64_t alignment); - - static struct block_allocator::blockpair * - heat_zone(struct block_allocator::blockpair *blocks_array, - uint64_t n_blocks, uint64_t size, uint64_t alignment, - uint64_t heat); -}; +#include "ft/serialize/rbtree_mhs.h" +#include "test.h" +#include <algorithm> +#include <vector> +#include <ctime> +#include <cstdlib> + +static void test_insert_remove(void) { + uint64_t i; + MhsRbTree::Tree *tree = new MhsRbTree::Tree(); + verbose = 0; + + tree->Insert({0, 100}); + + for (i = 0; i < 10; i++) { + tree->Remove(3); + tree->Remove(2); + } + tree->ValidateBalance(); + tree->ValidateMhs(); + + for (i = 0; i < 10; i++) { + tree->Insert({5 * i, 3}); + } + tree->ValidateBalance(); + tree->ValidateMhs(); + + uint64_t offset = tree->Remove(2); + invariant(offset == 0); + offset = tree->Remove(10); + invariant(offset == 50); + offset = tree->Remove(3); + invariant(offset == 5); + tree->ValidateBalance(); + tree->ValidateMhs(); + + tree->Insert({48, 2}); + tree->Insert({50, 10}); + + tree->ValidateBalance(); + tree->ValidateMhs(); + + tree->Insert({3, 7}); + offset = tree->Remove(10); + invariant(offset == 2); + tree->ValidateBalance(); + tree->ValidateMhs(); + tree->Dump(); + delete tree; +} + +int test_main(int argc, const char *argv[]) { + default_parse_args(argc, argv); + + test_insert_remove(); + if (verbose) + printf("test ok\n"); + return 0; +} diff --git a/storage/tokudb/PerconaFT/ft/tests/test-rbtree-insert-remove-without-mhs.cc b/storage/tokudb/PerconaFT/ft/tests/test-rbtree-insert-remove-without-mhs.cc new file mode 100644 index 00000000000..85f29ce9813 --- /dev/null +++ b/storage/tokudb/PerconaFT/ft/tests/test-rbtree-insert-remove-without-mhs.cc @@ -0,0 +1,102 @@ +/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ +// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: +#ident "$Id$" +/*====== +This file is part of PerconaFT. + + +Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License, version 2, + as published by the Free Software Foundation. + + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. + +---------------------------------------- + + PerconaFT is free software: you can redistribute it and/or modify + it under the terms of the GNU Affero General Public License, version 3, + as published by the Free Software Foundation. 
+ + PerconaFT is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU Affero General Public License for more details. + + You should have received a copy of the GNU Affero General Public License + along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. +======= */ + +#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." + +#include "ft/serialize/rbtree_mhs.h" +#include "test.h" +#include <algorithm> +#include <vector> +#include <ctime> +#include <cstdlib> + +#define N 1000000 +std::vector<MhsRbTree::Node::BlockPair> input_vector; +MhsRbTree::Node::BlockPair old_vector[N]; + +static int myrandom(int i) { return std::rand() % i; } + +static void generate_random_input() { + std::srand(unsigned(std::time(0))); + + // set some values: + for (uint64_t i = 1; i < N; ++i) { + input_vector.push_back({i, 0}); + old_vector[i] = {i, 0}; + } + // using built-in random generator: + std::random_shuffle(input_vector.begin(), input_vector.end(), myrandom); +} + +static void test_insert_remove(void) { + int i; + MhsRbTree::Tree *tree = new MhsRbTree::Tree(); + verbose = 0; + generate_random_input(); + if (verbose) { + printf("\n we are going to insert the following block offsets\n"); + for (i = 0; i < N; i++) + printf("%" PRIu64 "\t", input_vector[i]._offset.ToInt()); + } + for (i = 0; i < N; i++) { + tree->Insert(input_vector[i]); + // tree->ValidateBalance(); + } + tree->ValidateBalance(); + MhsRbTree::Node::BlockPair *p_bps = &old_vector[0]; + tree->ValidateInOrder(p_bps); + printf("min node of the tree:%" PRIu64 "\n", + rbn_offset(tree->MinNode()).ToInt()); + printf("max node of the tree:%" PRIu64 "\n", + rbn_offset(tree->MaxNode()).ToInt()); + + for (i = 0; i < N; i++) { + // tree->ValidateBalance(); + tree->RawRemove(input_vector[i]._offset.ToInt()); + } + + tree->Destroy(); + delete tree; +} + +int test_main(int argc, const char *argv[]) { + default_parse_args(argc, argv); + + test_insert_remove(); + if (verbose) + printf("test ok\n"); + return 0; +} diff --git a/storage/tokudb/PerconaFT/ft/txn/roll.cc b/storage/tokudb/PerconaFT/ft/txn/roll.cc index 407116b983c..90eee1e580a 100644 --- a/storage/tokudb/PerconaFT/ft/txn/roll.cc +++ b/storage/tokudb/PerconaFT/ft/txn/roll.cc @@ -49,7 +49,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. // functionality provided by roll.c is exposed by an autogenerated // header file, logheader.h // -// this (poorly) explains the absense of "roll.h" +// this (poorly) explains the absence of "roll.h" // these flags control whether or not we send commit messages for // various operations diff --git a/storage/tokudb/PerconaFT/ft/txn/rollback-apply.cc b/storage/tokudb/PerconaFT/ft/txn/rollback-apply.cc index df830afd0df..c9464c3ed60 100644 --- a/storage/tokudb/PerconaFT/ft/txn/rollback-apply.cc +++ b/storage/tokudb/PerconaFT/ft/txn/rollback-apply.cc @@ -169,7 +169,7 @@ int toku_rollback_commit(TOKUTXN txn, LSN lsn) { txn->roll_info.spilled_rollback_head = ROLLBACK_NONE; txn->roll_info.spilled_rollback_tail = ROLLBACK_NONE; } - // if we're commiting a child rollback, put its entries into the parent + // if we're committing a child rollback, put its entries into the parent // by pinning both child and parent and then linking the child log entry // list to the end of the parent log entry list. 
if (txn_has_current_rollback_log(txn)) { diff --git a/storage/tokudb/PerconaFT/ft/txn/rollback-ct-callbacks.cc b/storage/tokudb/PerconaFT/ft/txn/rollback-ct-callbacks.cc index 68c94c2ad11..08d7c8874e5 100644 --- a/storage/tokudb/PerconaFT/ft/txn/rollback-ct-callbacks.cc +++ b/storage/tokudb/PerconaFT/ft/txn/rollback-ct-callbacks.cc @@ -59,21 +59,18 @@ rollback_log_destroy(ROLLBACK_LOG_NODE log) { // flush an ununused log to disk, by allocating a size 0 blocknum in // the blocktable -static void -toku_rollback_flush_unused_log( - ROLLBACK_LOG_NODE log, - BLOCKNUM logname, - int fd, - FT ft, - bool write_me, - bool keep_me, - bool for_checkpoint, - bool is_clone - ) -{ +static void toku_rollback_flush_unused_log(ROLLBACK_LOG_NODE log, + BLOCKNUM logname, + int fd, + FT ft, + bool write_me, + bool keep_me, + bool for_checkpoint, + bool is_clone) { if (write_me) { DISKOFF offset; - ft->blocktable.realloc_on_disk(logname, 0, &offset, ft, fd, for_checkpoint, INT_MAX); + ft->blocktable.realloc_on_disk( + logname, 0, &offset, ft, fd, for_checkpoint); } if (!keep_me && !is_clone) { toku_free(log); diff --git a/storage/tokudb/PerconaFT/ft/ule.cc b/storage/tokudb/PerconaFT/ft/ule.cc index ac393fbf179..e3dce6d27dd 100644 --- a/storage/tokudb/PerconaFT/ft/ule.cc +++ b/storage/tokudb/PerconaFT/ft/ule.cc @@ -587,8 +587,8 @@ bool toku_le_worth_running_garbage_collection( // by new txns. // 2.) There is only one committed entry, but the outermost // provisional entry is older than the oldest known referenced -// xid, so it must have commited. Therefor we can promote it to -// committed and get rid of the old commited entry. +// xid, so it must have committed. Therefor we can promote it to +// committed and get rid of the old committed entry. if (le->type != LE_MVCC) { return false; } diff --git a/storage/tokudb/PerconaFT/portability/CMakeLists.txt b/storage/tokudb/PerconaFT/portability/CMakeLists.txt index 9f84d9b03df..4793db63cc1 100644 --- a/storage/tokudb/PerconaFT/portability/CMakeLists.txt +++ b/storage/tokudb/PerconaFT/portability/CMakeLists.txt @@ -14,12 +14,11 @@ set(tokuportability_srcs ) add_library(${LIBTOKUPORTABILITY} SHARED ${tokuportability_srcs}) -target_link_libraries(${LIBTOKUPORTABILITY} LINK_PRIVATE ${LIBJEMALLOC}) target_link_libraries(${LIBTOKUPORTABILITY} LINK_PUBLIC ${CMAKE_THREAD_LIBS_INIT} ${EXTRA_SYSTEM_LIBS}) add_library(tokuportability_static_conv STATIC ${tokuportability_srcs}) set_target_properties(tokuportability_static_conv PROPERTIES POSITION_INDEPENDENT_CODE ON) -set(tokuportability_source_libs tokuportability_static_conv ${LIBJEMALLOC} ${CMAKE_THREAD_LIBS_INIT} ${EXTRA_SYSTEM_LIBS}) +set(tokuportability_source_libs tokuportability_static_conv ${CMAKE_THREAD_LIBS_INIT} ${EXTRA_SYSTEM_LIBS}) toku_merge_static_libs(${LIBTOKUPORTABILITY}_static ${LIBTOKUPORTABILITY}_static "${tokuportability_source_libs}") maybe_add_gcov_to_libraries(${LIBTOKUPORTABILITY} tokuportability_static_conv) diff --git a/storage/tokudb/PerconaFT/portability/huge_page_detection.cc b/storage/tokudb/PerconaFT/portability/huge_page_detection.cc index bc48e93937d..8e73c56a6c5 100644 --- a/storage/tokudb/PerconaFT/portability/huge_page_detection.cc +++ b/storage/tokudb/PerconaFT/portability/huge_page_detection.cc @@ -90,7 +90,13 @@ static bool check_huge_pages_in_practice(void) const long pagesize = 4096; const long n_pages = TWO_MB/pagesize; +#ifdef __linux__ + // On linux mincore is defined as mincore(void *, size_t, unsigned char *) unsigned char vec[n_pages]; +#else + // On BSD (OS X included) it is 
defined as mincore(void *, size_t, char *) + char vec[n_pages]; +#endif { int r = mincore(second, TWO_MB, vec); if (r!=0 && errno==ENOMEM) { diff --git a/storage/tokudb/PerconaFT/portability/tests/test-max-data.cc b/storage/tokudb/PerconaFT/portability/tests/test-max-data.cc index 880f9a3a9bb..dbbea974a49 100644 --- a/storage/tokudb/PerconaFT/portability/tests/test-max-data.cc +++ b/storage/tokudb/PerconaFT/portability/tests/test-max-data.cc @@ -64,7 +64,7 @@ int main(int argc, char *const argv[]) { if (verbose) printf("maxdata=%" PRIu64 " 0x%" PRIx64 "\n", maxdata, maxdata); // check the data size -#if __x86_64__ +#if defined(__x86_64__) || defined(__aarch64__) assert(maxdata > (1ULL << 32)); #elif __i386__ assert(maxdata < (1ULL << 32)); diff --git a/storage/tokudb/PerconaFT/portability/toku_config.h.in b/storage/tokudb/PerconaFT/portability/toku_config.h.in index e1412cc9e14..1a34bf1ef45 100644 --- a/storage/tokudb/PerconaFT/portability/toku_config.h.in +++ b/storage/tokudb/PerconaFT/portability/toku_config.h.in @@ -42,7 +42,6 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. #cmakedefine TOKU_DEBUG_PARANOID 1 #cmakedefine USE_VALGRIND 1 - #cmakedefine HAVE_ALLOCA_H 1 #cmakedefine HAVE_ARPA_INET_H 1 #cmakedefine HAVE_BYTESWAP_H 1 diff --git a/storage/tokudb/PerconaFT/portability/toku_time.h b/storage/tokudb/PerconaFT/portability/toku_time.h index 11a3f3aa2b9..a1278ef0337 100644 --- a/storage/tokudb/PerconaFT/portability/toku_time.h +++ b/storage/tokudb/PerconaFT/portability/toku_time.h @@ -98,9 +98,17 @@ double tokutime_to_seconds(tokutime_t) __attribute__((__visibility__("default") // Get the value of tokutime for right now. We want this to be fast, so we expose the implementation as RDTSC. static inline tokutime_t toku_time_now(void) { +#if defined(__x86_64__) || defined(__i386__) uint32_t lo, hi; __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi)); return (uint64_t)hi << 32 | lo; +#elif defined (__aarch64__) + uint64_t result; + __asm __volatile__ ("mrs %[rt], cntvct_el0" : [rt] "=r" (result)); + return result; +#else +#error No timer implementation for this platform +#endif } static inline uint64_t toku_current_time_microsec(void) { diff --git a/storage/tokudb/PerconaFT/src/indexer-internal.h b/storage/tokudb/PerconaFT/src/indexer-internal.h index 48e62ee49b2..fdaa561e3d0 100644 --- a/storage/tokudb/PerconaFT/src/indexer-internal.h +++ b/storage/tokudb/PerconaFT/src/indexer-internal.h @@ -42,7 +42,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. #include <toku_pthread.h> // the indexer_commit_keys is an ordered set of keys described by a DBT in the keys array. -// the array is a resizeable array with max size "max_keys" and current size "current_keys". +// the array is a resizable array with max size "max_keys" and current size "current_keys". // the ordered set is used by the hotindex undo function to collect the commit keys. 
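
Aside on the huge_page_detection.cc hunk above: it works around a real signature difference, since Linux declares the residency vector of mincore() as unsigned char * while the BSDs and macOS declare it as char *. A minimal, Linux-flavored sketch of querying page residency for an anonymous mapping (illustrative only; the 16-page mapping and memset are assumptions, not the PerconaFT code):

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void) {
        const long pagesize = sysconf(_SC_PAGESIZE);
        const size_t len = 16 * (size_t)pagesize;

        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED) { perror("mmap"); return 1; }

        memset(p, 0, len);  // touch every page so it becomes resident

    #ifdef __linux__
        unsigned char vec[16];   // Linux: mincore(void *, size_t, unsigned char *)
    #else
        char vec[16];            // BSD/macOS: mincore(void *, size_t, char *)
    #endif
        if (mincore(p, len, vec) != 0) { perror("mincore"); return 1; }

        // Bit 0 of each byte reports whether that page is resident in memory.
        for (int i = 0; i < 16; i++)
            printf("page %2d: %s\n", i, (vec[i] & 1) ? "resident" : "not resident");

        munmap(p, len);
        return 0;
    }
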
struct indexer_commit_keys { int max_keys; // max number of keys diff --git a/storage/tokudb/PerconaFT/src/indexer-undo-do.cc b/storage/tokudb/PerconaFT/src/indexer-undo-do.cc index 8d0b080b9fe..4c7f5336161 100644 --- a/storage/tokudb/PerconaFT/src/indexer-undo-do.cc +++ b/storage/tokudb/PerconaFT/src/indexer-undo-do.cc @@ -528,7 +528,7 @@ indexer_find_prev_xr(DB_INDEXER *UU(indexer), ULEHANDLE ule, uint64_t xrindex, u } // inject "delete" message into ft with logging in recovery and rollback logs, -// and making assocation between txn and ft +// and making association between txn and ft static int indexer_ft_delete_provisional(DB_INDEXER *indexer, DB *hotdb, DBT *hotkey, XIDS xids, TOKUTXN txn) { int result = 0; @@ -577,7 +577,7 @@ indexer_ft_delete_committed(DB_INDEXER *indexer, DB *hotdb, DBT *hotkey, XIDS xi } // inject "insert" message into ft with logging in recovery and rollback logs, -// and making assocation between txn and ft +// and making association between txn and ft static int indexer_ft_insert_provisional(DB_INDEXER *indexer, DB *hotdb, DBT *hotkey, DBT *hotval, XIDS xids, TOKUTXN txn) { int result = 0; diff --git a/storage/tokudb/PerconaFT/src/tests/hotindexer-undo-do-tests/commit.i0.test b/storage/tokudb/PerconaFT/src/tests/hotindexer-undo-do-tests/commit.i0.test index 20df13923e6..7cce68e6ff8 100644 --- a/storage/tokudb/PerconaFT/src/tests/hotindexer-undo-do-tests/commit.i0.test +++ b/storage/tokudb/PerconaFT/src/tests/hotindexer-undo-do-tests/commit.i0.test @@ -1,3 +1,3 @@ -# commited insert +# committed insert key k1 insert committed 0 v100 diff --git a/storage/tokudb/PerconaFT/src/tests/loader-dup-test.cc b/storage/tokudb/PerconaFT/src/tests/loader-dup-test.cc index 3f2f8d7455a..aaf77c503cc 100644 --- a/storage/tokudb/PerconaFT/src/tests/loader-dup-test.cc +++ b/storage/tokudb/PerconaFT/src/tests/loader-dup-test.cc @@ -51,7 +51,7 @@ int DISALLOW_PUTS=0; int COMPRESS=0; enum {MAGIC=311}; -bool dup_row_at_end = false; // false: duplicate at the begining. true: duplicate at the end. The duplicated row is row 0. +bool dup_row_at_end = false; // false: duplicate at the beginning. true: duplicate at the end. The duplicated row is row 0. int dup_row_id = 0; // 0 means to use row 1 if inserting at the end, row NUM_ROWS if inserting at the beginning. Otherwise insert the row specified here. 
// diff --git a/storage/tokudb/PerconaFT/src/tests/recovery_fileops_unit.cc b/storage/tokudb/PerconaFT/src/tests/recovery_fileops_unit.cc index a4dc0ea9236..2c905c5ff12 100644 --- a/storage/tokudb/PerconaFT/src/tests/recovery_fileops_unit.cc +++ b/storage/tokudb/PerconaFT/src/tests/recovery_fileops_unit.cc @@ -156,7 +156,7 @@ do_args(int argc, char * const argv[]) { choices[i] = -1; } - char c; + int c; while ((c = getopt(argc, argv, "vqhcrO:A:B:C:D:E:F:G:H:I:X:")) != -1) { switch(c) { case 'v': diff --git a/storage/tokudb/PerconaFT/src/tests/stat64-root-changes.cc b/storage/tokudb/PerconaFT/src/tests/stat64-root-changes.cc index a2b48e443cd..48843a0bd32 100644 --- a/storage/tokudb/PerconaFT/src/tests/stat64-root-changes.cc +++ b/storage/tokudb/PerconaFT/src/tests/stat64-root-changes.cc @@ -166,7 +166,7 @@ run_test (void) { DB_BTREE_STAT64 s; r = db->stat64(db, NULL, &s); CKERR(r); - assert(s.bt_nkeys == 0); + assert(s.bt_nkeys == 1); r = db->close(db, 0); CKERR(r); @@ -176,7 +176,7 @@ run_test (void) { r = txn->commit(txn, 0); CKERR(r); r = db->stat64(db, NULL, &s); CKERR(r); - assert(s.bt_nkeys == 0); + assert(s.bt_nkeys == 1); } // verify update callback overwrites the row diff --git a/storage/tokudb/PerconaFT/src/tests/test_insert_many_gc.cc b/storage/tokudb/PerconaFT/src/tests/test_insert_many_gc.cc index 8e5109cd2a9..f6111d4b67c 100644 --- a/storage/tokudb/PerconaFT/src/tests/test_insert_many_gc.cc +++ b/storage/tokudb/PerconaFT/src/tests/test_insert_many_gc.cc @@ -78,7 +78,7 @@ static void test_insert_many_gc(void) { // from having an MVCC stack of size 'N'. At the time of this // writing, we run full GC on leaf-inject when the leaf is // 32mb or larger. A good invariant is that the max LE size - // never grew larger than 35mb and that the max commited xr stack + // never grew larger than 35mb and that the max committed xr stack // length never exceeded 35 const uint64_t le_max_memsize = get_engine_status_val(env, "LE_MAX_MEMSIZE"); const uint64_t le_max_committed_xr = get_engine_status_val(env, "LE_MAX_COMMITTED_XR"); diff --git a/storage/tokudb/PerconaFT/src/tests/test_stress0.cc b/storage/tokudb/PerconaFT/src/tests/test_stress0.cc index aaafe284906..88140dd1731 100644 --- a/storage/tokudb/PerconaFT/src/tests/test_stress0.cc +++ b/storage/tokudb/PerconaFT/src/tests/test_stress0.cc @@ -53,7 +53,7 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. // This test is a micro stress test that does multithreaded updates on a fixed size table. // There is also a thread that scans the table with bulk fetch, ensuring the sum is zero. // -// This test is targetted at stressing the locktree, hence the small table and many update threads. +// This test is targeted at stressing the locktree, hence the small table and many update threads. 
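
Aside on the recovery_fileops_unit.cc hunk above: it changes the getopt() loop variable from char to int. getopt() returns an int and signals end-of-arguments with -1; if the result is stored in a plain char on a platform where char is unsigned (common on ARM and PowerPC Linux), the comparison against -1 can never succeed and the loop never terminates. A small illustration of the correct pattern (the option letters here are made up, not the test's real ones):

    #include <stdio.h>
    #include <unistd.h>

    int main(int argc, char *argv[]) {
        int c;  // must be int: getopt() returns int, and -1 marks the end
        while ((c = getopt(argc, argv, "vqo:")) != -1) {
            switch (c) {
            case 'v': printf("verbose\n");           break;
            case 'q': printf("quiet\n");             break;
            case 'o': printf("output=%s\n", optarg); break;
            default:
                fprintf(stderr, "usage: %s [-v] [-q] [-o file]\n", argv[0]);
                return 2;
            }
        }
        return 0;
    }
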
// static int UU() lock_escalation_op(DB_TXN *UU(txn), ARG arg, void* operation_extra, void *UU(stats_extra)) { diff --git a/storage/tokudb/PerconaFT/src/tests/test_txn_abort5a.cc b/storage/tokudb/PerconaFT/src/tests/test_txn_abort5a.cc index fec454b8009..301eed1560e 100644 --- a/storage/tokudb/PerconaFT/src/tests/test_txn_abort5a.cc +++ b/storage/tokudb/PerconaFT/src/tests/test_txn_abort5a.cc @@ -123,7 +123,8 @@ test_main(int argc, char *const argv[]) { continue; } } - if (verbose>0) printf("%s", __FILE__); if (verbose>1) printf("\n"); + if (verbose>0) printf("%s", __FILE__); + if (verbose>1) printf("\n"); for (i=1; i<100; i++) test_txn_abort(i); if (verbose>1) printf("%s OK\n", __FILE__); diff --git a/storage/tokudb/PerconaFT/src/ydb-internal.h b/storage/tokudb/PerconaFT/src/ydb-internal.h index 462a2a3d861..2d6c84126e1 100644 --- a/storage/tokudb/PerconaFT/src/ydb-internal.h +++ b/storage/tokudb/PerconaFT/src/ydb-internal.h @@ -114,7 +114,7 @@ struct __toku_db_env_internal { char *real_data_dir; // data dir used when the env is opened (relative to cwd, or absolute with leading /) char *real_log_dir; // log dir used when the env is opened (relative to cwd, or absolute with leading /) - char *real_tmp_dir; // tmp dir used for temporary files (relative to cwd, or absoulte with leading /) + char *real_tmp_dir; // tmp dir used for temporary files (relative to cwd, or absolute with leading /) fs_redzone_state fs_state; uint64_t fs_seq; // how many times has fs_poller run? diff --git a/storage/tokudb/PerconaFT/third_party/xz-4.999.9beta/build-aux/config.guess b/storage/tokudb/PerconaFT/third_party/xz-4.999.9beta/build-aux/config.guess index da833146088..7501b1bee01 100644 --- a/storage/tokudb/PerconaFT/third_party/xz-4.999.9beta/build-aux/config.guess +++ b/storage/tokudb/PerconaFT/third_party/xz-4.999.9beta/build-aux/config.guess @@ -1,10 +1,10 @@ #! /bin/sh # Attempt to guess a canonical system name. # Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, -# 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 -# Free Software Foundation, Inc. +# 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, +# 2011, 2012 Free Software Foundation, Inc. -timestamp='2009-04-27' +timestamp='2016-06-22' # This file is free software; you can redistribute it and/or modify it # under the terms of the GNU General Public License as published by @@ -17,9 +17,7 @@ timestamp='2009-04-27' # General Public License for more details. # # You should have received a copy of the GNU General Public License -# along with this program; if not, write to the Free Software -# Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA -# 02110-1301, USA. +# along with this program; if not, see <http://www.gnu.org/licenses/>. # # As a special exception to the GNU General Public License, if you # distribute this file as part of a program that contains a @@ -27,16 +25,16 @@ timestamp='2009-04-27' # the same distribution terms that you use for the rest of that program. -# Originally written by Per Bothner <per@bothner.com>. -# Please send patches to <config-patches@gnu.org>. Submit a context -# diff and a properly formatted ChangeLog entry. +# Originally written by Per Bothner. Please send patches (context +# diff format) to <config-patches@gnu.org> and include a ChangeLog +# entry. # # This script attempts to guess a canonical system name similar to # config.sub. If it succeeds, it prints the system name on stdout, and # exits with 0. Otherwise, it exits with 1. 
# -# The plan is that this can be called by configure scripts if you -# don't specify an explicit build system type. +# You can get the latest version of this script from: +# http://git.savannah.gnu.org/gitweb/?p=config.git;a=blob_plain;f=config.guess;hb=HEAD me=`echo "$0" | sed -e 's,.*/,,'` @@ -56,8 +54,9 @@ version="\ GNU config.guess ($timestamp) Originally written by Per Bothner. -Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, -2002, 2003, 2004, 2005, 2006, 2007, 2008 Free Software Foundation, Inc. +Copyright (C) 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, +2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012 +Free Software Foundation, Inc. This is free software; see the source for copying conditions. There is NO warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." @@ -144,7 +143,7 @@ UNAME_VERSION=`(uname -v) 2>/dev/null` || UNAME_VERSION=unknown case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in *:NetBSD:*:*) # NetBSD (nbsd) targets should (where applicable) match one or - # more of the tupples: *-*-netbsdelf*, *-*-netbsdaout*, + # more of the tuples: *-*-netbsdelf*, *-*-netbsdaout*, # *-*-netbsdecoff* and *-*-netbsd*. For targets that recently # switched to ELF, *-*-netbsd* would select the old # object file format. This provides both forward @@ -170,7 +169,7 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in arm*|i386|m68k|ns32k|sh3*|sparc|vax) eval $set_cc_for_build if echo __ELF__ | $CC_FOR_BUILD -E - 2>/dev/null \ - | grep __ELF__ >/dev/null + | grep -q __ELF__ then # Once all utilities can be ECOFF (netbsdecoff) or a.out (netbsdaout). # Return netbsd for either. FIX? @@ -180,7 +179,7 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in fi ;; *) - os=netbsd + os=netbsd ;; esac # The OS release @@ -223,7 +222,7 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $3}'` ;; *5.*) - UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'` + UNAME_RELEASE=`/usr/sbin/sizer -v | awk '{print $4}'` ;; esac # According to Compaq, /usr/sbin/psrinfo has been available on @@ -269,7 +268,10 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in # A Xn.n version is an unreleased experimental baselevel. # 1.2 uses "1.2" for uname -r. echo ${UNAME_MACHINE}-dec-osf`echo ${UNAME_RELEASE} | sed -e 's/^[PVTX]//' | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` - exit ;; + # Reset EXIT trap before exiting to avoid spurious non-zero exit code. + exitcode=$? + trap '' 0 + exit $exitcode ;; Alpha\ *:Windows_NT*:*) # How do we know it's Interix rather than the generic POSIX subsystem? 
# Should we change UNAME_MACHINE based on the output of uname instead @@ -295,7 +297,7 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in echo s390-ibm-zvmoe exit ;; *:OS400:*:*) - echo powerpc-ibm-os400 + echo powerpc-ibm-os400 exit ;; arm:RISC*:1.[012]*:*|arm:riscix:1.[012]*:*) echo arm-acorn-riscix${UNAME_RELEASE} @@ -333,6 +335,9 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in sun4*:SunOS:5.*:* | tadpole*:SunOS:5.*:*) echo sparc-sun-solaris2`echo ${UNAME_RELEASE}|sed -e 's/[^.]*//'` exit ;; + i86pc:AuroraUX:5.*:* | i86xen:AuroraUX:5.*:*) + echo i386-pc-auroraux${UNAME_RELEASE} + exit ;; i86pc:SunOS:5.*:* | i86xen:SunOS:5.*:*) eval $set_cc_for_build SUN_ARCH="i386" @@ -391,23 +396,23 @@ case "${UNAME_MACHINE}:${UNAME_SYSTEM}:${UNAME_RELEASE}:${UNAME_VERSION}" in # MiNT. But MiNT is downward compatible to TOS, so this should # be no problem. atarist[e]:*MiNT:*:* | atarist[e]:*mint:*:* | atarist[e]:*TOS:*:*) - echo m68k-atari-mint${UNAME_RELEASE} + echo m68k-atari-mint${UNAME_RELEASE} exit ;; atari*:*MiNT:*:* | atari*:*mint:*:* | atarist[e]:*TOS:*:*) echo m68k-atari-mint${UNAME_RELEASE} - exit ;; + exit ;; *falcon*:*MiNT:*:* | *falcon*:*mint:*:* | *falcon*:*TOS:*:*) - echo m68k-atari-mint${UNAME_RELEASE} + echo m68k-atari-mint${UNAME_RELEASE} exit ;; milan*:*MiNT:*:* | milan*:*mint:*:* | *milan*:*TOS:*:*) - echo m68k-milan-mint${UNAME_RELEASE} - exit ;; + echo m68k-milan-mint${UNAME_RELEASE} + exit ;; hades*:*MiNT:*:* | hades*:*mint:*:* | *hades*:*TOS:*:*) - echo m68k-hades-mint${UNAME_RELEASE} - exit ;; + echo m68k-hades-mint${UNAME_RELEASE} + exit ;; *:*MiNT:*:* | *:*mint:*:* | *:*TOS:*:*) - echo m68k-unknown-mint${UNAME_RELEASE} - exit ;; + echo m68k-unknown-mint${UNAME_RELEASE} + exit ;; m68k:machten:*:*) echo m68k-apple-machten${UNAME_RELEASE} exit ;; @@ -477,8 +482,8 @@ EOF echo m88k-motorola-sysv3 exit ;; AViiON:dgux:*:*) - # DG/UX returns AViiON for all architectures - UNAME_PROCESSOR=`/usr/bin/uname -p` + # DG/UX returns AViiON for all architectures + UNAME_PROCESSOR=`/usr/bin/uname -p` if [ $UNAME_PROCESSOR = mc88100 ] || [ $UNAME_PROCESSOR = mc88110 ] then if [ ${TARGET_BINARY_INTERFACE}x = m88kdguxelfx ] || \ @@ -491,7 +496,7 @@ EOF else echo i586-dg-dgux${UNAME_RELEASE} fi - exit ;; + exit ;; M88*:DolphinOS:*:*) # DolphinOS (SVR3) echo m88k-dolphin-sysv3 exit ;; @@ -548,7 +553,7 @@ EOF echo rs6000-ibm-aix3.2 fi exit ;; - *:AIX:*:[456]) + *:AIX:*:[4567]) IBM_CPU_ID=`/usr/sbin/lsdev -C -c processor -S available | sed 1q | awk '{ print $1 }'` if /usr/sbin/lsattr -El ${IBM_CPU_ID} | grep ' POWER' >/dev/null 2>&1; then IBM_ARCH=rs6000 @@ -591,52 +596,52 @@ EOF 9000/[678][0-9][0-9]) if [ -x /usr/bin/getconf ]; then sc_cpu_version=`/usr/bin/getconf SC_CPU_VERSION 2>/dev/null` - sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null` - case "${sc_cpu_version}" in - 523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0 - 528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1 - 532) # CPU_PA_RISC2_0 - case "${sc_kernel_bits}" in - 32) HP_ARCH="hppa2.0n" ;; - 64) HP_ARCH="hppa2.0w" ;; + sc_kernel_bits=`/usr/bin/getconf SC_KERNEL_BITS 2>/dev/null` + case "${sc_cpu_version}" in + 523) HP_ARCH="hppa1.0" ;; # CPU_PA_RISC1_0 + 528) HP_ARCH="hppa1.1" ;; # CPU_PA_RISC1_1 + 532) # CPU_PA_RISC2_0 + case "${sc_kernel_bits}" in + 32) HP_ARCH="hppa2.0n" ;; + 64) HP_ARCH="hppa2.0w" ;; '') HP_ARCH="hppa2.0" ;; # HP-UX 10.20 - esac ;; - esac + esac ;; + esac fi if [ "${HP_ARCH}" = "" ]; then eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c + sed 's/^ 
//' << EOF >$dummy.c - #define _HPUX_SOURCE - #include <stdlib.h> - #include <unistd.h> + #define _HPUX_SOURCE + #include <stdlib.h> + #include <unistd.h> - int main () - { - #if defined(_SC_KERNEL_BITS) - long bits = sysconf(_SC_KERNEL_BITS); - #endif - long cpu = sysconf (_SC_CPU_VERSION); + int main () + { + #if defined(_SC_KERNEL_BITS) + long bits = sysconf(_SC_KERNEL_BITS); + #endif + long cpu = sysconf (_SC_CPU_VERSION); - switch (cpu) - { - case CPU_PA_RISC1_0: puts ("hppa1.0"); break; - case CPU_PA_RISC1_1: puts ("hppa1.1"); break; - case CPU_PA_RISC2_0: - #if defined(_SC_KERNEL_BITS) - switch (bits) - { - case 64: puts ("hppa2.0w"); break; - case 32: puts ("hppa2.0n"); break; - default: puts ("hppa2.0"); break; - } break; - #else /* !defined(_SC_KERNEL_BITS) */ - puts ("hppa2.0"); break; - #endif - default: puts ("hppa1.0"); break; - } - exit (0); - } + switch (cpu) + { + case CPU_PA_RISC1_0: puts ("hppa1.0"); break; + case CPU_PA_RISC1_1: puts ("hppa1.1"); break; + case CPU_PA_RISC2_0: + #if defined(_SC_KERNEL_BITS) + switch (bits) + { + case 64: puts ("hppa2.0w"); break; + case 32: puts ("hppa2.0n"); break; + default: puts ("hppa2.0"); break; + } break; + #else /* !defined(_SC_KERNEL_BITS) */ + puts ("hppa2.0"); break; + #endif + default: puts ("hppa1.0"); break; + } + exit (0); + } EOF (CCOPTS= $CC_FOR_BUILD -o $dummy $dummy.c 2>/dev/null) && HP_ARCH=`$dummy` test -z "$HP_ARCH" && HP_ARCH=hppa @@ -656,7 +661,7 @@ EOF # => hppa64-hp-hpux11.23 if echo __LP64__ | (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | - grep __LP64__ >/dev/null + grep -q __LP64__ then HP_ARCH="hppa2.0w" else @@ -727,22 +732,22 @@ EOF exit ;; C1*:ConvexOS:*:* | convex:ConvexOS:C1*:*) echo c1-convex-bsd - exit ;; + exit ;; C2*:ConvexOS:*:* | convex:ConvexOS:C2*:*) if getsysinfo -f scalar_acc then echo c32-convex-bsd else echo c2-convex-bsd fi - exit ;; + exit ;; C34*:ConvexOS:*:* | convex:ConvexOS:C34*:*) echo c34-convex-bsd - exit ;; + exit ;; C38*:ConvexOS:*:* | convex:ConvexOS:C38*:*) echo c38-convex-bsd - exit ;; + exit ;; C4*:ConvexOS:*:* | convex:ConvexOS:C4*:*) echo c4-convex-bsd - exit ;; + exit ;; CRAY*Y-MP:*:*:*) echo ymp-cray-unicos${UNAME_RELEASE} | sed -e 's/\.[^.]*$/.X/' exit ;; @@ -766,14 +771,14 @@ EOF exit ;; F30[01]:UNIX_System_V:*:* | F700:UNIX_System_V:*:*) FUJITSU_PROC=`uname -m | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz'` - FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` - FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'` - echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" - exit ;; + FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` + FUJITSU_REL=`echo ${UNAME_RELEASE} | sed -e 's/ /_/'` + echo "${FUJITSU_PROC}-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" + exit ;; 5000:UNIX_System_V:4.*:*) - FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` - FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'` - echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" + FUJITSU_SYS=`uname -p | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/\///'` + FUJITSU_REL=`echo ${UNAME_RELEASE} | tr 'ABCDEFGHIJKLMNOPQRSTUVWXYZ' 'abcdefghijklmnopqrstuvwxyz' | sed -e 's/ /_/'` + echo "sparc-fujitsu-${FUJITSU_SYS}${FUJITSU_REL}" exit ;; i*86:BSD/386:*:* | i*86:BSD/OS:*:* | *:Ascend\ Embedded/OS:*:*) echo ${UNAME_MACHINE}-pc-bsdi${UNAME_RELEASE} @@ 
-785,13 +790,12 @@ EOF echo ${UNAME_MACHINE}-unknown-bsdi${UNAME_RELEASE} exit ;; *:FreeBSD:*:*) - case ${UNAME_MACHINE} in - pc98) - echo i386-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; + UNAME_PROCESSOR=`/usr/bin/uname -p` + case ${UNAME_PROCESSOR} in amd64) echo x86_64-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; *) - echo ${UNAME_MACHINE}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; + echo ${UNAME_PROCESSOR}-unknown-freebsd`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` ;; esac exit ;; i*:CYGWIN*:*) @@ -800,19 +804,22 @@ EOF *:MINGW*:*) echo ${UNAME_MACHINE}-pc-mingw32 exit ;; + i*:MSYS*:*) + echo ${UNAME_MACHINE}-pc-msys + exit ;; i*:windows32*:*) - # uname -m includes "-pc" on this system. - echo ${UNAME_MACHINE}-mingw32 + # uname -m includes "-pc" on this system. + echo ${UNAME_MACHINE}-mingw32 exit ;; i*:PW*:*) echo ${UNAME_MACHINE}-pc-pw32 exit ;; - *:Interix*:[3456]*) - case ${UNAME_MACHINE} in + *:Interix*:*) + case ${UNAME_MACHINE} in x86) echo i586-pc-interix${UNAME_RELEASE} exit ;; - EM64T | authenticamd | genuineintel) + authenticamd | genuineintel | EM64T) echo x86_64-unknown-interix${UNAME_RELEASE} exit ;; IA64) @@ -822,6 +829,9 @@ EOF [345]86:Windows_95:* | [345]86:Windows_98:* | [345]86:Windows_NT:*) echo i${UNAME_MACHINE}-pc-mks exit ;; + 8664:Windows_NT:*) + echo x86_64-pc-mks + exit ;; i*:Windows_NT*:* | Pentium*:Windows_NT*:*) # How do we know it's Interix rather than the generic POSIX subsystem? # It also conflicts with pre-2.0 versions of AT&T UWIN. Should we @@ -851,6 +861,27 @@ EOF i*86:Minix:*:*) echo ${UNAME_MACHINE}-pc-minix exit ;; + aarch64:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + aarch64_be:Linux:*:*) + UNAME_MACHINE=aarch64_be + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + alpha:Linux:*:*) + case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in + EV5) UNAME_MACHINE=alphaev5 ;; + EV56) UNAME_MACHINE=alphaev56 ;; + PCA56) UNAME_MACHINE=alphapca56 ;; + PCA57) UNAME_MACHINE=alphapca56 ;; + EV6) UNAME_MACHINE=alphaev6 ;; + EV67) UNAME_MACHINE=alphaev67 ;; + EV68*) UNAME_MACHINE=alphaev68 ;; + esac + objdump --private-headers /bin/sh | grep -q ld.so.1 + if test "$?" 
= 0 ; then LIBC="libc1" ; else LIBC="" ; fi + echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC} + exit ;; arm*:Linux:*:*) eval $set_cc_for_build if echo __ARM_EABI__ | $CC_FOR_BUILD -E - 2>/dev/null \ @@ -858,20 +889,40 @@ EOF then echo ${UNAME_MACHINE}-unknown-linux-gnu else - echo ${UNAME_MACHINE}-unknown-linux-gnueabi + if echo __ARM_PCS_VFP | $CC_FOR_BUILD -E - 2>/dev/null \ + | grep -q __ARM_PCS_VFP + then + echo ${UNAME_MACHINE}-unknown-linux-gnueabi + else + echo ${UNAME_MACHINE}-unknown-linux-gnueabihf + fi fi exit ;; avr32*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; cris:Linux:*:*) - echo cris-axis-linux-gnu + echo ${UNAME_MACHINE}-axis-linux-gnu exit ;; crisv32:Linux:*:*) - echo crisv32-axis-linux-gnu + echo ${UNAME_MACHINE}-axis-linux-gnu exit ;; frv:Linux:*:*) - echo frv-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + hexagon:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; + i*86:Linux:*:*) + LIBC=gnu + eval $set_cc_for_build + sed 's/^ //' << EOF >$dummy.c + #ifdef __dietlibc__ + LIBC=dietlibc + #endif +EOF + eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^LIBC'` + echo "${UNAME_MACHINE}-pc-linux-${LIBC}" exit ;; ia64:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu @@ -882,78 +933,34 @@ EOF m68*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; - mips:Linux:*:*) + mips:Linux:*:* | mips64:Linux:*:*) eval $set_cc_for_build sed 's/^ //' << EOF >$dummy.c #undef CPU - #undef mips - #undef mipsel + #undef ${UNAME_MACHINE} + #undef ${UNAME_MACHINE}el #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) - CPU=mipsel + CPU=${UNAME_MACHINE}el #else #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) - CPU=mips + CPU=${UNAME_MACHINE} #else CPU= #endif #endif EOF - eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n ' - /^CPU/{ - s: ::g - p - }'`" - test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; } - ;; - mips64:Linux:*:*) - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c - #undef CPU - #undef mips64 - #undef mips64el - #if defined(__MIPSEL__) || defined(__MIPSEL) || defined(_MIPSEL) || defined(MIPSEL) - CPU=mips64el - #else - #if defined(__MIPSEB__) || defined(__MIPSEB) || defined(_MIPSEB) || defined(MIPSEB) - CPU=mips64 - #else - CPU= - #endif - #endif -EOF - eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n ' - /^CPU/{ - s: ::g - p - }'`" + eval `$CC_FOR_BUILD -E $dummy.c 2>/dev/null | grep '^CPU'` test x"${CPU}" != x && { echo "${CPU}-unknown-linux-gnu"; exit; } ;; or32:Linux:*:*) - echo or32-unknown-linux-gnu - exit ;; - ppc:Linux:*:*) - echo powerpc-unknown-linux-gnu - exit ;; - ppc64:Linux:*:*) - echo powerpc64-unknown-linux-gnu - exit ;; - alpha:Linux:*:*) - case `sed -n '/^cpu model/s/^.*: \(.*\)/\1/p' < /proc/cpuinfo` in - EV5) UNAME_MACHINE=alphaev5 ;; - EV56) UNAME_MACHINE=alphaev56 ;; - PCA56) UNAME_MACHINE=alphapca56 ;; - PCA57) UNAME_MACHINE=alphapca56 ;; - EV6) UNAME_MACHINE=alphaev6 ;; - EV67) UNAME_MACHINE=alphaev67 ;; - EV68*) UNAME_MACHINE=alphaev68 ;; - esac - objdump --private-headers /bin/sh | grep ld.so.1 >/dev/null - if test "$?" 
= 0 ; then LIBC="libc1" ; else LIBC="" ; fi - echo ${UNAME_MACHINE}-unknown-linux-gnu${LIBC} + echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; padre:Linux:*:*) echo sparc-unknown-linux-gnu exit ;; + parisc64:Linux:*:* | hppa64:Linux:*:*) + echo hppa64-unknown-linux-gnu + exit ;; parisc:Linux:*:* | hppa:Linux:*:*) # Look for CPU level case `grep '^cpu[^a-z]*:' /proc/cpuinfo 2>/dev/null | cut -d' ' -f2` in @@ -962,14 +969,17 @@ EOF *) echo hppa-unknown-linux-gnu ;; esac exit ;; - parisc64:Linux:*:* | hppa64:Linux:*:*) - echo hppa64-unknown-linux-gnu + ppc64:Linux:*:*) + echo powerpc64-unknown-linux-gnu + exit ;; + ppc:Linux:*:*) + echo powerpc-unknown-linux-gnu exit ;; s390:Linux:*:* | s390x:Linux:*:*) echo ${UNAME_MACHINE}-ibm-linux exit ;; sh64*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; sh*:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu @@ -977,75 +987,18 @@ EOF sparc:Linux:*:* | sparc64:Linux:*:*) echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; + tile*:Linux:*:*) + echo ${UNAME_MACHINE}-unknown-linux-gnu + exit ;; vax:Linux:*:*) echo ${UNAME_MACHINE}-dec-linux-gnu exit ;; x86_64:Linux:*:*) - echo x86_64-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; xtensa*:Linux:*:*) - echo ${UNAME_MACHINE}-unknown-linux-gnu + echo ${UNAME_MACHINE}-unknown-linux-gnu exit ;; - i*86:Linux:*:*) - # The BFD linker knows what the default object file format is, so - # first see if it will tell us. cd to the root directory to prevent - # problems with other programs or directories called `ld' in the path. - # Set LC_ALL=C to ensure ld outputs messages in English. - ld_supported_targets=`cd /; LC_ALL=C ld --help 2>&1 \ - | sed -ne '/supported targets:/!d - s/[ ][ ]*/ /g - s/.*supported targets: *// - s/ .*// - p'` - case "$ld_supported_targets" in - elf32-i386) - TENTATIVE="${UNAME_MACHINE}-pc-linux-gnu" - ;; - a.out-i386-linux) - echo "${UNAME_MACHINE}-pc-linux-gnuaout" - exit ;; - "") - # Either a pre-BFD a.out linker (linux-gnuoldld) or - # one that does not give us useful --help. - echo "${UNAME_MACHINE}-pc-linux-gnuoldld" - exit ;; - esac - # Determine whether the default compiler is a.out or elf - eval $set_cc_for_build - sed 's/^ //' << EOF >$dummy.c - #include <features.h> - #ifdef __ELF__ - # ifdef __GLIBC__ - # if __GLIBC__ >= 2 - LIBC=gnu - # else - LIBC=gnulibc1 - # endif - # else - LIBC=gnulibc1 - # endif - #else - #if defined(__INTEL_COMPILER) || defined(__PGI) || defined(__SUNPRO_C) || defined(__SUNPRO_CC) - LIBC=gnu - #else - LIBC=gnuaout - #endif - #endif - #ifdef __dietlibc__ - LIBC=dietlibc - #endif -EOF - eval "`$CC_FOR_BUILD -E $dummy.c 2>/dev/null | sed -n ' - /^LIBC/{ - s: ::g - p - }'`" - test x"${LIBC}" != x && { - echo "${UNAME_MACHINE}-pc-linux-${LIBC}" - exit - } - test x"${TENTATIVE}" != x && { echo "${TENTATIVE}"; exit; } - ;; i*86:DYNIX/ptx:4*:*) # ptx 4.0 does uname -s correctly, with DYNIX/ptx in there. # earlier versions are messed up and put the nodename in both @@ -1053,11 +1006,11 @@ EOF echo i386-sequent-sysv4 exit ;; i*86:UNIX_SV:4.2MP:2.*) - # Unixware is an offshoot of SVR4, but it has its own version - # number series starting with 2... - # I am not positive that other SVR4 systems won't match this, + # Unixware is an offshoot of SVR4, but it has its own version + # number series starting with 2... + # I am not positive that other SVR4 systems won't match this, # I just have to hope. -- rms. - # Use sysv4.2uw... so that sysv4* matches it. + # Use sysv4.2uw... so that sysv4* matches it. 
echo ${UNAME_MACHINE}-pc-sysv4.2uw${UNAME_VERSION} exit ;; i*86:OS/2:*:*) @@ -1074,7 +1027,7 @@ EOF i*86:syllable:*:*) echo ${UNAME_MACHINE}-pc-syllable exit ;; - i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.0*:*) + i*86:LynxOS:2.*:* | i*86:LynxOS:3.[01]*:* | i*86:LynxOS:4.[02]*:*) echo i386-unknown-lynxos${UNAME_RELEASE} exit ;; i*86:*DOS:*:*) @@ -1089,7 +1042,7 @@ EOF fi exit ;; i*86:*:5:[678]*) - # UnixWare 7.x, OpenUNIX and OpenServer 6. + # UnixWare 7.x, OpenUNIX and OpenServer 6. case `/bin/uname -X | grep "^Machine"` in *486*) UNAME_MACHINE=i486 ;; *Pentium) UNAME_MACHINE=i586 ;; @@ -1117,13 +1070,13 @@ EOF exit ;; pc:*:*:*) # Left here for compatibility: - # uname -m prints for DJGPP always 'pc', but it prints nothing about - # the processor, so we play safe by assuming i586. + # uname -m prints for DJGPP always 'pc', but it prints nothing about + # the processor, so we play safe by assuming i586. # Note: whatever this is, it MUST be the same as what config.sub # prints for the "djgpp" host, or else GDB configury will decide that # this is a cross-build. echo i586-pc-msdosdjgpp - exit ;; + exit ;; Intel:Mach:3*:*) echo i386-pc-mach3 exit ;; @@ -1158,8 +1111,8 @@ EOF /bin/uname -p 2>/dev/null | /bin/grep entium >/dev/null \ && { echo i586-ncr-sysv4.3${OS_REL}; exit; } ;; 3[34]??:*:4.0:* | 3[34]??,*:*:4.0:*) - /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ - && { echo i486-ncr-sysv4; exit; } ;; + /bin/uname -p 2>/dev/null | grep 86 >/dev/null \ + && { echo i486-ncr-sysv4; exit; } ;; NCR*:*:4.2:* | MPRAS*:*:4.2:*) OS_REL='.3' test -r /etc/.relid \ @@ -1182,7 +1135,7 @@ EOF rs6000:LynxOS:2.*:*) echo rs6000-unknown-lynxos${UNAME_RELEASE} exit ;; - PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.0*:*) + PowerPC:LynxOS:2.*:* | PowerPC:LynxOS:3.[01]*:* | PowerPC:LynxOS:4.[02]*:*) echo powerpc-unknown-lynxos${UNAME_RELEASE} exit ;; SM[BE]S:UNIX_SV:*:*) @@ -1202,10 +1155,10 @@ EOF echo ns32k-sni-sysv fi exit ;; - PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort - # says <Richard.M.Bartel@ccMail.Census.GOV> - echo i586-unisys-sysv4 - exit ;; + PENTIUM:*:4.0*:*) # Unisys `ClearPath HMP IX 4000' SVR4/MP effort + # says <Richard.M.Bartel@ccMail.Census.GOV> + echo i586-unisys-sysv4 + exit ;; *:UNIX_System_V:4*:FTX*) # From Gerald Hewes <hewes@openmarket.com>. # How about differentiating between stratus architectures? -djm @@ -1231,11 +1184,11 @@ EOF exit ;; R[34]000:*System_V*:*:* | R4000:UNIX_SYSV:*:* | R*000:UNIX_SV:*:*) if [ -d /usr/nec ]; then - echo mips-nec-sysv${UNAME_RELEASE} + echo mips-nec-sysv${UNAME_RELEASE} else - echo mips-unknown-sysv${UNAME_RELEASE} + echo mips-unknown-sysv${UNAME_RELEASE} fi - exit ;; + exit ;; BeBox:BeOS:*:*) # BeOS running on hardware made by Be, PPC only. 
echo powerpc-be-beos exit ;; @@ -1275,6 +1228,16 @@ EOF *:Darwin:*:*) UNAME_PROCESSOR=`uname -p` || UNAME_PROCESSOR=unknown case $UNAME_PROCESSOR in + i386) + eval $set_cc_for_build + if [ "$CC_FOR_BUILD" != 'no_compiler_found' ]; then + if (echo '#ifdef __LP64__'; echo IS_64BIT_ARCH; echo '#endif') | \ + (CCOPTS= $CC_FOR_BUILD -E - 2>/dev/null) | \ + grep IS_64BIT_ARCH >/dev/null + then + UNAME_PROCESSOR="x86_64" + fi + fi ;; unknown) UNAME_PROCESSOR=powerpc ;; esac echo ${UNAME_PROCESSOR}-apple-darwin${UNAME_RELEASE} @@ -1290,6 +1253,9 @@ EOF *:QNX:*:4*) echo i386-pc-qnx exit ;; + NEO-?:NONSTOP_KERNEL:*:*) + echo neo-tandem-nsk${UNAME_RELEASE} + exit ;; NSE-?:NONSTOP_KERNEL:*:*) echo nse-tandem-nsk${UNAME_RELEASE} exit ;; @@ -1335,13 +1301,13 @@ EOF echo pdp10-unknown-its exit ;; SEI:*:*:SEIUX) - echo mips-sei-seiux${UNAME_RELEASE} + echo mips-sei-seiux${UNAME_RELEASE} exit ;; *:DragonFly:*:*) echo ${UNAME_MACHINE}-unknown-dragonfly`echo ${UNAME_RELEASE}|sed -e 's/[-(].*//'` exit ;; *:*VMS:*:*) - UNAME_MACHINE=`(uname -p) 2>/dev/null` + UNAME_MACHINE=`(uname -p) 2>/dev/null` case "${UNAME_MACHINE}" in A*) echo alpha-dec-vms ; exit ;; I*) echo ia64-dec-vms ; exit ;; @@ -1359,6 +1325,9 @@ EOF i*86:AROS:*:*) echo ${UNAME_MACHINE}-pc-aros exit ;; + x86_64:VMkernel:*:*) + echo ${UNAME_MACHINE}-unknown-esx + exit ;; esac #echo '(No uname command or uname output not recognized.)' 1>&2 @@ -1381,11 +1350,11 @@ main () #include <sys/param.h> printf ("m68k-sony-newsos%s\n", #ifdef NEWSOS4 - "4" + "4" #else - "" + "" #endif - ); exit (0); + ); exit (0); #endif #endif diff --git a/storage/tokudb/PerconaFT/tools/CMakeLists.txt b/storage/tokudb/PerconaFT/tools/CMakeLists.txt index af82b4357d2..f11b9f350d7 100644 --- a/storage/tokudb/PerconaFT/tools/CMakeLists.txt +++ b/storage/tokudb/PerconaFT/tools/CMakeLists.txt @@ -1,6 +1,6 @@ set_property(DIRECTORY APPEND PROPERTY COMPILE_DEFINITIONS _GNU_SOURCE DONT_DEPRECATE_ERRNO) -set(tools tokudb_dump tokuftdump tokuft_logprint tdb-recover ftverify ba_replay) +set(tools tokudb_dump tokuftdump tokuft_logprint tdb-recover ftverify) foreach(tool ${tools}) add_executable(${tool} ${tool}.cc) add_dependencies(${tool} install_tdb_h) @@ -14,4 +14,3 @@ target_link_libraries(ftverify m) install(TARGETS tokuftdump DESTINATION ${INSTALL_BINDIR} COMPONENT Server) install(TARGETS tokuft_logprint DESTINATION ${INSTALL_BINDIR} COMPONENT Server) - diff --git a/storage/tokudb/PerconaFT/tools/ba_replay.cc b/storage/tokudb/PerconaFT/tools/ba_replay.cc deleted file mode 100644 index cade7e5dfaf..00000000000 --- a/storage/tokudb/PerconaFT/tools/ba_replay.cc +++ /dev/null @@ -1,629 +0,0 @@ -/* -*- mode: C++; c-basic-offset: 4; indent-tabs-mode: nil -*- */ -// vim: ft=cpp:expandtab:ts=8:sw=4:softtabstop=4: -#ident "$Id$" -/*====== -This file is part of PerconaFT. - - -Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. - - PerconaFT is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License, version 2, - as published by the Free Software Foundation. - - PerconaFT is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. 
- ----------------------------------------- - - PerconaFT is free software: you can redistribute it and/or modify - it under the terms of the GNU Affero General Public License, version 3, - as published by the Free Software Foundation. - - PerconaFT is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU Affero General Public License for more details. - - You should have received a copy of the GNU Affero General Public License - along with PerconaFT. If not, see <http://www.gnu.org/licenses/>. -======= */ - -#ident "Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved." - -// Replay a block allocator trace against different strategies and compare -// the results - -#include <db.h> - -#include <getopt.h> -#include <math.h> -#include <stdio.h> -#include <string.h> - -#include <map> -#include <set> -#include <string> -#include <sstream> -#include <vector> - -#include <portability/memory.h> -#include <portability/toku_assert.h> -#include <portability/toku_stdlib.h> - -#include "ft/serialize/block_allocator.h" - -using std::map; -using std::set; -using std::string; -using std::vector; - -static int verbose = false; - -static void ba_replay_assert(bool pred, const char *msg, const char *line, int line_num) { - if (!pred) { - fprintf(stderr, "%s, line (#%d): %s\n", msg, line_num, line); - abort(); - } -} - -static char *trim_whitespace(char *line) { - // skip leading whitespace - while (isspace(*line)) { - line++; - } - return line; -} - -static int64_t parse_number(char **ptr, int line_num, int base) { - *ptr = trim_whitespace(*ptr); - char *line = *ptr; - - char *new_ptr; - int64_t n = strtoll(line, &new_ptr, base); - ba_replay_assert(n >= 0, "malformed trace (bad numeric token)", line, line_num); - ba_replay_assert(new_ptr > *ptr, "malformed trace (missing numeric token)", line, line_num); - *ptr = new_ptr; - return n; -} - -static uint64_t parse_uint64(char **ptr, int line_num) { - int64_t n = parse_number(ptr, line_num, 10); - // we happen to know that the uint64's we deal with will - // take less than 63 bits (they come from pointers) - return static_cast<uint64_t>(n); -} - -static string parse_token(char **ptr, int line_num) { - *ptr = trim_whitespace(*ptr); - char *line = *ptr; - - // parse the first token, which represents the traced function - char token[64]; - int r = sscanf(*ptr, "%64s", token); - ba_replay_assert(r == 1, "malformed trace (missing string token)", line, line_num); - *ptr += strlen(token); - return string(token); -} - -static block_allocator::blockpair parse_blockpair(char **ptr, int line_num) { - *ptr = trim_whitespace(*ptr); - char *line = *ptr; - - uint64_t offset, size; - int bytes_read; - int r = sscanf(line, "[%" PRIu64 " %" PRIu64 "]%n", &offset, &size, &bytes_read); - ba_replay_assert(r == 2, "malformed trace (bad offset/size pair)", line, line_num); - *ptr += bytes_read; - return block_allocator::blockpair(offset, size); -} - -static char *strip_newline(char *line, bool *found) { - char *ptr = strchr(line, '\n'); - if (ptr != nullptr) { - if (found != nullptr) { - *found = true; - } - *ptr = '\0'; - } - return line; -} - -static char *read_trace_line(FILE *file) { - const int buf_size = 4096; - char buf[buf_size]; - std::stringstream ss; - while (true) { - if (fgets(buf, buf_size, file) == nullptr) { - break; - } - bool has_newline = false; - ss << strip_newline(buf, &has_newline); - if (has_newline) { - // end 
of the line, we're done out - break; - } - } - std::string s = ss.str(); - return s.size() ? toku_strdup(s.c_str()) : nullptr; -} - -static vector<string> canonicalize_trace_from(FILE *file) { - // new trace, canonicalized from a raw trace - vector<string> canonicalized_trace; - - // raw allocator id -> canonical allocator id - // - // keeps track of allocators that were created as part of the trace, - // and therefore will be part of the canonicalized trace. - uint64_t allocator_id_seq_num = 0; - map<uint64_t, uint64_t> allocator_ids; - - // allocated offset -> allocation seq num - // - uint64_t allocation_seq_num = 0; - static const uint64_t ASN_NONE = (uint64_t) -1; - typedef map<uint64_t, uint64_t> offset_seq_map; - - // raw allocator id -> offset_seq_map that tracks its allocations - map<uint64_t, offset_seq_map> offset_to_seq_num_maps; - - int line_num = 0; - char *line; - while ((line = read_trace_line(file)) != nullptr) { - line_num++; - char *ptr = line; - - string fn = parse_token(&ptr, line_num); - int64_t allocator_id = parse_number(&ptr, line_num, 16); - - std::stringstream ss; - if (fn.find("ba_trace_create") != string::npos) { - ba_replay_assert(allocator_ids.count(allocator_id) == 0, "corrupted trace: double create", line, line_num); - ba_replay_assert(fn == "ba_trace_create" || fn == "ba_trace_create_from_blockpairs", - "corrupted trace: bad fn", line, line_num); - - // we only convert the allocator_id to an allocator_id_seq_num - // in the canonical trace and leave the rest of the line as-is. - allocator_ids[allocator_id] = allocator_id_seq_num; - ss << fn << ' ' << allocator_id_seq_num << ' ' << trim_whitespace(ptr) << std::endl; - allocator_id_seq_num++; - - // First, read passed the reserve / alignment values. - (void) parse_uint64(&ptr, line_num); - (void) parse_uint64(&ptr, line_num); - if (fn == "ba_trace_create_from_blockpairs") { - // For each blockpair created by this traceline, add its offset to the offset seq map - // with asn ASN_NONE so that later canonicalizations of `free' know whether to write - // down the asn or the raw offset. 
- offset_seq_map *map = &offset_to_seq_num_maps[allocator_id]; - while (*trim_whitespace(ptr) != '\0') { - const block_allocator::blockpair bp = parse_blockpair(&ptr, line_num); - (*map)[bp.offset] = ASN_NONE; - } - } - } else { - ba_replay_assert(allocator_ids.count(allocator_id) > 0, "corrupted trace: unknown allocator", line, line_num); - uint64_t canonical_allocator_id = allocator_ids[allocator_id]; - - // this is the map that tracks allocations for this allocator - offset_seq_map *map = &offset_to_seq_num_maps[allocator_id]; - - if (fn == "ba_trace_alloc") { - const uint64_t size = parse_uint64(&ptr, line_num); - const uint64_t heat = parse_uint64(&ptr, line_num); - const uint64_t offset = parse_uint64(&ptr, line_num); - ba_replay_assert(map->count(offset) == 0, "corrupted trace: double alloc", line, line_num); - - // remember that an allocation at `offset' has the current alloc seq num - (*map)[offset] = allocation_seq_num; - - // translate `offset = alloc(size)' to `asn = alloc(size)' - ss << fn << ' ' << canonical_allocator_id << ' ' << size << ' ' << heat << ' ' << allocation_seq_num << std::endl; - allocation_seq_num++; - } else if (fn == "ba_trace_free") { - const uint64_t offset = parse_uint64(&ptr, line_num); - ba_replay_assert(map->count(offset) != 0, "corrupted trace: invalid free", line, line_num); - - // get the alloc seq num for an allcation that occurred at `offset' - const uint64_t asn = (*map)[offset]; - map->erase(offset); - - // if there's an asn, then a corresponding ba_trace_alloc occurred and we should - // write `free(asn)'. otherwise, the blockpair was initialized from create_from_blockpairs - // and we write the original offset. - if (asn != ASN_NONE) { - ss << "ba_trace_free_asn" << ' ' << canonical_allocator_id << ' ' << asn << std::endl; - } else { - ss << "ba_trace_free_offset" << ' ' << canonical_allocator_id << ' ' << offset << std::endl; - } - } else if (fn == "ba_trace_destroy") { - // Remove this allocator from both maps - allocator_ids.erase(allocator_id); - offset_to_seq_num_maps.erase(allocator_id); - - // translate `destroy(ptr_id) to destroy(canonical_id)' - ss << fn << ' ' << canonical_allocator_id << ' ' << std::endl; - } else { - ba_replay_assert(false, "corrupted trace: bad fn", line, line_num); - } - } - canonicalized_trace.push_back(ss.str()); - - toku_free(line); - } - - if (allocator_ids.size() != 0) { - fprintf(stderr, "warning: leaked allocators. 
this might be ok if the tracing process is still running"); - } - - return canonicalized_trace; -} - -struct streaming_variance_calculator { - int64_t n_samples; - int64_t mean; - int64_t variance; - - // math credit: AoCP, Donald Knuth, '62 - void add_sample(int64_t x) { - n_samples++; - if (n_samples == 1) { - mean = x; - variance = 0; - } else { - int64_t old_mean = mean; - mean = old_mean + ((x - old_mean) / n_samples); - variance = (((n_samples - 1) * variance) + - ((x - old_mean) * (x - mean))) / n_samples; - } - } -}; - -struct canonical_trace_stats { - uint64_t n_lines_replayed; - - uint64_t n_create; - uint64_t n_create_from_blockpairs; - uint64_t n_alloc_hot; - uint64_t n_alloc_cold; - uint64_t n_free; - uint64_t n_destroy; - - struct streaming_variance_calculator alloc_hot_bytes; - struct streaming_variance_calculator alloc_cold_bytes; - - canonical_trace_stats() { - memset(this, 0, sizeof(*this)); - } -}; - -struct fragmentation_report { - TOKU_DB_FRAGMENTATION_S beginning; - TOKU_DB_FRAGMENTATION_S end; - fragmentation_report() { - memset(this, 0, sizeof(*this)); - } - void merge(const struct fragmentation_report &src_report) { - for (int i = 0; i < 2; i++) { - TOKU_DB_FRAGMENTATION_S *dst = i == 0 ? &beginning : &end; - const TOKU_DB_FRAGMENTATION_S *src = i == 0 ? &src_report.beginning : &src_report.end; - dst->file_size_bytes += src->file_size_bytes; - dst->data_bytes += src->data_bytes; - dst->data_blocks += src->data_blocks; - dst->checkpoint_bytes_additional += src->checkpoint_bytes_additional; - dst->checkpoint_blocks_additional += src->checkpoint_blocks_additional; - dst->unused_bytes += src->unused_bytes; - dst->unused_blocks += src->unused_blocks; - dst->largest_unused_block += src->largest_unused_block; - } - } -}; - -static void replay_canonicalized_trace(const vector<string> &canonicalized_trace, - block_allocator::allocation_strategy strategy, - map<uint64_t, struct fragmentation_report> *reports, - struct canonical_trace_stats *stats) { - // maps an allocator id to its block allocator - map<uint64_t, block_allocator *> allocator_map; - - // maps allocation seq num to allocated offset - map<uint64_t, uint64_t> seq_num_to_offset; - - for (vector<string>::const_iterator it = canonicalized_trace.begin(); - it != canonicalized_trace.end(); it++) { - const int line_num = stats->n_lines_replayed++; - - char *line = toku_strdup(it->c_str()); - line = strip_newline(line, nullptr); - - char *ptr = trim_whitespace(line); - - // canonical allocator id is in base 10, not 16 - string fn = parse_token(&ptr, line_num); - int64_t allocator_id = parse_number(&ptr, line_num, 10); - - if (fn.find("ba_trace_create") != string::npos) { - const uint64_t reserve_at_beginning = parse_uint64(&ptr, line_num); - const uint64_t alignment = parse_uint64(&ptr, line_num); - ba_replay_assert(allocator_map.count(allocator_id) == 0, - "corrupted canonical trace: double create", line, line_num); - - block_allocator *ba = new block_allocator(); - if (fn == "ba_trace_create") { - ba->create(reserve_at_beginning, alignment); - stats->n_create++; - } else { - ba_replay_assert(fn == "ba_trace_create_from_blockpairs", - "corrupted canonical trace: bad create fn", line, line_num); - vector<block_allocator::blockpair> pairs; - while (*trim_whitespace(ptr) != '\0') { - const block_allocator::blockpair bp = parse_blockpair(&ptr, line_num); - pairs.push_back(bp); - } - ba->create_from_blockpairs(reserve_at_beginning, alignment, &pairs[0], pairs.size()); - stats->n_create_from_blockpairs++; - } - 
ba->set_strategy(strategy); - - TOKU_DB_FRAGMENTATION_S report; - ba->get_statistics(&report); - (*reports)[allocator_id].beginning = report; - allocator_map[allocator_id] = ba; - } else { - ba_replay_assert(allocator_map.count(allocator_id) > 0, - "corrupted canonical trace: no such allocator", line, line_num); - - block_allocator *ba = allocator_map[allocator_id]; - if (fn == "ba_trace_alloc") { - // replay an `alloc' whose result will be associated with a certain asn - const uint64_t size = parse_uint64(&ptr, line_num); - const uint64_t heat = parse_uint64(&ptr, line_num); - const uint64_t asn = parse_uint64(&ptr, line_num); - ba_replay_assert(seq_num_to_offset.count(asn) == 0, - "corrupted canonical trace: double alloc (asn in use)", line, line_num); - - uint64_t offset; - ba->alloc_block(size, heat, &offset); - seq_num_to_offset[asn] = offset; - heat ? stats->n_alloc_hot++ : stats->n_alloc_cold++; - heat ? stats->alloc_hot_bytes.add_sample(size) : stats->alloc_cold_bytes.add_sample(size); - } else if (fn == "ba_trace_free_asn") { - // replay a `free' on a block whose offset is the result of an alloc with an asn - const uint64_t asn = parse_uint64(&ptr, line_num); - ba_replay_assert(seq_num_to_offset.count(asn) == 1, - "corrupted canonical trace: double free (asn unused)", line, line_num); - - const uint64_t offset = seq_num_to_offset[asn]; - ba->free_block(offset); - seq_num_to_offset.erase(asn); - stats->n_free++; - } else if (fn == "ba_trace_free_offset") { - // replay a `free' on a block whose offset was explicitly set during a create_from_blockpairs - const uint64_t offset = parse_uint64(&ptr, line_num); - ba->free_block(offset); - stats->n_free++; - } else if (fn == "ba_trace_destroy") { - TOKU_DB_FRAGMENTATION_S report; - ba->get_statistics(&report); - ba->destroy(); - (*reports)[allocator_id].end = report; - allocator_map.erase(allocator_id); - stats->n_destroy++; - } else { - ba_replay_assert(false, "corrupted canonical trace: bad fn", line, line_num); - } - } - - toku_free(line); - } -} - -static const char *strategy_to_cstring(block_allocator::allocation_strategy strategy) { - switch (strategy) { - case block_allocator::allocation_strategy::BA_STRATEGY_FIRST_FIT: - return "first-fit"; - case block_allocator::allocation_strategy::BA_STRATEGY_BEST_FIT: - return "best-fit"; - case block_allocator::allocation_strategy::BA_STRATEGY_HEAT_ZONE: - return "heat-zone"; - case block_allocator::allocation_strategy::BA_STRATEGY_PADDED_FIT: - return "padded-fit"; - default: - abort(); - } -} - -static block_allocator::allocation_strategy cstring_to_strategy(const char *str) { - if (strcmp(str, "first-fit") == 0) { - return block_allocator::allocation_strategy::BA_STRATEGY_FIRST_FIT; - } - if (strcmp(str, "best-fit") == 0) { - return block_allocator::allocation_strategy::BA_STRATEGY_BEST_FIT; - } - if (strcmp(str, "heat-zone") == 0) { - return block_allocator::allocation_strategy::BA_STRATEGY_HEAT_ZONE; - } - if (strcmp(str, "padded-fit") != 0) { - fprintf(stderr, "bad strategy string: %s\n", str); - abort(); - } - return block_allocator::allocation_strategy::BA_STRATEGY_PADDED_FIT; -} - -static void print_result_verbose(uint64_t allocator_id, - block_allocator::allocation_strategy strategy, - const struct fragmentation_report &report) { - if (report.end.data_bytes + report.end.unused_bytes + - report.beginning.data_bytes + report.beginning.unused_bytes - < 32UL * 1024 * 1024) { - printf(" ...skipping allocator_id %" PRId64 " (total bytes < 32mb)\n", allocator_id); - return; - } - - 
printf(" allocator_id: %20" PRId64 "\n", allocator_id); - printf(" strategy: %20s\n", strategy_to_cstring(strategy)); - - for (int i = 0; i < 2; i++) { - const TOKU_DB_FRAGMENTATION_S *r = i == 0 ? &report.beginning : &report.end; - printf("%s\n", i == 0 ? "BEFORE" : "AFTER"); - - uint64_t total_bytes = r->data_bytes + r->unused_bytes; - uint64_t total_blocks = r->data_blocks + r->unused_blocks; - - // byte statistics - printf(" total bytes: %20" PRId64 "\n", total_bytes); - printf(" used bytes: %20" PRId64 " (%.3lf)\n", r->data_bytes, - static_cast<double>(r->data_bytes) / total_bytes); - printf(" unused bytes: %20" PRId64 " (%.3lf)\n", r->unused_bytes, - static_cast<double>(r->unused_bytes) / total_bytes); - - // block statistics - printf(" total blocks: %20" PRId64 "\n", total_blocks); - printf(" used blocks: %20" PRId64 " (%.3lf)\n", r->data_blocks, - static_cast<double>(r->data_blocks) / total_blocks); - printf(" unused blocks: %20" PRId64 " (%.3lf)\n", r->unused_blocks, - static_cast<double>(r->unused_blocks) / total_blocks); - - // misc - printf(" largest unused: %20" PRId64 "\n", r->largest_unused_block); - } -} - -static void print_result(uint64_t allocator_id, - block_allocator::allocation_strategy strategy, - const struct fragmentation_report &report) { - const TOKU_DB_FRAGMENTATION_S *beginning = &report.beginning; - const TOKU_DB_FRAGMENTATION_S *end = &report.end; - - uint64_t total_beginning_bytes = beginning->data_bytes + beginning->unused_bytes; - uint64_t total_end_bytes = end->data_bytes + end->unused_bytes; - if (total_end_bytes + total_beginning_bytes < 32UL * 1024 * 1024) { - if (verbose) { - printf("\n"); - printf(" ...skipping allocator_id %" PRId64 " (total bytes < 32mb)\n", allocator_id); - } - return; - } - printf("\n"); - if (verbose) { - print_result_verbose(allocator_id, strategy, report); - } else { - printf(" %-15s: allocator %" PRId64 ", %.3lf used bytes (%.3lf before)\n", - strategy_to_cstring(strategy), allocator_id, - static_cast<double>(report.end.data_bytes) / total_end_bytes, - static_cast<double>(report.beginning.data_bytes) / total_beginning_bytes); - } -} - -static int only_aggregate_reports; - -static struct option getopt_options[] = { - { "verbose", no_argument, &verbose, 1 }, - { "only-aggregate-reports", no_argument, &only_aggregate_reports, 1 }, - { "include-strategy", required_argument, nullptr, 'i' }, - { "exclude-strategy", required_argument, nullptr, 'x' }, - { nullptr, 0, nullptr, 0 }, -}; - -int main(int argc, char *argv[]) { - int opt; - set<block_allocator::allocation_strategy> candidate_strategies, excluded_strategies; - while ((opt = getopt_long(argc, argv, "", getopt_options, nullptr)) != -1) { - switch (opt) { - case 0: - break; - case 'i': - candidate_strategies.insert(cstring_to_strategy(optarg)); - break; - case 'x': - excluded_strategies.insert(cstring_to_strategy(optarg)); - break; - case '?': - default: - abort(); - }; - } - // Default to everything if nothing was explicitly included. 
- if (candidate_strategies.empty()) { - candidate_strategies.insert(block_allocator::allocation_strategy::BA_STRATEGY_FIRST_FIT); - candidate_strategies.insert(block_allocator::allocation_strategy::BA_STRATEGY_BEST_FIT); - candidate_strategies.insert(block_allocator::allocation_strategy::BA_STRATEGY_PADDED_FIT); - candidate_strategies.insert(block_allocator::allocation_strategy::BA_STRATEGY_HEAT_ZONE); - } - // ..but remove anything that was explicitly excluded - for (set<block_allocator::allocation_strategy>::const_iterator it = excluded_strategies.begin(); - it != excluded_strategies.end(); it++) { - candidate_strategies.erase(*it); - } - - // Run the real trace - // - // First, read the raw trace from stdin - vector<string> canonicalized_trace = canonicalize_trace_from(stdin); - - if (!only_aggregate_reports) { - printf("\n"); - printf("Individual reports, by allocator:\n"); - } - - struct canonical_trace_stats stats; - map<block_allocator::allocation_strategy, struct fragmentation_report> reports_by_strategy; - for (set<block_allocator::allocation_strategy>::const_iterator it = candidate_strategies.begin(); - it != candidate_strategies.end(); it++) { - const block_allocator::allocation_strategy strategy(*it); - - // replay the canonicalized trace against the current strategy. - // - // we provided the allocator map so we can gather statistics later - struct canonical_trace_stats dummy_stats; - map<uint64_t, struct fragmentation_report> reports; - replay_canonicalized_trace(canonicalized_trace, strategy, &reports, - // Only need to gather canonical trace stats once - it == candidate_strategies.begin() ? &stats : &dummy_stats); - - struct fragmentation_report aggregate_report; - memset(&aggregate_report, 0, sizeof(aggregate_report)); - for (map<uint64_t, struct fragmentation_report>::iterator rp = reports.begin(); - rp != reports.end(); rp++) { - const struct fragmentation_report &report = rp->second; - aggregate_report.merge(report); - if (!only_aggregate_reports) { - print_result(rp->first, strategy, report); - } - } - reports_by_strategy[strategy] = aggregate_report; - } - - printf("\n"); - printf("Aggregate reports, by strategy:\n"); - - for (map<block_allocator::allocation_strategy, struct fragmentation_report>::iterator it = reports_by_strategy.begin(); - it != reports_by_strategy.end(); it++) { - print_result(0, it->first, it->second); - } - - printf("\n"); - printf("Overall trace stats:\n"); - printf("\n"); - printf(" n_lines_played: %15" PRIu64 "\n", stats.n_lines_replayed); - printf(" n_create: %15" PRIu64 "\n", stats.n_create); - printf(" n_create_from_blockpairs: %15" PRIu64 "\n", stats.n_create_from_blockpairs); - printf(" n_alloc_hot: %15" PRIu64 "\n", stats.n_alloc_hot); - printf(" n_alloc_cold: %15" PRIu64 "\n", stats.n_alloc_cold); - printf(" n_free: %15" PRIu64 "\n", stats.n_free); - printf(" n_destroy: %15" PRIu64 "\n", stats.n_destroy); - printf("\n"); - printf(" avg_alloc_hot: %15" PRIu64 "\n", stats.alloc_hot_bytes.mean); - printf(" stddev_alloc_hot: %15" PRIu64 "\n", (uint64_t) sqrt(stats.alloc_hot_bytes.variance)); - printf(" avg_alloc_cold: %15" PRIu64 "\n", stats.alloc_cold_bytes.mean); - printf(" stddev_alloc_cold: %15" PRIu64 "\n", (uint64_t) sqrt(stats.alloc_cold_bytes.variance)); - printf("\n"); - - return 0; -} diff --git a/storage/tokudb/PerconaFT/tools/ftverify.cc b/storage/tokudb/PerconaFT/tools/ftverify.cc index 5920be8deda..2324249ba00 100644 --- a/storage/tokudb/PerconaFT/tools/ftverify.cc +++ b/storage/tokudb/PerconaFT/tools/ftverify.cc @@ -148,7 
+148,7 @@ deserialize_headers(int fd, struct ft **h1p, struct ft **h2p) } } { - toku_off_t header_1_off = block_allocator::BLOCK_ALLOCATOR_HEADER_RESERVE; + toku_off_t header_1_off = BlockAllocator::BLOCK_ALLOCATOR_HEADER_RESERVE; r1 = deserialize_ft_from_fd_into_rbuf( fd, header_1_off, diff --git a/storage/tokudb/PerconaFT/tools/tokuftdump.cc b/storage/tokudb/PerconaFT/tools/tokuftdump.cc index 23ef72218ac..f6d777b4161 100644 --- a/storage/tokudb/PerconaFT/tools/tokuftdump.cc +++ b/storage/tokudb/PerconaFT/tools/tokuftdump.cc @@ -192,6 +192,7 @@ static void dump_header(FT ft) { dump_descriptor(&ft->descriptor); printf(" estimated numrows=%" PRId64 "\n", ft->in_memory_stats.numrows); printf(" estimated numbytes=%" PRId64 "\n", ft->in_memory_stats.numbytes); + printf(" logical row count=%" PRId64 "\n", ft->in_memory_logical_rows); } static int64_t getRootNode(FT ft) { diff --git a/storage/tokudb/PerconaFT/util/tests/x1764-test.cc b/storage/tokudb/PerconaFT/util/tests/x1764-test.cc index 48ff28e89af..76b1d9c713e 100644 --- a/storage/tokudb/PerconaFT/util/tests/x1764-test.cc +++ b/storage/tokudb/PerconaFT/util/tests/x1764-test.cc @@ -110,7 +110,7 @@ test2 (void) { static void test3 (void) -// Compare the simple version to the highly optimized verison. +// Compare the simple version to the highly optimized version. { const int datalen = 1000; char data[datalen]; diff --git a/storage/tokudb/ha_tokudb.cc b/storage/tokudb/ha_tokudb.cc index 672ae32f80a..7e9e6100c6e 100644 --- a/storage/tokudb/ha_tokudb.cc +++ b/storage/tokudb/ha_tokudb.cc @@ -382,17 +382,17 @@ void TOKUDB_SHARE::update_row_count( pct_of_rows_changed_to_trigger = ((_rows * auto_threshold) / 100); if (_row_delta_activity >= pct_of_rows_changed_to_trigger) { char msg[200]; - snprintf( - msg, - sizeof(msg), - "TokuDB: Auto %s background analysis for %s, delta_activity " - "%llu is greater than %llu percent of %llu rows.", - tokudb::sysvars::analyze_in_background(thd) > 0 ? - "scheduling" : "running", - full_table_name(), - _row_delta_activity, - auto_threshold, - (ulonglong)(_rows)); + snprintf(msg, + sizeof(msg), + "TokuDB: Auto %s analysis for %s, delta_activity %llu is " + "greater than %llu percent of %llu rows.", + tokudb::sysvars::analyze_in_background(thd) > 0 + ? "scheduling background" + : "running foreground", + full_table_name(), + _row_delta_activity, + auto_threshold, + (ulonglong)(_rows)); // analyze_standard will unlock _mutex regardless of success/failure int ret = analyze_standard(thd, NULL); @@ -4097,7 +4097,7 @@ int ha_tokudb::write_row(uchar * record) { goto cleanup; } if (curr_num_DBs == 1) { - error = insert_row_to_main_dictionary(record,&prim_key, &row, txn); + error = insert_row_to_main_dictionary(record, &prim_key, &row, txn); if (error) { goto cleanup; } } else { error = insert_rows_to_dictionaries_mult(&prim_key, &row, txn, thd); @@ -6130,7 +6130,7 @@ int ha_tokudb::info(uint flag) { // we should always have a primary key assert_always(share->file != NULL); - error = estimate_num_rows(share->file,&num_rows, txn); + error = estimate_num_rows(share->file, &num_rows, txn); if (error == 0) { share->set_row_count(num_rows, false); stats.records = num_rows; diff --git a/storage/tokudb/ha_tokudb_admin.cc b/storage/tokudb/ha_tokudb_admin.cc index db3d6c112d4..6d8e7173c8d 100644 --- a/storage/tokudb/ha_tokudb_admin.cc +++ b/storage/tokudb/ha_tokudb_admin.cc @@ -7,7 +7,7 @@ This file is part of TokuDB Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. 
- TokuDBis is free software: you can redistribute it and/or modify + TokuDB is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2, as published by the Free Software Foundation. @@ -43,13 +43,11 @@ public: virtual ~recount_rows_t(); virtual const char* key(); - - virtual void status( - char* database, - char* table, - char* type, - char* params, - char* status); + virtual const char* database(); + virtual const char* table(); + virtual const char* type(); + virtual const char* parameters(); + virtual const char* status(); protected: virtual void on_run(); @@ -64,6 +62,8 @@ private: ulonglong _throttle; // for recount rows status reporting + char _parameters[256]; + char _status[1024]; int _result; ulonglong _recount_start; // in microseconds ulonglong _total_elapsed_time; // in microseconds @@ -78,7 +78,6 @@ private: uint64_t deleted, void* extra); int analyze_recount_rows_progress(uint64_t count, uint64_t deleted); - void get_analyze_status(char*); }; void* recount_rows_t::operator new(size_t sz) { @@ -114,10 +113,19 @@ recount_rows_t::recount_rows_t( } _throttle = tokudb::sysvars::analyze_throttle(thd); + + snprintf(_parameters, + sizeof(_parameters), + "TOKUDB_ANALYZE_THROTTLE=%llu;", + _throttle); + _status[0] = '\0'; } recount_rows_t::~recount_rows_t() { } void recount_rows_t::on_run() { + const char* orig_proc_info = NULL; + if (_thd) + orig_proc_info = tokudb_thd_get_proc_info(_thd); _recount_start = tokudb::time::microsec(); _total_elapsed_time = 0; @@ -171,6 +179,8 @@ void recount_rows_t::on_run() { _result, _share->row_count()); error: + if(_thd) + tokudb_thd_set_proc_info(_thd, orig_proc_info); return; } void recount_rows_t::on_destroy() { @@ -179,18 +189,21 @@ void recount_rows_t::on_destroy() { const char* recount_rows_t::key() { return _share->full_table_name(); } -void recount_rows_t::status( - char* database, - char* table, - char* type, - char* params, - char* status) { - - strcpy(database, _share->database_name()); - strcpy(table, _share->table_name()); - strcpy(type, "TOKUDB_ANALYZE_MODE_RECOUNT_ROWS"); - sprintf(params, "TOKUDB_ANALYZE_THROTTLE=%llu;", _throttle); - get_analyze_status(status); +const char* recount_rows_t::database() { + return _share->database_name(); +} +const char* recount_rows_t::table() { + return _share->table_name(); +} +const char* recount_rows_t::type() { + static const char* type = "TOKUDB_ANALYZE_MODE_RECOUNT_ROWS"; + return type; +} +const char* recount_rows_t::parameters() { + return _parameters; +} +const char* recount_rows_t::status() { + return _status; } int recount_rows_t::analyze_recount_rows_progress( uint64_t count, @@ -217,12 +230,32 @@ int recount_rows_t::analyze_recount_rows_progress( return ER_ABORTING_CONNECTION; } + // rebuild status + // There is a slight race condition here, + // _status is used here for tokudb_thd_set_proc_info and it is also used + // for the status column in i_s.background_job_status. + // If someone happens to be querying/building the i_s table + // at the exact same time that the status is being rebuilt here, + // the i_s table could get some garbage status. 
+ // This solution is a little heavy handed but it works, it prevents us + // from changing the status while someone might be immediately observing + // us and it prevents someone from observing us while we change the + // status + tokudb::background::_job_manager->lock(); + snprintf(_status, + sizeof(_status), + "recount_rows %s.%s counted %llu rows and %llu deleted " + "in %llu seconds.", + _share->database_name(), + _share->table_name(), + _rows, + _deleted_rows, + _total_elapsed_time / tokudb::time::MICROSECONDS); + tokudb::background::_job_manager->unlock(); + // report - if (_thd) { - char status[256]; - get_analyze_status(status); - thd_proc_info(_thd, status); - } + if (_thd) + tokudb_thd_set_proc_info(_thd, _status); // throttle // given the throttle value, lets calculate the maximum number of rows @@ -238,18 +271,6 @@ int recount_rows_t::analyze_recount_rows_progress( } return 0; } -void recount_rows_t::get_analyze_status(char* msg) { - sprintf( - msg, - "recount_rows %s.%s counted %llu rows and %llu deleted in %llu " - "seconds.", - _share->database_name(), - _share->table_name(), - _rows, - _deleted_rows, - _total_elapsed_time / tokudb::time::MICROSECONDS); -} - class standard_t : public tokudb::background::job_manager_t::job_t { public: @@ -261,13 +282,11 @@ public: virtual ~standard_t(); virtual const char* key(void); - - virtual void status( - char* database, - char* table, - char* type, - char* params, - char* status); + virtual const char* database(); + virtual const char* table(); + virtual const char* type(); + virtual const char* parameters(); + virtual const char* status(); protected: virtual void on_run(); @@ -284,6 +303,8 @@ private: double _delete_fraction; // for analyze status reporting, may also use other state + char _parameters[256]; + char _status[1024]; int _result; ulonglong _analyze_start; // in microseconds ulonglong _total_elapsed_time; // in microseconds @@ -305,7 +326,6 @@ private: uint64_t deleted_rows); bool analyze_standard_cursor_callback(uint64_t deleted_rows); - void get_analyze_status(char*); int analyze_key_progress(); int analyze_key(uint64_t* rec_per_key_part); }; @@ -351,6 +371,16 @@ standard_t::standard_t( _time_limit = tokudb::sysvars::analyze_time(thd) * tokudb::time::MICROSECONDS; _delete_fraction = tokudb::sysvars::analyze_delete_fraction(thd); + + snprintf(_parameters, + sizeof(_parameters), + "TOKUDB_ANALYZE_DELETE_FRACTION=%f; " + "TOKUDB_ANALYZE_TIME=%llu; TOKUDB_ANALYZE_THROTTLE=%llu;", + _delete_fraction, + _time_limit / tokudb::time::MICROSECONDS, + _throttle); + + _status[0] = '\0'; } standard_t::~standard_t() { } @@ -358,6 +388,10 @@ void standard_t::on_run() { DB_BTREE_STAT64 stat64; uint64_t rec_per_key_part[_share->_max_key_parts]; uint64_t total_key_parts = 0; + const char* orig_proc_info = NULL; + if (_thd) + orig_proc_info = tokudb_thd_get_proc_info(_thd); + _analyze_start = tokudb::time::microsec(); _half_time = _time_limit > 0 ? 
_time_limit/2 : 0; @@ -395,7 +429,7 @@ void standard_t::on_run() { _result = HA_ADMIN_FAILED; } if (_thd && (_result == HA_ADMIN_FAILED || - (double)_deleted_rows > + static_cast<double>(_deleted_rows) > _delete_fraction * (_rows + _deleted_rows))) { char name[256]; int namelen; @@ -460,8 +494,9 @@ cleanup: } error: + if (_thd) + tokudb_thd_set_proc_info(_thd, orig_proc_info); return; - } void standard_t::on_destroy() { _share->lock(); @@ -472,24 +507,21 @@ void standard_t::on_destroy() { const char* standard_t::key() { return _share->full_table_name(); } -void standard_t::status( - char* database, - char* table, - char* type, - char* params, - char* status) { - - strcpy(database, _share->database_name()); - strcpy(table, _share->table_name()); - strcpy(type, "TOKUDB_ANALYZE_MODE_STANDARD"); - sprintf( - params, - "TOKUDB_ANALYZE_DELETE_FRACTION=%f; " - "TOKUDB_ANALYZE_TIME=%llu; TOKUDB_ANALYZE_THROTTLE=%llu;", - _delete_fraction, - _time_limit / tokudb::time::MICROSECONDS, - _throttle); - get_analyze_status(status); +const char* standard_t::database() { + return _share->database_name(); +} +const char* standard_t::table() { + return _share->table_name(); +} +const char* standard_t::type() { + static const char* type = "TOKUDB_ANALYZE_MODE_STANDARD"; + return type; +} +const char* standard_t::parameters() { + return _parameters; +} +const char* standard_t::status() { + return _status; } bool standard_t::analyze_standard_cursor_callback( void* extra, @@ -502,41 +534,6 @@ bool standard_t::analyze_standard_cursor_callback(uint64_t deleted_rows) { _ticks += deleted_rows; return analyze_key_progress() != 0; } -void standard_t::get_analyze_status(char* msg) { - static const char* scan_direction_str[] = { - "not scanning", - "scanning forward", - "scanning backward", - "scan unknown" - }; - - const char* scan_direction = NULL; - switch (_scan_direction) { - case 0: scan_direction = scan_direction_str[0]; break; - case DB_NEXT: scan_direction = scan_direction_str[1]; break; - case DB_PREV: scan_direction = scan_direction_str[2]; break; - default: scan_direction = scan_direction_str[3]; break; - } - - float progress_rows = 0.0; - if (_share->row_count() > 0) - progress_rows = (float) _rows / (float) _share->row_count(); - float progress_time = 0.0; - if (_time_limit > 0) - progress_time = (float) _key_elapsed_time / (float) _time_limit; - sprintf( - msg, - "analyze table standard %s.%s.%s %llu of %u %.lf%% rows %.lf%% time, " - "%s", - _share->database_name(), - _share->table_name(), - _share->_key_descriptors[_current_key]._name, - _current_key, - _share->_keys, - progress_rows * 100.0, - progress_time * 100.0, - scan_direction); -} int standard_t::analyze_key_progress(void) { if (_ticks > 1000) { _ticks = 0; @@ -546,19 +543,72 @@ int standard_t::analyze_key_progress(void) { if ((_thd && thd_killed(_thd)) || cancelled()) { // client killed return ER_ABORTING_CONNECTION; - } else if(_time_limit > 0 && - (uint64_t)_key_elapsed_time > _time_limit) { + } else if (_time_limit > 0 && + static_cast<uint64_t>(_key_elapsed_time) > _time_limit) { // time limit reached return ETIME; } - // report - if (_thd) { - char status[256]; - get_analyze_status(status); - thd_proc_info(_thd, status); + // rebuild status + // There is a slight race condition here, + // _status is used here for tokudb_thd_set_proc_info and it is also used + // for the status column in i_s.background_job_status. 
+ // If someone happens to be querying/building the i_s table + // at the exact same time that the status is being rebuilt here, + // the i_s table could get some garbage status. + // This solution is a little heavy handed but it works, it prevents us + // from changing the status while someone might be immediately observing + // us and it prevents someone from observing us while we change the + // status. + static const char* scan_direction_str[] = {"not scanning", + "scanning forward", + "scanning backward", + "scan unknown"}; + + const char* scan_direction = NULL; + switch (_scan_direction) { + case 0: + scan_direction = scan_direction_str[0]; + break; + case DB_NEXT: + scan_direction = scan_direction_str[1]; + break; + case DB_PREV: + scan_direction = scan_direction_str[2]; + break; + default: + scan_direction = scan_direction_str[3]; + break; } + float progress_rows = 0.0; + if (_share->row_count() > 0) + progress_rows = static_cast<float>(_rows) / + static_cast<float>(_share->row_count()); + float progress_time = 0.0; + if (_time_limit > 0) + progress_time = static_cast<float>(_key_elapsed_time) / + static_cast<float>(_time_limit); + tokudb::background::_job_manager->lock(); + snprintf( + _status, + sizeof(_status), + "analyze table standard %s.%s.%s %llu of %u %.lf%% rows %.lf%% " + "time, %s", + _share->database_name(), + _share->table_name(), + _share->_key_descriptors[_current_key]._name, + _current_key, + _share->_keys, + progress_rows * 100.0, + progress_time * 100.0, + scan_direction); + tokudb::background::_job_manager->unlock(); + + // report + if (_thd) + tokudb_thd_set_proc_info(_thd, _status); + // throttle // given the throttle value, lets calculate the maximum number of rows // we should have seen so far in a .1 sec resolution @@ -694,6 +744,11 @@ int standard_t::analyze_key(uint64_t* rec_per_key_part) { assert_always(close_error == 0); done: + // in case we timed out (bunch of deleted records) without hitting a + // single row + if (_rows == 0) + _rows = 1; + // return cardinality for (uint64_t i = 0; i < num_key_parts; i++) { rec_per_key_part[i] = _rows / unique_rows[i]; @@ -733,7 +788,6 @@ int TOKUDB_SHARE::analyze_recount_rows(THD* thd,DB_TXN* txn) { assert_always(thd != NULL); - const char *orig_proc_info = tokudb_thd_get_proc_info(thd); int result = HA_ADMIN_OK; tokudb::analyze::recount_rows_t* job @@ -753,8 +807,6 @@ int TOKUDB_SHARE::analyze_recount_rows(THD* thd,DB_TXN* txn) { result = HA_ADMIN_FAILED; } - thd_proc_info(thd, orig_proc_info); - TOKUDB_HANDLER_DBUG_RETURN(result); } @@ -778,8 +830,6 @@ int TOKUDB_SHARE::analyze_standard(THD* thd, DB_TXN* txn) { TOKUDB_HANDLER_DBUG_RETURN(result); } - const char *orig_proc_info = tokudb_thd_get_proc_info(thd); - tokudb::analyze::standard_t* job = new tokudb::analyze::standard_t(txn == NULL ? false : true, thd, this, txn); @@ -808,8 +858,6 @@ int TOKUDB_SHARE::analyze_standard(THD* thd, DB_TXN* txn) { lock(); - thd_proc_info(thd, orig_proc_info); - TOKUDB_HANDLER_DBUG_RETURN(result); } diff --git a/storage/tokudb/hatoku_defines.h b/storage/tokudb/hatoku_defines.h index b7726a746ad..1b33e0a53e4 100644 --- a/storage/tokudb/hatoku_defines.h +++ b/storage/tokudb/hatoku_defines.h @@ -7,7 +7,7 @@ This file is part of TokuDB Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. 
- TokuDBis is free software: you can redistribute it and/or modify + TokuDB is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2, as published by the Free Software Foundation. @@ -234,9 +234,12 @@ Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. // mysql 5.6.15 removed the test macro, so we define our own #define tokudb_test(e) ((e) ? 1 : 0) -inline const char* tokudb_thd_get_proc_info(const THD *thd) { +inline const char* tokudb_thd_get_proc_info(const THD* thd) { return thd->proc_info; } +inline void tokudb_thd_set_proc_info(THD* thd, const char* proc_info) { + thd_proc_info(thd, proc_info); +} // uint3korr reads 4 bytes and valgrind reports an error, so we use this function instead inline uint tokudb_uint3korr(const uchar *a) { diff --git a/storage/tokudb/mysql-test/rpl/r/rpl_foreign_key_tokudb.result b/storage/tokudb/mysql-test/rpl/r/rpl_foreign_key_tokudb.result deleted file mode 100644 index ccfffb53976..00000000000 --- a/storage/tokudb/mysql-test/rpl/r/rpl_foreign_key_tokudb.result +++ /dev/null @@ -1,51 +0,0 @@ -include/master-slave.inc -[connection master] -CREATE TABLE t1 (a INT AUTO_INCREMENT KEY) ENGINE=TokuDB; -CREATE TABLE t2 (b INT AUTO_INCREMENT KEY, c INT, FOREIGN KEY(b) REFERENCES t1(a)) ENGINE=TokuDB; -SET FOREIGN_KEY_CHECKS=0; -INSERT INTO t1 VALUES (10); -INSERT INTO t1 VALUES (NULL),(NULL),(NULL); -INSERT INTO t2 VALUES (5,0); -INSERT INTO t2 VALUES (NULL,LAST_INSERT_ID()); -SET FOREIGN_KEY_CHECKS=1; -SELECT * FROM t1 ORDER BY a; -a -10 -11 -12 -13 -SELECT * FROM t2 ORDER BY b; -b c -5 0 -6 11 -SELECT * FROM t1 ORDER BY a; -a -10 -11 -12 -13 -SELECT * FROM t2 ORDER BY b; -b c -5 0 -6 11 -SET TIMESTAMP=1000000000; -CREATE TABLE t3 ( a INT UNIQUE ); -SET FOREIGN_KEY_CHECKS=0; -INSERT INTO t3 VALUES (1),(1); -ERROR 23000: Duplicate entry '1' for key 'a' -SET FOREIGN_KEY_CHECKS=0; -DROP TABLE IF EXISTS t1,t2,t3; -SET FOREIGN_KEY_CHECKS=1; -create table t1 (b int primary key) engine = TokuDB; -create table t2 (a int primary key, b int, foreign key (b) references t1(b)) -engine = TokuDB; -insert into t1 set b=1; -insert into t2 set a=1, b=1; -set foreign_key_checks=0; -delete from t1; -must sync w/o a problem (could not with the buggy code) -select count(*) from t1 /* must be zero */; -count(*) -0 -drop table t2,t1; -include/rpl_end.inc diff --git a/storage/tokudb/mysql-test/rpl/t/rpl_foreign_key_tokudb.test b/storage/tokudb/mysql-test/rpl/t/rpl_foreign_key_tokudb.test deleted file mode 100644 index 120ad0d5c1e..00000000000 --- a/storage/tokudb/mysql-test/rpl/t/rpl_foreign_key_tokudb.test +++ /dev/null @@ -1,3 +0,0 @@ --- source include/have_tokudb.inc -let $engine_type=TokuDB; --- source extra/rpl_tests/rpl_foreign_key.test diff --git a/storage/tokudb/mysql-test/tokudb/r/background_job_manager.result b/storage/tokudb/mysql-test/tokudb/r/background_job_manager.result index 5769ee74071..8b53f89efa3 100644 --- a/storage/tokudb/mysql-test/tokudb/r/background_job_manager.result +++ b/storage/tokudb/mysql-test/tokudb/r/background_job_manager.result @@ -25,7 +25,7 @@ TokuDB_background_job_status CREATE TEMPORARY TABLE `TokuDB_background_job_statu `scheduler` varchar(32) NOT NULL DEFAULT '', `scheduled_time` datetime NOT NULL DEFAULT '0000-00-00 00:00:00', `started_time` datetime DEFAULT NULL, - `status` varchar(256) DEFAULT NULL + `status` varchar(1024) DEFAULT NULL ) ENGINE=MEMORY DEFAULT CHARSET=utf8 create table t1 (a int not null auto_increment, b int, c int, primary key(a), key 
kb(b), key kc(c), key kabc(a,b,c), key kab(a,b), key kbc(b,c)); insert into t1(b,c) values(0,0), (1,1), (2,2), (3,3); diff --git a/storage/tokudb/mysql-test/tokudb_bugs/t/frm_store.test b/storage/tokudb/mysql-test/tokudb_bugs/t/frm_store.test index 6100d9aeec2..8b6df4966f4 100644 --- a/storage/tokudb/mysql-test/tokudb_bugs/t/frm_store.test +++ b/storage/tokudb/mysql-test/tokudb_bugs/t/frm_store.test @@ -12,33 +12,11 @@ let $MYSQLD_DATADIR= `SELECT @@datadir`; create table foo (a int, b int); create table bar (a int, key(a)); -# Write file to make mysql-test-run.pl expect the "crash", but don't start -# it until it's told to ---write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect -wait -EOF - -# Send shutdown to the connected server and give -# it 10 seconds to die before zapping it -shutdown_server 10; - +--source include/shutdown_mysqld.inc remove_file $MYSQLD_DATADIR/test/foo.frm; copy_file $MYSQLD_DATADIR/test/bar.frm $MYSQLD_DATADIR/test/foo.frm; remove_file $MYSQLD_DATADIR/test/bar.frm; - -# Write file to make mysql-test-run.pl start up the server again ---append_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect -restart -EOF - -# Turn on reconnect ---enable_reconnect - -# Call script that will poll the server waiting for it to be back online again ---source include/wait_until_connected_again.inc - -# Turn off reconnect again ---disable_reconnect +--source include/start_mysqld.inc show create table foo; show create table bar; diff --git a/storage/tokudb/mysql-test/tokudb_bugs/t/frm_store2.test b/storage/tokudb/mysql-test/tokudb_bugs/t/frm_store2.test index e1acea13ed7..53c1037b051 100644 --- a/storage/tokudb/mysql-test/tokudb_bugs/t/frm_store2.test +++ b/storage/tokudb/mysql-test/tokudb_bugs/t/frm_store2.test @@ -15,33 +15,11 @@ create table bar (a int); alter table foo drop column a; alter table bar add column b int, add column c int; -# Write file to make mysql-test-run.pl expect the "crash", but don't start -# it until it's told to ---write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect -wait -EOF - -# Send shutdown to the connected server and give -# it 10 seconds to die before zapping it -shutdown_server 10; - +--source include/shutdown_mysqld.inc remove_file $MYSQLD_DATADIR/test/foo.frm; copy_file $MYSQLD_DATADIR/test/bar.frm $MYSQLD_DATADIR/test/foo.frm; remove_file $MYSQLD_DATADIR/test/bar.frm; - -# Write file to make mysql-test-run.pl start up the server again ---append_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect -restart -EOF - -# Turn on reconnect ---enable_reconnect - -# Call script that will poll the server waiting for it to be back online again ---source include/wait_until_connected_again.inc - -# Turn off reconnect again ---disable_reconnect +--source include/start_mysqld.inc show create table foo; show create table bar; diff --git a/storage/tokudb/mysql-test/tokudb_bugs/t/frm_store3.test b/storage/tokudb/mysql-test/tokudb_bugs/t/frm_store3.test index 17a124249da..0421b8e9d26 100644 --- a/storage/tokudb/mysql-test/tokudb_bugs/t/frm_store3.test +++ b/storage/tokudb/mysql-test/tokudb_bugs/t/frm_store3.test @@ -14,33 +14,11 @@ create table bar (a bigint)engine=TokuDB; alter table foo drop index b; alter table bar add index (a); -# Write file to make mysql-test-run.pl expect the "crash", but don't start -# it until it's told to ---write_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect -wait -EOF - -# Send shutdown to the connected server and give -# it 10 seconds to die before zapping it -shutdown_server 10; - +--source include/shutdown_mysqld.inc remove_file $MYSQLD_DATADIR/test/foo.frm; 
copy_file $MYSQLD_DATADIR/test/bar.frm $MYSQLD_DATADIR/test/foo.frm; remove_file $MYSQLD_DATADIR/test/bar.frm; - -# Write file to make mysql-test-run.pl start up the server again ---append_file $MYSQLTEST_VARDIR/tmp/mysqld.1.expect -restart -EOF - -# Turn on reconnect ---enable_reconnect - -# Call script that will poll the server waiting for it to be back online again ---source include/wait_until_connected_again.inc - -# Turn off reconnect again ---disable_reconnect +--source include/start_mysqld.inc show create table foo; show create table bar; diff --git a/storage/tokudb/mysql-test/tokudb_bugs/t/tokudb_drop_part_table_668.test b/storage/tokudb/mysql-test/tokudb_bugs/t/tokudb_drop_part_table_668.test index 42dbb30058a..4c40339be5a 100644 --- a/storage/tokudb/mysql-test/tokudb_bugs/t/tokudb_drop_part_table_668.test +++ b/storage/tokudb/mysql-test/tokudb_bugs/t/tokudb_drop_part_table_668.test @@ -7,17 +7,7 @@ set default_storage_engine='tokudb'; # capture the datadir let $MYSQLD_DATADIR= `SELECT @@datadir`; -# shutdown mysqld (code stolen from mysql_plugin.test) -let $expect_file= $MYSQLTEST_VARDIR/tmp/mysqld.1.expect; -# MTR will remove this file later, but this might be too late. ---error 0,1 ---remove_file $expect_file ---write_file $expect_file -wait -EOF ---shutdown_server 10 ---source include/wait_until_disconnected.inc - +--source include/shutdown_mysqld.inc # remove all tokudb file in the datadir system mkdir $MYSQLD_DATADIR/save; system mv $MYSQLD_DATADIR/*toku* $MYSQLD_DATADIR/test $MYSQLD_DATADIR/save; @@ -25,13 +15,7 @@ system mkdir $MYSQLD_DATADIR/test; # install 6.6.8 tokudb test files system cp -r std_data/tokudb_drop_part_table_668/data/* $MYSQLD_DATADIR; - -# restart mysqld ---append_file $expect_file -restart -EOF ---enable_reconnect ---source include/wait_until_connected_again.inc +--source include/start_mysqld.inc create table tc (a int, b int, c int, primary key(a), key(b)) engine=tokudb partition by hash(a) partitions 2; @@ -45,26 +29,9 @@ select dictionary_name from information_schema.tokudb_file_map; # check that the test dir is empty list_files $MYSQLD_DATADIR/test *.frm; -# shutdown mysqld (code stolen from mysql_plugin.test) -let $expect_file= $MYSQLTEST_VARDIR/tmp/mysqld.1.expect; -# MTR will remove this file later, but this might be too late. ---error 0,1 ---remove_file $expect_file ---write_file $expect_file -wait -EOF ---shutdown_server 10 ---source include/wait_until_disconnected.inc - +--source include/shutdown_mysqld.inc # restore saved datadir system rm -rf $MYSQLD_DATADIR/*toku* $MYSQLD_DATADIR/test; system mv $MYSQLD_DATADIR/save/* $MYSQLD_DATADIR; system rmdir $MYSQLD_DATADIR/save; - -# restart mysqld ---append_file $expect_file -restart -EOF ---enable_reconnect ---source include/wait_until_connected_again.inc - +--source include/start_mysqld.inc diff --git a/storage/tokudb/mysql-test/tokudb_bugs/t/tokudb_drop_simple_table_668.test b/storage/tokudb/mysql-test/tokudb_bugs/t/tokudb_drop_simple_table_668.test index 3903c2cef9f..0340b960fa5 100644 --- a/storage/tokudb/mysql-test/tokudb_bugs/t/tokudb_drop_simple_table_668.test +++ b/storage/tokudb/mysql-test/tokudb_bugs/t/tokudb_drop_simple_table_668.test @@ -6,17 +6,7 @@ set default_storage_engine='tokudb'; # capture the datadir let $MYSQLD_DATADIR= `SELECT @@datadir`; -# shutdown mysqld (code stolen from mysql_plugin.test) -let $expect_file= $MYSQLTEST_VARDIR/tmp/mysqld.1.expect; -# MTR will remove this file later, but this might be too late. 
---error 0,1 ---remove_file $expect_file ---write_file $expect_file -wait -EOF ---shutdown_server 10 ---source include/wait_until_disconnected.inc - +--source include/shutdown_mysqld.inc # remove all tokudb file in the datadir system mkdir $MYSQLD_DATADIR/save; system mv $MYSQLD_DATADIR/*toku* $MYSQLD_DATADIR/test $MYSQLD_DATADIR/save; @@ -24,13 +14,7 @@ system mkdir $MYSQLD_DATADIR/test; # install 6.6.8 tokudb test files system cp -r std_data/tokudb_drop_simple_table_668/data/* $MYSQLD_DATADIR; - -# restart mysqld ---append_file $expect_file -restart -EOF ---enable_reconnect ---source include/wait_until_connected_again.inc +--source include/start_mysqld.inc create table tc (id int, x int, primary key(id), key(x)); @@ -46,26 +30,9 @@ select dictionary_name from information_schema.tokudb_file_map; # check that the test dir is empty list_files $MYSQLD_DATADIR/test *.frm; -# shutdown mysqld (code stolen from mysql_plugin.test) -let $expect_file= $MYSQLTEST_VARDIR/tmp/mysqld.1.expect; -# MTR will remove this file later, but this might be too late. ---error 0,1 ---remove_file $expect_file ---write_file $expect_file -wait -EOF ---shutdown_server 10 ---source include/wait_until_disconnected.inc - +--source include/shutdown_mysqld.inc # restore saved datadir system rm -rf $MYSQLD_DATADIR/*toku* $MYSQLD_DATADIR/test; system mv $MYSQLD_DATADIR/save/* $MYSQLD_DATADIR; system rmdir $MYSQLD_DATADIR/save; - -# restart mysqld ---append_file $expect_file -restart -EOF ---enable_reconnect ---source include/wait_until_connected_again.inc - +--source include/start_mysqld.inc diff --git a/storage/tokudb/tokudb_background.cc b/storage/tokudb/tokudb_background.cc index d8ef54a5972..e019e41c788 100644 --- a/storage/tokudb/tokudb_background.cc +++ b/storage/tokudb/tokudb_background.cc @@ -8,7 +8,7 @@ This file is part of TokuDB Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. - TokuDBis is free software: you can redistribute it and/or modify + TokuDB is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2, as published by the Free Software Foundation. 
@@ -68,7 +68,8 @@ void job_manager_t::destroy() { while (_background_jobs.size()) { _mutex.lock(); job_t* job = _background_jobs.front(); - cancel(job); + if (!job->cancelled()) + cancel(job); _background_jobs.pop_front(); delete job; _mutex.unlock(); @@ -148,11 +149,8 @@ bool job_manager_t::cancel_job(const char* key) { it != _background_jobs.end(); it++) { job_t* job = *it; - if (!job->cancelled() && - strcmp(job->key(), key) == 0) { - + if (!job->cancelled() && strcmp(job->key(), key) == 0) { cancel(job); - ret = true; } } @@ -162,8 +160,6 @@ bool job_manager_t::cancel_job(const char* key) { } void job_manager_t::iterate_jobs(pfn_iterate_t callback, void* extra) const { - char database[256], table[256], type[256], params[256], status[256]; - _mutex.lock(); for (jobs_t::const_iterator it = _background_jobs.begin(); @@ -171,19 +167,7 @@ void job_manager_t::iterate_jobs(pfn_iterate_t callback, void* extra) const { it++) { job_t* job = *it; if (!job->cancelled()) { - database[0] = table[0] = type[0] = params[0] = status[0] = '\0'; - job->status(database, table, type, params, status); - callback( - job->id(), - database, - table, - type, - params, - status, - job->user_scheduled(), - job->scheduled_time(), - job->started_time(), - extra); + callback(job, extra); } } @@ -233,6 +217,7 @@ void job_manager_t::run(job_t* job) { } void job_manager_t::cancel(job_t* job) { assert_debug(_mutex.is_owned_by_me()); + assert_always(!job->cancelled()); job->cancel(); } job_manager_t* _job_manager = NULL; diff --git a/storage/tokudb/tokudb_background.h b/storage/tokudb/tokudb_background.h index 3786701fd0f..29991ab325d 100644 --- a/storage/tokudb/tokudb_background.h +++ b/storage/tokudb/tokudb_background.h @@ -7,7 +7,7 @@ This file is part of TokuDB Copyright (c) 2006, 2015, Percona and/or its affiliates. All rights reserved. - TokuDBis is free software: you can redistribute it and/or modify + TokuDB is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License, version 2, as published by the Free Software Foundation. @@ -58,13 +58,20 @@ public: // (or jobs) usually used to find jobs to cancel virtual const char* key() = 0; - // method to get info for information schema, 255 chars per buffer - virtual void status( - char* database, - char* table, - char* type, - char* params, - char* status) = 0; + // method to obtain the database name the job is scheduled on + virtual const char* database() = 0; + + // method to obtain the table name the job is scheduled on + virtual const char* table() = 0; + + // method to obtain the type of job + virtual const char* type() = 0; + + // method to obtain a stringized list of job parameters + virtual const char* parameters() = 0; + + // method to obtain a sting identifying the current status of the job + virtual const char* status() = 0; inline bool running() const; @@ -99,17 +106,7 @@ public: }; // pfn for iterate callback - typedef void (*pfn_iterate_t)( - uint64_t, - const char*, - const char*, - const char*, - const char*, - const char*, - bool, - time_t, - time_t, - void*); + typedef void (*pfn_iterate_t)(class job_t*, void*); public: void* operator new(size_t sz); @@ -144,6 +141,11 @@ public: // data passed when the job was scheduled void iterate_jobs(pfn_iterate_t callback, void* extra) const; + // lock the bjm, this prevents anyone from running, cancelling or iterating + // jobs in the bjm. 
+ inline void lock(); + inline void unlock(); + private: static void* thread_func(void* v); @@ -170,6 +172,15 @@ extern job_manager_t* _job_manager; bool initialize(); bool destroy(); +inline void job_manager_t::lock() { + assert_debug(!_mutex.is_owned_by_me()); + _mutex.lock(); +} +inline void job_manager_t::unlock() { + assert_debug(_mutex.is_owned_by_me()); + _mutex.unlock(); +} + inline void job_manager_t::job_t::run() { if (!_cancelled) { _running = true; diff --git a/storage/tokudb/tokudb_information_schema.cc b/storage/tokudb/tokudb_information_schema.cc index e69a7899b45..b3d77eef2d9 100644 --- a/storage/tokudb/tokudb_information_schema.cc +++ b/storage/tokudb/tokudb_information_schema.cc @@ -1085,7 +1085,7 @@ ST_FIELD_INFO background_job_status_field_info[] = { {"scheduler", 32, MYSQL_TYPE_STRING, 0, 0, NULL, SKIP_OPEN_TABLE }, {"scheduled_time", 0, MYSQL_TYPE_DATETIME, 0, 0, NULL, SKIP_OPEN_TABLE }, {"started_time", 0, MYSQL_TYPE_DATETIME, 0, MY_I_S_MAYBE_NULL, NULL, SKIP_OPEN_TABLE }, - {"status", 256, MYSQL_TYPE_STRING, 0, MY_I_S_MAYBE_NULL, SKIP_OPEN_TABLE }, + {"status", 1024, MYSQL_TYPE_STRING, 0, MY_I_S_MAYBE_NULL, SKIP_OPEN_TABLE }, {NULL, 0, MYSQL_TYPE_NULL, 0, 0, NULL, SKIP_OPEN_TABLE} }; @@ -1095,15 +1095,7 @@ struct background_job_status_extra { }; void background_job_status_callback( - uint64_t id, - const char* database_name, - const char* table_name, - const char* type, - const char* params, - const char* status, - bool user_scheduled, - time_t scheduled_time, - time_t started_time, + tokudb::background::job_manager_t::job_t* job, void* extra) { background_job_status_extra* e = @@ -1111,24 +1103,33 @@ void background_job_status_callback( THD* thd = e->thd; TABLE* table = e->table; + const char* tmp = NULL; - table->field[0]->store(id, false); - table->field[1]->store( - database_name, - strlen(database_name), - system_charset_info); - table->field[2]->store(table_name, strlen(table_name), system_charset_info); - table->field[3]->store(type, strlen(type), system_charset_info); - table->field[4]->store(params, strlen(params), system_charset_info); - if (user_scheduled) + table->field[0]->store(job->id(), false); + + tmp = job->database(); + table->field[1]->store(tmp, strlen(tmp), system_charset_info); + + tmp = job->table(); + table->field[2]->store(tmp, strlen(tmp), system_charset_info); + + tmp = job->type(); + table->field[3]->store(tmp, strlen(tmp), system_charset_info); + + tmp = job->parameters(); + table->field[4]->store(tmp, strlen(tmp), system_charset_info); + + if (job->user_scheduled()) table->field[5]->store("USER", strlen("USER"), system_charset_info); else table->field[5]->store("AUTO", strlen("AUTO"), system_charset_info); - field_store_time_t(table->field[6], scheduled_time); - field_store_time_t(table->field[7], started_time); - if (status[0] != '\0') { - table->field[8]->store(status, strlen(status), system_charset_info); + field_store_time_t(table->field[6], job->scheduled_time()); + field_store_time_t(table->field[7], job->started_time()); + + tmp = job->status(); + if (tmp && tmp[0] != '\0') { + table->field[8]->store(tmp, strlen(tmp), system_charset_info); table->field[8]->set_notnull(); } else { table->field[8]->store(NULL, 0, system_charset_info); diff --git a/storage/xtradb/btr/btr0btr.cc b/storage/xtradb/btr/btr0btr.cc index c2a70cce7aa..bce81f95ead 100644 --- a/storage/xtradb/btr/btr0btr.cc +++ b/storage/xtradb/btr/btr0btr.cc @@ -80,7 +80,7 @@ btr_corruption_report( buf_block_get_zip_size(block), BUF_PAGE_PRINT_NO_CRASH); } - 
buf_page_print(buf_block_get_frame_fast(block), 0, 0); + buf_page_print(buf_nonnull_block_get_frame(block), 0, 0); } #ifndef UNIV_HOTBACKUP @@ -827,11 +827,12 @@ btr_height_get( /* S latches the page */ root_block = btr_root_block_get(index, RW_S_LATCH, mtr); + ut_ad(root_block); // The index must not be corrupted if (root_block) { - height = btr_page_get_level(buf_block_get_frame_fast(root_block), mtr); - + height = btr_page_get_level(buf_nonnull_block_get_frame(root_block), + mtr); /* Release the S latch on the root page. */ mtr_memo_release(mtr, root_block, MTR_MEMO_PAGE_S_FIX); #ifdef UNIV_SYNC_DEBUG @@ -2912,7 +2913,7 @@ btr_attach_half_pages( } /* Get the level of the split pages */ - level = btr_page_get_level(buf_block_get_frame_fast(block), mtr); + level = btr_page_get_level(buf_nonnull_block_get_frame(block), mtr); ut_ad(level == btr_page_get_level(buf_block_get_frame(new_block), mtr)); @@ -4289,8 +4290,10 @@ btr_discard_page( /* Decide the page which will inherit the locks */ - left_page_no = btr_page_get_prev(buf_block_get_frame_fast(block), mtr); - right_page_no = btr_page_get_next(buf_block_get_frame_fast(block), mtr); + left_page_no = btr_page_get_prev(buf_nonnull_block_get_frame(block), + mtr); + right_page_no = btr_page_get_next(buf_nonnull_block_get_frame(block), + mtr); if (left_page_no != FIL_NULL) { merge_block = btr_block_get(space, zip_size, left_page_no, diff --git a/storage/xtradb/buf/buf0flu.cc b/storage/xtradb/buf/buf0flu.cc index a5ce3f3f983..873edec62b4 100644 --- a/storage/xtradb/buf/buf0flu.cc +++ b/storage/xtradb/buf/buf0flu.cc @@ -305,6 +305,8 @@ buf_flush_init_flush_rbt(void) buf_flush_list_mutex_enter(buf_pool); + ut_ad(buf_pool->flush_rbt == NULL); + /* Create red black tree for speedy insertions in flush list. */ buf_pool->flush_rbt = rbt_create( sizeof(buf_page_t*), buf_flush_block_cmp); diff --git a/storage/xtradb/dict/dict0stats.cc b/storage/xtradb/dict/dict0stats.cc index f21fd560235..c13d4583fef 100644 --- a/storage/xtradb/dict/dict0stats.cc +++ b/storage/xtradb/dict/dict0stats.cc @@ -736,7 +736,7 @@ dict_stats_copy( if (dst_idx->type & DICT_FTS) { continue; } - dict_stats_empty_index(dst_idx); + dict_stats_empty_index(dst_idx, true); } else { continue; } diff --git a/storage/xtradb/fil/fil0fil.cc b/storage/xtradb/fil/fil0fil.cc index 81f26b27662..93df92e6e63 100644 --- a/storage/xtradb/fil/fil0fil.cc +++ b/storage/xtradb/fil/fil0fil.cc @@ -1787,6 +1787,9 @@ fil_close_all_files(void) { fil_space_t* space; + // Must check both flags as it's possible for this to be called during + // server startup with srv_track_changed_pages == true but + // srv_redo_log_thread_started == false if (srv_track_changed_pages && srv_redo_log_thread_started) os_event_wait(srv_redo_log_tracked_event); @@ -1826,6 +1829,9 @@ fil_close_log_files( { fil_space_t* space; + // Must check both flags as it's possible for this to be called during + // server startup with srv_track_changed_pages == true but + // srv_redo_log_thread_started == false if (srv_track_changed_pages && srv_redo_log_thread_started) os_event_wait(srv_redo_log_tracked_event); diff --git a/storage/xtradb/fts/fts0fts.cc b/storage/xtradb/fts/fts0fts.cc index 5e008b37b8d..0507be04412 100644 --- a/storage/xtradb/fts/fts0fts.cc +++ b/storage/xtradb/fts/fts0fts.cc @@ -265,13 +265,15 @@ FTS auxiliary INDEX table and clear the cache at the end. 
@param[in,out] sync sync state @param[in] unlock_cache whether unlock cache lock when write node @param[in] wait whether wait when a sync is in progress +@param[in] has_dict whether has dict operation lock @return DB_SUCCESS if all OK */ static dberr_t fts_sync( fts_sync_t* sync, bool unlock_cache, - bool wait); + bool wait, + bool has_dict); /****************************************************************//** Release all resources help by the words rb tree e.g., the node ilist. */ @@ -3567,7 +3569,7 @@ fts_add_doc_by_id( DBUG_EXECUTE_IF( "fts_instrument_sync_debug", - fts_sync(cache->sync, true, true); + fts_sync(cache->sync, true, true, false); ); DEBUG_SYNC_C("fts_instrument_sync_request"); @@ -4379,13 +4381,11 @@ fts_sync_index( } /** Check if index cache has been synced completely -@param[in,out] sync sync state @param[in,out] index_cache index cache @return true if index is synced, otherwise false. */ static bool fts_sync_index_check( - fts_sync_t* sync, fts_index_cache_t* index_cache) { const ib_rbt_node_t* rbt_node; @@ -4408,14 +4408,36 @@ fts_sync_index_check( return(true); } -/*********************************************************************//** -Commit the SYNC, change state of processed doc ids etc. +/** Reset synced flag in index cache when rollback +@param[in,out] index_cache index cache */ +static +void +fts_sync_index_reset( + fts_index_cache_t* index_cache) +{ + const ib_rbt_node_t* rbt_node; + + for (rbt_node = rbt_first(index_cache->words); + rbt_node != NULL; + rbt_node = rbt_next(index_cache->words, rbt_node)) { + + fts_tokenizer_word_t* word; + word = rbt_value(fts_tokenizer_word_t, rbt_node); + + fts_node_t* fts_node; + fts_node = static_cast<fts_node_t*>(ib_vector_last(word->nodes)); + + fts_node->synced = false; + } +} + +/** Commit the SYNC, change state of processed doc ids etc. +@param[in,out] sync sync state @return DB_SUCCESS if all OK */ static MY_ATTRIBUTE((nonnull, warn_unused_result)) dberr_t fts_sync_commit( -/*============*/ - fts_sync_t* sync) /*!< in: sync state */ + fts_sync_t* sync) { dberr_t error; trx_t* trx = sync->trx; @@ -4468,6 +4490,8 @@ fts_sync_commit( (double) n_nodes/ (double) elapsed_time); } + /* Avoid assertion in trx_free(). */ + trx->dict_operation_lock_mode = 0; trx_free_for_background(trx); return(error); @@ -4490,6 +4514,10 @@ fts_sync_rollback( index_cache = static_cast<fts_index_cache_t*>( ib_vector_get(cache->indexes, i)); + /* Reset synced flag so nodes will not be skipped + in the next sync, see fts_sync_write_words(). */ + fts_sync_index_reset(index_cache); + for (j = 0; fts_index_selector[j].value; ++j) { if (index_cache->ins_graph[j] != NULL) { @@ -4515,6 +4543,9 @@ fts_sync_rollback( rw_lock_x_unlock(&cache->lock); fts_sql_rollback(trx); + + /* Avoid assertion in trx_free(). */ + trx->dict_operation_lock_mode = 0; trx_free_for_background(trx); } @@ -4523,13 +4554,15 @@ FTS auxiliary INDEX table and clear the cache at the end. @param[in,out] sync sync state @param[in] unlock_cache whether unlock cache lock when write node @param[in] wait whether wait when a sync is in progress +@param[in] has_dict whether has dict operation lock @return DB_SUCCESS if all OK */ static dberr_t fts_sync( fts_sync_t* sync, bool unlock_cache, - bool wait) + bool wait, + bool has_dict) { ulint i; dberr_t error = DB_SUCCESS; @@ -4558,6 +4591,12 @@ fts_sync( DEBUG_SYNC_C("fts_sync_begin"); fts_sync_begin(sync); + /* When sync in background, we hold dict operation lock + to prevent DDL like DROP INDEX, etc. 
*/ + if (has_dict) { + sync->trx->dict_operation_lock_mode = RW_S_LATCH; + } + begin_sync: if (cache->total_size > fts_max_cache_size) { /* Avoid the case: sync never finish when @@ -4598,7 +4637,7 @@ begin_sync: ib_vector_get(cache->indexes, i)); if (index_cache->index->to_be_dropped - || fts_sync_index_check(sync, index_cache)) { + || fts_sync_index_check(index_cache)) { continue; } @@ -4613,6 +4652,7 @@ end_sync: } rw_lock_x_lock(&cache->lock); + sync->interrupted = false; sync->in_progress = false; os_event_set(sync->event); rw_lock_x_unlock(&cache->lock); @@ -4636,20 +4676,23 @@ FTS auxiliary INDEX table and clear the cache at the end. @param[in,out] table fts table @param[in] unlock_cache whether unlock cache when write node @param[in] wait whether wait for existing sync to finish +@param[in] has_dict whether has dict operation lock @return DB_SUCCESS on success, error code on failure. */ UNIV_INTERN dberr_t fts_sync_table( dict_table_t* table, bool unlock_cache, - bool wait) + bool wait, + bool has_dict) { dberr_t err = DB_SUCCESS; ut_ad(table->fts); if (!dict_table_is_discarded(table) && table->fts->cache) { - err = fts_sync(table->fts->cache->sync, unlock_cache, wait); + err = fts_sync(table->fts->cache->sync, + unlock_cache, wait, has_dict); } return(err); diff --git a/storage/xtradb/fts/fts0opt.cc b/storage/xtradb/fts/fts0opt.cc index d9f2532578e..ea937c20752 100644 --- a/storage/xtradb/fts/fts0opt.cc +++ b/storage/xtradb/fts/fts0opt.cc @@ -2986,7 +2986,7 @@ fts_optimize_sync_table( if (table) { if (dict_table_has_fts_index(table) && table->fts->cache) { - fts_sync_table(table, true, false); + fts_sync_table(table, true, false, true); } dict_table_close(table, FALSE, FALSE); diff --git a/storage/xtradb/handler/ha_innodb.cc b/storage/xtradb/handler/ha_innodb.cc index 14870659b0e..320b900d019 100644 --- a/storage/xtradb/handler/ha_innodb.cc +++ b/storage/xtradb/handler/ha_innodb.cc @@ -864,6 +864,19 @@ innobase_is_fake_change( THD* thd) __attribute__((unused)); /*!< in: MySQL thread handle of the user for whom the transaction is being committed */ +/** Get the list of foreign keys referencing a specified table +table. +@param thd The thread handle +@param path Path to the table +@param f_key_list[out] The list of foreign keys + +@return error code or zero for success */ +static +int +innobase_get_parent_fk_list( + THD* thd, + const char* path, + List<FOREIGN_KEY_INFO>* f_key_list); /******************************************************************//** Maps a MySQL trx isolation level code to the InnoDB isolation level code @@ -8398,6 +8411,7 @@ dberr_t ha_innobase::innobase_lock_autoinc(void) /*====================================*/ { + DBUG_ENTER("ha_innobase::innobase_lock_autoinc"); dberr_t error = DB_SUCCESS; ut_ad(!srv_read_only_mode); @@ -8437,6 +8451,8 @@ ha_innobase::innobase_lock_autoinc(void) /* Fall through to old style locking. 
*/ case AUTOINC_OLD_STYLE_LOCKING: + DBUG_EXECUTE_IF("die_if_autoinc_old_lock_style_used", + ut_ad(0);); error = row_lock_table_autoinc_for_mysql(prebuilt); if (error == DB_SUCCESS) { @@ -8450,7 +8466,7 @@ ha_innobase::innobase_lock_autoinc(void) ut_error; } - return(error); + DBUG_RETURN(error); } /********************************************************************//** @@ -14469,7 +14485,7 @@ ha_innobase::optimize( if (innodb_optimize_fulltext_only) { if (prebuilt->table->fts && prebuilt->table->fts->cache && !dict_table_is_discarded(prebuilt->table)) { - fts_sync_table(prebuilt->table, false, true); + fts_sync_table(prebuilt->table, false, true, false); fts_optimize_table(prebuilt->table); } return(HA_ADMIN_OK); @@ -14686,7 +14702,14 @@ ha_innobase::check( prebuilt->select_lock_type = LOCK_NONE; - if (!row_check_index_for_mysql(prebuilt, index, &n_rows)) { + bool check_result + = row_check_index_for_mysql(prebuilt, index, &n_rows); + DBUG_EXECUTE_IF( + "dict_set_index_corrupted", + if (!(index->type & DICT_CLUSTERED)) { + check_result = false; + }); + if (!check_result) { innobase_format_name( index_name, sizeof index_name, index->name, TRUE); @@ -15013,6 +15036,75 @@ get_foreign_key_info( return(pf_key_info); } +/** Get the list of foreign keys referencing a specified table +table. +@param thd The thread handle +@param path Path to the table +@param f_key_list[out] The list of foreign keys */ +static +void +fill_foreign_key_list(THD* thd, + const dict_table_t* table, + List<FOREIGN_KEY_INFO>* f_key_list) +{ + ut_ad(mutex_own(&dict_sys->mutex)); + + for (dict_foreign_set::iterator it = table->referenced_set.begin(); + it != table->referenced_set.end(); ++it) { + + dict_foreign_t* foreign = *it; + + FOREIGN_KEY_INFO* pf_key_info + = get_foreign_key_info(thd, foreign); + if (pf_key_info) { + f_key_list->push_back(pf_key_info); + } + } +} + +/** Get the list of foreign keys referencing a specified table +table. +@param thd The thread handle +@param path Path to the table +@param f_key_list[out] The list of foreign keys + +@return error code or zero for success */ +static +int +innobase_get_parent_fk_list( + THD* thd, + const char* path, + List<FOREIGN_KEY_INFO>* f_key_list) +{ + ut_a(strlen(path) <= FN_REFLEN); + char norm_name[FN_REFLEN + 1]; + normalize_table_name(norm_name, path); + + trx_t* parent_trx = check_trx_exists(thd); + parent_trx->op_info = "getting list of referencing foreign keys"; + trx_search_latch_release_if_reserved(parent_trx); + + mutex_enter(&dict_sys->mutex); + + dict_table_t* table + = dict_table_open_on_name(norm_name, TRUE, FALSE, + static_cast<dict_err_ignore_t>( + DICT_ERR_IGNORE_INDEX_ROOT + | DICT_ERR_IGNORE_CORRUPT)); + if (!table) { + mutex_exit(&dict_sys->mutex); + return(HA_ERR_NO_SUCH_TABLE); + } + + fill_foreign_key_list(thd, table, f_key_list); + + dict_table_close(table, TRUE, FALSE); + + mutex_exit(&dict_sys->mutex); + parent_trx->op_info = ""; + return(0); +} + /*******************************************************************//** Gets the list of foreign keys in this table. 
@return always 0, that is, always succeeds */ @@ -15065,9 +15157,6 @@ ha_innobase::get_parent_foreign_key_list( THD* thd, /*!< in: user thread handle */ List<FOREIGN_KEY_INFO>* f_key_list) /*!< out: foreign key list */ { - FOREIGN_KEY_INFO* pf_key_info; - dict_foreign_t* foreign; - ut_a(prebuilt != NULL); update_thd(ha_thd()); @@ -15076,20 +15165,7 @@ ha_innobase::get_parent_foreign_key_list( trx_search_latch_release_if_reserved(prebuilt->trx); mutex_enter(&(dict_sys->mutex)); - - for (dict_foreign_set::iterator it - = prebuilt->table->referenced_set.begin(); - it != prebuilt->table->referenced_set.end(); - ++it) { - - foreign = *it; - - pf_key_info = get_foreign_key_info(thd, foreign); - if (pf_key_info) { - f_key_list->push_back(pf_key_info); - } - } - + fill_foreign_key_list(thd, prebuilt->table, f_key_list); mutex_exit(&(dict_sys->mutex)); prebuilt->trx->op_info = ""; @@ -18892,7 +18968,6 @@ innodb_track_changed_pages_validate( for update function */ struct st_mysql_value* value) /*!< in: incoming bool */ { - static bool enabled_on_startup = false; long long intbuf = 0; if (value->val_int(value, &intbuf)) { @@ -18900,8 +18975,7 @@ innodb_track_changed_pages_validate( return 1; } - if (srv_track_changed_pages || enabled_on_startup) { - enabled_on_startup = true; + if (srv_redo_log_thread_started) { *reinterpret_cast<ulong*>(save) = static_cast<ulong>(intbuf); return 0; diff --git a/storage/xtradb/handler/i_s.cc b/storage/xtradb/handler/i_s.cc index d0e26f1352c..d96ff377b4a 100644 --- a/storage/xtradb/handler/i_s.cc +++ b/storage/xtradb/handler/i_s.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2007, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2007, 2016, Oracle and/or its affiliates. Copyrigth (c) 2014, 2016, MariaDB Corporation This program is free software; you can redistribute it and/or modify it under @@ -2935,15 +2935,26 @@ i_s_fts_deleted_generic_fill( DBUG_RETURN(0); } - deleted = fts_doc_ids_create(); + /* Prevent DDL to drop fts aux tables. */ + rw_lock_s_lock(&dict_operation_lock); user_table = dict_table_open_on_name( fts_internal_tbl_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE); if (!user_table) { + rw_lock_s_unlock(&dict_operation_lock); + + DBUG_RETURN(0); + } else if (!dict_table_has_fts_index(user_table)) { + dict_table_close(user_table, FALSE, FALSE); + + rw_lock_s_unlock(&dict_operation_lock); + DBUG_RETURN(0); } + deleted = fts_doc_ids_create(); + trx = trx_allocate_for_background(); trx->op_info = "Select for FTS DELETE TABLE"; @@ -2971,6 +2982,8 @@ i_s_fts_deleted_generic_fill( dict_table_close(user_table, FALSE, FALSE); + rw_lock_s_unlock(&dict_operation_lock); + DBUG_RETURN(0); } @@ -3342,6 +3355,12 @@ i_s_fts_index_cache_fill( DBUG_RETURN(0); } + if (user_table->fts == NULL || user_table->fts->cache == NULL) { + dict_table_close(user_table, FALSE, FALSE); + + DBUG_RETURN(0); + } + cache = user_table->fts->cache; ut_a(cache); @@ -3775,10 +3794,15 @@ i_s_fts_index_table_fill( DBUG_RETURN(0); } + /* Prevent DDL to drop fts aux tables. 
*/ + rw_lock_s_lock(&dict_operation_lock); + user_table = dict_table_open_on_name( fts_internal_tbl_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE); if (!user_table) { + rw_lock_s_unlock(&dict_operation_lock); + DBUG_RETURN(0); } @@ -3791,6 +3815,8 @@ i_s_fts_index_table_fill( dict_table_close(user_table, FALSE, FALSE); + rw_lock_s_unlock(&dict_operation_lock); + DBUG_RETURN(0); } @@ -3925,14 +3951,21 @@ i_s_fts_config_fill( fields = table->field; + /* Prevent DDL to drop fts aux tables. */ + rw_lock_s_lock(&dict_operation_lock); + user_table = dict_table_open_on_name( fts_internal_tbl_name, FALSE, FALSE, DICT_ERR_IGNORE_NONE); if (!user_table) { + rw_lock_s_unlock(&dict_operation_lock); + DBUG_RETURN(0); } else if (!dict_table_has_fts_index(user_table)) { dict_table_close(user_table, FALSE, FALSE); + rw_lock_s_unlock(&dict_operation_lock); + DBUG_RETURN(0); } @@ -3988,6 +4021,8 @@ i_s_fts_config_fill( dict_table_close(user_table, FALSE, FALSE); + rw_lock_s_unlock(&dict_operation_lock); + DBUG_RETURN(0); } diff --git a/storage/xtradb/include/buf0buf.h b/storage/xtradb/include/buf0buf.h index f599997be02..6924481af49 100644 --- a/storage/xtradb/include/buf0buf.h +++ b/storage/xtradb/include/buf0buf.h @@ -1084,10 +1084,20 @@ buf_block_get_frame( /*================*/ const buf_block_t* block) /*!< in: pointer to the control block */ MY_ATTRIBUTE((pure)); -# define buf_block_get_frame_fast(block) buf_block_get_frame(block) + +/*********************************************************************//** +Gets a pointer to the memory frame of a block, where block is known not to be +NULL. +@return pointer to the frame */ +UNIV_INLINE +buf_frame_t* +buf_nonnull_block_get_frame( + const buf_block_t* block) /*!< in: pointer to the control block */ + MY_ATTRIBUTE((pure)); + #else /* UNIV_DEBUG */ # define buf_block_get_frame(block) (block ? (block)->frame : 0) -# define buf_block_get_frame_fast(block) (block)->frame +# define buf_nonnull_block_get_frame(block) ((block)->frame) #endif /* UNIV_DEBUG */ /*********************************************************************//** Gets the space id of a block. diff --git a/storage/xtradb/include/buf0buf.ic b/storage/xtradb/include/buf0buf.ic index 7b1c66f2a05..20721b28ef2 100644 --- a/storage/xtradb/include/buf0buf.ic +++ b/storage/xtradb/include/buf0buf.ic @@ -744,6 +744,19 @@ buf_block_get_frame( SRV_CORRUPT_TABLE_CHECK(block, return(0);); + return(buf_nonnull_block_get_frame(block)); +} + +/*********************************************************************//** +Gets a pointer to the memory frame of a block, where block is known not to be +NULL. +@return pointer to the frame */ +UNIV_INLINE +buf_frame_t* +buf_nonnull_block_get_frame( +/*========================*/ + const buf_block_t* block) /*!< in: pointer to the control block */ +{ switch (buf_block_get_state(block)) { case BUF_BLOCK_POOL_WATCH: case BUF_BLOCK_ZIP_PAGE: @@ -768,6 +781,7 @@ buf_block_get_frame( ok: return((buf_frame_t*) block->frame); } + #endif /* UNIV_DEBUG */ /*********************************************************************//** diff --git a/storage/xtradb/include/fts0fts.h b/storage/xtradb/include/fts0fts.h index 68d4d333245..87b5787d416 100644 --- a/storage/xtradb/include/fts0fts.h +++ b/storage/xtradb/include/fts0fts.h @@ -840,13 +840,15 @@ FTS auxiliary INDEX table and clear the cache at the end. 
@param[in,out] table fts table @param[in] unlock_cache whether unlock cache when write node @param[in] wait whether wait for existing sync to finish +@param[in] has_dict whether has dict operation lock @return DB_SUCCESS on success, error code on failure. */ UNIV_INTERN dberr_t fts_sync_table( dict_table_t* table, bool unlock_cache, - bool wait); + bool wait, + bool has_dict); /****************************************************************//** Free the query graph but check whether dict_sys->mutex is already diff --git a/storage/xtradb/include/srv0srv.h b/storage/xtradb/include/srv0srv.h index d95adf00814..f60cfde1264 100644 --- a/storage/xtradb/include/srv0srv.h +++ b/storage/xtradb/include/srv0srv.h @@ -225,8 +225,10 @@ extern os_event_t srv_checkpoint_completed_event; log tracking iteration */ extern os_event_t srv_redo_log_tracked_event; -/** srv_redo_log_follow_thread spawn flag */ -extern bool srv_redo_log_thread_started; +/** Whether the redo log tracker thread has been started. Does not take into +account whether the tracking is currently enabled (see srv_track_changed_pages +for that) */ +extern bool srv_redo_log_thread_started; /* If the last data file is auto-extended, we add this many pages to it at a time */ @@ -344,6 +346,10 @@ extern char** srv_data_file_names; extern ulint* srv_data_file_sizes; extern ulint* srv_data_file_is_raw_partition; + +/** Whether the redo log tracking is currently enabled. Note that it is +possible for the log tracker thread to be running and the tracking to be +disabled */ extern my_bool srv_track_changed_pages; extern ulonglong srv_max_bitmap_file_size; diff --git a/storage/xtradb/include/univ.i b/storage/xtradb/include/univ.i index 5320776c042..a42b8b8bc25 100644 --- a/storage/xtradb/include/univ.i +++ b/storage/xtradb/include/univ.i @@ -45,10 +45,10 @@ Created 1/20/1994 Heikki Tuuri #define INNODB_VERSION_MAJOR 5 #define INNODB_VERSION_MINOR 6 -#define INNODB_VERSION_BUGFIX 31 +#define INNODB_VERSION_BUGFIX 32 #ifndef PERCONA_INNODB_VERSION -#define PERCONA_INNODB_VERSION 77.0 +#define PERCONA_INNODB_VERSION 78.1 #endif /* Enable UNIV_LOG_ARCHIVE in XtraDB */ diff --git a/storage/xtradb/log/log0log.cc b/storage/xtradb/log/log0log.cc index 0b5d27b8fd1..411fed91ac5 100644 --- a/storage/xtradb/log/log0log.cc +++ b/storage/xtradb/log/log0log.cc @@ -3752,7 +3752,7 @@ loop: /* Wake the log tracking thread which will then immediatelly quit because of srv_shutdown_state value */ - if (srv_track_changed_pages) { + if (srv_redo_log_thread_started) { os_event_reset(srv_redo_log_tracked_event); os_event_set(srv_checkpoint_completed_event); } @@ -3831,7 +3831,7 @@ loop: srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE; /* Signal the log following thread to quit */ - if (srv_track_changed_pages) { + if (srv_redo_log_thread_started) { os_event_reset(srv_redo_log_tracked_event); os_event_set(srv_checkpoint_completed_event); } diff --git a/storage/xtradb/log/log0online.cc b/storage/xtradb/log/log0online.cc index 63f1ef39568..167d46e2ae8 100644 --- a/storage/xtradb/log/log0online.cc +++ b/storage/xtradb/log/log0online.cc @@ -1788,20 +1788,20 @@ log_online_purge_changed_page_bitmaps( lsn = LSN_MAX; } - if (srv_track_changed_pages) { + if (srv_redo_log_thread_started) { /* User requests might happen with both enabled and disabled tracking */ mutex_enter(&log_bmp_sys->mutex); } if (!log_online_setup_bitmap_file_range(&bitmap_files, 0, LSN_MAX)) { - if (srv_track_changed_pages) { + if (srv_redo_log_thread_started) { mutex_exit(&log_bmp_sys->mutex); } return TRUE; } - 
if (srv_track_changed_pages && lsn > log_bmp_sys->end_lsn) { + if (srv_redo_log_thread_started && lsn > log_bmp_sys->end_lsn) { /* If we have to delete the current output file, close it first. */ os_file_close(log_bmp_sys->out.file); @@ -1834,7 +1834,7 @@ log_online_purge_changed_page_bitmaps( } } - if (srv_track_changed_pages) { + if (srv_redo_log_thread_started) { if (lsn > log_bmp_sys->end_lsn) { lsn_t new_file_lsn; if (lsn == LSN_MAX) { @@ -1845,9 +1845,7 @@ log_online_purge_changed_page_bitmaps( new_file_lsn = log_bmp_sys->end_lsn; } if (!log_online_rotate_bitmap_file(new_file_lsn)) { - /* If file create failed, signal the log - tracking thread to quit next time it wakes - up. */ + /* If file create failed, stop log tracking */ srv_track_changed_pages = FALSE; } } diff --git a/storage/xtradb/log/log0recv.cc b/storage/xtradb/log/log0recv.cc index 759687e3fe5..092c2ed88dc 100644 --- a/storage/xtradb/log/log0recv.cc +++ b/storage/xtradb/log/log0recv.cc @@ -392,12 +392,6 @@ recv_sys_init( } #ifndef UNIV_HOTBACKUP - /* Initialize red-black tree for fast insertions into the - flush_list during recovery process. - As this initialization is done while holding the buffer pool - mutex we perform it before acquiring recv_sys->mutex. */ - buf_flush_init_flush_rbt(); - mutex_enter(&(recv_sys->mutex)); recv_sys->heap = mem_heap_create_typed(256, @@ -490,9 +484,6 @@ recv_sys_debug_free(void) recv_sys->last_block_buf_start = NULL; mutex_exit(&(recv_sys->mutex)); - - /* Free up the flush_rbt. */ - buf_flush_free_flush_rbt(); } # endif /* UNIV_LOG_DEBUG */ @@ -3140,6 +3131,11 @@ recv_recovery_from_checkpoint_start_func( byte* log_hdr_buf_base = reinterpret_cast<byte *> (alloca(LOG_FILE_HDR_SIZE + OS_FILE_LOG_BLOCK_SIZE)); dberr_t err; + + /* Initialize red-black tree for fast insertions into the + flush_list during recovery process. */ + buf_flush_init_flush_rbt(); + ut_when_dtor<recv_dblwr_t> tmp(recv_sys->dblwr); log_hdr_buf = static_cast<byte *> @@ -3568,6 +3564,9 @@ recv_recovery_from_checkpoint_finish(void) #ifndef UNIV_LOG_DEBUG recv_sys_debug_free(); #endif + /* Free up the flush_rbt. */ + buf_flush_free_flush_rbt(); + /* Roll back any recovered data dictionary transactions, so that the data dictionary tables will be free of any locks. The data dictionary latch should guarantee that there is at diff --git a/storage/xtradb/row/row0merge.cc b/storage/xtradb/row/row0merge.cc index f5967ede3e7..3d7a5d2ef5d 100644 --- a/storage/xtradb/row/row0merge.cc +++ b/storage/xtradb/row/row0merge.cc @@ -2177,7 +2177,7 @@ wait_again: /* Sync fts cache for other fts indexes to keep all fts indexes consistent in sync_doc_id. */ err = fts_sync_table(const_cast<dict_table_t*>(new_table), - false, true); + false, true, false); if (err == DB_SUCCESS) { fts_update_next_doc_id( diff --git a/storage/xtradb/srv/srv0mon.cc b/storage/xtradb/srv/srv0mon.cc index 1e0d21d4a9e..7c2e549e188 100644 --- a/storage/xtradb/srv/srv0mon.cc +++ b/storage/xtradb/srv/srv0mon.cc @@ -1,6 +1,6 @@ /***************************************************************************** -Copyright (c) 2010, 2014, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2010, 2016, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2012, Facebook Inc. Copyright (c) 2013, 2016, MariaDB Corporation. 
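For reference while reading the FTS hunks above: fts_sync() and fts_sync_table() now take a fourth has_dict argument stating whether the caller holds the dict operation lock, and every call site in this diff passes it explicitly. A minimal sketch of the resulting calling convention follows; the helper name is invented, and only the declarations visible in this diff (storage/xtradb/include/fts0fts.h) are assumed.

#include "fts0fts.h"

/* Hypothetical helper, not part of the patch: it mirrors the two call
   patterns changed in this diff.  The background optimizer
   (fts_optimize_sync_table) passes unlock_cache=true, wait=false,
   has_dict=true because background sync runs with the dict operation
   lock held, which fts_sync() records in
   sync->trx->dict_operation_lock_mode; the foreground callers
   (ha_innobase::optimize, row0merge.cc) pass false, true, false. */
static
dberr_t
fts_sync_table_sketch(
	dict_table_t*	table,
	bool		in_background)
{
	return(fts_sync_table(table,
			      /* unlock_cache */ in_background,
			      /* wait */ !in_background,
			      /* has_dict */ in_background));
}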
@@ -1499,7 +1499,10 @@ srv_mon_set_module_control( module */ set_current_module = FALSE; } else if (module_id == MONITOR_ALL_COUNTER) { - continue; + if (!(innodb_counter_info[ix].monitor_type + & MONITOR_GROUP_MODULE)) { + continue; + } } else { /* Hitting the next module, stop */ break; diff --git a/storage/xtradb/srv/srv0srv.cc b/storage/xtradb/srv/srv0srv.cc index a836442eb70..f9c75ffe576 100644 --- a/storage/xtradb/srv/srv0srv.cc +++ b/storage/xtradb/srv/srv0srv.cc @@ -219,6 +219,9 @@ UNIV_INTERN char** srv_data_file_names = NULL; /* size in database pages */ UNIV_INTERN ulint* srv_data_file_sizes = NULL; +/** Whether the redo log tracking is currently enabled. Note that it is +possible for the log tracker thread to be running and the tracking to be +disabled */ UNIV_INTERN my_bool srv_track_changed_pages = FALSE; UNIV_INTERN ulonglong srv_max_bitmap_file_size = 100 * 1024 * 1024; @@ -848,6 +851,9 @@ UNIV_INTERN os_event_t srv_checkpoint_completed_event; UNIV_INTERN os_event_t srv_redo_log_tracked_event; +/** Whether the redo log tracker thread has been started. Does not take into +account whether the tracking is currently enabled (see srv_track_changed_pages +for that) */ UNIV_INTERN bool srv_redo_log_thread_started = false; /*********************************************************************//** @@ -2546,13 +2552,8 @@ DECLARE_THREAD(srv_redo_log_follow_thread)( os_event_wait(srv_checkpoint_completed_event); os_event_reset(srv_checkpoint_completed_event); -#ifdef UNIV_DEBUG - if (!srv_track_changed_pages) { - continue; - } -#endif - - if (srv_shutdown_state < SRV_SHUTDOWN_LAST_PHASE) { + if (srv_track_changed_pages + && srv_shutdown_state < SRV_SHUTDOWN_LAST_PHASE) { if (!log_online_follow_redo_log()) { /* TODO: sync with I_S log tracking status? */ ib_logf(IB_LOG_LEVEL_ERROR, diff --git a/strings/ctype-ucs2.c b/strings/ctype-ucs2.c index cae85f38c12..f2bcf69bbc6 100644 --- a/strings/ctype-ucs2.c +++ b/strings/ctype-ucs2.c @@ -1,5 +1,5 @@ /* Copyright (c) 2003, 2013, Oracle and/or its affiliates - Copyright (c) 2009, 2014, SkySQL Ab. + Copyright (c) 2009, 2016, MariaDB This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public diff --git a/strings/ctype-utf8.c b/strings/ctype-utf8.c index e4eb2832dff..900e2d3500a 100644 --- a/strings/ctype-utf8.c +++ b/strings/ctype-utf8.c @@ -1,5 +1,5 @@ /* Copyright (c) 2000, 2013, Oracle and/or its affiliates. - Copyright (c) 2009, 2013, Monty Program Ab + Copyright (c) 2009, 2016, MariaDB This library is free software; you can redistribute it and/or modify it under the terms of the GNU Library General Public diff --git a/support-files/mysql.server.sh b/support-files/mysql.server.sh index e5cdbfd3ce8..54f9ff55e3b 100644 --- a/support-files/mysql.server.sh +++ b/support-files/mysql.server.sh @@ -308,7 +308,7 @@ case "$mode" in then # Give extra arguments to mysqld with the my.cnf file. This script # may be overwritten at next upgrade. - $bindir/mysqld_safe --datadir="$datadir" --pid-file="$mysqld_pid_file_path" "$@" >/dev/null 2>&1 & + $bindir/mysqld_safe --datadir="$datadir" --pid-file="$mysqld_pid_file_path" "$@" >/dev/null & wait_for_ready; return_value=$? 
# Make lock for RedHat / SuSE diff --git a/tests/async_queries.c b/tests/async_queries.c index 76e884e6a69..a8889fc8d5a 100644 --- a/tests/async_queries.c +++ b/tests/async_queries.c @@ -425,7 +425,7 @@ main(int argc, char *argv[]) event_dispatch(); - free(sds); + my_free(sds); mysql_library_end(); diff --git a/win/packaging/CMakeLists.txt b/win/packaging/CMakeLists.txt index 0535a486d57..1682bae6986 100644 --- a/win/packaging/CMakeLists.txt +++ b/win/packaging/CMakeLists.txt @@ -24,10 +24,13 @@ ENDIF() SET(MANUFACTURER "MariaDB Corporation Ab") -FIND_PATH(WIX_DIR heat.exe - "$ENV{ProgramFiles}/WiX Toolset v3.9/bin" - "$ENV{ProgramFiles}/WiX Toolset v3.10/bin" -) +SET(WIX_BIN_PATHS) +FOREACH(WIX_VER 3.9 3.10 3.11) + LIST(APPEND WIX_BIN_PATHS "$ENV{ProgramFiles}/WiX Toolset v${WIX_VER}/bin") + LIST(APPEND WIX_BIN_PATHS "$ENV{ProgramFiles} (x86)/WiX Toolset v${WIX_VER}/bin") +ENDFOREACH() + +FIND_PATH(WIX_DIR heat.exe ${WIX_BIN_PATHS}) SET(CPACK_WIX_PACKAGE_BASE_NAME "MariaDB") IF(CMAKE_SIZEOF_VOID_P EQUAL 4) SET(CPACK_WIX_UPGRADE_CODE "49EB7A6A-1CEF-4A1E-9E89-B9A4993963E3") diff --git a/win/packaging/create_msi.cmake.in b/win/packaging/create_msi.cmake.in index c2ab648a6db..1f847a39695 100644 --- a/win/packaging/create_msi.cmake.in +++ b/win/packaging/create_msi.cmake.in @@ -434,6 +434,7 @@ EXECUTE_PROCESS( IF(SIGNCODE) EXECUTE_PROCESS( COMMAND ${SIGNTOOL_EXECUTABLE} sign ${SIGNTOOL_PARAMETERS} + /d ${CPACK_PACKAGE_FILE_NAME}.msi ${CPACK_PACKAGE_FILE_NAME}.msi ) ENDIF()
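Earlier in this diff, tokudb_background.h replaces the nine-argument iterate callback and the buffer-filling job_t::status() with per-field accessors and a pfn_iterate_t that receives the job_t itself. A hypothetical callback under the new signature, shown only to make that interface change concrete (the namespace, accessors and _job_manager pointer come from the diff; everything else, including the output format, is invented):

#include <stdio.h>
#include "tokudb_background.h"

/* Illustration only: dump each live background job, consuming the new
   job_t accessors the same way background_job_status_callback in
   tokudb_information_schema.cc does. */
static void dump_background_job(
    tokudb::background::job_manager_t::job_t* job,
    void* extra) {
    FILE* out = static_cast<FILE*>(extra);
    /* status() may be NULL or empty; the I_S code guards it the same way */
    const char* status = job->status();
    fprintf(out,
            "job %llu %s.%s type=%s params=%s status=%s\n",
            (unsigned long long)job->id(),
            job->database(),
            job->table(),
            job->type(),
            job->parameters(),
            status && status[0] ? status : "(none)");
}

/* Usage, mirroring how the information_schema table drives its callback:
   tokudb::background::_job_manager->iterate_jobs(dump_background_job, stderr);
*/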