diff options
94 files changed, 4102 insertions, 7045 deletions
diff --git a/mysql-test/suite/encryption/r/innodb-redo-badkey.result b/mysql-test/suite/encryption/r/innodb-redo-badkey.result index f90e7aeb780..87377a01479 100644 --- a/mysql-test/suite/encryption/r/innodb-redo-badkey.result +++ b/mysql-test/suite/encryption/r/innodb-redo-badkey.result @@ -1,10 +1,10 @@ call mtr.add_suppression("Plugin 'file_key_management'"); call mtr.add_suppression("Plugin 'InnoDB' init function returned error."); -call mtr.add_suppression("InnoDB: The page \\[page id: space=[1-9][0-9]*, page number=[0-9]+\\] in file '.*test/t[1-4]\\.ibd' cannot be decrypted"); +call mtr.add_suppression("InnoDB: The page \\[page id: space=[1-9][0-9]*, page number=[1-9][0-9]*\\] in file '.*test.t[1-4]\\.ibd' cannot be decrypted"); call mtr.add_suppression("failed to read or decrypt \\[page id: space=[1-9][0-9]*, page number=[1-9][0-9]*\\]"); call mtr.add_suppression("InnoDB: Unable to decompress .*.test.t[12]\\.ibd\\[page id: space=[1-9][0-9]*, page number=[0-9]+\\]"); -call mtr.add_suppression("InnoDB: Database page corruption on disk or a failed file read of tablespace test/t[12] page \\[page id: space=[1-9][0-9]*, page number=[0-9]*\\]"); -call mtr.add_suppression("InnoDB: Failed to read file '.*' at offset .*"); +call mtr.add_suppression("InnoDB: Database page corruption on disk or a failed read of file '.*test.t[12]\\.ibd'"); +call mtr.add_suppression("InnoDB: Failed to read page .* from file '.*'"); call mtr.add_suppression("InnoDB: Plugin initialization aborted"); call mtr.add_suppression("Plugin 'InnoDB' registration as a STORAGE ENGINE failed"); call mtr.add_suppression("\\[ERROR\\] InnoDB: Cannot decrypt \\[page id: space="); diff --git a/mysql-test/suite/encryption/t/corrupted_during_recovery.test b/mysql-test/suite/encryption/t/corrupted_during_recovery.test index 7e13168e759..48445ccb08b 100644 --- a/mysql-test/suite/encryption/t/corrupted_during_recovery.test +++ b/mysql-test/suite/encryption/t/corrupted_during_recovery.test @@ -5,8 +5,8 @@ call 
mtr.add_suppression("InnoDB: Plugin initialization aborted"); call mtr.add_suppression("Plugin 'InnoDB' init function returned error"); call mtr.add_suppression("Plugin 'InnoDB' registration as a STORAGE ENGINE failed"); -call mtr.add_suppression("InnoDB: Database page corruption on disk or a failed file read of tablespace test/t1 page"); -call mtr.add_suppression("InnoDB: Failed to read file '.*test.t1\\.ibd' at offset 3: Table is encrypted but decrypt failed"); +call mtr.add_suppression("InnoDB: Database page corruption on disk or a failed read of file '.*test.t1\\.ibd' page"); +call mtr.add_suppression("InnoDB: Failed to read page [123] from file '.*test.t1\\.ibd': Table is encrypted but decrypt failed"); call mtr.add_suppression("InnoDB: The page \\[page id: space=\\d+, page number=3\\] in file '.*test.t1\\.ibd' cannot be decrypted"); call mtr.add_suppression("InnoDB: Table in tablespace \\d+ encrypted. However key management plugin or used key_version \\d+ is not found or used encryption algorithm or method does not match. 
Can't continue opening the table."); --enable_query_log diff --git a/mysql-test/suite/encryption/t/innodb-redo-badkey.test b/mysql-test/suite/encryption/t/innodb-redo-badkey.test index 50f81deb462..09ad7a7d5a3 100644 --- a/mysql-test/suite/encryption/t/innodb-redo-badkey.test +++ b/mysql-test/suite/encryption/t/innodb-redo-badkey.test @@ -9,11 +9,11 @@ call mtr.add_suppression("Plugin 'file_key_management'"); call mtr.add_suppression("Plugin 'InnoDB' init function returned error."); -call mtr.add_suppression("InnoDB: The page \\[page id: space=[1-9][0-9]*, page number=[0-9]+\\] in file '.*test/t[1-4]\\.ibd' cannot be decrypted"); +call mtr.add_suppression("InnoDB: The page \\[page id: space=[1-9][0-9]*, page number=[1-9][0-9]*\\] in file '.*test.t[1-4]\\.ibd' cannot be decrypted"); call mtr.add_suppression("failed to read or decrypt \\[page id: space=[1-9][0-9]*, page number=[1-9][0-9]*\\]"); call mtr.add_suppression("InnoDB: Unable to decompress .*.test.t[12]\\.ibd\\[page id: space=[1-9][0-9]*, page number=[0-9]+\\]"); -call mtr.add_suppression("InnoDB: Database page corruption on disk or a failed file read of tablespace test/t[12] page \\[page id: space=[1-9][0-9]*, page number=[0-9]*\\]"); -call mtr.add_suppression("InnoDB: Failed to read file '.*' at offset .*"); +call mtr.add_suppression("InnoDB: Database page corruption on disk or a failed read of file '.*test.t[12]\\.ibd'"); +call mtr.add_suppression("InnoDB: Failed to read page .* from file '.*'"); call mtr.add_suppression("InnoDB: Plugin initialization aborted"); call mtr.add_suppression("Plugin 'InnoDB' registration as a STORAGE ENGINE failed"); # for innodb_checksum_algorithm=full_crc32 only diff --git a/mysql-test/suite/innodb/r/innodb_information_schema_buffer.result b/mysql-test/suite/innodb/r/innodb_information_schema_buffer.result index 6328458d46e..46372cd85f2 100644 --- a/mysql-test/suite/innodb/r/innodb_information_schema_buffer.result +++ 
b/mysql-test/suite/innodb/r/innodb_information_schema_buffer.result @@ -1,35 +1,24 @@ SELECT * FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; -SELECT count(*) FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; -SELECT * FROM INFORMATION_SCHEMA.INNODB_BUFFER_PAGE; -SELECT COUNT(*) FROM INFORMATION_SCHEMA.INNODB_BUFFER_PAGE; +POOL_ID POOL_SIZE FREE_BUFFERS DATABASE_PAGES OLD_DATABASE_PAGES MODIFIED_DATABASE_PAGES PENDING_DECOMPRESS PENDING_READS PENDING_FLUSH_LRU PENDING_FLUSH_LIST PAGES_MADE_YOUNG PAGES_NOT_MADE_YOUNG PAGES_MADE_YOUNG_RATE PAGES_MADE_NOT_YOUNG_RATE NUMBER_PAGES_READ NUMBER_PAGES_CREATED NUMBER_PAGES_WRITTEN PAGES_READ_RATE PAGES_CREATE_RATE PAGES_WRITTEN_RATE NUMBER_PAGES_GET HIT_RATE YOUNG_MAKE_PER_THOUSAND_GETS NOT_YOUNG_MAKE_PER_THOUSAND_GETS NUMBER_PAGES_READ_AHEAD NUMBER_READ_AHEAD_EVICTED READ_AHEAD_RATE READ_AHEAD_EVICTED_RATE LRU_IO_TOTAL LRU_IO_CURRENT UNCOMPRESS_TOTAL UNCOMPRESS_CURRENT +# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # CREATE TABLE infoschema_buffer_test (col1 INT) ENGINE = INNODB; INSERT INTO infoschema_buffer_test VALUES(9); -SELECT TABLE_NAME, INDEX_NAME, NUMBER_RECORDS, DATA_SIZE, PAGE_STATE, PAGE_TYPE -FROM INFORMATION_SCHEMA.INNODB_BUFFER_PAGE -WHERE TABLE_NAME like "%infoschema_buffer_test%" - and PAGE_STATE="file_page" and PAGE_TYPE="index"; -TABLE_NAME INDEX_NAME NUMBER_RECORDS DATA_SIZE PAGE_STATE PAGE_TYPE -`test`.`infoschema_buffer_test` GEN_CLUST_INDEX 1 29 FILE_PAGE INDEX +SELECT * FROM INFORMATION_SCHEMA.INNODB_BUFFER_PAGE +WHERE TABLE_NAME LIKE '%infoschema_buffer_test%' AND PAGE_TYPE='index'; +POOL_ID BLOCK_ID SPACE PAGE_NUMBER PAGE_TYPE FLUSH_TYPE FIX_COUNT IS_HASHED NEWEST_MODIFICATION OLDEST_MODIFICATION ACCESS_TIME TABLE_NAME INDEX_NAME NUMBER_RECORDS DATA_SIZE COMPRESSED_SIZE PAGE_STATE IO_FIX IS_OLD FREE_PAGE_CLOCK +0 # # 3 INDEX 0 FIX AHI LSN LSN TIME `test`.`infoschema_buffer_test` GEN_CLUST_INDEX 1 29 0 FILE_PAGE IO_FIX OLD # INSERT INTO infoschema_buffer_test VALUES(19); -SELECT 
TABLE_NAME, INDEX_NAME, NUMBER_RECORDS, DATA_SIZE, PAGE_STATE, PAGE_TYPE -FROM INFORMATION_SCHEMA.INNODB_BUFFER_PAGE -WHERE TABLE_NAME like "%infoschema_buffer_test%" -and PAGE_STATE="file_page" and PAGE_TYPE="index"; -TABLE_NAME INDEX_NAME NUMBER_RECORDS DATA_SIZE PAGE_STATE PAGE_TYPE -`test`.`infoschema_buffer_test` GEN_CLUST_INDEX 2 58 FILE_PAGE INDEX CREATE INDEX idx ON infoschema_buffer_test(col1); -SELECT TABLE_NAME, INDEX_NAME, NUMBER_RECORDS, DATA_SIZE, PAGE_STATE, PAGE_TYPE -FROM INFORMATION_SCHEMA.INNODB_BUFFER_PAGE -WHERE TABLE_NAME like "%infoschema_buffer_test%" -and PAGE_STATE="file_page" and INDEX_NAME = "idx" and PAGE_TYPE="index"; -TABLE_NAME INDEX_NAME NUMBER_RECORDS DATA_SIZE PAGE_STATE PAGE_TYPE -`test`.`infoschema_buffer_test` idx 2 32 FILE_PAGE INDEX -`test`.`infoschema_buffer_test` idx 2 32 FILE_PAGE INDEX +SELECT * FROM INFORMATION_SCHEMA.INNODB_BUFFER_PAGE +WHERE TABLE_NAME LIKE '%infoschema_buffer_test%' AND PAGE_TYPE='index'; +POOL_ID BLOCK_ID SPACE PAGE_NUMBER PAGE_TYPE FLUSH_TYPE FIX_COUNT IS_HASHED NEWEST_MODIFICATION OLDEST_MODIFICATION ACCESS_TIME TABLE_NAME INDEX_NAME NUMBER_RECORDS DATA_SIZE COMPRESSED_SIZE PAGE_STATE IO_FIX IS_OLD FREE_PAGE_CLOCK +0 # # 3 INDEX 0 FIX AHI LSN LSN TIME `test`.`infoschema_buffer_test` GEN_CLUST_INDEX 2 58 0 FILE_PAGE IO_FIX OLD # +0 # # 4 INDEX 0 FIX AHI LSN LSN TIME `test`.`infoschema_buffer_test` idx 2 32 0 FILE_PAGE IO_FIX OLD # +0 # # 5 INDEX 0 FIX AHI LSN LSN TIME `test`.`infoschema_buffer_test` idx 2 32 0 FILE_PAGE IO_FIX OLD # DROP TABLE infoschema_buffer_test; -SELECT TABLE_NAME, INDEX_NAME, NUMBER_RECORDS, DATA_SIZE, PAGE_STATE, PAGE_TYPE -FROM INFORMATION_SCHEMA.INNODB_BUFFER_PAGE -WHERE TABLE_NAME like "%infoschema_buffer_test%"; -TABLE_NAME INDEX_NAME NUMBER_RECORDS DATA_SIZE PAGE_STATE PAGE_TYPE +SELECT * FROM INFORMATION_SCHEMA.INNODB_BUFFER_PAGE +WHERE TABLE_NAME LIKE '%infoschema_buffer_test%'; +POOL_ID BLOCK_ID SPACE PAGE_NUMBER PAGE_TYPE FLUSH_TYPE FIX_COUNT IS_HASHED 
NEWEST_MODIFICATION OLDEST_MODIFICATION ACCESS_TIME TABLE_NAME INDEX_NAME NUMBER_RECORDS DATA_SIZE COMPRESSED_SIZE PAGE_STATE IO_FIX IS_OLD FREE_PAGE_CLOCK CREATE TABLE infoschema_parent (id INT NOT NULL, PRIMARY KEY (id)) ENGINE=INNODB; CREATE TABLE infoschema_child (id INT, parent_id INT, INDEX par_ind (parent_id), @@ -37,11 +26,10 @@ FOREIGN KEY (parent_id) REFERENCES infoschema_parent(id) ON DELETE CASCADE) ENGINE=INNODB; -SELECT count(*) -FROM INFORMATION_SCHEMA.INNODB_BUFFER_PAGE -WHERE TABLE_NAME like "%infoschema_child%" and PAGE_STATE="file_page" -and PAGE_TYPE="index"; -count(*) -2 +SELECT * FROM INFORMATION_SCHEMA.INNODB_BUFFER_PAGE +WHERE TABLE_NAME LIKE '%infoschema_child%'; +POOL_ID BLOCK_ID SPACE PAGE_NUMBER PAGE_TYPE FLUSH_TYPE FIX_COUNT IS_HASHED NEWEST_MODIFICATION OLDEST_MODIFICATION ACCESS_TIME TABLE_NAME INDEX_NAME NUMBER_RECORDS DATA_SIZE COMPRESSED_SIZE PAGE_STATE IO_FIX IS_OLD FREE_PAGE_CLOCK +0 # # 3 INDEX 0 FIX AHI LSN LSN TIME `test`.`infoschema_child` GEN_CLUST_INDEX 0 0 0 FILE_PAGE IO_FIX OLD # +0 # # 4 INDEX 0 FIX AHI LSN LSN TIME `test`.`infoschema_child` par_ind 0 0 0 FILE_PAGE IO_FIX OLD # DROP TABLE infoschema_child; DROP TABLE infoschema_parent; diff --git a/mysql-test/suite/innodb/t/corrupted_during_recovery.test b/mysql-test/suite/innodb/t/corrupted_during_recovery.test index 5f1de1bedf9..31fd1f18b8a 100644 --- a/mysql-test/suite/innodb/t/corrupted_during_recovery.test +++ b/mysql-test/suite/innodb/t/corrupted_during_recovery.test @@ -4,8 +4,8 @@ call mtr.add_suppression("InnoDB: Plugin initialization aborted"); call mtr.add_suppression("Plugin 'InnoDB' init function returned error"); call mtr.add_suppression("Plugin 'InnoDB' registration as a STORAGE ENGINE failed"); -call mtr.add_suppression("InnoDB: Database page corruption on disk or a failed file read of tablespace test/t1 page"); -call mtr.add_suppression("InnoDB: Failed to read file '.*test.t1\\.ibd' at offset 3: Page read from tablespace is corrupted."); +call 
mtr.add_suppression("InnoDB: Database page corruption on disk or a failed read of file '.*test.t1\\.ibd' page"); +call mtr.add_suppression("InnoDB: Failed to read page 3 from file '.*test.t1\\.ibd': Page read from tablespace is corrupted."); call mtr.add_suppression("InnoDB: Background Page read failed to read or decrypt \\[page id: space=\\d+, page number=3\\]"); call mtr.add_suppression("InnoDB: Table `test`.`t1` is corrupted. Please drop the table and recreate."); --enable_query_log diff --git a/mysql-test/suite/innodb/t/innodb_bug14147491.test b/mysql-test/suite/innodb/t/innodb_bug14147491.test index 44b9d16ca78..c6e4f01a642 100644 --- a/mysql-test/suite/innodb/t/innodb_bug14147491.test +++ b/mysql-test/suite/innodb/t/innodb_bug14147491.test @@ -9,7 +9,7 @@ --disable_query_log call mtr.add_suppression("InnoDB: Table `test`\\.`t1` is corrupted\\. Please drop the table and recreate\\."); -call mtr.add_suppression("InnoDB: Database page corruption on disk or a failed file read of tablespace test/t1 page"); +call mtr.add_suppression("InnoDB: Database page corruption on disk or a failed read of file '.*test.t1\\.ibd' page"); call mtr.add_suppression("InnoDB: We detected index corruption in an InnoDB type table"); call mtr.add_suppression("Index for table 't1' is corrupt; try to repair it"); --enable_query_log diff --git a/mysql-test/suite/innodb/t/innodb_information_schema_buffer.test b/mysql-test/suite/innodb/t/innodb_information_schema_buffer.test index 5bfac22ec39..7f0d4f0a737 100644 --- a/mysql-test/suite/innodb/t/innodb_information_schema_buffer.test +++ b/mysql-test/suite/innodb/t/innodb_information_schema_buffer.test @@ -8,19 +8,9 @@ -- source include/have_innodb.inc --- disable_result_log +--replace_regex /([0-9]*\.)?[0-9]+/#/ SELECT * FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; -# How many buffer pools we have -SELECT count(*) FROM INFORMATION_SCHEMA.INNODB_BUFFER_POOL_STATS; - -SELECT * FROM INFORMATION_SCHEMA.INNODB_BUFFER_PAGE; - -# This gives the 
over all buffer pool size -SELECT COUNT(*) FROM INFORMATION_SCHEMA.INNODB_BUFFER_PAGE; - --- enable_result_log - # Create a table and check its page info behave correctly in the pool CREATE TABLE infoschema_buffer_test (col1 INT) ENGINE = INNODB; @@ -28,36 +18,27 @@ INSERT INTO infoschema_buffer_test VALUES(9); # We should be able to see this table in the buffer pool if we check # right away -SELECT TABLE_NAME, INDEX_NAME, NUMBER_RECORDS, DATA_SIZE, PAGE_STATE, PAGE_TYPE -FROM INFORMATION_SCHEMA.INNODB_BUFFER_PAGE -WHERE TABLE_NAME like "%infoschema_buffer_test%" - and PAGE_STATE="file_page" and PAGE_TYPE="index"; +--sorted_result +--replace_column 2 # 3 # 7 FIX 8 AHI 9 LSN 10 LSN 11 TIME 18 IO_FIX 19 OLD 20 # +SELECT * FROM INFORMATION_SCHEMA.INNODB_BUFFER_PAGE +WHERE TABLE_NAME LIKE '%infoschema_buffer_test%' AND PAGE_TYPE='index'; # The NUMBER_RECORDS and DATA_SIZE should check with each insertion INSERT INTO infoschema_buffer_test VALUES(19); -SELECT TABLE_NAME, INDEX_NAME, NUMBER_RECORDS, DATA_SIZE, PAGE_STATE, PAGE_TYPE -FROM INFORMATION_SCHEMA.INNODB_BUFFER_PAGE -WHERE TABLE_NAME like "%infoschema_buffer_test%" -and PAGE_STATE="file_page" and PAGE_TYPE="index"; - CREATE INDEX idx ON infoschema_buffer_test(col1); -SELECT TABLE_NAME, INDEX_NAME, NUMBER_RECORDS, DATA_SIZE, PAGE_STATE, PAGE_TYPE -FROM INFORMATION_SCHEMA.INNODB_BUFFER_PAGE -WHERE TABLE_NAME like "%infoschema_buffer_test%" -and PAGE_STATE="file_page" and INDEX_NAME = "idx" and PAGE_TYPE="index"; - +--sorted_result +--replace_column 2 # 3 # 7 FIX 8 AHI 9 LSN 10 LSN 11 TIME 18 IO_FIX 19 OLD 20 # +SELECT * FROM INFORMATION_SCHEMA.INNODB_BUFFER_PAGE +WHERE TABLE_NAME LIKE '%infoschema_buffer_test%' AND PAGE_TYPE='index'; # Check the buffer after dropping the table DROP TABLE infoschema_buffer_test; -SELECT TABLE_NAME, INDEX_NAME, NUMBER_RECORDS, DATA_SIZE, PAGE_STATE, PAGE_TYPE -FROM INFORMATION_SCHEMA.INNODB_BUFFER_PAGE -WHERE TABLE_NAME like "%infoschema_buffer_test%"; +SELECT * FROM 
INFORMATION_SCHEMA.INNODB_BUFFER_PAGE +WHERE TABLE_NAME LIKE '%infoschema_buffer_test%'; -# Do one more test -#--replace_regex /'*[0-9]*'/'NUM'/ CREATE TABLE infoschema_parent (id INT NOT NULL, PRIMARY KEY (id)) ENGINE=INNODB; @@ -67,11 +48,10 @@ CREATE TABLE infoschema_child (id INT, parent_id INT, INDEX par_ind (parent_id), ON DELETE CASCADE) ENGINE=INNODB; -SELECT count(*) -FROM INFORMATION_SCHEMA.INNODB_BUFFER_PAGE -WHERE TABLE_NAME like "%infoschema_child%" and PAGE_STATE="file_page" -and PAGE_TYPE="index"; +--sorted_result +--replace_column 2 # 3 # 7 FIX 8 AHI 9 LSN 10 LSN 11 TIME 18 IO_FIX 19 OLD 20 # +SELECT * FROM INFORMATION_SCHEMA.INNODB_BUFFER_PAGE +WHERE TABLE_NAME LIKE '%infoschema_child%'; DROP TABLE infoschema_child; DROP TABLE infoschema_parent; - diff --git a/mysql-test/suite/innodb/t/leaf_page_corrupted_during_recovery.test b/mysql-test/suite/innodb/t/leaf_page_corrupted_during_recovery.test index 7ffb9bb6596..f20f17f869d 100644 --- a/mysql-test/suite/innodb/t/leaf_page_corrupted_during_recovery.test +++ b/mysql-test/suite/innodb/t/leaf_page_corrupted_during_recovery.test @@ -2,9 +2,9 @@ --source include/have_debug.inc --disable_query_log -call mtr.add_suppression("InnoDB: Database page corruption on disk or a failed file read of tablespace test/t1 page "); +call mtr.add_suppression("InnoDB: Database page corruption on disk or a failed read of file '.*test.t1\\.ibd' page"); call mtr.add_suppression("InnoDB: Background Page read failed to read or decrypt \\[page id: space=\\d+, page number=19\\]"); -call mtr.add_suppression("\\[ERROR\\] InnoDB: Failed to read file '.*test.t1\\.ibd' at offset 19: Page read from tablespace is corrupted\\."); +call mtr.add_suppression("\\[ERROR\\] InnoDB: Failed to read page 19 from file '.*test.t1\\.ibd': Page read from tablespace is corrupted\\."); call mtr.add_suppression("\\[ERROR\\] InnoDB: Plugin initialization aborted at srv0start\\.cc.* with error Data structure corruption"); call 
mtr.add_suppression("\\[ERROR\\] Plugin 'InnoDB' (init function|registration)"); call mtr.add_suppression("\\[ERROR\\] InnoDB: We detected index corruption"); diff --git a/mysql-test/suite/innodb_i_s/innodb_buffer_page.result b/mysql-test/suite/innodb_i_s/innodb_buffer_page.result index 3ef5608e155..9d4c43b1a8f 100644 --- a/mysql-test/suite/innodb_i_s/innodb_buffer_page.result +++ b/mysql-test/suite/innodb_i_s/innodb_buffer_page.result @@ -17,7 +17,7 @@ INNODB_BUFFER_PAGE CREATE TEMPORARY TABLE `INNODB_BUFFER_PAGE` ( `NUMBER_RECORDS` bigint(21) unsigned NOT NULL DEFAULT 0, `DATA_SIZE` bigint(21) unsigned NOT NULL DEFAULT 0, `COMPRESSED_SIZE` bigint(21) unsigned NOT NULL DEFAULT 0, - `PAGE_STATE` enum('NOT_USED','READY_FOR_USE','FILE_PAGE','MEMORY','REMOVE_HASH') NOT NULL, + `PAGE_STATE` enum('NOT_USED','MEMORY','REMOVE_HASH','FILE_PAGE') NOT NULL, `IO_FIX` enum('IO_NONE','IO_READ','IO_WRITE','IO_PIN') NOT NULL, `IS_OLD` int(1) NOT NULL DEFAULT 0, `FREE_PAGE_CLOCK` bigint(21) unsigned NOT NULL DEFAULT 0 diff --git a/mysql-test/suite/innodb_i_s/innodb_buffer_page_lru.result b/mysql-test/suite/innodb_i_s/innodb_buffer_page_lru.result index 73871eb2eda..42d33fdbeac 100644 --- a/mysql-test/suite/innodb_i_s/innodb_buffer_page_lru.result +++ b/mysql-test/suite/innodb_i_s/innodb_buffer_page_lru.result @@ -6,7 +6,7 @@ INNODB_BUFFER_PAGE_LRU CREATE TEMPORARY TABLE `INNODB_BUFFER_PAGE_LRU` ( `SPACE` int(11) unsigned NOT NULL DEFAULT 0, `PAGE_NUMBER` int(11) unsigned NOT NULL DEFAULT 0, `PAGE_TYPE` varchar(64) DEFAULT NULL, - `FLUSH_TYPE` bigint(21) unsigned NOT NULL DEFAULT 0, + `FLUSH_TYPE` int(11) unsigned NOT NULL DEFAULT 0, `FIX_COUNT` int(11) unsigned NOT NULL DEFAULT 0, `IS_HASHED` int(1) NOT NULL DEFAULT 0, `NEWEST_MODIFICATION` bigint(21) unsigned NOT NULL DEFAULT 0, diff --git a/storage/innobase/btr/btr0btr.cc b/storage/innobase/btr/btr0btr.cc index 10a2612c09f..b7a9b3fc008 100644 --- a/storage/innobase/btr/btr0btr.cc +++ b/storage/innobase/btr/btr0btr.cc @@ -65,7 
+65,7 @@ btr_can_merge_with_page( void btr_corruption_report(const buf_block_t* block, const dict_index_t* index) { ib::fatal() - << "Flag mismatch in page " << block->page.id + << "Flag mismatch in page " << block->page.id() << " index " << index->name << " of table " << index->table->name; } @@ -722,9 +722,10 @@ void btr_page_free(dict_index_t* index, buf_block_t* block, mtr_t* mtr, ut_ad(page_is_leaf(block->frame)); } #endif - ut_ad(index->table->space_id == block->page.id.space()); + const page_id_t id(block->page.id()); + ut_ad(index->table->space_id == id.space()); /* The root page is freed by btr_free_root(). */ - ut_ad(block->page.id.page_no() != index->page); + ut_ad(id.page_no() != index->page); ut_ad(mtr->is_named_space(index->table->space)); /* The page gets invalid for optimistic searches: increment the frame @@ -745,8 +746,8 @@ void btr_page_free(dict_index_t* index, buf_block_t* block, mtr_t* mtr, ? PAGE_HEADER + PAGE_BTR_SEG_LEAF : PAGE_HEADER + PAGE_BTR_SEG_TOP]; fseg_free_page(seg_header, - index->table->space, block->page.id.page_no(), mtr); - buf_page_free(block->page.id, mtr, __FILE__, __LINE__); + index->table->space, id.page_no(), mtr); + buf_page_free(id, mtr, __FILE__, __LINE__); /* The page was marked free in the allocation bitmap, but it should remain exclusively latched until mtr_t::commit() or until it @@ -835,7 +836,7 @@ btr_page_get_father_node_ptr_func( ut_ad(latch_mode == BTR_CONT_MODIFY_TREE || latch_mode == BTR_CONT_SEARCH_TREE); - page_no = btr_cur_get_block(cursor)->page.id.page_no(); + page_no = btr_cur_get_block(cursor)->page.id().page_no(); index = btr_cur_get_index(cursor); ut_ad(!dict_index_is_spatial(index)); @@ -969,13 +970,13 @@ static void btr_free_root(buf_block_t *block, mtr_t *mtr, bool invalidate) { ut_ad(mtr_memo_contains_flagged(mtr, block, MTR_MEMO_PAGE_X_FIX | MTR_MEMO_PAGE_SX_FIX)); - ut_ad(mtr->is_named_space(block->page.id.space())); + ut_ad(mtr->is_named_space(block->page.id().space())); 
btr_search_drop_page_hash_index(block); #ifdef UNIV_BTR_DEBUG ut_a(btr_root_fseg_validate(PAGE_HEADER + PAGE_BTR_SEG_TOP + block->frame, - block->page.id.space())); + block->page.id().space())); #endif /* UNIV_BTR_DEBUG */ if (invalidate) { @@ -1069,7 +1070,7 @@ btr_create( buf_block_dbg_add_level( ibuf_hdr_block, SYNC_IBUF_TREE_NODE_NEW); - ut_ad(ibuf_hdr_block->page.id.page_no() + ut_ad(ibuf_hdr_block->page.id().page_no() == IBUF_HEADER_PAGE_NO); /* Allocate then the next page to the segment: it will be the tree root page */ @@ -1084,7 +1085,7 @@ btr_create( return(FIL_NULL); } - ut_ad(block->page.id.page_no() == IBUF_TREE_ROOT_PAGE_NO); + ut_ad(block->page.id().page_no() == IBUF_TREE_ROOT_PAGE_NO); buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW); @@ -1099,7 +1100,7 @@ btr_create( buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW); - if (!fseg_create(space, block->page.id.page_no(), + if (!fseg_create(space, block->page.id().page_no(), PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr)) { /* Not enough space for new segment, free root segment before return. */ @@ -1165,7 +1166,7 @@ btr_create( ut_ad(page_get_max_insert_size(block->frame, 2) > 2 * BTR_PAGE_MAX_REC_SIZE); - return(block->page.id.page_no()); + return(block->page.id().page_no()); } /** Free a B-tree except the root page. 
The root page MUST be freed after @@ -1185,7 +1186,7 @@ btr_free_but_not_root( leaf_loop: mtr_start(&mtr); mtr_set_log_mode(&mtr, log_mode); - mtr.set_named_space_id(block->page.id.space()); + mtr.set_named_space_id(block->page.id().space()); page_t* root = block->frame; @@ -1196,9 +1197,9 @@ leaf_loop: #ifdef UNIV_BTR_DEBUG ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF - + root, block->page.id.space())); + + root, block->page.id().space())); ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP - + root, block->page.id.space())); + + root, block->page.id().space())); #endif /* UNIV_BTR_DEBUG */ /* NOTE: page hash indexes are dropped when a page is freed inside @@ -1215,13 +1216,13 @@ leaf_loop: top_loop: mtr_start(&mtr); mtr_set_log_mode(&mtr, log_mode); - mtr.set_named_space_id(block->page.id.space()); + mtr.set_named_space_id(block->page.id().space()); root = block->frame; #ifdef UNIV_BTR_DEBUG ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP - + root, block->page.id.space())); + + root, block->page.id().space())); #endif /* UNIV_BTR_DEBUG */ finished = fseg_free_step_not_header( @@ -1387,8 +1388,9 @@ static void btr_page_reorganize_low(page_cur_t *cursor, dict_index_t *index, ut_ad(!is_buf_block_get_page_zip(block)); btr_assert_not_corrupted(block, index); ut_ad(fil_page_index_page_check(block->frame)); - ut_ad(index->is_dummy || block->page.id.space() == index->table->space->id); - ut_ad(index->is_dummy || block->page.id.page_no() != index->page || + ut_ad(index->is_dummy || + block->page.id().space() == index->table->space->id); + ut_ad(index->is_dummy || block->page.id().page_no() != index->page || !page_has_siblings(block->frame)); buf_block_t *old= buf_block_alloc(); @@ -1423,7 +1425,7 @@ static void btr_page_reorganize_low(page_cur_t *cursor, dict_index_t *index, clustered index root pages. */ ut_ad(dict_index_is_sec_or_ibuf(index) ? 
page_is_leaf(block->frame) - : block->page.id.page_no() == index->page); + : block->page.id().page_no() == index->page); else /* PAGE_MAX_TRX_ID is unused in clustered index pages (other than the root where it is repurposed as PAGE_ROOT_AUTO_INC), non-leaf @@ -1451,7 +1453,7 @@ static void btr_page_reorganize_low(page_cur_t *cursor, dict_index_t *index, else ut_ad(cursor->rec == page_get_infimum_rec(block->frame)); - if (block->page.id.page_no() == index->page && + if (block->page.id().page_no() == index->page && fil_page_get_type(old->frame) == FIL_PAGE_TYPE_INSTANT) { /* Preserve the PAGE_INSTANT information. */ @@ -1694,7 +1696,7 @@ btr_page_empty( ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); ut_ad(page_zip == buf_block_get_page_zip(block)); ut_ad(!index->is_dummy); - ut_ad(index->table->space->id == block->page.id.space()); + ut_ad(index->table->space->id == block->page.id().space()); #ifdef UNIV_ZIP_DEBUG ut_a(!page_zip || page_zip_validate(page_zip, block->frame, index)); #endif /* UNIV_ZIP_DEBUG */ @@ -1708,7 +1710,7 @@ btr_page_empty( root page. */ const ib_uint64_t autoinc = dict_index_is_clust(index) - && index->page == block->page.id.page_no() + && index->page == block->page.id().page_no() ? 
page_get_autoinc(block->frame) : 0; @@ -1749,7 +1751,7 @@ void btr_set_instant(buf_block_t* root, const dict_index_t& index, mtr_t* mtr) ut_ad(fil_page_get_type(root->frame) == FIL_PAGE_TYPE_INSTANT || fil_page_get_type(root->frame) == FIL_PAGE_INDEX); ut_ad(!page_has_siblings(root->frame)); - ut_ad(root->page.id.page_no() == index.page); + ut_ad(root->page.id().page_no() == index.page); rec_t* infimum = page_get_infimum_rec(root->frame); rec_t* supremum = page_get_supremum_rec(root->frame); @@ -1891,7 +1893,7 @@ btr_root_raise_and_insert( + root->frame, space)); } - ut_a(dict_index_get_page(index) == root->page.id.page_no()); + ut_a(dict_index_get_page(index) == root->page.id().page_no()); #endif /* UNIV_BTR_DEBUG */ ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index), MTR_MEMO_X_LOCK @@ -2004,7 +2006,7 @@ btr_root_raise_and_insert( } rec = page_rec_get_next(page_get_infimum_rec(new_block->frame)); - new_page_no = new_block->page.id.page_no(); + new_page_no = new_block->page.id().page_no(); /* Build the node pointer (= node key and page address) for the child */ @@ -2513,7 +2515,7 @@ btr_attach_half_pages( btr_node_ptr_set_child_page_no( btr_cur_get_block(&cursor), btr_cur_get_rec(&cursor), - offsets, lower_block->page.id.page_no(), mtr); + offsets, lower_block->page.id().page_no(), mtr); mem_heap_empty(heap); } else { lower_block = block; @@ -2542,7 +2544,8 @@ btr_attach_half_pages( half */ node_ptr_upper = dict_index_build_node_ptr( - index, split_rec, upper_block->page.id.page_no(), heap, level); + index, split_rec, upper_block->page.id().page_no(), + heap, level); /* Insert it next to the pointer to the lower half. Note that this may generate recursion leading to a split on the higher level. 
*/ @@ -2560,9 +2563,9 @@ btr_attach_half_pages( ut_a(page_is_comp(prev_block->frame) == page_is_comp(block->frame)); ut_a(btr_page_get_next(prev_block->frame) - == block->page.id.page_no()); + == block->page.id().page_no()); #endif /* UNIV_BTR_DEBUG */ - btr_page_set_next(prev_block, lower_block->page.id.page_no(), + btr_page_set_next(prev_block, lower_block->page.id().page_no(), mtr); } @@ -2571,9 +2574,9 @@ btr_attach_half_pages( ut_a(page_is_comp(next_block->frame) == page_is_comp(block->frame)); ut_a(btr_page_get_prev(next_block->frame) - == block->page.id.page_no()); + == block->page.id().page_no()); #endif /* UNIV_BTR_DEBUG */ - btr_page_set_prev(next_block, upper_block->page.id.page_no(), + btr_page_set_prev(next_block, upper_block->page.id().page_no(), mtr); } @@ -2587,8 +2590,8 @@ btr_attach_half_pages( btr_page_set_next(upper_block, next_page_no, mtr); } - btr_page_set_prev(upper_block, lower_block->page.id.page_no(), mtr); - btr_page_set_next(lower_block, upper_block->page.id.page_no(), mtr); + btr_page_set_prev(upper_block, lower_block->page.id().page_no(), mtr); + btr_page_set_next(lower_block, upper_block->page.id().page_no(), mtr); } /*************************************************************//** @@ -2732,7 +2735,7 @@ btr_insert_into_right_sibling( } dtuple_t* node_ptr = dict_index_build_node_ptr( - cursor->index, rec, next_block->page.id.page_no(), + cursor->index, rec, next_block->page.id().page_no(), heap, level); btr_insert_on_non_leaf_level( @@ -2838,7 +2841,7 @@ func_start: tuple to be inserted should be the first record on the upper half-page */ bool insert_left = false; - ulint hint_page_no = block->page.id.page_no() + 1; + ulint hint_page_no = block->page.id().page_no() + 1; byte direction = FSP_UP; if (tuple && n_iterations > 0) { @@ -3176,7 +3179,7 @@ void btr_level_list_remove(const buf_block_t& block, const dict_index_t& index, { ut_ad(mtr_memo_contains(mtr, &block, MTR_MEMO_PAGE_X_FIX)); ut_ad(block.zip_size() == 
index.table->space->zip_size()); - ut_ad(index.table->space->id == block.page.id.space()); + ut_ad(index.table->space->id == block.page.id().space()); /* Get the previous and next page numbers of page */ const page_t* page = block.frame; @@ -3277,7 +3280,7 @@ btr_lift_page_up( the first level, the tree is in an inconsistent state and can not be searched. */ for (b = father_block; - b->page.id.page_no() != root_page_no; ) { + b->page.id().page_no() != root_page_no; ) { ut_a(n_blocks < BTR_MAX_LEVELS); if (dict_index_is_spatial(index)) { @@ -3328,7 +3331,7 @@ btr_lift_page_up( ut_ad(!page_get_instant(father_block->frame)); if (index->is_instant() - && father_block->page.id.page_no() == root_page_no) { + && father_block->page.id().page_no() == root_page_no) { ut_ad(!father_page_zip); btr_set_instant(father_block, *index, mtr); } @@ -3478,16 +3481,15 @@ btr_compress( if (dict_index_is_spatial(index)) { offsets = rtr_page_get_father_block( NULL, heap, index, block, mtr, cursor, &father_cursor); - ut_ad(cursor->page_cur.block->page.id.page_no() - == block->page.id.page_no()); + ut_ad(cursor->page_cur.block->page.id() == block->page.id()); rec_t* my_rec = father_cursor.page_cur.rec; ulint page_no = btr_node_ptr_get_child_page_no(my_rec, offsets); - if (page_no != block->page.id.page_no()) { + if (page_no != block->page.id().page_no()) { ib::info() << "father positioned on page " << page_no << "instead of " - << block->page.id.page_no(); + << block->page.id().page_no(); offsets = btr_page_get_father_block( NULL, heap, index, block, mtr, &father_cursor); } @@ -3538,25 +3540,13 @@ retry: #ifdef UNIV_BTR_DEBUG if (is_left) { ut_a(btr_page_get_next(merge_page) - == block->page.id.page_no()); + == block->page.id().page_no()); } else { ut_a(btr_page_get_prev(merge_page) - == block->page.id.page_no()); + == block->page.id().page_no()); } #endif /* UNIV_BTR_DEBUG */ -#ifdef UNIV_GIS_DEBUG - if (dict_index_is_spatial(index)) { - if (is_left) { - fprintf(stderr, "GIS_DIAG: merge 
left %ld to %ld \n", - (long) block->page.id.page_no(), left_page_no); - } else { - fprintf(stderr, "GIS_DIAG: merge right %ld to %ld\n", - (long) block->page.id.page_no(), right_page_no); - } - } -#endif /* UNIV_GIS_DEBUG */ - ut_ad(page_validate(merge_page, index)); merge_page_zip = buf_block_get_page_zip(merge_block); @@ -3623,13 +3613,10 @@ retry: ulint page_no = btr_node_ptr_get_child_page_no( my_rec, offsets); - if (page_no != block->page.id.page_no()) { - + if (page_no != block->page.id().page_no()) { ib::fatal() << "father positioned on " << page_no << " instead of " - << block->page.id.page_no(); - - ut_ad(0); + << block->page.id().page_no(); } if (mbr_changed) { @@ -3748,8 +3735,8 @@ retry: btr_level_list_remove(*block, *index, mtr); ut_ad(btr_node_ptr_get_child_page_no( - btr_cur_get_rec(&father_cursor), offsets) - == block->page.id.page_no()); + btr_cur_get_rec(&father_cursor), offsets) + == block->page.id().page_no()); /* Replace the address of the old child node (= page) with the address of the merge page to the right */ @@ -3878,13 +3865,6 @@ retry: #endif /* UNIV_ZIP_DEBUG */ if (dict_index_is_spatial(index)) { -#ifdef UNIV_GIS_DEBUG - fprintf(stderr, "GIS_DIAG: compressed away %ld\n", - (long) block->page.id.page_no()); - fprintf(stderr, "GIS_DIAG: merged to %ld\n", - (long) merge_block->page.id.page_no()); -#endif - rtr_check_discard_page(index, NULL, block); } @@ -3946,7 +3926,7 @@ btr_discard_only_page_on_level( /* Save the PAGE_MAX_TRX_ID from the leaf page. 
*/ max_trx_id = page_get_max_trx_id(buf_block_get_frame(block)); - while (block->page.id.page_no() != dict_index_get_page(index)) { + while (block->page.id().page_no() != dict_index_get_page(index)) { btr_cur_t cursor; buf_block_t* father; const page_t* page = buf_block_get_frame(block); @@ -3955,7 +3935,7 @@ btr_discard_only_page_on_level( ut_a(page_level == btr_page_get_level(page)); ut_a(!page_has_siblings(page)); ut_ad(fil_page_index_page_check(page)); - ut_ad(block->page.id.space() == index->table->space->id); + ut_ad(block->page.id().space() == index->table->space->id); ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); btr_search_drop_page_hash_index(block); @@ -4064,7 +4044,7 @@ btr_discard_page( block = btr_cur_get_block(cursor); index = btr_cur_get_index(cursor); - ut_ad(dict_index_get_page(index) != block->page.id.page_no()); + ut_ad(dict_index_get_page(index) != block->page.id().page_no()); ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index), MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK)); @@ -4090,7 +4070,7 @@ btr_discard_page( true, mtr); #ifdef UNIV_BTR_DEBUG ut_a(btr_page_get_next(merge_block->frame) - == block->page.id.page_no()); + == block->page.id().page_no()); #endif /* UNIV_BTR_DEBUG */ ut_d(parent_is_different = (page_rec_get_next( @@ -4103,7 +4083,7 @@ btr_discard_page( true, mtr); #ifdef UNIV_BTR_DEBUG ut_a(btr_page_get_prev(merge_block->frame) - == block->page.id.page_no()); + == block->page.id().page_no()); #endif /* UNIV_BTR_DEBUG */ ut_d(parent_is_different = page_rec_is_supremum( page_rec_get_next(btr_cur_get_rec(&parent_cursor)))); @@ -4170,7 +4150,8 @@ btr_discard_page( ut_ad(parent_is_different || btr_check_node_ptr(index, merge_block, mtr)); - if (btr_cur_get_block(&parent_cursor)->page.id.page_no() == index->page + if (btr_cur_get_block(&parent_cursor)->page.id().page_no() + == index->page && !page_has_siblings(btr_cur_get_page(&parent_cursor)) && page_get_n_recs(btr_cur_get_page(&parent_cursor)) == 1) { 
btr_lift_page_up(index, merge_block, mtr); @@ -4332,7 +4313,7 @@ btr_check_node_ptr( ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); - if (dict_index_get_page(index) == block->page.id.page_no()) { + if (dict_index_get_page(index) == block->page.id().page_no()) { return(TRUE); } @@ -4615,7 +4596,7 @@ btr_validate_report1( const buf_block_t* block) /*!< in: index page */ { ib::error error; - error << "In page " << block->page.id.page_no() + error << "In page " << block->page.id().page_no() << " of index " << index->name << " of table " << index->table->name; @@ -4635,14 +4616,13 @@ btr_validate_report2( const buf_block_t* block1, /*!< in: first index page */ const buf_block_t* block2) /*!< in: second index page */ { - ib::error error; - error << "In pages " << block1->page.id - << " and " << block2->page.id << " of index " << index->name - << " of table " << index->table->name; + ib::error error; + error << "In pages " << block1->page.id() + << " and " << block2->page.id() << " of index " << index->name + << " of table " << index->table->name; - if (level > 0) { - error << ", index tree level " << level; - } + if (level) + error << ", index tree level " << level; } /************************************************************//** @@ -4701,7 +4681,7 @@ btr_validate_level( while (level != btr_page_get_level(page)) { const rec_t* node_ptr; - if (fseg_page_is_free(space, block->page.id.page_no())) { + if (fseg_page_is_free(space, block->page.id().page_no())) { btr_validate_report1(index, level, block); @@ -4710,8 +4690,8 @@ btr_validate_level( ret = false; } - ut_a(index->table->space_id == block->page.id.space()); - ut_a(block->page.id.space() == page_get_space_id(page)); + ut_a(index->table->space_id == block->page.id().space()); + ut_a(block->page.id().space() == page_get_space_id(page)); #ifdef UNIV_ZIP_DEBUG page_zip = buf_block_get_page_zip(block); ut_a(!page_zip || page_zip_validate(page_zip, page, index)); @@ -4774,9 +4754,9 @@ loop: ut_a(!page_zip || 
page_zip_validate(page_zip, page, index)); #endif /* UNIV_ZIP_DEBUG */ - ut_a(block->page.id.space() == index->table->space_id); + ut_a(block->page.id().space() == index->table->space_id); - if (fseg_page_is_free(space, block->page.id.page_no())) { + if (fseg_page_is_free(space, block->page.id().page_no())) { btr_validate_report1(index, level, block); @@ -4886,7 +4866,7 @@ loop: 2) Search parent from root is very costly for R-tree. We will add special validation mechanism for R-tree later (WL #7520) */ if (!dict_index_is_spatial(index) - && block->page.id.page_no() != dict_index_get_page(index)) { + && block->page.id().page_no() != dict_index_get_page(index)) { /* Check father node pointers */ rec_t* node_ptr; @@ -4915,7 +4895,7 @@ loop: if (node_ptr != btr_cur_get_rec(&node_cur) || btr_node_ptr_get_child_page_no(node_ptr, offsets) - != block->page.id.page_no()) { + != block->page.id().page_no()) { btr_validate_report1(index, level, block); diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc index 9ddc2025fcf..119f16dd667 100644 --- a/storage/innobase/btr/btr0cur.cc +++ b/storage/innobase/btr/btr0cur.cc @@ -227,10 +227,10 @@ btr_cur_latch_leaves( compile_time_assert(int(MTR_MEMO_PAGE_S_FIX) == int(RW_S_LATCH)); compile_time_assert(int(MTR_MEMO_PAGE_X_FIX) == int(RW_X_LATCH)); compile_time_assert(int(MTR_MEMO_PAGE_SX_FIX) == int(RW_SX_LATCH)); - ut_ad(block->page.id.space() == cursor->index->table->space->id); + ut_ad(block->page.id().space() == cursor->index->table->space->id); spatial = dict_index_is_spatial(cursor->index) && cursor->rtr_info; - ut_ad(buf_page_in_file(&block->page)); + ut_ad(block->page.in_file()); switch (latch_mode) { case BTR_SEARCH_LEAF: @@ -244,7 +244,7 @@ btr_cur_latch_leaves( mode = latch_mode == BTR_MODIFY_LEAF ? 
RW_X_LATCH : RW_S_LATCH; latch_leaves.savepoints[1] = mtr_set_savepoint(mtr); get_block = btr_block_get(*cursor->index, - block->page.id.page_no(), mode, + block->page.id().page_no(), mode, true, mtr); latch_leaves.blocks[1] = get_block; #ifdef UNIV_BTR_DEBUG @@ -293,7 +293,7 @@ btr_cur_latch_leaves( latch_leaves.savepoints[1] = mtr_set_savepoint(mtr); get_block = btr_block_get( - *cursor->index, block->page.id.page_no(), + *cursor->index, block->page.id().page_no(), RW_X_LATCH, true, mtr); latch_leaves.blocks[1] = get_block; @@ -303,7 +303,7 @@ btr_cur_latch_leaves( ut_a(page_is_comp(latch_leaves.blocks[0]->frame) == page_is_comp(block->frame)); ut_a(btr_page_get_next(latch_leaves.blocks[0]->frame) - == block->page.id.page_no()); + == block->page.id().page_no()); } ut_a(page_is_comp(get_block->frame) == page_is_comp(block->frame)); @@ -331,7 +331,7 @@ btr_cur_latch_leaves( ut_a(page_is_comp(get_block->frame) == page_is_comp(block->frame)); ut_a(btr_page_get_prev(get_block->frame) - == block->page.id.page_no()); + == block->page.id().page_no()); #endif /* UNIV_BTR_DEBUG */ if (spatial) { cursor->rtr_info->tree_blocks[ @@ -360,13 +360,13 @@ btr_cur_latch_leaves( ut_a(page_is_comp(get_block->frame) == page_is_comp(block->frame)); ut_a(btr_page_get_next(get_block->frame) - == block->page.id.page_no()); + == block->page.id().page_no()); #endif /* UNIV_BTR_DEBUG */ } latch_leaves.savepoints[1] = mtr_set_savepoint(mtr); get_block = btr_block_get(*cursor->index, - block->page.id.page_no(), mode, + block->page.id().page_no(), mode, true, mtr); latch_leaves.blocks[1] = get_block; #ifdef UNIV_BTR_DEBUG @@ -784,14 +784,11 @@ btr_cur_optimistic_latch_leaves( mode = *latch_mode == BTR_SEARCH_PREV ? 
RW_S_LATCH : RW_X_LATCH; - buf_page_mutex_enter(block); - if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) { - buf_page_mutex_exit(block); + if (block->page.state() != BUF_BLOCK_FILE_PAGE) { return(false); } /* pin the block not to be relocated */ buf_block_buf_fix_inc(block, file, line); - buf_page_mutex_exit(block); rw_lock_s_lock(&block->lock); if (block->modify_clock != modify_clock) { @@ -1678,11 +1675,11 @@ retry_page_get: cursor->flag = BTR_CUR_DELETE_IBUF; } else { /* The purge could not be buffered. */ - buf_pool_watch_unset(page_id); + buf_pool.watch_unset(page_id); break; } - buf_pool_watch_unset(page_id); + buf_pool.watch_unset(page_id); goto func_exit; default: @@ -2280,7 +2277,7 @@ need_opposite_intention: - (leftmost_from_level - 1); page_id.set_page_no( - tree_blocks[idx]->page.id.page_no()); + tree_blocks[idx]->page.id().page_no()); for (ulint i = n_blocks - (leftmost_from_level - 1); @@ -3306,16 +3303,14 @@ static void btr_cur_prefetch_siblings(const buf_block_t* block) const page_t *page= block->frame; ut_ad(page_is_leaf(page)); - uint32_t left_page_no= mach_read_from_4(my_assume_aligned<4> - (page + FIL_PAGE_PREV)); - uint32_t right_page_no= mach_read_from_4(my_assume_aligned<4> - (page + FIL_PAGE_NEXT)); + uint32_t prev= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_PREV)); + uint32_t next= mach_read_from_4(my_assume_aligned<4>(page + FIL_PAGE_NEXT)); - if (left_page_no != FIL_NULL) - buf_read_page_background(page_id_t(block->page.id.space(), left_page_no), + if (prev != FIL_NULL) + buf_read_page_background(page_id_t(block->page.id().space(), prev), block->zip_size(), false); - if (right_page_no != FIL_NULL) - buf_read_page_background(page_id_t(block->page.id.space(), right_page_no), + if (next != FIL_NULL) + buf_read_page_background(page_id_t(block->page.id().space(), next), block->zip_size(), false); } @@ -3731,7 +3726,7 @@ btr_cur_pessimistic_insert( } if (dict_index_get_page(index) - == 
btr_cur_get_block(cursor)->page.id.page_no()) { + == btr_cur_get_block(cursor)->page.id().page_no()) { /* The page is the root page */ *rec = btr_root_raise_and_insert( @@ -4795,13 +4790,14 @@ btr_cur_pess_upd_restore_supremum( const uint32_t prev_page_no = btr_page_get_prev(page); - const page_id_t page_id(block->page.id.space(), prev_page_no); + const page_id_t page_id(block->page.id().space(), prev_page_no); ut_ad(prev_page_no != FIL_NULL); prev_block = buf_page_get_with_no_latch(page_id, block->zip_size(), mtr); #ifdef UNIV_BTR_DEBUG - ut_a(btr_page_get_next(prev_block->frame) == block->page.id.page_no()); + ut_a(btr_page_get_next(prev_block->frame) + == block->page.id().page_no()); #endif /* UNIV_BTR_DEBUG */ /* We must already have an x-latch on prev_block! */ @@ -5409,8 +5405,8 @@ btr_cur_compress_if_useful( const buf_block_t* block = btr_cur_get_block(cursor); /* Check whether page lock prevents the compression */ - if (!lock_test_prdt_page_lock(trx, block->page.id.space(), - block->page.id.page_no())) { + if (!lock_test_prdt_page_lock(trx, block->page.id().space(), + block->page.id().page_no())) { return(false); } } @@ -5457,7 +5453,7 @@ btr_cur_optimistic_delete_func( block = btr_cur_get_block(cursor); - ut_ad(block->page.id.space() == cursor->index->table->space->id); + ut_ad(block->page.id().space() == cursor->index->table->space->id); ut_ad(page_is_leaf(buf_block_get_frame(block))); ut_ad(!dict_index_is_online_ddl(cursor->index) || dict_index_is_clust(cursor->index) @@ -5465,7 +5461,7 @@ btr_cur_optimistic_delete_func( rec = btr_cur_get_rec(cursor); - if (UNIV_UNLIKELY(block->page.id.page_no() == cursor->index->page + if (UNIV_UNLIKELY(block->page.id().page_no() == cursor->index->page && page_get_n_recs(block->frame) == 1 + (cursor->index->is_instant() && !rec_is_metadata(rec, *cursor->index)))) { @@ -5644,7 +5640,7 @@ btr_cur_pessimistic_delete( ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); ut_ad(mtr->is_named_space(index->table->space)); 
ut_ad(!index->is_dummy); - ut_ad(block->page.id.space() == index->table->space->id); + ut_ad(block->page.id().space() == index->table->space->id); if (!has_reserved_extents) { /* First reserve enough free space for the file segments @@ -5701,7 +5697,7 @@ btr_cur_pessimistic_delete( lock_update_delete(block, rec); } - if (block->page.id.page_no() != index->page) { + if (block->page.id().page_no() != index->page) { if (page_get_n_recs(page) < 2) { goto discard_page; } @@ -5815,7 +5811,7 @@ discard_page: const ulint level = btr_page_get_level(page); // FIXME: reuse the node_ptr from above dtuple_t* node_ptr = dict_index_build_node_ptr( - index, next_rec, block->page.id.page_no(), + index, next_rec, block->page.id().page_no(), heap, level); btr_insert_on_non_leaf_level( @@ -5856,7 +5852,7 @@ discard_page: cursor, FALSE, mtr); } else { ib::warn() << "Not merging page " - << block->page.id + << block->page.id() << " in index " << index->name << " of " << index->table->name; ut_ad("MDEV-14637" == 0); @@ -5938,7 +5934,7 @@ btr_cur_add_path_info( slot->nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor)); slot->n_recs = page_get_n_recs(block->frame); - slot->page_no = block->page.id.page_no(); + slot->page_no = block->page.id().page_no(); slot->page_level = btr_page_get_level(block->frame); } @@ -6216,7 +6212,7 @@ btr_estimate_n_rows_in_range_low( should_count_the_left_border = false; } - tuple1->page_id= cursor.page_cur.block->page.id; + tuple1->page_id= cursor.page_cur.block->page.id(); mtr_commit(&mtr); @@ -6289,7 +6285,7 @@ btr_estimate_n_rows_in_range_low( should_count_the_right_border = false; } - tuple2->page_id= cursor.page_cur.block->page.id; + tuple2->page_id= cursor.page_cur.block->page.id(); mtr_commit(&mtr); @@ -7038,13 +7034,14 @@ btr_blob_get_next_page_no( @param mtr mini-transaction to commit */ static void btr_blob_free(buf_block_t *block, bool all, mtr_t *mtr) { - const page_id_t page_id= block->page.id; + const page_id_t 
page_id(block->page.id()); ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); mtr->commit(); mutex_enter(&buf_pool.mutex); /* Free the block if it is still allocated to the same file page. */ - if (block->page.state == BUF_BLOCK_FILE_PAGE && block->page.id == page_id && + if (block->page.state() == BUF_BLOCK_FILE_PAGE && + block->page.id() == page_id && !buf_LRU_free_page(&block->page, all) && all && block->page.zip.data) /* Attempt to deallocate the redundant copy of the uncompressed page if the whole ROW_FORMAT=COMPRESSED block cannot be deallocted. */ @@ -7106,7 +7103,7 @@ struct btr_blob_log_check_t { if (UNIV_UNLIKELY(m_op == BTR_STORE_INSERT_BULK)) { offs = page_offset(*m_rec); - page_no = (*m_block)->page.id.page_no(); + page_no = (*m_block)->page.id().page_no(); buf_block_buf_fix_inc(*m_block, __FILE__, __LINE__); } else { btr_pcur_store_position(m_pcur, m_mtr); @@ -7218,8 +7215,8 @@ btr_store_big_rec_extern_fields( btr_blob_log_check_t redo_log(pcur, btr_mtr, offsets, &rec_block, &rec, op); page_zip = buf_block_get_page_zip(rec_block); - space_id = rec_block->page.id.space(); - rec_page_no = rec_block->page.id.page_no(); + space_id = rec_block->page.id().space(); + rec_page_no = rec_block->page.id().page_no(); ut_a(fil_page_index_page_check(page_align(rec)) || op == BTR_STORE_INSERT_BULK); @@ -7310,14 +7307,14 @@ btr_store_big_rec_extern_fields( rec, offsets, field_no); page_zip = buf_block_get_page_zip(rec_block); - rec_page_no = rec_block->page.id.page_no(); + rec_page_no = rec_block->page.id().page_no(); } mtr.start(); index->set_modified(mtr); mtr.set_log_mode(btr_mtr->get_log_mode()); - buf_page_get(rec_block->page.id, + buf_page_get(rec_block->page.id(), rec_block->zip_size(), RW_X_LATCH, &mtr); if (prev_page_no == FIL_NULL) { @@ -7341,7 +7338,7 @@ btr_store_big_rec_extern_fields( ut_a(block != NULL); - page_no = block->page.id.page_no(); + page_no = block->page.id().page_no(); if (prev_page_no != FIL_NULL) { buf_block_t* prev_block; @@ 
-7573,7 +7570,7 @@ static void btr_check_blob_fil_page_type(const buf_block_t& block, bool read) if (UNIV_LIKELY(type == FIL_PAGE_TYPE_BLOB)) return; /* FIXME: take the tablespace as a parameter */ - if (fil_space_t *space= fil_space_acquire_silent(block.page.id.space())) + if (fil_space_t *space= fil_space_acquire_silent(block.page.id().space())) { /* Old versions of InnoDB did not initialize FIL_PAGE_TYPE on BLOB pages. Do not print anything about the type mismatch when reading @@ -7583,7 +7580,7 @@ static void btr_check_blob_fil_page_type(const buf_block_t& block, bool read) ib::fatal() << "FIL_PAGE_TYPE=" << type << (read ? " on BLOB read file " : " on BLOB purge file ") << space->chain.start->name - << " page " << block.page.id.page_no(); + << " page " << block.page.id().page_no(); } space->release(); } diff --git a/storage/innobase/btr/btr0pcur.cc b/storage/innobase/btr/btr0pcur.cc index 7b2251fd7de..ad52eb6a58a 100644 --- a/storage/innobase/btr/btr0pcur.cc +++ b/storage/innobase/btr/btr0pcur.cc @@ -112,8 +112,8 @@ btr_pcur_store_position( rec = page_cur_get_rec(page_cursor); offs = rec - block->frame; - ut_ad(block->page.id.page_no() == page_get_page_no(block->frame)); - ut_ad(block->page.buf_fix_count); + ut_ad(block->page.id().page_no() == page_get_page_no(block->frame)); + ut_ad(block->page.buf_fix_count()); /* For spatial index, when we do positioning on parent buffer if necessary, it might not hold latches, but the tree must be locked to prevent change on the page */ @@ -134,7 +134,7 @@ btr_pcur_store_position( ut_a(!page_has_siblings(block->frame)); ut_ad(page_is_leaf(block->frame)); - ut_ad(block->page.id.page_no() == index->page); + ut_ad(block->page.id().page_no() == index->page); if (page_rec_is_supremum_low(offs)) { cursor->rel_pos = BTR_PCUR_AFTER_LAST_IN_TREE; @@ -483,7 +483,7 @@ btr_pcur_move_to_next_page( #ifdef UNIV_BTR_DEBUG ut_a(page_is_comp(next_page) == page_is_comp(page)); ut_a(btr_page_get_prev(next_page) - == 
btr_pcur_get_block(cursor)->page.id.page_no()); + == btr_pcur_get_block(cursor)->page.id().page_no()); #endif /* UNIV_BTR_DEBUG */ btr_leaf_page_release(btr_pcur_get_block(cursor), mode, mtr); diff --git a/storage/innobase/btr/btr0sea.cc b/storage/innobase/btr/btr0sea.cc index cd2ee953994..ba646dd738a 100644 --- a/storage/innobase/btr/btr0sea.cc +++ b/storage/innobase/btr/btr0sea.cc @@ -548,8 +548,11 @@ btr_search_update_block_hash_info(btr_search_t* info, buf_block_t* block) RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); info->last_hash_succ = FALSE; - - ut_a(buf_block_state_valid(block)); + ut_d(auto state= block->page.state()); + ut_ad(state == BUF_BLOCK_NOT_USED + || state == BUF_BLOCK_FILE_PAGE + || state == BUF_BLOCK_MEMORY + || state == BUF_BLOCK_REMOVE_HASH); ut_ad(info->magic_n == BTR_SEARCH_MAGIC_N); if ((block->n_hash_helps > 0) @@ -628,7 +631,7 @@ btr_search_update_hash_ref( return; } - ut_ad(block->page.id.space() == index->table->space_id); + ut_ad(block->page.id().space() == index->table->space_id); ut_ad(index == cursor->index); ut_ad(!dict_index_is_ibuf(index)); rw_lock_t* const latch = btr_get_search_latch(index); @@ -846,7 +849,7 @@ inline void buf_pool_t::clear_hash_index() continue; } - ut_d(buf_page_state state= buf_block_get_state(block)); + ut_d(buf_page_state state= block->page.state()); /* Another thread may have set the state to BUF_BLOCK_REMOVE_HASH in buf_LRU_block_remove_hashed(). @@ -892,10 +895,10 @@ inline buf_block_t* buf_pool_t::block_from_ahi(const byte *ptr) const /* buf_pool_t::chunk_t::init() invokes buf_block_init() so that block[n].frame == block->frame + n * srv_page_size. Check it. */ ut_ad(block->frame == page_align(ptr)); - /* Read the state of the block without holding a mutex. + /* Read the state of the block without holding hash_lock. A state transition from BUF_BLOCK_FILE_PAGE to BUF_BLOCK_REMOVE_HASH is possible during this execution. 
*/ - ut_d(const buf_page_state state = buf_block_get_state(block)); + ut_d(const buf_page_state state = block->page.state()); ut_ad(state == BUF_BLOCK_FILE_PAGE || state == BUF_BLOCK_REMOVE_HASH); return block; } @@ -1004,25 +1007,27 @@ fail: buf_block_t* block = buf_pool.block_from_ahi(rec); if (use_latch) { - mutex_enter(&block->mutex); + rw_lock_t* hash_lock = buf_pool.hash_lock_get( + block->page.id()); + rw_lock_s_lock(hash_lock); - if (buf_block_get_state(block) == BUF_BLOCK_REMOVE_HASH) { + if (block->page.state() == BUF_BLOCK_REMOVE_HASH) { /* Another thread is just freeing the block from the LRU list of the buffer pool: do not try to access this page. */ - mutex_exit(&block->mutex); + rw_lock_s_unlock(hash_lock); goto fail; } const bool fail = index != block->index && index_id == block->index->id; ut_a(!fail || block->index->freed()); - ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE); DBUG_ASSERT(fail || block->page.status != buf_page_t::FREED); - buf_page_set_accessed(&block->page); buf_block_buf_fix_inc(block, __FILE__, __LINE__); - mutex_exit(&block->mutex); + rw_lock_s_unlock(hash_lock); + block->page.set_accessed(); buf_page_make_young_if_needed(&block->page); mtr_memo_type_t fix_type; @@ -1058,9 +1063,9 @@ got_no_latch: goto fail_and_release_page; } - if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) { + if (block->page.state() != BUF_BLOCK_FILE_PAGE) { - ut_ad(buf_block_get_state(block) == BUF_BLOCK_REMOVE_HASH); + ut_ad(block->page.state() == BUF_BLOCK_REMOVE_HASH); fail_and_release_page: if (!ahi_latch) { @@ -1176,8 +1181,8 @@ retry: return; } - ut_ad(block->page.buf_fix_count == 0 - || buf_block_get_state(block) == BUF_BLOCK_REMOVE_HASH + ut_ad(!block->page.buf_fix_count() + || block->page.state() == BUF_BLOCK_REMOVE_HASH || rw_lock_own_flagged(&block->lock, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S | RW_LOCK_FLAG_SX)); @@ -1191,7 +1196,7 @@ retry: = btr_page_get_index_id(block->frame); 
const ulint ahi_slot = ut_fold_ulint_pair(static_cast<ulint>(index_id), - static_cast<ulint>(block->page.id.space())) + block->page.id().space()) % btr_ahi_parts; latch = btr_search_latches[ahi_slot]; @@ -1209,7 +1214,7 @@ retry: #endif ut_ad(btr_search_enabled); - ut_ad(block->page.id.space() == index->table->space_id); + ut_ad(block->page.id().space() == index->table->space_id); ut_a(index_id == index->id); ut_ad(!dict_index_is_ibuf(index)); #ifdef UNIV_DEBUG @@ -1426,13 +1431,13 @@ btr_search_build_page_hash_index( rec_offs_init(offsets_); ut_ad(ahi_latch == btr_get_search_latch(index)); ut_ad(index); - ut_ad(block->page.id.space() == index->table->space_id); + ut_ad(block->page.id().space() == index->table->space_id); ut_ad(!dict_index_is_ibuf(index)); ut_ad(page_is_leaf(block->frame)); ut_ad(rw_lock_own_flagged(&block->lock, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); - ut_ad(block->page.id.page_no() >= 3); + ut_ad(block->page.id().page_no() >= 3); rw_lock_s_lock(ahi_latch); @@ -1740,7 +1745,7 @@ void btr_search_update_hash_on_delete(btr_cur_t* cursor) return; } - ut_ad(block->page.id.space() == index->table->space_id); + ut_ad(block->page.id().space() == index->table->space_id); ut_a(index == cursor->index); ut_a(block->curr_n_fields > 0 || block->curr_n_bytes > 0); ut_ad(!dict_index_is_ibuf(index)); @@ -1895,7 +1900,7 @@ btr_search_update_hash_on_insert(btr_cur_t* cursor, rw_lock_t* ahi_latch) return; } - ut_ad(block->page.id.space() == index->table->space_id); + ut_ad(block->page.id().space() == index->table->space_id); btr_search_check_free_space_in_heap(index); table = btr_get_search_table(index); @@ -2081,7 +2086,7 @@ btr_search_hash_table_validate(ulint hash_table_id) const buf_block_t* hash_block; index_id_t page_index_id; - if (UNIV_LIKELY(buf_block_get_state(block) + if (UNIV_LIKELY(block->page.state() == BUF_BLOCK_FILE_PAGE)) { /* The space and offset are only valid @@ -2090,7 +2095,7 @@ btr_search_hash_table_validate(ulint hash_table_id) 
(BUF_BLOCK_REMOVE_HASH, see the assertion and the comment below) */ hash_block = buf_block_hash_get( - block->page.id); + block->page.id()); } else { hash_block = NULL; } @@ -2108,12 +2113,12 @@ btr_search_hash_table_validate(ulint hash_table_id) remove the block from btr_search_sys->hash_tables[i]. */ - ut_a(buf_block_get_state(block) + ut_a(block->page.state() == BUF_BLOCK_REMOVE_HASH); } ut_ad(!dict_index_is_ibuf(block->index)); - ut_ad(block->page.id.space() + ut_ad(block->page.id().space() == block->index->table->space_id); page_index_id = btr_page_get_index_id(block->frame); @@ -2137,7 +2142,7 @@ btr_search_hash_table_validate(ulint hash_table_id) ib::error() << "Error in an adaptive hash" << " index pointer to page " - << block->page.id + << block->page.id() << ", ptr mem address " << reinterpret_cast<const void*>( node->data) diff --git a/storage/innobase/buf/buf0buddy.cc b/storage/innobase/buf/buf0buddy.cc index a78974992b0..3eabc981ab7 100644 --- a/storage/innobase/buf/buf0buddy.cc +++ b/storage/innobase/buf/buf0buddy.cc @@ -357,27 +357,23 @@ buf_buddy_block_free(void* buf) buf_block_t* block; ut_ad(mutex_own(&buf_pool.mutex)); - ut_ad(!mutex_own(&buf_pool.zip_mutex)); ut_a(!ut_align_offset(buf, srv_page_size)); HASH_SEARCH(hash, buf_pool.zip_hash, fold, buf_page_t*, bpage, - ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY - && bpage->in_zip_hash && !bpage->in_page_hash), + ut_ad(bpage->state() == BUF_BLOCK_MEMORY + && bpage->in_zip_hash), ((buf_block_t*) bpage)->frame == buf); ut_a(bpage); - ut_a(buf_page_get_state(bpage) == BUF_BLOCK_MEMORY); - ut_ad(!bpage->in_page_hash); + ut_a(bpage->state() == BUF_BLOCK_MEMORY); ut_ad(bpage->in_zip_hash); - ut_d(bpage->in_zip_hash = FALSE); + ut_d(bpage->in_zip_hash = false); HASH_DELETE(buf_page_t, hash, buf_pool.zip_hash, fold, bpage); ut_d(memset(buf, 0, srv_page_size)); UNIV_MEM_INVALID(buf, srv_page_size); block = (buf_block_t*) bpage; - buf_page_mutex_enter(block); buf_LRU_block_free_non_file_page(block); 
- buf_page_mutex_exit(block); ut_ad(buf_pool.buddy_n_frames > 0); ut_d(buf_pool.buddy_n_frames--); @@ -392,18 +388,13 @@ buf_buddy_block_register( buf_block_t* block) /*!< in: buffer frame to allocate */ { const ulint fold = BUF_POOL_ZIP_FOLD(block); - ut_ad(mutex_own(&buf_pool.mutex)); - ut_ad(!mutex_own(&buf_pool.zip_mutex)); - ut_ad(buf_block_get_state(block) == BUF_BLOCK_READY_FOR_USE); - - buf_block_set_state(block, BUF_BLOCK_MEMORY); + ut_ad(block->page.state() == BUF_BLOCK_MEMORY); ut_a(block->frame); ut_a(!ut_align_offset(block->frame, srv_page_size)); - ut_ad(!block->page.in_page_hash); ut_ad(!block->page.in_zip_hash); - ut_d(block->page.in_zip_hash = TRUE); + ut_d(block->page.in_zip_hash = true); HASH_INSERT(buf_page_t, hash, buf_pool.zip_hash, fold, &block->page); ut_d(buf_pool.buddy_n_frames++); @@ -449,7 +440,6 @@ byte *buf_buddy_alloc_low(ulint i, bool *lru) buf_block_t* block; ut_ad(mutex_own(&buf_pool.mutex)); - ut_ad(!mutex_own(&buf_pool.zip_mutex)); ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); if (i < BUF_BUDDY_SIZES) { @@ -469,9 +459,7 @@ byte *buf_buddy_alloc_low(ulint i, bool *lru) } /* Try replacing an uncompressed page in the buffer pool. 
*/ - mutex_exit(&buf_pool.mutex); - block = buf_LRU_get_free_block(); - mutex_enter(&buf_pool.mutex); + block = buf_LRU_get_free_block(true); if (lru) { *lru = true; } @@ -502,7 +490,6 @@ static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force) ulint offset; ut_ad(mutex_own(&buf_pool.mutex)); - ut_ad(!mutex_own(&buf_pool.zip_mutex)); ut_ad(!ut_align_offset(src, size)); ut_ad(!ut_align_offset(dst, size)); ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); @@ -522,11 +509,11 @@ static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force) const page_id_t page_id(space, offset); - rw_lock_t* hash_lock = buf_page_hash_lock_get(page_id); + rw_lock_t* hash_lock = buf_pool.hash_lock_get(page_id); rw_lock_x_lock(hash_lock); - bpage = buf_page_hash_get_low(page_id); + bpage = buf_pool.page_hash_get_low(page_id); if (!bpage || bpage->zip.data != src) { /* The block has probably been freshly @@ -546,7 +533,8 @@ static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force) bpage = UT_LIST_GET_FIRST(buf_pool.LRU); while (bpage != NULL) { if (bpage->zip.data == src) { - hash_lock = buf_page_hash_lock_get(bpage->id); + ut_ad(bpage->id() == page_id); + hash_lock = buf_pool.hash_lock_get(page_id); rw_lock_x_lock(hash_lock); break; } @@ -573,11 +561,7 @@ static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force) contain uninitialized data. */ UNIV_MEM_ASSERT_W(src, size); - BPageMutex* block_mutex = buf_page_get_mutex(bpage); - - mutex_enter(block_mutex); - - if (buf_page_can_relocate(bpage)) { + if (bpage->can_relocate()) { /* Relocate the compressed page. 
*/ const ulonglong ns = my_interval_timer(); @@ -588,8 +572,6 @@ static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force) rw_lock_x_unlock(hash_lock); - mutex_exit(block_mutex); - buf_buddy_mem_invalid( reinterpret_cast<buf_buddy_free_t*>(src), i); @@ -601,7 +583,6 @@ static bool buf_buddy_relocate(void* src, void* dst, ulint i, bool force) rw_lock_x_unlock(hash_lock); - mutex_exit(block_mutex); return(false); } @@ -614,7 +595,6 @@ void buf_buddy_free_low(void* buf, ulint i) buf_buddy_free_t* buddy; ut_ad(mutex_own(&buf_pool.mutex)); - ut_ad(!mutex_own(&buf_pool.zip_mutex)); ut_ad(i <= BUF_BUDDY_SIZES); ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); ut_ad(buf_pool.buddy_stat[i].used > 0); @@ -701,7 +681,6 @@ buf_buddy_realloc(void* buf, ulint size) ulint i = buf_buddy_get_slot(size); ut_ad(mutex_own(&buf_pool.mutex)); - ut_ad(!mutex_own(&buf_pool.zip_mutex)); ut_ad(i <= BUF_BUDDY_SIZES); ut_ad(i >= buf_buddy_get_slot(UNIV_ZIP_SIZE_MIN)); diff --git a/storage/innobase/buf/buf0buf.cc b/storage/innobase/buf/buf0buf.cc index 2b70c0ccff1..9baf9a361e4 100644 --- a/storage/innobase/buf/buf0buf.cc +++ b/storage/innobase/buf/buf0buf.cc @@ -200,7 +200,7 @@ in the main memory, undisturbed. The unzip_LRU list contains a subset of the common LRU list. The blocks on the unzip_LRU list hold a compressed file page and the corresponding uncompressed page frame. A block is in unzip_LRU if and -only if the predicate buf_page_belongs_to_unzip_LRU(&block->page) +only if the predicate block->page.belongs_to_unzip_LRU() holds. The blocks in unzip_LRU will be in same order as they are in the common LRU list. That is, each manipulation of the common LRU list will result in the same manipulation of the unzip_LRU list. @@ -279,15 +279,13 @@ the read requests for the whole area. 
#ifndef UNIV_INNOCHECKSUM /** Value in microseconds */ -static const int WAIT_FOR_READ = 100; -static const int WAIT_FOR_WRITE = 100; +constexpr int WAIT_FOR_READ= 100; +constexpr int WAIT_FOR_WRITE= 100; /** Number of attempts made to read in a page in the buffer pool */ -static const ulint BUF_PAGE_READ_MAX_RETRIES = 100; -/** Number of pages to read ahead */ -static const ulint BUF_READ_AHEAD_PAGES = 64; +constexpr ulint BUF_PAGE_READ_MAX_RETRIES= 100; /** The maximum portion of the buffer pool that can be used for the read-ahead buffer. (Divide buf_pool size by this amount) */ -static const ulint BUF_READ_AHEAD_PORTION = 32; +constexpr uint32_t BUF_READ_AHEAD_PORTION= 32; /** The InnoDB buffer pool */ buf_pool_t buf_pool; @@ -297,20 +295,16 @@ buf_pool_t::chunk_t::map *buf_pool_t::chunk_t::map_ref; #ifdef UNIV_DEBUG /** Disable resizing buffer pool to make assertion code not expensive. */ my_bool buf_disable_resize_buffer_pool_debug = TRUE; -#endif /* UNIV_DEBUG */ -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG /** This is used to insert validation operations in execution in the debug version */ -static ulint buf_dbg_counter = 0; -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +static ulint buf_dbg_counter; +#endif /* UNIV_DEBUG */ #if defined UNIV_PFS_MUTEX || defined UNIV_PFS_RWLOCK # ifndef PFS_SKIP_BUFFER_MUTEX_RWLOCK -/* Buffer block mutexes and rwlocks can be registered -in one group rather than individually. If PFS_GROUP_BUFFER_SYNC -is defined, register buffer block mutex and rwlock +/* If defined, register buf_block_t::lock in one group after their initialization. */ # define PFS_GROUP_BUFFER_SYNC @@ -374,24 +368,27 @@ static bool buf_tmp_page_decrypt(byte* tmp_frame, byte* src_frame) /** Decrypt a page. 
@param[in,out] bpage Page control block -@param[in,out] space tablespace +@param[in] node data file @return whether the operation was successful */ -static bool buf_page_decrypt_after_read(buf_page_t* bpage, fil_space_t* space) +static bool buf_page_decrypt_after_read(buf_page_t *bpage, + const fil_node_t &node) { - ut_ad(space->pending_io()); - ut_ad(space->id == bpage->id.space()); + ut_ad(node.space->pending_io()); + ut_ad(node.space->id == bpage->id().space()); + const auto flags = node.space->flags; byte* dst_frame = bpage->zip.data ? bpage->zip.data : ((buf_block_t*) bpage)->frame; - bool page_compressed = space->is_compressed() - && buf_page_is_compressed(dst_frame, space->flags); + bool page_compressed = node.space->is_compressed() + && buf_page_is_compressed(dst_frame, flags); + const page_id_t id(bpage->id()); - if (bpage->id.page_no() == 0) { + if (id.page_no() == 0) { /* File header pages are not encrypted/compressed */ return (true); } - if (space->purpose == FIL_TYPE_TEMPORARY + if (node.space->purpose == FIL_TYPE_TEMPORARY && innodb_encrypt_temporary_tables) { buf_tmp_buffer_t* slot = buf_pool.io_buf_reserve(); ut_a(slot); @@ -399,8 +396,8 @@ static bool buf_page_decrypt_after_read(buf_page_t* bpage, fil_space_t* space) if (!buf_tmp_page_decrypt(slot->crypt_buf, dst_frame)) { slot->release(); - ib::error() << "Encrypted page " << bpage->id - << " in file " << space->chain.start->name; + ib::error() << "Encrypted page " << id + << " in file " << node.name; return false; } @@ -413,14 +410,14 @@ static bool buf_page_decrypt_after_read(buf_page_t* bpage, fil_space_t* space) also for pages first compressed and then encrypted. 
*/ buf_tmp_buffer_t* slot; - uint key_version = buf_page_get_key_version(dst_frame, space->flags); + uint key_version = buf_page_get_key_version(dst_frame, flags); if (page_compressed && !key_version) { /* the page we read is unencrypted */ /* Find free slot from temporary memory array */ decompress: - if (space->full_crc32() - && buf_page_is_corrupted(true, dst_frame, space->flags)) { + if (fil_space_t::full_crc32(flags) + && buf_page_is_corrupted(true, dst_frame, flags)) { return false; } @@ -429,24 +426,24 @@ decompress: slot->allocate(); decompress_with_slot: - ut_d(fil_page_type_validate(space, dst_frame)); + ut_d(fil_page_type_validate(node.space, dst_frame)); ulint write_size = fil_page_decompress( - slot->crypt_buf, dst_frame, space->flags); + slot->crypt_buf, dst_frame, flags); slot->release(); - - ut_ad(!write_size || fil_page_type_validate(space, dst_frame)); - ut_ad(space->pending_io()); + ut_ad(!write_size + || fil_page_type_validate(node.space, dst_frame)); + ut_ad(node.space->pending_io()); return write_size != 0; } - if (key_version && space->crypt_data) { + if (key_version && node.space->crypt_data) { /* Verify encryption checksum before we even try to decrypt. 
*/ - if (!buf_page_verify_crypt_checksum(dst_frame, space->flags)) { + if (!buf_page_verify_crypt_checksum(dst_frame, flags)) { decrypt_failed: - ib::error() << "Encrypted page " << bpage->id - << " in file " << space->chain.start->name + ib::error() << "Encrypted page " << id + << " in file " << node.name << " looks corrupted; key_version=" << key_version; return false; @@ -455,17 +452,17 @@ decrypt_failed: slot = buf_pool.io_buf_reserve(); ut_a(slot); slot->allocate(); - ut_d(fil_page_type_validate(space, dst_frame)); + ut_d(fil_page_type_validate(node.space, dst_frame)); /* decrypt using crypt_buf to dst_frame */ - if (!fil_space_decrypt(space, slot->crypt_buf, dst_frame)) { + if (!fil_space_decrypt(node.space, slot->crypt_buf, dst_frame)) { slot->release(); goto decrypt_failed; } - ut_d(fil_page_type_validate(space, dst_frame)); + ut_d(fil_page_type_validate(node.space, dst_frame)); - if ((space->full_crc32() && page_compressed) + if ((fil_space_t::full_crc32(flags) && page_compressed) || fil_page_get_type(dst_frame) == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) { goto decompress_with_slot; @@ -477,7 +474,7 @@ decrypt_failed: goto decompress; } - ut_ad(space->pending_io()); + ut_ad(node.space->pending_io()); return true; } @@ -494,27 +491,17 @@ lsn_t buf_pool_t::get_oldest_modification() contents is still needed (the page has not been freed). */ const buf_page_t *bpage; for (bpage= UT_LIST_GET_LAST(flush_list); - bpage && fsp_is_system_temporary(bpage->id.space()); + bpage && fsp_is_system_temporary(bpage->id().space()); bpage= UT_LIST_GET_PREV(list, bpage)) - ut_ad(bpage->in_flush_list); + ut_ad(bpage->oldest_modification()); - lsn_t oldest_lsn= bpage ? bpage->oldest_modification : 0; + lsn_t oldest_lsn= bpage ? bpage->oldest_modification() : 0; mutex_exit(&flush_list_mutex); /* The result may become stale as soon as we released the mutex. On log checkpoint, also log_sys.flush_order_mutex will be needed. */ return oldest_lsn; } - -/** Allocate a buffer block. 
-@return own: the allocated block, in state BUF_BLOCK_MEMORY */ -buf_block_t* -buf_block_alloc() -{ - buf_block_t* block = buf_LRU_get_free_block(); - buf_block_set_state(block, BUF_BLOCK_MEMORY); - return(block); -} #endif /* !UNIV_INNOCHECKSUM */ /** Checks if the page is in crc32 checksum format. @@ -1214,8 +1201,6 @@ void buf_page_print(const byte* read_buf, ulint zip_size) } # ifdef PFS_GROUP_BUFFER_SYNC -extern mysql_pfs_key_t buffer_block_mutex_key; - /********************************************************************//** This function registers mutexes and rwlocks in buffer blocks with performance schema. If PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER is @@ -1237,13 +1222,6 @@ pfs_register_buffer_block( chunk->size, PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER); for (ulint i = 0; i < num_to_register; i++) { -# ifdef UNIV_PFS_MUTEX - BPageMutex* mutex; - - mutex = &block->mutex; - mutex->pfs_add(buffer_block_mutex_key); -# endif /* UNIV_PFS_MUTEX */ - rw_lock_t* rwlock; # ifdef UNIV_PFS_RWLOCK @@ -1283,31 +1261,18 @@ buf_block_init(buf_block_t* block, byte* frame) block->frame = frame; - block->page.flush_type = BUF_FLUSH_LRU; - block->page.state = BUF_BLOCK_NOT_USED; - block->page.buf_fix_count = 0; - block->page.io_fix = BUF_IO_NONE; - block->page.real_size = 0; block->modify_clock = 0; - block->page.slot = NULL; - block->page.status = buf_page_t::NORMAL; - + block->page.init(BUF_BLOCK_NOT_USED, page_id_t(~0ULL)); #ifdef BTR_CUR_HASH_ADAPT block->index = NULL; #endif /* BTR_CUR_HASH_ADAPT */ block->skip_flush_check = false; - ut_d(block->page.in_page_hash = FALSE); - ut_d(block->page.in_zip_hash = FALSE); - ut_d(block->page.in_flush_list = FALSE); - ut_d(block->page.in_free_list = FALSE); - ut_d(block->page.in_LRU_list = FALSE); - ut_d(block->in_unzip_LRU_list = FALSE); - ut_d(block->in_withdraw_list = FALSE); + ut_d(block->in_unzip_LRU_list = false); + ut_d(block->in_withdraw_list = false); page_zip_des_init(&block->page.zip); - mutex_create(LATCH_ID_BUF_BLOCK_MUTEX, 
&block->mutex); ut_d(block->debug_latch = (rw_lock_t *) ut_malloc_nokey(sizeof(rw_lock_t))); #if defined PFS_SKIP_BUFFER_MUTEX_RWLOCK || defined PFS_GROUP_BUFFER_SYNC @@ -1431,16 +1396,13 @@ inline const buf_block_t *buf_pool_t::chunk_t::not_freed() const buf_block_t *block= blocks; for (auto i= size; i--; block++) { - switch (buf_block_get_state(block)) { - case BUF_BLOCK_POOL_WATCH: + switch (block->page.state()) { case BUF_BLOCK_ZIP_PAGE: - case BUF_BLOCK_ZIP_DIRTY: /* The uncompressed buffer pool should never contain ROW_FORMAT=COMPRESSED block descriptors. */ ut_error; break; case BUF_BLOCK_NOT_USED: - case BUF_BLOCK_READY_FOR_USE: case BUF_BLOCK_MEMORY: case BUF_BLOCK_REMOVE_HASH: /* Skip blocks that are not being used for file pages. */ @@ -1450,19 +1412,16 @@ inline const buf_block_t *buf_pool_t::chunk_t::not_freed() const { /* The page cleaner is disabled in read-only mode. No pages can be dirtied, so all of them must be clean. */ - ut_ad(block->page.oldest_modification == 0 || - block->page.oldest_modification == recv_sys.recovered_lsn || + ut_d(lsn_t oldest_modification= block->page.oldest_modification()); + ut_ad(oldest_modification == 0 || + oldest_modification == recv_sys.recovered_lsn || srv_force_recovery == SRV_FORCE_NO_LOG_REDO); - ut_ad(block->page.buf_fix_count == 0); - ut_ad(block->page.io_fix == BUF_IO_NONE); + ut_ad(!block->page.buf_fix_count()); + ut_ad(block->page.io_fix() == BUF_IO_NONE); break; } - buf_page_mutex_enter(block); - auto ready= buf_flush_ready_for_replace(&block->page); - buf_page_mutex_exit(block); - - if (!ready) + if (!block->page.ready_for_replace()) return block; break; @@ -1477,7 +1436,6 @@ inline const buf_block_t *buf_pool_t::chunk_t::not_freed() const @param[in,out] block buffer pool block descriptor */ static void buf_block_free_mutexes(buf_block_t* block) { - mutex_free(&block->mutex); rw_lock_free(&block->lock); ut_d(rw_lock_free(block->debug_latch)); ut_d(ut_free(block->debug_latch)); @@ -1519,17 +1477,17 @@ bool 
buf_pool_t::create() { buf_block_t* block= chunk->blocks; - for (auto i= chunk->size; i--; block++) + for (auto i= chunk->size; i--; block++) buf_block_free_mutexes(block); - allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx); - } - ut_free(chunks); - chunks= nullptr; - UT_DELETE(chunk_t::map_reg); - chunk_t::map_reg= nullptr; - ut_ad(!is_initialised()); - return true; + allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx); + } + ut_free(chunks); + chunks= nullptr; + UT_DELETE(chunk_t::map_reg); + chunk_t::map_reg= nullptr; + ut_ad(!is_initialised()); + return true; } curr_size+= chunk->size; @@ -1538,7 +1496,6 @@ bool buf_pool_t::create() ut_ad(is_initialised()); mutex_create(LATCH_ID_BUF_POOL, &mutex); - mutex_create(LATCH_ID_BUF_POOL_ZIP, &zip_mutex); UT_LIST_INIT(LRU, &buf_page_t::LRU); UT_LIST_INIT(withdraw, &buf_page_t::list); @@ -1546,23 +1503,23 @@ bool buf_pool_t::create() UT_LIST_INIT(flush_list, &buf_page_t::list); UT_LIST_INIT(unzip_LRU, &buf_block_t::unzip_LRU); -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG - UT_LIST_INIT(zip_clean, &buf_page_t::list); -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + ut_d(UT_LIST_INIT(zip_clean, &buf_page_t::list)); for (size_t i= 0; i < UT_ARR_SIZE(zip_free); ++i) UT_LIST_INIT(zip_free[i], &buf_buddy_free_t::list); - - read_ahead_area= ut_min(BUF_READ_AHEAD_PAGES, - ut_2_power_up(curr_size / BUF_READ_AHEAD_PORTION)); + ulint s= curr_size; + old_size= s; + s/= BUF_READ_AHEAD_PORTION; + read_ahead_area= s >= READ_AHEAD_PAGES + ? 
READ_AHEAD_PAGES + : my_round_up_to_next_power(static_cast<uint32_t>(s)); curr_pool_size= srv_buf_pool_size; - old_size= curr_size; n_chunks_new= n_chunks; /* Number of locks protecting page_hash must be a power of two */ - srv_n_page_hash_locks= static_cast<ulong> - (ut_2_power_up(srv_n_page_hash_locks)); + srv_n_page_hash_locks= my_round_up_to_next_power(static_cast<uint32_t> + (srv_n_page_hash_locks)); ut_a(srv_n_page_hash_locks != 0); ut_a(srv_n_page_hash_locks <= MAX_PAGE_HASH_LOCKS); @@ -1576,12 +1533,9 @@ bool buf_pool_t::create() mutex_create(LATCH_ID_FLUSH_LIST, &flush_list_mutex); - for (int i= BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) + for (int i= 0; i < 3; i++) no_flush[i]= os_event_create(0); - watch= static_cast<buf_page_t*> - (ut_zalloc_nokey(sizeof *watch * BUF_POOL_WATCH_SIZE)); - try_LRU_scan= true; ut_d(flush_hp.m_mutex= &flush_list_mutex;); @@ -1612,7 +1566,6 @@ void buf_pool_t::close() return; mutex_free(&mutex); - mutex_free(&zip_mutex); mutex_free(&flush_list_mutex); if (flush_rbt) @@ -1625,22 +1578,18 @@ void buf_pool_t::close() bpage= prev_bpage) { prev_bpage= UT_LIST_GET_PREV(LRU, bpage); - buf_page_state state= buf_page_get_state(bpage); - - ut_ad(buf_page_in_file(bpage)); + ut_ad(bpage->in_file()); ut_ad(bpage->in_LRU_list); + /* The buffer pool must be clean during normal shutdown. + Only on aborted startup (with recovery) or with innodb_fast_shutdown=2 + we may discard changes. */ + ut_ad(!bpage->oldest_modification() || srv_is_being_started || + srv_fast_shutdown == 2); - if (state != BUF_BLOCK_FILE_PAGE) - { - /* We must not have any dirty block except during a fast shutdown. 
*/ - ut_ad(state == BUF_BLOCK_ZIP_PAGE || srv_fast_shutdown == 2); + if (bpage->state() != BUF_BLOCK_FILE_PAGE) buf_page_free_descriptor(bpage); - } } - ut_free(watch); - watch= nullptr; - for (auto chunk= chunks + n_chunks; --chunk >= chunks; ) { buf_block_t *block= chunk->blocks; @@ -1651,7 +1600,7 @@ void buf_pool_t::close() allocator.deallocate_large_dodump(chunk->mem, &chunk->mem_pfx); } - for (ulint i= BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; ++i) + for (int i= 0; i < 3; ++i) os_event_destroy(no_flush[i]); ut_free(chunks); @@ -1674,7 +1623,7 @@ inline bool buf_pool_t::realloc(buf_block_t *block) ut_ad(withdrawing); ut_ad(mutex_own(&mutex)); - ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE); new_block = buf_LRU_get_free_only(); @@ -1682,28 +1631,17 @@ inline bool buf_pool_t::realloc(buf_block_t *block) return(false); /* free list was not enough */ } - rw_lock_t* hash_lock = buf_page_hash_lock_get(block->page.id); + const page_id_t id(block->page.id()); + rw_lock_t* hash_lock = hash_lock_get(id); rw_lock_x_lock(hash_lock); - mutex_enter(&block->mutex); - - if (buf_page_can_relocate(&block->page)) { - mutex_enter(&new_block->mutex); + if (block->page.can_relocate()) { memcpy_aligned<OS_FILE_LOG_BLOCK_SIZE>( new_block->frame, block->frame, srv_page_size); new (&new_block->page) buf_page_t(block->page); /* relocate LRU list */ - ut_ad(block->page.in_LRU_list); - ut_ad(!block->page.in_zip_hash); - ut_d(block->page.in_LRU_list = FALSE); - - buf_LRU_adjust_hp(&block->page); - - buf_page_t* prev_b = UT_LIST_GET_PREV(LRU, &block->page); - UT_LIST_REMOVE(LRU, &block->page); - - if (prev_b != NULL) { + if (buf_page_t* prev_b = buf_pool.LRU_remove(&block->page)) { UT_LIST_INSERT_AFTER(LRU, prev_b, &new_block->page); } else { UT_LIST_ADD_FIRST(LRU, &new_block->page); @@ -1718,14 +1656,14 @@ inline bool buf_pool_t::realloc(buf_block_t *block) /* relocate unzip_LRU list */ if (block->page.zip.data != NULL) { 
ut_ad(block->in_unzip_LRU_list); - ut_d(new_block->in_unzip_LRU_list = TRUE); + ut_d(new_block->in_unzip_LRU_list = true); UNIV_MEM_DESC(&new_block->page.zip.data, page_zip_get_size(&new_block->page.zip)); buf_block_t* prev_block = UT_LIST_GET_PREV(unzip_LRU, block); UT_LIST_REMOVE(unzip_LRU, block); - ut_d(block->in_unzip_LRU_list = FALSE); + ut_d(block->in_unzip_LRU_list = false); block->page.zip.data = NULL; page_zip_set_size(&block->page.zip, 0); @@ -1736,19 +1674,16 @@ inline bool buf_pool_t::realloc(buf_block_t *block) } } else { ut_ad(!block->in_unzip_LRU_list); - ut_d(new_block->in_unzip_LRU_list = FALSE); + ut_d(new_block->in_unzip_LRU_list = false); } /* relocate page_hash */ ut_ad(block->page.in_page_hash); - ut_ad(&block->page == buf_page_hash_get_low(block->page.id)); - ut_d(block->page.in_page_hash = FALSE); - ulint fold = block->page.id.fold(); - ut_ad(fold == new_block->page.id.fold()); - HASH_REPLACE(buf_page_t, hash, page_hash, fold, - &block->page, &new_block->page); - ut_ad(new_block->page.in_page_hash); + ut_ad(&block->page == page_hash_get_low(id)); + ut_d(block->page.in_page_hash = false); + HASH_REPLACE(buf_page_t, hash, page_hash, id.fold(), + &block->page, &new_block->page); buf_block_modify_clock_inc(block); static_assert(FIL_PAGE_OFFSET % 4 == 0, "alignment"); @@ -1758,16 +1693,16 @@ inline bool buf_pool_t::realloc(buf_block_t *block) memset_aligned<2>(block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4); UNIV_MEM_INVALID(block->frame, srv_page_size); - buf_block_set_state(block, BUF_BLOCK_REMOVE_HASH); - block->page.id - = page_id_t(ULINT32_UNDEFINED, ULINT32_UNDEFINED); + block->page.set_state(BUF_BLOCK_REMOVE_HASH); /* Relocate flush_list. 
*/ - if (block->page.oldest_modification) { + if (block->page.oldest_modification()) { buf_flush_relocate_on_flush_list( &block->page, &new_block->page); } + block->page.set_corrupt_id(); + /* set other flags of buf_block_t */ #ifdef BTR_CUR_HASH_ADAPT @@ -1784,25 +1719,16 @@ inline bool buf_pool_t::realloc(buf_block_t *block) new_block->lock_hash_val = block->lock_hash_val; ut_ad(new_block->lock_hash_val == lock_rec_hash( - new_block->page.id.space(), - new_block->page.id.page_no())); + id.space(), id.page_no())); rw_lock_x_unlock(hash_lock); - mutex_exit(&new_block->mutex); /* free block */ - buf_block_set_state(block, BUF_BLOCK_MEMORY); + ut_d(block->page.set_state(BUF_BLOCK_MEMORY)); buf_LRU_block_free_non_file_page(block); - - mutex_exit(&block->mutex); } else { rw_lock_x_unlock(hash_lock); - mutex_exit(&block->mutex); - - /* free new_block */ - mutex_enter(&new_block->mutex); buf_LRU_block_free_non_file_page(new_block); - mutex_exit(&new_block->mutex); } return(true); /* free_list was enough */ @@ -1859,9 +1785,9 @@ inline bool buf_pool_t::withdraw_blocks() while (block != NULL && UT_LIST_GET_LEN(withdraw) < withdraw_target) { ut_ad(block->page.in_free_list); - ut_ad(!block->page.in_flush_list); + ut_ad(!block->page.oldest_modification()); ut_ad(!block->page.in_LRU_list); - ut_a(!buf_page_in_file(&block->page)); + ut_a(!block->page.in_file()); buf_block_t* next_block; next_block = reinterpret_cast<buf_block_t*>( @@ -1872,7 +1798,7 @@ inline bool buf_pool_t::withdraw_blocks() /* This should be withdrawn */ UT_LIST_REMOVE(free, &block->page); UT_LIST_ADD_LAST(withdraw, &block->page); - ut_d(block->in_withdraw_list = TRUE); + ut_d(block->in_withdraw_list = true); count1++; } @@ -1896,8 +1822,8 @@ inline bool buf_pool_t::withdraw_blocks() static_cast<ulint>(srv_LRU_scan_depth)), scan_depth); - buf_flush_do_batch(BUF_FLUSH_LRU, scan_depth, 0, &n); - buf_flush_wait_batch_end(BUF_FLUSH_LRU); + buf_flush_do_batch(true, scan_depth, 0, &n); + 
buf_flush_wait_batch_end(true); if (n.flushed) { MONITOR_INC_VALUE_CUMULATIVE( @@ -1915,18 +1841,10 @@ inline bool buf_pool_t::withdraw_blocks() buf_page_t* bpage; bpage = UT_LIST_GET_FIRST(LRU); while (bpage != NULL) { - BPageMutex* block_mutex; - buf_page_t* next_bpage; - - block_mutex = buf_page_get_mutex(bpage); - mutex_enter(block_mutex); - - next_bpage = UT_LIST_GET_NEXT(LRU, bpage); - + buf_page_t* next_bpage = UT_LIST_GET_NEXT(LRU, bpage); if (bpage->zip.data != NULL && will_be_withdrawn(bpage->zip.data) - && buf_page_can_relocate(bpage)) { - mutex_exit(block_mutex); + && bpage->can_relocate()) { buf_pool_mutex_exit_forbid(); if (!buf_buddy_realloc( bpage->zip.data, @@ -1936,15 +1854,12 @@ inline bool buf_pool_t::withdraw_blocks() break; } buf_pool_mutex_exit_allow(); - mutex_enter(block_mutex); count2++; } - if (buf_page_get_state(bpage) - == BUF_BLOCK_FILE_PAGE + if (bpage->state() == BUF_BLOCK_FILE_PAGE && buf_pool.will_be_withdrawn(*bpage)) { - if (buf_page_can_relocate(bpage)) { - mutex_exit(block_mutex); + if (bpage->can_relocate()) { buf_pool_mutex_exit_forbid(); if (!realloc( reinterpret_cast<buf_block_t*>( @@ -1955,13 +1870,9 @@ inline bool buf_pool_t::withdraw_blocks() } buf_pool_mutex_exit_allow(); count2++; - } else { - mutex_exit(block_mutex); } /* NOTE: if the page is in use, not relocated yet */ - } else { - mutex_exit(block_mutex); } bpage = next_bpage; @@ -1995,7 +1906,7 @@ inline bool buf_pool_t::withdraw_blocks() * const echunk = chunks + n_chunks; chunk != echunk; chunk++) { block = chunk->blocks; for (ulint j = chunk->size; j--; block++) { - ut_a(buf_block_get_state(block) == BUF_BLOCK_NOT_USED); + ut_a(block->page.state() == BUF_BLOCK_NOT_USED); ut_ad(block->in_withdraw_list); } } @@ -2031,11 +1942,12 @@ static void buf_pool_resize_hash() buf_page_t* prev_bpage = bpage; ulint fold; + ut_ad(bpage->in_page_hash); bpage = static_cast<buf_page_t*>( HASH_GET_NEXT( hash, prev_bpage)); - fold = prev_bpage->id.fold(); + fold = 
prev_bpage->id().fold(); HASH_DELETE(buf_page_t, hash, buf_pool.page_hash, fold, @@ -2085,6 +1997,7 @@ static void buf_pool_resize_hash() buf_pool.zip_hash = new_hash_table; } + /** Resize from srv_buf_pool_old_size to srv_buf_pool_size. */ inline void buf_pool_t::resize() { @@ -2249,7 +2162,7 @@ withdraw_retry: resizing.store(true, std::memory_order_relaxed); mutex_enter(&mutex); - hash_lock_x_all(page_hash); + page_hash_lock_all(); chunk_t::map_reg = UT_NEW_NOKEY(chunk_t::map()); /* add/delete chunks */ @@ -2376,13 +2289,15 @@ calc_buf_pool_size: /* set size */ ut_ad(UT_LIST_GET_LEN(withdraw) == 0); - read_ahead_area = ut_min( - BUF_READ_AHEAD_PAGES, - ut_2_power_up(curr_size / BUF_READ_AHEAD_PORTION)); - curr_pool_size = n_chunks * srv_buf_pool_chunk_unit; - srv_buf_pool_curr_size = curr_pool_size;/* FIXME: remove*/ - old_size = curr_size; - innodb_set_buf_pool_size(buf_pool_size_align(srv_buf_pool_curr_size)); + ulint s= curr_size; + old_size= s; + s/= BUF_READ_AHEAD_PORTION; + read_ahead_area= s >= READ_AHEAD_PAGES + ? READ_AHEAD_PAGES + : my_round_up_to_next_power(static_cast<uint32_t>(s)); + curr_pool_size= n_chunks * srv_buf_pool_chunk_unit; + srv_buf_pool_curr_size= curr_pool_size;/* FIXME: remove*/ + innodb_set_buf_pool_size(buf_pool_size_align(srv_buf_pool_curr_size)); const bool new_size_too_diff = srv_buf_pool_base_size > srv_buf_pool_size * 2 @@ -2396,7 +2311,7 @@ calc_buf_pool_size: ib::info() << "hash tables were resized"; } - hash_unlock_x_all(page_hash); + page_hash_unlock_all(); mutex_exit(&mutex); if (page_hash_old != NULL) { @@ -2462,9 +2377,7 @@ calc_buf_pool_size: " finished resizing at %s.", now); } -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG - validate(); -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + ut_d(validate()); return; } @@ -2509,32 +2422,24 @@ void buf_resize_shutdown() /** Relocate a ROW_FORMAT=COMPRESSED block in the LRU list and buf_pool.page_hash. The caller must relocate bpage->list. 
-@param bpage control block in BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_ZIP_PAGE +@param bpage BUF_BLOCK_ZIP_PAGE block @param dpage destination control block */ static void buf_relocate(buf_page_t *bpage, buf_page_t *dpage) { - buf_page_t* b; - + ut_ad(bpage->state() == BUF_BLOCK_ZIP_PAGE); ut_ad(mutex_own(&buf_pool.mutex)); - ut_ad(buf_page_hash_lock_held_x(bpage)); - ut_ad(mutex_own(buf_page_get_mutex(bpage))); - ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE); - ut_a(bpage->buf_fix_count == 0); - ut_ad(bpage->in_LRU_list); - ut_ad(!bpage->in_zip_hash); - ut_ad(bpage->in_page_hash); - ut_ad(bpage == buf_page_hash_get_low(bpage->id)); - ut_ad(!buf_pool_watch_is_sentinel(bpage)); + ut_ad(rw_lock_own(buf_pool.hash_lock_get(bpage->id()), RW_LOCK_X)); + ut_a(bpage->io_fix() == BUF_IO_NONE); + ut_a(!bpage->buf_fix_count()); + ut_ad(bpage == buf_pool.page_hash_get_low(bpage->id())); + ut_ad(!buf_pool.watch_is_sentinel(*bpage)); #ifdef UNIV_DEBUG - switch (buf_page_get_state(bpage)) { - case BUF_BLOCK_POOL_WATCH: + switch (bpage->state()) { case BUF_BLOCK_NOT_USED: - case BUF_BLOCK_READY_FOR_USE: case BUF_BLOCK_FILE_PAGE: case BUF_BLOCK_MEMORY: case BUF_BLOCK_REMOVE_HASH: ut_error; - case BUF_BLOCK_ZIP_DIRTY: case BUF_BLOCK_ZIP_PAGE: break; } @@ -2544,16 +2449,7 @@ static void buf_relocate(buf_page_t *bpage, buf_page_t *dpage) /* Important that we adjust the hazard pointer before removing bpage from LRU list. */ - buf_LRU_adjust_hp(bpage); - - ut_d(bpage->in_LRU_list = FALSE); - ut_d(bpage->in_page_hash = FALSE); - - /* relocate buf_pool.LRU */ - b = UT_LIST_GET_PREV(LRU, bpage); - UT_LIST_REMOVE(buf_pool.LRU, bpage); - - if (b != NULL) { + if (buf_page_t* b = buf_pool.LRU_remove(bpage)) { UT_LIST_INSERT_AFTER(buf_pool.LRU, b, dpage); } else { UT_LIST_ADD_FIRST(buf_pool.LRU, dpage); @@ -2572,235 +2468,98 @@ static void buf_relocate(buf_page_t *bpage, buf_page_t *dpage) } else { /* Check that the "old" flag is consistent in the block and its neighbours. 
*/ - buf_page_set_old(dpage, buf_page_is_old(dpage)); + dpage->set_old(dpage->is_old()); #endif /* UNIV_LRU_DEBUG */ } ut_d(CheckInLRUList::validate()); /* relocate buf_pool.page_hash */ - ulint fold = bpage->id.fold(); - ut_ad(fold == dpage->id.fold()); + ulint fold = bpage->id().fold(); + ut_ad(fold == dpage->id().fold()); + ut_ad(bpage->in_page_hash); + ut_ad(dpage->in_page_hash); + ut_d(bpage->in_page_hash = false); HASH_REPLACE(buf_page_t, hash, buf_pool.page_hash, fold, bpage, dpage); } -/** Determine if a block is a sentinel for a buffer pool watch. -@param[in] bpage block -@return whether bpage a sentinel for a buffer pool watch */ -bool buf_pool_watch_is_sentinel(const buf_page_t* bpage) -{ - /* We must own the appropriate hash lock. */ - ut_ad(buf_page_hash_lock_held_s_or_x(bpage)); - ut_ad(buf_page_in_file(bpage)); - - if (bpage < &buf_pool.watch[0] - || bpage >= &buf_pool.watch[BUF_POOL_WATCH_SIZE]) { - - ut_ad(buf_page_get_state(bpage) != BUF_BLOCK_ZIP_PAGE - || bpage->zip.data != NULL); - - return false; - } - - ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_PAGE); - ut_ad(!bpage->in_zip_hash); - ut_ad(bpage->in_page_hash); - ut_ad(bpage->zip.data == NULL); - return true; -} - -/** Add watch for the given page to be read in. Caller must have -appropriate hash_lock for the bpage. This function may release the -hash_lock and reacquire it. -@param[in] page_id page id -@param[in,out] hash_lock hash_lock currently latched -@return NULL if watch set, block if the page is in the buffer pool */ -static -buf_page_t* -buf_pool_watch_set( - const page_id_t page_id, - rw_lock_t** hash_lock) +/** Register a watch for a page identifier. The caller must hold an +exclusive page hash latch. The *hash_lock may be released, +relocated, and reacquired. 
+@param id page identifier +@param hash_lock page_hash latch that is held in RW_LOCK_X mode +@return a buffer pool block corresponding to id +@retval nullptr if the block was not present, and a watch was installed */ +inline buf_page_t *buf_pool_t::watch_set(const page_id_t id, + rw_lock_t **hash_lock) { - buf_page_t* bpage; - ulint i; - - ut_ad(*hash_lock == buf_page_hash_lock_get(page_id)); - - ut_ad(rw_lock_own(*hash_lock, RW_LOCK_X)); - - bpage = buf_page_hash_get_low(page_id); + const ulint fold= id.fold(); + ut_ad(*hash_lock == hash_lock_get_low(fold)); + ut_ad(rw_lock_own(*hash_lock, RW_LOCK_X)); - if (bpage != NULL) { -page_found: - if (!buf_pool_watch_is_sentinel(bpage)) { - /* The page was loaded meanwhile. */ - return(bpage); - } - - /* Add to an existing watch. */ - bpage->fix(); - return(NULL); - } - - /* From this point this function becomes fairly heavy in terms - of latching. We acquire the buf_pool mutex as well as all the - hash_locks. buf_pool mutex is needed because any changes to - the page_hash must be covered by it and hash_locks are needed - because we don't want to read any stale information in - buf_pool.watch[]. However, it is not in the critical code path - as this function will be called only by the purge thread. */ - - /* To obey latching order first release the hash_lock. */ - rw_lock_x_unlock(*hash_lock); - - mutex_enter(&buf_pool.mutex); - hash_lock_x_all(buf_pool.page_hash); - - /* We have to recheck that the page - was not loaded or a watch set by some other - purge thread. This is because of the small - time window between when we release the - hash_lock to acquire buf_pool.mutex above. */ - - *hash_lock = buf_page_hash_lock_get(page_id); - - bpage = buf_page_hash_get_low(page_id); - if (UNIV_LIKELY_NULL(bpage)) { - mutex_exit(&buf_pool.mutex); - hash_unlock_x_all_but(buf_pool.page_hash, *hash_lock); - goto page_found; - } - - /* The maximum number of purge threads should never exceed - BUF_POOL_WATCH_SIZE. 
So there is no way for a purge task - to hold a watch when setting another watch. */ - for (i = 0; i < BUF_POOL_WATCH_SIZE; i++) { - bpage = &buf_pool.watch[i]; - - ut_ad(bpage->access_time == 0); - ut_ad(bpage->oldest_modification == 0); - ut_ad(bpage->zip.data == NULL); - ut_ad(!bpage->in_zip_hash); - - switch (bpage->state) { - case BUF_BLOCK_POOL_WATCH: - ut_ad(!bpage->in_page_hash); - ut_ad(bpage->buf_fix_count == 0); - - /* bpage is pointing to buf_pool.watch[], - which is protected by buf_pool.mutex. - Normally, buf_page_t objects are protected by - buf_block_t::mutex or buf_pool.zip_mutex or both. */ - - bpage->state = BUF_BLOCK_ZIP_PAGE; - bpage->id = page_id; - bpage->buf_fix_count = 1; - - ut_d(bpage->in_page_hash = TRUE); - HASH_INSERT(buf_page_t, hash, buf_pool.page_hash, - page_id.fold(), bpage); - - mutex_exit(&buf_pool.mutex); - /* Once the sentinel is in the page_hash we can - safely release all locks except just the - relevant hash_lock */ - hash_unlock_x_all_but(buf_pool.page_hash, - *hash_lock); - - return(NULL); - case BUF_BLOCK_ZIP_PAGE: - ut_ad(bpage->in_page_hash); - ut_ad(bpage->buf_fix_count > 0); - break; - default: - ut_error; - } - } - - /* Allocation failed. Either the maximum number of purge - threads should never exceed BUF_POOL_WATCH_SIZE, or this code - should be modified to return a special non-NULL value and the - caller should purge the record directly. */ - ut_error; - - /* Fix compiler warning */ - return(NULL); -} - -/** Remove the sentinel block for the watch before replacing it with a -real block. buf_pool_watch_unset() or buf_pool_watch_occurred() will notice -that the block has been replaced with the real block. 
-@param[in,out] watch sentinel for watch -@return reference count, to be added to the replacement block */ -static void buf_pool_watch_remove(buf_page_t *watch) -{ - ut_ad(rw_lock_own(buf_page_hash_lock_get(watch->id), RW_LOCK_X)); - ut_ad(mutex_own(&buf_pool.mutex)); - - ut_ad(watch->in_page_hash); - ut_d(watch->in_page_hash= FALSE); - HASH_DELETE(buf_page_t, hash, buf_pool.page_hash, watch->id.fold(), watch); - watch->buf_fix_count= 0; - watch->state= BUF_BLOCK_POOL_WATCH; -} - -/** Stop watching if the page has been read in. -buf_pool_watch_set(same_page_id) must have returned NULL before. -@param[in] page_id page id */ -void buf_pool_watch_unset(const page_id_t page_id) -{ - /* FIXME: We only need buf_pool.mutex during the HASH_DELETE - because it protects watch->in_page_hash. */ - mutex_enter(&buf_pool.mutex); - - rw_lock_t *hash_lock= buf_page_hash_lock_get(page_id); - rw_lock_x_lock(hash_lock); - - /* The page must exist because buf_pool_watch_set() increments - buf_fix_count. */ - buf_page_t *watch= buf_page_hash_get_low(page_id); - - if (watch->unfix() == 0 && buf_pool_watch_is_sentinel(watch)) +retry: + if (buf_page_t *bpage= page_hash_get_low(id)) { - /* The following is based on buf_pool_watch_remove(). */ - ut_d(watch->in_page_hash= FALSE); - HASH_DELETE(buf_page_t, hash, buf_pool.page_hash, page_id.fold(), watch); - rw_lock_x_unlock(hash_lock); - /* Now that the watch is no longer reachable via buf_pool.page_hash, - release it to buf_pool.watch[] for reuse. */ - watch->buf_fix_count= 0; - watch->state= BUF_BLOCK_POOL_WATCH; + if (!watch_is_sentinel(*bpage)) + /* The page was loaded meanwhile. */ + return bpage; + /* Add to an existing watch. */ + bpage->fix(); + return nullptr; } - else - rw_lock_x_unlock(hash_lock); - mutex_exit(&buf_pool.mutex); -} - -/** Check if the page has been read in. -This may only be called after buf_pool_watch_set(same_page_id) -has returned NULL and before invoking buf_pool_watch_unset(same_page_id). 
-@param[in] page_id page id -@return false if the given page was not read in, true if it was */ -bool buf_pool_watch_occurred(const page_id_t page_id) -{ - bool ret; - buf_page_t* bpage; - rw_lock_t* hash_lock = buf_page_hash_lock_get(page_id); - - rw_lock_s_lock(hash_lock); - /* If not own buf_pool_mutex, page_hash can be changed. */ - hash_lock = buf_page_hash_lock_s_confirm(hash_lock, page_id); + rw_lock_x_unlock(*hash_lock); + /* Allocate a watch[] and then try to insert it into the page_hash. */ + mutex_enter(&mutex); - /* The page must exist because buf_pool_watch_set() - increments buf_fix_count. */ - bpage = buf_page_hash_get_low(page_id); + /* The maximum number of purge tasks should never exceed + the UT_ARR_SIZE(watch) - 1, and there is no way for a purge task to hold a + watch when setting another watch. */ + for (buf_page_t *w= &watch[UT_ARR_SIZE(watch)]; w-- >= watch; ) + { + ut_ad(w->access_time == 0); + ut_ad(!w->oldest_modification()); + ut_ad(!w->zip.data); + ut_ad(!w->in_zip_hash); + if (w->state() == BUF_BLOCK_ZIP_PAGE) + /* This watch may be in use for some other page. */ + continue; + ut_ad(w->state() == BUF_BLOCK_NOT_USED); + ut_ad(!w->buf_fix_count()); + /* w is pointing to watch[], which is protected by mutex. + Normally, buf_page_t::id for objects that are reachable by + page_hash_get_low(id) are protected by hash_lock. 
*/ + w->set_state(BUF_BLOCK_ZIP_PAGE); + w->id_= id; + + *hash_lock= hash_lock_get_low(fold); + rw_lock_x_lock(*hash_lock); + mutex_exit(&mutex); + + buf_page_t *bpage= page_hash_get_low(id); + if (UNIV_LIKELY_NULL(bpage)) + { + rw_lock_x_unlock(*hash_lock); + mutex_enter(&mutex); + w->set_state(BUF_BLOCK_NOT_USED); + *hash_lock= hash_lock_get_low(fold); + rw_lock_x_lock(*hash_lock); + mutex_exit(&mutex); + goto retry; + } - ret = !buf_pool_watch_is_sentinel(bpage); - rw_lock_s_unlock(hash_lock); + ut_ad(!w->buf_fix_count_); + w->buf_fix_count_= 1; + ut_ad(!w->in_page_hash); + ut_d(w->in_page_hash= true); /* Not holding buf_pool.mutex here! */ + HASH_INSERT(buf_page_t, hash, page_hash, fold, w); + return nullptr; + } - return(ret); + ut_error; + mutex_exit(&mutex); + return nullptr; } /********************************************************************//** @@ -2812,7 +2571,7 @@ void buf_page_make_young(buf_page_t* bpage) { mutex_enter(&buf_pool.mutex); - ut_a(buf_page_in_file(bpage)); + ut_a(bpage->in_file()); buf_LRU_make_block_young(bpage); @@ -2837,15 +2596,12 @@ void buf_page_free(const page_id_t page_id, ut_ad(mtr); ut_ad(mtr->is_active()); buf_pool.stat.n_page_gets++; - rw_lock_t *hash_lock= buf_page_hash_lock_get(page_id); - rw_lock_s_lock(hash_lock); - /* page_hash can be changed. 
*/ - hash_lock= buf_page_hash_lock_s_confirm(hash_lock, page_id); + rw_lock_t *hash_lock= buf_pool.page_hash_lock<false>(page_id.fold()); buf_block_t *block= reinterpret_cast<buf_block_t*> - (buf_page_hash_get_low(page_id)); + (buf_pool.page_hash_get_low(page_id)); - if (!block || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) + if (!block || block->page.state() != BUF_BLOCK_FILE_PAGE) { /* FIXME: if block!=NULL, convert to BUF_BLOCK_FILE_PAGE, but avoid buf_zip_decompress() */ @@ -2856,18 +2612,9 @@ void buf_page_free(const page_id_t page_id, } block->fix(); - mutex_enter(&block->mutex); - /* Now safe to release page_hash mutex */ - rw_lock_s_unlock(hash_lock); - ut_ad(block->page.buf_fix_count > 0); - -#ifdef UNIV_DEBUG - if (!fsp_is_system_temporary(page_id.space())) - { - ibool ret= rw_lock_s_lock_nowait(block->debug_latch, file, line); - ut_a(ret); - } -#endif /* UNIV_DEBUG */ + ut_ad(block->page.buf_fix_count()); + ut_ad(fsp_is_system_temporary(page_id.space()) || + rw_lock_s_lock_nowait(block->debug_latch, file, line)); mtr_memo_type_t fix_type= MTR_MEMO_PAGE_X_FIX; rw_lock_x_lock_inline(&block->lock, 0, file, line); @@ -2875,7 +2622,7 @@ void buf_page_free(const page_id_t page_id, block->page.status= buf_page_t::FREED; buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); - mutex_exit(&block->mutex); + rw_lock_s_unlock(hash_lock); } /** Attempts to discard the uncompressed frame of a compressed page. @@ -2915,7 +2662,6 @@ the same set of mutexes or latches. buf_page_t* buf_page_get_zip(const page_id_t page_id, ulint zip_size) { buf_page_t* bpage; - BPageMutex* block_mutex; rw_lock_t* hash_lock; ibool discard_attempted = FALSE; ibool must_read; @@ -2931,7 +2677,7 @@ lookup: mutex if the page is found. 
*/ bpage = buf_page_hash_get_s_locked(page_id, &hash_lock); if (bpage) { - ut_ad(!buf_pool_watch_is_sentinel(bpage)); + ut_ad(!buf_pool.watch_is_sentinel(*bpage)); break; } @@ -2947,12 +2693,12 @@ lookup: goto err_exit; } -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +#ifdef UNIV_DEBUG if (!(++buf_dbg_counter % 5771)) buf_pool.validate(); -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#endif /* UNIV_DEBUG */ } - ut_ad(buf_page_hash_lock_held_s(bpage)); + ut_ad(rw_lock_own(buf_pool.hash_lock_get(bpage->id()), RW_LOCK_S)); if (!bpage->zip.data) { /* There is no compressed page. */ @@ -2961,13 +2707,11 @@ err_exit: return(NULL); } - ut_ad(!buf_pool_watch_is_sentinel(bpage)); + ut_ad(!buf_pool.watch_is_sentinel(*bpage)); - switch (buf_page_get_state(bpage)) { + switch (bpage->state()) { case BUF_BLOCK_ZIP_PAGE: - case BUF_BLOCK_ZIP_DIRTY: bpage->fix(); - block_mutex = &buf_pool.zip_mutex; goto got_block; case BUF_BLOCK_FILE_PAGE: /* Discard the uncompressed page frame if possible. */ @@ -2980,8 +2724,6 @@ err_exit: buf_block_buf_fix_inc((buf_block_t*) bpage, __FILE__, __LINE__); - - block_mutex = &((buf_block_t*) bpage)->mutex; goto got_block; default: break; @@ -2991,38 +2733,28 @@ err_exit: goto err_exit; got_block: - mutex_enter(block_mutex); - must_read = buf_page_get_io_fix(bpage) == BUF_IO_READ; + must_read = bpage->io_fix() == BUF_IO_READ; rw_lock_s_unlock(hash_lock); DBUG_ASSERT(bpage->status != buf_page_t::FREED); - buf_page_set_accessed(bpage); - - mutex_exit(block_mutex); + bpage->set_accessed(); buf_page_make_young_if_needed(bpage); -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +#ifdef UNIV_DEBUG if (!(++buf_dbg_counter % 5771)) buf_pool.validate(); - ut_a(bpage->buf_fix_count > 0); - ut_a(buf_page_in_file(bpage)); -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#endif /* UNIV_DEBUG */ + ut_ad(bpage->buf_fix_count()); + ut_ad(bpage->in_file()); if (must_read) { /* Let us wait until the read operation completes */ for (;;) { - enum buf_io_fix io_fix; - - 
mutex_enter(block_mutex); - io_fix = buf_page_get_io_fix(bpage); - mutex_exit(block_mutex); - - if (io_fix == BUF_IO_READ) { - + if (bpage->io_fix() == BUF_IO_READ) { os_thread_sleep(WAIT_FOR_READ); } else { break; @@ -3068,7 +2800,7 @@ buf_zip_decompress( ulint size = page_zip_get_size(&block->page.zip); /* The tablespace will not be found if this function is called during IMPORT. */ - fil_space_t* space = fil_space_acquire_for_io(block->page.id.space()); + fil_space_t* space= fil_space_acquire_for_io(block->page.id().space()); const unsigned key_version = mach_read_from_4( frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION); fil_space_crypt_t* crypt_data = space ? space->crypt_data : NULL; @@ -3078,13 +2810,13 @@ buf_zip_decompress( || srv_encrypt_tables); ut_ad(block->zip_size()); - ut_a(block->page.id.space() != 0); + ut_a(block->page.id().space() != 0); if (UNIV_UNLIKELY(check && !page_zip_verify_checksum(frame, size))) { ib::error() << "Compressed page checksum mismatch for " << (space ? space->chain.start->name : "") - << block->page.id << ": stored: " + << block->page.id() << ": stored: " << mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM) << ", crc32: " << page_zip_calc_checksum( @@ -3112,7 +2844,7 @@ buf_zip_decompress( ib::error() << "Unable to decompress " << (space ? space->chain.start->name : "") - << block->page.id; + << block->page.id(); goto err_exit; case FIL_PAGE_TYPE_ALLOCATED: case FIL_PAGE_INODE: @@ -3133,7 +2865,7 @@ buf_zip_decompress( ib::error() << "Unknown compressed page type " << fil_page_get_type(frame) << " in " << (space ? space->chain.start->name : "") - << block->page.id; + << block->page.id(); err_exit: if (encrypted) { @@ -3154,20 +2886,6 @@ err_exit: return(FALSE); } -#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG -/********************************************************************//** -Return true if probe is enabled. -@return true if probe enabled. 
*/ -static -bool -buf_debug_execute_is_force_flush() -/*==============================*/ -{ - DBUG_EXECUTE_IF("ib_buf_force_flush", return(true); ); - return(false); -} -#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ - /** Wait for the block to be read in. @param[in] block The block to check */ static @@ -3177,35 +2895,15 @@ buf_wait_for_read( { /* Note: - We are using the block->lock to check for IO state (and a dirty read). - We set the IO_READ state under the protection of the hash_lock - (and block->mutex). This is safe because another thread can only + We are using the block->lock to check for IO state. + We set the IO_READ state under the protection of the hash_lock. + This is safe because another thread can only access the block (and check for IO state) after the block has been added to the page hashtable. */ - if (buf_block_get_io_fix(block) == BUF_IO_READ) { - - /* Wait until the read operation completes */ - - BPageMutex* mutex = buf_page_get_mutex(&block->page); - - for (;;) { - buf_io_fix io_fix; - - mutex_enter(mutex); - - io_fix = buf_block_get_io_fix(block); - - mutex_exit(mutex); - - if (io_fix == BUF_IO_READ) { - /* Wait by temporaly s-latch */ - rw_lock_s_lock(&block->lock); - rw_lock_s_unlock(&block->lock); - } else { - break; - } - } + while (block->page.io_fix() == BUF_IO_READ) { + rw_lock_s_lock(&block->lock); + rw_lock_s_unlock(&block->lock); } } @@ -3287,9 +2985,8 @@ buf_page_get_low( { buf_block_t* block; unsigned access_time; - rw_lock_t* hash_lock; - buf_block_t* fix_block; ulint retries = 0; + const ulint fold = page_id.fold(); ut_ad((mtr == NULL) == (mode == BUF_EVICT_IF_IN_POOL)); ut_ad(!mtr || mtr->is_active()); @@ -3339,83 +3036,53 @@ buf_page_get_low( || ibuf_page_low(page_id, zip_size, FALSE, file, line, NULL)); buf_pool.stat.n_page_gets++; - hash_lock = buf_page_hash_lock_get(page_id); loop: + buf_block_t* fix_block; block = guess; - rw_lock_s_lock(hash_lock); - - /* page_hash can be changed. 
*/ - hash_lock = buf_page_hash_lock_s_confirm(hash_lock, page_id); + rw_lock_t* hash_lock = buf_pool.page_hash_lock<false>(fold); - if (block != NULL) { + if (block) { /* If the guess is a compressed page descriptor that has been allocated by buf_page_alloc_descriptor(), it may have been freed by buf_relocate(). */ if (!buf_pool.is_uncompressed(block) - || page_id != block->page.id - || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) { + || page_id != block->page.id() + || block->page.state() != BUF_BLOCK_FILE_PAGE) { /* Our guess was bogus or things have changed since. */ - block = guess = NULL; + guess = nullptr; + goto lookup; } else { ut_ad(!block->page.in_zip_hash); } + } else { +lookup: + block = reinterpret_cast<buf_block_t*>( + buf_pool.page_hash_get_low(page_id)); } - if (block == NULL) { - block = (buf_block_t*) buf_page_hash_get_low(page_id); - } - - if (!block || buf_pool_watch_is_sentinel(&block->page)) { + if (!block || buf_pool.watch_is_sentinel(block->page)) { rw_lock_s_unlock(hash_lock); - block = NULL; + block = nullptr; } - if (block == NULL) { - + if (UNIV_UNLIKELY(!block)) { /* Page not in buf_pool: needs to be read from file */ - if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) { - rw_lock_x_lock(hash_lock); + hash_lock = buf_pool.page_hash_lock<true>(fold); - /* If not own buf_pool_mutex, - page_hash can be changed. */ - hash_lock = buf_page_hash_lock_x_confirm( - hash_lock, page_id); - - block = (buf_block_t*) buf_pool_watch_set( - page_id, &hash_lock); - - if (block) { + if (buf_page_t *bpage= buf_pool.watch_set( + page_id, &hash_lock)) { /* We can release hash_lock after we increment the fix count to make sure that no state change takes place. */ - fix_block = block; - - if (fsp_is_system_temporary(page_id.space())) { - /* For temporary tablespace, - the mutex is being used for - synchronization between user - thread and flush thread, - instead of block->lock. See - buf_flush_page() for the flush - thread counterpart. 
*/ - - BPageMutex* fix_mutex - = buf_page_get_mutex( - &fix_block->page); - mutex_enter(fix_mutex); - fix_block->fix(); - mutex_exit(fix_mutex); - } else { - fix_block->fix(); - } - - /* Now safe to release page_hash mutex */ + bpage->fix(); rw_lock_x_unlock(hash_lock); + block = reinterpret_cast<buf_block_t*>(bpage); + fix_block = block; goto got_block; } @@ -3427,15 +3094,12 @@ loop: case BUF_GET_IF_IN_POOL_OR_WATCH: case BUF_PEEK_IF_IN_POOL: case BUF_EVICT_IF_IN_POOL: - ut_ad(!rw_lock_own_flagged( - hash_lock, - RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); return(NULL); } /* The call path is buf_read_page() -> buf_read_page_low() (fil_io()) -> - buf_page_io_complete() -> + buf_page_read_complete() -> buf_decrypt_after_read(). Here fil_space_t* is used and we decrypt -> buf_page_check_corrupt() where page checksums are compared. Decryption, decompression as @@ -3507,29 +3171,15 @@ loop: " See https://mariadb.com/kb/en/library/innodb-recovery-modes/"; } -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +#ifdef UNIV_DEBUG if (!(++buf_dbg_counter % 5771)) buf_pool.validate(); -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#endif /* UNIV_DEBUG */ goto loop; } else { fix_block = block; } - if (fsp_is_system_temporary(page_id.space())) { - /* For temporary tablespace, the mutex is being used - for synchorization between user thread and flush thread, - instead of block->lock. See buf_flush_page() for the flush - thread counterpart. 
*/ - BPageMutex* fix_mutex = buf_page_get_mutex( - &fix_block->page); - mutex_enter(fix_mutex); - fix_block->fix(); - mutex_exit(fix_mutex); - } else { - fix_block->fix(); - } - - /* Now safe to release page_hash mutex */ + fix_block->fix(); rw_lock_s_unlock(hash_lock); got_block: @@ -3540,28 +3190,19 @@ got_block: case BUF_GET_IF_IN_POOL: case BUF_PEEK_IF_IN_POOL: case BUF_EVICT_IF_IN_POOL: - buf_page_t* fix_page = &fix_block->page; - BPageMutex* fix_mutex = buf_page_get_mutex(fix_page); - mutex_enter(fix_mutex); - const bool must_read - = (buf_page_get_io_fix(fix_page) == BUF_IO_READ); - mutex_exit(fix_mutex); - - if (must_read) { + if (fix_block->page.io_fix() == BUF_IO_READ) { /* The page is being read to buffer pool, but we cannot wait around for the read to complete. */ fix_block->unfix(); - return(NULL); } } - switch (UNIV_EXPECT(buf_block_get_state(fix_block), - BUF_BLOCK_FILE_PAGE)) { + switch (UNIV_EXPECT(fix_block->page.state(), BUF_BLOCK_FILE_PAGE)) { case BUF_BLOCK_FILE_PAGE: if (fsp_is_system_temporary(page_id.space()) - && buf_block_get_io_fix(block) != BUF_IO_NONE) { + && block->page.io_fix() != BUF_IO_NONE) { /* This suggests that the page is being flushed. Avoid returning reference to this page. Instead wait for the flush action to complete. */ @@ -3572,7 +3213,7 @@ got_block: if (UNIV_UNLIKELY(mode == BUF_EVICT_IF_IN_POOL)) { evict_from_pool: - ut_ad(!fix_block->page.oldest_modification); + ut_ad(!fix_block->page.oldest_modification()); mutex_enter(&buf_pool.mutex); fix_block->unfix(); @@ -3590,7 +3231,6 @@ evict_from_pool: break; case BUF_BLOCK_ZIP_PAGE: - case BUF_BLOCK_ZIP_DIRTY: if (UNIV_UNLIKELY(mode == BUF_EVICT_IF_IN_POOL)) { goto evict_from_pool; } @@ -3608,8 +3248,8 @@ evict_from_pool: buf_page_t* bpage = &block->page; /* Note: We have already buffer fixed this block. 
*/ - if (bpage->buf_fix_count > 1 - || buf_page_get_io_fix(bpage) != BUF_IO_NONE) { + if (bpage->buf_fix_count() > 1 + || bpage->io_fix() != BUF_IO_NONE) { /* This condition often occurs when the buffer is not buffer-fixed, but I/O-fixed by @@ -3627,43 +3267,36 @@ evict_from_pool: or relocated while we are attempting to allocate an uncompressed page. */ - block = buf_LRU_get_free_block(); + block = buf_LRU_get_free_block(false); + buf_block_init_low(block); mutex_enter(&buf_pool.mutex); - - hash_lock = buf_page_hash_lock_get(page_id); + hash_lock = buf_pool.hash_lock_get(page_id); rw_lock_x_lock(hash_lock); /* Buffer-fixing prevents the page_hash from changing. */ - ut_ad(bpage == buf_page_hash_get_low(page_id)); - - fix_block->unfix(); + ut_ad(bpage == buf_pool.page_hash_get_low(page_id)); - buf_page_mutex_enter(block); - mutex_enter(&buf_pool.zip_mutex); + fix_block->unfix(); /* hash_lock protects us after this */ - fix_block = block; - - if (bpage->buf_fix_count > 0 - || buf_page_get_io_fix(bpage) != BUF_IO_NONE) { - - mutex_exit(&buf_pool.zip_mutex); + if (bpage->buf_fix_count() || bpage->io_fix() != BUF_IO_NONE) { /* The block was buffer-fixed or I/O-fixed while buf_pool.mutex was not held by this thread. Free the block that was allocated and retry. This should be extremely unlikely, for example, if buf_page_get_zip() was invoked. */ + rw_lock_x_unlock(hash_lock); buf_LRU_block_free_non_file_page(block); mutex_exit(&buf_pool.mutex); - rw_lock_x_unlock(hash_lock); - buf_page_mutex_exit(block); /* Try again */ goto loop; } + fix_block = block; + /* Move the compressed page from bpage to block, and uncompress it. */ @@ -3672,10 +3305,8 @@ evict_from_pool: any list or hash table */ buf_relocate(bpage, &block->page); - buf_block_init_low(block); - /* Set after buf_relocate(). 
*/ - block->page.buf_fix_count = 1; + block->page.set_buf_fix_count(1); block->lock_hash_val = lock_rec_hash(page_id.space(), page_id.page_no()); @@ -3683,11 +3314,8 @@ evict_from_pool: UNIV_MEM_DESC(&block->page.zip.data, page_zip_get_size(&block->page.zip)); - if (buf_page_get_state(&block->page) == BUF_BLOCK_ZIP_PAGE) { -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG - UT_LIST_REMOVE(buf_pool.zip_clean, &block->page); -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ - ut_ad(!block->page.in_flush_list); + if (!block->page.oldest_modification()) { + ut_d(UT_LIST_REMOVE(buf_pool.zip_clean, &block->page)); } else { /* Relocate buf_pool.flush_list. */ buf_flush_relocate_on_flush_list(bpage, &block->page); @@ -3696,45 +3324,37 @@ evict_from_pool: /* Buffer-fix, I/O-fix, and X-latch the block for the duration of the decompression. Also add the block to the unzip_LRU list. */ - block->page.state = BUF_BLOCK_FILE_PAGE; + block->page.set_state(BUF_BLOCK_FILE_PAGE); /* Insert at the front of unzip_LRU list */ buf_unzip_LRU_add_block(block, FALSE); - buf_block_set_io_fix(block, BUF_IO_READ); + block->page.set_io_fix(BUF_IO_READ); rw_lock_x_lock_inline(&block->lock, 0, file, line); UNIV_MEM_INVALID(bpage, sizeof *bpage); + mutex_exit(&buf_pool.mutex); rw_lock_x_unlock(hash_lock); buf_pool.n_pend_unzip++; - mutex_exit(&buf_pool.zip_mutex); - mutex_exit(&buf_pool.mutex); - - access_time = buf_page_is_accessed(&block->page); - buf_page_mutex_exit(block); + access_time = block->page.is_accessed(); if (!access_time && !recv_no_ibuf_operations - && ibuf_page_exists(block->page.id, zip_size)) { + && ibuf_page_exists(block->page.id(), zip_size)) { block->page.ibuf_exist = true; } buf_page_free_descriptor(bpage); /* Decompress the page while not holding - buf_pool.mutex or block->mutex. */ + buf_pool.mutex. 
*/ if (!buf_zip_decompress(block, false)) { - mutex_enter(&buf_pool.mutex); - buf_page_mutex_enter(fix_block); - buf_block_set_io_fix(fix_block, BUF_IO_NONE); - buf_page_mutex_exit(fix_block); - - --buf_pool.n_pend_unzip; - mutex_exit(&buf_pool.mutex); - fix_block->unfix(); rw_lock_x_unlock(&fix_block->lock); + fix_block->page.io_unfix(); + fix_block->unfix(); + --buf_pool.n_pend_unzip; if (err) { *err = DB_PAGE_CORRUPTED; @@ -3742,36 +3362,27 @@ evict_from_pool: return NULL; } - mutex_enter(&buf_pool.mutex); - - buf_page_mutex_enter(fix_block); - - buf_block_set_io_fix(fix_block, BUF_IO_NONE); - - buf_page_mutex_exit(fix_block); - - --buf_pool.n_pend_unzip; - - mutex_exit(&buf_pool.mutex); - rw_lock_x_unlock(&block->lock); - + fix_block->page.io_unfix(); + --buf_pool.n_pend_unzip; break; } ut_ad(block == fix_block); - ut_ad(fix_block->page.buf_fix_count > 0); + ut_ad(fix_block->page.buf_fix_count()); ut_ad(!rw_lock_own_flagged(hash_lock, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); - ut_ad(buf_block_get_state(fix_block) == BUF_BLOCK_FILE_PAGE); + ut_ad(fix_block->page.state() == BUF_BLOCK_FILE_PAGE); #if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG - if ((mode == BUF_GET_IF_IN_POOL || mode == BUF_GET_IF_IN_POOL_OR_WATCH) - && (ibuf_debug || buf_debug_execute_is_force_flush())) { - + if (mode != BUF_GET_IF_IN_POOL + && mode != BUF_GET_IF_IN_POOL_OR_WATCH) { + } else if (!ibuf_debug) { + } else if (fil_space_t* space = + fil_space_acquire_for_io(page_id.space())) { /* Try to evict the block from the buffer pool, to use the insert buffer (change buffer) as much as possible. */ @@ -3779,35 +3390,21 @@ evict_from_pool: fix_block->unfix(); - /* Now we are only holding the buf_pool.mutex, - not block->mutex or hash_lock. Blocks cannot be - relocated or enter or exit the buf_pool while we - are holding the buf_pool.mutex. */ + /* Blocks cannot be relocated or enter or exit the + buf_pool while we are holding the buf_pool.mutex. 
*/ if (buf_LRU_free_page(&fix_block->page, true)) { - - mutex_exit(&buf_pool.mutex); - - /* page_hash can be changed. */ - hash_lock = buf_page_hash_lock_get(page_id); + space->release_for_io(); + hash_lock = buf_pool.hash_lock_get_low(fold); rw_lock_x_lock(hash_lock); - - /* If not own buf_pool_mutex, - page_hash can be changed. */ - hash_lock = buf_page_hash_lock_x_confirm( - hash_lock, page_id); - - if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) { - /* Set the watch, as it would have - been set if the page were not in the - buffer pool in the first place. */ - block = (buf_block_t*) buf_pool_watch_set( - page_id, &hash_lock); - } else { - block = (buf_block_t*) buf_page_hash_get_low( - page_id); - } - + mutex_exit(&buf_pool.mutex); + /* We may set the watch, as it would have + been set if the page were not in the + buffer pool in the first place. */ + block= reinterpret_cast<buf_block_t*>( + mode == BUF_GET_IF_IN_POOL_OR_WATCH + ? buf_pool.watch_set(page_id, &hash_lock) + : buf_pool.page_hash_get_low(page_id)); rw_lock_x_unlock(hash_lock); if (block != NULL) { @@ -3824,16 +3421,15 @@ evict_from_pool: return(NULL); } - buf_page_mutex_enter(fix_block); - - if (buf_flush_page_try(fix_block)) { + bool flushed = fix_block->page.ready_for_flush() + && buf_flush_page(&fix_block->page, + IORequest::SINGLE_PAGE, space, true); + space->release_for_io(); + if (flushed) { guess = fix_block; - goto loop; } - buf_page_mutex_exit(fix_block); - fix_block->fix(); /* Failed to evict the page; change it directly */ @@ -3842,7 +3438,7 @@ evict_from_pool: } #endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ - ut_ad(fix_block->page.buf_fix_count > 0); + ut_ad(fix_block->page.buf_fix_count()); #ifdef UNIV_DEBUG /* We have already buffer fixed the page, and we are committed to @@ -3868,33 +3464,22 @@ evict_from_pool: || mode == BUF_PEEK_IF_IN_POOL || fix_block->page.status != buf_page_t::FREED); - /* Check if this is the first access to the page */ - access_time = 
buf_page_is_accessed(&fix_block->page); - - /* This is a heuristic and we don't care about ordering issues. */ - if (access_time == 0) { - buf_page_mutex_enter(fix_block); - - buf_page_set_accessed(&fix_block->page); - - buf_page_mutex_exit(fix_block); - } + const bool first_access = fix_block->page.set_accessed(); if (mode != BUF_PEEK_IF_IN_POOL) { buf_page_make_young_if_needed(&fix_block->page); } -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +#ifdef UNIV_DEBUG if (!(++buf_dbg_counter % 5771)) buf_pool.validate(); - ut_a(buf_block_get_state(fix_block) == BUF_BLOCK_FILE_PAGE); -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#endif /* UNIV_DEBUG */ + ut_ad(fix_block->page.state() == BUF_BLOCK_FILE_PAGE); /* We have to wait here because the IO_READ state was set - under the protection of the hash_lock and not the block->mutex - and block->lock. */ + under the protection of the hash_lock and not block->lock. */ buf_wait_for_read(fix_block); - if (fix_block->page.id != page_id) { + if (fix_block->page.id() != page_id) { fix_block->unfix(); #ifdef UNIV_DEBUG @@ -3934,7 +3519,7 @@ get_latch: file, line); } - if (mode != BUF_PEEK_IF_IN_POOL && !access_time) { + if (mode != BUF_PEEK_IF_IN_POOL && first_access) { /* In the case of a first access, try to apply linear read-ahead */ @@ -4020,52 +3605,46 @@ buf_page_optimistic_get( unsigned line, /*!< in: line where called */ mtr_t* mtr) /*!< in: mini-transaction */ { - unsigned access_time; ibool success; ut_ad(block); ut_ad(mtr); ut_ad(mtr->is_active()); - ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH)); - - buf_page_mutex_enter(block); + ut_ad(rw_latch == RW_S_LATCH || rw_latch == RW_X_LATCH); - if (UNIV_UNLIKELY(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE)) { + if (UNIV_UNLIKELY(block->page.state() != BUF_BLOCK_FILE_PAGE + || block->page.io_fix() != BUF_IO_NONE)) { + return FALSE; + } - buf_page_mutex_exit(block); + rw_lock_t *hash_lock = buf_pool.hash_lock_get(block->page.id()); + 
rw_lock_s_lock(hash_lock); + if (UNIV_UNLIKELY(block->page.state() != BUF_BLOCK_FILE_PAGE + || block->page.io_fix() != BUF_IO_NONE)) { + rw_lock_s_unlock(hash_lock); return(FALSE); } buf_block_buf_fix_inc(block, file, line); + rw_lock_s_unlock(hash_lock); - access_time = buf_page_is_accessed(&block->page); - - buf_page_set_accessed(&block->page); - - buf_page_mutex_exit(block); + const bool first_access = block->page.set_accessed(); buf_page_make_young_if_needed(&block->page); ut_ad(!ibuf_inside(mtr) - || ibuf_page(block->page.id, block->zip_size(), NULL)); + || ibuf_page(block->page.id(), block->zip_size(), NULL)); mtr_memo_type_t fix_type; - switch (rw_latch) { - case RW_S_LATCH: - success = rw_lock_s_lock_nowait(&block->lock, file, line); - + if (rw_latch == RW_S_LATCH) { fix_type = MTR_MEMO_PAGE_S_FIX; - break; - case RW_X_LATCH: + success = rw_lock_s_lock_nowait(&block->lock, file, line); + } else { + fix_type = MTR_MEMO_PAGE_X_FIX; success = rw_lock_x_lock_func_nowait_inline( &block->lock, file, line); - - fix_type = MTR_MEMO_PAGE_X_FIX; - break; - default: - ut_error; /* RW_SX_LATCH is not implemented yet */ } if (!success) { @@ -4089,16 +3668,16 @@ buf_page_optimistic_get( mtr_memo_push(mtr, block, fix_type); -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +#ifdef UNIV_DEBUG if (!(++buf_dbg_counter % 5771)) buf_pool.validate(); - ut_a(block->page.buf_fix_count > 0); - ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#endif /* UNIV_DEBUG */ + ut_ad(block->page.buf_fix_count()); + ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE); - if (!access_time) { + if (first_access) { /* In the case of a first access, try to apply linear read-ahead */ - buf_read_ahead_linear(block->page.id, block->zip_size(), + buf_read_ahead_linear(block->page.id(), block->zip_size(), ibuf_inside(mtr)); } @@ -4131,25 +3710,19 @@ buf_page_try_get_func( block = buf_block_hash_get_s_locked(page_id, &hash_lock); - if (!block || 
buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) { + if (!block || block->page.state() != BUF_BLOCK_FILE_PAGE) { if (block) { rw_lock_s_unlock(hash_lock); } return(NULL); } - ut_ad(!buf_pool_watch_is_sentinel(&block->page)); - - buf_page_mutex_enter(block); - rw_lock_s_unlock(hash_lock); - -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG - ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); - ut_a(page_id == block->page.id); -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + ut_ad(!buf_pool.watch_is_sentinel(block->page)); + ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE); + ut_ad(page_id == block->page.id()); buf_block_buf_fix_inc(block, file, line); - buf_page_mutex_exit(block); + rw_lock_s_unlock(hash_lock); mtr_memo_type_t fix_type = MTR_MEMO_PAGE_S_FIX; success = rw_lock_s_lock_nowait(&block->lock, file, line); @@ -4171,11 +3744,11 @@ buf_page_try_get_func( mtr_memo_push(mtr, block, fix_type); -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +#ifdef UNIV_DEBUG if (!(++buf_dbg_counter % 5771)) buf_pool.validate(); - ut_a(block->page.buf_fix_count > 0); - ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#endif /* UNIV_DEBUG */ + ut_ad(block->page.buf_fix_count()); + ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE); buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK); @@ -4184,325 +3757,17 @@ buf_page_try_get_func( return(block); } -/********************************************************************//** -Initialize some fields of a control block. 
*/ -UNIV_INLINE -void -buf_page_init_low( -/*==============*/ - buf_page_t* bpage) /*!< in: block to init */ -{ - bpage->flush_type = BUF_FLUSH_LRU; - bpage->io_fix = BUF_IO_NONE; - bpage->buf_fix_count = 0; - bpage->old = 0; - bpage->freed_page_clock = 0; - bpage->access_time = 0; - bpage->oldest_modification = 0; - bpage->real_size = 0; - bpage->slot = NULL; - bpage->ibuf_exist = false; - bpage->status = buf_page_t::NORMAL; - HASH_INVALIDATE(bpage, hash); -} - -/** Inits a page to the buffer buf_pool. -@param[in] page_id page id -@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 -@param[in,out] block block to init */ -static void buf_page_init(const page_id_t page_id, ulint zip_size, - buf_block_t *block) -{ - buf_page_t* hash_page; - - ut_ad(mutex_own(&buf_pool.mutex)); - ut_ad(buf_page_mutex_own(block)); - ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE); - ut_ad(rw_lock_own(buf_page_hash_lock_get(page_id), RW_LOCK_X)); - - /* Set the state of the block */ - buf_block_set_file_page(block, page_id); - -#ifdef UNIV_DEBUG_VALGRIND - if (is_system_tablespace(page_id.space())) { - /* Silence valid Valgrind warnings about uninitialized - data being written to data files. There are some unused - bytes on some pages that InnoDB does not initialize. */ - UNIV_MEM_VALID(block->frame, srv_page_size); - } -#endif /* UNIV_DEBUG_VALGRIND */ - - buf_block_init_low(block); - - block->lock_hash_val = lock_rec_hash(page_id.space(), - page_id.page_no()); - - buf_page_init_low(&block->page); - - /* Insert into the hash table of file pages */ - - hash_page = buf_page_hash_get_low(page_id); - - if (hash_page == NULL) { - /* Block not found in hash table */ - } else if (buf_pool_watch_is_sentinel(hash_page)) { - /* Preserve the reference count. 
*/ - ib_uint32_t buf_fix_count = hash_page->buf_fix_count; - - ut_a(buf_fix_count > 0); - - block->page.buf_fix_count += buf_fix_count; - - buf_pool_watch_remove(hash_page); - } else { - ib::fatal() << "Page " << page_id - << " already found in the hash table: " - << hash_page << ", " << block; - } - - ut_ad(!block->page.in_zip_hash); - ut_ad(!block->page.in_page_hash); - ut_d(block->page.in_page_hash = TRUE); - - block->page.id = page_id; - - HASH_INSERT(buf_page_t, hash, buf_pool.page_hash, - page_id.fold(), &block->page); - - page_zip_set_size(&block->page.zip, zip_size); -} - -/** Initialize a page for read to the buffer buf_pool. If the page is -(1) already in buf_pool, or -(2) if we specify to read only ibuf pages and the page is not an ibuf page, or -(3) if the space is deleted or being deleted, -then this function does nothing. -Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock -on the buffer frame. The io-handler must take care that the flag is cleared -and the lock released later. -@param[out] err DB_SUCCESS or DB_TABLESPACE_DELETED -@param[in] mode BUF_READ_IBUF_PAGES_ONLY, ... -@param[in] page_id page id -@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 -@param[in] unzip whether the uncompressed page is - requested (for ROW_FORMAT=COMPRESSED) -@return pointer to the block -@retval NULL in case of an error */ -buf_page_t* -buf_page_init_for_read( - dberr_t* err, - ulint mode, - const page_id_t page_id, - ulint zip_size, - bool unzip) +/** Initialize the block. 
+@param page_id page id +@param zip_size ROW_FORMAT=COMPRESSED page size, or 0 */ +void buf_block_t::initialise(const page_id_t page_id, ulint zip_size) { - buf_block_t* block; - buf_page_t* bpage = NULL; - buf_page_t* watch_page; - rw_lock_t* hash_lock; - mtr_t mtr; - bool lru = false; - void* data; - - *err = DB_SUCCESS; - - if (mode == BUF_READ_IBUF_PAGES_ONLY) { - /* It is a read-ahead within an ibuf routine */ - - ut_ad(!ibuf_bitmap_page(page_id, zip_size)); - - ibuf_mtr_start(&mtr); - - if (!recv_no_ibuf_operations - && !ibuf_page(page_id, zip_size, &mtr)) { - - ibuf_mtr_commit(&mtr); - - return(NULL); - } - } else { - ut_ad(mode == BUF_READ_ANY_PAGE); - } - - if (zip_size && !unzip && !recv_recovery_is_on()) { - block = NULL; - } else { - block = buf_LRU_get_free_block(); - ut_ad(block); - } - - mutex_enter(&buf_pool.mutex); - - hash_lock = buf_page_hash_lock_get(page_id); - rw_lock_x_lock(hash_lock); - - watch_page = buf_page_hash_get_low(page_id); - if (watch_page && !buf_pool_watch_is_sentinel(watch_page)) { - /* The page is already in the buffer pool. */ - watch_page = NULL; - rw_lock_x_unlock(hash_lock); - if (block) { - buf_page_mutex_enter(block); - buf_LRU_block_free_non_file_page(block); - buf_page_mutex_exit(block); - } - - bpage = NULL; - goto func_exit; - } - - if (block) { - bpage = &block->page; - - buf_page_mutex_enter(block); - - buf_page_init(page_id, zip_size, block); - - /* Note: We are using the hash_lock for protection. This is - safe because no other thread can lookup the block from the - page hashtable yet. 
*/ - - buf_page_set_io_fix(bpage, BUF_IO_READ); - - rw_lock_x_unlock(hash_lock); - - /* The block must be put to the LRU list, to the old blocks */ - buf_LRU_add_block(bpage, TRUE/* to old blocks */); - - /* We set a pass-type x-lock on the frame because then - the same thread which called for the read operation - (and is running now at this point of code) can wait - for the read to complete by waiting for the x-lock on - the frame; if the x-lock were recursive, the same - thread would illegally get the x-lock before the page - read is completed. The x-lock is cleared by the - io-handler thread. */ - - rw_lock_x_lock_gen(&block->lock, BUF_IO_READ); - - if (zip_size) { - /* buf_pool.mutex may be released and - reacquired by buf_buddy_alloc(). Thus, we - must release block->mutex in order not to - break the latching order in the reacquisition - of buf_pool.mutex. We also must defer this - operation until after the block descriptor has - been added to buf_pool.LRU and - buf_pool.page_hash. */ - buf_page_mutex_exit(block); - data = buf_buddy_alloc(zip_size, &lru); - buf_page_mutex_enter(block); - block->page.zip.data = (page_zip_t*) data; - - /* To maintain the invariant - block->in_unzip_LRU_list - == buf_page_belongs_to_unzip_LRU(&block->page) - we have to add this block to unzip_LRU - after block->page.zip.data is set. */ - ut_ad(buf_page_belongs_to_unzip_LRU(&block->page)); - buf_unzip_LRU_add_block(block, TRUE); - } - - buf_page_mutex_exit(block); - } else { - rw_lock_x_unlock(hash_lock); - - /* The compressed page must be allocated before the - control block (bpage), in order to avoid the - invocation of buf_buddy_relocate_block() on - uninitialized data. */ - data = buf_buddy_alloc(zip_size, &lru); - - rw_lock_x_lock(hash_lock); - - /* If buf_buddy_alloc() allocated storage from the LRU list, - it released and reacquired buf_pool.mutex. Thus, we must - check the page_hash again, as it may have been modified. 
*/ - if (UNIV_UNLIKELY(lru)) { - watch_page = buf_page_hash_get_low(page_id); - - if (UNIV_UNLIKELY(watch_page - && !buf_pool_watch_is_sentinel(watch_page))) { - - /* The block was added by some other thread. */ - rw_lock_x_unlock(hash_lock); - watch_page = NULL; - buf_buddy_free(data, zip_size); - - bpage = NULL; - goto func_exit; - } - } - - bpage = buf_page_alloc_descriptor(); - - page_zip_des_init(&bpage->zip); - page_zip_set_size(&bpage->zip, zip_size); - bpage->zip.data = (page_zip_t*) data; - - mutex_enter(&buf_pool.zip_mutex); - UNIV_MEM_DESC(bpage->zip.data, zip_size); - - buf_page_init_low(bpage); - - bpage->state = BUF_BLOCK_ZIP_PAGE; - bpage->id = page_id; - bpage->status = buf_page_t::NORMAL; - - ut_d(bpage->in_page_hash = FALSE); - ut_d(bpage->in_zip_hash = FALSE); - ut_d(bpage->in_flush_list = FALSE); - ut_d(bpage->in_free_list = FALSE); - ut_d(bpage->in_LRU_list = FALSE); - - ut_d(bpage->in_page_hash = TRUE); - - if (watch_page != NULL) { - - /* Preserve the reference count. */ - ib_uint32_t buf_fix_count; - - buf_fix_count = watch_page->buf_fix_count; - - ut_a(buf_fix_count > 0); - - bpage->buf_fix_count += buf_fix_count; - - ut_ad(buf_pool_watch_is_sentinel(watch_page)); - buf_pool_watch_remove(watch_page); - } - - HASH_INSERT(buf_page_t, hash, buf_pool.page_hash, - bpage->id.fold(), bpage); - - rw_lock_x_unlock(hash_lock); - - /* The block must be put to the LRU list, to the old blocks. 
- The zip size is already set into the page zip */ - buf_LRU_add_block(bpage, TRUE/* to old blocks */); -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG - buf_LRU_insert_zip_clean(bpage); -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ - - buf_page_set_io_fix(bpage, BUF_IO_READ); - - mutex_exit(&buf_pool.zip_mutex); - } - - buf_pool.n_pend_reads++; -func_exit: - mutex_exit(&buf_pool.mutex); - - if (mode == BUF_READ_IBUF_PAGES_ONLY) { - - ibuf_mtr_commit(&mtr); - } - - ut_ad(!rw_lock_own_flagged(hash_lock, - RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); - ut_ad(!bpage || buf_page_in_file(bpage)); - - return(bpage); + ut_ad(page.state() != BUF_BLOCK_FILE_PAGE); + buf_block_init_low(this); + lock_hash_val= lock_rec_hash(page_id.space(), page_id.page_no()); + page.init(); + page.id_= page_id; + page_zip_set_size(&page.zip, zip_size); } /** Initialize a page in the buffer pool. The page is usually not read @@ -4514,165 +3779,132 @@ FILE_PAGE (the other is buf_page_get_gen). @param[in,out] mtr mini-transaction @return pointer to the block, page bufferfixed */ buf_block_t* -buf_page_create( - const page_id_t page_id, - ulint zip_size, - mtr_t* mtr) +buf_page_create(const page_id_t page_id, ulint zip_size, mtr_t *mtr) { - buf_frame_t* frame; - buf_block_t* block; - buf_block_t* free_block = NULL; - rw_lock_t* hash_lock; - - ut_ad(mtr->is_active()); - ut_ad(page_id.space() != 0 || !zip_size); - - free_block = buf_LRU_get_free_block(); + ut_ad(mtr->is_active()); + ut_ad(page_id.space() != 0 || !zip_size); - mutex_enter(&buf_pool.mutex); + buf_block_t *free_block= buf_LRU_get_free_block(false); + free_block->initialise(page_id, zip_size); - hash_lock = buf_page_hash_lock_get(page_id); - rw_lock_x_lock(hash_lock); + rw_lock_t *hash_lock= buf_pool.hash_lock_get(page_id); + mutex_enter(&buf_pool.mutex); + rw_lock_x_lock(hash_lock); - block = (buf_block_t*) buf_page_hash_get_low(page_id); + buf_block_t *block= reinterpret_cast<buf_block_t*> + (buf_pool.page_hash_get_low(page_id)); - if (block - 
&& buf_page_in_file(&block->page) - && !buf_pool_watch_is_sentinel(&block->page)) { - /* Page can be found in buf_pool */ - mutex_exit(&buf_pool.mutex); - rw_lock_x_unlock(hash_lock); + if (block && block->page.in_file() && + !buf_pool.watch_is_sentinel(block->page)) + { + /* Page can be found in buf_pool */ + rw_lock_x_unlock(hash_lock); + buf_LRU_block_free_non_file_page(free_block); + mutex_exit(&buf_pool.mutex); - buf_block_free(free_block); #ifdef BTR_CUR_HASH_ADAPT - if (block->page.state == BUF_BLOCK_FILE_PAGE - && UNIV_LIKELY_NULL(block->index)) { - btr_search_drop_page_hash_index(block); - } + if (block->page.state() == BUF_BLOCK_FILE_PAGE && + UNIV_LIKELY_NULL(block->index)) + btr_search_drop_page_hash_index(block); #endif /* BTR_CUR_HASH_ADAPT */ + if (!recv_recovery_is_on()) + /* FIXME: Remove the redundant lookup and avoid + the unnecessary invocation of buf_zip_decompress(). + We may have to convert buf_page_t to buf_block_t, + but we are going to initialize the page. */ + return buf_page_get_gen(page_id, zip_size, RW_NO_LATCH, + block, BUF_GET_POSSIBLY_FREED, + __FILE__, __LINE__, mtr); + + mutex_exit(&recv_sys.mutex); + block= buf_page_get_with_no_latch(page_id, zip_size, mtr); + mutex_enter(&recv_sys.mutex); + return block; + } - if (!recv_recovery_is_on()) { - /* FIXME: Remove the redundant lookup and avoid - the unnecessary invocation of buf_zip_decompress(). - We may have to convert buf_page_t to buf_block_t, - but we are going to initialize the page. 
*/ - return buf_page_get_gen(page_id, zip_size, RW_NO_LATCH, - block, BUF_GET_POSSIBLY_FREED, - __FILE__, __LINE__, mtr); - } - - mutex_exit(&recv_sys.mutex); - block = buf_page_get_with_no_latch(page_id, zip_size, mtr); - mutex_enter(&recv_sys.mutex); - return block; - } - - /* If we get here, the page was not in buf_pool: init it there */ - - DBUG_PRINT("ib_buf", ("create page %u:%u", - page_id.space(), page_id.page_no())); - - block = free_block; - - buf_page_mutex_enter(block); - - buf_page_init(page_id, zip_size, block); - - rw_lock_x_unlock(hash_lock); + /* If we get here, the page was not in buf_pool: init it there */ - /* The block must be put to the LRU list */ - buf_LRU_add_block(&block->page, FALSE); + DBUG_PRINT("ib_buf", ("create page %u:%u", + page_id.space(), page_id.page_no())); - buf_block_buf_fix_inc(block, __FILE__, __LINE__); - buf_pool.stat.n_pages_created++; + block= free_block; + buf_block_buf_fix_inc(block, __FILE__, __LINE__); - if (zip_size) { - /* Prevent race conditions during buf_buddy_alloc(), - which may release and reacquire buf_pool.mutex, - by IO-fixing and X-latching the block. */ - - buf_page_set_io_fix(&block->page, BUF_IO_READ); - rw_lock_x_lock(&block->lock); - - buf_page_mutex_exit(block); - /* buf_pool.mutex may be released and reacquired by - buf_buddy_alloc(). Thus, we must release block->mutex - in order not to break the latching order in - the reacquisition of buf_pool.mutex. We also must - defer this operation until after the block descriptor - has been added to buf_pool.LRU and buf_pool.page_hash. */ - block->page.zip.data = buf_buddy_alloc(zip_size); - buf_page_mutex_enter(block); - - /* To maintain the invariant - block->in_unzip_LRU_list - == buf_page_belongs_to_unzip_LRU(&block->page) - we have to add this block to unzip_LRU after - block->page.zip.data is set. 
*/ - ut_ad(buf_page_belongs_to_unzip_LRU(&block->page)); - buf_unzip_LRU_add_block(block, FALSE); + /* The block must be put to the LRU list */ + block->page.set_state(BUF_BLOCK_FILE_PAGE); + buf_LRU_add_block(&block->page, false); + ut_d(block->page.in_page_hash= true); + HASH_INSERT(buf_page_t, hash, buf_pool.page_hash, page_id.fold(), + &block->page); - buf_page_set_io_fix(&block->page, BUF_IO_NONE); - rw_lock_x_unlock(&block->lock); - } + if (UNIV_UNLIKELY(zip_size)) + { + /* Prevent race conditions during buf_buddy_alloc(), which may + release and reacquire buf_pool.mutex, by IO-fixing and X-latching + the block. */ + block->page.set_io_fix(BUF_IO_READ); + rw_lock_x_lock(&block->lock); + rw_lock_x_unlock(hash_lock); - mutex_exit(&buf_pool.mutex); + /* buf_pool.mutex may be released and reacquired by + buf_buddy_alloc(). We must defer this operation until + after the block descriptor has been added to + buf_pool.LRU and buf_pool.page_hash. */ + block->page.zip.data= buf_buddy_alloc(zip_size); - mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX); + /* To maintain the invariant block->in_unzip_LRU_list == + block->page.belongs_to_unzip_LRU() we have to add this + block to unzip_LRU after block->page.zip.data is set. 
*/ + ut_ad(block->page.belongs_to_unzip_LRU()); + buf_unzip_LRU_add_block(block, FALSE); - buf_page_set_accessed(&block->page); + block->page.set_io_fix(BUF_IO_NONE); + rw_lock_x_unlock(&block->lock); + } + else + rw_lock_x_unlock(hash_lock); - buf_page_mutex_exit(block); + mutex_exit(&buf_pool.mutex); - /* Delete possible entries for the page from the insert buffer: - such can exist if the page belonged to an index which was dropped */ - if (!recv_recovery_is_on()) { - ibuf_merge_or_delete_for_page(NULL, page_id, zip_size, true); - } + mtr->memo_push(block, MTR_MEMO_BUF_FIX); + block->page.set_accessed(); + buf_pool.stat.n_pages_created++; - frame = block->frame; + /* Delete possible entries for the page from the insert buffer: + such can exist if the page belonged to an index which was dropped */ + if (!recv_recovery_is_on()) + ibuf_merge_or_delete_for_page(nullptr, page_id, zip_size, true); - static_assert(FIL_PAGE_PREV % 8 == 0, "alignment"); - static_assert(FIL_PAGE_PREV + 4 == FIL_PAGE_NEXT, "adjacent"); - memset_aligned<8>(frame + FIL_PAGE_PREV, 0xff, 8); - mach_write_to_2(frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED); + static_assert(FIL_PAGE_PREV + 4 == FIL_PAGE_NEXT, "adjacent"); + memset_aligned<8>(block->frame + FIL_PAGE_PREV, 0xff, 8); + mach_write_to_2(block->frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED); - /* FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION is only used on the - following pages: - (1) The first page of the InnoDB system tablespace (page 0:0) - (2) FIL_RTREE_SPLIT_SEQ_NUM on R-tree pages - (3) key_version on encrypted pages (not page 0:0) */ + /* FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION is only used on the + following pages: + (1) The first page of the InnoDB system tablespace (page 0:0) + (2) FIL_RTREE_SPLIT_SEQ_NUM on R-tree pages + (3) key_version on encrypted pages (not page 0:0) */ - memset(frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8); - static_assert(FIL_PAGE_LSN % 8 == 0, "alignment"); - memset_aligned<8>(frame + FIL_PAGE_LSN, 
0, 8); + memset(block->frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8); + memset_aligned<8>(block->frame + FIL_PAGE_LSN, 0, 8); -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG - if (!(++buf_dbg_counter % 5771)) buf_pool.validate(); -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ - return(block); +#ifdef UNIV_DEBUG + if (!(++buf_dbg_counter % 5771)) buf_pool.validate(); +#endif /* UNIV_DEBUG */ + return block; } -/********************************************************************//** -Monitor the buffer page read/write activity, and increment corresponding -counter value if MONITOR_MODULE_BUF_PAGE (module_buf_page) module is -enabled. */ -static -void -buf_page_monitor( -/*=============*/ - const buf_page_t* bpage, /*!< in: pointer to the block */ - enum buf_io_fix io_type)/*!< in: io_fix types */ +/** Monitor the buffer page read/write activity, and increment corresponding +counter value in MONITOR_MODULE_BUF_PAGE. +@param bpage buffer page whose read or write was completed +@param io_type BUF_IO_READ or BUF_IO_WRITE */ +ATTRIBUTE_COLD __attribute__((nonnull)) +void buf_page_monitor(const buf_page_t *bpage, buf_io_fix io_type) { const byte* frame; monitor_id_t counter; - /* If the counter module is not turned on, just return */ - if (!MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)) { - return; - } - - ut_a(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE); + ut_ad(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE); frame = bpage->zip.data ? bpage->zip.data @@ -4778,50 +4010,45 @@ static void buf_mark_space_corrupt(buf_page_t* bpage, const fil_space_t& space) } } -/** Mark a table corrupted. -@param[in] bpage Corrupted page -@param[in] space Corrupted page belongs to tablespace -Also remove the bpage from LRU list. */ -static -void -buf_corrupt_page_release(buf_page_t* bpage, const fil_space_t* space) +/** Release and evict a corrupted page. 
+@param bpage page that was being read */ +void buf_pool_t::corrupted_evict(buf_page_t *bpage) { - const ibool uncompressed = (buf_page_get_state(bpage) - == BUF_BLOCK_FILE_PAGE); - page_id_t old_page_id = bpage->id; + const page_id_t id(bpage->id()); + rw_lock_t *hash_lock= hash_lock_get(id); - /* First unfix and release lock on the bpage */ - mutex_enter(&buf_pool.mutex); - mutex_enter(buf_page_get_mutex(bpage)); - ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_READ); - ut_ad(bpage->id.space() == space->id); + mutex_enter(&mutex); + rw_lock_x_lock(hash_lock); - /* buf_fix_count can be greater than zero. Because other thread - can wait in buf_page_wait_read() for the page to be read. */ + ut_ad(bpage->io_fix() == BUF_IO_READ); + ut_ad(!bpage->oldest_modification()); + bpage->set_corrupt_id(); - bpage->id.set_corrupt_id(); - /* Set BUF_IO_NONE before we remove the block from LRU list */ - buf_page_set_io_fix(bpage, BUF_IO_NONE); + if (bpage->state() == BUF_BLOCK_FILE_PAGE) + rw_lock_x_unlock_gen(&reinterpret_cast<buf_block_t*>(bpage)->lock, + BUF_IO_READ); - if (uncompressed) { - rw_lock_x_unlock_gen( - &((buf_block_t*) bpage)->lock, - BUF_IO_READ); - } + bpage->io_unfix(); - mutex_exit(buf_page_get_mutex(bpage)); - - if (!srv_force_recovery) { - buf_mark_space_corrupt(bpage, *space); - } + /* remove from LRU and page_hash */ + buf_LRU_free_one_page(bpage, id, hash_lock); + mutex_exit(&mutex); - /* After this point bpage can't be referenced. */ - buf_LRU_free_one_page(bpage, old_page_id); + ut_d(auto n=) n_pend_reads--; + ut_ad(n > 0); +} - ut_ad(buf_pool.n_pend_reads > 0); - buf_pool.n_pend_reads--; +/** Mark a table corrupted. +@param[in] bpage Corrupted page +@param[in] node data file +Also remove the bpage from LRU list. 
*/ +static void buf_corrupt_page_release(buf_page_t *bpage, const fil_node_t &node) +{ + ut_ad(bpage->id().space() == node.space->id); + buf_pool.corrupted_evict(bpage); - mutex_exit(&buf_pool.mutex); + if (!srv_force_recovery) + buf_mark_space_corrupt(bpage, *node.space); } /** Check if the encrypted page is corrupted for the full crc32 format. @@ -4847,21 +4074,23 @@ static bool buf_page_full_crc32_is_corrupted(ulint space_id, const byte* d, corrupted page. Note that we can't be 100% sure if page is corrupted or decrypt/decompress just failed. @param[in,out] bpage page -@param[in,out] space tablespace from fil_space_acquire_for_io() +@param[in] node data file @return whether the operation succeeded @retval DB_SUCCESS if page has been read and is not corrupted @retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted @retval DB_DECRYPTION_FAILED if page post encryption checksum matches but after decryption normal page checksum does not match. @retval DB_TABLESPACE_DELETED if accessed tablespace is not found */ -static dberr_t buf_page_check_corrupt(buf_page_t* bpage, fil_space_t* space) +static dberr_t buf_page_check_corrupt(buf_page_t *bpage, + const fil_node_t &node) { - ut_ad(space->pending_io()); + ut_ad(node.space->pending_io()); byte* dst_frame = (bpage->zip.data) ? bpage->zip.data : ((buf_block_t*) bpage)->frame; dberr_t err = DB_SUCCESS; - uint key_version = buf_page_get_key_version(dst_frame, space->flags); + uint key_version = buf_page_get_key_version(dst_frame, + node.space->flags); /* In buf_decrypt_after_read we have either decrypted the page if page post encryption checksum matches and used key_id is found @@ -4869,33 +4098,35 @@ static dberr_t buf_page_check_corrupt(buf_page_t* bpage, fil_space_t* space) not decrypted and it could be either encrypted and corrupted or corrupted or good page. If we decrypted, there page could still be corrupted if used key does not match. 
*/ - const bool seems_encrypted = !space->full_crc32() && key_version - && space->crypt_data - && space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED; - ut_ad(space->purpose != FIL_TYPE_TEMPORARY || space->full_crc32()); + const bool seems_encrypted = !node.space->full_crc32() && key_version + && node.space->crypt_data + && node.space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED; + ut_ad(node.space->purpose != FIL_TYPE_TEMPORARY || + node.space->full_crc32()); /* If traditional checksums match, we assume that page is not anymore encrypted. */ - if (space->full_crc32() + if (node.space->full_crc32() && !buf_is_zeroes(span<const byte>(dst_frame, - space->physical_size())) - && (key_version || space->is_compressed() - || space->purpose == FIL_TYPE_TEMPORARY)) { + node.space->physical_size())) + && (key_version || node.space->is_compressed() + || node.space->purpose == FIL_TYPE_TEMPORARY)) { if (buf_page_full_crc32_is_corrupted( - space->id, dst_frame, space->is_compressed())) { + bpage->id().space(), dst_frame, + node.space->is_compressed())) { err = DB_PAGE_CORRUPTED; } - } else if (buf_page_is_corrupted(true, dst_frame, space->flags)) { + } else if (buf_page_is_corrupted(true, dst_frame, node.space->flags)) { err = DB_PAGE_CORRUPTED; } if (seems_encrypted && err == DB_PAGE_CORRUPTED - && bpage->id.page_no() != 0) { + && bpage->id().page_no() != 0) { err = DB_DECRYPTION_FAILED; ib::error() - << "The page " << bpage->id << " in file '" - << space->chain.start->name + << "The page " << bpage->id() + << " in file '" << node.name << "' cannot be decrypted."; ib::info() @@ -4904,7 +4135,7 @@ static dberr_t buf_page_check_corrupt(buf_page_t* bpage, fil_space_t* space) << " is not found or" " used encryption algorithm or method does not match."; - if (bpage->id.space() != TRX_SYS_SPACE) { + if (bpage->id().space() != TRX_SYS_SPACE) { ib::info() << "Marking tablespace as missing." 
" You may drop this table or" @@ -4916,284 +4147,164 @@ static dberr_t buf_page_check_corrupt(buf_page_t* bpage, fil_space_t* space) return (err); } -/** Complete a read or write request of a file page to or from the buffer pool. -@param[in,out] bpage page to complete -@param[in] dblwr whether the doublewrite buffer was used (on write) -@param[in] evict whether or not to evict the page from LRU list +/** Complete a read request of a file page to buf_pool. +@param bpage recently read page +@param node data file @return whether the operation succeeded -@retval DB_SUCCESS always when writing, or if a read page was OK -@retval DB_TABLESPACE_DELETED if the tablespace does not exist -@retval DB_PAGE_CORRUPTED if the checksum fails on a page read -@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but - after decryption normal page checksum does - not match */ -UNIV_INTERN -dberr_t -buf_page_io_complete(buf_page_t* bpage, bool dblwr, bool evict) +@retval DB_SUCCESS always when writing, or if a read page was OK +@retval DB_PAGE_CORRUPTED if the checksum fails on a page read +@retval DB_DECRYPTION_FAILED if the page cannot be decrypted */ +dberr_t buf_page_read_complete(buf_page_t *bpage, const fil_node_t &node) { - enum buf_io_fix io_type; - const bool uncompressed = (buf_page_get_state(bpage) - == BUF_BLOCK_FILE_PAGE); - ut_a(buf_page_in_file(bpage)); - - /* We do not need protect io_fix here by mutex to read - it because this is the only function where we can change the value - from BUF_IO_READ or BUF_IO_WRITE to some other value, and our code - ensures that this is the only thread that handles the i/o for this - block. */ - - io_type = buf_page_get_io_fix(bpage); - ut_ad(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE); - ut_ad(!!bpage->zip.ssize == (bpage->zip.data != NULL)); - ut_ad(uncompressed || bpage->zip.data); - - if (io_type == BUF_IO_READ) { - ulint read_page_no = 0; - ulint read_space_id = 0; - byte* frame = bpage->zip.data - ? 
bpage->zip.data - : reinterpret_cast<buf_block_t*>(bpage)->frame; - ut_ad(frame); - fil_space_t* space = fil_space_acquire_for_io( - bpage->id.space()); - if (!space) { - return DB_TABLESPACE_DELETED; - } - - dberr_t err; - - if (!buf_page_decrypt_after_read(bpage, space)) { - err = DB_DECRYPTION_FAILED; - goto database_corrupted; - } - - if (bpage->zip.data && uncompressed) { - buf_pool.n_pend_unzip++; - ibool ok = buf_zip_decompress((buf_block_t*) bpage, - FALSE); - buf_pool.n_pend_unzip--; - - if (!ok) { - ib::info() << "Page " - << bpage->id - << " zip_decompress failure."; - - err = DB_PAGE_CORRUPTED; - goto database_corrupted; - } - } + const page_id_t id(bpage->id()); + ut_ad(bpage->in_file()); + ut_ad(id.space() || !buf_dblwr_page_inside(id.page_no())); + ut_ad(id.space() == node.space->id); + ut_ad(bpage->zip_size() == node.space->zip_size()); + + /* We do not need protect io_fix here by mutex to read it because + this and buf_page_write_complete() are the only functions where we can + change the value from BUF_IO_READ or BUF_IO_WRITE to some other + value, and our code ensures that this is the only thread that handles + the i/o for this block. */ + + ut_ad(bpage->io_fix() == BUF_IO_READ); + ut_ad(!!bpage->zip.ssize == !!bpage->zip.data); + ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE || bpage->zip.data); + + const byte *frame= bpage->zip.data + ? bpage->zip.data + : reinterpret_cast<buf_block_t*>(bpage)->frame; + ut_ad(frame); + + dberr_t err; + if (!buf_page_decrypt_after_read(bpage, node)) + { + err= DB_DECRYPTION_FAILED; + goto database_corrupted; + } - /* If this page is not uninitialized and not in the - doublewrite buffer, then the page number and space id - should be the same as in block. 
*/ - read_page_no = mach_read_from_4(frame + FIL_PAGE_OFFSET); - read_space_id = mach_read_from_4( - frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); - - if (bpage->id.space() == TRX_SYS_SPACE - && buf_dblwr_page_inside(bpage->id.page_no())) { - - ib::error() << "Reading page " << bpage->id - << ", which is in the doublewrite buffer!"; - - } else if (read_space_id == 0 && read_page_no == 0) { - /* This is likely an uninitialized page. */ - } else if (((!space->full_crc32() - || bpage->id.space() != TRX_SYS_SPACE) - && bpage->id.space() != read_space_id) - || bpage->id.page_no() != read_page_no) { - /* We do not compare space_id to read_space_id - in the system tablespace unless space->full_crc32(), - because the field was written as garbage before - MySQL 4.1.1, which introduced support for - innodb_file_per_table. */ - - if (space->full_crc32() - && *reinterpret_cast<uint32_t*> - (&frame[FIL_PAGE_FCRC32_KEY_VERSION]) - && space->crypt_data - && space->crypt_data->type - != CRYPT_SCHEME_UNENCRYPTED) { - ib::error() << "Cannot decrypt " << bpage->id; - err = DB_DECRYPTION_FAILED; - goto release_page; - } + if (bpage->zip.data && bpage->state() == BUF_BLOCK_FILE_PAGE) + { + buf_pool.n_pend_unzip++; + auto ok= buf_zip_decompress(reinterpret_cast<buf_block_t*>(bpage), FALSE); + buf_pool.n_pend_unzip--; - ib::error() << "Space id and page no stored in " - "the page, read in are " - << page_id_t(read_space_id, read_page_no) - << ", should be " << bpage->id; - } + if (!ok) + { + ib::info() << "Page " << id << " zip_decompress failure."; + err= DB_PAGE_CORRUPTED; + goto database_corrupted; + } + } - err = buf_page_check_corrupt(bpage, space); + { + const page_id_t read_id(mach_read_from_4(frame + FIL_PAGE_SPACE_ID), + mach_read_from_4(frame + FIL_PAGE_OFFSET)); + + if (read_id == id); + else if (read_id == page_id_t(0, 0)) + /* This is likely an uninitialized page. 
*/; + else if (!node.space->full_crc32() && + page_id_t(0, read_id.page_no()) == id) + /* FIL_PAGE_SPACE_ID was written as garbage in the system tablespace + before MySQL 4.1.1, which introduced innodb_file_per_table. */; + else if (node.space->full_crc32() && + *reinterpret_cast<const uint32_t*> + (&frame[FIL_PAGE_FCRC32_KEY_VERSION]) && + node.space->crypt_data && + node.space->crypt_data->type != CRYPT_SCHEME_UNENCRYPTED) + { + ib::error() << "Cannot decrypt " << id; + err= DB_DECRYPTION_FAILED; + goto release_page; + } + else + ib::error() << "Space id and page no stored in the page, read in are " + << read_id << ", should be " << id; + } - if (err != DB_SUCCESS) { + err= buf_page_check_corrupt(bpage, node); + if (err != DB_SUCCESS) + { database_corrupted: - /* Not a real corruption if it was triggered by - error injection */ - DBUG_EXECUTE_IF( - "buf_page_import_corrupt_failure", - if (!is_predefined_tablespace( - bpage->id.space())) { - buf_corrupt_page_release(bpage, space); - ib::info() << "Simulated IMPORT " - "corruption"; - space->release_for_io(); - return(err); - } - err = DB_SUCCESS; - goto page_not_corrupt; - ); - - if (uncompressed && bpage->zip.data) { - memset(reinterpret_cast<buf_block_t*>(bpage) - ->frame, 0, srv_page_size); - } - - if (err == DB_PAGE_CORRUPTED) { - ib::error() - << "Database page corruption on disk" - " or a failed file read of tablespace " - << space->name << " page " << bpage->id - << ". You may have to recover from " - << "a backup."; - - buf_page_print(frame, bpage->zip_size()); - - ib::info() - << "It is also possible that your" - " operating system has corrupted" - " its own file cache and rebooting" - " your computer removes the error." - " If the corrupt page is an index page." - " You can also try to fix the" - " corruption by dumping, dropping," - " and reimporting the corrupt table." - " You can use CHECK TABLE to scan" - " your table for corruption. 
" - << FORCE_RECOVERY_MSG; - } + /* Not a real corruption if it was triggered by error injection */ + DBUG_EXECUTE_IF("buf_page_import_corrupt_failure", + if (!is_predefined_tablespace(id.space())) + { + buf_corrupt_page_release(bpage, node); + ib::info() << "Simulated IMPORT corruption"; + return err; + } + err= DB_SUCCESS; + goto page_not_corrupt;); + + if (bpage->zip.data && bpage->state() == BUF_BLOCK_FILE_PAGE) + memset(reinterpret_cast<buf_block_t*>(bpage)->frame, 0, srv_page_size); + + if (err == DB_PAGE_CORRUPTED) + { + ib::error() << "Database page corruption on disk" + " or a failed read of file '" + << node.name << "' page " << id + << ". You may have to recover from a backup."; - if (!srv_force_recovery) { + buf_page_print(frame, bpage->zip_size()); - /* If page space id is larger than TRX_SYS_SPACE - (0), we will attempt to mark the corresponding - table as corrupted instead of crashing server */ - if (bpage->id.space() == TRX_SYS_SPACE) { - ib::fatal() << "Aborting because of" - " a corrupt database page."; - } + ib::info() << " You can use CHECK TABLE to scan" + " your table for corruption. " + << FORCE_RECOVERY_MSG; + } - buf_corrupt_page_release(bpage, space); - space->release_for_io(); - return(err); - } - } + if (!srv_force_recovery) + { + /* If the corruption is in the system tablespace, we will + intentionally crash the server. 
*/ + if (id.space() == TRX_SYS_SPACE) + ib::fatal() << "Aborting because of a corrupt database page."; + buf_corrupt_page_release(bpage, node); + return err; + } + } - DBUG_EXECUTE_IF("buf_page_import_corrupt_failure", - page_not_corrupt: bpage = bpage; ); + DBUG_EXECUTE_IF("buf_page_import_corrupt_failure", + page_not_corrupt: bpage= bpage; ); - if (err == DB_PAGE_CORRUPTED - || err == DB_DECRYPTION_FAILED) { + if (err == DB_PAGE_CORRUPTED || err == DB_DECRYPTION_FAILED) + { release_page: - const page_id_t corrupt_page_id = bpage->id; - - buf_corrupt_page_release(bpage, space); - - if (recv_recovery_is_on()) { - recv_sys.free_corrupted_page(corrupt_page_id); - } - - space->release_for_io(); - return err; - } - - if (recv_recovery_is_on()) { - recv_recover_page(space, bpage); - } - - if (uncompressed - && !recv_no_ibuf_operations - && (bpage->id.space() == 0 - || !is_predefined_tablespace(bpage->id.space())) - && fil_page_get_type(frame) == FIL_PAGE_INDEX - && page_is_leaf(frame) - && ibuf_page_exists(bpage->id, bpage->zip_size())) { - bpage->ibuf_exist = true; - } - - space->release_for_io(); - } else { - /* io_type == BUF_IO_WRITE */ - if (bpage->slot) { - /* Mark slot free */ - bpage->slot->release(); - bpage->slot = NULL; - } - } - - BPageMutex* block_mutex = buf_page_get_mutex(bpage); - mutex_enter(&buf_pool.mutex); - mutex_enter(block_mutex); - - /* Because this thread which does the unlocking is not the same that - did the locking, we use a pass value != 0 in unlock, which simply - removes the newest lock debug record, without checking the thread - id. */ - - buf_page_set_io_fix(bpage, BUF_IO_NONE); - buf_page_monitor(bpage, io_type); - - if (io_type == BUF_IO_READ) { - /* NOTE that the call to ibuf may have moved the ownership of - the x-latch to this OS thread: do not let this confuse you in - debugging! 
*/ - - ut_ad(buf_pool.n_pend_reads > 0); - buf_pool.n_pend_reads--; - buf_pool.stat.n_pages_read++; + buf_corrupt_page_release(bpage, node); + if (recv_recovery_is_on()) + recv_sys.free_corrupted_page(id); + return err; + } - if (uncompressed) { - rw_lock_x_unlock_gen(&((buf_block_t*) bpage)->lock, - BUF_IO_READ); - } + if (recv_recovery_is_on()) + recv_recover_page(node.space, bpage); - mutex_exit(block_mutex); - } else { - /* Write means a flush operation: call the completion - routine in the flush system */ + if (bpage->state() == BUF_BLOCK_FILE_PAGE && !recv_no_ibuf_operations && + (!id.space() || !is_predefined_tablespace(id.space())) && + fil_page_get_type(frame) == FIL_PAGE_INDEX && + page_is_leaf(frame) && ibuf_page_exists(id, bpage->zip_size())) + bpage->ibuf_exist= true; - buf_flush_write_complete(bpage, dblwr); + if (UNIV_UNLIKELY(MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE))) + buf_page_monitor(bpage, BUF_IO_READ); + DBUG_PRINT("ib_buf", ("read page %u:%u", + id.space(), id.page_no())); - if (uncompressed) { - rw_lock_sx_unlock_gen(&((buf_block_t*) bpage)->lock, - BUF_IO_WRITE); - } + /* Because this thread which does the unlocking might not be the same that + did the locking, we use a pass value != 0 in unlock, which simply + removes the newest lock debug record, without checking the thread id. */ + if (bpage->state() == BUF_BLOCK_FILE_PAGE) + rw_lock_x_unlock_gen(&((buf_block_t*) bpage)->lock, BUF_IO_READ); + bpage->io_unfix(); - buf_pool.stat.n_pages_written++; + ut_d(auto n=) buf_pool.n_pend_reads--; + ut_ad(n > 0); + buf_pool.stat.n_pages_read++; - /* We decide whether or not to evict the page from the - LRU list based on the flush_type. - * BUF_FLUSH_LIST: don't evict - * BUF_FLUSH_LRU: always evict - * BUF_FLUSH_SINGLE_PAGE: eviction preference is passed - by the caller explicitly. 
*/ - if (buf_page_get_flush_type(bpage) == BUF_FLUSH_LRU) { - evict = true; - } - - mutex_exit(block_mutex); - - if (evict) { - buf_LRU_free_page(bpage, true); - } - } - - DBUG_PRINT("ib_buf", ("%s page %u:%u", - io_type == BUF_IO_READ ? "read" : "wrote", - bpage->id.space(), bpage->id.page_no())); - mutex_exit(&buf_pool.mutex); - return DB_SUCCESS; + return DB_SUCCESS; } #ifdef UNIV_DEBUG @@ -5206,7 +4317,7 @@ void buf_pool_t::assert_all_freed() const chunk_t *chunk= chunks; for (auto i= n_chunks; i--; chunk++) if (const buf_block_t* block= chunk->not_freed()) - ib::fatal() << "Page " << block->page.id << " still fixed or dirty"; + ib::fatal() << "Page " << block->page.id() << " still fixed or dirty"; mutex_exit(&mutex); } #endif /* UNIV_DEBUG */ @@ -5223,29 +4334,27 @@ All pages must be in a replaceable state (not modified or latched). */ void buf_pool_invalidate() { mutex_enter(&buf_pool.mutex); + ut_ad(!buf_pool.init_flush[IORequest::LRU]); + ut_ad(!buf_pool.init_flush[IORequest::FLUSH_LIST]); + ut_ad(!buf_pool.init_flush[IORequest::SINGLE_PAGE]); + ut_ad(!buf_pool.n_flush[IORequest::SINGLE_PAGE]); - for (unsigned i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) { - - /* As this function is called during startup and - during redo application phase during recovery, InnoDB - is single threaded (apart from IO helper threads) at - this stage. No new write batch can be in intialization - stage at this point. */ - ut_ad(!buf_pool.init_flush[i]); - - /* However, it is possible that a write batch that has - been posted earlier is still not complete. For buffer - pool invalidation to proceed we must ensure there is NO - write activity happening. 
*/ - if (buf_pool.n_flush[i] > 0) { - buf_flush_t type = buf_flush_t(i); + if (buf_pool.n_flush[IORequest::LRU]) { + mutex_exit(&buf_pool.mutex); + buf_flush_wait_batch_end(true); + mutex_enter(&buf_pool.mutex); + } - mutex_exit(&buf_pool.mutex); - buf_flush_wait_batch_end(type); - mutex_enter(&buf_pool.mutex); - } + if (buf_pool.n_flush[IORequest::FLUSH_LIST]) { + mutex_exit(&buf_pool.mutex); + buf_flush_wait_batch_end(false); + mutex_enter(&buf_pool.mutex); } + /* It is possible that a write batch that has been posted + earlier is still not complete. For buffer pool invalidation to + proceed we must ensure there is NO write activity happening. */ + ut_d(mutex_exit(&buf_pool.mutex)); ut_d(buf_pool.assert_all_freed()); ut_d(mutex_enter(&buf_pool.mutex)); @@ -5264,85 +4373,38 @@ void buf_pool_invalidate() mutex_exit(&buf_pool.mutex); } -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +#ifdef UNIV_DEBUG /** Validate the buffer pool. */ void buf_pool_t::validate() { - buf_page_t* b; - chunk_t* chunk; - ulint i; - ulint n_lru_flush = 0; - ulint n_page_flush = 0; - ulint n_list_flush = 0; ulint n_lru = 0; - ulint n_flush = 0; + ulint n_flushing = 0; ulint n_free = 0; ulint n_zip = 0; - mutex_enter(&buf_pool.mutex); - hash_lock_x_all(buf_pool.page_hash); + mutex_enter(&mutex); + page_hash_lock_all(); - chunk = buf_pool.chunks; + chunk_t* chunk = chunks; /* Check the uncompressed blocks. */ - for (i = buf_pool.n_chunks; i--; chunk++) { + for (auto i = n_chunks; i--; chunk++) { ulint j; buf_block_t* block = chunk->blocks; for (j = chunk->size; j--; block++) { - - buf_page_mutex_enter(block); - - switch (buf_block_get_state(block)) { - case BUF_BLOCK_POOL_WATCH: + switch (block->page.state()) { case BUF_BLOCK_ZIP_PAGE: - case BUF_BLOCK_ZIP_DIRTY: /* These should only occur on zip_clean, zip_free[], or flush_list. 
*/ ut_error; break; case BUF_BLOCK_FILE_PAGE: - ut_ad(buf_page_hash_get_low(block->page.id) + ut_ad(page_hash_get_low(block->page.id()) == &block->page); - - switch (buf_page_get_io_fix(&block->page)) { - case BUF_IO_NONE: - break; - - case BUF_IO_WRITE: - switch (buf_page_get_flush_type( - &block->page)) { - case BUF_FLUSH_LRU: - n_lru_flush++; - goto assert_s_latched; - case BUF_FLUSH_SINGLE_PAGE: - n_page_flush++; -assert_s_latched: - ut_a(rw_lock_is_locked( - &block->lock, - RW_LOCK_S) - || rw_lock_is_locked( - &block->lock, - RW_LOCK_SX)); - break; - case BUF_FLUSH_LIST: - n_list_flush++; - break; - default: - ut_error; - } - break; - case BUF_IO_READ: - ut_ad(rw_lock_is_locked(&block->lock, - RW_LOCK_X)); - break; - case BUF_IO_PIN: - break; - } - n_lru++; break; @@ -5350,25 +4412,21 @@ assert_s_latched: n_free++; break; - case BUF_BLOCK_READY_FOR_USE: case BUF_BLOCK_MEMORY: case BUF_BLOCK_REMOVE_HASH: /* do nothing */ break; } - - buf_page_mutex_exit(block); } } - mutex_enter(&buf_pool.zip_mutex); - /* Check clean compressed-only blocks. */ - for (b = UT_LIST_GET_FIRST(buf_pool.zip_clean); b; + for (buf_page_t* b = UT_LIST_GET_FIRST(zip_clean); b; b = UT_LIST_GET_NEXT(list, b)) { - ut_ad(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE); - switch (buf_page_get_io_fix(b)) { + ut_ad(b->state() == BUF_BLOCK_ZIP_PAGE); + ut_ad(!b->oldest_modification()); + switch (b->io_fix()) { case BUF_IO_NONE: case BUF_IO_PIN: /* All clean blocks should be I/O-unfixed. */ @@ -5384,102 +4442,67 @@ assert_s_latched: break; } - /* It is OK to read oldest_modification here because - we have acquired buf_pool.zip_mutex above which acts - as the 'block->mutex' for these bpages. */ - ut_ad(!b->oldest_modification); - ut_ad(buf_page_hash_get_low(b->id) == b); + ut_ad(page_hash_get_low(b->id()) == b); n_lru++; n_zip++; } /* Check dirty blocks. 
*/ - mutex_enter(&buf_pool.flush_list_mutex); - for (b = UT_LIST_GET_FIRST(buf_pool.flush_list); b; + mutex_enter(&flush_list_mutex); + for (buf_page_t* b = UT_LIST_GET_FIRST(flush_list); b; b = UT_LIST_GET_NEXT(list, b)) { - ut_ad(b->in_flush_list); - ut_ad(b->oldest_modification); - n_flush++; + ut_ad(b->oldest_modification()); + n_flushing++; - switch (buf_page_get_state(b)) { - case BUF_BLOCK_ZIP_DIRTY: + switch (b->state()) { + case BUF_BLOCK_ZIP_PAGE: n_lru++; n_zip++; - switch (buf_page_get_io_fix(b)) { - case BUF_IO_NONE: - case BUF_IO_READ: - case BUF_IO_PIN: - break; - case BUF_IO_WRITE: - switch (buf_page_get_flush_type(b)) { - case BUF_FLUSH_LRU: - n_lru_flush++; - break; - case BUF_FLUSH_SINGLE_PAGE: - n_page_flush++; - break; - case BUF_FLUSH_LIST: - n_list_flush++; - break; - default: - ut_error; - } - break; - } break; case BUF_BLOCK_FILE_PAGE: /* uncompressed page */ break; - case BUF_BLOCK_POOL_WATCH: - case BUF_BLOCK_ZIP_PAGE: case BUF_BLOCK_NOT_USED: - case BUF_BLOCK_READY_FOR_USE: case BUF_BLOCK_MEMORY: case BUF_BLOCK_REMOVE_HASH: ut_error; break; } - ut_ad(buf_page_hash_get_low(b->id) == b); + ut_ad(page_hash_get_low(b->id()) == b); } - ut_ad(UT_LIST_GET_LEN(buf_pool.flush_list) == n_flush); + ut_ad(UT_LIST_GET_LEN(flush_list) == n_flushing); - hash_unlock_x_all(buf_pool.page_hash); - mutex_exit(&buf_pool.flush_list_mutex); - - mutex_exit(&buf_pool.zip_mutex); + page_hash_unlock_all(); + mutex_exit(&flush_list_mutex); - if (buf_pool.curr_size == buf_pool.old_size - && n_lru + n_free > buf_pool.curr_size + n_zip) { + if (curr_size == old_size + && n_lru + n_free > curr_size + n_zip) { ib::fatal() << "n_LRU " << n_lru << ", n_free " << n_free - << ", pool " << buf_pool.curr_size + << ", pool " << curr_size << " zip " << n_zip << ". 
Aborting..."; } - ut_ad(UT_LIST_GET_LEN(buf_pool.LRU) == n_lru); + ut_ad(UT_LIST_GET_LEN(LRU) == n_lru); - if (buf_pool.curr_size == buf_pool.old_size - && UT_LIST_GET_LEN(buf_pool.free) != n_free) { + if (curr_size == old_size + && UT_LIST_GET_LEN(free) != n_free) { ib::fatal() << "Free list len " - << UT_LIST_GET_LEN(buf_pool.free) + << UT_LIST_GET_LEN(free) << ", free blocks " << n_free << ". Aborting..."; } - ut_ad(buf_pool.n_flush[BUF_FLUSH_LIST] == n_list_flush); - ut_ad(buf_pool.n_flush[BUF_FLUSH_LRU] == n_lru_flush); - ut_ad(buf_pool.n_flush[BUF_FLUSH_SINGLE_PAGE] == n_page_flush); - - mutex_exit(&buf_pool.mutex); + mutex_exit(&mutex); ut_d(buf_LRU_validate()); ut_d(buf_flush_validate()); } -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#endif /* UNIV_DEBUG */ -#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG /** Write information of the buf_pool to the error log. */ void buf_pool_t::print() { @@ -5511,9 +4534,9 @@ void buf_pool_t::print() << UT_LIST_GET_LEN(flush_list) << ", n pending decompressions=" << n_pend_unzip << ", n pending reads=" << n_pend_reads - << ", n pending flush LRU=" << n_flush[BUF_FLUSH_LRU] - << " list=" << n_flush[BUF_FLUSH_LIST] - << " single page=" << n_flush[BUF_FLUSH_SINGLE_PAGE] + << ", n pending flush LRU=" << n_flush[IORequest::LRU] + << " list=" << n_flush[IORequest::FLUSH_LIST] + << " single page=" << n_flush[IORequest::SINGLE_PAGE] << ", pages made young=" << stat.n_pages_made_young << ", not young=" << stat.n_pages_not_made_young << ", pages read=" << stat.n_pages_read @@ -5583,7 +4606,7 @@ void buf_pool_t::print() validate(); } -#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */ +#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */ #ifdef UNIV_DEBUG /** @return the number of latched pages in the buffer pool */ @@ -5598,41 +4621,27 @@ ulint buf_get_latched_pages_number() auto chunk = buf_pool.chunks; for (i = buf_pool.n_chunks; i--; chunk++) { - 
buf_block_t* block; - ulint j; - - block = chunk->blocks; - - for (j = chunk->size; j--; block++) { - if (buf_block_get_state(block) - != BUF_BLOCK_FILE_PAGE) { - - continue; - } + buf_block_t* block= chunk->blocks; - buf_page_mutex_enter(block); + for (auto j= chunk->size; j--; block++) { + if (block->page.state() == BUF_BLOCK_FILE_PAGE + && (block->page.buf_fix_count() + || block->page.io_fix() != BUF_IO_NONE)) { - if (block->page.buf_fix_count != 0 - || buf_page_get_io_fix(&block->page) - != BUF_IO_NONE) { fixed_pages_number++; } - - buf_page_mutex_exit(block); } } - mutex_enter(&buf_pool.zip_mutex); - /* Traverse the lists of clean and dirty compressed-only blocks. */ for (b = UT_LIST_GET_FIRST(buf_pool.zip_clean); b; b = UT_LIST_GET_NEXT(list, b)) { - ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE); - ut_a(buf_page_get_io_fix(b) != BUF_IO_WRITE); + ut_a(b->state() == BUF_BLOCK_ZIP_PAGE); + ut_a(!b->oldest_modification()); + ut_a(b->io_fix() != BUF_IO_WRITE); - if (b->buf_fix_count != 0 - || buf_page_get_io_fix(b) != BUF_IO_NONE) { + if (b->buf_fix_count() || b->io_fix() != BUF_IO_NONE) { fixed_pages_number++; } } @@ -5640,31 +4649,26 @@ ulint buf_get_latched_pages_number() mutex_enter(&buf_pool.flush_list_mutex); for (b = UT_LIST_GET_FIRST(buf_pool.flush_list); b; b = UT_LIST_GET_NEXT(list, b)) { - ut_ad(b->in_flush_list); + ut_ad(b->oldest_modification()); - switch (buf_page_get_state(b)) { - case BUF_BLOCK_ZIP_DIRTY: - if (b->buf_fix_count != 0 - || buf_page_get_io_fix(b) != BUF_IO_NONE) { + switch (b->state()) { + case BUF_BLOCK_ZIP_PAGE: + if (b->buf_fix_count() || b->io_fix() != BUF_IO_NONE) { fixed_pages_number++; } - break; + continue; case BUF_BLOCK_FILE_PAGE: /* uncompressed page */ - break; - case BUF_BLOCK_POOL_WATCH: - case BUF_BLOCK_ZIP_PAGE: + continue; case BUF_BLOCK_NOT_USED: - case BUF_BLOCK_READY_FOR_USE: case BUF_BLOCK_MEMORY: case BUF_BLOCK_REMOVE_HASH: - ut_error; break; } + ut_error; } mutex_exit(&buf_pool.flush_list_mutex); - 
mutex_exit(&buf_pool.zip_mutex); mutex_exit(&buf_pool.mutex); return(fixed_pages_number); @@ -5696,16 +4700,16 @@ void buf_stats_get_pool_info(buf_pool_info_t *pool_info) pool_info->n_pend_reads = buf_pool.n_pend_reads; pool_info->n_pending_flush_lru = - (buf_pool.n_flush[BUF_FLUSH_LRU] - + buf_pool.init_flush[BUF_FLUSH_LRU]); + (buf_pool.n_flush[IORequest::LRU] + + buf_pool.init_flush[IORequest::LRU]); pool_info->n_pending_flush_list = - (buf_pool.n_flush[BUF_FLUSH_LIST] - + buf_pool.init_flush[BUF_FLUSH_LIST]); + (buf_pool.n_flush[IORequest::FLUSH_LIST] + + buf_pool.init_flush[IORequest::FLUSH_LIST]); pool_info->n_pending_flush_single_page = - (buf_pool.n_flush[BUF_FLUSH_SINGLE_PAGE] - + buf_pool.init_flush[BUF_FLUSH_SINGLE_PAGE]); + (buf_pool.n_flush[IORequest::SINGLE_PAGE] + + buf_pool.init_flush[IORequest::SINGLE_PAGE]); mutex_exit(&buf_pool.flush_list_mutex); @@ -5935,9 +4939,9 @@ ulint buf_pool_check_no_pending_io() ulint pending_io = buf_pool.n_pend_reads; mutex_enter(&buf_pool.mutex); pending_io += - + buf_pool.n_flush[BUF_FLUSH_LRU] - + buf_pool.n_flush[BUF_FLUSH_SINGLE_PAGE] - + buf_pool.n_flush[BUF_FLUSH_LIST]; + + buf_pool.n_flush[IORequest::LRU] + + buf_pool.n_flush[IORequest::FLUSH_LIST] + + buf_pool.n_flush[IORequest::SINGLE_PAGE]; mutex_exit(&buf_pool.mutex); return(pending_io); @@ -5955,17 +4959,6 @@ std::ostream& operator<<(std::ostream &out, const page_id_t page_id) } /** -Should we punch hole to deallocate unused portion of the page. -@param[in] bpage Page control block -@return true if punch hole should be used, false if not */ -bool -buf_page_should_punch_hole( - const buf_page_t* bpage) -{ - return bpage->real_size != bpage->physical_size(); -} - -/** Calculate the length of trim (punch_hole) operation. 
@param[in] bpage Page control block @param[in] write_length Write length diff --git a/storage/innobase/buf/buf0dblwr.cc b/storage/innobase/buf/buf0dblwr.cc index 38160a9e4a7..6a2d7fe4f12 100644 --- a/storage/innobase/buf/buf0dblwr.cc +++ b/storage/innobase/buf/buf0dblwr.cc @@ -82,18 +82,6 @@ inline buf_block_t *buf_dblwr_trx_sys_get(mtr_t *mtr) return block; } -/********************************************************************//** -Flush a batch of writes to the datafiles that have already been -written to the dblwr buffer on disk. */ -void -buf_dblwr_sync_datafiles() -/*======================*/ -{ - /* Wait that all async writes to tablespaces have been posted to - the OS */ - os_aio_wait_until_no_pending_writes(); -} - /****************************************************************//** Creates or initialializes the doublewrite buffer at a database start. */ static void buf_dblwr_init(const byte *doublewrite) @@ -125,15 +113,12 @@ static void buf_dblwr_init(const byte *doublewrite) buf_dblwr->block2 = mach_read_from_4( doublewrite + TRX_SYS_DOUBLEWRITE_BLOCK2); - buf_dblwr->in_use = static_cast<bool*>( - ut_zalloc_nokey(buf_size * sizeof(bool))); - buf_dblwr->write_buf = static_cast<byte*>( aligned_malloc(buf_size << srv_page_size_shift, srv_page_size)); - buf_dblwr->buf_block_arr = static_cast<buf_page_t**>( - ut_zalloc_nokey(buf_size * sizeof(void*))); + buf_dblwr->buf_block_arr = static_cast<buf_dblwr_t::element*>( + ut_zalloc_nokey(buf_size * sizeof(buf_dblwr_t::element))); } /** Create the doublewrite buffer if the doublewrite buffer header @@ -242,7 +227,7 @@ too_small: has not been written to in doublewrite. */ ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1); - page_no = new_block->page.id.page_no(); + page_no = new_block->page.id().page_no(); /* We only do this in the debug build, to ensure that the check in buf_flush_init_for_writing() will see a valid page type. 
The flushes of new_block are actually @@ -554,16 +539,20 @@ buf_dblwr_process() request.dblwr_recover(); /* Read in the actual page from the file */ - dberr_t err = fil_io( + fil_io_t fio = fil_io( request, true, page_id, zip_size, 0, physical_size, read_buf, NULL); - if (err != DB_SUCCESS) { + if (fio.err != DB_SUCCESS) { ib::warn() << "Double write buffer recovery: " << page_id << " read failed with " - << "error: " << ut_strerr(err); + << "error: " << ut_strerr(fio.err); + } + + if (fio.node) { + fio.node->space->release_for_io(); } const bool is_all_zero = buf_is_zeroes( @@ -649,15 +638,17 @@ bad: /* Write the good page from the doublewrite buffer to the intended position. */ - - IORequest write_request(IORequest::WRITE); - - fil_io(write_request, true, page_id, zip_size, - 0, physical_size, - const_cast<byte*>(page), NULL); - - ib::info() << "Recovered page " << page_id - << " from the doublewrite buffer."; + fio = fil_io(IORequestWrite, true, page_id, zip_size, + 0, physical_size, const_cast<byte*>(page), + nullptr); + + if (fio.node) { + ut_ad(fio.err == DB_SUCCESS); + ib::info() << "Recovered page " << page_id + << " to '" << fio.node->name + << "' from the doublewrite buffer."; + fio.node->space->release_for_io(); + } } recv_dblwr.pages.clear(); @@ -680,75 +671,59 @@ buf_dblwr_free() os_event_destroy(buf_dblwr->s_event); aligned_free(buf_dblwr->write_buf); ut_free(buf_dblwr->buf_block_arr); - ut_free(buf_dblwr->in_use); mutex_free(&buf_dblwr->mutex); ut_free(buf_dblwr); buf_dblwr = NULL; } -/********************************************************************//** -Updates the doublewrite buffer when an IO request is completed. */ -void -buf_dblwr_update( -/*=============*/ - const buf_page_t* bpage, /*!< in: buffer block descriptor */ - buf_flush_t flush_type)/*!< in: flush type */ +/** Update the doublewrite buffer on write completion. 
*/ +void buf_dblwr_update(const buf_page_t &bpage, bool single_page) { - ut_ad(srv_use_doublewrite_buf); - ut_ad(buf_dblwr); - ut_ad(!fsp_is_system_temporary(bpage->id.space())); - ut_ad(!srv_read_only_mode); - - switch (flush_type) { - case BUF_FLUSH_LIST: - case BUF_FLUSH_LRU: - mutex_enter(&buf_dblwr->mutex); - - ut_ad(buf_dblwr->batch_running); - ut_ad(buf_dblwr->b_reserved > 0); - ut_ad(buf_dblwr->b_reserved <= buf_dblwr->first_free); - - buf_dblwr->b_reserved--; - - if (buf_dblwr->b_reserved == 0) { - mutex_exit(&buf_dblwr->mutex); - /* This will finish the batch. Sync data files - to the disk. */ - fil_flush_file_spaces(); - mutex_enter(&buf_dblwr->mutex); - - /* We can now reuse the doublewrite memory buffer: */ - buf_dblwr->first_free = 0; - buf_dblwr->batch_running = false; - os_event_set(buf_dblwr->b_event); - } - - mutex_exit(&buf_dblwr->mutex); - break; - case BUF_FLUSH_SINGLE_PAGE: - { - const ulint size = TRX_SYS_DOUBLEWRITE_BLOCKS * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE; - ulint i; - mutex_enter(&buf_dblwr->mutex); - for (i = srv_doublewrite_batch_size; i < size; ++i) { - if (buf_dblwr->buf_block_arr[i] == bpage) { - buf_dblwr->s_reserved--; - buf_dblwr->buf_block_arr[i] = NULL; - buf_dblwr->in_use[i] = false; - break; - } - } + ut_ad(srv_use_doublewrite_buf); + ut_ad(buf_dblwr); + ut_ad(!fsp_is_system_temporary(bpage.id().space())); + ut_ad(!srv_read_only_mode); + + if (!single_page) + { + mutex_enter(&buf_dblwr->mutex); + + ut_ad(buf_dblwr->batch_running); + ut_ad(buf_dblwr->b_reserved > 0); + ut_ad(buf_dblwr->b_reserved <= buf_dblwr->first_free); + + if (!--buf_dblwr->b_reserved) + { + mutex_exit(&buf_dblwr->mutex); + /* This will finish the batch. Sync data files to the disk. 
*/ + fil_flush_file_spaces(); + mutex_enter(&buf_dblwr->mutex); + + /* We can now reuse the doublewrite memory buffer: */ + buf_dblwr->first_free= 0; + buf_dblwr->batch_running= false; + os_event_set(buf_dblwr->b_event); + } + + mutex_exit(&buf_dblwr->mutex); + return; + } + + ulint size= TRX_SYS_DOUBLEWRITE_BLOCKS * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE; + mutex_enter(&buf_dblwr->mutex); + for (ulint i= srv_doublewrite_batch_size; i < size; ++i) + { + if (buf_dblwr->buf_block_arr[i].bpage != &bpage) + continue; + buf_dblwr->s_reserved--; + buf_dblwr->buf_block_arr[i].bpage= nullptr; + os_event_set(buf_dblwr->s_event); + mutex_exit(&buf_dblwr->mutex); + return; + } - /* The block we are looking for must exist as a - reserved block. */ - ut_a(i < size); - } - os_event_set(buf_dblwr->s_event); - mutex_exit(&buf_dblwr->mutex); - break; - case BUF_FLUSH_N_TYPES: - ut_error; - } + /* The block must exist as a reserved block. */ + ut_error; } #ifdef UNIV_DEBUG @@ -757,25 +732,26 @@ buf_dblwr_update( @param[in] s tablespace */ static void buf_dblwr_check_page_lsn(const page_t* page, const fil_space_t& s) { - /* Ignore page compressed or encrypted pages */ + /* Ignore page_compressed or encrypted pages */ if (s.is_compressed() || buf_page_get_key_version(page, s.flags)) return; const byte* lsn_start= FIL_PAGE_LSN + 4 + page; - const byte* lsn_end= page + - srv_page_size - (s.full_crc32() - ? FIL_PAGE_FCRC32_END_LSN - : FIL_PAGE_END_LSN_OLD_CHKSUM - 4); + const byte* lsn_end= page + srv_page_size - + (s.full_crc32() + ? 
FIL_PAGE_FCRC32_END_LSN + : FIL_PAGE_END_LSN_OLD_CHKSUM - 4); static_assert(FIL_PAGE_FCRC32_END_LSN % 4 == 0, "alignment"); static_assert(FIL_PAGE_LSN % 4 == 0, "alignment"); ut_ad(!memcmp_aligned<4>(lsn_start, lsn_end, 4)); } -static void buf_dblwr_check_page_lsn(const buf_page_t& b, const byte* page) +static void buf_dblwr_check_page_lsn(const buf_page_t &b, const byte *page) { - if (fil_space_t* space = fil_space_acquire_for_io(b.id.space())) { - buf_dblwr_check_page_lsn(page, *space); - space->release_for_io(); - } + if (fil_space_t *space= fil_space_acquire_for_io(b.id().space())) + { + buf_dblwr_check_page_lsn(page, *space); + space->release_for_io(); + } } #endif /* UNIV_DEBUG */ @@ -791,7 +767,7 @@ buf_dblwr_assert_on_corrupt_block( buf_page_print(block->frame); ib::fatal() << "Apparent corruption of an index page " - << block->page.id + << block->page.id() << " to be written to data file. We intentionally crash" " the server to prevent corrupt data from ending up in" " data files."; @@ -806,7 +782,7 @@ buf_dblwr_check_block( /*==================*/ const buf_block_t* block) /*!< in: block to check */ { - ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE); if (block->skip_flush_check) { return; @@ -855,44 +831,39 @@ buf_dblwr_check_block( /********************************************************************//** Writes a page that has already been written to the doublewrite buffer to the datafile. It is the job of the caller to sync the datafile. 
*/ -static -void -buf_dblwr_write_block_to_datafile( -/*==============================*/ - const buf_page_t* bpage, /*!< in: page to write */ - bool sync) /*!< in: true if sync IO - is requested */ +static void +buf_dblwr_write_block_to_datafile(const buf_dblwr_t::element &e, bool sync) { - ut_a(buf_page_in_file(bpage)); - - ulint type = IORequest::WRITE; - IORequest request(type, const_cast<buf_page_t*>(bpage)); + ut_ad(!sync || e.flush == IORequest::SINGLE_PAGE); + buf_page_t* bpage = e.bpage; + ut_a(bpage->in_file()); + IORequest request(IORequest::WRITE, bpage, e.flush); /* We request frame here to get correct buffer in case of encryption and/or page compression */ void * frame = buf_page_get_frame(bpage); - if (bpage->zip.data != NULL) { + fil_io_t fio; + + if (bpage->zip.data) { ut_ad(bpage->zip_size()); - fil_io(request, sync, bpage->id, bpage->zip_size(), 0, - bpage->zip_size(), - (void*) frame, - (void*) bpage); + fio = fil_io(request, sync, bpage->id(), bpage->zip_size(), 0, + bpage->zip_size(), frame, bpage); } else { + ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE); ut_ad(!bpage->zip_size()); - /* Our IO API is common for both reads and writes and is - therefore geared towards a non-const parameter. */ - - buf_block_t* block = reinterpret_cast<buf_block_t*>( - const_cast<buf_page_t*>(bpage)); + ut_d(buf_dblwr_check_page_lsn(*bpage, static_cast<const byte*> + (frame))); + fio = fil_io(request, + sync, bpage->id(), bpage->zip_size(), 0, + e.size, frame, bpage); + } - ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); - ut_d(buf_dblwr_check_page_lsn(block->page, block->frame)); - fil_io(request, - sync, bpage->id, bpage->zip_size(), 0, bpage->real_size, - frame, block); + if (sync && fio.node) { + ut_ad(fio.err == DB_SUCCESS); + fio.node->space->release_for_io(); } } @@ -910,7 +881,7 @@ buf_dblwr_flush_buffered_writes() if (!srv_use_doublewrite_buf || buf_dblwr == NULL) { /* Sync the writes to the disk. 
*/ - buf_dblwr_sync_datafiles(); + os_aio_wait_until_no_pending_writes(); /* Now we flush the data to disk (for example, with fsync) */ fil_flush_file_spaces(); return; @@ -960,12 +931,9 @@ try_again: i < buf_dblwr->first_free; len2 += srv_page_size, i++) { - const buf_block_t* block; + buf_page_t* bpage= buf_dblwr->buf_block_arr[i].bpage; - block = (buf_block_t*) buf_dblwr->buf_block_arr[i]; - - if (buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE - || block->page.zip.data) { + if (bpage->state() != BUF_BLOCK_FILE_PAGE || bpage->zip.data) { /* No simple validate for compressed pages exists. */ continue; @@ -973,17 +941,18 @@ try_again: /* Check that the actual page in the buffer pool is not corrupt and the LSN values are sane. */ - buf_dblwr_check_block(block); - ut_d(buf_dblwr_check_page_lsn(block->page, write_buf + len2)); + buf_dblwr_check_block(reinterpret_cast<buf_block_t*>(bpage)); + ut_d(buf_dblwr_check_page_lsn(*bpage, write_buf + len2)); } /* Write out the first block of the doublewrite buffer */ len = std::min<ulint>(TRX_SYS_DOUBLEWRITE_BLOCK_SIZE, buf_dblwr->first_free) << srv_page_size_shift; - fil_io(IORequestWrite, true, - page_id_t(TRX_SYS_SPACE, buf_dblwr->block1), 0, - 0, len, (void*) write_buf, NULL); + fil_io_t fio = fil_io(IORequestWrite, true, + page_id_t(TRX_SYS_SPACE, buf_dblwr->block1), 0, + 0, len, write_buf, nullptr); + fio.node->space->release_for_io(); if (buf_dblwr->first_free <= TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { /* No unwritten pages in the second block. 
*/ @@ -997,9 +966,10 @@ try_again: write_buf = buf_dblwr->write_buf + (TRX_SYS_DOUBLEWRITE_BLOCK_SIZE << srv_page_size_shift); - fil_io(IORequestWrite, true, - page_id_t(TRX_SYS_SPACE, buf_dblwr->block2), 0, - 0, len, (void*) write_buf, NULL); + fio = fil_io(IORequestWrite, true, + page_id_t(TRX_SYS_SPACE, buf_dblwr->block2), 0, + 0, len, write_buf, nullptr); + fio.node->space->release_for_io(); flush: /* increment the doublewrite flushed pages counter */ @@ -1031,218 +1001,146 @@ flush: } } -/********************************************************************//** -Posts a buffer page for writing. If the doublewrite memory buffer is -full, calls buf_dblwr_flush_buffered_writes and waits for for free -space to appear. */ -void -buf_dblwr_add_to_batch( -/*====================*/ - buf_page_t* bpage) /*!< in: buffer block to write */ +/** Schedule a page write. If the doublewrite memory buffer is full, +buf_dblwr_flush_buffered_writes() will be invoked to make space. +@param bpage buffer pool page to be written +@param flush type of flush +@param size payload size in bytes */ +void buf_dblwr_t::add_to_batch(buf_page_t *bpage, IORequest::flush_t flush, + size_t size) { - ut_a(buf_page_in_file(bpage)); + ut_ad(bpage->in_file()); + ut_ad(flush == IORequest::LRU || flush == IORequest::FLUSH_LIST); try_again: - mutex_enter(&buf_dblwr->mutex); - - ut_a(buf_dblwr->first_free <= srv_doublewrite_batch_size); - - if (buf_dblwr->batch_running) { - - /* This not nearly as bad as it looks. There is only - page_cleaner thread which does background flushing - in batches therefore it is unlikely to be a contention - point. The only exception is when a user thread is - forced to do a flush batch because of a sync - checkpoint. 
*/ - int64_t sig_count = os_event_reset(buf_dblwr->b_event); - mutex_exit(&buf_dblwr->mutex); - - os_event_wait_low(buf_dblwr->b_event, sig_count); - goto try_again; - } - - if (buf_dblwr->first_free == srv_doublewrite_batch_size) { - mutex_exit(&(buf_dblwr->mutex)); - - buf_dblwr_flush_buffered_writes(); - - goto try_again; - } - - byte* p = buf_dblwr->write_buf - + srv_page_size * buf_dblwr->first_free; - - /* We request frame here to get correct buffer in case of - encryption and/or page compression */ - void * frame = buf_page_get_frame(bpage); - - if (auto zip_size = bpage->zip_size()) { - UNIV_MEM_ASSERT_RW(bpage->zip.data, zip_size); - /* Copy the compressed page and clear the rest. */ - memcpy(p, frame, zip_size); - memset(p + zip_size, 0x0, srv_page_size - zip_size); - } else { - ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); - - UNIV_MEM_ASSERT_RW(frame, srv_page_size); - memcpy_aligned<OS_FILE_LOG_BLOCK_SIZE>(p, frame, - srv_page_size); - } - - buf_dblwr->buf_block_arr[buf_dblwr->first_free] = bpage; - - buf_dblwr->first_free++; - buf_dblwr->b_reserved++; - - ut_ad(!buf_dblwr->batch_running); - ut_ad(buf_dblwr->first_free == buf_dblwr->b_reserved); - ut_ad(buf_dblwr->b_reserved <= srv_doublewrite_batch_size); - - if (buf_dblwr->first_free == srv_doublewrite_batch_size) { - mutex_exit(&(buf_dblwr->mutex)); - - buf_dblwr_flush_buffered_writes(); - - return; - } - - mutex_exit(&(buf_dblwr->mutex)); + mutex_enter(&mutex); + + ut_a(first_free <= srv_doublewrite_batch_size); + + if (batch_running) + { + /* This not nearly as bad as it looks. There is only page_cleaner + thread which does background flushing in batches therefore it is + unlikely to be a contention point. The only exception is when a + user thread is forced to do a flush batch because of a sync + checkpoint. 
*/ + int64_t sig_count= os_event_reset(b_event); + mutex_exit(&mutex); + + os_event_wait_low(b_event, sig_count); + goto try_again; + } + + if (first_free == srv_doublewrite_batch_size) + { + mutex_exit(&mutex); + buf_dblwr_flush_buffered_writes(); + goto try_again; + } + + byte *p= write_buf + srv_page_size * first_free; + + /* We request frame here to get correct buffer in case of + encryption and/or page compression */ + void * frame = buf_page_get_frame(bpage); + + memcpy_aligned<OS_FILE_LOG_BLOCK_SIZE>(p, frame, size); + ut_ad(!bpage->zip_size() || bpage->zip_size() == size); + buf_block_arr[first_free++] = { bpage, flush, size }; + b_reserved++; + + ut_ad(!batch_running); + ut_ad(first_free == b_reserved); + ut_ad(b_reserved <= srv_doublewrite_batch_size); + + const bool need_flush= first_free == srv_doublewrite_batch_size; + mutex_exit(&mutex); + + if (need_flush) + buf_dblwr_flush_buffered_writes(); } -/********************************************************************//** -Writes a page to the doublewrite buffer on disk, sync it, then write +/** Write a page to the doublewrite buffer on disk, sync it, then write the page to the datafile and sync the datafile. This function is used for single page flushes. If all the buffers allocated for single page flushes in the doublewrite buffer are in use we wait here for one to become free. We are guaranteed that a slot will become free because any thread that is using a slot must also release the slot before leaving -this function. */ -void -buf_dblwr_write_single_page( -/*========================*/ - buf_page_t* bpage, /*!< in: buffer block to write */ - bool sync) /*!< in: true if sync IO requested */ +this function. 
+@param bpage buffer pool page to be written +@param sync whether synchronous operation is requested +@param size payload size in bytes */ +void buf_dblwr_t::write_single_page(buf_page_t *bpage, bool sync, size_t size) { - ulint n_slots; - ulint size; - ulint offset; - ulint i; - - ut_a(buf_page_in_file(bpage)); - ut_a(srv_use_doublewrite_buf); - ut_a(buf_dblwr != NULL); - - /* total number of slots available for single page flushes - starts from srv_doublewrite_batch_size to the end of the - buffer. */ - size = TRX_SYS_DOUBLEWRITE_BLOCKS * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE; - ut_a(size > srv_doublewrite_batch_size); - n_slots = size - srv_doublewrite_batch_size; - - if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) { - - /* Check that the actual page in the buffer pool is - not corrupt and the LSN values are sane. */ - buf_dblwr_check_block((buf_block_t*) bpage); - - /* Check that the page as written to the doublewrite - buffer has sane LSN values. */ - if (!bpage->zip.data) { - ut_d(buf_dblwr_check_page_lsn( - *bpage, ((buf_block_t*) bpage)->frame)); - } - } + ut_ad(bpage->in_file()); + ut_ad(srv_use_doublewrite_buf); + ut_ad(this == buf_dblwr); + + /* total number of slots available for single page flushes + starts from srv_doublewrite_batch_size to the end of the buffer. */ + ulint slots = TRX_SYS_DOUBLEWRITE_BLOCKS * TRX_SYS_DOUBLEWRITE_BLOCK_SIZE; + ut_a(slots > srv_doublewrite_batch_size); + ulint n_slots= slots - srv_doublewrite_batch_size; + + if (bpage->state() == BUF_BLOCK_FILE_PAGE) + { + /* Check that the actual page in the buffer pool is not corrupt + and the LSN values are sane. */ + buf_dblwr_check_block(reinterpret_cast<buf_block_t*>(bpage)); +#ifdef UNIV_DEBUG + /* Check that the page as written to the doublewrite buffer has + sane LSN values. 
*/ + if (!bpage->zip.data) + buf_dblwr_check_page_lsn(*bpage, reinterpret_cast<buf_block_t*> + (bpage)->frame); +#endif + } retry: - mutex_enter(&buf_dblwr->mutex); - if (buf_dblwr->s_reserved == n_slots) { - - /* All slots are reserved. */ - int64_t sig_count = os_event_reset(buf_dblwr->s_event); - mutex_exit(&buf_dblwr->mutex); - os_event_wait_low(buf_dblwr->s_event, sig_count); - - goto retry; - } - - for (i = srv_doublewrite_batch_size; i < size; ++i) { - - if (!buf_dblwr->in_use[i]) { - break; - } - } - - /* We are guaranteed to find a slot. */ - ut_a(i < size); - buf_dblwr->in_use[i] = true; - buf_dblwr->s_reserved++; - buf_dblwr->buf_block_arr[i] = bpage; - - /* increment the doublewrite flushed pages counter */ - srv_stats.dblwr_pages_written.inc(); - srv_stats.dblwr_writes.inc(); - - mutex_exit(&buf_dblwr->mutex); - - /* Lets see if we are going to write in the first or second - block of the doublewrite buffer. */ - if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { - offset = buf_dblwr->block1 + i; - } else { - offset = buf_dblwr->block2 + i - - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE; - } - - /* We deal with compressed and uncompressed pages a little - differently here. In case of uncompressed pages we can - directly write the block to the allocated slot in the - doublewrite buffer in the system tablespace and then after - syncing the system table space we can proceed to write the page - in the datafile. - In case of compressed page we first do a memcpy of the block - to the in-memory buffer of doublewrite before proceeding to - write it. This is so because we want to pad the remaining - bytes in the doublewrite page with zeros. 
*/ - - /* We request frame here to get correct buffer in case of - encryption and/or page compression */ - void * frame = buf_page_get_frame(bpage); - - if (auto zip_size = bpage->zip_size()) { - memcpy(buf_dblwr->write_buf + srv_page_size * i, - frame, zip_size); - - memset(buf_dblwr->write_buf + srv_page_size * i - + zip_size, 0x0, - srv_page_size - zip_size); - - fil_io(IORequestWrite, - true, - page_id_t(TRX_SYS_SPACE, offset), - 0, - 0, - srv_page_size, - (void *)(buf_dblwr->write_buf + srv_page_size * i), - NULL); - } else { - /* It is a regular page. Write it directly to the - doublewrite buffer */ - fil_io(IORequestWrite, - true, - page_id_t(TRX_SYS_SPACE, offset), - 0, - 0, - srv_page_size, - (void*) frame, - NULL); - } - - /* Now flush the doublewrite buffer data to disk */ - fil_flush(TRX_SYS_SPACE); - - /* We know that the write has been flushed to disk now - and during recovery we will find it in the doublewrite buffer - blocks. Next do the write to the intended position. */ - buf_dblwr_write_block_to_datafile(bpage, sync); + mutex_enter(&mutex); + if (s_reserved == n_slots) + { + /* All slots are reserved. */ + int64_t sig_count = os_event_reset(s_event); + mutex_exit(&mutex); + os_event_wait_low(s_event, sig_count); + goto retry; + } + + ulint i; + for (i = srv_doublewrite_batch_size; i < slots; ++i) + if (!buf_block_arr[i].bpage) + goto found; + /* We are guaranteed to find a slot. */ + ut_error; +found: + s_reserved++; + buf_block_arr[i]= { bpage, IORequest::SINGLE_PAGE, size }; + + /* increment the doublewrite flushed pages counter */ + srv_stats.dblwr_pages_written.inc(); + srv_stats.dblwr_writes.inc(); + + mutex_exit(&mutex); + + const ulint offset= i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE + ? 
block1 + i + : block2 + i - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE; + + /* We request frame here to get correct buffer in case of + encryption and/or page compression */ + void * frame = buf_page_get_frame(bpage); + ut_ad(!bpage->zip_size() || bpage->zip_size() == size); + fil_io_t fio= fil_io(IORequestWrite, true, page_id_t(TRX_SYS_SPACE, offset), + 0, 0, size, frame, nullptr); + fio.node->space->release_for_io(); + + /* Now flush the doublewrite buffer data to disk */ + fil_flush(TRX_SYS_SPACE); + + /* We know that the write has been flushed to disk now + and during recovery we will find it in the doublewrite buffer + blocks. Next do the write to the intended position. */ + buf_dblwr_write_block_to_datafile({bpage, IORequest::SINGLE_PAGE, size}, + sync); } diff --git a/storage/innobase/buf/buf0dump.cc b/storage/innobase/buf/buf0dump.cc index 43c5d3c6a66..d468000f894 100644 --- a/storage/innobase/buf/buf0dump.cc +++ b/storage/innobase/buf/buf0dump.cc @@ -324,8 +324,8 @@ buf_dump( bpage != NULL && j < n_pages; bpage = UT_LIST_GET_NEXT(LRU, bpage)) { - ut_a(buf_page_in_file(bpage)); - const page_id_t id(bpage->id); + ut_a(bpage->in_file()); + const page_id_t id(bpage->id()); if (id.space() == SRV_TMP_SPACE_ID) { /* Ignore the innodb_temporary tablespace. */ diff --git a/storage/innobase/buf/buf0flu.cc b/storage/innobase/buf/buf0flu.cc index 6f2e685a441..04340ed79f8 100644 --- a/storage/innobase/buf/buf0flu.cc +++ b/storage/innobase/buf/buf0flu.cc @@ -209,7 +209,7 @@ static inline void incr_flush_list_size_in_bytes(const buf_block_t* block) ut_ad(buf_pool.stat.flush_list_bytes <= buf_pool.curr_pool_size); } -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +#ifdef UNIV_DEBUG /** Validate the flush list. 
*/ static void buf_flush_validate_low(); @@ -234,7 +234,7 @@ static void buf_flush_validate_skip() buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP; buf_flush_validate_low(); } -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#endif /* UNIV_DEBUG */ /******************************************************************//** Insert a block in the flush_rbt and returns a pointer to its @@ -306,29 +306,33 @@ buf_flush_block_cmp( const void* p1, /*!< in: block1 */ const void* p2) /*!< in: block2 */ { - int ret; - const buf_page_t* b1 = *(const buf_page_t**) p1; - const buf_page_t* b2 = *(const buf_page_t**) p2; + const buf_page_t* b1 = *static_cast<const buf_page_t*const*>(p1); + const buf_page_t* b2 = *static_cast<const buf_page_t*const*>(p2); ut_ad(b1 != NULL); ut_ad(b2 != NULL); ut_ad(mutex_own(&buf_pool.flush_list_mutex)); - ut_ad(b1->in_flush_list); - ut_ad(b2->in_flush_list); + const lsn_t m1 = b1->oldest_modification(), + m2 = b2->oldest_modification(); - if (b2->oldest_modification > b1->oldest_modification) { + ut_ad(m1); + ut_ad(m2); + + if (m2 > m1) { return(1); - } else if (b2->oldest_modification < b1->oldest_modification) { + } else if (m2 < m1) { return(-1); } - /* If oldest_modification is same then decide on the space. */ - ret = (int)(b2->id.space() - b1->id.space()); - - /* Or else decide ordering on the page number. */ - return(ret ? 
ret : (int) (b2->id.page_no() - b1->id.page_no())); + if (b2->id() > b1->id()) { + return 1; + } + if (b2->id() < b1->id()) { + return -1; + } + return 0; } /********************************************************************//** @@ -354,9 +358,7 @@ buf_flush_free_flush_rbt(void) /*==========================*/ { mutex_enter(&buf_pool.flush_list_mutex); -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG - buf_flush_validate_low(); -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ + ut_d(buf_flush_validate_low()); rbt_free(buf_pool.flush_rbt); buf_pool.flush_rbt = NULL; mutex_exit(&buf_pool.flush_list_mutex); @@ -369,14 +371,10 @@ void buf_flush_insert_into_flush_list(buf_block_t* block, lsn_t lsn) { ut_ad(!mutex_own(&buf_pool.mutex)); ut_ad(log_flush_order_mutex_own()); - ut_ad(buf_page_mutex_own(block)); ut_ad(lsn); mutex_enter(&buf_pool.flush_list_mutex); - ut_ad(!block->page.in_flush_list); - ut_d(block->page.in_flush_list = TRUE); - ut_ad(!block->page.oldest_modification); - block->page.oldest_modification = lsn; + block->page.set_oldest_modification(lsn); UNIV_MEM_ASSERT_RW(block->page.zip.data ? block->page.zip.data : block->frame, block->physical_size()); @@ -391,11 +389,10 @@ void buf_flush_insert_into_flush_list(buf_block_t* block, lsn_t lsn) page frame of a compressed block may be discarded or created (copying the block->page to or from a buf_page_t that is dynamically allocated from buf_buddy_alloc()). Because those - transitions hold block->mutex and the flush list mutex (via + transitions hold buf_pool.flush_list_mutex (via buf_flush_relocate_on_flush_list()), there is no possibility of a race condition in the assertions below. */ ut_ad(block->page.in_LRU_list); - ut_ad(block->page.in_page_hash); /* buf_buddy_block_register() will take a block in the BUF_BLOCK_MEMORY state, not a file page. 
*/ ut_ad(!block->page.in_zip_hash); @@ -409,76 +406,10 @@ void buf_flush_insert_into_flush_list(buf_block_t* block, lsn_t lsn) UT_LIST_ADD_FIRST(buf_pool.flush_list, &block->page); func_exit: -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG - buf_flush_validate_skip(); -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ - + ut_d(buf_flush_validate_skip()); mutex_exit(&buf_pool.flush_list_mutex); } -/********************************************************************//** -Returns TRUE if the file page block is immediately suitable for replacement, -i.e., the transition FILE_PAGE => NOT_USED allowed. -@return TRUE if can replace immediately */ -ibool -buf_flush_ready_for_replace( -/*========================*/ - buf_page_t* bpage) /*!< in: buffer control block, must be - buf_page_in_file(bpage) and in the LRU list */ -{ - ut_ad(mutex_own(&buf_pool.mutex)); - ut_ad(mutex_own(buf_page_get_mutex(bpage))); - ut_ad(bpage->in_LRU_list); - - if (buf_page_in_file(bpage)) { - - return(bpage->oldest_modification == 0 - && bpage->buf_fix_count == 0 - && buf_page_get_io_fix(bpage) == BUF_IO_NONE); - } - - ib::fatal() << "Buffer block " << bpage << " state " << bpage->state - << " in the LRU list!"; - - return(FALSE); -} - -/********************************************************************//** -Returns true if the block is modified and ready for flushing. 
-@return true if can flush immediately */ -bool -buf_flush_ready_for_flush( -/*======================*/ - buf_page_t* bpage, /*!< in: buffer control block, must be - buf_page_in_file(bpage) */ - buf_flush_t flush_type)/*!< in: type of flush */ -{ - ut_ad(mutex_own(&buf_pool.mutex)); - ut_a(buf_page_in_file(bpage)); - ut_ad(mutex_own(buf_page_get_mutex(bpage))); - ut_ad(flush_type < BUF_FLUSH_N_TYPES); - - if (bpage->oldest_modification == 0 - || buf_page_get_io_fix(bpage) != BUF_IO_NONE) { - return(false); - } - - ut_ad(bpage->in_flush_list); - - switch (flush_type) { - case BUF_FLUSH_LIST: - case BUF_FLUSH_LRU: - case BUF_FLUSH_SINGLE_PAGE: - return(true); - - case BUF_FLUSH_N_TYPES: - break; - } - - ut_error; - return(false); -} - /** Remove a block from the flush list of modified blocks. @param[in] bpage block to be removed from the flush list */ void buf_flush_remove(buf_page_t* bpage) @@ -493,36 +424,12 @@ void buf_flush_remove(buf_page_t* bpage) } #endif ut_ad(mutex_own(&buf_pool.mutex)); - ut_ad(mutex_own(buf_page_get_mutex(bpage))); - ut_ad(bpage->in_flush_list); - mutex_enter(&buf_pool.flush_list_mutex); /* Important that we adjust the hazard pointer before removing the bpage from flush list. 
*/ buf_pool.flush_hp.adjust(bpage); - - switch (buf_page_get_state(bpage)) { - case BUF_BLOCK_POOL_WATCH: - case BUF_BLOCK_ZIP_PAGE: - /* Clean compressed pages should not be on the flush list */ - case BUF_BLOCK_NOT_USED: - case BUF_BLOCK_READY_FOR_USE: - case BUF_BLOCK_MEMORY: - case BUF_BLOCK_REMOVE_HASH: - ut_error; - return; - case BUF_BLOCK_ZIP_DIRTY: - buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE); - UT_LIST_REMOVE(buf_pool.flush_list, bpage); -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG - buf_LRU_insert_zip_clean(bpage); -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ - break; - case BUF_BLOCK_FILE_PAGE: - UT_LIST_REMOVE(buf_pool.flush_list, bpage); - break; - } + UT_LIST_REMOVE(buf_pool.flush_list, bpage); /* If the flush_rbt is active then delete from there as well. */ if (UNIV_LIKELY_NULL(buf_pool.flush_rbt)) { @@ -530,16 +437,20 @@ void buf_flush_remove(buf_page_t* bpage) } /* Must be done after we have removed it from the flush_rbt - because we assert on in_flush_list in comparison function. */ - ut_d(bpage->in_flush_list = FALSE); + because we assert on it in buf_flush_block_cmp(). 
*/ + bpage->clear_oldest_modification(); - buf_pool.stat.flush_list_bytes -= bpage->physical_size(); +#ifdef UNIV_DEBUG + if (bpage->state() == BUF_BLOCK_ZIP_PAGE) { + buf_LRU_insert_zip_clean(bpage); + } +#endif /* UNIV_DEBUG */ - bpage->oldest_modification = 0; + buf_pool.stat.flush_list_bytes -= bpage->physical_size(); -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +#ifdef UNIV_DEBUG buf_flush_validate_skip(); -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#endif /* UNIV_DEBUG */ mutex_exit(&buf_pool.flush_list_mutex); } @@ -565,8 +476,6 @@ buf_flush_relocate_on_flush_list( buf_page_t* prev_b = NULL; ut_ad(mutex_own(&buf_pool.mutex)); - ut_ad(mutex_own(buf_page_get_mutex(bpage))); - mutex_enter(&buf_pool.flush_list_mutex); /* FIXME: At this point we have both buf_pool and flush_list @@ -576,8 +485,7 @@ buf_flush_relocate_on_flush_list( is guaranteed to be in the flush list. We need to check if this will work without the assumption of block removing code having the buf_pool mutex. */ - ut_ad(bpage->in_flush_list); - ut_ad(dpage->in_flush_list); + ut_ad(dpage->oldest_modification()); /* If recovery is active we must swap the control blocks in the flush_rbt as well. */ @@ -591,14 +499,14 @@ buf_flush_relocate_on_flush_list( buf_pool.flush_hp.adjust(bpage); /* Must be done after we have removed it from the flush_rbt - because we assert on in_flush_list in comparison function. */ - ut_d(bpage->in_flush_list = FALSE); + because we assert on it in buf_flush_block_cmp(). */ + bpage->clear_oldest_modification(); prev = UT_LIST_GET_PREV(list, bpage); UT_LIST_REMOVE(buf_pool.flush_list, bpage); if (prev) { - ut_ad(prev->in_flush_list); + ut_ad(prev->oldest_modification()); UT_LIST_INSERT_AFTER( buf_pool.flush_list, prev, dpage); } else { UT_LIST_ADD_FIRST(buf_pool.flush_list, dpage); @@ -606,41 +514,81 @@ buf_flush_relocate_on_flush_list( /* Just an extra check. Previous in flush_list should be the same control block as in flush_rbt. 
*/ - ut_a(buf_pool.flush_rbt == NULL || prev_b == prev); - -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG - buf_flush_validate_low(); -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ - + ut_a(!buf_pool.flush_rbt || prev_b == prev); + ut_d(buf_flush_validate_low()); mutex_exit(&buf_pool.flush_list_mutex); } -/** Update the flush system data structures when a write is completed. -@param[in,out] bpage flushed page -@param[in] dblwr whether the doublewrite buffer was used */ -void buf_flush_write_complete(buf_page_t* bpage, bool dblwr) +/** Update the buf_pool data structures on write completion. +@param[in,out] bpage written page +@param[in] flush_type write request type +@param[in] dblwr whether the doublewrite buffer was used */ +static void buf_flush_write_complete(buf_page_t *bpage, + IORequest::flush_t flush_type, bool dblwr) { - ut_ad(bpage); + ut_ad(mutex_own(&buf_pool.mutex)); + buf_flush_remove(bpage); - buf_flush_remove(bpage); + switch (--buf_pool.n_flush[flush_type]) { +#ifdef UNIV_DEBUG + case ULINT_UNDEFINED: + ut_error; + break; +#endif + case 0: + if (!buf_pool.init_flush[flush_type]) + os_event_set(buf_pool.no_flush[flush_type]); + } - const buf_flush_t flush_type = buf_page_get_flush_type(bpage); - buf_pool.n_flush[flush_type]--; - ut_ad(buf_pool.n_flush[flush_type] != ULINT_MAX); + if (dblwr) + buf_dblwr_update(*bpage, flush_type == IORequest::SINGLE_PAGE); +} - ut_ad(mutex_own(&buf_pool.mutex)); +/** Complete write of a file page from buf_pool. 
+@param bpage written page +@param request write request +@param dblwr whether the doublewrite buffer was used +@param evict whether or not to evict the page from LRU list */ +void buf_page_write_complete(buf_page_t *bpage, const IORequest &request, + bool dblwr, bool evict) +{ + ut_ad(request.is_write()); + ut_ad(bpage->in_file()); + ut_ad(bpage->io_fix() == BUF_IO_WRITE); + ut_ad(bpage->id().space() != TRX_SYS_SPACE || + !buf_dblwr_page_inside(bpage->id().page_no())); + + /* We do not need protect io_fix here by mutex to read it because + this and buf_page_write_complete() are the only functions where we can + change the value from BUF_IO_READ or BUF_IO_WRITE to some other + value, and our code ensures that this is the only thread that handles + the i/o for this block. */ + if (bpage->slot) + { + bpage->slot->release(); + bpage->slot= nullptr; + } - if (buf_pool.n_flush[flush_type] == 0 - && buf_pool.init_flush[flush_type] == FALSE) { + if (UNIV_UNLIKELY(MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE))) + buf_page_monitor(bpage, BUF_IO_WRITE); + DBUG_PRINT("ib_buf", ("write page %u:%u", + bpage->id().space(), bpage->id().page_no())); + mutex_enter(&buf_pool.mutex); + bpage->set_io_fix(BUF_IO_NONE); + buf_flush_write_complete(bpage, request.flush_type(), dblwr); - /* The running flush batch has ended */ + /* Because this thread which does the unlocking might not be the same that + did the locking, we use a pass value != 0 in unlock, which simply + removes the newest lock debug record, without checking the thread id. */ + if (bpage->state() == BUF_BLOCK_FILE_PAGE) + rw_lock_sx_unlock_gen(&((buf_block_t*) bpage)->lock, BUF_IO_WRITE); - os_event_set(buf_pool.no_flush[flush_type]); - } + buf_pool.stat.n_pages_written++; - if (dblwr) { - buf_dblwr_update(bpage, flush_type); - } + if (evict) + buf_LRU_free_page(bpage, true); + + mutex_exit(&buf_pool.mutex); } /** Calculate a ROW_FORMAT=COMPRESSED page checksum and update the page. 
@@ -754,9 +702,9 @@ buf_flush_init_for_writing( ulint page_type = fil_page_get_type(page); ulint reset_type = page_type; - switch (block->page.id.page_no() % 16384) { + switch (block->page.id().page_no() % 16384) { case 0: - reset_type = block->page.id.page_no() == 0 + reset_type = block->page.id().page_no() == 0 ? FIL_PAGE_TYPE_FSP_HDR : FIL_PAGE_TYPE_XDES; break; @@ -764,10 +712,8 @@ buf_flush_init_for_writing( reset_type = FIL_PAGE_IBUF_BITMAP; break; case FSP_TRX_SYS_PAGE_NO: - if (block->page.id.page_no() - == TRX_SYS_PAGE_NO - && block->page.id.space() - == TRX_SYS_SPACE) { + if (block->page.id() + == page_id_t(TRX_SYS_SPACE, TRX_SYS_PAGE_NO)) { reset_type = FIL_PAGE_TYPE_TRX_SYS; break; } @@ -802,7 +748,7 @@ buf_flush_init_for_writing( if (UNIV_UNLIKELY(page_type != reset_type)) { ib::info() << "Resetting invalid page " - << block->page.id << " type " + << block->page.id() << " type " << page_type << " to " << reset_type << " when flushing."; fil_page_set_type(page, reset_type); @@ -894,22 +840,21 @@ a page is written to disk. 
@param[in,out] space tablespace @param[in,out] bpage buffer page @param[in] s physical page frame that is being encrypted +@param[in,out] size payload size in bytes @return page frame to be written to file (may be src_frame or an encrypted/compressed copy of it) */ -static byte* buf_page_encrypt(fil_space_t* space, buf_page_t* bpage, byte* s) +static byte *buf_page_encrypt(fil_space_t* space, buf_page_t* bpage, byte* s, + size_t *size) { - if (bpage->status == buf_page_t::FREED) { - return s; - } - - ut_ad(space->id == bpage->id.space()); - bpage->real_size = srv_page_size; + ut_ad(bpage->status != buf_page_t::FREED); + ut_ad(space->id == bpage->id().space()); ut_d(fil_page_type_validate(space, s)); + const uint32_t page_no= bpage->id().page_no(); - switch (bpage->id.page_no()) { + switch (page_no) { case TRX_SYS_PAGE_NO: - if (bpage->id.space() != TRX_SYS_SPACE) + if (bpage->id().space() != TRX_SYS_SPACE) break; /* The TRX_SYS page is neither encrypted nor compressed, because it contains the address of the doublewrite buffer. */ @@ -970,8 +915,8 @@ static byte* buf_page_encrypt(fil_space_t* space, buf_page_t* bpage, byte* s) { not_compressed: byte *tmp= space->purpose == FIL_TYPE_TEMPORARY - ? buf_tmp_page_encrypt(bpage->id.page_no(), s, d) - : fil_space_encrypt(space, bpage->id.page_no(), s, d); + ? 
buf_tmp_page_encrypt(page_no, s, d) + : fil_space_encrypt(space, page_no, s, d); slot->out_buf= d= tmp; @@ -984,14 +929,13 @@ not_compressed: buf_tmp_reserve_compression_buf(slot); byte *tmp= slot->comp_buf; ulint len= fil_page_compress(s, tmp, space->flags, - fil_space_get_block_size(space, - bpage->id.page_no()), + fil_space_get_block_size(space, page_no), encrypted); if (!len) goto not_compressed; - bpage->real_size= len; + *size= len; if (full_crc32) { @@ -1011,7 +955,7 @@ not_compressed: ut_d(fil_page_type_validate(space, tmp)); if (encrypted) - tmp = fil_space_encrypt(space, bpage->id.page_no(), tmp, d); + tmp = fil_space_encrypt(space, page_no, tmp, d); if (full_crc32) { @@ -1034,369 +978,353 @@ not_compressed: This function also resets the IO_FIX to IO_NONE and making the page status as NORMAL. It initiates the write to the file only after releasing the page from flush list and its associated mutex. -@param[in,out] bpage freed buffer page -@param[in] space tablespace object of the freed page */ -static void buf_flush_freed_page(buf_page_t *bpage, fil_space_t *space) +@param[in,out] bpage freed buffer page +@param[in] space tablespace object of the freed page */ +static void buf_flush_freed_page(buf_page_t *bpage, const fil_space_t &space) { - ut_ad(buf_page_in_file(bpage)); - const bool uncompressed= buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE; - BPageMutex *block_mutex= uncompressed - ? 
&reinterpret_cast<buf_block_t*>(bpage)->mutex - : &buf_pool.zip_mutex; - + ut_ad(bpage->in_file()); + const bool uncompressed= bpage->state() == BUF_BLOCK_FILE_PAGE; + const page_id_t page_id(bpage->id()); + const auto zip_size= bpage->zip_size(); mutex_enter(&buf_pool.mutex); - mutex_enter(block_mutex); - - buf_page_set_io_fix(bpage, BUF_IO_NONE); + bpage->set_io_fix(BUF_IO_NONE); bpage->status= buf_page_t::NORMAL; - buf_flush_write_complete(bpage, false); + buf_flush_remove(bpage); + buf_pool.stat.n_pages_written++; + mutex_exit(&buf_pool.mutex); if (uncompressed) rw_lock_sx_unlock_gen(&reinterpret_cast<buf_block_t*>(bpage)->lock, - BUF_IO_WRITE); - - buf_pool.stat.n_pages_written++; - mutex_exit(&buf_pool.mutex); - const page_id_t page_id(bpage->id); - const auto zip_size= bpage->zip_size(); - mutex_exit(block_mutex); + BUF_IO_WRITE); const bool punch_hole= #if defined(HAVE_FALLOC_PUNCH_HOLE_AND_KEEP_SIZE) || defined(_WIN32) - space->is_compressed() || + space.is_compressed() || #endif false; - ut_ad(space->id == page_id.space()); - ut_ad(space->zip_size() == zip_size); + ut_ad(space.id == page_id.space()); + ut_ad(space.zip_size() == zip_size); if (punch_hole || srv_immediate_scrub_data_uncompressed) - fil_io(IORequestWrite, punch_hole, page_id, zip_size, 0, - zip_size ? zip_size : srv_page_size, - const_cast<byte*>(field_ref_zero), nullptr, false, punch_hole); - - space->release_for_io(); + { + fil_io_t fio= fil_io(IORequestWrite, punch_hole, page_id, zip_size, 0, + zip_size ? zip_size : srv_page_size, + const_cast<byte*>(field_ref_zero), nullptr, false, + punch_hole); + if (punch_hole && fio.node) + fio.node->space->release_for_io(); + } } -/********************************************************************//** -Does an asynchronous write of a buffer page. NOTE: when the -doublewrite buffer is used, we must call -buf_dblwr_flush_buffered_writes after we have posted a batch of -writes! 
*/ -static -void -buf_flush_write_block_low( -/*======================*/ - buf_page_t* bpage, /*!< in: buffer block to write */ - buf_flush_t flush_type, /*!< in: type of flush */ - bool sync) /*!< in: true if sync IO request */ +/** Write a flushable page from buf_pool to a file. +buf_pool.mutex must be held. +@param bpage buffer control block +@param flush_type type of flush +@param space tablespace (or nullptr if not known) +@param sync whether this is a synchronous request + (only for flush_type=SINGLE_PAGE) +@return whether the page was flushed and buf_pool.mutex was released */ +bool buf_flush_page(buf_page_t *bpage, IORequest::flush_t flush_type, + fil_space_t *space, bool sync) { - fil_space_t* space = fil_space_acquire_for_io(bpage->id.space()); - if (!space) { - return; - } - ut_ad(space->purpose == FIL_TYPE_TEMPORARY - || space->purpose == FIL_TYPE_IMPORT - || space->purpose == FIL_TYPE_TABLESPACE); - ut_ad((space->purpose == FIL_TYPE_TEMPORARY) - == (space == fil_system.temp_space)); - - page_t* frame = NULL; - const bool full_crc32 = space->full_crc32(); - - DBUG_PRINT("ib_buf", ("flush %s %u page %u:%u", - sync ? "sync" : "async", (unsigned) flush_type, - bpage->id.space(), bpage->id.page_no())); - - ut_ad(buf_page_in_file(bpage)); - - /* We are not holding buf_pool.mutex or block_mutex here. - Nevertheless, it is safe to access bpage, because it is - io_fixed and oldest_modification != 0. Thus, it cannot be - relocated in the buffer pool or removed from flush_list or - LRU_list. */ - ut_ad(!mutex_own(&buf_pool.mutex)); - ut_ad(!mutex_own(&buf_pool.flush_list_mutex)); - ut_ad(!buf_page_get_mutex(bpage)->is_owned()); - ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE); - ut_ad(bpage->oldest_modification != 0); - - switch (buf_page_get_state(bpage)) { - case BUF_BLOCK_POOL_WATCH: - case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. 
*/ - case BUF_BLOCK_NOT_USED: - case BUF_BLOCK_READY_FOR_USE: - case BUF_BLOCK_MEMORY: - case BUF_BLOCK_REMOVE_HASH: - ut_error; - break; - case BUF_BLOCK_ZIP_DIRTY: - frame = bpage->zip.data; - buf_flush_update_zip_checksum(frame, bpage->zip_size()); - break; - case BUF_BLOCK_FILE_PAGE: - frame = bpage->zip.data; - if (!frame) { - frame = ((buf_block_t*) bpage)->frame; - } - - /* Skip the encryption and compression for the - freed page */ - if (bpage->status == buf_page_t::FREED) { - break; - } - - byte* page = reinterpret_cast<const buf_block_t*>(bpage)->frame; - - if (full_crc32) { - page = buf_page_encrypt(space, bpage, page); - frame = page; - } - - buf_flush_init_for_writing( - reinterpret_cast<const buf_block_t*>(bpage), page, - bpage->zip.data ? &bpage->zip : NULL, full_crc32); - break; - } - - if (!full_crc32) { - frame = buf_page_encrypt(space, bpage, frame); - } - - if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE)) { - const lsn_t lsn = mach_read_from_8(frame + FIL_PAGE_LSN); - ut_ad(lsn); - ut_ad(lsn >= bpage->oldest_modification); - ut_ad(!srv_read_only_mode); - log_write_up_to(lsn, true); - } else { - ut_ad(space->atomic_write_supported); - } + ut_ad(bpage->in_file()); + ut_ad(bpage->ready_for_flush()); + ut_ad(!sync || flush_type == IORequest::SINGLE_PAGE); + ut_ad(mutex_own(&buf_pool.mutex)); + + rw_lock_t *rw_lock; + bool no_fix_count= bpage->buf_fix_count() == 0; + + if (bpage->state() != BUF_BLOCK_FILE_PAGE) + rw_lock= nullptr; + else if (!(no_fix_count || flush_type == IORequest::FLUSH_LIST) || + (!no_fix_count && srv_shutdown_state <= SRV_SHUTDOWN_CLEANUP && + fsp_is_system_temporary(bpage->id().space()))) + /* This is a heuristic, to avoid expensive SX attempts. */ + /* For table residing in temporary tablespace sync is done + using IO_FIX and so before scheduling for flush ensure that + page is not fixed. 
*/ + return false; + else + { + rw_lock= &reinterpret_cast<buf_block_t*>(bpage)->lock; + if (flush_type != IORequest::FLUSH_LIST && + !rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE)) + return false; + } - if (bpage->status == buf_page_t::FREED) { - buf_flush_freed_page(bpage, space); - return; - } + /* We are committed to flushing by the time we get here */ + bpage->set_io_fix(BUF_IO_WRITE); + mutex_exit(&buf_pool.mutex); - const bool use_doublewrite = bpage->status != buf_page_t::INIT_ON_FLUSH - && space->use_doublewrite(); + if (flush_type == IORequest::FLUSH_LIST && rw_lock && + !rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE)) + { + if (!fsp_is_system_temporary(bpage->id().space())) + /* Avoid a potential deadlock with the doublewrite buffer, + which might be holding another buf_block_t::lock. */ + buf_dblwr_flush_buffered_writes(); + else + os_aio_wait_until_no_pending_writes(); - if (!use_doublewrite) { - ulint type = IORequest::WRITE; + rw_lock_sx_lock_gen(rw_lock, BUF_IO_WRITE); + } - IORequest request(type, bpage); + /* We are holding rw_lock = buf_block_t::lock in SX mode except if + this is a ROW_FORMAT=COMPRESSED page whose uncompressed page frame + has been evicted from the buffer pool. - /* TODO: pass the tablespace to fil_io() */ - fil_io(request, - sync, bpage->id, bpage->zip_size(), 0, - bpage->physical_size(), - frame, bpage); - } else { - ut_ad(!srv_read_only_mode); + Apart from possible rw_lock protection, bpage is also protected by + io_fix and oldest_modification()!=0. Thus, it cannot be relocated in + the buffer pool or removed from flush_list or LRU_list. */ +#if 0 /* rw_lock_own() does not hold because we passed BUF_IO_WRITE above. 
*/ + ut_ad(!rw_lock || rw_lock_own(rw_lock, RW_LOCK_SX)); +#endif - if (flush_type == BUF_FLUSH_SINGLE_PAGE) { - buf_dblwr_write_single_page(bpage, sync); - } else { - ut_ad(!sync); - buf_dblwr_add_to_batch(bpage); - } - } + const fil_space_t * const provided_space= space; + if (!space) + { + space= fil_space_acquire_for_io(bpage->id().space()); + if (UNIV_UNLIKELY(!space)) + { + mutex_enter(&buf_pool.mutex); + bpage->status= buf_page_t::NORMAL; + bpage->set_io_fix(BUF_IO_NONE); + if (rw_lock) + rw_lock_sx_unlock_gen(rw_lock, BUF_IO_WRITE); + return false; + } + } + ut_ad((space->purpose == FIL_TYPE_TEMPORARY) == + (space == fil_system.temp_space)); - /* When doing single page flushing the IO is done synchronously - and we flush the changes to disk only for the tablespace we - are working on. */ - if (sync) { - ut_ad(flush_type == BUF_FLUSH_SINGLE_PAGE); - if (space->purpose != FIL_TYPE_TEMPORARY) { - fil_flush(space); - } + const bool full_crc32= space->full_crc32(); - /* The tablespace could already have been dropped, - because fil_io(request, sync) would already have - decremented the node->n_pending. However, - buf_page_io_complete() only needs to look up the - tablespace during read requests, not during writes. */ - ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE); + DBUG_PRINT("ib_buf", ("flush %s %u page %u:%u", + sync ? "sync" : "async", (unsigned) flush_type, + bpage->id().space(), bpage->id().page_no())); + ut_ad(!mutex_own(&buf_pool.mutex)); + ut_ad(!mutex_own(&buf_pool.flush_list_mutex)); + ut_ad(bpage->io_fix() == BUF_IO_WRITE); + ut_ad(bpage->oldest_modification()); + ut_ad(bpage->state() == + (rw_lock ? BUF_BLOCK_FILE_PAGE : BUF_BLOCK_ZIP_PAGE)); + + /* Because bpage->status can only be changed while buf_block_t + exists, it cannot be modified for ROW_FORMAT=COMPRESSED pages + without first allocating the uncompressed page frame. Such + allocation cannot be completed due to our io_fix. So, bpage->status + is protected even if !rw_lock. 
*/ + const auto status= bpage->status; + + if (status != buf_page_t::FREED) + { + switch (buf_pool.n_flush[flush_type]++) { + case 0: + os_event_reset(buf_pool.no_flush[flush_type]); + break; #ifdef UNIV_DEBUG - dberr_t err = + case ULINT_UNDEFINED: + ut_error; + break; #endif - /* true means we want to evict this page from the - LRU list as well. */ - buf_page_io_complete(bpage, use_doublewrite, true); - - ut_ad(err == DB_SUCCESS); - } - - space->release_for_io(); - - /* Increment the counter of I/O operations used - for selecting LRU policy. */ - buf_LRU_stat_inc_io(); -} - -/** Write a flushable page asynchronously from the buffer pool to a file. -NOTE: 1. in simulated aio we must call os_aio_simulated_wake_handler_threads -after we have posted a batch of writes! 2. buf_page_get_mutex(bpage) must be -held upon entering this function. The LRU list mutex must be held if flush_type -== BUF_FLUSH_SINGLE_PAGE. Both mutexes will be released by this function if it -returns true. -@param[in] bpage buffer control block -@param[in] flush_type type of flush -@param[in] sync true if sync IO request -@return whether the page was flushed */ -bool buf_flush_page(buf_page_t* bpage, buf_flush_t flush_type, bool sync) -{ - BPageMutex* block_mutex; - - ut_ad(flush_type < BUF_FLUSH_N_TYPES); - ut_ad(mutex_own(&buf_pool.mutex)); - ut_ad(buf_page_in_file(bpage)); - ut_ad(!sync || flush_type == BUF_FLUSH_SINGLE_PAGE); - - block_mutex = buf_page_get_mutex(bpage); - ut_ad(mutex_own(block_mutex)); - - ut_ad(buf_flush_ready_for_flush(bpage, flush_type)); - - bool is_uncompressed = (buf_page_get_state(bpage) - == BUF_BLOCK_FILE_PAGE); - ut_ad(is_uncompressed == (block_mutex != &buf_pool.zip_mutex)); - - rw_lock_t* rw_lock; - bool no_fix_count = bpage->buf_fix_count == 0; - - if (!is_uncompressed) { - rw_lock = NULL; - } else if (!(no_fix_count || flush_type == BUF_FLUSH_LIST) - || (!no_fix_count - && srv_shutdown_state <= SRV_SHUTDOWN_CLEANUP - && 
fsp_is_system_temporary(bpage->id.space()))) { - /* This is a heuristic, to avoid expensive SX attempts. */ - /* For table residing in temporary tablespace sync is done - using IO_FIX and so before scheduling for flush ensure that - page is not fixed. */ - return false; - } else { - rw_lock = &reinterpret_cast<buf_block_t*>(bpage)->lock; - if (flush_type != BUF_FLUSH_LIST - && !rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE)) { - return false; - } - } - - /* We are committed to flushing by the time we get here */ - - buf_page_set_io_fix(bpage, BUF_IO_WRITE); + } + } - buf_page_set_flush_type(bpage, flush_type); + page_t *frame= bpage->zip.data; + size_t size, orig_size; - if (buf_pool.n_flush[flush_type] == 0) { - os_event_reset(buf_pool.no_flush[flush_type]); - } + if (UNIV_UNLIKELY(!rw_lock)) /* ROW_FORMAT=COMPRESSED */ + { + ut_ad(!space->full_crc32()); + ut_ad(!space->is_compressed()); /* not page_compressed */ + orig_size= size= bpage->zip_size(); + if (status != buf_page_t::FREED) + { + buf_flush_update_zip_checksum(frame, orig_size); + frame= buf_page_encrypt(space, bpage, frame, &size); + } + ut_ad(size == bpage->zip_size()); + } + else + { + buf_block_t *block= reinterpret_cast<buf_block_t*>(bpage); + byte *page= block->frame; + orig_size= size= block->physical_size(); - ++buf_pool.n_flush[flush_type]; - ut_ad(buf_pool.n_flush[flush_type] != 0); + if (status != buf_page_t::FREED) + { + if (full_crc32) + { + /* innodb_checksum_algorithm=full_crc32 is not implemented for + ROW_FORMAT=COMPRESSED pages. */ + ut_ad(!frame); + page= buf_page_encrypt(space, bpage, page, &size); + } + + buf_flush_init_for_writing(block, page, frame ? &bpage->zip : nullptr, + full_crc32); + + if (!full_crc32) + page= buf_page_encrypt(space, bpage, frame ? 
frame : page, &size); + } - mutex_exit(block_mutex); + frame= page; + } - mutex_exit(&buf_pool.mutex); + if (UNIV_LIKELY(space->purpose == FIL_TYPE_TABLESPACE)) + { + const lsn_t lsn= mach_read_from_8(frame + FIL_PAGE_LSN); + ut_ad(lsn); + ut_ad(lsn >= bpage->oldest_modification()); + ut_ad(!srv_read_only_mode); + log_write_up_to(lsn, true); + } + else + ut_ad(space->atomic_write_supported); - if (flush_type == BUF_FLUSH_LIST - && is_uncompressed - && !rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE)) { + bool use_doublewrite; + IORequest request(IORequest::WRITE, bpage, flush_type); - if (!fsp_is_system_temporary(bpage->id.space())) { - /* avoiding deadlock possibility involves - doublewrite buffer, should flush it, because - it might hold the another block->lock. */ - buf_dblwr_flush_buffered_writes(); - } else { - buf_dblwr_sync_datafiles(); - } + ut_ad(status == bpage->status); - rw_lock_sx_lock_gen(rw_lock, BUF_IO_WRITE); - } + switch (status) { + default: + ut_ad(status == buf_page_t::FREED); + buf_flush_freed_page(bpage, *space); + goto done; + case buf_page_t::NORMAL: + use_doublewrite= space->use_doublewrite(); - /* Even though bpage is not protected by any mutex at this - point, it is safe to access bpage, because it is io_fixed and - oldest_modification != 0. Thus, it cannot be relocated in the - buffer pool or removed from flush_list or LRU_list. 
*/ + if (use_doublewrite) + { + ut_ad(!srv_read_only_mode); + if (flush_type == IORequest::SINGLE_PAGE) + buf_dblwr->write_single_page(bpage, sync, size); + else + buf_dblwr->add_to_batch(bpage, flush_type, size); + break; + } + /* fall through */ + case buf_page_t::INIT_ON_FLUSH: + use_doublewrite= false; + if (size != orig_size) + request.set_punch_hole(); + /* FIXME: pass space to fil_io() */ + fil_io_t fio= fil_io(request, sync, bpage->id(), bpage->zip_size(), 0, + bpage->physical_size(), frame, bpage); + ut_ad(!fio.node || fio.node->space == space); + if (fio.node && sync) + fio.node->space->release_for_io(); + } - buf_flush_write_block_low(bpage, flush_type, sync); - return true; -} + if (sync) + { + ut_ad(bpage->io_fix() == BUF_IO_WRITE); -# if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG -/** Writes a flushable page asynchronously from the buffer pool to a file. -NOTE: block and LRU list mutexes must be held upon entering this function, and -they will be released by this function after flushing. This is loosely based on -buf_flush_batch() and buf_flush_page(). -@param[in,out] block buffer control block -@return whether the page was flushed and the mutex released */ -bool buf_flush_page_try(buf_block_t* block) -{ - ut_ad(mutex_own(&buf_pool.mutex)); - ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); - ut_ad(buf_page_mutex_own(block)); + /* When flushing single page synchronously, we flush the changes + only for the tablespace we are working on. */ + if (space->purpose != FIL_TYPE_TEMPORARY) + fil_flush(space); - if (!buf_flush_ready_for_flush(&block->page, BUF_FLUSH_SINGLE_PAGE)) { - return false; - } + if (size != orig_size && space->punch_hole) + request.set_punch_hole(); + buf_page_write_complete(bpage, request, use_doublewrite, true/*evict*/); + } - /* The following call will release the buf_pool and block mutex. 
*/ - return buf_flush_page(&block->page, BUF_FLUSH_SINGLE_PAGE, true); +done: + if (!provided_space) + space->release_for_io(); + /* Increment the I/O operation count used for selecting LRU policy. */ + buf_LRU_stat_inc_io(); + return true; } -# endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ -/** Check the page is in buffer pool and can be flushed. -@param[in] page_id page id -@param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST -@return true if the page can be flushed. */ -static -bool -buf_flush_check_neighbor( - const page_id_t page_id, - buf_flush_t flush_type) +/** Check whether a page can be flushed from the buf_pool. +@param id page identifier +@param flush LRU or FLUSH_LIST +@return whether the page can be flushed */ +static bool buf_flush_check_neighbor(const page_id_t id, + IORequest::flush_t flush) { - buf_page_t* bpage; - bool ret; + ut_ad(flush == IORequest::LRU || flush == IORequest::FLUSH_LIST); + ut_ad(mutex_own(&buf_pool.mutex)); - ut_ad(flush_type == BUF_FLUSH_LRU - || flush_type == BUF_FLUSH_LIST); + buf_page_t *bpage= buf_pool.page_hash_get_low(id); - mutex_enter(&buf_pool.mutex); - - bpage = buf_page_hash_get(page_id); + if (!bpage || buf_pool.watch_is_sentinel(*bpage)) + return false; - if (!bpage) { + /* We avoid flushing 'non-old' blocks in an LRU flush, because the + flushed blocks are soon freed */ - mutex_exit(&buf_pool.mutex); - return(false); - } + return (flush != IORequest::LRU || bpage->is_old()) && + bpage->ready_for_flush(); +} - ut_a(buf_page_in_file(bpage)); +/** Check which neighbors of a page can be flushed from the buf_pool. 
+@param space tablespace +@param id page identifier of a dirty page +@param flush LRU or FLUSH_LIST +@return last page number that can be flushed */ +static page_id_t buf_flush_check_neighbors(const fil_space_t &space, + page_id_t &id, + IORequest::flush_t flush) +{ + ut_ad(id.page_no() < space.size); + ut_ad(flush == IORequest::LRU || flush == IORequest::FLUSH_LIST); + /* When flushed, dirty blocks are searched in neighborhoods of this + size, and flushed along with the original page. */ + const ulint s= buf_pool.curr_size / 16; + const uint32_t read_ahead= buf_pool.read_ahead_area; + const uint32_t buf_flush_area= read_ahead > s + ? static_cast<uint32_t>(s) : read_ahead; + page_id_t low= id - (id.page_no() % buf_flush_area); + page_id_t high= low + buf_flush_area; + high.set_page_no(std::min(high.page_no(), + static_cast<uint32_t>(space.size - 1))); + + /* Determine the contiguous dirty area around id. */ + const ulint id_fold= id.fold(); - /* We avoid flushing 'non-old' blocks in an LRU flush, - because the flushed blocks are soon freed */ + mutex_enter(&buf_pool.mutex); - ret = false; - if (flush_type != BUF_FLUSH_LRU || buf_page_is_old(bpage)) { - BPageMutex* block_mutex = buf_page_get_mutex(bpage); + if (id > low) + { + ulint fold= id_fold; + for (page_id_t i= id - 1;; --i) + { + fold--; + ut_ad(i.fold() == fold); + if (!buf_flush_check_neighbor(i, flush)) + { + low= i + 1; + break; + } + if (i == low) + break; + } + } - mutex_enter(block_mutex); - if (buf_flush_ready_for_flush(bpage, flush_type)) { - ret = true; - } - mutex_exit(block_mutex); - } - mutex_exit(&buf_pool.mutex); + page_id_t i= id; + id= low; + ulint fold= id_fold; + while (++i < high) + { + ++fold; + ut_ad(i.fold() == fold); + if (!buf_flush_check_neighbor(i, flush)) + break; + } - return(ret); + mutex_exit(&buf_pool.mutex); + return i; } /** Flushes to disk all flushable pages within the flush area. 
@param[in] page_id page id -@param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST +@param[in] flush LRU or FLUSH_LIST @param[in] n_flushed number of pages flushed so far in this batch @param[in] n_to_flush maximum number of pages we are allowed to flush @return number of pages flushed */ @@ -1404,86 +1332,27 @@ static ulint buf_flush_try_neighbors( const page_id_t page_id, - buf_flush_t flush_type, + IORequest::flush_t flush, ulint n_flushed, ulint n_to_flush) { - ulint i; - ulint low; - ulint high; ulint count = 0; - ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); + ut_ad(flush == IORequest::LRU || flush == IORequest::FLUSH_LIST); fil_space_t* space = fil_space_acquire_for_io(page_id.space()); if (!space) { return 0; } - if (UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN - || !srv_flush_neighbors || !space->is_rotational()) { - /* If there is little space or neighbor flushing is - not enabled then just flush the victim. */ - low = page_id.page_no(); - high = page_id.page_no() + 1; - } else { - /* When flushed, dirty blocks are searched in - neighborhoods of this size, and flushed along with the - original page. */ - - ulint buf_flush_area; - - buf_flush_area = ut_min( - buf_pool.read_ahead_area, - buf_pool.curr_size / 16); + page_id_t id = page_id; + page_id_t high = (srv_flush_neighbors != 1 + || UT_LIST_GET_LEN(buf_pool.LRU) + < BUF_LRU_OLD_MIN_LEN + || !space->is_rotational()) + ? id + 1 /* Flush the minimum. 
*/ + : buf_flush_check_neighbors(*space, id, flush); - low = (page_id.page_no() / buf_flush_area) * buf_flush_area; - high = (page_id.page_no() / buf_flush_area + 1) * buf_flush_area; - - if (srv_flush_neighbors == 1) { - /* adjust 'low' and 'high' to limit - for contiguous dirty area */ - if (page_id.page_no() > low) { - for (i = page_id.page_no() - 1; i >= low; i--) { - if (!buf_flush_check_neighbor( - page_id_t(page_id.space(), i), - flush_type)) { - - break; - } - - if (i == low) { - /* Avoid overwrap when low == 0 - and calling - buf_flush_check_neighbor() with - i == (ulint) -1 */ - i--; - break; - } - } - low = i + 1; - } - - for (i = page_id.page_no() + 1; - i < high - && buf_flush_check_neighbor( - page_id_t(page_id.space(), i), - flush_type); - i++) { - /* do nothing */ - } - high = i; - } - } - - if (high > space->size) { - high = space->size; - } - - DBUG_PRINT("ib_buf", ("flush %u:%u..%u", - page_id.space(), - (unsigned) low, (unsigned) high)); - - for (ulint i = low; i < high; i++) { + for (; id < high; ++id) { buf_page_t* bpage; if ((count + n_flushed) >= n_to_flush) { @@ -1494,54 +1363,34 @@ buf_flush_try_neighbors( are flushing has not been flushed yet then we'll try to flush the victim that we selected originally. 
*/ - if (i <= page_id.page_no()) { - i = page_id.page_no(); + if (id <= page_id) { + id = page_id; } else { break; } } - const page_id_t cur_page_id(page_id.space(), i); - mutex_enter(&buf_pool.mutex); - bpage = buf_page_hash_get(cur_page_id); + bpage = buf_page_hash_get(id); if (bpage == NULL) { mutex_exit(&buf_pool.mutex); continue; } - ut_a(buf_page_in_file(bpage)); + ut_a(bpage->in_file()); /* We avoid flushing 'non-old' blocks in an LRU flush, because the flushed blocks are soon freed */ - if (flush_type != BUF_FLUSH_LRU - || i == page_id.page_no() - || buf_page_is_old(bpage)) { - - BPageMutex* block_mutex = buf_page_get_mutex(bpage); - - mutex_enter(block_mutex); - - if (buf_flush_ready_for_flush(bpage, flush_type) - && (i == page_id.page_no() - || bpage->buf_fix_count == 0)) { - - /* We also try to flush those - neighbors != offset */ - - if (buf_flush_page(bpage, flush_type, false)) { - ++count; - } else { - mutex_exit(block_mutex); - mutex_exit(&buf_pool.mutex); - } - - continue; - } else { - mutex_exit(block_mutex); + if (flush != IORequest::LRU + || id == page_id || bpage->is_old()) { + if (bpage->ready_for_flush() + && (id == page_id || bpage->buf_fix_count() == 0) + && buf_flush_page(bpage, flush, space, false)) { + ++count; + continue; } } mutex_exit(&buf_pool.mutex); @@ -1560,56 +1409,6 @@ buf_flush_try_neighbors( return(count); } -/** Check if the block is modified and ready for flushing. -If the the block is ready to flush then flush the page and try o flush -its neighbors. -@param[in] bpage buffer control block, -must be buf_page_in_file(bpage) -@param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST -@param[in] n_to_flush number of pages to flush -@param[in,out] count number of pages flushed -@return TRUE if buf_pool mutex was released during this function. -This does not guarantee that some pages were written as well. -Number of pages written are incremented to the count. 
*/ -static -bool -buf_flush_page_and_try_neighbors( - buf_page_t* bpage, - buf_flush_t flush_type, - ulint n_to_flush, - ulint* count) -{ - ut_ad(mutex_own(&buf_pool.mutex)); - - bool flushed; - BPageMutex* block_mutex = buf_page_get_mutex(bpage); - - mutex_enter(block_mutex); - - ut_a(buf_page_in_file(bpage)); - - if (buf_flush_ready_for_flush(bpage, flush_type)) { - const page_id_t page_id = bpage->id; - - mutex_exit(block_mutex); - mutex_exit(&buf_pool.mutex); - - /* Try to flush also all the neighbors */ - *count += buf_flush_try_neighbors( - page_id, flush_type, *count, n_to_flush); - - mutex_enter(&buf_pool.mutex); - flushed = true; - } else { - mutex_exit(block_mutex); - flushed = false; - } - - ut_ad(mutex_own(&buf_pool.mutex)); - - return(flushed); -} - /*******************************************************************//** This utility moves the uncompressed frames of pages to the free list. Note that this function does not actually flush any data to disk. It @@ -1642,14 +1441,11 @@ static ulint buf_free_from_unzip_LRU_list_batch(ulint max) released and reacquired */ ++count; block = UT_LIST_GET_LAST(buf_pool.unzip_LRU); - + free_len = UT_LIST_GET_LEN(buf_pool.free); + lru_len = UT_LIST_GET_LEN(buf_pool.unzip_LRU); } else { - block = UT_LIST_GET_PREV(unzip_LRU, block); } - - free_len = UT_LIST_GET_LEN(buf_pool.free); - lru_len = UT_LIST_GET_LEN(buf_pool.unzip_LRU); } ut_ad(mutex_own(&buf_pool.mutex)); @@ -1671,90 +1467,67 @@ The calling thread is not allowed to own any latches on pages! 
@param[in] max desired number of blocks to make available in the free list (best effort; not guaranteed) @param[out] n counts of flushed and evicted pages */ -static void buf_flush_LRU_list_batch(ulint max, flush_counters_t* n) +static void buf_flush_LRU_list_batch(ulint max, flush_counters_t *n) { - buf_page_t* bpage; - ulint scanned = 0; - ulint free_len = UT_LIST_GET_LEN(buf_pool.free); - ulint lru_len = UT_LIST_GET_LEN(buf_pool.LRU); - ulint withdraw_depth = 0; - - n->flushed = 0; - n->evicted = 0; - n->unzip_LRU_evicted = 0; - ut_ad(mutex_own(&buf_pool.mutex)); - if (buf_pool.curr_size < buf_pool.old_size - && buf_pool.withdraw_target > 0) { - withdraw_depth = buf_pool.withdraw_target - - UT_LIST_GET_LEN(buf_pool.withdraw); - } - - for (bpage = UT_LIST_GET_LAST(buf_pool.LRU); - bpage != NULL && n->flushed + n->evicted < max - && free_len < srv_LRU_scan_depth + withdraw_depth - && lru_len > BUF_LRU_MIN_LEN; - ++scanned, - bpage = buf_pool.lru_hp.get()) { - - buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage); - buf_pool.lru_hp.set(prev); - - BPageMutex* block_mutex = buf_page_get_mutex(bpage); - - mutex_enter(block_mutex); - - if (buf_flush_ready_for_replace(bpage)) { - /* block is ready for eviction i.e., it is - clean and is not IO-fixed or buffer fixed. */ - mutex_exit(block_mutex); - if (buf_LRU_free_page(bpage, true)) { - ++n->evicted; - } - } else if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_LRU)) { - /* Block is ready for flush. Dispatch an IO - request. The IO helper thread will put it on - free list in IO completion routine. */ - mutex_exit(block_mutex); - buf_flush_page_and_try_neighbors( - bpage, BUF_FLUSH_LRU, max, &n->flushed); - } else { - /* Can't evict or dispatch this block. Go to - previous. 
*/ - ut_ad(buf_pool.lru_hp.is_hp(prev)); - mutex_exit(block_mutex); - } - - ut_ad(!mutex_own(block_mutex)); - ut_ad(mutex_own(&buf_pool.mutex)); - - free_len = UT_LIST_GET_LEN(buf_pool.free); - lru_len = UT_LIST_GET_LEN(buf_pool.LRU); - } - - buf_pool.lru_hp.set(NULL); - - /* We keep track of all flushes happening as part of LRU - flush. When estimating the desired rate at which flush_list - should be flushed, we factor in this value. */ - buf_lru_flush_page_count += n->flushed; - - ut_ad(mutex_own(&buf_pool.mutex)); + ulint scanned= 0; + ulint free_limit= srv_LRU_scan_depth; + n->flushed = 0; + n->evicted = 0; + n->unzip_LRU_evicted = 0; + ut_ad(mutex_own(&buf_pool.mutex)); + if (buf_pool.withdraw_target && buf_pool.curr_size < buf_pool.old_size) + free_limit+= buf_pool.withdraw_target - UT_LIST_GET_LEN(buf_pool.withdraw); + + for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.LRU); + bpage && n->flushed + n->evicted < max && + UT_LIST_GET_LEN(buf_pool.LRU) > BUF_LRU_MIN_LEN && + UT_LIST_GET_LEN(buf_pool.free) < free_limit; + ++scanned, bpage= buf_pool.lru_hp.get()) + { + buf_page_t *prev= UT_LIST_GET_PREV(LRU, bpage); + buf_pool.lru_hp.set(prev); - if (n->evicted) { - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE, - MONITOR_LRU_BATCH_EVICT_COUNT, - MONITOR_LRU_BATCH_EVICT_PAGES, - n->evicted); - } + if (bpage->ready_for_replace()) + { + /* block is ready for eviction i.e., it is clean and is not + IO-fixed or buffer fixed. */ + if (buf_LRU_free_page(bpage, true)) + ++n->evicted; + } + else if (bpage->ready_for_flush()) + { + /* Block is ready for flush. Dispatch an IO request. The IO + helper thread will put it on free list in IO completion routine. */ + const page_id_t page_id(bpage->id()); + mutex_exit(&buf_pool.mutex); + n->flushed+= buf_flush_try_neighbors(page_id, IORequest::LRU, n->flushed, + max); + mutex_enter(&buf_pool.mutex); + } + else + /* Can't evict or dispatch this block. Go to previous. 
*/ + ut_ad(buf_pool.lru_hp.is_hp(prev)); + } - if (scanned) { - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_LRU_BATCH_SCANNED, - MONITOR_LRU_BATCH_SCANNED_NUM_CALL, - MONITOR_LRU_BATCH_SCANNED_PER_CALL, - scanned); - } + buf_pool.lru_hp.set(nullptr); + + /* We keep track of all flushes happening as part of LRU flush. When + estimating the desired rate at which flush_list should be flushed, + we factor in this value. */ + buf_lru_flush_page_count+= n->flushed; + + ut_ad(mutex_own(&buf_pool.mutex)); + + if (n->evicted) + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE, + MONITOR_LRU_BATCH_EVICT_COUNT, + MONITOR_LRU_BATCH_EVICT_PAGES, + n->evicted); + if (scanned) + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_LRU_BATCH_SCANNED, + MONITOR_LRU_BATCH_SCANNED_NUM_CALL, + MONITOR_LRU_BATCH_SCANNED_PER_CALL, + scanned); } /** Flush and move pages from LRU or unzip_LRU list to the free list. @@ -1785,76 +1558,68 @@ The calling thread is not allowed to own any latches on pages! not guaranteed that the actual number is that big, though) @param[in] lsn_limit all blocks whose oldest_modification is smaller than this should be flushed (if their number does not exceed min_n) -@return number of blocks for which the write request was queued; -ULINT_UNDEFINED if there was a flush of the same type already -running */ +@return number of blocks for which the write request was queued */ static ulint buf_do_flush_list_batch(ulint min_n, lsn_t lsn_limit) { - ulint count = 0; - ulint scanned = 0; - - ut_ad(mutex_own(&buf_pool.mutex)); - - /* Start from the end of the list looking for a suitable - block to be flushed. */ - mutex_enter(&buf_pool.flush_list_mutex); - ulint len = UT_LIST_GET_LEN(buf_pool.flush_list); - - /* In order not to degenerate this scan to O(n*n) we attempt - to preserve pointer of previous block in the flush list. To do - so we declare it a hazard pointer. 
Any thread working on the - flush list must check the hazard pointer and if it is removing - the same block then it must reset it. */ - for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool.flush_list); - count < min_n && bpage != NULL && len > 0 - && bpage->oldest_modification < lsn_limit; - bpage = buf_pool.flush_hp.get(), - ++scanned) { - - buf_page_t* prev; - - ut_a(bpage->oldest_modification > 0); - ut_ad(bpage->in_flush_list); - - prev = UT_LIST_GET_PREV(list, bpage); - buf_pool.flush_hp.set(prev); - mutex_exit(&buf_pool.flush_list_mutex); - -#ifdef UNIV_DEBUG - bool flushed = -#endif /* UNIV_DEBUG */ - buf_flush_page_and_try_neighbors( - bpage, BUF_FLUSH_LIST, min_n, &count); - - mutex_enter(&buf_pool.flush_list_mutex); - - ut_ad(flushed || buf_pool.flush_hp.is_hp(prev)); - - --len; - } + ulint count= 0; + ulint scanned= 0; + + ut_ad(mutex_own(&buf_pool.mutex)); + + /* Start from the end of the list looking for a suitable block to be + flushed. */ + mutex_enter(&buf_pool.flush_list_mutex); + ulint len = UT_LIST_GET_LEN(buf_pool.flush_list); + + /* In order not to degenerate this scan to O(n*n) we attempt to + preserve pointer of previous block in the flush list. To do so we + declare it a hazard pointer. Any thread working on the flush list + must check the hazard pointer and if it is removing the same block + then it must reset it. 
*/ + for (buf_page_t *bpage= UT_LIST_GET_LAST(buf_pool.flush_list); + bpage && len && count < min_n; + bpage= buf_pool.flush_hp.get(), ++scanned, len--) + { + const lsn_t oldest_modification= bpage->oldest_modification(); + if (oldest_modification >= lsn_limit) + break; + ut_a(oldest_modification); - buf_pool.flush_hp.set(NULL); - mutex_exit(&buf_pool.flush_list_mutex); + buf_page_t *prev= UT_LIST_GET_PREV(list, bpage); + buf_pool.flush_hp.set(prev); + mutex_exit(&buf_pool.flush_list_mutex); - if (scanned) { - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_FLUSH_BATCH_SCANNED, - MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL, - MONITOR_FLUSH_BATCH_SCANNED_PER_CALL, - scanned); - } + ut_ad(bpage->in_file()); + const bool flushed= bpage->ready_for_flush(); - if (count) { - MONITOR_INC_VALUE_CUMULATIVE( - MONITOR_FLUSH_BATCH_TOTAL_PAGE, - MONITOR_FLUSH_BATCH_COUNT, - MONITOR_FLUSH_BATCH_PAGES, - count); - } + if (flushed) + { + const page_id_t page_id(bpage->id()); + mutex_exit(&buf_pool.mutex); + count+= buf_flush_try_neighbors(page_id, IORequest::FLUSH_LIST, + count, min_n); + mutex_enter(&buf_pool.mutex); + } - ut_ad(mutex_own(&buf_pool.mutex)); + mutex_enter(&buf_pool.flush_list_mutex); + ut_ad(flushed || buf_pool.flush_hp.is_hp(prev)); + } - return(count); + buf_pool.flush_hp.set(nullptr); + mutex_exit(&buf_pool.flush_list_mutex); + + if (scanned) + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_SCANNED, + MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL, + MONITOR_FLUSH_BATCH_SCANNED_PER_CALL, + scanned); + if (count) + MONITOR_INC_VALUE_CUMULATIVE(MONITOR_FLUSH_BATCH_TOTAL_PAGE, + MONITOR_FLUSH_BATCH_COUNT, + MONITOR_FLUSH_BATCH_PAGES, + count); + ut_ad(mutex_own(&buf_pool.mutex)); + return count; } /** This utility flushes dirty blocks from the end of the LRU list or @@ -1863,45 +1628,39 @@ NOTE 1: in the case of an LRU flush the calling thread may own latches to pages: to avoid deadlocks, this function must be written so that it cannot end up waiting for these latches! 
NOTE 2: in the case of a flush list flush, the calling thread is not allowed to own any latches on pages! -@param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST; if -BUF_FLUSH_LIST, then the caller must not own any latches on pages +@param[in] lru true=LRU; false=FLUSH_LIST; +if !lru, then the caller must not own any latches on pages @param[in] min_n wished minimum mumber of blocks flushed (it is not guaranteed that the actual number is that big, though) -@param[in] lsn_limit in the case of BUF_FLUSH_LIST all blocks whose +@param[in] lsn_limit in the case of !lru all blocks whose @param[out] n counts of flushed and evicted pages oldest_modification is smaller than this should be flushed (if their number does not exceed min_n), otherwise ignored */ static void buf_flush_batch( - buf_flush_t flush_type, + bool lru, ulint min_n, lsn_t lsn_limit, flush_counters_t* n) { - ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); - ut_ad(flush_type == BUF_FLUSH_LRU - || !sync_check_iterate(dict_sync_check())); + ut_ad(lru || !sync_check_iterate(dict_sync_check())); mutex_enter(&buf_pool.mutex); /* Note: The buffer pool mutex is released and reacquired within the flush functions. */ - switch (flush_type) { - case BUF_FLUSH_LRU: + if (lru) { buf_do_LRU_batch(min_n, n); - break; - case BUF_FLUSH_LIST: + } else { n->flushed = buf_do_flush_list_batch(min_n, lsn_limit); n->evicted = 0; - break; - default: - ut_error; } mutex_exit(&buf_pool.mutex); - DBUG_LOG("ib_buf", "flush " << flush_type << " completed"); + DBUG_PRINT("ib_buf", + (lru ? 
"LRU flush completed" : "flush_list completed")); } /******************************************************************//** @@ -1925,101 +1684,84 @@ buf_flush_stats( } /** Start a buffer flush batch for LRU or flush list -@param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST +@param[in] lru true=buf_pool.LRU; false=buf_pool.flush_list @return whether the flush batch was started (was not already running) */ -static bool buf_flush_start(buf_flush_t flush_type) +static bool buf_flush_start(bool lru) { - ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); - - mutex_enter(&buf_pool.mutex); - - if (buf_pool.n_flush[flush_type] > 0 - || buf_pool.init_flush[flush_type] == TRUE) { - - /* There is already a flush batch of the same type running */ - - mutex_exit(&buf_pool.mutex); - - return(false); - } - - buf_pool.init_flush[flush_type] = TRUE; - - os_event_reset(buf_pool.no_flush[flush_type]); + IORequest::flush_t flush_type= lru ? IORequest::LRU : IORequest::FLUSH_LIST; + mutex_enter(&buf_pool.mutex); - mutex_exit(&buf_pool.mutex); + if (buf_pool.n_flush[flush_type] > 0 || buf_pool.init_flush[flush_type]) + { + /* There is already a flush batch of the same type running */ + mutex_exit(&buf_pool.mutex); + return false; + } - return(true); + buf_pool.init_flush[flush_type]= true; + os_event_reset(buf_pool.no_flush[flush_type]); + mutex_exit(&buf_pool.mutex); + return true; } /** End a buffer flush batch. -@param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST */ -static void buf_flush_end(buf_flush_t flush_type) +@param[in] lru true=buf_pool.LRU; false=buf_pool.flush_list */ +static void buf_flush_end(bool lru) { - mutex_enter(&buf_pool.mutex); + IORequest::flush_t flush_type= lru ? 
IORequest::LRU : IORequest::FLUSH_LIST; - buf_pool.init_flush[flush_type] = FALSE; - - buf_pool.try_LRU_scan = TRUE; - - if (buf_pool.n_flush[flush_type] == 0) { + mutex_enter(&buf_pool.mutex); - /* The running flush batch has ended */ + buf_pool.init_flush[flush_type]= false; + buf_pool.try_LRU_scan= true; - os_event_set(buf_pool.no_flush[flush_type]); - } + if (!buf_pool.n_flush[flush_type]) + /* The running flush batch has ended */ + os_event_set(buf_pool.no_flush[flush_type]); - mutex_exit(&buf_pool.mutex); + mutex_exit(&buf_pool.mutex); - if (!srv_read_only_mode) { - buf_dblwr_flush_buffered_writes(); - } + if (!srv_read_only_mode) + buf_dblwr_flush_buffered_writes(); } /** Wait until a flush batch ends. -@param[in] type BUF_FLUSH_LRU or BUF_FLUSH_LIST */ -void buf_flush_wait_batch_end(buf_flush_t type) +@param[in] lru true=buf_pool.LRU; false=buf_pool.flush_list */ +void buf_flush_wait_batch_end(bool lru) { - ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST); - thd_wait_begin(NULL, THD_WAIT_DISKIO); - os_event_wait(buf_pool.no_flush[type]); - thd_wait_end(NULL); + thd_wait_begin(nullptr, THD_WAIT_DISKIO); + os_event_wait(buf_pool.no_flush[lru + ? IORequest::LRU : IORequest::FLUSH_LIST]); + thd_wait_end(nullptr); } /** Do flushing batch of a given type. NOTE: The calling thread is not allowed to own any latches on pages! -@param[in] type flush type +@param[in] lru true=buf_pool.LRU; false=buf_pool.flush_list @param[in] min_n wished minimum mumber of blocks flushed (it is not guaranteed that the actual number is that big, though) -@param[in] lsn_limit in the case BUF_FLUSH_LIST all blocks whose +@param[in] lsn_limit if !lru, all blocks whose oldest_modification is smaller than this should be flushed (if their number does not exceed min_n), otherwise ignored @param[out] n_processed the number of pages which were processed is passed back to caller. Ignored if NULL @retval true if a batch was queued successfully. 
@retval false if another batch of same type was already running. */ -bool -buf_flush_do_batch( - buf_flush_t type, - ulint min_n, - lsn_t lsn_limit, - flush_counters_t* n) +bool buf_flush_do_batch(bool lru, ulint min_n, lsn_t lsn_limit, + flush_counters_t *n) { - ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST); - - if (n != NULL) { - n->flushed = 0; - } + if (n) + n->flushed= 0; - if (!buf_flush_start(type)) { - return(false); - } + if (!buf_flush_start(lru)) + return false; - buf_flush_batch(type, min_n, lsn_limit, n); - buf_flush_end(type); + buf_flush_batch(lru, min_n, lsn_limit, n); + buf_flush_end(lru); - return(true); + return true; } + /** Wait until a flush batch of the given lsn ends @param[in] new_oldest target oldest_modified_lsn to wait for */ void buf_flush_wait_flushed(lsn_t new_oldest) @@ -2032,18 +1774,17 @@ void buf_flush_wait_flushed(lsn_t new_oldest) mutex_enter(&buf_pool.flush_list_mutex); buf_page_t* bpage; - /* FIXME: Keep temporary tablespace pages in a separate flush list. We would only need to write out temporary pages if the page is about to be evicted from the buffer pool, and the page contents is still needed (the page has not been freed). */ for (bpage = UT_LIST_GET_LAST(buf_pool.flush_list); - bpage && fsp_is_system_temporary(bpage->id.space()); + bpage && fsp_is_system_temporary(bpage->id().space()); bpage = UT_LIST_GET_PREV(list, bpage)) { - ut_ad(bpage->in_flush_list); + ut_ad(bpage->oldest_modification()); } - lsn_t oldest = bpage ? bpage->oldest_modification : 0; + lsn_t oldest = bpage ? bpage->oldest_modification() : 0; mutex_exit(&buf_pool.flush_list_mutex); @@ -2062,7 +1803,7 @@ void buf_flush_wait_flushed(lsn_t new_oldest) NOTE: The calling thread is not allowed to own any latches on pages! 
@param[in] min_n wished minimum mumber of blocks flushed (it is not guaranteed that the actual number is that big, though) -@param[in] lsn_limit in the case BUF_FLUSH_LIST all blocks whose +@param[in] lsn_limit all blocks whose oldest_modification is smaller than this should be flushed (if their number does not exceed min_n), otherwise ignored @param[out] n_processed the number of pages which were processed is @@ -2073,8 +1814,7 @@ bool buf_flush_lists(ulint min_n, lsn_t lsn_limit, ulint *n_processed) { flush_counters_t n; - bool success = buf_flush_do_batch( - BUF_FLUSH_LIST, min_n, lsn_limit, &n); + bool success = buf_flush_do_batch(false, min_n, lsn_limit, &n); if (n.flushed) { buf_flush_stats(n.flushed, 0); @@ -2097,68 +1837,49 @@ is not fast enough to keep pace with the workload. @return true if success. */ bool buf_flush_single_page_from_LRU() { - ulint scanned; - buf_page_t* bpage; - ibool freed; + ulint scanned = 0; + bool freed = false; mutex_enter(&buf_pool.mutex); - for (bpage = buf_pool.single_scan_itr.start(), scanned = 0, - freed = false; - bpage != NULL; + for (buf_page_t* bpage = buf_pool.single_scan_itr.start(); bpage; ++scanned, bpage = buf_pool.single_scan_itr.get()) { ut_ad(mutex_own(&buf_pool.mutex)); buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage); buf_pool.single_scan_itr.set(prev); - BPageMutex* block_mutex; - - block_mutex = buf_page_get_mutex(bpage); - mutex_enter(block_mutex); + if (!bpage->ready_for_flush()) { // FIXME: ready_for_replace() + continue; + } - if (buf_flush_ready_for_replace(bpage)) { + if (!bpage->buf_fix_count() + && buf_LRU_free_page(bpage, true)) { /* block is ready for eviction i.e., it is clean and is not IO-fixed or buffer fixed. */ - mutex_exit(block_mutex); - - if (buf_LRU_free_page(bpage, true)) { - mutex_exit(&buf_pool.mutex); - freed = true; - break; - } - - } else if (buf_flush_ready_for_flush( - bpage, BUF_FLUSH_SINGLE_PAGE)) { - + freed = true; + break; + } else { /* Block is ready for flush. 
Try and dispatch an IO request. We'll put it on free list in IO completion routine if it is not buffer fixed. The following call - will release the buffer pool and block mutex. + will release the buf_pool.mutex. Note: There is no guarantee that this page has actually been freed, only that it has been flushed to disk */ - freed = buf_flush_page(bpage, BUF_FLUSH_SINGLE_PAGE, - true); + freed = buf_flush_page(bpage, IORequest::SINGLE_PAGE, + nullptr, true); if (freed) { - break; + goto found; } - - mutex_exit(block_mutex); - } else { - mutex_exit(block_mutex); } - ut_ad(!mutex_own(block_mutex)); - } - if (!freed) { - /* Can't find a single flushable page. */ - ut_ad(!bpage); - mutex_exit(&buf_pool.mutex); } + mutex_exit(&buf_pool.mutex); +found: if (scanned) { MONITOR_INC_VALUE_CUMULATIVE( MONITOR_LRU_SINGLE_FLUSH_SCANNED, @@ -2207,7 +1928,7 @@ static ulint buf_flush_LRU_list() that can trigger an LRU flush at the same time. So, it is not possible that a batch triggered during last iteration is still running, */ - buf_flush_do_batch(BUF_FLUSH_LRU, scan_depth, 0, &n); + buf_flush_do_batch(true, scan_depth, 0, &n); return(n.flushed); } @@ -2215,13 +1936,12 @@ static ulint buf_flush_LRU_list() /** Wait for any possible LRU flushes to complete. 
*/ void buf_flush_wait_LRU_batch_end() { - mutex_enter(&buf_pool.mutex); - bool wait = buf_pool.n_flush[BUF_FLUSH_LRU] - || buf_pool.init_flush[BUF_FLUSH_LRU]; - mutex_exit(&buf_pool.mutex); - if (wait) { - buf_flush_wait_batch_end(BUF_FLUSH_LRU); - } + mutex_enter(&buf_pool.mutex); + bool wait= buf_pool.n_flush[IORequest::LRU] || + buf_pool.init_flush[IORequest::LRU]; + mutex_exit(&buf_pool.mutex); + if (wait) + buf_flush_wait_batch_end(true); } /*********************************************************************//** @@ -2461,7 +2181,7 @@ page_cleaner_flush_pages_recommendation(ulint last_pages_in) for (buf_page_t* b = UT_LIST_GET_LAST(buf_pool.flush_list); b != NULL; b = UT_LIST_GET_PREV(list, b)) { - if (b->oldest_modification > target_lsn) { + if (b->oldest_modification() > target_lsn) { break; } ++pages_for_lsn; @@ -2559,7 +2279,7 @@ pc_sleep_if_needed( Requests for all slots to flush. @param min_n wished minimum mumber of blocks flushed (it is not guaranteed that the actual number is that big) -@param lsn_limit in the case BUF_FLUSH_LIST all blocks whose +@param lsn_limit in the case of buf_pool.flush_list all blocks whose oldest_modification is smaller than this should be flushed (if their number does not exceed min_n), otherwise ignored */ @@ -2638,7 +2358,7 @@ static ulint pc_flush_slot() list_tm = ut_time_ms(); page_cleaner.slot.succeeded_list = buf_flush_do_batch( - BUF_FLUSH_LIST, + false, page_cleaner.slot.n_pages_requested, page_cleaner.lsn_limit, &n); @@ -2790,25 +2510,18 @@ static os_thread_ret_t DECLARE_THREAD(buf_flush_page_cleaner)(void*) break; } - switch (recv_sys.flush_type) { - case BUF_FLUSH_LRU: + if (recv_sys.flush_lru) { /* Flush pages from end of LRU if required */ pc_request(0, LSN_MAX); while (pc_flush_slot() > 0) {} pc_wait_finished(&n_flushed_lru, &n_flushed_list); - break; - - case BUF_FLUSH_LIST: + } else { /* Flush all pages */ do { pc_request(ULINT_MAX, LSN_MAX); while (pc_flush_slot() > 0) {} } while 
(!pc_wait_finished(&n_flushed_lru, &n_flushed_list)); - break; - - default: - ut_ad(0); } os_event_reset(recv_sys.flush_start); @@ -3064,7 +2777,7 @@ static os_thread_ret_t DECLARE_THREAD(buf_flush_page_cleaner)(void*) considering end of that batch as a finish of our final sweep and we'll come out of the loop leaving behind dirty pages in the flush_list */ - buf_flush_wait_batch_end(BUF_FLUSH_LIST); + buf_flush_wait_batch_end(false); buf_flush_wait_LRU_batch_end(); bool success; @@ -3080,7 +2793,7 @@ static os_thread_ret_t DECLARE_THREAD(buf_flush_page_cleaner)(void*) n_flushed = n_flushed_lru + n_flushed_list; - buf_flush_wait_batch_end(BUF_FLUSH_LIST); + buf_flush_wait_batch_end(false); buf_flush_wait_LRU_batch_end(); } while (!success || n_flushed > 0); @@ -3136,7 +2849,7 @@ void buf_flush_sync() bool success; do { success = buf_flush_lists(ULINT_MAX, LSN_MAX, NULL); - buf_flush_wait_batch_end(BUF_FLUSH_LIST); + buf_flush_wait_batch_end(false); } while (!success); } @@ -3155,13 +2868,13 @@ void buf_flush_request_force(lsn_t lsn_limit) os_event_set(buf_flush_event); } -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +#ifdef UNIV_DEBUG /** Functor to validate the flush list. */ struct Check { void operator()(const buf_page_t* elem) const { - ut_a(elem->in_flush_list); + ut_a(elem->oldest_modification()); } }; @@ -3185,17 +2898,15 @@ static void buf_flush_validate_low() } while (bpage != NULL) { - const lsn_t om = bpage->oldest_modification; - ut_ad(bpage->in_flush_list); - + const lsn_t om = bpage->oldest_modification(); /* A page in buf_pool.flush_list can be in BUF_BLOCK_REMOVE_HASH state. This happens when a page is in the middle of being relocated. In that case the original descriptor can have this state and still be in the flush list waiting to acquire the buf_pool.flush_list_mutex to complete the relocation. 
*/ - ut_a(buf_page_in_file(bpage) - || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH); + ut_a(bpage->in_file() + || bpage->state() == BUF_BLOCK_REMOVE_HASH); ut_a(om > 0); if (UNIV_LIKELY_NULL(buf_pool.flush_rbt)) { @@ -3211,7 +2922,7 @@ static void buf_flush_validate_low() bpage = UT_LIST_GET_NEXT(list, bpage); - ut_a(bpage == NULL || om >= bpage->oldest_modification); + ut_a(!bpage || om >= bpage->oldest_modification()); } /* By this time we must have exhausted the traversal of @@ -3226,5 +2937,4 @@ void buf_flush_validate() buf_flush_validate_low(); mutex_exit(&buf_pool.flush_list_mutex); } - -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#endif /* UNIV_DEBUG */ diff --git a/storage/innobase/buf/buf0lru.cc b/storage/innobase/buf/buf0lru.cc index 66aa34254a6..0da921c120b 100644 --- a/storage/innobase/buf/buf0lru.cc +++ b/storage/innobase/buf/buf0lru.cc @@ -125,37 +125,30 @@ least this many milliseconds ago. Not protected by any mutex or latch. */ uint buf_LRU_old_threshold_ms; /* @} */ -/******************************************************************//** -Takes a block out of the LRU list and page hash table. -If the block is compressed-only (BUF_BLOCK_ZIP_PAGE), +/** Remove bpage from buf_pool.LRU and buf_pool.page_hash. + +If bpage->state() == BUF_BLOCK_ZIP_PAGE && !bpage->oldest_modification(), the object will be freed. -The caller must hold buf_pool.mutex, the buf_page_get_mutex() mutex -and the appropriate hash_lock. This function will release the -buf_page_get_mutex() and the hash_lock. +@param bpage buffer block +@param id page identifier +@param hash_lock buf_pool.page_hash latch (will be released here) +@param zip whether bpage->zip of BUF_BLOCK_FILE_PAGE should be freed If a compressed page is freed other compressed pages may be relocated. @retval true if BUF_BLOCK_FILE_PAGE was removed from page_hash. The caller needs to free the page to the free list @retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. 
In this case the block is already returned to the buddy allocator. */ -static MY_ATTRIBUTE((warn_unused_result)) -bool -buf_LRU_block_remove_hashed( -/*========================*/ - buf_page_t* bpage, /*!< in: block, must contain a file page and - be in a state where it can be freed; there - may or may not be a hash index to the page */ - bool zip); /*!< in: true if should remove also the - compressed page of an uncompressed page */ -/******************************************************************//** -Puts a file page whose has no hash index to the free list. */ -static -void -buf_LRU_block_free_hashed_page( -/*===========================*/ - buf_block_t* block); /*!< in: block, must contain a file page and - be in a state where it can be freed */ +static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id, + rw_lock_t *hash_lock, bool zip); + +/** Free a block to buf_pool */ +static void buf_LRU_block_free_hashed_page(buf_block_t *block) +{ + block->page.free_file_page(); + buf_LRU_block_free_non_file_page(block); +} /** Increase LRU size in bytes by the page size. @param[in] bpage control block */ @@ -217,35 +210,30 @@ mutex and try to force a context switch. Then reacquire the same mutexes. The current page is "fixed" before the release of the mutexes and then "unfixed" again once we have reacquired the mutexes. @param[in,out] bpage current page */ -static void buf_flush_yield(buf_page_t* bpage) +static void buf_flush_yield(buf_page_t *bpage) { - BPageMutex* block_mutex; - - ut_ad(buf_page_in_file(bpage)); - - block_mutex = buf_page_get_mutex(bpage); - - mutex_enter(block_mutex); - - /* "Fix" the block so that the position cannot be - changed after we release the buffer pool and - block mutexes. */ - buf_page_set_sticky(bpage); - - /* Now it is safe to release the buf_pool.mutex. */ - mutex_exit(&buf_pool.mutex); - - mutex_exit(block_mutex); - /* Try and force a context switch. 
*/ - os_thread_yield(); + mutex_exit(&buf_pool.flush_list_mutex); + ut_ad(bpage->oldest_modification()); + ut_ad(bpage->in_file()); + ut_ad(bpage->io_fix() == BUF_IO_NONE); + /** Make the block sticky, so that even after we release buf_pool.mutex: + (1) it cannot be removed from the buf_pool.flush_list + (2) bpage cannot be relocated in buf_pool + (3) bpage->in_LRU_list cannot change + However, bpage->LRU can change. */ + bpage->set_io_fix(BUF_IO_PIN); + mutex_exit(&buf_pool.mutex); - mutex_enter(&buf_pool.mutex); - mutex_enter(block_mutex); + /* Try and force a context switch. */ + os_thread_yield(); - /* "Unfix" the block now that we have both the - buffer pool and block mutex again. */ - buf_page_unset_sticky(bpage); - mutex_exit(block_mutex); + mutex_enter(&buf_pool.mutex); + bpage->io_unfix(); + mutex_enter(&buf_pool.flush_list_mutex); + /* Should not have been removed from the flush + list during the yield. However, this check is + not sufficient to catch a remove -> add. */ + ut_ad(bpage->oldest_modification()); } /******************************************************************//** @@ -268,23 +256,12 @@ buf_flush_try_yield( if (bpage != NULL && processed >= BUF_LRU_DROP_SEARCH_SIZE - && buf_page_get_io_fix(bpage) == BUF_IO_NONE) { - - mutex_exit(&buf_pool.flush_list_mutex); + && bpage->io_fix() == BUF_IO_NONE) { - /* Release the buffer pool and block mutex + /* Release the buf_pool.mutex to give the other threads a go. */ buf_flush_yield(bpage); - - mutex_enter(&buf_pool.flush_list_mutex); - - /* Should not have been removed from the flush - list during the yield. However, this check is - not sufficient to catch a remove -> add. */ - - ut_ad(bpage->in_flush_list); - return(true); } @@ -301,11 +278,11 @@ static bool buf_flush_or_remove_page(buf_page_t *bpage, bool flush) ut_ad(mutex_own(&buf_pool.mutex)); ut_ad(mutex_own(&buf_pool.flush_list_mutex)); - /* bpage->space and bpage->io_fix are protected by - buf_pool.mutex and block_mutex. 
It is safe to check - them while holding buf_pool.mutex only. */ + /* bpage->id and bpage->io_fix are protected by + buf_pool.mutex (and bpage->id additionally by hash_lock). + It is safe to check them while holding buf_pool.mutex only. */ - if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) { + if (bpage->io_fix() != BUF_IO_NONE) { /* We cannot remove this page during this scan yet; maybe the system is currently reading it @@ -314,11 +291,8 @@ static bool buf_flush_or_remove_page(buf_page_t *bpage, bool flush) } - BPageMutex* block_mutex; bool processed = false; - block_mutex = buf_page_get_mutex(bpage); - /* We have to release the flush_list_mutex to obey the latching order. We are however guaranteed that the page will stay in the flush_list and won't be relocated because @@ -327,37 +301,22 @@ static bool buf_flush_or_remove_page(buf_page_t *bpage, bool flush) mutex_exit(&buf_pool.flush_list_mutex); - mutex_enter(block_mutex); - - ut_ad(bpage->oldest_modification != 0); + ut_ad(bpage->oldest_modification()); if (!flush) { - buf_flush_remove(bpage); - - mutex_exit(block_mutex); - processed = true; - - } else if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_SINGLE_PAGE)) { - - /* The following call will release the buffer pool - and block mutex. */ - processed = buf_flush_page( - bpage, BUF_FLUSH_SINGLE_PAGE, false); + } else if (bpage->ready_for_flush()) { + processed = buf_flush_page(bpage, IORequest::SINGLE_PAGE, + nullptr, false); if (processed) { mutex_enter(&buf_pool.mutex); - } else { - mutex_exit(block_mutex); } - } else { - mutex_exit(block_mutex); } mutex_enter(&buf_pool.flush_list_mutex); - ut_ad(!mutex_own(block_mutex)); ut_ad(mutex_own(&buf_pool.mutex)); return(processed); @@ -385,17 +344,19 @@ rescan: bpage != NULL; bpage = prev) { - ut_a(buf_page_in_file(bpage)); + ut_a(bpage->in_file()); /* Save the previous link because once we free the page we can't rely on the links. 
*/ prev = UT_LIST_GET_PREV(list, bpage); - if (id != bpage->id.space()) { + const page_id_t bpage_id(bpage->id()); + + if (id != bpage_id.space()) { /* Skip this block, because it is for a different tablespace. */ - } else if (bpage->id.page_no() < first) { + } else if (bpage_id.page_no() < first) { /* Skip this block, because it is below the limit. */ } else if (!buf_flush_or_remove_page(bpage, flush)) { @@ -470,10 +431,9 @@ static void buf_flush_dirty_pages(ulint id, bool flush, ulint first) for (buf_page_t *bpage= UT_LIST_GET_FIRST(buf_pool.flush_list); bpage; bpage= UT_LIST_GET_NEXT(list, bpage)) { - ut_ad(buf_page_in_file(bpage)); - ut_ad(bpage->in_flush_list); - ut_ad(bpage->oldest_modification > 0); - ut_ad(id != bpage->id.space()); + ut_ad(bpage->in_file()); + ut_ad(bpage->oldest_modification()); + ut_ad(id != bpage->id().space()); } mutex_exit(&buf_pool.flush_list_mutex); @@ -501,7 +461,7 @@ void buf_LRU_flush_or_remove_pages(ulint id, bool flush, ulint first) } } -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +#ifdef UNIV_DEBUG /********************************************************************//** Insert a compressed block into buf_pool.zip_clean in the LRU order. */ void @@ -510,7 +470,8 @@ buf_LRU_insert_zip_clean( buf_page_t* bpage) /*!< in: pointer to the block in question */ { ut_ad(mutex_own(&buf_pool.mutex)); - ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_PAGE); + ut_ad(bpage->state() == BUF_BLOCK_ZIP_PAGE); + ut_ad(!bpage->oldest_modification()); /* Find the first successor of bpage in the LRU list that is in the zip_clean list. */ @@ -518,7 +479,8 @@ buf_LRU_insert_zip_clean( do { b = UT_LIST_GET_NEXT(LRU, b); - } while (b && buf_page_get_state(b) != BUF_BLOCK_ZIP_PAGE); + } while (b && (b->state() != BUF_BLOCK_ZIP_PAGE + || b->oldest_modification())); /* Insert bpage before b, i.e., after the predecessor of b. 
*/ if (b != NULL) { @@ -531,7 +493,7 @@ buf_LRU_insert_zip_clean( UT_LIST_ADD_FIRST(buf_pool.zip_clean, bpage); } } -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#endif /* UNIV_DEBUG */ /** Try to free an uncompressed page of a compressed block from the unzip LRU list. The compressed page is preserved, and it need not be clean. @@ -547,23 +509,21 @@ static bool buf_LRU_free_from_unzip_LRU_list(bool scan_all) } ulint scanned = 0; + const ulint limit = scan_all ? ULINT_UNDEFINED : srv_LRU_scan_depth; bool freed = false; for (buf_block_t* block = UT_LIST_GET_LAST(buf_pool.unzip_LRU); - block != NULL - && !freed - && (scan_all || scanned < srv_LRU_scan_depth); - ++scanned) { - - buf_block_t* prev_block; - - prev_block = UT_LIST_GET_PREV(unzip_LRU, block); + block && scanned < limit; ++scanned) { + buf_block_t* prev_block = UT_LIST_GET_PREV(unzip_LRU, block); - ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE); ut_ad(block->in_unzip_LRU_list); ut_ad(block->page.in_LRU_list); freed = buf_LRU_free_page(&block->page, false); + if (freed) { + break; + } block = prev_block; } @@ -591,39 +551,28 @@ static bool buf_LRU_free_from_common_LRU_list(bool scan_all) bool freed = false; for (buf_page_t* bpage = buf_pool.lru_scan_itr.start(); - bpage != NULL - && !freed - && (scan_all || scanned < BUF_LRU_SEARCH_SCAN_THRESHOLD); + bpage && (scan_all || scanned < BUF_LRU_SEARCH_SCAN_THRESHOLD); ++scanned, bpage = buf_pool.lru_scan_itr.get()) { - buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage); - BPageMutex* mutex = buf_page_get_mutex(bpage); - buf_pool.lru_scan_itr.set(prev); - mutex_enter(mutex); + const auto accessed = bpage->is_accessed(); + freed = bpage->ready_for_replace(); - ut_ad(buf_page_in_file(bpage)); - ut_ad(bpage->in_LRU_list); - - unsigned accessed = buf_page_is_accessed(bpage); - - if (buf_flush_ready_for_replace(bpage)) { - mutex_exit(mutex); + if (freed) { freed = buf_LRU_free_page(bpage, true); - } else { - 
mutex_exit(mutex); - } + if (!freed) { + continue; + } - if (freed && !accessed) { - /* Keep track of pages that are evicted without - ever being accessed. This gives us a measure of - the effectiveness of readahead */ - ++buf_pool.stat.n_ra_pages_evicted; + if (!accessed) { + /* Keep track of pages that are evicted without + ever being accessed. This gives us a measure of + the effectiveness of readahead */ + ++buf_pool.stat.n_ra_pages_evicted; + break; + } } - - ut_ad(mutex_own(&buf_pool.mutex)); - ut_ad(!mutex_own(mutex)); } if (scanned) { @@ -649,15 +598,6 @@ bool buf_LRU_scan_and_free_block(bool scan_all) || buf_LRU_free_from_common_LRU_list(scan_all)); } -/** @return whether less than 1/4 of the buffer pool is available */ -bool buf_LRU_buf_pool_running_out() -{ - return !recv_recovery_is_on() - && UT_LIST_GET_LEN(buf_pool.free) - + UT_LIST_GET_LEN(buf_pool.LRU) - < ut_min(buf_pool.curr_size, buf_pool.old_size) / 4; -} - /** @return a buffer block from the buf_pool.free list @retval NULL if the free list is empty */ buf_block_t* buf_LRU_get_free_only() @@ -670,28 +610,23 @@ buf_block_t* buf_LRU_get_free_only() UT_LIST_GET_FIRST(buf_pool.free)); while (block != NULL) { - ut_ad(block->page.in_free_list); ut_d(block->page.in_free_list = FALSE); - ut_ad(!block->page.in_flush_list); + ut_ad(!block->page.oldest_modification()); ut_ad(!block->page.in_LRU_list); - ut_a(!buf_page_in_file(&block->page)); + ut_a(!block->page.in_file()); UT_LIST_REMOVE(buf_pool.free, &block->page); if (buf_pool.curr_size >= buf_pool.old_size || UT_LIST_GET_LEN(buf_pool.withdraw) >= buf_pool.withdraw_target || !buf_pool.will_be_withdrawn(block->page)) { - /* found valid free block */ - buf_page_mutex_enter(block); /* No adaptive hash index entries may point to a free block. 
*/ assert_block_ahi_empty(block); - buf_block_set_state(block, BUF_BLOCK_READY_FOR_USE); + block->page.set_state(BUF_BLOCK_MEMORY); UNIV_MEM_ALLOC(block->frame, srv_page_size); - - buf_page_mutex_exit(block); break; } @@ -699,7 +634,7 @@ buf_block_t* buf_LRU_get_free_only() UT_LIST_ADD_LAST( buf_pool.withdraw, &block->page); - ut_d(block->in_withdraw_list = TRUE); + ut_d(block->in_withdraw_list = true); block = reinterpret_cast<buf_block_t*>( UT_LIST_GET_FIRST(buf_pool.free)); @@ -777,6 +712,7 @@ static void buf_LRU_check_size_of_non_data_objects() /** Get a free block from the buf_pool. The block is taken off the free list. If free list is empty, blocks are moved from the end of the LRU list to the free list. + This function is called from a user thread when it needs a clean block to read in a page. Note that we only ever get a block from the free list. Even when we flush a page or find a page in LRU scan @@ -796,17 +732,23 @@ we put it to free list to be used. * scan LRU list even if buf_pool.try_LRU_scan is not set * iteration > 1: * same as iteration 1 but sleep 10ms -@return the free control block, in state BUF_BLOCK_READY_FOR_USE */ -buf_block_t* buf_LRU_get_free_block() + +@param have_mutex whether buf_pool.mutex is already being held +@return the free control block, in state BUF_BLOCK_MEMORY */ +buf_block_t* buf_LRU_get_free_block(bool have_mutex) { buf_block_t* block = NULL; bool freed = false; ulint n_iterations = 0; ulint flush_failures = 0; - MONITOR_INC(MONITOR_LRU_GET_FREE_SEARCH); + if (have_mutex) { + ut_ad(mutex_own(&buf_pool.mutex)); + goto got_mutex; + } loop: mutex_enter(&buf_pool.mutex); +got_mutex: buf_LRU_check_size_of_non_data_objects(); @@ -819,7 +761,9 @@ loop: block = buf_LRU_get_free_only(); if (block != NULL) { - mutex_exit(&buf_pool.mutex); + if (!have_mutex) { + mutex_exit(&buf_pool.mutex); + } memset(&block->page.zip, 0, sizeof block->page.zip); block->skip_flush_check = false; return(block); @@ -967,13 +911,13 @@ static void 
buf_LRU_old_adjust_len() ut_a(!LRU_old->old); #endif /* UNIV_LRU_DEBUG */ old_len = ++buf_pool.LRU_old_len; - buf_page_set_old(LRU_old, TRUE); + LRU_old->set_old(true); } else if (old_len > new_len + BUF_LRU_OLD_TOLERANCE) { buf_pool.LRU_old = UT_LIST_GET_NEXT(LRU, LRU_old); old_len = --buf_pool.LRU_old_len; - buf_page_set_old(LRU_old, FALSE); + LRU_old->set_old(false); } else { return; } @@ -996,11 +940,11 @@ static void buf_LRU_old_init() bpage = UT_LIST_GET_PREV(LRU, bpage)) { ut_ad(bpage->in_LRU_list); - ut_ad(buf_page_in_file(bpage)); + ut_ad(bpage->in_file()); /* This loop temporarily violates the - assertions of buf_page_set_old(). */ - bpage->old = TRUE; + assertions of buf_page_t::set_old(). */ + bpage->old = true; } buf_pool.LRU_old = UT_LIST_GET_FIRST(buf_pool.LRU); @@ -1013,41 +957,26 @@ static void buf_LRU_old_init() @param[in] bpage control block */ static void buf_unzip_LRU_remove_block_if_needed(buf_page_t* bpage) { - ut_ad(buf_page_in_file(bpage)); + ut_ad(bpage->in_file()); ut_ad(mutex_own(&buf_pool.mutex)); - if (buf_page_belongs_to_unzip_LRU(bpage)) { + if (bpage->belongs_to_unzip_LRU()) { buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage); ut_ad(block->in_unzip_LRU_list); - ut_d(block->in_unzip_LRU_list = FALSE); + ut_d(block->in_unzip_LRU_list = false); UT_LIST_REMOVE(buf_pool.unzip_LRU, block); } } -/** Adjust LRU hazard pointers if needed. -@param[in] bpage buffer page descriptor */ -void buf_LRU_adjust_hp(const buf_page_t* bpage) -{ - buf_pool.lru_hp.adjust(bpage); - buf_pool.lru_scan_itr.adjust(bpage); - buf_pool.single_scan_itr.adjust(bpage); -} - /** Removes a block from the LRU list. @param[in] bpage control block */ static inline void buf_LRU_remove_block(buf_page_t* bpage) { - ut_ad(mutex_own(&buf_pool.mutex)); - - ut_a(buf_page_in_file(bpage)); - - ut_ad(bpage->in_LRU_list); - /* Important that we adjust the hazard pointers before removing bpage from the LRU list. 
*/ - buf_LRU_adjust_hp(bpage); + buf_page_t* prev_bpage = buf_pool.LRU_remove(bpage); /* If the LRU_old pointer is defined and points to just this block, move it backward one step */ @@ -1059,22 +988,16 @@ static inline void buf_LRU_remove_block(buf_page_t* bpage) by BUF_LRU_OLD_TOLERANCE from strict buf_pool.LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV of the LRU list length. */ - buf_page_t* prev_bpage = UT_LIST_GET_PREV(LRU, bpage); - ut_a(prev_bpage); #ifdef UNIV_LRU_DEBUG ut_a(!prev_bpage->old); #endif /* UNIV_LRU_DEBUG */ buf_pool.LRU_old = prev_bpage; - buf_page_set_old(prev_bpage, TRUE); + prev_bpage->set_old(true); buf_pool.LRU_old_len++; } - /* Remove the block from the LRU list */ - UT_LIST_REMOVE(buf_pool.LRU, bpage); - ut_d(bpage->in_LRU_list = FALSE); - buf_pool.stat.LRU_bytes -= bpage->physical_size(); buf_unzip_LRU_remove_block_if_needed(bpage); @@ -1088,8 +1011,8 @@ static inline void buf_LRU_remove_block(buf_page_t* bpage) bpage = UT_LIST_GET_NEXT(LRU, bpage)) { /* This loop temporarily violates the - assertions of buf_page_set_old(). */ - bpage->old = FALSE; + assertions of buf_page_t::set_old(). */ + bpage->old = false; } buf_pool.LRU_old = NULL; @@ -1101,8 +1024,7 @@ static inline void buf_LRU_remove_block(buf_page_t* bpage) ut_ad(buf_pool.LRU_old); /* Update the LRU_old_len field if necessary */ - if (buf_page_is_old(bpage)) { - + if (bpage->old) { buf_pool.LRU_old_len--; } @@ -1120,9 +1042,9 @@ buf_unzip_LRU_add_block( of the list, else put to the start */ { ut_ad(mutex_own(&buf_pool.mutex)); - ut_a(buf_page_belongs_to_unzip_LRU(&block->page)); + ut_a(block->page.belongs_to_unzip_LRU()); ut_ad(!block->in_unzip_LRU_list); - ut_d(block->in_unzip_LRU_list = TRUE); + ut_d(block->in_unzip_LRU_list = true); if (old) { UT_LIST_ADD_LAST(buf_pool.unzip_LRU, block); @@ -1135,18 +1057,16 @@ buf_unzip_LRU_add_block( Adds a block to the LRU list. 
Please make sure that the page_size is already set when invoking the function, so that we can get correct page_size from the buffer page when adding a block into LRU */ -UNIV_INLINE void -buf_LRU_add_block_low( -/*==================*/ +buf_LRU_add_block( buf_page_t* bpage, /*!< in: control block */ - ibool old) /*!< in: TRUE if should be put to the old blocks + bool old) /*!< in: true if should be put to the old blocks in the LRU list, else put to the start; if the LRU list is very short, the block is added to the start, regardless of this parameter */ { ut_ad(mutex_own(&buf_pool.mutex)); - ut_a(buf_page_in_file(bpage)); + ut_a(bpage->in_file()); ut_ad(!bpage->in_LRU_list); if (!old || (UT_LIST_GET_LEN(buf_pool.LRU) < BUF_LRU_OLD_MIN_LEN)) { @@ -1181,7 +1101,7 @@ buf_LRU_add_block_low( /* Adjust the length of the old block list if necessary */ - buf_page_set_old(bpage, old); + bpage->set_old(old); buf_LRU_old_adjust_len(); } else if (UT_LIST_GET_LEN(buf_pool.LRU) == BUF_LRU_OLD_MIN_LEN) { @@ -1191,34 +1111,17 @@ buf_LRU_add_block_low( buf_LRU_old_init(); } else { - buf_page_set_old(bpage, buf_pool.LRU_old != NULL); + bpage->set_old(buf_pool.LRU_old != NULL); } /* If this is a zipped block with decompressed frame as well then put it on the unzip_LRU list */ - if (buf_page_belongs_to_unzip_LRU(bpage)) { + if (bpage->belongs_to_unzip_LRU()) { buf_unzip_LRU_add_block((buf_block_t*) bpage, old); } } /******************************************************************//** -Adds a block to the LRU list. 
Please make sure that the page_size is -already set when invoking the function, so that we can get correct -page_size from the buffer page when adding a block into LRU */ -void -buf_LRU_add_block( -/*==============*/ - buf_page_t* bpage, /*!< in: control block */ - ibool old) /*!< in: TRUE if should be put to the old - blocks in the LRU list, else put to the start; - if the LRU list is very short, the block is - added to the start, regardless of this - parameter */ -{ - buf_LRU_add_block_low(bpage, old); -} - -/******************************************************************//** Moves a block to the start of the LRU list. */ void buf_LRU_make_block_young( @@ -1232,40 +1135,29 @@ buf_LRU_make_block_young( } buf_LRU_remove_block(bpage); - buf_LRU_add_block_low(bpage, FALSE); + buf_LRU_add_block(bpage, false); } -/******************************************************************//** -Try to free a block. If bpage is a descriptor of a compressed-only -page, the descriptor object will be freed as well. - -NOTE: If this function returns true, it will temporarily -release buf_pool.mutex. Furthermore, the page frame will no longer be -accessible via bpage. - -The caller must hold buf_pool.mutex and must not hold any -buf_page_get_mutex() when calling this function. -@return true if freed, false otherwise. */ -bool -buf_LRU_free_page( -/*===============*/ - buf_page_t* bpage, /*!< in: block to be freed */ - bool zip) /*!< in: true if should remove also the - compressed page of an uncompressed page */ +/** Try to free a block. If bpage is a descriptor of a compressed-only +ROW_FORMAT=COMPRESSED page, the buf_page_t object will be freed as well. +The caller must hold buf_pool.mutex. 
+@param bpage block to be freed +@param zip whether to remove both copies of a ROW_FORMAT=COMPRESSED page +@retval true if freed and buf_pool.mutex may have been temporarily released +@retval false if the page was not freed */ +bool buf_LRU_free_page(buf_page_t *bpage, bool zip) { - buf_page_t* b = NULL; - rw_lock_t* hash_lock = buf_page_hash_lock_get(bpage->id); - BPageMutex* block_mutex = buf_page_get_mutex(bpage); + const page_id_t id(bpage->id()); + buf_page_t* b = nullptr; ut_ad(mutex_own(&buf_pool.mutex)); - ut_ad(buf_page_in_file(bpage)); + ut_ad(bpage->in_file()); ut_ad(bpage->in_LRU_list); + rw_lock_t* hash_lock = buf_pool.hash_lock_get(id); rw_lock_x_lock(hash_lock); - mutex_enter(block_mutex); - - if (!buf_page_can_relocate(bpage)) { + if (!bpage->can_relocate()) { /* Do not free buffer fixed and I/O-fixed blocks. */ goto func_exit; } @@ -1274,37 +1166,32 @@ buf_LRU_free_page( /* This would completely free the block. */ /* Do not completely free dirty blocks. */ - if (bpage->oldest_modification) { + if (bpage->oldest_modification()) { goto func_exit; } - } else if (bpage->oldest_modification > 0 - && buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) { - - ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_DIRTY); - + } else if (bpage->oldest_modification() + && bpage->state() != BUF_BLOCK_FILE_PAGE) { func_exit: rw_lock_x_unlock(hash_lock); - mutex_exit(block_mutex); return(false); - } else if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) { + } else if (bpage->state() == BUF_BLOCK_FILE_PAGE) { b = buf_page_alloc_descriptor(); ut_a(b); new (b) buf_page_t(*bpage); + b->set_state(BUF_BLOCK_ZIP_PAGE); } ut_ad(mutex_own(&buf_pool.mutex)); - ut_ad(buf_page_in_file(bpage)); + ut_ad(bpage->in_file()); ut_ad(bpage->in_LRU_list); - ut_ad(!bpage->in_flush_list == !bpage->oldest_modification); DBUG_PRINT("ib_buf", ("free page %u:%u", - bpage->id.space(), bpage->id.page_no())); + id.space(), id.page_no())); - ut_ad(rw_lock_own(hash_lock, RW_LOCK_X)); - 
ut_ad(buf_page_can_relocate(bpage)); + ut_ad(bpage->can_relocate()); - if (!buf_LRU_block_remove_hashed(bpage, zip)) { + if (!buf_LRU_block_remove_hashed(bpage, id, hash_lock, zip)) { return(true); } @@ -1312,63 +1199,55 @@ func_exit: ut_ad(!rw_lock_own_flagged(hash_lock, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); - /* We have just freed a BUF_BLOCK_FILE_PAGE. If b != NULL + /* We have just freed a BUF_BLOCK_FILE_PAGE. If b != nullptr then it was a compressed page with an uncompressed frame and we are interested in freeing only the uncompressed frame. Therefore we have to reinsert the compressed page descriptor into the LRU and page_hash (and possibly flush_list). - if b == NULL then it was a regular page that has been freed */ + if !b then it was a regular page that has been freed */ - if (b != NULL) { + if (UNIV_LIKELY_NULL(b)) { buf_page_t* prev_b = UT_LIST_GET_PREV(LRU, b); rw_lock_x_lock(hash_lock); - mutex_enter(block_mutex); - - ut_a(!buf_page_hash_get_low(b->id)); - - b->state = b->oldest_modification - ? BUF_BLOCK_ZIP_DIRTY - : BUF_BLOCK_ZIP_PAGE; - + ut_ad(!buf_pool.page_hash_get_low(id)); ut_ad(b->zip_size()); UNIV_MEM_DESC(b->zip.data, b->zip_size()); - /* The fields in_page_hash and in_LRU_list of + /* The field in_LRU_list of the to-be-freed block descriptor should have been cleared in buf_LRU_block_remove_hashed(), which invokes buf_LRU_remove_block(). */ - ut_ad(!bpage->in_page_hash); ut_ad(!bpage->in_LRU_list); /* bpage->state was BUF_BLOCK_FILE_PAGE because - b != NULL. The type cast below is thus valid. */ + b != nullptr. The type cast below is thus valid. */ ut_ad(!((buf_block_t*) bpage)->in_unzip_LRU_list); /* The fields of bpage were copied to b before buf_LRU_block_remove_hashed() was invoked. */ ut_ad(!b->in_zip_hash); - ut_ad(b->in_page_hash); ut_ad(b->in_LRU_list); + ut_ad(b->in_page_hash); HASH_INSERT(buf_page_t, hash, buf_pool.page_hash, - b->id.fold(), b); + id.fold(), b); /* Insert b where bpage was in the LRU list. 
*/ - if (prev_b != NULL) { + if (prev_b) { ulint lru_len; ut_ad(prev_b->in_LRU_list); - ut_ad(buf_page_in_file(prev_b)); + ut_ad(prev_b->in_file()); UT_LIST_INSERT_AFTER(buf_pool.LRU, prev_b, b); incr_LRU_size_in_bytes(b); - if (buf_page_is_old(b)) { + if (b->is_old()) { buf_pool.LRU_old_len++; if (buf_pool.LRU_old == UT_LIST_GET_NEXT(LRU, b)) { @@ -1393,39 +1272,30 @@ func_exit: #ifdef UNIV_LRU_DEBUG /* Check that the "old" flag is consistent in the block and its neighbours. */ - buf_page_set_old(b, buf_page_is_old(b)); + b->set_old(b->is_old()); #endif /* UNIV_LRU_DEBUG */ } else { ut_d(b->in_LRU_list = FALSE); - buf_LRU_add_block_low(b, buf_page_is_old(b)); + buf_LRU_add_block(b, b->old); } - if (b->state == BUF_BLOCK_ZIP_PAGE) { -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + if (!b->oldest_modification()) { +#ifdef UNIV_DEBUG buf_LRU_insert_zip_clean(b); -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#endif /* UNIV_DEBUG */ } else { /* Relocate on buf_pool.flush_list. */ buf_flush_relocate_on_flush_list(bpage, b); } - bpage->zip.data = NULL; + bpage->zip.data = nullptr; page_zip_set_size(&bpage->zip, 0); - mutex_exit(block_mutex); - /* Prevent buf_page_get_gen() from decompressing the block while we release - buf_pool.mutex and block_mutex. */ - block_mutex = buf_page_get_mutex(b); - - mutex_enter(block_mutex); - - buf_page_set_sticky(b); - - mutex_exit(block_mutex); - + hash_lock. 
*/ + b->set_io_fix(BUF_IO_PIN); rw_lock_x_unlock(hash_lock); } @@ -1443,16 +1313,12 @@ func_exit: UNIV_MEM_INVALID(((buf_block_t*) bpage)->frame, srv_page_size); - mutex_enter(&buf_pool.mutex); - - if (b != NULL) { - mutex_enter(block_mutex); - - buf_page_unset_sticky(b); - - mutex_exit(block_mutex); + if (UNIV_LIKELY_NULL(b)) { + ut_ad(b->zip_size()); + b->io_unfix(); } + mutex_enter(&buf_pool.mutex); buf_LRU_block_free_hashed_page((buf_block_t*) bpage); return(true); @@ -1467,23 +1333,13 @@ buf_LRU_block_free_non_file_page( { void* data; - ut_ad(mutex_own(&buf_pool.mutex)); - ut_ad(buf_page_mutex_own(block)); - - switch (buf_block_get_state(block)) { - case BUF_BLOCK_MEMORY: - case BUF_BLOCK_READY_FOR_USE: - break; - default: - ut_error; - } - + ut_ad(block->page.state() == BUF_BLOCK_MEMORY); assert_block_ahi_empty(block); ut_ad(!block->page.in_free_list); - ut_ad(!block->page.in_flush_list); + ut_ad(!block->page.oldest_modification()); ut_ad(!block->page.in_LRU_list); - buf_block_set_state(block, BUF_BLOCK_NOT_USED); + block->page.set_state(BUF_BLOCK_NOT_USED); UNIV_MEM_ALLOC(block->frame, srv_page_size); #ifdef UNIV_DEBUG @@ -1502,7 +1358,6 @@ buf_LRU_block_free_non_file_page( if (data != NULL) { block->page.zip.data = NULL; - buf_page_mutex_exit(block); buf_pool_mutex_exit_forbid(); ut_ad(block->zip_size()); @@ -1510,8 +1365,6 @@ buf_LRU_block_free_non_file_page( buf_buddy_free(data, block->zip_size()); buf_pool_mutex_exit_allow(); - buf_page_mutex_enter(block); - page_zip_set_size(&block->page.zip, 0); } @@ -1522,57 +1375,44 @@ buf_LRU_block_free_non_file_page( UT_LIST_ADD_LAST( buf_pool.withdraw, &block->page); - ut_d(block->in_withdraw_list = TRUE); + ut_d(block->in_withdraw_list = true); } else { UT_LIST_ADD_FIRST(buf_pool.free, &block->page); - ut_d(block->page.in_free_list = TRUE); + ut_d(block->page.in_free_list = true); } UNIV_MEM_FREE(block->frame, srv_page_size); } -/******************************************************************//** -Takes a 
block out of the LRU list and page hash table. -If the block is compressed-only (BUF_BLOCK_ZIP_PAGE), +/** Remove bpage from buf_pool.LRU and buf_pool.page_hash. + +If bpage->state() == BUF_BLOCK_ZIP_PAGE && !bpage->oldest_modification(), the object will be freed. -The caller must hold buf_pool.mutex, the buf_page_get_mutex() mutex -and the appropriate hash_lock. This function will release the -buf_page_get_mutex() and the hash_lock. +@param bpage buffer block +@param id page identifier +@param hash_lock buf_pool.page_hash latch (will be released here) +@param zip whether bpage->zip of BUF_BLOCK_FILE_PAGE should be freed If a compressed page is freed other compressed pages may be relocated. @retval true if BUF_BLOCK_FILE_PAGE was removed from page_hash. The caller needs to free the page to the free list @retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In this case the block is already returned to the buddy allocator. */ -static -bool -buf_LRU_block_remove_hashed( -/*========================*/ - buf_page_t* bpage, /*!< in: block, must contain a file page and - be in a state where it can be freed; there - may or may not be a hash index to the page */ - bool zip) /*!< in: true if should remove also the - compressed page of an uncompressed page */ +static bool buf_LRU_block_remove_hashed(buf_page_t *bpage, const page_id_t id, + rw_lock_t *hash_lock, bool zip) { - const buf_page_t* hashed_bpage; - rw_lock_t* hash_lock; - ut_ad(mutex_own(&buf_pool.mutex)); - ut_ad(mutex_own(buf_page_get_mutex(bpage))); - - hash_lock = buf_page_hash_lock_get(bpage->id); - ut_ad(rw_lock_own(hash_lock, RW_LOCK_X)); - ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE); - ut_a(bpage->buf_fix_count == 0); + ut_a(bpage->io_fix() == BUF_IO_NONE); + ut_a(!bpage->buf_fix_count()); buf_LRU_remove_block(bpage); buf_pool.freed_page_clock += 1; - switch (buf_page_get_state(bpage)) { + switch (bpage->state()) { case BUF_BLOCK_FILE_PAGE: UNIV_MEM_ASSERT_W(bpage, sizeof(buf_block_t)); 
UNIV_MEM_ASSERT_W(((buf_block_t*) bpage)->frame, @@ -1581,7 +1421,7 @@ buf_LRU_block_remove_hashed( if (bpage->zip.data) { const page_t* page = ((buf_block_t*) bpage)->frame; - ut_a(!zip || bpage->oldest_modification == 0); + ut_a(!zip || !bpage->oldest_modification()); ut_ad(bpage->zip_size()); switch (fil_page_get_type(page)) { @@ -1632,68 +1472,30 @@ buf_LRU_block_remove_hashed( } /* fall through */ case BUF_BLOCK_ZIP_PAGE: - ut_a(bpage->oldest_modification == 0); + ut_a(!bpage->oldest_modification()); UNIV_MEM_ASSERT_W(bpage->zip.data, bpage->zip_size()); break; - case BUF_BLOCK_POOL_WATCH: - case BUF_BLOCK_ZIP_DIRTY: case BUF_BLOCK_NOT_USED: - case BUF_BLOCK_READY_FOR_USE: case BUF_BLOCK_MEMORY: case BUF_BLOCK_REMOVE_HASH: ut_error; break; } - hashed_bpage = buf_page_hash_get_low(bpage->id); - if (bpage != hashed_bpage) { - ib::error() << "Page " << bpage->id - << " not found in the hash table"; - ib::error() -#ifdef UNIV_DEBUG - << "in_page_hash:" << bpage->in_page_hash - << " in_zip_hash:" << bpage->in_zip_hash - << " in_flush_list:" << bpage->in_flush_list - << " in_LRU_list:" << bpage->in_LRU_list -#endif - << " zip.data:" << bpage->zip.data - << " zip_size:" << bpage->zip_size() - << " page_state:" << buf_page_get_state(bpage); - - if (hashed_bpage) { - ib::error() << "In hash table we find block " - << hashed_bpage << " of " << hashed_bpage->id - << " which is not " << bpage; - } - - ut_d(mutex_exit(buf_page_get_mutex(bpage))); - ut_d(rw_lock_x_unlock(hash_lock)); - ut_d(mutex_exit(&buf_pool.mutex)); - ut_d(buf_pool.print()); - ut_d(buf_LRU_print()); - ut_d(buf_LRU_validate()); - ut_ad(0); - } - ut_ad(!bpage->in_zip_hash); - ut_ad(bpage->in_page_hash); - ut_d(bpage->in_page_hash = FALSE); + HASH_DELETE(buf_page_t, hash, buf_pool.page_hash, id.fold(), bpage); - HASH_DELETE(buf_page_t, hash, buf_pool.page_hash, bpage->id.fold(), - bpage); - - switch (buf_page_get_state(bpage)) { + switch (bpage->state()) { case BUF_BLOCK_ZIP_PAGE: 
ut_ad(!bpage->in_free_list); - ut_ad(!bpage->in_flush_list); ut_ad(!bpage->in_LRU_list); ut_a(bpage->zip.data); ut_a(bpage->zip.ssize); + ut_ad(!bpage->oldest_modification()); -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +#ifdef UNIV_DEBUG UT_LIST_REMOVE(buf_pool.zip_clean, bpage); -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ - mutex_exit(&buf_pool.zip_mutex); +#endif /* UNIV_DEBUG */ rw_lock_x_unlock(hash_lock); buf_pool_mutex_exit_forbid(); @@ -1714,9 +1516,9 @@ buf_LRU_block_remove_hashed( + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4); UNIV_MEM_INVALID(((buf_block_t*) bpage)->frame, srv_page_size); - buf_page_set_state(bpage, BUF_BLOCK_REMOVE_HASH); + bpage->set_state(BUF_BLOCK_REMOVE_HASH); - /* Question: If we release bpage and hash mutex here + /* Question: If we release hash_lock here then what protects us against: 1) Some other thread buffer fixing this page 2) Some other thread trying to read this page and @@ -1736,7 +1538,6 @@ buf_LRU_block_remove_hashed( have inserted the compressed only descriptor in the page_hash. */ rw_lock_x_unlock(hash_lock); - mutex_exit(&((buf_block_t*) bpage)->mutex); if (zip && bpage->zip.data) { /* Free the compressed page. */ @@ -1744,7 +1545,7 @@ buf_LRU_block_remove_hashed( bpage->zip.data = NULL; ut_ad(!bpage->in_free_list); - ut_ad(!bpage->in_flush_list); + ut_ad(!bpage->oldest_modification()); ut_ad(!bpage->in_LRU_list); buf_pool_mutex_exit_forbid(); @@ -1757,10 +1558,7 @@ buf_LRU_block_remove_hashed( return(true); - case BUF_BLOCK_POOL_WATCH: - case BUF_BLOCK_ZIP_DIRTY: case BUF_BLOCK_NOT_USED: - case BUF_BLOCK_READY_FOR_USE: case BUF_BLOCK_MEMORY: case BUF_BLOCK_REMOVE_HASH: break; @@ -1770,61 +1568,25 @@ buf_LRU_block_remove_hashed( return(false); } -/******************************************************************//** -Puts a file page whose has no hash index to the free list. 
*/ -static -void -buf_LRU_block_free_hashed_page( -/*===========================*/ - buf_block_t* block) /*!< in: block, must contain a file page and - be in a state where it can be freed */ -{ - ut_ad(mutex_own(&buf_pool.mutex)); - - buf_page_mutex_enter(block); - - if (buf_pool.flush_rbt == NULL) { - block->page.id - = page_id_t(ULINT32_UNDEFINED, ULINT32_UNDEFINED); - } - - buf_block_set_state(block, BUF_BLOCK_MEMORY); - - buf_LRU_block_free_non_file_page(block); - buf_page_mutex_exit(block); -} - /** Remove one page from LRU list and put it to free list. -@param[in,out] bpage block, must contain a file page and be in - a freeable state; there may or may not be a - hash index to the page -@param[in] old_page_id page number before bpage->id was invalidated */ -void buf_LRU_free_one_page(buf_page_t* bpage, page_id_t old_page_id) +@param bpage file page to be freed +@param id page identifier +@param hash_lock buf_pool.page_hash latch (will be released here) */ +void buf_LRU_free_one_page(buf_page_t *bpage, const page_id_t id, + rw_lock_t *hash_lock) { - rw_lock_t* hash_lock = buf_page_hash_lock_get(old_page_id); - BPageMutex* block_mutex = buf_page_get_mutex(bpage); - - ut_ad(mutex_own(&buf_pool.mutex)); - - rw_lock_x_lock(hash_lock); - - while (bpage->buf_fix_count > 0) { - /* Wait for other threads to release the fix count - before releasing the bpage from LRU list. */ - } - - mutex_enter(block_mutex); - - bpage->id = old_page_id; + while (bpage->buf_fix_count()) + { + /* Wait for other threads to release the fix count + before releasing the bpage from LRU list. 
*/ + ut_delay(1); + } - if (buf_LRU_block_remove_hashed(bpage, true)) { - buf_LRU_block_free_hashed_page((buf_block_t*) bpage); - } + if (buf_LRU_block_remove_hashed(bpage, id, hash_lock, true)) + buf_LRU_block_free_hashed_page(reinterpret_cast<buf_block_t*>(bpage)); - /* buf_LRU_block_remove_hashed() releases hash_lock and block_mutex */ - ut_ad(!rw_lock_own_flagged(hash_lock, - RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); - ut_ad(!mutex_own(block_mutex)); + /* buf_LRU_block_remove_hashed() releases hash_lock */ + ut_ad(!rw_lock_own_flagged(hash_lock, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); } /** Update buf_pool.LRU_old_ratio. @@ -1901,7 +1663,7 @@ func_exit: memset(&buf_LRU_stat_cur, 0, sizeof buf_LRU_stat_cur); } -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +#ifdef UNIV_DEBUG /** Validate the LRU list. */ void buf_LRU_validate() { @@ -1934,23 +1696,21 @@ void buf_LRU_validate() bpage != NULL; bpage = UT_LIST_GET_NEXT(LRU, bpage)) { - switch (buf_page_get_state(bpage)) { - case BUF_BLOCK_POOL_WATCH: + switch (bpage->state()) { case BUF_BLOCK_NOT_USED: - case BUF_BLOCK_READY_FOR_USE: case BUF_BLOCK_MEMORY: case BUF_BLOCK_REMOVE_HASH: ut_error; break; case BUF_BLOCK_FILE_PAGE: - ut_ad(((buf_block_t*) bpage)->in_unzip_LRU_list - == buf_page_belongs_to_unzip_LRU(bpage)); + ut_ad(reinterpret_cast<buf_block_t*>(bpage) + ->in_unzip_LRU_list + == bpage->belongs_to_unzip_LRU()); case BUF_BLOCK_ZIP_PAGE: - case BUF_BLOCK_ZIP_DIRTY: break; } - if (buf_page_is_old(bpage)) { + if (bpage->is_old()) { const buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage); const buf_page_t* next @@ -1959,10 +1719,10 @@ void buf_LRU_validate() if (!old_len++) { ut_a(buf_pool.LRU_old == bpage); } else { - ut_a(!prev || buf_page_is_old(prev)); + ut_a(!prev || prev->is_old()); } - ut_a(!next || buf_page_is_old(next)); + ut_a(!next || next->is_old()); } } @@ -1974,7 +1734,7 @@ void buf_LRU_validate() bpage != NULL; bpage = UT_LIST_GET_NEXT(list, bpage)) { - ut_a(buf_page_get_state(bpage) == 
BUF_BLOCK_NOT_USED); + ut_a(bpage->state() == BUF_BLOCK_NOT_USED); } CheckUnzipLRUAndLRUList::validate(); @@ -1985,47 +1745,44 @@ void buf_LRU_validate() ut_ad(block->in_unzip_LRU_list); ut_ad(block->page.in_LRU_list); - ut_a(buf_page_belongs_to_unzip_LRU(&block->page)); + ut_a(block->page.belongs_to_unzip_LRU()); } mutex_exit(&buf_pool.mutex); } -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#endif /* UNIV_DEBUG */ -#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG /** Dump the LRU list to stderr. */ void buf_LRU_print() { mutex_enter(&buf_pool.mutex); - for (const buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.LRU); + for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.LRU); bpage != NULL; bpage = UT_LIST_GET_NEXT(LRU, bpage)) { - - mutex_enter(buf_page_get_mutex(bpage)); + const page_id_t id(bpage->id()); fprintf(stderr, "BLOCK space %u page %u ", - bpage->id.space(), bpage->id.page_no()); + id.space(), id.page_no()); - if (buf_page_is_old(bpage)) { + if (bpage->is_old()) { fputs("old ", stderr); } - if (bpage->buf_fix_count) { - fprintf(stderr, "buffix count %u ", - uint32_t(bpage->buf_fix_count)); + if (const uint32_t buf_fix_count = bpage->buf_fix_count()) { + fprintf(stderr, "buffix count %u ", buf_fix_count); } - if (buf_page_get_io_fix(bpage)) { - fprintf(stderr, "io_fix %d ", - buf_page_get_io_fix(bpage)); + if (const auto io_fix = bpage->io_fix()) { + fprintf(stderr, "io_fix %d ", io_fix); } - if (bpage->oldest_modification) { + if (bpage->oldest_modification()) { fputs("modif. 
", stderr); } - switch (buf_page_get_state(bpage)) { + switch (const auto state = bpage->state()) { const byte* frame; case BUF_BLOCK_FILE_PAGE: frame = buf_block_get_frame((buf_block_t*) bpage); @@ -2043,14 +1800,11 @@ void buf_LRU_print() break; default: - fprintf(stderr, "\n!state %d!\n", - buf_page_get_state(bpage)); + fprintf(stderr, "\n!state %d!\n", state); break; } - - mutex_exit(buf_page_get_mutex(bpage)); } mutex_exit(&buf_pool.mutex); } -#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */ +#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */ diff --git a/storage/innobase/buf/buf0rea.cc b/storage/innobase/buf/buf0rea.cc index b4bb5bc56a0..535ec39bb86 100644 --- a/storage/innobase/buf/buf0rea.cc +++ b/storage/innobase/buf/buf0rea.cc @@ -33,6 +33,7 @@ Created 11/5/1995 Heikki Tuuri #include "buf0buf.h" #include "buf0flu.h" #include "buf0lru.h" +#include "buf0buddy.h" #include "buf0dblwr.h" #include "ibuf0ibuf.h" #include "log0recv.h" @@ -46,43 +47,213 @@ read-ahead is not done: this is to prevent flooding the buffer pool with i/o-fixed buffer blocks */ #define BUF_READ_AHEAD_PEND_LIMIT 2 -/********************************************************************//** -Unfixes the pages, unlatches the page, -removes it from page_hash and removes it from LRU. */ -static -void -buf_read_page_handle_error( -/*=======================*/ - buf_page_t* bpage) /*!< in: pointer to the block */ +/** Remove the sentinel block for the watch before replacing it with a +real block. watch_unset() or watch_occurred() will notice +that the block has been replaced with the real block. 
+@param watch sentinel */ +inline void buf_pool_t::watch_remove(buf_page_t *watch) { - const bool uncompressed = (buf_page_get_state(bpage) - == BUF_BLOCK_FILE_PAGE); - const page_id_t old_page_id = bpage->id; - - /* First unfix and release lock on the bpage */ - mutex_enter(&buf_pool.mutex); - mutex_enter(buf_page_get_mutex(bpage)); - ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_READ); - - bpage->id.set_corrupt_id(); - /* Set BUF_IO_NONE before we remove the block from LRU list */ - buf_page_set_io_fix(bpage, BUF_IO_NONE); - - if (uncompressed) { - rw_lock_x_unlock_gen( - &((buf_block_t*) bpage)->lock, - BUF_IO_READ); - } + ut_ad(rw_lock_own(hash_lock_get(watch->id()), RW_LOCK_X)); + ut_a(watch_is_sentinel(*watch)); + if (watch->buf_fix_count()) + { + ut_ad(watch->in_page_hash); + ut_d(watch->in_page_hash= false); + HASH_DELETE(buf_page_t, hash, page_hash, watch->id().fold(), watch); + watch->set_buf_fix_count(0); + } + ut_ad(!watch->in_page_hash); + watch->set_state(BUF_BLOCK_NOT_USED); + watch->id_= page_id_t(~0ULL); +} - mutex_exit(buf_page_get_mutex(bpage)); +/** Initialize a page for read to the buffer buf_pool. If the page is +(1) already in buf_pool, or +(2) if we specify to read only ibuf pages and the page is not an ibuf page, or +(3) if the space is deleted or being deleted, +then this function does nothing. +Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock +on the buffer frame. The io-handler must take care that the flag is cleared +and the lock released later. +@param[in] mode BUF_READ_IBUF_PAGES_ONLY, ... 
+@param[in] page_id page id +@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 +@param[in] unzip whether the uncompressed page is + requested (for ROW_FORMAT=COMPRESSED) +@return pointer to the block +@retval NULL in case of an error */ +static buf_page_t* buf_page_init_for_read(ulint mode, const page_id_t page_id, + ulint zip_size, bool unzip) +{ + mtr_t mtr; + bool lru= false; + + if (mode == BUF_READ_IBUF_PAGES_ONLY) + { + /* It is a read-ahead within an ibuf routine */ + ut_ad(!ibuf_bitmap_page(page_id, zip_size)); + ibuf_mtr_start(&mtr); + + if (!recv_no_ibuf_operations && !ibuf_page(page_id, zip_size, &mtr)) + { + ibuf_mtr_commit(&mtr); + return nullptr; + } + } + else + ut_ad(mode == BUF_READ_ANY_PAGE); + + buf_page_t *bpage= nullptr; + buf_block_t *block= nullptr; + if (!zip_size || unzip || recv_recovery_is_on()) + { + block= buf_LRU_get_free_block(false); + block->initialise(page_id, zip_size); + /* We set a pass-type x-lock on the frame because then + the same thread which called for the read operation + (and is running now at this point of code) can wait + for the read to complete by waiting for the x-lock on + the frame; if the x-lock were recursive, the same + thread would illegally get the x-lock before the page + read is completed. The x-lock will be released + in buf_page_read_complete() by the io-handler thread. */ + rw_lock_x_lock_gen(&block->lock, BUF_IO_READ); + } + + mutex_enter(&buf_pool.mutex); + + rw_lock_t *hash_lock= buf_pool.hash_lock_get(page_id); + rw_lock_x_lock(hash_lock); + + buf_page_t *hash_page= buf_pool.page_hash_get_low(page_id); + if (hash_page && !buf_pool.watch_is_sentinel(*hash_page)) + { + /* The page is already in the buffer pool. 
*/ + rw_lock_x_unlock(hash_lock); + if (block) + { + rw_lock_x_unlock_gen(&block->lock, BUF_IO_READ); + buf_LRU_block_free_non_file_page(block); + } + goto func_exit; + } + + if (UNIV_LIKELY(block != nullptr)) + { + bpage= &block->page; + + /* Insert into the hash table of file pages */ + if (hash_page) + { + /* Preserve the reference count. */ + auto buf_fix_count= hash_page->buf_fix_count(); + ut_a(buf_fix_count > 0); + block->page.add_buf_fix_count(buf_fix_count); + buf_pool.watch_remove(hash_page); + } + + block->page.set_state(BUF_BLOCK_FILE_PAGE); + block->page.set_io_fix(BUF_IO_READ); + ut_ad(!block->page.in_page_hash); + ut_d(block->page.in_page_hash= true); + HASH_INSERT(buf_page_t, hash, buf_pool.page_hash, page_id.fold(), bpage); + rw_lock_x_unlock(hash_lock); + + /* The block must be put to the LRU list, to the old blocks */ + buf_LRU_add_block(bpage, true/* to old blocks */); + + if (UNIV_UNLIKELY(zip_size)) + { + /* buf_pool.mutex may be released and reacquired by + buf_buddy_alloc(). We must defer this operation until after the + block descriptor has been added to buf_pool.LRU and + buf_pool.page_hash. */ + block->page.zip.data= static_cast<page_zip_t*> + (buf_buddy_alloc(zip_size, &lru)); + + /* To maintain the invariant + block->in_unzip_LRU_list == block->page.belongs_to_unzip_LRU() + we have to add this block to unzip_LRU + after block->page.zip.data is set. */ + ut_ad(block->page.belongs_to_unzip_LRU()); + buf_unzip_LRU_add_block(block, TRUE); + } + } + else + { + rw_lock_x_unlock(hash_lock); + + /* The compressed page must be allocated before the + control block (bpage), in order to avoid the + invocation of buf_buddy_relocate_block() on + uninitialized data. */ + bool lru; + void *data= buf_buddy_alloc(zip_size, &lru); + + rw_lock_x_lock(hash_lock); + + /* If buf_buddy_alloc() allocated storage from the LRU list, + it released and reacquired buf_pool.mutex. Thus, we must + check the page_hash again, as it may have been modified. 
*/ + if (UNIV_UNLIKELY(lru)) + { + hash_page= buf_pool.page_hash_get_low(page_id); + + if (UNIV_UNLIKELY(hash_page && !buf_pool.watch_is_sentinel(*hash_page))) + { + /* The block was added by some other thread. */ + rw_lock_x_unlock(hash_lock); + buf_buddy_free(data, zip_size); + goto func_exit; + } + } + + bpage= buf_page_alloc_descriptor(); + + page_zip_des_init(&bpage->zip); + page_zip_set_size(&bpage->zip, zip_size); + bpage->zip.data = (page_zip_t*) data; + + UNIV_MEM_DESC(bpage->zip.data, zip_size); + + bpage->init(BUF_BLOCK_ZIP_PAGE, page_id); + + if (hash_page) + { + /* Preserve the reference count. It can be 0 if + buf_pool_t::watch_unset() is executing concurrently, + waiting for buf_pool.mutex, which we are holding. */ + bpage->add_buf_fix_count(hash_page->buf_fix_count()); + buf_pool.watch_remove(hash_page); + } + + ut_ad(!bpage->in_page_hash); + ut_d(bpage->in_page_hash= true); + HASH_INSERT(buf_page_t, hash, buf_pool.page_hash, page_id.fold(), bpage); + bpage->set_io_fix(BUF_IO_READ); + rw_lock_x_unlock(hash_lock); + + /* The block must be put to the LRU list, to the old blocks. 
+ The zip size is already set into the page zip */ + buf_LRU_add_block(bpage, true/* to old blocks */); +#ifdef UNIV_DEBUG + buf_LRU_insert_zip_clean(bpage); +#endif /* UNIV_DEBUG */ + } - /* remove the block from LRU list */ - buf_LRU_free_one_page(bpage, old_page_id); + mutex_exit(&buf_pool.mutex); + buf_pool.n_pend_reads++; + goto func_exit_no_mutex; +func_exit: + mutex_exit(&buf_pool.mutex); +func_exit_no_mutex: + if (mode == BUF_READ_IBUF_PAGES_ONLY) + ibuf_mtr_commit(&mtr); - ut_ad(buf_pool.n_pend_reads > 0); - buf_pool.n_pend_reads--; + ut_ad(!rw_lock_own_flagged(hash_lock, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); + ut_ad(!bpage || bpage->in_file()); - mutex_exit(&buf_pool.mutex); + return bpage; } /** Low-level function which reads a page asynchronously from a file to the @@ -118,8 +289,7 @@ buf_read_page_low( *err = DB_SUCCESS; - if (page_id.space() == TRX_SYS_SPACE - && buf_dblwr_page_inside(page_id.page_no())) { + if (!page_id.space() && buf_dblwr_page_inside(page_id.page_no())) { ib::error() << "Trying to read doublewrite buffer page " << page_id; @@ -141,7 +311,7 @@ buf_read_page_low( or is being dropped; if we succeed in initing the page in the buffer pool for read, then DISCARD cannot proceed until the read has completed */ - bpage = buf_page_init_for_read(err, mode, page_id, zip_size, unzip); + bpage = buf_page_init_for_read(mode, page_id, zip_size, unzip); if (bpage == NULL) { @@ -152,7 +322,7 @@ buf_read_page_low( "read page " << page_id << " zip_size=" << zip_size << " unzip=" << unzip << ',' << (sync ? 
"sync" : "async")); - ut_ad(buf_page_in_file(bpage)); + ut_ad(bpage->in_file()); if (sync) { thd_wait_begin(NULL, THD_WAIT_DISKIO); @@ -163,23 +333,24 @@ buf_read_page_low( if (zip_size) { dst = bpage->zip.data; } else { - ut_a(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); + ut_a(bpage->state() == BUF_BLOCK_FILE_PAGE); dst = ((buf_block_t*) bpage)->frame; } - *err = fil_io( + fil_io_t fio = fil_io( IORequestRead, sync, page_id, zip_size, 0, zip_size ? zip_size : srv_page_size, dst, bpage, ignore); - if (sync) { - thd_wait_end(NULL); - } + *err= fio.err; - if (UNIV_UNLIKELY(*err != DB_SUCCESS)) { - if (ignore || *err == DB_TABLESPACE_DELETED) { - buf_read_page_handle_error(bpage); + if (UNIV_UNLIKELY(fio.err != DB_SUCCESS)) { + if (ignore || fio.err == DB_TABLESPACE_DELETED) { + buf_pool.corrupted_evict(bpage); + if (sync && fio.node) { + fio.node->space->release_for_io(); + } return(0); } @@ -187,9 +358,11 @@ buf_read_page_low( } if (sync) { - /* The i/o is already completed when we arrive from - fil_read */ - *err = buf_page_io_complete(bpage); + thd_wait_end(NULL); + + /* The i/o was already completed in fil_io() */ + *err = buf_page_read_complete(bpage, *fio.node); + fio.node->space->release_for_io(); if (*err != DB_SUCCESS) { return(0); @@ -218,153 +391,79 @@ get read even if we return a positive value! 
*/ ulint buf_read_ahead_random(const page_id_t page_id, ulint zip_size, bool ibuf) { - ulint recent_blocks = 0; - ulint ibuf_mode; - ulint count; - ulint low, high; - dberr_t err = DB_SUCCESS; - ulint i; - - if (!srv_random_read_ahead) { - /* Disabled by user */ - return(0); - } - - if (srv_startup_is_before_trx_rollback_phase) { - /* No read-ahead to avoid thread deadlocks */ - return(0); - } - - if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id)) { - - /* If it is an ibuf bitmap page or trx sys hdr, we do - no read-ahead, as that could break the ibuf page access - order */ - - return(0); - } - - const ulint buf_read_ahead_random_area - = buf_pool.read_ahead_area; - low = (page_id.page_no() / buf_read_ahead_random_area) - * buf_read_ahead_random_area; - - high = (page_id.page_no() / buf_read_ahead_random_area + 1) - * buf_read_ahead_random_area; - - /* If DISCARD + IMPORT changes the actual .ibd file meanwhile, we - do not try to read outside the bounds of the tablespace! */ - if (fil_space_t* space = fil_space_acquire(page_id.space())) { - -#ifdef UNIV_DEBUG - if (srv_file_per_table) { - ulint size = 0; - const ulint physical_size = space->physical_size(); - - for (const fil_node_t* node = - UT_LIST_GET_FIRST(space->chain); - node != NULL; - node = UT_LIST_GET_NEXT(chain, node)) { - - size += ulint(os_file_get_size(node->handle) - / physical_size); - } - - ut_ad(size == space->size); - } -#endif /* UNIV_DEBUG */ - - if (high > space->size) { - high = space->size; - } - space->release(); - } else { - return(0); - } - - mutex_enter(&buf_pool.mutex); - - if (buf_pool.n_pend_reads - > buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT) { - mutex_exit(&buf_pool.mutex); - - return(0); - } - - /* Count how many blocks in the area have been recently accessed, - that is, reside near the start of the LRU list. 
*/ - - for (i = low; i < high; i++) { - if (const buf_page_t* bpage = buf_page_hash_get( - page_id_t(page_id.space(), i))) { - if (buf_page_is_accessed(bpage) - && buf_page_peek_if_young(bpage) - && ++recent_blocks - >= 5 + buf_pool.read_ahead_area / 8) { - mutex_exit(&buf_pool.mutex); - goto read_ahead; - } - } - } - - mutex_exit(&buf_pool.mutex); - /* Do nothing */ - return(0); + if (!srv_random_read_ahead) + return 0; + + if (srv_startup_is_before_trx_rollback_phase) + /* No read-ahead to avoid thread deadlocks */ + return 0; + + if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id)) + /* If it is an ibuf bitmap page or trx sys hdr, we do no + read-ahead, as that could break the ibuf page access order */ + return 0; + + if (buf_pool.n_pend_reads > buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT) + return 0; + + fil_space_t* space= fil_space_acquire(page_id.space()); + if (!space) + return 0; + + const uint32_t buf_read_ahead_area= buf_pool.read_ahead_area; + ulint count= 5 + buf_read_ahead_area / 8; + const page_id_t low= page_id - (page_id.page_no() % buf_read_ahead_area); + page_id_t high= low + buf_read_ahead_area; + high.set_page_no(std::min(high.page_no(), + static_cast<uint32_t>(space->size - 1))); + + /* Count how many blocks in the area have been recently accessed, + that is, reside near the start of the LRU list. */ + + for (page_id_t i= low; i < high; ++i) + { + const ulint fold= i.fold(); + rw_lock_t *hash_lock= buf_pool.page_hash_lock<false>(fold); + const buf_page_t* bpage= buf_pool.page_hash_get_low(i); + bool found= bpage && bpage->is_accessed() && buf_page_peek_if_young(bpage); + rw_lock_s_unlock(hash_lock); + if (found && !--count) + goto read_ahead; + } + + space->release(); + return 0; read_ahead: - /* Read all the suitable blocks within the area */ - - ibuf_mode = ibuf ? 
BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE; - count = 0; - - for (i = low; i < high; i++) { - /* It is only sensible to do read-ahead in the non-sync aio - mode: hence FALSE as the first parameter */ - - const page_id_t cur_page_id(page_id.space(), i); - - if (!ibuf_bitmap_page(cur_page_id, zip_size)) { - count += buf_read_page_low( - &err, false, - ibuf_mode, - cur_page_id, zip_size, false); - - switch (err) { - case DB_SUCCESS: - case DB_ERROR: - break; - case DB_TABLESPACE_DELETED: - ib::info() << "Random readahead trying to" - " access page " << cur_page_id - << " in nonexisting or" - " being-dropped tablespace"; - break; - default: - ut_error; - } - } - } - - - if (count) { - DBUG_PRINT("ib_buf", ("random read-ahead %u pages, %u:%u", - (unsigned) count, - (unsigned) page_id.space(), - (unsigned) page_id.page_no())); - } - - /* Read ahead is considered one I/O operation for the purpose of - LRU policy decision. */ - buf_LRU_stat_inc_io(); - - buf_pool.stat.n_ra_pages_read_rnd += count; - srv_stats.buf_pool_reads.add(count); - return(count); + /* Read all the suitable blocks within the area */ + const ulint ibuf_mode= ibuf ? BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE; + + for (page_id_t i= low; i < high; ++i) + { + if (ibuf_bitmap_page(i, zip_size)) + continue; + dberr_t err; + count+= buf_read_page_low(&err, false, ibuf_mode, i, zip_size, false); + } + + if (count) + DBUG_PRINT("ib_buf", ("random read-ahead %zu pages from %s: %u", + count, space->chain.start->name, + low.page_no())); + space->release(); + + /* Read ahead is considered one I/O operation for the purpose of + LRU policy decision. */ + buf_LRU_stat_inc_io(); + + buf_pool.stat.n_ra_pages_read_rnd+= count; + srv_stats.buf_pool_reads.add(count); + return count; } -/** High-level function which reads a page asynchronously from a file to the -buffer buf_pool if it is not already there. Sets the io_fix flag and sets -an exclusive lock on the buffer frame. 
The flag is cleared and the x-lock +/** High-level function which reads a page from a file to buf_pool +if it is not already there. Sets the io_fix and an exclusive lock +on the buffer frame. The flag is cleared and the x-lock released by the i/o-handler thread. @param[in] page_id page id @param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 @@ -375,16 +474,9 @@ after decryption normal page checksum does not match. @retval DB_TABLESPACE_DELETED if tablespace .ibd file is missing */ dberr_t buf_read_page(const page_id_t page_id, ulint zip_size) { - ulint count; dberr_t err = DB_SUCCESS; - /* We do synchronous IO because our AIO completion code - is sub-optimal. See buf_page_io_complete(), we have to - acquire the buffer pool mutex before acquiring the block - mutex, required for updating the page state. The acquire - of the buffer pool mutex becomes an expensive bottleneck. */ - - count = buf_read_page_low( + ulint count = buf_read_page_low( &err, true, BUF_READ_ANY_PAGE, page_id, zip_size, false); srv_stats.buf_pool_reads.add(count); @@ -477,247 +569,155 @@ which could result in a deadlock if the OS does not support asynchronous io. 
ulint buf_read_ahead_linear(const page_id_t page_id, ulint zip_size, bool ibuf) { - buf_page_t* bpage; - buf_frame_t* frame; - buf_page_t* pred_bpage = NULL; - ulint pred_offset; - ulint succ_offset; - int asc_or_desc; - ulint new_offset; - ulint fail_count; - ulint low, high; - dberr_t err = DB_SUCCESS; - ulint i; - ulint threshold; - - /* check if readahead is disabled */ - if (!srv_read_ahead_threshold) { - return(0); - } - - if (srv_startup_is_before_trx_rollback_phase) { - /* No read-ahead to avoid thread deadlocks */ - return(0); - } - - const ulint buf_read_ahead_linear_area - = buf_pool.read_ahead_area; - low = (page_id.page_no() / buf_read_ahead_linear_area) - * buf_read_ahead_linear_area; - high = (page_id.page_no() / buf_read_ahead_linear_area + 1) - * buf_read_ahead_linear_area; - - if ((page_id.page_no() != low) && (page_id.page_no() != high - 1)) { - /* This is not a border page of the area: return */ - - return(0); - } - - if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id)) { - - /* If it is an ibuf bitmap page or trx sys hdr, we do - no read-ahead, as that could break the ibuf page access - order */ - - return(0); - } - - /* Remember the tablespace version before we ask te tablespace size - below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we - do not try to read outside the bounds of the tablespace! */ - ulint space_size; - - if (fil_space_t* space = fil_space_acquire(page_id.space())) { - space_size = space->size; - space->release(); - - if (high > space_size) { - /* The area is not whole */ - return(0); - } - } else { - return(0); - } - - mutex_enter(&buf_pool.mutex); - - if (buf_pool.n_pend_reads - > buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT) { - mutex_exit(&buf_pool.mutex); - - return(0); - } - - /* Check that almost all pages in the area have been accessed; if - offset == low, the accesses must be in a descending order, otherwise, - in an ascending order. 
*/ - - asc_or_desc = 1; - - if (page_id.page_no() == low) { - asc_or_desc = -1; - } - - /* How many out of order accessed pages can we ignore - when working out the access pattern for linear readahead */ - threshold = ut_min(static_cast<ulint>(64 - srv_read_ahead_threshold), - buf_pool.read_ahead_area); - - fail_count = 0; - - for (i = low; i < high; i++) { - bpage = buf_page_hash_get(page_id_t(page_id.space(), i)); - - if (bpage == NULL || !buf_page_is_accessed(bpage)) { - /* Not accessed */ - fail_count++; - - } else if (pred_bpage) { - /* Note that buf_page_is_accessed() returns - the time of the first access. If some blocks - of the extent existed in the buffer pool at - the time of a linear access pattern, the first - access times may be nonmonotonic, even though - the latest access times were linear. The - threshold (srv_read_ahead_factor) should help - a little against this. */ - int res = ut_ulint_cmp( - buf_page_is_accessed(bpage), - buf_page_is_accessed(pred_bpage)); - /* Accesses not in the right order */ - if (res != 0 && res != asc_or_desc) { - fail_count++; - } - } - - if (fail_count > threshold) { - /* Too many failures: return */ - mutex_exit(&buf_pool.mutex); - return(0); - } - - if (bpage && buf_page_is_accessed(bpage)) { - pred_bpage = bpage; - } - } - - /* If we got this far, we know that enough pages in the area have - been accessed in the right order: linear read-ahead can be sensible */ - - bpage = buf_page_hash_get(page_id); - - if (bpage == NULL) { - mutex_exit(&buf_pool.mutex); - - return(0); - } - - switch (buf_page_get_state(bpage)) { - case BUF_BLOCK_ZIP_PAGE: - frame = bpage->zip.data; - break; - case BUF_BLOCK_FILE_PAGE: - frame = ((buf_block_t*) bpage)->frame; - break; - default: - ut_error; - break; - } - - /* Read the natural predecessor and successor page addresses from - the page; NOTE that because the calling thread may have an x-latch - on the page, we do not acquire an s-latch on the page, this is to - prevent deadlocks. 
Even if we read values which are nonsense, the - algorithm will work. */ - - pred_offset = mach_read_from_4(my_assume_aligned<4>(FIL_PAGE_PREV - + frame)); - succ_offset = mach_read_from_4(my_assume_aligned<4>(FIL_PAGE_NEXT - + frame)); - mutex_exit(&buf_pool.mutex); - - if ((page_id.page_no() == low) - && (succ_offset == page_id.page_no() + 1)) { - - /* This is ok, we can continue */ - new_offset = pred_offset; - - } else if ((page_id.page_no() == high - 1) - && (pred_offset == page_id.page_no() - 1)) { - - /* This is ok, we can continue */ - new_offset = succ_offset; - } else { - /* Successor or predecessor not in the right order */ - - return(0); - } - - low = (new_offset / buf_read_ahead_linear_area) - * buf_read_ahead_linear_area; - high = (new_offset / buf_read_ahead_linear_area + 1) - * buf_read_ahead_linear_area; - - if ((new_offset != low) && (new_offset != high - 1)) { - /* This is not a border page of the area: return */ - - return(0); - } - - if (high > space_size) { - /* The area is not whole, return */ - - return(0); - } - - ulint count = 0; - - /* If we got this far, read-ahead can be sensible: do it */ - - ulint ibuf_mode = ibuf ? 
BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE; - - for (i = low; i < high; i++) { - /* It is only sensible to do read-ahead in the non-sync - aio mode: hence FALSE as the first parameter */ - - const page_id_t cur_page_id(page_id.space(), i); - - if (!ibuf_bitmap_page(cur_page_id, zip_size)) { - count += buf_read_page_low( - &err, false, - ibuf_mode, cur_page_id, zip_size, false); - - switch (err) { - case DB_SUCCESS: - case DB_TABLESPACE_DELETED: - case DB_ERROR: - break; - case DB_PAGE_CORRUPTED: - case DB_DECRYPTION_FAILED: - ib::error() << "linear readahead failed to" - " read or decrypt " - << page_id_t(page_id.space(), i); - break; - default: - ut_error; - } - } - } - - if (count) { - DBUG_PRINT("ib_buf", ("linear read-ahead " ULINTPF " pages, " - "%u:%u", - count, - page_id.space(), - page_id.page_no())); - } - - /* Read ahead is considered one I/O operation for the purpose of - LRU policy decision. */ - buf_LRU_stat_inc_io(); - - buf_pool.stat.n_ra_pages_read += count; - return(count); + /* check if readahead is disabled */ + if (!srv_read_ahead_threshold) + return 0; + + if (srv_startup_is_before_trx_rollback_phase) + /* No read-ahead to avoid thread deadlocks */ + return 0; + + if (buf_pool.n_pend_reads > buf_pool.curr_size / BUF_READ_AHEAD_PEND_LIMIT) + return 0; + + const uint32_t buf_read_ahead_area= buf_pool.read_ahead_area; + const page_id_t low= page_id - (page_id.page_no() % buf_read_ahead_area); + const page_id_t high_1= low + (buf_read_ahead_area - 1); + + /* We will check that almost all pages in the area have been accessed + in the desired order. 
*/ + const bool descending= page_id == low; + + if (!descending && page_id != high_1) + /* This is not a border page of the area */ + return 0; + + if (ibuf_bitmap_page(page_id, zip_size) || trx_sys_hdr_page(page_id)) + /* If it is an ibuf bitmap page or trx sys hdr, we do no + read-ahead, as that could break the ibuf page access order */ + return 0; + + fil_space_t *space= fil_space_acquire(page_id.space()); + if (!space) + return 0; + if (high_1.page_no() >= space->size) + { + /* The area is not whole. */ + space->release(); + return 0; + } + + /* How many out of order accessed pages can we ignore + when working out the access pattern for linear readahead */ + ulint count= std::min<ulint>(buf_pool_t::READ_AHEAD_PAGES - + srv_read_ahead_threshold, + uint32_t{buf_pool.read_ahead_area}); + page_id_t new_low= low, new_high_1= high_1; + unsigned prev_accessed= 0; + for (page_id_t i= low; i != high_1; ++i) + { + const ulint fold= i.fold(); + rw_lock_t *hash_lock= buf_pool.page_hash_lock<false>(fold); + const buf_page_t* bpage= buf_pool.page_hash_get_low(i); + if (i == page_id) + { + /* Read the natural predecessor and successor page addresses from + the page; NOTE that because the calling thread may have an x-latch + on the page, we do not acquire an s-latch on the page, this is to + prevent deadlocks. The hash_lock is only protecting the + buf_pool.page_hash for page i, not the bpage contents itself. 
*/ + if (!bpage) + { +hard_fail: + rw_lock_s_unlock(hash_lock); + space->release(); + return 0; + } + const byte *f; + switch (UNIV_EXPECT(bpage->state(), BUF_BLOCK_FILE_PAGE)) { + case BUF_BLOCK_FILE_PAGE: + f= reinterpret_cast<const buf_block_t*>(bpage)->frame; + break; + case BUF_BLOCK_ZIP_PAGE: + f= bpage->zip.data; + break; + default: + goto hard_fail; + } + + uint32_t prev= mach_read_from_4(my_assume_aligned<4>(f + FIL_PAGE_PREV)); + uint32_t next= mach_read_from_4(my_assume_aligned<4>(f + FIL_PAGE_NEXT)); + if (prev == FIL_NULL || next == FIL_NULL) + goto hard_fail; + page_id_t id= page_id; + if (descending && next - 1 == page_id.page_no()) + id.set_page_no(prev); + else if (!descending && prev + 1 == page_id.page_no()) + id.set_page_no(next); + else + goto hard_fail; /* Successor or predecessor not in the right order */ + + new_low= id - (id.page_no() % buf_read_ahead_area); + new_high_1= new_low + (buf_read_ahead_area - 1); + + if (id != new_low && id != new_high_1) + /* This is not a border page of the area: return */ + goto hard_fail; + if (new_high_1.page_no() >= space->size) + /* The area is not whole */ + goto hard_fail; + } + else if (!bpage) + { +failed: + rw_lock_s_unlock(hash_lock); + if (--count) + continue; + space->release(); + return 0; + } + + const unsigned accessed= bpage->is_accessed(); + if (!accessed) + goto failed; + /* Note that buf_page_t::is_accessed() returns the time of the + first access. If some blocks of the extent existed in the buffer + pool at the time of a linear access pattern, the first access + times may be nonmonotonic, even though the latest access times + were linear. The threshold (srv_read_ahead_factor) should help a + little against this. */ + bool fail= prev_accessed && + (descending ? 
prev_accessed > accessed : prev_accessed < accessed); + prev_accessed= accessed; + if (fail) + goto failed; + rw_lock_s_unlock(hash_lock); + } + + /* If we got this far, read-ahead can be sensible: do it */ + count= 0; + for (ulint ibuf_mode= ibuf ? BUF_READ_IBUF_PAGES_ONLY : BUF_READ_ANY_PAGE; + new_low != new_high_1; ++new_low) + { + if (ibuf_bitmap_page(new_low, zip_size)) + continue; + dberr_t err; + count+= buf_read_page_low(&err, false, ibuf_mode, new_low, zip_size, + false); + } + + if (count) + DBUG_PRINT("ib_buf", ("random read-ahead %zu pages from %s: %u", + count, space->chain.start->name, + new_low.page_no())); + space->release(); + + /* Read ahead is considered one I/O operation for the purpose of + LRU policy decision. */ + buf_LRU_stat_inc_io(); + + buf_pool.stat.n_ra_pages_read+= count; + return count; } /** Issues read requests for pages which recovery wants to read in. diff --git a/storage/innobase/dict/dict0boot.cc b/storage/innobase/dict/dict0boot.cc index 79a3adc7aa6..833f2621de6 100644 --- a/storage/innobase/dict/dict0boot.cc +++ b/storage/innobase/dict/dict0boot.cc @@ -137,7 +137,7 @@ dict_hdr_create( block = fseg_create(fil_system.sys_space, 0, DICT_HDR + DICT_HDR_FSEG_HEADER, mtr); - ut_a(DICT_HDR_PAGE_NO == block->page.id.page_no()); + ut_a(block->page.id() == page_id_t(DICT_HDR_SPACE, DICT_HDR_PAGE_NO)); buf_block_t* d = dict_hdr_get(mtr); diff --git a/storage/innobase/dict/dict0dict.cc b/storage/innobase/dict/dict0dict.cc index 00d11d1806e..8ff0bfa9a5b 100644 --- a/storage/innobase/dict/dict0dict.cc +++ b/storage/innobase/dict/dict0dict.cc @@ -3754,7 +3754,7 @@ dict_index_get_if_in_cache_low( return(dict_index_find_on_id_low(index_id)); } -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +#ifdef UNIV_DEBUG /**********************************************************************//** Returns an index object if it is found in the dictionary cache. 
@return index, NULL if not found */ @@ -3777,9 +3777,7 @@ dict_index_get_if_in_cache( return(index); } -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ -#ifdef UNIV_DEBUG /**********************************************************************//** Checks that a tuple has n_fields_cmp value in a sensible range, so that no comparison can occur with the page number field in a node pointer. diff --git a/storage/innobase/fil/fil0crypt.cc b/storage/innobase/fil/fil0crypt.cc index 309eed4e597..0e62cff0ea1 100644 --- a/storage/innobase/fil/fil0crypt.cc +++ b/storage/innobase/fil/fil0crypt.cc @@ -1100,7 +1100,7 @@ static bool fil_crypt_start_encrypting_space(fil_space_t* space) do { ulint n_pages = 0; success = buf_flush_lists(ULINT_MAX, end_lsn, &n_pages); - buf_flush_wait_batch_end(BUF_FLUSH_LIST); + buf_flush_wait_batch_end(false); sum_pages += n_pages; } while (!success); @@ -1894,7 +1894,7 @@ fil_crypt_flush_space( do { success = buf_flush_lists(ULINT_MAX, end_lsn, &n_pages); - buf_flush_wait_batch_end(BUF_FLUSH_LIST); + buf_flush_wait_batch_end(false); sum_pages += n_pages; } while (!success && !space->is_stopping()); @@ -2438,7 +2438,7 @@ bool fil_space_verify_crypt_checksum(const byte* page, ulint zip_size) /* Compressed and encrypted pages do not have checksum. Assume not corrupted. Page verification happens after decompression in - buf_page_io_complete() using buf_page_is_corrupted(). */ + buf_page_read_complete() using buf_page_is_corrupted(). */ if (fil_page_get_type(page) == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED) { return true; } diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 844291db939..b627e7826a5 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -173,12 +173,6 @@ fil_system_t fil_system; UNIV_INTERN extern uint srv_fil_crypt_rotate_key_age; UNIV_INTERN extern ib_mutex_t fil_crypt_threads_mutex; -/** Determine if user has explicitly disabled fsync(). 
*/ -# define fil_buffering_disabled(s) \ - ((s)->purpose == FIL_TYPE_TABLESPACE \ - && srv_file_flush_method \ - == SRV_O_DIRECT_NO_FSYNC) - /** Determine if the space id is a user tablespace id or not. @param[in] space_id Space ID to check @return true if it is a user tablespace ID */ @@ -249,67 +243,6 @@ fil_node_prepare_for_io( fil_node_t* node, /*!< in: file node */ fil_space_t* space); /*!< in: space */ -/** Update the data structures when an i/o operation finishes. -@param[in,out] node file node -@param[in] type IO context */ -static -void -fil_node_complete_io(fil_node_t* node, const IORequest& type); - -/** Reads data from a space to a buffer. Remember that the possible incomplete -blocks at the end of file are ignored: they are not taken into account when -calculating the byte offset within a space. -@param[in] page_id page id -@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 -@param[in] byte_offset remainder of offset in bytes; in aio this -must be divisible by the OS block size -@param[in] len how many bytes to read; this must not cross a -file boundary; in aio this must be a block size multiple -@param[in,out] buf buffer where to store data read; in aio this -must be appropriately aligned -@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do -i/o on a tablespace which does not exist */ -UNIV_INLINE -dberr_t -fil_read( - const page_id_t page_id, - ulint zip_size, - ulint byte_offset, - ulint len, - void* buf) -{ - return(fil_io(IORequestRead, true, page_id, zip_size, - byte_offset, len, buf, NULL)); -} - -/** Writes data to a space from a buffer. Remember that the possible incomplete -blocks at the end of file are ignored: they are not taken into account when -calculating the byte offset within a space. 
-@param[in] page_id page id -@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 -@param[in] byte_offset remainder of offset in bytes; in aio this -must be divisible by the OS block size -@param[in] len how many bytes to write; this must not cross -a file boundary; in aio this must be a block size multiple -@param[in] buf buffer from which to write; in aio this must -be appropriately aligned -@return DB_SUCCESS, or DB_TABLESPACE_DELETED if we are trying to do -i/o on a tablespace which does not exist */ -UNIV_INLINE -dberr_t -fil_write( - const page_id_t page_id, - ulint zip_size, - ulint byte_offset, - ulint len, - void* buf) -{ - ut_ad(!srv_read_only_mode); - - return(fil_io(IORequestWrite, true, page_id, zip_size, - byte_offset, len, buf, NULL)); -} - /*******************************************************************//** Returns the table space by a given id, NULL if not found. It is unsafe to dereference the returned pointer. It is fine to check @@ -395,8 +328,7 @@ fil_space_is_flushed( node = UT_LIST_GET_NEXT(chain, node)) { if (node->needs_flush) { - - ut_ad(!fil_buffering_disabled(space)); + ut_ad(srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC); return(false); } } @@ -678,8 +610,7 @@ static void fil_flush_low(fil_space_t* space, bool metadata = false) ut_ad(space); ut_ad(!space->stop_new_ops); - if (fil_buffering_disabled(space)) { - + if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC) { /* No need to flush. User has explicitly disabled buffering. 
*/ ut_ad(!space->is_in_unflushed_spaces); @@ -839,7 +770,7 @@ fil_space_extend_must_retry( const ulint pages_in_MiB = node->size & ~ulint((1U << (20U - srv_page_size_shift)) - 1); - fil_node_complete_io(node,IORequestRead); + node->complete_io(); /* Keep the last data file size info up to date, rounded to full megabytes */ @@ -864,14 +795,12 @@ fil_space_extend_must_retry( } } -/*******************************************************************//** -Reserves the fil_system.mutex and tries to make sure we can open at least one +/** Acquire fil_system.mutex and try to make sure we can open at least one file while holding it. This should be called before calling fil_node_prepare_for_io(), because that function may need to open a file. */ static -void +fil_space_t* fil_mutex_enter_and_prepare_for_io( -/*===============================*/ ulint space_id) /*!< in: space id */ { for (ulint count = 0;;) { @@ -879,8 +808,8 @@ fil_mutex_enter_and_prepare_for_io( fil_space_t* space = fil_space_get_by_id(space_id); - if (space == NULL) { - break; + if (!space) { + return nullptr; } fil_node_t* node = UT_LIST_GET_LAST(space->chain); @@ -960,7 +889,7 @@ fil_mutex_enter_and_prepare_for_io( } } - break; + return space; } } @@ -987,87 +916,96 @@ fil_space_extend( return(success); } -/** Prepare to free a file node object from a tablespace memory cache. -@param[in,out] node file node -@param[in] space tablespace */ -static -void -fil_node_close_to_free( - fil_node_t* node, - fil_space_t* space) +/** Prepare to free a file from fil_system. 
*/ +inline void fil_node_t::close_to_free() { - ut_ad(mutex_own(&fil_system.mutex)); - ut_a(node->magic_n == FIL_NODE_MAGIC_N); - ut_a(node->n_pending == 0); - ut_a(!node->being_extended); - - if (node->is_open()) { - /* We fool the assertion in fil_node_t::close() to think - there are no unflushed modifications in the file */ - - node->needs_flush = false; - - if (fil_buffering_disabled(space)) { - - ut_ad(!space->is_in_unflushed_spaces); - ut_ad(fil_space_is_flushed(space)); - - } else if (space->is_in_unflushed_spaces - && fil_space_is_flushed(space)) { + ut_ad(mutex_own(&fil_system.mutex)); + ut_a(magic_n == FIL_NODE_MAGIC_N); + ut_a(!being_extended); - fil_system.unflushed_spaces.remove(*space); - space->is_in_unflushed_spaces = false; - } - - node->close(); - } + while (is_open()) + { + if (space->is_in_unflushed_spaces) + { + ut_ad(srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC); + space->is_in_unflushed_spaces= false; + fil_system.unflushed_spaces.remove(*space); + } + + if (n_pending) + { + mutex_exit(&fil_system.mutex); + os_thread_sleep(100); + mutex_enter(&fil_system.mutex); + continue; + } + + if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC) + { + ut_ad(!space->is_in_unflushed_spaces); + ut_ad(fil_space_is_flushed(space)); + } + else if (space->is_in_unflushed_spaces && fil_space_is_flushed(space)) + { + space->is_in_unflushed_spaces= false; + fil_system.unflushed_spaces.remove(*space); + } + + if (fil_space_belongs_in_lru(space)) + { + ut_ad(UT_LIST_GET_LEN(fil_system.LRU) > 0); + UT_LIST_REMOVE(fil_system.LRU, this); + } + ut_a(!n_pending_flushes); + ut_a(!being_extended); + bool ret= os_file_close(handle); + ut_a(ret); + handle= OS_FILE_CLOSED; + break; + } } -/** Detach a space object from the tablespace memory cache. -Closes the files in the chain but does not delete them. -There must not be any pending i/o's or flushes on the files. 
-@param[in,out] space tablespace */ -static -void -fil_space_detach( - fil_space_t* space) +/** Detach a tablespace from the cache and close the files. */ +inline void fil_system_t::detach(fil_space_t *space) { - ut_ad(mutex_own(&fil_system.mutex)); - - HASH_DELETE(fil_space_t, hash, fil_system.spaces, space->id, space); - - if (space->is_in_unflushed_spaces) { + ut_ad(mutex_own(&fil_system.mutex)); + HASH_DELETE(fil_space_t, hash, spaces, space->id, space); - ut_ad(!fil_buffering_disabled(space)); - fil_system.unflushed_spaces.remove(*space); - space->is_in_unflushed_spaces = false; - } - - if (space->is_in_rotation_list) { - fil_system.rotation_list.remove(*space); - space->is_in_rotation_list = false; - } - - UT_LIST_REMOVE(fil_system.space_list, space); - - ut_a(space->magic_n == FIL_SPACE_MAGIC_N); - ut_a(space->n_pending_flushes == 0); - - for (fil_node_t* fil_node = UT_LIST_GET_FIRST(space->chain); - fil_node != NULL; - fil_node = UT_LIST_GET_NEXT(chain, fil_node)) { - - fil_node_close_to_free(fil_node, space); - } + if (space->is_in_unflushed_spaces) + { + ut_ad(srv_file_flush_method != SRV_O_DIRECT_NO_FSYNC); + space->is_in_unflushed_spaces= false; + unflushed_spaces.remove(*space); + } - if (space == fil_system.sys_space) { - fil_system.sys_space = NULL; - } else if (space == fil_system.temp_space) { - fil_system.temp_space = NULL; - } + if (space->is_in_rotation_list) + { + space->is_in_rotation_list= false; + rotation_list.remove(*space); + } + UT_LIST_REMOVE(space_list, space); + if (space == sys_space) + sys_space= nullptr; + else if (space == temp_space) + temp_space= nullptr; + + ut_a(space->magic_n == FIL_SPACE_MAGIC_N); + ut_a(space->n_pending_flushes == 0); + + for (fil_node_t* node= UT_LIST_GET_FIRST(space->chain); node; + node= UT_LIST_GET_NEXT(chain, node)) + if (node->is_open()) + { + ut_ad(n_open > 0); + n_open--; + } + + for (fil_node_t* node= UT_LIST_GET_FIRST(space->chain); node; + node= UT_LIST_GET_NEXT(chain, node)) + 
node->close_to_free(); } -/** Free a tablespace object on which fil_space_detach() was invoked. +/** Free a tablespace object on which fil_system_t::detach() was invoked. There must not be any pending i/o's or flushes on the files. @param[in,out] space tablespace */ static @@ -1080,7 +1018,7 @@ fil_space_free_low( || space->max_lsn == 0); /* Wait for fil_space_t::release_for_io(); after - fil_space_detach(), the tablespace cannot be found, so + fil_system_t::detach(), the tablespace cannot be found, so fil_space_acquire_for_io() would return NULL */ while (space->pending_io()) { os_thread_sleep(100); @@ -1121,7 +1059,7 @@ fil_space_free( fil_space_t* space = fil_space_get_by_id(id); if (space != NULL) { - fil_space_detach(space); + fil_system.detach(space); } mutex_exit(&fil_system.mutex); @@ -1332,9 +1270,7 @@ fil_space_t* fil_system_t::read_page0(ulint id) /* It is possible that the tablespace is dropped while we are not holding the mutex. */ - fil_mutex_enter_and_prepare_for_io(id); - - fil_space_t* space = fil_space_get_by_id(id); + fil_space_t* space = fil_mutex_enter_and_prepare_for_io(id); if (space == NULL || UT_LIST_GET_LEN(space->chain) == 0) { return(NULL); @@ -1356,7 +1292,7 @@ fil_space_t* fil_system_t::read_page0(ulint id) return(NULL); } - fil_node_complete_io(node, IORequestRead); + node->complete_io(); return space; } @@ -1634,25 +1570,24 @@ fil_open_system_tablespace_files() mutex_exit(&fil_system.mutex); } -/*******************************************************************//** -Closes all open files. There must not be any pending i/o's or not flushed -modifications in the files. */ -void -fil_close_all_files(void) -/*=====================*/ +/** Close all tablespace files at shutdown */ +void fil_close_all_files() { + if (!fil_system.is_initialised()) { + return; + } + fil_space_t* space; /* At shutdown, we should not have any files in this list. 
*/ - ut_ad(fil_system.is_initialised()); ut_ad(srv_fast_shutdown == 2 || !srv_was_started || UT_LIST_GET_LEN(fil_system.named_spaces) == 0); + fil_flush_file_spaces(); mutex_enter(&fil_system.mutex); - for (space = UT_LIST_GET_FIRST(fil_system.space_list); - space != NULL; ) { + for (space = UT_LIST_GET_FIRST(fil_system.space_list); space; ) { fil_node_t* node; fil_space_t* prev_space = space; @@ -1660,13 +1595,31 @@ fil_close_all_files(void) node != NULL; node = UT_LIST_GET_NEXT(chain, node)) { - if (node->is_open()) { - node->close(); + if (!node->is_open()) { +next: + continue; } + + for (ulint count = 10000; count--; ) { + mutex_exit(&fil_system.mutex); + os_thread_sleep(100); + mutex_enter(&fil_system.mutex); + if (!node->is_open()) { + goto next; + } + if (!node->n_pending) { + node->close(); + goto next; + } + } + + ib::error() << "File '" << node->name + << "' has " << node->n_pending + << " operations"; } space = UT_LIST_GET_NEXT(space_list, space); - fil_space_detach(prev_space); + fil_system.detach(prev_space); fil_space_free_low(prev_space); } @@ -1708,15 +1661,17 @@ fil_write_flushed_lsn( lsn_t lsn) { byte* buf; - dberr_t err = DB_TABLESPACE_NOT_FOUND; + ut_ad(!srv_read_only_mode); buf = static_cast<byte*>(aligned_malloc(srv_page_size, srv_page_size)); const page_id_t page_id(TRX_SYS_SPACE, 0); - err = fil_read(page_id, 0, 0, srv_page_size, buf); + fil_io_t fio = fil_io(IORequestRead, true, page_id, 0, 0, + srv_page_size, buf, NULL); - if (err == DB_SUCCESS) { + if (fio.err == DB_SUCCESS) { + fio.node->space->release_for_io(); mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, lsn); ulint fsp_flags = mach_read_from_4( @@ -1726,12 +1681,17 @@ fil_write_flushed_lsn( buf_flush_assign_full_crc32_checksum(buf); } - err = fil_write(page_id, 0, 0, srv_page_size, buf); + fio = fil_io(IORequestWrite, true, page_id, 0, 0, + srv_page_size, buf, NULL); fil_flush_file_spaces(); } + if (fio.node) { + fio.node->space->release_for_io(); + } + 
aligned_free(buf); - return(err); + return fio.err; } /** Acquire a tablespace when it could be dropped concurrently. @@ -2008,13 +1968,6 @@ fil_op_replay_rename( return(true); } -/** File operations for tablespace */ -enum fil_operation_t { - FIL_OPERATION_DELETE, /*!< delete a single-table tablespace */ - FIL_OPERATION_CLOSE, /*!< close a single-table tablespace */ - FIL_OPERATION_TRUNCATE /*!< truncate an undo tablespace */ -}; - /** Check for pending operations. @param[in] space tablespace @param[in] count number of attempts so far @@ -2050,7 +2003,6 @@ static ulint fil_check_pending_io( /*=================*/ - fil_operation_t operation, /*!< in: File operation */ fil_space_t* space, /*!< in/out: Tablespace to check */ fil_node_t** node, /*!< out: Node in space list */ ulint count) /*!< in: number of attempts so far */ @@ -2058,15 +2010,6 @@ fil_check_pending_io( ut_ad(mutex_own(&fil_system.mutex)); ut_ad(!space->referenced()); - switch (operation) { - case FIL_OPERATION_DELETE: - case FIL_OPERATION_CLOSE: - break; - case FIL_OPERATION_TRUNCATE: - space->is_being_truncated = true; - break; - } - /* The following code must change when InnoDB supports multiple datafiles per tablespace. */ ut_a(UT_LIST_GET_LEN(space->chain) == 1); @@ -2095,24 +2038,18 @@ fil_check_pending_io( /*******************************************************************//** Check pending operations on a tablespace. -@return DB_SUCCESS or error failure. 
*/ +@return tablespace */ static -dberr_t +fil_space_t* fil_check_pending_operations( /*=========================*/ ulint id, /*!< in: space id */ - fil_operation_t operation, /*!< in: File operation */ - fil_space_t** space, /*!< out: tablespace instance - in memory */ + bool truncate, /*!< in: whether to truncate a file */ char** path) /*!< out/own: tablespace path */ { ulint count = 0; ut_a(!is_system_tablespace(id)); - ut_ad(space); - - *space = 0; - mutex_enter(&fil_system.mutex); fil_space_t* sp = fil_space_get_by_id(id); @@ -2130,32 +2067,31 @@ fil_check_pending_operations( /* Check for pending operations. */ do { - sp = fil_space_get_by_id(id); - count = fil_check_pending_ops(sp, count); mutex_exit(&fil_system.mutex); - if (count > 0) { + if (count) { os_thread_sleep(20000); // Wait 0.02 seconds + } else if (!sp) { + return nullptr; } mutex_enter(&fil_system.mutex); - } while (count > 0); + + sp = fil_space_get_by_id(id); + } while (count); /* Check for pending IO. */ for (;;) { - sp = fil_space_get_by_id(id); - - if (sp == NULL) { - mutex_exit(&fil_system.mutex); - return(DB_TABLESPACE_NOT_FOUND); + if (truncate) { + sp->is_being_truncated = true; } fil_node_t* node; - count = fil_check_pending_io(operation, sp, &node, count); + count = fil_check_pending_io(sp, &node, count); if (count == 0 && path) { *path = mem_strdup(node->name); @@ -2169,40 +2105,29 @@ fil_check_pending_operations( os_thread_sleep(20000); // Wait 0.02 seconds mutex_enter(&fil_system.mutex); - } + sp = fil_space_get_by_id(id); - ut_ad(sp); + if (!sp) { + mutex_exit(&fil_system.mutex); + break; + } + } - *space = sp; - return(DB_SUCCESS); + return sp; } -/*******************************************************************//** -Closes a single-table tablespace. The tablespace must be cached in the -memory cache. Free all pages used by the tablespace. 
-@return DB_SUCCESS or error */ -dberr_t -fil_close_tablespace( -/*=================*/ - trx_t* trx, /*!< in/out: Transaction covering the close */ - ulint id) /*!< in: space id */ +/** Close a single-table tablespace on failed IMPORT TABLESPACE. +The tablespace must be cached in the memory cache. +Free all pages used by the tablespace. */ +void fil_close_tablespace(ulint id) { - char* path = 0; - fil_space_t* space = 0; - dberr_t err; - - ut_a(!is_system_tablespace(id)); - - err = fil_check_pending_operations(id, FIL_OPERATION_CLOSE, - &space, &path); - - if (err != DB_SUCCESS) { - return(err); + ut_ad(!is_system_tablespace(id)); + char* path = nullptr; + fil_space_t* space = fil_check_pending_operations(id, false, &path); + if (!space) { + return; } - ut_a(space); - ut_a(path != 0); - rw_lock_x_lock(&space->latch); /* Invalidate in the buffer pool all pages belonging to the @@ -2218,23 +2143,17 @@ fil_close_tablespace( if (!fil_space_free(id, true)) { rw_lock_x_unlock(&space->latch); - err = DB_TABLESPACE_NOT_FOUND; - } else { - err = DB_SUCCESS; } /* If it is a delete then also delete any generated files, otherwise when we drop the database the remove directory will fail. 
*/ - char* cfg_name = fil_make_filepath(path, NULL, CFG, false); - if (cfg_name != NULL) { + if (char* cfg_name = fil_make_filepath(path, NULL, CFG, false)) { os_file_delete_if_exists(innodb_data_file_key, cfg_name, NULL); ut_free(cfg_name); } ut_free(path); - - return(err); } /** Determine whether a table can be accessed in operations that are @@ -2264,15 +2183,14 @@ bool fil_table_accessible(const dict_table_t* table) @return DB_SUCCESS or error */ dberr_t fil_delete_tablespace(ulint id, bool if_exists) { - char* path = 0; - fil_space_t* space = 0; - - ut_a(!is_system_tablespace(id)); + char* path = NULL; + ut_ad(!is_system_tablespace(id)); - dberr_t err = fil_check_pending_operations( - id, FIL_OPERATION_DELETE, &space, &path); + dberr_t err; + fil_space_t *space = fil_check_pending_operations(id, false, &path); - if (err != DB_SUCCESS) { + if (!space) { + err = DB_TABLESPACE_NOT_FOUND; if (!if_exists) { ib::error() << "Cannot delete tablespace " << id << " because it is not found" @@ -2282,9 +2200,6 @@ dberr_t fil_delete_tablespace(ulint id, bool if_exists) goto func_exit; } - ut_a(space); - ut_a(path != 0); - /* IMPORTANT: Because we have set space::stop_new_ops there can't be any new reads or flushes. We are here because node::n_pending was zero above. However, it is still @@ -2306,6 +2221,7 @@ dberr_t fil_delete_tablespace(ulint id, bool if_exists) To deal with potential read requests, we will check the ::stop_new_ops flag in fil_io(). 
*/ + err = DB_SUCCESS; buf_LRU_flush_or_remove_pages(id, false); /* If it is a delete then also delete any generated files, otherwise @@ -2344,10 +2260,7 @@ dberr_t fil_delete_tablespace(ulint id, bool if_exists) ut_a(s == space); ut_a(!space->referenced()); ut_a(UT_LIST_GET_LEN(space->chain) == 1); - fil_node_t* node = UT_LIST_GET_FIRST(space->chain); - ut_a(node->n_pending == 0); - - fil_space_detach(space); + fil_system.detach(space); mutex_exit(&fil_system.mutex); log_mutex_enter(); @@ -2384,17 +2297,9 @@ func_exit: @param[in] space_id undo tablespace id @return the tablespace @retval NULL if tablespace not found */ -fil_space_t* fil_truncate_prepare(ulint space_id) +fil_space_t *fil_truncate_prepare(ulint space_id) { - /* Stop all I/O on the tablespace and ensure that related - pages are flushed to disk. */ - fil_space_t* space; - if (fil_check_pending_operations(space_id, FIL_OPERATION_TRUNCATE, - &space, NULL) != DB_SUCCESS) { - return NULL; - } - ut_ad(space != NULL); - return space; + return fil_check_pending_operations(space_id, true, nullptr); } /*******************************************************************//** @@ -3791,83 +3696,24 @@ fil_node_prepare_for_io( } } - if (node->n_pending == 0 && fil_space_belongs_in_lru(space)) { - /* The node is in the LRU list, remove it */ - ut_a(UT_LIST_GET_LEN(fil_system.LRU) > 0); + if (node->n_pending++ == 0 && fil_space_belongs_in_lru(space)) { UT_LIST_REMOVE(fil_system.LRU, node); } - node->n_pending++; - return(true); } -/** Update the data structures when an i/o operation finishes. 
-@param[in,out] node file node -@param[in] type IO context */ -static -void -fil_node_complete_io(fil_node_t* node, const IORequest& type) -{ - ut_ad(mutex_own(&fil_system.mutex)); - ut_a(node->n_pending > 0); - - --node->n_pending; - - ut_ad(type.validate()); - - if (type.is_write()) { - - ut_ad(!srv_read_only_mode - || node->space->purpose == FIL_TYPE_TEMPORARY); - - if (fil_buffering_disabled(node->space)) { - - /* We don't need to keep track of unflushed - changes as user has explicitly disabled - buffering. */ - ut_ad(!node->space->is_in_unflushed_spaces); - ut_ad(node->needs_flush == false); - - } else { - node->needs_flush = true; - - if (!node->space->is_in_unflushed_spaces) { - node->space->is_in_unflushed_spaces = true; - fil_system.unflushed_spaces.push_front( - *node->space); - } - } - } - - if (node->n_pending == 0 && fil_space_belongs_in_lru(node->space)) { - - /* The node must be put back to the LRU list */ - UT_LIST_ADD_FIRST(fil_system.LRU, node); - } -} - /** Report information about an invalid page access. */ -static -void -fil_report_invalid_page_access( - ulint block_offset, /*!< in: block offset */ - ulint space_id, /*!< in: space id */ - const char* space_name, /*!< in: space name */ - ulint byte_offset, /*!< in: byte offset */ - ulint len, /*!< in: I/O length */ - bool is_read) /*!< in: I/O type */ +ATTRIBUTE_COLD __attribute__((noreturn)) +static void +fil_report_invalid_page_access(const page_id_t id, const char *name, + ulint byte_offset, ulint len, bool is_read) { ib::fatal() - << "Trying to " << (is_read ? "read" : "write") - << " page number " << block_offset << " in" - " space " << space_id << ", space name " << space_name << "," - " which is outside the tablespace bounds. Byte offset " - << byte_offset << ", len " << len << - (space_id == 0 && !srv_was_started - ? "Please check that the configuration matches" - " the InnoDB system tablespace location (ibdata files)" - : ""); + << "Trying to " << (is_read ? 
"read " : "write ") + << id + << " which is outside the bounds of tablespace " << name + << ". Byte offset " << byte_offset << ", len " << len; } inline void IORequest::set_fil_node(fil_node_t* node) @@ -3895,12 +3741,11 @@ inline void IORequest::set_fil_node(fil_node_t* node) aligned @param[in] message message for aio handler if non-sync aio used, else ignored -@param[in] ignore whether to ignore out-of-bounds page_id +@param[in] ignore whether to ignore errors @param[in] punch_hole punch the hole to the file for page_compressed tablespace -@return DB_SUCCESS, or DB_TABLESPACE_DELETED - if we are trying to do i/o on a tablespace which does not exist */ -dberr_t +@return status and file descriptor */ +fil_io_t fil_io( const IORequest& type, bool sync, @@ -3958,23 +3803,18 @@ fil_io( srv_stats.data_written.add(len); } - /* Reserve the fil_system mutex and make sure that we can open at + /* Acquire fil_system.mutex and make sure that we can open at least one file while holding it, if the file is not already open */ + fil_space_t* space = fil_mutex_enter_and_prepare_for_io( + page_id.space()); - fil_mutex_enter_and_prepare_for_io(page_id.space()); - - fil_space_t* space = fil_space_get_by_id(page_id.space()); - - /* If we are deleting a tablespace we don't allow async read operations - on that. However, we do allow write operations and sync read operations. 
*/ - if (space == NULL + if (!space || (req_type.is_read() && !sync && space->stop_new_ops && !space->is_being_truncated)) { mutex_exit(&fil_system.mutex); - if (!ignore) { ib::error() << "Trying to do I/O to a tablespace which" @@ -3984,7 +3824,7 @@ fil_io( << ", I/O length: " << len << " bytes"; } - return(DB_TABLESPACE_DELETED); + return {DB_TABLESPACE_DELETED, nullptr}; } ulint cur_page_no = page_id.page_no(); @@ -3995,12 +3835,11 @@ fil_io( if (node == NULL) { if (ignore) { mutex_exit(&fil_system.mutex); - return(DB_ERROR); + return {DB_ERROR, nullptr}; } fil_report_invalid_page_access( - page_id.page_no(), page_id.space(), - space->name, byte_offset, len, + page_id, space->name, byte_offset, len, req_type.is_read()); } else if (fil_is_user_tablespace_id(space->id) @@ -4022,48 +3861,42 @@ fil_io( } /* Open file if closed */ - if (!fil_node_prepare_for_io(node, space)) { - if (fil_is_user_tablespace_id(space->id)) { - mutex_exit(&fil_system.mutex); - - if (!ignore) { - ib::error() - << "Trying to do I/O to a tablespace" - " which exists without .ibd data file." - " I/O type: " - << (req_type.is_read() - ? "read" : "write") - << ", page: " - << page_id_t(page_id.space(), - cur_page_no) - << ", I/O length: " << len << " bytes"; - } + if (UNIV_UNLIKELY(!fil_node_prepare_for_io(node, space))) { + ut_ad(fil_is_user_tablespace_id(space->id)); + mutex_exit(&fil_system.mutex); - return(DB_TABLESPACE_DELETED); + if (!ignore) { + ib::error() + << "Trying to do I/O to a tablespace '" + << space->name + << "' which exists without .ibd data file." + " I/O type: " + << (req_type.is_read() + ? "read" : "write") + << ", page: " + << page_id + << ", I/O length: " << len << " bytes"; } - /* The tablespace is for log. Currently, we just assert here - to prevent handling errors along the way fil_io returns. - Also, if the log files are missing, it would be hard to - promise the server can continue running. 
*/ - ut_a(0); + return {DB_TABLESPACE_DELETED, nullptr}; } - if (space->id && node->size <= cur_page_no) { + if (node->size <= cur_page_no) { if (ignore) { /* If we can tolerate the non-existent pages, we should return with DB_ERROR and let caller decide what to do. */ - fil_node_complete_io(node, req_type); + node->complete_io(req_type.is_write()); mutex_exit(&fil_system.mutex); - return(DB_ERROR); + return {DB_ERROR, nullptr}; } fil_report_invalid_page_access( - page_id.page_no(), page_id.space(), - space->name, byte_offset, len, req_type.is_read()); + page_id, space->name, byte_offset, len, + req_type.is_read()); } + space->acquire_for_io(); /* Now we have made the changes in the data structures of fil_system */ mutex_exit(&fil_system.mutex); @@ -4104,88 +3937,84 @@ fil_io( the decompression fails or the page is corrupt. */ ut_a(req_type.is_dblwr_recover() || err == DB_SUCCESS); - if (sync) { - /* The i/o operation is already completed when we return from - os_aio: */ - mutex_enter(&fil_system.mutex); - - fil_node_complete_io(node, req_type); - + node->complete_io(req_type.is_write()); mutex_exit(&fil_system.mutex); - ut_ad(fil_validate_skip()); } - - return(err); + return {err, node}; } #include <tpool.h> -/**********************************************************************/ /** Callback for AIO completion */ void fil_aio_callback(os_aio_userdata_t *data) { - fil_node_t* node= data->node; - void* message = data->message; - - ut_ad(fil_validate_skip()); + ut_ad(fil_validate_skip()); + fil_node_t *node= data->node; - if (node == NULL) { - ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS); - return; - } - - mutex_enter(&fil_system.mutex); - - fil_node_complete_io(node, data->type); - const ulint space_id= node->space->id; - bool dblwr = node->space->use_doublewrite(); - - mutex_exit(&fil_system.mutex); + if (UNIV_UNLIKELY(!node)) + { + ut_ad(srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS); + return; + } - ut_ad(fil_validate_skip()); + 
ut_ad(data->type.validate()); - /* Do the i/o handling */ - /* IMPORTANT: since i/o handling for reads will read also the insert - buffer in tablespace 0, you have to be very careful not to introduce - deadlocks in the i/o system. We keep tablespace 0 data files always - open, and use a special i/o thread to serve insert buffer requests. */ + buf_page_t *bpage= static_cast<buf_page_t*>(data->message); + if (!bpage) + { + /* Asynchronous single page writes from the doublewrite buffer + don't have access to the page. */ + ut_ad(data->type.is_write()); + ut_ad(node->space == fil_system.sys_space); + ut_ad(!srv_read_only_mode); +write_completed: + mutex_enter(&fil_system.mutex); + node->complete_io(true); + mutex_exit(&fil_system.mutex); + node->space->release_for_io(); + return; + } + if (data->type.is_write()) + { + ut_ad(!srv_read_only_mode || node->space->purpose == FIL_TYPE_TEMPORARY); + bool dblwr= node->space->use_doublewrite(); + if (dblwr && bpage->status == buf_page_t::INIT_ON_FLUSH) + { + bpage->status= buf_page_t::NORMAL; + dblwr= false; + } + buf_page_write_complete(bpage, data->type, dblwr, false); + goto write_completed; + } - /* async single page writes from the dblwr buffer don't have - access to the page */ - buf_page_t* bpage = static_cast<buf_page_t*>(message); - if (!bpage) { - return; - } + ut_ad(data->type.is_read()); - ulint offset = bpage->id.page_no(); - if (dblwr && bpage->status == buf_page_t::INIT_ON_FLUSH) { - bpage->status = buf_page_t::NORMAL; - dblwr = false; - } - dberr_t err = buf_page_io_complete(bpage, dblwr); - if (err == DB_SUCCESS) { - return; - } + /* IMPORTANT: since i/o handling for reads will read also the insert + buffer in fil_system.sys_space, we have to be very careful not to + introduce deadlocks. We never close the system tablespace (0) data + files via fil_system.LRU and we use a dedicated I/O thread to serve + change buffer requests. 
*/ + const page_id_t id(bpage->id()); - ut_ad(data->type.is_read()); - if (recv_recovery_is_on() && !srv_force_recovery) { - recv_sys.found_corrupt_fs = true; - } + if (dberr_t err= buf_page_read_complete(bpage, *node)) + { + if (recv_recovery_is_on() && !srv_force_recovery) + recv_sys.found_corrupt_fs= true; - if (fil_space_t* space = fil_space_acquire_for_io(space_id)) { - if (space == node->space) { - ib::error() << "Failed to read file '" << node->name - << "' at offset " << offset << ": " - << ut_strerr(err); - } + ib::error() << "Failed to read page " << id.page_no() + << " from file '" << node->name << "': " + << ut_strerr(err); + } - space->release_for_io(); - } + mutex_enter(&fil_system.mutex); + node->complete_io(); + mutex_exit(&fil_system.mutex); + node->space->release_for_io(); } /**********************************************************************//** diff --git a/storage/innobase/fsp/fsp0fsp.cc b/storage/innobase/fsp/fsp0fsp.cc index 9bab0fe355a..323cd2e35a2 100644 --- a/storage/innobase/fsp/fsp0fsp.cc +++ b/storage/innobase/fsp/fsp0fsp.cc @@ -474,12 +474,12 @@ xdes_get_offset( void fsp_apply_init_file_page(buf_block_t *block) { memset_aligned<UNIV_PAGE_SIZE_MIN>(block->frame, 0, srv_page_size); + const page_id_t id(block->page.id()); - mach_write_to_4(block->frame + FIL_PAGE_OFFSET, block->page.id.page_no()); + mach_write_to_4(block->frame + FIL_PAGE_OFFSET, id.page_no()); if (log_sys.is_physical()) memset_aligned<8>(block->frame + FIL_PAGE_PREV, 0xff, 8); - mach_write_to_4(block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, - block->page.id.space()); + mach_write_to_4(block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, id.space()); if (page_zip_des_t* page_zip= buf_block_get_page_zip(block)) { memset_aligned<UNIV_ZIP_SIZE_MIN>(page_zip->data, 0, @@ -799,7 +799,7 @@ ATTRIBUTE_COLD void fil_block_reset_type(const buf_block_t& block, ulint type, mtr_t* mtr) { ib::info() - << "Resetting invalid page " << block.page.id << " type " + << "Resetting invalid 
page " << block.page.id() << " type " << fil_page_get_type(block.frame) << " to " << type << "."; mtr->write<2>(block, block.frame + FIL_PAGE_TYPE, type); } @@ -1385,7 +1385,7 @@ static bool fsp_alloc_seg_inode_page(fil_space_t *space, buf_block_t *header, mtr_t *mtr) { - ut_ad(header->page.id.space() == space->id); + ut_ad(header->page.id().space() == space->id); buf_block_t *block= fsp_alloc_free_page(space, 0, RW_SX_LATCH, mtr, mtr); if (!block) @@ -1500,7 +1500,7 @@ static void fsp_free_seg_inode( /* There are no other used headers left on the page: free it */ flst_remove(header, FSP_HEADER_OFFSET + FSP_SEG_INODES_FREE, iblock, FSEG_INODE_PAGE_NODE, mtr); - fsp_free_page(space, iblock->page.id.page_no(), mtr); + fsp_free_page(space, iblock->page.id().page_no(), mtr); } } @@ -1791,7 +1791,7 @@ fseg_create( + block->frame, page_offset(inode)); mtr->write<4>(*block, byte_offset + FSEG_HDR_PAGE_NO - + block->frame, iblock->page.id.page_no()); + + block->frame, iblock->page.id().page_no()); mtr->write<4,mtr_t::MAYBE_NOP>(*block, byte_offset + FSEG_HDR_SPACE + block->frame, space->id); @@ -1847,7 +1847,7 @@ ulint fseg_n_reserved_pages(const buf_block_t &block, { ut_ad(page_align(header) == block.frame); return fseg_n_reserved_pages_low(fseg_inode_get(header, - block.page.id.space(), + block.page.id().space(), block.zip_size(), mtr), used, mtr); } @@ -2162,8 +2162,8 @@ take_hinted_page: ut_a(n != ULINT_UNDEFINED); fseg_set_nth_frag_page_no( - seg_inode, iblock, n, block->page.id.page_no(), - mtr); + seg_inode, iblock, n, + block->page.id().page_no(), mtr); } /* fsp_alloc_free_page() invoked fsp_init_file_page() diff --git a/storage/innobase/fut/fut0lst.cc b/storage/innobase/fut/fut0lst.cc index 0a54d51b462..fbd38c9b215 100644 --- a/storage/innobase/fut/fut0lst.cc +++ b/storage/innobase/fut/fut0lst.cc @@ -101,7 +101,7 @@ static void flst_add_to_empty(buf_block_t *base, uint16_t boffset, mtr->write<1>(*base, base->frame + boffset + (FLST_LEN + 3), 1U); /* Update first 
and last fields of base node */ flst_write_addr(*base, base->frame + boffset + FLST_FIRST, - add->page.id.page_no(), aoffset, mtr); + add->page.id().page_no(), aoffset, mtr); memcpy(base->frame + boffset + FLST_LAST, base->frame + boffset + FLST_FIRST, FIL_ADDR_SIZE); /* Initialize FLST_LAST by (MEMMOVE|0x80,offset,FIL_ADDR_SIZE,source) @@ -145,24 +145,24 @@ static void flst_insert_after(buf_block_t *base, uint16_t boffset, fil_addr_t next_addr= flst_get_next_addr(cur->frame + coffset); flst_write_addr(*add, add->frame + aoffset + FLST_PREV, - cur->page.id.page_no(), coffset, mtr); + cur->page.id().page_no(), coffset, mtr); flst_write_addr(*add, add->frame + aoffset + FLST_NEXT, next_addr.page, next_addr.boffset, mtr); if (next_addr.page == FIL_NULL) flst_write_addr(*base, base->frame + boffset + FLST_LAST, - add->page.id.page_no(), aoffset, mtr); + add->page.id().page_no(), aoffset, mtr); else { buf_block_t *block; - flst_node_t *next= fut_get_ptr(add->page.id.space(), add->zip_size(), + flst_node_t *next= fut_get_ptr(add->page.id().space(), add->zip_size(), next_addr, RW_SX_LATCH, mtr, &block); flst_write_addr(*block, next + FLST_PREV, - add->page.id.page_no(), aoffset, mtr); + add->page.id().page_no(), aoffset, mtr); } flst_write_addr(*cur, cur->frame + coffset + FLST_NEXT, - add->page.id.page_no(), aoffset, mtr); + add->page.id().page_no(), aoffset, mtr); byte *len= &base->frame[boffset + FLST_LEN]; mtr->write<4>(*base, len, mach_read_from_4(len) + 1); @@ -201,22 +201,22 @@ static void flst_insert_before(buf_block_t *base, uint16_t boffset, flst_write_addr(*add, add->frame + aoffset + FLST_PREV, prev_addr.page, prev_addr.boffset, mtr); flst_write_addr(*add, add->frame + aoffset + FLST_NEXT, - cur->page.id.page_no(), coffset, mtr); + cur->page.id().page_no(), coffset, mtr); if (prev_addr.page == FIL_NULL) flst_write_addr(*base, base->frame + boffset + FLST_FIRST, - add->page.id.page_no(), aoffset, mtr); + add->page.id().page_no(), aoffset, mtr); else { 
buf_block_t *block; - flst_node_t *prev= fut_get_ptr(add->page.id.space(), add->zip_size(), + flst_node_t *prev= fut_get_ptr(add->page.id().space(), add->zip_size(), prev_addr, RW_SX_LATCH, mtr, &block); flst_write_addr(*block, prev + FLST_NEXT, - add->page.id.page_no(), aoffset, mtr); + add->page.id().page_no(), aoffset, mtr); } flst_write_addr(*cur, cur->frame + coffset + FLST_PREV, - add->page.id.page_no(), aoffset, mtr); + add->page.id().page_no(), aoffset, mtr); byte *len= &base->frame[boffset + FLST_LEN]; mtr->write<4>(*base, len, mach_read_from_4(len) + 1); @@ -260,9 +260,9 @@ void flst_add_last(buf_block_t *base, uint16_t boffset, { fil_addr_t addr= flst_get_last(base->frame + boffset); buf_block_t *cur= add; - const flst_node_t *c= addr.page == add->page.id.page_no() + const flst_node_t *c= addr.page == add->page.id().page_no() ? add->frame + addr.boffset - : fut_get_ptr(add->page.id.space(), add->zip_size(), addr, + : fut_get_ptr(add->page.id().space(), add->zip_size(), addr, RW_SX_LATCH, mtr, &cur); flst_insert_after(base, boffset, cur, static_cast<uint16_t>(c - cur->frame), @@ -295,9 +295,9 @@ void flst_add_first(buf_block_t *base, uint16_t boffset, { fil_addr_t addr= flst_get_first(base->frame + boffset); buf_block_t *cur= add; - const flst_node_t *c= addr.page == add->page.id.page_no() + const flst_node_t *c= addr.page == add->page.id().page_no() ? add->frame + addr.boffset - : fut_get_ptr(add->page.id.space(), add->zip_size(), addr, + : fut_get_ptr(add->page.id().space(), add->zip_size(), addr, RW_SX_LATCH, mtr, &cur); flst_insert_before(base, boffset, cur, static_cast<uint16_t>(c - cur->frame), @@ -332,9 +332,9 @@ void flst_remove(buf_block_t *base, uint16_t boffset, else { buf_block_t *block= cur; - flst_node_t *prev= prev_addr.page == cur->page.id.page_no() + flst_node_t *prev= prev_addr.page == cur->page.id().page_no() ? 
cur->frame + prev_addr.boffset - : fut_get_ptr(cur->page.id.space(), cur->zip_size(), prev_addr, + : fut_get_ptr(cur->page.id().space(), cur->zip_size(), prev_addr, RW_SX_LATCH, mtr, &block); flst_write_addr(*block, prev + FLST_NEXT, next_addr.page, next_addr.boffset, mtr); @@ -346,9 +346,9 @@ void flst_remove(buf_block_t *base, uint16_t boffset, else { buf_block_t *block= cur; - flst_node_t *next= next_addr.page == cur->page.id.page_no() + flst_node_t *next= next_addr.page == cur->page.id().page_no() ? cur->frame + next_addr.boffset - : fut_get_ptr(cur->page.id.space(), cur->zip_size(), next_addr, + : fut_get_ptr(cur->page.id().space(), cur->zip_size(), next_addr, RW_SX_LATCH, mtr, &block); flst_write_addr(*block, next + FLST_PREV, prev_addr.page, prev_addr.boffset, mtr); @@ -381,7 +381,7 @@ void flst_validate(const buf_block_t *base, uint16_t boffset, mtr_t *mtr) for (uint32_t i= len; i--; ) { mtr2.start(); - const flst_node_t *node= fut_get_ptr(base->page.id.space(), + const flst_node_t *node= fut_get_ptr(base->page.id().space(), base->zip_size(), addr, RW_SX_LATCH, &mtr2); addr= flst_get_next_addr(node); @@ -395,7 +395,7 @@ void flst_validate(const buf_block_t *base, uint16_t boffset, mtr_t *mtr) for (uint32_t i= len; i--; ) { mtr2.start(); - const flst_node_t *node= fut_get_ptr(base->page.id.space(), + const flst_node_t *node= fut_get_ptr(base->page.id().space(), base->zip_size(), addr, RW_SX_LATCH, &mtr2); addr= flst_get_prev_addr(node); diff --git a/storage/innobase/gis/gis0rtree.cc b/storage/innobase/gis/gis0rtree.cc index abb7c20c1db..a44f9bafbce 100644 --- a/storage/innobase/gis/gis0rtree.cc +++ b/storage/innobase/gis/gis0rtree.cc @@ -577,9 +577,9 @@ rtr_adjust_upper_level( level = btr_page_get_level(buf_block_get_frame(block)); ut_ad(level == btr_page_get_level(buf_block_get_frame(new_block))); - page_no = block->page.id.page_no(); + page_no = block->page.id().page_no(); - new_page_no = new_block->page.id.page_no(); + new_page_no = 
new_block->page.id().page_no(); /* Set new mbr for the old page on the upper level. */ /* Look up the index for the node pointer to page */ @@ -653,7 +653,7 @@ rtr_adjust_upper_level( lock_prdt_update_parent(block, new_block, &prdt, &new_prdt, index->table->space_id, - page_cursor->block->page.id.page_no()); + page_cursor->block->page.id().page_no()); mem_heap_free(heap); @@ -668,7 +668,7 @@ rtr_adjust_upper_level( ut_a(page_is_comp(next_block->frame) == page_is_comp(block->frame)); ut_a(btr_page_get_prev(next_block->frame) - == block->page.id.page_no()); + == block->page.id().page_no()); #endif /* UNIV_BTR_DEBUG */ btr_page_set_prev(next_block, new_page_no, mtr); @@ -931,7 +931,7 @@ func_start: ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX)); ut_ad(page_get_n_recs(page) >= 1); - page_no = block->page.id.page_no(); + page_no = block->page.id().page_no(); if (!page_has_prev(page) && !page_is_leaf(page)) { first_rec = page_rec_get_next( @@ -1395,9 +1395,9 @@ rtr_page_copy_rec_list_end_no_locks( ins_rec = page_cur_insert_rec_low(&page_cur, index, cur1_rec, offsets1, mtr); if (UNIV_UNLIKELY(!ins_rec)) { - fprintf(stderr, "page number %ld and %ld\n", - (long)new_block->page.id.page_no(), - (long)block->page.id.page_no()); + fprintf(stderr, "page number %u and %u\n", + new_block->page.id().page_no(), + block->page.id().page_no()); ib::fatal() << "rec offset " << page_offset(rec) << ", cur1 offset " @@ -1515,7 +1515,7 @@ rtr_page_copy_rec_list_start_no_locks( ins_rec = page_cur_insert_rec_low(&page_cur, index, cur1_rec, offsets1, mtr); if (UNIV_UNLIKELY(!ins_rec)) { - ib::fatal() << new_block->page.id + ib::fatal() << new_block->page.id() << "rec offset " << page_offset(rec) << ", cur1 offset " << page_offset(page_cur_get_rec(&cur1)) @@ -1653,7 +1653,7 @@ rtr_check_same_block( mem_heap_t* heap) /*!< in: memory heap */ { - ulint page_no = childb->page.id.page_no(); + ulint page_no = childb->page.id().page_no(); rec_offs* offsets; rec_t* rec = 
page_rec_get_next(page_get_infimum_rec( buf_block_get_frame(parentb))); diff --git a/storage/innobase/gis/gis0sea.cc b/storage/innobase/gis/gis0sea.cc index 4128be7f27d..1c0728d4a7b 100644 --- a/storage/innobase/gis/gis0sea.cc +++ b/storage/innobase/gis/gis0sea.cc @@ -674,7 +674,7 @@ rtr_page_get_father( ulint page_no = btr_node_ptr_get_child_page_no(cursor->page_cur.rec, offsets); - ut_ad(page_no == block->page.id.page_no()); + ut_ad(page_no == block->page.id().page_no()); #else rtr_page_get_father_block( NULL, heap, index, block, mtr, sea_cur, cursor); @@ -820,7 +820,7 @@ rtr_page_get_father_node_ptr( dict_index_t* index; rtr_mbr_t mbr; - page_no = btr_cur_get_block(cursor)->page.id.page_no(); + page_no = btr_cur_get_block(cursor)->page.id().page_no(); index = btr_cur_get_index(cursor); ut_ad(srv_read_only_mode @@ -1198,7 +1198,7 @@ rtr_check_discard_page( the root page */ buf_block_t* block) /*!< in: block of page to be discarded */ { - const ulint pageno = block->page.id.page_no(); + const ulint pageno = block->page.id().page_no(); mutex_enter(&index->rtr_track->rtr_active_mutex); @@ -1219,7 +1219,7 @@ rtr_check_discard_page( if (rtr_info->matches) { mutex_enter(&rtr_info->matches->rtr_match_mutex); - if ((&rtr_info->matches->block)->page.id.page_no() + if ((&rtr_info->matches->block)->page.id().page_no() == pageno) { if (!rtr_info->matches->matched_recs->empty()) { rtr_info->matches->matched_recs->clear(); @@ -1494,7 +1494,7 @@ rtr_non_leaf_insert_stack_push( { node_seq_t new_seq; btr_pcur_t* my_cursor; - ulint page_no = block->page.id.page_no(); + ulint page_no = block->page.id().page_no(); my_cursor = static_cast<btr_pcur_t*>( ut_malloc_nokey(sizeof(*my_cursor))); @@ -1510,7 +1510,7 @@ rtr_non_leaf_insert_stack_push( my_cursor, mbr_inc); } -/** Copy a buf_block_t strcuture, except "block->lock" and "block->mutex". +/** Copy a buf_block_t, except "block->lock". 
@param[in,out] matches copy to match->block @param[in] block block to copy */ static @@ -1519,13 +1519,11 @@ rtr_copy_buf( matched_rec_t* matches, const buf_block_t* block) { - /* Copy all members of "block" to "matches->block" except "mutex" - and "lock". We skip "mutex" and "lock" because they are not used + /* Copy all members of "block" to "matches->block" except "lock". + We skip "lock" because it is not used from the dummy buf_block_t we create here and because memcpy()ing - them generates (valid) compiler warnings that the vtable pointer - will be copied. It is also undefined what will happen with the - newly memcpy()ed mutex if the source mutex was acquired by - (another) thread while it was copied. */ + it generates (valid) compiler warnings that the vtable pointer + will be copied. */ new (&matches->block.page) buf_page_t(block->page); matches->block.frame = block->frame; matches->block.unzip_LRU = block->unzip_LRU; @@ -1533,7 +1531,6 @@ rtr_copy_buf( ut_d(matches->block.in_unzip_LRU_list = block->in_unzip_LRU_list); ut_d(matches->block.in_withdraw_list = block->in_withdraw_list); - /* Skip buf_block_t::mutex */ /* Skip buf_block_t::lock */ matches->block.lock_hash_val = block->lock_hash_val; matches->block.modify_clock = block->modify_clock; @@ -1697,7 +1694,7 @@ rtr_cur_search_with_match( const rec_t* best_rec; const rec_t* last_match_rec = NULL; bool match_init = false; - ulint space = block->page.id.space(); + ulint space = block->page.id().space(); page_cur_mode_t orig_mode = mode; const rec_t* first_rec = NULL; diff --git a/storage/innobase/ha/ha0ha.cc b/storage/innobase/ha/ha0ha.cc index de271eabc31..f73de63a97a 100644 --- a/storage/innobase/ha/ha0ha.cc +++ b/storage/innobase/ha/ha0ha.cc @@ -199,6 +199,42 @@ static const ulint MAX_N_POINTERS = UNIV_PAGE_SIZE_MAX / REC_N_NEW_EXTRA_BYTES; # endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */ +# ifdef UNIV_DEBUG +/** Assert that the synchronization object in a hash operation involving +possible change in the 
hash table is held in exclusive mode */ +void hash_assert_can_modify(hash_table_t *table, ulint fold) +{ + switch (table->type) { + case HASH_TABLE_SYNC_MUTEX: + ut_ad(mutex_own(hash_get_mutex(table, fold))); + return; + case HASH_TABLE_SYNC_RW_LOCK: + ut_ad(buf_pool.page_hash_lock_own_flagged(fold, RW_LOCK_FLAG_X)); + return; + case HASH_TABLE_SYNC_NONE: + return; + } + ut_ad(0); +} + +/** Assert that the synchronization object in a hash operation involving +possible change in the hash table is held in shared or exclusive mode */ +void hash_assert_can_search(hash_table_t *table, ulint fold) +{ + switch (table->type) { + case HASH_TABLE_SYNC_MUTEX: + ut_ad(mutex_own(hash_get_mutex(table, fold))); + return; + case HASH_TABLE_SYNC_RW_LOCK: + ut_ad(buf_pool.page_hash_lock_own_flagged(fold, RW_LOCK_FLAG_X | + RW_LOCK_FLAG_S)); + return; + case HASH_TABLE_SYNC_NONE: + return; + } +} +# endif + /*************************************************************//** Inserts an entry into a hash table. If an entry with the same fold number is found, its node is updated to point to the new data, and no new node diff --git a/storage/innobase/ha/hash0hash.cc b/storage/innobase/ha/hash0hash.cc index 51f3db09922..17e443a8bc8 100644 --- a/storage/innobase/ha/hash0hash.cc +++ b/storage/innobase/ha/hash0hash.cc @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -27,67 +28,6 @@ Created 5/20/1997 Heikki Tuuri #include "mem0mem.h" #include "sync0sync.h" -/************************************************************//** -Reserves all the locks of a hash table, in an ascending order. 
*/ -void -hash_lock_x_all( -/*============*/ - hash_table_t* table) /*!< in: hash table */ -{ - ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK); - - for (ulint i = 0; i < table->n_sync_obj; i++) { - - rw_lock_t* lock = table->sync_obj.rw_locks + i; - - ut_ad(!rw_lock_own(lock, RW_LOCK_S)); - ut_ad(!rw_lock_own(lock, RW_LOCK_X)); - - rw_lock_x_lock(lock); - } -} - -/************************************************************//** -Releases all the locks of a hash table, in an ascending order. */ -void -hash_unlock_x_all( -/*==============*/ - hash_table_t* table) /*!< in: hash table */ -{ - ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK); - - for (ulint i = 0; i < table->n_sync_obj; i++) { - - rw_lock_t* lock = table->sync_obj.rw_locks + i; - - ut_ad(rw_lock_own(lock, RW_LOCK_X)); - - rw_lock_x_unlock(lock); - } -} - -/************************************************************//** -Releases all but passed in lock of a hash table, */ -void -hash_unlock_x_all_but( -/*==================*/ - hash_table_t* table, /*!< in: hash table */ - rw_lock_t* keep_lock) /*!< in: lock to keep */ -{ - ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK); - - for (ulint i = 0; i < table->n_sync_obj; i++) { - - rw_lock_t* lock = table->sync_obj.rw_locks + i; - - ut_ad(rw_lock_own(lock, RW_LOCK_X)); - - if (keep_lock != lock) { - rw_lock_x_unlock(lock); - } - } -} - /*************************************************************//** Creates a hash table with >= n array cells. The actual number of cells is chosen to be a prime number slightly bigger than n. 
diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index bea63919532..7e31f53ecd3 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -519,11 +519,7 @@ performance schema instrumented if "UNIV_PFS_MUTEX" is defined */ static PSI_mutex_info all_innodb_mutexes[] = { PSI_KEY(autoinc_mutex), -# ifndef PFS_SKIP_BUFFER_MUTEX_RWLOCK - PSI_KEY(buffer_block_mutex), -# endif /* !PFS_SKIP_BUFFER_MUTEX_RWLOCK */ PSI_KEY(buf_pool_mutex), - PSI_KEY(buf_pool_zip_mutex), PSI_KEY(cache_last_read_mutex), PSI_KEY(dict_foreign_err_mutex), PSI_KEY(dict_sys_mutex), @@ -17753,7 +17749,7 @@ func_exit: space->zip_size(), RW_X_LATCH, &mtr); if (block != NULL) { - ib::info() << "Dirtying page: " << block->page.id; + ib::info() << "Dirtying page: " << block->page.id(); mtr.write<1,mtr_t::FORCED>(*block, block->frame + FIL_PAGE_SPACE_ID, block->frame[FIL_PAGE_SPACE_ID]); @@ -18225,7 +18221,7 @@ static bool innodb_buffer_pool_evict_uncompressed() for (buf_block_t* block = UT_LIST_GET_LAST(buf_pool.unzip_LRU); block != NULL; ) { buf_block_t* prev_block = UT_LIST_GET_PREV(unzip_LRU, block); - ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE); ut_ad(block->in_unzip_LRU_list); ut_ad(block->page.in_LRU_list); diff --git a/storage/innobase/handler/handler0alter.cc b/storage/innobase/handler/handler0alter.cc index 2d8c7c3f942..a137eaf4406 100644 --- a/storage/innobase/handler/handler0alter.cc +++ b/storage/innobase/handler/handler0alter.cc @@ -5905,7 +5905,7 @@ empty_table: /* The table is empty. */ ut_ad(fil_page_index_page_check(block->frame)); ut_ad(!page_has_siblings(block->frame)); - ut_ad(block->page.id.page_no() == index->page); + ut_ad(block->page.id().page_no() == index->page); /* MDEV-17383: free metadata BLOBs! 
*/ btr_page_empty(block, NULL, index, 0, &mtr); if (index->is_instant()) { diff --git a/storage/innobase/handler/i_s.cc b/storage/innobase/handler/i_s.cc index c7e11311348..41764afbee6 100644 --- a/storage/innobase/handler/i_s.cc +++ b/storage/innobase/handler/i_s.cc @@ -117,7 +117,6 @@ struct buf_page_info_t{ /** page identifier */ page_id_t id; unsigned access_time:32; /*!< Time of first access */ - unsigned flush_type:2; /*!< Flush type */ unsigned io_fix:2; /*!< type of pending I/O operation */ uint32_t fix_count; /*!< Count of how manyfold this block is bufferfixed */ @@ -131,7 +130,7 @@ struct buf_page_info_t{ buf_pool.freed_page_clock */ unsigned zip_ssize:PAGE_ZIP_SSIZE_BITS; /*!< Compressed page size */ - unsigned page_state:BUF_PAGE_STATE_BITS; /*!< Page state */ + unsigned page_state:3; /*!< Page state */ unsigned page_type:I_S_PAGE_TYPE_BITS; /*!< Page type */ unsigned num_recs:UNIV_PAGE_SIZE_SHIFT_MAX-2; /*!< Number of records on Page */ @@ -3840,17 +3839,16 @@ UNIV_INTERN struct st_maria_plugin i_s_innodb_buffer_stats = STRUCT_FLD(maturity, MariaDB_PLUGIN_MATURITY_STABLE), }; -/** These must correspond to the last values of buf_page_state */ +/** These must correspond to the first values of buf_page_state */ static const LEX_CSTRING page_state_values[] = { - { STRING_WITH_LEN("NOT_USED") }, - { STRING_WITH_LEN("READY_FOR_USE") }, - { STRING_WITH_LEN("FILE_PAGE") }, - { STRING_WITH_LEN("MEMORY") }, - { STRING_WITH_LEN("REMOVE_HASH") } + { STRING_WITH_LEN("NOT_USED") }, + { STRING_WITH_LEN("MEMORY") }, + { STRING_WITH_LEN("REMOVE_HASH") }, + { STRING_WITH_LEN("FILE_PAGE") }, }; -static const TypelibBuffer<5> page_state_values_typelib(page_state_values); +static const TypelibBuffer<4> page_state_values_typelib(page_state_values); static const LEX_CSTRING io_values[] = { @@ -3982,8 +3980,7 @@ i_s_innodb_buffer_page_fill( fields[IDX_BUFFER_PAGE_TYPE], i_s_page_type[page_info->page_type].type_str)); - OK(fields[IDX_BUFFER_PAGE_FLUSH_TYPE]->store( - 
page_info->flush_type, true)); + OK(fields[IDX_BUFFER_PAGE_FLUSH_TYPE]->store(0, true)); OK(fields[IDX_BUFFER_PAGE_FIX_COUNT]->store( page_info->fix_count, true)); @@ -4058,15 +4055,10 @@ i_s_innodb_buffer_page_fill( page_info->zip_ssize ? (UNIV_ZIP_SIZE_MIN >> 1) << page_info->zip_ssize : 0, true)); - compile_time_assert(BUF_PAGE_STATE_BITS == 3); - /* First three states are for compression pages and - are not states we would get as we scan pages through - buffer blocks */ OK(fields[IDX_BUFFER_PAGE_STATE]->store( - page_info->page_state >= BUF_BLOCK_NOT_USED - ? page_info->page_state - (BUF_BLOCK_NOT_USED - 1) - : 0, true)); + 1 + std::min<unsigned>(page_info->page_state, + BUF_BLOCK_FILE_PAGE), true)); OK(fields[IDX_BUFFER_PAGE_IO_FIX]->store( 1 + page_info->io_fix, true)); @@ -4153,33 +4145,40 @@ i_s_innodb_buffer_page_get_info( { page_info->block_id = pos; - page_info->page_state = buf_page_get_state(bpage) & 7; + compile_time_assert(BUF_BLOCK_NOT_USED == 0); + compile_time_assert(BUF_BLOCK_MEMORY == 1); + compile_time_assert(BUF_BLOCK_REMOVE_HASH == 2); + compile_time_assert(BUF_BLOCK_FILE_PAGE == 3); + compile_time_assert(BUF_BLOCK_ZIP_PAGE == 4); - /* Only fetch information for buffers that map to a tablespace, - that is, buffer page with state BUF_BLOCK_ZIP_PAGE, - BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_FILE_PAGE */ - if (buf_page_in_file(bpage)) { - const byte* frame; + auto state = bpage->state(); + page_info->page_state= int{state} & 7; - page_info->id = bpage->id; + switch (state) { + default: + page_info->page_type = I_S_PAGE_TYPE_UNKNOWN; + break; + case BUF_BLOCK_FILE_PAGE: + case BUF_BLOCK_ZIP_PAGE: + const byte* frame; - page_info->flush_type = bpage->flush_type; + page_info->id = bpage->id(); - page_info->fix_count = bpage->buf_fix_count; + page_info->fix_count = bpage->buf_fix_count(); - page_info->oldest_mod = bpage->oldest_modification; + page_info->oldest_mod = bpage->oldest_modification(); page_info->access_time = bpage->access_time; 
page_info->zip_ssize = bpage->zip.ssize; - page_info->io_fix = bpage->io_fix & 3; + page_info->io_fix = bpage->io_fix() & 3; page_info->is_old = bpage->old; page_info->freed_page_clock = bpage->freed_page_clock; - switch (buf_page_get_io_fix(bpage)) { + switch (bpage->io_fix()) { case BUF_IO_NONE: case BUF_IO_WRITE: case BUF_IO_PIN: @@ -4190,7 +4189,7 @@ i_s_innodb_buffer_page_get_info( return; } - if (page_info->page_state == BUF_BLOCK_FILE_PAGE) { + if (state == BUF_BLOCK_FILE_PAGE) { const buf_block_t*block; block = reinterpret_cast<const buf_block_t*>(bpage); @@ -4209,8 +4208,6 @@ i_s_innodb_buffer_page_get_info( page_info->newest_mod = mach_read_from_8(FIL_PAGE_LSN + frame); i_s_innodb_set_page_type(page_info, frame); - } else { - page_info->page_type = I_S_PAGE_TYPE_UNKNOWN; } } @@ -4394,7 +4391,7 @@ static ST_FIELD_INFO i_s_innodb_buf_page_lru_fields_info[] = Column("PAGE_TYPE", Varchar(64), NULLABLE), #define IDX_BUF_LRU_PAGE_FLUSH_TYPE 5 - Column("FLUSH_TYPE", ULonglong(), NOT_NULL), + Column("FLUSH_TYPE", ULong(), NOT_NULL), #define IDX_BUF_LRU_PAGE_FIX_COUNT 6 Column("FIX_COUNT", ULong(), NOT_NULL), @@ -4487,8 +4484,7 @@ i_s_innodb_buf_page_lru_fill( fields[IDX_BUF_LRU_PAGE_TYPE], i_s_page_type[page_info->page_type].type_str)); - OK(fields[IDX_BUF_LRU_PAGE_FLUSH_TYPE]->store( - page_info->flush_type, true)); + OK(fields[IDX_BUF_LRU_PAGE_FLUSH_TYPE]->store(0, true)); OK(fields[IDX_BUF_LRU_PAGE_FIX_COUNT]->store( page_info->fix_count, true)); @@ -4564,8 +4560,7 @@ i_s_innodb_buf_page_lru_fill( ? 
512 << page_info->zip_ssize : 0, true)); OK(fields[IDX_BUF_LRU_PAGE_STATE]->store( - page_info->page_state == BUF_BLOCK_ZIP_PAGE - || page_info->page_state == BUF_BLOCK_ZIP_DIRTY, + page_info->page_state == BUF_BLOCK_ZIP_PAGE, true)); OK(fields[IDX_BUF_LRU_PAGE_IO_FIX]->store( @@ -4612,7 +4607,7 @@ static int i_s_innodb_fill_buffer_lru(THD *thd, TABLE_LIST *tables, Item *) /* Print error message if malloc fail */ info_buffer = (buf_page_info_t*) my_malloc(PSI_INSTRUMENT_ME, - lru_len * sizeof *info_buffer, MYF(MY_WME | MY_ZEROFILL));; + lru_len * sizeof *info_buffer, MYF(MY_WME | MY_ZEROFILL)); if (!info_buffer) { status = 1; @@ -7189,20 +7184,12 @@ i_s_innodb_mutexes_fill_table( #ifdef JAN_TODO_FIXME ib_mutex_t* mutex; - ulint block_mutex_oswait_count = 0; - ib_mutex_t* block_mutex = NULL; for (mutex = UT_LIST_GET_FIRST(os_mutex_list); mutex != NULL; mutex = UT_LIST_GET_NEXT(list, mutex)) { if (mutex->count_os_wait == 0) { continue; } - if (buf_pool.is_block_mutex(mutex)) { - block_mutex = mutex; - block_mutex_oswait_count += mutex->count_os_wait; - continue; - } - OK(field_store_string(fields[MUTEXES_NAME], mutex->cmutex_name)); OK(field_store_string(fields[MUTEXES_CREATE_FILE], innobase_basename(mutex->cfile_name))); @@ -7213,20 +7200,6 @@ i_s_innodb_mutexes_fill_table( OK(schema_table_store_record(thd, tables->table)); } - if (block_mutex) { - char buf1[IO_SIZE]; - - snprintf(buf1, sizeof buf1, "combined %s", - innobase_basename(block_mutex->cfile_name)); - - OK(field_store_string(fields[MUTEXES_NAME], block_mutex->cmutex_name)); - OK(field_store_string(fields[MUTEXES_CREATE_FILE], buf1)); - OK(fields[MUTEXES_CREATE_LINE]->store(block_mutex->cline, true)); - fields[MUTEXES_CREATE_LINE]->set_notnull(); - OK(fields[MUTEXES_OS_WAITS]->store((longlong)block_mutex_oswait_count), true); - OK(schema_table_store_record(thd, tables->table)); - } - mutex_exit(&mutex_list_mutex); #endif /* JAN_TODO_FIXME */ diff --git a/storage/innobase/ibuf/ibuf0ibuf.cc 
b/storage/innobase/ibuf/ibuf0ibuf.cc index e09866c32c5..c2768d2bfb9 100644 --- a/storage/innobase/ibuf/ibuf0ibuf.cc +++ b/storage/innobase/ibuf/ibuf0ibuf.cc @@ -730,7 +730,7 @@ ibuf_set_free_bits_low( ulint val, /*!< in: value to set: < 4 */ mtr_t* mtr) /*!< in/out: mtr */ { - ut_ad(mtr->is_named_space(block->page.id.space())); + ut_ad(mtr->is_named_space(block->page.id().space())); if (!page_is_leaf(block->frame)) { return; } @@ -738,11 +738,11 @@ ibuf_set_free_bits_low( #ifdef UNIV_IBUF_DEBUG ut_a(val <= ibuf_index_page_calc_free(block)); #endif /* UNIV_IBUF_DEBUG */ + const page_id_t id(block->page.id()); ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>( - ibuf_bitmap_get_map_page(block->page.id, block->zip_size(), - mtr), - block->page.id, block->physical_size(), val, mtr); + ibuf_bitmap_get_map_page(id, block->zip_size(), mtr), + id, block->physical_size(), val, mtr); } /************************************************************************//** @@ -768,10 +768,11 @@ ibuf_set_free_bits_func( mtr_t mtr; mtr.start(); - const fil_space_t* space = mtr.set_named_space_id( - block->page.id.space()); + const page_id_t id(block->page.id()); + + const fil_space_t* space = mtr.set_named_space_id(id.space()); - buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(block->page.id, + buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(id, block->zip_size(), &mtr); @@ -784,7 +785,7 @@ ibuf_set_free_bits_func( ulint old_val; old_val = ibuf_bitmap_page_get_bits( - bitmap_page, block->page.id, + bitmap_page, id, IBUF_BITMAP_FREE, &mtr); ut_a(old_val <= max_val); } @@ -793,7 +794,7 @@ ibuf_set_free_bits_func( #endif /* UNIV_IBUF_DEBUG */ ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>( - bitmap_page, block->page.id, block->physical_size(), + bitmap_page, id, block->physical_size(), val, &mtr); mtr.commit(); @@ -841,7 +842,7 @@ ibuf_update_free_bits_low( ulint after; ut_a(!is_buf_block_get_page_zip(block)); - ut_ad(mtr->is_named_space(block->page.id.space())); + 
ut_ad(mtr->is_named_space(block->page.id().space())); before = ibuf_index_page_calc_free_bits(srv_page_size, max_ins_size); @@ -886,9 +887,9 @@ ibuf_update_free_bits_zip( } ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>( - ibuf_bitmap_get_map_page(block->page.id, block->zip_size(), + ibuf_bitmap_get_map_page(block->page.id(), block->zip_size(), mtr), - block->page.id, block->physical_size(), after, mtr); + block->page.id(), block->physical_size(), after, mtr); } /**********************************************************************//** @@ -907,8 +908,8 @@ ibuf_update_free_bits_for_two_pages_low( { ulint state; - ut_ad(mtr->is_named_space(block1->page.id.space())); - ut_ad(block1->page.id.space() == block2->page.id.space()); + ut_ad(mtr->is_named_space(block1->page.id().space())); + ut_ad(block1->page.id().space() == block2->page.id().space()); /* As we have to x-latch two random bitmap pages, we have to acquire the bitmap mutex to prevent a deadlock with a similar operation @@ -1877,7 +1878,7 @@ static bool ibuf_add_free_page() /* Set the bit indicating that this page is now an ibuf tree page (level 2 page) */ - const page_id_t page_id(IBUF_SPACE_ID, block->page.id.page_no()); + const page_id_t page_id(block->page.id()); buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(page_id, 0, &mtr); mutex_exit(&ibuf_mutex); @@ -2422,7 +2423,7 @@ ibuf_merge_pages( and the whole B-tree must be empty. InnoDB does not allow empty B-tree pages other than the root. */ ut_ad(ibuf.empty); - ut_ad(btr_pcur_get_block(&pcur)->page.id + ut_ad(btr_pcur_get_block(&pcur)->page.id() == page_id_t(IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO)); ibuf_mtr_commit(&mtr); @@ -2481,7 +2482,7 @@ ibuf_merge_space( and the whole B-tree must be empty. InnoDB does not allow empty B-tree pages other than the root. 
*/ ut_ad(ibuf.empty); - ut_ad(btr_pcur_get_block(&pcur)->page.id + ut_ad(btr_pcur_get_block(&pcur)->page.id() == page_id_t(IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO)); } else { @@ -3291,7 +3292,7 @@ ibuf_insert_low( const ulint physical_size = zip_size ? zip_size : srv_page_size; if (op == IBUF_OP_DELETE - && (min_n_recs < 2 || buf_pool_watch_occurred(page_id))) { + && (min_n_recs < 2 || buf_pool.watch_occurred(page_id))) { /* The page could become empty after the record is deleted, or the page has been read in to the buffer pool. Refuse to buffer the operation. */ @@ -3323,7 +3324,7 @@ fail_exit: buffer pool, but we do not have to care about it, since we are holding a latch on the insert buffer leaf page that contains buffered changes for (space, page_no). If the page enters the - buffer pool, buf_page_io_complete() for (space, page_no) will + buffer pool, buf_page_read_complete() for (space, page_no) will have to acquire a latch on the same insert buffer leaf page, which it cannot do until we have buffered the IBUF_OP_DELETE and done mtr_commit(&mtr) to release the latch. */ @@ -3404,10 +3405,10 @@ fail_exit: ibuf_entry, &ins_rec, &dummy_big_rec, 0, thr, &mtr); block = btr_cur_get_block(cursor); - ut_ad(block->page.id.space() == IBUF_SPACE_ID); + ut_ad(block->page.id().space() == IBUF_SPACE_ID); /* If this is the root page, update ibuf.empty. */ - if (block->page.id.page_no() == FSP_IBUF_TREE_ROOT_PAGE_NO) { + if (block->page.id().page_no() == FSP_IBUF_TREE_ROOT_PAGE_NO) { const page_t* root = buf_block_get_frame(block); ut_ad(page_get_space_id(root) == IBUF_SPACE_ID); @@ -3447,7 +3448,7 @@ fail_exit: ibuf.empty = page_is_empty(root); block = btr_cur_get_block(cursor); - ut_ad(block->page.id.space() == IBUF_SPACE_ID); + ut_ad(block->page.id().space() == IBUF_SPACE_ID); } if (offsets_heap) { @@ -3669,13 +3670,13 @@ ibuf_insert_to_index_page_low( "InnoDB: is now probably corrupt. 
Please run CHECK TABLE on\n" "InnoDB: that table.\n", stderr); - ib::error() << "page " << block->page.id << ", size " + ib::error() << "page " << block->page.id() << ", size " << block->physical_size() << ", bitmap bits " << ibuf_bitmap_page_get_bits( - ibuf_bitmap_get_map_page(block->page.id, + ibuf_bitmap_get_map_page(block->page.id(), block->zip_size(), mtr)->frame, - block->page.id, block->zip_size(), + block->page.id(), block->zip_size(), IBUF_BITMAP_FREE, mtr); ib::error() << BUG_REPORT_MSG; @@ -3707,8 +3708,8 @@ ibuf_insert_to_index_page( DBUG_ENTER("ibuf_insert_to_index_page"); DBUG_PRINT("ibuf", ("page " UINT32PF ":" UINT32PF, - block->page.id.space(), - block->page.id.page_no())); + block->page.id().space(), + block->page.id().page_no())); ut_ad(!dict_index_is_online_ddl(index));// this is an ibuf_dummy index ut_ad(ibuf_inside(mtr)); @@ -3720,7 +3721,7 @@ ibuf_insert_to_index_page( ut_ad(!block->index); assert_block_ahi_empty(block); #endif /* BTR_CUR_HASH_ADAPT */ - ut_ad(mtr->is_named_space(block->page.id.space())); + ut_ad(mtr->is_named_space(block->page.id().space())); if (UNIV_UNLIKELY(dict_table_is_comp(index->table) != (ibool)!!page_is_comp(page))) { @@ -3905,7 +3906,7 @@ ibuf_set_del_mark( "InnoDB: record ", stderr); rec_print(stderr, page_cur_get_rec(&page_cur), index); - ib::error() << "page " << block->page.id << " (" + ib::error() << "page " << block->page.id() << " (" << page_get_n_recs(page) << " records, index id " << btr_page_get_index_id(page) << ")."; @@ -3968,8 +3969,8 @@ ibuf_delete( " (%u records, index id %llu)\n" "InnoDB: Submit a detailed bug report" " to https://jira.mariadb.org/\n", - block->page.id.space(), - block->page.id.page_no(), + block->page.id().space(), + block->page.id().page_no(), (unsigned) page_get_n_recs(page), (ulonglong) btr_page_get_index_id(page)); @@ -4203,8 +4204,8 @@ ibuf_merge_or_delete_for_page( ulint mops[IBUF_OP_COUNT]; ulint dops[IBUF_OP_COUNT]; - ut_ad(!block || page_id == block->page.id); - 
ut_ad(!block || buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + ut_ad(!block || page_id == block->page.id()); + ut_ad(!block || block->page.state() == BUF_BLOCK_FILE_PAGE); ut_ad(!block || block->page.status == buf_page_t::NORMAL); if (trx_sys_hdr_page(page_id) @@ -4788,20 +4789,20 @@ ibuf_set_bitmap_for_bulk_load( free_val = ibuf_index_page_calc_free(block); mtr.start(); - fil_space_t* space = mtr.set_named_space_id(block->page.id.space()); + fil_space_t* space = mtr.set_named_space_id(block->page.id().space()); - buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(block->page.id, + buf_block_t* bitmap_page = ibuf_bitmap_get_map_page(block->page.id(), space->zip_size(), &mtr); free_val = reset ? 0 : ibuf_index_page_calc_free(block); /* FIXME: update the bitmap byte only once! */ ibuf_bitmap_page_set_bits<IBUF_BITMAP_FREE>( - bitmap_page, block->page.id, block->physical_size(), + bitmap_page, block->page.id(), block->physical_size(), free_val, &mtr); ibuf_bitmap_page_set_bits<IBUF_BITMAP_BUFFERED>( - bitmap_page, block->page.id, block->physical_size(), + bitmap_page, block->page.id(), block->physical_size(), false, &mtr); mtr.commit(); diff --git a/storage/innobase/include/btr0cur.ic b/storage/innobase/include/btr0cur.ic index efd7da6a2d0..d49703dc7ee 100644 --- a/storage/innobase/include/btr0cur.ic +++ b/storage/innobase/include/btr0cur.ic @@ -147,7 +147,7 @@ btr_cur_compress_recommendation( root page. */ return cursor->index->page - != btr_cur_get_block(cursor)->page.id.page_no(); + != btr_cur_get_block(cursor)->page.id().page_no(); } return(FALSE); @@ -182,7 +182,7 @@ btr_cur_can_delete_without_compress( compression if this is not the root page. 
*/ return cursor->index->page - == btr_cur_get_block(cursor)->page.id.page_no(); + == btr_cur_get_block(cursor)->page.id().page_no(); } return(TRUE); diff --git a/storage/innobase/include/buf0buddy.h b/storage/innobase/include/buf0buddy.h index bd1cc06aca6..17563ebbb7e 100644 --- a/storage/innobase/include/buf0buddy.h +++ b/storage/innobase/include/buf0buddy.h @@ -56,8 +56,7 @@ buf_buddy_get_slot(ulint size) byte *buf_buddy_alloc_low(ulint i, bool *lru) MY_ATTRIBUTE((malloc)); /** Allocate a ROW_FORMAT=COMPRESSED block. -The caller must not hold buf_pool.mutex nor buf_pool.zip_mutex nor any -block->mutex. +The caller must not hold buf_pool.mutex. @param[in] size compressed page size @param[out] lru whether buf_pool.mutex was temporarily released @return allocated block, never NULL */ diff --git a/storage/innobase/include/buf0buf.h b/storage/innobase/include/buf0buf.h index 3a169cd0fe2..a1b8fc5add2 100644 --- a/storage/innobase/include/buf0buf.h +++ b/storage/innobase/include/buf0buf.h @@ -70,9 +70,6 @@ struct fil_addr_t; #define BUF_EVICT_IF_IN_POOL 20 /*!< evict a clean block if found */ /* @} */ -#define BUF_POOL_WATCH_SIZE (srv_n_purge_threads + 1) - /*!< Maximum number of concurrent - buffer pool watches */ #define MAX_PAGE_HASH_LOCKS 1024 /*!< The maximum number of page_hash locks */ @@ -81,30 +78,20 @@ extern my_bool buf_disable_resize_buffer_pool_debug; /*!< if TRUE, resizing buffer pool is not allowed. */ # endif /* UNIV_DEBUG */ -/** @brief States of a control block -@see buf_page_t - -The enumeration values must be 0..7. 
*/ -enum buf_page_state { - BUF_BLOCK_POOL_WATCH, /*!< a sentinel for the buffer pool - watch, element of buf_pool.watch[] */ - BUF_BLOCK_ZIP_PAGE, /*!< contains a clean - compressed page */ - BUF_BLOCK_ZIP_DIRTY, /*!< contains a compressed - page that is in the - buf_pool.flush_list */ - - BUF_BLOCK_NOT_USED, /*!< is in the free list; - must be after the BUF_BLOCK_ZIP_ - constants for compressed-only pages - @see buf_block_state_valid() */ - BUF_BLOCK_READY_FOR_USE, /*!< when buf_LRU_get_free_block - returns a block, it is in this state */ - BUF_BLOCK_FILE_PAGE, /*!< contains a buffered file page */ - BUF_BLOCK_MEMORY, /*!< contains some main memory - object */ - BUF_BLOCK_REMOVE_HASH /*!< hash index should be removed - before putting to the free list */ +/** buf_page_t::state() values, distinguishing buf_page_t and buf_block_t */ +enum buf_page_state +{ + /** available in buf_pool.free or buf_pool.watch */ + BUF_BLOCK_NOT_USED, + /** allocated for something else than a file page */ + BUF_BLOCK_MEMORY, + /** a previously allocated file page, in transit to NOT_USED */ + BUF_BLOCK_REMOVE_HASH, + /** a buf_block_t that is also in buf_pool.LRU */ + BUF_BLOCK_FILE_PAGE, + /** the buf_page_t of a ROW_FORMAT=COMPRESSED page + whose uncompressed page frame has been evicted */ + BUF_BLOCK_ZIP_PAGE }; /** This structure defines information we will fetch from each buffer pool. It @@ -218,8 +205,7 @@ buf_page_free_descriptor( /** Allocate a buffer block. @return own: the allocated block, in state BUF_BLOCK_MEMORY */ -buf_block_t* -buf_block_alloc(); +inline buf_block_t *buf_block_alloc(); /********************************************************************//** Frees a buffer block which does not contain a file page. 
*/ UNIV_INLINE @@ -637,16 +623,6 @@ buf_block_get_lock_hash_val( buf_block_t* buf_pool_contains_zip(const void* data); #endif /* UNIV_DEBUG */ -/*********************************************************************** -FIXME_FTS: Gets the frame the pointer is pointing to. */ -UNIV_INLINE -buf_frame_t* -buf_frame_align( -/*============*/ - /* out: pointer to frame */ - byte* ptr); /* in: pointer to a frame */ - - /** Dump a page to stderr. @param[in] read_buf database page @param[in] zip_size compressed page size, or 0 */ @@ -705,207 +681,6 @@ buf_block_dbg_add_level( #else /* UNIV_DEBUG */ # define buf_block_dbg_add_level(block, level) /* nothing */ #endif /* UNIV_DEBUG */ -/*********************************************************************//** -Gets the state of a block. -@return state */ -UNIV_INLINE -enum buf_page_state -buf_page_get_state( -/*===============*/ - const buf_page_t* bpage); /*!< in: pointer to the control - block */ -/*********************************************************************//** -Gets the state of a block. -@return state */ -UNIV_INLINE -enum buf_page_state -buf_block_get_state( -/*================*/ - const buf_block_t* block) /*!< in: pointer to the control block */ - MY_ATTRIBUTE((warn_unused_result)); -/*********************************************************************//** -Sets the state of a block. */ -UNIV_INLINE -void -buf_page_set_state( -/*===============*/ - buf_page_t* bpage, /*!< in/out: pointer to control block */ - enum buf_page_state state); /*!< in: state */ -/*********************************************************************//** -Sets the state of a block. */ -UNIV_INLINE -void -buf_block_set_state( -/*================*/ - buf_block_t* block, /*!< in/out: pointer to control block */ - enum buf_page_state state); /*!< in: state */ -/*********************************************************************//** -Determines if a block is mapped to a tablespace. 
-@return TRUE if mapped */ -UNIV_INLINE -ibool -buf_page_in_file( -/*=============*/ - const buf_page_t* bpage) /*!< in: pointer to control block */ - MY_ATTRIBUTE((warn_unused_result)); - -/*********************************************************************//** -Determines if a block should be on unzip_LRU list. -@return TRUE if block belongs to unzip_LRU */ -UNIV_INLINE -ibool -buf_page_belongs_to_unzip_LRU( -/*==========================*/ - const buf_page_t* bpage) /*!< in: pointer to control block */ - MY_ATTRIBUTE((warn_unused_result)); - -/*********************************************************************//** -Gets the mutex of a block. -@return pointer to mutex protecting bpage */ -UNIV_INLINE -BPageMutex* -buf_page_get_mutex( -/*===============*/ - const buf_page_t* bpage) /*!< in: pointer to control block */ - MY_ATTRIBUTE((warn_unused_result)); - -/*********************************************************************//** -Get the flush type of a page. -@return flush type */ -UNIV_INLINE -buf_flush_t -buf_page_get_flush_type( -/*====================*/ - const buf_page_t* bpage) /*!< in: buffer page */ - MY_ATTRIBUTE((warn_unused_result)); -/*********************************************************************//** -Set the flush type of a page. */ -UNIV_INLINE -void -buf_page_set_flush_type( -/*====================*/ - buf_page_t* bpage, /*!< in: buffer page */ - buf_flush_t flush_type); /*!< in: flush type */ - -/** Map a block to a file page. -@param[in,out] block pointer to control block -@param[in] page_id page id */ -UNIV_INLINE -void -buf_block_set_file_page( - buf_block_t* block, - const page_id_t page_id); - -/*********************************************************************//** -Gets the io_fix state of a block. 
-@return io_fix state */ -UNIV_INLINE -enum buf_io_fix -buf_page_get_io_fix( -/*================*/ - const buf_page_t* bpage) /*!< in: pointer to the control block */ - MY_ATTRIBUTE((warn_unused_result)); -/*********************************************************************//** -Gets the io_fix state of a block. -@return io_fix state */ -UNIV_INLINE -enum buf_io_fix -buf_block_get_io_fix( -/*================*/ - const buf_block_t* block) /*!< in: pointer to the control block */ - MY_ATTRIBUTE((warn_unused_result)); -/*********************************************************************//** -Sets the io_fix state of a block. */ -UNIV_INLINE -void -buf_page_set_io_fix( -/*================*/ - buf_page_t* bpage, /*!< in/out: control block */ - enum buf_io_fix io_fix);/*!< in: io_fix state */ -/*********************************************************************//** -Sets the io_fix state of a block. */ -UNIV_INLINE -void -buf_block_set_io_fix( -/*=================*/ - buf_block_t* block, /*!< in/out: control block */ - enum buf_io_fix io_fix);/*!< in: io_fix state */ -/*********************************************************************//** -Makes a block sticky. A sticky block implies that even after we release -the buf_pool.mutex and the block->mutex: -* it cannot be removed from the flush_list -* the block descriptor cannot be relocated -* it cannot be removed from the LRU list -Note that: -* the block can still change its position in the LRU list -* the next and previous pointers can change. */ -UNIV_INLINE -void -buf_page_set_sticky( -/*================*/ - buf_page_t* bpage); /*!< in/out: control block */ -/*********************************************************************//** -Removes stickiness of a block. 
*/ -UNIV_INLINE -void -buf_page_unset_sticky( -/*==================*/ - buf_page_t* bpage); /*!< in/out: control block */ -/********************************************************************//** -Determine if a buffer block can be relocated in memory. The block -can be dirty, but it must not be I/O-fixed or bufferfixed. */ -UNIV_INLINE -ibool -buf_page_can_relocate( -/*==================*/ - const buf_page_t* bpage) /*!< control block being relocated */ - MY_ATTRIBUTE((warn_unused_result)); - -/*********************************************************************//** -Determine if a block has been flagged old. -@return TRUE if old */ -UNIV_INLINE -ibool -buf_page_is_old( -/*============*/ - const buf_page_t* bpage) /*!< in: control block */ - MY_ATTRIBUTE((warn_unused_result)); -/*********************************************************************//** -Flag a block old. */ -UNIV_INLINE -void -buf_page_set_old( -/*=============*/ - buf_page_t* bpage, /*!< in/out: control block */ - bool old); /*!< in: old */ -/*********************************************************************//** -Determine the time of first access of a block in the buffer pool. -@return ut_time_ms() at the time of first access, 0 if not accessed */ -UNIV_INLINE -unsigned -buf_page_is_accessed( -/*=================*/ - const buf_page_t* bpage) /*!< in: control block */ - MY_ATTRIBUTE((warn_unused_result)); -/*********************************************************************//** -Flag a block accessed. */ -UNIV_INLINE -void -buf_page_set_accessed( -/*==================*/ - buf_page_t* bpage) /*!< in/out: control block */ - MY_ATTRIBUTE((nonnull)); -/*********************************************************************//** -Gets the buf_block_t handle of a buffered file block if an uncompressed -page frame exists, or NULL. Note: even though bpage is not declared a -const we don't update its value. 
-@return control block, or NULL */ -UNIV_INLINE -buf_block_t* -buf_page_get_block( -/*===============*/ - buf_page_t* bpage) /*!< in: control block, or NULL */ - MY_ATTRIBUTE((warn_unused_result)); #ifdef UNIV_DEBUG /*********************************************************************//** @@ -929,52 +704,21 @@ if applicable. */ #define is_buf_block_get_page_zip(block) \ UNIV_LIKELY_NULL((block)->page.zip.data) -/** Initialize a page for read to the buffer buf_pool. If the page is -(1) already in buf_pool, or -(2) if we specify to read only ibuf pages and the page is not an ibuf page, or -(3) if the space is deleted or being deleted, -then this function does nothing. -Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock -on the buffer frame. The io-handler must take care that the flag is cleared -and the lock released later. -@param[out] err DB_SUCCESS or DB_TABLESPACE_DELETED -@param[in] mode BUF_READ_IBUF_PAGES_ONLY, ... -@param[in] page_id page id -@param[in] zip_size ROW_FORMAT=COMPRESSED page size, or 0 -@param[in] unzip whether the uncompressed page is - requested (for ROW_FORMAT=COMPRESSED) -@return pointer to the block -@retval NULL in case of an error */ -buf_page_t* -buf_page_init_for_read( - dberr_t* err, - ulint mode, - const page_id_t page_id, - ulint zip_size, - bool unzip); +/** Monitor the buffer page read/write activity, and increment corresponding +counter value in MONITOR_MODULE_BUF_PAGE. +@param bpage buffer page whose read or write was completed +@param io_type BUF_IO_READ or BUF_IO_WRITE */ +ATTRIBUTE_COLD __attribute__((nonnull)) +void buf_page_monitor(const buf_page_t *bpage, buf_io_fix io_type); -/** Complete a read or write request of a file page to or from the buffer pool. -@param[in,out] bpage page to complete -@param[in] dblwr whether the doublewrite buffer was used (on write) -@param[in] evict whether or not to evict the page from LRU list +/** Complete a read request of a file page to buf_pool. 
+@param bpage recently read page +@param node data file @return whether the operation succeeded -@retval DB_SUCCESS always when writing, or if a read page was OK -@retval DB_PAGE_CORRUPTED if the checksum fails on a page read -@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but - after decryption normal page checksum does - not match */ -UNIV_INTERN -dberr_t -buf_page_io_complete( - buf_page_t* bpage, - bool dblwr = false, - bool evict = false) - MY_ATTRIBUTE((nonnull)); - -/** Returns the control block of a file page, NULL if not found. -@param[in] page_id page id -@return block, NULL if not found */ -inline buf_page_t *buf_page_hash_get_low(const page_id_t page_id); +@retval DB_SUCCESS always when writing, or if a read page was OK +@retval DB_PAGE_CORRUPTED if the checksum fails on a page read +@retval DB_DECRYPTION_FAILED if the page cannot be decrypted */ +dberr_t buf_page_read_complete(buf_page_t *bpage, const fil_node_t &node); /** Returns the control block of a file page, NULL if not found. If the block is found and lock is not NULL then the appropriate @@ -1022,7 +766,7 @@ buf_block_hash_get_locked( /* There are four different ways we can try to get a bpage or block from the page hash: 1) Caller already holds the appropriate page hash lock: in the case call -buf_page_hash_get_low() function. +buf_pool_t::page_hash_get_low(). 2) Caller wants to hold page hash lock in x-mode 3) Caller wants to hold page hash lock in s-mode 4) Caller doesn't want to hold page hash lock */ @@ -1031,35 +775,16 @@ buf_page_hash_get_low() function. 
#define buf_page_hash_get_x_locked(page_id, l) \ buf_page_hash_get_locked(page_id, l, RW_LOCK_X) #define buf_page_hash_get(page_id) \ - buf_page_hash_get_locked(page_id, NULL, 0) + buf_page_hash_get_locked(page_id, nullptr, RW_LOCK_S) #define buf_page_get_also_watch(page_id) \ - buf_page_hash_get_locked(page_id, NULL, 0, true) + buf_page_hash_get_locked(page_id, nullptr, RW_LOCK_S, true) #define buf_block_hash_get_s_locked(page_id, l) \ buf_block_hash_get_locked(page_id, l, RW_LOCK_S) #define buf_block_hash_get_x_locked(page_id, l) \ buf_block_hash_get_locked(page_id, l, RW_LOCK_X) #define buf_block_hash_get(page_id) \ - buf_block_hash_get_locked(page_id, NULL, 0) - -/** Determine if a block is a sentinel for a buffer pool watch. -@param[in] bpage block -@return whether bpage a sentinel for a buffer pool watch */ -bool buf_pool_watch_is_sentinel(const buf_page_t* bpage) - MY_ATTRIBUTE((nonnull, warn_unused_result)); - -/** Stop watching if the page has been read in. -buf_pool_watch_set(space,offset) must have returned NULL before. -@param[in] page_id page id */ -void buf_pool_watch_unset(const page_id_t page_id); - -/** Check if the page has been read in. -This may only be called after buf_pool_watch_set(space,offset) -has returned NULL and before invoking buf_pool_watch_unset(space,offset). -@param[in] page_id page id -@return FALSE if the given page was not read in, TRUE if it was */ -bool buf_pool_watch_occurred(const page_id_t page_id) -MY_ATTRIBUTE((warn_unused_result)); + buf_block_hash_get_locked(page_id, nullptr, RW_LOCK_S) /** Calculate aligned buffer pool size based on srv_buf_pool_chunk_unit, if needed. @@ -1123,40 +848,37 @@ public: /** The common buffer control block structure for compressed and uncompressed frames */ -/** Number of bits used for buffer page states. 
*/ -#define BUF_PAGE_STATE_BITS 3 - -class buf_page_t { -public: - /** @name General fields - None of these bit-fields must be modified without holding - buf_page_get_mutex() [buf_block_t::mutex or - buf_pool.zip_mutex], since they can be stored in the same - machine word. Some of these fields are additionally protected - by buf_pool.mutex. */ - /* @{ */ - - /** Page id. Protected by buf_pool mutex. */ - page_id_t id; - buf_page_t* hash; /*!< node used in chaining to - buf_pool.page_hash or - buf_pool.zip_hash */ - - /** Count of how manyfold this block is currently bufferfixed. */ - Atomic_counter<uint32_t> buf_fix_count; - - /** type of pending I/O operation; also protected by - buf_pool.mutex for writes only */ - buf_io_fix io_fix; +class buf_pool_t; - /** Block state. @see buf_page_in_file */ - buf_page_state state; +class buf_page_t +{ + friend buf_pool_t; + friend buf_block_t; + /** @name General fields */ + /* @{ */ + +public: // FIXME: fix fil_iterate() + /** Page id. Protected by buf_pool.hash_lock_get(id) when + the page is in buf_pool.page_hash. */ + page_id_t id_; +private: + /** Count of how manyfold this block is currently bufferfixed. */ + Atomic_counter<uint32_t> buf_fix_count_; + + /** type of pending I/O operation; protected by buf_pool.mutex + if in_LRU_list */ + Atomic_relaxed<buf_io_fix> io_fix_; + /** Block state. @see in_file(). + State transitions between in_file() states and to + BUF_BLOCK_REMOVE_HASH are protected by buf_pool.hash_lock_get(id) + when the block is in buf_pool.page_hash. + Other transitions when in_LRU_list are protected by buf_pool.mutex. */ + buf_page_state state_; - unsigned flush_type:2; /*!< if this block is currently being - flushed to disk, this tells the - flush_type. 
- @see buf_flush_t */ - /* @} */ +public: + /** buf_pool.page_hash link; protected by buf_pool.hash_lock_get(id) */ + buf_page_t *hash; + /* @} */ page_zip_des_t zip; /*!< compressed page; zip.data (but not the data it points to) is also protected by buf_pool.mutex; @@ -1164,97 +886,50 @@ public: zip.data == NULL means an active buf_pool.watch */ - ulint real_size; /*!< Real size of the page - Normal pages == srv_page_size - page compressed pages, payload - size alligned to sector boundary. - */ - buf_tmp_buffer_t* slot; /*!< Slot for temporary memory used for encryption/compression or NULL */ #ifdef UNIV_DEBUG - /** whether the page is in buf_pool.page_hash; - protected by buf_pool.mutex(!) and the hash bucket rw-latch */ - ibool in_page_hash; - ibool in_zip_hash; /*!< TRUE if in buf_pool.zip_hash */ + /** whether this->list is in buf_pool.zip_hash; protected by buf_pool.mutex */ + bool in_zip_hash; + /** whether this->LRU is in buf_pool.LRU (in_file() holds); + protected by buf_pool.mutex */ + bool in_LRU_list; + /** whether this is in buf_pool.page_hash (in_file() holds); + protected by buf_pool.mutex */ + bool in_page_hash; + /** whether this->list is in buf_pool.free (state() == BUF_BLOCK_NOT_USED); + protected by buf_pool.flush_list_mutex */ + bool in_free_list; #endif /* UNIV_DEBUG */ + /** list member in one of the lists of buf_pool; protected by + buf_pool.mutex or buf_pool.flush_list_mutex - /** @name Page flushing fields - All these are protected by buf_pool.mutex. 
*/ - /* @{ */ + state() == BUF_BLOCK_NOT_USED: buf_pool.free or buf_pool.withdraw - UT_LIST_NODE_T(buf_page_t) list; - /*!< based on state, this is a - list node, protected either by - buf_pool.mutex or by - buf_pool.flush_list_mutex, - in one of the following lists in - buf_pool: - - - BUF_BLOCK_NOT_USED: free, withdraw - - BUF_BLOCK_FILE_PAGE: flush_list - - BUF_BLOCK_ZIP_DIRTY: flush_list - - BUF_BLOCK_ZIP_PAGE: zip_clean - - If bpage is part of flush_list - then the node pointers are - covered by buf_pool.flush_list_mutex. - Otherwise these pointers are - protected by buf_pool.mutex. - - The contents of the list node - is undefined if !in_flush_list - && state == BUF_BLOCK_FILE_PAGE, - or if state is one of - BUF_BLOCK_MEMORY, - BUF_BLOCK_REMOVE_HASH or - BUF_BLOCK_READY_IN_USE. */ + state() == BUF_BLOCK_FILE_PAGE || + (state() == BUF_BLOCK_ZIP_PAGE && !oldest_modification()): + buf_pool.flush_list (protected by buf_pool.flush_list_mutex) -#ifdef UNIV_DEBUG - ibool in_flush_list; /*!< TRUE if in buf_pool.flush_list; - when buf_pool.flush_list_mutex is - free, the following should hold: - in_flush_list - == (state == BUF_BLOCK_FILE_PAGE - || state == BUF_BLOCK_ZIP_DIRTY) - Writes to this field must be - covered by both block->mutex - and buf_pool.flush_list_mutex. Hence - reads can happen while holding - any one of the two mutexes */ - ibool in_free_list; /*!< TRUE if in buf_pool.free; when - buf_pool.mutex is free, the following - should hold: in_free_list - == (state == BUF_BLOCK_NOT_USED) */ -#endif /* UNIV_DEBUG */ + state() == BUF_BLOCK_ZIP_PAGE && !oldest_modification(): buf_pool.zip_clean - lsn_t oldest_modification; - /*!< log sequence number of - the START of the log entry - written of the oldest - modification to this block - which has not yet been flushed - on disk; zero if all - modifications are on disk. - Writes to this field must be - covered by both block->mutex - and buf_pool.flush_list_mutex. 
Hence - reads can happen while holding - any one of the two mutexes */ - /* @} */ - /** @name LRU replacement algorithm fields - These fields are protected by buf_pool.mutex only (not - buf_pool.zip_mutex or buf_block_t::mutex). */ + The contents is undefined if + !oldest_modification() && state() == BUF_BLOCK_FILE_PAGE, + or if state() is not any of the above. */ + UT_LIST_NODE_T(buf_page_t) list; + +private: + /** log sequence number of the START of the log entry written of the + oldest modification to this block which has not yet been written + to the data file; 0 if no modifications are pending. */ + Atomic_counter<lsn_t> oldest_modification_; +public: + /** @name LRU replacement algorithm fields. + Protected by buf_pool.mutex. */ /* @{ */ UT_LIST_NODE_T(buf_page_t) LRU; /*!< node of the LRU list */ -#ifdef UNIV_DEBUG - ibool in_LRU_list; /*!< TRUE if the page is in - the LRU list; used in - debugging */ -#endif /* UNIV_DEBUG */ unsigned old:1; /*!< TRUE if the block is in the old blocks in buf_pool.LRU_old */ unsigned freed_page_clock:31;/*!< the value of @@ -1266,11 +941,9 @@ public: purposes without holding any mutex or latch */ /* @} */ - unsigned access_time; /*!< time of first access, or + Atomic_counter<unsigned> access_time; /*!< time of first access, or 0 if the block was never accessed - in the buffer pool. Protected by - block mutex for buf_page_in_file() - blocks. + in the buffer pool. For state==BUF_BLOCK_MEMORY blocks, this field can be repurposed @@ -1281,10 +954,10 @@ public: the field is protected by recv_sys_t::mutex. */ /** Change buffer entries for the page exist. - Protected by io_fix==BUF_IO_READ or by buf_block_t::lock. */ + Protected by io_fix()==BUF_IO_READ or by buf_block_t::lock. */ bool ibuf_exist; - /** Block initialization status. Can be modified while holding io_fix + /** Block initialization status. 
Can be modified while holding io_fix() or buf_block_t::lock X-latch */ enum { /** the page was read normally and should be flushed normally */ @@ -1299,10 +972,85 @@ public: FREED } status; - void fix() { buf_fix_count++; } + buf_page_t() : id_(0) + { + static_assert(BUF_BLOCK_NOT_USED == 0, "compatibility"); + memset((void*) this, 0, sizeof *this); + } + + /** Initialize some fields */ + void init() + { + io_fix_= BUF_IO_NONE; + buf_fix_count_= 0; + old= 0; + freed_page_clock= 0; + access_time= 0; + oldest_modification_= 0; + slot= nullptr; + ibuf_exist= false; + status= NORMAL; + ut_d(in_zip_hash= false); + ut_d(in_free_list= false); + ut_d(in_LRU_list= false); + ut_d(in_page_hash= false); + HASH_INVALIDATE(this, hash); + } + + /** Initialize some more fields */ + void init(buf_page_state state, page_id_t id, uint32_t buf_fix_count= 0) + { + init(); + state_= state; + id_= id; + buf_fix_count_= buf_fix_count; + } + +public: + const page_id_t &id() const { return id_; } + buf_page_state state() const { return state_; } + uint32_t buf_fix_count() const { return buf_fix_count_; } + buf_io_fix io_fix() const { return io_fix_; } + void io_unfix() + { + ut_d(const auto old_io_fix= io_fix()); + ut_ad(old_io_fix == BUF_IO_READ || old_io_fix == BUF_IO_PIN); + io_fix_= BUF_IO_NONE; + } + + /** @return if this belongs to buf_pool.unzip_LRU */ + bool belongs_to_unzip_LRU() const + { + ut_ad(in_file()); + return zip.data && state() == BUF_BLOCK_FILE_PAGE; + } + + inline void add_buf_fix_count(uint32_t count); + inline void set_buf_fix_count(uint32_t count); + inline void set_state(buf_page_state state); + inline void set_io_fix(buf_io_fix io_fix); + inline void set_corrupt_id(); + + /** @return the oldest modification */ + lsn_t oldest_modification() const { return oldest_modification_; } + /** Set oldest_modification when adding to buf_pool.flush_list */ + inline void set_oldest_modification(lsn_t lsn); + /** Clear oldest_modification when removing from buf_pool.flush_list 
*/ + inline void clear_oldest_modification(); + + /** Prepare to release a file page to buf_pool.free. */ + void free_file_page() + { + ut_ad(state() == BUF_BLOCK_REMOVE_HASH); + ut_d(oldest_modification_= 0); /* for buf_LRU_free_page(this, false) */ + set_corrupt_id(); + ut_d(set_state(BUF_BLOCK_MEMORY)); + } + + void fix() { buf_fix_count_++; } uint32_t unfix() { - uint32_t count= buf_fix_count--; + uint32_t count= buf_fix_count_--; ut_ad(count != 0); return count - 1; } @@ -1319,6 +1067,47 @@ public: { return zip.ssize ? (UNIV_ZIP_SIZE_MIN >> 1) << zip.ssize : 0; } + + /** @return whether the block is mapped to a data file */ + bool in_file() const + { + switch (state_) { + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_FILE_PAGE: + return true; + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_MEMORY: + case BUF_BLOCK_REMOVE_HASH: + return false; + } + + ut_error; + return false; + } + + /** @return whether the block is modified and ready for flushing */ + inline bool ready_for_flush() const; + /** @return whether the state can be changed to BUF_BLOCK_NOT_USED */ + bool ready_for_replace() const + { return !oldest_modification() && can_relocate(); } + /** @return whether the block can be relocated in memory. + The block can be dirty, but it must not be I/O-fixed or bufferfixed. 
*/ + inline bool can_relocate() const; + /** @return whether the block has been flagged old in buf_pool.LRU */ + inline bool is_old() const; + /** Set whether a block is old in buf_pool.LRU */ + inline void set_old(bool old); + /** Flag a page accessed in buf_pool + @return whether this is not the first access */ + bool set_accessed() + { + if (is_accessed()) return true; + access_time= static_cast<uint32_t>(ut_time_ms()); + return false; + } + /** @return ut_time_ms() at the time of first access of a block in buf_pool + @retval 0 if not accessed */ + unsigned is_accessed() const { ut_ad(in_file()); return access_time; } }; /** The buffer control block structure */ @@ -1338,21 +1127,25 @@ struct buf_block_t{ srv_page_size */ BPageLock lock; /*!< read-write lock of the buffer frame */ +#ifdef UNIV_DEBUG + /** whether page.list is in buf_pool.withdraw + ((state() == BUF_BLOCK_NOT_USED)) and the buffer pool is being shrunk; + protected by buf_pool.mutex */ + bool in_withdraw_list; + /** whether unzip_LRU is in buf_pool.unzip_LRU + (state() == BUF_BLOCK_FILE_PAGE and zip.data != nullptr); + protected by buf_pool.mutex */ + bool in_unzip_LRU_list; +#endif UT_LIST_NODE_T(buf_block_t) unzip_LRU; /*!< node of the decompressed LRU list; a block is in the unzip_LRU list - if page.state == BUF_BLOCK_FILE_PAGE + if page.state() == BUF_BLOCK_FILE_PAGE and page.zip.data != NULL */ -#ifdef UNIV_DEBUG - ibool in_unzip_LRU_list;/*!< TRUE if the page is in the - decompressed LRU list; - used in debugging */ - ibool in_withdraw_list; -#endif /* UNIV_DEBUG */ uint32_t lock_hash_val; /*!< hashed value of the page address in the record lock hash table; protected by buf_block_t::lock - (or buf_block_t::mutex, buf_pool.mutex + (or buf_pool.mutex in buf_page_get_gen(), buf_page_init_for_read() and buf_page_create()) */ @@ -1476,15 +1269,16 @@ struct buf_block_t{ debug utilities in sync0rw */ /* @} */ # endif - BPageMutex mutex; /*!< mutex protecting this block: - state (also protected by 
the buffer - pool mutex), io_fix, buf_fix_count, - and accessed; we introduce this new - mutex in InnoDB-5.1 to relieve - contention on the buffer pool mutex */ - void fix() { page.fix(); } - uint32_t unfix() { return page.unfix(); } + uint32_t unfix() + { + uint32_t fix_count= page.unfix(); + ut_ad(fix_count || page.io_fix() != BUF_IO_NONE || + page.state() == BUF_BLOCK_ZIP_PAGE || + !rw_lock_own_flagged(&lock, RW_LOCK_FLAG_X | RW_LOCK_FLAG_S | + RW_LOCK_FLAG_SX)); + return fix_count; + } /** @return the physical size, in bytes */ ulint physical_size() const { return page.physical_size(); } @@ -1492,15 +1286,12 @@ struct buf_block_t{ /** @return the ROW_FORMAT=COMPRESSED physical size, in bytes @retval 0 if not compressed */ ulint zip_size() const { return page.zip_size(); } -}; - -/** Check if a buf_block_t object is in a valid state -@param block buffer block -@return TRUE if valid */ -#define buf_block_state_valid(block) \ -(buf_block_get_state(block) >= BUF_BLOCK_NOT_USED \ - && (buf_block_get_state(block) <= BUF_BLOCK_REMOVE_HASH)) + /** Initialize the block. + @param page_id page id + @param zip_size ROW_FORMAT=COMPRESSED page size, or 0 */ + void initialise(const page_id_t page_id, ulint zip_size); +}; /**********************************************************************//** Compute the hash fold value for blocks in buf_pool.zip_hash. 
*/ @@ -1523,11 +1314,11 @@ public: buf_page_t *get() const { ut_ad(mutex_own(m_mutex)); return m_hp; } /** Set current value - @param bpage buffer block to be set as hp */ + @param bpage buffer block to be set as hp */ void set(buf_page_t *bpage) { ut_ad(mutex_own(m_mutex)); - ut_ad(!bpage || buf_page_in_file(bpage)); + ut_ad(!bpage || bpage->in_file()); m_hp= bpage; } @@ -1570,7 +1361,7 @@ public: if (is_hp(bpage)) m_hp= UT_LIST_GET_PREV(list, m_hp); - ut_ad(!m_hp || m_hp->in_flush_list); + ut_ad(!m_hp || m_hp->oldest_modification()); } }; @@ -1828,6 +1619,10 @@ public: return false; } + /** Release and evict a corrupted page. + @param bpage page that was being read */ + void corrupted_evict(buf_page_t *bpage); + #ifdef UNIV_DEBUG /** Find a block that points to a ROW_FORMAT=COMPRESSED page @param data pointer to the start of a ROW_FORMAT=COMPRESSED page frame @@ -1858,8 +1653,6 @@ public: inline buf_block_t* block_from_ahi(const byte *ptr) const; #endif /* BTR_CUR_HASH_ADAPT */ - bool is_block_mutex(const BPageMutex *m) const - { return is_block_field(reinterpret_cast<const void*>(m)); } bool is_block_lock(const BPageLock *l) const { return is_block_field(reinterpret_cast<const void*>(l)); } @@ -1878,29 +1671,216 @@ public: is_block_field(reinterpret_cast<const void*>(block)); } -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG + /** Get the page_hash latch for a page */ + rw_lock_t *hash_lock_get(const page_id_t id) const + { + return hash_lock_get_low(id.fold()); + } + /** Get a page_hash latch. 
*/ + rw_lock_t *hash_lock_get_low(ulint fold) const + { + return page_hash->sync_obj.rw_locks + + hash_get_sync_obj_index(page_hash, fold); + } +#ifdef UNIV_DEBUG + /** Check whether a page_hash latch is being held */ + bool page_hash_lock_own_flagged(ulint fold, rw_lock_flags_t flagged) const + { + return rw_lock_own_flagged(hash_lock_get_low(fold), flagged); + } +#endif + + /** Acquire a page_hash bucket latch, tolerating concurrent resize() + @tparam exclusive whether the latch is to be acquired exclusively + @param fold hash bucket key */ + template<bool exclusive> rw_lock_t *page_hash_lock(ulint fold) + { + rw_lock_t *latch= hash_lock_get_low(fold); + if (exclusive) + rw_lock_x_lock(latch); + else + rw_lock_s_lock(latch); + rw_lock_t *l; + while ((l= hash_lock_get_low(fold)) != latch) + { + if (exclusive) + rw_lock_x_unlock(latch); + else + rw_lock_s_unlock(latch); + /* FIXME: what if we resize() completes several times while we + are not holding any latch here? Is the latch guaranteed to be valid? */ + if (exclusive) + rw_lock_x_lock(l); + else + rw_lock_s_lock(l); + latch= l; + } + return latch; + } + + /** Look up a block descriptor. + @param id page identifier + @return block descriptor, possibly in watch[] + @retval nullptr if not found*/ + buf_page_t *page_hash_get_low(const page_id_t id) + { + ut_ad(mutex_own(&mutex) || + rw_lock_own_flagged(hash_lock_get(id), + RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); + buf_page_t* bpage; + /* Look for the page in the hash table */ + HASH_SEARCH(hash, page_hash, id.fold(), buf_page_t*, bpage, + ut_ad(bpage->in_page_hash), id == bpage->id()); + return bpage; + } + + /** Acquire exclusive latches on all page_hash buckets. 
*/ + void page_hash_lock_all() const + { + ut_ad(page_hash->magic_n == HASH_TABLE_MAGIC_N); + ut_ad(page_hash->type == HASH_TABLE_SYNC_RW_LOCK); + for (ulint i= 0; i < page_hash->n_sync_obj; i++) + rw_lock_x_lock(&page_hash->sync_obj.rw_locks[i]); + } + /** Release exclusive latches on all the page_hash buckets. */ + void page_hash_unlock_all() const + { + ut_ad(page_hash->magic_n == HASH_TABLE_MAGIC_N); + ut_ad(page_hash->type == HASH_TABLE_SYNC_RW_LOCK); + + for (ulint i = 0; i < page_hash->n_sync_obj; i++) + rw_lock_x_unlock(&page_hash->sync_obj.rw_locks[i]); + } + + /** Determine if a block is a sentinel for a buffer pool watch. + @param bpage page descriptor + @return whether bpage a sentinel for a buffer pool watch */ + bool watch_is_sentinel(const buf_page_t &bpage) + { + ut_ad(mutex_own(&mutex) || + rw_lock_own_flagged(hash_lock_get(bpage.id()), + RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); + ut_ad(bpage.in_file()); + + if (&bpage < &watch[0] || &bpage >= &watch[UT_ARR_SIZE(watch)]) + { + ut_ad(bpage.state() != BUF_BLOCK_ZIP_PAGE || bpage.zip.data); + return false; + } + + ut_ad(bpage.state() == BUF_BLOCK_ZIP_PAGE); + ut_ad(!bpage.in_zip_hash); + ut_ad(!bpage.zip.data); + return true; + } + + /** Check if a watched page has been read. + This may only be called after !watch_set() and before invoking watch_unset(). + @param id page identifier + @return whether the page was read to the buffer pool */ + bool watch_occurred(const page_id_t id) + { + rw_lock_t *hash_lock= page_hash_lock<false>(id.fold()); + /* The page must exist because watch_set() increments buf_fix_count. */ + buf_page_t *bpage= page_hash_get_low(id); + const bool is_sentinel= watch_is_sentinel(*bpage); + rw_lock_s_unlock(hash_lock); + return !is_sentinel; + } + + /** Register a watch for a page identifier. The caller must hold an + exclusive page hash latch. The *hash_lock may be released, + relocated, and reacquired. 
+ @param id page identifier + @param hash_lock page_hash latch that is held in RW_LOCK_X mode + @return a buffer pool block corresponding to id + @retval nullptr if the block was not present, and a watch was installed */ + inline buf_page_t *watch_set(const page_id_t id, rw_lock_t **hash_lock); + + /** Stop watching whether a page has been read in. + watch_set(id) must have returned nullptr before. + @param id page identifier */ + void watch_unset(const page_id_t id) + { + const ulint fold= id.fold(); + rw_lock_t *hash_lock= page_hash_lock<true>(fold); + /* The page must exist because watch_set() increments buf_fix_count. */ + buf_page_t *watch= page_hash_get_low(id); + if (watch->unfix() == 0 && watch_is_sentinel(*watch)) + { + /* The following is based on watch_remove(). */ + ut_ad(watch->in_page_hash); + ut_d(watch->in_page_hash= false); + HASH_DELETE(buf_page_t, hash, page_hash, fold, watch); + rw_lock_x_unlock(hash_lock); + // Now that the watch is detached from page_hash, release it to watch[]. + mutex_enter(&mutex); + /* It is possible that watch_remove() already removed the watch. */ + if (watch->id_ == id) + { + ut_ad(!watch->buf_fix_count()); + ut_ad(watch->state() == BUF_BLOCK_ZIP_PAGE); + watch->set_state(BUF_BLOCK_NOT_USED); + } + mutex_exit(&mutex); + } + else + rw_lock_x_unlock(hash_lock); + } + + /** Remove the sentinel block for the watch before replacing it with a + real block. watch_unset() or watch_occurred() will notice + that the block has been replaced with the real block. + @param watch sentinel */ + inline void watch_remove(buf_page_t *watch); + + /** @return whether less than 1/4 of the buffer pool is available */ + bool running_out() const + { + return !recv_recovery_is_on() && + UNIV_UNLIKELY(UT_LIST_GET_LEN(free) + UT_LIST_GET_LEN(LRU) < + std::min(curr_size, old_size) / 4); + } + +#ifdef UNIV_DEBUG /** Validate the buffer pool. 
*/ void validate(); -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ -#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +#endif /* UNIV_DEBUG */ +#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG /** Write information of the buf_pool to the error log. */ void print(); -#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */ +#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */ + + /** Remove a block from the LRU list. + @return the predecessor in the LRU list */ + buf_page_t *LRU_remove(buf_page_t *bpage) + { + ut_ad(mutex_own(&mutex)); + ut_ad(bpage->in_LRU_list); + ut_ad(bpage->in_page_hash); + ut_ad(!bpage->in_zip_hash); + ut_ad(bpage->in_file()); + lru_hp.adjust(bpage); + lru_scan_itr.adjust(bpage); + single_scan_itr.adjust(bpage); + ut_d(bpage->in_LRU_list= false); + buf_page_t *prev= UT_LIST_GET_PREV(LRU, bpage); + UT_LIST_REMOVE(LRU, bpage); + return prev; + } + + /** Number of pages to read ahead */ + static constexpr uint32_t READ_AHEAD_PAGES= 64; /** @name General fields */ /* @{ */ BufPoolMutex mutex; /*!< Buffer pool mutex */ - BufPoolZipMutex zip_mutex; /*!< Zip mutex, protects compressed - only pages (of type buf_page_t, not - buf_block_t */ ulint curr_pool_size; /*!< Current pool size in bytes */ ulint LRU_old_ratio; /*!< Reserve this much of the buffer pool for "old" blocks */ #ifdef UNIV_DEBUG ulint buddy_n_frames; /*!< Number of frames allocated from the buffer pool to the buddy system */ -#endif -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG ulint mutex_exit_forbidden; /*!< Forbid release mutex */ #endif ut_allocator<unsigned char> allocator; /*!< Allocator used for @@ -1911,30 +1891,25 @@ public: chunk_t* chunks; /*!< buffer pool chunks */ chunk_t* chunks_old; /*!< old buffer pool chunks to be freed after resizing buffer pool */ - ulint curr_size; /*!< current pool size in pages */ - ulint old_size; /*!< previous pool size in pages */ - ulint read_ahead_area;/*!< size in pages of the area which - the read-ahead algorithms 
read if - invoked */ - hash_table_t* page_hash; /*!< hash table of buf_page_t or - buf_block_t file pages, - buf_page_in_file() == TRUE, - indexed by (space_id, offset). - page_hash is protected by an - array of mutexes. - Changes in page_hash are protected - by buf_pool.mutex and the relevant - page_hash mutex. Lookups can happen - while holding the buf_pool.mutex or - the relevant page_hash mutex. */ + /** current pool size in pages */ + Atomic_counter<ulint> curr_size; + /** previous pool size in pages */ + Atomic_counter<ulint> old_size; + /** read-ahead request size in pages */ + Atomic_counter<uint32_t> read_ahead_area; + + /** Hash table of file pages (buf_page_t::in_file() holds), + indexed by page_id_t. Protected by both mutex and hash_lock_get(id). */ + hash_table_t *page_hash; hash_table_t* page_hash_old; /*!< old pointer to page_hash to be freed after resizing buffer pool */ hash_table_t* zip_hash; /*!< hash table of buf_block_t blocks whose frames are allocated to the zip buddy system, - indexed by block->frame */ - ulint n_pend_reads; /*!< number of pending read - operations */ + indexed by block->frame; + protected by buf_pool.mutex*/ + /** number of pending read operations */ + Atomic_counter<ulint> n_pend_reads; Atomic_counter<ulint> n_pend_unzip; /*!< number of pending decompressions */ @@ -1968,13 +1943,13 @@ public: UT_LIST_BASE_NODE_T(buf_page_t) flush_list; /*!< base node of the modified block list */ - ibool init_flush[BUF_FLUSH_N_TYPES]; + ibool init_flush[3]; /*!< this is TRUE when a flush of the given type is being initialized */ - ulint n_flush[BUF_FLUSH_N_TYPES]; - /*!< this is the number of pending - writes in the given flush type */ - os_event_t no_flush[BUF_FLUSH_N_TYPES]; + /** Number of pending writes of a flush type. + The sum of these is approximately the sum of BUF_IO_WRITE blocks. 
*/ + Atomic_counter<ulint> n_flush[3]; + os_event_t no_flush[3]; /*!< this is in the set state when there is no flush batch of the given type running; @@ -2071,10 +2046,11 @@ public: frames and buf_page_t descriptors of blocks that exist in the buffer pool only in compressed form. */ /* @{ */ -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +#ifdef UNIV_DEBUG + /** unmodified ROW_FORMAT=COMPRESSED pages; + protected by buf_pool.mutex */ UT_LIST_BASE_NODE_T(buf_page_t) zip_clean; - /*!< unmodified compressed pages */ -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#endif /* UNIV_DEBUG */ UT_LIST_BASE_NODE_T(buf_buddy_free_t) zip_free[BUF_BUDDY_SIZES_MAX]; /*!< buddy free lists */ #if BUF_BUDDY_LOW > UNIV_ZIP_SIZE_MIN @@ -2082,12 +2058,9 @@ public: #endif /* @} */ - buf_page_t* watch; - /*!< Sentinel records for buffer - pool watches. Protected by - buf_pool.mutex. */ - - + /** Sentinels to detect if pages are read into the buffer pool while + a delete-buffering operation is pending. Protected by mutex. */ + buf_page_t watch[innodb_purge_threads_MAX + 1]; /** Reserve a buffer. */ buf_tmp_buffer_t *io_buf_reserve() { return io_buf.reserve(); } private: @@ -2143,71 +2116,126 @@ private: /** The InnoDB buffer pool */ extern buf_pool_t buf_pool; -/** @name Accessors for buffer pool mutexes -Use these instead of accessing buffer pool mutexes directly. */ -/* @{ */ +inline void buf_page_t::add_buf_fix_count(uint32_t count) +{ + ut_ad(mutex_own(&buf_pool.mutex)); + buf_fix_count_+= count; +} -/** Test if block->mutex is owned. */ -#define buf_page_mutex_own(b) (b)->mutex.is_owned() +inline void buf_page_t::set_buf_fix_count(uint32_t count) +{ + ut_ad(mutex_own(&buf_pool.mutex)); + buf_fix_count_= count; +} -/** Acquire the block->mutex. */ -#define buf_page_mutex_enter(b) do { \ - mutex_enter(&(b)->mutex); \ -} while (0) +inline void buf_page_t::set_state(buf_page_state state) +{ + ut_ad(mutex_own(&buf_pool.mutex)); + state_= state; +} -/** Release the trx->mutex. 
*/ -#define buf_page_mutex_exit(b) do { \ - (b)->mutex.exit(); \ -} while (0) +inline void buf_page_t::set_io_fix(buf_io_fix io_fix) +{ + ut_ad(mutex_own(&buf_pool.mutex)); + io_fix_= io_fix; +} +inline void buf_page_t::set_corrupt_id() +{ + ut_ad(!oldest_modification()); +#ifdef UNIV_DEBUG + switch (state()) { + case BUF_BLOCK_REMOVE_HASH: + break; + case BUF_BLOCK_ZIP_PAGE: + case BUF_BLOCK_FILE_PAGE: + ut_ad(rw_lock_own(buf_pool.hash_lock_get(id_), RW_LOCK_X)); + break; + case BUF_BLOCK_NOT_USED: + case BUF_BLOCK_MEMORY: + ut_ad("invalid state" == 0); + } +#endif + id_= page_id_t(~0ULL); +} -/** Get appropriate page_hash_lock. */ -UNIV_INLINE -rw_lock_t* -buf_page_hash_lock_get(const page_id_t& page_id) +/** Set oldest_modification when adding to buf_pool.flush_list */ +inline void buf_page_t::set_oldest_modification(lsn_t lsn) { - return hash_get_lock(buf_pool.page_hash, page_id.fold()); + ut_ad(mutex_own(&buf_pool.flush_list_mutex)); + ut_ad(!oldest_modification()); + oldest_modification_= lsn; } -/** If not appropriate page_hash_lock, relock until appropriate. 
*/ -# define buf_page_hash_lock_s_confirm(hash_lock, page_id)\ - hash_lock_s_confirm(hash_lock, buf_pool.page_hash, (page_id).fold()) +/** Clear oldest_modification when removing from buf_pool.flush_list */ +inline void buf_page_t::clear_oldest_modification() +{ + ut_ad(mutex_own(&buf_pool.flush_list_mutex)); + ut_d(const auto state= state_); + ut_ad(state == BUF_BLOCK_FILE_PAGE || state == BUF_BLOCK_ZIP_PAGE || + state == BUF_BLOCK_REMOVE_HASH); + ut_ad(oldest_modification()); + oldest_modification_= 0; +} -# define buf_page_hash_lock_x_confirm(hash_lock, page_id)\ - hash_lock_x_confirm(hash_lock, buf_pool.page_hash, (page_id).fold()) +/** @return whether the block is modified and ready for flushing */ +inline bool buf_page_t::ready_for_flush() const +{ + ut_ad(mutex_own(&buf_pool.mutex)); + ut_ad(in_LRU_list); + ut_a(in_file()); + return oldest_modification() && io_fix_ == BUF_IO_NONE; +} -#ifdef UNIV_DEBUG -/** Test if page_hash lock is held in s-mode. */ -# define buf_page_hash_lock_held_s(bpage) \ - rw_lock_own(buf_page_hash_lock_get((bpage)->id), RW_LOCK_S) +/** @return whether the block can be relocated in memory. +The block can be dirty, but it must not be I/O-fixed or bufferfixed. */ +inline bool buf_page_t::can_relocate() const +{ + ut_ad(mutex_own(&buf_pool.mutex)); + ut_ad(in_file()); + ut_ad(in_LRU_list); + return io_fix_ == BUF_IO_NONE && !buf_fix_count_; +} -/** Test if page_hash lock is held in x-mode. */ -# define buf_page_hash_lock_held_x(bpage) \ - rw_lock_own(buf_page_hash_lock_get((bpage)->id), RW_LOCK_X) +/** @return whether the block has been flagged old in buf_pool.LRU */ +inline bool buf_page_t::is_old() const +{ + ut_ad(mutex_own(&buf_pool.mutex)); + ut_ad(in_file()); + ut_ad(in_LRU_list); + return old; +} -/** Test if page_hash lock is held in x or s-mode. 
*/ -# define buf_page_hash_lock_held_s_or_x(bpage)\ - (buf_page_hash_lock_held_s(bpage) \ - || buf_page_hash_lock_held_x(bpage)) +/** Set whether a block is old in buf_pool.LRU */ +inline void buf_page_t::set_old(bool old) +{ + ut_ad(in_file()); + ut_ad(mutex_own(&buf_pool.mutex)); + ut_ad(in_LRU_list); -# define buf_block_hash_lock_held_s(block) \ - buf_page_hash_lock_held_s(&(block)->page) +#ifdef UNIV_LRU_DEBUG + ut_a((buf_pool.LRU_old_len == 0) == (buf_pool.LRU_old == nullptr)); + /* If a block is flagged "old", the LRU_old list must exist. */ + ut_a(!old || buf_pool.LRU_old); -# define buf_block_hash_lock_held_x(block) \ - buf_page_hash_lock_held_x(&(block)->page) + if (UT_LIST_GET_PREV(LRU, this) && UT_LIST_GET_NEXT(LRU, this)) + { + const buf_page_t *prev= UT_LIST_GET_PREV(LRU, this); + const buf_page_t *next = UT_LIST_GET_NEXT(LRU, this); + if (prev->old == next->old) + ut_a(prev->old == old); + else + { + ut_a(!prev->old); + ut_a(buf_pool.LRU_old == (old ? this : next)); + } + } +#endif /* UNIV_LRU_DEBUG */ -# define buf_block_hash_lock_held_s_or_x(block) \ - buf_page_hash_lock_held_s_or_x(&(block)->page) -#else /* UNIV_DEBUG */ -# define buf_page_hash_lock_held_s(p) (TRUE) -# define buf_page_hash_lock_held_x(p) (TRUE) -# define buf_page_hash_lock_held_s_or_x(p) (TRUE) -# define buf_block_hash_lock_held_s(p) (TRUE) -# define buf_block_hash_lock_held_x(p) (TRUE) -# define buf_block_hash_lock_held_s_or_x(p) (TRUE) -#endif /* UNIV_DEBUG */ + this->old= old; +} -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +#ifdef UNIV_DEBUG /** Forbid the release of the buffer pool mutex. */ # define buf_pool_mutex_exit_forbid() do { \ ut_ad(mutex_own(&buf_pool.mutex)); \ @@ -2224,15 +2252,12 @@ buf_page_hash_lock_get(const page_id_t& page_id) /** Allow the release of the buffer pool mutex. 
*/ # define buf_pool_mutex_exit_allow() ((void) 0) #endif -/* @} */ /********************************************************************** Let us list the consistency conditions for different control block states. NOT_USED: is in free list, not in LRU list, not in flush list, nor page hash table -READY_FOR_USE: is not in free list, LRU list, or flush list, nor page - hash table MEMORY: is not in free list, LRU list, or flush list, nor page hash table FILE_PAGE: space and offset are defined, is in page hash table @@ -2260,9 +2285,8 @@ FILE_PAGE: space and offset are defined, is in page hash table State transitions: -NOT_USED => READY_FOR_USE -READY_FOR_USE => MEMORY -READY_FOR_USE => FILE_PAGE +NOT_USED => MEMORY +MEMORY => FILE_PAGE MEMORY => NOT_USED FILE_PAGE => NOT_USED NOTE: This transition is allowed if and only if (1) buf_fix_count == 0, @@ -2284,7 +2308,7 @@ inline buf_page_t *LRUItr::start() return m_hp; } -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +#ifdef UNIV_DEBUG /** Functor to validate the LRU list. */ struct CheckInLRUList { void operator()(const buf_page_t* elem) const @@ -2324,7 +2348,7 @@ struct CheckUnzipLRUAndLRUList { CheckUnzipLRUAndLRUList()); } }; -#endif /* UNIV_DEBUG || defined UNIV_BUF_DEBUG */ +#endif /* UNIV_DEBUG */ #include "buf0buf.ic" diff --git a/storage/innobase/include/buf0buf.ic b/storage/innobase/include/buf0buf.ic index 7b74705d5c7..bace66d5a39 100644 --- a/storage/innobase/include/buf0buf.ic +++ b/storage/innobase/include/buf0buf.ic @@ -101,7 +101,7 @@ inline bool buf_page_peek_if_too_old(const buf_page_t *bpage) either the warm-up phase or an in-memory workload. */ return(FALSE); } else if (buf_LRU_old_threshold_ms && bpage->old) { - unsigned access_time = buf_page_is_accessed(bpage); + uint32_t access_time = bpage->is_accessed(); /* It is possible that the below comparison returns an unexpected result. 
2^32 milliseconds pass in about 50 days, @@ -109,7 +109,7 @@ inline bool buf_page_peek_if_too_old(const buf_page_t *bpage) is e.g. 50 days + 15 ms, then the below will behave as if it is 15 ms. This is known and fixing it would require to increase buf_page_t::access_time from 32 to 64 bits. */ - if (access_time > 0 + if (access_time && ((ib_uint32_t) (ut_time_ms() - access_time)) >= buf_LRU_old_threshold_ms) { return(TRUE); @@ -123,429 +123,6 @@ inline bool buf_page_peek_if_too_old(const buf_page_t *bpage) } /*********************************************************************//** -Gets the state of a block. -@return state */ -UNIV_INLINE -enum buf_page_state -buf_page_get_state( -/*===============*/ - const buf_page_t* bpage) /*!< in: pointer to the control block */ -{ - enum buf_page_state state = bpage->state; - -#ifdef UNIV_DEBUG - switch (state) { - case BUF_BLOCK_POOL_WATCH: - case BUF_BLOCK_ZIP_PAGE: - case BUF_BLOCK_ZIP_DIRTY: - case BUF_BLOCK_NOT_USED: - case BUF_BLOCK_READY_FOR_USE: - case BUF_BLOCK_FILE_PAGE: - case BUF_BLOCK_MEMORY: - case BUF_BLOCK_REMOVE_HASH: - break; - default: - ut_error; - } -#endif /* UNIV_DEBUG */ - - return(state); -} -/*********************************************************************//** -Gets the state of a block. -@return state */ -UNIV_INLINE -enum buf_page_state -buf_block_get_state( -/*================*/ - const buf_block_t* block) /*!< in: pointer to the control block */ -{ - return(buf_page_get_state(&block->page)); -} - -/*********************************************************************//** -Sets the state of a block. 
*/ -UNIV_INLINE -void -buf_page_set_state( -/*===============*/ - buf_page_t* bpage, /*!< in/out: pointer to control block */ - enum buf_page_state state) /*!< in: state */ -{ -#ifdef UNIV_DEBUG - enum buf_page_state old_state = buf_page_get_state(bpage); - - switch (old_state) { - case BUF_BLOCK_POOL_WATCH: - ut_error; - break; - case BUF_BLOCK_ZIP_PAGE: - ut_ad(state == BUF_BLOCK_ZIP_DIRTY); - break; - case BUF_BLOCK_ZIP_DIRTY: - ut_ad(state == BUF_BLOCK_ZIP_PAGE); - break; - case BUF_BLOCK_NOT_USED: - ut_ad(state == BUF_BLOCK_READY_FOR_USE); - break; - case BUF_BLOCK_READY_FOR_USE: - ut_ad(state == BUF_BLOCK_MEMORY - || state == BUF_BLOCK_FILE_PAGE - || state == BUF_BLOCK_NOT_USED); - break; - case BUF_BLOCK_MEMORY: - ut_ad(state == BUF_BLOCK_NOT_USED); - break; - case BUF_BLOCK_FILE_PAGE: - ut_ad(state == BUF_BLOCK_NOT_USED - || state == BUF_BLOCK_REMOVE_HASH - || state == BUF_BLOCK_FILE_PAGE); - if (state == BUF_BLOCK_REMOVE_HASH) { - ut_ad(!bpage->in_page_hash); - ut_ad(!bpage->in_zip_hash); - ut_ad(!bpage->in_LRU_list); - ut_ad(!bpage->in_free_list); - } - break; - case BUF_BLOCK_REMOVE_HASH: - ut_ad(state == BUF_BLOCK_MEMORY); - break; - } -#endif /* UNIV_DEBUG */ - bpage->state = state; -} - -/*********************************************************************//** -Sets the state of a block. */ -UNIV_INLINE -void -buf_block_set_state( -/*================*/ - buf_block_t* block, /*!< in/out: pointer to control block */ - enum buf_page_state state) /*!< in: state */ -{ - buf_page_set_state(&block->page, state); -} - -/*********************************************************************//** -Determines if a block is mapped to a tablespace. 
-@return TRUE if mapped */ -UNIV_INLINE -ibool -buf_page_in_file( -/*=============*/ - const buf_page_t* bpage) /*!< in: pointer to control block */ -{ - switch (buf_page_get_state(bpage)) { - case BUF_BLOCK_POOL_WATCH: - ut_error; - break; - case BUF_BLOCK_ZIP_PAGE: - case BUF_BLOCK_ZIP_DIRTY: - case BUF_BLOCK_FILE_PAGE: - return(TRUE); - case BUF_BLOCK_NOT_USED: - case BUF_BLOCK_READY_FOR_USE: - case BUF_BLOCK_MEMORY: - case BUF_BLOCK_REMOVE_HASH: - break; - } - - return(FALSE); -} - -/*********************************************************************//** -Determines if a block should be on unzip_LRU list. -@return TRUE if block belongs to unzip_LRU */ -UNIV_INLINE -ibool -buf_page_belongs_to_unzip_LRU( -/*==========================*/ - const buf_page_t* bpage) /*!< in: pointer to control block */ -{ - ut_ad(buf_page_in_file(bpage)); - - return(bpage->zip.data - && buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); -} - -/*********************************************************************//** -Gets the mutex of a block. -@return pointer to mutex protecting bpage */ -UNIV_INLINE -BPageMutex* -buf_page_get_mutex( -/*===============*/ - const buf_page_t* bpage) /*!< in: pointer to control block */ -{ - switch (buf_page_get_state(bpage)) { - case BUF_BLOCK_POOL_WATCH: - ut_error; - return(NULL); - case BUF_BLOCK_ZIP_PAGE: - case BUF_BLOCK_ZIP_DIRTY: - return(&buf_pool.zip_mutex); - default: - return(&((buf_block_t*) bpage)->mutex); - } -} - -/*********************************************************************//** -Get the flush type of a page. 
-@return flush type */ -UNIV_INLINE -buf_flush_t -buf_page_get_flush_type( -/*====================*/ - const buf_page_t* bpage) /*!< in: buffer page */ -{ - buf_flush_t flush_type = (buf_flush_t) bpage->flush_type; - -#ifdef UNIV_DEBUG - switch (flush_type) { - case BUF_FLUSH_LRU: - case BUF_FLUSH_LIST: - case BUF_FLUSH_SINGLE_PAGE: - return(flush_type); - case BUF_FLUSH_N_TYPES: - ut_error; - } - ut_error; -#endif /* UNIV_DEBUG */ - return(flush_type); -} -/*********************************************************************//** -Set the flush type of a page. */ -UNIV_INLINE -void -buf_page_set_flush_type( -/*====================*/ - buf_page_t* bpage, /*!< in: buffer page */ - buf_flush_t flush_type) /*!< in: flush type */ -{ - bpage->flush_type = flush_type & 3; - ut_ad(buf_page_get_flush_type(bpage) == flush_type); -} - -/** Map a block to a file page. -@param[in,out] block pointer to control block -@param[in] page_id page id */ -UNIV_INLINE -void -buf_block_set_file_page( - buf_block_t* block, - const page_id_t page_id) -{ - buf_block_set_state(block, BUF_BLOCK_FILE_PAGE); - block->page.id = page_id; -} - -/*********************************************************************//** -Gets the io_fix state of a block. -@return io_fix state */ -UNIV_INLINE -enum buf_io_fix -buf_page_get_io_fix( -/*================*/ - const buf_page_t* bpage) /*!< in: pointer to the control block */ -{ - ut_ad(bpage != NULL); - - enum buf_io_fix io_fix = bpage->io_fix; - -#ifdef UNIV_DEBUG - switch (io_fix) { - case BUF_IO_NONE: - case BUF_IO_READ: - case BUF_IO_WRITE: - case BUF_IO_PIN: - return(io_fix); - } - ut_error; -#endif /* UNIV_DEBUG */ - return(io_fix); -} - -/*********************************************************************//** -Gets the io_fix state of a block. 
-@return io_fix state */ -UNIV_INLINE -enum buf_io_fix -buf_block_get_io_fix( -/*=================*/ - const buf_block_t* block) /*!< in: pointer to the control block */ -{ - return(buf_page_get_io_fix(&block->page)); -} - -/*********************************************************************//** -Sets the io_fix state of a block. */ -UNIV_INLINE -void -buf_page_set_io_fix( -/*================*/ - buf_page_t* bpage, /*!< in/out: control block */ - enum buf_io_fix io_fix) /*!< in: io_fix state */ -{ - ut_ad(mutex_own(&buf_pool.mutex)); - ut_ad(mutex_own(buf_page_get_mutex(bpage))); - - bpage->io_fix = io_fix; - ut_ad(buf_page_get_io_fix(bpage) == io_fix); -} - -/*********************************************************************//** -Sets the io_fix state of a block. */ -UNIV_INLINE -void -buf_block_set_io_fix( -/*=================*/ - buf_block_t* block, /*!< in/out: control block */ - enum buf_io_fix io_fix) /*!< in: io_fix state */ -{ - buf_page_set_io_fix(&block->page, io_fix); -} - -/*********************************************************************//** -Makes a block sticky. A sticky block implies that even after we release -the buf_pool.mutex and the block->mutex: -* it cannot be removed from the flush_list -* the block descriptor cannot be relocated -* it cannot be removed from the LRU list -Note that: -* the block can still change its position in the LRU list -* the next and previous pointers can change. */ -UNIV_INLINE -void -buf_page_set_sticky( -/*================*/ - buf_page_t* bpage) /*!< in/out: control block */ -{ - ut_ad(mutex_own(&buf_pool.mutex)); - ut_ad(mutex_own(buf_page_get_mutex(bpage))); - ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_NONE); - - bpage->io_fix = BUF_IO_PIN; -} - -/*********************************************************************//** -Removes stickiness of a block. 
*/ -UNIV_INLINE -void -buf_page_unset_sticky( -/*==================*/ - buf_page_t* bpage) /*!< in/out: control block */ -{ - ut_ad(mutex_own(&buf_pool.mutex)); - ut_ad(mutex_own(buf_page_get_mutex(bpage))); - ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_PIN); - - bpage->io_fix = BUF_IO_NONE; -} - -/********************************************************************//** -Determine if a buffer block can be relocated in memory. The block -can be dirty, but it must not be I/O-fixed or bufferfixed. */ -UNIV_INLINE -ibool -buf_page_can_relocate( -/*==================*/ - const buf_page_t* bpage) /*!< control block being relocated */ -{ - ut_ad(mutex_own(&buf_pool.mutex)); - ut_ad(mutex_own(buf_page_get_mutex(bpage))); - ut_ad(buf_page_in_file(bpage)); - ut_ad(bpage->in_LRU_list); - - return(buf_page_get_io_fix(bpage) == BUF_IO_NONE - && bpage->buf_fix_count == 0); -} - -/*********************************************************************//** -Determine if a block has been flagged old. -@return TRUE if old */ -UNIV_INLINE -ibool -buf_page_is_old( -/*============*/ - const buf_page_t* bpage) /*!< in: control block */ -{ - /* Buffer page mutex is not strictly required here for heuristic - purposes even if LRU mutex is not being held. Keep the assertion - for not since all the callers hold it. */ - ut_ad(mutex_own(buf_page_get_mutex(bpage)) - || mutex_own(&buf_pool.mutex)); - ut_ad(buf_page_in_file(bpage)); - - return(bpage->old); -} - -/*********************************************************************//** -Flag a block old. */ -UNIV_INLINE -void -buf_page_set_old( -/*=============*/ - buf_page_t* bpage, /*!< in/out: control block */ - bool old) /*!< in: old */ -{ - ut_a(buf_page_in_file(bpage)); - ut_ad(mutex_own(&buf_pool.mutex)); - ut_ad(bpage->in_LRU_list); - -#ifdef UNIV_LRU_DEBUG - ut_a((buf_pool.LRU_old_len == 0) == (buf_pool.LRU_old == NULL)); - /* If a block is flagged "old", the LRU_old list must exist. 
*/ - ut_a(!old || buf_pool.LRU_old); - - if (UT_LIST_GET_PREV(LRU, bpage) && UT_LIST_GET_NEXT(LRU, bpage)) { - const buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage); - const buf_page_t* next = UT_LIST_GET_NEXT(LRU, bpage); - if (prev->old == next->old) { - ut_a(prev->old == old); - } else { - ut_a(!prev->old); - ut_a(buf_pool.LRU_old == (old ? bpage : next)); - } - } -#endif /* UNIV_LRU_DEBUG */ - - bpage->old = old; -} - -/*********************************************************************//** -Determine the time of first access of a block in the buffer pool. -@return ut_time_ms() at the time of first access, 0 if not accessed */ -UNIV_INLINE -unsigned -buf_page_is_accessed( -/*=================*/ - const buf_page_t* bpage) /*!< in: control block */ -{ - ut_ad(buf_page_in_file(bpage)); - - return(bpage->access_time); -} - -/*********************************************************************//** -Flag a block accessed. */ -UNIV_INLINE -void -buf_page_set_accessed( -/*==================*/ - buf_page_t* bpage) /*!< in/out: control block */ -{ - ut_ad(!mutex_own(&buf_pool.mutex)); - ut_ad(mutex_own(buf_page_get_mutex(bpage))); - ut_a(buf_page_in_file(bpage)); - - if (bpage->access_time == 0) { - /* Make this the time of the first access. */ - bpage->access_time = static_cast<uint>(ut_time_ms()); - } -} - -/*********************************************************************//** Gets the buf_block_t handle of a buffered file block if an uncompressed page frame exists, or NULL. 
@return control block, or NULL */ @@ -556,12 +133,16 @@ buf_page_get_block( buf_page_t* bpage) /*!< in: control block, or NULL */ { if (bpage != NULL) { - ut_ad(buf_page_hash_lock_held_s_or_x(bpage) - || mutex_own(&buf_pool.mutex)); - ut_ad(buf_page_in_file(bpage)); - - if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) { - return((buf_block_t*) bpage); + ut_ad(mutex_own(&buf_pool.mutex) + || rw_lock_own_flagged(buf_pool.hash_lock_get(bpage->id()), + RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); + switch (bpage->state()) { + case BUF_BLOCK_FILE_PAGE: + return reinterpret_cast<buf_block_t*>(bpage); + case BUF_BLOCK_ZIP_PAGE: + return nullptr; + default: + ut_ad(0); } } @@ -582,17 +163,14 @@ buf_block_get_frame( return NULL; } - switch (buf_block_get_state(block)) { - case BUF_BLOCK_POOL_WATCH: + switch (block->page.state()) { case BUF_BLOCK_ZIP_PAGE: - case BUF_BLOCK_ZIP_DIRTY: case BUF_BLOCK_NOT_USED: ut_error; break; case BUF_BLOCK_FILE_PAGE: - ut_a(block->page.buf_fix_count > 0); + ut_a(block->page.buf_fix_count()); /* fall through */ - case BUF_BLOCK_READY_FOR_USE: case BUF_BLOCK_MEMORY: case BUF_BLOCK_REMOVE_HASH: goto ok; @@ -603,24 +181,6 @@ ok: } #endif /* UNIV_DEBUG */ -/*********************************************************************** -FIXME_FTS Gets the frame the pointer is pointing to. */ -UNIV_INLINE -buf_frame_t* -buf_frame_align( -/*============*/ - /* out: pointer to frame */ - byte* ptr) /* in: pointer to a frame */ -{ - buf_frame_t* frame; - - ut_ad(ptr); - - frame = (buf_frame_t*) ut_align_down(ptr, srv_page_size); - - return(frame); -} - /**********************************************************************//** Gets the hash value of the page the pointer is pointing to. This can be used in searches in the lock hash table. 
@@ -632,7 +192,7 @@ buf_block_get_lock_hash_val( const buf_block_t* block) /*!< in: block */ { ut_ad(block); - ut_ad(buf_page_in_file(&block->page)); + ut_ad(block->page.in_file()); ut_ad(rw_lock_own(&(((buf_block_t*) block)->lock), RW_LOCK_X) || rw_lock_own(&(((buf_block_t*) block)->lock), RW_LOCK_S)); @@ -668,6 +228,13 @@ buf_page_free_descriptor( ut_free(bpage); } +/** Allocate a buffer block. +@return own: the allocated block, in state BUF_BLOCK_MEMORY */ +inline buf_block_t *buf_block_alloc() +{ + return buf_LRU_get_free_block(false); +} + /********************************************************************//** Frees a buffer block which does not contain a file page. */ UNIV_INLINE @@ -677,15 +244,7 @@ buf_block_free( buf_block_t* block) /*!< in, own: block to be freed */ { mutex_enter(&buf_pool.mutex); - - buf_page_mutex_enter(block); - - ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE); - buf_LRU_block_free_non_file_page(block); - - buf_page_mutex_exit(block); - mutex_exit(&buf_pool.mutex); } @@ -700,9 +259,9 @@ buf_block_modify_clock_inc( buf_block_t* block) /*!< in: block */ { /* No latch is acquired for the shared temporary tablespace. */ - ut_ad(fsp_is_system_temporary(block->page.id.space()) + ut_ad(fsp_is_system_temporary(block->page.id().space()) || (mutex_own(&buf_pool.mutex) - && block->page.buf_fix_count == 0) + && !block->page.buf_fix_count()) || rw_lock_own_flagged(&block->lock, RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX)); assert_block_ahi_valid(block); @@ -722,7 +281,7 @@ buf_block_get_modify_clock( { #ifdef UNIV_DEBUG /* No latch is acquired for the shared temporary tablespace. */ - if (!fsp_is_system_temporary(block->page.id.space())) { + if (!fsp_is_system_temporary(block->page.id().space())) { ut_ad(rw_lock_own(&(block->lock), RW_LOCK_S) || rw_lock_own(&(block->lock), RW_LOCK_X) || rw_lock_own(&(block->lock), RW_LOCK_SX)); @@ -748,7 +307,7 @@ buf_block_buf_fix_inc_func( /* No debug latch is acquired if block belongs to system temporary. 
Debug latch is not of much help if access to block is single threaded. */ - if (!fsp_is_system_temporary(block->page.id.space())) { + if (!fsp_is_system_temporary(block->page.id().space())) { ibool ret; ret = rw_lock_s_lock_nowait(block->debug_latch, file, line); ut_a(ret); @@ -772,44 +331,13 @@ buf_block_buf_fix_dec( /* No debug latch is acquired if block belongs to system temporary. Debug latch is not of much help if access to block is single threaded. */ - if (!fsp_is_system_temporary(block->page.id.space())) { + if (!fsp_is_system_temporary(block->page.id().space())) { rw_lock_s_unlock(block->debug_latch); } #endif /* UNIV_DEBUG */ } /** Returns the control block of a file page, NULL if not found. -@param[in] page_id page id -@return block, NULL if not found */ -inline buf_page_t *buf_page_hash_get_low(page_id_t page_id) -{ - buf_page_t* bpage; - -#ifdef UNIV_DEBUG - rw_lock_t* hash_lock; - - hash_lock = hash_get_lock(buf_pool.page_hash, page_id.fold()); - ut_ad(rw_lock_own(hash_lock, RW_LOCK_X) - || rw_lock_own(hash_lock, RW_LOCK_S)); -#endif /* UNIV_DEBUG */ - - /* Look for the page in the hash table */ - - HASH_SEARCH(hash, buf_pool.page_hash, page_id.fold(), buf_page_t*, - bpage, - ut_ad(bpage->in_page_hash && !bpage->in_zip_hash - && buf_page_in_file(bpage)), - page_id == bpage->id); - if (bpage) { - ut_a(buf_page_in_file(bpage)); - ut_ad(bpage->in_page_hash); - ut_ad(!bpage->in_zip_hash); - } - - return(bpage); -} - -/** Returns the control block of a file page, NULL if not found. If the block is found and lock is not NULL then the appropriate page_hash lock is acquired in the specified lock mode. Otherwise, mode value is ignored. It is up to the caller to release the @@ -819,8 +347,7 @@ lock is released by this function. @param[in,out] lock lock of the page hash acquired if bpage is found, NULL otherwise. If NULL is passed then the hash_lock is released by this function. -@param[in] lock_mode RW_LOCK_X or RW_LOCK_S. 
Ignored if -lock == NULL +@param[in] lock_mode RW_LOCK_X or RW_LOCK_S @param[in] watch if true, return watch sentinel also. @return pointer to the bpage or NULL; if NULL, lock is also NULL or a watch sentinel. */ @@ -832,63 +359,43 @@ buf_page_hash_get_locked( ulint lock_mode, bool watch) { - buf_page_t* bpage = NULL; - rw_lock_t* hash_lock; - ulint mode = RW_LOCK_S; + ut_ad(lock_mode == RW_LOCK_X || lock_mode == RW_LOCK_S); + ut_ad(lock || lock_mode == RW_LOCK_S); if (lock != NULL) { *lock = NULL; - ut_ad(lock_mode == RW_LOCK_X - || lock_mode == RW_LOCK_S); - mode = lock_mode; } - hash_lock = hash_get_lock(buf_pool.page_hash, page_id.fold()); - - ut_ad(!rw_lock_own(hash_lock, RW_LOCK_X) - && !rw_lock_own(hash_lock, RW_LOCK_S)); + const ulint fold= page_id.fold(); + rw_lock_t* hash_lock = lock_mode == RW_LOCK_S + ? buf_pool.page_hash_lock<false>(fold) + : buf_pool.page_hash_lock<true>(fold); - if (mode == RW_LOCK_S) { - rw_lock_s_lock(hash_lock); + buf_page_t* bpage = buf_pool.page_hash_get_low(page_id); - /* If not own buf_pool_mutex, page_hash can be changed. */ - hash_lock = hash_lock_s_confirm( - hash_lock, buf_pool.page_hash, page_id.fold()); - } else { - rw_lock_x_lock(hash_lock); - /* If not own buf_pool_mutex, page_hash can be changed. 
*/ - hash_lock = hash_lock_x_confirm( - hash_lock, buf_pool.page_hash, page_id.fold()); - } - - bpage = buf_page_hash_get_low(page_id); - - if (!bpage || buf_pool_watch_is_sentinel(bpage)) { + if (!bpage || buf_pool.watch_is_sentinel(*bpage)) { if (!watch) { bpage = NULL; } goto unlock_and_exit; } - ut_ad(buf_page_in_file(bpage)); - ut_ad(page_id == bpage->id); + ut_ad(bpage->in_file()); + ut_ad(page_id == bpage->id()); - if (lock == NULL) { - /* The caller wants us to release the page_hash lock */ - goto unlock_and_exit; - } else { + if (lock) { /* To be released by the caller */ *lock = hash_lock; - goto exit; + return bpage; } unlock_and_exit: - if (mode == RW_LOCK_S) { + if (lock_mode == RW_LOCK_S) { rw_lock_s_unlock(hash_lock); } else { rw_lock_x_unlock(hash_lock); } -exit: + return(bpage); } @@ -917,14 +424,12 @@ buf_block_hash_get_locked( buf_block_t* block = buf_page_get_block(bpage); if (block != NULL) { - - ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); + ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE); ut_ad(!lock || rw_lock_own(*lock, lock_mode)); - return(block); } else if (bpage) { /* It is not a block. Just a bpage */ - ut_ad(buf_page_in_file(bpage)); + ut_ad(bpage->in_file()); if (lock) { if (lock_mode == RW_LOCK_S) { @@ -937,7 +442,6 @@ buf_block_hash_get_locked( return(NULL); } - ut_ad(!bpage); ut_ad(lock == NULL ||*lock == NULL); return(NULL); } @@ -951,9 +455,9 @@ buf_page_release_zip( buf_page_t* bpage) /*!< in: buffer block */ { ut_ad(bpage); - ut_a(bpage->buf_fix_count > 0); + ut_a(bpage->buf_fix_count()); - switch (buf_page_get_state(bpage)) { + switch (bpage->state()) { case BUF_BLOCK_FILE_PAGE: #ifdef UNIV_DEBUG { @@ -961,20 +465,17 @@ buf_page_release_zip( temporary. Debug latch is not of much help if access to block is single threaded. 
*/ buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage); - if (!fsp_is_system_temporary(block->page.id.space())) { + if (!fsp_is_system_temporary(block->page.id().space())) { rw_lock_s_unlock(block->debug_latch); } } #endif /* UNIV_DEBUG */ /* Fall through */ case BUF_BLOCK_ZIP_PAGE: - case BUF_BLOCK_ZIP_DIRTY: reinterpret_cast<buf_block_t*>(bpage)->unfix(); return; - case BUF_BLOCK_POOL_WATCH: case BUF_BLOCK_NOT_USED: - case BUF_BLOCK_READY_FOR_USE: case BUF_BLOCK_MEMORY: case BUF_BLOCK_REMOVE_HASH: break; @@ -997,7 +498,7 @@ buf_page_release_latch( /* No debug latch is acquired if block belongs to system temporary. Debug latch is not of much help if access to block is single threaded. */ - if (!fsp_is_system_temporary(block->page.id.space())) { + if (!fsp_is_system_temporary(block->page.id().space())) { rw_lock_s_unlock(block->debug_latch); } #endif /* UNIV_DEBUG */ diff --git a/storage/innobase/include/buf0dblwr.h b/storage/innobase/include/buf0dblwr.h index 17fa84a85cc..fca2c61124e 100644 --- a/storage/innobase/include/buf0dblwr.h +++ b/storage/innobase/include/buf0dblwr.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -69,13 +69,8 @@ frees doublewrite buffer. */ void buf_dblwr_free(); -/********************************************************************//** -Updates the doublewrite buffer when an IO request is completed. */ -void -buf_dblwr_update( -/*=============*/ - const buf_page_t* bpage, /*!< in: buffer block descriptor */ - buf_flush_t flush_type);/*!< in: flush type */ +/** Update the doublewrite buffer on write completion. 
*/ +void buf_dblwr_update(const buf_page_t &bpage, bool single_page); /****************************************************************//** Determines if a page number is located inside the doublewrite buffer. @return TRUE if the location is inside the two blocks of the @@ -84,20 +79,6 @@ ibool buf_dblwr_page_inside( /*==================*/ ulint page_no); /*!< in: page number */ -/********************************************************************//** -Posts a buffer page for writing. If the doublewrite memory buffer is -full, calls buf_dblwr_flush_buffered_writes and waits for for free -space to appear. */ -void -buf_dblwr_add_to_batch( -/*====================*/ - buf_page_t* bpage); /*!< in: buffer block to write */ - -/********************************************************************//** -Flush a batch of writes to the datafiles that have already been -written to the dblwr buffer on disk. */ -void -buf_dblwr_sync_datafiles(); /********************************************************************//** Flushes possible buffered writes from the doublewrite memory buffer to disk. @@ -107,20 +88,6 @@ Otherwise a deadlock of threads can occur. */ void buf_dblwr_flush_buffered_writes(); -/********************************************************************//** -Writes a page to the doublewrite buffer on disk, sync it, then write -the page to the datafile and sync the datafile. This function is used -for single page flushes. If all the buffers allocated for single page -flushes in the doublewrite buffer are in use we wait here for one to -become free. We are guaranteed that a slot will become free because any -thread that is using a slot must also release the slot before leaving -this function. 
*/ -void -buf_dblwr_write_single_page( -/*========================*/ - buf_page_t* bpage, /*!< in: buffer block to write */ - bool sync); /*!< in: true if sync IO requested */ - /** Doublewrite control struct */ struct buf_dblwr_t{ ib_mutex_t mutex; /*!< mutex protecting the first_free @@ -140,9 +107,6 @@ struct buf_dblwr_t{ reserved for single page flushes. */ os_event_t s_event;/*!< event where threads wait for a single page flush slot. Protected by mutex. */ - bool* in_use; /*!< flag used to indicate if a slot is - in use. Only used for single page - flushes. */ bool batch_running;/*!< set to TRUE if currently a batch is being written from the doublewrite buffer. */ @@ -150,9 +114,37 @@ struct buf_dblwr_t{ doublewrite buffer, aligned to an address divisible by srv_page_size (which is required by Windows aio) */ - buf_page_t** buf_block_arr;/*!< array to store pointers to - the buffer blocks which have been - cached to write_buf */ + + struct element + { + /** block descriptor */ + buf_page_t *bpage; + /** flush type */ + IORequest::flush_t flush; + /** payload size in bytes */ + size_t size; + }; + + /** buffer blocks to be written via write_buf */ + element *buf_block_arr; + + /** Schedule a page write. If the doublewrite memory buffer is full, + buf_dblwr_flush_buffered_writes() will be invoked to make space. + @param bpage buffer pool page to be written + @param flush type of flush + @param size payload size in bytes */ + void add_to_batch(buf_page_t *bpage, IORequest::flush_t flush, size_t size); + /** Write a page to the doublewrite buffer on disk, sync it, then write + the page to the datafile and sync the datafile. This function is used + for single page flushes. If all the buffers allocated for single page + flushes in the doublewrite buffer are in use we wait here for one to + become free. We are guaranteed that a slot will become free because any + thread that is using a slot must also release the slot before leaving + this function. 
+ @param bpage buffer pool page to be written + @param sync whether synchronous operation is requested + @param size payload size in bytes */ + void write_single_page(buf_page_t *bpage, bool sync, size_t size); }; #endif diff --git a/storage/innobase/include/buf0flu.h b/storage/innobase/include/buf0flu.h index eb6351fb244..f7f89f1a9e9 100644 --- a/storage/innobase/include/buf0flu.h +++ b/storage/innobase/include/buf0flu.h @@ -70,10 +70,14 @@ buf_flush_relocate_on_flush_list( /*=============================*/ buf_page_t* bpage, /*!< in/out: control block being moved */ buf_page_t* dpage); /*!< in/out: destination block */ -/** Update the flush system data structures when a write is completed. -@param[in,out] bpage flushed page -@param[in] dblwr whether the doublewrite buffer was used */ -void buf_flush_write_complete(buf_page_t* bpage, bool dblwr); + +/** Complete write of a file page from buf_pool. +@param bpage written page +@param request write request +@param dblwr whether the doublewrite buffer was used +@param evict whether or not to evict the page from LRU list */ +void buf_page_write_complete(buf_page_t *bpage, const IORequest &request, + bool dblwr, bool evict); /** Assign the full crc32 checksum for non-compressed page. @param[in,out] page page to be updated */ @@ -91,19 +95,9 @@ buf_flush_init_for_writing( void* page_zip_, bool use_full_checksum); -# if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG -/** Writes a flushable page asynchronously from the buffer pool to a file. -NOTE: block and LRU list mutexes must be held upon entering this function, and -they will be released by this function after flushing. This is loosely based on -buf_flush_batch() and buf_flush_page(). -@param[in,out] block buffer control block -@return whether the page was flushed and the mutex released */ -bool buf_flush_page_try(buf_block_t* block) - MY_ATTRIBUTE((warn_unused_result)); -# endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ /** Do flushing batch of a given type. 
NOTE: The calling thread is not allowed to own any latches on pages! -@param[in] type flush type +@param[in] lru true=buf_pool.LRU; false=buf_pool.flush_list @param[in] min_n wished minimum mumber of blocks flushed (it is not guaranteed that the actual number is that big, though) @param[in] lsn_limit in the case BUF_FLUSH_LIST all blocks whose @@ -113,12 +107,8 @@ does not exceed min_n), otherwise ignored passed back to caller. Ignored if NULL @retval true if a batch was queued successfully. @retval false if another batch of same type was already running. */ -bool -buf_flush_do_batch( - buf_flush_t type, - ulint min_n, - lsn_t lsn_limit, - flush_counters_t* n); +bool buf_flush_do_batch(bool lru, ulint min_n, lsn_t lsn_limit, + flush_counters_t *n); /** This utility flushes dirty blocks from the end of the flush list. NOTE: The calling thread is not allowed to own any latches on pages! @@ -144,8 +134,8 @@ is not fast enough to keep pace with the workload. bool buf_flush_single_page_from_LRU(); /** Wait until a flush batch ends. -@param[in] type BUF_FLUSH_LRU or BUF_FLUSH_LIST */ -void buf_flush_wait_batch_end(buf_flush_t type); +@param[in] lru true=buf_pool.LRU; false=buf_pool.flush_list */ +void buf_flush_wait_batch_end(bool lru); /** Wait until a flush batch of the given lsn ends @param[in] new_oldest target oldest_modified_lsn to wait for */ void buf_flush_wait_flushed(lsn_t new_oldest); @@ -162,15 +152,6 @@ buf_flush_note_modification( set of mtr's */ lsn_t end_lsn); /*!< in: end lsn of the last mtr in the set of mtr's */ -/********************************************************************//** -Returns TRUE if the file page block is immediately suitable for replacement, -i.e., transition FILE_PAGE => NOT_USED allowed. 
-@return TRUE if can replace immediately */ -ibool -buf_flush_ready_for_replace( -/*========================*/ - buf_page_t* bpage); /*!< in: buffer control block, must be - buf_page_in_file(bpage) and in the LRU list */ /** Initialize page_cleaner. */ void buf_flush_page_cleaner_init(); @@ -178,10 +159,10 @@ void buf_flush_page_cleaner_init(); /** Wait for any possible LRU flushes to complete. */ void buf_flush_wait_LRU_batch_end(); -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +#ifdef UNIV_DEBUG /** Validate the flush list. */ void buf_flush_validate(); -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ +#endif /* UNIV_DEBUG */ /********************************************************************//** Initialize the red-black tree to speed up insertions into the flush_list @@ -197,29 +178,16 @@ void buf_flush_free_flush_rbt(void); /*==========================*/ -/** Write a flushable page asynchronously from the buffer pool to a file. -NOTE: 1. in simulated aio we must call os_aio_simulated_wake_handler_threads -after we have posted a batch of writes! 2. buf_page_get_mutex(bpage) must be -held upon entering this function. The LRU list mutex must be held if flush_type -== BUF_FLUSH_SINGLE_PAGE. Both mutexes will be released by this function if it -returns true. -@param[in] bpage buffer control block -@param[in] flush_type type of flush -@param[in] sync true if sync IO request -@return whether the page was flushed */ -bool buf_flush_page(buf_page_t* bpage, buf_flush_t flush_type, bool sync); - -/** Check if the block is modified and ready for flushing. 
-@param[in] bpage buffer control block, must be buf_page_in_file() -@param[in] flush_type type of flush -@return true if can flush immediately */ -bool -buf_flush_ready_for_flush( -/*======================*/ - buf_page_t* bpage, /*!< in: buffer control block, must be - buf_page_in_file(bpage) */ - buf_flush_t flush_type)/*!< in: type of flush */ - MY_ATTRIBUTE((warn_unused_result)); +/** Write a flushable page from buf_pool to a file. +buf_pool.mutex must be held. +@param bpage buffer control block +@param flush_type type of flush +@param space tablespace (or nullptr if not known) +@param sync whether this is a synchronous request + (only for flush_type=SINGLE_PAGE) +@return whether the page was flushed and buf_pool.mutex was released */ +bool buf_flush_page(buf_page_t *bpage, IORequest::flush_t flush_type, + fil_space_t *space, bool sync); /** Synchronously flush dirty blocks. NOTE: The calling thread is not allowed to hold any buffer page latches! */ diff --git a/storage/innobase/include/buf0flu.ic b/storage/innobase/include/buf0flu.ic index 4a3ecb57938..5f298c69e6d 100644 --- a/storage/innobase/include/buf0flu.ic +++ b/storage/innobase/include/buf0flu.ic @@ -52,11 +52,10 @@ buf_flush_note_modification( lsn_t end_lsn) /*!< in: end lsn of the mtr that modified this block */ { - mutex_enter(&block->mutex); ut_ad(!srv_read_only_mode - || fsp_is_system_temporary(block->page.id.space())); - ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); - ut_ad(block->page.buf_fix_count > 0); + || fsp_is_system_temporary(block->page.id().space())); + ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE); + ut_ad(block->page.buf_fix_count()); ut_ad(mach_read_from_8(block->frame + FIL_PAGE_LSN) <= end_lsn); mach_write_to_8(block->frame + FIL_PAGE_LSN, end_lsn); if (UNIV_LIKELY_NULL(block->page.zip.data)) { @@ -64,13 +63,13 @@ buf_flush_note_modification( FIL_PAGE_LSN + block->frame, 8); } - if (block->page.oldest_modification == 0) { + const lsn_t oldest_modification = 
block->page.oldest_modification(); + + if (!oldest_modification) { buf_flush_insert_into_flush_list(block, start_lsn); } else { - ut_ad(block->page.oldest_modification <= start_lsn); + ut_ad(oldest_modification <= start_lsn); } - mutex_exit(&block->mutex); - srv_stats.buf_pool_write_requests.inc(); } diff --git a/storage/innobase/include/buf0lru.h b/storage/innobase/include/buf0lru.h index eebe7bf5f1d..30400102db3 100644 --- a/storage/innobase/include/buf0lru.h +++ b/storage/innobase/include/buf0lru.h @@ -34,10 +34,6 @@ Created 11/5/1995 Heikki Tuuri struct trx_t; struct fil_space_t; -/** @return whether less than 1/4 of the buffer pool is available */ -bool -buf_LRU_buf_pool_running_out(); - /*####################################################################### These are low-level functions #########################################################################*/ @@ -51,33 +47,24 @@ These are low-level functions @param[in] first first page to be flushed or evicted */ void buf_LRU_flush_or_remove_pages(ulint id, bool flush, ulint first = 0); -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +#ifdef UNIV_DEBUG /********************************************************************//** Insert a compressed block into buf_pool.zip_clean in the LRU order. */ void buf_LRU_insert_zip_clean( /*=====================*/ buf_page_t* bpage); /*!< in: pointer to the block in question */ -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ - -/******************************************************************//** -Try to free a block. If bpage is a descriptor of a compressed-only -page, the descriptor object will be freed as well. - -NOTE: If this function returns true, it will temporarily -release buf_pool.mutex. Furthermore, the page frame will no longer be -accessible via bpage. - -The caller must hold buf_pool.mutex and must not hold any -buf_page_get_mutex() when calling this function. -@return true if freed, false otherwise. 
*/ -bool -buf_LRU_free_page( -/*==============*/ - buf_page_t* bpage, /*!< in: block to be freed */ - bool zip) /*!< in: true if should remove also the - compressed page of an uncompressed page */ - MY_ATTRIBUTE((nonnull)); +#endif /* UNIV_DEBUG */ + +/** Try to free a block. If bpage is a descriptor of a compressed-only +ROW_FORMAT=COMPRESSED page, the buf_page_t object will be freed as well. +The caller must hold buf_pool.mutex. +@param bpage block to be freed +@param zip whether to remove both copies of a ROW_FORMAT=COMPRESSED page +@retval true if freed and buf_pool.mutex may have been temporarily released +@retval false if the page was not freed */ +bool buf_LRU_free_page(buf_page_t *bpage, bool zip) + MY_ATTRIBUTE((nonnull)); /** Try to free a replaceable block. @param[in] scan_all true=scan the whole LRU list, @@ -92,6 +79,7 @@ buf_block_t* buf_LRU_get_free_only(); /** Get a free block from the buf_pool. The block is taken off the free list. If free list is empty, blocks are moved from the end of the LRU list to the free list. + This function is called from a user thread when it needs a clean block to read in a page. Note that we only ever get a block from the free list. Even when we flush a page or find a page in LRU scan @@ -111,8 +99,10 @@ we put it to free list to be used. 
* scan LRU list even if buf_pool.try_LRU_scan is not set * iteration > 1: * same as iteration 1 but sleep 10ms -@return the free control block, in state BUF_BLOCK_READY_FOR_USE */ -buf_block_t* buf_LRU_get_free_block() + +@param have_mutex whether buf_pool.mutex is already being held +@return the free control block, in state BUF_BLOCK_MEMORY */ +buf_block_t* buf_LRU_get_free_block(bool have_mutex) MY_ATTRIBUTE((malloc,warn_unused_result)); /** @return whether the unzip_LRU list should be used for evicting a victim @@ -131,7 +121,7 @@ void buf_LRU_add_block( /*==============*/ buf_page_t* bpage, /*!< in: control block */ - ibool old); /*!< in: TRUE if should be put to the old + bool old); /*!< in: true if should be put to the old blocks in the LRU list, else put to the start; if the LRU list is very short, added to the start regardless of this parameter */ @@ -163,25 +153,21 @@ void buf_LRU_stat_update(); /** Remove one page from LRU list and put it to free list. -@param[in,out] bpage block, must contain a file page and be in - a freeable state; there may or may not be a - hash index to the page -@param[in] old_page_id page number before bpage->id was invalidated */ -void buf_LRU_free_one_page(buf_page_t* bpage, page_id_t old_page_id) - MY_ATTRIBUTE((nonnull)); - -/** Adjust LRU hazard pointers if needed. -@param[in] bpage buffer page descriptor */ -void buf_LRU_adjust_hp(const buf_page_t* bpage); - -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +@param bpage file page to be freed +@param id page identifier +@param hash_lock buf_pool.page_hash latch (will be released here) */ +void buf_LRU_free_one_page(buf_page_t *bpage, const page_id_t id, + rw_lock_t *hash_lock) + MY_ATTRIBUTE((nonnull)); + +#ifdef UNIV_DEBUG /** Validate the LRU list. 
*/ void buf_LRU_validate(); -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ -#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +#endif /* UNIV_DEBUG */ +#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG /** Dump the LRU list to stderr. */ void buf_LRU_print(); -#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */ +#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG */ /** @name Heuristics for detecting index scan @{ */ /** The denominator of buf_pool.LRU_old_ratio. */ diff --git a/storage/innobase/include/buf0types.h b/storage/innobase/include/buf0types.h index 611fadae677..1802bd57ddd 100644 --- a/storage/innobase/include/buf0types.h +++ b/storage/innobase/include/buf0types.h @@ -44,16 +44,6 @@ struct buf_dblwr_t; /** A buffer frame. @see page_t */ typedef byte buf_frame_t; -/** Flags for flush types */ -enum buf_flush_t { - BUF_FLUSH_LRU = 0, /*!< flush via the LRU list */ - BUF_FLUSH_LIST, /*!< flush via the flush list - of dirty blocks */ - BUF_FLUSH_SINGLE_PAGE, /*!< flush via the LRU list - but only a single page */ - BUF_FLUSH_N_TYPES /*!< index of last element + 1 */ -}; - /** Flags for io_fix types */ enum buf_io_fix { BUF_IO_NONE = 0, /**< no pending I/O */ @@ -141,11 +131,30 @@ public: ut_ad(page_no <= 0xFFFFFFFFU); } - page_id_t(ulonglong id) : m_id(id) {} + page_id_t(uint64_t id) : m_id(id) {} bool operator==(const page_id_t& rhs) const { return m_id == rhs.m_id; } bool operator!=(const page_id_t& rhs) const { return m_id != rhs.m_id; } - bool operator<(const page_id_t& rhs) const { return m_id < rhs.m_id; } + bool operator>(const page_id_t& rhs) const { return m_id > rhs.m_id; } + bool operator<=(const page_id_t& rhs) const { return m_id <= rhs.m_id; } + bool operator>=(const page_id_t& rhs) const { return m_id >= rhs.m_id; } + page_id_t &operator--() { ut_ad(page_no()); m_id--; return *this; } + page_id_t &operator++() + { + ut_ad(page_no() < 0xFFFFFFFFU); + m_id++; + return *this; + } + page_id_t operator-(uint32_t i) const + { 
+ ut_ad(page_no() >= i); + return page_id_t(m_id - i); + } + page_id_t operator+(uint32_t i) const + { + ut_ad(page_no() < ~i); + return page_id_t(m_id + i); + } /** Retrieve the tablespace id. @return tablespace id */ @@ -167,9 +176,6 @@ public: m_id= (m_id & ~uint64_t{0} << 32) | page_no; } - /** Set the FIL_NULL for the space and page_no */ - void set_corrupt_id() { m_id= ~uint64_t{0}; } - ulonglong raw() { return m_id; } private: /** The page identifier */ @@ -187,10 +193,8 @@ extern const byte field_ref_zero[UNIV_PAGE_SIZE_MAX]; #include "ut0mutex.h" #include "sync0rw.h" -typedef ib_bpmutex_t BPageMutex; typedef ib_mutex_t BufPoolMutex; typedef ib_mutex_t FlushListMutex; -typedef BPageMutex BufPoolZipMutex; typedef rw_lock_t BPageLock; #endif /* !UNIV_INNOCHECKSUM */ diff --git a/storage/innobase/include/dict0dict.h b/storage/innobase/include/dict0dict.h index 867c0c215f3..92f7435ab6d 100644 --- a/storage/innobase/include/dict0dict.h +++ b/storage/innobase/include/dict0dict.h @@ -1197,7 +1197,7 @@ dict_index_get_if_in_cache_low( /*===========================*/ index_id_t index_id) /*!< in: index id */ MY_ATTRIBUTE((warn_unused_result)); -#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG +#ifdef UNIV_DEBUG /**********************************************************************//** Returns an index object if it is found in the dictionary cache. @return index, NULL if not found */ @@ -1206,8 +1206,6 @@ dict_index_get_if_in_cache( /*=======================*/ index_id_t index_id) /*!< in: index id */ MY_ATTRIBUTE((warn_unused_result)); -#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ -#ifdef UNIV_DEBUG /**********************************************************************//** Checks that a tuple has n_fields_cmp value in a sensible range, so that no comparison can occur with the page number field in a node pointer. 
diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h index 1832fd18afe..bfacd0cbd2a 100644 --- a/storage/innobase/include/fil0fil.h +++ b/storage/innobase/include/fil0fil.h @@ -46,6 +46,53 @@ struct rotation_list_tag_t; // Forward declaration extern my_bool srv_use_doublewrite_buf; + +/** Possible values of innodb_flush_method */ +enum srv_flush_t +{ + /** fsync, the default */ + SRV_FSYNC= 0, + /** open log files in O_DSYNC mode */ + SRV_O_DSYNC, + /** do not call os_file_flush() when writing data files, but do flush + after writing to log files */ + SRV_LITTLESYNC, + /** do not flush after writing */ + SRV_NOSYNC, + /** invoke os_file_set_nocache() on data files. This implies using + non-buffered IO but still using fsync, the reason for which is that + some FS do not flush meta-data when unbuffered IO happens */ + SRV_O_DIRECT, + /** do not use fsync() when using direct IO i.e.: it can be set to + avoid the fsync() call that we make when using SRV_O_DIRECT. + However, in this case user/DBA should be sure about the integrity of + the meta-data */ + SRV_O_DIRECT_NO_FSYNC +#ifdef _WIN32 + /** Traditional Windows approach to open all files without caching, + and do FlushFileBuffers() */ + ,SRV_ALL_O_DIRECT_FSYNC +#endif +}; + +/** innodb_flush_method */ +extern ulong srv_file_flush_method; + +/** Undo tablespaces starts with space_id. */ +extern ulint srv_undo_space_id_start; +/** The number of UNDO tablespaces that are open and ready to use. */ +extern ulint srv_undo_tablespaces_open; + +/** Check whether given space id is undo tablespace id +@param[in] space_id space id to check +@return true if it is undo tablespace else false. 
*/ +inline bool srv_is_undo_tablespace(ulint space_id) +{ + return srv_undo_space_id_start > 0 && + space_id >= srv_undo_space_id_start && + space_id < srv_undo_space_id_start + srv_undo_tablespaces_open; +} + extern struct buf_dblwr_t* buf_dblwr; class page_id_t; @@ -249,6 +296,24 @@ struct fil_space_t void release_for_io() { ut_ad(pending_io()); n_pending_ios--; } /** @return whether I/O is pending */ bool pending_io() const { return n_pending_ios; } + + /** @return whether the tablespace file can be closed and reopened */ + bool belongs_in_lru() const + { + switch (purpose) { + case FIL_TYPE_TEMPORARY: + ut_ad(id == SRV_TMP_SPACE_ID); + return false; + case FIL_TYPE_IMPORT: + ut_ad(id != SRV_TMP_SPACE_ID); + return true; + case FIL_TYPE_TABLESPACE: + ut_ad(id != SRV_TMP_SPACE_ID); + return id && !srv_is_undo_tablespace(id); + } + ut_ad(0); + return false; + } #endif /* !UNIV_INNOCHECKSUM */ /** FSP_SPACE_FLAGS and FSP_FLAGS_MEM_ flags; check fsp0types.h to more info about flags. */ @@ -586,8 +651,13 @@ struct fil_node_t { #endif ); - /** Close the file handle. */ - void close(); + /** Close the file handle. */ + void close(); + /** Prepare to free a file from fil_system. */ + inline void close_to_free(); + + /** Update the data structures on I/O completion */ + inline void complete_io(bool write= false); }; /** Value of fil_node_t::magic_n */ @@ -903,6 +973,9 @@ public: } #endif public: + /** Detach a tablespace from the cache and close the files. */ + inline void detach(fil_space_t *space); + ib_mutex_t mutex; /*!< The mutex protecting the cache */ fil_space_t* sys_space; /*!< The innodb_system tablespace */ fil_space_t* temp_space; /*!< The innodb_temporary tablespace */ @@ -979,6 +1052,41 @@ public: /** The tablespace memory cache. 
*/ extern fil_system_t fil_system; +/** Update the data structures on I/O completion */ +inline void fil_node_t::complete_io(bool write) +{ + ut_ad(mutex_own(&fil_system.mutex)); + + if (write) + { + if (srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC) + { + /* We don't need to keep track of unflushed changes as user has + explicitly disabled buffering. */ + ut_ad(!space->is_in_unflushed_spaces); + ut_ad(!needs_flush); + } + else if (!space->is_stopping()) + { + needs_flush= true; + if (!space->is_in_unflushed_spaces) + { + space->is_in_unflushed_spaces= true; + fil_system.unflushed_spaces.push_front(*space); + } + } + } + + switch (n_pending--) { + case 0: + ut_error; + case 1: + if (space->belongs_in_lru()) + /* The node must be put back to the LRU list */ + UT_LIST_ADD_FIRST(fil_system.LRU, this); + } +} + #include "fil0crypt.h" /** Returns the latch of a file space. @@ -1051,14 +1159,9 @@ database server shutdown. This should be called at a server startup after the space objects for the system tablespace have been created. The purpose of this operation is to make sure we never run out of file descriptors if we need to read from the insert buffer. */ -void -fil_open_system_tablespace_files(); -/*==========================================*/ -/*******************************************************************//** -Closes all open files. There must not be any pending i/o's or not flushed -modifications in the files. */ -void -fil_close_all_files(void); +void fil_open_system_tablespace_files(); +/** Close all tablespace files at shutdown */ +void fil_close_all_files(); /*******************************************************************//** Sets the max tablespace id counter if the given number is bigger than the previous value. 
*/ @@ -1191,15 +1294,10 @@ dberr_t fil_delete_tablespace(ulint id, bool if_exists= false); @retval NULL if the tablespace does not exist */ fil_space_t* fil_truncate_prepare(ulint space_id); -/*******************************************************************//** -Closes a single-table tablespace. The tablespace must be cached in the -memory cache. Free all pages used by the tablespace. -@return DB_SUCCESS or error */ -dberr_t -fil_close_tablespace( -/*=================*/ - trx_t* trx, /*!< in/out: Transaction covering the close */ - ulint id); /*!< in: space id */ +/** Close a single-table tablespace on failed IMPORT TABLESPACE. +The tablespace must be cached in the memory cache. +Free all pages used by the tablespace. */ +void fil_close_tablespace(ulint id); /*******************************************************************//** Allocates and builds a file name from a path, a table or tablespace name @@ -1350,6 +1448,14 @@ fil_space_extend( fil_space_t* space, ulint size); +struct fil_io_t +{ + /** error code */ + dberr_t err; + /** file; node->space->release_for_io() must follow fil_io(sync=true) call */ + fil_node_t *node; +}; + /** Reads or writes data. This operation could be asynchronous (aio). 
@param[in] type IO context @@ -1366,12 +1472,11 @@ fil_space_extend( aligned @param[in] message message for aio handler if non-sync aio used, else ignored -@param[in] ignore whether to ignore out-of-bounds page_id +@param[in] ignore whether to ignore errors @param[in] punch_hole punch the hole to the file for page_compressed tablespace -@return DB_SUCCESS, or DB_TABLESPACE_DELETED -if we are trying to do i/o on a tablespace which does not exist */ -dberr_t +@return status and file descriptor */ +fil_io_t fil_io( const IORequest& type, bool sync, diff --git a/storage/innobase/include/fsp0fsp.h b/storage/innobase/include/fsp0fsp.h index 7a1385598db..e5bc4c8a6ed 100644 --- a/storage/innobase/include/fsp0fsp.h +++ b/storage/innobase/include/fsp0fsp.h @@ -580,7 +580,7 @@ inline void fsp_init_file_page( buf_block_t* block, mtr_t* mtr) { ut_d(space->modify_check(*mtr)); - ut_ad(space->id == block->page.id.space()); + ut_ad(space->id == block->page.id().space()); fsp_apply_init_file_page(block); mtr->init(block); } diff --git a/storage/innobase/include/ha0ha.h b/storage/innobase/include/ha0ha.h index 1944309c8ec..5822fde2d04 100644 --- a/storage/innobase/include/ha0ha.h +++ b/storage/innobase/include/ha0ha.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2018, MariaDB Corporation. +Copyright (c) 2018, 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -210,27 +210,12 @@ struct ha_node_t { #endif /* BTR_CUR_HASH_ADAPT */ #if defined UNIV_DEBUG && defined BTR_CUR_HASH_ADAPT -/********************************************************************//** -Assert that the synchronization object in a hash operation involving -possible change in the hash table is held. 
-Note that in case of mutexes we assert that mutex is owned while in case -of rw-locks we assert that it is held in exclusive mode. */ -UNIV_INLINE -void -hash_assert_can_modify( -/*===================*/ - hash_table_t* table, /*!< in: hash table */ - ulint fold); /*!< in: fold value */ -/********************************************************************//** -Assert that the synchronization object in a hash search operation is held. -Note that in case of mutexes we assert that mutex is owned while in case -of rw-locks we assert that it is held either in x-mode or s-mode. */ -UNIV_INLINE -void -hash_assert_can_search( -/*===================*/ - hash_table_t* table, /*!< in: hash table */ - ulint fold); /*!< in: fold value */ +/** Assert that the synchronization object in a hash operation involving +possible change in the hash table is held in exclusive mode */ +void hash_assert_can_modify(hash_table_t *table, ulint fold); +/** Assert that the synchronization object in a hash search operation +is held in shared or exclusive mode */ +void hash_assert_can_search(hash_table_t *table, ulint fold); #else /* UNIV_DEBUG */ #define hash_assert_can_modify(t, f) #define hash_assert_can_search(t, f) diff --git a/storage/innobase/include/ha0ha.ic b/storage/innobase/include/ha0ha.ic index 0612ef1bb25..c83f4c35e81 100644 --- a/storage/innobase/include/ha0ha.ic +++ b/storage/innobase/include/ha0ha.ic @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1994, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2018, MariaDB Corporation. +Copyright (c) 2018, 2020, MariaDB Corporation. 
This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -99,53 +99,6 @@ ha_chain_get_first( hash_get_nth_cell(table, hash_calc_hash(fold, table))->node); } -#ifdef UNIV_DEBUG -/********************************************************************//** -Assert that the synchronization object in a hash operation involving -possible change in the hash table is held. -Note that in case of mutexes we assert that mutex is owned while in case -of rw-locks we assert that it is held in exclusive mode. */ -UNIV_INLINE -void -hash_assert_can_modify( -/*===================*/ - hash_table_t* table, /*!< in: hash table */ - ulint fold) /*!< in: fold value */ -{ - if (table->type == HASH_TABLE_SYNC_MUTEX) { - ut_ad(mutex_own(hash_get_mutex(table, fold))); - } else if (table->type == HASH_TABLE_SYNC_RW_LOCK) { -# ifdef UNIV_DEBUG - rw_lock_t* lock = hash_get_lock(table, fold); - ut_ad(rw_lock_own(lock, RW_LOCK_X)); -# endif - } else { - ut_ad(table->type == HASH_TABLE_SYNC_NONE); - } -} - -/********************************************************************//** -Assert that the synchronization object in a hash search operation is held. -Note that in case of mutexes we assert that mutex is owned while in case -of rw-locks we assert that it is held either in x-mode or s-mode. */ -UNIV_INLINE -void -hash_assert_can_search( -/*===================*/ - hash_table_t* table, /*!< in: hash table */ - ulint fold) /*!< in: fold value */ -{ - if (table->type == HASH_TABLE_SYNC_MUTEX) { - ut_ad(mutex_own(hash_get_mutex(table, fold))); - } else if (table->type == HASH_TABLE_SYNC_RW_LOCK) { - ut_ad(rw_lock_own_flagged(hash_get_lock(table, fold), - RW_LOCK_FLAG_X | RW_LOCK_FLAG_S)); - } else { - ut_ad(table->type == HASH_TABLE_SYNC_NONE); - } -} -#endif /* UNIV_DEBUG */ - /*************************************************************//** Looks for an element in a hash table. 
@return pointer to the data of the first hash table node in chain diff --git a/storage/innobase/include/hash0hash.h b/storage/innobase/include/hash0hash.h index 4f55b051d80..94a5f3a02f7 100644 --- a/storage/innobase/include/hash0hash.h +++ b/storage/innobase/include/hash0hash.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2018, MariaDB Corporation. +Copyright (c) 2018, 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -404,15 +404,6 @@ hash_get_nth_mutex( hash_table_t* table, /*!< in: hash table */ ulint i); /*!< in: index of the mutex */ /************************************************************//** -Gets the nth rw_lock in a hash table. -@return rw_lock */ -UNIV_INLINE -rw_lock_t* -hash_get_nth_lock( -/*==============*/ - hash_table_t* table, /*!< in: hash table */ - ulint i); /*!< in: index of the rw_lock */ -/************************************************************//** Gets the mutex for a fold value in a hash table. @return mutex */ UNIV_INLINE @@ -421,61 +412,6 @@ hash_get_mutex( /*===========*/ hash_table_t* table, /*!< in: hash table */ ulint fold); /*!< in: fold */ -/************************************************************//** -Gets the rw_lock for a fold value in a hash table. -@return rw_lock */ -UNIV_INLINE -rw_lock_t* -hash_get_lock( -/*==========*/ - hash_table_t* table, /*!< in: hash table */ - ulint fold); /*!< in: fold */ - -/** If not appropriate rw_lock for a fold value in a hash table, -relock S-lock the another rw_lock until appropriate for a fold value. 
-@param[in] hash_lock latched rw_lock to be confirmed -@param[in] table hash table -@param[in] fold fold value -@return latched rw_lock */ -UNIV_INLINE -rw_lock_t* -hash_lock_s_confirm( - rw_lock_t* hash_lock, - hash_table_t* table, - ulint fold); - -/** If not appropriate rw_lock for a fold value in a hash table, -relock X-lock the another rw_lock until appropriate for a fold value. -@param[in] hash_lock latched rw_lock to be confirmed -@param[in] table hash table -@param[in] fold fold value -@return latched rw_lock */ -UNIV_INLINE -rw_lock_t* -hash_lock_x_confirm( - rw_lock_t* hash_lock, - hash_table_t* table, - ulint fold); - -/************************************************************//** -Reserves all the locks of a hash table, in an ascending order. */ -void -hash_lock_x_all( -/*============*/ - hash_table_t* table); /*!< in: hash table */ -/************************************************************//** -Releases all the locks of a hash table, in an ascending order. */ -void -hash_unlock_x_all( -/*==============*/ - hash_table_t* table); /*!< in: hash table */ -/************************************************************//** -Releases all but passed in lock of a hash table, */ -void -hash_unlock_x_all_but( -/*==================*/ - hash_table_t* table, /*!< in: hash table */ - rw_lock_t* keep_lock); /*!< in: lock to keep */ struct hash_cell_t{ void* node; /*!< hash chain node, NULL if none */ @@ -503,9 +439,9 @@ struct hash_table_t { ib_mutex_t* mutexes;/* NULL, or an array of mutexes used to protect segments of the hash table */ - rw_lock_t* rw_locks;/* NULL, or an array of rw_lcoks + rw_lock_t* rw_locks;/* NULL, or an array of rw_locks used to protect segments of the - hash table */ + buf_pool.page_hash */ } sync_obj; mem_heap_t** heaps; /*!< if this is non-NULL, hash diff --git a/storage/innobase/include/hash0hash.ic b/storage/innobase/include/hash0hash.ic index d6dd104572f..cc717ec8559 100644 --- a/storage/innobase/include/hash0hash.ic +++ 
b/storage/innobase/include/hash0hash.ic @@ -1,6 +1,7 @@ /***************************************************************************** Copyright (c) 1997, 2015, Oracle and/or its affiliates. All Rights Reserved. +Copyright (c) 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -23,8 +24,6 @@ The simple hash table utility Created 5/20/1997 Heikki Tuuri *******************************************************/ -#include "ut0rnd.h" - /************************************************************//** Gets the nth cell in a hash table. @return pointer to cell */ @@ -182,96 +181,3 @@ hash_get_mutex( return(hash_get_nth_mutex(table, i)); } - -/************************************************************//** -Gets the nth rw_lock in a hash table. -@return rw_lock */ -UNIV_INLINE -rw_lock_t* -hash_get_nth_lock( -/*==============*/ - hash_table_t* table, /*!< in: hash table */ - ulint i) /*!< in: index of the rw_lock */ -{ - ut_ad(table); - ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); - ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK); - ut_ad(i < table->n_sync_obj); - - return(table->sync_obj.rw_locks + i); -} - -/************************************************************//** -Gets the rw_lock for a fold value in a hash table. -@return rw_lock */ -UNIV_INLINE -rw_lock_t* -hash_get_lock( -/*==========*/ - hash_table_t* table, /*!< in: hash table */ - ulint fold) /*!< in: fold */ -{ - ulint i; - - ut_ad(table); - ut_ad(table->type == HASH_TABLE_SYNC_RW_LOCK); - ut_ad(table->magic_n == HASH_TABLE_MAGIC_N); - - i = hash_get_sync_obj_index(table, fold); - - return(hash_get_nth_lock(table, i)); -} - -/** If not appropriate rw_lock for a fold value in a hash table, -relock S-lock the another rw_lock until appropriate for a fold value. 
-@param[in] hash_lock latched rw_lock to be confirmed -@param[in] table hash table -@param[in] fold fold value -@return latched rw_lock */ -UNIV_INLINE -rw_lock_t* -hash_lock_s_confirm( - rw_lock_t* hash_lock, - hash_table_t* table, - ulint fold) -{ - ut_ad(rw_lock_own(hash_lock, RW_LOCK_S)); - - rw_lock_t* hash_lock_tmp = hash_get_lock(table, fold); - - while (hash_lock_tmp != hash_lock) { - rw_lock_s_unlock(hash_lock); - hash_lock = hash_lock_tmp; - rw_lock_s_lock(hash_lock); - hash_lock_tmp = hash_get_lock(table, fold); - } - - return(hash_lock); -} - -/** If not appropriate rw_lock for a fold value in a hash table, -relock X-lock the another rw_lock until appropriate for a fold value. -@param[in] hash_lock latched rw_lock to be confirmed -@param[in] table hash table -@param[in] fold fold value -@return latched rw_lock */ -UNIV_INLINE -rw_lock_t* -hash_lock_x_confirm( - rw_lock_t* hash_lock, - hash_table_t* table, - ulint fold) -{ - ut_ad(rw_lock_own(hash_lock, RW_LOCK_X)); - - rw_lock_t* hash_lock_tmp = hash_get_lock(table, fold); - - while (hash_lock_tmp != hash_lock) { - rw_lock_x_unlock(hash_lock); - hash_lock = hash_lock_tmp; - rw_lock_x_lock(hash_lock); - hash_lock_tmp = hash_get_lock(table, fold); - } - - return(hash_lock); -} diff --git a/storage/innobase/include/ib0mutex.h b/storage/innobase/include/ib0mutex.h index ce0e911dbb4..91bc69ac228 100644 --- a/storage/innobase/include/ib0mutex.h +++ b/storage/innobase/include/ib0mutex.h @@ -457,11 +457,7 @@ struct TTASEventMutex { sync_cell_t* cell; sync_array_t *sync_arr = sync_array_get_and_reserve_cell( - this, - (m_policy.get_id() == LATCH_ID_BUF_BLOCK_MUTEX - || m_policy.get_id() == LATCH_ID_BUF_POOL_ZIP) - ? 
SYNC_BUF_BLOCK - : SYNC_MUTEX, + this, SYNC_MUTEX, filename, line, &cell); uint32_t oldval = MUTEX_STATE_LOCKED; diff --git a/storage/innobase/include/lock0lock.ic b/storage/innobase/include/lock0lock.ic index 6c235f23680..a01d866e6c0 100644 --- a/storage/innobase/include/lock0lock.ic +++ b/storage/innobase/include/lock0lock.ic @@ -123,12 +123,12 @@ lock_rec_create( trx mutex */ { btr_assert_not_corrupted(block, index); + const page_id_t id(block->page.id()); return lock_rec_create_low( #ifdef WITH_WSREP c_lock, thr, #endif - type_mode, - block->page.id.space(), block->page.id.page_no(), + type_mode, id.space(), id.page_no(), block->frame, heap_no, index, trx, caller_owns_trx_mutex); } diff --git a/storage/innobase/include/lock0priv.ic b/storage/innobase/include/lock0priv.ic index 217b61e3625..7468110deeb 100644 --- a/storage/innobase/include/lock0priv.ic +++ b/storage/innobase/include/lock0priv.ic @@ -164,8 +164,8 @@ lock_rec_get_first_on_page( { ut_ad(lock_mutex_own()); - ulint space = block->page.id.space(); - ulint page_no = block->page.id.page_no(); + ulint space = block->page.id().space(); + ulint page_no = block->page.id().page_no(); ulint hash = buf_block_get_lock_hash_val(block); for (lock_t* lock = static_cast<lock_t*>( diff --git a/storage/innobase/include/log0log.h b/storage/innobase/include/log0log.h index 30c113dbd19..882b3416379 100644 --- a/storage/innobase/include/log0log.h +++ b/storage/innobase/include/log0log.h @@ -175,14 +175,9 @@ bool log_checkpoint(); /** Make a checkpoint */ void log_make_checkpoint(); -/****************************************************************//** -Makes a checkpoint at the latest lsn and writes it to first page of each -data file in the database, so that we know that the file spaces contain -all modifications up to that lsn. This can only be called at database -shutdown. This function also writes all log in log file to the log archive. 
*/ -void -logs_empty_and_mark_files_at_shutdown(void); -/*=======================================*/ +/** Make a checkpoint at the latest lsn on shutdown. */ +void logs_empty_and_mark_files_at_shutdown(); + /** Write checkpoint info to the log header and invoke log_mutex_exit(). @param[in] end_lsn start LSN of the FILE_CHECKPOINT mini-transaction */ void log_write_checkpoint_info(lsn_t end_lsn); diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h index 4fea9b8ec68..5e29a334d3a 100644 --- a/storage/innobase/include/log0recv.h +++ b/storage/innobase/include/log0recv.h @@ -211,7 +211,7 @@ struct recv_sys_t ib_mutex_t mutex; /** whether we are applying redo log records during crash recovery */ bool recovery_on; - /** whether recv_recover_page(), invoked from buf_page_io_complete(), + /** whether recv_recover_page(), invoked from buf_page_read_complete(), should apply log records*/ bool apply_log_recs; @@ -222,9 +222,8 @@ struct recv_sys_t page cleaner threads */ os_event_t flush_end;/*!< event to signal that the page cleaner has finished the request */ - buf_flush_t flush_type;/*!< type of the flush request. - BUF_FLUSH_LRU: flush end of LRU, keeping free blocks. - BUF_FLUSH_LIST: flush all of blocks. */ + /** whether to flush from buf_pool.LRU instead of buf_pool.flush_list */ + bool flush_lru; /** whether recv_apply_hashed_log_recs() is running */ bool apply_batch_on; byte* buf; /*!< buffer for parsing log records */ diff --git a/storage/innobase/include/mtr0log.h b/storage/innobase/include/mtr0log.h index a8840b187a8..afb9456ff30 100644 --- a/storage/innobase/include/mtr0log.h +++ b/storage/innobase/include/mtr0log.h @@ -226,7 +226,7 @@ inline void mtr_t::memset(const buf_block_t &b, ulint ofs, ulint len, byte val) static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency"); size_t lenlen= (len < MIN_2BYTE ? 1 + 1 : len < MIN_3BYTE ? 
2 + 1 : 3 + 1); - byte *l= log_write<MEMSET>(b.page.id, &b.page, lenlen, true, ofs); + byte *l= log_write<MEMSET>(b.page.id(), &b.page, lenlen, true, ofs); l= mlog_encode_varint(l, len); *l++= val; m_log.close(l); @@ -263,7 +263,7 @@ inline void mtr_t::memset(const buf_block_t &b, ulint ofs, size_t len, static_assert(MIN_4BYTE > UNIV_PAGE_SIZE_MAX, "consistency"); size_t lenlen= (len < MIN_2BYTE ? 1 : len < MIN_3BYTE ? 2 : 3); - byte *l= log_write<MEMSET>(b.page.id, &b.page, lenlen + size, true, ofs); + byte *l= log_write<MEMSET>(b.page.id(), &b.page, lenlen + size, true, ofs); l= mlog_encode_varint(l, len); ::memcpy(l, str, size); l+= size; @@ -320,13 +320,14 @@ inline void mtr_t::memcpy_low(const buf_block_t &block, uint16_t offset, return; if (len < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5)) { - byte *end= log_write<WRITE>(block.page.id, &block.page, len, true, offset); + byte *end= log_write<WRITE>(block.page.id(), &block.page, len, true, + offset); ::memcpy(end, data, len); m_log.close(end + len); } else { - m_log.close(log_write<WRITE>(block.page.id, &block.page, len, false, + m_log.close(log_write<WRITE>(block.page.id(), &block.page, len, false, offset)); m_log.push(static_cast<const byte*>(data), static_cast<uint32_t>(len)); } @@ -363,7 +364,7 @@ inline void mtr_t::memmove(const buf_block_t &b, ulint d, ulint s, ulint len) /* The source offset 0 is not possible. */ s-= 1 << 1; size_t slen= (s < MIN_2BYTE ? 1 : s < MIN_3BYTE ? 
2 : 3); - byte *l= log_write<MEMMOVE>(b.page.id, &b.page, lenlen + slen, true, d); + byte *l= log_write<MEMMOVE>(b.page.id(), &b.page, lenlen + slen, true, d); l= mlog_encode_varint(l, len); l= mlog_encode_varint(l, s); m_log.close(l); @@ -386,7 +387,7 @@ inline byte *mtr_t::log_write(const page_id_t id, const buf_page_t *bpage, static_assert(!(type & 15) && type != RESERVED && type != OPTION && type <= FILE_CHECKPOINT, "invalid type"); ut_ad(type >= FILE_CREATE || is_named_space(id.space())); - ut_ad(!bpage || bpage->id == id); + ut_ad(!bpage || bpage->id() == id); constexpr bool have_len= type != INIT_PAGE && type != FREE_PAGE; constexpr bool have_offset= type == WRITE || type == MEMSET || type == MEMMOVE; @@ -518,7 +519,7 @@ inline void mtr_t::init(buf_block_t *b) return; } - m_log.close(log_write<INIT_PAGE>(b->page.id, &b->page)); + m_log.close(log_write<INIT_PAGE>(b->page.id(), &b->page)); m_last_offset= FIL_PAGE_TYPE; } @@ -538,7 +539,7 @@ inline void mtr_t::log_write_extended(const buf_block_t &block, byte type) set_modified(block); if (m_log_mode != MTR_LOG_ALL) return; - byte *l= log_write<EXTENDED>(block.page.id, &block.page, 1, true); + byte *l= log_write<EXTENDED>(block.page.id(), &block.page, 1, true); *l++= type; m_log.close(l); m_last_offset= FIL_PAGE_TYPE; @@ -566,7 +567,7 @@ inline void mtr_t::page_delete(const buf_block_t &block, ulint prev_rec) if (m_log_mode != MTR_LOG_ALL) return; size_t len= (prev_rec < MIN_2BYTE ? 2 : prev_rec < MIN_3BYTE ? 3 : 4); - byte *l= log_write<EXTENDED>(block.page.id, &block.page, len, true); + byte *l= log_write<EXTENDED>(block.page.id(), &block.page, len, true); ut_d(byte *end= l + len); *l++= DELETE_ROW_FORMAT_REDUNDANT; l= mlog_encode_varint(l, prev_rec); @@ -595,7 +596,7 @@ inline void mtr_t::page_delete(const buf_block_t &block, ulint prev_rec, size_t len= prev_rec < MIN_2BYTE ? 2 : prev_rec < MIN_3BYTE ? 3 : 4; len+= hdr_size < MIN_2BYTE ? 1 : 2; len+= data_size < MIN_2BYTE ? 1 : data_size < MIN_3BYTE ? 
2 : 3; - byte *l= log_write<EXTENDED>(block.page.id, &block.page, len, true); + byte *l= log_write<EXTENDED>(block.page.id(), &block.page, len, true); ut_d(byte *end= l + len); *l++= DELETE_ROW_FORMAT_DYNAMIC; l= mlog_encode_varint(l, prev_rec); @@ -625,7 +626,7 @@ inline void mtr_t::undo_append(const buf_block_t &block, if (m_log_mode != MTR_LOG_ALL) return; const bool small= len + 1 < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5); - byte *end= log_write<EXTENDED>(block.page.id, &block.page, len + 1, small); + byte *end= log_write<EXTENDED>(block.page.id(), &block.page, len + 1, small); if (UNIV_LIKELY(small)) { *end++= UNDO_APPEND; diff --git a/storage/innobase/include/mtr0mtr.ic b/storage/innobase/include/mtr0mtr.ic index aee390cf8b2..16777da1044 100644 --- a/storage/innobase/include/mtr0mtr.ic +++ b/storage/innobase/include/mtr0mtr.ic @@ -28,16 +28,11 @@ Created 11/26/1995 Heikki Tuuri /** Check if a mini-transaction is dirtying a clean page. @return true if the mtr is dirtying a clean page. */ -bool -mtr_t::is_block_dirtied(const buf_block_t* block) +inline bool mtr_t::is_block_dirtied(const buf_block_t *block) { - ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); - ut_ad(block->page.buf_fix_count > 0); - - /* It is OK to read oldest_modification because no - other thread can be performing a write of it and it - is only during write that the value is reset to 0. 
*/ - return(block->page.oldest_modification == 0); + ut_ad(block->page.state() == BUF_BLOCK_FILE_PAGE); + ut_ad(block->page.buf_fix_count()); + return !block->page.oldest_modification(); } /** @@ -170,10 +165,10 @@ mtr_t::release_block_at_savepoint( ut_a(slot->object == block); - reinterpret_cast<buf_block_t*>(block)->unfix(); - buf_page_release_latch(block, slot->type); + reinterpret_cast<buf_block_t*>(block)->unfix(); + slot->object = NULL; } diff --git a/storage/innobase/include/os0api.h b/storage/innobase/include/os0api.h index 3be7c0afaa4..bd9dc5b73a1 100644 --- a/storage/innobase/include/os0api.h +++ b/storage/innobase/include/os0api.h @@ -35,15 +35,6 @@ class buf_page_t; struct fil_node_t; /** -Should we punch hole to deallocate unused portion of the page. -@param[in] bpage Page control block -@return true if punch hole should be used, false if not */ -bool -buf_page_should_punch_hole( - const buf_page_t* bpage) - MY_ATTRIBUTE((warn_unused_result)); - -/** Calculate the length of trim (punch_hole) operation. 
@param[in] bpage Page control block @param[in] write_length Write length diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index 543e478f649..08ea482333b 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -189,9 +189,26 @@ static const ulint OS_FILE_ERROR_MAX = 200; #define IORequestWrite IORequest(IORequest::WRITE) /** -The IO Context that is passed down to the low level IO code */ -class IORequest { +The I/O context that is passed down to the low level IO code */ +class IORequest +{ public: + /** Buffer pool flush types */ + enum flush_t + { + /** via buf_pool.LRU */ + LRU= 0, + /** via buf_pool.flush_list */ + FLUSH_LIST, + /** single page of buf_pool.LRU */ + SINGLE_PAGE + }; + + IORequest(ulint type= READ, buf_page_t *bpage= nullptr, + flush_t flush_type= LRU) : + m_bpage(bpage), m_type(static_cast<uint16_t>(type)), + m_flush_type(flush_type) {} + /** Flags passed in the request, they can be ORred together. */ enum { READ = 1, @@ -212,25 +229,6 @@ public: PUNCH_HOLE = 64, }; - /** - @param[in] type Request type, can be a value that is - ORed from the above enum - @param[in] bpage Page to be written */ - IORequest(ulint type= READ, buf_page_t *bpage= nullptr) - : m_bpage(bpage), m_type(static_cast<uint16_t>(type)) - { - if (bpage && buf_page_should_punch_hole(bpage)) { - set_punch_hole(); - } - - if (!is_punch_hole_supported()) { - clear_punch_hole(); - } - } - - /** Destructor */ - ~IORequest() { } - /** @return true if it is a read request */ bool is_read() const MY_ATTRIBUTE((warn_unused_result)) @@ -342,6 +340,9 @@ public: @return DB_SUCCESS or error code */ dberr_t punch_hole(os_file_t fh, os_offset_t off, ulint len); + /** @return the flush type */ + flush_t flush_type() const { return m_flush_type; } + private: /** Page to be written on write operation. 
*/ buf_page_t* const m_bpage= nullptr; @@ -351,6 +352,9 @@ private: /** Request type bit flags */ uint16_t m_type= READ; + + /** for writes, type of page flush */ + flush_t m_flush_type= LRU; }; /* @} */ diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index 23414448a71..2aa874edfad 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -278,30 +278,11 @@ extern char* srv_undo_dir; /** Number of undo tablespaces to use. */ extern ulong srv_undo_tablespaces; -/** The number of UNDO tablespaces that are open and ready to use. */ -extern ulint srv_undo_tablespaces_open; - /** The number of UNDO tablespaces that are active (hosting some rollback segment). It is quite possible that some of the tablespaces doesn't host any of the rollback-segment based on configuration used. */ extern ulint srv_undo_tablespaces_active; -/** Undo tablespaces starts with space_id. */ -extern ulint srv_undo_space_id_start; - -/** Check whether given space id is undo tablespace id -@param[in] space_id space id to check -@return true if it is undo tablespace else false. */ -inline -bool -srv_is_undo_tablespace(ulint space_id) -{ - return srv_undo_space_id_start > 0 - && space_id >= srv_undo_space_id_start - && space_id < (srv_undo_space_id_start - + srv_undo_tablespaces_open); -} - /** Maximum size of undo tablespace. 
*/ extern unsigned long long srv_max_undo_log_size; @@ -601,37 +582,6 @@ extern PSI_stage_info srv_stage_alter_table_read_pk_internal_sort; extern PSI_stage_info srv_stage_buffer_pool_load; #endif /* HAVE_PSI_STAGE_INTERFACE */ - -/** Alternatives for innodb_flush_method */ -enum srv_flush_t { - SRV_FSYNC = 0, /*!< fsync, the default */ - SRV_O_DSYNC, /*!< open log files in O_DSYNC mode */ - SRV_LITTLESYNC, /*!< do not call os_file_flush() - when writing data files, but do flush - after writing to log files */ - SRV_NOSYNC, /*!< do not flush after writing */ - SRV_O_DIRECT, /*!< invoke os_file_set_nocache() on - data files. This implies using - non-buffered IO but still using fsync, - the reason for which is that some FS - do not flush meta-data when - unbuffered IO happens */ - SRV_O_DIRECT_NO_FSYNC - /*!< do not use fsync() when using - direct IO i.e.: it can be set to avoid - the fsync() call that we make when - using SRV_UNIX_O_DIRECT. However, in - this case user/DBA should be sure about - the integrity of the meta-data */ -#ifdef _WIN32 - ,SRV_ALL_O_DIRECT_FSYNC - /*!< Traditional Windows appoach to open - all files without caching, and do FileFlushBuffers()*/ -#endif -}; -/** innodb_flush_method */ -extern ulong srv_file_flush_method; - /** Alternatives for srv_force_recovery. Non-zero values are intended to help the user get a damaged database up so that he can dump intact tables and rows with SELECT INTO OUTFILE. The database must not otherwise diff --git a/storage/innobase/include/sync0policy.h b/storage/innobase/include/sync0policy.h index 94f49ff628c..4fd03eaea06 100644 --- a/storage/innobase/include/sync0policy.h +++ b/storage/innobase/include/sync0policy.h @@ -286,91 +286,4 @@ private: latch_id_t m_id; }; -/** Track agregate metrics policy, used by the page mutex. There are just -too many of them to count individually. */ -template <typename Mutex> -class BlockMutexPolicy -{ -public: - /** Called when the mutex is "created". 
Note: Not from the constructor - but when the mutex is initialised. - @param[in] id Mutex ID */ - void init(const Mutex&, latch_id_t id, const char*, uint32) - UNIV_NOTHROW - { - /* It can be LATCH_ID_BUF_BLOCK_MUTEX or - LATCH_ID_BUF_POOL_ZIP. Unfortunately, they - are mapped to the same mutex type in the - buffer pool code. */ - - m_id = id; - - latch_meta_t& meta = sync_latch_get_meta(m_id); - - ut_ad(meta.get_id() == id); - - m_count = meta.get_counter()->sum_register(); - } - - /** Called when the mutex is destroyed. */ - void destroy() - UNIV_NOTHROW - { - m_count = NULL; - } - - /** Called after a successful mutex acquire. - @param[in] n_spins Number of times the thread did - spins while trying to acquire the mutex - @param[in] n_waits Number of times the thread waited - in some type of OS queue */ - void add( - uint32_t n_spins, - uint32_t n_waits) - UNIV_NOTHROW - { - if (!m_count->m_enabled) { - - return; - } - - m_count->m_spins += n_spins; - m_count->m_waits += n_waits; - - ++m_count->m_calls; - } - - /** Print the information about the latch - @return the string representation */ - std::string print() const - UNIV_NOTHROW; - - /** @return the latch ID */ - latch_id_t get_id() const - { - return(m_id); - } - - - /** - I don't think it makes sense to keep track of the file name - and line number for each block mutex. Too much of overhead. Use the - latch id to figure out the location from the source. - - @return the string representation - */ - std::string to_string() const - { return(sync_mutex_to_string(get_id(), "buf0buf.cc:0")); } - -#ifdef UNIV_DEBUG - MutexDebug<Mutex> context; -#endif - -private: - /** The user visible counters, registered with the meta-data. 
*/ - latch_meta_t::CounterType::Count *m_count; - - /** Latch meta data ID */ - latch_id_t m_id; -}; #endif /* sync0policy_h */ diff --git a/storage/innobase/include/sync0sync.h b/storage/innobase/include/sync0sync.h index 6c67fe9182d..8c45f415ef2 100644 --- a/storage/innobase/include/sync0sync.h +++ b/storage/innobase/include/sync0sync.h @@ -51,9 +51,7 @@ instrumentation due to their large number of instances. */ #ifdef UNIV_PFS_MUTEX /* Key defines to register InnoDB mutexes with performance schema */ extern mysql_pfs_key_t autoinc_mutex_key; -extern mysql_pfs_key_t buffer_block_mutex_key; extern mysql_pfs_key_t buf_pool_mutex_key; -extern mysql_pfs_key_t buf_pool_zip_mutex_key; extern mysql_pfs_key_t cache_last_read_mutex_key; extern mysql_pfs_key_t dict_foreign_err_mutex_key; extern mysql_pfs_key_t dict_sys_mutex_key; diff --git a/storage/innobase/include/sync0types.h b/storage/innobase/include/sync0types.h index 91908935086..d7ba055c403 100644 --- a/storage/innobase/include/sync0types.h +++ b/storage/innobase/include/sync0types.h @@ -194,7 +194,6 @@ enum latch_level_t { SYNC_BUF_FLUSH_LIST, - SYNC_BUF_BLOCK, SYNC_BUF_PAGE_HASH, SYNC_BUF_POOL, @@ -284,9 +283,7 @@ up its meta-data. See sync0debug.c. 
*/ enum latch_id_t { LATCH_ID_NONE = 0, LATCH_ID_AUTOINC, - LATCH_ID_BUF_BLOCK_MUTEX, LATCH_ID_BUF_POOL, - LATCH_ID_BUF_POOL_ZIP, LATCH_ID_CACHE_LAST_READ, LATCH_ID_DICT_FOREIGN_ERR, LATCH_ID_DICT_SYS, diff --git a/storage/innobase/include/trx0undo.ic b/storage/innobase/include/trx0undo.ic index 06e31eb55b3..43af932708e 100644 --- a/storage/innobase/include/trx0undo.ic +++ b/storage/innobase/include/trx0undo.ic @@ -133,7 +133,7 @@ inline uint16_t trx_undo_page_get_end(const buf_block_t *undo_page, uint32_t page_no, uint16_t offset) { - if (page_no == undo_page->page.id.page_no()) + if (page_no == undo_page->page.id().page_no()) if (uint16_t end = mach_read_from_2(TRX_UNDO_NEXT_LOG + offset + undo_page->frame)) return end; diff --git a/storage/innobase/include/univ.i b/storage/innobase/include/univ.i index 4749ebfbb1c..147a6285e5c 100644 --- a/storage/innobase/include/univ.i +++ b/storage/innobase/include/univ.i @@ -191,8 +191,6 @@ using the call command. */ some debug print functions */ #define UNIV_AHI_DEBUG /* Enable adaptive hash index debugging without UNIV_DEBUG */ -#define UNIV_BUF_DEBUG /* Enable buffer pool - debugging without UNIV_DEBUG */ #define UNIV_BLOB_LIGHT_DEBUG /* Enable off-page column debugging without UNIV_DEBUG */ #define UNIV_DEBUG_LOCK_VALIDATE /* Enable diff --git a/storage/innobase/include/ut0mutex.h b/storage/innobase/include/ut0mutex.h index d7d48cd1f28..04ec10cc379 100644 --- a/storage/innobase/include/ut0mutex.h +++ b/storage/innobase/include/ut0mutex.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 2012, 2015, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, MariaDB Corporation. +Copyright (c) 2017, 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -40,30 +40,21 @@ Created 2012-03-24 Sunny Bains. 
# ifdef HAVE_IB_LINUX_FUTEX UT_MUTEX_TYPE(TTASFutexMutex, GenericPolicy, FutexMutex); -UT_MUTEX_TYPE(TTASFutexMutex, BlockMutexPolicy, BlockFutexMutex); # endif /* HAVE_IB_LINUX_FUTEX */ UT_MUTEX_TYPE(TTASMutex, GenericPolicy, SpinMutex); -UT_MUTEX_TYPE(TTASMutex, BlockMutexPolicy, BlockSpinMutex); - UT_MUTEX_TYPE(OSTrackMutex, GenericPolicy, SysMutex); -UT_MUTEX_TYPE(OSTrackMutex, BlockMutexPolicy, BlockSysMutex); - UT_MUTEX_TYPE(TTASEventMutex, GenericPolicy, SyncArrayMutex); -UT_MUTEX_TYPE(TTASEventMutex, BlockMutexPolicy, BlockSyncArrayMutex); #ifdef MUTEX_FUTEX /** The default mutex type. */ typedef FutexMutex ib_mutex_t; -typedef BlockFutexMutex ib_bpmutex_t; #define MUTEX_TYPE "Uses futexes" #elif defined(MUTEX_SYS) typedef SysMutex ib_mutex_t; -typedef BlockSysMutex ib_bpmutex_t; #define MUTEX_TYPE "Uses system mutexes" #elif defined(MUTEX_EVENT) typedef SyncArrayMutex ib_mutex_t; -typedef BlockSyncArrayMutex ib_bpmutex_t; #define MUTEX_TYPE "Uses event mutexes" #else #error "ib_mutex_t type is unknown" diff --git a/storage/innobase/include/ut0ut.h b/storage/innobase/include/ut0ut.h index 14e31b1d9a8..0d8cfb817ea 100644 --- a/storage/innobase/include/ut0ut.h +++ b/storage/innobase/include/ut0ut.h @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2019, MariaDB Corporation. +Copyright (c) 2019, 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -136,14 +136,6 @@ ulint ut_2_exp( /*=====*/ ulint n); /*!< in: number */ -/*************************************************************//** -Calculates fast the number rounded up to the nearest power of 2. 
-@return first power of 2 which is >= n */ -ulint -ut_2_power_up( -/*==========*/ - ulint n) /*!< in: number != 0 */ - MY_ATTRIBUTE((const)); /**********************************************************//** Returns the number of milliseconds since some epoch. The diff --git a/storage/innobase/lock/lock0lock.cc b/storage/innobase/lock/lock0lock.cc index c24d1f12623..e07efa4f094 100644 --- a/storage/innobase/lock/lock0lock.cc +++ b/storage/innobase/lock/lock0lock.cc @@ -519,12 +519,10 @@ void lock_sys_t::resize(ulint n_cells) mutex_enter(&buf_pool.mutex); for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool.LRU); bpage; bpage = UT_LIST_GET_NEXT(LRU, bpage)) { - if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) { - buf_block_t* block = reinterpret_cast<buf_block_t*>( - bpage); - - block->lock_hash_val = lock_rec_hash( - bpage->id.space(), bpage->id.page_no()); + if (bpage->state() == BUF_BLOCK_FILE_PAGE) { + const page_id_t id(bpage->id()); + reinterpret_cast<buf_block_t*>(bpage)->lock_hash_val + = lock_rec_hash(id.space(), id.page_no()); } } mutex_exit(&buf_pool.mutex); @@ -2327,8 +2325,8 @@ lock_rec_free_all_from_discard_page( ut_ad(lock_mutex_own()); - space = block->page.id.space(); - page_no = block->page.id.page_no(); + space = block->page.id().space(); + page_no = block->page.id().page_no(); lock_rec_free_all_from_discard_page_low( space, page_no, lock_sys.rec_hash); @@ -3078,9 +3076,10 @@ lock_update_merge_right( /* there should exist no page lock on the left page, otherwise, it will be blocked from merge */ - ut_ad(!lock_rec_get_first_on_page_addr(lock_sys.prdt_page_hash, - left_block->page.id.space(), - left_block->page.id.page_no())); + ut_ad(!lock_rec_get_first_on_page_addr( + lock_sys.prdt_page_hash, + left_block->page.id().space(), + left_block->page.id().page_no())); lock_rec_free_all_from_discard_page(left_block); @@ -3201,8 +3200,8 @@ lock_update_merge_left( otherwise, it will be blocked from merge */ ut_ad(!lock_rec_get_first_on_page_addr( 
lock_sys.prdt_page_hash, - right_block->page.id.space(), - right_block->page.id.page_no())); + right_block->page.id().space(), + right_block->page.id().page_no())); lock_rec_free_all_from_discard_page(right_block); @@ -3291,14 +3290,14 @@ lock_update_discard( } lock_rec_free_all_from_discard_page_low( - block->page.id.space(), block->page.id.page_no(), + block->page.id().space(), block->page.id().page_no(), lock_sys.rec_hash); } else { lock_rec_free_all_from_discard_page_low( - block->page.id.space(), block->page.id.page_no(), + block->page.id().space(), block->page.id().page_no(), lock_sys.prdt_hash); lock_rec_free_all_from_discard_page_low( - block->page.id.space(), block->page.id.page_no(), + block->page.id().space(), block->page.id().page_no(), lock_sys.prdt_page_hash); } @@ -4965,7 +4964,7 @@ lock_rec_validate_page( loop: lock = lock_rec_get_first_on_page_addr( lock_sys.rec_hash, - block->page.id.space(), block->page.id.page_no()); + block->page.id().space(), block->page.id().page_no()); if (!lock) { goto function_exit; diff --git a/storage/innobase/lock/lock0prdt.cc b/storage/innobase/lock/lock0prdt.cc index 6458d202bf2..512793a1786 100644 --- a/storage/innobase/lock/lock0prdt.cc +++ b/storage/innobase/lock/lock0prdt.cc @@ -1041,8 +1041,8 @@ lock_prdt_page_free_from_discard( ut_ad(lock_mutex_own()); - space = block->page.id.space(); - page_no = block->page.id.page_no(); + space = block->page.id().space(); + page_no = block->page.id().page_no(); lock = lock_rec_get_first_on_page_addr(lock_hash, space, page_no); diff --git a/storage/innobase/log/log0log.cc b/storage/innobase/log/log0log.cc index 93c966b1383..67b45ab9bac 100644 --- a/storage/innobase/log/log0log.cc +++ b/storage/innobase/log/log0log.cc @@ -1204,7 +1204,7 @@ static bool log_preflush_pool_modified_pages(lsn_t new_oldest) success = buf_flush_lists(ULINT_MAX, new_oldest, &n_pages); - buf_flush_wait_batch_end(BUF_FLUSH_LIST); + buf_flush_wait_batch_end(false); if (!success) { 
MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS); @@ -1510,14 +1510,9 @@ void log_check_margins() } extern void buf_resize_shutdown(); -/****************************************************************//** -Makes a checkpoint at the latest lsn and writes it to first page of each -data file in the database, so that we know that the file spaces contain -all modifications up to that lsn. This can only be called at database -shutdown. This function also writes log in log file to the log archive. */ -void -logs_empty_and_mark_files_at_shutdown(void) -/*=======================================*/ + +/** Make a checkpoint at the latest lsn on shutdown. */ +void logs_empty_and_mark_files_at_shutdown() { lsn_t lsn; ulint count = 0; @@ -1699,10 +1694,6 @@ wait_suspend_loop: } srv_shutdown_state = SRV_SHUTDOWN_LAST_PHASE; - - if (fil_system.is_initialised()) { - fil_close_all_files(); - } return; } @@ -1726,8 +1717,6 @@ wait_suspend_loop: goto loop; } - /* Ensure that all buffered changes are written to the - redo log before fil_close_all_files(). 
*/ log_sys.log.flush(); } else { lsn = recv_sys.recovered_lsn; @@ -1762,8 +1751,6 @@ wait_suspend_loop: } } - fil_close_all_files(); - /* Make some checks that the server really is quiet */ ut_ad(!srv_any_background_activity()); diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc index 6f5ef55d958..3437f3182aa 100644 --- a/storage/innobase/log/log0recv.cc +++ b/storage/innobase/log/log0recv.cc @@ -165,7 +165,7 @@ public: free + len + 6 >= srv_page_size - FIL_PAGE_DATA_END)) { ib::error() << "Not applying UNDO_APPEND due to corruption on " - << block.page.id; + << block.page.id(); return true; } @@ -224,13 +224,13 @@ public: size_t idlen= mlog_decode_varint_length(*l); ut_ad(idlen <= 5); ut_ad(idlen < rlen); - ut_ad(mlog_decode_varint(l) == block.page.id.space()); + ut_ad(mlog_decode_varint(l) == block.page.id().space()); l+= idlen; rlen-= idlen; idlen= mlog_decode_varint_length(*l); ut_ad(idlen <= 5); ut_ad(idlen <= rlen); - ut_ad(mlog_decode_varint(l) == block.page.id.page_no()); + ut_ad(mlog_decode_varint(l) == block.page.id().page_no()); l+= idlen; rlen-= idlen; last_offset= 0; @@ -244,9 +244,9 @@ public: if (UNIV_LIKELY(rlen == 0)) { memset_aligned<UNIV_ZIP_SIZE_MIN>(frame, 0, size); - mach_write_to_4(frame + FIL_PAGE_OFFSET, block.page.id.page_no()); + mach_write_to_4(frame + FIL_PAGE_OFFSET, block.page.id().page_no()); memset_aligned<8>(FIL_PAGE_PREV + frame, 0xff, 8); - mach_write_to_4(frame + FIL_PAGE_SPACE_ID, block.page.id.space()); + mach_write_to_4(frame + FIL_PAGE_SPACE_ID, block.page.id().space()); last_offset= FIL_PAGE_TYPE; next_after_applying: if (applied == APPLIED_NO) @@ -269,9 +269,9 @@ public: } ut_ad(mach_read_from_4(frame + FIL_PAGE_OFFSET) == - block.page.id.page_no()); + block.page.id().page_no()); ut_ad(mach_read_from_4(frame + FIL_PAGE_SPACE_ID) == - block.page.id.space()); + block.page.id().space()); ut_ad(last_offset <= 1 || last_offset > 8); ut_ad(last_offset <= size); @@ -279,7 +279,7 @@ public: case 
OPTION: goto next; case EXTENDED: - if (UNIV_UNLIKELY(block.page.id.page_no() < 3 || + if (UNIV_UNLIKELY(block.page.id().page_no() < 3 || block.page.zip.ssize)) goto record_corrupted; static_assert(INIT_ROW_FORMAT_REDUNDANT == 0, "compatiblity"); @@ -462,7 +462,7 @@ page_corrupted: if (UNIV_UNLIKELY(rlen + last_offset > size)) goto record_corrupted; memcpy(frame + last_offset, l, llen); - if (UNIV_LIKELY(block.page.id.page_no())); + if (UNIV_LIKELY(block.page.id().page_no())); else if (llen == 11 + MY_AES_BLOCK_SIZE && last_offset == FSP_HEADER_OFFSET + MAGIC_SZ + fsp_header_get_encryption_offset(block.zip_size())) @@ -675,7 +675,7 @@ public: break; } ib::error() << "corrupted " - << block->page.id; + << block->page.id(); } } if (recv_no_ibuf_operations) { @@ -685,7 +685,7 @@ public: } mutex_exit(&recv_sys.mutex); block->page.ibuf_exist = ibuf_page_exists( - block->page.id, block->zip_size()); + block->page.id(), block->zip_size()); mtr.commit(); mtr.start(); mutex_enter(&recv_sys.mutex); @@ -966,7 +966,7 @@ DECLARE_THREAD(recv_writer_thread)( /* Flush pages from end of LRU if required */ os_event_reset(recv_sys.flush_end); - recv_sys.flush_type = BUF_FLUSH_LRU; + recv_sys.flush_lru = true; os_event_set(recv_sys.flush_start); os_event_wait(recv_sys.flush_end); @@ -999,7 +999,7 @@ void recv_sys_t::create() flush_end = os_event_create(0); } - flush_type = BUF_FLUSH_LRU; + flush_lru = true; apply_log_recs = false; apply_batch_on = false; @@ -1035,7 +1035,7 @@ inline void recv_sys_t::clear() for (buf_block_t *block= UT_LIST_GET_LAST(blocks); block; ) { buf_block_t *prev_block= UT_LIST_GET_PREV(unzip_LRU, block); - ut_ad(buf_block_get_state(block) == BUF_BLOCK_MEMORY); + ut_ad(block->page.state() == BUF_BLOCK_MEMORY); UT_LIST_REMOVE(blocks, block); buf_block_free(block); block= prev_block; @@ -1128,7 +1128,7 @@ inline void recv_sys_t::free(const void *data) continue; buf_block_t *block= &chunk->blocks[offs]; ut_ad(block->frame == data); - 
ut_ad(buf_block_get_state(block) == BUF_BLOCK_MEMORY); + ut_ad(block->page.state() == BUF_BLOCK_MEMORY); ut_ad(static_cast<uint16_t>(block->page.access_time - 1) < srv_page_size); ut_ad(block->page.access_time >= 1U << 16); @@ -2234,18 +2234,18 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr, ut_ad(recv_needed_recovery); ut_ad(!init || init->created); ut_ad(!init || init->lsn); - ut_ad(block->page.id == p->first); + ut_ad(block->page.id() == p->first); ut_ad(!p->second.is_being_processed()); - ut_ad(!space || space->id == block->page.id.space()); + ut_ad(!space || space->id == block->page.id().space()); ut_ad(log_sys.is_physical()); if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) { - ib::info() << "Applying log to page " << block->page.id; + ib::info() << "Applying log to page " << block->page.id(); } DBUG_PRINT("ib_log", ("Applying log to page %u:%u", - block->page.id.space(), - block->page.id.page_no())); + block->page.id().space(), + block->page.id().page_no())); p->second.state = page_recv_t::RECV_BEING_PROCESSED; @@ -2277,8 +2277,8 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr, /* This record has already been applied. 
*/ DBUG_PRINT("ib_log", ("apply skip %u:%u LSN " LSN_PF " < " LSN_PF, - block->page.id.space(), - block->page.id.page_no(), + block->page.id().space(), + block->page.id().page_no(), l->start_lsn, page_lsn)); continue; } @@ -2286,8 +2286,8 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr, if (l->start_lsn < init_lsn) { DBUG_PRINT("ib_log", ("init skip %u:%u LSN " LSN_PF " < " LSN_PF, - block->page.id.space(), - block->page.id.page_no(), + block->page.id().space(), + block->page.id().page_no(), l->start_lsn, init_lsn)); continue; } @@ -2295,13 +2295,13 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr, if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) { ib::info() << "apply " << l->start_lsn - << ": " << block->page.id; + << ": " << block->page.id(); } DBUG_PRINT("ib_log", ("apply " LSN_PF ": %u:%u", l->start_lsn, - block->page.id.space(), - block->page.id.page_no())); + block->page.id().space(), + block->page.id().page_no())); log_phys_t::apply_status a= l->apply(*block, p->second.last_offset); @@ -2321,7 +2321,7 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr, if (fil_space_t* s = space ? 
space - : fil_space_acquire(block->page.id.space())) { + : fil_space_acquire(block->page.id().space())) { switch (a) { case log_phys_t::APPLIED_TO_FSP_HEADER: s->flags = mach_read_from_4( @@ -2448,7 +2448,7 @@ void recv_recover_page(fil_space_t* space, buf_page_t* bpage) mtr.start(); mtr.set_log_mode(MTR_LOG_NONE); - ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); + ut_ad(bpage->state() == BUF_BLOCK_FILE_PAGE); buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage); /* Move the ownership of the x-latch on the page to @@ -2462,7 +2462,7 @@ void recv_recover_page(fil_space_t* space, buf_page_t* bpage) mutex_enter(&recv_sys.mutex); if (recv_sys.apply_log_recs) { - recv_sys_t::map::iterator p = recv_sys.pages.find(bpage->id); + recv_sys_t::map::iterator p = recv_sys.pages.find(bpage->id()); if (p != recv_sys.pages.end() && !p->second.is_being_processed()) { recv_recover_page(block, mtr, p, space); @@ -2702,7 +2702,7 @@ void recv_sys_t::apply(bool last_batch) buf_flush_wait_LRU_batch_end(); os_event_reset(flush_end); - flush_type = BUF_FLUSH_LIST; + flush_lru= false; os_event_set(flush_start); os_event_wait(flush_end); diff --git a/storage/innobase/mtr/mtr0mtr.cc b/storage/innobase/mtr/mtr0mtr.cc index e77df7c61d5..77aa7b80ec4 100644 --- a/storage/innobase/mtr/mtr0mtr.cc +++ b/storage/innobase/mtr/mtr0mtr.cc @@ -226,11 +226,11 @@ static void memo_slot_release(mtr_memo_slot_t *slot) case MTR_MEMO_PAGE_SX_FIX: case MTR_MEMO_PAGE_X_FIX: buf_block_t *block= reinterpret_cast<buf_block_t*>(slot->object); - block->unfix(); buf_page_release_latch(block, slot->type); + block->unfix(); break; } - slot->object= NULL; + slot->object= nullptr; } /** Release the latches acquired by the mini-transaction. 
*/ @@ -262,8 +262,8 @@ struct ReleaseLatches { case MTR_MEMO_PAGE_SX_FIX: case MTR_MEMO_PAGE_X_FIX: buf_block_t *block= reinterpret_cast<buf_block_t*>(slot->object); - block->unfix(); buf_page_release_latch(block, slot->type); + block->unfix(); break; } slot->object= NULL; diff --git a/storage/innobase/page/page0cur.cc b/storage/innobase/page/page0cur.cc index fed2f8f4657..666f7c8d788 100644 --- a/storage/innobase/page/page0cur.cc +++ b/storage/innobase/page/page0cur.cc @@ -1119,7 +1119,7 @@ inline void mtr_t::page_insert(const buf_block_t &block, bool reuse, len+= hdr_l + data_l; const bool small= len < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5); - byte *l= log_write<EXTENDED>(block.page.id, &block.page, len, small); + byte *l= log_write<EXTENDED>(block.page.id(), &block.page, len, small); if (UNIV_LIKELY(small)) { @@ -1227,7 +1227,7 @@ inline void mtr_t::page_insert(const buf_block_t &block, bool reuse, len+= hdr_l + data_l; const bool small= len < mtr_buf_t::MAX_DATA_SIZE - (1 + 3 + 3 + 5 + 5); - byte *l= log_write<EXTENDED>(block.page.id, &block.page, len, small); + byte *l= log_write<EXTENDED>(block.page.id(), &block.page, len, small); if (UNIV_LIKELY(small)) { @@ -2289,10 +2289,11 @@ bool page_apply_insert_redundant(const buf_block_t &block, bool reuse, byte *page_n_heap= my_assume_aligned<2>(PAGE_N_HEAP + PAGE_HEADER + block.frame); const uint16_t h= mach_read_from_2(page_n_heap); + const page_id_t id(block.page.id()); if (UNIV_UNLIKELY(n_slots < 2 || h < n_slots || h < PAGE_HEAP_NO_USER_LOW || h >= srv_page_size / REC_N_OLD_EXTRA_BYTES || !fil_page_index_page_check(block.frame) || - page_get_page_no(block.frame) != block.page.id.page_no() || + page_get_page_no(block.frame) != id.page_no() || mach_read_from_2(my_assume_aligned<2> (PAGE_OLD_SUPREMUM - REC_NEXT + block.frame)))) @@ -2303,7 +2304,7 @@ corrupted: " due to corruption on " : "Not applying INSERT_HEAP_REDUNDANT" " due to corruption on ") - << block.page.id; + << id; return true; } @@ -2530,11 
+2531,12 @@ bool page_apply_insert_dynamic(const buf_block_t &block, bool reuse, byte *page_n_heap= my_assume_aligned<2>(PAGE_N_HEAP + PAGE_HEADER + block.frame); ulint h= mach_read_from_2(page_n_heap); + const page_id_t id(block.page.id()); if (UNIV_UNLIKELY(n_slots < 2 || h < (PAGE_HEAP_NO_USER_LOW | 0x8000) || (h & 0x7fff) >= srv_page_size / REC_N_NEW_EXTRA_BYTES || (h & 0x7fff) < n_slots || !fil_page_index_page_check(block.frame) || - page_get_page_no(block.frame) != block.page.id.page_no() || + page_get_page_no(block.frame) != id.page_no() || mach_read_from_2(my_assume_aligned<2> (PAGE_NEW_SUPREMUM - REC_NEXT + block.frame)) || @@ -2548,7 +2550,7 @@ corrupted: " due to corruption on " : "Not applying INSERT_HEAP_DYNAMIC" " due to corruption on ") - << block.page.id; + << id; return true; } @@ -2746,10 +2748,11 @@ bool page_apply_delete_redundant(const buf_block_t &block, ulint prev) { const uint16_t n_slots= page_dir_get_n_slots(block.frame); ulint n_recs= page_get_n_recs(block.frame); + const page_id_t id(block.page.id()); if (UNIV_UNLIKELY(!n_recs || n_slots < 2 || !fil_page_index_page_check(block.frame) || - page_get_page_no(block.frame) != block.page.id.page_no() || + page_get_page_no(block.frame) != id.page_no() || mach_read_from_2(my_assume_aligned<2> (PAGE_OLD_SUPREMUM - REC_NEXT + block.frame)) || @@ -2757,7 +2760,7 @@ bool page_apply_delete_redundant(const buf_block_t &block, ulint prev) { corrupted: ib::error() << "Not applying DELETE_ROW_FORMAT_REDUNDANT" - " due to corruption on " << block.page.id; + " due to corruption on " << id; return true; } @@ -2841,10 +2844,11 @@ bool page_apply_delete_dynamic(const buf_block_t &block, ulint prev, { const uint16_t n_slots= page_dir_get_n_slots(block.frame); ulint n_recs= page_get_n_recs(block.frame); + const page_id_t id(block.page.id()); if (UNIV_UNLIKELY(!n_recs || n_slots < 2 || !fil_page_index_page_check(block.frame) || - page_get_page_no(block.frame) != block.page.id.page_no() || + 
page_get_page_no(block.frame) != id.page_no() || mach_read_from_2(my_assume_aligned<2> (PAGE_NEW_SUPREMUM - REC_NEXT + block.frame)) || @@ -2852,7 +2856,7 @@ bool page_apply_delete_dynamic(const buf_block_t &block, ulint prev, { corrupted: ib::error() << "Not applying DELETE_ROW_FORMAT_DYNAMIC" - " due to corruption on " << block.page.id; + " due to corruption on " << id; return true; } diff --git a/storage/innobase/page/page0page.cc b/storage/innobase/page/page0page.cc index fb6fc5858e3..9e084e3f2a5 100644 --- a/storage/innobase/page/page0page.cc +++ b/storage/innobase/page/page0page.cc @@ -400,7 +400,7 @@ page_create_empty( ut_ad(fil_page_index_page_check(block->frame)); ut_ad(!index->is_dummy); - ut_ad(block->page.id.space() == index->table->space->id); + ut_ad(block->page.id().space() == index->table->space->id); /* Multiple transactions cannot simultaneously operate on the same temp-table in parallel. @@ -411,7 +411,7 @@ page_create_empty( && page_is_leaf(block->frame)) { max_trx_id = page_get_max_trx_id(block->frame); ut_ad(max_trx_id); - } else if (block->page.id.page_no() == index->page) { + } else if (block->page.id().page_no() == index->page) { /* Preserve PAGE_ROOT_AUTO_INC. */ max_trx_id = page_get_max_trx_id(block->frame); } else { diff --git a/storage/innobase/page/page0zip.cc b/storage/innobase/page/page0zip.cc index dd34628df08..f3a3655fefb 100644 --- a/storage/innobase/page/page0zip.cc +++ b/storage/innobase/page/page0zip.cc @@ -1388,7 +1388,7 @@ page_zip_compress( << " index " << index->name() << " page " - << block->page.id.page_no() + << block->page.id().page_no() << "(" << (page_is_leaf(page) ? 
"leaf" : "non-leaf") << ")"; diff --git a/storage/innobase/row/row0import.cc b/storage/innobase/row/row0import.cc index 36e1bc9fab3..99eddf03919 100644 --- a/storage/innobase/row/row0import.cc +++ b/storage/innobase/row/row0import.cc @@ -283,7 +283,7 @@ public: } if (!rec_offs_any_extern(offsets) - && m_cur.block->page.id.page_no() != index->page + && m_cur.block->page.id().page_no() != index->page && ((page_get_data_size(m_cur.block->frame) - rec_offs_size(offsets) < BTR_CUR_PAGE_COMPRESS_LIMIT(index)) @@ -697,14 +697,14 @@ dberr_t FetchIndexRootPages::operator()(buf_block_t* block) UNIV_NOTHROW ulint page_type = fil_page_get_type(page); if (page_type == FIL_PAGE_TYPE_XDES) { - return set_current_xdes(block->page.id.page_no(), page); + return set_current_xdes(block->page.id().page_no(), page); } else if (fil_page_index_page_check(page) - && !is_free(block->page.id.page_no()) + && !is_free(block->page.id().page_no()) && !page_has_siblings(page)) { index_id_t id = btr_page_get_index_id(page); - m_indexes.push_back(Index(id, block->page.id.page_no())); + m_indexes.push_back(Index(id, block->page.id().page_no())); if (m_indexes.size() == 1) { /* Check that the tablespace flags match the table flags. */ @@ -1572,7 +1572,7 @@ IndexPurge::next() UNIV_NOTHROW return status that will be checked in all callers! 
*/ switch (next_page) { default: - if (next_page != block->page.id.page_no()) { + if (next_page != block->page.id().page_no()) { break; } /* MDEV-20931 FIXME: Check that @@ -1602,7 +1602,7 @@ IndexPurge::next() UNIV_NOTHROW != page_is_comp(block->frame) || btr_page_get_prev( next_block->frame) - != block->page.id.page_no())) { + != block->page.id().page_no())) { return DB_CORRUPTION; } @@ -1891,8 +1891,9 @@ PageConverter::update_index_page( { index_id_t id; buf_frame_t* page = block->frame; + const page_id_t page_id(block->page.id()); - if (is_free(block->page.id.page_no())) { + if (is_free(page_id.page_no())) { return(DB_SUCCESS); } else if ((id = btr_page_get_index_id(page)) != m_index->m_id) { @@ -1918,10 +1919,12 @@ PageConverter::update_index_page( return(DB_SUCCESS); } - if (m_index && block->page.id.page_no() == m_index->m_page_no) { + + + if (m_index && page_id.page_no() == m_index->m_page_no) { byte *b = FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF + FSEG_HDR_SPACE + page; - mach_write_to_4(b, block->page.id.space()); + mach_write_to_4(b, page_id.space()); memcpy(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP + FSEG_HDR_SPACE + page, b, 4); @@ -1950,7 +1953,7 @@ PageConverter::update_index_page( } if (m_index->m_srv_index->is_clust()) { - if (block->page.id.page_no() == m_index->m_srv_index->page) { + if (page_id.page_no() == m_index->m_srv_index->page) { dict_index_t* index = const_cast<dict_index_t*>( m_index->m_srv_index); /* Preserve the PAGE_ROOT_AUTO_INC. */ @@ -2068,7 +2071,7 @@ PageConverter::update_page(buf_block_t* block, uint16_t& page_type) switch (page_type = fil_page_get_type(get_frame(block))) { case FIL_PAGE_TYPE_FSP_HDR: - ut_a(block->page.id.page_no() == 0); + ut_a(block->page.id().page_no() == 0); /* Work directly on the uncompressed page headers. 
*/ return(update_header(block)); @@ -2097,7 +2100,7 @@ PageConverter::update_page(buf_block_t* block, uint16_t& page_type) case FIL_PAGE_TYPE_XDES: err = set_current_xdes( - block->page.id.page_no(), get_frame(block)); + block->page.id().page_no(), get_frame(block)); /* fall through */ case FIL_PAGE_INODE: case FIL_PAGE_TYPE_TRX_SYS: @@ -2131,7 +2134,7 @@ dberr_t PageConverter::operator()(buf_block_t* block) UNIV_NOTHROW /* If we already had an old page with matching number in the buffer pool, evict it now, because we no longer evict the pages on DISCARD TABLESPACE. */ - buf_page_get_gen(block->page.id, get_zip_size(), + buf_page_get_gen(block->page.id(), get_zip_size(), RW_NO_LATCH, NULL, BUF_EVICT_IF_IN_POOL, __FILE__, __LINE__, NULL, NULL); @@ -2163,54 +2166,6 @@ dberr_t PageConverter::operator()(buf_block_t* block) UNIV_NOTHROW } /*****************************************************************//** -Clean up after import tablespace failure, this function will acquire -the dictionary latches on behalf of the transaction if the transaction -hasn't already acquired them. */ -static MY_ATTRIBUTE((nonnull)) -void -row_import_discard_changes( -/*=======================*/ - row_prebuilt_t* prebuilt, /*!< in/out: prebuilt from handler */ - trx_t* trx, /*!< in/out: transaction for import */ - dberr_t err) /*!< in: error code */ -{ - dict_table_t* table = prebuilt->table; - - ut_a(err != DB_SUCCESS); - - prebuilt->trx->error_info = NULL; - - ib::info() << "Discarding tablespace of table " - << prebuilt->table->name - << ": " << ut_strerr(err); - - if (trx->dict_operation_lock_mode != RW_X_LATCH) { - ut_a(trx->dict_operation_lock_mode == 0); - row_mysql_lock_data_dictionary(trx); - } - - ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); - - /* Since we update the index root page numbers on disk after - we've done a successful import. The table will not be loadable. - However, we need to ensure that the in memory root page numbers - are reset to "NULL". 
*/ - - for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); - index != 0; - index = UT_LIST_GET_NEXT(indexes, index)) { - - index->page = FIL_NULL; - } - - table->file_unreadable = true; - if (table->space) { - fil_close_tablespace(trx, table->space_id); - table->space = NULL; - } -} - -/*****************************************************************//** Clean up after import tablespace. */ static MY_ATTRIBUTE((nonnull, warn_unused_result)) dberr_t @@ -2223,7 +2178,27 @@ row_import_cleanup( ut_a(prebuilt->trx != trx); if (err != DB_SUCCESS) { - row_import_discard_changes(prebuilt, trx, err); + dict_table_t* table = prebuilt->table; + table->file_unreadable = true; + if (table->space) { + fil_close_tablespace(table->space_id); + table->space = NULL; + } + + prebuilt->trx->error_info = NULL; + + ib::info() << "Discarding tablespace of table " + << table->name << ": " << ut_strerr(err); + + if (!trx->dict_operation_lock_mode) { + row_mysql_lock_data_dictionary(trx); + } + + for (dict_index_t* index = UT_LIST_GET_FIRST(table->indexes); + index; + index = UT_LIST_GET_NEXT(indexes, index)) { + index->page = FIL_NULL; + } } ut_a(trx->dict_operation_lock_mode == RW_X_LATCH); @@ -3471,14 +3446,15 @@ fil_iterate( bool updated = false; os_offset_t page_off = offset; ulint n_pages_read = n_bytes / size; - block->page.id.set_page_no(ulint(page_off / size)); + /* This block is not attached to buf_pool */ + block->page.id_.set_page_no(ulint(page_off / size)); for (ulint i = 0; i < n_pages_read; - block->page.id.set_page_no(block->page.id.page_no() + 1), + ++block->page.id_, ++i, page_off += size, block->frame += size) { byte* src = readptr + i * size; const ulint page_no = page_get_page_no(src); - if (!page_no && block->page.id.page_no()) { + if (!page_no && block->page.id().page_no()) { if (!buf_is_zeroes(span<const byte>(src, size))) { goto page_corrupted; @@ -3488,7 +3464,7 @@ fil_iterate( continue; } - if (page_no != block->page.id.page_no()) { + if (page_no != 
block->page.id().page_no()) { page_corrupted: ib::warn() << callback.filename() << ": Page " << (offset / size) @@ -3498,7 +3474,7 @@ page_corrupted: goto func_exit; } - if (block->page.id.page_no() == 0) { + if (block->page.id().page_no() == 0) { actual_space_id = mach_read_from_4( src + FIL_PAGE_SPACE_ID); } @@ -3526,7 +3502,7 @@ page_corrupted: if (!encrypted) { } else if (!key_version) { not_encrypted: - if (block->page.id.page_no() == 0 + if (block->page.id().page_no() == 0 && block->page.zip.data) { block->page.zip.data = src; frame_changed = true; @@ -3589,7 +3565,7 @@ not_encrypted: if ((err = callback(block)) != DB_SUCCESS) { goto func_exit; } else if (!updated) { - updated = buf_block_get_state(block) + updated = block->page.state() == BUF_BLOCK_FILE_PAGE; } @@ -3624,7 +3600,7 @@ not_encrypted: /* When tablespace is encrypted or compressed its first page (i.e. page 0) is not encrypted or compressed and there is no need to copy frame. */ - if (encrypted && block->page.id.page_no() != 0) { + if (encrypted && block->page.id().page_no() != 0) { byte *local_frame = callback.get_frame(block); ut_ad((writeptr + (i * size)) != local_frame); memcpy((writeptr + (i * size)), local_frame, size); @@ -3661,8 +3637,8 @@ not_encrypted: byte* tmp = fil_encrypt_buf( iter.crypt_data, - block->page.id.space(), - block->page.id.page_no(), + block->page.id().space(), + block->page.id().page_no(), src, block->zip_size(), dest, full_crc32); @@ -3789,10 +3765,7 @@ fil_tablespace_iterate( buf_block_t* block = reinterpret_cast<buf_block_t*> (ut_zalloc_nokey(sizeof *block)); block->frame = page; - block->page.id = page_id_t(0, 0); - block->page.io_fix = BUF_IO_NONE; - block->page.buf_fix_count = 1; - block->page.state = BUF_BLOCK_FILE_PAGE; + block->page.init(BUF_BLOCK_FILE_PAGE, page_id_t(~0ULL), 1); /* Read the first page and determine the page and zip size. 
*/ @@ -3807,7 +3780,7 @@ fil_tablespace_iterate( } if (err == DB_SUCCESS) { - block->page.id = page_id_t(callback.get_space_id(), 0); + block->page.id_ = page_id_t(callback.get_space_id(), 0); if (ulint zip_size = callback.get_zip_size()) { page_zip_set_size(&block->page.zip, zip_size); /* ROW_FORMAT=COMPRESSED is not optimised for block IO diff --git a/storage/innobase/row/row0ins.cc b/storage/innobase/row/row0ins.cc index 4cf19e3ee8e..91c401cd4ba 100644 --- a/storage/innobase/row/row0ins.cc +++ b/storage/innobase/row/row0ins.cc @@ -239,7 +239,7 @@ row_ins_sec_index_entry_by_modify( } } else { ut_a(mode == BTR_MODIFY_TREE); - if (buf_LRU_buf_pool_running_out()) { + if (buf_pool.running_out()) { return(DB_LOCK_TABLE_FULL); } @@ -329,10 +329,8 @@ row_ins_clust_index_entry_by_modify( break; } } else { - if (buf_LRU_buf_pool_running_out()) { - - return(DB_LOCK_TABLE_FULL); - + if (buf_pool.running_out()) { + return DB_LOCK_TABLE_FULL; } big_rec_t* big_rec = NULL; @@ -2718,8 +2716,7 @@ do_insert: entry, &insert_rec, &big_rec, n_ext, thr, &mtr); } else { - if (buf_LRU_buf_pool_running_out()) { - + if (buf_pool.running_out()) { err = DB_LOCK_TABLE_FULL; goto err_exit; } @@ -3076,8 +3073,7 @@ row_ins_sec_index_entry_low( } } else { ut_ad(mode == BTR_MODIFY_TREE); - if (buf_LRU_buf_pool_running_out()) { - + if (buf_pool.running_out()) { err = DB_LOCK_TABLE_FULL; goto func_exit; } diff --git a/storage/innobase/row/row0merge.cc b/storage/innobase/row/row0merge.cc index 27a9d441dbd..3b898c21e12 100644 --- a/storage/innobase/row/row0merge.cc +++ b/storage/innobase/row/row0merge.cc @@ -1974,7 +1974,7 @@ row_merge_read_clustered_index( this is the only page in the index tree. 
*/ ut_ad(btr_pcur_is_on_user_rec(&pcur) || btr_pcur_get_block( - &pcur)->page.id.page_no() + &pcur)->page.id().page_no() == clust_index->page); btr_pcur_store_position(&pcur, &mtr); diff --git a/storage/innobase/row/row0purge.cc b/storage/innobase/row/row0purge.cc index 9934ede605b..e9023473fd7 100644 --- a/storage/innobase/row/row0purge.cc +++ b/storage/innobase/row/row0purge.cc @@ -491,7 +491,7 @@ row_purge_remove_sec_if_poss_leaf( const buf_block_t* block = btr_cur_get_block( btr_cur); - if (block->page.id.page_no() + if (block->page.id().page_no() != index->page && page_get_n_recs(block->frame) < 2 && !lock_test_prdt_page_lock( @@ -500,8 +500,8 @@ row_purge_remove_sec_if_poss_leaf( ? thr_get_trx( btr_cur->rtr_info->thr) : NULL, - block->page.id.space(), - block->page.id.page_no())) { + block->page.id().space(), + block->page.id().page_no())) { /* this is the last record on page, and it has a "page" lock on it, which mean search is still depending @@ -509,7 +509,7 @@ row_purge_remove_sec_if_poss_leaf( DBUG_LOG("purge", "skip purging last" " record on page " - << block->page.id); + << block->page.id()); btr_pcur_close(&pcur); mtr.commit(); diff --git a/storage/innobase/row/row0sel.cc b/storage/innobase/row/row0sel.cc index 34d83ef5e80..d56985a0a2e 100644 --- a/storage/innobase/row/row0sel.cc +++ b/storage/innobase/row/row0sel.cc @@ -1241,11 +1241,9 @@ sel_set_rec_lock( trx = thr_get_trx(thr); - if (UT_LIST_GET_LEN(trx->lock.trx_locks) > 10000) { - if (buf_LRU_buf_pool_running_out()) { - - return(DB_LOCK_TABLE_FULL); - } + if (UT_LIST_GET_LEN(trx->lock.trx_locks) > 10000 + && buf_pool.running_out()) { + return DB_LOCK_TABLE_FULL; } if (dict_index_is_clust(index)) { @@ -3312,7 +3310,7 @@ Row_sel_get_clust_rec_for_mysql::operator()( same as btr_pcur_get_block(prebuilt->pcur), and is it not unsafe to use RW_NO_LATCH here? 
*/ buf_block_t* block = buf_page_get_gen( - btr_pcur_get_block(prebuilt->pcur)->page.id, + btr_pcur_get_block(prebuilt->pcur)->page.id(), btr_pcur_get_block(prebuilt->pcur)->zip_size(), RW_NO_LATCH, NULL, BUF_GET, __FILE__, __LINE__, mtr, &err); @@ -4779,12 +4777,12 @@ wrong_offs: << static_cast<const void*>(rec) << ", buf block fix count " << btr_pcur_get_block(pcur)->page - .buf_fix_count; + .buf_fix_count(); ib::error() << "Index corruption: rec offs " << page_offset(rec) << " next offs " - << next_offs << ", page no " - << btr_pcur_get_block(pcur)->page.id.page_no() + << next_offs + << btr_pcur_get_block(pcur)->page.id() << ", index " << index->name << " of table " << index->table->name << ". Run CHECK TABLE. You may need to" @@ -4800,8 +4798,8 @@ wrong_offs: ib::info() << "Index corruption: rec offs " << page_offset(rec) << " next offs " - << next_offs << ", page no " - << btr_pcur_get_block(pcur)->page.id.page_no() + << next_offs + << btr_pcur_get_block(pcur)->page.id() << ", index " << index->name << " of table " << index->table->name << ". We try to skip the rest of the page."; @@ -4828,8 +4826,8 @@ wrong_offs: ib::error() << "Index corruption: rec offs " << page_offset(rec) << " next offs " - << next_offs << ", page no " - << btr_pcur_get_block(pcur)->page.id.page_no() + << next_offs + << btr_pcur_get_block(pcur)->page.id() << ", index " << index->name << " of table " << index->table->name << ". 
We try to skip the record."; diff --git a/storage/innobase/row/row0undo.cc b/storage/innobase/row/row0undo.cc index 8fca99a44b8..2d56ddb21b5 100644 --- a/storage/innobase/row/row0undo.cc +++ b/storage/innobase/row/row0undo.cc @@ -348,7 +348,7 @@ static bool row_undo_rec_get(undo_node_t* node) trx->pages_undone++; } - undo->top_page_no = prev_page->page.id.page_no(); + undo->top_page_no = prev_page->page.id().page_no(); undo->top_offset = page_offset(prev_rec); undo->top_undo_no = trx_undo_rec_get_undo_no(prev_rec); ut_ad(!undo->empty()); diff --git a/storage/innobase/row/row0upd.cc b/storage/innobase/row/row0upd.cc index 5583cab54da..c52e503237a 100644 --- a/storage/innobase/row/row0upd.cc +++ b/storage/innobase/row/row0upd.cc @@ -2573,8 +2573,7 @@ row_upd_clust_rec( mtr_commit(mtr); - if (buf_LRU_buf_pool_running_out()) { - + if (buf_pool.running_out()) { err = DB_LOCK_TABLE_FULL; goto func_exit; } diff --git a/storage/innobase/srv/srv0start.cc b/storage/innobase/srv/srv0start.cc index 846b93fe0ff..73e7c4b8704 100644 --- a/storage/innobase/srv/srv0start.cc +++ b/storage/innobase/srv/srv0start.cc @@ -947,6 +947,8 @@ srv_init_abort_low( #endif /* UNIV_DEBUG */ dberr_t err) { + ut_ad(srv_is_being_started); + if (create_new_db) { ib::error() << "Database creation was aborted" #ifdef UNIV_DEBUG @@ -2081,7 +2083,6 @@ void innodb_shutdown() case SRV_OPERATION_RESTORE: case SRV_OPERATION_RESTORE_DELTA: case SRV_OPERATION_RESTORE_EXPORT: - fil_close_all_files(); break; case SRV_OPERATION_NORMAL: /* Shut down the persistent files. */ @@ -2094,6 +2095,8 @@ void innodb_shutdown() } } + os_aio_free(); + fil_close_all_files(); /* Exit any remaining threads. 
*/ srv_shutdown_all_bg_threads(); @@ -2157,7 +2160,6 @@ void innodb_shutdown() } dict_sys.close(); - os_aio_free(); btr_search_sys_free(); row_mysql_close(); srv_free(); diff --git a/storage/innobase/sync/sync0arr.cc b/storage/innobase/sync/sync0arr.cc index 05fb8d76979..a599ab2f644 100644 --- a/storage/innobase/sync/sync0arr.cc +++ b/storage/innobase/sync/sync0arr.cc @@ -77,7 +77,6 @@ infinite wait The error_monitor thread scans the global wait array to signal any waiting threads who have missed the signal. */ typedef TTASEventMutex<GenericPolicy> WaitMutex; -typedef TTASEventMutex<BlockMutexPolicy> BlockWaitMutex; /** The latch types that use the sync array. */ union sync_object_t { @@ -87,9 +86,6 @@ union sync_object_t { /** Mutex instance */ WaitMutex* mutex; - - /** Block mutex instance */ - BlockWaitMutex* bpmutex; }; /** A cell where an individual thread may wait suspended until a resource @@ -294,22 +290,12 @@ sync_cell_get_event( /*================*/ sync_cell_t* cell) /*!< in: non-empty sync array cell */ { - ulint type = cell->request_type; - - if (type == SYNC_MUTEX) { - + switch(cell->request_type) { + case SYNC_MUTEX: return(cell->latch.mutex->event()); - - } else if (type == SYNC_BUF_BLOCK) { - - return(cell->latch.bpmutex->event()); - - } else if (type == RW_LOCK_X_WAIT) { - + case RW_LOCK_X_WAIT: return(cell->latch.lock->wait_ex_event); - - } else { /* RW_LOCK_S and RW_LOCK_X wait on the same event */ - + default: return(cell->latch.lock->event); } } @@ -362,8 +348,6 @@ sync_array_reserve_cell( if (cell->request_type == SYNC_MUTEX) { cell->latch.mutex = reinterpret_cast<WaitMutex*>(object); - } else if (cell->request_type == SYNC_BUF_BLOCK) { - cell->latch.bpmutex = reinterpret_cast<BlockWaitMutex*>(object); } else { cell->latch.lock = reinterpret_cast<rw_lock_t*>(object); } @@ -499,65 +483,13 @@ sync_array_cell_print( innobase_basename(cell->file), (ulong) cell->line, difftime(time(NULL), cell->reservation_time)); - if (type == SYNC_MUTEX) { - 
WaitMutex* mutex = cell->latch.mutex; - const WaitMutex::MutexPolicy& policy = mutex->policy(); -#ifdef UNIV_DEBUG - const char* name = policy.context.get_enter_filename(); - if (name == NULL) { - /* The mutex might have been released. */ - name = "NULL"; - } -#endif /* UNIV_DEBUG */ - - if (mutex) { - fprintf(file, - "Mutex at %p, %s, lock var %x\n" -#ifdef UNIV_DEBUG - "Last time reserved in file %s line %u" -#endif /* UNIV_DEBUG */ - "\n", - (void*) mutex, - policy.to_string().c_str(), - mutex->state() -#ifdef UNIV_DEBUG - ,name, - policy.context.get_enter_line() -#endif /* UNIV_DEBUG */ - ); - } - } else if (type == SYNC_BUF_BLOCK) { - BlockWaitMutex* mutex = cell->latch.bpmutex; - - const BlockWaitMutex::MutexPolicy& policy = - mutex->policy(); -#ifdef UNIV_DEBUG - const char* name = policy.context.get_enter_filename(); - if (name == NULL) { - /* The mutex might have been released. */ - name = "NULL"; - } -#endif /* UNIV_DEBUG */ - - fprintf(file, - "Mutex at %p, %s, lock var %lu\n" -#ifdef UNIV_DEBUG - "Last time reserved in file %s line %lu" -#endif /* UNIV_DEBUG */ - "\n", - (void*) mutex, - policy.to_string().c_str(), - (ulong) mutex->state() -#ifdef UNIV_DEBUG - ,name, - (ulong) policy.context.get_enter_line() -#endif /* UNIV_DEBUG */ - ); - } else if (type == RW_LOCK_X - || type == RW_LOCK_X_WAIT - || type == RW_LOCK_SX - || type == RW_LOCK_S) { - + switch (type) { + default: + ut_error; + case RW_LOCK_X: + case RW_LOCK_X_WAIT: + case RW_LOCK_SX: + case RW_LOCK_S: fputs(type == RW_LOCK_X ? "X-lock on" : type == RW_LOCK_X_WAIT ? "X-lock (wait_ex) on" : type == RW_LOCK_SX ? "SX-lock on" @@ -606,9 +538,35 @@ sync_array_cell_print( #endif ); } + break; + case SYNC_MUTEX: + WaitMutex* mutex = cell->latch.mutex; + const WaitMutex::MutexPolicy& policy = mutex->policy(); +#ifdef UNIV_DEBUG + const char* name = policy.context.get_enter_filename(); + if (name == NULL) { + /* The mutex might have been released. 
*/ + name = "NULL"; + } +#endif /* UNIV_DEBUG */ - } else { - ut_error; + if (mutex) { + fprintf(file, + "Mutex at %p, %s, lock var %x\n" +#ifdef UNIV_DEBUG + "Last time reserved in file %s line %u" +#endif /* UNIV_DEBUG */ + "\n", + (void*) mutex, + policy.to_string().c_str(), + mutex->state() +#ifdef UNIV_DEBUG + ,name, + policy.context.get_enter_line() +#endif /* UNIV_DEBUG */ + ); + } + break; } if (!cell->waiting) { @@ -781,52 +739,6 @@ sync_array_detect_deadlock( return(false); } - case SYNC_BUF_BLOCK: { - - BlockWaitMutex* mutex = cell->latch.bpmutex; - - const BlockWaitMutex::MutexPolicy& policy = - mutex->policy(); - - if (mutex->state() != MUTEX_STATE_UNLOCKED) { - thread = policy.context.get_thread_id(); - - /* Note that mutex->thread_id above may be - also OS_THREAD_ID_UNDEFINED, because the - thread which held the mutex maybe has not - yet updated the value, or it has already - released the mutex: in this case no deadlock - can occur, as the wait array cannot contain - a thread with ID_UNDEFINED value. */ - ret = sync_array_deadlock_step( - arr, start, thread, 0, depth); - - if (ret) { - const char* name; - - name = policy.context.get_enter_filename(); - - if (name == NULL) { - /* The mutex might have been - released. 
*/ - name = "NULL"; - } - - ib::info() - << "Mutex " << mutex << " owned by" - " thread " << os_thread_pf(thread) - << " file " << name << " line " - << policy.context.get_enter_line(); - - - return(true); - } - } - - /* No deadlock */ - return(false); - } - case RW_LOCK_X: case RW_LOCK_X_WAIT: diff --git a/storage/innobase/sync/sync0debug.cc b/storage/innobase/sync/sync0debug.cc index 2e64fdd732e..8ad3c4d0119 100644 --- a/storage/innobase/sync/sync0debug.cc +++ b/storage/innobase/sync/sync0debug.cc @@ -456,7 +456,6 @@ LatchDebug::LatchDebug() LEVEL_MAP_INSERT(SYNC_ANY_LATCH); LEVEL_MAP_INSERT(SYNC_DOUBLEWRITE); LEVEL_MAP_INSERT(SYNC_BUF_FLUSH_LIST); - LEVEL_MAP_INSERT(SYNC_BUF_BLOCK); LEVEL_MAP_INSERT(SYNC_BUF_PAGE_HASH); LEVEL_MAP_INSERT(SYNC_BUF_POOL); LEVEL_MAP_INSERT(SYNC_POOL); @@ -834,18 +833,6 @@ LatchDebug::check_order( /* Fall through */ - case SYNC_BUF_BLOCK: - - /* Either the thread must own the (buffer pool) buf_pool.mutex - or it is allowed to latch only ONE of (buffer block) - block->mutex or buf_pool.zip_mutex. 
*/ - - if (less(latches, level) != NULL) { - basic_check(latches, level, level - 1); - ut_a(find(latches, SYNC_BUF_POOL) != 0); - } - break; - case SYNC_REC_LOCK: if (find(latches, SYNC_LOCK_SYS) != 0) { @@ -1280,17 +1267,8 @@ sync_latch_meta_init() LATCH_ADD_MUTEX(AUTOINC, SYNC_DICT_AUTOINC_MUTEX, autoinc_mutex_key); -#if defined PFS_SKIP_BUFFER_MUTEX_RWLOCK || defined PFS_GROUP_BUFFER_SYNC - LATCH_ADD_MUTEX(BUF_BLOCK_MUTEX, SYNC_BUF_BLOCK, PFS_NOT_INSTRUMENTED); -#else - LATCH_ADD_MUTEX(BUF_BLOCK_MUTEX, SYNC_BUF_BLOCK, - buffer_block_mutex_key); -#endif /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */ - LATCH_ADD_MUTEX(BUF_POOL, SYNC_BUF_POOL, buf_pool_mutex_key); - LATCH_ADD_MUTEX(BUF_POOL_ZIP, SYNC_BUF_BLOCK, buf_pool_zip_mutex_key); - LATCH_ADD_MUTEX(CACHE_LAST_READ, SYNC_TRX_I_S_LAST_READ, cache_last_read_mutex_key); diff --git a/storage/innobase/sync/sync0sync.cc b/storage/innobase/sync/sync0sync.cc index af97603d551..068f15a77d8 100644 --- a/storage/innobase/sync/sync0sync.cc +++ b/storage/innobase/sync/sync0sync.cc @@ -38,9 +38,7 @@ Created 9/5/1995 Heikki Tuuri #ifdef UNIV_PFS_MUTEX /* Key to register autoinc_mutex with performance schema */ mysql_pfs_key_t autoinc_mutex_key; -mysql_pfs_key_t buffer_block_mutex_key; mysql_pfs_key_t buf_pool_mutex_key; -mysql_pfs_key_t buf_pool_zip_mutex_key; mysql_pfs_key_t cache_last_read_mutex_key; mysql_pfs_key_t dict_foreign_err_mutex_key; mysql_pfs_key_t dict_sys_mutex_key; diff --git a/storage/innobase/trx/trx0purge.cc b/storage/innobase/trx/trx0purge.cc index f2bfae77d30..c37a8b98cbd 100644 --- a/storage/innobase/trx/trx0purge.cc +++ b/storage/innobase/trx/trx0purge.cc @@ -718,7 +718,7 @@ not_free: rseg->id, sys_header, &mtr); ut_ad(rblock); rseg->page_no = rblock - ? rblock->page.id.page_no() : FIL_NULL; + ? 
rblock->page.id().page_no() : FIL_NULL; ut_ad(old_page == rseg->page_no); /* Before re-initialization ensure that we @@ -927,7 +927,7 @@ trx_purge_read_undo_rec() offset = page_offset(undo_rec); undo_no = trx_undo_rec_get_undo_no(undo_rec); - page_no = undo_page->page.id.page_no(); + page_no = undo_page->page.id().page_no(); } else { offset = 0; undo_no = 0; @@ -1029,7 +1029,7 @@ trx_purge_get_next_rec( page_id_t(space, page_no), &mtr); } else { purge_sys.offset = page_offset(rec2); - purge_sys.page_no = rec2_page->page.id.page_no(); + purge_sys.page_no = rec2_page->page.id().page_no(); purge_sys.tail.undo_no = trx_undo_rec_get_undo_no(rec2); if (undo_page != rec2_page) { diff --git a/storage/innobase/trx/trx0rec.cc b/storage/innobase/trx/trx0rec.cc index cda1bd6f22c..eb1ced21060 100644 --- a/storage/innobase/trx/trx0rec.cc +++ b/storage/innobase/trx/trx0rec.cc @@ -1891,7 +1891,8 @@ dberr_t trx_undo_report_rename(trx_t* trx, const dict_table_t* table) ut_ad(undo); for (ut_d(int loop_count = 0);;) { ut_ad(loop_count++ < 2); - ut_ad(undo->last_page_no == block->page.id.page_no()); + ut_ad(undo->last_page_no + == block->page.id().page_no()); if (uint16_t offset = trx_undo_page_report_rename( trx, table, block, &mtr)) { @@ -2047,7 +2048,7 @@ trx_undo_report_row_operation( undo->withdraw_clock = buf_pool.withdraw_clock(); mtr_commit(&mtr); - undo->top_page_no = undo_block->page.id.page_no(); + undo->top_page_no = undo_block->page.id().page_no(); undo->top_offset = offset; undo->top_undo_no = trx->undo_no++; undo->guess_block = undo_block; @@ -2079,7 +2080,7 @@ trx_undo_report_row_operation( return(DB_SUCCESS); } - ut_ad(undo_block->page.id.page_no() == undo->last_page_no); + ut_ad(undo_block->page.id().page_no() == undo->last_page_no); /* We have to extend the undo log by one page */ diff --git a/storage/innobase/trx/trx0rseg.cc b/storage/innobase/trx/trx0rseg.cc index 3790b89bf35..b6e934761a2 100644 --- a/storage/innobase/trx/trx0rseg.cc +++ 
b/storage/innobase/trx/trx0rseg.cc @@ -350,7 +350,7 @@ trx_rseg_header_create( *sys_header, TRX_SYS + TRX_SYS_RSEGS + TRX_SYS_RSEG_PAGE_NO + rseg_id * TRX_SYS_RSEG_SLOT_SIZE - + sys_header->frame, block->page.id.page_no()); + + sys_header->frame, block->page.id().page_no()); } return block; @@ -683,7 +683,8 @@ trx_rseg_create(ulint space_id) ut_ad(trx_sysf_rseg_get_space(sys_header, rseg_id) == space_id); rseg = trx_rseg_mem_create(rseg_id, space, - rblock->page.id.page_no()); + rblock->page.id(). + page_no()); ut_ad(rseg->id == rseg_id); ut_ad(rseg->is_persistent()); ut_ad(!trx_sys.rseg_array[rseg->id]); @@ -710,7 +711,7 @@ trx_temp_rseg_create() buf_block_t* rblock = trx_rseg_header_create( fil_system.temp_space, i, NULL, &mtr); trx_rseg_t* rseg = trx_rseg_mem_create( - i, fil_system.temp_space, rblock->page.id.page_no()); + i, fil_system.temp_space, rblock->page.id().page_no()); ut_ad(!rseg->is_persistent()); ut_ad(!trx_sys.temp_rsegs[i]); trx_sys.temp_rsegs[i] = rseg; diff --git a/storage/innobase/trx/trx0sys.cc b/storage/innobase/trx/trx0sys.cc index 5131a07d4a5..45d1c81621b 100644 --- a/storage/innobase/trx/trx0sys.cc +++ b/storage/innobase/trx/trx0sys.cc @@ -163,7 +163,7 @@ trx_sysf_create( mtr); buf_block_dbg_add_level(block, SYNC_TRX_SYS_HEADER); - ut_a(block->page.id.page_no() == TRX_SYS_PAGE_NO); + ut_a(block->page.id().page_no() == TRX_SYS_PAGE_NO); mtr->write<2>(*block, FIL_PAGE_TYPE + block->frame, FIL_PAGE_TYPE_TRX_SYS); @@ -195,7 +195,7 @@ trx_sysf_create( slot_no, block, mtr); ut_a(slot_no == TRX_SYS_SYSTEM_RSEG_ID); - ut_a(rblock->page.id.page_no() == FSP_FIRST_RSEG_PAGE_NO); + ut_a(rblock->page.id().page_no() == FSP_FIRST_RSEG_PAGE_NO); } /** Create the instance */ diff --git a/storage/innobase/trx/trx0undo.cc b/storage/innobase/trx/trx0undo.cc index 687258767ce..9ec219870c4 100644 --- a/storage/innobase/trx/trx0undo.cc +++ b/storage/innobase/trx/trx0undo.cc @@ -117,7 +117,7 @@ static uint16_t trx_undo_page_get_start(const buf_block_t *block, 
uint32_t page_no, uint16_t offset) { - return page_no == block->page.id.page_no() + return page_no == block->page.id().page_no() ? mach_read_from_2(offset + TRX_UNDO_LOG_START + block->frame) : TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_HDR_SIZE; } @@ -171,10 +171,10 @@ trx_undo_get_prev_rec_from_prev_page(buf_block_t *&block, uint16_t rec, block->frame).page; if (prev_page_no == FIL_NULL) - return NULL; + return nullptr; - block = buf_page_get(page_id_t(block->page.id.space(), prev_page_no), - 0, shared ? RW_S_LATCH : RW_X_LATCH, mtr); + block= buf_page_get(page_id_t(block->page.id().space(), prev_page_no), + 0, shared ? RW_S_LATCH : RW_X_LATCH, mtr); buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE); return trx_undo_page_get_last_rec(block, page_no, offset); @@ -233,7 +233,7 @@ static trx_undo_rec_t* trx_undo_get_next_rec_from_next_page(buf_block_t *&block, uint32_t page_no, uint16_t offset, ulint mode, mtr_t *mtr) { - if (page_no == block->page.id.page_no() && + if (page_no == block->page.id().page_no() && mach_read_from_2(block->frame + offset + TRX_UNDO_NEXT_LOG)) return NULL; @@ -242,7 +242,7 @@ trx_undo_get_next_rec_from_next_page(buf_block_t *&block, uint32_t page_no, if (next == FIL_NULL) return NULL; - block= buf_page_get(page_id_t(block->page.id.space(), next), 0, mode, mtr); + block= buf_page_get(page_id_t(block->page.id().space(), next), 0, mode, mtr); buf_block_dbg_add_level(block, SYNC_TRX_UNDO_PAGE); return trx_undo_page_get_first_rec(block, page_no, offset); @@ -414,7 +414,7 @@ trx_undo_seg_create(fil_space_t *space, buf_block_t *rseg_hdr, ulint *id, *id = slot_no; mtr->write<4>(*rseg_hdr, TRX_RSEG + TRX_RSEG_UNDO_SLOTS + slot_no * TRX_RSEG_SLOT_SIZE + rseg_hdr->frame, - block->page.id.page_no()); + block->page.id().page_no()); MONITOR_INC(MONITOR_NUM_UNDO_SLOT_USED); @@ -580,7 +580,7 @@ buf_block_t* trx_undo_add_page(trx_undo_t* undo, mtr_t* mtr) ut_ad(rw_lock_get_x_lock_count(&new_block->lock) == 1); buf_block_dbg_add_level(new_block, 
SYNC_TRX_UNDO_PAGE); - undo->last_page_no = new_block->page.id.page_no(); + undo->last_page_no = new_block->page.id().page_no(); mtr->undo_create(*new_block); trx_undo_page_init(*new_block); @@ -767,7 +767,7 @@ done: goto done; } - if (undo_page->page.id.page_no() == hdr_page_no) { + if (undo_page->page.id().page_no() == hdr_page_no) { uint16_t end = mach_read_from_2(hdr_offset + TRX_UNDO_NEXT_LOG + undo_page->frame); if (end == 0) { @@ -780,7 +780,7 @@ done: + TRX_UNDO_LOG_START, end); } else { trx_undo_free_page(rseg, true, hdr_page_no, - undo_page->page.id.page_no(), &mtr); + undo_page->page.id().page_no(), &mtr); } mtr_commit(&mtr); @@ -1044,7 +1044,7 @@ trx_undo_create(trx_t* trx, trx_rseg_t* rseg, trx_undo_t** undo, uint16_t offset = trx_undo_header_create(block, trx->id, mtr); *undo = trx_undo_mem_create(rseg, id, trx->id, trx->xid, - block->page.id.page_no(), offset); + block->page.id().page_no(), offset); if (*undo == NULL) { *err = DB_OUT_OF_MEMORY; /* FIXME: this will not free the undo block to the file */ diff --git a/storage/innobase/ut/ut0ut.cc b/storage/innobase/ut/ut0ut.cc index c7762fb2273..b3b6ad6099c 100644 --- a/storage/innobase/ut/ut0ut.cc +++ b/storage/innobase/ut/ut0ut.cc @@ -1,7 +1,7 @@ /***************************************************************************** Copyright (c) 1994, 2017, Oracle and/or its affiliates. All Rights Reserved. -Copyright (c) 2017, 2019, MariaDB Corporation. +Copyright (c) 2017, 2020, MariaDB Corporation. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -218,27 +218,6 @@ ut_print_buf( ut_print_buf_hex(o, buf, len); } -/*************************************************************//** -Calculates fast the number rounded up to the nearest power of 2. 
-@return first power of 2 which is >= n */ -ulint -ut_2_power_up( -/*==========*/ - ulint n) /*!< in: number != 0 */ -{ - ulint res; - - res = 1; - - ut_ad(n > 0); - - while (res < n) { - res = res * 2; - } - - return(res); -} - /** Get a fixed-length string, quoted as an SQL identifier. If the string contains a slash '/', the string will be output as two identifiers separated by a period (.), diff --git a/tpool/tpool.h b/tpool/tpool.h index ae9baf236b0..239be53c27e 100644 --- a/tpool/tpool.h +++ b/tpool/tpool.h @@ -229,7 +229,7 @@ public: m_aio.reset(); } int bind(native_file_handle &fd) { return m_aio->bind(fd); } - void unbind(const native_file_handle &fd) { m_aio->unbind(fd); } + void unbind(const native_file_handle &fd) { if (m_aio) m_aio->unbind(fd); } int submit_io(aiocb *cb) { return m_aio->submit_io(cb); } virtual void wait_begin() {}; virtual void wait_end() {}; |