summary refs log tree commit diff
diff options
context:
space:
mode:
authorMarko Mäkelä <marko.makela@mariadb.com>2019-06-11 11:03:18 +0300
committerMarko Mäkelä <marko.makela@mariadb.com>2019-06-11 11:08:39 +0300
commit177a571e01e8ea949601aa1124ea86c376fd0472 (patch)
tree0b349bee88aaf0ff52618fc1a340e15e1ea3747b
parent992d2494e766f49e9c38eb631d0187ab63a8c7e2 (diff)
downloadmariadb-git-177a571e01e8ea949601aa1124ea86c376fd0472.tar.gz
MDEV-19586 Replace recv_sys_t::addr_hash with a std::map
InnoDB crash recovery buffers redo log records in a hash table. The function recv_read_in_area() would pick a random hash bucket and then try to submit read requests for a few nearby pages. Let us replace the recv_sys.addr_hash with a std::map, which will automatically be iterated in sorted order. recv_sys_t::pages: Replaces recv_sys_t::addr_hash, recv_sys_t::n_addrs. recv_sys_t::recs: Replaces most of recv_addr_t. recv_t: Encapsulate a raw singly-linked list of records. This reduces overhead compared to std::forward_list. Storage and cache overhead, because the next-element pointer also points to the data payload. Processing overhead, because recv_sys_t::recs_t::last will point to the last record, so that recv_sys_t::add() can append straight to the end of the list. RECV_PROCESSED, RECV_DISCARDED: Remove. When a page is fully processed, it will be deleted from recv_sys.pages. recv_sys_t::trim(): Replaces recv_addr_trim(). recv_sys_t::add(): Use page_id_t for identifying pages. recv_fold(), recv_hash(), recv_get_fil_addr_struct(): Remove. recv_read_in_area(): Simplify the iteration.
-rw-r--r--storage/innobase/include/fil0fil.h1
-rw-r--r--storage/innobase/include/log0recv.h71
-rw-r--r--storage/innobase/log/log0recv.cc693
3 files changed, 319 insertions, 446 deletions
diff --git a/storage/innobase/include/fil0fil.h b/storage/innobase/include/fil0fil.h
index 04c7f4f18d1..3e04d90e04e 100644
--- a/storage/innobase/include/fil0fil.h
+++ b/storage/innobase/include/fil0fil.h
@@ -31,6 +31,7 @@ Created 10/25/1995 Heikki Tuuri
#ifndef UNIV_INNOCHECKSUM
+#include "hash0hash.h"
#include "log0recv.h"
#include "dict0types.h"
#ifdef UNIV_LINUX
diff --git a/storage/innobase/include/log0recv.h b/storage/innobase/include/log0recv.h
index cfa0443f301..0914e2710d7 100644
--- a/storage/innobase/include/log0recv.h
+++ b/storage/innobase/include/log0recv.h
@@ -29,7 +29,6 @@ Created 9/20/1997 Heikki Tuuri
#include "ut0byte.h"
#include "buf0types.h"
-#include "hash0hash.h"
#include "log0log.h"
#include "mtr0types.h"
@@ -48,10 +47,10 @@ dberr_t
recv_find_max_checkpoint(ulint* max_field)
MY_ATTRIBUTE((nonnull, warn_unused_result));
-/** Reduces recv_sys.n_addrs for the corrupted page.
+/** Remove records for a corrupted page.
This function should be called when srv_force_recovery > 0.
@param[in] bpage buffer pool page */
-void recv_recover_corrupt_page(buf_page_t* bpage);
+ATTRIBUTE_COLD void recv_recover_corrupt_page(buf_page_t* bpage);
/** Apply any buffered redo log to a page that was just read from a data file.
@param[in,out] bpage buffer pool page */
@@ -80,13 +79,13 @@ void
recv_sys_var_init(void);
/*===================*/
-/** Apply the hash table of stored log records to persistent data pages.
+/** Apply recv_sys.pages to persistent data pages.
@param[in] last_batch whether the change buffer merge will be
performed as part of the operation */
void
recv_apply_hashed_log_recs(bool last_batch);
-/** Whether to store redo log records to the hash table */
+/** Whether to store redo log records in recv_sys.pages */
enum store_t {
/** Do not store redo log records. */
STORE_NO,
@@ -105,8 +104,8 @@ recv_sys.parse_start_lsn is non-zero.
@return true if more data added */
bool recv_sys_add_to_parsing_buf(const byte* log_block, lsn_t scanned_lsn);
-/** Parse log records from a buffer and optionally store them to a
-hash table to wait merging to file pages.
+/** Parse log records from a buffer and optionally store them in recv_sys.pages
+to wait merging to file pages.
@param[in] checkpoint_lsn the LSN of the latest checkpoint
@param[in] store whether to store page operations
@param[in] apply whether to apply the records
@@ -144,8 +143,12 @@ struct recv_data_t{
/** Stored log record struct */
struct recv_t{
- mlog_id_t type; /*!< log record type */
- ulint len; /*!< log record body length in bytes */
+ /** next record */
+ recv_t* next;
+ /** log record body length in bytes */
+ uint32_t len;
+ /** log record type */
+ mlog_id_t type;
recv_data_t* data; /*!< chain of blocks containing the log record
body */
lsn_t start_lsn;/*!< start lsn of the log segment written by
@@ -156,8 +159,6 @@ struct recv_t{
the mtr which generated this log record: NOTE
that this is not necessarily the end lsn of
this log record */
- UT_LIST_NODE_T(recv_t)
- rec_list;/*!< list of log records for this page */
};
struct recv_dblwr_t {
@@ -205,7 +206,7 @@ struct recv_sys_t{
lsn_t parse_start_lsn;
/*!< this is the lsn from which we were able to
start parsing log records and adding them to
- the hash table; zero if a suitable
+ pages; zero if a suitable
start point not found yet */
lsn_t scanned_lsn;
/*!< the log data has been scanned up to this
@@ -234,9 +235,38 @@ struct recv_sys_t{
ib_time_t progress_time;
mem_heap_t* heap; /*!< memory heap of log records and file
addresses*/
- hash_table_t* addr_hash;/*!< hash table of file addresses of pages */
- ulint n_addrs;/*!< number of not processed hashed file
- addresses in the hash table */
+
+ /** buffered records waiting to be applied to a page */
+ struct recs_t
+ {
+ /** Recovery state */
+ enum {
+ /** not yet processed */
+ RECV_NOT_PROCESSED,
+ /** not processed; the page will be reinitialized */
+ RECV_WILL_NOT_READ,
+ /** page is being read */
+ RECV_BEING_READ,
+ /** log records are being applied on the page */
+ RECV_BEING_PROCESSED
+ } state;
+ /** First log record */
+ recv_t* log;
+ /** Last log record */
+ recv_t* last;
+ };
+
+ using map = std::map<const page_id_t, recs_t,
+ std::less<const page_id_t>,
+ ut_allocator<std::pair<const page_id_t,recs_t>>>;
+ /** buffered records waiting to be applied to pages */
+ map pages;
+
+ /** Process a record that indicates that a tablespace is
+ being shrunk in size.
+ @param page_id first page identifier that is not in the file
+ @param lsn log sequence number of the shrink operation */
+ inline void trim(const page_id_t page_id, lsn_t lsn);
/** Undo tablespaces for which truncate has been logged
(indexed by id - srv_undo_space_id_start) */
@@ -249,7 +279,7 @@ struct recv_sys_t{
recv_dblwr_t dblwr;
- /** Lastly added LSN to the hash table of log records. */
+ /** Last added LSN to pages. */
lsn_t last_stored_lsn;
/** Initialize the redo log recovery subsystem. */
@@ -265,13 +295,12 @@ struct recv_sys_t{
/** Store a redo log record for applying.
@param type record type
- @param space tablespace identifier
- @param page_no page number
+ @param page_id page identifier
@param body record body
@param rec_end end of record
@param lsn start LSN of the mini-transaction
@param end_lsn end LSN of the mini-transaction */
- inline void add(mlog_id_t type, ulint space, ulint page_no,
+ inline void add(mlog_id_t type, const page_id_t page_id,
byte* body, byte* rec_end, lsn_t lsn,
lsn_t end_lsn);
@@ -301,8 +330,8 @@ otherwise. Note that this is FALSE while a background thread is
rolling back incomplete transactions. */
extern volatile bool recv_recovery_on;
/** If the following is TRUE, the buffer pool file pages must be invalidated
-after recovery and no ibuf operations are allowed; this becomes TRUE if
-the log record hash table becomes too full, and log records must be merged
+after recovery and no ibuf operations are allowed; this will be set if
+recv_sys.pages becomes too full, and log records must be merged
to file pages already before the recovery is finished: in this case no
ibuf operations are allowed, as they could modify the pages read in the
buffer pool before the pages have been recovered to the up-to-date state.
diff --git a/storage/innobase/log/log0recv.cc b/storage/innobase/log/log0recv.cc
index f1c5a2b4948..86de0a15fcb 100644
--- a/storage/innobase/log/log0recv.cc
+++ b/storage/innobase/log/log0recv.cc
@@ -61,7 +61,7 @@ this must be less than srv_page_size as it is stored in the buffer pool */
#define RECV_DATA_BLOCK_SIZE (MEM_MAX_ALLOC_IN_BUF - sizeof(recv_data_t))
/** Read-ahead area in applying log records to file pages */
-#define RECV_READ_AHEAD_AREA 32
+#define RECV_READ_AHEAD_AREA 32U
/** The recovery system */
recv_sys_t recv_sys;
@@ -178,37 +178,6 @@ typedef std::map<
static recv_spaces_t recv_spaces;
-/** States of recv_addr_t */
-enum recv_addr_state {
- /** not yet processed */
- RECV_NOT_PROCESSED,
- /** not processed; the page will be reinitialized */
- RECV_WILL_NOT_READ,
- /** page is being read */
- RECV_BEING_READ,
- /** log records are being applied on the page */
- RECV_BEING_PROCESSED,
- /** log records have been applied on the page */
- RECV_PROCESSED,
- /** log records have been discarded because the tablespace
- does not exist */
- RECV_DISCARDED
-};
-
-/** Hashed page file address struct */
-struct recv_addr_t{
- /** recovery state of the page */
- recv_addr_state state;
- /** tablespace identifier */
- unsigned space:32;
- /** page number */
- unsigned page_no:32;
- /** list of log records for this page */
- UT_LIST_BASE_NODE_T(recv_t) rec_list;
- /** hash node in the hash bucket chain */
- hash_node_t addr_hash;
-};
-
/** Report optimized DDL operation (without redo log),
corresponding to MLOG_INDEX_LOAD.
@param[in] space_id tablespace identifier
@@ -250,19 +219,18 @@ private:
ut_allocator<std::pair<const page_id_t, init> > >
map;
/** Map of page initialization operations.
- FIXME: Merge this to recv_sys.addr_hash! */
+ FIXME: Merge this to recv_sys.pages! */
map inits;
public:
/** Record that a page will be initialized by the redo log.
- @param[in] space tablespace identifier
- @param[in] page_no page number
+ @param[in] page_id page identifier
@param[in] lsn log sequence number */
- void add(ulint space, ulint page_no, lsn_t lsn)
+ void add(const page_id_t page_id, lsn_t lsn)
{
ut_ad(mutex_own(&recv_sys.mutex));
const init init = { lsn, false };
std::pair<map::iterator, bool> p = inits.insert(
- map::value_type(page_id_t(space, page_no), init));
+ map::value_type(page_id, init));
ut_ad(!p.first->second.created);
if (!p.second && p.first->second.lsn < init.lsn) {
p.first->second = init;
@@ -337,54 +305,43 @@ public:
static mlog_init_t mlog_init;
-/** Process a MLOG_CREATE2 record that indicates that a tablespace
-is being shrunk in size.
-@param[in] space_id tablespace identifier
-@param[in] pages trimmed size of the file, in pages
-@param[in] lsn log sequence number of the operation */
-static void recv_addr_trim(ulint space_id, unsigned pages, lsn_t lsn)
+/** Process a record that indicates that a tablespace is
+being shrunk in size.
+@param page_id first page identifier that is not in the file
+@param lsn log sequence number of the shrink operation */
+inline void recv_sys_t::trim(const page_id_t page_id, lsn_t lsn)
{
- DBUG_ENTER("recv_addr_trim");
+ DBUG_ENTER("recv_sys_t::trim");
DBUG_LOG("ib_log",
"discarding log beyond end of tablespace "
- << page_id_t(space_id, pages) << " before LSN " << lsn);
- ut_ad(mutex_own(&recv_sys.mutex));
- for (ulint i = recv_sys.addr_hash->n_cells; i--; ) {
- hash_cell_t* const cell = hash_get_nth_cell(
- recv_sys.addr_hash, i);
- for (recv_addr_t* addr = static_cast<recv_addr_t*>(cell->node),
- *next;
- addr; addr = next) {
- next = static_cast<recv_addr_t*>(addr->addr_hash);
-
- if (addr->space != space_id || addr->page_no < pages) {
- continue;
+ << page_id << " before LSN " << lsn);
+ ut_ad(mutex_own(&mutex));
+ for (recv_sys_t::map::iterator p = pages.lower_bound(page_id);
+ p != pages.end() && p->first.space() == page_id.space();) {
+ for (recv_t** prev = &p->second.log;;) {
+ if (!*prev || (*prev)->start_lsn >= lsn) {
+ break;
}
+ DBUG_LOG("ib_log", "Discarding "
+ << get_mlog_string((*prev)->type)
+ << " for " << p->first << " at "
+ << (*prev)->start_lsn);
+ *prev = (*prev)->next;
+ }
- for (recv_t* recv = UT_LIST_GET_FIRST(addr->rec_list);
- recv; ) {
- recv_t* n = UT_LIST_GET_NEXT(rec_list, recv);
- if (recv->start_lsn < lsn) {
- DBUG_PRINT("ib_log",
- ("Discarding %s for"
- " page %u:%u at " LSN_PF,
- get_mlog_string(
- recv->type),
- addr->space, addr->page_no,
- recv->start_lsn));
- UT_LIST_REMOVE(addr->rec_list, recv);
- }
- recv = n;
- }
+ recv_sys_t::map::iterator r = p++;
+
+ if (!r->second.log) {
+ pages.erase(r);
}
}
- if (fil_space_t* space = fil_space_get(space_id)) {
+ if (fil_space_t* space = fil_space_get(page_id.space())) {
ut_ad(UT_LIST_GET_LEN(space->chain) == 1);
fil_node_t* file = UT_LIST_GET_FIRST(space->chain);
ut_ad(file->is_open());
os_file_truncate(file->name, file->handle,
- os_offset_t(pages) << srv_page_size_shift,
- true);
+ os_offset_t{page_id.page_no()}
+ << srv_page_size_shift, true);
}
DBUG_VOID_RETURN;
}
@@ -711,11 +668,7 @@ void recv_sys_t::close()
if (is_initialised()) {
dblwr.pages.clear();
-
- if (addr_hash) {
- hash_table_free(addr_hash);
- addr_hash = NULL;
- }
+ pages.clear();
if (heap) {
mem_heap_free(heap);
@@ -859,8 +812,6 @@ void recv_sys_t::create()
found_corrupt_fs = false;
mlog_checkpoint_lsn = 0;
- addr_hash = hash_create(size / 512);
- n_addrs = 0;
progress_time = ut_time();
recv_max_page_lsn = 0;
@@ -872,12 +823,8 @@ void recv_sys_t::create()
inline void recv_sys_t::empty()
{
ut_ad(mutex_own(&mutex));
- ut_a(n_addrs == 0);
-
- hash_table_free(addr_hash);
+ pages.clear();
mem_heap_empty(heap);
-
- addr_hash = hash_create(buf_pool_get_curr_size() / 512);
}
/** Free most recovery data structures. */
@@ -887,13 +834,12 @@ void recv_sys_t::debug_free()
ut_ad(is_initialised());
mutex_enter(&mutex);
- hash_table_free(addr_hash);
+ pages.clear();
mem_heap_free(heap);
ut_free_dodump(buf, buf_size);
buf = NULL;
heap = NULL;
- addr_hash = NULL;
/* wake page cleaner up to progress */
if (!srv_read_only_mode) {
@@ -1752,74 +1698,14 @@ parse_log:
return(ptr);
}
-/*********************************************************************//**
-Calculates the fold value of a page file address: used in inserting or
-searching for a log record in the hash table.
-@return folded value */
-UNIV_INLINE
-ulint
-recv_fold(
-/*======*/
- ulint space, /*!< in: space */
- ulint page_no)/*!< in: page number */
-{
- return(ut_fold_ulint_pair(space, page_no));
-}
-
-/*********************************************************************//**
-Calculates the hash value of a page file address: used in inserting or
-searching for a log record in the hash table.
-@return folded value */
-UNIV_INLINE
-ulint
-recv_hash(
-/*======*/
- ulint space, /*!< in: space */
- ulint page_no)/*!< in: page number */
-{
- return(hash_calc_hash(recv_fold(space, page_no), recv_sys.addr_hash));
-}
-
-/*********************************************************************//**
-Gets the hashed file address struct for a page.
-@return file address struct, NULL if not found from the hash table */
-static
-recv_addr_t*
-recv_get_fil_addr_struct(
-/*=====================*/
- ulint space, /*!< in: space id */
- ulint page_no)/*!< in: page number */
-{
- ut_ad(mutex_own(&recv_sys.mutex));
-
- recv_addr_t* recv_addr;
-
- for (recv_addr = static_cast<recv_addr_t*>(
- HASH_GET_FIRST(recv_sys.addr_hash,
- recv_hash(space, page_no)));
- recv_addr != 0;
- recv_addr = static_cast<recv_addr_t*>(
- HASH_GET_NEXT(addr_hash, recv_addr))) {
-
- if (recv_addr->space == space
- && recv_addr->page_no == page_no) {
-
- return(recv_addr);
- }
- }
-
- return(NULL);
-}
-
/** Store a redo log record for applying.
@param type record type
-@param space tablespace identifier
-@param page_no page number
+@param page_id page identifier
@param body record body
@param rec_end end of record
@param lsn start LSN of the mini-transaction
@param end_lsn end LSN of the mini-transaction */
-inline void recv_sys_t::add(mlog_id_t type, ulint space, ulint page_no,
+inline void recv_sys_t::add(mlog_id_t type, const page_id_t page_id,
byte* body, byte* rec_end, lsn_t lsn,
lsn_t end_lsn)
{
@@ -1832,46 +1718,33 @@ inline void recv_sys_t::add(mlog_id_t type, ulint space, ulint page_no,
ut_ad(type != MLOG_INDEX_LOAD);
ut_ad(type != MLOG_TRUNCATE);
- recv_t* recv= static_cast<recv_t*>(mem_heap_alloc(heap, sizeof *recv));
-
- recv->type = type;
- recv->len = ulint(rec_end - body);
- recv->start_lsn = lsn;
- recv->end_lsn = end_lsn;
-
- recv_addr_t* recv_addr = recv_get_fil_addr_struct(space, page_no);
-
- if (recv_addr == NULL) {
- recv_addr = static_cast<recv_addr_t*>(
- mem_heap_alloc(heap, sizeof(recv_addr_t)));
-
- recv_addr->space = space;
- recv_addr->page_no = page_no;
- recv_addr->state = RECV_NOT_PROCESSED;
-
- UT_LIST_INIT(recv_addr->rec_list, &recv_t::rec_list);
-
- HASH_INSERT(recv_addr_t, addr_hash, addr_hash,
- recv_fold(space, page_no), recv_addr);
- n_addrs++;
- }
+ std::pair<map::iterator, bool> p = pages.insert(
+ map::value_type(page_id, recs_t{recs_t::RECV_NOT_PROCESSED,
+ NULL, NULL}));
+ recv_sys_t::recs_t& recs = p.first->second;
+ ut_ad(p.second == !recs.log);
+ ut_ad(p.second == !recs.last);
+ recv_data_t** prev_field;
switch (type) {
case MLOG_INIT_FILE_PAGE2:
case MLOG_ZIP_PAGE_COMPRESS:
case MLOG_INIT_FREE_PAGE:
/* Ignore any earlier redo log records for this page. */
- ut_ad(recv_addr->state == RECV_NOT_PROCESSED
- || recv_addr->state == RECV_WILL_NOT_READ);
- recv_addr->state = RECV_WILL_NOT_READ;
- mlog_init.add(space, page_no, lsn);
+ ut_ad(recs.state == recs_t::RECV_NOT_PROCESSED
+ || recs.state == recs_t::RECV_WILL_NOT_READ);
+ recs.state = recs_t::RECV_WILL_NOT_READ;
+ mlog_init.add(page_id, lsn);
+ recs.last = NULL;
+ /* fall through */
default:
- break;
- }
+ recv_t** prev = recs.last ? &recs.last->next : &recs.log;
- UT_LIST_ADD_LAST(recv_addr->rec_list, recv);
-
- recv_data_t** prev_field = &recv->data;
+ *prev = recs.last = new (mem_heap_alloc(heap, sizeof(recv_t)))
+ recv_t{NULL, uint32_t(rec_end - body), type, NULL,
+ lsn, end_lsn};
+ prev_field = &(*prev)->data;
+ }
/* Store the log record body in chunks of less than srv_page_size:
heap grows into the buffer pool, and bigger chunks could not
@@ -1906,39 +1779,31 @@ void
recv_data_copy_to_buf(
/*==================*/
byte* buf, /*!< in: buffer of length at least recv->len */
- recv_t* recv) /*!< in: log record */
+ const recv_t& recv) /*!< in: log record */
{
- recv_data_t* recv_data;
- ulint part_len;
- ulint len;
-
- len = recv->len;
- recv_data = recv->data;
+ const recv_data_t* recv_data = recv.data;
+ ulint len = recv.len;
- while (len > 0) {
- if (len > RECV_DATA_BLOCK_SIZE) {
- part_len = RECV_DATA_BLOCK_SIZE;
- } else {
- part_len = len;
- }
-
- ut_memcpy(buf, ((byte*) recv_data) + sizeof(recv_data_t),
- part_len);
+ do {
+ const ulint part_len = std::min<ulint>(len,
+ RECV_DATA_BLOCK_SIZE);
+ memcpy(buf, &reinterpret_cast<const byte*>(recv_data)[
+ sizeof(recv_data_t)],
+ part_len);
+ recv_data = recv_data->next;
buf += part_len;
len -= part_len;
-
- recv_data = recv_data->next;
- }
+ } while (len);
}
/** Apply the hashed log records to the page, if the page lsn is less than the
lsn of a log record.
@param[in,out] block buffer pool page
@param[in,out] mtr mini-transaction
-@param[in,out] recv_addr recovery address
+@param[in,out] p recovery address
@param[in,out] init page initialization operation, or NULL */
static void recv_recover_page(buf_block_t* block, mtr_t& mtr,
- recv_addr_t* recv_addr,
+ const recv_sys_t::map::iterator& p,
mlog_init_t::init* init = NULL)
{
page_t* page;
@@ -1947,19 +1812,18 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr,
ut_ad(mutex_own(&recv_sys.mutex));
ut_ad(recv_sys.apply_log_recs);
ut_ad(recv_needed_recovery);
- ut_ad(recv_addr->state != RECV_BEING_PROCESSED);
- ut_ad(recv_addr->state != RECV_PROCESSED);
ut_ad(!init || init->created);
ut_ad(!init || init->lsn);
+ ut_ad(block->page.id == p->first);
+ ut_ad(p->second.state != recv_sys_t::recs_t::RECV_BEING_PROCESSED);
if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) {
- fprintf(stderr, "Applying log to page %u:%u\n",
- recv_addr->space, recv_addr->page_no);
+ ib::info() << "Applying log to page " << block->page.id;
}
DBUG_LOG("ib_log", "Applying log to page " << block->page.id);
- recv_addr->state = RECV_BEING_PROCESSED;
+ p->second.state = recv_sys_t::recs_t::RECV_BEING_PROCESSED;
mutex_exit(&recv_sys.mutex);
page = block->frame;
@@ -1974,11 +1838,15 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr,
bool free_page = false;
lsn_t start_lsn = 0, end_lsn = 0;
+ ut_d(lsn_t recv_start_lsn = 0);
const lsn_t init_lsn = init ? init->lsn : 0;
- for (recv_t* recv = UT_LIST_GET_FIRST(recv_addr->rec_list);
- recv; recv = UT_LIST_GET_NEXT(rec_list, recv)) {
+ for (const recv_t* recv = p->second.log; recv; recv = recv->next) {
ut_ad(recv->start_lsn);
+ ut_ad(recv->end_lsn);
+ ut_ad(recv_start_lsn < recv->start_lsn);
+ ut_d(recv_start_lsn = recv->start_lsn);
+ ut_ad(end_lsn <= recv->end_lsn);
end_lsn = recv->end_lsn;
ut_ad(end_lsn <= log_sys.log.scanned_lsn);
@@ -2003,10 +1871,10 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr,
}
if (UNIV_UNLIKELY(srv_print_verbose_log == 2)) {
- fprintf(stderr, "apply " LSN_PF ":"
- " %d len " ULINTPF " page %u:%u\n",
- recv->start_lsn, recv->type, recv->len,
- recv_addr->space, recv_addr->page_no);
+ ib::info() << "apply " << recv->start_lsn
+ << ":" << recv->type
+ << " len " << recv->len
+ << " page " << block->page.id;
}
DBUG_LOG("ib_log", "apply " << recv->start_lsn << ": "
@@ -2021,7 +1889,7 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr,
a separate buffer */
buf = static_cast<byte*>
(ut_malloc_nokey(recv->len));
- recv_data_copy_to_buf(buf, recv);
+ recv_data_copy_to_buf(buf, *recv);
} else {
buf = reinterpret_cast<byte*>(recv->data)
+ sizeof *recv->data;
@@ -2082,42 +1950,25 @@ static void recv_recover_page(buf_block_t* block, mtr_t& mtr,
recv_max_page_lsn = page_lsn;
}
- ut_ad(recv_addr->state == RECV_BEING_PROCESSED);
- recv_addr->state = RECV_PROCESSED;
+ ut_ad(p->second.state == recv_sys_t::recs_t::RECV_BEING_PROCESSED);
+ ut_ad(!recv_sys.pages.empty());
+ recv_sys.pages.erase(p);
- ut_a(recv_sys.n_addrs > 0);
- if (ulint n = --recv_sys.n_addrs) {
- if (recv_sys.report(time)) {
- ib::info() << "To recover: " << n << " pages from log";
- service_manager_extend_timeout(
- INNODB_EXTEND_TIMEOUT_INTERVAL, "To recover: " ULINTPF " pages from log", n);
- }
+ if (recv_sys.report(time)) {
+ const ulint n = recv_sys.pages.size();
+ ib::info() << "To recover: " << n << " pages from log";
+ service_manager_extend_timeout(
+ INNODB_EXTEND_TIMEOUT_INTERVAL, "To recover: " ULINTPF " pages from log", n);
}
}
-/** Reduces recv_sys.n_addrs for the corrupted page.
+/** Remove records for a corrupted page.
This function should be called when srv_force_recovery > 0.
@param[in] bpage buffer pool page */
void recv_recover_corrupt_page(buf_page_t* bpage)
{
- ut_ad(srv_force_recovery);
mutex_enter(&recv_sys.mutex);
-
- if (!recv_sys.apply_log_recs) {
- mutex_exit(&recv_sys.mutex);
- return;
- }
-
- recv_addr_t* recv_addr = recv_get_fil_addr_struct(
- bpage->id.space(), bpage->id.page_no());
-
- ut_ad(recv_addr->state != RECV_WILL_NOT_READ);
-
- if (recv_addr->state != RECV_BEING_PROCESSED
- && recv_addr->state != RECV_PROCESSED) {
- recv_sys.n_addrs--;
- }
-
+ recv_sys.pages.erase(bpage->id);
mutex_exit(&recv_sys.mutex);
}
@@ -2144,15 +1995,12 @@ void recv_recover_page(buf_page_t* bpage)
ut_a(success);
mutex_enter(&recv_sys.mutex);
- if (!recv_sys.apply_log_recs) {
- } else if (recv_addr_t* recv_addr = recv_get_fil_addr_struct(
- bpage->id.space(), bpage->id.page_no())) {
- switch (recv_addr->state) {
- case RECV_BEING_PROCESSED:
- case RECV_PROCESSED:
- break;
- default:
- recv_recover_page(block, mtr, recv_addr);
+ if (recv_sys.apply_log_recs) {
+ recv_sys_t::map::iterator p = recv_sys.pages.find(bpage->id);
+ if (p != recv_sys.pages.end()
+ && p->second.state
+ != recv_sys_t::recs_t::RECV_BEING_PROCESSED) {
+ recv_recover_page(block, mtr, p);
goto func_exit;
}
}
@@ -2166,32 +2014,35 @@ func_exit:
/** Reads in pages which have hashed log records, from an area around a given
page number.
@param[in] page_id page id */
-static void recv_read_in_area(const page_id_t page_id)
+static void recv_read_in_area(page_id_t page_id)
{
ulint page_nos[RECV_READ_AHEAD_AREA];
- ulint page_no = page_id.page_no()
- - (page_id.page_no() % RECV_READ_AHEAD_AREA);
+ compile_time_assert(ut_is_2pow(RECV_READ_AHEAD_AREA));
+ page_id.set_page_no(ut_2pow_round(page_id.page_no(),
+ RECV_READ_AHEAD_AREA));
+ const ulint up_limit = page_id.page_no() + RECV_READ_AHEAD_AREA;
ulint* p = page_nos;
- for (const ulint up_limit = page_no + RECV_READ_AHEAD_AREA;
- page_no < up_limit; page_no++) {
- recv_addr_t* recv_addr = recv_get_fil_addr_struct(
- page_id.space(), page_no);
- if (recv_addr
- && recv_addr->state == RECV_NOT_PROCESSED
- && !buf_page_peek(page_id_t(page_id.space(), page_no))) {
- recv_addr->state = RECV_BEING_READ;
- *p++ = page_no;
+ for (recv_sys_t::map::iterator i= recv_sys.pages.lower_bound(page_id);
+ i != recv_sys.pages.end()
+ && i->first.space() == page_id.space()
+ && i->first.page_no() < up_limit; i++) {
+ if (i->second.state == recv_sys_t::recs_t::RECV_NOT_PROCESSED
+ && !buf_page_peek(i->first)) {
+ i->second.state = recv_sys_t::recs_t::RECV_BEING_READ;
+ *p++ = i->first.page_no();
}
}
- mutex_exit(&recv_sys.mutex);
- buf_read_recv_pages(FALSE, page_id.space(), page_nos,
- ulint(p - page_nos));
- mutex_enter(&recv_sys.mutex);
+ if (p != page_nos) {
+ mutex_exit(&recv_sys.mutex);
+ buf_read_recv_pages(FALSE, page_id.space(), page_nos,
+ ulint(p - page_nos));
+ mutex_enter(&recv_sys.mutex);
+ }
}
-/** Apply the hash table of stored log records to persistent data pages.
+/** Apply recv_sys.pages to persistent data pages.
@param[in] last_batch whether the change buffer merge will be
performed as part of the operation */
void recv_apply_hashed_log_recs(bool last_batch)
@@ -2222,163 +2073,157 @@ void recv_apply_hashed_log_recs(bool last_batch)
ut_d(recv_no_log_write = recv_no_ibuf_operations);
- if (ulint n = recv_sys.n_addrs) {
- if (!log_sys.log.subformat && !srv_force_recovery
- && srv_undo_tablespaces_open) {
- ib::error() << "Recovery of separately logged"
- " TRUNCATE operations is no longer supported."
- " Set innodb_force_recovery=1"
- " if no *trunc.log files exist";
- recv_sys.found_corrupt_log = true;
- mutex_exit(&recv_sys.mutex);
- return;
- }
+ mtr_t mtr;
+ if (recv_sys.pages.empty()) {
+ goto done;
+ }
+
+ if (!log_sys.log.subformat && !srv_force_recovery
+ && srv_undo_tablespaces_open) {
+ ib::error() << "Recovery of separately logged"
+ " TRUNCATE operations is no longer supported."
+ " Set innodb_force_recovery=1"
+ " if no *trunc.log files exist";
+ recv_sys.found_corrupt_log = true;
+ mutex_exit(&recv_sys.mutex);
+ return;
+ } else {
const char* msg = last_batch
? "Starting final batch to recover "
: "Starting a batch to recover ";
+ const ulint n = recv_sys.pages.size();
ib::info() << msg << n << " pages from redo log.";
sd_notifyf(0, "STATUS=%s" ULINTPF " pages from redo log",
msg, n);
}
+
recv_sys.apply_log_recs = true;
recv_sys.apply_batch_on = true;
- for (ulint id = srv_undo_tablespaces_open; id--; ) {
- recv_sys_t::trunc& t = recv_sys.truncated_undo_spaces[id];
+ for (ulint id = srv_undo_tablespaces_open; id--;) {
+ const recv_sys_t::trunc& t= recv_sys.truncated_undo_spaces[id];
if (t.lsn) {
- recv_addr_trim(id + srv_undo_space_id_start, t.pages,
- t.lsn);
+ recv_sys.trim(page_id_t(id + srv_undo_space_id_start,
+ t.pages), t.lsn);
}
}
- mtr_t mtr;
+ for (recv_sys_t::map::iterator p = recv_sys.pages.begin();
+ p != recv_sys.pages.end();) {
+ const page_id_t page_id = p->first;
+ recv_sys_t::recs_t& recs = p->second;
+ ut_ad(recs.log);
- for (ulint i = 0; i < hash_get_n_cells(recv_sys.addr_hash); i++) {
- for (recv_addr_t* recv_addr = static_cast<recv_addr_t*>(
- HASH_GET_FIRST(recv_sys.addr_hash, i));
- recv_addr;
- recv_addr = static_cast<recv_addr_t*>(
- HASH_GET_NEXT(addr_hash, recv_addr))) {
- if (!UT_LIST_GET_LEN(recv_addr->rec_list)) {
+ switch (recs.state) {
+ case recv_sys_t::recs_t::RECV_BEING_READ:
+ case recv_sys_t::recs_t::RECV_BEING_PROCESSED:
+ p++;
+ continue;
+ case recv_sys_t::recs_t::RECV_NOT_PROCESSED:
+ apply:
+ mtr.start();
+ mtr.set_log_mode(MTR_LOG_NONE);
+ if (buf_block_t* block = buf_page_get_gen(
+ page_id, 0, RW_X_LATCH, NULL,
+ BUF_GET_IF_IN_POOL,
+ __FILE__, __LINE__, &mtr, NULL)) {
+ buf_block_dbg_add_level(
+ block, SYNC_NO_ORDER_CHECK);
+ recv_recover_page(block, mtr, p);
+ ut_ad(mtr.has_committed());
+ } else {
+ mtr.commit();
+ recv_read_in_area(page_id);
+ }
+ break;
+ case recv_sys_t::recs_t::RECV_WILL_NOT_READ:
+ mlog_init_t::init& i = mlog_init.last(page_id);
+ lsn_t end_lsn = 0;
+ for (const recv_t* r = recs.log; r; r = r->next) {
+ ut_ad(r->end_lsn);
+ ut_ad(r->end_lsn >= end_lsn);
+ end_lsn = r->end_lsn;
+ }
+ if (end_lsn < i.lsn) {
+ DBUG_LOG("ib_log", "skip log for page "
+ << page_id
+ << " LSN " << end_lsn
+ << " < " << i.lsn);
ignore:
- ut_a(recv_sys.n_addrs);
- recv_sys.n_addrs--;
+ recv_sys_t::map::iterator r = p++;
+ recv_sys.pages.erase(r);
continue;
}
- switch (recv_addr->state) {
- case RECV_BEING_READ:
- case RECV_BEING_PROCESSED:
- case RECV_PROCESSED:
- continue;
- case RECV_DISCARDED:
+ fil_space_t* space = fil_space_acquire_for_io(
+ page_id.space());
+ if (!space) {
goto ignore;
- case RECV_NOT_PROCESSED:
- case RECV_WILL_NOT_READ:
- break;
}
- const page_id_t page_id(recv_addr->space,
- recv_addr->page_no);
-
- if (recv_addr->state == RECV_NOT_PROCESSED) {
-apply:
- mtr.start();
- mtr.set_log_mode(MTR_LOG_NONE);
- if (buf_block_t* block = buf_page_get_gen(
- page_id, 0, RW_X_LATCH, NULL,
- BUF_GET_IF_IN_POOL,
- __FILE__, __LINE__, &mtr, NULL)) {
- buf_block_dbg_add_level(
- block, SYNC_NO_ORDER_CHECK);
- recv_recover_page(block, mtr,
- recv_addr);
- ut_ad(mtr.has_committed());
- } else {
- mtr.commit();
- recv_read_in_area(page_id);
- }
- } else {
- mlog_init_t::init& i = mlog_init.last(page_id);
- const lsn_t end_lsn = UT_LIST_GET_LAST(
- recv_addr->rec_list)->end_lsn;
-
- if (end_lsn < i.lsn) {
- DBUG_LOG("ib_log", "skip log for page "
- << page_id
- << " LSN " << end_lsn
- << " < " << i.lsn);
-skip:
- recv_addr->state = RECV_PROCESSED;
- goto ignore;
- }
-
- fil_space_t* space = fil_space_acquire_for_io(
- recv_addr->space);
- if (!space) {
- goto skip;
- }
-
- if (space->enable_lsn) {
+ if (space->enable_lsn) {
do_read:
- space->release_for_io();
- recv_addr->state = RECV_NOT_PROCESSED;
- goto apply;
- }
-
- /* Determine if a tablespace could be
- for an internal table for FULLTEXT INDEX.
- For those tables, no MLOG_INDEX_LOAD record
- used to be written when redo logging was
- disabled. Hence, we cannot optimize
- away page reads when crash-upgrading
- from MariaDB versions before 10.4,
- because all the redo log records for
- initializing and modifying the page in
- the past could be older than the page
- in the data file.
-
- The check is too broad, causing all
- tables whose names start with FTS_ to
- skip the optimization. */
- if ((log_sys.log.format
- & ~LOG_HEADER_FORMAT_ENCRYPTED)
- != LOG_HEADER_FORMAT_10_4
- && strstr(space->name, "/FTS_")) {
- goto do_read;
- }
+ space->release_for_io();
+ recs.state = recv_sys_t::recs_t::
+ RECV_NOT_PROCESSED;
+ goto apply;
+ }
- mtr.start();
- mtr.set_log_mode(MTR_LOG_NONE);
- buf_block_t* block = buf_page_create(
- page_id, space->zip_size(), &mtr);
- if (recv_addr->state == RECV_PROCESSED) {
- /* The page happened to exist
- in the buffer pool, or it was
- just being read in. Before
- buf_page_get_with_no_latch()
- returned, all changes must have
- been applied to the page already. */
- mtr.commit();
- } else {
- i.created = true;
- buf_block_dbg_add_level(
- block, SYNC_NO_ORDER_CHECK);
- mtr.x_latch_at_savepoint(0, block);
- recv_recover_page(block, mtr,
- recv_addr, &i);
- ut_ad(mtr.has_committed());
- }
+ /* Determine if a tablespace could be
+ for an internal table for FULLTEXT INDEX.
+ For those tables, no MLOG_INDEX_LOAD record
+ used to be written when redo logging was
+ disabled. Hence, we cannot optimize
+ away page reads when crash-upgrading
+ from MariaDB versions before 10.4,
+ because all the redo log records for
+ initializing and modifying the page in
+ the past could be older than the page
+ in the data file.
+
+ The check is too broad, causing all
+ tables whose names start with FTS_ to
+ skip the optimization. */
+ if ((log_sys.log.format
+ & ~LOG_HEADER_FORMAT_ENCRYPTED)
+ != LOG_HEADER_FORMAT_10_4
+ && strstr(space->name, "/FTS_")) {
+ goto do_read;
+ }
- space->release_for_io();
+ mtr.start();
+ mtr.set_log_mode(MTR_LOG_NONE);
+ buf_block_t* block = buf_page_create(
+ page_id, space->zip_size(), &mtr);
+ p = recv_sys.pages.find(page_id);
+ if (p == recv_sys.pages.end()) {
+ /* The page happened to exist
+ in the buffer pool, or it was
+ just being read in. Before
+ buf_page_get_with_no_latch()
+ returned, all changes must have
+ been applied to the page already. */
+ mtr.commit();
+ } else {
+ ut_ad(&recs == &p->second);
+ i.created = true;
+ buf_block_dbg_add_level(
+ block, SYNC_NO_ORDER_CHECK);
+ mtr.x_latch_at_savepoint(0, block);
+ recv_recover_page(block, mtr, p, &i);
+ ut_ad(mtr.has_committed());
}
+
+ space->release_for_io();
}
+
+ p = recv_sys.pages.lower_bound(page_id);
}
/* Wait until all the pages have been processed */
- while (recv_sys.n_addrs != 0) {
+ while (!recv_sys.pages.empty()) {
const bool abort = recv_sys.found_corrupt_log
|| recv_sys.found_corrupt_fs;
@@ -2398,6 +2243,7 @@ do_read:
mutex_enter(&(recv_sys.mutex));
}
+done:
if (!last_batch) {
/* Flush all the file pages to disk and invalidate them in
the buffer pool */
@@ -2784,7 +2630,7 @@ loop:
/* fall through */
case STORE_YES:
recv_sys.add(
- type, space, page_no, body,
+ type, page_id_t(space, page_no), body,
ptr + len, old_lsn,
recv_sys.recovered_lsn);
}
@@ -2968,7 +2814,8 @@ corrupted_log:
/* fall through */
case STORE_YES:
recv_sys.add(
- type, space, page_no,
+ type,
+ page_id_t(space, page_no),
body, ptr + len,
old_lsn,
new_recovered_lsn);
@@ -3278,7 +3125,6 @@ recv_group_scan_log_recs(
mutex_enter(&recv_sys.mutex);
recv_sys.len = 0;
recv_sys.recovered_offset = 0;
- recv_sys.n_addrs = 0;
recv_sys.empty();
srv_start_lsn = *contiguous_lsn;
recv_sys.parse_start_lsn = *contiguous_lsn;
@@ -3386,35 +3232,32 @@ recv_validate_tablespace(bool rescan, bool& missing_tablespace)
{
dberr_t err = DB_SUCCESS;
- for (ulint h = 0; h < hash_get_n_cells(recv_sys.addr_hash); h++) {
- for (recv_addr_t* recv_addr = static_cast<recv_addr_t*>(
- HASH_GET_FIRST(recv_sys.addr_hash, h));
- recv_addr != 0;
- recv_addr = static_cast<recv_addr_t*>(
- HASH_GET_NEXT(addr_hash, recv_addr))) {
-
- const ulint space = recv_addr->space;
-
- if (is_predefined_tablespace(space)) {
- continue;
- }
+ for (recv_sys_t::map::iterator p = recv_sys.pages.begin();
+ p != recv_sys.pages.end();) {
+ ut_ad(p->second.log);
+ const ulint space = p->first.space();
+ if (is_predefined_tablespace(space)) {
+next:
+ p++;
+ continue;
+ }
- recv_spaces_t::iterator i = recv_spaces.find(space);
- ut_ad(i != recv_spaces.end());
+ recv_spaces_t::iterator i = recv_spaces.find(space);
+ ut_ad(i != recv_spaces.end());
- switch (i->second.status) {
- case file_name_t::MISSING:
- err = recv_init_missing_space(err, i);
- i->second.status = file_name_t::DELETED;
- /* fall through */
- case file_name_t::DELETED:
- recv_addr->state = RECV_DISCARDED;
- /* fall through */
- case file_name_t::NORMAL:
- continue;
- }
- ut_ad(0);
+ switch (i->second.status) {
+ case file_name_t::NORMAL:
+ goto next;
+ case file_name_t::MISSING:
+ err = recv_init_missing_space(err, i);
+ i->second.status = file_name_t::DELETED;
+ /* fall through */
+ case file_name_t::DELETED:
+ recv_sys_t::map::iterator r = p++;
+ recv_sys.pages.erase(r);
+ continue;
}
+ ut_ad(0);
}
if (err != DB_SUCCESS) {
@@ -3567,7 +3410,7 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn)
const lsn_t end_lsn = mach_read_from_8(
buf + LOG_CHECKPOINT_END_LSN);
- ut_ad(recv_sys.n_addrs == 0);
+ ut_ad(recv_sys.pages.empty());
contiguous_lsn = checkpoint_lsn;
switch (log_sys.log.format) {
case 0:
@@ -3590,7 +3433,7 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn)
/* Look for MLOG_CHECKPOINT. */
recv_group_scan_log_recs(checkpoint_lsn, &contiguous_lsn, false);
/* The first scan should not have stored or applied any records. */
- ut_ad(recv_sys.n_addrs == 0);
+ ut_ad(recv_sys.pages.empty());
ut_ad(!recv_sys.found_corrupt_fs);
if (srv_read_only_mode && recv_needed_recovery) {
@@ -3740,7 +3583,7 @@ recv_recovery_from_checkpoint_start(lsn_t flush_lsn)
}
}
} else {
- ut_ad(!rescan || recv_sys.n_addrs == 0);
+ ut_ad(!rescan || recv_sys.pages.empty());
}
if (log_sys.log.scanned_lsn < checkpoint_lsn