diff options
author | unknown <heikki@hundin.mysql.fi> | 2003-10-07 17:28:59 +0300 |
---|---|---|
committer | unknown <heikki@hundin.mysql.fi> | 2003-10-07 17:28:59 +0300 |
commit | d1485aad0eb79559902b1af26502b7cc53f8e95a (patch) | |
tree | d53a5cc4e9736e149276ba08d53a7dd09a14b4fd /innobase | |
parent | d1ab51eb947623f45a8314cb9c0b6ddc15f0d148 (diff) | |
download | mariadb-git-d1485aad0eb79559902b1af26502b7cc53f8e95a.tar.gz |
Many files:
Multiple tablespaces for InnoDB
sql_table.cc:
Tell explicitly that InnoDB should retrieve all columns in CHECKSUM TABLE
sql_update.cc, sql_select.cc, my_base.h:
More descriptive flag name HA_EXTRA_RETRIEVE_ALL_COLS
include/my_base.h:
More descriptive flag name HA_EXTRA_RETRIEVE_ALL_COLS
sql/sql_select.cc:
More descriptive flag name HA_EXTRA_RETRIEVE_ALL_COLS
sql/sql_update.cc:
More descriptive flag name HA_EXTRA_RETRIEVE_ALL_COLS
sql/sql_table.cc:
Tell explicitly that InnoDB should retrieve all columns in CHECKSUM TABLE
sql/sql_db.cc:
Multiple tablespaces for InnoDB
sql/ha_innodb.cc:
Multiple tablespaces for InnoDB
sql/mysqld.cc:
Multiple tablespaces for InnoDB
sql/set_var.cc:
Multiple tablespaces for InnoDB
sql/sql_cache.cc:
Multiple tablespaces for InnoDB
sql/ha_innodb.h:
Multiple tablespaces for InnoDB
innobase/include/btr0btr.ic:
Multiple tablespaces for InnoDB
innobase/include/btr0pcur.ic:
Multiple tablespaces for InnoDB
innobase/include/data0type.ic:
Multiple tablespaces for InnoDB
innobase/include/dyn0dyn.ic:
Multiple tablespaces for InnoDB
innobase/include/fut0lst.ic:
Multiple tablespaces for InnoDB
innobase/include/log0log.ic:
Multiple tablespaces for InnoDB
innobase/include/mach0data.ic:
Multiple tablespaces for InnoDB
innobase/include/mtr0log.ic:
Multiple tablespaces for InnoDB
innobase/include/rem0rec.ic:
Multiple tablespaces for InnoDB
innobase/include/ut0byte.ic:
Multiple tablespaces for InnoDB
innobase/include/ut0ut.ic:
Multiple tablespaces for InnoDB
innobase/include/buf0buf.h:
Multiple tablespaces for InnoDB
innobase/include/buf0lru.h:
Multiple tablespaces for InnoDB
innobase/include/buf0rea.h:
Multiple tablespaces for InnoDB
innobase/include/data0type.h:
Multiple tablespaces for InnoDB
innobase/include/db0err.h:
Multiple tablespaces for InnoDB
innobase/include/dict0boot.h:
Multiple tablespaces for InnoDB
innobase/include/dict0dict.h:
Multiple tablespaces for InnoDB
innobase/include/dict0load.h:
Multiple tablespaces for InnoDB
innobase/include/dict0mem.h:
Multiple tablespaces for InnoDB
innobase/include/fil0fil.h:
Multiple tablespaces for InnoDB
innobase/include/fsp0fsp.h:
Multiple tablespaces for InnoDB
innobase/include/ibuf0ibuf.h:
Multiple tablespaces for InnoDB
innobase/include/lock0lock.h:
Multiple tablespaces for InnoDB
innobase/include/log0log.h:
Multiple tablespaces for InnoDB
innobase/include/log0recv.h:
Multiple tablespaces for InnoDB
innobase/include/os0file.h:
Multiple tablespaces for InnoDB
innobase/include/page0page.h:
Multiple tablespaces for InnoDB
innobase/include/que0types.h:
Multiple tablespaces for InnoDB
innobase/include/rem0rec.h:
Multiple tablespaces for InnoDB
innobase/include/srv0srv.h:
Multiple tablespaces for InnoDB
innobase/include/srv0start.h:
Multiple tablespaces for InnoDB
innobase/include/sync0sync.h:
Multiple tablespaces for InnoDB
innobase/include/trx0sys.h:
Multiple tablespaces for InnoDB
innobase/include/ut0byte.h:
Multiple tablespaces for InnoDB
innobase/include/univ.i:
Multiple tablespaces for InnoDB
innobase/btr/btr0cur.c:
Multiple tablespaces for InnoDB
innobase/btr/btr0sea.c:
Multiple tablespaces for InnoDB
innobase/buf/buf0buf.c:
Multiple tablespaces for InnoDB
innobase/buf/buf0flu.c:
Multiple tablespaces for InnoDB
innobase/buf/buf0lru.c:
Multiple tablespaces for InnoDB
innobase/buf/buf0rea.c:
Multiple tablespaces for InnoDB
innobase/data/data0type.c:
Multiple tablespaces for InnoDB
innobase/dict/dict0boot.c:
Multiple tablespaces for InnoDB
innobase/dict/dict0crea.c:
Multiple tablespaces for InnoDB
innobase/dict/dict0dict.c:
Multiple tablespaces for InnoDB
innobase/dict/dict0load.c:
Multiple tablespaces for InnoDB
innobase/dict/dict0mem.c:
Multiple tablespaces for InnoDB
innobase/fil/fil0fil.c:
Multiple tablespaces for InnoDB
innobase/fsp/fsp0fsp.c:
Multiple tablespaces for InnoDB
innobase/ha/ha0ha.c:
Multiple tablespaces for InnoDB
innobase/ibuf/ibuf0ibuf.c:
Multiple tablespaces for InnoDB
innobase/log/log0log.c:
Multiple tablespaces for InnoDB
innobase/log/log0recv.c:
Multiple tablespaces for InnoDB
innobase/mach/mach0data.c:
Multiple tablespaces for InnoDB
innobase/mem/mem0dbg.c:
Multiple tablespaces for InnoDB
innobase/mem/mem0pool.c:
Multiple tablespaces for InnoDB
innobase/mtr/mtr0log.c:
Multiple tablespaces for InnoDB
innobase/os/os0file.c:
Multiple tablespaces for InnoDB
innobase/os/os0proc.c:
Multiple tablespaces for InnoDB
innobase/page/page0cur.c:
Multiple tablespaces for InnoDB
innobase/que/que0que.c:
Multiple tablespaces for InnoDB
innobase/row/row0ins.c:
Multiple tablespaces for InnoDB
innobase/row/row0mysql.c:
Multiple tablespaces for InnoDB
innobase/row/row0sel.c:
Multiple tablespaces for InnoDB
innobase/row/row0upd.c:
Multiple tablespaces for InnoDB
innobase/srv/srv0srv.c:
Multiple tablespaces for InnoDB
innobase/srv/srv0start.c:
Multiple tablespaces for InnoDB
innobase/sync/sync0rw.c:
Multiple tablespaces for InnoDB
innobase/sync/sync0sync.c:
Multiple tablespaces for InnoDB
innobase/trx/trx0sys.c:
Multiple tablespaces for InnoDB
innobase/trx/trx0trx.c:
Multiple tablespaces for InnoDB
innobase/trx/trx0undo.c:
Multiple tablespaces for InnoDB
innobase/ut/ut0byte.c:
Multiple tablespaces for InnoDB
innobase/ut/ut0ut.c:
Multiple tablespaces for InnoDB
Diffstat (limited to 'innobase')
75 files changed, 5984 insertions, 1504 deletions
diff --git a/innobase/btr/btr0cur.c b/innobase/btr/btr0cur.c index f6b4a2964f5..5743ba54544 100644 --- a/innobase/btr/btr0cur.c +++ b/innobase/btr/btr0cur.c @@ -957,7 +957,7 @@ calculate_sizes_again: /* Now, try the insert */ *rec = page_cur_insert_rec_low(page_cursor, entry, data_size, - NULL, mtr); + NULL, mtr); if (!(*rec)) { /* If the record did not fit, reorganize */ btr_page_reorganize(page, mtr); @@ -1048,6 +1048,7 @@ btr_cur_pessimistic_insert( ibool dummy_inh; ibool success; ulint n_extents = 0; + ulint n_reserved; ut_ad(dtuple_check_typed(entry)); @@ -1067,7 +1068,7 @@ btr_cur_pessimistic_insert( cursor->flag = BTR_CUR_BINARY; err = btr_cur_optimistic_insert(flags, cursor, entry, rec, big_rec, - thr, mtr); + thr, mtr); if (err != DB_FAIL) { return(err); @@ -1090,7 +1091,7 @@ btr_cur_pessimistic_insert( n_extents = cursor->tree_height / 16 + 3; - success = fsp_reserve_free_extents(index->space, + success = fsp_reserve_free_extents(&n_reserved, index->space, n_extents, FSP_NORMAL, mtr); if (!success) { err = DB_OUT_OF_FILE_SPACE; @@ -1112,7 +1113,7 @@ btr_cur_pessimistic_insert( if (n_extents > 0) { fil_space_release_free_extents(index->space, - n_extents); + n_reserved); } return(DB_TOO_BIG_RECORD); } @@ -1140,7 +1141,7 @@ btr_cur_pessimistic_insert( err = DB_SUCCESS; if (n_extents > 0) { - fil_space_release_free_extents(index->space, n_extents); + fil_space_release_free_extents(index->space, n_reserved); } *big_rec = big_rec_vec; @@ -1721,6 +1722,7 @@ btr_cur_pessimistic_update( ibool was_first; ibool success; ulint n_extents = 0; + ulint n_reserved; ulint* ext_vect; ulint n_ext_vect; ulint reserve_flag; @@ -1767,7 +1769,8 @@ btr_cur_pessimistic_update( reserve_flag = FSP_NORMAL; } - success = fsp_reserve_free_extents(cursor->index->space, + success = fsp_reserve_free_extents(&n_reserved, + cursor->index->space, n_extents, reserve_flag, mtr); if (!success) { err = DB_OUT_OF_FILE_SPACE; @@ -1916,7 +1919,7 @@ return_after_reservations: if (n_extents > 0) { 
fil_space_release_free_extents(cursor->index->space, - n_extents); + n_reserved); } *big_rec = big_rec_vec; @@ -2387,6 +2390,7 @@ btr_cur_pessimistic_delete( rec_t* rec; dtuple_t* node_ptr; ulint n_extents = 0; + ulint n_reserved; ibool success; ibool ret = FALSE; mem_heap_t* heap; @@ -2405,7 +2409,8 @@ btr_cur_pessimistic_delete( n_extents = cursor->tree_height / 32 + 1; - success = fsp_reserve_free_extents(cursor->index->space, + success = fsp_reserve_free_extents(&n_reserved, + cursor->index->space, n_extents, FSP_CLEANING, mtr); if (!success) { *err = DB_OUT_OF_FILE_SPACE; @@ -2484,7 +2489,8 @@ return_after_reservations: } if (n_extents > 0) { - fil_space_release_free_extents(cursor->index->space, n_extents); + fil_space_release_free_extents(cursor->index->space, + n_reserved); } return(ret); @@ -3156,7 +3162,7 @@ btr_store_big_rec_extern_fields( ut_ad(mtr_memo_contains(local_mtr, dict_tree_get_lock(index->tree), MTR_MEMO_X_LOCK)); ut_ad(mtr_memo_contains(local_mtr, buf_block_align(rec), - MTR_MEMO_PAGE_X_FIX)); + MTR_MEMO_PAGE_X_FIX)); ut_a(index->type & DICT_CLUSTERED); space_id = buf_frame_get_space_id(rec); @@ -3322,7 +3328,7 @@ btr_free_externally_stored_field( ut_ad(mtr_memo_contains(local_mtr, dict_tree_get_lock(index->tree), MTR_MEMO_X_LOCK)); ut_ad(mtr_memo_contains(local_mtr, buf_block_align(data), - MTR_MEMO_PAGE_X_FIX)); + MTR_MEMO_PAGE_X_FIX)); ut_a(local_len >= BTR_EXTERN_FIELD_REF_SIZE); local_len -= BTR_EXTERN_FIELD_REF_SIZE; diff --git a/innobase/btr/btr0sea.c b/innobase/btr/btr0sea.c index 5c5ed934a9b..8dbffd82a20 100644 --- a/innobase/btr/btr0sea.c +++ b/innobase/btr/btr0sea.c @@ -1022,12 +1022,14 @@ btr_search_drop_page_hash_when_freed( mtr_start(&mtr); - /* We assume that if the caller has a latch on the page, - then the caller has already dropped the hash index for the page, - and we never get here. Therefore we can acquire the s-latch to - the page without fearing a deadlock. 
*/ + /* We assume that if the caller has a latch on the page, then the + caller has already dropped the hash index for the page, and we never + get here. Therefore we can acquire the s-latch to the page without + having to fear a deadlock. */ - page = buf_page_get(space, page_no, RW_S_LATCH, &mtr); + page = buf_page_get_gen(space, page_no, RW_S_LATCH, NULL, + BUF_GET_IF_IN_POOL, IB__FILE__, __LINE__, + &mtr); buf_page_dbg_add_level(page, SYNC_TREE_NODE_FROM_HASH); diff --git a/innobase/buf/buf0buf.c b/innobase/buf/buf0buf.c index 9d920c7c0d8..1e1c6b25c1d 100644 --- a/innobase/buf/buf0buf.c +++ b/innobase/buf/buf0buf.c @@ -243,9 +243,10 @@ buf_calc_page_new_checksum( { ulint checksum; - /* Since the fields FIL_PAGE_FILE_FLUSH_LSN and ..._ARCH_LOG_NO - are written outside the buffer pool to the first pages of data - files, we have to skip them in the page checksum calculation. + /* Since the field FIL_PAGE_FILE_FLUSH_LSN, and in versions <= 4.1.x + ..._ARCH_LOG_NO, are written outside the buffer pool to the first + pages of data files, we have to skip them in the page checksum + calculation. We must also skip the field FIL_PAGE_SPACE_OR_CHKSUM where the checksum is stored, and also the last 8 bytes of page because there we store the old formula checksum. 
*/ @@ -255,7 +256,7 @@ buf_calc_page_new_checksum( + ut_fold_binary(page + FIL_PAGE_DATA, UNIV_PAGE_SIZE - FIL_PAGE_DATA - FIL_PAGE_END_LSN_OLD_CHKSUM); - checksum = checksum & 0xFFFFFFFF; + checksum = checksum & 0xFFFFFFFFUL; return(checksum); } @@ -278,7 +279,7 @@ buf_calc_page_old_checksum( checksum = ut_fold_binary(page, FIL_PAGE_FILE_FLUSH_LSN); - checksum = checksum & 0xFFFFFFFF; + checksum = checksum & 0xFFFFFFFFUL; return(checksum); } @@ -378,7 +379,7 @@ buf_page_print( ut_print_timestamp(stderr); fprintf(stderr, - " InnoDB: Page dump in ascii and hex (%lu bytes):\n%s", +" InnoDB: Page dump in ascii and hex (%lu bytes):\n%s", (ulint)UNIV_PAGE_SIZE, buf); fprintf(stderr, "InnoDB: End of page dump\n"); @@ -396,11 +397,16 @@ buf_page_print( mach_read_from_4(read_buf + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM)); fprintf(stderr, - "InnoDB: Page lsn %lu %lu, low 4 bytes of lsn at page end %lu\n", +"InnoDB: Page lsn %lu %lu, low 4 bytes of lsn at page end %lu\n" +"InnoDB: Page number (if stored to page already) %lu,\n" +"InnoDB: space id (if created with >= MySQL-4.1.1 and stored already) %lu\n", mach_read_from_4(read_buf + FIL_PAGE_LSN), mach_read_from_4(read_buf + FIL_PAGE_LSN + 4), mach_read_from_4(read_buf + UNIV_PAGE_SIZE - - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)); + - FIL_PAGE_END_LSN_OLD_CHKSUM + 4), + mach_read_from_4(read_buf + FIL_PAGE_OFFSET), + mach_read_from_4(read_buf + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID)); + if (mach_read_from_2(read_buf + TRX_UNDO_PAGE_HDR + TRX_UNDO_PAGE_TYPE) == TRX_UNDO_INSERT) { fprintf(stderr, @@ -414,10 +420,7 @@ buf_page_print( if (fil_page_get_type(read_buf) == FIL_PAGE_INDEX) { fprintf(stderr, - "InnoDB: Page may be an index page "); - - fprintf(stderr, - "where index id is %lu %lu\n", +"InnoDB: Page may be an index page where index id is %lu %lu\n", ut_dulint_get_high(btr_page_get_index_id(read_buf)), ut_dulint_get_low(btr_page_get_index_id(read_buf))); @@ -435,7 +438,6 @@ buf_page_print( index->name); } } - } else if 
(fil_page_get_type(read_buf) == FIL_PAGE_INODE) { fprintf(stderr, "InnoDB: Page may be an 'inode' page\n"); } else if (fil_page_get_type(read_buf) == FIL_PAGE_IBUF_FREE_LIST) { @@ -581,8 +583,8 @@ buf_pool_init( the window */ os_awe_map_physical_mem_to_window(buf_pool->frame_zero, - n_frames * - (UNIV_PAGE_SIZE / OS_AWE_X86_PAGE_SIZE), + n_frames * + (UNIV_PAGE_SIZE / OS_AWE_X86_PAGE_SIZE), buf_pool->awe_info); /*----------------------------------------*/ } @@ -1554,25 +1556,35 @@ buf_page_init( /************************************************************************ Function which inits a page for read to the buffer buf_pool. If the page is -already in buf_pool, does nothing. Sets the io_fix flag to BUF_IO_READ and -sets a non-recursive exclusive lock on the buffer frame. The io-handler must -take care that the flag is cleared and the lock released later. This is one -of the functions which perform the state transition NOT_USED => FILE_PAGE to -a block (the other is buf_page_create). */ +(1) already in buf_pool, or +(2) if we specify to read only ibuf pages and the page is not an ibuf page, or +(3) if the space is deleted or being deleted, +then this function does nothing. +Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock +on the buffer frame. The io-handler must take care that the flag is cleared +and the lock released later. This is one of the functions which perform the +state transition NOT_USED => FILE_PAGE to a block (the other is +buf_page_create). */ buf_block_t* buf_page_init_for_read( /*===================*/ - /* out: pointer to the block or NULL */ - ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ... */ - ulint space, /* in: space id */ - ulint offset) /* in: page number */ + /* out: pointer to the block or NULL */ + ulint* err, /* out: DB_SUCCESS or DB_TABLESPACE_DELETED */ + ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ... 
*/ + ulint space, /* in: space id */ + ib_longlong tablespace_version,/* in: prevents reading from a wrong + version of the tablespace in case we have done + DISCARD + IMPORT */ + ulint offset) /* in: page number */ { buf_block_t* block; mtr_t mtr; - + ut_ad(buf_pool); + *err = DB_SUCCESS; + if (mode == BUF_READ_IBUF_PAGES_ONLY) { /* It is a read-ahead within an ibuf routine */ @@ -1596,10 +1608,17 @@ buf_page_init_for_read( ut_ad(block); mutex_enter(&(buf_pool->mutex)); - - if (NULL != buf_page_hash_get(space, offset)) { - /* The page is already in buf_pool, return */ + if (fil_tablespace_deleted_or_being_deleted_in_mem(space, + tablespace_version)) { + *err = DB_TABLESPACE_DELETED; + } + + if (*err == DB_TABLESPACE_DELETED + || NULL != buf_page_hash_get(space, offset)) { + + /* The page belongs to a space which has been deleted or is + being deleted, or the page is already in buf_pool, return */ mutex_exit(&(buf_pool->mutex)); buf_block_free(block); @@ -1715,7 +1734,7 @@ buf_page_create( /* Delete possible entries for the page from the insert buffer: such can exist if the page belonged to an index which was dropped */ - ibuf_merge_or_delete_for_page(NULL, space, offset); + ibuf_merge_or_delete_for_page(NULL, space, offset, TRUE); /* Flush pages from the end of the LRU list if necessary */ buf_flush_free_margin(); @@ -1828,7 +1847,7 @@ buf_page_io_complete( if (!recv_no_ibuf_operations) { ibuf_merge_or_delete_for_page(block->frame, - block->space, block->offset); + block->space, block->offset, TRUE); } } @@ -2294,7 +2313,7 @@ buf_all_freed(void) if (!buf_flush_ready_for_replace(block)) { - /* printf("Page %lu %lu still fixed or dirty\n", + /* printf("Page %lu %lu still fixed or dirty\n", block->space, block->offset); */ ut_error; } diff --git a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c index 5102674a8df..3f8022f36bc 100644 --- a/innobase/buf/buf0flu.c +++ b/innobase/buf/buf0flu.c @@ -361,16 +361,15 @@ buf_flush_init_for_writing( ulint space, /* in: space id 
*/ ulint page_no) /* in: page number */ { - UT_NOT_USED(space); - /* Write the newest modification lsn to the page header and trailer */ mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn); mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, newest_lsn); - /* Write the page number */ + /* Write the page number and the space id */ mach_write_to_4(page + FIL_PAGE_OFFSET, page_no); + mach_write_to_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, space); /* Store the new formula checksum */ diff --git a/innobase/buf/buf0lru.c b/innobase/buf/buf0lru.c index 40f49f1fddc..5d48d460745 100644 --- a/innobase/buf/buf0lru.c +++ b/innobase/buf/buf0lru.c @@ -62,6 +62,87 @@ buf_LRU_block_free_hashed_page( be in a state where it can be freed */ /********************************************************************** +Invalidates all pages belonging to a given tablespace when we are deleting +the data file(s) of that tablespace. */ + +void +buf_LRU_invalidate_tablespace( +/*==========================*/ + ulint id) /* in: space id */ +{ + buf_block_t* block; + ulint page_no; + ibool all_freed; + +scan_again: + mutex_enter(&(buf_pool->mutex)); + + all_freed = TRUE; + + block = UT_LIST_GET_LAST(buf_pool->LRU); + + while (block != NULL) { + if (block->space == id + && (block->buf_fix_count > 0 || block->io_fix != 0)) { + + /* We cannot remove this page during this scan yet; + maybe the system is currently reading it in, or + flushing the modifications to the file */ + + all_freed = FALSE; + + goto next_page; + } + + if (block->space == id) { + if (buf_debug_prints) { + printf( + "Dropping space %lu page %lu\n", + block->space, block->offset); + } + + if (block->is_hashed) { + page_no = block->offset; + + mutex_exit(&(buf_pool->mutex)); + + /* Note that the following call will acquire + an S-latch on the page */ + + btr_search_drop_page_hash_when_freed(id, + page_no); + goto scan_again; + } + + if (0 != ut_dulint_cmp(block->oldest_modification, + ut_dulint_zero)) { + + /* Remove 
from the flush list of modified + blocks */ + block->oldest_modification = ut_dulint_zero; + + UT_LIST_REMOVE(flush_list, + buf_pool->flush_list, block); + } + + /* Remove from the LRU list */ + buf_LRU_block_remove_hashed_page(block); + buf_LRU_block_free_hashed_page(block); + } +next_page: + block = UT_LIST_GET_PREV(LRU, block); + } + + mutex_exit(&(buf_pool->mutex)); + + if (!all_freed) { + os_thread_sleep(20000); + + goto scan_again; + } +} + +/********************************************************************** Gets the minimum LRU_position field for the blocks in an initial segment (determined by BUF_LRU_INITIAL_RATIO) of the LRU list. The limit is not guaranteed to be precise, because the ulint_clock may wrap around. */ diff --git a/innobase/buf/buf0rea.c b/innobase/buf/buf0rea.c index bb6670296b9..8cc379e75d8 100644 --- a/innobase/buf/buf0rea.c +++ b/innobase/buf/buf0rea.c @@ -49,19 +49,30 @@ ulint buf_read_page_low( /*==============*/ /* out: 1 if a read request was queued, 0 if the page - already resided in buf_pool or if the page is in + already resided in buf_pool, or if the page is in the doublewrite buffer blocks in which case it is never - read into the pool */ + read into the pool, or if the tablespace does not + exist or is being dropped */ + ulint* err, /* out: DB_SUCCESS or DB_TABLESPACE_DELETED if we are + trying to read from a non-existent tablespace, or a + tablespace which is just now being dropped */ ibool sync, /* in: TRUE if synchronous aio is desired */ ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ..., ORed to OS_AIO_SIMULATED_WAKE_LATER (see below at read-ahead functions) */ ulint space, /* in: space id */ + ib_longlong tablespace_version, /* in: if the space memory object has + this timestamp different from what we are giving here, + treat the tablespace as dropped; this is a timestamp we + use to stop dangling page reads from a tablespace + which we have DISCARDed + IMPORTed back */ ulint offset) /* in: page number */ { buf_block_t* 
block; ulint wake_later; + *err = DB_SUCCESS; + wake_later = mode & OS_AIO_SIMULATED_WAKE_LATER; mode = mode & ~OS_AIO_SIMULATED_WAKE_LATER; @@ -72,6 +83,10 @@ buf_read_page_low( || (offset >= trx_doublewrite->block2 && offset < trx_doublewrite->block2 + TRX_SYS_DOUBLEWRITE_BLOCK_SIZE))) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Warning: trying to read doublewrite buffer page %lu\n", offset); + return(0); } @@ -97,27 +112,36 @@ buf_read_page_low( sync = TRUE; } - block = buf_page_init_for_read(mode, space, offset); - - if (block != NULL) { - if (buf_debug_prints) { - printf("Posting read request for page %lu, sync %lu\n", - offset, sync); - } + /* The following call will also check if the tablespace does not exist + or is being dropped; if we succeed in initing the page in the buffer + pool for read, then DISCARD cannot proceed until the read has + completed */ - fil_io(OS_FILE_READ | wake_later, - sync, space, offset, 0, UNIV_PAGE_SIZE, - (void*)block->frame, (void*)block); - if (sync) { - /* The i/o is already completed when we arrive from - fil_read */ - buf_page_io_complete(block); - } + block = buf_page_init_for_read(err, mode, space, tablespace_version, + offset); + if (block == NULL) { - return(1); + return(0); } - return(0); + if (buf_debug_prints) { + printf("Posting read request for page %lu, sync %lu\n", + offset, sync); + } + + *err = fil_io(OS_FILE_READ | wake_later, + sync, space, + offset, 0, UNIV_PAGE_SIZE, + (void*)block->frame, (void*)block); + ut_a(*err == DB_SUCCESS); + + if (sync) { + /* The i/o is already completed when we arrive from + fil_read */ + buf_page_io_complete(block); + } + + return(1); } /************************************************************************ @@ -142,12 +166,14 @@ buf_read_ahead_random( ulint offset) /* in: page number of a page which the current thread wants to access */ { + ib_longlong tablespace_version; buf_block_t* block; ulint recent_blocks = 0; ulint count; ulint LRU_recent_limit; ulint 
ibuf_mode; ulint low, high; + ulint err; ulint i; if (srv_startup_is_before_trx_rollback_phase) { @@ -164,11 +190,16 @@ buf_read_ahead_random( return(0); } + /* Remember the tablespace version before we ask the tablespace size + below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we + do not try to read outside the bounds of the tablespace! */ + + tablespace_version = fil_space_get_version(space); + low = (offset / BUF_READ_AHEAD_RANDOM_AREA) * BUF_READ_AHEAD_RANDOM_AREA; high = (offset / BUF_READ_AHEAD_RANDOM_AREA + 1) * BUF_READ_AHEAD_RANDOM_AREA; - if (high > fil_space_get_size(space)) { high = fil_space_get_size(space); @@ -193,7 +224,6 @@ buf_read_ahead_random( that is, reside near the start of the LRU list. */ for (i = low; i < high; i++) { - block = buf_page_hash_get(space, i); if ((block) @@ -227,10 +257,17 @@ buf_read_ahead_random( mode: hence FALSE as the first parameter */ if (!ibuf_bitmap_page(i)) { - - count += buf_read_page_low(FALSE, ibuf_mode + count += buf_read_page_low(&err, FALSE, ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER, - space, i); + space, tablespace_version, i); + if (err == DB_TABLESPACE_DELETED) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Warning: in random readahead trying to access tablespace\n" +"InnoDB: %lu page no. 
%lu,\n" +"InnoDB: but the tablespace does not exist or is just being dropped.\n", + space, i); + } } } @@ -264,15 +301,27 @@ buf_read_page( ulint space, /* in: space id */ ulint offset) /* in: page number */ { - ulint count; - ulint count2; + ib_longlong tablespace_version; + ulint count; + ulint count2; + ulint err; + + tablespace_version = fil_space_get_version(space); count = buf_read_ahead_random(space, offset); /* We do the i/o in the synchronous aio mode to save thread switches: hence TRUE */ - count2 = buf_read_page_low(TRUE, BUF_READ_ANY_PAGE, space, offset); + count2 = buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space, + tablespace_version, offset); + if (err == DB_TABLESPACE_DELETED) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: error: trying to access tablespace %lu page no. %lu,\n" +"InnoDB: but the tablespace does not exist or is just being dropped.\n", + space, offset); + } /* Flush pages from the end of the LRU list if necessary */ buf_flush_free_margin(); @@ -312,6 +361,7 @@ buf_read_ahead_linear( ulint offset) /* in: page number of a page; NOTE: the current thread must want access to this page (see NOTE 3 above) */ { + ib_longlong tablespace_version; buf_block_t* block; buf_frame_t* frame; buf_block_t* pred_block = NULL; @@ -323,6 +373,7 @@ buf_read_ahead_linear( ulint fail_count; ulint ibuf_mode; ulint low, high; + ulint err; ulint i; if (srv_startup_is_before_trx_rollback_phase) { @@ -350,14 +401,21 @@ buf_read_ahead_linear( return(0); } + /* Remember the tablespace version before we ask the tablespace size + below: if DISCARD + IMPORT changes the actual .ibd file meanwhile, we + do not try to read outside the bounds of the tablespace! 
*/ + + tablespace_version = fil_space_get_version(space); + + mutex_enter(&(buf_pool->mutex)); + if (high > fil_space_get_size(space)) { + mutex_exit(&(buf_pool->mutex)); /* The area is not whole, return */ return(0); } - mutex_enter(&(buf_pool->mutex)); - if (buf_pool->n_pend_reads > buf_pool->curr_size / BUF_READ_AHEAD_PEND_LIMIT) { mutex_exit(&(buf_pool->mutex)); @@ -378,18 +436,15 @@ buf_read_ahead_linear( fail_count = 0; for (i = low; i < high; i++) { - block = buf_page_hash_get(space, i); if ((block == NULL) || !block->accessed) { - /* Not accessed */ fail_count++; } else if (pred_block && (ut_ulint_cmp(block->LRU_position, pred_block->LRU_position) != asc_or_desc)) { - /* Accesses not in the right order */ fail_count++; @@ -462,7 +517,7 @@ buf_read_ahead_linear( return(0); } - /* If we got this far, read-ahead can be sensible: do it */ + /* If we got this far, read-ahead can be sensible: do it */ if (ibuf_inside()) { ibuf_mode = BUF_READ_IBUF_PAGES_ONLY; @@ -483,9 +538,17 @@ buf_read_ahead_linear( aio mode: hence FALSE as the first parameter */ if (!ibuf_bitmap_page(i)) { - count += buf_read_page_low(FALSE, ibuf_mode + count += buf_read_page_low(&err, FALSE, ibuf_mode | OS_AIO_SIMULATED_WAKE_LATER, - space, i); + space, tablespace_version, i); + if (err == DB_TABLESPACE_DELETED) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Warning: in linear readahead trying to access tablespace\n" +"InnoDB: %lu page no. %lu,\n" +"InnoDB: but the tablespace does not exist or is just being dropped.\n", + space, i); + } } } @@ -509,7 +572,7 @@ buf_read_ahead_linear( /************************************************************************ Issues read requests for pages which the ibuf module wants to read in, in -order to contract insert buffer trees. Technically, this function is like +order to contract the insert buffer tree. Technically, this function is like a read-ahead function. 
*/ void @@ -518,11 +581,17 @@ buf_read_ibuf_merge_pages( ibool sync, /* in: TRUE if the caller wants this function to wait for the highest address page to get read in, before this function returns */ - ulint space, /* in: space id */ + ulint* space_ids, /* in: array of space ids */ + ib_longlong* space_versions,/* in: the spaces must have this version + number (timestamp), otherwise we discard the + read; we use this to cancel reads if + DISCARD + IMPORT may have changed the + tablespace size */ ulint* page_nos, /* in: array of page numbers to read, with the highest page number the last in the array */ ulint n_stored) /* in: number of page numbers in the array */ { + ulint err; ulint i; ut_ad(!ibuf_inside()); @@ -535,12 +604,21 @@ buf_read_ibuf_merge_pages( } for (i = 0; i < n_stored; i++) { + if ((i + 1 == n_stored) && sync) { - buf_read_page_low(TRUE, BUF_READ_ANY_PAGE, space, - page_nos[i]); + buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, + space_ids[i], space_versions[i], page_nos[i]); } else { - buf_read_page_low(FALSE, BUF_READ_ANY_PAGE, space, - page_nos[i]); + buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE, + space_ids[i], space_versions[i], page_nos[i]); + } + + if (err == DB_TABLESPACE_DELETED) { + /* We have deleted or are deleting the single-table + tablespace: remove the entries for that page */ + + ibuf_merge_or_delete_for_page(NULL, space_ids[i], + page_nos[i], FALSE); } } @@ -548,8 +626,7 @@ buf_read_ibuf_merge_pages( buf_flush_free_margin(); if (buf_debug_prints) { - printf("Ibuf merge read-ahead space %lu pages %lu\n", - space, n_stored); + printf("Ibuf merge read-ahead pages %lu\n", n_stored); } } @@ -567,8 +644,12 @@ buf_read_recv_pages( highest page number the last in the array */ ulint n_stored) /* in: number of page numbers in the array */ { - ulint count; - ulint i; + ib_longlong tablespace_version; + ulint count; + ulint err; + ulint i; + + tablespace_version = fil_space_get_version(space); for (i = 0; i < n_stored; i++) { @@ -596,12 
+677,12 @@ buf_read_recv_pages( os_aio_print_debug = FALSE; if ((i + 1 == n_stored) && sync) { - buf_read_page_low(TRUE, BUF_READ_ANY_PAGE, space, - page_nos[i]); + buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space, + tablespace_version, page_nos[i]); } else { - buf_read_page_low(FALSE, BUF_READ_ANY_PAGE + buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE | OS_AIO_SIMULATED_WAKE_LATER, - space, page_nos[i]); + space, tablespace_version, page_nos[i]); } } diff --git a/innobase/data/data0type.c b/innobase/data/data0type.c index df430f06bcb..268da7eaf5c 100644 --- a/innobase/data/data0type.c +++ b/innobase/data/data0type.c @@ -12,7 +12,7 @@ Created 1/16/1996 Heikki Tuuri #include "data0type.ic" #endif -dtype_t dtype_binary_val = {DATA_BINARY, 0, 0, 0}; +dtype_t dtype_binary_val = {DATA_BINARY, 0, 0, 0, 0}; dtype_t* dtype_binary = &dtype_binary_val; /************************************************************************* diff --git a/innobase/dict/dict0boot.c b/innobase/dict/dict0boot.c index 0bf2ace3324..f0d41018948 100644 --- a/innobase/dict/dict0boot.c +++ b/innobase/dict/dict0boot.c @@ -419,6 +419,4 @@ dict_create(void) dict_boot(); dict_insert_initial_data(); - - sync_order_checks_on = TRUE; } diff --git a/innobase/dict/dict0crea.c b/innobase/dict/dict0crea.c index 9139e589a0a..9d1ec53645d 100644 --- a/innobase/dict/dict0crea.c +++ b/innobase/dict/dict0crea.c @@ -264,6 +264,8 @@ dict_build_table_def_step( dict_table_t* table; dict_table_t* cluster_table; dtuple_t* row; + ulint error; + mtr_t mtr; UT_NOT_USED(thr); ut_ad(mutex_own(&(dict_sys->mutex))); @@ -291,6 +293,29 @@ dict_build_table_def_step( table->mix_id = dict_hdr_get_new_id(DICT_HDR_MIX_ID); } + if (srv_file_per_table) { + /* We create a new single-table tablespace for the table. 
+ We initially let it be 4 pages: + - page 0 is the fsp header and an extent descriptor page, + - page 1 is an ibuf bitmap page, + - page 2 is the first inode page, + - page 3 will contain the root of the clustered index of the + table we create here. */ + + error = fil_create_new_single_table_tablespace( + &(table->space), table->name, 4); + if (error != DB_SUCCESS) { + + return(error); + } + + mtr_start(&mtr); + + fsp_header_init(table->space, 4, &mtr); + + mtr_commit(&mtr); + } + row = dict_create_sys_tables_tuple(table, node->heap); ins_node_set_new_row(node->tab_def, row); @@ -317,7 +342,6 @@ dict_build_col_def_step( } #ifdef notdefined - /************************************************************************* Creates the single index for a cluster: it contains all the columns of the cluster definition in the order they were defined. */ @@ -508,8 +532,8 @@ dict_create_sys_fields_tuple( } /********************************************************************* -Creates the tuple with which the index entry is searched for -writing the index tree root page number, if such a tree is created. */ +Creates the tuple with which the index entry is searched for writing the index +tree root page number, if such a tree is created. 
*/ static dtuple_t* dict_create_search_tuple( @@ -577,10 +601,10 @@ dict_build_index_def_step( index->id = dict_hdr_get_new_id(DICT_HDR_INDEX_ID); - if (index->type & DICT_CLUSTERED) { - /* Inherit the space from the table */ - index->space = table->space; - } + /* Inherit the space id from the table; we store all indexes of a + table in the same tablespace */ + + index->space = table->space; index->page_no = FIL_NULL; @@ -664,6 +688,9 @@ dict_create_index_tree_step( index->page_no = btr_create(index->type, index->space, index->id, &mtr); + /* printf("Created a new index tree in space %lu root page %lu\n", + index->space, index->page_no); */ + page_rec_write_index_page_no(btr_pcur_get_rec(&pcur), DICT_SYS_INDEXES_PAGE_NO_FIELD, index->page_no, &mtr); @@ -712,7 +739,14 @@ dict_drop_index_tree( ut_ad(len == 4); space = mtr_read_ulint(ptr, MLOG_4BYTES, mtr); - + + if (!fil_tablespace_exists_in_mem(space)) { + /* It is a single table tablespace and the .ibd file is + missing: do nothing */ + + return; + } + /* We free all the pages but the root page first; this operation may span several mini-transactions */ @@ -722,6 +756,8 @@ dict_drop_index_tree( we write FIL_NULL to the appropriate field in the SYS_INDEXES record: this mini-transaction marks the B-tree totally freed */ + /* printf("Dropping index tree in space %lu root page %lu\n", space, + root_page_no); */ btr_free_root(space, root_page_no, mtr); page_rec_write_index_page_no(rec, DICT_SYS_INDEXES_PAGE_NO_FIELD, @@ -746,7 +782,6 @@ dict_create_default_index( dict_create_index(index, trx); } - #endif /************************************************************************* diff --git a/innobase/dict/dict0dict.c b/innobase/dict/dict0dict.c index a576a886b97..9056c974f02 100644 --- a/innobase/dict/dict0dict.c +++ b/innobase/dict/dict0dict.c @@ -139,7 +139,8 @@ dict_tree_find_index_low( /*=====================*/ /* out: index */ dict_tree_t* tree, /* in: index tree */ - rec_t* rec); /* in: record for which to find 
correct index */ + rec_t* rec); /* in: record for which to find correct + index */ /************************************************************************** Removes a foreign constraint struct from the dictionet cache. */ static @@ -717,7 +718,7 @@ dict_table_get_and_increment_handle_count( mutex_exit(&(dict_sys->mutex)); if (table != NULL) { - if (!table->stat_initialized) { + if (!table->stat_initialized && !table->ibd_file_missing) { dict_update_statistics(table); } } @@ -869,6 +870,7 @@ dict_table_rename_in_cache( ulint fold; ulint old_size; char* name_buf; + ibool success; ulint i; ut_ad(table); @@ -884,6 +886,21 @@ dict_table_rename_in_cache( HASH_SEARCH(name_hash, dict_sys->table_hash, fold, table2, (ut_strcmp(table2->name, new_name) == 0)); if (table2) { + fprintf(stderr, +"InnoDB: Error: dictionary cache already contains a table of name %s\n", + new_name); + return(FALSE); + } + } + + /* If the table is stored in a single-table tablespace, rename the + .ibd file */ + + if (table->space != 0) { + success = fil_rename_tablespace(table->name, table->space, + new_name); + if (!success) { + return(FALSE); } } @@ -909,7 +926,6 @@ dict_table_rename_in_cache( /* Add table to hash table of tables */ HASH_INSERT(dict_table_t, name_hash, dict_sys->table_hash, fold, table); - dict_sys->size += (mem_heap_get_size(table->heap) - old_size); /* Update the table_name field in indexes */ @@ -1000,6 +1016,31 @@ dict_table_rename_in_cache( } /************************************************************************** +Change the id of a table object in the dictionary cache. This is used in +DISCARD TABLESPACE. 
*/ + +void +dict_table_change_id_in_cache( +/*==========================*/ + dict_table_t* table, /* in: table object already in cache */ + dulint new_id) /* in: new id to set */ +{ + ut_ad(table); + ut_ad(mutex_own(&(dict_sys->mutex))); + ut_ad(table->magic_n == DICT_TABLE_MAGIC_N); + + /* Remove the table from the hash table of id's */ + + HASH_DELETE(dict_table_t, id_hash, dict_sys->table_id_hash, + ut_fold_dulint(table->id), table); + table->id = new_id; + + /* Add the table back to the hash table */ + HASH_INSERT(dict_table_t, id_hash, dict_sys->table_id_hash, + ut_fold_dulint(table->id), table); +} + +/************************************************************************** Removes a table object from the dictionary cache. */ void @@ -3295,8 +3336,8 @@ dict_tree_free( /*===========*/ dict_tree_t* tree) /* in, own: index tree */ { - ut_ad(tree); - ut_ad(tree->magic_n == DICT_TREE_MAGIC_N); + ut_a(tree); + ut_a(tree->magic_n == DICT_TREE_MAGIC_N); rw_lock_free(&(tree->lock)); mem_free(tree); @@ -3310,7 +3351,8 @@ dict_tree_find_index_low( /*=====================*/ /* out: index */ dict_tree_t* tree, /* in: index tree */ - rec_t* rec) /* in: record for which to find correct index */ + rec_t* rec) /* in: record for which to find correct + index */ { dict_index_t* index; dict_table_t* table; @@ -3348,7 +3390,8 @@ dict_tree_find_index( /*=================*/ /* out: index */ dict_tree_t* tree, /* in: index tree */ - rec_t* rec) /* in: record for which to find correct index */ + rec_t* rec) /* in: record for which to find correct + index */ { dict_index_t* index; @@ -3438,7 +3481,8 @@ dict_tree_build_node_ptr( /*=====================*/ /* out, own: node pointer */ dict_tree_t* tree, /* in: index tree */ - rec_t* rec, /* in: record for which to build node pointer */ + rec_t* rec, /* in: record for which to build node + pointer */ ulint page_no,/* in: page number to put in node pointer */ mem_heap_t* heap, /* in: memory heap where pointer created */ ulint level) /* 
in: level of rec in tree: 0 means leaf @@ -3600,6 +3644,16 @@ dict_update_statistics_low( ulint size; ulint sum_of_index_sizes = 0; + if (table->ibd_file_missing) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: cannot calculate statistics for table %s\n" +"InnoDB: because the .ibd file is missing. See section 15.1 of\n" +"InnoDB: http:/www.innodb.com/ibman.html for help\n", table->name); + + return; + } + /* If we have set a high innodb_force_recovery level, do not calculate statistics, as a badly corrupted index can cause a crash in it. */ diff --git a/innobase/dict/dict0load.c b/innobase/dict/dict0load.c index d5c51a43747..e9076db08f3 100644 --- a/innobase/dict/dict0load.c +++ b/innobase/dict/dict0load.c @@ -19,6 +19,7 @@ Created 4/24/1996 Heikki Tuuri #include "mach0data.h" #include "dict0dict.h" #include "dict0boot.h" +#include "srv0start.h" /************************************************************************ Finds the first table name in the given database. */ @@ -120,8 +121,8 @@ dict_print(void) rec_t* rec; byte* field; ulint len; - char table_name[10000]; mtr_t mtr; + char table_name[10000]; mutex_enter(&(dict_sys->mutex)); @@ -186,6 +187,100 @@ loop: } /************************************************************************ +In a crash recovery we already have all the tablespace objects created. +This function compares the space id information in the InnoDB data dictionary +to what we already read with fil_load_single_table_tablespaces(). +In a normal startup we just scan the biggest space id, and store it to +fil_system. 
*/ + +void +dict_check_tablespaces_or_store_max_id( +/*===================================*/ + ibool in_crash_recovery) /* in: are we doing a crash recovery */ +{ + dict_table_t* sys_tables; + dict_index_t* sys_index; + btr_pcur_t pcur; + rec_t* rec; + byte* field; + ulint len; + ulint space_id; + ulint max_space_id = 0; + mtr_t mtr; + char name[OS_FILE_MAX_PATH]; + + mutex_enter(&(dict_sys->mutex)); + + mtr_start(&mtr); + + sys_tables = dict_table_get_low((char *) "SYS_TABLES"); + sys_index = UT_LIST_GET_FIRST(sys_tables->indexes); + + btr_pcur_open_at_index_side(TRUE, sys_index, BTR_SEARCH_LEAF, &pcur, + TRUE, &mtr); +loop: + btr_pcur_move_to_next_user_rec(&pcur, &mtr); + + rec = btr_pcur_get_rec(&pcur); + + if (!btr_pcur_is_on_user_rec(&pcur, &mtr)) { + /* end of index */ + + btr_pcur_close(&pcur); + mtr_commit(&mtr); + + /* We must make the tablespace cache aware of the biggest + known space id */ + + /* printf("Biggest space id in data dictionary %lu\n", + max_space_id); */ + fil_set_max_space_id_if_bigger(max_space_id); + + mutex_exit(&(dict_sys->mutex)); + + return; + } + + field = rec_get_nth_field(rec, 0, &len); + + if (!rec_get_deleted_flag(rec)) { + + /* We found one */ + + ut_a(len < OS_FILE_MAX_PATH - 10); + ut_memcpy(name, field, len); + name[len] = '\0'; + + field = rec_get_nth_field(rec, 9, &len); + ut_a(len == 4); + + space_id = mach_read_from_4(field); + + btr_pcur_store_position(&pcur, &mtr); + + mtr_commit(&mtr); + + if (space_id != 0 && in_crash_recovery) { + /* Check that the tablespace (the .ibd file) really + exists; print a warning to the .err log if not */ + + fil_space_for_table_exists_in_mem(space_id, name, + TRUE, TRUE); + } + + if (space_id > max_space_id) { + max_space_id = space_id; + } + + mtr_start(&mtr); + + btr_pcur_restore_position(BTR_SEARCH_LEAF, &pcur, &mtr); + } + + goto loop; +} + +/************************************************************************ Loads definitions for table columns. 
*/ static void @@ -359,13 +454,13 @@ dict_load_fields( pos_and_prefix_len = mach_read_from_4(field); - ut_a((pos_and_prefix_len & 0xFFFF) == i - || (pos_and_prefix_len & 0xFFFF0000) == (i << 16)); + ut_a((pos_and_prefix_len & 0xFFFFUL) == i + || (pos_and_prefix_len & 0xFFFF0000UL) == (i << 16)); if ((i == 0 && pos_and_prefix_len > 0) - || (pos_and_prefix_len & 0xFFFF0000) > 0) { + || (pos_and_prefix_len & 0xFFFF0000UL) > 0) { - prefix_len = pos_and_prefix_len & 0xFFFF; + prefix_len = pos_and_prefix_len & 0xFFFFUL; } else { prefix_len = 0; } @@ -540,8 +635,8 @@ dict_load_indexes( && (0 == ut_memcmp(name_buf, (char*) "ID_IND", name_len))))) { - /* The index was created in memory already in - booting */ + /* The index was created in memory already at booting + of the database server */ } else { index = dict_mem_index_create(table->name, name_buf, space, type, n_fields); @@ -572,9 +667,14 @@ dictionary cache. */ dict_table_t* dict_load_table( /*============*/ - /* out: table, NULL if does not exist */ - char* name) /* in: table name */ + /* out: table, NULL if does not exist; if the table is + stored in an .ibd file, but the file does not exist, + then we set the ibd_file_missing flag TRUE in the table + object we return */ + char* name) /* in: table name in the databasename/tablename + format */ { + ibool ibd_file_missing = FALSE; dict_table_t* table; dict_table_t* sys_tables; btr_pcur_t pcur; @@ -641,6 +741,23 @@ dict_load_table( field = rec_get_nth_field(rec, 9, &len); space = mach_read_from_4(field); + /* Check if the tablespace exists and has the right name */ + if (space != 0) { + if (fil_space_for_table_exists_in_mem(space, name, FALSE, + FALSE)) { + /* Ok; (if we did a crash recovery then the tablespace + can already be in the memory cache) */ + } else { + /* Try to open the tablespace */ + if (!fil_open_single_table_tablespace(space, name)) { + /* We failed to find a sensible tablespace + file */ + + ibd_file_missing = TRUE; + } + } + } + ut_a(0 == 
ut_strcmp((char *) "N_COLS", dict_field_get_col( dict_index_get_nth_field( @@ -651,6 +768,8 @@ dict_load_table( table = dict_mem_table_create(name, space, n_cols); + table->ibd_file_missing = ibd_file_missing; + ut_a(0 == ut_strcmp((char *) "ID", dict_field_get_col( dict_index_get_nth_field( @@ -1003,7 +1122,7 @@ dict_load_foreign( /* We store the type to the bits 24-31 of n_fields */ foreign->type = foreign->n_fields >> 24; - foreign->n_fields = foreign->n_fields & 0xFFFFFF; + foreign->n_fields = foreign->n_fields & 0xFFFFFFUL; foreign->id = mem_heap_alloc(foreign->heap, ut_strlen(id) + 1); diff --git a/innobase/dict/dict0mem.c b/innobase/dict/dict0mem.c index 56efc0a0117..aed44eb246c 100644 --- a/innobase/dict/dict0mem.c +++ b/innobase/dict/dict0mem.c @@ -56,6 +56,8 @@ dict_mem_table_create( table->type = DICT_TABLE_ORDINARY; table->name = str; table->space = space; + table->ibd_file_missing = FALSE; + table->tablespace_discarded = FALSE; table->n_def = 0; table->n_cols = n_cols + DATA_N_SYS_COLS; table->mem_fix = 0; diff --git a/innobase/fil/fil0fil.c b/innobase/fil/fil0fil.c index f55df90846c..2b0138ccb5a 100644 --- a/innobase/fil/fil0fil.c +++ b/innobase/fil/fil0fil.c @@ -1,5 +1,5 @@ /****************************************************** -The low-level file system +The tablespace memory cache (c) 1995 Innobase Oy @@ -16,16 +16,19 @@ Created 10/25/1995 Heikki Tuuri #include "mach0data.h" #include "ibuf0ibuf.h" #include "buf0buf.h" +#include "buf0flu.h" +#include "buf0lru.h" #include "log0log.h" #include "log0recv.h" #include "fsp0fsp.h" #include "srv0srv.h" +#include "srv0start.h" /* - IMPLEMENTATION OF THE LOW-LEVEL FILE SYSTEM - =========================================== + IMPLEMENTATION OF THE TABLESPACE MEMORY CACHE + ============================================= -The file system is responsible for providing fast read/write access to +The tablespace cache is responsible for providing fast read/write access to tablespaces and logs of the database. 
File creation and deletion is done in other modules which know more of the logic of the operation, however. @@ -83,20 +86,31 @@ ulint fil_n_pending_tablespace_flushes = 0; /* Null file address */ fil_addr_t fil_addr_null = {FIL_NULL, 0}; -/* File system file node data structure */ +/* File node of a tablespace or the log data space */ typedef struct fil_node_struct fil_node_t; struct fil_node_struct { - char* name; /* the file name or path */ + fil_space_t* space; /* backpointer to the space where this node + belongs */ + char* name; /* path to the file */ ibool open; /* TRUE if file open */ os_file_t handle; /* OS handle to the file, if file open */ - ulint size; /* size of the file in database pages - (where the possible last incomplete megabyte - is ignored) */ + ibool is_raw_disk;/* TRUE if the 'file' is actually a raw + device or a raw disk partition */ + ulint size; /* size of the file in database pages, 0 if + not known yet; the possible last incomplete + megabyte is ignored if space == 0 */ ulint n_pending; - /* count of pending i/o-ops on this file */ - ibool is_modified; /* this is set to TRUE when we write - to the file and FALSE when we call fil_flush - for this file space */ + /* count of pending i/o's on this file; + closing of the file is not allowed if + this is > 0 */ + ulint n_pending_flushes; + /* count of pending flushes on this file; + closing of the file is not allowed if + this is > 0 */ + ib_longlong modification_counter;/* when we write to the file we + increment this by one */ + ib_longlong flush_counter;/* up to what modification_counter value + we have flushed the modifications to disk */ UT_LIST_NODE_T(fil_node_t) chain; /* link field for the file chain */ UT_LIST_NODE_T(fil_node_t) LRU; @@ -106,19 +120,52 @@ struct fil_node_struct { #define FIL_NODE_MAGIC_N 89389 -/* File system tablespace or log data structure: let us call them by a common -name space */ +/* Tablespace or log data space: let us call them by a common name space */ 
struct fil_space_struct { - char* name; /* space name */ + char* name; /* space name = the path to the first file in + it */ ulint id; /* space id */ + ib_longlong tablespace_version; + /* in DISCARD/IMPORT this timestamp is used to + check if we should ignore an insert buffer + merge request for a page because it actually + was for the previous incarnation of the + space */ + ibool mark; /* this is set to TRUE at database startup if + the space corresponds to a table in the InnoDB + data dictionary; so we can print a warning of + orphaned tablespaces */ + ibool stop_ios;/* TRUE if we want to rename the .ibd file of + tablespace and want to stop temporarily + posting of new i/o requests on the file */ + ibool stop_ibuf_merges; + /* we set this TRUE when we start deleting a + single-table tablespace */ + ibool is_being_deleted; + /* this is set to TRUE when we start + deleting a single-table tablespace and its + file; when this flag is set no further i/o + or flush requests can be placed on this space, + though there may be such requests still being + processed on this space */ ulint purpose;/* FIL_TABLESPACE, FIL_LOG, or FIL_ARCH_LOG */ UT_LIST_BASE_NODE_T(fil_node_t) chain; /* base node for the file chain */ - ulint size; /* space size in pages */ + ulint size; /* space size in pages; 0 if a single-table + tablespace whose size we do not know yet */ ulint n_reserved_extents; /* number of reserved free extents for ongoing operations like B-tree page split */ + ulint n_pending_flushes; /* this is > 0 when flushing + the tablespace to disk; dropping of the + tablespace is forbidden if this is > 0 */ + ulint n_pending_ibuf_merges;/* this is > 0 when merging + insert buffer entries to a page so that we + may need to access the ibuf bitmap page in the + tablespade: dropping of the tablespace is + forbidden if this is > 0 */ hash_node_t hash; /* hash chain node */ + hash_node_t name_hash;/* hash chain the name_hash table */ rw_lock_t latch; /* latch protecting the file 
space storage allocation */ UT_LIST_NODE_T(fil_space_t) space_list; @@ -130,80 +177,115 @@ struct fil_space_struct { #define FIL_SPACE_MAGIC_N 89472 -/* The file system data structure */ +/* The tablespace memory cache; also the totality of logs = the log data space, +is stored here; below we talk about tablespaces, but also the ib_logfiles +form a 'space' and it is handled here */ typedef struct fil_system_struct fil_system_t; struct fil_system_struct { - mutex_t mutex; /* The mutex protecting the system */ + mutex_t mutex; /* The mutex protecting the cache */ hash_table_t* spaces; /* The hash table of spaces in the - system */ + system; they are hashed on the space + id */ + hash_table_t* name_hash; /* hash table based on the space + name */ UT_LIST_BASE_NODE_T(fil_node_t) LRU; /* base node for the LRU list of the - most recently used open files */ - ulint n_open_pending; /* current number of open files with - pending i/o-ops on them */ - ulint max_n_open; /* maximum allowed open files */ - os_event_t can_open; /* this event is set to the signaled - state when the system is capable of - opening a new file, i.e., - n_open_pending < max_n_open */ + most recently used open files with no + pending i/o's; if we start an i/o on + the file, we first remove it from this + list, and return it to the start of + the list when the i/o ends; + log files and the system tablespace are + not put to this list: they are opened + after the startup, and kept open until + shutdown */ + ulint n_open; /* number of files currently open */ + ulint max_n_open; /* n_open is not allowed to exceed + this */ + ib_longlong modification_counter;/* when we write to a file we + increment this by one */ + ulint max_assigned_id;/* maximum space id in the existing + tables, or assigned during the time + mysqld has been up; at an InnoDB + startup we scan the data dictionary + and set here the maximum of the + space id's of the tables there */ + ib_longlong tablespace_version; + /* a counter which is 
incremented for + every space object memory creation; + every space mem object gets a + 'timestamp' from this; in DISCARD/ + IMPORT this is used to check if we + should ignore an insert buffer merge + request */ UT_LIST_BASE_NODE_T(fil_space_t) space_list; /* list of all file spaces */ }; -/* The file system. This variable is NULL before the module is initialized. */ +/* The tablespace memory cache. This variable is NULL before the module is +initialized. */ fil_system_t* fil_system = NULL; -/* The file system hash table size */ -#define FIL_SYSTEM_HASH_SIZE 500 +/* The tablespace memory cache hash table size */ +#define FIL_SYSTEM_HASH_SIZE 50 /* TODO: make bigger! */ -/*********************************************************************** -Reserves a right to open a single file. The right must be released with -fil_release_right_to_open. */ +/************************************************************************ +NOTE: you must call fil_mutex_enter_and_prepare_for_io() first! +Prepares a file node for i/o. Opens the file if it is closed. Updates the +pending i/o's field in the node and the system appropriately. Takes the node +off the LRU list if it is in the LRU list. The caller must hold the fil_sys +mutex. */ +static void -fil_reserve_right_to_open(void) -/*===========================*/ -{ -loop: - mutex_enter(&(fil_system->mutex)); - - if (fil_system->n_open_pending == fil_system->max_n_open) { - - /* It is not sure we can open the file if it is closed: wait */ - - os_event_reset(fil_system->can_open); - - mutex_exit(&(fil_system->mutex)); +fil_node_prepare_for_io( +/*====================*/ + fil_node_t* node, /* in: file node */ + fil_system_t* system, /* in: tablespace memory cache */ + fil_space_t* space); /* in: space */ +/************************************************************************ +Updates the data structures when an i/o operation finishes. Updates the +pending i/o's field in the node appropriately. 
*/ +static +void +fil_node_complete_io( +/*=================*/ + fil_node_t* node, /* in: file node */ + fil_system_t* system, /* in: tablespace memory cache */ + ulint type); /* in: OS_FILE_WRITE or OS_FILE_READ; marks + the node as modified if + type == OS_FILE_WRITE */ - os_event_wait(fil_system->can_open); - goto loop; - } +/*********************************************************************** +Returns the version number of a tablespace, -1 if not found. */ - fil_system->max_n_open--; +ib_longlong +fil_space_get_version( +/*==================*/ + /* out: version number, -1 if the tablespace does not + exist in the memory cache */ + ulint id) /* in: space id */ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + ib_longlong version = -1; - mutex_exit(&(fil_system->mutex)); -} + ut_ad(system); -/*********************************************************************** -Releases a right to open a single file. */ + mutex_enter(&(system->mutex)); -void -fil_release_right_to_open(void) -/*===========================*/ -{ - mutex_enter(&(fil_system->mutex)); - - if (fil_system->n_open_pending == fil_system->max_n_open) { + HASH_SEARCH(hash, system->spaces, id, space, space->id == id); - os_event_set(fil_system->can_open); + if (space) { + version = space->tablespace_version; } - fil_system->max_n_open++; + mutex_exit(&(system->mutex)); - mutex_exit(&(fil_system->mutex)); + return(version); } /*********************************************************************** @@ -215,8 +297,8 @@ fil_space_get_latch( /* out: latch protecting storage allocation */ ulint id) /* in: space id */ { - fil_space_t* space; fil_system_t* system = fil_system; + fil_space_t* space; ut_ad(system); @@ -224,6 +306,8 @@ fil_space_get_latch( HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + ut_a(space); + mutex_exit(&(system->mutex)); return(&(space->latch)); @@ -238,8 +322,8 @@ fil_space_get_type( /* out: FIL_TABLESPACE or FIL_LOG */ ulint id) /* in: space id */ { - 
fil_space_t* space; fil_system_t* system = fil_system; + fil_space_t* space; ut_ad(system); @@ -247,6 +331,8 @@ fil_space_get_type( HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + ut_a(space); + mutex_exit(&(system->mutex)); return(space->purpose); @@ -261,17 +347,21 @@ fil_space_get_ibuf_data( /* out: ibuf data for this space */ ulint id) /* in: space id */ { + fil_system_t* system = fil_system; fil_space_t* space; - fil_system_t* system = fil_system; ut_ad(system); + ut_a(id == 0); + mutex_enter(&(system->mutex)); HASH_SEARCH(hash, system->spaces, id, space, space->id == id); mutex_exit(&(system->mutex)); + ut_a(space); + return(space->ibuf_data); } @@ -284,16 +374,16 @@ fil_node_create( char* name, /* in: file name (file must be closed) */ ulint size, /* in: file size in database blocks, rounded downwards to an integer */ - ulint id) /* in: space id where to append */ + ulint id, /* in: space id where to append */ + ibool is_raw) /* in: TRUE if a raw device or a raw disk partition */ { + fil_system_t* system = fil_system; fil_node_t* node; fil_space_t* space; char* name2; - fil_system_t* system = fil_system; ut_a(system); ut_a(name); - ut_a(size > 0); mutex_enter(&(system->mutex)); @@ -305,29 +395,119 @@ fil_node_create( node->name = name2; node->open = FALSE; + + ut_a(!is_raw || srv_start_raw_disk_in_use); + + node->is_raw_disk = is_raw; node->size = size; node->magic_n = FIL_NODE_MAGIC_N; node->n_pending = 0; + node->n_pending_flushes = 0; - node->is_modified = FALSE; + node->modification_counter = 0; + node->flush_counter = 0; HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + if (!space) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error: Could not find tablespace %lu for\n" +"InnoDB: file %s from the tablespace memory cache.\n", id, name); + mem_free(name2); + + mem_free(node); + + mutex_exit(&(system->mutex)); + + return; + } + space->size += size; + node->space = space; + UT_LIST_ADD_LAST(chain, space->chain, 
node); mutex_exit(&(system->mutex)); } +/************************************************************************ +Opens a the file of a node of a tablespace. The caller must own the fil_system +mutex. */ +static +void +fil_node_open_file( +/*===============*/ + fil_node_t* node, /* in: file node */ + fil_system_t* system, /* in: tablespace memory cache */ + fil_space_t* space) /* in: space */ +{ + ib_longlong size_bytes; + ulint size_low; + ulint size_high; + ibool ret; + + ut_ad(mutex_own(&(system->mutex))); + + ut_a(node->n_pending == 0); + ut_a(node->open == FALSE); + + /* printf("Opening file %s\n", node->name); */ + + if (space->purpose == FIL_LOG) { + node->handle = os_file_create(node->name, OS_FILE_OPEN, + OS_FILE_AIO, OS_LOG_FILE, &ret); + } else if (node->is_raw_disk) { + node->handle = os_file_create(node->name, + OS_FILE_OPEN_RAW, + OS_FILE_AIO, OS_DATA_FILE, &ret); + } else { + node->handle = os_file_create(node->name, OS_FILE_OPEN, + OS_FILE_AIO, OS_DATA_FILE, &ret); + } + + ut_a(ret); + + node->open = TRUE; + + system->n_open++; + + if (node->size == 0) { + /* It must be a single-table tablespace and we do not know the + size of the file yet */ + + ut_a(space->id != 0); + + os_file_get_size(node->handle, &size_low, &size_high); + + size_bytes = (((ib_longlong)size_high) << 32) + + (ib_longlong)size_low; + + if (size_bytes >= FSP_EXTENT_SIZE * UNIV_PAGE_SIZE) { + node->size = (ulint) ((size_bytes / (1024 * 1024)) + * ((1024 * 1024) / UNIV_PAGE_SIZE)); + } else { + node->size = (ulint) (size_bytes / UNIV_PAGE_SIZE); + } + + space->size = node->size; + } + + if (space->purpose == FIL_TABLESPACE && space->id != 0) { + /* Put the node to the LRU list */ + UT_LIST_ADD_FIRST(LRU, system->LRU, node); + } +} + /************************************************************************** Closes a file. 
*/ static void -fil_node_close( -/*===========*/ +fil_node_close_file( +/*================*/ fil_node_t* node, /* in: file node */ - fil_system_t* system) /* in: file system */ + fil_system_t* system) /* in: tablespace memory cache */ { ibool ret; @@ -335,32 +515,214 @@ fil_node_close( ut_ad(mutex_own(&(system->mutex))); ut_a(node->open); ut_a(node->n_pending == 0); + ut_a(node->n_pending_flushes == 0); ret = os_file_close(node->handle); ut_a(ret); + /* printf("Closing file %s\n", node->name); */ + node->open = FALSE; + ut_a(system->n_open > 0); + system->n_open--; - /* The node is in the LRU list, remove it */ - UT_LIST_REMOVE(LRU, system->LRU, node); + if (node->space->purpose == FIL_TABLESPACE && node->space->id != 0) { + ut_a(UT_LIST_GET_LEN(system->LRU) > 0); + + /* The node is in the LRU list, remove it */ + UT_LIST_REMOVE(LRU, system->LRU, node); + } +} + +/************************************************************************ +Tries to close a file in the LRU list. The caller must hold the fil_sys +mutex. 
*/ +static +ibool +fil_try_to_close_file_in_LRU( +/*=========================*/ + /* out: TRUE if success, FALSE if should retry + later; since i/o's generally complete in < + 100 ms, and as InnoDB writes at most 128 pages + from the buffer pool in a batch, and then + immediately flushes the files, there is a good + chance that the next time we find a suitable + node from the LRU list */ + ibool print_info) /* in: if TRUE, prints information why it + cannot close a file */ +{ + fil_system_t* system = fil_system; + fil_node_t* node; + + ut_ad(mutex_own(&(system->mutex))); + + node = UT_LIST_GET_LAST(system->LRU); + + if (print_info) { + fprintf(stderr, +"InnoDB: fil_sys open file LRU len %lu\n", UT_LIST_GET_LEN(system->LRU)); + } + + while (node != NULL) { + if (node->modification_counter == node->flush_counter + && node->n_pending_flushes == 0) { + + fil_node_close_file(node, system); + + return(TRUE); + } + + if (print_info && node->n_pending_flushes > 0) { + fprintf(stderr, +"InnoDB: cannot close file %s, because n_pending_flushes %lu\n", node->name, + node->n_pending_flushes); + } + + if (print_info + && node->modification_counter != node->flush_counter) { + fprintf(stderr, +"InnoDB: cannot close file %s, because mod_count %lld != fl_count %lld\n", + node->name, node->modification_counter, + node->flush_counter); + } + + node = UT_LIST_GET_PREV(LRU, node); + } + + return(FALSE); } /*********************************************************************** -Frees a file node object from a file system. */ +Reserves the fil_system mutex and tries to make sure we can open at least one +file while holding it. This should be called before calling +fil_node_prepare_for_io(), because that function may need to open a file. 
*/ +static +void +fil_mutex_enter_and_prepare_for_io( +/*===============================*/ + ulint space_id) /* in: space id */ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + ibool success; + ibool print_info = FALSE; + ulint count = 0; + ulint count2 = 0; + + ut_ad(!mutex_own(&(system->mutex))); +retry: + mutex_enter(&(system->mutex)); + + if (space_id == 0 || space_id >= SRV_LOG_SPACE_FIRST_ID) { + /* We keep log files and system tablespace files always open; + this is important in preventing deadlocks in this module, as + a page read completion often performs another read from the + insert buffer. The insert buffer is in tablespace 0, and we + cannot end up waiting in this function. */ + + return; + } + + if (system->n_open < system->max_n_open) { + + return; + } + + HASH_SEARCH(hash, system->spaces, space_id, space, + space->id == space_id); + if (space != NULL && space->stop_ios) { + /* We are going to do a rename file and want to stop new i/o's + for a while */ + + if (count2 > 20000) { + fprintf(stderr, +"InnoDB: Warning: tablespace %s has i/o ops stopped for a long time %lu\n", + space->name, count2); + } + + mutex_exit(&(system->mutex)); + + os_thread_sleep(20000); + + count2++; + + goto retry; + } + + /* If the file is already open, no need to do anything; if the space + does not exist, we handle the situation in the function which called + this function */ + + if (!space || UT_LIST_GET_FIRST(space->chain)->open) { + + return; + } + + if (count > 1) { + print_info = TRUE; + } + + /* Too many files are open, try to close some */ +close_more: + success = fil_try_to_close_file_in_LRU(print_info); + + if (success && system->n_open >= system->max_n_open) { + + goto close_more; + } + + if (system->n_open < system->max_n_open) { + /* Ok */ + + return; + } + + if (count >= 2) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Warning: too many (%lu) files stay open while the maximum\n" +"InnoDB: allowed value would be %lu.\n" +"InnoDB: 
You may need to raise the value of innodb_max_files_open in\n" +"InnoDB: my.cnf.\n", system->n_open, system->max_n_open); + + return; + } + + mutex_exit(&(system->mutex)); + + /* Wake the i/o-handler threads to make sure pending i/o's are + performed */ + os_aio_simulated_wake_handler_threads(); + + os_thread_sleep(20000); + + /* Flush tablespaces so that we can close modified files in the LRU + list */ + + fil_flush_file_spaces(FIL_TABLESPACE); + + count++; + + goto retry; +} + +/*********************************************************************** +Frees a file node object from a tablespace memory cache. */ static void fil_node_free( /*==========*/ fil_node_t* node, /* in, own: file node */ - fil_system_t* system, /* in: file system */ + fil_system_t* system, /* in: tablespace memory cache */ fil_space_t* space) /* in: space where the file node is chained */ { ut_ad(node && system && space); ut_ad(mutex_own(&(system->mutex))); ut_a(node->magic_n == FIL_NODE_MAGIC_N); + ut_a(node->n_pending == 0); if (node->open) { - fil_node_close(node, system); + fil_node_close_file(node, system); } space->size -= node->size; @@ -383,9 +745,9 @@ fil_space_truncate_start( if this does not equal to the combined size of some initial files in the space */ { + fil_system_t* system = fil_system; fil_node_t* node; fil_space_t* space; - fil_system_t* system = fil_system; mutex_enter(&(system->mutex)); @@ -394,7 +756,6 @@ fil_space_truncate_start( ut_a(space); while (trunc_len > 0) { - node = UT_LIST_GET_FIRST(space->chain); ut_a(node->size * UNIV_PAGE_SIZE >= trunc_len); @@ -405,17 +766,323 @@ fil_space_truncate_start( } mutex_exit(&(system->mutex)); -} +} + +/*********************************************************************** +Creates a space memory object and puts it to the tablespace memory cache. If +there is an error, prints an error message to the .err log. 
*/ + +ibool +fil_space_create( +/*=============*/ + /* out: TRUE if success */ + char* name, /* in: space name */ + ulint id, /* in: space id */ + ulint purpose)/* in: FIL_TABLESPACE, or FIL_LOG if log */ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + char* name2; + ulint namesake_id; +try_again: + /*printf( + "InnoDB: Adding tablespace %lu of name %s, purpose %lu\n", id, name, + purpose);*/ + + ut_a(system); + ut_a(name); + + mutex_enter(&(system->mutex)); + + HASH_SEARCH(name_hash, system->name_hash, ut_fold_string(name), space, + 0 == strcmp(name, space->name)); + if (space != NULL) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Warning: trying to init to the tablespace memory cache\n" +"InnoDB: a tablespace %lu of name %s,\n" +"InnoDB: but a tablespace %lu of the same name %s\n" +"InnoDB: already exists in the tablespace memory cache!\n", + id, name, space->id, space->name); + + if (id == 0 || purpose != FIL_TABLESPACE) { + + mutex_exit(&(system->mutex)); + + return(FALSE); + } + + fprintf(stderr, +"InnoDB: We assume that InnoDB did a crash recovery, and you had\n" +"InnoDB: an .ibd file for which the table did not exist in the\n" +"InnoDB: InnoDB internal data dictionary in the ibdata files.\n" +"InnoDB: We assume that you later removed the .ibd and .frm files,\n" +"InnoDB: and are now trying to recreate the table. 
We now remove the\n" +"InnoDB: conflicting tablespace object from the memory cache and try\n" +"InnoDB: the init again.\n"); + + namesake_id = space->id; + + mutex_exit(&(system->mutex)); + + fil_space_free(namesake_id); + + goto try_again; + } + + HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + + if (space != NULL) { + fprintf(stderr, +"InnoDB: Error: trying to add tablespace %lu of name %s\n" +"InnoDB: to the tablespace memory cache, but tablespace\n" +"InnoDB: %lu of name %s already exists in the tablespace\n" +"InnoDB: memory cache!\n", id, name, space->id, space->name); + + mutex_exit(&(system->mutex)); + + return(FALSE); + } + + space = mem_alloc(sizeof(fil_space_t)); + + name2 = mem_alloc(ut_strlen(name) + 1); + + ut_strcpy(name2, name); + + space->name = name2; + space->id = id; + + system->tablespace_version++; + space->tablespace_version = + system->tablespace_version; + space->mark = FALSE; + + if (purpose == FIL_TABLESPACE && id > system->max_assigned_id) { + system->max_assigned_id = id; + } + + space->stop_ios = FALSE; + space->stop_ibuf_merges = FALSE; + space->is_being_deleted = FALSE; + space->purpose = purpose; + space->size = 0; + + space->n_reserved_extents = 0; + + space->n_pending_flushes = 0; + space->n_pending_ibuf_merges = 0; + + UT_LIST_INIT(space->chain); + space->magic_n = FIL_SPACE_MAGIC_N; + + space->ibuf_data = NULL; + + rw_lock_create(&(space->latch)); + rw_lock_set_level(&(space->latch), SYNC_FSP); + + HASH_INSERT(fil_space_t, hash, system->spaces, id, space); + + HASH_INSERT(fil_space_t, name_hash, system->name_hash, + ut_fold_string(name), space); + UT_LIST_ADD_LAST(space_list, system->space_list, space); + + mutex_exit(&(system->mutex)); + + return(TRUE); +} + +/*********************************************************************** +Assigns a new space id for a new single-table tablespace. This works simply by +incrementing the global counter. If 4 billion id's is not enough, we may need +to recycle id's. 
*/ +static +ulint +fil_assign_new_space_id(void) +/*=========================*/ + /* out: new tablespace id; ULINT_UNDEFINED if could + not assign an id */ +{ + fil_system_t* system = fil_system; + ulint id; + + mutex_enter(&(system->mutex)); + + system->max_assigned_id++; + + id = system->max_assigned_id; + + if (id > (SRV_LOG_SPACE_FIRST_ID / 2) && (id % 1000000UL == 0)) { + ut_print_timestamp(stderr); + fprintf(stderr, +"InnoDB: Warning: you are running out of new single-table tablespace id's.\n" +"InnoDB: Current counter is %lu and it must not exceed %lu!\n" +"InnoDB: To reset the counter to zero you have to dump all your tables and\n" +"InnoDB: recreate the whole InnoDB installation.\n", id, + SRV_LOG_SPACE_FIRST_ID); + } + + if (id >= SRV_LOG_SPACE_FIRST_ID) { + ut_print_timestamp(stderr); + fprintf(stderr, +"InnoDB: You have run out of single-table tablespace id's!\n" +"InnoDB: Current counter is %lu.\n" +"InnoDB: To reset the counter to zero you have to dump all your tables and\n" +"InnoDB: recreate the whole InnoDB installation.\n", id); + system->max_assigned_id--; + + id = ULINT_UNDEFINED; + } + + mutex_exit(&(system->mutex)); + + return(id); +} + +/*********************************************************************** +Frees a space object from the tablespace memory cache. Closes the files in +the chain but does not delete them. There must not be any pending i/o's or +flushes on the files. 
*/ + +ibool +fil_space_free( +/*===========*/ + /* out: TRUE if success */ + ulint id) /* in: space id */ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + fil_space_t* namespace; + fil_node_t* fil_node; + + mutex_enter(&(system->mutex)); + + HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + + if (!space) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error: trying to remove tablespace %lu from the cache but\n" +"InnoDB: it is not there.\n", id); + + mutex_exit(&(system->mutex)); + + return(FALSE); + } + + HASH_DELETE(fil_space_t, hash, system->spaces, id, space); + + HASH_SEARCH(name_hash, system->name_hash, ut_fold_string(space->name), + namespace, 0 == strcmp(space->name, namespace->name)); + ut_a(namespace); + ut_a(space == namespace); + + HASH_DELETE(fil_space_t, name_hash, system->name_hash, + ut_fold_string(space->name), space); + + UT_LIST_REMOVE(space_list, system->space_list, space); + + ut_a(space->magic_n == FIL_SPACE_MAGIC_N); + ut_a(0 == space->n_pending_flushes); + + fil_node = UT_LIST_GET_FIRST(space->chain); + + while (fil_node != NULL) { + fil_node_free(fil_node, system, space); + + fil_node = UT_LIST_GET_FIRST(space->chain); + } + + ut_a(0 == UT_LIST_GET_LEN(space->chain)); + + mutex_exit(&(system->mutex)); + + rw_lock_free(&(space->latch)); + + mem_free(space->name); + mem_free(space); + + return(TRUE); +} + +/*********************************************************************** +Returns the size of the space in pages. The tablespace must be cached in the +memory cache. 
*/ + +ulint +fil_space_get_size( +/*===============*/ + /* out: space size, 0 if space not found */ + ulint id) /* in: space id */ +{ + fil_system_t* system = fil_system; + fil_node_t* node; + fil_space_t* space; + ulint size; + + ut_ad(system); + + fil_mutex_enter_and_prepare_for_io(id); + + HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + + if (space == NULL) { + mutex_exit(&(system->mutex)); + + return(0); + } + + if (space->size == 0 && space->purpose == FIL_TABLESPACE) { + ut_a(id != 0); + + ut_a(1 == UT_LIST_GET_LEN(space->chain)); + + node = UT_LIST_GET_FIRST(space->chain); + + /* It must be a single-table tablespace and we have not opened + the file yet; the following calls will open it and update the + size fields */ + + fil_node_prepare_for_io(node, system, space); + fil_node_complete_io(node, system, OS_FILE_READ); + } + + size = space->size; + + mutex_exit(&(system->mutex)); + + return(size); +} + +/*********************************************************************** +Checks if the pair space, page_no refers to an existing page in a tablespace +file space. The tablespace must be cached in the memory cache. */ + +ibool +fil_check_adress_in_tablespace( +/*===========================*/ + /* out: TRUE if the address is meaningful */ + ulint id, /* in: space id */ + ulint page_no)/* in: page number */ +{ + if (fil_space_get_size(id) > page_no) { + + return(TRUE); + } + + return(FALSE); +} /******************************************************************** -Creates a file system object. */ +Creates a the tablespace memory cache. 
*/ static fil_system_t* fil_system_create( /*==============*/ - /* out, own: file system object */ + /* out, own: tablespace memory cache */ ulint hash_size, /* in: hash table size */ - ulint max_n_open) /* in: maximum number of open files */ + ulint max_n_open) /* in: maximum number of open files; must be + > 10 */ { fil_system_t* system; @@ -429,12 +1096,17 @@ fil_system_create( mutex_set_level(&(system->mutex), SYNC_ANY_LATCH); system->spaces = hash_create(hash_size); + system->name_hash = hash_create(hash_size); UT_LIST_INIT(system->LRU); - system->n_open_pending = 0; + system->n_open = 0; system->max_n_open = max_n_open; - system->can_open = os_event_create(NULL); + + system->modification_counter = 0; + system->max_assigned_id = 0; + + system->tablespace_version = 0; UT_LIST_INIT(system->space_list); @@ -442,7 +1114,7 @@ fil_system_create( } /******************************************************************** -Initializes the file system of this module. */ +Initializes the tablespace memory cache. */ void fil_init( @@ -451,11 +1123,119 @@ fil_init( { ut_a(fil_system == NULL); + /*printf("Initializing the tablespace cache with max %lu open files\n", + max_n_open); */ fil_system = fil_system_create(FIL_SYSTEM_HASH_SIZE, max_n_open); } +/*********************************************************************** +Opens all log files and system tablespace data files. They stay open until the +database server shutdown. This should be called at a server startup after the +space objects for the log and the system tablespace have been created. The +purpose of this operation is to make sure we never run out of file descriptors +if we need to read from the insert buffer or to write to the log. 
*/ + +void +fil_open_log_and_system_tablespace_files(void) +/*==========================================*/ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + fil_node_t* node; + + mutex_enter(&(system->mutex)); + + space = UT_LIST_GET_FIRST(system->space_list); + + while (space != NULL) { + if (space->purpose != FIL_TABLESPACE || space->id == 0) { + node = UT_LIST_GET_FIRST(space->chain); + + while (node != NULL) { + if (!node->open) { + fil_node_open_file(node, system, + space); + } + if (system->max_n_open < 10 + system->n_open) { + fprintf(stderr, +"InnoDB: Warning: you must raise the value of innodb_max_open_files in\n" +"InnoDB: my.cnf! Remember that InnoDB keeps all log files and all system\n" +"InnoDB: tablespace files open for the whole time mysqld is running, and\n" +"InnoDB: needs to open also some .ibd files if the file-per-table storage\n" +"InnoDB: model is used. Current open files %lu, max allowed open files %lu.\n", + system->n_open, system->max_n_open); + } + node = UT_LIST_GET_NEXT(chain, node); + } + } + space = UT_LIST_GET_NEXT(space_list, space); + } + + mutex_exit(&(system->mutex)); +} + +/*********************************************************************** +Closes all open files. There must not be any pending i/o's or not flushed +modifications in the files. 
*/ + +void +fil_close_all_files(void) +/*=====================*/ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + fil_node_t* node; + + mutex_enter(&(system->mutex)); + + space = UT_LIST_GET_FIRST(system->space_list); + + while (space != NULL) { + node = UT_LIST_GET_FIRST(space->chain); + + while (node != NULL) { + if (node->open) { + fil_node_close_file(node, system); + } + node = UT_LIST_GET_NEXT(chain, node); + } + space = UT_LIST_GET_NEXT(space_list, space); + } + + mutex_exit(&(system->mutex)); +} + +/*********************************************************************** +Sets the max tablespace id counter if the given number is bigger than the +previous value. */ + +void +fil_set_max_space_id_if_bigger( +/*===========================*/ + ulint max_id) /* in: maximum known id */ +{ + fil_system_t* system = fil_system; + + if (max_id >= SRV_LOG_SPACE_FIRST_ID) { + fprintf(stderr, +"InnoDB: Fatal error: max tablespace id is too high, %lu\n", max_id); + ut_a(0); + } + + mutex_enter(&(system->mutex)); + + if (system->max_assigned_id < max_id) { + + system->max_assigned_id = max_id; + } + + mutex_exit(&(system->mutex)); +} + /******************************************************************** -Writes the flushed lsn to the header of each file space. */ +Initializes the ibuf data structure for space 0 == the system tablespace. +This can be called after the file space headers have been created and the +dictionary system has been initialized. 
*/ void fil_ibuf_init_at_db_start(void) @@ -464,39 +1244,37 @@ fil_ibuf_init_at_db_start(void) fil_space_t* space; space = UT_LIST_GET_FIRST(fil_system->space_list); - - while (space) { - if (space->purpose == FIL_TABLESPACE) { - space->ibuf_data = ibuf_data_init_for_space(space->id); - } - space = UT_LIST_GET_NEXT(space_list, space); - } + ut_a(space); + ut_a(space->purpose == FIL_TABLESPACE); + + space->ibuf_data = ibuf_data_init_for_space(space->id); } /******************************************************************** -Writes the flushed lsn and the latest archived log number to the page -header of the first page of a data file. */ +Writes the flushed lsn and the latest archived log number to the page header +of the first page of a data file. */ static ulint fil_write_lsn_and_arch_no_to_file( /*==============================*/ ulint space_id, /* in: space number */ - ulint sum_of_sizes, /* in: combined size of previous files in space, - in database pages */ + ulint sum_of_sizes, /* in: combined size of previous files in + space, in database pages */ dulint lsn, /* in: lsn to write */ ulint arch_log_no) /* in: archived log number to write */ { byte* buf1; byte* buf; + UT_NOT_USED(arch_log_no); + buf1 = mem_alloc(2 * UNIV_PAGE_SIZE); buf = ut_align(buf1, UNIV_PAGE_SIZE); fil_read(TRUE, space_id, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL); mach_write_to_8(buf + FIL_PAGE_FILE_FLUSH_LSN, lsn); - mach_write_to_4(buf + FIL_PAGE_ARCH_LOG_NO, arch_log_no); fil_write(TRUE, space_id, sum_of_sizes, 0, UNIV_PAGE_SIZE, buf, NULL); @@ -505,7 +1283,7 @@ fil_write_lsn_and_arch_no_to_file( /******************************************************************** Writes the flushed lsn and the latest archived log number to the page -header of the first page of each data file. */ +header of the first page of each data file in the system tablespace. 
*/ ulint fil_write_flushed_lsn_to_data_files( @@ -524,18 +1302,20 @@ fil_write_flushed_lsn_to_data_files( space = UT_LIST_GET_FIRST(fil_system->space_list); while (space) { - if (space->purpose == FIL_TABLESPACE) { + /* We only write the lsn to the system tablespace + (space id == 0) files */ + + if (space->id == 0) { + ut_a(space->purpose == FIL_TABLESPACE); sum_of_sizes = 0; node = UT_LIST_GET_FIRST(space->chain); - while (node) { mutex_exit(&(fil_system->mutex)); err = fil_write_lsn_and_arch_no_to_file( - space->id, - sum_of_sizes, - lsn, arch_log_no); + space->id, sum_of_sizes, + lsn, arch_log_no); if (err != DB_SUCCESS) { return(err); @@ -544,11 +1324,11 @@ fil_write_flushed_lsn_to_data_files( mutex_enter(&(fil_system->mutex)); sum_of_sizes += node->size; - node = UT_LIST_GET_NEXT(chain, node); } - } + break; /* there is only one space with id == 0 */ + } space = UT_LIST_GET_NEXT(space_list, space); } @@ -575,8 +1355,9 @@ fil_read_flushed_lsn_and_arch_log_no( byte* buf; byte* buf2; dulint flushed_lsn; - ulint arch_log_no; - + ulint arch_log_no = 0; /* since InnoDB does not archive + its own logs under MySQL, this + parameter is not relevant */ buf2 = ut_malloc(2 * UNIV_PAGE_SIZE); /* Align the memory for a possible read from a raw device */ buf = ut_align(buf2, UNIV_PAGE_SIZE); @@ -584,7 +1365,6 @@ fil_read_flushed_lsn_and_arch_log_no( os_file_read(data_file, buf, 0, 0, UNIV_PAGE_SIZE); flushed_lsn = mach_read_from_8(buf + FIL_PAGE_FILE_FLUSH_LSN); - arch_log_no = mach_read_from_4(buf + FIL_PAGE_ARCH_LOG_NO); ut_free(buf2); @@ -611,143 +1391,959 @@ fil_read_flushed_lsn_and_arch_log_no( } } +/*================ SINGLE-TABLE TABLESPACES ==========================*/ + /*********************************************************************** -Creates a space object and puts it to the file system. */ +Increments the count of pending insert buffer page merges, if space is not +being deleted. 
*/ -void -fil_space_create( -/*=============*/ - char* name, /* in: space name */ - ulint id, /* in: space id */ - ulint purpose)/* in: FIL_TABLESPACE, or FIL_LOG if log */ +ibool +fil_inc_pending_ibuf_merges( +/*========================*/ + /* out: TRUE if being deleted, and ibuf merges should + be skipped */ + ulint id) /* in: space id */ { - fil_space_t* space; - char* name2; - fil_system_t* system = fil_system; - - ut_a(system); - ut_a(name); - -#ifndef UNIV_BASIC_LOG_DEBUG - /* Spaces with an odd id number are reserved to replicate spaces - used in log debugging */ + fil_system_t* system = fil_system; + fil_space_t* space; - ut_anp((purpose == FIL_LOG) || (id % 2 == 0)); -#endif mutex_enter(&(system->mutex)); - space = mem_alloc(sizeof(fil_space_t)); - - name2 = mem_alloc(ut_strlen(name) + 1); + HASH_SEARCH(hash, system->spaces, id, space, space->id == id); - ut_strcpy(name2, name); + if (space == NULL) { + fprintf(stderr, +"InnoDB: Error: trying to do ibuf merge to a dropped tablespace %lu\n", id); + } - space->name = name2; - space->id = id; - space->purpose = purpose; - space->size = 0; + if (space == NULL || space->stop_ibuf_merges) { + mutex_exit(&(system->mutex)); - space->n_reserved_extents = 0; - - UT_LIST_INIT(space->chain); - space->magic_n = FIL_SPACE_MAGIC_N; + return(TRUE); + } - space->ibuf_data = NULL; - - rw_lock_create(&(space->latch)); - rw_lock_set_level(&(space->latch), SYNC_FSP); - - HASH_INSERT(fil_space_t, hash, system->spaces, id, space); + space->n_pending_ibuf_merges++; - UT_LIST_ADD_LAST(space_list, system->space_list, space); - mutex_exit(&(system->mutex)); + + return(FALSE); } /*********************************************************************** -Frees a space object from a file system. Closes the files in the chain -but does not delete them. */ +Decrements the count of pending insert buffer page merges. 
*/ void -fil_space_free( -/*===========*/ +fil_decr_pending_ibuf_merges( +/*========================*/ ulint id) /* in: space id */ { + fil_system_t* system = fil_system; fil_space_t* space; - fil_node_t* fil_node; - fil_system_t* system = fil_system; mutex_enter(&(system->mutex)); HASH_SEARCH(hash, system->spaces, id, space, space->id == id); - HASH_DELETE(fil_space_t, hash, system->spaces, id, space); + if (space == NULL) { + fprintf(stderr, +"InnoDB: Error: decrementing ibuf merge of a dropped tablespace %lu\n", id); + } - UT_LIST_REMOVE(space_list, system->space_list, space); + if (space != NULL) { + space->n_pending_ibuf_merges--; + } - ut_a(space->magic_n == FIL_SPACE_MAGIC_N); + mutex_exit(&(system->mutex)); +} - fil_node = UT_LIST_GET_FIRST(space->chain); +/*********************************************************************** +Deletes a single-table tablespace. The tablespace must be cached in the +memory cache. */ - ut_d(UT_LIST_VALIDATE(chain, fil_node_t, space->chain)); +ibool +fil_delete_tablespace( +/*==================*/ + /* out: TRUE if success */ + ulint id) /* in: space id */ +{ + fil_system_t* system = fil_system; + ibool success; + fil_space_t* space; + fil_node_t* node; + ulint count = 0; + char path[OS_FILE_MAX_PATH]; - while (fil_node != NULL) { - fil_node_free(fil_node, system, space); + ut_a(id != 0); +stop_ibuf_merges: + mutex_enter(&(system->mutex)); - fil_node = UT_LIST_GET_FIRST(space->chain); - } + HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + + if (space != NULL) { + space->stop_ibuf_merges = TRUE; + + if (space->n_pending_ibuf_merges == 0) { + mutex_exit(&(system->mutex)); + + count = 0; + + goto try_again; + } else { + if (count > 5000) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Warning: trying to delete tablespace %s,\n" +"InnoDB: but there are %lu pending ibuf merges on it.\n" +"InnoDB: Loop %lu.\n", space->name, space->n_pending_ibuf_merges, count); + } + + mutex_exit(&(system->mutex)); + + 
os_thread_sleep(20000); + count++; + + goto stop_ibuf_merges; + } + } + + mutex_exit(&(system->mutex)); + count = 0; + +try_again: + mutex_enter(&(system->mutex)); + + HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + + if (space == NULL) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error: cannot delete tablespace %lu because it is not found\n" +"InnoDB: in the tablespace memory cache.\n", id); + + mutex_exit(&(system->mutex)); - ut_d(UT_LIST_VALIDATE(chain, fil_node_t, space->chain)); - ut_ad(0 == UT_LIST_GET_LEN(space->chain)); + return(FALSE); + } + + ut_a(space); + ut_a(strlen(space->name) < OS_FILE_MAX_PATH); + ut_a(space->n_pending_ibuf_merges == 0); + + strcpy(path, space->name); + + space->is_being_deleted = TRUE; + + ut_a(UT_LIST_GET_LEN(space->chain) == 1); + node = UT_LIST_GET_FIRST(space->chain); + + if (space->n_pending_flushes > 0 || node->n_pending > 0) { + if (count > 1000) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Warning: trying to delete tablespace %s,\n" +"InnoDB: but there are %lu flushes and %lu pending i/o's on it\n" +"InnoDB: Loop %lu.\n", space->name, space->n_pending_flushes, node->n_pending, + count); + } + mutex_exit(&(system->mutex)); + os_thread_sleep(20000); + + count++; + + goto try_again; + } mutex_exit(&(system->mutex)); - mem_free(space->name); - mem_free(space); + /* Invalidate in the buffer pool all pages belonging to the + tablespace. Since we have set space->is_being_deleted = TRUE, readahead + or ibuf merge can no longer read more pages of this tablespace to the + buffer pool. Thus we can clean the tablespace out of the buffer pool + completely and permanently. The flag is_being_deleted also prevents + fil_flush() from being applied to this tablespace. 
*/ + + buf_LRU_invalidate_tablespace(id); + + success = fil_space_free(id); + + if (success) { + success = os_file_delete(path); + + if (success) { + + return(TRUE); + } + } + + return(FALSE); } /*********************************************************************** -Returns the size of the space in pages. */ +Discards a single-table tablespace. The tablespace must be cached in the +memory cache. Discarding is like deleting a tablespace, but +1) we do not drop the table from the data dictionary; +2) we remove all insert buffer entries for the tablespace immediately; in DROP +TABLE they are only removed gradually in the background; +3) when the user does IMPORT TABLESPACE, the tablespace will have the same id +as it originally had. */ -ulint -fil_space_get_size( -/*===============*/ - /* out: space size */ +ibool +fil_discard_tablespace( +/*===================*/ + /* out: TRUE if success */ ulint id) /* in: space id */ { - fil_space_t* space; + ibool success; + + success = fil_delete_tablespace(id); + + if (!success) { + fprintf(stderr, +"InnoDB: Warning: cannot delete tablespace %lu in DISCARD TABLESPACE.\n" +"InnoDB: But let us remove the insert buffer entries for this tablespace.\n", + id); + } + + /* Remove all insert buffer entries for the tablespace */ + + ibuf_delete_for_discarded_space(id); + + return(TRUE); +} + +/*********************************************************************** +Renames the memory cache structures of a single-table tablespace. 
*/ +static +ibool +fil_rename_tablespace_in_mem( +/*=========================*/ + /* out: TRUE if success */ + fil_space_t* space, /* in: tablespace memory object */ + fil_node_t* node, /* in: file node of that tablespace */ + char* path) /* in: new name */ +{ fil_system_t* system = fil_system; - ulint size; + fil_space_t* space2; + char* old_name = space->name; + + HASH_SEARCH(name_hash, system->name_hash, ut_fold_string(old_name), + space2, 0 == strcmp(old_name, space2->name)); + if (space != space2) { + fprintf(stderr, +"InnoDB: Error: cannot find %s in tablespace memory cache\n", old_name); - ut_ad(system); + return(FALSE); + } + + HASH_SEARCH(name_hash, system->name_hash, ut_fold_string(path), + space2, 0 == strcmp(path, space2->name)); + if (space2 != NULL) { + fprintf(stderr, +"InnoDB: Error: %s is already in tablespace memory cache\n", path); + + return(FALSE); + } + + HASH_DELETE(fil_space_t, name_hash, system->name_hash, + ut_fold_string(space->name), space); + mem_free(space->name); + mem_free(node->name); + + space->name = mem_alloc(strlen(path) + 1); + node->name = mem_alloc(strlen(path) + 1); + + strcpy(space->name, path); + strcpy(node->name, path); + + HASH_INSERT(fil_space_t, name_hash, system->name_hash, + ut_fold_string(path), space); + return(TRUE); +} + +/*********************************************************************** +Renames a single-table tablespace. The tablespace must be cached in the +tablespace memory cache. 
*/ + +ibool +fil_rename_tablespace( +/*==================*/ + /* out: TRUE if success */ + char* old_name, /* in: old table name in the standard + databasename/tablename format of InnoDB */ + ulint id, /* in: space id */ + char* new_name) /* in: new table name in the standard + databasename/tablename format of InnoDB */ +{ + fil_system_t* system = fil_system; + ibool success; + fil_space_t* space; + fil_node_t* node; + ulint count = 0; + char old_path[OS_FILE_MAX_PATH]; + char path[OS_FILE_MAX_PATH]; + + ut_a(id != 0); +retry: + count++; + + if (count > 1000) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Warning: problems renaming %s to %s, %lu iterations\n", + old_name, new_name, count); + } mutex_enter(&(system->mutex)); HASH_SEARCH(hash, system->spaces, id, space, space->id == id); - size = space->size; + if (space == NULL) { + fprintf(stderr, +"InnoDB: Error: cannot find space id %lu from the tablespace memory cache\n" +"InnoDB: though the table %s in a rename operation should have that id\n", + id, old_name); + mutex_exit(&(system->mutex)); + + return(FALSE); + } + + if (count > 25000) { + space->stop_ios = FALSE; + mutex_exit(&(system->mutex)); + + return(FALSE); + } + + /* We temporarily close the .ibd file because we do not trust that + operating systems can rename an open file. For the closing we have to + wait until there are no pending i/o's or flushes on the file. 
*/ + + space->stop_ios = TRUE; + + ut_a(UT_LIST_GET_LEN(space->chain) == 1); + node = UT_LIST_GET_FIRST(space->chain); + + if (node->n_pending > 0 || node->n_pending_flushes > 0) { + /* There are pending i/o's or flushes, sleep for a while and + retry */ + + mutex_exit(&(system->mutex)); + + os_thread_sleep(20000); + + goto retry; + + } else if (node->modification_counter > node->flush_counter) { + /* Flush the space */ + + mutex_exit(&(system->mutex)); + + os_thread_sleep(20000); + + fil_flush(id); + + goto retry; + + } else if (node->open) { + /* Close the file */ + + fil_node_close_file(node, system); + } + + /* Check that the old name in the space is right */ + ut_a(strlen(old_name) < OS_FILE_MAX_PATH - 10); + + sprintf(old_path, "./%s.ibd", old_name); + + srv_normalize_path_for_win(old_path); + + ut_a(strcmp(space->name, old_path) == 0); + ut_a(strcmp(node->name, old_path) == 0); + + /* Rename the tablespace and the node in the memory cache */ + + ut_a(strlen(new_name) < OS_FILE_MAX_PATH - 10); + + sprintf(path, "./%s.ibd", new_name); + + srv_normalize_path_for_win(path); + + success = fil_rename_tablespace_in_mem(space, node, path); + + if (!success) { + + goto func_exit; + } + + success = os_file_rename(old_path, path); + + if (!success) { + /* We have to revert the changes we made to the tablespace + memory cache */ + + ut_a(fil_rename_tablespace_in_mem(space, node, old_path)); + } +func_exit: + space->stop_ios = FALSE; + mutex_exit(&(system->mutex)); - return(size); + return(success); } /*********************************************************************** -Checks if the pair space, page_no refers to an existing page in a -tablespace file space. */ +Creates a new single-table tablespace to a database directory of MySQL. +Database directories are under the 'datadir' of MySQL. The datadir is the +directory of a running mysqld program. We can refer to it by simply the +path '.'. 
*/ + +ulint +fil_create_new_single_table_tablespace( +/*===================================*/ + /* out: DB_SUCCESS or error code */ + ulint* space_id, /* out: space id */ + char* tablename, /* in: the table name in the usual + databasename/tablename format of InnoDB */ + ulint size) /* in: the initial size of the tablespace file + in pages, must be > 0 */ +{ + os_file_t file; + ibool ret; + ulint err; + byte* page; + ibool success; + char path[OS_FILE_MAX_PATH]; + + ut_a(strlen(tablename) < OS_FILE_MAX_PATH - 10); + + sprintf(path, "./%s.ibd", tablename); + + srv_normalize_path_for_win(path); + + file = os_file_create(path, OS_FILE_CREATE, OS_FILE_NORMAL, + OS_DATA_FILE, &ret); + if (ret == FALSE) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error creating file %s.\n", path); + + /* The following call will print an error message */ + + err = os_file_get_last_error(TRUE); + + if (err == OS_FILE_ALREADY_EXISTS) { + fprintf(stderr, +"InnoDB: The file already exists though the corresponding table did not\n" +"InnoDB: exist in the InnoDB data dictionary. Have you moved InnoDB\n" +"InnoDB: .ibd files around without using the SQL commands\n" +"InnoDB: DISCARD TABLESPACE and IMPORT TABLESPACE, or did\n" +"InnoDB: mysqld crash in the middle of CREATE TABLE? 
You can\n" +"InnoDB: resolve the problem by removing the file %s\n" +"InnoDB: under the 'datadir' of MySQL.\n", path); + + return(DB_TABLESPACE_ALREADY_EXISTS); + } + + if (err == OS_FILE_DISK_FULL) { + + return(DB_OUT_OF_FILE_SPACE); + } + + return(DB_ERROR); + } + + page = ut_malloc(UNIV_PAGE_SIZE); + + ret = os_file_set_size(path, file, size * UNIV_PAGE_SIZE, 0); + + if (!ret) { + ut_free(page); + os_file_close(file); + os_file_delete(path); + + return(DB_OUT_OF_FILE_SPACE); + } + + *space_id = fil_assign_new_space_id(); + + if (*space_id == ULINT_UNDEFINED) { + ut_free(page); + os_file_close(file); + os_file_delete(path); + + return(DB_ERROR); + } + + /* We have to write the space id to the file immediately and flush the + file to disk. This is because in crash recovery we must be aware what + tablespaces exist and what are their space id's, so that we can apply + the log records to the right file. It may take quite a while until + buffer pool flush algorithms write anything to the file and flush it to + disk. If we would not write here anything, the file would be filled + with zeros from the call of os_file_set_size(), until a buffer pool + flush would write to it. 
*/ + + memset(page, '\0', UNIV_PAGE_SIZE); + + fsp_header_write_space_id(page, *space_id); + + buf_flush_init_for_writing(page, ut_dulint_zero, *space_id, 0); + + ret = os_file_write(path, file, page, 0, 0, UNIV_PAGE_SIZE); + + ut_free(page); + + if (!ret) { + fprintf(stderr, +"InnoDB: Error: could not write the first page to tablespace %s\n", path); + + os_file_close(file); + os_file_delete(path); + + return(DB_ERROR); + } + + ret = os_file_flush(file); + + if (!ret) { + fprintf(stderr, +"InnoDB: Error: file flush of tablespace %s failed\n", path); + + os_file_close(file); + os_file_delete(path); + + return(DB_ERROR); + } + + os_file_close(file); + + if (*space_id == ULINT_UNDEFINED) { + os_file_delete(path); + + return(DB_ERROR); + } + + success = fil_space_create(path, *space_id, FIL_TABLESPACE); + + if (!success) { + os_file_delete(path); + + return(DB_ERROR); + } + + fil_node_create(path, size, *space_id, FALSE); + + return(DB_SUCCESS); +} + +/************************************************************************ +Tries to open a single-table tablespace and checks the space id is right in +it. If does not succeed, prints an error message to the .err log. This +function is used to open the tablespace when we load a table definition +to the dictionary cache. NOTE that we assume this operation is used under the +protection of the dictionary mutex, so that two users cannot race here. This +operation does not leave the file associated with the tablespace open, but +closes it after we have looked at the space id in it. 
*/ ibool -fil_check_adress_in_tablespace( -/*===========================*/ - /* out: TRUE if the address is meaningful */ +fil_open_single_table_tablespace( +/*=============================*/ + /* out: TRUE if success */ ulint id, /* in: space id */ - ulint page_no)/* in: page number */ + char* name) /* in: table name in the databasename/tablename + format */ { + os_file_t file; + char* filepath; + ibool success; + byte* page; + ulint space_id; + ibool ret = TRUE; + + filepath = ut_malloc(OS_FILE_MAX_PATH); + + ut_a(strlen(name) < OS_FILE_MAX_PATH - 10); + + sprintf(filepath, "./%s.ibd", name); + + srv_normalize_path_for_win(filepath); + + file = os_file_create_simple_no_error_handling(filepath, OS_FILE_OPEN, + OS_FILE_READ_ONLY, &success); + if (!success) { + /* The following call prints an error message */ + os_file_get_last_error(TRUE); + + ut_print_timestamp(stderr); + + fprintf(stderr, +" InnoDB: Error: trying to open a table, but could not\n" +"InnoDB: open the tablespace file %s!\n", filepath); + fprintf(stderr, +"InnoDB: have you moved InnoDB .ibd files around without using the\n" +"InnoDB: commands DISCARD TABLESPACE and IMPORT TABLESPACE?\n"); + + ut_free(filepath); + + return(FALSE); + } + + /* Read the first page of the tablespace */ + + page = ut_malloc(UNIV_PAGE_SIZE); + + success = os_file_read(file, page, 0, 0, UNIV_PAGE_SIZE); + + /* We have to read the tablespace id from the file */ + + space_id = fsp_header_get_space_id(page); + + if (space_id != id) { + ut_print_timestamp(stderr); + + fprintf(stderr, +" InnoDB: Error: tablespace id in file %s is %lu, but in the InnoDB\n" +"InnoDB: data dictionary it is %lu.\n", filepath, space_id, id); + fprintf(stderr, +"InnoDB: Have you moved InnoDB .ibd files around without using the\n" +"InnoDB: commands DISCARD TABLESPACE and IMPORT TABLESPACE?\n"); + + ret = FALSE; + + goto func_exit; + } + + success = fil_space_create(filepath, space_id, FIL_TABLESPACE); + + if (!success) { + goto func_exit; + } + + /* We 
do not measure the size of the file, that is why we pass the 0 + below */ + + fil_node_create(filepath, 0, space_id, FALSE); +func_exit: + os_file_close(file); + ut_free(page); + ut_free(filepath); + + return(ret); +} + +/************************************************************************ +Opens an .ibd file and adds the associated single-table tablespace to the +InnoDB fil0fil.c data structures. */ +static +void +fil_load_single_table_tablespace( +/*=============================*/ + char* dbname, /* in: database name */ + char* filename) /* in: file name (not a path), including the + .ibd extension */ +{ + os_file_t file; + char* filepath; + ibool success; + byte* page; + ulint space_id; + ulint size_low; + ulint size_high; + ib_longlong size; + + filepath = ut_malloc(OS_FILE_MAX_PATH); + + ut_a(strlen(dbname) + strlen(filename) < OS_FILE_MAX_PATH - 10); + + sprintf(filepath, "./%s/%s", dbname, filename); + + srv_normalize_path_for_win(filepath); + + file = os_file_create_simple_no_error_handling(filepath, OS_FILE_OPEN, + OS_FILE_READ_ONLY, &success); + if (!success) { + /* The following call prints an error message */ + os_file_get_last_error(TRUE); + + fprintf(stderr, +"InnoDB: Error: could not open single-table tablespace file\n" +"InnoDB: %s!", filepath); + + ut_free(filepath); + + return; + } + + success = os_file_get_size(file, &size_low, &size_high); + + if (!success) { + /* The following call prints an error message */ + os_file_get_last_error(TRUE); + + fprintf(stderr, +"InnoDB: Error: could not measure the size of single-table tablespace file\n" +"InnoDB: %s!", filepath); + + os_file_close(file); + ut_free(filepath); + + return; + } + + size = (((ib_longlong)size_high) << 32) + (ib_longlong)size_low; + + if (size < 4 * UNIV_PAGE_SIZE) { + fprintf(stderr, +"InnoDB: Error: the size of single-table tablespace file %s\n" +"InnoDB: is only %lu %lu, should be at least %lu!", filepath, size_high, + size_low, (ulint)4 * UNIV_PAGE_SIZE); + 
os_file_close(file); + ut_free(filepath); + + return; + } + + /* Read the first page of the tablespace */ + + page = ut_malloc(UNIV_PAGE_SIZE); + + success = os_file_read(file, page, 0, 0, UNIV_PAGE_SIZE); + + /* We have to read the tablespace id from the file */ + + space_id = fsp_header_get_space_id(page); + + if (space_id == ULINT_UNDEFINED || space_id == 0) { + fprintf(stderr, +"InnoDB: Error: tablespace id %lu in file %s is not sensible\n", space_id, + filepath); + goto func_exit; + } + + success = fil_space_create(filepath, space_id, FIL_TABLESPACE); + + if (!success) { + + goto func_exit; + } + + /* We do not measure the size of the file, that is why we pass the 0 + below */ + + fil_node_create(filepath, 0, space_id, FALSE); +func_exit: + os_file_close(file); + ut_free(page); + ut_free(filepath); +} + +/************************************************************************ +At the server startup, if we need crash recovery, scans the database +directories under the MySQL datadir, looking for .ibd files. Those files are +single-table tablespaces. We need to know the space id in each of them so that +we know into which file we should look to check the contents of a page stored +in the doublewrite buffer, also to know where to apply log records where the +space id is != 0. */ + +ulint +fil_load_single_table_tablespaces(void) +/*===================================*/ + /* out: DB_SUCCESS or error number */ +{ + int ret; + char* dbpath; + os_file_dir_t dir; + os_file_dir_t dbdir; + os_file_stat_t dbinfo; + os_file_stat_t fileinfo; + + /* The datadir of MySQL is always the default directory of mysqld */ + + dir = os_file_opendir((char*)".", TRUE); + + if (dir == NULL) { + + return(DB_ERROR); + } + + dbpath = ut_malloc(OS_FILE_MAX_PATH); + + /* Scan all directories under the datadir. They are the database + directories of MySQL. 
*/ + + ret = os_file_readdir_next_file((char*)".", dir, &dbinfo); + + while (ret == 0) { + /* printf("Looking at %s in datadir\n", dbinfo.name); */ + + if (dbinfo.type == OS_FILE_TYPE_FILE + || dbinfo.type == OS_FILE_TYPE_UNKNOWN) { + + goto next_datadir_item; + } + + /* We found a symlink or a directory; try opening it to see + if a symlink is a directory */ + + ut_a(strlen(dbinfo.name) < OS_FILE_MAX_PATH - 10); + + sprintf(dbpath, "./%s", dbinfo.name); + + srv_normalize_path_for_win(dbpath); + + dbdir = os_file_opendir(dbpath, FALSE); + + if (dbdir != NULL) { + /* printf("Opened dir %s\n", dbinfo.name); */ + + /* We found a database directory; loop through it, + looking for possible .ibd files in it */ + + ret = os_file_readdir_next_file(dbpath, dbdir, + &fileinfo); + while (ret == 0) { + /* printf( +" Looking at file %s\n", fileinfo.name); */ + + if (fileinfo.type == OS_FILE_TYPE_DIR + || dbinfo.type == OS_FILE_TYPE_UNKNOWN) { + goto next_file_item; + } + + /* We found a symlink or a file */ + if (strlen(fileinfo.name) > 4 + && 0 == strcmp(fileinfo.name + + strlen(fileinfo.name) - 4, + ".ibd")) { + /* The name ends in .ibd; try opening + the file */ + fil_load_single_table_tablespace( + dbinfo.name, fileinfo.name); + } +next_file_item: + ret = os_file_readdir_next_file(dbpath, dbdir, + &fileinfo); + } + + if (0 != os_file_closedir(dbdir)) { + fprintf(stderr, +"InnoDB: Warning: could not close database directory %s\n", dbpath); + } + } + +next_datadir_item: + ret = os_file_readdir_next_file((char*)".", dir, &dbinfo); + } + + ut_free(dbpath); + + /* At the end of directory we should get 1 as the return value, -1 + if there was an error */ + if (ret != 1) { + fprintf(stderr, +"InnoDB: Error: os_file_readdir_next_file returned %d in MySQL datadir\n", + ret); + os_file_closedir(dir); + + return(DB_ERROR); + } + + if (0 != os_file_closedir(dir)) { + fprintf(stderr, +"InnoDB: Error: could not close MySQL datadir\n"); + + return(DB_ERROR); + } + + return(DB_SUCCESS); +} 
+ +/************************************************************************ +If we need crash recovery, and we have called +fil_load_single_table_tablespaces() and dict_load_single_table_tablespaces(), +we can call this function to print an error message of orphaned .ibd files +for which there is not a data dictionary entry with a matching table name +and space id. */ + +void +fil_print_orphaned_tablespaces(void) +/*================================*/ +{ + fil_system_t* system = fil_system; + fil_space_t* space; + + mutex_enter(&(system->mutex)); + + space = UT_LIST_GET_FIRST(system->space_list); + + while (space) { + if (space->purpose == FIL_TABLESPACE && space->id != 0 + && !space->mark) { + fprintf(stderr, +"InnoDB: Warning: tablespace %s of id %lu has no matching table in\n" +"InnoDB: the InnoDB data dixtionary.\n", space->name, space->id); + } + + space = UT_LIST_GET_NEXT(space_list, space); + + } + + mutex_exit(&(system->mutex)); +} + +/*********************************************************************** +Returns TRUE if a single-table tablespace does not exist in the memory cache, +or is being deleted there. 
*/ + +ibool +fil_tablespace_deleted_or_being_deleted_in_mem( +/*===========================================*/ + /* out: TRUE if does not exist or is being\ + deleted */ + ulint id, /* in: space id */ + ib_longlong version)/* in: tablespace_version should be this; if + you pass -1 as the value of this, then this + parameter is ignored */ +{ + fil_system_t* system = fil_system; fil_space_t* space; + + ut_ad(system); + + mutex_enter(&(system->mutex)); + + HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + + if (space == NULL || space->is_being_deleted) { + mutex_exit(&(system->mutex)); + + return(TRUE); + } + + if (version != -1LL && space->tablespace_version != version) { + mutex_exit(&(system->mutex)); + + return(TRUE); + } + + mutex_exit(&(system->mutex)); + + return(FALSE); +} + +/*********************************************************************** +Returns TRUE if a single-table tablespace exists in the memory cache. */ + +ibool +fil_tablespace_exists_in_mem( +/*=========================*/ + /* out: TRUE if exists */ + ulint id) /* in: space id */ +{ fil_system_t* system = fil_system; - ulint size; - ibool ret; - + fil_space_t* space; + ut_ad(system); mutex_enter(&(system->mutex)); @@ -755,24 +2351,297 @@ fil_check_adress_in_tablespace( HASH_SEARCH(hash, system->spaces, id, space, space->id == id); if (space == NULL) { - ret = FALSE; - } else { - size = space->size; + mutex_exit(&(system->mutex)); + + return(FALSE); + } + + mutex_exit(&(system->mutex)); + + return(TRUE); +} + +/*********************************************************************** +Returns TRUE if a matching tablespace exists in the InnoDB tablespace memory +cache. Note that if we have not done a crash recovery at the database startup, +there may be many tablespaces which are not yet in the memory cache. 
*/ - if (page_no > size) { - ret = FALSE; - } else if (space->purpose != FIL_TABLESPACE) { - ret = FALSE; +ibool +fil_space_for_table_exists_in_mem( +/*==============================*/ + /* out: TRUE if a matching tablespace + exists in the memory cache */ + ulint id, /* in: space id */ + char* name, /* in: table name in the standard + 'databasename/tablename' format */ + ibool mark_space, /* in: in crash recovery, at database startup + we mark all spaces which have an associated + table in the InnoDB data dictionary, so that + we can print a warning about orphaned + tablespaces */ + ibool print_error_if_does_not_exist) + /* in: print detailed error information to + the .err log if a matching tablespace is + not found from memory */ +{ + fil_system_t* system = fil_system; + fil_space_t* namespace; + fil_space_t* space; + char path[OS_FILE_MAX_PATH]; + + ut_ad(system); + + mutex_enter(&(system->mutex)); + + sprintf(path, "./%s.ibd", name); + srv_normalize_path_for_win(path); + + /* Look if there is a space with the same id */ + + HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + + /* Look if there is a space with the same name; the name is the + directory path from the datadir to the file */ + + HASH_SEARCH(name_hash, system->name_hash, + ut_fold_string(path), namespace, + 0 == strcmp(namespace->name, path)); + if (!print_error_if_does_not_exist) { + if (space && space == namespace) { + if (mark_space) { + space->mark = TRUE; + } + + mutex_exit(&(system->mutex)); + + return(TRUE); + } + + mutex_exit(&(system->mutex)); + + return(FALSE); + } + + if (space == NULL) { + if (namespace == NULL) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error: table %s in InnoDB data dictionary has tablespace\n" +"InnoDB: id %lu, but tablespace with that id or name does not exist. Have\n" +"InnoDB: you deleted or moved .ibd files? 
We cannot open table %s now.\n", + name, id, name); } else { - ret = TRUE; + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error: table %s in InnoDB data dictionary has tablespace\n" +"InnoDB: id %lu, but tablespace with that id does not exist. There is\n" +"InnoDB: a tablespace of name %s and id %lu, though. Have\n" +"InnoDB: you deleted or moved .ibd files? We cannot open table %s now.\n", + name, id, namespace->name, namespace->id, name); } + + mutex_exit(&(system->mutex)); + + return(FALSE); } - + + if (0 != strcmp(space->name, path)) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error: table %s in InnoDB data dictionary has tablespace\n" +"InnoDB: id %lu, but tablespace with that id has name %s. Have you\n" +"InnoDB: deleted or moved .ibd files? We cannot open table %s now.\n", + name, id, space->name, name); + if (namespace != NULL) { + fprintf(stderr, +"InnoDB: There is a tablespace with the right name %s, but its id is %lu.\n", + namespace->name, namespace->id); + } + + mutex_exit(&(system->mutex)); + + return(FALSE); + } + + ut_a(space == namespace); + + if (mark_space) { + space->mark = TRUE; + } + mutex_exit(&(system->mutex)); - return(ret); + return(TRUE); +} + +/************************************************************************** +Tries to extend a data file by the number of pages given. Fractions of 1 MB +are ignored. The tablespace must be cached in the memory cache. 
*/ + +ibool +fil_extend_last_data_file( +/*======================*/ + /* out: TRUE if success, also if we run + out of disk space we may return TRUE */ + ulint* actual_increase,/* out: number of pages we were able to + extend, here the original size of the file and + the resulting size of the file are rounded + downwards to a full megabyte, and the + difference expressed in pages is returned */ + ulint space_id, /* in: space id */ + ulint size, /* in: current size of the space in pages, as + stored in the fsp header */ + ulint size_increase) /* in: try to extend this many pages */ +{ + fil_system_t* system = fil_system; + fil_node_t* node; + fil_space_t* space; + byte* buf2; + byte* buf; + ibool success; + ulint i; + + fil_mutex_enter_and_prepare_for_io(space_id); + + HASH_SEARCH(hash, system->spaces, space_id, space, + space->id == space_id); + ut_a(space); + + node = UT_LIST_GET_LAST(space->chain); + + fil_node_prepare_for_io(node, system, space); + + if (UT_LIST_GET_LEN(space->chain) == 1 && node->size < size) { + ut_print_timestamp(stderr); + fprintf(stderr, +"InnoDB: Fatal error: space %s id %lu size stored in header is %lu pages\n" +"InnoDB: but actual size is only %lu pages (possibly rounded downwards)!\n" +"InnoDB: Cannot continue operation!\n", space->name, space->id, size, + node->size); + exit(1); + } + + buf2 = mem_alloc(1024 * 1024 + UNIV_PAGE_SIZE); + buf = ut_align(buf2, UNIV_PAGE_SIZE); + + memset(buf, '\0', 1024 * 1024); + + for (i = 0; i < size_increase / ((1024 * 1024) / UNIV_PAGE_SIZE); + i++) { + /* If we use native Windows aio, then we use it also in this + write */ + + success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC, + node->name, node->handle, buf, + (node->size << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFFUL, + node->size >> (32 - UNIV_PAGE_SIZE_SHIFT), + 1024 * 1024, NULL, NULL); + + if (!success) { + break; + } + + node->size += ((1024 * 1024) / UNIV_PAGE_SIZE); + space->size += ((1024 * 1024) / UNIV_PAGE_SIZE); + + os_has_said_disk_full = FALSE; + } 
+ + mem_free(buf2); + + fil_node_complete_io(node, system, OS_FILE_WRITE); + + mutex_exit(&(system->mutex)); + + *actual_increase = i * ((1024 * 1024) / UNIV_PAGE_SIZE); + + fil_flush(space_id); + + if (space_id == 0) { + srv_data_file_sizes[srv_n_data_files - 1] += *actual_increase; + } + + return(TRUE); } +/************************************************************************** +Tries to extend a data file so that it would accommodate the number of pages +given. The tablespace must be cached in the memory cache. */ + +ibool +fil_extend_data_file_with_pages( +/*============================*/ + /* out: TRUE if success */ + ulint space_id, /* in: space id, must be != 0 */ + ulint size, /* in: current size of the space in pages, as + stored in the fsp header */ + ulint size_after_extend)/* in: desired size in pages after the + extension, should be less than 4 GB (this + function is primarily intended for increasing + the data file size from < 64 pages to up to + 64 pages) */ +{ + fil_system_t* system = fil_system; + fil_node_t* node; + fil_space_t* space; + byte* buf2; + byte* buf; + ibool success; + + ut_a(space_id != 0); + ut_a(size_after_extend < 64 * 4096); + ut_a(size_after_extend >= size); + + fil_mutex_enter_and_prepare_for_io(space_id); + + HASH_SEARCH(hash, system->spaces, space_id, space, + space->id == space_id); + ut_a(space); + + node = UT_LIST_GET_LAST(space->chain); + + fil_node_prepare_for_io(node, system, space); + + if (UT_LIST_GET_LEN(space->chain) == 1 && node->size < size) { + ut_print_timestamp(stderr); + fprintf(stderr, +"InnoDB: Fatal error: space %s id %lu size stored in header is %lu pages\n" +"InnoDB: but actual size is only %lu pages (possibly rounded downwards)!\n" +"InnoDB: Cannot continue operation!\n", space->name, space_id, size, + node->size); + exit(1); + } + + buf2 = mem_alloc((1 + size_after_extend - size) * UNIV_PAGE_SIZE); + buf = ut_align(buf2, UNIV_PAGE_SIZE); + + memset(buf, '\0', (size_after_extend - size) * 
UNIV_PAGE_SIZE); + + success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC, + node->name, node->handle, buf, + UNIV_PAGE_SIZE * size, 0, + UNIV_PAGE_SIZE * (size_after_extend - size), + NULL, NULL); + if (success) { + node->size = size_after_extend; + space->size = size_after_extend; + + os_has_said_disk_full = FALSE; + } + + mem_free(buf2); + + fil_node_complete_io(node, system, OS_FILE_WRITE); + + mutex_exit(&(system->mutex)); + + fil_flush(space_id); + + return(success); +} + +/*========== RESERVE FREE EXTENTS (for a B-tree split, for example) ===*/ + /*********************************************************************** Tries to reserve free extents in a file space. */ @@ -784,8 +2653,8 @@ fil_space_reserve_free_extents( ulint n_free_now, /* in: number of free extents now */ ulint n_to_reserve) /* in: how many one wants to reserve */ { - fil_space_t* space; fil_system_t* system = fil_system; + fil_space_t* space; ibool success; ut_ad(system); @@ -794,6 +2663,8 @@ fil_space_reserve_free_extents( HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + ut_a(space); + if (space->n_reserved_extents + n_to_reserve > n_free_now) { success = FALSE; } else { @@ -815,8 +2686,8 @@ fil_space_release_free_extents( ulint id, /* in: space id */ ulint n_reserved) /* in: how many one reserved */ { - fil_space_t* space; fil_system_t* system = fil_system; + fil_space_t* space; ut_ad(system); @@ -824,6 +2695,7 @@ fil_space_release_free_extents( HASH_SEARCH(hash, system->spaces, id, space, space->id == id); + ut_a(space); ut_a(space->n_reserved_extents >= n_reserved); space->n_reserved_extents -= n_reserved; @@ -840,8 +2712,8 @@ fil_space_get_n_reserved_extents( /*=============================*/ ulint id) /* in: space id */ { - fil_space_t* space; fil_system_t* system = fil_system; + fil_space_t* space; ulint n; ut_ad(system); @@ -859,204 +2731,94 @@ fil_space_get_n_reserved_extents( return(n); } +/*============================ FILE I/O ================================*/ + 
/************************************************************************ +NOTE: you must call fil_mutex_enter_and_prepare_for_io() first! + Prepares a file node for i/o. Opens the file if it is closed. Updates the pending i/o's field in the node and the system appropriately. Takes the node -off the LRU list if it is in the LRU list. */ +off the LRU list if it is in the LRU list. The caller must hold the fil_sys +mutex. */ static void fil_node_prepare_for_io( /*====================*/ fil_node_t* node, /* in: file node */ - fil_system_t* system, /* in: file system */ + fil_system_t* system, /* in: tablespace memory cache */ fil_space_t* space) /* in: space */ { - ibool ret; - fil_node_t* last_node; - ut_ad(node && system && space); ut_ad(mutex_own(&(system->mutex))); + if (system->n_open > system->max_n_open + 5) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Warning: open files %lu exceeds the limit %lu\n", + system->n_open, system->max_n_open); + } + if (node->open == FALSE) { - /* File is closed */ + /* File is closed: open it */ ut_a(node->n_pending == 0); - /* If too many files are open, close one */ - - if (system->n_open_pending + UT_LIST_GET_LEN(system->LRU) - == system->max_n_open) { - - ut_a(UT_LIST_GET_LEN(system->LRU) > 0); - - last_node = UT_LIST_GET_LAST(system->LRU); - - if (last_node == NULL) { - fprintf(stderr, - "InnoDB: Error: cannot close any file to open another for i/o\n" - "InnoDB: Pending i/o's on %lu files exist\n", - system->n_open_pending); - - ut_a(0); - } - - fil_node_close(last_node, system); - } - - if (space->purpose == FIL_LOG) { - node->handle = os_file_create(node->name, OS_FILE_OPEN, - OS_FILE_AIO, OS_LOG_FILE, &ret); - } else { - node->handle = os_file_create(node->name, OS_FILE_OPEN, - OS_FILE_AIO, OS_DATA_FILE, &ret); - } - - ut_a(ret); - - node->open = TRUE; - - system->n_open_pending++; - node->n_pending = 1; - - /* File was closed: the node was not in the LRU list */ - - return; + fil_node_open_file(node, 
system, space); } - /* File is open */ - if (node->n_pending == 0) { + if (node->n_pending == 0 && space->purpose == FIL_TABLESPACE + && space->id != 0) { /* The node is in the LRU list, remove it */ - UT_LIST_REMOVE(LRU, system->LRU, node); - - system->n_open_pending++; - node->n_pending = 1; - } else { - /* There is already a pending i/o-op on the file: the node is - not in the LRU list */ + ut_a(UT_LIST_GET_LEN(system->LRU) > 0); - node->n_pending++; + UT_LIST_REMOVE(LRU, system->LRU, node); } + + node->n_pending++; } /************************************************************************ Updates the data structures when an i/o operation finishes. Updates the -pending i/os field in the node and the system appropriately. Puts the node -in the LRU list if there are no other pending i/os. */ +pending i/o's field in the node appropriately. */ static void fil_node_complete_io( /*=================*/ fil_node_t* node, /* in: file node */ - fil_system_t* system, /* in: file system */ - ulint type) /* in: OS_FILE_WRITE or ..._READ */ + fil_system_t* system, /* in: tablespace memory cache */ + ulint type) /* in: OS_FILE_WRITE or OS_FILE_READ; marks + the node as modified if + type == OS_FILE_WRITE */ { ut_ad(node); ut_ad(system); ut_ad(mutex_own(&(system->mutex))); + ut_a(node->n_pending > 0); node->n_pending--; - if (type != OS_FILE_READ) { - node->is_modified = TRUE; + if (type == OS_FILE_WRITE) { + system->modification_counter++; + node->modification_counter = system->modification_counter; } - if (node->n_pending == 0) { + if (node->n_pending == 0 && node->space->purpose == FIL_TABLESPACE + && node->space->id != 0) { /* The node must be put back to the LRU list */ UT_LIST_ADD_FIRST(LRU, system->LRU, node); - - ut_a(system->n_open_pending > 0); - - system->n_open_pending--; - - if (system->n_open_pending == system->max_n_open - 1) { - - os_event_set(system->can_open); - } } } - -/************************************************************************** -Tries to 
extend a data file by the number of pages given. Any fractions of a -megabyte are ignored. */ - -ibool -fil_extend_last_data_file( -/*======================*/ - /* out: TRUE if success, also if we run - out of disk space we may return TRUE */ - ulint* actual_increase,/* out: number of pages we were able to - extend, here the orginal size of the file and - the resulting size of the file are rounded - downwards to a full megabyte, and the - difference expressed in pages is returned */ - ulint size_increase) /* in: try to extend this many pages */ -{ - fil_node_t* node; - fil_space_t* space; - fil_system_t* system = fil_system; - byte* buf2; - byte* buf; - ibool success; - ulint i; - - mutex_enter(&(system->mutex)); - - HASH_SEARCH(hash, system->spaces, 0, space, space->id == 0); - - ut_a(space); - - node = UT_LIST_GET_LAST(space->chain); - - fil_node_prepare_for_io(node, system, space); - - buf2 = mem_alloc(1024 * 1024 + UNIV_PAGE_SIZE); - buf = ut_align(buf2, UNIV_PAGE_SIZE); - - memset(buf, '\0', 1024 * 1024); - - for (i = 0; i < size_increase / ((1024 * 1024) / UNIV_PAGE_SIZE); i++) { - - /* If we use native Windows aio, then also this write is - done using it */ - - success = os_aio(OS_FILE_WRITE, OS_AIO_SYNC, - node->name, node->handle, buf, - (node->size << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFF, - node->size >> (32 - UNIV_PAGE_SIZE_SHIFT), - 1024 * 1024, NULL, NULL); - - if (!success) { - break; - } - - node->size += ((1024 * 1024) / UNIV_PAGE_SIZE); - space->size += ((1024 * 1024) / UNIV_PAGE_SIZE); - - os_has_said_disk_full = FALSE; - } - - mem_free(buf2); - - fil_node_complete_io(node, system, OS_FILE_WRITE); - - mutex_exit(&(system->mutex)); - - *actual_increase = i * ((1024 * 1024) / UNIV_PAGE_SIZE); - - fil_flush(0); - - srv_data_file_sizes[srv_n_data_files - 1] += *actual_increase; - - return(TRUE); -} /************************************************************************ Reads or writes data. This operation is asynchronous (aio). 
*/ -void +ulint fil_io( /*===*/ + /* out: DB_SUCCESS, or DB_TABLESPACE_DELETED + if we are trying to do i/o on a tablespace + which does not exist */ ulint type, /* in: OS_FILE_READ or OS_FILE_WRITE, ORed to OS_FILE_LOG, if a log i/o and ORed to OS_AIO_SIMULATED_WAKE_LATER @@ -1081,17 +2843,15 @@ fil_io( void* message) /* in: message for aio handler if non-sync aio used, else ignored */ { + fil_system_t* system = fil_system; ulint mode; fil_space_t* space; fil_node_t* node; ulint offset_high; ulint offset_low; - fil_system_t* system; - os_event_t event; ibool ret; ulint is_log; ulint wake_later; - ulint count; is_log = type & OS_FILE_LOG; type = type & ~OS_FILE_LOG; @@ -1102,7 +2862,7 @@ fil_io( ut_ad(byte_offset < UNIV_PAGE_SIZE); ut_ad(buf); ut_ad(len > 0); - ut_ad((1 << UNIV_PAGE_SIZE_SHIFT) == UNIV_PAGE_SIZE); + ut_a((1 << UNIV_PAGE_SIZE_SHIFT) == UNIV_PAGE_SIZE); ut_ad(fil_validate()); #ifndef UNIV_LOG_DEBUG /* ibuf bitmap pages must be read in the sync aio mode: */ @@ -1124,82 +2884,45 @@ fil_io( mode = OS_AIO_NORMAL; } - system = fil_system; + /* Reserve the fil_system mutex and make sure that we can open at + least one file while holding it, if the file is not already open */ - count = 0; -loop: - count++; - - /* NOTE that there is a possibility of a hang here: - if the read i/o-handler thread needs to complete - a read by reading from the insert buffer, it may need to - post another read. But if the maximum number of files - are already open, it cannot proceed from here! 
*/ - - mutex_enter(&(system->mutex)); + fil_mutex_enter_and_prepare_for_io(space_id); - if (count < 500 && !is_log && !ibuf_inside() - && system->n_open_pending >= (3 * system->max_n_open) / 4) { - - /* We are not doing an ibuf operation: leave a - safety margin of openable files for possible ibuf - merges needed in page read completion */ - - mutex_exit(&(system->mutex)); - - /* Wake the i/o-handler threads to make sure pending - i/o's are handled and eventually we can open the file */ - - os_aio_simulated_wake_handler_threads(); - - os_thread_sleep(100000); - - if (count > 50) { - fprintf(stderr, - "InnoDB: Warning: waiting for file closes to proceed\n" - "InnoDB: round %lu\n", count); - } - - goto loop; - } - - if (system->n_open_pending == system->max_n_open) { - - /* It is not sure we can open the file if it is closed: wait */ - - event = system->can_open; - os_event_reset(event); - + HASH_SEARCH(hash, system->spaces, space_id, space, + space->id == space_id); + if (!space) { mutex_exit(&(system->mutex)); - /* Wake the i/o-handler threads to make sure pending - i/o's are handled and eventually we can open the file */ - - os_aio_simulated_wake_handler_threads(); - + ut_print_timestamp(stderr); fprintf(stderr, - "InnoDB: Warning: max allowed number of files is open\n"); - - os_event_wait(event); +" InnoDB: Error: trying to do i/o to a tablespace which does not exist.\n" +"InnoDB: i/o type %lu, space id %lu, page no. 
%lu, i/o length %lu bytes\n", + type, space_id, block_offset, len); - goto loop; - } - - HASH_SEARCH(hash, system->spaces, space_id, space, - space->id == space_id); - ut_a(space); + return(DB_TABLESPACE_DELETED); + } ut_ad((mode != OS_AIO_IBUF) || (space->purpose == FIL_TABLESPACE)); node = UT_LIST_GET_FIRST(space->chain); for (;;) { + if (space->id != 0 && node->size == 0) { + /* We do not know the size of a single-table tablespace + before we open the file */ + + break; + } + if (node == NULL) { fprintf(stderr, - "InnoDB: Error: trying to access page number %lu in space %lu\n" + "InnoDB: Error: trying to access page number %lu in space %lu,\n" + "InnoDB: space name %s,\n" "InnoDB: which is outside the tablespace bounds.\n" "InnoDB: Byte offset %lu, len %lu, i/o type %lu\n", - block_offset, space_id, byte_offset, len, type); + block_offset, space_id, space->name, byte_offset, len, + type); ut_a(0); } @@ -1216,13 +2939,28 @@ loop: /* Open file if closed */ fil_node_prepare_for_io(node, system, space); + /* Check that at least the start offset is within the bounds of a + single-table tablespace */ + if (space->purpose == FIL_TABLESPACE && space->id != 0 + && node->size <= block_offset) { + + fprintf(stderr, + "InnoDB: Error: trying to access page number %lu in space %lu,\n" + "InnoDB: space name %s,\n" + "InnoDB: which is outside the tablespace bounds.\n" + "InnoDB: Byte offset %lu, len %lu, i/o type %lu\n", + block_offset, space_id, space->name, byte_offset, len, + type); + ut_a(0); + } + /* Now we have made the changes in the data structures of system */ mutex_exit(&(system->mutex)); /* Calculate the low 32 bits and the high 32 bits of the file offset */ offset_high = (block_offset >> (32 - UNIV_PAGE_SIZE_SHIFT)); - offset_low = ((block_offset << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFF) + offset_low = ((block_offset << UNIV_PAGE_SIZE_SHIFT) & 0xFFFFFFFFUL) + byte_offset; ut_a(node->size - block_offset >= @@ -1250,6 +2988,8 @@ loop: ut_ad(fil_validate()); } + + 
return(DB_SUCCESS); } /************************************************************************ @@ -1257,9 +2997,12 @@ Reads data from a space to a buffer. Remember that the possible incomplete blocks at the end of file are ignored: they are not taken into account when calculating the byte offset within a space. */ -void +ulint fil_read( /*=====*/ + /* out: DB_SUCCESS, or DB_TABLESPACE_DELETED + if we are trying to do i/o on a tablespace + which does not exist */ ibool sync, /* in: TRUE if synchronous aio is desired */ ulint space_id, /* in: space id */ ulint block_offset, /* in: offset in number of blocks */ @@ -1273,8 +3016,8 @@ fil_read( void* message) /* in: message for aio handler if non-sync aio used, else ignored */ { - fil_io(OS_FILE_READ, sync, space_id, block_offset, byte_offset, len, - buf, message); + return(fil_io(OS_FILE_READ, sync, space_id, block_offset, + byte_offset, len, buf, message)); } /************************************************************************ @@ -1282,9 +3025,12 @@ Writes data to a space from a buffer. Remember that the possible incomplete blocks at the end of file are ignored: they are not taken into account when calculating the byte offset within a space. 
*/ -void +ulint fil_write( /*======*/ + /* out: DB_SUCCESS, or DB_TABLESPACE_DELETED + if we are trying to do i/o on a tablespace + which does not exist */ ibool sync, /* in: TRUE if synchronous aio is desired */ ulint space_id, /* in: space id */ ulint block_offset, /* in: offset in number of blocks */ @@ -1298,8 +3044,8 @@ fil_write( void* message) /* in: message for aio handler if non-sync aio used, else ignored */ { - fil_io(OS_FILE_WRITE, sync, space_id, block_offset, byte_offset, len, - buf, message); + return(fil_io(OS_FILE_WRITE, sync, space_id, block_offset, + byte_offset, len, buf, message)); } /************************************************************************** @@ -1314,16 +3060,16 @@ fil_aio_wait( ulint segment) /* in: the number of the segment in the aio array to wait for */ { + fil_system_t* system = fil_system; ibool ret; fil_node_t* fil_node; - fil_system_t* system = fil_system; void* message; ulint type; ut_ad(fil_validate()); if (os_aio_use_native_aio) { - srv_io_thread_op_info[segment] = (char *) "native aio handle"; + srv_io_thread_op_info[segment] = (char *) "handle native aio"; #ifdef WIN_ASYNC_IO ret = os_aio_windows_handle(segment, 0, &fil_node, &message, &type); @@ -1334,7 +3080,7 @@ fil_aio_wait( ut_a(0); #endif } else { - srv_io_thread_op_info[segment] =(char *)"simulated aio handle"; + srv_io_thread_op_info[segment] =(char *)"handle simulated aio"; ret = os_aio_simulated_handle(segment, (void**) &fil_node, &message, &type); @@ -1353,6 +3099,10 @@ fil_aio_wait( ut_ad(fil_validate()); /* Do the i/o handling */ + /* IMPORTANT: since i/o handling for reads will read also the insert + buffer in tablespace 0, you have to be very careful not to introduce + deadlocks in the i/o system. We keep tablespace 0 data files always + open, and use a special i/o thread to serve insert buffer requests. 
*/ if (buf_pool_is_block(message)) { srv_io_thread_op_info[segment] = @@ -1365,7 +3115,8 @@ fil_aio_wait( } /************************************************************************** -Flushes to disk possible writes cached by the OS. */ +Flushes to disk possible writes cached by the OS. If the space does not exist +or is being dropped, does not do anything. */ void fil_flush( @@ -1377,41 +3128,79 @@ fil_flush( fil_space_t* space; fil_node_t* node; os_file_t file; + ib_longlong old_mod_counter; mutex_enter(&(system->mutex)); HASH_SEARCH(hash, system->spaces, space_id, space, - space->id == space_id); - ut_a(space); + space->id == space_id); + if (!space || space->is_being_deleted) { + mutex_exit(&(system->mutex)); + + return; + } + space->n_pending_flushes++; /* prevent dropping of the space while + we are flushing */ node = UT_LIST_GET_FIRST(space->chain); while (node) { - if (node->open && node->is_modified) { - file = node->handle; + if (node->modification_counter > node->flush_counter) { + ut_a(node->open); + + /* We want to flush the changes at least up to + old_mod_counter */ + old_mod_counter = node->modification_counter; - node->is_modified = FALSE; - if (space->purpose == FIL_TABLESPACE) { fil_n_pending_tablespace_flushes++; } else { fil_n_pending_log_flushes++; } +#ifdef __WIN__ + if (node->is_raw_disk) { - mutex_exit(&(system->mutex)); + goto skip_flush; + } +#endif +retry: + if (node->n_pending_flushes > 0) { + /* We want to avoid calling os_file_flush() on + the file twice at the same time, because we do + not know what bugs OS's may contain in file + i/o; sleep for a while */ - /* Note that it is not certain, when we have - released the mutex above, that the file of the - handle is still open: we assume that the OS - will not crash or trap even if we pass a handle - to a closed file below in os_file_flush! 
*/ + mutex_exit(&(system->mutex)); + + os_thread_sleep(20000); + + mutex_enter(&(system->mutex)); + + if (node->flush_counter >= old_mod_counter) { + + goto skip_flush; + } + + goto retry; + } + + ut_a(node->open); + file = node->handle; + node->n_pending_flushes++; + + mutex_exit(&(system->mutex)); /* printf("Flushing to file %s\n", node->name); */ - - os_file_flush(file); - + os_file_flush(file); + mutex_enter(&(system->mutex)); + node->n_pending_flushes--; +skip_flush: + if (node->flush_counter < old_mod_counter) { + node->flush_counter = old_mod_counter; + } + if (space->purpose == FIL_TABLESPACE) { fil_n_pending_tablespace_flushes--; } else { @@ -1422,11 +3211,13 @@ fil_flush( node = UT_LIST_GET_NEXT(chain, node); } + space->n_pending_flushes--; + mutex_exit(&(system->mutex)); } /************************************************************************** -Flushes to disk writes in file spaces of the given type possibly cached by +Flushes to disk the writes in file spaces of the given type possibly cached by the OS. */ void @@ -1443,13 +3234,17 @@ fil_flush_file_spaces( while (space) { if (space->purpose == purpose) { + space->n_pending_flushes++; /* prevent dropping of the + space while we are + flushing */ mutex_exit(&(system->mutex)); fil_flush(space->id); mutex_enter(&(system->mutex)); - } + space->n_pending_flushes--; + } space = UT_LIST_GET_NEXT(space_list, space); } @@ -1457,20 +3252,18 @@ fil_flush_file_spaces( } /********************************************************************** -Checks the consistency of the file system. */ +Checks the consistency of the tablespace cache. 
*/ ibool fil_validate(void) /*==============*/ /* out: TRUE if ok */ { + fil_system_t* system = fil_system; fil_space_t* space; fil_node_t* fil_node; - ulint pending_count = 0; - fil_system_t* system; + ulint n_open = 0; ulint i; - - system = fil_system; mutex_enter(&(system->mutex)); @@ -1481,36 +3274,35 @@ fil_validate(void) space = HASH_GET_FIRST(system->spaces, i); while (space != NULL) { - UT_LIST_VALIDATE(chain, fil_node_t, space->chain); fil_node = UT_LIST_GET_FIRST(space->chain); while (fil_node != NULL) { - if (fil_node->n_pending > 0) { - - pending_count++; ut_a(fil_node->open); } + if (fil_node->open) { + n_open++; + } fil_node = UT_LIST_GET_NEXT(chain, fil_node); } - space = HASH_GET_NEXT(hash, space); } } - ut_a(pending_count == system->n_open_pending); + ut_a(system->n_open == n_open); UT_LIST_VALIDATE(LRU, fil_node_t, system->LRU); fil_node = UT_LIST_GET_FIRST(system->LRU); while (fil_node != NULL) { - ut_a(fil_node->n_pending == 0); ut_a(fil_node->open); + ut_a(fil_node->space->purpose == FIL_TABLESPACE); + ut_a(fil_node->space->id != 0); fil_node = UT_LIST_GET_NEXT(LRU, fil_node); } @@ -1578,4 +3370,4 @@ fil_page_get_type( ut_ad(page); return(mach_read_from_2(page + FIL_PAGE_TYPE)); -} +} diff --git a/innobase/fsp/fsp0fsp.c b/innobase/fsp/fsp0fsp.c index 8727c5156e4..7c4d691ea13 100644 --- a/innobase/fsp/fsp0fsp.c +++ b/innobase/fsp/fsp0fsp.c @@ -41,7 +41,8 @@ descriptor page, but used only in the first. */ #define FSP_HEADER_OFFSET FIL_PAGE_DATA /* Offset of the space header within a file page */ /*-------------------------------------*/ -#define FSP_NOT_USED 0 /* this field contained a value up to +#define FSP_SPACE_ID 0 /* space id */ +#define FSP_NOT_USED 4 /* this field contained a value up to which we know that the modifications in the database have been flushed to the file space; not used now */ @@ -50,7 +51,13 @@ descriptor page, but used only in the first. 
*/ #define FSP_FREE_LIMIT 12 /* Minimum page number for which the free list has not been initialized: the pages >= this limit are, by - definition, free */ + definition, free; note that in a + single-table tablespace where size + < 64 pages, this number is 64, i.e., + we have initialized the space + about the first extent, but have not + physically allocted those pages to the + file */ #define FSP_LOWEST_NO_WRITE 16 /* The lowest page offset for which the page has not been written to disk (if it has been written, we know that @@ -263,9 +270,14 @@ static void fsp_fill_free_list( /*===============*/ - ulint space, /* in: space */ - fsp_header_t* header, /* in: space header */ - mtr_t* mtr); /* in: mtr */ + ibool init_space, /* in: TRUE if this is a single-table + tablespace and we are only initing + the tablespace's first extent + descriptor page and ibuf bitmap page; + then we do not allocate more extents */ + ulint space, /* in: space */ + fsp_header_t* header, /* in: space header */ + mtr_t* mtr); /* in: mtr */ /************************************************************************** Allocates a single free page from a segment. 
This function implements the intelligent allocation strategy which tries to minimize file space @@ -569,7 +581,7 @@ xdes_init( ut_ad((XDES_SIZE - XDES_BITMAP) % 4 == 0); for (i = XDES_BITMAP; i < XDES_SIZE; i += 4) { - mlog_write_ulint(descr + i, 0xFFFFFFFF, MLOG_4BYTES, mtr); + mlog_write_ulint(descr + i, 0xFFFFFFFFUL, MLOG_4BYTES, mtr); } xdes_set_state(descr, XDES_FREE, mtr); @@ -630,8 +642,8 @@ xdes_get_descriptor_with_space_hdr( page_t* descr_page; ut_ad(mtr); - ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space), MTR_MEMO_X_LOCK)); - + ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space), + MTR_MEMO_X_LOCK)); /* Read free limit and space size */ limit = mtr_read_ulint(sp_header + FSP_FREE_LIMIT, MLOG_4BYTES, mtr); size = mtr_read_ulint(sp_header + FSP_SIZE, MLOG_4BYTES, mtr); @@ -646,7 +658,7 @@ xdes_get_descriptor_with_space_hdr( /* If offset is == limit, fill free list of the space. */ if (offset == limit) { - fsp_fill_free_list(space, sp_header, mtr); + fsp_fill_free_list(FALSE, space, sp_header, mtr); } descr_page_no = xdes_calc_descriptor_page(offset); @@ -711,8 +723,8 @@ xdes_lst_get_descriptor( xdes_t* descr; ut_ad(mtr); - ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space), MTR_MEMO_X_LOCK)); - + ut_ad(mtr_memo_contains(mtr, fil_space_get_latch(space), + MTR_MEMO_X_LOCK)); descr = fut_get_ptr(space, lst_node, RW_X_LATCH, mtr) - XDES_FLST_NODE; return(descr); @@ -775,7 +787,7 @@ fsp_init_file_page_low( /* printf("In log debug version: Erase the contents of the file page\n"); */ for (i = 0; i < UNIV_PAGE_SIZE; i++) { - page[i] = 0xFF; + page[i] = (byte)0xFF; } #endif mach_write_to_8(page + UNIV_PAGE_SIZE - FIL_PAGE_END_LSN_OLD_CHKSUM, @@ -828,8 +840,21 @@ fsp_init(void) } /************************************************************************** +Writes the space id to a tablespace header. This function is used past the +buffer pool when we in fil0fil.c create a new single-table tablespace. 
*/ + +void +fsp_header_write_space_id( +/*======================*/ + page_t* page, /* in: first page in the space */ + ulint space_id) /* in: space id */ +{ + mach_write_to_4(page + FSP_HEADER_OFFSET + FSP_SPACE_ID, space_id); +} + +/************************************************************************** Initializes the space header of a new created space and creates also the -insert buffer tree root. */ +insert buffer tree root if space == 0. */ void fsp_header_init( @@ -846,8 +871,7 @@ fsp_header_init( mtr_x_lock(fil_space_get_latch(space), mtr); page = buf_page_create(space, 0, mtr); - buf_page_dbg_add_level(page, SYNC_FSP_PAGE); - + buf_page_get(space, 0, RW_X_LATCH, mtr); buf_page_dbg_add_level(page, SYNC_FSP_PAGE); @@ -857,6 +881,8 @@ fsp_header_init( header = FSP_HEADER_OFFSET + page; + mlog_write_ulint(header + FSP_SPACE_ID, space, MLOG_4BYTES, mtr); + mlog_write_ulint(header + FSP_SIZE, size, MLOG_4BYTES, mtr); mlog_write_ulint(header + FSP_FREE_LIMIT, 0, MLOG_4BYTES, mtr); mlog_write_ulint(header + FSP_LOWEST_NO_WRITE, 0, MLOG_4BYTES, mtr); @@ -870,10 +896,39 @@ fsp_header_init( mlog_write_dulint(header + FSP_SEG_ID, ut_dulint_create(0, 1), MLOG_8BYTES, mtr); - fsp_fill_free_list(space, header, mtr); - - btr_create(DICT_CLUSTERED | DICT_UNIVERSAL | DICT_IBUF, space, + if (space == 0) { + fsp_fill_free_list(FALSE, space, header, mtr); + btr_create(DICT_CLUSTERED | DICT_UNIVERSAL | DICT_IBUF, space, ut_dulint_add(DICT_IBUF_ID_MIN, space), mtr); + } else { + fsp_fill_free_list(TRUE, space, header, mtr); + } +} + +/************************************************************************** +Reads the space id from the first page of a tablespace. 
*/ + +ulint +fsp_header_get_space_id( +/*====================*/ + /* out: space id, ULINT UNDEFINED if error */ + page_t* page) /* in: first page of a tablespace */ +{ + ulint fsp_id; + ulint id; + + fsp_id = mach_read_from_4(FSP_HEADER_OFFSET + page + FSP_SPACE_ID); + + id = mach_read_from_4(page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + + if (id != fsp_id) { + fprintf(stderr, +"InnoDB: Error: space id in fsp header %lu, but in the page header %lu\n", + fsp_id, id); + return(ULINT_UNDEFINED); + } + + return(id); } /************************************************************************** @@ -897,7 +952,8 @@ fsp_header_inc_size( size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr); - mlog_write_ulint(header + FSP_SIZE, size + size_inc, MLOG_4BYTES, mtr); + mlog_write_ulint(header + FSP_SIZE, size + size_inc, MLOG_4BYTES, + mtr); } /************************************************************************** @@ -910,7 +966,7 @@ ulint fsp_header_get_free_limit( /*======================*/ /* out: free limit in megabytes */ - ulint space) /* in: space id */ + ulint space) /* in: space id, must be 0 */ { fsp_header_t* header; ulint limit; @@ -944,7 +1000,7 @@ ulint fsp_header_get_tablespace_size( /*===========================*/ /* out: size in pages */ - ulint space) /* in: space id */ + ulint space) /* in: space id, must be 0 */ { fsp_header_t* header; ulint size; @@ -966,10 +1022,42 @@ fsp_header_get_tablespace_size( } /*************************************************************************** -Tries to extend the last data file file if it is defined as auto-extending. */ +Tries to extend a single-table tablespace so that a page would fit in the +data file. 
*/ +static +ibool +fsp_try_extend_data_file_with_pages( +/*================================*/ + /* out: TRUE if success */ + ulint space, /* in: space */ + ulint page_no, /* in: page number */ + fsp_header_t* header, /* in: space header */ + mtr_t* mtr) /* in: mtr */ +{ + ulint size; + ibool success; + + ut_a(space != 0); + + size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr); + + ut_a(page_no >= size); + + success = fil_extend_data_file_with_pages(space, size, page_no + 1); + + if (success) { + mlog_write_ulint(header + FSP_SIZE, page_no + 1, MLOG_4BYTES, + mtr); + } + + return(success); +} + +/*************************************************************************** +Tries to extend the last data file of a tablespace if it is auto-extending. */ static ibool -fsp_try_extend_last_file( +fsp_try_extend_data_file( /*=====================*/ /* out: FALSE if not auto-extending */ ulint* actual_increase,/* out: actual increase in pages */ @@ -981,18 +1069,16 @@ fsp_try_extend_last_file( ulint size_increase; ibool success; - ut_a(space == 0); - *actual_increase = 0; - if (!srv_auto_extend_last_data_file) { + if (space == 0 && !srv_auto_extend_last_data_file) { return(FALSE); } size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr); - if (srv_last_file_size_max != 0) { + if (space == 0 && srv_last_file_size_max != 0) { if (srv_last_file_size_max < srv_data_file_sizes[srv_n_data_files - 1]) { @@ -1008,20 +1094,48 @@ fsp_try_extend_last_file( size_increase = SRV_AUTO_EXTEND_INCREMENT; } } else { - size_increase = SRV_AUTO_EXTEND_INCREMENT; + if (space == 0) { + size_increase = SRV_AUTO_EXTEND_INCREMENT; + } else { + /* We extend single-table tablespaces first one extent + at a time, but for bigger tablespaces more. It is not + enough to extend always by one extent, because some + extents are frag page extents. 
*/ + + if (size < FSP_EXTENT_SIZE) { + /* Let us first extend the file to 64 pages */ + success = fsp_try_extend_data_file_with_pages( + space, FSP_EXTENT_SIZE - 1, + header, mtr); + + if (!success) { + return(FALSE); + } + + size = FSP_EXTENT_SIZE; + } + + if (size < 32 * FSP_EXTENT_SIZE) { + size_increase = FSP_EXTENT_SIZE; + } else { + size_increase = 8 * FSP_EXTENT_SIZE; + } + } } if (size_increase == 0) { + return(TRUE); } - /* Extend the data file. If we are not able to extend - the full requested length, the function tells us - the number of full megabytes (but the unit is pages!) - we were able to extend. */ - - success = fil_extend_last_data_file(actual_increase, size_increase); - + /* Extend the data file. If we are not able to extend the full + requested length, the function tells how many pages we were able to + extend so that the size of the tablespace would be divisible by 1 MB + (we possibly managed to extend more, but we only take into account + full megabytes). */ + + success = fil_extend_last_data_file(actual_increase, space, size, + size_increase); if (success) { mlog_write_ulint(header + FSP_SIZE, size + *actual_increase, MLOG_4BYTES, mtr); @@ -1038,9 +1152,14 @@ static void fsp_fill_free_list( /*===============*/ - ulint space, /* in: space */ - fsp_header_t* header, /* in: space header */ - mtr_t* mtr) /* in: mtr */ + ibool init_space, /* in: TRUE if this is a single-table + tablespace and we are only initing + the tablespace's first extent + descriptor page and ibuf bitmap page; + then we do not allocate more extents */ + ulint space, /* in: space */ + fsp_header_t* header, /* in: space header */ + mtr_t* mtr) /* in: mtr */ { ulint limit; ulint size; @@ -1059,27 +1178,35 @@ fsp_fill_free_list( size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr); limit = mtr_read_ulint(header + FSP_FREE_LIMIT, MLOG_4BYTES, mtr); - if (srv_auto_extend_last_data_file + if (space == 0 && srv_auto_extend_last_data_file && size < limit + FSP_EXTENT_SIZE * 
FSP_FREE_ADD) { /* Try to increase the last data file size */ - fsp_try_extend_last_file(&actual_increase, space, header, - mtr); + fsp_try_extend_data_file(&actual_increase, space, header, mtr); + size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr); + } + + if (space != 0 && !init_space) { + /* Try to increase the data file size */ + fsp_try_extend_data_file(&actual_increase, space, header, mtr); size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr); } i = limit; - while ((i + FSP_EXTENT_SIZE <= size) && (count < FSP_FREE_ADD)) { + while ((init_space && i < 1) + || ((i + FSP_EXTENT_SIZE <= size) && (count < FSP_FREE_ADD))) { mlog_write_ulint(header + FSP_FREE_LIMIT, i + FSP_EXTENT_SIZE, MLOG_4BYTES, mtr); /* Update the free limit info in the log system and make a checkpoint */ - log_fsp_current_free_limit_set_and_checkpoint( + if (space == 0) { + log_fsp_current_free_limit_set_and_checkpoint( (i + FSP_EXTENT_SIZE) / ((1024 * 1024) / UNIV_PAGE_SIZE)); + } if (0 == i % XDES_DESCRIBED_PER_PAGE) { @@ -1089,8 +1216,6 @@ fsp_fill_free_list( if (i > 0) { descr_page = buf_page_create(space, i, mtr); - buf_page_dbg_add_level(descr_page, - SYNC_FSP_PAGE); buf_page_get(space, i, RW_X_LATCH, mtr); buf_page_dbg_add_level(descr_page, SYNC_FSP_PAGE); @@ -1106,7 +1231,6 @@ fsp_fill_free_list( ibuf_page = buf_page_create(space, i + FSP_IBUF_BITMAP_OFFSET, &ibuf_mtr); - buf_page_dbg_add_level(ibuf_page, SYNC_IBUF_BITMAP); buf_page_get(space, i + FSP_IBUF_BITMAP_OFFSET, RW_X_LATCH, &ibuf_mtr); @@ -1183,7 +1307,7 @@ fsp_alloc_free_extent( first = flst_get_first(header + FSP_FREE, mtr); if (fil_addr_is_null(first)) { - fsp_fill_free_list(space, header, mtr); + fsp_fill_free_list(FALSE, space, header, mtr); first = flst_get_first(header + FSP_FREE, mtr); } @@ -1220,6 +1344,8 @@ fsp_alloc_free_page( ulint free; ulint frag_n_used; ulint page_no; + ulint space_size; + ibool success; ut_ad(mtr); @@ -1273,6 +1399,30 @@ fsp_alloc_free_page( ut_a(0); } + page_no = 
xdes_get_offset(descr) + free; + + space_size = mtr_read_ulint(header + FSP_SIZE, MLOG_4BYTES, mtr); + + if (space_size <= page_no) { + /* It must be that we are extending a single-table tablespace + whose size is still < 64 pages */ + + ut_a(space != 0); + if (page_no >= FSP_EXTENT_SIZE) { + fprintf(stderr, +"InnoDB: Error: trying to extend a single-table tablespace %lu\n" +"InnoDB: by single page(s) though the space size %lu. Page no %lu.\n", + space, space_size, page_no); + return(FIL_NULL); + } + success = fsp_try_extend_data_file_with_pages(space, page_no, + header, mtr); + if (!success) { + /* No disk space left */ + return(FIL_NULL); + } + } + xdes_set_bit(descr, XDES_FREE_BIT, free, FALSE, mtr); /* Update the FRAG_N_USED field */ @@ -1294,8 +1444,6 @@ fsp_alloc_free_page( mtr); } - page_no = xdes_get_offset(descr) + free; - /* Initialize the allocated page to the buffer pool, so that it can be obtained immediately with buf_page_get without need for a disk read. */ @@ -1594,8 +1742,8 @@ fsp_alloc_seg_inode( inode = fsp_seg_inode_page_get_nth_inode(page, n, mtr); - if (ULINT_UNDEFINED == fsp_seg_inode_page_find_free(page, n + 1, mtr)) { - + if (ULINT_UNDEFINED == fsp_seg_inode_page_find_free(page, n + 1, + mtr)) { /* There are no other unused headers left on the page: move it to another list */ @@ -1813,12 +1961,12 @@ fseg_create_general( will belong to the created segment */ ulint byte_offset, /* in: byte offset of the created segment header on the page */ - ibool has_done_reservation, /* in: TRUE if the caller has - already done the reservation for the pages - with fsp_reserve_free_extents (at least 2 extents: - one for the inode and, then there other for the - segment) is no need to do the check for this - individual operation */ + ibool has_done_reservation, /* in: TRUE if the caller has already + done the reservation for the pages with + fsp_reserve_free_extents (at least 2 extents: one for + the inode and the other for the segment) then there is + no 
need to do the check for this individual + operation */ mtr_t* mtr) /* in: mtr */ { fsp_header_t* space_header; @@ -1827,6 +1975,7 @@ fseg_create_general( fseg_header_t* header = 0; /* remove warning */ rw_lock_t* latch; ibool success; + ulint n_reserved; page_t* ret = NULL; ulint i; @@ -1848,12 +1997,14 @@ fseg_create_general( /* This thread did not own the latch before this call: free excess pages from the insert buffer free list */ - ibuf_free_excess_pages(space); + if (space == 0) { + ibuf_free_excess_pages(space); + } } if (!has_done_reservation) { - success = fsp_reserve_free_extents(space, 2, FSP_NORMAL, mtr); - + success = fsp_reserve_free_extents(&n_reserved, space, 2, + FSP_NORMAL, mtr); if (!success) { return(NULL); } @@ -1916,7 +2067,7 @@ fseg_create_general( funct_exit: if (!has_done_reservation) { - fil_space_release_free_extents(space, 2); + fil_space_release_free_extents(space, n_reserved); } return(ret); @@ -2132,6 +2283,8 @@ fseg_alloc_free_page_low( FSP_UP, FSP_NO_DIR */ mtr_t* mtr) /* in: mtr handle */ { + fsp_header_t* space_header; + ulint space_size; dulint seg_id; ulint used; ulint reserved; @@ -2142,6 +2295,7 @@ fseg_alloc_free_page_low( xdes_t* ret_descr; /* the extent of the allocated page */ page_t* page; ibool frag_page_allocated = FALSE; + ibool success; ulint n; ut_ad(mtr); @@ -2154,8 +2308,10 @@ fseg_alloc_free_page_low( reserved = fseg_n_reserved_pages_low(seg_inode, &used, mtr); - descr = xdes_get_descriptor(space, hint, mtr); + space_header = fsp_get_space_header(space, mtr); + descr = xdes_get_descriptor_with_space_hdr(space_header, space, + hint, mtr); if (descr == NULL) { /* Hint outside space or too high above free limit: reset hint */ @@ -2288,8 +2444,31 @@ fseg_alloc_free_page_low( return(FIL_NULL); } - if (!frag_page_allocated) { + if (space != 0) { + space_size = fil_space_get_size(space); + + if (space_size <= ret_page) { + /* It must be that we are extending a single-table + tablespace whose size is still < 64 pages */ + 
if (ret_page >= FSP_EXTENT_SIZE) { + fprintf(stderr, +"InnoDB: Error (2): trying to extend a single-table tablespace %lu\n" +"InnoDB: by single page(s) though the space size %lu. Page no %lu.\n", + space, space_size, ret_page); + return(FIL_NULL); + } + + success = fsp_try_extend_data_file_with_pages(space, + ret_page, space_header, mtr); + if (!success) { + /* No disk space left */ + return(FIL_NULL); + } + } + } + + if (!frag_page_allocated) { /* Initialize the allocated page to buffer pool, so that it can be obtained immediately with buf_page_get without need for a disk read */ @@ -2348,6 +2527,7 @@ fseg_alloc_free_page_general( rw_lock_t* latch; ibool success; ulint page_no; + ulint n_reserved; space = buf_frame_get_space_id(seg_header); @@ -2362,14 +2542,16 @@ fseg_alloc_free_page_general( /* This thread did not own the latch before this call: free excess pages from the insert buffer free list */ - ibuf_free_excess_pages(space); + if (space == 0) { + ibuf_free_excess_pages(space); + } } inode = fseg_inode_get(seg_header, mtr); if (!has_done_reservation) { - success = fsp_reserve_free_extents(space, 2, FSP_NORMAL, mtr); - + success = fsp_reserve_free_extents(&n_reserved, space, 2, + FSP_NORMAL, mtr); if (!success) { return(FIL_NULL); } @@ -2378,7 +2560,7 @@ fseg_alloc_free_page_general( page_no = fseg_alloc_free_page_low(buf_frame_get_space_id(inode), inode, hint, direction, mtr); if (!has_done_reservation) { - fil_space_release_free_extents(space, 2); + fil_space_release_free_extents(space, n_reserved); } return(page_no); @@ -2408,6 +2590,46 @@ fseg_alloc_free_page( } /************************************************************************** +Checks that we have at least 2 frag pages free in the first extent of a +single-table tablespace, and they are also physically initialized to the data +file. That is we have already extended the data file so that those pages are +inside the data file. If not, this function extends the tablespace with +pages. 
*/ +static +ibool +fsp_reserve_free_pages( +/*===================*/ + /* out: TRUE if there were >= 3 free + pages, or we were able to extend */ + ulint space, /* in: space id, must be != 0 */ + fsp_header_t* space_header, /* in: header of that space, + x-latched */ + ulint size, /* in: size of the tablespace in pages, + must be < FSP_EXTENT_SIZE / 2 */ + mtr_t* mtr) /* in: mtr */ +{ + xdes_t* descr; + ulint n_used; + + ut_a(space != 0); + ut_a(size < FSP_EXTENT_SIZE / 2); + + descr = xdes_get_descriptor_with_space_hdr(space_header, space, 0, + mtr); + n_used = xdes_get_n_used(descr, mtr); + + ut_a(n_used <= size); + + if (size >= n_used + 2) { + + return(TRUE); + } + + return(fsp_try_extend_data_file_with_pages(space, n_used + 1, + space_header, mtr)); +} + +/************************************************************************** Reserves free pages from a tablespace. All mini-transactions which may use several pages from the tablespace should call this function beforehand and reserve enough free extents so that they certainly will be able @@ -2425,12 +2647,21 @@ two types of allocation: when space is scarce, FSP_NORMAL allocations will not succeed, but the latter two allocations will succeed, if possible. The purpose is to avoid dead end where the database is full but the user cannot free any space because these freeing operations temporarily -reserve some space. */ +reserve some space. + +Single-table tablespaces whose size is < 32 pages are a special case. In this +function we would liberally reserve several 64 page extents for every page +split or merge in a B-tree. But we do not want to waste disk space if the table +only occupies < 32 pages. That is why we apply different rules in that special +case, just ensuring that there are 3 free pages available. 
*/ ibool fsp_reserve_free_extents( /*=====================*/ /* out: TRUE if we were able to make the reservation */ + ulint* n_reserved,/* out: number of extents actually reserved; if we + return TRUE and the tablespace size is < 64 pages, + then this can be 0, otherwise it is n_ext */ ulint space, /* in: space id */ ulint n_ext, /* in: number of extents to reserve */ ulint alloc_type,/* in: FSP_NORMAL, FSP_UNDO, or FSP_CLEANING */ @@ -2451,6 +2682,8 @@ fsp_reserve_free_extents( ut_ad(!mutex_own(&kernel_mutex) || mtr_memo_contains(mtr, fil_space_get_latch(space), MTR_MEMO_X_LOCK)); + *n_reserved = n_ext; + latch = fil_space_get_latch(space); mtr_x_lock(latch, mtr); @@ -2459,6 +2692,12 @@ fsp_reserve_free_extents( try_again: size = mtr_read_ulint(space_header + FSP_SIZE, MLOG_4BYTES, mtr); + if (size < FSP_EXTENT_SIZE / 2) { + /* Use different rules for small single-table tablespaces */ + *n_reserved = 0; + return(fsp_reserve_free_pages(space, space_header, size, mtr)); + } + n_free_list_ext = flst_get_len(space_header + FSP_FREE, mtr); free_limit = mtr_read_ulint(space_header + FSP_FREE_LIMIT, @@ -2508,7 +2747,7 @@ try_again: return(TRUE); } try_to_extend: - success = fsp_try_extend_last_file(&n_pages_added, space, + success = fsp_try_extend_data_file(&n_pages_added, space, space_header, mtr); if (success && n_pages_added > 0) { @@ -2558,6 +2797,13 @@ fsp_get_available_space_in_free_extents( MLOG_4BYTES, &mtr); mtr_commit(&mtr); + if (size < FSP_EXTENT_SIZE) { + ut_a(space != 0); /* This must be a single-table + tablespace */ + return(0); /* TODO: count free frag pages and return + a value based on that */ + } + /* Below we play safe when counting free extents above the free limit: some of them will contain extent descriptor pages, and therefore will not be free extents */ @@ -2655,14 +2901,10 @@ fseg_free_page_low( xdes_t* descr; ulint not_full_n_used; ulint state; + dulint descr_id; + dulint seg_id; ulint i; - char errbuf[200]; - -#ifdef __WIN__ - dulint desm; - 
dulint segm; -#endif - + char errbuf[200]; ut_ad(seg_inode && mtr); ut_ad(mach_read_from_4(seg_inode + FSEG_MAGIC_N) == @@ -2715,26 +2957,22 @@ fseg_free_page_low( return; } + /* If we get here, the page is in some extent of the segment */ + + descr_id = mtr_read_dulint(descr + XDES_ID, MLOG_8BYTES, mtr); + seg_id = mtr_read_dulint(seg_inode + FSEG_ID, MLOG_8BYTES, mtr); /* fprintf(stderr, "InnoDB: InnoDB is freeing space %lu page %lu,\n" "InnoDB: which belongs to descr seg %lu %lu\n" "InnoDB: segment %lu %lu.\n", space, page, - ut_dulint_get_high( - mtr_read_dulint(descr + XDES_ID, MLOG_8BYTES, mtr)), - ut_dulint_get_low( - mtr_read_dulint(descr + XDES_ID, MLOG_8BYTES, mtr)), - ut_dulint_get_high( - mtr_read_dulint(seg_inode + FSEG_ID, MLOG_8BYTES, mtr)), - ut_dulint_get_low( - mtr_read_dulint(seg_inode + FSEG_ID, MLOG_8BYTES, mtr))); + ut_dulint_get_high(descr_id), + ut_dulint_get_low(descr_id), + ut_dulint_get_high(seg_id), + ut_dulint_get_low(seg_id)); */ - /* If we get here, the page is in some extent of the segment */ - if (0 != ut_dulint_cmp( - mtr_read_dulint(descr + XDES_ID, MLOG_8BYTES, mtr), - mtr_read_dulint(seg_inode + FSEG_ID, MLOG_8BYTES, mtr))) { - + if (0 != ut_dulint_cmp(descr_id, seg_id)) { ut_sprintf_buf(errbuf, descr, 40); fprintf(stderr, "InnoDB: Dump of the tablespace extent descriptor: %s\n", errbuf); @@ -2742,42 +2980,15 @@ fseg_free_page_low( fprintf(stderr, "InnoDB: Dump of the segment inode: %s\n", errbuf); - -#ifndef __WIN__ - - fprintf(stderr, + fprintf(stderr, "InnoDB: Serious error: InnoDB is trying to free space %lu page %lu,\n" "InnoDB: which does not belong to segment %lu %lu but belongs\n" "InnoDB: to segment %lu %lu.\n", space, page, - ut_dulint_get_high( - mtr_read_dulint(descr + XDES_ID, MLOG_8BYTES, mtr)), - ut_dulint_get_low( - mtr_read_dulint(descr + XDES_ID, MLOG_8BYTES, mtr)), - ut_dulint_get_high( - mtr_read_dulint(seg_inode + FSEG_ID, MLOG_8BYTES, mtr)), - ut_dulint_get_low( - mtr_read_dulint(seg_inode + FSEG_ID, 
MLOG_8BYTES, mtr))); - -#else - -/* More pedantic usage to avoid VC++ 6.0 compiler errors due to inline - function expansion issues */ - - desm = mtr_read_dulint(descr + XDES_ID, MLOG_8BYTES, mtr); - segm = mtr_read_dulint(seg_inode + FSEG_ID, MLOG_8BYTES, mtr); - - fprintf(stderr, -"InnoDB: Serious error: InnoDB is trying to free space %lu page %lu,\n" -"InnoDB: which does not belong to segment %lu %lu but belongs\n" -"InnoDB: to segment %lu %lu.\n", - space, page, - ut_dulint_get_high(desm), - ut_dulint_get_low(desm), - ut_dulint_get_high(segm), - ut_dulint_get_low(segm)); - -#endif + ut_dulint_get_high(descr_id), + ut_dulint_get_low(descr_id), + ut_dulint_get_high(seg_id), + ut_dulint_get_low(seg_id)); fprintf(stderr, "InnoDB: If the InnoDB recovery crashes here, see section 6.1\n" @@ -3369,7 +3580,7 @@ fsp_validate( n_full_frag_pages = FSP_EXTENT_SIZE * flst_get_len(header + FSP_FULL_FRAG, &mtr); - ut_a(free_limit <= size); + ut_a(free_limit <= size || (space != 0 && size < FSP_EXTENT_SIZE)); flst_validate(header + FSP_FREE, &mtr); flst_validate(header + FSP_FREE_FRAG, &mtr); diff --git a/innobase/ha/ha0ha.c b/innobase/ha/ha0ha.c index eb28e15215d..c7e23d0be1e 100644 --- a/innobase/ha/ha0ha.c +++ b/innobase/ha/ha0ha.c @@ -294,10 +294,10 @@ ha_print_info( { hash_cell_t* cell; /* - ha_node_t* node; - ulint len = 0; - ulint max_len = 0; - ulint nodes = 0; + ha_node_t* node; + ulint len = 0; + ulint max_len = 0; + ulint nodes = 0; */ ulint cells = 0; ulint n_bufs; diff --git a/innobase/ibuf/ibuf0ibuf.c b/innobase/ibuf/ibuf0ibuf.c index c07756ab308..95982e57126 100644 --- a/innobase/ibuf/ibuf0ibuf.c +++ b/innobase/ibuf/ibuf0ibuf.c @@ -29,6 +29,32 @@ Created 7/19/1997 Heikki Tuuri #include "log0recv.h" #include "que0que.h" +/* STRUCTURE OF AN INSERT BUFFER RECORD + +In versions < 4.1.x: + +1. The first field is the page number. +2. The second field is an array which stores type info for each subsequent + field. 
We store the information which affects the ordering of records, and + also the physical storage size of an SQL NULL value. E.g., for CHAR(10) it + is 10 bytes. +3. Next we have the fields of the actual index record. + +In versions >= 4.1.x: + +Note that contrary to what we planned in the 1990's, there will only be one +insert buffer tree, and that is in the system tablespace of InnoDB. + +1. The first field is the space id. +2. The second field is a one-byte marker which differentiates records from + the < 4.1.x storage format. +3. The third field is the page number. +4. The fourth field contains the type info, where we have also added 2 bytes to + store the charset. +5. The rest of the fields contain the fields of the actual index record. + +*/ + + /* PREVENTING DEADLOCKS IN THE INSERT BUFFER SYSTEM If an OS thread performs any operation that brings in disk pages from @@ -45,20 +71,20 @@ because they own x-latches to pages which are on a lower level than the insert buffer tree latch, its page latches, and the tablespace latch an insert buffer operation can reserve. -The solution is the following: We put into each tablespace an insert buffer -of its own. Let all the tree and page latches connected with the insert buffer -be later in the latching order than the fsp latch and fsp page latches. +The solution is the following: Let all the tree and page latches connected +with the insert buffer be later in the latching order than the fsp latch and +fsp page latches. + Insert buffer pages must be such that the insert buffer is never invoked -when these pages area accessed as this would result in a recursion violating +when these pages are accessed as this would result in a recursion violating the latching order. We let a special i/o-handler thread take care of i/o to the insert buffer pages and the ibuf bitmap pages, as well as the fsp bitmap pages and the first inode page, which contains the inode of the ibuf tree: let -us call all these ibuf pages. 
If the OS does not support asynchronous i/o, -then there is no special i/o thread, but to prevent deadlocks, we do not let a -read-ahead access both non-ibuf and ibuf pages. +us call all these ibuf pages. To prevent deadlocks, we do not let a read-ahead +access both non-ibuf and ibuf pages. -Then an i/o-handler for the insert buffer never needs to access the insert -buffer tree and thus obeys the latching order. On the other hand, other +Then an i/o-handler for the insert buffer never needs to access recursively the +insert buffer tree and thus obeys the latching order. On the other hand, other i/o-handlers for other tablespaces may require access to the insert buffer, but because all kinds of latches they need to access there are later in the latching order, no violation of the latching order occurs in this case, @@ -95,8 +121,8 @@ the B-tree non-leaf pages if it has latches on lower level pages. Read-ahead is only allowed for level 1 and 2 pages. Dedicated i/o-handler threads handle exclusively level 1 i/o. A dedicated i/o handler thread handles exclusively level 2 i/o. However, if an OS thread does the i/o handling for itself, i.e., -it uses synchronous aio or the OS does not support aio, it can access any -pages, as long as it obeys the access order rules. */ +it uses synchronous aio, it can access any pages, as long as it obeys the +access order rules. 
*/ /* Buffer pool size per the maximum insert buffer size */ #define IBUF_POOL_SIZE_PER_MAX_SIZE 2 @@ -109,8 +135,8 @@ ulint ibuf_rnd = 986058871; ulint ibuf_flush_count = 0; /* Dimensions for the ibuf_count array */ -#define IBUF_COUNT_N_SPACES 10 -#define IBUF_COUNT_N_PAGES 10000 +#define IBUF_COUNT_N_SPACES 500 +#define IBUF_COUNT_N_PAGES 2000 /* Buffered entry counts for file pages, used in debugging */ ulint* ibuf_counts[IBUF_COUNT_N_SPACES]; @@ -235,6 +261,8 @@ ibuf_header_page_get( { page_t* page; + ut_a(space == 0); + ut_ad(!ibuf_inside()); page = buf_page_get(space, FSP_IBUF_HEADER_PAGE_NO, RW_X_LATCH, mtr); @@ -257,6 +285,7 @@ ibuf_tree_root_get( { page_t* page; + ut_a(space == 0); ut_ad(ibuf_inside()); mtr_x_lock(dict_tree_get_lock((data->index)->tree), mtr); @@ -267,7 +296,7 @@ ibuf_tree_root_get( return(page); } - + /********************************************************************** Gets the ibuf count for a given page. */ @@ -290,9 +319,9 @@ ibuf_count_get( return(*(ibuf_counts[space] + page_no)); } +#ifdef UNIV_IBUF_DEBUG /********************************************************************** Sets the ibuf count for a given page. */ -#ifdef UNIV_IBUF_DEBUG static void ibuf_count_set( @@ -301,17 +330,17 @@ ibuf_count_set( ulint page_no,/* in: page number */ ulint val) /* in: value to set */ { - ut_ad(space < IBUF_COUNT_N_SPACES); - ut_ad(page_no < IBUF_COUNT_N_PAGES); - ut_ad(val < UNIV_PAGE_SIZE); + ut_a(space < IBUF_COUNT_N_SPACES); + ut_a(page_no < IBUF_COUNT_N_PAGES); + ut_a(val < UNIV_PAGE_SIZE); *(ibuf_counts[space] + page_no) = val; } #endif /********************************************************************** -Creates the insert buffer data structure at a database startup and -initializes the data structures for the insert buffer of each tablespace. */ +Creates the insert buffer data structure at a database startup and initializes +the data structures for the insert buffer. 
*/ void ibuf_init_at_db_start(void) @@ -401,19 +430,19 @@ ibuf_data_sizes_update( /* printf("ibuf size %lu, space ibuf size %lu\n", ibuf->size, data->size); */ -} +} /********************************************************************** Creates the insert buffer data struct for a single tablespace. Reads the root page of the insert buffer tree in the tablespace. This function can be called only after the dictionary system has been initialized, as this -creates also the insert buffer table and index for this tablespace. */ +creates also the insert buffer table and index into this tablespace. */ ibuf_data_t* ibuf_data_init_for_space( /*=====================*/ /* out, own: ibuf data struct, linked to the list - in ibuf control structure. */ + in ibuf control structure */ ulint space) /* in: space id */ { ibuf_data_t* data; @@ -425,6 +454,8 @@ ibuf_data_init_for_space( dict_index_t* index; ulint n_used; + ut_a(space == 0); + #ifdef UNIV_LOG_DEBUG if (space % 2 == 1) { @@ -463,7 +494,15 @@ ibuf_data_init_for_space( data->n_merged_recs = 0; ibuf_data_sizes_update(data, root, &mtr); - +/* + if (!data->empty) { + fprintf(stderr, +"InnoDB: index entries found in the insert buffer\n"); + } else { + fprintf(stderr, +"InnoDB: insert buffer empty\n"); + } +*/ mutex_exit(&ibuf_mutex); mtr_commit(&mtr); @@ -676,7 +715,7 @@ ibuf_bitmap_get_map_page( mtr_t* mtr) /* in: mtr */ { page_t* page; - + page = buf_page_get(space, ibuf_bitmap_page_no_calc(page_no), RW_X_LATCH, mtr); buf_page_dbg_add_level(page, SYNC_IBUF_BITMAP); @@ -887,7 +926,7 @@ UNIV_INLINE ibool ibuf_fixed_addr_page( /*=================*/ - /* out: TRUE if a fixed address ibuf i/o page */ + /* out: TRUE if a fixed address ibuf i/o page */ ulint page_no)/* in: page number */ { if ((ibuf_bitmap_page(page_no)) @@ -924,6 +963,12 @@ ibuf_page( return(TRUE); } + if (space != 0) { + /* Currently we only have an ibuf tree in space 0 */ + + return(FALSE); + } + ut_ad(fil_space_get_type(space) == FIL_TABLESPACE); 
mtr_start(&mtr); @@ -988,14 +1033,60 @@ ibuf_rec_get_page_no( ut_ad(ibuf_inside()); ut_ad(rec_get_n_fields(rec) > 2); - field = rec_get_nth_field(rec, 0, &len); + field = rec_get_nth_field(rec, 1, &len); - ut_ad(len == 4); + if (len == 1) { + /* This is of the >= 4.1.x record format */ + ut_a(trx_sys_multiple_tablespace_format); + + field = rec_get_nth_field(rec, 2, &len); + } else { + ut_a(trx_doublewrite_must_reset_space_ids); + ut_a(!trx_sys_multiple_tablespace_format); + + field = rec_get_nth_field(rec, 0, &len); + } + + ut_a(len == 4); return(mach_read_from_4(field)); } /************************************************************************ +Returns the space id field of an ibuf record. For < 4.1.x format records +returns 0. */ +static +ulint +ibuf_rec_get_space( +/*===============*/ + /* out: space id */ + rec_t* rec) /* in: ibuf record */ +{ + byte* field; + ulint len; + + ut_ad(ibuf_inside()); + ut_ad(rec_get_n_fields(rec) > 2); + + field = rec_get_nth_field(rec, 1, &len); + + if (len == 1) { + /* This is of the >= 4.1.x record format */ + + ut_a(trx_sys_multiple_tablespace_format); + field = rec_get_nth_field(rec, 0, &len); + ut_a(len == 4); + + return(mach_read_from_4(field)); + } + + ut_a(trx_doublewrite_must_reset_space_ids); + ut_a(!trx_sys_multiple_tablespace_format); + + return(0); +} + +/************************************************************************ Returns the space taken by a stored non-clustered index entry if converted to an index record. 
*/ static @@ -1007,6 +1098,7 @@ ibuf_rec_get_volume( rec_t* ibuf_rec)/* in: ibuf record */ { dtype_t dtype; + ibool new_format = FALSE; ulint data_size = 0; ulint n_fields; byte* types; @@ -1017,17 +1109,42 @@ ibuf_rec_get_volume( ut_ad(ibuf_inside()); ut_ad(rec_get_n_fields(ibuf_rec) > 2); - n_fields = rec_get_n_fields(ibuf_rec) - 2; + data = rec_get_nth_field(ibuf_rec, 1, &len); - types = rec_get_nth_field(ibuf_rec, 1, &len); + if (len > 1) { + /* < 4.1.x format record */ - ut_ad(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE); + ut_a(trx_doublewrite_must_reset_space_ids); + ut_a(!trx_sys_multiple_tablespace_format); + + n_fields = rec_get_n_fields(ibuf_rec) - 2; + + types = rec_get_nth_field(ibuf_rec, 1, &len); + + ut_ad(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE); + } else { + /* >= 4.1.x format record */ + + ut_a(trx_sys_multiple_tablespace_format); + new_format = TRUE; + + n_fields = rec_get_n_fields(ibuf_rec) - 4; + + types = rec_get_nth_field(ibuf_rec, 3, &len); + } for (i = 0; i < n_fields; i++) { - data = rec_get_nth_field(ibuf_rec, i + 2, &len); + if (new_format) { + data = rec_get_nth_field(ibuf_rec, i + 4, &len); - dtype_read_for_order_and_null_size(&dtype, + dtype_new_read_for_order_and_null_size(&dtype, + types + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + } else { + data = rec_get_nth_field(ibuf_rec, i + 2, &len); + + dtype_read_for_order_and_null_size(&dtype, types + i * DATA_ORDER_NULL_TYPE_BUF_SIZE); + } if (len == UNIV_SQL_NULL) { data_size += dtype_get_sql_null_size(&dtype); @@ -1052,6 +1169,7 @@ ibuf_entry_build( must be kept because we copy pointers to its fields */ dtuple_t* entry, /* in: entry for a non-clustered index */ + ulint space, /* in: space id */ ulint page_no,/* in: index page number where entry should be inserted */ mem_heap_t* heap) /* in: heap into which to build */ @@ -1064,49 +1182,79 @@ ibuf_entry_build( byte* buf2; ulint i; - /* We have to build a tuple whose first field is the page number, - the second field contains 
the original type information for entry, - and the rest of the fields are copied from entry. All fields - in the tuple are of the type binary. */ + /* Starting from 4.1.x, we have to build a tuple whose + (1) first field is the space id, + (2) the second field a single marker byte to tell that this + is a new format record, + (3) the third contains the page number, and + (4) the fourth contains the relevant type information of each data + field, + (5) and the rest of the fields are copied from entry. All fields + in the tuple are ordered like the type binary in our insert buffer + tree. */ n_fields = dtuple_get_n_fields(entry); - tuple = dtuple_create(heap, n_fields + 2); + tuple = dtuple_create(heap, n_fields + 4); - /* Store the page number in tuple */ + /* Store the space id in tuple */ field = dtuple_get_nth_field(tuple, 0); buf = mem_heap_alloc(heap, 4); - mach_write_to_4(buf, page_no); + mach_write_to_4(buf, space); dfield_set_data(field, buf, 4); - /* Store the type info in tuple */ + /* Store the marker byte field in tuple */ + + field = dtuple_get_nth_field(tuple, 1); - buf2 = mem_heap_alloc(heap, n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE); + buf = mem_heap_alloc(heap, 1); - for (i = 0; i < n_fields; i++) { + /* We set the marker byte zero */ - field = dtuple_get_nth_field(tuple, i + 2); + mach_write_to_1(buf, 0); - entry_field = dtuple_get_nth_field(entry, i); + dfield_set_data(field, buf, 1); + + /* Store the page number in tuple */ + + field = dtuple_get_nth_field(tuple, 2); + buf = mem_heap_alloc(heap, 4); + + mach_write_to_4(buf, page_no); + + dfield_set_data(field, buf, 4); + + /* Store the type info in buf2, and add the fields from entry to + tuple */ + buf2 = mem_heap_alloc(heap, n_fields + * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + for (i = 0; i < n_fields; i++) { + /* We add 4 below because we have the 4 extra fields at the + start of an ibuf record */ + + field = dtuple_get_nth_field(tuple, i + 4); + entry_field = dtuple_get_nth_field(entry, i); 
dfield_copy(field, entry_field); - dtype_store_for_order_and_null_size( - buf2 + i * DATA_ORDER_NULL_TYPE_BUF_SIZE, + dtype_new_store_for_order_and_null_size( + buf2 + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE, dfield_get_type(entry_field)); } - field = dtuple_get_nth_field(tuple, 1); + /* Store the type info in buf2 to field 3 of tuple */ - dfield_set_data(field, buf2, n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE); + field = dtuple_get_nth_field(tuple, 3); - /* Set the types in the new tuple binary */ + dfield_set_data(field, buf2, n_fields + * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + /* Set all the types in the new tuple binary */ - dtuple_set_types_binary(tuple, n_fields + 2); + dtuple_set_types_binary(tuple, n_fields + 4); return(tuple); } @@ -1135,35 +1283,73 @@ ibuf_build_entry_from_ibuf_rec( ulint len; ulint i; - n_fields = rec_get_n_fields(ibuf_rec) - 2; + data = rec_get_nth_field(ibuf_rec, 1, &len); + + if (len > 1) { + /* This is a < 4.1.x format record */ + + ut_a(trx_doublewrite_must_reset_space_ids); + ut_a(!trx_sys_multiple_tablespace_format); + + n_fields = rec_get_n_fields(ibuf_rec) - 2; + tuple = dtuple_create(heap, n_fields); + types = rec_get_nth_field(ibuf_rec, 1, &len); + + ut_a(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE); + + for (i = 0; i < n_fields; i++) { + field = dtuple_get_nth_field(tuple, i); + + data = rec_get_nth_field(ibuf_rec, i + 2, &len); + + dfield_set_data(field, data, len); + + dtype_read_for_order_and_null_size( + dfield_get_type(field), + types + i * DATA_ORDER_NULL_TYPE_BUF_SIZE); + } + + return(tuple); + } + + /* This is a >= 4.1.x format record */ + + ut_a(trx_sys_multiple_tablespace_format); + + ut_a(rec_get_n_fields(ibuf_rec) > 4); + + n_fields = rec_get_n_fields(ibuf_rec) - 4; tuple = dtuple_create(heap, n_fields); - types = rec_get_nth_field(ibuf_rec, 1, &len); + types = rec_get_nth_field(ibuf_rec, 3, &len); - ut_ad(len == n_fields * DATA_ORDER_NULL_TYPE_BUF_SIZE); + ut_a(len == n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); for 
(i = 0; i < n_fields; i++) { - field = dtuple_get_nth_field(tuple, i); + field = dtuple_get_nth_field(tuple, i); - data = rec_get_nth_field(ibuf_rec, i + 2, &len); + data = rec_get_nth_field(ibuf_rec, i + 4, &len); dfield_set_data(field, data, len); - dtype_read_for_order_and_null_size(dfield_get_type(field), - types + i * DATA_ORDER_NULL_TYPE_BUF_SIZE); + dtype_new_read_for_order_and_null_size( + dfield_get_type(field), + types + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); } return(tuple); } /************************************************************************* -Builds a search tuple used to search buffered inserts for an index page. */ +Builds a search tuple used to search buffered inserts for an index page. +This is for < 4.1.x format records */ static dtuple_t* ibuf_search_tuple_build( /*====================*/ /* out, own: search tuple */ + ulint space, /* in: space id */ ulint page_no,/* in: index page number */ mem_heap_t* heap) /* in: heap into which to build */ { @@ -1171,6 +1357,10 @@ ibuf_search_tuple_build( dfield_t* field; byte* buf; + ut_a(space == 0); + ut_a(trx_doublewrite_must_reset_space_ids); + ut_a(!trx_sys_multiple_tablespace_format); + tuple = dtuple_create(heap, 1); /* Store the page number in tuple */ @@ -1189,6 +1379,61 @@ ibuf_search_tuple_build( } /************************************************************************* +Builds a search tuple used to search buffered inserts for an index page. +This is for >= 4.1.x format records. 
*/ +static +dtuple_t* +ibuf_new_search_tuple_build( +/*========================*/ + /* out, own: search tuple */ + ulint space, /* in: space id */ + ulint page_no,/* in: index page number */ + mem_heap_t* heap) /* in: heap into which to build */ +{ + dtuple_t* tuple; + dfield_t* field; + byte* buf; + + ut_a(trx_sys_multiple_tablespace_format); + + tuple = dtuple_create(heap, 3); + + /* Store the space id in tuple */ + + field = dtuple_get_nth_field(tuple, 0); + + buf = mem_heap_alloc(heap, 4); + + mach_write_to_4(buf, space); + + dfield_set_data(field, buf, 4); + + /* Store the new format record marker byte */ + + field = dtuple_get_nth_field(tuple, 1); + + buf = mem_heap_alloc(heap, 1); + + mach_write_to_1(buf, 0); + + dfield_set_data(field, buf, 1); + + /* Store the page number in tuple */ + + field = dtuple_get_nth_field(tuple, 2); + + buf = mem_heap_alloc(heap, 4); + + mach_write_to_4(buf, page_no); + + dfield_set_data(field, buf, 4); + + dtuple_set_types_binary(tuple, 3); + + return(tuple); +} + +/************************************************************************* Checks if there are enough pages in the free list of the ibuf tree that we dare to start a pessimistic insert to the insert buffer. 
*/ UNIV_INLINE @@ -1253,6 +1498,8 @@ ibuf_add_free_page( page_t* root; page_t* bitmap_page; + ut_a(space == 0); + mtr_start(&mtr); /* Acquire the fsp latch before the ibuf header, obeying the latching @@ -1296,7 +1543,7 @@ ibuf_add_free_page( page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr); fil_page_set_type(page, FIL_PAGE_IBUF_FREE_LIST); - + ibuf_data->seg_size++; ibuf_data->free_list_len++; @@ -1307,7 +1554,6 @@ ibuf_add_free_page( ibuf_bitmap_page_set_bits(bitmap_page, page_no, IBUF_BITMAP_IBUF, TRUE, &mtr); - mtr_commit(&mtr); mutex_exit(&ibuf_mutex); @@ -1334,6 +1580,8 @@ ibuf_remove_free_page( page_t* root; page_t* bitmap_page; + ut_a(space == 0); + mtr_start(&mtr); /* Acquire the fsp latch before the ibuf header, obeying the latching @@ -1444,6 +1692,12 @@ ibuf_free_excess_pages( ibuf_data_t* ibuf_data; ulint i; + if (space != 0) { + fprintf(stderr, +"InnoDB: Error: calling ibuf_free_excess_pages for space %lu\n", space); + return; + } + ut_ad(rw_lock_own(fil_space_get_latch(space), RW_LOCK_EX)); ut_ad(rw_lock_get_x_lock_count(fil_space_get_latch(space)) == 1); ut_ad(!ibuf_inside()); @@ -1496,8 +1750,12 @@ ibuf_get_merge_page_nos( contract the tree, FALSE if this is called when a single page becomes full and we look if it pays to read also nearby pages */ - rec_t* first_rec,/* in: record from which we read down and - up in the chain of records */ + rec_t* first_rec,/* in: record from which we read up and down + in the chain of records */ + ulint* space_ids,/* in/out: space id's of the pages */ + ib_longlong* space_versions,/* in/out: tablespace version + timestamps; used to prevent reading in old + pages after DISCARD + IMPORT tablespace */ ulint* page_nos,/* in/out: buffer for at least IBUF_MAX_N_PAGES_MERGED many page numbers; the page numbers are in an ascending order */ @@ -1505,8 +1763,11 @@ ibuf_get_merge_page_nos( page_nos in this function */ { ulint prev_page_no; + ulint prev_space_id; ulint first_page_no; + ulint first_space_id; ulint 
rec_page_no; + ulint rec_space_id; rec_t* rec; ulint sum_volumes; ulint volume_for_page; @@ -1538,49 +1799,70 @@ ibuf_get_merge_page_nos( rec = first_rec; first_page_no = ibuf_rec_get_page_no(first_rec); + first_space_id = ibuf_rec_get_space(first_rec); n_pages = 0; prev_page_no = 0; + prev_space_id = 0; + /* Go backwards from the first_rec until we reach the border of the + 'merge area', or the page start or the limit of storeable pages is + reached */ + while ((rec != page_get_infimum_rec(page)) && (n_pages < limit)) { rec_page_no = ibuf_rec_get_page_no(rec); + rec_space_id = ibuf_rec_get_space(rec); - ut_ad(rec_page_no != 0); - - if (rec_page_no / IBUF_MERGE_AREA - != first_page_no / IBUF_MERGE_AREA) { + if (rec_space_id != first_space_id + || rec_page_no / IBUF_MERGE_AREA + != first_page_no / IBUF_MERGE_AREA) { break; } - if (rec_page_no != prev_page_no) { + if (rec_page_no != prev_page_no + || rec_space_id != prev_space_id) { n_pages++; } prev_page_no = rec_page_no; + prev_space_id = rec_space_id; rec = page_rec_get_prev(rec); } rec = page_rec_get_next(rec); + /* At the loop start there is no prev page; we mark this with a pair + of space id, page no (0, 0) for which there can never be entries in + the insert buffer */ + prev_page_no = 0; + prev_space_id = 0; sum_volumes = 0; volume_for_page = 0; while (*n_stored < limit) { if (rec == page_get_supremum_rec(page)) { + /* When no more records available, mark this with + another 'impossible' pair of space id, page no */ rec_page_no = 1; + rec_space_id = 0; } else { rec_page_no = ibuf_rec_get_page_no(rec); + rec_space_id = ibuf_rec_get_space(rec); ut_ad(rec_page_no > IBUF_TREE_ROOT_PAGE_NO); } #ifdef UNIV_IBUF_DEBUG ut_a(*n_stored < IBUF_MAX_N_PAGES_MERGED); #endif - if (rec_page_no != prev_page_no) { - if ((prev_page_no == first_page_no) + if ((rec_space_id != prev_space_id + || rec_page_no != prev_page_no) + && (prev_space_id != 0 || prev_page_no != 0)) { + + if ((prev_page_no == first_page_no + && prev_space_id 
== first_space_id) || contract || (volume_for_page > ((IBUF_MERGE_THRESHOLD - 1) @@ -1588,6 +1870,10 @@ ibuf_get_merge_page_nos( / IBUF_PAGE_SIZE_PER_FREE_SPACE) / IBUF_MERGE_THRESHOLD)) { + space_ids[*n_stored] = prev_space_id; + space_versions[*n_stored] + = fil_space_get_version( + prev_space_id); page_nos[*n_stored] = prev_page_no; (*n_stored)++; @@ -1595,8 +1881,9 @@ ibuf_get_merge_page_nos( sum_volumes += volume_for_page; } - if (rec_page_no / IBUF_MERGE_AREA - != first_page_no / IBUF_MERGE_AREA) { + if (rec_space_id != first_space_id + || rec_page_no / IBUF_MERGE_AREA + != first_page_no / IBUF_MERGE_AREA) { break; } @@ -1604,7 +1891,7 @@ ibuf_get_merge_page_nos( volume_for_page = 0; } - if (rec_page_no == 1) { + if (rec_page_no == 1 && rec_space_id == 0) { /* Supremum record */ break; @@ -1615,6 +1902,7 @@ ibuf_get_merge_page_nos( volume_for_page += rec_volume; prev_page_no = rec_page_no; + prev_space_id = rec_space_id; rec = page_rec_get_next(rec); } @@ -1647,6 +1935,8 @@ ibuf_contract_ext( ulint space; ibool all_trees_empty; ulint page_nos[IBUF_MAX_N_PAGES_MERGED]; + ulint space_ids[IBUF_MAX_N_PAGES_MERGED]; + ib_longlong space_versions[IBUF_MAX_N_PAGES_MERGED]; ulint n_stored; ulint sum_sizes; mtr_t mtr; @@ -1659,7 +1949,8 @@ loop: ut_ad(ibuf_validate_low()); - /* Choose an ibuf tree at random */ + /* Choose an ibuf tree at random (though there really is only one tree + in the current implementation) */ ibuf_rnd += 865558671; rnd_pos = ibuf_rnd % ibuf->size; @@ -1695,8 +1986,10 @@ loop: ut_ad(data); - space = (data->index)->space; + space = data->index->space; + ut_a(space == 0); /* We currently only have an ibuf tree in + space 0 */ mtr_start(&mtr); ibuf_enter(); @@ -1725,8 +2018,8 @@ loop: mutex_exit(&ibuf_mutex); sum_sizes = ibuf_get_merge_page_nos(TRUE, btr_pcur_get_rec(&pcur), - page_nos, &n_stored); - + space_ids, space_versions, page_nos, + &n_stored); #ifdef UNIV_IBUF_DEBUG /* printf("Ibuf contract sync %lu pages %lu volume %lu\n", sync, n_stored, 
sum_sizes); */ @@ -1736,8 +2029,8 @@ loop: mtr_commit(&mtr); btr_pcur_close(&pcur); - buf_read_ibuf_merge_pages(sync, space, page_nos, n_stored); - + buf_read_ibuf_merge_pages(sync, space_ids, space_versions, page_nos, + n_stored); *n_pages = n_stored; return(sum_sizes + 1); @@ -1866,6 +2159,8 @@ ibuf_get_volume_buffered( ulint next_page_no; page_t* next_page; + ut_a(trx_sys_multiple_tablespace_format); + ut_ad((pcur->latch_mode == BTR_MODIFY_PREV) || (pcur->latch_mode == BTR_MODIFY_TREE)); @@ -1888,7 +2183,8 @@ ibuf_get_volume_buffered( break; } - if (page_no != ibuf_rec_get_page_no(rec)) { + if (page_no != ibuf_rec_get_page_no(rec) + || space != ibuf_rec_get_space(rec)) { goto count_later; } @@ -1907,7 +2203,7 @@ ibuf_get_volume_buffered( goto count_later; } - prev_page = buf_page_get(space, prev_page_no, RW_X_LATCH, mtr); + prev_page = buf_page_get(0, prev_page_no, RW_X_LATCH, mtr); buf_page_dbg_add_level(prev_page, SYNC_TREE_NODE); @@ -1924,7 +2220,8 @@ ibuf_get_volume_buffered( return(UNIV_PAGE_SIZE); } - if (page_no != ibuf_rec_get_page_no(rec)) { + if (page_no != ibuf_rec_get_page_no(rec) + || space != ibuf_rec_get_space(rec)) { goto count_later; } @@ -1947,7 +2244,8 @@ count_later: break; } - if (page_no != ibuf_rec_get_page_no(rec)) { + if (page_no != ibuf_rec_get_page_no(rec) + || space != ibuf_rec_get_space(rec)) { return(volume); } @@ -1966,7 +2264,7 @@ count_later: return(volume); } - next_page = buf_page_get(space, next_page_no, RW_X_LATCH, mtr); + next_page = buf_page_get(0, next_page_no, RW_X_LATCH, mtr); buf_page_dbg_add_level(next_page, SYNC_TREE_NODE); @@ -1981,7 +2279,8 @@ count_later: return(UNIV_PAGE_SIZE); } - if (page_no != ibuf_rec_get_page_no(rec)) { + if (page_no != ibuf_rec_get_page_no(rec) + || space != ibuf_rec_get_space(rec)) { return(volume); } @@ -1993,6 +2292,57 @@ count_later: } /************************************************************************* +Reads the biggest tablespace id from the high end of the insert buffer +tree 
and updates the counter in fil_system. */ + +void +ibuf_update_max_tablespace_id(void) +/*===============================*/ +{ + ulint max_space_id; + rec_t* rec; + byte* field; + ulint len; + ibuf_data_t* ibuf_data; + dict_index_t* ibuf_index; + btr_pcur_t pcur; + mtr_t mtr; + + ibuf_data = fil_space_get_ibuf_data(0); + + ibuf_index = ibuf_data->index; + + ibuf_enter(); + + mtr_start(&mtr); + + btr_pcur_open_at_index_side(FALSE, ibuf_index, BTR_SEARCH_LEAF, + &pcur, TRUE, &mtr); + btr_pcur_move_to_prev(&pcur, &mtr); + + if (btr_pcur_is_before_first_on_page(&pcur, &mtr)) { + /* The tree is empty */ + + max_space_id = 0; + } else { + rec = btr_pcur_get_rec(&pcur); + + field = rec_get_nth_field(rec, 0, &len); + + ut_a(len == 4); + + max_space_id = mach_read_from_4(field); + } + + mtr_commit(&mtr); + ibuf_exit(); + + /* printf("Maximum space id in insert buffer %lu\n", max_space_id); */ + + fil_set_max_space_id_if_bigger(max_space_id); +} + +/************************************************************************* Makes an index insert to the insert buffer, instead of directly to the disk page, if this is possible. 
*/ static @@ -2012,8 +2362,6 @@ ibuf_insert_low( ulint entry_size; btr_pcur_t pcur; btr_cur_t* cursor; - mtr_t mtr; - mtr_t bitmap_mtr; dtuple_t* ibuf_entry; mem_heap_t* heap; ulint buffered; @@ -2025,16 +2373,25 @@ ibuf_insert_low( page_t* root; ulint err; ibool do_merge; + ulint space_ids[IBUF_MAX_N_PAGES_MERGED]; + ib_longlong space_versions[IBUF_MAX_N_PAGES_MERGED]; ulint page_nos[IBUF_MAX_N_PAGES_MERGED]; ulint n_stored; ulint bits; + mtr_t mtr; + mtr_t bitmap_mtr; ut_a(!(index->type & DICT_CLUSTERED)); ut_ad(dtuple_check_typed(entry)); + ut_a(trx_sys_multiple_tablespace_format); + do_merge = FALSE; - - ibuf_data = fil_space_get_ibuf_data(space); + + /* Currently the insert buffer of space 0 takes care of inserts to all + tablespaces */ + + ibuf_data = fil_space_get_ibuf_data(0); ibuf_index = ibuf_data->index; @@ -2061,7 +2418,7 @@ ibuf_insert_low( mutex_enter(&ibuf_pessimistic_insert_mutex); ibuf_enter(); - + mutex_enter(&ibuf_mutex); while (!ibuf_data_enough_free_for_insert(ibuf_data)) { @@ -2072,7 +2429,7 @@ ibuf_insert_low( mutex_exit(&ibuf_pessimistic_insert_mutex); - err = ibuf_add_free_page(space, ibuf_data); + err = ibuf_add_free_page(0, ibuf_data); if (err == DB_STRONG_FAIL) { @@ -2097,7 +2454,7 @@ ibuf_insert_low( the first fields and the type information for other fields, and which will be inserted to the insert buffer. 
*/ - ibuf_entry = ibuf_entry_build(entry, page_no, heap); + ibuf_entry = ibuf_entry_build(entry, space, page_no, heap); /* Open a cursor to the insert buffer tree to calculate if we can add the new entry to it without exceeding the free space limit for the @@ -2122,7 +2479,6 @@ ibuf_insert_low( if (buf_page_peek(space, page_no) || lock_rec_expl_exist_on_page(space, page_no)) { - err = DB_STRONG_FAIL; mtr_commit(&bitmap_mtr); @@ -2135,7 +2491,6 @@ ibuf_insert_low( if (buffered + entry_size + page_dir_calc_reserved_space(1) > ibuf_index_page_calc_free_from_bits(bits)) { - mtr_commit(&bitmap_mtr); /* It may not fit */ @@ -2144,7 +2499,8 @@ ibuf_insert_low( do_merge = TRUE; ibuf_get_merge_page_nos(FALSE, btr_pcur_get_rec(&pcur), - page_nos, &n_stored); + space_ids, space_versions, page_nos, + &n_stored); goto function_exit; } @@ -2180,10 +2536,10 @@ ibuf_insert_low( which would cause the x-latching of the root after that to break the latching order. */ - root = ibuf_tree_root_get(ibuf_data, space, &mtr); + root = ibuf_tree_root_get(ibuf_data, 0, &mtr); err = btr_cur_pessimistic_insert(BTR_NO_LOCKING_FLAG - | BTR_NO_UNDO_LOG_FLAG, + | BTR_NO_UNDO_LOG_FLAG, cursor, ibuf_entry, &ins_rec, &dummy_big_rec, thr, @@ -2200,6 +2556,10 @@ ibuf_insert_low( function_exit: #ifdef UNIV_IBUF_DEBUG if (err == DB_SUCCESS) { + printf( +"Incrementing ibuf count of space %lu page %lu\n" +"from %lu by 1\n", space, page_no, ibuf_count_get(space, page_no)); + ibuf_count_set(space, page_no, ibuf_count_get(space, page_no) + 1); } @@ -2234,7 +2594,8 @@ function_exit: #ifdef UNIV_IBUF_DEBUG ut_a(n_stored <= IBUF_MAX_N_PAGES_MERGED); #endif - buf_read_ibuf_merge_pages(FALSE, space, page_nos, n_stored); + buf_read_ibuf_merge_pages(FALSE, space_ids, space_versions, + page_nos, n_stored); } return(err); @@ -2257,6 +2618,7 @@ ibuf_insert( { ulint err; + ut_a(trx_sys_multiple_tablespace_format); ut_ad(dtuple_check_typed(entry)); ut_a(!(index->type & DICT_CLUSTERED)); @@ -2309,6 +2671,26 @@ 
ibuf_insert_to_index_page( ut_ad(ibuf_inside()); ut_ad(dtuple_check_typed(entry)); + if (rec_get_n_fields(page_rec_get_next(page_get_infimum_rec(page))) + != dtuple_get_n_fields(entry)) { + + fprintf(stderr, +"InnoDB: Trying to insert a record from the insert buffer to an index page\n" +"InnoDB: but the number of fields does not match!\n%s\n", errbuf); + + buf_page_print(page); + + dtuple_sprintf(errbuf, 900, entry); + + fprintf(stderr, +"InnoDB: The table where where this index record belongs\n" +"InnoDB: is now probably corrupt. Please run CHECK TABLE on\n" +"InnoDB: your tables.\n" +"InnoDB: Send a detailed bug report to mysql@lists.mysql.com!\n"); + + return; + } + low_match = page_cur_search(page, entry, PAGE_CUR_LE, &page_cur); if (low_match == dtuple_get_n_fields(entry)) { @@ -2338,18 +2720,14 @@ ibuf_insert_to_index_page( dtuple_sprintf(errbuf, 900, entry); fprintf(stderr, -"InnoDB: Cannot insert index record %s\n", errbuf); - - fprintf(stderr, +"InnoDB: Cannot insert index record %s\n" "InnoDB: The table where where this index record belongs\n" "InnoDB: is now probably corrupt. Please run CHECK TABLE on\n" -"InnoDB: that table.\n"); - +"InnoDB: that table.\n", errbuf); bitmap_page = ibuf_bitmap_get_map_page( buf_frame_get_space_id(page), buf_frame_get_page_no(page), mtr); - old_bits = ibuf_bitmap_page_get_bits( bitmap_page, buf_frame_get_page_no(page), @@ -2359,12 +2737,11 @@ ibuf_insert_to_index_page( fprintf(stderr, "InnoDB: Send a detailed bug report to mysql@lists.mysql.com!\n"); - } } } } - + /************************************************************************* Deletes from ibuf the record on which pcur is positioned. 
If we have to resort to a pessimistic delete, this function commits mtr and closes @@ -2388,13 +2765,16 @@ ibuf_delete_rec( ibuf_data_t* ibuf_data; page_t* root; ulint err; - + ut_ad(ibuf_inside()); success = btr_cur_optimistic_delete(btr_pcur_get_btr_cur(pcur), mtr); if (success) { #ifdef UNIV_IBUF_DEBUG + printf( +"Decrementing ibuf count of space %lu page %lu\n" +"from %lu by 1\n", space, page_no, ibuf_count_get(space, page_no)); ibuf_count_set(space, page_no, ibuf_count_get(space, page_no) - 1); #endif @@ -2406,7 +2786,10 @@ ibuf_delete_rec( btr_pcur_commit_specify_mtr(pcur, mtr); - ibuf_data = fil_space_get_ibuf_data(space); + /* Currently the insert buffer of space 0 takes care of inserts to all + tablespaces */ + + ibuf_data = fil_space_get_ibuf_data(0); mutex_enter(&ibuf_mutex); @@ -2416,10 +2799,9 @@ ibuf_delete_rec( if (!success) { fprintf(stderr, - "InnoDB: ERROR: Send the output to heikki.tuuri@innodb.com\n"); - fprintf(stderr, "InnoDB: ibuf cursor restoration fails!\n"); - fprintf(stderr, "InnoDB: ibuf record inserted to page %lu\n", - page_no); +"InnoDB: ERROR: Send the output to mysql@lists.mysql.com\n" +"InnoDB: ibuf cursor restoration fails!\n" +"InnoDB: ibuf record inserted to space %lu page %lu\n", space, page_no); fflush(stderr); rec_print(btr_pcur_get_rec(pcur)); @@ -2429,18 +2811,23 @@ ibuf_delete_rec( rec_print(page_rec_get_next(btr_pcur_get_rec(pcur))); fflush(stdout); - mtr_commit(mtr); + btr_pcur_commit_specify_mtr(pcur, mtr); - fprintf(stderr, "InnoDB: Validating insert buffer tree:\n"); + fprintf(stderr, + "InnoDB: Validating insert buffer tree:\n"); ut_a(btr_validate_tree(ibuf_data->index->tree)); fprintf(stderr, "InnoDB: ibuf tree ok\n"); fflush(stderr); + + btr_pcur_close(pcur); + + mutex_exit(&ibuf_mutex); + + return(TRUE); } - - ut_a(success); - root = ibuf_tree_root_get(ibuf_data, space, mtr); + root = ibuf_tree_root_get(ibuf_data, 0, mtr); btr_cur_pessimistic_delete(&err, TRUE, btr_pcur_get_btr_cur(pcur), FALSE, mtr); @@ -2476,7 
+2863,11 @@ ibuf_merge_or_delete_for_page( page_t* page, /* in: if page has been read from disk, pointer to the page x-latched, else NULL */ ulint space, /* in: space id of the index page */ - ulint page_no)/* in: page number of the index page */ + ulint page_no,/* in: page number of the index page */ + ibool update_ibuf_bitmap)/* in: normally this is set to TRUE, but if + we have deleted or are deleting the tablespace, then we + naturally do not want to update a non-existent bitmap + page */ { mem_heap_t* heap; btr_pcur_t pcur; @@ -2493,6 +2884,7 @@ ibuf_merge_or_delete_for_page( ulint old_bits; ulint new_bits; dulint max_trx_id; + ibool tablespace_being_deleted = FALSE; ibool corruption_noticed = FALSE; mtr_t mtr; char err_buf[500]; @@ -2501,7 +2893,7 @@ ibuf_merge_or_delete_for_page( return; } - + #ifdef UNIV_LOG_DEBUG if (space % 2 != 0) { @@ -2515,28 +2907,57 @@ ibuf_merge_or_delete_for_page( return; } - mtr_start(&mtr); + if (update_ibuf_bitmap) { + /* If the following returns FALSE, we get the counter + incremented, and must decrement it when we leave this + function. When the counter is > 0, that prevents tablespace + from being dropped. 
*/ - bitmap_page = ibuf_bitmap_get_map_page(space, page_no, &mtr); + tablespace_being_deleted = fil_inc_pending_ibuf_merges(space); + + if (tablespace_being_deleted) { + /* Do not try to read the bitmap page from space; + just delete the ibuf records for the page */ + + page = NULL; + update_ibuf_bitmap = FALSE; + } + } - if (!ibuf_bitmap_page_get_bits(bitmap_page, page_no, + if (update_ibuf_bitmap) { + mtr_start(&mtr); + bitmap_page = ibuf_bitmap_get_map_page(space, page_no, &mtr); + + if (!ibuf_bitmap_page_get_bits(bitmap_page, page_no, IBUF_BITMAP_BUFFERED, &mtr)) { - /* No inserts buffered for this page */ + /* No inserts buffered for this page */ + mtr_commit(&mtr); - mtr_commit(&mtr); + if (!tablespace_being_deleted) { + fil_decr_pending_ibuf_merges(space); + } - return; + return; + } + mtr_commit(&mtr); } - mtr_commit(&mtr); + /* Currently the insert buffer of space 0 takes care of inserts to all + tablespaces */ - ibuf_data = fil_space_get_ibuf_data(space); + ibuf_data = fil_space_get_ibuf_data(0); ibuf_enter(); heap = mem_heap_create(512); - search_tuple = ibuf_search_tuple_build(page_no, heap); + if (!trx_sys_multiple_tablespace_format) { + ut_a(trx_doublewrite_must_reset_space_ids); + search_tuple = ibuf_search_tuple_build(space, page_no, heap); + } else { + search_tuple = ibuf_new_search_tuple_build(space, page_no, + heap); + } if (page) { /* Move the ownership of the x-latch on the page to this OS @@ -2592,7 +3013,6 @@ loop: IB__FILE__, __LINE__, &mtr); ut_a(success); - buf_page_dbg_add_level(page, SYNC_TREE_NODE); } @@ -2600,7 +3020,6 @@ loop: index page */ btr_pcur_open_on_user_rec(ibuf_data->index, search_tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF, &pcur, &mtr); - if (!btr_pcur_is_on_user_rec(&pcur, &mtr)) { ut_ad(btr_pcur_is_after_last_in_tree(&pcur, &mtr)); @@ -2613,21 +3032,18 @@ loop: ibuf_rec = btr_pcur_get_rec(&pcur); /* Check if the entry is for this index page */ - if (ibuf_rec_get_page_no(ibuf_rec) != page_no) { - + if 
(ibuf_rec_get_page_no(ibuf_rec) != page_no + || ibuf_rec_get_space(ibuf_rec) != space) { if (page) { page_header_reset_last_insert(page, &mtr); } - goto reset_bit; } if (corruption_noticed) { rec_sprintf(err_buf, 450, ibuf_rec); - fprintf(stderr, "InnoDB: Discarding record\n %s\n from the insert buffer!\n\n", err_buf); - } else if (page) { /* Now we have at pcur a record which should be inserted to the index page; NOTE that the call below @@ -2637,14 +3053,12 @@ loop: max_trx_id = page_get_max_trx_id( buf_frame_align(ibuf_rec)); - page_update_max_trx_id(page, max_trx_id); entry = ibuf_build_entry_from_ibuf_rec(ibuf_rec, heap); #ifdef UNIV_IBUF_DEBUG volume += rec_get_converted_size(entry) + page_dir_calc_reserved_space(1); - ut_a(volume <= 4 * UNIV_PAGE_SIZE / IBUF_PAGE_SIZE_PER_FREE_SPACE); #endif @@ -2672,43 +3086,38 @@ loop: } reset_bit: - #ifdef UNIV_IBUF_DEBUG if (ibuf_count_get(space, page_no) > 0) { - /* btr_print_tree(ibuf_data->index->tree, 100); ibuf_print(); */ } #endif - bitmap_page = ibuf_bitmap_get_map_page(space, page_no, &mtr); - - ibuf_bitmap_page_set_bits(bitmap_page, page_no, + if (update_ibuf_bitmap) { + bitmap_page = ibuf_bitmap_get_map_page(space, page_no, &mtr); + ibuf_bitmap_page_set_bits(bitmap_page, page_no, IBUF_BITMAP_BUFFERED, FALSE, &mtr); - if (page) { - old_bits = ibuf_bitmap_page_get_bits(bitmap_page, page_no, - IBUF_BITMAP_FREE, &mtr); - new_bits = ibuf_index_page_calc_free(page); - + if (page) { + old_bits = ibuf_bitmap_page_get_bits(bitmap_page, + page_no, IBUF_BITMAP_FREE, &mtr); + new_bits = ibuf_index_page_calc_free(page); #ifdef UNIV_IBUF_DEBUG - /* printf("Old bits %lu new bits %lu max size %lu\n", old_bits, - new_bits, + /* printf("Old bits %lu new bits %lu max size %lu\n", + old_bits, new_bits, page_get_max_insert_size_after_reorganize(page, 1)); */ #endif - if (old_bits != new_bits) { - - ibuf_bitmap_page_set_bits(bitmap_page, page_no, + if (old_bits != new_bits) { + ibuf_bitmap_page_set_bits(bitmap_page, page_no, 
IBUF_BITMAP_FREE, new_bits, &mtr); + } } } - #ifdef UNIV_IBUF_DEBUG /* printf("Ibuf merge %lu records volume %lu to page no %lu\n", n_inserts, volume, page_no); */ #endif mtr_commit(&mtr); btr_pcur_close(&pcur); - mem_heap_free(heap); /* Protect our statistics keeping from race conditions */ @@ -2719,12 +3128,122 @@ reset_bit: mutex_exit(&ibuf_mutex); + if (update_ibuf_bitmap && !tablespace_being_deleted) { + + fil_decr_pending_ibuf_merges(space); + } + ibuf_exit(); #ifdef UNIV_IBUF_DEBUG ut_a(ibuf_count_get(space, page_no) == 0); #endif } +/************************************************************************* +Deletes all entries in the insert buffer for a given space id. This is used +in DISCARD TABLESPACE and IMPORT TABLESPACE. +NOTE: this does not update the page free bitmaps in the space. The space will +become CORRUPT when you call this function! */ + +void +ibuf_delete_for_discarded_space( +/*============================*/ + ulint space) /* in: space id */ +{ + mem_heap_t* heap; + btr_pcur_t pcur; + dtuple_t* search_tuple; + rec_t* ibuf_rec; + ulint page_no; + ibool closed; + ibuf_data_t* ibuf_data; + ulint n_inserts; + mtr_t mtr; + + /* Currently the insert buffer of space 0 takes care of inserts to all + tablespaces */ + + ibuf_data = fil_space_get_ibuf_data(0); + + heap = mem_heap_create(512); + + /* Use page number 0 to build the search tuple so that we get the + cursor positioned at the first entry for this space id */ + + search_tuple = ibuf_new_search_tuple_build(space, 0, heap); + + n_inserts = 0; +loop: + ibuf_enter(); + + mtr_start(&mtr); + + /* Position pcur in the insert buffer at the first entry for the + space */ + btr_pcur_open_on_user_rec(ibuf_data->index, search_tuple, PAGE_CUR_GE, + BTR_MODIFY_LEAF, &pcur, &mtr); + if (!btr_pcur_is_on_user_rec(&pcur, &mtr)) { + ut_ad(btr_pcur_is_after_last_in_tree(&pcur, &mtr)); + + goto leave_loop; + } + + for (;;) { + ut_ad(btr_pcur_is_on_user_rec(&pcur, &mtr)); + + ibuf_rec = btr_pcur_get_rec(&pcur); 
+ + /* Check if the entry is for this space */ + if (ibuf_rec_get_space(ibuf_rec) != space) { + + goto leave_loop; + } + + page_no = ibuf_rec_get_page_no(ibuf_rec); + + n_inserts++; + + /* Delete the record from ibuf */ + closed = ibuf_delete_rec(space, page_no, &pcur, search_tuple, + &mtr); + if (closed) { + /* Deletion was pessimistic and mtr was committed: + we start from the beginning again */ + + ibuf_exit(); + + goto loop; + } + + if (btr_pcur_is_after_last_on_page(&pcur, &mtr)) { + mtr_commit(&mtr); + btr_pcur_close(&pcur); + + ibuf_exit(); + + goto loop; + } + } + +leave_loop: + mtr_commit(&mtr); + btr_pcur_close(&pcur); + + /* Protect our statistics keeping from race conditions */ + mutex_enter(&ibuf_mutex); + + ibuf_data->n_merges++; + ibuf_data->n_merged_recs += n_inserts; + + mutex_exit(&ibuf_mutex); + + printf("Discarded %lu ibuf entries for space %lu\n", n_inserts, space); + + ibuf_exit(); + + mem_heap_free(heap); +} + /********************************************************************** Validates the ibuf data structures when the caller owns ibuf_mutex. */ @@ -2754,6 +3273,56 @@ ibuf_validate_low(void) } /********************************************************************** +Looks if the insert buffer is empty. */ + +ibool +ibuf_is_empty(void) +/*===============*/ + /* out: TRUE if empty */ +{ + ibuf_data_t* data; + ibool is_empty; + page_t* root; + mtr_t mtr; + + ibuf_enter(); + + mutex_enter(&ibuf_mutex); + + data = UT_LIST_GET_FIRST(ibuf->data_list); + + mtr_start(&mtr); + + root = ibuf_tree_root_get(data, 0, &mtr); + + if (page_get_n_recs(root) == 0) { + + is_empty = TRUE; + + if (data->empty == FALSE) { + fprintf(stderr, +"InnoDB: Warning: insert buffer tree is empty but the data struct does not\n" +"InnoDB: know it. 
This condition is legal if the master thread has not yet\n" +"InnoDB: run to completion.\n"); + } + } else { + ut_a(data->empty == FALSE); + + is_empty = FALSE; + } + + mtr_commit(&mtr); + + ut_a(data->space == 0); + + mutex_exit(&ibuf_mutex); + + ibuf_exit(); + + return(is_empty); +} + +/********************************************************************** Prints info of ibuf. */ void @@ -2776,9 +3345,15 @@ ibuf_print( while (data) { buf += sprintf(buf, - "Ibuf for space %lu: size %lu, free list len %lu, seg size %lu,\n", + "Ibuf for space %lu: size %lu, free list len %lu, seg size %lu,", data->space, data->size, data->free_list_len, data->seg_size); + if (data->empty) { + buf += sprintf(buf, " is empty\n"); + } else { + buf += sprintf(buf, " is not empty\n"); + } + buf += sprintf(buf, "%lu inserts, %lu merged recs, %lu merges\n", data->n_inserts, data->n_merged_recs, data->n_merges); diff --git a/innobase/include/btr0btr.ic b/innobase/include/btr0btr.ic index 09006828cc9..16057d2c8a6 100644 --- a/innobase/include/btr0btr.ic +++ b/innobase/include/btr0btr.ic @@ -189,6 +189,7 @@ btr_node_ptr_get_child_page_no( ulint n_fields; byte* field; ulint len; + ulint page_no; n_fields = rec_get_n_fields(rec); @@ -197,7 +198,16 @@ btr_node_ptr_get_child_page_no( ut_ad(len == 4); - return(mach_read_from_4(field)); + page_no = mach_read_from_4(field); + + if (page_no == 0) { + fprintf(stderr, +"InnoDB: a nonsensical page number 0 in a node ptr record at offset %lu\n", + (ulint)(rec - buf_frame_align(rec))); + buf_page_print(buf_frame_align(rec)); + } + + return(page_no); } /****************************************************************** diff --git a/innobase/include/btr0pcur.ic b/innobase/include/btr0pcur.ic index a1db2cc52dd..b553a569bda 100644 --- a/innobase/include/btr0pcur.ic +++ b/innobase/include/btr0pcur.ic @@ -564,7 +564,7 @@ btr_pcur_open_at_index_side( } btr_cur_open_at_index_side(from_left, index, latch_mode, - btr_pcur_get_btr_cur(pcur), mtr); + 
btr_pcur_get_btr_cur(pcur), mtr); pcur->pos_state = BTR_PCUR_IS_POSITIONED; pcur->old_stored = BTR_PCUR_OLD_NOT_STORED; diff --git a/innobase/include/buf0buf.h b/innobase/include/buf0buf.h index d2ee1a440c7..0457da60534 100644 --- a/innobase/include/buf0buf.h +++ b/innobase/include/buf0buf.h @@ -626,19 +626,27 @@ buf_pool_get_nth_block( ulint i); /* in: index of the block */ /************************************************************************ Function which inits a page for read to the buffer buf_pool. If the page is -already in buf_pool, does nothing. Sets the io_fix flag to BUF_IO_READ and -sets a non-recursive exclusive lock on the buffer frame. The io-handler must -take care that the flag is cleared and the lock released later. This is one -of the functions which perform the state transition NOT_USED => FILE_PAGE to -a block (the other is buf_page_create). */ +(1) already in buf_pool, or +(2) if we specify to read only ibuf pages and the page is not an ibuf page, or +(3) if the space is deleted or being deleted, +then this function does nothing. +Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock +on the buffer frame. The io-handler must take care that the flag is cleared +and the lock released later. This is one of the functions which perform the +state transition NOT_USED => FILE_PAGE to a block (the other is +buf_page_create). */ buf_block_t* buf_page_init_for_read( /*===================*/ - /* out: pointer to the block */ - ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ... */ - ulint space, /* in: space id */ - ulint offset);/* in: page number */ + /* out: pointer to the block or NULL */ + ulint* err, /* out: DB_SUCCESS or DB_TABLESPACE_DELETED */ + ulint mode, /* in: BUF_READ_IBUF_PAGES_ONLY, ... 
*/ + ulint space, /* in: space id */ + ib_longlong tablespace_version,/* in: prevents reading from a wrong + version of the tablespace in case we have done + DISCARD + IMPORT */ + ulint offset);/* in: page number */ /************************************************************************ Completes an asynchronous read or write request of a file page to or from the buffer pool. */ diff --git a/innobase/include/buf0lru.h b/innobase/include/buf0lru.h index 5c995b259bf..69a376f8cab 100644 --- a/innobase/include/buf0lru.h +++ b/innobase/include/buf0lru.h @@ -37,6 +37,16 @@ These are low-level functions #define BUF_LRU_FREE_SEARCH_LEN (5 + 2 * BUF_READ_AHEAD_AREA) /********************************************************************** +Invalidates all pages belonging to a given tablespace when we are deleting +the data file(s) of that tablespace. A PROBLEM: if readahead is being started, +what guarantees that it will not try to read in pages after this operation has +completed? */ + +void +buf_LRU_invalidate_tablespace( +/*==========================*/ + ulint id); /* in: space id */ +/********************************************************************** Gets the minimum LRU_position field for the blocks in an initial segment (determined by BUF_LRU_INITIAL_RATIO) of the LRU list. The limit is not guaranteed to be precise, because the ulint_clock may wrap around. */ diff --git a/innobase/include/buf0rea.h b/innobase/include/buf0rea.h index aed965a6b21..380a42f4b80 100644 --- a/innobase/include/buf0rea.h +++ b/innobase/include/buf0rea.h @@ -59,7 +59,7 @@ buf_read_ahead_linear( must want access to this page (see NOTE 3 above) */ /************************************************************************ Issues read requests for pages which the ibuf module wants to read in, in -order to contract insert buffer trees. Technically, this function is like +order to contract the insert buffer tree. Technically, this function is like a read-ahead function. 
*/ void @@ -68,9 +68,14 @@ buf_read_ibuf_merge_pages( ibool sync, /* in: TRUE if the caller wants this function to wait for the highest address page to get read in, before this function returns */ - ulint space, /* in: space id */ - ulint* page_nos, /* in: array of page numbers to read, with - the highest page number last in the array */ + ulint* space_ids, /* in: array of space ids */ + ib_longlong* space_versions,/* in: the spaces must have this version + number (timestamp), otherwise we discard the + read; we use this to cancel reads if + DISCARD + IMPORT may have changed the + tablespace size */ + ulint* page_nos, /* in: array of page numbers to read, with the + highest page number the last in the array */ ulint n_stored); /* in: number of page numbers in the array */ /************************************************************************ Issues read requests for pages which recovery wants to read in. */ diff --git a/innobase/include/data0type.h b/innobase/include/data0type.h index 4da686bf2e1..f202230bb94 100644 --- a/innobase/include/data0type.h +++ b/innobase/include/data0type.h @@ -89,6 +89,8 @@ be less than 256 */ alphabetical order for a single field and decide the storage size of an SQL null*/ #define DATA_ORDER_NULL_TYPE_BUF_SIZE 4 +/* In the >= 4.1.x storage format we need 2 bytes more for the charset */ +#define DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE 6 /************************************************************************* Sets a data type structure. */ @@ -172,24 +174,36 @@ dtype_is_fixed_size( /* out: TRUE if fixed size */ dtype_t* type); /* in: type */ /************************************************************************** +Reads to a type the stored information which determines its alphabetical +ordering and the storage size of an SQL NULL value. 
*/ +UNIV_INLINE +void +dtype_read_for_order_and_null_size( +/*===============================*/ + dtype_t* type, /* in: type struct */ + byte* buf); /* in: buffer for the stored order info */ +/************************************************************************** Stores for a type the information which determines its alphabetical ordering -and the storage size of an SQL NULL value. */ +and the storage size of an SQL NULL value. This is the >= 4.1.x storage +format. */ UNIV_INLINE void -dtype_store_for_order_and_null_size( -/*================================*/ - byte* buf, /* in: buffer for DATA_ORDER_NULL_TYPE_BUF_SIZE +dtype_new_store_for_order_and_null_size( +/*====================================*/ + byte* buf, /* in: buffer for + DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE bytes where we store the info */ dtype_t* type); /* in: type struct */ /************************************************************************** Reads to a type the stored information which determines its alphabetical -ordering and the storage size of an SQL NULL value. */ +ordering and the storage size of an SQL NULL value. This is the 4.1.x storage +format. */ UNIV_INLINE void -dtype_read_for_order_and_null_size( -/*===============================*/ +dtype_new_read_for_order_and_null_size( +/*===================================*/ dtype_t* type, /* in: type struct */ - byte* buf); /* in: buffer for the stored order info */ + byte* buf); /* in: buffer for stored type order info */ /************************************************************************* Validates a data type structure. 
*/ @@ -211,6 +225,7 @@ dtype_print( struct dtype_struct{ ulint mtype; /* main data type */ ulint prtype; /* precise type; MySQL data type */ + ulint chrset; /* MySQL character set code */ /* remaining two fields do not affect alphabetical ordering: */ diff --git a/innobase/include/data0type.ic b/innobase/include/data0type.ic index ddd0b0ae8cc..5d39b3e430b 100644 --- a/innobase/include/data0type.ic +++ b/innobase/include/data0type.ic @@ -27,6 +27,7 @@ dtype_set( type->prtype = prtype; type->len = len; type->prec = prec; + type->chrset = 0; ut_ad(dtype_validate(type)); } @@ -127,18 +128,20 @@ dtype_get_pad_char( /************************************************************************** Stores for a type the information which determines its alphabetical ordering -and the storage size of an SQL NULL value. */ +and the storage size of an SQL NULL value. This is the >= 4.1.x storage +format. */ UNIV_INLINE void -dtype_store_for_order_and_null_size( -/*================================*/ - byte* buf, /* in: buffer for DATA_ORDER_NULL_TYPE_BUF_SIZE +dtype_new_store_for_order_and_null_size( +/*====================================*/ + byte* buf, /* in: buffer for + DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE bytes where we store the info */ dtype_t* type) /* in: type struct */ { - ut_ad(4 == DATA_ORDER_NULL_TYPE_BUF_SIZE); + ut_ad(6 == DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); - buf[0] = (byte)(type->mtype & 0xFF); + buf[0] = (byte)(type->mtype & 0xFFUL); if (type->prtype & DATA_BINARY_TYPE) { buf[0] = buf[0] | 128; @@ -148,9 +151,11 @@ dtype_store_for_order_and_null_size( buf[0] = buf[0] | 64; } - buf[1] = (byte)(type->prtype & 0xFF); + buf[1] = (byte)(type->prtype & 0xFFUL); + + mach_write_to_2(buf + 2, type->len & 0xFFFFUL); - mach_write_to_2(buf + 2, type->len & 0xFFFF); + mach_write_to_2(buf + 4, type->chrset & 0xFFFFUL); } /************************************************************************** @@ -179,6 +184,35 @@ dtype_read_for_order_and_null_size( type->len = 
mach_read_from_2(buf + 2); } +/************************************************************************** +Reads to a type the stored information which determines its alphabetical +ordering and the storage size of an SQL NULL value. This is the 4.1.x storage +format. */ +UNIV_INLINE +void +dtype_new_read_for_order_and_null_size( +/*===================================*/ + dtype_t* type, /* in: type struct */ + byte* buf) /* in: buffer for stored type order info */ +{ + ut_ad(6 == DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE); + + type->mtype = buf[0] & 63; + type->prtype = buf[1]; + + if (buf[0] & 128) { + type->prtype = type->prtype | DATA_BINARY_TYPE; + } + + if (buf[0] & 64) { + type->prtype = type->prtype | DATA_NONLATIN1; + } + + type->len = mach_read_from_2(buf + 2); + + type->chrset = mach_read_from_2(buf + 4); +} + /*************************************************************************** Returns the size of a fixed size data type, 0 if not a fixed size type. */ UNIV_INLINE diff --git a/innobase/include/db0err.h b/innobase/include/db0err.h index 854b9794c00..be7667bfd0c 100644 --- a/innobase/include/db0err.h +++ b/innobase/include/db0err.h @@ -48,6 +48,11 @@ Created 5/24/1996 Heikki Tuuri from a table failed */ #define DB_NO_SAVEPOINT 42 /* no savepoint exists with the given name */ +#define DB_TABLESPACE_ALREADY_EXISTS 43 /* we cannot create a new single-table + tablespace because a file of the same + name already exists */ +#define DB_TABLESPACE_DELETED 44 /* tablespace does not exist or is + being dropped right now */ /* The following are partial failure codes */ #define DB_FAIL 1000 diff --git a/innobase/include/dict0boot.h b/innobase/include/dict0boot.h index cb631be7e35..35eff5af29a 100644 --- a/innobase/include/dict0boot.h +++ b/innobase/include/dict0boot.h @@ -93,7 +93,7 @@ dict_create(void); indexes; ibuf tables and indexes are assigned as the id the number DICT_IBUF_ID_MIN plus the space id */ -#define DICT_IBUF_ID_MIN ut_dulint_create(0xFFFFFFFF, 0) 
+#define DICT_IBUF_ID_MIN ut_dulint_create(0xFFFFFFFFUL, 0) /* The offset of the dictionary header on the page */ #define DICT_HDR FSEG_PAGE_DATA diff --git a/innobase/include/dict0dict.h b/innobase/include/dict0dict.h index b5ec5381db2..234dece2cda 100644 --- a/innobase/include/dict0dict.h +++ b/innobase/include/dict0dict.h @@ -51,6 +51,16 @@ Inits the data dictionary module. */ void dict_init(void); /*===========*/ +/************************************************************************ +Gets the space id of every table of the data dictionary and makes a linear +list and a hash table of them to the data dictionary cache. This function +can be called at database startup if we did not need to do a crash recovery. +In crash recovery we must scan the space id's from the .ibd files in MySQL +database directories. */ + +void +dict_load_space_id_list(void); +/*=========================*/ /************************************************************************** Returns a stored procedure object and memoryfixes it. */ UNIV_INLINE @@ -187,6 +197,15 @@ dict_table_rename_in_cache( to preserve the original table name in constraints which reference it */ /************************************************************************** +Change the id of a table object in the dictionary cache. This is used in +DISCARD TABLESPACE. */ + +void +dict_table_change_id_in_cache( +/*==========================*/ + dict_table_t* table, /* in: table object already in cache */ + dulint new_id);/* in: new id to set */ +/************************************************************************** Adds a foreign key constraint object to the dictionary cache. May free the object if there already is an object with the same identifier in. 
At least one of foreign table or referenced table must already be in @@ -734,7 +753,8 @@ dict_tree_build_node_ptr( /*=====================*/ /* out, own: node pointer */ dict_tree_t* tree, /* in: index tree */ - rec_t* rec, /* in: record for which to build node pointer */ + rec_t* rec, /* in: record for which to build node + pointer */ ulint page_no,/* in: page number to put in node pointer */ mem_heap_t* heap, /* in: memory heap where pointer created */ ulint level); /* in: level of rec in tree: 0 means leaf @@ -902,7 +922,7 @@ struct dict_sys_struct{ dict_table_t* sys_columns; /* SYS_COLUMNS table */ dict_table_t* sys_indexes; /* SYS_INDEXES table */ dict_table_t* sys_fields; /* SYS_FIELDS table */ -}; +}; #ifndef UNIV_NONINL #include "dict0dict.ic" diff --git a/innobase/include/dict0load.h b/innobase/include/dict0load.h index b60996a8dab..f7168a0f45f 100644 --- a/innobase/include/dict0load.h +++ b/innobase/include/dict0load.h @@ -15,6 +15,17 @@ Created 4/24/1996 Heikki Tuuri #include "ut0byte.h" /************************************************************************ +In a crash recovery we already have all the tablespace objects created. +This function compares the space id information in the InnoDB data dictionary +to what we already read with fil_load_single_table_tablespaces(). +In a normal startup we just scan the biggest space id, and store it to +fil_system. */ + +void +dict_check_tablespaces_or_store_max_id( +/*===================================*/ + ibool in_crash_recovery); /* in: are we doing a crash recovery */ +/************************************************************************ Finds the first table name in the given database. */ char* @@ -32,7 +43,10 @@ a foreign key references columns in this table. 
*/ dict_table_t* dict_load_table( /*============*/ - /* out: table, NULL if does not exist */ + /* out: table, NULL if does not exist; if the table is + stored in an .ibd file, but the file does not exist, + then we set the ibd_file_missing flag TRUE in the table + object we return */ char* name); /* in: table name */ /*************************************************************************** Loads a table object based on the table id. */ diff --git a/innobase/include/dict0mem.h b/innobase/include/dict0mem.h index 03dc913a7c9..b18e20a644a 100644 --- a/innobase/include/dict0mem.h +++ b/innobase/include/dict0mem.h @@ -309,6 +309,13 @@ struct dict_table_struct{ char* name; /* table name */ ulint space; /* space where the clustered index of the table is placed */ + ibool ibd_file_missing;/* TRUE if this is in a single-table + tablespace and the .ibd file is missing; then + we must return in ha_innodb.cc an error if the + user tries to query such an orphaned table */ + ibool tablespace_discarded;/* this flag is set TRUE when the + user calls DISCARD TABLESPACE on this table, + and reset to FALSE in IMPORT TABLESPACE */ hash_node_t name_hash; /* hash chain node */ hash_node_t id_hash; /* hash chain node */ ulint n_def; /* number of columns defined so far */ diff --git a/innobase/include/dyn0dyn.ic b/innobase/include/dyn0dyn.ic index 787615cae09..b6c4808398b 100644 --- a/innobase/include/dyn0dyn.ic +++ b/innobase/include/dyn0dyn.ic @@ -7,7 +7,7 @@ Created 2/5/1996 Heikki Tuuri *******************************************************/ #define DYN_BLOCK_MAGIC_N 375767 -#define DYN_BLOCK_FULL_FLAG 0x1000000 +#define DYN_BLOCK_FULL_FLAG 0x1000000UL /**************************************************************** Adds a new block to a dyn array. 
*/ diff --git a/innobase/include/fil0fil.h b/innobase/include/fil0fil.h index ad3149f0b36..c76c87395b4 100644 --- a/innobase/include/fil0fil.h +++ b/innobase/include/fil0fil.h @@ -60,10 +60,8 @@ extern fil_addr_t fil_addr_null; first page in a data file: the file has been flushed to disk at least up to this lsn */ -#define FIL_PAGE_ARCH_LOG_NO 34 /* this is only defined for the - first page in a data file: the latest - archived log file number when the - flush lsn above was written */ +#define FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID 34 /* starting from 4.1.x this + contains the space id of the page */ #define FIL_PAGE_DATA 38 /* start of the data on the page */ /* File page trailer */ @@ -86,50 +84,51 @@ extern fil_addr_t fil_addr_null; extern ulint fil_n_pending_log_flushes; extern ulint fil_n_pending_tablespace_flushes; + /*********************************************************************** -Reserves a right to open a single file. The right must be released with -fil_release_right_to_open. */ +Returns the version number of a tablespace, -1 if not found. */ -void -fil_reserve_right_to_open(void); -/*===========================*/ +ib_longlong +fil_space_get_version( +/*==================*/ + /* out: version number, -1 if the tablespace does not + exist in the memory cache */ + ulint id); /* in: space id */ /*********************************************************************** -Releases a right to open a single file. */ +Returns the latch of a file space. */ -void -fil_release_right_to_open(void); -/*===========================*/ -/************************************************************************ -Returns TRUE if file address is undefined. */ -ibool -fil_addr_is_null( -/*=============*/ - /* out: TRUE if undefined */ - fil_addr_t addr); /* in: address */ -/******************************************************************** -Initializes the file system of this module. 
*/ +rw_lock_t* +fil_space_get_latch( +/*================*/ + /* out: latch protecting storage allocation */ + ulint id); /* in: space id */ +/*********************************************************************** +Returns the type of a file space. */ -void -fil_init( -/*=====*/ - ulint max_n_open); /* in: max number of open files */ -/******************************************************************** -Initializes the ibuf indexes at a database start. This can be called -after the file space headers have been created and the dictionary system -has been initialized. */ +ulint +fil_space_get_type( +/*===============*/ + /* out: FIL_TABLESPACE or FIL_LOG */ + ulint id); /* in: space id */ +/*********************************************************************** +Returns the ibuf data of a file space. */ -void -fil_ibuf_init_at_db_start(void); -/*===========================*/ +ibuf_data_t* +fil_space_get_ibuf_data( +/*====================*/ + /* out: ibuf data for this space */ + ulint id); /* in: space id */ /*********************************************************************** -Creates a space object and puts it to the file system. */ +Appends a new file to the chain of files of a space. File must be closed. */ void -fil_space_create( -/*=============*/ - char* name, /* in: space name */ - ulint id, /* in: space id */ - ulint purpose);/* in: FIL_TABLESPACE, or FIL_LOG if log */ +fil_node_create( +/*============*/ + char* name, /* in: file name (file must be closed) */ + ulint size, /* in: file size in database blocks, rounded downwards + to an integer */ + ulint id, /* in: space id where to append */ + ibool is_raw);/* in: TRUE if a raw device or a raw disk partition */ /******************************************************************** Drops files from the start of a file space, so that its size is cut by the amount given. 
*/ @@ -141,48 +140,88 @@ fil_space_truncate_start( ulint trunc_len); /* in: truncate by this much; it is an error if this does not equal to the combined size of some initial files in the space */ -/************************************************************************** -Tries to extend a data file by the number of pages given. Any fractions of a -megabyte are ignored. */ +/*********************************************************************** +Creates a space memory object and puts it to the 'fil system' hash table. If +there is an error, prints an error message to the .err log. */ ibool -fil_extend_last_data_file( -/*======================*/ - /* out: TRUE if success, also if we run - out of disk space we may return TRUE */ - ulint* actual_increase,/* out: number of pages we were able to - extend, here the orginal size of the file and - the resulting size of the file are rounded - downwards to a full megabyte, and the - difference expressed in pages is returned */ - ulint size_increase); /* in: try to extend this many pages */ +fil_space_create( +/*=============*/ + /* out: TRUE if success */ + char* name, /* in: space name */ + ulint id, /* in: space id */ + ulint purpose);/* in: FIL_TABLESPACE, or FIL_LOG if log */ /*********************************************************************** -Frees a space object from a file system. Closes the files in the chain -but does not delete them. */ +Frees a space object from the tablespace memory cache. Closes the files in +the chain but does not delete them. */ -void +ibool fil_space_free( /*===========*/ + /* out: TRUE if success */ ulint id); /* in: space id */ /*********************************************************************** -Returns the latch of a file space. */ - -rw_lock_t* -fil_space_get_latch( -/*================*/ - /* out: latch protecting storage allocation */ - ulint id); /* in: space id */ -/*********************************************************************** -Returns the type of a file space. 
*/ +Returns the size of the space in pages. The tablespace must be cached in the +memory cache. */ ulint -fil_space_get_type( +fil_space_get_size( /*===============*/ - /* out: FIL_TABLESPACE or FIL_LOG */ + /* out: space size, 0 if space not found */ ulint id); /* in: space id */ +/*********************************************************************** +Checks if the pair space, page_no refers to an existing page in a tablespace +file space. The tablespace must be cached in the memory cache. */ + +ibool +fil_check_adress_in_tablespace( +/*===========================*/ + /* out: TRUE if the address is meaningful */ + ulint id, /* in: space id */ + ulint page_no);/* in: page number */ +/******************************************************************** +Initializes the tablespace memory cache. */ + +void +fil_init( +/*=====*/ + ulint max_n_open); /* in: max number of open files */ +/*********************************************************************** +Opens all log files and system tablespace data files. They stay open until the +database server shutdown. This should be called at a server startup after the +space objects for the log and the system tablespace have been created. The +purpose of this operation is to make sure we never run out of file descriptors +if we need to read from the insert buffer or to write to the log. */ + +void +fil_open_log_and_system_tablespace_files(void); +/*==========================================*/ +/*********************************************************************** +Closes all open files. There must not be any pending i/o's or not flushed +modifications in the files. */ + +void +fil_close_all_files(void); +/*=====================*/ +/*********************************************************************** +Sets the max tablespace id counter if the given number is bigger than the +previous value. 
*/ + +void +fil_set_max_space_id_if_bigger( +/*===========================*/ + ulint max_id);/* in: maximum known id */ +/******************************************************************** +Initializes the ibuf data structure for space 0 == the system tablespace. +This can be called after the file space headers have been created and the +dictionary system has been initialized. */ + +void +fil_ibuf_init_at_db_start(void); +/*===========================*/ /******************************************************************** Writes the flushed lsn and the latest archived log number to the page -header of the first page of each data file. */ +header of the first page of each data file in the system tablespace. */ ulint fil_write_flushed_lsn_to_data_files( @@ -205,48 +244,221 @@ fil_read_flushed_lsn_and_arch_log_no( dulint* max_flushed_lsn, /* in/out: */ ulint* max_arch_log_no); /* in/out: */ /*********************************************************************** -Returns the ibuf data of a file space. */ +Increments the count of pending insert buffer page merges, if space is not +being deleted. */ -ibuf_data_t* -fil_space_get_ibuf_data( -/*====================*/ - /* out: ibuf data for this space */ +ibool +fil_inc_pending_ibuf_merges( +/*========================*/ + /* out: TRUE if being deleted, and ibuf merges should + be skipped */ + ulint id); /* in: space id */ +/*********************************************************************** +Decrements the count of pending insert buffer page merges. */ + +void +fil_decr_pending_ibuf_merges( +/*========================*/ ulint id); /* in: space id */ /*********************************************************************** -Returns the size of the space in pages. */ +Deletes a single-table tablespace. The tablespace must be cached in the +memory cache. 
*/ + +ibool +fil_delete_tablespace( +/*==================*/ + /* out: TRUE if success */ + ulint id); /* in: space id */ +/*********************************************************************** +Discards a single-table tablespace. The tablespace must be cached in the +memory cache. Discarding is like deleting a tablespace, but +1) we do not drop the table from the data dictionary; +2) we remove all insert buffer entries for the tablespace immediately; in DROP +TABLE they are only removed gradually in the background; +3) when the user does IMPORT TABLESPACE, the tablespace will have the same id +as it originally had. */ + +ibool +fil_discard_tablespace( +/*===================*/ + /* out: TRUE if success */ + ulint id); /* in: space id */ +/*********************************************************************** +Renames a single-table tablespace. The tablespace must be cached in the +tablespace memory cache. */ + +ibool +fil_rename_tablespace( +/*==================*/ + /* out: TRUE if success */ + char* old_name, /* in: old table name in the standard + databasename/tablename format of InnoDB */ + ulint id, /* in: space id */ + char* new_name); /* in: new table name in the standard + databasename/tablename format of InnoDB */ +/*********************************************************************** +Creates a new single-table tablespace to a database directory of MySQL. +Database directories are under the 'datadir' of MySQL. The datadir is the +directory of a running mysqld program. We can refer to it by simply the +path '.'. 
*/ ulint -fil_space_get_size( -/*===============*/ - /* out: space size */ +fil_create_new_single_table_tablespace( +/*===================================*/ + /* out: DB_SUCCESS or error code */ + ulint* space_id, /* out: space id */ + char* tablename, /* in: the table name in the usual + databasename/tablename format of InnoDB */ + ulint size); /* in: the initial size of the tablespace file + in pages */ +/************************************************************************ +Tries to open a single-table tablespace and checks the space id is right in +it. If does not succeed, prints an error message to the .err log. This +function is used to open the tablespace when we load a table definition +to the dictionary cache. NOTE that we assume this operation is used under the +protection of the dictionary mutex, so that two users cannot race here. */ + +ibool +fil_open_single_table_tablespace( +/*=============================*/ + /* out: TRUE if success */ + ulint id, /* in: space id */ + char* name); /* in: table name in the databasename/tablename + format */ +/************************************************************************ +At the server startup, if we need crash recovery, scans the database +directories under the MySQL datadir, looking for .ibd files. Those files are +single-table tablespaces. We need to know the space id in each of them so that +we know into which file we should look to check the contents of a page stored +in the doublewrite buffer, also to know where to apply log records where the +space id is != 0. 
*/ + +ulint +fil_load_single_table_tablespaces(void); +/*===================================*/ + /* out: DB_SUCCESS or error number */ +/************************************************************************ +If we need crash recovery, and we have called +fil_load_single_table_tablespaces() and dict_load_single_table_tablespaces(), +we can call this function to print an error message of orphaned .ibd files +for which there is not a data dictionary entry with a matching table name +and space id. */ + +void +fil_print_orphaned_tablespaces(void); +/*================================*/ +/*********************************************************************** +Returns TRUE if a single-table tablespace does not exist in the memory cache, +or is being deleted there. */ + +ibool +fil_tablespace_deleted_or_being_deleted_in_mem( +/*===========================================*/ + /* out: TRUE if does not exist or is being + deleted */ + ulint id, /* in: space id */ + ib_longlong version);/* in: tablespace_version should be this; if + you pass -1 as the value of this, then this + parameter is ignored */ +/*********************************************************************** +Returns TRUE if a single-table tablespace exists in the memory cache. */ + +ibool +fil_tablespace_exists_in_mem( +/*=========================*/ + /* out: TRUE if exists */ + ulint id); /* in: space id */ /*********************************************************************** -Checks if the pair space, page_no refers to an existing page in a -tablespace file space. */ +Returns TRUE if a matching tablespace exists in the InnoDB tablespace memory +cache. Note that if we have not done a crash recovery at the database startup, +there may be many tablespaces which are not yet in the memory cache. 
*/ ibool -fil_check_adress_in_tablespace( +fil_space_for_table_exists_in_mem( +/*==============================*/ + /* out: TRUE if a matching tablespace + exists in the memory cache */ + ulint id, /* in: space id */ + char* name, /* in: table name in the standard + 'databasename/tablename' format */ + ibool mark_space, /* in: in crash recovery, at database startup + we mark all spaces which have an associated + table in the InnoDB data dictionary, so that + we can print a warning about orphaned + tablespaces */ + ibool print_error_if_does_not_exist); + /* in: print detailed error information to + the .err log if a matching tablespace is + not found from memory */ +/************************************************************************** +Tries to extend a data file by the number of pages given. Fractions of 1 MB +are ignored. The tablespace must be cached in the memory cache. */ + +ibool +fil_extend_last_data_file( +/*======================*/ + /* out: TRUE if success, also if we run + out of disk space we may return TRUE */ + ulint* actual_increase,/* out: number of pages we were able to + extend, here the original size of the file and + the resulting size of the file are rounded + downwards to a full megabyte, and the + difference expressed in pages is returned */ + ulint space_id, /* in: space id */ + ulint size, /* in: current size of the space in pages, as + stored in the fsp header */ + ulint size_increase); /* in: try to extend this many pages */ +/************************************************************************** +Tries to extend a data file so that it would accommodate the number of pages +given. The tablespace must be cached in the memory cache. 
*/ + +ibool +fil_extend_data_file_with_pages( +/*============================*/ + /* out: TRUE if success */ + ulint space_id, /* in: space id, must be != 0 */ + ulint size, /* in: current size of the space in pages, as + stored in the fsp header */ + ulint size_after_extend);/* in: desired size in pages after the + extension, should be less than 4 GB (this + function is primarily intended for increasing + the data file size from < 64 pages to up to + 64 pages) */ +/*********************************************************************** +Tries to reserve free extents in a file space. */ + +ibool +fil_space_reserve_free_extents( /*===========================*/ - /* out: TRUE if the address is meaningful */ - ulint id, /* in: space id */ - ulint page_no);/* in: page number */ + /* out: TRUE if succeed */ + ulint id, /* in: space id */ + ulint n_free_now, /* in: number of free extents now */ + ulint n_to_reserve); /* in: how many one wants to reserve */ /*********************************************************************** -Appends a new file to the chain of files of a space. -File must be closed. */ +Releases free extents in a file space. */ void -fil_node_create( -/*============*/ - char* name, /* in: file name (file must be closed) */ - ulint size, /* in: file size in database blocks, rounded downwards - to an integer */ - ulint id); /* in: space id where to append */ +fil_space_release_free_extents( +/*===========================*/ + ulint id, /* in: space id */ + ulint n_reserved); /* in: how many one reserved */ +/*********************************************************************** +Gets the number of reserved extents. If the database is silent, this number +should be zero. */ + +ulint +fil_space_get_n_reserved_extents( +/*=============================*/ + ulint id); /* in: space id */ /************************************************************************ Reads or writes data. This operation is asynchronous (aio). 
*/ -void +ulint fil_io( /*===*/ + /* out: DB_SUCCESS, or DB_TABLESPACE_DELETED + if we are trying to do i/o on a tablespace + which does not exist */ ulint type, /* in: OS_FILE_READ or OS_FILE_WRITE, ORed to OS_FILE_LOG, if a log i/o and ORed to OS_AIO_SIMULATED_WAKE_LATER @@ -262,9 +474,9 @@ fil_io( ulint byte_offset, /* in: remainder of offset in bytes; in aio this must be divisible by the OS block size */ - ulint len, /* in: how many bytes to read; this must - not cross a file boundary; in aio this must - be a block size multiple */ + ulint len, /* in: how many bytes to read or write; this + must not cross a file boundary; in aio this + must be a block size multiple */ void* buf, /* in/out: buffer where to store read data or from where to write; in aio this must be appropriately aligned */ @@ -272,12 +484,15 @@ fil_io( aio used, else ignored */ /************************************************************************ Reads data from a space to a buffer. Remember that the possible incomplete -blocks at the end of a file are ignored: they are not taken into account when +blocks at the end of file are ignored: they are not taken into account when calculating the byte offset within a space. */ -void +ulint fil_read( /*=====*/ + /* out: DB_SUCCESS, or DB_TABLESPACE_DELETED + if we are trying to do i/o on a tablespace + which does not exist */ ibool sync, /* in: TRUE if synchronous aio is desired */ ulint space_id, /* in: space id */ ulint block_offset, /* in: offset in number of blocks */ @@ -292,12 +507,15 @@ fil_read( aio used, else ignored */ /************************************************************************ Writes data to a space from a buffer. Remember that the possible incomplete -blocks at the end of a file are ignored: they are not taken into account when +blocks at the end of file are ignored: they are not taken into account when calculating the byte offset within a space. 
*/ -void +ulint fil_write( /*======*/ + /* out: DB_SUCCESS, or DB_TABLESPACE_DELETED + if we are trying to do i/o on a tablespace + which does not exist */ ibool sync, /* in: TRUE if synchronous aio is desired */ ulint space_id, /* in: space id */ ulint block_offset, /* in: offset in number of blocks */ @@ -322,7 +540,8 @@ fil_aio_wait( ulint segment); /* in: the number of the segment in the aio array to wait for */ /************************************************************************** -Flushes to disk possible writes cached by the OS. */ +Flushes to disk possible writes cached by the OS. If the space does not exist +or is being dropped, does not do anything. */ void fil_flush( @@ -338,13 +557,21 @@ fil_flush_file_spaces( /*==================*/ ulint purpose); /* in: FIL_TABLESPACE, FIL_LOG */ /********************************************************************** -Checks the consistency of the file system. */ +Checks the consistency of the tablespace cache. */ ibool fil_validate(void); /*==============*/ /* out: TRUE if ok */ /************************************************************************ +Returns TRUE if file address is undefined. */ + +ibool +fil_addr_is_null( +/*=============*/ + /* out: TRUE if undefined */ + fil_addr_t addr); /* in: address */ +/************************************************************************ Accessor functions for a file page */ ulint @@ -368,32 +595,7 @@ fil_page_get_type( /* out: type; NOTE that if the type has not been written to page, the return value not defined */ byte* page); /* in: file page */ -/*********************************************************************** -Tries to reserve free extents in a file space. 
*/ - -ibool -fil_space_reserve_free_extents( -/*===========================*/ - /* out: TRUE if succeed */ - ulint id, /* in: space id */ - ulint n_free_now, /* in: number of free extents now */ - ulint n_to_reserve); /* in: how many one wants to reserve */ -/*********************************************************************** -Releases free extents in a file space. */ -void -fil_space_release_free_extents( -/*===========================*/ - ulint id, /* in: space id */ - ulint n_reserved); /* in: how many one reserved */ -/*********************************************************************** -Gets the number of reserved extents. If the database is silent, this number -should be zero. */ - -ulint -fil_space_get_n_reserved_extents( -/*=============================*/ - ulint id); /* in: space id */ typedef struct fil_space_struct fil_space_t; diff --git a/innobase/include/fsp0fsp.h b/innobase/include/fsp0fsp.h index 3494f336b1e..127e01ef59f 100644 --- a/innobase/include/fsp0fsp.h +++ b/innobase/include/fsp0fsp.h @@ -55,7 +55,7 @@ ulint fsp_header_get_free_limit( /*======================*/ /* out: free limit in megabytes */ - ulint space); /* in: space id */ + ulint space); /* in: space id, must be 0 */ /************************************************************************** Gets the size of the tablespace from the tablespace header. If we do not have an auto-extending data file, this should be equal to the size of the @@ -65,9 +65,27 @@ ulint fsp_header_get_tablespace_size( /*===========================*/ /* out: size in pages */ - ulint space); /* in: space id */ + ulint space); /* in: space id, must be 0 */ +/************************************************************************** +Reads the space id from the first page of a tablespace. 
*/ + +ulint +fsp_header_get_space_id( +/*====================*/ + /* out: space id, ULINT UNDEFINED if error */ + page_t* page); /* in: first page of a tablespace */ /************************************************************************** -Initializes the space header of a new created space. */ +Writes the space id to a tablespace header. This function is used past the +buffer pool when we in fil0fil.c create a new single-table tablespace. */ + +void +fsp_header_write_space_id( +/*======================*/ + page_t* page, /* in: first page in the space */ + ulint space_id); /* in: space id */ +/************************************************************************** +Initializes the space header of a new created space and creates also the +insert buffer tree root if space == 0. */ void fsp_header_init( @@ -117,12 +135,12 @@ fseg_create_general( will belong to the created segment */ ulint byte_offset, /* in: byte offset of the created segment header on the page */ - ibool has_done_reservation, /* in: TRUE if the caller has - already done the reservation for the pages - with fsp_reserve_free_extents (at least 2 extents: - one for the inode and, then there other for the - segment) is no need to do the check for this - individual operation */ + ibool has_done_reservation, /* in: TRUE if the caller has already + done the reservation for the pages with + fsp_reserve_free_extents (at least 2 extents: one for + the inode and the other for the segment) then there is + no need to do the check for this individual + operation */ mtr_t* mtr); /* in: mtr */ /************************************************************************** Calculates the number of pages reserved by a segment, and how many pages are @@ -194,12 +212,21 @@ two types of allocation: when space is scarce, FSP_NORMAL allocations will not succeed, but the latter two allocations will succeed, if possible. 
The purpose is to avoid dead end where the database is full but the user cannot free any space because these freeing operations temporarily -reserve some space. */ +reserve some space. + +Single-table tablespaces whose size is < 32 pages are a special case. In this +function we would liberally reserve several 64 page extents for every page +split or merge in a B-tree. But we do not want to waste disk space if the table +only occupies < 32 pages. That is why we apply different rules in that special +case, just ensuring that there are 3 free pages available. */ ibool fsp_reserve_free_extents( /*=====================*/ /* out: TRUE if we were able to make the reservation */ + ulint* n_reserved,/* out: number of extents actually reserved; if we + return TRUE and the tablespace size is < 64 pages, + then this can be 0, otherwise it is n_ext */ ulint space, /* in: space id */ ulint n_ext, /* in: number of extents to reserve */ ulint alloc_type,/* in: FSP_NORMAL, FSP_UNDO, or FSP_CLEANING */ @@ -337,8 +364,8 @@ pages: */ #define FSP_FIRST_INODE_PAGE_NO 2 #define FSP_IBUF_HEADER_PAGE_NO 3 #define FSP_IBUF_TREE_ROOT_PAGE_NO 4 - /* The ibuf tree root page number in each - tablespace; its fseg inode is on the page + /* The ibuf tree root page number in + tablespace 0; its fseg inode is on the page number FSP_FIRST_INODE_PAGE_NO */ #define FSP_TRX_SYS_PAGE_NO 5 #define FSP_FIRST_RSEG_PAGE_NO 6 diff --git a/innobase/include/fut0lst.ic b/innobase/include/fut0lst.ic index d2e79cf7640..c0d61833b48 100644 --- a/innobase/include/fut0lst.ic +++ b/innobase/include/fut0lst.ic @@ -23,7 +23,7 @@ Created 11/28/1995 Heikki Tuuri #define FLST_FIRST 4 /* 6-byte address of the first element of the list; undefined if empty list */ #define FLST_LAST (4 + FIL_ADDR_SIZE) /* 6-byte address of the - first element of the list; undefined + last element of the list; undefined if empty list */ /************************************************************************ diff --git 
a/innobase/include/ibuf0ibuf.h b/innobase/include/ibuf0ibuf.h index a290e90e4db..bf03b06bd28 100644 --- a/innobase/include/ibuf0ibuf.h +++ b/innobase/include/ibuf0ibuf.h @@ -40,6 +40,13 @@ void ibuf_init_at_db_start(void); /*=======================*/ /************************************************************************* +Reads the biggest tablespace id from the high end of the insert buffer +tree and updates the counter in fil_system. */ + +void +ibuf_update_max_tablespace_id(void); +/*===============================*/ +/************************************************************************* Initializes an ibuf bitmap page. */ void @@ -207,8 +214,8 @@ When an index page is read from a disk to the buffer pool, this function inserts to the page the possible index entries buffered in the insert buffer. The entries are deleted from the insert buffer. If the page is not read, but created in the buffer pool, this function deletes its buffered entries from -the insert buffer; note that there can exist entries if the page belonged to -an index which was dropped. */ +the insert buffer; there can exist entries for such a page if the page +belonged to an index which subsequently was dropped. */ void ibuf_merge_or_delete_for_page( @@ -216,7 +223,21 @@ ibuf_merge_or_delete_for_page( page_t* page, /* in: if page has been read from disk, pointer to the page x-latched, else NULL */ ulint space, /* in: space id of the index page */ - ulint page_no);/* in: page number of the index page */ + ulint page_no,/* in: page number of the index page */ + ibool update_ibuf_bitmap);/* in: normally this is set to TRUE, but if + we have deleted or are deleting the tablespace, then we + naturally do not want to update a non-existent bitmap + page */ +/************************************************************************* +Deletes all entries in the insert buffer for a given space id. This is used +in DISCARD TABLESPACE and IMPORT TABLESPACE. 
+NOTE: this does not update the page free bitmaps in the space. The space will +become CORRUPT when you call this function! */ + +void +ibuf_delete_for_discarded_space( +/*============================*/ + ulint space); /* in: space id */ /************************************************************************* Contracts insert buffer trees by reading pages to the buffer pool. */ @@ -266,6 +287,13 @@ ibuf_count_get( ulint space, /* in: space id */ ulint page_no);/* in: page number */ /********************************************************************** +Looks if the insert buffer is empty. */ + +ibool +ibuf_is_empty(void); +/*===============*/ + /* out: TRUE if empty */ +/********************************************************************** Prints info of ibuf. */ void diff --git a/innobase/include/lock0lock.h b/innobase/include/lock0lock.h index 5608ba020b7..49f4597b30c 100644 --- a/innobase/include/lock0lock.h +++ b/innobase/include/lock0lock.h @@ -534,12 +534,12 @@ extern lock_sys_t* lock_sys; #define LOCK_X 5 /* exclusive */ #define LOCK_AUTO_INC 6 /* locks the auto-inc counter of a table in an exclusive mode */ -#define LOCK_MODE_MASK 0xF /* mask used to extract mode from the +#define LOCK_MODE_MASK 0xFUL /* mask used to extract mode from the type_mode field in a lock */ /* Lock types */ #define LOCK_TABLE 16 /* these type values should be so high that */ #define LOCK_REC 32 /* they can be ORed to the lock mode */ -#define LOCK_TYPE_MASK 0xF0 /* mask used to extract lock type from the +#define LOCK_TYPE_MASK 0xF0UL /* mask used to extract lock type from the type_mode field in a lock */ /* Waiting lock flag */ #define LOCK_WAIT 256 /* this wait bit should be so high that diff --git a/innobase/include/log0log.h b/innobase/include/log0log.h index 24ec28a56e6..dc44429d636 100644 --- a/innobase/include/log0log.h +++ b/innobase/include/log0log.h @@ -519,9 +519,9 @@ Peeks the current lsn. 
*/ ibool log_peek_lsn( /*=========*/ - /* out: TRUE if success, FALSE if could not get the - log system mutex */ - dulint* lsn); /* out: if returns TRUE, current lsn is here */ + /* out: TRUE if success, FALSE if could not get the + log system mutex */ + dulint* lsn); /* out: if returns TRUE, current lsn is here */ /************************************************************************** Refreshes the statistics used to print per-second averages. */ @@ -549,7 +549,7 @@ extern log_t* log_sys; highest bit is set to 1 if this is the first log block in a log flush write segment */ -#define LOG_BLOCK_FLUSH_BIT_MASK 0x80000000 +#define LOG_BLOCK_FLUSH_BIT_MASK 0x80000000UL /* mask used to get the highest bit in the preceding field */ #define LOG_BLOCK_HDR_DATA_LEN 4 /* number of bytes of log written to @@ -600,12 +600,18 @@ extern log_t* log_sys; #define LOG_CHECKPOINT_CHECKSUM_1 LOG_CHECKPOINT_ARRAY_END #define LOG_CHECKPOINT_CHECKSUM_2 (4 + LOG_CHECKPOINT_ARRAY_END) #define LOG_CHECKPOINT_FSP_FREE_LIMIT (8 + LOG_CHECKPOINT_ARRAY_END) - /* current fsp free limit in the - tablespace, in units of one megabyte */ + /* current fsp free limit in + tablespace 0, in units of one + megabyte; this information is only used + by ibbackup to decide if it can + truncate unused ends of + non-auto-extending data files in space + 0 */ #define LOG_CHECKPOINT_FSP_MAGIC_N (12 + LOG_CHECKPOINT_ARRAY_END) /* this magic number tells if the checkpoint contains the above field: - the field was added to InnoDB-3.23.50 */ + the field was added to + InnoDB-3.23.50 */ #define LOG_CHECKPOINT_SIZE (16 + LOG_CHECKPOINT_ARRAY_END) #define LOG_CHECKPOINT_FSP_MAGIC_N_VAL 1441231243 @@ -794,11 +800,11 @@ struct log_struct{ called */ /* Fields involved in checkpoints */ - ulint log_group_capacity; /* capacity of the log group; if - the checkpoint age exceeds this, it is - a serious error because it is possible - we will then overwrite log and spoil - crash recovery */ + ulint log_group_capacity; /* 
capacity of the log group; if + the checkpoint age exceeds this, it is + a serious error because it is possible + we will then overwrite log and spoil + crash recovery */ ulint max_modified_age_async; /* when this recommended value for lsn - buf_pool_get_oldest_modification() @@ -840,7 +846,8 @@ struct log_struct{ /* Fields involved in archiving */ ulint archiving_state;/* LOG_ARCH_ON, LOG_ARCH_STOPPING LOG_ARCH_STOPPED, LOG_ARCH_OFF */ - dulint archived_lsn; /* archiving has advanced to this lsn */ + dulint archived_lsn; /* archiving has advanced to this + lsn */ ulint max_archived_lsn_age_async; /* recommended maximum age of archived_lsn, before we start diff --git a/innobase/include/log0log.ic b/innobase/include/log0log.ic index 8de239df0bd..7ae7e859032 100644 --- a/innobase/include/log0log.ic +++ b/innobase/include/log0log.ic @@ -182,9 +182,9 @@ log_block_convert_lsn_to_no( no = ut_dulint_get_low(lsn) / OS_FILE_LOG_BLOCK_SIZE; no += (ut_dulint_get_high(lsn) % OS_FILE_LOG_BLOCK_SIZE) - * 2 * (0x80000000 / OS_FILE_LOG_BLOCK_SIZE); + * 2 * (0x80000000UL / OS_FILE_LOG_BLOCK_SIZE); - no = no & 0x3FFFFFFF; + no = no & 0x3FFFFFFFUL; return(no + 1); } @@ -206,7 +206,7 @@ log_block_calc_checksum( sh = 0; for (i = 0; i < OS_FILE_LOG_BLOCK_SIZE - LOG_BLOCK_TRL_SIZE; i++) { - sum = sum & 0x7FFFFFFF; + sum = sum & 0x7FFFFFFFUL; sum += (((ulint)(*(block + i))) << sh) + (ulint)(*(block + i)); sh++; if (sh > 24) { @@ -346,7 +346,7 @@ log_reserve_and_write_fast( #ifdef UNIV_LOG_DEBUG log_check_log_recs(log->buf + log->old_buf_free, - log->buf_free - log->old_buf_free, log->old_lsn); + log->buf_free - log->old_buf_free, log->old_lsn); #endif return(lsn); } diff --git a/innobase/include/log0recv.h b/innobase/include/log0recv.h index 7b27ee34541..0a4f56816a1 100644 --- a/innobase/include/log0recv.h +++ b/innobase/include/log0recv.h @@ -25,8 +25,8 @@ recv_read_cp_info_for_backup( byte* hdr, /* in: buffer containing the log group header */ dulint* lsn, /* out: checkpoint lsn */ 
ulint* offset, /* out: checkpoint offset in the log group */ - ulint* fsp_limit,/* out: fsp limit, 1000000000 if the database - is running with < version 3.23.50 of InnoDB */ + ulint* fsp_limit,/* out: fsp limit of space 0, 1000000000 if the + database is running with < version 3.23.50 of InnoDB */ dulint* cp_no, /* out: checkpoint number */ dulint* first_header_lsn); /* out: lsn of of the start of the first log file */ @@ -334,7 +334,6 @@ extern ibool recv_no_ibuf_operations; extern ibool recv_needed_recovery; extern ibool recv_lsn_checks_on; - extern ibool recv_is_making_a_backup; extern ulint recv_max_parsed_page_no; diff --git a/innobase/include/mach0data.ic b/innobase/include/mach0data.ic index 1d6badd035b..0934c27d9f4 100644 --- a/innobase/include/mach0data.ic +++ b/innobase/include/mach0data.ic @@ -17,7 +17,7 @@ mach_write_to_1( ulint n) /* in: ulint integer to be stored, >= 0, < 256 */ { ut_ad(b); - ut_ad(n <= 0xFF); + ut_ad(n <= 0xFFUL); b[0] = (byte)n; } @@ -46,7 +46,7 @@ mach_write_to_2( ulint n) /* in: ulint integer to be stored */ { ut_ad(b); - ut_ad(n <= 0xFFFF); + ut_ad(n <= 0xFFFFUL); b[0] = (byte)(n >> 8); b[1] = (byte)(n); @@ -79,7 +79,7 @@ mach_write_to_3( ulint n) /* in: ulint integer to be stored */ { ut_ad(b); - ut_ad(n <= 0xFFFFFF); + ut_ad(n <= 0xFFFFFFUL); b[0] = (byte)(n >> 16); b[1] = (byte)(n >> 8); @@ -222,20 +222,20 @@ mach_write_compressed( { ut_ad(b); - if (n < 0x80) { + if (n < 0x80UL) { mach_write_to_1(b, n); return(1); - } else if (n < 0x4000) { - mach_write_to_2(b, n | 0x8000); + } else if (n < 0x4000UL) { + mach_write_to_2(b, n | 0x8000UL); return(2); - } else if (n < 0x200000) { - mach_write_to_3(b, n | 0xC00000); + } else if (n < 0x200000UL) { + mach_write_to_3(b, n | 0xC00000UL); return(3); - } else if (n < 0x10000000) { - mach_write_to_4(b, n | 0xE0000000); + } else if (n < 0x10000000UL) { + mach_write_to_4(b, n | 0xE0000000UL); return(4); } else { - mach_write_to_1(b, 0xF0); + mach_write_to_1(b, 0xF0UL); mach_write_to_4(b + 
1, n); return(5); } @@ -250,13 +250,13 @@ mach_get_compressed_size( /* out: compressed size in bytes */ ulint n) /* in: ulint integer (< 2^32) to be stored */ { - if (n < 0x80) { + if (n < 0x80UL) { return(1); - } else if (n < 0x4000) { + } else if (n < 0x4000UL) { return(2); - } else if (n < 0x200000) { + } else if (n < 0x200000UL) { return(3); - } else if (n < 0x10000000) { + } else if (n < 0x10000000UL) { return(4); } else { return(5); @@ -278,16 +278,16 @@ mach_read_compressed( flag = mach_read_from_1(b); - if (flag < 0x80) { + if (flag < 0x80UL) { return(flag); - } else if (flag < 0xC0) { - return(mach_read_from_2(b) & 0x7FFF); - } else if (flag < 0xE0) { - return(mach_read_from_3(b) & 0x3FFFFF); - } else if (flag < 0xF0) { - return(mach_read_from_4(b) & 0x1FFFFFFF); + } else if (flag < 0xC0UL) { + return(mach_read_from_2(b) & 0x7FFFUL); + } else if (flag < 0xE0UL) { + return(mach_read_from_3(b) & 0x3FFFFFUL); + } else if (flag < 0xF0UL) { + return(mach_read_from_4(b) & 0x1FFFFFFFUL); } else { - ut_ad(flag == 0xF0); + ut_ad(flag == 0xF0UL); return(mach_read_from_4(b + 1)); } } @@ -477,7 +477,7 @@ mach_dulint_write_much_compressed( return(mach_write_compressed(b, ut_dulint_get_low(n))); } - *b = 0xFF; + *b = (byte)0xFF; size = 1 + mach_write_compressed(b + 1, ut_dulint_get_high(n)); size += mach_write_compressed(b + size, ut_dulint_get_low(n)); @@ -517,7 +517,7 @@ mach_dulint_read_much_compressed( ut_ad(b); - if (*b != 0xFF) { + if (*b != (byte)0xFF) { high = 0; size = 0; } else { @@ -717,11 +717,10 @@ mach_write_to_2_little_endian( { ut_ad(n < 256 * 256); - *dest = (byte)(n & 0xFF); + *dest = (byte)(n & 0xFFUL); n = n >> 8; dest++; - *dest = (byte)(n & 0xFF); + *dest = (byte)(n & 0xFFUL); } - diff --git a/innobase/include/mtr0log.ic b/innobase/include/mtr0log.ic index 0598f1a9536..c2150660794 100644 --- a/innobase/include/mtr0log.ic +++ b/innobase/include/mtr0log.ic @@ -163,13 +163,6 @@ mlog_write_initial_log_record_fast( space = buf_block_get_space(block); 
offset = buf_block_get_page_no(block); - if (space != 0 || offset > 0x8FFFFFFF) { - fprintf(stderr, - "InnoDB: error: buffer page pointer %lx has nonsensical space id %lu\n" - "InnoDB: or page no %lu\n", (ulint)ptr, space, offset); - ut_a(0); - } - mach_write_to_1(log_ptr, type); log_ptr++; log_ptr += mach_write_compressed(log_ptr, space); diff --git a/innobase/include/os0file.h b/innobase/include/os0file.h index 5c52f0e92bf..9f1c18829c4 100644 --- a/innobase/include/os0file.h +++ b/innobase/include/os0file.h @@ -11,9 +11,11 @@ Created 10/21/1995 Heikki Tuuri #include "univ.i" +#ifndef __WIN__ +#include <dirent.h> +#include <sys/stat.h> +#endif -/* If the following is set to TRUE, we do not call os_file_flush in every -os_file_write */ extern ibool os_do_not_call_flush_at_each_write; extern ibool os_has_said_disk_full; extern ibool os_aio_print_debug; @@ -57,6 +59,7 @@ log. */ #define OS_FILE_OPEN 51 #define OS_FILE_CREATE 52 #define OS_FILE_OVERWRITE 53 +#define OS_FILE_OPEN_RAW 54 #define OS_FILE_READ_ONLY 333 #define OS_FILE_READ_WRITE 444 @@ -117,6 +120,36 @@ extern ulint os_n_file_reads; extern ulint os_n_file_writes; extern ulint os_n_fsyncs; +/* File types for directory entry data type */ + +enum os_file_type_enum{ + OS_FILE_TYPE_UNKNOWN = 0, + OS_FILE_TYPE_FILE, /* regular file */ + OS_FILE_TYPE_DIR, /* directory */ + OS_FILE_TYPE_LINK /* symbolic link */ +}; +typedef enum os_file_type_enum os_file_type_t; + +/* Maximum path string length in bytes when referring to tables with in the +'./databasename/tablename.ibd' path format; we can allocate at least 2 buffers +of this size from the thread stack; that is why this should not be made much +bigger than 4000 bytes */ +#define OS_FILE_MAX_PATH 4000 + +/* Struct used in fetching information of a file in a directory */ +typedef struct os_file_stat_struct os_file_stat_t; +struct os_file_stat_struct{ + char name[OS_FILE_MAX_PATH]; /* path to a file */ + os_file_type_t type; /* file type */ + ib_longlong size; /* 
file size */ +}; + +#ifdef __WIN__ +typedef HANDLE os_file_dir_t; /* directory stream */ +#else +typedef DIR* os_file_dir_t; /* directory stream */ +#endif + /*************************************************************************** Gets the operating system version. Currently works only on Windows. */ @@ -130,6 +163,42 @@ Creates the seek mutexes used in positioned reads and writes. */ void os_io_init_simple(void); /*===================*/ +/*************************************************************************** +The os_file_opendir() function opens a directory stream corresponding to the +directory named by the dirname argument. The directory stream is positioned +at the first entry. In both Unix and Windows we automatically skip the '.' +and '..' items at the start of the directory listing. */ + +os_file_dir_t +os_file_opendir( +/*============*/ + /* out: directory stream, NULL if error */ + char* dirname, /* in: directory name; it must not contain + a trailing '\' or '/' */ + ibool error_is_fatal);/* in: TRUE if we should treat an error as a + fatal error; if we try to open symlinks then + we do not wish a fatal error if it happens + not to be a directory */ +/*************************************************************************** +Closes a directory stream. */ + +int +os_file_closedir( +/*=============*/ + /* out: 0 if success, -1 if failure */ + os_file_dir_t dir); /* in: directory stream */ +/*************************************************************************** +This function returns information of the next file in the directory. We jump +over the '.' and '..' entries in the directory. 
*/ + +int +os_file_readdir_next_file( +/*======================*/ + /* out: 0 if ok, -1 if error, 1 if at the end + of the directory */ + char* dirname,/* in: directory name or path */ + os_file_dir_t dir, /* in: directory stream */ + os_file_stat_t* info); /* in/out: buffer where the info is returned */ /******************************************************************** A simple function to open or create a file. */ @@ -173,7 +242,9 @@ os_file_create( ulint create_mode,/* in: OS_FILE_OPEN if an existing file is opened (if does not exist, error), or OS_FILE_CREATE if a new file is created (if exists, error), OS_FILE_OVERWRITE - if a new file is created or an old overwritten */ + if a new file is created or an old overwritten; + OS_FILE_OPEN_RAW, if a raw device or disk partition + should be opened */ ulint purpose,/* in: OS_FILE_AIO, if asynchronous, non-buffered i/o is desired, OS_FILE_NORMAL, if any normal file; NOTE that it also depends on type, os_aio_.. and srv_.. @@ -183,6 +254,25 @@ os_file_create( ulint type, /* in: OS_DATA_FILE or OS_LOG_FILE */ ibool* success);/* out: TRUE if succeed, FALSE if error */ /*************************************************************************** +Deletes a file. The file has to be closed before calling this. */ + +ibool +os_file_delete( +/*===========*/ + /* out: TRUE if success */ + char* name); /* in: file path as a null-terminated string */ +/*************************************************************************** +Renames a file (can also move it to another directory). It is safest that the +file is closed before calling this function. */ + +ibool +os_file_rename( +/*===========*/ + /* out: TRUE if success */ + char* oldpath, /* in: old file path as a null-terminated + string */ + char* newpath); /* in: new file path */ +/*************************************************************************** Closes a file handle. In case of error, error number can be retrieved with os_file_get_last_error. 
*/ @@ -238,9 +328,12 @@ overwrite the error number). If the number is not known to this program, the OS error number + 100 is returned. */ ulint -os_file_get_last_error(void); -/*========================*/ - /* out: error number, or OS error number + 100 */ +os_file_get_last_error( +/*===================*/ + /* out: error number, or OS error + number + 100 */ + ibool report_all_errors); /* in: TRUE if we want an error message + printed of all errors */ /*********************************************************************** Requests a synchronous read operation. */ diff --git a/innobase/include/page0page.h b/innobase/include/page0page.h index 04f771c3abd..969313614e3 100644 --- a/innobase/include/page0page.h +++ b/innobase/include/page0page.h @@ -596,7 +596,8 @@ byte* page_parse_delete_rec_list( /*=======================*/ /* out: end of log record or NULL */ - byte type, /* in: MLOG_LIST_END_DELETE or MLOG_LIST_START_DELETE */ + byte type, /* in: MLOG_LIST_END_DELETE or + MLOG_LIST_START_DELETE */ byte* ptr, /* in: buffer */ byte* end_ptr,/* in: buffer end */ page_t* page, /* in: page or NULL */ diff --git a/innobase/include/que0types.h b/innobase/include/que0types.h index c7ce09db40b..e59c2313a5a 100644 --- a/innobase/include/que0types.h +++ b/innobase/include/que0types.h @@ -36,7 +36,8 @@ struct que_common_struct{ if the buffer has been allocated dynamically: if this field is != 0, and the node is a symbol node or a function node, then we - have to free the data field in val explicitly */ + have to free the data field in val + explicitly */ }; #endif diff --git a/innobase/include/rem0rec.h b/innobase/include/rem0rec.h index b28f39925c1..ebdd3c1ac81 100644 --- a/innobase/include/rem0rec.h +++ b/innobase/include/rem0rec.h @@ -21,7 +21,7 @@ Created 5/30/1994 Heikki Tuuri /* Flag denoting the predefined minimum record: this bit is ORed in the 4 info bits of a record */ -#define REC_INFO_MIN_REC_FLAG 0x10 +#define REC_INFO_MIN_REC_FLAG 0x10UL /* Number of extra bytes 
in a record, in addition to the data and the offsets */ @@ -406,8 +406,8 @@ rec_sprintf( /* Maximum lengths for the data in a physical record if the offsets are given in one byte (resp. two byte) format. */ -#define REC_1BYTE_OFFS_LIMIT 0x7F -#define REC_2BYTE_OFFS_LIMIT 0x7FFF +#define REC_1BYTE_OFFS_LIMIT 0x7FUL +#define REC_2BYTE_OFFS_LIMIT 0x7FFFUL /* The data size of record must be smaller than this because we reserve two upmost bits in a two byte offset for special purposes */ diff --git a/innobase/include/rem0rec.ic b/innobase/include/rem0rec.ic index 9dfd4faeec8..f4acd8547db 100644 --- a/innobase/include/rem0rec.ic +++ b/innobase/include/rem0rec.ic @@ -29,41 +29,41 @@ significant bytes and bits are written below less significant. and the shift needed to obtain each bit-field of the record. */ #define REC_NEXT 2 -#define REC_NEXT_MASK 0xFFFF +#define REC_NEXT_MASK 0xFFFFUL #define REC_NEXT_SHIFT 0 #define REC_SHORT 3 /* This is single byte bit-field */ -#define REC_SHORT_MASK 0x1 +#define REC_SHORT_MASK 0x1UL #define REC_SHORT_SHIFT 0 #define REC_N_FIELDS 4 -#define REC_N_FIELDS_MASK 0x7FE +#define REC_N_FIELDS_MASK 0x7FEUL #define REC_N_FIELDS_SHIFT 1 #define REC_HEAP_NO 5 -#define REC_HEAP_NO_MASK 0xFFF8 +#define REC_HEAP_NO_MASK 0xFFF8UL #define REC_HEAP_NO_SHIFT 3 #define REC_N_OWNED 6 /* This is single byte bit-field */ -#define REC_N_OWNED_MASK 0xF +#define REC_N_OWNED_MASK 0xFUL #define REC_N_OWNED_SHIFT 0 -#define REC_INFO_BITS_MASK 0xF0 +#define REC_INFO_BITS_MASK 0xF0UL #define REC_INFO_BITS_SHIFT 0 /* The deleted flag in info bits */ -#define REC_INFO_DELETED_FLAG 0x20 /* when bit is set to 1, it means the +#define REC_INFO_DELETED_FLAG 0x20UL /* when bit is set to 1, it means the record has been delete marked */ /* The following masks are used to filter the SQL null bit from one-byte and two-byte offsets */ -#define REC_1BYTE_SQL_NULL_MASK 0x80 -#define REC_2BYTE_SQL_NULL_MASK 0x8000 +#define REC_1BYTE_SQL_NULL_MASK 0x80UL +#define 
REC_2BYTE_SQL_NULL_MASK 0x8000UL /* In a 2-byte offset the second most significant bit denotes a field stored to another page: */ -#define REC_2BYTE_EXTERN_MASK 0x4000 +#define REC_2BYTE_EXTERN_MASK 0x4000UL /**************************************************************** Return field length or UNIV_SQL_NULL. */ @@ -133,7 +133,7 @@ rec_set_bit_field_1( ut_ad(rec); ut_ad(offs <= REC_N_EXTRA_BYTES); ut_ad(mask); - ut_ad(mask <= 0xFF); + ut_ad(mask <= 0xFFUL); ut_ad(((mask >> shift) << shift) == mask); ut_ad(((val << shift) & mask) == (val << shift)); @@ -172,8 +172,8 @@ rec_set_bit_field_2( { ut_ad(rec); ut_ad(offs <= REC_N_EXTRA_BYTES); - ut_ad(mask > 0xFF); - ut_ad(mask <= 0xFFFF); + ut_ad(mask > 0xFFUL); + ut_ad(mask <= 0xFFFFUL); ut_ad((mask >> shift) & 1); ut_ad(0 == ((mask >> shift) & ((mask >> shift) + 1))); ut_ad(((mask >> shift) << shift) == mask); @@ -188,8 +188,8 @@ rec_set_bit_field_2( + (REC_N_FIELDS_MASK << (8 * (REC_N_FIELDS - 4))) + (REC_HEAP_NO_MASK << (8 * (REC_HEAP_NO - 4))) + (REC_N_OWNED_MASK << (8 * (REC_N_OWNED - 3))) - + (REC_INFO_BITS_MASK << (8 * (REC_INFO_BITS - 3)))); - if (m != ut_dbg_zero + 0xFFFFFFFF) { + + (REC_INFO_BITS_MASK << (8 * (REC_INFO_BITS - 3)))); + if (m != ut_dbg_zero + 0xFFFFFFFFUL) { printf("Sum of masks %lx\n", m); ut_error; } diff --git a/innobase/include/srv0srv.h b/innobase/include/srv0srv.h index 87643e87a68..094b95e68d3 100644 --- a/innobase/include/srv0srv.h +++ b/innobase/include/srv0srv.h @@ -17,6 +17,8 @@ Created 10/10/1995 Heikki Tuuri #include "que0types.h" #include "trx0types.h" +extern char* srv_main_thread_op_info; + /* Buffer which can be used in printing fatal error messages */ extern char srv_fatal_errbuf[]; @@ -36,6 +38,8 @@ extern ibool srv_lower_case_table_names; extern char* srv_data_home; extern char* srv_arch_dir; +extern ibool srv_file_per_table; + extern ulint srv_n_data_files; extern char** srv_data_file_names; extern ulint* srv_data_file_sizes; @@ -76,6 +80,8 @@ extern char* 
srv_file_flush_method_str; extern ulint srv_unix_file_flush_method; extern ulint srv_win_file_flush_method; +extern ulint srv_max_n_open_files; + extern ulint srv_max_dirty_pages_pct; extern ulint srv_force_recovery; diff --git a/innobase/include/srv0start.h b/innobase/include/srv0start.h index 8d2c3fa12c5..97a59fd14c7 100644 --- a/innobase/include/srv0start.h +++ b/innobase/include/srv0start.h @@ -11,6 +11,7 @@ Created 10/10/1995 Heikki Tuuri #define srv0start_h #include "univ.i" +#include "ut0byte.h" /************************************************************************* Normalizes a directory path for Windows: converts slashes to backslashes. */ @@ -79,12 +80,17 @@ innobase_shutdown_for_mysql(void); /*=============================*/ /* out: DB_SUCCESS or error code */ +extern dulint srv_shutdown_lsn; +extern dulint srv_start_lsn; + extern ulint srv_sizeof_trx_t_in_ha_innodb_cc; extern ibool srv_is_being_started; extern ibool srv_startup_is_before_trx_rollback_phase; extern ibool srv_is_being_shut_down; +extern ibool srv_start_raw_disk_in_use; + /* At a shutdown the value first climbs from 0 to SRV_SHUTDOWN_CLEANUP and then to SRV_SHUTDOWN_LAST_PHASE, and so on */ @@ -94,4 +100,7 @@ extern ulint srv_shutdown_state; #define SRV_SHUTDOWN_LAST_PHASE 2 #define SRV_SHUTDOWN_EXIT_THREADS 3 +/* Log 'spaces' have id's >= this */ +#define SRV_LOG_SPACE_FIRST_ID 0xFFFFFFF0UL + #endif diff --git a/innobase/include/sync0sync.h b/innobase/include/sync0sync.h index 320f8faf12d..330b6b77b08 100644 --- a/innobase/include/sync0sync.h +++ b/innobase/include/sync0sync.h @@ -381,8 +381,8 @@ or row lock! 
*/ #define SYNC_IBUF_HEADER 914 #define SYNC_IBUF_PESS_INSERT_MUTEX 912 #define SYNC_IBUF_MUTEX 910 /* ibuf mutex is really below - SYNC_FSP_PAGE: we assign value this - high only to get the program to pass + SYNC_FSP_PAGE: we assign a value this + high only to make the program pass the debug checks */ /*-------------------------------*/ #define SYNC_INDEX_TREE 900 @@ -401,7 +401,7 @@ or row lock! */ #define SYNC_FSP_PAGE 395 /*------------------------------------- Insert buffer headers */ /*------------------------------------- ibuf_mutex */ -/*------------------------------------- Insert buffer trees */ +/*------------------------------------- Insert buffer tree */ #define SYNC_IBUF_BITMAP_MUTEX 351 #define SYNC_IBUF_BITMAP 350 /*-------------------------------*/ diff --git a/innobase/include/trx0sys.h b/innobase/include/trx0sys.h index a54a6424a4f..0005c4a1711 100644 --- a/innobase/include/trx0sys.h +++ b/innobase/include/trx0sys.h @@ -37,21 +37,35 @@ extern trx_sys_t* trx_sys; /* Doublewrite system */ extern trx_doublewrite_t* trx_doublewrite; +extern ibool trx_doublewrite_must_reset_space_ids; +extern ibool trx_sys_multiple_tablespace_format; /******************************************************************** -Creates the doublewrite buffer at a database start. The header of the +Creates the doublewrite buffer to a new InnoDB installation. The header of the doublewrite buffer is placed on the trx system header page. */ void trx_sys_create_doublewrite_buf(void); /*================================*/ /******************************************************************** -At a database startup uses a possible doublewrite buffer to restore +At a database startup initializes the doublewrite buffer memory structure if +we already have a doublewrite buffer created in the data files. If we are +upgrading to an InnoDB version which supports multiple tablespaces, then this +function performs the necessary update operations. 
If we are in a crash +recovery, this function uses a possible doublewrite buffer to restore half-written pages in the data files. */ void -trx_sys_doublewrite_restore_corrupt_pages(void); -/*===========================================*/ +trx_sys_doublewrite_init_or_restore_pages( +/*======================================*/ + ibool restore_corrupt_pages); +/******************************************************************** +Marks the trx sys header when we have successfully upgraded to the >= 4.1.x +multiple tablespace format. */ + +void +trx_sys_mark_upgraded_to_multiple_tablespaces(void); +/*===============================================*/ /******************************************************************** Determines if a page number is located inside the doublewrite buffer. */ @@ -354,8 +368,17 @@ this contains the same fields as TRX_SYS_MYSQL_LOG_INFO below */ sys header is half-written to disk, we still may be able to recover the information */ +#define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED (24 + FSEG_HEADER_SIZE) + /* If this is not yet set to + .._N, we must reset the + doublewrite buffer, because + starting from 4.1.x the space + id of a data page is stored to + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_NO */ /*-------------------------------------------------------------*/ #define TRX_SYS_DOUBLEWRITE_MAGIC_N 536853855 +#define TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N 1783657386 + #define TRX_SYS_DOUBLEWRITE_BLOCK_SIZE FSP_EXTENT_SIZE diff --git a/innobase/include/univ.i b/innobase/include/univ.i index 4854e5a7b78..463f7e5d04d 100644 --- a/innobase/include/univ.i +++ b/innobase/include/univ.i @@ -88,10 +88,9 @@ memory is read outside the allocated blocks. 
*/ /* #define UNIV_DEBUG -#define UNIV_SYNC_DEBUG #define UNIV_MEM_DEBUG - #define UNIV_IBUF_DEBUG +#define UNIV_SYNC_DEBUG #define UNIV_SEARCH_DEBUG #define UNIV_SYNC_PERF_STAT #define UNIV_SEARCH_PERF_STAT diff --git a/innobase/include/ut0byte.h b/innobase/include/ut0byte.h index 4fb45221899..4274956421e 100644 --- a/innobase/include/ut0byte.h +++ b/innobase/include/ut0byte.h @@ -152,7 +152,7 @@ ut_dulint_align_up( Increments a dulint variable by 1. */ #define UT_DULINT_INC(D)\ {\ - if ((D).low == 0xFFFFFFFF) {\ + if ((D).low == 0xFFFFFFFFUL) {\ (D).high = (D).high + 1;\ (D).low = 0;\ } else {\ diff --git a/innobase/include/ut0byte.ic b/innobase/include/ut0byte.ic index f0df9cc35a3..5a70dcf12a8 100644 --- a/innobase/include/ut0byte.ic +++ b/innobase/include/ut0byte.ic @@ -152,13 +152,13 @@ ut_dulint_add( dulint a, /* in: dulint */ ulint b) /* in: ulint */ { - if (0xFFFFFFFF - b >= a.low) { + if (0xFFFFFFFFUL - b >= a.low) { a.low += b; return(a); } - a.low = a.low - (0xFFFFFFFF - b) - 1; + a.low = a.low - (0xFFFFFFFFUL - b) - 1; a.high++; @@ -183,7 +183,7 @@ ut_dulint_subtract( b -= a.low + 1; - a.low = 0xFFFFFFFF - b; + a.low = 0xFFFFFFFFUL - b; ut_ad(a.high > 0); @@ -214,7 +214,7 @@ ut_dulint_minus( ut_ad(a.high == b.high + 1); - diff = (ulint)(0xFFFFFFFF - b.low); + diff = (ulint)(0xFFFFFFFFUL - b.low); diff += 1 + a.low; ut_ad(diff > a.low); diff --git a/innobase/include/ut0ut.ic b/innobase/include/ut0ut.ic index 9d7dd283f29..9a0ef1c0d5b 100644 --- a/innobase/include/ut0ut.ic +++ b/innobase/include/ut0ut.ic @@ -110,7 +110,7 @@ ut_2pow_remainder( ulint n, /* in: number to be divided */ ulint m) /* in: divisor; power of 2 */ { - ut_ad(0x80000000 % m == 0); + ut_ad(0x80000000UL % m == 0); return(n & (m - 1)); } @@ -125,7 +125,7 @@ ut_2pow_round( ulint n, /* in: number to be rounded */ ulint m) /* in: divisor; power of 2 */ { - ut_ad(0x80000000 % m == 0); + ut_ad(0x80000000UL % m == 0); return(n & ~(m - 1)); } diff --git a/innobase/log/log0log.c 
b/innobase/log/log0log.c index b0140ef767b..62c1dea0d41 100644 --- a/innobase/log/log0log.c +++ b/innobase/log/log0log.c @@ -24,7 +24,8 @@ Created 12/9/1995 Heikki Tuuri #include "trx0sys.h" #include "trx0trx.h" -/* Current free limit; protected by the log sys mutex; 0 means uninitialized */ +/* Current free limit of space 0; protected by the log sys mutex; 0 means +uninitialized */ ulint log_fsp_current_free_limit = 0; /* Global log system variable */ @@ -195,11 +196,10 @@ loop: if (log->archiving_state != LOG_ARCH_OFF) { - archived_lsn_age = ut_dulint_minus(log->lsn, log->archived_lsn); - + archived_lsn_age = ut_dulint_minus(log->lsn, + log->archived_lsn); if (archived_lsn_age + len_upper_limit > log->max_archived_lsn_age) { - /* Not enough free archived space in log groups: do a synchronous archive write batch: */ @@ -466,7 +466,8 @@ ulint log_group_calc_lsn_offset( /*======================*/ /* out: offset within the log group */ - dulint lsn, /* in: lsn, must be within 4 GB of group->lsn */ + dulint lsn, /* in: lsn, must be within 4 GB of + group->lsn */ log_group_t* group) /* in: log group */ { dulint gr_lsn; @@ -978,7 +979,7 @@ log_io_complete( return; } - if ((ulint)group & 0x1) { + if ((ulint)group & 0x1UL) { /* It was a checkpoint write */ group = (log_group_t*)((ulint)group - 1); @@ -1132,7 +1133,8 @@ loop: if ((next_offset % group->file_size) + len > group->file_size) { - write_len = group->file_size - (next_offset % group->file_size); + write_len = group->file_size + - (next_offset % group->file_size); } else { write_len = len; } @@ -1681,7 +1683,7 @@ log_group_checkpoint( OS_FILE_LOG_BLOCK_SIZE, buf, ((byte*)group + 1)); - ut_ad(((ulint)group & 0x1) == 0); + ut_ad(((ulint)group & 0x1UL) == 0); } } @@ -2205,7 +2207,6 @@ loop: log_archived_file_name_gen(name, group->id, group->archived_file_no + n_files); - fil_reserve_right_to_open(); file_handle = os_file_create(name, open_mode, OS_FILE_AIO, OS_DATA_FILE, &ret); @@ -2216,10 +2217,10 @@ loop: } if 
(!ret) { - fprintf(stderr, + fprintf(stderr, "InnoDB: Cannot create or open archive log file %s.\n", name); - fprintf(stderr, "InnoDB: Cannot continue operation.\n" + fprintf(stderr, "InnoDB: Cannot continue operation.\n" "InnoDB: Check that the log archive directory exists,\n" "InnoDB: you have access rights to it, and\n" "InnoDB: there is space available.\n"); @@ -2234,12 +2235,10 @@ loop: ut_a(ret); - fil_release_right_to_open(); - /* Add the archive file as a node to the space */ fil_node_create(name, group->file_size / UNIV_PAGE_SIZE, - group->archive_space_id); + group->archive_space_id, FALSE); if (next_offset % group->file_size == 0) { log_group_archive_file_header_write(group, n_files, @@ -3085,10 +3084,24 @@ loop: ut_a(buf_all_freed()); ut_a(0 == ut_dulint_cmp(lsn, log_sys->lsn)); + if (ut_dulint_cmp(lsn, srv_start_lsn) < 0) { + fprintf(stderr, +"InnoDB: Error: log sequence number at shutdown %lu %lu\n" +"InnoDB: is lower than at startup %lu %lu!\n", + ut_dulint_get_high(lsn), + ut_dulint_get_low(lsn), + ut_dulint_get_high(srv_start_lsn), + ut_dulint_get_low(srv_start_lsn)); + } + + srv_shutdown_lsn = lsn; + fil_write_flushed_lsn_to_data_files(lsn, arch_log_no); fil_flush_file_spaces(FIL_TABLESPACE); + fil_close_all_files(); + /* Make some checks that the server really is quiet */ ut_a(srv_n_threads_active[SRV_MASTER] == 0); ut_a(buf_all_freed()); diff --git a/innobase/log/log0recv.c b/innobase/log/log0recv.c index ce90683ae7f..cf9e3c89559 100644 --- a/innobase/log/log0recv.c +++ b/innobase/log/log0recv.c @@ -17,6 +17,7 @@ Created 9/20/1997 Heikki Tuuri #include "buf0flu.h" #include "buf0rea.h" #include "srv0srv.h" +#include "srv0start.h" #include "mtr0mtr.h" #include "mtr0log.h" #include "page0page.h" @@ -73,19 +74,18 @@ ulint recv_previous_parsed_rec_is_multi = 0; ulint recv_max_parsed_page_no = 0; -/* The maximum lsn we see for a page during the recovery process. 
If this -is bigger than the lsn we are able to scan up to, that is an indication that -the recovery failed and the database may be corrupt. */ - -dulint recv_max_page_lsn; - /* This many frames must be left free in the buffer pool when we scan the log and store the scanned log records in the buffer pool: we will use these free frames to read in pages when we start applying the log records to the database. */ -ulint recv_n_pool_free_frames = 256; +ulint recv_n_pool_free_frames = 256; +/* The maximum lsn we see for a page during the recovery process. If this +is bigger than the lsn we are able to scan up to, that is an indication that +the recovery failed and the database may be corrupt. */ + +dulint recv_max_page_lsn; /************************************************************ Creates the recovery system. */ @@ -304,7 +304,8 @@ recv_copy_group( /*============*/ log_group_t* up_to_date_group, /* in: the most up-to-date log group */ - log_group_t* group, /* in: copy to this log group */ + log_group_t* group, /* in: copy to this log + group */ dulint recovered_lsn) /* in: recovery succeeded up to this lsn */ { @@ -370,7 +371,8 @@ recv_synchronize_groups( /* Read the last recovered log block to the recovery system buffer: the block is always incomplete */ - start_lsn = ut_dulint_align_down(recovered_lsn, OS_FILE_LOG_BLOCK_SIZE); + start_lsn = ut_dulint_align_down(recovered_lsn, + OS_FILE_LOG_BLOCK_SIZE); end_lsn = ut_dulint_align_up(recovered_lsn, OS_FILE_LOG_BLOCK_SIZE); ut_a(ut_dulint_cmp(start_lsn, end_lsn) != 0); @@ -426,7 +428,7 @@ recv_check_cp_is_consistent( fold = ut_fold_binary(buf, LOG_CHECKPOINT_CHECKSUM_1); - if ((fold & 0xFFFFFFFF) != mach_read_from_4(buf + if ((fold & 0xFFFFFFFFUL) != mach_read_from_4(buf + LOG_CHECKPOINT_CHECKSUM_1)) { return(FALSE); } @@ -434,7 +436,7 @@ recv_check_cp_is_consistent( fold = ut_fold_binary(buf + LOG_CHECKPOINT_LSN, LOG_CHECKPOINT_CHECKSUM_2 - LOG_CHECKPOINT_LSN); - if ((fold & 0xFFFFFFFF) != mach_read_from_4(buf + if 
((fold & 0xFFFFFFFFUL) != mach_read_from_4(buf + LOG_CHECKPOINT_CHECKSUM_2)) { return(FALSE); } @@ -541,8 +543,8 @@ recv_read_cp_info_for_backup( byte* hdr, /* in: buffer containing the log group header */ dulint* lsn, /* out: checkpoint lsn */ ulint* offset, /* out: checkpoint offset in the log group */ - ulint* fsp_limit,/* out: fsp limit, 1000000000 if the database - is running with < version 3.23.50 of InnoDB */ + ulint* fsp_limit,/* out: fsp limit of space 0, 1000000000 if the + database is running with < version 3.23.50 of InnoDB */ dulint* cp_no, /* out: checkpoint number */ dulint* first_header_lsn) /* out: lsn of of the start of the first log file */ @@ -687,7 +689,7 @@ recv_scan_log_seg_for_backup( < *scanned_checkpoint_no && *scanned_checkpoint_no - log_block_get_checkpoint_no(log_block) - > 0x80000000) { + > 0x80000000UL) { /* Garbage from a log buffer flush which was made before the most recent database recovery */ @@ -884,9 +886,14 @@ recv_add_to_hash_table( recv_data_t* recv_data; recv_data_t** prev_field; recv_addr_t* recv_addr; - - ut_a(space == 0); /* For debugging; TODO: remove this */ + if (fil_tablespace_deleted_or_being_deleted_in_mem(space, -1)) { + /* The tablespace does not exist any more: do not store the + log record */ + + return; + } + len = rec_end - body; recv = mem_heap_alloc(recv_sys->heap, sizeof(recv_t)); @@ -909,6 +916,9 @@ recv_add_to_hash_table( HASH_INSERT(recv_addr_t, addr_hash, recv_sys->addr_hash, recv_fold(space, page_no), recv_addr); recv_sys->n_addrs++; + + /* printf("Inserting log rec for space %lu, page %lu\n", + space, page_no); */ } UT_LIST_ADD_LAST(rec_list, recv_addr->rec_list, recv); @@ -1025,6 +1035,8 @@ recv_recover_page( return; } + /* printf("Recovering space %lu, page %lu\n", space, page_no); */ + recv_addr->state = RECV_BEING_PROCESSED; mutex_exit(&(recv_sys->mutex)); @@ -1036,10 +1048,10 @@ recv_recover_page( block = buf_block_align(page); if (just_read_in) { - /* Move the ownership of the x-latch on the 
page to - this OS thread, so that we can acquire a second - x-latch on it. This is needed for the operations to - the page to pass the debug checks. */ + /* Move the ownership of the x-latch on the page to this OS + thread, so that we can acquire a second x-latch on it. This + is needed for the operations to the page to pass the debug + checks. */ rw_lock_x_lock_move_ownership(&(block->lock)); } @@ -1433,7 +1445,7 @@ recv_apply_log_recs_for_backup( if (recv_addr != NULL) { success = os_file_read(data_file, page, (nth_page_in_file << UNIV_PAGE_SIZE_SHIFT) - & 0xFFFFFFFF, + & 0xFFFFFFFFUL, nth_page_in_file >> (32 - UNIV_PAGE_SIZE_SHIFT), UNIV_PAGE_SIZE); if (!success) { @@ -1713,7 +1725,7 @@ recv_parse_log_rec( if (*ptr == MLOG_DUMMY_RECORD) { *type = *ptr; - *space = 1000; /* For debugging */ + *space = ULINT_UNDEFINED - 1; /* For debugging */ return(1); } @@ -1727,7 +1739,7 @@ recv_parse_log_rec( /* Check that space id and page_no are sensible */ - if (*space != 0 || *page_no > 0x8FFFFFFF) { + if (*page_no > 0x8FFFFFFFUL) { recv_sys->found_corrupt_log = TRUE; @@ -2265,7 +2277,7 @@ recv_scan_log_recs( < recv_sys->scanned_checkpoint_no) && (recv_sys->scanned_checkpoint_no - log_block_get_checkpoint_no(log_block) - > 0x80000000)) { + > 0x80000000UL)) { /* Garbage from a log buffer flush which was made before the most recent database recovery */ @@ -2298,7 +2310,8 @@ recv_scan_log_recs( if (ut_dulint_cmp(scanned_lsn, recv_sys->scanned_lsn) > 0) { /* We were able to find more log data: add it to the - parsing buffer if parse_start_lsn is already non-zero */ + parsing buffer if parse_start_lsn is already + non-zero */ if (recv_sys->len + 4 * OS_FILE_LOG_BLOCK_SIZE >= RECV_PARSING_BUF_SIZE) { @@ -2396,8 +2409,8 @@ recv_group_scan_log_recs( group, start_lsn, end_lsn); finished = recv_scan_log_recs(TRUE, - (buf_pool->n_frames - - recv_n_pool_free_frames) * UNIV_PAGE_SIZE, + (buf_pool->n_frames + - recv_n_pool_free_frames) * UNIV_PAGE_SIZE, TRUE, log_sys->buf, 
RECV_SCAN_SIZE, start_lsn, contiguous_lsn, group_scanned_lsn); @@ -2447,7 +2460,6 @@ recv_recovery_from_checkpoint_start( || (ut_dulint_cmp(limit_lsn, ut_dulint_max) == 0)); if (type == LOG_CHECKPOINT) { - recv_sys_create(); recv_sys_init(FALSE, buf_pool_get_curr_size()); } @@ -2461,8 +2473,6 @@ recv_recovery_from_checkpoint_start( return(DB_SUCCESS); } - sync_order_checks_on = TRUE; - recv_recovery_on = TRUE; recv_sys->limit_lsn = limit_lsn; @@ -2535,25 +2545,72 @@ recv_recovery_from_checkpoint_start( recv_sys->scanned_checkpoint_no = 0; recv_sys->recovered_lsn = checkpoint_lsn; - /* NOTE: we always do recovery at startup, but only if + srv_start_lsn = checkpoint_lsn; + + /* NOTE: we always do a 'recovery' at startup, but only if there is something wrong we will print a message to the user about recovery: */ if (ut_dulint_cmp(checkpoint_lsn, max_flushed_lsn) != 0 || ut_dulint_cmp(checkpoint_lsn, min_flushed_lsn) != 0) { + if (ut_dulint_cmp(checkpoint_lsn, max_flushed_lsn) + < 0) { + fprintf(stderr, +"InnoDB: ##########################################################\n" +"InnoDB: WARNING!\n" +"InnoDB: The log sequence number in ibdata files is higher\n" +"InnoDB: than the log sequence number in the ib_logfiles! 
Are you sure\n" +"InnoDB: you are using the right ib_logfiles to start up the database?\n" +"InnoDB: Log sequence number in ib_logfiles is %lu %lu, log\n" +"InnoDB: sequence numbers stamped to ibdata file headers are between\n" +"InnoDB: %lu %lu and %lu %lu.\n" +"InnoDB: ##########################################################\n", + ut_dulint_get_high(checkpoint_lsn), + ut_dulint_get_low(checkpoint_lsn), + ut_dulint_get_high(min_flushed_lsn), + ut_dulint_get_low(min_flushed_lsn), + ut_dulint_get_high(max_flushed_lsn), + ut_dulint_get_low(max_flushed_lsn)); + } + recv_needed_recovery = TRUE; ut_print_timestamp(stderr); fprintf(stderr, - " InnoDB: Database was not shut down normally.\n" - "InnoDB: Starting recovery from log files...\n"); +" InnoDB: Database was not shut down normally!\n" +"InnoDB: Starting crash recovery.\n"); + + fprintf(stderr, +"InnoDB: Reading tablespace information from the .ibd files...\n"); + + fil_load_single_table_tablespaces(); + + /* If we are using the doublewrite method, we will + check if there are half-written pages in data files, + and restore them from the doublewrite buffer if + possible */ + + if (srv_force_recovery < SRV_FORCE_NO_LOG_REDO) { + + fprintf(stderr, +"InnoDB: Restoring possible half-written data pages from the doublewrite\n" +"InnoDB: buffer...\n"); + trx_sys_doublewrite_init_or_restore_pages( + TRUE); + } + + ut_print_timestamp(stderr); + fprintf(stderr, - "InnoDB: Starting log scan based on checkpoint at\n" - "InnoDB: log sequence number %lu %lu\n", +" InnoDB: Starting log scan based on checkpoint at\n" +"InnoDB: log sequence number %lu %lu.\n", ut_dulint_get_high(checkpoint_lsn), ut_dulint_get_low(checkpoint_lsn)); + } else { + /* Init the doublewrite buffer memory structure */ + trx_sys_doublewrite_init_or_restore_pages(FALSE); } } @@ -2675,6 +2732,21 @@ recv_recovery_from_checkpoint_start( log_sys->archived_lsn = archived_lsn; recv_synchronize_groups(up_to_date_group); + + if (!recv_needed_recovery) { + if 
(ut_dulint_cmp(checkpoint_lsn, recv_sys->recovered_lsn) + != 0) { + fprintf(stderr, +"InnoDB: Warning: we did not need to do crash recovery, but log scan\n" +"InnoDB: progressed past the checkpoint lsn %lu %lu up to lsn %lu %lu\n", + ut_dulint_get_high(checkpoint_lsn), + ut_dulint_get_low(checkpoint_lsn), + ut_dulint_get_high(recv_sys->recovered_lsn), + ut_dulint_get_low(recv_sys->recovered_lsn)); + } + } else { + srv_start_lsn = recv_sys->recovered_lsn; + } log_sys->lsn = recv_sys->recovered_lsn; @@ -2703,8 +2775,6 @@ recv_recovery_from_checkpoint_start( mutex_exit(&(log_sys->mutex)); - sync_order_checks_on = FALSE; - recv_lsn_checks_on = TRUE; /* The database is now ready to start almost normal processing of user @@ -2860,16 +2930,16 @@ recv_reset_log_files_for_backup( printf( "Setting log file size to %lu %lu\n", ut_get_high32(log_file_size), - log_file_size & 0xFFFFFFFF); + log_file_size & 0xFFFFFFFFUL); success = os_file_set_size(name, log_file, - log_file_size & 0xFFFFFFFF, + log_file_size & 0xFFFFFFFFUL, ut_get_high32(log_file_size)); if (!success) { printf( "InnoDB: Cannot set %s size to %lu %lu\n", name, ut_get_high32(log_file_size), - log_file_size & 0xFFFFFFFF); + log_file_size & 0xFFFFFFFFUL); exit(1); } @@ -2933,13 +3003,10 @@ try_open_again: log_archived_file_name_gen(name, group->id, group->archived_file_no); - fil_reserve_right_to_open(); - file_handle = os_file_create(name, OS_FILE_OPEN, OS_FILE_LOG, OS_FILE_AIO, &ret); if (ret == FALSE) { - fil_release_right_to_open(); ask_again: fprintf(stderr, "InnoDB: Do you want to copy additional archived log files\n" @@ -2980,12 +3047,10 @@ ask_again: ut_a(ret); - fil_release_right_to_open(); - /* Add the archive file as a node to the space */ fil_node_create(name, 1 + file_size / UNIV_PAGE_SIZE, - group->archive_space_id); + group->archive_space_id, FALSE); ut_a(RECV_SCAN_SIZE >= LOG_FILE_HDR_SIZE); /* Read the archive file header */ @@ -3061,8 +3126,8 @@ ask_again: read_offset % UNIV_PAGE_SIZE, len, buf, 
NULL); ret = recv_scan_log_recs(TRUE, - (buf_pool->n_frames - - recv_n_pool_free_frames) * UNIV_PAGE_SIZE, + (buf_pool->n_frames - + recv_n_pool_free_frames) * UNIV_PAGE_SIZE, TRUE, buf, len, start_lsn, &dummy_lsn, &scanned_lsn); @@ -3110,8 +3175,6 @@ recv_recovery_from_archive_start( recv_sys_create(); recv_sys_init(FALSE, buf_pool_get_curr_size()); - - sync_order_checks_on = TRUE; recv_recovery_on = TRUE; recv_recovery_from_backup_on = TRUE; @@ -3198,8 +3261,6 @@ recv_recovery_from_archive_start( mutex_exit(&(log_sys->mutex)); - sync_order_checks_on = FALSE; - return(DB_SUCCESS); } diff --git a/innobase/mach/mach0data.c b/innobase/mach/mach0data.c index 336ce106a75..ff7265b34f4 100644 --- a/innobase/mach/mach0data.c +++ b/innobase/mach/mach0data.c @@ -36,37 +36,37 @@ mach_parse_compressed( flag = mach_read_from_1(ptr); - if (flag < 0x80) { + if (flag < 0x80UL) { *val = flag; return(ptr + 1); - } else if (flag < 0xC0) { + } else if (flag < 0xC0UL) { if (end_ptr < ptr + 2) { return(NULL); } - *val = mach_read_from_2(ptr) & 0x7FFF; + *val = mach_read_from_2(ptr) & 0x7FFFUL; return(ptr + 2); - } else if (flag < 0xE0) { + } else if (flag < 0xE0UL) { if (end_ptr < ptr + 3) { return(NULL); } - *val = mach_read_from_3(ptr) & 0x3FFFFF; + *val = mach_read_from_3(ptr) & 0x3FFFFFUL; return(ptr + 3); - } else if (flag < 0xF0) { + } else if (flag < 0xF0UL) { if (end_ptr < ptr + 4) { return(NULL); } - *val = mach_read_from_4(ptr) & 0x1FFFFFFF; + *val = mach_read_from_4(ptr) & 0x1FFFFFFFUL; return(ptr + 4); } else { - ut_ad(flag == 0xF0); + ut_ad(flag == 0xF0UL); if (end_ptr < ptr + 5) { return(NULL); diff --git a/innobase/mem/mem0dbg.c b/innobase/mem/mem0dbg.c index 22d0bab0da2..1ff44bd8967 100644 --- a/innobase/mem/mem0dbg.c +++ b/innobase/mem/mem0dbg.c @@ -346,21 +346,22 @@ mem_hash_remove( mem_heap_validate_or_print(node->heap, NULL, FALSE, &error, &size, NULL, NULL); if (error) { - printf("Inconsistency in memory heap or buffer n:o %lu created\n", + printf( +"Inconsistency 
in memory heap or buffer n:o %lu created\n", node->nth_heap); - printf("in %s line %lu and tried to free in %s line %lu.\n", + printf("in %s line %lu and tried to free in %s line %lu.\n", node->file_name, node->line, file_name, line); - printf( - "Hex dump of 400 bytes around memory heap first block start:\n"); + printf( +"Hex dump of 400 bytes around memory heap first block start:\n"); - ut_print_buf((byte*)(node->heap) - 200, 400); + ut_print_buf((byte*)(node->heap) - 200, 400); - printf("\nDump of the mem heap:\n"); + printf("\nDump of the mem heap:\n"); - mem_heap_validate_or_print(node->heap, NULL, TRUE, &error, &size, - NULL, NULL); - ut_error; + mem_heap_validate_or_print(node->heap, NULL, TRUE, &error, + &size, NULL, NULL); + ut_error; } /* Free the memory occupied by the node struct */ @@ -441,6 +442,9 @@ mem_heap_validate_or_print( if ((block->type == MEM_HEAP_BUFFER) && (mem_block_get_len(block) > UNIV_PAGE_SIZE)) { + fprintf(stderr, +"InnoDB: Error: mem block %lx length %lu > UNIV_PAGE_SIZE\n", (ulint)block, + mem_block_get_len(block)); /* error */ return; @@ -480,6 +484,12 @@ mem_heap_validate_or_print( mem_field_trailer_get_check(user_field)) { /* error */ + fprintf(stderr, +"InnoDB: Error: block %lx mem field %lx len %lu\n" +"InnoDB: header check field is %lx but trailer %lx\n", (ulint)block, + (ulint)field, len, check_field, + mem_field_trailer_get_check(user_field)); + return; } @@ -499,6 +509,11 @@ mem_heap_validate_or_print( if (field != (byte*)block + mem_block_get_free(block)) { /* error */ + fprintf(stderr, +"InnoDB: Error: block %lx end of mem fields %lx\n" +"InnoDB: but block free at %lx\n", (ulint)block, (ulint)field, + (ulint)((byte*)block + mem_block_get_free(block))); + return; } @@ -577,6 +592,10 @@ mem_heap_validate( mem_heap_validate_or_print(heap, NULL, FALSE, &error, &us_size, &phys_size, &n_blocks); + if (error) { + mem_heap_print(heap); + } + ut_a(!error); return(TRUE); diff --git a/innobase/mem/mem0pool.c 
b/innobase/mem/mem0pool.c index b004a8c4df7..9b1aea089d1 100644 --- a/innobase/mem/mem0pool.c +++ b/innobase/mem/mem0pool.c @@ -603,8 +603,8 @@ mem_pool_validate( } } - ut_anp(free + pool->reserved == pool->size - - (pool->size % MEM_AREA_MIN_SIZE)); + ut_anp(free + pool->reserved == pool->size); + mutex_exit(&(pool->mutex)); return(TRUE); diff --git a/innobase/mtr/mtr0log.c b/innobase/mtr/mtr0log.c index 2cfe81d3261..898778dda53 100644 --- a/innobase/mtr/mtr0log.c +++ b/innobase/mtr/mtr0log.c @@ -171,13 +171,13 @@ mlog_parse_nbytes( } if (type == MLOG_1BYTE) { - if (val > 0xFF) { + if (val > 0xFFUL) { recv_sys->found_corrupt_log = TRUE; return(NULL); } } else if (type == MLOG_2BYTES) { - if (val > 0xFFFF) { + if (val > 0xFFFFUL) { recv_sys->found_corrupt_log = TRUE; return(NULL); diff --git a/innobase/os/os0file.c b/innobase/os/os0file.c index f4f50320906..0e44104a53c 100644 --- a/innobase/os/os0file.c +++ b/innobase/os/os0file.c @@ -11,6 +11,7 @@ Created 10/21/1995 Heikki Tuuri #include "os0thread.h" #include "ut0mem.h" #include "srv0srv.h" +#include "srv0start.h" #include "fil0fil.h" #include "buf0buf.h" @@ -33,7 +34,7 @@ ulint os_innodb_umask = 0; #endif /* If the following is set to TRUE, we do not call os_file_flush in every -os_file_write. We can set this TRUE if the doublewrite buffer is used. */ +os_file_write. We can set this TRUE when the doublewrite buffer is used. */ ibool os_do_not_call_flush_at_each_write = FALSE; /* We use these mutexes to protect lseek + file i/o operation, if the @@ -154,7 +155,6 @@ os_mutex_t os_file_count_mutex; ulint os_file_n_pending_preads = 0; ulint os_file_n_pending_pwrites = 0; - /*************************************************************************** Gets the operating system version. Currently works only on Windows. */ @@ -198,9 +198,12 @@ overwrite the error number). If the number is not known to this program, the OS error number + 100 is returned. 
*/ ulint -os_file_get_last_error(void) -/*========================*/ - /* out: error number, or OS error number + 100 */ +os_file_get_last_error( +/*===================*/ + /* out: error number, or OS error + number + 100 */ + ibool report_all_errors) /* in: TRUE if we want an error message + printed of all errors */ { ulint err; @@ -208,7 +211,8 @@ os_file_get_last_error(void) err = (ulint) GetLastError(); - if (err != ERROR_DISK_FULL && err != ERROR_FILE_EXISTS) { + if (report_all_errors + || (err != ERROR_DISK_FULL && err != ERROR_FILE_EXISTS)) { ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Operating system error number %lu in a file operation.\n" @@ -246,7 +250,8 @@ os_file_get_last_error(void) #else err = (ulint) errno; - if (err != ENOSPC && err != EEXIST) { + if (report_all_errors + || (err != ENOSPC && err != EEXIST)) { ut_print_timestamp(stderr); fprintf(stderr, @@ -309,7 +314,7 @@ os_file_handle_error( UT_NOT_USED(file); - err = os_file_get_last_error(); + err = os_file_get_last_error(FALSE); if (err == OS_FILE_DISK_FULL) { /* We only print a warning about disk full once */ @@ -374,6 +379,217 @@ os_io_init_simple(void) } } +/*************************************************************************** +The os_file_opendir() function opens a directory stream corresponding to the +directory named by the dirname argument. The directory stream is positioned +at the first entry. In both Unix and Windows we automatically skip the '.' +and '..' items at the start of the directory listing. 
*/ + +os_file_dir_t +os_file_opendir( +/*============*/ + /* out: directory stream, NULL if error */ + char* dirname, /* in: directory name; it must not contain + a trailing '\' or '/' */ + ibool error_is_fatal) /* in: TRUE if we should treat an error as a + fatal error; if we try to open symlinks then + we do not wish a fatal error if it happens + not to be a directory */ +{ + os_file_dir_t dir; +#ifdef __WIN__ + LPWIN32_FIND_DATA lpFindFileData; + char path[OS_FILE_MAX_PATH + 3]; + + ut_a(strlen(dirname) < OS_FILE_MAX_PATH); + + strcpy(path, dirname); + strcpy(path + strlen(path), "\*"); + + /* Note that in Windows opening the 'directory stream' also retrieves + the first entry in the directory. Since it is '.', that is no problem, + as we will skip over the '.' and '..' entries anyway. */ + + lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA)); + + dir = FindFirstFile(path, lpFindFileData); + + ut_free(lpFindFileData); + + if (dir == INVALID_HANDLE_VALUE) { + + if (error_is_fatal) { + os_file_handle_error(NULL, dirname, "opendir"); + } + + return(NULL); + } + + return(dir); +#else + dir = opendir(dirname); + + if (dir == NULL && error_is_fatal) { + os_file_handle_error(0, dirname, "opendir"); + } + + return(dir); +#endif +} + +/*************************************************************************** +Closes a directory stream. */ + +int +os_file_closedir( +/*=============*/ + /* out: 0 if success, -1 if failure */ + os_file_dir_t dir) /* in: directory stream */ +{ +#ifdef __WIN__ + BOOL ret; + + ret = FindClose(dir); + + if (!ret) { + os_file_handle_error(NULL, NULL, "closedir"); + + return(-1); + } + + return(0); +#else + int ret; + + ret = closedir(dir); + + if (ret) { + os_file_handle_error(0, NULL, "closedir"); + } + + return(ret); +#endif +} + +/*************************************************************************** +This function returns information of the next file in the directory. We jump +over the '.' and '..' entries in the directory. 
*/ + +int +os_file_readdir_next_file( +/*======================*/ + /* out: 0 if ok, -1 if error, 1 if at the end + of the directory */ + char* dirname,/* in: directory name or path */ + os_file_dir_t dir, /* in: directory stream */ + os_file_stat_t* info) /* in/out: buffer where the info is returned */ +{ +#ifdef __WIN__ + LPWIN32_FIND_DATA lpFindFileData; + BOOL ret; + + lpFindFileData = ut_malloc(sizeof(WIN32_FIND_DATA)); +next_file: + ret = FindNextFile(dir, lpFindFileData); + + if (ret) { + ut_a(strlen(lpFindFileData->cFilename) < OS_FILE_MAX_PATH); + + if (strcmp(lpFindFileData->cFilename, ".") == 0 + || strcmp(lpFindFileData->cFilename, "..") == 0) { + + goto next_file; + } + + strcpy(info->name, lpFindFileData->cFilename); + + info->size = (ib_longlong)(buf->nFileSizeLow) + + (((ib_longlong)(buf->nFileSizeHigh)) << 32); + + if (lpFindFileData->dwFileAttributes + & FILE_ATTRIBUTE_REPARSE_POINT) { +/* TODO: test Windows symlinks */ +/* TODO: MySQL has apparently its own symlink implementation in Windows, +dbname.sym can redirect a database directory: +http://www.mysql.com/doc/en/Windows_symbolic_links.html */ + info->type = OS_FILE_TYPE_LINK; + } else if (lpFindFileData->dwFileAttributes + & FILE_ATTRIBUTE_DIRECTORY) { + info->type = OS_FILE_TYPE_DIR; + } else if (lpFindFileData->dwFileAttributes + & FILE_ATTRIBUTE_NORMAL) { +/* TODO: are FILE_ATTRIBUTE_NORMAL files really all normal files? 
*/ + info->type = OS_FILE_TYPE_FILE; + } else { + info->type = OS_FILE_TYPE_UNKNOWN; + } + } + + ut_free(lpFindFileData); + + if (ret) { + return(0); + } else if (GetLastError() == ERROR_NO_MORE_FILES) { + + return(1); + } else { + os_file_handle_error(NULL, dirname, "readdir_next_file"); + + return(-1); + } +#else + struct dirent* ent; + char* full_path; + int ret; + struct stat statinfo; +next_file: + ent = readdir(dir); + + if (ent == NULL) { + return(1); + } + + ut_a(strlen(ent->d_name) < OS_FILE_MAX_PATH); + + if (strcmp(ent->d_name, ".") == 0 || strcmp(ent->d_name, "..") == 0) { + + goto next_file; + } + + strcpy(info->name, ent->d_name); + + full_path = ut_malloc(strlen(dirname) + strlen(ent->d_name) + 10); + + sprintf(full_path, "%s/%s", dirname, ent->d_name); + + ret = stat(full_path, &statinfo); + + if (ret) { + os_file_handle_error(0, full_path, "stat"); + + ut_free(full_path); + + return(-1); + } + + info->size = (ib_longlong)statinfo.st_size; + + if (S_ISDIR(statinfo.st_mode)) { + info->type = OS_FILE_TYPE_DIR; + } else if (S_ISLNK(statinfo.st_mode)) { + info->type = OS_FILE_TYPE_LINK; + } else if (S_ISREG(statinfo.st_mode)) { + info->type = OS_FILE_TYPE_FILE; + } else { + info->type = OS_FILE_TYPE_UNKNOWN; + } + + ut_free(full_path); + + return(0); +#endif +} + /******************************************************************** A simple function to open or create a file. */ @@ -593,7 +809,9 @@ os_file_create( ulint create_mode, /* in: OS_FILE_OPEN if an existing file is opened (if does not exist, error), or OS_FILE_CREATE if a new file is created (if exists, error), OS_FILE_OVERWRITE - if a new is created or an old overwritten */ + if a new is created or an old overwritten, + OS_FILE_OPEN_RAW, if a raw device or disk partition + should be opened */ ulint purpose,/* in: OS_FILE_AIO, if asynchronous, non-buffered i/o is desired, OS_FILE_NORMAL, if any normal file; NOTE that it also depends on type, os_aio_.. and srv_.. 
@@ -605,6 +823,7 @@ os_file_create( { #ifdef __WIN__ os_file_t file; + DWORD share_mode = FILE_SHARE_READ; DWORD create_flag; DWORD attributes; ibool retry; @@ -612,6 +831,9 @@ os_file_create( try_again: ut_a(name); + if (create_mode == OS_FILE_OPEN_RAW) { + create_flag = OPEN_EXISTING; + share_mode = FILE_SHARE_WRITE; if (create_mode == OS_FILE_OPEN) { create_flag = OPEN_EXISTING; } else if (create_mode == OS_FILE_CREATE) { @@ -662,14 +884,17 @@ try_again: file = CreateFile(name, GENERIC_READ | GENERIC_WRITE, /* read and write access */ - FILE_SHARE_READ,/* File can be read also by other + share_mode, /* File can be read also by other processes; we must give the read permission because of ibbackup. We do not give the write permission to others because if one would succeed to start 2 instances of mysqld on the SAME files, that could cause severe - database corruption! */ + database corruption! When opening + raw disk partitions Microsoft manuals + say that we must give also the write + permission. */ NULL, /* default security attributes */ create_flag, attributes, @@ -679,8 +904,8 @@ try_again: *success = FALSE; retry = os_file_handle_error(file, name, - create_mode == OS_FILE_OPEN ? - "open" : "create"); + create_mode == OS_FILE_CREATE ? + "create" : "open"); if (retry) { goto try_again; } @@ -700,7 +925,7 @@ try_again: try_again: ut_a(name); - if (create_mode == OS_FILE_OPEN) { + if (create_mode == OS_FILE_OPEN || create_mode == OS_FILE_OPEN_RAW) { mode_str = "OPEN"; create_flag = O_RDWR; @@ -767,8 +992,8 @@ try_again: *success = FALSE; retry = os_file_handle_error(file, name, - create_mode == OS_FILE_OPEN ? - "open" : "create"); + create_mode == OS_FILE_CREATE ? + "create" : "open"); if (retry) { goto try_again; } @@ -781,6 +1006,85 @@ try_again: } /*************************************************************************** +Deletes a file. The file has to be closed before calling this. 
*/ + +ibool +os_file_delete( +/*===========*/ + /* out: TRUE if success */ + char* name) /* in: file path as a null-terminated string */ +{ +#ifdef __WIN__ + os_file_t dummy = NULL; + BOOL ret; + + ret = DeleteFile((LPCTSTR)name); + + if (ret) { + return(TRUE); + } + + os_file_handle_error(dummy, name, "delete"); + + return(FALSE); +#else + os_file_t dummy = 0; + int ret; + + ret = unlink((const char*)name); + + if (ret != 0) { + os_file_handle_error(dummy, name, "delete"); + + return(FALSE); + } + + return(TRUE); +#endif +} + +/*************************************************************************** +Renames a file (can also move it to another directory). It is safest that the +file is closed before calling this function. */ + +ibool +os_file_rename( +/*===========*/ + /* out: TRUE if success */ + char* oldpath, /* in: old file path as a null-terminated + string */ + char* newpath) /* in: new file path */ +{ +#ifdef __WIN__ + os_file_t dummy = NULL; + BOOL ret; + + ret = MoveFile((LPCTSTR)oldpath, (LPCTSTR)newpath); + + if (ret) { + return(TRUE); + } + + os_file_handle_error(dummy, oldpath, "delete"); + + return(FALSE); +#else + os_file_t dummy = 0; + int ret; + + ret = rename((const char*)oldpath, (const char*)newpath); + + if (ret != 0) { + os_file_handle_error(dummy, oldpath, "rename"); + + return(FALSE); + } + + return(TRUE); +#endif +} + +/*************************************************************************** Closes a file handle. In case of error, error number can be retrieved with os_file_get_last_error. 
*/ @@ -889,7 +1193,7 @@ os_file_get_size( } if (sizeof(off_t) > 4) { - *size = (ulint)(offs & 0xFFFFFFFF); + *size = (ulint)(offs & 0xFFFFFFFFUL); *size_high = (ulint)(offs >> 32); } else { *size = (ulint) offs; @@ -1012,6 +1316,15 @@ os_file_flush( return(TRUE); } + /* Since Windows returns ERROR_INVALID_FUNCTION if the 'file' is + actually a raw device, we choose to ignore that error if we are using + raw disks */ + + if (srv_start_raw_disk_in_use && GetLastError() + == ERROR_INVALID_FUNCTION) { + return(TRUE); + } + os_file_handle_error(file, NULL, "flush"); /* It is a fatal error if a file flush does not succeed, because then @@ -1035,9 +1348,10 @@ os_file_flush( } /* Since Linux returns EINVAL if the 'file' is actually a raw device, - we choose to ignore that error */ + we choose to ignore that error if we are using raw disks */ + + if (srv_start_raw_disk_in_use && errno == EINVAL) { - if (errno == EINVAL) { return(TRUE); } @@ -1075,7 +1389,7 @@ os_file_pread( off_t offs; ssize_t n_bytes; - ut_a((offset & 0xFFFFFFFF) == offset); + ut_a((offset & 0xFFFFFFFFUL) == offset); /* If off_t is > 4 bytes in size, then we assume we can pass a 64-bit address */ @@ -1151,7 +1465,7 @@ os_file_pwrite( ssize_t ret; off_t offs; - ut_a((offset & 0xFFFFFFFF) == offset); + ut_a((offset & 0xFFFFFFFFUL) == offset); /* If off_t is > 4 bytes in size, then we assume we can pass a 64-bit address */ @@ -1255,7 +1569,7 @@ os_file_read( ibool retry; ulint i; - ut_a((offset & 0xFFFFFFFF) == offset); + ut_a((offset & 0xFFFFFFFFUL) == offset); os_n_file_reads++; os_bytes_read_since_printout += n; diff --git a/innobase/os/os0proc.c b/innobase/os/os0proc.c index 614cea63200..a427c595bc0 100644 --- a/innobase/os/os0proc.c +++ b/innobase/os/os0proc.c @@ -321,7 +321,7 @@ os_awe_allocate_virtual_mem_window( #elif defined(__WIN2000__) byte* ptr; - if (size > 0x7FFFFFFFFF) { + if (size > (ulint)0x7FFFFFFFUL) { fprintf(stderr, "InnoDB: AWE: Cannot allocate %lu bytes of virtual memory\n", size); @@ 
-333,7 +333,7 @@ os_awe_allocate_virtual_mem_window( if (ptr == NULL) { fprintf(stderr, "InnoDB: AWE: Cannot allocate %lu bytes of virtual memory, error %lu\n", - size, (ulint)GetLastError()); + size, (ulint)GetLastError()); return(NULL); } diff --git a/innobase/page/page0cur.c b/innobase/page/page0cur.c index 7e2fc19c00f..fa4fa94620a 100644 --- a/innobase/page/page0cur.c +++ b/innobase/page/page0cur.c @@ -629,7 +629,7 @@ page_cur_parse_insert_rec( return(NULL); } - extra_info_yes = end_seg_len & 0x1; + extra_info_yes = end_seg_len & 0x1UL; end_seg_len = end_seg_len / 2; if (end_seg_len >= UNIV_PAGE_SIZE) { @@ -702,7 +702,8 @@ page_cur_parse_insert_rec( /* Build the inserted record to buf */ if (mismatch_index >= UNIV_PAGE_SIZE) { - printf("Is short %lu, info_bits %lu, offset %lu, o_offset %lu\n" + printf( + "Is short %lu, info_bits %lu, offset %lu, o_offset %lu\n" "mismatch index %lu, end_seg_len %lu\n" "parsed len %lu\n", is_short, info_bits, offset, origin_offset, diff --git a/innobase/que/que0que.c b/innobase/que/que0que.c index a96c8840a03..3f28a4b40a5 100644 --- a/innobase/que/que0que.c +++ b/innobase/que/que0que.c @@ -1473,10 +1473,6 @@ loop: mutex_exit(&kernel_mutex); } */ - /* TRUE below denotes that the thread is allowed to own the dictionary - mutex, though */ - ut_ad(sync_thread_levels_empty_gen(TRUE)); - loop_count++; if (next_thr != thr) { diff --git a/innobase/row/row0ins.c b/innobase/row/row0ins.c index e02859bc851..1a7864be5e9 100644 --- a/innobase/row/row0ins.c +++ b/innobase/row/row0ins.c @@ -1739,6 +1739,7 @@ row_ins_index_entry_low( ulint modify = 0; /* remove warning */ rec_t* insert_rec; rec_t* rec; + rec_t* first_rec; ulint err; ulint n_unique; big_rec_t* big_rec = NULL; @@ -1771,6 +1772,14 @@ row_ins_index_entry_low( goto function_exit; } + first_rec = page_rec_get_next(page_get_infimum_rec( + buf_frame_align(btr_cur_get_rec(&cursor)))); + + if (!page_rec_is_supremum(first_rec)) { + ut_a((rec_get_n_fields(first_rec)) + == 
dtuple_get_n_fields(entry)); + } + n_unique = dict_index_get_n_unique(index); if (index->type & DICT_UNIQUE && (cursor.up_match >= n_unique diff --git a/innobase/row/row0mysql.c b/innobase/row/row0mysql.c index 35305b037c6..497b74fd320 100644 --- a/innobase/row/row0mysql.c +++ b/innobase/row/row0mysql.c @@ -22,12 +22,15 @@ Created 9/17/2000 Heikki Tuuri #include "dict0dict.h" #include "dict0crea.h" #include "dict0load.h" +#include "dict0boot.h" #include "trx0roll.h" #include "trx0purge.h" #include "lock0lock.h" #include "rem0cmp.h" #include "log0log.h" #include "btr0sea.h" +#include "fil0fil.h" +#include "ibuf0ibuf.h" /* A dummy variable used to fool the compiler */ ibool row_mysql_identically_false = FALSE; @@ -1161,7 +1164,9 @@ row_mysql_recover_tmp_table( trx_t* trx) /* in: transaction handle */ { char* ptr; - char old_name[1000]; + char old_name[OS_FILE_MAX_PATH]; + + ut_a(ut_strlen(table->name) + 10 < OS_FILE_MAX_PATH); ut_memcpy(old_name, table->name, ut_strlen(table->name) + 1); @@ -1230,7 +1235,8 @@ row_mysql_lock_data_dictionary( /*===========================*/ trx_t* trx) /* in: transaction */ { - ut_a(trx->dict_operation_lock_mode == 0); + ut_a(trx->dict_operation_lock_mode == 0 + || trx->dict_operation_lock_mode == RW_X_LATCH); /* Serialize data dictionary operations with dictionary mutex: no deadlocks or lock waits can occur then in these operations */ @@ -1431,15 +1437,14 @@ row_create_table_for_mysql( "InnoDB: Warning: cannot create table %s because tablespace full\n", table->name); row_drop_table_for_mysql(table->name, trx); - } else { - ut_a(err == DB_DUPLICATE_KEY); + } else if (err == DB_DUPLICATE_KEY) { ut_print_timestamp(stderr); fprintf(stderr, " InnoDB: Error: table %s already exists in InnoDB internal\n" "InnoDB: data dictionary. Have you deleted the .frm file\n" - "InnoDB: and not used DROP TABLE? Have you used DROP DATABASE\n" + "InnoDB: and not used DROPT ABLE? 
Have you used DROP DATABASE\n" "InnoDB: for InnoDB tables in MySQL version <= 3.23.43?\n" "InnoDB: See the Restrictions section of the InnoDB manual.\n", table->name); @@ -1449,9 +1454,12 @@ row_create_table_for_mysql( "InnoDB: database and moving the .frm file to the current database.\n" "InnoDB: Then MySQL thinks the table exists, and DROP TABLE will\n" "InnoDB: succeed.\n" - "InnoDB: You can look further help from section 15.1 of\n" + "InnoDB: You can look for further help from section 15.1 of\n" "InnoDB: http://www.innodb.com/ibman.html\n"); } + + /* We may also get err == DB_ERROR if the .ibd file for the + table already exists */ trx->error_state = DB_SUCCESS; } @@ -1490,7 +1498,7 @@ row_create_index_for_mysql( trx->op_info = (char *) "creating index"; /* Check that the same column does not appear twice in the index. - Starting from 4.0.14 InnoDB should be able to cope with that, but + Starting from 4.0.14, InnoDB should be able to cope with that, but safer not to allow them. */ for (i = 0; i < dict_index_get_n_fields(index); i++) { @@ -1532,6 +1540,9 @@ row_create_index_for_mysql( trx->dict_operation = TRUE; + /* Note that the space id where we store the index is inherited from + the table in dict_build_index_def_step() in dict0crea.c. */ + node = ind_create_graph_create(index, heap); thr = pars_complete_graph_for_exec(node, trx, heap); @@ -1545,7 +1556,6 @@ row_create_index_for_mysql( que_graph_free((que_t*) que_node_get_parent(thr)); error_handling: - if (err != DB_SUCCESS) { /* We have special error handling here */ @@ -1806,6 +1816,218 @@ row_add_table_to_background_drop_list( } /************************************************************************* +Discards the tablespace of a table which stored in an .ibd file. Discarding +means that this function deletes the .ibd file and assigns a new table id for +the table. Also the flag table->ibd_file_missing is set TRUE. + +How do we prevent crashes caused by ongoing operations on the table? 
Old +operations could try to access non-existent pages. + +1) SQL queries, INSERT, SELECT, ...: we must get an exclusive MySQL table lock +on the table before we can do DISCARD TABLESPACE. Then there are no running +queries on the table. +2) Purge and rollback: we assign a new table id for the table. Since purge and +rollback look for the table based on the table id, they see the table as +'dropped' and discard their operations. +3) Insert buffer: we remove all entries for the tablespace in the insert +buffer tree; as long as the tablespace mem object does not exist, ongoing +insert buffer page merges are discarded in buf0rea.c. If we recreate the +tablespace mem object with IMPORT TABLESPACE later, then the tablespace will +have the same id, but the tablespace_version field in the mem object is +different, and ongoing old insert buffer page merges get discarded. +4) Linear readahead and random readahead: we use the same method as in 3) to +discard ongoing operations. */ + +int +row_discard_tablespace_for_mysql( +/*=============================*/ + /* out: error code or DB_SUCCESS */ + char* name, /* in: table name */ + trx_t* trx) /* in: transaction handle */ +{ + dulint new_id; + dict_table_t* table; + que_thr_t* thr; + que_t* graph = NULL; + ibool success; + ulint err; + char buf[2 * OS_FILE_MAX_PATH]; + + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + trx->op_info = (char *) "discarding tablespace"; + trx_start_if_not_started(trx); + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks can occur then in these operations */ + + row_mysql_lock_data_dictionary(trx); + + table = dict_table_get_low(name); + + if (!table) { + err = DB_TABLE_NOT_FOUND; + + goto funct_exit; + } + + new_id = dict_hdr_get_new_id(DICT_HDR_TABLE_ID); + + sprintf(buf, + "PROCEDURE DISCARD_TABLESPACE_PROC () IS\n" + "old_id CHAR;\n" + "new_id CHAR;\n" + "new_id_low INT;\n" + "new_id_high INT;\n" + "table_name CHAR;\n" + "BEGIN\n" + "table_name 
:='%s';\n" + "new_id_high := %lu;\n" + "new_id_low := %lu;\n" + "new_id := CONCAT(TO_BINARY(new_id_high, 4), TO_BINARY(new_id_low, 4));\n" + "SELECT ID INTO old_id\n" + "FROM SYS_TABLES\n" + "WHERE NAME = table_name;\n" + "IF (SQL % NOTFOUND) THEN\n" + " COMMIT WORK;\n" + " RETURN;\n" + "END IF;\n" + "UPDATE SYS_TABLES SET ID = new_id\n" + "WHERE ID = old_id;\n" + "UPDATE SYS_COLUMNS SET TABLE_ID = new_id\n" + "WHERE TABLE_ID = old_id;\n" + "UPDATE SYS_INDEXES SET TABLE_ID = new_id\n" + "WHERE TABLE_ID = old_id;\n" + "COMMIT WORK;\n" + "END;\n", name, ut_dulint_get_high(new_id), ut_dulint_get_low(new_id)); + + ut_a(strlen(buf) < 2 * OS_FILE_MAX_PATH); + + graph = pars_sql(buf); + + ut_a(graph); + + graph->trx = trx; + trx->graph = NULL; + + graph->fork_type = QUE_FORK_MYSQL_INTERFACE; + + ut_a(thr = que_fork_start_command(graph, SESS_COMM_EXECUTE, 0)); + + que_run_threads(thr); + + err = trx->error_state; + + if (err != DB_SUCCESS) { + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, NULL); + trx->error_state = DB_SUCCESS; + } else { + dict_table_change_id_in_cache(table, new_id); + + success = fil_discard_tablespace(table->space); + + if (!success) { + trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, NULL); + trx->error_state = DB_SUCCESS; + + err = DB_ERROR; + } else { + /* Set the flag which tells that now it is legal to + IMPORT a tablespace for this table */ + table->tablespace_discarded = TRUE; + table->ibd_file_missing = TRUE; + } + } +funct_exit: + row_mysql_unlock_data_dictionary(trx); + + if (graph) { + que_graph_free(graph); + } + + trx_commit_for_mysql(trx); + + trx->op_info = (char *) ""; + + return((int) err); +} + +/********************************************************************* +Imports a tablespace. The space id in the .ibd file must match the space id +of the table in the data dictionary. 
*/ + +int +row_import_tablespace_for_mysql( +/*============================*/ + /* out: error code or DB_SUCCESS */ + char* name, /* in: table name */ + trx_t* trx) /* in: transaction handle */ +{ + dict_table_t* table; + ibool success; + ulint err = DB_SUCCESS; + + ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); + + trx_start_if_not_started(trx); + + trx->op_info = (char*) "importing tablespace"; + + /* Serialize data dictionary operations with dictionary mutex: + no deadlocks can occur then in these operations */ + + row_mysql_lock_data_dictionary(trx); + + table = dict_table_get_low(name); + + if (!table) { + err = DB_TABLE_NOT_FOUND; + + goto funct_exit; + } + + if (!table->tablespace_discarded) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error: you are trying to IMPORT a tablespace\n" +"InnoDB: %s, though you have not called DISCARD on it yet\n" +"InnoDB: during the lifetime of the mysqld process!\n", name); + + err = DB_ERROR; + + goto funct_exit; + } + + /* Play safe and remove all insert buffer entries, though we should + have removed them already when DISCARD TABLESPACE was called */ + + ibuf_delete_for_discarded_space(table->space); + + success = fil_open_single_table_tablespace(table->space, table->name); + + printf( +"Remember to stop purge + undo if table->ibd_file_is_missing!!!\n"); + + if (success) { + table->ibd_file_missing = FALSE; + table->tablespace_discarded = FALSE; + } else { + err = DB_ERROR; + } + +funct_exit: + row_mysql_unlock_data_dictionary(trx); + + trx_commit_for_mysql(trx); + + trx->op_info = (char *) ""; + + return((int) err); +} + +/************************************************************************* Drops a table for MySQL. If the name of the dropped table ends to characters INNODB_MONITOR, then this also stops printing of monitor output by the master thread. */ @@ -1813,11 +2035,12 @@ output by the master thread. 
*/ int row_drop_table_for_mysql( /*=====================*/ - /* out: error code or DB_SUCCESS */ - char* name, /* in: table name */ - trx_t* trx) /* in: transaction handle */ + /* out: error code or DB_SUCCESS */ + char* name, /* in: table name */ + trx_t* trx) /* in: transaction handle */ { dict_table_t* table; + ulint space_id; que_thr_t* thr; que_t* graph; ulint err; @@ -1826,8 +2049,9 @@ row_drop_table_for_mysql( ulint len; ulint namelen; ulint keywordlen; + ibool success; ibool locked_dictionary = FALSE; - char buf[10000]; + char buf[OS_FILE_MAX_PATH + 2000]; ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); ut_a(name != NULL); @@ -1968,6 +2192,8 @@ row_drop_table_for_mysql( ut_memcpy(buf + len, str2, ut_strlen(str2) + 1); + ut_a(strlen(buf) < OS_FILE_MAX_PATH + 2000); + /* Serialize data dictionary operations with dictionary mutex: no deadlocks can occur then in these operations */ @@ -1999,11 +2225,12 @@ row_drop_table_for_mysql( ut_print_timestamp(stderr); fprintf(stderr, - " InnoDB: Error: table %s does not exist in the InnoDB internal\n" + " InnoDB: Error: table %s\n" + "InnoDB: does not exist in the InnoDB internal\n" "InnoDB: data dictionary though MySQL is trying to drop it.\n" "InnoDB: Have you copied the .frm file of the table to the\n" "InnoDB: MySQL database directory from another database?\n" - "InnoDB: You can look further help from section 15.1 of\n" + "InnoDB: You can look for further help from section 15.1 of\n" "InnoDB: http://www.innodb.com/ibman.html\n", name); goto funct_exit; @@ -2063,13 +2290,32 @@ row_drop_table_for_mysql( ut_a(0); } else { + space_id = table->space; dict_table_remove_from_cache(table); if (dict_load_table(name) != NULL) { ut_print_timestamp(stderr); fprintf(stderr, -" InnoDB: Error: dropping of table %s failed!\n", name); +" InnoDB: Error: not able to remove table %s from the dictionary cache!\n", + name); + err = DB_ERROR; + } + + /* Do not drop possible .ibd tablespace if something went + wrong: we do not want 
to delete valuable data of the user */ + + if (err == DB_SUCCESS && space_id != 0 + && fil_tablespace_exists_in_mem(space_id)) { + success = fil_delete_tablespace(space_id); + + if (!success) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error: not able to delete tablespace %lu of table %s!\n", space_id, + name); + err = DB_ERROR; + } } } funct_exit: @@ -2203,9 +2449,13 @@ row_rename_table_for_mysql( mem_heap_t* heap = NULL; char** constraints_to_drop = NULL; ulint n_constraints_to_drop = 0; + ibool recovering_temp_table = FALSE; + ulint namelen; + ulint keywordlen; ulint len; ulint i; - char buf[10000]; + ibool success; + char buf[2 * OS_FILE_MAX_PATH]; ut_ad(trx->mysql_thread_id == os_thread_get_curr_id()); ut_a(old_name != NULL); @@ -2239,16 +2489,52 @@ row_rename_table_for_mysql( trx->op_info = (char *) "renaming table"; trx_start_if_not_started(trx); + namelen = ut_strlen(new_name); + + keywordlen = ut_strlen("_recover_innodb_tmp_table"); + + if (namelen >= keywordlen + && 0 == ut_memcmp(new_name + namelen - keywordlen, + (char*)"_recover_innodb_tmp_table", keywordlen)) { + + recovering_temp_table = TRUE; + } + /* Serialize data dictionary operations with dictionary mutex: no deadlocks can occur then in these operations */ - row_mysql_lock_data_dictionary(trx); + if (!recovering_temp_table) { + row_mysql_lock_data_dictionary(trx); + } table = dict_table_get_low(old_name); if (!table) { err = DB_TABLE_NOT_FOUND; + ut_print_timestamp(stderr); + + fprintf(stderr, + " InnoDB: Error: table %s\n" + "InnoDB: does not exist in the InnoDB internal\n" + "InnoDB: data dictionary though MySQL is trying to rename the table.\n" + "InnoDB: Have you copied the .frm file of the table to the\n" + "InnoDB: MySQL database directory from another database?\n" + "InnoDB: You can look for further help from section 15.1 of\n" + "InnoDB: http://www.innodb.com/ibman.html\n", + old_name); + goto funct_exit; + } + + if (table->ibd_file_missing) { + err = DB_TABLE_NOT_FOUND; 
+ ut_print_timestamp(stderr); + fprintf(stderr, + " InnoDB: Error: table %s\n" + "InnoDB: does not have an .ibd file in the database directory.\n" + "InnoDB: You can look for further help from section 15.1 of\n" + "InnoDB: http://www.innodb.com/ibman.html\n", + old_name); goto funct_exit; } @@ -2331,6 +2617,8 @@ row_rename_table_for_mysql( ut_memcpy(buf + len, str3, ut_strlen(str3) + 1); + ut_a(strlen(buf) < 2 * OS_FILE_MAX_PATH); + graph = pars_sql(buf); ut_a(graph); @@ -2349,20 +2637,17 @@ row_rename_table_for_mysql( if (err != DB_SUCCESS) { if (err == DB_DUPLICATE_KEY) { ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Error: table %s exists in the InnoDB internal data\n" "InnoDB: dictionary though MySQL is trying rename table %s to it.\n" "InnoDB: Have you deleted the .frm file and not used DROP TABLE?\n" - "InnoDB: You can look further help from section 15.1 of\n" + "InnoDB: You can look for further help from section 15.1 of\n" "InnoDB: http://www.innodb.com/ibman.html\n", new_name, old_name); - fprintf(stderr, "InnoDB: If table %s is a temporary table #sql..., then it can be that\n" "InnoDB: there are still queries running on the table, and it will be\n" "InnoDB: dropped automatically when the queries end.\n", new_name); - fprintf(stderr, "InnoDB: You can drop the orphaned table inside InnoDB by\n" "InnoDB: creating an InnoDB table with the same name in another\n" @@ -2370,13 +2655,27 @@ row_rename_table_for_mysql( "InnoDB: Then MySQL thinks the table exists, and DROP TABLE will\n" "InnoDB: succeed.\n"); } - trx->error_state = DB_SUCCESS; trx_general_rollback_for_mysql(trx, FALSE, NULL); trx->error_state = DB_SUCCESS; } else { - ut_a(dict_table_rename_in_cache(table, new_name, - !row_is_mysql_tmp_table_name(new_name))); + /* The following call will also rename the .ibd data file if + the table is stored in a single-table tablespace */ + + success = dict_table_rename_in_cache(table, new_name, + !row_is_mysql_tmp_table_name(new_name)); + if (!success) { 
+ trx->error_state = DB_SUCCESS; + trx_general_rollback_for_mysql(trx, FALSE, NULL); + trx->error_state = DB_SUCCESS; + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error in table rename, cannot rename %s to %s\n", old_name, + new_name); + err = DB_ERROR; + + goto funct_exit; + } if (row_is_mysql_tmp_table_name(old_name)) { @@ -2390,18 +2689,14 @@ row_rename_table_for_mysql( err = dict_load_foreigns(new_name); if (err != DB_SUCCESS) { - ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Error: in ALTER TABLE table %s\n" "InnoDB: has or is referenced in foreign key constraints\n" "InnoDB: which are not compatible with the new table definition.\n", new_name); - ut_a(dict_table_rename_in_cache(table, old_name, FALSE)); - trx->error_state = DB_SUCCESS; trx_general_rollback_for_mysql(trx, FALSE, NULL); @@ -2410,7 +2705,9 @@ row_rename_table_for_mysql( } } funct_exit: - row_mysql_unlock_data_dictionary(trx); + if (!recovering_temp_table) { + row_mysql_unlock_data_dictionary(trx); + } if (graph) { que_graph_free(graph); @@ -2567,7 +2864,7 @@ row_check_table_for_mysql( ulint n_rows_in_table = ULINT_UNDEFINED; ulint ret = DB_SUCCESS; ulint old_isolation_level; - + prebuilt->trx->op_info = (char *) "checking table"; old_isolation_level = prebuilt->trx->isolation_level; diff --git a/innobase/row/row0sel.c b/innobase/row/row0sel.c index d0f6965f94e..eced7628096 100644 --- a/innobase/row/row0sel.c +++ b/innobase/row/row0sel.c @@ -1905,6 +1905,7 @@ row_sel_convert_mysql_key_to_innobase( ulint key_len) /* in: MySQL key value length */ { byte* original_buf = buf; + byte* original_key_ptr = key_ptr; dict_field_t* field; dfield_t* dfield; ulint data_offset; @@ -2028,7 +2029,16 @@ row_sel_convert_mysql_key_to_innobase( ut_print_timestamp(stderr); fprintf(stderr, - " InnoDB: Warning: using a partial-field key prefix in search\n"); + " InnoDB: Warning: using a partial-field key prefix in search.\n" + "InnoDB: Table name %s, index name %s. 
Last data field length %lu bytes,\n" + "InnoDB: key ptr now exceeds key end by %lu bytes.\n" + "InnoDB: Key value in the MySQL format:\n", index->table_name, index->name, + data_field_len, + (ulint)(key_ptr - key_end)); + fflush(stderr); + ut_print_buf(original_key_ptr, key_len); + fflush(stdout); + fprintf(stderr, "\n"); if (!is_null) { dfield->len -= (ulint)(key_ptr - key_end); @@ -2155,9 +2165,10 @@ static ibool row_sel_store_mysql_rec( /*====================*/ - /* out: TRUE if success, FALSE - if could not allocate memory for a - BLOB */ + /* out: TRUE if success, FALSE if + could not allocate memory for a BLOB + (though we may also assert in that + case) */ byte* mysql_rec, /* out: row in the MySQL format */ row_prebuilt_t* prebuilt, /* in: prebuilt struct */ rec_t* rec) /* in: Innobase record in the index @@ -2169,8 +2180,9 @@ row_sel_store_mysql_rec( byte* data; ulint len; byte* blob_buf; + int pad_char; ulint i; - + ut_ad(prebuilt->mysql_template); if (prebuilt->blob_heap != NULL) { @@ -2178,9 +2190,10 @@ row_sel_store_mysql_rec( prebuilt->blob_heap = NULL; } - /* Mark all columns as SQL NULL */ + /* MySQL assumes that all columns have the SQL NULL bit set unless it + is a nullable column with a non-NULL value */ - memset(mysql_rec, 255, prebuilt->null_bitmap_len); + memset(mysql_rec, 0xFF, prebuilt->null_bitmap_len); for (i = 0; i < prebuilt->n_template; i++) { @@ -2197,6 +2210,10 @@ row_sel_store_mysql_rec( extern_field_heap = mem_heap_create(UNIV_PAGE_SIZE); + /* NOTE: if we are retrieving a big BLOB, we may + already run out of memory in the next call, which + causes an assert */ + data = btr_rec_copy_externally_stored_field(rec, templ->rec_field_no, &len, extern_field_heap); @@ -2209,20 +2226,28 @@ row_sel_store_mysql_rec( ut_a(prebuilt->templ_contains_blob); - /* A heuristic test that we can allocate - the memory for a big BLOB. We have a safety - margin of 1000000 bytes. Since the test - takes some CPU time, we do not use for small - BLOBs. 
*/ + /* A heuristic test that we can allocate the + memory for a big BLOB. We have a safety margin + of 1000000 bytes. Since the test takes some + CPU time, we do not use it for small BLOBs. */ if (len > 2000000 && !ut_test_malloc(len + 1000000)) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Warning: could not allocate %lu + 1000000 bytes to retrieve\n" +"InnoDB: a big column. Table name %s\n", len, prebuilt->table->name); + + if (extern_field_heap) { + mem_heap_free( + extern_field_heap); + } return(FALSE); } - /* Copy the BLOB data to the BLOB - heap of prebuilt */ + /* Copy the BLOB data to the BLOB heap of + prebuilt */ if (prebuilt->blob_heap == NULL) { prebuilt->blob_heap = @@ -2235,35 +2260,49 @@ row_sel_store_mysql_rec( data = blob_buf; } - + row_sel_field_store_in_mysql_format( mysql_rec + templ->mysql_col_offset, templ->mysql_col_len, data, len, templ->type, templ->is_unsigned); - + + /* Cleanup */ if (extern_field_heap) { - mem_heap_free(extern_field_heap); + mem_heap_free(extern_field_heap); extern_field_heap = NULL; - } - + } + if (templ->mysql_null_bit_mask) { + /* It is a nullable column with a non-NULL + value */ mysql_rec[templ->mysql_null_byte_offset] &= ~(byte) (templ->mysql_null_bit_mask); } } else { /* MySQL seems to assume the field for an SQL NULL - value is set to zero. Not taking this into account - caused seg faults with NULL BLOB fields, and + value is set to zero or space. Not taking this into + account caused seg faults with NULL BLOB fields, and bug number 154 in the MySQL bug database: GROUP BY and DISTINCT could treat NULL values inequal. */ - memset(mysql_rec + templ->mysql_col_offset, - ((templ->type == DATA_VARCHAR || - templ->type == DATA_VARMYSQL || - templ->type == DATA_BINARY) ? 
' ' : '\0'), - templ->mysql_col_len); + if (templ->type == DATA_VARCHAR + || templ->type == DATA_CHAR + || templ->type == DATA_BINARY + || templ->type == DATA_FIXBINARY + || templ->type == DATA_MYSQL + || templ->type == DATA_VARMYSQL) { + /* MySQL pads all non-BLOB and non-TEXT + string types with space ' ' */ + + pad_char = ' '; + } else { + pad_char = '\0'; + } + + memset(mysql_rec + templ->mysql_col_offset, pad_char, + templ->mysql_col_len); } - } + } return(TRUE); } @@ -2590,9 +2629,9 @@ row_sel_push_cache_row_for_mysql( ut_ad(prebuilt->fetch_cache_first == 0); - row_sel_store_mysql_rec( + ut_a(row_sel_store_mysql_rec( prebuilt->fetch_cache[prebuilt->n_fetch_cached], - prebuilt, rec); + prebuilt, rec)); prebuilt->n_fetch_cached++; } @@ -2827,23 +2866,6 @@ row_search_for_mysql( mode = pcur->search_mode; } - if ((direction == ROW_SEL_NEXT || direction == ROW_SEL_PREV) - && pcur->old_stored != BTR_PCUR_OLD_STORED) { - - /* MySQL sometimes seems to do fetch next or fetch prev even - if the search condition is unique; this can, for example, - happen with the HANDLER commands; we do not always store the - pcur position in this case, so we cannot restore cursor - position, and must return immediately */ - - /* printf("%s record not found 1\n", index->name); */ - - trx->op_info = (char *) ""; - return(DB_RECORD_NOT_FOUND); - } - - mtr_start(&mtr); - /* In a search where at most one record in the index may match, we can use a LOCK_REC_NOT_GAP type record lock when locking a non-delete- marked matching record. @@ -2858,8 +2880,21 @@ row_search_for_mysql( && dtuple_get_n_fields(search_tuple) == dict_index_get_n_unique(index)) { unique_search = TRUE; + + /* Even if the condition is unique, MySQL seems to try to + retrieve also a second row if a primary key contains more than + 1 column. Return immediately if this is not a HANDLER + command. 
*/ + + if (direction != 0 && !prebuilt->used_in_HANDLER) { + + trx->op_info = (char *) ""; + return(DB_RECORD_NOT_FOUND); + } } + mtr_start(&mtr); + /*-------------------------------------------------------------*/ /* PHASE 2: Try fast adaptive hash index search if possible */ @@ -2912,7 +2947,9 @@ row_search_for_mysql( rec)) { err = DB_TOO_BIG_RECORD; - goto lock_wait_or_error; + /* We let the main loop to do the + error handling */ + goto shortcut_fails_too_big_rec; } mtr_commit(&mtr); @@ -2960,7 +2997,7 @@ row_search_for_mysql( return(DB_RECORD_NOT_FOUND); } - +shortcut_fails_too_big_rec: mtr_commit(&mtr); mtr_start(&mtr); } diff --git a/innobase/row/row0upd.c b/innobase/row/row0upd.c index db68479509d..606f7404d50 100644 --- a/innobase/row/row0upd.c +++ b/innobase/row/row0upd.c @@ -1532,7 +1532,8 @@ row_upd_clust_step( then we have to free the file segments of the index tree associated with the index */ - if (ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) { + if (node->is_delete + && ut_dulint_cmp(node->table->id, DICT_INDEXES_ID) == 0) { dict_drop_index_tree(btr_pcur_get_rec(pcur), mtr); diff --git a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c index a886cbee22a..e16073c171c 100644 --- a/innobase/srv/srv0srv.c +++ b/innobase/srv/srv0srv.c @@ -75,6 +75,10 @@ names, where the file name itself may also contain a path */ char* srv_data_home = NULL; char* srv_arch_dir = NULL; +ibool srv_file_per_table = FALSE; /* store to its own file each table + created by an user; data dictionary + tables are in the system tablespace + 0 */ ulint srv_n_data_files = 0; char** srv_data_file_names = NULL; ulint* srv_data_file_sizes = NULL; /* size in database pages */ @@ -162,6 +166,8 @@ char* srv_file_flush_method_str = NULL; ulint srv_unix_file_flush_method = SRV_UNIX_FDATASYNC; ulint srv_win_file_flush_method = SRV_WIN_IO_UNBUFFERED; +ulint srv_max_n_open_files = 300; + /* The InnoDB main thread tries to keep the ratio of modified pages in the buffer pool to all 
database pages in the buffer pool smaller than the following number. But it is not guaranteed that the value stays below @@ -1444,7 +1450,7 @@ srv_read_initfile( srv_log_group_home_dirs[i] = ut_malloc(ut_strlen(str_buf) + 1); ut_memcpy(srv_log_group_home_dirs[i], str_buf, - ut_strlen(str_buf) + 1); + ut_strlen(str_buf) + 1); } err = srv_read_init_val(initfile, "INNOBASE_LOG_ARCH_DIR", @@ -2325,7 +2331,7 @@ srv_sprintf_innodb_monitor( char* buf_end = buf + len - 2000; double time_elapsed; time_t current_time; - ulint n_reserved; + ulint n_reserved; mutex_enter(&srv_innodb_monitor_mutex); @@ -2436,7 +2442,8 @@ srv_sprintf_innodb_monitor( if (mem_out_of_mem_err_msg_count > 0) { buf += sprintf(buf, - "Mem allocation has spilled out of additional mem pool %lu times\n"); + "Mem allocation has spilled out of additional mem pool %lu times\n", + mem_out_of_mem_err_msg_count); } if (srv_use_awe) { @@ -2453,15 +2460,15 @@ srv_sprintf_innodb_monitor( "ROW OPERATIONS\n" "--------------\n"); buf += sprintf(buf, - "%ld queries inside InnoDB, %lu queries in queue\n", - srv_conc_n_threads, srv_conc_n_waiting_threads); + "%ld queries inside InnoDB, %lu queries in queue\n", + srv_conc_n_threads, srv_conc_n_waiting_threads); - n_reserved = fil_space_get_n_reserved_extents(0); - if (n_reserved > 0) { - buf += sprintf(buf, - "%lu tablespace extents now reserved for B-tree split operations\n", - n_reserved); - } + n_reserved = fil_space_get_n_reserved_extents(0); + if (n_reserved > 0) { + buf += sprintf(buf, + "%lu tablespace extents now reserved for B-tree split operations\n", + n_reserved); + } #ifdef UNIV_LINUX buf += sprintf(buf, @@ -2701,8 +2708,13 @@ srv_error_monitor_thread( os_thread_create */ { ulint cnt = 0; + dulint old_lsn; + dulint new_lsn; UT_NOT_USED(arg); + + old_lsn = srv_start_lsn; + #ifdef UNIV_DEBUG_THREAD_CREATION printf("Error monitor thread starts, id %lu\n", os_thread_pf(os_thread_get_curr_id())); @@ -2714,6 +2726,25 @@ loop: os_thread_sleep(2000000); + /* Try to 
track a strange bug reported by Harald Fuchs and others, + where the lsn seems to decrease at times */ + + new_lsn = log_get_lsn(); + + if (ut_dulint_cmp(new_lsn, old_lsn) < 0) { + ut_print_timestamp(stderr); + fprintf(stderr, +" InnoDB: Error: old log sequence number %lu %lu was greater\n" +"InnoDB: than the new log sequence number %lu %lu!\n" +"InnoDB: Please send a bug report to mysql@lists.mysql.com\n", + ut_dulint_get_high(old_lsn), + ut_dulint_get_low(old_lsn), + ut_dulint_get_high(new_lsn), + ut_dulint_get_low(new_lsn)); + } + + old_lsn = new_lsn; + if (difftime(time(NULL), srv_last_monitor_time) > 60) { /* We referesh InnoDB Monitor values so that averages are printed from at most 60 last seconds */ @@ -2903,6 +2934,9 @@ loop: srv_main_thread_op_info = (char*)"flushing log"; log_buffer_flush_to_disk(); + srv_main_thread_op_info = (char*)"making checkpoint"; + log_free_check(); + /* If there were less than 5 i/os during the one second sleep, we assume that there is free disk i/o capacity available, and it makes sense to diff --git a/innobase/srv/srv0start.c b/innobase/srv/srv0start.c index f0ff1167f4d..5de87038bde 100644 --- a/innobase/srv/srv0start.c +++ b/innobase/srv/srv0start.c @@ -30,6 +30,7 @@ Created 2/16/1996 Heikki Tuuri #include "page0cur.h" #include "trx0trx.h" #include "dict0boot.h" +#include "dict0load.h" #include "trx0sys.h" #include "dict0crea.h" #include "btr0btr.h" @@ -56,6 +57,14 @@ Created 2/16/1996 Heikki Tuuri #include "srv0start.h" #include "que0que.h" + +/* Log sequence number immediately after startup */ +dulint srv_start_lsn; +/* Log sequence number at shutdown */ +dulint srv_shutdown_lsn; + +ibool srv_start_raw_disk_in_use = FALSE; + ibool srv_start_has_been_called = FALSE; ulint srv_sizeof_trx_t_in_ha_innodb_cc; @@ -87,13 +96,6 @@ ibool srv_os_test_mutex_is_locked = FALSE; #define SRV_N_PENDING_IOS_PER_THREAD OS_AIO_N_PENDING_IOS_PER_THREAD #define SRV_MAX_N_PENDING_SYNC_IOS 100 -/* The following limit may be too big in some old 
operating systems: -we may get an assertion failure in os0file.c */ - -#define SRV_MAX_N_OPEN_FILES 500 - -#define SRV_LOG_SPACE_FIRST_ID 1000000000 - /************************************************************************* Reads the data files and their sizes from a character string given in the .cnf file. */ @@ -137,7 +139,8 @@ srv_parse_data_file_paths_and_sizes( while ((*str != ':' && *str != '\0') || (*str == ':' - && (*(str + 1) == '\\' || *(str + 1) == '/'))) { + && (*(str + 1) == '\\' || *(str + 1) == '/' + || *(str + 1) == ':'))) { str++; } @@ -234,11 +237,15 @@ srv_parse_data_file_paths_and_sizes( while (*str != '\0') { path = str; - /* Note that we must ignore the ':' in a Windows path */ + /* Note that we must step over the ':' in a Windows path; + a Windows path normally looks like C:\ibdata\ibdata1:1G, but + a Windows raw partition may have a specification like + \\.\C::1Gnewraw or \\.\PHYSICALDRIVE2:1Gnewraw */ while ((*str != ':' && *str != '\0') || (*str == ':' - && (*(str + 1) == '\\' || *(str + 1) == '/'))) { + && (*(str + 1) == '\\' || *(str + 1) == '/' + || *(str + 1) == ':'))) { str++; } @@ -452,7 +459,8 @@ Normalizes a directory path for Windows: converts slashes to backslashes. 
*/ void srv_normalize_path_for_win( /*=======================*/ - char* str __attribute__((unused))) /* in/out: null-terminated character string */ + char* str __attribute__((unused))) /* in/out: null-terminated + character string */ { #ifdef __WIN__ ulint i; @@ -510,7 +518,7 @@ srv_calc_low32( expressed in bytes */ ulint file_size) /* in: file size in database pages */ { - return(0xFFFFFFFF & (file_size << UNIV_PAGE_SIZE_SHIFT)); + return(0xFFFFFFFFUL & (file_size << UNIV_PAGE_SIZE_SHIFT)); } /************************************************************************* @@ -563,7 +571,7 @@ open_or_create_log_file( files[i] = os_file_create(name, OS_FILE_CREATE, OS_FILE_NORMAL, OS_LOG_FILE, &ret); if (ret == FALSE) { - if (os_file_get_last_error() != OS_FILE_ALREADY_EXISTS) { + if (os_file_get_last_error(FALSE) != OS_FILE_ALREADY_EXISTS) { fprintf(stderr, "InnoDB: Error in creating or opening %s\n", name); @@ -640,7 +648,7 @@ open_or_create_log_file( ut_a(fil_validate()); fil_node_create(name, srv_log_file_size, - 2 * k + SRV_LOG_SPACE_FIRST_ID); + 2 * k + SRV_LOG_SPACE_FIRST_ID, FALSE); /* If this is the first log group, create the file space object for archived logs */ @@ -648,7 +656,8 @@ open_or_create_log_file( if (k == 0 && i == 0) { arch_space_id = 2 * k + 1 + SRV_LOG_SPACE_FIRST_ID; - fil_space_create((char*) "arch_log_space", arch_space_id, FIL_LOG); + fil_space_create((char*) "arch_log_space", arch_space_id, + FIL_LOG); } else { arch_space_id = ULINT_UNDEFINED; } @@ -708,18 +717,32 @@ open_or_create_data_files( sprintf(name, "%s%s", srv_data_home, srv_data_file_names[i]); - files[i] = os_file_create(name, OS_FILE_CREATE, + if (srv_data_file_is_raw_partition[i] == 0) { + + /* First we try to create the file: if it already + exists, ret will get value FALSE */ + + files[i] = os_file_create(name, OS_FILE_CREATE, OS_FILE_NORMAL, OS_DATA_FILE, &ret); - if (srv_data_file_is_raw_partition[i] == SRV_NEW_RAW) { + if (ret == FALSE && os_file_get_last_error(FALSE) != + 
OS_FILE_ALREADY_EXISTS) { + fprintf(stderr, + "InnoDB: Error in creating or opening %s\n", + name); + + return(DB_ERROR); + } + } else if (srv_data_file_is_raw_partition[i] == SRV_NEW_RAW) { /* The partition is opened, not created; then it is written over */ + srv_start_raw_disk_in_use = TRUE; srv_created_new_raw = TRUE; files[i] = os_file_create( - name, OS_FILE_OPEN, OS_FILE_NORMAL, - OS_DATA_FILE, &ret); + name, OS_FILE_OPEN_RAW, OS_FILE_NORMAL, + OS_DATA_FILE, &ret); if (!ret) { fprintf(stderr, "InnoDB: Error in opening %s\n", name); @@ -727,19 +750,15 @@ open_or_create_data_files( return(DB_ERROR); } } else if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) { + srv_start_raw_disk_in_use = TRUE; + ret = FALSE; + } else { + ut_a(0); } if (ret == FALSE) { - if (srv_data_file_is_raw_partition[i] != SRV_OLD_RAW - && os_file_get_last_error() != - OS_FILE_ALREADY_EXISTS) { - fprintf(stderr, - "InnoDB: Error in creating or opening %s\n", - name); - - return(DB_ERROR); - } + /* We open the data file */ if (one_created) { fprintf(stderr, @@ -750,71 +769,80 @@ open_or_create_data_files( return(DB_ERROR); } - files[i] = os_file_create( - name, OS_FILE_OPEN, OS_FILE_NORMAL, - OS_DATA_FILE, &ret); + if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) { + files[i] = os_file_create( + name, OS_FILE_OPEN_RAW, OS_FILE_NORMAL, + OS_DATA_FILE, &ret); + } else { + files[i] = os_file_create( + name, OS_FILE_OPEN, OS_FILE_NORMAL, + OS_DATA_FILE, &ret); + } + if (!ret) { fprintf(stderr, "InnoDB: Error in opening %s\n", name); - os_file_get_last_error(); + os_file_get_last_error(TRUE); return(DB_ERROR); } - if (srv_data_file_is_raw_partition[i] != SRV_OLD_RAW) { - - ret = os_file_get_size(files[i], &size, - &size_high); - ut_a(ret); - /* Round size downward to megabytes */ + if (srv_data_file_is_raw_partition[i] == SRV_OLD_RAW) { + + goto skip_size_check; + } + + ret = os_file_get_size(files[i], &size, &size_high); + ut_a(ret); + /* Round size downward to megabytes */ - 
rounded_size_pages = (size / (1024 * 1024) + rounded_size_pages = (size / (1024 * 1024) + 4096 * size_high) << (20 - UNIV_PAGE_SIZE_SHIFT); - if (i == srv_n_data_files - 1 + if (i == srv_n_data_files - 1 && srv_auto_extend_last_data_file) { - if (srv_data_file_sizes[i] > + if (srv_data_file_sizes[i] > rounded_size_pages || (srv_last_file_size_max > 0 && srv_last_file_size_max < rounded_size_pages)) { - fprintf(stderr, + fprintf(stderr, "InnoDB: Error: auto-extending data file %s is of a different size\n" "InnoDB: %lu pages (rounded down to MB) than specified in the .cnf file:\n" "InnoDB: initial %lu pages, max %lu (relevant if non-zero) pages!\n", name, rounded_size_pages, srv_data_file_sizes[i], srv_last_file_size_max); - return(DB_ERROR); - } - - srv_data_file_sizes[i] = - rounded_size_pages; + return(DB_ERROR); } + + srv_data_file_sizes[i] = rounded_size_pages; + } - if (rounded_size_pages - != srv_data_file_sizes[i]) { + if (rounded_size_pages != srv_data_file_sizes[i]) { - fprintf(stderr, + fprintf(stderr, "InnoDB: Error: data file %s is of a different size\n" "InnoDB: %lu pages (rounded down to MB)\n" "InnoDB: than specified in the .cnf file %lu pages!\n", name, rounded_size_pages, srv_data_file_sizes[i]); - return(DB_ERROR); - } + return(DB_ERROR); } - +skip_size_check: fil_read_flushed_lsn_and_arch_log_no(files[i], one_opened, min_flushed_lsn, min_arch_log_no, max_flushed_lsn, max_arch_log_no); one_opened = TRUE; } else { + /* We created the data file and now write it full of + zeros */ + one_created = TRUE; if (i > 0) { @@ -862,7 +890,13 @@ open_or_create_data_files( ut_a(fil_validate()); - fil_node_create(name, srv_data_file_sizes[i], 0); + if (srv_data_file_is_raw_partition[i]) { + + fil_node_create(name, srv_data_file_sizes[i], 0, TRUE); + } else { + fil_node_create(name, srv_data_file_sizes[i], 0, + FALSE); + } } ios = 0; @@ -972,9 +1006,11 @@ innobase_start_or_create_for_mysql(void) ulint tablespace_size_in_header; ulint err; ulint i; - ulint k; + 
ibool srv_file_per_table_original_value = srv_file_per_table; mtr_t mtr; + srv_file_per_table = FALSE; /* system tables are created in tablespace + 0 */ #ifdef UNIV_DEBUG fprintf(stderr, "InnoDB: !!!!!!!!!!!!!! UNIV_DEBUG switched on !!!!!!!!!!!!!!!\n"); @@ -999,7 +1035,6 @@ innobase_start_or_create_for_mysql(void) fprintf(stderr, "InnoDB: !!!!!!!!!!!!!! UNIV_SIMULATE_AWE switched on !!!!!!!!!!!!!!!!!\n"); #endif - if (srv_sizeof_trx_t_in_ha_innodb_cc != (ulint)sizeof(trx_t)) { fprintf(stderr, "InnoDB: Error: trx_t size is %lu in ha_innodb.cc but %lu in srv0start.c\n" @@ -1122,7 +1157,6 @@ innobase_start_or_create_for_mysql(void) if (!os_aio_use_native_aio) { /* In simulated aio we currently have use only for 4 threads */ - srv_n_file_io_threads = 4; os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD @@ -1136,7 +1170,7 @@ innobase_start_or_create_for_mysql(void) SRV_MAX_N_PENDING_SYNC_IOS); } - fil_init(SRV_MAX_N_OPEN_FILES); + fil_init(srv_max_n_open_files); if (srv_use_awe) { fprintf(stderr, @@ -1168,7 +1202,6 @@ innobase_start_or_create_for_mysql(void) for (i = 0; i < srv_n_file_io_threads; i++) { n[i] = i; - os_thread_create(io_handler_thread, n + i, thread_ids + i); } @@ -1181,7 +1214,6 @@ innobase_start_or_create_for_mysql(void) } if (srv_n_log_files * srv_log_file_size >= 262144) { - fprintf(stderr, "InnoDB: Error: combined size of log files must be < 4 GB\n"); @@ -1227,42 +1259,25 @@ innobase_start_or_create_for_mysql(void) return((int) err); } - if (!create_new_db) { - /* If we are using the doublewrite method, we will - check if there are half-written pages in data files, - and restore them from the doublewrite buffer if - possible */ - - if (srv_force_recovery < SRV_FORCE_NO_LOG_REDO) { - - trx_sys_doublewrite_restore_corrupt_pages(); - } - } - srv_normalize_path_for_win(srv_arch_dir); srv_arch_dir = srv_add_path_separator_if_needed(srv_arch_dir); + + for (i = 0; i < srv_n_log_files; i++) { + err = open_or_create_log_file(create_new_db, &log_file_created, + 
log_opened, 0, i); + if (err != DB_SUCCESS) { - for (k = 0; k < srv_n_log_groups; k++) { - - for (i = 0; i < srv_n_log_files; i++) { - - err = open_or_create_log_file(create_new_db, - &log_file_created, - log_opened, k, i); - if (err != DB_SUCCESS) { - - return((int) err); - } - - if (log_file_created) { - log_created = TRUE; - } else { - log_opened = TRUE; - } + return((int) err); + } - if ((log_opened && create_new_db) + if (log_file_created) { + log_created = TRUE; + } else { + log_opened = TRUE; + } + if ((log_opened && create_new_db) || (log_opened && log_created)) { - fprintf(stderr, + fprintf(stderr, "InnoDB: Error: all log files must be created at the same time.\n" "InnoDB: All log files must be created also in database creation.\n" "InnoDB: If you want bigger or smaller log files, shut down the\n" @@ -1270,14 +1285,16 @@ innobase_start_or_create_for_mysql(void) "InnoDB: Then delete the existing log files. Edit the .cnf file\n" "InnoDB: and start the database again.\n"); - return(DB_ERROR); - } - + return(DB_ERROR); } } - if (log_created && !create_new_db && !srv_archive_recovery) { + /* Open all log files and data files in the system tablespace: we + keep them open until database shutdown */ + fil_open_log_and_system_tablespace_files(); + + if (log_created && !create_new_db && !srv_archive_recovery) { if (ut_dulint_cmp(max_flushed_lsn, min_flushed_lsn) != 0 || max_arch_log_no != min_arch_log_no) { fprintf(stderr, @@ -1323,7 +1340,6 @@ innobase_start_or_create_for_mysql(void) } else if (srv_archive_recovery) { fprintf(stderr, "InnoDB: Starting archive recovery from a backup...\n"); - err = recv_recovery_from_archive_start( min_flushed_lsn, srv_archive_recovery_limit_lsn, @@ -1332,14 +1348,11 @@ innobase_start_or_create_for_mysql(void) return(DB_ERROR); } - /* Since ibuf init is in dict_boot, and ibuf is needed in any disk i/o, first call dict_boot */ dict_boot(); - trx_sys_init_at_db_start(); - srv_startup_is_before_trx_rollback_phase = FALSE; /* Initialize 
the fsp free limit global variable in the log @@ -1349,7 +1362,7 @@ innobase_start_or_create_for_mysql(void) recv_recovery_from_archive_finish(); } else { /* We always try to do a recovery, even if the database had - been shut down normally */ + been shut down normally: this is the normal startup path */ err = recv_recovery_from_checkpoint_start(LOG_CHECKPOINT, ut_dulint_max, @@ -1413,6 +1426,14 @@ innobase_start_or_create_for_mysql(void) } } + if (!create_new_db && srv_force_recovery == 0) { + /* After a crash recovery we only check that the info in data + dictionary is consistent with what we already know about space + id's from the call of fil_load_single_table_tablespaces(). */ + + dict_check_tablespaces_or_store_max_id(recv_needed_recovery); + } + if (srv_measure_contention) { /* os_thread_create(&test_measure_cont, NULL, thread_ids + SRV_MAX_N_IO_THREADS); */ @@ -1425,17 +1446,27 @@ innobase_start_or_create_for_mysql(void) and prints InnoDB monitor info */ os_thread_create(&srv_lock_timeout_and_monitor_thread, NULL, - thread_ids + 2 + SRV_MAX_N_IO_THREADS); + thread_ids + 2 + SRV_MAX_N_IO_THREADS); /* Create the thread which warns of long semaphore waits */ os_thread_create(&srv_error_monitor_thread, NULL, - thread_ids + 3 + SRV_MAX_N_IO_THREADS); + thread_ids + 3 + SRV_MAX_N_IO_THREADS); srv_was_started = TRUE; srv_is_being_started = FALSE; +#ifdef UNIV_DEBUG + /* Wait a while so that creates threads have time to suspend themselves + before we switch sync debugging on; otherwise a thread may execute + mutex_enter() before the checks are on, and mutex_exit() after the + checks are on. 
*/ + + os_thread_sleep(2000000); +#endif sync_order_checks_on = TRUE; - if (srv_use_doublewrite_buf && trx_doublewrite == NULL) { + if (srv_use_doublewrite_buf && trx_doublewrite == NULL) { + /* Create the doublewrite buffer to a new tablespace */ + trx_sys_create_doublewrite_buf(); } @@ -1445,8 +1476,8 @@ innobase_start_or_create_for_mysql(void) return((int)DB_ERROR); } - /* Create the master thread which monitors the database - server, and does purge and other utility operations */ + /* Create the master thread which does purge and other utility + operations */ os_thread_create(&srv_master_thread, NULL, thread_ids + 1 + SRV_MAX_N_IO_THREADS); @@ -1478,7 +1509,7 @@ innobase_start_or_create_for_mysql(void) tablespace_size_in_header, sum_of_data_file_sizes); } - /* Check that os_fast_mutexes work as exptected */ + /* Check that os_fast_mutexes work as expected */ os_fast_mutex_init(&srv_os_test_mutex); if (0 != os_fast_mutex_trylock(&srv_os_test_mutex)) { @@ -1498,7 +1529,10 @@ innobase_start_or_create_for_mysql(void) if (srv_print_verbose_log) { ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Started\n"); + fprintf(stderr, +" InnoDB: Started; log sequence number %lu %lu\n", + ut_dulint_get_high(srv_start_lsn), + ut_dulint_get_low(srv_start_lsn)); } if (srv_force_recovery > 0) { @@ -1509,6 +1543,47 @@ innobase_start_or_create_for_mysql(void) fflush(stderr); + if (trx_doublewrite_must_reset_space_ids) { + fprintf(stderr, +"InnoDB: You are upgrading to an InnoDB version which allows multiple\n" +"InnoDB: tablespaces. 
Wait that purge and insert buffer merge run to\n" +"InnoDB: completion...\n"); + for (;;) { + os_thread_sleep(1000000); + + if (0 == strcmp(srv_main_thread_op_info, + "waiting for server activity")) { + + ut_a(ibuf_is_empty()); + + break; + } + } + fprintf(stderr, +"InnoDB: Full purge and insert buffer merge completed.\n"); + + trx_sys_mark_upgraded_to_multiple_tablespaces(); + + fprintf(stderr, +"InnoDB: You have now successfully upgraded to the multiple tablespaces\n" +"InnoDB: format. You should not downgrade again to an earlier version of\n" +"InnoDB: InnoDB!\n"); + } + + if (srv_force_recovery == 0) { + /* In the insert buffer we may have even bigger tablespace + id's, because we may have dropped those tablespaces, but + insert buffer merge has not had time to clean the records from + the ibuf tree. */ + + ibuf_update_max_tablespace_id(); + } + + srv_file_per_table = srv_file_per_table_original_value; + + fprintf(stderr, +"TODO: make sure MySQL sets field->query_id right in prepare/execute\n"); + return((int) DB_SUCCESS); } @@ -1526,17 +1601,16 @@ innobase_shutdown_for_mysql(void) if (srv_is_being_started) { ut_print_timestamp(stderr); fprintf(stderr, - " InnoDB: Warning: shutting down a not properly started\n"); - fprintf(stderr, - " InnoDB: or created database!\n"); +" InnoDB: Warning: shutting down a not properly started\n" +" InnoDB: or created database!\n"); } return(DB_SUCCESS); } - /* 1. Flush buffer pool to disk, write the current lsn to + /* 1. Flush the buffer pool to disk, write the current lsn to the tablespace header(s), and copy all log data to archive. - The step 1 is the real InnoDB shutdown. The remaining steps + The step 1 is the real InnoDB shutdown. The remaining steps 2 - ... just free data structures after the shutdown. */ logs_empty_and_mark_files_at_shutdown(); @@ -1560,16 +1634,16 @@ innobase_shutdown_for_mysql(void) /* NOTE: IF YOU CREATE THREADS IN INNODB, YOU MUST EXIT THEM HERE OR EARLIER */ - /* 1. 
Let the lock timeout thread exit */ + /* a. Let the lock timeout thread exit */ os_event_set(srv_lock_timeout_thread_event); - /* 2. srv error monitor thread exits automatically, no need + /* b. srv error monitor thread exits automatically, no need to do anything here */ - /* 3. We wake the master thread so that it exits */ + /* c. We wake the master thread so that it exits */ srv_wake_master_thread(); - /* 4. Exit the i/o threads */ + /* d. Exit the i/o threads */ os_aio_wake_all_threads_at_shutdown(); @@ -1628,7 +1702,10 @@ innobase_shutdown_for_mysql(void) if (srv_print_verbose_log) { ut_print_timestamp(stderr); - fprintf(stderr, " InnoDB: Shutdown completed\n"); + fprintf(stderr, +" InnoDB: Shutdown completed; log sequence number %lu %lu\n", + ut_dulint_get_high(srv_shutdown_lsn), + ut_dulint_get_low(srv_shutdown_lsn)); } return((int) DB_SUCCESS); diff --git a/innobase/sync/sync0rw.c b/innobase/sync/sync0rw.c index b214bca0470..5c5abebd5e1 100644 --- a/innobase/sync/sync0rw.c +++ b/innobase/sync/sync0rw.c @@ -121,6 +121,11 @@ rw_lock_create_func( lock->last_x_line = 0; mutex_enter(&rw_lock_list_mutex); + + if (UT_LIST_GET_LEN(rw_lock_list) > 0) { + ut_a(UT_LIST_GET_FIRST(rw_lock_list)->magic_n + == RW_LOCK_MAGIC_N); + } UT_LIST_ADD_FIRST(list, rw_lock_list, lock); @@ -137,7 +142,7 @@ rw_lock_free( /*=========*/ rw_lock_t* lock) /* in: rw-lock */ { - ut_ad(rw_lock_validate(lock)); + ut_a(rw_lock_validate(lock)); ut_a(rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED); ut_a(rw_lock_get_waiters(lock) == 0); ut_a(rw_lock_get_reader_count(lock) == 0); @@ -148,6 +153,13 @@ rw_lock_free( mutex_enter(&rw_lock_list_mutex); + if (UT_LIST_GET_PREV(list, lock)) { + ut_a(UT_LIST_GET_PREV(list, lock)->magic_n == RW_LOCK_MAGIC_N); + } + if (UT_LIST_GET_NEXT(list, lock)) { + ut_a(UT_LIST_GET_NEXT(list, lock)->magic_n == RW_LOCK_MAGIC_N); + } + UT_LIST_REMOVE(list, rw_lock_list, lock); mutex_exit(&rw_lock_list_mutex); diff --git a/innobase/sync/sync0sync.c 
b/innobase/sync/sync0sync.c index 773b239189c..680582b05b2 100644 --- a/innobase/sync/sync0sync.c +++ b/innobase/sync/sync0sync.c @@ -159,7 +159,7 @@ struct sync_thread_struct{ }; /* Number of slots reserved for each OS thread in the sync level array */ -#define SYNC_THREAD_N_LEVELS 250 +#define SYNC_THREAD_N_LEVELS 7000 struct sync_level_struct{ void* latch; /* pointer to a mutex or an rw-lock; NULL means that @@ -246,6 +246,10 @@ mutex_create_func( mutex_enter(&mutex_list_mutex); + if (UT_LIST_GET_LEN(mutex_list) > 0) { + ut_a(UT_LIST_GET_FIRST(mutex_list)->magic_n == MUTEX_MAGIC_N); + } + UT_LIST_ADD_FIRST(list, mutex_list, mutex); mutex_exit(&mutex_list_mutex); @@ -261,7 +265,7 @@ mutex_free( /*=======*/ mutex_t* mutex) /* in: mutex */ { - ut_ad(mutex_validate(mutex)); + ut_a(mutex_validate(mutex)); ut_a(mutex_get_lock_word(mutex) == 0); ut_a(mutex_get_waiters(mutex) == 0); @@ -269,6 +273,15 @@ mutex_free( mutex_enter(&mutex_list_mutex); + if (UT_LIST_GET_PREV(list, mutex)) { + ut_a(UT_LIST_GET_PREV(list, mutex)->magic_n + == MUTEX_MAGIC_N); + } + if (UT_LIST_GET_NEXT(list, mutex)) { + ut_a(UT_LIST_GET_NEXT(list, mutex)->magic_n + == MUTEX_MAGIC_N); + } + UT_LIST_REMOVE(list, mutex_list, mutex); mutex_exit(&mutex_list_mutex); @@ -991,7 +1004,7 @@ sync_thread_add_level( } array = thread_slot->levels; - + /* NOTE that there is a problem with _NODE and _LEAF levels: if the B-tree height changes, then a leaf can change to an internal node or the other way around. 
We do not know at present if this can cause diff --git a/innobase/trx/trx0sys.c b/innobase/trx/trx0sys.c index 0c0dbab708c..89412003485 100644 --- a/innobase/trx/trx0sys.c +++ b/innobase/trx/trx0sys.c @@ -26,6 +26,17 @@ Created 3/26/1996 Heikki Tuuri trx_sys_t* trx_sys = NULL; trx_doublewrite_t* trx_doublewrite = NULL; +/* The following is set to TRUE when we are upgrading from the old format data +files to the new >= 4.1.x format multiple tablespaces format data files */ + +ibool trx_doublewrite_must_reset_space_ids = FALSE; + +/* The following is TRUE when we are using the database in the new format, +i.e., we have successfully upgraded, or have created a new database +installation */ + +ibool trx_sys_multiple_tablespace_format = FALSE; + /* In a MySQL replication slave, in crash recovery we store the master log file name and position here. We have successfully got the updates to InnoDB up to this position. If .._pos is -1, it means no crash recovery was needed, @@ -75,11 +86,11 @@ trx_doublewrite_init( { trx_doublewrite = mem_alloc(sizeof(trx_doublewrite_t)); - /* When we have the doublewrite buffer in use, we do not need to - call os_file_flush (Unix fsync) after every write. */ - + /* Since we now start to use the doublewrite buffer, no need to call + fsync() after every write to a data file */ + os_do_not_call_flush_at_each_write = TRUE; - + mutex_create(&(trx_doublewrite->mutex)); mutex_set_level(&(trx_doublewrite->mutex), SYNC_DOUBLEWRITE); @@ -105,7 +116,41 @@ trx_doublewrite_init( } /******************************************************************** -Creates the doublewrite buffer at a database start. The header of the +Marks the trx sys header when we have successfully upgraded to the >= 4.1.x +multiple tablespace format. 
*/ + +void +trx_sys_mark_upgraded_to_multiple_tablespaces(void) +/*===============================================*/ +{ + page_t* page; + byte* doublewrite; + mtr_t mtr; + + /* We upgraded to 4.1.x and reset the space id fields in the + doublewrite buffer. Let us mark to the trx_sys header that the upgrade + has been done. */ + + mtr_start(&mtr); + + page = buf_page_get(TRX_SYS_SPACE, TRX_SYS_PAGE_NO, RW_X_LATCH, &mtr); + buf_page_dbg_add_level(page, SYNC_NO_ORDER_CHECK); + + doublewrite = page + TRX_SYS_DOUBLEWRITE; + + mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED, + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N, + MLOG_4BYTES, &mtr); + mtr_commit(&mtr); + + /* Flush the modified pages to disk and make a checkpoint */ + log_make_checkpoint_at(ut_dulint_max, TRUE); + + trx_sys_multiple_tablespace_format = TRUE; +} + +/******************************************************************** +Creates the doublewrite buffer to a new InnoDB installation. The header of the doublewrite buffer is placed on the trx system header page. 
*/ void @@ -138,7 +183,6 @@ start_again: if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC) == TRX_SYS_DOUBLEWRITE_MAGIC_N) { - /* The doublewrite buffer has already been created: just read in some numbers */ @@ -244,10 +288,15 @@ start_again: } mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC, - TRX_SYS_DOUBLEWRITE_MAGIC_N, MLOG_4BYTES, &mtr); + TRX_SYS_DOUBLEWRITE_MAGIC_N, MLOG_4BYTES, &mtr); mlog_write_ulint(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC + TRX_SYS_DOUBLEWRITE_REPEAT, - TRX_SYS_DOUBLEWRITE_MAGIC_N, MLOG_4BYTES, &mtr); + TRX_SYS_DOUBLEWRITE_MAGIC_N, MLOG_4BYTES, &mtr); + + mlog_write_ulint(doublewrite + + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED, + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N, + MLOG_4BYTES, &mtr); mtr_commit(&mtr); /* Flush the modified pages to disk and make a checkpoint */ @@ -255,23 +304,31 @@ start_again: fprintf(stderr, "InnoDB: Doublewrite buffer created\n"); + trx_sys_multiple_tablespace_format = TRUE; + goto start_again; } } /******************************************************************** -At a database startup uses a possible doublewrite buffer to restore +At a database startup initializes the doublewrite buffer memory structure if +we already have a doublewrite buffer created in the data files. If we are +upgrading to an InnoDB version which supports multiple tablespaces, then this +function performs the necessary update operations. If we are in a crash +recovery, this function uses a possible doublewrite buffer to restore half-written pages in the data files. 
*/ void -trx_sys_doublewrite_restore_corrupt_pages(void) -/*===========================================*/ +trx_sys_doublewrite_init_or_restore_pages( +/*======================================*/ + ibool restore_corrupt_pages) { byte* buf; byte* read_buf; byte* unaligned_read_buf; ulint block1; ulint block2; + ulint source_page_no; byte* page; byte* doublewrite; ulint space_id; @@ -283,12 +340,11 @@ trx_sys_doublewrite_restore_corrupt_pages(void) unaligned_read_buf = ut_malloc(2 * UNIV_PAGE_SIZE); read_buf = ut_align(unaligned_read_buf, UNIV_PAGE_SIZE); - /* Read the trx sys header to check if we are using the - doublewrite buffer */ + /* Read the trx sys header to check if we are using the doublewrite + buffer */ fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, TRX_SYS_PAGE_NO, 0, UNIV_PAGE_SIZE, read_buf, NULL); - doublewrite = read_buf + TRX_SYS_DOUBLEWRITE; if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_MAGIC) @@ -305,6 +361,23 @@ trx_sys_doublewrite_restore_corrupt_pages(void) goto leave_func; } + if (mach_read_from_4(doublewrite + TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED) + != TRX_SYS_DOUBLEWRITE_SPACE_ID_STORED_N) { + + /* We are upgrading from a version < 4.1.x to a version where + multiple tablespaces are supported. We must reset the space id + field in the pages in the doublewrite buffer because starting + from this version the space id is stored to + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID. 
*/ + + trx_doublewrite_must_reset_space_ids = TRUE; + + fprintf(stderr, +"InnoDB: Resetting space id's in the doublewrite buffer\n"); + } else { + trx_sys_multiple_tablespace_format = TRUE; + } + /* Read the pages from the doublewrite buffer to memory */ fil_io(OS_FILE_READ, TRUE, TRX_SYS_SPACE, block1, 0, @@ -322,12 +395,45 @@ trx_sys_doublewrite_restore_corrupt_pages(void) for (i = 0; i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE * 2; i++) { page_no = mach_read_from_4(page + FIL_PAGE_OFFSET); - space_id = 0; - if (!fil_check_adress_in_tablespace(space_id, page_no)) { + if (trx_doublewrite_must_reset_space_ids) { + + space_id = 0; + mach_write_to_4(page + + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0); + /* We do not need to calculate new checksums for the + pages because the field .._SPACE_ID does not affect + them. Write the page back to where we read it from. */ + + if (i < TRX_SYS_DOUBLEWRITE_BLOCK_SIZE) { + source_page_no = block1 + i; + } else { + source_page_no = block2 + + i - TRX_SYS_DOUBLEWRITE_BLOCK_SIZE; + } + + fil_io(OS_FILE_WRITE, TRUE, 0, source_page_no, 0, + UNIV_PAGE_SIZE, page, NULL); + /* printf("Resetting space id in page %lu\n", + source_page_no); */ + } else { + space_id = mach_read_from_4( + page + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID); + } + + if (!restore_corrupt_pages) { + /* The database was shut down gracefully: no need to + restore pages */ + + } else if (!fil_tablespace_exists_in_mem(space_id)) { + /* Maybe we have dropped the single-table tablespace + and this page once belonged to it: do nothing */ + + } else if (!fil_check_adress_in_tablespace(space_id, + page_no)) { fprintf(stderr, - "InnoDB: Warning: an inconsistent page in the doublewrite buffer\n" - "InnoDB: space id %lu page number %lu, %lu'th page in dblwr buf.\n", +"InnoDB: Warning: a page in the doublewrite buffer is not within space\n" +"InnoDB: bounds; space id %lu page number %lu, page %lu in doublewrite buf.\n", space_id, page_no, i); } else if (space_id == TRX_SYS_SPACE @@ -498,8 +604,8 @@ 
trx_sys_update_mysql_binlog_offset( mlog_write_ulint(sys_header + field + TRX_SYS_MYSQL_LOG_OFFSET_LOW, - (ulint)(offset & 0xFFFFFFFF), - MLOG_4BYTES, mtr); + (ulint)(offset & 0xFFFFFFFFUL), + MLOG_4BYTES, mtr); } /********************************************************************* diff --git a/innobase/trx/trx0trx.c b/innobase/trx/trx0trx.c index 1ece349ec6c..292b7cd6f2f 100644 --- a/innobase/trx/trx0trx.c +++ b/innobase/trx/trx0trx.c @@ -105,7 +105,7 @@ trx_create( trx->mysql_log_file_name = NULL; trx->mysql_log_offset = 0; - trx->mysql_master_log_file_name = ""; + trx->mysql_master_log_file_name = (char*)""; trx->mysql_master_log_pos = 0; mutex_create(&(trx->undo_mutex)); @@ -1624,14 +1624,14 @@ trx_print( } buf += sprintf(buf, "\n"); + + if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) { - if (trx->n_mysql_tables_in_use > 0 || trx->mysql_n_tables_locked > 0) { + buf += sprintf(buf, "mysql tables in use %lu, locked %lu\n", + trx->n_mysql_tables_in_use, + trx->mysql_n_tables_locked); + } - buf += sprintf(buf, "mysql tables in use %lu, locked %lu\n", - trx->n_mysql_tables_in_use, - trx->mysql_n_tables_locked); - } - start_of_line = buf; switch (trx->que_state) { diff --git a/innobase/trx/trx0undo.c b/innobase/trx/trx0undo.c index 34f56dba130..82572b82807 100644 --- a/innobase/trx/trx0undo.c +++ b/innobase/trx/trx0undo.c @@ -387,6 +387,7 @@ trx_undo_seg_create( page_t* undo_page; trx_upagef_t* page_hdr; trx_usegf_t* seg_hdr; + ulint n_reserved; ibool success; ut_ad(mtr && id && rseg_hdr); @@ -411,8 +412,8 @@ trx_undo_seg_create( space = buf_frame_get_space_id(rseg_hdr); - success = fsp_reserve_free_extents(space, 2, FSP_UNDO, mtr); - + success = fsp_reserve_free_extents(&n_reserved, space, 2, FSP_UNDO, + mtr); if (!success) { return(NULL); @@ -422,7 +423,7 @@ trx_undo_seg_create( undo_page = fseg_create_general(space, 0, TRX_UNDO_SEG_HDR + TRX_UNDO_FSEG_HEADER, TRUE, mtr); - fil_space_release_free_extents(space, 2); + 
fil_space_release_free_extents(space, n_reserved); if (undo_page == NULL) { /* No space left */ @@ -733,6 +734,7 @@ trx_undo_add_page( page_t* new_page; trx_rseg_t* rseg; ulint page_no; + ulint n_reserved; ibool success; ut_ad(mutex_own(&(trx->undo_mutex))); @@ -749,8 +751,8 @@ trx_undo_add_page( header_page = trx_undo_page_get(undo->space, undo->hdr_page_no, mtr); - success = fsp_reserve_free_extents(undo->space, 1, FSP_UNDO, mtr); - + success = fsp_reserve_free_extents(&n_reserved, undo->space, 1, + FSP_UNDO, mtr); if (!success) { return(FIL_NULL); @@ -761,7 +763,7 @@ trx_undo_add_page( undo->top_page_no + 1, FSP_UP, TRUE, mtr); - fil_space_release_free_extents(undo->space, 1); + fil_space_release_free_extents(undo->space, n_reserved); if (page_no == FIL_NULL) { diff --git a/innobase/ut/ut0byte.c b/innobase/ut/ut0byte.c index 02bdf2065ee..74198419560 100644 --- a/innobase/ut/ut0byte.c +++ b/innobase/ut/ut0byte.c @@ -18,7 +18,7 @@ Created 5/11/1994 Heikki Tuuri dulint ut_dulint_zero = {0, 0}; /* Maximum value for a dulint */ -dulint ut_dulint_max = {0xFFFFFFFF, 0xFFFFFFFF}; +dulint ut_dulint_max = {0xFFFFFFFFUL, 0xFFFFFFFFUL}; /**************************************************************** Sort function for dulint arrays. */ diff --git a/innobase/ut/ut0ut.c b/innobase/ut/ut0ut.c index 4ca113f40ad..be311764261 100644 --- a/innobase/ut/ut0ut.c +++ b/innobase/ut/ut0ut.c @@ -200,7 +200,6 @@ ut_get_year_month_day( *month = (ulint)cal_tm.wMonth; *day = (ulint)cal_tm.wDay; #else - struct tm cal_tm; struct tm* cal_tm_ptr; time_t tm; |