diff options
Diffstat (limited to 'storage')
45 files changed, 4032 insertions, 982 deletions
diff --git a/storage/maria/Makefile.am b/storage/maria/Makefile.am index 795784f31aa..090c3d783d9 100644 --- a/storage/maria/Makefile.am +++ b/storage/maria/Makefile.am @@ -62,7 +62,7 @@ noinst_HEADERS = maria_def.h ma_rt_index.h ma_rt_key.h ma_rt_mbr.h \ ma_control_file.h ha_maria.h ma_blockrec.h \ ma_loghandler.h ma_loghandler_lsn.h ma_pagecache.h \ ma_checkpoint.h ma_recovery.h ma_commit.h \ - trnman_public.h ma_check_standalone.h + trnman_public.h ma_check_standalone.h ma_key_recover.h ma_test1_DEPENDENCIES= $(LIBRARIES) ma_test1_LDADD= @CLIENT_EXTRA_LDFLAGS@ libmaria.a \ $(top_builddir)/storage/myisam/libmyisam.a \ @@ -103,7 +103,8 @@ ma_sp_test_LDADD= @CLIENT_EXTRA_LDFLAGS@ libmaria.a \ $(top_builddir)/strings/libmystrings.a @ZLIB_LIBS@ libmaria_a_SOURCES = ma_init.c ma_open.c ma_extra.c ma_info.c ma_rkey.c \ ma_rnext.c ma_rnext_same.c \ - ma_search.c ma_page.c ma_key.c ma_locking.c \ + ma_search.c ma_page.c ma_key_recover.c ma_key.c \ + ma_locking.c \ ma_rrnd.c ma_scan.c ma_cache.c \ ma_statrec.c ma_packrec.c ma_dynrec.c \ ma_blockrec.c ma_bitmap.c \ diff --git a/storage/maria/ha_maria.cc b/storage/maria/ha_maria.cc index 1d36c05ee4c..f43e23da439 100644 --- a/storage/maria/ha_maria.cc +++ b/storage/maria/ha_maria.cc @@ -2131,7 +2131,7 @@ int ha_maria::create(const char *name, register TABLE *table_arg, HA_CREATE_INFO *ha_create_info) { int error; - uint create_flags= 0, records, i; + uint create_flags= 0, record_count, i; char buff[FN_REFLEN]; MARIA_KEYDEF *keydef; MARIA_COLUMNDEF *recinfo; @@ -2159,7 +2159,7 @@ int ha_maria::create(const char *name, register TABLE *table_arg, ER_ILLEGAL_HA_CREATE_OPTION, "Row format set to PAGE because of TRANSACTIONAL=1 option"); - if ((error= table2maria(table_arg, &keydef, &recinfo, &records))) + if ((error= table2maria(table_arg, &keydef, &recinfo, &record_count))) DBUG_RETURN(error); /* purecov: inspected */ bzero((char*) &create_info, sizeof(create_info)); create_info.max_rows= share->max_rows; @@ -2204,7 +2204,7 @@ 
int ha_maria::create(const char *name, register TABLE *table_arg, maria_create(fn_format(buff, name, "", "", MY_UNPACK_FILENAME | MY_APPEND_EXT), row_type, share->keys, keydef, - records, recinfo, + record_count, recinfo, 0, (MARIA_UNIQUEDEF *) 0, &create_info, create_flags); @@ -2322,22 +2322,22 @@ uint ha_maria::checksum() const } -bool ha_maria::check_if_incompatible_data(HA_CREATE_INFO *info, +bool ha_maria::check_if_incompatible_data(HA_CREATE_INFO *create_info, uint table_changes) { uint options= table->s->db_options_in_use; - if (info->auto_increment_value != stats.auto_increment_value || - info->data_file_name != data_file_name || - info->index_file_name != index_file_name || - maria_row_type(info) != data_file_type || + if (create_info->auto_increment_value != stats.auto_increment_value || + create_info->data_file_name != data_file_name || + create_info->index_file_name != index_file_name || + maria_row_type(create_info) != data_file_type || table_changes == IS_EQUAL_NO || table_changes & IS_EQUAL_PACK_LENGTH) // Not implemented yet return COMPATIBLE_DATA_NO; if ((options & (HA_OPTION_PACK_RECORD | HA_OPTION_CHECKSUM | HA_OPTION_DELAY_KEY_WRITE)) != - (info->table_options & (HA_OPTION_PACK_RECORD | HA_OPTION_CHECKSUM | + (create_info->table_options & (HA_OPTION_PACK_RECORD | HA_OPTION_CHECKSUM | HA_OPTION_DELAY_KEY_WRITE))) return COMPATIBLE_DATA_NO; return COMPATIBLE_DATA_YES; @@ -2413,7 +2413,7 @@ static int ha_maria_init(void *p) maria_recover() || ma_checkpoint_init(checkpoint_interval); maria_multi_threaded= TRUE; - return res; + return res ? 
HA_ERR_INITIALIZATION : 0; } @@ -2519,9 +2519,9 @@ static void update_checkpoint_interval(MYSQL_THD thd, } static SHOW_VAR status_variables[]= { - {"Maria_pagecache_blocks_not_flushed", (char*) &maria_pagecache_var.global_blocks_changed, SHOW_LONG}, - {"Maria_pagecache_blocks_unused", (char*) &maria_pagecache_var.blocks_unused, SHOW_LONG}, - {"Maria_pagecache_blocks_used", (char*) &maria_pagecache_var.blocks_used, SHOW_LONG}, + {"Maria_pagecache_blocks_not_flushed", (char*) &maria_pagecache_var.global_blocks_changed, SHOW_LONG_NOFLUSH}, + {"Maria_pagecache_blocks_unused", (char*) &maria_pagecache_var.blocks_unused, SHOW_LONG_NOFLUSH}, + {"Maria_pagecache_blocks_used", (char*) &maria_pagecache_var.blocks_used, SHOW_LONG_NOFLUSH}, {"Maria_pagecache_read_requests", (char*) &maria_pagecache_var.global_cache_r_requests, SHOW_LONGLONG}, {"Maria_pagecache_reads", (char*) &maria_pagecache_var.global_cache_read, SHOW_LONGLONG}, {"Maria_pagecache_write_requests", (char*) &maria_pagecache_var.global_cache_w_requests, SHOW_LONGLONG}, diff --git a/storage/maria/lockman.c b/storage/maria/lockman.c index 8316d70bb29..c9b753fb492 100644 --- a/storage/maria/lockman.c +++ b/storage/maria/lockman.c @@ -247,7 +247,7 @@ static int lockfind(LOCK * volatile *head, LOCK *node, { uint32 hashnr, cur_hashnr; uint64 resource, cur_resource; - intptr link; + intptr cur_link; my_bool cur_active, compatible, upgrading, prev_active; enum lock_type lock, prev_lock, cur_lock; uint16 loid, cur_loid; @@ -276,10 +276,10 @@ retry: if (!cursor->curr) break; do { - link= cursor->curr->link; - cursor->next= PTR(link); + cur_link= cursor->curr->link; + cursor->next= PTR(cur_link); _lf_pin(pins, 0, cursor->next); - } while(link != cursor->curr->link && LF_BACKOFF); + } while (cur_link != cursor->curr->link && LF_BACKOFF); cur_hashnr= cursor->curr->hashnr; cur_resource= cursor->curr->resource; cur_lock= cursor->curr->lock; @@ -290,7 +290,7 @@ retry: (void)LF_BACKOFF; goto retry; } - if (!DELETED(link)) + if 
(!DELETED(cur_link)) { if (cur_hashnr > hashnr || (cur_hashnr == hashnr && cur_resource >= resource)) @@ -429,7 +429,7 @@ static int lockinsert(LOCK * volatile *head, LOCK *node, LF_PINS *pins, if (res & LOCK_UPGRADE) cursor.upgrade_from->flags|= IGNORE_ME; /* - QQ: is this OK ? if a reader has already read upgrade_from, + QQ: is this OK ? if a reader has already read upgrade_from, it may find it conflicting with node :( - see the last test from test_lockman_simple() */ diff --git a/storage/maria/ma_blockrec.c b/storage/maria/ma_blockrec.c index c418d1e71b2..af7bf4c1f5b 100644 --- a/storage/maria/ma_blockrec.c +++ b/storage/maria/ma_blockrec.c @@ -265,8 +265,9 @@ #include "maria_def.h" #include "ma_blockrec.h" -#include <lf.h> #include "trnman.h" +#include "ma_key_recover.h" +#include <lf.h> /* Struct for having a cursor over a set of extent. @@ -314,12 +315,7 @@ typedef struct st_maria_extent_cursor trn->undo_lsn under log mutex, and needs to know the type of UNDO being undone now to modify state.records under log mutex. 
*/ -struct st_msg_to_write_hook_for_clr_end -{ - LSN previous_undo_lsn; - enum translog_record_type undone_record_type; - ha_checksum checksum_delta; -}; + /** S:share,D:checksum_delta,E:expression,P:pointer_into_record,L:length */ #define store_checksum_in_rec(S,D,E,P,L) do \ { \ @@ -498,12 +494,6 @@ my_bool _ma_init_block_record(MARIA_HA *info) sizeof(MARIA_BITMAP_BLOCK), ELEMENTS_RESERVED_FOR_MAIN_PART, 16)) goto err; - /* The following should be big enough for all purposes */ - if (my_init_dynamic_array(&info->pinned_pages, - sizeof(MARIA_PINNED_PAGE), - max(info->s->base.blobs*2 + 4, - MARIA_MAX_TREE_LEVELS*2), 16)) - goto err; row->base_length= new_row->base_length= info->s->base_length; /* @@ -527,7 +517,6 @@ void _ma_end_block_record(MARIA_HA *info) DBUG_ENTER("_ma_end_block_record"); my_free((uchar*) info->cur_row.empty_bits, MYF(MY_ALLOW_ZERO_PTR)); delete_dynamic(&info->bitmap_blocks); - delete_dynamic(&info->pinned_pages); my_free((uchar*) info->cur_row.extents, MYF(MY_ALLOW_ZERO_PTR)); /* The data file is closed, when needed, in ma_once_end_block_record(). @@ -783,50 +772,6 @@ void copy_not_changed_fields(MARIA_HA *info, MY_BITMAP *changed_fields, } } - -/* - Unpin all pinned pages - - SYNOPSIS - _ma_unpin_all_pages() - info Maria handler - undo_lsn LSN for undo pages. LSN_IMPOSSIBLE if we shouldn't write undo - (error) - - NOTE - We unpin pages in the reverse order as they where pinned; This may not - be strictly necessary but may simplify things in the future. 
- - RETURN - 0 ok - 1 error (fatal disk error) - -*/ - -void _ma_unpin_all_pages(MARIA_HA *info, LSN undo_lsn) -{ - MARIA_PINNED_PAGE *page_link= ((MARIA_PINNED_PAGE*) - dynamic_array_ptr(&info->pinned_pages, 0)); - MARIA_PINNED_PAGE *pinned_page= page_link + info->pinned_pages.elements; - DBUG_ENTER("_ma_unpin_all_pages"); - DBUG_PRINT("info", ("undo_lsn: %lu", (ulong) undo_lsn)); - - /* True if not disk error */ - DBUG_ASSERT((undo_lsn != LSN_IMPOSSIBLE) || !info->s->now_transactional); - - if (!info->s->now_transactional) - undo_lsn= LSN_IMPOSSIBLE; /* don't try to set a LSN on pages */ - - while (pinned_page-- != page_link) - pagecache_unlock_by_link(info->s->pagecache, pinned_page->link, - pinned_page->unlock, PAGECACHE_UNPIN, - info->trn->rec_lsn, undo_lsn); - - info->pinned_pages.elements= 0; - DBUG_VOID_RETURN; -} - - #ifdef NOT_YET_NEEDED /* Calculate empty space on a page */ @@ -843,23 +788,6 @@ static uint empty_space_on_page(uchar *buff, uint block_size) } #endif -/** - When we have finished the write/update/delete of a row, we have cleanups to - do. For now it is signalling to Checkpoint that all dirtied pages have - their rec_lsn set and page LSN set (_ma_unpin_all_pages() has been called), - and that bitmap pages are correct (_ma_bitmap_release_unused() has been - called). 
-*/ -#define _ma_finalize_row(info) \ - do { info->trn->rec_lsn= LSN_IMPOSSIBLE; } while(0) -/** unpinning is often the last operation before finalizing: */ -#define _ma_unpin_all_pages_and_finalize_row(info,undo_lsn) do \ - { \ - _ma_unpin_all_pages(info, undo_lsn); \ - _ma_finalize_row(info); \ - } while(0) - - /* Find free position in directory @@ -1379,6 +1307,7 @@ static my_bool get_head_or_tail_page(MARIA_HA *info, lock, &page_link.link))) DBUG_RETURN(1); page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + page_link.changed= 1; push_dynamic(&info->pinned_pages, (void*) &page_link); DBUG_ASSERT((res->buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == page_type); @@ -1517,6 +1446,7 @@ static my_bool write_tail(MARIA_HA *info, PAGECACHE_WRITE_DELAY, &page_link.link))) { page_link.unlock= PAGECACHE_LOCK_READ_UNLOCK; + page_link.changed= 1; if (block_is_read) { /* Change the lock used when we read the page */ @@ -1879,7 +1809,6 @@ static my_bool free_full_page_range(MARIA_HA *info, ulonglong page, uint count) TRANSLOG_INTERNAL_PARTS + 1, log_array, log_data, NULL)) res= 1; - } pthread_mutex_lock(&info->s->bitmap.bitmap_lock); if (_ma_reset_full_page_bits(info, &info->s->bitmap, page, @@ -2343,13 +2272,12 @@ static my_bool write_block_record(MARIA_HA *info, head_block+1, bitmap_blocks->count - 1); if (head_tail_block) { - ulong data_length= (tmp_data - info->rec_buff); - uint length; + ulong block_length= (tmp_data - info->rec_buff); uchar *extent_data; - length= (uint) (data_length % FULL_PAGE_SIZE(block_size)); + length= (uint) (block_length % FULL_PAGE_SIZE(block_size)); if (write_tail(info, head_tail_block, - info->rec_buff + data_length - length, + info->rec_buff + block_length - length, length)) goto disk_err; tmp_data-= length; /* Remove the tail */ @@ -2393,7 +2321,7 @@ static my_bool write_block_record(MARIA_HA *info, { uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE]; LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; - size_t data_length= 
(size_t) (data - row_pos->data); + size_t block_length= (size_t) (data - row_pos->data); /* Log REDO changes of head page */ page_store(log_data + FILEID_STORE_SIZE, head_block->page); @@ -2402,9 +2330,9 @@ static my_bool write_block_record(MARIA_HA *info, log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data; log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); log_array[TRANSLOG_INTERNAL_PARTS + 1].str= (char*) row_pos->data; - log_array[TRANSLOG_INTERNAL_PARTS + 1].length= data_length; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= block_length; if (translog_write_record(&lsn, LOGREC_REDO_INSERT_ROW_HEAD, info->trn, - info, sizeof(log_data) + data_length, + info, sizeof(log_data) + block_length, TRANSLOG_INTERNAL_PARTS + 2, log_array, log_data, NULL)) goto disk_err; @@ -2426,6 +2354,7 @@ static my_bool write_block_record(MARIA_HA *info, PAGECACHE_WRITE_DELAY, &page_link.link)) goto disk_err; page_link.unlock= PAGECACHE_LOCK_READ_UNLOCK; + page_link.changed= 1; if (head_block_is_read) { /* Head page is always the first pinned page */ @@ -2477,12 +2406,12 @@ static my_bool write_block_record(MARIA_HA *info, if (tmp_data_used) { /* Full head page */ - size_t data_length= (ulong) (tmp_data - info->rec_buff); + size_t block_length= (ulong) (tmp_data - info->rec_buff); log_pos= store_page_range(log_pos, head_block+1, block_size, - data_length, &extents); + block_length, &extents); log_array_pos->str= (char*) info->rec_buff; - log_array_pos->length= data_length; - log_entry_length+= data_length; + log_array_pos->length= block_length; + log_entry_length+= block_length; log_array_pos++; sub_extents++; } @@ -2545,7 +2474,7 @@ static my_bool write_block_record(MARIA_HA *info, } /* Write UNDO or CLR record */ - lsn= 0; + lsn= LSN_IMPOSSIBLE; if (share->now_transactional) { LEX_STRING *log_array= info->log_row_parts; @@ -2609,6 +2538,8 @@ static my_bool write_block_record(MARIA_HA *info, if (!old_record) { + /* Store undo_lsn in case we are aborting the 
insert */ + row->orig_undo_lsn= info->trn->undo_lsn; /* Write UNDO log record for the INSERT */ if (translog_write_record(&lsn, LOGREC_UNDO_ROW_INSERT, info->trn, info, @@ -2711,7 +2642,7 @@ disk_err: Unpin all pinned pages to not cause problems for disk cache. This is safe to call even if we already called _ma_unpin_all_pages() above. */ - _ma_unpin_all_pages_and_finalize_row(info, 0); + _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE); DBUG_RETURN(1); } @@ -2863,32 +2794,13 @@ my_bool _ma_write_abort_block_record(MARIA_HA *info) if (share->now_transactional) { - TRANSLOG_HEADER_BUFFER rec; LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + CLR_TYPE_STORE_SIZE + HA_CHECKSUM_STORE_SIZE]; - int len; struct st_msg_to_write_hook_for_clr_end msg; - /* - We do need the code above (delete_head_or_tail() etc) for - non-transactional tables. - For transactional tables we could skip this code above and just execute - the UNDO_INSERT, but we try to have one code path. - Write CLR record, because we are somehow undoing UNDO_ROW_INSERT. - When we have logging for keys: as maria_write() first writes the row - then the keys, and if failure, deletes the keys then the rows, - info->trn->undo_lsn below will properly point to the UNDO of the - UNDO_ROW_INSERT for this row. 
- */ - if ((len= translog_read_record_header(info->trn->undo_lsn, &rec)) == - RECHEADER_READ_ERROR) - { - res= 1; - goto end; - } - DBUG_ASSERT(rec.type == LOGREC_UNDO_ROW_INSERT); - memcpy(log_data, rec.header, LSN_STORE_SIZE); /* previous UNDO LSN */ - msg.previous_undo_lsn= lsn_korr(rec.header); + + lsn_store(log_data, info->cur_row.orig_undo_lsn); + msg.previous_undo_lsn= info->cur_row.orig_undo_lsn; msg.undone_record_type= LOGREC_UNDO_ROW_INSERT; clr_type_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE, LOGREC_UNDO_ROW_INSERT); @@ -2907,7 +2819,6 @@ my_bool _ma_write_abort_block_record(MARIA_HA *info) log_data + LSN_STORE_SIZE, &msg)) res= 1; } -end: _ma_unpin_all_pages_and_finalize_row(info, lsn); DBUG_RETURN(res); } @@ -2964,6 +2875,7 @@ static my_bool _ma_update_block_record2(MARIA_HA *info, PAGECACHE_LOCK_WRITE, &page_link.link))) DBUG_RETURN(1); page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + page_link.changed= 1; push_dynamic(&info->pinned_pages, (void*) &page_link); org_empty_size= uint2korr(buff + EMPTY_SPACE_OFFSET); @@ -3018,7 +2930,7 @@ static my_bool _ma_update_block_record2(MARIA_HA *info, the head page */ head_length= uint2korr(dir + 2); - if (buff[PAGE_TYPE_OFFSET] & PAGE_CAN_BE_COMPACTED && org_empty_size && + if ((buff[PAGE_TYPE_OFFSET] & PAGE_CAN_BE_COMPACTED) && org_empty_size && (head_length < new_row->head_length || (new_row->total_length <= head_length && org_empty_size + head_length >= new_row->total_length))) @@ -3047,14 +2959,13 @@ static my_bool _ma_update_block_record2(MARIA_HA *info, DBUG_RETURN(res); err: - _ma_unpin_all_pages_and_finalize_row(info, 0); + _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE); DBUG_RETURN(1); } /* Wrapper for _ma_update_block_record2() used by ma_update() */ - my_bool _ma_update_block_record(MARIA_HA *info, MARIA_RECORD_POS record_pos, const uchar *orig_rec, const uchar *new_rec) { @@ -3142,7 +3053,7 @@ static int delete_dir_entry(uchar *buff, uint block_size, uint record_number, 
buff[PAGE_TYPE_OFFSET]= UNALLOCATED_PAGE; #ifdef IDENTICAL_PAGES_AFTER_RECOVERY { - uchar *dir= dir_entry_pos(buff, block_size, record_number); + dir= dir_entry_pos(buff, block_size, record_number); bzero(dir, (record_number+1) * DIR_ENTRY_SIZE); } #endif @@ -3219,6 +3130,7 @@ static my_bool delete_head_or_tail(MARIA_HA *info, PAGECACHE_LOCK_WRITE, &page_link.link))) DBUG_RETURN(1); page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + page_link.changed= 1; push_dynamic(&info->pinned_pages, (void*) &page_link); if (from_update) @@ -3291,6 +3203,7 @@ static my_bool delete_head_or_tail(MARIA_HA *info, } /* The page is pinned with a read lock */ page_link.unlock= lock_at_unpin; + page_link.changed= 1; set_dynamic(&info->pinned_pages, (void*) &page_link, info->pinned_pages.elements-1); @@ -3402,7 +3315,7 @@ my_bool _ma_delete_block_record(MARIA_HA *info, const uchar *record) DBUG_RETURN(0); err: - _ma_unpin_all_pages_and_finalize_row(info, 0); + _ma_unpin_all_pages_and_finalize_row(info, LSN_IMPOSSIBLE); DBUG_RETURN(1); } @@ -3579,6 +3492,7 @@ static uchar *read_next_extent(MARIA_HA *info, MARIA_EXTENT_CURSOR *extent, { /* Read during redo */ page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + page_link.changed= 1; push_dynamic(&info->pinned_pages, (void*) &page_link); } @@ -3890,8 +3804,9 @@ int _ma_read_block_record2(MARIA_HA *info, uchar *record, } case FIELD_BLOB: { - uint size_length= column->length - portable_sizeof_char_ptr; - ulong blob_length= _ma_calc_blob_length(size_length, field_length_data); + uint column_size_length= column->length - portable_sizeof_char_ptr; + ulong blob_length= _ma_calc_blob_length(column_size_length, + field_length_data); if (!found_blob) { @@ -3920,10 +3835,10 @@ int _ma_read_block_record2(MARIA_HA *info, uchar *record, blob_buffer= info->rec_buff; } - memcpy(field_pos, field_length_data, size_length); - memcpy_fixed(field_pos + size_length, (uchar *) & blob_buffer, + memcpy(field_pos, field_length_data, column_size_length); + 
memcpy_fixed(field_pos + column_size_length, (uchar *) &blob_buffer, sizeof(char*)); - field_length_data+= size_length; + field_length_data+= column_size_length; /* After we have read one extent, then each blob is in it's own extent @@ -5061,16 +4976,21 @@ my_bool write_hook_for_clr_end(enum translog_record_type type (struct st_msg_to_write_hook_for_clr_end *)hook_arg; DBUG_ASSERT(trn->trid != 0); trn->undo_lsn= msg->previous_undo_lsn; - share->state.state.checksum+= msg->checksum_delta; switch (msg->undone_record_type) { case LOGREC_UNDO_ROW_DELETE: share->state.state.records++; + share->state.state.checksum+= msg->checksum_delta; break; case LOGREC_UNDO_ROW_INSERT: share->state.state.records--; + share->state.state.checksum+= msg->checksum_delta; break; case LOGREC_UNDO_ROW_UPDATE: + share->state.state.checksum+= msg->checksum_delta; + break; + case LOGREC_UNDO_KEY_INSERT: + case LOGREC_UNDO_KEY_DELETE: break; default: DBUG_ASSERT(0); @@ -5180,7 +5100,7 @@ uint _ma_apply_redo_insert_row_head_or_tail(MARIA_HA *info, LSN lsn, pagecache_unlock_by_link(share->pagecache, page_link.link, PAGECACHE_LOCK_WRITE_UNLOCK, PAGECACHE_UNPIN, LSN_IMPOSSIBLE, - LSN_IMPOSSIBLE); + LSN_IMPOSSIBLE, 0); DBUG_RETURN(my_errno); } /* Create new page */ @@ -5193,7 +5113,7 @@ uint _ma_apply_redo_insert_row_head_or_tail(MARIA_HA *info, LSN lsn, pagecache_unlock_by_link(share->pagecache, page_link.link, PAGECACHE_LOCK_WRITE_UNLOCK, PAGECACHE_UNPIN, LSN_IMPOSSIBLE, - LSN_IMPOSSIBLE); + LSN_IMPOSSIBLE, 0); /* Fix bitmap, just in case */ empty_space= uint2korr(buff + EMPTY_SPACE_OFFSET); if (_ma_bitmap_set(info, page, page_type == HEAD_PAGE, empty_space)) @@ -5299,7 +5219,7 @@ err: pagecache_unlock_by_link(share->pagecache, page_link.link, PAGECACHE_LOCK_WRITE_UNLOCK, PAGECACHE_UNPIN, LSN_IMPOSSIBLE, - LSN_IMPOSSIBLE); + LSN_IMPOSSIBLE, 0); DBUG_RETURN(HA_ERR_WRONG_IN_RECORD); } @@ -5351,7 +5271,7 @@ uint _ma_apply_redo_purge_row_head_or_tail(MARIA_HA *info, LSN lsn, 
pagecache_unlock_by_link(share->pagecache, page_link.link, PAGECACHE_LOCK_WRITE_UNLOCK, PAGECACHE_UNPIN, LSN_IMPOSSIBLE, - LSN_IMPOSSIBLE); + LSN_IMPOSSIBLE, 0); DBUG_RETURN(my_errno); } @@ -5365,7 +5285,7 @@ uint _ma_apply_redo_purge_row_head_or_tail(MARIA_HA *info, LSN lsn, pagecache_unlock_by_link(share->pagecache, page_link.link, PAGECACHE_LOCK_WRITE_UNLOCK, PAGECACHE_UNPIN, LSN_IMPOSSIBLE, - LSN_IMPOSSIBLE); + LSN_IMPOSSIBLE, 0); if ((buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK) == page_type) { @@ -5401,7 +5321,7 @@ err: pagecache_unlock_by_link(share->pagecache, page_link.link, PAGECACHE_LOCK_WRITE_UNLOCK, PAGECACHE_UNPIN, LSN_IMPOSSIBLE, - LSN_IMPOSSIBLE); + LSN_IMPOSSIBLE, 0); DBUG_RETURN(HA_ERR_WRONG_IN_RECORD); } @@ -5426,7 +5346,7 @@ uint _ma_apply_redo_free_blocks(MARIA_HA *info, { MARIA_SHARE *share= info->s; uint ranges; - DBUG_ENTER("_ma_apply_redo_purge_blocks"); + DBUG_ENTER("_ma_apply_redo_free_blocks"); ranges= pagerange_korr(header); header+= PAGERANGE_STORE_SIZE; @@ -5493,7 +5413,7 @@ uint _ma_apply_redo_free_head_or_tail(MARIA_HA *info, LSN lsn, pagecache_unlock_by_link(share->pagecache, page_link.link, PAGECACHE_LOCK_WRITE_UNLOCK, PAGECACHE_UNPIN, LSN_IMPOSSIBLE, - LSN_IMPOSSIBLE); + LSN_IMPOSSIBLE, 0); DBUG_RETURN(1); } if (lsn_korr(buff) >= lsn) @@ -5502,7 +5422,7 @@ uint _ma_apply_redo_free_head_or_tail(MARIA_HA *info, LSN lsn, pagecache_unlock_by_link(share->pagecache, page_link.link, PAGECACHE_LOCK_WRITE_UNLOCK, PAGECACHE_UNPIN, LSN_IMPOSSIBLE, - LSN_IMPOSSIBLE); + LSN_IMPOSSIBLE, 0); } else { @@ -5618,7 +5538,7 @@ uint _ma_apply_redo_insert_row_blobs(MARIA_HA *info, pagecache_unlock_by_link(share->pagecache, page_link.link, PAGECACHE_LOCK_WRITE_UNLOCK, PAGECACHE_UNPIN, LSN_IMPOSSIBLE, - LSN_IMPOSSIBLE); + LSN_IMPOSSIBLE, 0); DBUG_RETURN(my_errno); } /* Physical file was too short; Create new page */ @@ -5634,7 +5554,7 @@ uint _ma_apply_redo_insert_row_blobs(MARIA_HA *info, pagecache_unlock_by_link(share->pagecache, page_link.link, 
PAGECACHE_LOCK_WRITE_UNLOCK, PAGECACHE_UNPIN, LSN_IMPOSSIBLE, - LSN_IMPOSSIBLE); + LSN_IMPOSSIBLE, 0); continue; } } @@ -5685,15 +5605,12 @@ my_bool _ma_apply_undo_row_insert(MARIA_HA *info, LSN undo_lsn, { ulonglong page; uint rownr; - LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; - uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + - CLR_TYPE_STORE_SIZE + HA_CHECKSUM_STORE_SIZE], - *buff; + uchar *buff; my_bool res= 1; MARIA_PINNED_PAGE page_link; - LSN lsn; MARIA_SHARE *share= info->s; - struct st_msg_to_write_hook_for_clr_end msg; + ha_checksum checksum; + LSN lsn; DBUG_ENTER("_ma_apply_undo_row_insert"); page= page_korr(header); @@ -5710,6 +5627,7 @@ my_bool _ma_apply_undo_row_insert(MARIA_HA *info, LSN undo_lsn, DBUG_RETURN(1); page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + page_link.changed= 1; push_dynamic(&info->pinned_pages, (void*) &page_link); if (read_row_extent_info(info, buff, rownr)) @@ -5722,26 +5640,11 @@ my_bool _ma_apply_undo_row_insert(MARIA_HA *info, LSN undo_lsn, if (info->cur_row.extents && free_full_pages(info, &info->cur_row)) goto err; - /* undo_lsn must be first for compression to work */ - lsn_store(log_data, undo_lsn); - clr_type_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE, - LOGREC_UNDO_ROW_INSERT); - log_array[TRANSLOG_INTERNAL_PARTS + 0].length= - sizeof(log_data) - HA_CHECKSUM_STORE_SIZE; - msg.undone_record_type= LOGREC_UNDO_ROW_INSERT; - msg.previous_undo_lsn= undo_lsn; - store_checksum_in_rec(share, msg.checksum_delta, - - ha_checksum_korr(header), - log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE + - CLR_TYPE_STORE_SIZE, - log_array[TRANSLOG_INTERNAL_PARTS + 0].length); - log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data; - - if (translog_write_record(&lsn, LOGREC_CLR_END, - info->trn, info, log_array[TRANSLOG_INTERNAL_PARTS - + 0].length, - TRANSLOG_INTERNAL_PARTS + 1, log_array, - log_data + LSN_STORE_SIZE, &msg)) + checksum= 0; + if (share->calc_checksum) + checksum= -ha_checksum_korr(header); + if 
(_ma_write_clr(info, undo_lsn, LOGREC_UNDO_ROW_INSERT, + share->calc_checksum != 0, checksum, &lsn)) goto err; res= 0; @@ -5754,7 +5657,8 @@ err: /* Execute undo of a row delete (insert the row back somewhere) */ my_bool _ma_apply_undo_row_delete(MARIA_HA *info, LSN undo_lsn, - const uchar *header, size_t length) + const uchar *header, + size_t header_length __attribute__((unused))) { uchar *record; const uchar *null_bits, *field_length_data; @@ -5853,6 +5757,8 @@ my_bool _ma_apply_undo_row_delete(MARIA_HA *info, LSN undo_lsn, header+= column->length; break; case FIELD_SKIP_ENDSPACE: /* CHAR */ + { + uint length; if (column->length <= 255) length= (uint) *field_length_data++; else @@ -5868,6 +5774,7 @@ my_bool _ma_apply_undo_row_delete(MARIA_HA *info, LSN undo_lsn, ' '); header+= length; break; + } case FIELD_VARCHAR: { uint length; diff --git a/storage/maria/ma_blockrec.h b/storage/maria/ma_blockrec.h index cd49a65a0c8..7f3bf11d96c 100644 --- a/storage/maria/ma_blockrec.h +++ b/storage/maria/ma_blockrec.h @@ -13,7 +13,6 @@ along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ - /* Storage of records in block */ diff --git a/storage/maria/ma_check.c b/storage/maria/ma_check.c index b3090b98deb..0f970e77a0f 100644 --- a/storage/maria/ma_check.c +++ b/storage/maria/ma_check.c @@ -92,7 +92,7 @@ static int _ma_safe_scan_block_record(MARIA_SORT_INFO *sort_info, static void copy_data_file_state(MARIA_STATE_INFO *to, MARIA_STATE_INFO *from); static int write_log_record_for_repair(const HA_CHECK *param, MARIA_HA *info); - +static void report_keypage_fault(HA_CHECK *param, my_off_t position); void maria_chk_init(HA_CHECK *param) { @@ -218,7 +218,7 @@ int maria_chk_del(HA_CHECK *param, register MARIA_HA *info, uint test_flag) else { param->record_checksum+=(ha_checksum) next_link; - next_link= _ma_rec_pos(info->s, buff+1); + next_link= _ma_rec_pos(info, buff+1); 
empty+=info->s->base.pack_reclength; } } @@ -259,7 +259,7 @@ wrong: /* Check delete links in index file */ -static int check_k_link(HA_CHECK *param, register MARIA_HA *info, +static int check_k_link(HA_CHECK *param, register MARIA_HA *info, my_off_t next_link) { uint block_size= info->s->block_size; @@ -287,7 +287,7 @@ static int check_k_link(HA_CHECK *param, register MARIA_HA *info, DBUG_RETURN(1); /* purecov: end */ } - + /* Key blocks must be aligned at block_size */ if (next_link & (block_size -1)) { @@ -443,8 +443,8 @@ int maria_chk_key(HA_CHECK *param, register MARIA_HA *info) if (!(param->testflag & T_SILENT)) puts("- check index reference"); all_keydata=all_totaldata=key_totlength=0; - old_record_checksum=0; init_checksum=param->record_checksum; + old_record_checksum=0; if (share->data_file_type == STATIC_RECORD) old_record_checksum= (calc_checksum(info->state->records + info->state->del-1) * @@ -474,14 +474,17 @@ int maria_chk_key(HA_CHECK *param, register MARIA_HA *info) printf ("- check data record references index: %d\n",key+1); if (keyinfo->flag & HA_FULLTEXT) full_text_keys++; - if (share->state.key_root[key] == HA_OFFSET_ERROR && - (info->state->records == 0 || keyinfo->flag & HA_FULLTEXT)) + if (share->state.key_root[key] == HA_OFFSET_ERROR) + { + if (info->state->records != 0 && !(keyinfo->flag & HA_FULLTEXT)) + _ma_check_print_error(param, "Key tree %u is empty", key + 1); goto do_stat; - if (!_ma_fetch_keypage(info,keyinfo,share->state.key_root[key], - DFLT_INIT_HITS,info->buff,0)) + } + if (!_ma_fetch_keypage(info, keyinfo, share->state.key_root[key], + PAGECACHE_LOCK_LEFT_UNLOCKED, DFLT_INIT_HITS, + info->buff, 0, 0)) { - _ma_check_print_error(param,"Can't read indexpage from filepos: %s", - llstr(share->state.key_root[key],buff)); + report_keypage_fault(param, share->state.key_root[key]); if (!(param->testflag & T_INFO)) DBUG_RETURN(-1); result= -1; @@ -513,7 +516,9 @@ int maria_chk_key(HA_CHECK *param, register MARIA_HA *info) else if 
(old_record_checksum != param->record_checksum) { if (key) - _ma_check_print_error(param,"Key %u doesn't point at same records that key 1", + _ma_check_print_error(param, + "Key %u doesn't point at same records as " + "key 1", key+1); else _ma_check_print_error(param,"Key 1 doesn't point at all records"); @@ -600,6 +605,7 @@ do_stat: } /* maria_chk_key */ + static int chk_index_down(HA_CHECK *param, MARIA_HA *info, MARIA_KEYDEF *keyinfo, my_off_t page, uchar *buff, ha_rows *keys, @@ -638,10 +644,10 @@ static int chk_index_down(HA_CHECK *param, MARIA_HA *info, /* purecov: end */ } - if (!_ma_fetch_keypage(info,keyinfo,page, DFLT_INIT_HITS,buff,0)) + if (!_ma_fetch_keypage(info, keyinfo, page, PAGECACHE_LOCK_LEFT_UNLOCKED, + DFLT_INIT_HITS, buff, 0, 0)) { - _ma_check_print_error(param,"Can't read key from filepos: %s", - llstr(page,llbuff)); + report_keypage_fault(param, page); goto err; } param->key_file_blocks+=keyinfo->block_length; @@ -1068,7 +1074,7 @@ static int check_static_record(HA_CHECK *param, MARIA_HA *info, int extend, my_off_t start_recpos, pos; char llbuff[22]; - pos= 0; + pos= 0; while (pos < info->state->data_file_length) { if (*_ma_killed_ptr(param)) @@ -1114,7 +1120,7 @@ static int check_dynamic_record(HA_CHECK *param, MARIA_HA *info, int extend, LINT_INIT(start_recpos); LINT_INIT(to); - pos= 0; + pos= 0; while (pos < info->state->data_file_length) { my_bool got_error= 0; @@ -1217,7 +1223,7 @@ static int check_dynamic_record(HA_CHECK *param, MARIA_HA *info, int extend, if (_ma_alloc_buffer(&info->rec_buff, &info->rec_buff_size, block_info.rec_len + info->s->base.extra_rec_buff_size)) - + { _ma_check_print_error(param, "Not enough memory (%lu) for blob at %s", @@ -1302,7 +1308,7 @@ static int check_dynamic_record(HA_CHECK *param, MARIA_HA *info, int extend, } param->glob_crc+= checksum; } - + if (! 
got_error) { if (check_keys_in_record(param, info, extend, start_recpos, record)) @@ -1881,7 +1887,7 @@ int maria_chk_data_link(HA_CHECK *param, MARIA_HA *info,int extend) bzero((char*) param->tmp_key_crc, info->s->base.keys * sizeof(param->tmp_key_crc[0])); - + switch (info->s->data_file_type) { case BLOCK_RECORD: error= check_block_record(param, info, extend, record); @@ -1989,7 +1995,7 @@ int maria_chk_data_link(HA_CHECK *param, MARIA_HA *info,int extend) llstr(param->records,llbuff), (long)((param->used - param->link_used)/param->records), (info->s->base.blobs ? 0.0 : - (ulonglong2double((ulonglong) info->s->base.reclength * + (ulonglong2double((ulonglong) info->s->base.reclength * param->records)- my_off_t2double(param->used))/ ulonglong2double((ulonglong) info->s->base.reclength * @@ -2520,8 +2526,9 @@ int maria_movepoint(register MARIA_HA *info, uchar *record, nod_flag=_ma_test_if_nod(info, info->buff); _ma_dpointer(info,info->int_keypos-nod_flag- info->s->rec_reflength,newpos); - if (_ma_write_keypage(info,keyinfo,info->last_keypage, - DFLT_INIT_HITS,info->buff)) + if (_ma_write_keypage(info, keyinfo, info->last_keypage, + PAGECACHE_LOCK_LEFT_UNLOCKED, DFLT_INIT_HITS, + info->buff)) DBUG_RETURN(-1); } else @@ -2694,7 +2701,6 @@ static int sort_one_index(HA_CHECK *param, MARIA_HA *info, uchar *buff,*keypos,*endpos; uchar key[HA_MAX_POSSIBLE_KEY_BUFF]; my_off_t new_page_pos,next_page; - char llbuff[22]; DBUG_ENTER("sort_one_index"); /* cannot walk over R-tree indices */ @@ -2707,10 +2713,10 @@ static int sort_one_index(HA_CHECK *param, MARIA_HA *info, _ma_check_print_error(param,"Not enough memory for key block"); DBUG_RETURN(-1); } - if (!_ma_fetch_keypage(info,keyinfo,pagepos,DFLT_INIT_HITS,buff,0)) + if (!_ma_fetch_keypage(info, keyinfo, pagepos,PAGECACHE_LOCK_LEFT_UNLOCKED, + DFLT_INIT_HITS, buff, 0, 0)) { - _ma_check_print_error(param,"Can't read key block from filepos: %s", - llstr(pagepos,llbuff)); + report_keypage_fault(param, pagepos); goto err; } if 
((nod_flag=_ma_test_if_nod(info, buff)) || keyinfo->flag & HA_FULLTEXT) @@ -3938,7 +3944,6 @@ static int sort_get_next_record(MARIA_SORT_PARAM *sort_param) { if (param->testflag & T_VERBOSE) { - char llbuff[22]; record_pos_to_txt(info, info->cur_row.lastpos, llbuff); _ma_check_print_info(param, "Found record with wrong checksum at %s", @@ -4179,7 +4184,7 @@ static int sort_get_next_record(MARIA_SORT_PARAM *sort_param) &sort_param->rec_buff_size, block_info.rec_len + info->s->base.extra_rec_buff_size)) - + { if (param->max_record_length >= block_info.rec_len) { @@ -4441,7 +4446,7 @@ int _ma_sort_write_record(MARIA_SORT_PARAM *sort_param) from=sort_info->buff+ALIGN_SIZE(MARIA_MAX_DYN_BLOCK_HEADER); } /* We can use info->checksum here as only one thread calls this */ - info->cur_row.checksum= (*info->s->calc_check_checksum)(info, + info->cur_row.checksum= (*info->s->calc_check_checksum)(info, sort_param-> record); reclength= _ma_rec_pack(info,from,sort_param->record); @@ -4735,6 +4740,7 @@ static int sort_insert_key(MARIA_SORT_PARAM *sort_param, MARIA_KEYDEF *keyinfo=sort_param->keyinfo; MARIA_SORT_INFO *sort_info= sort_param->sort_info; HA_CHECK *param=sort_info->param; + MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link; DBUG_ENTER("sort_insert_key"); anc_buff= key_block->buff; @@ -4753,6 +4759,7 @@ static int sort_insert_key(MARIA_SORT_PARAM *sort_param, } a_length= info->s->keypage_header + nod_flag; key_block->end_pos= anc_buff + info->s->keypage_header; + bzero(anc_buff, info->s->keypage_header); _ma_store_keynr(info, anc_buff, (uint) (sort_param->keyinfo - info->s->keyinfo)); lastkey=0; /* No previous key in block */ @@ -4783,13 +4790,16 @@ static int sort_insert_key(MARIA_SORT_PARAM *sort_param, bzero(anc_buff+key_block->last_length, keyinfo->block_length- key_block->last_length); key_file_length=info->state->key_file_length; - if ((filepos= _ma_new(info,keyinfo,DFLT_INIT_HITS)) == HA_OFFSET_ERROR) + if ((filepos= _ma_new(info, DFLT_INIT_HITS, 
&page_link)) == HA_OFFSET_ERROR) DBUG_RETURN(1); /* If we read the page from the key cache, we have to write it back to it */ - if (key_file_length == info->state->key_file_length) + if (page_link->changed) { - if (_ma_write_keypage(info, keyinfo, filepos, DFLT_INIT_HITS, anc_buff)) + pop_dynamic(&info->pinned_pages); + if (_ma_write_keypage(info, keyinfo, filepos, + PAGECACHE_LOCK_WRITE_UNLOCK, + DFLT_INIT_HITS, anc_buff)) DBUG_RETURN(1); } else if (my_pwrite(info->s->kfile.file, anc_buff, @@ -4882,6 +4892,7 @@ int _ma_flush_pending_blocks(MARIA_SORT_PARAM *sort_param) myf myf_rw=sort_info->param->myf_rw; MARIA_HA *info=sort_info->info; MARIA_KEYDEF *keyinfo=sort_param->keyinfo; + MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link; DBUG_ENTER("_ma_flush_pending_blocks"); filepos= HA_OFFSET_ERROR; /* if empty file */ @@ -4894,13 +4905,16 @@ int _ma_flush_pending_blocks(MARIA_SORT_PARAM *sort_param) _ma_kpointer(info,key_block->end_pos,filepos); key_file_length=info->state->key_file_length; bzero(key_block->buff+length, keyinfo->block_length-length); - if ((filepos= _ma_new(info,keyinfo,DFLT_INIT_HITS)) == HA_OFFSET_ERROR) + if ((filepos= _ma_new(info, DFLT_INIT_HITS, &page_link)) == + HA_OFFSET_ERROR) DBUG_RETURN(1); /* If we read the page from the key cache, we have to write it back */ - if (key_file_length == info->state->key_file_length) + if (page_link->changed) { + pop_dynamic(&info->pinned_pages); if (_ma_write_keypage(info, keyinfo, filepos, + PAGECACHE_LOCK_WRITE_UNLOCK, DFLT_INIT_HITS, key_block->buff)) DBUG_RETURN(1); } @@ -5583,7 +5597,7 @@ static int _ma_safe_scan_block_record(MARIA_SORT_INFO *sort_info, length= uint2korr(info->scan.dir + 2); end_of_data= data + length; info->scan.dir-= DIR_ENTRY_SIZE; /* Point to previous row */ - + if (end_of_data > info->scan.dir_end || offset < PAGE_HEADER_SIZE || length < info->s->base.min_block_length) { @@ -5619,7 +5633,7 @@ read_next_page: PAGECACHE_READ_UNKNOWN_PAGE, PAGECACHE_LOCK_LEFT_UNLOCKED, 0))) 
DBUG_RETURN(my_errno); - + page_type= (info->scan.page_buff[PAGE_TYPE_OFFSET] & PAGE_TYPE_MASK); if (page_type == HEAD_PAGE) @@ -5724,3 +5738,21 @@ static int write_log_record_for_repair(const HA_CHECK *param, MARIA_HA *info) } return 0; } + + +/* Give error message why reading of key page failed */ + +static void report_keypage_fault(HA_CHECK *param, my_off_t position) +{ + char buff[11]; + + if (my_errno == HA_ERR_CRASHED) + _ma_check_print_error(param, + "Wrong base information on indexpage at filepos: %s", + llstr(position, buff)); + else + _ma_check_print_error(param, + "Can't read indexpage from filepos: %s, " + "error: %d", + llstr(position,buff), my_errno); +} diff --git a/storage/maria/ma_checkpoint.c b/storage/maria/ma_checkpoint.c index 76f1723b053..ee1f3182278 100644 --- a/storage/maria/ma_checkpoint.c +++ b/storage/maria/ma_checkpoint.c @@ -32,7 +32,6 @@ #include "maria_def.h" #include "ma_pagecache.h" -#include "trnman.h" #include "ma_blockrec.h" #include "ma_checkpoint.h" #include "ma_loghandler_lsn.h" diff --git a/storage/maria/ma_close.c b/storage/maria/ma_close.c index 9b654803945..27700ff4ff3 100644 --- a/storage/maria/ma_close.c +++ b/storage/maria/ma_close.c @@ -148,7 +148,8 @@ int maria_close(register MARIA_HA *info) error = my_errno; } - my_free((uchar*) info,MYF(0)); + delete_dynamic(&info->pinned_pages); + my_free(info, MYF(0)); if (error) DBUG_RETURN(my_errno= error); diff --git a/storage/maria/ma_control_file.c b/storage/maria/ma_control_file.c index 9ebe21ac15a..122fa9f38ee 100644 --- a/storage/maria/ma_control_file.c +++ b/storage/maria/ma_control_file.c @@ -92,6 +92,7 @@ CONTROL_FILE_ERROR ma_control_file_create_or_open() { char buffer[CONTROL_FILE_SIZE]; char name[FN_REFLEN]; + const char *errmsg; MY_STAT stat_buff; my_bool create_file; int open_flags= O_BINARY | /*O_DIRECT |*/ O_RDWR; @@ -121,7 +122,8 @@ CONTROL_FILE_ERROR ma_control_file_create_or_open() if (maria_in_recovery) DBUG_RETURN(CONTROL_FILE_MISSING); if ((control_file_fd= 
my_create(name, 0, - open_flags, MYF(MY_SYNC_DIR))) < 0) + open_flags, + MYF(MY_SYNC_DIR | MY_WME))) < 0) DBUG_RETURN(CONTROL_FILE_UNKNOWN_ERROR); /* Create unique uuid for the control file */ @@ -153,10 +155,16 @@ CONTROL_FILE_ERROR ma_control_file_create_or_open() /* Otherwise, file exists */ if ((control_file_fd= my_open(name, open_flags, MYF(MY_WME))) < 0) + { + errmsg= "Can't open file"; goto err; + } - if (my_stat(name, &stat_buff, MYF(MY_WME)) == NULL) + if (my_stat(name, &stat_buff, MYF(0)) == NULL) + { + errmsg= "Can't read status"; goto err; + } if ((uint)stat_buff.st_size < CONTROL_FILE_SIZE) { @@ -176,6 +184,7 @@ CONTROL_FILE_ERROR ma_control_file_create_or_open() MySQL's error log at startup. */ error= CONTROL_FILE_TOO_SMALL; + errmsg= "File size to small"; goto err; } @@ -183,17 +192,21 @@ CONTROL_FILE_ERROR ma_control_file_create_or_open() { /* TODO: store "too big file" message */ error= CONTROL_FILE_TOO_BIG; + errmsg= "File size bigger than expected"; goto err; } - if (my_read(control_file_fd, buffer, CONTROL_FILE_SIZE, - MYF(MY_FNABP | MY_WME))) + if (my_read(control_file_fd, buffer, CONTROL_FILE_SIZE, MYF(MY_FNABP))) + { + errmsg= "Can't read file"; goto err; + } if (memcmp(buffer + CONTROL_FILE_MAGIC_STRING_OFFSET, CONTROL_FILE_MAGIC_STRING, CONTROL_FILE_MAGIC_STRING_SIZE)) { /* TODO: store message "bad magic string" somewhere */ error= CONTROL_FILE_BAD_MAGIC_STRING; + errmsg= "Missing valid id at start of file"; goto err; } memcpy(maria_uuid, buffer + CONTROL_FILE_UUID_OFFSET, @@ -203,15 +216,19 @@ CONTROL_FILE_ERROR ma_control_file_create_or_open() CONTROL_FILE_SIZE - CONTROL_FILE_LSN_OFFSET) != uint4korr(buffer + CONTROL_FILE_CHECKSUM_OFFSET)) { - /* TODO: store message "checksum mismatch" somewhere */ error= CONTROL_FILE_BAD_CHECKSUM; + errmsg= "Checksum missmatch"; goto err; } last_checkpoint_lsn= lsn_korr(buffer + CONTROL_FILE_LSN_OFFSET); last_logno= uint4korr(buffer + CONTROL_FILE_FILENO_OFFSET); DBUG_RETURN(0); + err: + 
my_printf_error(HA_ERR_INITIALIZATION, + "Error when trying to use maria control file '%s': %s", 0, + name, errmsg); ma_control_file_end(); DBUG_RETURN(error); } @@ -247,7 +264,7 @@ err: */ int ma_control_file_write_and_force(const LSN checkpoint_lsn, uint32 logno, - uint objs_to_write) + uint objs_to_write) { char buffer[CONTROL_FILE_SIZE]; my_bool update_checkpoint_lsn= FALSE, update_logno= FALSE; diff --git a/storage/maria/ma_create.c b/storage/maria/ma_create.c index cdb28e11ae3..95e1fa1fdcb 100644 --- a/storage/maria/ma_create.c +++ b/storage/maria/ma_create.c @@ -188,6 +188,8 @@ int maria_create(const char *name, enum data_file_type datafile_type, max_field_lengths++; packed++; column->fill_length= 1; + options|= HA_OPTION_NULL_FIELDS; /* Use ma_checksum() */ + /* We must test for 257 as length includes pack-length */ if (test(column->length >= 257)) { @@ -270,16 +272,17 @@ int maria_create(const char *name, enum data_file_type datafile_type, { options|= HA_OPTION_TMP_TABLE; tmp_table= TRUE; - create_mode|= O_EXCL | O_NOFOLLOW; + create_mode|= O_NOFOLLOW; /* "CREATE TEMPORARY" tables are not crash-safe (dropped at restart) */ ci->transactional= FALSE; + flags&= ~HA_CREATE_PAGE_CHECKSUM; } share.base.null_bytes= ci->null_bytes; share.base.original_null_bytes= ci->null_bytes; share.base.born_transactional= ci->transactional; share.base.max_field_lengths= max_field_lengths; share.base.field_offsets= 0; /* for future */ - + if (pack_reclength != INT_MAX32) pack_reclength+= max_field_lengths + long_varchar_count; @@ -654,9 +657,9 @@ int maria_create(const char *name, enum data_file_type datafile_type, share.state.dellink = HA_OFFSET_ERROR; share.state.first_bitmap_with_space= 0; +#ifdef EXTERNAL_LOCKING share.state.process= (ulong) getpid(); - share.state.unique= (ulong) 0; - share.state.update_count=(ulong) 0; +#endif share.state.version= (ulong) time((time_t*) 0); share.state.sortkey= (ushort) ~0; share.state.auto_increment=ci->auto_increment; @@ -957,7 +960,7 
@@ int maria_create(const char *name, enum data_file_type datafile_type, char empty_string[]= ""; LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 4]; uint total_rec_length= 0; - uint i; + uint k; LSN lsn; log_array[TRANSLOG_INTERNAL_PARTS + 1].length= 1 + 2 + 2 + kfile_size_before_extension; @@ -988,9 +991,9 @@ int maria_create(const char *name, enum data_file_type datafile_type, (ci->index_file_name ? ci->index_file_name : empty_string); log_array[TRANSLOG_INTERNAL_PARTS + 3].length= strlen(log_array[TRANSLOG_INTERNAL_PARTS + 3].str) + 1; - for (i= TRANSLOG_INTERNAL_PARTS; - i < (sizeof(log_array)/sizeof(log_array[0])); i++) - total_rec_length+= log_array[i].length; + for (k= TRANSLOG_INTERNAL_PARTS; + k < (sizeof(log_array)/sizeof(log_array[0])); k++) + total_rec_length+= log_array[k].length; /** For this record to be of any use for Recovery, we need the upper MySQL layer to be crash-safe, which it is not now (that would require diff --git a/storage/maria/ma_delete.c b/storage/maria/ma_delete.c index 543be5142d2..fb13118ab75 100644 --- a/storage/maria/ma_delete.c +++ b/storage/maria/ma_delete.c @@ -17,19 +17,26 @@ #include "ma_fulltext.h" #include "ma_rt_index.h" +#include "trnman.h" +#include "ma_key_recover.h" static int d_search(MARIA_HA *info,MARIA_KEYDEF *keyinfo,uint comp_flag, - uchar *key,uint key_length,my_off_t page,uchar *anc_buff); -static int del(MARIA_HA *info,MARIA_KEYDEF *keyinfo,uchar *key,uchar *anc_buff, - my_off_t leaf_page,uchar *leaf_buff,uchar *keypos, - my_off_t next_block,uchar *ret_key); + uchar *key, uint key_length, + my_off_t page, uchar *anc_buff, + MARIA_PINNED_PAGE *anc_page_link); +static int del(MARIA_HA *info,MARIA_KEYDEF *keyinfo, uchar *key, + uchar *anc_buff, my_off_t leaf_page, uchar *leaf_buff, + MARIA_PINNED_PAGE *leaf_page_link, uchar *keypos, + my_off_t next_block, uchar *ret_key); static int underflow(MARIA_HA *info,MARIA_KEYDEF *keyinfo,uchar *anc_buff, - my_off_t leaf_page,uchar *leaf_buff,uchar *keypos); + my_off_t 
leaf_page,uchar *leaf_buff, + MARIA_PINNED_PAGE *leaf_page_link, uchar *keypos); static uint remove_key(MARIA_KEYDEF *keyinfo,uint nod_flag,uchar *keypos, uchar *lastkey,uchar *page_end, - my_off_t *next_block); -static int _ma_ck_real_delete(register MARIA_HA *info,MARIA_KEYDEF *keyinfo, - uchar *key, uint key_length, my_off_t *root); + my_off_t *next_block, MARIA_KEY_PARAM *s_temp); +static my_bool _ma_log_delete(MARIA_HA *info, my_off_t page, uchar *buff, + uchar *key_pos, uint move_length, + uint change_length); int maria_delete(MARIA_HA *info,const uchar *record) @@ -108,7 +115,7 @@ int maria_delete(MARIA_HA *info,const uchar *record) info->update= HA_STATE_CHANGED+HA_STATE_DELETED+HA_STATE_ROW_CHANGED; info->state->records-= !share->now_transactional; share->state.changed|= STATE_NOT_OPTIMIZED_ROWS; - + mi_sizestore(lastpos, info->cur_row.lastpos); VOID(_ma_writeinfo(info,WRITEINFO_UPDATE_KEYFILE)); allow_break(); /* Allow SIGHUP & SIGINT */ @@ -142,23 +149,75 @@ err: } /* maria_delete */ - /* Remove a key from the btree index */ +/* Remove a key from the btree index */ int _ma_ck_delete(register MARIA_HA *info, uint keynr, uchar *key, uint key_length) { - return _ma_ck_real_delete(info, info->s->keyinfo+keynr, key, key_length, - &info->s->state.key_root[keynr]); + int res; + LSN lsn= LSN_IMPOSSIBLE; + my_off_t new_root= info->s->state.key_root[keynr]; + DBUG_ENTER("_ma_ck_delete"); + + res= _ma_ck_real_delete(info, info->s->keyinfo+keynr, key, key_length, + &new_root); + + if (!res && info->s->now_transactional) + { + uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + + KEY_NR_STORE_SIZE + PAGE_STORE_SIZE], *log_pos; + LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; + struct st_msg_to_write_hook_for_undo_key msg; + enum translog_record_type log_type= LOGREC_UNDO_KEY_DELETE; + + lsn_store(log_data, info->trn->undo_lsn); + key_nr_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE, keynr); + log_pos= log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE + 
KEY_NR_STORE_SIZE; + + if (new_root != info->s->state.key_root[keynr]) + { + my_off_t page; + page= ((new_root == HA_OFFSET_ERROR) ? IMPOSSIBLE_PAGE_NO : + new_root / info->s->block_size); + page_store(log_pos, page); + log_pos+= PAGE_STORE_SIZE; + log_type= LOGREC_UNDO_KEY_DELETE_WITH_ROOT; + } + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - log_data); + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= (char*) key; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= key_length; + + msg.root= &info->s->state.key_root[keynr]; + msg.value= new_root; + + if (translog_write_record(&lsn, log_type, + info->trn, info, + log_array[TRANSLOG_INTERNAL_PARTS + 0].length + + key_length, + TRANSLOG_INTERNAL_PARTS + 2, log_array, + log_data + LSN_STORE_SIZE, &msg)) + res= -1; + } + else + { + info->s->state.key_root[keynr]= new_root; + _ma_fast_unlock_key_del(info); + } + _ma_unpin_all_pages_and_finalize_row(info, lsn); + DBUG_RETURN(res); } /* _ma_ck_delete */ -static int _ma_ck_real_delete(register MARIA_HA *info, MARIA_KEYDEF *keyinfo, - uchar *key, uint key_length, my_off_t *root) +int _ma_ck_real_delete(register MARIA_HA *info, MARIA_KEYDEF *keyinfo, + uchar *key, uint key_length, my_off_t *root) { int error; uint nod_flag; my_off_t old_root; uchar *root_buff; + MARIA_PINNED_PAGE *page_link; DBUG_ENTER("_ma_ck_real_delete"); if ((old_root=*root) == HA_OFFSET_ERROR) @@ -167,13 +226,15 @@ static int _ma_ck_real_delete(register MARIA_HA *info, MARIA_KEYDEF *keyinfo, DBUG_RETURN(my_errno=HA_ERR_CRASHED); } if (!(root_buff= (uchar*) my_alloca((uint) keyinfo->block_length+ - HA_MAX_KEY_BUFF*2))) + HA_MAX_KEY_BUFF*2))) { DBUG_PRINT("error",("Couldn't allocate memory")); DBUG_RETURN(my_errno=ENOMEM); } DBUG_PRINT("info",("root_page: %ld", (long) old_root)); - if (!_ma_fetch_keypage(info,keyinfo,old_root,DFLT_INIT_HITS,root_buff,0)) + if (!_ma_fetch_keypage(info, keyinfo, old_root, + PAGECACHE_LOCK_WRITE, 
DFLT_INIT_HITS, root_buff, 0, + &page_link)) { error= -1; goto err; @@ -181,7 +242,7 @@ static int _ma_ck_real_delete(register MARIA_HA *info, MARIA_KEYDEF *keyinfo, if ((error=d_search(info,keyinfo, (keyinfo->flag & HA_FULLTEXT ? SEARCH_FIND | SEARCH_UPDATE : SEARCH_SAME), - key,key_length,old_root,root_buff)) >0) + key, key_length, old_root, root_buff, page_link)) >0) { if (error == 2) { @@ -192,6 +253,7 @@ static int _ma_ck_real_delete(register MARIA_HA *info, MARIA_KEYDEF *keyinfo, { uint used_length; _ma_get_used_and_nod(info, root_buff, used_length, nod_flag); + page_link->changed= 1; if (used_length <= nod_flag + info->s->keypage_header + 1) { error=0; @@ -200,12 +262,13 @@ static int _ma_ck_real_delete(register MARIA_HA *info, MARIA_KEYDEF *keyinfo, nod_flag); else *root=HA_OFFSET_ERROR; - if (_ma_dispose(info,keyinfo,old_root,DFLT_INIT_HITS)) + if (_ma_dispose(info, old_root, 0)) error= -1; } else - error= _ma_write_keypage(info,keyinfo,old_root, - DFLT_INIT_HITS,root_buff); + error= _ma_write_keypage(info,keyinfo, old_root, + PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS, root_buff); } } err: @@ -218,15 +281,19 @@ err: /* @brief Remove key below key root + @param key Key to delete. 
Will contain new key if block was enlarged + @return - @retval 1 If there are less buffers; In this case anc_buff is not saved - @retval 2 If there are more buffers - @retval -1 On errors + @retval 0 ok (anc_page is not changed) + @retval 1 If data on page is too small; In this case anc_buff is not saved + @retval 2 If data on page is too big + @retval -1 On errors */ static int d_search(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, uint comp_flag, uchar *key, uint key_length, - my_off_t page, uchar *anc_buff) + my_off_t anc_page, uchar *anc_buff, + MARIA_PINNED_PAGE *anc_page_link) { int flag,ret_value,save_flag; uint length,nod_flag,search_key_length; @@ -234,6 +301,8 @@ static int d_search(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, uchar *leaf_buff,*keypos; my_off_t leaf_page,next_block; uchar lastkey[HA_MAX_KEY_BUFF]; + MARIA_PINNED_PAGE *leaf_page_link; + MARIA_KEY_PARAM s_temp; DBUG_ENTER("d_search"); DBUG_DUMP("page",anc_buff,_ma_get_page_used(info, anc_buff)); @@ -279,7 +348,8 @@ static int d_search(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, my_off_t root; uchar *kpos=keypos; - if (!(tmp_key_length=(*keyinfo->get_key)(keyinfo,nod_flag,&kpos,lastkey))) + if (!(tmp_key_length=(*keyinfo->get_key)(keyinfo,nod_flag,&kpos, + lastkey))) { maria_print_error(info->s, HA_ERR_CRASHED); my_errno= HA_ERR_CRASHED; @@ -289,24 +359,29 @@ static int d_search(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, if (subkeys == -1) { /* the last entry in sub-tree */ - if (_ma_dispose(info, keyinfo, root,DFLT_INIT_HITS)) + if (_ma_dispose(info, root, 1)) DBUG_RETURN(-1); /* fall through to normal delete */ } else { keyinfo=&info->s->ft2_keyinfo; - kpos-=keyinfo->keylength+nod_flag; /* we'll modify key entry 'in vivo' */ + /* we'll modify key entry 'in vivo' */ + kpos-=keyinfo->keylength+nod_flag; get_key_full_length_rdonly(off, key); key+=off; ret_value= _ma_ck_real_delete(info, &info->s->ft2_keyinfo, - key, HA_FT_WLEN, &root); + key, 
HA_FT_WLEN, &root); _ma_dpointer(info, kpos+HA_FT_WLEN, root); subkeys++; ft_intXstore(kpos, subkeys); if (!ret_value) - ret_value= _ma_write_keypage(info,keyinfo,page, - DFLT_INIT_HITS,anc_buff); + { + anc_page_link->changed= 1; + ret_value= _ma_write_keypage(info, keyinfo, anc_page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS, anc_buff); + } DBUG_PRINT("exit",("Return: %d",ret_value)); DBUG_RETURN(ret_value); } @@ -320,12 +395,13 @@ static int d_search(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, if (!(leaf_buff= (uchar*) my_alloca((uint) keyinfo->block_length+ HA_MAX_KEY_BUFF*2))) { - DBUG_PRINT("error",("Couldn't allocate memory")); + DBUG_PRINT("error", ("Couldn't allocate memory")); my_errno=ENOMEM; - DBUG_PRINT("exit",("Return: %d",-1)); DBUG_RETURN(-1); } - if (!_ma_fetch_keypage(info,keyinfo,leaf_page,DFLT_INIT_HITS,leaf_buff,0)) + if (!_ma_fetch_keypage(info,keyinfo,leaf_page, + PAGECACHE_LOCK_WRITE, DFLT_INIT_HITS, leaf_buff, + 0, &leaf_page_link)) goto err; } @@ -339,47 +415,64 @@ static int d_search(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, goto err; } save_flag=0; - ret_value=d_search(info,keyinfo,comp_flag,key,key_length, - leaf_page,leaf_buff); + ret_value=d_search(info, keyinfo, comp_flag, key, key_length, + leaf_page, leaf_buff, leaf_page_link); } else { /* Found key */ uint tmp; length= _ma_get_page_used(info, anc_buff); if (!(tmp= remove_key(keyinfo,nod_flag,keypos,lastkey,anc_buff+length, - &next_block))) + &next_block, &s_temp))) goto err; + anc_page_link->changed= 1; length-= tmp; - _ma_store_page_used(info, anc_buff, length, nod_flag); + + /* + Log initial changes on pages + If there is an underflow, there will be more changes logged to the + page + */ + if (info->s->now_transactional && + _ma_log_delete(info, anc_page, anc_buff, s_temp.key_pos, + s_temp.move_length, s_temp.changed_length)) + DBUG_RETURN(-1); + if (!nod_flag) { /* On leaf page */ - if 
(_ma_write_keypage(info,keyinfo,page,DFLT_INIT_HITS,anc_buff)) + if (test(length <= (info->quick_mode ? + MARIA_MIN_KEYBLOCK_LENGTH : + (uint) keyinfo->underflow_block_length))) { - DBUG_PRINT("exit",("Return: %d",-1)); - DBUG_RETURN(-1); + /* Page will be written by caller if we return 1 */ + DBUG_RETURN(1); } - /* Page will be update later if we return 1 */ - DBUG_RETURN(test(length <= (info->quick_mode ? MARIA_MIN_KEYBLOCK_LENGTH : - (uint) keyinfo->underflow_block_length))); + if (_ma_write_keypage(info, keyinfo, anc_page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS, + anc_buff)) + DBUG_RETURN(-1); + DBUG_RETURN(0); } - save_flag=1; - ret_value=del(info,keyinfo,key,anc_buff,leaf_page,leaf_buff,keypos, - next_block,lastkey); + save_flag=1; /* Mark that anc_buff is changed */ + ret_value= del(info, keyinfo, key, anc_buff, leaf_page, leaf_buff, + leaf_page_link, keypos, next_block, lastkey); } if (ret_value >0) { save_flag=1; if (ret_value == 1) - ret_value= underflow(info,keyinfo,anc_buff,leaf_page,leaf_buff,keypos); + ret_value= underflow(info, keyinfo, anc_buff, leaf_page, leaf_buff, + leaf_page_link, keypos); else { /* This happens only with packed keys */ DBUG_PRINT("test",("Enlarging of key when deleting")); if (!_ma_get_last_key(info,keyinfo,anc_buff,lastkey,keypos,&length)) goto err; - ret_value= _ma_insert(info,keyinfo,key,anc_buff,keypos,lastkey, - (uchar*) 0,(uchar*) 0,(my_off_t) 0,(my_bool) 0); + ret_value= _ma_insert(info, keyinfo, key, anc_buff, keypos, anc_page, + lastkey, (my_off_t) 0, (uchar*) 0, + (MARIA_PINNED_PAGE*) 0, (uchar*) 0, (my_bool) 0); } } if (ret_value == 0 && _ma_get_page_used(info, anc_buff) > @@ -389,7 +482,12 @@ static int d_search(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, ret_value= _ma_split_page(info,keyinfo,key,anc_buff,lastkey,0) | 2; } if (save_flag && ret_value != 1) - ret_value|= _ma_write_keypage(info,keyinfo,page,DFLT_INIT_HITS,anc_buff); + { + anc_page_link->changed= 1; + ret_value|= 
_ma_write_keypage(info, keyinfo, anc_page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS, anc_buff); + } else { DBUG_DUMP("page", anc_buff, _ma_get_page_used(info, anc_buff)); @@ -405,11 +503,11 @@ err: } /* d_search */ - /* Remove a key that has a page-reference */ +/* Remove a key that has a page-reference */ static int del(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, uchar *key, uchar *anc_buff, my_off_t leaf_page, - uchar *leaf_buff, + uchar *leaf_buff, MARIA_PINNED_PAGE *leaf_page_link, uchar *keypos, /* Pos to where deleted key was */ my_off_t next_block, uchar *ret_key) /* key before keypos in anc_buff */ @@ -420,6 +518,7 @@ static int del(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, uchar keybuff[HA_MAX_KEY_BUFF],*endpos,*next_buff,*key_start, *prev_key; MARIA_SHARE *share=info->s; MARIA_KEY_PARAM s_temp; + MARIA_PINNED_PAGE *next_page_link; DBUG_ENTER("del"); DBUG_PRINT("enter",("leaf_page: %ld keypos: 0x%lx", (long) leaf_page, (ulong) keypos)); @@ -438,20 +537,21 @@ static int del(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, if (!(next_buff= (uchar*) my_alloca((uint) keyinfo->block_length+ HA_MAX_KEY_BUFF*2))) DBUG_RETURN(-1); - if (!_ma_fetch_keypage(info,keyinfo,next_page,DFLT_INIT_HITS,next_buff,0)) + if (!_ma_fetch_keypage(info, keyinfo, next_page, PAGECACHE_LOCK_WRITE, + DFLT_INIT_HITS, next_buff, 0, &next_page_link)) ret_value= -1; else { DBUG_DUMP("next_page", next_buff, _ma_get_page_used(info, next_buff)); - if ((ret_value=del(info,keyinfo,key,anc_buff,next_page,next_buff, - keypos,next_block,ret_key)) >0) + if ((ret_value= del(info,keyinfo,key,anc_buff, next_page, next_buff, + next_page_link, keypos, next_block, ret_key)) >0) { /* Get new length after key was deleted */ endpos=leaf_buff+_ma_get_page_used(info, leaf_buff); if (ret_value == 1) { - ret_value=underflow(info,keyinfo,leaf_buff,next_page, - next_buff,endpos); + ret_value= underflow(info, keyinfo, leaf_buff, next_page, + next_buff, 
next_page_link, endpos); if (ret_value == 0 && _ma_get_page_used(info, leaf_buff) > (uint) (keyinfo->block_length - KEYPAGE_CHECKSUM_SIZE)) @@ -466,11 +566,15 @@ static int del(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, if (!_ma_get_last_key(info,keyinfo,leaf_buff,keybuff,endpos, &tmp)) goto err; - ret_value= _ma_insert(info,keyinfo,key,leaf_buff,endpos,keybuff, - (uchar*) 0,(uchar*) 0,(my_off_t) 0,0); + ret_value= _ma_insert(info, keyinfo, key, leaf_buff, endpos, + leaf_page, keybuff, (my_off_t) 0, (uchar*) 0, + (MARIA_PINNED_PAGE *) 0, (uchar*) 0, 0); } } - if (_ma_write_keypage(info,keyinfo,leaf_page,DFLT_INIT_HITS,leaf_buff)) + leaf_page_link->changed= 1; + if (_ma_write_keypage(info, keyinfo, leaf_page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS, leaf_buff)) goto err; } my_afree(next_buff); @@ -478,8 +582,11 @@ static int del(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, } /* Remove last key from leaf page */ + leaf_page_link->changed= 1; _ma_store_page_used(info, leaf_buff, key_start-leaf_buff, nod_flag); - if (_ma_write_keypage(info,keyinfo,leaf_page,DFLT_INIT_HITS,leaf_buff)) + if (_ma_write_keypage(info, keyinfo, leaf_page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS, + leaf_buff)) goto err; /* Place last key in ancestor page on deleted key position */ @@ -521,6 +628,7 @@ static int underflow(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, uchar *anc_buff, my_off_t leaf_page,/* Ancestor page and underflow page */ uchar *leaf_buff, + MARIA_PINNED_PAGE *leaf_page_link, uchar *keypos) /* Position to pos after key */ { int t_length; @@ -532,6 +640,7 @@ static int underflow(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, uchar *after_key; MARIA_KEY_PARAM s_temp; MARIA_SHARE *share=info->s; + MARIA_PINNED_PAGE *next_page_link; DBUG_ENTER("underflow"); DBUG_PRINT("enter",("leaf_page: %ld keypos: 0x%lx",(long) leaf_page, (ulong) keypos)); @@ -569,7 +678,8 @@ static int underflow(register MARIA_HA *info, 
register MARIA_KEYDEF *keyinfo, goto err; } next_page= _ma_kpos(key_reflength,next_keypos); - if (!_ma_fetch_keypage(info,keyinfo,next_page,DFLT_INIT_HITS,buff,0)) + if (!_ma_fetch_keypage(info,keyinfo, next_page, PAGECACHE_LOCK_WRITE, + DFLT_INIT_HITS, buff, 0, &next_page_link)) goto err; buff_length= _ma_get_page_used(info, buff); DBUG_DUMP("next",buff,buff_length); @@ -599,7 +709,7 @@ static int underflow(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, /* remove key from anc_buff */ if (!(s_length=remove_key(keyinfo,key_reflength,keypos,anc_key, - anc_buff+anc_length,(my_off_t *) 0))) + anc_buff+anc_length,(my_off_t *) 0, &s_temp))) goto err; anc_length-=s_length; @@ -608,7 +718,8 @@ static int underflow(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, if (buff_length <= (uint) (keyinfo->block_length - KEYPAGE_CHECKSUM_SIZE)) { /* Keys in one page */ memcpy(leaf_buff,buff,(size_t) buff_length); - if (_ma_dispose(info,keyinfo,next_page,DFLT_INIT_HITS)) + next_page_link->changed= 1; + if (_ma_dispose(info, next_page, 0)) goto err; } else @@ -660,10 +771,16 @@ static int underflow(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, (*keyinfo->store_key)(keyinfo,buff+p_length,&s_temp); _ma_store_page_used(info, buff, length + t_length + p_length, nod_flag); - if (_ma_write_keypage(info,keyinfo,next_page,DFLT_INIT_HITS,buff)) + next_page_link->changed= 1; + if (_ma_write_keypage(info, keyinfo, next_page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS, + buff)) goto err; } - if (_ma_write_keypage(info,keyinfo,leaf_page,DFLT_INIT_HITS,leaf_buff)) + leaf_page_link->changed= 1; + if (_ma_write_keypage(info, keyinfo, leaf_page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS, + leaf_buff)) goto err; DBUG_RETURN(anc_length <= ((info->quick_mode ? 
MARIA_MIN_BLOCK_LENGTH : (uint) keyinfo->underflow_block_length))); @@ -675,8 +792,9 @@ static int underflow(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, if (!keypos) goto err; next_page= _ma_kpos(key_reflength,keypos); - if (!_ma_fetch_keypage(info,keyinfo,next_page,DFLT_INIT_HITS,buff,0)) - goto err; + if (!_ma_fetch_keypage(info, keyinfo, next_page, PAGECACHE_LOCK_WRITE, + DFLT_INIT_HITS, buff, 0, &next_page_link)) + goto err; buff_length= _ma_get_page_used(info, buff); endpos=buff+buff_length; DBUG_DUMP("prev",buff,buff_length); @@ -711,7 +829,7 @@ static int underflow(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, /* remove key from anc_buff */ if (!(s_length= remove_key(keyinfo,key_reflength,keypos,anc_key, - anc_buff+anc_length,(my_off_t *) 0))) + anc_buff+anc_length,(my_off_t *) 0, &s_temp))) goto err; anc_length-=s_length; @@ -719,7 +837,8 @@ static int underflow(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, if (buff_length <= (uint) (keyinfo->block_length - KEYPAGE_CHECKSUM_SIZE)) { /* Keys in one page */ - if (_ma_dispose(info,keyinfo,leaf_page,DFLT_INIT_HITS)) + leaf_page_link->changed= 1; + if (_ma_dispose(info, leaf_page, 0)) goto err; } else @@ -768,11 +887,16 @@ static int underflow(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, (*keyinfo->store_key)(keyinfo,leaf_buff+p_length,&s_temp); _ma_store_page_used(info, leaf_buff, length + t_length + p_length, nod_flag); - if (_ma_write_keypage(info,keyinfo,leaf_page,DFLT_INIT_HITS,leaf_buff)) + leaf_page_link->changed= 1; + if (_ma_write_keypage(info, keyinfo, leaf_page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS, + leaf_buff)) goto err; _ma_store_page_used(info, buff, (uint) (endpos - buff),nod_flag); } - if (_ma_write_keypage(info,keyinfo,next_page,DFLT_INIT_HITS,buff)) + next_page_link->changed= 1; + if (_ma_write_keypage(info, keyinfo, next_page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS, buff)) goto err; DBUG_RETURN(anc_length <= (uint) 
(keyinfo->block_length - KEYPAGE_CHECKSUM_SIZE)/2); @@ -782,25 +906,43 @@ err: } /* underflow */ - /* - remove a key from packed buffert - The current code doesn't handle the case that the next key may be - packed better against the previous key if there is a case difference - returns how many chars was removed or 0 on error - */ +/* + @brief Remove a key from page + + @fn remove_key() + keyinfo Key handle + keypos Where on page key starts + lastkey Unpacked version of key to be removed + page_end Pointer to end of page + next_block If <> 0 and node-page, this is set to address of + next page + s_temp Information about what changes was done one the page: + s_temp.key_pos Start of key + s_temp.move_length Number of bytes removed at keypos + s_temp.changed_length Number of bytes changed at keypos + + @todo + The current code doesn't handle the case that the next key may be + packed better against the previous key if there is a case difference + + @return + @retval 0 error + @retval # How many chars was removed +*/ static uint remove_key(MARIA_KEYDEF *keyinfo, uint nod_flag, - uchar *keypos, /* Where key starts */ - uchar *lastkey, /* key to be removed */ - uchar *page_end, /* End of page */ - my_off_t *next_block) /* ptr to next block */ + uchar *keypos, uchar *lastkey, + uchar *page_end, my_off_t *next_block, + MARIA_KEY_PARAM *s_temp) { int s_length; uchar *start; DBUG_ENTER("remove_key"); - DBUG_PRINT("enter",("keypos: 0x%lx page_end: 0x%lx",(long) keypos, (long) page_end)); + DBUG_PRINT("enter", ("keypos: 0x%lx page_end: 0x%lx", + (long) keypos, (long) page_end)); - start=keypos; + start= s_temp->key_pos= keypos; + s_temp->changed_length= 0; if (!(keyinfo->flag & (HA_PACK_KEY | HA_SPACE_PACK_USED | HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY))) @@ -822,18 +964,21 @@ static uint remove_key(MARIA_KEYDEF *keyinfo, uint nod_flag, { if (keyinfo->flag & HA_BINARY_PACK_KEY) { - uchar *old_key=start; + uchar *old_key= start; uint next_length,prev_length,prev_pack_length; + + 
/* keypos points here on start of next key */ get_key_length(next_length,keypos); get_key_pack_length(prev_length,prev_pack_length,old_key); if (next_length > prev_length) { + uint diff= (next_length-prev_length); /* We have to copy data from the current key to the next key */ - bmove_upp(keypos, (lastkey+next_length), - (next_length-prev_length)); - keypos-=(next_length-prev_length)+prev_pack_length; - store_key_length(keypos,prev_length); + keypos-= diff + prev_pack_length; + store_key_length(keypos, prev_length); + bmove(keypos + prev_pack_length, lastkey + prev_length, diff); s_length=(int) (keypos-start); + s_temp->changed_length= diff + prev_pack_length; } } else @@ -874,13 +1019,15 @@ static uint remove_key(MARIA_KEYDEF *keyinfo, uint nod_flag, get_key_length(rest_length,keypos); if (next_length >= prev_length) - { /* Key after is based on deleted key */ - uint pack_length,tmp; - bmove_upp(keypos, (lastkey+next_length), - tmp=(next_length-prev_length)); - rest_length+=tmp; + { + /* Next key is based on deleted key */ + uint pack_length; + uint diff= (next_length-prev_length); + + bmove(keypos - diff, lastkey + prev_length, diff); + rest_length+= diff; pack_length= prev_length ? 
get_pack_length(rest_length): 0; - keypos-=tmp+pack_length+prev_pack_length; + keypos-= diff + pack_length + prev_pack_length; s_length=(int) (keypos-start); if (prev_length) /* Pack against prev key */ { @@ -903,6 +1050,7 @@ static uint remove_key(MARIA_KEYDEF *keyinfo, uint nod_flag, else *keypos= rest_length; } + s_temp->changed_length= diff + pack_length + prev_pack_length; } } } @@ -910,5 +1058,60 @@ static uint remove_key(MARIA_KEYDEF *keyinfo, uint nod_flag, } end: bmove(start, start+s_length, (uint) (page_end-start-s_length)); + s_temp->move_length= s_length; DBUG_RETURN((uint) s_length); } /* remove_key */ + + +/**************************************************************************** + Logging of redos +****************************************************************************/ + +/* + @brief log entry where some parts are deleted and some things are changed +*/ + +static my_bool _ma_log_delete(MARIA_HA *info, my_off_t page, uchar *buff, + uchar *key_pos, uint move_length, + uint change_length) +{ + LSN lsn; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 9], *log_pos; + LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; + uint translog_parts; + uint offset= (uint) (key_pos - buff); + DBUG_ENTER("_ma_log_delete"); + DBUG_ASSERT(info->s->now_transactional && move_length); + + /* Store address of new root page */ + page/= info->s->block_size; + page_store(log_data + FILEID_STORE_SIZE, page); + log_pos= log_data+ FILEID_STORE_SIZE + PAGE_STORE_SIZE; + log_pos[0]= KEY_OP_OFFSET; + int2store(log_pos+1, offset); + log_pos[3]= KEY_OP_SHIFT; + int2store(log_pos+4, -(int) move_length); + log_pos+= 6; + translog_parts= 1; + if (change_length) + { + log_pos[0]= KEY_OP_CHANGE; + int2store(log_pos+1, change_length); + log_pos+= 3; + translog_parts= 2; + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= buff + offset; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= change_length; + } + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data; + 
log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - log_data); + + if (translog_write_record(&lsn, LOGREC_REDO_INDEX, + info->trn, info, + log_array[TRANSLOG_INTERNAL_PARTS + 0].length + + change_length, + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_array, log_data, NULL)) + DBUG_RETURN(1); + DBUG_RETURN(0); +} diff --git a/storage/maria/ma_ft_update.c b/storage/maria/ma_ft_update.c index ce173993b6d..b793ccd1295 100644 --- a/storage/maria/ma_ft_update.c +++ b/storage/maria/ma_ft_update.c @@ -309,6 +309,7 @@ uint _ma_ft_convert_to_ft2(MARIA_HA *info, uint keynr, uchar *key) MARIA_KEYDEF *keyinfo=&info->s->ft2_keyinfo; uchar *key_ptr= (uchar*) dynamic_array_ptr(da, 0), *end; uint length, key_length; + MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link; DBUG_ENTER("_ma_ft_convert_to_ft2"); /* we'll generate one pageful at once, and insert the rest one-by-one */ @@ -323,16 +324,18 @@ uint _ma_ft_convert_to_ft2(MARIA_HA *info, uint keynr, uchar *key) /* nothing to do here. 
_ma_ck_delete() will populate info->ft1_to_ft2 with deleted keys - */ + */ } /* creating pageful of keys */ + bzero(info->buff, info->s->keypage_header); _ma_store_keynr(info, info->buff, keynr); _ma_store_page_used(info, info->buff, length + info->s->keypage_header, 0); memcpy(info->buff + info->s->keypage_header, key_ptr, length); info->keyread_buff_used= info->page_changed=1; /* info->buff is used */ - if ((root= _ma_new(info,keyinfo,DFLT_INIT_HITS)) == HA_OFFSET_ERROR || - _ma_write_keypage(info,keyinfo,root,DFLT_INIT_HITS,info->buff)) + if ((root= _ma_new(info, DFLT_INIT_HITS, &page_link)) == HA_OFFSET_ERROR || + _ma_write_keypage(info, keyinfo, root, page_link->write_lock, + DFLT_INIT_HITS, info->buff)) DBUG_RETURN(-1); /* inserting the rest of key values */ diff --git a/storage/maria/ma_key_recover.c b/storage/maria/ma_key_recover.c new file mode 100644 index 00000000000..b2655e53260 --- /dev/null +++ b/storage/maria/ma_key_recover.c @@ -0,0 +1,599 @@ +/* Copyright (C) 2007 Michael Widenius + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Redo of index */ + +#include "maria_def.h" +#include "ma_blockrec.h" +#include "trnman.h" +#include "ma_key_recover.h" + +/**************************************************************************** + Some helper functions used both by key page logging and block page logging +****************************************************************************/ + +/* + @brief Unpin all pinned pages + + @fn _ma_unpin_all_pages() + @param info Maria handler + @param undo_lsn LSN for undo pages. LSN_IMPOSSIBLE if we shouldn't write + undo (like on duplicate key errors) + + @note + We unpin pages in the reverse order as they were pinned; This may not + be strictly necessary but may simplify things in the future. + + @note + If the table is not currently transactional, undo_lsn is replaced with + LSN_IMPOSSIBLE before the pages are unlocked. On return the list of + pinned pages is empty (info->pinned_pages.elements is reset to 0). + + @return + Nothing; the function is declared void. +*/ + +void _ma_unpin_all_pages(MARIA_HA *info, LSN undo_lsn) +{ + MARIA_PINNED_PAGE *page_link= ((MARIA_PINNED_PAGE*) + dynamic_array_ptr(&info->pinned_pages, 0)); + /* Points one element past the last pinned page */ MARIA_PINNED_PAGE *pinned_page= page_link + info->pinned_pages.elements; + DBUG_ENTER("_ma_unpin_all_pages"); + DBUG_PRINT("info", ("undo_lsn: %lu", (ulong) undo_lsn)); + + if (!info->s->now_transactional) + undo_lsn= LSN_IMPOSSIBLE; /* don't try to set a LSN on pages */ + + /* Walk the array backwards, from the last pinned page to the first */ while (pinned_page-- != page_link) + { + /* A changed page of a transactional table must get a real undo_lsn */ DBUG_ASSERT(!pinned_page->changed || + undo_lsn != LSN_IMPOSSIBLE || !info->s->now_transactional); + pagecache_unlock_by_link(info->s->pagecache, pinned_page->link, + pinned_page->unlock, PAGECACHE_UNPIN, + info->trn->rec_lsn, undo_lsn, + pinned_page->changed); + } + + info->pinned_pages.elements= 0; + DBUG_VOID_RETURN; +} + + /* + @brief Write a LOGREC_CLR_END record for an undone operation + + @fn _ma_write_clr() + @param info Maria handler + @param undo_lsn LSN stored first in the record; also given to the write hook as previous_undo_lsn + @param undo_type Type of the undo record that was executed; stored in the record and given to the write hook as undone_record_type + @param store_checksum If non-zero, checksum is stored after the CLR type and the record length includes HA_CHECKSUM_STORE_SIZE + @param checksum Checksum value to store (only used if store_checksum is set) + @param res_lsn Passed to translog_write_record() as its first argument + + @note + NOTE(review): log_data is sized with PAGE_STORE_SIZE + DIRPOS_STORE_SIZE, but the only fields stored here are the LSN, the CLR type (at LSN_STORE_SIZE + FILEID_STORE_SIZE) and optionally the checksum (directly after the CLR type). Nothing in this function writes the FILEID_STORE_SIZE bytes or any bytes after the last stored field; confirm that the record length matches the intended layout. + + @return + Result of translog_write_record() +*/ +my_bool _ma_write_clr(MARIA_HA *info, LSN undo_lsn, + enum translog_record_type undo_type, + my_bool store_checksum, ha_checksum checksum, + LSN *res_lsn) +{ + uchar log_data[LSN_STORE_SIZE + 
FILEID_STORE_SIZE + + PAGE_STORE_SIZE + DIRPOS_STORE_SIZE + + HA_CHECKSUM_STORE_SIZE]; + LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + struct st_msg_to_write_hook_for_clr_end msg; + my_bool res; + DBUG_ENTER("_ma_write_clr"); + + /* undo_lsn must be first for compression to work */ + lsn_store(log_data, undo_lsn); + clr_type_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE, + undo_type); + /* Default record length: everything in log_data except the checksum */ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= + sizeof(log_data) - HA_CHECKSUM_STORE_SIZE; + msg.undone_record_type= undo_type; + msg.previous_undo_lsn= undo_lsn; + + if (store_checksum) + { + ha_checksum_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE + + CLR_TYPE_STORE_SIZE, checksum); + /* The checksum is part of the record: use the full buffer length */ log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + } + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data; + + res= translog_write_record(res_lsn, LOGREC_CLR_END, + info->trn, info, log_array[TRANSLOG_INTERNAL_PARTS + + 0].length, + TRANSLOG_INTERNAL_PARTS + 1, log_array, + log_data + LSN_STORE_SIZE, &msg); + DBUG_RETURN(res); +} + + +/**************************************************************************** + Redo of key pages +****************************************************************************/ + +/** + @brief Apply LOGREC_REDO_INDEX_NEW_PAGE + + @param info Maria handler + @param lsn LSN of the record; stored on the page and compared with the LSN already on the page to skip records that are already applied + @param header Header (without FILEID): page number, free page number, key number and one flag byte (non-zero if the page is a root page), followed by the page data + @param length Length of header, including the fixed fields listed above + + @return Operation status + @retval 0 OK (also when the page already has an LSN >= lsn) + @retval # my_errno value if the page could not be read or written +*/ + +uint _ma_apply_redo_index_new_page(MARIA_HA *info, LSN lsn, + const uchar *header, uint length) +{ + ulonglong root_page= page_korr(header); + ulonglong free_page= page_korr(header + PAGE_STORE_SIZE); + uint key_nr= key_nr_korr(header + PAGE_STORE_SIZE * 2); + my_bool page_type_flag= header[PAGE_STORE_SIZE * 2 + KEY_NR_STORE_SIZE]; + enum pagecache_page_lock unlock_method; + enum pagecache_page_pin unpin_method; + MARIA_PINNED_PAGE page_link; + my_off_t file_size; + uchar *buff; + uint result; + MARIA_SHARE *share= info->s; + 
DBUG_ENTER("_ma_apply_redo_index_new_page"); + + /* Mark the table as changed, then advance header past the fixed fields so that it points at the page data */ + + share->state.changed|= (STATE_CHANGED | STATE_NOT_OPTIMIZED_KEYS | + STATE_NOT_SORTED_PAGES); + + header+= PAGE_STORE_SIZE * 2 + KEY_NR_STORE_SIZE + 1; + length-= PAGE_STORE_SIZE * 2 + KEY_NR_STORE_SIZE + 1; + + /* free_page is 0 if we shouldn't set key_del */ + if (free_page) + { + if (free_page != IMPOSSIBLE_PAGE_NO) + share->state.key_del= (my_off_t) free_page * share->block_size; + else + share->state.key_del= HA_OFFSET_ERROR; + } + /* File length needed to contain the page (offset of the byte after it) */ file_size= (my_off_t) (root_page + 1) * share->block_size; + + /* Non-zero flag byte: the page becomes the root page of key key_nr */ + if (page_type_flag) + share->state.key_root[key_nr]= file_size - share->block_size; + + if (file_size > info->state->key_file_length) + { + /* The page lies beyond the current key file length: extend the length and build the page in info->keyread_buff without reading it; no lock or pin is taken, so none is released by the write below */ info->state->key_file_length= file_size; + buff= info->keyread_buff; + info->keyread_buff_used= 1; + unlock_method= PAGECACHE_LOCK_LEFT_UNLOCKED; + unpin_method= PAGECACHE_PIN_LEFT_UNPINNED; + } + else + { + if (!(buff= pagecache_read(share->pagecache, &share->kfile, + root_page, 0, 0, + PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE, + &page_link.link))) + { + result= (uint) my_errno; + goto err; + } + if (lsn_korr(buff) >= lsn) + { + /* Already applied */ + result= 0; + goto err; + } + unlock_method= PAGECACHE_LOCK_WRITE_UNLOCK; + unpin_method= PAGECACHE_UNPIN; + } + + /* Build the page: LSN, the logged data, zero fill up to the checksum area, and the checksum area filled with 255 */ + lsn_store(buff, lsn); + memcpy(buff + LSN_STORE_SIZE, header, length); + bzero(buff + LSN_STORE_SIZE + length, + share->block_size - LSN_STORE_SIZE - KEYPAGE_CHECKSUM_SIZE - length); + bfill(buff + share->block_size - KEYPAGE_CHECKSUM_SIZE, + KEYPAGE_CHECKSUM_SIZE, (uchar) 255); + if (pagecache_write(share->pagecache, + &share->kfile, root_page, 0, + buff, PAGECACHE_PLAIN_PAGE, + unlock_method, unpin_method, + PAGECACHE_WRITE_DELAY, 0)) + DBUG_RETURN(my_errno); + DBUG_RETURN(0); + +err: + /* Only reached from the pagecache_read() branch. NOTE(review): this is also reached when pagecache_read() returned NULL; confirm that page_link.link is valid to unlock in that case */ pagecache_unlock_by_link(share->pagecache, page_link.link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE, 0); 
+ DBUG_RETURN(result); +} + + +/** + @brief Apply LOGREC_REDO_INDEX_FREE_PAGE + + @param info Maria handler + @param lsn LSN of the record; stored on the page and compared with the LSN already on the page to skip records that are already applied + @param header Header (without FILEID): number of the page to free, followed by the page number to store in key_del (IMPOSSIBLE_PAGE_NO gives HA_OFFSET_ERROR) + + @note + share->state.key_del is updated before the page is read, so it is changed even if the page turns out to be already applied or cannot be read. + + @return Operation status + @retval 0 OK (also when the page already has an LSN >= lsn) + @retval # my_errno value if the page could not be read or written +*/ + +uint _ma_apply_redo_index_free_page(MARIA_HA *info, + LSN lsn, + const uchar *header) +{ + ulonglong page= page_korr(header); + ulonglong free_page= page_korr(header + PAGE_STORE_SIZE); + my_off_t old_link; + MARIA_PINNED_PAGE page_link; + MARIA_SHARE *share= info->s; + uchar *buff; + int result; + DBUG_ENTER("_ma_apply_redo_index_free_page"); + + share->state.changed|= (STATE_CHANGED | STATE_NOT_OPTIMIZED_KEYS | + STATE_NOT_SORTED_PAGES); + + /* Remember the previous key_del value; it is stored on the freed page below */ old_link= share->state.key_del; + share->state.key_del= ((free_page != IMPOSSIBLE_PAGE_NO) ? + (my_off_t) free_page * share->block_size : + HA_OFFSET_ERROR); + if (!(buff= pagecache_read(share->pagecache, &info->s->kfile, + page, 0, 0, + PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE, + &page_link.link))) + { + result= (uint) my_errno; + goto err; + } + if (lsn_korr(buff) >= lsn) + { + /* Already applied */ + result= 0; + goto err; + } + /* Turn the page into a deleted page: new LSN, cleared page header, MARIA_DELETE_KEY_NR as key number and the previous key_del value stored directly after the header */ + lsn_store(buff, lsn); + bzero(buff + LSN_STORE_SIZE, share->keypage_header - LSN_STORE_SIZE); + _ma_store_keynr(info, buff, (uchar) MARIA_DELETE_KEY_NR); + mi_sizestore(buff + share->keypage_header, old_link); + share->state.changed|= STATE_NOT_SORTED_PAGES; + + if (pagecache_write(share->pagecache, + &info->s->kfile, page, 0, + buff, PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, + PAGECACHE_WRITE_DELAY, 0)) + DBUG_RETURN(my_errno); + DBUG_RETURN(0); + +err: + /* NOTE(review): also reached when pagecache_read() returned NULL; confirm that page_link.link is valid to unlock in that case */ pagecache_unlock_by_link(share->pagecache, page_link.link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE, 0); + DBUG_RETURN(result); +} + + +/** + @brief Apply LOGREC_REDO_INDEX + + @fn _ma_apply_redo_index() + @param info Maria handler + @param lsn LSN of the record; stored on the page and compared with the LSN already on the page to skip records that are already applied + @param header Header (without FILEID): page number followed by the operations + @param head_length Length of header + + @notes + Data for this part is a set of logical instructions 
of how to + construct the key page. + + Information of the layout of the components for REDO_INDEX: + + Name Parameters (in byte) Information + KEY_OP_OFFSET 2 Set position for next operations + KEY_OP_SHIFT 2 (signed int) How much to shift down or up + KEY_OP_CHANGE 2 length, data Data to replace at 'pos' + KEY_OP_ADD_PREFIX 2 move-length How much data should be moved up + 2 change-length Data to be replaced at page start + KEY_OP_DEL_PREFIX 2 length Bytes to be deleted at page start + KEY_OP_ADD_SUFFIX 2 length, data Add data to end of page + KEY_OP_DEL_SUFFIX 2 length Reduce page length with this + + @return Operation status + @retval 0 OK (also when the page already has an LSN >= lsn) + @retval 1 The page could not be read or an unknown operation was found + @retval # my_errno value if the page could not be written +*/ + +uint _ma_apply_redo_index(MARIA_HA *info, + LSN lsn, const uchar *header, uint head_length) +{ + MARIA_SHARE *share= info->s; + ulonglong page= page_korr(header); + MARIA_PINNED_PAGE page_link; + uchar *buff; + const uchar *header_end= header + head_length; + uint page_offset= 0; + uint nod_flag, page_length, keypage_header; + int result; + uint org_page_length; + DBUG_ENTER("_ma_apply_redo_index"); + + /* Skip the page number; header now points at the first operation */ + header+= PAGE_STORE_SIZE; + + if (!(buff= pagecache_read(share->pagecache, &info->s->kfile, + page, 0, 0, + PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE, + &page_link.link))) + { + result= 1; + goto err; + } + if (lsn_korr(buff) >= lsn) + { + /* Already applied */ + result= 0; + goto err; + } + + _ma_get_used_and_nod(info, buff, page_length, nod_flag); + keypage_header= share->keypage_header; + /* Remembered so that the no longer used tail can be cleared at the end */ org_page_length= page_length; + + /* Apply modifications to page; each operation starts with a one byte operation code */ + do + { + switch ((enum en_key_op) (*header++)) { + case KEY_OP_OFFSET: + page_offset= uint2korr(header); + header+= 2; + DBUG_ASSERT(page_offset >= keypage_header && page_offset <= page_length); + break; + case KEY_OP_SHIFT: + { + int length= sint2korr(header); + header+= 2; + DBUG_ASSERT(page_offset != 0 && page_offset < page_length && + page_length + length < share->block_size); + + /* Negative length removes bytes at page_offset, positive length opens a gap there */ if (length < 0) + 
bmove(buff + page_offset, buff + page_offset - length, + page_length - page_offset + length); + else + bmove_upp(buff + page_length + length, buff + page_length, + page_length - page_offset); + page_length+= length; + break; + } + case KEY_OP_CHANGE: + { + /* Overwrite length bytes at page_offset with the logged data */ uint length= uint2korr(header); + DBUG_ASSERT(page_offset != 0 && page_offset + length <= page_length); + + memcpy(buff + page_offset, header + 2 , length); + header+= 2 + length; + break; + } + case KEY_OP_ADD_PREFIX: + { + /* Move the data after the page header up by insert_length bytes, then write change_length logged bytes directly after the page header */ uint insert_length= uint2korr(header); + uint change_length= uint2korr(header+2); + DBUG_ASSERT(insert_length <= change_length && + page_length + change_length <= share->block_size); + + bmove_upp(buff + page_length + insert_length, buff + page_length, + page_length - keypage_header); + memcpy(buff + keypage_header, header + 4 , change_length); + header+= 4 + change_length; + page_length+= insert_length; + break; + } + case KEY_OP_DEL_PREFIX: + { + /* Remove length bytes directly after the page header */ uint length= uint2korr(header); + header+= 2; + DBUG_ASSERT(length <= page_length - keypage_header); + + bmove(buff + keypage_header, buff + keypage_header + + length, page_length - keypage_header - length); + page_length-= length; + break; + } + case KEY_OP_ADD_SUFFIX: + { + /* Append insert_length logged bytes at the end of the used part of the page */ uint insert_length= uint2korr(header); + DBUG_ASSERT(page_length + insert_length <= share->block_size); + memcpy(buff + page_length, header+2, insert_length); + + page_length+= insert_length; + header+= 2 + insert_length; + break; + } + case KEY_OP_DEL_SUFFIX: + { + /* Only the page length is reduced; the removed bytes are cleared after the loop */ uint del_length= uint2korr(header); + header+= 2; + DBUG_ASSERT(page_length - del_length >= keypage_header); + page_length-= del_length; + break; + } + case KEY_OP_NONE: + default: + DBUG_ASSERT(0); + result= 1; + goto err; + } + } while (header < header_end); + DBUG_ASSERT(header == header_end); + + /* Write modified page */ + lsn_store(buff, lsn); + _ma_store_page_used(info, buff, page_length, nod_flag); + + /* + Clean old stuff up. 
Gives us better compression if we archive things + and makes things easier to debug + */ + if (page_length < org_page_length) + bzero(buff + page_length, org_page_length-page_length); + + if (pagecache_write(share->pagecache, + &info->s->kfile, page, 0, + buff, PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE_UNLOCK, PAGECACHE_UNPIN, + PAGECACHE_WRITE_DELAY, 0)) + DBUG_RETURN(my_errno); + DBUG_RETURN(0); + +err: + /* NOTE(review): also reached when pagecache_read() returned NULL; confirm that page_link.link is valid to unlock in that case */ pagecache_unlock_by_link(share->pagecache, page_link.link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE, 0); + DBUG_RETURN(result); +} + + +/**************************************************************************** + Undo of key block changes +****************************************************************************/ + + +/** + @brief Undo of insert of key (ie, delete the inserted key) + + @param info Maria handler + @param undo_lsn LSN passed on to _ma_write_clr() + @param header Key number followed by the key + @param length Length of header + + @note + NOTE(review): length is not checked against HA_MAX_KEY_BUFF before the key is copied, and if _ma_write_clr() fails lsn may still be unset when it is passed to _ma_unpin_all_pages_and_finalize_row(); confirm both. + + @return + Result of _ma_ck_real_delete(), or 1 if _ma_write_clr() failed +*/ + +my_bool _ma_apply_undo_key_insert(MARIA_HA *info, LSN undo_lsn, + const uchar *header, uint length) +{ + LSN lsn; + my_bool res; + uint keynr; + uchar key[HA_MAX_KEY_BUFF]; + MARIA_SHARE *share= info->s; + DBUG_ENTER("_ma_apply_undo_key_insert"); + + share->state.changed|= (STATE_CHANGED | STATE_NOT_OPTIMIZED_KEYS | + STATE_NOT_SORTED_PAGES); + keynr= key_nr_korr(header); + length-= KEY_NR_STORE_SIZE; + + /* We have to copy key as _ma_ck_real_delete() may change it */ + memcpy(key, header+ KEY_NR_STORE_SIZE, length); + + res= _ma_ck_real_delete(info, share->keyinfo+keynr, key, length, + &share->state.key_root[keynr]); + + /* The CLR record is written even if the delete failed; store_checksum is 1 and the checksum is 0 */ if (_ma_write_clr(info, undo_lsn, LOGREC_UNDO_KEY_INSERT, 1, 0, &lsn)) + res= 1; + + _ma_fast_unlock_key_del(info); + _ma_unpin_all_pages_and_finalize_row(info, lsn); + DBUG_RETURN(res); +} + + +/** + @brief Undo of delete of key (ie, insert the deleted key again) + + @param info Maria handler + @param undo_lsn LSN passed on to _ma_write_clr() + @param header Key number followed by the key + @param length Length of header + + @note + NOTE(review): length is not checked against HA_MAX_KEY_BUFF before the key is copied, and if _ma_write_clr() fails lsn may still be unset when it is passed to _ma_unpin_all_pages_and_finalize_row(); confirm both. + + @return + Result of _ma_ck_real_write_btree(), or 1 if _ma_write_clr() failed +*/ + +my_bool _ma_apply_undo_key_delete(MARIA_HA *info, LSN undo_lsn, + const uchar *header, uint length) +{ + LSN lsn; + my_bool res; + uint keynr; + uchar key[HA_MAX_KEY_BUFF]; + MARIA_SHARE *share= info->s; + DBUG_ENTER("_ma_apply_undo_key_delete"); + 
+ share->state.changed|= (STATE_CHANGED | STATE_NOT_OPTIMIZED_KEYS | + STATE_NOT_SORTED_PAGES); + keynr= key_nr_korr(header); + length-= KEY_NR_STORE_SIZE; + + /* Work on a private copy of the key, as header is const; presumably _ma_ck_real_write_btree() may change the key, like _ma_ck_real_delete() - TODO confirm */ + memcpy(key, header+ KEY_NR_STORE_SIZE, length); + + res= _ma_ck_real_write_btree(info, share->keyinfo+keynr, key, length, + &share->state.key_root[keynr], + share->keyinfo[keynr].write_comp_flag); + + /* The CLR record is written even if the write failed; store_checksum is 1 and the checksum is 0 */ if (_ma_write_clr(info, undo_lsn, LOGREC_UNDO_KEY_DELETE, 1, 0, &lsn)) + res= 1; + + _ma_fast_unlock_key_del(info); + _ma_unpin_all_pages_and_finalize_row(info, lsn); + DBUG_RETURN(res); +} + + +/**************************************************************************** + Handle some local variables +****************************************************************************/ + +/* + @brief lock key_del for other threads usage + + @fn _ma_lock_key_del() + @param info Maria handler + @param insert_at_end Set to 1 if we are doing an insert + + @notes + To allow higher concurrency in the common case where we do inserts + and we don't have any linked blocks we do the following: + - Mark in info->used_key_del that we are not using key_del + - Return at once (without marking key_del as used) + + This is safe as we in this case don't write current_key_del into + the redo log and during recover we are not updating key_del. 
+*/ + +my_bool _ma_lock_key_del(MARIA_HA *info, my_bool insert_at_end) +{ + MARIA_SHARE *share= info->s; + + /* Nothing to do if this handler already has key_del locked (used_key_del == 1) */ if (info->used_key_del != 1) + { + pthread_mutex_lock(&share->intern_lock); + /* No deleted key page is linked (key_del is HA_OFFSET_ERROR) and we are inserting: skip the lock */ if (share->state.key_del == HA_OFFSET_ERROR && insert_at_end) + { + pthread_mutex_unlock(&share->intern_lock); + info->used_key_del= 2; /* insert-with-append */ + return 1; /* key_del was not locked */ + } +#ifdef THREAD + while (share->used_key_del) + pthread_cond_wait(&share->intern_cond, &share->intern_lock); +#endif + info->used_key_del= 1; + share->used_key_del= 1; /* makes other handlers wait in the loop above */ + pthread_mutex_unlock(&share->intern_lock); + } + return 0; /* key_del is locked by this handler */ +} + + +/* + @brief copy changes to key_del and unlock it + + @fn _ma_unlock_key_del() + @param info Maria handler; info->used_key_del must be non-zero + + @notes + If this handler has key_del locked (used_key_del == 1), + share->current_key_del is copied to share->state.key_del and + share->used_key_del is cleared under intern_lock, after which + intern_cond is signalled. For insert-with-append (used_key_del == 2) + only the flag in the handler is reset. +*/ + +void _ma_unlock_key_del(MARIA_HA *info) +{ + MARIA_SHARE *share= info->s; + DBUG_ASSERT(info->used_key_del); + if (info->used_key_del == 1) /* Ignore insert-with-append */ + { + pthread_mutex_lock(&share->intern_lock); + share->used_key_del= 0; + info->s->state.key_del= info->s->current_key_del; + pthread_mutex_unlock(&share->intern_lock); + /* Wake up a handler waiting in _ma_lock_key_del() */ pthread_cond_signal(&share->intern_cond); + } + info->used_key_del= 0; +} diff --git a/storage/maria/ma_key_recover.h b/storage/maria/ma_key_recover.h new file mode 100644 index 00000000000..9d3548ac472 --- /dev/null +++ b/storage/maria/ma_key_recover.h @@ -0,0 +1,64 @@ +/* Copyright (C) 2007 Michael Widenius + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + When we have finished the write/update/delete of a row, we have cleanups to + do. For now it is signalling to Checkpoint that all dirtied pages have + their rec_lsn set and page LSN set (_ma_unpin_all_pages() has been called), + and that bitmap pages are correct (_ma_bitmap_release_unused() has been + called). + + NOTE(review): no include guard is visible in this header although it + defines static inline functions; confirm that it is never included twice + in the same file. +*/ + +/* Prototypes for the CLR writing, unpinning, redo and undo functions */ + +my_bool _ma_write_clr(MARIA_HA *info, LSN undo_lsn, + enum translog_record_type undo_type, + my_bool store_checksum, ha_checksum checksum, + LSN *res_lsn); +void _ma_unpin_all_pages(MARIA_HA *info, LSN undo_lsn); + +uint _ma_apply_redo_index_new_page(MARIA_HA *info, LSN lsn, + const uchar *header, uint length); +uint _ma_apply_redo_index_free_page(MARIA_HA *info, LSN lsn, + const uchar *header); +uint _ma_apply_redo_index(MARIA_HA *info, + LSN lsn, const uchar *header, uint length); + +my_bool _ma_apply_undo_key_insert(MARIA_HA *info, LSN undo_lsn, + const uchar *header, uint length); +my_bool _ma_apply_undo_key_delete(MARIA_HA *info, LSN undo_lsn, + const uchar *header, uint length); + +/* Finish a row operation by resetting the rec_lsn of the transaction to LSN_IMPOSSIBLE */ static inline void _ma_finalize_row(MARIA_HA *info) +{ + info->trn->rec_lsn= LSN_IMPOSSIBLE; +} + +/* unpinning is often the last operation before finalizing */ + +static inline void _ma_unpin_all_pages_and_finalize_row(MARIA_HA *info, + LSN undo_lsn) +{ + _ma_unpin_all_pages(info, undo_lsn); + _ma_finalize_row(info); +} + +extern my_bool _ma_lock_key_del(MARIA_HA *info, my_bool insert_at_end); +extern void _ma_unlock_key_del(MARIA_HA *info); +/* Calls _ma_unlock_key_del() only if this handler has used_key_del set */ static inline void _ma_fast_unlock_key_del(MARIA_HA *info) +{ + if (info->used_key_del) + _ma_unlock_key_del(info); +} diff --git a/storage/maria/ma_key_redo.c b/storage/maria/ma_key_redo.c new file mode 100644 index 00000000000..9299f23f328 --- /dev/null +++ 
b/storage/maria/ma_key_redo.c @@ -0,0 +1,417 @@ +/* Copyright (C) 2007 Michael Widenius + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; version 2 of the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* Redo of index */ + +#include "maria_def.h" +#include "ma_blockrec.h" +#include "trnman.h" + +/** + @brief Apply LOGREC_REDO_INDEX_NEW_PAGE + + @param info Maria handler + @param header Header (without FILEID) + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +uint _ma_apply_redo_index_new_page(MARIA_HA *info, LSN lsn, + const uchar *header, uint length) +{ + ulonglong root_page= page_korr(header); + ulonglong free_page= page_korr(header + PAGE_STORE_SIZE); + enum pagecache_page_lock unlock_method; + enum pagecache_page_pin unpin_method; + MARIA_PINNED_PAGE page_link; + my_off_t file_size; + uchar *buff; + uint result; + DBUG_ENTER("_ma_apply_redo_index_new_page"); + + /* Set header to point at key data */ + header+= PAGE_STORE_SIZE*2; + length-= PAGE_STORE_SIZE*2; + + if (free_page != IMPOSSIBLE_PAGE_NO) + info->s->state.key_del= (my_off_t) free_page * info->s->block_size; + else + info->s->state.key_del= HA_OFFSET_ERROR; + + file_size= (my_off_t) (root_page + 1) * info->s->block_size; + if (file_size > info->state->key_file_length) + { + info->state->key_file_length= file_size; + buff= info->keyread_buff; + info->keyread_buff_used= 1; + unlock_method= PAGECACHE_LOCK_LEFT_UNLOCKED; + unpin_method= 
PAGECACHE_PIN_LEFT_UNPINNED; + } + else + { + if (!(buff= pagecache_read(info->s->pagecache, &info->dfile, + root_page, 0, 0, + PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE, + &page_link.link))) + { + result= (uint) my_errno; + goto err; + } + if (lsn_korr(buff) >= lsn) + { + /* Already applied */ + result= 0; + goto err; + } + unlock_method= PAGECACHE_LOCK_WRITE_UNLOCK; + unpin_method= PAGECACHE_UNPIN; + } + + /* Write modified page */ + lsn_store(buff, lsn); + memcpy(buff + LSN_STORE_SIZE, header, length); +#ifdef IDENTICAL_PAGES_AFTER_RECOVERY + bzero(buff + LSN_STORE_SIZE + length, + info->s->block_size - LSN_STORE_SIZE - length); +#endif + if (pagecache_write(info->s->pagecache, + &info->dfile, root_page, 0, + buff, PAGECACHE_PLAIN_PAGE, + unlock_method, unpin_method, + PAGECACHE_WRITE_DELAY, 0)) + DBUG_RETURN(my_errno); + DBUG_RETURN(0); + +err: + pagecache_unlock_by_link(info->s->pagecache, page_link.link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE); + DBUG_RETURN(result); +} + + +/** + @brief Apply LOGREC_REDO_INDEX_FREE_PAGE + + @param info Maria handler + @param header Header (without FILEID) + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +uint _ma_apply_redo_index_free_page(MARIA_HA *info, + LSN lsn, + const uchar *header) +{ + ulonglong page= page_korr(header); + ulonglong free_page= page_korr(header + PAGE_STORE_SIZE); + my_off_t old_link; + MARIA_PINNED_PAGE page_link; + MARIA_SHARE *share= info->s; + uchar *buff; + int result; + DBUG_ENTER("_ma_apply_redo_index_free_page"); + + old_link= share->state.key_del; + share->state.key_del= ((free_page != IMPOSSIBLE_PAGE_NO) ? 
+ (my_off_t) free_page * share->block_size : + HA_OFFSET_ERROR); + if (!(buff= pagecache_read(share->pagecache, &info->dfile, + page, 0, 0, + PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE, + &page_link.link))) + { + result= (uint) my_errno; + goto err; + } + if (lsn_korr(buff) >= lsn) + { + /* Already applied */ + result= 0; + goto err; + } + /* Write modified page */ + lsn_store(buff, lsn); + bzero(buff + LSN_STORE_SIZE, share->keypage_header - LSN_STORE_SIZE); + _ma_store_keynr(info, buff, (uchar) MARIA_DELETE_KEY_NR); + mi_sizestore(buff + share->keypage_header, old_link); + share->state.changed|= STATE_NOT_SORTED_PAGES; + + if (pagecache_write(share->pagecache, + &info->dfile, page, 0, + buff, PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, + PAGECACHE_WRITE_DELAY, 0)) + DBUG_RETURN(my_errno); + DBUG_RETURN(0); + +err: + pagecache_unlock_by_link(share->pagecache, page_link.link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE); + DBUG_RETURN(result); +} + + +/** + @brief Apply LOGREC_REDO_INDEX + + @fn ma_apply_redo_index() + @param info Maria handler + @param header Header (without FILEID) + + @notes + Data for this part is a set of logical instructions of how to + construct the key page. 
+ + Information of the layout of the components for REDO_INDEX: + + Name Parameters (in byte) Information + KEY_OP_OFFSET 2 Set position for next operations + KEY_OP_SHIFT 2 (signed int) How much to shift down or up + KEY_OP_CHANGE 2 length, data Data to replace at 'pos' + KEY_OP_ADD_PREFIX 2 move-length How much data should be moved up + 2 change-length Data to be replaced at page start + KEY_OP_DEL_PREFIX 2 length Bytes to be deleted at page start + KEY_OP_ADD_SUFFIX 2 length, data Add data to end of page + KEY_OP_DEL_SUFFIX 2 length Reduce page length with this + + @return Operation status + @retval 0 OK + @retval 1 Error +*/ + +uint _ma_apply_redo_index(MARIA_HA *info, + LSN lsn, const uchar *header, uint length) +{ + ulonglong root_page= page_korr(header); + MARIA_PINNED_PAGE page_link; + uchar *buff; + const uchar *header_end= header + length; + uint page_offset= 0; + uint nod_flag, page_length, keypage_header; + int result; + DBUG_ENTER("_ma_apply_redo_index"); + + /* Set header to point at key data */ + header+= PAGE_STORE_SIZE; + + if (!(buff= pagecache_read(info->s->pagecache, &info->dfile, + root_page, 0, 0, + PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_WRITE, + &page_link.link))) + { + result= (uint) my_errno; + goto err; + } + if (lsn_korr(buff) >= lsn) + { + /* Already applied */ + result= 0; + goto err; + } + + _ma_get_used_and_nod(info, buff, page_length, nod_flag); + keypage_header= info->s->keypage_header; + + /* Apply modifications to page */ + do + { + switch ((enum en_key_op) (*header++)) { + case KEY_OP_NONE: + DBUG_ASSERT(0); /* Impossible */ + break; + case KEY_OP_OFFSET: + page_offset= uint2korr(header); + header+= 2; + DBUG_ASSERT(page_offset >= keypage_header && page_offset <= page_length); + break; + case KEY_OP_SHIFT: + { + int length= sint2korr(header); + header+= 2; + DBUG_ASSERT(page_offset != 0 && page_offset < page_length && + page_length + length < info->s->block_size); + + if (length < 0) + bmove(buff + page_offset, buff + page_offset - 
length, + page_length - page_offset + length); + else + bmove_upp(buff + page_length + length, buff + page_length, + page_length - page_offset); + page_length+= length; + break; + } + case KEY_OP_CHANGE: + { + uint length= uint2korr(header); + DBUG_ASSERT(page_offset != 0 && page_offset + length <= page_length); + + memcpy(buff + page_offset, header + 2 , length); + header+= 2 + length; + break; + } + case KEY_OP_ADD_PREFIX: + { + uint insert_length= uint2korr(header); + uint change_length= uint2korr(header+2); + DBUG_ASSERT(insert_length <= change_length && + page_length + change_length < info->s->block_size); + + bmove_upp(buff + page_length + insert_length, buff + page_length, + page_length - keypage_header); + memcpy(buff + keypage_header, header + 2 , change_length); + header+= 4 + change_length; + page_length+= insert_length; + break; + } + case KEY_OP_DEL_PREFIX: + { + uint length= uint2korr(header); + header+= 2; + DBUG_ASSERT(length <= page_length - keypage_header); + + bmove(buff + keypage_header, buff + keypage_header + + length, page_length - keypage_header - length); + page_length-= length; + break; + } + case KEY_OP_ADD_SUFFIX: + { + uint insert_length= uint2korr(header); + DBUG_ASSERT(page_length + insert_length < info->s->block_size); + memcpy(buff + page_length, header, insert_length); + page_length= insert_length; + header+= 2 + insert_length; + break; + } + case KEY_OP_DEL_SUFFIX: + { + uint del_length= uint2korr(header); + header+= 2; + DBUG_ASSERT(page_length - del_length >= keypage_header); + page_length-= del_length; + break; + } + } + } while (header < header_end); + DBUG_ASSERT(header == header_end); + + /* Write modified page */ + lsn_store(buff, lsn); + memcpy(buff + LSN_STORE_SIZE, header, length); + _ma_store_page_used(info, buff, page_length, nod_flag); + + if (pagecache_write(info->s->pagecache, + &info->dfile, root_page, 0, + buff, PAGECACHE_PLAIN_PAGE, + PAGECACHE_LOCK_WRITE_UNLOCK, PAGECACHE_UNPIN, + PAGECACHE_WRITE_DELAY, 0)) + 
DBUG_RETURN(my_errno); + DBUG_RETURN(0); + +err: + pagecache_unlock_by_link(info->s->pagecache, page_link.link, + PAGECACHE_LOCK_WRITE_UNLOCK, + PAGECACHE_UNPIN, LSN_IMPOSSIBLE, + LSN_IMPOSSIBLE); + DBUG_RETURN(result); +} + +/* + Unpin all pinned pages + + SYNOPSIS + _ma_unpin_all_pages() + info Maria handler + undo_lsn LSN for undo pages. LSN_IMPOSSIBLE if we shouldn't write undo + (error) + + NOTE + We unpin pages in the reverse order as they where pinned; This may not + be strictly necessary but may simplify things in the future. + + RETURN + 0 ok + 1 error (fatal disk error) + +*/ + +void _ma_unpin_all_pages(MARIA_HA *info, LSN undo_lsn) +{ + MARIA_PINNED_PAGE *page_link= ((MARIA_PINNED_PAGE*) + dynamic_array_ptr(&info->pinned_pages, 0)); + MARIA_PINNED_PAGE *pinned_page= page_link + info->pinned_pages.elements; + DBUG_ENTER("_ma_unpin_all_pages"); + DBUG_PRINT("info", ("undo_lsn: %lu", (ulong) undo_lsn)); + + if (!info->s->now_transactional) + undo_lsn= LSN_IMPOSSIBLE; /* don't try to set a LSN on pages */ + + while (pinned_page-- != page_link) + { + DBUG_ASSERT(pinned_page->changed && + ((undo_lsn != LSN_IMPOSSIBLE) || !info->s->now_transactional)); + pagecache_unlock_by_link(info->s->pagecache, pinned_page->link, + pinned_page->unlock, PAGECACHE_UNPIN, + info->trn->rec_lsn, undo_lsn); + } + + info->pinned_pages.elements= 0; + DBUG_VOID_RETURN; +} + + +/**************************************************************************** + Undo of key block changes +****************************************************************************/ + + +/* + Undo of insert of key (ie, delete the inserted key) +*/ + +my_bool _ma_apply_undo_key_insert(MARIA_HA *info, LSN undo_lsn, + const uchar *header, uint length) +{ + ulonglong page; + uint rownr; + LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + + CLR_TYPE_STORE_SIZE + HA_CHECKSUM_STORE_SIZE], + *buff; + my_bool res= 1; + MARIA_PINNED_PAGE page_link; + LSN lsn= 
LSN_IMPOSSIBLE; + MARIA_SHARE *share= info->s; + struct st_msg_to_write_hook_for_clr_end msg; + DBUG_ENTER("_ma_apply_undo_key_insert"); + + keynr= keynr_korr(header); + key= header+ KEY_NR_STORE_SIZE; + length-= KEYNR_STORE_SIZE; + + res= _ma_ck_delete(info, info->s->keyinfo+keynr, key, length, + &info->s->state.key_root[keynr]); + _ma_unpin_all_pages_and_finalize_row(info, lsn); + DBUG_RETURN(res); +} diff --git a/storage/maria/ma_locking.c b/storage/maria/ma_locking.c index 9fc364e2af8..0b24fe9594c 100644 --- a/storage/maria/ma_locking.c +++ b/storage/maria/ma_locking.c @@ -67,18 +67,11 @@ int maria_lock_database(MARIA_HA *info, int lock_type) --share->tot_locks; if (info->lock_type == F_WRLCK && !share->w_locks) { - if (!share->delay_key_write && - flush_pagecache_blocks(share->pagecache, &share->kfile, - FLUSH_KEEP)) - { - error= my_errno; - maria_print_error(info->s, HA_ERR_CRASHED); - /* Mark that table must be checked */ - maria_mark_crashed(info); - } /* pages of transactional tables get flushed at Checkpoint */ - if (!share->base.born_transactional && - _ma_flush_table_files(info, MARIA_FLUSH_DATA, + if (!share->base.born_transactional && !share->temporary && + _ma_flush_table_files(info, + share->delay_key_write ? MARIA_FLUSH_DATA : + MARIA_FLUSH_DATA | MARIA_FLUSH_INDEX, FLUSH_KEEP, FLUSH_KEEP)) error= my_errno; } @@ -110,9 +103,11 @@ int maria_lock_database(MARIA_HA *info, int lock_type) rw_unlock(&info->s->mmap_lock); } #endif +#ifdef EXTERNAL_LOCKING share->state.process= share->last_process=share->this_process; share->state.unique= info->last_unique= info->this_unique; share->state.update_count= info->last_loop= ++info->this_loop; +#endif /* transactional tables rather flush their state at Checkpoint */ if (!share->base.born_transactional) { @@ -239,7 +234,7 @@ int maria_lock_database(MARIA_HA *info, int lock_type) /* Check for bad file descriptors if this table is part of a merge union. 
Failing to capture this may cause - a crash on windows if the table is renamed and + a crash on windows if the table is renamed and later on referenced by the merge table. */ if( info->owned_by_merge && (info->s)->kfile.file < 0 ) @@ -438,9 +433,17 @@ int _ma_writeinfo(register MARIA_HA *info, uint operation) if (operation) { /* Two threads can't be here */ olderror= my_errno; /* Remember last error */ + +#ifdef EXTERNAL_LOCKING + /* + The following only makes sense if we want to be allow two different + processes access the same table at the same time + */ share->state.process= share->last_process= share->this_process; share->state.unique= info->last_unique= info->this_unique; share->state.update_count= info->last_loop= ++info->this_loop; +#endif + if ((error= _ma_state_info_write_sub(share->kfile.file, &share->state, 1))) olderror=my_errno; @@ -460,11 +463,14 @@ int _ma_writeinfo(register MARIA_HA *info, uint operation) } /* _ma_writeinfo */ - /* Test if someone has changed the database */ - /* (Should be called after readinfo) */ +/* + Test if an external process has changed the database + (Should be called after readinfo) +*/ int _ma_test_if_changed(register MARIA_HA *info) { +#ifdef EXTERNAL_LOCKING MARIA_SHARE *share=info->s; if (share->state.process != share->last_process || share->state.unique != info->last_unique || @@ -481,6 +487,7 @@ int _ma_test_if_changed(register MARIA_HA *info) info->data_changed= 1; /* For maria_is_changed */ return 1; } +#endif return (!(info->update & HA_STATE_AKTIV) || (info->update & (HA_STATE_WRITTEN | HA_STATE_DELETED | HA_STATE_KEY_CHANGED))); diff --git a/storage/maria/ma_loghandler.c b/storage/maria/ma_loghandler.c index 8f6a50bd93f..8e4f50ade21 100644 --- a/storage/maria/ma_loghandler.c +++ b/storage/maria/ma_loghandler.c @@ -15,7 +15,7 @@ #include "maria_def.h" #include "ma_blockrec.h" /* for some constants and in-write hooks */ -#include "trnman.h" /* for access to members of TRN */ +#include "trnman.h" /** @file @@ 
-65,7 +65,7 @@ static int translog_mutex_lock(pthread_mutex_t *M) { int rc; - DBUG_PRINT("info", ("Going lock mutex 0x%lx...", (ulong)(M))); + DBUG_PRINT("info", ("Going lock mutex 0x%lx", (ulong)(M))); rc= pthread_mutex_lock(M); DBUG_PRINT("info", ("Mutex locked 0x%lx rc: %d", (ulong)(M), rc)); return (rc); @@ -74,7 +74,7 @@ static int translog_mutex_lock(pthread_mutex_t *M) static int translog_mutex_unlock(pthread_mutex_t *M) { int rc; - DBUG_PRINT("info", ("Going unlock mutex 0x%lx...", (ulong)(M))); + DBUG_PRINT("info", ("Going unlock mutex 0x%lx", (ulong)(M))); rc= pthread_mutex_unlock(M); DBUG_PRINT("info", ("Mutex unlocked 0x%lx rc: %d", (ulong)(M), rc)); return(rc); @@ -270,19 +270,9 @@ static void check_translog_description_table(int num) DBUG_ASSERT(num > 0); /* last is reserved for extending the table */ DBUG_ASSERT(num < LOGREC_NUMBER_OF_TYPES - 1); - DBUG_PRINT("info", ("records number: OK")); - DBUG_PRINT("info", - ("record type: %d class: %d fixed: %u header: %u LSNs: %u " - "name: %s", - 0, - log_record_type_descriptor[0].class, - (uint)log_record_type_descriptor[0].fixed_length, - (uint)log_record_type_descriptor[0].read_header_len, - (uint)log_record_type_descriptor[0].compressed_LSN, - log_record_type_descriptor[0].name)); DBUG_ASSERT(log_record_type_descriptor[0].class == LOGRECTYPE_NOT_ALLOWED); - DBUG_PRINT("info", ("record type 0: OK")); - for (i= 1; i <= num; i++) + + for (i= 0; i <= num; i++) { DBUG_PRINT("info", ("record type: %d class: %d fixed: %u header: %u LSNs: %u " @@ -294,7 +284,7 @@ static void check_translog_description_table(int num) log_record_type_descriptor[i].name)); switch (log_record_type_descriptor[i].class) { case LOGRECTYPE_NOT_ALLOWED: - DBUG_ASSERT(0); + DBUG_ASSERT(i == 0); break; case LOGRECTYPE_VARIABLE_LENGTH: DBUG_ASSERT(log_record_type_descriptor[i].fixed_length == 0); @@ -320,13 +310,10 @@ static void check_translog_description_table(int num) default: DBUG_ASSERT(0); } - DBUG_PRINT("info", ("record type %d: OK", 
i)); } - DBUG_PRINT("info", ("All filled records are OK")); for (i= num + 1; i < LOGREC_NUMBER_OF_TYPES; i++) { DBUG_ASSERT(log_record_type_descriptor[i].class == LOGRECTYPE_NOT_ALLOWED); - DBUG_PRINT("info", ("record type %d: OK", i)); } DBUG_VOID_RETURN; } @@ -453,6 +440,18 @@ static LOG_DESC INIT_LOGREC_REDO_INDEX= {LOGRECTYPE_VARIABLE_LENGTH, 0, 9, NULL, write_hook_for_redo, NULL, 0, "redo_index", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; +static LOG_DESC INIT_LOGREC_REDO_INDEX_NEW_PAGE= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2 + KEY_NR_STORE_SIZE + 1, + NULL, write_hook_for_redo, NULL, 0, + "redo_index_new_page", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + +static LOG_DESC INIT_LOGREC_REDO_INDEX_FREE_PAGE= +{LOGRECTYPE_FIXEDLENGTH, FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2, + FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2, + NULL, write_hook_for_redo, NULL, 0, + "redo_index_free_page", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; + static LOG_DESC INIT_LOGREC_REDO_UNDELETE_ROW= {LOGRECTYPE_FIXEDLENGTH, 16, 16, NULL, write_hook_for_redo, NULL, 0, "redo_undelete_row", LOGREC_NOT_LAST_IN_GROUP, NULL, NULL}; @@ -485,13 +484,23 @@ static LOG_DESC INIT_LOGREC_UNDO_ROW_UPDATE= "undo_row_update", LOGREC_LAST_IN_GROUP, NULL, NULL}; static LOG_DESC INIT_LOGREC_UNDO_KEY_INSERT= -{LOGRECTYPE_VARIABLE_LENGTH, 0, 10, NULL, write_hook_for_undo, NULL, 1, +{LOGRECTYPE_VARIABLE_LENGTH, 0, + LSN_STORE_SIZE + FILEID_STORE_SIZE + KEY_NR_STORE_SIZE, + NULL, write_hook_for_undo_key, NULL, 1, "undo_key_insert", LOGREC_LAST_IN_GROUP, NULL, NULL}; static LOG_DESC INIT_LOGREC_UNDO_KEY_DELETE= -{LOGRECTYPE_VARIABLE_LENGTH, 0, 15, NULL, write_hook_for_undo, NULL, 1, +{LOGRECTYPE_VARIABLE_LENGTH, 0, + LSN_STORE_SIZE + FILEID_STORE_SIZE + KEY_NR_STORE_SIZE, + NULL, write_hook_for_undo_key, NULL, 1, "undo_key_delete", LOGREC_LAST_IN_GROUP, NULL, NULL}; +static LOG_DESC INIT_LOGREC_UNDO_KEY_DELETE_WITH_ROOT= +{LOGRECTYPE_VARIABLE_LENGTH, 0, + LSN_STORE_SIZE + FILEID_STORE_SIZE + 
KEY_NR_STORE_SIZE + PAGE_STORE_SIZE, + NULL, write_hook_for_undo_key, NULL, 1, + "undo_key_delete_with_root", LOGREC_LAST_IN_GROUP, NULL, NULL}; + static LOG_DESC INIT_LOGREC_PREPARE= {LOGRECTYPE_VARIABLE_LENGTH, 0, 0, NULL, NULL, NULL, 0, "prepare", LOGREC_IS_GROUP_ITSELF, NULL, NULL}; @@ -572,6 +581,10 @@ static void loghandler_init() INIT_LOGREC_REDO_UPDATE_ROW_HEAD; log_record_type_descriptor[LOGREC_REDO_INDEX]= INIT_LOGREC_REDO_INDEX; + log_record_type_descriptor[LOGREC_REDO_INDEX_NEW_PAGE]= + INIT_LOGREC_REDO_INDEX_NEW_PAGE; + log_record_type_descriptor[LOGREC_REDO_INDEX_FREE_PAGE]= + INIT_LOGREC_REDO_INDEX_FREE_PAGE; log_record_type_descriptor[LOGREC_REDO_UNDELETE_ROW]= INIT_LOGREC_REDO_UNDELETE_ROW; log_record_type_descriptor[LOGREC_CLR_END]= @@ -588,6 +601,8 @@ static void loghandler_init() INIT_LOGREC_UNDO_KEY_INSERT; log_record_type_descriptor[LOGREC_UNDO_KEY_DELETE]= INIT_LOGREC_UNDO_KEY_DELETE; + log_record_type_descriptor[LOGREC_UNDO_KEY_DELETE_WITH_ROOT]= + INIT_LOGREC_UNDO_KEY_DELETE_WITH_ROOT; log_record_type_descriptor[LOGREC_PREPARE]= INIT_LOGREC_PREPARE; log_record_type_descriptor[LOGREC_PREPARE_WITH_UNDO_PURGE]= @@ -2432,13 +2447,13 @@ static uchar *translog_get_page(TRANSLOG_VALIDATOR_DATA *data, uchar *buffer, last_protected_sector)); for (i= 1; i <= last_protected_sector; i++) { - uint index= i * 2; + uint idx= i * 2; uint offset= i * DISK_DRIVE_SECTOR_SIZE; DBUG_PRINT("info", ("Sector %u: 0x%02x%02x <- 0x%02x%02x", i, buffer[offset], buffer[offset + 1], - table[index], table[index + 1])); - buffer[offset]= table[index]; - buffer[offset + 1]= table[index + 1]; + table[idx], table[idx + 1])); + buffer[offset]= table[idx]; + buffer[offset + 1]= table[idx + 1]; } } else @@ -2533,7 +2548,7 @@ static void translog_free_link(PAGECACHE_BLOCK_LINK *direct_link) if (direct_link) pagecache_unlock_by_link(log_descriptor.pagecache, direct_link, PAGECACHE_LOCK_READ_UNLOCK, PAGECACHE_UNPIN, - LSN_IMPOSSIBLE, LSN_IMPOSSIBLE); + LSN_IMPOSSIBLE, 
LSN_IMPOSSIBLE, 0); DBUG_VOID_RETURN; } @@ -2555,21 +2570,21 @@ static void translog_free_link(PAGECACHE_BLOCK_LINK *direct_link) static my_bool translog_get_last_page_addr(TRANSLOG_ADDRESS *addr, my_bool *last_page_ok) { - MY_STAT stat_buff, *stat; + MY_STAT stat_buff, *local_stat; char path[FN_REFLEN]; uint32 rec_offset; uint32 file_no= LSN_FILE_NO(*addr); DBUG_ENTER("translog_get_last_page_addr"); - if (!(stat= my_stat(translog_filename_by_fileno(file_no, path), - &stat_buff, MYF(MY_WME)))) + if (!(local_stat= my_stat(translog_filename_by_fileno(file_no, path), + &stat_buff, MYF(MY_WME)))) DBUG_RETURN(1); - DBUG_PRINT("info", ("File size: %lu", (ulong) stat->st_size)); - if (stat->st_size > TRANSLOG_PAGE_SIZE) + DBUG_PRINT("info", ("File size: %lu", (ulong) local_stat->st_size)); + if (local_stat->st_size > TRANSLOG_PAGE_SIZE) { - rec_offset= (((stat->st_size / TRANSLOG_PAGE_SIZE) - 1) * + rec_offset= (((local_stat->st_size / TRANSLOG_PAGE_SIZE) - 1) * TRANSLOG_PAGE_SIZE); - *last_page_ok= (stat->st_size == rec_offset + TRANSLOG_PAGE_SIZE); + *last_page_ok= (local_stat->st_size == rec_offset + TRANSLOG_PAGE_SIZE); } else { @@ -5769,15 +5784,15 @@ translog_variable_length_header(uchar *page, translog_size_t page_offset, for (;;) { - uint i, read= grp_no; + uint i, read_length= grp_no; buff->chunk0_pages++; if (page_rest < grp_no * (7 + 1)) - read= page_rest / (7 + 1); + read_length= page_rest / (7 + 1); DBUG_PRINT("info", ("Read chunk0 page#%u read: %u left: %u " "start from: %u", - buff->chunk0_pages, read, grp_no, curr)); - for (i= 0; i < read; i++, curr++) + buff->chunk0_pages, read_length, grp_no, curr)); + for (i= 0; i < read_length; i++, curr++) { DBUG_ASSERT(curr < buff->groups_no); buff->groups[curr].addr= lsn_korr(src + i * (7 + 1)); @@ -5787,22 +5802,23 @@ translog_variable_length_header(uchar *page, translog_size_t page_offset, LSN_IN_PARTS(buff->groups[curr].addr), (uint) buff->groups[curr].num)); } - grp_no-= read; + grp_no-= read_length; if (grp_no 
== 0) { if (scanner) { buff->chunk0_data_addr= scanner->page_addr; + /* offset increased */ buff->chunk0_data_addr+= (page_offset + header_to_skip + - read * (7 + 1)); /* offset increased */ + read_length * (7 + 1)); } else { buff->chunk0_data_addr= buff->lsn; /* offset increased */ - buff->chunk0_data_addr+= (header_to_skip + read * (7 + 1)); + buff->chunk0_data_addr+= (header_to_skip + read_length * (7 + 1)); } - buff->chunk0_data_len= chunk_len - 2 - read * (7 + 1); + buff->chunk0_data_len= chunk_len - 2 - read_length * (7 + 1); DBUG_PRINT("info", ("Data address: (%lu,0x%lx) len: %u", LSN_IN_PARTS(buff->chunk0_data_addr), buff->chunk0_data_len)); diff --git a/storage/maria/ma_loghandler.h b/storage/maria/ma_loghandler.h index 58a21ee09b2..67a861691f0 100644 --- a/storage/maria/ma_loghandler.h +++ b/storage/maria/ma_loghandler.h @@ -50,6 +50,8 @@ struct st_maria_handler; /* Changing one of the "SIZE" below will break backward-compatibility! */ /* Length of CRC at end of pages */ +#define ROW_EXTENT_PAGE_SIZE 5 +#define ROW_EXTENT_COUNT_SIZE 2 #define CRC_LENGTH 4 /* Size of file id in logs */ #define FILEID_STORE_SIZE 2 @@ -61,6 +63,8 @@ struct st_maria_handler; #define CLR_TYPE_STORE_SIZE 1 /* If table has live checksum we store its changes in UNDOs */ #define HA_CHECKSUM_STORE_SIZE 4 +#define KEY_NR_STORE_SIZE 1 +#define PAGE_LENGTH_STORE_SIZE 2 /* Store methods to match the above sizes */ #define fileid_store(T,A) int2store(T,A) @@ -68,12 +72,14 @@ struct st_maria_handler; #define dirpos_store(T,A) ((*(uchar*) (T)) = A) #define pagerange_store(T,A) int2store(T,A) #define clr_type_store(T,A) ((*(uchar*) (T)) = A) +#define key_nr_store(T, A) ((*(uchar*) (T)) = A) #define ha_checksum_store(T,A) int4store(T,A) #define fileid_korr(P) uint2korr(P) #define page_korr(P) uint5korr(P) #define dirpos_korr(P) ((P)[0]) #define pagerange_korr(P) uint2korr(P) #define clr_type_korr(P) ((P)[0]) +#define key_nr_korr(P) ((P)[0]) #define ha_checksum_korr(P) uint4korr(P) /* @@ 
-108,6 +114,8 @@ enum translog_record_type LOGREC_REDO_DELETE_ROW, LOGREC_REDO_UPDATE_ROW_HEAD, LOGREC_REDO_INDEX, + LOGREC_REDO_INDEX_NEW_PAGE, + LOGREC_REDO_INDEX_FREE_PAGE, LOGREC_REDO_UNDELETE_ROW, LOGREC_CLR_END, LOGREC_PURGE_END, @@ -116,6 +124,7 @@ enum translog_record_type LOGREC_UNDO_ROW_UPDATE, LOGREC_UNDO_KEY_INSERT, LOGREC_UNDO_KEY_DELETE, + LOGREC_UNDO_KEY_DELETE_WITH_ROOT, LOGREC_PREPARE, LOGREC_PREPARE_WITH_UNDO_PURGE, LOGREC_COMMIT, @@ -132,6 +141,20 @@ enum translog_record_type }; #define LOGREC_NUMBER_OF_TYPES 64 /* Maximum, can't be extended */ +/* Type of operations in LOGREC_REDO_INDEX */ + +enum en_key_op +{ + KEY_OP_NONE, /* Not used */ + KEY_OP_OFFSET, /* Set current position */ + KEY_OP_SHIFT, /* Shift up/or down at current position */ + KEY_OP_CHANGE, /* Change data at current position */ + KEY_OP_ADD_PREFIX, /* Insert data at start of page */ + KEY_OP_DEL_PREFIX, /* Delete data at start of page */ + KEY_OP_ADD_SUFFIX, /* Insert data at end of page */ + KEY_OP_DEL_SUFFIX, /* Delete data at end of page */ +}; + /* Size of log file; One log file is restricted to 4G */ typedef uint32 translog_size_t; diff --git a/storage/maria/ma_open.c b/storage/maria/ma_open.c index 50126775b03..75ca054d88c 100644 --- a/storage/maria/ma_open.c +++ b/storage/maria/ma_open.c @@ -19,7 +19,6 @@ #include "ma_sp_defs.h" #include "ma_rt_index.h" #include "ma_blockrec.h" -#include "trnman.h" #include <m_ctype.h> #if defined(MSDOS) || defined(__WIN__) @@ -153,6 +152,14 @@ static MARIA_HA *maria_clone_internal(MARIA_SHARE *share, int mode, if ((*share->init)(&info)) goto err; + /* The following should be big enough for all pinning purposes */ + if (my_init_dynamic_array(&info.pinned_pages, + sizeof(MARIA_PINNED_PAGE), + max(share->base.blobs*2 + 4, + MARIA_MAX_TREE_LEVELS*3), 16)) + goto err; + + pthread_mutex_lock(&share->intern_lock); info.read_record= share->read_record; share->reopen++; @@ -207,7 +214,8 @@ err: switch (errpos) { case 6: (*share->end)(&info); - 
my_free((uchar*) m_info,MYF(0)); + delete_dynamic(&info.pinned_pages); + my_free(m_info, MYF(0)); /* fall through */ case 5: if (data_file < 0) @@ -491,6 +499,7 @@ MARIA_HA *maria_open(const char *name, int mode, uint open_flags) { share->keyinfo[i].share= share; disk_pos=_ma_keydef_read(disk_pos, &share->keyinfo[i]); + share->keyinfo[i].key_nr= i; disk_pos_assert(disk_pos + share->keyinfo[i].keysegs * HA_KEYSEG_SIZE, end_pos); if (share->keyinfo[i].key_alg == HA_KEY_ALG_RTREE) @@ -718,7 +727,8 @@ MARIA_HA *maria_open(const char *name, int mode, uint open_flags) #ifdef THREAD thr_lock_init(&share->lock); - VOID(pthread_mutex_init(&share->intern_lock,MY_MUTEX_INIT_FAST)); + VOID(pthread_mutex_init(&share->intern_lock, MY_MUTEX_INIT_FAST)); + VOID(pthread_cond_init(&share->intern_cond, 0)); for (i=0; i<keys; i++) VOID(my_rwlock_init(&share->key_root_lock[i], NULL)); VOID(my_rwlock_init(&share->mmap_lock, NULL)); @@ -851,6 +861,8 @@ void _ma_setup_functions(register MARIA_SHARE *share) share->scan_end= maria_scan_end_dummy;/* Compat. 
dummy function */ share->write_record_init= _ma_write_init_default; share->write_record_abort= _ma_write_abort_default; + share->keypos_to_recpos= _ma_transparent_recpos; + share->recpos_to_keypos= _ma_transparent_recpos; switch (share->data_file_type) { case COMPRESSED_RECORD: @@ -890,13 +902,15 @@ void _ma_setup_functions(register MARIA_SHARE *share) } break; case STATIC_RECORD: - share->read_record= _ma_read_static_record; - share->scan= _ma_read_rnd_static_record; - share->delete_record= _ma_delete_static_record; - share->compare_record= _ma_cmp_static_record; - share->update_record= _ma_update_static_record; - share->write_record= _ma_write_static_record; - share->compare_unique= _ma_cmp_static_unique; + share->read_record= _ma_read_static_record; + share->scan= _ma_read_rnd_static_record; + share->delete_record= _ma_delete_static_record; + share->compare_record= _ma_cmp_static_record; + share->update_record= _ma_update_static_record; + share->write_record= _ma_write_static_record; + share->compare_unique= _ma_cmp_static_unique; + share->keypos_to_recpos= _ma_static_keypos_to_recpos; + share->recpos_to_keypos= _ma_static_recpos_to_keypos; if (share->state.header.org_data_file_type == STATIC_RECORD && ! (share->options & HA_OPTION_NULL_FIELDS)) share->calc_checksum= _ma_static_checksum; @@ -920,6 +934,9 @@ void _ma_setup_functions(register MARIA_SHARE *share) share->write_record= _ma_write_block_record; share->compare_unique= _ma_cmp_block_unique; share->calc_checksum= _ma_checksum; + share->keypos_to_recpos= _ma_transaction_keypos_to_recpos; + share->recpos_to_keypos= _ma_transaction_recpos_to_keypos; + /* write_block_record() will calculate the checksum; Tell maria_write() that it doesn't have to do this. 
@@ -988,6 +1005,18 @@ static void setup_key_functions(register MARIA_KEYDEF *keyinfo) keyinfo->pack_key= _ma_calc_static_key_length; keyinfo->store_key= _ma_store_static_key; } + + /* set keyinfo->write_comp_flag */ + if (keyinfo->flag & HA_SORT_ALLOWS_SAME) + keyinfo->write_comp_flag=SEARCH_BIGGER; /* Put after same key */ + else if (keyinfo->flag & ( HA_NOSAME | HA_FULLTEXT)) + { + keyinfo->write_comp_flag= SEARCH_FIND | SEARCH_UPDATE; /* No duplicates */ + if (keyinfo->flag & HA_NULL_ARE_EQUAL) + keyinfo->write_comp_flag|= SEARCH_NULL_ARE_EQUAL; + } + else + keyinfo->write_comp_flag= SEARCH_SAME; /* Keys in rec-pos order */ return; } diff --git a/storage/maria/ma_packrec.c b/storage/maria/ma_packrec.c index 446981d9fb9..08959aca079 100644 --- a/storage/maria/ma_packrec.c +++ b/storage/maria/ma_packrec.c @@ -773,7 +773,7 @@ int _ma_pack_rec_unpack(register MARIA_HA *info, MARIA_BIT_BUFF *bit_buff, { memcpy(to, from, info->s->base.null_bytes); to+= info->s->base.null_bytes; - from+= info->s->base.null_bytes; + from+= info->s->base.null_bytes; reclength-= info->s->base.null_bytes; } init_bit_buffer(bit_buff, (uchar*) from, reclength); diff --git a/storage/maria/ma_page.c b/storage/maria/ma_page.c index a7973af3f57..3493ad544c4 100644 --- a/storage/maria/ma_page.c +++ b/storage/maria/ma_page.c @@ -16,27 +16,40 @@ /* Read and write key blocks */ #include "maria_def.h" +#include "trnman.h" +#include "ma_key_recover.h" - /* Fetch a key-page in memory */ +/* Fetch a key-page in memory */ -uchar *_ma_fetch_keypage(register MARIA_HA *info, MARIA_KEYDEF *keyinfo, - my_off_t page, int level, - uchar *buff, - int return_buffer __attribute__ ((unused))) +uchar *_ma_fetch_keypage(register MARIA_HA *info, + MARIA_KEYDEF *keyinfo __attribute__ ((unused)), + my_off_t page, enum pagecache_page_lock lock, + int level, uchar *buff, + int return_buffer __attribute__ ((unused)), + MARIA_PINNED_PAGE **page_link_res) { uchar *tmp; uint page_size; + uint block_size= info->s->block_size; 
+ MARIA_PINNED_PAGE page_link; DBUG_ENTER("_ma_fetch_keypage"); DBUG_PRINT("enter",("page: %ld", (long) page)); - DBUG_ASSERT(info->s->pagecache->block_size == keyinfo->block_length); - /* - TODO: replace PAGECACHE_PLAIN_PAGE with PAGECACHE_LSN_PAGE when - LSN on the pages will be implemented - */ tmp= pagecache_read(info->s->pagecache, &info->s->kfile, - page / keyinfo->block_length, level, buff, - PAGECACHE_PLAIN_PAGE, PAGECACHE_LOCK_LEFT_UNLOCKED, 0); + page / block_size, level, buff, + info->s->page_type, lock, &page_link.link); + + if (lock != PAGECACHE_LOCK_LEFT_UNLOCKED) + { + DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE); + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + page_link.changed= 0; + push_dynamic(&info->pinned_pages, (void*) &page_link); + *page_link_res= dynamic_element(&info->pinned_pages, + info->pinned_pages.elements-1, + MARIA_PINNED_PAGE *); + } + if (tmp == info->buff) info->keyread_buff_used=1; else if (!tmp) @@ -50,13 +63,13 @@ uchar *_ma_fetch_keypage(register MARIA_HA *info, MARIA_KEYDEF *keyinfo, info->last_keypage=page; #ifdef EXTRA_DEBUG page_size= _ma_get_page_used(info, tmp); - if (page_size < 4 || page_size > keyinfo->block_length || - _ma_get_keynr(info, tmp) > info->s->base.keys) + if (page_size < 4 || page_size > block_size || + _ma_get_keynr(info, tmp) != keyinfo->key_nr) { DBUG_PRINT("error",("page %lu had wrong page length: %u keynr: %u", (ulong) page, page_size, _ma_get_keynr(info, tmp))); - DBUG_DUMP("page", (char*) tmp, keyinfo->block_length); + DBUG_DUMP("page", (char*) tmp, page_size); info->last_keypage = HA_OFFSET_ERROR; maria_print_error(info->s, HA_ERR_CRASHED); my_errno= HA_ERR_CRASHED; @@ -67,16 +80,20 @@ uchar *_ma_fetch_keypage(register MARIA_HA *info, MARIA_KEYDEF *keyinfo, } /* _ma_fetch_keypage */ - /* Write a key-page on disk */ +/* Write a key-page on disk */ int _ma_write_keypage(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, - my_off_t page, int level, uchar *buff) + my_off_t page, enum 
pagecache_page_lock lock, + int level, uchar *buff) { + uint block_size= info->s->block_size; + MARIA_PINNED_PAGE page_link; + int res; DBUG_ENTER("_ma_write_keypage"); #ifdef EXTRA_DEBUG /* Safety check */ if (page < info->s->base.keystart || - page+keyinfo->block_length > info->state->key_file_length || + page+block_size > info->state->key_file_length || (page & (MARIA_MIN_KEY_BLOCK_LENGTH-1))) { DBUG_PRINT("error",("Trying to write inside key status region: " @@ -92,113 +109,214 @@ int _ma_write_keypage(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, #endif /* Verify that keynr is correct */ - DBUG_ASSERT(_ma_get_keynr(info, buff) == - (uint) (keyinfo - info->s->keyinfo)); + DBUG_ASSERT(_ma_get_keynr(info, buff) == keyinfo->key_nr); -#ifdef HAVE_purify +#if defined(EXTRA_DEBUG) && defined(HAVE_purify) { - /* Clear unitialized part of page to avoid valgrind/purify warnings */ - uint length= _ma_get_page_used(info, buff); - bzero(buff+length, keyinfo->block_length-length); - length=keyinfo->block_length; + /* This is here to catch uninitialized bytes */ + ulong crc= my_checksum(0, buff, block_size - KEYPAGE_CHECKSUM_SIZE); + int4store(buff + block_size - KEYPAGE_CHECKSUM_SIZE, crc); } #endif - DBUG_ASSERT(info->s->pagecache->block_size == keyinfo->block_length); +#ifdef IDENTICAL_PAGES_AFTER_RECOVERY + { + uint length= _ma_get_page_used(info, buff); + bzero(buff + length, block_size - length); + } +#endif + DBUG_ASSERT(info->s->pagecache->block_size == block_size); if (!(info->s->options & HA_OPTION_PAGE_CHECKSUM)) - bfill(buff + keyinfo->block_length - KEYPAGE_CHECKSUM_SIZE, + bfill(buff + block_size - KEYPAGE_CHECKSUM_SIZE, KEYPAGE_CHECKSUM_SIZE, (uchar) 255); - /* - TODO: replace PAGECACHE_PLAIN_PAGE with PAGECACHE_LSN_PAGE when - LSN on the pages will be implemented - */ - DBUG_RETURN(pagecache_write(info->s->pagecache, - &info->s->kfile, page / keyinfo->block_length, - level, buff, PAGECACHE_PLAIN_PAGE, - PAGECACHE_LOCK_LEFT_UNLOCKED, - 
PAGECACHE_PIN_LEFT_UNPINNED, - PAGECACHE_WRITE_DELAY, 0)); + res= pagecache_write(info->s->pagecache, + &info->s->kfile, page / block_size, + level, buff, info->s->page_type, + lock, + lock == PAGECACHE_LOCK_LEFT_WRITELOCKED ? + PAGECACHE_PIN_LEFT_PINNED : + PAGECACHE_PIN, + PAGECACHE_WRITE_DELAY, &page_link.link); + + if (lock == PAGECACHE_LOCK_WRITE) + { + /* It was not locked before, we have to unlock it when we unpin pages */ + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + page_link.changed= 1; + push_dynamic(&info->pinned_pages, (void*) &page_link); + } + DBUG_RETURN(res); + } /* maria_write_keypage */ - /* Remove page from disk */ +/* + @brief Put page in free list + + @fn _ma_dispose() + @param info Maria handle + @param pos Address to page + @param page_not_read 1 if page has not yet been read + + @note + The page at 'pos' must have been read with a write lock + + @return + @retval 0 ok + £retval 1 error -int _ma_dispose(register MARIA_HA *info, MARIA_KEYDEF *keyinfo, my_off_t pos, - int level) +*/ + +int _ma_dispose(register MARIA_HA *info, my_off_t pos, my_bool page_not_read) { my_off_t old_link; uchar buff[MAX_KEYPAGE_HEADER_SIZE+8]; - uint offset; - pgcache_page_no_t page_no; + ulonglong page_no; + MARIA_SHARE *share= info->s; + MARIA_PINNED_PAGE page_link; + uint block_size= share->block_size; + int result= 0; + enum pagecache_page_lock lock_method; + enum pagecache_page_pin pin_method; DBUG_ENTER("_ma_dispose"); DBUG_PRINT("enter",("pos: %ld", (long) pos)); + DBUG_ASSERT(pos % block_size == 0); + + (void) _ma_lock_key_del(info, 0); - old_link= info->s->state.key_del; - info->s->state.key_del= pos; - page_no= pos / keyinfo->block_length; - offset= pos % keyinfo->block_length; - bzero(buff, info->s->keypage_header); + old_link= share->state.key_del; + share->state.key_del= pos; + page_no= pos / block_size; + bzero(buff, share->keypage_header); _ma_store_keynr(info, buff, (uchar) MARIA_DELETE_KEY_NR); - mi_sizestore(buff + info->s->keypage_header, 
old_link); - info->s->state.changed|= STATE_NOT_SORTED_PAGES; - - DBUG_ASSERT(info->s->pagecache->block_size == keyinfo->block_length && - info->s->pagecache->block_size == info->s->block_size); - /* - TODO: replace PAGECACHE_PLAIN_PAGE with PAGECACHE_LSN_PAGE when - LSN on the pages will be implemented - */ - DBUG_RETURN(pagecache_write_part(info->s->pagecache, - &info->s->kfile, page_no, level, buff, - PAGECACHE_PLAIN_PAGE, - PAGECACHE_LOCK_LEFT_UNLOCKED, - PAGECACHE_PIN_LEFT_UNPINNED, - PAGECACHE_WRITE_DELAY, 0, - offset, info->s->keypage_header+8, 0, 0)); + mi_sizestore(buff + share->keypage_header, old_link); + share->state.changed|= STATE_NOT_SORTED_PAGES; + + if (info->s->now_transactional) + { + LSN lsn; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2]; + LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 1]; + my_off_t page; + + /* Store address of deleted page */ + page_store(log_data + FILEID_STORE_SIZE, page_no); + + /* Store link to next unused page (the link that is written to page) */ + page= (old_link == HA_OFFSET_ERROR ? 
IMPOSSIBLE_PAGE_NO : + old_link / info->s->block_size); + page_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE, page); + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + + if (translog_write_record(&lsn, LOGREC_REDO_INDEX_FREE_PAGE, + info->trn, info, sizeof(log_data), + TRANSLOG_INTERNAL_PARTS + 1, log_array, + log_data, NULL)) + result= 1; + } + + if (page_not_read) + { + lock_method= PAGECACHE_LOCK_WRITE; + pin_method= PAGECACHE_PIN; + } + else + { + lock_method= PAGECACHE_LOCK_LEFT_WRITELOCKED; + pin_method= PAGECACHE_PIN_LEFT_PINNED; + } + + if (pagecache_write_part(share->pagecache, + &share->kfile, (pgcache_page_no_t) page_no, + PAGECACHE_PRIORITY_LOW, buff, + share->page_type, + lock_method, pin_method, + PAGECACHE_WRITE_DELAY, &page_link.link, + 0, share->keypage_header+8, 0, 0)) + result= 1; + + if (page_not_read) + { + /* It was not locked before, we have to unlock it when we unpin pages */ + page_link.unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + page_link.changed= 1; + push_dynamic(&info->pinned_pages, (void*) &page_link); + } + + DBUG_RETURN(result); } /* _ma_dispose */ -/* Make new page on disk */ +/** + @brief Get address for free page to use + + @fn _ma_new() + @param info Maria handle + @param level Type of key block (caching priority for pagecache) + @param page_link Pointer to page in page cache if read. 
One can + check if this is used by checking if + page_link->changed != 0 + + @return + HA_OFFSET_ERROR File is full or page read error + # Page address to use +*/ + +my_off_t _ma_new(register MARIA_HA *info, int level, + MARIA_PINNED_PAGE **page_link) -my_off_t _ma_new(register MARIA_HA *info, MARIA_KEYDEF *keyinfo, int level) { my_off_t pos; - uchar *buff; + MARIA_SHARE *share= info->s; + uint block_size= share->block_size; DBUG_ENTER("_ma_new"); - if ((pos= info->s->state.key_del) == HA_OFFSET_ERROR) + if (_ma_lock_key_del(info, 1)) { if (info->state->key_file_length >= - info->s->base.max_key_file_length - keyinfo->block_length) + share->base.max_key_file_length - block_size) { my_errno=HA_ERR_INDEX_FILE_FULL; DBUG_RETURN(HA_OFFSET_ERROR); } - pos=info->state->key_file_length; - info->state->key_file_length+= keyinfo->block_length; + pos= info->state->key_file_length; + info->state->key_file_length+= block_size; + (*page_link)->changed= 0; + (*page_link)->write_lock= PAGECACHE_LOCK_WRITE; } else { - /* QQ: Remove this alloc (We don't have to copy the page) */ - buff= my_alloca(info->s->block_size); - DBUG_ASSERT(info->s->pagecache->block_size == keyinfo->block_length && - info->s->pagecache->block_size == info->s->block_size); + uchar *buff; /* TODO: replace PAGECACHE_PLAIN_PAGE with PAGECACHE_LSN_PAGE when LSN on the pages will be implemented */ - DBUG_ASSERT(info->s->pagecache->block_size == keyinfo->block_length); - if (!pagecache_read(info->s->pagecache, - &info->s->kfile, pos / keyinfo->block_length, level, - buff, PAGECACHE_PLAIN_PAGE, - PAGECACHE_LOCK_LEFT_UNLOCKED, 0)) + pos= info->s->state.key_del; /* Protected */ + DBUG_ASSERT(share->pagecache->block_size == block_size); + if (!(buff= pagecache_read(share->pagecache, + &share->kfile, pos / block_size, level, + 0, share->page_type, + PAGECACHE_LOCK_WRITE, &(*page_link)->link))) pos= HA_OFFSET_ERROR; else - info->s->state.key_del= mi_sizekorr(buff+info->s->keypage_header); - my_afree(buff); + { + 
share->current_key_del= mi_sizekorr(buff+share->keypage_header); + DBUG_ASSERT(share->current_key_del != info->s->state.key_del && + share->current_key_del); + } + + (*page_link)->unlock= PAGECACHE_LOCK_WRITE_UNLOCK; + (*page_link)->write_lock= PAGECACHE_LOCK_WRITE; + (*page_link)->changed= 0; + push_dynamic(&info->pinned_pages, (void*) &page_link); + *page_link= dynamic_element(&info->pinned_pages, + info->pinned_pages.elements-1, + MARIA_PINNED_PAGE *); } - info->s->state.changed|= STATE_NOT_SORTED_PAGES; + share->state.changed|= STATE_NOT_SORTED_PAGES; DBUG_PRINT("exit",("Pos: %ld",(long) pos)); DBUG_RETURN(pos); } /* _ma_new */ diff --git a/storage/maria/ma_pagecache.c b/storage/maria/ma_pagecache.c index 842acd2f0e8..4e468c380db 100755 --- a/storage/maria/ma_pagecache.c +++ b/storage/maria/ma_pagecache.c @@ -156,6 +156,7 @@ struct st_pagecache_hash_link #define PCBLOCK_REASSIGNED 8 /* block does not accept requests for old page */ #define PCBLOCK_IN_FLUSH 16 /* block is in flush operation */ #define PCBLOCK_CHANGED 32 /* block buffer contains a dirty page */ +#define PCBLOCK_DIRECT_W 64 /* possible direct write to the block */ /* page status, returned by find_block */ #define PAGE_READ 0 @@ -463,7 +464,7 @@ error: DBUG_RETURN(1); } #endif /* NOT_USED */ -#endif /* !DBUG_OFF */ +#endif /* !DBUG_OFF */ #define FLUSH_CACHE 2000 /* sort this many blocks at once */ @@ -719,6 +720,11 @@ ulong init_pagecache(PAGECACHE *pagecache, size_t use_mem, { for ( ; ; ) { + if (blocks < 8) + { + my_errno= ENOMEM; + goto err; + } /* Set my_hash_entries to the next bigger 2 power */ if ((pagecache->hash_entries= next_power(blocks)) < (blocks) * 5/4) @@ -749,11 +755,6 @@ ulong init_pagecache(PAGECACHE *pagecache, size_t use_mem, my_large_free(pagecache->block_mem, MYF(0)); pagecache->block_mem= 0; } - if (blocks < 8) - { - my_errno= ENOMEM; - goto err; - } blocks= blocks / 4*3; } pagecache->blocks_unused= blocks; @@ -1082,7 +1083,8 @@ void end_pagecache(PAGECACHE *pagecache, 
my_bool cleanup) DBUG_PRINT("status", ("used: %lu changed: %lu w_requests: %lu " "writes: %lu r_requests: %lu reads: %lu", - pagecache->blocks_used, pagecache->global_blocks_changed, + pagecache->blocks_used, + pagecache->global_blocks_changed, (ulong) pagecache->global_cache_w_requests, (ulong) pagecache->global_cache_write, (ulong) pagecache->global_cache_r_requests, @@ -1132,9 +1134,9 @@ static inline void link_changed(PAGECACHE_BLOCK_LINK *block, static void link_to_file_list(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block, - PAGECACHE_FILE *file, my_bool unlink) + PAGECACHE_FILE *file, my_bool unlink_flag) { - if (unlink) + if (unlink_flag) unlink_changed(block); link_changed(block, &pagecache->file_blocks[FILE_HASH(*file)]); if (block->status & PCBLOCK_CHANGED) @@ -2503,22 +2505,22 @@ static void check_and_set_lsn(PAGECACHE *pagecache, } -/* - Unlock/unpin page and put LSN stamp if it need - - SYNOPSIS - pagecache_unlock() - pagecache pointer to a page cache data structure - file handler for the file for the block of data to be read - pageno number of the block of data in the file - lock lock change - pin pin page - first_REDO_LSN_for_page do not set it if it is zero - lsn if it is not LSN_IMPOSSIBLE (0) and it +/** + @brief Unlock/unpin page and put LSN stamp if it need + + @param pagecache pointer to a page cache data structure + @param file handler for the file for the block of data to be read + @param pageno number of the block of data in the file + @param lock lock change + @param pin pin page + @param first_REDO_LSN_for_page do not set it if it is zero + @param lsn if it is not LSN_IMPOSSIBLE (0) and it is bigger then LSN on the page it will be written on the page + @param was_changed should be true if the page was write locked with + direct link giving and the page was changed - NOTE + @note Pininig uses requests registration mechanism it works following way: | beginnig | ending | | of func. | of func. 
| @@ -2537,7 +2539,7 @@ void pagecache_unlock(PAGECACHE *pagecache, enum pagecache_page_lock lock, enum pagecache_page_pin pin, LSN first_REDO_LSN_for_page, - LSN lsn) + LSN lsn, my_bool was_changed) { PAGECACHE_BLOCK_LINK *block; int page_st; @@ -2578,6 +2580,26 @@ void pagecache_unlock(PAGECACHE *pagecache, if (lsn != LSN_IMPOSSIBLE) check_and_set_lsn(pagecache, lsn, block); + /* if we lock for write we must link the block to changed blocks */ + DBUG_ASSERT((block->status & PCBLOCK_DIRECT_W) == 0 || + (lock == PAGECACHE_LOCK_WRITE_UNLOCK || + lock == PAGECACHE_LOCK_LEFT_WRITELOCKED)); + /* + if was_changed then status should be PCBLOCK_DIRECT_W or marked + as dirty + */ + DBUG_ASSERT(!was_changed || (block->status & PCBLOCK_DIRECT_W) || + (block->status & PCBLOCK_CHANGED)); + if ((block->status & PCBLOCK_DIRECT_W) && + (lock == PAGECACHE_LOCK_WRITE_UNLOCK)) + { + if (!(block->status & PCBLOCK_CHANGED) && was_changed) + link_to_changed_list(pagecache, block); + block->status&= ~PCBLOCK_DIRECT_W; + DBUG_PRINT("info", ("Drop PCBLOCK_DIRECT_W for block: 0x%lx", + (ulong) block)); + } + if (make_lock_and_pin(pagecache, block, lock, pin, file)) { DBUG_ASSERT(0); /* should not happend */ @@ -2635,6 +2657,8 @@ void pagecache_unpin(PAGECACHE *pagecache, block= find_block(pagecache, file, pageno, 0, 0, 0, &page_st); DBUG_ASSERT(block != 0); DBUG_ASSERT(page_st == PAGE_READ); + /* we can't unpin such page without unlock */ + DBUG_ASSERT((block->status & PCBLOCK_DIRECT_W) == 0); if (lsn != LSN_IMPOSSIBLE) check_and_set_lsn(pagecache, lsn, block); @@ -2665,19 +2689,19 @@ void pagecache_unpin(PAGECACHE *pagecache, } -/* - Unlock/unpin page and put LSN stamp if it need +/** + @brief Unlock/unpin page and put LSN stamp if it need (uses direct block/page pointer) - SYNOPSIS - pagecache_unlock_by_link() - pagecache pointer to a page cache data structure - link direct link to page (returned by read or write) - lock lock change - pin pin page - first_REDO_LSN_for_page do not set it 
if it is LSN_IMPOSSIBLE (0) - lsn if it is not LSN_IMPOSSIBLE and it is bigger then + @param pagecache pointer to a page cache data structure + @param link direct link to page (returned by read or write) + @param lock lock change + @param pin pin page + @param first_REDO_LSN_for_page do not set it if it is LSN_IMPOSSIBLE (0) + @param lsn if it is not LSN_IMPOSSIBLE and it is bigger then LSN on the page it will be written on the page + @param was_changed should be true if the page was write locked with + direct link giving and the page was changed */ void pagecache_unlock_by_link(PAGECACHE *pagecache, @@ -2685,7 +2709,7 @@ void pagecache_unlock_by_link(PAGECACHE *pagecache, enum pagecache_page_lock lock, enum pagecache_page_pin pin, LSN first_REDO_LSN_for_page, - LSN lsn) + LSN lsn, my_bool was_changed) { DBUG_ENTER("pagecache_unlock_by_link"); DBUG_PRINT("enter", ("block: 0x%lx fd: %u page: %lu %s %s", @@ -2719,24 +2743,48 @@ void pagecache_unlock_by_link(PAGECACHE *pagecache, DBUG_ASSERT(pagecache->can_be_used); inc_counter_for_resize_op(pagecache); - if (first_REDO_LSN_for_page != LSN_IMPOSSIBLE) + if (was_changed) { - /* - LOCK_READ_UNLOCK is ok here as the page may have first locked - with WRITE lock that was temporarly converted to READ lock before - it's unpinned - */ - DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE_UNLOCK || - lock == PAGECACHE_LOCK_READ_UNLOCK); - DBUG_ASSERT(pin == PAGECACHE_UNPIN); - if (block->rec_lsn == LSN_MAX) - block->rec_lsn= first_REDO_LSN_for_page; - else - DBUG_ASSERT(cmp_translog_addr(block->rec_lsn, - first_REDO_LSN_for_page) <= 0); + if (first_REDO_LSN_for_page != LSN_IMPOSSIBLE) + { + /* + LOCK_READ_UNLOCK is ok here as the page may have first locked + with WRITE lock that was temporarly converted to READ lock before + it's unpinned + */ + DBUG_ASSERT(lock == PAGECACHE_LOCK_WRITE_UNLOCK || + lock == PAGECACHE_LOCK_READ_UNLOCK); + DBUG_ASSERT(pin == PAGECACHE_UNPIN); + if (block->rec_lsn == LSN_MAX) + block->rec_lsn= 
first_REDO_LSN_for_page; + else + DBUG_ASSERT(cmp_translog_addr(block->rec_lsn, + first_REDO_LSN_for_page) <= 0); + } + if (lsn != LSN_IMPOSSIBLE) + check_and_set_lsn(pagecache, lsn, block); + } + + /* if we lock for write we must link the block to changed blocks */ + DBUG_ASSERT((block->status & PCBLOCK_DIRECT_W) == 0 || + (lock == PAGECACHE_LOCK_WRITE_UNLOCK || + lock == PAGECACHE_LOCK_LEFT_WRITELOCKED)); + /* + If was_changed then status should be PCBLOCK_DIRECT_W or marked + as dirty + */ + DBUG_ASSERT(!was_changed || (block->status & PCBLOCK_DIRECT_W) || + (block->status & PCBLOCK_CHANGED)); + if ((block->status & PCBLOCK_DIRECT_W) && + (lock == PAGECACHE_LOCK_WRITE_UNLOCK)) + { + if (!(block->status & PCBLOCK_CHANGED) && was_changed) + link_to_changed_list(pagecache, block); + block->status&= ~PCBLOCK_DIRECT_W; + DBUG_PRINT("info", ("Drop PCBLOCK_DIRECT_W for block: 0x%lx", + (ulong) block)); + } - if (lsn != LSN_IMPOSSIBLE) - check_and_set_lsn(pagecache, lsn, block); if (make_lock_and_pin(pagecache, block, lock, pin, 0)) DBUG_ASSERT(0); /* should not happend */ @@ -2786,6 +2834,8 @@ void pagecache_unpin_by_link(PAGECACHE *pagecache, unlock. 
*/ DBUG_ASSERT(pagecache->can_be_used); + /* we can't unpin such page without unlock */ + DBUG_ASSERT((block->status & PCBLOCK_DIRECT_W) == 0); inc_counter_for_resize_op(pagecache); @@ -2869,7 +2919,7 @@ uchar *pagecache_valid_read(PAGECACHE *pagecache, uchar *buff, enum pagecache_page_type type, enum pagecache_page_lock lock, - PAGECACHE_BLOCK_LINK **link, + PAGECACHE_BLOCK_LINK **page_link, pagecache_disk_read_validator validator, uchar* validator_data) { @@ -2887,9 +2937,9 @@ uchar *pagecache_valid_read(PAGECACHE *pagecache, DBUG_ASSERT(buff != 0 || (buff == 0 && (pin == PAGECACHE_PIN || pin == PAGECACHE_PIN_LEFT_PINNED))); - if (!link) - link= &fake_link; - *link= 0; /* Catch errors */ + if (!page_link) + page_link= &fake_link; + *page_link= 0; /* Catch errors */ restart: @@ -2932,6 +2982,11 @@ restart: validator, validator_data); DBUG_PRINT("info", ("read is done")); } + + /* PCBLOCK_DIRECT_W should be unlocked in unlock */ + DBUG_ASSERT((block->status & PCBLOCK_DIRECT_W) == 0 || + lock == PAGECACHE_LOCK_LEFT_WRITELOCKED); + if (make_lock_and_pin(pagecache, block, lock, pin, file)) { /* @@ -2947,11 +3002,15 @@ restart: if (!buff) { buff= block->buffer; - /* if we lock for write we must link the block to changed blocks */ + /* possibly we will write here (resolved on unlock) */ if ((lock == PAGECACHE_LOCK_WRITE || lock == PAGECACHE_LOCK_LEFT_WRITELOCKED) && !(block->status & PCBLOCK_CHANGED)) - link_to_changed_list(pagecache, block); + { + block->status|= PCBLOCK_DIRECT_W; + DBUG_PRINT("info", ("Set PCBLOCK_DIRECT_W for block: 0x%lx", + (ulong) block)); + } } else { @@ -2980,7 +3039,7 @@ restart: if (pin == PAGECACHE_PIN_LEFT_UNPINNED || pin == PAGECACHE_UNPIN) unreg_request(pagecache, block, 1); else - *link= block; + *page_link= block; dec_counter_for_resize_op(pagecache); @@ -3039,28 +3098,27 @@ my_bool pagecache_delete(PAGECACHE *pagecache, lock == PAGECACHE_LOCK_LEFT_WRITELOCKED); DBUG_ASSERT(pin == PAGECACHE_PIN || pin == PAGECACHE_PIN_LEFT_PINNED); - 
restart: if (pagecache->can_be_used) { /* Key cache is used */ reg1 PAGECACHE_BLOCK_LINK *block; - PAGECACHE_HASH_LINK **unused_start, *link; + PAGECACHE_HASH_LINK **unused_start, *page_link; pagecache_pthread_mutex_lock(&pagecache->cache_lock); if (!pagecache->can_be_used) goto end; inc_counter_for_resize_op(pagecache); - link= get_present_hash_link(pagecache, file, pageno, &unused_start); - if (!link) + page_link= get_present_hash_link(pagecache, file, pageno, &unused_start); + if (!page_link) { DBUG_PRINT("info", ("There is no such page in the cache")); pagecache_pthread_mutex_unlock(&pagecache->cache_lock); DBUG_RETURN(0); } - block= link->block; + block= page_link->block; /* See NOTE for pagecache_unlock about registering requests. */ if (pin == PAGECACHE_PIN) reg_requests(pagecache, block, 1); @@ -3076,6 +3134,9 @@ restart: goto restart; } + /* we can't delete with opened direct link for write */ + DBUG_ASSERT((block->status & PCBLOCK_DIRECT_W) == 0); + if (block->status & PCBLOCK_CHANGED) { if (flush) @@ -3116,8 +3177,8 @@ restart: make_lock_and_pin(pagecache, block, PAGECACHE_LOCK_WRITE_UNLOCK, PAGECACHE_UNPIN, file); - DBUG_ASSERT(link->requests > 0); - link->requests--; + DBUG_ASSERT(page_link->requests > 0); + page_link->requests--; /* See NOTE for pagecache_unlock about registering requests. 
*/ free_block(pagecache, block); @@ -3234,7 +3295,7 @@ my_bool pagecache_write_part(PAGECACHE *pagecache, enum pagecache_page_lock lock, enum pagecache_page_pin pin, enum pagecache_write_mode write_mode, - PAGECACHE_BLOCK_LINK **link, + PAGECACHE_BLOCK_LINK **page_link, uint offset, uint size, pagecache_disk_read_validator validator, uchar* validator_data) @@ -3257,9 +3318,9 @@ my_bool pagecache_write_part(PAGECACHE *pagecache, DBUG_ASSERT(lock != PAGECACHE_LOCK_READ_UNLOCK); DBUG_ASSERT(offset + size <= pagecache->block_size); - if (!link) - link= &fake_link; - *link= 0; + if (!page_link) + page_link= &fake_link; + *page_link= 0; restart: @@ -3307,6 +3368,10 @@ restart: (block->type == PAGECACHE_PLAIN_PAGE && type == PAGECACHE_LSN_PAGE)); block->type= type; + /* we write to the page so it has no sense to keep the flag */ + block->status&= ~PCBLOCK_DIRECT_W; + DBUG_PRINT("info", ("Drop PCBLOCK_DIRECT_W for block: 0x%lx", + (ulong) block)); if (make_lock_and_pin(pagecache, block, write_lock_change_table[lock].new_lock, @@ -3384,7 +3449,7 @@ restart: if (pin == PAGECACHE_PIN_LEFT_UNPINNED || pin == PAGECACHE_UNPIN) unreg_request(pagecache, block, 1); else - *link= block; + *page_link= block; if (block->status & PCBLOCK_ERROR) error= 1; @@ -3629,8 +3694,10 @@ static int flush_pagecache_blocks_int(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *cache_buff[FLUSH_CACHE],**cache; int last_errno= 0; DBUG_ENTER("flush_pagecache_blocks_int"); - DBUG_PRINT("enter",("file: %d blocks_used: %lu blocks_changed: %lu", - file->file, pagecache->blocks_used, pagecache->blocks_changed)); + DBUG_PRINT("enter", + ("file: %d blocks_used: %lu blocks_changed: %lu type: %d", + file->file, pagecache->blocks_used, pagecache->blocks_changed, + type)); #if !defined(DBUG_OFF) && defined(EXTRA_DEBUG) DBUG_EXECUTE("check_pagecache", @@ -4044,7 +4111,7 @@ my_bool pagecache_collect_changed_blocks_with_lsn(PAGECACHE *pagecache, wqueue_add_to_queue(&other_flusher->flush_queue, thread); do { - 
KEYCACHE_DBUG_PRINT("pagecache_collect_çhanged_blocks_with_lsn: wait", + KEYCACHE_DBUG_PRINT("pagecache_collect_çhanged_blocks_with_lsn: wait", ("suspend thread %ld", thread->id)); pagecache_pthread_cond_wait(&thread->suspend, &pagecache->cache_lock); @@ -4311,7 +4378,7 @@ static int pagecache_pthread_cond_wait(pthread_cond_t *cond, #endif #endif /* defined(PAGECACHE_TIMEOUT) && !defined(__WIN__) */ -#if defined(PAGECACHE_DEBUG) +#if defined(PAGECACHE_DEBUG) static int ___pagecache_pthread_mutex_lock(pthread_mutex_t *mutex) { int rc; diff --git a/storage/maria/ma_pagecache.h b/storage/maria/ma_pagecache.h index 64935a0fa36..59f4803c913 100644 --- a/storage/maria/ma_pagecache.h +++ b/storage/maria/ma_pagecache.h @@ -97,6 +97,9 @@ typedef struct st_pagecache_hash_link PAGECACHE_HASH_LINK; typedef my_bool (*pagecache_disk_read_validator)(uchar *page, uchar *data); #define PAGECACHE_CHANGED_BLOCKS_HASH 128 /* must be power of 2 */ +#define PAGECACHE_PRIORITY_LOW 0 +#define PAGECACHE_PRIORITY_DEFAULT 3 +#define PAGECACHE_PRIORITY_HIGH 6 /* The page cache structure @@ -228,13 +231,13 @@ extern void pagecache_unlock(PAGECACHE *pagecache, enum pagecache_page_lock lock, enum pagecache_page_pin pin, LSN first_REDO_LSN_for_page, - LSN lsn); + LSN lsn, my_bool was_changed); extern void pagecache_unlock_by_link(PAGECACHE *pagecache, PAGECACHE_BLOCK_LINK *block, enum pagecache_page_lock lock, enum pagecache_page_pin pin, LSN first_REDO_LSN_for_page, - LSN lsn); + LSN lsn, my_bool was_changed); extern void pagecache_unpin(PAGECACHE *pagecache, PAGECACHE_FILE *file, pgcache_page_no_t pageno, diff --git a/storage/maria/ma_panic.c b/storage/maria/ma_panic.c index 0394f630343..ceabcd991aa 100644 --- a/storage/maria/ma_panic.c +++ b/storage/maria/ma_panic.c @@ -30,7 +30,7 @@ locked. 
A maria_readinfo() is done for all single user files to get changes in database - + RETURN 0 ok # error number in case of error diff --git a/storage/maria/ma_range.c b/storage/maria/ma_range.c index 8ef6dc87c0c..2fd172793ba 100644 --- a/storage/maria/ma_range.c +++ b/storage/maria/ma_range.c @@ -211,7 +211,9 @@ static double _ma_search_pos(register MARIA_HA *info, if (pos == HA_OFFSET_ERROR) DBUG_RETURN(0.5); - if (!(buff= _ma_fetch_keypage(info,keyinfo,pos,DFLT_INIT_HITS,info->buff,1))) + if (!(buff= _ma_fetch_keypage(info,keyinfo, pos, + PAGECACHE_LOCK_LEFT_UNLOCKED, DFLT_INIT_HITS, + info->buff, 1, 0))) goto err; flag=(*keyinfo->bin_search)(info, keyinfo, buff, key, key_len, nextflag, &keypos,info->lastkey, &after_key); diff --git a/storage/maria/ma_recovery.c b/storage/maria/ma_recovery.c index 4d2c8dc7fdd..5c7bd4a9f65 100644 --- a/storage/maria/ma_recovery.c +++ b/storage/maria/ma_recovery.c @@ -23,8 +23,9 @@ #include "maria_def.h" #include "ma_recovery.h" #include "ma_blockrec.h" -#include "trnman.h" #include "ma_checkpoint.h" +#include "trnman.h" +#include "ma_key_recover.h" struct st_trn_for_recovery /* used only in the REDO phase */ { @@ -81,14 +82,23 @@ prototype_redo_exec_hook(REDO_PURGE_ROW_TAIL); prototype_redo_exec_hook(REDO_FREE_HEAD_OR_TAIL); prototype_redo_exec_hook(REDO_FREE_BLOCKS); prototype_redo_exec_hook(REDO_DELETE_ALL); +prototype_redo_exec_hook(REDO_INDEX); +prototype_redo_exec_hook(REDO_INDEX_NEW_PAGE); +prototype_redo_exec_hook(REDO_INDEX_FREE_PAGE); prototype_redo_exec_hook(UNDO_ROW_INSERT); prototype_redo_exec_hook(UNDO_ROW_DELETE); prototype_redo_exec_hook(UNDO_ROW_UPDATE); +prototype_redo_exec_hook(UNDO_KEY_INSERT); +prototype_redo_exec_hook(UNDO_KEY_DELETE); +prototype_redo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT); prototype_redo_exec_hook(COMMIT); prototype_redo_exec_hook(CLR_END); prototype_undo_exec_hook(UNDO_ROW_INSERT); prototype_undo_exec_hook(UNDO_ROW_DELETE); prototype_undo_exec_hook(UNDO_ROW_UPDATE); 
+prototype_undo_exec_hook(UNDO_KEY_INSERT); +prototype_undo_exec_hook(UNDO_KEY_DELETE); +prototype_undo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT); static int run_redo_phase(LSN lsn, enum maria_apply_log_way apply); static uint end_of_redo_phase(my_bool prepare_for_undo_phase); @@ -879,7 +889,7 @@ prototype_redo_exec_hook(REDO_DROP_TABLE) { char *name; int error= 1; - MARIA_HA *info= NULL; + MARIA_HA *info; if (skip_DDLs) { tprint(tracef, "we skip DDLs\n"); @@ -892,7 +902,7 @@ prototype_redo_exec_hook(REDO_DROP_TABLE) rec->record_length) { tprint(tracef, "Failed to read record\n"); - goto end; + return 1; } name= log_record_buffer.str; tprint(tracef, "Table '%s'", name); @@ -1005,6 +1015,7 @@ static int new_table(uint16 sid, const char *name, */ int error= 1; MARIA_HA *info; + MARIA_SHARE *share; checkpoint_useful= TRUE; if ((name == NULL) || (name[0] == 0)) @@ -1032,7 +1043,7 @@ static int new_table(uint16 sid, const char *name, tprint(tracef, "Table is crashed, can't apply log records to it\n"); goto end; } - MARIA_SHARE *share= info->s; + share= info->s; /* check that we're not already using it */ if (share->reopen != 1) { @@ -1320,6 +1331,75 @@ end: } +prototype_redo_exec_hook(REDO_INDEX) +{ + int error= 1; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL) + return 0; + enlarge_buffer(rec); + + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + tprint(tracef, "Failed to read record\n"); + goto end; + } + + if (_ma_apply_redo_index(info, current_group_end_lsn, + log_record_buffer.str + FILEID_STORE_SIZE, + rec->record_length - FILEID_STORE_SIZE)) + goto end; + error= 0; +end: + return error; +} + +prototype_redo_exec_hook(REDO_INDEX_NEW_PAGE) +{ + int error= 1; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL) + return 0; + enlarge_buffer(rec); + + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 
0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + tprint(tracef, "Failed to read record\n"); + goto end; + } + + if (_ma_apply_redo_index_new_page(info, current_group_end_lsn, + log_record_buffer.str + FILEID_STORE_SIZE, + rec->record_length - FILEID_STORE_SIZE)) + goto end; + error= 0; +end: + return error; +} + + +prototype_redo_exec_hook(REDO_INDEX_FREE_PAGE) +{ + int error= 1; + MARIA_HA *info= get_MARIA_HA_from_REDO_record(rec); + if (info == NULL) + return 0; + + if (_ma_apply_redo_index_free_page(info, current_group_end_lsn, + rec->header + FILEID_STORE_SIZE)) + goto end; + error= 0; +end: + return error; +} + + #define set_undo_lsn_for_active_trans(TRID, LSN) do { \ all_active_trans[TRID].undo_lsn= LSN; \ if (all_active_trans[TRID].first_undo_lsn == LSN_IMPOSSIBLE) \ @@ -1328,9 +1408,11 @@ end: prototype_redo_exec_hook(UNDO_ROW_INSERT) { MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + MARIA_SHARE *share; + if (info == NULL) return 0; - MARIA_SHARE *share= info->s; + share= info->s; set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn); if (cmp_translog_addr(rec->lsn, share->state.is_of_horizon) >= 0) { @@ -1353,8 +1435,7 @@ prototype_redo_exec_hook(UNDO_ROW_INSERT) @todo some bits below will rather be set when executing UNDOs related to keys */ - info->s->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED | - STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES; + info->s->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED; } tprint(tracef, " rows' count %lu\n", (ulong)info->s->state.state.records); return 0; @@ -1364,9 +1445,11 @@ prototype_redo_exec_hook(UNDO_ROW_INSERT) prototype_redo_exec_hook(UNDO_ROW_DELETE) { MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + MARIA_SHARE *share; + if (info == NULL) return 0; - MARIA_SHARE *share= info->s; + share= info->s; set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn); if (cmp_translog_addr(rec->lsn, share->state.is_of_horizon) >= 0) { @@ -1385,8 +1468,7 @@ 
prototype_redo_exec_hook(UNDO_ROW_DELETE) } share->state.state.checksum+= ha_checksum_korr(buff); } - share->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED | - STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES; + share->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED; } tprint(tracef, " rows' count %lu\n", (ulong)share->state.state.records); return 0; @@ -1396,9 +1478,10 @@ prototype_redo_exec_hook(UNDO_ROW_DELETE) prototype_redo_exec_hook(UNDO_ROW_UPDATE) { MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + MARIA_SHARE *share; if (info == NULL) return 0; - MARIA_SHARE *share= info->s; + share= info->s; set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn); if (cmp_translog_addr(rec->lsn, share->state.is_of_horizon) >= 0) { @@ -1415,8 +1498,41 @@ prototype_redo_exec_hook(UNDO_ROW_UPDATE) } share->state.state.checksum+= ha_checksum_korr(buff); } - share->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED | - STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES; + share->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED; + } + return 0; +} + +prototype_redo_exec_hook(UNDO_KEY_INSERT) +{ + set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn); + return 0; +} + +prototype_redo_exec_hook(UNDO_KEY_DELETE) +{ + set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn); + return 0; +} + +prototype_redo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT) +{ + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + MARIA_SHARE *share; + if (info == NULL) + return 0; + share= info->s; + set_undo_lsn_for_active_trans(rec->short_trid, rec->lsn); + if (cmp_translog_addr(rec->lsn, share->state.is_of_horizon) >= 0) + { + uint key_nr; + my_off_t page; + page= page_korr(rec->header + LSN_STORE_SIZE + FILEID_STORE_SIZE); + key_nr= key_nr_korr(rec->header + LSN_STORE_SIZE + FILEID_STORE_SIZE + + PAGE_STORE_SIZE); + share->state.key_root[key_nr]= (page == IMPOSSIBLE_PAGE_NO ? 
+ HA_OFFSET_ERROR : + page * share->block_size); } return 0; } @@ -1464,13 +1580,18 @@ prototype_redo_exec_hook(COMMIT) prototype_redo_exec_hook(CLR_END) { MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + MARIA_SHARE *share; + LSN previous_undo_lsn; + enum translog_record_type undone_record_type; + const LOG_DESC *log_desc; + if (info == NULL) return 0; - MARIA_SHARE *share= info->s; - LSN previous_undo_lsn= lsn_korr(rec->header); - enum translog_record_type undone_record_type= + share= info->s; + previous_undo_lsn= lsn_korr(rec->header); + undone_record_type= clr_type_korr(rec->header + LSN_STORE_SIZE + FILEID_STORE_SIZE); - const LOG_DESC *log_desc= &log_record_type_descriptor[undone_record_type]; + log_desc= &log_record_type_descriptor[undone_record_type]; set_undo_lsn_for_active_trans(rec->short_trid, previous_undo_lsn); tprint(tracef, " CLR_END was about %s, undo_lsn now LSN (%lu,0x%lx)\n", @@ -1502,8 +1623,7 @@ prototype_redo_exec_hook(CLR_END) default: DBUG_ASSERT(0); } - share->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED | - STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES; + share->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED; } tprint(tracef, " rows' count %lu\n", (ulong)share->state.state.records); return 0; @@ -1515,6 +1635,8 @@ prototype_undo_exec_hook(UNDO_ROW_INSERT) my_bool error; MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); LSN previous_undo_lsn= lsn_korr(rec->header); + MARIA_SHARE *share; + const uchar *record_ptr; if (info == NULL) { @@ -1526,11 +1648,10 @@ prototype_undo_exec_hook(UNDO_ROW_INSERT) */ return 1; } - MARIA_SHARE *share= info->s; - share->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED | - STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES; + share= info->s; + share->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED; - const uchar *record_ptr= rec->header; + record_ptr= rec->header; if (share->calc_checksum) { /* @@ -1568,14 +1689,13 @@ prototype_undo_exec_hook(UNDO_ROW_DELETE) my_bool error; MARIA_HA 
*info= get_MARIA_HA_from_UNDO_record(rec); LSN previous_undo_lsn= lsn_korr(rec->header); + MARIA_SHARE *share; if (info == NULL) return 1; - MARIA_SHARE *share= info->s; - share->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED | - STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES; - + share= info->s; + share->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED; enlarge_buffer(rec); if (log_record_buffer.str == NULL || translog_read_record(rec->lsn, 0, rec->record_length, @@ -1610,12 +1730,13 @@ prototype_undo_exec_hook(UNDO_ROW_UPDATE) my_bool error; MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); LSN previous_undo_lsn= lsn_korr(rec->header); + MARIA_SHARE *share; if (info == NULL) return 1; - MARIA_SHARE *share= info->s; - share->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED | - STATE_NOT_OPTIMIZED_KEYS | STATE_NOT_SORTED_PAGES; + + share= info->s; + share->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED; enlarge_buffer(rec); if (log_record_buffer.str == NULL || @@ -1640,6 +1761,142 @@ prototype_undo_exec_hook(UNDO_ROW_UPDATE) } +prototype_undo_exec_hook(UNDO_KEY_INSERT) +{ + my_bool error; + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + LSN previous_undo_lsn= lsn_korr(rec->header); + MARIA_SHARE *share; + + if (info == NULL) + { + /* + Unlike for REDOs, if the table was skipped it is abnormal; we have a + transaction to rollback which used this table, as it is not rolled back + it was supposed to hold this table and so the table should still be + there. 
+ */ + return 1; + } + + share= info->s; + share->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED; + + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + tprint(tracef, "Failed to read record\n"); + return 1; + } + + info->trn= trn; + error= _ma_apply_undo_key_insert(info, previous_undo_lsn, + log_record_buffer.str + LSN_STORE_SIZE + + FILEID_STORE_SIZE, + rec->record_length - LSN_STORE_SIZE - + FILEID_STORE_SIZE); + info->trn= 0; + /* trn->undo_lsn is updated in an inwrite_hook when writing the CLR_END */ + tprint(tracef, " undo_lsn now LSN (%lu,0x%lx)\n", + LSN_IN_PARTS(previous_undo_lsn)); + return error; +} + + +prototype_undo_exec_hook(UNDO_KEY_DELETE) +{ + my_bool error; + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + LSN previous_undo_lsn= lsn_korr(rec->header); + MARIA_SHARE *share; + + if (info == NULL) + { + /* + Unlike for REDOs, if the table was skipped it is abnormal; we have a + transaction to rollback which used this table, as it is not rolled back + it was supposed to hold this table and so the table should still be + there. 
+ */ + return 1; + } + + share= info->s; + share->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED; + + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + tprint(tracef, "Failed to read record\n"); + return 1; + } + + info->trn= trn; + error= _ma_apply_undo_key_delete(info, previous_undo_lsn, + log_record_buffer.str + LSN_STORE_SIZE + + FILEID_STORE_SIZE, + rec->record_length - LSN_STORE_SIZE - + FILEID_STORE_SIZE); + info->trn= 0; + /* trn->undo_lsn is updated in an inwrite_hook when writing the CLR_END */ + tprint(tracef, " undo_lsn now LSN (%lu,0x%lx)\n", + LSN_IN_PARTS(previous_undo_lsn)); + return error; +} + + +prototype_undo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT) +{ + my_bool error; + MARIA_HA *info= get_MARIA_HA_from_UNDO_record(rec); + LSN previous_undo_lsn= lsn_korr(rec->header); + MARIA_SHARE *share; + + if (info == NULL) + { + /* + Unlike for REDOs, if the table was skipped it is abnormal; we have a + transaction to rollback which used this table, as it is not rolled back + it was supposed to hold this table and so the table should still be + there. 
+ */ + return 1; + } + + share= info->s; + share->state.changed|= STATE_CHANGED | STATE_NOT_ANALYZED; + + enlarge_buffer(rec); + if (log_record_buffer.str == NULL || + translog_read_record(rec->lsn, 0, rec->record_length, + log_record_buffer.str, NULL) != + rec->record_length) + { + tprint(tracef, "Failed to read record\n"); + return 1; + } + + info->trn= trn; + error= _ma_apply_undo_key_delete(info, previous_undo_lsn, + log_record_buffer.str + LSN_STORE_SIZE + + FILEID_STORE_SIZE + PAGE_STORE_SIZE, + rec->record_length - LSN_STORE_SIZE - + FILEID_STORE_SIZE - PAGE_STORE_SIZE); + info->trn= 0; + /* trn->undo_lsn is updated in an inwrite_hook when writing the CLR_END */ + tprint(tracef, " undo_lsn now LSN (%lu,0x%lx)\n", + LSN_IN_PARTS(previous_undo_lsn)); + return error; +} + + + static int run_redo_phase(LSN lsn, enum maria_apply_log_way apply) { TRANSLOG_HEADER_BUFFER rec; @@ -1669,14 +1926,23 @@ static int run_redo_phase(LSN lsn, enum maria_apply_log_way apply) install_redo_exec_hook(REDO_FREE_HEAD_OR_TAIL); install_redo_exec_hook(REDO_FREE_BLOCKS); install_redo_exec_hook(REDO_DELETE_ALL); + install_redo_exec_hook(REDO_INDEX); + install_redo_exec_hook(REDO_INDEX_NEW_PAGE); + install_redo_exec_hook(REDO_INDEX_FREE_PAGE); install_redo_exec_hook(UNDO_ROW_INSERT); install_redo_exec_hook(UNDO_ROW_DELETE); install_redo_exec_hook(UNDO_ROW_UPDATE); + install_redo_exec_hook(UNDO_KEY_INSERT); + install_redo_exec_hook(UNDO_KEY_DELETE); + install_redo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT); install_redo_exec_hook(COMMIT); install_redo_exec_hook(CLR_END); install_undo_exec_hook(UNDO_ROW_INSERT); install_undo_exec_hook(UNDO_ROW_DELETE); install_undo_exec_hook(UNDO_ROW_UPDATE); + install_undo_exec_hook(UNDO_KEY_INSERT); + install_undo_exec_hook(UNDO_KEY_DELETE); + install_undo_exec_hook(UNDO_KEY_DELETE_WITH_ROOT); current_group_end_lsn= LSN_IMPOSSIBLE; @@ -1870,7 +2136,6 @@ static uint end_of_redo_phase(my_bool prepare_for_undo_phase) LSN_IN_PARTS(gslsn), sid); if 
(all_active_trans[sid].undo_lsn != LSN_IMPOSSIBLE) { - char llbuf[22]; llstr(long_trid, llbuf); tprint(tracef, "Transaction long_trid %s short_trid %u unfinished\n", llbuf, sid); @@ -2017,13 +2282,15 @@ static MARIA_HA *get_MARIA_HA_from_REDO_record(const print_redo_phase_progress(rec->lsn); sid= fileid_korr(rec->header); page= page_korr(rec->header + FILEID_STORE_SIZE); - switch(rec->type) - { + switch(rec->type) { /* not all REDO records have a page: */ case LOGREC_REDO_INSERT_ROW_HEAD: case LOGREC_REDO_INSERT_ROW_TAIL: case LOGREC_REDO_PURGE_ROW_HEAD: case LOGREC_REDO_PURGE_ROW_TAIL: + case LOGREC_REDO_INDEX_NEW_PAGE: + case LOGREC_REDO_INDEX: + case LOGREC_REDO_INDEX_FREE_PAGE: llstr(page, llbuf); tprint(tracef, " For page %s of table of short id %u", llbuf, sid); break; @@ -2281,7 +2548,7 @@ static int new_page(File fileid, pgcache_page_no_t pageid, LSN rec_lsn, static int close_all_tables(void) { int error= 0; - uint count; + uint count= 0; LIST *list_element, *next_open; MARIA_HA *info; pthread_mutex_lock(&THR_LOCK_maria); @@ -2364,6 +2631,9 @@ static void print_redo_phase_progress(TRANSLOG_ADDRESS addr) { static int end_logno= FILENO_IMPOSSIBLE, end_offset, percentage_printed= 0; static ulonglong initial_remainder= -1; + int cur_logno, cur_offset; + ulonglong local_remainder; + if (tracef == stdout) return; if (recovery_message_printed == REC_MSG_NONE) @@ -2379,16 +2649,15 @@ static void print_redo_phase_progress(TRANSLOG_ADDRESS addr) end_logno= LSN_FILE_NO(end_addr); end_offset= LSN_OFFSET(end_addr); } - int cur_logno= LSN_FILE_NO(addr); - int cur_offset= LSN_OFFSET(addr); - ulonglong remainder; - remainder= (cur_logno == end_logno) ? (end_offset - cur_offset) : + cur_logno= LSN_FILE_NO(addr); + cur_offset= LSN_OFFSET(addr); + local_remainder= (cur_logno == end_logno) ? 
(end_offset - cur_offset) : (TRANSLOG_FILE_SIZE - cur_offset + max(end_logno - cur_logno - 1, 0) * TRANSLOG_FILE_SIZE + end_offset); if (initial_remainder == (ulonglong)(-1)) - initial_remainder= remainder; + initial_remainder= local_remainder; int percentage_done= - (initial_remainder - remainder) * ULL(100) / initial_remainder; + (initial_remainder - local_remainder) * ULL(100) / initial_remainder; if ((percentage_done - percentage_printed) >= 10) { percentage_printed= percentage_done; @@ -2433,11 +2702,10 @@ effect on the state in case of crash. But we make them sync the state as soon as they have finished. This reduces the window for a problem. It looks like only one thread at a time updates the state in memory or -on disk. However there is not 100% certainty when it comes to -HA_EXTRA_(FORCE_REOPEN|PREPARE_FOR_RENAME): can they read the state -from memory while some other thread is updating "records" in memory? -If yes, they may write a corrupted state to disk. -We assume that no for now: ASK_MONTY. +on disk. We assume that the upper level (normally MySQL) has protection +against issuing HA_EXTRA_(FORCE_REOPEN|PREPARE_FOR_RENAME) so that these +are not issued while there are any running transactions on the given table. +If this is not done, we may write a corrupted state to disk. 
With checkpoints ================ diff --git a/storage/maria/ma_rt_index.c b/storage/maria/ma_rt_index.c index 2530ae86a5c..eba2a519a3f 100644 --- a/storage/maria/ma_rt_index.c +++ b/storage/maria/ma_rt_index.c @@ -67,7 +67,8 @@ static int maria_rtree_find_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo, my_errno= HA_ERR_OUT_OF_MEM; return -1; } - if (!_ma_fetch_keypage(info, keyinfo, page, DFLT_INIT_HITS, page_buf, 0)) + if (!_ma_fetch_keypage(info, keyinfo, page, PAGECACHE_LOCK_LEFT_UNLOCKED, + DFLT_INIT_HITS, page_buf, 0, 0)) goto err1; nod_flag= _ma_test_if_nod(info, page_buf); @@ -286,7 +287,8 @@ static int maria_rtree_get_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo, if (!(page_buf= (uchar*) my_alloca((uint)keyinfo->block_length))) return -1; - if (!_ma_fetch_keypage(info, keyinfo, page, DFLT_INIT_HITS, page_buf, 0)) + if (!_ma_fetch_keypage(info, keyinfo, page, PAGECACHE_LOCK_LEFT_UNLOCKED, + DFLT_INIT_HITS, page_buf, 0, 0)) goto err1; nod_flag= _ma_test_if_nod(info, page_buf); @@ -547,6 +549,7 @@ static int maria_rtree_insert_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uint nod_flag; int res; uchar *page_buf, *k; + MARIA_PINNED_PAGE *page_link; DBUG_ENTER("maria_rtree_insert_req"); if (!(page_buf= (uchar*) my_alloca((uint)keyinfo->block_length + @@ -555,7 +558,8 @@ static int maria_rtree_insert_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo, my_errno= HA_ERR_OUT_OF_MEM; DBUG_RETURN(-1); /* purecov: inspected */ } - if (!_ma_fetch_keypage(info, keyinfo, page, DFLT_INIT_HITS, page_buf, 0)) + if (!_ma_fetch_keypage(info, keyinfo, page, PAGECACHE_LOCK_WRITE, + DFLT_INIT_HITS, page_buf, 0, &page_link)) goto err1; nod_flag= _ma_test_if_nod(info, page_buf); DBUG_PRINT("rtree", ("page: %lu level: %d ins_level: %d nod_flag: %u", @@ -574,7 +578,10 @@ static int maria_rtree_insert_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo, case 0: /* child was not split */ { maria_rtree_combine_rect(keyinfo->seg, k, key, k, key_length); - if (_ma_write_keypage(info, keyinfo, page, DFLT_INIT_HITS, 
page_buf)) + page_link->changed= 1; + if (_ma_write_keypage(info, keyinfo, page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS, page_buf)) goto err1; goto ok; } @@ -592,7 +599,10 @@ static int maria_rtree_insert_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo, goto err1; res= maria_rtree_add_key(info, keyinfo, new_key, key_length, page_buf, new_page); - if (_ma_write_keypage(info, keyinfo, page, DFLT_INIT_HITS, page_buf)) + page_link->changed= 1; + if (_ma_write_keypage(info, keyinfo, page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS, page_buf)) goto err1; goto ok; } @@ -607,7 +617,9 @@ static int maria_rtree_insert_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo, { res= maria_rtree_add_key(info, keyinfo, key, key_length, page_buf, new_page); - if (_ma_write_keypage(info, keyinfo, page, DFLT_INIT_HITS, page_buf)) + page_link->changed= 1; + if (_ma_write_keypage(info, keyinfo, page, PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS, page_buf)) goto err1; } @@ -637,17 +649,26 @@ static int maria_rtree_insert_level(MARIA_HA *info, uint keynr, uchar *key, MARIA_KEYDEF *keyinfo= info->s->keyinfo + keynr; int res; my_off_t new_page; + MARIA_PINNED_PAGE *page_link; DBUG_ENTER("maria_rtree_insert_level"); if ((old_root= info->s->state.key_root[keynr]) == HA_OFFSET_ERROR) { - if ((old_root= _ma_new(info, keyinfo, DFLT_INIT_HITS)) == HA_OFFSET_ERROR) + MARIA_PINNED_PAGE tmp_page_link; + page_link= &tmp_page_link; + if ((old_root= _ma_new(info, DFLT_INIT_HITS, &page_link)) == + HA_OFFSET_ERROR) DBUG_RETURN(-1); info->keyread_buff_used= 1; + bzero(info->buff, info->s->keypage_header); + _ma_store_keynr(info, info->buff, keynr); _ma_store_page_used(info, info->buff, info->s->keypage_header, 0); + res= maria_rtree_add_key(info, keyinfo, key, key_length, info->buff, NULL); - if (_ma_write_keypage(info, keyinfo, old_root, DFLT_INIT_HITS, info->buff)) + if (_ma_write_keypage(info, keyinfo, old_root, + page_link->write_lock, + DFLT_INIT_HITS, info->buff)) DBUG_RETURN(1); 
info->s->state.key_root[keynr]= old_root; DBUG_RETURN(res); @@ -665,6 +686,8 @@ static int maria_rtree_insert_level(MARIA_HA *info, uint keynr, uchar *key, uchar *new_root_buf, *new_key; my_off_t new_root; uint nod_flag= info->s->base.key_reflength; + MARIA_PINNED_PAGE tmp_page_link; + page_link= &tmp_page_link; DBUG_PRINT("rtree", ("root was split, grow a new root")); if (!(new_root_buf= (uchar*) my_alloca((uint)keyinfo->block_length + @@ -674,9 +697,11 @@ static int maria_rtree_insert_level(MARIA_HA *info, uint keynr, uchar *key, DBUG_RETURN(-1); /* purecov: inspected */ } + bzero(new_root_buf, info->s->keypage_header); + _ma_store_keynr(info, new_root_buf, keynr); _ma_store_page_used(info, new_root_buf, info->s->keypage_header, nod_flag); - if ((new_root= _ma_new(info, keyinfo, DFLT_INIT_HITS)) == + if ((new_root= _ma_new(info, DFLT_INIT_HITS, &page_link)) == HA_OFFSET_ERROR) goto err1; @@ -698,7 +723,7 @@ static int maria_rtree_insert_level(MARIA_HA *info, uint keynr, uchar *key, NULL) == -1) goto err1; - if (_ma_write_keypage(info, keyinfo, new_root, + if (_ma_write_keypage(info, keyinfo, new_root, page_link->write_lock, DFLT_INIT_HITS, new_root_buf)) goto err1; info->s->state.key_root[keynr]= new_root; @@ -790,6 +815,7 @@ static int maria_rtree_delete_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uint nod_flag; int res; uchar *page_buf, *last, *k; + MARIA_PINNED_PAGE *page_link; DBUG_ENTER("maria_rtree_delete_req"); if (!(page_buf= (uchar*) my_alloca((uint)keyinfo->block_length))) @@ -797,7 +823,8 @@ static int maria_rtree_delete_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo, my_errno= HA_ERR_OUT_OF_MEM; DBUG_RETURN(-1); /* purecov: inspected */ } - if (!_ma_fetch_keypage(info, keyinfo, page, DFLT_INIT_HITS, page_buf, 0)) + if (!_ma_fetch_keypage(info, keyinfo, page, PAGECACHE_LOCK_WRITE, + DFLT_INIT_HITS, page_buf, 0, &page_link)) goto err1; nod_flag= _ma_test_if_nod(info, page_buf); DBUG_PRINT("rtree", ("page: %lu level: %d nod_flag: %u", @@ -827,7 +854,9 @@ 
static int maria_rtree_delete_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo, if (maria_rtree_set_key_mbr(info, keyinfo, k, key_length, _ma_kpos(nod_flag, k))) goto err1; + page_link->changed= 1; if (_ma_write_keypage(info, keyinfo, page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS, page_buf)) goto err1; } @@ -852,7 +881,9 @@ static int maria_rtree_delete_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo, level later to reintegrate the subtrees. */ maria_rtree_delete_key(info, page_buf, k, key_length, nod_flag); + page_link->changed= 1; if (_ma_write_keypage(info, keyinfo, page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS, page_buf)) goto err1; *page_size= _ma_get_page_used(info, page_buf); @@ -867,7 +898,9 @@ static int maria_rtree_delete_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo, case 2: /* vacuous case: last key in the leaf */ { maria_rtree_delete_key(info, page_buf, k, key_length, nod_flag); + page_link->changed= 1; if (_ma_write_keypage(info, keyinfo, page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS, page_buf)) goto err1; *page_size= _ma_get_page_used(info, page_buf); @@ -888,19 +921,23 @@ static int maria_rtree_delete_req(MARIA_HA *info, MARIA_KEYDEF *keyinfo, if (!maria_rtree_key_cmp(keyinfo->seg, key, k, key_length, MBR_EQUAL | MBR_DATA)) { + page_link->changed= 1; + maria_rtree_delete_key(info, page_buf, k, key_length, nod_flag); *page_size= _ma_get_page_used(info, page_buf); if (*page_size == info->s->keypage_header) { /* last key in the leaf */ res= 2; - if (_ma_dispose(info, keyinfo, page, DFLT_INIT_HITS)) + if (_ma_dispose(info, page, 0)) goto err1; } else { res= 0; - if (_ma_write_keypage(info, keyinfo, page, DFLT_INIT_HITS, page_buf)) + if (_ma_write_keypage(info, keyinfo, page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS, page_buf)) goto err1; } goto ok; @@ -933,6 +970,7 @@ int maria_rtree_delete(MARIA_HA *info, uint keynr, uchar *key, uint key_length) stPageList ReinsertList; my_off_t old_root; MARIA_KEYDEF *keyinfo= info->s->keyinfo 
+ keynr; + MARIA_PINNED_PAGE *page_link, *root_page_link; DBUG_ENTER("maria_rtree_delete"); if ((old_root= info->s->state.key_root[keynr]) == HA_OFFSET_ERROR) @@ -948,100 +986,102 @@ int maria_rtree_delete(MARIA_HA *info, uint keynr, uchar *key, uint key_length) ReinsertList.m_pages= 0; switch (maria_rtree_delete_req(info, keyinfo, key, key_length, old_root, - &page_size, &ReinsertList, 0)) - { + &page_size, &ReinsertList, 0)) { case 2: /* empty */ - { - info->s->state.key_root[keynr]= HA_OFFSET_ERROR; - DBUG_RETURN(0); - } + { + info->s->state.key_root[keynr]= HA_OFFSET_ERROR; + DBUG_RETURN(0); + } case 0: /* deleted */ + { + uint nod_flag; + ulong i; + for (i= 0; i < ReinsertList.n_pages; ++i) { - uint nod_flag; - ulong i; - for (i= 0; i < ReinsertList.n_pages; ++i) - { - uchar *page_buf, *k, *last; + uchar *page_buf, *k, *last; - if (!(page_buf= (uchar*) my_alloca((uint)keyinfo->block_length))) + if (!(page_buf= (uchar*) my_alloca((uint)keyinfo->block_length))) + { + my_errno= HA_ERR_OUT_OF_MEM; + goto err1; + } + if (!_ma_fetch_keypage(info, keyinfo, ReinsertList.pages[i].offs, + PAGECACHE_LOCK_WRITE, + DFLT_INIT_HITS, page_buf, 0, &page_link)) + goto err1; + nod_flag= _ma_test_if_nod(info, page_buf); + DBUG_PRINT("rtree", ("reinserting keys from " + "page: %lu level: %d nod_flag: %u", + (ulong) ReinsertList.pages[i].offs, + ReinsertList.pages[i].level, nod_flag)); + + k= rt_PAGE_FIRST_KEY(info, page_buf, nod_flag); + last= rt_PAGE_END(info, page_buf); + for (; k < last; k= rt_PAGE_NEXT_KEY(k, key_length, nod_flag)) + { + int res; + if ((res= + maria_rtree_insert_level(info, keynr, k, key_length, + ReinsertList.pages[i].level)) == -1) { - my_errno= HA_ERR_OUT_OF_MEM; + my_afree(page_buf); goto err1; } - if (!_ma_fetch_keypage(info, keyinfo, ReinsertList.pages[i].offs, - DFLT_INIT_HITS, page_buf, 0)) - goto err1; - nod_flag= _ma_test_if_nod(info, page_buf); - DBUG_PRINT("rtree", ("reinserting keys from " - "page: %lu level: %d nod_flag: %u", - (ulong) 
ReinsertList.pages[i].offs, - ReinsertList.pages[i].level, nod_flag)); - - k= rt_PAGE_FIRST_KEY(info, page_buf, nod_flag); - last= rt_PAGE_END(info, page_buf); - for (; k < last; k= rt_PAGE_NEXT_KEY(k, key_length, nod_flag)) + if (res) { - int res; - if ((res= - maria_rtree_insert_level(info, keynr, k, key_length, - ReinsertList.pages[i].level)) == -1) + ulong j; + DBUG_PRINT("rtree", ("root has been split, adjust levels")); + for (j= i; j < ReinsertList.n_pages; j++) { - my_afree(page_buf); - goto err1; - } - if (res) - { - ulong j; - DBUG_PRINT("rtree", ("root has been split, adjust levels")); - for (j= i; j < ReinsertList.n_pages; j++) - { - ReinsertList.pages[j].level++; - DBUG_PRINT("rtree", ("keys from page: %lu now level: %d", - (ulong) ReinsertList.pages[i].offs, - ReinsertList.pages[i].level)); - } + ReinsertList.pages[j].level++; + DBUG_PRINT("rtree", ("keys from page: %lu now level: %d", + (ulong) ReinsertList.pages[i].offs, + ReinsertList.pages[i].level)); } } - my_afree(page_buf); - if (_ma_dispose(info, keyinfo, ReinsertList.pages[i].offs, - DFLT_INIT_HITS)) - goto err1; } - if (ReinsertList.pages) - my_free((uchar*) ReinsertList.pages, MYF(0)); - - /* check for redundant root (not leaf, 1 child) and eliminate */ - if ((old_root= info->s->state.key_root[keynr]) == HA_OFFSET_ERROR) + my_afree(page_buf); + page_link->changed= 1; + if (_ma_dispose(info, ReinsertList.pages[i].offs, 0)) goto err1; - if (!_ma_fetch_keypage(info, keyinfo, old_root, DFLT_INIT_HITS, - info->buff, 0)) - goto err1; - nod_flag= _ma_test_if_nod(info, info->buff); - page_size= _ma_get_page_used(info, info->buff); - if (nod_flag && (page_size == info->s->keypage_header + key_length + - nod_flag)) - { - my_off_t new_root= _ma_kpos(nod_flag, - rt_PAGE_FIRST_KEY(info, info->buff, - nod_flag)); - if (_ma_dispose(info, keyinfo, old_root, DFLT_INIT_HITS)) - goto err1; - info->s->state.key_root[keynr]= new_root; - } - info->update= HA_STATE_DELETED; - DBUG_RETURN(0); - -err1: - 
DBUG_RETURN(-1); /* purecov: inspected */ } - case 1: /* not found */ + if (ReinsertList.pages) + my_free((uchar*) ReinsertList.pages, MYF(0)); + + /* check for redundant root (not leaf, 1 child) and eliminate */ + if ((old_root= info->s->state.key_root[keynr]) == HA_OFFSET_ERROR) + goto err1; + if (!_ma_fetch_keypage(info, keyinfo, old_root, + PAGECACHE_LOCK_WRITE, + DFLT_INIT_HITS, info->buff, 0, &root_page_link)) + goto err1; + nod_flag= _ma_test_if_nod(info, info->buff); + page_size= _ma_get_page_used(info, info->buff); + if (nod_flag && (page_size == info->s->keypage_header + key_length + + nod_flag)) { - my_errno= HA_ERR_KEY_NOT_FOUND; - DBUG_RETURN(-1); /* purecov: inspected */ + my_off_t new_root= _ma_kpos(nod_flag, + rt_PAGE_FIRST_KEY(info, info->buff, + nod_flag)); + root_page_link->changed= 1; + if (_ma_dispose(info, old_root, 0)) + goto err1; + info->s->state.key_root[keynr]= new_root; } - default: - case -1: /* error */ - DBUG_RETURN(-1); /* purecov: inspected */ + info->update= HA_STATE_DELETED; + DBUG_RETURN(0); + +err1: + DBUG_RETURN(-1); /* purecov: inspected */ + } + case 1: /* not found */ + { + my_errno= HA_ERR_KEY_NOT_FOUND; + DBUG_RETURN(-1); /* purecov: inspected */ + } + default: + case -1: /* error */ + DBUG_RETURN(-1); /* purecov: inspected */ } } @@ -1071,7 +1111,8 @@ ha_rows maria_rtree_estimate(MARIA_HA *info, uint keynr, uchar *key, return HA_POS_ERROR; if (!(page_buf= (uchar*) my_alloca((uint)keyinfo->block_length))) return HA_POS_ERROR; - if (!_ma_fetch_keypage(info, keyinfo, root, DFLT_INIT_HITS, page_buf, 0)) + if (!_ma_fetch_keypage(info, keyinfo, root, PAGECACHE_LOCK_LEFT_UNLOCKED, + DFLT_INIT_HITS, page_buf, 0, 0)) goto err1; nod_flag= _ma_test_if_nod(info, page_buf); diff --git a/storage/maria/ma_rt_key.c b/storage/maria/ma_rt_key.c index c71d7d7d8eb..311137850f6 100644 --- a/storage/maria/ma_rt_key.c +++ b/storage/maria/ma_rt_key.c @@ -100,7 +100,8 @@ int maria_rtree_set_key_mbr(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uchar 
*key, { DBUG_ENTER("maria_rtree_set_key_mbr"); if (!_ma_fetch_keypage(info, keyinfo, child_page, - DFLT_INIT_HITS, info->buff, 0)) + PAGECACHE_LOCK_LEFT_UNLOCKED, + DFLT_INIT_HITS, info->buff, 0, 0)) DBUG_RETURN(-1); DBUG_RETURN(maria_rtree_page_mbr(info, keyinfo->seg, diff --git a/storage/maria/ma_rt_split.c b/storage/maria/ma_rt_split.c index d3381ecf1ad..25cfb0be91a 100644 --- a/storage/maria/ma_rt_split.c +++ b/storage/maria/ma_rt_split.c @@ -252,7 +252,6 @@ int maria_rtree_split_page(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uint key_length, my_off_t *new_page_offs) { int n1, n2; /* Number of items in groups */ - SplitStruct *task; SplitStruct *cur; SplitStruct *stop; @@ -268,6 +267,7 @@ int maria_rtree_split_page(MARIA_HA *info, MARIA_KEYDEF *keyinfo, info->s->base.rec_reflength); int max_keys= ((_ma_get_page_used(info, page) - info->s->keypage_header) / (full_length)); + MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link; DBUG_ENTER("maria_rtree_split_page"); DBUG_PRINT("rtree", ("splitting block")); @@ -339,16 +339,19 @@ int maria_rtree_split_page(MARIA_HA *info, MARIA_KEYDEF *keyinfo, memcpy(to - nod_flag, cur->key - nod_flag, full_length); } + bzero(new_page, info->s->keypage_header); + _ma_store_keynr(info, new_page, keyinfo->key_nr); _ma_store_page_used(info, page, info->s->keypage_header + n1 * full_length, nod_flag); _ma_store_page_used(info, new_page, info->s->keypage_header + n2 * full_length, nod_flag); - if ((*new_page_offs= _ma_new(info, keyinfo, DFLT_INIT_HITS)) == - HA_OFFSET_ERROR) + if ((*new_page_offs= _ma_new(info, DFLT_INIT_HITS, &page_link)) == + HA_OFFSET_ERROR) err_code= -1; else err_code= _ma_write_keypage(info, keyinfo, *new_page_offs, + page_link->write_lock, DFLT_INIT_HITS, new_page); DBUG_PRINT("rtree", ("split new block: %lu", (ulong) *new_page_offs)); diff --git a/storage/maria/ma_search.c b/storage/maria/ma_search.c index 460d8367440..8eaff88155e 100644 --- a/storage/maria/ma_search.c +++ b/storage/maria/ma_search.c @@ 
-45,12 +45,17 @@ int _ma_check_index(MARIA_HA *info, int inx) } /* _ma_check_index */ - /* - ** Search after row by a key - ** Position to row is stored in info->lastpos - ** Return: -1 if not found - ** 1 if one should continue search on higher level - */ +/** + @brief Search after row by a key + + @note + Position to row is stored in info->lastpos + + @return + @retval 0 ok (key found) + @retval -1 Not found + @retval 1 If one should continue search on higher level +*/ int _ma_search(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, uchar *key, uint key_len, uint nextflag, register my_off_t pos) @@ -74,9 +79,10 @@ int _ma_search(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, DBUG_RETURN(1); /* Search at upper levels */ } - if (!(buff= _ma_fetch_keypage(info,keyinfo,pos,DFLT_INIT_HITS, - info->keyread_buff, - test(!(nextflag & SEARCH_SAVE_BUFF))))) + if (!(buff= _ma_fetch_keypage(info,keyinfo, pos, + PAGECACHE_LOCK_LEFT_UNLOCKED, + DFLT_INIT_HITS, info->keyread_buff, + test(!(nextflag & SEARCH_SAVE_BUFF)), 0))) goto err; DBUG_DUMP("page", buff, _ma_get_page_used(info, buff)); @@ -118,9 +124,10 @@ int _ma_search(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, if (pos != info->last_keypage) { uchar *old_buff=buff; - if (!(buff= _ma_fetch_keypage(info,keyinfo,pos,DFLT_INIT_HITS, + if (!(buff= _ma_fetch_keypage(info,keyinfo, pos, + PAGECACHE_LOCK_LEFT_UNLOCKED,DFLT_INIT_HITS, info->keyread_buff, - test(!(nextflag & SEARCH_SAVE_BUFF))))) + test(!(nextflag & SEARCH_SAVE_BUFF)), 0))) goto err; keypos=buff+(keypos-old_buff); maxpos=buff+(maxpos-old_buff); @@ -174,8 +181,9 @@ err: /* ret_pos point to where find or bigger key starts */ /* ARGSUSED */ -int _ma_bin_search(MARIA_HA *info, register MARIA_KEYDEF *keyinfo, uchar *page, - uchar *key, uint key_len, uint comp_flag, uchar **ret_pos, +int _ma_bin_search(MARIA_HA *info, register MARIA_KEYDEF *keyinfo, + uchar *page, const uchar *key, uint key_len, + uint comp_flag, uchar **ret_pos, uchar *buff 
__attribute__((unused)), my_bool *last_key) { int flag; @@ -209,7 +217,7 @@ int _ma_bin_search(MARIA_HA *info, register MARIA_KEYDEF *keyinfo, uchar *page, (uchar*) key, key_len, comp_flag, not_used); if (flag < 0) start++; /* point at next, bigger key */ - *ret_pos=page+(uint) start*totlength; + *ret_pos= (char*) (page+(uint) start*totlength); *last_key= end == save_end; DBUG_PRINT("exit",("flag: %d keypos: %d",flag,start)); DBUG_RETURN(flag); @@ -242,13 +250,14 @@ int _ma_bin_search(MARIA_HA *info, register MARIA_KEYDEF *keyinfo, uchar *page, < 0 Not found. */ -int _ma_seq_search(MARIA_HA *info, register MARIA_KEYDEF *keyinfo, uchar *page, - uchar *key, uint key_len, uint comp_flag, uchar **ret_pos, +int _ma_seq_search(MARIA_HA *info, register MARIA_KEYDEF *keyinfo, + uchar *page, const uchar *key, uint key_len, + uint comp_flag, uchar **ret_pos, uchar *buff, my_bool *last_key) { int flag; uint nod_flag, length, used_length, not_used[2]; - uchar t_buff[HA_MAX_KEY_BUFF],*end; + uchar t_buff[HA_MAX_KEY_BUFF], *end; DBUG_ENTER("_ma_seq_search"); LINT_INIT(flag); @@ -257,7 +266,7 @@ int _ma_seq_search(MARIA_HA *info, register MARIA_KEYDEF *keyinfo, uchar *page, _ma_get_used_and_nod(info, page, used_length, nod_flag); end= page + used_length; page+= info->s->keypage_header + nod_flag; - *ret_pos=page; + *ret_pos= (uchar*) page; t_buff[0]=0; /* Avoid bugs */ while (page < end) { @@ -290,8 +299,9 @@ int _ma_seq_search(MARIA_HA *info, register MARIA_KEYDEF *keyinfo, uchar *page, int _ma_prefix_search(MARIA_HA *info, register MARIA_KEYDEF *keyinfo, - uchar *page, uchar *key, uint key_len, uint nextflag, - uchar **ret_pos, uchar *buff, my_bool *last_key) + uchar *page, const uchar *key, uint key_len, + uint nextflag, uchar **ret_pos, uchar *buff, + my_bool *last_key) { /* my_flag is raw comparison result to be changed according to @@ -303,10 +313,11 @@ int _ma_prefix_search(MARIA_HA *info, register MARIA_KEYDEF *keyinfo, uint prefix_len,suffix_len; int key_len_skip, 
seg_len_pack, key_len_left; uchar *end; - uchar *kseg, *vseg, *saved_vseg, *saved_from; + uchar *vseg, *saved_vseg, *saved_from; uchar *sort_order= keyinfo->seg->charset->sort_order; uchar tt_buff[HA_MAX_KEY_BUFF+2], *t_buff=tt_buff+2; uchar *saved_to; + const uchar *kseg; uint saved_length=0, saved_prefix_len=0; uint length_pack; DBUG_ENTER("_ma_prefix_search"); @@ -435,7 +446,7 @@ int _ma_prefix_search(MARIA_HA *info, register MARIA_KEYDEF *keyinfo, { /* We have to compare. But we can still skip part of the key */ uint left; - uchar *k= kseg+prefix_len; + const uchar *k= kseg+prefix_len; /* If prefix_len > cmplen then we are in the end-space comparison @@ -481,7 +492,7 @@ int _ma_prefix_search(MARIA_HA *info, register MARIA_KEYDEF *keyinfo, else { /* We have to compare k and vseg as if they were space extended */ - uchar *k_end= k+ (cmplen - len); + const uchar *k_end= k+ (cmplen - len); for ( ; k < k_end && *k == ' '; k++) ; if (k == k_end) goto cmp_rest; /* should never happen */ @@ -631,8 +642,8 @@ void _ma_kpointer(register MARIA_HA *info, register uchar *buff, my_off_t pos) /* Calc pos to a data-record from a key */ - -my_off_t _ma_dpos(MARIA_HA *info, uint nod_flag, const uchar *after_key) +MARIA_RECORD_POS _ma_dpos(MARIA_HA *info, uint nod_flag, + const uchar *after_key) { my_off_t pos; after_key-=(nod_flag + info->s->rec_reflength); @@ -654,15 +665,16 @@ my_off_t _ma_dpos(MARIA_HA *info, uint nod_flag, const uchar *after_key) default: pos=0L; /* Shut compiler up */ } - return ((info->s->data_file_type == STATIC_RECORD) ? 
- pos * info->s->base.pack_reclength : pos); + return info->s->keypos_to_recpos(info, pos); } /* Calc position from a record pointer ( in delete link chain ) */ -my_off_t _ma_rec_pos(MARIA_SHARE *s, uchar *ptr) +MARIA_RECORD_POS _ma_rec_pos(MARIA_HA *info, uchar *ptr) { + MARIA_SHARE *s= info->s; + my_off_t pos; switch (s->rec_reflength) { #if SIZEOF_OFF_T > 4 @@ -711,18 +723,16 @@ my_off_t _ma_rec_pos(MARIA_SHARE *s, uchar *ptr) break; default: abort(); /* Impossible */ } - return ((s->data_file_type == STATIC_RECORD) ? - pos * s->base.pack_reclength : pos); + return (*s->keypos_to_recpos)(info, pos); } - /* save position to record */ +/* save position to record */ void _ma_dpointer(MARIA_HA *info, uchar *buff, my_off_t pos) { - if (info->s->data_file_type == STATIC_RECORD && - pos != HA_OFFSET_ERROR) - pos/= info->s->base.pack_reclength; + if (pos != HA_OFFSET_ERROR) + pos= (*info->s->recpos_to_keypos)(info, pos); switch (info->s->rec_reflength) { #if SIZEOF_OFF_T > 4 @@ -748,16 +758,53 @@ void _ma_dpointer(MARIA_HA *info, uchar *buff, my_off_t pos) } /* _ma_dpointer */ - /* Get key from key-block */ - /* page points at previous key; its advanced to point at next key */ - /* key should contain previous key */ - /* Returns length of found key + pointers */ - /* nod_flag is a flag if we are on nod */ +my_off_t _ma_static_keypos_to_recpos(MARIA_HA *info, my_off_t pos) +{ + return pos * info->s->base.pack_reclength; +} + + +my_off_t _ma_static_recpos_to_keypos(MARIA_HA *info, my_off_t pos) +{ + return pos / info->s->base.pack_reclength; +} + +my_off_t _ma_transparent_recpos(MARIA_HA *info __attribute__((unused)), + my_off_t pos) +{ + return pos; +} + +my_off_t _ma_transaction_keypos_to_recpos(MARIA_HA *info + __attribute__((unused)), + my_off_t pos) +{ + /* We need one bit to store if there is transid's after position */ + return pos >> 1; +} + +my_off_t _ma_transaction_recpos_to_keypos(MARIA_HA *info + __attribute__((unused)), + my_off_t pos) +{ + return pos << 1; 
+} + +/* + @brief Get key from key-block + + @param nod_flag Is set to nod length if we are on nod + @param page Points at previous key; It is advanced to point at next key + @param key Should contain previous key + + @notes + Same as _ma_get_key but used with fixed length keys - /* same as _ma_get_key but used with fixed length keys */ + @retval Returns length of found key + pointers + */ uint _ma_get_static_key(register MARIA_KEYDEF *keyinfo, uint nod_flag, - register uchar **page, register uchar *key) + register uchar **page, uchar *key) { memcpy((uchar*) key,(uchar*) *page, (size_t) (keyinfo->keylength+nod_flag)); @@ -1290,8 +1337,9 @@ int _ma_search_next(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, if (info->keyread_buff_used) { - if (!_ma_fetch_keypage(info,keyinfo,info->last_search_keypage, - DFLT_INIT_HITS,info->keyread_buff,0)) + if (!_ma_fetch_keypage(info, keyinfo, info->last_search_keypage, + PAGECACHE_LOCK_LEFT_UNLOCKED, + DFLT_INIT_HITS, info->keyread_buff, 0, 0)) DBUG_RETURN(-1); info->keyread_buff_used=0; } @@ -1360,8 +1408,8 @@ int _ma_search_first(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, do { - if (!_ma_fetch_keypage(info,keyinfo,pos,DFLT_INIT_HITS, - info->keyread_buff,0)) + if (!_ma_fetch_keypage(info, keyinfo, pos, PAGECACHE_LOCK_LEFT_UNLOCKED, + DFLT_INIT_HITS, info->keyread_buff, 0, 0)) { info->cur_row.lastpos= HA_OFFSET_ERROR; DBUG_RETURN(-1); @@ -1409,7 +1457,8 @@ int _ma_search_last(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, do { uint used_length; - if (!_ma_fetch_keypage(info,keyinfo,pos,DFLT_INIT_HITS,buff,0)) + if (!_ma_fetch_keypage(info, keyinfo, pos, PAGECACHE_LOCK_LEFT_UNLOCKED, + DFLT_INIT_HITS, buff, 0, 0)) { info->cur_row.lastpos= HA_OFFSET_ERROR; DBUG_RETURN(-1); @@ -1865,7 +1914,8 @@ void _ma_store_static_key(MARIA_KEYDEF *keyinfo __attribute__((unused)), register uchar *key_pos, register MARIA_KEY_PARAM *s_temp) { - memcpy((uchar*) key_pos,(uchar*) s_temp->key,(size_t) s_temp->totlength); + 
memcpy(key_pos, s_temp->key,(size_t) s_temp->totlength); + s_temp->changed_length= s_temp->totlength; } @@ -1881,9 +1931,7 @@ void _ma_store_var_pack_key(MARIA_KEYDEF *keyinfo __attribute__((unused)), register MARIA_KEY_PARAM *s_temp) { uint length; - uchar *start; - - start=key_pos; + uchar *org_key_pos= key_pos; if (s_temp->ref_length) { @@ -1898,12 +1946,13 @@ void _ma_store_var_pack_key(MARIA_KEYDEF *keyinfo __attribute__((unused)), /* Not packed against previous key */ store_pack_length(s_temp->pack_marker == 128,key_pos,s_temp->key_length); } - bmove((uchar*) key_pos,(uchar*) s_temp->key, - (length=s_temp->totlength-(uint) (key_pos-start))); + bmove(key_pos, s_temp->key, + (length= s_temp->totlength - (uint) (key_pos-org_key_pos))); + + key_pos+= length; if (!s_temp->next_key_pos) /* No following key */ - return; - key_pos+=length; + goto end; if (s_temp->prev_length) { @@ -1921,19 +1970,25 @@ void _ma_store_var_pack_key(MARIA_KEYDEF *keyinfo __attribute__((unused)), s_temp->n_length); } memcpy(key_pos, s_temp->prev_key, s_temp->prev_length); + key_pos+= s_temp->prev_length; } else if (s_temp->n_ref_length) { store_pack_length(s_temp->pack_marker == 128,key_pos,s_temp->n_ref_length); - if (s_temp->n_ref_length == s_temp->pack_marker) - return; /* Identical key */ - store_key_length(key_pos,s_temp->n_length); + if (s_temp->n_ref_length != s_temp->pack_marker) + { + /* Not identical key */ + store_key_length_inc(key_pos,s_temp->n_length); + } } else { s_temp->n_length+= s_temp->store_not_null; store_pack_length(s_temp->pack_marker == 128,key_pos,s_temp->n_length); } + +end: + s_temp->changed_length= (uint) (key_pos - org_key_pos); } @@ -1943,17 +1998,21 @@ void _ma_store_bin_pack_key(MARIA_KEYDEF *keyinfo __attribute__((unused)), register uchar *key_pos, register MARIA_KEY_PARAM *s_temp) { + uchar *org_key_pos= key_pos; + size_t length= s_temp->totlength - s_temp->ref_length; + store_key_length_inc(key_pos,s_temp->ref_length); - memcpy((char*) key_pos,(char*) 
s_temp->key+s_temp->ref_length, - (size_t) s_temp->totlength-s_temp->ref_length); + memcpy(key_pos, s_temp->key+s_temp->ref_length, length); + key_pos+= length; if (s_temp->next_key_pos) { - key_pos+=(uint) (s_temp->totlength-s_temp->ref_length); store_key_length_inc(key_pos,s_temp->n_ref_length); if (s_temp->prev_length) /* If we must extend key */ { memcpy(key_pos,s_temp->prev_key,s_temp->prev_length); + key_pos+= s_temp->prev_length; } } + s_temp->changed_length= (uint) (key_pos - org_key_pos); } diff --git a/storage/maria/ma_sort.c b/storage/maria/ma_sort.c index 2b4a4b16923..e61019aeb83 100644 --- a/storage/maria/ma_sort.c +++ b/storage/maria/ma_sort.c @@ -138,8 +138,8 @@ int _ma_create_index_by_sort(MARIA_SORT_PARAM *info, my_bool no_messages, while (memavl >= MIN_SORT_MEMORY) { - if ((records < UINT_MAX32) && - ((my_off_t) (records + 1) * + if ((records < UINT_MAX32) && + ((my_off_t) (records + 1) * (sort_length + sizeof(char*)) <= (my_off_t) memavl)) keys= (uint)records+1; else @@ -1055,4 +1055,3 @@ static int flush_maria_ft_buf(MARIA_SORT_PARAM *info) } return err; } - diff --git a/storage/maria/ma_statrec.c b/storage/maria/ma_statrec.c index 03ebc781104..b0422f067f5 100644 --- a/storage/maria/ma_statrec.c +++ b/storage/maria/ma_statrec.c @@ -30,7 +30,7 @@ my_bool _ma_write_static_record(MARIA_HA *info, const uchar *record) info->s->state.dellink+1, MYF(MY_NABP))) goto err; - info->s->state.dellink= _ma_rec_pos(info->s,temp); + info->s->state.dellink= _ma_rec_pos(info, temp); info->state->del--; info->state->empty-=info->s->base.pack_reclength; if (info->s->file_write(info, record, info->s->base.reclength, diff --git a/storage/maria/ma_test1.c b/storage/maria/ma_test1.c index 1f487e01cd3..1a8a55e7f90 100644 --- a/storage/maria/ma_test1.c +++ b/storage/maria/ma_test1.c @@ -75,7 +75,7 @@ int main(int argc,char *argv[]) maria_data_root= "."; /* Maria requires that we always have a page cache */ if (maria_init() || - (init_pagecache(maria_pagecache, 
IO_SIZE*16, 0, 0, + (init_pagecache(maria_pagecache, maria_block_size * 16, 0, 0, maria_block_size) == 0) || ma_control_file_create_or_open() || (init_pagecache(maria_log_pagecache, @@ -86,7 +86,7 @@ int main(int argc,char *argv[]) TRANSLOG_DEFAULT_FLAGS) || (transactional && (trnman_init(0) || ma_checkpoint_init(0)))) { - fprintf(stderr, "Error in initialization"); + fprintf(stderr, "Error in initialization\n"); exit(1); } @@ -214,7 +214,13 @@ static int run_test(const char *filename) row_count=deleted=0; for (i=49 ; i>=1 ; i-=2 ) { - if (insert_count-- == 0) { VOID(maria_close(file)) ; exit(0) ; } + if (insert_count-- == 0) + { + if (testflag) + break; + VOID(maria_close(file)); + exit(0); + } j=i%25 +1; create_record(record,j); error=maria_write(file,record); @@ -712,7 +718,7 @@ static struct my_option my_long_options[] = (uchar**) &pagecacheing, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, {"key-length", 'k', "Undocumented", (uchar**) &key_length, (uchar**) &key_length, 0, GET_UINT, REQUIRED_ARG, 6, 0, 0, 0, 0, 0}, - {"key-multiple", 'm', "Undocumented", + {"key-multiple", 'm', "Don't use unique keys", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, {"key-prefix_pack", 'P', "Undocumented", 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}, diff --git a/storage/maria/ma_test2.c b/storage/maria/ma_test2.c index 6bc09fb1aee..d9a24eb7dff 100644 --- a/storage/maria/ma_test2.c +++ b/storage/maria/ma_test2.c @@ -51,7 +51,7 @@ static int verbose=0,testflag=0, die_in_middle_of_transaction= 0; static int pack_seg=HA_SPACE_PACK,pack_type=HA_PACK_KEY,remove_count=-1; static int create_flag= 0, srand_arg= 0, checkpoint= 0; -static ulong pagecache_size=IO_SIZE*16; +static ulong pagecache_size=8192*32; static enum data_file_type record_type= DYNAMIC_RECORD; static uint keys=MARIA_KEYS,recant=1000; @@ -253,14 +253,6 @@ int main(int argc, char *argv[]) for (i=0 ; i < recant ; i++) { ulong blob_length; -#if 0 - /* - Starting from i==72, there was a difference between runtime and - 
log-applying. This is now fixed, by not using non_header_data_len in - log-applying. - */ - if (i == 72) goto end; -#endif n1=rnd(1000); n2=rnd(100); n3=rnd(5000); sprintf((char*) record,"%6d:%4d:%8d:Pos: %4d ",n1,n2,n3,write_count); int4store(record+STANDARD_LENGTH-4,(long) i); @@ -1160,6 +1152,7 @@ static void put_blob_in_record(uchar *blob_pos, char **blob_buffer, ulong *blob_length) { ulong i,length; + *blob_length= 0; if (use_blob) { if (rnd(10) == 0) @@ -1180,7 +1173,6 @@ static void put_blob_in_record(uchar *blob_pos, char **blob_buffer, else { int4store(blob_pos,0); - *blob_length= 0; } } return; diff --git a/storage/maria/ma_test_all.sh b/storage/maria/ma_test_all.sh index f62dff8b09b..2c1cf659c94 100755 --- a/storage/maria/ma_test_all.sh +++ b/storage/maria/ma_test_all.sh @@ -119,7 +119,7 @@ run_tests() $maria_path/maria_chk$suffix -sm test2 $maria_path/ma_test2$suffix $silent -m10000 -e16384 -E16384 -K -L $row_type $maria_path/maria_chk$suffix -sm test2 - $maria_path/ma_test2$suffix $silent -M -T -c -b65000 + $maria_path/ma_test2$suffix $silent -c -b65000 $row_type $maria_path/maria_chk$suffix -se test2 } diff --git a/storage/maria/ma_write.c b/storage/maria/ma_write.c index 12f29c8ee1d..311597314b3 100644 --- a/storage/maria/ma_write.c +++ b/storage/maria/ma_write.c @@ -17,27 +17,69 @@ #include "ma_fulltext.h" #include "ma_rt_index.h" +#include "trnman.h" +#include "ma_key_recover.h" +#include "ma_blockrec.h" #define MAX_POINTER_LENGTH 8 /* Functions declared in this file */ -static int w_search(MARIA_HA *info,MARIA_KEYDEF *keyinfo, - uint comp_flag, uchar *key, - uint key_length, my_off_t pos, uchar *father_buff, - uchar *father_keypos, my_off_t father_page, +static int w_search(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, + uint comp_flag, uchar *key, uint key_length, my_off_t page, + my_off_t father_page, uchar *father_buff, + MARIA_PINNED_PAGE *father_page_link, uchar *father_keypos, my_bool insert_last); -static int 
_ma_balance_page(MARIA_HA *info,MARIA_KEYDEF *keyinfo,uchar *key, - uchar *curr_buff,uchar *father_buff, - uchar *father_keypos,my_off_t father_page); +static int _ma_balance_page(MARIA_HA *info,MARIA_KEYDEF *keyinfo, + uchar *key, uchar *curr_buff, my_off_t page, + my_off_t father_page, uchar *father_buff, + uchar *father_keypos, + MARIA_KEY_PARAM *s_temp); static uchar *_ma_find_last_pos(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uchar *page, uchar *key, uint *return_key_length, uchar **after_key); -int _ma_ck_write_tree(register MARIA_HA *info, uint keynr,uchar *key, - uint key_length); -int _ma_ck_write_btree(register MARIA_HA *info, uint keynr,uchar *key, - uint key_length); +static int _ma_ck_write_tree(register MARIA_HA *info, uint keynr,uchar *key, + uint key_length); +static int _ma_ck_write_btree(register MARIA_HA *info, uint keynr,uchar *key, + uint key_length); +static int _ma_ck_write_btree_with_log(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + uchar *key, uint key_length, + my_off_t *root, uint comp_flag); +static my_bool _ma_log_new(MARIA_HA *info, my_off_t page, uchar *buff, + uint page_length, uint key_nr, my_bool root_page); +static my_bool _ma_log_add(MARIA_HA *info, my_off_t page, uchar *buff, + uchar *end_buff, uchar *key_pos, + uint changed_length, int move_length); +static my_bool _ma_log_change(MARIA_HA *info, my_off_t page, uchar *buff, + uchar *key_pos, uint length); +static my_bool _ma_log_split(MARIA_HA *info, my_off_t page, uchar *buff, + uint org_length, uint new_length, + uchar *key_pos, + uint key_length, int move_length, + enum en_key_op prefix_or_suffix, + uchar *data, uint data_length, + uint change_length); +static my_bool _ma_log_del_prefix(MARIA_HA *info, my_off_t page, uchar *buff, + uint org_length, uint new_length, + uchar *key_pos, uint key_length, + int move_length); +static my_bool _ma_log_key_middle(MARIA_HA *info, my_off_t page, uchar *buff, + uint new_length, + uint data_added_first, + uint data_changed_first, + uint 
data_deleted_last, + uchar *key_pos, + uint key_length, int move_length); +static my_bool _ma_log_prefix(MARIA_HA *info, my_off_t page, + uchar *buff, uint changed_length, + int move_length); +static my_bool _ma_log_suffix(MARIA_HA *info, my_off_t page, + uchar *buff, uint org_length, + uint new_length); +/* + @brief Default handler for returing position to new row +*/ MARIA_RECORD_POS _ma_write_init_default(MARIA_HA *info, const uchar *record @@ -300,31 +342,19 @@ int _ma_ck_write(MARIA_HA *info, uint keynr, uchar *key, uint key_length) /********************************************************************** - * Normal insert code * - **********************************************************************/ + Insert key into btree (normal case) +**********************************************************************/ -int _ma_ck_write_btree(register MARIA_HA *info, uint keynr, uchar *key, - uint key_length) +static int _ma_ck_write_btree(register MARIA_HA *info, uint keynr, uchar *key, + uint key_length) { int error; - uint comp_flag; MARIA_KEYDEF *keyinfo=info->s->keyinfo+keynr; my_off_t *root=&info->s->state.key_root[keynr]; DBUG_ENTER("_ma_ck_write_btree"); - if (keyinfo->flag & HA_SORT_ALLOWS_SAME) - comp_flag=SEARCH_BIGGER; /* Put after same key */ - else if (keyinfo->flag & (HA_NOSAME|HA_FULLTEXT)) - { - comp_flag=SEARCH_FIND | SEARCH_UPDATE; /* No duplicates */ - if (keyinfo->flag & HA_NULL_ARE_EQUAL) - comp_flag|= SEARCH_NULL_ARE_EQUAL; - } - else - comp_flag=SEARCH_SAME; /* Keys in rec-pos order */ - - error= _ma_ck_real_write_btree(info, keyinfo, key, key_length, - root, comp_flag); + error= _ma_ck_write_btree_with_log(info, keyinfo, key, key_length, + root, keyinfo->write_comp_flag); if (info->ft1_to_ft2) { if (!error) @@ -337,30 +367,124 @@ int _ma_ck_write_btree(register MARIA_HA *info, uint keynr, uchar *key, } /* _ma_ck_write_btree */ +/** + @brief Write a key to the b-tree + + @retval -1 error + @retval 0 ok +*/ + +static int 
_ma_ck_write_btree_with_log(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + uchar *key, uint key_length, + my_off_t *root, uint comp_flag) +{ + LSN lsn= LSN_IMPOSSIBLE; + int error; + my_off_t new_root= *root; +#ifdef NOT_YET + DBUG_ENTER("_ma_ck_write_btree_with_log"); +#endif + + error= _ma_ck_real_write_btree(info, keyinfo, key, key_length, &new_root, + comp_flag); + if (!error && info->s->now_transactional) + { + uchar log_data[LSN_STORE_SIZE + FILEID_STORE_SIZE + + KEY_NR_STORE_SIZE]; + LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; + struct st_msg_to_write_hook_for_undo_key msg; + + lsn_store(log_data, info->trn->undo_lsn); + key_nr_store(log_data + LSN_STORE_SIZE + FILEID_STORE_SIZE, + keyinfo->key_nr); + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= (char*) key; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= key_length; + + msg.root= root; + msg.value= new_root; + + if (translog_write_record(&lsn, LOGREC_UNDO_KEY_INSERT, + info->trn, info, + log_array[TRANSLOG_INTERNAL_PARTS + 0].length + + key_length, + TRANSLOG_INTERNAL_PARTS + 2, log_array, + log_data + LSN_STORE_SIZE, &msg)) + error= -1; + } + else + { + *root= new_root; + _ma_fast_unlock_key_del(info); + } + _ma_unpin_all_pages_and_finalize_row(info, lsn); + +#ifdef NOT_YET + DBUG_RETURN(error); +#else + return(error); +#endif +} /* _ma_ck_write_btree_with_log */ + + + +/** + @brief Write a key to the b-tree + + @retval -1 error + @retval 0 ok +*/ + int _ma_ck_real_write_btree(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uchar *key, uint key_length, my_off_t *root, uint comp_flag) { int error; DBUG_ENTER("_ma_ck_real_write_btree"); + /* key_length parameter is used only if comp_flag is SEARCH_FIND */ if (*root == HA_OFFSET_ERROR || - (error=w_search(info, keyinfo, comp_flag, key, key_length, - *root, (uchar*) 0, (uchar*) 0, - (my_off_t) 0, 1)) > 0) - error= 
_ma_enlarge_root(info,keyinfo,key,root); + (error= w_search(info, keyinfo, comp_flag, key, key_length, + *root, (my_off_t) 0, (uchar*) 0, + (MARIA_PINNED_PAGE *) 0, (uchar*) 0, 1)) > 0) + error= _ma_enlarge_root(info, keyinfo, key, root); DBUG_RETURN(error); } /* _ma_ck_real_write_btree */ - /* Make a new root with key as only pointer */ +/* + @brief write hook for undo key insert +*/ + +my_bool write_hook_for_undo_key(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg) +{ + struct st_msg_to_write_hook_for_undo_key *msg= + (struct st_msg_to_write_hook_for_undo_key *) hook_arg; + + *msg->root= msg->value; + _ma_fast_unlock_key_del(tbl_info); + return write_hook_for_undo(type, trn, tbl_info, lsn, 0); +} + + +/** + @brief Make a new root with key as only pointer + + @retval -1 error + @retval 0 ok +*/ -int _ma_enlarge_root(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uchar *key, +int _ma_enlarge_root(MARIA_HA *info, MARIA_KEYDEF *keyinfo, const uchar *key, my_off_t *root) { - uint t_length,nod_flag; + uint t_length, nod_flag, page_length; MARIA_KEY_PARAM s_temp; MARIA_SHARE *share=info->s; + MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link; + int res= 0; DBUG_ENTER("_ma_enlarge_root"); nod_flag= (*root != HA_OFFSET_ERROR) ? 
share->base.key_reflength : 0; @@ -368,18 +492,36 @@ int _ma_enlarge_root(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uchar *key, _ma_kpointer(info, info->buff + info->s->keypage_header, *root); t_length=(*keyinfo->pack_key)(keyinfo,nod_flag,(uchar*) 0, (uchar*) 0, (uchar*) 0, key,&s_temp); + page_length= info->s->keypage_header + t_length + nod_flag; - _ma_store_keynr(info, info->buff, (keyinfo - info->s->keyinfo)); - _ma_store_page_used(info, info->buff, info->s->keypage_header + - t_length + nod_flag, nod_flag); + bzero(info->buff, info->s->keypage_header); + _ma_store_keynr(info, info->buff, keyinfo->key_nr); + _ma_store_page_used(info, info->buff, page_length, nod_flag); (*keyinfo->store_key)(keyinfo, info->buff + info->s->keypage_header + nod_flag, &s_temp); + /* Mark that info->buff was used */ info->keyread_buff_used= info->page_changed= 1; - if ((*root= _ma_new(info,keyinfo,DFLT_INIT_HITS)) == HA_OFFSET_ERROR || - _ma_write_keypage(info,keyinfo,*root,DFLT_INIT_HITS,info->buff)) + if ((*root= _ma_new(info, PAGECACHE_PRIORITY_HIGH, &page_link)) == + HA_OFFSET_ERROR) DBUG_RETURN(-1); - DBUG_RETURN(0); + + /* + Clear unitialized part of page to avoid valgrind/purify warnings + and to get a clean page that is easier to compress and compare with + pages generated with redo + */ + bzero(info->buff + page_length, share->block_size - page_length); + + + if (info->s->now_transactional && + _ma_log_new(info, *root, info->buff, page_length, keyinfo->key_nr, 1)) + res= -1; + if (_ma_write_keypage(info, keyinfo, *root, page_link->write_lock, + PAGECACHE_PRIORITY_HIGH, info->buff)) + res= -1; + + DBUG_RETURN(res); } /* _ma_enlarge_root */ @@ -389,13 +531,14 @@ int _ma_enlarge_root(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uchar *key, @return @retval -1 error @retval 0 ok - @retval 1 key should be stored in higher tree + @retval > 0 Key should be stored in higher tree */ static int w_search(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, uint comp_flag, uchar *key, uint 
key_length, my_off_t page, - uchar *father_buff, uchar *father_keypos, - my_off_t father_page, my_bool insert_last) + my_off_t father_page, uchar *father_buff, + MARIA_PINNED_PAGE *father_page_link, uchar *father_keypos, + my_bool insert_last) { int error,flag; uint nod_flag, search_key_length; @@ -403,6 +546,7 @@ static int w_search(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, uchar keybuff[HA_MAX_KEY_BUFF]; my_bool was_last_key; my_off_t next_page, dup_key_pos; + MARIA_PINNED_PAGE *page_link; DBUG_ENTER("w_search"); DBUG_PRINT("enter",("page: %ld", (long) page)); @@ -410,7 +554,8 @@ static int w_search(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, if (!(temp_buff= (uchar*) my_alloca((uint) keyinfo->block_length+ HA_MAX_KEY_BUFF*2))) DBUG_RETURN(-1); - if (!_ma_fetch_keypage(info,keyinfo,page,DFLT_INIT_HITS,temp_buff,0)) + if (!_ma_fetch_keypage(info, keyinfo, page, PAGECACHE_LOCK_WRITE, + DFLT_INIT_HITS, temp_buff, 0, &page_link)) goto err; flag=(*keyinfo->bin_search)(info,keyinfo,temp_buff,key,search_key_length, @@ -457,13 +602,19 @@ static int w_search(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, DBUG_ASSERT(subkeys < 0); ft_intXstore(keypos, subkeys); if (!error) - error= _ma_write_keypage(info,keyinfo,page,DFLT_INIT_HITS,temp_buff); + { + page_link->changed= 1; + error= _ma_write_keypage(info, keyinfo, page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS, temp_buff); + } my_afree((uchar*) temp_buff); DBUG_RETURN(error); } } else /* not HA_FULLTEXT, normal HA_NOSAME key */ { + DBUG_PRINT("warning", ("Duplicate key")); info->dup_key_pos= dup_key_pos; my_afree((uchar*) temp_buff); my_errno=HA_ERR_FOUND_DUPP_KEY; @@ -476,12 +627,15 @@ static int w_search(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, insert_last=0; next_page= _ma_kpos(nod_flag,keypos); if (next_page == HA_OFFSET_ERROR || - (error=w_search(info, keyinfo, comp_flag, key, key_length, next_page, - temp_buff, keypos, page, insert_last)) >0) + (error= 
w_search(info, keyinfo, comp_flag, key, key_length, next_page, + page, temp_buff, page_link, keypos, insert_last)) > 0) { - error= _ma_insert(info,keyinfo,key,temp_buff,keypos,keybuff,father_buff, - father_keypos,father_page, insert_last); - if (_ma_write_keypage(info,keyinfo,page,DFLT_INIT_HITS,temp_buff)) + error= _ma_insert(info, keyinfo, key, temp_buff, keypos, page, keybuff, + father_page, father_buff, father_page_link, + father_keypos, insert_last); + page_link->changed= 1; + if (_ma_write_keypage(info, keyinfo, page, PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS,temp_buff)) goto err; } my_afree((uchar*) temp_buff); @@ -500,9 +654,10 @@ err: _ma_insert() info Open table information. keyinfo Key definition information. - key New key. + key New key anc_buff Key page (beginning). key_pos Position in key page where to insert. + anc_page Page number for anc_buff key_buff Copy of previous key. father_buff parent key page for balancing. father_key_pos position in parent key page for balancing. @@ -511,20 +666,23 @@ err: DESCRIPTION Insert new key at right of key_pos. + Note that caller must save anc_buff RETURN - 2 if key contains key to upper level. - 0 OK. < 0 Error. 
+ 0 OK + 1 If key contains key to upper level (from balance page) + 2 If key contains key to upper level (from split space) */ int _ma_insert(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, - uchar *key, uchar *anc_buff, uchar *key_pos, uchar *key_buff, - uchar *father_buff, uchar *father_key_pos, my_off_t father_page, - my_bool insert_last) + uchar *key, uchar *anc_buff, uchar *key_pos, my_off_t anc_page, + uchar *key_buff, my_off_t father_page, uchar *father_buff, + MARIA_PINNED_PAGE *father_page_link, uchar *father_key_pos, + my_bool insert_last) { - uint a_length,nod_flag; - int t_length; + uint a_length, nod_flag, org_anc_length; + int t_length, res; uchar *endpos, *prev_key; MARIA_KEY_PARAM s_temp; DBUG_ENTER("_ma_insert"); @@ -533,6 +691,7 @@ int _ma_insert(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, USE_WHOLE_KEY);); _ma_get_used_and_nod(info, anc_buff, a_length, nod_flag); + org_anc_length= a_length; endpos= anc_buff+ a_length; prev_key= (key_pos == anc_buff + info->s->keypage_header + nod_flag ? (uchar*) 0 : key_buff); @@ -577,6 +736,11 @@ int _ma_insert(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, (*keyinfo->store_key)(keyinfo,key_pos,&s_temp); a_length+=t_length; _ma_store_page_used(info, anc_buff, a_length, nod_flag); + + /* + Check if the new key fits totally into the the page + (anc_buff is big enough to contain a full page + one key) + */ if (a_length <= (uint) keyinfo->block_length - KEYPAGE_CHECKSUM_SIZE) { if (keyinfo->block_length - KEYPAGE_CHECKSUM_SIZE - a_length < 32 && @@ -588,8 +752,8 @@ int _ma_insert(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, Normal word. One-level tree. Page is almost full. Let's consider converting. 
We'll compare 'key' and the first key at anc_buff - */ - uchar *a= key, *b= anc_buff + info->s->keypage_header + nod_flag; + */ + const uchar *a= key, *b= anc_buff + info->s->keypage_header + nod_flag; uint alen, blen, ft2len=info->s->ft2_keyinfo.keylength; /* the very first key on the page is always unpacked */ DBUG_ASSERT((*b & 128) == 0); @@ -621,7 +785,7 @@ int _ma_insert(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, */ b+=blen+ft2len+2; for (a=anc_buff+a_length ; b < a ; b+=ft2len+2) - insert_dynamic(info->ft1_to_ft2, b); + insert_dynamic(info->ft1_to_ft2, (uchar*) b); /* fixing the page's length - it contains only one key now */ _ma_store_page_used(info, anc_buff, info->s->keypage_header + blen + @@ -630,6 +794,13 @@ int _ma_insert(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, /* the rest will be done when we're back from recursion */ } } + else + { + if (info->s->now_transactional && + _ma_log_add(info, anc_page, anc_buff, endpos, key_pos, + s_temp.changed_length, t_length)) + DBUG_RETURN(-1); + } DBUG_RETURN(0); /* There is room on page */ } /* Page is full */ @@ -637,29 +808,59 @@ int _ma_insert(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, insert_last=0; if (!(keyinfo->flag & (HA_VAR_LENGTH_KEY | HA_BINARY_PACK_KEY)) && father_buff && !insert_last) - DBUG_RETURN(_ma_balance_page(info,keyinfo,key,anc_buff,father_buff, - father_key_pos,father_page)); - DBUG_RETURN(_ma_split_page(info,keyinfo,key,anc_buff,key_buff, insert_last)); + { + s_temp.key_pos= key_pos; + s_temp.move_length= t_length; + father_page_link->changed= 1; + DBUG_RETURN(_ma_balance_page(info, keyinfo, key, anc_buff, anc_page, + father_page, father_buff, father_key_pos, + &s_temp)); + } + + res= _ma_split_page(info,keyinfo,key,anc_buff,key_buff, insert_last); + if (res < 0) + DBUG_RETURN(res); /* Error */ + + if (info->s->now_transactional) + { + if (_ma_log_split(info, anc_page, anc_buff, org_anc_length, + _ma_get_page_used(info, anc_buff), + key_pos, + 
s_temp.changed_length, + t_length, KEY_OP_NONE, (uchar*) 0, 0, 0)) + res= -1; + } + DBUG_RETURN(res); } /* _ma_insert */ - /* split a full page in two and assign emerging item to key */ +/** + @brief split a full page in two and assign emerging item to key + + RETURN + @retval 0 ok + @retval -1 error +*/ int _ma_split_page(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, uchar *key, uchar *buff, uchar *key_buff, my_bool insert_last_key) { uint length,a_length,key_ref_length,t_length,nod_flag,key_length; - uchar *key_pos,*pos, *after_key; + uint page_length; + uchar *key_pos,*pos, *after_key, *new_buff; my_off_t new_pos; MARIA_KEY_PARAM s_temp; + MARIA_PINNED_PAGE tmp_page_link, *page_link= &tmp_page_link; + int res; DBUG_ENTER("maria_split_page"); + LINT_INIT(after_key); DBUG_DUMP("buff", buff, _ma_get_page_used(info, buff)); - if (info->s->keyinfo+info->lastinx == keyinfo) - info->page_changed=1; /* Info->buff is used */ + info->page_changed=1; /* Info->buff is used */ info->keyread_buff_used=1; + new_buff= info->buff; nod_flag=_ma_test_if_nod(info, buff); key_ref_length= info->s->keypage_header + nod_flag; if (insert_last_key) @@ -680,12 +881,13 @@ int _ma_split_page(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, { DBUG_PRINT("test",("Splitting nod")); pos=key_pos-nod_flag; - memcpy((uchar*) info->buff + info->s->keypage_header, (uchar*) pos, + memcpy((uchar*) new_buff + info->s->keypage_header, (uchar*) pos, (size_t) nod_flag); } - /* Move middle item to key and pointer to new page */ - if ((new_pos= _ma_new(info,keyinfo,DFLT_INIT_HITS)) == HA_OFFSET_ERROR) + /* Move middle item to key and pointer to new page */ + if ((new_pos= _ma_new(info, PAGECACHE_PRIORITY_HIGH, &page_link)) == + HA_OFFSET_ERROR) DBUG_RETURN(-1); _ma_kpointer(info, _ma_move_key(keyinfo,key,key_buff),new_pos); @@ -697,21 +899,30 @@ int _ma_split_page(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, (uchar*) 0, (uchar*) 0, key_buff, &s_temp); length=(uint) 
((buff+a_length)-key_pos); - memcpy((uchar*) info->buff+key_ref_length+t_length,(uchar*) key_pos, + memcpy((uchar*) new_buff+key_ref_length+t_length,(uchar*) key_pos, (size_t) length); - (*keyinfo->store_key)(keyinfo,info->buff+key_ref_length,&s_temp); - _ma_store_page_used(info, info->buff, length + t_length + key_ref_length, - nod_flag); + (*keyinfo->store_key)(keyinfo,new_buff+key_ref_length,&s_temp); + page_length= length + t_length + key_ref_length; + bzero(new_buff, info->s->keypage_header); + _ma_store_page_used(info, new_buff, page_length, nod_flag); /* Copy key number */ - info->buff[info->s->keypage_header - KEYPAGE_USED_SIZE - KEYPAGE_KEYID_SIZE - + new_buff[info->s->keypage_header - KEYPAGE_USED_SIZE - KEYPAGE_KEYID_SIZE - KEYPAGE_FLAG_SIZE]= buff[info->s->keypage_header - KEYPAGE_USED_SIZE - KEYPAGE_KEYID_SIZE - KEYPAGE_FLAG_SIZE]; - if (_ma_write_keypage(info,keyinfo,new_pos,DFLT_INIT_HITS,info->buff)) - DBUG_RETURN(-1); + + res= 2; /* Middle key up */ + if (info->s->now_transactional && + _ma_log_new(info, new_pos, new_buff, page_length, keyinfo->key_nr, 0)) + res= -1; + bzero(new_buff + page_length, info->s->block_size - page_length); + + if (_ma_write_keypage(info, keyinfo, new_pos, page_link->write_lock, + DFLT_INIT_HITS, new_buff)) + res= -1; DBUG_DUMP("key",(uchar*) key, _ma_keylength(keyinfo,key)); - DBUG_RETURN(2); /* Middle key up */ + DBUG_RETURN(res); } /* _ma_split_page */ @@ -821,18 +1032,33 @@ static uchar *_ma_find_last_pos(MARIA_HA *info, MARIA_KEYDEF *keyinfo, } /* _ma_find_last_pos */ - /* Balance page with not packed keys with page on right/left */ - /* returns 0 if balance was done */ +/** + @brief Balance page with static size keys with page on right/left + + @param key Middle key will be stored here + + @notes + Father_buff will always be changed + + @return + @retval 0 Balance was done + @retval 1 Middle key up + @retval -1 Error +*/ static int _ma_balance_page(register MARIA_HA *info, MARIA_KEYDEF *keyinfo, - uchar *key, uchar 
*curr_buff, uchar *father_buff, - uchar *father_key_pos, my_off_t father_page) + uchar *key, uchar *curr_buff, + my_off_t curr_page, + my_off_t father_page, uchar *father_buff, + uchar *father_key_pos, MARIA_KEY_PARAM *s_temp) { + MARIA_PINNED_PAGE *next_page_link; + MARIA_PINNED_PAGE tmp_page_link, *new_page_link= &tmp_page_link; my_bool right; - uint k_length,father_length,father_keylength,nod_flag,curr_keylength, - right_length,left_length,new_right_length,new_left_length,extra_length, - length,keys; - uchar *pos,*buff,*extra_buff; + uint k_length,father_length,father_keylength,nod_flag,curr_keylength; + uint right_length,left_length,new_right_length,new_left_length,extra_length; + uint keys, tmp_length, extra_buff_length; + uchar *pos,*buff,*extra_buff, *parting_key; my_off_t next_page,new_pos; uchar tmp_part_key[HA_MAX_KEY_BUFF]; DBUG_ENTER("_ma_balance_page"); @@ -853,24 +1079,26 @@ static int _ma_balance_page(register MARIA_HA *info, MARIA_KEYDEF *keyinfo, next_page= _ma_kpos(info->s->base.key_reflength, father_key_pos+father_keylength); buff=info->buff; - DBUG_PRINT("test",("use right page: %lu", (ulong) next_page)); + DBUG_PRINT("info", ("use right page: %lu", (ulong) next_page)); } else { right=0; father_key_pos-=father_keylength; next_page= _ma_kpos(info->s->base.key_reflength,father_key_pos); - /* Fix that curr_buff is to left */ - buff=curr_buff; curr_buff=info->buff; - DBUG_PRINT("test",("use left page: %lu", (ulong) next_page)); + /* Move curr_buff so that it's on the left */ + buff= curr_buff; + curr_buff= info->buff; + DBUG_PRINT("info", ("use left page: %lu", (ulong) next_page)); } /* father_key_pos ptr to parting key */ - if (!_ma_fetch_keypage(info,keyinfo,next_page,DFLT_INIT_HITS,info->buff,0)) + if (!_ma_fetch_keypage(info,keyinfo, next_page, PAGECACHE_LOCK_WRITE, + DFLT_INIT_HITS, info->buff, 0, &next_page_link)) goto err; + next_page_link->changed= 1; DBUG_DUMP("next", info->buff, _ma_get_page_used(info, info->buff)); - /* Test if there is 
room to share keys */ - + /* Test if there is room to share keys */ left_length= _ma_get_page_used(info, curr_buff); right_length= _ma_get_page_used(info, buff); keys= ((left_length+right_length-info->s->keypage_header*2-nod_flag*2)/ @@ -878,15 +1106,23 @@ static int _ma_balance_page(register MARIA_HA *info, MARIA_KEYDEF *keyinfo, if ((right ? right_length : left_length) + curr_keylength <= (uint) keyinfo->block_length - KEYPAGE_CHECKSUM_SIZE) - { /* Merge buffs */ + { + /* Enough space to hold all keys in the two buffers ; Balance bufferts */ new_left_length= info->s->keypage_header+nod_flag+(keys/2)*curr_keylength; new_right_length=info->s->keypage_header+nod_flag+(((keys+1)/2)* curr_keylength); - _ma_store_page_used(info, curr_buff,new_left_length,nod_flag); - _ma_store_page_used(info, buff,new_right_length,nod_flag); + _ma_store_page_used(info, curr_buff, new_left_length, nod_flag); + _ma_store_page_used(info, buff, new_right_length, nod_flag); + DBUG_PRINT("info", ("left_length: %u -> %u right_length: %u -> %u", + left_length, new_left_length, + right_length, new_right_length)); if (left_length < new_left_length) - { /* Move keys buff -> leaf */ + { + uint length; + DBUG_PRINT("info", ("move keys to end of buff")); + + /* Move keys buff -> curr_buff */ pos=curr_buff+left_length; memcpy(pos,father_key_pos, (size_t) k_length); memcpy(pos+k_length, buff + info->s->keypage_header, @@ -894,30 +1130,150 @@ static int _ma_balance_page(register MARIA_HA *info, MARIA_KEYDEF *keyinfo, pos= buff + info->s->keypage_header + length; memcpy(father_key_pos, pos, (size_t) k_length); bmove(buff + info->s->keypage_header, pos + k_length, new_right_length); + + if (info->s->now_transactional) + { + if (right) + { + /* + Log changes to page on left + The original page is on the left and stored in curr_buff + We have on the page the newly inserted key and data + from buff added last on the page + */ + if (_ma_log_split(info, curr_page, curr_buff, + left_length - 
s_temp->move_length, + new_left_length, + s_temp->key_pos, s_temp->changed_length, + s_temp->move_length, + KEY_OP_ADD_SUFFIX, + curr_buff + left_length, + new_left_length - left_length, + new_left_length - left_length+ k_length)) + goto err; + /* + Log changes to page on right + This contains the original data with some keys deleted from + start of page + */ + if (_ma_log_prefix(info, next_page, buff, 0, + ((int) new_right_length - (int) right_length))) + goto err; + } + else + { + /* + Log changes to page on right (the original page) which is in buff + Data is removed from start of page + The inserted key may be in buff or moved to curr_buff + */ + if (_ma_log_del_prefix(info, curr_page, buff, + right_length - s_temp->changed_length, + new_right_length, + s_temp->key_pos, s_temp->changed_length, + s_temp->move_length)) + goto err; + /* + Log changes to page on left, which has new data added last + */ + if (_ma_log_suffix(info, next_page, curr_buff, + left_length, new_left_length)) + goto err; + } + } } else - { /* Move keys -> buff */ + { + uint length; + DBUG_PRINT("info", ("move keys to start of buff")); bmove_upp(buff + new_right_length, buff + right_length, right_length - info->s->keypage_header); - length=new_right_length-right_length-k_length; + length= new_right_length -right_length - k_length; memcpy(buff + info->s->keypage_header + length, father_key_pos, (size_t) k_length); pos=curr_buff+new_left_length; memcpy(father_key_pos, pos, (size_t) k_length); - memcpy(buff + info->s->keypage_header, pos+k_length, - (size_t) length); + memcpy(buff + info->s->keypage_header, pos+k_length, (size_t) length); + + if (info->s->now_transactional) + { + if (right) + { + /* + Log changes to page on left + The original page is on the left and stored in curr_buff + The page is shortened from end and the key may be on the page + */ + if (_ma_log_split(info, curr_page, curr_buff, + left_length - s_temp->move_length, + new_left_length, + s_temp->key_pos, 
s_temp->changed_length, + s_temp->move_length, + KEY_OP_NONE, (uchar*) 0, 0, 0)) + goto err; + /* + Log changes to page on right + This contains the original data, with some data from cur_buff + added first + */ + if (_ma_log_prefix(info, next_page, buff, + (uint) (new_right_length - right_length), + (int) (new_right_length - right_length))) + goto err; + } + else + { + /* + Log changes to page on right (the original page) which is in buff + We have on the page the newly inserted key and data + from buff added first on the page + */ + uint diff_length= new_right_length - right_length; + if (_ma_log_split(info, curr_page, buff, + left_length - s_temp->move_length, + new_right_length, + s_temp->key_pos + diff_length, + s_temp->changed_length, + s_temp->move_length, + KEY_OP_ADD_PREFIX, + buff + info->s->keypage_header, + diff_length, diff_length + k_length)) + goto err; + /* + Log changes to page on left, which is shortened from end + */ + if (_ma_log_suffix(info, next_page, curr_buff, + left_length, new_left_length)) + goto err; + } + } } - if (_ma_write_keypage(info,keyinfo,next_page,DFLT_INIT_HITS,info->buff) || - _ma_write_keypage(info,keyinfo,father_page,DFLT_INIT_HITS,father_buff)) + /* Log changes to father (one level up) page */ + + if (info->s->now_transactional && + _ma_log_change(info, father_page, father_buff, father_key_pos, + k_length)) + goto err; + + /* + next_page_link->changed is marked as true above and fathers + page_link->changed is marked as true in caller + */ + if (_ma_write_keypage(info, keyinfo, next_page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, + DFLT_INIT_HITS, info->buff) || + _ma_write_keypage(info, keyinfo, father_page, + PAGECACHE_LOCK_LEFT_WRITELOCKED, DFLT_INIT_HITS, + father_buff)) goto err; DBUG_RETURN(0); } - /* curr_buff[] and buff[] are full, lets split and make new nod */ + /* curr_buff[] and buff[] are full, lets split and make new nod */ - extra_buff=info->buff+info->s->base.max_key_block_length; + extra_buff= 
info->buff+info->s->base.max_key_block_length; new_left_length= new_right_length= (info->s->keypage_header + nod_flag + (keys+1) / 3 * curr_keylength); /* @@ -927,51 +1283,141 @@ static int _ma_balance_page(register MARIA_HA *info, MARIA_KEYDEF *keyinfo, */ if (keys == 5) /* Too few keys to balance */ new_left_length-=curr_keylength; - extra_length=nod_flag+left_length+right_length- - new_left_length-new_right_length-curr_keylength; + extra_length= (nod_flag + left_length + right_length - + new_left_length - new_right_length - curr_keylength); + extra_buff_length= extra_length + info->s->keypage_header; DBUG_PRINT("info",("left_length: %d right_length: %d new_left_length: %d new_right_length: %d extra_length: %d", - left_length, right_length, - new_left_length, new_right_length, - extra_length)); + left_length, right_length, + new_left_length, new_right_length, + extra_length)); _ma_store_page_used(info, curr_buff,new_left_length,nod_flag); _ma_store_page_used(info, buff,new_right_length,nod_flag); - /* Copy key number */ + bzero(extra_buff, info->s->keypage_header); extra_buff[info->s->keypage_header - KEYPAGE_USED_SIZE - KEYPAGE_KEYID_SIZE - KEYPAGE_FLAG_SIZE]= buff[info->s->keypage_header - KEYPAGE_USED_SIZE - KEYPAGE_KEYID_SIZE - KEYPAGE_FLAG_SIZE]; - _ma_store_page_used(info, extra_buff, extra_length + info->s->keypage_header, - nod_flag); + _ma_store_page_used(info, extra_buff, extra_buff_length, nod_flag); /* move first largest keys to new page */ pos=buff+right_length-extra_length; - memcpy(extra_buff + info->s->keypage_header, pos, - (size_t) extra_length); - /* Save new parting key */ + memcpy(extra_buff + info->s->keypage_header, pos, extra_length); + /* Zero old data from buffer */ + bzero(extra_buff + extra_buff_length, + info->s->block_size - extra_buff_length); + + /* Save new parting key between buff and extra_buff */ memcpy(tmp_part_key, pos-k_length,k_length); /* Make place for new keys */ bmove_upp(buff+ new_right_length, pos - k_length, - 
right_length - extra_length - k_length - info->s->keypage_header); + right_length - extra_length - k_length - info->s->keypage_header); /* Copy keys from left page */ pos= curr_buff+new_left_length; memcpy(buff + info->s->keypage_header, pos + k_length, - (size_t) (length=left_length-new_left_length-k_length)); + (size_t) (tmp_length= left_length - new_left_length - k_length)); /* Copy old parting key */ - memcpy(buff + info->s->keypage_header + length, - father_key_pos, (size_t) k_length); + parting_key= buff + info->s->keypage_header + tmp_length; + memcpy(parting_key, father_key_pos, (size_t) k_length); /* Move new parting keys up to caller */ memcpy((right ? key : father_key_pos),pos,(size_t) k_length); memcpy((right ? father_key_pos : key),tmp_part_key, k_length); - if ((new_pos= _ma_new(info,keyinfo,DFLT_INIT_HITS)) == HA_OFFSET_ERROR) + if ((new_pos= _ma_new(info, DFLT_INIT_HITS, &new_page_link)) + == HA_OFFSET_ERROR) goto err; _ma_kpointer(info,key+k_length,new_pos); - if (_ma_write_keypage(info,keyinfo,(right ? new_pos : next_page), - DFLT_INIT_HITS,info->buff) || - _ma_write_keypage(info,keyinfo,(right ? next_page : new_pos), - DFLT_INIT_HITS,extra_buff)) + + if (info->s->now_transactional) + { + if (right) + { + /* + Page order according to key values: + orignal_page (curr_buff), next_page (buff), extra_buff + + cur_buff is shortened, + buff is getting new keys at start and shortened from end. + extra_buff is new page + + Note that extra_buff (largest key parts) will be stored at the + place of the original 'right' page (next_page) and right page (buff) + will be stored at new_pos. 
+ + This makes the log entries smaller as right_page contains all + data to generate the data extra_buff + */ + + /* + Log changes to page on left (page shortened page at end) + */ + if (_ma_log_split(info, curr_page, curr_buff, + left_length, new_left_length, + s_temp->key_pos, s_temp->changed_length, + s_temp->move_length, + KEY_OP_NONE, (uchar*) 0, 0, 0)) + goto err; + /* + Log changes to right page (stored at next page) + This contains the last 'extra_buff' from 'buff' + */ + if (_ma_log_prefix(info, next_page, extra_buff, + 0, (int) (extra_length - right_length))) + goto err; + + /* + Log changes to middle page, which is stored at the new page + position + */ + if (_ma_log_new(info, new_pos, buff, new_right_length, + keyinfo->key_nr, 0)) + goto err; + } + else + { + /* + Log changes to page on right (the original page) which is in buff + This contains the original data, with some data from curr_buff + added first and shortened at end + */ + int data_added_first= left_length - new_left_length; + if (_ma_log_key_middle(info, curr_page, buff, + new_right_length, + data_added_first, + data_added_first, + extra_length, + s_temp->key_pos, + s_temp->changed_length, + s_temp->move_length)) + goto err; + + /* Log changes to page on left, which is shortened from end */ + if (_ma_log_suffix(info, next_page, curr_buff, + left_length, new_left_length)) + goto err; + + /* Log change to rightmost (new) page */ + if (_ma_log_new(info, new_pos, extra_buff, + extra_buff_length, keyinfo->key_nr, 0)) + goto err; + } + + /* Log changes to father (one level up) page */ + if (info->s->now_transactional && + _ma_log_change(info, father_page, father_buff, father_key_pos, + k_length)) + goto err; + } + + if (_ma_write_keypage(info, keyinfo, (right ? new_pos : next_page), + (right ? new_page_link->write_lock : + PAGECACHE_LOCK_LEFT_WRITELOCKED), + DFLT_INIT_HITS, info->buff) || + _ma_write_keypage(info, keyinfo, (right ? next_page : new_pos), + (!right ? 
new_page_link->write_lock : + PAGECACHE_LOCK_LEFT_WRITELOCKED), + DFLT_INIT_HITS, extra_buff)) goto err; DBUG_RETURN(1); /* Middle key up */ @@ -980,6 +1426,7 @@ err: DBUG_RETURN(-1); } /* _ma_balance_page */ + /********************************************************************** * Bulk insert code * **********************************************************************/ @@ -990,15 +1437,16 @@ typedef struct { } bulk_insert_param; -int _ma_ck_write_tree(register MARIA_HA *info, uint keynr, uchar *key, - uint key_length) +static int _ma_ck_write_tree(register MARIA_HA *info, uint keynr, uchar *key, + uint key_length) { int error; DBUG_ENTER("_ma_ck_write_tree"); - error= tree_insert(&info->bulk_insert[keynr], key, - key_length + info->s->rec_reflength, - info->bulk_insert[keynr].custom_arg) ? 0 : HA_ERR_OUT_OF_MEM ; + error= (tree_insert(&info->bulk_insert[keynr], key, + key_length + info->s->rec_reflength, + info->bulk_insert[keynr].custom_arg) ? 0 : + HA_ERR_OUT_OF_MEM) ; DBUG_RETURN(error); } /* _ma_ck_write_tree */ @@ -1037,8 +1485,8 @@ static int keys_free(uchar *key, TREE_FREE mode, bulk_insert_param *param) keyinfo=param->info->s->keyinfo+param->keynr; keylen= _ma_keylength(keyinfo, key); memcpy(lastkey, key, keylen); - return _ma_ck_write_btree(param->info,param->keynr,lastkey, - keylen - param->info->s->rec_reflength); + return _ma_ck_write_btree(param->info, param->keynr, lastkey, + keylen - param->info->s->rec_reflength); case free_end: if (param->info->s->concurrent_insert) rw_unlock(¶m->info->s->key_root_lock[param->keynr]); @@ -1128,12 +1576,670 @@ void maria_end_bulk_insert(MARIA_HA *info) for (i=0 ; i < info->s->base.keys ; i++) { if (is_tree_inited(& info->bulk_insert[i])) - { - delete_tree(& info->bulk_insert[i]); - } + delete_tree(&info->bulk_insert[i]); } - my_free((void *)info->bulk_insert, MYF(0)); + my_free(info->bulk_insert, MYF(0)); info->bulk_insert=0; } DBUG_VOID_RETURN; } + + 
+/**************************************************************************** + Dedicated functions that generate log entries +****************************************************************************/ + +/** + @brief Log creation of new page + + @note + We don't have to store the page_length into the log entry as we can + calculate this from the length of the log entry + + @retval 1 error + @retval 0 ok +*/ + +static my_bool _ma_log_new(MARIA_HA *info, my_off_t page, uchar *buff, + uint page_length, uint key_nr, my_bool root_page) +{ + LSN lsn; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE * 2 + KEY_NR_STORE_SIZE + +1]; + LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; + MARIA_SHARE *share=info->s; + DBUG_ENTER("_ma_log_new"); + DBUG_PRINT("enter", ("page: %lu", (ulong) page)); + + DBUG_ASSERT(info->s->now_transactional); + + /* Store address of new root page */ + page/= info->s->block_size; + page_store(log_data + FILEID_STORE_SIZE, page); + + /* Store link to next unused page */ + if (info->used_key_del == 2) + page= 0; /* key_del not changed */ + else + page= ((share->current_key_del == HA_OFFSET_ERROR) ? 
IMPOSSIBLE_PAGE_NO : + share->current_key_del / info->s->block_size); + + page_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE, page); + key_nr_store(log_data + FILEID_STORE_SIZE + PAGE_STORE_SIZE*2, key_nr); + log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE*2 + KEY_NR_STORE_SIZE]= + (uchar) root_page; + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + + page_length-= LSN_STORE_SIZE; + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= buff + LSN_STORE_SIZE; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= page_length; + + if (translog_write_record(&lsn, LOGREC_REDO_INDEX_NEW_PAGE, + info->trn, info, sizeof(log_data) + page_length, + TRANSLOG_INTERNAL_PARTS + 2, log_array, + log_data, NULL)) + DBUG_RETURN(1); + DBUG_RETURN(0); +} + + +/** + @brief + Log that a key was added to the page +*/ + +static my_bool _ma_log_add(MARIA_HA *info, my_off_t page, uchar *buff, + uchar *end_buff, uchar *key_pos, + uint changed_length, int move_length) +{ + LSN lsn; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 3 + 3 + 3], *log_pos; + LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; + DBUG_ENTER("_ma_log_add"); + DBUG_PRINT("enter", ("page: %lu", (ulong) page)); + + DBUG_ASSERT(info->s->now_transactional); + + /* + Write REDO entry that contains the logical operations we need + to do the page + */ + log_pos= log_data + FILEID_STORE_SIZE; + page/= info->s->block_size; + page_store(log_pos, page); + log_pos+= PAGE_STORE_SIZE; + + if (key_pos == end_buff) + log_pos[0]= KEY_OP_ADD_SUFFIX; + else + { + uint offset= (uint) (key_pos - buff); + log_pos[0]= KEY_OP_OFFSET; + int2store(log_pos+1, offset); + log_pos+= 3; + if (move_length) + { + log_pos[0]= KEY_OP_SHIFT; + int2store(log_pos+1, move_length); + log_pos+= 3; + } + log_pos[0]= KEY_OP_CHANGE; + } + int2store(log_pos+1, changed_length); + log_pos+= 3; + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS 
+ 0].length= (uint) (log_pos - + log_data); + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= key_pos; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= changed_length; + + if (translog_write_record(&lsn, LOGREC_REDO_INDEX, + info->trn, info, + log_array[TRANSLOG_INTERNAL_PARTS + + 0].length + changed_length, + TRANSLOG_INTERNAL_PARTS + 2, log_array, + log_data, NULL)) + DBUG_RETURN(-1); + DBUG_RETURN(0); +} + + +/** + @brief + Log when some part of the key page changes +*/ + +static my_bool _ma_log_change(MARIA_HA *info, my_off_t page, uchar *buff, + uchar *key_pos, uint length) +{ + LSN lsn; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 6], *log_pos; + LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; + uint offset= (uint) (key_pos - buff); + DBUG_ENTER("_ma_log_change"); + DBUG_PRINT("enter", ("page: %lu", (ulong) page)); + + DBUG_ASSERT(info->s->now_transactional); + + /* Store address of new root page */ + page/= info->s->block_size; + page_store(log_data + FILEID_STORE_SIZE, page); + log_pos= log_data+ FILEID_STORE_SIZE + PAGE_STORE_SIZE; + log_pos[0]= KEY_OP_OFFSET; + int2store(log_pos+1, offset); + log_pos[3]= KEY_OP_CHANGE; + int2store(log_pos+4, length); + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= (char*) log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= sizeof(log_data); + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= buff + offset; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= length; + + if (translog_write_record(&lsn, LOGREC_REDO_INDEX, + info->trn, info, sizeof(log_data) + length, + TRANSLOG_INTERNAL_PARTS + 2, log_array, + log_data, NULL)) + DBUG_RETURN(1); + DBUG_RETURN(0); +} + + +/** + @brief + Write log entry for page that has got a key added to the page under + one and only one of the following senarios: + - Page is shortened from end + - Data is added to end of page + - Data added at front of page + + @param prefix_or_suffix KEY_OP_NONE Ignored + KEY_OP_ADD_PREFIX Add data to start of page + KEY_OP_ADD_SUFFIX Add data to end 
of page + +*/ + +static my_bool _ma_log_split(MARIA_HA *info, my_off_t page, uchar *buff, + uint org_length, uint new_length, + uchar *key_pos, uint key_length, int move_length, + enum en_key_op prefix_or_suffix, + uchar *data, uint data_length, + uint change_length) +{ + LSN lsn; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 3+3+3+3+3+2]; + uchar *log_pos; + LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 3]; + uint offset= (uint) (key_pos - buff); + uint translog_parts, extra_length; + DBUG_ENTER("_ma_log_split"); + DBUG_PRINT("enter", ("page: %lu org_length: %u new_length: %u", + (ulong) page, org_length, new_length)); + + log_pos= log_data + FILEID_STORE_SIZE; + page/= info->s->block_size; + page_store(log_pos, page); + log_pos+= PAGE_STORE_SIZE; + + if (new_length <= offset) + { + /* + Page was split before inserted key. Write redo entry where + we just cut current page at page_length + */ + uint length_offset= org_length - new_length; + log_pos[0]= KEY_OP_DEL_SUFFIX; + int2store(log_pos+1, length_offset); + log_pos+= 3; + translog_parts= 1; + extra_length= 0; + } + else + { + /* Key was added to page which was split after the inserted key */ + uint max_key_length; + + /* + Handle case when split happened directly after the newly inserted key. 
+ */ + max_key_length= new_length - offset; + extra_length= min(key_length, max_key_length); + + if ((int) new_length < (int) (org_length + move_length + data_length)) + { + /* Shorten page */ + uint diff= org_length + move_length + data_length - new_length; + log_pos[0]= KEY_OP_DEL_SUFFIX; + int2store(log_pos + 1, diff); + log_pos+= 3; + } + else + { + DBUG_ASSERT(new_length == org_length + move_length + data_length); + } + + log_pos[0]= KEY_OP_OFFSET; + int2store(log_pos+1, offset); + log_pos+= 3; + + if (move_length) + { + log_pos[0]= KEY_OP_SHIFT; + int2store(log_pos+1, move_length); + log_pos+= 3; + } + + log_pos[0]= KEY_OP_CHANGE; + int2store(log_pos+1, extra_length); + log_pos+= 3; + + /* Point to original inserted key data */ + if (prefix_or_suffix == KEY_OP_ADD_PREFIX) + key_pos+= data_length; + + translog_parts= 2; + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= (char*) key_pos; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= extra_length; + } + + if (data_length) + { + /* Add prefix or suffix */ + log_pos[0]= prefix_or_suffix; + int2store(log_pos+1, data_length); + log_pos+= 3; + if (prefix_or_suffix == KEY_OP_ADD_PREFIX) + { + int2store(log_pos+1, change_length); + log_pos+= 2; + data_length= change_length; + } + log_array[TRANSLOG_INTERNAL_PARTS + translog_parts].str= (char*) data; + log_array[TRANSLOG_INTERNAL_PARTS + translog_parts].length= data_length; + translog_parts++; + extra_length+= data_length; + } + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - + log_data); + DBUG_RETURN(translog_write_record(&lsn, LOGREC_REDO_INDEX, + info->trn, info, + log_array[TRANSLOG_INTERNAL_PARTS + + 0].length + + extra_length, + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_array, log_data, NULL)); +} + + +/** + @brief + Write log entry for page that has got a key added to the page + and page is shortened from start of page + + @fn _ma_log_del_prefix() + @param info Maria handler + @param 
page Page number + @param buff Page buffer + @param org_length Length of buffer when read + @param new_length Final length + @param key_pos Where on page buffer key was added. This is position + before prefix was removed + @param key_length How many bytes was changed at 'key_pos' + @param move_length How many bytes was moved up when key was added + + @return + @retval 0 ok + @retval 1 error +*/ + +static my_bool _ma_log_del_prefix(MARIA_HA *info, my_off_t page, uchar *buff, + uint org_length, uint new_length, + uchar *key_pos, uint key_length, + int move_length) +{ + LSN lsn; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 12], *log_pos; + LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; + uint offset= (uint) (key_pos - buff); + uint diff_length= org_length + move_length - new_length; + uint translog_parts, extra_length; + DBUG_ENTER("_ma_log_del_prefix"); + DBUG_PRINT("enter", ("page: %lu org_length: %u new_length: %u", + (ulong) page, org_length, new_length)); + + DBUG_ASSERT((int) diff_length > 0); + + log_pos= log_data + FILEID_STORE_SIZE; + page/= info->s->block_size; + page_store(log_pos, page); + log_pos+= PAGE_STORE_SIZE; + + translog_parts= 1; + extra_length= 0; + + if (offset <= diff_length) + { + /* + Key is not anymore on page. 
Move data down, but take into account that + the original page had grown with 'move_length bytes' + */ + log_pos[0]= KEY_OP_DEL_PREFIX; + int2store(log_pos+1, diff_length - move_length); + log_pos+= 3; + } + else + { + /* + Correct position to key, as data before key has been delete and key + has thus been moved down + */ + offset-= diff_length; + key_pos-= diff_length; + + /* Move data down */ + log_pos[0]= KEY_OP_DEL_PREFIX; + int2store(log_pos+1, diff_length); + log_pos+= 3; + + log_pos[0]= KEY_OP_OFFSET; + int2store(log_pos+1, offset); + log_pos+= 3; + + if (move_length) + { + log_pos[0]= KEY_OP_SHIFT; + int2store(log_pos+1, move_length); + log_pos+= 3; + } + log_pos[0]= KEY_OP_CHANGE; + int2store(log_pos+1, key_length); + log_pos+= 3; + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= (char*) key_pos; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= key_length; + translog_parts= 2; + extra_length= key_length; + } + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - + log_data); + DBUG_RETURN(translog_write_record(&lsn, LOGREC_REDO_INDEX, + info->trn, info, + log_array[TRANSLOG_INTERNAL_PARTS + + 0].length + + extra_length, + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_array, log_data, NULL)); +} + + +/** + @brief + Write log entry for page that has got data added first and + data deleted last. 
Old changed key may be part of page +*/ + +static my_bool _ma_log_key_middle(MARIA_HA *info, my_off_t page, uchar *buff, + uint new_length, + uint data_added_first, + uint data_changed_first, + uint data_deleted_last, + uchar *key_pos, + uint key_length, int move_length) +{ + LSN lsn; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 3+5+3+3+3]; + uchar *log_pos; + LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 4]; + uint key_offset; + uint translog_parts, extra_length; + DBUG_ENTER("_ma_log_key_middle"); + DBUG_PRINT("enter", ("page: %lu", (ulong) page)); + + /* new place of key after changes */ + key_pos+= data_added_first; + key_offset= (uint) (key_pos - buff); + if (key_offset < new_length) + { + /* key is on page; Calculate how much of the key is there */ + uint max_key_length= new_length - key_offset; + if (max_key_length < key_length) + { + /* Key is last on page */ + key_length= max_key_length; + move_length= 0; + } + /* + Take into account that new data was added as part of original key + that also needs to be removed from page + */ + data_deleted_last+= move_length; + } + + page/= info->s->block_size; + + /* First log changes to page */ + log_pos= log_data + FILEID_STORE_SIZE; + page_store(log_pos, page); + log_pos+= PAGE_STORE_SIZE; + + log_pos[0]= KEY_OP_DEL_SUFFIX; + int2store(log_pos+1, data_deleted_last); + log_pos+= 3; + + log_pos[0]= KEY_OP_ADD_PREFIX; + int2store(log_pos+1, data_added_first); + int2store(log_pos+3, data_changed_first); + log_pos+= 5; + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - + log_data); + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= ((char*) buff + + info->s->keypage_header); + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= data_changed_first; + translog_parts= 2; + extra_length= data_changed_first; + + /* If changed key is on page, log those changes too */ + + if (key_offset < new_length) + { + uchar *start_log_pos= log_pos; + + log_pos[0]= 
KEY_OP_OFFSET; + int2store(log_pos+1, key_offset); + log_pos+= 3; + if (move_length) + { + log_pos[0]= KEY_OP_SHIFT; + int2store(log_pos+1, move_length); + log_pos+= 3; + } + log_pos[0]= KEY_OP_CHANGE; + int2store(log_pos+1, key_length); + log_pos+= 3; + + log_array[TRANSLOG_INTERNAL_PARTS + 2].str= (char*) start_log_pos; + log_array[TRANSLOG_INTERNAL_PARTS + 2].length= (uint) (log_pos - + start_log_pos); + + log_array[TRANSLOG_INTERNAL_PARTS + 3].str= (char*) key_pos; + log_array[TRANSLOG_INTERNAL_PARTS + 3].length= key_length; + translog_parts+=2; + extra_length+= log_array[TRANSLOG_INTERNAL_PARTS + 2].length + key_length; + } + + DBUG_RETURN(translog_write_record(&lsn, LOGREC_REDO_INDEX, + info->trn, info, + log_array[TRANSLOG_INTERNAL_PARTS + + 0].length + extra_length, + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_array, log_data, NULL)); +} + + +/** + @brief + Write log entry for page that has got data added or deleted at start of page +*/ + +static my_bool _ma_log_prefix(MARIA_HA *info, my_off_t page, + uchar *buff, uint changed_length, + int move_length) +{ + uint translog_parts; + LSN lsn; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 7], *log_pos; + LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; + DBUG_ENTER("_ma_log_prefix"); + DBUG_PRINT("enter", ("page: %lu change_length: %u move_length: %d", + (ulong) page, changed_length, move_length)); + + page/= info->s->block_size; + log_pos= log_data + FILEID_STORE_SIZE; + page_store(log_pos, page); + log_pos+= PAGE_STORE_SIZE; + + if (move_length < 0) + { + /* Delete prefix */ + DBUG_ASSERT(changed_length == 0); + log_pos[0]= KEY_OP_DEL_PREFIX; + int2store(log_pos+1, -move_length); + log_pos+= 3; + translog_parts= 1; + } + else + { + /* Add prefix */ + DBUG_ASSERT(changed_length >0 && (int) changed_length >= move_length); + log_pos[0]= KEY_OP_ADD_PREFIX; + int2store(log_pos+1, move_length); + int2store(log_pos+3, changed_length); + log_pos+= 5; + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= 
((char*) buff + + info->s->keypage_header); + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= changed_length; + translog_parts= 2; + } + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - + log_data); + + DBUG_RETURN(translog_write_record(&lsn, LOGREC_REDO_INDEX, + info->trn, info, + log_array[TRANSLOG_INTERNAL_PARTS + + 0].length + changed_length, + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_array, log_data, NULL)); +} + + +/** + @brief + Write log entry for page that has got data added or deleted at end of page +*/ + +static my_bool _ma_log_suffix(MARIA_HA *info, my_off_t page, + uchar *buff, uint org_length, + uint new_length) +{ + LSN lsn; + LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 10], *log_pos; + int diff; + uint translog_parts, extra_length; + DBUG_ENTER("_ma_log_suffix"); + DBUG_PRINT("enter", ("page: %lu org_length: %u new_length: %u", + (ulong) page, org_length, new_length)); + + page/= info->s->block_size; + + log_pos= log_data + FILEID_STORE_SIZE; + page_store(log_pos, page); + log_pos+= PAGE_STORE_SIZE; + + if ((diff= (int) (new_length - org_length)) < 0) + { + log_pos[0]= KEY_OP_DEL_SUFFIX; + int2store(log_pos+1, -diff); + log_pos+= 3; + translog_parts= 1; + extra_length= 0; + } + else + { + log_pos[0]= KEY_OP_ADD_SUFFIX; + int2store(log_pos+1, diff); + log_pos+= 3; + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= (char*) buff + org_length; + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= (uint) diff; + translog_parts= 2; + extra_length= (uint) diff; + } + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - + log_data); + + DBUG_RETURN(translog_write_record(&lsn, LOGREC_REDO_INDEX, + info->trn, info, + log_array[TRANSLOG_INTERNAL_PARTS + + 0].length + extra_length, + TRANSLOG_INTERNAL_PARTS + translog_parts, + log_array, log_data, NULL)); +} + + 
+#ifdef NOT_NEEDED + +/** + @brief + Write log entry for page that has got data added first and + data deleted last +*/ + +static my_bool _ma_log_middle(MARIA_HA *info, my_off_t page, + uchar *buff, + uint data_added_first, uint data_changed_first, + uint data_deleted_last) +{ + LSN lsn; + LEX_STRING log_array[TRANSLOG_INTERNAL_PARTS + 2]; + uchar log_data[FILEID_STORE_SIZE + PAGE_STORE_SIZE + 3 + 5], *log_pos; + DBUG_ENTER("_ma_log_middle"); + DBUG_PRINT("enter", ("page: %lu", (ulong) page)); + + page/= info->s->block_size; + + log_pos= log_data + FILEID_STORE_SIZE; + page_store(log_pos, page); + log_pos+= PAGE_STORE_SIZE; + + log_pos[0]= KEY_OP_DEL_PREFIX; + int2store(log_pos+1, data_deleted_last); + log_pos+= 3; + + log_pos[0]= KEY_OP_ADD_PREFIX; + int2store(log_pos+1, data_added_first); + int2store(log_pos+3, data_changed_first); + log_pos+= 5; + + log_array[TRANSLOG_INTERNAL_PARTS + 0].str= log_data; + log_array[TRANSLOG_INTERNAL_PARTS + 0].length= (uint) (log_pos - + log_data); + + log_array[TRANSLOG_INTERNAL_PARTS + 1].str= ((char*) buff + + info->s->keypage_header); + log_array[TRANSLOG_INTERNAL_PARTS + 1].length= data_changed_first; + DBUG_RETURN(translog_write_record(&lsn, LOGREC_REDO_INDEX, + info->trn, info, + log_array[TRANSLOG_INTERNAL_PARTS + + 0].length + data_changed_first, + TRANSLOG_INTERNAL_PARTS + 2, + log_array, log_data, NULL)); +} +#endif diff --git a/storage/maria/maria_chk.c b/storage/maria/maria_chk.c index 60b83caa2f1..d36b440541b 100644 --- a/storage/maria/maria_chk.c +++ b/storage/maria/maria_chk.c @@ -66,7 +66,7 @@ static const char *field_pack[]= }; static const char *record_formats[]= -{ +{ "Fixed length", "Packed", "Compressed", "Block", "?" }; @@ -1255,6 +1255,8 @@ static void descript(HA_CHECK *param, register MARIA_HA *info, char *name) printf("\nMARIA file: %s\n",name); printf("Record format: %s\n", record_formats[share->data_file_type]); + printf("Crashsafe: %s\n", + share->base.born_transactional ? 
"yes" : "no"); printf("Character set: %s (%d)\n", get_charset_name(share->state.header.language), share->state.header.language); diff --git a/storage/maria/maria_def.h b/storage/maria/maria_def.h index 5351ffdab16..98f076970d4 100644 --- a/storage/maria/maria_def.h +++ b/storage/maria/maria_def.h @@ -289,10 +289,14 @@ typedef struct st_maria_share /* Compare a row in memory with a row on disk */ my_bool (*compare_unique)(MARIA_HA *, MARIA_UNIQUEDEF *, const uchar *record, MARIA_RECORD_POS pos); + my_off_t (*keypos_to_recpos)(MARIA_HA *info, my_off_t pos); + my_off_t (*recpos_to_keypos)(MARIA_HA *info, my_off_t pos); + /* Mapings to read/write the data file */ size_t (*file_read)(MARIA_HA *, uchar *, size_t, my_off_t, myf); size_t (*file_write)(MARIA_HA *, const uchar *, size_t, my_off_t, myf); invalidator_by_filename invalidator; /* query cache invalidator */ + my_off_t current_key_del; /* delete links for index pages */ ulong this_process; /* processid */ ulong last_process; /* For table-change-check */ ulong last_version; /* Version on start */ @@ -333,9 +337,11 @@ typedef struct st_maria_share (FALSE, TRUE) is impossible. 
*/ my_bool now_transactional; + my_bool used_key_del; /* != 0 if key_del is locked */ #ifdef THREAD THR_LOCK lock; pthread_mutex_t intern_lock; /* Locking for use with _locking */ + pthread_cond_t intern_cond; rw_lock_t *key_root_lock; #endif my_off_t mmaped_length; @@ -382,6 +388,7 @@ typedef struct st_maria_row MARIA_RECORD_POS lastpos, nextpos; MARIA_RECORD_POS *tail_positions; ha_checksum checksum; + LSN orig_undo_lsn; /* Lsn at start of row insert */ uchar *empty_bits, *field_lengths; uint *null_field_lengths; /* All null field lengths */ ulong *blob_lengths; /* Length for each blob */ @@ -464,7 +471,7 @@ struct st_maria_handler enum ha_rkey_function last_key_func; /* CONTAIN, OVERLAP, etc */ uint save_lastkey_length; uint pack_key_length; /* For MARIAMRG */ - uint16 last_used_keyseg; /* For MARIAMRG */ + myf lock_wait; /* is 0 or MY_DONT_WAIT */ int errkey; /* Got last error on this key */ int lock_type; /* How database was locked */ int tmp_lock_type; /* When locked by readinfo */ @@ -472,10 +479,12 @@ struct st_maria_handler uint save_update; /* When using KEY_READ */ int save_lastinx; uint preload_buff_size; /* When preloading indexes */ - myf lock_wait; /* is 0 or MY_DONT_WAIT */ + uint16 last_used_keyseg; /* For MARIAMRG */ + uint8 used_key_del; /* != 0 if key_del is used */ my_bool was_locked; /* Was locked in panic */ my_bool append_insert_at_end; /* Set if concurrent insert */ my_bool quick_mode; + /* Marker if key_del_changed */ /* If info->keyread_buff can't be used for rnext */ my_bool page_changed; /* If info->keyread_buff has to be re-read for rnext */ @@ -625,6 +634,9 @@ struct st_maria_handler #define MARIA_MIN_ROWS_TO_DISABLE_INDEXES 100 #define MARIA_MIN_ROWS_TO_USE_WRITE_CACHE 10 +/* Marker for impossible delete link */ +#define IMPOSSIBLE_PAGE_NO LL(0xFFFFFFFFFF) + /* The UNIQUE check is done with a hashed long key */ #define MARIA_UNIQUE_HASH_TYPE HA_KEYTYPE_ULONG_INT @@ -654,10 +666,13 @@ extern my_bool maria_inited; /* This is used by 
_ma_calc_xxx_key_length och _ma_store_key */ typedef struct st_maria_s_param { - uint ref_length, key_length, n_ref_length; - uint n_length, totlength, part_of_prev_key, prev_length, pack_marker; const uchar *key; uchar *prev_key, *next_key_pos; + uchar *key_pos; /* For balance page */ + uint ref_length, key_length, n_ref_length; + uint n_length, totlength, part_of_prev_key, prev_length, pack_marker; + uint changed_length; + int move_length; /* For balance_page */ bool store_not_null; } MARIA_KEY_PARAM; @@ -666,7 +681,8 @@ typedef struct st_maria_s_param typedef struct st_pinned_page { PAGECACHE_BLOCK_LINK *link; - enum pagecache_page_lock unlock; + enum pagecache_page_lock unlock, write_lock; + my_bool changed; } MARIA_PINNED_PAGE; @@ -692,15 +708,16 @@ extern my_bool _ma_delete_static_record(MARIA_HA *info, const uchar *record); extern my_bool _ma_cmp_static_record(MARIA_HA *info, const uchar *record); extern int _ma_ck_write(MARIA_HA *info, uint keynr, uchar *key, uint length); +extern int _ma_enlarge_root(MARIA_HA *info, MARIA_KEYDEF *keyinfo, + const uchar *key, MARIA_RECORD_POS *root); +extern int _ma_insert(register MARIA_HA *info, register MARIA_KEYDEF *keyinfo, + uchar *key, uchar *anc_buff, uchar *key_pos, + my_off_t anc_page, uchar *key_buff, my_off_t father_page, + uchar *father_buff, MARIA_PINNED_PAGE *father_page_link, + uchar *father_key_pos, my_bool insert_last); extern int _ma_ck_real_write_btree(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uchar *key, uint key_length, MARIA_RECORD_POS *root, uint comp_flag); -extern int _ma_enlarge_root(MARIA_HA *info, MARIA_KEYDEF *keyinfo, - uchar *key, MARIA_RECORD_POS *root); -extern int _ma_insert(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uchar *key, - uchar *anc_buff, uchar *key_pos, uchar *key_buff, - uchar *father_buff, uchar *father_keypos, - my_off_t father_page, my_bool insert_last); extern int _ma_split_page(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uchar *key, uchar *buff, uchar *key_buff, my_bool insert_last); 
@@ -709,6 +726,9 @@ extern uchar *_ma_find_half_pos(MARIA_HA *info, uint nod_flag, uchar *page, uchar *key, uint *return_key_length, uchar ** after_key); +extern my_bool write_hook_for_undo_key(enum translog_record_type type, + TRN *trn, MARIA_HA *tbl_info, + LSN *lsn, void *hook_arg); extern int _ma_calc_static_key_length(MARIA_KEYDEF *keyinfo, uint nod_flag, uchar *key_pos, uchar *org_key, uchar *key_buff, const uchar *key, @@ -727,19 +747,22 @@ extern int _ma_calc_bin_pack_key_length(MARIA_KEYDEF *keyinfo, uchar *org_key, uchar *prev_key, const uchar *key, MARIA_KEY_PARAM *s_temp); -void _ma_store_static_key(MARIA_KEYDEF *keyinfo, uchar *key_pos, - MARIA_KEY_PARAM *s_temp); -void _ma_store_var_pack_key(MARIA_KEYDEF *keyinfo, uchar *key_pos, - MARIA_KEY_PARAM *s_temp); +extern void _ma_store_static_key(MARIA_KEYDEF *keyinfo, uchar *key_pos, + MARIA_KEY_PARAM *s_temp); +extern void _ma_store_var_pack_key(MARIA_KEYDEF *keyinfo, uchar *key_pos, + MARIA_KEY_PARAM *s_temp); #ifdef NOT_USED -void _ma_store_pack_key(MARIA_KEYDEF *keyinfo, uchar *key_pos, - MARIA_KEY_PARAM *s_temp); +extern void _ma_store_pack_key(MARIA_KEYDEF *keyinfo, uchar *key_pos, + MARIA_KEY_PARAM *s_temp); #endif -void _ma_store_bin_pack_key(MARIA_KEYDEF *keyinfo, uchar *key_pos, - MARIA_KEY_PARAM *s_temp); +extern void _ma_store_bin_pack_key(MARIA_KEYDEF *keyinfo, uchar *key_pos, + MARIA_KEY_PARAM *s_temp); extern int _ma_ck_delete(MARIA_HA *info, uint keynr, uchar *key, uint key_length); +extern int _ma_ck_real_delete(register MARIA_HA *info, MARIA_KEYDEF *keyinfo, + uchar *key, uint key_length, + my_off_t *root); extern int _ma_readinfo(MARIA_HA *info, int lock_flag, int check_keybuffer); extern int _ma_writeinfo(MARIA_HA *info, uint options); extern int _ma_test_if_changed(MARIA_HA *info); @@ -749,22 +772,22 @@ extern int _ma_check_index(MARIA_HA *info, int inx); extern int _ma_search(MARIA_HA *info, MARIA_KEYDEF *keyinfo, uchar *key, uint key_len, uint nextflag, my_off_t pos); extern int 
_ma_bin_search(MARIA_HA *info, MARIA_KEYDEF *keyinfo, - uchar *page, uchar *key, uint key_len, + uchar *page, const uchar *key, uint key_len, uint comp_flag, uchar **ret_pos, uchar *buff, my_bool *was_last_key); extern int _ma_seq_search(MARIA_HA *info, MARIA_KEYDEF *keyinfo, - uchar *page, uchar *key, uint key_len, + uchar *page, const uchar *key, uint key_len, uint comp_flag, uchar ** ret_pos, uchar *buff, my_bool *was_last_key); extern int _ma_prefix_search(MARIA_HA *info, MARIA_KEYDEF *keyinfo, - uchar *page, uchar *key, uint key_len, + uchar *page, const uchar *key, uint key_len, uint comp_flag, uchar ** ret_pos, uchar *buff, my_bool *was_last_key); extern my_off_t _ma_kpos(uint nod_flag, uchar *after_key); extern void _ma_kpointer(MARIA_HA *info, uchar *buff, my_off_t pos); extern MARIA_RECORD_POS _ma_dpos(MARIA_HA *info, uint nod_flag, const uchar *after_key); -extern MARIA_RECORD_POS _ma_rec_pos(MARIA_SHARE *info, uchar *ptr); +extern MARIA_RECORD_POS _ma_rec_pos(MARIA_HA *info, uchar *ptr); extern void _ma_dpointer(MARIA_HA *info, uchar *buff, MARIA_RECORD_POS pos); extern uint _ma_get_static_key(MARIA_KEYDEF *keyinfo, uint nod_flag, uchar **page, uchar *key); @@ -789,14 +812,22 @@ extern int _ma_search_first(MARIA_HA *info, MARIA_KEYDEF *keyinfo, my_off_t pos); extern int _ma_search_last(MARIA_HA *info, MARIA_KEYDEF *keyinfo, my_off_t pos); +extern my_off_t _ma_static_keypos_to_recpos(MARIA_HA *info, my_off_t pos); +extern my_off_t _ma_static_recpos_to_keypos(MARIA_HA *info, my_off_t pos); +extern my_off_t _ma_transparent_recpos(MARIA_HA *info, my_off_t pos); +extern my_off_t _ma_transaction_keypos_to_recpos(MARIA_HA *info, my_off_t pos); +extern my_off_t _ma_transaction_recpos_to_keypos(MARIA_HA *info, my_off_t pos); + extern uchar *_ma_fetch_keypage(MARIA_HA *info, MARIA_KEYDEF *keyinfo, - my_off_t page, int level, uchar *buff, - int return_buffer); + my_off_t page, enum pagecache_page_lock lock, + int level, uchar *buff, int return_buffer, + 
MARIA_PINNED_PAGE **page_link); extern int _ma_write_keypage(MARIA_HA *info, MARIA_KEYDEF *keyinfo, - my_off_t page, int level, uchar *buff); -extern int _ma_dispose(MARIA_HA *info, MARIA_KEYDEF *keyinfo, my_off_t pos, - int level); -extern my_off_t _ma_new(MARIA_HA *info, MARIA_KEYDEF *keyinfo, int level); + my_off_t page, enum pagecache_page_lock lock, + int level, uchar *buff); +extern int _ma_dispose(MARIA_HA *info, my_off_t pos, my_bool page_not_read); +extern my_off_t _ma_new(register MARIA_HA *info, int level, + MARIA_PINNED_PAGE **page_link); extern uint _ma_make_key(MARIA_HA *info, uint keynr, uchar *key, const uchar *record, MARIA_RECORD_POS filepos); extern uint _ma_pack_key(MARIA_HA *info, uint keynr, uchar *key, @@ -881,6 +912,21 @@ typedef struct st_maria_block_info #define SORT_BUFFER_INIT (2048L*1024L-MALLOC_OVERHEAD) #define MIN_SORT_BUFFER (4096-MALLOC_OVERHEAD) +/* Struct for clr_end */ + +struct st_msg_to_write_hook_for_clr_end +{ + LSN previous_undo_lsn; + enum translog_record_type undone_record_type; + ha_checksum checksum_delta; +}; + +struct st_msg_to_write_hook_for_undo_key +{ + my_off_t *root; + my_off_t value; +}; + #define fast_ma_writeinfo(INFO) if (!(INFO)->s->tot_locks) (void) _ma_writeinfo((INFO),0) #define fast_ma_readinfo(INFO) ((INFO)->lock_type == F_UNLCK) && _ma_readinfo((INFO),F_RDLCK,1) @@ -987,7 +1033,6 @@ int _ma_update_create_rename_lsn(MARIA_SHARE *share, int _ma_update_create_rename_lsn_sub(MARIA_SHARE *share, LSN lsn, my_bool do_sync); -void _ma_unpin_all_pages(MARIA_HA *info, LSN undo_lsn); #define _ma_tmp_disable_logging_for_table(S) \ { (S)->now_transactional= FALSE; (S)->page_type= PAGECACHE_PLAIN_PAGE; } #define _ma_reenable_logging_for_table(S) \ diff --git a/storage/maria/unittest/ma_pagecache_single.c b/storage/maria/unittest/ma_pagecache_single.c index 8add95e8a36..48f088aed7d 100644 --- a/storage/maria/unittest/ma_pagecache_single.c +++ b/storage/maria/unittest/ma_pagecache_single.c @@ -236,7 +236,7 @@ int 
simple_pin_test() 0, PAGECACHE_LOCK_READ_UNLOCK, PAGECACHE_UNPIN, - 0, 0); + 0, 0, 0); if (flush_pagecache_blocks(&pagecache, &file1, FLUSH_FORCE_WRITE)) { diag("Got error in flush_pagecache_blocks\n"); diff --git a/storage/myisam/mi_check.c b/storage/myisam/mi_check.c index b41f06a5fb8..152ffd3bb55 100644 --- a/storage/myisam/mi_check.c +++ b/storage/myisam/mi_check.c @@ -1008,7 +1008,7 @@ int chk_data_link(HA_CHECK *param, MI_INFO *info,int extend) del_length+=info->s->base.pack_reclength; continue; /* Record removed */ } - param->glob_crc+= mi_static_checksum(info,record); + param->glob_crc+= (*info->s->calc_check_checksum)(info,record); used+=info->s->base.pack_reclength; break; case DYNAMIC_RECORD: @@ -1162,7 +1162,7 @@ int chk_data_link(HA_CHECK *param, MI_INFO *info,int extend) } else { - info->checksum=mi_checksum(info,record); + info->checksum= (*info->s->calc_check_checksum)(info,record); if (param->testflag & (T_EXTEND | T_MEDIUM | T_VERBOSE)) { if (_mi_rec_check(info,record, info->rec_buff,block_info.rec_len, @@ -1208,10 +1208,7 @@ int chk_data_link(HA_CHECK *param, MI_INFO *info,int extend) llstr(start_recpos,llbuff)); got_error=1; } - if (static_row_size) - param->glob_crc+= mi_static_checksum(info,record); - else - param->glob_crc+= mi_checksum(info,record); + param->glob_crc+= (*info->s->calc_check_checksum)(info,record); link_used+= (block_info.filepos - start_recpos); used+= (pos-start_recpos); } /* switch */ @@ -3164,7 +3161,9 @@ static int sort_get_next_record(MI_SORT_PARAM *sort_param) { if (sort_param->calc_checksum) param->glob_crc+= (info->checksum= - mi_static_checksum(info,sort_param->record)); + (*info->s->calc_check_checksum)(info, + sort_param-> + record)); DBUG_RETURN(0); } if (!sort_param->fix_datafile && sort_param->master) @@ -3440,7 +3439,8 @@ static int sort_get_next_record(MI_SORT_PARAM *sort_param) if (sort_param->read_cache.error < 0) DBUG_RETURN(1); if (sort_param->calc_checksum) - info->checksum= mi_checksum(info, 
sort_param->record); + info->checksum= (*info->s->calc_check_checksum)(info, + sort_param->record); if ((param->testflag & (T_EXTEND | T_REP)) || searching) { if (_mi_rec_check(info, sort_param->record, sort_param->rec_buff, @@ -3525,7 +3525,9 @@ static int sort_get_next_record(MI_SORT_PARAM *sort_param) info->packed_length=block_info.rec_len; if (sort_param->calc_checksum) param->glob_crc+= (info->checksum= - mi_checksum(info, sort_param->record)); + (*info->s->calc_check_checksum)(info, + sort_param-> + record)); DBUG_RETURN(0); } } @@ -3576,7 +3578,6 @@ int sort_write_record(MI_SORT_PARAM *sort_param) } sort_param->filepos+=share->base.pack_reclength; info->s->state.split++; - /* sort_info->param->glob_crc+=mi_static_checksum(info, sort_param->record); */ break; case DYNAMIC_RECORD: if (! info->blobs) @@ -3599,10 +3600,9 @@ int sort_write_record(MI_SORT_PARAM *sort_param) from= sort_info->buff+ALIGN_SIZE(MI_MAX_DYN_BLOCK_HEADER); } /* We can use info->checksum here as only one thread calls this. 
*/ - info->checksum=mi_checksum(info,sort_param->record); + info->checksum= (*info->s->calc_check_checksum)(info,sort_param->record); reclength=_mi_rec_pack(info,from,sort_param->record); flag=0; - /* sort_info->param->glob_crc+=info->checksum; */ do { diff --git a/storage/myisam/mi_checksum.c b/storage/myisam/mi_checksum.c index 1aa56e571e3..8c408ef7ff5 100644 --- a/storage/myisam/mi_checksum.c +++ b/storage/myisam/mi_checksum.c @@ -19,27 +19,34 @@ ha_checksum mi_checksum(MI_INFO *info, const uchar *buf) { - uint i; ha_checksum crc=0; - MI_COLUMNDEF *rec=info->s->rec; + const uchar *record= buf; + MI_COLUMNDEF *column= info->s->rec; + MI_COLUMNDEF *column_end= column+ info->s->base.fields; + my_bool skip_null_bits= test(info->s->options & HA_OPTION_NULL_FIELDS); - for (i=info->s->base.fields ; i-- ; buf+=(rec++)->length) + for ( ; column != column_end ; buf+= column++->length) { const uchar *pos; ulong length; - switch (rec->type) { + + if ((record[column->null_pos] & column->null_bit) && + skip_null_bits) + continue; /* Null field */ + + switch (column->type) { case FIELD_BLOB: { - length=_mi_calc_blob_length(rec->length- - portable_sizeof_char_ptr, - buf); - memcpy((char*) &pos, buf+rec->length- portable_sizeof_char_ptr, + length=_mi_calc_blob_length(column->length- + portable_sizeof_char_ptr, + buf); + memcpy((char*) &pos, buf+column->length- portable_sizeof_char_ptr, sizeof(char*)); break; } case FIELD_VARCHAR: { - uint pack_length= HA_VARCHAR_PACKLENGTH(rec->length-1); + uint pack_length= HA_VARCHAR_PACKLENGTH(column->length-1); if (pack_length == 1) length= (ulong) *(uchar*) buf; else @@ -48,7 +55,7 @@ ha_checksum mi_checksum(MI_INFO *info, const uchar *buf) break; } default: - length=rec->length; + length=column->length; pos=buf; break; } diff --git a/storage/myisam/mi_create.c b/storage/myisam/mi_create.c index fc5b31e7689..38e518fd823 100644 --- a/storage/myisam/mi_create.c +++ b/storage/myisam/mi_create.c @@ -108,6 +108,9 @@ int mi_create(const char 
*name,uint keys,MI_KEYDEF *keydefs, rec++,fields++) { reclength+=rec->length; + if (rec->null_bit) + options|= HA_OPTION_NULL_FIELDS; + if ((type=(enum en_fieldtype) rec->type) != FIELD_NORMAL && type != FIELD_CHECK) { @@ -142,6 +145,7 @@ int mi_create(const char *name,uint keys,MI_KEYDEF *keydefs, long_varchar_count++; pack_reclength+= 2; /* May be packed on 3 bytes */ } + options|= HA_OPTION_NULL_FIELDS; /* Use of mi_checksum() */ } else if (type != FIELD_SKIP_ZERO) { diff --git a/storage/myisam/mi_open.c b/storage/myisam/mi_open.c index b0cc2e54ca7..5ce8ec0275a 100644 --- a/storage/myisam/mi_open.c +++ b/storage/myisam/mi_open.c @@ -144,7 +144,7 @@ MI_INFO *mi_open(const char *name, int mode, uint open_flags) HA_OPTION_COMPRESS_RECORD | HA_OPTION_READ_ONLY_DATA | HA_OPTION_TEMP_COMPRESS_RECORD | HA_OPTION_CHECKSUM | HA_OPTION_TMP_TABLE | HA_OPTION_DELAY_KEY_WRITE | - HA_OPTION_RELIES_ON_SQL_LAYER)) + HA_OPTION_RELIES_ON_SQL_LAYER | HA_OPTION_NULL_FIELDS)) { DBUG_PRINT("error",("wrong options: 0x%lx", share->options)); my_errno=HA_ERR_OLD_FILE; @@ -737,12 +737,14 @@ void mi_setup_functions(register MYISAM_SHARE *share) { share->read_record=_mi_read_pack_record; share->read_rnd=_mi_read_rnd_pack_record; - if (!(share->options & HA_OPTION_TEMP_COMPRESS_RECORD)) - share->calc_checksum=0; /* No checksum */ - else if (share->options & HA_OPTION_PACK_RECORD) + if ((share->options & + (HA_OPTION_PACK_RECORD | HA_OPTION_NULL_FIELDS))) share->calc_checksum= mi_checksum; else share->calc_checksum= mi_static_checksum; + share->calc_check_checksum= share->calc_checksum; + if (!(share->options & HA_OPTION_TEMP_COMPRESS_RECORD)) + share->calc_checksum=0; /* No checksum */ } else if (share->options & HA_OPTION_PACK_RECORD) { @@ -752,6 +754,7 @@ void mi_setup_functions(register MYISAM_SHARE *share) share->compare_record=_mi_cmp_dynamic_record; share->compare_unique=_mi_cmp_dynamic_unique; share->calc_checksum= mi_checksum; + share->calc_check_checksum= share->calc_checksum; /* 
add bits used to pack data to pack_reclength for faster allocation */ share->base.pack_reclength+= share->base.pack_bits; @@ -775,7 +778,11 @@ void mi_setup_functions(register MYISAM_SHARE *share) share->update_record=_mi_update_static_record; share->write_record=_mi_write_static_record; share->compare_unique=_mi_cmp_static_unique; - share->calc_checksum= mi_static_checksum; + if (share->options & HA_OPTION_NULL_FIELDS) + share->calc_checksum= mi_checksum; + else + share->calc_checksum= mi_static_checksum; + share->calc_check_checksum= share->calc_checksum; } share->file_read= mi_nommap_pread; share->file_write= mi_nommap_pwrite; diff --git a/storage/myisam/mi_test2.c b/storage/myisam/mi_test2.c index fd8adeed1c5..6fb71feb1e7 100644 --- a/storage/myisam/mi_test2.c +++ b/storage/myisam/mi_test2.c @@ -657,10 +657,10 @@ int main(int argc, char *argv[]) sprintf((char*) key2,"%6d",k); min_key.key= key; - min_key.length= USE_WHOLE_KEY; + min_key.keypart_map= HA_WHOLE_KEY; min_key.flag= HA_READ_AFTER_KEY; max_key.key= key2; - max_key.length= USE_WHOLE_KEY; + max_key.keypart_map= HA_WHOLE_KEY; max_key.flag= HA_READ_BEFORE_KEY; range_records= mi_records_in_range(file, 0, &min_key, &max_key); records=0; diff --git a/storage/myisam/myisamdef.h b/storage/myisam/myisamdef.h index fde80172643..59d54bdc542 100644 --- a/storage/myisam/myisamdef.h +++ b/storage/myisam/myisamdef.h @@ -182,7 +182,9 @@ typedef struct st_mi_isam_share int(*delete_record) (struct st_myisam_info *); int(*read_rnd) (struct st_myisam_info *, uchar*, my_off_t, my_bool); int(*compare_record) (struct st_myisam_info *, const uchar*); - ha_checksum(*calc_checksum) (struct st_myisam_info *, const uchar*); + ha_checksum(*calc_checksum) (struct st_myisam_info *, const uchar*); + /* calculate checksum for a row during check table */ + ha_checksum(*calc_check_checksum)(struct st_myisam_info *, const uchar *); int(*compare_unique) (struct st_myisam_info *, MI_UNIQUEDEF *, const uchar *record, my_off_t pos); size_t 
(*file_read) (MI_INFO *, uchar *, size_t, my_off_t, myf); @@ -518,8 +520,6 @@ extern void _mi_kpointer(MI_INFO *info, uchar *buff, my_off_t pos); extern my_off_t _mi_dpos(MI_INFO *info, uint nod_flag, uchar *after_key); extern my_off_t _mi_rec_pos(MYISAM_SHARE *info, uchar *ptr); extern void _mi_dpointer(MI_INFO *info, uchar *buff, my_off_t pos); -extern int ha_key_cmp(HA_KEYSEG *keyseg, uchar *a, uchar *b, - uint key_length, uint nextflag, uint *diff_length); extern uint _mi_get_static_key(MI_KEYDEF *keyinfo, uint nod_flag, uchar **page, uchar *key); extern uint _mi_get_pack_key(MI_KEYDEF *keyinfo, uint nod_flag, uchar **page, |