Diffstat (limited to 'storage/innobase/btr/btr0cur.cc')
-rw-r--r--  storage/innobase/btr/btr0cur.cc  638
1 file changed, 461 insertions, 177 deletions
diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc
index 8febcb76a3b..f4cbb4e51a3 100644
--- a/storage/innobase/btr/btr0cur.cc
+++ b/storage/innobase/btr/btr0cur.cc
@@ -3,7 +3,7 @@
Copyright (c) 1994, 2018, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Copyright (c) 2012, Facebook Inc.
-Copyright (c) 2015, 2018, MariaDB Corporation.
+Copyright (c) 2015, 2019, MariaDB Corporation.
Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
@@ -217,7 +217,7 @@ btr_rec_free_externally_stored_fields(
btr_latch_leaves_t
btr_cur_latch_leaves(
buf_block_t* block,
- const page_id_t& page_id,
+ const page_id_t page_id,
const page_size_t& page_size,
ulint latch_mode,
btr_cur_t* cursor,
@@ -392,27 +392,32 @@ when loading a table definition.
@return error code
@retval DB_SUCCESS if no error occurred
@retval DB_CORRUPTION if any corruption was noticed */
-static
-dberr_t
-btr_cur_instant_init_low(dict_index_t* index, mtr_t* mtr)
+static dberr_t btr_cur_instant_init_low(dict_index_t* index, mtr_t* mtr)
{
ut_ad(index->is_primary());
ut_ad(index->n_core_null_bytes == dict_index_t::NO_CORE_NULL_BYTES);
ut_ad(index->table->supports_instant());
ut_ad(index->table->is_readable());
- page_t* root = btr_root_get(index, mtr);
-
- if (!root || btr_cur_instant_root_init(index, root)) {
+ const fil_space_t* space = index->table->space;
+ if (!space) {
+unreadable:
ib::error() << "Table " << index->table->name
<< " has an unreadable root page";
index->table->corrupted = true;
return DB_CORRUPTION;
}
+ page_t* root = btr_root_get(index, mtr);
+
+ if (!root || btr_cur_instant_root_init(index, root)) {
+ goto unreadable;
+ }
+
ut_ad(index->n_core_null_bytes != dict_index_t::NO_CORE_NULL_BYTES);
- if (!index->is_instant()) {
+ if (fil_page_get_type(root) == FIL_PAGE_INDEX) {
+ ut_ad(!index->is_instant());
return DB_SUCCESS;
}
@@ -430,29 +435,38 @@ btr_cur_instant_init_low(dict_index_t* index, mtr_t* mtr)
page_cur_move_to_next(&cur.page_cur);
const rec_t* rec = cur.page_cur.rec;
+ const ulint comp = dict_table_is_comp(index->table);
+ const ulint info_bits = rec_get_info_bits(rec, comp);
+
+ if (page_rec_is_supremum(rec)
+ || !(info_bits & REC_INFO_MIN_REC_FLAG)) {
+ if (!index->is_instant()) {
+ /* The FIL_PAGE_TYPE_INSTANT and PAGE_INSTANT may be
+ assigned even if instant ADD COLUMN was not
+ committed. Changes to these page header fields are not
+ undo-logged, but changes to the hidden metadata record
+ are. If the server is killed and restarted, the page
+ header fields could remain set even though no metadata
+ record is present. */
+ return DB_SUCCESS;
+ }
- if (page_rec_is_supremum(rec) || !rec_is_default_row(rec, index)) {
ib::error() << "Table " << index->table->name
<< " is missing instant ALTER metadata";
index->table->corrupted = true;
return DB_CORRUPTION;
}
- if (dict_table_is_comp(index->table)) {
- if (rec_get_info_bits(rec, true) != REC_INFO_MIN_REC_FLAG
- && rec_get_status(rec) != REC_STATUS_COLUMNS_ADDED) {
+ if ((info_bits & ~REC_INFO_DELETED_FLAG) != REC_INFO_MIN_REC_FLAG
+ || (comp && rec_get_status(rec) != REC_STATUS_INSTANT)) {
incompatible:
- ib::error() << "Table " << index->table->name
- << " contains unrecognizable "
- "instant ALTER metadata";
- index->table->corrupted = true;
- return DB_CORRUPTION;
- }
- } else if (rec_get_info_bits(rec, false) != REC_INFO_MIN_REC_FLAG) {
- goto incompatible;
+ ib::error() << "Table " << index->table->name
+ << " contains unrecognizable instant ALTER metadata";
+ index->table->corrupted = true;
+ return DB_CORRUPTION;
}
- /* Read the 'default row'. We can get here on server restart
+ /* Read the metadata. We can get here on server restart
or when the table was evicted from the data dictionary cache
and is now being accessed again.
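The info-bit checks in the hunk above are what tell the two metadata record flavours apart. A minimal standalone sketch of that classification, using stand-in constants with the same values and meaning as InnoDB's REC_INFO_MIN_REC_FLAG and REC_INFO_DELETED_FLAG (the helper name is illustrative, not the real rec_is_metadata()):

#include <cstdint>

/* Stand-ins for the InnoDB info-bit masks (assumed values). */
static const uint8_t MIN_REC_FLAG = 0x10;	/* REC_INFO_MIN_REC_FLAG */
static const uint8_t DELETED_FLAG = 0x20;	/* REC_INFO_DELETED_FLAG */

enum metadata_kind { NOT_METADATA, ADD_METADATA, ALTER_METADATA };

/* The metadata record carries the minimum-record flag. If the
delete-mark flag is also set, it is the generic instant ALTER flavour,
which owns a BLOB describing dropped or reordered columns; otherwise
it is the plain instant ADD COLUMN flavour. */
static metadata_kind classify(uint8_t info_bits)
{
	if (!(info_bits & MIN_REC_FLAG)) {
		return NOT_METADATA;
	}
	return (info_bits & DELETED_FLAG) ? ALTER_METADATA : ADD_METADATA;
}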
@@ -462,6 +476,72 @@ incompatible:
concurrent operations on the table, including table eviction
from the cache. */
+ if (info_bits & REC_INFO_DELETED_FLAG) {
+ /* This metadata record includes a BLOB that identifies
+ any dropped or reordered columns. */
+ ulint trx_id_offset = index->trx_id_offset;
+ if (!trx_id_offset) {
+ /* The PRIMARY KEY contains variable-length columns.
+ For the metadata record, variable-length columns are
+ always written with zero length. The DB_TRX_ID will
+ start right after any fixed-length columns. */
+ for (uint i = index->n_uniq; i--; ) {
+ trx_id_offset += index->fields[i].fixed_len;
+ }
+ }
+
+ const byte* ptr = rec + trx_id_offset
+ + (DATA_TRX_ID_LEN + DATA_ROLL_PTR_LEN);
+
+ if (mach_read_from_4(ptr + BTR_EXTERN_LEN)) {
+ goto incompatible;
+ }
+
+ uint len = mach_read_from_4(ptr + BTR_EXTERN_LEN + 4);
+ if (!len
+ || mach_read_from_4(ptr + BTR_EXTERN_OFFSET)
+ != FIL_PAGE_DATA
+ || mach_read_from_4(ptr + BTR_EXTERN_SPACE_ID)
+ != space->id) {
+ goto incompatible;
+ }
+
+ buf_block_t* block = buf_page_get(
+ page_id_t(space->id,
+ mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO)),
+ univ_page_size, RW_S_LATCH, mtr);
+ buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
+ if (fil_page_get_type(block->frame) != FIL_PAGE_TYPE_BLOB
+ || mach_read_from_4(&block->frame[FIL_PAGE_DATA
+ + BTR_BLOB_HDR_NEXT_PAGE_NO])
+ != FIL_NULL
+ || mach_read_from_4(&block->frame[FIL_PAGE_DATA
+ + BTR_BLOB_HDR_PART_LEN])
+ != len) {
+ goto incompatible;
+ }
+
+ /* The unused part of the BLOB page should be zero-filled. */
+ for (const byte* b = block->frame
+ + (FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE) + len,
+ * const end = block->frame + srv_page_size
+ - BTR_EXTERN_LEN;
+ b < end; ) {
+ if (*b++) {
+ goto incompatible;
+ }
+ }
+
+ if (index->table->deserialise_columns(
+ &block->frame[FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE],
+ len)) {
+ goto incompatible;
+ }
+
+ /* Proceed to initialize the default values of
+ any instantly added columns. */
+ }
+
mem_heap_t* heap = NULL;
ulint* offsets = rec_get_offsets(rec, index, NULL, true,
ULINT_UNDEFINED, &heap);
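The branch above walks a 20-byte externally stored field reference and then the header of the single BLOB page it points to. A self-contained sketch of the field reference layout those checks assume, with the conventional BTR_EXTERN_* offsets written out explicitly (the offsets and the struct/function names are assumptions of this sketch, not taken from the patch):

#include <cstddef>
#include <cstdint>

/* Big-endian 4-byte read, as mach_read_from_4() would do. */
static uint32_t read_be32(const unsigned char* p)
{
	return uint32_t(p[0]) << 24 | uint32_t(p[1]) << 16
		| uint32_t(p[2]) << 8 | uint32_t(p[3]);
}

/* Assumed layout of the 20-byte reference:
   bytes  0..3   tablespace id          (BTR_EXTERN_SPACE_ID)
   bytes  4..7   first BLOB page number (BTR_EXTERN_PAGE_NO)
   bytes  8..11  byte offset in the page (BTR_EXTERN_OFFSET)
   bytes 12..19  stored length; the high 4 bytes also carry the
                 owner/inherited flags and must be 0 in this context. */
struct extern_ref {
	uint32_t space_id, page_no, offset, len_hi, len_lo;
};

static extern_ref parse_extern_ref(const unsigned char* ptr)
{
	extern_ref r;
	r.space_id = read_be32(ptr);
	r.page_no  = read_be32(ptr + 4);
	r.offset   = read_be32(ptr + 8);
	r.len_hi   = read_be32(ptr + 12);	/* must be 0 here */
	r.len_lo   = read_be32(ptr + 16);	/* BLOB length in bytes */
	return r;
}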
@@ -471,11 +551,12 @@ inconsistent:
goto incompatible;
}
- /* In fact, because we only ever append fields to the 'default
- value' record, it is also OK to perform READ UNCOMMITTED and
+ /* In fact, because we only ever append fields to the metadata
+ record, it is also OK to perform READ UNCOMMITTED and
then ignore any extra fields, provided that
trx_sys.is_registered(DB_TRX_ID). */
- if (rec_offs_n_fields(offsets) > index->n_fields
+ if (rec_offs_n_fields(offsets)
+ > ulint(index->n_fields) + !!index->table->instant
&& !trx_sys.is_registered(current_trx(),
row_get_rec_trx_id(rec, index,
offsets))) {
@@ -483,10 +564,11 @@ inconsistent:
}
for (unsigned i = index->n_core_fields; i < index->n_fields; i++) {
- ulint len;
- const byte* data = rec_get_nth_field(rec, offsets, i, &len);
dict_col_t* col = index->fields[i].col;
- ut_ad(!col->is_instant());
+ const unsigned o = i + !!index->table->instant;
+ ulint len;
+ const byte* data = rec_get_nth_field(rec, offsets, o, &len);
+ ut_ad(!col->is_added());
ut_ad(!col->def_val.data);
col->def_val.len = len;
switch (len) {
@@ -497,7 +579,7 @@ inconsistent:
continue;
}
ut_ad(len != UNIV_SQL_DEFAULT);
- if (!rec_offs_nth_extern(offsets, i)) {
+ if (!rec_offs_nth_extern(offsets, o)) {
col->def_val.data = mem_heap_dup(
index->table->heap, data, len);
} else if (len < BTR_EXTERN_FIELD_REF_SIZE
@@ -509,7 +591,7 @@ inconsistent:
} else {
col->def_val.data = btr_copy_externally_stored_field(
&col->def_val.len, data,
- dict_table_page_size(index->table),
+ cur.page_cur.block->page.size,
len, index->table->heap);
}
}
@@ -541,8 +623,7 @@ index root page.
@param[in] index clustered index that is on its first access
@param[in] page clustered index root page
@return whether the page is corrupted */
-bool
-btr_cur_instant_root_init(dict_index_t* index, const page_t* page)
+bool btr_cur_instant_root_init(dict_index_t* index, const page_t* page)
{
ut_ad(page_is_root(page));
ut_ad(!page_is_comp(page) == !dict_table_is_comp(index->table));
@@ -573,23 +654,51 @@ btr_cur_instant_root_init(dict_index_t* index, const page_t* page)
break;
}
- uint16_t n = page_get_instant(page);
- if (n < index->n_uniq + DATA_ROLL_PTR || n > index->n_fields) {
+ const uint16_t n = page_get_instant(page);
+
+ if (n < index->n_uniq + DATA_ROLL_PTR) {
/* The PRIMARY KEY (or hidden DB_ROW_ID) and
DB_TRX_ID,DB_ROLL_PTR columns must always be present
- as 'core' fields. All fields, including those for
- instantly added columns, must be present in the data
- dictionary. */
+ as 'core' fields. */
return true;
}
+
+ if (n > REC_MAX_N_FIELDS) {
+ return true;
+ }
+
index->n_core_fields = n;
- ut_ad(!index->is_dummy);
- ut_d(index->is_dummy = true);
- index->n_core_null_bytes = n == index->n_fields
- ? UT_BITS_IN_BYTES(unsigned(index->n_nullable))
- : UT_BITS_IN_BYTES(index->get_n_nullable(n));
- ut_d(index->is_dummy = false);
- return false;
+
+ const rec_t* infimum = page_get_infimum_rec(page);
+ const rec_t* supremum = page_get_supremum_rec(page);
+
+ if (!memcmp(infimum, "infimum", 8)
+ && !memcmp(supremum, "supremum", 8)) {
+ if (n > index->n_fields) {
+ /* All fields, including those for instantly
+ added columns, must be present in the
+ data dictionary. */
+ return true;
+ }
+
+ ut_ad(!index->is_dummy);
+ ut_d(index->is_dummy = true);
+ index->n_core_null_bytes = UT_BITS_IN_BYTES(
+ index->get_n_nullable(n));
+ ut_d(index->is_dummy = false);
+ return false;
+ }
+
+ if (memcmp(infimum, field_ref_zero, 8)
+ || memcmp(supremum, field_ref_zero, 7)) {
+ /* The infimum and supremum records must either contain
+ the original strings, or they must be filled with zero
+ bytes, except for the bytes that we have repurposed. */
+ return true;
+ }
+
+ index->n_core_null_bytes = supremum[7];
+ return index->n_core_null_bytes > 128;
}
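In the non-repurposed branch above, the core null-byte count is simply the number of nullable core columns rounded up to whole bytes, which is also why the single repurposed supremum byte (capped at 128) is enough to store it. A tiny worked illustration, assuming UT_BITS_IN_BYTES() is the usual ceiling division by 8:

#include <cassert>

/* Assumed equivalent of UT_BITS_IN_BYTES(): bits rounded up to bytes. */
static unsigned bits_in_bytes(unsigned bits)
{
	return (bits + 7) / 8;
}

int main()
{
	/* 10 nullable core columns need a 2-byte null bitmap. */
	assert(bits_in_bytes(10) == 2);
	/* Even the maximum plausible number of nullable fields fits in
	128 bytes, matching the "n_core_null_bytes > 128" sanity check. */
	assert(bits_in_bytes(1023) == 128);
	return 0;
}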
/** Optimistically latches the leaf page or pages requested.
@@ -645,7 +754,7 @@ btr_cur_optimistic_latch_leaves(
if (left_page_no != FIL_NULL) {
cursor->left_block = btr_block_get(
- page_id_t(cursor->index->table->space->id,
+ page_id_t(cursor->index->table->space_id,
left_page_no),
page_size_t(cursor->index->table->space
->flags),
@@ -945,6 +1054,37 @@ static ulint btr_node_ptr_max_size(const dict_index_t* index)
field_max_size = dict_col_get_max_size(col);
if (UNIV_UNLIKELY(!field_max_size)) {
+ switch (col->mtype) {
+ case DATA_VARCHAR:
+ if (!comp
+ && (!strcmp(index->table->name.m_name,
+ "SYS_FOREIGN")
+ || !strcmp(index->table->name.m_name,
+ "SYS_FOREIGN_COLS"))) {
+ break;
+ }
+ /* fall through */
+ case DATA_VARMYSQL:
+ case DATA_CHAR:
+ case DATA_MYSQL:
+ /* CHAR(0) and VARCHAR(0) are possible
+ data type definitions in MariaDB.
+ The InnoDB internal SQL parser maps
+ CHAR to DATA_VARCHAR, so DATA_CHAR (or
+ DATA_MYSQL) is only coming from the
+ MariaDB SQL layer. */
+ if (comp) {
+ /* Add a length byte, because
+ fixed-length empty field are
+ encoded as variable-length.
+ For ROW_FORMAT=REDUNDANT,
+ these bytes were added to
+ rec_max_size before this loop. */
+ rec_max_size++;
+ }
+ continue;
+ }
+
/* SYS_FOREIGN.ID is defined as CHAR in the
InnoDB internal SQL parser, which translates
into the incorrect VARCHAR(0). InnoDB does
@@ -961,6 +1101,7 @@ static ulint btr_node_ptr_max_size(const dict_index_t* index)
|| !strcmp(index->table->name.m_name,
"SYS_FOREIGN_COLS"));
ut_ad(!comp);
+ ut_ad(col->mtype == DATA_VARCHAR);
rec_max_size += (srv_page_size == UNIV_PAGE_SIZE_MAX)
? REDUNDANT_REC_MAX_DATA_SIZE
@@ -1264,7 +1405,7 @@ btr_cur_search_to_nth_level_func(
Free blocks and read IO bandwidth should be prior
for them, when the history list is glowing huge. */
if (lock_intention == BTR_INTENTION_DELETE
- && trx_sys.history_size() > BTR_CUR_FINE_HISTORY_LENGTH
+ && trx_sys.rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
&& buf_get_n_pending_read_ios()) {
mtr_x_lock(dict_index_get_lock(index), mtr);
} else if (dict_index_is_spatial(index)
@@ -1324,7 +1465,7 @@ btr_cur_search_to_nth_level_func(
const page_size_t page_size(index->table->space->flags);
/* Start with the root page. */
- page_id_t page_id(index->table->space->id, index->page);
+ page_id_t page_id(index->table->space_id, index->page);
if (root_leaf_rw_latch == RW_X_LATCH) {
node_ptr_max_size = btr_node_ptr_max_size(index);
@@ -2270,10 +2411,10 @@ need_opposite_intention:
ut_ad(index->is_instant());
/* This may be a search tuple for
btr_pcur_restore_position(). */
- ut_ad(tuple->info_bits == REC_INFO_DEFAULT_ROW
- || tuple->info_bits == REC_INFO_MIN_REC_FLAG);
- } else if (rec_is_default_row(btr_cur_get_rec(cursor),
- index)) {
+ ut_ad(tuple->is_metadata()
+ || (tuple->is_metadata(tuple->info_bits
+ ^ REC_STATUS_INSTANT)));
+ } else if (rec_is_metadata(btr_cur_get_rec(cursor), *index)) {
/* Only user records belong in the adaptive
hash index. */
} else {
@@ -2400,7 +2541,7 @@ btr_cur_open_at_index_side_func(
Free blocks and read IO bandwidth should be prior
for them, when the history list is glowing huge. */
if (lock_intention == BTR_INTENTION_DELETE
- && trx_sys.history_size() > BTR_CUR_FINE_HISTORY_LENGTH
+ && trx_sys.rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
&& buf_get_n_pending_read_ios()) {
mtr_x_lock(dict_index_get_lock(index), mtr);
} else {
@@ -2432,7 +2573,7 @@ btr_cur_open_at_index_side_func(
page_cursor = btr_cur_get_page_cur(cursor);
cursor->index = index;
- page_id_t page_id(index->table->space->id, index->page);
+ page_id_t page_id(index->table->space_id, index->page);
const page_size_t page_size(index->table->space->flags);
if (root_leaf_rw_latch == RW_X_LATCH) {
@@ -2745,7 +2886,7 @@ btr_cur_open_at_rnd_pos_func(
Free blocks and read IO bandwidth should be prior
for them, when the history list is glowing huge. */
if (lock_intention == BTR_INTENTION_DELETE
- && trx_sys.history_size() > BTR_CUR_FINE_HISTORY_LENGTH
+ && trx_sys.rseg_history_len > BTR_CUR_FINE_HISTORY_LENGTH
&& buf_get_n_pending_read_ios()) {
mtr_x_lock(dict_index_get_lock(index), mtr);
} else {
@@ -2789,7 +2930,7 @@ btr_cur_open_at_rnd_pos_func(
page_cursor = btr_cur_get_page_cur(cursor);
cursor->index = index;
- page_id_t page_id(index->table->space->id, index->page);
+ page_id_t page_id(index->table->space_id, index->page);
const page_size_t page_size(index->table->space->flags);
dberr_t err = DB_SUCCESS;
@@ -3120,8 +3261,11 @@ btr_cur_ins_lock_and_undo(
roll_ptr = roll_ptr_t(1) << ROLL_PTR_INSERT_FLAG_POS;
if (!(flags & BTR_KEEP_SYS_FLAG)) {
upd_sys:
- row_upd_index_entry_sys_field(entry, index,
- DATA_ROLL_PTR, roll_ptr);
+ dfield_t* r = dtuple_get_nth_field(
+ entry, index->db_roll_ptr());
+ ut_ad(r->len == DATA_ROLL_PTR_LEN);
+ trx_write_roll_ptr(static_cast<byte*>(r->data),
+ roll_ptr);
}
} else {
err = trx_undo_report_row_operation(thr, index, entry,
@@ -3236,12 +3380,17 @@ btr_cur_optimistic_insert(
leaf = page_is_leaf(page);
+ if (UNIV_UNLIKELY(entry->is_alter_metadata())) {
+ ut_ad(leaf);
+ goto convert_big_rec;
+ }
+
/* Calculate the record size when entry is converted to a record */
rec_size = rec_get_converted_size(index, entry, n_ext);
if (page_zip_rec_needs_ext(rec_size, page_is_comp(page),
dtuple_get_n_fields(entry), page_size)) {
-
+convert_big_rec:
/* The record is so big that we have to store some fields
externally on separate database pages */
big_rec_vec = dtuple_convert_big_rec(index, 0, entry, &n_ext);
@@ -3412,7 +3561,7 @@ fail_err:
} else if (index->disable_ahi) {
# endif
} else if (entry->info_bits & REC_INFO_MIN_REC_FLAG) {
- ut_ad(entry->info_bits == REC_INFO_DEFAULT_ROW);
+ ut_ad(entry->is_metadata());
ut_ad(index->is_instant());
ut_ad(flags == BTR_NO_LOCKING_FLAG);
} else {
@@ -3545,9 +3694,14 @@ btr_cur_pessimistic_insert(
}
if (page_zip_rec_needs_ext(rec_get_converted_size(index, entry, n_ext),
- dict_table_is_comp(index->table),
+ index->table->not_redundant(),
dtuple_get_n_fields(entry),
- dict_table_page_size(index->table))) {
+ btr_cur_get_block(cursor)->page.size)
+ || UNIV_UNLIKELY(entry->is_alter_metadata()
+ && !dfield_is_ext(
+ dtuple_get_nth_field(
+ entry,
+ index->first_user_field())))) {
/* The record is so big that we have to store some fields
externally on separate database pages */
@@ -3620,10 +3774,10 @@ btr_cur_pessimistic_insert(
if (index->disable_ahi); else
# endif
if (entry->info_bits & REC_INFO_MIN_REC_FLAG) {
- ut_ad(entry->info_bits == REC_INFO_DEFAULT_ROW);
+ ut_ad(entry->is_metadata());
ut_ad(index->is_instant());
- ut_ad((flags & ulint(~BTR_KEEP_IBUF_BITMAP))
- == BTR_NO_LOCKING_FLAG);
+ ut_ad(flags & BTR_NO_LOCKING_FLAG);
+ ut_ad(!(flags & BTR_CREATE_FLAG));
} else {
btr_search_update_hash_on_insert(
cursor, btr_get_search_latch(index));
@@ -3705,6 +3859,50 @@ btr_cur_upd_lock_and_undo(
cmpl_info, rec, offsets, roll_ptr));
}
+/** Copy DB_TRX_ID,DB_ROLL_PTR to the redo log.
+@param[in] index clustered index
+@param[in] trx_id DB_TRX_ID
+@param[in] roll_ptr DB_ROLL_PTR
+@param[in,out] log_ptr redo log buffer
+@return current end of the redo log buffer */
+static byte*
+btr_cur_log_sys(
+ const dict_index_t* index,
+ trx_id_t trx_id,
+ roll_ptr_t roll_ptr,
+ byte* log_ptr)
+{
+ log_ptr += mach_write_compressed(log_ptr, index->db_trx_id());
+ /* Yes, we are writing DB_ROLL_PTR,DB_TRX_ID in reverse order,
+ after emitting the position of DB_TRX_ID in the index.
+ This is how row_upd_write_sys_vals_to_log()
+ originally worked, and it is part of the redo log format. */
+ trx_write_roll_ptr(log_ptr, roll_ptr);
+ log_ptr += DATA_ROLL_PTR_LEN;
+ log_ptr += mach_u64_write_compressed(log_ptr, trx_id);
+
+ return log_ptr;
+}
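The comment in btr_cur_log_sys() above fixes the order of the system columns in this part of the redo record: first the compressed position of DB_TRX_ID within the index, then the raw DB_ROLL_PTR, then the compressed DB_TRX_ID value. A schematic sketch of that ordering over an append-only buffer; fixed-width big-endian writes stand in for mach_write_compressed()/mach_u64_write_compressed(), whose variable-length encoding is not reproduced here:

#include <cstdint>
#include <vector>

/* Placeholder for the compressed integer encoders. */
static void put_be(std::vector<uint8_t>& buf, uint64_t v, int bytes)
{
	for (int shift = 8 * (bytes - 1); shift >= 0; shift -= 8) {
		buf.push_back(uint8_t(v >> shift));
	}
}

/* Append the system-column part in the documented order. */
static void append_sys_vals(std::vector<uint8_t>& buf,
			    uint32_t db_trx_id_pos,	/* field position */
			    const uint8_t roll_ptr[7],	/* DB_ROLL_PTR */
			    uint64_t trx_id)		/* DB_TRX_ID */
{
	put_be(buf, db_trx_id_pos, 4);
	buf.insert(buf.end(), roll_ptr, roll_ptr + 7);
	put_be(buf, trx_id, 8);
}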
+
+/** Write DB_TRX_ID,DB_ROLL_PTR to a clustered index entry.
+@param[in,out] entry clustered index entry
+@param[in] index clustered index
+@param[in] trx_id DB_TRX_ID
+@param[in] roll_ptr DB_ROLL_PTR */
+static void btr_cur_write_sys(
+ dtuple_t* entry,
+ const dict_index_t* index,
+ trx_id_t trx_id,
+ roll_ptr_t roll_ptr)
+{
+ dfield_t* t = dtuple_get_nth_field(entry, index->db_trx_id());
+ ut_ad(t->len == DATA_TRX_ID_LEN);
+ trx_write_trx_id(static_cast<byte*>(t->data), trx_id);
+ dfield_t* r = dtuple_get_nth_field(entry, index->db_roll_ptr());
+ ut_ad(r->len == DATA_ROLL_PTR_LEN);
+ trx_write_roll_ptr(static_cast<byte*>(r->data), roll_ptr);
+}
+
/***********************************************************//**
Writes a redo log record of updating a record in-place. */
void
@@ -3744,8 +3942,7 @@ btr_cur_update_in_place_log(
log_ptr++;
if (dict_index_is_clust(index)) {
- log_ptr = row_upd_write_sys_vals_to_log(
- index, trx_id, roll_ptr, log_ptr, mtr);
+ log_ptr = btr_cur_log_sys(index, trx_id, roll_ptr, log_ptr);
} else {
/* Dummy system fields for a secondary index */
/* TRX_ID Position */
@@ -4098,11 +4295,77 @@ func_exit:
return(err);
}
+/** Trim a metadata record during the rollback of instant ALTER TABLE.
+@param[in] entry metadata tuple
+@param[in] index primary key
+@param[in] update update vector for the rollback */
+ATTRIBUTE_COLD
+static void btr_cur_trim_alter_metadata(dtuple_t* entry,
+ const dict_index_t* index,
+ const upd_t* update)
+{
+ ut_ad(index->is_instant());
+ ut_ad(update->is_alter_metadata());
+ ut_ad(entry->is_alter_metadata());
+
+ ut_ad(update->fields[0].field_no == index->first_user_field());
+ ut_ad(update->fields[0].new_val.ext);
+ ut_ad(update->fields[0].new_val.len == FIELD_REF_SIZE);
+ ut_ad(entry->n_fields - 1 == index->n_fields);
+
+ const byte* ptr = static_cast<const byte*>(
+ update->fields[0].new_val.data);
+ ut_ad(!mach_read_from_4(ptr + BTR_EXTERN_LEN));
+ ut_ad(mach_read_from_4(ptr + BTR_EXTERN_LEN + 4) > 4);
+ ut_ad(mach_read_from_4(ptr + BTR_EXTERN_OFFSET) == FIL_PAGE_DATA);
+ ut_ad(mach_read_from_4(ptr + BTR_EXTERN_SPACE_ID)
+ == index->table->space->id);
+
+ ulint n_fields = update->fields[1].field_no;
+ ut_ad(n_fields <= index->n_fields);
+ if (n_fields != index->n_uniq) {
+ ut_ad(n_fields
+ >= index->n_core_fields);
+ entry->n_fields = n_fields;
+ return;
+ }
+
+ /* This is based on dict_table_t::deserialise_columns()
+ and btr_cur_instant_init_low(). */
+ mtr_t mtr;
+ mtr.start();
+ buf_block_t* block = buf_page_get(
+ page_id_t(index->table->space->id,
+ mach_read_from_4(ptr + BTR_EXTERN_PAGE_NO)),
+ univ_page_size, RW_S_LATCH, &mtr);
+ buf_block_dbg_add_level(block, SYNC_EXTERN_STORAGE);
+ ut_ad(fil_page_get_type(block->frame) == FIL_PAGE_TYPE_BLOB);
+ ut_ad(mach_read_from_4(&block->frame[FIL_PAGE_DATA
+ + BTR_BLOB_HDR_NEXT_PAGE_NO])
+ == FIL_NULL);
+ ut_ad(mach_read_from_4(&block->frame[FIL_PAGE_DATA
+ + BTR_BLOB_HDR_PART_LEN])
+ == mach_read_from_4(ptr + BTR_EXTERN_LEN + 4));
+ n_fields = mach_read_from_4(
+ &block->frame[FIL_PAGE_DATA + BTR_BLOB_HDR_SIZE])
+ + index->first_user_field();
+ /* Rollback should not increase the number of fields. */
+ ut_ad(n_fields <= index->n_fields);
+ ut_ad(n_fields + 1 <= entry->n_fields);
+ /* dict_index_t::clear_instant_alter() cannot be invoked while
+ rollback of an instant ALTER TABLE transaction is in progress
+ for an is_alter_metadata() record. */
+ ut_ad(n_fields >= index->n_core_fields);
+
+ mtr.commit();
+ entry->n_fields = n_fields + 1;
+}
+
/** Trim an update tuple due to instant ADD COLUMN, if needed.
For normal records, the trailing instantly added fields that match
-the 'default row' are omitted.
+the initial default values are omitted.
-For the special 'default row' record on a table on which instant
+For the special metadata record on a table on which instant
ADD COLUMN has already been executed, both ADD COLUMN and the
rollback of ADD COLUMN need to be handled specially.
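As a plain illustration of the rule stated in this comment (trailing instantly added fields whose value still equals the column default are not stored), here is a minimal sketch on a simplified tuple; the container types and names are illustrative, not btr_cur_trim()'s actual interface:

#include <string>
#include <vector>

/* Drop trailing instantly added fields that still carry the column
default; the first n_core fields are always kept. */
static void trim_trailing_defaults(std::vector<std::string>& tuple,
				   const std::vector<std::string>& defaults,
				   size_t n_core)
{
	while (tuple.size() > n_core
	       && tuple.back() == defaults[tuple.size() - 1]) {
		tuple.pop_back();
	}
}

For example, with n_core = 3 and two instantly added columns whose defaults are still in place, a five-field tuple would be stored with only its three core fields.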
@@ -4119,13 +4382,12 @@ btr_cur_trim(
const que_thr_t* thr)
{
if (!index->is_instant()) {
- } else if (UNIV_UNLIKELY(update->info_bits == REC_INFO_DEFAULT_ROW)) {
- /* We are either updating a 'default row'
- (instantly adding columns to a table where instant ADD was
+ } else if (UNIV_UNLIKELY(update->is_metadata())) {
+ /* We are either updating a metadata record
+ (instant ALTER TABLE on a table where instant ALTER was
already executed) or rolling back such an operation. */
ut_ad(!upd_get_nth_field(update, 0)->orig_len);
- ut_ad(upd_get_nth_field(update, 0)->field_no
- > index->n_core_fields);
+ ut_ad(entry->is_metadata());
if (thr->graph->trx->in_rollback) {
/* This rollback can occur either as part of
@@ -4142,6 +4404,13 @@ btr_cur_trim(
first instantly added column logged by
innobase_add_instant_try(). */
ut_ad(update->n_fields > 2);
+ if (update->is_alter_metadata()) {
+ btr_cur_trim_alter_metadata(
+ entry, index, update);
+ return;
+ }
+ ut_ad(!entry->is_alter_metadata());
+
ulint n_fields = upd_get_nth_field(update, 0)
->field_no;
ut_ad(n_fields + 1 >= entry->n_fields);
@@ -4227,9 +4496,7 @@ btr_cur_optimistic_update(
|| trx_is_recv(thr_get_trx(thr)));
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
- const bool is_default_row = update->info_bits == REC_INFO_DEFAULT_ROW;
-
- if (UNIV_LIKELY(!is_default_row)
+ if (UNIV_LIKELY(!update->is_metadata())
&& !row_upd_changes_field_size_or_external(index, *offsets,
update)) {
@@ -4255,6 +4522,10 @@ any_extern:
return(DB_OVERFLOW);
}
+ if (rec_is_metadata(rec, *index) && index->table->instant) {
+ goto any_extern;
+ }
+
for (i = 0; i < upd_get_n_fields(update); i++) {
if (dfield_is_ext(&upd_get_nth_field(update, i)->new_val)) {
@@ -4299,7 +4570,7 @@ any_extern:
if (page_zip_rec_needs_ext(new_rec_size, page_is_comp(page),
dict_index_get_n_fields(index),
- dict_table_page_size(index->table))) {
+ block->page.size)) {
goto any_extern;
}
@@ -4313,10 +4584,10 @@ any_extern:
}
/* We limit max record size to 16k even for 64k page size. */
- if (new_rec_size >= COMPRESSED_REC_MAX_DATA_SIZE ||
- (!dict_table_is_comp(index->table)
- && new_rec_size >= REDUNDANT_REC_MAX_DATA_SIZE)) {
- err = DB_OVERFLOW;
+ if (new_rec_size >= COMPRESSED_REC_MAX_DATA_SIZE ||
+ (!dict_table_is_comp(index->table)
+ && new_rec_size >= REDUNDANT_REC_MAX_DATA_SIZE)) {
+ err = DB_OVERFLOW;
goto func_exit;
}
@@ -4389,8 +4660,8 @@ any_extern:
lock_rec_store_on_page_infimum(block, rec);
}
- if (UNIV_UNLIKELY(is_default_row)) {
- ut_ad(new_entry->info_bits == REC_INFO_DEFAULT_ROW);
+ if (UNIV_UNLIKELY(update->is_metadata())) {
+ ut_ad(new_entry->is_metadata());
ut_ad(index->is_instant());
/* This can be innobase_add_instant_try() performing a
subsequent instant ADD COLUMN, or its rollback by
@@ -4405,10 +4676,7 @@ any_extern:
page_cur_move_to_prev(page_cursor);
if (!(flags & BTR_KEEP_SYS_FLAG)) {
- row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
- roll_ptr);
- row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
- trx_id);
+ btr_cur_write_sys(new_entry, index, trx_id, roll_ptr);
}
/* There are no externally stored columns in new_entry */
@@ -4416,9 +4684,9 @@ any_extern:
cursor, new_entry, offsets, heap, 0/*n_ext*/, mtr);
ut_a(rec); /* <- We calculated above the insert would fit */
- if (UNIV_UNLIKELY(is_default_row)) {
+ if (UNIV_UNLIKELY(update->is_metadata())) {
/* We must empty the PAGE_FREE list, because if this
- was a rollback, the shortened 'default row' record
+ was a rollback, the shortened metadata record
would have too many fields, and we would be unable to
know the size of the freed record. */
btr_page_reorganize(page_cursor, index, mtr);
@@ -4610,8 +4878,25 @@ btr_cur_pessimistic_update(
rec, index, *offsets, page_is_leaf(page),
ULINT_UNDEFINED, offsets_heap);
- dtuple_t* new_entry = row_rec_to_index_entry(
- rec, index, *offsets, &n_ext, entry_heap);
+ dtuple_t* new_entry;
+
+ const bool is_metadata = rec_is_metadata(rec, *index);
+
+ if (UNIV_UNLIKELY(is_metadata)) {
+ ut_ad(update->is_metadata());
+ ut_ad(flags & BTR_NO_LOCKING_FLAG);
+ ut_ad(index->is_instant());
+ new_entry = row_metadata_to_tuple(
+ rec, index, *offsets,
+ &n_ext, entry_heap,
+ update->info_bits, !thr_get_trx(thr)->in_rollback);
+ ut_ad(new_entry->n_fields
+ == ulint(index->n_fields)
+ + update->is_alter_metadata());
+ } else {
+ new_entry = row_rec_to_index_entry(rec, index, *offsets,
+ &n_ext, entry_heap);
+ }
/* The page containing the clustered index record
corresponding to new_entry is latched in mtr. If the
@@ -4623,9 +4908,6 @@ btr_cur_pessimistic_update(
entry_heap);
btr_cur_trim(new_entry, index, update, thr);
- const bool is_default_row = new_entry->info_bits
- & REC_INFO_MIN_REC_FLAG;
-
/* We have to set appropriate extern storage bits in the new
record to be inserted: we have to remember which fields were such */
@@ -4653,11 +4935,14 @@ btr_cur_pessimistic_update(
}
if (page_zip_rec_needs_ext(
- rec_get_converted_size(index, new_entry, n_ext),
- page_is_comp(page),
- dict_index_get_n_fields(index),
- block->page.size)) {
-
+ rec_get_converted_size(index, new_entry, n_ext),
+ page_is_comp(page),
+ dict_index_get_n_fields(index),
+ block->page.size)
+ || (UNIV_UNLIKELY(update->is_alter_metadata())
+ && !dfield_is_ext(dtuple_get_nth_field(
+ new_entry,
+ index->first_user_field())))) {
big_rec_vec = dtuple_convert_big_rec(index, update, new_entry, &n_ext);
if (UNIV_UNLIKELY(big_rec_vec == NULL)) {
@@ -4706,10 +4991,7 @@ btr_cur_pessimistic_update(
}
if (!(flags & BTR_KEEP_SYS_FLAG)) {
- row_upd_index_entry_sys_field(new_entry, index, DATA_ROLL_PTR,
- roll_ptr);
- row_upd_index_entry_sys_field(new_entry, index, DATA_TRX_ID,
- trx_id);
+ btr_cur_write_sys(new_entry, index, trx_id, roll_ptr);
}
if (!page_zip) {
@@ -4717,11 +4999,11 @@ btr_cur_pessimistic_update(
page, 1);
}
- if (UNIV_UNLIKELY(is_default_row)) {
- ut_ad(new_entry->info_bits == REC_INFO_DEFAULT_ROW);
+ if (UNIV_UNLIKELY(is_metadata)) {
+ ut_ad(new_entry->is_metadata());
ut_ad(index->is_instant());
/* This can be innobase_add_instant_try() performing a
- subsequent instant ADD COLUMN, or its rollback by
+ subsequent instant ALTER TABLE, or its rollback by
row_undo_mod_clust_low(). */
ut_ad(flags & BTR_NO_LOCKING_FLAG);
} else {
@@ -4757,9 +5039,9 @@ btr_cur_pessimistic_update(
if (rec) {
page_cursor->rec = rec;
- if (UNIV_UNLIKELY(is_default_row)) {
+ if (UNIV_UNLIKELY(is_metadata)) {
/* We must empty the PAGE_FREE list, because if this
- was a rollback, the shortened 'default row' record
+ was a rollback, the shortened metadata record
would have too many fields, and we would be unable to
know the size of the freed record. */
btr_page_reorganize(page_cursor, index, mtr);
@@ -4770,7 +5052,8 @@ btr_cur_pessimistic_update(
btr_cur_get_block(cursor), rec, block);
}
- if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))) {
+ if (!rec_get_deleted_flag(rec, rec_offs_comp(*offsets))
+ || rec_is_alter_metadata(rec, *index)) {
/* The new inserted record owns its possible externally
stored fields */
btr_cur_unmark_extern_fields(
@@ -4913,9 +5196,9 @@ btr_cur_pessimistic_update(
ut_ad(row_get_rec_trx_id(rec, index, *offsets));
}
- if (UNIV_UNLIKELY(is_default_row)) {
+ if (UNIV_UNLIKELY(is_metadata)) {
/* We must empty the PAGE_FREE list, because if this
- was a rollback, the shortened 'default row' record
+ was a rollback, the shortened metadata record
would have too many fields, and we would be unable to
know the size of the freed record. */
btr_page_reorganize(page_cursor, index, mtr);
@@ -4980,8 +5263,7 @@ btr_cur_del_mark_set_clust_rec_log(
*log_ptr++ = 0;
*log_ptr++ = 1;
- log_ptr = row_upd_write_sys_vals_to_log(
- index, trx_id, roll_ptr, log_ptr, mtr);
+ log_ptr = btr_cur_log_sys(index, trx_id, roll_ptr, log_ptr);
mach_write_to_2(log_ptr, page_offset(rec));
log_ptr += 2;
@@ -5413,42 +5695,41 @@ btr_cur_optimistic_delete_func(
if (UNIV_UNLIKELY(page_is_root(block->frame)
&& page_get_n_recs(block->frame) == 1
+ (cursor->index->is_instant()
- && !rec_is_default_row(rec, cursor->index)))) {
+ && !rec_is_metadata(rec, *cursor->index)))) {
/* The whole index (and table) becomes logically empty.
Empty the whole page. That is, if we are deleting the
- only user record, also delete the 'default row' record
- if one exists (it exists if and only if is_instant()).
- If we are deleting the 'default row' record and the
+ only user record, also delete the metadata record
+ if one exists for instant ADD COLUMN (not generic ALTER TABLE).
+ If we are deleting the metadata record and the
table becomes empty, clean up the whole page. */
dict_index_t* index = cursor->index;
+ const rec_t* first_rec = page_rec_get_next_const(
+ page_get_infimum_rec(block->frame));
ut_ad(!index->is_instant()
- || rec_is_default_row(
- page_rec_get_next_const(
- page_get_infimum_rec(block->frame)),
- index));
- if (UNIV_UNLIKELY(rec_get_info_bits(rec, page_rec_is_comp(rec))
- & REC_INFO_MIN_REC_FLAG)) {
- /* This should be rolling back instant ADD COLUMN.
- If this is a recovered transaction, then
- index->is_instant() will hold until the
- insert into SYS_COLUMNS is rolled back. */
- ut_ad(index->table->supports_instant());
- ut_ad(index->is_primary());
- } else {
- lock_update_delete(block, rec);
- }
- btr_page_empty(block, buf_block_get_page_zip(block),
- index, 0, mtr);
- page_cur_set_after_last(block, btr_cur_get_page_cur(cursor));
-
- if (index->is_primary()) {
- /* Concurrent access is prevented by
- root_block->lock X-latch, so this should be
- safe. */
- index->remove_instant();
+ || rec_is_metadata(first_rec, *index));
+ const bool is_metadata = rec_is_metadata(rec, *index);
+ /* We can remove the metadata when rolling back an
+ instant ALTER TABLE operation, or when deleting the
+ last user record on the page such that only metadata for
+ instant ADD COLUMN (not generic ALTER TABLE) remains. */
+ const bool empty_table = is_metadata
+ || !index->is_instant()
+ || (first_rec != rec
+ && rec_is_add_metadata(first_rec, *index));
+ if (UNIV_LIKELY(empty_table)) {
+ if (UNIV_LIKELY(!is_metadata)) {
+ lock_update_delete(block, rec);
+ }
+ btr_page_empty(block, buf_block_get_page_zip(block),
+ index, 0, mtr);
+ if (index->is_instant()) {
+ /* MDEV-17383: free metadata BLOBs! */
+ index->clear_instant_alter();
+ }
+ page_cur_set_after_last(block,
+ btr_cur_get_page_cur(cursor));
+ return true;
}
-
- return true;
}
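The condition built up in the hunk above decides when the whole root page may be emptied. A boolean restatement of the new empty_table predicate, with plain flags standing in for the real rec_is_metadata()/rec_is_add_metadata() tests:

/* The page may be emptied when the record being deleted is the
metadata record itself (rollback of instant ALTER TABLE), when the
index carries no metadata record at all, or when the only record that
would remain is instant ADD COLUMN metadata (not generic ALTER TABLE
metadata). */
static bool can_empty_page(bool deleting_metadata,
			   bool index_is_instant,
			   bool deleting_first_rec,
			   bool first_rec_is_add_metadata)
{
	return deleting_metadata
		|| !index_is_instant
		|| (!deleting_first_rec && first_rec_is_add_metadata);
}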
offsets = rec_get_offsets(rec, cursor->index, offsets, true,
@@ -5475,7 +5756,7 @@ btr_cur_optimistic_delete_func(
page_cur_delete_rec(btr_cur_get_page_cur(cursor),
cursor->index, offsets, mtr);
/* We must empty the PAGE_FREE list, because
- after rollback, this deleted 'default row' record
+ after rollback, this deleted metadata record
would have too many fields, and we would be
unable to know the size of the freed record. */
btr_page_reorganize(btr_cur_get_page_cur(cursor),
@@ -5628,10 +5909,10 @@ btr_cur_pessimistic_delete(
}
if (page_is_leaf(page)) {
- const bool is_default_row = rec_get_info_bits(
- rec, page_rec_is_comp(rec)) & REC_INFO_MIN_REC_FLAG;
- if (UNIV_UNLIKELY(is_default_row)) {
- /* This should be rolling back instant ADD COLUMN.
+ const bool is_metadata = rec_is_metadata(
+ rec, page_rec_is_comp(rec));
+ if (UNIV_UNLIKELY(is_metadata)) {
+ /* This should be rolling back instant ALTER TABLE.
If this is a recovered transaction, then
index->is_instant() will hold until the
insert into SYS_COLUMNS is rolled back. */
@@ -5647,39 +5928,43 @@ btr_cur_pessimistic_delete(
goto discard_page;
}
} else if (page_get_n_recs(page) == 1
- + (index->is_instant()
- && !rec_is_default_row(rec, index))) {
+ + (index->is_instant() && !is_metadata)) {
/* The whole index (and table) becomes logically empty.
Empty the whole page. That is, if we are deleting the
- only user record, also delete the 'default row' record
- if one exists (it exists if and only if is_instant()).
- If we are deleting the 'default row' record and the
+ only user record, also delete the metadata record
+ if one exists for instant ADD COLUMN
+ (not generic ALTER TABLE).
+ If we are deleting the metadata record
+ (in the rollback of instant ALTER TABLE) and the
table becomes empty, clean up the whole page. */
+
+ const rec_t* first_rec = page_rec_get_next_const(
+ page_get_infimum_rec(page));
ut_ad(!index->is_instant()
- || rec_is_default_row(
- page_rec_get_next_const(
- page_get_infimum_rec(page)),
- index));
- btr_page_empty(block, page_zip, index, 0, mtr);
- page_cur_set_after_last(block,
- btr_cur_get_page_cur(cursor));
- if (index->is_primary()) {
- /* Concurrent access is prevented by
- index->lock and root_block->lock
- X-latch, so this should be safe. */
- index->remove_instant();
+ || rec_is_metadata(first_rec, *index));
+ if (is_metadata || !index->is_instant()
+ || (first_rec != rec
+ && rec_is_add_metadata(first_rec, *index))) {
+ btr_page_empty(block, page_zip, index, 0, mtr);
+ if (index->is_instant()) {
+ /* MDEV-17383: free metadata BLOBs! */
+ index->clear_instant_alter();
+ }
+ page_cur_set_after_last(
+ block,
+ btr_cur_get_page_cur(cursor));
+ ret = TRUE;
+ goto return_after_reservations;
}
- ret = TRUE;
- goto return_after_reservations;
}
- if (UNIV_LIKELY(!is_default_row)) {
+ if (UNIV_LIKELY(!is_metadata)) {
btr_search_update_hash_on_delete(cursor);
} else {
page_cur_delete_rec(btr_cur_get_page_cur(cursor),
index, offsets, mtr);
/* We must empty the PAGE_FREE list, because
- after rollback, this deleted 'default row' record
+ after rollback, this deleted metadata record
would carry too many fields, and we would be
unable to know the size of the freed record. */
btr_page_reorganize(btr_cur_get_page_cur(cursor),
@@ -7164,7 +7449,7 @@ struct btr_blob_log_check_t {
if (m_op == BTR_STORE_INSERT_BULK) {
mtr_x_lock(dict_index_get_lock(index), m_mtr);
m_pcur->btr_cur.page_cur.block = btr_block_get(
- page_id_t(index->table->space->id, page_no),
+ page_id_t(index->table->space_id, page_no),
page_size_t(index->table->space->flags),
RW_X_LATCH, index, m_mtr);
m_pcur->btr_cur.page_cur.rec
@@ -7253,8 +7538,8 @@ btr_store_big_rec_extern_fields(
ut_ad(buf_block_get_frame(rec_block) == page_align(rec));
ut_a(dict_index_is_clust(index));
- ut_a(dict_table_page_size(index->table)
- .equals_to(rec_block->page.size));
+ ut_ad(dict_table_page_size(index->table)
+ .equals_to(rec_block->page.size));
btr_blob_log_check_t redo_log(pcur, btr_mtr, offsets, &rec_block,
&rec, op);
@@ -7299,15 +7584,13 @@ btr_store_big_rec_extern_fields(
}
#endif /* UNIV_DEBUG || UNIV_BLOB_LIGHT_DEBUG */
- const page_size_t page_size(dict_table_page_size(index->table));
-
/* Space available in compressed page to carry blob data */
- const ulint payload_size_zip = page_size.physical()
+ const ulint payload_size_zip = rec_block->page.size.physical()
- FIL_PAGE_DATA;
/* Space available in uncompressed page to carry blob data */
- const ulint payload_size = page_size.physical()
- - FIL_PAGE_DATA - BTR_BLOB_HDR_SIZE - FIL_PAGE_DATA_END;
+ const ulint payload_size = payload_size_zip
+ - (BTR_BLOB_HDR_SIZE + FIL_PAGE_DATA_END);
/* We have to create a file segment to the tablespace
for each field and put the pointer to the field in rec */
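For reference, the payload arithmetic above works out as follows on a 16KiB page; the constants (38-byte page header, 8-byte page trailer, 8-byte BLOB part header) are the conventional InnoDB values and are assumptions of this sketch, not stated in the patch:

#include <cassert>
#include <cstddef>

int main()
{
	const size_t page_size         = 16384;	/* example srv_page_size */
	const size_t fil_page_data     = 38;	/* FIL_PAGE_DATA (assumed) */
	const size_t fil_page_data_end = 8;	/* FIL_PAGE_DATA_END (assumed) */
	const size_t blob_hdr_size     = 8;	/* BTR_BLOB_HDR_SIZE (assumed) */

	/* Space available for BLOB data on a ROW_FORMAT=COMPRESSED page. */
	const size_t payload_size_zip = page_size - fil_page_data;
	/* Space available for BLOB data on an uncompressed page. */
	const size_t payload_size = payload_size_zip
		- (blob_hdr_size + fil_page_data_end);

	assert(payload_size_zip == 16346);
	assert(payload_size == 16330);
	return 0;
}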
@@ -7772,6 +8055,7 @@ btr_free_externally_stored_field(
& ~((BTR_EXTERN_OWNER_FLAG
| BTR_EXTERN_INHERITED_FLAG) << 24)));
ut_ad(space_id == index->table->space->id);
+ ut_ad(space_id == index->table->space_id);
const page_size_t ext_page_size(dict_table_page_size(index->table));
const page_size_t& rec_page_size(rec == NULL