From 7345d37141899ad0575814465a1b90d87aa4f363 Mon Sep 17 00:00:00 2001 From: Julius Goryavsky Date: Mon, 8 Mar 2021 05:03:06 +0100 Subject: MDEV-24853: Duplicate key generated during cluster configuration change Incorrect processing of an auto-incrementing field in the WSREP-related code during applying transactions results in a duplicate key being created. This is due to the fact that at the beginning of the write_row() and update_row() functions, the values of the auto-increment parameters are used, which are read from the parameters of the current thread, but further along the code other values are used, which are read from global variables (when applying a transaction). This can happen when the cluster configuration has changed while applying a transaction (for example in the high_priority_service mode for Galera 4). Further during IST processing duplicating key is detected, and processing of the DB_DUPLICATE_KEY return code (inside innodb, in the write_row() handler) results in a call to the wsrep_thd_self_abort() function. --- storage/innobase/handler/ha_innodb.cc | 308 +++++++++++++++++++--------------- 1 file changed, 177 insertions(+), 131 deletions(-) (limited to 'storage') diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 11d8ec2b81e..423996c7fb1 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -2556,6 +2556,72 @@ innobase_raw_format( return(ut_str_sql_format(buf_tmp, buf_tmp_used, buf, buf_size)); } +/* +The helper function nlz(x) calculates the number of leading zeros +in the binary representation of the number "x", either using a +built-in compiler function or a substitute trick based on the use +of the multiplication operation and a table indexed by the prefix +of the multiplication result: +*/ +#ifdef __GNUC__ +#define nlz(x) __builtin_clzll(x) +#elif defined(_MSC_VER) && !defined(_M_CEE_PURE) && \ + (defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM64)) +#ifndef __INTRIN_H_ +#pragma warning(push, 4) +#pragma warning(disable: 4255 4668) +#include +#pragma warning(pop) +#endif +__forceinline unsigned int nlz (ulonglong x) +{ +#if defined(_M_IX86) || defined(_M_X64) + unsigned long n; +#ifdef _M_X64 + _BitScanReverse64(&n, x); + return (unsigned int) n ^ 63; +#else + unsigned long y = (unsigned long) (x >> 32); + unsigned int m = 31; + if (y == 0) + { + y = (unsigned long) x; + m = 63; + } + _BitScanReverse(&n, y); + return (unsigned int) n ^ m; +#endif +#elif defined(_M_ARM64) + return _CountLeadingZeros(x); +#endif +} +#else +inline unsigned int nlz (ulonglong x) +{ + static unsigned char table [48] = { + 32, 6, 5, 0, 4, 12, 0, 20, + 15, 3, 11, 0, 0, 18, 25, 31, + 8, 14, 2, 0, 10, 0, 0, 0, + 0, 0, 0, 21, 0, 0, 19, 26, + 7, 0, 13, 0, 16, 1, 22, 27, + 9, 0, 17, 23, 28, 24, 29, 30 + }; + unsigned int y= (unsigned int) (x >> 32); + unsigned int n= 0; + if (y == 0) { + y= (unsigned int) x; + n= 32; + } + y = y | (y >> 1); // Propagate leftmost 1-bit to the right. + y = y | (y >> 2); + y = y | (y >> 4); + y = y | (y >> 8); + y = y & ~(y >> 16); + y = y * 0x3EF5D037; + return n + table[y >> 26]; +} +#endif + /*********************************************************************//** Compute the next autoinc value. @@ -2584,85 +2650,93 @@ innobase_next_autoinc( ulonglong max_value) /*!< in: max value for type */ { ulonglong next_value; - ulonglong block = need * step; + ulonglong block; /* Should never be 0. */ ut_a(need > 0); - ut_a(block > 0); + ut_a(step > 0); ut_a(max_value > 0); - /* - Allow auto_increment to go over max_value up to max ulonglong. - This allows us to detect that all values are exhausted. - If we don't do this, we will return max_value several times - and get duplicate key errors instead of auto increment value - out of range. - */ - max_value= (~(ulonglong) 0); + /* + We need to calculate the "block" value equal to the product + "step * need". However, when calculating this product, an integer + overflow can occur, so we cannot simply use the usual multiplication + operation. The snippet below calculates the product of two numbers + and detects an unsigned integer overflow: + */ + unsigned int m= nlz(need); + unsigned int n= nlz(step); + if (m + n <= 8 * sizeof(ulonglong) - 2) { + // The bit width of the original values is too large, + // therefore we are guaranteed to get an overflow. + goto overflow; + } + block = need * (step >> 1); + if ((longlong) block < 0) { + goto overflow; + } + block += block; + if (step & 1) { + block += need; + if (block < need) { + goto overflow; + } + } + + /* Check for overflow. Current can be > max_value if the value + is in reality a negative value. Also, the visual studio compiler + converts large double values (which hypothetically can then be + passed here as the values of the "current" parameter) automatically + into unsigned long long datatype maximum value: */ + if (current > max_value) { + goto overflow; + } /* According to MySQL documentation, if the offset is greater than the step then the offset is ignored. */ - if (offset > block) { + if (offset > step) { offset = 0; } - /* Check for overflow. Current can be > max_value if the value is - in reality a negative value.The visual studio compilers converts - large double values automatically into unsigned long long datatype - maximum value */ - - if (block >= max_value - || offset > max_value - || current >= max_value - || max_value - offset <= offset) { - - next_value = max_value; + /* + Let's round the current value to within a step-size block: + */ + if (current > offset) { + next_value = current - offset; } else { - ut_a(max_value > current); - - ulonglong free = max_value - current; - - if (free < offset || free - offset <= block) { - next_value = max_value; - } else { - next_value = 0; - } + next_value = offset - current; } + next_value -= next_value % step; - if (next_value == 0) { - ulonglong next; - - if (current > offset) { - next = (current - offset) / step; - } else { - next = (offset - current) / step; - } - - ut_a(max_value > next); - next_value = next * step; - /* Check for multiplication overflow. */ - ut_a(next_value >= next); - ut_a(max_value > next_value); - - /* Check for overflow */ - if (max_value - next_value >= block) { - - next_value += block; - - if (max_value - next_value >= offset) { - next_value += offset; - } else { - next_value = max_value; - } - } else { - next_value = max_value; - } + /* + Add an offset to the next value and check that the addition + does not cause an integer overflow: + */ + next_value += offset; + if (next_value < offset) { + goto overflow; } - ut_a(next_value != 0); - ut_a(next_value <= max_value); + /* + Add a block to the next value and check that the addition + does not cause an integer overflow: + */ + next_value += block; + if (next_value < block) { + goto overflow; + } return(next_value); + +overflow: + /* + Allow auto_increment to go over max_value up to max ulonglong. + This allows us to detect that all values are exhausted. + If we don't do this, we will return max_value several times + and get duplicate key errors instead of auto increment value + out of range: + */ + return(~(ulonglong) 0); } /********************************************************************//** @@ -8169,7 +8243,6 @@ ha_innobase::write_row( /* Handling of errors related to auto-increment. */ if (auto_inc_used) { ulonglong auto_inc; - ulonglong col_max_value; /* Note the number of rows processed for this statement, used by get_auto_increment() to determine the number of AUTO-INC @@ -8179,11 +8252,6 @@ ha_innobase::write_row( --trx->n_autoinc_rows; } - /* We need the upper limit of the col type to check for - whether we update the table autoinc counter or not. */ - col_max_value = - table->next_number_field->get_max_int_value(); - /* Get the value that MySQL attempted to store in the table.*/ auto_inc = table->next_number_field->val_uint(); @@ -8250,38 +8318,25 @@ ha_innobase::write_row( if (auto_inc >= m_prebuilt->autoinc_last_value) { set_max_autoinc: + /* We need the upper limit of the col type to check for + whether we update the table autoinc counter or not. */ + ulonglong col_max_value = + table->next_number_field->get_max_int_value(); + /* This should filter out the negative values set explicitly by the user. */ if (auto_inc <= col_max_value) { + ut_ad(m_prebuilt->autoinc_increment > 0); ulonglong offset; ulonglong increment; dberr_t err; -#ifdef WITH_WSREP - /* Applier threads which are processing - ROW events and don't go through server - level autoinc processing, therefore - m_prebuilt autoinc values don't get - properly assigned. Fetch values from - server side. */ - if (trx->is_wsrep() && - wsrep_thd_exec_mode(m_user_thd) == REPL_RECV) - { - wsrep_thd_auto_increment_variables( - m_user_thd, &offset, &increment); - } - else - { -#endif /* WITH_WSREP */ - ut_a(m_prebuilt->autoinc_increment > 0); - offset = m_prebuilt->autoinc_offset; - increment = m_prebuilt->autoinc_increment; -#ifdef WITH_WSREP - } -#endif /* WITH_WSREP */ + + offset = m_prebuilt->autoinc_offset; + increment = m_prebuilt->autoinc_increment; + auto_inc = innobase_next_autoinc( - auto_inc, - 1, increment, offset, + auto_inc, 1, increment, offset, col_max_value); err = innobase_set_max_autoinc( @@ -8949,46 +9004,37 @@ ha_innobase::update_row( /* A value for an AUTO_INCREMENT column was specified in the UPDATE statement. */ - ulonglong offset; - ulonglong increment; -#ifdef WITH_WSREP - /* Applier threads which are processing - ROW events and don't go through server - level autoinc processing, therefore - m_prebuilt autoinc values don't get - properly assigned. Fetch values from - server side. */ - if (trx->is_wsrep() && - wsrep_thd_exec_mode(m_user_thd) == REPL_RECV) - { - wsrep_thd_auto_increment_variables( - m_user_thd, &offset, &increment); - } - else - { -#endif /* WITH_WSREP */ - offset = m_prebuilt->autoinc_offset; - increment = m_prebuilt->autoinc_increment; -#ifdef WITH_WSREP - } -#endif /* WITH_WSREP */ - - autoinc = innobase_next_autoinc( - autoinc, 1, increment, offset, - table->found_next_number_field->get_max_int_value()); - - error = innobase_set_max_autoinc(autoinc); - - if (m_prebuilt->table->persistent_autoinc) { - /* Update the PAGE_ROOT_AUTO_INC. Yes, we do - this even if dict_table_t::autoinc already was - greater than autoinc, because we cannot know - if any INSERT actually used (and wrote to - PAGE_ROOT_AUTO_INC) a value bigger than our - autoinc. */ - btr_write_autoinc(dict_table_get_first_index( - m_prebuilt->table), - autoinc); + /* We need the upper limit of the col type to check for + whether we update the table autoinc counter or not. */ + ulonglong col_max_value = + table->found_next_number_field->get_max_int_value(); + + /* This should filter out the negative + values set explicitly by the user. */ + if (autoinc <= col_max_value) { + ulonglong offset; + ulonglong increment; + + offset = m_prebuilt->autoinc_offset; + increment = m_prebuilt->autoinc_increment; + + autoinc = innobase_next_autoinc( + autoinc, 1, increment, offset, + col_max_value); + + error = innobase_set_max_autoinc(autoinc); + + if (m_prebuilt->table->persistent_autoinc) { + /* Update the PAGE_ROOT_AUTO_INC. Yes, we do + this even if dict_table_t::autoinc already was + greater than autoinc, because we cannot know + if any INSERT actually used (and wrote to + PAGE_ROOT_AUTO_INC) a value bigger than our + autoinc. */ + btr_write_autoinc(dict_table_get_first_index( + m_prebuilt->table), + autoinc); + } } } -- cgit v1.2.1 From 1af855819300ba4e4dd3d582c5c521ff99321152 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Wed, 10 Mar 2021 11:08:51 +0200 Subject: MDEV-25101 Assertion !strcmp(field->name, "table_name") failed btr_node_ptr_max_size(): Let us remove the debug assertion that was added in MDEV-14637. The assertion assumed that no additional indexes exist in mysql.innodb_index_stats or mysql.innodb_table_stats. The code path is working around an incorrect definition of a table, interpreting VARCHAR(64) as the more correct VARCHAR(199). No test case will be added, because MDEV-24579 proves that executing DDL on the statistics tables involves a race condition. The test case included the following: ALTER TABLE mysql.innodb_index_stats ADD KEY (stat_name); CREATE TABLE t (a INT) ENGINE=InnoDB STATS_PERSISTENT=1; --- storage/innobase/btr/btr0cur.cc | 1 - 1 file changed, 1 deletion(-) (limited to 'storage') diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc index 3d03c55bf15..8f2acdb35bd 100644 --- a/storage/innobase/btr/btr0cur.cc +++ b/storage/innobase/btr/btr0cur.cc @@ -837,7 +837,6 @@ static ulint btr_node_ptr_max_size(const dict_index_t* index) TABLE_STATS_NAME) || !strcmp(index->table->name.m_name, INDEX_STATS_NAME))) { - ut_ad(!strcmp(field->name, "table_name")); /* Interpret "table_name" as VARCHAR(199) even if it was incorrectly defined as VARCHAR(64). While the caller of ha_innobase enforces the -- cgit v1.2.1 From 30dea4599e44e3008fb9bc5fe79ab5747841f21f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20Lindstr=C3=B6m?= Date: Thu, 4 Mar 2021 14:28:50 +0200 Subject: MDEV-24978 : SIGABRT in __libc_message Keyvalue can be longer than REC_VERSION_56_MAX_INDEX_COL_LEN and this leads out-of-array reference. Use dynamic memory allocation using actual max length of key value. --- storage/innobase/handler/ha_innodb.cc | 63 +++++++++++++++++++++++------------ 1 file changed, 42 insertions(+), 21 deletions(-) (limited to 'storage') diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 423996c7fb1..56602c68187 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -6730,8 +6730,8 @@ wsrep_innobase_mysql_sort( case MYSQL_TYPE_LONG_BLOB: case MYSQL_TYPE_VARCHAR: { - uchar tmp_str[REC_VERSION_56_MAX_INDEX_COL_LEN] = {'\0'}; - uint tmp_length = REC_VERSION_56_MAX_INDEX_COL_LEN; + uchar *tmp_str; + uint tmp_length; /* Use the charset number to pick the right charset struct for the comparison. Since the MySQL function get_charset may be @@ -6754,7 +6754,11 @@ wsrep_innobase_mysql_sort( } } - ut_a(str_length <= tmp_length); + // Note that strnxfrm may change length of string + tmp_length= charset->coll->strnxfrmlen(charset, str_length); + tmp_length= ut_max(str_length, tmp_length) + 1; + tmp_str= static_cast(ut_malloc_nokey(tmp_length)); + ut_ad(str_length <= tmp_length); memcpy(tmp_str, str, str_length); tmp_length = charset->coll->strnxfrm(charset, str, str_length, @@ -6778,6 +6782,7 @@ wsrep_innobase_mysql_sort( ret_length = tmp_length; } + ut_free(tmp_str); break; } case MYSQL_TYPE_DECIMAL : @@ -7129,7 +7134,7 @@ wsrep_store_key_val_for_row( THD* thd, TABLE* table, uint keynr, /*!< in: key number */ - char* buff, /*!< in/out: buffer for the key value (in MySQL + uchar* buff, /*!< in/out: buffer for the key value (in MySQL format) */ uint buff_len,/*!< in: buffer length */ const uchar* record, @@ -7138,7 +7143,7 @@ wsrep_store_key_val_for_row( KEY* key_info = table->key_info + keynr; KEY_PART_INFO* key_part = key_info->key_part; KEY_PART_INFO* end = key_part + key_info->user_defined_key_parts; - char* buff_start = buff; + uchar* buff_start = buff; enum_field_types mysql_type; Field* field; uint buff_space = buff_len; @@ -7150,7 +7155,8 @@ wsrep_store_key_val_for_row( for (; key_part != end; key_part++) { - uchar sorted[REC_VERSION_56_MAX_INDEX_COL_LEN] = {'\0'}; + uchar *sorted=NULL; + uint max_len=0; ibool part_is_null = FALSE; if (key_part->null_bit) { @@ -7229,10 +7235,14 @@ wsrep_store_key_val_for_row( true_len = key_len; } + max_len= true_len; + sorted= static_cast(ut_malloc_nokey(max_len+1)); memcpy(sorted, data, true_len); true_len = wsrep_innobase_mysql_sort( mysql_type, cs->number, sorted, true_len, - REC_VERSION_56_MAX_INDEX_COL_LEN); + max_len); + ut_ad(true_len <= max_len); + if (wsrep_protocol_version > 1) { /* Note that we always reserve the maximum possible length of the true VARCHAR in the key value, though @@ -7317,11 +7327,13 @@ wsrep_store_key_val_for_row( true_len = key_len; } + max_len= true_len; + sorted= static_cast(ut_malloc_nokey(max_len+1)); memcpy(sorted, blob_data, true_len); true_len = wsrep_innobase_mysql_sort( mysql_type, cs->number, sorted, true_len, - REC_VERSION_56_MAX_INDEX_COL_LEN); - + max_len); + ut_ad(true_len <= max_len); /* Note that we always reserve the maximum possible length of the BLOB prefix in the key value. */ @@ -7397,10 +7409,14 @@ wsrep_store_key_val_for_row( cs->mbmaxlen), &error); } + + max_len= true_len; + sorted= static_cast(ut_malloc_nokey(max_len+1)); memcpy(sorted, src_start, true_len); true_len = wsrep_innobase_mysql_sort( mysql_type, cs->number, sorted, true_len, - REC_VERSION_56_MAX_INDEX_COL_LEN); + max_len); + ut_ad(true_len <= max_len); if (true_len > buff_space) { fprintf (stderr, @@ -7415,6 +7431,11 @@ wsrep_store_key_val_for_row( buff += true_len; buff_space -= true_len; } + + if (sorted) { + ut_free(sorted); + sorted= NULL; + } } ut_a(buff <= buff_start + buff_len); @@ -10492,7 +10513,7 @@ wsrep_append_key( trx_t *trx, TABLE_SHARE *table_share, TABLE *table, - const char* key, + const uchar* key, uint16_t key_len, wsrep_key_type key_type /*!< in: access type of this key (shared, exclusive, semi...) */ @@ -10596,8 +10617,8 @@ ha_innobase::wsrep_append_keys( if (wsrep_protocol_version == 0) { uint len; - char keyval[WSREP_MAX_SUPPORTED_KEY_LENGTH+1] = {'\0'}; - char *key = &keyval[0]; + uchar keyval[WSREP_MAX_SUPPORTED_KEY_LENGTH+1] = {'\0'}; + uchar *key = &keyval[0]; ibool is_null; len = wsrep_store_key_val_for_row( @@ -10629,18 +10650,18 @@ ha_innobase::wsrep_append_keys( for (i=0; is->keys; ++i) { uint len; - char keyval0[WSREP_MAX_SUPPORTED_KEY_LENGTH+1] = {'\0'}; - char keyval1[WSREP_MAX_SUPPORTED_KEY_LENGTH+1] = {'\0'}; - char* key0 = &keyval0[1]; - char* key1 = &keyval1[1]; + uchar keyval0[WSREP_MAX_SUPPORTED_KEY_LENGTH+1] = {'\0'}; + uchar keyval1[WSREP_MAX_SUPPORTED_KEY_LENGTH+1] = {'\0'}; + uchar* key0 = &keyval0[1]; + uchar* key1 = &keyval1[1]; KEY* key_info = table->key_info + i; ibool is_null; dict_index_t* idx = innobase_get_index(i); dict_table_t* tab = (idx) ? idx->table : NULL; - keyval0[0] = (char)i; - keyval1[0] = (char)i; + keyval0[0] = (uchar)i; + keyval1[0] = (uchar)i; if (!tab) { WSREP_WARN("MariaDB-InnoDB key mismatch %s %s", @@ -10698,7 +10719,7 @@ ha_innobase::wsrep_append_keys( wsrep_calc_row_hash(digest, record0, table, m_prebuilt, thd); if ((rcode = wsrep_append_key(thd, trx, table_share, table, - (const char*) digest, 16, + (const uchar*) digest, 16, key_type))) { DBUG_RETURN(rcode); } @@ -10708,7 +10729,7 @@ ha_innobase::wsrep_append_keys( digest, record1, table, m_prebuilt, thd); if ((rcode = wsrep_append_key(thd, trx, table_share, table, - (const char*) digest, + (const uchar*) digest, 16, key_type))) { DBUG_RETURN(rcode); } -- cgit v1.2.1 From 66106130a6cbe701ec07477950094fe40d9a4716 Mon Sep 17 00:00:00 2001 From: Eugene Kosov Date: Wed, 17 Mar 2021 11:01:15 +0300 Subject: switch off storage/innobase/.clang-format: InnoDB uses a common formatting style for all new code --- storage/innobase/.clang-format | 11 ----------- storage/innobase/.clang-format-old | 11 +++++++++++ 2 files changed, 11 insertions(+), 11 deletions(-) delete mode 100644 storage/innobase/.clang-format create mode 100644 storage/innobase/.clang-format-old (limited to 'storage') diff --git a/storage/innobase/.clang-format b/storage/innobase/.clang-format deleted file mode 100644 index 54f7b47bc88..00000000000 --- a/storage/innobase/.clang-format +++ /dev/null @@ -1,11 +0,0 @@ -UseTab: Always -TabWidth: 8 -IndentWidth: 8 -ContinuationIndentWidth: 8 -BreakBeforeBinaryOperators: All -PointerAlignment: Left -BreakBeforeBraces: Custom -ColumnLimit: 79 -BraceWrapping: - AfterFunction: true -AccessModifierOffset: -8 diff --git a/storage/innobase/.clang-format-old b/storage/innobase/.clang-format-old new file mode 100644 index 00000000000..54f7b47bc88 --- /dev/null +++ b/storage/innobase/.clang-format-old @@ -0,0 +1,11 @@ +UseTab: Always +TabWidth: 8 +IndentWidth: 8 +ContinuationIndentWidth: 8 +BreakBeforeBinaryOperators: All +PointerAlignment: Left +BreakBeforeBraces: Custom +ColumnLimit: 79 +BraceWrapping: + AfterFunction: true +AccessModifierOffset: -8 -- cgit v1.2.1 From 14a8b700f3e031700bd49dec8f2dca0ae1786090 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Wed, 17 Mar 2021 14:09:05 +0200 Subject: Cleanup: Remove unused OS_DATA_TEMP_FILE This had been originally added in mysql/mysql-server@192bb153b675fe09037a53e456a79eee7211e3a7 with the motivation to disable O_DIRECT for the dedicated tablespace for temporary tables. In MariaDB Server, commit 5eb539555b36a60944eefeb84d5d6d436ba61e63 (MDEV-12227) should be a better solution. The code became orphaned later in mysql/mysql-server@c61244c0e6c58727cffebfb312ac415a463fa0fe and it had been applied to MariaDB Server 10.2.2 in commit 2e814d4702d71a04388386a9f591d14a35980bfe and commit fec844aca88e1c6b9c36bb0b811e92d9d023ffb9. Thanks to Vladislav Vaintroub for spotting this. --- storage/innobase/include/os0file.h | 3 +-- storage/innobase/os/os0file.cc | 5 ++--- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'storage') diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index 82d1551de6c..23de5bd0ef1 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -2,7 +2,7 @@ Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2009, Percona Inc. -Copyright (c) 2013, 2020, MariaDB Corporation. +Copyright (c) 2013, 2021, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Percona Inc.. Those modifications are @@ -160,7 +160,6 @@ static const ulint OS_FILE_NORMAL = 62; /** Types for file create @{ */ static const ulint OS_DATA_FILE = 100; static const ulint OS_LOG_FILE = 101; -static const ulint OS_DATA_TEMP_FILE = 102; static const ulint OS_DATA_FILE_NO_O_DIRECT = 103; /* @} */ diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index 5c6da7ad51c..17a565c8db2 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -2,7 +2,7 @@ Copyright (c) 1995, 2019, Oracle and/or its affiliates. All Rights Reserved. Copyright (c) 2009, Percona Inc. -Copyright (c) 2013, 2020, MariaDB Corporation. +Copyright (c) 2013, 2021, MariaDB Corporation. Portions of this file contain modifications contributed and copyrighted by Percona Inc.. Those modifications are @@ -3007,7 +3007,6 @@ os_file_create_func( ut_a(type == OS_LOG_FILE || type == OS_DATA_FILE - || type == OS_DATA_TEMP_FILE || type == OS_DATA_FILE_NO_O_DIRECT); ut_a(purpose == OS_FILE_AIO || purpose == OS_FILE_NORMAL); @@ -3055,7 +3054,7 @@ os_file_create_func( /* We disable OS caching (O_DIRECT) only on data files */ if (!read_only && *success - && (type != OS_LOG_FILE && type != OS_DATA_TEMP_FILE + && (type != OS_LOG_FILE && type != OS_DATA_FILE_NO_O_DIRECT) && (srv_file_flush_method == SRV_O_DIRECT || srv_file_flush_method == SRV_O_DIRECT_NO_FSYNC)) { -- cgit v1.2.1 From 00f620b27e960c4b96a8392b27742fd5e41a69e3 Mon Sep 17 00:00:00 2001 From: Vladislav Vaintroub Date: Wed, 17 Mar 2021 12:12:10 +0100 Subject: MDEV-21584 - portability fix This patch implements OS_DATA_FILE_NO_O_DIRECT on Windows. --- storage/innobase/os/os0file.cc | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'storage') diff --git a/storage/innobase/os/os0file.cc b/storage/innobase/os/os0file.cc index 17a565c8db2..d2d5769d85e 100644 --- a/storage/innobase/os/os0file.cc +++ b/storage/innobase/os/os0file.cc @@ -4305,7 +4305,9 @@ os_file_create_func( case SRV_ALL_O_DIRECT_FSYNC: /*Traditional Windows behavior, no buffering for any files.*/ - attributes |= FILE_FLAG_NO_BUFFERING; + if (type != OS_DATA_FILE_NO_O_DIRECT) { + attributes |= FILE_FLAG_NO_BUFFERING; + } break; case SRV_FSYNC: -- cgit v1.2.1 From 6505662c23ba81331d91f65c18e06a759d6f148f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Thu, 18 Mar 2021 12:18:16 +0200 Subject: MDEV-25121: innodb_flush_method=O_DIRECT fails on compressed tables Tests with 4096-byte sector size confirm that it is safe to use O_DIRECT with page_compressed tables. That had been disabled on Linux, in an attempt to fix MDEV-21584 which had been filed for the O_DIRECT problems earlier. The fil_node_t::block_size was being set mostly correctly until commit 10dd290b4b8b8b235c8cf42e100f0a4415629e79 (MDEV-17380) introduced a regression in MariaDB Server 10.4.4. fil_node_t::read_page0(): Initialize fil_node_t::block_size. This will probably make similar code in fil_space_extend_must_retry() redundant, but we play it safe and will not remove that code. Thanks to Vladislav Vaintroub for testing this on Microsoft Windows using an old-fashioned rotational hard disk with 4KiB sector size. Reviewed by: Vladislav Vaintroub --- storage/innobase/fil/fil0fil.cc | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) (limited to 'storage') diff --git a/storage/innobase/fil/fil0fil.cc b/storage/innobase/fil/fil0fil.cc index 5adbd4df69e..659cfa26033 100644 --- a/storage/innobase/fil/fil0fil.cc +++ b/storage/innobase/fil/fil0fil.cc @@ -566,6 +566,10 @@ bool fil_node_t::read_page0(bool first) this->size = ulint(size_bytes / psize); space->committed_size = space->size += this->size; + + if (block_size == 0) { + block_size = os_file_get_block_size(handle, name); + } } else if (space->id != TRX_SYS_SPACE || space->size_in_header) { /* If this is not the first-time open, do nothing. For the system tablespace, we always get invoked as @@ -605,12 +609,15 @@ static bool fil_node_open_file(fil_node_t* node) const bool first_time_open = node->size == 0; - bool o_direct_possible = !FSP_FLAGS_HAS_PAGE_COMPRESSION(space->flags); - if (const ulint ssize = FSP_FLAGS_GET_ZIP_SSIZE(space->flags)) { - compile_time_assert(((UNIV_ZIP_SIZE_MIN >> 1) << 3) == 4096); - if (ssize < 3) { - o_direct_possible = false; - } + ulint type; + compile_time_assert(((UNIV_ZIP_SIZE_MIN >> 1) << 3) == 4096); + switch (FSP_FLAGS_GET_ZIP_SSIZE(space->flags)) { + case 1: + case 2: + type = OS_DATA_FILE_NO_O_DIRECT; + break; + default: + type = OS_DATA_FILE; } if (first_time_open @@ -632,9 +639,7 @@ retry: ? OS_FILE_OPEN_RAW | OS_FILE_ON_ERROR_NO_EXIT : OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT, OS_FILE_AIO, - o_direct_possible - ? OS_DATA_FILE - : OS_DATA_FILE_NO_O_DIRECT, + type, read_only_mode, &success); @@ -668,9 +673,7 @@ retry: ? OS_FILE_OPEN_RAW | OS_FILE_ON_ERROR_NO_EXIT : OS_FILE_OPEN | OS_FILE_ON_ERROR_NO_EXIT, OS_FILE_AIO, - o_direct_possible - ? OS_DATA_FILE - : OS_DATA_FILE_NO_O_DIRECT, + type, read_only_mode, &success); } @@ -3601,11 +3604,22 @@ fil_ibd_create( return(err); } + ulint type; + compile_time_assert(((UNIV_ZIP_SIZE_MIN >> 1) << 3) == 4096); + switch (FSP_FLAGS_GET_ZIP_SSIZE(flags)) { + case 1: + case 2: + type = OS_DATA_FILE_NO_O_DIRECT; + break; + default: + type = OS_DATA_FILE; + } + file = os_file_create( innodb_data_file_key, path, OS_FILE_CREATE | OS_FILE_ON_ERROR_NO_EXIT, OS_FILE_NORMAL, - OS_DATA_FILE, + type, srv_read_only_mode, &success); -- cgit v1.2.1 From c557e9540ab6058713a7c78dfa3df366ea92dd92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marko=20M=C3=A4kel=C3=A4?= Date: Thu, 18 Mar 2021 12:22:11 +0200 Subject: MDEV-10682 Race condition between ANALYZE and STATS_AUTO_RECALC ha_innobase::info_low(): While collecting statistics for ANALYZE TABLE, ensure that dict_stats_process_entry_from_recalc_pool() is not executing on the same table. We observed result differences for the test innodb.innodb_stats because dict_stats_empty_index() was being invoked by the background statistics calculation while ha_innobase::analyze() was executing dict_stats_analyze_index_level(). --- storage/innobase/handler/ha_innodb.cc | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'storage') diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 56602c68187..7478cdf4987 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -14411,6 +14411,13 @@ ha_innobase::info_low( if (dict_stats_is_persistent_enabled(ib_table)) { if (is_analyze) { + row_mysql_lock_data_dictionary( + m_prebuilt->trx); + dict_stats_recalc_pool_del(ib_table); + dict_stats_wait_bg_to_stop_using_table( + ib_table, m_prebuilt->trx); + row_mysql_unlock_data_dictionary( + m_prebuilt->trx); opt = DICT_STATS_RECALC_PERSISTENT; } else { /* This is e.g. 'SHOW INDEXES', fetch @@ -14423,6 +14430,13 @@ ha_innobase::info_low( ret = dict_stats_update(ib_table, opt); + if (opt == DICT_STATS_RECALC_PERSISTENT) { + mutex_enter(&dict_sys->mutex); + ib_table->stats_bg_flag + &= byte(~BG_STAT_SHOULD_QUIT); + mutex_exit(&dict_sys->mutex); + } + if (ret != DB_SUCCESS) { m_prebuilt->trx->op_info = ""; DBUG_RETURN(HA_ERR_GENERIC); -- cgit v1.2.1