From fc7da4dd4f1e2b9b78b292f20d8fe61f1e9a1d11 Mon Sep 17 00:00:00 2001 From: Aleksey Midenkov Date: Mon, 20 Feb 2017 10:06:58 +0300 Subject: IB, SQL: InnoDB partitioning [closes #118] * native InnoDB partitioning for BY SYSTEM_TIME partitions. --- sql/CMakeLists.txt | 4 +- sql/ha_partition.cc | 23 +- sql/ha_partition.h | 117 +- sql/handler.cc | 6 + sql/handler.h | 27 +- sql/partition_info.cc | 192 +- sql/partition_info.h | 49 +- sql/partitioning/partition_handler.cc | 3746 +++++++++++++++++++++++++++++++++ sql/partitioning/partition_handler.h | 1113 ++++++++++ sql/share/errmsg-utf8.txt | 9 + sql/sql_partition.cc | 142 +- sql/sql_partition.h | 34 + sql/sql_table.cc | 5 +- sql/sql_tablespace.cc | 64 + sql/sql_tablespace.h | 35 + sql/table.cc | 14 + 16 files changed, 5533 insertions(+), 47 deletions(-) create mode 100644 sql/partitioning/partition_handler.cc create mode 100644 sql/partitioning/partition_handler.h (limited to 'sql') diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt index 1dfa313a70c..08a39b1975d 100644 --- a/sql/CMakeLists.txt +++ b/sql/CMakeLists.txt @@ -121,7 +121,7 @@ SET (SQL_SOURCE rpl_tblmap.cc sql_binlog.cc event_scheduler.cc event_data_objects.cc event_queue.cc event_db_repository.cc sql_tablespace.cc events.cc ../sql-common/my_user.c - partition_info.cc rpl_utility.cc rpl_injector.cc sql_locale.cc + partition_info.cc partitioning/partition_handler.cc rpl_utility.cc rpl_injector.cc sql_locale.cc rpl_rli.cc rpl_mi.cc sql_servers.cc sql_audit.cc sql_connect.cc scheduler.cc sql_partition_admin.cc sql_profile.cc event_parse_data.cc sql_alter.cc @@ -165,7 +165,7 @@ IF (CMAKE_SYSTEM_NAME MATCHES "Linux" OR ENDIF() -MYSQL_ADD_PLUGIN(partition ha_partition.cc STORAGE_ENGINE DEFAULT STATIC_ONLY +MYSQL_ADD_PLUGIN(partition ha_partition.cc partitioning/partition_handler.cc STORAGE_ENGINE DEFAULT STATIC_ONLY RECOMPILE_FOR_EMBEDDED) MYSQL_ADD_PLUGIN(sql_sequence ha_sequence.cc STORAGE_ENGINE MANDATORY STATIC_ONLY RECOMPILE_FOR_EMBEDDED) diff --git 
a/sql/ha_partition.cc b/sql/ha_partition.cc index 0fa461e1807..747b9a8871f 100644 --- a/sql/ha_partition.cc +++ b/sql/ha_partition.cc @@ -160,9 +160,6 @@ static int partition_initialize(void *p) bool Partition_share::init(uint num_parts) { DBUG_ENTER("Partition_share::init"); - mysql_mutex_init(key_partition_auto_inc_mutex, - &auto_inc_mutex, - MY_MUTEX_INIT_FAST); auto_inc_initialized= false; partition_name_hash_initialized= false; next_auto_inc_val= 0; @@ -1246,12 +1243,12 @@ int ha_partition::handle_opt_part(THD *thd, HA_CHECK_OPT *check_opt, (modelled after mi_check_print_msg) TODO: move this into the handler, or rewrite mysql_admin_table. */ -static bool print_admin_msg(THD* thd, uint len, +bool print_admin_msg(THD* thd, uint len, const char* msg_type, const char* db_name, String &table_name, const char* op_name, const char *fmt, ...) ATTRIBUTE_FORMAT(printf, 7, 8); -static bool print_admin_msg(THD* thd, uint len, +bool print_admin_msg(THD* thd, uint len, const char* msg_type, const char* db_name, String &table_name, const char* op_name, const char *fmt, ...) @@ -5731,6 +5728,22 @@ int ha_partition::index_next_same(uchar *buf, const uchar *key, uint keylen) } +int ha_partition::index_read_last_map(uchar *buf, + const uchar *key, + key_part_map keypart_map) +{ + DBUG_ENTER("ha_partition::index_read_last_map"); + + m_ordered= true; // Safety measure + end_range= NULL; + m_index_scan_type= partition_index_read_last; + m_start_key.key= key; + m_start_key.keypart_map= keypart_map; + m_start_key.flag= HA_READ_PREFIX_LAST; + DBUG_RETURN(common_index_read(buf, true)); +} + + /* Read next record when performing index scan backwards diff --git a/sql/ha_partition.h b/sql/ha_partition.h index 2c7f4a0861f..861ba47b94e 100644 --- a/sql/ha_partition.h +++ b/sql/ha_partition.h @@ -77,43 +77,118 @@ public: }; +extern PSI_mutex_key key_partition_auto_inc_mutex; + /** Partition specific Handler_share. 
*/ class Partition_share : public Handler_share { public: - bool auto_inc_initialized; - mysql_mutex_t auto_inc_mutex; /**< protecting auto_inc val */ - ulonglong next_auto_inc_val; /**< first non reserved value */ - /** - Hash of partition names. Initialized in the first ha_partition::open() - for the table_share. After that it is read-only, i.e. no locking required. - */ - bool partition_name_hash_initialized; - HASH partition_name_hash; - /** Storage for each partitions Handler_share */ - Parts_share_refs *partitions_share_refs; - Partition_share() {} + Partition_share() + : auto_inc_initialized(false), + next_auto_inc_val(0), + partition_name_hash_initialized(false), + partitions_share_refs(NULL), + partition_names(NULL) + { + mysql_mutex_init(key_partition_auto_inc_mutex, + &auto_inc_mutex, + MY_MUTEX_INIT_FAST); + } + ~Partition_share() { - DBUG_ENTER("Partition_share::~Partition_share"); mysql_mutex_destroy(&auto_inc_mutex); + if (partition_names) + { + my_free(partition_names); + } if (partition_name_hash_initialized) + { my_hash_free(&partition_name_hash); + } if (partitions_share_refs) delete partitions_share_refs; - DBUG_VOID_RETURN; } + bool init(uint num_parts); - void lock_auto_inc() + + /** Set if auto increment is used an initialized. */ + bool auto_inc_initialized; + /** + Mutex protecting next_auto_inc_val. + Initialized if table uses auto increment. + */ + mysql_mutex_t auto_inc_mutex; + /** First non reserved auto increment value. */ + ulonglong next_auto_inc_val; + /** + Hash of partition names. Initialized by the first handler instance of a + table_share calling populate_partition_name_hash(). + After that it is read-only, i.e. no locking required for reading. + */ + HASH partition_name_hash; + /** flag that the name hash is initialized, so it only will do it once. 
*/ + bool partition_name_hash_initialized; + + /** Storage for each partitions Handler_share */ + Parts_share_refs *partitions_share_refs; + + /** + Release reserved auto increment values not used. + @param thd Thread. + @param table_share Table Share + @param next_insert_id Next insert id (first non used auto inc value). + @param max_reserved End of reserved auto inc range. + */ + void release_auto_inc_if_possible(THD *thd, TABLE_SHARE *table_share, + const ulonglong next_insert_id, + const ulonglong max_reserved); + + /** lock mutex protecting auto increment value next_auto_inc_val. */ + inline void lock_auto_inc() { mysql_mutex_lock(&auto_inc_mutex); } - void unlock_auto_inc() + /** unlock mutex protecting auto increment value next_auto_inc_val. */ + inline void unlock_auto_inc() { mysql_mutex_unlock(&auto_inc_mutex); } + /** + Populate partition_name_hash with partition and subpartition names + from part_info. + @param part_info Partition info containing all partitions metadata. + + @return Operation status. + @retval false Success. + @retval true Failure. + */ + bool populate_partition_name_hash(partition_info *part_info); + /** Get partition name. + + @param part_id Partition id (for subpartitioned table only subpartition + names will be returned.) + + @return partition name or NULL if error. + */ + const char *get_partition_name(size_t part_id) const; +private: + const uchar **partition_names; + /** + Insert [sub]partition name into partition_name_hash + @param name Partition name. + @param part_id Partition id. + @param is_subpart True if subpartition else partition. + + @return Operation status. + @retval false Success. + @retval true Failure. 
+ */ + bool insert_partition_name_in_hash(const char *name, + uint part_id, + bool is_subpart); }; @@ -605,6 +680,10 @@ public: virtual int index_last(uchar * buf); virtual int index_next_same(uchar * buf, const uchar * key, uint keylen); + int index_read_last_map(uchar *buf, + const uchar *key, + key_part_map keypart_map); + /* read_first_row is virtual method but is only implemented by handler.cc, no storage engine has implemented it so neither @@ -1086,7 +1165,6 @@ private: ulonglong nr= (((Field_num*) field)->unsigned_flag || field->val_int() > 0) ? field->val_int() : 0; lock_auto_increment(); - DBUG_ASSERT(part_share->auto_inc_initialized); /* must check when the mutex is taken */ if (nr >= part_share->next_auto_inc_val) part_share->next_auto_inc_val= nr + 1; @@ -1310,4 +1388,9 @@ public: friend int cmp_key_rowid_part_id(void *ptr, uchar *ref1, uchar *ref2); }; +bool print_admin_msg(THD* thd, uint len, + const char* msg_type, + const char* db_name, String &table_name, + const char* op_name, const char *fmt, ...); + #endif /* HA_PARTITION_INCLUDED */ diff --git a/sql/handler.cc b/sql/handler.cc index c19d04236d7..ba947fd7a2d 100644 --- a/sql/handler.cc +++ b/sql/handler.cc @@ -2435,6 +2435,12 @@ LEX_STRING *handler::engine_name() } +void handler::ha_statistic_increment(ulong SSV::*offset) const +{ + (table->in_use->status_var.*offset)++; +} + + double handler::keyread_time(uint index, uint ranges, ha_rows rows) { /* diff --git a/sql/handler.h b/sql/handler.h index e20f95df1f3..f5e3d83d8d9 100644 --- a/sql/handler.h +++ b/sql/handler.h @@ -1393,6 +1393,7 @@ struct handlerton bool (*vers_query_trx_id)(THD* thd, void *out, ulonglong trx_id, vtq_field_t field); bool (*vers_query_commit_ts)(THD* thd, void *out, const MYSQL_TIME &commit_ts, vtq_field_t field, bool backwards); bool (*vers_trx_sees)(THD *thd, bool &result, ulonglong trx_id1, ulonglong trx_id0, ulonglong commit_id1, uchar iso_level1, ulonglong commit_id0); + handler *(*vers_upgrade_handler)(handler 
*hnd, MEM_ROOT *mem_root); }; @@ -3271,6 +3272,18 @@ protected: virtual int index_last(uchar * buf) { return HA_ERR_WRONG_COMMAND; } virtual int index_next_same(uchar *buf, const uchar *key, uint keylen); + /** + @brief + The following functions works like index_read, but it find the last + row with the current key value or prefix. + @returns @see index_read_map(). + */ + virtual int index_read_last_map(uchar * buf, const uchar * key, + key_part_map keypart_map) + { + uint key_len= calculate_key_len(table, active_index, key, keypart_map); + return index_read_last(buf, key, key_len); + } virtual int close(void)=0; inline void update_rows_read() { @@ -3350,7 +3363,7 @@ public: void ft_end() { ft_handler=NULL; } virtual FT_INFO *ft_init_ext(uint flags, uint inx,String *key) { return NULL; } -private: +public: virtual int ft_read(uchar *buf) { return HA_ERR_WRONG_COMMAND; } virtual int rnd_next(uchar *buf)=0; virtual int rnd_pos(uchar * buf, uchar *pos)=0; @@ -4057,6 +4070,7 @@ public: TABLE_SHARE* get_table_share() { return table_share; } protected: /* Service methods for use by storage engines. 
*/ + void ha_statistic_increment(ulong SSV::*offset) const; void **ha_data(THD *) const; THD *ha_thd(void) const; @@ -4082,7 +4096,7 @@ protected: public: bool check_table_binlog_row_based(bool binlog_row); -private: + /* Cache result to avoid extra calls */ inline void mark_trx_read_write() { @@ -4092,6 +4106,8 @@ private: mark_trx_read_write_internal(); } } + +private: void mark_trx_read_write_internal(); bool check_table_binlog_row_based_internal(bool binlog_row); @@ -4210,6 +4226,11 @@ protected: virtual int index_read(uchar * buf, const uchar * key, uint key_len, enum ha_rkey_function find_flag) { return HA_ERR_WRONG_COMMAND; } + virtual int index_read_last(uchar * buf, const uchar * key, uint key_len) + { + my_errno= HA_ERR_WRONG_COMMAND; + return HA_ERR_WRONG_COMMAND; + } friend class ha_partition; friend class ha_sequence; public: @@ -4340,6 +4361,8 @@ public: { DBUG_ASSERT(0); return false; } virtual handler* part_handler(uint32 part_id) { DBUG_ASSERT(0); return NULL; } + virtual void update_partition(uint part_id) + {} protected: Handler_share *get_ha_share_ptr(); void set_ha_share_ptr(Handler_share *arg_ha_share); diff --git a/sql/partition_info.cc b/sql/partition_info.cc index f45b45548b0..c1a792c87e0 100644 --- a/sql/partition_info.cc +++ b/sql/partition_info.cc @@ -208,6 +208,48 @@ bool partition_info::set_named_partition_bitmap(const char *part_name, +/** + Prune away partitions not mentioned in the PARTITION () clause, + if used. + + @param table_list Table list pointing to table to prune. 
+ + @return Operation status + @retval false Success + @retval true Failure +*/ +bool partition_info::set_read_partitions(List *partition_names) +{ + DBUG_ENTER("partition_info::set_read_partitions"); + if (!partition_names || !partition_names->elements) + { + DBUG_RETURN(true); + } + + uint num_names= partition_names->elements; + List_iterator partition_names_it(*partition_names); + uint i= 0; + /* + TODO: When adding support for FK in partitioned tables, the referenced + table must probably lock all partitions for read, and also write depending + of ON DELETE/UPDATE. + */ + bitmap_clear_all(&read_partitions); + + /* No check for duplicate names or overlapping partitions/subpartitions. */ + + DBUG_PRINT("info", ("Searching through partition_name_hash")); + do + { + char *part_name= partition_names_it++; + if (add_named_partition(part_name, strlen(part_name))) + DBUG_RETURN(true); + } while (++i < num_names); + DBUG_RETURN(false); +} + + + /** Prune away partitions not mentioned in the PARTITION () clause, if used. 
@@ -989,13 +1031,22 @@ bool partition_info::vers_scan_min_max(THD *thd, partition_element *part) uint32 part_id= part->id * sub_factor; uint32 part_id_end= part_id + sub_factor; DBUG_ASSERT(part->empty); + DBUG_ASSERT(part->type == partition_element::VERSIONING); DBUG_ASSERT(table->s->stat_trx); for (; part_id < part_id_end; ++part_id) { - handler *file= table->file->part_handler(part_id); - int rc= file->ha_external_lock(thd, F_RDLCK); + handler *file= table->file->part_handler(part_id); // requires update_partition() for ha_innopart + int rc= file->ha_external_lock(thd, F_RDLCK); // requires ha_commit_trans() for ha_innobase if (rc) - goto error; + { + file->update_partition(part_id); + goto lock_fail; + } + + table->default_column_bitmaps(); + bitmap_set_bit(table->read_set, table->vers_end_field()->field_index); + file->column_bitmaps_signal(); + rc= file->ha_rnd_init(true); if (!rc) { @@ -1006,6 +1057,8 @@ bool partition_info::vers_scan_min_max(THD *thd, partition_element *part) if (thd->killed) { file->ha_rnd_end(); + file->update_partition(part_id); + ha_commit_trans(thd, false); return true; } if (rc) @@ -1014,18 +1067,44 @@ bool partition_info::vers_scan_min_max(THD *thd, partition_element *part) continue; break; } - vers_stat_trx(STAT_TRX_END, part).update_unguarded(table->vers_end_field()); + if (table->vers_end_field()->is_max()) + { + rc= HA_ERR_INTERNAL_ERROR; + push_warning_printf(thd, + Sql_condition::WARN_LEVEL_WARN, + WARN_VERS_PART_NON_HISTORICAL, + ER_THD(thd, WARN_VERS_PART_NON_HISTORICAL), + part->partition_name); + break; + } + if (table->versioned_by_engine()) + { + uchar buf[8]; + Field_timestampf fld(buf, NULL, 0, Field::NONE, table->vers_end_field()->field_name, NULL, 6); + if (!vers_trx_id_to_ts(thd, table->vers_end_field(), fld)) + { + vers_stat_trx(STAT_TRX_END, part).update_unguarded(&fld); + } + } + else + { + vers_stat_trx(STAT_TRX_END, part).update_unguarded(table->vers_end_field()); + } } file->ha_rnd_end(); } 
file->ha_external_lock(thd, F_UNLCK); + file->update_partition(part_id); if (rc != HA_ERR_END_OF_FILE) { - error: - my_error(ER_INTERNAL_ERROR, MYF(0), "partition/subpartition scan failed in versioned partitions setup"); + ha_commit_trans(thd, false); + lock_fail: + // TODO: print rc code + my_error(ER_INTERNAL_ERROR, MYF(0), "min/max scan failed in versioned partitions setup (see warnings)"); return true; } } + ha_commit_trans(thd, false); return false; } @@ -1073,11 +1152,9 @@ bool partition_info::vers_setup_2(THD * thd, bool is_create_table_ind) DBUG_ASSERT(part_type == VERSIONING_PARTITION); DBUG_ASSERT(vers_info && vers_info->initialized(false)); DBUG_ASSERT(table && table->s); - if (!table->versioned_by_sql()) - { - my_error(ER_VERS_WRONG_PARAMS, MYF(0), table->s->table_name.str, "selected engine is not supported in `BY SYSTEM_TIME` partitioning"); - return true; - } + + bool error= false; + mysql_mutex_lock(&table->s->LOCK_rotation); if (table->s->busy_rotation) { @@ -1124,8 +1201,19 @@ bool partition_info::vers_setup_2(THD * thd, bool is_create_table_ind) if (!is_create_table_ind) { - if (vers_scan_min_max(thd, el)) - return true; + if (el->type == partition_element::AS_OF_NOW) + { + uchar buf[8]; + Field_timestampf fld(buf, NULL, 0, Field::NONE, table->vers_end_field()->field_name, NULL, 6); + fld.set_max(); + vers_stat_trx(STAT_TRX_END, el).update_unguarded(&fld); + el->empty= false; + } + else if (vers_scan_min_max(thd, el)) + { + error= true; + break; + } if (!el->empty) { vers_update_col_vals(thd, prev, el); @@ -1151,7 +1239,7 @@ bool partition_info::vers_setup_2(THD * thd, bool is_create_table_ind) } } // while - if (!dont_stat) + if (!error && !dont_stat) { if (col_val_updated) table->s->stat_serial++; @@ -1165,7 +1253,7 @@ bool partition_info::vers_setup_2(THD * thd, bool is_create_table_ind) table->s->busy_rotation= false; } mysql_mutex_unlock(&table->s->LOCK_rotation); - return false; + return error; } @@ -3262,6 +3350,80 @@ bool 
partition_info::has_same_partitioning(partition_info *new_part_info) } +static bool has_same_column_order(List *create_list, + Field** field_array) +{ + Field **f_ptr; + List_iterator_fast new_field_it; + Create_field *new_field= NULL; + new_field_it.init(*create_list); + + for (f_ptr= field_array; *f_ptr; f_ptr++) + { + while ((new_field= new_field_it++)) + { + if (new_field->field == *f_ptr) + break; + } + if (!new_field) + break; + } + + if (!new_field) + { + /* Not same order!*/ + return false; + } + return true; +} + +bool partition_info::vers_trx_id_to_ts(THD* thd, Field* in_trx_id, Field_timestamp& out_ts) +{ + handlerton *hton= plugin_hton(table->s->db_plugin); + DBUG_ASSERT(hton); + ulonglong trx_id= in_trx_id->val_int(); + MYSQL_TIME ts; + bool found= hton->vers_query_trx_id(thd, &ts, trx_id, VTQ_COMMIT_TS); + if (!found) + { + push_warning_printf(thd, + Sql_condition::WARN_LEVEL_WARN, + WARN_VERS_TRX_MISSING, + ER_THD(thd, WARN_VERS_TRX_MISSING), + trx_id); + return true; + } + out_ts.store_time_dec(&ts, 6); + return false; +} + + +/** + Check if the partitioning columns are in the same order as the given list. + + Used to see if INPLACE alter can be allowed or not. If the order is + different then the rows must be redistributed for KEY [sub]partitioning. + + @param[in] create_list Column list after ALTER TABLE. + @return true is same order as before ALTER TABLE, else false. +*/ +bool partition_info::same_key_column_order(List *create_list) +{ + /* Only need to check for KEY [sub] partitioning. 
*/ + if (list_of_part_fields && !column_list) + { + if (!has_same_column_order(create_list, part_field_array)) + return false; + } + if (list_of_subpart_fields) + { + if (!has_same_column_order(create_list, subpart_field_array)) + return false; + } + return true; +} + + void partition_info::print_debug(const char *str, uint *value) { DBUG_ENTER("print_debug"); diff --git a/sql/partition_info.h b/sql/partition_info.h index 5a671bfc50f..ef20564837c 100644 --- a/sql/partition_info.h +++ b/sql/partition_info.h @@ -22,6 +22,7 @@ #include "sql_class.h" #include "partition_element.h" +#include "sql_partition.h" class partition_info; struct TABLE_LIST; @@ -382,6 +383,28 @@ public: uint32 *part_id); void report_part_expr_error(bool use_subpart_expr); bool has_same_partitioning(partition_info *new_part_info); + inline bool is_partition_used(uint part_id) const + { + return bitmap_is_set(&read_partitions, part_id); + } + inline bool is_partition_locked(uint part_id) const + { + return bitmap_is_set(&lock_partitions, part_id); + } + inline uint num_partitions_used() + { + return bitmap_bits_set(&read_partitions); + } + inline uint get_first_used_partition() const + { + return bitmap_get_first_set(&read_partitions); + } + inline uint get_next_used_partition(uint part_id) const + { + return bitmap_get_next_set(&read_partitions, part_id); + } + bool same_key_column_order(List *create_list); + private: static int list_part_cmp(const void* a, const void* b); bool set_up_default_partitions(THD *thd, handler *file, HA_CREATE_INFO *info, @@ -392,9 +415,11 @@ private: uint start_no); char *create_default_subpartition_name(THD *thd, uint subpart_no, const char *part_name); + // FIXME: prune_partition_bitmaps() is duplicate of set_read_partitions() bool prune_partition_bitmaps(TABLE_LIST *table_list); bool add_named_partition(const char *part_name, uint length); public: + bool set_read_partitions(List *partition_names); bool has_unique_name(partition_element *element); bool 
vers_init_info(THD *thd); @@ -475,8 +500,8 @@ public: DBUG_ASSERT(vers_info->initialized()); part= vers_hist_part(); } - max_time-= vers_stat_trx(STAT_TRX_END, part).min_time(); - return max_time > vers_info->interval; + my_time_t min_time= vers_stat_trx(STAT_TRX_END, part).min_time(); + return max_time - min_time > vers_info->interval; } bool vers_interval_exceed(partition_element *part) { @@ -486,15 +511,31 @@ public: { return vers_interval_exceed(vers_hist_part()); } + bool vers_trx_id_to_ts(THD *thd, Field *in_trx_id, Field_timestamp &out_ts); void vers_update_stats(THD *thd, partition_element *el) { DBUG_ASSERT(vers_info && vers_info->initialized()); DBUG_ASSERT(table && table->s); DBUG_ASSERT(el && el->type == partition_element::VERSIONING); + bool updated; mysql_rwlock_wrlock(&table->s->LOCK_stat_serial); el->empty= false; - bool updated= - vers_stat_trx(STAT_TRX_END, el->id).update(table->vers_end_field()); + if (table->versioned_by_engine()) + { + // transaction is not yet pushed to VTQ, so we use now-time + my_time_t end_ts= my_time(0); + + uchar buf[8]; + Field_timestampf fld(buf, NULL, 0, Field::NONE, table->vers_end_field()->field_name, NULL, 6); + fld.store_TIME(end_ts, 0); + updated= + vers_stat_trx(STAT_TRX_END, el->id).update(&fld); + } + else + { + updated= + vers_stat_trx(STAT_TRX_END, el->id).update(table->vers_end_field()); + } if (updated) table->s->stat_serial++; mysql_rwlock_unlock(&table->s->LOCK_stat_serial); diff --git a/sql/partitioning/partition_handler.cc b/sql/partitioning/partition_handler.cc new file mode 100644 index 00000000000..1e04439e100 --- /dev/null +++ b/sql/partitioning/partition_handler.cc @@ -0,0 +1,3746 @@ +/* + Copyright (c) 2005, 2016, Oracle and/or its affiliates. All rights reserved. + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; version 2 of + the License. 
+ + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include "table.h" // TABLE_SHARE +#include "sql_partition.h" // LIST_PART_ENTRY, part_id_range +#include "partition_info.h" // NOT_A_PARTITION_ID +#include "partition_handler.h" +#include "log.h" // sql_print_error +#include "key.h" // key_rec_cmp +#include "sql_class.h" // THD +#include + +#define MI_MAX_MSG_BUF 1024 + +// In sql_class.cc: +extern "C" int thd_binlog_format(const MYSQL_THD thd); + +/** operation names for the enum_part_operation. */ +static const char *opt_op_name[]= {"optimize", "analyze", "check", "repair", + "assign_to_keycache", "preload_keys"}; + +// static PSI_memory_key key_memory_Partition_share; +// static PSI_memory_key key_memory_partition_sort_buffer; +// static PSI_memory_key key_memory_Partition_admin; +#ifdef HAVE_PSI_INTERFACE +extern PSI_mutex_key key_partition_auto_inc_mutex; +// static PSI_memory_info all_partitioning_memory[]= +// { { &key_memory_Partition_share, "Partition_share", 0}, +// { &key_memory_partition_sort_buffer, "partition_sort_buffer", 0}, +// { &key_memory_Partition_admin, "Partition_admin", 0} }; +static PSI_mutex_info all_partitioning_mutex[]= +{ { &key_partition_auto_inc_mutex, "Partiton_share::auto_inc_mutex", 0} }; +#endif + +void partitioning_init() +{ +#ifdef HAVE_PSI_INTERFACE + int count; +// count= array_elements(all_partitioning_memory); +// mysql_memory_register("sql", all_partitioning_memory, count); + count= array_elements(all_partitioning_mutex); + mysql_mutex_register("sql", all_partitioning_mutex, count); +#endif +} + + +/** + Release reserved 
auto increment values not used. + @param thd Thread. + @param table_share Table Share + @param next_insert_id Next insert id (first non used auto inc value). + @param max_reserved End of reserved auto inc range. +*/ +void +Partition_share::release_auto_inc_if_possible(THD *thd, TABLE_SHARE *table_share, + const ulonglong next_insert_id, + const ulonglong max_reserved) +{ +#ifndef DBUG_OFF + if (table_share->tmp_table == NO_TMP_TABLE) + { + mysql_mutex_assert_owner(&auto_inc_mutex); + } +#endif /* DBUG_OFF */ + + /* + If the current auto_increment values is lower than the reserved value (1) + and the reserved value was reserved by this thread (2), then we can + lower the reserved value. + However, we cannot lower the value if there are forced/non generated + values from 'SET INSERT_ID = forced_val' (3). */ + if (next_insert_id < next_auto_inc_val && // (1) + max_reserved >= next_auto_inc_val && // (2) + thd->auto_inc_intervals_forced.maximum() < next_insert_id) // (3) + { + next_auto_inc_val= next_insert_id; + } +} + + +/** + Get the partition name. + + @param part Struct containing name and length + @param[out] length Length of the name + + @return Partition name +*/ + +static uchar *get_part_name_from_def(PART_NAME_DEF *part, + size_t *length, + my_bool not_used MY_ATTRIBUTE((unused))) +{ + *length= part->length; + return part->partition_name; +} + + +/** + Populate the partition_name_hash in part_share. +*/ + +bool Partition_share::populate_partition_name_hash(partition_info *part_info) +{ + uint tot_names; + uint num_subparts= part_info->num_subparts; + DBUG_ENTER("Partition_share::populate_partition_name_hash"); + DBUG_ASSERT(!part_info->is_sub_partitioned() || num_subparts); + + if (num_subparts == 0) + { + num_subparts= 1; + } + + /* + TABLE_SHARE::LOCK_ha_data must been locked before calling this function. + This ensures only one thread/table instance will execute this. 
+ */ + +#ifndef DBUG_OFF + if (part_info->table->s->tmp_table == NO_TMP_TABLE) + { + mysql_mutex_assert_owner(&part_info->table->s->LOCK_ha_data); + } +#endif + if (partition_name_hash_initialized) + { + DBUG_RETURN(false); + } + tot_names= part_info->num_parts; + if (part_info->is_sub_partitioned()) + { + tot_names+= part_info->num_parts * num_subparts; + } + partition_names= static_cast(my_malloc( + part_info->get_tot_partitions() * + sizeof(*partition_names), + MYF(MY_WME))); + if (!partition_names) + { + DBUG_RETURN(true); + } + if (my_hash_init(&partition_name_hash, + system_charset_info, tot_names, 0, 0, + (my_hash_get_key) get_part_name_from_def, + my_free, HASH_UNIQUE)) + { + my_free(partition_names); + partition_names= NULL; + DBUG_RETURN(true); + } + + List_iterator part_it(part_info->partitions); + uint i= 0; + do + { + partition_element *part_elem= part_it++; + DBUG_ASSERT(part_elem->part_state == PART_NORMAL); + if (part_elem->part_state == PART_NORMAL) + { + if (insert_partition_name_in_hash(part_elem->partition_name, + i * num_subparts, + false)) + goto err; + if (part_info->is_sub_partitioned()) + { + List_iterator + subpart_it(part_elem->subpartitions); + partition_element *sub_elem; + uint j= 0; + do + { + sub_elem= subpart_it++; + if (insert_partition_name_in_hash(sub_elem->partition_name, + i * num_subparts + j, true)) + goto err; + + } while (++j < num_subparts); + } + } + } while (++i < part_info->num_parts); + + for (i= 0; i < tot_names; i++) + { + PART_NAME_DEF *part_def; + part_def= reinterpret_cast( + my_hash_element(&partition_name_hash, i)); + if (part_def->is_subpart == part_info->is_sub_partitioned()) + { + partition_names[part_def->part_id]= part_def->partition_name; + } + } + partition_name_hash_initialized= true; + + DBUG_RETURN(false); +err: + my_hash_free(&partition_name_hash); + my_free(partition_names); + partition_names= NULL; + + DBUG_RETURN(true); +} + + +/** + Insert a partition name in the partition_name_hash. 
+ + @param name Name of partition + @param part_id Partition id (number) + @param is_subpart Set if the name belongs to a subpartition + + @return Operation status + @retval true Failure + @retval false Success +*/ + +bool Partition_share::insert_partition_name_in_hash(const char *name, + uint part_id, + bool is_subpart) +{ + PART_NAME_DEF *part_def; + uchar *part_name; + uint part_name_length; + DBUG_ENTER("Partition_share::insert_partition_name_in_hash"); + /* + Calculate and store the length here, to avoid doing it when + searching the hash. + */ + part_name_length= static_cast(strlen(name)); + /* + Must use memory that lives as long as table_share. + Freed in the Partition_share destructor. + Since we use my_multi_malloc, then my_free(part_def) will also free + part_name, as a part of my_hash_free. + */ + if (!my_multi_malloc(MY_WME, + &part_def, sizeof(PART_NAME_DEF), + &part_name, part_name_length + 1, + NULL)) + { + DBUG_RETURN(true); + } + memcpy(part_name, name, part_name_length + 1); + part_def->partition_name= part_name; + part_def->length= part_name_length; + part_def->part_id= part_id; + part_def->is_subpart= is_subpart; + if (my_hash_insert(&partition_name_hash, (uchar *) part_def)) + { + my_free(part_def); + DBUG_RETURN(true); + } + DBUG_RETURN(false); +} + + +const char *Partition_share::get_partition_name(size_t part_id) const +{ + if (partition_names == NULL) + { + return NULL; + } + return reinterpret_cast(partition_names[part_id]); +} +/* + Implementation of Partition_helper class. +*/ +Partition_helper::Partition_helper(handler *main_handler) + : + m_handler(main_handler), + m_part_info(), + m_tot_parts(), + m_last_part(), + m_err_rec(), + m_ordered(), + m_ordered_scan_ongoing(), + m_ordered_rec_buffer(), + m_queue() +{} + + +Partition_helper::~Partition_helper() +{ + DBUG_ASSERT(m_ordered_rec_buffer == NULL); + DBUG_ASSERT(m_key_not_found_partitions.bitmap == NULL); +} + + +/** + Set partition info. + + To be called from Partition_handler. 
+ + @param part_info Partition info to use. + @param early True if called when part_info only created and parsed, + but not setup, checked or fixed. + */ +void Partition_helper::set_part_info_low(partition_info *part_info, + bool early) +{ + /* + ha_partition will set m_tot_parts from the .par file during creating + the new handler. + And this call can be earlier than the partition_default_handling(), + so get_tot_partitions() may return zero. + */ + if (m_tot_parts == 0 && + (m_part_info == NULL || !early)) + { + m_tot_parts= part_info->get_tot_partitions(); + } + m_part_info= part_info; + m_is_sub_partitioned= m_part_info->is_sub_partitioned(); +} + +/** + Initialize the partitioning helper for use after the table is opened. + + @param part_share Partitioning share (used for auto increment). + + @return Operation status. + @retval false for success otherwise true. +*/ + +bool Partition_helper::open_partitioning(Partition_share *part_share) +{ + m_table= get_table(); + DBUG_ASSERT(m_part_info == m_table->part_info); + m_part_share= part_share; + m_tot_parts= m_part_info->get_tot_partitions(); + if (bitmap_init(&m_key_not_found_partitions, NULL, m_tot_parts, false)) + { + return true; + } + bitmap_clear_all(&m_key_not_found_partitions); + m_key_not_found= false; + m_is_sub_partitioned= m_part_info->is_sub_partitioned(); + m_auto_increment_lock= false; + m_auto_increment_safe_stmt_log_lock= false; + m_pkey_is_clustered= m_handler->primary_key_is_clustered(); + m_part_spec.start_part= NOT_A_PARTITION_ID; + m_part_spec.end_part= NOT_A_PARTITION_ID; + m_index_scan_type= PARTITION_NO_INDEX_SCAN; + m_start_key.key= NULL; + m_start_key.length= 0; + m_scan_value= 3; + m_reverse_order= false; + m_curr_key_info[0]= NULL; + m_curr_key_info[1]= NULL; + m_curr_key_info[2]= NULL; + m_top_entry= NO_CURRENT_PART_ID; + m_ref_usage= REF_NOT_USED; + m_rec_length= m_table->s->reclength; + return false; +} + + +void Partition_helper::close_partitioning() +{ + 
bitmap_free(&m_key_not_found_partitions); + DBUG_ASSERT(!m_ordered_rec_buffer); + destroy_record_priority_queue(); +} + +/**************************************************************************** + MODULE change record +****************************************************************************/ + +/** + Insert a row to the partitioned table. + + @param buf The row in MySQL Row Format. + + @return Operation status. + @retval 0 Success + @retval != 0 Error code +*/ + +int Partition_helper::ph_write_row(uchar *buf) +{ + uint32 part_id; + int error; + longlong func_value; + bool have_auto_increment= m_table->next_number_field && + buf == m_table->record[0]; + THD *thd= get_thd(); + sql_mode_t saved_sql_mode= thd->variables.sql_mode; + bool saved_auto_inc_field_not_null= m_table->auto_increment_field_not_null; +#ifndef DBUG_OFF + my_bitmap_map *old_map; +#endif /* DBUG_OFF */ + DBUG_ENTER("Partition_helper::ph_write_row"); + DBUG_ASSERT(buf == m_table->record[0]); + + /* + If we have an auto_increment column and we are writing a changed row + or a new row, then update the auto_increment value in the record. + */ + if (have_auto_increment) + { + error= m_handler->update_auto_increment(); + + /* + If we have failed to set the auto-increment value for this row, + it is highly likely that we will not be able to insert it into + the correct partition. We must check and fail if neccessary. + */ + if (error) + DBUG_RETURN(error); + + /* + Don't allow generation of auto_increment value the partitions handler. + If a partitions handler would change the value, then it might not + match the partition any longer. + This can occur if 'SET INSERT_ID = 0; INSERT (NULL)', + So allow this by adding 'MODE_NO_AUTO_VALUE_ON_ZERO' to sql_mode. + The partitions handler::next_insert_id must always be 0. Otherwise + we need to forward release_auto_increment, or reset it for all + partitions. 
+ */ + if (m_table->next_number_field->val_int() == 0) + { + m_table->auto_increment_field_not_null= TRUE; + thd->variables.sql_mode|= MODE_NO_AUTO_VALUE_ON_ZERO; + } + } + +#ifndef DBUG_OFF + /* Temporary mark the partitioning fields as readable. */ + old_map= dbug_tmp_use_all_columns(m_table, m_table->read_set); +#endif /* DBUG_OFF */ + + error= m_part_info->get_partition_id(m_part_info, &part_id, &func_value); + +#ifndef DBUG_OFF + dbug_tmp_restore_column_map(m_table->read_set, old_map); +#endif /* DBUG_OFF */ + + if (unlikely(error)) + { + m_part_info->err_value= func_value; + goto exit; + } + if (!m_part_info->is_partition_locked(part_id)) + { + DBUG_PRINT("info", ("Write to non-locked partition %u (func_value: %ld)", + part_id, (long) func_value)); + error= HA_ERR_NOT_IN_LOCK_PARTITIONS; + goto exit; + } + m_last_part= part_id; + DBUG_PRINT("info", ("Insert in partition %d", part_id)); + + error= write_row_in_part(part_id, buf); + + if (have_auto_increment && !m_table->s->next_number_keypart) + { + set_auto_increment_if_higher(); + } +exit: + thd->variables.sql_mode= saved_sql_mode; + m_table->auto_increment_field_not_null= saved_auto_inc_field_not_null; + DBUG_RETURN(error); +} + + +/** + Update an existing row in the partitioned table. + + Yes, update_row() does what you expect, it updates a row. old_data will + have the previous row record in it, while new_data will have the newest + data in it. + Keep in mind that the server can do updates based on ordering if an + ORDER BY clause was used. Consecutive ordering is not guaranteed. + + If the new record belongs to a different partition than the old record + then it will be inserted into the new partition and deleted from the old. + + new_data is always record[0] + old_data is always record[1] + + @param old_data The old record in MySQL Row Format. + @param new_data The new record in MySQL Row Format. + + @return Operation status. 
+ @retval 0 Success + @retval != 0 Error code +*/ + +// FIXME: duplicate of ha_partition::update_row() +int Partition_helper::ph_update_row(const uchar *old_data, uchar *new_data) +{ + THD *thd= get_thd(); + uint32 new_part_id, old_part_id; + int error= 0; + longlong func_value; + DBUG_ENTER("Partition_helper::ph_update_row"); + m_err_rec= NULL; + + // Need to read partition-related columns, to locate the row's partition: + DBUG_ASSERT(bitmap_is_subset(&m_part_info->full_part_field_set, + m_table->read_set)); + if ((error= get_parts_for_update(old_data, new_data, m_table->record[0], + m_part_info, &old_part_id, &new_part_id, + &func_value))) + { + m_part_info->err_value= func_value; + goto exit; + } + DBUG_ASSERT(bitmap_is_set(&(m_part_info->read_partitions), old_part_id)); + if (!bitmap_is_set(&(m_part_info->lock_partitions), new_part_id)) + { + error= HA_ERR_NOT_IN_LOCK_PARTITIONS; + goto exit; + } + + /* + The protocol for updating a row is: + 1) position the handler (cursor) on the row to be updated, + either through the last read row (rnd or index) or by rnd_pos. + 2) call update_row with both old and new full records as arguments. + + This means that m_last_part should already be set to actual partition + where the row was read from. And if that is not the same as the + calculated part_id we found a misplaced row, we return an error to + notify the user that something is broken in the row distribution + between partitions! Since we don't check all rows on read, we return an + error instead of correcting m_last_part, to make the user aware of the + problem! + + Notice that HA_READ_BEFORE_WRITE_REMOVAL does not require this protocol, + so this is not supported for this engine. 
+ */ + if (old_part_id != m_last_part) + { + m_err_rec= old_data; + DBUG_RETURN(HA_ERR_ROW_IN_WRONG_PARTITION); + } + + m_last_part= new_part_id; + if (new_part_id == old_part_id) + { + DBUG_PRINT("info", ("Update in partition %d", new_part_id)); + tmp_disable_binlog(thd); /* Do not replicate the low-level changes. */ + error= update_row_in_part(new_part_id, old_data, new_data); + reenable_binlog(thd); + goto exit; + } + else + { + Field *saved_next_number_field= m_table->next_number_field; + /* + Don't allow generation of auto_increment value for update. + table->next_number_field is never set on UPDATE. + But is set for INSERT ... ON DUPLICATE KEY UPDATE, + and since update_row() does not generate or update an auto_inc value, + we cannot have next_number_field set when moving a row + to another partition with write_row(), since that could + generate/update the auto_inc value. + This gives the same behavior for partitioned vs non partitioned tables. + */ + m_table->next_number_field= NULL; + DBUG_PRINT("info", ("Update from partition %d to partition %d", + old_part_id, new_part_id)); + tmp_disable_binlog(thd); /* Do not replicate the low-level changes. */ + error= write_row_in_part(new_part_id, new_data); + reenable_binlog(thd); + m_table->next_number_field= saved_next_number_field; + if (error) + goto exit; + + if (m_part_info->part_type == VERSIONING_PARTITION) + { + uint sub_factor= m_part_info->num_subparts ? m_part_info->num_subparts : 1; + DBUG_ASSERT(m_tot_parts == m_part_info->num_parts * sub_factor); + uint lpart_id= new_part_id / sub_factor; + // lpart_id is VERSIONING partition because new_part_id != old_part_id + m_part_info->vers_update_stats(thd, lpart_id); + } + + tmp_disable_binlog(thd); /* Do not replicate the low-level changes. */ + error= delete_row_in_part(old_part_id, old_data); + reenable_binlog(thd); + if (error) + { + goto exit; + } + } + +exit: + /* + if updating an auto_increment column, update + m_part_share->next_auto_inc_val if needed. 
+ (not to be used if auto_increment on secondary field in a multi-column + index) + mysql_update does not set table->next_number_field, so we use + table->found_next_number_field instead. + Also checking that the field is marked in the write set. + */ + if (m_table->found_next_number_field && + new_data == m_table->record[0] && + !m_table->s->next_number_keypart && + bitmap_is_set(m_table->write_set, + m_table->found_next_number_field->field_index)) + { + set_auto_increment_if_higher(); + } + DBUG_RETURN(error); +} + + +/** + Delete an existing row in the partitioned table. + + This will delete a row. buf will contain a copy of the row to be deleted. + The server will call this right after the current row has been read + (from either a previous rnd_xxx() or index_xxx() call). + If you keep a pointer to the last row or can access a primary key it will + make doing the deletion quite a bit easier. + Keep in mind that the server does no guarentee consecutive deletions. + ORDER BY clauses can be used. + + buf is either record[0] or record[1] + + @param buf The record in MySQL Row Format. + + @return Operation status. + @retval 0 Success + @retval != 0 Error code +*/ + +int Partition_helper::ph_delete_row(const uchar *buf) +{ + int error; + uint part_id; + DBUG_ENTER("Partition_helper::ph_delete_row"); + m_err_rec= NULL; + + DBUG_ASSERT(bitmap_is_subset(&m_part_info->full_part_field_set, + m_table->read_set)); + if ((error= get_part_for_delete(buf, + m_table->record[0], + m_part_info, + &part_id))) + { + DBUG_RETURN(error); + } + if (!m_part_info->is_partition_locked(part_id)) + { + DBUG_RETURN(HA_ERR_NOT_IN_LOCK_PARTITIONS); + } + + /* + The protocol for deleting a row is: + 1) position the handler (cursor) on the row to be deleted, + either through the last read row (rnd or index) or by rnd_pos. + 2) call delete_row with the full record as argument. + + This means that m_last_part should already be set to actual partition + where the row was read from. 
And if that is not the same as the + calculated part_id we found a misplaced row, we return an error to + notify the user that something is broken in the row distribution + between partitions! Since we don't check all rows on read, we return an + error instead of forwarding the delete to the correct (m_last_part) + partition! + + Notice that HA_READ_BEFORE_WRITE_REMOVAL does not require this protocol, + so this is not supported for this engine. + + TODO: change the assert in InnoDB into an error instead and make this one + an assert instead and remove the get_part_for_delete()! + */ + if (part_id != m_last_part) + { + m_err_rec= buf; + DBUG_RETURN(HA_ERR_ROW_IN_WRONG_PARTITION); + } + /* Should never call delete_row on a partition which is not read */ + DBUG_ASSERT(m_part_info->is_partition_used(part_id)); + + m_last_part= part_id; + error= delete_row_in_part(part_id, buf); + DBUG_RETURN(error); +} + + +/** + Get a range of auto increment values. + + Can only be used if the auto increment field is the first field in an index. + + This method is called by update_auto_increment which in turn is called + by the individual handlers as part of write_row. We use the + part_share->next_auto_inc_val, or search all + partitions for the highest auto_increment_value if not initialized or + if auto_increment field is a secondary part of a key, we must search + every partition when holding a mutex to be sure of correctness. + + @param[in] increment Increment value. + @param[in] nb_desired_values Number of desired values. + @param[out] first_value First auto inc value reserved + or MAX if failure. + @param[out] nb_reserved_values Number of values reserved. 
+*/ + +void Partition_helper +::get_auto_increment_first_field(ulonglong increment, + ulonglong nb_desired_values, + ulonglong *first_value, + ulonglong *nb_reserved_values) +{ + THD *thd= get_thd(); + DBUG_ENTER("Partition_helper::get_auto_increment_first_field"); + DBUG_PRINT("info", ("inc: %lu desired_values: %lu first_value: %lu", + (ulong) increment, + (ulong) nb_desired_values, + (ulong) *first_value)); + DBUG_ASSERT(increment && nb_desired_values); + /* + next_number_keypart is != 0 if the auto_increment column is a secondary + column in the index (it is allowed in MyISAM) + */ + DBUG_ASSERT(m_table->s->next_number_keypart == 0); + *first_value= 0; + + /* + Get a lock for handling the auto_increment in part_share + for avoiding two concurrent statements getting the same number. + */ + lock_auto_increment(); + + /* Initialize if not already done. */ + if (!m_part_share->auto_inc_initialized) + { + initialize_auto_increment(false); + } + + /* + In a multi-row insert statement like INSERT SELECT and LOAD DATA + where the number of candidate rows to insert is not known in advance + we must hold a lock/mutex for the whole statement if we have statement + based replication. Because the statement-based binary log contains + only the first generated value used by the statement, and slaves assumes + all other generated values used by this statement were consecutive to + this first one, we must exclusively lock the generator until the statement + is done. 
+ */ + int binlog_format= thd_binlog_format(thd); + if (!m_auto_increment_safe_stmt_log_lock && + thd->lex->sql_command != SQLCOM_INSERT && + binlog_format != BINLOG_FORMAT_UNSPEC && + binlog_format != BINLOG_FORMAT_ROW) + { + DBUG_PRINT("info", ("locking auto_increment_safe_stmt_log_lock")); + m_auto_increment_safe_stmt_log_lock= true; + } + + /* this gets corrected (for offset/increment) in update_auto_increment */ + *first_value= m_part_share->next_auto_inc_val; + m_part_share->next_auto_inc_val+= nb_desired_values * increment; + if (m_part_share->next_auto_inc_val < *first_value) + { + /* Overflow, set to max. */ + m_part_share->next_auto_inc_val= ULLONG_MAX; + } + + unlock_auto_increment(); + DBUG_PRINT("info", ("*first_value: %lu", (ulong) *first_value)); + *nb_reserved_values= nb_desired_values; + DBUG_VOID_RETURN; +} + + +inline void Partition_helper::set_auto_increment_if_higher() +{ + Field_num *field= static_cast(m_table->found_next_number_field); + ulonglong nr= (field->unsigned_flag || field->val_int() > 0) + ? field->val_int() : 0; + lock_auto_increment(); + if (!m_part_share->auto_inc_initialized) + { + initialize_auto_increment(false); + } + /* must hold the mutex when looking/changing m_part_share. 
*/ + if (nr >= m_part_share->next_auto_inc_val) + { + m_part_share->next_auto_inc_val= nr + 1; + } + unlock_auto_increment(); + save_auto_increment(nr); +} + + +void Partition_helper::ph_release_auto_increment() +{ + DBUG_ENTER("Partition_helper::ph_release_auto_increment"); + + if (m_table->s->next_number_keypart) + { + release_auto_increment_all_parts(); + } + else if (m_handler->next_insert_id) + { + ulonglong max_reserved= m_handler->auto_inc_interval_for_cur_row.maximum(); + lock_auto_increment(); + m_part_share->release_auto_inc_if_possible(get_thd(), m_table->s, + m_handler->next_insert_id, + max_reserved); + DBUG_PRINT("info", ("part_share->next_auto_inc_val: %lu", + (ulong) m_part_share->next_auto_inc_val)); + + /* Unlock the multi row statement lock taken in get_auto_increment */ + if (m_auto_increment_safe_stmt_log_lock) + { + m_auto_increment_safe_stmt_log_lock= FALSE; + DBUG_PRINT("info", ("unlocking auto_increment_safe_stmt_log_lock")); + } + + unlock_auto_increment(); + } + DBUG_VOID_RETURN; +} + + +/** + Calculate key hash value from an null terminated array of fields. + Support function for KEY partitioning. + + @param field_array An array of the fields in KEY partitioning + + @return hash_value calculated + + @note Uses the hash function on the character set of the field. + Integer and floating point fields use the binary character set by default. 
+*/ + +uint32 Partition_helper::ph_calculate_key_hash_value(Field **field_array) +{ + ulong nr1= 1; + ulong nr2= 4; + bool use_51_hash; + use_51_hash= MY_TEST((*field_array)->table->part_info->key_algorithm == + partition_info::KEY_ALGORITHM_51); + + do + { + Field *field= *field_array; + if (use_51_hash) + { + switch (field->real_type()) { + case MYSQL_TYPE_TINY: + case MYSQL_TYPE_SHORT: + case MYSQL_TYPE_LONG: + case MYSQL_TYPE_FLOAT: + case MYSQL_TYPE_DOUBLE: + case MYSQL_TYPE_NEWDECIMAL: + case MYSQL_TYPE_TIMESTAMP: + case MYSQL_TYPE_LONGLONG: + case MYSQL_TYPE_INT24: + case MYSQL_TYPE_TIME: + case MYSQL_TYPE_DATETIME: + case MYSQL_TYPE_YEAR: + case MYSQL_TYPE_NEWDATE: + { + if (field->is_null()) + { + nr1^= (nr1 << 1) | 1; + continue; + } + /* Force this to my_hash_sort_bin, which was used in 5.1! */ + uint len= field->pack_length(); + my_charset_bin.coll->hash_sort(&my_charset_bin, field->ptr, len, + &nr1, &nr2); + /* Done with this field, continue with next one. */ + continue; + } + case MYSQL_TYPE_STRING: + case MYSQL_TYPE_VARCHAR: + case MYSQL_TYPE_BIT: + /* Not affected, same in 5.1 and 5.5 */ + break; + /* + ENUM/SET uses my_hash_sort_simple in 5.1 (i.e. my_charset_latin1) + and my_hash_sort_bin in 5.5! + */ + case MYSQL_TYPE_ENUM: + case MYSQL_TYPE_SET: + { + if (field->is_null()) + { + nr1^= (nr1 << 1) | 1; + continue; + } + /* Force this to my_hash_sort_bin, which was used in 5.1! */ + uint len= field->pack_length(); + my_charset_latin1.coll->hash_sort(&my_charset_latin1, field->ptr, + len, &nr1, &nr2); + continue; + } + /* New types in mysql-5.6. */ + case MYSQL_TYPE_DATETIME2: + case MYSQL_TYPE_TIME2: + case MYSQL_TYPE_TIMESTAMP2: + /* Not affected, 5.6+ only! */ + break; + + /* These types should not be allowed for partitioning! 
*/ + case MYSQL_TYPE_NULL: + case MYSQL_TYPE_DECIMAL: + case MYSQL_TYPE_DATE: + case MYSQL_TYPE_TINY_BLOB: + case MYSQL_TYPE_MEDIUM_BLOB: + case MYSQL_TYPE_LONG_BLOB: + case MYSQL_TYPE_BLOB: + case MYSQL_TYPE_VAR_STRING: + case MYSQL_TYPE_GEOMETRY: + /* fall through. */ + default: + DBUG_ASSERT(0); // New type? + /* Fall through for default hashing (5.5). */ + } + /* fall through, use collation based hashing. */ + } + field->hash(&nr1, &nr2); + } while (*(++field_array)); + return (uint32) nr1; +} + + +bool Partition_helper::print_partition_error(int error, myf errflag) +{ + THD *thd= get_thd(); + DBUG_ENTER("Partition_helper::print_partition_error"); + + /* Should probably look for my own errors first */ + DBUG_PRINT("enter", ("error: %d", error)); + + if ((error == HA_ERR_NO_PARTITION_FOUND) && + ! (thd->lex->alter_info.flags & Alter_info::ALTER_TRUNCATE_PARTITION)) + { + m_part_info->print_no_partition_found(m_table, errflag); + // print_no_partition_found() reports an error, so we can just return here. + DBUG_RETURN(false); + } + else if (error == HA_ERR_ROW_IN_WRONG_PARTITION) + { + /* + Should only happen on DELETE or UPDATE! + Or in ALTER TABLE REBUILD/REORGANIZE where there are a misplaced + row that needed to move to an old partition (not in the given set). 
+ */ + DBUG_ASSERT(thd_sql_command(thd) == SQLCOM_DELETE || + thd_sql_command(thd) == SQLCOM_DELETE_MULTI || + thd_sql_command(thd) == SQLCOM_UPDATE || + thd_sql_command(thd) == SQLCOM_UPDATE_MULTI || + thd_sql_command(thd) == SQLCOM_ALTER_TABLE); + DBUG_ASSERT(m_err_rec); + if (m_err_rec) + { + size_t max_length; + char buf[MAX_KEY_LENGTH]; + String str(buf,sizeof(buf),system_charset_info); + uint32 part_id; + DBUG_ASSERT(m_last_part < m_tot_parts); + str.length(0); + if (thd_sql_command(thd) == SQLCOM_ALTER_TABLE) + { + str.append("from REBUILD/REORGANIZED partition: "); + str.append_ulonglong(m_last_part); + str.append(" to non included partition (new definition): "); + } + else + { + str.append_ulonglong(m_last_part); + str.append(". Correct is "); + } + if (get_part_for_delete(m_err_rec, + m_table->record[0], + m_part_info, + &part_id)) + { + str.append("?"); + } + else + { + str.append_ulonglong(part_id); + } + append_row_to_str(str, m_err_rec, m_table); + + /* Log this error, so the DBA can notice it and fix it! */ + sql_print_error("Table '%-192s' corrupted: row in wrong partition: %s\n" + "Please REPAIR the table!", + m_table->s->table_name.str, + str.c_ptr_safe()); + + max_length= (MYSQL_ERRMSG_SIZE - strlen(ER(ER_ROW_IN_WRONG_PARTITION))); + if (str.length() >= max_length) + { + str.length(max_length-4); + str.append(STRING_WITH_LEN("...")); + } + my_error(ER_ROW_IN_WRONG_PARTITION, MYF(0), str.c_ptr_safe()); + m_err_rec= NULL; + DBUG_RETURN(false); + } + } + + DBUG_RETURN(true); +} + + +/** + Implement the partition changes defined by ALTER TABLE of partitions. + + Add and copy if needed a number of partitions, during this operation + only read operation is ongoing in the server. This is used by + ADD PARTITION all types as well as by REORGANIZE PARTITION. For + one-phased implementations it is used also by DROP and COALESCE + PARTITIONs. + One-phased implementation needs the new frm file, other handlers will + get zero length and a NULL reference here. 
+ + @param[in] create_info HA_CREATE_INFO object describing all + fields and indexes in table + @param[in] path Complete path of db and table name + @param[out] copied Output parameter where number of copied + records are added + @param[out] deleted Output parameter where number of deleted + records are added + + @return Operation status + @retval 0 Success + @retval != 0 Failure +*/ + +// FIXME: duplicate of ha_partition::change_partitions +int Partition_helper::change_partitions(HA_CREATE_INFO *create_info, + const char *path, + ulonglong * const copied, + ulonglong * const deleted) +{ + List_iterator part_it(m_part_info->partitions); + List_iterator t_it(m_part_info->temp_partitions); + char part_name_buff[FN_REFLEN]; + const char *table_level_data_file_name= create_info->data_file_name; + const char *table_level_index_file_name= create_info->index_file_name; + const char *table_level_tablespace_name= create_info->tablespace; + uint num_parts= m_part_info->partitions.elements; + uint num_subparts= m_part_info->num_subparts; + uint i= 0; + uint num_remain_partitions; + uint num_reorged_parts; + int error= 1; + bool first; + uint temp_partitions= m_part_info->temp_partitions.elements; + THD *thd= get_thd(); + DBUG_ENTER("Partition_helper::change_partitions"); + + /* + Use the read_partitions bitmap for reorganized partitions, + i.e. what to copy. + */ + bitmap_clear_all(&m_part_info->read_partitions); + + /* + Assert that it works without HA_FILE_BASED and lower_case_table_name = 2. + */ + DBUG_ASSERT(!strcmp(path, get_canonical_filename(m_handler, path, + part_name_buff))); + num_reorged_parts= 0; + if (!m_part_info->is_sub_partitioned()) + num_subparts= 1; + + /* + Step 1: + Calculate number of reorganized partitions. 
+ */ + if (temp_partitions) + { + num_reorged_parts= temp_partitions * num_subparts; + } + else + { + do + { + partition_element *part_elem= part_it++; + if (part_elem->part_state == PART_CHANGED || + part_elem->part_state == PART_REORGED_DROPPED) + { + num_reorged_parts+= num_subparts; + } + } while (++i < num_parts); + } + + /* + Step 2: + Calculate number of partitions after change. + */ + num_remain_partitions= 0; + if (temp_partitions) + { + num_remain_partitions= num_parts * num_subparts; + } + else + { + part_it.rewind(); + i= 0; + do + { + partition_element *part_elem= part_it++; + if (part_elem->part_state == PART_NORMAL || + part_elem->part_state == PART_TO_BE_ADDED || + part_elem->part_state == PART_CHANGED) + { + num_remain_partitions+= num_subparts; + } + } while (++i < num_parts); + } + + /* + Step 3: + Set the read_partition bit for all partitions to be copied. + */ + if (num_reorged_parts) + { + i= 0; + first= true; + part_it.rewind(); + do + { + partition_element *part_elem= part_it++; + if (part_elem->part_state == PART_CHANGED || + part_elem->part_state == PART_REORGED_DROPPED) + { + for (uint sp = 0; sp < num_subparts; sp++) + { + bitmap_set_bit(&m_part_info->read_partitions, i * num_subparts + sp); + } + DBUG_ASSERT(first); + } + else if (first && temp_partitions && + part_elem->part_state == PART_TO_BE_ADDED) + { + /* + When doing an ALTER TABLE REORGANIZE PARTITION a number of + partitions is to be reorganized into a set of new partitions. + The reorganized partitions are in this case in the temp_partitions + list. We mark all of them in one batch and thus we only do this + until we find the first partition with state PART_TO_BE_ADDED + since this is where the new partitions go in and where the old + ones used to be. 
+ */ + first= false; + DBUG_ASSERT(((i*num_subparts) + num_reorged_parts) <= m_tot_parts); + for (uint sp = 0; sp < num_reorged_parts; sp++) + { + bitmap_set_bit(&m_part_info->read_partitions, i * num_subparts + sp); + } + } + } while (++i < num_parts); + } + + /* + Step 4: + Create the new partitions and also open, lock and call + external_lock on them (if needed) to prepare them for copy phase + and also for later close calls. + No need to create PART_NORMAL partitions since they must not + be written to! + Only PART_CHANGED and PART_TO_BE_ADDED should be written to! + */ + + error= prepare_for_new_partitions(num_remain_partitions, + num_reorged_parts == 0); + + i= 0; + part_it.rewind(); + do + { + partition_element *part_elem= part_it++; + DBUG_ASSERT(part_elem->part_state >= PART_NORMAL && + part_elem->part_state <= PART_CHANGED); + if (part_elem->part_state == PART_TO_BE_ADDED || + part_elem->part_state == PART_CHANGED) + { + /* + A new partition needs to be created PART_TO_BE_ADDED means an + entirely new partition and PART_CHANGED means a changed partition + that will still exist with either more or less data in it. + */ + uint name_variant= NORMAL_PART_NAME; + if (part_elem->part_state == PART_CHANGED || + (part_elem->part_state == PART_TO_BE_ADDED && temp_partitions)) + name_variant= TEMP_PART_NAME; + if (m_part_info->is_sub_partitioned()) + { + List_iterator sub_it(part_elem->subpartitions); + uint j= 0, part; + do + { + partition_element *sub_elem= sub_it++; + create_subpartition_name(part_name_buff, path, + part_elem->partition_name, + sub_elem->partition_name, + name_variant); + part= i * num_subparts + j; + DBUG_PRINT("info", ("Add subpartition %s", part_name_buff)); + /* + update_create_info was called previously in + mysql_prepare_alter_table. Which may have set data/index_file_name + for the partitions to the full partition name, including + '#P#[#SP#] suffix. Remove that suffix + if it exists. 
+ */ + truncate_partition_filename(sub_elem->data_file_name); + truncate_partition_filename(sub_elem->index_file_name); + /* Notice that sub_elem is already based on part_elem's defaults. */ + error= set_up_table_before_create(thd, + m_table->s, + part_name_buff, + create_info, + sub_elem); + if (error) + { + goto err; + } + if ((error= create_new_partition(m_table, + create_info, + part_name_buff, + part, + sub_elem))) + { + goto err; + } + /* Reset create_info to table level values. */ + create_info->data_file_name= table_level_data_file_name; + create_info->index_file_name= table_level_index_file_name; + create_info->tablespace= table_level_tablespace_name; + } while (++j < num_subparts); + } + else + { + create_partition_name(part_name_buff, path, + part_elem->partition_name, name_variant, + true); + DBUG_PRINT("info", ("Add partition %s", part_name_buff)); + /* See comment in subpartition branch above! */ + truncate_partition_filename(part_elem->data_file_name); + truncate_partition_filename(part_elem->index_file_name); + error= set_up_table_before_create(thd, + m_table->s, + part_name_buff, + create_info, + part_elem); + if (error) + { + goto err; + } + if ((error= create_new_partition(m_table, + create_info, + (const char *)part_name_buff, + i, + part_elem))) + { + goto err; + } + /* Reset create_info to table level values. */ + create_info->data_file_name= table_level_data_file_name; + create_info->index_file_name= table_level_index_file_name; + create_info->tablespace= table_level_tablespace_name; + } + } + } while (++i < num_parts); + + /* + Step 5: + State update to prepare for next write of the frm file. 
+ */ + i= 0; + part_it.rewind(); + do + { + partition_element *part_elem= part_it++; + if (part_elem->part_state == PART_TO_BE_ADDED) + part_elem->part_state= PART_IS_ADDED; + else if (part_elem->part_state == PART_CHANGED) + part_elem->part_state= PART_IS_CHANGED; + else if (part_elem->part_state == PART_REORGED_DROPPED) + part_elem->part_state= PART_TO_BE_DROPPED; + } while (++i < num_parts); + for (i= 0; i < temp_partitions; i++) + { + partition_element *part_elem= t_it++; + DBUG_ASSERT(part_elem->part_state == PART_TO_BE_REORGED); + part_elem->part_state= PART_TO_BE_DROPPED; + } + error= copy_partitions(copied, deleted); +err: + if (error) + { + m_handler->print_error(error, + MYF(error != ER_OUTOFMEMORY ? 0 : ME_FATALERROR)); + } + /* + Close and unlock the new temporary partitions. + They will later be deleted or renamed through the ddl-log. + */ + close_new_partitions(); + DBUG_RETURN(error); +} + + +/** + Copy partitions as part of ALTER TABLE of partitions. + + change_partitions has done all the preparations, now it is time to + actually copy the data from the reorganized partitions to the new + partitions. + + @param[out] copied Number of records copied. + @param[out] deleted Number of records deleted. + + @return Operation status + @retval 0 Success + @retval >0 Error code +*/ + +int Partition_helper::copy_partitions(ulonglong * const copied, + ulonglong * const deleted) +{ + uint new_part= 0; + int result= 0; + longlong func_value; + DBUG_ENTER("Partition_helper::copy_partitions"); + + if (m_part_info->linear_hash_ind) + { + if (m_part_info->part_type == HASH_PARTITION) + set_linear_hash_mask(m_part_info, m_part_info->num_parts); + else + set_linear_hash_mask(m_part_info, m_part_info->num_subparts); + } + + /* + m_part_info->read_partitions bitmap is setup for all the reorganized + partitions to be copied. So we can use the normal handler rnd interface + for reading. 
+ */ + if ((result= m_handler->ha_rnd_init(1))) + { + DBUG_RETURN(result); + } + while (true) + { + if ((result= m_handler->ha_rnd_next(m_table->record[0]))) + { + if (result == HA_ERR_RECORD_DELETED) + continue; //Probably MyISAM + if (result != HA_ERR_END_OF_FILE) + goto error; + /* + End-of-file reached, break out to end the copy process. + */ + break; + } + /* Found record to insert into new handler */ + if (m_part_info->get_partition_id(m_part_info, &new_part, + &func_value)) + { + /* + This record is in the original table but will not be in the new + table since it doesn't fit into any partition any longer due to + changed partitioning ranges or list values. + */ + (*deleted)++; + } + else + { + if ((result= write_row_in_new_part(new_part))) + { + goto error; + } + } + } + m_handler->ha_rnd_end(); + DBUG_RETURN(false); +error: + m_handler->ha_rnd_end(); + DBUG_RETURN(result); +} + + +/** + Check/fix misplaced rows. + + @param part_id Partition to check/fix. + @param repair If true, move misplaced rows to correct partition. + + @return Operation status. + @retval 0 Success + @retval != 0 Error +*/ + +int Partition_helper::check_misplaced_rows(uint read_part_id, bool repair) +{ + int result= 0; + THD *thd= get_thd(); + bool ignore= thd->lex->ignore; + uint32 correct_part_id; + longlong func_value; + ha_rows num_misplaced_rows= 0; + ha_rows num_deleted_rows= 0; + + DBUG_ENTER("Partition_helper::check_misplaced_rows"); + + if (repair) + { + /* We must read the full row, if we need to move it! */ + bitmap_set_all(m_table->read_set); + bitmap_set_all(m_table->write_set); + } + else + { + /* Only need to read the partitioning fields. 
*/ + bitmap_union(m_table->read_set, &m_part_info->full_part_field_set); +#if 0 + /* Fill the base columns of virtual generated columns if necessary */ + for (Field **ptr= m_part_info->full_part_field_array; *ptr; ptr++) + { + if ((*ptr)->is_virtual_gcol()) + m_table->mark_gcol_in_maps(*ptr); + } +#endif + } + + if ((result= rnd_init_in_part(read_part_id, true))) + DBUG_RETURN(result); + + while (true) + { + if ((result= ph_rnd_next_in_part(read_part_id, m_table->record[0]))) + { + if (result == HA_ERR_RECORD_DELETED) + continue; + if (result != HA_ERR_END_OF_FILE) + break; + + if (num_misplaced_rows > 0) + { + if (repair) + { + if (num_deleted_rows > 0) + { + print_admin_msg(thd, MI_MAX_MSG_BUF, "warning", + m_table->s->db.str, m_table->alias, + opt_op_name[REPAIR_PARTS], + "Moved %lld misplaced rows, deleted %lld rows", + num_misplaced_rows - num_deleted_rows, + num_deleted_rows); + } + else + { + print_admin_msg(thd, MI_MAX_MSG_BUF, "warning", + m_table->s->db.str, m_table->alias, + opt_op_name[REPAIR_PARTS], + "Moved %lld misplaced rows", + num_misplaced_rows); + } + } + else + { + print_admin_msg(thd, MI_MAX_MSG_BUF, "error", + m_table->s->db.str, m_table->alias, + opt_op_name[CHECK_PARTS], + "Found %lld misplaced rows in partition %u", + num_misplaced_rows, + read_part_id); + } + } + /* End-of-file reached, all rows are now OK, reset result and break. */ + result= 0; + break; + } + + result= m_part_info->get_partition_id(m_part_info, &correct_part_id, + &func_value); + // TODO: Add code to delete rows not matching any partition. + if (result) + break; + + if (correct_part_id != read_part_id) + { + num_misplaced_rows++; + m_err_rec= NULL; + if (!repair) + { + /* Check. 
*/ + result= HA_ADMIN_NEEDS_UPGRADE; + char buf[MAX_KEY_LENGTH]; + String str(buf,sizeof(buf),system_charset_info); + str.length(0); + append_row_to_str(str, m_err_rec, m_table); + print_admin_msg(thd, MI_MAX_MSG_BUF, "error", + m_table->s->db.str, m_table->alias, + opt_op_name[CHECK_PARTS], + "Found a misplaced row" + " in part %d should be in part %d:\n%s", + read_part_id, + correct_part_id, + str.c_ptr_safe()); + /* Break on first misplaced row, unless ignore is given! */ + if (!ignore) + break; + } + else + { + DBUG_PRINT("info", ("Moving row from partition %d to %d", + read_part_id, correct_part_id)); + + /* + Insert row into correct partition. Notice that there are no commit + for every N row, so the repair will be one large transaction! + */ + if ((result= write_row_in_part(correct_part_id, m_table->record[0]))) + { + /* + We have failed to insert a row, it might have been a duplicate! + */ + char buf[MAX_KEY_LENGTH]; + String str(buf,sizeof(buf),system_charset_info); + str.length(0); + if (result == HA_ERR_FOUND_DUPP_KEY) + { + if (ignore) + { + str.append("Duplicate key found, deleting the record:\n"); + num_deleted_rows++; + } + else + { + str.append("Duplicate key found, " + "please update or delete the record:\n"); + result= HA_ADMIN_CORRUPT; + } + } + append_row_to_str(str, m_err_rec, m_table); + + /* + If the engine supports transactions, the failure will be + rollbacked. + */ + if (!m_handler->has_transactions() || + ignore || result == HA_ADMIN_CORRUPT) + { + /* Log this error, so the DBA can notice it and fix it! 
*/ + sql_print_error("Table '%-192s' failed to move/insert a row" + " from part %d into part %d:\n%s", + m_table->s->table_name.str, + read_part_id, + correct_part_id, + str.c_ptr_safe()); + } + print_admin_msg(thd, MI_MAX_MSG_BUF, "error", + m_table->s->db.str, m_table->alias, + opt_op_name[REPAIR_PARTS], + "Failed to move/insert a row" + " from part %d into part %d:\n%s", + read_part_id, + correct_part_id, + str.c_ptr_safe()); + if (!ignore || result != HA_ERR_FOUND_DUPP_KEY) + break; + } + + /* Delete row from wrong partition. */ + if ((result= delete_row_in_part(read_part_id, m_table->record[0]))) + { + result= HA_ADMIN_CORRUPT; + if (m_handler->has_transactions()) + break; + /* + We have introduced a duplicate, since we failed to remove it + from the wrong partition. + */ + char buf[MAX_KEY_LENGTH]; + String str(buf,sizeof(buf),system_charset_info); + str.length(0); + append_row_to_str(str, m_err_rec, m_table); + + /* Log this error, so the DBA can notice it and fix it! */ + sql_print_error("Table '%-192s': Delete from part %d failed with" + " error %d. But it was already inserted into" + " part %d, when moving the misplaced row!" + "\nPlease manually fix the duplicate row:\n%s", + m_table->s->table_name.str, + read_part_id, + result, + correct_part_id, + str.c_ptr_safe()); + break; + } + } + } + } + + int tmp_result= rnd_end_in_part(read_part_id, true); + DBUG_RETURN(result ? result : tmp_result); +} + +/** + Read next row during full partition scan (scan in random row order). + + This function can evaluate the virtual generated columns. If virtual + generated columns are involved, you should not call rnd_next_in_part + directly but this one. + + @param part_id Partition to read from. + @param[in,out] buf buffer that should be filled with data. + + @return Operation status. 
+ @retval 0 Success + @retval != 0 Error code +*/ + +int Partition_helper::ph_rnd_next_in_part(uint part_id, uchar *buf) +{ + int result= rnd_next_in_part(part_id, buf); + +#if 0 + if (!result && m_table->has_gcol()) + result= update_generated_read_fields(buf, m_table); +#endif + + return result; +} + + +/** Set used partitions bitmap from Alter_info. + + @return false if success else true. +*/ + +bool Partition_helper::set_altered_partitions() +{ + Alter_info *alter_info= &get_thd()->lex->alter_info; + + if ((alter_info->flags & Alter_info::ALTER_ADMIN_PARTITION) == 0 || + (alter_info->flags & Alter_info::ALTER_ALL_PARTITION)) + { + /* + Full table command, not ALTER TABLE t PARTITION . + All partitions are already set, so do nothing. + */ + return false; + } + return m_part_info->set_read_partitions(&alter_info->partition_names); +} + +#if 0 +/** + Print a message row formatted for ANALYZE/CHECK/OPTIMIZE/REPAIR TABLE. + + Modeled after mi_check_print_msg. + + @param thd Thread context. + @param len Needed length for message buffer. + @param msg_type Message type. + @param db_name Database name. + @param table_name Table name. + @param op_name Operation name. + @param fmt Message (in printf format with additional arguments). + + @return Operation status. + @retval false for success else true. +*/ + +bool Partition_helper::print_admin_msg(THD* thd, + uint len, + const char *msg_type, + const char *db_name, + const char *table_name, + const char *op_name, + const char *fmt, + ...) 
+{ + va_list args; + Protocol *protocol= thd->protocol; + uint length; + size_t msg_length; + char name[NAME_LEN*2+2]; + char *msgbuf; + bool error= true; + + if (!(msgbuf= (char*) my_malloc(len, MYF(0)))) + return true; + va_start(args, fmt); + msg_length= my_vsnprintf(msgbuf, len, fmt, args); + va_end(args); + if (msg_length >= (len - 1)) + goto err; + msgbuf[len - 1] = 0; // healthy paranoia + + if (!thd->protocol->connection_alive()) + { + sql_print_error("%s", msgbuf); + goto err; + } + + length=(uint) (strxmov(name, db_name, ".", table_name,NullS) - name); + /* + TODO: switch from protocol to push_warning here. The main reason we didn't + it yet is parallel repair. Due to following trace: + mi_check_print_msg/push_warning/sql_alloc/my_pthread_getspecific_ptr. + + Also we likely need to lock mutex here (in both cases with protocol and + push_warning). + */ + DBUG_PRINT("info",("print_admin_msg: %s, %s, %s, %s", name, op_name, + msg_type, msgbuf)); + protocol->start_row(); + protocol->store(name, length, system_charset_info); + protocol->store(op_name, system_charset_info); + protocol->store(msg_type, system_charset_info); + protocol->store(msgbuf, msg_length, system_charset_info); + if (protocol->end_row()) + { + sql_print_error("Failed on my_net_write, writing to stderr instead: %s\n", + msgbuf); + goto err; + } + error= false; +err: + my_free(msgbuf); + return error; +} +#endif + + +/** + Set table->read_set taking partitioning expressions into account. + + @param[in] rnd_init True if called from rnd_init (else index_init). +*/ + +inline +void Partition_helper::set_partition_read_set() +{ + /* + For operations that may need to change data, we may need to extend + read_set. + */ + if (m_handler->get_lock_type() == F_WRLCK) + { + /* + If write_set contains any of the fields used in partition and + subpartition expression, we need to set all bits in read_set because + the row may need to be inserted in a different [sub]partition. 
In + other words update_row() can be converted into write_row(), which + requires a complete record. + */ + if (bitmap_is_overlapping(&m_part_info->full_part_field_set, + m_table->write_set)) + { + bitmap_set_all(m_table->read_set); + } + else + { + /* + Some handlers only read fields as specified by the bitmap for the + read set. For partitioned handlers we always require that the + fields of the partition functions are read such that we can + calculate the partition id to place updated and deleted records. + */ + bitmap_union(m_table->read_set, &m_part_info->full_part_field_set); + } + // Mark virtual generated columns writable + for (Field **vf= m_table->vfield; vf && *vf; vf++) + { + if (bitmap_is_set(m_table->read_set, (*vf)->field_index)) + bitmap_set_bit(m_table->write_set, (*vf)->field_index); + } + } +} + + +/**************************************************************************** + MODULE full table scan +****************************************************************************/ + +/** + Initialize engine for random reads. + + rnd_init() is called when the server wants the storage engine to do a + table scan or when the server wants to access data through rnd_pos. + + When scan is used we will scan one handler partition at a time. + When preparing for rnd_pos we will initialize all handler partitions. + No extra cache handling is needed when scanning is not performed. + + Before initializing we will call rnd_end to ensure that we clean up from + any previous incarnation of a table scan. + + @param scan false for initialize for random reads through rnd_pos() + true for initialize for random scan through rnd_next(). + + @return Operation status. 
+ @retval 0 Success + @retval != 0 Error code +*/ + +int Partition_helper::ph_rnd_init(bool scan) +{ + int error; + uint i= 0; + uint part_id; + DBUG_ENTER("Partition_helper::ph_rnd_init"); + + set_partition_read_set(); + + /* Now we see what the index of our first important partition is */ + DBUG_PRINT("info", ("m_part_info->read_partitions: 0x%lx", + (long) m_part_info->read_partitions.bitmap)); + part_id= m_part_info->get_first_used_partition(); + DBUG_PRINT("info", ("m_part_spec.start_part %d", part_id)); + + if (MY_BIT_NONE == part_id) + { + error= 0; + goto err1; + } + + DBUG_PRINT("info", ("rnd_init on partition %d", part_id)); + if (scan) + { + /* A scan can be restarted without rnd_end() in between! */ + if (m_scan_value == 1 && m_part_spec.start_part != NOT_A_PARTITION_ID) + { + /* End previous scan on partition before restart. */ + if ((error= rnd_end_in_part(m_part_spec.start_part, scan))) + { + DBUG_RETURN(error); + } + } + m_scan_value= 1; + if ((error= rnd_init_in_part(part_id, scan))) + goto err; + } + else + { + m_scan_value= 0; + for (i= part_id; + i < MY_BIT_NONE; + i= m_part_info->get_next_used_partition(i)) + { + if ((error= rnd_init_in_part(i, scan))) + goto err; + } + } + m_part_spec.start_part= part_id; + m_part_spec.end_part= m_tot_parts - 1; + DBUG_PRINT("info", ("m_scan_value=%d", m_scan_value)); + DBUG_RETURN(0); + +err: + /* Call rnd_end for all previously initialized partitions. */ + for (; + part_id < i; + part_id= m_part_info->get_next_used_partition(part_id)) + { + rnd_end_in_part(part_id, scan); + } +err1: + m_scan_value= 2; + m_part_spec.start_part= NO_CURRENT_PART_ID; + DBUG_RETURN(error); +} + + +/** + End of a table scan. + + @return Operation status. + @retval 0 Success + @retval != 0 Error code +*/ + +int Partition_helper::ph_rnd_end() +{ + int error= 0; + DBUG_ENTER("Partition_helper::ph_rnd_end"); + switch (m_scan_value) { + case 3: // Error + DBUG_ASSERT(0); + /* fall through. 
*/ + case 2: // Error + break; + case 1: + if (NO_CURRENT_PART_ID != m_part_spec.start_part) // Table scan + { + error= rnd_end_in_part(m_part_spec.start_part, true); + } + break; + case 0: + uint i; + for (i= m_part_info->get_first_used_partition(); + i < MY_BIT_NONE; + i= m_part_info->get_next_used_partition(i)) + { + int part_error; + part_error= rnd_end_in_part(i, false); + if (part_error && !error) { + error= part_error; + } + } + break; + } + m_scan_value= 3; + m_part_spec.start_part= NO_CURRENT_PART_ID; + DBUG_RETURN(error); +} + + +/** + Read next row during full table scan (scan in random row order). + + This is called for each row of the table scan. When you run out of records + you should return HA_ERR_END_OF_FILE. + The Field structure for the table is the key to getting data into buf + in a manner that will allow the server to understand it. + + @param[out] buf buffer that should be filled with data. + + @return Operation status. + @retval 0 Success + @retval != 0 Error code +*/ + +int Partition_helper::ph_rnd_next(uchar *buf) +{ + int result= HA_ERR_END_OF_FILE; + uint part_id= m_part_spec.start_part; + DBUG_ENTER("Partition_helper::ph_rnd_next"); + + if (NO_CURRENT_PART_ID == part_id) + { + /* + The original set of partitions to scan was empty and thus we report + the result here. 
+ */ + goto end; + } + + DBUG_ASSERT(m_scan_value == 1); + + while (TRUE) + { + result= rnd_next_in_part(part_id, buf); + if (!result) + { + m_last_part= part_id; + m_part_spec.start_part= part_id; + m_table->status= 0; + DBUG_RETURN(0); + } + + /* + if we get here, then the current partition ha_rnd_next returned failure + */ + if (result == HA_ERR_RECORD_DELETED) + continue; // Probably MyISAM + + if (result != HA_ERR_END_OF_FILE) + goto end_dont_reset_start_part; // Return error + + /* End current partition */ + DBUG_PRINT("info", ("rnd_end on partition %d", part_id)); + if ((result= rnd_end_in_part(part_id, true))) + break; + + /* Shift to next partition */ + part_id= m_part_info->get_next_used_partition(part_id); + if (part_id >= m_tot_parts) + { + result= HA_ERR_END_OF_FILE; + break; + } + m_last_part= part_id; + m_part_spec.start_part= part_id; + DBUG_PRINT("info", ("rnd_init on partition %d", part_id)); + if ((result= rnd_init_in_part(part_id, true))) + break; + } + +end: + m_part_spec.start_part= NO_CURRENT_PART_ID; +end_dont_reset_start_part: + m_table->status= STATUS_NOT_FOUND; + DBUG_RETURN(result); +} + + +/** + Save position of current row. + + position() is called after each call to rnd_next() if the data needs + to be ordered or accessed later. + + The server uses ref to store data. ref_length in the above case is + the size needed to store current_position. ref is just a byte array + that the server will maintain. If you are using offsets to mark rows, then + current_position should be the offset. If it is a primary key like in + InnoDB, then it needs to be a primary key. + + @param record Current record in MySQL Row Format. 
+*/ + +void Partition_helper::ph_position(const uchar *record) +{ + DBUG_ASSERT(m_part_info->is_partition_used(m_last_part)); + DBUG_ENTER("Partition_helper::ph_position"); + DBUG_PRINT("info", ("record: %p", record)); + DBUG_DUMP("record", record, m_rec_length); + + /* + If m_ref_usage is set, then the ref is already stored in the + priority queue (m_queue) when doing ordered scans. + */ + if (m_ref_usage != REF_NOT_USED && m_ordered_scan_ongoing) + { + DBUG_ASSERT(!m_queue->empty()); + DBUG_ASSERT(m_ordered_rec_buffer); + DBUG_ASSERT(!m_curr_key_info[1]); + DBUG_ASSERT(uint2korr(m_queue->top()) == m_last_part); + /* We already have the ref and part id. */ + memcpy(m_handler->ref, m_queue->top(), m_handler->ref_length); + } + else + { + DBUG_PRINT("info", ("m_last_part: %u", m_last_part)); + int2store(m_handler->ref, m_last_part); + position_in_last_part(m_handler->ref + PARTITION_BYTES_IN_POS, record); + } + DBUG_DUMP("ref_out", m_handler->ref, m_handler->ref_length); + + DBUG_VOID_RETURN; +} + + +/** + Read row using position. + + This is like rnd_next, but you are given a position to use to determine + the row. The position will be pointing to data of length handler::ref_length + that handler::ref was set by position(record). Tables clustered on primary + key usually use the full primary key as reference (like InnoDB). Heap based + tables usually returns offset in heap file (like MyISAM). + + @param[out] buf buffer that should be filled with record in MySQL format. + @param[in] pos position given as handler::ref when position() was called. + + @return Operation status. 
+ @retval 0 Success + @retval != 0 Error code +*/ + +int Partition_helper::ph_rnd_pos(uchar *buf, uchar *pos) +{ + uint part_id; + DBUG_ENTER("Partition_helper::ph_rnd_pos"); + + part_id= uint2korr(pos); + DBUG_ASSERT(part_id < m_tot_parts); + DBUG_ASSERT(m_part_info->is_partition_used(part_id)); + m_last_part= part_id; + DBUG_RETURN(rnd_pos_in_part(part_id, buf, (pos + PARTITION_BYTES_IN_POS))); +} + + +/** + Read row using position using given record to find. + + This works as position()+rnd_pos() functions, but does some extra work, + calculating m_last_part - the partition to where the 'record' should go. + + Only useful when position is based on primary key + (HA_PRIMARY_KEY_REQUIRED_FOR_POSITION). + + @param record Current record in MySQL Row Format. + + @return Operation status. + @retval 0 Success + @retval != 0 Error code +*/ + +int Partition_helper::ph_rnd_pos_by_record(uchar *record) +{ + DBUG_ENTER("Partition_helper::ph_rnd_pos_by_record"); + + DBUG_ASSERT(m_handler->ha_table_flags() & + HA_PRIMARY_KEY_REQUIRED_FOR_POSITION); + /* TODO: Support HA_READ_BEFORE_WRITE_REMOVAL */ + /* Set m_last_part correctly. */ + if (unlikely(get_part_for_delete(record, + m_table->record[0], + m_part_info, + &m_last_part))) + DBUG_RETURN(HA_ERR_INTERNAL_ERROR); + + DBUG_RETURN(rnd_pos_by_record_in_last_part(record)); +} + + +/**************************************************************************** + MODULE index scan +****************************************************************************/ +/* + Positions an index cursor to the index specified in the handle. Fetches the + row if available. If the key value is null, begin at the first key of the + index. + + There are loads of optimizations possible here for the partition handler. + The same optimizations can also be checked for full table scan although + only through conditions and not from index ranges. + Phase one optimizations: + Check if the fields of the partition function are bound. 
If so only use + the single partition it becomes bound to. + Phase two optimizations: + If it can be deducted through range or list partitioning that only a + subset of the partitions are used, then only use those partitions. +*/ + +/** + Setup the ordered record buffer and the priority queue. + + Call destroy_record_priority_queue() to deallocate or clean-up + from failure. + + @return false on success, else true. +*/ + +int Partition_helper::init_record_priority_queue() +{ + uint used_parts= m_part_info->num_partitions_used(); + DBUG_ENTER("Partition_helper::init_record_priority_queue"); + DBUG_ASSERT(!m_ordered_rec_buffer); + DBUG_ASSERT(!m_queue); + /* Initialize the priority queue. */ + // TODO: Create test to see the cost of allocating when needed vs + // allocate once and keep between statements. Also test on NUMA + // machines to see the difference (I guess that allocating when needed + // will allocate on 'correct' NUMA node and be faster.) + if (!m_queue) + { + m_queue= new (std::nothrow) Prio_queue(Key_rec_less(m_curr_key_info)); + if (!m_queue) + { + DBUG_RETURN(HA_ERR_OUT_OF_MEM); + } + } + /* Initialize the ordered record buffer. */ + if (!m_ordered_rec_buffer) + { + uint alloc_len; + /* + Allocate record buffer for each used partition. + If PK is clustered index, it is either the primary sort key or is + added as secondary sort. So we only need to allocate for part id + and a full record per partition. + Otherwise if the clustered index was generated, we might need to + do a secondary sort by rowid (handler::ref) and must allocate for + ref (includes part id) and full record per partition. We don't + know yet if we need to do secondary sort by rowid, so we must + allocate space for it. + TODO: enhance ha_index_init() for HA_EXTRA_SECONDARY_SORT_ROWID to + avoid allocating space for handler::ref when not needed. + When enhancing ha_index_init() care must be taken on ph_position(), + so InnoDB's row_id is correctly handled (taken from m_last_part). 
+ */ + if (m_pkey_is_clustered && m_table->s->primary_key != MAX_KEY) + { + m_rec_offset= PARTITION_BYTES_IN_POS; + m_ref_usage= REF_NOT_USED; + } + else + { + m_rec_offset= m_handler->ref_length; + m_ref_usage= REF_STORED_IN_PQ; + } + alloc_len= used_parts * (m_rec_offset + m_rec_length); + /* Allocate a key for temporary use when setting up the scan. */ + alloc_len+= m_table->s->max_key_length; + + m_ordered_rec_buffer= static_cast( + my_malloc(alloc_len, + MYF(MY_WME))); + if (!m_ordered_rec_buffer) + { + DBUG_RETURN(HA_ERR_OUT_OF_MEM); + } + + /* + We set-up one record per partition and each record has 2 bytes in + front where the partition id is written. This is used by ordered + index_read. + If we need to also sort by rowid (handler::ref), then m_curr_key_info[1] + is NULL and we add the rowid before the record. + We also set-up a reference to the first record for temporary use in + setting up the scan. + */ + char *ptr= (char*) m_ordered_rec_buffer; + uint i; + for (i= m_part_info->get_first_used_partition(); + i < MY_BIT_NONE; + i= m_part_info->get_next_used_partition(i)) + { + DBUG_PRINT("info", ("init rec-buf for part %u", i)); + int2store(ptr, i); + ptr+= m_rec_offset + m_rec_length; + } + m_start_key.key= (const uchar*)ptr; + /* + Initialize priority queue, initialized to reading forward. + Start by only sort by KEY, HA_EXTRA_SECONDARY_SORT_ROWID + will be given if we should sort by handler::ref too. + */ + m_queue->m_rec_offset= m_rec_offset; + if (m_queue->reserve(used_parts)) + { + DBUG_RETURN(HA_ERR_OUT_OF_MEM); + } + } + DBUG_RETURN(init_record_priority_queue_for_parts(used_parts)); +} + + +/** + Destroy the ordered record buffer and the priority queue. 
+*/ + +void Partition_helper::destroy_record_priority_queue() +{ + DBUG_ENTER("Partition_helper::destroy_record_priority_queue"); + if (m_ordered_rec_buffer) + { + my_free(m_ordered_rec_buffer); + m_ordered_rec_buffer= NULL; + } + if (m_queue) + { + m_queue->clear(); + delete m_queue; + m_queue= NULL; + } + m_ref_usage= REF_NOT_USED; + m_ordered_scan_ongoing= false; + DBUG_VOID_RETURN; +} + + +/** + Common setup for index_init. + + Set up variables and initialize the record priority queue. + + @param inx Index to be used. + @param sorted True if the rows must be returned in index order. + + @return Operation status. + @retval 0 Success + @retval != 0 Error code +*/ + +int Partition_helper::ph_index_init_setup(uint inx, bool sorted) +{ + DBUG_ENTER("Partition_helper:ph_:index_init_setup"); + + DBUG_ASSERT(inx != MAX_KEY); + DBUG_PRINT("info", ("inx %u sorted %u", inx, sorted)); + m_part_spec.start_part= NO_CURRENT_PART_ID; + m_start_key.length= 0; + m_ordered= sorted; + m_ref_usage= REF_NOT_USED; + m_curr_key_info[0]= m_table->key_info+inx; + m_curr_key_info[1]= NULL; + /* + There are two cases where it is not enough to only sort on the key: + 1) For clustered indexes, the optimizer assumes that all keys + have the rest of the PK columns appended to the KEY, so it will + sort by PK as secondary sort key. + 2) Rowid-Order-Retrieval access methods, like index_merge_intersect + and index_merge_union. These methods requires the index to be sorted + on rowid (handler::ref) as secondary sort key. + */ + if (m_pkey_is_clustered && m_table->s->primary_key != MAX_KEY && + inx != m_table->s->primary_key) + { + /* + if PK is clustered, then the key cmp must use the pk to + differentiate between equal key in given index. + */ + DBUG_PRINT("info", ("Clustered pk, using pk as secondary cmp")); + m_curr_key_info[1]= m_table->key_info+m_table->s->primary_key; + } + + /* + Some handlers only read fields as specified by the bitmap for the + read set. 
For partitioned handlers we always require that the + fields of the partition functions are read such that we can + calculate the partition id to place updated and deleted records. + */ + if (m_handler->get_lock_type() == F_WRLCK) + bitmap_union(m_table->read_set, &m_part_info->full_part_field_set); + + DBUG_RETURN(0); +} + + +/** + Initialize handler before start of index scan. + + index_init is always called before starting index scans (except when + starting through index_read_idx and using read_range variants). + + @param inx Index number. + @param sorted Is rows to be returned in sorted order. + + @return Operation status + @retval 0 Success + @retval != 0 Error code +*/ + +int Partition_helper::ph_index_init(uint inx, bool sorted) +{ + int error; + uint part_id= m_part_info->get_first_used_partition(); + DBUG_ENTER("Partition_helper::ph_index_init"); + m_handler->active_index= inx; + + if (part_id == MY_BIT_NONE) + { + DBUG_RETURN(0); + } + + if ((error= ph_index_init_setup(inx, sorted))) + { + DBUG_RETURN(error); + } + if ((error= init_record_priority_queue())) + { + destroy_record_priority_queue(); + DBUG_RETURN(error); + } + + for (/* part_id already set. */; + part_id < MY_BIT_NONE; + part_id= m_part_info->get_next_used_partition(part_id)) + { + if ((error= index_init_in_part(part_id, inx, sorted))) + goto err; + + DBUG_EXECUTE_IF("partition_fail_index_init", { + part_id++; + error= HA_ERR_NO_PARTITION_FOUND; + goto err; + }); + } +err: + if (error) + { + /* End the previously initialized indexes. */ + uint j; + for (j= m_part_info->get_first_used_partition(); + j < part_id; + j= m_part_info->get_next_used_partition(j)) + { + (void) index_end_in_part(j); + } + destroy_record_priority_queue(); + } + DBUG_RETURN(error); +} + + +/** + End of index scan. + + index_end is called at the end of an index scan to clean up any + things needed to clean up. + + @return Operation status. 
+ @retval 0 Success + @retval != 0 Error code +*/ + +int Partition_helper::ph_index_end() +{ + int error= 0; + uint i; + DBUG_ENTER("Partition_helper::ph_index_end"); + + m_part_spec.start_part= NO_CURRENT_PART_ID; + m_ref_usage= REF_NOT_USED; + for (i= m_part_info->get_first_used_partition(); + i < MY_BIT_NONE; + i= m_part_info->get_next_used_partition(i)) + { + int tmp; + if ((tmp= index_end_in_part(i))) + error= tmp; + } + destroy_record_priority_queue(); + m_handler->active_index= MAX_KEY; + DBUG_RETURN(error); +} + + +/** + Read one record in an index scan and start an index scan. + + index_read_map starts a new index scan using a start key. The MySQL Server + will check the end key on its own. Thus to function properly the + partitioned handler need to ensure that it delivers records in the sort + order of the MySQL Server. + index_read_map can be restarted without calling index_end on the previous + index scan and without calling index_init. In this case the index_read_map + is on the same index as the previous index_scan. This is particularly + used in conjunction with multi read ranges. + + @param[out] buf Read row in MySQL Row Format + @param[in] key Key parts in consecutive order + @param[in] keypart_map Which part of key is used + @param[in] find_flag What type of key condition is used + + @return Operation status. + @retval 0 Success + @retval != 0 Error code +*/ + +int Partition_helper::ph_index_read_map(uchar *buf, + const uchar *key, + key_part_map keypart_map, + enum ha_rkey_function find_flag) +{ + DBUG_ENTER("Partition_handler::ph_index_read_map"); + m_handler->end_range= NULL; + m_index_scan_type= PARTITION_INDEX_READ; + m_start_key.key= key; + m_start_key.keypart_map= keypart_map; + m_start_key.flag= find_flag; + DBUG_RETURN(common_index_read(buf, true)); +} + + +/** + Common routine for a number of index_read variants. + + @param[out] buf Buffer where the record should be returned. 
+ @param[in] have_start_key TRUE <=> the left endpoint is available, i.e. + we're in index_read call or in read_range_first + call and the range has left endpoint. + FALSE <=> there is no left endpoint (we're in + read_range_first() call and the range has no left + endpoint). + + @return Operation status + @retval 0 OK + @retval HA_ERR_END_OF_FILE Whole index scanned, without finding the record. + @retval HA_ERR_KEY_NOT_FOUND Record not found, but index cursor positioned. + @retval other Error code. + + @details + Start scanning the range (when invoked from read_range_first()) or doing + an index lookup (when invoked from index_read_XXX): + - If possible, perform partition selection + - Find the set of partitions we're going to use + - Depending on whether we need ordering: + NO: Get the first record from first used partition (see + handle_unordered_scan_next_partition) + YES: Fill the priority queue and get the record that is the first in + the ordering +*/ + +int Partition_helper::common_index_read(uchar *buf, bool have_start_key) +{ + int error; + m_reverse_order= false; + DBUG_ENTER("Partition_helper::common_index_read"); + + DBUG_PRINT("info", ("m_ordered %u m_ordered_scan_ong %u", + m_ordered, m_ordered_scan_ongoing)); + + if (have_start_key) + { + m_start_key.length= calculate_key_len(m_table, + m_handler->active_index, + NULL, + m_start_key.keypart_map); + DBUG_PRINT("info", ("have_start_key map %lu find_flag %u len %u", + m_start_key.keypart_map, m_start_key.flag, + m_start_key.length)); + DBUG_ASSERT(m_start_key.length); + } + if ((error= partition_scan_set_up(buf, have_start_key))) + { + DBUG_RETURN(error); + } + + if (have_start_key && + (m_start_key.flag == HA_READ_KEY_OR_PREV || + m_start_key.flag == HA_READ_PREFIX_LAST || + m_start_key.flag == HA_READ_PREFIX_LAST_OR_PREV || + m_start_key.flag == HA_READ_BEFORE_KEY)) + { + m_reverse_order= true; + m_ordered_scan_ongoing= true; + } + DBUG_PRINT("info", ("m_ordered %u m_o_scan_ong %u have_start_key %u", 
+ m_ordered, m_ordered_scan_ongoing, have_start_key)); + if (!m_ordered_scan_ongoing) + { + /* + We use unordered index scan when read_range is used and flag + is set to not use ordered. + We also use an unordered index scan when the number of partitions to + scan is only one. + The unordered index scan will use the partition set created. + */ + DBUG_PRINT("info", ("doing unordered scan")); + error= handle_unordered_scan_next_partition(buf); + } + else + { + /* + In all other cases we will use the ordered index scan. This will use + the partition set created by the get_partition_set method. + */ + error= handle_ordered_index_scan(buf); + } + DBUG_RETURN(error); +} + + +/** + Start an index scan from leftmost record and return first record. + + index_first() asks for the first key in the index. + This is similar to index_read except that there is no start key since + the scan starts from the leftmost entry and proceeds forward with + index_next. + + @param[out] buf Read row in MySQL Row Format. + + @return Operation status. + @retval 0 Success + @retval != 0 Error code +*/ + +int Partition_helper::ph_index_first(uchar *buf) +{ + DBUG_ENTER("Partition_helper::ph_index_first"); + + m_handler->end_range= NULL; + m_index_scan_type= PARTITION_INDEX_FIRST; + m_reverse_order= false; + DBUG_RETURN(common_first_last(buf)); +} + + +/** + Start an index scan from rightmost record and return first record. + + index_last() asks for the last key in the index. + This is similar to index_read except that there is no start key since + the scan starts from the rightmost entry and proceeds forward with + index_prev. + + @param[out] buf Read row in MySQL Row Format. + + @return Operation status. 
+ @retval 0 Success + @retval != 0 Error code +*/ + +int Partition_helper::ph_index_last(uchar *buf) +{ + DBUG_ENTER("Partition_helper::ph_index_last"); + + m_index_scan_type= PARTITION_INDEX_LAST; + m_reverse_order= true; + DBUG_RETURN(common_first_last(buf)); +} + + +/** + Common routine for index_first/index_last. + + @param[out] buf Read row in MySQL Row Format. + + @return Operation status. + @retval 0 Success + @retval != 0 Error code +*/ + +int Partition_helper::common_first_last(uchar *buf) +{ + int error; + DBUG_ENTER("Partition_helper::common_first_last"); + + if ((error= partition_scan_set_up(buf, false))) + { + DBUG_RETURN(error); + } + if (!m_ordered_scan_ongoing && + m_index_scan_type != PARTITION_INDEX_LAST) + { + DBUG_RETURN(handle_unordered_scan_next_partition(buf)); + } + DBUG_RETURN(handle_ordered_index_scan(buf)); +} + + +/** + Read last using key. + + This is used in join_read_last_key to optimize away an ORDER BY. + Can only be used on indexes supporting HA_READ_ORDER. + + @param[out] buf Read row in MySQL Row Format + @param[in] key Key + @param[in] keypart_map Which part of key is used + + @return Operation status. + @retval 0 Success + @retval != 0 Error code +*/ + +int Partition_helper::ph_index_read_last_map(uchar *buf, + const uchar *key, + key_part_map keypart_map) +{ + DBUG_ENTER("Partition_helper::ph_index_read_last_map"); + + m_ordered= true; // Safety measure + m_handler->end_range= NULL; + m_index_scan_type= PARTITION_INDEX_READ_LAST; + m_start_key.key= key; + m_start_key.keypart_map= keypart_map; + m_start_key.flag= HA_READ_PREFIX_LAST; + DBUG_RETURN(common_index_read(buf, true)); +} + + +/** + Read index by key and keymap. + + Positions an index cursor to the index specified. + Fetches the row if available. If the key value is null, + begin at first key of the index. + + Optimization of the default implementation to take advantage of dynamic + partition pruning. 
+ + @param[out] buf Read row in MySQL Row Format + @param[in] index Index to read from + @param[in] key Key + @param[in] keypart_map Which part of key is used + @param[in] find_flag Direction/how to search. + + @return Operation status. + @retval 0 Success + @retval != 0 Error code +*/ +int Partition_helper::ph_index_read_idx_map(uchar *buf, + uint index, + const uchar *key, + key_part_map keypart_map, + enum ha_rkey_function find_flag) +{ + int error= HA_ERR_KEY_NOT_FOUND; + DBUG_ENTER("Partition_helper::ph_index_read_idx_map"); + + if (find_flag == HA_READ_KEY_EXACT) + { + uint part; + m_start_key.key= key; + m_start_key.keypart_map= keypart_map; + m_start_key.flag= find_flag; + m_start_key.length= calculate_key_len(m_table, + index, + NULL, + m_start_key.keypart_map); + + get_partition_set(m_table, buf, index, &m_start_key, &m_part_spec); + + /* + We have either found exactly 1 partition + (in which case start_part == end_part) + or no matching partitions (start_part > end_part) + */ + DBUG_ASSERT(m_part_spec.start_part >= m_part_spec.end_part); + /* The start part is must be marked as used. */ + DBUG_ASSERT(m_part_spec.start_part > m_part_spec.end_part || + m_part_info->is_partition_used(m_part_spec.start_part)); + + for (part= m_part_spec.start_part; + part <= m_part_spec.end_part; + part= m_part_info->get_next_used_partition(part)) + { + error= index_read_idx_map_in_part(part, + buf, + index, + key, + keypart_map, + find_flag); + if (error != HA_ERR_KEY_NOT_FOUND && + error != HA_ERR_END_OF_FILE) + { + break; + } + } + if (part <= m_part_spec.end_part) + { + m_last_part= part; + } + } + else + { + /* + If not only used with HA_READ_KEY_EXACT, we should investigate if + possible to optimize for other find_flag's as well. + */ + DBUG_ASSERT(0); + error= HA_ERR_INTERNAL_ERROR; + } + DBUG_RETURN(error); +} + + +/** + Read next record in a forward index scan. + + Used to read forward through the index (left to right, low to high). 
+ + @param[out] buf Read row in MySQL Row Format. + + @return Operation status. + @retval 0 Success + @retval != 0 Error code +*/ + +int Partition_helper::ph_index_next(uchar *buf) +{ + DBUG_ENTER("Partition_helper::ph_index_next"); + + /* + TODO(low priority): + If we want partition to work with the HANDLER commands, we + must be able to do index_last() -> index_prev() -> index_next() + and if direction changes, we must step back those partitions in + the record queue so we don't return a value from the wrong direction. + */ + DBUG_ASSERT(m_index_scan_type != PARTITION_INDEX_LAST || + m_table->open_by_handler); + if (!m_ordered_scan_ongoing) + { + DBUG_RETURN(handle_unordered_next(buf, false)); + } + DBUG_RETURN(handle_ordered_next(buf, false)); +} + + +/** + Read next same record. + + This routine is used to read the next but only if the key is the same + as supplied in the call. + + @param[out] buf Read row in MySQL Row Format. + @param[in] key Key. + @param[in] keylen Length of key. + + @return Operation status. + @retval 0 Success + @retval != 0 Error code +*/ + +int Partition_helper::ph_index_next_same(uchar *buf, const uchar *key, uint keylen) +{ + DBUG_ENTER("Partition_helper::ph_index_next_same"); + + DBUG_ASSERT(keylen == m_start_key.length); + DBUG_ASSERT(m_index_scan_type != PARTITION_INDEX_LAST); + if (!m_ordered_scan_ongoing) + DBUG_RETURN(handle_unordered_next(buf, true)); + DBUG_RETURN(handle_ordered_next(buf, true)); +} + + +/** + Read next record when performing index scan backwards. + + Used to read backwards through the index (right to left, high to low). + + @param[out] buf Read row in MySQL Row Format. + + @return Operation status. 
+ @retval 0 Success + @retval != 0 Error code +*/ + +int Partition_helper::ph_index_prev(uchar *buf) +{ + DBUG_ENTER("Partition_helper::ph_index_prev"); + + /* TODO: read comment in index_next */ + DBUG_ASSERT(m_index_scan_type != PARTITION_INDEX_FIRST || + m_table->open_by_handler); + DBUG_RETURN(handle_ordered_prev(buf)); +} + + +/** + Start a read of one range with start and end key. + + We re-implement read_range_first since we don't want the compare_key + check at the end. This is already performed in the partition handler. + read_range_next is very much different due to that we need to scan + all underlying handlers. + + @param start_key Specification of start key. + @param end_key Specification of end key. + @param eq_range_arg Is it equal range. + @param sorted Should records be returned in sorted order. + + @return Operation status. + @retval 0 Success + @retval != 0 Error code +*/ + +int Partition_helper::ph_read_range_first(const key_range *start_key, + const key_range *end_key, + bool eq_range_arg, + bool sorted) +{ + int error= HA_ERR_END_OF_FILE; + bool have_start_key= (start_key != NULL); + uint part_id= m_part_info->get_first_used_partition(); + DBUG_ENTER("Partition_helper::ph_read_range_first"); + + if (part_id == MY_BIT_NONE) + { + /* No partition to scan. */ + m_table->status= STATUS_NOT_FOUND; + DBUG_RETURN(error); + } + + m_ordered= sorted; + set_eq_range(eq_range_arg); + m_handler->set_end_range(end_key); + + set_range_key_part(m_curr_key_info[0]->key_part); + if (have_start_key) + m_start_key= *start_key; + else + m_start_key.key= NULL; + + m_index_scan_type= PARTITION_READ_RANGE; + error= common_index_read(m_table->record[0], have_start_key); + DBUG_RETURN(error); +} + + +/** + Read next record in read of a range with start and end key. + + @return Operation status. 
+ @retval 0 Success + @retval != 0 Error code +*/ + +int Partition_helper::ph_read_range_next() +{ + DBUG_ENTER("Partition_helper::ph_read_range_next"); + + if (m_ordered_scan_ongoing) + { + DBUG_RETURN(handle_ordered_next(m_table->record[0], get_eq_range())); + } + DBUG_RETURN(handle_unordered_next(m_table->record[0], get_eq_range())); +} + + +/** + Common routine to set up index scans. + + Find out which partitions we'll need to read when scanning the specified + range. + + If we need to scan only one partition, set m_ordered_scan_ongoing=FALSE + as we will not need to do merge ordering. + + @param buf Buffer to later return record in (this function + needs it to calculate partitioning function values) + + @param idx_read_flag TRUE <=> m_start_key has range start endpoint which + probably can be used to determine the set of + partitions to scan. + FALSE <=> there is no start endpoint. + + @return Operation status. + @retval 0 Success + @retval !=0 Error code +*/ + +int Partition_helper::partition_scan_set_up(uchar * buf, bool idx_read_flag) +{ + DBUG_ENTER("Partition_helper::partition_scan_set_up"); + + if (idx_read_flag) + get_partition_set(m_table, + buf, + m_handler->active_index, + &m_start_key, + &m_part_spec); + else + { + // TODO: set to get_first_used_part() instead! + m_part_spec.start_part= 0; + // TODO: Implement bitmap_get_last_set() and use that here! + m_part_spec.end_part= m_tot_parts - 1; + } + if (m_part_spec.start_part > m_part_spec.end_part) + { + /* + We discovered a partition set but the set was empty so we report + key not found. + */ + DBUG_PRINT("info", ("scan with no partition to scan")); + m_table->status= STATUS_NOT_FOUND; + DBUG_RETURN(HA_ERR_END_OF_FILE); + } + if (m_part_spec.start_part == m_part_spec.end_part) + { + /* + We discovered a single partition to scan, this never needs to be + performed using the ordered index scan. 
+ */ + DBUG_PRINT("info", ("index scan using the single partition %d", + m_part_spec.start_part)); + m_ordered_scan_ongoing= FALSE; + } + else + { + /* + Set m_ordered_scan_ongoing according how the scan should be done + Only exact partitions are discovered atm by get_partition_set. + Verify this, also bitmap must have at least one bit set otherwise + the result from this table is the empty set. + */ + uint start_part= m_part_info->get_first_used_partition(); + if (start_part == MY_BIT_NONE) + { + DBUG_PRINT("info", ("scan with no partition to scan")); + m_table->status= STATUS_NOT_FOUND; + DBUG_RETURN(HA_ERR_END_OF_FILE); + } + if (start_part > m_part_spec.start_part) + m_part_spec.start_part= start_part; + m_ordered_scan_ongoing= m_ordered; + } + DBUG_ASSERT(m_part_spec.start_part < m_tot_parts); + DBUG_ASSERT(m_part_spec.end_part < m_tot_parts); + DBUG_RETURN(0); +} + + +/** + Common routine to handle index_next with unordered results. + + These routines are used to scan partitions without considering order. + This is performed in two situations. + 1) In read_multi_range this is the normal case + 2) When performing any type of index_read, index_first, index_last where + all fields in the partition function is bound. In this case the index + scan is performed on only one partition and thus it isn't necessary to + perform any sort. + + @param[out] buf Read row in MySQL Row Format. + @param[in] next_same Called from index_next_same. + + @return Operation status. + @retval HA_ERR_END_OF_FILE End of scan + @retval 0 Success + @retval other Error code +*/ + +int Partition_helper::handle_unordered_next(uchar *buf, bool is_next_same) +{ + int error; + DBUG_ENTER("Partition_helper::handle_unordered_next"); + + if (m_part_spec.start_part >= m_tot_parts) + { + /* Should only happen with SQL HANDLER! 
*/ + DBUG_ASSERT(m_table->open_by_handler); + DBUG_RETURN(HA_ERR_END_OF_FILE); + } + + /* + We should consider if this should be split into three functions as + partition_read_range is_next_same are always local constants + */ + + if (m_index_scan_type == PARTITION_READ_RANGE) + { + DBUG_ASSERT(buf == m_table->record[0]); + error= read_range_next_in_part(m_part_spec.start_part, NULL); + } + else if (is_next_same) + { + error= index_next_same_in_part(m_part_spec.start_part, + buf, + m_start_key.key, + m_start_key.length); + } + else + { + error= index_next_in_part(m_part_spec.start_part, buf); + } + + if (error == HA_ERR_END_OF_FILE) + { + m_part_spec.start_part++; // Start using next part + error= handle_unordered_scan_next_partition(buf); + } + else + { + m_last_part= m_part_spec.start_part; + } + DBUG_RETURN(error); +} + + +/** + Handle index_next when changing to new partition. + + This routine is used to start the index scan on the next partition. + Both initial start and after completing scan on one partition. + + @param[out] buf Read row in MySQL Row Format + + @return Operation status. + @retval HA_ERR_END_OF_FILE End of scan + @retval 0 Success + @retval other Error code +*/ + +int Partition_helper::handle_unordered_scan_next_partition(uchar * buf) +{ + uint i= m_part_spec.start_part; + int saved_error= HA_ERR_END_OF_FILE; + DBUG_ENTER("Partition_helper::handle_unordered_scan_next_partition"); + + if (i) + i= m_part_info->get_next_used_partition(i - 1); + else + i= m_part_info->get_first_used_partition(); + + for (; + i <= m_part_spec.end_part; + i= m_part_info->get_next_used_partition(i)) + { + int error; + m_part_spec.start_part= i; + switch (m_index_scan_type) { + case PARTITION_READ_RANGE: + DBUG_ASSERT(buf == m_table->record[0]); + DBUG_PRINT("info", ("read_range_first on partition %d", i)); + error= read_range_first_in_part(i, + NULL, + m_start_key.key? 
&m_start_key: NULL, + m_handler->end_range, + get_eq_range(), + false); + break; + case PARTITION_INDEX_READ: + DBUG_PRINT("info", ("index_read on partition %d", i)); + error= index_read_map_in_part(i, + buf, + m_start_key.key, + m_start_key.keypart_map, + m_start_key.flag); + break; + case PARTITION_INDEX_FIRST: + DBUG_PRINT("info", ("index_first on partition %d", i)); + error= index_first_in_part(i, buf); + break; + case PARTITION_INDEX_FIRST_UNORDERED: + /* When is this ever used? */ + DBUG_ASSERT(0); + /* + We perform a scan without sorting and this means that we + should not use the index_first since not all handlers + support it and it is also unnecessary to restrict sort + order. + */ + DBUG_PRINT("info", ("read_range_first on partition %d", i)); + DBUG_ASSERT(buf == m_table->record[0]); + error= read_range_first_in_part(i, + NULL, + 0, + m_handler->end_range, + get_eq_range(), + 0); + break; + default: + DBUG_ASSERT(0); + DBUG_RETURN(HA_ERR_INTERNAL_ERROR); + } + if (!error) + { + m_last_part= i; + DBUG_RETURN(0); + } + if ((error != HA_ERR_END_OF_FILE) && (error != HA_ERR_KEY_NOT_FOUND)) + DBUG_RETURN(error); + + /* + If HA_ERR_KEY_NOT_FOUND, we must return that error instead of + HA_ERR_END_OF_FILE, to be able to continue search. + */ + if (saved_error != HA_ERR_KEY_NOT_FOUND) + saved_error= error; + DBUG_PRINT("info", ("END_OF_FILE/KEY_NOT_FOUND on partition %d", i)); + } + if (saved_error == HA_ERR_END_OF_FILE) + m_part_spec.start_part= NO_CURRENT_PART_ID; + DBUG_RETURN(saved_error); +} + + +/** + Common routine to start index scan with ordered results. + + @param[out] buf Read row in MySQL Row Format + + @return Operation status + @retval HA_ERR_END_OF_FILE End of scan + @retval HA_ERR_KEY_NOT_FOUND End of scan + @retval 0 Success + @retval other Error code + + @details + This part contains the logic to handle index scans that require ordered + output. This includes all except those started by read_range_first with + the flag ordered set to FALSE. 
Thus most direct index_read and all + index_first and index_last. + + We implement ordering by keeping one record plus a key buffer for each + partition. Every time a new entry is requested we will fetch a new + entry from the partition that is currently not filled with an entry. + Then the entry is put into its proper sort position. + + Returning a record is done by getting the top record, copying the + record to the request buffer and setting the partition as empty on + entries. +*/ + +int Partition_helper::handle_ordered_index_scan(uchar *buf) +{ + uint i; + std::vector parts; + bool found= FALSE; + uchar *part_rec_buf_ptr= m_ordered_rec_buffer; + int saved_error= HA_ERR_END_OF_FILE; + DBUG_ENTER("Partition_helper::handle_ordered_index_scan"); + DBUG_ASSERT(part_rec_buf_ptr); + + if (m_key_not_found) + { + m_key_not_found= false; + bitmap_clear_all(&m_key_not_found_partitions); + DBUG_PRINT("info", ("Cleared m_key_not_found_partitions")); + } + m_top_entry= NO_CURRENT_PART_ID; + m_queue->clear(); + parts.reserve(m_queue->capacity()); + DBUG_ASSERT(m_part_info->is_partition_used(m_part_spec.start_part)); + + /* + Position part_rec_buf_ptr to point to the first used partition >= + start_part. There may be partitions marked by used_partitions, + but is before start_part. These partitions has allocated record buffers + but is dynamically pruned, so those buffers must be skipped. 
+ */ + for (i= m_part_info->get_first_used_partition(); + i < m_part_spec.start_part; + i= m_part_info->get_next_used_partition(i)) + { + part_rec_buf_ptr+= m_rec_offset + m_rec_length; + } + DBUG_PRINT("info", ("m_part_spec.start_part %u first_used_part %u", + m_part_spec.start_part, i)); + for (/* continue from above */ ; + i <= m_part_spec.end_part; + i= m_part_info->get_next_used_partition(i)) + { + DBUG_PRINT("info", ("reading from part %u (scan_type: %u inx: %u)", + i, m_index_scan_type, m_handler->active_index)); + DBUG_ASSERT(i == uint2korr(part_rec_buf_ptr)); + uchar *rec_buf_ptr= part_rec_buf_ptr + m_rec_offset; + uchar *read_buf; + int error; + DBUG_PRINT("info", ("part %u, scan_type %d", i, m_index_scan_type)); + + /* ICP relies on Item evaluation, which expects the row in record[0]. */ + if (m_handler->pushed_idx_cond) + read_buf= m_table->record[0]; + else + read_buf= rec_buf_ptr; + + switch (m_index_scan_type) { + case PARTITION_INDEX_READ: + error= index_read_map_in_part(i, + read_buf, + m_start_key.key, + m_start_key.keypart_map, + m_start_key.flag); + break; + case PARTITION_INDEX_FIRST: + error= index_first_in_part(i, read_buf); + break; + case PARTITION_INDEX_LAST: + error= index_last_in_part(i, read_buf); + break; + case PARTITION_INDEX_READ_LAST: + error= index_read_last_map_in_part(i, + read_buf, + m_start_key.key, + m_start_key.keypart_map); + break; + case PARTITION_READ_RANGE: + { + /* + To enable optimization in derived engines, we provide a read buffer + pointer if we want to read into something different than table->record[0] + (which read_range_* always uses). + */ + error= read_range_first_in_part(i, + read_buf == m_table->record[0] + ? NULL : read_buf, + m_start_key.key ? 
&m_start_key : NULL, + m_handler->end_range, + get_eq_range(), + true); + break; + } + default: + DBUG_ASSERT(false); + DBUG_RETURN(HA_ERR_END_OF_FILE); + } + DBUG_PRINT("info", ("error %d from partition %u", error, i)); + /* When using ICP, copy record[0] to the priority queue for sorting. */ + if (m_handler->pushed_idx_cond) + memcpy(rec_buf_ptr, read_buf, m_rec_length); + if (!error) + { + found= true; + if (m_ref_usage != REF_NOT_USED) + { + /* position_in_last_part needs m_last_part set. */ + m_last_part= i; + position_in_last_part(part_rec_buf_ptr + PARTITION_BYTES_IN_POS, + rec_buf_ptr); + } + /* + Save for later insertion in queue; + */ + parts.push_back(part_rec_buf_ptr); + DBUG_DUMP("row", read_buf, m_rec_length); + } + else if (error != HA_ERR_KEY_NOT_FOUND && error != HA_ERR_END_OF_FILE) + { + DBUG_RETURN(error); + } + else if (error == HA_ERR_KEY_NOT_FOUND) + { + DBUG_PRINT("info", ("HA_ERR_KEY_NOT_FOUND from partition %u", i)); + bitmap_set_bit(&m_key_not_found_partitions, i); + m_key_not_found= true; + saved_error= error; + } + part_rec_buf_ptr+= m_rec_offset + m_rec_length; + } + if (found) + { + /* + We found at least one partition with data, now sort all entries and + after that read the first entry and copy it to the buffer to return in. + */ + m_queue->m_max_at_top= m_reverse_order; + m_queue->m_keys= m_curr_key_info; + DBUG_ASSERT(m_queue->empty()); + /* + If PK, we should not sort by rowid, since that is already done + through the KEY setup. + */ + DBUG_ASSERT(!m_curr_key_info[1] || m_ref_usage == REF_NOT_USED); + m_queue->assign(parts); + return_top_record(buf); + m_table->status= 0; + DBUG_PRINT("info", ("Record returned from partition %d", m_top_entry)); + DBUG_RETURN(0); + } + DBUG_RETURN(saved_error); +} + + +/** + Return the top record in sort order. + + @param[out] buf Row returned in MySQL Row Format. 
+*/ + +void Partition_helper::return_top_record(uchar *buf) +{ + uint part_id; + uchar *key_buffer= m_queue->top(); + uchar *rec_buffer= key_buffer + m_rec_offset; + + part_id= uint2korr(key_buffer); + copy_cached_row(buf, rec_buffer); + DBUG_PRINT("info", ("from part_id %u", part_id)); + DBUG_DUMP("returned_row", buf, m_table->s->reclength); + m_last_part= part_id; + m_top_entry= part_id; +} + + +/** + Add index_next/prev results from partitions without exact match. + + If there where any partitions that returned HA_ERR_KEY_NOT_FOUND when + ha_index_read_map was done, those partitions must be included in the + following index_next/prev call. +*/ + +int Partition_helper::handle_ordered_index_scan_key_not_found() +{ + int error; + uint i; + size_t old_elements= m_queue->size(); + uchar *part_buf= m_ordered_rec_buffer; + uchar *curr_rec_buf= NULL; + DBUG_ENTER("Partition_helper::handle_ordered_index_scan_key_not_found"); + DBUG_ASSERT(m_key_not_found); + DBUG_ASSERT(part_buf); + /* + Loop over all used partitions to get the correct offset + into m_ordered_rec_buffer. + */ + for (i= m_part_info->get_first_used_partition(); + i < MY_BIT_NONE; + i= m_part_info->get_next_used_partition(i)) + { + if (bitmap_is_set(&m_key_not_found_partitions, i)) + { + /* + This partition is used and did return HA_ERR_KEY_NOT_FOUND + in index_read_map. + */ + uchar *read_buf; + curr_rec_buf= part_buf + m_rec_offset; + /* ICP relies on Item evaluation, which expects the row in record[0]. */ + if (m_handler->pushed_idx_cond) + read_buf= m_table->record[0]; + else + read_buf= curr_rec_buf; + + if (m_reverse_order) + error= index_prev_in_part(i, read_buf); + else + error= index_next_in_part(i, read_buf); + /* HA_ERR_KEY_NOT_FOUND is not allowed from index_next! 
*/ + DBUG_ASSERT(error != HA_ERR_KEY_NOT_FOUND); + DBUG_PRINT("info", ("Filling from partition %u reverse %u error %d", + i, m_reverse_order, error)); + if (!error) + { + /* When using ICP, copy record[0] to the priority queue for sorting. */ + if (m_handler->pushed_idx_cond) + memcpy(curr_rec_buf, read_buf, m_rec_length); + if (m_ref_usage != REF_NOT_USED) + { + /* position_in_last_part needs m_last_part set. */ + m_last_part= i; + position_in_last_part(part_buf + PARTITION_BYTES_IN_POS, + curr_rec_buf); + } + m_queue->push(part_buf); + } + else if (error != HA_ERR_END_OF_FILE && error != HA_ERR_KEY_NOT_FOUND) + DBUG_RETURN(error); + } + part_buf+= m_rec_offset + m_rec_length; + } + DBUG_ASSERT(curr_rec_buf); + bitmap_clear_all(&m_key_not_found_partitions); + m_key_not_found= false; + + if (m_queue->size() > old_elements) + { + /* Update m_top_entry, which may have changed. */ + uchar *key_buffer= m_queue->top(); + m_top_entry= uint2korr(key_buffer); + } + DBUG_RETURN(0); +} + + +/** + Common routine to handle index_next with ordered results. + + @param[out] buf Read row in MySQL Row Format. + @param[in] next_same Called from index_next_same. + + @return Operation status. + @retval HA_ERR_END_OF_FILE End of scan + @retval 0 Success + @retval other Error code +*/ + +int Partition_helper::handle_ordered_next(uchar *buf, bool is_next_same) +{ + int error; + uint part_id= m_top_entry; + uchar *rec_buf= m_queue->empty() ? NULL : m_queue->top() + m_rec_offset; + uchar *read_buf; + DBUG_ENTER("Partition_helper::handle_ordered_next"); + + if (m_reverse_order) + { + /* + TODO: To support change of direction (index_prev -> index_next, + index_read_map(HA_READ_KEY_EXACT) -> index_prev etc.) + We would need to: + - Step back all cursors we have a buffered row from a previous next/prev + call (i.e. for all partitions we previously called index_prev, we must + call index_next and skip that row. + - empty the priority queue and initialize it again with reverse ordering. 
+ */ + DBUG_ASSERT(m_table->open_by_handler); + DBUG_RETURN(HA_ERR_WRONG_COMMAND); + } + + if (m_key_not_found) + { + if (is_next_same) + { + /* Only rows which match the key. */ + m_key_not_found= false; + bitmap_clear_all(&m_key_not_found_partitions); + } + else + { + /* There are partitions not included in the index record queue. */ + size_t old_elements= m_queue->size(); + if ((error= handle_ordered_index_scan_key_not_found())) + DBUG_RETURN(error); + /* + If the queue top changed, i.e. one of the partitions that gave + HA_ERR_KEY_NOT_FOUND in index_read_map found the next record, + return it. + Otherwise replace the old with a call to index_next (fall through). + */ + if (old_elements != m_queue->size() && part_id != m_top_entry) + { + return_top_record(buf); + DBUG_PRINT("info", ("Returning row from part %u (prev KEY_NOT_FOUND)", + m_top_entry)); + DBUG_RETURN(0); + } + } + } + if (part_id >= m_tot_parts) + DBUG_RETURN(HA_ERR_END_OF_FILE); + + DBUG_PRINT("info", ("next row from part %u (inx %u)", + part_id, m_handler->active_index)); + + /* Assert that buffer for fetch is not NULL */ + DBUG_ASSERT(rec_buf); + + /* ICP relies on Item evaluation, which expects the row in record[0]. */ + if (m_handler->pushed_idx_cond) + read_buf= m_table->record[0]; + else + read_buf= rec_buf; + + + if (m_index_scan_type == PARTITION_READ_RANGE) + { + error= read_range_next_in_part(part_id, + read_buf == m_table->record[0] + ? NULL : read_buf); + } + else if (!is_next_same) + error= index_next_in_part(part_id, read_buf); + else + error= index_next_same_in_part(part_id, + read_buf, + m_start_key.key, + m_start_key.length); + if (error) + { + if (error == HA_ERR_END_OF_FILE) + { + /* Return next buffered row */ + if (!m_queue->empty()) + m_queue->pop(); + if (m_queue->empty()) + { + /* + If priority queue is empty, we have finished fetching rows from all + partitions. Reset the value of next partition to NONE. This would + imply HA_ERR_END_OF_FILE for all future calls. 
+ */ + m_top_entry= NO_CURRENT_PART_ID; + } + else + { + return_top_record(buf); + DBUG_PRINT("info", ("Record returned from partition %u (2)", + m_top_entry)); + m_table->status= 0; + error= 0; + } + } + DBUG_RETURN(error); + } + /* When using ICP, copy record[0] to the priority queue for sorting. */ + if (m_handler->pushed_idx_cond) + memcpy(rec_buf, read_buf, m_rec_length); + if (m_ref_usage != REF_NOT_USED) + { + /* position_in_last_part needs m_last_part set. */ + m_last_part= part_id; + position_in_last_part(rec_buf - m_rec_offset + PARTITION_BYTES_IN_POS, + rec_buf); + } + DBUG_DUMP("rec_buf", rec_buf, m_rec_length); + m_queue->update_top(); + return_top_record(buf); + DBUG_PRINT("info", ("Record returned from partition %u", m_top_entry)); + DBUG_RETURN(0); +} + + +/** + Common routine to handle index_prev with ordered results. + + @param[out] buf Read row in MySQL Row Format. + + @return Operation status. + @retval HA_ERR_END_OF_FILE End of scan + @retval 0 Success + @retval other Error code +*/ + +int Partition_helper::handle_ordered_prev(uchar *buf) +{ + int error; + uint part_id= m_top_entry; + uchar *rec_buf= m_queue->empty() ? NULL : m_queue->top() + m_rec_offset; + uchar *read_buf; + DBUG_ENTER("Partition_helper::handle_ordered_prev"); + + if (!m_reverse_order) + { + /* TODO: See comment in handle_ordered_next(). */ + DBUG_ASSERT(m_table->open_by_handler); + DBUG_RETURN(HA_ERR_WRONG_COMMAND); + } + + if (m_key_not_found) + { + /* There are partitions not included in the index record queue. */ + size_t old_elements= m_queue->size(); + if ((error= handle_ordered_index_scan_key_not_found())) + DBUG_RETURN(error); + if (old_elements != m_queue->size() && part_id != m_top_entry) + { + /* + Should only be possible for when HA_READ_KEY_EXACT was previously used, + which is not supported to have a subsequent call for PREV. + I.e. HA_READ_KEY_EXACT is considered to not have reverse order! + */ + DBUG_ASSERT(0); + /* + If the queue top changed, i.e. 
one of the partitions that gave + HA_ERR_KEY_NOT_FOUND in index_read_map found the next record, + return it. + Otherwise replace the old with a call to index_next (fall through). + */ + return_top_record(buf); + DBUG_RETURN(0); + } + } + + if (part_id >= m_tot_parts) + { + /* This should never happen, except for SQL HANDLER calls! */ + DBUG_ASSERT(m_table->open_by_handler); + DBUG_RETURN(HA_ERR_END_OF_FILE); + } + + /* Assert that buffer for fetch is not NULL */ + DBUG_ASSERT(rec_buf); + + /* ICP relies on Item evaluation, which expects the row in record[0]. */ + if (m_handler->pushed_idx_cond) + read_buf= m_table->record[0]; + else + read_buf= rec_buf; + + if ((error= index_prev_in_part(part_id, read_buf))) + { + if (error == HA_ERR_END_OF_FILE) + { + if (!m_queue->empty()) + m_queue->pop(); + if (m_queue->empty()) + { + /* + If priority queue is empty, we have finished fetching rows from all + partitions. Reset the value of next partition to NONE. This would + imply HA_ERR_END_OF_FILE for all future calls. + */ + m_top_entry= NO_CURRENT_PART_ID; + } + else + { + return_top_record(buf); + DBUG_PRINT("info", ("Record returned from partition %d (2)", + m_top_entry)); + error= 0; + m_table->status= 0; + } + } + DBUG_RETURN(error); + } + /* When using ICP, copy record[0] to the priority queue for sorting. */ + if (m_handler->pushed_idx_cond) + memcpy(rec_buf, read_buf, m_rec_length); + + if (m_ref_usage != REF_NOT_USED) + { + /* position_in_last_part needs m_last_part set. */ + m_last_part= part_id; + position_in_last_part(rec_buf - m_rec_offset + PARTITION_BYTES_IN_POS, + rec_buf); + } + m_queue->update_top(); + return_top_record(buf); + DBUG_PRINT("info", ("Record returned from partition %d", m_top_entry)); + DBUG_RETURN(0); +} + +/** + Get statistics from a specific partition. + + @param[out] stat_info Area to report values into. + @param[out] check_sum Check sum of partition. + @param[in] part_id Partition to report from. 
+*/ +void +Partition_helper::get_dynamic_partition_info_low(PARTITION_STATS *stat_info, + ha_checksum *check_sum, + uint part_id) +{ + ha_statistics *part_stat= &m_handler->stats; + DBUG_ASSERT(bitmap_is_set(&m_part_info->read_partitions, part_id)); + DBUG_ASSERT(bitmap_is_subset(&m_part_info->read_partitions, + &m_part_info->lock_partitions)); + DBUG_ASSERT(bitmap_is_subset(&m_part_info->lock_partitions, + &m_part_info->read_partitions)); + bitmap_clear_all(&m_part_info->read_partitions); + bitmap_set_bit(&m_part_info->read_partitions, part_id); + m_handler->info(HA_STATUS_TIME | + HA_STATUS_VARIABLE | + HA_STATUS_VARIABLE_EXTRA | + HA_STATUS_NO_LOCK); + stat_info->records= part_stat->records; + stat_info->mean_rec_length= part_stat->mean_rec_length; + stat_info->data_file_length= part_stat->data_file_length; + stat_info->max_data_file_length= part_stat->max_data_file_length; + stat_info->index_file_length= part_stat->index_file_length; + stat_info->delete_length= part_stat->delete_length; + stat_info->create_time= part_stat->create_time; + stat_info->update_time= part_stat->update_time; + stat_info->check_time= part_stat->check_time; + if (get_thd()->variables.old_mode ? + m_handler->ha_table_flags() & HA_HAS_OLD_CHECKSUM : + m_handler->ha_table_flags() & HA_HAS_NEW_CHECKSUM) + { + *check_sum= checksum_in_part(part_id); + } + bitmap_copy(&m_part_info->read_partitions, &m_part_info->lock_partitions); +} + + +/** + Get checksum for table. + + @return Checksum or 0 if not supported, which also may be a correct checksum!. +*/ + +ha_checksum Partition_helper::ph_checksum() const +{ + ha_checksum sum= 0; + if (get_thd()->variables.old_mode ? 
+ m_handler->ha_table_flags() & HA_HAS_OLD_CHECKSUM : + m_handler->ha_table_flags() & HA_HAS_NEW_CHECKSUM) + { + for (uint i= 0; i < m_tot_parts; i++) + { + sum+= checksum_in_part(i); + } + } + return sum; +} diff --git a/sql/partitioning/partition_handler.h b/sql/partitioning/partition_handler.h new file mode 100644 index 00000000000..cf4e1dcb24b --- /dev/null +++ b/sql/partitioning/partition_handler.h @@ -0,0 +1,1113 @@ +#ifndef PARTITION_HANDLER_INCLUDED +#define PARTITION_HANDLER_INCLUDED + +/* + Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved. + + This program is free software; you can redistribute it and/or + modify it under the terms of the GNU General Public License + as published by the Free Software Foundation; version 2 of + the License. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +*/ + +#include "my_global.h" // uint etc. +#include "my_base.h" // ha_rows. +#include "handler.h" // Handler_share +#include "sql_partition.h" // part_id_range +#include "mysqld_error.h" // ER_ILLEGAL_HA +#include "priority_queue.h" +#include "key.h" // key_rec_cmp +#include "ha_partition.h" +#include + +#define PARTITION_BYTES_IN_POS 2 + +/* forward declarations */ +typedef struct st_mem_root MEM_ROOT; + +static const uint NO_CURRENT_PART_ID= UINT_MAX32; + +/** + bits in Partition_handler::alter_flags(): + + HA_PARTITION_FUNCTION_SUPPORTED indicates that the function is + supported at all. + HA_FAST_CHANGE_PARTITION means that optimized variants of the changes + exists but they are not necessarily done online. 
+ + HA_ONLINE_DOUBLE_WRITE means that the handler supports writing to both + the new partition and to the old partitions when updating through the + old partitioning schema while performing a change of the partitioning. + This means that we can support updating of the table while performing + the copy phase of the change. For no lock at all also a double write + from new to old must exist and this is not required when this flag is + set. + This is actually removed even before it was introduced the first time. + The new idea is that handlers will handle the lock level already in + store_lock for ALTER TABLE partitions. + TODO: Implement this via the alter-inplace api. +*/ + +enum enum_part_operation { + OPTIMIZE_PARTS= 0, + ANALYZE_PARTS, + CHECK_PARTS, + REPAIR_PARTS, + ASSIGN_KEYCACHE_PARTS, + PRELOAD_KEYS_PARTS +}; + + +/** + Initialize partitioning (currently only PSI keys). +*/ +void partitioning_init(); + + +/** + Class for partitioning specific operations. + + Returned from handler::get_partition_handler(). +*/ +class Partition_handler :public Sql_alloc +{ +public: + Partition_handler() {} + ~Partition_handler() {} + + bool init(uint num_parts); + + /** + Get dynamic table information from partition. + + @param[out] stat_info Statistics struct to fill in. + @param[out] check_sum Check sum value to fill in if supported. + @param[in] part_id Partition to report for. + + @note stat_info and check_sum are initialized by caller. + check_sum is only expected to be updated if HA_HAS_CHECKSUM. + */ + virtual void get_dynamic_partition_info(PARTITION_STATS *stat_info, + uint part_id) = 0; + + /** + Get default number of partitions. + + Used during creating a partitioned table. + + @param info Create info. + @return Number of default partitions. + */ + virtual int get_default_num_partitions(HA_CREATE_INFO *info) { return 1;} + /** + Setup auto partitioning. 
+ + Called for engines with HA_USE_AUTO_PARTITION to setup the partition info + object + + @param[in,out] part_info Partition object to setup. + */ + virtual void set_auto_partitions(partition_info *part_info) { return; } + /** + Get number of partitions for table in SE + + @param name normalized path(same as open) to the table + + @param[out] num_parts Number of partitions + + @retval false for success + @retval true for failure, for example table didn't exist in engine + */ + virtual bool get_num_parts(const char *name, + uint *num_parts) + { + *num_parts= 0; + return false; + } + /** + Set the partition info object to be used by the handler. + + @param part_info Partition info to be used by the handler. + @param early True if called when part_info only created and parsed, + but not setup, checked or fixed. + */ + virtual void set_part_info(partition_info *part_info) = 0; + /** + Initialize partition. + + @param mem_root Memory root for memory allocations. + + @return Operation status + @retval false Success. + @retval true Failure. + */ + virtual bool initialize_partition(MEM_ROOT *mem_root) {return false;} + + + /** + Alter flags. + + Given a set of alter table flags, return which is supported. + + @param flags Alter table operation flags. + + @return Supported alter table flags. + */ + virtual uint alter_flags(uint flags) const + { return 0; } + +private: + /** + Truncate partition. + + Low-level primitive for handler, implementing + Partition_handler::truncate_partition(). + + @return Operation status + @retval 0 Success. + @retval != 0 Error code. + */ + virtual int truncate_partition_low() + { return HA_ERR_WRONG_COMMAND; } + /** + Truncate partition. + + Low-level primitive for handler, implementing + Partition_handler::change_partitions(). + + @param[in] create_info Table create info. + @param[in] path Path including table name. + @param[out] copied Number of rows copied. + @param[out] deleted Number of rows deleted. 
+ + @return Operation status + @retval 0 Success. + @retval != 0 Error code. + */ + virtual int change_partitions_low(HA_CREATE_INFO *create_info, + const char *path, + ulonglong * const copied, + ulonglong * const deleted) + { + my_error(ER_ILLEGAL_HA, MYF(0), create_info->alias); + return HA_ERR_WRONG_COMMAND; + } + /** + Return the table handler. + + For some partitioning specific functions it is still needed to access + the handler directly for transaction handling (mark_trx_read_write()) + and to assert correct locking. + + @return handler or NULL if not supported. + */ + virtual handler *get_handler() + { return NULL; } +}; + + +/// Maps compare function to strict weak ordering required by Priority_queue. +struct Key_rec_less +{ + typedef int (*key_compare_fun)(void*, uchar *, uchar *); + + explicit Key_rec_less(KEY **keys) + : m_keys(keys), m_fun(key_rec_cmp), m_max_at_top(false) + { + } + + bool operator()(uchar *first, uchar *second) + { + const int cmpval= + (*m_fun)(m_keys, first + m_rec_offset, second + m_rec_offset); + return m_max_at_top ? cmpval < 0 : cmpval > 0; + } + + KEY **m_keys; + key_compare_fun m_fun; + uint m_rec_offset; + bool m_max_at_top; +}; + + +/** + Partition_helper is a helper class that implements most generic partitioning + functionality such as: + table scan, index scan (both ordered and non-ordered), + insert (write_row()), delete and update. + And includes ALTER TABLE ... ADD/COALESCE/DROP/REORGANIZE/... PARTITION + support. + It also implements a cache for the auto increment value and check/repair for + rows in wrong partition. + + How to use it: + Inherit it and implement: + - *_in_part() functions for row operations. + - prepare_for_new_partitions(), create_new_partition(), close_new_partitions() + write_row_in_new_part() for handling 'fast' alter partition. 
+*/ +class Partition_helper : public Sql_alloc +{ + typedef Priority_queue<uchar *, std::vector<uchar*>, Key_rec_less> Prio_queue; +public: + Partition_helper(handler *main_handler); + ~Partition_helper(); + + /** + Set partition info. + + To be called from Partition_handler. + + @param part_info Partition info to use. + @param early True if called when part_info only created and parsed, + but not setup, checked or fixed. + */ + virtual void set_part_info_low(partition_info *part_info, bool early); + /** + Initialize variables used before the table is opened. + + @param mem_root Memory root to allocate things from (not yet used). + + @return Operation status. + @retval false success. + @retval true failure. + */ + inline bool init_partitioning(MEM_ROOT *mem_root) + { +#ifndef DBUG_OFF + m_key_not_found_partitions.bitmap= NULL; +#endif + return false; + } + + + /** + INSERT/UPDATE/DELETE functions. + @see handler.h + @{ + */ + + /** + Insert a row to the partitioned table. + + @param buf The row in MySQL Row Format. + + @return Operation status. + @retval 0 Success + @retval != 0 Error code + */ + int ph_write_row(uchar *buf); + /** + Update an existing row in the partitioned table. + + Yes, update_row() does what you expect, it updates a row. old_data will + have the previous row record in it, while new_data will have the newest + data in it. + Keep in mind that the server can do updates based on ordering if an + ORDER BY clause was used. Consecutive ordering is not guaranteed. + + If the new record belongs to a different partition than the old record + then it will be inserted into the new partition and deleted from the old. + + new_data is always record[0] + old_data is always record[1] + + @param old_data The old record in MySQL Row Format. + @param new_data The new record in MySQL Row Format. + + @return Operation status. + @retval 0 Success + @retval != 0 Error code + */ + int ph_update_row(const uchar *old_data, uchar *new_data); + /** + Delete an existing row in the partitioned table.
+ + This will delete a row. buf will contain a copy of the row to be deleted. + The server will call this right after the current row has been read + (from either a previous rnd_xxx() or index_xxx() call). + If you keep a pointer to the last row or can access a primary key it will + make doing the deletion quite a bit easier. + Keep in mind that the server does no guarantee consecutive deletions. + ORDER BY clauses can be used. + + buf is either record[0] or record[1] + + @param buf The record in MySQL Row Format. + + @return Operation status. + @retval 0 Success + @retval != 0 Error code + */ + int ph_delete_row(const uchar *buf); + + /** @} */ + + /** Release unused auto increment values. */ + void ph_release_auto_increment(); + /** + Calculate key hash value from an null terminated array of fields. + Support function for KEY partitioning. + + @param field_array An array of the fields in KEY partitioning + + @return hash_value calculated + + @note Uses the hash function on the character set of the field. + Integer and floating point fields use the binary character set by default. + */ + static uint32 ph_calculate_key_hash_value(Field **field_array); + /** Get checksum for table. + @return Checksum or 0 if not supported (which also may be a correct checksum!). + */ + ha_checksum ph_checksum() const; + + /** + MODULE full table scan + + This module is used for the most basic access method for any table + handler. This is to fetch all data through a full table scan. No + indexes are needed to implement this part. + It contains one method to start the scan (rnd_init) that can also be + called multiple times (typical in a nested loop join). Then proceeding + to the next record (rnd_next) and closing the scan (rnd_end). + To remember a record for later access there is a method (position) + and there is a method used to retrieve the record based on the stored + position. + The position can be a file position, a primary key, a ROWID dependent + on the handler below. 
+ + unlike index_init(), rnd_init() can be called two times + without rnd_end() in between (it only makes sense if scan=1). + then the second call should prepare for the new table scan + (e.g if rnd_init allocates the cursor, second call should + position it to the start of the table, no need to deallocate + and allocate it again. + @see handler.h + @{ + */ + + int ph_rnd_init(bool scan); + int ph_rnd_end(); + int ph_rnd_next(uchar *buf); + void ph_position(const uchar *record); + int ph_rnd_pos(uchar *buf, uchar *pos); + int ph_rnd_pos_by_record(uchar *record); + + /** @} */ + + /** + MODULE index scan + + This part of the handler interface is used to perform access through + indexes. The interface is defined as a scan interface but the handler + can also use key lookup if the index is a unique index or a primary + key index. + Index scans are mostly useful for SELECT queries but are an important + part also of UPDATE, DELETE, REPLACE and CREATE TABLE table AS SELECT + and so forth. + Naturally an index is needed for an index scan and indexes can either + be ordered, hash based. Some ordered indexes can return data in order + but not necessarily all of them. + There are many flags that define the behavior of indexes in the + various handlers. These methods are found in the optimizer module. + ------------------------------------------------------------------------- + + index_read is called to start a scan of an index. The find_flag defines + the semantics of the scan. These flags are defined in + include/my_base.h + index_read_idx is the same but also initializes index before calling doing + the same thing as index_read. Thus it is similar to index_init followed + by index_read. This is also how we implement it. + + index_read/index_read_idx does also return the first row. Thus for + key lookups, the index_read will be the only call to the handler in + the index scan. 
+ + index_init initializes an index before using it and index_end does + any end processing needed. + @{ + */ + + int ph_index_init_setup(uint key_nr, bool sorted); + int ph_index_init(uint key_nr, bool sorted); + int ph_index_end(); + /* + These methods are used to jump to next or previous entry in the index + scan. There are also methods to jump to first and last entry. + */ + int ph_index_first(uchar *buf); + int ph_index_last(uchar *buf); + int ph_index_next(uchar *buf); + int ph_index_next_same(uchar *buf, const uchar *key, uint keylen); + int ph_index_prev(uchar *buf); + int ph_index_read_map(uchar *buf, + const uchar *key, + key_part_map keypart_map, + enum ha_rkey_function find_flag); + int ph_index_read_last_map(uchar *buf, + const uchar *key, + key_part_map keypart_map); + int ph_index_read_idx_map(uchar *buf, + uint index, + const uchar *key, + key_part_map keypart_map, + enum ha_rkey_function find_flag); + int ph_read_range_first(const key_range *start_key, + const key_range *end_key, + bool eq_range_arg, + bool sorted); + int ph_read_range_next(); + /** @} */ + + /** + Functions matching Partition_handler API. + @{ + */ + + /** + Get statistics from a specific partition. + @param[out] stat_info Area to report values into. + @param[out] check_sum Check sum of partition. + @param[in] part_id Partition to report from. + */ + virtual void get_dynamic_partition_info_low(PARTITION_STATS *stat_info, + ha_checksum *check_sum, + uint part_id); + + /** + Implement the partition changes defined by ALTER TABLE of partitions. + + Add and copy if needed a number of partitions, during this operation + only read operation is ongoing in the server. This is used by + ADD PARTITION all types as well as by REORGANIZE PARTITION. For + one-phased implementations it is used also by DROP and COALESCE + PARTITIONs. + One-phased implementation needs the new frm file, other handlers will + get zero length and a NULL reference here. 
+ + @param[in] create_info HA_CREATE_INFO object describing all + fields and indexes in table + @param[in] path Complete path of db and table name + @param[out] copied Output parameter where number of copied + records are added + @param[out] deleted Output parameter where number of deleted + records are added + + @return Operation status + @retval 0 Success + @retval != 0 Failure + */ + int change_partitions(HA_CREATE_INFO *create_info, + const char *path, + ulonglong * const copied, + ulonglong * const deleted); + /** @} */ + +protected: + /* Common helper functions to be used by inheriting engines. */ + + /* + open/close functions. + */ + + /** + Set m_part_share, Allocate internal bitmaps etc. used by open tables. + + @param mem_root Memory root to allocate things from (not yet used). + + @return Operation status. + @retval false success. + @retval true failure. + */ + bool open_partitioning(Partition_share *part_share); + /** + Close partitioning for a table. + + Frees memory and release other resources. + */ + void close_partitioning(); + + /** + Lock auto increment value if needed. + */ + inline void lock_auto_increment() + { + /* lock already taken */ + if (m_auto_increment_safe_stmt_log_lock) + return; + DBUG_ASSERT(!m_auto_increment_lock); + if(m_table->s->tmp_table == NO_TMP_TABLE) + { + m_auto_increment_lock= true; + m_part_share->lock_auto_inc(); + } + } + /** + unlock auto increment. + */ + inline void unlock_auto_increment() + { + /* + If m_auto_increment_safe_stmt_log_lock is true, we have to keep the lock. + It will be set to false and thus unlocked at the end of the statement by + ha_partition::release_auto_increment. + */ + if(m_auto_increment_lock && !m_auto_increment_safe_stmt_log_lock) + { + m_part_share->unlock_auto_inc(); + m_auto_increment_lock= false; + } + } + /** + Get auto increment. + + Only to be used for auto increment values that are the first field in + an unique index. + + @param[in] increment Increment between generated numbers. 
+ @param[in] nb_desired_values Number of values requested. + @param[out] first_value First reserved value (ULLONG_MAX on error). + @param[out] nb_reserved_values Number of values reserved. + */ + void get_auto_increment_first_field(ulonglong increment, + ulonglong nb_desired_values, + ulonglong *first_value, + ulonglong *nb_reserved_values); + + /** + Initialize the record priority queue used for sorted index scans. + @return Operation status. + @retval 0 Success. + @retval != 0 Error code. + */ + int init_record_priority_queue(); + /** + Destroy the record priority queue used for sorted index scans. + */ + void destroy_record_priority_queue(); + /* + Administrative support functions. + */ + + /** Print partitioning specific error. + @param error Error code. + @param errflag Error flag. + @return false if error is printed else true. + */ + bool print_partition_error(int error, myf errflag); +#if 0 + /** + Print a message row formatted for ANALYZE/CHECK/OPTIMIZE/REPAIR TABLE. + + Modeled after mi_check_print_msg. + + @param thd Thread context. + @param len Needed length for message buffer. + @param msg_type Message type. + @param db_name Database name. + @param table_name Table name. + @param op_name Operation name. + @param fmt Message (in printf format with additional arguments). + + @return Operation status. + @retval false for success else true. + */ + bool print_admin_msg(THD *thd, + uint len, + const char *msg_type, + const char *db_name, + const char *table_name, + const char *op_name, + const char *fmt, + ...); +#endif + /** + Check/fix misplaced rows. + + @param part_id Partition to check/fix. + @param repair If true, move misplaced rows to correct partition. + + @return Operation status. + @retval 0 Success + @retval != 0 Error + */ + int check_misplaced_rows(uint part_id, bool repair); + /** + Set used partitions bitmap from Alter_info. + + @return false if success else true. 
+ */ + bool set_altered_partitions(); + +private: + enum partition_index_scan_type + { + PARTITION_INDEX_READ= 1, + PARTITION_INDEX_FIRST, + PARTITION_INDEX_FIRST_UNORDERED, + PARTITION_INDEX_LAST, + PARTITION_INDEX_READ_LAST, + PARTITION_READ_RANGE, + PARTITION_NO_INDEX_SCAN + }; + + /** handler to use (ha_partition, ha_innopart etc.) */ + handler *m_handler; + /** Convenience pointer to table from m_handler (i.e. m_handler->table). */ + TABLE *m_table; + + /* + Access methods to protected areas in handler to avoid adding + friend class Partition_helper in class handler. + */ + virtual THD *get_thd() const = 0; + virtual TABLE *get_table() const = 0; + virtual bool get_eq_range() const = 0; + virtual void set_eq_range(bool eq_range) = 0; + virtual void set_range_key_part(KEY_PART_INFO *key_part) = 0; + + /* + Implementation of per partition operation by instantiated engine. + These must be implemented in the 'real' partition_helper subclass. + */ + + /** + Write a row in the specified partition. + + @see handler::write_row(). + + @param part_id Partition to write to. + @param buf Buffer with data to write. + + @return Operation status. + @retval 0 Success. + @retval != 0 Error code. + */ + virtual int write_row_in_part(uint part_id, uchar *buf) = 0; + /** + Update a row in the specified partition. + + @see handler::update_row(). + + @param part_id Partition to update in. + @param old_data Buffer containing old row. + @param new_data Buffer containing new row. + + @return Operation status. + @retval 0 Success. + @retval != 0 Error code. + */ + virtual int update_row_in_part(uint new_part_id, + const uchar *old_data, + uchar *new_data) = 0; + /** + Delete an existing row in the specified partition. + + @see handler::delete_row(). + + @param part_id Partition to delete from. + @param buf Buffer containing row to delete. + + @return Operation status. + @retval 0 Success. + @retval != 0 Error code. 
+ */ + virtual int delete_row_in_part(uint part_id, const uchar *buf) = 0; + /** + Initialize the shared auto increment value. + + @param no_lock If HA_STATUS_NO_LOCK should be used in info(HA_STATUS_AUTO). + + Also sets stats.auto_increment_value. + */ + virtual int initialize_auto_increment(bool no_lock) = 0; + /** Release auto_increment in all underlying partitions. */ + virtual void release_auto_increment_all_parts() {} + /** Save or persist the current max auto increment. */ + virtual void save_auto_increment(ulonglong nr) {} + /** + Per partition equivalent of rnd_* and index_* functions. + + @see class handler. + */ + virtual int rnd_init_in_part(uint part_id, bool table_scan) = 0; + int ph_rnd_next_in_part(uint part_id, uchar *buf); + virtual int rnd_next_in_part(uint part_id, uchar *buf) = 0; + virtual int rnd_end_in_part(uint part_id, bool scan) = 0; + virtual void position_in_last_part(uchar *ref, const uchar *row) = 0; + /* If ph_rnd_pos is used then this needs to be implemented! */ + virtual int rnd_pos_in_part(uint part_id, uchar *buf, uchar *pos) + { DBUG_ASSERT(0); return HA_ERR_WRONG_COMMAND; } + virtual int rnd_pos_by_record_in_last_part(uchar *row) + { + /* + Not much overhead to use default function. This avoids out-of-sync code. 
+ */ + return m_handler->rnd_pos_by_record(row); + } + virtual int index_init_in_part(uint part, uint keynr, bool sorted) + { DBUG_ASSERT(0); return HA_ERR_WRONG_COMMAND; } + virtual int index_end_in_part(uint part) + { DBUG_ASSERT(0); return HA_ERR_WRONG_COMMAND; } + virtual int index_first_in_part(uint part, uchar *buf) = 0; + virtual int index_last_in_part(uint part, uchar *buf) = 0; + virtual int index_prev_in_part(uint part, uchar *buf) = 0; + virtual int index_next_in_part(uint part, uchar *buf) = 0; + virtual int index_next_same_in_part(uint part, + uchar *buf, + const uchar *key, + uint length) = 0; + virtual int index_read_map_in_part(uint part, + uchar *buf, + const uchar *key, + key_part_map keypart_map, + enum ha_rkey_function find_flag) = 0; + virtual int index_read_last_map_in_part(uint part, + uchar *buf, + const uchar *key, + key_part_map keypart_map) = 0; + /** + Do read_range_first in the specified partition. + If buf is set, then copy the result there instead of table->record[0]. + */ + virtual int read_range_first_in_part(uint part, + uchar *buf, + const key_range *start_key, + const key_range *end_key, + bool eq_range, + bool sorted) = 0; + /** + Do read_range_next in the specified partition. + If buf is set, then copy the result there instead of table->record[0]. + */ + virtual int read_range_next_in_part(uint part, uchar *buf) = 0; + virtual int index_read_idx_map_in_part(uint part, + uchar *buf, + uint index, + const uchar *key, + key_part_map keypart_map, + enum ha_rkey_function find_flag) = 0; + /** + Initialize engine specific resources for the record priority queue + used duing ordered index reads for multiple partitions. + + @param used_parts Number of partitions used in query + (number of set bits in m_part_info->read_partitions). + + @return Operation status. + @retval 0 Success. + @retval != 0 Error code. 
+ */ + virtual int init_record_priority_queue_for_parts(uint used_parts) + { + return 0; + } + /** + Destroy and release engine specific resources used by the record + priority queue. + */ + virtual void destroy_record_priority_queue_for_parts() {} + /** + Checksum for a partition. + + @param part_id Partition to checksum. + */ + virtual ha_checksum checksum_in_part(uint part_id) const + { DBUG_ASSERT(0); return 0; } + /** + Copy a cached row. + + Used when copying a row from the record priority queue to the return buffer. + For some engines, like InnoDB, only marked columns must be copied, + to preserve non-read columns. + + @param[out] to_rec Buffer to copy to. + @param[in] from_rec Buffer to copy from. + */ + virtual void copy_cached_row(uchar *to_rec, const uchar *from_rec) + { memcpy(to_rec, from_rec, m_rec_length); } + /** + Prepare for creating new partitions during ALTER TABLE ... PARTITION. + @param num_partitions Number of new partitions to be created. + @param only_create True if only creating the partition + (no open/lock is needed). + + @return Operation status. + @retval 0 Success. + @retval != 0 Error code. + */ + virtual int prepare_for_new_partitions(uint num_partitions, + bool only_create) = 0; + /** + Create a new partition to be filled during ALTER TABLE ... PARTITION. + @param table Table to create the partition in. + @param create_info Table/partition specific create info. + @param part_name Partition name. + @param new_part_id Partition id in new table. + @param part_elem Partition element. + + @return Operation status. + @retval 0 Success. + @retval != 0 Error code. + */ + virtual int create_new_partition(TABLE *table, + HA_CREATE_INFO *create_info, + const char *part_name, + uint new_part_id, + partition_element *part_elem) = 0; + /** + Close and finalize new partitions. + */ + virtual void close_new_partitions() = 0; + /** + write row to new partition. + @param new_part New partition to write to. + + @return Operation status. 
+ @retval 0 Success. + @retval != 0 Error code. + */ + virtual int write_row_in_new_part(uint new_part) = 0; + + /* Internal helper functions*/ + /** + Update auto increment value if current row contains a higher value. + */ + inline void set_auto_increment_if_higher(); + /** + Common routine to set up index scans. + + Find out which partitions we'll need to read when scanning the specified + range. + + If we need to scan only one partition, set m_ordered_scan_ongoing=FALSE + as we will not need to do merge ordering. + + @param buf Buffer to later return record in (this function + needs it to calculate partitioning function values) + + @param idx_read_flag True <=> m_start_key has range start endpoint which + probably can be used to determine the set of + partitions to scan. + False <=> there is no start endpoint. + + @return Operation status. + @retval 0 Success + @retval !=0 Error code + */ + int partition_scan_set_up(uchar *buf, bool idx_read_flag); + /** + Common routine to handle index_next with unordered results. + + These routines are used to scan partitions without considering order. + This is performed in two situations. + 1) In read_multi_range this is the normal case + 2) When performing any type of index_read, index_first, index_last where + all fields in the partition function is bound. In this case the index + scan is performed on only one partition and thus it isn't necessary to + perform any sort. + + @param[out] buf Read row in MySQL Row Format. + @param[in] next_same Called from index_next_same. + + @return Operation status. + @retval HA_ERR_END_OF_FILE End of scan + @retval 0 Success + @retval other Error code + */ + int handle_unordered_next(uchar *buf, bool is_next_same); + /** + Handle index_next when changing to new partition. + + This routine is used to start the index scan on the next partition. + Both initial start and after completing scan on one partition. + + @param[out] buf Read row in MySQL Row Format + + @return Operation status. 
+ @retval HA_ERR_END_OF_FILE End of scan + @retval 0 Success + @retval other Error code + */ + int handle_unordered_scan_next_partition(uchar *buf); + /** + Common routine to start index scan with ordered results. + + @param[out] buf Read row in MySQL Row Format + + @return Operation status + @retval HA_ERR_END_OF_FILE End of scan + @retval HA_ERR_KEY_NOT_FOUND End of scan + @retval 0 Success + @retval other Error code + */ + int handle_ordered_index_scan(uchar *buf); + /** + Add index_next/prev results from partitions without exact match. + + If there where any partitions that returned HA_ERR_KEY_NOT_FOUND when + ha_index_read_map was done, those partitions must be included in the + following index_next/prev call. + + @return Operation status + @retval HA_ERR_END_OF_FILE End of scan + @retval 0 Success + @retval other Error code + */ + int handle_ordered_index_scan_key_not_found(); + /** + Common routine to handle index_prev with ordered results. + + @param[out] buf Read row in MySQL Row Format. + + @return Operation status. + @retval HA_ERR_END_OF_FILE End of scan + @retval 0 Success + @retval other Error code + */ + int handle_ordered_prev(uchar *buf); + /** + Common routine to handle index_next with ordered results. + + @param[out] buf Read row in MySQL Row Format. + @param[in] next_same Called from index_next_same. + + @return Operation status. + @retval HA_ERR_END_OF_FILE End of scan + @retval 0 Success + @retval other Error code + */ + int handle_ordered_next(uchar *buf, bool is_next_same); + /** + Common routine for a number of index_read variants. + + @param[out] buf Buffer where the record should be returned. + @param[in] have_start_key TRUE <=> the left endpoint is available, i.e. + we're in index_read call or in read_range_first + call and the range has left endpoint. + FALSE <=> there is no left endpoint (we're in + read_range_first() call and the range has no + left endpoint). 
+ + @return Operation status + @retval 0 OK + @retval HA_ERR_END_OF_FILE Whole index scanned, without finding the record. + @retval HA_ERR_KEY_NOT_FOUND Record not found, but index cursor positioned. + @retval other Error code. + */ + int common_index_read(uchar *buf, bool have_start_key); + /** + Common routine for index_first/index_last. + + @param[out] buf Read row in MySQL Row Format. + + @return Operation status. + @retval 0 Success + @retval != 0 Error code + */ + int common_first_last(uchar *buf); + /** + Return the top record in sort order. + + @param[out] buf Row returned in MySQL Row Format. + */ + void return_top_record(uchar *buf); + /** + Copy partitions as part of ALTER TABLE of partitions. + + change_partitions has done all the preparations, now it is time to + actually copy the data from the reorganized partitions to the new + partitions. + + @param[out] copied Number of records copied. + @param[out] deleted Number of records deleted. + + @return Operation status + @retval 0 Success + @retval >0 Error code + */ + virtual int copy_partitions(ulonglong * const copied, + ulonglong * const deleted); + + /** + Set table->read_set taking partitioning expressions into account. + */ + void set_partition_read_set(); + + /* + These could be private as well, + but easier to expose them to derived classes to use. + */ +protected: + /** All internal partitioning data! @{ */ + /** Tables partitioning info (same as table->part_info) */ + partition_info *m_part_info; + /** Is primary key clustered. */ + bool m_pkey_is_clustered; + /** Cached value of m_part_info->is_sub_partitioned(). */ + bool m_is_sub_partitioned; + /** Partition share for auto_inc handling. */ + Partition_share *m_part_share; + /** Total number of partitions. */ + uint m_tot_parts; + uint m_last_part; // Last accessed partition. + const uchar *m_err_rec; // record which gave error. 
+ bool m_auto_increment_safe_stmt_log_lock; + bool m_auto_increment_lock; + part_id_range m_part_spec; // Which parts to scan + uint m_scan_value; // Value passed in rnd_init + // call + key_range m_start_key; // index read key range + enum partition_index_scan_type m_index_scan_type;// What type of index + // scan + uint m_rec_length; // Local copy of record length + + bool m_ordered; // Ordered/Unordered index scan. + bool m_ordered_scan_ongoing; // Ordered index scan ongoing. + bool m_reverse_order; // Scanning in reverse order (prev). + /** Row and key buffer for ordered index scan. */ + uchar *m_ordered_rec_buffer; + /** Prio queue used by sorted read. */ + Prio_queue *m_queue; + /** Which partition is to deliver next result. */ + uint m_top_entry; + /** Offset in m_ordered_rec_buffer from part buffer to its record buffer. */ + uint m_rec_offset; + /** + Current index used for sorting. + If clustered PK exists, then it will be used as secondary index to + sort on if the first is equal in key_rec_cmp. + So if clustered pk: m_curr_key_info[0]= current index and + m_curr_key_info[1]= pk and [2]= NULL. + Otherwise [0]= current index, [1]= NULL, and we will + sort by rowid as secondary sort key if equal first key. + */ + KEY *m_curr_key_info[3]; + enum enum_using_ref { + /** handler::ref is not copied to the PQ. */ + REF_NOT_USED= 0, + /** + handler::ref is copied to the PQ but does not need to be used in sorting. + */ + REF_STORED_IN_PQ, + /** handler::ref is copied to the PQ and must be used during sorting. */ + REF_USED_FOR_SORT}; + /** How handler::ref is used in the priority queue. */ + enum_using_ref m_ref_usage; + /** Set if previous index_* call returned HA_ERR_KEY_NOT_FOUND. */ + bool m_key_not_found; + /** Partitions that returned HA_ERR_KEY_NOT_FOUND. 
*/ + MY_BITMAP m_key_not_found_partitions; + /** @} */ +}; +#endif /* PARTITION_HANDLER_INCLUDED */ diff --git a/sql/share/errmsg-utf8.txt b/sql/share/errmsg-utf8.txt index b30240f64c3..0aeaa058cf9 100644 --- a/sql/share/errmsg-utf8.txt +++ b/sql/share/errmsg-utf8.txt @@ -7523,8 +7523,17 @@ WARN_VERS_PARAMETERS WARN_VERS_PART_ROTATION eng "Switching from partition %`s to %`s" +WARN_VERS_TRX_MISSING + eng "VTQ missing transaction ID %lu" + +WARN_VERS_PART_NON_HISTORICAL + eng "Partition %`s contains non-historical data" + ER_VERS_NOT_ALLOWED eng "%`s is not allowed for versioned table" ER_VERS_WRONG_QUERY_TYPE eng "%`s works only with %`s query type" + +ER_WRONG_TABLESPACE_NAME 42000 + eng "Incorrect tablespace name `%-.192s`" diff --git a/sql/sql_partition.cc b/sql/sql_partition.cc index b358fe3386e..caca441e5e4 100644 --- a/sql/sql_partition.cc +++ b/sql/sql_partition.cc @@ -67,6 +67,7 @@ #include "opt_range.h" // store_key_image_to_rec #include "sql_alter.h" // Alter_table_ctx #include "sql_select.h" +#include "sql_tablespace.h" // check_tablespace_name #include <algorithm> using std::max; @@ -3458,7 +3459,10 @@ int vers_get_partition_id(partition_info *part_info, { table->s->busy_rotation= true; mysql_mutex_unlock(&table->s->LOCK_rotation); - if (part_info->vers_limit_exceed() || part_info->vers_interval_exceed(sys_trx_end->get_timestamp())) + // transaction is not yet pushed to VTQ, so we use now-time + my_time_t end_ts= sys_trx_end->table->versioned_by_engine() ? + my_time(0) : sys_trx_end->get_timestamp(); + if (part_info->vers_limit_exceed() || part_info->vers_interval_exceed(end_ts)) { part_info->vers_part_rotate(thd); } @@ -7388,6 +7392,39 @@ err: } #endif + +/* + Prepare for calling val_int on partition function by setting fields to + point to the record where the values of the PF-fields are stored.
+ + SYNOPSIS + set_field_ptr() + ptr Array of fields to change ptr + new_buf New record pointer + old_buf Old record pointer + + DESCRIPTION + Set ptr in field objects of field array to refer to new_buf record + instead of previously old_buf. Used before calling val_int and after + it is used to restore pointers to table->record[0]. + This routine is placed outside of partition code since it can be useful + also for other programs. +*/ + +void set_field_ptr(Field **ptr, const uchar *new_buf, + const uchar *old_buf) +{ + my_ptrdiff_t diff= (new_buf - old_buf); + DBUG_ENTER("set_field_ptr"); + + do + { + (*ptr)->move_field_offset(diff); + } while (*(++ptr)); + DBUG_VOID_RETURN; +} + + /* Prepare for calling val_int on partition function by setting fields to point to the record where the values of the PF-fields are stored. @@ -7426,6 +7463,61 @@ void set_key_field_ptr(KEY *key_info, const uchar *new_buf, } +/** + Append all fields in read_set to string + + @param[in,out] str String to append to. + @param[in] row Row to append. + @param[in] table Table containing read_set and fields for the row. +*/ +void append_row_to_str(String &str, const uchar *row, TABLE *table) +{ + Field **fields, **field_ptr; + const uchar *rec; + uint num_fields= bitmap_bits_set(table->read_set); + uint curr_field_index= 0; + bool is_rec0= !row || row == table->record[0]; + if (!row) + rec= table->record[0]; + else + rec= row; + + /* Create a new array of all read fields. 
*/ + fields= (Field**) my_malloc(sizeof(void*) * (num_fields + 1), + MYF(0)); + if (!fields) + return; + fields[num_fields]= NULL; + for (field_ptr= table->field; + *field_ptr; + field_ptr++) + { + if (!bitmap_is_set(table->read_set, (*field_ptr)->field_index)) + continue; + fields[curr_field_index++]= *field_ptr; + } + + + if (!is_rec0) + set_field_ptr(fields, rec, table->record[0]); + + for (field_ptr= fields; + *field_ptr; + field_ptr++) + { + Field *field= *field_ptr; + str.append(" "); + str.append(field->field_name); + str.append(":"); + field_unpack(&str, field, rec, 0, false); + } + + if (!is_rec0) + set_field_ptr(fields, table->record[0], rec); + my_free(fields); +} + + /* SYNOPSIS mem_alloc_error() @@ -8595,4 +8687,52 @@ uint get_partition_field_store_length(Field *field) store_length+= HA_KEY_BLOB_LENGTH; return store_length; } + +// FIXME: duplicate of ha_partition::set_up_table_before_create +bool set_up_table_before_create(THD *thd, + TABLE_SHARE *share, + const char *partition_name_with_path, + HA_CREATE_INFO *info, + partition_element *part_elem) +{ + bool error= false; + const char *partition_name; + DBUG_ENTER("set_up_table_before_create"); + + DBUG_ASSERT(part_elem); + + if (!part_elem) + DBUG_RETURN(true); + share->max_rows= part_elem->part_max_rows; + share->min_rows= part_elem->part_min_rows; + partition_name= strrchr(partition_name_with_path, FN_LIBCHAR); + if ((part_elem->index_file_name && + (error= append_file_to_dir(thd, + const_cast(&part_elem->index_file_name), + partition_name+1))) || + (part_elem->data_file_name && + (error= append_file_to_dir(thd, + const_cast(&part_elem->data_file_name), + partition_name+1)))) + { + DBUG_RETURN(error); + } + if (part_elem->index_file_name != NULL) + { + info->index_file_name= part_elem->index_file_name; + } + if (part_elem->data_file_name != NULL) + { + info->data_file_name= part_elem->data_file_name; + } + if (part_elem->tablespace_name != NULL) + { + if 
(check_tablespace_name(part_elem->tablespace_name) != IDENT_NAME_OK) + { + DBUG_RETURN(true); + } + info->tablespace= part_elem->tablespace_name; + } + DBUG_RETURN(error); +} #endif diff --git a/sql/sql_partition.h b/sql/sql_partition.h index c2665a8366b..aef4a6ce5e1 100644 --- a/sql/sql_partition.h +++ b/sql/sql_partition.h @@ -40,6 +40,7 @@ typedef struct st_key_range key_range; #define HA_CAN_UPDATE_PARTITION_KEY (1 << 1) #define HA_CAN_PARTITION_UNIQUE (1 << 2) #define HA_USE_AUTO_PARTITION (1 << 3) +#define HA_ONLY_VERS_PARTITION (1 << 4) #define NORMAL_PART_NAME 0 #define TEMP_PART_NAME 1 @@ -127,6 +128,14 @@ uint32 get_partition_id_range_for_endpoint(partition_info *part_info, bool check_part_func_fields(Field **ptr, bool ok_with_charsets); bool field_is_partition_charset(Field *field); Item* convert_charset_partition_constant(Item *item, CHARSET_INFO *cs); +/** + Append all fields in read_set to string + + @param[in,out] str String to append to. + @param[in] row Row to append. + @param[in] table Table containing read_set and fields for the row. +*/ +void append_row_to_str(String &str, const uchar *row, TABLE *table); void mem_alloc_error(size_t size); void truncate_partition_filename(char *path); @@ -291,6 +300,31 @@ void create_subpartition_name(char *out, const char *in1, void set_key_field_ptr(KEY *key_info, const uchar *new_buf, const uchar *old_buf); +/** Set up table for creating a partition. +Copy info from partition to the table share so the created partition +has the correct info. + @param thd THD object + @param share Table share to be updated. + @param info Create info to be updated. + @param part_elem partition_element containing the info. 
+ + @return status + @retval TRUE Error + @retval FALSE Success + + @details + Set up + 1) Comment on partition + 2) MAX_ROWS, MIN_ROWS on partition + 3) Index file name on partition + 4) Data file name on partition +*/ +bool set_up_table_before_create(THD *thd, + TABLE_SHARE *share, + const char *partition_name_with_path, + HA_CREATE_INFO *info, + partition_element *part_elem); + extern const LEX_STRING partition_keywords[]; #endif /* SQL_PARTITION_INCLUDED */ diff --git a/sql/sql_table.cc b/sql/sql_table.cc index b5cf35ed17c..3a921e0dc79 100644 --- a/sql/sql_table.cc +++ b/sql/sql_table.cc @@ -4492,7 +4492,10 @@ handler *mysql_create_frm_image(THD *thd, part_info->part_info_string= part_syntax_buf; part_info->part_info_len= syntax_len; if ((!(engine_type->partition_flags && - engine_type->partition_flags() & HA_CAN_PARTITION)) || + ((engine_type->partition_flags() & HA_CAN_PARTITION) || + (part_info->part_type == VERSIONING_PARTITION && + engine_type->partition_flags() & HA_ONLY_VERS_PARTITION)) + )) || create_info->db_type == partition_hton) { /* diff --git a/sql/sql_tablespace.cc b/sql/sql_tablespace.cc index 8b9e14e5a18..318be320640 100644 --- a/sql/sql_tablespace.cc +++ b/sql/sql_tablespace.cc @@ -22,6 +22,70 @@ #include "sql_table.h" // write_bin_log #include "sql_class.h" // THD +/** + Check if tablespace name is valid + + @param tablespace_name Name of the tablespace + + @note Tablespace names are not reflected in the file system, so + character case conversion or consideration is not relevant. + + @note Checking for path characters or ending space is not done. + The only checks are for identifier length, both in terms of + number of characters and number of bytes. 
+ + @retval IDENT_NAME_OK Identifier name is ok (Success) + @retval IDENT_NAME_WRONG Identifier name is wrong, if length == 0 +* (ER_WRONG_TABLESPACE_NAME) + @retval IDENT_NAME_TOO_LONG Identifier name is too long if it is greater + than 64 characters (ER_TOO_LONG_IDENT) + + @note In case of IDENT_NAME_TOO_LONG or IDENT_NAME_WRONG, the function + reports an error (using my_error()). +*/ + +enum_ident_name_check check_tablespace_name(const char *tablespace_name) +{ + size_t name_length= 0; //< Length as number of bytes + size_t name_length_symbols= 0; //< Length as number of symbols + + // Name must be != NULL and length must be > 0 + if (!tablespace_name || (name_length= strlen(tablespace_name)) == 0) + { + my_error(ER_WRONG_TABLESPACE_NAME, MYF(0), tablespace_name); + return IDENT_NAME_WRONG; + } + + // If we do not have too many bytes, we must check the number of symbols, + // provided the system character set may use more than one byte per symbol. + if (name_length <= NAME_LEN && use_mb(system_charset_info)) + { + const char *name= tablespace_name; //< The actual tablespace name + const char *end= name + name_length; //< Pointer to first byte after name + + // Loop over all symbols as long as we don't have too many already + while (name != end && name_length_symbols <= NAME_CHAR_LEN) + { + int len= my_ismbchar(system_charset_info, name, end); + if (len) + name += len; + else + name++; + + name_length_symbols++; + } + } + + if (name_length_symbols > NAME_CHAR_LEN || name_length > NAME_LEN) + { + my_error(ER_TOO_LONG_IDENT, MYF(0), tablespace_name); + return IDENT_NAME_TOO_LONG; + } + + return IDENT_NAME_OK; +} + + int mysql_alter_tablespace(THD *thd, st_alter_tablespace *ts_info) { int error= HA_ADMIN_NOT_IMPLEMENTED; diff --git a/sql/sql_tablespace.h b/sql/sql_tablespace.h index ae77d15cbcb..b97c64f7965 100644 --- a/sql/sql_tablespace.h +++ b/sql/sql_tablespace.h @@ -19,6 +19,41 @@ class THD; class st_alter_tablespace; +/** + Enumerate possible status of a 
identifier name while determining + its validity +*/ +enum enum_ident_name_check +{ + IDENT_NAME_OK, + IDENT_NAME_WRONG, + IDENT_NAME_TOO_LONG +}; + +/** + Check if tablespace name is valid + + @param tablespace_name Name of the tablespace + + @note Tablespace names are not reflected in the file system, so + character case conversion or consideration is not relevant. + + @note Checking for path characters or ending space is not done. + The only checks are for identifier length, both in terms of + number of characters and number of bytes. + + @retval IDENT_NAME_OK Identifier name is ok (Success) + @retval IDENT_NAME_WRONG Identifier name is wrong, if length == 0 + (ER_WRONG_TABLESPACE_NAME) + @retval IDENT_NAME_TOO_LONG Identifier name is too long if it is greater + than 64 characters (ER_TOO_LONG_IDENT) + + @note In case of IDENT_NAME_TOO_LONG or IDENT_NAME_WRONG, the function + reports an error (using my_error()). +*/ + +enum_ident_name_check check_tablespace_name(const char *tablespace_name); + int mysql_alter_tablespace(THD* thd, st_alter_tablespace *ts_info); #endif /* SQL_TABLESPACE_INCLUDED */ diff --git a/sql/table.cc b/sql/table.cc index 354658ba476..b256b3e91b6 100644 --- a/sql/table.cc +++ b/sql/table.cc @@ -3261,6 +3261,20 @@ enum open_frm_error open_table_from_share(THD *thd, TABLE_SHARE *share, } outparam->part_info->is_auto_partitioned= share->auto_partitioned; DBUG_PRINT("info", ("autopartitioned: %u", share->auto_partitioned)); + if (outparam->part_info->part_type == VERSIONING_PARTITION && + share->db_type()->vers_upgrade_handler) + { + outparam->file= share->db_type()->vers_upgrade_handler( + outparam->file, &outparam->mem_root); + if (!outparam->file) + { + thd->stmt_arena= backup_stmt_arena_ptr; + thd->restore_active_arena(&part_func_arena, &backup_arena); + my_error(ER_OUTOFMEMORY, MYF(0), 4095); + error_reported= TRUE; + goto err; + } + } /* We should perform the fix_partition_func in either local or caller's arena depending on 
work_part_info_used value. -- cgit v1.2.1