summaryrefslogtreecommitdiff
path: root/sql
diff options
context:
space:
mode:
authorAleksey Midenkov <midenok@gmail.com>2017-02-20 10:06:58 +0300
committerAleksey Midenkov <midenok@gmail.com>2017-05-05 20:36:28 +0300
commitfc7da4dd4f1e2b9b78b292f20d8fe61f1e9a1d11 (patch)
tree72d251bf8b315752eed47a093630c658c5b85282 /sql
parentfb801289f314bee6e5b1864f3ef58f8f38a59278 (diff)
downloadmariadb-git-fc7da4dd4f1e2b9b78b292f20d8fe61f1e9a1d11.tar.gz
IB, SQL: InnoDB partitioning [closes #118]
* native InnoDB partitioning for BY SYSTEM_TIME partitions.
Diffstat (limited to 'sql')
-rw-r--r--sql/CMakeLists.txt4
-rw-r--r--sql/ha_partition.cc23
-rw-r--r--sql/ha_partition.h117
-rw-r--r--sql/handler.cc6
-rw-r--r--sql/handler.h27
-rw-r--r--sql/partition_info.cc192
-rw-r--r--sql/partition_info.h49
-rw-r--r--sql/partitioning/partition_handler.cc3746
-rw-r--r--sql/partitioning/partition_handler.h1113
-rw-r--r--sql/share/errmsg-utf8.txt9
-rw-r--r--sql/sql_partition.cc142
-rw-r--r--sql/sql_partition.h34
-rw-r--r--sql/sql_table.cc5
-rw-r--r--sql/sql_tablespace.cc64
-rw-r--r--sql/sql_tablespace.h35
-rw-r--r--sql/table.cc14
16 files changed, 5533 insertions, 47 deletions
diff --git a/sql/CMakeLists.txt b/sql/CMakeLists.txt
index 1dfa313a70c..08a39b1975d 100644
--- a/sql/CMakeLists.txt
+++ b/sql/CMakeLists.txt
@@ -121,7 +121,7 @@ SET (SQL_SOURCE
rpl_tblmap.cc sql_binlog.cc event_scheduler.cc event_data_objects.cc
event_queue.cc event_db_repository.cc
sql_tablespace.cc events.cc ../sql-common/my_user.c
- partition_info.cc rpl_utility.cc rpl_injector.cc sql_locale.cc
+ partition_info.cc partitioning/partition_handler.cc rpl_utility.cc rpl_injector.cc sql_locale.cc
rpl_rli.cc rpl_mi.cc sql_servers.cc sql_audit.cc
sql_connect.cc scheduler.cc sql_partition_admin.cc
sql_profile.cc event_parse_data.cc sql_alter.cc
@@ -165,7 +165,7 @@ IF (CMAKE_SYSTEM_NAME MATCHES "Linux" OR
ENDIF()
-MYSQL_ADD_PLUGIN(partition ha_partition.cc STORAGE_ENGINE DEFAULT STATIC_ONLY
+MYSQL_ADD_PLUGIN(partition ha_partition.cc partitioning/partition_handler.cc STORAGE_ENGINE DEFAULT STATIC_ONLY
RECOMPILE_FOR_EMBEDDED)
MYSQL_ADD_PLUGIN(sql_sequence ha_sequence.cc STORAGE_ENGINE MANDATORY STATIC_ONLY
RECOMPILE_FOR_EMBEDDED)
diff --git a/sql/ha_partition.cc b/sql/ha_partition.cc
index 0fa461e1807..747b9a8871f 100644
--- a/sql/ha_partition.cc
+++ b/sql/ha_partition.cc
@@ -160,9 +160,6 @@ static int partition_initialize(void *p)
bool Partition_share::init(uint num_parts)
{
DBUG_ENTER("Partition_share::init");
- mysql_mutex_init(key_partition_auto_inc_mutex,
- &auto_inc_mutex,
- MY_MUTEX_INIT_FAST);
auto_inc_initialized= false;
partition_name_hash_initialized= false;
next_auto_inc_val= 0;
@@ -1246,12 +1243,12 @@ int ha_partition::handle_opt_part(THD *thd, HA_CHECK_OPT *check_opt,
(modelled after mi_check_print_msg)
TODO: move this into the handler, or rewrite mysql_admin_table.
*/
-static bool print_admin_msg(THD* thd, uint len,
+bool print_admin_msg(THD* thd, uint len,
const char* msg_type,
const char* db_name, String &table_name,
const char* op_name, const char *fmt, ...)
ATTRIBUTE_FORMAT(printf, 7, 8);
-static bool print_admin_msg(THD* thd, uint len,
+bool print_admin_msg(THD* thd, uint len,
const char* msg_type,
const char* db_name, String &table_name,
const char* op_name, const char *fmt, ...)
@@ -5731,6 +5728,22 @@ int ha_partition::index_next_same(uchar *buf, const uchar *key, uint keylen)
}
+int ha_partition::index_read_last_map(uchar *buf,
+ const uchar *key,
+ key_part_map keypart_map)
+{
+ DBUG_ENTER("ha_partition::index_read_last_map");
+
+ m_ordered= true; // Safety measure
+ end_range= NULL;
+ m_index_scan_type= partition_index_read_last;
+ m_start_key.key= key;
+ m_start_key.keypart_map= keypart_map;
+ m_start_key.flag= HA_READ_PREFIX_LAST;
+ DBUG_RETURN(common_index_read(buf, true));
+}
+
+
/*
Read next record when performing index scan backwards
diff --git a/sql/ha_partition.h b/sql/ha_partition.h
index 2c7f4a0861f..861ba47b94e 100644
--- a/sql/ha_partition.h
+++ b/sql/ha_partition.h
@@ -77,43 +77,118 @@ public:
};
+extern PSI_mutex_key key_partition_auto_inc_mutex;
+
/**
Partition specific Handler_share.
*/
class Partition_share : public Handler_share
{
public:
- bool auto_inc_initialized;
- mysql_mutex_t auto_inc_mutex; /**< protecting auto_inc val */
- ulonglong next_auto_inc_val; /**< first non reserved value */
- /**
- Hash of partition names. Initialized in the first ha_partition::open()
- for the table_share. After that it is read-only, i.e. no locking required.
- */
- bool partition_name_hash_initialized;
- HASH partition_name_hash;
- /** Storage for each partitions Handler_share */
- Parts_share_refs *partitions_share_refs;
- Partition_share() {}
+ Partition_share()
+ : auto_inc_initialized(false),
+ next_auto_inc_val(0),
+ partition_name_hash_initialized(false),
+ partitions_share_refs(NULL),
+ partition_names(NULL)
+ {
+ mysql_mutex_init(key_partition_auto_inc_mutex,
+ &auto_inc_mutex,
+ MY_MUTEX_INIT_FAST);
+ }
+
~Partition_share()
{
- DBUG_ENTER("Partition_share::~Partition_share");
mysql_mutex_destroy(&auto_inc_mutex);
+ if (partition_names)
+ {
+ my_free(partition_names);
+ }
if (partition_name_hash_initialized)
+ {
my_hash_free(&partition_name_hash);
+ }
if (partitions_share_refs)
delete partitions_share_refs;
- DBUG_VOID_RETURN;
}
+
bool init(uint num_parts);
- void lock_auto_inc()
+
+  /** Set if auto increment is used and initialized. */
+ bool auto_inc_initialized;
+ /**
+ Mutex protecting next_auto_inc_val.
+ Initialized if table uses auto increment.
+ */
+ mysql_mutex_t auto_inc_mutex;
+ /** First non reserved auto increment value. */
+ ulonglong next_auto_inc_val;
+ /**
+ Hash of partition names. Initialized by the first handler instance of a
+ table_share calling populate_partition_name_hash().
+ After that it is read-only, i.e. no locking required for reading.
+ */
+ HASH partition_name_hash;
+  /** Flag noting that the name hash is initialized, so it is only done once. */
+ bool partition_name_hash_initialized;
+
+  /** Storage for each partition's Handler_share */
+ Parts_share_refs *partitions_share_refs;
+
+ /**
+ Release reserved auto increment values not used.
+ @param thd Thread.
+ @param table_share Table Share
+ @param next_insert_id Next insert id (first non used auto inc value).
+ @param max_reserved End of reserved auto inc range.
+ */
+ void release_auto_inc_if_possible(THD *thd, TABLE_SHARE *table_share,
+ const ulonglong next_insert_id,
+ const ulonglong max_reserved);
+
+ /** lock mutex protecting auto increment value next_auto_inc_val. */
+ inline void lock_auto_inc()
{
mysql_mutex_lock(&auto_inc_mutex);
}
- void unlock_auto_inc()
+ /** unlock mutex protecting auto increment value next_auto_inc_val. */
+ inline void unlock_auto_inc()
{
mysql_mutex_unlock(&auto_inc_mutex);
}
+ /**
+ Populate partition_name_hash with partition and subpartition names
+ from part_info.
+ @param part_info Partition info containing all partitions metadata.
+
+ @return Operation status.
+ @retval false Success.
+ @retval true Failure.
+ */
+ bool populate_partition_name_hash(partition_info *part_info);
+ /** Get partition name.
+
+  @param part_id Partition id (for a subpartitioned table, only subpartition
+  names will be returned).
+
+ @return partition name or NULL if error.
+ */
+ const char *get_partition_name(size_t part_id) const;
+private:
+ const uchar **partition_names;
+ /**
+ Insert [sub]partition name into partition_name_hash
+ @param name Partition name.
+ @param part_id Partition id.
+ @param is_subpart True if subpartition else partition.
+
+ @return Operation status.
+ @retval false Success.
+ @retval true Failure.
+ */
+ bool insert_partition_name_in_hash(const char *name,
+ uint part_id,
+ bool is_subpart);
};
@@ -605,6 +680,10 @@ public:
virtual int index_last(uchar * buf);
virtual int index_next_same(uchar * buf, const uchar * key, uint keylen);
+ int index_read_last_map(uchar *buf,
+ const uchar *key,
+ key_part_map keypart_map);
+
/*
read_first_row is virtual method but is only implemented by
handler.cc, no storage engine has implemented it so neither
@@ -1086,7 +1165,6 @@ private:
ulonglong nr= (((Field_num*) field)->unsigned_flag ||
field->val_int() > 0) ? field->val_int() : 0;
lock_auto_increment();
- DBUG_ASSERT(part_share->auto_inc_initialized);
/* must check when the mutex is taken */
if (nr >= part_share->next_auto_inc_val)
part_share->next_auto_inc_val= nr + 1;
@@ -1310,4 +1388,9 @@ public:
friend int cmp_key_rowid_part_id(void *ptr, uchar *ref1, uchar *ref2);
};
+bool print_admin_msg(THD* thd, uint len,
+ const char* msg_type,
+ const char* db_name, String &table_name,
+ const char* op_name, const char *fmt, ...);
+
#endif /* HA_PARTITION_INCLUDED */
diff --git a/sql/handler.cc b/sql/handler.cc
index c19d04236d7..ba947fd7a2d 100644
--- a/sql/handler.cc
+++ b/sql/handler.cc
@@ -2435,6 +2435,12 @@ LEX_STRING *handler::engine_name()
}
+void handler::ha_statistic_increment(ulong SSV::*offset) const
+{
+ (table->in_use->status_var.*offset)++;
+}
+
+
double handler::keyread_time(uint index, uint ranges, ha_rows rows)
{
/*
diff --git a/sql/handler.h b/sql/handler.h
index e20f95df1f3..f5e3d83d8d9 100644
--- a/sql/handler.h
+++ b/sql/handler.h
@@ -1393,6 +1393,7 @@ struct handlerton
bool (*vers_query_trx_id)(THD* thd, void *out, ulonglong trx_id, vtq_field_t field);
bool (*vers_query_commit_ts)(THD* thd, void *out, const MYSQL_TIME &commit_ts, vtq_field_t field, bool backwards);
bool (*vers_trx_sees)(THD *thd, bool &result, ulonglong trx_id1, ulonglong trx_id0, ulonglong commit_id1, uchar iso_level1, ulonglong commit_id0);
+ handler *(*vers_upgrade_handler)(handler *hnd, MEM_ROOT *mem_root);
};
@@ -3271,6 +3272,18 @@ protected:
virtual int index_last(uchar * buf)
{ return HA_ERR_WRONG_COMMAND; }
virtual int index_next_same(uchar *buf, const uchar *key, uint keylen);
+ /**
+ @brief
+ The following function works like index_read, but it finds the last
+ row with the current key value or prefix.
+ @returns @see index_read_map().
+ */
+ virtual int index_read_last_map(uchar * buf, const uchar * key,
+ key_part_map keypart_map)
+ {
+ uint key_len= calculate_key_len(table, active_index, key, keypart_map);
+ return index_read_last(buf, key, key_len);
+ }
virtual int close(void)=0;
inline void update_rows_read()
{
@@ -3350,7 +3363,7 @@ public:
void ft_end() { ft_handler=NULL; }
virtual FT_INFO *ft_init_ext(uint flags, uint inx,String *key)
{ return NULL; }
-private:
+public:
virtual int ft_read(uchar *buf) { return HA_ERR_WRONG_COMMAND; }
virtual int rnd_next(uchar *buf)=0;
virtual int rnd_pos(uchar * buf, uchar *pos)=0;
@@ -4057,6 +4070,7 @@ public:
TABLE_SHARE* get_table_share() { return table_share; }
protected:
/* Service methods for use by storage engines. */
+ void ha_statistic_increment(ulong SSV::*offset) const;
void **ha_data(THD *) const;
THD *ha_thd(void) const;
@@ -4082,7 +4096,7 @@ protected:
public:
bool check_table_binlog_row_based(bool binlog_row);
-private:
+
/* Cache result to avoid extra calls */
inline void mark_trx_read_write()
{
@@ -4092,6 +4106,8 @@ private:
mark_trx_read_write_internal();
}
}
+
+private:
void mark_trx_read_write_internal();
bool check_table_binlog_row_based_internal(bool binlog_row);
@@ -4210,6 +4226,11 @@ protected:
virtual int index_read(uchar * buf, const uchar * key, uint key_len,
enum ha_rkey_function find_flag)
{ return HA_ERR_WRONG_COMMAND; }
+ virtual int index_read_last(uchar * buf, const uchar * key, uint key_len)
+ {
+ my_errno= HA_ERR_WRONG_COMMAND;
+ return HA_ERR_WRONG_COMMAND;
+ }
friend class ha_partition;
friend class ha_sequence;
public:
@@ -4340,6 +4361,8 @@ public:
{ DBUG_ASSERT(0); return false; }
virtual handler* part_handler(uint32 part_id)
{ DBUG_ASSERT(0); return NULL; }
+ virtual void update_partition(uint part_id)
+ {}
protected:
Handler_share *get_ha_share_ptr();
void set_ha_share_ptr(Handler_share *arg_ha_share);
diff --git a/sql/partition_info.cc b/sql/partition_info.cc
index f45b45548b0..c1a792c87e0 100644
--- a/sql/partition_info.cc
+++ b/sql/partition_info.cc
@@ -215,6 +215,48 @@ bool partition_info::set_named_partition_bitmap(const char *part_name,
@param table_list Table list pointing to table to prune.
@return Operation status
+ @retval false Success
+ @retval true Failure
+*/
+bool partition_info::set_read_partitions(List<char> *partition_names)
+{
+ DBUG_ENTER("partition_info::set_read_partitions");
+ if (!partition_names || !partition_names->elements)
+ {
+ DBUG_RETURN(true);
+ }
+
+ uint num_names= partition_names->elements;
+ List_iterator<char> partition_names_it(*partition_names);
+ uint i= 0;
+ /*
+ TODO: When adding support for FK in partitioned tables, the referenced
+ table must probably lock all partitions for read, and also write depending
+ of ON DELETE/UPDATE.
+ */
+ bitmap_clear_all(&read_partitions);
+
+ /* No check for duplicate names or overlapping partitions/subpartitions. */
+
+ DBUG_PRINT("info", ("Searching through partition_name_hash"));
+ do
+ {
+ char *part_name= partition_names_it++;
+ if (add_named_partition(part_name, strlen(part_name)))
+ DBUG_RETURN(true);
+ } while (++i < num_names);
+ DBUG_RETURN(false);
+}
+
+
+
+/**
+ Prune away partitions not mentioned in the PARTITION () clause,
+ if used.
+
+ @param table_list Table list pointing to table to prune.
+
+ @return Operation status
@retval true Failure
@retval false Success
*/
@@ -989,13 +1031,22 @@ bool partition_info::vers_scan_min_max(THD *thd, partition_element *part)
uint32 part_id= part->id * sub_factor;
uint32 part_id_end= part_id + sub_factor;
DBUG_ASSERT(part->empty);
+ DBUG_ASSERT(part->type == partition_element::VERSIONING);
DBUG_ASSERT(table->s->stat_trx);
for (; part_id < part_id_end; ++part_id)
{
- handler *file= table->file->part_handler(part_id);
- int rc= file->ha_external_lock(thd, F_RDLCK);
+ handler *file= table->file->part_handler(part_id); // requires update_partition() for ha_innopart
+ int rc= file->ha_external_lock(thd, F_RDLCK); // requires ha_commit_trans() for ha_innobase
if (rc)
- goto error;
+ {
+ file->update_partition(part_id);
+ goto lock_fail;
+ }
+
+ table->default_column_bitmaps();
+ bitmap_set_bit(table->read_set, table->vers_end_field()->field_index);
+ file->column_bitmaps_signal();
+
rc= file->ha_rnd_init(true);
if (!rc)
{
@@ -1006,6 +1057,8 @@ bool partition_info::vers_scan_min_max(THD *thd, partition_element *part)
if (thd->killed)
{
file->ha_rnd_end();
+ file->update_partition(part_id);
+ ha_commit_trans(thd, false);
return true;
}
if (rc)
@@ -1014,18 +1067,44 @@ bool partition_info::vers_scan_min_max(THD *thd, partition_element *part)
continue;
break;
}
- vers_stat_trx(STAT_TRX_END, part).update_unguarded(table->vers_end_field());
+ if (table->vers_end_field()->is_max())
+ {
+ rc= HA_ERR_INTERNAL_ERROR;
+ push_warning_printf(thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ WARN_VERS_PART_NON_HISTORICAL,
+ ER_THD(thd, WARN_VERS_PART_NON_HISTORICAL),
+ part->partition_name);
+ break;
+ }
+ if (table->versioned_by_engine())
+ {
+ uchar buf[8];
+ Field_timestampf fld(buf, NULL, 0, Field::NONE, table->vers_end_field()->field_name, NULL, 6);
+ if (!vers_trx_id_to_ts(thd, table->vers_end_field(), fld))
+ {
+ vers_stat_trx(STAT_TRX_END, part).update_unguarded(&fld);
+ }
+ }
+ else
+ {
+ vers_stat_trx(STAT_TRX_END, part).update_unguarded(table->vers_end_field());
+ }
}
file->ha_rnd_end();
}
file->ha_external_lock(thd, F_UNLCK);
+ file->update_partition(part_id);
if (rc != HA_ERR_END_OF_FILE)
{
- error:
- my_error(ER_INTERNAL_ERROR, MYF(0), "partition/subpartition scan failed in versioned partitions setup");
+ ha_commit_trans(thd, false);
+ lock_fail:
+ // TODO: print rc code
+ my_error(ER_INTERNAL_ERROR, MYF(0), "min/max scan failed in versioned partitions setup (see warnings)");
return true;
}
}
+ ha_commit_trans(thd, false);
return false;
}
@@ -1073,11 +1152,9 @@ bool partition_info::vers_setup_2(THD * thd, bool is_create_table_ind)
DBUG_ASSERT(part_type == VERSIONING_PARTITION);
DBUG_ASSERT(vers_info && vers_info->initialized(false));
DBUG_ASSERT(table && table->s);
- if (!table->versioned_by_sql())
- {
- my_error(ER_VERS_WRONG_PARAMS, MYF(0), table->s->table_name.str, "selected engine is not supported in `BY SYSTEM_TIME` partitioning");
- return true;
- }
+
+ bool error= false;
+
mysql_mutex_lock(&table->s->LOCK_rotation);
if (table->s->busy_rotation)
{
@@ -1124,8 +1201,19 @@ bool partition_info::vers_setup_2(THD * thd, bool is_create_table_ind)
if (!is_create_table_ind)
{
- if (vers_scan_min_max(thd, el))
- return true;
+ if (el->type == partition_element::AS_OF_NOW)
+ {
+ uchar buf[8];
+ Field_timestampf fld(buf, NULL, 0, Field::NONE, table->vers_end_field()->field_name, NULL, 6);
+ fld.set_max();
+ vers_stat_trx(STAT_TRX_END, el).update_unguarded(&fld);
+ el->empty= false;
+ }
+ else if (vers_scan_min_max(thd, el))
+ {
+ error= true;
+ break;
+ }
if (!el->empty)
{
vers_update_col_vals(thd, prev, el);
@@ -1151,7 +1239,7 @@ bool partition_info::vers_setup_2(THD * thd, bool is_create_table_ind)
}
} // while
- if (!dont_stat)
+ if (!error && !dont_stat)
{
if (col_val_updated)
table->s->stat_serial++;
@@ -1165,7 +1253,7 @@ bool partition_info::vers_setup_2(THD * thd, bool is_create_table_ind)
table->s->busy_rotation= false;
}
mysql_mutex_unlock(&table->s->LOCK_rotation);
- return false;
+ return error;
}
@@ -3262,6 +3350,80 @@ bool partition_info::has_same_partitioning(partition_info *new_part_info)
}
+static bool has_same_column_order(List<Create_field> *create_list,
+ Field** field_array)
+{
+ Field **f_ptr;
+ List_iterator_fast<Create_field> new_field_it;
+ Create_field *new_field= NULL;
+ new_field_it.init(*create_list);
+
+ for (f_ptr= field_array; *f_ptr; f_ptr++)
+ {
+ while ((new_field= new_field_it++))
+ {
+ if (new_field->field == *f_ptr)
+ break;
+ }
+ if (!new_field)
+ break;
+ }
+
+ if (!new_field)
+ {
+ /* Not same order!*/
+ return false;
+ }
+ return true;
+}
+
+bool partition_info::vers_trx_id_to_ts(THD* thd, Field* in_trx_id, Field_timestamp& out_ts)
+{
+ handlerton *hton= plugin_hton(table->s->db_plugin);
+ DBUG_ASSERT(hton);
+ ulonglong trx_id= in_trx_id->val_int();
+ MYSQL_TIME ts;
+ bool found= hton->vers_query_trx_id(thd, &ts, trx_id, VTQ_COMMIT_TS);
+ if (!found)
+ {
+ push_warning_printf(thd,
+ Sql_condition::WARN_LEVEL_WARN,
+ WARN_VERS_TRX_MISSING,
+ ER_THD(thd, WARN_VERS_TRX_MISSING),
+ trx_id);
+ return true;
+ }
+ out_ts.store_time_dec(&ts, 6);
+ return false;
+}
+
+
+/**
+ Check if the partitioning columns are in the same order as the given list.
+
+ Used to see if INPLACE alter can be allowed or not. If the order is
+ different then the rows must be redistributed for KEY [sub]partitioning.
+
+ @param[in] create_list Column list after ALTER TABLE.
+  @return true if same order as before ALTER TABLE, else false.
+*/
+bool partition_info::same_key_column_order(List<Create_field> *create_list)
+{
+ /* Only need to check for KEY [sub] partitioning. */
+ if (list_of_part_fields && !column_list)
+ {
+ if (!has_same_column_order(create_list, part_field_array))
+ return false;
+ }
+ if (list_of_subpart_fields)
+ {
+ if (!has_same_column_order(create_list, subpart_field_array))
+ return false;
+ }
+ return true;
+}
+
+
void partition_info::print_debug(const char *str, uint *value)
{
DBUG_ENTER("print_debug");
diff --git a/sql/partition_info.h b/sql/partition_info.h
index 5a671bfc50f..ef20564837c 100644
--- a/sql/partition_info.h
+++ b/sql/partition_info.h
@@ -22,6 +22,7 @@
#include "sql_class.h"
#include "partition_element.h"
+#include "sql_partition.h"
class partition_info;
struct TABLE_LIST;
@@ -382,6 +383,28 @@ public:
uint32 *part_id);
void report_part_expr_error(bool use_subpart_expr);
bool has_same_partitioning(partition_info *new_part_info);
+ inline bool is_partition_used(uint part_id) const
+ {
+ return bitmap_is_set(&read_partitions, part_id);
+ }
+ inline bool is_partition_locked(uint part_id) const
+ {
+ return bitmap_is_set(&lock_partitions, part_id);
+ }
+ inline uint num_partitions_used()
+ {
+ return bitmap_bits_set(&read_partitions);
+ }
+ inline uint get_first_used_partition() const
+ {
+ return bitmap_get_first_set(&read_partitions);
+ }
+ inline uint get_next_used_partition(uint part_id) const
+ {
+ return bitmap_get_next_set(&read_partitions, part_id);
+ }
+ bool same_key_column_order(List<Create_field> *create_list);
+
private:
static int list_part_cmp(const void* a, const void* b);
bool set_up_default_partitions(THD *thd, handler *file, HA_CREATE_INFO *info,
@@ -392,9 +415,11 @@ private:
uint start_no);
char *create_default_subpartition_name(THD *thd, uint subpart_no,
const char *part_name);
+  // FIXME: prune_partition_bitmaps() is a duplicate of set_read_partitions()
bool prune_partition_bitmaps(TABLE_LIST *table_list);
bool add_named_partition(const char *part_name, uint length);
public:
+ bool set_read_partitions(List<char> *partition_names);
bool has_unique_name(partition_element *element);
bool vers_init_info(THD *thd);
@@ -475,8 +500,8 @@ public:
DBUG_ASSERT(vers_info->initialized());
part= vers_hist_part();
}
- max_time-= vers_stat_trx(STAT_TRX_END, part).min_time();
- return max_time > vers_info->interval;
+ my_time_t min_time= vers_stat_trx(STAT_TRX_END, part).min_time();
+ return max_time - min_time > vers_info->interval;
}
bool vers_interval_exceed(partition_element *part)
{
@@ -486,15 +511,31 @@ public:
{
return vers_interval_exceed(vers_hist_part());
}
+ bool vers_trx_id_to_ts(THD *thd, Field *in_trx_id, Field_timestamp &out_ts);
void vers_update_stats(THD *thd, partition_element *el)
{
DBUG_ASSERT(vers_info && vers_info->initialized());
DBUG_ASSERT(table && table->s);
DBUG_ASSERT(el && el->type == partition_element::VERSIONING);
+ bool updated;
mysql_rwlock_wrlock(&table->s->LOCK_stat_serial);
el->empty= false;
- bool updated=
- vers_stat_trx(STAT_TRX_END, el->id).update(table->vers_end_field());
+ if (table->versioned_by_engine())
+ {
+ // transaction is not yet pushed to VTQ, so we use now-time
+ my_time_t end_ts= my_time(0);
+
+ uchar buf[8];
+ Field_timestampf fld(buf, NULL, 0, Field::NONE, table->vers_end_field()->field_name, NULL, 6);
+ fld.store_TIME(end_ts, 0);
+ updated=
+ vers_stat_trx(STAT_TRX_END, el->id).update(&fld);
+ }
+ else
+ {
+ updated=
+ vers_stat_trx(STAT_TRX_END, el->id).update(table->vers_end_field());
+ }
if (updated)
table->s->stat_serial++;
mysql_rwlock_unlock(&table->s->LOCK_stat_serial);
diff --git a/sql/partitioning/partition_handler.cc b/sql/partitioning/partition_handler.cc
new file mode 100644
index 00000000000..1e04439e100
--- /dev/null
+++ b/sql/partitioning/partition_handler.cc
@@ -0,0 +1,3746 @@
+/*
+ Copyright (c) 2005, 2016, Oracle and/or its affiliates. All rights reserved.
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License
+ as published by the Free Software Foundation; version 2 of
+ the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#include "table.h" // TABLE_SHARE
+#include "sql_partition.h" // LIST_PART_ENTRY, part_id_range
+#include "partition_info.h" // NOT_A_PARTITION_ID
+#include "partition_handler.h"
+#include "log.h" // sql_print_error
+#include "key.h" // key_rec_cmp
+#include "sql_class.h" // THD
+#include <mysql/psi/psi_memory.h>
+
+#define MI_MAX_MSG_BUF 1024
+
+// In sql_class.cc:
+extern "C" int thd_binlog_format(const MYSQL_THD thd);
+
+/** operation names for the enum_part_operation. */
+static const char *opt_op_name[]= {"optimize", "analyze", "check", "repair",
+ "assign_to_keycache", "preload_keys"};
+
+// static PSI_memory_key key_memory_Partition_share;
+// static PSI_memory_key key_memory_partition_sort_buffer;
+// static PSI_memory_key key_memory_Partition_admin;
+#ifdef HAVE_PSI_INTERFACE
+extern PSI_mutex_key key_partition_auto_inc_mutex;
+// static PSI_memory_info all_partitioning_memory[]=
+// { { &key_memory_Partition_share, "Partition_share", 0},
+// { &key_memory_partition_sort_buffer, "partition_sort_buffer", 0},
+// { &key_memory_Partition_admin, "Partition_admin", 0} };
+static PSI_mutex_info all_partitioning_mutex[]=
+{ { &key_partition_auto_inc_mutex, "Partiton_share::auto_inc_mutex", 0} };
+#endif
+
+void partitioning_init()
+{
+#ifdef HAVE_PSI_INTERFACE
+ int count;
+// count= array_elements(all_partitioning_memory);
+// mysql_memory_register("sql", all_partitioning_memory, count);
+ count= array_elements(all_partitioning_mutex);
+ mysql_mutex_register("sql", all_partitioning_mutex, count);
+#endif
+}
+
+
+/**
+ Release reserved auto increment values not used.
+ @param thd Thread.
+ @param table_share Table Share
+ @param next_insert_id Next insert id (first non used auto inc value).
+ @param max_reserved End of reserved auto inc range.
+*/
+void
+Partition_share::release_auto_inc_if_possible(THD *thd, TABLE_SHARE *table_share,
+ const ulonglong next_insert_id,
+ const ulonglong max_reserved)
+{
+#ifndef DBUG_OFF
+ if (table_share->tmp_table == NO_TMP_TABLE)
+ {
+ mysql_mutex_assert_owner(&auto_inc_mutex);
+ }
+#endif /* DBUG_OFF */
+
+ /*
+    If the current auto_increment value is lower than the reserved value (1)
+ and the reserved value was reserved by this thread (2), then we can
+ lower the reserved value.
+ However, we cannot lower the value if there are forced/non generated
+ values from 'SET INSERT_ID = forced_val' (3). */
+ if (next_insert_id < next_auto_inc_val && // (1)
+ max_reserved >= next_auto_inc_val && // (2)
+ thd->auto_inc_intervals_forced.maximum() < next_insert_id) // (3)
+ {
+ next_auto_inc_val= next_insert_id;
+ }
+}
+
+
+/**
+ Get the partition name.
+
+ @param part Struct containing name and length
+ @param[out] length Length of the name
+
+ @return Partition name
+*/
+
+static uchar *get_part_name_from_def(PART_NAME_DEF *part,
+ size_t *length,
+ my_bool not_used MY_ATTRIBUTE((unused)))
+{
+ *length= part->length;
+ return part->partition_name;
+}
+
+
+/**
+ Populate the partition_name_hash in part_share.
+*/
+
+bool Partition_share::populate_partition_name_hash(partition_info *part_info)
+{
+ uint tot_names;
+ uint num_subparts= part_info->num_subparts;
+ DBUG_ENTER("Partition_share::populate_partition_name_hash");
+ DBUG_ASSERT(!part_info->is_sub_partitioned() || num_subparts);
+
+ if (num_subparts == 0)
+ {
+ num_subparts= 1;
+ }
+
+ /*
+    TABLE_SHARE::LOCK_ha_data must have been locked before calling this function.
+ This ensures only one thread/table instance will execute this.
+ */
+
+#ifndef DBUG_OFF
+ if (part_info->table->s->tmp_table == NO_TMP_TABLE)
+ {
+ mysql_mutex_assert_owner(&part_info->table->s->LOCK_ha_data);
+ }
+#endif
+ if (partition_name_hash_initialized)
+ {
+ DBUG_RETURN(false);
+ }
+ tot_names= part_info->num_parts;
+ if (part_info->is_sub_partitioned())
+ {
+ tot_names+= part_info->num_parts * num_subparts;
+ }
+ partition_names= static_cast<const uchar**>(my_malloc(
+ part_info->get_tot_partitions() *
+ sizeof(*partition_names),
+ MYF(MY_WME)));
+ if (!partition_names)
+ {
+ DBUG_RETURN(true);
+ }
+ if (my_hash_init(&partition_name_hash,
+ system_charset_info, tot_names, 0, 0,
+ (my_hash_get_key) get_part_name_from_def,
+ my_free, HASH_UNIQUE))
+ {
+ my_free(partition_names);
+ partition_names= NULL;
+ DBUG_RETURN(true);
+ }
+
+ List_iterator<partition_element> part_it(part_info->partitions);
+ uint i= 0;
+ do
+ {
+ partition_element *part_elem= part_it++;
+ DBUG_ASSERT(part_elem->part_state == PART_NORMAL);
+ if (part_elem->part_state == PART_NORMAL)
+ {
+ if (insert_partition_name_in_hash(part_elem->partition_name,
+ i * num_subparts,
+ false))
+ goto err;
+ if (part_info->is_sub_partitioned())
+ {
+ List_iterator<partition_element>
+ subpart_it(part_elem->subpartitions);
+ partition_element *sub_elem;
+ uint j= 0;
+ do
+ {
+ sub_elem= subpart_it++;
+ if (insert_partition_name_in_hash(sub_elem->partition_name,
+ i * num_subparts + j, true))
+ goto err;
+
+ } while (++j < num_subparts);
+ }
+ }
+ } while (++i < part_info->num_parts);
+
+ for (i= 0; i < tot_names; i++)
+ {
+ PART_NAME_DEF *part_def;
+ part_def= reinterpret_cast<PART_NAME_DEF*>(
+ my_hash_element(&partition_name_hash, i));
+ if (part_def->is_subpart == part_info->is_sub_partitioned())
+ {
+ partition_names[part_def->part_id]= part_def->partition_name;
+ }
+ }
+ partition_name_hash_initialized= true;
+
+ DBUG_RETURN(false);
+err:
+ my_hash_free(&partition_name_hash);
+ my_free(partition_names);
+ partition_names= NULL;
+
+ DBUG_RETURN(true);
+}
+
+
+/**
+ Insert a partition name in the partition_name_hash.
+
+ @param name Name of partition
+ @param part_id Partition id (number)
+ @param is_subpart Set if the name belongs to a subpartition
+
+ @return Operation status
+ @retval true Failure
+ @retval false Success
+*/
+
+bool Partition_share::insert_partition_name_in_hash(const char *name,
+ uint part_id,
+ bool is_subpart)
+{
+ PART_NAME_DEF *part_def;
+ uchar *part_name;
+ uint part_name_length;
+ DBUG_ENTER("Partition_share::insert_partition_name_in_hash");
+ /*
+ Calculate and store the length here, to avoid doing it when
+ searching the hash.
+ */
+ part_name_length= static_cast<uint>(strlen(name));
+ /*
+ Must use memory that lives as long as table_share.
+ Freed in the Partition_share destructor.
+ Since we use my_multi_malloc, then my_free(part_def) will also free
+ part_name, as a part of my_hash_free.
+ */
+ if (!my_multi_malloc(MY_WME,
+ &part_def, sizeof(PART_NAME_DEF),
+ &part_name, part_name_length + 1,
+ NULL))
+ {
+ DBUG_RETURN(true);
+ }
+ memcpy(part_name, name, part_name_length + 1);
+ part_def->partition_name= part_name;
+ part_def->length= part_name_length;
+ part_def->part_id= part_id;
+ part_def->is_subpart= is_subpart;
+ if (my_hash_insert(&partition_name_hash, (uchar *) part_def))
+ {
+ my_free(part_def);
+ DBUG_RETURN(true);
+ }
+ DBUG_RETURN(false);
+}
+
+
+const char *Partition_share::get_partition_name(size_t part_id) const
+{
+ if (partition_names == NULL)
+ {
+ return NULL;
+ }
+ return reinterpret_cast<const char*>(partition_names[part_id]);
+}
+/*
+ Implementation of Partition_helper class.
+*/
+Partition_helper::Partition_helper(handler *main_handler)
+ :
+ m_handler(main_handler),
+ m_part_info(),
+ m_tot_parts(),
+ m_last_part(),
+ m_err_rec(),
+ m_ordered(),
+ m_ordered_scan_ongoing(),
+ m_ordered_rec_buffer(),
+ m_queue()
+{}
+
+
+Partition_helper::~Partition_helper()
+{
+ DBUG_ASSERT(m_ordered_rec_buffer == NULL);
+ DBUG_ASSERT(m_key_not_found_partitions.bitmap == NULL);
+}
+
+
+/**
+ Set partition info.
+
+ To be called from Partition_handler.
+
+ @param part_info Partition info to use.
+  @param early  True if called when part_info is only created and parsed,
+                but not set up, checked or fixed.
+ */
+void Partition_helper::set_part_info_low(partition_info *part_info,
+ bool early)
+{
+ /*
+ ha_partition will set m_tot_parts from the .par file during creating
+ the new handler.
+ And this call can be earlier than the partition_default_handling(),
+ so get_tot_partitions() may return zero.
+ */
+ if (m_tot_parts == 0 &&
+ (m_part_info == NULL || !early))
+ {
+ m_tot_parts= part_info->get_tot_partitions();
+ }
+ m_part_info= part_info;
+ m_is_sub_partitioned= m_part_info->is_sub_partitioned();
+}
+
+/**
+ Initialize the partitioning helper for use after the table is opened.
+
+ @param part_share Partitioning share (used for auto increment).
+
+ @return Operation status.
+ @retval false for success otherwise true.
+*/
+
+bool Partition_helper::open_partitioning(Partition_share *part_share)
+{
+ m_table= get_table();
+ DBUG_ASSERT(m_part_info == m_table->part_info);
+ m_part_share= part_share;
+ m_tot_parts= m_part_info->get_tot_partitions();
+ if (bitmap_init(&m_key_not_found_partitions, NULL, m_tot_parts, false))
+ {
+ return true;
+ }
+ bitmap_clear_all(&m_key_not_found_partitions);
+ m_key_not_found= false;
+ m_is_sub_partitioned= m_part_info->is_sub_partitioned();
+ m_auto_increment_lock= false;
+ m_auto_increment_safe_stmt_log_lock= false;
+ m_pkey_is_clustered= m_handler->primary_key_is_clustered();
+ m_part_spec.start_part= NOT_A_PARTITION_ID;
+ m_part_spec.end_part= NOT_A_PARTITION_ID;
+ m_index_scan_type= PARTITION_NO_INDEX_SCAN;
+ m_start_key.key= NULL;
+ m_start_key.length= 0;
+ m_scan_value= 3;
+ m_reverse_order= false;
+ m_curr_key_info[0]= NULL;
+ m_curr_key_info[1]= NULL;
+ m_curr_key_info[2]= NULL;
+ m_top_entry= NO_CURRENT_PART_ID;
+ m_ref_usage= REF_NOT_USED;
+ m_rec_length= m_table->s->reclength;
+ return false;
+}
+
+
/**
  Release the resources acquired in open_partitioning().

  Frees the key-not-found bitmap and destroys the ordered-scan record
  priority queue. The ordered record buffer is expected to have been
  freed already by the scan code (hence the assert).
*/
void Partition_helper::close_partitioning()
{
  bitmap_free(&m_key_not_found_partitions);
  DBUG_ASSERT(!m_ordered_rec_buffer);
  destroy_record_priority_queue();
}
+
+/****************************************************************************
+ MODULE change record
+****************************************************************************/
+
+/**
+ Insert a row to the partitioned table.
+
+ @param buf The row in MySQL Row Format.
+
+ @return Operation status.
+ @retval 0 Success
+ @retval != 0 Error code
+*/
+
+int Partition_helper::ph_write_row(uchar *buf)
+{
+ uint32 part_id;
+ int error;
+ longlong func_value;
+ bool have_auto_increment= m_table->next_number_field &&
+ buf == m_table->record[0];
+ THD *thd= get_thd();
+ sql_mode_t saved_sql_mode= thd->variables.sql_mode;
+ bool saved_auto_inc_field_not_null= m_table->auto_increment_field_not_null;
+#ifndef DBUG_OFF
+ my_bitmap_map *old_map;
+#endif /* DBUG_OFF */
+ DBUG_ENTER("Partition_helper::ph_write_row");
+ DBUG_ASSERT(buf == m_table->record[0]);
+
+ /*
+ If we have an auto_increment column and we are writing a changed row
+ or a new row, then update the auto_increment value in the record.
+ */
+ if (have_auto_increment)
+ {
+ error= m_handler->update_auto_increment();
+
+ /*
+ If we have failed to set the auto-increment value for this row,
+ it is highly likely that we will not be able to insert it into
+ the correct partition. We must check and fail if neccessary.
+ */
+ if (error)
+ DBUG_RETURN(error);
+
+ /*
+ Don't allow generation of auto_increment value the partitions handler.
+ If a partitions handler would change the value, then it might not
+ match the partition any longer.
+ This can occur if 'SET INSERT_ID = 0; INSERT (NULL)',
+ So allow this by adding 'MODE_NO_AUTO_VALUE_ON_ZERO' to sql_mode.
+ The partitions handler::next_insert_id must always be 0. Otherwise
+ we need to forward release_auto_increment, or reset it for all
+ partitions.
+ */
+ if (m_table->next_number_field->val_int() == 0)
+ {
+ m_table->auto_increment_field_not_null= TRUE;
+ thd->variables.sql_mode|= MODE_NO_AUTO_VALUE_ON_ZERO;
+ }
+ }
+
+#ifndef DBUG_OFF
+ /* Temporary mark the partitioning fields as readable. */
+ old_map= dbug_tmp_use_all_columns(m_table, m_table->read_set);
+#endif /* DBUG_OFF */
+
+ error= m_part_info->get_partition_id(m_part_info, &part_id, &func_value);
+
+#ifndef DBUG_OFF
+ dbug_tmp_restore_column_map(m_table->read_set, old_map);
+#endif /* DBUG_OFF */
+
+ if (unlikely(error))
+ {
+ m_part_info->err_value= func_value;
+ goto exit;
+ }
+ if (!m_part_info->is_partition_locked(part_id))
+ {
+ DBUG_PRINT("info", ("Write to non-locked partition %u (func_value: %ld)",
+ part_id, (long) func_value));
+ error= HA_ERR_NOT_IN_LOCK_PARTITIONS;
+ goto exit;
+ }
+ m_last_part= part_id;
+ DBUG_PRINT("info", ("Insert in partition %d", part_id));
+
+ error= write_row_in_part(part_id, buf);
+
+ if (have_auto_increment && !m_table->s->next_number_keypart)
+ {
+ set_auto_increment_if_higher();
+ }
+exit:
+ thd->variables.sql_mode= saved_sql_mode;
+ m_table->auto_increment_field_not_null= saved_auto_inc_field_not_null;
+ DBUG_RETURN(error);
+}
+
+
+/**
+ Update an existing row in the partitioned table.
+
+ Yes, update_row() does what you expect, it updates a row. old_data will
+ have the previous row record in it, while new_data will have the newest
+ data in it.
+ Keep in mind that the server can do updates based on ordering if an
+ ORDER BY clause was used. Consecutive ordering is not guaranteed.
+
+ If the new record belongs to a different partition than the old record
+ then it will be inserted into the new partition and deleted from the old.
+
+ new_data is always record[0]
+ old_data is always record[1]
+
+ @param old_data The old record in MySQL Row Format.
+ @param new_data The new record in MySQL Row Format.
+
+ @return Operation status.
+ @retval 0 Success
+ @retval != 0 Error code
+*/
+
+// FIXME: duplicate of ha_partition::update_row()
+int Partition_helper::ph_update_row(const uchar *old_data, uchar *new_data)
+{
+ THD *thd= get_thd();
+ uint32 new_part_id, old_part_id;
+ int error= 0;
+ longlong func_value;
+ DBUG_ENTER("Partition_helper::ph_update_row");
+ m_err_rec= NULL;
+
+ // Need to read partition-related columns, to locate the row's partition:
+ DBUG_ASSERT(bitmap_is_subset(&m_part_info->full_part_field_set,
+ m_table->read_set));
+ if ((error= get_parts_for_update(old_data, new_data, m_table->record[0],
+ m_part_info, &old_part_id, &new_part_id,
+ &func_value)))
+ {
+ m_part_info->err_value= func_value;
+ goto exit;
+ }
+ DBUG_ASSERT(bitmap_is_set(&(m_part_info->read_partitions), old_part_id));
+ if (!bitmap_is_set(&(m_part_info->lock_partitions), new_part_id))
+ {
+ error= HA_ERR_NOT_IN_LOCK_PARTITIONS;
+ goto exit;
+ }
+
+ /*
+ The protocol for updating a row is:
+ 1) position the handler (cursor) on the row to be updated,
+ either through the last read row (rnd or index) or by rnd_pos.
+ 2) call update_row with both old and new full records as arguments.
+
+ This means that m_last_part should already be set to actual partition
+ where the row was read from. And if that is not the same as the
+ calculated part_id we found a misplaced row, we return an error to
+ notify the user that something is broken in the row distribution
+ between partitions! Since we don't check all rows on read, we return an
+ error instead of correcting m_last_part, to make the user aware of the
+ problem!
+
+ Notice that HA_READ_BEFORE_WRITE_REMOVAL does not require this protocol,
+ so this is not supported for this engine.
+ */
+ if (old_part_id != m_last_part)
+ {
+ m_err_rec= old_data;
+ DBUG_RETURN(HA_ERR_ROW_IN_WRONG_PARTITION);
+ }
+
+ m_last_part= new_part_id;
+ if (new_part_id == old_part_id)
+ {
+ DBUG_PRINT("info", ("Update in partition %d", new_part_id));
+ tmp_disable_binlog(thd); /* Do not replicate the low-level changes. */
+ error= update_row_in_part(new_part_id, old_data, new_data);
+ reenable_binlog(thd);
+ goto exit;
+ }
+ else
+ {
+ Field *saved_next_number_field= m_table->next_number_field;
+ /*
+ Don't allow generation of auto_increment value for update.
+ table->next_number_field is never set on UPDATE.
+ But is set for INSERT ... ON DUPLICATE KEY UPDATE,
+ and since update_row() does not generate or update an auto_inc value,
+ we cannot have next_number_field set when moving a row
+ to another partition with write_row(), since that could
+ generate/update the auto_inc value.
+ This gives the same behavior for partitioned vs non partitioned tables.
+ */
+ m_table->next_number_field= NULL;
+ DBUG_PRINT("info", ("Update from partition %d to partition %d",
+ old_part_id, new_part_id));
+ tmp_disable_binlog(thd); /* Do not replicate the low-level changes. */
+ error= write_row_in_part(new_part_id, new_data);
+ reenable_binlog(thd);
+ m_table->next_number_field= saved_next_number_field;
+ if (error)
+ goto exit;
+
+ if (m_part_info->part_type == VERSIONING_PARTITION)
+ {
+ uint sub_factor= m_part_info->num_subparts ? m_part_info->num_subparts : 1;
+ DBUG_ASSERT(m_tot_parts == m_part_info->num_parts * sub_factor);
+ uint lpart_id= new_part_id / sub_factor;
+ // lpart_id is VERSIONING partition because new_part_id != old_part_id
+ m_part_info->vers_update_stats(thd, lpart_id);
+ }
+
+ tmp_disable_binlog(thd); /* Do not replicate the low-level changes. */
+ error= delete_row_in_part(old_part_id, old_data);
+ reenable_binlog(thd);
+ if (error)
+ {
+ goto exit;
+ }
+ }
+
+exit:
+ /*
+ if updating an auto_increment column, update
+ m_part_share->next_auto_inc_val if needed.
+ (not to be used if auto_increment on secondary field in a multi-column
+ index)
+ mysql_update does not set table->next_number_field, so we use
+ table->found_next_number_field instead.
+ Also checking that the field is marked in the write set.
+ */
+ if (m_table->found_next_number_field &&
+ new_data == m_table->record[0] &&
+ !m_table->s->next_number_keypart &&
+ bitmap_is_set(m_table->write_set,
+ m_table->found_next_number_field->field_index))
+ {
+ set_auto_increment_if_higher();
+ }
+ DBUG_RETURN(error);
+}
+
+
+/**
+ Delete an existing row in the partitioned table.
+
+ This will delete a row. buf will contain a copy of the row to be deleted.
+ The server will call this right after the current row has been read
+ (from either a previous rnd_xxx() or index_xxx() call).
+ If you keep a pointer to the last row or can access a primary key it will
+ make doing the deletion quite a bit easier.
+ Keep in mind that the server does no guarentee consecutive deletions.
+ ORDER BY clauses can be used.
+
+ buf is either record[0] or record[1]
+
+ @param buf The record in MySQL Row Format.
+
+ @return Operation status.
+ @retval 0 Success
+ @retval != 0 Error code
+*/
+
+int Partition_helper::ph_delete_row(const uchar *buf)
+{
+ int error;
+ uint part_id;
+ DBUG_ENTER("Partition_helper::ph_delete_row");
+ m_err_rec= NULL;
+
+ DBUG_ASSERT(bitmap_is_subset(&m_part_info->full_part_field_set,
+ m_table->read_set));
+ if ((error= get_part_for_delete(buf,
+ m_table->record[0],
+ m_part_info,
+ &part_id)))
+ {
+ DBUG_RETURN(error);
+ }
+ if (!m_part_info->is_partition_locked(part_id))
+ {
+ DBUG_RETURN(HA_ERR_NOT_IN_LOCK_PARTITIONS);
+ }
+
+ /*
+ The protocol for deleting a row is:
+ 1) position the handler (cursor) on the row to be deleted,
+ either through the last read row (rnd or index) or by rnd_pos.
+ 2) call delete_row with the full record as argument.
+
+ This means that m_last_part should already be set to actual partition
+ where the row was read from. And if that is not the same as the
+ calculated part_id we found a misplaced row, we return an error to
+ notify the user that something is broken in the row distribution
+ between partitions! Since we don't check all rows on read, we return an
+ error instead of forwarding the delete to the correct (m_last_part)
+ partition!
+
+ Notice that HA_READ_BEFORE_WRITE_REMOVAL does not require this protocol,
+ so this is not supported for this engine.
+
+ TODO: change the assert in InnoDB into an error instead and make this one
+ an assert instead and remove the get_part_for_delete()!
+ */
+ if (part_id != m_last_part)
+ {
+ m_err_rec= buf;
+ DBUG_RETURN(HA_ERR_ROW_IN_WRONG_PARTITION);
+ }
+ /* Should never call delete_row on a partition which is not read */
+ DBUG_ASSERT(m_part_info->is_partition_used(part_id));
+
+ m_last_part= part_id;
+ error= delete_row_in_part(part_id, buf);
+ DBUG_RETURN(error);
+}
+
+
+/**
+ Get a range of auto increment values.
+
+ Can only be used if the auto increment field is the first field in an index.
+
+ This method is called by update_auto_increment which in turn is called
+ by the individual handlers as part of write_row. We use the
+ part_share->next_auto_inc_val, or search all
+ partitions for the highest auto_increment_value if not initialized or
+ if auto_increment field is a secondary part of a key, we must search
+ every partition when holding a mutex to be sure of correctness.
+
+ @param[in] increment Increment value.
+ @param[in] nb_desired_values Number of desired values.
+ @param[out] first_value First auto inc value reserved
+ or MAX if failure.
+ @param[out] nb_reserved_values Number of values reserved.
+*/
+
+void Partition_helper
+::get_auto_increment_first_field(ulonglong increment,
+ ulonglong nb_desired_values,
+ ulonglong *first_value,
+ ulonglong *nb_reserved_values)
+{
+ THD *thd= get_thd();
+ DBUG_ENTER("Partition_helper::get_auto_increment_first_field");
+ DBUG_PRINT("info", ("inc: %lu desired_values: %lu first_value: %lu",
+ (ulong) increment,
+ (ulong) nb_desired_values,
+ (ulong) *first_value));
+ DBUG_ASSERT(increment && nb_desired_values);
+ /*
+ next_number_keypart is != 0 if the auto_increment column is a secondary
+ column in the index (it is allowed in MyISAM)
+ */
+ DBUG_ASSERT(m_table->s->next_number_keypart == 0);
+ *first_value= 0;
+
+ /*
+ Get a lock for handling the auto_increment in part_share
+ for avoiding two concurrent statements getting the same number.
+ */
+ lock_auto_increment();
+
+ /* Initialize if not already done. */
+ if (!m_part_share->auto_inc_initialized)
+ {
+ initialize_auto_increment(false);
+ }
+
+ /*
+ In a multi-row insert statement like INSERT SELECT and LOAD DATA
+ where the number of candidate rows to insert is not known in advance
+ we must hold a lock/mutex for the whole statement if we have statement
+ based replication. Because the statement-based binary log contains
+ only the first generated value used by the statement, and slaves assumes
+ all other generated values used by this statement were consecutive to
+ this first one, we must exclusively lock the generator until the statement
+ is done.
+ */
+ int binlog_format= thd_binlog_format(thd);
+ if (!m_auto_increment_safe_stmt_log_lock &&
+ thd->lex->sql_command != SQLCOM_INSERT &&
+ binlog_format != BINLOG_FORMAT_UNSPEC &&
+ binlog_format != BINLOG_FORMAT_ROW)
+ {
+ DBUG_PRINT("info", ("locking auto_increment_safe_stmt_log_lock"));
+ m_auto_increment_safe_stmt_log_lock= true;
+ }
+
+ /* this gets corrected (for offset/increment) in update_auto_increment */
+ *first_value= m_part_share->next_auto_inc_val;
+ m_part_share->next_auto_inc_val+= nb_desired_values * increment;
+ if (m_part_share->next_auto_inc_val < *first_value)
+ {
+ /* Overflow, set to max. */
+ m_part_share->next_auto_inc_val= ULLONG_MAX;
+ }
+
+ unlock_auto_increment();
+ DBUG_PRINT("info", ("*first_value: %lu", (ulong) *first_value));
+ *nb_reserved_values= nb_desired_values;
+ DBUG_VOID_RETURN;
+}
+
+
+inline void Partition_helper::set_auto_increment_if_higher()
+{
+ Field_num *field= static_cast<Field_num*>(m_table->found_next_number_field);
+ ulonglong nr= (field->unsigned_flag || field->val_int() > 0)
+ ? field->val_int() : 0;
+ lock_auto_increment();
+ if (!m_part_share->auto_inc_initialized)
+ {
+ initialize_auto_increment(false);
+ }
+ /* must hold the mutex when looking/changing m_part_share. */
+ if (nr >= m_part_share->next_auto_inc_val)
+ {
+ m_part_share->next_auto_inc_val= nr + 1;
+ }
+ unlock_auto_increment();
+ save_auto_increment(nr);
+}
+
+
/**
  Release reserved but unused auto increment values.

  If the auto increment column is a secondary key part, the release is
  forwarded to all partitions. Otherwise the unused tail of the reserved
  interval is handed back to the shared counter, and the statement-level
  lock flag taken in get_auto_increment_first_field() is cleared.
*/
void Partition_helper::ph_release_auto_increment()
{
  DBUG_ENTER("Partition_helper::ph_release_auto_increment");

  if (m_table->s->next_number_keypart)
  {
    release_auto_increment_all_parts();
  }
  else if (m_handler->next_insert_id)
  {
    ulonglong max_reserved= m_handler->auto_inc_interval_for_cur_row.maximum();
    lock_auto_increment();
    m_part_share->release_auto_inc_if_possible(get_thd(), m_table->s,
                                               m_handler->next_insert_id,
                                               max_reserved);
    DBUG_PRINT("info", ("part_share->next_auto_inc_val: %lu",
                        (ulong) m_part_share->next_auto_inc_val));

    /* Unlock the multi row statement lock taken in get_auto_increment */
    if (m_auto_increment_safe_stmt_log_lock)
    {
      m_auto_increment_safe_stmt_log_lock= FALSE;
      DBUG_PRINT("info", ("unlocking auto_increment_safe_stmt_log_lock"));
    }

    unlock_auto_increment();
  }
  DBUG_VOID_RETURN;
}
+
+
+/**
+ Calculate key hash value from an null terminated array of fields.
+ Support function for KEY partitioning.
+
+ @param field_array An array of the fields in KEY partitioning
+
+ @return hash_value calculated
+
+ @note Uses the hash function on the character set of the field.
+ Integer and floating point fields use the binary character set by default.
+*/
+
+uint32 Partition_helper::ph_calculate_key_hash_value(Field **field_array)
+{
+ ulong nr1= 1;
+ ulong nr2= 4;
+ bool use_51_hash;
+ use_51_hash= MY_TEST((*field_array)->table->part_info->key_algorithm ==
+ partition_info::KEY_ALGORITHM_51);
+
+ do
+ {
+ Field *field= *field_array;
+ if (use_51_hash)
+ {
+ switch (field->real_type()) {
+ case MYSQL_TYPE_TINY:
+ case MYSQL_TYPE_SHORT:
+ case MYSQL_TYPE_LONG:
+ case MYSQL_TYPE_FLOAT:
+ case MYSQL_TYPE_DOUBLE:
+ case MYSQL_TYPE_NEWDECIMAL:
+ case MYSQL_TYPE_TIMESTAMP:
+ case MYSQL_TYPE_LONGLONG:
+ case MYSQL_TYPE_INT24:
+ case MYSQL_TYPE_TIME:
+ case MYSQL_TYPE_DATETIME:
+ case MYSQL_TYPE_YEAR:
+ case MYSQL_TYPE_NEWDATE:
+ {
+ if (field->is_null())
+ {
+ nr1^= (nr1 << 1) | 1;
+ continue;
+ }
+ /* Force this to my_hash_sort_bin, which was used in 5.1! */
+ uint len= field->pack_length();
+ my_charset_bin.coll->hash_sort(&my_charset_bin, field->ptr, len,
+ &nr1, &nr2);
+ /* Done with this field, continue with next one. */
+ continue;
+ }
+ case MYSQL_TYPE_STRING:
+ case MYSQL_TYPE_VARCHAR:
+ case MYSQL_TYPE_BIT:
+ /* Not affected, same in 5.1 and 5.5 */
+ break;
+ /*
+ ENUM/SET uses my_hash_sort_simple in 5.1 (i.e. my_charset_latin1)
+ and my_hash_sort_bin in 5.5!
+ */
+ case MYSQL_TYPE_ENUM:
+ case MYSQL_TYPE_SET:
+ {
+ if (field->is_null())
+ {
+ nr1^= (nr1 << 1) | 1;
+ continue;
+ }
+ /* Force this to my_hash_sort_bin, which was used in 5.1! */
+ uint len= field->pack_length();
+ my_charset_latin1.coll->hash_sort(&my_charset_latin1, field->ptr,
+ len, &nr1, &nr2);
+ continue;
+ }
+ /* New types in mysql-5.6. */
+ case MYSQL_TYPE_DATETIME2:
+ case MYSQL_TYPE_TIME2:
+ case MYSQL_TYPE_TIMESTAMP2:
+ /* Not affected, 5.6+ only! */
+ break;
+
+ /* These types should not be allowed for partitioning! */
+ case MYSQL_TYPE_NULL:
+ case MYSQL_TYPE_DECIMAL:
+ case MYSQL_TYPE_DATE:
+ case MYSQL_TYPE_TINY_BLOB:
+ case MYSQL_TYPE_MEDIUM_BLOB:
+ case MYSQL_TYPE_LONG_BLOB:
+ case MYSQL_TYPE_BLOB:
+ case MYSQL_TYPE_VAR_STRING:
+ case MYSQL_TYPE_GEOMETRY:
+ /* fall through. */
+ default:
+ DBUG_ASSERT(0); // New type?
+ /* Fall through for default hashing (5.5). */
+ }
+ /* fall through, use collation based hashing. */
+ }
+ field->hash(&nr1, &nr2);
+ } while (*(++field_array));
+ return (uint32) nr1;
+}
+
+
/**
  Report partitioning specific errors to the user.

  Handles HA_ERR_NO_PARTITION_FOUND and HA_ERR_ROW_IN_WRONG_PARTITION
  (the latter using the row saved in m_err_rec by ph_update_row /
  ph_delete_row), logging details to the error log as well.

  @param error    Error code from the handler layer.
  @param errflag  Flags forwarded to the error reporting functions.

  @return Whether the caller still needs to report the error.
  @retval true   Not handled here; caller must report it.
  @retval false  Already reported to the user; caller can return.
*/
bool Partition_helper::print_partition_error(int error, myf errflag)
{
  THD *thd= get_thd();
  DBUG_ENTER("Partition_helper::print_partition_error");

  /* Should probably look for my own errors first */
  DBUG_PRINT("enter", ("error: %d", error));

  if ((error == HA_ERR_NO_PARTITION_FOUND) &&
      ! (thd->lex->alter_info.flags & Alter_info::ALTER_TRUNCATE_PARTITION))
  {
    m_part_info->print_no_partition_found(m_table, errflag);
    // print_no_partition_found() reports an error, so we can just return here.
    DBUG_RETURN(false);
  }
  else if (error == HA_ERR_ROW_IN_WRONG_PARTITION)
  {
    /*
      Should only happen on DELETE or UPDATE!
      Or in ALTER TABLE REBUILD/REORGANIZE where there are a misplaced
      row that needed to move to an old partition (not in the given set).
    */
    DBUG_ASSERT(thd_sql_command(thd) == SQLCOM_DELETE ||
                thd_sql_command(thd) == SQLCOM_DELETE_MULTI ||
                thd_sql_command(thd) == SQLCOM_UPDATE ||
                thd_sql_command(thd) == SQLCOM_UPDATE_MULTI ||
                thd_sql_command(thd) == SQLCOM_ALTER_TABLE);
    DBUG_ASSERT(m_err_rec);
    if (m_err_rec)
    {
      size_t max_length;
      char buf[MAX_KEY_LENGTH];
      String str(buf,sizeof(buf),system_charset_info);
      uint32 part_id;
      DBUG_ASSERT(m_last_part < m_tot_parts);
      str.length(0);
      /* Build "was in partition X, should be in partition Y" message. */
      if (thd_sql_command(thd) == SQLCOM_ALTER_TABLE)
      {
        str.append("from REBUILD/REORGANIZED partition: ");
        str.append_ulonglong(m_last_part);
        str.append(" to non included partition (new definition): ");
      }
      else
      {
        str.append_ulonglong(m_last_part);
        str.append(". Correct is ");
      }
      if (get_part_for_delete(m_err_rec,
                              m_table->record[0],
                              m_part_info,
                              &part_id))
      {
        str.append("?");
      }
      else
      {
        str.append_ulonglong(part_id);
      }
      append_row_to_str(str, m_err_rec, m_table);

      /* Log this error, so the DBA can notice it and fix it! */
      sql_print_error("Table '%-192s' corrupted: row in wrong partition: %s\n"
                      "Please REPAIR the table!",
                      m_table->s->table_name.str,
                      str.c_ptr_safe());

      /* Truncate the message (with "...") to fit the error message size. */
      max_length= (MYSQL_ERRMSG_SIZE - strlen(ER(ER_ROW_IN_WRONG_PARTITION)));
      if (str.length() >= max_length)
      {
        str.length(max_length-4);
        str.append(STRING_WITH_LEN("..."));
      }
      my_error(ER_ROW_IN_WRONG_PARTITION, MYF(0), str.c_ptr_safe());
      m_err_rec= NULL;
      DBUG_RETURN(false);
    }
  }

  DBUG_RETURN(true);
}
+
+
+/**
+ Implement the partition changes defined by ALTER TABLE of partitions.
+
+ Add and copy if needed a number of partitions, during this operation
+ only read operation is ongoing in the server. This is used by
+ ADD PARTITION all types as well as by REORGANIZE PARTITION. For
+ one-phased implementations it is used also by DROP and COALESCE
+ PARTITIONs.
+ One-phased implementation needs the new frm file, other handlers will
+ get zero length and a NULL reference here.
+
+ @param[in] create_info HA_CREATE_INFO object describing all
+ fields and indexes in table
+ @param[in] path Complete path of db and table name
+ @param[out] copied Output parameter where number of copied
+ records are added
+ @param[out] deleted Output parameter where number of deleted
+ records are added
+
+ @return Operation status
+ @retval 0 Success
+ @retval != 0 Failure
+*/
+
+// FIXME: duplicate of ha_partition::change_partitions
+int Partition_helper::change_partitions(HA_CREATE_INFO *create_info,
+ const char *path,
+ ulonglong * const copied,
+ ulonglong * const deleted)
+{
+ List_iterator<partition_element> part_it(m_part_info->partitions);
+ List_iterator <partition_element> t_it(m_part_info->temp_partitions);
+ char part_name_buff[FN_REFLEN];
+ const char *table_level_data_file_name= create_info->data_file_name;
+ const char *table_level_index_file_name= create_info->index_file_name;
+ const char *table_level_tablespace_name= create_info->tablespace;
+ uint num_parts= m_part_info->partitions.elements;
+ uint num_subparts= m_part_info->num_subparts;
+ uint i= 0;
+ uint num_remain_partitions;
+ uint num_reorged_parts;
+ int error= 1;
+ bool first;
+ uint temp_partitions= m_part_info->temp_partitions.elements;
+ THD *thd= get_thd();
+ DBUG_ENTER("Partition_helper::change_partitions");
+
+ /*
+ Use the read_partitions bitmap for reorganized partitions,
+ i.e. what to copy.
+ */
+ bitmap_clear_all(&m_part_info->read_partitions);
+
+ /*
+ Assert that it works without HA_FILE_BASED and lower_case_table_name = 2.
+ */
+ DBUG_ASSERT(!strcmp(path, get_canonical_filename(m_handler, path,
+ part_name_buff)));
+ num_reorged_parts= 0;
+ if (!m_part_info->is_sub_partitioned())
+ num_subparts= 1;
+
+ /*
+ Step 1:
+ Calculate number of reorganized partitions.
+ */
+ if (temp_partitions)
+ {
+ num_reorged_parts= temp_partitions * num_subparts;
+ }
+ else
+ {
+ do
+ {
+ partition_element *part_elem= part_it++;
+ if (part_elem->part_state == PART_CHANGED ||
+ part_elem->part_state == PART_REORGED_DROPPED)
+ {
+ num_reorged_parts+= num_subparts;
+ }
+ } while (++i < num_parts);
+ }
+
+ /*
+ Step 2:
+ Calculate number of partitions after change.
+ */
+ num_remain_partitions= 0;
+ if (temp_partitions)
+ {
+ num_remain_partitions= num_parts * num_subparts;
+ }
+ else
+ {
+ part_it.rewind();
+ i= 0;
+ do
+ {
+ partition_element *part_elem= part_it++;
+ if (part_elem->part_state == PART_NORMAL ||
+ part_elem->part_state == PART_TO_BE_ADDED ||
+ part_elem->part_state == PART_CHANGED)
+ {
+ num_remain_partitions+= num_subparts;
+ }
+ } while (++i < num_parts);
+ }
+
+ /*
+ Step 3:
+ Set the read_partition bit for all partitions to be copied.
+ */
+ if (num_reorged_parts)
+ {
+ i= 0;
+ first= true;
+ part_it.rewind();
+ do
+ {
+ partition_element *part_elem= part_it++;
+ if (part_elem->part_state == PART_CHANGED ||
+ part_elem->part_state == PART_REORGED_DROPPED)
+ {
+ for (uint sp = 0; sp < num_subparts; sp++)
+ {
+ bitmap_set_bit(&m_part_info->read_partitions, i * num_subparts + sp);
+ }
+ DBUG_ASSERT(first);
+ }
+ else if (first && temp_partitions &&
+ part_elem->part_state == PART_TO_BE_ADDED)
+ {
+ /*
+ When doing an ALTER TABLE REORGANIZE PARTITION a number of
+ partitions is to be reorganized into a set of new partitions.
+ The reorganized partitions are in this case in the temp_partitions
+ list. We mark all of them in one batch and thus we only do this
+ until we find the first partition with state PART_TO_BE_ADDED
+ since this is where the new partitions go in and where the old
+ ones used to be.
+ */
+ first= false;
+ DBUG_ASSERT(((i*num_subparts) + num_reorged_parts) <= m_tot_parts);
+ for (uint sp = 0; sp < num_reorged_parts; sp++)
+ {
+ bitmap_set_bit(&m_part_info->read_partitions, i * num_subparts + sp);
+ }
+ }
+ } while (++i < num_parts);
+ }
+
+ /*
+ Step 4:
+ Create the new partitions and also open, lock and call
+ external_lock on them (if needed) to prepare them for copy phase
+ and also for later close calls.
+ No need to create PART_NORMAL partitions since they must not
+ be written to!
+ Only PART_CHANGED and PART_TO_BE_ADDED should be written to!
+ */
+
+ error= prepare_for_new_partitions(num_remain_partitions,
+ num_reorged_parts == 0);
+
+ i= 0;
+ part_it.rewind();
+ do
+ {
+ partition_element *part_elem= part_it++;
+ DBUG_ASSERT(part_elem->part_state >= PART_NORMAL &&
+ part_elem->part_state <= PART_CHANGED);
+ if (part_elem->part_state == PART_TO_BE_ADDED ||
+ part_elem->part_state == PART_CHANGED)
+ {
+ /*
+ A new partition needs to be created PART_TO_BE_ADDED means an
+ entirely new partition and PART_CHANGED means a changed partition
+ that will still exist with either more or less data in it.
+ */
+ uint name_variant= NORMAL_PART_NAME;
+ if (part_elem->part_state == PART_CHANGED ||
+ (part_elem->part_state == PART_TO_BE_ADDED && temp_partitions))
+ name_variant= TEMP_PART_NAME;
+ if (m_part_info->is_sub_partitioned())
+ {
+ List_iterator<partition_element> sub_it(part_elem->subpartitions);
+ uint j= 0, part;
+ do
+ {
+ partition_element *sub_elem= sub_it++;
+ create_subpartition_name(part_name_buff, path,
+ part_elem->partition_name,
+ sub_elem->partition_name,
+ name_variant);
+ part= i * num_subparts + j;
+ DBUG_PRINT("info", ("Add subpartition %s", part_name_buff));
+ /*
+ update_create_info was called previously in
+ mysql_prepare_alter_table. Which may have set data/index_file_name
+ for the partitions to the full partition name, including
+ '#P#<part_name>[#SP#<subpart_name>] suffix. Remove that suffix
+ if it exists.
+ */
+ truncate_partition_filename(sub_elem->data_file_name);
+ truncate_partition_filename(sub_elem->index_file_name);
+ /* Notice that sub_elem is already based on part_elem's defaults. */
+ error= set_up_table_before_create(thd,
+ m_table->s,
+ part_name_buff,
+ create_info,
+ sub_elem);
+ if (error)
+ {
+ goto err;
+ }
+ if ((error= create_new_partition(m_table,
+ create_info,
+ part_name_buff,
+ part,
+ sub_elem)))
+ {
+ goto err;
+ }
+ /* Reset create_info to table level values. */
+ create_info->data_file_name= table_level_data_file_name;
+ create_info->index_file_name= table_level_index_file_name;
+ create_info->tablespace= table_level_tablespace_name;
+ } while (++j < num_subparts);
+ }
+ else
+ {
+ create_partition_name(part_name_buff, path,
+ part_elem->partition_name, name_variant,
+ true);
+ DBUG_PRINT("info", ("Add partition %s", part_name_buff));
+ /* See comment in subpartition branch above! */
+ truncate_partition_filename(part_elem->data_file_name);
+ truncate_partition_filename(part_elem->index_file_name);
+ error= set_up_table_before_create(thd,
+ m_table->s,
+ part_name_buff,
+ create_info,
+ part_elem);
+ if (error)
+ {
+ goto err;
+ }
+ if ((error= create_new_partition(m_table,
+ create_info,
+ (const char *)part_name_buff,
+ i,
+ part_elem)))
+ {
+ goto err;
+ }
+ /* Reset create_info to table level values. */
+ create_info->data_file_name= table_level_data_file_name;
+ create_info->index_file_name= table_level_index_file_name;
+ create_info->tablespace= table_level_tablespace_name;
+ }
+ }
+ } while (++i < num_parts);
+
+ /*
+ Step 5:
+ State update to prepare for next write of the frm file.
+ */
+ i= 0;
+ part_it.rewind();
+ do
+ {
+ partition_element *part_elem= part_it++;
+ if (part_elem->part_state == PART_TO_BE_ADDED)
+ part_elem->part_state= PART_IS_ADDED;
+ else if (part_elem->part_state == PART_CHANGED)
+ part_elem->part_state= PART_IS_CHANGED;
+ else if (part_elem->part_state == PART_REORGED_DROPPED)
+ part_elem->part_state= PART_TO_BE_DROPPED;
+ } while (++i < num_parts);
+ for (i= 0; i < temp_partitions; i++)
+ {
+ partition_element *part_elem= t_it++;
+ DBUG_ASSERT(part_elem->part_state == PART_TO_BE_REORGED);
+ part_elem->part_state= PART_TO_BE_DROPPED;
+ }
+ error= copy_partitions(copied, deleted);
+err:
+ if (error)
+ {
+ m_handler->print_error(error,
+ MYF(error != ER_OUTOFMEMORY ? 0 : ME_FATALERROR));
+ }
+ /*
+ Close and unlock the new temporary partitions.
+ They will later be deleted or renamed through the ddl-log.
+ */
+ close_new_partitions();
+ DBUG_RETURN(error);
+}
+
+
+/**
+ Copy partitions as part of ALTER TABLE of partitions.
+
+ change_partitions has done all the preparations, now it is time to
+ actually copy the data from the reorganized partitions to the new
+ partitions.
+
+ @param[out] copied Number of records copied.
+ @param[out] deleted Number of records deleted.
+
+ @return Operation status
+ @retval 0 Success
+ @retval >0 Error code
+*/
+
+int Partition_helper::copy_partitions(ulonglong * const copied,
+ ulonglong * const deleted)
+{
+ uint new_part= 0;
+ int result= 0;
+ longlong func_value;
+ DBUG_ENTER("Partition_helper::copy_partitions");
+
+ if (m_part_info->linear_hash_ind)
+ {
+ if (m_part_info->part_type == HASH_PARTITION)
+ set_linear_hash_mask(m_part_info, m_part_info->num_parts);
+ else
+ set_linear_hash_mask(m_part_info, m_part_info->num_subparts);
+ }
+
+ /*
+ m_part_info->read_partitions bitmap is setup for all the reorganized
+ partitions to be copied. So we can use the normal handler rnd interface
+ for reading.
+ */
+ if ((result= m_handler->ha_rnd_init(1)))
+ {
+ DBUG_RETURN(result);
+ }
+ while (true)
+ {
+ if ((result= m_handler->ha_rnd_next(m_table->record[0])))
+ {
+ if (result == HA_ERR_RECORD_DELETED)
+ continue; //Probably MyISAM
+ if (result != HA_ERR_END_OF_FILE)
+ goto error;
+ /*
+ End-of-file reached, break out to end the copy process.
+ */
+ break;
+ }
+ /* Found record to insert into new handler */
+ if (m_part_info->get_partition_id(m_part_info, &new_part,
+ &func_value))
+ {
+ /*
+ This record is in the original table but will not be in the new
+ table since it doesn't fit into any partition any longer due to
+ changed partitioning ranges or list values.
+ */
+ (*deleted)++;
+ }
+ else
+ {
+ if ((result= write_row_in_new_part(new_part)))
+ {
+ goto error;
+ }
+ }
+ }
+ m_handler->ha_rnd_end();
+ DBUG_RETURN(false);
+error:
+ m_handler->ha_rnd_end();
+ DBUG_RETURN(result);
+}
+
+
+/**
+ Check/fix misplaced rows.
+
+ @param part_id Partition to check/fix.
+ @param repair If true, move misplaced rows to correct partition.
+
+ @return Operation status.
+ @retval 0 Success
+ @retval != 0 Error
+*/
+
+int Partition_helper::check_misplaced_rows(uint read_part_id, bool repair)
+{
+ int result= 0;
+ THD *thd= get_thd();
+ bool ignore= thd->lex->ignore;
+ uint32 correct_part_id;
+ longlong func_value;
+ ha_rows num_misplaced_rows= 0;
+ ha_rows num_deleted_rows= 0;
+
+ DBUG_ENTER("Partition_helper::check_misplaced_rows");
+
+ if (repair)
+ {
+ /* We must read the full row, if we need to move it! */
+ bitmap_set_all(m_table->read_set);
+ bitmap_set_all(m_table->write_set);
+ }
+ else
+ {
+ /* Only need to read the partitioning fields. */
+ bitmap_union(m_table->read_set, &m_part_info->full_part_field_set);
+#if 0
+ /* Fill the base columns of virtual generated columns if necessary */
+ for (Field **ptr= m_part_info->full_part_field_array; *ptr; ptr++)
+ {
+ if ((*ptr)->is_virtual_gcol())
+ m_table->mark_gcol_in_maps(*ptr);
+ }
+#endif
+ }
+
+ /* Scan the single partition given as read_part_id. */
+ if ((result= rnd_init_in_part(read_part_id, true)))
+ DBUG_RETURN(result);
+
+ while (true)
+ {
+ if ((result= ph_rnd_next_in_part(read_part_id, m_table->record[0])))
+ {
+ /* Skip physically deleted rows; some engines (e.g. MyISAM) report them. */
+ if (result == HA_ERR_RECORD_DELETED)
+ continue;
+ if (result != HA_ERR_END_OF_FILE)
+ break;
+
+ /* Scan done: summarize what was found/fixed before ending. */
+ if (num_misplaced_rows > 0)
+ {
+ if (repair)
+ {
+ if (num_deleted_rows > 0)
+ {
+ print_admin_msg(thd, MI_MAX_MSG_BUF, "warning",
+ m_table->s->db.str, m_table->alias,
+ opt_op_name[REPAIR_PARTS],
+ "Moved %lld misplaced rows, deleted %lld rows",
+ num_misplaced_rows - num_deleted_rows,
+ num_deleted_rows);
+ }
+ else
+ {
+ print_admin_msg(thd, MI_MAX_MSG_BUF, "warning",
+ m_table->s->db.str, m_table->alias,
+ opt_op_name[REPAIR_PARTS],
+ "Moved %lld misplaced rows",
+ num_misplaced_rows);
+ }
+ }
+ else
+ {
+ print_admin_msg(thd, MI_MAX_MSG_BUF, "error",
+ m_table->s->db.str, m_table->alias,
+ opt_op_name[CHECK_PARTS],
+ "Found %lld misplaced rows in partition %u",
+ num_misplaced_rows,
+ read_part_id);
+ }
+ }
+ /* End-of-file reached, all rows are now OK, reset result and break. */
+ result= 0;
+ break;
+ }
+
+ /* Recompute which partition the current row belongs in. */
+ result= m_part_info->get_partition_id(m_part_info, &correct_part_id,
+ &func_value);
+ // TODO: Add code to delete rows not matching any partition.
+ if (result)
+ break;
+
+ if (correct_part_id != read_part_id)
+ {
+ num_misplaced_rows++;
+ /*
+ NOTE(review): m_err_rec is cleared before append_row_to_str below;
+ the helper presumably falls back to m_table->record[0] when given
+ NULL - confirm against append_row_to_str's implementation.
+ */
+ m_err_rec= NULL;
+ if (!repair)
+ {
+ /* Check. */
+ result= HA_ADMIN_NEEDS_UPGRADE;
+ char buf[MAX_KEY_LENGTH];
+ String str(buf,sizeof(buf),system_charset_info);
+ str.length(0);
+ append_row_to_str(str, m_err_rec, m_table);
+ print_admin_msg(thd, MI_MAX_MSG_BUF, "error",
+ m_table->s->db.str, m_table->alias,
+ opt_op_name[CHECK_PARTS],
+ "Found a misplaced row"
+ " in part %d should be in part %d:\n%s",
+ read_part_id,
+ correct_part_id,
+ str.c_ptr_safe());
+ /* Break on first misplaced row, unless ignore is given! */
+ if (!ignore)
+ break;
+ }
+ else
+ {
+ DBUG_PRINT("info", ("Moving row from partition %d to %d",
+ read_part_id, correct_part_id));
+
+ /*
+ Insert row into correct partition. Notice that there are no commit
+ for every N row, so the repair will be one large transaction!
+ */
+ if ((result= write_row_in_part(correct_part_id, m_table->record[0])))
+ {
+ /*
+ We have failed to insert a row, it might have been a duplicate!
+ */
+ char buf[MAX_KEY_LENGTH];
+ String str(buf,sizeof(buf),system_charset_info);
+ str.length(0);
+ if (result == HA_ERR_FOUND_DUPP_KEY)
+ {
+ if (ignore)
+ {
+ str.append("Duplicate key found, deleting the record:\n");
+ num_deleted_rows++;
+ }
+ else
+ {
+ str.append("Duplicate key found, "
+ "please update or delete the record:\n");
+ result= HA_ADMIN_CORRUPT;
+ }
+ }
+ append_row_to_str(str, m_err_rec, m_table);
+
+ /*
+ If the engine supports transactions, the failure will be
+ rollbacked.
+ */
+ if (!m_handler->has_transactions() ||
+ ignore || result == HA_ADMIN_CORRUPT)
+ {
+ /* Log this error, so the DBA can notice it and fix it! */
+ sql_print_error("Table '%-192s' failed to move/insert a row"
+ " from part %d into part %d:\n%s",
+ m_table->s->table_name.str,
+ read_part_id,
+ correct_part_id,
+ str.c_ptr_safe());
+ }
+ print_admin_msg(thd, MI_MAX_MSG_BUF, "error",
+ m_table->s->db.str, m_table->alias,
+ opt_op_name[REPAIR_PARTS],
+ "Failed to move/insert a row"
+ " from part %d into part %d:\n%s",
+ read_part_id,
+ correct_part_id,
+ str.c_ptr_safe());
+ /* Duplicate with IGNORE: fall through and delete the source row. */
+ if (!ignore || result != HA_ERR_FOUND_DUPP_KEY)
+ break;
+ }
+
+ /* Delete row from wrong partition. */
+ if ((result= delete_row_in_part(read_part_id, m_table->record[0])))
+ {
+ result= HA_ADMIN_CORRUPT;
+ if (m_handler->has_transactions())
+ break;
+ /*
+ We have introduced a duplicate, since we failed to remove it
+ from the wrong partition.
+ */
+ char buf[MAX_KEY_LENGTH];
+ String str(buf,sizeof(buf),system_charset_info);
+ str.length(0);
+ append_row_to_str(str, m_err_rec, m_table);
+
+ /* Log this error, so the DBA can notice it and fix it! */
+ sql_print_error("Table '%-192s': Delete from part %d failed with"
+ " error %d. But it was already inserted into"
+ " part %d, when moving the misplaced row!"
+ "\nPlease manually fix the duplicate row:\n%s",
+ m_table->s->table_name.str,
+ read_part_id,
+ result,
+ correct_part_id,
+ str.c_ptr_safe());
+ break;
+ }
+ }
+ }
+ }
+
+ /* Always end the scan; the scan error (if any) takes precedence. */
+ int tmp_result= rnd_end_in_part(read_part_id, true);
+ DBUG_RETURN(result ? result : tmp_result);
+}
+
+/**
+ Read next row during full partition scan (scan in random row order).
+
+ This function can evaluate the virtual generated columns. If virtual
+ generated columns are involved, you should not call rnd_next_in_part
+ directly but this one.
+
+ @param part_id Partition to read from.
+ @param[in,out] buf buffer that should be filled with data.
+
+ @return Operation status.
+ @retval 0 Success
+ @retval != 0 Error code
+*/
+
+int Partition_helper::ph_rnd_next_in_part(uint part_id, uchar *buf)
+{
+  /* Delegate the read to the engine-specific per-partition scan. */
+  const int res= rnd_next_in_part(part_id, buf);
+
+#if 0
+  /* Disabled: refresh virtual generated columns after a successful read. */
+  if (!res && m_table->has_gcol())
+    return update_generated_read_fields(buf, m_table);
+#endif
+
+  return res;
+}
+
+
+/** Set used partitions bitmap from Alter_info.
+
+ @return false if success else true.
+*/
+
+bool Partition_helper::set_altered_partitions()
+{
+  /*
+    Restrict read_partitions to the partitions named in
+    ALTER TABLE t <cmd> PARTITION <partition list>.
+    For a full-table admin command (or ALL PARTITIONS) the bitmap
+    already covers every partition, so nothing needs to change.
+  */
+  Alter_info *alter_info= &get_thd()->lex->alter_info;
+  const bool admin_on_subset=
+    (alter_info->flags & Alter_info::ALTER_ADMIN_PARTITION) != 0 &&
+    (alter_info->flags & Alter_info::ALTER_ALL_PARTITION) == 0;
+
+  if (!admin_on_subset)
+    return false;
+
+  return m_part_info->set_read_partitions(&alter_info->partition_names);
+}
+
+#if 0
+/*
+ NOTE(review): this block is compiled out, presumably because the
+ Protocol methods used here (connection_alive/start_row/end_row) come
+ from the MySQL 5.7 API and are not available in this server - confirm
+ before re-enabling. check_misplaced_rows() calls print_admin_msg(), so
+ a definition must exist elsewhere while this one is disabled.
+*/
+/**
+ Print a message row formatted for ANALYZE/CHECK/OPTIMIZE/REPAIR TABLE.
+
+ Modeled after mi_check_print_msg.
+
+ @param thd Thread context.
+ @param len Needed length for message buffer.
+ @param msg_type Message type.
+ @param db_name Database name.
+ @param table_name Table name.
+ @param op_name Operation name.
+ @param fmt Message (in printf format with additional arguments).
+
+ @return Operation status.
+ @retval false for success else true.
+*/
+
+bool Partition_helper::print_admin_msg(THD* thd,
+ uint len,
+ const char *msg_type,
+ const char *db_name,
+ const char *table_name,
+ const char *op_name,
+ const char *fmt,
+ ...)
+{
+ va_list args;
+ Protocol *protocol= thd->protocol;
+ uint length;
+ size_t msg_length;
+ char name[NAME_LEN*2+2];
+ char *msgbuf;
+ bool error= true;
+
+ if (!(msgbuf= (char*) my_malloc(len, MYF(0))))
+ return true;
+ va_start(args, fmt);
+ msg_length= my_vsnprintf(msgbuf, len, fmt, args);
+ va_end(args);
+ if (msg_length >= (len - 1))
+ goto err;
+ msgbuf[len - 1] = 0; // healthy paranoia
+
+ if (!thd->protocol->connection_alive())
+ {
+ sql_print_error("%s", msgbuf);
+ goto err;
+ }
+
+ length=(uint) (strxmov(name, db_name, ".", table_name,NullS) - name);
+ /*
+ TODO: switch from protocol to push_warning here. The main reason we didn't
+ it yet is parallel repair. Due to following trace:
+ mi_check_print_msg/push_warning/sql_alloc/my_pthread_getspecific_ptr.
+
+ Also we likely need to lock mutex here (in both cases with protocol and
+ push_warning).
+ */
+ DBUG_PRINT("info",("print_admin_msg: %s, %s, %s, %s", name, op_name,
+ msg_type, msgbuf));
+ protocol->start_row();
+ protocol->store(name, length, system_charset_info);
+ protocol->store(op_name, system_charset_info);
+ protocol->store(msg_type, system_charset_info);
+ protocol->store(msgbuf, msg_length, system_charset_info);
+ if (protocol->end_row())
+ {
+ sql_print_error("Failed on my_net_write, writing to stderr instead: %s\n",
+ msgbuf);
+ goto err;
+ }
+ error= false;
+err:
+ my_free(msgbuf);
+ return error;
+}
+#endif
+
+
+/**
+ Set table->read_set taking partitioning expressions into account.
+
+ Used from both the rnd_init and index_init paths; the function itself
+ takes no arguments.
+*/
+
+inline
+void Partition_helper::set_partition_read_set()
+{
+ /*
+ For operations that may need to change data, we may need to extend
+ read_set.
+ */
+ if (m_handler->get_lock_type() == F_WRLCK)
+ {
+ /*
+ If write_set contains any of the fields used in partition and
+ subpartition expression, we need to set all bits in read_set because
+ the row may need to be inserted in a different [sub]partition. In
+ other words update_row() can be converted into write_row(), which
+ requires a complete record.
+ */
+ if (bitmap_is_overlapping(&m_part_info->full_part_field_set,
+ m_table->write_set))
+ {
+ bitmap_set_all(m_table->read_set);
+ }
+ else
+ {
+ /*
+ Some handlers only read fields as specified by the bitmap for the
+ read set. For partitioned handlers we always require that the
+ fields of the partition functions are read such that we can
+ calculate the partition id to place updated and deleted records.
+ */
+ bitmap_union(m_table->read_set, &m_part_info->full_part_field_set);
+ }
+ // Mark virtual generated columns writable
+ // (presumably because a read materializes their value into the record
+ // buffer - confirm against the vcol evaluation code)
+ for (Field **vf= m_table->vfield; vf && *vf; vf++)
+ {
+ if (bitmap_is_set(m_table->read_set, (*vf)->field_index))
+ bitmap_set_bit(m_table->write_set, (*vf)->field_index);
+ }
+ }
+}
+
+
+/****************************************************************************
+ MODULE full table scan
+****************************************************************************/
+
+/**
+ Initialize engine for random reads.
+
+ rnd_init() is called when the server wants the storage engine to do a
+ table scan or when the server wants to access data through rnd_pos.
+
+ When scan is used we will scan one handler partition at a time.
+ When preparing for rnd_pos we will initialize all handler partitions.
+ No extra cache handling is needed when scanning is not performed.
+
+ Before initializing we will call rnd_end to ensure that we clean up from
+ any previous incarnation of a table scan.
+
+ @param scan false for initialize for random reads through rnd_pos()
+ true for initialize for random scan through rnd_next().
+
+ @return Operation status.
+ @retval 0 Success
+ @retval != 0 Error code
+*/
+
+int Partition_helper::ph_rnd_init(bool scan)
+{
+ /*
+ m_scan_value encodes the init state: 1 = table scan (one partition at
+ a time), 0 = initialized for rnd_pos (all used partitions at once),
+ 2 = not initialized / init failed (set below on error or when the
+ partition set is empty).
+ */
+ int error;
+ uint i= 0;
+ uint part_id;
+ DBUG_ENTER("Partition_helper::ph_rnd_init");
+
+ set_partition_read_set();
+
+ /* Now we see what the index of our first important partition is */
+ DBUG_PRINT("info", ("m_part_info->read_partitions: 0x%lx",
+ (long) m_part_info->read_partitions.bitmap));
+ part_id= m_part_info->get_first_used_partition();
+ DBUG_PRINT("info", ("m_part_spec.start_part %d", part_id));
+
+ if (MY_BIT_NONE == part_id)
+ {
+ error= 0;
+ goto err1;
+ }
+
+ DBUG_PRINT("info", ("rnd_init on partition %d", part_id));
+ if (scan)
+ {
+ /* A scan can be restarted without rnd_end() in between! */
+ if (m_scan_value == 1 && m_part_spec.start_part != NOT_A_PARTITION_ID)
+ {
+ /* End previous scan on partition before restart. */
+ if ((error= rnd_end_in_part(m_part_spec.start_part, scan)))
+ {
+ DBUG_RETURN(error);
+ }
+ }
+ m_scan_value= 1;
+ if ((error= rnd_init_in_part(part_id, scan)))
+ goto err;
+ }
+ else
+ {
+ m_scan_value= 0;
+ for (i= part_id;
+ i < MY_BIT_NONE;
+ i= m_part_info->get_next_used_partition(i))
+ {
+ if ((error= rnd_init_in_part(i, scan)))
+ goto err;
+ }
+ }
+ m_part_spec.start_part= part_id;
+ m_part_spec.end_part= m_tot_parts - 1;
+ DBUG_PRINT("info", ("m_scan_value=%d", m_scan_value));
+ DBUG_RETURN(0);
+
+err:
+ /* Call rnd_end for all previously initialized partitions. */
+ /*
+ NOTE(review): i is only advanced in the non-scan branch, so this loop
+ is a no-op when the single-partition (scan) init fails, as intended.
+ */
+ for (;
+ part_id < i;
+ part_id= m_part_info->get_next_used_partition(part_id))
+ {
+ rnd_end_in_part(part_id, scan);
+ }
+err1:
+ m_scan_value= 2;
+ m_part_spec.start_part= NO_CURRENT_PART_ID;
+ DBUG_RETURN(error);
+}
+
+
+/**
+ End of a table scan.
+
+ @return Operation status.
+ @retval 0 Success
+ @retval != 0 Error code
+*/
+
+int Partition_helper::ph_rnd_end()
+{
+ int error= 0;
+ DBUG_ENTER("Partition_helper::ph_rnd_end");
+ /* m_scan_value was set by ph_rnd_init (1 = scan, 0 = rnd_pos, 2 = failed). */
+ switch (m_scan_value) {
+ case 3: // Already ended (set at the bottom of this function); double rnd_end
+ DBUG_ASSERT(0);
+ /* fall through. */
+ case 2: // rnd_init failed or had no used partitions; nothing to end
+ break;
+ case 1:
+ if (NO_CURRENT_PART_ID != m_part_spec.start_part) // Table scan
+ {
+ error= rnd_end_in_part(m_part_spec.start_part, true);
+ }
+ break;
+ case 0:
+ /* rnd_pos mode: every used partition was initialized; end them all,
+ keeping the first error but still closing the rest. */
+ uint i;
+ for (i= m_part_info->get_first_used_partition();
+ i < MY_BIT_NONE;
+ i= m_part_info->get_next_used_partition(i))
+ {
+ int part_error;
+ part_error= rnd_end_in_part(i, false);
+ if (part_error && !error) {
+ error= part_error;
+ }
+ }
+ break;
+ }
+ m_scan_value= 3;
+ m_part_spec.start_part= NO_CURRENT_PART_ID;
+ DBUG_RETURN(error);
+}
+
+
+/**
+ Read next row during full table scan (scan in random row order).
+
+ This is called for each row of the table scan. When you run out of records
+ you should return HA_ERR_END_OF_FILE.
+ The Field structure for the table is the key to getting data into buf
+ in a manner that will allow the server to understand it.
+
+ @param[out] buf buffer that should be filled with data.
+
+ @return Operation status.
+ @retval 0 Success
+ @retval != 0 Error code
+*/
+
+int Partition_helper::ph_rnd_next(uchar *buf)
+{
+ int result= HA_ERR_END_OF_FILE;
+ uint part_id= m_part_spec.start_part;
+ DBUG_ENTER("Partition_helper::ph_rnd_next");
+
+ if (NO_CURRENT_PART_ID == part_id)
+ {
+ /*
+ The original set of partitions to scan was empty and thus we report
+ the result here.
+ */
+ goto end;
+ }
+
+ DBUG_ASSERT(m_scan_value == 1);
+
+ /* Scan the current partition until exhausted, then move to the next one. */
+ while (TRUE)
+ {
+ result= rnd_next_in_part(part_id, buf);
+ if (!result)
+ {
+ m_last_part= part_id;
+ m_part_spec.start_part= part_id;
+ m_table->status= 0;
+ DBUG_RETURN(0);
+ }
+
+ /*
+ if we get here, then the current partition ha_rnd_next returned failure
+ */
+ if (result == HA_ERR_RECORD_DELETED)
+ continue; // Probably MyISAM
+
+ if (result != HA_ERR_END_OF_FILE)
+ goto end_dont_reset_start_part; // Return error
+
+ /* End current partition */
+ DBUG_PRINT("info", ("rnd_end on partition %d", part_id));
+ if ((result= rnd_end_in_part(part_id, true)))
+ break;
+
+ /* Shift to next partition */
+ part_id= m_part_info->get_next_used_partition(part_id);
+ /* get_next_used_partition() yields MY_BIT_NONE (>= m_tot_parts) at end. */
+ if (part_id >= m_tot_parts)
+ {
+ result= HA_ERR_END_OF_FILE;
+ break;
+ }
+ m_last_part= part_id;
+ m_part_spec.start_part= part_id;
+ DBUG_PRINT("info", ("rnd_init on partition %d", part_id));
+ if ((result= rnd_init_in_part(part_id, true)))
+ break;
+ }
+
+end:
+ m_part_spec.start_part= NO_CURRENT_PART_ID;
+end_dont_reset_start_part:
+ /* start_part is left untouched on this path, presumably so a retried
+ rnd_next can resume in the same partition - confirm with callers. */
+ m_table->status= STATUS_NOT_FOUND;
+ DBUG_RETURN(result);
+}
+
+
+/**
+ Save position of current row.
+
+ position() is called after each call to rnd_next() if the data needs
+ to be ordered or accessed later.
+
+ The server uses ref to store data. ref_length in the above case is
+ the size needed to store current_position. ref is just a byte array
+ that the server will maintain. If you are using offsets to mark rows, then
+ current_position should be the offset. If it is a primary key like in
+ InnoDB, then it needs to be a primary key.
+
+ @param record Current record in MySQL Row Format.
+*/
+
+void Partition_helper::ph_position(const uchar *record)
+{
+ /* The stored ref layout is: 2 bytes partition id (int2store) followed by
+ the engine's own row position of length ref_length - 2. */
+ DBUG_ASSERT(m_part_info->is_partition_used(m_last_part));
+ DBUG_ENTER("Partition_helper::ph_position");
+ DBUG_PRINT("info", ("record: %p", record));
+ DBUG_DUMP("record", record, m_rec_length);
+
+ /*
+ If m_ref_usage is set, then the ref is already stored in the
+ priority queue (m_queue) when doing ordered scans.
+ */
+ if (m_ref_usage != REF_NOT_USED && m_ordered_scan_ongoing)
+ {
+ DBUG_ASSERT(!m_queue->empty());
+ DBUG_ASSERT(m_ordered_rec_buffer);
+ DBUG_ASSERT(!m_curr_key_info[1]);
+ DBUG_ASSERT(uint2korr(m_queue->top()) == m_last_part);
+ /* We already have the ref and part id. */
+ memcpy(m_handler->ref, m_queue->top(), m_handler->ref_length);
+ }
+ else
+ {
+ DBUG_PRINT("info", ("m_last_part: %u", m_last_part));
+ int2store(m_handler->ref, m_last_part);
+ position_in_last_part(m_handler->ref + PARTITION_BYTES_IN_POS, record);
+ }
+ DBUG_DUMP("ref_out", m_handler->ref, m_handler->ref_length);
+
+ DBUG_VOID_RETURN;
+}
+
+
+/**
+ Read row using position.
+
+ This is like rnd_next, but you are given a position to use to determine
+ the row. The position will be pointing to data of length handler::ref_length
+ that handler::ref was set by position(record). Tables clustered on primary
+ key usually use the full primary key as reference (like InnoDB). Heap based
+ tables usually returns offset in heap file (like MyISAM).
+
+ @param[out] buf buffer that should be filled with record in MySQL format.
+ @param[in] pos position given as handler::ref when position() was called.
+
+ @return Operation status.
+ @retval 0 Success
+ @retval != 0 Error code
+*/
+
+int Partition_helper::ph_rnd_pos(uchar *buf, uchar *pos)
+{
+  DBUG_ENTER("Partition_helper::ph_rnd_pos");
+
+  /* The first PARTITION_BYTES_IN_POS bytes of the ref hold the part id. */
+  const uint part_id= uint2korr(pos);
+  DBUG_ASSERT(part_id < m_tot_parts);
+  DBUG_ASSERT(m_part_info->is_partition_used(part_id));
+  m_last_part= part_id;
+  /* The remainder of the ref is the engine-specific row position. */
+  DBUG_RETURN(rnd_pos_in_part(part_id, buf, pos + PARTITION_BYTES_IN_POS));
+}
+
+
+/**
+ Read row using position using given record to find.
+
+ This works as position()+rnd_pos() functions, but does some extra work,
+ calculating m_last_part - the partition to where the 'record' should go.
+
+ Only useful when position is based on primary key
+ (HA_PRIMARY_KEY_REQUIRED_FOR_POSITION).
+
+ @param record Current record in MySQL Row Format.
+
+ @return Operation status.
+ @retval 0 Success
+ @retval != 0 Error code
+*/
+
+int Partition_helper::ph_rnd_pos_by_record(uchar *record)
+{
+ DBUG_ENTER("Partition_helper::ph_rnd_pos_by_record");
+
+ DBUG_ASSERT(m_handler->ha_table_flags() &
+ HA_PRIMARY_KEY_REQUIRED_FOR_POSITION);
+ /* TODO: Support HA_READ_BEFORE_WRITE_REMOVAL */
+ /* Set m_last_part correctly. */
+ /* Derive the partition from the record's partitioning fields; any
+ failure here means the row maps to no partition. */
+ if (unlikely(get_part_for_delete(record,
+ m_table->record[0],
+ m_part_info,
+ &m_last_part)))
+ DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
+
+ DBUG_RETURN(rnd_pos_by_record_in_last_part(record));
+}
+
+
+/****************************************************************************
+ MODULE index scan
+****************************************************************************/
+/*
+ Positions an index cursor to the index specified in the handle. Fetches the
+ row if available. If the key value is null, begin at the first key of the
+ index.
+
+ There are loads of optimizations possible here for the partition handler.
+ The same optimizations can also be checked for full table scan although
+ only through conditions and not from index ranges.
+ Phase one optimizations:
+ Check if the fields of the partition function are bound. If so only use
+ the single partition it becomes bound to.
+ Phase two optimizations:
+ If it can be deducted through range or list partitioning that only a
+ subset of the partitions are used, then only use those partitions.
+*/
+
+/**
+ Setup the ordered record buffer and the priority queue.
+
+ Call destroy_record_priority_queue() to deallocate or clean-up
+ from failure.
+
+ @return false on success, else true.
+*/
+
+int Partition_helper::init_record_priority_queue()
+{
+ uint used_parts= m_part_info->num_partitions_used();
+ DBUG_ENTER("Partition_helper::init_record_priority_queue");
+ DBUG_ASSERT(!m_ordered_rec_buffer);
+ DBUG_ASSERT(!m_queue);
+ /* Initialize the priority queue. */
+ // TODO: Create test to see the cost of allocating when needed vs
+ // allocate once and keep between statements. Also test on NUMA
+ // machines to see the difference (I guess that allocating when needed
+ // will allocate on 'correct' NUMA node and be faster.)
+ if (!m_queue)
+ {
+ m_queue= new (std::nothrow) Prio_queue(Key_rec_less(m_curr_key_info));
+ if (!m_queue)
+ {
+ DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+ }
+ }
+ /* Initialize the ordered record buffer. */
+ if (!m_ordered_rec_buffer)
+ {
+ uint alloc_len;
+ /*
+ Allocate record buffer for each used partition.
+ If PK is clustered index, it is either the primary sort key or is
+ added as secondary sort. So we only need to allocate for part id
+ and a full record per partition.
+ Otherwise if the clustered index was generated, we might need to
+ do a secondary sort by rowid (handler::ref) and must allocate for
+ ref (includes part id) and full record per partition. We don't
+ know yet if we need to do secondary sort by rowid, so we must
+ allocate space for it.
+ TODO: enhance ha_index_init() for HA_EXTRA_SECONDARY_SORT_ROWID to
+ avoid allocating space for handler::ref when not needed.
+ When enhancing ha_index_init() care must be taken on ph_position(),
+ so InnoDB's row_id is correctly handled (taken from m_last_part).
+ */
+ if (m_pkey_is_clustered && m_table->s->primary_key != MAX_KEY)
+ {
+ m_rec_offset= PARTITION_BYTES_IN_POS;
+ m_ref_usage= REF_NOT_USED;
+ }
+ else
+ {
+ m_rec_offset= m_handler->ref_length;
+ m_ref_usage= REF_STORED_IN_PQ;
+ }
+ alloc_len= used_parts * (m_rec_offset + m_rec_length);
+ /* Allocate a key for temporary use when setting up the scan. */
+ alloc_len+= m_table->s->max_key_length;
+
+ m_ordered_rec_buffer= static_cast<uchar*>(
+ my_malloc(alloc_len,
+ MYF(MY_WME)));
+ if (!m_ordered_rec_buffer)
+ {
+ DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+ }
+
+ /*
+ We set-up one record per partition and each record has 2 bytes in
+ front where the partition id is written. This is used by ordered
+ index_read.
+ If we need to also sort by rowid (handler::ref), then m_curr_key_info[1]
+ is NULL and we add the rowid before the record.
+ We also set-up a reference to the first record for temporary use in
+ setting up the scan.
+ */
+ char *ptr= (char*) m_ordered_rec_buffer;
+ uint i;
+ for (i= m_part_info->get_first_used_partition();
+ i < MY_BIT_NONE;
+ i= m_part_info->get_next_used_partition(i))
+ {
+ DBUG_PRINT("info", ("init rec-buf for part %u", i));
+ int2store(ptr, i);
+ ptr+= m_rec_offset + m_rec_length;
+ }
+ /* ptr now points at the trailing max_key_length area reserved above;
+ it serves as the temporary start-key buffer for scan set-up. */
+ m_start_key.key= (const uchar*)ptr;
+ /*
+ Initialize priority queue, initialized to reading forward.
+ Start by only sort by KEY, HA_EXTRA_SECONDARY_SORT_ROWID
+ will be given if we should sort by handler::ref too.
+ */
+ m_queue->m_rec_offset= m_rec_offset;
+ if (m_queue->reserve(used_parts))
+ {
+ DBUG_RETURN(HA_ERR_OUT_OF_MEM);
+ }
+ }
+ DBUG_RETURN(init_record_priority_queue_for_parts(used_parts));
+}
+
+
+/**
+ Destroy the ordered record buffer and the priority queue.
+*/
+
+void Partition_helper::destroy_record_priority_queue()
+{
+  DBUG_ENTER("Partition_helper::destroy_record_priority_queue");
+  /* Release the priority queue used for ordered index scans. */
+  if (m_queue != NULL)
+  {
+    m_queue->clear();
+    delete m_queue;
+    m_queue= NULL;
+  }
+  /* Release the per-partition ordered record buffer. */
+  if (m_ordered_rec_buffer != NULL)
+  {
+    my_free(m_ordered_rec_buffer);
+    m_ordered_rec_buffer= NULL;
+  }
+  /* Reset ordered-scan state so a later init starts from scratch. */
+  m_ref_usage= REF_NOT_USED;
+  m_ordered_scan_ongoing= false;
+  DBUG_VOID_RETURN;
+}
+
+
+/**
+ Common setup for index_init.
+
+ Set up variables and initialize the record priority queue.
+
+ @param inx Index to be used.
+ @param sorted True if the rows must be returned in index order.
+
+ @return Operation status.
+ @retval 0 Success
+ @retval != 0 Error code
+*/
+
+int Partition_helper::ph_index_init_setup(uint inx, bool sorted)
+{
+  /* Fixed malformed trace tag ("Partition_helper:ph_:index_init_setup"). */
+  DBUG_ENTER("Partition_helper::ph_index_init_setup");
+
+  DBUG_ASSERT(inx != MAX_KEY);
+  DBUG_PRINT("info", ("inx %u sorted %u", inx, sorted));
+  /* Reset scan state for the new index scan. */
+  m_part_spec.start_part= NO_CURRENT_PART_ID;
+  m_start_key.length= 0;
+  m_ordered= sorted;
+  m_ref_usage= REF_NOT_USED;
+  m_curr_key_info[0]= m_table->key_info+inx;
+  m_curr_key_info[1]= NULL;
+  /*
+    There are two cases where it is not enough to only sort on the key:
+    1) For clustered indexes, the optimizer assumes that all keys
+       have the rest of the PK columns appended to the KEY, so it will
+       sort by PK as secondary sort key.
+    2) Rowid-Order-Retrieval access methods, like index_merge_intersect
+       and index_merge_union. These methods requires the index to be sorted
+       on rowid (handler::ref) as secondary sort key.
+  */
+  if (m_pkey_is_clustered && m_table->s->primary_key != MAX_KEY &&
+      inx != m_table->s->primary_key)
+  {
+    /*
+      if PK is clustered, then the key cmp must use the pk to
+      differentiate between equal key in given index.
+    */
+    DBUG_PRINT("info", ("Clustered pk, using pk as secondary cmp"));
+    m_curr_key_info[1]= m_table->key_info+m_table->s->primary_key;
+  }
+
+  /*
+    Some handlers only read fields as specified by the bitmap for the
+    read set. For partitioned handlers we always require that the
+    fields of the partition functions are read such that we can
+    calculate the partition id to place updated and deleted records.
+  */
+  if (m_handler->get_lock_type() == F_WRLCK)
+    bitmap_union(m_table->read_set, &m_part_info->full_part_field_set);
+
+  DBUG_RETURN(0);
+}
+
+
+/**
+ Initialize handler before start of index scan.
+
+ index_init is always called before starting index scans (except when
+ starting through index_read_idx and using read_range variants).
+
+ @param inx Index number.
+ @param sorted Is rows to be returned in sorted order.
+
+ @return Operation status
+ @retval 0 Success
+ @retval != 0 Error code
+*/
+
+int Partition_helper::ph_index_init(uint inx, bool sorted)
+{
+ int error;
+ uint part_id= m_part_info->get_first_used_partition();
+ DBUG_ENTER("Partition_helper::ph_index_init");
+ m_handler->active_index= inx;
+
+ /* Nothing to initialize when no partition is used. */
+ if (part_id == MY_BIT_NONE)
+ {
+ DBUG_RETURN(0);
+ }
+
+ if ((error= ph_index_init_setup(inx, sorted)))
+ {
+ DBUG_RETURN(error);
+ }
+ if ((error= init_record_priority_queue()))
+ {
+ destroy_record_priority_queue();
+ DBUG_RETURN(error);
+ }
+
+ for (/* part_id already set. */;
+ part_id < MY_BIT_NONE;
+ part_id= m_part_info->get_next_used_partition(part_id))
+ {
+ if ((error= index_init_in_part(part_id, inx, sorted)))
+ goto err;
+
+ DBUG_EXECUTE_IF("partition_fail_index_init", {
+ part_id++;
+ error= HA_ERR_NO_PARTITION_FOUND;
+ goto err;
+ });
+ }
+ /* On success we fall through with error == 0; cleanup only runs on failure. */
+err:
+ if (error)
+ {
+ /* End the previously initialized indexes. */
+ uint j;
+ for (j= m_part_info->get_first_used_partition();
+ j < part_id;
+ j= m_part_info->get_next_used_partition(j))
+ {
+ (void) index_end_in_part(j);
+ }
+ destroy_record_priority_queue();
+ }
+ DBUG_RETURN(error);
+}
+
+
+/**
+ End of index scan.
+
+ index_end is called at the end of an index scan to clean up any
+ things needed to clean up.
+
+ @return Operation status.
+ @retval 0 Success
+ @retval != 0 Error code
+*/
+
+int Partition_helper::ph_index_end()
+{
+  int last_error= 0;
+  DBUG_ENTER("Partition_helper::ph_index_end");
+
+  m_part_spec.start_part= NO_CURRENT_PART_ID;
+  m_ref_usage= REF_NOT_USED;
+  /*
+    End the index scan in every used partition; remember a failure but
+    keep closing the remaining partitions anyway.
+  */
+  for (uint part= m_part_info->get_first_used_partition();
+       part < MY_BIT_NONE;
+       part= m_part_info->get_next_used_partition(part))
+  {
+    const int part_error= index_end_in_part(part);
+    if (part_error)
+      last_error= part_error;
+  }
+  destroy_record_priority_queue();
+  m_handler->active_index= MAX_KEY;
+  DBUG_RETURN(last_error);
+}
+
+
+/**
+ Read one record in an index scan and start an index scan.
+
+ index_read_map starts a new index scan using a start key. The MySQL Server
+ will check the end key on its own. Thus to function properly the
+ partitioned handler need to ensure that it delivers records in the sort
+ order of the MySQL Server.
+ index_read_map can be restarted without calling index_end on the previous
+ index scan and without calling index_init. In this case the index_read_map
+ is on the same index as the previous index_scan. This is particularly
+ used in conjunction with multi read ranges.
+
+ @param[out] buf Read row in MySQL Row Format
+ @param[in] key Key parts in consecutive order
+ @param[in] keypart_map Which part of key is used
+ @param[in] find_flag What type of key condition is used
+
+ @return Operation status.
+ @retval 0 Success
+ @retval != 0 Error code
+*/
+
+int Partition_helper::ph_index_read_map(uchar *buf,
+                                        const uchar *key,
+                                        key_part_map keypart_map,
+                                        enum ha_rkey_function find_flag)
+{
+  /* Fixed trace tag: it wrongly said Partition_handler:: (wrong class). */
+  DBUG_ENTER("Partition_helper::ph_index_read_map");
+  /* A fresh key lookup invalidates any previously set range end. */
+  m_handler->end_range= NULL;
+  m_index_scan_type= PARTITION_INDEX_READ;
+  /* Stash the search key; common_index_read() computes its length. */
+  m_start_key.key= key;
+  m_start_key.keypart_map= keypart_map;
+  m_start_key.flag= find_flag;
+  DBUG_RETURN(common_index_read(buf, true));
+}
+
+
+/**
+ Common routine for a number of index_read variants.
+
+ @param[out] buf Buffer where the record should be returned.
+ @param[in] have_start_key TRUE <=> the left endpoint is available, i.e.
+ we're in index_read call or in read_range_first
+ call and the range has left endpoint.
+ FALSE <=> there is no left endpoint (we're in
+ read_range_first() call and the range has no left
+ endpoint).
+
+ @return Operation status
+ @retval 0 OK
+ @retval HA_ERR_END_OF_FILE Whole index scanned, without finding the record.
+ @retval HA_ERR_KEY_NOT_FOUND Record not found, but index cursor positioned.
+ @retval other Error code.
+
+ @details
+ Start scanning the range (when invoked from read_range_first()) or doing
+ an index lookup (when invoked from index_read_XXX):
+ - If possible, perform partition selection
+ - Find the set of partitions we're going to use
+ - Depending on whether we need ordering:
+ NO: Get the first record from first used partition (see
+ handle_unordered_scan_next_partition)
+ YES: Fill the priority queue and get the record that is the first in
+ the ordering
+*/
+
+int Partition_helper::common_index_read(uchar *buf, bool have_start_key)
+{
+ int error;
+ m_reverse_order= false;
+ DBUG_ENTER("Partition_helper::common_index_read");
+
+ DBUG_PRINT("info", ("m_ordered %u m_ordered_scan_ong %u",
+ m_ordered, m_ordered_scan_ongoing));
+
+ if (have_start_key)
+ {
+ /* Compute the used key length from the keypart map. */
+ m_start_key.length= calculate_key_len(m_table,
+ m_handler->active_index,
+ NULL,
+ m_start_key.keypart_map);
+ DBUG_PRINT("info", ("have_start_key map %lu find_flag %u len %u",
+ m_start_key.keypart_map, m_start_key.flag,
+ m_start_key.length));
+ DBUG_ASSERT(m_start_key.length);
+ }
+ /* Prune and select the set of partitions to scan. */
+ if ((error= partition_scan_set_up(buf, have_start_key)))
+ {
+ DBUG_RETURN(error);
+ }
+
+ /* Backwards-searching flags force an ordered (and reversed) merge scan. */
+ if (have_start_key &&
+ (m_start_key.flag == HA_READ_KEY_OR_PREV ||
+ m_start_key.flag == HA_READ_PREFIX_LAST ||
+ m_start_key.flag == HA_READ_PREFIX_LAST_OR_PREV ||
+ m_start_key.flag == HA_READ_BEFORE_KEY))
+ {
+ m_reverse_order= true;
+ m_ordered_scan_ongoing= true;
+ }
+ DBUG_PRINT("info", ("m_ordered %u m_o_scan_ong %u have_start_key %u",
+ m_ordered, m_ordered_scan_ongoing, have_start_key));
+ if (!m_ordered_scan_ongoing)
+ {
+ /*
+ We use unordered index scan when read_range is used and flag
+ is set to not use ordered.
+ We also use an unordered index scan when the number of partitions to
+ scan is only one.
+ The unordered index scan will use the partition set created.
+ */
+ DBUG_PRINT("info", ("doing unordered scan"));
+ error= handle_unordered_scan_next_partition(buf);
+ }
+ else
+ {
+ /*
+ In all other cases we will use the ordered index scan. This will use
+ the partition set created by the get_partition_set method.
+ */
+ error= handle_ordered_index_scan(buf);
+ }
+ DBUG_RETURN(error);
+}
+
+
+/**
+ Start an index scan from leftmost record and return first record.
+
+ index_first() asks for the first key in the index.
+ This is similar to index_read except that there is no start key since
+ the scan starts from the leftmost entry and proceeds forward with
+ index_next.
+
+ @param[out] buf Read row in MySQL Row Format.
+
+ @return Operation status.
+ @retval 0 Success
+ @retval != 0 Error code
+*/
+
+int Partition_helper::ph_index_first(uchar *buf)
+{
+  DBUG_ENTER("Partition_helper::ph_index_first");
+
+  /* Full forward scan from the leftmost entry: no range end, no reverse. */
+  m_handler->end_range= NULL;
+  m_reverse_order= false;
+  m_index_scan_type= PARTITION_INDEX_FIRST;
+  DBUG_RETURN(common_first_last(buf));
+}
+
+
+/**
+ Start an index scan from rightmost record and return first record.
+
+ index_last() asks for the last key in the index.
+ This is similar to index_read except that there is no start key since
+ the scan starts from the rightmost entry and proceeds forward with
+ index_prev.
+
+ @param[out] buf Read row in MySQL Row Format.
+
+ @return Operation status.
+ @retval 0 Success
+ @retval != 0 Error code
+*/
+
+int Partition_helper::ph_index_last(uchar *buf)
+{
+  DBUG_ENTER("Partition_helper::ph_index_last");
+
+  /* Start at the rightmost entry; rows are merged in reverse index order. */
+  m_reverse_order= true;
+  m_index_scan_type= PARTITION_INDEX_LAST;
+  DBUG_RETURN(common_first_last(buf));
+}
+
+
+/**
+ Common routine for index_first/index_last.
+
+ @param[out] buf Read row in MySQL Row Format.
+
+ @return Operation status.
+ @retval 0 Success
+ @retval != 0 Error code
+*/
+
+int Partition_helper::common_first_last(uchar *buf)
+{
+  int setup_error;
+  DBUG_ENTER("Partition_helper::common_first_last");
+
+  if ((setup_error= partition_scan_set_up(buf, false)))
+    DBUG_RETURN(setup_error);
+  /*
+    index_last must always go through the ordered (merge) path; the
+    unordered path only walks partitions front-to-back with index_first.
+  */
+  if (m_ordered_scan_ongoing ||
+      m_index_scan_type == PARTITION_INDEX_LAST)
+  {
+    DBUG_RETURN(handle_ordered_index_scan(buf));
+  }
+  DBUG_RETURN(handle_unordered_scan_next_partition(buf));
+}
+
+
+/**
+ Read last using key.
+
+ This is used in join_read_last_key to optimize away an ORDER BY.
+ Can only be used on indexes supporting HA_READ_ORDER.
+
+ @param[out] buf Read row in MySQL Row Format
+ @param[in] key Key
+ @param[in] keypart_map Which part of key is used
+
+ @return Operation status.
+ @retval 0 Success
+ @retval != 0 Error code
+*/
+
+int Partition_helper::ph_index_read_last_map(uchar *buf,
+                                             const uchar *key,
+                                             key_part_map keypart_map)
+{
+  DBUG_ENTER("Partition_helper::ph_index_read_last_map");
+
+  /* Ordering is the whole point of this call, so force the merge path. */
+  m_ordered= true; // Safety measure
+  m_index_scan_type= PARTITION_INDEX_READ_LAST;
+  m_handler->end_range= NULL;
+  /* Position on the last row matching the given key prefix. */
+  m_start_key.flag= HA_READ_PREFIX_LAST;
+  m_start_key.keypart_map= keypart_map;
+  m_start_key.key= key;
+  DBUG_RETURN(common_index_read(buf, true));
+}
+
+
+/**
+ Read index by key and keymap.
+
+ Positions an index cursor to the index specified.
+ Fetches the row if available. If the key value is null,
+ begin at first key of the index.
+
+ Optimization of the default implementation to take advantage of dynamic
+ partition pruning.
+
+ @param[out] buf Read row in MySQL Row Format
+ @param[in] index Index to read from
+ @param[in] key Key
+ @param[in] keypart_map Which part of key is used
+ @param[in] find_flag Direction/how to search.
+
+ @return Operation status.
+ @retval 0 Success
+ @retval != 0 Error code
+*/
+int Partition_helper::ph_index_read_idx_map(uchar *buf,
+                                            uint index,
+                                            const uchar *key,
+                                            key_part_map keypart_map,
+                                            enum ha_rkey_function find_flag)
+{
+  int error= HA_ERR_KEY_NOT_FOUND;
+  DBUG_ENTER("Partition_helper::ph_index_read_idx_map");
+
+  if (find_flag == HA_READ_KEY_EXACT)
+  {
+    uint part;
+    m_start_key.key= key;
+    m_start_key.keypart_map= keypart_map;
+    m_start_key.flag= find_flag;
+    m_start_key.length= calculate_key_len(m_table,
+                                          index,
+                                          NULL,
+                                          m_start_key.keypart_map);
+
+    /* Prune the scan down to the partitions that can hold this exact key. */
+    get_partition_set(m_table, buf, index, &m_start_key, &m_part_spec);
+
+    /*
+      We have either found exactly 1 partition
+      (in which case start_part == end_part)
+      or no matching partitions (start_part > end_part)
+    */
+    DBUG_ASSERT(m_part_spec.start_part >= m_part_spec.end_part);
+    /* The start part is must be marked as used. */
+    DBUG_ASSERT(m_part_spec.start_part > m_part_spec.end_part ||
+                m_part_info->is_partition_used(m_part_spec.start_part));
+
+    /* Stop on the first row found, or on any error other than "no row". */
+    for (part= m_part_spec.start_part;
+         part <= m_part_spec.end_part;
+         part= m_part_info->get_next_used_partition(part))
+    {
+      error= index_read_idx_map_in_part(part,
+                                        buf,
+                                        index,
+                                        key,
+                                        keypart_map,
+                                        find_flag);
+      if (error != HA_ERR_KEY_NOT_FOUND &&
+          error != HA_ERR_END_OF_FILE)
+      {
+        break;
+      }
+    }
+    /* Loop left early: remember which partition the row/error came from. */
+    if (part <= m_part_spec.end_part)
+    {
+      m_last_part= part;
+    }
+  }
+  else
+  {
+    /*
+      If not only used with HA_READ_KEY_EXACT, we should investigate if
+      possible to optimize for other find_flag's as well.
+    */
+    DBUG_ASSERT(0);
+    error= HA_ERR_INTERNAL_ERROR;
+  }
+  DBUG_RETURN(error);
+}
+
+
+/**
+ Read next record in a forward index scan.
+
+ Used to read forward through the index (left to right, low to high).
+
+ @param[out] buf Read row in MySQL Row Format.
+
+ @return Operation status.
+ @retval 0 Success
+ @retval != 0 Error code
+*/
+
+int Partition_helper::ph_index_next(uchar *buf)
+{
+  DBUG_ENTER("Partition_helper::ph_index_next");
+
+  /*
+    TODO(low priority):
+    If we want partition to work with the HANDLER commands, we
+    must be able to do index_last() -> index_prev() -> index_next()
+    and if direction changes, we must step back those partitions in
+    the record queue so we don't return a value from the wrong direction.
+  */
+  DBUG_ASSERT(m_index_scan_type != PARTITION_INDEX_LAST ||
+              m_table->open_by_handler);
+  /* Dispatch to the merge-ordered or plain per-partition continuation. */
+  if (m_ordered_scan_ongoing)
+    DBUG_RETURN(handle_ordered_next(buf, false));
+  DBUG_RETURN(handle_unordered_next(buf, false));
+}
+
+
+/**
+ Read next same record.
+
+ This routine is used to read the next but only if the key is the same
+ as supplied in the call.
+
+ @param[out] buf Read row in MySQL Row Format.
+ @param[in] key Key.
+ @param[in] keylen Length of key.
+
+ @return Operation status.
+ @retval 0 Success
+ @retval != 0 Error code
+*/
+
+int Partition_helper::ph_index_next_same(uchar *buf, const uchar *key, uint keylen)
+{
+  DBUG_ENTER("Partition_helper::ph_index_next_same");
+
+  /* The key must be the one the scan was positioned with. */
+  DBUG_ASSERT(keylen == m_start_key.length);
+  DBUG_ASSERT(m_index_scan_type != PARTITION_INDEX_LAST);
+  if (m_ordered_scan_ongoing)
+    DBUG_RETURN(handle_ordered_next(buf, true));
+  DBUG_RETURN(handle_unordered_next(buf, true));
+}
+
+
+/**
+ Read next record when performing index scan backwards.
+
+ Used to read backwards through the index (right to left, high to low).
+
+ @param[out] buf Read row in MySQL Row Format.
+
+ @return Operation status.
+ @retval 0 Success
+ @retval != 0 Error code
+*/
+
+int Partition_helper::ph_index_prev(uchar *buf)
+{
+  DBUG_ENTER("Partition_helper::ph_index_prev");
+
+  /* TODO: read comment in index_next */
+  /*
+    Changing direction after a forward-positioned scan is only tolerated
+    for SQL HANDLER; ordinary scans must keep a single direction.
+  */
+  DBUG_ASSERT(m_index_scan_type != PARTITION_INDEX_FIRST ||
+              m_table->open_by_handler);
+  /* Backward reads always use the ordered (merge) path. */
+  DBUG_RETURN(handle_ordered_prev(buf));
+}
+
+
+/**
+ Start a read of one range with start and end key.
+
+ We re-implement read_range_first since we don't want the compare_key
+ check at the end. This is already performed in the partition handler.
+ read_range_next is very much different due to that we need to scan
+ all underlying handlers.
+
+ @param start_key Specification of start key.
+ @param end_key Specification of end key.
+ @param eq_range_arg Is it equal range.
+ @param sorted Should records be returned in sorted order.
+
+ @return Operation status.
+ @retval 0 Success
+ @retval != 0 Error code
+*/
+
+int Partition_helper::ph_read_range_first(const key_range *start_key,
+                                          const key_range *end_key,
+                                          bool eq_range_arg,
+                                          bool sorted)
+{
+  const bool with_start_key= (start_key != NULL);
+  DBUG_ENTER("Partition_helper::ph_read_range_first");
+
+  /* Bail out early if pruning left no partition to read from. */
+  if (m_part_info->get_first_used_partition() == MY_BIT_NONE)
+  {
+    m_table->status= STATUS_NOT_FOUND;
+    DBUG_RETURN(HA_ERR_END_OF_FILE);
+  }
+
+  /* Record the range parameters for the upcoming scan. */
+  m_ordered= sorted;
+  set_eq_range(eq_range_arg);
+  m_handler->set_end_range(end_key);
+  set_range_key_part(m_curr_key_info[0]->key_part);
+
+  if (with_start_key)
+    m_start_key= *start_key;
+  else
+    m_start_key.key= NULL;
+
+  m_index_scan_type= PARTITION_READ_RANGE;
+  DBUG_RETURN(common_index_read(m_table->record[0], with_start_key));
+}
+
+
+/**
+ Read next record in read of a range with start and end key.
+
+ @return Operation status.
+ @retval 0 Success
+ @retval != 0 Error code
+*/
+
+int Partition_helper::ph_read_range_next()
+{
+  DBUG_ENTER("Partition_helper::ph_read_range_next");
+
+  /* Range scans always deliver rows into table->record[0]. */
+  if (!m_ordered_scan_ongoing)
+  {
+    DBUG_RETURN(handle_unordered_next(m_table->record[0], get_eq_range()));
+  }
+  DBUG_RETURN(handle_ordered_next(m_table->record[0], get_eq_range()));
+}
+
+
+/**
+ Common routine to set up index scans.
+
+ Find out which partitions we'll need to read when scanning the specified
+ range.
+
+ If we need to scan only one partition, set m_ordered_scan_ongoing=FALSE
+ as we will not need to do merge ordering.
+
+ @param buf Buffer to later return record in (this function
+ needs it to calculate partitioning function values)
+
+ @param idx_read_flag TRUE <=> m_start_key has range start endpoint which
+ probably can be used to determine the set of
+ partitions to scan.
+ FALSE <=> there is no start endpoint.
+
+ @return Operation status.
+ @retval 0 Success
+ @retval !=0 Error code
+*/
+
+int Partition_helper::partition_scan_set_up(uchar * buf, bool idx_read_flag)
+{
+  DBUG_ENTER("Partition_helper::partition_scan_set_up");
+
+  if (idx_read_flag)
+    get_partition_set(m_table,
+                      buf,
+                      m_handler->active_index,
+                      &m_start_key,
+                      &m_part_spec);
+  else
+  {
+    // TODO: set to get_first_used_part() instead!
+    m_part_spec.start_part= 0;
+    // TODO: Implement bitmap_get_last_set() and use that here!
+    m_part_spec.end_part= m_tot_parts - 1;
+  }
+  if (m_part_spec.start_part > m_part_spec.end_part)
+  {
+    /*
+      We discovered a partition set but the set was empty so we report
+      key not found.
+    */
+    DBUG_PRINT("info", ("scan with no partition to scan"));
+    m_table->status= STATUS_NOT_FOUND;
+    DBUG_RETURN(HA_ERR_END_OF_FILE);
+  }
+  if (m_part_spec.start_part == m_part_spec.end_part)
+  {
+    /*
+      We discovered a single partition to scan, this never needs to be
+      performed using the ordered index scan.
+    */
+    /* Partition ids are unsigned; %u avoids a printf conversion mismatch. */
+    DBUG_PRINT("info", ("index scan using the single partition %u",
+                        m_part_spec.start_part));
+    m_ordered_scan_ongoing= FALSE;
+  }
+  else
+  {
+    /*
+      Set m_ordered_scan_ongoing according how the scan should be done
+      Only exact partitions are discovered atm by get_partition_set.
+      Verify this, also bitmap must have at least one bit set otherwise
+      the result from this table is the empty set.
+    */
+    uint start_part= m_part_info->get_first_used_partition();
+    if (start_part == MY_BIT_NONE)
+    {
+      DBUG_PRINT("info", ("scan with no partition to scan"));
+      m_table->status= STATUS_NOT_FOUND;
+      DBUG_RETURN(HA_ERR_END_OF_FILE);
+    }
+    /* Dynamic pruning may have moved the first used partition forward. */
+    if (start_part > m_part_spec.start_part)
+      m_part_spec.start_part= start_part;
+    m_ordered_scan_ongoing= m_ordered;
+  }
+  DBUG_ASSERT(m_part_spec.start_part < m_tot_parts);
+  DBUG_ASSERT(m_part_spec.end_part < m_tot_parts);
+  DBUG_RETURN(0);
+}
+
+
+/**
+  Common routine to handle index_next with unordered results.
+
+  These routines are used to scan partitions without considering order.
+  This is performed in two situations.
+  1) In read_multi_range this is the normal case
+  2) When performing any type of index_read, index_first, index_last where
+  all fields in the partition function is bound. In this case the index
+  scan is performed on only one partition and thus it isn't necessary to
+  perform any sort.
+
+  @param[out] buf           Read row in MySQL Row Format.
+  @param[in]  is_next_same  Called from index_next_same.
+
+  @return Operation status.
+  @retval HA_ERR_END_OF_FILE  End of scan
+  @retval 0                   Success
+  @retval other               Error code
+*/
+
+int Partition_helper::handle_unordered_next(uchar *buf, bool is_next_same)
+{
+  int error;
+  DBUG_ENTER("Partition_helper::handle_unordered_next");
+
+  if (m_part_spec.start_part >= m_tot_parts)
+  {
+    /* Should only happen with SQL HANDLER! */
+    DBUG_ASSERT(m_table->open_by_handler);
+    DBUG_RETURN(HA_ERR_END_OF_FILE);
+  }
+
+  /*
+    We should consider if this should be split into three functions as
+    partition_read_range and is_next_same are always local constants
+  */
+
+  /* Continue the scan within the current partition (start_part). */
+  if (m_index_scan_type == PARTITION_READ_RANGE)
+  {
+    DBUG_ASSERT(buf == m_table->record[0]);
+    error= read_range_next_in_part(m_part_spec.start_part, NULL);
+  }
+  else if (is_next_same)
+  {
+    error= index_next_same_in_part(m_part_spec.start_part,
+                                   buf,
+                                   m_start_key.key,
+                                   m_start_key.length);
+  }
+  else
+  {
+    error= index_next_in_part(m_part_spec.start_part, buf);
+  }
+
+  if (error == HA_ERR_END_OF_FILE)
+  {
+    /* Current partition exhausted; restart in the next used partition. */
+    m_part_spec.start_part++; // Start using next part
+    error= handle_unordered_scan_next_partition(buf);
+  }
+  else
+  {
+    m_last_part= m_part_spec.start_part;
+  }
+  DBUG_RETURN(error);
+}
+
+
+/**
+  Handle index_next when changing to new partition.
+
+  This routine is used to start the index scan on the next partition.
+  Both initial start and after completing scan on one partition.
+
+  @param[out] buf  Read row in MySQL Row Format
+
+  @return Operation status.
+  @retval HA_ERR_END_OF_FILE  End of scan
+  @retval 0                   Success
+  @retval other               Error code
+*/
+
+int Partition_helper::handle_unordered_scan_next_partition(uchar * buf)
+{
+  uint i= m_part_spec.start_part;
+  int saved_error= HA_ERR_END_OF_FILE;
+  DBUG_ENTER("Partition_helper::handle_unordered_scan_next_partition");
+
+  /* Align i to the first used (non-pruned) partition >= start_part. */
+  if (i)
+    i= m_part_info->get_next_used_partition(i - 1);
+  else
+    i= m_part_info->get_first_used_partition();
+
+  /*
+    Partition ids are unsigned, so all DBUG_PRINT conversions below use %u
+    (the previous %d was a printf conversion mismatch).
+  */
+  for (;
+       i <= m_part_spec.end_part;
+       i= m_part_info->get_next_used_partition(i))
+  {
+    int error;
+    m_part_spec.start_part= i;
+    switch (m_index_scan_type) {
+    case PARTITION_READ_RANGE:
+      DBUG_ASSERT(buf == m_table->record[0]);
+      DBUG_PRINT("info", ("read_range_first on partition %u", i));
+      error= read_range_first_in_part(i,
+                                      NULL,
+                                      m_start_key.key? &m_start_key: NULL,
+                                      m_handler->end_range,
+                                      get_eq_range(),
+                                      false);
+      break;
+    case PARTITION_INDEX_READ:
+      DBUG_PRINT("info", ("index_read on partition %u", i));
+      error= index_read_map_in_part(i,
+                                    buf,
+                                    m_start_key.key,
+                                    m_start_key.keypart_map,
+                                    m_start_key.flag);
+      break;
+    case PARTITION_INDEX_FIRST:
+      DBUG_PRINT("info", ("index_first on partition %u", i));
+      error= index_first_in_part(i, buf);
+      break;
+    case PARTITION_INDEX_FIRST_UNORDERED:
+      /* When is this ever used? */
+      DBUG_ASSERT(0);
+      /*
+        We perform a scan without sorting and this means that we
+        should not use the index_first since not all handlers
+        support it and it is also unnecessary to restrict sort
+        order.
+      */
+      DBUG_PRINT("info", ("read_range_first on partition %u", i));
+      DBUG_ASSERT(buf == m_table->record[0]);
+      error= read_range_first_in_part(i,
+                                      NULL,
+                                      0,
+                                      m_handler->end_range,
+                                      get_eq_range(),
+                                      0);
+      break;
+    default:
+      DBUG_ASSERT(0);
+      DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
+    }
+    if (!error)
+    {
+      m_last_part= i;
+      DBUG_RETURN(0);
+    }
+    if ((error != HA_ERR_END_OF_FILE) && (error != HA_ERR_KEY_NOT_FOUND))
+      DBUG_RETURN(error);
+
+    /*
+      If HA_ERR_KEY_NOT_FOUND, we must return that error instead of
+      HA_ERR_END_OF_FILE, to be able to continue search.
+    */
+    if (saved_error != HA_ERR_KEY_NOT_FOUND)
+      saved_error= error;
+    DBUG_PRINT("info", ("END_OF_FILE/KEY_NOT_FOUND on partition %u", i));
+  }
+  if (saved_error == HA_ERR_END_OF_FILE)
+    m_part_spec.start_part= NO_CURRENT_PART_ID;
+  DBUG_RETURN(saved_error);
+}
+
+
+/**
+ Common routine to start index scan with ordered results.
+
+ @param[out] buf Read row in MySQL Row Format
+
+ @return Operation status
+ @retval HA_ERR_END_OF_FILE End of scan
+ @retval HA_ERR_KEY_NOT_FOUND End of scan
+ @retval 0 Success
+ @retval other Error code
+
+ @details
+ This part contains the logic to handle index scans that require ordered
+ output. This includes all except those started by read_range_first with
+ the flag ordered set to FALSE. Thus most direct index_read and all
+ index_first and index_last.
+
+ We implement ordering by keeping one record plus a key buffer for each
+ partition. Every time a new entry is requested we will fetch a new
+ entry from the partition that is currently not filled with an entry.
+ Then the entry is put into its proper sort position.
+
+ Returning a record is done by getting the top record, copying the
+ record to the request buffer and setting the partition as empty on
+ entries.
+*/
+
+int Partition_helper::handle_ordered_index_scan(uchar *buf)
+{
+  uint i;
+  std::vector<uchar*> parts;
+  bool found= FALSE;
+  uchar *part_rec_buf_ptr= m_ordered_rec_buffer;
+  int saved_error= HA_ERR_END_OF_FILE;
+  DBUG_ENTER("Partition_helper::handle_ordered_index_scan");
+  DBUG_ASSERT(part_rec_buf_ptr);
+
+  /* Fresh scan: forget partitions that missed the key on a previous scan. */
+  if (m_key_not_found)
+  {
+    m_key_not_found= false;
+    bitmap_clear_all(&m_key_not_found_partitions);
+    DBUG_PRINT("info", ("Cleared m_key_not_found_partitions"));
+  }
+  m_top_entry= NO_CURRENT_PART_ID;
+  m_queue->clear();
+  parts.reserve(m_queue->capacity());
+  DBUG_ASSERT(m_part_info->is_partition_used(m_part_spec.start_part));
+
+  /*
+    Position part_rec_buf_ptr to point to the first used partition >=
+    start_part. There may be partitions marked by used_partitions,
+    but is before start_part. These partitions has allocated record buffers
+    but is dynamically pruned, so those buffers must be skipped.
+  */
+  for (i= m_part_info->get_first_used_partition();
+       i < m_part_spec.start_part;
+       i= m_part_info->get_next_used_partition(i))
+  {
+    /* Each slot is a fixed header (m_rec_offset bytes) plus the record. */
+    part_rec_buf_ptr+= m_rec_offset + m_rec_length;
+  }
+  DBUG_PRINT("info", ("m_part_spec.start_part %u first_used_part %u",
+                      m_part_spec.start_part, i));
+  for (/* continue from above */ ;
+       i <= m_part_spec.end_part;
+       i= m_part_info->get_next_used_partition(i))
+  {
+    DBUG_PRINT("info", ("reading from part %u (scan_type: %u inx: %u)",
+                        i, m_index_scan_type, m_handler->active_index));
+    /* The slot header starts with the 2-byte partition id. */
+    DBUG_ASSERT(i == uint2korr(part_rec_buf_ptr));
+    uchar *rec_buf_ptr= part_rec_buf_ptr + m_rec_offset;
+    uchar *read_buf;
+    int error;
+    DBUG_PRINT("info", ("part %u, scan_type %d", i, m_index_scan_type));
+
+    /* ICP relies on Item evaluation, which expects the row in record[0]. */
+    if (m_handler->pushed_idx_cond)
+      read_buf= m_table->record[0];
+    else
+      read_buf= rec_buf_ptr;
+
+    /* Fetch one candidate row from this partition per the scan type. */
+    switch (m_index_scan_type) {
+    case PARTITION_INDEX_READ:
+      error= index_read_map_in_part(i,
+                                    read_buf,
+                                    m_start_key.key,
+                                    m_start_key.keypart_map,
+                                    m_start_key.flag);
+      break;
+    case PARTITION_INDEX_FIRST:
+      error= index_first_in_part(i, read_buf);
+      break;
+    case PARTITION_INDEX_LAST:
+      error= index_last_in_part(i, read_buf);
+      break;
+    case PARTITION_INDEX_READ_LAST:
+      error= index_read_last_map_in_part(i,
+                                         read_buf,
+                                         m_start_key.key,
+                                         m_start_key.keypart_map);
+      break;
+    case PARTITION_READ_RANGE:
+    {
+      /*
+        To enable optimization in derived engines, we provide a read buffer
+        pointer if we want to read into something different than table->record[0]
+        (which read_range_* always uses).
+      */
+      error= read_range_first_in_part(i,
+                                      read_buf == m_table->record[0]
+                                      ? NULL : read_buf,
+                                      m_start_key.key ? &m_start_key : NULL,
+                                      m_handler->end_range,
+                                      get_eq_range(),
+                                      true);
+      break;
+    }
+    default:
+      DBUG_ASSERT(false);
+      DBUG_RETURN(HA_ERR_END_OF_FILE);
+    }
+    DBUG_PRINT("info", ("error %d from partition %u", error, i));
+    /* When using ICP, copy record[0] to the priority queue for sorting. */
+    if (m_handler->pushed_idx_cond)
+      memcpy(rec_buf_ptr, read_buf, m_rec_length);
+    if (!error)
+    {
+      found= true;
+      if (m_ref_usage != REF_NOT_USED)
+      {
+        /* position_in_last_part needs m_last_part set. */
+        m_last_part= i;
+        position_in_last_part(part_rec_buf_ptr + PARTITION_BYTES_IN_POS,
+                              rec_buf_ptr);
+      }
+      /*
+        Save for later insertion in queue;
+      */
+      parts.push_back(part_rec_buf_ptr);
+      DBUG_DUMP("row", read_buf, m_rec_length);
+    }
+    else if (error != HA_ERR_KEY_NOT_FOUND && error != HA_ERR_END_OF_FILE)
+    {
+      DBUG_RETURN(error);
+    }
+    else if (error == HA_ERR_KEY_NOT_FOUND)
+    {
+      /* Remember this partition so index_next/prev can revisit it later. */
+      DBUG_PRINT("info", ("HA_ERR_KEY_NOT_FOUND from partition %u", i));
+      bitmap_set_bit(&m_key_not_found_partitions, i);
+      m_key_not_found= true;
+      saved_error= error;
+    }
+    part_rec_buf_ptr+= m_rec_offset + m_rec_length;
+  }
+  if (found)
+  {
+    /*
+      We found at least one partition with data, now sort all entries and
+      after that read the first entry and copy it to the buffer to return in.
+    */
+    m_queue->m_max_at_top= m_reverse_order;
+    m_queue->m_keys= m_curr_key_info;
+    DBUG_ASSERT(m_queue->empty());
+    /*
+      If PK, we should not sort by rowid, since that is already done
+      through the KEY setup.
+    */
+    DBUG_ASSERT(!m_curr_key_info[1] || m_ref_usage == REF_NOT_USED);
+    m_queue->assign(parts);
+    return_top_record(buf);
+    m_table->status= 0;
+    DBUG_PRINT("info", ("Record returned from partition %d", m_top_entry));
+    DBUG_RETURN(0);
+  }
+  DBUG_RETURN(saved_error);
+}
+
+
+/**
+ Return the top record in sort order.
+
+ @param[out] buf Row returned in MySQL Row Format.
+*/
+
+void Partition_helper::return_top_record(uchar *buf)
+{
+  uchar *key_buffer= m_queue->top();
+  uchar *rec_buffer= key_buffer + m_rec_offset;
+  /* The queue slot starts with the 2-byte partition id. */
+  uint part_id= uint2korr(key_buffer);
+
+  copy_cached_row(buf, rec_buffer);
+  DBUG_PRINT("info", ("from part_id %u", part_id));
+  DBUG_DUMP("returned_row", buf, m_table->s->reclength);
+  m_top_entry= part_id;
+  m_last_part= part_id;
+}
+
+
+/**
+ Add index_next/prev results from partitions without exact match.
+
+ If there where any partitions that returned HA_ERR_KEY_NOT_FOUND when
+ ha_index_read_map was done, those partitions must be included in the
+ following index_next/prev call.
+*/
+
+int Partition_helper::handle_ordered_index_scan_key_not_found()
+{
+  int error;
+  uint i;
+  size_t old_elements= m_queue->size();
+  uchar *part_buf= m_ordered_rec_buffer;
+  uchar *curr_rec_buf= NULL;
+  DBUG_ENTER("Partition_helper::handle_ordered_index_scan_key_not_found");
+  DBUG_ASSERT(m_key_not_found);
+  DBUG_ASSERT(part_buf);
+  /*
+    Loop over all used partitions to get the correct offset
+    into m_ordered_rec_buffer.
+  */
+  for (i= m_part_info->get_first_used_partition();
+       i < MY_BIT_NONE;
+       i= m_part_info->get_next_used_partition(i))
+  {
+    if (bitmap_is_set(&m_key_not_found_partitions, i))
+    {
+      /*
+        This partition is used and did return HA_ERR_KEY_NOT_FOUND
+        in index_read_map.
+      */
+      uchar *read_buf;
+      curr_rec_buf= part_buf + m_rec_offset;
+      /* ICP relies on Item evaluation, which expects the row in record[0]. */
+      if (m_handler->pushed_idx_cond)
+        read_buf= m_table->record[0];
+      else
+        read_buf= curr_rec_buf;
+
+      /* Step in the direction of the ongoing scan. */
+      if (m_reverse_order)
+        error= index_prev_in_part(i, read_buf);
+      else
+        error= index_next_in_part(i, read_buf);
+      /* HA_ERR_KEY_NOT_FOUND is not allowed from index_next! */
+      DBUG_ASSERT(error != HA_ERR_KEY_NOT_FOUND);
+      DBUG_PRINT("info", ("Filling from partition %u reverse %u error %d",
+                          i, m_reverse_order, error));
+      if (!error)
+      {
+        /* When using ICP, copy record[0] to the priority queue for sorting. */
+        if (m_handler->pushed_idx_cond)
+          memcpy(curr_rec_buf, read_buf, m_rec_length);
+        if (m_ref_usage != REF_NOT_USED)
+        {
+          /* position_in_last_part needs m_last_part set. */
+          m_last_part= i;
+          position_in_last_part(part_buf + PARTITION_BYTES_IN_POS,
+                                curr_rec_buf);
+        }
+        /* The partition now has a buffered row; merge it into the queue. */
+        m_queue->push(part_buf);
+      }
+      else if (error != HA_ERR_END_OF_FILE && error != HA_ERR_KEY_NOT_FOUND)
+        DBUG_RETURN(error);
+    }
+    /* Advance to the next partition's slot in the ordered buffer. */
+    part_buf+= m_rec_offset + m_rec_length;
+  }
+  DBUG_ASSERT(curr_rec_buf);
+  bitmap_clear_all(&m_key_not_found_partitions);
+  m_key_not_found= false;
+
+  if (m_queue->size() > old_elements)
+  {
+    /* Update m_top_entry, which may have changed. */
+    uchar *key_buffer= m_queue->top();
+    m_top_entry= uint2korr(key_buffer);
+  }
+  DBUG_RETURN(0);
+}
+
+
+/**
+  Common routine to handle index_next with ordered results.
+
+  @param[out] buf           Read row in MySQL Row Format.
+  @param[in]  is_next_same  Called from index_next_same.
+
+  @return Operation status.
+  @retval HA_ERR_END_OF_FILE  End of scan
+  @retval 0                   Success
+  @retval other               Error code
+*/
+
+int Partition_helper::handle_ordered_next(uchar *buf, bool is_next_same)
+{
+  int error;
+  uint part_id= m_top_entry;
+  uchar *rec_buf= m_queue->empty() ? NULL : m_queue->top() + m_rec_offset;
+  uchar *read_buf;
+  DBUG_ENTER("Partition_helper::handle_ordered_next");
+
+  if (m_reverse_order)
+  {
+    /*
+      TODO: To support change of direction (index_prev -> index_next,
+      index_read_map(HA_READ_KEY_EXACT) -> index_prev etc.)
+      We would need to:
+      - Step back all cursors we have a buffered row from a previous next/prev
+      call (i.e. for all partitions we previously called index_prev, we must
+      call index_next and skip that row.
+      - empty the priority queue and initialize it again with reverse ordering.
+    */
+    DBUG_ASSERT(m_table->open_by_handler);
+    DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+  }
+
+  if (m_key_not_found)
+  {
+    if (is_next_same)
+    {
+      /* Only rows which match the key. */
+      m_key_not_found= false;
+      bitmap_clear_all(&m_key_not_found_partitions);
+    }
+    else
+    {
+      /* There are partitions not included in the index record queue. */
+      size_t old_elements= m_queue->size();
+      if ((error= handle_ordered_index_scan_key_not_found()))
+        DBUG_RETURN(error);
+      /*
+        If the queue top changed, i.e. one of the partitions that gave
+        HA_ERR_KEY_NOT_FOUND in index_read_map found the next record,
+        return it.
+        Otherwise replace the old with a call to index_next (fall through).
+      */
+      if (old_elements != m_queue->size() && part_id != m_top_entry)
+      {
+        return_top_record(buf);
+        DBUG_PRINT("info", ("Returning row from part %u (prev KEY_NOT_FOUND)",
+                            m_top_entry));
+        DBUG_RETURN(0);
+      }
+    }
+  }
+  /* No current top entry (NO_CURRENT_PART_ID): the scan has ended. */
+  if (part_id >= m_tot_parts)
+    DBUG_RETURN(HA_ERR_END_OF_FILE);
+
+  DBUG_PRINT("info", ("next row from part %u (inx %u)",
+                      part_id, m_handler->active_index));
+
+  /* Assert that buffer for fetch is not NULL */
+  DBUG_ASSERT(rec_buf);
+
+  /* ICP relies on Item evaluation, which expects the row in record[0]. */
+  if (m_handler->pushed_idx_cond)
+    read_buf= m_table->record[0];
+  else
+    read_buf= rec_buf;
+
+
+  /* Refill the top partition's slot with its next row. */
+  if (m_index_scan_type == PARTITION_READ_RANGE)
+  {
+    error= read_range_next_in_part(part_id,
+                                   read_buf == m_table->record[0]
+                                   ? NULL : read_buf);
+  }
+  else if (!is_next_same)
+    error= index_next_in_part(part_id, read_buf);
+  else
+    error= index_next_same_in_part(part_id,
+                                   read_buf,
+                                   m_start_key.key,
+                                   m_start_key.length);
+  if (error)
+  {
+    if (error == HA_ERR_END_OF_FILE)
+    {
+      /* Return next buffered row */
+      if (!m_queue->empty())
+        m_queue->pop();
+      if (m_queue->empty())
+      {
+        /*
+          If priority queue is empty, we have finished fetching rows from all
+          partitions. Reset the value of next partition to NONE. This would
+          imply HA_ERR_END_OF_FILE for all future calls.
+        */
+        m_top_entry= NO_CURRENT_PART_ID;
+      }
+      else
+      {
+        return_top_record(buf);
+        DBUG_PRINT("info", ("Record returned from partition %u (2)",
+                            m_top_entry));
+        m_table->status= 0;
+        error= 0;
+      }
+    }
+    DBUG_RETURN(error);
+  }
+  /* When using ICP, copy record[0] to the priority queue for sorting. */
+  if (m_handler->pushed_idx_cond)
+    memcpy(rec_buf, read_buf, m_rec_length);
+  if (m_ref_usage != REF_NOT_USED)
+  {
+    /* position_in_last_part needs m_last_part set. */
+    m_last_part= part_id;
+    position_in_last_part(rec_buf - m_rec_offset + PARTITION_BYTES_IN_POS,
+                          rec_buf);
+  }
+  DBUG_DUMP("rec_buf", rec_buf, m_rec_length);
+  /* Re-sort the refilled top slot, then return the new smallest row. */
+  m_queue->update_top();
+  return_top_record(buf);
+  DBUG_PRINT("info", ("Record returned from partition %u", m_top_entry));
+  DBUG_RETURN(0);
+}
+
+
+/**
+ Common routine to handle index_prev with ordered results.
+
+ @param[out] buf Read row in MySQL Row Format.
+
+ @return Operation status.
+ @retval HA_ERR_END_OF_FILE End of scan
+ @retval 0 Success
+ @retval other Error code
+*/
+
+int Partition_helper::handle_ordered_prev(uchar *buf)
+{
+  int error;
+  uint part_id= m_top_entry;
+  uchar *rec_buf= m_queue->empty() ? NULL : m_queue->top() + m_rec_offset;
+  uchar *read_buf;
+  DBUG_ENTER("Partition_helper::handle_ordered_prev");
+
+  /* A direction change from a forward scan is only valid for SQL HANDLER. */
+  if (!m_reverse_order)
+  {
+    /* TODO: See comment in handle_ordered_next(). */
+    DBUG_ASSERT(m_table->open_by_handler);
+    DBUG_RETURN(HA_ERR_WRONG_COMMAND);
+  }
+
+  if (m_key_not_found)
+  {
+    /* There are partitions not included in the index record queue. */
+    size_t old_elements= m_queue->size();
+    if ((error= handle_ordered_index_scan_key_not_found()))
+      DBUG_RETURN(error);
+    if (old_elements != m_queue->size() && part_id != m_top_entry)
+    {
+      /*
+        Should only be possible for when HA_READ_KEY_EXACT was previously used,
+        which is not supported to have a subsequent call for PREV.
+        I.e. HA_READ_KEY_EXACT is considered to not have reverse order!
+      */
+      DBUG_ASSERT(0);
+      /*
+        If the queue top changed, i.e. one of the partitions that gave
+        HA_ERR_KEY_NOT_FOUND in index_read_map found the next record,
+        return it.
+        Otherwise replace the old with a call to index_next (fall through).
+      */
+      return_top_record(buf);
+      DBUG_RETURN(0);
+    }
+  }
+
+  if (part_id >= m_tot_parts)
+  {
+    /* This should never happen, except for SQL HANDLER calls! */
+    DBUG_ASSERT(m_table->open_by_handler);
+    DBUG_RETURN(HA_ERR_END_OF_FILE);
+  }
+
+  /* Assert that buffer for fetch is not NULL */
+  DBUG_ASSERT(rec_buf);
+
+  /* ICP relies on Item evaluation, which expects the row in record[0]. */
+  if (m_handler->pushed_idx_cond)
+    read_buf= m_table->record[0];
+  else
+    read_buf= rec_buf;
+
+  /* Refill the top partition's slot with its previous row. */
+  if ((error= index_prev_in_part(part_id, read_buf)))
+  {
+    if (error == HA_ERR_END_OF_FILE)
+    {
+      /* Partition exhausted: drop it from the queue, serve the new top. */
+      if (!m_queue->empty())
+        m_queue->pop();
+      if (m_queue->empty())
+      {
+        /*
+          If priority queue is empty, we have finished fetching rows from all
+          partitions. Reset the value of next partition to NONE. This would
+          imply HA_ERR_END_OF_FILE for all future calls.
+        */
+        m_top_entry= NO_CURRENT_PART_ID;
+      }
+      else
+      {
+        return_top_record(buf);
+        DBUG_PRINT("info", ("Record returned from partition %d (2)",
+                            m_top_entry));
+        error= 0;
+        m_table->status= 0;
+      }
+    }
+    DBUG_RETURN(error);
+  }
+  /* When using ICP, copy record[0] to the priority queue for sorting. */
+  if (m_handler->pushed_idx_cond)
+    memcpy(rec_buf, read_buf, m_rec_length);
+
+  if (m_ref_usage != REF_NOT_USED)
+  {
+    /* position_in_last_part needs m_last_part set. */
+    m_last_part= part_id;
+    position_in_last_part(rec_buf - m_rec_offset + PARTITION_BYTES_IN_POS,
+                          rec_buf);
+  }
+  /* Re-sort the refilled top slot, then return the new top row. */
+  m_queue->update_top();
+  return_top_record(buf);
+  DBUG_PRINT("info", ("Record returned from partition %d", m_top_entry));
+  DBUG_RETURN(0);
+}
+
+/**
+ Get statistics from a specific partition.
+
+ @param[out] stat_info Area to report values into.
+ @param[out] check_sum Check sum of partition.
+ @param[in] part_id Partition to report from.
+*/
+void
+Partition_helper::get_dynamic_partition_info_low(PARTITION_STATS *stat_info,
+                                                 ha_checksum *check_sum,
+                                                 uint part_id)
+{
+  ha_statistics *part_stat= &m_handler->stats;
+  DBUG_ASSERT(bitmap_is_set(&m_part_info->read_partitions, part_id));
+  /* On entry read_partitions and lock_partitions must be identical sets
+     (mutual subset), since read_partitions is restored from
+     lock_partitions at the end. */
+  DBUG_ASSERT(bitmap_is_subset(&m_part_info->read_partitions,
+                               &m_part_info->lock_partitions));
+  DBUG_ASSERT(bitmap_is_subset(&m_part_info->lock_partitions,
+                               &m_part_info->read_partitions));
+  /* Temporarily narrow read_partitions to part_id only, so that info()
+     collects statistics for just that partition. */
+  bitmap_clear_all(&m_part_info->read_partitions);
+  bitmap_set_bit(&m_part_info->read_partitions, part_id);
+  m_handler->info(HA_STATUS_TIME |
+                  HA_STATUS_VARIABLE |
+                  HA_STATUS_VARIABLE_EXTRA |
+                  HA_STATUS_NO_LOCK);
+  stat_info->records= part_stat->records;
+  stat_info->mean_rec_length= part_stat->mean_rec_length;
+  stat_info->data_file_length= part_stat->data_file_length;
+  stat_info->max_data_file_length= part_stat->max_data_file_length;
+  stat_info->index_file_length= part_stat->index_file_length;
+  stat_info->delete_length= part_stat->delete_length;
+  stat_info->create_time= part_stat->create_time;
+  stat_info->update_time= part_stat->update_time;
+  stat_info->check_time= part_stat->check_time;
+  /* Only compute the checksum when the engine supports the checksum
+     flavor selected by old_mode. */
+  if (get_thd()->variables.old_mode ?
+      m_handler->ha_table_flags() & HA_HAS_OLD_CHECKSUM :
+      m_handler->ha_table_flags() & HA_HAS_NEW_CHECKSUM)
+  {
+    *check_sum= checksum_in_part(part_id);
+  }
+  /* Restore read_partitions from lock_partitions (equal per the asserts). */
+  bitmap_copy(&m_part_info->read_partitions, &m_part_info->lock_partitions);
+}
+
+
+/**
+ Get checksum for table.
+
+ @return Checksum or 0 if not supported, which also may be a correct checksum!.
+*/
+
+ha_checksum Partition_helper::ph_checksum() const
+{
+  ha_checksum total= 0;
+  /* Pick the checksum capability flag matching the session's old_mode. */
+  const ulonglong required_flag= get_thd()->variables.old_mode ?
+                                 HA_HAS_OLD_CHECKSUM : HA_HAS_NEW_CHECKSUM;
+  if (m_handler->ha_table_flags() & required_flag)
+  {
+    /* Sum the per-partition checksums over every partition. */
+    for (uint part= 0; part < m_tot_parts; part++)
+      total+= checksum_in_part(part);
+  }
+  return total;
+}
diff --git a/sql/partitioning/partition_handler.h b/sql/partitioning/partition_handler.h
new file mode 100644
index 00000000000..cf4e1dcb24b
--- /dev/null
+++ b/sql/partitioning/partition_handler.h
@@ -0,0 +1,1113 @@
+#ifndef PARTITION_HANDLER_INCLUDED
+#define PARTITION_HANDLER_INCLUDED
+
+/*
+ Copyright (c) 2005, 2015, Oracle and/or its affiliates. All rights reserved.
+
+ This program is free software; you can redistribute it and/or
+ modify it under the terms of the GNU General Public License
+ as published by the Free Software Foundation; version 2 of
+ the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+*/
+
+#include "my_global.h" // uint etc.
+#include "my_base.h" // ha_rows.
+#include "handler.h" // Handler_share
+#include "sql_partition.h" // part_id_range
+#include "mysqld_error.h" // ER_ILLEGAL_HA
+#include "priority_queue.h"
+#include "key.h" // key_rec_cmp
+#include "ha_partition.h"
+#include <vector>
+
+#define PARTITION_BYTES_IN_POS 2
+
+/* forward declarations */
+typedef struct st_mem_root MEM_ROOT;
+
+static const uint NO_CURRENT_PART_ID= UINT_MAX32;
+
+/**
+ bits in Partition_handler::alter_flags():
+
+ HA_PARTITION_FUNCTION_SUPPORTED indicates that the function is
+ supported at all.
+ HA_FAST_CHANGE_PARTITION means that optimized variants of the changes
+ exists but they are not necessarily done online.
+
+ HA_ONLINE_DOUBLE_WRITE means that the handler supports writing to both
+ the new partition and to the old partitions when updating through the
+ old partitioning schema while performing a change of the partitioning.
+ This means that we can support updating of the table while performing
+ the copy phase of the change. For no lock at all also a double write
+ from new to old must exist and this is not required when this flag is
+ set.
+ This is actually removed even before it was introduced the first time.
+ The new idea is that handlers will handle the lock level already in
+ store_lock for ALTER TABLE partitions.
+ TODO: Implement this via the alter-inplace api.
+*/
+
+enum enum_part_operation {
+ OPTIMIZE_PARTS= 0,
+ ANALYZE_PARTS,
+ CHECK_PARTS,
+ REPAIR_PARTS,
+ ASSIGN_KEYCACHE_PARTS,
+ PRELOAD_KEYS_PARTS
+};
+
+
+/**
+ Initialize partitioning (currently only PSI keys).
+*/
+void partitioning_init();
+
+
+/**
+ Class for partitioning specific operations.
+
+ Returned from handler::get_partition_handler().
+*/
+class Partition_handler :public Sql_alloc
+{
+public:
+ Partition_handler() {}
+ ~Partition_handler() {}
+
+ bool init(uint num_parts);
+
+ /**
+ Get dynamic table information from partition.
+
+    @param[out] stat_info  Statistics struct to fill in.
+    @param[in]  part_id    Partition to report for.
+
+    @note stat_info is initialized by the caller. The check sum (see
+    get_dynamic_partition_info_low) is only expected to be updated
+    if HA_HAS_CHECKSUM is supported.
+ */
+ virtual void get_dynamic_partition_info(PARTITION_STATS *stat_info,
+ uint part_id) = 0;
+
+ /**
+ Get default number of partitions.
+
+ Used during creating a partitioned table.
+
+ @param info Create info.
+ @return Number of default partitions.
+ */
+ virtual int get_default_num_partitions(HA_CREATE_INFO *info) { return 1;}
+ /**
+ Setup auto partitioning.
+
+ Called for engines with HA_USE_AUTO_PARTITION to setup the partition info
+ object
+
+ @param[in,out] part_info Partition object to setup.
+ */
+ virtual void set_auto_partitions(partition_info *part_info) { return; }
+ /**
+ Get number of partitions for table in SE
+
+ @param name normalized path(same as open) to the table
+
+ @param[out] num_parts Number of partitions
+
+ @retval false for success
+ @retval true for failure, for example table didn't exist in engine
+ */
+ virtual bool get_num_parts(const char *name,
+ uint *num_parts)
+ {
+ *num_parts= 0;
+ return false;
+ }
+ /**
+ Set the partition info object to be used by the handler.
+
+    @param part_info  Partition info to be used by the handler. May be called
+    early, when part_info is only created and parsed but not yet setup,
+    checked or fixed.
+ */
+ virtual void set_part_info(partition_info *part_info) = 0;
+ /**
+ Initialize partition.
+
+ @param mem_root Memory root for memory allocations.
+
+ @return Operation status
+ @retval false Success.
+ @retval true Failure.
+ */
+ virtual bool initialize_partition(MEM_ROOT *mem_root) {return false;}
+
+
+ /**
+ Alter flags.
+
+ Given a set of alter table flags, return which is supported.
+
+ @param flags Alter table operation flags.
+
+ @return Supported alter table flags.
+ */
+ virtual uint alter_flags(uint flags) const
+ { return 0; }
+
+private:
+ /**
+ Truncate partition.
+
+ Low-level primitive for handler, implementing
+ Partition_handler::truncate_partition().
+
+ @return Operation status
+ @retval 0 Success.
+ @retval != 0 Error code.
+ */
+ virtual int truncate_partition_low()
+ { return HA_ERR_WRONG_COMMAND; }
+ /**
+    Change partitions.
+
+ Low-level primitive for handler, implementing
+ Partition_handler::change_partitions().
+
+ @param[in] create_info Table create info.
+ @param[in] path Path including table name.
+ @param[out] copied Number of rows copied.
+ @param[out] deleted Number of rows deleted.
+
+ @return Operation status
+ @retval 0 Success.
+ @retval != 0 Error code.
+ */
+ virtual int change_partitions_low(HA_CREATE_INFO *create_info,
+ const char *path,
+ ulonglong * const copied,
+ ulonglong * const deleted)
+ {
+ my_error(ER_ILLEGAL_HA, MYF(0), create_info->alias);
+ return HA_ERR_WRONG_COMMAND;
+ }
+ /**
+ Return the table handler.
+
+ For some partitioning specific functions it is still needed to access
+ the handler directly for transaction handling (mark_trx_read_write())
+ and to assert correct locking.
+
+ @return handler or NULL if not supported.
+ */
+ virtual handler *get_handler()
+ { return NULL; }
+};
+
+
+/// Maps compare function to strict weak ordering required by Priority_queue.
+struct Key_rec_less
+{
+ typedef int (*key_compare_fun)(void*, uchar *, uchar *);
+
+ explicit Key_rec_less(KEY **keys)
+ : m_keys(keys), m_fun(key_rec_cmp), m_max_at_top(false)
+ {
+ }
+
+ bool operator()(uchar *first, uchar *second)
+ {
+ const int cmpval=
+ (*m_fun)(m_keys, first + m_rec_offset, second + m_rec_offset);
+ return m_max_at_top ? cmpval < 0 : cmpval > 0;
+ }
+
+ KEY **m_keys;
+ key_compare_fun m_fun;
+ uint m_rec_offset;
+ bool m_max_at_top;
+};
+
+
+/**
+ Partition_helper is a helper class that implements most generic partitioning
+ functionality such as:
+ table scan, index scan (both ordered and non-ordered),
+ insert (write_row()), delete and update.
+ And includes ALTER TABLE ... ADD/COALESCE/DROP/REORGANIZE/... PARTITION
+ support.
+ It also implements a cache for the auto increment value and check/repair for
+ rows in wrong partition.
+
+ How to use it:
+ Inherit it and implement:
+ - *_in_part() functions for row operations.
+ - prepare_for_new_partitions(), create_new_partition(), close_new_partitions()
+ write_row_in_new_part() for handling 'fast' alter partition.
+*/
+class Partition_helper : public Sql_alloc
+{
+ typedef Priority_queue<uchar *, std::vector<uchar*>, Key_rec_less> Prio_queue;
+public:
+ Partition_helper(handler *main_handler);
+ ~Partition_helper();
+
+ /**
+ Set partition info.
+
+ To be called from Partition_handler.
+
+ @param part_info Partition info to use.
+ @param early True if called when part_info only created and parsed,
+ but not setup, checked or fixed.
+ */
+ virtual void set_part_info_low(partition_info *part_info, bool early);
+ /**
+ Initialize variables used before the table is opened.
+
+ @param mem_root Memory root to allocate things from (not yet used).
+
+ @return Operation status.
+ @retval false success.
+ @retval true failure.
+ */
+ inline bool init_partitioning(MEM_ROOT *mem_root)
+ {
+#ifndef DBUG_OFF
+ m_key_not_found_partitions.bitmap= NULL;
+#endif
+ return false;
+ }
+
+
+ /**
+ INSERT/UPDATE/DELETE functions.
+ @see handler.h
+ @{
+ */
+
+ /**
+ Insert a row to the partitioned table.
+
+ @param buf The row in MySQL Row Format.
+
+ @return Operation status.
+ @retval 0 Success
+ @retval != 0 Error code
+ */
+ int ph_write_row(uchar *buf);
+ /**
+ Update an existing row in the partitioned table.
+
+ Yes, update_row() does what you expect, it updates a row. old_data will
+ have the previous row record in it, while new_data will have the newest
+ data in it.
+ Keep in mind that the server can do updates based on ordering if an
+ ORDER BY clause was used. Consecutive ordering is not guaranteed.
+
+ If the new record belongs to a different partition than the old record
+ then it will be inserted into the new partition and deleted from the old.
+
+ new_data is always record[0]
+ old_data is always record[1]
+
+ @param old_data The old record in MySQL Row Format.
+ @param new_data The new record in MySQL Row Format.
+
+ @return Operation status.
+ @retval 0 Success
+ @retval != 0 Error code
+ */
+ int ph_update_row(const uchar *old_data, uchar *new_data);
+ /**
+ Delete an existing row in the partitioned table.
+
+ This will delete a row. buf will contain a copy of the row to be deleted.
+ The server will call this right after the current row has been read
+ (from either a previous rnd_xxx() or index_xxx() call).
+ If you keep a pointer to the last row or can access a primary key it will
+ make doing the deletion quite a bit easier.
+ Keep in mind that the server does no guarantee consecutive deletions.
+ ORDER BY clauses can be used.
+
+ buf is either record[0] or record[1]
+
+ @param buf The record in MySQL Row Format.
+
+ @return Operation status.
+ @retval 0 Success
+ @retval != 0 Error code
+ */
+ int ph_delete_row(const uchar *buf);
+
+ /** @} */
+
+ /** Release unused auto increment values. */
+ void ph_release_auto_increment();
+ /**
+ Calculate key hash value from an null terminated array of fields.
+ Support function for KEY partitioning.
+
+ @param field_array An array of the fields in KEY partitioning
+
+ @return hash_value calculated
+
+ @note Uses the hash function on the character set of the field.
+ Integer and floating point fields use the binary character set by default.
+ */
+ static uint32 ph_calculate_key_hash_value(Field **field_array);
+ /** Get checksum for table.
+ @return Checksum or 0 if not supported (which also may be a correct checksum!).
+ */
+ ha_checksum ph_checksum() const;
+
+ /**
+ MODULE full table scan
+
+ This module is used for the most basic access method for any table
+ handler. This is to fetch all data through a full table scan. No
+ indexes are needed to implement this part.
+ It contains one method to start the scan (rnd_init) that can also be
+ called multiple times (typical in a nested loop join). Then proceeding
+ to the next record (rnd_next) and closing the scan (rnd_end).
+ To remember a record for later access there is a method (position)
+ and there is a method used to retrieve the record based on the stored
+ position.
+ The position can be a file position, a primary key, a ROWID dependent
+ on the handler below.
+
+ unlike index_init(), rnd_init() can be called two times
+ without rnd_end() in between (it only makes sense if scan=1).
+ then the second call should prepare for the new table scan
+ (e.g if rnd_init allocates the cursor, second call should
+ position it to the start of the table, no need to deallocate
+ and allocate it again.
+ @see handler.h
+ @{
+ */
+
+ int ph_rnd_init(bool scan);
+ int ph_rnd_end();
+ int ph_rnd_next(uchar *buf);
+ void ph_position(const uchar *record);
+ int ph_rnd_pos(uchar *buf, uchar *pos);
+ int ph_rnd_pos_by_record(uchar *record);
+
+ /** @} */
+
+ /**
+ MODULE index scan
+
+ This part of the handler interface is used to perform access through
+ indexes. The interface is defined as a scan interface but the handler
+ can also use key lookup if the index is a unique index or a primary
+ key index.
+ Index scans are mostly useful for SELECT queries but are an important
+ part also of UPDATE, DELETE, REPLACE and CREATE TABLE table AS SELECT
+ and so forth.
+    Naturally an index is needed for an index scan and indexes can either
+    be ordered or hash based. Some ordered indexes can return data in order
+ but not necessarily all of them.
+ There are many flags that define the behavior of indexes in the
+ various handlers. These methods are found in the optimizer module.
+ -------------------------------------------------------------------------
+
+ index_read is called to start a scan of an index. The find_flag defines
+ the semantics of the scan. These flags are defined in
+ include/my_base.h
+    index_read_idx is the same but also initializes the index before doing
+ the same thing as index_read. Thus it is similar to index_init followed
+ by index_read. This is also how we implement it.
+
+ index_read/index_read_idx does also return the first row. Thus for
+ key lookups, the index_read will be the only call to the handler in
+ the index scan.
+
+ index_init initializes an index before using it and index_end does
+ any end processing needed.
+ @{
+ */
+
+ int ph_index_init_setup(uint key_nr, bool sorted);
+ int ph_index_init(uint key_nr, bool sorted);
+ int ph_index_end();
+ /*
+ These methods are used to jump to next or previous entry in the index
+ scan. There are also methods to jump to first and last entry.
+ */
+ int ph_index_first(uchar *buf);
+ int ph_index_last(uchar *buf);
+ int ph_index_next(uchar *buf);
+ int ph_index_next_same(uchar *buf, const uchar *key, uint keylen);
+ int ph_index_prev(uchar *buf);
+ int ph_index_read_map(uchar *buf,
+ const uchar *key,
+ key_part_map keypart_map,
+ enum ha_rkey_function find_flag);
+ int ph_index_read_last_map(uchar *buf,
+ const uchar *key,
+ key_part_map keypart_map);
+ int ph_index_read_idx_map(uchar *buf,
+ uint index,
+ const uchar *key,
+ key_part_map keypart_map,
+ enum ha_rkey_function find_flag);
+ int ph_read_range_first(const key_range *start_key,
+ const key_range *end_key,
+ bool eq_range_arg,
+ bool sorted);
+ int ph_read_range_next();
+ /** @} */
+
+ /**
+ Functions matching Partition_handler API.
+ @{
+ */
+
+ /**
+ Get statistics from a specific partition.
+ @param[out] stat_info Area to report values into.
+ @param[out] check_sum Check sum of partition.
+ @param[in] part_id Partition to report from.
+ */
+ virtual void get_dynamic_partition_info_low(PARTITION_STATS *stat_info,
+ ha_checksum *check_sum,
+ uint part_id);
+
+ /**
+ Implement the partition changes defined by ALTER TABLE of partitions.
+
+ Add and copy if needed a number of partitions, during this operation
+ only read operation is ongoing in the server. This is used by
+ ADD PARTITION all types as well as by REORGANIZE PARTITION. For
+ one-phased implementations it is used also by DROP and COALESCE
+ PARTITIONs.
+ One-phased implementation needs the new frm file, other handlers will
+ get zero length and a NULL reference here.
+
+ @param[in] create_info HA_CREATE_INFO object describing all
+ fields and indexes in table
+ @param[in] path Complete path of db and table name
+ @param[out] copied Output parameter where number of copied
+ records are added
+ @param[out] deleted Output parameter where number of deleted
+ records are added
+
+ @return Operation status
+ @retval 0 Success
+ @retval != 0 Failure
+ */
+ int change_partitions(HA_CREATE_INFO *create_info,
+ const char *path,
+ ulonglong * const copied,
+ ulonglong * const deleted);
+ /** @} */
+
+protected:
+ /* Common helper functions to be used by inheriting engines. */
+
+ /*
+ open/close functions.
+ */
+
+ /**
+    Set m_part_share and allocate internal bitmaps etc. used by open tables.
+
+    @param part_share  Partition share object for auto_inc handling etc.
+
+ @return Operation status.
+ @retval false success.
+ @retval true failure.
+ */
+ bool open_partitioning(Partition_share *part_share);
+ /**
+ Close partitioning for a table.
+
+ Frees memory and release other resources.
+ */
+ void close_partitioning();
+
+ /**
+ Lock auto increment value if needed.
+ */
+ inline void lock_auto_increment()
+ {
+ /* lock already taken */
+ if (m_auto_increment_safe_stmt_log_lock)
+ return;
+ DBUG_ASSERT(!m_auto_increment_lock);
+ if(m_table->s->tmp_table == NO_TMP_TABLE)
+ {
+ m_auto_increment_lock= true;
+ m_part_share->lock_auto_inc();
+ }
+ }
+ /**
+ unlock auto increment.
+ */
+ inline void unlock_auto_increment()
+ {
+ /*
+ If m_auto_increment_safe_stmt_log_lock is true, we have to keep the lock.
+ It will be set to false and thus unlocked at the end of the statement by
+ ha_partition::release_auto_increment.
+ */
+ if(m_auto_increment_lock && !m_auto_increment_safe_stmt_log_lock)
+ {
+ m_part_share->unlock_auto_inc();
+ m_auto_increment_lock= false;
+ }
+ }
+ /**
+ Get auto increment.
+
+ Only to be used for auto increment values that are the first field in
+ an unique index.
+
+ @param[in] increment Increment between generated numbers.
+ @param[in] nb_desired_values Number of values requested.
+ @param[out] first_value First reserved value (ULLONG_MAX on error).
+ @param[out] nb_reserved_values Number of values reserved.
+ */
+ void get_auto_increment_first_field(ulonglong increment,
+ ulonglong nb_desired_values,
+ ulonglong *first_value,
+ ulonglong *nb_reserved_values);
+
+ /**
+ Initialize the record priority queue used for sorted index scans.
+ @return Operation status.
+ @retval 0 Success.
+ @retval != 0 Error code.
+ */
+ int init_record_priority_queue();
+ /**
+ Destroy the record priority queue used for sorted index scans.
+ */
+ void destroy_record_priority_queue();
+ /*
+ Administrative support functions.
+ */
+
+ /** Print partitioning specific error.
+ @param error Error code.
+ @param errflag Error flag.
+ @return false if error is printed else true.
+ */
+ bool print_partition_error(int error, myf errflag);
+#if 0
+ /**
+ Print a message row formatted for ANALYZE/CHECK/OPTIMIZE/REPAIR TABLE.
+
+ Modeled after mi_check_print_msg.
+
+ @param thd Thread context.
+ @param len Needed length for message buffer.
+ @param msg_type Message type.
+ @param db_name Database name.
+ @param table_name Table name.
+ @param op_name Operation name.
+ @param fmt Message (in printf format with additional arguments).
+
+ @return Operation status.
+ @retval false for success else true.
+ */
+ bool print_admin_msg(THD *thd,
+ uint len,
+ const char *msg_type,
+ const char *db_name,
+ const char *table_name,
+ const char *op_name,
+ const char *fmt,
+ ...);
+#endif
+ /**
+ Check/fix misplaced rows.
+
+ @param part_id Partition to check/fix.
+ @param repair If true, move misplaced rows to correct partition.
+
+ @return Operation status.
+ @retval 0 Success
+ @retval != 0 Error
+ */
+ int check_misplaced_rows(uint part_id, bool repair);
+ /**
+ Set used partitions bitmap from Alter_info.
+
+ @return false if success else true.
+ */
+ bool set_altered_partitions();
+
+private:
+ enum partition_index_scan_type
+ {
+ PARTITION_INDEX_READ= 1,
+ PARTITION_INDEX_FIRST,
+ PARTITION_INDEX_FIRST_UNORDERED,
+ PARTITION_INDEX_LAST,
+ PARTITION_INDEX_READ_LAST,
+ PARTITION_READ_RANGE,
+ PARTITION_NO_INDEX_SCAN
+ };
+
+ /** handler to use (ha_partition, ha_innopart etc.) */
+ handler *m_handler;
+ /** Convenience pointer to table from m_handler (i.e. m_handler->table). */
+ TABLE *m_table;
+
+ /*
+ Access methods to protected areas in handler to avoid adding
+ friend class Partition_helper in class handler.
+ */
+ virtual THD *get_thd() const = 0;
+ virtual TABLE *get_table() const = 0;
+ virtual bool get_eq_range() const = 0;
+ virtual void set_eq_range(bool eq_range) = 0;
+ virtual void set_range_key_part(KEY_PART_INFO *key_part) = 0;
+
+ /*
+ Implementation of per partition operation by instantiated engine.
+ These must be implemented in the 'real' partition_helper subclass.
+ */
+
+ /**
+ Write a row in the specified partition.
+
+ @see handler::write_row().
+
+ @param part_id Partition to write to.
+ @param buf Buffer with data to write.
+
+ @return Operation status.
+ @retval 0 Success.
+ @retval != 0 Error code.
+ */
+ virtual int write_row_in_part(uint part_id, uchar *buf) = 0;
+ /**
+ Update a row in the specified partition.
+
+ @see handler::update_row().
+
+    @param new_part_id  Partition to update in.
+ @param old_data Buffer containing old row.
+ @param new_data Buffer containing new row.
+
+ @return Operation status.
+ @retval 0 Success.
+ @retval != 0 Error code.
+ */
+ virtual int update_row_in_part(uint new_part_id,
+ const uchar *old_data,
+ uchar *new_data) = 0;
+ /**
+ Delete an existing row in the specified partition.
+
+ @see handler::delete_row().
+
+ @param part_id Partition to delete from.
+ @param buf Buffer containing row to delete.
+
+ @return Operation status.
+ @retval 0 Success.
+ @retval != 0 Error code.
+ */
+ virtual int delete_row_in_part(uint part_id, const uchar *buf) = 0;
+ /**
+ Initialize the shared auto increment value.
+
+ @param no_lock If HA_STATUS_NO_LOCK should be used in info(HA_STATUS_AUTO).
+
+ Also sets stats.auto_increment_value.
+ */
+ virtual int initialize_auto_increment(bool no_lock) = 0;
+ /** Release auto_increment in all underlying partitions. */
+ virtual void release_auto_increment_all_parts() {}
+ /** Save or persist the current max auto increment. */
+ virtual void save_auto_increment(ulonglong nr) {}
+ /**
+ Per partition equivalent of rnd_* and index_* functions.
+
+ @see class handler.
+ */
+ virtual int rnd_init_in_part(uint part_id, bool table_scan) = 0;
+ int ph_rnd_next_in_part(uint part_id, uchar *buf);
+ virtual int rnd_next_in_part(uint part_id, uchar *buf) = 0;
+ virtual int rnd_end_in_part(uint part_id, bool scan) = 0;
+ virtual void position_in_last_part(uchar *ref, const uchar *row) = 0;
+ /* If ph_rnd_pos is used then this needs to be implemented! */
+ virtual int rnd_pos_in_part(uint part_id, uchar *buf, uchar *pos)
+ { DBUG_ASSERT(0); return HA_ERR_WRONG_COMMAND; }
+ virtual int rnd_pos_by_record_in_last_part(uchar *row)
+ {
+ /*
+ Not much overhead to use default function. This avoids out-of-sync code.
+ */
+ return m_handler->rnd_pos_by_record(row);
+ }
+ virtual int index_init_in_part(uint part, uint keynr, bool sorted)
+ { DBUG_ASSERT(0); return HA_ERR_WRONG_COMMAND; }
+ virtual int index_end_in_part(uint part)
+ { DBUG_ASSERT(0); return HA_ERR_WRONG_COMMAND; }
+ virtual int index_first_in_part(uint part, uchar *buf) = 0;
+ virtual int index_last_in_part(uint part, uchar *buf) = 0;
+ virtual int index_prev_in_part(uint part, uchar *buf) = 0;
+ virtual int index_next_in_part(uint part, uchar *buf) = 0;
+ virtual int index_next_same_in_part(uint part,
+ uchar *buf,
+ const uchar *key,
+ uint length) = 0;
+ virtual int index_read_map_in_part(uint part,
+ uchar *buf,
+ const uchar *key,
+ key_part_map keypart_map,
+ enum ha_rkey_function find_flag) = 0;
+ virtual int index_read_last_map_in_part(uint part,
+ uchar *buf,
+ const uchar *key,
+ key_part_map keypart_map) = 0;
+ /**
+ Do read_range_first in the specified partition.
+ If buf is set, then copy the result there instead of table->record[0].
+ */
+ virtual int read_range_first_in_part(uint part,
+ uchar *buf,
+ const key_range *start_key,
+ const key_range *end_key,
+ bool eq_range,
+ bool sorted) = 0;
+ /**
+ Do read_range_next in the specified partition.
+ If buf is set, then copy the result there instead of table->record[0].
+ */
+ virtual int read_range_next_in_part(uint part, uchar *buf) = 0;
+ virtual int index_read_idx_map_in_part(uint part,
+ uchar *buf,
+ uint index,
+ const uchar *key,
+ key_part_map keypart_map,
+ enum ha_rkey_function find_flag) = 0;
+ /**
+ Initialize engine specific resources for the record priority queue
+    used during ordered index reads for multiple partitions.
+
+ @param used_parts Number of partitions used in query
+ (number of set bits in m_part_info->read_partitions).
+
+ @return Operation status.
+ @retval 0 Success.
+ @retval != 0 Error code.
+ */
+ virtual int init_record_priority_queue_for_parts(uint used_parts)
+ {
+ return 0;
+ }
+ /**
+ Destroy and release engine specific resources used by the record
+ priority queue.
+ */
+ virtual void destroy_record_priority_queue_for_parts() {}
+ /**
+ Checksum for a partition.
+
+ @param part_id Partition to checksum.
+ */
+ virtual ha_checksum checksum_in_part(uint part_id) const
+ { DBUG_ASSERT(0); return 0; }
+ /**
+ Copy a cached row.
+
+ Used when copying a row from the record priority queue to the return buffer.
+ For some engines, like InnoDB, only marked columns must be copied,
+ to preserve non-read columns.
+
+ @param[out] to_rec Buffer to copy to.
+ @param[in] from_rec Buffer to copy from.
+ */
+ virtual void copy_cached_row(uchar *to_rec, const uchar *from_rec)
+ { memcpy(to_rec, from_rec, m_rec_length); }
+ /**
+ Prepare for creating new partitions during ALTER TABLE ... PARTITION.
+ @param num_partitions Number of new partitions to be created.
+ @param only_create True if only creating the partition
+ (no open/lock is needed).
+
+ @return Operation status.
+ @retval 0 Success.
+ @retval != 0 Error code.
+ */
+ virtual int prepare_for_new_partitions(uint num_partitions,
+ bool only_create) = 0;
+ /**
+ Create a new partition to be filled during ALTER TABLE ... PARTITION.
+ @param table Table to create the partition in.
+ @param create_info Table/partition specific create info.
+ @param part_name Partition name.
+ @param new_part_id Partition id in new table.
+ @param part_elem Partition element.
+
+ @return Operation status.
+ @retval 0 Success.
+ @retval != 0 Error code.
+ */
+ virtual int create_new_partition(TABLE *table,
+ HA_CREATE_INFO *create_info,
+ const char *part_name,
+ uint new_part_id,
+ partition_element *part_elem) = 0;
+ /**
+ Close and finalize new partitions.
+ */
+ virtual void close_new_partitions() = 0;
+ /**
+ write row to new partition.
+ @param new_part New partition to write to.
+
+ @return Operation status.
+ @retval 0 Success.
+ @retval != 0 Error code.
+ */
+ virtual int write_row_in_new_part(uint new_part) = 0;
+
+ /* Internal helper functions*/
+ /**
+ Update auto increment value if current row contains a higher value.
+ */
+ inline void set_auto_increment_if_higher();
+ /**
+ Common routine to set up index scans.
+
+ Find out which partitions we'll need to read when scanning the specified
+ range.
+
+ If we need to scan only one partition, set m_ordered_scan_ongoing=FALSE
+ as we will not need to do merge ordering.
+
+ @param buf Buffer to later return record in (this function
+ needs it to calculate partitioning function values)
+
+ @param idx_read_flag True <=> m_start_key has range start endpoint which
+ probably can be used to determine the set of
+ partitions to scan.
+ False <=> there is no start endpoint.
+
+ @return Operation status.
+ @retval 0 Success
+ @retval !=0 Error code
+ */
+ int partition_scan_set_up(uchar *buf, bool idx_read_flag);
+ /**
+ Common routine to handle index_next with unordered results.
+
+ These routines are used to scan partitions without considering order.
+ This is performed in two situations.
+ 1) In read_multi_range this is the normal case
+ 2) When performing any type of index_read, index_first, index_last where
+ all fields in the partition function is bound. In this case the index
+ scan is performed on only one partition and thus it isn't necessary to
+ perform any sort.
+
+ @param[out] buf Read row in MySQL Row Format.
+    @param[in]  is_next_same  Called from index_next_same.
+
+ @return Operation status.
+ @retval HA_ERR_END_OF_FILE End of scan
+ @retval 0 Success
+ @retval other Error code
+ */
+ int handle_unordered_next(uchar *buf, bool is_next_same);
+ /**
+ Handle index_next when changing to new partition.
+
+ This routine is used to start the index scan on the next partition.
+ Both initial start and after completing scan on one partition.
+
+ @param[out] buf Read row in MySQL Row Format
+
+ @return Operation status.
+ @retval HA_ERR_END_OF_FILE End of scan
+ @retval 0 Success
+ @retval other Error code
+ */
+ int handle_unordered_scan_next_partition(uchar *buf);
+ /**
+ Common routine to start index scan with ordered results.
+
+ @param[out] buf Read row in MySQL Row Format
+
+ @return Operation status
+ @retval HA_ERR_END_OF_FILE End of scan
+ @retval HA_ERR_KEY_NOT_FOUND End of scan
+ @retval 0 Success
+ @retval other Error code
+ */
+ int handle_ordered_index_scan(uchar *buf);
+ /**
+ Add index_next/prev results from partitions without exact match.
+
+    If there were any partitions that returned HA_ERR_KEY_NOT_FOUND when
+ ha_index_read_map was done, those partitions must be included in the
+ following index_next/prev call.
+
+ @return Operation status
+ @retval HA_ERR_END_OF_FILE End of scan
+ @retval 0 Success
+ @retval other Error code
+ */
+ int handle_ordered_index_scan_key_not_found();
+ /**
+ Common routine to handle index_prev with ordered results.
+
+ @param[out] buf Read row in MySQL Row Format.
+
+ @return Operation status.
+ @retval HA_ERR_END_OF_FILE End of scan
+ @retval 0 Success
+ @retval other Error code
+ */
+ int handle_ordered_prev(uchar *buf);
+ /**
+ Common routine to handle index_next with ordered results.
+
+ @param[out] buf Read row in MySQL Row Format.
+    @param[in]  is_next_same  Called from index_next_same.
+
+ @return Operation status.
+ @retval HA_ERR_END_OF_FILE End of scan
+ @retval 0 Success
+ @retval other Error code
+ */
+ int handle_ordered_next(uchar *buf, bool is_next_same);
+ /**
+ Common routine for a number of index_read variants.
+
+ @param[out] buf Buffer where the record should be returned.
+ @param[in] have_start_key TRUE <=> the left endpoint is available, i.e.
+ we're in index_read call or in read_range_first
+ call and the range has left endpoint.
+ FALSE <=> there is no left endpoint (we're in
+ read_range_first() call and the range has no
+ left endpoint).
+
+ @return Operation status
+ @retval 0 OK
+ @retval HA_ERR_END_OF_FILE Whole index scanned, without finding the record.
+ @retval HA_ERR_KEY_NOT_FOUND Record not found, but index cursor positioned.
+ @retval other Error code.
+ */
+ int common_index_read(uchar *buf, bool have_start_key);
+ /**
+ Common routine for index_first/index_last.
+
+ @param[out] buf Read row in MySQL Row Format.
+
+ @return Operation status.
+ @retval 0 Success
+ @retval != 0 Error code
+ */
+ int common_first_last(uchar *buf);
+ /**
+ Return the top record in sort order.
+
+ @param[out] buf Row returned in MySQL Row Format.
+ */
+ void return_top_record(uchar *buf);
+ /**
+ Copy partitions as part of ALTER TABLE of partitions.
+
+ change_partitions has done all the preparations, now it is time to
+ actually copy the data from the reorganized partitions to the new
+ partitions.
+
+ @param[out] copied Number of records copied.
+ @param[out] deleted Number of records deleted.
+
+ @return Operation status
+ @retval 0 Success
+ @retval >0 Error code
+ */
+ virtual int copy_partitions(ulonglong * const copied,
+ ulonglong * const deleted);
+
+ /**
+ Set table->read_set taking partitioning expressions into account.
+ */
+ void set_partition_read_set();
+
+ /*
+ These could be private as well,
+ but easier to expose them to derived classes to use.
+ */
+protected:
+ /** All internal partitioning data! @{ */
+ /** Tables partitioning info (same as table->part_info) */
+ partition_info *m_part_info;
+ /** Is primary key clustered. */
+ bool m_pkey_is_clustered;
+ /** Cached value of m_part_info->is_sub_partitioned(). */
+ bool m_is_sub_partitioned;
+ /** Partition share for auto_inc handling. */
+ Partition_share *m_part_share;
+ /** Total number of partitions. */
+ uint m_tot_parts;
+ uint m_last_part; // Last accessed partition.
+ const uchar *m_err_rec; // record which gave error.
+ bool m_auto_increment_safe_stmt_log_lock;
+ bool m_auto_increment_lock;
+ part_id_range m_part_spec; // Which parts to scan
+ uint m_scan_value; // Value passed in rnd_init
+ // call
+ key_range m_start_key; // index read key range
+ enum partition_index_scan_type m_index_scan_type;// What type of index
+ // scan
+ uint m_rec_length; // Local copy of record length
+
+ bool m_ordered; // Ordered/Unordered index scan.
+ bool m_ordered_scan_ongoing; // Ordered index scan ongoing.
+ bool m_reverse_order; // Scanning in reverse order (prev).
+ /** Row and key buffer for ordered index scan. */
+ uchar *m_ordered_rec_buffer;
+ /** Prio queue used by sorted read. */
+ Prio_queue *m_queue;
+ /** Which partition is to deliver next result. */
+ uint m_top_entry;
+ /** Offset in m_ordered_rec_buffer from part buffer to its record buffer. */
+ uint m_rec_offset;
+ /**
+ Current index used for sorting.
+ If clustered PK exists, then it will be used as secondary index to
+ sort on if the first is equal in key_rec_cmp.
+ So if clustered pk: m_curr_key_info[0]= current index and
+ m_curr_key_info[1]= pk and [2]= NULL.
+ Otherwise [0]= current index, [1]= NULL, and we will
+ sort by rowid as secondary sort key if equal first key.
+ */
+ KEY *m_curr_key_info[3];
+ enum enum_using_ref {
+ /** handler::ref is not copied to the PQ. */
+ REF_NOT_USED= 0,
+ /**
+ handler::ref is copied to the PQ but does not need to be used in sorting.
+ */
+ REF_STORED_IN_PQ,
+ /** handler::ref is copied to the PQ and must be used during sorting. */
+ REF_USED_FOR_SORT};
+ /** How handler::ref is used in the priority queue. */
+ enum_using_ref m_ref_usage;
+ /** Set if previous index_* call returned HA_ERR_KEY_NOT_FOUND. */
+ bool m_key_not_found;
+ /** Partitions that returned HA_ERR_KEY_NOT_FOUND. */
+ MY_BITMAP m_key_not_found_partitions;
+ /** @} */
+};
+#endif /* PARTITION_HANDLER_INCLUDED */
diff --git a/sql/share/errmsg-utf8.txt b/sql/share/errmsg-utf8.txt
index b30240f64c3..0aeaa058cf9 100644
--- a/sql/share/errmsg-utf8.txt
+++ b/sql/share/errmsg-utf8.txt
@@ -7523,8 +7523,17 @@ WARN_VERS_PARAMETERS
WARN_VERS_PART_ROTATION
eng "Switching from partition %`s to %`s"
+WARN_VERS_TRX_MISSING
+ eng "VTQ missing transaction ID %lu"
+
+WARN_VERS_PART_NON_HISTORICAL
+ eng "Partition %`s contains non-historical data"
+
ER_VERS_NOT_ALLOWED
eng "%`s is not allowed for versioned table"
ER_VERS_WRONG_QUERY_TYPE
eng "%`s works only with %`s query type"
+
+ER_WRONG_TABLESPACE_NAME 42000
+ eng "Incorrect tablespace name `%-.192s`"
diff --git a/sql/sql_partition.cc b/sql/sql_partition.cc
index b358fe3386e..caca441e5e4 100644
--- a/sql/sql_partition.cc
+++ b/sql/sql_partition.cc
@@ -67,6 +67,7 @@
#include "opt_range.h" // store_key_image_to_rec
#include "sql_alter.h" // Alter_table_ctx
#include "sql_select.h"
+#include "sql_tablespace.h" // check_tablespace_name
#include <algorithm>
using std::max;
@@ -3458,7 +3459,10 @@ int vers_get_partition_id(partition_info *part_info,
{
table->s->busy_rotation= true;
mysql_mutex_unlock(&table->s->LOCK_rotation);
- if (part_info->vers_limit_exceed() || part_info->vers_interval_exceed(sys_trx_end->get_timestamp()))
+ // transaction is not yet pushed to VTQ, so we use now-time
+ my_time_t end_ts= sys_trx_end->table->versioned_by_engine() ?
+ my_time(0) : sys_trx_end->get_timestamp();
+ if (part_info->vers_limit_exceed() || part_info->vers_interval_exceed(end_ts))
{
part_info->vers_part_rotate(thd);
}
@@ -7388,6 +7392,39 @@ err:
}
#endif
+
+/*
+ Prepare for calling val_int on partition function by setting fields to
+ point to the record where the values of the PF-fields are stored.
+
+ SYNOPSIS
+ set_field_ptr()
+ ptr Array of fields to change ptr
+ new_buf New record pointer
+ old_buf Old record pointer
+
+ DESCRIPTION
+ Set ptr in field objects of field array to refer to new_buf record
+ instead of previously old_buf. Used before calling val_int and after
+ it is used to restore pointers to table->record[0].
+ This routine is placed outside of partition code since it can be useful
+ also for other programs.
+*/
+
+void set_field_ptr(Field **ptr, const uchar *new_buf,
+ const uchar *old_buf)
+{
+ my_ptrdiff_t diff= (new_buf - old_buf);
+ DBUG_ENTER("set_field_ptr");
+
+ do
+ {
+ (*ptr)->move_field_offset(diff);
+ } while (*(++ptr));
+ DBUG_VOID_RETURN;
+}
+
+
/*
Prepare for calling val_int on partition function by setting fields to
point to the record where the values of the PF-fields are stored.
@@ -7426,6 +7463,61 @@ void set_key_field_ptr(KEY *key_info, const uchar *new_buf,
}
+/**
+ Append all fields in read_set to string
+
+ @param[in,out] str String to append to.
+ @param[in] row Row to append.
+ @param[in] table Table containing read_set and fields for the row.
+*/
+void append_row_to_str(String &str, const uchar *row, TABLE *table)
+{
+ Field **fields, **field_ptr;
+ const uchar *rec;
+ uint num_fields= bitmap_bits_set(table->read_set);
+ uint curr_field_index= 0;
+ bool is_rec0= !row || row == table->record[0];
+ if (!row)
+ rec= table->record[0];
+ else
+ rec= row;
+
+ /* Create a new array of all read fields. */
+ fields= (Field**) my_malloc(sizeof(void*) * (num_fields + 1),
+ MYF(0));
+ if (!fields)
+ return;
+ fields[num_fields]= NULL;
+ for (field_ptr= table->field;
+ *field_ptr;
+ field_ptr++)
+ {
+ if (!bitmap_is_set(table->read_set, (*field_ptr)->field_index))
+ continue;
+ fields[curr_field_index++]= *field_ptr;
+ }
+
+
+ if (!is_rec0)
+ set_field_ptr(fields, rec, table->record[0]);
+
+ for (field_ptr= fields;
+ *field_ptr;
+ field_ptr++)
+ {
+ Field *field= *field_ptr;
+ str.append(" ");
+ str.append(field->field_name);
+ str.append(":");
+ field_unpack(&str, field, rec, 0, false);
+ }
+
+ if (!is_rec0)
+ set_field_ptr(fields, table->record[0], rec);
+ my_free(fields);
+}
+
+
/*
SYNOPSIS
mem_alloc_error()
@@ -8595,4 +8687,52 @@ uint get_partition_field_store_length(Field *field)
store_length+= HA_KEY_BLOB_LENGTH;
return store_length;
}
+
+// FIXME: duplicate of ha_partition::set_up_table_before_create
+bool set_up_table_before_create(THD *thd,
+ TABLE_SHARE *share,
+ const char *partition_name_with_path,
+ HA_CREATE_INFO *info,
+ partition_element *part_elem)
+{
+ bool error= false;
+ const char *partition_name;
+ DBUG_ENTER("set_up_table_before_create");
+
+ DBUG_ASSERT(part_elem);
+
+ if (!part_elem)
+ DBUG_RETURN(true);
+ share->max_rows= part_elem->part_max_rows;
+ share->min_rows= part_elem->part_min_rows;
+ partition_name= strrchr(partition_name_with_path, FN_LIBCHAR);
+ if ((part_elem->index_file_name &&
+ (error= append_file_to_dir(thd,
+ const_cast<const char**>(&part_elem->index_file_name),
+ partition_name+1))) ||
+ (part_elem->data_file_name &&
+ (error= append_file_to_dir(thd,
+ const_cast<const char**>(&part_elem->data_file_name),
+ partition_name+1))))
+ {
+ DBUG_RETURN(error);
+ }
+ if (part_elem->index_file_name != NULL)
+ {
+ info->index_file_name= part_elem->index_file_name;
+ }
+ if (part_elem->data_file_name != NULL)
+ {
+ info->data_file_name= part_elem->data_file_name;
+ }
+ if (part_elem->tablespace_name != NULL)
+ {
+ if (check_tablespace_name(part_elem->tablespace_name) != IDENT_NAME_OK)
+ {
+ DBUG_RETURN(true);
+ }
+ info->tablespace= part_elem->tablespace_name;
+ }
+ DBUG_RETURN(error);
+}
#endif
diff --git a/sql/sql_partition.h b/sql/sql_partition.h
index c2665a8366b..aef4a6ce5e1 100644
--- a/sql/sql_partition.h
+++ b/sql/sql_partition.h
@@ -40,6 +40,7 @@ typedef struct st_key_range key_range;
#define HA_CAN_UPDATE_PARTITION_KEY (1 << 1)
#define HA_CAN_PARTITION_UNIQUE (1 << 2)
#define HA_USE_AUTO_PARTITION (1 << 3)
+#define HA_ONLY_VERS_PARTITION (1 << 4)
#define NORMAL_PART_NAME 0
#define TEMP_PART_NAME 1
@@ -127,6 +128,14 @@ uint32 get_partition_id_range_for_endpoint(partition_info *part_info,
bool check_part_func_fields(Field **ptr, bool ok_with_charsets);
bool field_is_partition_charset(Field *field);
Item* convert_charset_partition_constant(Item *item, CHARSET_INFO *cs);
+/**
+ Append all fields in read_set to string
+
+ @param[in,out] str String to append to.
+ @param[in] row Row to append.
+ @param[in] table Table containing read_set and fields for the row.
+*/
+void append_row_to_str(String &str, const uchar *row, TABLE *table);
void mem_alloc_error(size_t size);
void truncate_partition_filename(char *path);
@@ -291,6 +300,31 @@ void create_subpartition_name(char *out, const char *in1,
void set_key_field_ptr(KEY *key_info, const uchar *new_buf,
const uchar *old_buf);
+/** Set up table for creating a partition.
+Copy info from partition to the table share so the created partition
+has the correct info.
+ @param thd        THD object
+ @param share      Table share to be updated.
+ @param partition_name_with_path  Partition name with path, used to derive per-partition index/data file names.
+ @param info       Create info to be updated.
+ @param part_elem  partition_element containing the info.
+ @return status
+ @retval TRUE Error
+ @retval FALSE Success
+
+ @details
+ Set up
+ 1) MAX_ROWS, MIN_ROWS on partition
+ 2) Index file name on partition
+ 3) Data file name on partition
+ 4) Tablespace name on partition
+*/
+bool set_up_table_before_create(THD *thd,
+ TABLE_SHARE *share,
+ const char *partition_name_with_path,
+ HA_CREATE_INFO *info,
+ partition_element *part_elem);
+
extern const LEX_STRING partition_keywords[];
#endif /* SQL_PARTITION_INCLUDED */
diff --git a/sql/sql_table.cc b/sql/sql_table.cc
index b5cf35ed17c..3a921e0dc79 100644
--- a/sql/sql_table.cc
+++ b/sql/sql_table.cc
@@ -4492,7 +4492,10 @@ handler *mysql_create_frm_image(THD *thd,
part_info->part_info_string= part_syntax_buf;
part_info->part_info_len= syntax_len;
if ((!(engine_type->partition_flags &&
- engine_type->partition_flags() & HA_CAN_PARTITION)) ||
+ ((engine_type->partition_flags() & HA_CAN_PARTITION) ||
+ (part_info->part_type == VERSIONING_PARTITION &&
+ engine_type->partition_flags() & HA_ONLY_VERS_PARTITION))
+ )) ||
create_info->db_type == partition_hton)
{
/*
diff --git a/sql/sql_tablespace.cc b/sql/sql_tablespace.cc
index 8b9e14e5a18..318be320640 100644
--- a/sql/sql_tablespace.cc
+++ b/sql/sql_tablespace.cc
@@ -22,6 +22,70 @@
#include "sql_table.h" // write_bin_log
#include "sql_class.h" // THD
+/**
+ Check if tablespace name is valid
+
+ @param tablespace_name Name of the tablespace
+
+ @note Tablespace names are not reflected in the file system, so
+ character case conversion or consideration is not relevant.
+
+ @note Checking for path characters or ending space is not done.
+ The only checks are for identifier length, both in terms of
+ number of characters and number of bytes.
+
+ @retval IDENT_NAME_OK Identifier name is ok (Success)
+ @retval IDENT_NAME_WRONG Identifier name is wrong, if length == 0
+ (ER_WRONG_TABLESPACE_NAME)
+ @retval IDENT_NAME_TOO_LONG Identifier name is too long if it is greater
+ than 64 characters (ER_TOO_LONG_IDENT)
+
+ @note In case of IDENT_NAME_TOO_LONG or IDENT_NAME_WRONG, the function
+ reports an error (using my_error()).
+*/
+
+enum_ident_name_check check_tablespace_name(const char *tablespace_name)
+{
+ size_t name_length= 0; ///< Length as number of bytes
+ size_t name_length_symbols= 0; ///< Length as number of symbols
+
+ // Name must be != NULL and length must be > 0
+ if (!tablespace_name || (name_length= strlen(tablespace_name)) == 0)
+ {
+ my_error(ER_WRONG_TABLESPACE_NAME, MYF(0), tablespace_name);
+ return IDENT_NAME_WRONG;
+ }
+
+ // If we do not have too many bytes, we must check the number of symbols,
+ // provided the system character set may use more than one byte per symbol.
+ if (name_length <= NAME_LEN && use_mb(system_charset_info))
+ {
+ const char *name= tablespace_name; ///< The actual tablespace name
+ const char *end= name + name_length; ///< Pointer to first byte after name
+
+ // Loop over all symbols as long as we don't have too many already
+ while (name != end && name_length_symbols <= NAME_CHAR_LEN)
+ {
+ int len= my_ismbchar(system_charset_info, name, end);
+ if (len)
+ name += len;
+ else
+ name++;
+
+ name_length_symbols++;
+ }
+ }
+
+ if (name_length_symbols > NAME_CHAR_LEN || name_length > NAME_LEN)
+ {
+ my_error(ER_TOO_LONG_IDENT, MYF(0), tablespace_name);
+ return IDENT_NAME_TOO_LONG;
+ }
+
+ return IDENT_NAME_OK;
+}
+
+
int mysql_alter_tablespace(THD *thd, st_alter_tablespace *ts_info)
{
int error= HA_ADMIN_NOT_IMPLEMENTED;
diff --git a/sql/sql_tablespace.h b/sql/sql_tablespace.h
index ae77d15cbcb..b97c64f7965 100644
--- a/sql/sql_tablespace.h
+++ b/sql/sql_tablespace.h
@@ -19,6 +19,41 @@
class THD;
class st_alter_tablespace;
+/**
+ Enumerate possible status of a identifier name while determining
+ its validity
+*/
+enum enum_ident_name_check
+{
+ IDENT_NAME_OK,
+ IDENT_NAME_WRONG,
+ IDENT_NAME_TOO_LONG
+};
+
+/**
+ Check if tablespace name is valid
+
+ @param tablespace_name Name of the tablespace
+
+ @note Tablespace names are not reflected in the file system, so
+ character case conversion or consideration is not relevant.
+
+ @note Checking for path characters or ending space is not done.
+ The only checks are for identifier length, both in terms of
+ number of characters and number of bytes.
+
+ @retval IDENT_NAME_OK Identifier name is ok (Success)
+ @retval IDENT_NAME_WRONG Identifier name is wrong, if length == 0
+ (ER_WRONG_TABLESPACE_NAME)
+ @retval IDENT_NAME_TOO_LONG Identifier name is too long if it is greater
+ than 64 characters (ER_TOO_LONG_IDENT)
+
+ @note In case of IDENT_NAME_TOO_LONG or IDENT_NAME_WRONG, the function
+ reports an error (using my_error()).
+*/
+
+enum_ident_name_check check_tablespace_name(const char *tablespace_name);
+
int mysql_alter_tablespace(THD* thd, st_alter_tablespace *ts_info);
#endif /* SQL_TABLESPACE_INCLUDED */
diff --git a/sql/table.cc b/sql/table.cc
index 354658ba476..b256b3e91b6 100644
--- a/sql/table.cc
+++ b/sql/table.cc
@@ -3261,6 +3261,20 @@ enum open_frm_error open_table_from_share(THD *thd, TABLE_SHARE *share,
}
outparam->part_info->is_auto_partitioned= share->auto_partitioned;
DBUG_PRINT("info", ("autopartitioned: %u", share->auto_partitioned));
+ if (outparam->part_info->part_type == VERSIONING_PARTITION &&
+ share->db_type()->vers_upgrade_handler)
+ {
+ outparam->file= share->db_type()->vers_upgrade_handler(
+ outparam->file, &outparam->mem_root);
+ if (!outparam->file)
+ {
+ thd->stmt_arena= backup_stmt_arena_ptr;
+ thd->restore_active_arena(&part_func_arena, &backup_arena);
+ my_error(ER_OUTOFMEMORY, MYF(0), 4095);
+ error_reported= TRUE;
+ goto err;
+ }
+ }
/*
We should perform the fix_partition_func in either local or
caller's arena depending on work_part_info_used value.