Diffstat (limited to 'sql/ha_partition.cc')
-rw-r--r--  sql/ha_partition.cc  6059
1 files changed, 6059 insertions, 0 deletions
diff --git a/sql/ha_partition.cc b/sql/ha_partition.cc
new file mode 100644
index 00000000000..d2497081893
--- /dev/null
+++ b/sql/ha_partition.cc
@@ -0,0 +1,6059 @@
+/* Copyright (C) 2005 MySQL AB
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; version 2 of the License.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+/*
+ This handler was developed by Mikael Ronstrom for version 5.1 of MySQL.
+ It is an abstraction layer on top of other handlers such as MyISAM,
+ InnoDB, Federated, Berkeley DB and so forth. Partitioned tables can also
+ be handled by a storage engine itself. The current example of this is NDB
+ Cluster, which handles partitioning internally. This has the benefit that
+ many loops needed in the partition handler can be avoided.
+
+ Partitioning has an inherent feature which in some cases is positive and
+ in some cases is negative. It splits the data into chunks. This makes
+ the data more manageable, queries can easily be parallelised across the
+ parts, and indexes are split such that there are fewer levels in the
+ index trees. The inherent disadvantage is that to use a split index
+ one has to scan all index parts, which is acceptable for large queries
+ but can be a drawback for small queries.
+
+ Partitioning lays the foundation for managing extremely large databases.
+ It also lays the foundation for more parallelism
+ in the execution of queries. This functionality will grow with later
+ versions of MySQL.
+
+ You can enable it by doing the following during your build
+ process:
+ ./configure --with-partition
+
+ The partition handler is set up to use table locks. It implements a
+ partition "SHARE" that is inserted into a hash by table name. You can use
+ this to store state information that any partition handler object will be
+ able to see if it is using the same table.
+
+ Please read the object definition in ha_partition.h before reading the rest
+ of this file.
+*/
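+
+/*
+  A hypothetical usage example (any RANGE/LIST/HASH/KEY scheme works the
+  same way): a table such as
+
+    CREATE TABLE t1 (a INT)
+    PARTITION BY RANGE (a)
+    (PARTITION p0 VALUES LESS THAN (10),
+     PARTITION p1 VALUES LESS THAN (20));
+
+  is served by a single ha_partition object that fans out each call to two
+  underlying handler objects, one per partition.
+*/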
+
+#ifdef __GNUC__
+#pragma implementation // gcc: Class implementation
+#endif
+
+#include "mysql_priv.h"
+
+#ifdef WITH_PARTITION_STORAGE_ENGINE
+#include "ha_partition.h"
+
+#include <mysql/plugin.h>
+
+static const char *ha_par_ext= ".par";
+#ifdef NOT_USED
+static int free_share(PARTITION_SHARE * share);
+static PARTITION_SHARE *get_share(const char *table_name, TABLE * table);
+#endif
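+
+/*
+  A rough sketch of the "SHARE" mentioned in the header comment; the
+  authoritative definition lives in ha_partition.h. It carries the
+  per-table state that all handler instances of the same table can see:
+
+    typedef struct st_partition_share
+    {
+      char *table_name;
+      uint table_name_length, use_count;
+      pthread_mutex_t mutex;
+      THR_LOCK lock;
+    } PARTITION_SHARE;
+
+  get_share()/free_share() above would maintain use_count under the mutex;
+  they are compiled out (NOT_USED) since the share is not needed yet.
+*/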
+
+/****************************************************************************
+ MODULE create/delete handler object
+****************************************************************************/
+
+static handler *partition_create_handler(handlerton *hton,
+ TABLE_SHARE *share,
+ MEM_ROOT *mem_root);
+static uint partition_flags();
+static uint alter_table_flags(uint flags);
+
+
+static int partition_initialize(void *p)
+{
+
+ handlerton *partition_hton;
+ partition_hton= (handlerton *)p;
+
+ partition_hton->state= SHOW_OPTION_YES;
+ partition_hton->db_type= DB_TYPE_PARTITION_DB;
+ partition_hton->create= partition_create_handler;
+ partition_hton->partition_flags= partition_flags;
+ partition_hton->alter_table_flags= alter_table_flags;
+ partition_hton->flags= HTON_NOT_USER_SELECTABLE | HTON_HIDDEN;
+
+ return 0;
+}
+
+/*
+ Create new partition handler
+
+ SYNOPSIS
+ partition_create_handler()
+ table Table object
+
+ RETURN VALUE
+ New partition object
+*/
+
+static handler *partition_create_handler(handlerton *hton,
+ TABLE_SHARE *share,
+ MEM_ROOT *mem_root)
+{
+ ha_partition *file= new (mem_root) ha_partition(hton, share);
+ if (file && file->initialise_partition(mem_root))
+ {
+ delete file;
+ file= 0;
+ }
+ return file;
+}
+
+/*
+ HA_CAN_PARTITION:
+ Used by storage engines that can handle partitioning without this
+ partition handler
+ (Partition, NDB)
+
+ HA_CAN_UPDATE_PARTITION_KEY:
+ Set if the handler can update fields that are part of the partition
+ function.
+
+ HA_CAN_PARTITION_UNIQUE:
+ Set if the handler can handle unique indexes where the fields of the
+ unique key are not part of the fields of the partition function. Thus
+ a unique key can be set on all fields.
+
+ HA_USE_AUTO_PARTITION
+ Set if the handler sets all tables to be partitioned by default.
+*/
+
+static uint partition_flags()
+{
+ return HA_CAN_PARTITION;
+}
+
+static uint alter_table_flags(uint flags __attribute__((unused)))
+{
+ return (HA_PARTITION_FUNCTION_SUPPORTED |
+ HA_FAST_CHANGE_PARTITION);
+}
+
+const uint ha_partition::NO_CURRENT_PART_ID= 0xFFFFFFFF;
+
+/*
+ Constructor method
+
+ SYNOPSIS
+ ha_partition()
+ table Table object
+
+ RETURN VALUE
+ NONE
+*/
+
+ha_partition::ha_partition(handlerton *hton, TABLE_SHARE *share)
+ :handler(hton, share), m_part_info(NULL), m_create_handler(FALSE),
+ m_is_sub_partitioned(0), is_clone(FALSE)
+{
+ DBUG_ENTER("ha_partition::ha_partition(table)");
+ init_handler_variables();
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Constructor method
+
+ SYNOPSIS
+ ha_partition()
+ part_info Partition info
+
+ RETURN VALUE
+ NONE
+*/
+
+ha_partition::ha_partition(handlerton *hton, partition_info *part_info)
+ :handler(hton, NULL), m_part_info(part_info),
+ m_create_handler(TRUE),
+ m_is_sub_partitioned(m_part_info->is_sub_partitioned()), is_clone(FALSE)
+{
+ DBUG_ENTER("ha_partition::ha_partition(part_info)");
+ init_handler_variables();
+ DBUG_ASSERT(m_part_info);
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Initialise handler object
+
+ SYNOPSIS
+ init_handler_variables()
+
+ RETURN VALUE
+ NONE
+*/
+
+void ha_partition::init_handler_variables()
+{
+ active_index= MAX_KEY;
+ m_mode= 0;
+ m_open_test_lock= 0;
+ m_file_buffer= NULL;
+ m_name_buffer_ptr= NULL;
+ m_engine_array= NULL;
+ m_file= NULL;
+ m_file_tot_parts= 0;
+ m_reorged_file= NULL;
+ m_new_file= NULL;
+ m_reorged_parts= 0;
+ m_added_file= NULL;
+ m_tot_parts= 0;
+ m_pkey_is_clustered= 0;
+ m_lock_type= F_UNLCK;
+ m_part_spec.start_part= NO_CURRENT_PART_ID;
+ m_scan_value= 2;
+ m_ref_length= 0;
+ m_part_spec.end_part= NO_CURRENT_PART_ID;
+ m_index_scan_type= partition_no_index_scan;
+ m_start_key.key= NULL;
+ m_start_key.length= 0;
+ m_myisam= FALSE;
+ m_innodb= FALSE;
+ m_extra_cache= FALSE;
+ m_extra_cache_size= 0;
+ m_table_flags= HA_FILE_BASED | HA_REC_NOT_IN_SEQ;
+ m_low_byte_first= 1;
+ m_part_field_array= NULL;
+ m_ordered_rec_buffer= NULL;
+ m_top_entry= NO_CURRENT_PART_ID;
+ m_rec_length= 0;
+ m_last_part= 0;
+ m_rec0= 0;
+ m_curr_key_info= 0;
+ /*
+ this allows blackhole to work properly
+ */
+ m_no_locks= 0;
+
+#ifdef DONT_HAVE_TO_BE_INITALIZED
+ m_start_key.flag= 0;
+ m_ordered= TRUE;
+#endif
+}
+
+
+const char *ha_partition::table_type() const
+{
+ // we can do this since we only support a single engine type
+ return m_file[0]->table_type();
+}
+
+
+/*
+ Destructor method
+
+ SYNOPSIS
+ ~ha_partition()
+
+ RETURN VALUE
+ NONE
+*/
+
+ha_partition::~ha_partition()
+{
+ DBUG_ENTER("ha_partition::~ha_partition()");
+ if (m_file != NULL)
+ {
+ uint i;
+ for (i= 0; i < m_tot_parts; i++)
+ delete m_file[i];
+ }
+ my_free((char*) m_ordered_rec_buffer, MYF(MY_ALLOW_ZERO_PTR));
+
+ clear_handler_file();
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Initialise partition handler object
+
+ SYNOPSIS
+ initialise_partition()
+ mem_root Allocate memory through this
+
+ RETURN VALUE
+ 1 Error
+ 0 Success
+
+ DESCRIPTION
+
+ The partition handler is only a layer on top of other engines. Thus it
+ can't really perform anything without the underlying handlers. We
+ therefore add this method as part of the allocation of a handler object.
+
+ 1) Allocation of underlying handlers
+ If we have access to the partition info we will allocate one handler
+ instance for each partition.
+ 2) Allocation without partition info
+ The case where we don't have access to this information is when called
+ in preparation for delete_table and rename_table, and in that case we
+ only need to set HA_FILE_BASED. In that case we will use the .par file
+ that contains information about the partitions and their engines and
+ the names of each partition.
+ 3) Table flags initialisation
+ We need also to set table flags for the partition handler. This is not
+ static since it depends on what storage engines are used as underlying
+ handlers.
+ The table flags are set in this routine to simulate the behaviour of a
+ normal storage engine.
+ The flag HA_FILE_BASED will be set independent of the underlying handlers.
+ 4) Index flags initialisation
+ When knowledge exists on the indexes it is also possible to initialise the
+ index flags. Again the index flags must be initialised by using the under-
+ lying handlers since this is storage engine dependent.
+ The flag HA_READ_ORDER will be reset for the time being to indicate no
+ ordered output is available from partition handler indexes. Later a merge
+ sort will be performed using the underlying handlers.
+ 5) primary_key_is_clustered, has_transactions and low_byte_first are
+ calculated here.
+
+*/
+
+bool ha_partition::initialise_partition(MEM_ROOT *mem_root)
+{
+ handler **file_array, *file;
+ DBUG_ENTER("ha_partition::initialise_partition");
+
+ if (m_create_handler)
+ {
+ m_tot_parts= m_part_info->get_tot_partitions();
+ DBUG_ASSERT(m_tot_parts > 0);
+ if (new_handlers_from_part_info(mem_root))
+ DBUG_RETURN(1);
+ }
+ else if (!table_share || !table_share->normalized_path.str)
+ {
+ /*
+ Called with dummy table share (delete, rename and alter table)
+ Don't need to set-up table flags other than
+ HA_FILE_BASED here
+ */
+ m_table_flags|= HA_FILE_BASED | HA_REC_NOT_IN_SEQ;
+ DBUG_RETURN(0);
+ }
+ else if (get_from_handler_file(table_share->normalized_path.str, mem_root))
+ {
+ mem_alloc_error(2);
+ DBUG_RETURN(1);
+ }
+ /*
+ We create all underlying table handlers here. We do it in this special
+ method to be able to report allocation errors.
+
+ Set up table_flags, low_byte_first, primary_key_is_clustered and
+ has_transactions since they are called often in all kinds of places,
+ other parameters are calculated on demand.
+ HA_FILE_BASED is always set for partition handler since we use a
+ special file for handling names of partitions, engine types.
+ HA_CAN_GEOMETRY, HA_CAN_FULLTEXT, HA_CAN_SQL_HANDLER, HA_DUPLICATE_POS,
+ HA_CAN_INSERT_DELAYED are disabled until further investigation.
+ */
+ m_table_flags= (ulong)m_file[0]->ha_table_flags();
+ m_low_byte_first= m_file[0]->low_byte_first();
+ m_pkey_is_clustered= TRUE;
+ file_array= m_file;
+ do
+ {
+ file= *file_array;
+ if (m_low_byte_first != file->low_byte_first())
+ {
+ // Cannot have handlers with different endian
+ my_error(ER_MIX_HANDLER_ERROR, MYF(0));
+ DBUG_RETURN(1);
+ }
+ if (!file->primary_key_is_clustered())
+ m_pkey_is_clustered= FALSE;
+ m_table_flags&= file->ha_table_flags();
+ } while (*(++file_array));
+ m_table_flags&= ~(HA_CAN_GEOMETRY | HA_CAN_FULLTEXT | HA_DUPLICATE_POS |
+ HA_CAN_SQL_HANDLER | HA_CAN_INSERT_DELAYED |
+ HA_PRIMARY_KEY_REQUIRED_FOR_POSITION);
+ m_table_flags|= HA_FILE_BASED | HA_REC_NOT_IN_SEQ;
+ DBUG_RETURN(0);
+}
+
+/****************************************************************************
+ MODULE meta data changes
+****************************************************************************/
+/*
+ Delete a table
+
+ SYNOPSIS
+ delete_table()
+ name Full path of table name
+
+ RETURN VALUE
+ >0 Error
+ 0 Success
+
+ DESCRIPTION
+ Used to delete a table. By the time delete_table() has been called all
+ opened references to this table will have been closed (and your globally
+ shared references released). The variable name will just be the name of
+ the table. You will need to remove any files you have created at this
+ point.
+
+ If you do not implement this, the default delete_table() is called from
+ handler.cc and it will delete all files with the file extensions returned
+ by bas_ext().
+
+ Called from handler.cc by delete_table and ha_create_table(). Only used
+ during create if the table_flag HA_DROP_BEFORE_CREATE was specified for
+ the storage engine.
+*/
+
+int ha_partition::delete_table(const char *name)
+{
+ int error;
+ DBUG_ENTER("ha_partition::delete_table");
+
+ if ((error= del_ren_cre_table(name, NULL, NULL, NULL)))
+ DBUG_RETURN(error);
+ DBUG_RETURN(handler::delete_table(name));
+}
+
+
+/*
+ Rename a table
+
+ SYNOPSIS
+ rename_table()
+ from Full path of old table name
+ to Full path of new table name
+
+ RETURN VALUE
+ >0 Error
+ 0 Success
+
+ DESCRIPTION
+ Renames a table from one name to another from alter table call.
+
+ If you do not implement this, the default rename_table() is called from
+ handler.cc and it will rename all files with the file extensions returned
+ by bas_ext().
+
+ Called from sql_table.cc by mysql_rename_table().
+*/
+
+int ha_partition::rename_table(const char *from, const char *to)
+{
+ int error;
+ DBUG_ENTER("ha_partition::rename_table");
+
+ if ((error= del_ren_cre_table(from, to, NULL, NULL)))
+ DBUG_RETURN(error);
+ DBUG_RETURN(handler::rename_table(from, to));
+}
+
+
+/*
+ Create the handler file (.par-file)
+
+ SYNOPSIS
+ create_handler_files()
+ name Full path of table name
+ create_info Create info generated for CREATE TABLE
+
+ RETURN VALUE
+ >0 Error
+ 0 Success
+
+ DESCRIPTION
+ create_handler_files is called to create any handler specific files
+ before opening the file with openfrm to later call ::create on the
+ file object.
+ In the partition handler this is used to store the names of partitions
+ and types of engines in the partitions.
+*/
+
+int ha_partition::create_handler_files(const char *path,
+ const char *old_path,
+ int action_flag,
+ HA_CREATE_INFO *create_info)
+{
+ DBUG_ENTER("ha_partition::create_handler_files()");
+
+ /*
+ We need to update total number of parts since we might write the handler
+ file as part of a partition management command
+ */
+ if (action_flag == CHF_DELETE_FLAG ||
+ action_flag == CHF_RENAME_FLAG)
+ {
+ char name[FN_REFLEN];
+ char old_name[FN_REFLEN];
+
+ strxmov(name, path, ha_par_ext, NullS);
+ strxmov(old_name, old_path, ha_par_ext, NullS);
+ if ((action_flag == CHF_DELETE_FLAG &&
+ my_delete(name, MYF(MY_WME))) ||
+ (action_flag == CHF_RENAME_FLAG &&
+ my_rename(old_name, name, MYF(MY_WME))))
+ {
+ DBUG_RETURN(TRUE);
+ }
+ }
+ else if (action_flag == CHF_CREATE_FLAG)
+ {
+ if (create_handler_file(path))
+ {
+ my_error(ER_CANT_CREATE_HANDLER_FILE, MYF(0));
+ DBUG_RETURN(1);
+ }
+ }
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Create a partitioned table
+
+ SYNOPSIS
+ create()
+ name Full path of table name
+ table_arg Table object
+ create_info Create info generated for CREATE TABLE
+
+ RETURN VALUE
+ >0 Error
+ 0 Success
+
+ DESCRIPTION
+ create() is called to create a table. The variable name will have the name
+ of the table. When create() is called you do not need to worry about
+ opening the table. Also, the FRM file will have already been created so
+ adjusting create_info will not do you any good. You can overwrite the frm
+ file at this point if you wish to change the table definition, but there
+ are no methods currently provided for doing that.
+
+ Called from handler.cc by ha_create_table().
+*/
+
+int ha_partition::create(const char *name, TABLE *table_arg,
+ HA_CREATE_INFO *create_info)
+{
+ char t_name[FN_REFLEN];
+ DBUG_ENTER("ha_partition::create");
+
+ strmov(t_name, name);
+ DBUG_ASSERT(*fn_rext((char*)name) == '\0');
+ if (del_ren_cre_table(t_name, NULL, table_arg, create_info))
+ {
+ handler::delete_table(t_name);
+ DBUG_RETURN(1);
+ }
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Drop partitions as part of ALTER TABLE of partitions
+
+ SYNOPSIS
+ drop_partitions()
+ path Complete path of db and table name
+
+ RETURN VALUE
+ >0 Failure
+ 0 Success
+
+ DESCRIPTION
+ Use part_info object on handler object to deduce which partitions to
+ drop (each partition has a state attached to it)
+*/
+
+int ha_partition::drop_partitions(const char *path)
+{
+ List_iterator<partition_element> part_it(m_part_info->partitions);
+ char part_name_buff[FN_REFLEN];
+ uint no_parts= m_part_info->partitions.elements;
+ uint no_subparts= m_part_info->no_subparts;
+ uint i= 0;
+ uint name_variant;
+ int ret_error;
+ int error= 0;
+ DBUG_ENTER("ha_partition::drop_partitions");
+
+ /*
+ Assert that it works without HA_FILE_BASED and lower_case_table_names = 2.
+ We use m_file[0] as long as all partitions have the same storage engine.
+ */
+ DBUG_ASSERT(!strcmp(path, get_canonical_filename(m_file[0], path,
+ part_name_buff)));
+ do
+ {
+ partition_element *part_elem= part_it++;
+ if (part_elem->part_state == PART_TO_BE_DROPPED)
+ {
+ handler *file;
+ /*
+ This part is to be dropped, meaning the part or all its subparts.
+ */
+ name_variant= NORMAL_PART_NAME;
+ if (m_is_sub_partitioned)
+ {
+ List_iterator<partition_element> sub_it(part_elem->subpartitions);
+ uint j= 0, part;
+ do
+ {
+ partition_element *sub_elem= sub_it++;
+ part= i * no_subparts + j;
+ create_subpartition_name(part_name_buff, path,
+ part_elem->partition_name,
+ sub_elem->partition_name, name_variant);
+ file= m_file[part];
+ DBUG_PRINT("info", ("Drop subpartition %s", part_name_buff));
+ if ((ret_error= file->ha_delete_table(part_name_buff)))
+ error= ret_error;
+ if (deactivate_ddl_log_entry(sub_elem->log_entry->entry_pos))
+ error= 1;
+ } while (++j < no_subparts);
+ }
+ else
+ {
+ create_partition_name(part_name_buff, path,
+ part_elem->partition_name, name_variant,
+ TRUE);
+ file= m_file[i];
+ DBUG_PRINT("info", ("Drop partition %s", part_name_buff));
+ if ((ret_error= file->ha_delete_table(part_name_buff)))
+ error= ret_error;
+ if (deactivate_ddl_log_entry(part_elem->log_entry->entry_pos))
+ error= 1;
+ }
+ if (part_elem->part_state == PART_IS_CHANGED)
+ part_elem->part_state= PART_NORMAL;
+ else
+ part_elem->part_state= PART_IS_DROPPED;
+ }
+ } while (++i < no_parts);
+ VOID(sync_ddl_log());
+ DBUG_RETURN(error);
+}
+
+
+/*
+ Rename partitions as part of ALTER TABLE of partitions
+
+ SYNOPSIS
+ rename_partitions()
+ path Complete path of db and table name
+
+ RETURN VALUE
+ TRUE Failure
+ FALSE Success
+
+ DESCRIPTION
+ When reorganising partitions, adding hash partitions and coalescing
+ partitions it can be necessary to rename partitions while holding
+ an exclusive lock on the table.
+ Which partitions to rename is given by the state of the partitions found
+ in the partition info struct referenced from the handler object.
+*/
+
+int ha_partition::rename_partitions(const char *path)
+{
+ List_iterator<partition_element> part_it(m_part_info->partitions);
+ List_iterator<partition_element> temp_it(m_part_info->temp_partitions);
+ char part_name_buff[FN_REFLEN];
+ char norm_name_buff[FN_REFLEN];
+ uint no_parts= m_part_info->partitions.elements;
+ uint part_count= 0;
+ uint no_subparts= m_part_info->no_subparts;
+ uint i= 0;
+ uint j= 0;
+ int error= 0;
+ int ret_error;
+ uint temp_partitions= m_part_info->temp_partitions.elements;
+ handler *file;
+ partition_element *part_elem, *sub_elem;
+ DBUG_ENTER("ha_partition::rename_partitions");
+
+ /*
+ Assert that it works without HA_FILE_BASED and lower_case_table_names = 2.
+ We use m_file[0] as long as all partitions have the same storage engine.
+ */
+ DBUG_ASSERT(!strcmp(path, get_canonical_filename(m_file[0], path,
+ norm_name_buff)));
+
+ if (temp_partitions)
+ {
+ /*
+ These are the reorganised partitions that have already been copied.
+ We delete the partitions and log the delete by inactivating the
+ delete log entry in the table log. We only need to synchronise
+ these writes before moving to the next loop since there is no
+ interaction among reorganised partitions, they cannot have the
+ same name.
+ */
+ do
+ {
+ part_elem= temp_it++;
+ if (m_is_sub_partitioned)
+ {
+ List_iterator<partition_element> sub_it(part_elem->subpartitions);
+ do
+ {
+ sub_elem= sub_it++;
+ file= m_reorged_file[part_count++];
+ create_subpartition_name(norm_name_buff, path,
+ part_elem->partition_name,
+ sub_elem->partition_name,
+ NORMAL_PART_NAME);
+ DBUG_PRINT("info", ("Delete subpartition %s", norm_name_buff));
+ if ((ret_error= file->ha_delete_table(norm_name_buff)))
+ error= ret_error;
+ else if (deactivate_ddl_log_entry(sub_elem->log_entry->entry_pos))
+ error= 1;
+ else
+ sub_elem->log_entry= NULL; /* Indicate success */
+ } while (++j < no_subparts);
+ }
+ else
+ {
+ file= m_reorged_file[part_count++];
+ create_partition_name(norm_name_buff, path,
+ part_elem->partition_name, NORMAL_PART_NAME,
+ TRUE);
+ DBUG_PRINT("info", ("Delete partition %s", norm_name_buff));
+ if ((ret_error= file->ha_delete_table(norm_name_buff)))
+ error= ret_error;
+ else if (deactivate_ddl_log_entry(part_elem->log_entry->entry_pos))
+ error= 1;
+ else
+ part_elem->log_entry= NULL; /* Indicate success */
+ }
+ } while (++i < temp_partitions);
+ VOID(sync_ddl_log());
+ }
+ i= 0;
+ do
+ {
+ /*
+ When state is PART_IS_CHANGED it means that we have created a new
+ TEMP partition that is to be renamed to normal partition name and
+ we are to delete the old partition that currently has the normal name.
+
+ We perform this operation by
+ 1) Delete old partition with normal partition name
+ 2) Signal this in table log entry
+ 3) Sync the table log to ensure we have consistency in case of crashes
+ 4) Rename temporary partition name to normal partition name
+ 5) Signal this to table log entry
+ It is not necessary to sync the last state since a new rename
+ should not corrupt things if there was no temporary partition.
+
+ The only other parts we need to cater for are new parts that
+ replace reorganised parts. The reorganised parts were deleted
+ by the code above that goes through the temp_partitions list.
+ Thus the sync above makes it safe to simply perform step 4 and 5
+ for those entries.
+ */
+ part_elem= part_it++;
+ if (part_elem->part_state == PART_IS_CHANGED ||
+ part_elem->part_state == PART_TO_BE_DROPPED ||
+ (part_elem->part_state == PART_IS_ADDED && temp_partitions))
+ {
+ if (m_is_sub_partitioned)
+ {
+ List_iterator<partition_element> sub_it(part_elem->subpartitions);
+ uint part;
+
+ j= 0;
+ do
+ {
+ sub_elem= sub_it++;
+ part= i * no_subparts + j;
+ create_subpartition_name(norm_name_buff, path,
+ part_elem->partition_name,
+ sub_elem->partition_name,
+ NORMAL_PART_NAME);
+ if (part_elem->part_state == PART_IS_CHANGED)
+ {
+ file= m_reorged_file[part_count++];
+ DBUG_PRINT("info", ("Delete subpartition %s", norm_name_buff));
+ if ((ret_error= file->ha_delete_table(norm_name_buff)))
+ error= ret_error;
+ else if (deactivate_ddl_log_entry(sub_elem->log_entry->entry_pos))
+ error= 1;
+ VOID(sync_ddl_log());
+ }
+ file= m_new_file[part];
+ create_subpartition_name(part_name_buff, path,
+ part_elem->partition_name,
+ sub_elem->partition_name,
+ TEMP_PART_NAME);
+ DBUG_PRINT("info", ("Rename subpartition from %s to %s",
+ part_name_buff, norm_name_buff));
+ if ((ret_error= file->ha_rename_table(part_name_buff,
+ norm_name_buff)))
+ error= ret_error;
+ else if (deactivate_ddl_log_entry(sub_elem->log_entry->entry_pos))
+ error= 1;
+ else
+ sub_elem->log_entry= NULL;
+ } while (++j < no_subparts);
+ }
+ else
+ {
+ create_partition_name(norm_name_buff, path,
+ part_elem->partition_name, NORMAL_PART_NAME,
+ TRUE);
+ if (part_elem->part_state == PART_IS_CHANGED)
+ {
+ file= m_reorged_file[part_count++];
+ DBUG_PRINT("info", ("Delete partition %s", norm_name_buff));
+ if ((ret_error= file->ha_delete_table(norm_name_buff)))
+ error= ret_error;
+ else if (deactivate_ddl_log_entry(part_elem->log_entry->entry_pos))
+ error= 1;
+ VOID(sync_ddl_log());
+ }
+ file= m_new_file[i];
+ create_partition_name(part_name_buff, path,
+ part_elem->partition_name, TEMP_PART_NAME,
+ TRUE);
+ DBUG_PRINT("info", ("Rename partition from %s to %s",
+ part_name_buff, norm_name_buff));
+ if ((ret_error= file->ha_rename_table(part_name_buff,
+ norm_name_buff)))
+ error= ret_error;
+ else if (deactivate_ddl_log_entry(part_elem->log_entry->entry_pos))
+ error= 1;
+ else
+ part_elem->log_entry= NULL;
+ }
+ }
+ } while (++i < no_parts);
+ VOID(sync_ddl_log());
+ DBUG_RETURN(error);
+}
+
+
+#define OPTIMIZE_PARTS 1
+#define ANALYZE_PARTS 2
+#define CHECK_PARTS 3
+#define REPAIR_PARTS 4
+
+static const char *opt_op_name[]= {NULL,
+ "optimize", "analyze", "check", "repair" };
+
+/*
+ Optimize table
+
+ SYNOPSIS
+ optimize()
+ thd Thread object
+ check_opt Check/analyze/repair/optimize options
+
+ RETURN VALUES
+ >0 Error
+ 0 Success
+*/
+
+int ha_partition::optimize(THD *thd, HA_CHECK_OPT *check_opt)
+{
+ DBUG_ENTER("ha_partition::optimize");
+
+ DBUG_RETURN(handle_opt_partitions(thd, check_opt,
+ OPTIMIZE_PARTS,
+ thd->lex->alter_info.flags &
+ ALTER_OPTIMIZE_PARTITION ? FALSE : TRUE));
+}
+
+
+/*
+ Analyze table
+
+ SYNOPSIS
+ analyze()
+ thd Thread object
+ check_opt Check/analyze/repair/optimize options
+
+ RETURN VALUES
+ >0 Error
+ 0 Success
+*/
+
+int ha_partition::analyze(THD *thd, HA_CHECK_OPT *check_opt)
+{
+ DBUG_ENTER("ha_partition::analyze");
+
+ DBUG_RETURN(handle_opt_partitions(thd, check_opt,
+ ANALYZE_PARTS,
+ thd->lex->alter_info.flags &
+ ALTER_ANALYZE_PARTITION ? FALSE : TRUE));
+}
+
+
+/*
+ Check table
+
+ SYNOPSIS
+ check()
+ thd Thread object
+ check_opt Check/analyze/repair/optimize options
+
+ RETURN VALUES
+ >0 Error
+ 0 Success
+*/
+
+int ha_partition::check(THD *thd, HA_CHECK_OPT *check_opt)
+{
+ DBUG_ENTER("ha_partition::check");
+
+ DBUG_RETURN(handle_opt_partitions(thd, check_opt,
+ CHECK_PARTS,
+ thd->lex->alter_info.flags &
+ ALTER_CHECK_PARTITION ? FALSE : TRUE));
+}
+
+
+/*
+ Repair table
+
+ SYNOPSIS
+ repair()
+ thd Thread object
+ check_opt Check/analyze/repair/optimize options
+
+ RETURN VALUES
+ >0 Error
+ 0 Success
+*/
+
+int ha_partition::repair(THD *thd, HA_CHECK_OPT *check_opt)
+{
+ DBUG_ENTER("ha_partition::repair");
+
+ DBUG_RETURN(handle_opt_partitions(thd, check_opt,
+ REPAIR_PARTS,
+ thd->lex->alter_info.flags &
+ ALTER_REPAIR_PARTITION ? FALSE : TRUE));
+}
+
+/*
+ Handle optimize/analyze/check/repair of one partition
+
+ SYNOPSIS
+ handle_opt_part()
+ thd Thread object
+ check_opt Options
+ file Handler object of partition
+ flag Optimize/Analyze/Check/Repair flag
+
+ RETURN VALUE
+ >0 Failure
+ 0 Success
+*/
+
+static int handle_opt_part(THD *thd, HA_CHECK_OPT *check_opt,
+ handler *file, uint flag)
+{
+ int error;
+ DBUG_ENTER("handle_opt_part");
+ DBUG_PRINT("enter", ("flag = %u", flag));
+
+ if (flag == OPTIMIZE_PARTS)
+ error= file->ha_optimize(thd, check_opt);
+ else if (flag == ANALYZE_PARTS)
+ error= file->ha_analyze(thd, check_opt);
+ else if (flag == CHECK_PARTS)
+ error= file->ha_check(thd, check_opt);
+ else if (flag == REPAIR_PARTS)
+ error= file->ha_repair(thd, check_opt);
+ else
+ {
+ DBUG_ASSERT(FALSE);
+ error= 1;
+ }
+ if (error == HA_ADMIN_ALREADY_DONE)
+ error= 0;
+ DBUG_RETURN(error);
+}
+
+
+/*
+ print a message row formatted for ANALYZE/CHECK/OPTIMIZE/REPAIR TABLE
+ (modelled after mi_check_print_msg)
+ TODO: move this into the handler, or rewrite mysql_admin_table.
+*/
+static bool print_admin_msg(THD* thd, const char* msg_type,
+ const char* db_name, const char* table_name,
+ const char* op_name, const char *fmt, ...)
+{
+ va_list args;
+ Protocol *protocol= thd->protocol;
+ uint length, msg_length;
+ char msgbuf[MI_MAX_MSG_BUF];
+ char name[NAME_LEN*2+2];
+
+ va_start(args, fmt);
+ msg_length= my_vsnprintf(msgbuf, sizeof(msgbuf), fmt, args);
+ va_end(args);
+ msgbuf[sizeof(msgbuf) - 1] = 0; // healthy paranoia
+
+
+ if (!thd->vio_ok())
+ {
+ sql_print_error("%s", msgbuf); /* don't use msgbuf as a format string */
+ return TRUE;
+ }
+
+ length=(uint) (strxmov(name, db_name, ".", table_name,NullS) - name);
+ /*
+ TODO: switch from protocol to push_warning here. The main reason we didn't
+ do it yet is parallel repair, due to the following trace:
+ mi_check_print_msg/push_warning/sql_alloc/my_pthread_getspecific_ptr.
+
+ Also we likely need to lock mutex here (in both cases with protocol and
+ push_warning).
+ */
+ DBUG_PRINT("info",("print_admin_msg: %s, %s, %s, %s", name, op_name,
+ msg_type, msgbuf));
+ protocol->prepare_for_resend();
+ protocol->store(name, length, system_charset_info);
+ protocol->store(op_name, system_charset_info);
+ protocol->store(msg_type, system_charset_info);
+ protocol->store(msgbuf, msg_length, system_charset_info);
+ if (protocol->write())
+ {
+ sql_print_error("Failed on my_net_write, writing to stderr instead: %s\n",
+ msgbuf);
+ return TRUE;
+ }
+ return FALSE;
+}
+
+
+/*
+ Handle optimize/analyze/check/repair of partitions
+
+ SYNOPSIS
+ handle_opt_partitions()
+ thd Thread object
+ check_opt Options
+ flag Optimize/Analyze/Check/Repair flag
+ all_parts All partitions or only a subset
+
+ RETURN VALUE
+ >0 Failure
+ 0 Success
+*/
+
+int ha_partition::handle_opt_partitions(THD *thd, HA_CHECK_OPT *check_opt,
+ uint flag, bool all_parts)
+{
+ List_iterator<partition_element> part_it(m_part_info->partitions);
+ uint no_parts= m_part_info->no_parts;
+ uint no_subparts= m_part_info->no_subparts;
+ uint i= 0;
+ int error;
+ DBUG_ENTER("ha_partition::handle_opt_partitions");
+ DBUG_PRINT("enter", ("all_parts %u, flag= %u", all_parts, flag));
+
+ do
+ {
+ partition_element *part_elem= part_it++;
+ /*
+ For ALTER TABLE <CMD> PARTITION ... we should only handle the named
+ partitions, otherwise all partitions are handled
+ */
+ if (all_parts ||
+ part_elem->part_state == PART_CHANGED)
+ {
+ if (m_is_sub_partitioned)
+ {
+ List_iterator<partition_element> subpart_it(part_elem->subpartitions);
+ partition_element *sub_elem;
+ uint j= 0, part;
+ do
+ {
+ sub_elem= subpart_it++;
+ part= i * no_subparts + j;
+ DBUG_PRINT("info", ("Optimize subpartition %u (%s)",
+ part, sub_elem->partition_name));
+#ifdef NOT_USED
+ if (print_admin_msg(thd, "note", table_share->db.str, table->alias,
+ opt_op_name[flag],
+ "Start to operate on subpartition %s",
+ sub_elem->partition_name))
+ DBUG_RETURN(HA_ADMIN_INTERNAL_ERROR);
+#endif
+ if ((error= handle_opt_part(thd, check_opt, m_file[part], flag)))
+ {
+ /* print a line stating which partition the error belongs to */
+ if (error != HA_ADMIN_NOT_IMPLEMENTED &&
+ error != HA_ADMIN_ALREADY_DONE &&
+ error != HA_ADMIN_TRY_ALTER)
+ {
+ print_admin_msg(thd, "error", table_share->db.str, table->alias,
+ opt_op_name[flag],
+ "Subpartition %s returned error",
+ sub_elem->partition_name);
+ }
+ DBUG_RETURN(error);
+ }
+ } while (++j < no_subparts);
+ }
+ else
+ {
+ DBUG_PRINT("info", ("Optimize partition %u (%s)", i,
+ part_elem->partition_name));
+#ifdef NOT_USED
+ if (print_admin_msg(thd, "note", table_share->db.str, table->alias,
+ opt_op_name[flag],
+ "Start to operate on partition %s",
+ part_elem->partition_name))
+ DBUG_RETURN(HA_ADMIN_INTERNAL_ERROR);
+#endif
+ if ((error= handle_opt_part(thd, check_opt, m_file[i], flag)))
+ {
+ /* print a line stating which partition the error belongs to */
+ if (error != HA_ADMIN_NOT_IMPLEMENTED &&
+ error != HA_ADMIN_ALREADY_DONE &&
+ error != HA_ADMIN_TRY_ALTER)
+ {
+ print_admin_msg(thd, "error", table_share->db.str, table->alias,
+ opt_op_name[flag], "Partition %s returned error",
+ part_elem->partition_name);
+ }
+ DBUG_RETURN(error);
+ }
+ }
+ }
+ } while (++i < no_parts);
+ DBUG_RETURN(FALSE);
+}
+
+
+/**
+ @brief Check and repair the table if necessary
+
+ @param thd Thread object
+
+ @retval TRUE Error/Not supported
+ @retval FALSE Success
+*/
+
+bool ha_partition::check_and_repair(THD *thd)
+{
+ handler **file= m_file;
+ DBUG_ENTER("ha_partition::check_and_repair");
+
+ do
+ {
+ if ((*file)->ha_check_and_repair(thd))
+ DBUG_RETURN(TRUE);
+ } while (*(++file));
+ DBUG_RETURN(FALSE);
+}
+
+
+/**
+ @brief Check if the table can be automatically repaired
+
+ @retval TRUE Can be auto repaired
+ @retval FALSE Cannot be auto repaired
+*/
+
+bool ha_partition::auto_repair() const
+{
+ DBUG_ENTER("ha_partition::auto_repair");
+
+ /*
+ As long as we only support one storage engine per table,
+ we can use the first partition for this function.
+ */
+ DBUG_RETURN(m_file[0]->auto_repair());
+}
+
+
+/**
+ @brief Check if the table is crashed
+
+ @retval TRUE Crashed
+ @retval FALSE Not crashed
+*/
+
+bool ha_partition::is_crashed() const
+{
+ handler **file= m_file;
+ DBUG_ENTER("ha_partition::is_crashed");
+
+ do
+ {
+ if ((*file)->is_crashed())
+ DBUG_RETURN(TRUE);
+ } while (*(++file));
+ DBUG_RETURN(FALSE);
+}
+
+
+/*
+ Prepare by creating a new partition
+
+ SYNOPSIS
+ prepare_new_partition()
+ table Table object
+ create_info Create info from CREATE TABLE
+ file Handler object of new partition
+ part_name partition name
+
+ RETURN VALUE
+ >0 Error
+ 0 Success
+*/
+
+int ha_partition::prepare_new_partition(TABLE *tbl,
+ HA_CREATE_INFO *create_info,
+ handler *file, const char *part_name,
+ partition_element *p_elem)
+{
+ int error;
+ bool create_flag= FALSE;
+ DBUG_ENTER("prepare_new_partition");
+
+ if ((error= set_up_table_before_create(tbl, part_name, create_info,
+ 0, p_elem)))
+ goto error;
+ if ((error= file->ha_create(part_name, tbl, create_info)))
+ goto error;
+ create_flag= TRUE;
+ if ((error= file->ha_open(tbl, part_name, m_mode, m_open_test_lock)))
+ goto error;
+ /*
+ Note: if you plan to add another call that may return failure,
+ better to do it before external_lock() as cleanup_new_partition()
+ assumes that external_lock() is last call that may fail here.
+ Otherwise see description for cleanup_new_partition().
+ */
+ if ((error= file->ha_external_lock(current_thd, m_lock_type)))
+ goto error;
+
+ DBUG_RETURN(0);
+error:
+ if (create_flag)
+ VOID(file->ha_delete_table(part_name));
+ DBUG_RETURN(error);
+}
+
+
+/*
+ Cleanup by removing all created partitions after error
+
+ SYNOPSIS
+ cleanup_new_partition()
+ part_count Number of partitions to remove
+
+ RETURN VALUE
+ NONE
+
+ DESCRIPTION
+ This function is called immediately after prepare_new_partition() in
+ case the latter fails.
+
+ In prepare_new_partition() last call that may return failure is
+ external_lock(). That means if prepare_new_partition() fails,
+ partition does not have external lock. Thus no need to call
+ external_lock(F_UNLCK) here.
+
+ TODO:
+ We must ensure that in the case that we get an error during the process
+ that we call external_lock with F_UNLCK, close the table and delete the
+ table in the case where we have been successful with
+ prepare_new_partition(). We solve this by keeping an array of successful
+ calls to prepare_new_partition() which can then be used to undo the call.
+*/
+
+void ha_partition::cleanup_new_partition(uint part_count)
+{
+ handler **save_m_file= m_file;
+ DBUG_ENTER("ha_partition::cleanup_new_partition");
+
+ if (m_added_file && m_added_file[0])
+ {
+ m_file= m_added_file;
+ m_added_file= NULL;
+
+ /* delete_table also needed, a bit more complex */
+ close();
+
+ m_added_file= m_file;
+ m_file= save_m_file;
+ }
+ DBUG_VOID_RETURN;
+}
+
+/*
+ Implement the partition changes defined by ALTER TABLE of partitions
+
+ SYNOPSIS
+ change_partitions()
+ create_info HA_CREATE_INFO object describing all
+ fields and indexes in table
+ path Complete path of db and table name
+ out: copied Output parameter where number of copied
+ records are added
+ out: deleted Output parameter where number of deleted
+ records are added
+ pack_frm_data Reference to packed frm file
+ pack_frm_len Length of packed frm file
+
+ RETURN VALUE
+ >0 Failure
+ 0 Success
+
+ DESCRIPTION
+ Add and copy a number of partitions if needed; during this operation
+ no other operation is ongoing in the server. This is used by all types
+ of ADD PARTITION as well as by REORGANIZE PARTITION. For
+ one-phased implementations it is also used by DROP and COALESCE
+ PARTITION.
+ One-phased implementations need the new frm file, other handlers will
+ get zero length and a NULL reference here.
+*/
+
+int ha_partition::change_partitions(HA_CREATE_INFO *create_info,
+ const char *path,
+ ulonglong *copied,
+ ulonglong *deleted,
+ const uchar *pack_frm_data
+ __attribute__((unused)),
+ size_t pack_frm_len
+ __attribute__((unused)))
+{
+ List_iterator<partition_element> part_it(m_part_info->partitions);
+ List_iterator <partition_element> t_it(m_part_info->temp_partitions);
+ char part_name_buff[FN_REFLEN];
+ uint no_parts= m_part_info->partitions.elements;
+ uint no_subparts= m_part_info->no_subparts;
+ uint i= 0;
+ uint no_remain_partitions, part_count, orig_count;
+ handler **new_file_array;
+ int error= 1;
+ bool first;
+ uint temp_partitions= m_part_info->temp_partitions.elements;
+ THD *thd= current_thd;
+ DBUG_ENTER("ha_partition::change_partitions");
+
+ /*
+ Assert that it works without HA_FILE_BASED and lower_case_table_names = 2.
+ We use m_file[0] as long as all partitions have the same storage engine.
+ */
+ DBUG_ASSERT(!strcmp(path, get_canonical_filename(m_file[0], path,
+ part_name_buff)));
+ m_reorged_parts= 0;
+ if (!m_part_info->is_sub_partitioned())
+ no_subparts= 1;
+
+ /*
+ Step 1:
+ Calculate number of reorganised partitions and allocate space for
+ their handler references.
+ */
+ if (temp_partitions)
+ {
+ m_reorged_parts= temp_partitions * no_subparts;
+ }
+ else
+ {
+ do
+ {
+ partition_element *part_elem= part_it++;
+ if (part_elem->part_state == PART_CHANGED ||
+ part_elem->part_state == PART_REORGED_DROPPED)
+ {
+ m_reorged_parts+= no_subparts;
+ }
+ } while (++i < no_parts);
+ }
+ if (m_reorged_parts &&
+ !(m_reorged_file= (handler**)sql_calloc(sizeof(handler*)*
+ (m_reorged_parts + 1))))
+ {
+ mem_alloc_error(sizeof(handler*)*(m_reorged_parts+1));
+ DBUG_RETURN(ER_OUTOFMEMORY);
+ }
+
+ /*
+ Step 2:
+ Calculate number of partitions after change and allocate space for
+ their handler references.
+ */
+ no_remain_partitions= 0;
+ if (temp_partitions)
+ {
+ no_remain_partitions= no_parts * no_subparts;
+ }
+ else
+ {
+ part_it.rewind();
+ i= 0;
+ do
+ {
+ partition_element *part_elem= part_it++;
+ if (part_elem->part_state == PART_NORMAL ||
+ part_elem->part_state == PART_TO_BE_ADDED ||
+ part_elem->part_state == PART_CHANGED)
+ {
+ no_remain_partitions+= no_subparts;
+ }
+ } while (++i < no_parts);
+ }
+ if (!(new_file_array= (handler**)sql_calloc(sizeof(handler*)*
+ (2*(no_remain_partitions + 1)))))
+ {
+ mem_alloc_error(sizeof(handler*)*2*(no_remain_partitions+1));
+ DBUG_RETURN(ER_OUTOFMEMORY);
+ }
+ m_added_file= &new_file_array[no_remain_partitions + 1];
+
+ /*
+ Step 3:
+ Fill m_reorged_file with handler references and NULL at the end
+ */
+ if (m_reorged_parts)
+ {
+ i= 0;
+ part_count= 0;
+ first= TRUE;
+ part_it.rewind();
+ do
+ {
+ partition_element *part_elem= part_it++;
+ if (part_elem->part_state == PART_CHANGED ||
+ part_elem->part_state == PART_REORGED_DROPPED)
+ {
+ memcpy((void*)&m_reorged_file[part_count],
+ (void*)&m_file[i*no_subparts],
+ sizeof(handler*)*no_subparts);
+ part_count+= no_subparts;
+ }
+ else if (first && temp_partitions &&
+ part_elem->part_state == PART_TO_BE_ADDED)
+ {
+ /*
+ When doing an ALTER TABLE REORGANIZE PARTITION a number of
+ partitions is to be reorganised into a set of new partitions.
+ The reorganised partitions are in this case in the temp_partitions
+ list. We copy all of them in one batch and thus we only do this
+ until we find the first partition with state PART_TO_BE_ADDED
+ since this is where the new partitions go in and where the old
+ ones used to be.
+ */
+ first= FALSE;
+ DBUG_ASSERT(((i*no_subparts) + m_reorged_parts) <= m_file_tot_parts);
+ memcpy((void*)m_reorged_file, &m_file[i*no_subparts],
+ sizeof(handler*)*m_reorged_parts);
+ }
+ } while (++i < no_parts);
+ }
+
+ /*
+ Step 4:
+ Fill new_file_array with handler references. Create the handlers if
+ needed.
+ */
+ i= 0;
+ part_count= 0;
+ orig_count= 0;
+ first= TRUE;
+ part_it.rewind();
+ do
+ {
+ partition_element *part_elem= part_it++;
+ if (part_elem->part_state == PART_NORMAL)
+ {
+ DBUG_ASSERT(orig_count + no_subparts <= m_file_tot_parts);
+ memcpy((void*)&new_file_array[part_count], (void*)&m_file[orig_count],
+ sizeof(handler*)*no_subparts);
+ part_count+= no_subparts;
+ orig_count+= no_subparts;
+ }
+ else if (part_elem->part_state == PART_CHANGED ||
+ part_elem->part_state == PART_TO_BE_ADDED)
+ {
+ uint j= 0;
+ do
+ {
+ if (!(new_file_array[part_count++]=
+ get_new_handler(table->s,
+ thd->mem_root,
+ part_elem->engine_type)))
+ {
+ mem_alloc_error(sizeof(handler));
+ DBUG_RETURN(ER_OUTOFMEMORY);
+ }
+ } while (++j < no_subparts);
+ if (part_elem->part_state == PART_CHANGED)
+ orig_count+= no_subparts;
+ else if (temp_partitions && first)
+ {
+ orig_count+= (no_subparts * temp_partitions);
+ first= FALSE;
+ }
+ }
+ } while (++i < no_parts);
+ first= FALSE;
+ /*
+ Step 5:
+ Create the new partitions and also open, lock and call external_lock
+ on them to prepare them for copy phase and also for later close
+ calls
+ */
+ i= 0;
+ part_count= 0;
+ part_it.rewind();
+ do
+ {
+ partition_element *part_elem= part_it++;
+ if (part_elem->part_state == PART_TO_BE_ADDED ||
+ part_elem->part_state == PART_CHANGED)
+ {
+ /*
+ A new partition needs to be created: PART_TO_BE_ADDED means an
+ entirely new partition and PART_CHANGED means a changed partition
+ that will still exist with either more or less data in it.
+ */
+ uint name_variant= NORMAL_PART_NAME;
+ if (part_elem->part_state == PART_CHANGED ||
+ (part_elem->part_state == PART_TO_BE_ADDED && temp_partitions))
+ name_variant= TEMP_PART_NAME;
+ if (m_part_info->is_sub_partitioned())
+ {
+ List_iterator<partition_element> sub_it(part_elem->subpartitions);
+ uint j= 0, part;
+ do
+ {
+ partition_element *sub_elem= sub_it++;
+ create_subpartition_name(part_name_buff, path,
+ part_elem->partition_name,
+ sub_elem->partition_name,
+ name_variant);
+ part= i * no_subparts + j;
+ DBUG_PRINT("info", ("Add subpartition %s", part_name_buff));
+ if ((error= prepare_new_partition(table, create_info,
+ new_file_array[part],
+ (const char *)part_name_buff,
+ sub_elem)))
+ {
+ cleanup_new_partition(part_count);
+ DBUG_RETURN(error);
+ }
+ m_added_file[part_count++]= new_file_array[part];
+ } while (++j < no_subparts);
+ }
+ else
+ {
+ create_partition_name(part_name_buff, path,
+ part_elem->partition_name, name_variant,
+ TRUE);
+ DBUG_PRINT("info", ("Add partition %s", part_name_buff));
+ if ((error= prepare_new_partition(table, create_info,
+ new_file_array[i],
+ (const char *)part_name_buff,
+ part_elem)))
+ {
+ cleanup_new_partition(part_count);
+ DBUG_RETURN(error);
+ }
+ m_added_file[part_count++]= new_file_array[i];
+ }
+ }
+ } while (++i < no_parts);
+
+ /*
+ Step 6:
+ State update to prepare for next write of the frm file.
+ */
+ i= 0;
+ part_it.rewind();
+ do
+ {
+ partition_element *part_elem= part_it++;
+ if (part_elem->part_state == PART_TO_BE_ADDED)
+ part_elem->part_state= PART_IS_ADDED;
+ else if (part_elem->part_state == PART_CHANGED)
+ part_elem->part_state= PART_IS_CHANGED;
+ else if (part_elem->part_state == PART_REORGED_DROPPED)
+ part_elem->part_state= PART_TO_BE_DROPPED;
+ } while (++i < no_parts);
+ for (i= 0; i < temp_partitions; i++)
+ {
+ partition_element *part_elem= t_it++;
+ DBUG_ASSERT(part_elem->part_state == PART_TO_BE_REORGED);
+ part_elem->part_state= PART_TO_BE_DROPPED;
+ }
+ m_new_file= new_file_array;
+ DBUG_RETURN(copy_partitions(copied, deleted));
+}
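+
+/*
+  Illustrative example of the state flow above (a sketch, not additional
+  logic): for ALTER TABLE ... REORGANIZE PARTITION, the reorganised
+  partitions move PART_TO_BE_REORGED -> PART_TO_BE_DROPPED in step 6,
+  while their replacements move PART_TO_BE_ADDED -> PART_IS_ADDED, all
+  before copy_partitions() transfers the rows.
+*/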
+
+
+/*
+ Copy partitions as part of ALTER TABLE of partitions
+
+ SYNOPSIS
+ copy_partitions()
+ out:copied Number of records copied
+ out:deleted Number of records deleted
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+
+ DESCRIPTION
+ change_partitions has done all the preparations, now it is time to
+ actually copy the data from the reorganised partitions to the new
+ partitions.
+*/
+
+int ha_partition::copy_partitions(ulonglong *copied, ulonglong *deleted)
+{
+ uint reorg_part= 0;
+ int result= 0;
+ longlong func_value;
+ DBUG_ENTER("ha_partition::copy_partitions");
+
+ if (m_part_info->linear_hash_ind)
+ {
+ if (m_part_info->part_type == HASH_PARTITION)
+ set_linear_hash_mask(m_part_info, m_part_info->no_parts);
+ else
+ set_linear_hash_mask(m_part_info, m_part_info->no_subparts);
+ }
+
+ while (reorg_part < m_reorged_parts)
+ {
+ handler *file= m_reorged_file[reorg_part];
+ uint32 new_part;
+
+ late_extra_cache(reorg_part);
+ if ((result= file->ha_rnd_init(1)))
+ goto error;
+ while (TRUE)
+ {
+ if ((result= file->rnd_next(m_rec0)))
+ {
+ if (result == HA_ERR_RECORD_DELETED)
+ continue; //Probably MyISAM
+ if (result != HA_ERR_END_OF_FILE)
+ goto error;
+ /*
+ End-of-file reached, break out to continue with next partition or
+ end the copy process.
+ */
+ break;
+ }
+ /* Found record to insert into new handler */
+ if (m_part_info->get_partition_id(m_part_info, &new_part,
+ &func_value))
+ {
+ /*
+ This record is in the original table but will not be in the new
+ table since it doesn't fit into any partition any longer due to
+ changed partitioning ranges or list values.
+ */
+ (*deleted)++; /* increment the count, not the pointer */
+ }
+ else
+ {
+ THD *thd= ha_thd();
+ /* Copy record to new handler */
+ (*copied)++; /* increment the count, not the pointer */
+ tmp_disable_binlog(thd); /* Do not replicate the low-level changes. */
+ result= m_new_file[new_part]->ha_write_row(m_rec0);
+ reenable_binlog(thd);
+ if (result)
+ goto error;
+ }
+ }
+ late_extra_no_cache(reorg_part);
+ file->ha_rnd_end();
+ reorg_part++;
+ }
+ DBUG_RETURN(FALSE);
+error:
+ DBUG_RETURN(result);
+}
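+
+#ifdef NOT_USED
+/*
+  Illustrative sketch only (assumption: the real logic lives in
+  set_linear_hash_mask()/get_partition_id_* in sql_partition.cc):
+  LINEAR HASH masks the hash value with the next power of two above the
+  number of partitions and folds the result back if it falls outside
+  the existing range.
+*/
+static uint32 linear_hash_sketch(longlong hash_value, uint32 mask,
+                                 uint32 no_parts)
+{
+  uint32 part_id= (uint32) (hash_value & mask);
+
+  if (part_id >= no_parts)
+    part_id= (uint32) (hash_value & (mask >> 1)); /* fold into range */
+  return part_id;
+}
+#endif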
+
+
+/*
+ Update create info as part of ALTER TABLE
+
+ SYNOPSIS
+ update_create_info()
+ create_info Create info from ALTER TABLE
+
+ RETURN VALUE
+ NONE
+
+ DESCRIPTION
+ Sets the auto_increment value if it was not given explicitly and clears
+ the data and index file names, since those are specified per partition
+*/
+
+void ha_partition::update_create_info(HA_CREATE_INFO *create_info)
+{
+ info(HA_STATUS_AUTO);
+
+ if (!(create_info->used_fields & HA_CREATE_USED_AUTO))
+ create_info->auto_increment_value= stats.auto_increment_value;
+
+ create_info->data_file_name= create_info->index_file_name = NULL;
+ return;
+}
+
+
+void ha_partition::change_table_ptr(TABLE *table_arg, TABLE_SHARE *share)
+{
+ handler **file_array= m_file;
+ table= table_arg;
+ table_share= share;
+ do
+ {
+ (*file_array)->change_table_ptr(table_arg, share);
+ } while (*(++file_array));
+ if (m_added_file && m_added_file[0])
+ {
+ /* if in middle of a drop/rename etc */
+ file_array= m_added_file;
+ do
+ {
+ (*file_array)->change_table_ptr(table_arg, share);
+ } while (*(++file_array));
+ }
+}
+
+/*
+ Change comments specific to handler
+
+ SYNOPSIS
+ update_table_comment()
+ comment Original comment
+
+ RETURN VALUE
+ new comment
+
+ DESCRIPTION
+ No comment changes so far
+*/
+
+char *ha_partition::update_table_comment(const char *comment)
+{
+ return (char*) comment; /* Nothing to change */
+}
+
+
+
+/*
+ Handle delete, rename and create table
+
+ SYNOPSIS
+ del_ren_cre_table()
+ from Full path of old table
+ to Full path of new table
+ table_arg Table object
+ create_info Create info
+
+ RETURN VALUE
+ >0 Error
+ 0 Success
+
+ DESCRIPTION
+ Common routine to handle delete_table, rename_table and create.
+ The routine uses the partition handler file to get the
+ names of the partition instances. Delete and rename are
+ called after creating the handler without a table
+ object and thus the file is needed to discover the
+ names of the partitions and the underlying storage engines.
+*/
+
+uint ha_partition::del_ren_cre_table(const char *from,
+ const char *to,
+ TABLE *table_arg,
+ HA_CREATE_INFO *create_info)
+{
+ int save_error= 0;
+ int error;
+ char from_buff[FN_REFLEN], to_buff[FN_REFLEN], from_lc_buff[FN_REFLEN],
+ to_lc_buff[FN_REFLEN];
+ char *name_buffer_ptr;
+ const char *from_path;
+ const char *to_path= NULL;
+ uint i;
+ handler **file, **abort_file;
+ DBUG_ENTER("del_ren_cre_table()");
+
+ if (get_from_handler_file(from, current_thd->mem_root))
+ DBUG_RETURN(TRUE);
+ DBUG_ASSERT(m_file_buffer);
+ DBUG_PRINT("enter", ("from: (%s) to: (%s)", from, to));
+ name_buffer_ptr= m_name_buffer_ptr;
+ file= m_file;
+ /*
+ Since ha_partition has HA_FILE_BASED, it must alter underlying table names
+ if they do not have HA_FILE_BASED and lower_case_table_names == 2.
+ See Bug#37402, for Mac OS X.
+ The appended #P#<partname>[#SP#<subpartname>] will remain in current case.
+ Using the first partitions handler, since mixing handlers is not allowed.
+ */
+ from_path= get_canonical_filename(*file, from, from_lc_buff);
+ if (to != NULL)
+ to_path= get_canonical_filename(*file, to, to_lc_buff);
+ i= 0;
+ do
+ {
+ create_partition_name(from_buff, from_path, name_buffer_ptr,
+ NORMAL_PART_NAME, FALSE);
+
+ if (to != NULL)
+ { // Rename branch
+ create_partition_name(to_buff, to_path, name_buffer_ptr,
+ NORMAL_PART_NAME, FALSE);
+ error= (*file)->ha_rename_table(from_buff, to_buff);
+ }
+ else if (table_arg == NULL) // delete branch
+ error= (*file)->ha_delete_table(from_buff);
+ else
+ {
+ if ((error= set_up_table_before_create(table_arg, from_buff,
+ create_info, i, NULL)) ||
+ ((error= (*file)->ha_create(from_buff, table_arg, create_info))))
+ goto create_error;
+ }
+ name_buffer_ptr= strend(name_buffer_ptr) + 1;
+ if (error)
+ save_error= error;
+ i++;
+ } while (*(++file));
+ DBUG_RETURN(save_error);
+create_error:
+ name_buffer_ptr= m_name_buffer_ptr;
+ for (abort_file= file, file= m_file; file < abort_file; file++)
+ {
+ create_partition_name(from_buff, from_path, name_buffer_ptr, NORMAL_PART_NAME,
+ FALSE);
+ VOID((*file)->ha_delete_table((const char*) from_buff));
+ name_buffer_ptr= strend(name_buffer_ptr) + 1;
+ }
+ DBUG_RETURN(error);
+}
+
+/*
+ Find partition based on partition id
+
+ SYNOPSIS
+ find_partition_element()
+ part_id Partition id of partition looked for
+
+ RETURN VALUE
+ >0 Reference to partition_element
+ 0 Partition not found
+*/
+
+partition_element *ha_partition::find_partition_element(uint part_id)
+{
+ uint i;
+ uint curr_part_id= 0;
+ List_iterator_fast <partition_element> part_it(m_part_info->partitions);
+
+ for (i= 0; i < m_part_info->no_parts; i++)
+ {
+ partition_element *part_elem;
+ part_elem= part_it++;
+ if (m_is_sub_partitioned)
+ {
+ uint j;
+ List_iterator_fast <partition_element> sub_it(part_elem->subpartitions);
+ for (j= 0; j < m_part_info->no_subparts; j++)
+ {
+ part_elem= sub_it++;
+ if (part_id == curr_part_id++)
+ return part_elem;
+ }
+ }
+ else if (part_id == curr_part_id++)
+ return part_elem;
+ }
+ DBUG_ASSERT(0);
+ my_error(ER_OUT_OF_RESOURCES, MYF(0));
+ current_thd->fatal_error(); // Abort
+ return NULL;
+}
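+
+/*
+  Worked example of the partition id numbering used above (and in the
+  part= i * no_subparts + j computations elsewhere in this file): with
+  3 partitions of 2 subpartitions each, part_id 0..5 maps to
+  (p0,sp0), (p0,sp1), (p1,sp0), (p1,sp1), (p2,sp0), (p2,sp1).
+*/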
+
+
+/*
+ Set up table share object before calling create on underlying handler
+
+ SYNOPSIS
+ set_up_table_before_create()
+ table Table object
+ info Create info
+ part_id Partition id of partition to set-up
+
+ RETURN VALUE
+ TRUE Error
+ FALSE Success
+
+ DESCRIPTION
+ Set up
+ 1) Comment on partition
+ 2) MAX_ROWS, MIN_ROWS on partition
+ 3) Index file name on partition
+ 4) Data file name on partition
+*/
+
+int ha_partition::set_up_table_before_create(TABLE *tbl,
+ const char *partition_name_with_path,
+ HA_CREATE_INFO *info,
+ uint part_id,
+ partition_element *part_elem)
+{
+ int error= 0;
+ const char *partition_name;
+ THD *thd= current_thd;
+ DBUG_ENTER("set_up_table_before_create");
+
+ if (!part_elem)
+ {
+ part_elem= find_partition_element(part_id);
+ if (!part_elem)
+ DBUG_RETURN(1); // Fatal error
+ }
+ tbl->s->max_rows= part_elem->part_max_rows;
+ tbl->s->min_rows= part_elem->part_min_rows;
+ partition_name= strrchr(partition_name_with_path, FN_LIBCHAR);
+ if ((part_elem->index_file_name &&
+ (error= append_file_to_dir(thd,
+ (const char**)&part_elem->index_file_name,
+ partition_name+1))) ||
+ (part_elem->data_file_name &&
+ (error= append_file_to_dir(thd,
+ (const char**)&part_elem->data_file_name,
+ partition_name+1))))
+ {
+ DBUG_RETURN(error);
+ }
+ info->index_file_name= part_elem->index_file_name;
+ info->data_file_name= part_elem->data_file_name;
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Add two names together
+
+ SYNOPSIS
+ name_add()
+ out:dest Destination string
+ first_name First name
+ sec_name Second name
+
+ RETURN VALUE
+ >0 Error
+ 0 Success
+
+ DESCRIPTION
+ Routine used to add two names with '#SP#' in between them. Service
+ routine to create_handler_file.
+ Include the NUL terminator in the count of characters since it is
+ needed as a separator between the partition names.
+*/
+
+static uint name_add(char *dest, const char *first_name, const char *sec_name)
+{
+ return (uint) (strxmov(dest, first_name, "#SP#", sec_name, NullS) -dest) + 1;
+}
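+
+/*
+  Hypothetical example of the naming convention (cf. the #P#/#SP# comment
+  in del_ren_cre_table()): for table t1 with partition p0 and subpartition
+  sp0, the underlying files are named
+
+    .../t1#P#p0           (partition)
+    .../t1#P#p0#SP#sp0    (subpartition)
+
+  while the .par file stores the bare "p0#SP#sp0" entry built by
+  name_add() above.
+*/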
+
+
+/*
+ Create the special .par file
+
+ SYNOPSIS
+ create_handler_file()
+ name Full path of table name
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+
+ DESCRIPTION
+ Method used to create handler file with names of partitions, their
+ engine types and the number of partitions.
+*/
+
+bool ha_partition::create_handler_file(const char *name)
+{
+ partition_element *part_elem, *subpart_elem;
+ uint i, j, part_name_len, subpart_name_len;
+ uint tot_partition_words, tot_name_len, no_parts;
+ uint tot_parts= 0;
+ uint tot_len_words, tot_len_byte, chksum, tot_name_words;
+ char *name_buffer_ptr;
+ uchar *file_buffer, *engine_array;
+ bool result= TRUE;
+ char file_name[FN_REFLEN];
+ char part_name[FN_REFLEN];
+ char subpart_name[FN_REFLEN];
+ File file;
+ List_iterator_fast <partition_element> part_it(m_part_info->partitions);
+ DBUG_ENTER("create_handler_file");
+
+ no_parts= m_part_info->partitions.elements;
+ DBUG_PRINT("info", ("table name = %s, no_parts = %u", name,
+ no_parts));
+ tot_name_len= 0;
+ for (i= 0; i < no_parts; i++)
+ {
+ part_elem= part_it++;
+ if (part_elem->part_state != PART_NORMAL &&
+ part_elem->part_state != PART_TO_BE_ADDED &&
+ part_elem->part_state != PART_CHANGED)
+ continue;
+ tablename_to_filename(part_elem->partition_name, part_name,
+ FN_REFLEN);
+ part_name_len= strlen(part_name);
+ if (!m_is_sub_partitioned)
+ {
+ tot_name_len+= part_name_len + 1;
+ tot_parts++;
+ }
+ else
+ {
+ List_iterator_fast <partition_element> sub_it(part_elem->subpartitions);
+ for (j= 0; j < m_part_info->no_subparts; j++)
+ {
+ subpart_elem= sub_it++;
+ tablename_to_filename(subpart_elem->partition_name,
+ subpart_name,
+ FN_REFLEN);
+ subpart_name_len= strlen(subpart_name);
+ tot_name_len+= part_name_len + subpart_name_len + 5;
+ tot_parts++;
+ }
+ }
+ }
+ /*
+ File format:
+ Length in words              4 bytes
+ Checksum                     4 bytes
+ Total number of partitions   4 bytes
+ Array of engine types        n * 4 bytes where
+                              n = (m_tot_parts + 3)/4
+ Length of name part in bytes 4 bytes
+ Name part                    m * 4 bytes where
+                              m = (length_name_part + 3)/4
+
+ All padding bytes are zeroed
+ */
+ tot_partition_words= (tot_parts + 3) / 4;
+ tot_name_words= (tot_name_len + 3) / 4;
+ tot_len_words= 4 + tot_partition_words + tot_name_words;
+ tot_len_byte= 4 * tot_len_words;
+ if (!(file_buffer= (uchar *) my_malloc(tot_len_byte, MYF(MY_ZEROFILL))))
+ DBUG_RETURN(TRUE);
+ engine_array= (file_buffer + 12);
+ name_buffer_ptr= (char*) (file_buffer + ((4 + tot_partition_words) * 4));
+ part_it.rewind();
+ for (i= 0; i < no_parts; i++)
+ {
+ part_elem= part_it++;
+ if (part_elem->part_state != PART_NORMAL &&
+ part_elem->part_state != PART_TO_BE_ADDED &&
+ part_elem->part_state != PART_CHANGED)
+ continue;
+ if (!m_is_sub_partitioned)
+ {
+ tablename_to_filename(part_elem->partition_name, part_name, FN_REFLEN);
+ name_buffer_ptr= strmov(name_buffer_ptr, part_name)+1;
+ *engine_array= (uchar) ha_legacy_type(part_elem->engine_type);
+ DBUG_PRINT("info", ("engine: %u", *engine_array));
+ engine_array++;
+ }
+ else
+ {
+ List_iterator_fast <partition_element> sub_it(part_elem->subpartitions);
+ for (j= 0; j < m_part_info->no_subparts; j++)
+ {
+ subpart_elem= sub_it++;
+ tablename_to_filename(part_elem->partition_name, part_name,
+ FN_REFLEN);
+ tablename_to_filename(subpart_elem->partition_name, subpart_name,
+ FN_REFLEN);
+ name_buffer_ptr+= name_add(name_buffer_ptr,
+ part_name,
+ subpart_name);
+ *engine_array= (uchar) ha_legacy_type(subpart_elem->engine_type);
+ DBUG_PRINT("info", ("engine: %u", *engine_array));
+ engine_array++;
+ }
+ }
+ }
+ chksum= 0;
+ int4store(file_buffer, tot_len_words);
+ int4store(file_buffer + 8, tot_parts);
+ int4store(file_buffer + 12 + (tot_partition_words * 4), tot_name_len);
+ for (i= 0; i < tot_len_words; i++)
+ chksum^= uint4korr(file_buffer + 4 * i);
+ int4store(file_buffer + 4, chksum);
+ /*
+ Add the .par extension to the table name, then create, write and close
+ the file; it is used at open, delete_table and rename_table.
+ */
+ fn_format(file_name, name, "", ha_par_ext, MY_APPEND_EXT);
+ if ((file= my_create(file_name, CREATE_MODE, O_RDWR | O_TRUNC,
+ MYF(MY_WME))) >= 0)
+ {
+ result= my_write(file, (uchar *) file_buffer, tot_len_byte,
+ MYF(MY_WME | MY_NABP)) != 0;
+ VOID(my_close(file, MYF(0)));
+ }
+ else
+ result= TRUE;
+ my_free((char*) file_buffer, MYF(0));
+ DBUG_RETURN(result);
+}
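+
+#ifdef NOT_USED
+/*
+  Illustrative sketch, not used by the handler: given a buffer holding a
+  complete .par file laid out as described in the format comment above,
+  the header can be validated like this. The checksum is the XOR of all
+  4-byte words in the file, so XOR:ing over the whole buffer (checksum
+  word included) must yield zero. The function name is local to this
+  example.
+*/
+static bool par_buffer_is_valid(const uchar *buf)
+{
+  uint len_words= uint4korr(buf);           /* Length in words */
+  uint chksum= 0;
+  for (uint i= 0; i < len_words; i++)
+    chksum^= uint4korr(buf + 4 * i);        /* Includes the checksum word */
+  return chksum == 0;                       /* Zero means file is intact */
+}
+#endif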
+
+/*
+ Clear handler variables and free some memory
+
+ SYNOPSIS
+ clear_handler_file()
+
+ RETURN VALUE
+ NONE
+*/
+
+void ha_partition::clear_handler_file()
+{
+ if (m_engine_array)
+ plugin_unlock_list(NULL, m_engine_array, m_tot_parts);
+ my_free((char*) m_file_buffer, MYF(MY_ALLOW_ZERO_PTR));
+ my_free((char*) m_engine_array, MYF(MY_ALLOW_ZERO_PTR));
+ m_file_buffer= NULL;
+ m_engine_array= NULL;
+}
+
+/*
+ Create underlying handler objects
+
+ SYNOPSIS
+ create_handlers()
+ mem_root Allocate memory through this
+
+ RETURN VALUE
+ TRUE Error
+ FALSE Success
+*/
+
+bool ha_partition::create_handlers(MEM_ROOT *mem_root)
+{
+ uint i;
+ uint alloc_len= (m_tot_parts + 1) * sizeof(handler*);
+ handlerton *hton0;
+ DBUG_ENTER("create_handlers");
+
+ if (!(m_file= (handler **) alloc_root(mem_root, alloc_len)))
+ DBUG_RETURN(TRUE);
+ m_file_tot_parts= m_tot_parts;
+ bzero((char*) m_file, alloc_len);
+ for (i= 0; i < m_tot_parts; i++)
+ {
+ handlerton *hton= plugin_data(m_engine_array[i], handlerton*);
+ if (!(m_file[i]= get_new_handler(table_share, mem_root,
+ hton)))
+ DBUG_RETURN(TRUE);
+ DBUG_PRINT("info", ("engine_type: %u", hton->db_type));
+ }
+  /* For the moment we only support partitioning over a single storage engine */
+ hton0= plugin_data(m_engine_array[0], handlerton*);
+ if (hton0 == myisam_hton)
+ {
+ DBUG_PRINT("info", ("MyISAM"));
+ m_myisam= TRUE;
+ }
+ /* INNODB may not be compiled in... */
+ else if (ha_legacy_type(hton0) == DB_TYPE_INNODB)
+ {
+ DBUG_PRINT("info", ("InnoDB"));
+ m_innodb= TRUE;
+ }
+ DBUG_RETURN(FALSE);
+}
+
+/*
+ Create underlying handler objects from partition info
+
+ SYNOPSIS
+ new_handlers_from_part_info()
+ mem_root Allocate memory through this
+
+ RETURN VALUE
+ TRUE Error
+ FALSE Success
+*/
+
+bool ha_partition::new_handlers_from_part_info(MEM_ROOT *mem_root)
+{
+ uint i, j, part_count;
+ partition_element *part_elem;
+ uint alloc_len= (m_tot_parts + 1) * sizeof(handler*);
+ List_iterator_fast <partition_element> part_it(m_part_info->partitions);
+ DBUG_ENTER("ha_partition::new_handlers_from_part_info");
+
+ if (!(m_file= (handler **) alloc_root(mem_root, alloc_len)))
+ {
+ mem_alloc_error(alloc_len);
+ goto error_end;
+ }
+ m_file_tot_parts= m_tot_parts;
+ bzero((char*) m_file, alloc_len);
+ DBUG_ASSERT(m_part_info->no_parts > 0);
+
+ i= 0;
+ part_count= 0;
+ /*
+    We don't know the size of the underlying storage engine's handler
+    object, so if allocation fails we report sizeof(handler) as the
+    number of bytes requested in the error message (see the error label)
+ */
+ do
+ {
+ part_elem= part_it++;
+ if (m_is_sub_partitioned)
+ {
+ for (j= 0; j < m_part_info->no_subparts; j++)
+ {
+ if (!(m_file[part_count++]= get_new_handler(table_share, mem_root,
+ part_elem->engine_type)))
+ goto error;
+ DBUG_PRINT("info", ("engine_type: %u",
+ (uint) ha_legacy_type(part_elem->engine_type)));
+ }
+ }
+ else
+ {
+ if (!(m_file[part_count++]= get_new_handler(table_share, mem_root,
+ part_elem->engine_type)))
+ goto error;
+ DBUG_PRINT("info", ("engine_type: %u",
+ (uint) ha_legacy_type(part_elem->engine_type)));
+ }
+ } while (++i < m_part_info->no_parts);
+ if (part_elem->engine_type == myisam_hton)
+ {
+ DBUG_PRINT("info", ("MyISAM"));
+ m_myisam= TRUE;
+ }
+ DBUG_RETURN(FALSE);
+error:
+ mem_alloc_error(sizeof(handler));
+error_end:
+ DBUG_RETURN(TRUE);
+}
+
+
+/*
+ Get info about partition engines and their names from the .par file
+
+ SYNOPSIS
+ get_from_handler_file()
+ name Full path of table name
+ mem_root Allocate memory through this
+
+ RETURN VALUE
+ TRUE Error
+ FALSE Success
+
+ DESCRIPTION
+ Open handler file to get partition names, engine types and number of
+ partitions.
+*/
+
+bool ha_partition::get_from_handler_file(const char *name, MEM_ROOT *mem_root)
+{
+ char buff[FN_REFLEN], *address_tot_name_len;
+ File file;
+ char *file_buffer, *name_buffer_ptr;
+ handlerton **engine_array;
+ uint i, len_bytes, len_words, tot_partition_words, tot_name_words, chksum;
+ DBUG_ENTER("ha_partition::get_from_handler_file");
+ DBUG_PRINT("enter", ("table name: '%s'", name));
+
+ if (m_file_buffer)
+ DBUG_RETURN(FALSE);
+ fn_format(buff, name, "", ha_par_ext, MY_APPEND_EXT);
+
+  /* The following could be done with my_stat to read in the whole file */
+ if ((file= my_open(buff, O_RDONLY | O_SHARE, MYF(0))) < 0)
+ DBUG_RETURN(TRUE);
+ if (my_read(file, (uchar *) & buff[0], 8, MYF(MY_NABP)))
+ goto err1;
+ len_words= uint4korr(buff);
+ len_bytes= 4 * len_words;
+ if (!(file_buffer= (char*) my_malloc(len_bytes, MYF(0))))
+ goto err1;
+ VOID(my_seek(file, 0, MY_SEEK_SET, MYF(0)));
+ if (my_read(file, (uchar *) file_buffer, len_bytes, MYF(MY_NABP)))
+ goto err2;
+
+ chksum= 0;
+ for (i= 0; i < len_words; i++)
+ chksum ^= uint4korr((file_buffer) + 4 * i);
+ if (chksum)
+ goto err2;
+ m_tot_parts= uint4korr((file_buffer) + 8);
+ DBUG_PRINT("info", ("No of parts = %u", m_tot_parts));
+ tot_partition_words= (m_tot_parts + 3) / 4;
+ engine_array= (handlerton **) my_alloca(m_tot_parts * sizeof(handlerton*));
+ for (i= 0; i < m_tot_parts; i++)
+ engine_array[i]= ha_resolve_by_legacy_type(current_thd,
+ (enum legacy_db_type)
+ *(uchar *) ((file_buffer) + 12 + i));
+ address_tot_name_len= file_buffer + 12 + 4 * tot_partition_words;
+ tot_name_words= (uint4korr(address_tot_name_len) + 3) / 4;
+ if (len_words != (tot_partition_words + tot_name_words + 4))
+ goto err3;
+ name_buffer_ptr= file_buffer + 16 + 4 * tot_partition_words;
+ VOID(my_close(file, MYF(0)));
+ m_file_buffer= file_buffer; // Will be freed in clear_handler_file()
+ m_name_buffer_ptr= name_buffer_ptr;
+
+ if (!(m_engine_array= (plugin_ref*)
+ my_malloc(m_tot_parts * sizeof(plugin_ref), MYF(MY_WME))))
+ goto err3;
+
+ for (i= 0; i < m_tot_parts; i++)
+ m_engine_array[i]= ha_lock_engine(NULL, engine_array[i]);
+
+ my_afree((gptr) engine_array);
+
+ if (!m_file && create_handlers(mem_root))
+ {
+ clear_handler_file();
+ DBUG_RETURN(TRUE);
+ }
+ DBUG_RETURN(FALSE);
+
+err3:
+ my_afree((gptr) engine_array);
+err2:
+ my_free(file_buffer, MYF(0));
+err1:
+ VOID(my_close(file, MYF(0)));
+ DBUG_RETURN(TRUE);
+}
+
+
+/****************************************************************************
+ MODULE open/close object
+****************************************************************************/
+/*
+ Open handler object
+
+ SYNOPSIS
+ open()
+ name Full path of table name
+ mode Open mode flags
+ test_if_locked ?
+
+ RETURN VALUE
+ >0 Error
+ 0 Success
+
+ DESCRIPTION
+ Used for opening tables. The name will be the name of the file.
+ A table is opened when it needs to be opened. For instance
+ when a request comes in for a select on the table (tables are not
+ open and closed for each request, they are cached).
+
+ Called from handler.cc by handler::ha_open(). The server opens all tables
+ by calling ha_open() which then calls the handler specific open().
+*/
+
+int ha_partition::open(const char *name, int mode, uint test_if_locked)
+{
+ char *name_buffer_ptr= m_name_buffer_ptr;
+ int error;
+ uint alloc_len;
+ handler **file;
+ char name_buff[FN_REFLEN];
+ DBUG_ENTER("ha_partition::open");
+
+ ref_length= 0;
+ m_mode= mode;
+ m_open_test_lock= test_if_locked;
+ m_part_field_array= m_part_info->full_part_field_array;
+ if (get_from_handler_file(name, &table->mem_root))
+ DBUG_RETURN(1);
+ m_start_key.length= 0;
+ m_rec0= table->record[0];
+ m_rec_length= table->s->reclength;
+ alloc_len= m_tot_parts * (m_rec_length + PARTITION_BYTES_IN_POS);
+ alloc_len+= table->s->max_key_length;
+ if (!m_ordered_rec_buffer)
+ {
+ if (!(m_ordered_rec_buffer= (uchar*)my_malloc(alloc_len, MYF(MY_WME))))
+ {
+ DBUG_RETURN(1);
+ }
+ {
+ /*
+        We set up one record per partition and each record has 2 bytes in
+        front where the partition id is written. This is used by ordered
+        index_read.
+        We also set up a reference to the first record for temporary use in
+        setting up the scan.
+ */
+ char *ptr= (char*)m_ordered_rec_buffer;
+ uint i= 0;
+ do
+ {
+ int2store(ptr, i);
+ ptr+= m_rec_length + PARTITION_BYTES_IN_POS;
+ } while (++i < m_tot_parts);
+ m_start_key.key= (const uchar*)ptr;
+ }
+ }
+
+ /* Initialise the bitmap we use to determine what partitions are used */
+ if (!is_clone)
+ {
+ if (bitmap_init(&(m_part_info->used_partitions), NULL, m_tot_parts, TRUE))
+ DBUG_RETURN(1);
+ bitmap_set_all(&(m_part_info->used_partitions));
+ }
+
+ /* Recalculate table flags as they may change after open */
+ m_table_flags= m_file[0]->ha_table_flags();
+ file= m_file;
+ do
+ {
+ create_partition_name(name_buff, name, name_buffer_ptr, NORMAL_PART_NAME,
+ FALSE);
+ if ((error= (*file)->ha_open(table, (const char*) name_buff, mode,
+ test_if_locked)))
+ goto err_handler;
+ m_no_locks+= (*file)->lock_count();
+ name_buffer_ptr+= strlen(name_buffer_ptr) + 1;
+ set_if_bigger(ref_length, ((*file)->ref_length));
+ m_table_flags&= (*file)->ha_table_flags();
+ } while (*(++file));
+ m_table_flags&= ~(HA_CAN_GEOMETRY | HA_CAN_FULLTEXT | HA_DUPLICATE_POS |
+ HA_CAN_SQL_HANDLER | HA_CAN_INSERT_DELAYED);
+ m_table_flags|= HA_FILE_BASED | HA_REC_NOT_IN_SEQ;
+ key_used_on_scan= m_file[0]->key_used_on_scan;
+ implicit_emptied= m_file[0]->implicit_emptied;
+ /*
+ Add 2 bytes for partition id in position ref length.
+ ref_length=max_in_all_partitions(ref_length) + PARTITION_BYTES_IN_POS
+ */
+ ref_length+= PARTITION_BYTES_IN_POS;
+ m_ref_length= ref_length;
+ /*
+    Release the buffer read from the .par file. It will not be needed
+    again after the table has been opened once.
+ */
+ clear_handler_file();
+ /*
+    Initialise the priority queue; it is set up for reading forward.
+ */
+ if ((error= init_queue(&m_queue, m_tot_parts, (uint) PARTITION_BYTES_IN_POS,
+ 0, key_rec_cmp, (void*)this)))
+ goto err_handler;
+
+ /*
+    Some handlers update statistics as part of the open call. This will in
+    some cases corrupt the statistics of the partition handler, so to
+    ensure we have correct statistics we call info() from open() after
+    calling open() on all individual handlers.
+ */
+ info(HA_STATUS_VARIABLE | HA_STATUS_CONST);
+ DBUG_RETURN(0);
+
+err_handler:
+ while (file-- != m_file)
+ (*file)->close();
+ if (!is_clone)
+ bitmap_free(&(m_part_info->used_partitions));
+
+ DBUG_RETURN(error);
+}
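+
+#ifdef NOT_USED
+/*
+  Illustrative sketch, not used by the handler: each partition gets one
+  slot in m_ordered_rec_buffer, laid out as [2 byte partition id][record
+  of m_rec_length bytes]. A slot for partition i can be addressed with
+  the arithmetic below; the handler's own helpers (such as rec_buf(),
+  presumably defined in ha_partition.h) encapsulate this.
+*/
+static uchar *example_part_slot(uchar *ordered_rec_buffer, uint part_id,
+                                uint rec_length)
+{
+  /* Start of the slot; the record itself follows the 2 byte id */
+  return ordered_rec_buffer +
+         part_id * (rec_length + PARTITION_BYTES_IN_POS);
+}
+#endif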
+
+handler *ha_partition::clone(MEM_ROOT *mem_root)
+{
+ handler *new_handler= get_new_handler(table->s, mem_root,
+ table->s->db_type());
+ ((ha_partition*)new_handler)->m_part_info= m_part_info;
+ ((ha_partition*)new_handler)->is_clone= TRUE;
+ if (new_handler && !new_handler->ha_open(table,
+ table->s->normalized_path.str,
+ table->db_stat,
+ HA_OPEN_IGNORE_IF_LOCKED))
+ return new_handler;
+ return NULL;
+}
+
+
+/*
+ Close handler object
+
+ SYNOPSIS
+ close()
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+
+ DESCRIPTION
+ Called from sql_base.cc, sql_select.cc, and table.cc.
+ In sql_select.cc it is only used to close up temporary tables or during
+    the process where a temporary table is converted into a
+    MyISAM table.
+ For sql_base.cc look at close_data_tables().
+*/
+
+int ha_partition::close(void)
+{
+ bool first= TRUE;
+ handler **file;
+ DBUG_ENTER("ha_partition::close");
+
+ delete_queue(&m_queue);
+ if (!is_clone)
+ bitmap_free(&(m_part_info->used_partitions));
+ file= m_file;
+
+repeat:
+ do
+ {
+ (*file)->close();
+ } while (*(++file));
+
+ if (first && m_added_file && m_added_file[0])
+ {
+ file= m_added_file;
+ first= FALSE;
+ goto repeat;
+ }
+
+ DBUG_RETURN(0);
+}
+
+/****************************************************************************
+ MODULE start/end statement
+****************************************************************************/
+/*
+ A number of methods to define various constants for the handler. In
+ the case of the partition handler we need to use some max and min
+ of the underlying handlers in most cases.
+*/
+
+/*
+ Set external locks on table
+
+ SYNOPSIS
+ external_lock()
+ thd Thread object
+ lock_type Type of external lock
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+
+ DESCRIPTION
+ First you should go read the section "locking functions for mysql" in
+ lock.cc to understand this.
+    This creates a lock on the table. If you are implementing a storage engine
+ that can handle transactions look at ha_berkeley.cc to see how you will
+ want to go about doing this. Otherwise you should consider calling
+ flock() here.
+ Originally this method was used to set locks on file level to enable
+ several MySQL Servers to work on the same data. For transactional
+ engines it has been "abused" to also mean start and end of statements
+ to enable proper rollback of statements and transactions. When LOCK
+ TABLES has been issued the start_stmt method takes over the role of
+ indicating start of statement but in this case there is no end of
+ statement indicator(?).
+
+ Called from lock.cc by lock_external() and unlock_external(). Also called
+ from sql_table.cc by copy_data_between_tables().
+*/
+
+int ha_partition::external_lock(THD *thd, int lock_type)
+{
+ bool first= TRUE;
+ uint error;
+ handler **file;
+ DBUG_ENTER("ha_partition::external_lock");
+
+ file= m_file;
+ m_lock_type= lock_type;
+
+repeat:
+ do
+ {
+ DBUG_PRINT("info", ("external_lock(thd, %d) iteration %d",
+ lock_type, (int) (file - m_file)));
+ if ((error= (*file)->ha_external_lock(thd, lock_type)))
+ {
+ if (F_UNLCK != lock_type)
+ goto err_handler;
+ }
+ } while (*(++file));
+
+ if (first && m_added_file && m_added_file[0])
+ {
+ DBUG_ASSERT(lock_type == F_UNLCK);
+ file= m_added_file;
+ first= FALSE;
+ goto repeat;
+ }
+ DBUG_RETURN(0);
+
+err_handler:
+ while (file-- != m_file)
+ {
+ (*file)->ha_external_lock(thd, F_UNLCK);
+ }
+ DBUG_RETURN(error);
+}
+
+
+/*
+ Get the lock(s) for the table and perform conversion of locks if needed
+
+ SYNOPSIS
+ store_lock()
+ thd Thread object
+ to Lock object array
+ lock_type Table lock type
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+
+ DESCRIPTION
+ The idea with handler::store_lock() is the following:
+
+ The statement decided which locks we should need for the table
+ for updates/deletes/inserts we get WRITE locks, for SELECT... we get
+ read locks.
+
+ Before adding the lock into the table lock handler (see thr_lock.c)
+ mysqld calls store lock with the requested locks. Store lock can now
+ modify a write lock to a read lock (or some other lock), ignore the
+ lock (if we don't want to use MySQL table locks at all) or add locks
+ for many tables (like we do when we are using a MERGE handler).
+
+ Berkeley DB for partition changes all WRITE locks to TL_WRITE_ALLOW_WRITE
+    (which signals that we are doing WRITEs, but we are still allowing other
+    readers and writers).
+
+ When releasing locks, store_lock() is also called. In this case one
+ usually doesn't have to do anything.
+
+ store_lock is called when holding a global mutex to ensure that only
+ one thread at a time changes the locking information of tables.
+
+ In some exceptional cases MySQL may send a request for a TL_IGNORE;
+    this means that we are requesting the same lock as last time and this
+ should also be ignored. (This may happen when someone does a flush
+ table when we have opened a part of the tables, in which case mysqld
+ closes and reopens the tables and tries to get the same locks as last
+ time). In the future we will probably try to remove this.
+
+ Called from lock.cc by get_lock_data().
+*/
+
+THR_LOCK_DATA **ha_partition::store_lock(THD *thd,
+ THR_LOCK_DATA **to,
+ enum thr_lock_type lock_type)
+{
+ handler **file;
+ DBUG_ENTER("ha_partition::store_lock");
+ file= m_file;
+ do
+ {
+ DBUG_PRINT("info", ("store lock %d iteration", (int) (file - m_file)));
+ to= (*file)->store_lock(thd, to, lock_type);
+ } while (*(++file));
+ DBUG_RETURN(to);
+}
+
+/*
+ Start a statement when table is locked
+
+ SYNOPSIS
+ start_stmt()
+ thd Thread object
+ lock_type Type of external lock
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+
+ DESCRIPTION
+ This method is called instead of external lock when the table is locked
+ before the statement is executed.
+*/
+
+int ha_partition::start_stmt(THD *thd, thr_lock_type lock_type)
+{
+ int error= 0;
+ handler **file;
+ DBUG_ENTER("ha_partition::start_stmt");
+
+ file= m_file;
+ do
+ {
+ if ((error= (*file)->start_stmt(thd, lock_type)))
+ break;
+ } while (*(++file));
+ DBUG_RETURN(error);
+}
+
+
+/*
+ Get number of lock objects returned in store_lock
+
+ SYNOPSIS
+ lock_count()
+
+ RETURN VALUE
+ Number of locks returned in call to store_lock
+
+ DESCRIPTION
+ Returns the number of store locks needed in call to store lock.
+ We return number of partitions since we call store_lock on each
+ underlying handler. Assists the above functions in allocating
+ sufficient space for lock structures.
+*/
+
+uint ha_partition::lock_count() const
+{
+ DBUG_ENTER("ha_partition::lock_count");
+ DBUG_PRINT("info", ("m_no_locks %d", m_no_locks));
+ DBUG_RETURN(m_no_locks);
+}
+
+
+/*
+ Unlock last accessed row
+
+ SYNOPSIS
+ unlock_row()
+
+ RETURN VALUE
+ NONE
+
+ DESCRIPTION
+ Record currently processed was not in the result set of the statement
+ and is thus unlocked. Used for UPDATE and DELETE queries.
+*/
+
+void ha_partition::unlock_row()
+{
+ m_file[m_last_part]->unlock_row();
+ return;
+}
+
+
+/****************************************************************************
+ MODULE change record
+****************************************************************************/
+
+/*
+ Insert a row to the table
+
+ SYNOPSIS
+ write_row()
+ buf The row in MySQL Row Format
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+
+ DESCRIPTION
+    write_row() inserts a row. buf is a byte array of data, normally
+ record[0].
+
+ You can use the field information to extract the data from the native byte
+ array type.
+
+ Example of this would be:
+ for (Field **field=table->field ; *field ; field++)
+ {
+ ...
+ }
+
+ See ha_tina.cc for a variant of extracting all of the data as strings.
+ ha_berkeley.cc has a variant of how to store it intact by "packing" it
+ for ha_berkeley's own native storage type.
+
+ Called from item_sum.cc, item_sum.cc, sql_acl.cc, sql_insert.cc,
+ sql_insert.cc, sql_select.cc, sql_table.cc, sql_udf.cc, and sql_update.cc.
+
+ ADDITIONAL INFO:
+
+ We have to set timestamp fields and auto_increment fields, because those
+ may be used in determining which partition the row should be written to.
+*/
+
+int ha_partition::write_row(uchar * buf)
+{
+ uint32 part_id;
+ int error;
+ longlong func_value;
+ bool autoincrement_lock= FALSE;
+ my_bitmap_map *old_map;
+ THD *thd= ha_thd();
+ timestamp_auto_set_type orig_timestamp_type= table->timestamp_field_type;
+#ifdef NOT_NEEDED
+ uchar *rec0= m_rec0;
+#endif
+ DBUG_ENTER("ha_partition::write_row");
+ DBUG_ASSERT(buf == m_rec0);
+
+ /* If we have a timestamp column, update it to the current time */
+ if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_INSERT)
+ table->timestamp_field->set_time();
+ table->timestamp_field_type= TIMESTAMP_NO_AUTO_SET;
+
+ /*
+ If we have an auto_increment column and we are writing a changed row
+ or a new row, then update the auto_increment value in the record.
+ */
+ if (table->next_number_field && buf == table->record[0])
+ {
+ /*
+      Some engines (InnoDB for example) can change the autoincrement
+      counter only after the 'table->write_row' operation.
+      So if another thread gets inside ha_partition::write_row
+      before it is complete, it gets the same auto_increment value,
+      which means a DUP_KEY error (bug #27405).
+      Here we serialise the access using table_share->mutex, and
+      use the autoincrement_lock variable to avoid unnecessary locking.
+      Probably not an ideal solution.
+ */
+ if (table_share->tmp_table == NO_TMP_TABLE)
+ {
+ /*
+ Bug#30878 crash when alter table from non partitioned table
+ to partitioned.
+ Checking if tmp table then there is no need to lock,
+ and the table_share->mutex may not be initialised.
+ */
+ autoincrement_lock= TRUE;
+ pthread_mutex_lock(&table_share->mutex);
+ }
+ error= update_auto_increment();
+
+ /*
+ If we have failed to set the auto-increment value for this row,
+ it is highly likely that we will not be able to insert it into
+      the correct partition. We must check and fail if necessary.
+ */
+ if (error)
+ goto exit;
+ }
+
+ old_map= dbug_tmp_use_all_columns(table, table->read_set);
+#ifdef NOT_NEEDED
+ if (likely(buf == rec0))
+#endif
+ error= m_part_info->get_partition_id(m_part_info, &part_id,
+ &func_value);
+#ifdef NOT_NEEDED
+ else
+ {
+ set_field_ptr(m_part_field_array, buf, rec0);
+ error= m_part_info->get_partition_id(m_part_info, &part_id,
+ &func_value);
+ set_field_ptr(m_part_field_array, rec0, buf);
+ }
+#endif
+ dbug_tmp_restore_column_map(table->read_set, old_map);
+ if (unlikely(error))
+ {
+ m_part_info->err_value= func_value;
+ goto exit;
+ }
+ m_last_part= part_id;
+ DBUG_PRINT("info", ("Insert in partition %d", part_id));
+ tmp_disable_binlog(thd); /* Do not replicate the low-level changes. */
+ error= m_file[part_id]->ha_write_row(buf);
+ reenable_binlog(thd);
+exit:
+ table->timestamp_field_type= orig_timestamp_type;
+ if (autoincrement_lock)
+ pthread_mutex_unlock(&table_share->mutex);
+ DBUG_RETURN(error);
+}
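+
+#ifdef NOT_USED
+/*
+  Illustrative sketch, not used by the handler: the dispatch above relies
+  on m_part_info->get_partition_id, which handles all partitioning types.
+  For the simplest case, PARTITION BY HASH(expr), the partition id is
+  essentially the partition function value modulo the number of
+  partitions, as sketched below (negative values are made positive).
+*/
+static uint32 example_hash_part_id(longlong func_value, uint no_parts)
+{
+  longlong part= func_value % (longlong) no_parts;
+  if (part < 0)                        /* The expression may be negative */
+    part= -part;
+  return (uint32) part;
+}
+#endif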
+
+
+/*
+ Update an existing row
+
+ SYNOPSIS
+ update_row()
+ old_data Old record in MySQL Row Format
+ new_data New record in MySQL Row Format
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+
+ DESCRIPTION
+ Yes, update_row() does what you expect, it updates a row. old_data will
+ have the previous row record in it, while new_data will have the newest
+ data in it.
+ Keep in mind that the server can do updates based on ordering if an
+    ORDER BY clause was used. Consecutive ordering is not guaranteed.
+
+    Currently new_data will not have an updated auto_increment value, nor
+    an updated timestamp field. You can set these for a partition by doing
+    the following:
+ if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_UPDATE)
+ table->timestamp_field->set_time();
+ if (table->next_number_field && record == table->record[0])
+ update_auto_increment();
+
+ Called from sql_select.cc, sql_acl.cc, sql_update.cc, and sql_insert.cc.
+ new_data is always record[0]
+ old_data is normally record[1] but may be anything
+*/
+
+int ha_partition::update_row(const uchar *old_data, uchar *new_data)
+{
+ THD *thd= ha_thd();
+ uint32 new_part_id, old_part_id;
+ int error= 0;
+ longlong func_value;
+ timestamp_auto_set_type orig_timestamp_type= table->timestamp_field_type;
+ DBUG_ENTER("ha_partition::update_row");
+
+ /*
+ We need to set timestamp field once before we calculate
+ the partition. Then we disable timestamp calculations
+ inside m_file[*]->update_row() methods
+ */
+ if (orig_timestamp_type & TIMESTAMP_AUTO_SET_ON_UPDATE)
+ table->timestamp_field->set_time();
+ table->timestamp_field_type= TIMESTAMP_NO_AUTO_SET;
+
+ if ((error= get_parts_for_update(old_data, new_data, table->record[0],
+ m_part_info, &old_part_id, &new_part_id,
+ &func_value)))
+ {
+ m_part_info->err_value= func_value;
+ goto exit;
+ }
+
+ /*
+ TODO:
+ set_internal_auto_increment=
+ max(set_internal_auto_increment, new_data->auto_increment)
+ */
+ m_last_part= new_part_id;
+ if (new_part_id == old_part_id)
+ {
+ DBUG_PRINT("info", ("Update in partition %d", new_part_id));
+ tmp_disable_binlog(thd); /* Do not replicate the low-level changes. */
+ error= m_file[new_part_id]->ha_update_row(old_data, new_data);
+ reenable_binlog(thd);
+ goto exit;
+ }
+ else
+ {
+ DBUG_PRINT("info", ("Update from partition %d to partition %d",
+ old_part_id, new_part_id));
+ tmp_disable_binlog(thd); /* Do not replicate the low-level changes. */
+ error= m_file[new_part_id]->ha_write_row(new_data);
+ reenable_binlog(thd);
+ if (error)
+ goto exit;
+
+ tmp_disable_binlog(thd); /* Do not replicate the low-level changes. */
+ error= m_file[old_part_id]->ha_delete_row(old_data);
+ reenable_binlog(thd);
+ if (error)
+ {
+#ifdef IN_THE_FUTURE
+ (void) m_file[new_part_id]->delete_last_inserted_row(new_data);
+#endif
+ goto exit;
+ }
+ }
+
+exit:
+ table->timestamp_field_type= orig_timestamp_type;
+ DBUG_RETURN(error);
+}
+
+
+/*
+ Remove an existing row
+
+ SYNOPSIS
+ delete_row
+ buf Deleted row in MySQL Row Format
+
+ RETURN VALUE
+ >0 Error Code
+ 0 Success
+
+ DESCRIPTION
+ This will delete a row. buf will contain a copy of the row to be deleted.
+ The server will call this right after the current row has been read
+ (from either a previous rnd_xxx() or index_xxx() call).
+ If you keep a pointer to the last row or can access a primary key it will
+ make doing the deletion quite a bit easier.
+    Keep in mind that the server does not guarantee consecutive deletions;
+    ORDER BY clauses can be used.
+
+ Called in sql_acl.cc and sql_udf.cc to manage internal table information.
+ Called in sql_delete.cc, sql_insert.cc, and sql_select.cc. In sql_select
+ it is used for removing duplicates while in insert it is used for REPLACE
+ calls.
+
+ buf is either record[0] or record[1]
+*/
+
+int ha_partition::delete_row(const uchar *buf)
+{
+ uint32 part_id;
+ int error;
+ THD *thd= ha_thd();
+ DBUG_ENTER("ha_partition::delete_row");
+
+ if ((error= get_part_for_delete(buf, m_rec0, m_part_info, &part_id)))
+ {
+ DBUG_RETURN(error);
+ }
+ m_last_part= part_id;
+ tmp_disable_binlog(thd);
+ error= m_file[part_id]->ha_delete_row(buf);
+ reenable_binlog(thd);
+ DBUG_RETURN(error);
+}
+
+
+/*
+ Delete all rows in a table
+
+ SYNOPSIS
+ delete_all_rows()
+
+ RETURN VALUE
+ >0 Error Code
+ 0 Success
+
+ DESCRIPTION
+ Used to delete all rows in a table. Both for cases of truncate and
+ for cases where the optimizer realizes that all rows will be
+ removed as a result of a SQL statement.
+
+ Called from item_sum.cc by Item_func_group_concat::clear(),
+ Item_sum_count_distinct::clear(), and Item_func_group_concat::clear().
+ Called from sql_delete.cc by mysql_delete().
+ Called from sql_select.cc by JOIN::reinit().
+ Called from sql_union.cc by st_select_lex_unit::exec().
+*/
+
+int ha_partition::delete_all_rows()
+{
+ int error;
+ handler **file;
+ DBUG_ENTER("ha_partition::delete_all_rows");
+
+ file= m_file;
+ do
+ {
+ if ((error= (*file)->ha_delete_all_rows()))
+ DBUG_RETURN(error);
+ } while (*(++file));
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Start a large batch of insert rows
+
+ SYNOPSIS
+ start_bulk_insert()
+ rows Number of rows to insert
+
+ RETURN VALUE
+ NONE
+
+ DESCRIPTION
+ rows == 0 means we will probably insert many rows
+*/
+
+void ha_partition::start_bulk_insert(ha_rows rows)
+{
+ handler **file;
+ DBUG_ENTER("ha_partition::start_bulk_insert");
+
+ rows= rows ? rows/m_tot_parts + 1 : 0;
+ file= m_file;
+ do
+ {
+ (*file)->ha_start_bulk_insert(rows);
+ } while (*(++file));
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Finish a large batch of insert rows
+
+ SYNOPSIS
+ end_bulk_insert()
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+*/
+
+int ha_partition::end_bulk_insert()
+{
+ int error= 0;
+ handler **file;
+ DBUG_ENTER("ha_partition::end_bulk_insert");
+
+ file= m_file;
+ do
+ {
+ int tmp;
+ if ((tmp= (*file)->ha_end_bulk_insert()))
+ error= tmp;
+ } while (*(++file));
+ DBUG_RETURN(error);
+}
+
+
+/****************************************************************************
+ MODULE full table scan
+****************************************************************************/
+/*
+ Initialize engine for random reads
+
+ SYNOPSIS
+ ha_partition::rnd_init()
+ scan 0 Initialize for random reads through rnd_pos()
+ 1 Initialize for random scan through rnd_next()
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+
+ DESCRIPTION
+ rnd_init() is called when the server wants the storage engine to do a
+ table scan or when the server wants to access data through rnd_pos.
+
+ When scan is used we will scan one handler partition at a time.
+ When preparing for rnd_pos we will init all handler partitions.
+    No extra cache handling is needed when scanning is not performed.
+
+ Before initialising we will call rnd_end to ensure that we clean up from
+ any previous incarnation of a table scan.
+ Called from filesort.cc, records.cc, sql_handler.cc, sql_select.cc,
+ sql_table.cc, and sql_update.cc.
+*/
+
+int ha_partition::rnd_init(bool scan)
+{
+ int error;
+ uint i= 0;
+ uint32 part_id;
+ DBUG_ENTER("ha_partition::rnd_init");
+
+ /*
+ For operations that may need to change data, we may need to extend
+ read_set.
+ */
+ if (m_lock_type == F_WRLCK)
+ {
+ /*
+ If write_set contains any of the fields used in partition and
+ subpartition expression, we need to set all bits in read_set because
+ the row may need to be inserted in a different [sub]partition. In
+ other words update_row() can be converted into write_row(), which
+ requires a complete record.
+ */
+ if (bitmap_is_overlapping(&m_part_info->full_part_field_set,
+ table->write_set))
+ bitmap_set_all(table->read_set);
+ else
+ {
+ /*
+ Some handlers only read fields as specified by the bitmap for the
+ read set. For partitioned handlers we always require that the
+ fields of the partition functions are read such that we can
+ calculate the partition id to place updated and deleted records.
+ */
+ bitmap_union(table->read_set, &m_part_info->full_part_field_set);
+ }
+ }
+
+ /* Now we see what the index of our first important partition is */
+ DBUG_PRINT("info", ("m_part_info->used_partitions: 0x%lx",
+ (long) m_part_info->used_partitions.bitmap));
+ part_id= bitmap_get_first_set(&(m_part_info->used_partitions));
+ DBUG_PRINT("info", ("m_part_spec.start_part %d", part_id));
+
+ if (MY_BIT_NONE == part_id)
+ {
+ error= 0;
+ goto err1;
+ }
+
+ /*
+    We have a partition to scan and, if scanning with rnd_next, we also
+    enable the extra read cache for it (late_extra_cache)
+ */
+ DBUG_PRINT("info", ("rnd_init on partition %d", part_id));
+ if (scan)
+ {
+ /*
+ rnd_end() is needed for partitioning to reset internal data if scan
+ is already in use
+ */
+ rnd_end();
+ late_extra_cache(part_id);
+ if ((error= m_file[part_id]->ha_rnd_init(scan)))
+ goto err;
+ }
+ else
+ {
+ for (i= part_id; i < m_tot_parts; i++)
+ {
+ if (bitmap_is_set(&(m_part_info->used_partitions), i))
+ {
+ if ((error= m_file[i]->ha_rnd_init(scan)))
+ goto err;
+ }
+ }
+ }
+ m_scan_value= scan;
+ m_part_spec.start_part= part_id;
+ m_part_spec.end_part= m_tot_parts - 1;
+ DBUG_PRINT("info", ("m_scan_value=%d", m_scan_value));
+ DBUG_RETURN(0);
+
+err:
+ while ((int)--i >= (int)part_id)
+ {
+ if (bitmap_is_set(&(m_part_info->used_partitions), i))
+ m_file[i]->ha_rnd_end();
+ }
+err1:
+ m_scan_value= 2;
+ m_part_spec.start_part= NO_CURRENT_PART_ID;
+ DBUG_RETURN(error);
+}
+
+
+/*
+ End of a table scan
+
+ SYNOPSIS
+ rnd_end()
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+*/
+
+int ha_partition::rnd_end()
+{
+ handler **file;
+ DBUG_ENTER("ha_partition::rnd_end");
+ switch (m_scan_value) {
+ case 2: // Error
+ break;
+ case 1:
+ if (NO_CURRENT_PART_ID != m_part_spec.start_part) // Table scan
+ {
+ late_extra_no_cache(m_part_spec.start_part);
+ m_file[m_part_spec.start_part]->ha_rnd_end();
+ }
+ break;
+ case 0:
+ file= m_file;
+ do
+ {
+ if (bitmap_is_set(&(m_part_info->used_partitions), (file - m_file)))
+ (*file)->ha_rnd_end();
+ } while (*(++file));
+ break;
+ }
+ m_scan_value= 2;
+ m_part_spec.start_part= NO_CURRENT_PART_ID;
+ DBUG_RETURN(0);
+}
+
+/*
+ read next row during full table scan (scan in random row order)
+
+ SYNOPSIS
+ rnd_next()
+ buf buffer that should be filled with data
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+
+ DESCRIPTION
+ This is called for each row of the table scan. When you run out of records
+ you should return HA_ERR_END_OF_FILE.
+ The Field structure for the table is the key to getting data into buf
+ in a manner that will allow the server to understand it.
+
+ Called from filesort.cc, records.cc, sql_handler.cc, sql_select.cc,
+ sql_table.cc, and sql_update.cc.
+*/
+
+int ha_partition::rnd_next(uchar *buf)
+{
+ handler *file;
+ int result= HA_ERR_END_OF_FILE;
+ uint part_id= m_part_spec.start_part;
+ DBUG_ENTER("ha_partition::rnd_next");
+
+ if (NO_CURRENT_PART_ID == part_id)
+ {
+ /*
+ The original set of partitions to scan was empty and thus we report
+ the result here.
+ */
+ goto end;
+ }
+
+ DBUG_ASSERT(m_scan_value == 1);
+ file= m_file[part_id];
+
+ while (TRUE)
+ {
+    result= file->rnd_next(buf);
+ if (!result)
+ {
+ m_last_part= part_id;
+ m_part_spec.start_part= part_id;
+ table->status= 0;
+ DBUG_RETURN(0);
+ }
+
+ /*
+      If we get here, then the current partition's rnd_next returned failure
+ */
+ if (result == HA_ERR_RECORD_DELETED)
+ continue; // Probably MyISAM
+
+ if (result != HA_ERR_END_OF_FILE)
+ goto end_dont_reset_start_part; // Return error
+
+ /* End current partition */
+ late_extra_no_cache(part_id);
+ DBUG_PRINT("info", ("rnd_end on partition %d", part_id));
+ if ((result= file->ha_rnd_end()))
+ break;
+
+ /* Shift to next partition */
+ while (++part_id < m_tot_parts &&
+ !bitmap_is_set(&(m_part_info->used_partitions), part_id))
+ ;
+ if (part_id >= m_tot_parts)
+ {
+ result= HA_ERR_END_OF_FILE;
+ break;
+ }
+ file= m_file[part_id];
+ DBUG_PRINT("info", ("rnd_init on partition %d", part_id));
+ if ((result= file->ha_rnd_init(1)))
+ break;
+ late_extra_cache(part_id);
+ }
+
+end:
+ m_part_spec.start_part= NO_CURRENT_PART_ID;
+end_dont_reset_start_part:
+ table->status= STATUS_NOT_FOUND;
+ DBUG_RETURN(result);
+}
+
+
+/*
+ Save position of current row
+
+ SYNOPSIS
+ position()
+ record Current record in MySQL Row Format
+
+ RETURN VALUE
+ NONE
+
+ DESCRIPTION
+ position() is called after each call to rnd_next() if the data needs
+ to be ordered. You can do something like the following to store
+ the position:
+ ha_store_ptr(ref, ref_length, current_position);
+
+ The server uses ref to store data. ref_length in the above case is
+ the size needed to store current_position. ref is just a byte array
+ that the server will maintain. If you are using offsets to mark rows, then
+    current_position should be the offset. If you are using a primary key
+    like in BDB, then the stored position needs to be that primary key.
+
+ Called from filesort.cc, sql_select.cc, sql_delete.cc and sql_update.cc.
+*/
+
+void ha_partition::position(const uchar *record)
+{
+ handler *file= m_file[m_last_part];
+ DBUG_ENTER("ha_partition::position");
+
+ file->position(record);
+ int2store(ref, m_last_part);
+ memcpy((ref + PARTITION_BYTES_IN_POS), file->ref,
+ (ref_length - PARTITION_BYTES_IN_POS));
+
+#ifdef SUPPORTING_PARTITION_OVER_DIFFERENT_ENGINES
+#ifdef HAVE_purify
+ bzero(ref + PARTITION_BYTES_IN_POS + ref_length,
+ max_ref_length-ref_length);
+#endif /* HAVE_purify */
+#endif
+ DBUG_VOID_RETURN;
+}
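+
+#ifdef NOT_USED
+/*
+  Illustrative sketch, not used by the handler: the position produced by
+  position() above is [2 byte partition id][underlying handler ref], and
+  rnd_pos() below performs the inverse. The encoding step in isolation,
+  with example-local names:
+*/
+static void example_store_position(uchar *ref, uint part_id,
+                                   const uchar *part_ref,
+                                   uint part_ref_length)
+{
+  int2store(ref, part_id);                        /* Partition id first */
+  memcpy(ref + PARTITION_BYTES_IN_POS, part_ref,  /* Then the inner ref */
+         part_ref_length);
+}
+#endif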
+
+
+void ha_partition::column_bitmaps_signal()
+{
+ handler::column_bitmaps_signal();
+ bitmap_union(table->read_set, &m_part_info->full_part_field_set);
+}
+
+
+/*
+ Read row using position
+
+ SYNOPSIS
+ rnd_pos()
+ out:buf Row read in MySQL Row Format
+ position Position of read row
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+
+ DESCRIPTION
+ This is like rnd_next, but you are given a position to use
+ to determine the row. The position will be of the type that you stored in
+ ref. You can use ha_get_ptr(pos,ref_length) to retrieve whatever key
+ or position you saved when position() was called.
+ Called from filesort.cc records.cc sql_insert.cc sql_select.cc
+ sql_update.cc.
+*/
+
+int ha_partition::rnd_pos(uchar * buf, uchar *pos)
+{
+ uint part_id;
+ handler *file;
+ DBUG_ENTER("ha_partition::rnd_pos");
+
+ part_id= uint2korr((const uchar *) pos);
+ DBUG_ASSERT(part_id < m_tot_parts);
+ file= m_file[part_id];
+ m_last_part= part_id;
+ DBUG_RETURN(file->rnd_pos(buf, (pos + PARTITION_BYTES_IN_POS)));
+}
+
+
+/*
+ Read row using position using given record to find
+
+ SYNOPSIS
+ rnd_pos_by_record()
+ record Current record in MySQL Row Format
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+
+ DESCRIPTION
+    This works like the position()+rnd_pos() pair of functions, but does
+    some extra work: it calculates m_last_part, the partition to which
+    'record' belongs.
+
+    Called from replication (log_event.cc)
+*/
+
+int ha_partition::rnd_pos_by_record(uchar *record)
+{
+ DBUG_ENTER("ha_partition::rnd_pos_by_record");
+
+ if (unlikely(get_part_for_delete(record, m_rec0, m_part_info, &m_last_part)))
+ DBUG_RETURN(1);
+
+ DBUG_RETURN(handler::rnd_pos_by_record(record));
+}
+
+
+/****************************************************************************
+ MODULE index scan
+****************************************************************************/
+/*
+ Positions an index cursor to the index specified in the handle. Fetches the
+ row if available. If the key value is null, begin at the first key of the
+ index.
+
+ There are loads of optimisations possible here for the partition handler.
+ The same optimisations can also be checked for full table scan although
+ only through conditions and not from index ranges.
+ Phase one optimisations:
+ Check if the fields of the partition function are bound. If so only use
+ the single partition it becomes bound to.
+ Phase two optimisations:
+    If it can be deduced through range or list partitioning that only a
+ subset of the partitions are used, then only use those partitions.
+*/
+
+/*
+ Initialise handler before start of index scan
+
+ SYNOPSIS
+ index_init()
+ inx Index number
+    sorted                 Whether rows are to be returned in sorted order
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+
+ DESCRIPTION
+ index_init is always called before starting index scans (except when
+ starting through index_read_idx and using read_range variants).
+*/
+
+int ha_partition::index_init(uint inx, bool sorted)
+{
+ int error= 0;
+ handler **file;
+ DBUG_ENTER("ha_partition::index_init");
+
+ active_index= inx;
+ m_part_spec.start_part= NO_CURRENT_PART_ID;
+ m_start_key.length= 0;
+ m_ordered= sorted;
+ m_curr_key_info= table->key_info+inx;
+ /*
+ Some handlers only read fields as specified by the bitmap for the
+ read set. For partitioned handlers we always require that the
+ fields of the partition functions are read such that we can
+ calculate the partition id to place updated and deleted records.
+    But this is only required for operations that may need to change data.
+ */
+ if (m_lock_type == F_WRLCK)
+ bitmap_union(table->read_set, &m_part_info->full_part_field_set);
+ else if (sorted)
+ {
+ /*
+ An ordered scan is requested. We must make sure all fields of the
+ used index are in the read set, as partitioning requires them for
+ sorting (see ha_partition::handle_ordered_index_scan).
+
+ The SQL layer may request an ordered index scan without having index
+ fields in the read set when
+ - it needs to do an ordered scan over an index prefix.
+ - it evaluates ORDER BY with SELECT COUNT(*) FROM t1.
+
+ TODO: handle COUNT(*) queries via unordered scan.
+ */
+ uint i;
+ for (i= 0; i < m_curr_key_info->key_parts; i++)
+ bitmap_set_bit(table->read_set,
+ m_curr_key_info->key_part[i].field->field_index);
+ }
+ file= m_file;
+ do
+ {
+ /* TODO RONM: Change to index_init() when code is stable */
+ if (bitmap_is_set(&(m_part_info->used_partitions), (file - m_file)))
+ if ((error= (*file)->ha_index_init(inx, sorted)))
+ {
+ DBUG_ASSERT(0); // Should never happen
+ break;
+ }
+ } while (*(++file));
+ DBUG_RETURN(error);
+}
+
+
+/*
+ End of index scan
+
+ SYNOPSIS
+ index_end()
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+
+ DESCRIPTION
+    index_end is called at the end of an index scan to clean up anything
+    the scan has set up.
+*/
+
+int ha_partition::index_end()
+{
+ int error= 0;
+ handler **file;
+ DBUG_ENTER("ha_partition::index_end");
+
+ active_index= MAX_KEY;
+ m_part_spec.start_part= NO_CURRENT_PART_ID;
+ file= m_file;
+ do
+ {
+ int tmp;
+ /* TODO RONM: Change to index_end() when code is stable */
+ if (bitmap_is_set(&(m_part_info->used_partitions), (file - m_file)))
+ if ((tmp= (*file)->ha_index_end()))
+ error= tmp;
+ } while (*(++file));
+ DBUG_RETURN(error);
+}
+
+
+/*
+ Read one record in an index scan and start an index scan
+
+ SYNOPSIS
+ index_read()
+ buf Read row in MySQL Row Format
+ key Key parts in consecutive order
+ key_len Total length of key parts
+ find_flag What type of key condition is used
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+
+ DESCRIPTION
+ index_read starts a new index scan using a start key. The MySQL Server
+ will check the end key on its own. Thus to function properly the
+    partitioned handler needs to ensure that it delivers records in the sort
+    order of the MySQL Server.
+    index_read can be restarted without calling index_end on the previous
+    index scan and without calling index_init. In this case the index_read
+    is on the same index as the previous index scan. This is used
+    particularly in conjunction with multi read ranges.
+*/
+
+int ha_partition::index_read_map(uchar *buf, const uchar *key,
+ key_part_map keypart_map,
+ enum ha_rkey_function find_flag)
+{
+ DBUG_ENTER("ha_partition::index_read_map");
+
+ end_range= 0;
+ m_index_scan_type= partition_index_read;
+ DBUG_RETURN(common_index_read(buf, key, keypart_map, find_flag));
+}
+
+
+/*
+ Common routine for a number of index_read variants
+
+ SYNOPSIS
+ common_index_read
+
+ see index_read for rest
+*/
+
+int ha_partition::common_index_read(uchar *buf, const uchar *key,
+ key_part_map keypart_map,
+ enum ha_rkey_function find_flag)
+{
+ int error;
+ bool reverse_order= FALSE;
+ uint key_len= calculate_key_len(table, active_index, key, keypart_map);
+ DBUG_ENTER("ha_partition::common_index_read");
+
+ memcpy((void*)m_start_key.key, key, key_len);
+ m_start_key.keypart_map= keypart_map;
+ m_start_key.length= key_len;
+ m_start_key.flag= find_flag;
+
+ if ((error= partition_scan_set_up(buf, TRUE)))
+ {
+ DBUG_RETURN(error);
+ }
+ if (find_flag == HA_READ_PREFIX_LAST ||
+ find_flag == HA_READ_PREFIX_LAST_OR_PREV ||
+ find_flag == HA_READ_BEFORE_KEY)
+ {
+ reverse_order= TRUE;
+ m_ordered_scan_ongoing= TRUE;
+ }
+ if (!m_ordered_scan_ongoing ||
+ (find_flag == HA_READ_KEY_EXACT &&
+ (key_len >= m_curr_key_info->key_length ||
+ key_len == 0)))
+ {
+ /*
+ We use unordered index scan either when read_range is used and flag
+ is set to not use ordered or when an exact key is used and in this
+ case all records will be sorted equal and thus the sort order of the
+ resulting records doesn't matter.
+ We also use an unordered index scan when the number of partitions to
+ scan is only one.
+ The unordered index scan will use the partition set created.
+ Need to set unordered scan ongoing since we can come here even when
+ it isn't set.
+ */
+ m_ordered_scan_ongoing= FALSE;
+ error= handle_unordered_scan_next_partition(buf);
+ }
+ else
+ {
+ /*
+ In all other cases we will use the ordered index scan. This will use
+ the partition set created by the get_partition_set method.
+ */
+ error= handle_ordered_index_scan(buf, reverse_order);
+ }
+ DBUG_RETURN(error);
+}
+
+
+/*
+ Start an index scan from leftmost record and return first record
+
+ SYNOPSIS
+ index_first()
+ buf Read row in MySQL Row Format
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+
+ DESCRIPTION
+ index_first() asks for the first key in the index.
+ This is similar to index_read except that there is no start key since
+ the scan starts from the leftmost entry and proceeds forward with
+ index_next.
+
+ Called from opt_range.cc, opt_sum.cc, sql_handler.cc,
+ and sql_select.cc.
+*/
+
+int ha_partition::index_first(uchar * buf)
+{
+ DBUG_ENTER("ha_partition::index_first");
+
+ end_range= 0;
+ m_index_scan_type= partition_index_first;
+ DBUG_RETURN(common_first_last(buf));
+}
+
+
+/*
+ Start an index scan from rightmost record and return first record
+
+ SYNOPSIS
+ index_last()
+ buf Read row in MySQL Row Format
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+
+ DESCRIPTION
+ index_last() asks for the last key in the index.
+ This is similar to index_read except that there is no start key since
+ the scan starts from the rightmost entry and proceeds forward with
+ index_prev.
+
+ Called from opt_range.cc, opt_sum.cc, sql_handler.cc,
+ and sql_select.cc.
+*/
+
+int ha_partition::index_last(uchar * buf)
+{
+ DBUG_ENTER("ha_partition::index_last");
+
+ m_index_scan_type= partition_index_last;
+ DBUG_RETURN(common_first_last(buf));
+}
+
+/*
+ Common routine for index_first/index_last
+
+ SYNOPSIS
+    common_first_last()
+
+ see index_first for rest
+*/
+
+int ha_partition::common_first_last(uchar *buf)
+{
+ int error;
+
+ if ((error= partition_scan_set_up(buf, FALSE)))
+ return error;
+ if (!m_ordered_scan_ongoing &&
+ m_index_scan_type != partition_index_last)
+ return handle_unordered_scan_next_partition(buf);
+ return handle_ordered_index_scan(buf, FALSE);
+}
+
+
+/*
+ Read last using key
+
+ SYNOPSIS
+ index_read_last()
+ buf Read row in MySQL Row Format
+ key Key
+ keypart_map Which part of key is used
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+
+ DESCRIPTION
+ This is used in join_read_last_key to optimise away an ORDER BY.
+ Can only be used on indexes supporting HA_READ_ORDER
+*/
+
+int ha_partition::index_read_last_map(uchar *buf, const uchar *key,
+ key_part_map keypart_map)
+{
+ DBUG_ENTER("ha_partition::index_read_last");
+
+ m_ordered= TRUE; // Safety measure
+ end_range= 0;
+ m_index_scan_type= partition_index_read_last;
+ DBUG_RETURN(common_index_read(buf, key, keypart_map, HA_READ_PREFIX_LAST));
+}
+
+
+/*
+ Read next record in a forward index scan
+
+ SYNOPSIS
+ index_next()
+ buf Read row in MySQL Row Format
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+
+ DESCRIPTION
+ Used to read forward through the index.
+*/
+
+int ha_partition::index_next(uchar * buf)
+{
+ DBUG_ENTER("ha_partition::index_next");
+
+ /*
+ TODO(low priority):
+ If we want partition to work with the HANDLER commands, we
+ must be able to do index_last() -> index_prev() -> index_next()
+ */
+ DBUG_ASSERT(m_index_scan_type != partition_index_last);
+ if (!m_ordered_scan_ongoing)
+ {
+ DBUG_RETURN(handle_unordered_next(buf, FALSE));
+ }
+ DBUG_RETURN(handle_ordered_next(buf, FALSE));
+}
+
+
+/*
+ Read next record special
+
+ SYNOPSIS
+ index_next_same()
+ buf Read row in MySQL Row Format
+ key Key
+ keylen Length of key
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+
+ DESCRIPTION
+ This routine is used to read the next but only if the key is the same
+ as supplied in the call.
+*/
+
+int ha_partition::index_next_same(uchar *buf, const uchar *key, uint keylen)
+{
+ DBUG_ENTER("ha_partition::index_next_same");
+
+ DBUG_ASSERT(keylen == m_start_key.length);
+ DBUG_ASSERT(m_index_scan_type != partition_index_last);
+ if (!m_ordered_scan_ongoing)
+ DBUG_RETURN(handle_unordered_next(buf, TRUE));
+ DBUG_RETURN(handle_ordered_next(buf, TRUE));
+}
+
+
+/*
+ Read next record when performing index scan backwards
+
+ SYNOPSIS
+ index_prev()
+ buf Read row in MySQL Row Format
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+
+ DESCRIPTION
+ Used to read backwards through the index.
+*/
+
+int ha_partition::index_prev(uchar * buf)
+{
+ DBUG_ENTER("ha_partition::index_prev");
+
+ /* TODO: read comment in index_next */
+ DBUG_ASSERT(m_index_scan_type != partition_index_first);
+ DBUG_RETURN(handle_ordered_prev(buf));
+}
+
+
+/*
+ Start a read of one range with start and end key
+
+ SYNOPSIS
+ read_range_first()
+ start_key Specification of start key
+ end_key Specification of end key
+ eq_range_arg Is it equal range
+ sorted Should records be returned in sorted order
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+
+ DESCRIPTION
+ We reimplement read_range_first since we don't want the compare_key
+ check at the end. This is already performed in the partition handler.
+    read_range_next is very different, since we need to scan
+    all underlying handlers.
+*/
+
+int ha_partition::read_range_first(const key_range *start_key,
+ const key_range *end_key,
+ bool eq_range_arg, bool sorted)
+{
+ int error;
+ DBUG_ENTER("ha_partition::read_range_first");
+
+ m_ordered= sorted;
+ eq_range= eq_range_arg;
+ end_range= 0;
+ if (end_key)
+ {
+ end_range= &save_end_range;
+ save_end_range= *end_key;
+ key_compare_result_on_equal=
+ ((end_key->flag == HA_READ_BEFORE_KEY) ? 1 :
+ (end_key->flag == HA_READ_AFTER_KEY) ? -1 : 0);
+ }
+ range_key_part= m_curr_key_info->key_part;
+
+ if (!start_key) // Read first record
+ {
+ if (m_ordered)
+ m_index_scan_type= partition_index_first;
+ else
+ m_index_scan_type= partition_index_first_unordered;
+ error= common_first_last(m_rec0);
+ }
+ else
+ {
+ m_index_scan_type= partition_index_read;
+ error= common_index_read(m_rec0,
+ start_key->key,
+ start_key->keypart_map, start_key->flag);
+ }
+ DBUG_RETURN(error);
+}
+
+
+/*
+ Read next record in read of a range with start and end key
+
+ SYNOPSIS
+ read_range_next()
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+*/
+
+int ha_partition::read_range_next()
+{
+ DBUG_ENTER("ha_partition::read_range_next");
+
+ if (m_ordered)
+ {
+ DBUG_RETURN(handler::read_range_next());
+ }
+ DBUG_RETURN(handle_unordered_next(m_rec0, eq_range));
+}
+
+
+/*
+ Common routine to set up scans
+
+  SYNOPSIS
+    partition_scan_set_up()
+    buf                  Buffer to later return record in
+ idx_read_flag Is it index scan
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+
+ DESCRIPTION
+    This is where we check which partitions to actually scan, if not all
+    of them are to be scanned.
+*/
+
+int ha_partition::partition_scan_set_up(uchar * buf, bool idx_read_flag)
+{
+ DBUG_ENTER("ha_partition::partition_scan_set_up");
+
+ if (idx_read_flag)
+ get_partition_set(table,buf,active_index,&m_start_key,&m_part_spec);
+ else
+ {
+ m_part_spec.start_part= 0;
+ m_part_spec.end_part= m_tot_parts - 1;
+ }
+ if (m_part_spec.start_part > m_part_spec.end_part)
+ {
+ /*
+ We discovered a partition set but the set was empty so we report
+ key not found.
+ */
+ DBUG_PRINT("info", ("scan with no partition to scan"));
+ DBUG_RETURN(HA_ERR_END_OF_FILE);
+ }
+ if (m_part_spec.start_part == m_part_spec.end_part)
+ {
+ /*
+ We discovered a single partition to scan, this never needs to be
+ performed using the ordered index scan.
+ */
+ DBUG_PRINT("info", ("index scan using the single partition %d",
+ m_part_spec.start_part));
+ m_ordered_scan_ongoing= FALSE;
+ }
+ else
+ {
+ /*
+      Set m_ordered_scan_ongoing according to how the scan should be done.
+      Only exact partitions are discovered by get_partition_set at the
+      moment. Verify this; also the bitmap must have at least one bit set,
+      otherwise the result from this table is the empty set.
+ */
+ uint start_part= bitmap_get_first_set(&(m_part_info->used_partitions));
+ if (start_part == MY_BIT_NONE)
+ {
+ DBUG_PRINT("info", ("scan with no partition to scan"));
+ DBUG_RETURN(HA_ERR_END_OF_FILE);
+ }
+ if (start_part > m_part_spec.start_part)
+ m_part_spec.start_part= start_part;
+ DBUG_ASSERT(m_part_spec.start_part < m_tot_parts);
+ m_ordered_scan_ongoing= m_ordered;
+ }
+ DBUG_ASSERT(m_part_spec.start_part < m_tot_parts &&
+ m_part_spec.end_part < m_tot_parts);
+ DBUG_RETURN(0);
+}
+
+
+/****************************************************************************
+ Unordered Index Scan Routines
+****************************************************************************/
+/*
+ Common routine to handle index_next with unordered results
+
+ SYNOPSIS
+ handle_unordered_next()
+ out:buf Read row in MySQL Row Format
+ next_same Called from index_next_same
+
+ RETURN VALUE
+ HA_ERR_END_OF_FILE End of scan
+ 0 Success
+ other Error code
+
+ DESCRIPTION
+ These routines are used to scan partitions without considering order.
+ This is performed in two situations.
+ 1) In read_multi_range this is the normal case
+    2) When performing any type of index_read, index_first, index_last where
+       all fields in the partition function are bound. In this case the index
+ scan is performed on only one partition and thus it isn't necessary to
+ perform any sort.
+*/
+
+int ha_partition::handle_unordered_next(uchar *buf, bool is_next_same)
+{
+ handler *file= m_file[m_part_spec.start_part];
+ int error;
+ DBUG_ENTER("ha_partition::handle_unordered_next");
+
+ /*
+ We should consider if this should be split into two functions as
+    next_same is always a local constant
+ */
+ if (is_next_same)
+ {
+ if (!(error= file->index_next_same(buf, m_start_key.key,
+ m_start_key.length)))
+ {
+ m_last_part= m_part_spec.start_part;
+ DBUG_RETURN(0);
+ }
+ }
+ else if (!(error= file->index_next(buf)))
+ {
+ if (!(file->index_flags(active_index, 0, 1) & HA_READ_ORDER) ||
+ compare_key(end_range) <= 0)
+ {
+ m_last_part= m_part_spec.start_part;
+ DBUG_RETURN(0); // Row was in range
+ }
+ error= HA_ERR_END_OF_FILE;
+ }
+
+ if (error == HA_ERR_END_OF_FILE)
+ {
+ m_part_spec.start_part++; // Start using next part
+ error= handle_unordered_scan_next_partition(buf);
+ }
+ DBUG_RETURN(error);
+}
+
+
+/*
+ Handle index_next when changing to new partition
+
+ SYNOPSIS
+ handle_unordered_scan_next_partition()
+ buf Read row in MySQL Row Format
+
+ RETURN VALUE
+ HA_ERR_END_OF_FILE End of scan
+ 0 Success
+ other Error code
+
+ DESCRIPTION
+ This routine is used to start the index scan on the next partition.
+ Both initial start and after completing scan on one partition.
+*/
+
+int ha_partition::handle_unordered_scan_next_partition(uchar * buf)
+{
+ uint i;
+ DBUG_ENTER("ha_partition::handle_unordered_scan_next_partition");
+
+ for (i= m_part_spec.start_part; i <= m_part_spec.end_part; i++)
+ {
+ int error;
+ handler *file;
+
+ if (!(bitmap_is_set(&(m_part_info->used_partitions), i)))
+ continue;
+ file= m_file[i];
+ m_part_spec.start_part= i;
+ switch (m_index_scan_type) {
+ case partition_index_read:
+ DBUG_PRINT("info", ("index_read on partition %d", i));
+ error= file->index_read_map(buf, m_start_key.key,
+ m_start_key.keypart_map,
+ m_start_key.flag);
+ break;
+ case partition_index_first:
+ DBUG_PRINT("info", ("index_first on partition %d", i));
+ error= file->index_first(buf);
+ break;
+ case partition_index_first_unordered:
+ /*
+ We perform a scan without sorting and this means that we
+ should not use the index_first since not all handlers
+ support it and it is also unnecessary to restrict sort
+ order.
+ */
+ DBUG_PRINT("info", ("read_range_first on partition %d", i));
+ table->record[0]= buf;
+ error= file->read_range_first(0, end_range, eq_range, 0);
+ table->record[0]= m_rec0;
+ break;
+ default:
+ DBUG_ASSERT(FALSE);
+ DBUG_RETURN(1);
+ }
+ if (!error)
+ {
+ if (!(file->index_flags(active_index, 0, 1) & HA_READ_ORDER) ||
+ compare_key(end_range) <= 0)
+ {
+ m_last_part= i;
+ DBUG_RETURN(0);
+ }
+ error= HA_ERR_END_OF_FILE;
+ }
+ if ((error != HA_ERR_END_OF_FILE) && (error != HA_ERR_KEY_NOT_FOUND))
+ DBUG_RETURN(error);
+ DBUG_PRINT("info", ("HA_ERR_END_OF_FILE on partition %d", i));
+ }
+ m_part_spec.start_part= NO_CURRENT_PART_ID;
+ DBUG_RETURN(HA_ERR_END_OF_FILE);
+}
+
+
+/*
+ Common routine to start index scan with ordered results
+
+ SYNOPSIS
+ handle_ordered_index_scan()
+ out:buf Read row in MySQL Row Format
+
+ RETURN VALUE
+ HA_ERR_END_OF_FILE End of scan
+ 0 Success
+ other Error code
+
+ DESCRIPTION
+ This part contains the logic to handle index scans that require ordered
+ output. This includes all except those started by read_range_first with
+ the flag ordered set to FALSE. Thus most direct index_read and all
+ index_first and index_last.
+
+ We implement ordering by keeping one record plus a key buffer for each
+ partition. Every time a new entry is requested we will fetch a new
+ entry from the partition that is currently not filled with an entry.
+ Then the entry is put into its proper sort position.
+
+    Returning a record is done by getting the top record, copying the
+    record to the request buffer and noting the partition it came from,
+    so that its entry can be refilled on the next call.
+*/
+
+int ha_partition::handle_ordered_index_scan(uchar *buf, bool reverse_order)
+{
+ uint i;
+ uint j= 0;
+ bool found= FALSE;
+ DBUG_ENTER("ha_partition::handle_ordered_index_scan");
+
+ m_top_entry= NO_CURRENT_PART_ID;
+ queue_remove_all(&m_queue);
+
+ DBUG_PRINT("info", ("m_part_spec.start_part %d", m_part_spec.start_part));
+ for (i= m_part_spec.start_part; i <= m_part_spec.end_part; i++)
+ {
+ if (!(bitmap_is_set(&(m_part_info->used_partitions), i)))
+ continue;
+ uchar *rec_buf_ptr= rec_buf(i);
+ int error;
+ handler *file= m_file[i];
+
+ switch (m_index_scan_type) {
+ case partition_index_read:
+ error= file->index_read_map(rec_buf_ptr,
+ m_start_key.key,
+ m_start_key.keypart_map,
+ m_start_key.flag);
+ break;
+ case partition_index_first:
+ error= file->index_first(rec_buf_ptr);
+ reverse_order= FALSE;
+ break;
+ case partition_index_last:
+ error= file->index_last(rec_buf_ptr);
+ reverse_order= TRUE;
+ break;
+ case partition_index_read_last:
+ error= file->index_read_last_map(rec_buf_ptr,
+ m_start_key.key,
+ m_start_key.keypart_map);
+ reverse_order= TRUE;
+ break;
+ default:
+ DBUG_ASSERT(FALSE);
+ DBUG_RETURN(HA_ERR_END_OF_FILE);
+ }
+ if (!error)
+ {
+ found= TRUE;
+ /*
+ Initialise queue without order first, simply insert
+ */
+ queue_element(&m_queue, j++)= (uchar*)queue_buf(i);
+ }
+ else if (error != HA_ERR_KEY_NOT_FOUND && error != HA_ERR_END_OF_FILE)
+ {
+ DBUG_RETURN(error);
+ }
+ }
+ if (found)
+ {
+ /*
+ We found at least one partition with data, now sort all entries and
+ after that read the first entry and copy it to the buffer to return in.
+ */
+ queue_set_max_at_top(&m_queue, reverse_order);
+ queue_set_cmp_arg(&m_queue, (void*)m_curr_key_info);
+ m_queue.elements= j;
+ queue_fix(&m_queue);
+ return_top_record(buf);
+ table->status= 0;
+ DBUG_PRINT("info", ("Record returned from partition %d", m_top_entry));
+ DBUG_RETURN(0);
+ }
+ DBUG_RETURN(HA_ERR_END_OF_FILE);
+}
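+
+
+/*
+  The ordered scan above is in essence a k-way merge: every used partition
+  contributes its current row, and a priority queue keyed on the index
+  columns yields the globally smallest (or largest) row. Below is a minimal
+  standalone sketch of the same idea, kept out of the build like the other
+  illustrative code in this file; it merges plain ints with
+  std::priority_queue instead of MySQL key buffers.
+*/
+#ifdef NOT_USED
+#include <queue>
+#include <vector>
+#include <utility>
+#include <functional>
+
+static std::vector<int> k_way_merge(const std::vector<std::vector<int> > &streams)
+{
+  typedef std::pair<int, size_t> entry;            // (value, stream index)
+  std::priority_queue<entry, std::vector<entry>,
+                      std::greater<entry> > queue; // min-heap => ascending
+  std::vector<size_t> pos(streams.size(), 0);
+  std::vector<int> result;
+
+  /* Prime the queue with the first entry of every stream, as the loop in
+     handle_ordered_index_scan() does for every used partition. */
+  for (size_t i= 0; i < streams.size(); i++)
+    if (!streams[i].empty())
+      queue.push(entry(streams[i][0], i));
+
+  while (!queue.empty())
+  {
+    entry top= queue.top();
+    queue.pop();
+    result.push_back(top.first);
+    size_t i= top.second;
+    /* Refill from the stream just consumed, as handle_ordered_next() does */
+    if (++pos[i] < streams[i].size())
+      queue.push(entry(streams[i][pos[i]], i));
+  }
+  return result;
+}
+#endif /* NOT_USED */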
+
+
+/*
+ Return the top record in sort order
+
+ SYNOPSIS
+ return_top_record()
+ out:buf Row returned in MySQL Row Format
+
+ RETURN VALUE
+ NONE
+*/
+
+void ha_partition::return_top_record(uchar *buf)
+{
+ uint part_id;
+ uchar *key_buffer= queue_top(&m_queue);
+ uchar *rec_buffer= key_buffer + PARTITION_BYTES_IN_POS;
+
+ part_id= uint2korr(key_buffer);
+ memcpy(buf, rec_buffer, m_rec_length);
+ m_last_part= part_id;
+ m_top_entry= part_id;
+}
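+
+
+/*
+  Each queue element used above is a buffer whose first
+  PARTITION_BYTES_IN_POS (two) bytes hold the partition id low byte first,
+  followed by the row itself; uint2korr() reads that prefix back. A hedged
+  standalone sketch of that encode/decode, assuming the two-byte layout:
+*/
+#ifdef NOT_USED
+#include <string.h>
+
+/* Layout: [2-byte partition id][record bytes] */
+static void encode_queue_entry(unsigned char *entry, unsigned part_id,
+                               const unsigned char *rec, size_t rec_length)
+{
+  entry[0]= (unsigned char) (part_id & 0xFF);              /* as int2store() */
+  entry[1]= (unsigned char) ((part_id >> 8) & 0xFF);
+  memcpy(entry + 2, rec, rec_length);
+}
+
+static unsigned decode_queue_part_id(const unsigned char *entry)
+{
+  return (unsigned) entry[0] | ((unsigned) entry[1] << 8); /* as uint2korr() */
+}
+#endif /* NOT_USED */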
+
+
+/*
+ Common routine to handle index_next with ordered results
+
+ SYNOPSIS
+ handle_ordered_next()
+ out:buf Read row in MySQL Row Format
+    is_next_same                  Called from index_next_same
+
+ RETURN VALUE
+ HA_ERR_END_OF_FILE End of scan
+ 0 Success
+ other Error code
+*/
+
+int ha_partition::handle_ordered_next(uchar *buf, bool is_next_same)
+{
+ int error;
+ uint part_id= m_top_entry;
+ handler *file= m_file[part_id];
+ DBUG_ENTER("ha_partition::handle_ordered_next");
+
+ if (!is_next_same)
+ error= file->index_next(rec_buf(part_id));
+ else
+ error= file->index_next_same(rec_buf(part_id), m_start_key.key,
+ m_start_key.length);
+ if (error)
+ {
+ if (error == HA_ERR_END_OF_FILE)
+ {
+ /* Return next buffered row */
+ queue_remove(&m_queue, (uint) 0);
+ if (m_queue.elements)
+ {
+ DBUG_PRINT("info", ("Record returned from partition %u (2)",
+ m_top_entry));
+ return_top_record(buf);
+ table->status= 0;
+ error= 0;
+ }
+ }
+ DBUG_RETURN(error);
+ }
+ queue_replaced(&m_queue);
+ return_top_record(buf);
+ DBUG_PRINT("info", ("Record returned from partition %u", m_top_entry));
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Common routine to handle index_prev with ordered results
+
+ SYNOPSIS
+ handle_ordered_prev()
+ out:buf Read row in MySQL Row Format
+
+ RETURN VALUE
+ HA_ERR_END_OF_FILE End of scan
+ 0 Success
+ other Error code
+*/
+
+int ha_partition::handle_ordered_prev(uchar *buf)
+{
+ int error;
+ uint part_id= m_top_entry;
+ handler *file= m_file[part_id];
+ DBUG_ENTER("ha_partition::handle_ordered_prev");
+
+ if ((error= file->index_prev(rec_buf(part_id))))
+ {
+ if (error == HA_ERR_END_OF_FILE)
+ {
+ queue_remove(&m_queue, (uint) 0);
+ if (m_queue.elements)
+ {
+ return_top_record(buf);
+ DBUG_PRINT("info", ("Record returned from partition %d (2)",
+ m_top_entry));
+ error= 0;
+ table->status= 0;
+ }
+ }
+ DBUG_RETURN(error);
+ }
+ queue_replaced(&m_queue);
+ return_top_record(buf);
+ DBUG_PRINT("info", ("Record returned from partition %d", m_top_entry));
+ DBUG_RETURN(0);
+}
+
+
+/****************************************************************************
+ MODULE information calls
+****************************************************************************/
+
+/*
+ These are all first approximations of the extra, info, scan_time
+ and read_time calls
+*/
+
+/*
+ General method to gather info from handler
+
+ SYNOPSIS
+ info()
+ flag Specifies what info is requested
+
+ RETURN VALUE
+ NONE
+
+  DESCRIPTION
+    ::info() is used to return information to the optimizer.
+    Currently this table handler doesn't implement most of the fields
+    really needed. SHOW also makes use of this data.
+    Another note: if your handler doesn't provide an exact record count,
+    you will probably want to have the following in your code:
+    if (records < 2)
+      records = 2;
+    The reason is that the server will optimize for the case of only a
+    single record. If in a table scan you don't know the number of records,
+    it will probably be better to set records to two so you can return
+    as many records as you need.
+
+ Along with records a few more variables you may wish to set are:
+ records
+ deleted
+ data_file_length
+ index_file_length
+ delete_length
+ check_time
+ Take a look at the public variables in handler.h for more information.
+
+ Called in:
+ filesort.cc
+ ha_heap.cc
+ item_sum.cc
+ opt_sum.cc
+ sql_delete.cc
+ sql_delete.cc
+ sql_derived.cc
+ sql_select.cc
+ sql_select.cc
+ sql_select.cc
+ sql_select.cc
+ sql_select.cc
+ sql_show.cc
+ sql_show.cc
+ sql_show.cc
+ sql_show.cc
+ sql_table.cc
+ sql_union.cc
+ sql_update.cc
+
+  Some flags that are not implemented
+    HA_STATUS_POS:
+      This parameter is never used from the MySQL Server. It is checked in
+      one place in MyISAM, so it could potentially be used by MyISAM-specific
+      programs.
+    HA_STATUS_NO_LOCK:
+      This is declared and often used, but only MyISAM acts on it.
+      It means that MySQL doesn't need the absolute latest statistics
+      information. This may save the handler from doing internal locks while
+      retrieving statistics data.
+*/
+
+int ha_partition::info(uint flag)
+{
+ handler *file, **file_array;
+ DBUG_ENTER("ha_partition:info");
+
+ if (flag & HA_STATUS_AUTO)
+ {
+ ulonglong auto_increment_value= 0;
+ DBUG_PRINT("info", ("HA_STATUS_AUTO"));
+ file_array= m_file;
+ do
+ {
+ file= *file_array;
+ file->info(HA_STATUS_AUTO);
+ set_if_bigger(auto_increment_value, file->stats.auto_increment_value);
+ } while (*(++file_array));
+ stats.auto_increment_value= auto_increment_value;
+ }
+ if (flag & HA_STATUS_VARIABLE)
+ {
+ DBUG_PRINT("info", ("HA_STATUS_VARIABLE"));
+ /*
+ Calculates statistical variables
+ records: Estimate of number records in table
+ We report sum (always at least 2)
+ deleted: Estimate of number holes in the table due to
+ deletes
+ We report sum
+ data_file_length: Length of data file, in principle bytes in table
+ We report sum
+ index_file_length: Length of index file, in principle bytes in
+ indexes in the table
+ We report sum
+ delete_length: Length of free space easily used by new records in table
+ We report sum
+ mean_record_length:Mean record length in the table
+ We calculate this
+ check_time: Time of last check (only applicable to MyISAM)
+ We report last time of all underlying handlers
+ */
+ stats.records= 0;
+ stats.deleted= 0;
+ stats.data_file_length= 0;
+ stats.index_file_length= 0;
+ stats.check_time= 0;
+ stats.delete_length= 0;
+ file_array= m_file;
+ do
+ {
+ if (bitmap_is_set(&(m_part_info->used_partitions), (file_array - m_file)))
+ {
+ file= *file_array;
+ file->info(HA_STATUS_VARIABLE);
+ stats.records+= file->stats.records;
+ stats.deleted+= file->stats.deleted;
+ stats.data_file_length+= file->stats.data_file_length;
+ stats.index_file_length+= file->stats.index_file_length;
+ stats.delete_length+= file->stats.delete_length;
+ if (file->stats.check_time > stats.check_time)
+ stats.check_time= file->stats.check_time;
+ }
+ } while (*(++file_array));
+ if (stats.records < 2 &&
+ !(m_table_flags & HA_STATS_RECORDS_IS_EXACT))
+ stats.records= 2;
+ if (stats.records > 0)
+ stats.mean_rec_length= (ulong) (stats.data_file_length / stats.records);
+ else
+ stats.mean_rec_length= 1; //? What should we set here
+ }
+ if (flag & HA_STATUS_CONST)
+ {
+ DBUG_PRINT("info", ("HA_STATUS_CONST"));
+ /*
+ Recalculate loads of constant variables. MyISAM also sets things
+ directly on the table share object.
+
+ Check whether this should be fixed since handlers should not
+ change things directly on the table object.
+
+ Monty comment: This should NOT be changed! It's the handlers
+ responsibility to correct table->s->keys_xxxx information if keys
+ have been disabled.
+
+    The most important parameters set here are records per key for
+    all indexes, block_size and primary key ref_length.
+
+ For each index there is an array of rec_per_key.
+ As an example if we have an index with three attributes a,b and c
+ we will have an array of 3 rec_per_key.
+ rec_per_key[0] is an estimate of number of records divided by
+ number of unique values of the field a.
+ rec_per_key[1] is an estimate of the number of records divided
+ by the number of unique combinations of the fields a and b.
+ rec_per_key[2] is an estimate of the number of records divided
+ by the number of unique combinations of the fields a,b and c.
+
+ Many handlers only set the value of rec_per_key when all fields
+ are bound (rec_per_key[2] in the example above).
+
+ If the handler doesn't support statistics, it should set all of the
+ above to 0.
+
+ We will allow the first handler to set the rec_per_key and use
+ this as an estimate on the total table.
+
+ max_data_file_length: Maximum data file length
+                                We ignore it; it is only used in
+ SHOW TABLE STATUS
+ max_index_file_length: Maximum index file length
+ We ignore it since it is never used
+ block_size: Block size used
+ We set it to the value of the first handler
+ ref_length: We set this to the value calculated
+ and stored in local object
+ create_time: Creation time of table
+ Set by first handler
+
+ So we calculate these constants by using the variables on the first
+ handler.
+ */
+
+ file= m_file[0];
+ file->info(HA_STATUS_CONST);
+ stats.create_time= file->stats.create_time;
+ ref_length= m_ref_length;
+ }
+ if (flag & HA_STATUS_ERRKEY)
+ {
+ handler *file= m_file[m_last_part];
+ DBUG_PRINT("info", ("info: HA_STATUS_ERRKEY"));
+ /*
+ This flag is used to get index number of the unique index that
+ reported duplicate key
+ We will report the errkey on the last handler used and ignore the rest
+ */
+ file->info(HA_STATUS_ERRKEY);
+ if (file->errkey != (uint) -1)
+ errkey= file->errkey;
+ }
+ if (flag & HA_STATUS_TIME)
+ {
+ DBUG_PRINT("info", ("info: HA_STATUS_TIME"));
+ /*
+ This flag is used to set the latest update time of the table.
+ Used by SHOW commands
+ We will report the maximum of these times
+ */
+ stats.update_time= 0;
+ file_array= m_file;
+ do
+ {
+ file= *file_array;
+ file->info(HA_STATUS_TIME);
+ if (file->stats.update_time > stats.update_time)
+ stats.update_time= file->stats.update_time;
+ } while (*(++file_array));
+ }
+ DBUG_RETURN(0);
+}
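+
+
+/*
+  Aggregating HA_STATUS_VARIABLE above is a plain sum over the used
+  partitions, keeping the latest check_time and applying the "at least 2
+  records" rule explained earlier. A standalone sketch of that aggregation
+  over an illustrative struct (the names below are not the handler's own):
+*/
+#ifdef NOT_USED
+#include <time.h>
+
+struct part_stats
+{
+  unsigned long long records, deleted, data_file_length,
+                     index_file_length, delete_length;
+  time_t check_time;
+};
+
+static part_stats sum_part_stats(const part_stats *parts, size_t n_parts)
+{
+  part_stats total= part_stats();                /* zero-initialise */
+  for (size_t i= 0; i < n_parts; i++)
+  {
+    total.records+=           parts[i].records;
+    total.deleted+=           parts[i].deleted;
+    total.data_file_length+=  parts[i].data_file_length;
+    total.index_file_length+= parts[i].index_file_length;
+    total.delete_length+=     parts[i].delete_length;
+    if (parts[i].check_time > total.check_time)  /* latest check wins */
+      total.check_time= parts[i].check_time;
+  }
+  if (total.records < 2)                         /* optimizer treats 0/1 */
+    total.records= 2;                            /* rows as special cases */
+  return total;
+}
+#endif /* NOT_USED */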
+
+
+void ha_partition::get_dynamic_partition_info(PARTITION_INFO *stat_info,
+ uint part_id)
+{
+ handler *file= m_file[part_id];
+ file->info(HA_STATUS_CONST | HA_STATUS_TIME | HA_STATUS_VARIABLE |
+ HA_STATUS_NO_LOCK);
+
+ stat_info->records= file->stats.records;
+ stat_info->mean_rec_length= file->stats.mean_rec_length;
+ stat_info->data_file_length= file->stats.data_file_length;
+ stat_info->max_data_file_length= file->stats.max_data_file_length;
+ stat_info->index_file_length= file->stats.index_file_length;
+ stat_info->delete_length= file->stats.delete_length;
+ stat_info->create_time= file->stats.create_time;
+ stat_info->update_time= file->stats.update_time;
+ stat_info->check_time= file->stats.check_time;
+ stat_info->check_sum= 0;
+ if (file->ha_table_flags() & HA_HAS_CHECKSUM)
+ stat_info->check_sum= file->checksum();
+ return;
+}
+
+
+/*
+ General function to prepare handler for certain behavior
+
+ SYNOPSIS
+ extra()
+ operation Operation type for extra call
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+
+ DESCRIPTION
+ extra() is called whenever the server wishes to send a hint to
+ the storage engine. The MyISAM engine implements the most hints.
+
+ We divide the parameters into the following categories:
+ 1) Parameters used by most handlers
+ 2) Parameters used by some non-MyISAM handlers
+ 3) Parameters used only by MyISAM
+ 4) Parameters only used by temporary tables for query processing
+ 5) Parameters only used by MyISAM internally
+ 6) Parameters not used at all
+ 7) Parameters only used by federated tables for query processing
+ 8) Parameters only used by NDB
+
+  The partition handler needs to handle categories 1), 2) and 3).
+
+ 1) Parameters used by most handlers
+ -----------------------------------
+ HA_EXTRA_RESET:
+ This option is used by most handlers and it resets the handler state
+ to the same state as after an open call. This includes releasing
+ any READ CACHE or WRITE CACHE or other internal buffer used.
+
+ It is called from the reset method in the handler interface. There are
+ three instances where this is called.
+    1) After completing an INSERT ... SELECT ... query the handler for the
+ table inserted into is reset
+ 2) It is called from close_thread_table which in turn is called from
+ close_thread_tables except in the case where the tables are locked
+ in which case ha_commit_stmt is called instead.
+ It is only called from here if refresh_version hasn't changed and the
+ table is not an old table when calling close_thread_table.
+ close_thread_tables is called from many places as a general clean up
+ function after completing a query.
+ 3) It is called when deleting the QUICK_RANGE_SELECT object if the
+ QUICK_RANGE_SELECT object had its own handler object. It is called
+       immediately before close of this local handler object.
+ HA_EXTRA_KEYREAD:
+ HA_EXTRA_NO_KEYREAD:
+    These parameters are used to provide an optimisation hint to the handler.
+    If HA_EXTRA_KEYREAD is set it is enough to read the index fields; for
+    many handlers this means that index-only scans can be used and it
+    is not necessary to use the real records to satisfy this part of the
+    query. Index-only scans are a very important optimisation for disk-based
+    indexes. For main-memory indexes most indexes contain a reference to the
+    record and thus KEYREAD only says that it is enough to read key fields.
+    HA_EXTRA_NO_KEYREAD disables this for the handler; HA_EXTRA_RESET will
+    also disable this option.
+ The handler will set HA_KEYREAD_ONLY in its table flags to indicate this
+ feature is supported.
+ HA_EXTRA_FLUSH:
+    Indication to flush tables to disk. It is supposed to be used to
+    ensure disk-based tables are flushed at end of query execution.
+    Currently it is never used.
+
+ 2) Parameters used by some non-MyISAM handlers
+ ----------------------------------------------
+ HA_EXTRA_KEYREAD_PRESERVE_FIELDS:
+ This is a strictly InnoDB feature that is more or less undocumented.
+ When it is activated InnoDB copies field by field from its fetch
+ cache instead of all fields in one memcpy. Have no idea what the
+ purpose of this is.
+ Cut from include/my_base.h:
+ When using HA_EXTRA_KEYREAD, overwrite only key member fields and keep
+ other fields intact. When this is off (by default) InnoDB will use memcpy
+ to overwrite entire row.
+ HA_EXTRA_IGNORE_DUP_KEY:
+ HA_EXTRA_NO_IGNORE_DUP_KEY:
+    Informs the handler that we will not stop the transaction if we get
+    duplicate key errors during insert/update.
+ Always called in pair, triggered by INSERT IGNORE and other similar
+ SQL constructs.
+ Not used by MyISAM.
+
+ 3) Parameters used only by MyISAM
+ ---------------------------------
+ HA_EXTRA_NORMAL:
+ Only used in MyISAM to reset quick mode, not implemented by any other
+ handler. Quick mode is also reset in MyISAM by HA_EXTRA_RESET.
+
+ It is called after completing a successful DELETE query if the QUICK
+ option is set.
+
+ HA_EXTRA_QUICK:
+ When the user does DELETE QUICK FROM table where-clause; this extra
+ option is called before the delete query is performed and
+ HA_EXTRA_NORMAL is called after the delete query is completed.
+    Temporary tables used internally in MySQL always set this option.
+
+    The meaning of quick mode is that when deleting in a B-tree no merging
+    of leaves is performed. This is a common method and many large DBMSs
+    actually only support this quick mode since it is very difficult to
+    merge leaves in a tree used by many threads concurrently.
+
+ HA_EXTRA_CACHE:
+ This flag is usually set with extra_opt along with a cache size.
+ The size of this buffer is set by the user variable
+ record_buffer_size. The value of this cache size is the amount of
+ data read from disk in each fetch when performing a table scan.
+ This means that before scanning a table it is normal to call
+ extra with HA_EXTRA_CACHE and when the scan is completed to call
+ HA_EXTRA_NO_CACHE to release the cache memory.
+
+    Some special care is taken when using this extra parameter since there
+    could be a write ongoing on the table in the same statement. In this
+    case one has to be careful since there might be a WRITE CACHE as
+    well. HA_EXTRA_CACHE specifies using a READ CACHE, and using a
+    READ CACHE and a WRITE CACHE at the same time is not possible.
+
+    Only MyISAM currently uses this option.
+
+ It is set when doing full table scans using rr_sequential and
+ reset when completing such a scan with end_read_record
+ (resetting means calling extra with HA_EXTRA_NO_CACHE).
+
+ It is set in filesort.cc for MyISAM internal tables and it is set in
+ a multi-update where HA_EXTRA_CACHE is called on a temporary result
+ table and after that ha_rnd_init(0) on table to be updated
+ and immediately after that HA_EXTRA_NO_CACHE on table to be updated.
+
+    Apart from that it is always used from init_read_record but not when
+    used from UPDATE statements. It is not used from DELETE statements
+    with ORDER BY and LIMIT but it is used in the normal scan loop of DELETE
+    statements. The reason is that DELETEs in MyISAM don't move
+    existing data rows.
+
+ It is also set in copy_data_between_tables when scanning the old table
+ to copy over to the new table.
+ And it is set in join_init_read_record where quick objects are used
+ to perform a scan on the table. In this case the full table scan can
+ even be performed multiple times as part of the nested loop join.
+
+    For purposes of the partition handler it is obviously necessary to have
+    special treatment of this extra call. If we simply passed this
+    extra call down to each handler we would allocate
+    cache size * number of partitions bytes of memory, which is not
+    necessary since we only scan one partition at a time when doing
+    full table scans.
+
+    Thus we first check whether the table has any MyISAM handlers; if
+    not we simply ignore the call, and if it does we record the call
+    but do not call any underlying handler yet. Then, when performing
+    the sequential scan, we check this recorded value and call
+    extra_opt whenever we start scanning a new partition.
+
+    monty: Needs to be fixed so that it's passed to all handlers when we
+           move to another partition during table scan.
+
+ HA_EXTRA_NO_CACHE:
+ When performing a UNION SELECT HA_EXTRA_NO_CACHE is called from the
+ flush method in the select_union class.
+    It is used to some extent for INSERT DELAYED inserts.
+ See HA_EXTRA_RESET_STATE for use in conjunction with delete_all_rows().
+
+    It should be ok to call HA_EXTRA_NO_CACHE on all underlying handlers
+    if they are MyISAM handlers; for other handlers we can ignore the
+    call. If no cache is in use they will quickly return after finding
+    this out. This way we also ensure that all caches are disabled and
+    none is left behind by mistake.
+    In the future this call will probably be deleted and we will instead
+    call ::reset();
+
+ HA_EXTRA_WRITE_CACHE:
+ See above, called from various places. It is mostly used when we
+ do INSERT ... SELECT
+ No special handling to save cache space is developed currently.
+
+ HA_EXTRA_PREPARE_FOR_UPDATE:
+ This is called as part of a multi-table update. When the table to be
+ updated is also scanned then this informs MyISAM handler to drop any
+ caches if dynamic records are used (fixed size records do not care
+ about this call). We pass this along to all underlying MyISAM handlers
+ and ignore it for the rest.
+
+ HA_EXTRA_PREPARE_FOR_DROP:
+ Only used by MyISAM, called in preparation for a DROP TABLE.
+    It's needed mostly on Windows, which cannot handle dropping an open file.
+ On other platforms it has the same effect as HA_EXTRA_FORCE_REOPEN.
+
+ HA_EXTRA_PREPARE_FOR_RENAME:
+ Informs the handler we are about to attempt a rename of the table.
+
+ HA_EXTRA_READCHECK:
+ HA_EXTRA_NO_READCHECK:
+    There is only one call to HA_EXTRA_NO_READCHECK, from ha_open, where it
+    says that this is not needed in SQL. The reason for this call is that
+    MyISAM sets READ_CHECK_USED in the open call, so the call is needed for
+    MyISAM to reset this feature.
+    The idea with this parameter was to inform of doing/not doing a read
+    check before applying an update. Since SQL always performs a read before
+    applying the update, no read check is needed in MyISAM either.
+
+ This is a cut from Docs/myisam.txt
+ Sometimes you might want to force an update without checking whether
+ another user has changed the record since you last read it. This is
+ somewhat dangerous, so it should ideally not be used. That can be
+ accomplished by wrapping the mi_update() call in two calls to mi_extra(),
+ using these functions:
+ HA_EXTRA_NO_READCHECK=5 No readcheck on update
+ HA_EXTRA_READCHECK=6 Use readcheck (def)
+
+ HA_EXTRA_FORCE_REOPEN:
+ Only used by MyISAM, called when altering table, closing tables to
+ enforce a reopen of the table files.
+
+ 4) Parameters only used by temporary tables for query processing
+ ----------------------------------------------------------------
+ HA_EXTRA_RESET_STATE:
+ Same as reset() except that buffers are not released. If there is
+ a READ CACHE it is reinit'ed. A cache is reinit'ed to restart reading
+ or to change type of cache between READ CACHE and WRITE CACHE.
+
+ This extra function is always called immediately before calling
+ delete_all_rows on the handler for temporary tables.
+    There are however cases when HA_EXTRA_RESET_STATE isn't called: in
+    a similar case for a temporary table in sql_union.cc, and in two other
+    cases where HA_EXTRA_NO_CACHE is called before and HA_EXTRA_WRITE_CACHE
+    is called afterwards.
+ The case with HA_EXTRA_NO_CACHE and HA_EXTRA_WRITE_CACHE means
+ disable caching, delete all rows and enable WRITE CACHE. This is
+ used for temporary tables containing distinct sums and a
+ functional group.
+
+ The only case that delete_all_rows is called on non-temporary tables
+ is in sql_delete.cc when DELETE FROM table; is called by a user.
+ In this case no special extra calls are performed before or after this
+ call.
+
+ The partition handler should not need to bother about this one. It
+ should never be called.
+
+ HA_EXTRA_NO_ROWS:
+ Don't insert rows indication to HEAP and MyISAM, only used by temporary
+ tables used in query processing.
+ Not handled by partition handler.
+
+ 5) Parameters only used by MyISAM internally
+ --------------------------------------------
+ HA_EXTRA_REINIT_CACHE:
+ This call reinitialises the READ CACHE described above if there is one
+ and otherwise the call is ignored.
+
+ We can thus safely call it on all underlying handlers if they are
+ MyISAM handlers. It is however never called so we don't handle it at all.
+ HA_EXTRA_FLUSH_CACHE:
+    Flush the WRITE CACHE in MyISAM. It is only called from one place in
+    the code, in sql_insert.cc, where it is called if the table_flags
+    don't contain HA_DUPLICATE_POS. The only handler having HA_DUPLICATE_POS
+    set is the MyISAM handler, so the only handler not receiving this
+    call is MyISAM.
+    Thus in effect this call is made but never acted upon. It could be
+    removed from sql_insert.cc.
+ HA_EXTRA_NO_USER_CHANGE:
+ Only used by MyISAM, never called.
+ Simulates lock_type as locked.
+ HA_EXTRA_WAIT_LOCK:
+ HA_EXTRA_WAIT_NOLOCK:
+ Only used by MyISAM, called from MyISAM handler but never from server
+ code on top of the handler.
+ Sets lock_wait on/off
+ HA_EXTRA_NO_KEYS:
+    Only used by MyISAM, internally in the MyISAM handler; never called
+    from the server level.
+ HA_EXTRA_KEYREAD_CHANGE_POS:
+ HA_EXTRA_REMEMBER_POS:
+ HA_EXTRA_RESTORE_POS:
+ HA_EXTRA_PRELOAD_BUFFER_SIZE:
+ HA_EXTRA_CHANGE_KEY_TO_DUP:
+ HA_EXTRA_CHANGE_KEY_TO_UNIQUE:
+ Only used by MyISAM, never called.
+
+ 6) Parameters not used at all
+ -----------------------------
+ HA_EXTRA_KEY_CACHE:
+ HA_EXTRA_NO_KEY_CACHE:
+    These parameters are no longer used and could be removed.
+
+ 7) Parameters only used by federated tables for query processing
+ ----------------------------------------------------------------
+ HA_EXTRA_INSERT_WITH_UPDATE:
+ Inform handler that an "INSERT...ON DUPLICATE KEY UPDATE" will be
+ executed. This condition is unset by HA_EXTRA_NO_IGNORE_DUP_KEY.
+
+ 8) Parameters only used by NDB
+ ------------------------------
+ HA_EXTRA_DELETE_CANNOT_BATCH:
+ HA_EXTRA_UPDATE_CANNOT_BATCH:
+ Inform handler that delete_row()/update_row() cannot batch deletes/updates
+    and should perform them immediately. This may be needed when the table
+    has AFTER DELETE/UPDATE triggers which access the subject table.
+ These flags are reset by the handler::extra(HA_EXTRA_RESET) call.
+*/
+
+int ha_partition::extra(enum ha_extra_function operation)
+{
+ DBUG_ENTER("ha_partition:extra");
+ DBUG_PRINT("info", ("operation: %d", (int) operation));
+
+ switch (operation) {
+ /* Category 1), used by most handlers */
+ case HA_EXTRA_KEYREAD:
+ case HA_EXTRA_NO_KEYREAD:
+ case HA_EXTRA_FLUSH:
+ DBUG_RETURN(loop_extra(operation));
+
+ /* Category 2), used by non-MyISAM handlers */
+ case HA_EXTRA_IGNORE_DUP_KEY:
+ case HA_EXTRA_NO_IGNORE_DUP_KEY:
+ case HA_EXTRA_KEYREAD_PRESERVE_FIELDS:
+ {
+ if (!m_myisam)
+ DBUG_RETURN(loop_extra(operation));
+ break;
+ }
+
+ /* Category 3), used by MyISAM handlers */
+ case HA_EXTRA_PREPARE_FOR_RENAME:
+ DBUG_RETURN(prepare_for_rename());
+ break;
+ case HA_EXTRA_NORMAL:
+ case HA_EXTRA_QUICK:
+ case HA_EXTRA_NO_READCHECK:
+ case HA_EXTRA_PREPARE_FOR_UPDATE:
+ case HA_EXTRA_FORCE_REOPEN:
+ case HA_EXTRA_PREPARE_FOR_DROP:
+ case HA_EXTRA_FLUSH_CACHE:
+ {
+ if (m_myisam)
+ DBUG_RETURN(loop_extra(operation));
+ break;
+ }
+ case HA_EXTRA_CACHE:
+ {
+ prepare_extra_cache(0);
+ break;
+ }
+ case HA_EXTRA_NO_CACHE:
+ case HA_EXTRA_WRITE_CACHE:
+ {
+ m_extra_cache= FALSE;
+ m_extra_cache_size= 0;
+ DBUG_RETURN(loop_extra(operation));
+ }
+ case HA_EXTRA_IGNORE_NO_KEY:
+ case HA_EXTRA_NO_IGNORE_NO_KEY:
+ {
+ /*
+ Ignore as these are specific to NDB for handling
+ idempotency
+ */
+ break;
+ }
+ case HA_EXTRA_WRITE_CAN_REPLACE:
+ case HA_EXTRA_WRITE_CANNOT_REPLACE:
+ {
+      /*
+        Informs the handler that write_row() can replace rows which conflict
+        with the row being inserted by PK/unique key without reporting an
+        error to the SQL layer.
+
+        This optimization is not safe for a partitioned table in the general
+        case, since we may have to put the new version of the row into a
+        partition different from the one in which the old version resides
+        (for example when we partition on a non-PK column, or on some column
+        which is not part of the unique key that was violated).
+        And since NDB, the only engine at the moment that supports this
+        optimization, handles partitioning on its own, we simply disable
+        it here. (BTW for NDB this optimization is safe since it supports
+        only KEY partitioning and won't use this optimization for tables
+        which have additional unique constraints.)
+      */
+ break;
+ }
+ /* Category 7), used by federated handlers */
+ case HA_EXTRA_INSERT_WITH_UPDATE:
+ DBUG_RETURN(loop_extra(operation));
+ /* Category 8) Parameters only used by NDB */
+ case HA_EXTRA_DELETE_CANNOT_BATCH:
+ case HA_EXTRA_UPDATE_CANNOT_BATCH:
+ {
+ /* Currently only NDB use the *_CANNOT_BATCH */
+ break;
+ }
+ default:
+ {
+ /* Temporary crash to discover what is wrong */
+ DBUG_ASSERT(0);
+ break;
+ }
+ }
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Special extra call to reset extra parameters
+
+ SYNOPSIS
+ reset()
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+
+ DESCRIPTION
+    Called at the end of each statement to reset buffers
+*/
+
+int ha_partition::reset(void)
+{
+ int result= 0, tmp;
+ handler **file;
+ DBUG_ENTER("ha_partition::reset");
+ if (m_part_info)
+ bitmap_set_all(&m_part_info->used_partitions);
+ file= m_file;
+ do
+ {
+ if ((tmp= (*file)->ha_reset()))
+ result= tmp;
+ } while (*(++file));
+ DBUG_RETURN(result);
+}
+
+/*
+ Special extra method for HA_EXTRA_CACHE with cachesize as extra parameter
+
+ SYNOPSIS
+ extra_opt()
+ operation Must be HA_EXTRA_CACHE
+ cachesize Size of cache in full table scan
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+*/
+
+int ha_partition::extra_opt(enum ha_extra_function operation, ulong cachesize)
+{
+ DBUG_ENTER("ha_partition::extra_opt()");
+
+ DBUG_ASSERT(HA_EXTRA_CACHE == operation);
+ prepare_extra_cache(cachesize);
+ DBUG_RETURN(0);
+}
+
+
+/*
+ Call extra on handler with HA_EXTRA_CACHE and cachesize
+
+ SYNOPSIS
+ prepare_extra_cache()
+ cachesize Size of cache for full table scan
+
+ RETURN VALUE
+ NONE
+*/
+
+void ha_partition::prepare_extra_cache(uint cachesize)
+{
+ DBUG_ENTER("ha_partition::prepare_extra_cache()");
+
+ m_extra_cache= TRUE;
+ m_extra_cache_size= cachesize;
+ if (m_part_spec.start_part != NO_CURRENT_PART_ID)
+ {
+ late_extra_cache(m_part_spec.start_part);
+ }
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+  Prepares our new and reorganised handlers for rename or delete
+
+  SYNOPSIS
+    prepare_for_rename()
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+*/
+
+int ha_partition::prepare_for_rename()
+{
+ int result= 0, tmp;
+ handler **file;
+ DBUG_ENTER("ha_partition::prepare_for_rename()");
+
+ if (m_new_file != NULL)
+ {
+ for (file= m_new_file; *file; file++)
+ if ((tmp= (*file)->extra(HA_EXTRA_PREPARE_FOR_RENAME)))
+ result= tmp;
+ for (file= m_reorged_file; *file; file++)
+ if ((tmp= (*file)->extra(HA_EXTRA_PREPARE_FOR_RENAME)))
+ result= tmp;
+ DBUG_RETURN(result);
+ }
+
+ DBUG_RETURN(loop_extra(HA_EXTRA_PREPARE_FOR_RENAME));
+}
+
+/*
+ Call extra on all partitions
+
+ SYNOPSIS
+ loop_extra()
+ operation extra operation type
+
+ RETURN VALUE
+ >0 Error code
+ 0 Success
+*/
+
+int ha_partition::loop_extra(enum ha_extra_function operation)
+{
+ int result= 0, tmp;
+ handler **file;
+ DBUG_ENTER("ha_partition::loop_extra()");
+
+ /*
+    TODO, 5.2: this is where we could possibly add an optimisation to use
+    the bitmap _if_ this is a SELECT.
+ */
+ for (file= m_file; *file; file++)
+ {
+ if ((tmp= (*file)->extra(operation)))
+ result= tmp;
+ }
+ DBUG_RETURN(result);
+}
+
+
+/*
+ Call extra(HA_EXTRA_CACHE) on next partition_id
+
+ SYNOPSIS
+ late_extra_cache()
+ partition_id Partition id to call extra on
+
+ RETURN VALUE
+ NONE
+*/
+
+void ha_partition::late_extra_cache(uint partition_id)
+{
+ handler *file;
+ DBUG_ENTER("ha_partition::late_extra_cache");
+
+ if (!m_extra_cache)
+ DBUG_VOID_RETURN;
+ file= m_file[partition_id];
+ if (m_extra_cache_size == 0)
+ VOID(file->extra(HA_EXTRA_CACHE));
+ else
+ VOID(file->extra_opt(HA_EXTRA_CACHE, m_extra_cache_size));
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ Call extra(HA_EXTRA_NO_CACHE) on next partition_id
+
+ SYNOPSIS
+ late_extra_no_cache()
+ partition_id Partition id to call extra on
+
+ RETURN VALUE
+ NONE
+*/
+
+void ha_partition::late_extra_no_cache(uint partition_id)
+{
+ handler *file;
+ DBUG_ENTER("ha_partition::late_extra_no_cache");
+
+ if (!m_extra_cache)
+ DBUG_VOID_RETURN;
+ file= m_file[partition_id];
+ VOID(file->extra(HA_EXTRA_NO_CACHE));
+ DBUG_VOID_RETURN;
+}
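+
+
+/*
+  prepare_extra_cache() above only records the request; late_extra_cache()
+  and late_extra_no_cache() then apply it to one partition at a time as the
+  sequential scan moves along, so only a single read cache is ever
+  allocated. A sketch of that record-then-apply pattern (the struct and
+  function names here are illustrative only):
+*/
+#ifdef NOT_USED
+struct scan_cache_state
+{
+  bool wanted;          /* has HA_EXTRA_CACHE been requested?  */
+  ulong size;           /* requested cache size, 0 = default   */
+};
+
+static void record_cache_request(scan_cache_state *st, ulong size)
+{
+  st->wanted= TRUE;                            /* no handler touched yet */
+  st->size= size;
+}
+
+static void enter_partition(scan_cache_state *st, handler *file)
+{
+  if (!st->wanted)
+    return;
+  if (st->size == 0)
+    VOID(file->extra(HA_EXTRA_CACHE));         /* default cache size */
+  else
+    VOID(file->extra_opt(HA_EXTRA_CACHE, st->size));
+}
+
+static void leave_partition(scan_cache_state *st, handler *file)
+{
+  if (st->wanted)
+    VOID(file->extra(HA_EXTRA_NO_CACHE));      /* release before moving on */
+}
+#endif /* NOT_USED */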
+
+
+/****************************************************************************
+ MODULE optimiser support
+****************************************************************************/
+
+/*
+ Get keys to use for scanning
+
+ SYNOPSIS
+ keys_to_use_for_scanning()
+
+ RETURN VALUE
+ key_map of keys usable for scanning
+*/
+
+const key_map *ha_partition::keys_to_use_for_scanning()
+{
+ DBUG_ENTER("ha_partition::keys_to_use_for_scanning");
+
+ DBUG_RETURN(m_file[0]->keys_to_use_for_scanning());
+}
+
+
+/*
+ Return time for a scan of the table
+
+ SYNOPSIS
+ scan_time()
+
+ RETURN VALUE
+ time for scan
+*/
+
+double ha_partition::scan_time()
+{
+ double scan_time= 0;
+ handler **file;
+ DBUG_ENTER("ha_partition::scan_time");
+
+ for (file= m_file; *file; file++)
+ if (bitmap_is_set(&(m_part_info->used_partitions), (file - m_file)))
+ scan_time+= (*file)->scan_time();
+ DBUG_RETURN(scan_time);
+}
+
+
+/*
+ Get time to read
+
+ SYNOPSIS
+ read_time()
+ index Index number used
+ ranges Number of ranges
+ rows Number of rows
+
+ RETURN VALUE
+ time for read
+
+ DESCRIPTION
+ This will be optimised later to include whether or not the index can
+    be used with partitioning. To achieve this we need to add another
+    parameter that specifies how many of the index fields are bound in the
+    ranges.
+ Possibly added as a new call to handlers.
+*/
+
+double ha_partition::read_time(uint index, uint ranges, ha_rows rows)
+{
+ DBUG_ENTER("ha_partition::read_time");
+
+ DBUG_RETURN(m_file[0]->read_time(index, ranges, rows));
+}
+
+/*
+ Find number of records in a range
+
+ SYNOPSIS
+ records_in_range()
+ inx Index number
+ min_key Start of range
+ max_key End of range
+
+ RETURN VALUE
+ Number of rows in range
+
+ DESCRIPTION
+    Given a starting key and an ending key, estimate the number of rows that
+    exist between the two. end_key may be empty, in which case we determine
+    whether start_key matches any rows.
+
+ Called from opt_range.cc by check_quick_keys().
+
+ monty: MUST be called for each range and added.
+ Note that MySQL will assume that if this returns 0 there is no
+ matching rows for the range!
+*/
+
+ha_rows ha_partition::records_in_range(uint inx, key_range *min_key,
+ key_range *max_key)
+{
+ handler **file;
+ ha_rows in_range= 0;
+ DBUG_ENTER("ha_partition::records_in_range");
+
+ file= m_file;
+ do
+ {
+ if (bitmap_is_set(&(m_part_info->used_partitions), (file - m_file)))
+ {
+ ha_rows tmp_in_range= (*file)->records_in_range(inx, min_key, max_key);
+ if (tmp_in_range == HA_POS_ERROR)
+ DBUG_RETURN(tmp_in_range);
+ in_range+= tmp_in_range;
+ }
+ } while (*(++file));
+ DBUG_RETURN(in_range);
+}
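+
+
+/*
+  records_in_range() above and estimate_rows_upper_bound() below follow the
+  same pattern: sum the per-partition estimates and propagate HA_POS_ERROR
+  ("unknown") as soon as any partition reports it. A standalone sketch of
+  that error-propagating sum (the function name is illustrative):
+*/
+#ifdef NOT_USED
+static ha_rows sum_estimates(const ha_rows *estimates, uint n_parts)
+{
+  ha_rows total= 0;
+  for (uint i= 0; i < n_parts; i++)
+  {
+    if (estimates[i] == HA_POS_ERROR)     /* one unknown estimate makes */
+      return HA_POS_ERROR;                /* the whole total unknown    */
+    total+= estimates[i];
+  }
+  return total;
+}
+#endif /* NOT_USED */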
+
+
+/*
+ Estimate upper bound of number of rows
+
+ SYNOPSIS
+ estimate_rows_upper_bound()
+
+ RETURN VALUE
+ Number of rows
+*/
+
+ha_rows ha_partition::estimate_rows_upper_bound()
+{
+ ha_rows rows, tot_rows= 0;
+ handler **file;
+ DBUG_ENTER("ha_partition::estimate_rows_upper_bound");
+
+ file= m_file;
+ do
+ {
+ if (bitmap_is_set(&(m_part_info->used_partitions), (file - m_file)))
+ {
+ rows= (*file)->estimate_rows_upper_bound();
+ if (rows == HA_POS_ERROR)
+ DBUG_RETURN(HA_POS_ERROR);
+ tot_rows+= rows;
+ }
+ } while (*(++file));
+ DBUG_RETURN(tot_rows);
+}
+
+
+/**
+  Number of rows in table. See handler.h.
+
+ SYNOPSIS
+ records()
+
+ RETURN VALUE
+ Number of total rows in a partitioned table.
+*/
+
+ha_rows ha_partition::records()
+{
+ ha_rows rows, tot_rows= 0;
+ handler **file;
+ DBUG_ENTER("ha_partition::records");
+
+ file= m_file;
+ do
+ {
+ rows= (*file)->records();
+ if (rows == HA_POS_ERROR)
+ DBUG_RETURN(HA_POS_ERROR);
+ tot_rows+= rows;
+ } while (*(++file));
+ DBUG_RETURN(tot_rows);
+}
+
+
+/*
+ Is it ok to switch to a new engine for this table
+
+ SYNOPSIS
+    can_switch_engines()
+
+ RETURN VALUE
+ TRUE Ok
+ FALSE Not ok
+
+ DESCRIPTION
+ Used to ensure that tables with foreign key constraints are not moved
+ to engines without foreign key support.
+*/
+
+bool ha_partition::can_switch_engines()
+{
+ handler **file;
+ DBUG_ENTER("ha_partition::can_switch_engines");
+
+ file= m_file;
+ do
+ {
+ if (!(*file)->can_switch_engines())
+ DBUG_RETURN(FALSE);
+ } while (*(++file));
+ DBUG_RETURN(TRUE);
+}
+
+
+/*
+ Is table cache supported
+
+ SYNOPSIS
+ table_cache_type()
+
+  RETURN VALUE
+    Table cache type of the first underlying handler
+*/
+
+uint8 ha_partition::table_cache_type()
+{
+ DBUG_ENTER("ha_partition::table_cache_type");
+
+ DBUG_RETURN(m_file[0]->table_cache_type());
+}
+
+
+/****************************************************************************
+ MODULE print messages
+****************************************************************************/
+
+const char *ha_partition::index_type(uint inx)
+{
+ DBUG_ENTER("ha_partition::index_type");
+
+ DBUG_RETURN(m_file[0]->index_type(inx));
+}
+
+
+enum row_type ha_partition::get_row_type() const
+{
+ handler **file;
+ enum row_type type= (*m_file)->get_row_type();
+
+ for (file= m_file, file++; *file; file++)
+ {
+ enum row_type part_type= (*file)->get_row_type();
+ if (part_type != type)
+ return ROW_TYPE_NOT_USED;
+ }
+
+ return type;
+}
+
+
+void ha_partition::print_error(int error, myf errflag)
+{
+ DBUG_ENTER("ha_partition::print_error");
+
+ /* Should probably look for my own errors first */
+ DBUG_PRINT("enter", ("error: %d", error));
+
+ if (error == HA_ERR_NO_PARTITION_FOUND)
+ m_part_info->print_no_partition_found(table);
+ else
+ m_file[m_last_part]->print_error(error, errflag);
+ DBUG_VOID_RETURN;
+}
+
+
+bool ha_partition::get_error_message(int error, String *buf)
+{
+ DBUG_ENTER("ha_partition::get_error_message");
+
+ /* Should probably look for my own errors first */
+ DBUG_RETURN(m_file[m_last_part]->get_error_message(error, buf));
+}
+
+
+/****************************************************************************
+ MODULE handler characteristics
+****************************************************************************/
+/*
+  If frm_error() is called then we will use this to find out what file
+ extensions exist for the storage engine. This is also used by the default
+ rename_table and delete_table method in handler.cc.
+*/
+
+static const char *ha_partition_ext[]=
+{
+ ha_par_ext, NullS
+};
+
+const char **ha_partition::bas_ext() const
+{ return ha_partition_ext; }
+
+
+uint ha_partition::min_of_the_max_uint(
+ uint (handler::*operator_func)(void) const) const
+{
+ handler **file;
+ uint min_of_the_max= ((*m_file)->*operator_func)();
+
+ for (file= m_file+1; *file; file++)
+ {
+ uint tmp= ((*file)->*operator_func)();
+ set_if_smaller(min_of_the_max, tmp);
+ }
+ return min_of_the_max;
+}
+
+
+uint ha_partition::max_supported_key_parts() const
+{
+ return min_of_the_max_uint(&handler::max_supported_key_parts);
+}
+
+
+uint ha_partition::max_supported_key_length() const
+{
+ return min_of_the_max_uint(&handler::max_supported_key_length);
+}
+
+
+uint ha_partition::max_supported_key_part_length() const
+{
+ return min_of_the_max_uint(&handler::max_supported_key_part_length);
+}
+
+
+uint ha_partition::max_supported_record_length() const
+{
+ return min_of_the_max_uint(&handler::max_supported_record_length);
+}
+
+
+uint ha_partition::max_supported_keys() const
+{
+ return min_of_the_max_uint(&handler::max_supported_keys);
+}
+
+
+uint ha_partition::extra_rec_buf_length() const
+{
+ handler **file;
+ uint max= (*m_file)->extra_rec_buf_length();
+
+ for (file= m_file, file++; *file; file++)
+ if (max < (*file)->extra_rec_buf_length())
+ max= (*file)->extra_rec_buf_length();
+ return max;
+}
+
+
+uint ha_partition::min_record_length(uint options) const
+{
+ handler **file;
+ uint max= (*m_file)->min_record_length(options);
+
+ for (file= m_file, file++; *file; file++)
+ if (max < (*file)->min_record_length(options))
+ max= (*file)->min_record_length(options);
+ return max;
+}
+
+
+/****************************************************************************
+ MODULE compare records
+****************************************************************************/
+/*
+ Compare two positions
+
+ SYNOPSIS
+ cmp_ref()
+ ref1 First position
+ ref2 Second position
+
+ RETURN VALUE
+ <0 ref1 < ref2
+ 0 Equal
+ >0 ref1 > ref2
+
+  DESCRIPTION
+    We get two references and need to check if those records are the same.
+    If they belong to different partitions we decide that they are not
+    the same record; otherwise we use the underlying handler to decide if
+    they are the same. References that are not equal sort in partition id
+    order.
+*/
+
+int ha_partition::cmp_ref(const uchar *ref1, const uchar *ref2)
+{
+ uint part_id;
+ my_ptrdiff_t diff1, diff2;
+ handler *file;
+ DBUG_ENTER("ha_partition::cmp_ref");
+
+ if ((ref1[0] == ref2[0]) && (ref1[1] == ref2[1]))
+ {
+ part_id= uint2korr(ref1);
+ file= m_file[part_id];
+ DBUG_ASSERT(part_id < m_tot_parts);
+ DBUG_RETURN(file->cmp_ref((ref1 + PARTITION_BYTES_IN_POS),
+ (ref2 + PARTITION_BYTES_IN_POS)));
+ }
+ diff1= ref2[1] - ref1[1];
+ diff2= ref2[0] - ref1[0];
+ if (diff1 > 0)
+ {
+ DBUG_RETURN(-1);
+ }
+ if (diff1 < 0)
+ {
+ DBUG_RETURN(+1);
+ }
+ if (diff2 > 0)
+ {
+ DBUG_RETURN(-1);
+ }
+ DBUG_RETURN(+1);
+}
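+
+
+/*
+  The comparison above orders by the two-byte partition id prefix first
+  (high byte, then low byte, which is numeric order for the low-byte-first
+  encoding) and only delegates to the underlying handler when the prefixes
+  match. A standalone sketch of the same rule, with the engine comparison
+  passed in as a callback:
+*/
+#ifdef NOT_USED
+static int cmp_part_ref(const unsigned char *ref1, const unsigned char *ref2,
+                        int (*engine_cmp)(const unsigned char *,
+                                          const unsigned char *))
+{
+  unsigned id1= (unsigned) ref1[0] | ((unsigned) ref1[1] << 8);
+  unsigned id2= (unsigned) ref2[0] | ((unsigned) ref2[1] << 8);
+  if (id1 != id2)                              /* different partitions:  */
+    return id1 < id2 ? -1 : +1;                /* never the same record  */
+  return engine_cmp(ref1 + 2, ref2 + 2);       /* same partition: ask engine */
+}
+#endif /* NOT_USED */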
+
+
+/****************************************************************************
+ MODULE auto increment
+****************************************************************************/
+
+void ha_partition::restore_auto_increment(ulonglong)
+{
+ DBUG_ENTER("ha_partition::restore_auto_increment");
+
+ DBUG_VOID_RETURN;
+}
+
+
+/*
+ This method is called by update_auto_increment which in turn is called
+  by the individual handlers as part of write_row. We ask every underlying
+  handler to reserve an interval and intersect those reservations, so that
+  the values we hand out are valid in all partitions.
+*/
+
+void ha_partition::get_auto_increment(ulonglong offset, ulonglong increment,
+ ulonglong nb_desired_values,
+ ulonglong *first_value,
+ ulonglong *nb_reserved_values)
+{
+ ulonglong first_value_part, last_value_part, nb_reserved_values_part,
+ last_value= ~ (ulonglong) 0;
+ handler **pos, **end;
+ bool retry= TRUE;
+ DBUG_ENTER("ha_partition::get_auto_increment");
+
+again:
+ for (pos=m_file, end= m_file+ m_tot_parts; pos != end ; pos++)
+ {
+ first_value_part= *first_value;
+ (*pos)->get_auto_increment(offset, increment, nb_desired_values,
+ &first_value_part, &nb_reserved_values_part);
+ if (first_value_part == ~(ulonglong)(0)) // error in one partition
+ {
+ *first_value= first_value_part;
+ sql_print_error("Partition failed to reserve auto_increment value");
+ DBUG_VOID_RETURN;
+ }
+ /*
+ Partition has reserved an interval. Intersect it with the intervals
+ already reserved for the previous partitions.
+ */
+ last_value_part= (nb_reserved_values_part == ULONGLONG_MAX) ?
+ ULONGLONG_MAX : (first_value_part + nb_reserved_values_part * increment);
+ set_if_bigger(*first_value, first_value_part);
+ set_if_smaller(last_value, last_value_part);
+ }
+ if (last_value < *first_value) /* empty intersection, error */
+ {
+ /*
+ When we have an empty intersection, it means that one or more
+ partitions may have a significantly different autoinc next value.
+ We should not fail here - it just means that we should try to
+ find a new reservation making use of the current *first_value
+      which should now be compatible with all partitions.
+ */
+ if (retry)
+ {
+ retry= FALSE;
+ last_value= ~ (ulonglong) 0;
+ release_auto_increment();
+ goto again;
+ }
+ /*
+ We should not get here.
+ */
+ sql_print_error("Failed to calculate auto_increment value for partition");
+
+ *first_value= ~(ulonglong)(0);
+ }
+ if (increment) // If not check for values
+ *nb_reserved_values= (last_value == ULONGLONG_MAX) ?
+ ULONGLONG_MAX : ((last_value - *first_value) / increment);
+ DBUG_VOID_RETURN;
+}
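+
+
+/*
+  The loop above computes the intersection of the per-partition
+  reservations: the start is the maximum of the interval starts, the end
+  is the minimum of the interval ends, and an empty intersection triggers
+  one retry. A standalone sketch of that intersection (illustrative names,
+  intervals taken as [start, end)):
+*/
+#ifdef NOT_USED
+static bool intersect_reservations(const ulonglong *starts,
+                                   const ulonglong *ends, uint n_parts,
+                                   ulonglong *first_value,
+                                   ulonglong *last_value)
+{
+  *first_value= 0;
+  *last_value= ~(ulonglong) 0;
+  for (uint i= 0; i < n_parts; i++)
+  {
+    if (starts[i] > *first_value)        /* max of the interval starts */
+      *first_value= starts[i];
+    if (ends[i] < *last_value)           /* min of the interval ends   */
+      *last_value= ends[i];
+  }
+  return *last_value >= *first_value;    /* FALSE => empty, caller retries */
+}
+#endif /* NOT_USED */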
+
+void ha_partition::release_auto_increment()
+{
+ DBUG_ENTER("ha_partition::release_auto_increment");
+
+ for (uint i= 0; i < m_tot_parts; i++)
+ {
+ m_file[i]->ha_release_auto_increment();
+ }
+ DBUG_VOID_RETURN;
+}
+
+/****************************************************************************
+ MODULE initialise handler for HANDLER call
+****************************************************************************/
+
+void ha_partition::init_table_handle_for_HANDLER()
+{
+ return;
+}
+
+
+/****************************************************************************
+ MODULE enable/disable indexes
+****************************************************************************/
+
+/*
+ Disable indexes for a while
+ SYNOPSIS
+ disable_indexes()
+ mode Mode
+ RETURN VALUES
+ 0 Success
+ != 0 Error
+*/
+
+int ha_partition::disable_indexes(uint mode)
+{
+ handler **file;
+ int error= 0;
+
+ for (file= m_file; *file; file++)
+ {
+ if ((error= (*file)->ha_disable_indexes(mode)))
+ break;
+ }
+ return error;
+}
+
+
+/*
+ Enable indexes again
+ SYNOPSIS
+ enable_indexes()
+ mode Mode
+ RETURN VALUES
+ 0 Success
+ != 0 Error
+*/
+
+int ha_partition::enable_indexes(uint mode)
+{
+ handler **file;
+ int error= 0;
+
+ for (file= m_file; *file; file++)
+ {
+ if ((error= (*file)->ha_enable_indexes(mode)))
+ break;
+ }
+ return error;
+}
+
+
+/*
+ Check if indexes are disabled
+ SYNOPSIS
+ indexes_are_disabled()
+
+ RETURN VALUES
+ 0 Indexes are enabled
+ != 0 Indexes are disabled
+*/
+
+int ha_partition::indexes_are_disabled(void)
+{
+ handler **file;
+ int error= 0;
+
+ for (file= m_file; *file; file++)
+ {
+ if ((error= (*file)->indexes_are_disabled()))
+ break;
+ }
+ return error;
+}
+
+
+/****************************************************************************
+ MODULE Partition Share
+****************************************************************************/
+/*
+ Service routines for ... methods.
+-------------------------------------------------------------------------
+ Variables for partition share methods. A hash used to track open tables.
+ A mutex for the hash table and an init variable to check if hash table
+ is initialised.
+ There is also a constant ending of the partition handler file name.
+*/
+
+#ifdef NOT_USED
+static HASH partition_open_tables;
+static pthread_mutex_t partition_mutex;
+static int partition_init= 0;
+
+
+/*
+ Function we use in the creation of our hash to get key.
+*/
+
+static uchar *partition_get_key(PARTITION_SHARE *share, size_t *length,
+ my_bool not_used __attribute__ ((unused)))
+{
+ *length= share->table_name_length;
+ return (uchar *) share->table_name;
+}
+
+/*
+  Example of simple lock controls. The "share" it creates is a structure we
+ will pass to each partition handler. Do you have to have one of these?
+ Well, you have pieces that are used for locking, and they are needed to
+ function.
+*/
+
+static PARTITION_SHARE *get_share(const char *table_name, TABLE *table)
+{
+ PARTITION_SHARE *share;
+ uint length;
+ char *tmp_name;
+
+ /*
+ So why does this exist? There is no way currently to init a storage
+ engine.
+ Innodb and BDB both have modifications to the server to allow them to
+ do this. Since you will not want to do this, this is probably the next
+ best method.
+ */
+ if (!partition_init)
+ {
+ /* Hijack a mutex for init'ing the storage engine */
+ pthread_mutex_lock(&LOCK_mysql_create_db);
+ if (!partition_init)
+ {
+ partition_init++;
+ VOID(pthread_mutex_init(&partition_mutex, MY_MUTEX_INIT_FAST));
+ (void) hash_init(&partition_open_tables, system_charset_info, 32, 0, 0,
+ (hash_get_key) partition_get_key, 0, 0);
+ }
+ pthread_mutex_unlock(&LOCK_mysql_create_db);
+ }
+ pthread_mutex_lock(&partition_mutex);
+ length= (uint) strlen(table_name);
+
+ if (!(share= (PARTITION_SHARE *) hash_search(&partition_open_tables,
+ (uchar *) table_name, length)))
+ {
+ if (!(share= (PARTITION_SHARE *)
+ my_multi_malloc(MYF(MY_WME | MY_ZEROFILL),
+ &share, (uint) sizeof(*share),
+ &tmp_name, (uint) length + 1, NullS)))
+ {
+ pthread_mutex_unlock(&partition_mutex);
+ return NULL;
+ }
+
+ share->use_count= 0;
+ share->table_name_length= length;
+ share->table_name= tmp_name;
+ strmov(share->table_name, table_name);
+ if (my_hash_insert(&partition_open_tables, (uchar *) share))
+ goto error;
+ thr_lock_init(&share->lock);
+ pthread_mutex_init(&share->mutex, MY_MUTEX_INIT_FAST);
+ }
+ share->use_count++;
+ pthread_mutex_unlock(&partition_mutex);
+
+ return share;
+
+error:
+ pthread_mutex_unlock(&partition_mutex);
+ my_free((uchar*) share, MYF(0));
+
+ return NULL;
+}
+
+
+/*
+ Free lock controls. We call this whenever we close a table. If the table
+ had the last reference to the share then we free memory associated with
+ it.
+*/
+
+static int free_share(PARTITION_SHARE *share)
+{
+ pthread_mutex_lock(&partition_mutex);
+ if (!--share->use_count)
+ {
+ hash_delete(&partition_open_tables, (uchar *) share);
+ thr_lock_delete(&share->lock);
+ pthread_mutex_destroy(&share->mutex);
+ my_free((uchar*) share, MYF(0));
+ }
+ pthread_mutex_unlock(&partition_mutex);
+
+ return 0;
+}
+#endif /* NOT_USED */
+
+struct st_mysql_storage_engine partition_storage_engine=
+{ MYSQL_HANDLERTON_INTERFACE_VERSION };
+
+mysql_declare_plugin(partition)
+{
+ MYSQL_STORAGE_ENGINE_PLUGIN,
+ &partition_storage_engine,
+ "partition",
+ "Mikael Ronstrom, MySQL AB",
+ "Partition Storage Engine Helper",
+ PLUGIN_LICENSE_GPL,
+ partition_initialize, /* Plugin Init */
+ NULL, /* Plugin Deinit */
+ 0x0100, /* 1.0 */
+ NULL, /* status variables */
+ NULL, /* system variables */
+ NULL /* config options */
+}
+mysql_declare_plugin_end;
+
+#endif