Diffstat (limited to 'sql')
73 files changed, 11945 insertions, 1163 deletions
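A recurring change across the handler diffs below is that engines stop comparing field->query_id against the current query id and instead consult per-handler read/write column bitmaps (ha_get_bit_in_read_set() / ha_get_bit_in_write_set(), indexed by the new 1-based Field::fieldnr). A minimal standalone sketch of that bitmap bookkeeping follows; the ColumnSet class and main() are illustrative stand-ins, not the MySQL handler API:

  #include <cstdio>
  #include <vector>

  // Toy stand-in for the handler's column bitmaps; not the MySQL API.
  class ColumnSet {
    std::vector<unsigned long> bits;
  public:
    explicit ColumnSet(unsigned fields) : bits(fields/32 + 1, 0) {}
    void set(unsigned fieldnr)            // fieldnr is 1-based, as in field.h below
    { bits[(fieldnr-1)/32] |= 1ul << ((fieldnr-1)%32); }
    bool is_set(unsigned fieldnr) const
    { return (bits[(fieldnr-1)/32] >> ((fieldnr-1)%32)) & 1; }
  };

  int main()
  {
    ColumnSet write_set(8);
    write_set.set(3);                     // e.g. an UPDATE touching column 3
    // An engine asks per column whether to ship it, instead of comparing
    // field->query_id against thd->query_id as the removed code did:
    for (unsigned fieldnr= 1; fieldnr <= 8; fieldnr++)
      if (write_set.is_set(fieldnr))
        printf("send column %u\n", fieldnr);
    return 0;
  }

ha_federated::write_row(), InnoDB's build_template() and the NDB read, update and blob paths in the diff all follow this test-a-bit pattern.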
diff --git a/sql/Makefile.am b/sql/Makefile.am index 4824a75d6fa..e90be7630fa 100644 --- a/sql/Makefile.am +++ b/sql/Makefile.am @@ -29,9 +29,9 @@ libexec_PROGRAMS = mysqld noinst_PROGRAMS = gen_lex_hash bin_PROGRAMS = mysql_tzinfo_to_sql gen_lex_hash_LDFLAGS = @NOINST_LDFLAGS@ -LDADD = $(top_builddir)/myisam/libmyisam.a \ - $(top_builddir)/myisammrg/libmyisammrg.a \ - $(top_builddir)/heap/libheap.a \ +LDADD = $(top_builddir)/storage/myisam/libmyisam.a \ + $(top_builddir)/storage/myisammrg/libmyisammrg.a \ + $(top_builddir)/storage/heap/libheap.a \ $(top_builddir)/vio/libvio.a \ $(top_builddir)/mysys/libmysys.a \ $(top_builddir)/dbug/libdbug.a \ @@ -55,7 +55,7 @@ noinst_HEADERS = item.h item_func.h item_sum.h item_cmpfunc.h \ ha_ndbcluster.h opt_range.h protocol.h \ sql_select.h structs.h table.h sql_udf.h hash_filo.h\ lex.h lex_symbol.h sql_acl.h sql_crypt.h \ - log_event.h sql_repl.h slave.h \ + log_event.h sql_repl.h slave.h rpl_filter.h \ stacktrace.h sql_sort.h sql_cache.h set_var.h \ spatial.h gstream.h client_settings.h tzfile.h \ tztime.h my_decimal.h\ @@ -64,7 +64,7 @@ noinst_HEADERS = item.h item_func.h item_sum.h item_cmpfunc.h \ sql_array.h \ examples/ha_example.h examples/ha_archive.h \ examples/ha_tina.h ha_blackhole.h \ - ha_federated.h + ha_federated.h ha_partition.h mysqld_SOURCES = sql_lex.cc sql_handler.cc \ item.cc item_sum.cc item_buff.cc item_func.cc \ item_cmpfunc.cc item_strfunc.cc item_timefunc.cc \ @@ -90,7 +90,8 @@ mysqld_SOURCES = sql_lex.cc sql_handler.cc \ sql_db.cc sql_table.cc sql_rename.cc sql_crypt.cc \ sql_load.cc mf_iocache.cc field_conv.cc sql_show.cc \ sql_udf.cc sql_analyse.cc sql_analyse.h sql_cache.cc \ - slave.cc sql_repl.cc sql_union.cc sql_derived.cc \ + slave.cc sql_repl.cc rpl_filter.cc \ + sql_union.cc sql_derived.cc \ client.c sql_client.cc mini_client_errors.c pack.c\ stacktrace.c repl_failsafe.h repl_failsafe.cc \ sql_olap.cc sql_view.cc \ @@ -100,6 +101,7 @@ mysqld_SOURCES = sql_lex.cc sql_handler.cc \ sp_cache.cc parse_file.cc sql_trigger.cc \ examples/ha_example.cc examples/ha_archive.cc \ examples/ha_tina.cc ha_blackhole.cc \ + ha_partition.cc sql_partition.cc \ ha_federated.cc gen_lex_hash_SOURCES = gen_lex_hash.cc diff --git a/sql/examples/ha_tina.cc b/sql/examples/ha_tina.cc index 5c3cbdcf2ca..3a9302483b4 100644 --- a/sql/examples/ha_tina.cc +++ b/sql/examples/ha_tina.cc @@ -99,7 +99,8 @@ static byte* tina_get_key(TINA_SHARE *share,uint *length, int get_mmap(TINA_SHARE *share, int write) { DBUG_ENTER("ha_tina::get_mmap"); - if (share->mapped_file && munmap(share->mapped_file, share->file_stat.st_size)) + if (share->mapped_file && my_munmap(share->mapped_file, + share->file_stat.st_size)) DBUG_RETURN(1); if (my_fstat(share->data_file, &share->file_stat, MYF(MY_WME)) == -1) @@ -108,13 +109,13 @@ int get_mmap(TINA_SHARE *share, int write) if (share->file_stat.st_size) { if (write) - share->mapped_file= (byte *)mmap(NULL, share->file_stat.st_size, - PROT_READ|PROT_WRITE, MAP_SHARED, - share->data_file, 0); + share->mapped_file= (byte *)my_mmap(NULL, share->file_stat.st_size, + PROT_READ|PROT_WRITE, MAP_SHARED, + share->data_file, 0); else - share->mapped_file= (byte *)mmap(NULL, share->file_stat.st_size, - PROT_READ, MAP_PRIVATE, - share->data_file, 0); + share->mapped_file= (byte *)my_mmap(NULL, share->file_stat.st_size, + PROT_READ, MAP_PRIVATE, + share->data_file, 0); if ((share->mapped_file ==(caddr_t)-1)) { /* @@ -222,7 +223,7 @@ static int free_share(TINA_SHARE *share) if (!--share->use_count){ /* Drop the mapped file */ if 
(share->mapped_file) - munmap(share->mapped_file, share->file_stat.st_size); + my_munmap(share->mapped_file, share->file_stat.st_size); result_code= my_close(share->data_file,MYF(0)); hash_delete(&tina_open_tables, (byte*) share); thr_lock_delete(&share->lock); @@ -797,7 +798,7 @@ int ha_tina::rnd_end() if (my_chsize(share->data_file, length, 0, MYF(MY_WME))) DBUG_RETURN(-1); - if (munmap(share->mapped_file, length)) + if (my_munmap(share->mapped_file, length)) DBUG_RETURN(-1); /* We set it to null so that get_mmap() won't try to unmap it */ diff --git a/sql/field.cc b/sql/field.cc index 224b6c279f3..270214d4350 100644 --- a/sql/field.cc +++ b/sql/field.cc @@ -67,6 +67,7 @@ inline int field_type2index (enum_field_types field_type) ((int)FIELDTYPE_TEAR_FROM) + (field_type - FIELDTYPE_TEAR_TO) - 1); } + static enum_field_types field_types_merge_rules [FIELDTYPE_NUM][FIELDTYPE_NUM]= { /* MYSQL_TYPE_DECIMAL -> */ @@ -1233,6 +1234,7 @@ Field::Field(char *ptr_arg,uint32 length_arg,uchar *null_ptr_arg, flags=null_ptr ? 0: NOT_NULL_FLAG; comment.str= (char*) ""; comment.length=0; + fieldnr= 0; } uint Field::offset() @@ -5905,6 +5907,26 @@ int Field_str::store(double nr) } +uint Field::is_equal(create_field *new_field) +{ + return (new_field->sql_type == type()); +} + + +uint Field_str::is_equal(create_field *new_field) +{ + if (((new_field->flags & (BINCMP_FLAG | BINARY_FLAG)) && + !(flags & (BINCMP_FLAG | BINARY_FLAG))) || + (!(new_field->flags & (BINCMP_FLAG | BINARY_FLAG)) && + (flags & (BINCMP_FLAG | BINARY_FLAG)))) + return 0; /* One of the fields is binary and the other one isn't */ + + return ((new_field->sql_type == type()) && + new_field->charset == field_charset && + new_field->length == max_length()); +} + + int Field_string::store(longlong nr) { char buff[64]; @@ -6283,7 +6305,8 @@ my_decimal *Field_varstring::val_decimal(my_decimal *decimal_value) } -int Field_varstring::cmp(const char *a_ptr, const char *b_ptr) +int Field_varstring::cmp_max(const char *a_ptr, const char *b_ptr, + uint max_len) { uint a_length, b_length; int diff; @@ -6298,6 +6321,8 @@ int Field_varstring::cmp(const char *a_ptr, const char *b_ptr) a_length= uint2korr(a_ptr); b_length= uint2korr(b_ptr); } + set_if_smaller(a_length, max_len); + set_if_smaller(b_length, max_len); diff= field_charset->coll->strnncollsp(field_charset, (const uchar*) a_ptr+ length_bytes, @@ -6661,6 +6686,22 @@ Field *Field_varstring::new_key_field(MEM_ROOT *root, } +uint Field_varstring::is_equal(create_field *new_field) +{ + if (new_field->sql_type == type() && + new_field->charset == field_charset) + { + if (new_field->length == max_length()) + return IS_EQUAL_YES; + if (new_field->length > max_length() && + ((new_field->length <= 255 && max_length() <= 255) || + (new_field->length > 255 && max_length() > 255))) + return IS_EQUAL_PACK_LENGTH; // VARCHAR, longer variable length + } + return IS_EQUAL_NO; +} + + /**************************************************************************** ** blob type ** A blob is saved as a length and a pointer. 
The length is stored in the @@ -6928,13 +6969,16 @@ int Field_blob::cmp(const char *a,uint32 a_length, const char *b, } -int Field_blob::cmp(const char *a_ptr, const char *b_ptr) +int Field_blob::cmp_max(const char *a_ptr, const char *b_ptr, + uint max_length) { char *blob1,*blob2; memcpy_fixed(&blob1,a_ptr+packlength,sizeof(char*)); memcpy_fixed(&blob2,b_ptr+packlength,sizeof(char*)); - return Field_blob::cmp(blob1,get_length(a_ptr), - blob2,get_length(b_ptr)); + uint a_len= get_length(a_ptr), b_len= get_length(b_ptr); + set_if_smaller(a_len, max_length); + set_if_smaller(b_len, max_length); + return Field_blob::cmp(blob1,a_len,blob2,b_len); } @@ -7767,6 +7811,17 @@ bool Field_num::eq_def(Field *field) } +uint Field_num::is_equal(create_field *new_field) +{ + return ((new_field->sql_type == type()) && + ((new_field->flags & UNSIGNED_FLAG) == (uint) (flags & + UNSIGNED_FLAG)) && + ((new_field->flags & AUTO_INCREMENT_FLAG) == + (uint) (flags & AUTO_INCREMENT_FLAG)) && + (new_field->length >= max_length())); +} + + /* Bit field. @@ -7954,6 +8009,35 @@ my_decimal *Field_bit::val_decimal(my_decimal *deciaml_value) } +/* + Compare two bit fields using pointers within the record. + SYNOPSIS + cmp_max() + a Pointer to field->ptr in first record + b Pointer to field->ptr in second record + max_len Maximum length used in index + DESCRIPTION + This method is used from key_rec_cmp used by merge sorts used + by partitioned index read and later other similar places. + The a and b pointer must be pointers to the field in a record + (not the table->record[0] necessarily) +*/ +int Field_bit::cmp_max(const char *a, const char *b, uint max_len) +{ + my_ptrdiff_t a_diff= a - ptr; + my_ptrdiff_t b_diff= b - ptr; + if (bit_len) + { + int flag; + uchar bits_a= get_rec_bits(bit_ptr+a_diff, bit_ofs, bit_len); + uchar bits_b= get_rec_bits(bit_ptr+b_diff, bit_ofs, bit_len); + if ((flag= (int) (bits_a - bits_b))) + return flag; + } + return memcmp(a, b, field_length); +} + + int Field_bit::key_cmp(const byte *str, uint length) { if (bit_len) diff --git a/sql/field.h b/sql/field.h index 2b67ed3f599..c31d70dd651 100644 --- a/sql/field.h +++ b/sql/field.h @@ -29,6 +29,7 @@ class Send_field; class Protocol; +class create_field; struct st_cache_field; void field_conv(Field *to,Field *from); @@ -87,7 +88,11 @@ public: utype unireg_check; uint32 field_length; // Length of field uint field_index; // field number in fields array - uint16 flags; + uint32 flags; + /* fieldnr is the id of the field (first field = 1) as is also + used in key_part. 
+ */ + uint16 fieldnr; uchar null_bit; // Bit used to test null bit Field(char *ptr_arg,uint32 length_arg,uchar *null_ptr_arg,uchar null_bit_arg, @@ -150,6 +155,8 @@ public: virtual enum_field_types type() const =0; virtual enum_field_types real_type() const { return type(); } inline int cmp(const char *str) { return cmp(ptr,str); } + virtual int cmp_max(const char *a, const char *b, uint max_len) + { return cmp(a, b); } virtual int cmp(const char *,const char *)=0; virtual int cmp_binary(const char *a,const char *b, uint32 max_length=~0L) { return memcmp(a,b,pack_length()); } @@ -179,6 +186,12 @@ public: return test(record[(uint) (null_ptr - (uchar*) table->record[0])] & null_bit); } + inline bool is_null_in_record_with_offset(my_ptrdiff_t offset) + { + if (!null_ptr) + return 0; + return test(null_ptr[offset] & null_bit); + } inline void set_null(int row_offset=0) { if (null_ptr) null_ptr[row_offset]|= null_bit; } inline void set_notnull(int row_offset=0) @@ -303,6 +316,8 @@ public: int warn_if_overflow(int op_result); /* maximum possible display length */ virtual uint32 max_length()= 0; + + virtual uint is_equal(create_field *new_field); /* convert decimal to longlong with overflow check */ longlong convert_decimal2longlong(const my_decimal *val, bool unsigned_flag, int *err); @@ -343,6 +358,7 @@ public: bool eq_def(Field *field); int store_decimal(const my_decimal *); my_decimal *val_decimal(my_decimal *); + uint is_equal(create_field *new_field); }; @@ -367,6 +383,7 @@ public: uint32 max_length() { return field_length; } friend class create_field; my_decimal *val_decimal(my_decimal *); + uint is_equal(create_field *new_field); }; @@ -1055,7 +1072,11 @@ public: longlong val_int(void); String *val_str(String*,String *); my_decimal *val_decimal(my_decimal *); - int cmp(const char *,const char*); + int cmp_max(const char *, const char *, uint max_length); + int cmp(const char *a,const char*b) + { + return cmp_max(a, b, ~0); + } void sort_string(char *buff,uint length); void get_key_image(char *buff,uint length, imagetype type); void set_key_image(char *buff,uint length); @@ -1081,6 +1102,7 @@ public: Field *new_key_field(MEM_ROOT *root, struct st_table *new_table, char *new_ptr, uchar *new_null_ptr, uint new_null_bit); + uint is_equal(create_field *new_field); }; @@ -1111,7 +1133,9 @@ public: longlong val_int(void); String *val_str(String*,String *); my_decimal *val_decimal(my_decimal *); - int cmp(const char *,const char*); + int cmp_max(const char *, const char *, uint max_length); + int cmp(const char *a,const char*b) + { return cmp_max(a, b, ~0); } int cmp(const char *a, uint32 a_length, const char *b, uint32 b_length); int cmp_binary(const char *a,const char *b, uint32 max_length=~0L); int key_cmp(const byte *,const byte*); @@ -1135,6 +1159,10 @@ public: { memcpy_fixed(str,ptr+packlength,sizeof(char*)); } + inline void get_ptr(char **str, uint row_offset) + { + memcpy_fixed(str,ptr+packlength+row_offset,sizeof(char*)); + } inline void set_ptr(char *length,char *data) { memcpy(ptr,length,packlength); @@ -1304,6 +1332,7 @@ public: my_decimal *val_decimal(my_decimal *); int cmp(const char *a, const char *b) { return cmp_binary(a, b); } + int cmp_max(const char *a, const char *b, uint max_length); int key_cmp(const byte *a, const byte *b) { return cmp_binary((char *) a, (char *) b); } int key_cmp(const byte *str, uint length); diff --git a/sql/ha_berkeley.cc b/sql/ha_berkeley.cc index b8a779c08cf..9a0e0ed1488 100644 --- a/sql/ha_berkeley.cc +++ b/sql/ha_berkeley.cc @@ -1367,7 +1367,7 @@ 
int ha_berkeley::delete_row(const byte * record) } -int ha_berkeley::index_init(uint keynr) +int ha_berkeley::index_init(uint keynr, bool sorted) { int error; DBUG_ENTER("ha_berkeley::index_init"); @@ -1645,7 +1645,7 @@ int ha_berkeley::rnd_init(bool scan) { DBUG_ENTER("rnd_init"); current_row.flags=DB_DBT_REALLOC; - DBUG_RETURN(index_init(primary_key)); + DBUG_RETURN(index_init(primary_key, 0)); } int ha_berkeley::rnd_end() @@ -2153,7 +2153,7 @@ ulonglong ha_berkeley::get_auto_increment() (void) ha_berkeley::extra(HA_EXTRA_KEYREAD); /* Set 'active_index' */ - ha_berkeley::index_init(table->s->next_number_index); + ha_berkeley::index_init(table->s->next_number_index, 0); if (!table->s->next_number_key_offset) { // Autoincrement at key-start @@ -2492,7 +2492,7 @@ void ha_berkeley::get_status() if (!(share->status & STATUS_PRIMARY_KEY_INIT)) { (void) extra(HA_EXTRA_KEYREAD); - index_init(primary_key); + index_init(primary_key, 0); if (!index_last(table->record[1])) share->auto_ident=uint5korr(current_ident); index_end(); @@ -2645,4 +2645,14 @@ int ha_berkeley::cmp_ref(const byte *ref1, const byte *ref2) return 0; } + +bool ha_berkeley::check_if_incompatible_data(HA_CREATE_INFO *info, + uint table_changes) +{ + if (table_changes < IS_EQUAL_YES) + return COMPATIBLE_DATA_NO; + return COMPATIBLE_DATA_YES; +} + + #endif /* HAVE_BERKELEY_DB */ diff --git a/sql/ha_berkeley.h b/sql/ha_berkeley.h index 282641e3f25..aab76accefa 100644 --- a/sql/ha_berkeley.h +++ b/sql/ha_berkeley.h @@ -92,7 +92,7 @@ class ha_berkeley: public handler const char **bas_ext() const; ulong table_flags(void) const { return int_table_flags; } uint max_supported_keys() const { return MAX_KEY-1; } - uint extra_rec_buf_length() { return BDB_HIDDEN_PRIMARY_KEY_LENGTH; } + uint extra_rec_buf_length() const { return BDB_HIDDEN_PRIMARY_KEY_LENGTH; } ha_rows estimate_rows_upper_bound(); const key_map *keys_to_use_for_scanning() { return &key_map_full; } bool has_transactions() { return 1;} @@ -103,7 +103,7 @@ class ha_berkeley: public handler int write_row(byte * buf); int update_row(const byte * old_data, byte * new_data); int delete_row(const byte * buf); - int index_init(uint index); + int index_init(uint index, bool sorted); int index_end(); int index_read(byte * buf, const byte * key, uint key_len, enum ha_rkey_function find_flag); @@ -151,6 +151,7 @@ class ha_berkeley: public handler uint8 table_cache_type() { return HA_CACHE_TBL_TRANSACT; } bool primary_key_is_clustered() { return true; } int cmp_ref(const byte *ref1, const byte *ref2); + bool check_if_incompatible_data(HA_CREATE_INFO *info, uint table_changes); }; extern bool berkeley_shared_data; diff --git a/sql/ha_federated.cc b/sql/ha_federated.cc index 96cb81fe3ec..87525edd4c9 100644 --- a/sql/ha_federated.cc +++ b/sql/ha_federated.cc @@ -1546,9 +1546,6 @@ inline uint field_in_record_is_null(TABLE *table, int ha_federated::write_row(byte *buf) { bool has_fields= FALSE; - uint all_fields_have_same_query_id= 1; - ulong current_query_id= 1; - ulong tmp_query_id= 1; char insert_buffer[FEDERATED_QUERY_BUFFER_SIZE]; char values_buffer[FEDERATED_QUERY_BUFFER_SIZE]; char insert_field_value_buffer[STRING_BUFFER_USUAL_SIZE]; @@ -1577,14 +1574,6 @@ int ha_federated::write_row(byte *buf) table->timestamp_field->set_time(); /* - get the current query id - the fields that we add to the insert - statement to send to the foreign will not be appended unless they match - this query id - */ - current_query_id= table->in_use->query_id; - DBUG_PRINT("info", ("current query id %d", 
current_query_id)); - - /* start both our field and field values strings */ insert_string.append(FEDERATED_INSERT); @@ -1597,21 +1586,8 @@ int ha_federated::write_row(byte *buf) values_string.append(FEDERATED_OPENPAREN); /* - Even if one field is different, all_fields_same_query_id can't remain - 0 if it remains 0, then that means no fields were specified in the query - such as in the case of INSERT INTO table VALUES (val1, val2, valN) - - */ - for (field= table->field; *field; field++) - { - if (field > table->field && tmp_query_id != (*field)->query_id) - all_fields_have_same_query_id= 0; - - tmp_query_id= (*field)->query_id; - } - /* loop through the field pointer array, add any fields to both the values - list and the fields list that match the current query id + list and the fields list that is part of the write set You might ask "Why an index variable (has_fields) ?" My answer is that we need to count how many fields we actually need @@ -1619,8 +1595,7 @@ int ha_federated::write_row(byte *buf) for (field= table->field; *field; field++) { /* if there is a query id and if it's equal to the current query id */ - if (((*field)->query_id && (*field)->query_id == current_query_id) - || all_fields_have_same_query_id) + if (ha_get_bit_in_write_set((*field)->fieldnr)) { /* There are some fields. This will be used later to determine @@ -2086,7 +2061,7 @@ error: } /* Initialized at each key walk (called multiple times unlike rnd_init()) */ -int ha_federated::index_init(uint keynr) +int ha_federated::index_init(uint keynr, bool sorted) { int error; DBUG_ENTER("ha_federated::index_init"); diff --git a/sql/ha_federated.h b/sql/ha_federated.h index f75fa21b1d6..c94a28219ae 100644 --- a/sql/ha_federated.h +++ b/sql/ha_federated.h @@ -248,7 +248,7 @@ public: int write_row(byte *buf); int update_row(const byte *old_data, byte *new_data); int delete_row(const byte *buf); - int index_init(uint keynr); + int index_init(uint keynr, bool sorted); int index_read(byte *buf, const byte *key, uint key_len, enum ha_rkey_function find_flag); int index_read_idx(byte *buf, uint idx, const byte *key, diff --git a/sql/ha_heap.cc b/sql/ha_heap.cc index 94ee3f8e656..01e693978db 100644 --- a/sql/ha_heap.cc +++ b/sql/ha_heap.cc @@ -629,3 +629,15 @@ ulonglong ha_heap::get_auto_increment() ha_heap::info(HA_STATUS_AUTO); return auto_increment_value; } + + +bool ha_heap::check_if_incompatible_data(HA_CREATE_INFO *info, + uint table_changes) +{ + /* Check that auto_increment value was not changed */ + if ((table_changes != IS_EQUAL_YES && + info->used_fields & HA_CREATE_USED_AUTO) && + info->auto_increment_value != 0) + return COMPATIBLE_DATA_NO; + return COMPATIBLE_DATA_YES; +} diff --git a/sql/ha_heap.h b/sql/ha_heap.h index 7c4227e952c..24097460a24 100644 --- a/sql/ha_heap.h +++ b/sql/ha_heap.h @@ -106,6 +106,7 @@ public: HEAP_PTR ptr2=*(HEAP_PTR*)ref2; return ptr1 < ptr2? -1 : (ptr1 > ptr2? 
1 : 0); } + bool check_if_incompatible_data(HA_CREATE_INFO *info, uint table_changes); private: void update_key_stats(); }; diff --git a/sql/ha_innodb.cc b/sql/ha_innodb.cc index 4ed5fadb603..638bda49002 100644 --- a/sql/ha_innodb.cc +++ b/sql/ha_innodb.cc @@ -111,28 +111,28 @@ typedef byte mysql_byte; /* Include necessary InnoDB headers */ extern "C" { -#include "../innobase/include/univ.i" -#include "../innobase/include/os0file.h" -#include "../innobase/include/os0thread.h" -#include "../innobase/include/srv0start.h" -#include "../innobase/include/srv0srv.h" -#include "../innobase/include/trx0roll.h" -#include "../innobase/include/trx0trx.h" -#include "../innobase/include/trx0sys.h" -#include "../innobase/include/mtr0mtr.h" -#include "../innobase/include/row0ins.h" -#include "../innobase/include/row0mysql.h" -#include "../innobase/include/row0sel.h" -#include "../innobase/include/row0upd.h" -#include "../innobase/include/log0log.h" -#include "../innobase/include/lock0lock.h" -#include "../innobase/include/dict0crea.h" -#include "../innobase/include/btr0cur.h" -#include "../innobase/include/btr0btr.h" -#include "../innobase/include/fsp0fsp.h" -#include "../innobase/include/sync0sync.h" -#include "../innobase/include/fil0fil.h" -#include "../innobase/include/trx0xa.h" +#include "../storage/innobase/include/univ.i" +#include "../storage/innobase/include/os0file.h" +#include "../storage/innobase/include/os0thread.h" +#include "../storage/innobase/include/srv0start.h" +#include "../storage/innobase/include/srv0srv.h" +#include "../storage/innobase/include/trx0roll.h" +#include "../storage/innobase/include/trx0trx.h" +#include "../storage/innobase/include/trx0sys.h" +#include "../storage/innobase/include/mtr0mtr.h" +#include "../storage/innobase/include/row0ins.h" +#include "../storage/innobase/include/row0mysql.h" +#include "../storage/innobase/include/row0sel.h" +#include "../storage/innobase/include/row0upd.h" +#include "../storage/innobase/include/log0log.h" +#include "../storage/innobase/include/lock0lock.h" +#include "../storage/innobase/include/dict0crea.h" +#include "../storage/innobase/include/btr0cur.h" +#include "../storage/innobase/include/btr0btr.h" +#include "../storage/innobase/include/fsp0fsp.h" +#include "../storage/innobase/include/sync0sync.h" +#include "../storage/innobase/include/fil0fil.h" +#include "../storage/innobase/include/trx0xa.h" } #define HA_INNOBASE_ROWS_IN_TABLE 10000 /* to get optimization right */ @@ -3081,7 +3081,8 @@ build_template( goto include_field; } - if (thd->query_id == field->query_id) { + if (table->file->ha_get_bit_in_read_set(i+1) || + table->file->ha_get_bit_in_write_set(i+1)) { /* This field is needed in the query */ goto include_field; @@ -3701,7 +3702,8 @@ int ha_innobase::index_init( /*====================*/ /* out: 0 or error number */ - uint keynr) /* in: key (index) number */ + uint keynr, /* in: key (index) number */ + bool sorted) /* in: 1 if result MUST be sorted according to index */ { int error = 0; DBUG_ENTER("index_init"); @@ -6809,7 +6811,7 @@ ha_innobase::innobase_read_and_init_auto_inc( } (void) extra(HA_EXTRA_KEYREAD); - index_init(table->s->next_number_index); + index_init(table->s->next_number_index, 1); /* Starting from 5.0.9, we use a consistent read to read the auto-inc column maximum value. 
This eliminates the spurious deadlocks caused @@ -7342,4 +7344,24 @@ innobase_set_cursor_view( (cursor_view_t*) curview); } + +bool ha_innobase::check_if_incompatible_data(HA_CREATE_INFO *info, + uint table_changes) +{ + if (table_changes != IS_EQUAL_YES) + return COMPATIBLE_DATA_NO; + + /* Check that auto_increment value was not changed */ + if ((info->used_fields & HA_CREATE_USED_AUTO) && + info->auto_increment_value != 0) + return COMPATIBLE_DATA_NO; + + /* Check that row format didn't change */ + if ((info->used_fields & HA_CREATE_USED_AUTO) && + get_row_type() != info->row_type) + return COMPATIBLE_DATA_NO; + + return COMPATIBLE_DATA_YES; +} + #endif /* HAVE_INNOBASE_DB */ diff --git a/sql/ha_innodb.h b/sql/ha_innodb.h index 672e48d9817..595ab5ccde2 100644 --- a/sql/ha_innodb.h +++ b/sql/ha_innodb.h @@ -124,7 +124,7 @@ class ha_innobase: public handler int delete_row(const byte * buf); void unlock_row(); - int index_init(uint index); + int index_init(uint index, bool sorted); int index_end(); int index_read(byte * buf, const byte * key, uint key_len, enum ha_rkey_function find_flag); @@ -152,6 +152,16 @@ class ha_innobase: public handler int transactional_table_lock(THD *thd, int lock_type); int start_stmt(THD *thd); + int ha_retrieve_all_cols() + { + ha_set_all_bits_in_read_set(); + return extra(HA_EXTRA_RETRIEVE_ALL_COLS); + } + int ha_retrieve_all_pk() + { + ha_set_primary_key_in_read_set(); + return extra(HA_EXTRA_RETRIEVE_PRIMARY_KEY); + } void position(byte *record); ha_rows records_in_range(uint inx, key_range *min_key, key_range *max_key); @@ -194,6 +204,8 @@ class ha_innobase: public handler static ulonglong get_mysql_bin_log_pos(); bool primary_key_is_clustered() { return true; } int cmp_ref(const byte *ref1, const byte *ref2); + bool check_if_incompatible_data(HA_CREATE_INFO *info, + uint table_changes); }; extern struct show_var_st innodb_status_variables[]; diff --git a/sql/ha_myisam.cc b/sql/ha_myisam.cc index 8f3970d69e6..715ee3da8b9 100644 --- a/sql/ha_myisam.cc +++ b/sql/ha_myisam.cc @@ -27,8 +27,8 @@ #ifndef MASTER #include "../srclib/myisam/myisamdef.h" #else -#include "../myisam/myisamdef.h" -#include "../myisam/rt_index.h" +#include "../storage/myisam/myisamdef.h" +#include "../storage/myisam/rt_index.h" #endif ulong myisam_recover_options= HA_RECOVER_NONE; @@ -1697,3 +1697,25 @@ uint ha_myisam::checksum() const return (uint)file->s->state.checksum; } + +bool ha_myisam::check_if_incompatible_data(HA_CREATE_INFO *info, + uint table_changes) +{ + uint options= table->s->db_options_in_use; + + if (info->auto_increment_value != auto_increment_value || + info->raid_type != raid_type || + info->raid_chunks != raid_chunks || + info->raid_chunksize != raid_chunksize || + info->data_file_name != data_file_name || + info->index_file_name != index_file_name || + table_changes == IS_EQUAL_NO) + return COMPATIBLE_DATA_NO; + + if ((options & (HA_OPTION_PACK_RECORD | HA_OPTION_CHECKSUM | + HA_OPTION_DELAY_KEY_WRITE)) != + (info->table_options & (HA_OPTION_PACK_RECORD | HA_OPTION_CHECKSUM | + HA_OPTION_DELAY_KEY_WRITE))) + return COMPATIBLE_DATA_NO; + return COMPATIBLE_DATA_YES; +} diff --git a/sql/ha_myisam.h b/sql/ha_myisam.h index ca684463311..79036893faf 100644 --- a/sql/ha_myisam.h +++ b/sql/ha_myisam.h @@ -123,6 +123,7 @@ class ha_myisam: public handler int backup(THD* thd, HA_CHECK_OPT* check_opt); int assign_to_keycache(THD* thd, HA_CHECK_OPT* check_opt); int preload_keys(THD* thd, HA_CHECK_OPT* check_opt); + bool check_if_incompatible_data(HA_CREATE_INFO *info, uint 
table_changes); #ifdef HAVE_REPLICATION int dump(THD* thd, int fd); int net_read_dump(NET* net); diff --git a/sql/ha_myisammrg.cc b/sql/ha_myisammrg.cc index f92717e11eb..388ecfb331f 100644 --- a/sql/ha_myisammrg.cc +++ b/sql/ha_myisammrg.cc @@ -25,7 +25,7 @@ #ifndef MASTER #include "../srclib/myisammrg/myrg_def.h" #else -#include "../myisammrg/myrg_def.h" +#include "../storage/myisammrg/myrg_def.h" #endif /***************************************************************************** @@ -515,3 +515,14 @@ void ha_myisammrg::append_create_info(String *packet) } packet->append(')'); } + + +bool ha_myisammrg::check_if_incompatible_data(HA_CREATE_INFO *info, + uint table_changes) +{ + /* + For myisammrg, we should always re-generate the mapping file as this + is trivial to do + */ + return COMPATIBLE_DATA_NO; +} diff --git a/sql/ha_myisammrg.h b/sql/ha_myisammrg.h index c762b7c286e..45cf93a7e63 100644 --- a/sql/ha_myisammrg.h +++ b/sql/ha_myisammrg.h @@ -82,4 +82,5 @@ class ha_myisammrg: public handler void update_create_info(HA_CREATE_INFO *create_info); void append_create_info(String *packet); MYRG_INFO *myrg_info() { return file; } + bool check_if_incompatible_data(HA_CREATE_INFO *info, uint table_changes); }; diff --git a/sql/ha_ndbcluster.cc b/sql/ha_ndbcluster.cc index 146fbee93c8..a036ccdbf14 100644 --- a/sql/ha_ndbcluster.cc +++ b/sql/ha_ndbcluster.cc @@ -34,6 +34,7 @@ // options from from mysqld.cc extern my_bool opt_ndb_optimized_node_selection; +extern enum ndb_distribution opt_ndb_distribution_id; extern const char *opt_ndbcluster_connectstring; // Default value for parallelism @@ -41,7 +42,7 @@ static const int parallelism= 0; // Default value for max number of transactions // createable against NDB from this handler -static const int max_transactions= 2; +static const int max_transactions= 3; // should really be 2 but there is a transaction to much allocated when loch table is used static const char *ha_ndb_ext=".ndb"; @@ -103,6 +104,7 @@ static HASH ndbcluster_open_tables; static byte *ndbcluster_get_key(NDB_SHARE *share,uint *length, my_bool not_used __attribute__((unused))); +static void ndb_set_fragmentation(NDBTAB & tab, TABLE *table, uint pk_len); static NDB_SHARE *get_share(const char *table_name); static void free_share(NDB_SHARE *share); @@ -858,21 +860,18 @@ int ha_ndbcluster::get_ndb_value(NdbOperation *ndb_op, Field *field, /* Check if any set or get of blob value in current query. */ -bool ha_ndbcluster::uses_blob_value(bool all_fields) +bool ha_ndbcluster::uses_blob_value() { if (table->s->blob_fields == 0) return FALSE; - if (all_fields) - return TRUE; { uint no_fields= table->s->fields; int i; - THD *thd= current_thd; // They always put blobs at the end.. 
for (i= no_fields - 1; i >= 0; i--) { - Field *field= table->field[i]; - if (thd->query_id == field->query_id) + if ((m_write_op && ha_get_bit_in_write_set(i+1)) || + (!m_write_op && ha_get_bit_in_read_set(i+1))) { return TRUE; } @@ -1145,7 +1144,7 @@ int ha_ndbcluster::get_ndb_lock_type(enum thr_lock_type type) { if (type >= TL_WRITE_ALLOW_WRITE) return NdbOperation::LM_Exclusive; - else if (uses_blob_value(m_retrieve_all_fields)) + else if (uses_blob_value()) return NdbOperation::LM_Read; else return NdbOperation::LM_CommittedRead; @@ -1297,17 +1296,14 @@ inline int ha_ndbcluster::define_read_attrs(byte* buf, NdbOperation* op) { uint i; - THD *thd= current_thd; - DBUG_ENTER("define_read_attrs"); // Define attributes to read for (i= 0; i < table->s->fields; i++) { Field *field= table->field[i]; - if ((thd->query_id == field->query_id) || - ((field->flags & PRI_KEY_FLAG)) || - m_retrieve_all_fields) + if (ha_get_bit_in_read_set(i+1) || + ((field->flags & PRI_KEY_FLAG))) { if (get_ndb_value(op, field, i, buf)) ERR_RETURN(op->getNdbError()); @@ -1334,11 +1330,13 @@ int ha_ndbcluster::define_read_attrs(byte* buf, NdbOperation* op) DBUG_RETURN(0); } + /* Read one record from NDB using primary key */ -int ha_ndbcluster::pk_read(const byte *key, uint key_len, byte *buf) +int ha_ndbcluster::pk_read(const byte *key, uint key_len, byte *buf, + uint32 part_id) { uint no_fields= table->s->fields; NdbConnection *trans= m_active_trans; @@ -1348,6 +1346,7 @@ int ha_ndbcluster::pk_read(const byte *key, uint key_len, byte *buf) DBUG_ENTER("pk_read"); DBUG_PRINT("enter", ("key_len: %u", key_len)); DBUG_DUMP("key", (char*)key, key_len); + m_write_op= FALSE; NdbOperation::LockMode lm= (NdbOperation::LockMode)get_ndb_lock_type(m_lock.type); @@ -1355,6 +1354,8 @@ int ha_ndbcluster::pk_read(const byte *key, uint key_len, byte *buf) op->readTuple(lm) != 0) ERR_RETURN(trans->getNdbError()); + if (m_use_partition_function) + op->setPartitionId(part_id); if (table->s->primary_key == MAX_KEY) { // This table has no primary key, use "hidden" primary key @@ -1392,17 +1393,20 @@ int ha_ndbcluster::pk_read(const byte *key, uint key_len, byte *buf) Read one complementing record from NDB using primary key from old_data */ -int ha_ndbcluster::complemented_pk_read(const byte *old_data, byte *new_data) +int ha_ndbcluster::complemented_pk_read(const byte *old_data, byte *new_data, + uint32 old_part_id) { uint no_fields= table->s->fields, i; NdbTransaction *trans= m_active_trans; NdbOperation *op; - THD *thd= current_thd; DBUG_ENTER("complemented_pk_read"); + m_write_op= FALSE; - if (m_retrieve_all_fields) + if (ha_get_all_bit_in_read_set()) + { // We have allready retrieved all fields, nothing to complement DBUG_RETURN(0); + } NdbOperation::LockMode lm= (NdbOperation::LockMode)get_ndb_lock_type(m_lock.type); @@ -1412,12 +1416,16 @@ int ha_ndbcluster::complemented_pk_read(const byte *old_data, byte *new_data) int res; if ((res= set_primary_key_from_record(op, old_data))) ERR_RETURN(trans->getNdbError()); + + if (m_use_partition_function) + op->setPartitionId(old_part_id); + // Read all unreferenced non-key field(s) for (i= 0; i < no_fields; i++) { Field *field= table->field[i]; if (!((field->flags & PRI_KEY_FLAG) || - (thd->query_id == field->query_id))) + (ha_get_bit_in_read_set(i+1)))) { if (get_ndb_value(op, field, i, new_data)) ERR_RETURN(trans->getNdbError()); @@ -1441,7 +1449,7 @@ int ha_ndbcluster::complemented_pk_read(const byte *old_data, byte *new_data) { Field *field= table->field[i]; if (!((field->flags & 
PRI_KEY_FLAG) || - (thd->query_id == field->query_id))) + (ha_get_bit_in_read_set(i+1)))) { m_value[i].ptr= NULL; } @@ -1470,6 +1478,17 @@ int ha_ndbcluster::peek_row(const byte *record) if ((res= set_primary_key_from_record(op, record))) ERR_RETURN(trans->getNdbError()); + if (m_use_partition_function) + { + uint32 part_id; + int error; + if ((error= m_part_info->get_partition_id(m_part_info, &part_id))) + { + DBUG_RETURN(error); + } + op->setPartitionId(part_id); + } + if (execute_no_commit_ie(this,trans) != 0) { table->status= STATUS_NOT_FOUND; @@ -1808,7 +1827,8 @@ int ha_ndbcluster::set_bounds(NdbIndexScanOperation *op, int ha_ndbcluster::ordered_index_scan(const key_range *start_key, const key_range *end_key, - bool sorted, bool descending, byte* buf) + bool sorted, bool descending, + byte* buf, part_id_range *part_spec) { int res; bool restart; @@ -1819,6 +1839,7 @@ int ha_ndbcluster::ordered_index_scan(const key_range *start_key, DBUG_PRINT("enter", ("index: %u, sorted: %d, descending: %d", active_index, sorted, descending)); DBUG_PRINT("enter", ("Starting new ordered scan on %s", m_tabname)); + m_write_op= FALSE; // Check that sorted seems to be initialised DBUG_ASSERT(sorted == 0 || sorted == 1); @@ -1833,11 +1854,17 @@ int ha_ndbcluster::ordered_index_scan(const key_range *start_key, (const NDBTAB *) m_table)) || op->readTuples(lm, 0, parallelism, sorted, descending)) ERR_RETURN(trans->getNdbError()); + if (m_use_partition_function && part_spec != NULL && + part_spec->start_part == part_spec->end_part) + op->setPartitionId(part_spec->start_part); m_active_cursor= op; } else { restart= TRUE; op= (NdbIndexScanOperation*)m_active_cursor; + if (m_use_partition_function && part_spec != NULL && + part_spec->start_part == part_spec->end_part) + op->setPartitionId(part_spec->start_part); DBUG_ASSERT(op->getSorted() == sorted); DBUG_ASSERT(op->getLockMode() == (NdbOperation::LockMode)get_ndb_lock_type(m_lock.type)); @@ -1878,6 +1905,7 @@ int ha_ndbcluster::full_table_scan(byte *buf) DBUG_ENTER("full_table_scan"); DBUG_PRINT("enter", ("Starting new scan on %s", m_tabname)); + m_write_op= FALSE; NdbOperation::LockMode lm= (NdbOperation::LockMode)get_ndb_lock_type(m_lock.type); @@ -1907,6 +1935,7 @@ int ha_ndbcluster::write_row(byte *record) NdbOperation *op; int res; THD *thd= current_thd; + m_write_op= TRUE; DBUG_ENTER("write_row"); @@ -1935,6 +1964,17 @@ int ha_ndbcluster::write_row(byte *record) if (res != 0) ERR_RETURN(trans->getNdbError()); + if (m_use_partition_function) + { + uint32 part_id; + int error; + if ((error= m_part_info->get_partition_id(m_part_info, &part_id))) + { + DBUG_RETURN(error); + } + op->setPartitionId(part_id); + } + if (table->s->primary_key == MAX_KEY) { // Table has hidden primary key @@ -2092,25 +2132,35 @@ int ha_ndbcluster::update_row(const byte *old_data, byte *new_data) NdbScanOperation* cursor= m_active_cursor; NdbOperation *op; uint i; + uint32 old_part_id= 0, new_part_id= 0; + int error; DBUG_ENTER("update_row"); + m_write_op= TRUE; statistic_increment(thd->status_var.ha_update_count, &LOCK_status); if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_UPDATE) { table->timestamp_field->set_time(); - // Set query_id so that field is really updated - table->timestamp_field->query_id= thd->query_id; + ha_set_bit_in_write_set(table->timestamp_field->fieldnr); + } + + if (m_use_partition_function && + (error= get_parts_for_update(old_data, new_data, table->record[0], + m_part_info, &old_part_id, &new_part_id))) + { + DBUG_RETURN(error); } /* Check for 
update of primary key for special handling */ if ((table->s->primary_key != MAX_KEY) && - (key_cmp(table->s->primary_key, old_data, new_data))) + (key_cmp(table->s->primary_key, old_data, new_data)) || + (old_part_id != new_part_id)) { int read_res, insert_res, delete_res, undo_res; DBUG_PRINT("info", ("primary key update, doing pk read+delete+insert")); // Get all old fields, since we optimize away fields not in query - read_res= complemented_pk_read(old_data, new_data); + read_res= complemented_pk_read(old_data, new_data, old_part_id); if (read_res) { DBUG_PRINT("info", ("pk read failed")); @@ -2164,8 +2214,10 @@ int ha_ndbcluster::update_row(const byte *old_data, byte *new_data) if (!(op= cursor->updateCurrentTuple())) ERR_RETURN(trans->getNdbError()); m_ops_pending++; - if (uses_blob_value(FALSE)) + if (uses_blob_value()) m_blobs_pending= TRUE; + if (m_use_partition_function) + cursor->setPartitionId(new_part_id); } else { @@ -2173,6 +2225,8 @@ int ha_ndbcluster::update_row(const byte *old_data, byte *new_data) op->updateTuple() != 0) ERR_RETURN(trans->getNdbError()); + if (m_use_partition_function) + op->setPartitionId(new_part_id); if (table->s->primary_key == MAX_KEY) { // This table has no primary key, use "hidden" primary key @@ -2202,7 +2256,7 @@ int ha_ndbcluster::update_row(const byte *old_data, byte *new_data) for (i= 0; i < table->s->fields; i++) { Field *field= table->field[i]; - if (((thd->query_id == field->query_id) || m_retrieve_all_fields) && + if (ha_get_bit_in_write_set(i+1) && (!(field->flags & PRI_KEY_FLAG)) && set_ndb_value(op, field, i)) ERR_RETURN(op->getNdbError()); @@ -2228,11 +2282,21 @@ int ha_ndbcluster::delete_row(const byte *record) NdbTransaction *trans= m_active_trans; NdbScanOperation* cursor= m_active_cursor; NdbOperation *op; + uint32 part_id; + int error; DBUG_ENTER("delete_row"); + m_write_op= TRUE; statistic_increment(thd->status_var.ha_delete_count,&LOCK_status); m_rows_changed++; + if (m_use_partition_function && + (error= get_part_for_delete(record, table->record[0], m_part_info, + &part_id))) + { + DBUG_RETURN(error); + } + if (cursor) { /* @@ -2247,6 +2311,9 @@ int ha_ndbcluster::delete_row(const byte *record) ERR_RETURN(trans->getNdbError()); m_ops_pending++; + if (m_use_partition_function) + cursor->setPartitionId(part_id); + no_uncommitted_rows_update(-1); if (!m_primary_key_update) @@ -2260,6 +2327,9 @@ int ha_ndbcluster::delete_row(const byte *record) op->deleteTuple() != 0) ERR_RETURN(trans->getNdbError()); + if (m_use_partition_function) + op->setPartitionId(part_id); + no_uncommitted_rows_update(-1); if (table->s->primary_key == MAX_KEY) @@ -2385,8 +2455,6 @@ void ha_ndbcluster::print_results() DBUG_ENTER("print_results"); #ifndef DBUG_OFF - const NDBTAB *tab= (const NDBTAB*) m_table; - if (!_db_on_) DBUG_VOID_RETURN; @@ -2441,11 +2509,13 @@ print_value: } -int ha_ndbcluster::index_init(uint index) +int ha_ndbcluster::index_init(uint index, bool sorted) { DBUG_ENTER("ha_ndbcluster::index_init"); - DBUG_PRINT("enter", ("index: %u", index)); - DBUG_RETURN(handler::index_init(index)); + DBUG_PRINT("enter", ("index: %u sorted: %d", index, sorted)); + active_index= index; + m_sorted= sorted; + DBUG_RETURN(0); } @@ -2482,55 +2552,16 @@ int ha_ndbcluster::index_read(byte *buf, const byte *key, uint key_len, enum ha_rkey_function find_flag) { + key_range start_key; + bool descending= FALSE; DBUG_ENTER("ha_ndbcluster::index_read"); DBUG_PRINT("enter", ("active_index: %u, key_len: %u, find_flag: %d", active_index, key_len, find_flag)); - int 
error; - ndb_index_type type= get_index_type(active_index); - const KEY* key_info= table->key_info+active_index; - switch (type){ - case PRIMARY_KEY_ORDERED_INDEX: - case PRIMARY_KEY_INDEX: - if (find_flag == HA_READ_KEY_EXACT && key_info->key_length == key_len) - { - if (m_active_cursor && (error= close_scan())) - DBUG_RETURN(error); - DBUG_RETURN(pk_read(key, key_len, buf)); - } - else if (type == PRIMARY_KEY_INDEX) - { - DBUG_RETURN(1); - } - break; - case UNIQUE_ORDERED_INDEX: - case UNIQUE_INDEX: - if (find_flag == HA_READ_KEY_EXACT && key_info->key_length == key_len && - !check_null_in_key(key_info, key, key_len)) - { - if (m_active_cursor && (error= close_scan())) - DBUG_RETURN(error); - DBUG_RETURN(unique_index_read(key, key_len, buf)); - } - else if (type == UNIQUE_INDEX) - { - DBUG_RETURN(1); - } - break; - case ORDERED_INDEX: - break; - default: - case UNDEFINED_INDEX: - DBUG_ASSERT(FALSE); - DBUG_RETURN(1); - break; - } - - key_range start_key; start_key.key= key; start_key.length= key_len; start_key.flag= find_flag; - bool descending= FALSE; + descending= FALSE; switch (find_flag) { case HA_READ_KEY_OR_PREV: case HA_READ_BEFORE_KEY: @@ -2541,8 +2572,8 @@ int ha_ndbcluster::index_read(byte *buf, default: break; } - error= ordered_index_scan(&start_key, 0, TRUE, descending, buf); - DBUG_RETURN(error == HA_ERR_END_OF_FILE ? HA_ERR_KEY_NOT_FOUND : error); + DBUG_RETURN(read_range_first_to_buf(&start_key, 0, descending, + m_sorted, buf)); } @@ -2553,7 +2584,7 @@ int ha_ndbcluster::index_read_idx(byte *buf, uint index_no, statistic_increment(current_thd->status_var.ha_read_key_count, &LOCK_status); DBUG_ENTER("ha_ndbcluster::index_read_idx"); DBUG_PRINT("enter", ("index_no: %u, key_len: %u", index_no, key_len)); - index_init(index_no); + index_init(index_no, 0); DBUG_RETURN(index_read(buf, key, key_len, find_flag)); } @@ -2584,7 +2615,7 @@ int ha_ndbcluster::index_first(byte *buf) // Start the ordered index scan and fetch the first row // Only HA_READ_ORDER indexes get called by index_first - DBUG_RETURN(ordered_index_scan(0, 0, TRUE, FALSE, buf)); + DBUG_RETURN(ordered_index_scan(0, 0, TRUE, FALSE, buf, NULL)); } @@ -2592,7 +2623,7 @@ int ha_ndbcluster::index_last(byte *buf) { DBUG_ENTER("ha_ndbcluster::index_last"); statistic_increment(current_thd->status_var.ha_read_last_count,&LOCK_status); - DBUG_RETURN(ordered_index_scan(0, 0, TRUE, TRUE, buf)); + DBUG_RETURN(ordered_index_scan(0, 0, TRUE, TRUE, buf, NULL)); } int ha_ndbcluster::index_read_last(byte * buf, const byte * key, uint key_len) @@ -2601,66 +2632,76 @@ int ha_ndbcluster::index_read_last(byte * buf, const byte * key, uint key_len) DBUG_RETURN(index_read(buf, key, key_len, HA_READ_PREFIX_LAST)); } -inline int ha_ndbcluster::read_range_first_to_buf(const key_range *start_key, const key_range *end_key, - bool eq_r, bool sorted, + bool desc, bool sorted, byte* buf) { - KEY* key_info; - int error= 1; + part_id_range part_spec; + ndb_index_type type= get_index_type(active_index); + const KEY* key_info= table->key_info+active_index; + int error; DBUG_ENTER("ha_ndbcluster::read_range_first_to_buf"); - DBUG_PRINT("info", ("eq_r: %d, sorted: %d", eq_r, sorted)); + DBUG_PRINT("info", ("desc: %d, sorted: %d", desc, sorted)); - switch (get_index_type(active_index)){ + if (m_use_partition_function) + { + get_partition_set(table, buf, active_index, start_key, &part_spec); + if (part_spec.start_part > part_spec.end_part) + { + DBUG_RETURN(HA_ERR_END_OF_FILE); + } + else if (part_spec.start_part == part_spec.end_part) + { + /* + Only 
one partition is required to scan, if sorted is required we + don't need it any more since output from one ordered partitioned + index is always sorted. + */ + sorted= FALSE; + } + } + m_write_op= FALSE; + switch (type){ case PRIMARY_KEY_ORDERED_INDEX: case PRIMARY_KEY_INDEX: - key_info= table->key_info + active_index; if (start_key && start_key->length == key_info->key_length && start_key->flag == HA_READ_KEY_EXACT) { if (m_active_cursor && (error= close_scan())) DBUG_RETURN(error); - error= pk_read(start_key->key, start_key->length, buf); - DBUG_RETURN(error == HA_ERR_KEY_NOT_FOUND ? HA_ERR_END_OF_FILE : error); + DBUG_RETURN(pk_read(start_key->key, start_key->length, buf, + part_spec.start_part)); } break; case UNIQUE_ORDERED_INDEX: case UNIQUE_INDEX: - key_info= table->key_info + active_index; if (start_key && start_key->length == key_info->key_length && start_key->flag == HA_READ_KEY_EXACT && !check_null_in_key(key_info, start_key->key, start_key->length)) { if (m_active_cursor && (error= close_scan())) DBUG_RETURN(error); - error= unique_index_read(start_key->key, start_key->length, buf); - DBUG_RETURN(error == HA_ERR_KEY_NOT_FOUND ? HA_ERR_END_OF_FILE : error); + DBUG_RETURN(unique_index_read(start_key->key, start_key->length, buf)); } break; default: break; } - // Start the ordered index scan and fetch the first row - error= ordered_index_scan(start_key, end_key, sorted, FALSE, buf); - DBUG_RETURN(error); + DBUG_RETURN(ordered_index_scan(start_key, end_key, sorted, desc, buf, + &part_spec)); } - int ha_ndbcluster::read_range_first(const key_range *start_key, const key_range *end_key, bool eq_r, bool sorted) { byte* buf= table->record[0]; DBUG_ENTER("ha_ndbcluster::read_range_first"); - - DBUG_RETURN(read_range_first_to_buf(start_key, - end_key, - eq_r, - sorted, - buf)); + DBUG_RETURN(read_range_first_to_buf(start_key, end_key, FALSE, + sorted, buf)); } int ha_ndbcluster::read_range_next() @@ -2686,7 +2727,7 @@ int ha_ndbcluster::rnd_init(bool scan) DBUG_RETURN(-1); } } - index_init(table->s->primary_key); + index_init(table->s->primary_key, 0); DBUG_RETURN(0); } @@ -2753,7 +2794,20 @@ int ha_ndbcluster::rnd_pos(byte *buf, byte *pos) &LOCK_status); // The primary key for the record is stored in pos // Perform a pk_read using primary key "index" - DBUG_RETURN(pk_read(pos, ref_length, buf)); + { + part_id_range part_spec; + if (m_use_partition_function) + { + key_range key_spec; + KEY *key_info= table->key_info + active_index; + key_spec.key= pos; + key_spec.length= ref_length; + key_spec.flag= HA_READ_KEY_EXACT; + get_full_part_id_from_key(table, buf, key_info, &key_spec, &part_spec); + DBUG_ASSERT(part_spec.start_part == part_spec.end_part); + } + DBUG_RETURN(pk_read(pos, ref_length, buf, part_spec.start_part)); + } } @@ -2885,83 +2939,11 @@ int ha_ndbcluster::extra(enum ha_extra_function operation) { DBUG_ENTER("extra"); switch (operation) { - case HA_EXTRA_NORMAL: /* Optimize for space (def) */ - DBUG_PRINT("info", ("HA_EXTRA_NORMAL")); - break; - case HA_EXTRA_QUICK: /* Optimize for speed */ - DBUG_PRINT("info", ("HA_EXTRA_QUICK")); - break; case HA_EXTRA_RESET: /* Reset database to after open */ DBUG_PRINT("info", ("HA_EXTRA_RESET")); DBUG_PRINT("info", ("Clearing condition stack")); cond_clear(); break; - case HA_EXTRA_CACHE: /* Cash record in HA_rrnd() */ - DBUG_PRINT("info", ("HA_EXTRA_CACHE")); - break; - case HA_EXTRA_NO_CACHE: /* End cacheing of records (def) */ - DBUG_PRINT("info", ("HA_EXTRA_NO_CACHE")); - break; - case HA_EXTRA_NO_READCHECK: /* No readcheck on update 
*/ - DBUG_PRINT("info", ("HA_EXTRA_NO_READCHECK")); - break; - case HA_EXTRA_READCHECK: /* Use readcheck (def) */ - DBUG_PRINT("info", ("HA_EXTRA_READCHECK")); - break; - case HA_EXTRA_KEYREAD: /* Read only key to database */ - DBUG_PRINT("info", ("HA_EXTRA_KEYREAD")); - break; - case HA_EXTRA_NO_KEYREAD: /* Normal read of records (def) */ - DBUG_PRINT("info", ("HA_EXTRA_NO_KEYREAD")); - break; - case HA_EXTRA_NO_USER_CHANGE: /* No user is allowed to write */ - DBUG_PRINT("info", ("HA_EXTRA_NO_USER_CHANGE")); - break; - case HA_EXTRA_KEY_CACHE: - DBUG_PRINT("info", ("HA_EXTRA_KEY_CACHE")); - break; - case HA_EXTRA_NO_KEY_CACHE: - DBUG_PRINT("info", ("HA_EXTRA_NO_KEY_CACHE")); - break; - case HA_EXTRA_WAIT_LOCK: /* Wait until file is avalably (def) */ - DBUG_PRINT("info", ("HA_EXTRA_WAIT_LOCK")); - break; - case HA_EXTRA_NO_WAIT_LOCK: /* If file is locked, return quickly */ - DBUG_PRINT("info", ("HA_EXTRA_NO_WAIT_LOCK")); - break; - case HA_EXTRA_WRITE_CACHE: /* Use write cache in ha_write() */ - DBUG_PRINT("info", ("HA_EXTRA_WRITE_CACHE")); - break; - case HA_EXTRA_FLUSH_CACHE: /* flush write_record_cache */ - DBUG_PRINT("info", ("HA_EXTRA_FLUSH_CACHE")); - break; - case HA_EXTRA_NO_KEYS: /* Remove all update of keys */ - DBUG_PRINT("info", ("HA_EXTRA_NO_KEYS")); - break; - case HA_EXTRA_KEYREAD_CHANGE_POS: /* Keyread, but change pos */ - DBUG_PRINT("info", ("HA_EXTRA_KEYREAD_CHANGE_POS")); /* xxxxchk -r must be used */ - break; - case HA_EXTRA_REMEMBER_POS: /* Remember pos for next/prev */ - DBUG_PRINT("info", ("HA_EXTRA_REMEMBER_POS")); - break; - case HA_EXTRA_RESTORE_POS: - DBUG_PRINT("info", ("HA_EXTRA_RESTORE_POS")); - break; - case HA_EXTRA_REINIT_CACHE: /* init cache from current record */ - DBUG_PRINT("info", ("HA_EXTRA_REINIT_CACHE")); - break; - case HA_EXTRA_FORCE_REOPEN: /* Datafile have changed on disk */ - DBUG_PRINT("info", ("HA_EXTRA_FORCE_REOPEN")); - break; - case HA_EXTRA_FLUSH: /* Flush tables to disk */ - DBUG_PRINT("info", ("HA_EXTRA_FLUSH")); - break; - case HA_EXTRA_NO_ROWS: /* Don't write rows */ - DBUG_PRINT("info", ("HA_EXTRA_NO_ROWS")); - break; - case HA_EXTRA_RESET_STATE: /* Reset positions */ - DBUG_PRINT("info", ("HA_EXTRA_RESET_STATE")); - break; case HA_EXTRA_IGNORE_DUP_KEY: /* Dup keys don't rollback everything*/ DBUG_PRINT("info", ("HA_EXTRA_IGNORE_DUP_KEY")); if (current_thd->lex->sql_command == SQLCOM_REPLACE) @@ -2980,34 +2962,8 @@ int ha_ndbcluster::extra(enum ha_extra_function operation) m_use_write= FALSE; m_ignore_dup_key= FALSE; break; - case HA_EXTRA_RETRIEVE_ALL_COLS: /* Retrieve all columns, not just those - where field->query_id is the same as - the current query id */ - DBUG_PRINT("info", ("HA_EXTRA_RETRIEVE_ALL_COLS")); - m_retrieve_all_fields= TRUE; - break; - case HA_EXTRA_PREPARE_FOR_DELETE: - DBUG_PRINT("info", ("HA_EXTRA_PREPARE_FOR_DELETE")); - break; - case HA_EXTRA_PREPARE_FOR_UPDATE: /* Remove read cache if problems */ - DBUG_PRINT("info", ("HA_EXTRA_PREPARE_FOR_UPDATE")); - break; - case HA_EXTRA_PRELOAD_BUFFER_SIZE: - DBUG_PRINT("info", ("HA_EXTRA_PRELOAD_BUFFER_SIZE")); - break; - case HA_EXTRA_RETRIEVE_PRIMARY_KEY: - DBUG_PRINT("info", ("HA_EXTRA_RETRIEVE_PRIMARY_KEY")); - m_retrieve_primary_key= TRUE; - break; - case HA_EXTRA_CHANGE_KEY_TO_UNIQUE: - DBUG_PRINT("info", ("HA_EXTRA_CHANGE_KEY_TO_UNIQUE")); - break; - case HA_EXTRA_CHANGE_KEY_TO_DUP: - DBUG_PRINT("info", ("HA_EXTRA_CHANGE_KEY_TO_DUP")); - case HA_EXTRA_KEYREAD_PRESERVE_FIELDS: - DBUG_PRINT("info", ("HA_EXTRA_KEYREAD_PRESERVE_FIELDS")); + default: break; - } 
DBUG_RETURN(0); @@ -3286,8 +3242,6 @@ int ha_ndbcluster::external_lock(THD *thd, int lock_type) DBUG_ASSERT(m_active_trans); // Start of transaction m_rows_changed= 0; - m_retrieve_all_fields= FALSE; - m_retrieve_primary_key= FALSE; m_ops_pending= 0; { NDBDICT *dict= ndb->getDictionary(); @@ -3426,8 +3380,6 @@ int ha_ndbcluster::start_stmt(THD *thd) m_active_trans= trans; // Start of statement - m_retrieve_all_fields= FALSE; - m_retrieve_primary_key= FALSE; m_ops_pending= 0; DBUG_RETURN(error); @@ -3802,56 +3754,6 @@ static int create_ndb_column(NDBCOL &col, return 0; } -/* - Create a table in NDB Cluster - */ - -static void ndb_set_fragmentation(NDBTAB &tab, TABLE *form, uint pk_length) -{ - if (form->s->max_rows == (ha_rows) 0) /* default setting, don't set fragmentation */ - return; - /** - * get the number of fragments right - */ - uint no_fragments; - { -#if MYSQL_VERSION_ID >= 50000 - uint acc_row_size= 25 + /*safety margin*/ 2; -#else - uint acc_row_size= pk_length*4; - /* add acc overhead */ - if (pk_length <= 8) /* main page will set the limit */ - acc_row_size+= 25 + /*safety margin*/ 2; - else /* overflow page will set the limit */ - acc_row_size+= 4 + /*safety margin*/ 4; -#endif - ulonglong acc_fragment_size= 512*1024*1024; - ulonglong max_rows= form->s->max_rows; -#if MYSQL_VERSION_ID >= 50100 - no_fragments= (max_rows*acc_row_size)/acc_fragment_size+1; -#else - no_fragments= ((max_rows*acc_row_size)/acc_fragment_size+1 - +1/*correct rounding*/)/2; -#endif - } - { - uint no_nodes= g_ndb_cluster_connection->no_db_nodes(); - NDBTAB::FragmentType ftype; - if (no_fragments > 2*no_nodes) - { - ftype= NDBTAB::FragAllLarge; - if (no_fragments > 4*no_nodes) - push_warning(current_thd, MYSQL_ERROR::WARN_LEVEL_WARN, ER_UNKNOWN_ERROR, - "Ndb might have problems storing the max amount of rows specified"); - } - else if (no_fragments > no_nodes) - ftype= NDBTAB::FragAllMedium; - else - ftype= NDBTAB::FragAllSmall; - tab.setFragmentType(ftype); - } -} - int ha_ndbcluster::create(const char *name, TABLE *form, HA_CREATE_INFO *info) @@ -3954,7 +3856,22 @@ int ha_ndbcluster::create(const char *name, } } - ndb_set_fragmentation(tab, form, pk_length); + // Check partition info + partition_info *part_info= form->s->part_info; + if (part_info) + { + int error; + if ((error= set_up_partition_info(part_info, form, (void*)&tab))) + { + DBUG_RETURN(error); + } + } + else + { + ndb_set_fragmentation(tab, form, pk_length); + } + + if ((my_errno= check_ndb_connection())) DBUG_RETURN(my_errno); @@ -4203,11 +4120,12 @@ ha_ndbcluster::ha_ndbcluster(TABLE *table_arg): HA_NEED_READ_RANGE_BUFFER | HA_CAN_BIT_FIELD), m_share(0), + m_part_info(NULL), + m_use_partition_function(FALSE), + m_sorted(FALSE), m_use_write(FALSE), m_ignore_dup_key(FALSE), m_primary_key_update(FALSE), - m_retrieve_all_fields(FALSE), - m_retrieve_primary_key(FALSE), m_rows_to_insert((ha_rows) 1), m_rows_inserted((ha_rows) 0), m_bulk_insert_rows((ha_rows) 1024), @@ -4319,6 +4237,15 @@ int ha_ndbcluster::open(const char *name, int mode, uint test_if_locked) if (!res) info(HA_STATUS_VARIABLE | HA_STATUS_CONST); + if (table->s->part_info) + { + m_part_info= table->s->part_info; + if (!(m_part_info->part_type == HASH_PARTITION && + m_part_info->list_of_part_fields && + !is_sub_partitioned(m_part_info))) + m_use_partition_function= TRUE; + } + DBUG_RETURN(res); } @@ -5531,6 +5458,7 @@ ha_ndbcluster::read_multi_range_first(KEY_MULTI_RANGE **found_range_p, HANDLER_BUFFER *buffer) { DBUG_ENTER("ha_ndbcluster::read_multi_range_first"); + 
m_write_op= FALSE; int res; KEY* key_info= table->key_info + active_index; @@ -5538,7 +5466,7 @@ ha_ndbcluster::read_multi_range_first(KEY_MULTI_RANGE **found_range_p, ulong reclength= table->s->reclength; NdbOperation* op; - if (uses_blob_value(m_retrieve_all_fields)) + if (uses_blob_value()) { /** * blobs can't be batched currently @@ -5590,12 +5518,29 @@ ha_ndbcluster::read_multi_range_first(KEY_MULTI_RANGE **found_range_p, for (; multi_range_curr<multi_range_end && curr+reclength <= end_of_buffer; multi_range_curr++) { - switch (index_type){ + part_id_range part_spec; + if (m_use_partition_function) + { + get_partition_set(table, curr, active_index, + &multi_range_curr->start_key, + &part_spec); + if (part_spec.start_part > part_spec.end_part) + { + /* + We can skip this partition since the key won't fit into any + partition + */ + curr += reclength; + multi_range_curr->range_flag |= SKIP_RANGE; + continue; + } + } + switch(index_type){ case PRIMARY_KEY_ORDERED_INDEX: if (!(multi_range_curr->start_key.length == key_info->key_length && - multi_range_curr->start_key.flag == HA_READ_KEY_EXACT)) - goto range; - /* fall through */ + multi_range_curr->start_key.flag == HA_READ_KEY_EXACT)) + goto range; + // else fall through case PRIMARY_KEY_INDEX: { multi_range_curr->range_flag |= UNIQUE_RANGE; @@ -5603,7 +5548,9 @@ ha_ndbcluster::read_multi_range_first(KEY_MULTI_RANGE **found_range_p, !op->readTuple(lm) && !set_primary_key(op, multi_range_curr->start_key.key) && !define_read_attrs(curr, op) && - (op->setAbortOption(AO_IgnoreError), TRUE)) + (op->setAbortOption(AO_IgnoreError), TRUE) && + (!m_use_partition_function || + (op->setPartitionId(part_spec.start_part), true))) curr += reclength; else ERR_RETURN(op ? op->getNdbError() : m_active_trans->getNdbError()); @@ -5612,11 +5559,11 @@ ha_ndbcluster::read_multi_range_first(KEY_MULTI_RANGE **found_range_p, break; case UNIQUE_ORDERED_INDEX: if (!(multi_range_curr->start_key.length == key_info->key_length && - multi_range_curr->start_key.flag == HA_READ_KEY_EXACT && - !check_null_in_key(key_info, multi_range_curr->start_key.key, - multi_range_curr->start_key.length))) - goto range; - /* fall through */ + multi_range_curr->start_key.flag == HA_READ_KEY_EXACT && + !check_null_in_key(key_info, multi_range_curr->start_key.key, + multi_range_curr->start_key.length))) + goto range; + // else fall through case UNIQUE_INDEX: { multi_range_curr->range_flag |= UNIQUE_RANGE; @@ -5630,8 +5577,7 @@ ha_ndbcluster::read_multi_range_first(KEY_MULTI_RANGE **found_range_p, ERR_RETURN(op ? 
op->getNdbError() : m_active_trans->getNdbError()); break; } - case ORDERED_INDEX: - { + case ORDERED_INDEX: { range: multi_range_curr->range_flag &= ~(uint)UNIQUE_RANGE; if (scanOp == 0) @@ -5706,7 +5652,7 @@ ha_ndbcluster::read_multi_range_first(KEY_MULTI_RANGE **found_range_p, } #if 0 -#define DBUG_MULTI_RANGE(x) printf("read_multi_range_next: case %d\n", x); +#define DBUG_MULTI_RANGE(x) DBUG_PRINT("info", ("read_multi_range_next: case %d\n", x)); #else #define DBUG_MULTI_RANGE(x) #endif @@ -5717,6 +5663,7 @@ ha_ndbcluster::read_multi_range_next(KEY_MULTI_RANGE ** multi_range_found_p) DBUG_ENTER("ha_ndbcluster::read_multi_range_next"); if (m_disable_multi_read) { + DBUG_MULTI_RANGE(11); DBUG_RETURN(handler::read_multi_range_next(multi_range_found_p)); } @@ -5726,10 +5673,16 @@ ha_ndbcluster::read_multi_range_next(KEY_MULTI_RANGE ** multi_range_found_p) const NdbOperation* op= m_current_multi_operation; for (;multi_range_curr < m_multi_range_defined; multi_range_curr++) { + DBUG_MULTI_RANGE(12); + if (multi_range_curr->range_flag & SKIP_RANGE) + continue; if (multi_range_curr->range_flag & UNIQUE_RANGE) { if (op->getNdbError().code == 0) + { + DBUG_MULTI_RANGE(13); goto found_next; + } op= m_active_trans->getNextCompletedOperation(op); m_multi_range_result_ptr += reclength; @@ -5746,6 +5699,7 @@ ha_ndbcluster::read_multi_range_next(KEY_MULTI_RANGE ** multi_range_found_p) } else { + DBUG_MULTI_RANGE(14); goto close_scan; } } @@ -5779,6 +5733,7 @@ ha_ndbcluster::read_multi_range_next(KEY_MULTI_RANGE ** multi_range_found_p) DBUG_ASSERT(range_no == -1); if ((res= m_multi_cursor->nextResult(true))) { + DBUG_MULTI_RANGE(15); goto close_scan; } multi_range_curr--; // Will be increased in for-loop @@ -5806,12 +5761,16 @@ close_scan: } else { + DBUG_MULTI_RANGE(9); DBUG_RETURN(ndb_err(m_active_trans)); } } if (multi_range_curr == multi_range_end) + { + DBUG_MULTI_RANGE(16); DBUG_RETURN(HA_ERR_END_OF_FILE); + } /** * Read remaining ranges @@ -7043,6 +7002,8 @@ ha_ndbcluster::build_scan_filter_predicate(Ndb_cond * &cond, : NULL; break; default: + field= NULL; //Keep compiler happy + DBUG_ASSERT(0); break; } switch ((negated) ? @@ -7390,4 +7351,211 @@ ha_ndbcluster::generate_scan_filter(Ndb_cond_stack *ndb_cond_stack, DBUG_RETURN(0); } + +/* + Create a table in NDB Cluster + */ +static uint get_no_fragments(ulonglong max_rows) +{ +#if MYSQL_VERSION_ID >= 50000 + uint acc_row_size= 25 + /*safety margin*/ 2; +#else + uint acc_row_size= pk_length*4; + /* add acc overhead */ + if (pk_length <= 8) /* main page will set the limit */ + acc_row_size+= 25 + /*safety margin*/ 2; + else /* overflow page will set the limit */ + acc_row_size+= 4 + /*safety margin*/ 4; +#endif + ulonglong acc_fragment_size= 512*1024*1024; +#if MYSQL_VERSION_ID >= 50100 + return (max_rows*acc_row_size)/acc_fragment_size+1; +#else + return ((max_rows*acc_row_size)/acc_fragment_size+1 + +1/*correct rounding*/)/2; +#endif +} + + +/* + Routine to adjust default number of partitions to always be a multiple + of number of nodes and never more than 4 times the number of nodes. 
+ +*/ +static bool adjusted_frag_count(uint no_fragments, uint no_nodes, + uint &reported_frags) +{ + uint i= 0; + reported_frags= no_nodes; + while (reported_frags < no_fragments && ++i < 4 && + (reported_frags + no_nodes) < MAX_PARTITIONS) + reported_frags+= no_nodes; + return (reported_frags < no_fragments); +} + +int ha_ndbcluster::get_default_no_partitions(ulonglong max_rows) +{ + uint reported_frags; + uint no_fragments= get_no_fragments(max_rows); + uint no_nodes= g_ndb_cluster_connection->no_db_nodes(); + adjusted_frag_count(no_fragments, no_nodes, reported_frags); + return (int)reported_frags; +} + + +/* + User defined partitioning set-up. We need to check how many fragments the + user wants defined and which node groups to put those into. Later we also + want to attach those partitions to a tablespace. + + All the functionality of the partition function, partition limits and so + forth are entirely handled by the MySQL Server. There is one exception to + this rule for PARTITION BY KEY where NDB handles the hash function and + this type can thus be handled transparently also by an NDB API program. + For RANGE, HASH and LIST partitioning and for subpartitioning the NDB API + programs must implement the function to map to a partition. +*/ + +uint ha_ndbcluster::set_up_partition_info(partition_info *part_info, + TABLE *table, + void *tab_par) +{ + DBUG_ENTER("ha_ndbcluster::set_up_partition_info"); + ushort node_group[MAX_PARTITIONS]; + ulong ng_index= 0, i, j; + NDBTAB *tab= (NDBTAB*)tab_par; + NDBTAB::FragmentType ftype= NDBTAB::UserDefined; + partition_element *part_elem; + + if (part_info->part_type == HASH_PARTITION && + part_info->list_of_part_fields == TRUE) + { + Field **fields= part_info->part_field_array; + + if (part_info->linear_hash_ind) + ftype= NDBTAB::DistrKeyLin; + else + ftype= NDBTAB::DistrKeyHash; + + for (i= 0; i < part_info->part_field_list.elements; i++) + { + NDBCOL *col= tab->getColumn(fields[i]->fieldnr - 1); + DBUG_PRINT("info",("setting dist key on %s", col->getName())); + col->setPartitionKey(TRUE); + } + } + List_iterator<partition_element> part_it(part_info->partitions); + for (i= 0; i < part_info->no_parts; i++) + { + part_elem= part_it++; + if (!is_sub_partitioned(part_info)) + { + node_group[ng_index++]= part_elem->nodegroup_id; + //Here we should insert tablespace id based on tablespace name + } + else + { + List_iterator<partition_element> sub_it(part_elem->subpartitions); + for (j= 0; j < part_info->no_subparts; j++) + { + part_elem= sub_it++; + node_group[ng_index++]= part_elem->nodegroup_id; + //Here we should insert tablespace id based on tablespace name + } + } + } + { + uint no_nodes= g_ndb_cluster_connection->no_db_nodes(); + if (ng_index > 4 * no_nodes) + { + DBUG_RETURN(1300); + } + } + tab->setNodeGroupIds(&node_group, ng_index); + tab->setFragmentType(ftype); + DBUG_RETURN(0); +} + + +/* + This routine is used to set up fragmentation when the user has only specified + ENGINE = NDB and no user defined partitioning whatsoever. Thus all values + will be based on default values. We will choose Linear Hash or Hash with + perfect spread depending on a session variable defined in MySQL.
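
   As a worked example of the sizing arithmetic above (the row count and
   node count are illustrative, not part of the patch): with
   max_rows = 100 million and MYSQL_VERSION_ID >= 50100, get_no_fragments()
   uses acc_row_size = 25 + 2 = 27 and acc_fragment_size = 512 MB, giving

     no_fragments = (100000000 * 27) / (512 * 1024 * 1024) + 1 = 6

   On a two-node cluster, adjusted_frag_count(6, 2, reported_frags) then
   steps the fragment count 2 -> 4 -> 6, always a multiple of the number of
   nodes and never more than four times the node count, so
   get_default_no_partitions() reports 6.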
+*/ + +static void ndb_set_fragmentation(NDBTAB &tab, TABLE *form, uint pk_length) +{ + NDBTAB::FragmentType ftype= NDBTAB::DistrKeyHash; + ushort node_group[MAX_PARTITIONS]; + uint no_nodes= g_ndb_cluster_connection->no_db_nodes(), no_fragments, i; + DBUG_ENTER("ndb_set_fragmentation"); + + if (form->s->max_rows == (ha_rows) 0) + { + no_fragments= no_nodes; + } + else + { + /* + Ensure that we get enough fragments to handle all rows and ensure that + the table is fully distributed by keeping the number of fragments a + multiple of the number of nodes. + */ + uint fragments= get_no_fragments(form->s->max_rows); + if (adjusted_frag_count(fragments, no_nodes, no_fragments)) + { + push_warning(current_thd, + MYSQL_ERROR::WARN_LEVEL_WARN, ER_UNKNOWN_ERROR, + "Ndb might have problems storing the max amount of rows specified"); + } + } + /* + Always start with node group 0 and continue with the next node group + from there + */ + node_group[0]= 0; + for (i= 1; i < no_fragments; i++) + node_group[i]= UNDEF_NODEGROUP; + switch (opt_ndb_distribution_id) + { + case ND_KEYHASH: + ftype= NDBTAB::DistrKeyHash; + break; + case ND_LINHASH: + ftype= NDBTAB::DistrKeyLin; + break; + } + tab.setFragmentType(ftype); + tab.setNodeGroupIds(&node_group, no_fragments); + DBUG_VOID_RETURN; +} + + +bool ha_ndbcluster::check_if_incompatible_data(HA_CREATE_INFO *info, + uint table_changes) +{ + /* + TODO: Remove the dummy return below, when cluster gets + signal from alter table when only .frm is changed. Cluster + needs it to manage the copies. + */ + return COMPATIBLE_DATA_NO; + + if (table_changes != IS_EQUAL_YES) + return COMPATIBLE_DATA_NO; + + /* Check that auto_increment value was not changed */ + if ((info->used_fields & HA_CREATE_USED_AUTO) && + info->auto_increment_value != 0) + return COMPATIBLE_DATA_NO; + + /* Check that row format didn't change */ + if ((info->used_fields & HA_CREATE_USED_ROW_FORMAT) && + get_row_type() != info->row_type) + return COMPATIBLE_DATA_NO; + + return COMPATIBLE_DATA_YES; +} + #endif /* HAVE_NDBCLUSTER_DB */ diff --git a/sql/ha_ndbcluster.h b/sql/ha_ndbcluster.h index 034bb9292e8..bf9891c364b 100644 --- a/sql/ha_ndbcluster.h +++ b/sql/ha_ndbcluster.h @@ -113,6 +113,8 @@ struct negated_function_mapping NDB_FUNC_TYPE neg_fun; }; +enum ndb_distribution { ND_KEYHASH= 0, ND_LINHASH= 1 }; + /* Define what functions can be negated in condition pushdown.
Note, these HAVE to be in the same order as in definition enum @@ -463,7 +465,7 @@ class ha_ndbcluster: public handler int write_row(byte *buf); int update_row(const byte *old_data, byte *new_data); int delete_row(const byte *buf); - int index_init(uint index); + int index_init(uint index, bool sorted); int index_end(); int index_read(byte *buf, const byte *key, uint key_len, enum ha_rkey_function find_flag); @@ -505,6 +507,11 @@ class ha_ndbcluster: public handler const char * table_type() const; const char ** bas_ext() const; ulong table_flags(void) const; + ulong partition_flags(void) const + { + return (HA_CAN_PARTITION | HA_CAN_UPDATE_PARTITION_KEY | + HA_CAN_PARTITION_UNIQUE); + } ulong index_flags(uint idx, uint part, bool all_parts) const; uint max_supported_record_length() const; uint max_supported_keys() const; @@ -514,6 +521,7 @@ class ha_ndbcluster: public handler int rename_table(const char *from, const char *to); int delete_table(const char *name); int create(const char *name, TABLE *form, HA_CREATE_INFO *info); + int get_default_no_partitions(ulonglong max_rows); THR_LOCK_DATA **store_lock(THD *thd, THR_LOCK_DATA **to, enum thr_lock_type lock_type); @@ -577,6 +585,10 @@ static void set_tabname(const char *pathname, char *tabname); uint key_length, qc_engine_callback *engine_callback, ulonglong *engine_data); + + bool check_if_incompatible_data(HA_CREATE_INFO *info, + uint table_changes); + private: int alter_table_name(const char *to); int drop_table(); @@ -592,15 +604,21 @@ private: NDB_INDEX_TYPE get_index_type_from_table(uint index_no) const; int check_index_fields_not_null(uint index_no); - int pk_read(const byte *key, uint key_len, byte *buf); - int complemented_pk_read(const byte *old_data, byte *new_data); - int peek_row(const byte *record); - int unique_index_read(const byte *key, uint key_len, - byte *buf); + uint set_up_partition_info(partition_info *part_info, + TABLE *table, + void *tab); + int complemented_pk_read(const byte *old_data, byte *new_data, + uint32 old_part_id); + int pk_read(const byte *key, uint key_len, byte *buf, uint32 part_id); int ordered_index_scan(const key_range *start_key, const key_range *end_key, - bool sorted, bool descending, byte* buf); + bool sorted, bool descending, byte* buf, + part_id_range *part_spec); int full_table_scan(byte * buf); + + int peek_row(const byte *record); + int unique_index_read(const byte *key, uint key_len, + byte *buf); int fetch_next(NdbScanOperation* op); int next_result(byte *buf); int define_read_attrs(byte* buf, NdbOperation* op); @@ -632,7 +650,7 @@ private: ulonglong get_auto_increment(); void invalidate_dictionary_cache(bool global); int ndb_err(NdbTransaction*); - bool uses_blob_value(bool all_fields); + bool uses_blob_value(); char *update_table_comment(const char * comment); @@ -680,11 +698,15 @@ private: // NdbRecAttr has no reference to blob typedef union { const NdbRecAttr *rec; NdbBlob *blob; void *ptr; } NdbValue; NdbValue m_value[NDB_MAX_ATTRIBUTES_IN_TABLE]; + partition_info *m_part_info; + byte *m_rec0; + Field **m_part_field_array; + bool m_use_partition_function; + bool m_sorted; bool m_use_write; bool m_ignore_dup_key; bool m_primary_key_update; - bool m_retrieve_all_fields; - bool m_retrieve_primary_key; + bool m_write_op; ha_rows m_rows_to_insert; ha_rows m_rows_inserted; ha_rows m_bulk_insert_rows; diff --git a/sql/ha_partition.cc b/sql/ha_partition.cc new file mode 100644 index 00000000000..39c4f2243a5 --- /dev/null +++ b/sql/ha_partition.cc @@ -0,0 +1,3240 @@ +/* Copyright (C) 2005 
MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + This handler was developed by Mikael Ronström for version 5.1 of MySQL. + It is an abstraction layer on top of other handlers such as MyISAM, + InnoDB, Federated, Berkeley DB and so forth. Partitioned tables can also + be handled by a storage engine. The current example of this is NDB + Cluster that has internally handled partitioning. This has benefits in + that many loops needed in the partition handler can be avoided. + + Partitioning has an inherent feature which in some cases is positive and + in some cases is negative. It splits the data into chunks. This makes + the data more manageable, queries can easily be parallelised towards the + parts and indexes are split such that there are fewer levels in the + index trees. The inherent disadvantage is that to use a split index + one has to scan all index parts, which is fine for large queries but can + be a disadvantage for small queries. + + Partitioning lays the foundation for more manageable databases that are + extremely large. It also lays the foundation for more parallelism + in the execution of queries. This functionality will grow with later + versions of MySQL. + + You can enable it in your build by doing the following during the build + process: + ./configure --with-partition + + The partition handler is set up to use table locks. It implements a + partition "SHARE" that is inserted into a hash by table name. You can use + this to store information about state that any partition handler object + will be able to see if it is using the same table. + + Please read the object definition in ha_partition.h before reading the rest + of this file.
+*/ + +#ifdef __GNUC__ +#pragma implementation // gcc: Class implementation +#endif + +#include <mysql_priv.h> + +#ifdef HAVE_PARTITION_DB +#include "ha_partition.h" + +static const char *ha_par_ext= ".par"; +#ifdef NOT_USED +static int free_share(PARTITION_SHARE * share); +static PARTITION_SHARE *get_share(const char *table_name, TABLE * table); +#endif + +/**************************************************************************** + MODULE create/delete handler object +****************************************************************************/ + +static handlerton partition_hton = { + "partition", + 0, /* slot */ + 0, /* savepoint size */ + NULL, /* close_connection */ + NULL, /* savepoint_set */ + NULL, /* savepoint_rollback */ + NULL, /* savepoint_release */ + NULL, /* commit */ + NULL, /* rollback */ + NULL, /* prepare */ + NULL, /* recover */ + NULL, /* commit_by_xid */ + NULL, /* rollback_by_xid */ + HTON_NO_FLAGS +}; + +ha_partition::ha_partition(TABLE *table) + :handler(&partition_hton, table), m_part_info(NULL), m_create_handler(FALSE), + m_is_sub_partitioned(0) +{ + DBUG_ENTER("ha_partition::ha_partition(table)"); + init_handler_variables(); + if (table) + { + if (table->s->part_info) + { + m_part_info= table->s->part_info; + m_is_sub_partitioned= is_sub_partitioned(m_part_info); + } + } + DBUG_VOID_RETURN; +} + + +ha_partition::ha_partition(partition_info *part_info) + :handler(&partition_hton, NULL), m_part_info(part_info), m_create_handler(TRUE), + m_is_sub_partitioned(is_sub_partitioned(m_part_info)) + +{ + DBUG_ENTER("ha_partition::ha_partition(part_info)"); + init_handler_variables(); + DBUG_ASSERT(m_part_info); + DBUG_VOID_RETURN; +} + + +void ha_partition::init_handler_variables() +{ + active_index= MAX_KEY; + m_file_buffer= NULL; + m_name_buffer_ptr= NULL; + m_engine_array= NULL; + m_file= NULL; + m_tot_parts= 0; + m_has_transactions= 0; + m_pkey_is_clustered= 0; + m_lock_type= F_UNLCK; + m_part_spec.start_part= NO_CURRENT_PART_ID; + m_scan_value= 2; + m_ref_length= 0; + m_part_spec.end_part= NO_CURRENT_PART_ID; + m_index_scan_type= partition_no_index_scan; + m_start_key.key= NULL; + m_start_key.length= 0; + m_myisam= FALSE; + m_innodb= FALSE; + m_extra_cache= FALSE; + m_extra_cache_size= 0; + m_table_flags= HA_FILE_BASED | HA_REC_NOT_IN_SEQ; + m_low_byte_first= 1; + m_part_field_array= NULL; + m_ordered_rec_buffer= NULL; + m_top_entry= NO_CURRENT_PART_ID; + m_rec_length= 0; + m_last_part= 0; + m_rec0= 0; + m_curr_key_info= 0; + +#ifdef DONT_HAVE_TO_BE_INITALIZED + m_start_key.flag= 0; + m_ordered= TRUE; +#endif +} + + +ha_partition::~ha_partition() +{ + DBUG_ENTER("ha_partition::~ha_partition()"); + if (m_file != NULL) + { + uint i; + for (i= 0; i < m_tot_parts; i++) + delete m_file[i]; + } + my_free((char*) m_ordered_rec_buffer, MYF(MY_ALLOW_ZERO_PTR)); + + clear_handler_file(); + DBUG_VOID_RETURN; +} + + +/* + The partition handler is only a layer on top of other engines. Since it + can't really perform anything without the underlying handlers, we add + this method as part of the allocation of a handler object. + + 1) Allocation of underlying handlers + If we have access to the partition info we will allocate one handler + instance for each partition. + 2) Allocation without partition info + The cases where we don't have access to this information are when we are + called in preparation for delete_table and rename_table, and in that case + we only need to set HA_FILE_BASED.
In that case we will use the .par file + that contains information about the partitions and their engines and + the names of each partition. + 3) Table flags initialisation + We also need to set table flags for the partition handler. This is not + static since it depends on what storage engines are used as underlying + handlers. + The table flags are set in this routine to simulate the behaviour of a + normal storage engine. + The flag HA_FILE_BASED will be set independently of the underlying handlers. + 4) Index flags initialisation + When knowledge exists on the indexes it is also possible to initialise the + index flags. Again the index flags must be initialised by using the + underlying handlers since this is storage engine dependent. + The flag HA_READ_ORDER will be reset for the time being to indicate no + ordered output is available from partition handler indexes. Later a merge + sort will be performed using the underlying handlers. + 5) primary_key_is_clustered, has_transactions and low_byte_first are + calculated here. +*/ + +int ha_partition::ha_initialise() +{ + handler **file_array, *file; + DBUG_ENTER("ha_partition::ha_initialise"); + + if (m_part_info) + { + m_tot_parts= get_tot_partitions(m_part_info); + DBUG_ASSERT(m_tot_parts > 0); + if (m_create_handler) + { + if (new_handlers_from_part_info()) + DBUG_RETURN(1); + } + else if (get_from_handler_file(table->s->path)) + { + my_error(ER_OUTOFMEMORY, MYF(0), 129); //Temporary fix TODO print_error + DBUG_RETURN(1); + } + /* + We create all underlying table handlers here. We only do it if we have + access to the partition info. We do it in this special method to be + able to report allocation errors. + */ + /* + Set up table_flags, low_byte_first, primary_key_is_clustered and + has_transactions since they are called often in all kinds of places, + other parameters are calculated on demand. + HA_FILE_BASED is always set for partition handler since we use a + special file for handling names of partitions, engine types. + HA_CAN_GEOMETRY, HA_CAN_FULLTEXT, HA_CAN_SQL_HANDLER, + HA_CAN_INSERT_DELAYED are disabled until further investigated. + */ + m_table_flags= m_file[0]->table_flags(); + m_low_byte_first= m_file[0]->low_byte_first(); + m_has_transactions= TRUE; + m_pkey_is_clustered= TRUE; + file_array= m_file; + do + { + file= *file_array; + if (m_low_byte_first != file->low_byte_first()) + { + // Cannot have handlers with different endianness + my_error(ER_MIX_HANDLER_ERROR, MYF(0)); + DBUG_RETURN(1); + } + if (!file->has_transactions()) + m_has_transactions= FALSE; + if (!file->primary_key_is_clustered()) + m_pkey_is_clustered= FALSE; + m_table_flags&= file->table_flags(); + } while (*(++file_array)); + m_table_flags&= ~(HA_CAN_GEOMETRY | HA_CAN_FULLTEXT | + HA_CAN_SQL_HANDLER | HA_CAN_INSERT_DELAYED); + /* + TODO RONM: + Make sure that the tree works without partition defined, compiles + and goes through mysql-test-run. + */ + } + m_table_flags|= HA_FILE_BASED | HA_REC_NOT_IN_SEQ; + DBUG_RETURN(0); +} + +/**************************************************************************** + MODULE meta data changes +****************************************************************************/ +/* + This method is used to construct the partition name, as a service routine + for the del_ren_cre_table method. +*/ + +static void create_partition_name(char *out, const char *in1, const char *in2) +{ + strxmov(out, in1, "_", in2, NullS); +} + +/* + This method is used to construct the subpartition name, as a service + routine for the del_ren_cre_table method.
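
   For illustration, with a hypothetical table path "/data/db/t1":
   create_partition_name() above yields "/data/db/t1_p0" for a partition
   named "p0", and create_subpartition_name() below yields
   "/data/db/t1_p0_sp0" for its subpartition "sp0". Every underlying
   handler then operates on these per-partition file names.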
+*/ + +static void create_subpartition_name(char *out, const char *in1, + const char *in2, const char *in3) +{ + strxmov(out, in1, "_", in2, "_", in3, NullS); +} + + +/* + Used to delete a table. By the time delete_table() has been called all + opened references to this table will have been closed (and your globally + shared references released). The variable name will just be the name of + the table. You will need to remove any files you have created at this + point. + + If you do not implement this, the default delete_table() is called from + handler.cc and it will delete all files with the file extensions returned + by bas_ext(). + + Called from handler.cc by delete_table and ha_create_table(). Only used + during create if the table_flag HA_DROP_BEFORE_CREATE was specified for + the storage engine. +*/ + +int ha_partition::delete_table(const char *name) +{ + int error; + DBUG_ENTER("ha_partition::delete_table"); + if ((error= del_ren_cre_table(name, NULL, NULL, NULL))) + DBUG_RETURN(error); + DBUG_RETURN(handler::delete_table(name)); +} + + +/* + Renames a table from one name to another as part of an ALTER TABLE call. + + If you do not implement this, the default rename_table() is called from + handler.cc and it will rename all files with the file extensions returned + by bas_ext(). + + Called from sql_table.cc by mysql_rename_table(). +*/ + +int ha_partition::rename_table(const char *from, const char *to) +{ + int error; + DBUG_ENTER("ha_partition::rename_table"); + if ((error= del_ren_cre_table(from, to, NULL, NULL))) + DBUG_RETURN(error); + DBUG_RETURN(handler::rename_table(from, to)); +} + + +/* + create_handler_files is called to create any handler specific files + before opening the file with openfrm to later call ::create on the + file object. + In the partition handler this is used to store the names of partitions + and types of engines in the partitions. +*/ + +int ha_partition::create_handler_files(const char *name) +{ + DBUG_ENTER("ha_partition::create_handler_files()"); + + /* + We need to update total number of parts since we might write the handler + file as part of a partition management command + */ + m_tot_parts= get_tot_partitions(m_part_info); + if (create_handler_file(name)) + { + my_error(ER_CANT_CREATE_HANDLER_FILE, MYF(0)); + DBUG_RETURN(1); + } + DBUG_RETURN(0); +} + + +/* + create() is called to create a table. The variable name will have the name + of the table. When create() is called you do not need to worry about + opening the table. Also, the FRM file will have already been created so + adjusting create_info will not do you any good. You can overwrite the frm + file at this point if you wish to change the table definition, but there + are no methods currently provided for doing that. + + Called from handler.cc by ha_create_table().
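
   Note that the code below copies name and truncates it at fn_ext(), so
   whether or not the server passes an extension, del_ren_cre_table()
   always works with the bare table stem (for instance the hypothetical
   "/data/db/t1") when building the per-partition names.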
+*/ + +int ha_partition::create(const char *name, TABLE *table_arg, + HA_CREATE_INFO *create_info) +{ + char t_name[FN_REFLEN]; + DBUG_ENTER("ha_partition::create"); + + strmov(t_name, name); + *fn_ext(t_name)= 0; + if (del_ren_cre_table(t_name, NULL, table_arg, create_info)) + { + handler::delete_table(t_name); + DBUG_RETURN(1); + } + DBUG_RETURN(0); +} + +int ha_partition::drop_partitions(const char *path) +{ + List_iterator<partition_element> part_it(m_part_info->partitions); + char part_name_buff[FN_REFLEN]; + uint no_parts= m_part_info->no_parts; + uint no_subparts= m_part_info->no_subparts, i= 0; + int error= 1; + DBUG_ENTER("ha_partition::drop_partitions()"); + + do + { + partition_element *part_elem= part_it++; + if (part_elem->part_state == PART_IS_DROPPED) + { + /* + This part is to be dropped, meaning the part or all its subparts. + */ + if (is_sub_partitioned(m_part_info)) + { + List_iterator<partition_element> sub_it(part_elem->subpartitions); + uint j= 0, part; + do + { + partition_element *sub_elem= sub_it++; + create_subpartition_name(part_name_buff, path, + part_elem->partition_name, + sub_elem->partition_name); + part= i * no_subparts + j; + DBUG_PRINT("info", ("Drop subpartition %s", part_name_buff)); + error= m_file[part]->delete_table((const char *) part_name_buff); + } while (++j < no_subparts); + } + else + { + create_partition_name(part_name_buff, path, + part_elem->partition_name); + DBUG_PRINT("info", ("Drop partition %s", part_name_buff)); + error= m_file[i]->delete_table((const char *) part_name_buff); + } + } + } while (++i < no_parts); + DBUG_RETURN(error); +} + +void ha_partition::update_create_info(HA_CREATE_INFO *create_info) +{ + return; +} + + +char *ha_partition::update_table_comment(const char *comment) +{ + return (char*) comment; // Nothing to change +} + + + +/* + Common routine to handle delete_table and rename_table. + The routine uses the partition handler file to get the + names of the partition instances. Both these routines + are called after creating the handler without table + object and thus the file is needed to discover the + names of the partitions and the underlying storage engines. 
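
   The argument combinations select the operation: a non-NULL to requests
   rename, a NULL to with a NULL table_arg requests delete, and a NULL to
   with a table object requests create. A sketch of the three call forms,
   with hypothetical names:

     del_ren_cre_table("./db/t1", NULL,      NULL,  NULL);  // delete t1_<part>
     del_ren_cre_table("./db/t1", "./db/t2", NULL,  NULL);  // rename t1_<part> to t2_<part>
     del_ren_cre_table("./db/t1", NULL,      table, info);  // create t1_<part>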
+*/ + +uint ha_partition::del_ren_cre_table(const char *from, + const char *to, + TABLE *table_arg, + HA_CREATE_INFO *create_info) +{ + int save_error= 0, error; + char from_buff[FN_REFLEN], to_buff[FN_REFLEN]; + char *name_buffer_ptr; + uint i; + handler **file; + DBUG_ENTER("ha_partition::del_ren_cre_table"); + + if (get_from_handler_file(from)) + DBUG_RETURN(TRUE); + DBUG_ASSERT(m_file_buffer); + name_buffer_ptr= m_name_buffer_ptr; + file= m_file; + i= 0; + do + { + create_partition_name(from_buff, from, name_buffer_ptr); + if (to != NULL) + { // Rename branch + create_partition_name(to_buff, to, name_buffer_ptr); + error= (*file)->rename_table((const char*) from_buff, + (const char*) to_buff); + } + else if (table_arg == NULL) // delete branch + error= (*file)->delete_table((const char*) from_buff); + else + { + set_up_table_before_create(table_arg, create_info, i); + error= (*file)->create(from_buff, table_arg, create_info); + } + name_buffer_ptr= strend(name_buffer_ptr) + 1; + if (error) + save_error= error; + i++; + } while (*(++file)); + DBUG_RETURN(save_error); +} + + +partition_element *ha_partition::find_partition_element(uint part_id) +{ + uint i; + uint curr_part_id= 0; + List_iterator_fast < partition_element > part_it(m_part_info->partitions); + + for (i= 0; i < m_part_info->no_parts; i++) + { + partition_element *part_elem; + part_elem= part_it++; + if (m_is_sub_partitioned) + { + uint j; + List_iterator_fast <partition_element> sub_it(part_elem->subpartitions); + for (j= 0; j < m_part_info->no_subparts; j++) + { + part_elem= sub_it++; + if (part_id == curr_part_id++) + return part_elem; + } + } + else if (part_id == curr_part_id++) + return part_elem; + } + DBUG_ASSERT(0); + current_thd->fatal_error(); // Abort + return NULL; +} + + +void ha_partition::set_up_table_before_create(TABLE *table, + HA_CREATE_INFO *info, + uint part_id) +{ + /* + Set up + 1) Comment on partition + 2) MAX_ROWS, MIN_ROWS on partition + 3) Index file name on partition + 4) Data file name on partition + */ + partition_element *part_elem= find_partition_element(part_id); + if (!part_elem) + return; // Fatal error + table->s->max_rows= part_elem->part_max_rows; + table->s->min_rows= part_elem->part_min_rows; + info->index_file_name= part_elem->index_file_name; + info->data_file_name= part_elem->data_file_name; +} + + +/* + Routine used to add two names with '_' in between them. Service routine + for create_handler_file. + Include the NUL terminator in the count of characters since it is needed + as the separator between the partition names. +*/ + +static uint name_add(char *dest, const char *first_name, const char *sec_name) +{ + return (uint) (strxmov(dest, first_name, "_", sec_name, NullS) -dest) + 1; +} + + +/* + Method used to create the handler file with the names of partitions, their + engine types and the number of partitions.
+*/ + +bool ha_partition::create_handler_file(const char *name) +{ + partition_element *part_elem, *subpart_elem; + uint i, j, part_name_len, subpart_name_len; + uint tot_partition_words, tot_name_len; + uint tot_len_words, tot_len_byte, chksum, tot_name_words; + char *name_buffer_ptr; + uchar *file_buffer, *engine_array; + bool result= TRUE; + char file_name[FN_REFLEN]; + File file; + List_iterator_fast < partition_element > part_it(m_part_info->partitions); + DBUG_ENTER("create_handler_file"); + + DBUG_PRINT("info", ("table name = %s", name)); + tot_name_len= 0; + for (i= 0; i < m_part_info->no_parts; i++) + { + part_elem= part_it++; + part_name_len= strlen(part_elem->partition_name); + if (!m_is_sub_partitioned) + tot_name_len+= part_name_len + 1; + else + { + List_iterator_fast<partition_element> sub_it(part_elem->subpartitions); + for (j= 0; j < m_part_info->no_subparts; j++) + { + subpart_elem= sub_it++; + subpart_name_len= strlen(subpart_elem->partition_name); + tot_name_len+= part_name_len + subpart_name_len + 2; + } + } + } + /* + File format: + Length in words 4 byte + Checksum 4 byte + Total number of partitions 4 byte + Array of engine types n * 4 bytes where + n = (m_tot_parts + 3)/4 + Length of name part in bytes 4 bytes + Name part m * 4 bytes where + m = ((length_name_part + 3)/4)*4 + + All padding bytes are zeroed + */ + tot_partition_words= (m_tot_parts + 3) / 4; + tot_name_words= (tot_name_len + 3) / 4; + tot_len_words= 4 + tot_partition_words + tot_name_words; + tot_len_byte= 4 * tot_len_words; + if (!(file_buffer= (uchar *) my_malloc(tot_len_byte, MYF(MY_ZEROFILL)))) + DBUG_RETURN(TRUE); + engine_array= (file_buffer + 12); + name_buffer_ptr= (char*) (file_buffer + ((4 + tot_partition_words) * 4)); + part_it.rewind(); + for (i= 0; i < m_part_info->no_parts; i++) + { + part_elem= part_it++; + if (!m_is_sub_partitioned) + { + name_buffer_ptr= strmov(name_buffer_ptr, part_elem->partition_name)+1; + *engine_array= (uchar) part_elem->engine_type; + DBUG_PRINT("info", ("engine: %u", *engine_array)); + engine_array++; + } + else + { + List_iterator_fast<partition_element> sub_it(part_elem->subpartitions); + for (j= 0; j < m_part_info->no_subparts; j++) + { + subpart_elem= sub_it++; + name_buffer_ptr+= name_add(name_buffer_ptr, + part_elem->partition_name, + subpart_elem->partition_name); + *engine_array= (uchar) part_elem->engine_type; + engine_array++; + } + } + } + chksum= 0; + int4store(file_buffer, tot_len_words); + int4store(file_buffer + 8, m_tot_parts); + int4store(file_buffer + 12 + (tot_partition_words * 4), tot_name_len); + for (i= 0; i < tot_len_words; i++) + chksum^= uint4korr(file_buffer + 4 * i); + int4store(file_buffer + 4, chksum); + /* + Remove .frm extension and replace with .par + Create and write and close file + to be used at open, delete_table and rename_table + */ + fn_format(file_name, name, "", ".par", MYF(MY_REPLACE_EXT)); + if ((file= my_create(file_name, CREATE_MODE, O_RDWR | O_TRUNC, + MYF(MY_WME))) >= 0) + { + result= my_write(file, (byte *) file_buffer, tot_len_byte, + MYF(MY_WME | MY_NABP)); + VOID(my_close(file, MYF(0))); + } + else + result= TRUE; + my_free((char*) file_buffer, MYF(0)); + DBUG_RETURN(result); +} + + +void ha_partition::clear_handler_file() +{ + my_free((char*) m_file_buffer, MYF(MY_ALLOW_ZERO_PTR)); + m_file_buffer= NULL; + m_name_buffer_ptr= NULL; + m_engine_array= NULL; +} + + +bool ha_partition::create_handlers() +{ + uint i; + uint alloc_len= (m_tot_parts + 1) * sizeof(handler*); + DBUG_ENTER("create_handlers"); + + if 
(!(m_file= (handler **) sql_alloc(alloc_len))) + DBUG_RETURN(TRUE); + bzero(m_file, alloc_len); + for (i= 0; i < m_tot_parts; i++) + { + if (!(m_file[i]= get_new_handler(table, (enum db_type) m_engine_array[i]))) + DBUG_RETURN(TRUE); + DBUG_PRINT("info", ("engine_type: %u", m_engine_array[i])); + } + m_file[m_tot_parts]= 0; + /* For the moment we only support partitions over the same table engine */ + if (m_engine_array[0] == (uchar) DB_TYPE_MYISAM) + { + DBUG_PRINT("info", ("MyISAM")); + m_myisam= TRUE; + } + else if (m_engine_array[0] == (uchar) DB_TYPE_INNODB) + { + DBUG_PRINT("info", ("InnoDB")); + m_innodb= TRUE; + } + DBUG_RETURN(FALSE); +} + + +bool ha_partition::new_handlers_from_part_info() +{ + uint i, j, part_count; + partition_element *part_elem; + uint alloc_len= (m_tot_parts + 1) * sizeof(handler*); + List_iterator_fast <partition_element> part_it(m_part_info->partitions); + DBUG_ENTER("ha_partition::new_handlers_from_part_info"); + + if (!(m_file= (handler **) sql_alloc(alloc_len))) + goto error; + bzero(m_file, alloc_len); + DBUG_ASSERT(m_part_info->no_parts > 0); + + i= 0; + part_count= 0; + /* + We don't know how much memory the underlying storage engine needs, so + use an invented allocation size in the error message if allocation + fails + */ + alloc_len= 128; + do + { + part_elem= part_it++; + if (m_is_sub_partitioned) + { + for (j= 0; j < m_part_info->no_subparts; j++) + { + if (!(m_file[part_count++]= get_new_handler(table, + part_elem->engine_type))) + goto error; + DBUG_PRINT("info", ("engine_type: %u", (uint) part_elem->engine_type)); + } + } + else + { + if (!(m_file[part_count++]= get_new_handler(table, + part_elem->engine_type))) + goto error; + DBUG_PRINT("info", ("engine_type: %u", (uint) part_elem->engine_type)); + } + } while (++i < m_part_info->no_parts); + if (part_elem->engine_type == DB_TYPE_MYISAM) + { + DBUG_PRINT("info", ("MyISAM")); + m_myisam= TRUE; + } + DBUG_RETURN(FALSE); +error: + my_error(ER_OUTOFMEMORY, MYF(0), alloc_len); + DBUG_RETURN(TRUE); +} + + +/* + Open the handler file to get partition names, engine types and number of + partitions.
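
   As a worked example of the .par format (table and partition names are
   hypothetical): a table "t1" with two partitions "p0" and "p1" has
   tot_name_len = 6 ("p0\0p1\0"), tot_partition_words = (2 + 3) / 4 = 1 and
   tot_name_words = (6 + 3) / 4 = 2, so t1.par is 4 * (4 + 1 + 2) = 28 bytes:

     bytes  0- 3  length in words (7)
     bytes  4- 7  checksum (XOR over all words, including this one, is 0)
     bytes  8-11  number of partitions (2)
     bytes 12-15  engine bytes for p0 and p1, plus two bytes zero padding
     bytes 16-19  length of name part (6)
     bytes 20-27  "p0\0p1\0" plus two bytes zero padding

   The reader below rejects the file if the XOR over all words is non-zero
   or if the length words are inconsistent.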
+*/ + +bool ha_partition::get_from_handler_file(const char *name) +{ + char buff[FN_REFLEN], *address_tot_name_len; + File file; + char *file_buffer, *name_buffer_ptr; + uchar *engine_array; + uint i, len_bytes, len_words, tot_partition_words, tot_name_words, chksum; + DBUG_ENTER("ha_partition::get_from_handler_file"); + DBUG_PRINT("enter", ("table name: '%s'", name)); + + if (m_file_buffer) + DBUG_RETURN(FALSE); + fn_format(buff, name, "", ha_par_ext, MYF(0)); + + /* Following could be done with my_stat to read in whole file */ + if ((file= my_open(buff, O_RDONLY | O_SHARE, MYF(0))) < 0) + DBUG_RETURN(TRUE); + if (my_read(file, (byte *) & buff[0], 8, MYF(MY_NABP))) + goto err1; + len_words= uint4korr(buff); + len_bytes= 4 * len_words; + if (!(file_buffer= my_malloc(len_bytes, MYF(0)))) + goto err1; + VOID(my_seek(file, 0, MY_SEEK_SET, MYF(0))); + if (my_read(file, (byte *) file_buffer, len_bytes, MYF(MY_NABP))) + goto err2; + + chksum= 0; + for (i= 0; i < len_words; i++) + chksum ^= uint4korr((file_buffer) + 4 * i); + if (chksum) + goto err2; + m_tot_parts= uint4korr((file_buffer) + 8); + tot_partition_words= (m_tot_parts + 3) / 4; + engine_array= (uchar *) ((file_buffer) + 12); + address_tot_name_len= file_buffer + 12 + 4 * tot_partition_words; + tot_name_words= (uint4korr(address_tot_name_len) + 3) / 4; + if (len_words != (tot_partition_words + tot_name_words + 4)) + goto err2; + name_buffer_ptr= file_buffer + 16 + 4 * tot_partition_words; + VOID(my_close(file, MYF(0))); + m_file_buffer= file_buffer; // Will be freed in clear_handler_file() + m_name_buffer_ptr= name_buffer_ptr; + m_engine_array= engine_array; + if (!m_file && create_handlers()) + { + clear_handler_file(); + DBUG_RETURN(TRUE); + } + DBUG_RETURN(FALSE); + +err2: + my_free(file_buffer, MYF(0)); +err1: + VOID(my_close(file, MYF(0))); + DBUG_RETURN(TRUE); +} + +/**************************************************************************** + MODULE open/close object +****************************************************************************/ +/* + Used for opening tables. The name will be the name of the file. + A table is opened when it needs to be opened. For instance + when a request comes in for a select on the table (tables are not + open and closed for each request, they are cached). + + Called from handler.cc by handler::ha_open(). The server opens all tables + by calling ha_open() which then calls the handler specific open(). +*/ + +int ha_partition::open(const char *name, int mode, uint test_if_locked) +{ + int error; + char name_buff[FN_REFLEN]; + char *name_buffer_ptr= m_name_buffer_ptr; + handler **file; + uint alloc_len; + DBUG_ENTER("ha_partition::open"); + + ref_length= 0; + m_part_field_array= m_part_info->full_part_field_array; + if (get_from_handler_file(name)) + DBUG_RETURN(1); + m_start_key.length= 0; + m_rec0= table->record[0]; + m_rec_length= table->s->reclength; + alloc_len= m_tot_parts * (m_rec_length + PARTITION_BYTES_IN_POS); + alloc_len+= table->s->max_key_length; + if (!m_ordered_rec_buffer) + { + if (!(m_ordered_rec_buffer= my_malloc(alloc_len, MYF(MY_WME)))) + { + DBUG_RETURN(1); + } + { + /* + We set-up one record per partition and each record has 2 bytes in + front where the partition id is written. This is used by ordered + index_read. + We also set-up a reference to the first record for temporary use in + setting up the scan. 
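
      Illustrative layout for three partitions with record length R (sizes
      are examples only):

        m_ordered_rec_buffer: [00 00][R bytes for part 0]
                              [01 00][R bytes for part 1]
                              [02 00][R bytes for part 2]
                              [max_key_length bytes for m_start_key.key]

      The two leading bytes of each slot are the partition id stored with
      int2store() (low byte first); the priority queue used by ordered
      index scans reads them back to know which partition a queued row
      came from.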
+ */ + char *ptr= m_ordered_rec_buffer; + uint i= 0; + do + { + int2store(ptr, i); + ptr+= m_rec_length + PARTITION_BYTES_IN_POS; + } while (++i < m_tot_parts); + m_start_key.key= ptr; + } + } + file= m_file; + do + { + create_partition_name(name_buff, name, name_buffer_ptr); + if ((error= (*file)->ha_open((const char*) name_buff, mode, + test_if_locked))) + goto err_handler; + name_buffer_ptr+= strlen(name_buffer_ptr) + 1; + set_if_bigger(ref_length, ((*file)->ref_length)); + } while (*(++file)); + /* + Add 2 bytes for partition id in position ref length. + ref_length=max_in_all_partitions(ref_length) + PARTITION_BYTES_IN_POS + */ + ref_length+= PARTITION_BYTES_IN_POS; + m_ref_length= ref_length; + /* + Release buffer read from .par file. It will not be reused again after + being opened once. + */ + clear_handler_file(); + /* + Initialise the priority queue, set up for reading forward. + */ + if ((error= init_queue(&queue, m_tot_parts, (uint) PARTITION_BYTES_IN_POS, + 0, key_rec_cmp, (void*)this))) + goto err_handler; + /* + Some handlers update statistics as part of the open call. This would in + some cases corrupt the statistics of the partition handler, so to ensure + we have correct statistics we call info from open after calling open on + all individual handlers. + */ + info(HA_STATUS_VARIABLE | HA_STATUS_CONST); + DBUG_RETURN(0); + +err_handler: + while (file-- != m_file) + (*file)->close(); + DBUG_RETURN(error); +} + +/* + Closes a table. We call the free_share() function to free any resources + that we have allocated in the "shared" structure. + + Called from sql_base.cc, sql_select.cc, and table.cc. + In sql_select.cc it is only used to close up temporary tables or during + the process where a temporary table is converted over to being a + MyISAM table. + For sql_base.cc look at close_data_tables(). +*/ + +int ha_partition::close(void) +{ + handler **file; + DBUG_ENTER("ha_partition::close"); + file= m_file; + do + { + (*file)->close(); + } while (*(++file)); + DBUG_RETURN(0); +} + + +/**************************************************************************** + MODULE start/end statement +****************************************************************************/ +/* + A number of methods to define various constants for the handler. In + the case of the partition handler we need to use some max and min + of the underlying handlers in most cases. +*/ + +/* + First you should go read the section "locking functions for mysql" in + lock.cc to understand this. + This creates a lock on the table. If you are implementing a storage engine + that can handle transactions, look at ha_berkeley.cc to see how you will + want to go about doing this. Otherwise you should consider calling + flock() here. + Originally this method was used to set locks on file level to enable + several MySQL Servers to work on the same data. For transactional + engines it has been "abused" to also mean start and end of statements + to enable proper rollback of statements and transactions. When LOCK + TABLES has been issued the start_stmt method takes over the role of + indicating start of statement but in this case there is no end of + statement indicator(?). + + Called from lock.cc by lock_external() and unlock_external(). Also called + from sql_table.cc by copy_data_between_tables().
+*/ + +int ha_partition::external_lock(THD *thd, int lock_type) +{ + uint error; + handler **file; + DBUG_ENTER("ha_partition::external_lock"); + file= m_file; + do + { + if ((error= (*file)->external_lock(thd, lock_type))) + { + if (lock_type != F_UNLCK) + goto err_handler; + } + } while (*(++file)); + m_lock_type= lock_type; // For the future (2009?) + DBUG_RETURN(0); + +err_handler: + while (file-- != m_file) + (*file)->external_lock(thd, F_UNLCK); + DBUG_RETURN(error); +} + + +/* + The idea with handler::store_lock() is the following: + + The statement decides which locks we need for the table: + for updates/deletes/inserts we get WRITE locks, for SELECT... we get + read locks. + + Before adding the lock into the table lock handler (see thr_lock.c) + mysqld calls store lock with the requested locks. Store lock can now + modify a write lock to a read lock (or some other lock), ignore the + lock (if we don't want to use MySQL table locks at all) or add locks + for many tables (like we do when we are using a MERGE handler). + + Berkeley DB, for example, changes all WRITE locks to TL_WRITE_ALLOW_WRITE + (which signals that we are doing WRITES, but we are still allowing other + readers and writers). + + When releasing locks, store_lock() is also called. In this case one + usually doesn't have to do anything. + + store_lock is called when holding a global mutex to ensure that only + one thread at a time changes the locking information of tables. + + In some exceptional cases MySQL may send a request for a TL_IGNORE; + this means that we are requesting the same lock as last time and this + should also be ignored. (This may happen when someone does a flush + table when we have opened a part of the tables, in which case mysqld + closes and reopens the tables and tries to get the same locks as last + time). In the future we will probably try to remove this. + + Called from lock.cc by get_lock_data(). +*/ + +THR_LOCK_DATA **ha_partition::store_lock(THD *thd, + THR_LOCK_DATA **to, + enum thr_lock_type lock_type) +{ + handler **file; + DBUG_ENTER("ha_partition::store_lock"); + file= m_file; + do + { + to= (*file)->store_lock(thd, to, lock_type); + } while (*(++file)); + DBUG_RETURN(to); +} + + +int ha_partition::start_stmt(THD *thd) +{ + int error= 0; + handler **file; + DBUG_ENTER("ha_partition::start_stmt"); + file= m_file; + do + { + if ((error= (*file)->start_stmt(thd))) + break; + } while (*(++file)); + DBUG_RETURN(error); +} + + +/* + Returns the number of store locks needed in call to store lock. + We return the number of partitions since we call store_lock on each + underlying handler. Assists the above functions in allocating + sufficient space for lock structures. +*/ + +uint ha_partition::lock_count() const +{ + DBUG_ENTER("ha_partition::lock_count"); + DBUG_RETURN(m_tot_parts); +} + + +/* + Record currently processed was not in the result set of the statement + and is thus unlocked. Used for UPDATE and DELETE queries. +*/ + +void ha_partition::unlock_row() +{ + m_file[m_last_part]->unlock_row(); + return; +} + + +/**************************************************************************** + MODULE change record +****************************************************************************/ + +/* + write_row() inserts a row. buf is a byte array of data, normally record[0]. + + You can use the field information to extract the data from the native byte + array type. + + Example of this would be: + for (Field **field=table->field ; *field ; field++) + { + ...
+ } + + See ha_tina.cc for an example of extracting all of the data as strings. + ha_berkeley.cc has an example of how to store it intact by "packing" it + for ha_berkeley's own native storage type. + + See the note for update_row() on auto_increments and timestamps. This + case also applies to write_row(). + + Called from item_sum.cc, sql_acl.cc, sql_insert.cc, + sql_select.cc, sql_table.cc, sql_udf.cc, and sql_update.cc. + + ADDITIONAL INFO: + + Most handlers set the timestamp when calling write row if any such fields + exist. Since we are calling an underlying handler we assume the + underlying handler will assume this responsibility. + + Underlying handlers will also call update_auto_increment to calculate + the new auto increment value. We will catch the call to + get_auto_increment and ensure this increment value is maintained by + only one of the underlying handlers. +*/ + +int ha_partition::write_row(byte * buf) +{ + uint32 part_id; + int error; +#ifdef NOT_NEEDED + byte *rec0= m_rec0; +#endif + DBUG_ENTER("ha_partition::write_row"); + DBUG_ASSERT(buf == m_rec0); + +#ifdef NOT_NEEDED + if (likely(buf == rec0)) +#endif + error= m_part_info->get_partition_id(m_part_info, &part_id); +#ifdef NOT_NEEDED + else + { + set_field_ptr(m_part_field_array, buf, rec0); + error= m_part_info->get_partition_id(m_part_info, &part_id); + set_field_ptr(m_part_field_array, rec0, buf); + } +#endif + if (unlikely(error)) + DBUG_RETURN(error); + m_last_part= part_id; + DBUG_PRINT("info", ("Insert in partition %d", part_id)); + DBUG_RETURN(m_file[part_id]->write_row(buf)); +} + + +/* + Yes, update_row() does what you expect, it updates a row. old_data will + have the previous row record in it, while new_data will have the newest + data in it. + Keep in mind that the server can do updates based on ordering if an + ORDER BY clause was used. Consecutive ordering is not guaranteed. + + Currently new_data will not have an updated auto_increment value, or + an updated timestamp field. You can do these for example by doing: + if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_UPDATE) + table->timestamp_field->set_time(); + if (table->next_number_field && record == table->record[0]) + update_auto_increment(); + + Called from sql_select.cc, sql_acl.cc, sql_update.cc, and sql_insert.cc. + new_data is always record[0] + old_data is normally record[1] but may be anything + +*/ + +int ha_partition::update_row(const byte *old_data, byte *new_data) +{ + uint32 new_part_id, old_part_id; + int error; + DBUG_ENTER("ha_partition::update_row"); + + if ((error= get_parts_for_update(old_data, new_data, table->record[0], + m_part_info, &old_part_id, &new_part_id))) + { + DBUG_RETURN(error); + } + + /* + TODO: + set_internal_auto_increment= + max(set_internal_auto_increment, new_data->auto_increment) + */ + m_last_part= new_part_id; + if (new_part_id == old_part_id) + { + DBUG_PRINT("info", ("Update in partition %d", new_part_id)); + DBUG_RETURN(m_file[new_part_id]->update_row(old_data, new_data)); + } + else + { + DBUG_PRINT("info", ("Update from partition %d to partition %d", + old_part_id, new_part_id)); + if ((error= m_file[new_part_id]->write_row(new_data))) + DBUG_RETURN(error); + if ((error= m_file[old_part_id]->delete_row(old_data))) + { +#ifdef IN_THE_FUTURE + (void) m_file[new_part_id]->delete_last_inserted_row(new_data); +#endif + DBUG_RETURN(error); + } + } + DBUG_RETURN(0); +} + + +/* + This will delete a row. buf will contain a copy of the row to be deleted.
+ The server will call this right after the current row has been read + (from either a previous rnd_xxx() or index_xxx() call). + If you keep a pointer to the last row or can access a primary key it will + make doing the deletion quite a bit easier. + Keep in mind that the server does not guarantee consecutive deletions. + ORDER BY clauses can be used. + + Called in sql_acl.cc and sql_udf.cc to manage internal table information. + Called in sql_delete.cc, sql_insert.cc, and sql_select.cc. In sql_select + it is used for removing duplicates while in insert it is used for REPLACE + calls. + + buf is either record[0] or record[1] + +*/ + +int ha_partition::delete_row(const byte *buf) +{ + uint32 part_id; + int error; + DBUG_ENTER("ha_partition::delete_row"); + + if ((error= get_part_for_delete(buf, m_rec0, m_part_info, &part_id))) + { + DBUG_RETURN(error); + } + m_last_part= part_id; + DBUG_RETURN(m_file[part_id]->delete_row(buf)); +} + + +/* + Used to delete all rows in a table. Both for cases of truncate and + for cases where the optimizer realizes that all rows will be + removed as a result of a SQL statement. + + Called from item_sum.cc by Item_func_group_concat::clear() and + Item_sum_count_distinct::clear(). + Called from sql_delete.cc by mysql_delete(). + Called from sql_select.cc by JOIN::reinit(). + Called from sql_union.cc by st_select_lex_unit::exec(). +*/ + +int ha_partition::delete_all_rows() +{ + int error; + handler **file; + DBUG_ENTER("ha_partition::delete_all_rows"); + file= m_file; + do + { + if ((error= (*file)->delete_all_rows())) + DBUG_RETURN(error); + } while (*(++file)); + DBUG_RETURN(0); +} + +/* + rows == 0 means we will probably insert many rows +*/ + +void ha_partition::start_bulk_insert(ha_rows rows) +{ + handler **file; + DBUG_ENTER("ha_partition::start_bulk_insert"); + if (!rows) + { + /* Avoid allocating big caches in all underlying handlers */ + DBUG_VOID_RETURN; + } + rows= rows/m_tot_parts + 1; + file= m_file; + do + { + (*file)->start_bulk_insert(rows); + } while (*(++file)); + DBUG_VOID_RETURN; +} + + +int ha_partition::end_bulk_insert() +{ + int error= 0; + handler **file; + DBUG_ENTER("ha_partition::end_bulk_insert"); + + file= m_file; + do + { + int tmp; + /* We want to execute end_bulk_insert() on all handlers */ + if ((tmp= (*file)->end_bulk_insert())) + error= tmp; + } while (*(++file)); + DBUG_RETURN(error); +} + +/**************************************************************************** + MODULE full table scan +****************************************************************************/ +/* + Initialize engine for random reads + + SYNOPSIS + ha_partition::rnd_init() + scan 0 Initialize for random reads through rnd_pos() + 1 Initialize for random scan through rnd_next() + + NOTES + rnd_init() is called when the server wants the storage engine to do a + table scan or when the server wants to access data through rnd_pos. + + When scan is used we will scan one handler partition at a time. + When preparing for rnd_pos we will init all handler partitions. + No extra cache handling is needed when scanning is not performed. + + Before initialising we will call rnd_end to ensure that we clean up from + any previous incarnation of a table scan. + Called from filesort.cc, records.cc, sql_handler.cc, sql_select.cc, + sql_table.cc, and sql_update.cc.
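
    m_scan_value acts as a small state flag for the code below: 0 means all
    partitions have been initialised for rnd_pos() access, 1 means a
    sequential scan is active on partition m_part_spec.start_part, and 2
    means no scan is active (the initial state, and the state after
    rnd_end() or a failed initialisation).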
+*/ + +int ha_partition::rnd_init(bool scan) +{ + int error; + handler **file; + DBUG_ENTER("ha_partition::rnd_init"); + + include_partition_fields_in_used_fields(); + if (scan) + { + /* + rnd_end() is needed for partitioning to reset internal data if scan + is already in use + */ + + rnd_end(); + if (partition_scan_set_up(rec_buf(0), FALSE)) + { + /* + The set of partitions to scan is empty. We return success and return + end of file on first rnd_next. + */ + DBUG_RETURN(0); + } + /* + We will use the partition set in our scan, using the start and stop + partition and checking each scan before start depending on bitfields. + */ + late_extra_cache(m_part_spec.start_part); + DBUG_PRINT("info", ("rnd_init on partition %d",m_part_spec.start_part)); + error= m_file[m_part_spec.start_part]->ha_rnd_init(1); + m_scan_value= 1; // Scan active + if (error) + m_scan_value= 2; // No scan active + DBUG_RETURN(error); + } + file= m_file; + do + { + if ((error= (*file)->ha_rnd_init(0))) + goto err; + } while (*(++file)); + m_scan_value= 0; + DBUG_RETURN(0); + +err: + while (file-- != m_file) + (*file)->ha_rnd_end(); + DBUG_RETURN(error); +} + + +int ha_partition::rnd_end() +{ + handler **file; + DBUG_ENTER("ha_partition::rnd_end"); + switch (m_scan_value) { + case 2: // Error + break; + case 1: // Table scan + if (m_part_spec.start_part != NO_CURRENT_PART_ID) + { + late_extra_no_cache(m_part_spec.start_part); + m_file[m_part_spec.start_part]->ha_rnd_end(); + } + break; + case 0: + file= m_file; + do + { + (*file)->ha_rnd_end(); + } while (*(++file)); + break; + } + m_part_spec.start_part= NO_CURRENT_PART_ID; + m_scan_value= 2; + DBUG_RETURN(0); +} + + +/* + read next row during full table scan (scan in random row order) + + SYNOPSIS + rnd_next() + buf buffer that should be filled with data + + This is called for each row of the table scan. When you run out of records + you should return HA_ERR_END_OF_FILE. + The Field structure for the table is the key to getting data into buf + in a manner that will allow the server to understand it. + + Called from filesort.cc, records.cc, sql_handler.cc, sql_select.cc, + sql_table.cc, and sql_update.cc. +*/ + +int ha_partition::rnd_next(byte *buf) +{ + DBUG_ASSERT(m_scan_value); + uint part_id= m_part_spec.start_part; // Cache of this variable + handler *file= m_file[part_id]; + int result= HA_ERR_END_OF_FILE; + DBUG_ENTER("ha_partition::rnd_next"); + + DBUG_ASSERT(m_scan_value == 1); + + if (part_id > m_part_spec.end_part) + { + /* + The original set of partitions to scan was empty and thus we report + the result here.
+ */ + goto end; + } + while (TRUE) + { + if ((result= file->rnd_next(buf))) + { + if (result == HA_ERR_RECORD_DELETED) + continue; // Probably MyISAM + + if (result != HA_ERR_END_OF_FILE) + break; // Return error + + /* End current partition */ + late_extra_no_cache(part_id); + DBUG_PRINT("info", ("rnd_end on partition %d", part_id)); + if ((result= file->ha_rnd_end())) + break; + /* Shift to next partition */ + if (++part_id > m_part_spec.end_part) + { + result= HA_ERR_END_OF_FILE; + break; + } + file= m_file[part_id]; + DBUG_PRINT("info", ("rnd_init on partition %d", part_id)); + if ((result= file->ha_rnd_init(1))) + break; + late_extra_cache(part_id); + } + else + { + m_part_spec.start_part= part_id; + m_last_part= part_id; + table->status= 0; + DBUG_RETURN(0); + } + } + +end: + m_part_spec.start_part= NO_CURRENT_PART_ID; + table->status= STATUS_NOT_FOUND; + DBUG_RETURN(result); +} + + +inline void store_part_id_in_pos(byte *pos, uint part_id) +{ + int2store(pos, part_id); +} + +inline uint get_part_id_from_pos(const byte *pos) +{ + return uint2korr(pos); +} + +/* + position() is called after each call to rnd_next() if the data needs + to be ordered. You can do something like the following to store + the position: + ha_store_ptr(ref, ref_length, current_position); + + The server uses ref to store data. ref_length in the above case is + the size needed to store current_position. ref is just a byte array + that the server will maintain. If you are using offsets to mark rows, then + current_position should be the offset. If it is a primary key like in + BDB, then it needs to be a primary key. + + Called from filesort.cc, sql_select.cc, sql_delete.cc and sql_update.cc. +*/ + +void ha_partition::position(const byte *record) +{ + handler *file= m_file[m_last_part]; + DBUG_ENTER("ha_partition::position"); + file->position(record); + store_part_id_in_pos(ref, m_last_part); + memcpy((ref + PARTITION_BYTES_IN_POS), file->ref, + (ref_length - PARTITION_BYTES_IN_POS)); + +#ifdef SUPPORTING_PARTITION_OVER_DIFFERENT_ENGINES +#ifdef HAVE_purify + bzero(ref + PARTITION_BYTES_IN_POS + ref_length, max_ref_length-ref_length); +#endif /* HAVE_purify */ +#endif + DBUG_VOID_RETURN; +} + +/* + This is like rnd_next, but you are given a position to use + to determine the row. The position will be of the type that you stored in + ref. You can use ha_get_ptr(pos,ref_length) to retrieve whatever key + or position you saved when position() was called. + Called from filesort.cc records.cc sql_insert.cc sql_select.cc + sql_update.cc. +*/ + +int ha_partition::rnd_pos(byte * buf, byte *pos) +{ + uint part_id; + handler *file; + DBUG_ENTER("ha_partition::rnd_pos"); + + part_id= get_part_id_from_pos((const byte *) pos); + DBUG_ASSERT(part_id < m_tot_parts); + file= m_file[part_id]; + m_last_part= part_id; + DBUG_RETURN(file->rnd_pos(buf, (pos + PARTITION_BYTES_IN_POS))); +} + + +/**************************************************************************** + MODULE index scan +****************************************************************************/ +/* + Positions an index cursor to the index specified in the handle. Fetches the + row if available. If the key value is null, begin at the first key of the + index. + + There are loads of optimisations possible here for the partition handler. + The same optimisations can also be checked for full table scan although + only through conditions and not from index ranges. + Phase one optimisations: + Check if the fields of the partition function are bound. 
If so only use + the single partition it becomes bound to. + Phase two optimisations: + If it can be deduced through range or list partitioning that only a + subset of the partitions are used, then only use those partitions. +*/ + +/* + index_init is always called before starting index scans (except when + starting through index_read_idx and using read_range variants). +*/ + +int ha_partition::index_init(uint inx, bool sorted) +{ + int error= 0; + handler **file; + DBUG_ENTER("ha_partition::index_init"); + + active_index= inx; + m_part_spec.start_part= NO_CURRENT_PART_ID; + m_start_key.length= 0; + m_ordered= sorted; + m_curr_key_info= table->key_info+inx; + include_partition_fields_in_used_fields(); + + file= m_file; + do + { + /* TODO RONM: Change to index_init() when code is stable */ + if ((error= (*file)->ha_index_init(inx, sorted))) + { + DBUG_ASSERT(0); // Should never happen + break; + } + } while (*(++file)); + DBUG_RETURN(error); +} + + +/* + index_end is called at the end of an index scan to clean up anything + that needs cleaning up. +*/ + +int ha_partition::index_end() +{ + int error= 0; + handler **file; + DBUG_ENTER("ha_partition::index_end"); + + active_index= MAX_KEY; + m_part_spec.start_part= NO_CURRENT_PART_ID; + file= m_file; + do + { + int tmp; + /* We want to execute index_end() on all handlers */ + /* TODO RONM: Change to index_end() when code is stable */ + if ((tmp= (*file)->ha_index_end())) + error= tmp; + } while (*(++file)); + DBUG_RETURN(error); +} + + +/* + index_read starts a new index scan using a start key. The MySQL Server + will check the end key on its own. Thus to function properly the + partitioned handler needs to ensure that it delivers records in the sort + order of the MySQL Server. + index_read can be restarted without calling index_end on the previous + index scan and without calling index_init. In this case the index_read + is on the same index as the previous index_scan. This is particularly + used in conjunction with multi read ranges. +*/ + +int ha_partition::index_read(byte * buf, const byte * key, + uint key_len, enum ha_rkey_function find_flag) +{ + DBUG_ENTER("ha_partition::index_read"); + end_range= 0; + DBUG_RETURN(common_index_read(buf, key, key_len, find_flag)); +} + + +int ha_partition::common_index_read(byte *buf, const byte *key, uint key_len, + enum ha_rkey_function find_flag) +{ + int error; + DBUG_ENTER("ha_partition::common_index_read"); + + memcpy((void*)m_start_key.key, key, key_len); + m_start_key.length= key_len; + m_start_key.flag= find_flag; + m_index_scan_type= partition_index_read; + + if ((error= partition_scan_set_up(buf, TRUE))) + { + DBUG_RETURN(error); + } + + if (!m_ordered_scan_ongoing || + (find_flag == HA_READ_KEY_EXACT && + (key_len >= m_curr_key_info->key_length || + key_len == 0))) + { + /* + We use unordered index scan either when read_range is used and flag + is set to not use ordered or when an exact key is used and in this + case all records will be sorted equal and thus the sort order of the + resulting records doesn't matter. + We also use an unordered index scan when the number of partitions to + scan is only one. + The unordered index scan will use the partition set created. + Need to set unordered scan ongoing since we can come here even when + it isn't set. + */ + m_ordered_scan_ongoing= FALSE; + error= handle_unordered_scan_next_partition(buf); + } + else + { + /* + In all other cases we will use the ordered index scan. This will use + the partition set created by the get_partition_set method.
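
      As an example of the choice above (key and predicate are
      illustrative): with an index on (a, b), a lookup on a = 1 AND b = 2
      arrives as HA_READ_KEY_EXACT with the full key length, so all
      matching rows compare equal on the key and the cheaper unordered
      scan suffices; a prefix lookup on a = 1 that must return rows in key
      order instead takes this ordered path and merge sorts the streams
      from the individual partitions.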
+    */
+    error= handle_ordered_index_scan(buf);
+  }
+  DBUG_RETURN(error);
+}
+
+
+/*
+  index_first() asks for the first key in the index.
+  This is similar to index_read except that there is no start key since
+  the scan starts from the leftmost entry and proceeds forward with
+  index_next.
+
+  Called from opt_range.cc, opt_sum.cc, sql_handler.cc,
+  and sql_select.cc.
+*/
+
+int ha_partition::index_first(byte * buf)
+{
+  DBUG_ENTER("ha_partition::index_first");
+  end_range= 0;
+  m_index_scan_type= partition_index_first;
+  DBUG_RETURN(common_first_last(buf));
+}
+
+
+/*
+  index_last() asks for the last key in the index.
+  This is similar to index_read except that there is no start key since
+  the scan starts from the rightmost entry and proceeds backward with
+  index_prev.
+
+  Called from opt_range.cc, opt_sum.cc, sql_handler.cc,
+  and sql_select.cc.
+*/
+
+int ha_partition::index_last(byte * buf)
+{
+  DBUG_ENTER("ha_partition::index_last");
+  m_index_scan_type= partition_index_last;
+  DBUG_RETURN(common_first_last(buf));
+}
+
+int ha_partition::common_first_last(byte *buf)
+{
+  int error;
+  if ((error= partition_scan_set_up(buf, FALSE)))
+    return error;
+  if (!m_ordered_scan_ongoing)
+    return handle_unordered_scan_next_partition(buf);
+  return handle_ordered_index_scan(buf);
+}
+
+/*
+  Positions an index cursor to the index specified in key. Fetches the
+  row if any. This is only used to read whole keys.
+  TODO: Optimise this code to avoid index_init and index_end
+*/
+
+int ha_partition::index_read_idx(byte * buf, uint index, const byte * key,
+                                 uint key_len,
+                                 enum ha_rkey_function find_flag)
+{
+  int res;
+  DBUG_ENTER("ha_partition::index_read_idx");
+  index_init(index, 0);
+  res= index_read(buf, key, key_len, find_flag);
+  index_end();
+  DBUG_RETURN(res);
+}
+
+/*
+  This is used in join_read_last_key to optimise away an ORDER BY.
+  Can only be used on indexes supporting HA_READ_ORDER.
+*/
+
+int ha_partition::index_read_last(byte *buf, const byte *key, uint keylen)
+{
+  DBUG_ENTER("ha_partition::index_read_last");
+  m_ordered= TRUE;                              // Safety measure
+  DBUG_RETURN(index_read(buf, key, keylen, HA_READ_PREFIX_LAST));
+}
+
+
+/*
+  Used to read forward through the index.
+*/
+
+int ha_partition::index_next(byte * buf)
+{
+  DBUG_ENTER("ha_partition::index_next");
+  /*
+    TODO(low priority):
+    If we want partition to work with the HANDLER commands, we
+    must be able to do index_last() -> index_prev() -> index_next()
+  */
+  DBUG_ASSERT(m_index_scan_type != partition_index_last);
+  if (!m_ordered_scan_ongoing)
+  {
+    DBUG_RETURN(handle_unordered_next(buf, FALSE));
+  }
+  DBUG_RETURN(handle_ordered_next(buf, FALSE));
+}
+
+
+/*
+  This routine is used to read the next row, but only if the key is
+  the same as the one supplied in the call.
+*/
+
+int ha_partition::index_next_same(byte *buf, const byte *key, uint keylen)
+{
+  DBUG_ENTER("ha_partition::index_next_same");
+  DBUG_ASSERT(keylen == m_start_key.length);
+  DBUG_ASSERT(m_index_scan_type != partition_index_last);
+  if (!m_ordered_scan_ongoing)
+    DBUG_RETURN(handle_unordered_next(buf, TRUE));
+  DBUG_RETURN(handle_ordered_next(buf, TRUE));
+}
+
+/*
+  Used to read backwards through the index.
+*/
+
+int ha_partition::index_prev(byte * buf)
+{
+  DBUG_ENTER("ha_partition::index_prev");
+  /* TODO: read comment in index_next */
+  DBUG_ASSERT(m_index_scan_type != partition_index_first);
+  DBUG_RETURN(handle_ordered_prev(buf));
+}
+
+
+/*
+  We reimplement read_range_first since we don't want the compare_key
+  check at the end.
+  This is already performed in the partition handler.
+  read_range_next is quite different since we need to scan all
+  underlying handlers.
+*/
+
+int ha_partition::read_range_first(const key_range *start_key,
+                                   const key_range *end_key,
+                                   bool eq_range_arg, bool sorted)
+{
+  int error;
+  DBUG_ENTER("ha_partition::read_range_first");
+  m_ordered= sorted;
+  eq_range= eq_range_arg;
+  end_range= 0;
+  if (end_key)
+  {
+    end_range= &save_end_range;
+    save_end_range= *end_key;
+    key_compare_result_on_equal=
+      ((end_key->flag == HA_READ_BEFORE_KEY) ? 1 :
+       (end_key->flag == HA_READ_AFTER_KEY) ? -1 : 0);
+  }
+  range_key_part= m_curr_key_info->key_part;
+
+  if (!start_key)                               // Read first record
+  {
+    m_index_scan_type= partition_index_first;
+    error= common_first_last(m_rec0);
+  }
+  else
+  {
+    error= common_index_read(m_rec0,
+                             start_key->key,
+                             start_key->length, start_key->flag);
+  }
+  DBUG_RETURN(error);
+}
+
+
+int ha_partition::read_range_next()
+{
+  DBUG_ENTER("ha_partition::read_range_next");
+  if (m_ordered)
+  {
+    DBUG_RETURN(handler::read_range_next());
+  }
+  DBUG_RETURN(handle_unordered_next(m_rec0, eq_range));
+}
+
+
+int ha_partition::partition_scan_set_up(byte * buf, bool idx_read_flag)
+{
+  DBUG_ENTER("ha_partition::partition_scan_set_up");
+
+  if (idx_read_flag)
+    get_partition_set(table, buf, active_index, &m_start_key, &m_part_spec);
+  else
+    get_partition_set(table, buf, MAX_KEY, 0, &m_part_spec);
+  if (m_part_spec.start_part > m_part_spec.end_part)
+  {
+    /*
+      We discovered a partition set but the set was empty so we report
+      key not found.
+    */
+    DBUG_PRINT("info", ("scan with no partition to scan"));
+    DBUG_RETURN(HA_ERR_END_OF_FILE);
+  }
+  if (m_part_spec.start_part == m_part_spec.end_part)
+  {
+    /*
+      We discovered a single partition to scan, this never needs to be
+      performed using the ordered index scan.
+    */
+    DBUG_PRINT("info", ("index scan using the single partition %d",
+                        m_part_spec.start_part));
+    m_ordered_scan_ongoing= FALSE;
+  }
+  else
+  {
+    /*
+      Set m_ordered_scan_ongoing according to how the scan should be done
+    */
+    m_ordered_scan_ongoing= m_ordered;
+  }
+  DBUG_ASSERT(m_part_spec.start_part < m_tot_parts &&
+              m_part_spec.end_part < m_tot_parts);
+  DBUG_RETURN(0);
+}
+
+
+/****************************************************************************
+  Unordered Index Scan Routines
+****************************************************************************/
+/*
+  These routines are used to scan partitions without considering order.
+  This is performed in two situations.
+  1) In read_multi_range this is the normal case
+  2) When performing any type of index_read, index_first, index_last where
+     all fields in the partition function are bound. In this case the index
+     scan is performed on only one partition and thus it isn't necessary to
+     perform any sort.
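+
+  A rough sketch of the unordered scan (illustrative only, not the exact
+  control flow of the code below):
+
+    for (p= m_part_spec.start_part; p <= m_part_spec.end_part; p++)
+    {
+      // position the cursor in partition p according to m_index_scan_type
+      // return rows from p until it reports end of file
+    }
+
+  handle_unordered_scan_next_partition implements the per-partition
+  positioning and handle_unordered_next the stepping within a partition.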
+*/
+
+int ha_partition::handle_unordered_next(byte *buf, bool next_same)
+{
+  handler *file= m_file[m_part_spec.start_part];
+  int error;
+  DBUG_ENTER("ha_partition::handle_unordered_next");
+
+  /*
+    We should consider if this should be split into two functions as
+    next_same is always a local constant
+  */
+  if (next_same)
+  {
+    if (!(error= file->index_next_same(buf, m_start_key.key,
+                                       m_start_key.length)))
+    {
+      m_last_part= m_part_spec.start_part;
+      DBUG_RETURN(0);
+    }
+  }
+  else if (!(error= file->index_next(buf)))
+  {
+    if (compare_key(end_range) <= 0)
+    {
+      m_last_part= m_part_spec.start_part;
+      DBUG_RETURN(0);                           // Row was in range
+    }
+    error= HA_ERR_END_OF_FILE;
+  }
+
+  if (error == HA_ERR_END_OF_FILE)
+  {
+    m_part_spec.start_part++;                   // Start using next part
+    error= handle_unordered_scan_next_partition(buf);
+  }
+  DBUG_RETURN(error);
+}
+
+
+/*
+  This routine is used to start the index scan on the next partition,
+  both at the initial start and after completing the scan on one
+  partition.
+*/
+
+int ha_partition::handle_unordered_scan_next_partition(byte * buf)
+{
+  uint i;
+  DBUG_ENTER("ha_partition::handle_unordered_scan_next_partition");
+
+  for (i= m_part_spec.start_part; i <= m_part_spec.end_part; i++)
+  {
+    int error;
+    handler *file= m_file[i];
+
+    m_part_spec.start_part= i;
+    switch (m_index_scan_type) {
+    case partition_index_read:
+      DBUG_PRINT("info", ("index_read on partition %d", i));
+      error= file->index_read(buf, m_start_key.key,
+                              m_start_key.length,
+                              m_start_key.flag);
+      break;
+    case partition_index_first:
+      DBUG_PRINT("info", ("index_first on partition %d", i));
+      error= file->index_first(buf);
+      break;
+    default:
+      DBUG_ASSERT(FALSE);
+      DBUG_RETURN(1);
+    }
+    if (!error)
+    {
+      if (compare_key(end_range) <= 0)
+      {
+        m_last_part= i;
+        DBUG_RETURN(0);
+      }
+      error= HA_ERR_END_OF_FILE;
+    }
+    if ((error != HA_ERR_END_OF_FILE) && (error != HA_ERR_KEY_NOT_FOUND))
+      DBUG_RETURN(error);
+    DBUG_PRINT("info", ("HA_ERR_END_OF_FILE on partition %d", i));
+  }
+  m_part_spec.start_part= NO_CURRENT_PART_ID;
+  DBUG_RETURN(HA_ERR_END_OF_FILE);
+}
+
+
+/*
+  This part contains the logic to handle index scans that require ordered
+  output. This includes all except those started by read_range_first with
+  the flag ordered set to FALSE. Thus most direct index_read calls and all
+  index_first and index_last calls.
+
+  We implement ordering by keeping one record plus a key buffer for each
+  partition. Every time a new entry is requested we will fetch a new
+  entry from the partition that is currently not filled with an entry.
+  Then the entry is put into its proper sort position.
+
+  Returning a record is done by getting the top record and copying the
+  record to the request buffer; when a partition has no more entries its
+  element is removed from the queue.
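+
+  A small worked example (illustrative only): with three partitions whose
+  matching keys are (1,4), (2) and (3), the queue initially holds the
+  heads 1, 2 and 3. We return 1 and refill from its partition, which
+  yields 4; the queue then orders 2, 3, 4, and subsequent calls return
+  2, 3 and 4.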
+*/
+
+int ha_partition::handle_ordered_index_scan(byte *buf)
+{
+  uint i, j= 0;
+  bool found= FALSE;
+  bool reverse_order= FALSE;
+  DBUG_ENTER("ha_partition::handle_ordered_index_scan");
+
+  m_top_entry= NO_CURRENT_PART_ID;
+  queue_remove_all(&queue);
+  for (i= m_part_spec.start_part; i <= m_part_spec.end_part; i++)
+  {
+    int error;
+    byte *rec_buf_ptr= rec_buf(i);
+    handler *file= m_file[i];
+
+    switch (m_index_scan_type) {
+    case partition_index_read:
+      error= file->index_read(rec_buf_ptr,
+                              m_start_key.key,
+                              m_start_key.length,
+                              m_start_key.flag);
+      reverse_order= FALSE;
+      break;
+    case partition_index_first:
+      error= file->index_first(rec_buf_ptr);
+      reverse_order= FALSE;
+      break;
+    case partition_index_last:
+      error= file->index_last(rec_buf_ptr);
+      reverse_order= TRUE;
+      break;
+    default:
+      DBUG_ASSERT(FALSE);
+      DBUG_RETURN(HA_ERR_END_OF_FILE);
+    }
+    if (!error)
+    {
+      found= TRUE;
+      /*
+        Initialise queue without order first, simply insert
+      */
+      queue_element(&queue, j++)= (byte*)queue_buf(i);
+    }
+    else if (error != HA_ERR_KEY_NOT_FOUND && error != HA_ERR_END_OF_FILE)
+    {
+      DBUG_RETURN(error);
+    }
+  }
+  if (found)
+  {
+    /*
+      We found at least one partition with data, now sort all entries and
+      after that read the first entry and copy it to the buffer to return in.
+    */
+    queue_set_max_at_top(&queue, reverse_order);
+    queue_set_cmp_arg(&queue, (void*)m_curr_key_info);
+    queue.elements= j;
+    queue_fix(&queue);
+    return_top_record(buf);
+    DBUG_PRINT("info", ("Record returned from partition %d", m_top_entry));
+    DBUG_RETURN(0);
+  }
+  DBUG_RETURN(HA_ERR_END_OF_FILE);
+}
+
+
+void ha_partition::return_top_record(byte *buf)
+{
+  uint part_id;
+  byte *key_buffer= queue_top(&queue);
+  byte *rec_buffer= key_buffer + PARTITION_BYTES_IN_POS;
+  part_id= uint2korr(key_buffer);
+  memcpy(buf, rec_buffer, m_rec_length);
+  m_last_part= part_id;
+  m_top_entry= part_id;
+}
+
+
+int ha_partition::handle_ordered_next(byte *buf, bool next_same)
+{
+  int error;
+  uint part_id= m_top_entry;
+  handler *file= m_file[part_id];
+  DBUG_ENTER("ha_partition::handle_ordered_next");
+
+  if (!next_same)
+    error= file->index_next(rec_buf(part_id));
+  else
+    error= file->index_next_same(rec_buf(part_id), m_start_key.key,
+                                 m_start_key.length);
+  if (error)
+  {
+    if (error == HA_ERR_END_OF_FILE)
+    {
+      /* Return next buffered row */
+      queue_remove(&queue, (uint) 0);
+      if (queue.elements)
+      {
+        DBUG_PRINT("info", ("Record returned from partition %u (2)",
+                            m_top_entry));
+        return_top_record(buf);
+        error= 0;
+      }
+    }
+    DBUG_RETURN(error);
+  }
+  queue_replaced(&queue);
+  return_top_record(buf);
+  DBUG_PRINT("info", ("Record returned from partition %u", m_top_entry));
+  DBUG_RETURN(0);
+}
+
+
+int ha_partition::handle_ordered_prev(byte *buf)
+{
+  int error;
+  uint part_id= m_top_entry;
+  handler *file= m_file[part_id];
+  DBUG_ENTER("ha_partition::handle_ordered_prev");
+  if ((error= file->index_prev(rec_buf(part_id))))
+  {
+    if (error == HA_ERR_END_OF_FILE)
+    {
+      queue_remove(&queue, (uint) 0);
+      if (queue.elements)
+      {
+        return_top_record(buf);
+        DBUG_PRINT("info", ("Record returned from partition %d (2)",
+                            m_top_entry));
+        error= 0;
+      }
+    }
+    DBUG_RETURN(error);
+  }
+  queue_replaced(&queue);
+  return_top_record(buf);
+  DBUG_PRINT("info", ("Record returned from partition %d", m_top_entry));
+  DBUG_RETURN(0);
+}
+
+
+void ha_partition::include_partition_fields_in_used_fields()
+{
+  DBUG_ENTER("ha_partition::include_partition_fields_in_used_fields");
+  Field **ptr= m_part_field_array;
+  do
+  {
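+    /* Mark each partition-function field as read so its value is fetched */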
+    ha_set_bit_in_read_set((*ptr)->fieldnr);
+  } while (*(++ptr));
+  DBUG_VOID_RETURN;
+}
+
+
+/****************************************************************************
+  MODULE information calls
+****************************************************************************/
+
+/*
+  These are all first approximations of the extra, info, scan_time
+  and read_time calls
+*/
+
+/*
+  ::info() is used to return information to the optimizer.
+  Currently this table handler doesn't implement most of the fields
+  really needed. SHOW also makes use of this data.
+  Another note: if your handler doesn't provide an exact record count,
+  you will probably want to have the following in your code:
+    if (records < 2)
+      records= 2;
+  The reason is that the server will optimize for cases of only a single
+  record. If in a table scan you don't know the number of records,
+  it will probably be better to set records to two so you can return
+  as many records as you need.
+
+  Along with records, a few more variables you may wish to set are:
+    records
+    deleted
+    data_file_length
+    index_file_length
+    delete_length
+    check_time
+  Take a look at the public variables in handler.h for more information.
+
+  Called in:
+    filesort.cc
+    ha_heap.cc
+    item_sum.cc
+    opt_sum.cc
+    sql_delete.cc
+    sql_delete.cc
+    sql_derived.cc
+    sql_select.cc
+    sql_select.cc
+    sql_select.cc
+    sql_select.cc
+    sql_select.cc
+    sql_show.cc
+    sql_show.cc
+    sql_show.cc
+    sql_show.cc
+    sql_table.cc
+    sql_union.cc
+    sql_update.cc
+
+  Some flags that are not implemented
+    HA_STATUS_POS:
+      This parameter is never used from the MySQL Server. It is checked in
+      one place in MyISAM so could potentially be used by MyISAM specific
+      programs.
+    HA_STATUS_NO_LOCK:
+      This is declared and often used. It's only used by MyISAM.
+      It means that MySQL doesn't need the absolute latest statistics
+      information. This may save the handler from doing internal locks while
+      retrieving statistics data.
+*/
+
+void ha_partition::info(uint flag)
+{
+  handler *file, **file_array;
+  DBUG_ENTER("ha_partition::info");
+
+  if (flag & HA_STATUS_AUTO)
+  {
+    DBUG_PRINT("info", ("HA_STATUS_AUTO"));
+    /*
+      The auto increment value is only maintained by the first handler,
+      so we will only call this.
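+      Note (assumption): all partitions draw from one shared sequence,
+      which is kept by the handler of the first partition; reading the
+      value from m_file[0] is therefore sufficient here.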
+    */
+    m_file[0]->info(HA_STATUS_AUTO);
+  }
+  if (flag & HA_STATUS_VARIABLE)
+  {
+    DBUG_PRINT("info", ("HA_STATUS_VARIABLE"));
+    /*
+      Calculates statistical variables
+      records:            Estimate of the number of records in the table
+                          We report the sum (always at least 2)
+      deleted:            Estimate of the number of holes in the table due
+                          to deletes
+                          We report the sum
+      data_file_length:   Length of data file, in principle bytes in table
+                          We report the sum
+      index_file_length:  Length of index file, in principle bytes in
+                          indexes in the table
+                          We report the sum
+      mean_record_length: Mean record length in the table
+                          We calculate this
+      check_time:         Time of last check (only applicable to MyISAM)
+                          We report the last time of all underlying handlers
+    */
+    records= 0;
+    deleted= 0;
+    data_file_length= 0;
+    index_file_length= 0;
+    check_time= 0;
+    file_array= m_file;
+    do
+    {
+      file= *file_array;
+      file->info(HA_STATUS_VARIABLE);
+      records+= file->records;
+      deleted+= file->deleted;
+      data_file_length+= file->data_file_length;
+      index_file_length+= file->index_file_length;
+      if (file->check_time > check_time)
+        check_time= file->check_time;
+    } while (*(++file_array));
+    if (records < 2)
+      records= 2;
+    mean_rec_length= (ulong) (data_file_length / records);
+  }
+  if (flag & HA_STATUS_CONST)
+  {
+    DBUG_PRINT("info", ("HA_STATUS_CONST"));
+    /*
+      Recalculates loads of constant variables. MyISAM also sets things
+      directly on the table share object.
+
+      Check whether this should be fixed since handlers should not
+      change things directly on the table object.
+
+      Monty comment: This should NOT be changed! It's the handler's
+      responsibility to correct table->s->keys_xxxx information if keys
+      have been disabled.
+
+      The most important parameters set here are records per key on
+      all indexes, block_size and primary key ref_length.
+
+      For each index there is an array of rec_per_key.
+      As an example, if we have an index with three attributes a, b and c
+      we will have an array of 3 rec_per_key values.
+      rec_per_key[0] is an estimate of the number of records divided by
+      the number of unique values of the field a.
+      rec_per_key[1] is an estimate of the number of records divided
+      by the number of unique combinations of the fields a and b.
+      rec_per_key[2] is an estimate of the number of records divided
+      by the number of unique combinations of the fields a, b and c.
+
+      Many handlers only set the value of rec_per_key when all fields
+      are bound (rec_per_key[2] in the example above).
+
+      If the handler doesn't support statistics, it should set all of the
+      above to 0.
+
+      We will allow the first handler to set the rec_per_key and use
+      this as an estimate on the total table.
+
+      max_data_file_length:  Maximum data file length
+                             We ignore it, it is only used in
+                             SHOW TABLE STATUS
+      max_index_file_length: Maximum index file length
+                             We ignore it since it is never used
+      block_size:            Block size used
+                             We set it to the value of the first handler
+      sortkey:               Never used at any place so ignored
+      ref_length:            We set this to the value calculated
+                             and stored in the local object
+      raid_type:             Set by first handler (MyISAM)
+      raid_chunks:           Set by first handler (MyISAM)
+      raid_chunksize:        Set by first handler (MyISAM)
+      create_time:           Creation time of table
+                             Set by first handler
+
+      So we calculate these constants by using the variables on the first
+      handler.
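+
+      As an illustration of rec_per_key: with 1000 records and an index
+      on (a,b), where a has 10 distinct values and (a,b) has 500 distinct
+      combinations, rec_per_key[0] would be 100 and rec_per_key[1]
+      would be 2.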
+    */
+
+    file= m_file[0];
+    file->info(HA_STATUS_CONST);
+    create_time= file->create_time;
+    raid_type= file->raid_type;
+    raid_chunks= file->raid_chunks;
+    raid_chunksize= file->raid_chunksize;
+    ref_length= m_ref_length;
+  }
+  if (flag & HA_STATUS_ERRKEY)
+  {
+    handler *file= m_file[m_last_part];
+    DBUG_PRINT("info", ("info: HA_STATUS_ERRKEY"));
+    /*
+      This flag is used to get the index number of the unique index that
+      reported a duplicate key.
+      We will report the errkey on the last handler used and ignore the rest.
+    */
+    file->info(HA_STATUS_ERRKEY);
+    if (file->errkey != (uint) -1)
+      errkey= file->errkey;
+  }
+  if (flag & HA_STATUS_TIME)
+  {
+    DBUG_PRINT("info", ("info: HA_STATUS_TIME"));
+    /*
+      This flag is used to set the latest update time of the table.
+      Used by SHOW commands.
+      We will report the maximum of these times.
+    */
+    update_time= 0;
+    file_array= m_file;
+    do
+    {
+      file= *file_array;
+      file->info(HA_STATUS_TIME);
+      if (file->update_time > update_time)
+        update_time= file->update_time;
+    } while (*(++file_array));
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  extra() is called whenever the server wishes to send a hint to
+  the storage engine. The MyISAM engine implements the most hints.
+
+  We divide the parameters into the following categories:
+  1) Parameters used by most handlers
+  2) Parameters used by some non-MyISAM handlers
+  3) Parameters used only by MyISAM
+  4) Parameters only used by temporary tables for query processing
+  5) Parameters only used by MyISAM internally
+  6) Parameters not used at all
+
+  The partition handler needs to handle categories 1), 2) and 3).
+
+  1) Parameters used by most handlers
+  -----------------------------------
+  HA_EXTRA_RESET:
+    This option is used by most handlers and it resets the handler state
+    to the same state as after an open call. This includes releasing
+    any READ CACHE or WRITE CACHE or other internal buffers used.
+
+    It is called from the reset method in the handler interface. There are
+    three instances where this is called.
+    1) After completing an INSERT ... SELECT ... query the handler for the
+       table inserted into is reset
+    2) It is called from close_thread_table which in turn is called from
+       close_thread_tables except in the case where the tables are locked,
+       in which case ha_commit_stmt is called instead.
+       It is only called from here if flush_version hasn't changed and the
+       table is not an old table when calling close_thread_table.
+       close_thread_tables is called from many places as a general clean up
+       function after completing a query.
+    3) It is called when deleting the QUICK_RANGE_SELECT object if the
+       QUICK_RANGE_SELECT object had its own handler object. It is called
+       immediately before close of this local handler object.
+  HA_EXTRA_KEYREAD:
+  HA_EXTRA_NO_KEYREAD:
+    These parameters are used to provide an optimisation hint to the
+    handler. If HA_EXTRA_KEYREAD is set it is enough to read the index
+    fields; for many handlers this means that index-only scans can be
+    used and it is not necessary to use the real records to satisfy
+    this part of the query. Index-only scans are a very important
+    optimisation for disk-based indexes. For main-memory indexes most
+    indexes contain a reference to the record and thus KEYREAD only
+    says that it is enough to read key fields.
+    HA_EXTRA_NO_KEYREAD disables this for the handler; HA_EXTRA_RESET
+    will also disable this option.
+    The handler will set HA_KEYREAD_ONLY in its table flags to indicate
+    this feature is supported.
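+    These category 1) calls are simply forwarded by the partition handler
+    to every underlying handler, along the lines of:
+      for (file= m_file; *file; file++)
+        (*file)->extra(operation);
+    (this is what loop_extra below does; see ha_partition::extra).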
+  HA_EXTRA_FLUSH:
+    Indication to flush tables to disk, called at close_thread_table to
+    ensure disk based tables are flushed at end of query execution.
+
+  2) Parameters used by some non-MyISAM handlers
+  ----------------------------------------------
+  HA_EXTRA_RETRIEVE_ALL_COLS:
+    Many handlers have implemented optimisations to avoid fetching all
+    fields when retrieving data. In certain situations all fields need
+    to be retrieved even though the query_id is not set on all field
+    objects.
+
+    It is called from copy_data_between_tables where all fields are
+    copied without setting query_id before calling the handlers.
+    It is called from UPDATE statements when the fields of the index
+    used are updated or ORDER BY is used with UPDATE.
+    And finally when calculating the checksum of a table using the
+    CHECKSUM command.
+  HA_EXTRA_RETRIEVE_PRIMARY_KEY:
+    In some situations it is mandatory to retrieve primary key fields
+    independent of the query ids. This extra flag specifies that fetching
+    of primary key fields is mandatory.
+  HA_EXTRA_KEYREAD_PRESERVE_FIELDS:
+    This is a strictly InnoDB feature that is more or less undocumented.
+    When it is activated InnoDB copies field by field from its fetch
+    cache instead of all fields in one memcpy. Have no idea what the
+    purpose of this is.
+    Cut from include/my_base.h:
+      When using HA_EXTRA_KEYREAD, overwrite only key member fields and keep
+      other fields intact. When this is off (by default) InnoDB will use
+      memcpy to overwrite the entire row.
+  HA_EXTRA_IGNORE_DUP_KEY:
+  HA_EXTRA_NO_IGNORE_DUP_KEY:
+    Informs the handler that we will not stop the transaction if we get
+    duplicate key errors during insert/update.
+    Always called in pairs, triggered by INSERT IGNORE and other similar
+    SQL constructs.
+    Not used by MyISAM.
+
+  3) Parameters used only by MyISAM
+  ---------------------------------
+  HA_EXTRA_NORMAL:
+    Only used in MyISAM to reset quick mode, not implemented by any other
+    handler. Quick mode is also reset in MyISAM by HA_EXTRA_RESET.
+
+    It is called after completing a successful DELETE query if the QUICK
+    option is set.
+
+  HA_EXTRA_QUICK:
+    When the user does DELETE QUICK FROM table where-clause; this extra
+    option is called before the delete query is performed and
+    HA_EXTRA_NORMAL is called after the delete query is completed.
+    Temporary tables used internally in MySQL always set this option.
+
+    The meaning of quick mode is that when deleting in a B-tree no merging
+    of leaves is performed. This is a common method and many large DBMSs
+    actually only support this quick mode since it is very difficult to
+    merge leaves in a tree used by many threads concurrently.
+
+  HA_EXTRA_CACHE:
+    This flag is usually set with extra_opt along with a cache size.
+    The size of this buffer is set by the user variable
+    record_buffer_size. The value of this cache size is the amount of
+    data read from disk in each fetch when performing a table scan.
+    This means that before scanning a table it is normal to call
+    extra with HA_EXTRA_CACHE and when the scan is completed to call
+    HA_EXTRA_NO_CACHE to release the cache memory.
+
+    Some special care is taken when using this extra parameter since there
+    could be a write ongoing on the table in the same statement. In this
+    case one has to take special care since there might be a WRITE CACHE
+    as well. HA_EXTRA_CACHE specifies using a READ CACHE, and using
+    READ CACHE and WRITE CACHE at the same time is not possible.
+
+    Only MyISAM currently uses this option.
+
+    It is set when doing full table scans using rr_sequential and
+    reset when completing such a scan with end_read_record
+    (resetting means calling extra with HA_EXTRA_NO_CACHE).
+
+    It is set in filesort.cc for MyISAM internal tables and it is set in
+    a multi-update where HA_EXTRA_CACHE is called on a temporary result
+    table and after that ha_rnd_init(0) on the table to be updated
+    and immediately after that HA_EXTRA_NO_CACHE on the table to be
+    updated.
+
+    Apart from that it is always used from init_read_record but not when
+    used from UPDATE statements. It is not used from DELETE statements
+    with ORDER BY and LIMIT but it is used in the normal scan loop in
+    DELETE statements. The reason here is that DELETEs in MyISAM don't
+    move existing data rows.
+
+    It is also set in copy_data_between_tables when scanning the old table
+    to copy over to the new table.
+    And it is set in join_init_read_record where quick objects are used
+    to perform a scan on the table. In this case the full table scan can
+    even be performed multiple times as part of the nested loop join.
+
+    For purposes of the partition handler it is obviously necessary to have
+    special treatment of this extra call. If we would simply pass this
+    extra call down to each handler we would allocate
+    cache size * number of partitions amount of memory and this is not
+    necessary since we will only scan one partition at a time when doing
+    full table scans.
+
+    Thus we treat it by first checking whether we have MyISAM handlers in
+    the table; if not we simply ignore the call, and if we have we will
+    record the call but will not call any underlying handler yet. Then
+    when performing the sequential scan we will check this recorded value
+    and call extra_opt whenever we start scanning a new partition.
+
+    monty: Needs to be fixed so that it's passed to all handlers when we
+    move to another partition during table scan.
+
+  HA_EXTRA_NO_CACHE:
+    When performing a UNION SELECT, HA_EXTRA_NO_CACHE is called from the
+    flush method in the select_union class.
+    It is also used to some extent by INSERT DELAYED.
+    See HA_EXTRA_RESET_STATE for use in conjunction with delete_all_rows().
+
+    It should be ok to call HA_EXTRA_NO_CACHE on all underlying handlers
+    if they are MyISAM handlers; for other handlers we can ignore the
+    call. If no cache is in use they will quickly return after finding
+    this out. And we also ensure that all caches are disabled and none
+    is left by mistake.
+    In the future this call will probably be deleted and we will instead
+    call ::reset();
+
+  HA_EXTRA_WRITE_CACHE:
+    See above, called from various places. It is mostly used when we
+    do INSERT ... SELECT.
+    No special handling to save cache space is developed currently.
+
+  HA_EXTRA_PREPARE_FOR_UPDATE:
+    This is called as part of a multi-table update. When the table to be
+    updated is also scanned then this informs the MyISAM handler to drop
+    any caches if dynamic records are used (fixed size records do not care
+    about this call). We pass this along to all underlying MyISAM handlers
+    and ignore it for the rest.
+
+  HA_EXTRA_PREPARE_FOR_DELETE:
+    Only used by MyISAM, called in preparation for a DROP TABLE.
+    It's used mostly on Windows, which cannot handle dropping an open
+    file. On other platforms it has the same effect as
+    HA_EXTRA_FORCE_REOPEN.
+
+  HA_EXTRA_READCHECK:
+  HA_EXTRA_NO_READCHECK:
+    Only one call to HA_EXTRA_NO_READCHECK from ha_open where it says that
+    this is not needed in SQL.
+    The reason for this call is that MyISAM sets READ_CHECK_USED in the
+    open call, so the call is needed for MyISAM to reset this feature.
+    The idea with this parameter was to inform of doing/not doing a read
+    check before applying an update. Since SQL always performs a read
+    before applying the update, no read check is needed in MyISAM as
+    well.
+
+    This is a cut from Docs/myisam.txt:
+      Sometimes you might want to force an update without checking whether
+      another user has changed the record since you last read it. This is
+      somewhat dangerous, so it should ideally not be used. That can be
+      accomplished by wrapping the mi_update() call in two calls to
+      mi_extra(), using these functions:
+        HA_EXTRA_NO_READCHECK=5   No readcheck on update
+        HA_EXTRA_READCHECK=6      Use readcheck (def)
+
+  HA_EXTRA_FORCE_REOPEN:
+    Only used by MyISAM, called when altering table, closing tables to
+    enforce a reopen of the table files.
+
+  4) Parameters only used by temporary tables for query processing
+  ----------------------------------------------------------------
+  HA_EXTRA_RESET_STATE:
+    Same as HA_EXTRA_RESET except that buffers are not released. If there
+    is a READ CACHE it is reinit'ed. A cache is reinit'ed to restart
+    reading or to change the type of cache between READ CACHE and
+    WRITE CACHE.
+
+    This extra function is always called immediately before calling
+    delete_all_rows on the handler for temporary tables.
+    There are cases however when HA_EXTRA_RESET_STATE isn't called in
+    a similar case for a temporary table in sql_union.cc, and in two other
+    cases HA_EXTRA_NO_CACHE is called before and HA_EXTRA_WRITE_CACHE
+    is called afterwards.
+    The case with HA_EXTRA_NO_CACHE and HA_EXTRA_WRITE_CACHE means
+    disable caching, delete all rows and enable WRITE CACHE. This is
+    used for temporary tables containing distinct sums and a
+    functional group.
+
+    The only case where delete_all_rows is called on non-temporary tables
+    is in sql_delete.cc when DELETE FROM table; is called by a user.
+    In this case no special extra calls are performed before or after this
+    call.
+
+    The partition handler should not need to bother about this one. It
+    should never be called.
+
+  HA_EXTRA_NO_ROWS:
+    Don't insert rows indication to HEAP and MyISAM, only used by
+    temporary tables used in query processing.
+    Not handled by the partition handler.
+
+  5) Parameters only used by MyISAM internally
+  --------------------------------------------
+  HA_EXTRA_REINIT_CACHE:
+    This call reinitialises the READ CACHE described above if there is one
+    and otherwise the call is ignored.
+
+    We can thus safely call it on all underlying handlers if they are
+    MyISAM handlers. It is however never called, so we don't handle it
+    at all.
+  HA_EXTRA_FLUSH_CACHE:
+    Flush WRITE CACHE in MyISAM. It is only called from one place in the
+    code. This is in sql_insert.cc where it is called if the table_flags
+    don't contain HA_DUPP_POS. The only handler having the HA_DUPP_POS
+    set is the MyISAM handler and so the only handler not receiving this
+    call is MyISAM.
+    Thus in effect this call is called but never used. Could be removed
+    from sql_insert.cc.
+  HA_EXTRA_NO_USER_CHANGE:
+    Only used by MyISAM, never called.
+    Simulates lock_type as locked.
+  HA_EXTRA_WAIT_LOCK:
+  HA_EXTRA_WAIT_NOLOCK:
+    Only used by MyISAM, called from the MyISAM handler but never from
+    server code on top of the handler.
+    Sets lock_wait on/off.
+  HA_EXTRA_NO_KEYS:
+    Only used by MyISAM, only used internally in the MyISAM handler,
+    never called from server level.
+  HA_EXTRA_KEYREAD_CHANGE_POS:
+  HA_EXTRA_REMEMBER_POS:
+  HA_EXTRA_RESTORE_POS:
+  HA_EXTRA_PRELOAD_BUFFER_SIZE:
+  HA_EXTRA_CHANGE_KEY_TO_DUP:
+  HA_EXTRA_CHANGE_KEY_TO_UNIQUE:
+    Only used by MyISAM, never called.
+
+  6) Parameters not used at all
+  -----------------------------
+  HA_EXTRA_KEY_CACHE:
+  HA_EXTRA_NO_KEY_CACHE:
+    These parameters are no longer used and could be removed.
+*/
+
+int ha_partition::extra(enum ha_extra_function operation)
+{
+  DBUG_ENTER("ha_partition::extra");
+  DBUG_PRINT("info", ("operation: %d", (int) operation));
+
+  switch (operation) {
+    /* Category 1), used by most handlers */
+  case HA_EXTRA_KEYREAD:
+  case HA_EXTRA_NO_KEYREAD:
+  case HA_EXTRA_FLUSH:
+    DBUG_RETURN(loop_extra(operation));
+
+    /* Category 2), used by non-MyISAM handlers */
+  case HA_EXTRA_IGNORE_DUP_KEY:
+  case HA_EXTRA_NO_IGNORE_DUP_KEY:
+  case HA_EXTRA_RETRIEVE_ALL_COLS:
+  case HA_EXTRA_RETRIEVE_PRIMARY_KEY:
+  case HA_EXTRA_KEYREAD_PRESERVE_FIELDS:
+  {
+    if (!m_myisam)
+      DBUG_RETURN(loop_extra(operation));
+    break;
+  }
+
+    /* Category 3), used by MyISAM handlers */
+  case HA_EXTRA_NORMAL:
+  case HA_EXTRA_QUICK:
+  case HA_EXTRA_NO_READCHECK:
+  case HA_EXTRA_PREPARE_FOR_UPDATE:
+  case HA_EXTRA_PREPARE_FOR_DELETE:
+  case HA_EXTRA_FORCE_REOPEN:
+  {
+    if (m_myisam)
+      DBUG_RETURN(loop_extra(operation));
+    break;
+  }
+  case HA_EXTRA_CACHE:
+  {
+    prepare_extra_cache(0);
+    break;
+  }
+  case HA_EXTRA_NO_CACHE:
+  {
+    m_extra_cache= FALSE;
+    m_extra_cache_size= 0;
+    DBUG_RETURN(loop_extra(operation));
+  }
+  default:
+  {
+    /* Temporary crash to discover what is wrong */
+    DBUG_ASSERT(0);
+    break;
+  }
+  }
+  DBUG_RETURN(0);
+}
+
+
+/*
+  This will in the future be called instead of extra(HA_EXTRA_RESET) as
+  this is such a common call.
+*/
+
+int ha_partition::reset(void)
+{
+  int result= 0, tmp;
+  handler **file;
+  DBUG_ENTER("ha_partition::reset");
+  file= m_file;
+  do
+  {
+    if ((tmp= (*file)->reset()))
+      result= tmp;
+  } while (*(++file));
+  DBUG_RETURN(result);
+}
+
+
+int ha_partition::extra_opt(enum ha_extra_function operation, ulong cachesize)
+{
+  DBUG_ENTER("ha_partition::extra_opt()");
+  DBUG_ASSERT(HA_EXTRA_CACHE == operation);
+  prepare_extra_cache(cachesize);
+  DBUG_RETURN(0);
+}
+
+
+void ha_partition::prepare_extra_cache(uint cachesize)
+{
+  DBUG_ENTER("ha_partition::prepare_extra_cache()");
+
+  m_extra_cache= TRUE;
+  m_extra_cache_size= cachesize;
+  if (m_part_spec.start_part != NO_CURRENT_PART_ID)
+  {
+    DBUG_ASSERT(m_part_spec.start_part == 0);
+    late_extra_cache(0);
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+int ha_partition::loop_extra(enum ha_extra_function operation)
+{
+  int result= 0, tmp;
+  handler **file;
+  DBUG_ENTER("ha_partition::loop_extra()");
+  for (file= m_file; *file; file++)
+  {
+    if ((tmp= (*file)->extra(operation)))
+      result= tmp;
+  }
+  DBUG_RETURN(result);
+}
+
+
+void ha_partition::late_extra_cache(uint partition_id)
+{
+  handler *file;
+  DBUG_ENTER("ha_partition::late_extra_cache");
+  if (!m_extra_cache)
+    DBUG_VOID_RETURN;
+  file= m_file[partition_id];
+  if (m_extra_cache_size == 0)
+    VOID(file->extra(HA_EXTRA_CACHE));
+  else
+    VOID(file->extra_opt(HA_EXTRA_CACHE, m_extra_cache_size));
+  DBUG_VOID_RETURN;
+}
+
+
+void ha_partition::late_extra_no_cache(uint partition_id)
+{
+  handler *file;
+  DBUG_ENTER("ha_partition::late_extra_no_cache");
+  if (!m_extra_cache)
+    DBUG_VOID_RETURN;
+  file= m_file[partition_id];
+  VOID(file->extra(HA_EXTRA_NO_CACHE));
+  DBUG_VOID_RETURN;
+}
+
+
+/****************************************************************************
+  MODULE optimiser support
+****************************************************************************/
+
+const key_map *ha_partition::keys_to_use_for_scanning()
+{
+  DBUG_ENTER("ha_partition::keys_to_use_for_scanning");
+  DBUG_RETURN(m_file[0]->keys_to_use_for_scanning());
+}
+
+double ha_partition::scan_time()
+{
+  double scan_time= 0;
+  handler **file;
+  DBUG_ENTER("ha_partition::scan_time");
+
+  for (file= m_file; *file; file++)
+    scan_time+= (*file)->scan_time();
+  DBUG_RETURN(scan_time);
+}
+
+
+/*
+  This will be optimised later to include whether or not the index can
+  be used with partitioning. To achieve this we need to add another
+  parameter that specifies how many of the index fields are bound in the
+  ranges. Possibly added as a new call to handlers.
+*/
+
+double ha_partition::read_time(uint index, uint ranges, ha_rows rows)
+{
+  DBUG_ENTER("ha_partition::read_time");
+  DBUG_RETURN(m_file[0]->read_time(index, ranges, rows));
+}
+
+/*
+  Given a starting key and an ending key, estimate the number of rows that
+  will exist between the two. end_key may be empty, in which case we
+  determine if start_key matches any rows.
+
+  Called from opt_range.cc by check_quick_keys().
+
+  monty: MUST be called for each range and added.
+         Note that MySQL will assume that if this returns 0 there are no
+         matching rows for the range!
+*/
+
+ha_rows ha_partition::records_in_range(uint inx, key_range *min_key,
+                                       key_range *max_key)
+{
+  ha_rows in_range= 0;
+  handler **file;
+  DBUG_ENTER("ha_partition::records_in_range");
+
+  file= m_file;
+  do
+  {
+    in_range+= (*file)->records_in_range(inx, min_key, max_key);
+  } while (*(++file));
+  DBUG_RETURN(in_range);
+}
+
+
+ha_rows ha_partition::estimate_rows_upper_bound()
+{
+  ha_rows rows, tot_rows= 0;
+  handler **file;
+  DBUG_ENTER("ha_partition::estimate_rows_upper_bound");
+
+  file= m_file;
+  do
+  {
+    rows= (*file)->estimate_rows_upper_bound();
+    if (rows == HA_POS_ERROR)
+      DBUG_RETURN(HA_POS_ERROR);
+    tot_rows+= rows;
+  } while (*(++file));
+  DBUG_RETURN(tot_rows);
+}
+
+
+uint8 ha_partition::table_cache_type()
+{
+  DBUG_ENTER("ha_partition::table_cache_type");
+  DBUG_RETURN(m_file[0]->table_cache_type());
+}
+
+
+/****************************************************************************
+  MODULE print messages
+****************************************************************************/
+
+const char *ha_partition::index_type(uint inx)
+{
+  DBUG_ENTER("ha_partition::index_type");
+  DBUG_RETURN(m_file[0]->index_type(inx));
+}
+
+
+void ha_partition::print_error(int error, myf errflag)
+{
+  DBUG_ENTER("ha_partition::print_error");
+  /* Should probably look for my own errors first */
+  /* monty: needs to be called for the last used partition! */
+  m_file[0]->print_error(error, errflag);
+  DBUG_VOID_RETURN;
+}
+
+
+bool ha_partition::get_error_message(int error, String *buf)
+{
+  DBUG_ENTER("ha_partition::get_error_message");
+  /* Should probably look for my own errors first */
+  /* monty: needs to be called for the last used partition! */
+  DBUG_RETURN(m_file[0]->get_error_message(error, buf));
+}
+
+
+/****************************************************************************
+  MODULE handler characteristics
+****************************************************************************/
+/*
+  If frm_error() is called then we will use this to find out what file
+  extensions exist for the storage engine. This is also used by the default
+  rename_table and delete_table methods in handler.cc.
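+
+  For example (assuming ha_par_ext is ".par"), the default delete_table
+  of a table t1 would remove t1.par in addition to the files of each
+  underlying handler.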
+*/
+
+static const char *ha_partition_ext[]=
+{
+  ha_par_ext, NullS
+};
+
+const char **ha_partition::bas_ext() const
+{ return ha_partition_ext; }
+
+
+uint ha_partition::min_of_the_max_uint(uint (handler::*operator_func)(void) const) const
+{
+  handler **file;
+  uint min_of_the_max= ((*m_file)->*operator_func)();
+
+  for (file= m_file+1; *file; file++)
+  {
+    uint tmp= ((*file)->*operator_func)();
+    set_if_smaller(min_of_the_max, tmp);
+  }
+  return min_of_the_max;
+}
+
+
+uint ha_partition::max_supported_key_parts() const
+{
+  return min_of_the_max_uint(&handler::max_supported_key_parts);
+}
+
+
+uint ha_partition::max_supported_key_length() const
+{
+  return min_of_the_max_uint(&handler::max_supported_key_length);
+}
+
+
+uint ha_partition::max_supported_key_part_length() const
+{
+  return min_of_the_max_uint(&handler::max_supported_key_part_length);
+}
+
+
+uint ha_partition::max_supported_record_length() const
+{
+  return min_of_the_max_uint(&handler::max_supported_record_length);
+}
+
+
+uint ha_partition::max_supported_keys() const
+{
+  return min_of_the_max_uint(&handler::max_supported_keys);
+}
+
+
+uint ha_partition::extra_rec_buf_length() const
+{
+  handler **file;
+  uint max= (*m_file)->extra_rec_buf_length();
+  for (file= m_file, file++; *file; file++)
+    if (max < (*file)->extra_rec_buf_length())
+      max= (*file)->extra_rec_buf_length();
+  return max;
+}
+
+
+uint ha_partition::min_record_length(uint options) const
+{
+  handler **file;
+  uint max= (*m_file)->min_record_length(options);
+  for (file= m_file, file++; *file; file++)
+    if (max < (*file)->min_record_length(options))
+      max= (*file)->min_record_length(options);
+  return max;
+}
+
+
+/****************************************************************************
+  MODULE compare records
+****************************************************************************/
+/*
+  We get two references and need to check if those records are the same.
+  If they belong to different partitions we decide that they are not
+  the same record. Otherwise we use the particular handler to decide if
+  they are the same. Sort in partition id order if not equal.
+*/
+
+int ha_partition::cmp_ref(const byte *ref1, const byte *ref2)
+{
+  uint part_id;
+  my_ptrdiff_t diff1, diff2;
+  handler *file;
+  DBUG_ENTER("ha_partition::cmp_ref");
+  if ((ref1[0] == ref2[0]) && (ref1[1] == ref2[1]))
+  {
+    part_id= get_part_id_from_pos(ref1);
+    file= m_file[part_id];
+    DBUG_ASSERT(part_id < m_tot_parts);
+    DBUG_RETURN(file->cmp_ref((ref1 + PARTITION_BYTES_IN_POS),
+                              (ref2 + PARTITION_BYTES_IN_POS)));
+  }
+  diff1= ref2[1] - ref1[1];
+  diff2= ref2[0] - ref1[0];
+  if (diff1 > 0)
+  {
+    DBUG_RETURN(-1);
+  }
+  if (diff1 < 0)
+  {
+    DBUG_RETURN(+1);
+  }
+  if (diff2 > 0)
+  {
+    DBUG_RETURN(-1);
+  }
+  DBUG_RETURN(+1);
+}
+
+
+/****************************************************************************
+  MODULE auto increment
+****************************************************************************/
+
+void ha_partition::restore_auto_increment()
+{
+  DBUG_ENTER("ha_partition::restore_auto_increment");
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  This method is called by update_auto_increment which in turn is called
+  by the individual handlers as part of write_row. We will always let
+  the first handler keep track of the auto increment value for all
+  partitions.
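+  Thus a row written into any partition draws its value from the counter
+  of m_file[0].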
+*/ + +ulonglong ha_partition::get_auto_increment() +{ + DBUG_ENTER("ha_partition::get_auto_increment"); + DBUG_RETURN(m_file[0]->get_auto_increment()); +} + + +/**************************************************************************** + MODULE initialise handler for HANDLER call +****************************************************************************/ + +void ha_partition::init_table_handle_for_HANDLER() +{ + return; +} + + +/**************************************************************************** + MODULE Partition Share +****************************************************************************/ +/* + Service routines for ... methods. +------------------------------------------------------------------------- + Variables for partition share methods. A hash used to track open tables. + A mutex for the hash table and an init variable to check if hash table + is initialised. + There is also a constant ending of the partition handler file name. +*/ + +#ifdef NOT_USED +static HASH partition_open_tables; +static pthread_mutex_t partition_mutex; +static int partition_init= 0; + + +/* + Function we use in the creation of our hash to get key. +*/ +static byte *partition_get_key(PARTITION_SHARE *share, uint *length, + my_bool not_used __attribute__ ((unused))) +{ + *length= share->table_name_length; + return (byte *) share->table_name; +} + +/* + Example of simple lock controls. The "share" it creates is structure we + will pass to each partition handler. Do you have to have one of these? + Well, you have pieces that are used for locking, and they are needed to + function. +*/ + + +static PARTITION_SHARE *get_share(const char *table_name, TABLE *table) +{ + PARTITION_SHARE *share; + uint length; + char *tmp_name; + + /* + So why does this exist? There is no way currently to init a storage + engine. + Innodb and BDB both have modifications to the server to allow them to + do this. Since you will not want to do this, this is probably the next + best method. + */ + if (!partition_init) + { + /* Hijack a mutex for init'ing the storage engine */ + pthread_mutex_lock(&LOCK_mysql_create_db); + if (!partition_init) + { + partition_init++; + VOID(pthread_mutex_init(&partition_mutex, MY_MUTEX_INIT_FAST)); + (void) hash_init(&partition_open_tables, system_charset_info, 32, 0, 0, + (hash_get_key) partition_get_key, 0, 0); + } + pthread_mutex_unlock(&LOCK_mysql_create_db); + } + pthread_mutex_lock(&partition_mutex); + length= (uint) strlen(table_name); + + if (!(share= (PARTITION_SHARE *) hash_search(&partition_open_tables, + (byte *) table_name, length))) + { + if (!(share= (PARTITION_SHARE *) + my_multi_malloc(MYF(MY_WME | MY_ZEROFILL), + &share, sizeof(*share), + &tmp_name, length + 1, NullS))) + { + pthread_mutex_unlock(&partition_mutex); + return NULL; + } + + share->use_count= 0; + share->table_name_length= length; + share->table_name= tmp_name; + strmov(share->table_name, table_name); + if (my_hash_insert(&partition_open_tables, (byte *) share)) + goto error; + thr_lock_init(&share->lock); + pthread_mutex_init(&share->mutex, MY_MUTEX_INIT_FAST); + } + share->use_count++; + pthread_mutex_unlock(&partition_mutex); + + return share; + +error: + pthread_mutex_unlock(&partition_mutex); + my_free((gptr) share, MYF(0)); + + return NULL; +} + + +/* + Free lock controls. We call this whenever we close a table. If the table + had the last reference to the share then we free memory associated with + it. 
+*/
+
+static int free_share(PARTITION_SHARE *share)
+{
+  pthread_mutex_lock(&partition_mutex);
+  if (!--share->use_count)
+  {
+    hash_delete(&partition_open_tables, (byte *) share);
+    thr_lock_delete(&share->lock);
+    pthread_mutex_destroy(&share->mutex);
+    my_free((gptr) share, MYF(0));
+  }
+  pthread_mutex_unlock(&partition_mutex);
+
+  return 0;
+}
+#endif /* NOT_USED */
+#endif /* HAVE_PARTITION_DB */
diff --git a/sql/ha_partition.h b/sql/ha_partition.h
new file mode 100644
index 00000000000..858bf09ecaa
--- /dev/null
+++ b/sql/ha_partition.h
@@ -0,0 +1,922 @@
+/* Copyright (C) 2005 MySQL AB
+
+  This program is free software; you can redistribute it and/or modify
+  it under the terms of the GNU General Public License as published by
+  the Free Software Foundation; either version 2 of the License, or
+  (at your option) any later version.
+
+  This program is distributed in the hope that it will be useful,
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  GNU General Public License for more details.
+
+  You should have received a copy of the GNU General Public License
+  along with this program; if not, write to the Free Software
+  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#ifdef __GNUC__
+#pragma interface                               /* gcc class implementation */
+#endif
+
+/*
+  PARTITION_SHARE is a structure that will be shared among all open
+  handlers. The partition implements the minimum of what you will
+  probably need.
+*/
+
+typedef struct st_partition_share
+{
+  char *table_name;
+  uint table_name_length, use_count;
+  pthread_mutex_t mutex;
+  THR_LOCK lock;
+} PARTITION_SHARE;
+
+
+#define PARTITION_BYTES_IN_POS 2
+class ha_partition :public handler
+{
+private:
+  enum partition_index_scan_type
+  {
+    partition_index_read= 0,
+    partition_index_first= 1,
+    partition_index_last= 2,
+    partition_no_index_scan= 3
+  };
+  /* Data for the partition handler */
+  char *m_file_buffer;                  // Buffer with names
+  char *m_name_buffer_ptr;              // Pointer to first partition name
+  uchar *m_engine_array;                // Array of types of the handlers
+  handler **m_file;                     // Array of references to handler inst.
+  partition_info *m_part_info;          // local reference to partition
+  byte *m_start_key_ref;                // Reference of start key in current
+                                        // index scan info
+  Field **m_part_field_array;           // Part field array locally to save acc
+  byte *m_ordered_rec_buffer;           // Row and key buffer for ord. idx scan
+  KEY *m_curr_key_info;                 // Current index
+  byte *m_rec0;                         // table->record[0]
+  QUEUE queue;                          // Prio queue used by sorted read
+  /*
+    Since the partition handler is a handler on top of other handlers, it
+    is necessary to keep information about what the underlying handlers'
+    characteristics are. It is not possible to keep any handler instances
+    for this since the MySQL Server sometimes allocates handler objects
+    without freeing them.
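+
+    Note also the layout of the row position produced by position() and
+    consumed by rnd_pos(): the first PARTITION_BYTES_IN_POS (2) bytes
+    hold the partition id, followed by the ref of the underlying handler:
+
+      ref:  [ part id (2 bytes) | underlying handler's ref ]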
+  */
+  u_long m_table_flags;
+  u_long m_low_byte_first;
+
+  uint m_tot_parts;                     // Total number of partitions
+  uint m_last_part;                     // Last file that we update,write
+  int m_lock_type;                      // Remembers type of last
+                                        // external_lock
+  part_id_range m_part_spec;            // Which parts to scan
+  uint m_scan_value;                    // Value passed in rnd_init
+                                        // call
+  uint m_ref_length;                    // Length of position in this
+                                        // handler object
+  key_range m_start_key;                // index read key range
+  enum partition_index_scan_type m_index_scan_type;  // What type of index
+                                                     // scan
+  uint m_top_entry;                     // Which partition is to
+                                        // deliver next result
+  uint m_rec_length;                    // Local copy of record length
+
+  bool m_ordered;                       // Ordered/Unordered index scan
+  bool m_has_transactions;              // Can we support transactions
+  bool m_pkey_is_clustered;             // Is primary key clustered
+  bool m_create_handler;                // Handler used to create table
+  bool m_is_sub_partitioned;            // Is subpartitioned
+  bool m_ordered_scan_ongoing;
+  bool m_use_bit_array;
+
+  /*
+    We keep track of whether all underlying handlers are MyISAM since
+    MyISAM has a great number of extra flags not needed by other handlers.
+  */
+  bool m_myisam;                        // Are all underlying handlers
+                                        // MyISAM
+  /*
+    We keep track of InnoDB handlers below since InnoDB requires proper
+    setting of query_id in fields at index_init and index_read calls.
+  */
+  bool m_innodb;                        // Are all underlying handlers
+                                        // InnoDB
+  /*
+    When calling extra(HA_EXTRA_CACHE) we do not pass this to the underlying
+    handlers immediately. Instead we cache it and call the underlying
+    handler immediately before starting the scan on the partition. This is
+    to prevent allocating a READ CACHE for each partition in parallel when
+    performing a full table scan on a MyISAM partitioned table.
+    This state is cleared by extra(HA_EXTRA_NO_CACHE).
+  */
+  bool m_extra_cache;
+  uint m_extra_cache_size;
+
+  void init_handler_variables();
+  /*
+    Variables for lock structures.
+  */
+  THR_LOCK_DATA lock;                   /* MySQL lock */
+  PARTITION_SHARE *share;               /* Shared lock info */
+
+public:
+  /*
+    -------------------------------------------------------------------------
+    MODULE create/delete handler object
+    -------------------------------------------------------------------------
+    Object create/delete methods. The normal one is called when a table
+    object exists. There is also a method to create the handler object with
+    only partition information. This is used from mysql_create_table when
+    the table is to be created and the engine type is deduced to be the
+    partition handler.
+    -------------------------------------------------------------------------
+  */
+  ha_partition(TABLE * table);
+  ha_partition(partition_info * part_info);
+  ~ha_partition();
+  /*
+    A partition handler has no characteristics in itself. It only inherits
+    those from the underlying handlers. Here we set up those constants to
+    enable later calls of the methods to retrieve constants from the under-
+    lying handlers. Returns false if not successful.
+  */
+  int ha_initialise();
+
+  /*
+    -------------------------------------------------------------------------
+    MODULE meta data changes
+    -------------------------------------------------------------------------
+    Meta data routines to CREATE, DROP, RENAME table, often also used at
+    ALTER TABLE (update_create_info used from ALTER TABLE and SHOW ..).
+
+    update_table_comment is used in SHOW TABLE commands to provide a
+    chance for the handler to add any interesting comments to the table
+    comments not provided by the user's comment.
+
+    create_handler_files is called before opening a new handler object
+    with openfrm to call create. It is used to create any local handler
+    object needed in opening the object in openfrm.
+    -------------------------------------------------------------------------
+  */
+  virtual int delete_table(const char *from);
+  virtual int rename_table(const char *from, const char *to);
+  virtual int create(const char *name, TABLE * form,
+                     HA_CREATE_INFO * create_info);
+  virtual int create_handler_files(const char *name);
+  virtual void update_create_info(HA_CREATE_INFO * create_info);
+  virtual char *update_table_comment(const char *comment);
+  virtual int drop_partitions(const char *path);
+private:
+  /*
+    delete_table, rename_table and create use very similar logic which
+    is packed into this routine.
+  */
+  uint del_ren_cre_table(const char *from,
+                         const char *to= NULL,
+                         TABLE * table_arg= NULL,
+                         HA_CREATE_INFO * create_info= NULL);
+  /*
+    One method to create the table_name.par file containing the names of
+    the underlying partitions, their engine and the number of partitions,
+    and one method to read it in.
+  */
+  bool create_handler_file(const char *name);
+  bool get_from_handler_file(const char *name);
+  bool new_handlers_from_part_info();
+  bool create_handlers();
+  void clear_handler_file();
+  void set_up_table_before_create(TABLE * table_arg, HA_CREATE_INFO * info,
+                                  uint part_id);
+  partition_element *find_partition_element(uint part_id);
+public:
+
+  /*
+    -------------------------------------------------------------------------
+    MODULE open/close object
+    -------------------------------------------------------------------------
+    Open and close handler object to ensure all underlying files and
+    objects allocated and deallocated for query handling are handled
+    properly.
+    -------------------------------------------------------------------------
+
+    A handler object is opened as part of its initialisation and before
+    being used for normal queries (not always before meta-data changes).
+    If the object was opened it will also be closed before being deleted.
+  */
+  virtual int open(const char *name, int mode, uint test_if_locked);
+  virtual int close(void);
+
+  /*
+    -------------------------------------------------------------------------
+    MODULE start/end statement
+    -------------------------------------------------------------------------
+    This module contains methods that are used to understand start/end of
+    statements, transaction boundaries, and aid for proper concurrency
+    control.
+    The partition handler need not implement abort and commit since this
+    will be handled by any underlying handlers implementing transactions.
+    There is only one call to each handler type involved per transaction
+    and these go directly to the handlers supporting transactions
+    (currently InnoDB, BDB and NDB).
+    -------------------------------------------------------------------------
+  */
+  virtual THR_LOCK_DATA **store_lock(THD * thd, THR_LOCK_DATA ** to,
+                                     enum thr_lock_type lock_type);
+  virtual int external_lock(THD * thd, int lock_type);
+  /*
+    When the table is locked a statement is started by calling start_stmt
+    instead of external_lock
+  */
+  virtual int start_stmt(THD * thd);
+  /*
+    Lock count is the number of locked underlying handlers (I assume)
+  */
+  virtual uint lock_count(void) const;
+  /*
+    Call to unlock rows not to be updated in transaction
+  */
+  virtual void unlock_row();
+
+  /*
+    -------------------------------------------------------------------------
+    MODULE change record
+    -------------------------------------------------------------------------
+    This part of the handler interface is used to change the records
+    after INSERT, DELETE, UPDATE, REPLACE method calls but also for other
+    special meta-data operations such as ALTER TABLE, LOAD DATA, TRUNCATE.
+    -------------------------------------------------------------------------
+
+    These methods are used for insert (write_row), update (update_row)
+    and delete (delete_row). All methods to change data always work on
+    one row at a time. update_row and delete_row also contain the old
+    row.
+    delete_all_rows will delete all rows in the table in one call as a
+    special optimisation for DELETE from table;
+
+    Bulk inserts are supported if all underlying handlers support them.
+    start_bulk_insert and end_bulk_insert are called before and after a
+    number of calls to write_row. This is not implemented yet, though.
+  */
+  virtual int write_row(byte * buf);
+  virtual int update_row(const byte * old_data, byte * new_data);
+  virtual int delete_row(const byte * buf);
+  virtual int delete_all_rows(void);
+  virtual void start_bulk_insert(ha_rows rows);
+  virtual int end_bulk_insert();
+
+  /*
+    -------------------------------------------------------------------------
+    MODULE full table scan
+    -------------------------------------------------------------------------
+    This module is used for the most basic access method for any table
+    handler. This is to fetch all data through a full table scan. No
+    indexes are needed to implement this part.
+    It contains one method to start the scan (rnd_init) that can also be
+    called multiple times (typical in a nested loop join). Then proceeding
+    to the next record (rnd_next) and closing the scan (rnd_end).
+    To remember a record for later access there is a method (position)
+    and there is a method used to retrieve the record based on the stored
+    position.
+    The position can be a file position, a primary key, a ROWID dependent
+    on the handler below.
+    -------------------------------------------------------------------------
+  */
+  /*
+    Unlike index_init(), rnd_init() can be called two times
+    without rnd_end() in between (it only makes sense if scan=1).
+    Then the second call should prepare for the new table scan
+    (e.g. if rnd_init allocates the cursor, the second call should
+    position it to the start of the table; there is no need to
+    deallocate and allocate it again).
+  */
+  virtual int rnd_init(bool scan);
+  virtual int rnd_end();
+  virtual int rnd_next(byte * buf);
+  virtual int rnd_pos(byte * buf, byte * pos);
+  virtual void position(const byte * record);
+
+  /*
+    -------------------------------------------------------------------------
+    MODULE index scan
+    -------------------------------------------------------------------------
+    This part of the handler interface is used to perform access through
+    indexes.
+
+  /*
+    -------------------------------------------------------------------------
+    MODULE index scan
+    -------------------------------------------------------------------------
+    This part of the handler interface is used to perform access through
+    indexes. The interface is defined as a scan interface but the handler
+    can also use key lookup if the index is a unique index or a primary
+    key index.
+    Index scans are mostly useful for SELECT queries but are also an
+    important part of UPDATE, DELETE, REPLACE, CREATE TABLE table AS SELECT
+    and so forth.
+    Naturally an index is needed for an index scan and indexes can be
+    either ordered or hash based. Some ordered indexes can return data in
+    order but not necessarily all of them.
+    There are many flags that define the behaviour of indexes in the
+    various handlers. The methods using them are found in the optimizer
+    module.
+    -------------------------------------------------------------------------
+
+    index_read is called to start a scan of an index. The find_flag defines
+    the semantics of the scan. These flags are defined in
+    include/my_base.h.
+    index_read_idx is the same but also initializes the index before doing
+    the same thing as index_read. Thus it is similar to index_init followed
+    by index_read. This is also how we implement it.
+
+    index_read/index_read_idx also return the first row. Thus for key
+    lookups, index_read will be the only call to the handler in the
+    index scan.
+
+    index_init initializes an index before using it and index_end does
+    any end processing needed.
+  */
+  virtual int index_read(byte * buf, const byte * key,
+                         uint key_len, enum ha_rkey_function find_flag);
+  virtual int index_read_idx(byte * buf, uint idx, const byte * key,
+                             uint key_len, enum ha_rkey_function find_flag);
+  virtual int index_init(uint idx, bool sorted);
+  virtual int index_end();
+
+  /*
+    These methods are used to jump to the next or previous entry in the
+    index scan. There are also methods to jump to the first and last entry.
+  */
+  virtual int index_next(byte * buf);
+  virtual int index_prev(byte * buf);
+  virtual int index_first(byte * buf);
+  virtual int index_last(byte * buf);
+  virtual int index_next_same(byte * buf, const byte * key, uint keylen);
+  virtual int index_read_last(byte * buf, const byte * key, uint keylen);
+
+  /*
+    read_first_row is a virtual method that is only implemented in
+    handler.cc; no storage engine has implemented it, so neither will
+    the partition handler.
+
+    virtual int read_first_row(byte *buf, uint primary_key);
+  */
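A toy model of the key-lookup flavour of this interface, under the same caveat that toy_index and its int keys are invented for exposition: index_read positions on the first match and returns it, and index_next_same continues while the key still matches (HA_READ_KEY_EXACT semantics).

#include <algorithm>
#include <vector>

struct toy_index {
  std::vector<int> keys_{1, 3, 3, 3, 7};   // a sorted "index"
  size_t cur_= 0;

  int index_init() { cur_= 0; return 0; }
  int index_read(int *buf, int key)        // also returns the first row
  {
    auto it= std::lower_bound(keys_.begin(), keys_.end(), key);
    if (it == keys_.end() || *it != key) return -1;  // "key not found"
    cur_= it - keys_.begin();
    *buf= keys_[cur_];
    return 0;
  }
  int index_next_same(int *buf, int key)   // continue while key matches
  {
    if (cur_ + 1 >= keys_.size() || keys_[cur_ + 1] != key)
      return -1;                                     // "end of file"
    *buf= keys_[++cur_];
    return 0;
  }
  int index_end() { return 0; }
};

int main()
{
  toy_index idx;
  int row= 0, found= 0;
  idx.index_init();
  for (int err= idx.index_read(&row, 3); !err;
       err= idx.index_next_same(&row, 3))
    found++;                   // visits the three rows with key 3
  idx.index_end();
  return found == 3 ? 0 : 1;
}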
+
+  /*
+    We don't implement multi read range yet, will do later.
+
+    virtual int read_multi_range_first(KEY_MULTI_RANGE **found_range_p,
+                                       KEY_MULTI_RANGE *ranges,
+                                       uint range_count,
+                                       bool sorted, HANDLER_BUFFER *buffer);
+    virtual int read_multi_range_next(KEY_MULTI_RANGE **found_range_p);
+  */
+
+  virtual int read_range_first(const key_range * start_key,
+                               const key_range * end_key,
+                               bool eq_range, bool sorted);
+  virtual int read_range_next();
+
+private:
+  int common_index_read(byte * buf, const byte * key,
+                        uint key_len, enum ha_rkey_function find_flag);
+  int common_first_last(byte * buf);
+  int partition_scan_set_up(byte * buf, bool idx_read_flag);
+  int handle_unordered_next(byte * buf, bool next_same);
+  int handle_unordered_scan_next_partition(byte * buf);
+  byte *queue_buf(uint part_id)
+  {
+    return (m_ordered_rec_buffer +
+            (part_id * (m_rec_length + PARTITION_BYTES_IN_POS)));
+  }
+  byte *rec_buf(uint part_id)
+  {
+    return (queue_buf(part_id) +
+            PARTITION_BYTES_IN_POS);
+  }
+  int handle_ordered_index_scan(byte * buf);
+  int handle_ordered_next(byte * buf, bool next_same);
+  int handle_ordered_prev(byte * buf);
+  void return_top_record(byte * buf);
+  void include_partition_fields_in_used_fields();
+public:
+  /*
+    -------------------------------------------------------------------------
+    MODULE information calls
+    -------------------------------------------------------------------------
+    These calls are used to inform the handler of specifics of the ongoing
+    scans and other actions. Most of these are used for optimisation
+    purposes.
+    -------------------------------------------------------------------------
+  */
+  virtual void info(uint);
+  virtual int extra(enum ha_extra_function operation);
+  virtual int extra_opt(enum ha_extra_function operation, ulong cachesize);
+  virtual int reset(void);
+
+private:
+  static const uint NO_CURRENT_PART_ID= 0xFFFFFFFF;
+  int loop_extra(enum ha_extra_function operation);
+  void late_extra_cache(uint partition_id);
+  void late_extra_no_cache(uint partition_id);
+  void prepare_extra_cache(uint cachesize);
+public:
+
+  /*
+    -------------------------------------------------------------------------
+    MODULE optimiser support
+    -------------------------------------------------------------------------
+    -------------------------------------------------------------------------
+  */
+
+  /*
+    NOTE !!!!!!
+    -------------------------------------------------------------------------
+    -------------------------------------------------------------------------
+    One important part of the public handler interface that is not visible
+    in the methods is the attribute
+      records
+    which is defined in the base class. It is read directly and is
+    set by calling info(HA_STATUS_INFO)?
+    -------------------------------------------------------------------------
+  */
+
+  /*
+    keys_to_use_for_scanning can probably be implemented as the
+    intersection of all underlying handlers if mixed handlers are used.
+    This method is used to derive whether an index can be used for
+    index-only scanning when performing an ORDER BY query.
+    Only called from one place in sql_select.cc.
+  */
+  virtual const key_map *keys_to_use_for_scanning();
+
+  /*
+    Called in test_quick_select to determine if indexes should be used.
+  */
+  virtual double scan_time();
+
+  /*
+    The next method will never be called if you do not implement indexes.
+  */
+  virtual double read_time(uint index, uint ranges, ha_rows rows);
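The private helpers above (queue_buf, rec_buf, handle_ordered_index_scan) suggest a classic k-way merge: each partition returns rows in index order and a priority queue keyed on the current row picks the global minimum. A hedged sketch of that idea, with std::priority_queue and int sort keys standing in for the real record-buffer machinery:

#include <cstdint>
#include <queue>
#include <vector>

// Each queue element carries the partition id and that partition's
// current key, mirroring the "partition id bytes + record" layout of
// m_ordered_rec_buffer described above.
struct queue_entry {
  uint32_t part_id;
  int key;                      // stands in for the record's sort key
  bool operator>(const queue_entry &o) const { return key > o.key; }
};

// Merge rows that each partition returns in index order into one
// globally ordered stream (the idea behind handle_ordered_index_scan).
std::vector<int> ordered_scan(const std::vector<std::vector<int>> &parts)
{
  std::priority_queue<queue_entry, std::vector<queue_entry>,
                      std::greater<queue_entry>> queue;
  std::vector<size_t> pos(parts.size(), 0);
  std::vector<int> result;

  for (uint32_t p= 0; p < parts.size(); p++)   // prime one row per partition
    if (!parts[p].empty())
    {
      queue.push({p, parts[p][0]});
      pos[p]= 1;
    }

  while (!queue.empty())                       // pop the smallest, refill
  {
    queue_entry top= queue.top();
    queue.pop();
    result.push_back(top.key);                 // cf. return_top_record
    uint32_t p= top.part_id;
    if (pos[p] < parts[p].size())
      queue.push({p, parts[p][pos[p]++]});
  }
  return result;
}

int main()
{
  std::vector<std::vector<int>> parts= {{1, 4}, {2, 3}, {0, 5}};
  return ordered_scan(parts).front() == 0 ? 0 : 1;
}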
+  /*
+    For the given range, how many records are estimated to be in this range.
+    Used by the optimiser to calculate the cost of using a particular index.
+  */
+  virtual ha_rows records_in_range(uint inx, key_range * min_key,
+                                   key_range * max_key);
+
+  /*
+    The upper bound of the number of records returned in a scan is the sum
+    over all underlying handlers.
+  */
+  virtual ha_rows estimate_rows_upper_bound();
+
+  /*
+    table_cache_type is implemented by the underlying handler but all
+    underlying handlers must have the same implementation for it to work.
+  */
+  virtual uint8 table_cache_type();
+
+  /*
+    -------------------------------------------------------------------------
+    MODULE print messages
+    -------------------------------------------------------------------------
+    This module contains various methods that return text messages for
+    table types, index types and error messages.
+    -------------------------------------------------------------------------
+  */
+  /*
+    The name of the index type that will be used for display.
+    Here we must ensure that all handlers use the same index type
+    for each index created.
+  */
+  virtual const char *index_type(uint inx);
+
+  /* The name of the table type that will be used for display purposes */
+  virtual const char *table_type() const
+  { return "PARTITION"; }
+
+  /*
+    Handler specific error messages
+  */
+  virtual void print_error(int error, myf errflag);
+  virtual bool get_error_message(int error, String * buf);
+  /*
+    -------------------------------------------------------------------------
+    MODULE handler characteristics
+    -------------------------------------------------------------------------
+    This module contains a number of methods defining limitations and
+    characteristics of the handler. The partition handler will calculate
+    these characteristics based on the underlying handlers' characteristics.
+    -------------------------------------------------------------------------
+
+    This is a list of flags that says what the storage engine
+    implements. The current table flags are documented in handler.h.
+    The partition handler will support whatever the underlying handlers
+    support, except when specifically mentioned below about exceptions
+    to this rule.
+
+    HA_READ_RND_SAME:
+    Not currently used. (Means that the handler supports the rnd_same()
+    call.)
+    (MyISAM, HEAP)
+
+    HA_TABLE_SCAN_ON_INDEX:
+    Used to avoid scanning full tables on an index. If this flag is set then
+    the handler always has a primary key (hidden if not defined) and this
+    index is used for scanning rather than a full table scan in all
+    situations.
+    (InnoDB, BDB, Federated)
+
+    HA_REC_NOT_IN_SEQ:
+    This flag is set for handlers that cannot guarantee that the rows are
+    returned according to incremental positions (0, 1, 2, 3...).
+    This also means that rnd_next() should return HA_ERR_RECORD_DELETED
+    if it finds a deleted row.
+    (MyISAM (not fixed length row), BDB, HEAP, NDB, InnoDB)
+
+    HA_CAN_GEOMETRY:
+    Can the storage engine handle spatial data?
+    Used to check that no spatial attributes are declared unless
+    the storage engine is capable of handling them.
+    (MyISAM)
+
+    HA_FAST_KEY_READ:
+    Setting this flag indicates that the handler is equally fast in
+    finding a row by key as by position.
+    This flag is used in a very special situation in conjunction with
+    filesorts. For further explanation see the intro to init_read_record.
+    (BDB, HEAP, InnoDB)
+
+    HA_NULL_IN_KEY:
+    Are NULL values allowed in indexes?
+    If this is not allowed then it is not possible to use an index on a
+    NULLable field.
+    (BDB, HEAP, MyISAM, NDB, InnoDB)
+
+    HA_DUPP_POS:
+    Tells that the position of the conflicting duplicate key record is
+    stored in table->file->dupp_ref. (Insert uses rnd_pos() on this to
+    find the duplicated row.)
+    (MyISAM)
+
+    HA_CAN_INDEX_BLOBS:
+    Is the storage engine capable of defining an index on a prefix of
+    a BLOB attribute?
+    (BDB, Federated, MyISAM, InnoDB)
+
+    HA_AUTO_PART_KEY:
+    Auto increment fields can be part of a multi-part key. For second part
+    auto-increment keys, the auto-incrementing is done in handler.cc.
+    (BDB, Federated, MyISAM, NDB)
+
+    HA_REQUIRE_PRIMARY_KEY:
+    Can't define a table without a primary key (and cannot handle a table
+    with a hidden primary key).
+    (No handler has this limitation currently)
+
+    HA_NOT_EXACT_COUNT:
+    Does the counter of records after the info call specify an exact
+    value or not? If it doesn't, this flag is set.
+    Only MyISAM and HEAP use an exact count.
+    (MyISAM, HEAP, BDB, InnoDB, NDB, Federated)
+
+    HA_CAN_INSERT_DELAYED:
+    Can the storage engine support delayed inserts?
+    To start with, the partition handler will not support delayed inserts.
+    Further investigation needed.
+    (HEAP, MyISAM)
+
+    HA_PRIMARY_KEY_IN_READ_INDEX:
+    This parameter is set when the handler will also return the primary key
+    when doing read-only-key on another index.
+
+    HA_NOT_DELETE_WITH_CACHE:
+    Seems to be an old MyISAM feature that is no longer used. No handler
+    has it defined but it is checked in init_read_record.
+    Further investigation needed.
+    (No handler defines it)
+
+    HA_NO_PREFIX_CHAR_KEYS:
+    Indexes on prefixes of character fields are not allowed.
+    (NDB)
+
+    HA_CAN_FULLTEXT:
+    Does the storage engine support fulltext indexes?
+    The partition handler will start by not supporting fulltext indexes.
+    (MyISAM)
+
+    HA_CAN_SQL_HANDLER:
+    Can the HANDLER interface in the MySQL API be used towards this
+    storage engine?
+    (MyISAM, InnoDB)
+
+    HA_NO_AUTO_INCREMENT:
+    Set if the storage engine does not support auto increment fields.
+    (Currently not set by any handler)
+
+    HA_HAS_CHECKSUM:
+    Special MyISAM feature. Has special SQL support in CREATE TABLE.
+    No special handling needed by the partition handler.
+    (MyISAM)
+
+    HA_FILE_BASED:
+    Should file names always be in lower case (used by engines
+    that map table names to file names)?
+    Since the partition handler has a local file this flag is set.
+    (BDB, Federated, MyISAM)
+
+    HA_CAN_BIT_FIELD:
+    Is the storage engine capable of handling bit fields?
+    (MyISAM, NDB)
+
+    HA_NEED_READ_RANGE_BUFFER:
+    Is Read Multi-Range supported => need a multi read range buffer.
+    This parameter specifies whether a buffer for read multi range
+    is needed by the handler. Whether the handler supports this
+    feature or not depends on whether the handler implements the
+    read_multi_range* calls or not. The only handler currently
+    supporting this feature is NDB so the partition handler need
+    not handle this call. There are methods in handler.cc that will
+    transfer those calls into index_read and other calls in the
+    index scan module.
+    (NDB)
+  */
+  virtual ulong alter_table_flags(void) const
+  {
+    //return HA_ONLINE_ADD_EMPTY_PARTITION + HA_ONLINE_DROP_PARTITION;
+    return HA_ONLINE_DROP_PARTITION;
+  }
+  virtual ulong table_flags() const
+  { return m_table_flags; }
+  /*
+    HA_CAN_PARTITION:
+    Used by storage engines that can handle partitioning without this
+    partition handler.
+    (Partition, NDB)
+
+    HA_CAN_UPDATE_PARTITION_KEY:
+    Set if the handler can update fields that are part of the partition
+    function.
+
+    HA_CAN_PARTITION_UNIQUE:
+    Set if the handler can handle unique indexes where the fields of the
+    unique key are not part of the fields of the partition function. Thus
+    a unique key can be set on all fields.
+  */
+  virtual ulong partition_flags() const
+  { return HA_CAN_PARTITION; }
+
+  /*
+    This is a bitmap of flags that says how the storage engine
+    implements indexes. The current index flags are documented in
+    handler.h. If you do not implement indexes, just return zero
+    here.
+
+    part is the key part to check. The first key part is 0.
+    If all_parts is set, MySQL wants to know the flags for the combined
+    index up to and including 'part'.
+
+    HA_READ_NEXT:
+    Does the index support read next? This is assumed in the server
+    code and never checked, so all indexes must support this.
+    Note that the handler can be used even if it doesn't have any index.
+    (BDB, HEAP, MyISAM, Federated, NDB, InnoDB)
+
+    HA_READ_PREV:
+    Can the index be used to scan backwards?
+    (BDB, HEAP, MyISAM, NDB, InnoDB)
+
+    HA_READ_ORDER:
+    Can the index deliver its records in index order? Typically true for
+    all ordered indexes and not true for hash indexes.
+    In the first step this is not true for the partition handler until a
+    merge sort has been implemented in the partition handler.
+    Used to set the keymap part_of_sortkey.
+    This keymap is only used to find indexes usable for resolving an ORDER BY
+    in the query. Thus in most cases index_read will work just fine without
+    order in result production. When this flag is set it is, however, safe to
+    order all output started by index_read since most engines do this. With
+    read_multi_range calls there is a specific flag requesting ordered or
+    unordered output, so in those cases ordering of the index output can be
+    avoided.
+    (BDB, InnoDB, HEAP, MyISAM, NDB)
+
+    HA_READ_RANGE:
+    Specifies whether the index can handle ranges; typically true for all
+    ordered indexes and not true for hash indexes.
+    Used by the optimiser to check if ranges (such as key >= 5) can be
+    optimised by the index.
+    (BDB, InnoDB, NDB, MyISAM, HEAP)
+
+    HA_ONLY_WHOLE_INDEX:
+    Can't use part key searches. This is typically true for hash indexes
+    and typically not true for ordered indexes.
+    (Federated, NDB, HEAP)
+
+    HA_KEYREAD_ONLY:
+    Does the storage engine support index-only scans on this index?
+    Enables use of HA_EXTRA_KEYREAD and HA_EXTRA_NO_KEYREAD.
+    Used to set the key_map keys_for_keyread and to check in the optimiser
+    for index-only scans. When doing a read under HA_EXTRA_KEYREAD the
+    handler only has to fill in the columns the key covers. If
+    HA_PRIMARY_KEY_IN_READ_INDEX is set then the PRIMARY KEY columns
+    must also be updated in the row.
+    (BDB, InnoDB, MyISAM)
+  */
+  virtual ulong index_flags(uint inx, uint part, bool all_parts) const
+  {
+    return m_file[0]->index_flags(inx, part, all_parts);
+  }
+
+  /*
+    Extensions of table handler files
+  */
+  virtual const char **bas_ext() const;
+  /*
+    unireg.cc will call the following to make sure that the storage engine
+    can handle the data it is about to send.
+
+    Each maximum supported value is the minimum of that value over all
+    handlers in the table.
+  */
+  uint min_of_the_max_uint(uint (handler::*operator_func)(void) const) const;
+  virtual uint max_supported_record_length() const;
+  virtual uint max_supported_keys() const;
+  virtual uint max_supported_key_parts() const;
+  virtual uint max_supported_key_length() const;
+  virtual uint max_supported_key_part_length() const;
+
+  /*
+    All handlers in a partitioned table must have the same low_byte_first.
+  */
+  virtual bool low_byte_first() const
+  { return m_low_byte_first; }
+
+  /*
+    The extra record buffer length is the maximum needed by all handlers.
+    The minimum record length is the maximum over all involved handlers.
+  */
+  virtual uint extra_rec_buf_length() const;
+  virtual uint min_record_length(uint options) const;
+
+  /*
+    Transactions on the table are supported if all handlers below support
+    transactions.
+  */
+  virtual bool has_transactions()
+  { return m_has_transactions; }
+
+  /*
+    primary_key_is_clustered can only be true if all underlying handlers
+    have this feature.
+  */
+  virtual bool primary_key_is_clustered()
+  { return m_pkey_is_clustered; }
+
+  /*
+    -------------------------------------------------------------------------
+    MODULE compare records
+    -------------------------------------------------------------------------
+    cmp_ref checks if two references are the same. For most handlers this is
+    a simple memcmp of the reference. However, some handlers use a primary
+    key as reference and this can be the same even if memcmp says the
+    references are different. This is due to character sets, end spaces and
+    so forth.
+    For the partition handler the reference is first two bytes providing the
+    partition identity of the referred record and then the reference of the
+    underlying handler.
+    Thus cmp_ref for the partition handler always reports records in
+    different partitions as non-equal and uses cmp_ref on the underlying
+    handler to check whether the rest of the reference part is also the same.
+    -------------------------------------------------------------------------
+  */
+  virtual int cmp_ref(const byte * ref1, const byte * ref2);
+  /*
+    -------------------------------------------------------------------------
+    MODULE auto increment
+    -------------------------------------------------------------------------
+    This module is used to handle the support of auto increments.
+
+    The variable below in the handler is used as part of the handler
+    interface. It is maintained by the parent handler object and should not
+    be touched by child handler objects (see handler.cc for its use):
+
+      auto_increment_column_changed
+    -------------------------------------------------------------------------
+  */
+  virtual void restore_auto_increment();
+  virtual ulonglong get_auto_increment();
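The compare-records comment above translates almost directly into code. A hedged sketch, where kPartBytes and underlying_cmp_ref are illustrative stand-ins for PARTITION_BYTES_IN_POS and the child handler's cmp_ref:

#include <cstdint>
#include <cstring>

// The reference starts with two bytes identifying the partition of the
// referred record, as described above.
static const size_t kPartBytes= 2;

// Hypothetical underlying compare; a real partition handler would call
// the child handler's cmp_ref on the remainder of the reference.
static int underlying_cmp_ref(const uint8_t *r1, const uint8_t *r2,
                              size_t len)
{
  return memcmp(r1, r2, len);
}

// Records in different partitions can never be the same row, so compare
// the partition id prefix first and only then the underlying reference.
int partition_cmp_ref(const uint8_t *ref1, const uint8_t *ref2,
                      size_t ref_len)
{
  int diff= memcmp(ref1, ref2, kPartBytes);
  if (diff)
    return diff;                       // different partitions
  return underlying_cmp_ref(ref1 + kPartBytes, ref2 + kPartBytes,
                            ref_len - kPartBytes);
}

int main()
{
  uint8_t a[4]= {0, 1, 3, 9}, b[4]= {0, 1, 7, 9};  // same partition prefix
  return partition_cmp_ref(a, b, 4) < 0 ? 0 : 1;   // underlying ref differs
}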
+
+  /*
+    -------------------------------------------------------------------------
+    MODULE initialise handler for HANDLER call
+    -------------------------------------------------------------------------
+    This method is a special InnoDB method called before a HANDLER query.
+    -------------------------------------------------------------------------
+  */
+  virtual void init_table_handle_for_HANDLER();
+
+  /*
+    The remainder of this file defines the handler methods not implemented
+    by the partition handler.
+  */
+
+  /*
+    -------------------------------------------------------------------------
+    MODULE foreign key support
+    -------------------------------------------------------------------------
+    The following methods are used to implement foreign keys as supported by
+    InnoDB. Implement this??
+    get_foreign_key_create_info is used by SHOW CREATE TABLE to get a textual
+    description of how the CREATE TABLE part defining FOREIGN KEYs is done.
+    free_foreign_key_create_info is used to free the memory area that
+    provided this description.
+    -------------------------------------------------------------------------
+
+    virtual char* get_foreign_key_create_info()
+    virtual void free_foreign_key_create_info(char* str)
+
+    virtual int get_foreign_key_list(THD *thd,
+                                     List<FOREIGN_KEY_INFO> *f_key_list)
+    virtual uint referenced_by_foreign_key()
+  */
+
+  /*
+    -------------------------------------------------------------------------
+    MODULE fulltext index
+    -------------------------------------------------------------------------
+    Fulltext stuff not yet.
+    -------------------------------------------------------------------------
+    virtual int ft_init() { return HA_ERR_WRONG_COMMAND; }
+    virtual FT_INFO *ft_init_ext(uint flags, uint inx, const byte *key,
+                                 uint keylen)
+    { return NULL; }
+    virtual int ft_read(byte *buf) { return HA_ERR_WRONG_COMMAND; }
+  */
+
+  /*
+    -------------------------------------------------------------------------
+    MODULE restart full table scan at position (MyISAM)
+    -------------------------------------------------------------------------
+    The following method is only used by MyISAM when a table is used as
+    a temporary table in a join.
+    virtual int restart_rnd_next(byte *buf, byte *pos);
+  */
+
+  /*
+    -------------------------------------------------------------------------
+    MODULE on-line ALTER TABLE
+    -------------------------------------------------------------------------
+    These methods are in the handler interface but are never used (yet).
+    They are to be used by on-line alter table add/drop index:
+    -------------------------------------------------------------------------
+    virtual ulong index_ddl_flags(KEY *wanted_index) const
+    virtual int add_index(TABLE *table_arg, KEY *key_info, uint num_of_keys);
+    virtual int drop_index(TABLE *table_arg, uint *key_num, uint num_of_keys);
+  */
+
+  /*
+    -------------------------------------------------------------------------
+    MODULE tablespace support
+    -------------------------------------------------------------------------
+    Admin of table spaces is not applicable to the partition handler
+    (InnoDB). This means that the following method is not implemented:
+    -------------------------------------------------------------------------
+    virtual int discard_or_import_tablespace(my_bool discard)
+  */
+
+  /*
+    -------------------------------------------------------------------------
+    MODULE admin MyISAM
+    -------------------------------------------------------------------------
+    Admin commands are not supported currently (almost purely MyISAM
+    routines). This means that the following methods are not implemented:
+    -------------------------------------------------------------------------
+
+    virtual int check(THD* thd, HA_CHECK_OPT *check_opt);
+    virtual int backup(THD* thd, HA_CHECK_OPT *check_opt);
+    virtual int restore(THD* thd, HA_CHECK_OPT *check_opt);
+    virtual int repair(THD* thd, HA_CHECK_OPT *check_opt);
+    virtual int optimize(THD* thd, HA_CHECK_OPT *check_opt);
+    virtual int analyze(THD* thd, HA_CHECK_OPT *check_opt);
+    virtual int assign_to_keycache(THD* thd, HA_CHECK_OPT *check_opt);
+    virtual int preload_keys(THD *thd, HA_CHECK_OPT *check_opt);
+    virtual bool check_and_repair(THD *thd);
+    virtual int dump(THD* thd, int fd = -1);
+    virtual int net_read_dump(NET* net);
+    virtual uint checksum() const;
+    virtual bool is_crashed() const;
+    virtual bool auto_repair() const;
+
+    -------------------------------------------------------------------------
+    MODULE enable/disable indexes
+    -------------------------------------------------------------------------
+    Enable/Disable Indexes are not supported currently (HEAP, MyISAM).
+    This means that the following methods are not implemented:
+    -------------------------------------------------------------------------
+    virtual int disable_indexes(uint mode);
+    virtual int enable_indexes(uint mode);
+    virtual int indexes_are_disabled(void);
+  */
+
+  /*
+    -------------------------------------------------------------------------
+    MODULE append_create_info
+    -------------------------------------------------------------------------
+    append_create_info is only used by MyISAM MERGE tables and the partition
+    handler will not support this handler as an underlying handler.
+    Implement this??
+ ------------------------------------------------------------------------- + virtual void append_create_info(String *packet) + */ +}; diff --git a/sql/handler.cc b/sql/handler.cc index 3acca812a13..3e85e73cab5 100644 --- a/sql/handler.cc +++ b/sql/handler.cc @@ -34,6 +34,9 @@ #ifdef HAVE_EXAMPLE_DB #include "examples/ha_example.h" #endif +#ifdef HAVE_PARTITION_DB +#include "ha_partition.h" +#endif #ifdef HAVE_ARCHIVE_DB #include "examples/ha_archive.h" #endif @@ -170,7 +173,13 @@ enum db_type ha_checktype(THD *thd, enum db_type database_type, { if (ha_storage_engine_is_enabled(database_type)) return database_type; - +#ifdef HAVE_PARTITION_DB + /* + Partition handler is not in the list of handlers shown since it is an internal handler + */ + if (database_type == DB_TYPE_PARTITION_DB) + return database_type; +#endif if (no_substitute) { if (report_error) @@ -203,47 +212,66 @@ enum db_type ha_checktype(THD *thd, enum db_type database_type, handler *get_new_handler(TABLE *table, enum db_type db_type) { + handler *file; switch (db_type) { #ifndef NO_HASH case DB_TYPE_HASH: - return new ha_hash(table); + file= new ha_hash(table); + break; #endif case DB_TYPE_MRG_ISAM: - return new ha_myisammrg(table); + file= new ha_myisammrg(table); + break; #ifdef HAVE_BERKELEY_DB case DB_TYPE_BERKELEY_DB: - return new ha_berkeley(table); + file= new ha_berkeley(table); + break; #endif #ifdef HAVE_INNOBASE_DB case DB_TYPE_INNODB: - return new ha_innobase(table); + file= new ha_innobase(table); + break; #endif #ifdef HAVE_EXAMPLE_DB case DB_TYPE_EXAMPLE_DB: - return new ha_example(table); + file= new ha_example(table); + break; +#endif +#ifdef HAVE_PARTITION_DB + case DB_TYPE_PARTITION_DB: + { + file= new ha_partition(table); + break; + } #endif #ifdef HAVE_ARCHIVE_DB case DB_TYPE_ARCHIVE_DB: - return new ha_archive(table); + file= new ha_archive(table); + break; #endif #ifdef HAVE_BLACKHOLE_DB case DB_TYPE_BLACKHOLE_DB: - return new ha_blackhole(table); + file= new ha_blackhole(table); + break; #endif #ifdef HAVE_FEDERATED_DB case DB_TYPE_FEDERATED_DB: - return new ha_federated(table); + file= new ha_federated(table); + break; #endif #ifdef HAVE_CSV_DB case DB_TYPE_CSV_DB: - return new ha_tina(table); + file= new ha_tina(table); + break; #endif #ifdef HAVE_NDBCLUSTER_DB case DB_TYPE_NDBCLUSTER: - return new ha_ndbcluster(table); + file= new ha_ndbcluster(table); + break; #endif case DB_TYPE_HEAP: - return new ha_heap(table); + file= new ha_heap(table); + break; default: // should never happen { enum db_type def=(enum db_type) current_thd->variables.table_type; @@ -253,12 +281,46 @@ handler *get_new_handler(TABLE *table, enum db_type db_type) } /* Fall back to MyISAM */ case DB_TYPE_MYISAM: - return new ha_myisam(table); + file= new ha_myisam(table); + break; case DB_TYPE_MRG_MYISAM: - return new ha_myisammrg(table); + file= new ha_myisammrg(table); + break; + } + if (file) + { + if (file->ha_initialise()) + { + delete file; + file=0; + } } + return file; } + +#ifdef HAVE_PARTITION_DB +handler *get_ha_partition(partition_info *part_info) +{ + ha_partition *partition; + DBUG_ENTER("get_ha_partition"); + if ((partition= new ha_partition(part_info))) + { + if (partition->ha_initialise()) + { + delete partition; + partition= 0; + } + } + else + { + my_error(ER_OUTOFMEMORY, MYF(0), sizeof(ha_partition)); + } + DBUG_RETURN(((handler*) partition)); +} +#endif + + /* Register handler error messages for use with my_error(). 
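The hunk above rewrites get_new_handler from early returns into a single exit so that every engine object passes through the same ha_initialise step, with cleanup on failure. A compressed sketch of that control flow; handler_t, myisam_handler and get_new_handler_sketch are stand-ins for exposition, not the server's types:

#include <cstdio>

struct handler_t {
  virtual bool ha_initialise() { return false; } // false == success
  virtual ~handler_t() {}
};
struct myisam_handler : handler_t {};

enum db_type { DB_TYPE_MYISAM };

// Construct, then run the common initialisation step, and return NULL
// if that step fails, exactly the shape of the hunk above.
handler_t *get_new_handler_sketch(db_type type)
{
  handler_t *file= 0;
  switch (type) {
  case DB_TYPE_MYISAM:
    file= new myisam_handler();
    break;
  }
  if (file && file->ha_initialise())
  {
    delete file;                 // initialisation failed: clean up
    file= 0;
  }
  return file;
}

int main()
{
  handler_t *h= get_new_handler_sketch(DB_TYPE_MYISAM);
  std::printf("%s\n", h ? "ok" : "failed");
  delete h;
}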
@@ -1354,6 +1416,111 @@ int handler::ha_open(const char *name, int mode, int test_if_locked) DBUG_RETURN(error); } +int handler::ha_initialise() +{ + DBUG_ENTER("ha_initialise"); + if (table && table->s->fields && + ha_allocate_read_write_set(table->s->fields)) + { + DBUG_RETURN(TRUE); + } + DBUG_RETURN(FALSE); +} + +int handler::ha_allocate_read_write_set(ulong no_fields) +{ + uint bitmap_size= 4*(((no_fields+1)+31)/32); + uint32 *read_buf, *write_buf; +#ifndef DEBUG_OFF + my_bool r; +#endif + DBUG_ENTER("ha_allocate_read_write_set"); + DBUG_PRINT("enter", ("no_fields = %d", no_fields)); + + if (table) + { + if (table->read_set == NULL) + { + read_set= (MY_BITMAP*)sql_alloc(sizeof(MY_BITMAP)); + write_set= (MY_BITMAP*)sql_alloc(sizeof(MY_BITMAP)); + read_buf= (uint32*)sql_alloc(bitmap_size); + write_buf= (uint32*)sql_alloc(bitmap_size); + if (!read_set || !write_set || !read_buf || !write_buf) + { + ha_deallocate_read_write_set(); + DBUG_RETURN(TRUE); + } +#ifndef DEBUG_OFF + r = +#endif + bitmap_init(read_set, read_buf, no_fields+1, FALSE); + DBUG_ASSERT(!r /*bitmap_init(read_set...)*/); +#ifndef DEBUG_OFF + r = +#endif + bitmap_init(write_set, write_buf, no_fields+1, FALSE); + DBUG_ASSERT(!r /*bitmap_init(write_set...)*/); + table->read_set= read_set; + table->write_set= write_set; + ha_clear_all_set(); + } + else + { + read_set= table->read_set; + write_set= table->write_set; + } + } + DBUG_RETURN(FALSE); +} + +void handler::ha_deallocate_read_write_set() +{ + DBUG_ENTER("ha_deallocate_read_write_set"); + read_set=write_set=0; + DBUG_VOID_RETURN; +} + +void handler::ha_clear_all_set() +{ + DBUG_ENTER("ha_clear_all_set"); + bitmap_clear_all(read_set); + bitmap_clear_all(write_set); + bitmap_set_bit(read_set, 0); + bitmap_set_bit(write_set, 0); + DBUG_VOID_RETURN; +} + +int handler::ha_retrieve_all_cols() +{ + DBUG_ENTER("handler::ha_retrieve_all_cols"); + bitmap_set_all(read_set); + DBUG_RETURN(0); +} + +int handler::ha_retrieve_all_pk() +{ + DBUG_ENTER("ha_retrieve_all_pk"); + ha_set_primary_key_in_read_set(); + DBUG_RETURN(0); +} + +void handler::ha_set_primary_key_in_read_set() +{ + ulong prim_key= table->s->primary_key; + DBUG_ENTER("handler::ha_set_primary_key_in_read_set"); + DBUG_PRINT("info", ("Primary key = %d", prim_key)); + if (prim_key != MAX_KEY) + { + KEY_PART_INFO *key_part= table->key_info[prim_key].key_part; + KEY_PART_INFO *key_part_end= key_part + + table->key_info[prim_key].key_parts; + for (;key_part != key_part_end; ++key_part) + ha_set_bit_in_read_set(key_part->fieldnr); + } + DBUG_VOID_RETURN; +} + + /* Read first row (only) from a table This is never called for InnoDB or BDB tables, as these table types @@ -1382,7 +1549,7 @@ int handler::read_first_row(byte * buf, uint primary_key) else { /* Find the first row through the primary key */ - (void) ha_index_init(primary_key); + (void) ha_index_init(primary_key, 0); error=index_first(buf); (void) ha_index_end(); } @@ -1566,7 +1733,7 @@ ulonglong handler::get_auto_increment() int error; (void) extra(HA_EXTRA_KEYREAD); - index_init(table->s->next_number_index); + index_init(table->s->next_number_index, 1); if (!table->s->next_number_key_offset) { // Autoincrement at key-start error=index_last(table->record[1]); @@ -2390,7 +2557,7 @@ int handler::compare_key(key_range *range) int handler::index_read_idx(byte * buf, uint index, const byte * key, uint key_len, enum ha_rkey_function find_flag) { - int error= ha_index_init(index); + int error= ha_index_init(index, 0); if (!error) error= index_read(buf, key, key_len, 
find_flag); if (!error) diff --git a/sql/handler.h b/sql/handler.h index 811791a498b..fca0717011f 100644 --- a/sql/handler.h +++ b/sql/handler.h @@ -89,6 +89,11 @@ #define HA_NEED_READ_RANGE_BUFFER (1 << 29) /* for read_multi_range */ #define HA_ANY_INDEX_MAY_BE_UNIQUE (1 << 30) +/* Flags for partition handlers */ +#define HA_CAN_PARTITION (1 << 0) /* Partition support */ +#define HA_CAN_UPDATE_PARTITION_KEY (1 << 1) +#define HA_CAN_PARTITION_UNIQUE (1 << 2) + /* bits in index_flags(index_number) for what you can do with index */ #define HA_READ_NEXT 1 /* TODO really use this flag */ @@ -98,6 +103,10 @@ #define HA_ONLY_WHOLE_INDEX 16 /* Can't use part key searches */ #define HA_KEYREAD_ONLY 64 /* Support HA_EXTRA_KEYREAD */ +/* bits in alter_table_flags */ +#define HA_ONLINE_ADD_EMPTY_PARTITION 1 +#define HA_ONLINE_DROP_PARTITION 2 + /* operations for disable/enable indexes */ #define HA_KEY_SWITCH_NONUNIQ 0 #define HA_KEY_SWITCH_ALL 1 @@ -174,6 +183,7 @@ enum db_type DB_TYPE_EXAMPLE_DB, DB_TYPE_ARCHIVE_DB, DB_TYPE_CSV_DB, DB_TYPE_FEDERATED_DB, DB_TYPE_BLACKHOLE_DB, + DB_TYPE_PARTITION_DB, DB_TYPE_DEFAULT // Must be last }; @@ -220,6 +230,9 @@ typedef ulonglong my_xid; // this line is the same as in log_event.h #define MAXGTRIDSIZE 64 #define MAXBQUALSIZE 64 +#define COMPATIBLE_DATA_YES 0 +#define COMPATIBLE_DATA_NO 1 + struct xid_t { long formatID; long gtrid_length; @@ -379,6 +392,221 @@ typedef struct st_thd_trans enum enum_tx_isolation { ISO_READ_UNCOMMITTED, ISO_READ_COMMITTED, ISO_REPEATABLE_READ, ISO_SERIALIZABLE}; + +typedef struct { + uint32 start_part; + uint32 end_part; + bool use_bit_array; +} part_id_range; +/** + * An enum and a struct to handle partitioning and subpartitioning. + */ +enum partition_type { + NOT_A_PARTITION= 0, + RANGE_PARTITION, + HASH_PARTITION, + LIST_PARTITION +}; + +enum partition_state { + PART_NORMAL= 0, + PART_IS_DROPPED= 1, + PART_TO_BE_DROPPED= 2, + PART_DROPPING= 3, + PART_IS_ADDED= 4, + PART_ADDING= 5, + PART_ADDED= 6 +}; + +#define UNDEF_NODEGROUP 65535 +class Item; + +class partition_element :public Sql_alloc { +public: + List<partition_element> subpartitions; + List<longlong> list_val_list; + ulonglong part_max_rows; + ulonglong part_min_rows; + char *partition_name; + char *tablespace_name; + longlong range_value; + char* part_comment; + char* data_file_name; + char* index_file_name; + enum db_type engine_type; + enum partition_state part_state; + uint16 nodegroup_id; + + partition_element() + : part_max_rows(0), part_min_rows(0), partition_name(NULL), + tablespace_name(NULL), range_value(0), part_comment(NULL), + data_file_name(NULL), index_file_name(NULL), + engine_type(DB_TYPE_UNKNOWN),part_state(PART_NORMAL), + nodegroup_id(UNDEF_NODEGROUP) + { + subpartitions.empty(); + list_val_list.empty(); + } + ~partition_element() {} +}; + +typedef struct { + longlong list_value; + uint partition_id; +} LIST_PART_ENTRY; +enum Item_result; + +class partition_info; + +typedef bool (*get_part_id_func)(partition_info *part_info, + uint32 *part_id); +typedef uint32 (*get_subpart_id_func)(partition_info *part_info); + +class partition_info :public Sql_alloc { +public: + /* + * Here comes a set of definitions needed for partitioned table handlers. 
+ */ + List<partition_element> partitions; + List<partition_element> temp_partitions; + + List<char> part_field_list; + List<char> subpart_field_list; + + get_part_id_func get_partition_id; + get_part_id_func get_part_partition_id; + get_subpart_id_func get_subpartition_id; + + Field **part_field_array; + Field **subpart_field_array; + Field **full_part_field_array; + + Item *part_expr; + Item *subpart_expr; + + Item *item_free_list; + + union { + longlong *range_int_array; + LIST_PART_ENTRY *list_array; + }; + char* part_info_string; + + char *part_func_string; + char *subpart_func_string; + + partition_element *curr_part_elem; + partition_element *current_partition; + /* + These key_map's are used for Partitioning to enable quick decisions + on whether we can derive more information about which partition to + scan just by looking at what index is used. + */ + key_map all_fields_in_PF, all_fields_in_PPF, all_fields_in_SPF; + key_map some_fields_in_PF; + + enum db_type default_engine_type; + Item_result part_result_type; + partition_type part_type; + partition_type subpart_type; + + uint part_info_len; + uint part_func_len; + uint subpart_func_len; + + uint no_parts; + uint no_subparts; + uint count_curr_parts; + uint count_curr_subparts; + + uint part_error_code; + + uint no_list_values; + + uint no_part_fields; + uint no_subpart_fields; + uint no_full_part_fields; + + uint16 linear_hash_mask; + + bool use_default_partitions; + bool use_default_subpartitions; + bool defined_max_value; + bool list_of_part_fields; + bool list_of_subpart_fields; + bool linear_hash_ind; + + partition_info() + : get_partition_id(NULL), get_part_partition_id(NULL), + get_subpartition_id(NULL), + part_field_array(NULL), subpart_field_array(NULL), + full_part_field_array(NULL), + part_expr(NULL), subpart_expr(NULL), item_free_list(NULL), + list_array(NULL), + part_info_string(NULL), + part_func_string(NULL), subpart_func_string(NULL), + curr_part_elem(NULL), current_partition(NULL), + default_engine_type(DB_TYPE_UNKNOWN), + part_result_type(INT_RESULT), + part_type(NOT_A_PARTITION), subpart_type(NOT_A_PARTITION), + part_info_len(0), part_func_len(0), subpart_func_len(0), + no_parts(0), no_subparts(0), + count_curr_parts(0), count_curr_subparts(0), part_error_code(0), + no_list_values(0), no_part_fields(0), no_subpart_fields(0), + no_full_part_fields(0), linear_hash_mask(0), + use_default_partitions(TRUE), + use_default_subpartitions(TRUE), defined_max_value(FALSE), + list_of_part_fields(FALSE), list_of_subpart_fields(FALSE), + linear_hash_ind(FALSE) + { + all_fields_in_PF.clear_all(); + all_fields_in_PPF.clear_all(); + all_fields_in_SPF.clear_all(); + some_fields_in_PF.clear_all(); + partitions.empty(); + temp_partitions.empty(); + part_field_list.empty(); + subpart_field_list.empty(); + } + ~partition_info() {} +}; + + +#ifdef HAVE_PARTITION_DB +/* + Answers the question if subpartitioning is used for a certain table + SYNOPSIS + is_sub_partitioned() + part_info A reference to the partition_info struct + RETURN VALUE + Returns true if subpartitioning used and false otherwise + DESCRIPTION + A routine to check for subpartitioning for improved readability of code +*/ +inline +bool is_sub_partitioned(partition_info *part_info) +{ return (part_info->subpart_type == NOT_A_PARTITION ? FALSE : TRUE); } + + +/* + Returns the total number of partitions on the leaf level. 
+ SYNOPSIS + get_tot_partitions() + part_info A reference to the partition_info struct + RETURN VALUE + Returns the number of partitions + DESCRIPTION + A routine to check for number of partitions for improved readability + of code +*/ +inline +uint get_tot_partitions(partition_info *part_info) +{ + return part_info->no_parts * + (is_sub_partitioned(part_info) ? part_info->no_subparts : 1); +} +#endif + typedef struct st_ha_create_information { CHARSET_INFO *table_charset, *default_table_charset; @@ -427,6 +655,38 @@ typedef struct st_ha_check_opt } HA_CHECK_OPT; +#ifdef HAVE_PARTITION_DB +bool is_partition_in_list(char *part_name, List<char> list_part_names); +bool is_partitions_in_table(partition_info *new_part_info, + partition_info *old_part_info); +bool set_up_defaults_for_partitioning(partition_info *part_info, + handler *file, + ulonglong max_rows, + uint start_no); +handler *get_ha_partition(partition_info *part_info); +int get_parts_for_update(const byte *old_data, byte *new_data, + const byte *rec0, partition_info *part_info, + uint32 *old_part_id, uint32 *new_part_id); +int get_part_for_delete(const byte *buf, const byte *rec0, + partition_info *part_info, uint32 *part_id); +bool check_partition_info(partition_info *part_info,enum db_type eng_type, + handler *file, ulonglong max_rows); +bool fix_partition_func(THD *thd, const char *name, TABLE *table); +char *generate_partition_syntax(partition_info *part_info, + uint *buf_length, bool use_sql_alloc); +bool partition_key_modified(TABLE *table, List<Item> &fields); +void get_partition_set(const TABLE *table, byte *buf, const uint index, + const key_range *key_spec, + part_id_range *part_spec); +void get_full_part_id_from_key(const TABLE *table, byte *buf, + KEY *key_info, + const key_range *key_spec, + part_id_range *part_spec); +bool mysql_unpack_partition(File file, THD *thd, uint part_info_len, + TABLE *table); +#endif + + /* This is a buffer area that the handler can use to store rows. 
'end_of_used_area' should be kept updated after calls to @@ -444,10 +704,13 @@ typedef struct st_handler_buffer class handler :public Sql_alloc { +#ifdef HAVE_PARTITION_DB + friend class ha_partition; +#endif protected: struct st_table *table; /* The table definition */ - virtual int index_init(uint idx) { active_index=idx; return 0; } + virtual int index_init(uint idx, bool sorted) { active_index=idx; return 0; } virtual int index_end() { active_index=MAX_KEY; return 0; } /* rnd_init() can be called two times without rnd_end() in between @@ -459,6 +722,8 @@ class handler :public Sql_alloc virtual int rnd_init(bool scan) =0; virtual int rnd_end() { return 0; } +private: + virtual int reset() { return extra(HA_EXTRA_RESET); } public: const handlerton *ht; /* storage engine of this handler */ byte *ref; /* Pointer to current row */ @@ -501,6 +766,8 @@ public: bool auto_increment_column_changed; bool implicit_emptied; /* Can be !=0 only if HEAP */ const COND *pushed_cond; + MY_BITMAP *read_set; + MY_BITMAP *write_set; handler(const handlerton *ht_arg, TABLE *table_arg) :table(table_arg), ht(ht_arg), @@ -513,7 +780,12 @@ public: raid_type(0), ft_handler(0), inited(NONE), implicit_emptied(0), pushed_cond(NULL) {} - virtual ~handler(void) { /* TODO: DBUG_ASSERT(inited == NONE); */ } + virtual ~handler(void) + { + ha_deallocate_read_write_set(); + /* TODO: DBUG_ASSERT(inited == NONE); */ + } + virtual int ha_initialise(); int ha_open(const char *name, int mode, int test_if_locked); bool update_auto_increment(); virtual void print_error(int error, myf errflag); @@ -526,7 +798,7 @@ public: { return rows2double(ranges+rows); } virtual const key_map *keys_to_use_for_scanning() { return &key_map_empty; } virtual bool has_transactions(){ return 0;} - virtual uint extra_rec_buf_length() { return 0; } + virtual uint extra_rec_buf_length() const { return 0; } /* Return upper bound of current number of records in the table @@ -545,12 +817,12 @@ public: virtual const char *index_type(uint key_number) { DBUG_ASSERT(0); return "";} - int ha_index_init(uint idx) + int ha_index_init(uint idx, bool sorted) { DBUG_ENTER("ha_index_init"); DBUG_ASSERT(inited==NONE); inited=INDEX; - DBUG_RETURN(index_init(idx)); + DBUG_RETURN(index_init(idx, sorted)); } int ha_index_end() { @@ -573,11 +845,140 @@ public: inited=NONE; DBUG_RETURN(rnd_end()); } + int ha_reset() + { + DBUG_ENTER("ha_reset"); + ha_clear_all_set(); + DBUG_RETURN(reset()); + } + /* this is necessary in many places, e.g. in HANDLER command */ int ha_index_or_rnd_end() { return inited == INDEX ? ha_index_end() : inited == RND ? ha_rnd_end() : 0; } + /* + These are a set of routines used to enable handlers to only read/write + partial lists of the fields in the table. The bit vector is maintained + by the server part and is used by the handler at calls to read/write + data in the table. + It replaces the use of query id's for this purpose. The benefit is that + the handler can also set bits in the read/write set if it has special + needs and it is also easy for other parts of the server to interact + with the handler (e.g. the replication part for row-level logging). + The routines are all part of the general handler and are not possible + to override by a handler. A handler can however set/reset bits by + calling these routines. + + The methods ha_retrieve_all_cols and ha_retrieve_all_pk are made + virtual to handle InnoDB specifics. 
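Before the comment continues, a small illustration of the read/write set idea it describes: one bit per field, with bit 0 always set as in ha_clear_all_set, and every written field also marked as read as in ha_set_bit_in_rw_set. rw_set_sketch uses std::bitset purely for exposition; the server code uses MY_BITMAP.

#include <bitset>
#include <cstdio>

struct rw_set_sketch {
  std::bitset<32> read_set, write_set;   // one bit per field

  void clear_all()
  {
    read_set.reset(); write_set.reset();
    read_set.set(0); write_set.set(0);   // bit 0 stays set
  }
  void set_rw(unsigned f, bool write_op)
  {
    read_set.set(f);                     // a written field is also read
    if (write_op)
      write_set.set(f);
  }
  bool is_read(unsigned f) const { return read_set.test(f); }
};

int main()
{
  rw_set_sketch s;
  s.clear_all();
  s.set_rw(3, /*write_op=*/true);        // e.g. "UPDATE t SET f3 = ..."
  std::printf("field 3 read: %d\n", (int)s.is_read(3));
}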
If InnoDB doesn't need the + extra parameters HA_EXTRA_RETRIEVE_ALL_COLS and + HA_EXTRA_RETRIEVE_PRIMARY_KEY anymore then these methods need not be + virtual anymore. + */ + virtual int ha_retrieve_all_cols(); + virtual int ha_retrieve_all_pk(); + void ha_set_all_bits_in_read_set() + { + DBUG_ENTER("ha_set_all_bits_in_read_set"); + bitmap_set_all(read_set); + DBUG_VOID_RETURN; + } + void ha_set_all_bits_in_write_set() + { + DBUG_ENTER("ha_set_all_bits_in_write_set"); + bitmap_set_all(write_set); + DBUG_VOID_RETURN; + } + void ha_set_bit_in_read_set(uint fieldnr) + { + DBUG_ENTER("ha_set_bit_in_read_set"); + DBUG_PRINT("info", ("fieldnr = %d", fieldnr)); + bitmap_set_bit(read_set, fieldnr); + DBUG_VOID_RETURN; + } + void ha_clear_bit_in_read_set(uint fieldnr) + { + DBUG_ENTER("ha_clear_bit_in_read_set"); + DBUG_PRINT("info", ("fieldnr = %d", fieldnr)); + bitmap_clear_bit(read_set, fieldnr); + DBUG_VOID_RETURN; + } + void ha_set_bit_in_write_set(uint fieldnr) + { + DBUG_ENTER("ha_set_bit_in_write_set"); + DBUG_PRINT("info", ("fieldnr = %d", fieldnr)); + bitmap_set_bit(write_set, fieldnr); + DBUG_VOID_RETURN; + } + void ha_clear_bit_in_write_set(uint fieldnr) + { + DBUG_ENTER("ha_clear_bit_in_write_set"); + DBUG_PRINT("info", ("fieldnr = %d", fieldnr)); + bitmap_clear_bit(write_set, fieldnr); + DBUG_VOID_RETURN; + } + void ha_set_bit_in_rw_set(uint fieldnr, bool write_op) + { + DBUG_ENTER("ha_set_bit_in_rw_set"); + DBUG_PRINT("info", ("Set bit %u in read set", fieldnr)); + bitmap_set_bit(read_set, fieldnr); + if (!write_op) { + DBUG_VOID_RETURN; + } + else + { + DBUG_PRINT("info", ("Set bit %u in read and write set", fieldnr)); + bitmap_set_bit(write_set, fieldnr); + } + DBUG_VOID_RETURN; + } + bool ha_get_bit_in_read_set(uint fieldnr) + { + bool bit_set=bitmap_is_set(read_set,fieldnr); + DBUG_ENTER("ha_get_bit_in_read_set"); + DBUG_PRINT("info", ("bit %u = %u", fieldnr, bit_set)); + DBUG_RETURN(bit_set); + } + bool ha_get_bit_in_write_set(uint fieldnr) + { + bool bit_set=bitmap_is_set(write_set,fieldnr); + DBUG_ENTER("ha_get_bit_in_write_set"); + DBUG_PRINT("info", ("bit %u = %u", fieldnr, bit_set)); + DBUG_RETURN(bit_set); + } + bool ha_get_all_bit_in_read_set() + { + bool all_bits_set= bitmap_is_set_all(read_set); + DBUG_ENTER("ha_get_all_bit_in_read_set"); + DBUG_PRINT("info", ("all bits set = %u", all_bits_set)); + DBUG_RETURN(all_bits_set); + } + bool ha_get_all_bit_in_read_clear() + { + bool all_bits_set= bitmap_is_clear_all(read_set); + DBUG_ENTER("ha_get_all_bit_in_read_clear"); + DBUG_PRINT("info", ("all bits clear = %u", all_bits_set)); + DBUG_RETURN(all_bits_set); + } + bool ha_get_all_bit_in_write_set() + { + bool all_bits_set= bitmap_is_set_all(write_set); + DBUG_ENTER("ha_get_all_bit_in_write_set"); + DBUG_PRINT("info", ("all bits set = %u", all_bits_set)); + DBUG_RETURN(all_bits_set); + } + bool ha_get_all_bit_in_write_clear() + { + bool all_bits_set= bitmap_is_clear_all(write_set); + DBUG_ENTER("ha_get_all_bit_in_write_clear"); + DBUG_PRINT("info", ("all bits clear = %u", all_bits_set)); + DBUG_RETURN(all_bits_set); + } + void ha_set_primary_key_in_read_set(); + int ha_allocate_read_write_set(ulong no_fields); + void ha_deallocate_read_write_set(); + void ha_clear_all_set(); uint get_index(void) const { return active_index; } virtual int open(const char *name, int mode, uint test_if_locked)=0; virtual int close(void)=0; @@ -586,6 +987,85 @@ public: { return HA_ERR_WRONG_COMMAND; } virtual int delete_row(const byte * buf) { return HA_ERR_WRONG_COMMAND; } + /* + SYNOPSIS + 
start_bulk_update() + RETURN + 0 Bulk update used by handler + 1 Bulk update not used, normal operation used + */ + virtual bool start_bulk_update() { return 1; } + /* + SYNOPSIS + start_bulk_delete() + RETURN + 0 Bulk delete used by handler + 1 Bulk delete not used, normal operation used + */ + virtual bool start_bulk_delete() { return 1; } + /* + SYNOPSIS + This method is similar to update_row, however the handler doesn't need + to execute the updates at this point in time. The handler can be certain + that another call to bulk_update_row will occur OR a call to + exec_bulk_update before the set of updates in this query is concluded. + + bulk_update_row() + old_data Old record + new_data New record + dup_key_found Number of duplicate keys found + RETURN + 0 Bulk delete used by handler + 1 Bulk delete not used, normal operation used + */ + virtual int bulk_update_row(const byte *old_data, byte *new_data, + uint *dup_key_found) + { + DBUG_ASSERT(FALSE); + return HA_ERR_WRONG_COMMAND; + } + /* + SYNOPSIS + After this call all outstanding updates must be performed. The number + of duplicate key errors are reported in the duplicate key parameter. + It is allowed to continue to the batched update after this call, the + handler has to wait until end_bulk_update with changing state. + + exec_bulk_update() + dup_key_found Number of duplicate keys found + RETURN + 0 Success + >0 Error code + */ + virtual int exec_bulk_update(uint *dup_key_found) + { + DBUG_ASSERT(FALSE); + return HA_ERR_WRONG_COMMAND; + } + /* + SYNOPSIS + Perform any needed clean-up, no outstanding updates are there at the + moment. + + end_bulk_update() + RETURN + Nothing + */ + virtual void end_bulk_update() { return; } + /* + SYNOPSIS + Execute all outstanding deletes and close down the bulk delete. + + end_bulk_delete() + RETURN + 0 Success + >0 Error code + */ + virtual int end_bulk_delete() + { + DBUG_ASSERT(FALSE); + return HA_ERR_WRONG_COMMAND; + } virtual int index_read(byte * buf, const byte * key, uint key_len, enum ha_rkey_function find_flag) { return HA_ERR_WRONG_COMMAND; } @@ -636,7 +1116,6 @@ public: { return 0; } virtual int extra_opt(enum ha_extra_function operation, ulong cache_size) { return extra(operation); } - virtual int reset() { return extra(HA_EXTRA_RESET); } virtual int external_lock(THD *thd, int lock_type) { return 0; } virtual void unlock_row() {} virtual int start_stmt(THD *thd) {return 0;} @@ -698,6 +1177,20 @@ public: virtual char *update_table_comment(const char * comment) { return (char*) comment;} virtual void append_create_info(String *packet) {} + /* + SYNOPSIS + is_fk_defined_on_table_or_index() + index Index to check if foreign key uses it + RETURN VALUE + TRUE Foreign key defined on table or index + FALSE No foreign key defined + DESCRIPTION + If index == MAX_KEY then a check for table is made and if index < + MAX_KEY then a check is made if the table has foreign keys and if + a foreign key uses this index (and thus the index cannot be dropped). 
+ */ + virtual bool is_fk_defined_on_table_or_index(uint index) + { return FALSE; } virtual char* get_foreign_key_create_info() { return(NULL);} /* gets foreign key create string from InnoDB */ /* used in ALTER TABLE; 1 if changing storage engine is allowed */ @@ -713,6 +1206,11 @@ public: virtual const char *table_type() const =0; virtual const char **bas_ext() const =0; virtual ulong table_flags(void) const =0; + virtual ulong alter_table_flags(void) const { return 0; } +#ifdef HAVE_PARTITION_DB + virtual ulong partition_flags(void) const { return 0;} + virtual int get_default_no_partitions(ulonglong max_rows) { return 1;} +#endif virtual ulong index_flags(uint idx, uint part, bool all_parts) const =0; virtual ulong index_ddl_flags(KEY *wanted_index) const { return (HA_DDL_SUPPORT); } @@ -752,7 +1250,21 @@ public: virtual int delete_table(const char *name); virtual int create(const char *name, TABLE *form, HA_CREATE_INFO *info)=0; + virtual int create_handler_files(const char *name) { return FALSE;} + /* + SYNOPSIS + drop_partitions() + path Complete path of db and table name + RETURN VALUE + TRUE Failure + FALSE Success + DESCRIPTION + Drop a partition, during this operation no other activity is ongoing + in this server on the table. + */ + virtual int drop_partitions(const char *path) + { return HA_ERR_WRONG_COMMAND; } /* lock_count() can be more than one if the table is a MERGE */ virtual uint lock_count(void) const { return 1; } virtual THR_LOCK_DATA **store_lock(THD *thd, @@ -816,6 +1328,9 @@ public: Pops the top if condition stack, if stack is not empty */ virtual void cond_pop() { return; }; + virtual bool check_if_incompatible_data(HA_CREATE_INFO *create_info, + uint table_changes) + { return COMPATIBLE_DATA_NO; } }; /* Some extern variables used with handlers */ diff --git a/sql/item.cc b/sql/item.cc index e7da646ae73..1cb6734d373 100644 --- a/sql/item.cc +++ b/sql/item.cc @@ -3407,13 +3407,18 @@ bool Item_field::fix_fields(THD *thd, Item **reference) set_field(from_field); } - else if (thd->set_query_id && field->query_id != thd->query_id) + else if (thd->set_query_id) { - /* We only come here in unions */ - TABLE *table=field->table; - field->query_id=thd->query_id; - table->used_fields++; - table->used_keys.intersect(field->part_of_key); + TABLE *table= field->table; + table->file->ha_set_bit_in_rw_set(field->fieldnr, + (bool)(thd->set_query_id-1)); + if (field->query_id != thd->query_id) + { + /* We only come here in unions */ + field->query_id=thd->query_id; + table->used_fields++; + table->used_keys.intersect(field->part_of_key); + } } #ifndef NO_EMBEDDED_ACCESS_CHECKS if (any_privileges) diff --git a/sql/item_subselect.cc b/sql/item_subselect.cc index 52a74b6f4c6..aead2d67c2c 100644 --- a/sql/item_subselect.cc +++ b/sql/item_subselect.cc @@ -1603,7 +1603,7 @@ int subselect_uniquesubquery_engine::exec() } if (!table->file->inited) - table->file->ha_index_init(tab->ref.key); + table->file->ha_index_init(tab->ref.key, 0); error= table->file->index_read(table->record[0], tab->ref.key_buff, tab->ref.key_length,HA_READ_KEY_EXACT); @@ -1656,7 +1656,7 @@ int subselect_indexsubquery_engine::exec() } if (!table->file->inited) - table->file->ha_index_init(tab->ref.key); + table->file->ha_index_init(tab->ref.key, 1); error= table->file->index_read(table->record[0], tab->ref.key_buff, tab->ref.key_length,HA_READ_KEY_EXACT); diff --git a/sql/key.cc b/sql/key.cc index 4bd71d2fa47..731df134efa 100644 --- a/sql/key.cc +++ b/sql/key.cc @@ -429,3 +429,86 @@ int key_cmp(KEY_PART_INFO 
*key_part, const byte *key, uint key_length)
   }
   return 0;                                     // Keys are equal
 }
+
+
+/*
+  Compare two records in index order
+  SYNOPSIS
+    key_rec_cmp()
+    key                 Index information
+    rec0                Pointer to table->record[0]
+    first_rec           Pointer to record to compare with
+    second_rec          Pointer to record to compare against first_rec
+  DESCRIPTION
+    This method is set up such that it can be called directly from the
+    priority queue and it has been optimised as much as possible since it
+    will be called O(N * log N) times while performing a merge sort in
+    various places in the code.
+
+    We retrieve the pointer to table->record[0] using the fact that
+    key_parts have an offset making it possible to calculate the start of
+    the record. We need to get the diff to the compared record since none
+    of the records being compared are stored in table->record[0].
+
+    We first check for NULL values; if there are no NULL values we use
+    a compare method that gets two field pointers and a max length
+    and returns the result of the comparison.
+*/
+
+int key_rec_cmp(void *key, byte *first_rec, byte *second_rec)
+{
+  KEY *key_info= (KEY*)key;
+  uint key_parts= key_info->key_parts, i= 0;
+  KEY_PART_INFO *key_part= key_info->key_part;
+  char *rec0= key_part->field->ptr - key_part->offset;
+  my_ptrdiff_t first_diff= first_rec - rec0, sec_diff= second_rec - rec0;
+  int result= 0;
+  DBUG_ENTER("key_rec_cmp");
+
+  do
+  {
+    Field *field= key_part->field;
+    uint length;
+
+    if (key_part->null_bit)
+    {
+      /* The key_part can contain NULL values */
+      bool first_is_null= field->is_null_in_record_with_offset(first_diff);
+      bool sec_is_null= field->is_null_in_record_with_offset(sec_diff);
+      /*
+        NULL is smaller than everything, so if the first is NULL and the
+        other is not then we know that we should return -1, and for the
+        opposite case we should return +1. If both are NULL then we call
+        it equality, although it is a strange form of equality: we have
+        equally little information about the real values.
+      */
+      if (!first_is_null)
+      {
+        if (!sec_is_null)
+          ; /* Fall through, no NULL fields */
+        else
+        {
+          DBUG_RETURN(+1);
+        }
+      }
+      else if (!sec_is_null)
+      {
+        DBUG_RETURN(-1);
+      }
+      else
+        goto next_loop; /* Both were NULL */
+    }
+    /*
+      No NULL values in the fields.
+      We use the virtual method cmp_max with a max length parameter.
+      For most field types this translates into a cmp without
+      max length. The exceptions are the BLOB and VARCHAR field types
+      that take the max length into account.
+ */ + result= field->cmp_max(field->ptr+first_diff, field->ptr+sec_diff, + key_part->length); +next_loop: + key_part++; + } while (!result && ++i < key_parts); + DBUG_RETURN(result); +} diff --git a/sql/lex.h b/sql/lex.h index a5366742fd9..9b8f94f61bf 100644 --- a/sql/lex.h +++ b/sql/lex.h @@ -110,6 +110,7 @@ static SYMBOL symbols[] = { { "CIPHER", SYM(CIPHER_SYM)}, { "CLIENT", SYM(CLIENT_SYM)}, { "CLOSE", SYM(CLOSE_SYM)}, + { "COALESCE", SYM(COALESCE)}, { "COLLATE", SYM(COLLATE_SYM)}, { "COLLATION", SYM(COLLATION_SYM)}, { "COLUMN", SYM(COLUMN_SYM)}, @@ -274,11 +275,14 @@ static SYMBOL symbols[] = { { "LEAVE", SYM(LEAVE_SYM)}, { "LEAVES", SYM(LEAVES)}, { "LEFT", SYM(LEFT)}, + { "LESS", SYM(LESS_SYM)}, { "LEVEL", SYM(LEVEL_SYM)}, { "LIKE", SYM(LIKE)}, { "LIMIT", SYM(LIMIT)}, + { "LINEAR", SYM(LINEAR_SYM)}, { "LINES", SYM(LINES)}, { "LINESTRING", SYM(LINESTRING)}, + { "LIST", SYM(LIST_SYM)}, { "LOAD", SYM(LOAD)}, { "LOCAL", SYM(LOCAL_SYM)}, { "LOCALTIME", SYM(NOW_SYM)}, @@ -312,6 +316,7 @@ static SYMBOL symbols[] = { { "MAX_ROWS", SYM(MAX_ROWS)}, { "MAX_UPDATES_PER_HOUR", SYM(MAX_UPDATES_PER_HOUR)}, { "MAX_USER_CONNECTIONS", SYM(MAX_USER_CONNECTIONS_SYM)}, + { "MAXVALUE", SYM(MAX_VALUE_SYM)}, { "MEDIUM", SYM(MEDIUM_SYM)}, { "MEDIUMBLOB", SYM(MEDIUMBLOB)}, { "MEDIUMINT", SYM(MEDIUMINT)}, @@ -343,6 +348,7 @@ static SYMBOL symbols[] = { { "NEW", SYM(NEW_SYM)}, { "NEXT", SYM(NEXT_SYM)}, { "NO", SYM(NO_SYM)}, + { "NODEGROUP", SYM(NODEGROUP_SYM)}, { "NONE", SYM(NONE_SYM)}, { "NOT", SYM(NOT_SYM)}, { "NO_WRITE_TO_BINLOG", SYM(NO_WRITE_TO_BINLOG)}, @@ -365,6 +371,10 @@ static SYMBOL symbols[] = { { "OUTFILE", SYM(OUTFILE)}, { "PACK_KEYS", SYM(PACK_KEYS_SYM)}, { "PARTIAL", SYM(PARTIAL)}, +#ifdef HAVE_PARTITION_DB + { "PARTITION", SYM(PARTITION_SYM)}, +#endif + { "PARTITIONS", SYM(PARTITIONS_SYM)}, { "PASSWORD", SYM(PASSWORD)}, { "PHASE", SYM(PHASE_SYM)}, { "POINT", SYM(POINT_SYM)}, @@ -385,6 +395,7 @@ static SYMBOL symbols[] = { { "RAID_CHUNKS", SYM(RAID_CHUNKS)}, { "RAID_CHUNKSIZE", SYM(RAID_CHUNKSIZE)}, { "RAID_TYPE", SYM(RAID_TYPE)}, + { "RANGE", SYM(RANGE_SYM)}, { "READ", SYM(READ_SYM)}, { "READS", SYM(READS_SYM)}, { "REAL", SYM(REAL)}, @@ -398,6 +409,7 @@ static SYMBOL symbols[] = { { "RELEASE", SYM(RELEASE_SYM)}, { "RELOAD", SYM(RELOAD)}, { "RENAME", SYM(RENAME)}, + { "REORGANISE", SYM(REORGANISE_SYM)}, { "REPAIR", SYM(REPAIR)}, { "REPEATABLE", SYM(REPEATABLE_SYM)}, { "REPLACE", SYM(REPLACE)}, @@ -476,6 +488,8 @@ static SYMBOL symbols[] = { { "STRING", SYM(STRING_SYM)}, { "STRIPED", SYM(RAID_STRIPED_SYM)}, { "SUBJECT", SYM(SUBJECT_SYM)}, + { "SUBPARTITION", SYM(SUBPARTITION_SYM)}, + { "SUBPARTITIONS", SYM(SUBPARTITIONS_SYM)}, { "SUPER", SYM(SUPER_SYM)}, { "SUSPEND", SYM(SUSPEND_SYM)}, { "TABLE", SYM(TABLE_SYM)}, @@ -485,6 +499,7 @@ static SYMBOL symbols[] = { { "TEMPTABLE", SYM(TEMPTABLE_SYM)}, { "TERMINATED", SYM(TERMINATED)}, { "TEXT", SYM(TEXT_SYM)}, + { "THAN", SYM(THAN_SYM)}, { "THEN", SYM(THEN_SYM)}, { "TIME", SYM(TIME_SYM)}, { "TIMESTAMP", SYM(TIMESTAMP)}, @@ -576,7 +591,6 @@ static SYMBOL sql_functions[] = { { "CENTROID", F_SYM(FUNC_ARG1),0,CREATE_FUNC_GEOM(create_func_centroid)}, { "CHAR_LENGTH", F_SYM(FUNC_ARG1),0,CREATE_FUNC(create_func_char_length)}, { "CHARACTER_LENGTH", F_SYM(FUNC_ARG1),0,CREATE_FUNC(create_func_char_length)}, - { "COALESCE", SYM(COALESCE)}, { "COERCIBILITY", F_SYM(FUNC_ARG1),0,CREATE_FUNC(create_func_coercibility)}, { "COMPRESS", F_SYM(FUNC_ARG1),0,CREATE_FUNC(create_func_compress)}, { "CONCAT", SYM(CONCAT)}, diff --git a/sql/lock.cc b/sql/lock.cc index 
diff --git a/sql/lex.h b/sql/lex.h index a5366742fd9..9b8f94f61bf 100644 --- a/sql/lex.h +++ b/sql/lex.h @@ -110,6 +110,7 @@ static SYMBOL symbols[] = { { "CIPHER", SYM(CIPHER_SYM)}, { "CLIENT", SYM(CLIENT_SYM)}, { "CLOSE", SYM(CLOSE_SYM)}, + { "COALESCE", SYM(COALESCE)}, { "COLLATE", SYM(COLLATE_SYM)}, { "COLLATION", SYM(COLLATION_SYM)}, { "COLUMN", SYM(COLUMN_SYM)}, @@ -274,11 +275,14 @@ static SYMBOL symbols[] = { { "LEAVE", SYM(LEAVE_SYM)}, { "LEAVES", SYM(LEAVES)}, { "LEFT", SYM(LEFT)}, + { "LESS", SYM(LESS_SYM)}, { "LEVEL", SYM(LEVEL_SYM)}, { "LIKE", SYM(LIKE)}, { "LIMIT", SYM(LIMIT)}, + { "LINEAR", SYM(LINEAR_SYM)}, { "LINES", SYM(LINES)}, { "LINESTRING", SYM(LINESTRING)}, + { "LIST", SYM(LIST_SYM)}, { "LOAD", SYM(LOAD)}, { "LOCAL", SYM(LOCAL_SYM)}, { "LOCALTIME", SYM(NOW_SYM)}, @@ -312,6 +316,7 @@ static SYMBOL symbols[] = { { "MAX_ROWS", SYM(MAX_ROWS)}, { "MAX_UPDATES_PER_HOUR", SYM(MAX_UPDATES_PER_HOUR)}, { "MAX_USER_CONNECTIONS", SYM(MAX_USER_CONNECTIONS_SYM)}, + { "MAXVALUE", SYM(MAX_VALUE_SYM)}, { "MEDIUM", SYM(MEDIUM_SYM)}, { "MEDIUMBLOB", SYM(MEDIUMBLOB)}, { "MEDIUMINT", SYM(MEDIUMINT)}, @@ -343,6 +348,7 @@ static SYMBOL symbols[] = { { "NEW", SYM(NEW_SYM)}, { "NEXT", SYM(NEXT_SYM)}, { "NO", SYM(NO_SYM)}, + { "NODEGROUP", SYM(NODEGROUP_SYM)}, { "NONE", SYM(NONE_SYM)}, { "NOT", SYM(NOT_SYM)}, { "NO_WRITE_TO_BINLOG", SYM(NO_WRITE_TO_BINLOG)}, @@ -365,6 +371,10 @@ static SYMBOL symbols[] = { { "OUTFILE", SYM(OUTFILE)}, { "PACK_KEYS", SYM(PACK_KEYS_SYM)}, { "PARTIAL", SYM(PARTIAL)}, +#ifdef HAVE_PARTITION_DB + { "PARTITION", SYM(PARTITION_SYM)}, +#endif + { "PARTITIONS", SYM(PARTITIONS_SYM)}, { "PASSWORD", SYM(PASSWORD)}, { "PHASE", SYM(PHASE_SYM)}, { "POINT", SYM(POINT_SYM)}, @@ -385,6 +395,7 @@ static SYMBOL symbols[] = { { "RAID_CHUNKS", SYM(RAID_CHUNKS)}, { "RAID_CHUNKSIZE", SYM(RAID_CHUNKSIZE)}, { "RAID_TYPE", SYM(RAID_TYPE)}, + { "RANGE", SYM(RANGE_SYM)}, { "READ", SYM(READ_SYM)}, { "READS", SYM(READS_SYM)}, { "REAL", SYM(REAL)}, @@ -398,6 +409,7 @@ static SYMBOL symbols[] = { { "RELEASE", SYM(RELEASE_SYM)}, { "RELOAD", SYM(RELOAD)}, { "RENAME", SYM(RENAME)}, + { "REORGANISE", SYM(REORGANISE_SYM)}, { "REPAIR", SYM(REPAIR)}, { "REPEATABLE", SYM(REPEATABLE_SYM)}, { "REPLACE", SYM(REPLACE)}, @@ -476,6 +488,8 @@ static SYMBOL symbols[] = { { "STRING", SYM(STRING_SYM)}, { "STRIPED", SYM(RAID_STRIPED_SYM)}, { "SUBJECT", SYM(SUBJECT_SYM)}, + { "SUBPARTITION", SYM(SUBPARTITION_SYM)}, + { "SUBPARTITIONS", SYM(SUBPARTITIONS_SYM)}, { "SUPER", SYM(SUPER_SYM)}, { "SUSPEND", SYM(SUSPEND_SYM)}, { "TABLE", SYM(TABLE_SYM)}, @@ -485,6 +499,7 @@ static SYMBOL symbols[] = { { "TEMPTABLE", SYM(TEMPTABLE_SYM)}, { "TERMINATED", SYM(TERMINATED)}, { "TEXT", SYM(TEXT_SYM)}, + { "THAN", SYM(THAN_SYM)}, { "THEN", SYM(THEN_SYM)}, { "TIME", SYM(TIME_SYM)}, { "TIMESTAMP", SYM(TIMESTAMP)}, @@ -576,7 +591,6 @@ static SYMBOL sql_functions[] = { { "CENTROID", F_SYM(FUNC_ARG1),0,CREATE_FUNC_GEOM(create_func_centroid)}, { "CHAR_LENGTH", F_SYM(FUNC_ARG1),0,CREATE_FUNC(create_func_char_length)}, { "CHARACTER_LENGTH", F_SYM(FUNC_ARG1),0,CREATE_FUNC(create_func_char_length)}, - { "COALESCE", SYM(COALESCE)}, { "COERCIBILITY", F_SYM(FUNC_ARG1),0,CREATE_FUNC(create_func_coercibility)}, { "COMPRESS", F_SYM(FUNC_ARG1),0,CREATE_FUNC(create_func_compress)}, { "CONCAT", SYM(CONCAT)}, diff --git a/sql/lock.cc b/sql/lock.cc index 941d7baa76e..0aba7fddb51 100644 --- a/sql/lock.cc +++ b/sql/lock.cc @@ -72,7 +72,7 @@ TODO: #ifndef MASTER #include "../srclib/myisammrg/myrg_def.h" #else -#include "../myisammrg/myrg_def.h" +#include "../storage/myisammrg/myrg_def.h" #endif static MYSQL_LOCK *get_lock_data(THD *thd, TABLE **table,uint count, @@ -211,7 +211,6 @@ static int lock_external(THD *thd, TABLE **tables, uint count) ((*tables)->reginfo.lock_type >= TL_READ && (*tables)->reginfo.lock_type <= TL_READ_NO_INSERT)) lock_type=F_RDLCK; - if ((error=(*tables)->file->external_lock(thd,lock_type))) { print_lock_error(error, (*tables)->file->table_type()); diff --git a/sql/log.cc b/sql/log.cc index 920a3fcff42..9d9f500fe80 100644 --- a/sql/log.cc +++ b/sql/log.cc @@ -24,6 +24,7 @@ #include "mysql_priv.h" #include "sql_repl.h" +#include "rpl_filter.h" #include <my_dir.h> #include <stdarg.h> @@ -1613,10 +1614,11 @@ bool MYSQL_LOG::write(Log_event *event_info) binlog_[wild_]{do|ignore}_table?" (WL#1049)" */ if ((thd && !(thd->options & OPTION_BIN_LOG)) || - (!db_ok(local_db, binlog_do_db, binlog_ignore_db))) + (!binlog_filter->db_ok(local_db))) { VOID(pthread_mutex_unlock(&LOCK_log)); - DBUG_PRINT("error",("!db_ok('%s')", local_db)); + DBUG_PRINT("info",("db_ok('%s')==%d", local_db, + binlog_filter->db_ok(local_db))); DBUG_RETURN(0); } #endif /* HAVE_REPLICATION */ diff --git a/sql/log_event.cc b/sql/log_event.cc index 5cb4c289a10..7ee505939f0 100644 --- a/sql/log_event.cc +++ b/sql/log_event.cc @@ -23,6 +23,7 @@ #include "mysql_priv.h" #include "slave.h" +#include "rpl_filter.h" #include <my_dir.h> #endif /* MYSQL_CLIENT */ @@ -1535,7 +1536,7 @@ int Query_log_event::exec_event(struct st_relay_log_info* rli, const char *query */ thd->catalog= catalog_len ? (char *) catalog : (char *)""; thd->db_length= db_len; - thd->db= (char*) rewrite_db(db, &thd->db_length); + thd->db= (char *) rpl_filter->get_rewrite_db(db, &thd->db_length); thd->variables.auto_increment_increment= auto_increment_increment; thd->variables.auto_increment_offset= auto_increment_offset; @@ -1564,7 +1565,7 @@ int Query_log_event::exec_event(struct st_relay_log_info* rli, const char *query ::exec_event(), then the companion SET also have so we don't need to reset_one_shot_variables(). */ - if (db_ok(thd->db, replicate_do_db, replicate_ignore_db)) + if (rpl_filter->db_ok(thd->db)) { thd->set_time((time_t)when); thd->query_length= q_len_arg; @@ -2685,7 +2686,7 @@ int Load_log_event::exec_event(NET* net, struct st_relay_log_info* rli, bool use_rli_only_for_errors) { thd->db_length= db_len; - thd->db= (char*) rewrite_db(db, &thd->db_length); + thd->db= (char *) rpl_filter->get_rewrite_db(db, &thd->db_length); DBUG_ASSERT(thd->query == 0); thd->query_length= 0; // Should not be needed thd->query_error= 0; @@ -2724,7 +2725,7 @@ int Load_log_event::exec_event(NET* net, struct st_relay_log_info* rli, ::exec_event(), then the companion SET also have so we don't need to reset_one_shot_variables().
*/ - if (db_ok(thd->db, replicate_do_db, replicate_ignore_db)) + if (rpl_filter->db_ok(thd->db)) { thd->set_time((time_t)when); VOID(pthread_mutex_lock(&LOCK_thread_count)); @@ -2746,7 +2747,7 @@ int Load_log_event::exec_event(NET* net, struct st_relay_log_info* rli, tables.updating= 1; // the table will be opened in mysql_load - if (table_rules_on && !tables_ok(thd, &tables)) + if (rpl_filter->is_on() && !rpl_filter->tables_ok(thd->db, &tables)) { // TODO: this is a bug - this needs to be moved to the I/O thread if (net) diff --git a/sql/mysql_priv.h b/sql/mysql_priv.h index 0bddf92e6aa..497382d8c96 100644 --- a/sql/mysql_priv.h +++ b/sql/mysql_priv.h @@ -398,6 +398,13 @@ void debug_sync_point(const char* lock_name, uint lock_timeout); #define STRING_BUFFER_USUAL_SIZE 80 +/* + Exit codes for the ::is_equal class functions. +*/ +#define IS_EQUAL_NO 0 +#define IS_EQUAL_YES 1 +#define IS_EQUAL_PACK_LENGTH 2 + enum enum_parsing_place { NO_MATTER, @@ -626,6 +633,22 @@ bool check_table_access(THD *thd, ulong want_access, TABLE_LIST *tables, bool no_errors); bool check_global_access(THD *thd, ulong want_access); +/* + Support routine for the SQL parser on partitioning syntax +*/ +my_bool is_partition_management(LEX *lex); +/* + General routines to change field->ptr of a NULL-terminated array of Field + objects. Useful when one needs to call val_int, val_str or similar and the + field data is not in table->record[0] but in some other structure. + set_key_field_ptr changes all fields of an index using a key_info object. + All methods presume that there is at least one field to change. +*/ + +void set_field_ptr(Field **ptr, const byte *new_buf, const byte *old_buf); +void set_key_field_ptr(KEY *key_info, const byte *new_buf, + const byte *old_buf); + bool mysql_backup_table(THD* thd, TABLE_LIST* table_list); bool mysql_restore_table(THD* thd, TABLE_LIST* table_list); @@ -786,6 +809,10 @@ Field * find_field_in_table(THD *thd, TABLE *table, const char *name, uint length, bool check_grants, bool allow_rowid, uint *cached_field_index_ptr); + +Field * +find_field_in_table_sef(TABLE *table, const char *name); + #ifdef HAVE_OPENSSL #include <openssl/des.h> struct st_des_keyblock @@ -921,10 +948,10 @@ bool setup_tables(THD *thd, Name_resolution_context *context, int setup_wild(THD *thd, TABLE_LIST *tables, List<Item> &fields, List<Item> *sum_func_list, uint wild_num); bool setup_fields(THD *thd, Item** ref_pointer_array, - List<Item> &item, bool set_query_id, + List<Item> &item, ulong set_query_id, List<Item> *sum_func_list, bool allow_sum_func); inline bool setup_fields_with_no_wrap(THD *thd, Item **ref_pointer_array, - List<Item> &item, bool set_query_id, + List<Item> &item, ulong set_query_id, List<Item> *sum_func_list, bool allow_sum_func) { @@ -1044,6 +1071,7 @@ bool key_cmp_if_same(TABLE *form,const byte *key,uint index,uint key_length); void key_unpack(String *to,TABLE *form,uint index); bool check_if_key_used(TABLE *table, uint idx, List<Item> &fields); int key_cmp(KEY_PART_INFO *key_part, const byte *key, uint key_length); +int key_rec_cmp(void *key_info, byte *a, byte *b); bool init_errmessage(void); void sql_perror(const char *message); @@ -1204,7 +1232,6 @@ extern KNOWN_DATE_TIME_FORMAT known_date_time_formats[]; extern String null_string; extern HASH open_cache; extern TABLE *unused_tables; -extern I_List<i_string> binlog_do_db, binlog_ignore_db; extern const char* any_db; extern struct my_option my_long_options[]; extern const LEX_STRING view_type; @@ -1221,6 +1248,7 @@ extern
SHOW_COMP_OPTION have_query_cache; extern SHOW_COMP_OPTION have_geometry, have_rtree_keys; extern SHOW_COMP_OPTION have_crypt; extern SHOW_COMP_OPTION have_compress; +extern SHOW_COMP_OPTION have_partition_db; #ifndef __WIN__ extern pthread_t signal_thread; @@ -1274,7 +1302,7 @@ int rea_create_table(THD *thd, my_string file_name, const char *db, const char *table, HA_CREATE_INFO *create_info, List<create_field> &create_field, - uint key_count,KEY *key_info); + uint key_count,KEY *key_info, handler *file); int format_number(uint inputflag,uint max_length,my_string pos,uint length, my_string *errpos); int openfrm(THD *thd, const char *name,const char *alias,uint filestat, diff --git a/sql/mysqld.cc b/sql/mysqld.cc index f7e9b21076e..aaed7b64377 100644 --- a/sql/mysqld.cc +++ b/sql/mysqld.cc @@ -19,6 +19,7 @@ #include <my_dir.h> #include "slave.h" #include "sql_repl.h" +#include "rpl_filter.h" #include "repl_failsafe.h" #include "stacktrace.h" #include "mysqld_suffix.h" @@ -362,6 +363,12 @@ my_bool opt_ndb_shm, opt_ndb_optimized_node_selection; ulong opt_ndb_cache_check_time; const char *opt_ndb_mgmd; ulong opt_ndb_nodeid; + +const char *ndb_distribution_names[]= {"KEYHASH", "LINHASH", NullS}; +TYPELIB ndb_distribution_typelib= { array_elements(ndb_distribution_names)-1, + "", ndb_distribution_names, NULL }; +const char *opt_ndb_distribution= ndb_distribution_names[ND_KEYHASH]; +enum ndb_distribution opt_ndb_distribution_id= ND_KEYHASH; #endif my_bool opt_readonly, use_temp_pool, relay_log_purge; my_bool opt_sync_frm, opt_allow_suspicious_udfs; @@ -450,12 +457,10 @@ FILE *bootstrap_file; int bootstrap_error; FILE *stderror_file=0; -I_List<i_string_pair> replicate_rewrite_db; -I_List<i_string> replicate_do_db, replicate_ignore_db; -// allow the user to tell us which db to replicate and which to ignore -I_List<i_string> binlog_do_db, binlog_ignore_db; I_List<THD> threads; I_List<NAMED_LIST> key_caches; +Rpl_filter* rpl_filter; +Rpl_filter* binlog_filter; struct system_variables global_system_variables; struct system_variables max_system_variables; @@ -470,6 +475,7 @@ CHARSET_INFO *national_charset_info, *table_alias_charset; SHOW_COMP_OPTION have_berkeley_db, have_innodb, have_isam, have_ndbcluster, have_example_db, have_archive_db, have_csv_db; SHOW_COMP_OPTION have_federated_db; +SHOW_COMP_OPTION have_partition_db; SHOW_COMP_OPTION have_raid, have_openssl, have_symlink, have_query_cache; SHOW_COMP_OPTION have_geometry, have_rtree_keys; SHOW_COMP_OPTION have_crypt, have_compress; @@ -1084,12 +1090,9 @@ void clean_up(bool print_message) free_max_user_conn(); #ifdef HAVE_REPLICATION end_slave_list(); - free_list(&replicate_do_db); - free_list(&replicate_ignore_db); - free_list(&binlog_do_db); - free_list(&binlog_ignore_db); - free_list(&replicate_rewrite_db); #endif + delete binlog_filter; + delete rpl_filter; #ifdef HAVE_OPENSSL if (ssl_acceptor_fd) my_free((gptr) ssl_acceptor_fd, MYF(MY_ALLOW_ZERO_PTR)); @@ -3111,6 +3114,15 @@ int main(int argc, char **argv) #endif { DEBUGGER_OFF; + + rpl_filter= new Rpl_filter; + binlog_filter= new Rpl_filter; + if (!rpl_filter || !binlog_filter) + { + sql_perror("Could not allocate replication and binlog filters"); + exit(1); + } + MY_INIT(argv[0]); // init my_sys library & pthreads #ifdef _CUSTOMSTARTUPCONFIG_ @@ -3466,7 +3478,6 @@ default_service_handling(char **argv, int main(int argc, char **argv) { - /* When several instances are running on the same machine, we need to have an unique named hEventShudown through the @@ -4323,6 +4334,7 @@ enum 
options_mysqld OPT_NDB_FORCE_SEND, OPT_NDB_AUTOINCREMENT_PREFETCH_SZ, OPT_NDB_SHM, OPT_NDB_OPTIMIZED_NODE_SELECTION, OPT_NDB_CACHE_CHECK_TIME, OPT_NDB_MGMD, OPT_NDB_NODEID, + OPT_NDB_DISTRIBUTION, OPT_SKIP_SAFEMALLOC, OPT_TEMP_POOL, OPT_TX_ISOLATION, OPT_COMPLETION_TYPE, OPT_SKIP_STACK_TRACE, OPT_SKIP_SYMLINKS, @@ -4409,6 +4421,7 @@ enum options_mysqld OPT_ENABLE_SHARED_MEMORY, OPT_SHARED_MEMORY_BASE_NAME, OPT_OLD_PASSWORDS, + OPT_OLD_ALTER_TABLE, OPT_EXPIRE_LOGS_DAYS, OPT_GROUP_CONCAT_MAX_LEN, OPT_DEFAULT_COLLATION, @@ -4895,6 +4908,11 @@ Disable with --skip-ndbcluster (will save memory).", (gptr*) &global_system_variables.ndb_autoincrement_prefetch_sz, (gptr*) &global_system_variables.ndb_autoincrement_prefetch_sz, 0, GET_ULONG, REQUIRED_ARG, 32, 1, 256, 0, 0, 0}, + {"ndb-distribution", OPT_NDB_DISTRIBUTION, + "Default distribution for new tables in ndb", + (gptr*) &opt_ndb_distribution, + (gptr*) &opt_ndb_distribution, + 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, {"ndb-force-send", OPT_NDB_FORCE_SEND, "Force send of buffers to ndb immediately without waiting for " "other threads.", @@ -4941,6 +4959,11 @@ Disable with --skip-ndbcluster (will save memory).", (gptr*) &opt_no_mix_types, (gptr*) &opt_no_mix_types, 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0}, #endif + {"old-alter-table", OPT_OLD_ALTER_TABLE, + "Use old, non-optimized alter table.", + (gptr*) &global_system_variables.old_alter_table, + (gptr*) &max_system_variables.old_alter_table, 0, GET_BOOL, NO_ARG, + 0, 0, 0, 0, 0, 0}, {"old-passwords", OPT_OLD_PASSWORDS, "Use old password encryption method (needed for 4.0 and older clients).", (gptr*) &global_system_variables.old_passwords, (gptr*) &max_system_variables.old_passwords, 0, GET_BOOL, NO_ARG, @@ -6105,13 +6128,6 @@ static void mysql_init_variables(void) exit(1); multi_keycache_init(); /* set key_cache_hash.default_value = dflt_key_cache */ - /* Initialize structures that is used when processing options */ - replicate_rewrite_db.empty(); - replicate_do_db.empty(); - replicate_ignore_db.empty(); - binlog_do_db.empty(); - binlog_ignore_db.empty(); - /* Set directory paths */ strmake(language, LANGUAGE, sizeof(language)-1); strmake(mysql_real_data_home, get_relative_path(DATADIR), @@ -6144,6 +6160,7 @@ static void mysql_init_variables(void) global_system_variables.max_join_size= (ulonglong) HA_POS_ERROR; max_system_variables.max_join_size= (ulonglong) HA_POS_ERROR; global_system_variables.old_passwords= 0; + global_system_variables.old_alter_table= 0; /* Variables that depends on compile options */ #ifndef DBUG_OFF @@ -6167,6 +6184,11 @@ static void mysql_init_variables(void) #else have_example_db= SHOW_OPTION_NO; #endif +#ifdef HAVE_PARTITION_DB + have_partition_db= SHOW_OPTION_YES; +#else + have_partition_db= SHOW_OPTION_NO; +#endif #ifdef HAVE_ARCHIVE_DB have_archive_db= SHOW_OPTION_YES; #else @@ -6357,14 +6379,12 @@ get_one_option(int optid, const struct my_option *opt __attribute__((unused)), } case (int)OPT_REPLICATE_IGNORE_DB: { - i_string *db = new i_string(argument); - replicate_ignore_db.push_back(db); + rpl_filter->add_ignore_db(argument); break; } case (int)OPT_REPLICATE_DO_DB: { - i_string *db = new i_string(argument); - replicate_do_db.push_back(db); + rpl_filter->add_do_db(argument); break; } case (int)OPT_REPLICATE_REWRITE_DB: @@ -6397,71 +6417,54 @@ get_one_option(int optid, const struct my_option *opt __attribute__((unused)), exit(1); } - i_string_pair *db_pair = new i_string_pair(key, val); - replicate_rewrite_db.push_back(db_pair); +
rpl_filter->add_db_rewrite(key, val); break; } case (int)OPT_BINLOG_IGNORE_DB: { - i_string *db = new i_string(argument); - binlog_ignore_db.push_back(db); + binlog_filter->add_ignore_db(argument); break; } case (int)OPT_BINLOG_DO_DB: { - i_string *db = new i_string(argument); - binlog_do_db.push_back(db); + binlog_filter->add_do_db(argument); break; } case (int)OPT_REPLICATE_DO_TABLE: { - if (!do_table_inited) - init_table_rule_hash(&replicate_do_table, &do_table_inited); - if (add_table_rule(&replicate_do_table, argument)) + if (rpl_filter->add_do_table(argument)) { fprintf(stderr, "Could not add do table rule '%s'!\n", argument); exit(1); } - table_rules_on = 1; break; } case (int)OPT_REPLICATE_WILD_DO_TABLE: { - if (!wild_do_table_inited) - init_table_rule_array(&replicate_wild_do_table, - &wild_do_table_inited); - if (add_wild_table_rule(&replicate_wild_do_table, argument)) + if (rpl_filter->add_wild_do_table(argument)) { fprintf(stderr, "Could not add do table rule '%s'!\n", argument); exit(1); } - table_rules_on = 1; break; } case (int)OPT_REPLICATE_WILD_IGNORE_TABLE: { - if (!wild_ignore_table_inited) - init_table_rule_array(&replicate_wild_ignore_table, - &wild_ignore_table_inited); - if (add_wild_table_rule(&replicate_wild_ignore_table, argument)) + if (rpl_filter->add_wild_ignore_table(argument)) { fprintf(stderr, "Could not add ignore table rule '%s'!\n", argument); exit(1); } - table_rules_on = 1; break; } case (int)OPT_REPLICATE_IGNORE_TABLE: { - if (!ignore_table_inited) - init_table_rule_hash(&replicate_ignore_table, &ignore_table_inited); - if (add_table_rule(&replicate_ignore_table, argument)) + if (rpl_filter->add_ignore_table(argument)) { fprintf(stderr, "Could not add ignore table rule '%s'!\n", argument); exit(1); } - table_rules_on = 1; break; } #endif /* HAVE_REPLICATION */ @@ -6699,6 +6702,20 @@ get_one_option(int optid, const struct my_option *opt __attribute__((unused)), opt_ndb_constrbuf[opt_ndb_constrbuf_len]= 0; opt_ndbcluster_connectstring= opt_ndb_constrbuf; break; + case OPT_NDB_DISTRIBUTION: + int id; + if ((id= find_type(argument, &ndb_distribution_typelib, 2)) <= 0) + { + fprintf(stderr, + "Unknown ndb distribution type: '%s' " + "(should be '%s' or '%s')\n", + argument, + ndb_distribution_names[ND_KEYHASH], + ndb_distribution_names[ND_LINHASH]); + exit(1); + } + opt_ndb_distribution_id= (enum ndb_distribution)(id-1); + break; #endif case OPT_INNODB: #ifdef HAVE_INNOBASE_DB diff --git a/sql/opt_range.cc b/sql/opt_range.cc index cb250251155..b69822d201b 100644 --- a/sql/opt_range.cc +++ b/sql/opt_range.cc @@ -751,7 +751,7 @@ int QUICK_RANGE_SELECT::init() DBUG_ENTER("QUICK_RANGE_SELECT::init"); if (file->inited == handler::NONE) - DBUG_RETURN(error= file->ha_index_init(index)); + DBUG_RETURN(error= file->ha_index_init(index, 1)); error= 0; DBUG_RETURN(0); } @@ -778,9 +778,10 @@ QUICK_RANGE_SELECT::~QUICK_RANGE_SELECT() { DBUG_PRINT("info", ("Freeing separate handler %p (free=%d)", file, free_file)); - file->reset(); + file->ha_reset(); file->external_lock(current_thd, F_UNLCK); file->close(); + delete file; } } delete_dynamic(&ranges); /* ranges are allocated in alloc */ @@ -916,7 +917,7 @@ int QUICK_RANGE_SELECT::init_ror_merged_scan(bool reuse_handler) { DBUG_PRINT("info", ("Reusing handler %p", file)); if (file->extra(HA_EXTRA_KEYREAD) || - file->extra(HA_EXTRA_RETRIEVE_PRIMARY_KEY) || + file->ha_retrieve_all_pk() || init() || reset()) { DBUG_RETURN(1); @@ -944,7 +945,7 @@ int QUICK_RANGE_SELECT::init_ror_merged_scan(bool reuse_handler) goto failure; 
if (file->extra(HA_EXTRA_KEYREAD) || - file->extra(HA_EXTRA_RETRIEVE_PRIMARY_KEY) || + file->ha_retrieve_all_pk() || init() || reset()) { file->external_lock(thd, F_UNLCK); @@ -956,6 +957,8 @@ int QUICK_RANGE_SELECT::init_ror_merged_scan(bool reuse_handler) DBUG_RETURN(0); failure: + if (file) + delete file; file= save_file; DBUG_RETURN(1); } @@ -1562,9 +1565,10 @@ static int fill_used_fields_bitmap(PARAM *param) { TABLE *table= param->table; param->fields_bitmap_size= (table->s->fields/8 + 1); - uchar *tmp; + uint32 *tmp; uint pk; - if (!(tmp= (uchar*)alloc_root(param->mem_root,param->fields_bitmap_size)) || + if (!(tmp= (uint32*)alloc_root(param->mem_root, + bytes_word_aligned(param->fields_bitmap_size))) || bitmap_init(¶m->needed_fields, tmp, param->fields_bitmap_size*8, FALSE)) return 1; @@ -2307,7 +2311,7 @@ static ROR_SCAN_INFO *make_ror_scan(const PARAM *param, int idx, SEL_ARG *sel_arg) { ROR_SCAN_INFO *ror_scan; - uchar *bitmap_buf; + uint32 *bitmap_buf; uint keynr; DBUG_ENTER("make_ror_scan"); @@ -2322,8 +2326,8 @@ ROR_SCAN_INFO *make_ror_scan(const PARAM *param, int idx, SEL_ARG *sel_arg) ror_scan->sel_arg= sel_arg; ror_scan->records= param->table->quick_rows[keynr]; - if (!(bitmap_buf= (uchar*)alloc_root(param->mem_root, - param->fields_bitmap_size))) + if (!(bitmap_buf= (uint32*)alloc_root(param->mem_root, + bytes_word_aligned(param->fields_bitmap_size)))) DBUG_RETURN(NULL); if (bitmap_init(&ror_scan->covered_fields, bitmap_buf, @@ -2437,12 +2441,13 @@ static ROR_INTERSECT_INFO* ror_intersect_init(const PARAM *param) { ROR_INTERSECT_INFO *info; - uchar* buf; + uint32* buf; if (!(info= (ROR_INTERSECT_INFO*)alloc_root(param->mem_root, sizeof(ROR_INTERSECT_INFO)))) return NULL; info->param= param; - if (!(buf= (uchar*)alloc_root(param->mem_root, param->fields_bitmap_size))) + if (!(buf= (uint32*)alloc_root(param->mem_root, + bytes_word_aligned(param->fields_bitmap_size)))) return NULL; if (bitmap_init(&info->covered_fields, buf, param->fields_bitmap_size*8, FALSE)) @@ -2459,7 +2464,7 @@ void ror_intersect_cpy(ROR_INTERSECT_INFO *dst, const ROR_INTERSECT_INFO *src) { dst->param= src->param; memcpy(dst->covered_fields.bitmap, src->covered_fields.bitmap, - src->covered_fields.bitmap_size); + no_bytes_in_map(&src->covered_fields)); dst->out_rows= src->out_rows; dst->is_covering= src->is_covering; dst->index_records= src->index_records; @@ -3001,9 +3006,9 @@ TRP_ROR_INTERSECT *get_best_covering_ror_intersect(PARAM *param, /*I=set of all covering indexes */ ror_scan_mark= tree->ror_scans; - uchar buf[MAX_KEY/8+1]; + uint32 int_buf[MAX_KEY/32+1]; MY_BITMAP covered_fields; - if (bitmap_init(&covered_fields, buf, nbits, FALSE)) + if (bitmap_init(&covered_fields, int_buf, nbits, FALSE)) DBUG_RETURN(0); bitmap_clear_all(&covered_fields); @@ -5767,7 +5772,7 @@ int QUICK_INDEX_MERGE_SELECT::read_keys_and_merge() (This also creates a deficiency - it is possible that we will retrieve parts of key that are not used by current query at all.) */ - if (head->file->extra(HA_EXTRA_RETRIEVE_PRIMARY_KEY)) + if (head->file->ha_retrieve_all_pk()) DBUG_RETURN(1); cur_quick_it.rewind(); @@ -6037,7 +6042,7 @@ int QUICK_RANGE_SELECT::reset() in_range= FALSE; cur_range= (QUICK_RANGE**) ranges.buffer; - if (file->inited == handler::NONE && (error= file->ha_index_init(index))) + if (file->inited == handler::NONE && (error= file->ha_index_init(index,1))) DBUG_RETURN(error); /* Do not allocate the buffers twice. 
*/ @@ -6296,7 +6301,7 @@ int QUICK_RANGE_SELECT_GEOM::get_next() (byte*) range->min_key, range->min_length, (ha_rkey_function)(range->flag ^ GEOM_FLAG)); - if (result != HA_ERR_KEY_NOT_FOUND) + if (result != HA_ERR_KEY_NOT_FOUND && result != HA_ERR_END_OF_FILE) DBUG_RETURN(result); range=0; // Not found, to next range } @@ -6439,7 +6444,7 @@ int QUICK_SELECT_DESC::get_next() } if (result) { - if (result != HA_ERR_KEY_NOT_FOUND) + if (result != HA_ERR_KEY_NOT_FOUND && result != HA_ERR_END_OF_FILE) DBUG_RETURN(result); range=0; // Not found, to next range continue; @@ -8111,7 +8116,7 @@ int QUICK_GROUP_MIN_MAX_SELECT::reset(void) DBUG_ENTER("QUICK_GROUP_MIN_MAX_SELECT::reset"); file->extra(HA_EXTRA_KEYREAD); /* We need only the key attributes */ - result= file->ha_index_init(index); + result= file->ha_index_init(index, 1); result= file->index_last(record); if (result == HA_ERR_END_OF_FILE) DBUG_RETURN(0); @@ -8187,7 +8192,7 @@ int QUICK_GROUP_MIN_MAX_SELECT::get_next() DBUG_ASSERT(is_last_prefix <= 0); if (result == HA_ERR_KEY_NOT_FOUND) continue; - else if (result) + if (result) break; if (have_min) @@ -8217,10 +8222,11 @@ int QUICK_GROUP_MIN_MAX_SELECT::get_next() HA_READ_KEY_EXACT); result= have_min ? min_res : have_max ? max_res : result; - } - while (result == HA_ERR_KEY_NOT_FOUND && is_last_prefix != 0); + } while ((result == HA_ERR_KEY_NOT_FOUND || result == HA_ERR_END_OF_FILE) && + is_last_prefix != 0); if (result == 0) + { /* Partially mimic the behavior of end_select_send. Copy the field data from Item_field::field into Item_field::result_field @@ -8228,6 +8234,7 @@ int QUICK_GROUP_MIN_MAX_SELECT::get_next() other fields in non-ANSI SQL mode). */ copy_fields(&join->tmp_table_param); + } else if (result == HA_ERR_KEY_NOT_FOUND) result= HA_ERR_END_OF_FILE; @@ -8254,6 +8261,7 @@ int QUICK_GROUP_MIN_MAX_SELECT::get_next() RETURN 0 on success HA_ERR_KEY_NOT_FOUND if no MIN key was found that fulfills all conditions. + HA_ERR_END_OF_FILE - "" - other if some error occurred */ @@ -8307,7 +8315,7 @@ int QUICK_GROUP_MIN_MAX_SELECT::next_min() if (key_cmp(index_info->key_part, group_prefix, real_prefix_len)) key_restore(record, tmp_record, index_info, 0); } - else if (result == HA_ERR_KEY_NOT_FOUND) + else if (result == HA_ERR_KEY_NOT_FOUND || result == HA_ERR_END_OF_FILE) result= 0; /* There is a result in any case. */ } } @@ -8332,6 +8340,7 @@ int QUICK_GROUP_MIN_MAX_SELECT::next_min() RETURN 0 on success HA_ERR_KEY_NOT_FOUND if no MAX key was found that fulfills all conditions. + HA_ERR_END_OF_FILE - "" - other if some error occurred */ @@ -8432,6 +8441,7 @@ int QUICK_GROUP_MIN_MAX_SELECT::next_prefix() 0 on success HA_ERR_KEY_NOT_FOUND if there is no key with the given prefix in any of the ranges + HA_ERR_END_OF_FILE - "" - other if some error */ @@ -8476,11 +8486,12 @@ int QUICK_GROUP_MIN_MAX_SELECT::next_min_in_range() result= file->index_read(record, group_prefix, search_prefix_len, find_flag); - if ((result == HA_ERR_KEY_NOT_FOUND) && - (cur_range->flag & (EQ_RANGE | NULL_RANGE))) - continue; /* Check the next range. */ - else if (result) + if (result) { + if ((result == HA_ERR_KEY_NOT_FOUND || result == HA_ERR_END_OF_FILE) && + (cur_range->flag & (EQ_RANGE | NULL_RANGE))) + continue; /* Check the next range. */ + /* In all other cases (HA_ERR_*, HA_READ_KEY_EXACT with NO_MIN_RANGE, HA_READ_AFTER_KEY, HA_READ_KEY_OR_NEXT) if the lookup failed for this @@ -8507,7 +8518,7 @@ int QUICK_GROUP_MIN_MAX_SELECT::next_min_in_range() /* Check if record belongs to the current group. 
*/ if (key_cmp(index_info->key_part, group_prefix, real_prefix_len)) { - result = HA_ERR_KEY_NOT_FOUND; + result= HA_ERR_KEY_NOT_FOUND; continue; } @@ -8525,7 +8536,7 @@ int QUICK_GROUP_MIN_MAX_SELECT::next_min_in_range() if (!((cur_range->flag & NEAR_MAX) && (cmp_res == -1) || (cmp_res <= 0))) { - result = HA_ERR_KEY_NOT_FOUND; + result= HA_ERR_KEY_NOT_FOUND; continue; } } @@ -8564,6 +8575,7 @@ int QUICK_GROUP_MIN_MAX_SELECT::next_min_in_range() 0 on success HA_ERR_KEY_NOT_FOUND if there is no key with the given prefix in any of the ranges + HA_ERR_END_OF_FILE - "" - other if some error */ @@ -8609,10 +8621,12 @@ int QUICK_GROUP_MIN_MAX_SELECT::next_max_in_range() result= file->index_read(record, group_prefix, search_prefix_len, find_flag); - if ((result == HA_ERR_KEY_NOT_FOUND) && (cur_range->flag & EQ_RANGE)) - continue; /* Check the next range. */ if (result) { + if ((result == HA_ERR_KEY_NOT_FOUND || result == HA_ERR_END_OF_FILE) && + (cur_range->flag & EQ_RANGE)) + continue; /* Check the next range. */ + /* If no key was found with this upper bound, there certainly are no keys in the ranges to the left. diff --git a/sql/opt_sum.cc b/sql/opt_sum.cc index 33c8eadc065..9802bbddde6 100644 --- a/sql/opt_sum.cc +++ b/sql/opt_sum.cc @@ -181,7 +181,7 @@ int opt_sum_query(TABLE_LIST *tables, List<Item> &all_fields,COND *conds) const_result= 0; break; } - error= table->file->ha_index_init((uint) ref.key); + error= table->file->ha_index_init((uint) ref.key, 1); if (!ref.key_length) error= table->file->index_first(table->record[0]); @@ -253,7 +253,7 @@ int opt_sum_query(TABLE_LIST *tables, List<Item> &all_fields,COND *conds) const_result= 0; break; } - error= table->file->ha_index_init((uint) ref.key); + error= table->file->ha_index_init((uint) ref.key, 1); if (!ref.key_length) error= table->file->index_last(table->record[0]); diff --git a/sql/records.cc b/sql/records.cc index 9b05dc3e291..b3610cf1bbf 100644 --- a/sql/records.cc +++ b/sql/records.cc @@ -31,6 +31,74 @@ static int rr_cmp(uchar *a,uchar *b); /* init struct for read with info->read_record */ +/* + init_read_record is used to set up a scan using one of a number of + different methods. Which method to use is set up in this call so that + later calls to info->read_record will invoke the appropriate method + through a function pointer. + + There are five methods that relate completely to the sort function + filesort. The result of a filesort is retrieved using read_record + calls. The other two methods are used for normal table access. + + The filesort will produce references to the records sorted; these + references can be stored in memory or in a temporary file. + + The temporary file is normally used when the references don't fit into + a properly sized memory buffer. For most small queries the references + are stored in the memory buffer. + + The temporary file is also used when performing an update where a key is + modified. + + Methods used when ref's are in memory (using rr_from_pointers): + rr_unpack_from_buffer: + ---------------------- + This method is used when table->sort.addon_field is allocated. + This is allocated for most SELECT queries not involving any BLOB's. + In this case the records are fetched from a memory buffer. + rr_from_pointers: + ----------------- + Used when the above does not hold, i.e. for UPDATE, DELETE and so + forth, and for SELECTs involving BLOBs. It is also used when the + addon_field buffer is not allocated because its size would exceed the + session variable max_length_for_sort_data.
+ In this case the record data is fetched from the handler via the + saved reference, using the rnd_pos handler call. + + Methods used when ref's are in a temporary file (using rr_from_tempfile): + rr_unpack_from_tempfile: + ------------------------ + Same as rr_unpack_from_buffer except that the references are fetched + from the temporary file. This should rarely happen other than in + unusual configurations. + + rr_from_tempfile: + ----------------- + Same as rr_from_pointers except that the references are fetched from + the temporary file instead of from a memory buffer. + rr_from_cache: + -------------- + This is a special variant of rr_from_tempfile that can be used for + handlers that do not use the HA_FAST_KEY_READ table flag. Instead + of reading the references one by one from the temporary file it reads + a set of them, sorts them and reads all of them into a buffer which + is then used for a number of subsequent calls to rr_from_cache. + It is only used for SELECT queries, subject to a number of additional + conditions on table size. + + All other accesses use either index access methods (rr_quick) or a full + table scan (rr_sequential). + rr_quick: + --------- + rr_quick uses one of the QUICK_SELECT classes in opt_range.cc to + perform an index scan. There is a lot of functionality hidden + in these quick classes; they handle index scans of various kinds. + rr_sequential: + -------------- + This is the most basic access method of a table using rnd_init, + rnd_next and rnd_end. No indexes are used. +*/ void init_read_record(READ_RECORD *info,THD *thd, TABLE *table, SQL_SELECT *select, int use_record_cache, bool print_error)
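The comment above describes a function-pointer dispatch: init_read_record selects one of the rr_* methods and stores it in the READ_RECORD structure, so every caller uses the same loop regardless of the access method chosen. A condensed sketch of that calling pattern, as an editor's illustration (surrounding declarations assumed, error handling trimmed):

  READ_RECORD info;
  init_read_record(&info, thd, table, select, 1, TRUE);
  while (!info.read_record(&info))   /* rr_quick, rr_sequential, ... */
  {
    /* each successful call leaves the current row in table->record[0] */
  }
  end_read_record(&info);

Only the function pointer stored in info differs between an index scan, a full table scan and the various filesort retrieval methods.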
diff --git a/sql/repl_failsafe.cc b/sql/repl_failsafe.cc index 0b6e44c0272..855520fb2e4 100644 --- a/sql/repl_failsafe.cc +++ b/sql/repl_failsafe.cc @@ -20,6 +20,7 @@ #include "repl_failsafe.h" #include "sql_repl.h" #include "slave.h" +#include "rpl_filter.h" #include "log_event.h" #include <mysql.h> @@ -735,14 +736,14 @@ static int fetch_db_tables(THD *thd, MYSQL *mysql, const char *db, TABLE_LIST table; const char* table_name= row[0]; int error; - if (table_rules_on) + if (rpl_filter->is_on()) { bzero((char*) &table, sizeof(table)); //just for safe table.db= (char*) db; table.table_name= (char*) table_name; table.updating= 1; - if (!tables_ok(thd, &table)) + if (!rpl_filter->tables_ok(thd->db, &table)) continue; } /* download master's table and overwrite slave's table */ @@ -860,8 +861,8 @@ bool load_master_data(THD* thd) data from master */ - if (!db_ok(db, replicate_do_db, replicate_ignore_db) || - !db_ok_with_wild_table(db) || + if (!rpl_filter->db_ok(db) || + !rpl_filter->db_ok_with_wild_table(db) || !strcmp(db,"mysql")) { *cur_table_res = 0; diff --git a/sql/rpl_filter.cc b/sql/rpl_filter.cc new file mode 100644 index 00000000000..f9f8a3e98a7 --- /dev/null +++ b/sql/rpl_filter.cc @@ -0,0 +1,539 @@ +/* Copyright (C) 2000-2003 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#include "mysql_priv.h" +#include "rpl_filter.h" + +#define TABLE_RULE_HASH_SIZE 16 +#define TABLE_RULE_ARR_SIZE 16 + +Rpl_filter::Rpl_filter() : + table_rules_on(0), do_table_inited(0), ignore_table_inited(0), + wild_do_table_inited(0), wild_ignore_table_inited(0) +{ + do_db.empty(); + ignore_db.empty(); + rewrite_db.empty(); +} + + +Rpl_filter::~Rpl_filter() +{ + if (do_table_inited) + hash_free(&do_table); + if (ignore_table_inited) + hash_free(&ignore_table); + if (wild_do_table_inited) + free_string_array(&wild_do_table); + if (wild_ignore_table_inited) + free_string_array(&wild_ignore_table); + free_list(&do_db); + free_list(&ignore_db); + free_list(&rewrite_db); +} + + +/* + Returns true if table should be logged/replicated + + SYNOPSIS + tables_ok() + db db to use if db in TABLE_LIST is undefined for a table + tables list of tables to check + + NOTES + Changing table order in the list can lead to different results. + + Note also the order of precedence of do/ignore rules (see code). For + that reason, users should not set conflicting rules because they + may get unpredictable results (precedence order is explained in the + manual). + + If no table in the list is marked "updating", then we always + return 0, because there is no reason to execute this statement on + the slave if it updates nothing. (Currently, this can only happen if + the statement is a multi-delete (SQLCOM_DELETE_MULTI) and "tables" are + the tables in the FROM.) + + In the case of SQLCOM_DELETE_MULTI, there will be a second call to + tables_ok(), with tables having "updating==TRUE" (those after the + DELETE), so this second call will make the decision (because + all_tables_not_ok() = !tables_ok(1st_list) && + !tables_ok(2nd_list)). + + TODO + "Include all tables like "abc.%" except "%.EFG"". (Can't be done now.) + If we supported Perl regexps, we could do it with pattern: /^abc\.(?!EFG)/ + (I could not find an equivalent in the regex library MySQL uses). + + RETURN VALUES + 0 should not be logged/replicated + 1 should be logged/replicated +*/ + +bool +Rpl_filter::tables_ok(const char* db, TABLE_LIST* tables) +{ + bool some_tables_updating= 0; + DBUG_ENTER("Rpl_filter::tables_ok"); + + for (; tables; tables= tables->next_global) + { + char hash_key[2*NAME_LEN+2]; + char *end; + uint len; + + if (!tables->updating) + continue; + some_tables_updating= 1; + end= strmov(hash_key, tables->db ? tables->db : db); + *end++= '.'; + len= (uint) (strmov(end, tables->table_name) - hash_key); + if (do_table_inited) // if there are any do's + { + if (hash_search(&do_table, (byte*) hash_key, len)) + DBUG_RETURN(1); + } + if (ignore_table_inited) // if there are any ignores + { + if (hash_search(&ignore_table, (byte*) hash_key, len)) + DBUG_RETURN(0); + } + if (wild_do_table_inited && + find_wild(&wild_do_table, hash_key, len)) + DBUG_RETURN(1); + if (wild_ignore_table_inited && + find_wild(&wild_ignore_table, hash_key, len)) + DBUG_RETURN(0); + } + + /* + If no table was to be updated, ignore the statement (no reason to play + it on the slave; the slave is supposed to replicate _changes_ only). + If no explicit rule found and there was a do list, do not replicate.
+ If there was no do list, go ahead + */ + DBUG_RETURN(some_tables_updating && + !do_table_inited && !wild_do_table_inited); +} + + +/* + Checks whether a db matches some do_db and ignore_db rules + + SYNOPSIS + db_ok() + db name of the db to check + + RETURN VALUES + 0 should not be logged/replicated + 1 should be logged/replicated +*/ + +bool +Rpl_filter::db_ok(const char* db) +{ + DBUG_ENTER("Rpl_filter::db_ok"); + + if (do_db.is_empty() && ignore_db.is_empty()) + DBUG_RETURN(1); // Ok to replicate if the user puts no constraints + + /* + If the user has specified restrictions on which databases to replicate + and db was not selected, do not replicate. + */ + if (!db) + DBUG_RETURN(0); + + if (!do_db.is_empty()) // if the do list is not empty + { + I_List_iterator<i_string> it(do_db); + i_string* tmp; + + while ((tmp=it++)) + { + if (!strcmp(tmp->ptr, db)) + DBUG_RETURN(1); // match + } + DBUG_RETURN(0); + } + else // there are entries in the ignore list, otherwise we could not get here + { + I_List_iterator<i_string> it(ignore_db); + i_string* tmp; + + while ((tmp=it++)) + { + if (!strcmp(tmp->ptr, db)) + DBUG_RETURN(0); // match + } + DBUG_RETURN(1); + } +}
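To make the precedence concrete, here is a hypothetical configuration and the results the two checks above would give (an editor's sketch; the option values are invented purely for illustration):

  /* Corresponds to --replicate-do-db=sales
     and --replicate-ignore-table=sales.scratch */
  rpl_filter->add_do_db("sales");
  rpl_filter->add_ignore_table("sales.scratch");

  rpl_filter->db_ok("sales");  /* 1: "sales" is on the do list            */
  rpl_filter->db_ok("hr");     /* 0: a do list exists and "hr" is not on it */

For tables_ok() the ignore-table hash is consulted after the do-table hash, so an update touching sales.scratch returns 0 even though its database passes db_ok(); as the comment above warns, conflicting rules are best avoided.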
+ + +/* + Checks whether a db matches wild_do_table and wild_ignore_table + rules (for replication) + + SYNOPSIS + db_ok_with_wild_table() + db name of the db to check. + Is tested with check_db_name() before calling this function. + + NOTES + Here is the reason for this function. + We advise users who want to exclude a database 'db1' safely to do it + with replicate_wild_ignore_table='db1.%' instead of binlog_ignore_db or + replicate_ignore_db, because the latter two only check the selected db, + which won't work in that case: + USE db2; + UPDATE db1.t SET ... #this will be replicated and should not + whereas replicate_wild_ignore_table will work in all cases. + With replicate_wild_ignore_table, we only check tables. When + one does 'DROP DATABASE db1', tables are not involved and the + statement will be replicated, while users could expect it would not (as it + roughly means 'DROP db1.first_table, DROP db1.second_table...'). + In other words, we want to interpret 'db1.%' as "everything touching db1". + That is why we want to match 'db1' against 'db1.%' wild table rules. + + RETURN VALUES + 0 should not be logged/replicated + 1 should be logged/replicated +*/ + +bool +Rpl_filter::db_ok_with_wild_table(const char *db) +{ + DBUG_ENTER("Rpl_filter::db_ok_with_wild_table"); + + char hash_key[NAME_LEN+2]; + char *end; + int len; + end= strmov(hash_key, db); + *end++= '.'; + len= end - hash_key ; + if (wild_do_table_inited && find_wild(&wild_do_table, hash_key, len)) + { + DBUG_PRINT("return",("1")); + DBUG_RETURN(1); + } + if (wild_ignore_table_inited && find_wild(&wild_ignore_table, hash_key, len)) + { + DBUG_PRINT("return",("0")); + DBUG_RETURN(0); + } + + /* + If no explicit rule found and there was a do list, do not replicate. + If there was no do list, go ahead + */ + DBUG_PRINT("return",("db=%s,retval=%d", db, !wild_do_table_inited)); + DBUG_RETURN(!wild_do_table_inited); +} + + +bool +Rpl_filter::is_on() +{ + return table_rules_on; +} + + +int +Rpl_filter::add_do_table(const char* table_spec) +{ + DBUG_ENTER("Rpl_filter::add_do_table"); + if (!do_table_inited) + init_table_rule_hash(&do_table, &do_table_inited); + table_rules_on= 1; + DBUG_RETURN(add_table_rule(&do_table, table_spec)); +} + + +int +Rpl_filter::add_ignore_table(const char* table_spec) +{ + DBUG_ENTER("Rpl_filter::add_ignore_table"); + if (!ignore_table_inited) + init_table_rule_hash(&ignore_table, &ignore_table_inited); + table_rules_on= 1; + DBUG_RETURN(add_table_rule(&ignore_table, table_spec)); +} + + +int +Rpl_filter::add_wild_do_table(const char* table_spec) +{ + DBUG_ENTER("Rpl_filter::add_wild_do_table"); + if (!wild_do_table_inited) + init_table_rule_array(&wild_do_table, &wild_do_table_inited); + table_rules_on= 1; + DBUG_RETURN(add_wild_table_rule(&wild_do_table, table_spec)); +} + + +int +Rpl_filter::add_wild_ignore_table(const char* table_spec) +{ + DBUG_ENTER("Rpl_filter::add_wild_ignore_table"); + if (!wild_ignore_table_inited) + init_table_rule_array(&wild_ignore_table, &wild_ignore_table_inited); + table_rules_on= 1; + DBUG_RETURN(add_wild_table_rule(&wild_ignore_table, table_spec)); +} + + +void +Rpl_filter::add_db_rewrite(const char* from_db, const char* to_db) +{ + i_string_pair *db_pair = new i_string_pair(from_db, to_db); + rewrite_db.push_back(db_pair); +} + + +int +Rpl_filter::add_table_rule(HASH* h, const char* table_spec) +{ + const char* dot = strchr(table_spec, '.'); + if (!dot) return 1; + // len is always > 0 because we know that there exists a '.' + uint len = (uint)strlen(table_spec); + TABLE_RULE_ENT* e = (TABLE_RULE_ENT*)my_malloc(sizeof(TABLE_RULE_ENT) + + len, MYF(MY_WME)); + if (!e) return 1; + e->db= (char*)e + sizeof(TABLE_RULE_ENT); + e->tbl_name= e->db + (dot - table_spec) + 1; + e->key_len= len; + memcpy(e->db, table_spec, len); + + return my_hash_insert(h, (byte*)e); +} + + +/* + Add table expression with wildcards to dynamic array +*/ + +int +Rpl_filter::add_wild_table_rule(DYNAMIC_ARRAY* a, const char* table_spec) +{ + const char* dot = strchr(table_spec, '.'); + if (!dot) return 1; + uint len = (uint)strlen(table_spec); + TABLE_RULE_ENT* e = (TABLE_RULE_ENT*)my_malloc(sizeof(TABLE_RULE_ENT) + + len, MYF(MY_WME)); + if (!e) return 1; + e->db= (char*)e + sizeof(TABLE_RULE_ENT); + e->tbl_name= e->db + (dot - table_spec) + 1; + e->key_len= len; + memcpy(e->db, table_spec, len); + insert_dynamic(a, (gptr)&e); + return 0; +} + + +void +Rpl_filter::add_do_db(const char* table_spec) +{ + DBUG_ENTER("Rpl_filter::add_do_db"); + i_string *db = new i_string(table_spec); + do_db.push_back(db); +} + + +void +Rpl_filter::add_ignore_db(const char* table_spec) +{ + DBUG_ENTER("Rpl_filter::add_ignore_db"); + i_string *db = new i_string(table_spec); + ignore_db.push_back(db); +} + + +static byte* get_table_key(const byte* a, uint* len, + my_bool __attribute__((unused))) +{ + TABLE_RULE_ENT *e= (TABLE_RULE_ENT *) a; + + *len= e->key_len; + return (byte*)e->db; +} + + +static void free_table_ent(void* a) +{ + TABLE_RULE_ENT *e= (TABLE_RULE_ENT *) a; + + my_free((gptr) e, MYF(0)); +} + + +void +Rpl_filter::init_table_rule_hash(HASH* h, bool* h_inited) +{ + hash_init(h, system_charset_info,TABLE_RULE_HASH_SIZE,0,0, + get_table_key, free_table_ent, 0); + *h_inited = 1; +} + + +void
+Rpl_filter::init_table_rule_array(DYNAMIC_ARRAY* a, bool* a_inited) +{ + my_init_dynamic_array(a, sizeof(TABLE_RULE_ENT*), TABLE_RULE_ARR_SIZE, + TABLE_RULE_ARR_SIZE); + *a_inited = 1; +} + + +TABLE_RULE_ENT* +Rpl_filter::find_wild(DYNAMIC_ARRAY *a, const char* key, int len) +{ + uint i; + const char* key_end= key + len; + + for (i= 0; i < a->elements; i++) + { + TABLE_RULE_ENT* e ; + get_dynamic(a, (gptr)&e, i); + if (!my_wildcmp(system_charset_info, key, key_end, + (const char*)e->db, + (const char*)(e->db + e->key_len), + '\\',wild_one,wild_many)) + return e; + } + + return 0; +} + + +void +Rpl_filter::free_string_array(DYNAMIC_ARRAY *a) +{ + uint i; + for (i= 0; i < a->elements; i++) + { + char* p; + get_dynamic(a, (gptr) &p, i); + my_free(p, MYF(MY_WME)); + } + delete_dynamic(a); +} + + +/* + Builds a String from a HASH of TABLE_RULE_ENT. Cannot be used for any other + hash, as it assumes that the hash entries are TABLE_RULE_ENT. + + SYNOPSIS + table_rule_ent_hash_to_str() + s pointer to the String to fill + h pointer to the HASH to read + + RETURN VALUES + none +*/ + +void +Rpl_filter::table_rule_ent_hash_to_str(String* s, HASH* h) +{ + s->length(0); + for (uint i= 0; i < h->records; i++) + { + TABLE_RULE_ENT* e= (TABLE_RULE_ENT*) hash_element(h, i); + if (s->length()) + s->append(','); + s->append(e->db,e->key_len); + } +} + + +void +Rpl_filter::table_rule_ent_dynamic_array_to_str(String* s, DYNAMIC_ARRAY* a) +{ + s->length(0); + for (uint i= 0; i < a->elements; i++) + { + TABLE_RULE_ENT* e; + get_dynamic(a, (gptr)&e, i); + if (s->length()) + s->append(','); + s->append(e->db,e->key_len); + } +} + + +void +Rpl_filter::get_do_table(String* str) +{ + table_rule_ent_hash_to_str(str, &do_table); +} + + +void +Rpl_filter::get_ignore_table(String* str) +{ + table_rule_ent_hash_to_str(str, &ignore_table); +} + + +void +Rpl_filter::get_wild_do_table(String* str) +{ + table_rule_ent_dynamic_array_to_str(str, &wild_do_table); +} + + +void +Rpl_filter::get_wild_ignore_table(String* str) +{ + table_rule_ent_dynamic_array_to_str(str, &wild_ignore_table); +} + + +const char* +Rpl_filter::get_rewrite_db(const char* db, uint32 *new_len) +{ + if (rewrite_db.is_empty() || !db) + return db; + I_List_iterator<i_string_pair> it(rewrite_db); + i_string_pair* tmp; + + while ((tmp=it++)) + { + if (!strcmp(tmp->key, db)) + { + *new_len= strlen(tmp->val); + return tmp->val; + } + } + return db; +} + + +I_List<i_string>* +Rpl_filter::get_do_db() +{ + return &do_db; +} + + +I_List<i_string>* +Rpl_filter::get_ignore_db() +{ + return &ignore_db; +} diff --git a/sql/rpl_filter.h b/sql/rpl_filter.h new file mode 100644 index 00000000000..cfcb3b43607 --- /dev/null +++ b/sql/rpl_filter.h @@ -0,0 +1,113 @@ +/* Copyright (C) 2000-2003 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +#ifndef RPL_FILTER_H +#define RPL_FILTER_H + +#include "mysql.h" +#include "my_list.h" + +typedef struct st_table_rule_ent +{ + char* db; + char* tbl_name; + uint key_len; +} TABLE_RULE_ENT; + +/* + Rpl_filter + + Inclusion and exclusion rules of tables and databases. + Also handles rewrites of db. + Used for replication and binlogging. + */ +class Rpl_filter +{ +public: + Rpl_filter(); + ~Rpl_filter(); + Rpl_filter(Rpl_filter const&); + Rpl_filter& operator=(Rpl_filter const&); + + /* Checks - returns true if ok to replicate/log */ + + bool tables_ok(const char* db, TABLE_LIST* tables); + bool db_ok(const char* db); + bool db_ok_with_wild_table(const char *db); + + bool is_on(); + + /* Setters - add filtering rules */ + + int add_do_table(const char* table_spec); + int add_ignore_table(const char* table_spec); + + int add_wild_do_table(const char* table_spec); + int add_wild_ignore_table(const char* table_spec); + + void add_do_db(const char* db_spec); + void add_ignore_db(const char* db_spec); + + void add_db_rewrite(const char* from_db, const char* to_db); + + /* Getters - to get information about current rules */ + + void get_do_table(String* str); + void get_ignore_table(String* str); + + void get_wild_do_table(String* str); + void get_wild_ignore_table(String* str); + + const char* get_rewrite_db(const char* db, uint32 *new_len); + + I_List<i_string>* get_do_db(); + I_List<i_string>* get_ignore_db(); + +private: + bool table_rules_on; + + void init_table_rule_hash(HASH* h, bool* h_inited); + void init_table_rule_array(DYNAMIC_ARRAY* a, bool* a_inited); + + int add_table_rule(HASH* h, const char* table_spec); + int add_wild_table_rule(DYNAMIC_ARRAY* a, const char* table_spec); + + void free_string_array(DYNAMIC_ARRAY *a); + + void table_rule_ent_hash_to_str(String* s, HASH* h); + void table_rule_ent_dynamic_array_to_str(String* s, DYNAMIC_ARRAY* a); + TABLE_RULE_ENT* find_wild(DYNAMIC_ARRAY *a, const char* key, int len); + + HASH do_table; + HASH ignore_table; + DYNAMIC_ARRAY wild_do_table; + DYNAMIC_ARRAY wild_ignore_table; + + bool do_table_inited; + bool ignore_table_inited; + bool wild_do_table_inited; + bool wild_ignore_table_inited; + + I_List<i_string> do_db; + I_List<i_string> ignore_db; + + I_List<i_string_pair> rewrite_db; +}; + +extern Rpl_filter *rpl_filter; +extern Rpl_filter *binlog_filter; + +#endif // RPL_FILTER_H
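The header above completes the Rpl_filter API. As a compressed, editor-assembled sketch of how the server wires it up (every call appears in the mysqld.cc, log.cc and log_event.cc hunks of this patch; the option values are invented for illustration):

  rpl_filter= new Rpl_filter;     /* slave-side replicate-* rules */
  binlog_filter= new Rpl_filter;  /* binlog-do/ignore-db rules    */

  /* Option parsing now delegates straight to the filter objects: */
  rpl_filter->add_wild_ignore_table("db1.%"); /* --replicate-wild-ignore-table */
  binlog_filter->add_ignore_db("test");       /* --binlog-ignore-db            */

  /* At run time the logging and apply paths simply ask the filters: */
  if (!binlog_filter->db_ok(local_db))
    ;  /* skip writing the event, as in MYSQL_LOG::write() */

  uint32 db_len= thd->db_length;
  thd->db= (char*) rpl_filter->get_rewrite_db(db, &db_len); /* --replicate-rewrite-db */

This replaces the scattered global lists and hashes that the slave.cc hunks below delete.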
diff --git a/sql/set_var.cc b/sql/set_var.cc index 774062dedf2..f1ab86ace51 100644 --- a/sql/set_var.cc +++ b/sql/set_var.cc @@ -282,6 +282,8 @@ sys_var_thd_ulong sys_net_retry_count("net_retry_count", &SV::net_retry_count, 0, fix_net_retry_count); sys_var_thd_bool sys_new_mode("new", &SV::new_mode); +sys_var_thd_bool sys_old_alter_table("old_alter_table", + &SV::old_alter_table); sys_var_thd_bool sys_old_passwords("old_passwords", &SV::old_passwords); sys_var_thd_ulong sys_optimizer_prune_level("optimizer_prune_level", &SV::optimizer_prune_level); @@ -636,6 +638,7 @@ sys_var *sys_variables[]= &sys_net_wait_timeout, &sys_net_write_timeout, &sys_new_mode, + &sys_old_alter_table, &sys_old_passwords, &sys_optimizer_prune_level, &sys_optimizer_search_depth, @@ -796,6 +799,7 @@ struct show_var_st init_vars[]= { {"have_isam", (char*) &have_isam, SHOW_HAVE}, {"have_ndbcluster", (char*) &have_ndbcluster, SHOW_HAVE}, {"have_openssl", (char*) &have_openssl, SHOW_HAVE}, + {"have_partition_engine", (char*) &have_partition_db, SHOW_HAVE}, {"have_query_cache", (char*) &have_query_cache, SHOW_HAVE}, {"have_raid", (char*) &have_raid, SHOW_HAVE}, {"have_rtree_keys", (char*) &have_rtree_keys, SHOW_HAVE}, @@ -912,6 +916,7 @@ struct show_var_st init_vars[]= { {sys_net_retry_count.name, (char*) &sys_net_retry_count, SHOW_SYS}, {sys_net_write_timeout.name,(char*) &sys_net_write_timeout, SHOW_SYS}, {sys_new_mode.name, (char*) &sys_new_mode, SHOW_SYS}, + {sys_old_alter_table.name, (char*) &sys_old_alter_table, SHOW_SYS}, {sys_old_passwords.name, (char*) &sys_old_passwords, SHOW_SYS}, {"open_files_limit", (char*) &open_files_limit, SHOW_LONG}, {sys_optimizer_prune_level.name, (char*) &sys_optimizer_prune_level, diff --git a/sql/set_var.h b/sql/set_var.h index 40ff4c8583f..8c1444870eb 100644 --- a/sql/set_var.h +++ b/sql/set_var.h @@ -880,6 +880,7 @@ public: /* updated in sql_acl.cc */ +extern sys_var_thd_bool sys_old_alter_table; extern sys_var_thd_bool sys_old_passwords; extern LEX_STRING default_key_cache_base; diff --git a/sql/share/errmsg.txt b/sql/share/errmsg.txt index 5f1f7035af9..681b3b5b572 100644 --- a/sql/share/errmsg.txt +++ b/sql/share/errmsg.txt @@ -5403,3 +5403,122 @@ ER_VIEW_PREVENT_UPDATE eng "The definition of table '%-.64s' prevents operation %s on table '%-.64s'." ER_PS_NO_RECURSION eng "The prepared statement contains a stored routine call that refers to that same statement. It's not allowed to execute a prepared statement in such a recursive manner" +ER_PARTITION_REQUIRES_VALUES_ERROR + eng "%s PARTITIONING requires definition of VALUES %s for each partition" + swe "%s PARTITIONering kräver definition av VALUES %s för varje partition" +ER_PARTITION_WRONG_VALUES_ERROR + eng "Only %s PARTITIONING can use VALUES %s in partition definition" + swe "Endast %s partitionering kan använda VALUES %s i definition av partitionen" +ER_PARTITION_MAXVALUE_ERROR + eng "MAXVALUE can only be used in the last partition definition" + swe "MAXVALUE kan bara användas i definitionen av den sista partitionen" +ER_PARTITION_SUBPARTITION_ERROR + eng "Subpartitions can only be hash or key partitions" + swe "Subpartitioner kan bara vara hash och key partitioner" +ER_PARTITION_WRONG_NO_PART_ERROR + eng "Wrong number of partitions defined, mismatch with previous setting" + swe "Antal partitioner definierade och antal partitioner är inte lika" +ER_PARTITION_WRONG_NO_SUBPART_ERROR + eng "Wrong number of subpartitions defined, mismatch with previous setting" + swe "Antal subpartitioner definierade och antal subpartitioner är inte lika" +ER_CONST_EXPR_IN_PARTITION_FUNC_ERROR + eng "Constant/Random expression in (sub)partitioning function is not allowed" + swe "Konstanta uttryck eller slumpmässiga uttryck är inte tillåtna (sub)partitioneringsfunktioner" +ER_NO_CONST_EXPR_IN_RANGE_OR_LIST_ERROR + eng "Expression in RANGE/LIST VALUES must be constant" + swe "Uttryck i RANGE/LIST VALUES måste vara ett konstant uttryck" +ER_FIELD_NOT_FOUND_PART_ERROR + eng "Field in list of fields for partition function not found in table" + swe "Fält i listan av fält för partitionering med key inte funnen i tabellen" +ER_LIST_OF_FIELDS_ONLY_IN_HASH_ERROR + eng "List of fields is only allowed in KEY partitions" + swe "En lista av fält är endast tillåtet för KEY partitioner" +ER_INCONSISTENT_PARTITION_INFO_ERROR + eng "The partition info in the frm file is not consistent with what can be written into the frm file" + swe "Partitioneringsinformationen i frm-filen är inte konsistent med vad som
kan skrivas i frm-filen" +ER_PARTITION_FUNC_NOT_ALLOWED_ERROR + eng "The %s function returns the wrong type" + swe "%s-funktionen returnerar felaktig typ" +ER_PARTITIONS_MUST_BE_DEFINED_ERROR + eng "For %s partitions each partition must be defined" + swe "För %s partitionering så måste varje partition definieras" +ER_RANGE_NOT_INCREASING_ERROR + eng "VALUES LESS THAN value must be strictly increasing for each partition" + swe "Värden i VALUES LESS THAN måste vara strikt växande för varje partition" +ER_INCONSISTENT_TYPE_OF_FUNCTIONS_ERROR + eng "VALUES value must be of same type as partition function" + swe "Värden i VALUES måste vara av samma typ som partitioneringsfunktionen" +ER_MULTIPLE_DEF_CONST_IN_LIST_PART_ERROR + eng "Multiple definition of same constant in list partitioning" + swe "Multipel definition av samma konstant i list partitionering" +ER_PARTITION_ENTRY_ERROR + eng "Partitioning can not be used stand-alone in query" + swe "Partitioneringssyntax kan inte användas på egen hand i en SQL-fråga" +ER_MIX_HANDLER_ERROR + eng "The mix of handlers in the partitions is not allowed in this version of MySQL" + swe "Denna mix av lagringsmotorer är inte tillåten i denna version av MySQL" +ER_PARTITION_NOT_DEFINED_ERROR + eng "For the partitioned engine it is necessary to define all %s" + swe "För partitioneringsmotorn så är det nödvändigt att definiera alla %s" +ER_TOO_MANY_PARTITIONS_ERROR + eng "Too many partitions were defined" + swe "För många partitioner definierades" +ER_SUBPARTITION_ERROR + eng "It is only possible to mix RANGE/LIST partitioning with HASH/KEY partitioning for subpartitioning" + swe "Det är endast möjligt att blanda RANGE/LIST partitionering med HASH/KEY partitionering för subpartitionering" +ER_CANT_CREATE_HANDLER_FILE + eng "Failed to create specific handler file" + swe "Misslyckades med att skapa specifik fil i lagringsmotor" +ER_BLOB_FIELD_IN_PART_FUNC_ERROR + eng "A BLOB field is not allowed in partition function" + swe "Ett BLOB-fält är inte tillåtet i partitioneringsfunktioner" +ER_CHAR_SET_IN_PART_FIELD_ERROR + eng "VARCHAR only allowed if binary collation for partition functions" + swe "VARCHAR endast tillåten med binär collation för partitioneringsfunktion" +ER_UNIQUE_KEY_NEED_ALL_FIELDS_IN_PF + eng "A %s need to include all fields in the partition function" + swe "En %s behöver inkludera alla fält i partitioneringsfunktionen för denna lagringsmotor" +ER_NO_PARTS_ERROR + eng "Number of %s = 0 is not an allowed value" + swe "Antal %s = 0 är inte ett tillåten värde" +ER_PARTITION_MGMT_ON_NONPARTITIONED + eng "Partition management on a not partitioned table is not possible" + swe "Partitioneringskommando på en opartitionerad tabell är inte möjligt" +ER_DROP_PARTITION_NON_EXISTENT + eng "Error in list of partitions to change" + swe "Fel i listan av partitioner att förändra" +ER_DROP_LAST_PARTITION + eng "Cannot remove all partitions, use DROP TABLE instead" + swe "Det är inte tillåtet att ta bort alla partitioner, använd DROP TABLE istället" +ER_COALESCE_ONLY_ON_HASH_PARTITION + eng "COALESCE PARTITION can only be used on HASH/KEY partitions" + swe "COALESCE PARTITION kan bara användas på HASH/KEY partitioner" +ER_ONLY_ON_RANGE_LIST_PARTITION + eng "%s PARTITION can only be used on RANGE/LIST partitions" + swe "%s PARTITION kan bara användas på RANGE/LIST-partitioner" +ER_ADD_PARTITION_SUBPART_ERROR + eng "Trying to Add partition(s) with wrong number of subpartitions" + swe "ADD PARTITION med fel antal subpartitioner" +ER_ADD_PARTITION_NO_NEW_PARTITION + eng 
"At least one partition must be added" + swe "Åtminstone en partition måste läggas till vid ADD PARTITION" +ER_COALESCE_PARTITION_NO_PARTITION + eng "At least one partition must be coalesced" + swe "Åtminstone en partition måste slås ihop vid COALESCE PARTITION" +ER_REORG_PARTITION_NOT_EXIST + eng "More partitions to reorganise than there are partitions" + swe "Fler partitioner att reorganisera än det finns partitioner" +ER_SAME_NAME_PARTITION + eng "All partitions must have unique names in the table" + swe "Alla partitioner i tabellen måste ha unika namn" +ER_CONSECUTIVE_REORG_PARTITIONS + eng "When reorganising a set of partitions they must be in consecutive order" + swe "När ett antal partitioner omorganiseras måste de vara i konsekutiv ordning" +ER_REORG_OUTSIDE_RANGE + eng "The new partitions cover a bigger range then the reorganised partitions do" + swe "De nya partitionerna täcker ett större intervall än de omorganiserade partitionerna" +ER_DROP_PARTITION_FAILURE + eng "Drop partition not supported in this version for this handler" +ER_DROP_PARTITION_WHEN_FK_DEFINED + eng "Cannot drop a partition when a foreign key constraint is defined on the table" + swe "Kan inte ta bort en partition när en främmande nyckel är definierad på tabellen" diff --git a/sql/slave.cc b/sql/slave.cc index d2a60076cef..757d8bc212d 100644 --- a/sql/slave.cc +++ b/sql/slave.cc @@ -22,6 +22,7 @@ #include <myisam.h> #include "slave.h" #include "sql_repl.h" +#include "rpl_filter.h" #include "repl_failsafe.h" #include <thr_alarm.h> #include <my_dir.h> @@ -36,11 +37,7 @@ typedef bool (*CHECK_KILLED_FUNC)(THD*,void*); volatile bool slave_sql_running = 0, slave_io_running = 0; char* slave_load_tmpdir = 0; MASTER_INFO *active_mi; -HASH replicate_do_table, replicate_ignore_table; -DYNAMIC_ARRAY replicate_wild_do_table, replicate_wild_ignore_table; -bool do_table_inited = 0, ignore_table_inited = 0; -bool wild_do_table_inited = 0, wild_ignore_table_inited = 0; -bool table_rules_on= 0, replicate_same_server_id; +bool replicate_same_server_id; ulonglong relay_log_space_limit = 0; /* @@ -194,20 +191,6 @@ err: } -static void free_table_ent(TABLE_RULE_ENT* e) -{ - my_free((gptr) e, MYF(0)); -} - - -static byte* get_table_key(TABLE_RULE_ENT* e, uint* len, - my_bool not_used __attribute__((unused))) -{ - *len = e->key_len; - return (byte*)e->db; -} - - /* Open the given relay log @@ -809,245 +792,6 @@ int start_slave_threads(bool need_slave_mutex, bool wait_for_start, } -void init_table_rule_hash(HASH* h, bool* h_inited) -{ - hash_init(h, system_charset_info,TABLE_RULE_HASH_SIZE,0,0, - (hash_get_key) get_table_key, - (hash_free_key) free_table_ent, 0); - *h_inited = 1; -} - - -void init_table_rule_array(DYNAMIC_ARRAY* a, bool* a_inited) -{ - my_init_dynamic_array(a, sizeof(TABLE_RULE_ENT*), TABLE_RULE_ARR_SIZE, - TABLE_RULE_ARR_SIZE); - *a_inited = 1; -} - - -static TABLE_RULE_ENT* find_wild(DYNAMIC_ARRAY *a, const char* key, int len) -{ - uint i; - const char* key_end = key + len; - - for (i = 0; i < a->elements; i++) - { - TABLE_RULE_ENT* e ; - get_dynamic(a, (gptr)&e, i); - if (!my_wildcmp(system_charset_info, key, key_end, - (const char*)e->db, - (const char*)(e->db + e->key_len), - '\\',wild_one,wild_many)) - return e; - } - - return 0; -} - - -/* - Checks whether tables match some (wild_)do_table and (wild_)ignore_table - rules (for replication) - - SYNOPSIS - tables_ok() - thd thread (SQL slave thread normally). Mustn't be null. 
- tables list of tables to check - - NOTES - Note that changing the order of the tables in the list can lead to - different results. Note also the order of precedence of the do/ignore - rules (see code below). For that reason, users should not set conflicting - rules because they may get unpredictable results (precedence order is - explained in the manual). - If no table of the list is marked "updating" (so far this can only happen - if the statement is a multi-delete (SQLCOM_DELETE_MULTI) and the "tables" - are the tables in the FROM): then we always return 0, because there is no - reason we play this statement on this slave if it updates nothing. In the - case of SQLCOM_DELETE_MULTI, there will be a second call to tables_ok(), - with tables having "updating==TRUE" (those after the DELETE), so this - second call will make the decision (because - all_tables_not_ok() = !tables_ok(1st_list) && !tables_ok(2nd_list)). - - Thought which arose from a question of a big customer "I want to include - all tables like "abc.%" except the "%.EFG"". This can't be done now. If we - supported Perl regexps we could do it with this pattern: /^abc\.(?!EFG)/ - (I could not find an equivalent in the regex library MySQL uses). - - RETURN VALUES - 0 should not be logged/replicated - 1 should be logged/replicated -*/ - -bool tables_ok(THD* thd, TABLE_LIST* tables) -{ - bool some_tables_updating= 0; - DBUG_ENTER("tables_ok"); - - /* - In routine, can't reliably pick and choose substatements, so always - replicate. - We can't reliably know if one substatement should be executed or not: - consider the case of this substatement: a SELECT on a non-replicated - constant table; if we don't execute it maybe it was going to fill a - variable which was going to be used by the next substatement to update - a replicated table? If we execute it maybe the constant non-replicated - table does not exist (and so we'll fail) while there was no need to - execute this as this SELECT does not influence replicated tables in the - rest of the routine? In other words: users are used to replicate-*-table - specifying how to handle updates to tables, these options don't say - anything about reads to tables; we can't guess. - */ - if (thd->spcont) - DBUG_RETURN(1); - - for (; tables; tables= tables->next_global) - { - char hash_key[2*NAME_LEN+2]; - char *end; - uint len; - - if (!tables->updating) - continue; - some_tables_updating= 1; - end= strmov(hash_key, tables->db ? tables->db : thd->db); - *end++= '.'; - len= (uint) (strmov(end, tables->table_name) - hash_key); - if (do_table_inited) // if there are any do's - { - if (hash_search(&replicate_do_table, (byte*) hash_key, len)) - DBUG_RETURN(1); - } - if (ignore_table_inited) // if there are any ignores - { - if (hash_search(&replicate_ignore_table, (byte*) hash_key, len)) - DBUG_RETURN(0); - } - if (wild_do_table_inited && find_wild(&replicate_wild_do_table, - hash_key, len)) - DBUG_RETURN(1); - if (wild_ignore_table_inited && find_wild(&replicate_wild_ignore_table, - hash_key, len)) - DBUG_RETURN(0); - } - - /* - If no table was to be updated, ignore statement (no reason we play it on - slave, slave is supposed to replicate _changes_ only). - If no explicit rule found and there was a do list, do not replicate.
- If there was no do list, go ahead - */ - DBUG_RETURN(some_tables_updating && - !do_table_inited && !wild_do_table_inited); -} - - -/* - Checks whether a db matches wild_do_table and wild_ignore_table - rules (for replication) - - SYNOPSIS - db_ok_with_wild_table() - db name of the db to check. - Is tested with check_db_name() before calling this function. - - NOTES - Here is the reason for this function. - We advise users who want to exclude a database 'db1' safely to do it - with replicate_wild_ignore_table='db1.%' instead of binlog_ignore_db or - replicate_ignore_db because the latter two only check the selected db, - which won't work in that case: - USE db2; - UPDATE db1.t SET ... #this will be replicated and should not - whereas replicate_wild_ignore_table will work in all cases. - With replicate_wild_ignore_table, we only check tables. When - one does 'DROP DATABASE db1', tables are not involved and the - statement will be replicated, while users could expect it would not (as it - roughly means 'DROP db1.first_table, DROP db1.second_table...'). - In other words, we want to interpret 'db1.%' as "everything touching db1". - That is why we want to match 'db1' against 'db1.%' wild table rules. - - RETURN VALUES - 0 should not be logged/replicated - 1 should be logged/replicated - */ - -int db_ok_with_wild_table(const char *db) -{ - char hash_key[NAME_LEN+2]; - char *end; - int len; - end= strmov(hash_key, db); - *end++= '.'; - len= end - hash_key ; - if (wild_do_table_inited && find_wild(&replicate_wild_do_table, - hash_key, len)) - return 1; - if (wild_ignore_table_inited && find_wild(&replicate_wild_ignore_table, - hash_key, len)) - return 0; - - /* - If no explicit rule found and there was a do list, do not replicate. - If there was no do list, go ahead - */ - return !wild_do_table_inited; -} - - -int add_table_rule(HASH* h, const char* table_spec) -{ - const char* dot = strchr(table_spec, '.'); - if (!dot) return 1; - // len is always > 0 because we know that there exists a '.'
- uint len = (uint)strlen(table_spec); - TABLE_RULE_ENT* e = (TABLE_RULE_ENT*)my_malloc(sizeof(TABLE_RULE_ENT) - + len, MYF(MY_WME)); - if (!e) return 1; - e->db = (char*)e + sizeof(TABLE_RULE_ENT); - e->tbl_name = e->db + (dot - table_spec) + 1; - e->key_len = len; - memcpy(e->db, table_spec, len); - (void)my_hash_insert(h, (byte*)e); - return 0; -} - - -/* - Add table expression with wildcards to dynamic array -*/ - -int add_wild_table_rule(DYNAMIC_ARRAY* a, const char* table_spec) -{ - const char* dot = strchr(table_spec, '.'); - if (!dot) return 1; - uint len = (uint)strlen(table_spec); - TABLE_RULE_ENT* e = (TABLE_RULE_ENT*)my_malloc(sizeof(TABLE_RULE_ENT) - + len, MYF(MY_WME)); - if (!e) return 1; - e->db = (char*)e + sizeof(TABLE_RULE_ENT); - e->tbl_name = e->db + (dot - table_spec) + 1; - e->key_len = len; - memcpy(e->db, table_spec, len); - insert_dynamic(a, (gptr)&e); - return 0; -} - - -static void free_string_array(DYNAMIC_ARRAY *a) -{ - uint i; - for (i = 0; i < a->elements; i++) - { - char* p; - get_dynamic(a, (gptr) &p, i); - my_free(p, MYF(MY_WME)); - } - delete_dynamic(a); -} - - #ifdef NOT_USED_YET static int end_slave_on_walk(MASTER_INFO* mi, gptr /*unused*/) { @@ -1083,14 +827,6 @@ void end_slave() */ terminate_slave_threads(active_mi,SLAVE_FORCE_ALL); end_master_info(active_mi); - if (do_table_inited) - hash_free(&replicate_do_table); - if (ignore_table_inited) - hash_free(&replicate_ignore_table); - if (wild_do_table_inited) - free_string_array(&replicate_wild_do_table); - if (wild_ignore_table_inited) - free_string_array(&replicate_wild_ignore_table); delete active_mi; active_mi= 0; } @@ -1170,24 +906,6 @@ bool net_request_file(NET* net, const char* fname) } -const char *rewrite_db(const char* db, uint32 *new_len) -{ - if (replicate_rewrite_db.is_empty() || !db) - return db; - I_List_iterator<i_string_pair> it(replicate_rewrite_db); - i_string_pair* tmp; - - while ((tmp=it++)) - { - if (!strcmp(tmp->key, db)) - { - *new_len= (uint32)strlen(tmp->val); - return tmp->val; - } - } - return db; -} - /* From other comments and tests in code, it looks like sometimes Query_log_event and Load_log_event can have db == 0 @@ -1200,60 +918,6 @@ const char *print_slave_db_safe(const char* db) return (db ? db : ""); } -/* - Checks whether a db matches some do_db and ignore_db rules - (for logging or replication) - - SYNOPSIS - db_ok() - db name of the db to check - do_list either binlog_do_db or replicate_do_db - ignore_list either binlog_ignore_db or replicate_ignore_db - - RETURN VALUES - 0 should not be logged/replicated - 1 should be logged/replicated -*/ - -int db_ok(const char* db, I_List<i_string> &do_list, - I_List<i_string> &ignore_list ) -{ - if (do_list.is_empty() && ignore_list.is_empty()) - return 1; // ok to replicate if the user puts no constraints - - /* - If the user has specified restrictions on which databases to replicate - and db was not selected, do not replicate. 
- */ - if (!db) - return 0; - - if (!do_list.is_empty()) // if the do's are not empty - { - I_List_iterator<i_string> it(do_list); - i_string* tmp; - - while ((tmp=it++)) - { - if (!strcmp(tmp->ptr, db)) - return 1; // match - } - return 0; - } - else // there are some elements in the don't, otherwise we cannot get here - { - I_List_iterator<i_string> it(ignore_list); - i_string* tmp; - - while ((tmp=it++)) - { - if (!strcmp(tmp->ptr, db)) - return 0; // match - } - return 1; - } -} - static int init_strvar_from_file(char *var, int max_size, IO_CACHE *f, const char *default_val) @@ -2262,48 +1926,6 @@ int register_slave_on_master(MYSQL* mysql) } -/* - Builds a String from a HASH of TABLE_RULE_ENT. Cannot be used for any other - hash, as it assumes that the hash entries are TABLE_RULE_ENT. - - SYNOPSIS - table_rule_ent_hash_to_str() - s pointer to the String to fill - h pointer to the HASH to read - - RETURN VALUES - none -*/ - -void table_rule_ent_hash_to_str(String* s, HASH* h) -{ - s->length(0); - for (uint i=0 ; i < h->records ; i++) - { - TABLE_RULE_ENT* e= (TABLE_RULE_ENT*) hash_element(h, i); - if (s->length()) - s->append(','); - s->append(e->db,e->key_len); - } -} - -/* - Mostly the same thing as above -*/ - -void table_rule_ent_dynamic_array_to_str(String* s, DYNAMIC_ARRAY* a) -{ - s->length(0); - for (uint i=0 ; i < a->elements ; i++) - { - TABLE_RULE_ENT* e; - get_dynamic(a, (gptr)&e, i); - if (s->length()) - s->append(','); - s->append(e->db,e->key_len); - } -} - bool show_master_info(THD* thd, MASTER_INFO* mi) { // TODO: fix this for multi-master @@ -2399,23 +2021,18 @@ bool show_master_info(THD* thd, MASTER_INFO* mi) protocol->store(mi->slave_running == MYSQL_SLAVE_RUN_CONNECT ? "Yes" : "No", &my_charset_bin); protocol->store(mi->rli.slave_running ? "Yes":"No", &my_charset_bin); - protocol->store(&replicate_do_db); - protocol->store(&replicate_ignore_db); - /* - We can't directly use some protocol->store for - replicate_*_table, - as Protocol doesn't know the TABLE_RULE_ENT struct. - We first build Strings and then pass them to protocol->store. 
- */ + protocol->store(rpl_filter->get_do_db()); + protocol->store(rpl_filter->get_ignore_db()); + char buf[256]; String tmp(buf, sizeof(buf), &my_charset_bin); - table_rule_ent_hash_to_str(&tmp, &replicate_do_table); + rpl_filter->get_do_table(&tmp); protocol->store(&tmp); - table_rule_ent_hash_to_str(&tmp, &replicate_ignore_table); + rpl_filter->get_ignore_table(&tmp); protocol->store(&tmp); - table_rule_ent_dynamic_array_to_str(&tmp, &replicate_wild_do_table); + rpl_filter->get_wild_do_table(&tmp); protocol->store(&tmp); - table_rule_ent_dynamic_array_to_str(&tmp, &replicate_wild_ignore_table); + rpl_filter->get_wild_ignore_table(&tmp); protocol->store(&tmp); protocol->store((uint32) mi->rli.last_slave_errno); @@ -3888,10 +3505,8 @@ static int process_io_create_file(MASTER_INFO* mi, Create_file_log_event* cev) if (unlikely(!cev->is_valid())) DBUG_RETURN(1); - /* - TODO: fix to honor table rules, not only db rules - */ - if (!db_ok(cev->db, replicate_do_db, replicate_ignore_db)) + + if (!rpl_filter->db_ok(cev->db)) { skip_load_data_infile(net); DBUG_RETURN(0); diff --git a/sql/slave.h b/sql/slave.h index c41234ab2ed..ead1aa87ce6 100644 --- a/sql/slave.h +++ b/sql/slave.h @@ -21,6 +21,8 @@ #include "mysql.h" #include "my_list.h" +#include "rpl_filter.h" + #define SLAVE_NET_TIMEOUT 3600 #define MAX_SLAVE_ERRMSG 1024 #define MAX_SLAVE_ERROR 2000 @@ -461,15 +463,6 @@ typedef struct st_master_info int queue_event(MASTER_INFO* mi,const char* buf,ulong event_len); -typedef struct st_table_rule_ent -{ - char* db; - char* tbl_name; - uint key_len; -} TABLE_RULE_ENT; - -#define TABLE_RULE_HASH_SIZE 16 -#define TABLE_RULE_ARR_SIZE 16 #define MAX_SLAVE_ERRMSG 1024 #define RPL_LOG_NAME (rli->group_master_log_name[0] ? rli->group_master_log_name :\ @@ -523,27 +516,9 @@ int mysql_table_dump(THD* thd, const char* db, int fetch_master_table(THD* thd, const char* db_name, const char* table_name, MASTER_INFO* mi, MYSQL* mysql, bool overwrite); -void table_rule_ent_hash_to_str(String* s, HASH* h); -void table_rule_ent_dynamic_array_to_str(String* s, DYNAMIC_ARRAY* a); bool show_master_info(THD* thd, MASTER_INFO* mi); bool show_binlog_info(THD* thd); -/* See if the query uses any tables that should not be replicated */ -bool tables_ok(THD* thd, TABLE_LIST* tables); - -/* - Check to see if the database is ok to operate on with respect to the - do and ignore lists - used in replication -*/ -int db_ok(const char* db, I_List<i_string> &do_list, - I_List<i_string> &ignore_list ); -int db_ok_with_wild_table(const char *db); - -int add_table_rule(HASH* h, const char* table_spec); -int add_wild_table_rule(DYNAMIC_ARRAY* a, const char* table_spec); -void init_table_rule_hash(HASH* h, bool* h_inited); -void init_table_rule_array(DYNAMIC_ARRAY* a, bool* a_inited); -const char *rewrite_db(const char* db, uint32 *new_db_len); const char *print_slave_db_safe(const char *db); int check_expected_error(THD* thd, RELAY_LOG_INFO* rli, int error_code); void skip_load_data_infile(NET* net); @@ -577,11 +552,7 @@ extern "C" pthread_handler_decl(handle_slave_sql,arg); extern bool volatile abort_loop; extern MASTER_INFO main_mi, *active_mi; /* active_mi for multi-master */ extern LIST master_list; -extern HASH replicate_do_table, replicate_ignore_table; -extern DYNAMIC_ARRAY replicate_wild_do_table, replicate_wild_ignore_table; -extern bool do_table_inited, ignore_table_inited, - wild_do_table_inited, wild_ignore_table_inited; -extern bool table_rules_on, replicate_same_server_id; +extern bool replicate_same_server_id; extern int 
disconnect_slave_event_count, abort_slave_event_count ; @@ -595,8 +566,6 @@ extern my_bool master_ssl; extern my_string master_ssl_ca, master_ssl_capath, master_ssl_cert, master_ssl_cipher, master_ssl_key; -extern I_List<i_string> replicate_do_db, replicate_ignore_db; -extern I_List<i_string_pair> replicate_rewrite_db; extern I_List<THD> threads; #endif diff --git a/sql/sp.cc b/sql/sp.cc index 016703662a5..d635d9ad728 100644 --- a/sql/sp.cc +++ b/sql/sp.cc @@ -808,7 +808,7 @@ db_show_routine_status(THD *thd, int type, const char *wild) } } - table->file->ha_index_init(0); + table->file->ha_index_init(0, 1); if ((res= table->file->index_first(table->record[0]))) { res= (res == HA_ERR_END_OF_FILE) ? 0 : SP_INTERNAL_ERROR; @@ -858,7 +858,7 @@ sp_drop_db_routines(THD *thd, char *db) goto err; ret= SP_OK; - table->file->ha_index_init(0); + table->file->ha_index_init(0, 1); if (! table->file->index_read(table->record[0], key, keylen, HA_READ_KEY_EXACT)) { diff --git a/sql/sql_acl.cc b/sql/sql_acl.cc index 4127129576b..c2413545b3a 100644 --- a/sql/sql_acl.cc +++ b/sql/sql_acl.cc @@ -27,9 +27,6 @@ #include "mysql_priv.h" #include "hash_filo.h" -#ifdef HAVE_REPLICATION -#include "sql_repl.h" //for tables_ok() -#endif #include <m_ctype.h> #include <stdarg.h> #include "sp_head.h" @@ -37,6 +34,8 @@ #ifndef NO_EMBEDDED_ACCESS_CHECKS +#define FIRST_NON_YN_FIELD 26 + class acl_entry :public hash_filo_element { public: @@ -1399,7 +1398,7 @@ bool change_password(THD *thd, const char *host, const char *user, GRANT and REVOKE are applied the slave in/exclusion rules as they are some kind of updates to the mysql.% tables. */ - if (thd->slave_thread && table_rules_on) + if (thd->slave_thread && rpl_filter->is_on()) { /* The tables must be marked "updating" so that tables_ok() takes them into @@ -1407,7 +1406,7 @@ bool change_password(THD *thd, const char *host, const char *user, */ tables.updating= 1; /* Thanks to bzero, tables.next==0 */ - if (!tables_ok(thd, &tables)) + if (!(thd->spcont || rpl_filter->tables_ok(0, &tables))) DBUG_RETURN(0); } #endif @@ -1594,7 +1593,7 @@ static bool update_user_table(THD *thd, TABLE *table, key_copy((byte *) user_key, table->record[0], table->key_info, table->key_info->key_length); - table->file->extra(HA_EXTRA_RETRIEVE_ALL_COLS); + table->file->ha_retrieve_all_cols(); if (table->file->index_read_idx(table->record[0], 0, (byte *) user_key, table->key_info->key_length, HA_READ_KEY_EXACT)) @@ -1683,7 +1682,7 @@ static int replace_user_table(THD *thd, TABLE *table, const LEX_USER &combo, key_copy(user_key, table->record[0], table->key_info, table->key_info->key_length); - table->file->extra(HA_EXTRA_RETRIEVE_ALL_COLS); + table->file->ha_retrieve_all_cols(); if (table->file->index_read_idx(table->record[0], 0, user_key, table->key_info->key_length, HA_READ_KEY_EXACT)) @@ -1816,7 +1815,7 @@ static int replace_user_table(THD *thd, TABLE *table, const LEX_USER &combo, We should NEVER delete from the user table, as a user can still use mysqld even if he doesn't have any privileges in the user table!
*/ - table->file->extra(HA_EXTRA_RETRIEVE_ALL_COLS); + table->file->ha_retrieve_all_cols(); if (cmp_record(table,record[1]) && (error=table->file->update_row(table->record[1],table->record[0]))) { // This should never happen @@ -1898,7 +1897,7 @@ static int replace_db_table(TABLE *table, const char *db, key_copy(user_key, table->record[0], table->key_info, table->key_info->key_length); - table->file->extra(HA_EXTRA_RETRIEVE_ALL_COLS); + table->file->ha_retrieve_all_cols(); if (table->file->index_read_idx(table->record[0],0, user_key, table->key_info->key_length, HA_READ_KEY_EXACT)) @@ -1934,7 +1933,7 @@ static int replace_db_table(TABLE *table, const char *db, /* update old existing row */ if (rights) { - table->file->extra(HA_EXTRA_RETRIEVE_ALL_COLS); + table->file->ha_retrieve_all_cols(); if ((error=table->file->update_row(table->record[1],table->record[0]))) goto table_error; /* purecov: deadcode */ } @@ -2113,7 +2112,7 @@ GRANT_TABLE::GRANT_TABLE(TABLE *form, TABLE *col_privs) key_copy(key, col_privs->record[0], col_privs->key_info, key_prefix_len); col_privs->field[4]->store("",0, &my_charset_latin1); - col_privs->file->ha_index_init(0); + col_privs->file->ha_index_init(0, 1); if (col_privs->file->index_read(col_privs->record[0], (byte*) key, key_prefix_len, HA_READ_KEY_EXACT)) @@ -2258,7 +2257,7 @@ static int replace_column_table(GRANT_TABLE *g_t, List_iterator <LEX_COLUMN> iter(columns); class LEX_COLUMN *column; - table->file->ha_index_init(0); + table->file->ha_index_init(0, 1); while ((column= iter++)) { ulong privileges= column->rights; @@ -2273,7 +2272,7 @@ static int replace_column_table(GRANT_TABLE *g_t, key_copy(user_key, table->record[0], table->key_info, table->key_info->key_length); - table->file->extra(HA_EXTRA_RETRIEVE_ALL_COLS); + table->file->ha_retrieve_all_cols(); if (table->file->index_read(table->record[0], user_key, table->key_info->key_length, HA_READ_KEY_EXACT)) @@ -2351,7 +2350,7 @@ static int replace_column_table(GRANT_TABLE *g_t, key_copy(user_key, table->record[0], table->key_info, key_prefix_length); - table->file->extra(HA_EXTRA_RETRIEVE_ALL_COLS); + table->file->ha_retrieve_all_cols(); if (table->file->index_read(table->record[0], user_key, key_prefix_length, HA_READ_KEY_EXACT)) @@ -2449,7 +2448,7 @@ static int replace_table_table(THD *thd, GRANT_TABLE *grant_table, key_copy(user_key, table->record[0], table->key_info, table->key_info->key_length); - table->file->extra(HA_EXTRA_RETRIEVE_ALL_COLS); + table->file->ha_retrieve_all_cols(); if (table->file->index_read_idx(table->record[0], 0, user_key, table->key_info->key_length, HA_READ_KEY_EXACT)) @@ -2766,14 +2765,14 @@ bool mysql_table_grant(THD *thd, TABLE_LIST *table_list, GRANT and REVOKE are applied the slave in/exclusion rules as they are some kind of updates to the mysql.% tables. */ - if (thd->slave_thread && table_rules_on) + if (thd->slave_thread && rpl_filter->is_on()) { /* The tables must be marked "updating" so that tables_ok() takes them into account in tests. */ tables[0].updating= tables[1].updating= tables[2].updating= 1; - if (!tables_ok(thd, tables)) + if (!(thd->spcont || rpl_filter->tables_ok(0, tables))) DBUG_RETURN(FALSE); } #endif @@ -2973,14 +2973,14 @@ bool mysql_routine_grant(THD *thd, TABLE_LIST *table_list, bool is_proc, GRANT and REVOKE are applied the slave in/exclusion rules as they are some kind of updates to the mysql.% tables.
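(Editorial illustration, not part of this commit: on a slave running with --replicate-wild-ignore-table=mysql.%, rpl_filter->tables_ok() returns FALSE for these tables once they are marked "updating", so the replicated GRANT/REVOKE is skipped exactly as a direct update of a mysql.* table would be.)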
*/ - if (thd->slave_thread && table_rules_on) + if (thd->slave_thread && rpl_filter->is_on()) { /* The tables must be marked "updating" so that tables_ok() takes them into account in tests. */ tables[0].updating= tables[1].updating= 1; - if (!tables_ok(thd, tables)) + if (!(thd->spcont || rpl_filter->tables_ok(0, tables))) DBUG_RETURN(FALSE); } #endif @@ -3104,14 +3104,14 @@ bool mysql_grant(THD *thd, const char *db, List <LEX_USER> &list, GRANT and REVOKE are applied the slave in/exclusion rules as they are some kind of updates to the mysql.% tables. */ - if (thd->slave_thread && table_rules_on) + if (thd->slave_thread && rpl_filter->is_on()) { /* The tables must be marked "updating" so that tables_ok() takes them into account in tests. */ tables[0].updating= tables[1].updating= 1; - if (!tables_ok(thd, tables)) + if (!(thd->spcont || rpl_filter->tables_ok(0, tables))) DBUG_RETURN(FALSE); } #endif @@ -3254,8 +3254,8 @@ static my_bool grant_load(TABLE_LIST *tables) t_table = tables[0].table; c_table = tables[1].table; p_table= tables[2].table; - t_table->file->ha_index_init(0); - p_table->file->ha_index_init(0); + t_table->file->ha_index_init(0, 1); + p_table->file->ha_index_init(0, 1); if (!t_table->file->index_first(t_table->record[0])) { memex_ptr= &memex; @@ -4425,7 +4425,7 @@ int open_grant_tables(THD *thd, TABLE_LIST *tables) GRANT and REVOKE are applied the slave in/exclusion rules as they are some kind of updates to the mysql.% tables. */ - if (thd->slave_thread && table_rules_on) + if (thd->slave_thread && rpl_filter->is_on()) { /* The tables must be marked "updating" so that tables_ok() takes them into @@ -4433,7 +4433,7 @@ int open_grant_tables(THD *thd, TABLE_LIST *tables) */ tables[0].updating=tables[1].updating=tables[2].updating= tables[3].updating=tables[4].updating=1; - if (!tables_ok(thd, tables)) + if (!(thd->spcont || rpl_filter->tables_ok(0, tables))) DBUG_RETURN(1); tables[0].updating=tables[1].updating=tables[2].updating= tables[3].updating=tables[4].updating=0; @@ -4595,7 +4595,7 @@ static int handle_grant_table(TABLE_LIST *tables, uint table_no, bool drop, user_key, key_prefix_length, HA_READ_KEY_EXACT))) { - if (error != HA_ERR_KEY_NOT_FOUND) + if (error != HA_ERR_KEY_NOT_FOUND && error != HA_ERR_END_OF_FILE) { table->file->print_error(error, MYF(0)); result= -1; diff --git a/sql/sql_acl.h b/sql/sql_acl.h index 6e6f33e68c2..50aa35e8cc7 100644 --- a/sql/sql_acl.h +++ b/sql/sql_acl.h @@ -14,6 +14,8 @@ along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include "slave.h" // for tables_ok(), rpl_filter + #define SELECT_ACL (1L << 0) #define INSERT_ACL (1L << 1) #define UPDATE_ACL (1L << 2) @@ -50,7 +52,6 @@ */ #define EXTRA_ACL (1L << 29) #define NO_ACCESS (1L << 30) - #define DB_ACLS \ (UPDATE_ACL | SELECT_ACL | INSERT_ACL | DELETE_ACL | CREATE_ACL | DROP_ACL | \ GRANT_ACL | REFERENCES_ACL | INDEX_ACL | ALTER_ACL | CREATE_TMP_ACL | \ diff --git a/sql/sql_base.cc b/sql/sql_base.cc index 766840e667c..105cd6a6670 100644 --- a/sql/sql_base.cc +++ b/sql/sql_base.cc @@ -565,7 +565,7 @@ bool close_thread_table(THD *thd, TABLE **table_ptr) else { // Free memory and reset for next loop - table->file->reset(); + table->file->ha_reset(); } table->in_use=0; if (unused_tables) @@ -2566,15 +2566,20 @@ static void update_field_dependencies(THD *thd, Field *field, TABLE *table) { if (thd->set_query_id) { + table->file->ha_set_bit_in_rw_set(field->fieldnr, + (bool)(thd->set_query_id-1)); if
(field->query_id != thd->query_id) { + if (table->get_fields_in_item_tree) + field->flags|= GET_FIXED_FIELDS_FLAG; field->query_id= thd->query_id; table->used_fields++; table->used_keys.intersect(field->part_of_key); } else thd->dupp_field= field; - } + } else if (table->get_fields_in_item_tree) + field->flags|= GET_FIXED_FIELDS_FLAG; } @@ -2977,6 +2982,42 @@ find_field_in_table_ref(THD *thd, TABLE_LIST *table_list, /* + Find field in table, no side effects, only purpose is to check for field + in table object and get reference to the field if found. + + SYNOPSIS + find_field_in_table_sef() + + table table where to find + name Name of field searched for + + RETURN + 0 field is not found + # pointer to field +*/ + +Field *find_field_in_table_sef(TABLE *table, const char *name) +{ + Field **field_ptr; + if (table->s->name_hash.records) + field_ptr= (Field**)hash_search(&table->s->name_hash,(byte*) name, + strlen(name)); + else + { + if (!(field_ptr= table->field)) + return (Field *)0; + for (; *field_ptr; ++field_ptr) + if (!my_strcasecmp(system_charset_info, (*field_ptr)->field_name, name)) + break; + } + if (field_ptr) + return *field_ptr; + else + return (Field *)0; +} + + +/* Find field in table list. SYNOPSIS @@ -3642,15 +3683,19 @@ mark_common_columns(THD *thd, TABLE_LIST *table_ref_1, TABLE_LIST *table_ref_2, if (field_1) { + TABLE *table_1= nj_col_1->table_ref->table; /* Mark field_1 used for table cache. */ field_1->query_id= thd->query_id; - nj_col_1->table_ref->table->used_keys.intersect(field_1->part_of_key); + table_1->file->ha_set_bit_in_read_set(field_1->fieldnr); + table_1->used_keys.intersect(field_1->part_of_key); } if (field_2) { + TABLE *table_2= nj_col_2->table_ref->table; /* Mark field_2 used for table cache. */ field_2->query_id= thd->query_id; - nj_col_2->table_ref->table->used_keys.intersect(field_2->part_of_key); + table_2->file->ha_set_bit_in_read_set(field_2->fieldnr); + table_2->used_keys.intersect(field_2->part_of_key); } if (using_fields != NULL) @@ -4130,11 +4175,11 @@ int setup_wild(THD *thd, TABLE_LIST *tables, List<Item> &fields, ****************************************************************************/ bool setup_fields(THD *thd, Item **ref_pointer_array, - List<Item> &fields, bool set_query_id, + List<Item> &fields, ulong set_query_id, List<Item> *sum_func_list, bool allow_sum_func) { reg2 Item *item; - bool save_set_query_id= thd->set_query_id; + ulong save_set_query_id= thd->set_query_id; List_iterator<Item> it(fields); DBUG_ENTER("setup_fields"); @@ -4510,6 +4555,7 @@ insert_fields(THD *thd, Name_resolution_context *context, const char *db_name, if (field->query_id == thd->query_id) thd->dupp_field= field; field->query_id= thd->query_id; + field->table->file->ha_set_bit_in_read_set(field->fieldnr); if (table) table->used_keys.intersect(field->part_of_key); @@ -4550,7 +4596,10 @@ insert_fields(THD *thd, Name_resolution_context *context, const char *db_name, For NATURAL joins, used_tables is updated in the IF above. 
*/ if (table) + { table->used_fields= table->s->fields; + table->file->ha_set_all_bits_in_read_set(); + } } if (found) DBUG_RETURN(FALSE); diff --git a/sql/sql_bitmap.h b/sql/sql_bitmap.h index 0f5b6dcd35e..35c501ede56 100644 --- a/sql/sql_bitmap.h +++ b/sql/sql_bitmap.h @@ -25,7 +25,7 @@ template <uint default_width> class Bitmap { MY_BITMAP map; - uchar buffer[(default_width+7)/8]; + uint32 buffer[(default_width+31)/32]; public: Bitmap() { init(); } Bitmap(const Bitmap& from) { *this=from; } @@ -48,14 +48,14 @@ public: void intersect(ulonglong map2buff) { MY_BITMAP map2; - bitmap_init(&map2, (uchar *)&map2buff, sizeof(ulonglong)*8, 0); + bitmap_init(&map2, (uint32 *)&map2buff, sizeof(ulonglong)*8, 0); bitmap_intersect(&map, &map2); } /* Use highest bit for all bits above sizeof(ulonglong)*8. */ void intersect_extended(ulonglong map2buff) { intersect(map2buff); - if (map.bitmap_size > sizeof(ulonglong)) + if (map.n_bits > sizeof(ulonglong) * 8) bitmap_set_above(&map, sizeof(ulonglong), test(map2buff & (LL(1) << (sizeof(ulonglong) * 8 - 1)))); } @@ -70,7 +70,7 @@ public: char *print(char *buf) const { char *s=buf; - const uchar *e=buffer, *b=e+sizeof(buffer)-1; + const uchar *e=(uchar *)buffer, *b=e+sizeof(buffer)-1; while (!*b && b>e) b--; if ((*s=_dig_vec_upper[*b >> 4]) != '0') diff --git a/sql/sql_cache.cc b/sql/sql_cache.cc index 04663c5b096..492d9b99d88 100644 --- a/sql/sql_cache.cc +++ b/sql/sql_cache.cc @@ -303,7 +303,7 @@ TODO list: #ifndef MASTER #include "../srclib/myisammrg/myrg_def.h" #else -#include "../myisammrg/myrg_def.h" +#include "../storage/myisammrg/myrg_def.h" #endif #ifdef EMBEDDED_LIBRARY diff --git a/sql/sql_class.h b/sql/sql_class.h index a0c61944c6a..5ce2f7d8847 100644 --- a/sql/sql_class.h +++ b/sql/sql_class.h @@ -465,19 +465,20 @@ public: class i_string: public ilink { public: - char* ptr; + const char* ptr; i_string():ptr(0) { } - i_string(char* s) : ptr(s) {} + i_string(const char* s) : ptr(s) {} }; /* needed for linked list of two strings for replicate-rewrite-db */ class i_string_pair: public ilink { public: - char* key; - char* val; + const char* key; + const char* val; i_string_pair():key(0),val(0) { } - i_string_pair(char* key_arg, char* val_arg) : key(key_arg),val(val_arg) {} + i_string_pair(const char* key_arg, const char* val_arg) : + key(key_arg),val(val_arg) {} }; @@ -565,6 +566,7 @@ struct system_variables my_bool ndb_use_exact_count; my_bool ndb_use_transactions; #endif /* HAVE_NDBCLUSTER_DB */ + my_bool old_alter_table; my_bool old_passwords; /* Only charset part of these variables is sensible */ @@ -774,8 +776,15 @@ public: /* - if set_query_id=1, we set field->query_id for all fields. In that case field list can not contain duplicates. + 0: Means query_id is not set and no indicator to handler of fields used + is set + 1: Means query_id is set for fields in list and bit in read set is set + to inform the handler that the field is to be read + 2: Means query_id is set for fields in list and bit is set in update set + to inform handler that it needs to update this field in write_row + and update_row */ - bool set_query_id; + ulong set_query_id; /* This variable is used in post-parse stage to declare that sum-functions, or functions which have sense only if GROUP BY is present, are allowed.
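[Editorial aside, not part of the commit: a minimal sketch of how a caller is expected to act on the three-valued set_query_id documented above. The ha_set_bit_in_read_set()/ha_set_bit_in_write_set() handler methods are the ones this changeset introduces; mark_field_used() itself is a hypothetical helper, shown only to illustrate the convention.]

  static void mark_field_used(THD *thd, TABLE *table, Field *field)
  {
    if (thd->set_query_id == 1)              /* field will be read */
      table->file->ha_set_bit_in_read_set(field->fieldnr);
    else if (thd->set_query_id == 2)         /* field will be written */
      table->file->ha_set_bit_in_write_set(field->fieldnr);
    if (thd->set_query_id)
      field->query_id= thd->query_id;        /* mark field as used by query */
  }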
diff --git a/sql/sql_delete.cc b/sql/sql_delete.cc index 7fb9f9eccdd..00d82bcdfda 100644 --- a/sql/sql_delete.cc +++ b/sql/sql_delete.cc @@ -30,7 +30,8 @@ bool mysql_delete(THD *thd, TABLE_LIST *table_list, COND *conds, SQL_LIST *order, ha_rows limit, ulonglong options, bool reset_auto_increment) { - int error; + bool will_batch; + int error, loc_error; TABLE *table; SQL_SELECT *select=0; READ_RECORD info; @@ -174,6 +175,7 @@ bool mysql_delete(THD *thd, TABLE_LIST *table_list, COND *conds, deleted=0L; init_ftfuncs(thd, select_lex, 1); thd->proc_info="updating"; + will_batch= !table->file->start_bulk_delete(); while (!(error=info.read_record(&info)) && !thd->killed && !thd->net.report_error) { @@ -189,7 +191,7 @@ bool mysql_delete(THD *thd, TABLE_LIST *table_list, COND *conds, break; } - if (!(error=table->file->delete_row(table->record[0]))) + if (!(error= table->file->delete_row(table->record[0]))) { deleted++; if (table->triggers && @@ -225,7 +227,13 @@ bool mysql_delete(THD *thd, TABLE_LIST *table_list, COND *conds, } if (thd->killed && !error) error= 1; // Aborted - thd->proc_info="end"; + if (will_batch && (loc_error= table->file->end_bulk_delete())) + { + if (error != 1) + table->file->print_error(loc_error,MYF(0)); + error=1; + } + thd->proc_info= "end"; end_read_record(&info); free_io_cache(table); // Will not do any harm if (options & OPTION_QUICK) @@ -643,7 +651,8 @@ void multi_delete::send_error(uint errcode,const char *err) int multi_delete::do_deletes() { - int local_error= 0, counter= 0; + int local_error= 0, counter= 0, error; + bool will_batch; DBUG_ENTER("do_deletes"); DBUG_ASSERT(do_delete); @@ -671,6 +680,7 @@ int multi_delete::do_deletes() been deleted by foreign key handling */ info.ignore_not_found_rows= 1; + will_batch= !table->file->start_bulk_delete(); while (!(local_error=info.read_record(&info)) && !thd->killed) { if (table->triggers && @@ -694,6 +704,14 @@ int multi_delete::do_deletes() break; } } + if (will_batch && (error= table->file->end_bulk_delete())) + { + if (!local_error) + { + local_error= error; + table->file->print_error(local_error,MYF(0)); + } + } end_read_record(&info); if (thd->killed && !local_error) local_error= 1; diff --git a/sql/sql_handler.cc b/sql/sql_handler.cc index 169132e2185..fc7d7ac0012 100644 --- a/sql/sql_handler.cc +++ b/sql/sql_handler.cc @@ -461,7 +461,7 @@ bool mysql_ha_read(THD *thd, TABLE_LIST *tables, if (keyname) { table->file->ha_index_or_rnd_end(); - table->file->ha_index_init(keyno); + table->file->ha_index_init(keyno, 1); error= table->file->index_first(table->record[0]); } else @@ -483,7 +483,7 @@ bool mysql_ha_read(THD *thd, TABLE_LIST *tables, case RLAST: DBUG_ASSERT(keyname != 0); table->file->ha_index_or_rnd_end(); - table->file->ha_index_init(keyno); + table->file->ha_index_init(keyno, 1); error= table->file->index_last(table->record[0]); mode=RPREV; break; @@ -522,7 +522,7 @@ bool mysql_ha_read(THD *thd, TABLE_LIST *tables, if (!(key= (byte*) thd->calloc(ALIGN_SIZE(key_len)))) goto err; table->file->ha_index_or_rnd_end(); - table->file->ha_index_init(keyno); + table->file->ha_index_init(keyno, 1); key_copy(key, table->record[0], table->key_info + keyno, key_len); error= table->file->index_read(table->record[0], key,key_len,ha_rkey_mode); diff --git a/sql/sql_help.cc b/sql/sql_help.cc index 799758f7d1e..e3a2602713f 100644 --- a/sql/sql_help.cc +++ b/sql/sql_help.cc @@ -286,8 +286,8 @@ int get_topics_for_keyword(THD *thd, TABLE *topics, TABLE *relations, rtopic_id= find_fields[help_relation_help_topic_id].field; 
rkey_id= find_fields[help_relation_help_keyword_id].field; - topics->file->ha_index_init(iindex_topic); - relations->file->ha_index_init(iindex_relations); + topics->file->ha_index_init(iindex_topic,1); + relations->file->ha_index_init(iindex_relations,1); rkey_id->store((longlong) key_id); rkey_id->get_key_image(buff, rkey_id->pack_length(), Field::itRAW); diff --git a/sql/sql_insert.cc b/sql/sql_insert.cc index 9421cc4bb6b..77741d0ed3c 100644 --- a/sql/sql_insert.cc +++ b/sql/sql_insert.cc @@ -103,6 +103,11 @@ static int check_insert_fields(THD *thd, TABLE_LIST *table_list, #endif clear_timestamp_auto_bits(table->timestamp_field_type, TIMESTAMP_AUTO_SET_ON_INSERT); + /* + No fields are provided so all fields must be provided in the values. + Thus we set all bits in the write set. + */ + table->file->ha_set_all_bits_in_write_set(); } else { // Part field list @@ -140,7 +145,11 @@ static int check_insert_fields(THD *thd, TABLE_LIST *table_list, */ table_list->next_local= 0; context->resolve_in_table_list_only(table_list); - res= setup_fields(thd, 0, fields, 1, 0, 0); + /* + Indicate that the fields in the list are to be updated by setting the + set_query_id parameter to 2. This sets the bit in the write_set for + each field. + */ + res= setup_fields(thd, 0, fields, 2, 0, 0); /* Restore the current context. */ table_list->next_local= save_next_local; @@ -239,9 +248,10 @@ static int check_update_fields(THD *thd, TABLE_LIST *insert_table_list, /* Check the fields we are going to modify. This will set the query_id - of all used fields to the threads query_id. + of all used fields to the thread's query_id. It will also set all + fields into the write set of this table. */ - if (setup_fields(thd, 0, update_fields, 1, 0, 0)) + if (setup_fields(thd, 0, update_fields, 2, 0, 0)) return -1; if (table->timestamp_field) @@ -251,7 +261,10 @@ static int check_update_fields(THD *thd, TABLE_LIST *insert_table_list, clear_timestamp_auto_bits(table->timestamp_field_type, TIMESTAMP_AUTO_SET_ON_UPDATE); else + { table->timestamp_field->query_id= timestamp_query_id; + table->file->ha_set_bit_in_write_set(table->timestamp_field->fieldnr); + } } return 0; @@ -687,7 +700,7 @@ static bool check_view_insertability(THD * thd, TABLE_LIST *view) Field_translator *trans; Field **field_ptr= table->field; uint used_fields_buff_size= (table->s->fields + 7) / 8; - uchar *used_fields_buff= (uchar*)thd->alloc(used_fields_buff_size); + uint32 *used_fields_buff= (uint32*)thd->alloc(used_fields_buff_size); MY_BITMAP used_fields; DBUG_ENTER("check_key_in_view"); @@ -927,7 +940,7 @@ bool mysql_prepare_insert(THD *thd, TABLE_LIST *table_list, select_lex->first_execution= 0; } if (duplic == DUP_UPDATE || duplic == DUP_REPLACE) - table->file->extra(HA_EXTRA_RETRIEVE_PRIMARY_KEY); + table->file->ha_retrieve_all_pk(); DBUG_RETURN(FALSE); } @@ -2300,7 +2313,7 @@ select_insert::~select_insert() if (table) { table->next_number_field=0; - table->file->reset(); + table->file->ha_reset(); } thd->count_cuted_fields= CHECK_FIELD_IGNORE; thd->abort_on_warning= 0; diff --git a/sql/sql_lex.cc b/sql/sql_lex.cc index e579ee9f8bd..3f1e237b653 100644 --- a/sql/sql_lex.cc +++ b/sql/sql_lex.cc @@ -159,6 +159,7 @@ void lex_start(THD *thd, uchar *buf,uint length) lex->yylineno = 1; lex->in_comment=0; lex->length=0; + lex->part_info= 0; lex->select_lex.in_sum_expr=0; lex->select_lex.expr_list.empty(); lex->select_lex.ftfunc_list_alloc.empty(); diff --git a/sql/sql_lex.h b/sql/sql_lex.h index 6c91045189c..e5567d0d7c4 100644 --- a/sql/sql_lex.h +++ b/sql/sql_lex.h @@ -25,6
+25,7 @@ class sp_head; class sp_name; class sp_instr; class sp_pcontext; +class partition_info; /* The following hack is needed because mysql_yacc.cc does not define @@ -655,6 +656,11 @@ typedef class st_select_lex SELECT_LEX; #define ALTER_KEYS_ONOFF 512 #define ALTER_CONVERT 1024 #define ALTER_FORCE 2048 +#define ALTER_RECREATE 4096 +#define ALTER_ADD_PARTITION 8192 +#define ALTER_DROP_PARTITION 16384 +#define ALTER_COALESCE_PARTITION 32768 +#define ALTER_REORGANISE_PARTITION 65536 typedef struct st_alter_info { @@ -663,9 +669,17 @@ typedef struct st_alter_info uint flags; enum enum_enable_or_disable keys_onoff; enum tablespace_op_type tablespace_op; + List<char> partition_names; + uint no_parts; st_alter_info(){clear();} - void clear(){keys_onoff= LEAVE_AS_IS;tablespace_op= NO_TABLESPACE_OP;} + void clear() + { + keys_onoff= LEAVE_AS_IS; + tablespace_op= NO_TABLESPACE_OP; + no_parts= 0; + partition_names.empty(); + } void reset(){drop_list.empty();alter_list.empty();clear();} } ALTER_INFO; @@ -733,6 +747,8 @@ typedef struct st_lex TABLE_LIST **query_tables_last; /* store original leaf_tables for INSERT SELECT and PS/SP */ TABLE_LIST *leaf_tables_insert; + /* Partition info structure filled in by PARTITION BY parse part */ + partition_info *part_info; List<key_part_spec> col_list; List<key_part_spec> ref_list; diff --git a/sql/sql_load.cc b/sql/sql_load.cc index e1684f9bb11..8eec970b4df 100644 --- a/sql/sql_load.cc +++ b/sql/sql_load.cc @@ -172,7 +172,7 @@ bool mysql_load(THD *thd,sql_exchange *ex,TABLE_LIST *table_list, The main thing to fix to remove this restriction is to ensure that the table is marked to be 'used for insert' in which case we should never - mark this table as as 'const table' (ie, one that has only one row). + mark this table as 'const table' (ie, one that has only one row). */ if (unique_table(table_list, table_list->next_global)) { @@ -188,6 +188,10 @@ bool mysql_load(THD *thd,sql_exchange *ex,TABLE_LIST *table_list, Field **field; for (field=table->field; *field ; field++) fields_vars.push_back(new Item_field(*field)); + /* + Since all fields are set we set all bits in the write set + */ + table->file->ha_set_all_bits_in_write_set(); table->timestamp_field_type= TIMESTAMP_NO_AUTO_SET; /* Let us also prepare SET clause, although it is probably empty @@ -200,8 +204,15 @@ else { // Part field list /* TODO: use this conds for 'WITH CHECK OPTIONS' */ + /* + Indicate that both variables in field list and fields in update_list + are to be included in the write set of the table.
We do however set all bits in the write set anyway since it is not allowed to specify NULLs in LOAD DATA + */ + table->file->ha_set_all_bits_in_write_set(); + if (setup_fields(thd, 0, fields_vars, 2, 0, 0) || + setup_fields(thd, 0, set_fields, 2, 0, 0) || check_that_all_fields_are_given_values(thd, table, table_list)) DBUG_RETURN(TRUE); /* diff --git a/sql/sql_parse.cc b/sql/sql_parse.cc index 514b1acd2b4..f4cab56e97c 100644 --- a/sql/sql_parse.cc +++ b/sql/sql_parse.cc @@ -16,6 +16,7 @@ #include "mysql_priv.h" #include "sql_repl.h" +#include "rpl_filter.h" #include "repl_failsafe.h" #include <m_ctype.h> #include <myisam.h> @@ -179,10 +180,13 @@ static bool begin_trans(THD *thd) */ inline bool all_tables_not_ok(THD *thd, TABLE_LIST *tables) { - return (table_rules_on && tables && !tables_ok(thd,tables) && + return (rpl_filter->is_on() && tables && + !(thd->spcont || rpl_filter->tables_ok(thd->db, tables)) && ((thd->lex->sql_command != SQLCOM_DELETE_MULTI) || - !tables_ok(thd, - (TABLE_LIST *)thd->lex->auxilliary_table_list.first))); + !(thd->spcont || + rpl_filter->tables_ok(thd->db, + (TABLE_LIST *) + thd->lex->auxilliary_table_list.first)))); } #endif @@ -3511,9 +3515,9 @@ end_with_restore_list: above was not called. So we have to check rules again here. */ #ifdef HAVE_REPLICATION - if (thd->slave_thread && - (!db_ok(lex->name, replicate_do_db, replicate_ignore_db) || - !db_ok_with_wild_table(lex->name))) + if (thd->slave_thread && + (!rpl_filter->db_ok(lex->name) || + !rpl_filter->db_ok_with_wild_table(lex->name))) { my_message(ER_SLAVE_IGNORED_TABLE, ER(ER_SLAVE_IGNORED_TABLE), MYF(0)); break; @@ -3546,8 +3550,8 @@ end_with_restore_list: */ #ifdef HAVE_REPLICATION if (thd->slave_thread && - (!db_ok(lex->name, replicate_do_db, replicate_ignore_db) || - !db_ok_with_wild_table(lex->name))) + (!rpl_filter->db_ok(lex->name) || + !rpl_filter->db_ok_with_wild_table(lex->name))) { my_message(ER_SLAVE_IGNORED_TABLE, ER(ER_SLAVE_IGNORED_TABLE), MYF(0)); break; @@ -3586,8 +3590,8 @@ end_with_restore_list: */ #ifdef HAVE_REPLICATION if (thd->slave_thread && - (!db_ok(db, replicate_do_db, replicate_ignore_db) || - !db_ok_with_wild_table(db))) + (!rpl_filter->db_ok(db) || + !rpl_filter->db_ok_with_wild_table(db))) { my_message(ER_SLAVE_IGNORED_TABLE, ER(ER_SLAVE_IGNORED_TABLE), MYF(0)); break; diff --git a/sql/sql_partition.cc b/sql/sql_partition.cc new file mode 100644 index 00000000000..1f24806dc5e --- /dev/null +++ b/sql/sql_partition.cc @@ -0,0 +1,3219 @@ +/* Copyright (C) 2005 MySQL AB + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ + +/* + This file was introduced as a container for general functionality related + to partitioning introduced in MySQL version 5.1. It contains functionality + used by all handlers that support partitioning, which in the first version + is the partitioning handler itself and the NDB handler.
+ + The first version was written by Mikael Ronström. + + This version supports RANGE partitioning, LIST partitioning, HASH + partitioning and composite partitioning (hereafter called subpartitioning) + where each RANGE/LIST partitioning is HASH partitioned. The hash function + can either be supplied by the user or be given as just a list of fields + (also called KEY partitioning), in which case the MySQL server will use an + internal hash function. + There are quite a few defaults that can be used as well. +*/ + +/* Some general useful functions */ + +#include "mysql_priv.h" +#include <errno.h> +#include <m_ctype.h> +#include "md5.h" + +#ifdef HAVE_PARTITION_DB +/* + Partition-related function declarations and some static constants +*/ +static char *hash_str= "HASH"; +static char *range_str= "RANGE"; +static char *list_str= "LIST"; +static char *part_str= "PARTITION"; +static char *sub_str= "SUB"; +static char *by_str= "BY"; +static char *key_str= "KEY"; +static char *space_str= " "; +static char *equal_str= "="; +static char *end_paren_str= ")"; +static char *begin_paren_str= "("; +static char *comma_str= ","; +static char buff[22]; + +bool get_partition_id_list(partition_info *part_info, + uint32 *part_id); +bool get_partition_id_range(partition_info *part_info, + uint32 *part_id); +bool get_partition_id_hash_nosub(partition_info *part_info, + uint32 *part_id); +bool get_partition_id_key_nosub(partition_info *part_info, + uint32 *part_id); +bool get_partition_id_linear_hash_nosub(partition_info *part_info, + uint32 *part_id); +bool get_partition_id_linear_key_nosub(partition_info *part_info, + uint32 *part_id); +bool get_partition_id_range_sub_hash(partition_info *part_info, + uint32 *part_id); +bool get_partition_id_range_sub_key(partition_info *part_info, + uint32 *part_id); +bool get_partition_id_range_sub_linear_hash(partition_info *part_info, + uint32 *part_id); +bool get_partition_id_range_sub_linear_key(partition_info *part_info, + uint32 *part_id); +bool get_partition_id_list_sub_hash(partition_info *part_info, + uint32 *part_id); +bool get_partition_id_list_sub_key(partition_info *part_info, + uint32 *part_id); +bool get_partition_id_list_sub_linear_hash(partition_info *part_info, + uint32 *part_id); +bool get_partition_id_list_sub_linear_key(partition_info *part_info, + uint32 *part_id); +uint32 get_partition_id_hash_sub(partition_info *part_info); +uint32 get_partition_id_key_sub(partition_info *part_info); +uint32 get_partition_id_linear_hash_sub(partition_info *part_info); +uint32 get_partition_id_linear_key_sub(partition_info *part_info); +#endif + + +/* + A routine used by the parser to decide whether we are specifying a full + partitioning or only partitions to add or to split.
+ SYNOPSIS + is_partition_management() + lex Reference to the lex object + RETURN VALUE + TRUE Yes, it is part of a partition management command + FALSE No, not a partition management command + DESCRIPTION + This needs to be outside of HAVE_PARTITION_DB since it is used from the + sql parser that doesn't have any #ifdef's +*/ + +my_bool is_partition_management(LEX *lex) +{ + return (lex->sql_command == SQLCOM_ALTER_TABLE && + (lex->alter_info.flags == ALTER_ADD_PARTITION || + lex->alter_info.flags == ALTER_REORGANISE_PARTITION)); +} + +#ifdef HAVE_PARTITION_DB +/* + A support function to check if a partition name is in a list of strings + SYNOPSIS + is_partition_in_list() + part_name String searched for + list_part_names A list of names searched in + RETURN VALUES + TRUE String found + FALSE String not found +*/ + +bool is_partition_in_list(char *part_name, + List<char> list_part_names) +{ + List_iterator<char> part_names_it(list_part_names); + uint no_names= list_part_names.elements; + uint i= 0; + do + { + char *list_name= part_names_it++; + if (!(my_strcasecmp(system_charset_info, part_name, list_name))) + return TRUE; + } while (++i < no_names); + return FALSE; +} + + +/* + A support function to check partition names for duplication in a + partitioned table + SYNOPSIS + is_partitions_in_table() + new_part_info New partition info + old_part_info Old partition info + RETURN VALUES + TRUE Duplicate names found + FALSE Duplicate names not found + DESCRIPTION + Can handle the case where the new and old parts are the same, in which + case it checks that the list of names in the partitions doesn't contain + any duplicated names. +*/ + +bool is_partitions_in_table(partition_info *new_part_info, + partition_info *old_part_info) +{ + uint no_new_parts= new_part_info->partitions.elements, new_count; + uint no_old_parts= old_part_info->partitions.elements, old_count; + List_iterator<partition_element> new_parts_it(new_part_info->partitions); + bool same_part_info= (new_part_info == old_part_info); + DBUG_ENTER("is_partitions_in_table"); + + new_count= 0; + do + { + List_iterator<partition_element> old_parts_it(old_part_info->partitions); + char *new_name= (new_parts_it++)->partition_name; + new_count++; + old_count= 0; + do + { + char *old_name= (old_parts_it++)->partition_name; + old_count++; + if (same_part_info && old_count == new_count) + break; + if (!(my_strcasecmp(system_charset_info, old_name, new_name))) + { + DBUG_RETURN(TRUE); + } + } while (old_count < no_old_parts); + } while (new_count < no_new_parts); + DBUG_RETURN(FALSE); +} + + +/* + A useful routine used by update_row for partition handlers to calculate + the partition ids of the old and the new record. + SYNOPSIS + get_parts_for_update() + old_data Buffer of old record + new_data Buffer of new record + rec0 Reference to table->record[0] + part_info Reference to partition information + part_field_array A NULL-terminated array of fields for partition + function + old_part_id The returned partition id of old record + new_part_id The returned partition id of new record + RETURN VALUE + 0 Success + > 0 Error code + DESCRIPTION + Depending on whether the supplied buffer is record[0] or not, we need + to prepare the fields. Then we call the function pointer + get_partition_id to calculate the partition ids.
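+
+  Editorial usage sketch (assumed caller, not part of this commit): a
+  partition handler's update_row() would typically do
+
+    uint32 old_part_id, new_part_id;
+    if ((error= get_parts_for_update(old_data, new_data, table->record[0],
+                                     m_part_info, &old_part_id,
+                                     &new_part_id)))
+      return error;
+
+  and then update in place when the two ids are equal, otherwise delete
+  the row from old_part_id and write it into new_part_id.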
+*/ + +int get_parts_for_update(const byte *old_data, byte *new_data, + const byte *rec0, partition_info *part_info, + uint32 *old_part_id, uint32 *new_part_id) +{ + Field **part_field_array= part_info->full_part_field_array; + int error; + DBUG_ENTER("get_parts_for_update"); + DBUG_ASSERT(new_data == rec0); + + set_field_ptr(part_field_array, old_data, rec0); + error= part_info->get_partition_id(part_info, old_part_id); + set_field_ptr(part_field_array, rec0, old_data); + if (unlikely(error)) // Should never happen + { + DBUG_ASSERT(0); + DBUG_RETURN(error); + } +#ifdef NOT_NEEDED + if (new_data == rec0) +#endif + { + if (unlikely(error= part_info->get_partition_id(part_info,new_part_id))) + { + DBUG_RETURN(error); + } + } +#ifdef NOT_NEEDED + else + { + /* + This branch should never execute but it is written anyway for + future use. It will be tested by ensuring that the above + condition is false in one test situation before pushing the code. + */ + set_field_ptr(part_field_array, new_data, rec0); + error= part_info->get_partition_id(part_info, new_part_id); + set_field_ptr(part_field_array, rec0, new_data); + if (unlikely(error)) + { + DBUG_RETURN(error); + } + } +#endif + DBUG_RETURN(0); +} + + +/* + A useful routine used by delete_row for partition handlers to calculate + the partition id. + SYNOPSIS + get_part_for_delete() + buf Buffer of old record + rec0 Reference to table->record[0] + part_info Reference to partition information + part_field_array A NULL-terminated array of fields for partition + function + part_id The returned partition id to delete from + RETURN VALUE + 0 Success + > 0 Error code + DESCRIPTION + Depending on whether buf is record[0] or not, we need to prepare the + fields. Then we call the function pointer get_partition_id to + calculate the partition id. +*/ + +int get_part_for_delete(const byte *buf, const byte *rec0, + partition_info *part_info, uint32 *part_id) +{ + int error; + DBUG_ENTER("get_part_for_delete"); + + if (likely(buf == rec0)) + { + if (unlikely((error= part_info->get_partition_id(part_info, part_id)))) + { + DBUG_RETURN(error); + } + DBUG_PRINT("info", ("Delete from partition %d", *part_id)); + } + else + { + Field **part_field_array= part_info->full_part_field_array; + set_field_ptr(part_field_array, buf, rec0); + error= part_info->get_partition_id(part_info, part_id); + set_field_ptr(part_field_array, rec0, buf); + if (unlikely(error)) + { + DBUG_RETURN(error); + } + DBUG_PRINT("info", ("Delete from partition %d (path2)", *part_id)); + } + DBUG_RETURN(0); +} + + +/* + This routine allocates an array for all range constants to achieve a fast + check of which partition a certain value belongs to. At the same time it + also checks that the range constants are defined in increasing order and + that the expressions are constant integer expressions. + SYNOPSIS + check_range_constants() + part_info + RETURN VALUE + TRUE An error occurred during creation of range constants + FALSE Successful creation of range constant mapping + DESCRIPTION + This routine is called from check_partition_info to get a quick error + before we come too far into the CREATE TABLE process. It is also called + from fix_partition_func every time we open the .frm file. It is only + called for RANGE PARTITIONed tables.
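+
+  EXAMPLE (editorial note, not part of the original commit): for
+    PARTITION BY RANGE (a) (PARTITION p0 VALUES LESS THAN (10),
+                            PARTITION p1 VALUES LESS THAN (20),
+                            PARTITION p2 VALUES LESS THAN MAXVALUE)
+  range_int_array becomes {10, 20, LONGLONG_MAX}, and a row with a = 15
+  belongs to p1, the first partition whose constant exceeds the value.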
+*/ + +static bool check_range_constants(partition_info *part_info) +{ + partition_element* part_def; + longlong current_largest_int= LONGLONG_MIN, part_range_value_int; + uint no_parts= part_info->no_parts, i; + List_iterator<partition_element> it(part_info->partitions); + bool result= TRUE; + DBUG_ENTER("check_range_constants"); + DBUG_PRINT("enter", ("INT_RESULT with %d parts", no_parts)); + + part_info->part_result_type= INT_RESULT; + part_info->range_int_array= + (longlong*)sql_alloc(no_parts * sizeof(longlong)); + if (unlikely(part_info->range_int_array == NULL)) + { + my_error(ER_OUTOFMEMORY, MYF(0), no_parts*sizeof(longlong)); + goto end; + } + i= 0; + do + { + part_def= it++; + if ((i != (no_parts - 1)) || !part_info->defined_max_value) + part_range_value_int= part_def->range_value; + else + part_range_value_int= LONGLONG_MAX; + if (likely(current_largest_int < part_range_value_int)) + { + current_largest_int= part_range_value_int; + part_info->range_int_array[i]= part_range_value_int; + } + else + { + my_error(ER_RANGE_NOT_INCREASING_ERROR, MYF(0)); + goto end; + } + } while (++i < no_parts); + result= FALSE; +end: + DBUG_RETURN(result); +} + + +/* + A support routine for check_list_constants used by qsort to sort the + constant list expressions. + SYNOPSIS + list_part_cmp() + a First list constant to compare with + b Second list constant to compare with + RETURN VALUE + +1 a > b + 0 a == b + -1 a < b +*/ + +static int list_part_cmp(const void* a, const void* b) +{ + longlong a1, b1; + a1= ((LIST_PART_ENTRY*)a)->list_value; + b1= ((LIST_PART_ENTRY*)b)->list_value; + if (a1 < b1) + return -1; + else if (a1 > b1) + return +1; + else + return 0; +} + + +/* + This routine allocates an array for all list constants to achieve a fast + check of which partition a certain value belongs to. At the same time it + also checks that there are no duplicates among the list constants and + that the list expressions are constant integer expressions. + SYNOPSIS + check_list_constants() + part_info + RETURN VALUE + TRUE An error occurred during creation of list constants + FALSE Successful creation of list constant mapping + DESCRIPTION + This routine is called from check_partition_info to get a quick error + before we come too far into the CREATE TABLE process. It is also called + from fix_partition_func every time we open the .frm file. It is only + called for LIST PARTITIONed tables. +*/ + +static bool check_list_constants(partition_info *part_info) +{ + uint i, no_list_values= 0, no_parts, list_index= 0; + longlong *list_value; + bool not_first, result= TRUE; + longlong curr_value, prev_value; + partition_element* part_def; + List_iterator<partition_element> list_func_it(part_info->partitions); + DBUG_ENTER("check_list_constants"); + + part_info->part_result_type= INT_RESULT; + + /* + We begin by calculating the number of list values that have been + defined in the first step. + + We use this number to allocate a properly sized array of structs + to keep the partition id and the value to use in that partition. + In the second traversal we assign them values in the struct array. + + Finally we sort the array of structs in order of values to enable + a quick binary search for the proper value to discover the + partition id. + After sorting the array we check that there are no duplicates in the + list.
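+
+  EXAMPLE (editorial note, not part of the original commit): for
+    PARTITION BY LIST (a) (PARTITION p0 VALUES IN (3,1),
+                           PARTITION p1 VALUES IN (2,4))
+  the sorted list_array becomes {1->p0, 2->p1, 3->p0, 4->p1}, so a
+  binary search on the list_value member of each LIST_PART_ENTRY yields
+  the partition id directly.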
+ */ + + no_parts= part_info->no_parts; + i= 0; + do + { + part_def= list_func_it++; + List_iterator<longlong> list_val_it1(part_def->list_val_list); + while (list_val_it1++) + no_list_values++; + } while (++i < no_parts); + list_func_it.rewind(); + part_info->no_list_values= no_list_values; + part_info->list_array= + (LIST_PART_ENTRY*)sql_alloc(no_list_values*sizeof(LIST_PART_ENTRY)); + if (unlikely(part_info->list_array == NULL)) + { + my_error(ER_OUTOFMEMORY, MYF(0), no_list_values*sizeof(LIST_PART_ENTRY)); + goto end; + } + + i= 0; + do + { + part_def= list_func_it++; + List_iterator<longlong> list_val_it2(part_def->list_val_list); + while ((list_value= list_val_it2++)) + { + part_info->list_array[list_index].list_value= *list_value; + part_info->list_array[list_index++].partition_id= i; + } + } while (++i < no_parts); + + qsort((void*)part_info->list_array, no_list_values, + sizeof(LIST_PART_ENTRY), &list_part_cmp); + + not_first= FALSE; + i= prev_value= 0; //prev_value initialised to quiet compiler + do + { + curr_value= part_info->list_array[i].list_value; + if (likely(!not_first || prev_value != curr_value)) + { + prev_value= curr_value; + not_first= TRUE; + } + else + { + my_error(ER_MULTIPLE_DEF_CONST_IN_LIST_PART_ERROR, MYF(0)); + goto end; + } + } while (++i < no_list_values); + result= FALSE; +end: + DBUG_RETURN(result); +} + + +/* + Create a memory area where default partition names are stored and fill it + up with the names. + SYNOPSIS + create_default_partition_names() + no_parts Number of partitions + start_no First number to use in the generated names + subpart TRUE when generating subpartition names + RETURN VALUE + A pointer to the memory area of the default partition names + DESCRIPTION + A support routine for the partition code where default values are + generated. + The external routine needing this code is check_partition_info +*/ + +#define MAX_PART_NAME_SIZE 8 + +static char *create_default_partition_names(uint no_parts, uint start_no, + bool subpart) +{ + char *ptr= sql_calloc(no_parts*MAX_PART_NAME_SIZE); + char *move_ptr= ptr; + uint i= 0; + DBUG_ENTER("create_default_partition_names"); + if (likely(ptr != 0)) + { + do + { + if (subpart) + my_sprintf(move_ptr, (move_ptr,"sp%u", (start_no + i))); + else + my_sprintf(move_ptr, (move_ptr,"p%u", (start_no + i))); + move_ptr+=MAX_PART_NAME_SIZE; + } while (++i < no_parts); + } + else + { + my_error(ER_OUTOFMEMORY, MYF(0), no_parts*MAX_PART_NAME_SIZE); + } + DBUG_RETURN(ptr); +} + + +/* + Set up all the default partitions not set-up by the user in the SQL + statement. Also perform a number of checks that the user hasn't tried + to use default values where no defaults exist. + SYNOPSIS + set_up_default_partitions() + part_info The reference to all partition information + file A reference to a handler of the table + max_rows Maximum number of rows stored in the table + RETURN VALUE + TRUE Error, attempted default values not possible + FALSE Ok, default partitions set-up + DESCRIPTION + The routine uses the underlying handler of the partitioning to define + the default number of partitions. For some handlers this requires + knowledge of the maximum number of rows to be stored in the table. + This routine only accepts HASH and KEY partitioning and thus there is + no subpartitioning if this routine is successful.
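+
+  For example (editorial note, not part of the original commit):
+    CREATE TABLE t1 (a INT) PARTITION BY KEY (a) PARTITIONS 4;
+  defines no partitions explicitly, so this routine generates four
+  partition_element objects carrying the default names p0, p1, p2 and p3
+  produced by create_default_partition_names().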
+ The external routine needing this code is check_partition_info +*/ + +static bool set_up_default_partitions(partition_info *part_info, + handler *file, ulonglong max_rows, + uint start_no) +{ + uint no_parts, i; + char *default_name; + bool result= TRUE; + DBUG_ENTER("set_up_default_partitions"); + + if (part_info->part_type != HASH_PARTITION) + { + char *error_string; + if (part_info->part_type == RANGE_PARTITION) + error_string= range_str; + else + error_string= list_str; + my_error(ER_PARTITIONS_MUST_BE_DEFINED_ERROR, MYF(0), error_string); + goto end; + } + if (part_info->no_parts == 0) + part_info->no_parts= file->get_default_no_partitions(max_rows); + no_parts= part_info->no_parts; + part_info->use_default_partitions= FALSE; + if (unlikely(no_parts > MAX_PARTITIONS)) + { + my_error(ER_TOO_MANY_PARTITIONS_ERROR, MYF(0)); + goto end; + } + if (unlikely((!(default_name= create_default_partition_names(no_parts, + start_no, + FALSE))))) + goto end; + i= 0; + do + { + partition_element *part_elem= new partition_element(); + if (likely(part_elem != 0)) + { + part_elem->engine_type= DB_TYPE_UNKNOWN; + part_elem->partition_name= default_name; + default_name+=MAX_PART_NAME_SIZE; + part_info->partitions.push_back(part_elem); + } + else + { + my_error(ER_OUTOFMEMORY, MYF(0), sizeof(partition_element)); + goto end; + } + } while (++i < no_parts); + result= FALSE; +end: + DBUG_RETURN(result); +} + + +/* + Set up all the default subpartitions not set-up by the user in the SQL + statement. Also perform a number of checks that the default partitioning + becomes an allowed partitioning scheme. + SYNOPSIS + set_up_default_subpartitions() + part_info The reference to all partition information + file A reference to a handler of the table + max_rows Maximum number of rows stored in the table + RETURN VALUE + TRUE Error, attempted default values not possible + FALSE Ok, default partitions set-up + DESCRIPTION + The routine uses the underlying handler of the partitioning to define + the default number of partitions. For some handlers this requires + knowledge of the maximum number of rows to be stored in the table. + This routine is only called for RANGE or LIST partitioning and those + need to be specified so only subpartitions are specified. 
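+    For example (editor's sketch, hypothetical numbers): RANGE
+    partitioning with 3 partitions and a default subpartition count of
+    4 gives each partition the subpartitions sp0, sp1, sp2 and sp3, and
+    the function verifies no_parts * no_subparts = 3 * 4 = 12 against
+    MAX_PARTITIONS.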
+ The external routine needing this code is check_partition_info +*/ + +static bool set_up_default_subpartitions(partition_info *part_info, + handler *file, ulonglong max_rows) +{ + uint i, j, no_parts, no_subparts; + char *default_name, *name_ptr; + bool result= TRUE; + partition_element *part_elem; + List_iterator<partition_element> part_it(part_info->partitions); + DBUG_ENTER("set_up_default_subpartitions"); + + if (part_info->no_subparts == 0) + part_info->no_subparts= file->get_default_no_partitions(max_rows); + no_parts= part_info->no_parts; + no_subparts= part_info->no_subparts; + part_info->use_default_subpartitions= FALSE; + if (unlikely((no_parts * no_subparts) > MAX_PARTITIONS)) + { + my_error(ER_TOO_MANY_PARTITIONS_ERROR, MYF(0)); + goto end; + } + if (unlikely((!(default_name= + create_default_partition_names(no_subparts, (uint)0, TRUE))))) + goto end; + i= 0; + do + { + part_elem= part_it++; + j= 0; + name_ptr= default_name; + do + { + partition_element *subpart_elem= new partition_element(); + if (likely(subpart_elem != 0)) + { + subpart_elem->engine_type= DB_TYPE_UNKNOWN; + subpart_elem->partition_name= name_ptr; + name_ptr+= MAX_PART_NAME_SIZE; + part_elem->subpartitions.push_back(subpart_elem); + } + else + { + my_error(ER_OUTOFMEMORY, MYF(0), sizeof(partition_element)); + goto end; + } + } while (++j < no_subparts); + } while (++i < no_parts); + result= FALSE; +end: + DBUG_RETURN(result); +} + + +/* + Set up defaults for partition or subpartition (cannot set-up for both, + this will return an error. + SYNOPSIS + set_up_defaults_for_partitioning() + part_info The reference to all partition information + file A reference to a handler of the table + max_rows Maximum number of rows stored in the table + RETURN VALUE + TRUE Error, attempted default values not possible + FALSE Ok, default partitions set-up + DESCRIPTION + Support routine for check_partition_info +*/ + +bool set_up_defaults_for_partitioning(partition_info *part_info, + handler *file, + ulonglong max_rows, uint start_no) +{ + DBUG_ENTER("set_up_defaults_for_partitioning"); + + if (part_info->use_default_partitions) + DBUG_RETURN(set_up_default_partitions(part_info, file, max_rows, + start_no)); + if (is_sub_partitioned(part_info) && part_info->use_default_subpartitions) + DBUG_RETURN(set_up_default_subpartitions(part_info, file, max_rows)); + DBUG_RETURN(FALSE); +} + + +/* + Check that all partitions use the same storage engine. + This is currently a limitation in this version. + SYNOPSIS + check_engine_mix() + engine_array An array of engine identifiers + no_parts Total number of partitions + RETURN VALUE + TRUE Error, mixed engines + FALSE Ok, no mixed engines +*/ + +static bool check_engine_mix(u_char *engine_array, uint no_parts) +{ + /* + Current check verifies only that all handlers are the same. + Later this check will be more sophisticated. + */ + uint i= 0; + bool result= FALSE; + DBUG_ENTER("check_engine_mix"); + + do + { + if (engine_array[i] != engine_array[0]) + { + result= TRUE; + break; + } + } while (++i < no_parts); + DBUG_RETURN(result); +} + + +/* + We will check that the partition info requested is possible to set-up in + this version. This routine is an extension of the parser one could say. + If defaults were used we will generate default data structures for all + partitions. + SYNOPSIS + check_partition_info() + part_info The reference to all partition information + db_type Default storage engine if no engine specified per + partition. 
+ file A reference to a handler of the table + max_rows Maximum number of rows stored in the table + RETURN VALUE + TRUE Error, something went wrong + FALSE Ok, full partition data structures are now generated + DESCRIPTION + This code is used early in the CREATE TABLE and ALTER TABLE process. +*/ + +bool check_partition_info(partition_info *part_info,enum db_type eng_type, + handler *file, ulonglong max_rows) +{ + u_char *engine_array= NULL; + uint part_count= 0, i, no_parts, tot_partitions; + bool result= TRUE; + List_iterator<partition_element> part_it(part_info->partitions); + DBUG_ENTER("check_partition_info"); + + if (unlikely(is_sub_partitioned(part_info) && + (!(part_info->part_type == RANGE_PARTITION || + part_info->part_type == LIST_PARTITION)))) + { + /* Only RANGE and LIST partitioning can be subpartitioned */ + my_error(ER_SUBPARTITION_ERROR, MYF(0)); + goto end; + } + if (unlikely(set_up_defaults_for_partitioning(part_info, file, + max_rows, (uint)0))) + goto end; + tot_partitions= get_tot_partitions(part_info); + if (unlikely(tot_partitions > MAX_PARTITIONS)) + { + my_error(ER_TOO_MANY_PARTITIONS_ERROR, MYF(0)); + goto end; + } + if (unlikely(is_partitions_in_table(part_info, part_info))) + { + my_error(ER_SAME_NAME_PARTITION, MYF(0)); + goto end; + } + engine_array= (u_char*)my_malloc(tot_partitions, MYF(MY_WME)); + if (unlikely(!engine_array)) + goto end; + i= 0; + no_parts= part_info->no_parts; + do + { + partition_element *part_elem= part_it++; + if (!is_sub_partitioned(part_info)) + { + if (part_elem->engine_type == DB_TYPE_UNKNOWN) + part_elem->engine_type= eng_type; + DBUG_PRINT("info", ("engine = %u",(uint)part_elem->engine_type)); + engine_array[part_count++]= (u_char)part_elem->engine_type; + } + else + { + uint j= 0, no_subparts= part_info->no_subparts;; + List_iterator<partition_element> sub_it(part_elem->subpartitions); + do + { + part_elem= sub_it++; + if (part_elem->engine_type == DB_TYPE_UNKNOWN) + part_elem->engine_type= eng_type; + DBUG_PRINT("info", ("engine = %u",(uint)part_elem->engine_type)); + engine_array[part_count++]= (u_char)part_elem->engine_type; + } while (++j < no_subparts); + } + } while (++i < part_info->no_parts); + if (unlikely(check_engine_mix(engine_array, part_count))) + { + my_error(ER_MIX_HANDLER_ERROR, MYF(0)); + goto end; + } + + /* + We need to check all constant expressions that they are of the correct + type and that they are increasing for ranges and not overlapping for + list constants. + */ + + if (unlikely((part_info->part_type == RANGE_PARTITION && + check_range_constants(part_info)) || + (part_info->part_type == LIST_PARTITION && + check_list_constants(part_info)))) + goto end; + result= FALSE; +end: + my_free((char*)engine_array,MYF(MY_ALLOW_ZERO_PTR)); + DBUG_RETURN(result); +} + + +/* + A great number of functions below here is part of the fix_partition_func + method. It is used to set up the partition structures for execution from + openfrm. It is called at the end of the openfrm when the table struct has + been set-up apart from the partition information. + It involves: + 1) Setting arrays of fields for the partition functions. + 2) Setting up binary search array for LIST partitioning + 3) Setting up array for binary search for RANGE partitioning + 4) Setting up key_map's to assist in quick evaluation whether one + can deduce anything from a given index of what partition to use + 5) Checking whether a set of partitions can be derived from a range on + a field in the partition function. 
+  As part of doing this there are also a great number of error checks.
+  This is actually the place where most of the things are checked for
+  partition information when creating a table.
+  Things that are checked include
+  1) No NULLable fields in partition function
+  2) All fields of partition function in primary keys and unique indexes
+     (unless the underlying handler supports it)
+  3) No fields in partition function that are BLOBs or VARCHARs with a
+     collation other than the binary collation.
+
+
+
+  Create an array of partition fields (NULL terminated). Before this method
+  is called fix_fields or find_field_in_table_sef has been called to set
+  GET_FIXED_FIELDS_FLAG on all fields that are part of the partition
+  function.
+  SYNOPSIS
+    set_up_field_array()
+      table                TABLE object for which partition fields are set-up
+      sub_part             Is the table subpartitioned as well
+  RETURN VALUE
+    TRUE                   Error, some field didn't meet requirements
+    FALSE                  Ok, partition field array set-up
+  DESCRIPTION
+    This method is used to set-up both the partition and the
+    subpartition field array and is used for all types of partitioning.
+    It is part of the logic around fix_partition_func.
+*/
+static bool set_up_field_array(TABLE *table,
+                               bool sub_part)
+{
+  Field **ptr, *field, **field_array;
+  uint no_fields= 0, size_field_array, i= 0;
+  partition_info *part_info= table->s->part_info;
+  int result= FALSE;
+  DBUG_ENTER("set_up_field_array");
+
+  ptr= table->field;
+  while ((field= *(ptr++)))
+  {
+    if (field->flags & GET_FIXED_FIELDS_FLAG)
+      no_fields++;
+  }
+  size_field_array= (no_fields+1)*sizeof(Field*);
+  field_array= (Field**)sql_alloc(size_field_array);
+  if (unlikely(!field_array))
+  {
+    /* Allocation failed, no point in continuing with a NULL array */
+    my_error(ER_OUTOFMEMORY, MYF(0), size_field_array);
+    DBUG_RETURN(TRUE);
+  }
+  ptr= table->field;
+  while ((field= *(ptr++)))
+  {
+    if (field->flags & GET_FIXED_FIELDS_FLAG)
+    {
+      field->flags&= ~GET_FIXED_FIELDS_FLAG;
+      field->flags|= FIELD_IN_PART_FUNC_FLAG;
+      if (likely(!result))
+      {
+        field_array[i++]= field;
+
+        /*
+          We check that the fields are proper. It is required for each
+          field in a partition function to:
+          1) Not be a BLOB of any type
+             A BLOB takes too long to evaluate so we don't want it for
+             performance reasons.
+          2) Not be a VARCHAR other than VARCHAR with a binary collation
+             A VARCHAR with a character set can have several values that
+             compare as equal but differ in the number of trailing
+             spaces or in padding. This is not a good ground for a safe
+             and exact partition function. Thus it is not allowed in
+             partition functions.
+        */
+
+        if (unlikely(field->flags & BLOB_FLAG))
+        {
+          my_error(ER_BLOB_FIELD_IN_PART_FUNC_ERROR, MYF(0));
+          result= TRUE;
+        }
+        else if (unlikely(!(field->flags & BINARY_FLAG) &&
+                          field->real_type() == MYSQL_TYPE_VARCHAR))
+        {
+          my_error(ER_CHAR_SET_IN_PART_FIELD_ERROR, MYF(0));
+          result= TRUE;
+        }
+      }
+    }
+  }
+  field_array[no_fields]= 0;
+  if (!sub_part)
+  {
+    part_info->part_field_array= field_array;
+    part_info->no_part_fields= no_fields;
+  }
+  else
+  {
+    part_info->subpart_field_array= field_array;
+    part_info->no_subpart_fields= no_fields;
+  }
+  DBUG_RETURN(result);
+}
+
+
+/*
+  Create a field array including all fields of both the partitioning and
+  the subpartitioning functions.
+  SYNOPSIS
+    create_full_part_field_array()
+      table                TABLE object for which partition fields are set-up
+      part_info            Reference to partitioning data structure
+  RETURN VALUE
+    TRUE                   Memory allocation of field array failed
+    FALSE                  Ok
+  DESCRIPTION
+    If there is no subpartitioning then the same array is used as for the
+    partitioning.
Otherwise a new array is built up using the flag + FIELD_IN_PART_FUNC in the field object. + This function is called from fix_partition_func +*/ + +static bool create_full_part_field_array(TABLE *table, + partition_info *part_info) +{ + bool result= FALSE; + DBUG_ENTER("create_full_part_field_array"); + + if (!is_sub_partitioned(part_info)) + { + part_info->full_part_field_array= part_info->part_field_array; + part_info->no_full_part_fields= part_info->no_part_fields; + } + else + { + Field **ptr, *field, **field_array; + uint no_part_fields=0, size_field_array; + ptr= table->field; + while ((field= *(ptr++))) + { + if (field->flags & FIELD_IN_PART_FUNC_FLAG) + no_part_fields++; + } + size_field_array= (no_part_fields+1)*sizeof(Field*); + field_array= (Field**)sql_alloc(size_field_array); + if (unlikely(!field_array)) + { + my_error(ER_OUTOFMEMORY, MYF(0), size_field_array); + result= TRUE; + goto end; + } + no_part_fields= 0; + ptr= table->field; + while ((field= *(ptr++))) + { + if (field->flags & FIELD_IN_PART_FUNC_FLAG) + field_array[no_part_fields++]= field; + } + field_array[no_part_fields]=0; + part_info->full_part_field_array= field_array; + part_info->no_full_part_fields= no_part_fields; + } +end: + DBUG_RETURN(result); +} + + +/* + These support routines is used to set/reset an indicator of all fields + in a certain key. It is used in conjunction with another support routine + that traverse all fields in the PF to find if all or some fields in the + PF is part of the key. This is used to check primary keys and unique + keys involve all fields in PF (unless supported) and to derive the + key_map's used to quickly decide whether the index can be used to + derive which partitions are needed to scan. + + + + Clear flag GET_FIXED_FIELDS_FLAG in all fields of a key previously set by + set_indicator_in_key_fields (always used in pairs). + SYNOPSIS + clear_indicator_in_key_fields() + key_info Reference to find the key fields +*/ + +static void clear_indicator_in_key_fields(KEY *key_info) +{ + KEY_PART_INFO *key_part; + uint key_parts= key_info->key_parts, i; + for (i= 0, key_part=key_info->key_part; i < key_parts; i++, key_part++) + key_part->field->flags&= (~GET_FIXED_FIELDS_FLAG); +} + + +/* + Set flag GET_FIXED_FIELDS_FLAG in all fields of a key. + SYNOPSIS + set_indicator_in_key_fields + key_info Reference to find the key fields +*/ + +static void set_indicator_in_key_fields(KEY *key_info) +{ + KEY_PART_INFO *key_part; + uint key_parts= key_info->key_parts, i; + for (i= 0, key_part=key_info->key_part; i < key_parts; i++, key_part++) + key_part->field->flags|= GET_FIXED_FIELDS_FLAG; +} + + +/* + Check if all or some fields in partition field array is part of a key + previously used to tag key fields. + SYNOPSIS + check_fields_in_PF() + ptr Partition field array + all_fields Is all fields of partition field array used in key + some_fields Is some fields of partition field array used in key + RETURN VALUE + all_fields, some_fields +*/ + +static void check_fields_in_PF(Field **ptr, bool *all_fields, + bool *some_fields) +{ + DBUG_ENTER("check_fields_in_PF"); + *all_fields= TRUE; + *some_fields= FALSE; + do + { + /* Check if the field of the PF is part of the current key investigated */ + if ((*ptr)->flags & GET_FIXED_FIELDS_FLAG) + *some_fields= TRUE; + else + *all_fields= FALSE; + } while (*(++ptr)); + DBUG_VOID_RETURN; +} + + +/* + Clear flag GET_FIXED_FIELDS_FLAG in all fields of the table. + This routine is used for error handling purposes. 
+ SYNOPSIS + clear_field_flag() + table TABLE object for which partition fields are set-up +*/ + +static void clear_field_flag(TABLE *table) +{ + Field **ptr; + DBUG_ENTER("clear_field_flag"); + + for (ptr= table->field; *ptr; ptr++) + (*ptr)->flags&= (~GET_FIXED_FIELDS_FLAG); + DBUG_VOID_RETURN; +} + + +/* + This routine sets-up the partition field array for KEY partitioning, it + also verifies that all fields in the list of fields is actually a part of + the table. + SYNOPSIS + handle_list_of_fields() + it A list of field names for the partition function + table TABLE object for which partition fields are set-up + part_info Reference to partitioning data structure + sub_part Is the table subpartitioned as well + RETURN VALUE + TRUE Fields in list of fields not part of table + FALSE All fields ok and array created + DESCRIPTION + find_field_in_table_sef finds the field given its name. All fields get + GET_FIXED_FIELDS_FLAG set. +*/ + +static bool handle_list_of_fields(List_iterator<char> it, + TABLE *table, + partition_info *part_info, + bool sub_part) +{ + Field *field; + bool result; + char *field_name; + DBUG_ENTER("handle_list_of_fields"); + + while ((field_name= it++)) + { + field= find_field_in_table_sef(table, field_name); + if (likely(field != 0)) + field->flags|= GET_FIXED_FIELDS_FLAG; + else + { + my_error(ER_FIELD_NOT_FOUND_PART_ERROR, MYF(0)); + clear_field_flag(table); + result= TRUE; + goto end; + } + } + result= set_up_field_array(table, sub_part); +end: + DBUG_RETURN(result); +} + + +/* + This function is used to build an array of partition fields for the + partitioning function and subpartitioning function. The partitioning + function is an item tree that must reference at least one field in the + table. This is checked first in the parser that the function doesn't + contain non-cacheable parts (like a random function) and by checking + here that the function isn't a constant function. + SYNOPSIS + fix_fields_part_func() + thd The thread object + tables A list of one table, the partitioned table + func_expr The item tree reference of the partition function + part_info Reference to partitioning data structure + sub_part Is the table subpartitioned as well + RETURN VALUE + TRUE An error occurred, something was wrong with the + partition function. + FALSE Ok, a partition field array was created + DESCRIPTION + The function uses a new feature in fix_fields where the flag + GET_FIXED_FIELDS_FLAG is set for all fields in the item tree. + This field must always be reset before returning from the function + since it is used for other purposes as well. +*/ + +static bool fix_fields_part_func(THD *thd, TABLE_LIST *tables, + Item* func_expr, partition_info *part_info, + bool sub_part) +{ + /* + Calculate the number of fields in the partition function. + Use it allocate memory for array of Field pointers. + Initialise array of field pointers. Use information set when + calling fix_fields and reset it immediately after. + The get_fields_in_item_tree activates setting of bit in flags + on the field object. 
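+
+    As a sketch of the flow (editor's example): for
+    PARTITION BY HASH (a + b) the fix_fields call below tags the fields
+    a and b with GET_FIXED_FIELDS_FLAG, after which set_up_field_array
+    collects them into the partition field array and resets the flag.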
+ */ + + bool result= TRUE; + TABLE *table= tables->table; + TABLE_LIST *save_table_list, *save_first_table, *save_last_table; + int error; + Name_resolution_context *context; + DBUG_ENTER("fix_fields_part_func"); + + context= thd->lex->current_context(); + table->map= 1; //To ensure correct calculation of const item + table->get_fields_in_item_tree= TRUE; + save_table_list= context->table_list; + save_first_table= context->first_name_resolution_table; + save_last_table= context->last_name_resolution_table; + context->table_list= tables; + context->first_name_resolution_table= tables; + context->last_name_resolution_table= NULL; + func_expr->walk(&Item::change_context_processor, (byte*) context); + thd->where= "partition function"; + error= func_expr->fix_fields(thd, (Item**)0); + context->table_list= save_table_list; + context->first_name_resolution_table= save_first_table; + context->last_name_resolution_table= save_last_table; + if (unlikely(error)) + { + DBUG_PRINT("info", ("Field in partition function not part of table")); + clear_field_flag(table); + goto end; + } + if (unlikely(func_expr->const_item())) + { + my_error(ER_CONST_EXPR_IN_PARTITION_FUNC_ERROR, MYF(0)); + clear_field_flag(table); + goto end; + } + result= set_up_field_array(table, sub_part); +end: + table->get_fields_in_item_tree= FALSE; + table->map= 0; //Restore old value + DBUG_RETURN(result); +} + + +/* + This function verifies that if there is a primary key that it contains + all the fields of the partition function. + This is a temporary limitation that will hopefully be removed after a + while. + SYNOPSIS + check_primary_key() + table TABLE object for which partition fields are set-up + RETURN VALUES + TRUE Not all fields in partitioning function was part + of primary key + FALSE Ok, all fields of partitioning function were part + of primary key +*/ + +static bool check_primary_key(TABLE *table) +{ + uint primary_key= table->s->primary_key; + bool all_fields, some_fields, result= FALSE; + DBUG_ENTER("check_primary_key"); + + if (primary_key < MAX_KEY) + { + set_indicator_in_key_fields(table->key_info+primary_key); + check_fields_in_PF(table->s->part_info->full_part_field_array, + &all_fields, &some_fields); + clear_indicator_in_key_fields(table->key_info+primary_key); + if (unlikely(!all_fields)) + { + my_error(ER_UNIQUE_KEY_NEED_ALL_FIELDS_IN_PF,MYF(0),"PRIMARY KEY"); + result= TRUE; + } + } + DBUG_RETURN(result); +} + + +/* + This function verifies that if there is a unique index that it contains + all the fields of the partition function. + This is a temporary limitation that will hopefully be removed after a + while. 
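+  To see why the restriction is needed (editor's note): with
+  PARTITION BY HASH (a) and UNIQUE (b), two rows with the same b value
+  but different a values can be placed in different partitions, and no
+  single underlying handler could then detect the duplicate; requiring
+  all partition function fields in the unique key avoids this.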
+ SYNOPSIS + check_unique_keys() + table TABLE object for which partition fields are set-up + RETURN VALUES + TRUE Not all fields in partitioning function was part + of all unique keys + FALSE Ok, all fields of partitioning function were part + of unique keys +*/ + +static bool check_unique_keys(TABLE *table) +{ + bool all_fields, some_fields, result= FALSE; + uint keys= table->s->keys, i; + DBUG_ENTER("check_unique_keys"); + for (i= 0; i < keys; i++) + { + if (table->key_info[i].flags & HA_NOSAME) //Unique index + { + set_indicator_in_key_fields(table->key_info+i); + check_fields_in_PF(table->s->part_info->full_part_field_array, + &all_fields, &some_fields); + clear_indicator_in_key_fields(table->key_info+i); + if (unlikely(!all_fields)) + { + my_error(ER_UNIQUE_KEY_NEED_ALL_FIELDS_IN_PF,MYF(0),"UNIQUE INDEX"); + result= TRUE; + break; + } + } + } + DBUG_RETURN(result); +} + + +/* + An important optimisation is whether a range on a field can select a subset + of the partitions. + A prerequisite for this to happen is that the PF is a growing function OR + a shrinking function. + This can never happen for a multi-dimensional PF. Thus this can only happen + with PF with at most one field involved in the PF. + The idea is that if the function is a growing function and you know that + the field of the PF is 4 <= A <= 6 then we can convert this to a range + in the PF instead by setting the range to PF(4) <= PF(A) <= PF(6). In the + case of RANGE PARTITIONING and LIST PARTITIONING this can be used to + calculate a set of partitions rather than scanning all of them. + Thus the following prerequisites are there to check if sets of partitions + can be found. + 1) Only possible for RANGE and LIST partitioning (not for subpartitioning) + 2) Only possible if PF only contains 1 field + 3) Possible if PF is a growing function of the field + 4) Possible if PF is a shrinking function of the field + OBSERVATION: + 1) IF f1(A) is a growing function AND f2(A) is a growing function THEN + f1(A) + f2(A) is a growing function + f1(A) * f2(A) is a growing function if f1(A) >= 0 and f2(A) >= 0 + 2) IF f1(A) is a growing function and f2(A) is a shrinking function THEN + f1(A) / f2(A) is a growing function if f1(A) >= 0 and f2(A) > 0 + 3) IF A is a growing function then a function f(A) that removes the + least significant portion of A is a growing function + E.g. DATE(datetime) is a growing function + MONTH(datetime) is not a growing/shrinking function + 4) IF f1(A) is a growing function and f2(A) is a growing function THEN + f1(f2(A)) and f2(f1(A)) are also growing functions + 5) IF f1(A) is a shrinking function and f2(A) is a growing function THEN + f1(f2(A)) is a shrinking function and f2(f1(A)) is a shrinking function + 6) f1(A) = A is a growing function + 7) f1(A) = A*a + b (where a and b are constants) is a growing function + + By analysing the item tree of the PF we can use these deducements and + derive whether the PF is a growing function or a shrinking function or + neither of it. + + If the PF is range capable then a flag is set on the table object + indicating this to notify that we can use also ranges on the field + of the PF to deduce a set of partitions if the fields of the PF were + not all fully bound. + SYNOPSIS + check_range_capable_PF() + table TABLE object for which partition fields are set-up + DESCRIPTION + Support for this is not implemented yet. 
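+
+  A sketch of the intended deduction (editor's illustration): with
+  PF(A) = A * 2 + 3, a growing function by rule 7 above, the interval
+  4 <= A <= 6 maps to 11 <= PF(A) <= 15, so for RANGE partitioning only
+  partitions whose ranges intersect [11, 15] would need to be scanned.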
+*/
+
+void check_range_capable_PF(TABLE *table)
+{
+  DBUG_ENTER("check_range_capable_PF");
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Set up partition key maps
+  SYNOPSIS
+    set_up_partition_key_maps()
+      table                TABLE object for which partition fields are set-up
+      part_info            Reference to partitioning data structure
+  RETURN VALUES
+    None
+  DESCRIPTION
+    This function sets up a couple of key maps to be able to quickly check
+    if an index ever can be used to deduce the partition fields or even
+    a part of the fields of the partition function.
+    We set up the following key_map's (PF = Partition Function):
+    1) all_fields_in_PF: all fields of the PF are part of the key
+    2) all_fields_in_PPF: all fields of the partitioning (upper level)
+       function are part of the key
+    3) all_fields_in_SPF: all fields of the subpartitioning function are
+       part of the key
+    4) some_fields_in_PF: at least one field of the PF is part of the key
+*/
+
+static void set_up_partition_key_maps(TABLE *table,
+                                      partition_info *part_info)
+{
+  uint keys= table->s->keys, i;
+  bool all_fields, some_fields;
+  DBUG_ENTER("set_up_partition_key_maps");
+
+  part_info->all_fields_in_PF.clear_all();
+  part_info->all_fields_in_PPF.clear_all();
+  part_info->all_fields_in_SPF.clear_all();
+  part_info->some_fields_in_PF.clear_all();
+  for (i= 0; i < keys; i++)
+  {
+    set_indicator_in_key_fields(table->key_info+i);
+    check_fields_in_PF(part_info->full_part_field_array,
+                       &all_fields, &some_fields);
+    if (all_fields)
+      part_info->all_fields_in_PF.set_bit(i);
+    if (some_fields)
+      part_info->some_fields_in_PF.set_bit(i);
+    if (is_sub_partitioned(part_info))
+    {
+      check_fields_in_PF(part_info->part_field_array,
+                         &all_fields, &some_fields);
+      if (all_fields)
+        part_info->all_fields_in_PPF.set_bit(i);
+      check_fields_in_PF(part_info->subpart_field_array,
+                         &all_fields, &some_fields);
+      if (all_fields)
+        part_info->all_fields_in_SPF.set_bit(i);
+    }
+    clear_indicator_in_key_fields(table->key_info+i);
+  }
+  DBUG_VOID_RETURN;
+}
+
+
+/*
+  Set up all function pointers for calculation of partition id,
+  subpartition id and the upper part in subpartitioning. This is to speed
+  up execution of get_partition_id, which is executed once for every
+  record to be written and deleted, and twice for updates.
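+
+  Once set up, a handler computes the partition identity with a single
+  indirect call, e.g. (editor's sketch):
+
+    uint32 part_id;
+    if (part_info->get_partition_id(part_info, &part_id))
+      ...                      the value fits no partition, an error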
+ SYNOPSIS + set_up_partition_function_pointers() + part_info Reference to partitioning data structure +*/ + +static void set_up_partition_func_pointers(partition_info *part_info) +{ + if (is_sub_partitioned(part_info)) + { + if (part_info->part_type == RANGE_PARTITION) + { + part_info->get_part_partition_id= get_partition_id_range; + if (part_info->list_of_subpart_fields) + { + if (part_info->linear_hash_ind) + { + part_info->get_partition_id= get_partition_id_range_sub_linear_key; + part_info->get_subpartition_id= get_partition_id_linear_key_sub; + } + else + { + part_info->get_partition_id= get_partition_id_range_sub_key; + part_info->get_subpartition_id= get_partition_id_key_sub; + } + } + else + { + if (part_info->linear_hash_ind) + { + part_info->get_partition_id= get_partition_id_range_sub_linear_hash; + part_info->get_subpartition_id= get_partition_id_linear_hash_sub; + } + else + { + part_info->get_partition_id= get_partition_id_range_sub_hash; + part_info->get_subpartition_id= get_partition_id_hash_sub; + } + } + } + else //LIST Partitioning + { + part_info->get_part_partition_id= get_partition_id_list; + if (part_info->list_of_subpart_fields) + { + if (part_info->linear_hash_ind) + { + part_info->get_partition_id= get_partition_id_list_sub_linear_key; + part_info->get_subpartition_id= get_partition_id_linear_key_sub; + } + else + { + part_info->get_partition_id= get_partition_id_list_sub_key; + part_info->get_subpartition_id= get_partition_id_key_sub; + } + } + else + { + if (part_info->linear_hash_ind) + { + part_info->get_partition_id= get_partition_id_list_sub_linear_hash; + part_info->get_subpartition_id= get_partition_id_linear_hash_sub; + } + else + { + part_info->get_partition_id= get_partition_id_list_sub_hash; + part_info->get_subpartition_id= get_partition_id_hash_sub; + } + } + } + } + else //No subpartitioning + { + part_info->get_part_partition_id= NULL; + part_info->get_subpartition_id= NULL; + if (part_info->part_type == RANGE_PARTITION) + part_info->get_partition_id= get_partition_id_range; + else if (part_info->part_type == LIST_PARTITION) + part_info->get_partition_id= get_partition_id_list; + else //HASH partitioning + { + if (part_info->list_of_part_fields) + { + if (part_info->linear_hash_ind) + part_info->get_partition_id= get_partition_id_linear_key_nosub; + else + part_info->get_partition_id= get_partition_id_key_nosub; + } + else + { + if (part_info->linear_hash_ind) + part_info->get_partition_id= get_partition_id_linear_hash_nosub; + else + part_info->get_partition_id= get_partition_id_hash_nosub; + } + } + } +} + + +/* + For linear hashing we need a mask which is on the form 2**n - 1 where + 2**n >= no_parts. Thus if no_parts is 6 then mask is 2**3 - 1 = 8 - 1 = 7. + SYNOPSIS + set_linear_hash_mask() + part_info Reference to partitioning data structure + no_parts Number of parts in linear hash partitioning +*/ + +static void set_linear_hash_mask(partition_info *part_info, uint no_parts) +{ + uint mask; + for (mask= 1; mask < no_parts; mask<<=1) + ; + part_info->linear_hash_mask= mask - 1; +} + + +/* + This function calculates the partition id provided the result of the hash + function using linear hashing parameters, mask and number of partitions. 
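+  Worked example (editor's): no_parts = 6 gives mask = 7 via
+  set_linear_hash_mask; hash value 13 gives 13 & 7 = 5, a valid
+  partition, while 15 & 7 = 7 >= 6 folds back with the halved mask to
+  15 & 3 = 3.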
+  SYNOPSIS
+    get_part_id_from_linear_hash()
+      hash_value           Hash value calculated by HASH function or
+                           KEY function
+      mask                 Mask calculated previously by
+                           set_linear_hash_mask
+      no_parts             Number of partitions in HASH partitioned part
+  RETURN VALUE
+    part_id                The calculated partition identity (starting at 0)
+  DESCRIPTION
+    The partition is calculated according to the theory of linear hashing.
+    See e.g. Linear hashing: a new tool for file and table addressing,
+    reprinted from VLDB-80 in Readings in Database Systems, 2nd ed,
+    M. Stonebraker (ed.), Morgan Kaufmann 1994.
+*/
+
+static uint32 get_part_id_from_linear_hash(longlong hash_value, uint mask,
+                                           uint no_parts)
+{
+  uint32 part_id= (uint32)(hash_value & mask);
+  if (part_id >= no_parts)
+  {
+    uint new_mask= ((mask + 1) >> 1) - 1;
+    part_id= hash_value & new_mask;
+  }
+  return part_id;
+}
+
+/*
+  This function is called as part of opening the table by opening the .frm
+  file. Doing this is part of CREATE TABLE, so it is quite permissible
+  that errors due to erroneous syntax are not found until we come here.
+  A user referring to a non-existing field in the table is one example of
+  an error that is not discovered until here.
+  SYNOPSIS
+    fix_partition_func()
+      thd                  The thread object
+      name                 The name of the partitioned table
+      table                TABLE object for which partition fields are
+                           set-up
+  RETURN VALUE
+    TRUE                   Error
+    FALSE                  Success
+  DESCRIPTION
+    The name parameter contains the full table name and is used to get the
+    database name of the table which is used to set-up a correct
+    TABLE_LIST object for use in fix_fields.
+*/
+
+bool fix_partition_func(THD *thd, const char* name, TABLE *table)
+{
+  bool result= TRUE;
+  uint dir_length, home_dir_length;
+  TABLE_LIST tables;
+  TABLE_SHARE *share= table->s;
+  char db_name_string[FN_REFLEN];
+  char* db_name;
+  partition_info *part_info= share->part_info;
+  ulong save_set_query_id= thd->set_query_id;
+  DBUG_ENTER("fix_partition_func");
+
+  thd->set_query_id= 0;
+  /*
+    Set up the TABLE_LIST object to be a list with a single table.
+    Set the object to zero to create NULL pointers, set alias and real
+    name to the table name and get the database name from the file name.
+  */
+
+  bzero((void*)&tables, sizeof(TABLE_LIST));
+  tables.alias= tables.table_name= (char*)share->table_name;
+  tables.table= table;
+  tables.next_local= 0;
+  tables.next_name_resolution_table= 0;
+  strmov(db_name_string, name);
+  dir_length= dirname_length(db_name_string);
+  db_name_string[dir_length - 1]= 0;
+  home_dir_length= dirname_length(db_name_string);
+  db_name= &db_name_string[home_dir_length];
+  tables.db= db_name;
+
+  if (is_sub_partitioned(part_info))
+  {
+    DBUG_ASSERT(part_info->subpart_type == HASH_PARTITION);
+    /*
+      Subpartition is defined. We need to verify that the subpartitioning
+      function is correct.
+    */
+    if (part_info->linear_hash_ind)
+      set_linear_hash_mask(part_info, part_info->no_subparts);
+    if (part_info->list_of_subpart_fields)
+    {
+      List_iterator<char> it(part_info->subpart_field_list);
+      if (unlikely(handle_list_of_fields(it, table, part_info, TRUE)))
+        goto end;
+    }
+    else
+    {
+      if (unlikely(fix_fields_part_func(thd, &tables,
+                                        part_info->subpart_expr, part_info,
+                                        TRUE)))
+        goto end;
+      if (unlikely(part_info->subpart_expr->result_type() != INT_RESULT))
+      {
+        my_error(ER_PARTITION_FUNC_NOT_ALLOWED_ERROR, MYF(0),
+                 "SUBPARTITION");
+        goto end;
+      }
+    }
+  }
+  DBUG_ASSERT(part_info->part_type != NOT_A_PARTITION);
+  /*
+    Partition is defined. We need to verify that the partitioning
+    function is correct.
+  */
+  if (part_info->part_type == HASH_PARTITION)
+  {
+    if (part_info->linear_hash_ind)
+      set_linear_hash_mask(part_info, part_info->no_parts);
+    if (part_info->list_of_part_fields)
+    {
+      List_iterator<char> it(part_info->part_field_list);
+      if (unlikely(handle_list_of_fields(it, table, part_info, FALSE)))
+        goto end;
+    }
+    else
+    {
+      if (unlikely(fix_fields_part_func(thd, &tables, part_info->part_expr,
+                                        part_info, FALSE)))
+        goto end;
+      if (unlikely(part_info->part_expr->result_type() != INT_RESULT))
+      {
+        my_error(ER_PARTITION_FUNC_NOT_ALLOWED_ERROR, MYF(0), part_str);
+        goto end;
+      }
+      part_info->part_result_type= INT_RESULT;
+    }
+  }
+  else
+  {
+    char *error_str;
+    if (part_info->part_type == RANGE_PARTITION)
+    {
+      error_str= range_str;
+      if (unlikely(check_range_constants(part_info)))
+        goto end;
+    }
+    else if (part_info->part_type == LIST_PARTITION)
+    {
+      error_str= list_str;
+      if (unlikely(check_list_constants(part_info)))
+        goto end;
+    }
+    else
+    {
+      DBUG_ASSERT(0);
+      my_error(ER_INCONSISTENT_PARTITION_INFO_ERROR, MYF(0));
+      goto end;
+    }
+    if (unlikely(part_info->no_parts < 1))
+    {
+      my_error(ER_PARTITIONS_MUST_BE_DEFINED_ERROR, MYF(0), error_str);
+      goto end;
+    }
+    if (unlikely(fix_fields_part_func(thd, &tables, part_info->part_expr,
+                                      part_info, FALSE)))
+      goto end;
+    if (unlikely(part_info->part_expr->result_type() != INT_RESULT))
+    {
+      my_error(ER_PARTITION_FUNC_NOT_ALLOWED_ERROR, MYF(0), part_str);
+      goto end;
+    }
+  }
+  if (unlikely(create_full_part_field_array(table, part_info)))
+    goto end;
+  if (unlikely(check_primary_key(table)))
+    goto end;
+  if (unlikely(!(table->file->partition_flags() & HA_CAN_PARTITION_UNIQUE) &&
+               check_unique_keys(table)))
+    goto end;
+  check_range_capable_PF(table);
+  set_up_partition_key_maps(table, part_info);
+  set_up_partition_func_pointers(part_info);
+  result= FALSE;
+end:
+  thd->set_query_id= save_set_query_id;
+  DBUG_RETURN(result);
+}
+
+
+/*
+  The code below is support routines for the reverse parsing of the
+  partitioning syntax. This feature is very useful when generating syntax
+  for all default values, to avoid all default checking when opening the
+  frm file. It is also used when altering the partitioning through various
+  ALTER TABLE commands. Finally it is used for SHOW CREATE TABLE.
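+
+  The generated text has roughly the shape (editor's sketch for a
+  hypothetical table):
+   PARTITION BY RANGE (a)
+  (PARTITION p0 VALUES LESS THAN (10) ENGINE = MyISAM,
+   PARTITION p1 VALUES LESS THAN MAXVALUE ENGINE = MyISAM)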
+*/ + +static int add_write(File fptr, const char *buf, uint len) +{ + uint len_written= my_write(fptr, buf, len, MYF(0)); + if (likely(len == len_written)) + return 0; + else + return 1; +} + +static int add_string(File fptr, const char *string) +{ + return add_write(fptr, string, strlen(string)); +} + +static int add_string_len(File fptr, const char *string, uint len) +{ + return add_write(fptr, string, len); +} + +static int add_space(File fptr) +{ + return add_string(fptr, space_str); +} + +static int add_comma(File fptr) +{ + return add_string(fptr, comma_str); +} + +static int add_equal(File fptr) +{ + return add_string(fptr, equal_str); +} + +static int add_end_parenthesis(File fptr) +{ + return add_string(fptr, end_paren_str); +} + +static int add_begin_parenthesis(File fptr) +{ + return add_string(fptr, begin_paren_str); +} + +static int add_part_key_word(File fptr, const char *key_string) +{ + int err= add_string(fptr, key_string); + err+= add_space(fptr); + return err + add_begin_parenthesis(fptr); +} + +static int add_hash(File fptr) +{ + return add_part_key_word(fptr, hash_str); +} + +static int add_partition(File fptr) +{ + strxmov(buff, part_str, space_str, NullS); + return add_string(fptr, buff); +} + +static int add_subpartition(File fptr) +{ + int err= add_string(fptr, sub_str); + return err + add_partition(fptr); +} + +static int add_partition_by(File fptr) +{ + strxmov(buff, part_str, space_str, by_str, space_str, NullS); + return add_string(fptr, buff); +} + +static int add_subpartition_by(File fptr) +{ + int err= add_string(fptr, sub_str); + return err + add_partition_by(fptr); +} + +static int add_key_partition(File fptr, List<char> field_list) +{ + uint i, no_fields; + int err; + List_iterator<char> part_it(field_list); + err= add_part_key_word(fptr, key_str); + no_fields= field_list.elements; + i= 0; + do + { + const char *field_str= part_it++; + err+= add_string(fptr, field_str); + if (i != (no_fields-1)) + err+= add_comma(fptr); + } while (++i < no_fields); + return err; +} + +static int add_int(File fptr, longlong number) +{ + llstr(number, buff); + return add_string(fptr, buff); +} + +static int add_keyword_string(File fptr, const char *keyword, + const char *keystr) +{ + int err= add_string(fptr, keyword); + err+= add_space(fptr); + err+= add_equal(fptr); + err+= add_space(fptr); + err+= add_string(fptr, keystr); + return err + add_space(fptr); +} + +static int add_keyword_int(File fptr, const char *keyword, longlong num) +{ + int err= add_string(fptr, keyword); + err+= add_space(fptr); + err+= add_equal(fptr); + err+= add_space(fptr); + err+= add_int(fptr, num); + return err + add_space(fptr); +} + +static int add_engine(File fptr, enum db_type engine_type) +{ + const char *engine_str= ha_get_storage_engine(engine_type); + int err= add_string(fptr, "ENGINE = "); + return err + add_string(fptr, engine_str); + return err; +} + +static int add_partition_options(File fptr, partition_element *p_elem) +{ + int err= 0; + if (p_elem->tablespace_name) + err+= add_keyword_string(fptr,"TABLESPACE",p_elem->tablespace_name); + if (p_elem->nodegroup_id != UNDEF_NODEGROUP) + err+= add_keyword_int(fptr,"NODEGROUP",(longlong)p_elem->nodegroup_id); + if (p_elem->part_max_rows) + err+= add_keyword_int(fptr,"MAX_ROWS",(longlong)p_elem->part_max_rows); + if (p_elem->part_min_rows) + err+= add_keyword_int(fptr,"MIN_ROWS",(longlong)p_elem->part_min_rows); + if (p_elem->data_file_name) + err+= add_keyword_string(fptr,"DATA DIRECTORY",p_elem->data_file_name); + if 
(p_elem->index_file_name)
+    err+= add_keyword_string(fptr,"INDEX DIRECTORY",p_elem->index_file_name);
+  if (p_elem->part_comment)
+    err+= add_keyword_string(fptr, "COMMENT",p_elem->part_comment);
+  return err + add_engine(fptr,p_elem->engine_type);
+}
+
+static int add_partition_values(File fptr, partition_info *part_info,
+                                partition_element *p_elem)
+{
+  int err= 0;
+  if (part_info->part_type == RANGE_PARTITION)
+  {
+    err+= add_string(fptr, "VALUES LESS THAN ");
+    if (p_elem->range_value != LONGLONG_MAX)
+    {
+      err+= add_begin_parenthesis(fptr);
+      err+= add_int(fptr, p_elem->range_value);
+      err+= add_end_parenthesis(fptr);
+    }
+    else
+      err+= add_string(fptr, "MAXVALUE");
+  }
+  else if (part_info->part_type == LIST_PARTITION)
+  {
+    uint i;
+    List_iterator<longlong> list_val_it(p_elem->list_val_list);
+    err+= add_string(fptr, "VALUES IN ");
+    uint no_items= p_elem->list_val_list.elements;
+    err+= add_begin_parenthesis(fptr);
+    i= 0;
+    do
+    {
+      longlong *list_value= list_val_it++;
+      err+= add_int(fptr, *list_value);
+      if (i != (no_items-1))
+        err+= add_comma(fptr);
+    } while (++i < no_items);
+    err+= add_end_parenthesis(fptr);
+  }
+  return err + add_space(fptr);
+}
+
+/*
+  Generate the partition syntax from the partition data structure.
+  Useful for supporting generation of defaults, for SHOW CREATE TABLE
+  and for easy partition management.
+  SYNOPSIS
+    generate_partition_syntax()
+      part_info            The partitioning data structure
+      buf_length           A pointer to the returned buffer length
+      use_sql_alloc        Allocate buffer from sql_alloc if true,
+                           otherwise use my_malloc
+  RETURN VALUES
+    NULL                   error
+    buf, buf_length        Buffer and its length
+  DESCRIPTION
+    Here we will generate the full syntax for the given command where all
+    defaults have been expanded. By doing so it is also possible to make
+    lots of correctness checks along the way.
+    This code will also be reused for SHOW CREATE TABLE and for all types
+    of ALTER TABLE commands that change the PARTITION structure in any
+    fashion.
+
+    The implementation writes the syntax to a temporary file (essentially
+    an abstraction of a dynamic array) and if all writes go well it
+    allocates a buffer, reads the syntax back into it and returns the
+    buffer.
+
+    As a security precaution the file is deleted before writing into it.
+    This means that no other processes on the machine can open and read
+    the file while this processing is ongoing.
+
+    The code is optimised for minimal code size since it is not used in
+    any common queries.
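+
+    Typical use is a sketch like (editor's example):
+
+      uint syntax_len;
+      char *syntax= generate_partition_syntax(part_info, &syntax_len,
+                                              TRUE);
+
+    where a TRUE use_sql_alloc means the buffer is released together
+    with the statement memory, while a my_malloc'ed buffer must be
+    freed by the caller with my_free.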
+*/ + +char *generate_partition_syntax(partition_info *part_info, + uint *buf_length, + bool use_sql_alloc) +{ + uint i,j, no_parts, no_subparts; + partition_element *part_elem; + ulonglong buffer_length; + char path[FN_REFLEN]; + int err= 0; + DBUG_ENTER("generate_partition_syntax"); + File fptr; + char *buf= NULL; //Return buffer + const char *file_name; + sprintf(path, "%s_%lx_%lx", "part_syntax", current_pid, + current_thd->thread_id); + fn_format(path,path,mysql_tmpdir,".psy", MY_REPLACE_EXT); + file_name= &path[0]; + DBUG_PRINT("info", ("File name = %s", file_name)); + if (unlikely(((fptr= my_open(file_name,O_CREAT|O_RDWR, MYF(MY_WME))) == -1))) + DBUG_RETURN(NULL); +#if defined(MSDOS) || defined(__WIN__) || defined(__EMX__) || defined(OS2) +#else + my_delete(file_name, MYF(0)); +#endif + err+= add_space(fptr); + err+= add_partition_by(fptr); + switch (part_info->part_type) + { + case RANGE_PARTITION: + err+= add_part_key_word(fptr, range_str); + break; + case LIST_PARTITION: + err+= add_part_key_word(fptr, list_str); + break; + case HASH_PARTITION: + if (part_info->linear_hash_ind) + err+= add_string(fptr, "LINEAR "); + if (part_info->list_of_part_fields) + err+= add_key_partition(fptr, part_info->part_field_list); + else + err+= add_hash(fptr); + break; + default: + DBUG_ASSERT(0); + /* We really shouldn't get here, no use in continuing from here */ + current_thd->fatal_error(); + DBUG_RETURN(NULL); + } + if (part_info->part_expr) + err+= add_string_len(fptr, part_info->part_func_string, + part_info->part_func_len); + err+= add_end_parenthesis(fptr); + err+= add_space(fptr); + if (is_sub_partitioned(part_info)) + { + err+= add_subpartition_by(fptr); + /* Must be hash partitioning for subpartitioning */ + if (part_info->list_of_subpart_fields) + err+= add_key_partition(fptr, part_info->subpart_field_list); + else + err+= add_hash(fptr); + if (part_info->subpart_expr) + err+= add_string_len(fptr, part_info->subpart_func_string, + part_info->subpart_func_len); + err+= add_end_parenthesis(fptr); + err+= add_space(fptr); + } + err+= add_begin_parenthesis(fptr); + List_iterator<partition_element> part_it(part_info->partitions); + no_parts= part_info->no_parts; + no_subparts= part_info->no_subparts; + i= 0; + do + { + part_elem= part_it++; + err+= add_partition(fptr); + err+= add_string(fptr, part_elem->partition_name); + err+= add_space(fptr); + err+= add_partition_values(fptr, part_info, part_elem); + if (!is_sub_partitioned(part_info)) + err+= add_partition_options(fptr, part_elem); + if (is_sub_partitioned(part_info)) + { + err+= add_space(fptr); + err+= add_begin_parenthesis(fptr); + List_iterator<partition_element> sub_it(part_elem->subpartitions); + j= 0; + do + { + part_elem= sub_it++; + err+= add_subpartition(fptr); + err+= add_string(fptr, part_elem->partition_name); + err+= add_space(fptr); + err+= add_partition_options(fptr, part_elem); + if (j != (no_subparts-1)) + { + err+= add_comma(fptr); + err+= add_space(fptr); + } + else + err+= add_end_parenthesis(fptr); + } while (++j < no_subparts); + } + if (i != (no_parts-1)) + { + err+= add_comma(fptr); + err+= add_space(fptr); + } + else + err+= add_end_parenthesis(fptr); + } while (++i < no_parts); + if (err) + goto close_file; + buffer_length= my_seek(fptr, 0L,MY_SEEK_END,MYF(0)); + if (unlikely(buffer_length == MY_FILEPOS_ERROR)) + goto close_file; + if (unlikely(my_seek(fptr, 0L, MY_SEEK_SET, MYF(0)) == MY_FILEPOS_ERROR)) + goto close_file; + *buf_length= (uint)buffer_length; + if (use_sql_alloc) + buf= 
sql_alloc(*buf_length+1);
+  else
+    buf= my_malloc(*buf_length+1, MYF(MY_WME));
+  if (!buf)
+    goto close_file;
+
+  if (unlikely(my_read(fptr, buf, *buf_length, MYF(MY_FNABP))))
+  {
+    if (!use_sql_alloc)
+      my_free(buf, MYF(0));
+    buf= NULL;
+  }
+  else
+    buf[*buf_length]= 0;
+
+close_file:
+  /*
+    Delete the file before closing to ensure the file doesn't get synched
+    to disk unnecessarily. We only used the file system as a dynamic array
+    implementation so we are not really interested in getting the file
+    present on disk.
+    This is not possible on Windows so there it has to be done after
+    closing the file. Also on Unix we delete immediately after opening to
+    ensure no other process can read the information written into the
+    file.
+  */
+  my_close(fptr, MYF(0));
+#if defined(MSDOS) || defined(__WIN__) || defined(__EMX__) || defined(OS2)
+  my_delete(file_name, MYF(0));
+#endif
+  DBUG_RETURN(buf);
+}
+
+
+/*
+  Check if partition key fields are modified and if it can be handled by
+  the underlying storage engine.
+  SYNOPSIS
+    partition_key_modified()
+      table                TABLE object for which partition fields are
+                           set-up
+      fields               A list of the fields to be modified
+  RETURN VALUES
+    TRUE                   Need special handling of UPDATE
+    FALSE                  Normal UPDATE handling is ok
+*/
+
+bool partition_key_modified(TABLE *table, List<Item> &fields)
+{
+  List_iterator_fast<Item> f(fields);
+  partition_info *part_info= table->s->part_info;
+  Item_field *item_field;
+  DBUG_ENTER("partition_key_modified");
+  if (!part_info)
+    DBUG_RETURN(FALSE);
+  if (table->file->partition_flags() & HA_CAN_UPDATE_PARTITION_KEY)
+    DBUG_RETURN(FALSE);
+  f.rewind();
+  while ((item_field=(Item_field*) f++))
+    if (item_field->field->flags & FIELD_IN_PART_FUNC_FLAG)
+      DBUG_RETURN(TRUE);
+  DBUG_RETURN(FALSE);
+}
+
+
+/*
+  The next set of functions are used to calculate the partition identity.
+  A handler sets up a variable that corresponds to one of these functions
+  to be able to quickly call it whenever the partition id needs to be
+  calculated based on the record in table->record[0] (or set up to fake
+  that).
+  There are 4 functions for hash partitioning and 2 for RANGE/LIST
+  partitions. In addition there are 4 variants for RANGE subpartitioning
+  and 4 variants for LIST subpartitioning, thus in total there are 14
+  variants of this function.
+
+  We have a set of support functions for these 14 variants. There are 4
+  variants of hash functions and there is a function for each. The KEY
+  partitioning uses the function calculate_key_value to calculate the hash
+  value based on an array of fields. The linear hash variants use the
+  method get_part_id_from_linear_hash to get the partition id using the
+  hash value and some parameters calculated from the number of partitions.
+*/
+
+/*
+  Calculate hash value for KEY partitioning using an array of fields.
+  SYNOPSIS
+    calculate_key_value()
+      field_array          An array of the fields in KEY partitioning
+  RETURN VALUE
+    hash_value calculated
+  DESCRIPTION
+    Uses the hash function on the character set of the field. Integer
+    and floating point fields use the binary character set by default.
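+
+    Sketch of the mixing (editor's note): nr2 is carried across all
+    fields while nr1 restarts at 1 for each field, so the per-field
+    hashes are chained rather than independent; a NULL value instead
+    contributes hashnr^= (hashnr << 1) | 1 rather than a collation
+    hash.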
+*/
+
+static uint32 calculate_key_value(Field **field_array)
+{
+  uint32 hashnr= 0;
+  ulong nr2= 4;
+  do
+  {
+    Field *field= *field_array;
+    if (field->is_null())
+    {
+      hashnr^= (hashnr << 1) | 1;
+    }
+    else
+    {
+      uint len= field->pack_length();
+      ulong nr1= 1;
+      CHARSET_INFO *cs= field->charset();
+      cs->coll->hash_sort(cs, (uchar*)field->ptr, len, &nr1, &nr2);
+      hashnr^= (uint32)nr1;
+    }
+  } while (*(++field_array));
+  return hashnr;
+}
+
+
+/*
+  A simple support function to calculate part_id given local part and
+  sub part.
+  SYNOPSIS
+    get_part_id_for_sub()
+      loc_part_id          Local partition id
+      sub_part_id          Subpartition id
+      no_subparts          Number of subparts
+*/
+
+inline
+static uint32 get_part_id_for_sub(uint32 loc_part_id, uint32 sub_part_id,
+                                  uint no_subparts)
+{
+  return (uint32)((loc_part_id * no_subparts) + sub_part_id);
+}
+
+
+/*
+  Calculate part_id for (SUB)PARTITION BY HASH
+  SYNOPSIS
+    get_part_id_hash()
+      no_parts             Number of hash partitions
+      part_expr            Item tree of hash function
+  RETURN VALUE
+    Calculated partition id
+*/
+
+inline
+static uint32 get_part_id_hash(uint no_parts,
+                               Item *part_expr)
+{
+  DBUG_ENTER("get_part_id_hash");
+  DBUG_RETURN((uint32)(part_expr->val_int() % no_parts));
+}
+
+
+/*
+  Calculate part_id for (SUB)PARTITION BY LINEAR HASH
+  SYNOPSIS
+    get_part_id_linear_hash()
+      part_info            A reference to the partition_info struct where
+                           all the desired information is given
+      no_parts             Number of hash partitions
+      part_expr            Item tree of hash function
+  RETURN VALUE
+    Calculated partition id
+*/
+
+inline
+static uint32 get_part_id_linear_hash(partition_info *part_info,
+                                      uint no_parts,
+                                      Item *part_expr)
+{
+  DBUG_ENTER("get_part_id_linear_hash");
+  DBUG_RETURN(get_part_id_from_linear_hash(part_expr->val_int(),
+                                           part_info->linear_hash_mask,
+                                           no_parts));
+}
+
+
+/*
+  Calculate part_id for (SUB)PARTITION BY KEY
+  SYNOPSIS
+    get_part_id_key()
+      field_array          Array of fields for PARTITION KEY
+      no_parts             Number of KEY partitions
+  RETURN VALUE
+    Calculated partition id
+*/
+
+inline
+static uint32 get_part_id_key(Field **field_array,
+                              uint no_parts)
+{
+  DBUG_ENTER("get_part_id_key");
+  DBUG_RETURN(calculate_key_value(field_array) % no_parts);
+}
+
+
+/*
+  Calculate part_id for (SUB)PARTITION BY LINEAR KEY
+  SYNOPSIS
+    get_part_id_linear_key()
+      part_info            A reference to the partition_info struct where
+                           all the desired information is given
+      field_array          Array of fields for PARTITION KEY
+      no_parts             Number of KEY partitions
+  RETURN VALUE
+    Calculated partition id
+*/
+
+inline
+static uint32 get_part_id_linear_key(partition_info *part_info,
+                                     Field **field_array,
+                                     uint no_parts)
+{
+  DBUG_ENTER("get_part_id_linear_key");
+  DBUG_RETURN(get_part_id_from_linear_hash(calculate_key_value(field_array),
+                                           part_info->linear_hash_mask,
+                                           no_parts));
+}
+
+/*
+  This function is used to calculate the partition id where all partition
+  fields have been prepared to point to a record where the partition field
+  values are bound.
+  SYNOPSIS
+    get_partition_id()
+      part_info            A reference to the partition_info struct where
+                           all the desired information is given
+      part_id              The partition id is returned through this
+                           pointer
+  RETURN VALUE
+    TRUE                   The fields of the partition function didn't fit
+                           into any partition and thus the values of the
+                           PF-fields are not allowed
+    FALSE                  Ok, *part_id contains the partition id
+  DESCRIPTION
+    A routine used from write_row, update_row and delete_row from any
+    handler supporting partitioning. It is also a support routine for
+    get_partition_set used to find the set of partitions needed to scan
+    for a certain index scan or full table scan.
+
+    It is actually 14 different variants of this function which are called
+    through a function pointer.
+
+    get_partition_id_list
+    get_partition_id_range
+    get_partition_id_hash_nosub
+    get_partition_id_key_nosub
+    get_partition_id_linear_hash_nosub
+    get_partition_id_linear_key_nosub
+    get_partition_id_range_sub_hash
+    get_partition_id_range_sub_key
+    get_partition_id_range_sub_linear_hash
+    get_partition_id_range_sub_linear_key
+    get_partition_id_list_sub_hash
+    get_partition_id_list_sub_key
+    get_partition_id_list_sub_linear_hash
+    get_partition_id_list_sub_linear_key
+*/
+
+/*
+  This function is used to calculate the main partition to use in the case
+  of subpartitioning when we don't know enough to get the partition
+  identity in total.
+  SYNOPSIS
+    get_part_partition_id()
+      part_info            A reference to the partition_info struct where
+                           all the desired information is given
+      part_id              The partition id is returned through this
+                           pointer
+  RETURN VALUE
+    TRUE                   The fields of the partition function didn't fit
+                           into any partition and thus the values of the
+                           PF-fields are not allowed
+    FALSE                  Ok, *part_id contains the partition id
+  DESCRIPTION
+
+    It is actually 6 different variants of this function which are called
+    through a function pointer.
+
+    get_partition_id_list
+    get_partition_id_range
+    get_partition_id_hash_nosub
+    get_partition_id_key_nosub
+    get_partition_id_linear_hash_nosub
+    get_partition_id_linear_key_nosub
+*/
+
+
+bool get_partition_id_list(partition_info *part_info,
+                           uint32 *part_id)
+{
+  DBUG_ENTER("get_partition_id_list");
+  LIST_PART_ENTRY *list_array= part_info->list_array;
+  uint list_index;
+  longlong list_value;
+  uint min_list_index= 0, max_list_index= part_info->no_list_values - 1;
+  longlong part_func_value= part_info->part_expr->val_int();
+  while (max_list_index >= min_list_index)
+  {
+    list_index= (max_list_index + min_list_index) >> 1;
+    list_value= list_array[list_index].list_value;
+    if (list_value < part_func_value)
+      min_list_index= list_index + 1;
+    else if (list_value > part_func_value)
+      max_list_index= list_index - 1;
+    else
+    {
+      *part_id= (uint32)list_array[list_index].partition_id;
+      DBUG_RETURN(FALSE);
+    }
+  }
+  *part_id= 0;
+  DBUG_RETURN(TRUE);
+}
+
+
+bool get_partition_id_range(partition_info *part_info,
+                            uint32 *part_id)
+{
+  DBUG_ENTER("get_partition_id_range");
+  longlong *range_array= part_info->range_int_array;
+  uint max_partition= part_info->no_parts - 1;
+  uint min_part_id= 0, max_part_id= max_partition, loc_part_id;
+  longlong part_func_value= part_info->part_expr->val_int();
+  while (max_part_id > min_part_id)
+  {
+    loc_part_id= (max_part_id + min_part_id + 1) >> 1;
+    if (range_array[loc_part_id] < part_func_value)
+      min_part_id= loc_part_id + 1;
+    else
+      max_part_id= loc_part_id - 1;
+  }
+  loc_part_id= max_part_id;
+  if (part_func_value >= range_array[loc_part_id])
+    if (loc_part_id != max_partition)
+      loc_part_id++;
+  *part_id= (uint32)loc_part_id;
+  if (loc_part_id == max_partition)
+    if (range_array[loc_part_id] != LONGLONG_MAX)
+      if (part_func_value >= range_array[loc_part_id])
+        DBUG_RETURN(TRUE);
+  DBUG_RETURN(FALSE);
+}
+
+bool get_partition_id_hash_nosub(partition_info *part_info,
+                                 uint32 *part_id)
+{
+  *part_id= get_part_id_hash(part_info->no_parts, part_info->part_expr);
+  return FALSE;
+}
+
+
+bool get_partition_id_linear_hash_nosub(partition_info *part_info,
+                                        uint32 *part_id)
+{
+  *part_id= get_part_id_linear_hash(part_info, part_info->no_parts,
+                                    part_info->part_expr);
+  return FALSE;
+}
+
+
+bool get_partition_id_key_nosub(partition_info *part_info,
+                                uint32 *part_id)
+{
+  *part_id= get_part_id_key(part_info->part_field_array,
+                            part_info->no_parts);
+  return FALSE;
+}
+
+
+bool get_partition_id_linear_key_nosub(partition_info *part_info,
+                                       uint32 *part_id)
+{
+  *part_id= get_part_id_linear_key(part_info,
+                                   part_info->part_field_array,
+                                   part_info->no_parts);
+  return FALSE;
+}
+
+
+bool get_partition_id_range_sub_hash(partition_info *part_info,
+                                     uint32 *part_id)
+{
+  uint32 loc_part_id, sub_part_id;
+  uint no_subparts;
+  DBUG_ENTER("get_partition_id_range_sub_hash");
+  if (unlikely(get_partition_id_range(part_info, &loc_part_id)))
+  {
+    DBUG_RETURN(TRUE);
+  }
+  no_subparts= part_info->no_subparts;
+  sub_part_id= get_part_id_hash(no_subparts, part_info->subpart_expr);
+  *part_id= get_part_id_for_sub(loc_part_id, sub_part_id, no_subparts);
+  DBUG_RETURN(FALSE);
+}
+
+
+bool get_partition_id_range_sub_linear_hash(partition_info *part_info,
+                                            uint32 *part_id)
+{
+  uint32 loc_part_id, sub_part_id;
+  uint no_subparts;
+  DBUG_ENTER("get_partition_id_range_sub_linear_hash");
+  if (unlikely(get_partition_id_range(part_info, &loc_part_id)))
+  {
+    DBUG_RETURN(TRUE);
+  }
+  no_subparts= part_info->no_subparts;
+  sub_part_id= get_part_id_linear_hash(part_info, no_subparts,
+                                       part_info->subpart_expr);
+  *part_id= get_part_id_for_sub(loc_part_id, sub_part_id, no_subparts);
+  DBUG_RETURN(FALSE);
+}
+
+
+bool get_partition_id_range_sub_key(partition_info *part_info,
+                                    uint32 *part_id)
+{
+  uint32 loc_part_id, sub_part_id;
+  uint no_subparts;
+  DBUG_ENTER("get_partition_id_range_sub_key");
+  if (unlikely(get_partition_id_range(part_info, &loc_part_id)))
+  {
+    DBUG_RETURN(TRUE);
+  }
+  no_subparts= part_info->no_subparts;
+  sub_part_id= get_part_id_key(part_info->subpart_field_array, no_subparts);
+  *part_id= get_part_id_for_sub(loc_part_id, sub_part_id, no_subparts);
+  DBUG_RETURN(FALSE);
+}
+
+
+bool get_partition_id_range_sub_linear_key(partition_info *part_info,
+                                           uint32 *part_id)
+{
+  uint32 loc_part_id, sub_part_id;
+  uint no_subparts;
+  DBUG_ENTER("get_partition_id_range_sub_linear_key");
+  if (unlikely(get_partition_id_range(part_info, &loc_part_id)))
+  {
+    DBUG_RETURN(TRUE);
+  }
+  no_subparts= part_info->no_subparts;
+  sub_part_id= get_part_id_linear_key(part_info,
+                                      part_info->subpart_field_array,
+                                      no_subparts);
+  *part_id= get_part_id_for_sub(loc_part_id, sub_part_id, no_subparts);
+  DBUG_RETURN(FALSE);
+}
+
+
+bool get_partition_id_list_sub_hash(partition_info *part_info,
+                                    uint32 *part_id)
+{
+  uint32 loc_part_id, sub_part_id;
+  uint no_subparts;
+  DBUG_ENTER("get_partition_id_list_sub_hash");
+  if (unlikely(get_partition_id_list(part_info, &loc_part_id)))
+  {
+    DBUG_RETURN(TRUE);
+  }
+  no_subparts= part_info->no_subparts;
+  sub_part_id= get_part_id_hash(no_subparts, part_info->subpart_expr);
+  *part_id= get_part_id_for_sub(loc_part_id, sub_part_id, no_subparts);
+  DBUG_RETURN(FALSE);
+}
+
+
+bool get_partition_id_list_sub_linear_hash(partition_info *part_info,
+                                           uint32 *part_id)
+{
+  uint32 loc_part_id, sub_part_id;
+  uint no_subparts;
+  DBUG_ENTER("get_partition_id_list_sub_linear_hash");
+  if (unlikely(get_partition_id_list(part_info, &loc_part_id)))
+  {
+    DBUG_RETURN(TRUE);
+  }
+  no_subparts= part_info->no_subparts;
+  sub_part_id= get_part_id_linear_hash(part_info, no_subparts,
+                                       part_info->subpart_expr);
+  *part_id= get_part_id_for_sub(loc_part_id, sub_part_id, no_subparts);
DBUG_RETURN(FALSE); +} + + +bool get_partition_id_list_sub_key(partition_info *part_info, + uint32 *part_id) +{ + uint32 loc_part_id, sub_part_id; + uint no_subparts; + DBUG_ENTER("get_partition_id_list_sub_key"); + if (unlikely(get_partition_id_list(part_info, &loc_part_id))) + { + DBUG_RETURN(TRUE); + } + no_subparts= part_info->no_subparts; + sub_part_id= get_part_id_key(part_info->subpart_field_array, no_subparts); + *part_id= get_part_id_for_sub(loc_part_id, sub_part_id, no_subparts); + DBUG_RETURN(FALSE); +} + + +bool get_partition_id_list_sub_linear_key(partition_info *part_info, + uint32 *part_id) +{ + uint32 loc_part_id, sub_part_id; + uint no_subparts; + DBUG_ENTER("get_partition_id_list_sub_linear_key"); + if (unlikely(get_partition_id_list(part_info, &loc_part_id))) + { + DBUG_RETURN(TRUE); + } + no_subparts= part_info->no_subparts; + sub_part_id= get_part_id_linear_key(part_info, + part_info->subpart_field_array, + no_subparts); + *part_id= get_part_id_for_sub(loc_part_id, sub_part_id, no_subparts); + DBUG_RETURN(FALSE); +} + + +/* + This function is used to calculate the subpartition id. + SYNOPSIS + get_subpartition_id() + part_info A reference to the partition_info struct where all the + desired information is given + RETURN VALUE + part_id + The subpartition identity + DESCRIPTION + A routine used in some SELECTs when only partial knowledge of the + partitions is available. + + There are actually 4 different variants of this function, which are called + through a function pointer. + + get_partition_id_hash_sub + get_partition_id_key_sub + get_partition_id_linear_hash_sub + get_partition_id_linear_key_sub +*/ + +uint32 get_partition_id_hash_sub(partition_info *part_info) +{ + return get_part_id_hash(part_info->no_subparts, part_info->subpart_expr); +} + + +uint32 get_partition_id_linear_hash_sub(partition_info *part_info) +{ + return get_part_id_linear_hash(part_info, part_info->no_subparts, + part_info->subpart_expr); +} + + +uint32 get_partition_id_key_sub(partition_info *part_info) +{ + return get_part_id_key(part_info->subpart_field_array, + part_info->no_subparts); +} + + +uint32 get_partition_id_linear_key_sub(partition_info *part_info) +{ + return get_part_id_linear_key(part_info, + part_info->subpart_field_array, + part_info->no_subparts); +} + + +/* + Set an indicator on all partition fields that are set by the key + SYNOPSIS + set_PF_fields_in_key() + key_info Information about the index + key_length Length of key + RETURN VALUE + TRUE Found partition field set by key + FALSE No partition field set by key +*/ + +static bool set_PF_fields_in_key(KEY *key_info, uint key_length) +{ + KEY_PART_INFO *key_part; + bool found_part_field= FALSE; + DBUG_ENTER("set_PF_fields_in_key"); + + for (key_part= key_info->key_part; (int)key_length > 0; key_part++) + { + if (key_part->null_bit) + key_length--; + if (key_part->type == HA_KEYTYPE_BIT) + { + if (((Field_bit*)key_part->field)->bit_len) + key_length--; + } + if (key_part->key_part_flag & (HA_BLOB_PART | HA_VAR_LENGTH_PART)) + { + key_length-= HA_KEY_BLOB_LENGTH; + } + if (key_length < key_part->length) + break; + key_length-= key_part->length; + if (key_part->field->flags & FIELD_IN_PART_FUNC_FLAG) + { + found_part_field= TRUE; + key_part->field->flags|= GET_FIXED_FIELDS_FLAG; + } + } + DBUG_RETURN(found_part_field); +} + + +/* + We have found that at least one partition field was set by a key, now + check whether the partition function has all its fields bound or not.
+ SYNOPSIS + check_part_func_bound() + ptr NULL-terminated array of fields (the partition fields) + RETURN VALUE + TRUE All fields in partition function are set + FALSE Not all fields in partition function are set +*/ + +static bool check_part_func_bound(Field **ptr) +{ + bool result= TRUE; + DBUG_ENTER("check_part_func_bound"); + + for (; *ptr; ptr++) + { + if (!((*ptr)->flags & GET_FIXED_FIELDS_FLAG)) + { + result= FALSE; + break; + } + } + DBUG_RETURN(result); +} + + +/* + Get the id of the subpartitioning part by using the key buffer of the + index scan. + SYNOPSIS + get_sub_part_id_from_key() + table The table object + buf A buffer that can be used to evaluate the partition function + key_info The index object + key_spec A key_range containing key and key length + RETURN VALUES + part_id Subpartition id to use + DESCRIPTION + Use the key buffer to set up the record in buf, move the field pointers, + get the partition identity and restore the field pointers afterwards. +*/ + +static uint32 get_sub_part_id_from_key(const TABLE *table,byte *buf, + KEY *key_info, + const key_range *key_spec) +{ + byte *rec0= table->record[0]; + partition_info *part_info= table->s->part_info; + uint32 part_id; + DBUG_ENTER("get_sub_part_id_from_key"); + + key_restore(buf, (byte*)key_spec->key, key_info, key_spec->length); + if (likely(rec0 == buf)) + part_id= part_info->get_subpartition_id(part_info); + else + { + Field **part_field_array= part_info->subpart_field_array; + set_field_ptr(part_field_array, buf, rec0); + part_id= part_info->get_subpartition_id(part_info); + set_field_ptr(part_field_array, rec0, buf); + } + DBUG_RETURN(part_id); +} + +/* + Get the id of the partitioning part by using the key buffer of the + index scan. + SYNOPSIS + get_part_id_from_key() + table The table object + buf A buffer that can be used to evaluate the partition function + key_info The index object + key_spec A key_range containing key and key length + part_id Partition to use + RETURN VALUES + TRUE Partition to use not found + FALSE Ok, part_id indicates partition to use + DESCRIPTION + Use the key buffer to set up the record in buf, move the field pointers, + get the partition identity and restore the field pointers afterwards. +*/ +bool get_part_id_from_key(const TABLE *table, byte *buf, KEY *key_info, + const key_range *key_spec, uint32 *part_id) +{ + bool result; + byte *rec0= table->record[0]; + partition_info *part_info= table->s->part_info; + DBUG_ENTER("get_part_id_from_key"); + + key_restore(buf, (byte*)key_spec->key, key_info, key_spec->length); + if (likely(rec0 == buf)) + result= part_info->get_part_partition_id(part_info, part_id); + else + { + Field **part_field_array= part_info->part_field_array; + set_field_ptr(part_field_array, buf, rec0); + result= part_info->get_part_partition_id(part_info, part_id); + set_field_ptr(part_field_array, rec0, buf); + } + DBUG_RETURN(result); +} + +/* + Get the partitioning id of the full PF by using the key buffer of the + index scan. + SYNOPSIS + get_full_part_id_from_key() + table The table object + buf A buffer that is used to evaluate the partition function + key_info The index object + key_spec A key_range containing key and key length + part_spec A part_id_range containing start and end partition + RETURN VALUES + part_spec + No partitions to scan is indicated by start_part > end_part when returning + DESCRIPTION + Use the key buffer to set up the record in buf, move the field pointers if + needed, get the partition identity and restore the field pointers + afterwards.
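+ + EXAMPLE + A minimal caller sketch (an illustration only, not code from this patch): + + part_id_range part_spec; + get_full_part_id_from_key(table, buf, key_info, key_spec, &part_spec); + if (part_spec.start_part > part_spec.end_part) + ; // no partition can contain the key, scan nothing + else + ; // scan the single partition part_spec.start_part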
+*/ + +void get_full_part_id_from_key(const TABLE *table, byte *buf, + KEY *key_info, + const key_range *key_spec, + part_id_range *part_spec) +{ + bool result; + partition_info *part_info= table->s->part_info; + byte *rec0= table->record[0]; + DBUG_ENTER("get_full_part_id_from_key"); + + key_restore(buf, (byte*)key_spec->key, key_info, key_spec->length); + if (likely(rec0 == buf)) + result= part_info->get_partition_id(part_info, &part_spec->start_part); + else + { + Field **part_field_array= part_info->full_part_field_array; + set_field_ptr(part_field_array, buf, rec0); + result= part_info->get_partition_id(part_info, &part_spec->start_part); + set_field_ptr(part_field_array, rec0, buf); + } + part_spec->end_part= part_spec->start_part; + if (unlikely(result)) + part_spec->start_part++; + DBUG_VOID_RETURN; +} + +/* + Get the set of partitions to use in query. + SYNOPSIS + get_partition_set() + table The table object + buf A buffer that can be used to evaluate the partition function + index The index of the key used, if MAX_KEY no index used + key_spec A key_range containing key and key length + part_spec Contains start part, end part and indicator if bitmap is + used for which partitions to scan + DESCRIPTION + This function is called to discover which partitions to use in an index + scan or a full table scan. + It returns a range of partitions to scan. If there are holes in this + range, with partitions that do not need to be scanned, a bit array is + used to signal which partitions to use and which not to use. + If start_part > end_part at return it means no partition needs to be + scanned. If start_part == end_part it always means a single partition + needs to be scanned. + RETURN VALUE + part_spec +*/ +void get_partition_set(const TABLE *table, byte *buf, const uint index, + const key_range *key_spec, part_id_range *part_spec) +{ + partition_info *part_info= table->s->part_info; + uint no_parts= get_tot_partitions(part_info), i, part_id; + uint sub_part= no_parts, part_part= no_parts; + KEY *key_info= NULL; + bool found_part_field= FALSE; + DBUG_ENTER("get_partition_set"); + + part_spec->use_bit_array= FALSE; + part_spec->start_part= 0; + part_spec->end_part= no_parts - 1; + if ((index < MAX_KEY) && + key_spec->flag == (uint)HA_READ_KEY_EXACT && + part_info->some_fields_in_PF.is_set(index)) + { + key_info= table->key_info+index; + /* + The index can potentially provide at least one PF-field (field in the + partition function). Thus it is interesting to continue our probe. + */ + if (key_spec->length == key_info->key_length) + { + /* + The entire key is set, so we can check whether we can immediately + derive the complete PF, or else either the top PF or the + subpartitioning PF. This can be established by checking + precalculated bits on each index. + */ + if (part_info->all_fields_in_PF.is_set(index)) + { + /* + We can derive the exact partition to use, no more than this one + is needed. + */ + get_full_part_id_from_key(table,buf,key_info,key_spec,part_spec); + DBUG_VOID_RETURN; + } + else if (is_sub_partitioned(part_info)) + { + if (part_info->all_fields_in_SPF.is_set(index)) + sub_part= get_sub_part_id_from_key(table, buf, key_info, key_spec); + else if (part_info->all_fields_in_PPF.is_set(index)) + { + if (get_part_id_from_key(table,buf,key_info,key_spec,&part_part)) + { + /* + The value of the RANGE or LIST partitioning was outside of + allowed values. Thus it is certain that the result of this + scan will be empty.
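+ Setting start_part to no_parts below leaves start_part > end_part, + which by convention signals that no partition needs to be scanned.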
+ */ + part_spec->start_part= no_parts; + DBUG_VOID_RETURN; + } + } + } + } + else + { + /* + Set an indicator on all partition fields that are bound. + If at least one PF-field was bound it pays off to check whether + the PF or PPF or SPF has been bound. + (PF = Partition Function, SPF = Subpartition Function and + PPF = Partition Function part of subpartitioning) + */ + if ((found_part_field= set_PF_fields_in_key(key_info, + key_spec->length))) + { + if (check_part_func_bound(part_info->full_part_field_array)) + { + /* + We were able to bind all fields in the partition function even + by using only a part of the key. Calculate the partition to use. + */ + get_full_part_id_from_key(table,buf,key_info,key_spec,part_spec); + clear_indicator_in_key_fields(key_info); + DBUG_VOID_RETURN; + } + else if (check_part_func_bound(part_info->subpart_field_array)) + sub_part= get_sub_part_id_from_key(table, buf, key_info, key_spec); + else if (check_part_func_bound(part_info->part_field_array)) + { + if (get_part_id_from_key(table,buf,key_info,key_spec,&part_part)) + { + part_spec->start_part= no_parts; + clear_indicator_in_key_fields(key_info); + DBUG_VOID_RETURN; + } + } + } + } + } + { + /* + The next step is to analyse the table condition to see whether any + information about which partitions to scan can be derived from there. + Currently not implemented. + */ + } + /* + If we get here, we have either discovered nothing or a range of + partitions with possible holes in it. We need a bit vector to take the + work further. + */ + if (!(part_part == no_parts && sub_part == no_parts)) + { + /* + We can only arrive here if we are using subpartitioning. + */ + if (part_part != no_parts) + { + /* + We know the top partition and need to scan all underlying + subpartitions. This is a range without holes. + */ + DBUG_ASSERT(sub_part == no_parts); + part_spec->start_part= part_part * part_info->no_subparts; + part_spec->end_part= part_spec->start_part+part_info->no_subparts - 1; + } + else + { + DBUG_ASSERT(sub_part != no_parts); + part_spec->use_bit_array= TRUE; + part_spec->start_part= sub_part; + part_spec->end_part= sub_part + + (part_info->no_subparts * (part_info->no_parts - 1)); + for (i= 0, part_id= sub_part; i < part_info->no_parts; + i++, part_id+= part_info->no_subparts) + ; // Set bit part_id in bit array (not yet implemented) + } + } + if (found_part_field) + clear_indicator_in_key_fields(key_info); + DBUG_VOID_RETURN; +} + + +/* + If the table is partitioned we will read the partition info from the + .frm file here. + ------------------------------- + | Fileinfo 64 bytes | + ------------------------------- + | Formnames 7 bytes | + ------------------------------- + | Not used 4021 bytes | + ------------------------------- + | Keyinfo + record | + ------------------------------- + | Padded to next multiple | + | of IO_SIZE | + ------------------------------- + | Forminfo 288 bytes | + ------------------------------- + | Screen buffer, to make | + | field names readable | + ------------------------------- + | Packed field info | + | 17 + 1 + strlen(field_name) | + | + 1 end of file character | + ------------------------------- + | Partition info | + ------------------------------- + We store the length of the partition info in Fileinfo[55-58]. + + Read the partition syntax from the frm file and parse it to get the + data structures of the partitioning.
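+ + As an aside, a sketch of how the stored length can be read back, assuming + the frm header has been read into a buffer fileinfo and using the server's + little-endian 4-byte reader uint4korr (an illustration only, not part of + this patch): + + uint part_info_len= uint4korr(fileinfo + 55);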
+ SYNOPSIS + mysql_unpack_partition() + file File reference of frm file + thd Thread object + part_info_len Length of partition syntax + table Table object of partitioned table + RETURN VALUE + TRUE Error + FALSE Success + DESCRIPTION + Read the partition syntax from the current position in the frm file. + Initialise a LEX object, save the list of item tree objects to free after + the query is done. Set up the partition info object such that the parser + knows it is called internally. Call the parser to create the data + structures (best possible recreation of the item trees and so forth, + since there is no serialisation of these objects other than in parseable + text format). + We need to save the text of the partition functions since it is not + possible to retrace this given an item tree. +*/ + +bool mysql_unpack_partition(File file, THD *thd, uint part_info_len, + TABLE* table) +{ + Item *thd_free_list= thd->free_list; + bool result= TRUE; + uchar* part_buf= NULL; + partition_info *part_info; + LEX *old_lex= thd->lex, lex; + DBUG_ENTER("mysql_unpack_partition"); + if (read_string(file, (gptr*)&part_buf, part_info_len)) + DBUG_RETURN(result); + thd->lex= &lex; + lex_start(thd, part_buf, part_info_len); + /* + We need to use the current SELECT_LEX since we need to keep the + Name_resolution_context object which is referenced from the + Item_field objects. + This is not a nice solution since if the parser uses current_select + for anything else it will corrupt the current LEX object. + */ + thd->lex->current_select= old_lex->current_select; + /* + All Items created are put into a free list on the THD object. This list + is used to free all Item objects after completing a query. We don't + want that to happen with the Item tree created as part of the partition + info. This should be attached to the table object and remain so until + the table object is released. + Thus we move away the current list temporarily and start a new list that + we then save in the partition info structure. + */ + thd->free_list= NULL; + lex.part_info= (partition_info*)1; // Indicate that yyparse is called from here + if (yyparse((void*)thd) || thd->is_fatal_error) + { + free_items(thd->free_list); + goto end; + } + part_info= lex.part_info; + table->s->part_info= part_info; + part_info->item_free_list= thd->free_list; + + { + /* + This code part allocates memory for the serialised text of the + partition functions. In most cases this is not needed, but if the + table is used for SHOW CREATE TABLE or an ALTER TABLE that modifies + partition information it is, and the info is lost unless we save it + here. Unfortunately we therefore have to do it even when it is not + needed in most cases. This is a consequence of the fact that item + trees are not serialisable.
+ */ + uint part_func_len= part_info->part_func_len; + uint subpart_func_len= part_info->subpart_func_len; + char *part_func_string, *subpart_func_string= NULL; + if (!((part_func_string= sql_alloc(part_func_len))) || + (subpart_func_len && + !((subpart_func_string= sql_alloc(subpart_func_len))))) + { + my_error(ER_OUTOFMEMORY, MYF(0), part_func_len); + free_items(thd->free_list); + part_info->item_free_list= 0; + goto end; + } + memcpy(part_func_string, part_info->part_func_string, part_func_len); + if (subpart_func_len) + memcpy(subpart_func_string, part_info->subpart_func_string, + subpart_func_len); + part_info->part_func_string= part_func_string; + part_info->subpart_func_string= subpart_func_string; + } + + result= FALSE; +end: + thd->free_list= thd_free_list; + x_free((gptr)part_buf); + thd->lex= old_lex; + DBUG_RETURN(result); +} +#endif + +/* + Prepare for calling val_int on partition function by setting fields to + point to the record where the values of the PF-fields are stored. + SYNOPSIS + set_field_ptr() + ptr Array of fields to change ptr + new_buf New record pointer + old_buf Old record pointer + DESCRIPTION + Set ptr in field objects of field array to refer to new_buf record + instead of previously old_buf. Used before calling val_int, and + afterwards to restore the pointers to table->record[0]. + This routine is placed outside of partition code since it can also be + useful elsewhere. +*/ + +void set_field_ptr(Field **ptr, const byte *new_buf, + const byte *old_buf) +{ + my_ptrdiff_t diff= (new_buf - old_buf); + DBUG_ENTER("set_field_ptr"); + + do + { + (*ptr)->move_field(diff); + } while (*(++ptr)); + DBUG_VOID_RETURN; +} + + +/* + Prepare for calling val_int on partition function by setting fields to + point to the record where the values of the PF-fields are stored. + This variant works on a key_part reference. + It is not required that all fields are NOT NULL fields. + SYNOPSIS + set_key_field_ptr() + key_part key part with a set of fields to change ptr + new_buf New record pointer + old_buf Old record pointer + DESCRIPTION + Set ptr in field objects of field array to refer to new_buf record + instead of previously old_buf. Used before calling val_int, and + afterwards to restore the pointers to table->record[0]. + This routine is placed outside of partition code since it can also be + useful elsewhere.
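+ + EXAMPLE + A usage sketch (an illustration only, not code from this patch): move the + key fields to point into record[1], evaluate the partition function there + and then restore the pointers: + + set_key_field_ptr(key_info, table->record[1], table->record[0]); + ... evaluate the partition function on record[1] ... + set_key_field_ptr(key_info, table->record[0], table->record[1]);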
+*/ + +void set_key_field_ptr(KEY *key_info, const byte *new_buf, + const byte *old_buf) +{ + KEY_PART_INFO *key_part= key_info->key_part; + uint key_parts= key_info->key_parts, i= 0; + my_ptrdiff_t diff= (new_buf - old_buf); + DBUG_ENTER("set_key_field_ptr"); + + do + { + key_part->field->move_field(diff); + key_part++; + } while (++i < key_parts); + DBUG_VOID_RETURN; +} + diff --git a/sql/sql_prepare.cc b/sql/sql_prepare.cc index 879ea626494..73bbce647bd 100644 --- a/sql/sql_prepare.cc +++ b/sql/sql_prepare.cc @@ -2686,11 +2686,11 @@ bool Prepared_statement::prepare(const char *packet, uint packet_len) old_stmt_arena= thd->stmt_arena; thd->stmt_arena= this; lex_start(thd, (uchar*) thd->query, thd->query_length); - lex->safe_to_cache_query= FALSE; lex->stmt_prepare_mode= TRUE; rc= yyparse((void *)thd) || thd->is_fatal_error || thd->net.report_error || init_param_array(this); + lex->safe_to_cache_query= FALSE; /* While doing context analysis of the query (in check_prepared_statement) we allocate a lot of additional memory: for open tables, JOINs, derived diff --git a/sql/sql_repl.cc b/sql/sql_repl.cc index 32a8378d41d..d376423e990 100644 --- a/sql/sql_repl.cc +++ b/sql/sql_repl.cc @@ -19,6 +19,7 @@ #include "sql_repl.h" #include "log_event.h" +#include "rpl_filter.h" #include <my_dir.h> int max_binlog_dump_events = 0; // unlimited @@ -1455,8 +1456,8 @@ bool show_binlog_info(THD* thd) int dir_len = dirname_length(li.log_file_name); protocol->store(li.log_file_name + dir_len, &my_charset_bin); protocol->store((ulonglong) li.pos); - protocol->store(&binlog_do_db); - protocol->store(&binlog_ignore_db); + protocol->store(binlog_filter->get_do_db()); + protocol->store(binlog_filter->get_ignore_db()); if (protocol->write()) DBUG_RETURN(TRUE); } diff --git a/sql/sql_repl.h b/sql/sql_repl.h index 9eb6456ee20..ba64e626adc 100644 --- a/sql/sql_repl.h +++ b/sql/sql_repl.h @@ -31,7 +31,6 @@ typedef struct st_slave_info extern my_bool opt_show_slave_auth_info; extern char *master_host, *master_info_file; extern bool server_id_supplied; -extern I_List<i_string> binlog_do_db, binlog_ignore_db; extern int max_binlog_dump_events; extern my_bool opt_sporadic_binlog_dump_fail; diff --git a/sql/sql_select.cc b/sql/sql_select.cc index 04969b37012..0fc43345c04 100644 --- a/sql/sql_select.cc +++ b/sql/sql_select.cc @@ -941,23 +941,19 @@ JOIN::optimize() } /* - Need to tell Innobase that to play it safe, it should fetch all - columns of the tables: this is because MySQL may build row - pointers for the rows, and for all columns of the primary key the - field->query_id has not necessarily been set to thd->query_id by - MySQL. + Need to tell handlers to play it safe and fetch all + columns of the primary key of the tables: this is because MySQL may + build row pointers for the rows, and for all columns of the primary key + the read set has not necessarily been set by the server code.
*/ - -#ifdef HAVE_INNOBASE_DB if (need_tmp || select_distinct || group_list || order) { for (uint i_h = const_tables; i_h < tables; i_h++) { TABLE* table_h = join_tab[i_h].table; - table_h->file->extra(HA_EXTRA_RETRIEVE_PRIMARY_KEY); + table_h->file->ha_retrieve_all_pk(); } } -#endif DBUG_EXECUTE("info",TEST_join(this);); @@ -1301,6 +1297,9 @@ JOIN::exec() /* Copy data to the temporary table */ thd->proc_info= "Copying to tmp table"; DBUG_PRINT("info", ("%s", thd->proc_info)); + if (!curr_join->sort_and_group && + curr_join->const_tables != curr_join->tables) + curr_join->join_tab[curr_join->const_tables].sorted= 0; if ((tmp_error= do_select(curr_join, (List<Item> *) 0, curr_tmp_table, 0))) { error= tmp_error; @@ -1447,6 +1446,9 @@ JOIN::exec() 1, TRUE)) DBUG_VOID_RETURN; curr_join->group_list= 0; + if (!curr_join->sort_and_group && + curr_join->const_tables != curr_join->tables) + curr_join->join_tab[curr_join->const_tables].sorted= 0; if (setup_sum_funcs(curr_join->thd, curr_join->sum_funcs) || (tmp_error= do_select(curr_join, (List<Item> *) 0, curr_tmp_table, 0))) @@ -1633,6 +1635,16 @@ JOIN::exec() (select_options & OPTION_FOUND_ROWS ? HA_POS_ERROR : unit->select_limit_cnt))) DBUG_VOID_RETURN; + if (curr_join->const_tables != curr_join->tables && + !curr_join->join_tab[curr_join->const_tables].table->sort.io_cache) + { + /* + If no IO cache exists for the first table then we are using an + INDEX SCAN and no filesort. Thus we should not remove the sorted + attribute on the INDEX SCAN. + */ + skip_sort_order= 1; + } } } /* XXX: When can we have here thd->net.report_error not zero? */ @@ -5728,6 +5740,7 @@ make_join_readinfo(JOIN *join, uint options) uint i; bool statistics= test(!(join->select_options & SELECT_DESCRIBE)); + bool sorted= 1; DBUG_ENTER("make_join_readinfo"); for (i=join->const_tables ; i < join->tables ; i++) @@ -5737,6 +5750,8 @@ make_join_readinfo(JOIN *join, uint options) tab->read_record.table= table; tab->read_record.file=table->file; tab->next_select=sub_select; /* normal select */ + tab->sorted= sorted; + sorted= 0; // only first must be sorted switch (tab->type) { case JT_SYSTEM: // Only happens with left join table->status=STATUS_NO_RECORD; @@ -8143,7 +8158,7 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields, uint hidden_null_count, hidden_null_pack_length, hidden_field_count; uint blob_count,group_null_items, string_count; uint temp_pool_slot=MY_BIT_NONE; - ulong reclength, string_total_length; + ulong reclength, string_total_length, fieldnr= 0; bool using_unique_constraint= 0; bool use_packed_rows= 0; bool not_all_columns= !(select_options & TMP_TABLE_ALL_COLUMNS); @@ -8166,7 +8181,7 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields, statistic_increment(thd->status_var.created_tmp_tables, &LOCK_status); if (use_temp_pool) - temp_pool_slot = bitmap_set_next(&temp_pool); + temp_pool_slot = bitmap_lock_set_next(&temp_pool); if (temp_pool_slot != MY_BIT_NONE) // we got a slot sprintf(path, "%s_%lx_%i", tmp_file_prefix, @@ -8218,12 +8233,12 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields, param->group_length : 0, NullS)) { - bitmap_clear_bit(&temp_pool, temp_pool_slot); + bitmap_lock_clear_bit(&temp_pool, temp_pool_slot); DBUG_RETURN(NULL); /* purecov: inspected */ } if (!(param->copy_field=copy=new Copy_field[field_count])) { - bitmap_clear_bit(&temp_pool, temp_pool_slot); + bitmap_lock_clear_bit(&temp_pool, temp_pool_slot); my_free((gptr) table,MYF(0)); /* purecov: inspected */ DBUG_RETURN(NULL); /* 
purecov: inspected */ } @@ -8254,6 +8269,7 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields, table->s->tmp_table= TMP_TABLE; table->s->db_low_byte_first=1; // True for HEAP and MyISAM table->s->table_charset= param->table_charset; + table->s->primary_key= MAX_KEY; //Indicate no primary key table->s->keys_for_keyread.init(); table->s->keys_in_use.init(); /* For easier error reporting */ @@ -8329,6 +8345,7 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields, (*argp)->maybe_null=1; } new_field->query_id= thd->query_id; + new_field->fieldnr= ++fieldnr; } } } @@ -8376,6 +8393,7 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields, new_field->flags|= GROUP_FLAG; } new_field->query_id= thd->query_id; + new_field->fieldnr= ++fieldnr; new_field->field_index= (uint) (reg_field - table->field); *(reg_field++) =new_field; } @@ -8385,6 +8403,7 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields, DBUG_ASSERT(field_count >= (uint) (reg_field - table->field)); field_count= (uint) (reg_field - table->field); *blob_field= 0; // End marker + table->s->fields= field_count; /* If result table is small; use a heap */ if (blob_count || using_unique_constraint || @@ -8401,7 +8420,11 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields, { table->file=get_new_handler(table,table->s->db_type= DB_TYPE_HEAP); } - + if (table->s->fields) + { + table->file->ha_set_all_bits_in_read_set(); + table->file->ha_set_all_bits_in_write_set(); + } if (!using_unique_constraint) reclength+= group_null_items; // null flag is stored separately @@ -8427,7 +8450,6 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields, string_total_length / string_count >= AVG_STRING_LENGTH_TO_PACK_ROWS)) use_packed_rows= 1; - table->s->fields= field_count; table->s->reclength= reclength; { uint alloc_length=ALIGN_SIZE(reclength+MI_UNIQUE_HASH_LENGTH+1); @@ -8672,7 +8694,7 @@ create_tmp_table(THD *thd,TMP_TABLE_PARAM *param,List<Item> &fields, err: free_tmp_table(thd,table); /* purecov: inspected */ - bitmap_clear_bit(&temp_pool, temp_pool_slot); + bitmap_lock_clear_bit(&temp_pool, temp_pool_slot); DBUG_RETURN(NULL); /* purecov: inspected */ } @@ -8946,7 +8968,7 @@ free_tmp_table(THD *thd, TABLE *entry) my_free((gptr) entry->record[0],MYF(0)); free_io_cache(entry); - bitmap_clear_bit(&temp_pool, entry->temp_pool_slot); + bitmap_lock_clear_bit(&temp_pool, entry->temp_pool_slot); my_free((gptr) entry,MYF(0)); thd->proc_info=save_proc_info; @@ -9007,7 +9029,12 @@ bool create_myisam_from_heap(THD *thd, TABLE *table, TMP_TABLE_PARAM *param, new_table.file->extra(HA_EXTRA_WRITE_CACHE); #endif - /* copy all old rows */ + /* + copy all old rows from heap table to MyISAM table + This is the only code that uses record[1] to read/write but this + is safe as this is a temporary MyISAM table without timestamp/autoincrement + or partitioning. 
+ */ while (!table->file->rnd_next(new_table.record[1])) { if ((write_err=new_table.file->write_row(new_table.record[1]))) @@ -9042,8 +9069,8 @@ bool create_myisam_from_heap(THD *thd, TABLE *table, TMP_TABLE_PARAM *param, (void) new_table.file->close(); err1: new_table.file->delete_table(new_table.s->table_name); - delete new_table.file; err2: + delete new_table.file; thd->proc_info=save_proc_info; DBUG_RETURN(1); } @@ -9138,7 +9165,7 @@ do_select(JOIN *join,List<Item> *fields,TABLE *table,Procedure *procedure) empty_record(table); if (table->group && join->tmp_table_param.sum_func_count && table->s->keys && !table->file->inited) - table->file->ha_index_init(0); + table->file->ha_index_init(0, 0); } /* Set up select_end */ join->join_tab[join->tables-1].next_select= setup_end_select_func(join); @@ -9837,7 +9864,7 @@ join_read_const(JOIN_TAB *tab) table->status= STATUS_NOT_FOUND; mark_as_null_row(tab->table); empty_record(table); - if (error != HA_ERR_KEY_NOT_FOUND) + if (error != HA_ERR_KEY_NOT_FOUND && error != HA_ERR_END_OF_FILE) return report_error(table, error); return -1; } @@ -9860,7 +9887,9 @@ join_read_key(JOIN_TAB *tab) TABLE *table= tab->table; if (!table->file->inited) - table->file->ha_index_init(tab->ref.key); + { + table->file->ha_index_init(tab->ref.key, tab->sorted); + } if (cmp_buffer_with_ref(tab) || (table->status & (STATUS_GARBAGE | STATUS_NO_PARENT | STATUS_NULL_ROW))) { @@ -9872,7 +9901,7 @@ join_read_key(JOIN_TAB *tab) error=table->file->index_read(table->record[0], tab->ref.key_buff, tab->ref.key_length,HA_READ_KEY_EXACT); - if (error && error != HA_ERR_KEY_NOT_FOUND) + if (error && error != HA_ERR_KEY_NOT_FOUND && error != HA_ERR_END_OF_FILE) return report_error(table, error); } table->null_row=0; @@ -9892,14 +9921,16 @@ join_read_always_key(JOIN_TAB *tab) return -1; } if (!table->file->inited) - table->file->ha_index_init(tab->ref.key); + { + table->file->ha_index_init(tab->ref.key, tab->sorted); + } if (cp_buffer_from_ref(tab->join->thd, &tab->ref)) return -1; if ((error=table->file->index_read(table->record[0], tab->ref.key_buff, tab->ref.key_length,HA_READ_KEY_EXACT))) { - if (error != HA_ERR_KEY_NOT_FOUND) + if (error != HA_ERR_KEY_NOT_FOUND && error != HA_ERR_END_OF_FILE) return report_error(table, error); return -1; /* purecov: inspected */ } @@ -9919,14 +9950,14 @@ join_read_last_key(JOIN_TAB *tab) TABLE *table= tab->table; if (!table->file->inited) - table->file->ha_index_init(tab->ref.key); + table->file->ha_index_init(tab->ref.key, tab->sorted); if (cp_buffer_from_ref(tab->join->thd, &tab->ref)) return -1; if ((error=table->file->index_read_last(table->record[0], tab->ref.key_buff, tab->ref.key_length))) { - if (error != HA_ERR_KEY_NOT_FOUND) + if (error != HA_ERR_KEY_NOT_FOUND && error != HA_ERR_END_OF_FILE) return report_error(table, error); return -1; /* purecov: inspected */ } @@ -10029,7 +10060,7 @@ join_read_first(JOIN_TAB *tab) tab->read_record.index=tab->index; tab->read_record.record=table->record[0]; if (!table->file->inited) - table->file->ha_index_init(tab->index); + table->file->ha_index_init(tab->index, tab->sorted); if ((error=tab->table->file->index_first(tab->table->record[0]))) { if (error != HA_ERR_KEY_NOT_FOUND && error != HA_ERR_END_OF_FILE) @@ -10068,7 +10099,7 @@ join_read_last(JOIN_TAB *tab) tab->read_record.index=tab->index; tab->read_record.record=table->record[0]; if (!table->file->inited) - table->file->ha_index_init(tab->index); + table->file->ha_index_init(tab->index, 1); if ((error= 
tab->table->file->index_last(tab->table->record[0]))) return report_error(table, error); return 0; @@ -10092,7 +10123,7 @@ join_ft_read_first(JOIN_TAB *tab) TABLE *table= tab->table; if (!table->file->inited) - table->file->ha_index_init(tab->ref.key); + table->file->ha_index_init(tab->ref.key, 1); #if NOT_USED_YET if (cp_buffer_from_ref(tab->join->thd, &tab->ref)) // as ft-key doesn't use store_key's return -1; // see also FT_SELECT::init() @@ -10487,7 +10518,7 @@ end_update(JOIN *join, JOIN_TAB *join_tab __attribute__((unused)), error, 0)) DBUG_RETURN(NESTED_LOOP_ERROR); // Not a table_is_full error /* Change method to update rows */ - table->file->ha_index_init(0); + table->file->ha_index_init(0, 0); join->join_tab[join->tables-1].next_select=end_unique_update; } join->send_records++; diff --git a/sql/sql_select.h b/sql/sql_select.h index 47906c2697e..ce40f657a8e 100644 --- a/sql/sql_select.h +++ b/sql/sql_select.h @@ -133,6 +133,7 @@ typedef struct st_join_table { uint used_fields,used_fieldlength,used_blobs; enum join_type type; bool cached_eq_ref_table,eq_ref_table,not_used_in_distinct; + bool sorted; TABLE_REF ref; JOIN_CACHE cache; JOIN *join; diff --git a/sql/sql_show.cc b/sql/sql_show.cc index 51330a6109b..c346f4cc291 100644 --- a/sql/sql_show.cc +++ b/sql/sql_show.cc @@ -971,11 +971,16 @@ store_create_info(THD *thd, TABLE_LIST *table_list, String *packet) packet->append("\n)", 2); if (!(thd->variables.sql_mode & MODE_NO_TABLE_OPTIONS) && !foreign_db_mode) { - if (thd->variables.sql_mode & (MODE_MYSQL323 | MODE_MYSQL40)) - packet->append(" TYPE=", 6); - else - packet->append(" ENGINE=", 8); - packet->append(file->table_type()); +#ifdef HAVE_PARTITION_DB + if (!table->s->part_info) +#endif + { + if (thd->variables.sql_mode & (MODE_MYSQL323 | MODE_MYSQL40)) + packet->append(" TYPE=", 6); + else + packet->append(" ENGINE=", 8); + packet->append(file->table_type()); + } if (share->table_charset && !(thd->variables.sql_mode & MODE_MYSQL323) && @@ -1042,6 +1047,23 @@ store_create_info(THD *thd, TABLE_LIST *table_list, String *packet) append_directory(thd, packet, "DATA", create_info.data_file_name); append_directory(thd, packet, "INDEX", create_info.index_file_name); } +#ifdef HAVE_PARTITION_DB + { + /* + Partition syntax for CREATE TABLE is at the end of the syntax. + */ + uint part_syntax_len; + char *part_syntax; + if (table->s->part_info && + ((part_syntax= generate_partition_syntax(table->s->part_info, + &part_syntax_len, + FALSE)))) + { + packet->append(part_syntax, part_syntax_len); + my_free(part_syntax, MYF(0)); + } + } +#endif DBUG_RETURN(0); } @@ -2798,7 +2820,7 @@ int fill_schema_proc(THD *thd, TABLE_LIST *tables, COND *cond) { DBUG_RETURN(1); } - proc_table->file->ha_index_init(0); + proc_table->file->ha_index_init(0, 1); if ((res= proc_table->file->index_first(proc_table->record[0]))) { res= (res == HA_ERR_END_OF_FILE) ? 
0 : 1; diff --git a/sql/sql_table.cc b/sql/sql_table.cc index 611ab0f16aa..6f60419ff94 100644 --- a/sql/sql_table.cc +++ b/sql/sql_table.cc @@ -30,6 +30,7 @@ #include <io.h> #endif + const char *primary_key_name="PRIMARY"; static bool check_if_keyname_exists(const char *name,KEY *start, KEY *end); @@ -44,6 +45,64 @@ static bool prepare_blob_field(THD *thd, create_field *sql_field); static bool check_engine(THD *thd, const char *table_name, enum db_type *new_engine); +/* + SYNOPSIS + write_bin_log() + thd Thread object + clear_error TRUE if the error state is to be cleared before writing + RETURN VALUES + NONE + DESCRIPTION + Write the query to the binlog if it is open; a routine used in multiple + places in this file +*/ + +static void write_bin_log(THD *thd, bool clear_error) +{ + if (mysql_bin_log.is_open()) + { + if (clear_error) + thd->clear_error(); + Query_log_event qinfo(thd, thd->query, thd->query_length, FALSE, FALSE); + mysql_bin_log.write(&qinfo); + } +} + +/* + SYNOPSIS + abort_and_upgrade_lock() + thd Thread object + table Table object + db Database name + table_name Table name + old_lock_level Old lock level + RETURN VALUES + TRUE Failure + FALSE Success + DESCRIPTION + Remember the old lock level (for possible downgrade later on), abort all + waiting threads and ensure that all threads currently holding locks have + completed, such that we own the lock exclusively and no other interaction + is ongoing. +*/ + +static bool abort_and_upgrade_lock(THD *thd, TABLE *table, const char *db, + const char *table_name, + uint *old_lock_level) +{ + uint flags= RTFC_WAIT_OTHER_THREAD_FLAG | RTFC_CHECK_KILLED_FLAG; + DBUG_ENTER("abort_and_upgrade_lock"); + + *old_lock_level= table->reginfo.lock_type; + mysql_lock_abort(thd, table); + VOID(remove_table_from_cache(thd, db, table_name, flags)); + if (thd->killed) + { + thd->no_warnings_for_error= 0; + DBUG_RETURN(TRUE); + } + DBUG_RETURN(FALSE); +} /* Build the path to a file for a table (or the base path that can @@ -1347,6 +1406,34 @@ static int mysql_prepare_table(THD *thd, HA_CREATE_INFO *create_info, /* + Set table default charset, if not set + + SYNOPSIS + set_table_default_charset() + create_info Table create information + + DESCRIPTION + If the table character set was not given explicitly, + let's fetch the database default character set and + apply it to the table. +*/ + +static void set_table_default_charset(THD *thd, + HA_CREATE_INFO *create_info, char *db) +{ + if (!create_info->default_table_charset) + { + HA_CREATE_INFO db_info; + char path[FN_REFLEN]; + /* Abuse build_table_path() to build the path to the db.opt file */ + build_table_path(path, sizeof(path), db, MY_DB_OPT_FILE, ""); + load_db_opt(thd, path, &db_info); + create_info->default_table_charset= db_info.default_table_charset; + } +} + + /* Extend long VARCHAR fields to blob & prepare field if it's a blob @@ -1503,7 +1590,66 @@ bool mysql_create_table(THD *thd,const char *db, const char *table_name, if (create_info->row_type == ROW_TYPE_DYNAMIC) db_options|=HA_OPTION_PACK_RECORD; alias= table_case_name(create_info, table_name); - file=get_new_handler((TABLE*) 0, create_info->db_type); + if (!(file=get_new_handler((TABLE*) 0, create_info->db_type))) + { + my_error(ER_OUTOFMEMORY, MYF(0), 128); // 128 bytes invented + DBUG_RETURN(TRUE); + } +#ifdef HAVE_PARTITION_DB + partition_info *part_info= thd->lex->part_info; + if (part_info) + { + /* + The table has been specified as a partitioned table.
+ If this is part of an ALTER TABLE the handler will be the partition + handler but we need to specify the default handler to use for + partitions also in the call to check_partition_info. We transport + this information in the default_db_type variable; it is either + DB_TYPE_DEFAULT or the engine set in the ALTER TABLE command. + */ + enum db_type part_engine_type= create_info->db_type; + char *part_syntax_buf; + uint syntax_len; + if (part_engine_type == DB_TYPE_PARTITION_DB) + { + /* + This only happens at ALTER TABLE. + default_engine_type was assigned from the engine set in the ALTER + TABLE command. + */ + part_engine_type= ha_checktype(thd, + part_info->default_engine_type, 0, 0); + } + if (check_partition_info(part_info, part_engine_type, + file, create_info->max_rows)) + DBUG_RETURN(TRUE); + /* + We reverse the partitioning parser and generate a standard-format + syntax string to be stored in the frm file. + */ + if (!(part_syntax_buf= generate_partition_syntax(part_info, + &syntax_len, + TRUE))) + DBUG_RETURN(TRUE); + part_info->part_info_string= part_syntax_buf; + part_info->part_info_len= syntax_len; + if ((!(file->partition_flags() & HA_CAN_PARTITION)) || + create_info->db_type == DB_TYPE_PARTITION_DB) + { + /* + The handler assigned to the table cannot handle partitioning. + Assign the partition handler as the handler of the table. + */ + DBUG_PRINT("info", ("db_type= %d, part_flag= %d", create_info->db_type, file->partition_flags())); + delete file; + create_info->db_type= DB_TYPE_PARTITION_DB; + if (!(file= get_ha_partition(part_info))) + { + DBUG_RETURN(TRUE); + } + } + } +#endif #ifdef NOT_USED /* @@ -1517,30 +1663,17 @@ bool mysql_create_table(THD *thd,const char *db, const char *table_name, (file->table_flags() & HA_NO_TEMP_TABLES)) { my_error(ER_ILLEGAL_HA, MYF(0), table_name); - DBUG_RETURN(TRUE); + goto err; } #endif - /* - If the table character set was not given explicitely, - let's fetch the database default character set and - apply it to the table.
- */ - if (!create_info->default_table_charset) - { - HA_CREATE_INFO db_info; - char path[FN_REFLEN]; - /* Abuse build_table_path() to build the path to the db.opt file */ - build_table_path(path, sizeof(path), db, MY_DB_OPT_FILE, ""); - load_db_opt(thd, path, &db_info); - create_info->default_table_charset= db_info.default_table_charset; - } + set_table_default_charset(thd, create_info, (char*) db); if (mysql_prepare_table(thd, create_info, &fields, &keys, internal_tmp_table, &db_options, file, &key_info_buffer, &key_count, select_field_count)) - DBUG_RETURN(TRUE); + goto err; /* Check if table exists */ if (create_info->options & HA_LEX_CREATE_TMP_TABLE) @@ -1565,13 +1698,13 @@ bool mysql_create_table(THD *thd,const char *db, const char *table_name, push_warning_printf(thd, MYSQL_ERROR::WARN_LEVEL_NOTE, ER_TABLE_EXISTS_ERROR, ER(ER_TABLE_EXISTS_ERROR), alias); - DBUG_RETURN(FALSE); + goto no_err; } my_error(ER_TABLE_EXISTS_ERROR, MYF(0), alias); - DBUG_RETURN(TRUE); + goto err; } if (wait_if_global_read_lock(thd, 0, 1)) - DBUG_RETURN(error); + goto err; VOID(pthread_mutex_lock(&LOCK_open)); if (!internal_tmp_table && !(create_info->options & HA_LEX_CREATE_TMP_TABLE)) { @@ -1617,7 +1750,7 @@ bool mysql_create_table(THD *thd,const char *db, const char *table_name, if (rea_create_table(thd, path, db, table_name, create_info, fields, key_count, - key_info_buffer)) + key_info_buffer, file)) goto end; if (create_info->options & HA_LEX_CREATE_TMP_TABLE) { @@ -1648,8 +1781,16 @@ warn: end: VOID(pthread_mutex_unlock(&LOCK_open)); start_waiting_global_read_lock(thd); + delete file; thd->proc_info="After create"; DBUG_RETURN(error); + +err: + delete file; + DBUG_RETURN(TRUE); +no_err: + delete file; + DBUG_RETURN(FALSE); } /* @@ -2709,12 +2850,7 @@ bool mysql_create_like_table(THD* thd, TABLE_LIST* table, } // Must be written before unlock - if (mysql_bin_log.is_open()) - { - thd->clear_error(); - Query_log_event qinfo(thd, thd->query, thd->query_length, FALSE, FALSE); - mysql_bin_log.write(&qinfo); - } + write_bin_log(thd, TRUE); res= FALSE; goto err; @@ -2820,11 +2956,7 @@ mysql_discard_or_import_tablespace(THD *thd, error=1; if (error) goto err; - if (mysql_bin_log.is_open()) - { - Query_log_event qinfo(thd, thd->query, thd->query_length, FALSE, FALSE); - mysql_bin_log.write(&qinfo); - } + write_bin_log(thd, FALSE); err: close_thread_tables(thd); thd->tablespace_op=FALSE; @@ -3043,6 +3175,166 @@ int mysql_drop_indexes(THD *thd, TABLE_LIST *table_list, #endif /* NOT_USED */ + +#define ALTER_TABLE_DATA_CHANGED 1 +#define ALTER_TABLE_INDEX_CHANGED 2 + +/* + SYNOPSIS + compare_tables() + table original table + create_list fields in new table + key_list keys in new table + create_info create options in new table + + DESCRIPTION + 'table' (first argument) contains information about the original + table, which includes all corresponding parts that the new + table has in arguments create_list, key_list and create_info. + + By comparing the changes between the original and new table + we can determine how much it has changed after ALTER TABLE + and whether we need to make a copy of the table, or just change + the .frm file.
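+ + For example, an ALTER TABLE that redefines a column to the exact type, + NULL-ability and character set it already has changes neither data nor + indexes, so 0 is returned and only the .frm file needs to be changed + (a hypothetical illustration of the return values listed below).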
+ + RETURN VALUES + 0 No copy needed + 1 Data changes, copy needed + 2 Index changes, copy needed +*/ + +uint compare_tables(TABLE *table, List<create_field> *create_list, + List<Key> *key_list, HA_CREATE_INFO *create_info, + ALTER_INFO *alter_info, uint order_num) +{ + Field **f_ptr, *field; + uint changes= 0, tmp; + List_iterator_fast<create_field> new_field_it(*create_list); + create_field *new_field; + + /* + Some very basic checks. If the number of fields or the handler + changes, we need to run a full ALTER TABLE. In the future, + new fields could be added and old ones dropped without a copy, but + not yet. + + Also test that no engine was given during ALTER TABLE; otherwise + we are forced to run a regular alter table (copy). + E.g. ALTER TABLE tbl_name ENGINE=MyISAM. + + For the following ones we also want to run regular alter table: + ALTER TABLE tbl_name ORDER BY .. + ALTER TABLE tbl_name CONVERT TO CHARACTER SET .. + + At the moment we can't handle altering temporary tables without a copy. + We also test if OPTIMIZE TABLE was given and was mapped to alter table. + In that case we always do full copy. + */ + if (table->s->fields != create_list->elements || + table->s->db_type != create_info->db_type || + table->s->tmp_table || + create_info->used_fields & HA_CREATE_USED_ENGINE || + create_info->used_fields & HA_CREATE_USED_CHARSET || + create_info->used_fields & HA_CREATE_USED_DEFAULT_CHARSET || + (alter_info->flags & ALTER_RECREATE) || + order_num) + return ALTER_TABLE_DATA_CHANGED; + + /* + Go through fields and check if the original ones are compatible + with new table. + */ + for (f_ptr= table->field, new_field= new_field_it++; + (field= *f_ptr); f_ptr++, new_field= new_field_it++) + { + /* Make sure we have at least the default charset in use. */ + if (!new_field->charset) + new_field->charset= create_info->default_table_charset; + + /* Check that NULL behavior is same for old and new fields */ + if ((new_field->flags & NOT_NULL_FLAG) != + (uint) (field->flags & NOT_NULL_FLAG)) + return ALTER_TABLE_DATA_CHANGED; + + /* Don't pack rows in old tables if the user has requested this. */ + if (create_info->row_type == ROW_TYPE_DYNAMIC || + (new_field->flags & BLOB_FLAG) || + (new_field->sql_type == MYSQL_TYPE_VARCHAR && + create_info->row_type != ROW_TYPE_FIXED)) + create_info->table_options|= HA_OPTION_PACK_RECORD; + + /* Evaluate changes bitmap and send to check_if_incompatible_data() */ + if (!(tmp= field->is_equal(new_field))) + return ALTER_TABLE_DATA_CHANGED; + + changes|= tmp; + } + /* Check if changes are compatible with current handler without a copy */ + if (table->file->check_if_incompatible_data(create_info, changes)) + return ALTER_TABLE_DATA_CHANGED; + + /* + Go through keys and check if the original ones are compatible + with new table. + */ + KEY *table_key_info= table->key_info; + List_iterator_fast<Key> key_it(*key_list); + Key *key= key_it++; + + /* Check if the number of keys has changed */ + if (table->s->keys != key_list->elements) + return ALTER_TABLE_INDEX_CHANGED; + + for (uint i= 0; i < table->s->keys; i++, table_key_info++, key= key_it++) + { + /* + Check that the key types are compatible between old and new tables.
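+ (HA_NOSAME must match PRIMARY/UNIQUE keys, HA_SPATIAL must match SPATIAL + keys and HA_FULLTEXT must match FULLTEXT keys; any mismatch below + returns ALTER_TABLE_INDEX_CHANGED.)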
+ */ + if (table_key_info->algorithm != key->algorithm || + ((key->type == Key::PRIMARY || key->type == Key::UNIQUE) && + !(table_key_info->flags & HA_NOSAME)) || + (!(key->type == Key::PRIMARY || key->type == Key::UNIQUE) && + (table_key_info->flags & HA_NOSAME)) || + ((key->type == Key::SPATIAL) && + !(table_key_info->flags & HA_SPATIAL)) || + (!(key->type == Key::SPATIAL) && + (table_key_info->flags & HA_SPATIAL)) || + ((key->type == Key::FULLTEXT) && + !(table_key_info->flags & HA_FULLTEXT)) || + (!(key->type == Key::FULLTEXT) && + (table_key_info->flags & HA_FULLTEXT))) + return ALTER_TABLE_INDEX_CHANGED; + + if (table_key_info->key_parts != key->columns.elements) + return ALTER_TABLE_INDEX_CHANGED; + + /* + Check that the key parts remain compatible between the old and + new tables. + */ + KEY_PART_INFO *table_key_part= table_key_info->key_part; + List_iterator_fast<key_part_spec> key_part_it(key->columns); + key_part_spec *key_part= key_part_it++; + for (uint j= 0; j < table_key_info->key_parts; j++, + table_key_part++, key_part= key_part_it++) + { + /* + Key definition has changed if we are using a different field or + if the used key length is different + (If key_part->length == 0 it means we are using the whole field) + */ + if (strcmp(key_part->field_name, table_key_part->field->field_name) || + (key_part->length && key_part->length != table_key_part->length) || + (key_part->length == 0 && table_key_part->length != + table_key_part->field->pack_length())) + return ALTER_TABLE_INDEX_CHANGED; + } + } + + return 0; // Tables are compatible +} + + /* + Alter table +*/ @@ -3064,7 +3356,13 @@ bool mysql_alter_table(THD *thd,char *new_db, char *new_name, ulonglong next_insert_id; uint db_create_options, used_fields; enum db_type old_db_type,new_db_type; - bool need_copy_table; + uint need_copy_table= 0; +#ifdef HAVE_PARTITION_DB + bool online_add_empty_partition= FALSE; + bool online_drop_partition= FALSE; + bool partition_changed= FALSE; + enum db_type default_engine_type; +#endif DBUG_ENTER("mysql_alter_table"); thd->proc_info="init"; @@ -3140,6 +3438,423 @@ bool mysql_alter_table(THD *thd,char *new_db, char *new_name, old_db_type= table->s->db_type; if (create_info->db_type == DB_TYPE_DEFAULT) create_info->db_type= old_db_type; + +#ifdef HAVE_PARTITION_DB + /* + We need to handle partition management commands, such as ADD PARTITION, + here, as well as ALTER TABLE statements that completely change the + partitioning and yet others that don't change anything at all. We start + by checking the partition management variants and then check the general + change patterns. + */ + if (alter_info->flags & (ALTER_ADD_PARTITION | + ALTER_DROP_PARTITION | ALTER_COALESCE_PARTITION | + ALTER_REORGANISE_PARTITION)) + { + partition_info *tab_part_info= table->s->part_info; + if (!tab_part_info) + { + my_error(ER_PARTITION_MGMT_ON_NONPARTITIONED, MYF(0)); + DBUG_RETURN(TRUE); + } + { + List_iterator<partition_element> t_it(tab_part_info->partitions); + partition_element *t_part_elem= t_it++; + if (is_sub_partitioned(tab_part_info)) + { + List_iterator<partition_element> s_it(t_part_elem->subpartitions); + t_part_elem= s_it++; + } + default_engine_type= t_part_elem->engine_type; + } + /* + We are going to manipulate the partition info on the table object, + so we need to ensure that the data structures of the table object + are freed, which we do by setting the version to 0.
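+ (A zero version marks the cached table definition as stale, so other + users reopen the table instead of reusing the old definition.)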
+ */ + table->s->version= 0L; + if (alter_info->flags == ALTER_ADD_PARTITION) + { + /* + We start by moving the new partitions to the list of temporary + partitions. We will then check that the new partitions fit in the + partitioning scheme as currently set up. + Partitions are always added at the end in ADD PARTITION. + */ + partition_info *alt_part_info= thd->lex->part_info; + uint no_new_partitions= alt_part_info->no_parts; + uint no_orig_partitions= tab_part_info->no_parts; + uint check_total_partitions= no_new_partitions + no_orig_partitions; + uint new_total_partitions= check_total_partitions; + /* + We allow quite a lot of values to be supplied by defaults; however, + we must know the number of new partitions in this case. + */ + if (no_new_partitions == 0) + { + my_error(ER_ADD_PARTITION_NO_NEW_PARTITION, MYF(0)); + DBUG_RETURN(TRUE); + } + if (is_sub_partitioned(tab_part_info)) + { + if (alt_part_info->no_subparts == 0) + alt_part_info->no_subparts= tab_part_info->no_subparts; + else if (alt_part_info->no_subparts != tab_part_info->no_subparts) + { + my_error(ER_ADD_PARTITION_SUBPART_ERROR, MYF(0)); + DBUG_RETURN(TRUE); + } + check_total_partitions= new_total_partitions * + alt_part_info->no_subparts; + } + if (check_total_partitions > MAX_PARTITIONS) + { + my_error(ER_TOO_MANY_PARTITIONS_ERROR, MYF(0)); + DBUG_RETURN(TRUE); + } + alt_part_info->part_type= tab_part_info->part_type; + if (set_up_defaults_for_partitioning(alt_part_info, + table->file, + (ulonglong)0ULL, + tab_part_info->no_parts)) + { + DBUG_RETURN(TRUE); + } + /* + Need to concatenate the lists here to make it possible to check the + partition info for correctness using check_partition_info + */ + { + List_iterator<partition_element> alt_it(alt_part_info->partitions); + uint part_count= 0; + do + { + partition_element *part_elem= alt_it++; + tab_part_info->partitions.push_back(part_elem); + tab_part_info->temp_partitions.push_back(part_elem); + } while (++part_count < no_new_partitions); + tab_part_info->no_parts+= no_new_partitions; + } + { + List_iterator<partition_element> tab_it(tab_part_info->partitions); + partition_element *part_elem= tab_it++; + if (is_sub_partitioned(tab_part_info)) + { + List_iterator<partition_element> sub_it(part_elem->subpartitions); + part_elem= sub_it++; + } + if (check_partition_info(tab_part_info, part_elem->engine_type, + table->file, (ulonglong)0ULL)) + { + DBUG_RETURN(TRUE); + } + } + create_info->db_type= DB_TYPE_PARTITION_DB; + thd->lex->part_info= tab_part_info; + if (table->file->alter_table_flags() & HA_ONLINE_ADD_EMPTY_PARTITION && + (tab_part_info->part_type == RANGE_PARTITION || + tab_part_info->part_type == LIST_PARTITION)) + { + /* + For RANGE and LIST partitions, ADD PARTITION simply adds a new + empty partition to the table. If the handler supports this we will + use this simple method. In this case we need to break + out the new partitions from the list again and only keep them in the + temporary list. Added partitions are always added at the end.
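+ For example, on a RANGE table with partitions p0..p2, ALTER TABLE t1 + ADD PARTITION (PARTITION p3 VALUES LESS THAN (300)) leaves p0..p2 + untouched and only creates the new empty partition p3 (a hypothetical + illustration).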
+ */ + { + List_iterator<partition_element> tab_it(tab_part_info->partitions); + uint part_count= 0; + do + { + tab_it++; + } while (++part_count < no_orig_partitions); + do + { + tab_it++; + tab_it.remove(); + } while (++part_count < new_total_partitions); + } + tab_part_info->no_parts-= no_new_partitions; + online_add_empty_partition= TRUE; + } + else + { + tab_part_info->temp_partitions.empty(); + } + } + else if (alter_info->flags == ALTER_DROP_PARTITION) + { + /* + Dropping a partition from RANGE or LIST partitioning is always safe + and can be made more or less immediate. It is necessary, however, + to ensure that the partition to be removed is removed safely and + that REPAIR TABLE can remove the partition if for some reason the + command to drop the partition failed in the middle. + */ + uint part_count= 0; + uint no_parts_dropped= alter_info->partition_names.elements; + uint no_parts_found= 0; + List_iterator<partition_element> part_it(tab_part_info->partitions); + if (!(tab_part_info->part_type == RANGE_PARTITION || + tab_part_info->part_type == LIST_PARTITION)) + { + my_error(ER_ONLY_ON_RANGE_LIST_PARTITION, MYF(0), "DROP"); + DBUG_RETURN(TRUE); + } + if (no_parts_dropped >= tab_part_info->no_parts) + { + my_error(ER_DROP_LAST_PARTITION, MYF(0)); + DBUG_RETURN(TRUE); + } + do + { + partition_element *part_elem= part_it++; + if (is_partition_in_list(part_elem->partition_name, + alter_info->partition_names)) + { + /* + Mark the partition as dropped; the new state is used later + when the drop is actually performed. + */ + no_parts_found++; + part_elem->part_state= PART_IS_DROPPED; + } + } while (++part_count < tab_part_info->no_parts); + if (no_parts_found != no_parts_dropped) + { + my_error(ER_DROP_PARTITION_NON_EXISTENT, MYF(0)); + DBUG_RETURN(TRUE); + } + if (!(table->file->alter_table_flags() & HA_ONLINE_DROP_PARTITION)) + { + my_error(ER_DROP_PARTITION_FAILURE, MYF(0)); + DBUG_RETURN(TRUE); + } + if (table->file->is_fk_defined_on_table_or_index(MAX_KEY)) + { + my_error(ER_DROP_PARTITION_WHEN_FK_DEFINED, MYF(0)); + DBUG_RETURN(TRUE); + } + /* + This code needs the structures set up by mysql_create_table before + it is called, and thus we only set a boolean variable here, to be + checked later down in the code when all needed data structures are + prepared. + */ + online_drop_partition= TRUE; + } + else if (alter_info->flags == ALTER_COALESCE_PARTITION) + { + /* + In this version COALESCE PARTITION is implemented by simply removing + a partition from the table and using the normal ALTER TABLE code, + ensuring that a copy to a new table occurs. Later on we can optimise + this function for LINEAR HASH partitions. In that case we can avoid + reorganising the entire table. For normal hash partitions it will + be a complete reorganise anyway, so that can only be made on-line + if it still uses a copy table.
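+ For example, ALTER TABLE t1 COALESCE PARTITION 2 on a table with six + hash partitions leaves four partitions and redistributes the rows of + the removed ones (a hypothetical illustration).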
+ */ + uint part_count= 0; + uint no_parts_coalesced= alter_info->no_parts; + uint no_parts_remain= tab_part_info->no_parts - no_parts_coalesced; + List_iterator<partition_element> part_it(tab_part_info->partitions); + if (tab_part_info->part_type != HASH_PARTITION) + { + my_error(ER_COALESCE_ONLY_ON_HASH_PARTITION, MYF(0)); + DBUG_RETURN(TRUE); + } + if (no_parts_coalesced == 0) + { + my_error(ER_COALESCE_PARTITION_NO_PARTITION, MYF(0)); + DBUG_RETURN(TRUE); + } + if (no_parts_coalesced >= tab_part_info->no_parts) + { + my_error(ER_DROP_LAST_PARTITION, MYF(0)); + DBUG_RETURN(TRUE); + } + do + { + part_it++; + if (++part_count > no_parts_remain) + part_it.remove(); + } while (part_count < tab_part_info->no_parts); + tab_part_info->no_parts= no_parts_remain; + } + else if (alter_info->flags == ALTER_REORGANISE_PARTITION) + { + /* + REORGANISE PARTITION takes a number of partitions that are next + to each other (at least for RANGE partitions) and uses them + to create a set of new partitions, so data is copied from those + partitions into the new set of partitions. The new partitions + can have more or fewer values in their LIST value specifications; + both are allowed. The ranges can be different, but since a set of + consecutive partitions is changed, they must cover the same + range as the partitions they replace. + This command can be used on RANGE and LIST partitions. + */ + uint no_parts_reorged= alter_info->partition_names.elements; + uint no_parts_new= thd->lex->part_info->partitions.elements; + partition_info *alt_part_info= thd->lex->part_info; + uint check_total_partitions; + if (no_parts_reorged > tab_part_info->no_parts) + { + my_error(ER_REORG_PARTITION_NOT_EXIST, MYF(0)); + DBUG_RETURN(TRUE); + } + if (!(tab_part_info->part_type == RANGE_PARTITION || + tab_part_info->part_type == LIST_PARTITION)) + { + my_error(ER_ONLY_ON_RANGE_LIST_PARTITION, MYF(0), "REORGANISE"); + DBUG_RETURN(TRUE); + } + if (is_partitions_in_table(alt_part_info, tab_part_info)) + { + my_error(ER_SAME_NAME_PARTITION, MYF(0)); + DBUG_RETURN(TRUE); + } + check_total_partitions= tab_part_info->no_parts + no_parts_new; + check_total_partitions-= no_parts_reorged; + if (check_total_partitions > MAX_PARTITIONS) + { + my_error(ER_TOO_MANY_PARTITIONS_ERROR, MYF(0)); + DBUG_RETURN(TRUE); + } + { + List_iterator<partition_element> tab_it(tab_part_info->partitions); + uint part_count= 0; + bool found_first= FALSE, found_last= FALSE; + uint drop_count= 0; + longlong tab_max_range, alt_max_range; + do + { + partition_element *part_elem= tab_it++; + if (is_partition_in_list(part_elem->partition_name, + alter_info->partition_names)) + { + drop_count++; + tab_max_range= part_elem->range_value; + if (!found_first) + { + uint alt_part_count= 0; + found_first= TRUE; + List_iterator<partition_element> alt_it(alt_part_info->partitions); + do + { + partition_element *alt_part_elem= alt_it++; + alt_max_range= alt_part_elem->range_value; + if (alt_part_count == 0) + tab_it.replace(alt_part_elem); + else + tab_it.after(alt_part_elem); + } while (++alt_part_count < no_parts_new); + } + else if (found_last) + { + my_error(ER_CONSECUTIVE_REORG_PARTITIONS, MYF(0)); + DBUG_RETURN(TRUE); + } + else + tab_it.remove(); + } + else + { + if (found_first) + found_last= TRUE; + } + } while (++part_count < tab_part_info->no_parts); + if (drop_count != no_parts_reorged) + { + my_error(ER_DROP_PARTITION_NON_EXISTENT, MYF(0)); + DBUG_RETURN(TRUE); + } + if (tab_part_info->part_type == RANGE_PARTITION && + alt_max_range > tab_max_range) + {
+ my_error(ER_REORG_OUTSIDE_RANGE, MYF(0)); + DBUG_RETURN(TRUE); + } + } + } + partition_changed= TRUE; + create_info->db_type= DB_TYPE_PARTITION_DB; + thd->lex->part_info= tab_part_info; + if (alter_info->flags == ALTER_ADD_PARTITION || + alter_info->flags == ALTER_REORGANISE_PARTITION) + { + if (check_partition_info(tab_part_info, default_engine_type, + table->file, (ulonglong)0ULL)) + { + DBUG_RETURN(TRUE); + } + } + } + else + { + /* + When thd->lex->part_info has a reference to a partition_info, the + ALTER TABLE statement contained a partitioning definition. + + Case I: + There was partitioning before and a new partitioning is defined. + We use the new partitioning. The new partitioning is already in + the correct variable, so no work is needed to accomplish this. + We do however need to update partition_changed to ensure that not + only the frm file is changed by the ALTER TABLE command. + + Case IIa: + There was partitioning before and no new one is defined. + Also, the user has not specified an explicit engine to use. + + We use the old partitioning for the new table as well. We do this + by assigning the partition_info from the table loaded in + open_ltable to the partition_info struct used by mysql_create_table + later in this method. + + Case IIb: + There was partitioning before and no new one is defined. + The user has specified an explicit engine to use. + + Since the user has specified an explicit engine we override the + old partitioning info and create a new table using the specified + engine. This is the reason for the extra check whether the old + and new engines are equal. + In this case the partitioning is also changed. + + Case III: + There was no partitioning before altering the table, and there is + partitioning defined in the altered table. Use the new partitioning. + No work is needed since the partitioning info is already in the + correct variable. + Here too the partitioning has changed, and thus a new table must + be created. + + Case IV: + There was no partitioning before and none is defined. + Obviously no work is needed. + */ + if (table->s->part_info) + { + if (!thd->lex->part_info && + create_info->db_type == old_db_type) + thd->lex->part_info= table->s->part_info; + } + if (thd->lex->part_info) + { + /* + Need to cater for engine types that can handle partitioning + without using the partition handler. + */ + if (thd->lex->part_info != table->s->part_info) + partition_changed= TRUE; + thd->lex->part_info->default_engine_type= create_info->db_type; + create_info->db_type= DB_TYPE_PARTITION_DB; + } + } +#endif if (check_engine(thd, new_name, &create_info->db_type)) DBUG_RETURN(TRUE); new_db_type= create_info->db_type; @@ -3203,12 +3918,7 @@ bool mysql_alter_table(THD *thd,char *new_db, char *new_name, } if (!error) { - if (mysql_bin_log.is_open()) - { - thd->clear_error(); - Query_log_event qinfo(thd, thd->query, thd->query_length, FALSE, FALSE); - mysql_bin_log.write(&qinfo); - } + write_bin_log(thd, TRUE); if (do_send_ok) send_ok(thd); } @@ -3289,8 +3999,8 @@ bool mysql_alter_table(THD *thd,char *new_db, char *new_name, def_it.remove(); } } - else - { // Use old field value + else // This field was not dropped and not changed, add it to the list + { // for the new table.
create_list.push_back(def=new create_field(field,field)); alter_it.rewind(); // Change default if ALTER Alter_column *alter; @@ -3503,17 +4213,122 @@ bool mysql_alter_table(THD *thd,char *new_db, char *new_name, if (table->s->tmp_table) create_info->options|=HA_LEX_CREATE_TMP_TABLE; + set_table_default_charset(thd, create_info, db); + +#ifdef HAVE_PARTITION_DB + if (thd->variables.old_alter_table || partition_changed) +#else + if (thd->variables.old_alter_table) +#endif + need_copy_table= 1; + else + need_copy_table= compare_tables(table, &create_list, &key_list, + create_info, alter_info, order_num); + /* better have a negative test here, instead of positive, like alter_info->flags & ALTER_ADD_COLUMN|ALTER_ADD_INDEX|... so that ALTER TABLE won't break when somebody will add new flag */ - need_copy_table= (alter_info->flags & - ~(ALTER_CHANGE_COLUMN_DEFAULT|ALTER_OPTIONS) || - (create_info->used_fields & - ~(HA_CREATE_USED_COMMENT|HA_CREATE_USED_PASSWORD)) || - table->s->tmp_table); - create_info->frm_only= !need_copy_table; + + if (!need_copy_table) + create_info->frm_only= 1; + +#ifdef HAVE_PARTITION_DB + if (partition_changed) + { + if (online_drop_partition) + { + /* + Now, after all checks and after setting the state of the dropped + partitions, we can start the actual dropping of the partitions. + 1) Lock the table in TL_WRITE_ONLY to ensure that all other accesses + to the table are completed and that no new ones are started until + we have changed the frm file. + 2) Write the new frm file where the state of the dropped partitions + is changed to PART_IS_DROPPED. + 3) Perform the actual drop of the partitions using the handler of + the table. + 4) Write a new frm file of the table where the partitions are + dropped from the table. + */ + uint old_lock_type; + partition_info *part_info= table->s->part_info; + char path[FN_REFLEN+1]; + uint db_options= 0, key_count, syntax_len; + KEY *key_info_buffer; + char *part_syntax_buf; + + VOID(pthread_mutex_lock(&LOCK_open)); + if (abort_and_upgrade_lock(thd, table, db, table_name, &old_lock_type)) + { + DBUG_RETURN(TRUE); + } + VOID(pthread_mutex_unlock(&LOCK_open)); + mysql_prepare_table(thd, create_info, &create_list, + &key_list, /*tmp_table*/ 0, &db_options, + table->file, &key_info_buffer, &key_count, + /*select_field_count*/ 0); + if (!(part_syntax_buf= generate_partition_syntax(part_info, + &syntax_len, + TRUE))) + { + DBUG_RETURN(TRUE); + } + part_info->part_info_string= part_syntax_buf; + part_info->part_info_len= syntax_len; + build_table_path(path, sizeof(path), db, table_name, reg_ext); + if (mysql_create_frm(thd, path, db, table_name, create_info, + create_list, key_count, key_info_buffer, + table->file)) + { + DBUG_RETURN(TRUE); + } + thd->lex->part_info= part_info; + build_table_path(path, sizeof(path), db, table_name, ""); + if (table->file->drop_partitions(path)) + { + DBUG_RETURN(TRUE); + } + { + List_iterator<partition_element> part_it(part_info->partitions); + uint i= 0, remove_count= 0; + do + { + partition_element *part_elem= part_it++; + if (is_partition_in_list(part_elem->partition_name, + alter_info->partition_names)) + { + part_it.remove(); + remove_count++; + } + } while (++i < part_info->no_parts); + part_info->no_parts-= remove_count; + } + if (!(part_syntax_buf= generate_partition_syntax(part_info, + &syntax_len, + TRUE))) + { + DBUG_RETURN(TRUE); + } + part_info->part_info_string= part_syntax_buf; + part_info->part_info_len= syntax_len; + build_table_path(path, sizeof(path), db, table_name, reg_ext); + if (mysql_create_frm(thd, path, db, table_name,
create_info, + create_list, key_count, key_info_buffer, + table->file) || + table->file->create_handler_files(path)) + { + DBUG_RETURN(TRUE); + } + thd->proc_info="end"; + write_bin_log(thd, FALSE); + send_ok(thd); + DBUG_RETURN(FALSE); + } + } +#endif /* Handling of symlinked tables: @@ -3640,12 +4455,7 @@ bool mysql_alter_table(THD *thd,char *new_db, char *new_name, my_free((gptr) new_table,MYF(0)); goto err; } - if (mysql_bin_log.is_open()) - { - thd->clear_error(); - Query_log_event qinfo(thd, thd->query, thd->query_length, FALSE, FALSE); - mysql_bin_log.write(&qinfo); - } + write_bin_log(thd, TRUE); goto end_temporary; } @@ -3776,15 +4586,14 @@ bool mysql_alter_table(THD *thd,char *new_db, char *new_name, goto err; } thd->proc_info="end"; - if (mysql_bin_log.is_open()) - { - thd->clear_error(); - Query_log_event qinfo(thd, thd->query, thd->query_length, FALSE, FALSE); - mysql_bin_log.write(&qinfo); - } + write_bin_log(thd, TRUE); VOID(pthread_cond_broadcast(&COND_refresh)); VOID(pthread_mutex_unlock(&LOCK_open)); #ifdef HAVE_BERKELEY_DB + /* + TODO RONM: This problem needs to be handled for Berkeley DB partitions + as well + */ if (old_db_type == DB_TYPE_BERKELEY_DB) { /* @@ -3821,7 +4630,7 @@ end_temporary: err: DBUG_RETURN(TRUE); } - +/* mysql_alter_table */ static int copy_data_between_tables(TABLE *from,TABLE *to, @@ -3927,7 +4736,8 @@ copy_data_between_tables(TABLE *from,TABLE *to, this function does not set field->query_id in the columns to the current query id */ - from->file->extra(HA_EXTRA_RETRIEVE_ALL_COLS); + to->file->ha_set_all_bits_in_write_set(); + from->file->ha_retrieve_all_cols(); init_read_record(&info, thd, from, (SQL_SELECT *) 0, 1,1); if (ignore || handle_duplicates == DUP_REPLACE) @@ -4032,7 +4842,7 @@ bool mysql_recreate_table(THD *thd, TABLE_LIST *table_list, create_info.row_type=ROW_TYPE_NOT_USED; create_info.default_table_charset=default_charset_info; /* Force alter table to recreate table */ - lex->alter_info.flags= ALTER_CHANGE_COLUMN; + lex->alter_info.flags= (ALTER_CHANGE_COLUMN | ALTER_RECREATE); DBUG_RETURN(mysql_alter_table(thd, NullS, NullS, &create_info, table_list, lex->create_list, lex->key_list, 0, (ORDER *) 0, @@ -4090,10 +4900,11 @@ bool mysql_checksum_table(THD *thd, TABLE_LIST *tables, HA_CHECK_OPT *check_opt) /* calculating table's checksum */ ha_checksum crc= 0; - /* InnoDB must be told explicitly to retrieve all columns, because - this function does not set field->query_id in the columns to the - current query id */ - t->file->extra(HA_EXTRA_RETRIEVE_ALL_COLS); + /* + Set all bits in the read set and inform InnoDB that we are reading all + fields + */ + t->file->ha_retrieve_all_cols(); if (t->file->ha_rnd_init(1)) protocol->store_null(); diff --git a/sql/sql_udf.cc b/sql/sql_udf.cc index e0c3034a58a..453b9324e88 100644 --- a/sql/sql_udf.cc +++ b/sql/sql_udf.cc @@ -528,7 +528,7 @@ int mysql_drop_function(THD *thd,const LEX_STRING *udf_name) if (!(table = open_ltable(thd,&tables,TL_WRITE))) goto err; table->field[0]->store(udf_name->str, udf_name->length, system_charset_info); - table->file->extra(HA_EXTRA_RETRIEVE_ALL_COLS); + table->file->ha_retrieve_all_cols(); if (!table->file->index_read_idx(table->record[0], 0, (byte*) table->field[0]->ptr, table->key_info[0].key_length, diff --git a/sql/sql_update.cc b/sql/sql_update.cc index 42c06d478be..61185c5d710 100644 --- a/sql/sql_update.cc +++ b/sql/sql_update.cc @@ -119,10 +119,10 @@ int mysql_update(THD *thd, { bool using_limit= limit != HA_POS_ERROR; bool safe_update= thd->options &
OPTION_SAFE_UPDATES; - bool used_key_is_modified, transactional_table; + bool used_key_is_modified, transactional_table, will_batch; int res; - int error=0; - uint used_index; + int error=0, loc_error; + uint used_index, dup_key_found; #ifndef NO_EMBEDDED_ACCESS_CHECKS uint want_privilege; #endif @@ -148,7 +148,7 @@ int mysql_update(THD *thd, /* pass counter value */ thd->lex->table_count= table_count; /* convert to multiupdate */ - return 2; + DBUG_RETURN(2); } if (lock_tables(thd, table_list, table_count) || @@ -187,7 +187,11 @@ int mysql_update(THD *thd, #ifndef NO_EMBEDDED_ACCESS_CHECKS table_list->grant.want_privilege= table->grant.want_privilege= want_privilege; #endif - if (setup_fields_with_no_wrap(thd, 0, fields, 1, 0, 0)) + /* + Indicate that the set of fields is to be updated by passing 2 for + set_query_id. + */ + if (setup_fields_with_no_wrap(thd, 0, fields, 2, 0, 0)) DBUG_RETURN(1); /* purecov: inspected */ if (table_list->view && check_fields(thd, fields)) { @@ -204,7 +208,10 @@ int mysql_update(THD *thd, if (table->timestamp_field->query_id == thd->query_id) table->timestamp_field_type= TIMESTAMP_NO_AUTO_SET; else + { table->timestamp_field->query_id=timestamp_query_id; + table->file->ha_set_bit_in_write_set(table->timestamp_field->fieldnr); + } } #ifndef NO_EMBEDDED_ACCESS_CHECKS @@ -258,13 +265,18 @@ int mysql_update(THD *thd, else used_key_is_modified=0; +#ifdef HAVE_PARTITION_DB + if (used_key_is_modified || order || + partition_key_modified(table, fields)) +#else if (used_key_is_modified || order) +#endif { /* We can't update table directly; We must first search after all matching rows before updating the table! */ - table->file->extra(HA_EXTRA_RETRIEVE_ALL_COLS); + table->file->ha_retrieve_all_cols(); if (used_index < MAX_KEY && old_used_keys.is_set(used_index)) { table->key_read=1; @@ -390,7 +402,7 @@ int mysql_update(THD *thd, (thd->variables.sql_mode & (MODE_STRICT_TRANS_TABLES | MODE_STRICT_ALL_TABLES))); - + will_batch= !table->file->start_bulk_update(); while (!(error=info.read_record(&info)) && !thd->killed) { if (!(select && select->skip_record())) { @@ -417,8 +429,47 @@ int mysql_update(THD *thd, break; } } - if (!(error=table->file->update_row((byte*) table->record[1], - (byte*) table->record[0]))) + if (will_batch) + { + /* + Typically a batched handler can execute the batched jobs when: + 1) It is specifically told to do so + 2) It is no longer a good idea to batch + 3) It is necessary to send the batch for other reasons + (one such reason is when READs must be performed) + + 1) is covered by the exec_bulk_update calls. + 2) and 3) are handled by the bulk_update_row method. + + bulk_update_row can execute the updates either including or + excluding the row defined in the call. This is up to the + handler implementation and can vary from call to call. + + dup_key_found reports the number of duplicate keys found in + those updates actually executed. It only reports these if the + extra call with HA_EXTRA_IGNORE_DUP_KEY has been issued. If + that call hasn't been issued, bulk_update_row returns an error + code and the caller can ignore this number. Thus any handler + that implements batching for UPDATE IGNORE must also handle + this extra call properly. + + If a duplicate key is found on the record included in this + call then it should be included in the count of dup_key_found + and error should be set to 0 (only if these errors are ignored). + */
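
For illustration, here is a condensed, self-contained sketch of the calling convention this comment describes (the method names follow the patch; the handler is a toy stand-in that flushes only in exec_bulk_update, and its duplicate-key rule is invented):

#include <cstdio>
#include <vector>

// Toy stand-in for a batching storage engine.
struct BatchingHandler
{
  std::vector<int> pending;                   // queued row ids
  bool start_bulk_update() { return false; }  // false = batching supported
  // Queue one update; a real handler may flush here and report the
  // duplicate keys found during that flush through *dup_key_found.
  int bulk_update_row(int row_id, unsigned *dup_key_found)
  {
    *dup_key_found= 0;
    pending.push_back(row_id);
    return 0;
  }
  // Execute everything queued; report ignored duplicate keys.
  int exec_bulk_update(unsigned *dup_key_found)
  {
    unsigned dups= 0;
    for (size_t i= 0; i < pending.size(); i++)
      if (pending[i] % 3 == 0)                // pretend every 3rd row is a dup
        dups++;
    pending.clear();
    *dup_key_found= dups;
    return 0;                                 // 0 = duplicates ignored, no error
  }
  void end_bulk_update() { pending.clear(); }
};

int main()
{
  BatchingHandler h;
  unsigned dup_key_found= 0, updated= 0;
  bool will_batch= !h.start_bulk_update();

  for (int row= 1; row <= 10; row++)
  {
    if (will_batch)
      h.bulk_update_row(row, &dup_key_found);
    updated++;                   // counted up front, corrected below
    updated-= dup_key_found;
  }
  if (will_batch && h.exec_bulk_update(&dup_key_found) == 0)
    updated-= dup_key_found;     // duplicates were never really updated
  if (will_batch)
    h.end_bulk_update();
  printf("updated %u rows (%u duplicate keys ignored)\n",
         updated, dup_key_found);
  return 0;
}

This mirrors how the loop below counts a row as updated immediately and then subtracts dup_key_found after each handler call.
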
+ error= table->file->bulk_update_row(table->record[1], + table->record[0], + &dup_key_found); + limit+= dup_key_found; + updated-= dup_key_found; + } + else + { + /* Non-batched update */ + error= table->file->update_row((byte*) table->record[1], + (byte*) table->record[0]); + } + if (!error) { updated++; thd->no_trans_update= !transactional_table; @@ -442,20 +493,74 @@ if (!--limit && using_limit) { - error= -1; // Simulate end of file - break; + /* + We have reached end-of-file in the most common situations: when + no batching has occurred, when batching was supposed to occur but + no updates were made, and when the batch execution was performed + without error and without finding any duplicate keys. + If the batched updates were performed with errors we need to + check them, and if there was no error but duplicate keys were + found we need to continue, since those are not counted in limit. + */ + if (will_batch && + ((error= table->file->exec_bulk_update(&dup_key_found)) || + !dup_key_found)) + { + if (error) + { + /* + The handler should not report duplicate key errors if they + are ignored. This is a requirement on batching handlers. + */ + table->file->print_error(error,MYF(0)); + error= 1; + break; + } + /* + Either an error was found and we are ignoring errors or there + were duplicate keys found. In both cases we need to correct + the counters and continue the loop. + */ + limit= dup_key_found; // limit is 0 when we get here, so add the duplicates back + updated-= dup_key_found; + } + else + { + error= -1; // Simulate end of file + break; + } } } else table->file->unlock_row(); thd->row_count++; } + dup_key_found= 0; if (thd->killed && !error) error= 1; // Aborted + else if (will_batch && + (loc_error= table->file->exec_bulk_update(&dup_key_found))) + /* + An error has occurred when a batched update was performed and it + returned an error indication. It cannot be an allowed duplicate key + error since we require the batching handler to treat that as normal + behavior. + + Otherwise we simply subtract the number of duplicate key records + found in the batched update.
+ */ + { + thd->fatal_error(); + table->file->print_error(loc_error,MYF(0)); + error= 1; + } + else + updated-= dup_key_found; + if (will_batch) + table->file->end_bulk_update(); end_read_record(&info); free_io_cache(table); // If ORDER BY delete select; - thd->proc_info="end"; + thd->proc_info= "end"; VOID(table->file->extra(HA_EXTRA_NO_IGNORE_DUP_KEY)); /* @@ -652,7 +757,7 @@ bool mysql_multi_update_prepare(THD *thd) &lex->select_lex.leaf_tables, FALSE)) DBUG_RETURN(TRUE); - if (setup_fields_with_no_wrap(thd, 0, *fields, 1, 0, 0)) + if (setup_fields_with_no_wrap(thd, 0, *fields, 2, 0, 0)) DBUG_RETURN(TRUE); for (tl= table_list; tl ; tl= tl->next_local) @@ -769,7 +874,7 @@ bool mysql_multi_update_prepare(THD *thd) &lex->select_lex.top_join_list, table_list, &lex->select_lex.where, &lex->select_lex.leaf_tables, FALSE) || - setup_fields_with_no_wrap(thd, 0, *fields, 1, 0, 0)) + setup_fields_with_no_wrap(thd, 0, *fields, 2, 0, 0)) DBUG_RETURN(TRUE); } diff --git a/sql/sql_yacc.yy b/sql/sql_yacc.yy index 520b6190410..b87fc9ffa36 100644 --- a/sql/sql_yacc.yy +++ b/sql/sql_yacc.yy @@ -73,6 +73,7 @@ inline Item *is_truth_value(Item *A, bool v1, bool v2) int num; ulong ulong_num; ulonglong ulonglong_number; + longlong longlong_number; LEX_STRING lex_str; LEX_STRING *lex_str_ptr; LEX_SYMBOL symbol; @@ -357,13 +358,16 @@ bool my_yyoverflow(short **a, YYSTYPE **b, ulong *yystacksize); %token LEAVES %token LEAVE_SYM %token LEFT +%token LESS_SYM %token LEVEL_SYM %token LEX_HOSTNAME %token LIKE %token LIMIT +%token LINEAR_SYM %token LINEFROMTEXT %token LINES %token LINESTRING +%token LIST_SYM %token LOAD %token LOCAL_SYM %token LOCATE @@ -403,6 +407,7 @@ bool my_yyoverflow(short **a, YYSTYPE **b, ulong *yystacksize); %token MAX_SYM %token MAX_UPDATES_PER_HOUR %token MAX_USER_CONNECTIONS_SYM +%token MAX_VALUE_SYM %token MEDIUMBLOB %token MEDIUMINT %token MEDIUMTEXT @@ -437,6 +442,7 @@ bool my_yyoverflow(short **a, YYSTYPE **b, ulong *yystacksize); %token NE %token NEW_SYM %token NEXT_SYM +%token NODEGROUP_SYM %token NONE_SYM %token NOT2_SYM %token NOT_SYM @@ -465,6 +471,8 @@ bool my_yyoverflow(short **a, YYSTYPE **b, ulong *yystacksize); %token OUT_SYM %token PACK_KEYS_SYM %token PARTIAL +%token PARTITION_SYM +%token PARTITIONS_SYM %token PASSWORD %token PARAM_MARKER %token PHASE_SYM @@ -491,6 +499,7 @@ bool my_yyoverflow(short **a, YYSTYPE **b, ulong *yystacksize); %token RAID_STRIPED_SYM %token RAID_TYPE %token RAND +%token RANGE_SYM %token READS_SYM %token READ_SYM %token REAL @@ -504,6 +513,7 @@ bool my_yyoverflow(short **a, YYSTYPE **b, ulong *yystacksize); %token RELEASE_SYM %token RELOAD %token RENAME +%token REORGANISE_SYM %token REPAIR %token REPEATABLE_SYM %token REPEAT_SYM @@ -576,6 +586,8 @@ bool my_yyoverflow(short **a, YYSTYPE **b, ulong *yystacksize); %token STRING_SYM %token SUBDATE_SYM %token SUBJECT_SYM +%token SUBPARTITION_SYM +%token SUBPARTITIONS_SYM %token SUBSTRING %token SUBSTRING_INDEX %token SUM_SYM @@ -597,6 +609,7 @@ bool my_yyoverflow(short **a, YYSTYPE **b, ulong *yystacksize); %token TINYBLOB %token TINYINT %token TINYTEXT +%token THAN_SYM %token TO_SYM %token TRAILING %token TRANSACTION_SYM @@ -621,11 +634,8 @@ bool my_yyoverflow(short **a, YYSTYPE **b, ulong *yystacksize); %token UNIX_TIMESTAMP %token UNKNOWN_SYM %token UNLOCK_SYM -%token UNLOCK_SYM %token UNSIGNED %token UNTIL_SYM -%token UNTIL_SYM -%token UPDATE_SYM %token UPDATE_SYM %token USAGE %token USER @@ -713,6 +723,9 @@ bool my_yyoverflow(short **a, YYSTYPE **b, ulong *yystacksize); %type <ulonglong_number> 
ulonglong_num +%type <longlong_number> + part_bit_expr + %type <lock_type> replace_lock_option opt_low_priority insert_lock_option load_data_lock @@ -729,6 +742,7 @@ bool my_yyoverflow(short **a, YYSTYPE **b, ulong *yystacksize); sp_opt_default simple_ident_nospvar simple_ident_q field_or_var limit_option + part_func_expr %type <item_num> NUM_literal @@ -830,6 +844,7 @@ bool my_yyoverflow(short **a, YYSTYPE **b, ulong *yystacksize); statement sp_suid opt_view_list view_list or_replace algorithm sp_c_chistics sp_a_chistics sp_chistic sp_c_chistic xa load_data opt_field_or_var_spec fields_or_vars opt_load_data_set_spec + partition_entry END_OF_INPUT %type <NONE> call sp_proc_stmts sp_proc_stmts1 sp_proc_stmt @@ -895,6 +910,7 @@ statement: | lock | optimize | keycache + | partition_entry | preload | prepare | purge @@ -2535,7 +2551,9 @@ trg_event: create2: '(' create2a {} - | opt_create_table_options create3 {} + | opt_create_table_options + opt_partitioning {} + create3 {} | LIKE table_ident { LEX *lex=Lex; @@ -2551,8 +2569,12 @@ create2: ; create2a: - field_list ')' opt_create_table_options create3 {} - | create_select ')' { Select->set_braces(1);} union_opt {} + field_list ')' opt_create_table_options + opt_partitioning {} + create3 {} + | opt_partitioning {} + create_select ')' + { Select->set_braces(1);} union_opt {} ; create3: @@ -2563,6 +2585,480 @@ create3: { Select->set_braces(1);} union_opt {} ; +/* + This part of the parser handles the partitioning information. + + Its first version was written by Mikael Ronström, with lots of answers to + questions provided by Antony Curtis. + + The partition grammar can be called from three places. + 1) CREATE TABLE ... PARTITION .. + 2) ALTER TABLE table_name PARTITION ... + 3) PARTITION ... + + The first place is called when a new table is created from a MySQL client. + The second place is called when a table is altered with the ALTER TABLE + command from a MySQL client. + The third place is called when opening an frm file and finding partition + info in the .frm file. PARTITION must not, however, be an allowed entry + point for SQL client queries. This is arranged by setting some state + variables before arriving here. + + To be able to handle errors, we only set an error code in this code and + handle the error condition in the function calling the parser. This is + necessary to ensure we can also handle errors when calling the parser + from the openfrm function. +*/ +opt_partitioning: + /* empty */ {} + | partitioning + ; + +partitioning: + PARTITION_SYM + { + LEX *lex= Lex; + lex->part_info= new partition_info(); + if (!lex->part_info) + { + my_error(ER_OUTOFMEMORY, MYF(0), sizeof(partition_info)); + YYABORT; + } + } + partition + ; + +partition_entry: + PARTITION_SYM + { + LEX *lex= Lex; + if (lex->part_info) + { + /* + We enter here when opening the frm file to translate the + partition info string into the part_info data structure.
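
To make the grammar that follows concrete: its semantic actions incrementally fill in a partition_info holding a list of partition_element objects. Below is a standalone sketch with simplified stand-ins (the field names follow the patch; the types are simplified, and add_range_partition is an invented helper condensing several actions) of what PARTITION BY RANGE with two VALUES LESS THAN clauses ends up building:

#include <climits>
#include <cstdio>
#include <list>
#include <string>

enum partition_type { RANGE_PARTITION, LIST_PARTITION, HASH_PARTITION };

// Simplified stand-ins for the patch's partition_element/partition_info.
struct partition_element
{
  std::string partition_name;
  long long range_value;              // VALUES LESS THAN (...)
};

struct partition_info
{
  partition_type part_type;
  unsigned no_parts;
  bool defined_max_value;             // MAXVALUE may appear once, and last
  std::list<partition_element> partitions;
  partition_info() : part_type(HASH_PARTITION), no_parts(0),
                     defined_max_value(false) {}
};

// Roughly what one "PARTITION name VALUES LESS THAN ..." clause does.
static bool add_range_partition(partition_info *pi, const char *name,
                                long long less_than, bool is_maxvalue)
{
  if (pi->defined_max_value)
    return true;                      // ~ ER_PARTITION_MAXVALUE_ERROR
  partition_element e;
  e.partition_name= name;
  e.range_value= is_maxvalue ? LLONG_MAX : less_than;
  pi->defined_max_value= is_maxvalue;
  pi->partitions.push_back(e);
  pi->no_parts++;
  return false;
}

int main()
{
  // PARTITION BY RANGE (expr)
  //   (PARTITION p0 VALUES LESS THAN (10),
  //    PARTITION p1 VALUES LESS THAN MAXVALUE)
  partition_info pi;
  pi.part_type= RANGE_PARTITION;
  add_range_partition(&pi, "p0", 10, false);
  add_range_partition(&pi, "p1", 0, true);
  std::list<partition_element>::const_iterator it;
  for (it= pi.partitions.begin(); it != pi.partitions.end(); ++it)
    printf("%s: less than %lld\n", it->partition_name.c_str(),
           it->range_value);
  return 0;
}
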
+ */ + lex->part_info= new partition_info(); + if (!lex->part_info) + { + my_error(ER_OUTOFMEMORY, MYF(0), sizeof(partition_info)); + YYABORT; + } + } + else + { + yyerror(ER(ER_PARTITION_ENTRY_ERROR)); + YYABORT; + } + } + partition {} + ; + +partition: + BY part_type_def opt_no_parts {} opt_sub_part {} part_defs + ; + +part_type_def: + opt_linear KEY_SYM '(' part_field_list ')' + { + LEX *lex= Lex; + lex->part_info->list_of_part_fields= TRUE; + lex->part_info->part_type= HASH_PARTITION; + } + | opt_linear HASH_SYM + { Lex->part_info->part_type= HASH_PARTITION; } + part_func {} + | RANGE_SYM + { Lex->part_info->part_type= RANGE_PARTITION; } + part_func {} + | LIST_SYM + { Lex->part_info->part_type= LIST_PARTITION; } + part_func {} + ; + +opt_linear: + /* empty */ {} + | LINEAR_SYM + { Lex->part_info->linear_hash_ind= TRUE;} + ; + +part_field_list: + part_field_item {} + | part_field_list ',' part_field_item {} + ; + +part_field_item: + ident + { + Lex->part_info->part_field_list.push_back($1.str); + } + ; + +part_func: + '(' remember_name part_func_expr remember_end ')' + { + LEX *lex= Lex; + uint expr_len= (uint)($4 - $2) - 1; + lex->part_info->list_of_part_fields= FALSE; + lex->part_info->part_expr= $3; + lex->part_info->part_func_string= $2+1; + lex->part_info->part_func_len= expr_len; + } + ; + +sub_part_func: + '(' remember_name part_func_expr remember_end ')' + { + LEX *lex= Lex; + uint expr_len= (uint)($4 - $2) - 1; + lex->part_info->list_of_subpart_fields= FALSE; + lex->part_info->subpart_expr= $3; + lex->part_info->subpart_func_string= $2+1; + lex->part_info->subpart_func_len= expr_len; + } + ; + + +opt_no_parts: + /* empty */ {} + | PARTITIONS_SYM ulong_num + { + uint no_parts= $2; + if (no_parts == 0) + { + my_error(ER_NO_PARTS_ERROR, MYF(0), "partitions"); + YYABORT; + } + Lex->part_info->no_parts= no_parts; + } + ; + +opt_sub_part: + /* empty */ {} + | SUBPARTITION_SYM BY opt_linear HASH_SYM sub_part_func + { Lex->part_info->subpart_type= HASH_PARTITION; } + opt_no_subparts {} + | SUBPARTITION_SYM BY opt_linear KEY_SYM + '(' sub_part_field_list ')' + { + LEX *lex= Lex; + lex->part_info->subpart_type= HASH_PARTITION; + lex->part_info->list_of_subpart_fields= TRUE; + } + opt_no_subparts {} + ; + +sub_part_field_list: + sub_part_field_item {} + | sub_part_field_list ',' sub_part_field_item {} + ; + +sub_part_field_item: + ident + { Lex->part_info->subpart_field_list.push_back($1.str); } + ; + +part_func_expr: + bit_expr + { + LEX *lex= Lex; + bool not_corr_func; + not_corr_func= !lex->safe_to_cache_query; + lex->safe_to_cache_query= 1; + if (not_corr_func) + { + yyerror(ER(ER_CONST_EXPR_IN_PARTITION_FUNC_ERROR)); + YYABORT; + } + $$=$1; + } + ; + +opt_no_subparts: + /* empty */ {} + | SUBPARTITIONS_SYM ulong_num + { + uint no_parts= $2; + if (no_parts == 0) + { + my_error(ER_NO_PARTS_ERROR, MYF(0), "subpartitions"); + YYABORT; + } + Lex->part_info->no_subparts= no_parts; + } + ; + +part_defs: + /* empty */ + {} + | '(' part_def_list ')' + { + LEX *lex= Lex; + partition_info *part_info= lex->part_info; + if (part_info->no_parts != 0) + { + if (part_info->no_parts != + part_info->count_curr_parts) + { + yyerror(ER(ER_PARTITION_WRONG_NO_PART_ERROR)); + YYABORT; + } + } + else if (part_info->count_curr_parts > 0) + { + part_info->no_parts= part_info->count_curr_parts; + } + part_info->count_curr_subparts= 0; + part_info->count_curr_parts= 0; + } + ; + +part_def_list: + part_definition {} + | part_def_list ',' part_definition {} + ; + +part_definition: + PARTITION_SYM + { + LEX *lex= 
Lex; + partition_info *part_info= lex->part_info; + partition_element *p_elem= new partition_element(); + if (!p_elem) + { + my_error(ER_OUTOFMEMORY, MYF(0), sizeof(partition_element)); + YYABORT; + } + part_info->curr_part_elem= p_elem; + part_info->current_partition= p_elem; + part_info->use_default_partitions= FALSE; + part_info->partitions.push_back(p_elem); + p_elem->engine_type= DB_TYPE_UNKNOWN; + part_info->count_curr_parts++; + } + part_name {} + opt_part_values {} + opt_part_options {} + opt_sub_partition {} + ; + +part_name: + ident_or_text + { Lex->part_info->curr_part_elem->partition_name= $1.str; } + ; + +opt_part_values: + /* empty */ + { + LEX *lex= Lex; + if (!is_partition_management(lex)) + { + if (lex->part_info->part_type == RANGE_PARTITION) + { + my_error(ER_PARTITION_REQUIRES_VALUES_ERROR, MYF(0), + "RANGE", "LESS THAN"); + YYABORT; + } + if (lex->part_info->part_type == LIST_PARTITION) + { + my_error(ER_PARTITION_REQUIRES_VALUES_ERROR, MYF(0), + "LIST", "IN"); + YYABORT; + } + } + } + | VALUES LESS_SYM THAN_SYM part_func_max + { + LEX *lex= Lex; + if (!is_partition_management(lex)) + { + if (Lex->part_info->part_type != RANGE_PARTITION) + { + my_error(ER_PARTITION_WRONG_VALUES_ERROR, MYF(0), + "RANGE", "LESS THAN"); + YYABORT; + } + } + } + | VALUES IN_SYM '(' part_list_func ')' + { + LEX *lex= Lex; + if (!is_partition_management(lex)) + { + if (Lex->part_info->part_type != LIST_PARTITION) + { + my_error(ER_PARTITION_WRONG_VALUES_ERROR, MYF(0), + "LIST", "IN"); + YYABORT; + } + } + } + ; + +part_func_max: + MAX_VALUE_SYM + { + LEX *lex= Lex; + if (lex->part_info->defined_max_value) + { + yyerror(ER(ER_PARTITION_MAXVALUE_ERROR)); + YYABORT; + } + lex->part_info->defined_max_value= TRUE; + lex->part_info->curr_part_elem->range_value= LONGLONG_MAX; + } + | part_range_func + { + if (Lex->part_info->defined_max_value) + { + yyerror(ER(ER_PARTITION_MAXVALUE_ERROR)); + YYABORT; + } + } + ; + +part_range_func: + '(' part_bit_expr ')' + { + Lex->part_info->curr_part_elem->range_value= $2; + } + ; + +part_list_func: + part_list_item {} + | part_list_func ',' part_list_item {} + ; + +part_list_item: + part_bit_expr + { + longlong *value_ptr; + if (!(value_ptr= (longlong*)sql_alloc(sizeof(longlong)))) + { + my_error(ER_OUTOFMEMORY, MYF(0), sizeof(longlong)); + YYABORT; + } + *value_ptr= $1; + Lex->part_info->curr_part_elem->list_val_list.push_back(value_ptr); + } + ; + +part_bit_expr: + bit_expr + { + Item *part_expr= $1; + bool not_corr_func; + LEX *lex= Lex; + longlong item_value; + Name_resolution_context *context= &lex->current_select->context; + TABLE_LIST *save_list= context->table_list; + + context->table_list= 0; + part_expr->fix_fields(YYTHD, (Item**)0); + context->table_list= save_list; + not_corr_func= !part_expr->const_item() || + !lex->safe_to_cache_query; + if (not_corr_func) + { + yyerror(ER(ER_NO_CONST_EXPR_IN_RANGE_OR_LIST_ERROR)); + YYABORT; + } + if (part_expr->result_type() != INT_RESULT) + { + yyerror(ER(ER_INCONSISTENT_TYPE_OF_FUNCTIONS_ERROR)); + YYABORT; + } + item_value= part_expr->val_int(); + $$= item_value; + } + ; + +opt_sub_partition: + /* empty */ {} + | '(' sub_part_list ')' + { + LEX *lex= Lex; + partition_info *part_info= lex->part_info; + if (part_info->no_subparts != 0) + { + if (part_info->no_subparts != + part_info->count_curr_subparts) + { + yyerror(ER(ER_PARTITION_WRONG_NO_SUBPART_ERROR)); + YYABORT; + } + } + else if (part_info->count_curr_subparts > 0) + { + part_info->no_subparts= part_info->count_curr_subparts; + } + 
part_info->count_curr_subparts= 0; + } + ; + +sub_part_list: + sub_part_definition {} + | sub_part_list ',' sub_part_definition {} + ; + +sub_part_definition: + SUBPARTITION_SYM + { + LEX *lex= Lex; + partition_info *part_info= lex->part_info; + partition_element *p_elem= new partition_element(); + if (!p_elem) + { + my_error(ER_OUTOFMEMORY, MYF(0), sizeof(partition_element)); + YYABORT; + } + part_info->curr_part_elem= p_elem; + part_info->current_partition->subpartitions.push_back(p_elem); + part_info->use_default_subpartitions= FALSE; + part_info->count_curr_subparts++; + p_elem->engine_type= DB_TYPE_UNKNOWN; + } + sub_name opt_part_options {} + ; + +sub_name: + ident_or_text + { Lex->part_info->curr_part_elem->partition_name= $1.str; } + ; + +opt_part_options: + /* empty */ {} + | opt_part_option_list {} + ; + +opt_part_option_list: + opt_part_option_list opt_part_option {} + | opt_part_option {} + ; + +opt_part_option: + TABLESPACE opt_equal ident_or_text + { Lex->part_info->curr_part_elem->tablespace_name= $3.str; } + | opt_storage ENGINE_SYM opt_equal storage_engines + { Lex->part_info->curr_part_elem->engine_type= $4; } + | NODEGROUP_SYM opt_equal ulong_num + { Lex->part_info->curr_part_elem->nodegroup_id= $3; } + | MAX_ROWS opt_equal ulonglong_num + { Lex->part_info->curr_part_elem->part_max_rows= $3; } + | MIN_ROWS opt_equal ulonglong_num + { Lex->part_info->curr_part_elem->part_min_rows= $3; } + | DATA_SYM DIRECTORY_SYM opt_equal TEXT_STRING_sys + { Lex->part_info->curr_part_elem->data_file_name= $4.str; } + | INDEX_SYM DIRECTORY_SYM opt_equal TEXT_STRING_sys + { Lex->part_info->curr_part_elem->index_file_name= $4.str; } + | COMMENT_SYM opt_equal TEXT_STRING_sys + { Lex->part_info->curr_part_elem->part_comment= $3.str; } + ; + +/* + End of partition parser part +*/ + create_select: SELECT_SYM { @@ -3372,7 +3868,7 @@ alter: lex->alter_info.reset(); lex->alter_info.flags= 0; } - alter_list + alter_commands {} | ALTER DATABASE ident_or_empty { @@ -3438,11 +3934,102 @@ ident_or_empty: /* empty */ { $$= 0; } | ident { $$= $1.str; }; -alter_list: +alter_commands: | DISCARD TABLESPACE { Lex->alter_info.tablespace_op= DISCARD_TABLESPACE; } | IMPORT TABLESPACE { Lex->alter_info.tablespace_op= IMPORT_TABLESPACE; } - | alter_list_item - | alter_list ',' alter_list_item; + | alter_list + opt_partitioning + | partitioning +/* + This part was added for release 5.1 by Mikael Ronström. + From here we insert a number of commands to manage the partitions of a + partitioned table such as adding partitions, dropping partitions, + reorganising partitions in various manners. In future releases the list + will be longer and also include moving partitions to a + new table and so forth. 
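
The ALTER TABLE commands these rules introduce are validated in the mysql_alter_table code earlier in this patch: DROP PARTITION is only allowed on RANGE/LIST partitioning, COALESCE PARTITION only on HASH, and neither may remove every partition. A standalone distillation of those checks (error identifiers are from the patch, returned as strings here for brevity):

#include <cstdio>

enum partition_type { RANGE_PARTITION, LIST_PARTITION, HASH_PARTITION };

// Returns the error identifier, or 0 when the command is allowed.
static const char *check_drop_partition(partition_type t, unsigned no_parts,
                                        unsigned no_dropped)
{
  if (t != RANGE_PARTITION && t != LIST_PARTITION)
    return "ER_ONLY_ON_RANGE_LIST_PARTITION";
  if (no_dropped >= no_parts)
    return "ER_DROP_LAST_PARTITION";   // at least one partition must remain
  return 0;
}

static const char *check_coalesce_partition(partition_type t,
                                            unsigned no_parts,
                                            unsigned no_coalesced)
{
  if (t != HASH_PARTITION)
    return "ER_COALESCE_ONLY_ON_HASH_PARTITION";
  if (no_coalesced == 0)
    return "ER_COALESCE_PARTITION_NO_PARTITION";
  if (no_coalesced >= no_parts)
    return "ER_DROP_LAST_PARTITION";
  return 0;
}

int main()
{
  // Dropping all four partitions of a RANGE table is refused:
  printf("%s\n", check_drop_partition(RANGE_PARTITION, 4, 4));
  // COALESCE PARTITION is a HASH-only command:
  printf("%s\n", check_coalesce_partition(LIST_PARTITION, 4, 1));
  return 0;
}
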
+*/ + | add_partition_rule + | DROP PARTITION_SYM alt_part_name_list + { + Lex->alter_info.flags|= ALTER_DROP_PARTITION; + } + | COALESCE PARTITION_SYM ulong_num + { + LEX *lex= Lex; + lex->alter_info.flags|= ALTER_COALESCE_PARTITION; + lex->alter_info.no_parts= $3; + } + | reorg_partition_rule + ; + +add_partition_rule: + ADD PARTITION_SYM + { + LEX *lex= Lex; + lex->part_info= new partition_info(); + if (!lex->part_info) + { + my_error(ER_OUTOFMEMORY, MYF(0), sizeof(partition_info)); + YYABORT; + } + lex->alter_info.flags|= ALTER_ADD_PARTITION; + } + add_part_extra + {} + ; + +add_part_extra: + | '(' part_def_list ')' + { + LEX *lex= Lex; + lex->part_info->no_parts= lex->part_info->count_curr_parts; + } + | PARTITIONS_SYM ulong_num + { + LEX *lex= Lex; + lex->part_info->no_parts= $2; + } + ; + +reorg_partition_rule: + REORGANISE_SYM PARTITION_SYM + { + LEX *lex= Lex; + lex->part_info= new partition_info(); + if (!lex->part_info) + { + my_error(ER_OUTOFMEMORY, MYF(0), sizeof(partition_info)); + YYABORT; + } + lex->alter_info.flags|= ALTER_REORGANISE_PARTITION; + } + alt_part_name_list INTO '(' part_def_list ')' + { + LEX *lex= Lex; + lex->part_info->no_parts= lex->part_info->count_curr_parts; + } + ; + +alt_part_name_list: + alt_part_name_item {} + | alt_part_name_list ',' alt_part_name_item {} + ; + +alt_part_name_item: + ident + { + Lex->alter_info.partition_names.push_back($1.str); + } + ; + +/* + End of management of partition commands +*/ + +alter_list: + alter_list_item + | alter_list ',' alter_list_item + ; add_column: ADD opt_column @@ -4076,7 +4663,7 @@ select_options: /* empty*/ | select_option_list { - if (test_all_bits(Select->options, SELECT_ALL | SELECT_DISTINCT)) + if (Select->options & SELECT_DISTINCT && Select->options & SELECT_ALL) { my_error(ER_WRONG_USAGE, MYF(0), "ALL", "DISTINCT"); YYABORT; @@ -7493,6 +8080,7 @@ keyword: | LANGUAGE_SYM {} | NO_SYM {} | OPEN_SYM {} + | PARTITION_SYM {} | PREPARE_SYM {} | REPAIR {} | RESET_SYM {} @@ -7537,6 +8125,7 @@ keyword_sp: | CHANGED {} | CIPHER_SYM {} | CLIENT_SYM {} + | COALESCE {} | COLLATION_SYM {} | COLUMNS {} | COMMITTED_SYM {} @@ -7593,8 +8182,10 @@ keyword_sp: | RELAY_THREAD {} | LAST_SYM {} | LEAVES {} + | LESS_SYM {} | LEVEL_SYM {} | LINESTRING {} + | LIST_SYM {} | LOCAL_SYM {} | LOCKS_SYM {} | LOGS_SYM {} @@ -7618,6 +8209,7 @@ keyword_sp: | MAX_QUERIES_PER_HOUR {} | MAX_UPDATES_PER_HOUR {} | MAX_USER_CONNECTIONS_SYM {} + | MAX_VALUE_SYM {} | MEDIUM_SYM {} | MERGE_SYM {} | MICROSECOND_SYM {} @@ -7638,6 +8230,7 @@ keyword_sp: | NDBCLUSTER_SYM {} | NEXT_SYM {} | NEW_SYM {} + | NODEGROUP_SYM {} | NONE_SYM {} | NVARCHAR_SYM {} | OFFSET_SYM {} @@ -7646,6 +8239,7 @@ keyword_sp: | ONE_SYM {} | PACK_KEYS_SYM {} | PARTIAL {} + | PARTITIONS_SYM {} | PASSWORD {} | PHASE_SYM {} | POINT_SYM {} @@ -7667,6 +8261,7 @@ keyword_sp: | RELAY_LOG_FILE_SYM {} | RELAY_LOG_POS_SYM {} | RELOAD {} + | REORGANISE_SYM {} | REPEATABLE_SYM {} | REPLICATION {} | RESOURCES {} @@ -7696,6 +8291,8 @@ keyword_sp: | STRING_SYM {} | SUBDATE_SYM {} | SUBJECT_SYM {} + | SUBPARTITION_SYM {} + | SUBPARTITIONS_SYM {} | SUPER_SYM {} | SUSPEND_SYM {} | TABLES {} @@ -7703,6 +8300,7 @@ keyword_sp: | TEMPORARY {} | TEMPTABLE_SYM {} | TEXT_SYM {} + | THAN_SYM {} | TRANSACTION_SYM {} | TRIGGERS_SYM {} | TIMESTAMP {} diff --git a/sql/table.cc b/sql/table.cc index 9d681141b1b..ce22a09cf6c 100644 --- a/sql/table.cc +++ b/sql/table.cc @@ -70,7 +70,7 @@ int openfrm(THD *thd, const char *name, const char *alias, uint db_stat, int j,error, errarg= 0; uint 
rec_buff_length,n_length,int_length,records,key_parts,keys, interval_count,interval_parts,read_length,db_create_options; - uint key_info_length, com_length; + uint key_info_length, com_length, part_info_len, extra_rec_buf_length; ulong pos; char index_file[FN_REFLEN], *names, *keynames, *comment_pos; uchar head[288],*disk_buff,new_field_pack_flag; @@ -153,6 +153,7 @@ int openfrm(THD *thd, const char *name, const char *alias, uint db_stat, goto err; /* purecov: inspected */ *fn_ext(index_file)='\0'; // Remove .frm extension + part_info_len= uint4korr(head+55); share->frm_version= head[2]; /* Check if .frm file created by MySQL 5.0. In this case we want to @@ -300,10 +301,6 @@ int openfrm(THD *thd, const char *name, const char *alias, uint db_stat, } #endif - /* Allocate handler */ - if (!(outparam->file= get_new_handler(outparam, share->db_type))) - goto err; - error=4; outparam->reginfo.lock_type= TL_UNLOCK; outparam->current_lock=F_UNLCK; @@ -314,8 +311,9 @@ int openfrm(THD *thd, const char *name, const char *alias, uint db_stat, if (prgflag & (READ_ALL+EXTRA_RECORD)) records++; /* QQ: TODO, remove the +1 from below */ + extra_rec_buf_length= uint2korr(head+59); rec_buff_length= ALIGN_SIZE(share->reclength + 1 + - outparam->file->extra_rec_buf_length()); + extra_rec_buf_length); share->rec_buff_length= rec_buff_length; if (!(record= (char *) alloc_root(&outparam->mem_root, rec_buff_length * records))) @@ -435,9 +433,22 @@ int openfrm(THD *thd, const char *name, const char *alias, uint db_stat, if (keynames) fix_type_pointers(&int_array, &share->keynames, 1, &keynames); + if (part_info_len > 0) + { +#ifdef HAVE_PARTITION_DB + if (mysql_unpack_partition(file, thd, part_info_len, outparam)) + goto err; +#else + goto err; +#endif + } VOID(my_close(file,MYF(MY_WME))); file= -1; + /* Allocate handler */ + if (!(outparam->file= get_new_handler(outparam, share->db_type))) + goto err; + record= (char*) outparam->record[0]-1; /* Fieldstart = 1 */ if (null_field_first) { @@ -594,6 +605,7 @@ int openfrm(THD *thd, const char *name, const char *alias, uint db_stat, goto err; /* purecov: inspected */ } + reg_field->fieldnr= i+1; //Set field number reg_field->field_index= i; reg_field->comment=comment; if (field_type == FIELD_TYPE_BIT && !f_bit_as_char(pack_flag)) @@ -854,7 +866,16 @@ int openfrm(THD *thd, const char *name, const char *alias, uint db_stat, (*save++)= i; } } + if (outparam->file->ha_allocate_read_write_set(share->fields)) + goto err; + /* Fix the partition functions and ensure they are not constant functions*/ + if (part_info_len > 0) +#ifdef HAVE_PARTITION_DB + if (fix_partition_func(thd,name,outparam)) +#endif + goto err; + /* The table struct is now initialized; Open the table */ error=2; if (db_stat) @@ -912,6 +933,13 @@ int openfrm(THD *thd, const char *name, const char *alias, uint db_stat, if (! 
error_reported) frm_error(error,outparam,name,ME_ERROR+ME_WAITTANG, errarg); delete outparam->file; +#ifdef HAVE_PARTITION_DB + if (outparam->s->part_info) + { + free_items(outparam->s->part_info->item_free_list); + outparam->s->part_info->item_free_list= 0; + } +#endif outparam->file=0; // For easier errorchecking outparam->db_stat=0; hash_free(&share->name_hash); @@ -938,6 +966,13 @@ int closefrm(register TABLE *table) table->field= 0; } delete table->file; +#ifdef HAVE_PARTITION_DB + if (table->s->part_info) + { + free_items(table->s->part_info->item_free_list); + table->s->part_info->item_free_list= 0; + } +#endif table->file= 0; /* For easier errorchecking */ hash_free(&table->s->name_hash); free_root(&table->mem_root, MYF(0)); diff --git a/sql/table.h b/sql/table.h index d7c14e1938a..09b64398fd9 100644 --- a/sql/table.h +++ b/sql/table.h @@ -21,6 +21,7 @@ class Item; /* Needed by ORDER */ class GRANT_TABLE; class st_select_lex_unit; class st_select_lex; +class partition_info; class COND_EQUAL; /* Order clause list element */ @@ -100,6 +101,9 @@ class Table_triggers_list; typedef struct st_table_share { +#ifdef HAVE_PARTITION_DB + partition_info *part_info; /* Partition related information */ +#endif /* hash of field names (contains pointers to elements of field array) */ HASH name_hash; /* hash of field names */ MEM_ROOT mem_root; @@ -207,6 +211,8 @@ struct st_table { ORDER *group; const char *alias; /* alias or table name */ uchar *null_flags; + MY_BITMAP *read_set; + MY_BITMAP *write_set; query_id_t query_id; ha_rows quick_rows[MAX_KEY]; @@ -260,6 +266,7 @@ struct st_table { my_bool auto_increment_field_not_null; my_bool insert_or_update; /* Can be used by the handler */ my_bool alias_name_used; /* true if table_name is alias */ + my_bool get_fields_in_item_tree; /* Signal to fix_field */ REGINFO reginfo; /* field connections */ MEM_ROOT mem_root; diff --git a/sql/tztime.cc b/sql/tztime.cc index 5a907f0d170..99a009968a7 100644 --- a/sql/tztime.cc +++ b/sql/tztime.cc @@ -1623,7 +1623,7 @@ my_tz_init(THD *org_thd, const char *default_tzname, my_bool bootstrap) mysql.time_zone* tables are MyISAM and these operations always succeed for MyISAM. */ - (void)table->file->ha_index_init(0); + (void)table->file->ha_index_init(0, 1); tz_leapcnt= 0; res= table->file->index_first(table->record[0]); @@ -1800,7 +1800,7 @@ tz_load_from_open_tables(const String *tz_name, TABLE_LIST *tz_tables) mysql.time_zone* tables are MyISAM and these operations always succeed for MyISAM. */ - (void)table->file->ha_index_init(0); + (void)table->file->ha_index_init(0, 1); if (table->file->index_read(table->record[0], (byte*)table->field[0]->ptr, 0, HA_READ_KEY_EXACT)) @@ -1827,7 +1827,7 @@ tz_load_from_open_tables(const String *tz_name, TABLE_LIST *tz_tables) table= tz_tables->table; tz_tables= tz_tables->next_local; table->field[0]->store((longlong)tzid); - (void)table->file->ha_index_init(0); + (void)table->file->ha_index_init(0, 1); if (table->file->index_read(table->record[0], (byte*)table->field[0]->ptr, 0, HA_READ_KEY_EXACT)) @@ -1854,7 +1854,7 @@ tz_load_from_open_tables(const String *tz_name, TABLE_LIST *tz_tables) table= tz_tables->table; tz_tables= tz_tables->next_local; table->field[0]->store((longlong)tzid); - (void)table->file->ha_index_init(0); + (void)table->file->ha_index_init(0, 1); // FIXME Is there any better approach than explicitly specifying 4 ??? 
res= table->file->index_read(table->record[0], (byte*)table->field[0]->ptr, @@ -1926,7 +1926,7 @@ tz_load_from_open_tables(const String *tz_name, TABLE_LIST *tz_tables) */ table= tz_tables->table; table->field[0]->store((longlong)tzid); - (void)table->file->ha_index_init(0); + (void)table->file->ha_index_init(0, 1); // FIXME Is there any better approach than explicitly specifying 4 ??? res= table->file->index_read(table->record[0], (byte*)table->field[0]->ptr, diff --git a/sql/unireg.cc b/sql/unireg.cc index a89d89426a6..d423e1bee4b 100644 --- a/sql/unireg.cc +++ b/sql/unireg.cc @@ -46,7 +46,8 @@ static bool pack_fields(File file, List<create_field> &create_fields, static bool make_empty_rec(THD *thd, int file, enum db_type table_type, uint table_options, List<create_field> &create_fields, - uint reclength, ulong data_offset); + uint reclength, ulong data_offset, + handler *handler); /* Create a frm (table definition) file @@ -82,13 +83,18 @@ bool mysql_create_frm(THD *thd, my_string file_name, uchar fileinfo[64],forminfo[288],*keybuff; TYPELIB formnames; uchar *screen_buff; +#ifdef HAVE_PARTITION_DB + partition_info *part_info= thd->lex->part_info; +#endif DBUG_ENTER("mysql_create_frm"); +#ifdef HAVE_PARTITION_DB + thd->lex->part_info= NULL; +#endif formnames.type_names=0; if (!(screen_buff=pack_screens(create_fields,&info_length,&screens,0))) DBUG_RETURN(1); - if (db_file == NULL) - db_file= get_new_handler((TABLE*) 0, create_info->db_type); + DBUG_ASSERT(db_file != NULL); /* If fixed row records, we need one bit to check for deleted rows */ if (!(create_info->table_options & HA_OPTION_PACK_RECORD)) @@ -139,6 +145,13 @@ bool mysql_create_frm(THD *thd, my_string file_name, 60); forminfo[46]=(uchar) strlen((char*)forminfo+47); // Length of comment +#ifdef HAVE_PARTITION_DB + if (part_info) + { + int4store(fileinfo+55,part_info->part_info_len); + } +#endif + int2store(fileinfo+59,db_file->extra_rec_buf_length()); if (my_pwrite(file,(byte*) fileinfo,64,0L,MYF_RW) || my_pwrite(file,(byte*) keybuff,key_info_length, (ulong) uint2korr(fileinfo+6),MYF_RW)) @@ -147,7 +160,7 @@ bool mysql_create_frm(THD *thd, my_string file_name, (ulong) uint2korr(fileinfo+6)+ (ulong) key_buff_length, MY_SEEK_SET,MYF(0))); if (make_empty_rec(thd,file,create_info->db_type,create_info->table_options, - create_fields,reclength, data_offset)) + create_fields,reclength, data_offset, db_file)) goto err; VOID(my_seek(file,filepos,MY_SEEK_SET,MYF(0))); @@ -156,6 +169,14 @@ bool mysql_create_frm(THD *thd, my_string file_name, pack_fields(file, create_fields, data_offset)) goto err; +#ifdef HAVE_PARTITION_DB + if (part_info) + { + if (my_write(file, (byte*) part_info->part_info_string, + part_info->part_info_len, MYF_RW)) + goto err; + } +#endif #ifdef HAVE_CRYPTED_FRM if (create_info->password) { @@ -223,8 +244,7 @@ err3: create_fields Fields to create keys number of keys to create key_info Keys to create - db_file Handler to use. May be zero, in which case we use - create_info->db_type + file Handler to use. 
RETURN 0 ok 1 error @@ -234,19 +254,21 @@ int rea_create_table(THD *thd, my_string file_name, const char *db, const char *table, HA_CREATE_INFO *create_info, List<create_field> &create_fields, - uint keys, KEY *key_info) + uint keys, KEY *key_info, handler *file) { DBUG_ENTER("rea_create_table"); if (mysql_create_frm(thd, file_name, db, table, create_info, - create_fields, keys, key_info, NULL)) + create_fields, keys, key_info, file)) DBUG_RETURN(1); + if (file->create_handler_files(file_name)) + goto err_handler; if (!create_info->frm_only && ha_create_table(file_name,create_info,0)) - { - my_delete(file_name,MYF(0)); - DBUG_RETURN(1); - } + goto err_handler; DBUG_RETURN(0); +err_handler: + my_delete(file_name, MYF(0)); + DBUG_RETURN(1); } /* rea_create_table */ @@ -670,7 +692,8 @@ static bool make_empty_rec(THD *thd, File file,enum db_type table_type, uint table_options, List<create_field> &create_fields, uint reclength, - ulong data_offset) + ulong data_offset, + handler *handler) { int error; Field::utype type; @@ -678,19 +701,15 @@ static bool make_empty_rec(THD *thd, File file,enum db_type table_type, uchar *buff,*null_pos; TABLE table; create_field *field; - handler *handler; enum_check_fields old_count_cuted_fields= thd->count_cuted_fields; DBUG_ENTER("make_empty_rec"); /* We need a table to generate columns for default values */ bzero((char*) &table,sizeof(table)); table.s= &table.share_not_to_be_used; - handler= get_new_handler((TABLE*) 0, table_type); - if (!handler || - !(buff=(uchar*) my_malloc((uint) reclength,MYF(MY_WME | MY_ZEROFILL)))) + if (!(buff=(uchar*) my_malloc((uint) reclength,MYF(MY_WME | MY_ZEROFILL)))) { - delete handler; DBUG_RETURN(1); } @@ -747,6 +766,7 @@ static bool make_empty_rec(THD *thd, File file,enum db_type table_type, { my_error(ER_INVALID_DEFAULT, MYF(0), regfield->field_name); error= 1; + delete regfield; //To avoid memory leak goto err; } } @@ -776,7 +796,6 @@ static bool make_empty_rec(THD *thd, File file,enum db_type table_type, err: my_free((gptr) buff,MYF(MY_FAE)); - delete handler; thd->count_cuted_fields= old_count_cuted_fields; DBUG_RETURN(error); } /* make_empty_rec */ diff --git a/sql/unireg.h b/sql/unireg.h index 6afefa579e8..2f1d3c2082d 100644 --- a/sql/unireg.h +++ b/sql/unireg.h @@ -84,6 +84,7 @@ #define PSEUDO_TABLE_BITS (PARAM_TABLE_BIT | OUTER_REF_TABLE_BIT | \ RAND_TABLE_BIT) #define MAX_FIELDS 4096 /* Limit in the .frm file */ +#define MAX_PARTITIONS 1024 #define MAX_SORT_MEMORY (2048*1024-MALLOC_OVERHEAD) #define MIN_SORT_MEMORY (32*1024-MALLOC_OVERHEAD) |
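
A closing note on a theme running through this patch: the HA_EXTRA_RETRIEVE_ALL_COLS hint gives way to per-column bitmaps (the read_set/write_set members added to st_table, and the ha_allocate_read_write_set, ha_set_bit_in_write_set, ha_set_all_bits_in_write_set and ha_retrieve_all_cols calls seen above). Below is a standalone analogue of that bookkeeping, using the 1-based fieldnr numbering openfrm now assigns; ColumnSets is an invented stand-in, not the MY_BITMAP API:

#include <cstdio>
#include <vector>

// Invented analogue of the per-column read/write sets; a real handler
// consults such bitmaps to skip columns it does not need to touch.
struct ColumnSets
{
  std::vector<bool> read_set, write_set;

  // ~ ha_allocate_read_write_set(share->fields)
  void allocate(unsigned fields)
  {
    read_set.assign(fields + 1, false);   // slot 0 unused: fieldnr is 1-based
    write_set.assign(fields + 1, false);
  }
  // ~ ha_set_bit_in_write_set(field->fieldnr)
  void set_write(unsigned fieldnr) { write_set[fieldnr]= true; }
  // ~ ha_retrieve_all_cols(): every column will be read
  void retrieve_all()
  {
    for (size_t i= 1; i < read_set.size(); i++)
      read_set[i]= true;
  }
};

int main()
{
  ColumnSets sets;
  sets.allocate(4);      // a table with four fields, fieldnr 1..4
  sets.set_write(2);     // e.g. the auto-set TIMESTAMP column in mysql_update
  sets.retrieve_all();   // the checksum and copy paths read all columns

  for (unsigned f= 1; f <= 4; f++)
    printf("field %u: read=%d write=%d\n", f,
           (int) sets.read_set[f], (int) sets.write_set[f]);
  return 0;
}
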